├── .gitignore
├── .merlin
├── B0.ml
├── BRZO
├── DEVEL.md
├── LICENSE.md
├── README.md
├── data
    ├── urep-spec.html
    └── urep-test.html
├── examples.ml
├── old-proposal.md
├── perf8.ml
├── test.ml
├── test_long.ml
├── trip8.ml
├── utf_16.ml
├── utf_uchar.ml
├── utf_uchar.mli
├── utf_x.mli
├── utf_x_adhoc.ml
├── utf_x_adhoc.mli
├── utf_x_dfa.ml
├── utf_x_dfa.mli
├── utf_x_if.ml
├── utf_x_if.mli
├── utf_x_pat.ml
└── utf_x_pat.mli


/.gitignore:
--------------------------------------------------------------------------------
1 | _b0
2 | _build
3 | data
4 | tmp
5 | *.install
6 | CLOCK.org
7 | 


--------------------------------------------------------------------------------
/.merlin:
--------------------------------------------------------------------------------
1 | PKG b0.kit uutf
2 | S ./**
3 | B _b0/b/**
4 | 


--------------------------------------------------------------------------------
/B0.ml:
--------------------------------------------------------------------------------
 1 | open B0_kit.V000
 2 | 
 3 | let uutf = B0_ocaml.libname "uutf"
 4 | 
 5 | let file v = `File (Fpath.v v)
 6 | let mod_srcs n = file (n ^ ".mli"), file (n ^ ".ml")
 7 | 
 8 | let iface = file "utf_x.mli"
 9 | let uchar_mli, uchar_ml = mod_srcs "utf_uchar"
10 | let utf_16_ml = file "utf_16.ml"
11 | let adhoc_mli, adhoc_ml = mod_srcs "utf_x_adhoc"
12 | let if_mli, if_ml = mod_srcs "utf_x_if"
13 | let dfa_mli, dfa_ml = mod_srcs "utf_x_dfa"
14 | let pat_mli, pat_ml = mod_srcs "utf_x_pat"
15 | 
16 | let base = [iface; uchar_mli; uchar_ml; utf_16_ml]
17 | let all_impls =
18 |   [adhoc_mli; adhoc_ml; if_mli; if_ml; dfa_mli; dfa_ml; pat_mli; pat_ml ] @
19 |   base
20 | 
21 | let examples =
22 |   let srcs = [adhoc_mli; adhoc_ml; file "examples.ml"] @ base in
23 |   B0_ocaml.exe "examples" ~doc:"Sample code" ~srcs
24 | 
25 | let test =
26 |   let srcs = (file "test.ml" :: all_impls) in
27 |   B0_ocaml.exe "test" ~doc:"Test suite" ~srcs
28 | 
29 | let test =
30 |   let srcs = (file "test_long.ml" :: all_impls) in
31 |   B0_ocaml.exe "test_long" ~doc:"Long test suite" ~srcs
32 | 
33 | let perf8 =
34 |   let srcs = (file "perf8.ml" :: all_impls) in
35 |   let requires = [uutf] in
36 |   B0_ocaml.exe "perf8" ~doc:"Performance test tool for UTF-8" ~requires ~srcs
37 | 
38 | let trip8 =
39 |   let srcs = (file "trip8.ml" :: all_impls) in
40 |   let requires = [uutf] in
41 |   B0_ocaml.exe "trip8" ~doc:"Best-effort UTF-8 recode" ~requires ~srcs
42 | 


--------------------------------------------------------------------------------
/BRZO:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dbuenzli/stdlib-utf/3b9a96e7530830bd5ea87c3c92dc9e9bd02d4dee/BRZO


--------------------------------------------------------------------------------
/DEVEL.md:
--------------------------------------------------------------------------------
 1 | Running stuff.
 2 | 
 3 | ```
 4 | opam install uutf
 5 | opam pin add b0 --dev
 6 | curl -L https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-demo.txt \
 7 |         > data/UTF-8-demo.txt 
 8 | time b0 -a perf8 -- --adhoc data/UTF-8-demo.txt 
 9 | time b0 -a perf8 -- --dfa data/UTF-8-demo.txt 
10 | time b0 -a perf8 -- --validate --adhoc data/UTF-8-demo.txt
11 | 
12 | # If you are interested in absolute numbers avoid timing the build
13 | 
14 | b0
15 | time $(b0 unit build-dir perf8)/perf8 ...
16 | ```
17 | 
18 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | ```
 2 | (**************************************************************************)
 3 | (*                                                                        *)
 4 | (*                                 OCaml                                  *)
 5 | (*                                                                        *)
 6 | (*                         The OCaml programmers                          *)
 7 | (*                                                                        *)
 8 | (*   Copyright 2021 Institut National de Recherche en Informatique et     *)
 9 | (*     en Automatique.                                                    *)
10 | (*                                                                        *)
11 | (*   All rights reserved.  This file is distributed under the terms of    *)
12 | (*   the GNU Lesser General Public License version 2.1, with the          *)
13 | (*   special exception on linking described in the file LICENSE.          *)
14 | (*                                                                        *)
15 | (**************************************************************************)
16 | ```
17 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | Adding UTF-X decoding support to the OCaml Stdlib 
  2 | =================================================
  3 | 
  4 | **This has been merged upstream and will be part of OCaml 4.14**
  5 | 
  6 | [Upstream issue][upstream]. [Upstream PR][upstream-pr].
  7 | 
  8 | Plays around improving [this old proposal][old-proposal]. Especially
  9 | on [Alain's comment][alain-comment] to avoid exceptions. 
 10 | 
 11 | For now we simply focus on providing an API in `Bytes`, the rest can
 12 | follow easily and/or later. 
 13 | 
 14 | We introduce the abstract type `Uchar.utf_decode` for decode results
 15 | which is just an `int` value that has the decoded `Uchar.t` (or
 16 | `Uchar.rep` in case of decode error) in the lower bits and information
 17 | about the decode in bits above `Uchar.max`. The scheme is compatible
 18 | with both 32-bit and 64-bit platforms, see the implementation for more
 19 | details. No exception have to be introduced and the API does not
 20 | allocate on decodes.
 21 | 
 22 | The [resulting API](utf_x.mli) seems nice to
 23 | use. [`examples.ml`](examples.ml) show how [`Uutf`-like
 24 | folders][uutf-fold] and `Seq` based iterators can easily be derived in
 25 | a few lines.
 26 | 
 27 | It is also fool proof w.r.t. to best-effort decodes, `Uchar.rep` will
 28 | be appropriately added on decoding errors as it should be for security
 29 | reasons. See the `Seq` iterator example and more information on this
 30 | below.
 31 | 
 32 | A few implementation are provided which differ only in their UTF-8
 33 | treatment:
 34 | 
 35 | 1. [`utf_x_adhoc.ml`](utf_x_adhoc.ml), uses a 256 bytes string to index
 36 |    the cases mentionned in [TUS Table 3.7][tus] and then performs some
 37 |    ugly ad-hoc decoding. This ugly code is mine :-)
 38 | 2. [`utf_x_if.ml`](utf_x_if.ml), is similar to adhoc, but uses `if`
 39 |    branches instead of a table, the result is a bit harder to comprehend.
 40 | 3. [`utf_x_pat.ml`](utf_x_pat.ml), is similar to adhoc but lets my
 41 |    favourite compiler work out the dispatch table itself by using pattern 
 42 |    matching on byte ranges. Yay !
 43 | 4. [`utf_x_dfa.ml`](utf_x_dfa.ml), uses the UTF-8 decoding MIT-licensed
 44 |    DFA devised by Bjoern Hoehrmann [here][dfa]. This uses a 364 bytes
 45 |    string for the DFA and makes the decoding code much more elegant (if 
 46 |    I hadn't to rewrite it to an imperative loop to make it more 
 47 |    efficient).
 48 |    
 49 | Rough benchmarks made with [`perf8.ml`](perf8.ml) and measured via
 50 | `time` on my machine seems to indicate we have, ordered by most
 51 | performant:  `pat`, `if`, `adhoc` and `dfa`.
 52 | 
 53 | The first three being quite close to each other (within
 54 | 10%). Comparing the fastest to the slowest, the `pat` version takes
 55 | the following percentage of the time taken by the `dfa` version.
 56 | 
 57 | * 62% [Markus Kuhn's UTF-8 demo][kuhn-utf-8]. A mixture of 
 58 |   things.
 59 | * 62% [Hindi wikipedia articles multistream][hindi-wiki] dataset.
 60 |   Exercises ASCII and [Devanagari][devanagari] decodes which are 
 61 |   three bytes long in UTF-8.
 62 | * 58% [Croatian wikipedia articles multistream][hr-wiki] dataset.
 63 |   Exercises ASCII and [Latin Extended-A][latin-ext-A] decodes which
 64 |   are two bytes long in UTF-8.
 65 | * 56% on this README, mainly ASCII.
 66 | 
 67 | So I think the best candidate for upstreaming would be `pat`.
 68 | 
 69 | ## Best-effort decodes and U+FFFD replacements
 70 | 
 71 | The API also tries to take into account recommended strategies for
 72 | U+FFFD replacement on best-effort decoding. For a discussion about
 73 | these see [here][how-many-urep].
 74 | 
 75 | If nothing special is done, e.g. like the `Seq` iterator in
 76 | [`examples.ml`](examples.ml) the resulting API should behave like the
 77 | "state machine view". Supposedly this is what is mandated by the
 78 | WHATWG [Encoding][whatwg-encoding] standard, if you can infer it
 79 | through their crazy prose code specifications – TUS has a description
 80 | of it on page 126 of the [version 14.0.0 text][tus] if you can
 81 | remember all the precise jargon definitions :-) I believe that a short and
 82 | concise spec of all that is:
 83 | 
 84 | > In case of decoding error return an U+FFFE, if you error on the
 85 | > first byte, consume the byte, otherwise consume the bytes preceeding
 86 | > the erroring byte.
 87 | 
 88 | The API provides enough information on decodes to implement other
 89 | replacement modes like one `Uchar.rep` per bogus byte.
 90 | 
 91 | The [`urep-test.html`](data/urep-test.html) and
 92 | [`urep-spec.html`](data/urep-spec.html) files taken from the above
 93 | mentioned page can be used to test the WHATWG encoding standard
 94 | behaviour (note that `Uutf` does not respect this).
 95 | 
 96 | [upstream-pr]: https://github.com/ocaml/ocaml/pull/10710 
 97 | [upstream]: https://github.com/ocaml/ocaml/issues/10660
 98 | [old-proposal]: https://gist.github.com/dbuenzli/211e1fb4d8dfce0d22c6d6616260cdd9
 99 | [alain-comment]: https://gist.github.com/dbuenzli/211e1fb4d8dfce0d22c6d6616260cdd9#gistcomment-2574875
100 | [how-many-urep]: https://hsivonen.fi/broken-utf-8/
101 | [tus]: http://www.unicode.org/versions/Unicode14.0.0/ch03.pdf
102 | [whatwg-encoding]: https://encoding.spec.whatwg.org/
103 | [dfa]: http://bjoern.hoehrmann.de/utf-8/decoder/dfa/#variations
104 | [hindi-wiki]: https://dumps.wikimedia.org/hiwiki/20210920/
105 | [hr-wiki]: https://dumps.wikimedia.org/hrwiki/20210920/
106 | [kuhn-utf-8]: https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-demo.txt
107 | [uutf-fold]: https://erratique.ch/software/uutf/doc/Uutf.String.html#1_Stringfolders
108 | [devanagari]: http://www.unicode.org/charts/PDF/U0900.pdf
109 | [latin-ext-a]: https://www.unicode.org/charts/PDF/U0100.pdf
110 | 


--------------------------------------------------------------------------------
/data/urep-spec.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | <html>
  3 | <head>
  4 | 	<meta charset="utf-8">
  5 | 	<title>Broken UTF-8</title>
  6 | 	<style>
  7 | 	* {
  8 | 		font-size: 1em;
  9 | 	}
 10 | 	</style>
 11 | </head>
 12 | <body>
 13 | <h1>Broken UTF-8</h1>
 14 | <p>Any copyright to this file is dedicated to the Public Domain. <a href=https://creativecommons.org/publicdomain/zero/1.0/>https://creativecommons.org/publicdomain/zero/1.0/</a></p>
 15 | <p>Five-byte and six-byte sequences were defined in <a href=https://tools.ietf.org/html/rfc2279>RFC 2297</a> but are no longer part of the UTF-8 definition.
 16 | <h2>Non-shortest forms for lowest single-byte (U+0000)</h2>
 17 | <dl>
 18 | <dt>Two-byte sequence (C0 80)</dt>
 19 | <dd>��</dd>
 20 | <dt>Three-byte sequence (E0 80 80)</dt>
 21 | <dd>���</dd>
 22 | <dt>Four-byte sequence (F0 80 80 80)</dt>
 23 | <dd>����</dd>
 24 | <dt>Five-byte sequence (F8 80 80 80 80)</dt>
 25 | <dd>�����</dd>
 26 | <dt>Six-byte sequence (FC 80 80 80 80 80)</dt>
 27 | <dd>������</dd>
 28 | </dl>
 29 | <h2>Non-shortest forms for highest single-byte (U+007F)</h2>
 30 | <dl>
 31 | <dt>Two-byte sequence (C1 BF)</dt>
 32 | <dd>��</dd>
 33 | <dt>Three-byte sequence (E0 81 BF)</dt>
 34 | <dd>���</dd>
 35 | <dt>Four-byte sequence (F0 80 81 BF)</dt>
 36 | <dd>����</dd>
 37 | <dt>Five-byte sequence (F8 80 80 81 BF)</dt>
 38 | <dd>�����</dd>
 39 | <dt>Six-byte sequence (FC 80 80 80 81 BF)</dt>
 40 | <dd>������</dd>
 41 | </dl>
 42 | <h2>Non-shortest forms for lowest two-byte (U+0080)</h2>
 43 | <dl>
 44 | <dt>Three-byte sequence (E0 82 80)</dt>
 45 | <dd>���</dd>
 46 | <dt>Four-byte sequence (F0 80 82 80)</dt>
 47 | <dd>����</dd>
 48 | <dt>Five-byte sequence (F8 80 80 82 80)</dt>
 49 | <dd>�����</dd>
 50 | <dt>Six-byte sequence (FC 80 80 80 82 80)</dt>
 51 | <dd>������</dd>
 52 | </dl>
 53 | <h2>Non-shortest forms for highest two-byte (U+07FF)</h2>
 54 | <dl>
 55 | <dt>Three-byte sequence (E0 9F BF)</dt>
 56 | <dd>���</dd>
 57 | <dt>Four-byte sequence (F0 80 9F BF)</dt>
 58 | <dd>����</dd>
 59 | <dt>Five-byte sequence (F8 80 80 9F BF)</dt>
 60 | <dd>�����</dd>
 61 | <dt>Six-byte sequence (FC 80 80 80 9F BF)</dt>
 62 | <dd>������</dd>
 63 | </dl>
 64 | <h2>Non-shortest forms for lowest three-byte (U+0800)</h2>
 65 | <dl>
 66 | <dt>Four-byte sequence (F0 80 A0 80)</dt>
 67 | <dd>����</dd>
 68 | <dt>Five-byte sequence (F8 80 80 A0 80)</dt>
 69 | <dd>�����</dd>
 70 | <dt>Six-byte sequence (FC 80 80 80 A0 80)</dt>
 71 | <dd>������</dd>
 72 | </dl>
 73 | <h2>Non-shortest forms for highest three-byte (U+FFFF)</h2>
 74 | <dl>
 75 | <dt>Four-byte sequence (F0 8F BF BF)</dt>
 76 | <dd>����</dd>
 77 | <dt>Five-byte sequence (F8 80 8F BF BF)</dt>
 78 | <dd>�����</dd>
 79 | <dt>Six-byte sequence (FC 80 80 8F BF BF)</dt>
 80 | <dd>������</dd>
 81 | </dl>
 82 | <h2>Non-shortest forms for lowest four-byte (U+10000)</h2>
 83 | <dl>
 84 | <dt>Five-byte sequence (F8 80 90 80 80)</dt>
 85 | <dd>�����</dd>
 86 | <dt>Six-byte sequence (FC 80 80 90 80 80)</dt>
 87 | <dd>������</dd>
 88 | </dl>
 89 | <h2>Non-shortest forms for last Unicode (U+10FFFF)</h2>
 90 | <dl>
 91 | <dt>Five-byte sequence (F8 84 8F BF BF)</dt>
 92 | <dd>�����</dd>
 93 | <dt>Six-byte sequence (FC 80 84 8F BF BF)</dt>
 94 | <dd>������</dd>
 95 | </dl>
 96 | <h2>Out of range</h2>
 97 | <dl>
 98 | <dt>One past Unicode (F4 90 80 80)</dt>
 99 | <dd>����</dd>
100 | <dt>Longest five-byte sequence (FB BF BF BF BF)</dt>
101 | <dd>�����</dd>
102 | <dt>Longest six-byte sequence (FD BF BF BF BF BF)</dt>
103 | <dd>������</dd>
104 | <dt>First surrogate (ED A0 80)</dt>
105 | <dd>���</dd>
106 | <dt>Last surrogate (ED BF BF)</dt>
107 | <dd>���</dd>
108 | <dt>CESU-8 surrogate pair (ED A0 BD ED B2 A9)</dt>
109 | <dd>������</dd>
110 | </dl>
111 | <h2>Out of range and non-shortest</h2>
112 | <dl>
113 | <dt>One past Unicode as five-byte sequence (F8 84 90 80 80)</dt>
114 | <dd>�����</dd>
115 | <dt>One past Unicode as six-byte sequence (FC 80 84 90 80 80)</dt>
116 | <dd>������</dd>
117 | <dt>First surrogate as four-byte sequence (F0 8D A0 80)</dt>
118 | <dd>����</dd>
119 | <dt>Last surrogate as four-byte sequence (F0 8D BF BF)</dt>
120 | <dd>����</dd>
121 | <dt>CESU-8 surrogate pair as two four-byte overlongs (F0 8D A0 BD F0 8D B2 A9)</dt>
122 | <dd>��������</dd>
123 | </dl>
124 | <h2>Lone trails</h2>
125 | <dl>
126 | <dt>One (80)</dt>
127 | <dd>�</dd>
128 | <dt>Two (80 80)</dt>
129 | <dd>��</dd>
130 | <dt>Three (80 80 80)</dt>
131 | <dd>���</dd>
132 | <dt>Four (80 80 80 80)</dt>
133 | <dd>����</dd>
134 | <dt>Five (80 80 80 80 80)</dt>
135 | <dd>�����</dd>
136 | <dt>Six (80 80 80 80 80 80)</dt>
137 | <dd>������</dd>
138 | <dt>Seven (80 80 80 80 80 80 80)</dt>
139 | <dd>�������</dd>
140 | <dt>After valid two-byte (C2 B6 80)</dt>
141 | <dd>¶�</dd>
142 | <dt>After valid three-byte (E2 98 83 80)</dt>
143 | <dd>☃�</dd>
144 | <dt>After valid four-byte (F0 9F 92 A9 80)</dt>
145 | <dd>💩�</dd>
146 | <dt>After five-byte (FB BF BF BF BF 80)</dt>
147 | <dd>������</dd>
148 | <dt>After six-byte (FD BF BF BF BF BF 80)</dt>
149 | <dd>�������</dd>
150 | </dl>
151 | <h2>Truncated sequences</h2>
152 | <dl>
153 | <dt>Two-byte lead (C2)</dt>
154 | <dd>�</dd>
155 | <dt>Three-byte lead (E2)</dt>
156 | <dd>�</dd>
157 | <dt>Three-byte lead and one trail (E2 98)</dt>
158 | <dd>�</dd>
159 | <dt>Four-byte lead (F0)</dt>
160 | <dd>�</dd>
161 | <dt>Four-byte lead and one trail (F0 9F)</dt>
162 | <dd>�</dd>
163 | <dt>Four-byte lead and two trails (F0 9F 92)</dt>
164 | <dd>�</dd>
165 | </dl>
166 | <h2>Leftovers</h2>
167 | <dl>
168 | <dt>FE (FE)</dt>
169 | <dd>�</dd>
170 | <dt>FE and trail (FE 80)</dt>
171 | <dd>��</dd>
172 | <dt>FF (FF)</dt>
173 | <dd>�</dd>
174 | <dt>FF and trail (FF 80)</dt>
175 | <dd>��</dd>
176 | </dl>
177 | </body>
178 | </html>
179 | 


--------------------------------------------------------------------------------
/data/urep-test.html:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dbuenzli/stdlib-utf/3b9a96e7530830bd5ea87c3c92dc9e9bd02d4dee/data/urep-test.html


--------------------------------------------------------------------------------
/examples.ml:
--------------------------------------------------------------------------------
 1 | (**************************************************************************)
 2 | (*                                                                        *)
 3 | (*                                 OCaml                                  *)
 4 | (*                                                                        *)
 5 | (*                         The OCaml programmers                          *)
 6 | (*                                                                        *)
 7 | (*   Copyright 2021 Institut National de Recherche en Informatique et     *)
 8 | (*     en Automatique.                                                    *)
 9 | (*                                                                        *)
10 | (*   All rights reserved.  This file is distributed under the terms of    *)
11 | (*   the GNU Lesser General Public License version 2.1, with the          *)
12 | (*   special exception on linking described in the file LICENSE.          *)
13 | (*                                                                        *)
14 | (**************************************************************************)
15 | 
16 | open Utf_x_adhoc
17 | 
18 | (* Uutf style folders
19 |    https://erratique.ch/software/uutf/doc/Uutf.String.html *)
20 | 
21 | type utf_decode = [ `Malformed of string | `Uchar of Uchar.t ]
22 | type 'a folder = 'a -> int -> utf_decode -> 'a
23 | 
24 | let fold_utf_8 : 'a folder -> 'a -> string -> 'a = fun f acc s ->
25 |   let rec loop b i acc =
26 |     if i >= Bytes.length b then acc else
27 |     let dec = Bytes.get_utf_8_uchar b i in
28 |     let n = Uchar.utf_decode_length dec in
29 |     let i' = i + n in
30 |     match Uchar.utf_decode_is_valid dec with
31 |     | false -> loop b i' (f acc i (`Malformed (String.sub s i n)))
32 |     | true -> loop b i' (f acc i (`Uchar (Uchar.utf_decode_uchar dec)))
33 |   in
34 |   loop (Bytes.unsafe_of_string s) 0 acc
35 | 
36 | (* Seq based iterators.
37 | 
38 |    Replace malformations by {!Uchar.rep} using the WHATWG Encoding
39 |    standard, see https://hsivonen.fi/broken-utf-8/. It's nice that no
40 |    particular logic has to be added for decode errors. *)
41 | 
42 | let utf_8_string_to_uchar_seq  : string -> Uchar.t Seq.t = fun s ->
43 |   let rec uchar b i () =
44 |     if i >= Bytes.length b then Seq.Nil else
45 |     let dec = Bytes.get_utf_8_uchar b i in
46 |     let u = Uchar.utf_decode_uchar dec in
47 |     Seq.Cons (u, uchar b (i + Uchar.utf_decode_length dec))
48 |   in
49 |   uchar (Bytes.unsafe_of_string s) 0
50 | 


--------------------------------------------------------------------------------
/old-proposal.md:
--------------------------------------------------------------------------------
 1 | # Adding UTF-X decoding support to the stdlib 
 2 | 
 3 | Since 4.06 we have UTF-X encoding support via the `Buffer` module. The
 4 | following is a proposal to add UTF-X *decoding* (and `bytes` encoding)
 5 | support with the following goals:
 6 | 
 7 | 1. Provide a low-level, allocation-free codec API in `bytes`. This API
 8 |    provides all the information needed to implement loops for making
 9 |    higher-level UTF-X codec APIs (e.g. `Uutf`'s [folding](http://erratique.ch/software/uutf/doc/Uutf.String.html#1_Stringfolders) functions) 
10 |    operating on `bytes` and `string` values. It's not geared towards the end-user 
11 |    but rather towards library programmers.
12 |    
13 | 2. Provide a less performant and less precise (no way to detect
14 |    decoding errors; `Uchar.rep` replacement) decoding convenience API
15 |    based on the `Uchar.t Seq.t` datatype for both `Bytes` and `String`.
16 | 
17 | ## High-level `Seq` based API (`Bytes` and `String`)
18 | 
19 | In both `Bytes` and `String` we add the following high-level interface
20 | (implemented via the low-level interface):
21 | 
22 | ```ocaml
23 | val utf_8_to_uchar_seq : ?start:int -> ?len:int -> t -> Uchar.t Seq.t
24 | val utf_16le_to_uchar_seq : ?start:int -> ?len:int -> t -> Uchar.t Seq.t
25 | val utf_16be_to_uchar_seq : ?start:int -> ?len:int -> t -> Uchar.t Seq.t
26 | ```
27 | 
28 | These are "best-effort" functions: any decoding error is substituted by
29 | a `Uchar.rep` character until a new synchronizing byte is found.
30 | 
31 | The `utf_{8,16le,16be}_of_uchar_seq` functions would be a bit difficult to
32 | provide due to dependencies (if implemented via `Buffer`), so
33 | we leave them out.
34 | 
35 | We could add `Buffer.add_utf_{8,16le,16,be}_uchar_seq` but that's just
36 | a `Seq.iter` away with the existing `Buffer.add_utf_{8,16le,16be}_uchar`.
37 | So we stop here w.r.t. high-level API.
38 | 
39 | ## Low-level `Bytes` API 
40 | 
41 | We begin with the following additions to the `Uchar` module: 
42 | 
43 | ```ocaml
44 | val utf_8_byte_length : Uchar.t -> int
45 | val utf_16_byte_length : Uchar.t -> int
46 | ```
47 | 
48 | In the `Bytes` module we then add the following:
49 | 
50 | ```ocaml
51 | exception Utf_error of int
52 | (** The control flow exception raised by
53 |     [{get,set}_uchar_utf_{8,16le,16be}] functions. The integer is an
54 |     index where a new decode/encode can be tried or the length of the
55 |     buffer if the end was reached. {b Important.} This exception is used
56 |     for control flow you must not let it uncaught, it is raised with notrace. *)
57 | 
58 | val get_uchar_utf_8_length : t -> int -> int
59 | (** [get_uchar_utf_8_length b i] is the byte length an UTF-8 encoded
60 |     character at index [i] in [b]. This is [0] if the byte is not a valid
61 |     UTF-8 starter byte. *)
62 | 
63 | val get_uchar_utf_8 : t -> int -> Uchar.t
64 | (** [get_uchar_utf_8 b i] is the UTF-8 encoded Unicode character at index
65 |     [i] in [b]. The number of bytes consumed by the decode can be
66 |     determined by [get_uchar_utf_8_length b i].  In case of errors
67 |     @raise Utf_error [n] with [n] the next index were a new decode
68 |     can be tried or [length b] if there is no such index. *)
69 | 
70 | val get_uchar_utf_16le : t -> int -> Uchar.t
71 | (** [get_uchar_utf_16le b i] is the UTF-16LE encoded Unicode character at
72 |     index [i] in [b]. The number of bytes consumed by the decode can
73 |     be determined by using {!Uchar.utf_16_byte_length} on the
74 |     result. @raise Utf_error [n] with [n] the next index were a new
75 |     decode can be tried or [length b] if there is no such index. *)
76 | 
77 | val get_uchar_utf_16be : t -> int -> Uchar.t
78 | (** [get_uchar_utf_16be] is like {!get_uchar_utf16_le} but decodes
79 |     UTF-16BE. *)
80 | 
81 | val set_uchar_utf_8 : t -> int -> Uchar.t -> int
82 | (** [set_uchar_utf_8 b i u] sets the UTF-8 encoding of [u] in [b]
83 |     starting at [i] and returns the next index were an encoding can
84 |     be performed or the buffer length if there is no such
85 |     index. @raise Utf_error [n] if there's not enough space for the
86 |     encode. *)
87 | 
88 | val set_utf_16le : t -> int -> Uchar.t -> int
89 | (** [set_utf_16le] is like {!set_uchar_utf_8} but UTF-16LE encodes. *)
90 | 
91 | val set_utf_16be : t -> int -> Uchar.t -> int
92 | (** [set_utf_16be] is like {!set_uchar_utf_8} but UTF-16BE encodes. *)
93 | ```
94 | 
95 | For codecing sequences of `Uchar.t` as loops:
96 | 
97 | 1. The `set` functions are good.
98 | 2. The `get` functions incur a bit of overhead to compute the next index (the invoked function has the information but we can't return it without allocating). For UTF-8 either we either get the next index via `get_uchar_utf_8_length` (assuming a table-based implementation: two memory accesses) or with `Uchar.utf_8_byte_length` on the result (up to three branches). For UTF-16 we use `Uchar.utf_16_byte_length` (one branch).
99 | 


--------------------------------------------------------------------------------
/perf8.ml:
--------------------------------------------------------------------------------
  1 | (**************************************************************************)
  2 | (*                                                                        *)
  3 | (*                                 OCaml                                  *)
  4 | (*                                                                        *)
  5 | (*                         The OCaml programmers                          *)
  6 | (*                                                                        *)
  7 | (*   Copyright 2021 Institut National de Recherche en Informatique et     *)
  8 | (*     en Automatique.                                                    *)
  9 | (*                                                                        *)
 10 | (*   All rights reserved.  This file is distributed under the terms of    *)
 11 | (*   the GNU Lesser General Public License version 2.1, with the          *)
 12 | (*   special exception on linking described in the file LICENSE.          *)
 13 | (*                                                                        *)
 14 | (**************************************************************************)
 15 | 
 16 | 
 17 | let log fmt = Printf.printf (fmt ^^ "\n%!")
 18 | 
 19 | let impl_to_string = function
 20 | | `Adhoc -> "ADHOC" | `Dfa -> "DFA" | `If -> "IF" | `Pat -> "PAT"
 21 | | `Uutf -> "UUTF"
 22 | 
 23 | (* Data preparation. We take any file read it and duplicate
 24 |    its content until we get at least a 1GB string. *)
 25 | 
 26 | let string_of_file file =
 27 |   let ic = open_in_bin file in
 28 |   let len = in_channel_length ic in
 29 |   let buf = Bytes.create len in
 30 |   really_input ic buf 0 len; close_in ic; Bytes.unsafe_to_string buf
 31 | 
 32 | let make_it_approx_1GB s =
 33 |   let one_gb = 1024 * 1024 * 1024 in
 34 |   let len = String.length s in
 35 |   if len >= 1024 * 1024 * 1024 then s else
 36 |   let count = one_gb / len in
 37 |   let rec rep acc n = if n <= 0 then acc else rep (s :: acc) (n - 1) in
 38 |   String.concat "" (rep [] count)
 39 | 
 40 | let get_data file =
 41 |   log "Preparing data with %s" file;
 42 |   let s = make_it_approx_1GB (string_of_file file) in
 43 |   let size_gb = float (String.length s) /. 1024. /. 1024. /. 1024. in
 44 |   s, size_gb
 45 | 
 46 | (* Best-effort decode *)
 47 | 
 48 | let adhoc_best_effort_decode s =
 49 |   let rec loop b i =
 50 |     if i >= Utf_x_adhoc.Bytes.length b then () else
 51 |     let d = Utf_x_adhoc.Bytes.get_utf_8_uchar b i in
 52 |     let used = Utf_x_adhoc.Uchar.utf_decode_length d in
 53 |     let u = Utf_x_adhoc.Uchar.utf_decode_uchar d in
 54 |     Sys.opaque_identity @@ (ignore u);
 55 |     loop b (i + used)
 56 |   in
 57 |   loop (Bytes.unsafe_of_string s) 0
 58 | 
 59 | let dfa_best_effort_decode s =
 60 |   let rec loop b i =
 61 |     if i >= Utf_x_dfa.Bytes.length b then () else
 62 |     let d = Utf_x_dfa.Bytes.get_utf_8_uchar b i in
 63 |     let used = Utf_x_dfa.Uchar.utf_decode_length d in
 64 |     let u = Utf_x_dfa.Uchar.utf_decode_uchar d in
 65 |     Sys.opaque_identity @@ (ignore u);
 66 |     loop b (i + used)
 67 |   in
 68 |   loop (Bytes.unsafe_of_string s) 0
 69 | 
 70 | let if_best_effort_decode s =
 71 |   let rec loop b i =
 72 |     if i >= Utf_x_if.Bytes.length b then () else
 73 |     let d = Utf_x_if.Bytes.get_utf_8_uchar b i in
 74 |     let used = Utf_x_if.Uchar.utf_decode_length d in
 75 |     let u = Utf_x_if.Uchar.utf_decode_uchar d in
 76 |     Sys.opaque_identity @@ (ignore u);
 77 |     loop b (i + used)
 78 |   in
 79 |   loop (Bytes.unsafe_of_string s) 0
 80 | 
 81 | let pat_best_effort_decode s =
 82 |   let rec loop b i =
 83 |     if i >= Utf_x_pat.Bytes.length b then () else
 84 |     let d = Utf_x_pat.Bytes.get_utf_8_uchar b i in
 85 |     let used = Utf_x_pat.Uchar.utf_decode_length d in
 86 |     let u = Utf_x_pat.Uchar.utf_decode_uchar d in
 87 |     Sys.opaque_identity @@ (ignore u);
 88 |     loop b (i + used)
 89 |   in
 90 |   loop (Bytes.unsafe_of_string s) 0
 91 | 
 92 | let uutf_best_effort_decode s =
 93 |   let f () _ = function
 94 |   | `Uchar u -> Sys.opaque_identity @@ ignore u
 95 |   | `Malformed u -> Sys.opaque_identity @@ ignore Uutf.u_rep
 96 |   in
 97 |   Uutf.String.fold_utf_8 f () s
 98 | 
 99 | let best_effort_decode impl file =
100 |   let s, size_gb = get_data file in
101 |   Gc.full_major ();
102 |   log "Decoding %.2fGB of data with %s" size_gb (impl_to_string impl);
103 |   match impl with
104 |   | `Adhoc -> adhoc_best_effort_decode s
105 |   | `Dfa -> dfa_best_effort_decode s
106 |   | `If -> if_best_effort_decode s
107 |   | `Pat -> pat_best_effort_decode s
108 |   | `Uutf -> uutf_best_effort_decode s
109 | 
110 | (* Recode *)
111 | 
112 | let adhoc_recode s =
113 |   let rec loop b i b' j =
114 |     if i >= Utf_x_adhoc.Bytes.length b then (Bytes.unsafe_to_string b') else
115 |     let d = Utf_x_adhoc.Bytes.get_utf_8_uchar b i in
116 |     match Utf_x_adhoc.Uchar.utf_decode_is_valid d with
117 |     | false -> raise Exit
118 |     | true ->
119 |         let used = Utf_x_adhoc.Uchar.utf_decode_length d in
120 |         let u = Utf_x_adhoc.Uchar.utf_decode_uchar d in
121 |         match Utf_x_adhoc.Bytes.set_utf_8_uchar b' j u with
122 |         | 0 -> raise Exit
123 |         | used' -> loop b (i + used) b' (j + used')
124 |   in
125 |   try
126 |     let b' = Bytes.create (String.length s) in
127 |     Some (loop (Bytes.unsafe_of_string s) 0 b' 0)
128 |   with Exit -> None
129 | 
130 | let dfa_recode s =
131 |   let rec loop b i b' j =
132 |     if i >= Utf_x_dfa.Bytes.length b then (Bytes.unsafe_to_string b') else
133 |     let d = Utf_x_dfa.Bytes.get_utf_8_uchar b i in
134 |     match Utf_x_dfa.Uchar.utf_decode_is_valid d with
135 |     | false -> raise Exit
136 |     | true ->
137 |         let used = Utf_x_dfa.Uchar.utf_decode_length d in
138 |         let u = Utf_x_dfa.Uchar.utf_decode_uchar d in
139 |         match Utf_x_dfa.Bytes.set_utf_8_uchar b' j u with
140 |         | 0 -> raise Exit
141 |         | used' -> loop b (i + used) b' (j + used')
142 |   in
143 |   try
144 |     let b' = Bytes.create (String.length s) in
145 |     Some (loop (Bytes.unsafe_of_string s) 0 b' 0)
146 |   with Exit -> None
147 | 
148 | let if_recode s =
149 |   let rec loop b i b' j =
150 |     if i >= Utf_x_if.Bytes.length b then (Bytes.unsafe_to_string b') else
151 |     let d = Utf_x_if.Bytes.get_utf_8_uchar b i in
152 |     match Utf_x_if.Uchar.utf_decode_is_valid d with
153 |     | false -> raise Exit
154 |     | true ->
155 |         let used = Utf_x_if.Uchar.utf_decode_length d in
156 |         let u = Utf_x_if.Uchar.utf_decode_uchar d in
157 |         match Utf_x_if.Bytes.set_utf_8_uchar b' j u with
158 |         | 0 -> raise Exit
159 |         | used' -> loop b (i + used) b' (j + used')
160 |   in
161 |   try
162 |     let b' = Bytes.create (String.length s) in
163 |     Some (loop (Bytes.unsafe_of_string s) 0 b' 0)
164 |   with Exit -> None
165 | 
166 | let pat_recode s =
167 |   let rec loop b i b' j =
168 |     if i >= Utf_x_pat.Bytes.length b then (Bytes.unsafe_to_string b') else
169 |     let d = Utf_x_pat.Bytes.get_utf_8_uchar b i in
170 |     match Utf_x_pat.Uchar.utf_decode_is_valid d with
171 |     | false -> raise Exit
172 |     | true ->
173 |         let used = Utf_x_pat.Uchar.utf_decode_length d in
174 |         let u = Utf_x_pat.Uchar.utf_decode_uchar d in
175 |         match Utf_x_pat.Bytes.set_utf_8_uchar b' j u with
176 |         | 0 -> raise Exit
177 |         | used' -> loop b (i + used) b' (j + used')
178 |   in
179 |   try
180 |     let b' = Bytes.create (String.length s) in
181 |     Some (loop (Bytes.unsafe_of_string s) 0 b' 0)
182 |   with Exit -> None
183 | 
184 | let uutf_recode s =
185 |   let f b _ = function
186 |   | `Uchar u -> Buffer.add_utf_8_uchar b u; b
187 |   | `Malformed u -> raise Exit
188 |   in
189 |   try
190 |     let b = Buffer.create (String.length s) in
191 |     let b = Uutf.String.fold_utf_8 f b s in
192 |     Some (Buffer.contents b)
193 |   with Exit -> None
194 | 
195 | let recode impl file =
196 |   let s, size_gb = get_data file in
197 |   Gc.full_major ();
198 |   log "Recoding %.2fGB of data with %s decode" size_gb (impl_to_string impl);
199 |   let s' = match impl with
200 |   | `Adhoc -> adhoc_recode s
201 |   | `Dfa -> dfa_recode s
202 |   | `If -> if_recode s
203 |   | `Pat -> pat_recode s
204 |   | `Uutf -> uutf_recode s
205 |   in
206 |   match s' with
207 |   | None -> log "Recode failure, non valid input data ?"
208 |   | Some s' when String.equal s s' -> log "Recode success!"
209 |   | Some _ -> assert false
210 | 
211 | (* Validate *)
212 | 
213 | let adhoc_validate = Utf_x_adhoc.Bytes.is_valid_utf_8
214 | let dfa_validate = Utf_x_dfa.Bytes.is_valid_utf_8
215 | let if_validate = Utf_x_if.Bytes.is_valid_utf_8
216 | let pat_validate = Utf_x_pat.Bytes.is_valid_utf_8
217 | let uutf_validate s =
218 |   let f () _ = function
219 |   | `Uchar u -> Sys.opaque_identity @@ ignore u
220 |   | `Malformed u -> raise Exit
221 |   in
222 |   try Uutf.String.fold_utf_8 f () s; true with
223 |   | Exit -> false
224 | 
225 | let validate impl file =
226 |   let s, size_gb = get_data file in
227 |   log "Validating %.2fGB of data with %s" size_gb (impl_to_string impl);
228 |   let valid = match impl with
229 |   | `Adhoc -> adhoc_validate (Bytes.unsafe_of_string s)
230 |   | `Dfa -> dfa_validate (Bytes.unsafe_of_string s)
231 |   | `If -> if_validate (Bytes.unsafe_of_string s)
232 |   | `Pat -> pat_validate (Bytes.unsafe_of_string s)
233 |   | `Uutf -> uutf_validate s
234 |   in
235 |   log "Valid: %b" valid
236 | 
237 | (* Command line interface *)
238 | 
239 | let do_cmd cmd impl file = match cmd with
240 | | `Decode -> best_effort_decode impl file
241 | | `Recode -> recode impl file
242 | | `Validate -> validate impl file
243 | 
244 | let main () =
245 |   let usage = "Usage: perf8 [--adhoc | --dfa | --if | --pat | --uutf] FILE" in
246 |   let impl = ref `Adhoc in
247 |   let cmd = ref `Decode in
248 |   let args =
249 |     [ "--adhoc", Arg.Unit (fun () -> impl := `Adhoc),
250 |       "Test the adhoc implementation (default).";
251 |       "--dfa", Arg.Unit (fun () -> impl := `Dfa),
252 |       "Test the DFA implementation";
253 |       "--if", Arg.Unit (fun () -> impl := `If),
254 |       "Test the if branches implementation.";
255 |       "--pat", Arg.Unit (fun () -> impl := `Pat),
256 |       "Test the pattern implementation.";
257 |       "--uutf", Arg.Unit (fun () -> impl := `Uutf),
258 |       "Test the Uutf implementation.";
259 |       "--decode", Arg.Unit (fun () -> cmd := `Decode),
260 |       "Decode the UTF-8 data.";
261 |       "--validate", Arg.Unit (fun () -> cmd := `Validate),
262 |       "Validate the UTF-8 data.";
263 |       "--recode", Arg.Unit (fun () -> cmd := `Recode),
264 |       "Decode and recode the UTF-8 data."]
265 |   in
266 |   let file = ref None in
267 |   let set_file s = file := Some s in
268 |   Arg.parse args set_file usage;
269 |   match !file with
270 |   | None -> Printf.eprintf "perf8: missing FILE argument\n%s\n" usage; exit 1
271 |   | Some file -> do_cmd !cmd !impl file
272 | 
273 | let () = if !Sys.interactive then () else main ()
274 | 


--------------------------------------------------------------------------------
/test.ml:
--------------------------------------------------------------------------------
  1 | (**************************************************************************)
  2 | (*                                                                        *)
  3 | (*                                 OCaml                                  *)
  4 | (*                                                                        *)
  5 | (*                         The OCaml programmers                          *)
  6 | (*                                                                        *)
  7 | (*   Copyright 2021 Institut National de Recherche en Informatique et     *)
  8 | (*     en Automatique.                                                    *)
  9 | (*                                                                        *)
 10 | (*   All rights reserved.  This file is distributed under the terms of    *)
 11 | (*   the GNU Lesser General Public License version 2.1, with the          *)
 12 | (*   special exception on linking described in the file LICENSE.          *)
 13 | (*                                                                        *)
 14 | (**************************************************************************)
 15 | 
 16 | (* To add to the compiler test suite. *)
 17 | 
 18 | open Utf_x_pat
 19 | 
 20 | let fold_uchars f acc =
 21 |   let rec loop f acc u =
 22 |     let acc = f acc u in
 23 |     if Uchar.equal u Uchar.max then acc else loop f acc (Uchar.succ u)
 24 |   in
 25 |   loop f acc Uchar.min
 26 | 
 27 | (* This tests that we encode and decode each character according
 28 |    to its specification. *)
 29 | 
 30 | let utf_8_spec =
 31 |   (* UTF-8 byte sequences, cf. table 3.7 Unicode 14. *)
 32 |   [(0x0000,0x007F),     [|(0x00,0x7F)|];
 33 |    (0x0080,0x07FF),     [|(0xC2,0xDF); (0x80,0xBF)|];
 34 |    (0x0800,0x0FFF),     [|(0xE0,0xE0); (0xA0,0xBF); (0x80,0xBF)|];
 35 |    (0x1000,0xCFFF),     [|(0xE1,0xEC); (0x80,0xBF); (0x80,0xBF)|];
 36 |    (0xD000,0xD7FF),     [|(0xED,0xED); (0x80,0x9F); (0x80,0xBF)|];
 37 |    (0xE000,0xFFFF),     [|(0xEE,0xEF); (0x80,0xBF); (0x80,0xBF)|];
 38 |    (0x10000,0x3FFFF),   [|(0xF0,0xF0); (0x90,0xBF); (0x80,0xBF); (0x80,0xBF)|];
 39 |    (0x40000,0xFFFFF),   [|(0xF1,0xF3); (0x80,0xBF); (0x80,0xBF); (0x80,0xBF)|];
 40 |    (0x100000,0x10FFFF), [|(0xF4,0xF4); (0x80,0x8F); (0x80,0xBF); (0x80,0xBF)|]]
 41 | 
 42 | let utf_16be_spec =
 43 |   (* UTF-16BE byte sequences, derived from table 3.5 Unicode 14. *)
 44 |   [(0x0000,0xD7FF),    [|(0x00,0xD7); (0x00,0xFF)|];
 45 |    (0xE000,0xFFFF),    [|(0xE0,0xFF); (0x00,0xFF)|];
 46 |    (0x10000,0x10FFFF), [|(0xD8,0xDB); (0x00,0xFF); (0xDC,0xDF); (0x00,0xFF)|]]
 47 | 
 48 | let uchar_map_of_spec spec =
 49 |   (* array mapping Uchar.t as ints to byte sequences according to [spec]. *)
 50 |   let map = Array.make ((Uchar.to_int Uchar.max) + 1) Bytes.empty in
 51 |   let add_range ((umin, umax), bytes) =
 52 |     let len = Array.length bytes in
 53 |     let bmin i = if i < len then fst bytes.(i) else max_int in
 54 |     let bmax i = if i < len then snd bytes.(i) else min_int in
 55 |     let uchar = ref umin in
 56 |     let buf = Bytes.create len in
 57 |     let add len' = match len = len' with
 58 |     | false -> ()
 59 |     | true -> map.(!uchar) <- Bytes.copy buf; incr uchar
 60 |     in
 61 |     for b0 = bmin 0 to bmax 0 do Bytes.set_uint8 buf 0 b0;
 62 |       for b1 = bmin 1 to bmax 1 do Bytes.set_uint8 buf 1 b1;
 63 |         for b2 = bmin 2 to bmax 2 do Bytes.set_uint8 buf 2 b2;
 64 |           for b3 = bmin 3 to bmax 3 do Bytes.set_uint8 buf 3 b3; add 4
 65 |           done; add 3;
 66 |         done; add 2;
 67 |       done; add 1;
 68 |     done; assert (!uchar - 1 = umax)
 69 |   in
 70 |   List.iter add_range spec;
 71 |   map
 72 | 
 73 | let uchar_map_get u map = map.(Uchar.to_int u)
 74 | let utf_8 = uchar_map_of_spec utf_8_spec
 75 | let utf_16be = uchar_map_of_spec utf_16be_spec
 76 | let utf_16le =
 77 |   let swap u b =
 78 |     let len = Bytes.length b in
 79 |     if len = 0 then () else
 80 |     for i = 0 to Bytes.length b / 2 - 1 do
 81 |       let j = i * 2 in
 82 |       Bytes.set_uint16_le b j (Bytes.get_uint16_be b j);
 83 |     done;
 84 |   in
 85 |   let map = Array.map Bytes.copy utf_16be in
 86 |   Array.iteri swap map; map
 87 | 
 88 | let test_utf utf utf_len get_utf set_utf utf_is_valid =
 89 |   (* Test codec and validation of each Uchar.t against the spec. *)
 90 |   let f () u =
 91 |     let utf_len = utf_len u in
 92 |     let buf = Bytes.create utf_len in
 93 |     assert (set_utf buf 0 u = utf_len);
 94 |     assert (Bytes.equal buf (uchar_map_get u utf));
 95 |     assert (Bytes.equal buf (uchar_map_get u utf));
 96 |     let dec = get_utf buf 0 in
 97 |     assert (Uchar.utf_decode_is_valid dec);
 98 |     assert (Uchar.utf_decode_length dec = utf_len);
 99 |     assert (Uchar.equal (Uchar.utf_decode_uchar dec) u);
100 |     assert (utf_is_valid buf);
101 |     ()
102 |   in
103 |   fold_uchars f ()
104 | 
105 | let () =
106 |   test_utf utf_8 Uchar.utf_8_byte_length
107 |     Bytes.get_utf_8_uchar Bytes.set_utf_8_uchar Bytes.is_valid_utf_8
108 | 
109 | let () =
110 |   test_utf utf_16be Uchar.utf_16_byte_length
111 |     Bytes.get_utf_16be_uchar Bytes.set_utf_16be_uchar Bytes.is_valid_utf_16be
112 | 
113 | let () =
114 |   test_utf utf_16le Uchar.utf_16_byte_length
115 |     Bytes.get_utf_16le_uchar Bytes.set_utf_16le_uchar Bytes.is_valid_utf_16le
116 | 
117 | let () =
118 |   (* Test out of bounds *)
119 |   let raises f = assert (try f (); false with Invalid_argument _ -> true) in
120 |   (raises @@ fun () -> Bytes.get_utf_8_uchar Bytes.empty 0);
121 |   (raises @@ fun () -> Bytes.set_utf_8_uchar Bytes.empty 0 Uchar.min);
122 |   (raises @@ fun () -> Bytes.get_utf_16le_uchar Bytes.empty 0);
123 |   (raises @@ fun () -> Bytes.set_utf_16le_uchar Bytes.empty 0 Uchar.min);
124 |   (raises @@ fun () -> Bytes.get_utf_16be_uchar Bytes.empty 0);
125 |   (raises @@ fun () -> Bytes.set_utf_16be_uchar Bytes.empty 0 Uchar.min);
126 |   ()
127 | 
128 | let () =
129 |   (* Test lack of space encodes *)
130 |   let b = Bytes.make 1 '\xab' in
131 |   assert (Bytes.set_utf_8_uchar b 0 Uchar.max = 0 && Bytes.get b 0 = '\xab');
132 |   assert (Bytes.set_utf_16be_uchar b 0 Uchar.max = 0 && Bytes.get b 0 = '\xab');
133 |   assert (Bytes.set_utf_16le_uchar b 0 Uchar.max = 0 && Bytes.get b 0 = '\xab');
134 |   ()
135 | 
136 | let () =
137 |   (* Test used bytes and replacement according to WHATWG recommendation.
138 |      This is just a recommandation.
139 |      These examples are from TUS p. 126-127 Unicode 14  *)
140 |   let b = Bytes.of_string "\xC0\xAF\xE0\x80\xBF\xF0\x81\x82\x41" in
141 |   let ok i = i = Bytes.length b - 1 in
142 |   for i = 0 to Bytes.length b - 1 do
143 |     let dec = Bytes.get_utf_8_uchar b i in
144 |     if not (ok i) then begin
145 |       assert (Uchar.utf_decode_is_valid dec = false);
146 |       assert (Uchar.utf_decode_length dec = 1);
147 |       assert (Uchar.equal (Uchar.utf_decode_uchar dec) Uchar.rep)
148 |     end else begin
149 |       assert (Uchar.utf_decode_is_valid dec = true);
150 |       assert (Uchar.utf_decode_length dec = 1);
151 |       assert (Uchar.equal (Uchar.utf_decode_uchar dec) (Uchar.of_int 0x0041))
152 |     end
153 |   done;
154 |   let b = Bytes.of_string "\xED\xA0\x80\xED\xBF\xBF\xED\xAF\x41" in
155 |   let ok i = i = Bytes.length b - 1 in
156 |   for i = 0 to Bytes.length b - 1 do
157 |     let dec = Bytes.get_utf_8_uchar b i in
158 |     if not (ok i) then begin
159 |       assert (Uchar.utf_decode_is_valid dec = false);
160 |       assert (Uchar.utf_decode_length dec = 1);
161 |       assert (Uchar.equal (Uchar.utf_decode_uchar dec) Uchar.rep)
162 |     end else begin
163 |       assert (Uchar.utf_decode_is_valid dec = true);
164 |       assert (Uchar.utf_decode_length dec = 1);
165 |       assert (Uchar.equal (Uchar.utf_decode_uchar dec) (Uchar.of_int 0x0041))
166 |     end
167 |   done;
168 |   let b = Bytes.of_string "\xF4\x91\x92\x93\xFF\x41\x80\xBF\x42" in
169 |   let ok i = i = 5 || i = 8 in
170 |   for i = 0 to Bytes.length b - 1 do
171 |     let dec = Bytes.get_utf_8_uchar b i in
172 |     if not (ok i) then begin
173 |       assert (Uchar.utf_decode_is_valid dec = false);
174 |       assert (Uchar.utf_decode_length dec = 1);
175 |       assert (Uchar.equal (Uchar.utf_decode_uchar dec) Uchar.rep)
176 |     end else begin
177 |       assert (Uchar.utf_decode_is_valid dec = true);
178 |       assert (Uchar.utf_decode_length dec = 1);
179 |       assert (Uchar.equal (Uchar.utf_decode_uchar dec)
180 |                 (Uchar.of_char (Bytes.get b i)))
181 |     end
182 |   done;
183 |   let b = Bytes.of_string "\xE1\x80\xE2\xF0\x91\x92\xF1\xBF\x41" in
184 |   let d0 = Bytes.get_utf_8_uchar b 0 in
185 |   assert (Uchar.utf_decode_is_valid d0 = false);
186 |   assert (Uchar.utf_decode_length d0 = 2);
187 |   assert (Uchar.equal (Uchar.utf_decode_uchar d0) Uchar.rep);
188 |   let d2 = Bytes.get_utf_8_uchar b 2 in
189 |   assert (Uchar.utf_decode_is_valid d2 = false);
190 |   assert (Uchar.utf_decode_length d2 = 1);
191 |   assert (Uchar.equal (Uchar.utf_decode_uchar d2) Uchar.rep);
192 |   let d3 = Bytes.get_utf_8_uchar b 3 in
193 |   assert (Uchar.utf_decode_is_valid d3 = false);
194 |   assert (Uchar.utf_decode_length d3 = 3);
195 |   assert (Uchar.equal (Uchar.utf_decode_uchar d3) Uchar.rep);
196 |   let d6 = Bytes.get_utf_8_uchar b 6 in
197 |   assert (Uchar.utf_decode_is_valid d6 = false);
198 |   assert (Uchar.utf_decode_length d6 = 2);
199 |   assert (Uchar.equal (Uchar.utf_decode_uchar d6) Uchar.rep);
200 |   let d8 = Bytes.get_utf_8_uchar b 8 in
201 |   assert (Uchar.utf_decode_length d8 = 1);
202 |   assert (Uchar.equal (Uchar.utf_decode_uchar d8) (Uchar.of_int 0x0041));
203 |   ()
204 | 
205 | let () = Printf.printf "All tests passed!\n"
206 | 


--------------------------------------------------------------------------------
/test_long.ml:
--------------------------------------------------------------------------------
 1 | (**************************************************************************)
 2 | (*                                                                        *)
 3 | (*                                 OCaml                                  *)
 4 | (*                                                                        *)
 5 | (*                         The OCaml programmers                          *)
 6 | (*                                                                        *)
 7 | (*   Copyright 2021 Institut National de Recherche en Informatique et     *)
 8 | (*     en Automatique.                                                    *)
 9 | (*                                                                        *)
10 | (*   All rights reserved.  This file is distributed under the terms of    *)
11 | (*   the GNU Lesser General Public License version 2.1, with the          *)
12 | (*   special exception on linking described in the file LICENSE.          *)
13 | (*                                                                        *)
14 | (**************************************************************************)
15 | 
16 | open Utf_x_pat
17 | 
18 | (* Assumes the good encoding and decodes have been checked by test.ml
19 |    Exhaustively tests all 1-4 bytes invalid sequences for decodes. This
20 |    ensures we do not decode invalid sequence to uchars. *)
21 | 
22 | let fold_uchars f acc =
23 |   let rec loop f acc u =
24 |     let acc = f acc u in
25 |     if Uchar.equal u Uchar.max then acc else loop f acc (Uchar.succ u)
26 |   in
27 |   loop f acc Uchar.min
28 | 
29 | module Sset = Set.Make (String)
30 | 
31 | let utf_8_encs, utf_16be_encs, utf_16le_encs =
32 |   Printf.printf "Building encoding sequence sets\n%!";
33 |   let add (set8, set16be, set16le) u =
34 |     let s = Bytes.unsafe_to_string in
35 |     let e8 = Bytes.create (Uchar.utf_8_byte_length u) in
36 |     let e16be = Bytes.create (Uchar.utf_16_byte_length u) in
37 |     let e16le = Bytes.create (Uchar.utf_16_byte_length u) in
38 |     ignore (Bytes.set_utf_8_uchar e8 0 u);
39 |     ignore (Bytes.set_utf_16be_uchar e16be 0 u);
40 |     ignore (Bytes.set_utf_16le_uchar e16le 0 u);
41 |     Sset.add (s e8) set8, Sset.add (s e16be) set16be, Sset.add (s e16le) set16le
42 |   in
43 |   fold_uchars add (Sset.empty, Sset.empty, Sset.empty)
44 | 
45 | let test_seqs utf utf_encs get_utf_char is_valid_utf =
46 |   let test seq =
47 |     let dec = get_utf_char seq 0 in
48 |     let valid = Uchar.utf_decode_is_valid dec in
49 |     let valid_seq = is_valid_utf seq in
50 |     let is_enc = Sset.mem (Bytes.unsafe_to_string seq) utf_encs in
51 |     if not ((valid && is_enc) || (not valid && not is_enc)) ||
52 |        not ((valid_seq && is_enc) || (not valid_seq && not is_enc))
53 |     then begin
54 |       for i = 0 to Bytes.length seq - 1 do
55 |         Printf.printf "%02X " (Bytes.get_uint8 seq i);
56 |       done;
57 |       Printf.printf "valid: %b is_encoding: %b decode: U+%04X is_valid:%b\n"
58 |         valid is_enc (Uchar.to_int (Uchar.utf_decode_uchar dec)) valid_seq;
59 |       assert false
60 |     end;
61 |     valid
62 |   in
63 |   let[@inline] set buf i b = Bytes.unsafe_set buf i (Char.unsafe_chr b) in
64 |   let s1 = Bytes.create 1 and s2 = Bytes.create 2
65 |   and s3 = Bytes.create 3 and s4 = Bytes.create 4 in
66 |   Printf.printf "Testing %s invalid decodes…\n%!" utf;
67 |   for b0 = 0x00 to 0xFF do
68 |     set s1 0 b0;
69 |     if test s1 then ((* this prefix decoded, stop here *)) else begin
70 |       set s2 0 b0;
71 |       for b1 = 0x00 to 0xFF do
72 |         set s2 1 b1;
73 |         if test s2 then ((* this prefix decoded, stop here *)) else begin
74 |           set s3 0 b0;
75 |           set s3 1 b1;
76 |           for b2 = 0x00 to 0xFF do
77 |             set s3 2 b2;
78 |             if test s3 then ((* this prefix decoded, stop here *)) else begin
79 |               set s4 0 b0;
80 |               set s4 1 b1;
81 |               set s4 2 b2;
82 |               for b3 = 0x00 to 0xFF do set s4 3 b3; ignore (test s4) done;
83 |             end
84 |         done;
85 |        end
86 |       done;
87 |     end
88 |   done
89 | 
90 | let () =
91 |   test_seqs "UTF-8" utf_8_encs Bytes.get_utf_8_uchar Bytes.is_valid_utf_8;
92 |   test_seqs "UTF-16BE"
93 |     utf_16be_encs Bytes.get_utf_16be_uchar Bytes.is_valid_utf_16be;
94 |   test_seqs "UTF-16LE"
95 |     utf_16le_encs Bytes.get_utf_16le_uchar Bytes.is_valid_utf_16le;
96 |   ()
97 | 
98 | let () = Printf.printf "All tests passed!\n"
99 | 


--------------------------------------------------------------------------------
/trip8.ml:
--------------------------------------------------------------------------------
  1 | (**************************************************************************)
  2 | (*                                                                        *)
  3 | (*                                 OCaml                                  *)
  4 | (*                                                                        *)
  5 | (*                         The OCaml programmers                          *)
  6 | (*                                                                        *)
  7 | (*   Copyright 2021 Institut National de Recherche en Informatique et     *)
  8 | (*     en Automatique.                                                    *)
  9 | (*                                                                        *)
 10 | (*   All rights reserved.  This file is distributed under the terms of    *)
 11 | (*   the GNU Lesser General Public License version 2.1, with the          *)
 12 | (*   special exception on linking described in the file LICENSE.          *)
 13 | (*                                                                        *)
 14 | (**************************************************************************)
 15 | 
 16 | let log fmt = Printf.printf (fmt ^^ "\n%!")
 17 | 
 18 | let string_of_file file =
 19 |   let ic = open_in_bin file in
 20 |   let len = in_channel_length ic in
 21 |   let buf = Bytes.create len in
 22 |   really_input ic buf 0 len; close_in ic; Bytes.unsafe_to_string buf
 23 | 
 24 | let adhoc_trip s b =
 25 |   let rec loop b i buf =
 26 |     if i >= Utf_x_adhoc.Bytes.length b then () else
 27 |     let d = Utf_x_adhoc.Bytes.get_utf_8_uchar b i in
 28 |     let used = Utf_x_adhoc.Uchar.utf_decode_length d in
 29 |     Buffer.add_utf_8_uchar buf (Utf_x_adhoc.Uchar.utf_decode_uchar d);
 30 |     loop b (i + used) buf
 31 |   in
 32 |   loop (Bytes.unsafe_of_string s) 0 b
 33 | 
 34 | let dfa_trip s b =
 35 |   let rec loop b i buf =
 36 |     if i >= Utf_x_dfa.Bytes.length b then () else
 37 |     let d = Utf_x_dfa.Bytes.get_utf_8_uchar b i in
 38 |     let used = Utf_x_dfa.Uchar.utf_decode_length d in
 39 |     Buffer.add_utf_8_uchar buf (Utf_x_dfa.Uchar.utf_decode_uchar d);
 40 |     loop b (i + used) buf
 41 |   in
 42 |   loop (Bytes.unsafe_of_string s) 0 b
 43 | 
 44 | let if_trip s b =
 45 |   let rec loop b i buf =
 46 |     if i >= Utf_x_if.Bytes.length b then () else
 47 |     let d = Utf_x_if.Bytes.get_utf_8_uchar b i in
 48 |     let used = Utf_x_if.Uchar.utf_decode_length d in
 49 |     Buffer.add_utf_8_uchar buf (Utf_x_if.Uchar.utf_decode_uchar d);
 50 |     loop b (i + used) buf
 51 |   in
 52 |   loop (Bytes.unsafe_of_string s) 0 b
 53 | 
 54 | let pat_trip s b =
 55 |   let rec loop b i buf =
 56 |     if i >= Utf_x_pat.Bytes.length b then () else
 57 |     let d = Utf_x_pat.Bytes.get_utf_8_uchar b i in
 58 |     let used = Utf_x_pat.Uchar.utf_decode_length d in
 59 |     Buffer.add_utf_8_uchar buf (Utf_x_pat.Uchar.utf_decode_uchar d);
 60 |     loop b (i + used) buf
 61 |   in
 62 |   loop (Bytes.unsafe_of_string s) 0 b
 63 | 
 64 | let uutf_trip s b =
 65 |   let f b _ = function
 66 |   | `Uchar u -> Buffer.add_utf_8_uchar b u; b
 67 |   | `Malformed u -> Buffer.add_utf_8_uchar b Uchar.rep; b
 68 |   in
 69 |   ignore (Uutf.String.fold_utf_8 f b s)
 70 | 
 71 | let trip impl file =
 72 |   let s = string_of_file file in
 73 |   let b = Buffer.create (String.length s) in
 74 |   let () = match impl with
 75 |   | `Adhoc -> adhoc_trip s b
 76 |   | `Dfa -> dfa_trip s b
 77 |   | `If -> if_trip s b
 78 |   | `Pat -> pat_trip s b
 79 |   | `Uutf -> uutf_trip s b
 80 |   in
 81 |   print_string (Buffer.contents b)
 82 | 
 83 | (* Command line interface *)
 84 | 
 85 | let main () =
 86 |   let usage = "Usage: trip8 [--adhoc | --dfa | --if | --pat | --uutf] FILE" in
 87 |   let impl = ref `Adhoc in
 88 |   let args =
 89 |     [ "--adhoc", Arg.Unit (fun () -> impl := `Adhoc),
 90 |       "Use the adhoc implementation (default).";
 91 |       "--dfa", Arg.Unit (fun () -> impl := `Dfa),
 92 |       "Use the DFA implementation";
 93 |       "--if", Arg.Unit (fun () -> impl := `If),
 94 |       "Use the if branches implementation.";
 95 |       "--pat", Arg.Unit (fun () -> impl := `Pat),
 96 |       "Use the pattern implementation.";
 97 |       "--uutf", Arg.Unit (fun () -> impl := `Uutf),
 98 |       "Use the Uutf implementation." ]
 99 |   in
100 |   let file = ref None in
101 |   let set_file s = file := Some s in
102 |   Arg.parse args set_file usage;
103 |   match !file with
104 |   | None -> Printf.eprintf "trip8: missing FILE argument\n%s\n" usage; exit 1
105 |   | Some file -> trip !impl file
106 | 
107 | let () = if !Sys.interactive then () else main ()
108 | 


--------------------------------------------------------------------------------
/utf_16.ml:
--------------------------------------------------------------------------------
  1 | (**************************************************************************)
  2 | (*                                                                        *)
  3 | (*                                 OCaml                                  *)
  4 | (*                                                                        *)
  5 | (*                         The OCaml programmers                          *)
  6 | (*                                                                        *)
  7 | (*   Copyright 2021 Institut National de Recherche en Informatique et     *)
  8 | (*     en Automatique.                                                    *)
  9 | (*                                                                        *)
 10 | (*   All rights reserved.  This file is distributed under the terms of    *)
 11 | (*   the GNU Lesser General Public License version 2.1, with the          *)
 12 | (*   special exception on linking described in the file LICENSE.          *)
 13 | (*                                                                        *)
 14 | (**************************************************************************)
 15 | 
 16 |   (* This won't be needed *)
 17 | 
 18 |   let dec_invalid = Utf_uchar.utf_decode_invalid
 19 |   let[@inline] dec_ret n u = Utf_uchar.utf_decode n (Uchar.unsafe_of_int u)
 20 |   external length : bytes -> int = "%bytes_length"
 21 |   type t = Bytes.t
 22 |   external swap16 : int -> int = "%bswap16" (* That one will be in Bytes *)
 23 | 
 24 |   (* That will be needed *)
 25 | 
 26 |   external unsafe_get_uint16_ne : t -> int -> int = "%caml_bytes_get16u"
 27 |   external unsafe_set_uint16_ne : t -> int -> int -> unit = "%caml_bytes_set16u"
 28 | 
 29 |   let unsafe_get_uint16_le b i =
 30 |     if Sys.big_endian
 31 |     then swap16 (unsafe_get_uint16_ne b i)
 32 |     else unsafe_get_uint16_ne b i
 33 | 
 34 |   let unsafe_get_uint16_be b i =
 35 |     if Sys.big_endian
 36 |     then unsafe_get_uint16_ne b i
 37 |     else swap16 (unsafe_get_uint16_ne b i)
 38 | 
 39 |   let unsafe_set_uint16_le b i x =
 40 |     if Sys.big_endian
 41 |     then unsafe_set_uint16_ne b i (swap16 x)
 42 |     else unsafe_set_uint16_ne b i x
 43 | 
 44 |   let unsafe_set_uint16_be b i x =
 45 |     if Sys.big_endian
 46 |     then unsafe_set_uint16_ne b i x
 47 |     else unsafe_set_uint16_ne b i (swap16 x)
 48 | 
 49 |   (* UTF-16BE *)
 50 | 
 51 |   let get_utf_16be_uchar b i =
 52 |     let get = unsafe_get_uint16_be in
 53 |     let max = length b - 1 in
 54 |     if i < 0 || i > max then invalid_arg "index out of bounds" else
 55 |     if i = max then dec_invalid 1 else
 56 |     match get b i with
 57 |     | u when u < 0xD800 || u > 0xDFFF -> dec_ret 2 u
 58 |     | u when u > 0xDBFF -> dec_invalid 2
 59 |     | hi -> (* combine [hi] with a low surrogate *)
 60 |         let last = i + 3 in
 61 |         if last > max then dec_invalid (max - i + 1) else
 62 |         match get b (i + 2) with
 63 |         | u when u < 0xDC00 || u > 0xDFFF -> dec_invalid 2 (* retry here *)
 64 |         | lo ->
 65 |             let u = (((hi land 0x3FF) lsl 10) lor (lo land 0x3FF)) + 0x10000 in
 66 |             dec_ret 4 u
 67 | 
 68 |   let set_utf_16be_uchar b i u =
 69 |     let set = unsafe_set_uint16_be in
 70 |     let max = length b - 1 in
 71 |     if i < 0 || i > max then invalid_arg "index out of bounds" else
 72 |     match Uchar.to_int u with
 73 |     | u when u < 0 -> assert false
 74 |     | u when u <= 0xFFFF ->
 75 |         let last = i + 1 in
 76 |         if last > max then 0 else (set b i u; 2)
 77 |     | u ->
 78 |         let last = i + 3 in
 79 |         if last > max then 0 else
 80 |         let u' = u - 0x10000 in
 81 |         let hi = (0xD800 lor (u' lsr 10)) in
 82 |         let lo = (0xDC00 lor (u' land 0x3FF)) in
 83 |         set b i hi; set b (i + 2) lo; 4
 84 | 
 85 |   let is_valid_utf_16be b =
 86 |     let rec loop max b i =
 87 |       let get = unsafe_get_uint16_be in
 88 |       if i > max then true else
 89 |       if i = max then false else
 90 |       match get b i with
 91 |       | u when u < 0xD800 || u > 0xDFFF -> loop max b (i + 2)
 92 |       | u when u > 0xDBFF -> false
 93 |       | hi ->
 94 |           let last = i + 3 in
 95 |           if last > max then false else
 96 |           match get b (i + 2) with
 97 |           | u when u < 0xDC00 || u > 0xDFFF -> false
 98 |           | lo -> loop max b (i + 4)
 99 |     in
100 |     loop (length b - 1) b 0
101 | 
102 |   (* UTF-16LE *)
103 | 
104 |   let get_utf_16le_uchar b i =
105 |     let get = unsafe_get_uint16_le in
106 |     let max = length b - 1 in
107 |     if i < 0 || i > max then invalid_arg "index out of bounds" else
108 |     if i = max then dec_invalid 1 else
109 |     match get b i with
110 |     | u when u < 0xD800 || u > 0xDFFF -> dec_ret 2 u
111 |     | u when u > 0xDBFF -> dec_invalid 2
112 |     | hi -> (* combine [hi] with a low surrogate *)
113 |         let last = i + 3 in
114 |         if last > max then dec_invalid (max - i + 1) else
115 |         match get b (i + 2) with
116 |         | u when u < 0xDC00 || u > 0xDFFF -> dec_invalid 2 (* retry here *)
117 |         | lo ->
118 |             let u = (((hi land 0x3FF) lsl 10) lor (lo land 0x3FF)) + 0x10000 in
119 |             dec_ret 4 u
120 | 
121 |   let set_utf_16le_uchar b i u =
122 |     let set = unsafe_set_uint16_le in
123 |     let max = length b - 1 in
124 |     if i < 0 || i > max then invalid_arg "index out of bounds" else
125 |     match Uchar.to_int u with
126 |     | u when u < 0 -> assert false
127 |     | u when u <= 0xFFFF ->
128 |         let last = i + 1 in
129 |         if last > max then 0 else (set b i u; 2)
130 |     | u ->
131 |         let last = i + 3 in
132 |         if last > max then 0 else
133 |         let u' = u - 0x10000 in
134 |         let hi = (0xD800 lor (u' lsr 10)) in
135 |         let lo = (0xDC00 lor (u' land 0x3FF)) in
136 |         set b i hi; set b (i + 2) lo; 4
137 | 
138 |   let is_valid_utf_16le b =
139 |     let rec loop max b i =
140 |       let get = unsafe_get_uint16_le in
141 |       if i > max then true else
142 |       if i = max then false else
143 |       match get b i with
144 |       | u when u < 0xD800 || u > 0xDFFF -> loop max b (i + 2)
145 |       | u when u > 0xDBFF -> false
146 |       | hi ->
147 |           let last = i + 3 in
148 |           if last > max then false else
149 |           match get b (i + 2) with
150 |           | u when u < 0xDC00 || u > 0xDFFF -> false
151 |           | lo -> loop max b (i + 4)
152 |     in
153 |     loop (length b - 1) b 0
154 | 


--------------------------------------------------------------------------------
/utf_uchar.ml:
--------------------------------------------------------------------------------
 1 | (**************************************************************************)
 2 | (*                                                                        *)
 3 | (*                                 OCaml                                  *)
 4 | (*                                                                        *)
 5 | (*                         The OCaml programmers                          *)
 6 | (*                                                                        *)
 7 | (*   Copyright 2021 Institut National de Recherche en Informatique et     *)
 8 | (*     en Automatique.                                                    *)
 9 | (*                                                                        *)
10 | (*   All rights reserved.  This file is distributed under the terms of    *)
11 | (*   the GNU Lesser General Public License version 2.1, with the          *)
12 | (*   special exception on linking described in the file LICENSE.          *)
13 | (*                                                                        *)
14 | (**************************************************************************)
15 | 
16 | include Uchar
17 | 
18 | (* UTF codecs tools *)
19 | 
20 | type utf_decode = int
21 | (* This is an int [0xDUUUUUU] decomposed as follows:
22 |    - [D] is four bits for decode information, the highest bit is set if the
23 |      decode is valid. The three lower bits indicate the number of elements
24 |      from the source that were consumed by the decode.
25 |    - [UUUUUU] is the decoded Unicode character or the Unicode replacement
26 |      character U+FFFD if for invalid decodes. *)
27 | 
28 | let valid_bit = 27
29 | let decode_bits = 24
30 | 
31 | let[@inline] utf_decode_is_valid d = (d lsr valid_bit) = 1
32 | let[@inline] utf_decode_length d = (d lsr decode_bits) land 0b111
33 | let[@inline] utf_decode_uchar d = Uchar.unsafe_of_int (d land 0xFFFFFF)
34 | let[@inline] utf_decode n u = ((8 lor n) lsl decode_bits) lor (Uchar.to_int u)
35 | let[@inline] utf_decode_invalid n = (n lsl decode_bits) lor 0xFFFD (* rep *)
36 | 
37 | let utf_8_byte_length u = match to_int u with
38 | | u when u < 0 -> assert false
39 | | u when u <= 0x007F -> 1
40 | | u when u <= 0x07FF -> 2
41 | | u when u <= 0xFFFF -> 3
42 | | u when u <= 0x10FFFF -> 4
43 | | _ -> assert false
44 | 
45 | let utf_16_byte_length u = match to_int u with
46 | | u when u < 0 -> assert false
47 | | u when u <= 0xFFFF -> 2
48 | | u when u <= 0x10FFFF -> 4
49 | | _ -> assert false
50 | 


--------------------------------------------------------------------------------
/utf_uchar.mli:
--------------------------------------------------------------------------------
 1 | (**************************************************************************)
 2 | (*                                                                        *)
 3 | (*                                 OCaml                                  *)
 4 | (*                                                                        *)
 5 | (*                         The OCaml programmers                          *)
 6 | (*                                                                        *)
 7 | (*   Copyright 2021 Institut National de Recherche en Informatique et     *)
 8 | (*     en Automatique.                                                    *)
 9 | (*                                                                        *)
10 | (*   All rights reserved.  This file is distributed under the terms of    *)
11 | (*   the GNU Lesser General Public License version 2.1, with the          *)
12 | (*   special exception on linking described in the file LICENSE.          *)
13 | (*                                                                        *)
14 | (**************************************************************************)
15 | 
16 | include Utf_x.Uchar
17 | 


--------------------------------------------------------------------------------
/utf_x.mli:
--------------------------------------------------------------------------------
  1 | (**************************************************************************)
  2 | (*                                                                        *)
  3 | (*                                 OCaml                                  *)
  4 | (*                                                                        *)
  5 | (*                         The OCaml programmers                          *)
  6 | (*                                                                        *)
  7 | (*   Copyright 2021 Institut National de Recherche en Informatique et     *)
  8 | (*     en Automatique.                                                    *)
  9 | (*                                                                        *)
 10 | (*   All rights reserved.  This file is distributed under the terms of    *)
 11 | (*   the GNU Lesser General Public License version 2.1, with the          *)
 12 | (*   special exception on linking described in the file LICENSE.          *)
 13 | (*                                                                        *)
 14 | (**************************************************************************)
 15 | 
 16 | (* Uchar additions *)
 17 | 
 18 | module type Uchar = sig
 19 |   include module type of Uchar
 20 | 
 21 |   (** {1:utf_codec UTF codecs tools}
 22 | 
 23 |       @since 4.14 *)
 24 | 
 25 |   type utf_decode [@@immediate]
 26 |   (** The type for UTF decode results. Values of this type represent
 27 |       the result of a Unicode Transformation Format decoding attempt. *)
 28 | 
 29 |   val utf_decode_is_valid : utf_decode -> bool
 30 |   (** [utf_decode_is_valid d] is [true] if and only if [d] holds a valid
 31 |       decode. *)
 32 | 
 33 |   val utf_decode_uchar : utf_decode -> Uchar.t
 34 |   (** [utf_x_decode_uchar d] is the Unicode character decoded by [d] if
 35 |       [utf_x_decode_is_valid d] is [true] and {!Uchar.rep} otherwise. *)
 36 | 
 37 |   val utf_decode_length : utf_decode -> int
 38 |   (** [utf_decode_length d] is the number of elements from the source
 39 |       that were consumed by the decode [d]. This is always strictly
 40 |       positive and smaller or equal to [4]. The kind of source elements
 41 |       depends on the actual decoder; for the decoders of the standard
 42 |       library this function always returns a length in bytes. *)
 43 | 
 44 |   val utf_decode : int -> Uchar.t -> utf_decode
 45 |   (** [utf_decode n u] is a valid UTF decode for [u] that consumed [n]
 46 |       elements from the source for decoding. [n] must be positive and
 47 |       smaller or equal to [4] (this is not checked by the module). *)
 48 | 
 49 |   val utf_decode_invalid : int -> utf_decode
 50 |   (** [utf_decode_invalid n] is an invalid UTF decode that consumed [n]
 51 |       elements from the source to error. [n] must be positive and
 52 |       smaller or equal to [4] (this is not checked by the module). The
 53 |       resulting decode has {!rep} as the decoded Unicode character. *)
 54 | 
 55 |   val utf_8_byte_length : t -> int
 56 |   (** [utf_8_byte_length u] is the number of bytes needed to encode
 57 |       [u] in UTF-8. *)
 58 | 
 59 |   val utf_16_byte_length : t -> int
 60 |   (** [utf_16_byte_length u] is the number of bytes needed to encode
 61 |       [u] in UTF-16. *)
 62 | end
 63 | 
 64 | (* Bytes additions *)
 65 | 
 66 | module type S = sig
 67 | module Uchar : Uchar
 68 | module Bytes : sig
 69 |   include module type of Bytes
 70 | 
 71 |   (** {1:utf_x_codecs UTF-X codecs and validations}
 72 | 
 73 |       @since 4.14 *)
 74 | 
 75 |   (** {2:utf_8 UTF-8} *)
 76 | 
 77 |   val get_utf_8_uchar : t -> int -> Uchar.utf_decode
 78 |   (** [get_utf_8_uchar b i] decodes an UTF-8 character at index [i] in
 79 |       [b]. *)
 80 | 
 81 |   val set_utf_8_uchar : t -> int -> Uchar.t -> int
 82 |   (** [set_utf_8_uchar b i u] UTF-8 encodes [u] at index [i] in [b]
 83 |       and returns the number of bytes [n] that were written starting
 84 |       at [i]. If [n] is [0] there was not enough space to encode [u]
 85 |       at [i] and [b] was left untouched. Otherwise a new character can
 86 |       be encoded at [i + n]. *)
 87 | 
 88 |   val is_valid_utf_8 : t -> bool
 89 |   (** [is_valid_utf_8 b] is [true] if and only if [b] contains valid
 90 |       UTF-8 data. *)
 91 | 
 92 |   (** {2:utf_16be UTF-16BE} *)
 93 | 
 94 |   val get_utf_16be_uchar : t -> int -> Uchar.utf_decode
 95 |   (** [get_utf_16be_uchar b i] decodes an UTF-16BE character at index
 96 |       [i] in [b]. *)
 97 | 
 98 |   val set_utf_16be_uchar : t -> int -> Uchar.t -> int
 99 |   (** [set_utf_16be_uchar b i u] UTF-16BE encodes [u] at index [i] in [b]
100 |       and returns the number of bytes [n] that were written starting
101 |       at [i]. If [n] is [0] there was not enough space to encode [u]
102 |       at [i] and [b] was left untouched. Otherwise a new character can
103 |       be encoded at [i + n]. *)
104 | 
105 |   val is_valid_utf_16be : t -> bool
106 |   (** [is_valid_utf_16be b] is [true] if and only if [b] contains valid
107 |       UTF-16BE data. *)
108 | 
109 |   (** {2:utf_16le UTF-16LE} *)
110 | 
111 |   val get_utf_16le_uchar : t -> int -> Uchar.utf_decode
112 |   (** [get_utf_16le_uchar b i] decodes an UTF-16LE character at index
113 |       [i] in [b]. *)
114 | 
115 |   val set_utf_16le_uchar : t -> int -> Uchar.t -> int
116 |   (** [set_utf_16le_uchar b i u] UTF-16LE encodes [u] at index [i] in [b]
117 |       and returns the number of bytes [n] that were written starting
118 |       at [i]. If [n] is [0] there was not enough space to encode [u]
119 |       at [i] and [b] was left untouched. Otherwise a new character can
120 |       be encoded at [i + n]. *)
121 | 
122 |   val is_valid_utf_16le : t -> bool
123 |   (** [is_valid_utf_16le b] is [true] if and only if [b] contains valid
124 |       UTF-16LE data. *)
125 | end
126 | end
127 | 


--------------------------------------------------------------------------------
/utf_x_adhoc.ml:
--------------------------------------------------------------------------------
  1 | (**************************************************************************)
  2 | (*                                                                        *)
  3 | (*                                 OCaml                                  *)
  4 | (*                                                                        *)
  5 | (*                         The OCaml programmers                          *)
  6 | (*                                                                        *)
  7 | (*   Copyright 2021 Institut National de Recherche en Informatique et     *)
  8 | (*     en Automatique.                                                    *)
  9 | (*                                                                        *)
 10 | (*   All rights reserved.  This file is distributed under the terms of    *)
 11 | (*   the GNU Lesser General Public License version 2.1, with the          *)
 12 | (*   special exception on linking described in the file LICENSE.          *)
 13 | (*                                                                        *)
 14 | (**************************************************************************)
 15 | 
 16 | module Uchar = Utf_uchar
 17 | module Bytes = struct
 18 |   include Bytes
 19 | 
 20 |   external swap16 : int -> int = "%bswap16" (* That one will be in Bytes *)
 21 | 
 22 |   (* Unsafe internal additions *)
 23 | 
 24 |   external unsafe_set_uint8 : t -> int -> int -> unit = "%bytes_unsafe_set"
 25 |   external unsafe_get_uint8 : t -> int -> int = "%bytes_unsafe_get"
 26 | 
 27 |   external unsafe_get_uint16_ne : t -> int -> int = "%caml_bytes_get16u"
 28 |   external unsafe_set_uint16_ne : t -> int -> int -> unit = "%caml_bytes_set16u"
 29 | 
 30 |   let unsafe_get_uint16_le b i =
 31 |     if Sys.big_endian
 32 |     then swap16 (unsafe_get_uint16_ne b i)
 33 |     else unsafe_get_uint16_ne b i
 34 | 
 35 |   let unsafe_get_uint16_be b i =
 36 |     if Sys.big_endian
 37 |     then unsafe_get_uint16_ne b i
 38 |     else swap16 (unsafe_get_uint16_ne b i)
 39 | 
 40 |   let unsafe_set_uint16_le b i x =
 41 |     if Sys.big_endian
 42 |     then unsafe_set_uint16_ne b i (swap16 x)
 43 |     else unsafe_set_uint16_ne b i x
 44 | 
 45 |   let unsafe_set_uint16_be b i x =
 46 |     if Sys.big_endian
 47 |     then unsafe_set_uint16_ne b i x
 48 |     else unsafe_set_uint16_ne b i (swap16 x)
 49 | 
 50 |   (* UTF-X codecs and validations *)
 51 | 
 52 |   let dec_invalid = Uchar.utf_decode_invalid
 53 |   let[@inline] dec_ret n u = Uchar.utf_decode n (Uchar.unsafe_of_int u)
 54 | 
 55 |   (* In case of decoding error, if we error on the first byte, we
 56 |      consume the byte, otherwise we consume the [n] bytes preceeding
 57 |      erroring byte.
 58 | 
 59 |      This means that if a client uses decodes without caring about
 60 |      validity it naturally replace bogus data with Uchar.rep according
 61 |      to the WHATWG Encoding standard. Other schemes are possible by
 62 |      consulting the number of used bytes on invalid decodes. For more
 63 |      details see https://hsivonen.fi/broken-utf-8/
 64 | 
 65 |      For this reason in [get_utf_8_uchar] we gradually check the next
 66 |      byte is available rather than doing it immediately after the
 67 |      first byte. Contrast with [is_valid_utf_8]. *)
 68 | 
 69 |   (* UTF-8 *)
 70 | 
 71 |   let utf_8_decode_case =
 72 |     (* Case according to first byte. This corresponds to the the lines
 73 |        of The Unicode Standard Table 3.7 with lines E1..EC and EE..EF
 74 |        sharing the same case and unmentioned bytes mapped to 8.
 75 | 
 76 |        let dump_table () =
 77 |          let print_case c = Printf.printf "\\%03d" c in
 78 |          let cut i = if i mod 16 = 0 then (print_char '\\'; print_newline ()) in
 79 |          for i = 0x00 to 0x7F do (cut i; print_case 0) done;
 80 |          for i = 0x80 to 0xC1 do (cut i; print_case 8) done;
 81 |          for i = 0xC2 to 0xDF do (cut i; print_case 1) done;
 82 |          for i = 0xE0 to 0xE0 do (cut i; print_case 2) done;
 83 |          for i = 0xE1 to 0xEC do (cut i; print_case 3) done;
 84 |          for i = 0xED to 0xED do (cut i; print_case 4) done;
 85 |          for i = 0xEE to 0xEF do (cut i; print_case 3) done;
 86 |          for i = 0xF0 to 0xF0 do (cut i; print_case 5) done;
 87 |          for i = 0xF1 to 0xF3 do (cut i; print_case 6) done;
 88 |          for i = 0xF4 to 0xF4 do (cut i; print_case 7) done;
 89 |          for i = 0xF5 to 0xFF do (cut i; print_case 8) done;
 90 |          () *)
 91 |       "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\
 92 |        \000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\
 93 |        \000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\
 94 |        \000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\
 95 |        \000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\
 96 |        \000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\
 97 |        \000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\
 98 |        \000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\
 99 |        \008\008\008\008\008\008\008\008\008\008\008\008\008\008\008\008\
100 |        \008\008\008\008\008\008\008\008\008\008\008\008\008\008\008\008\
101 |        \008\008\008\008\008\008\008\008\008\008\008\008\008\008\008\008\
102 |        \008\008\008\008\008\008\008\008\008\008\008\008\008\008\008\008\
103 |        \008\008\001\001\001\001\001\001\001\001\001\001\001\001\001\001\
104 |        \001\001\001\001\001\001\001\001\001\001\001\001\001\001\001\001\
105 |        \002\003\003\003\003\003\003\003\003\003\003\003\003\004\003\003\
106 |        \005\006\006\006\007\008\008\008\008\008\008\008\008\008\008\008"
107 | 
108 |   let utf_8_decode_case' = unsafe_of_string utf_8_decode_case
109 | 
110 |   let[@inline] not_in_x80_to_xBF b = b lsr 6 <> 0b10
111 |   let[@inline] not_in_xA0_to_xBF b = b lsr 5 <> 0b101
112 |   let[@inline] not_in_x80_to_x9F b = b lsr 5 <> 0b100
113 |   let[@inline] not_in_x90_to_xBF b = b < 0x90 || 0xBF < b
114 |   let[@inline] not_in_x80_to_x8F b = b lsr 4 <> 0x8
115 | 
116 |   let[@inline] utf_8_uchar_2 b0 b1 =
117 |     ((b0 land 0x1F) lsl 6) lor
118 |     ((b1 land 0x3F))
119 | 
120 |   let[@inline] utf_8_uchar_3 b0 b1 b2 =
121 |     ((b0 land 0x0F) lsl 12) lor
122 |     ((b1 land 0x3F) lsl 6) lor
123 |     ((b2 land 0x3F))
124 | 
125 |   let[@inline] utf_8_uchar_4 b0 b1 b2 b3 =
126 |     ((b0 land 0x07) lsl 18) lor
127 |     ((b1 land 0x3F) lsl 12) lor
128 |     ((b2 land 0x3F) lsl 6) lor
129 |     ((b3 land 0x3F))
130 | 
131 |   let get_utf_8_uchar b i =
132 |     let b0 = get_uint8 b i in (* raises if [i] is not a valid index. *)
133 |     let get = unsafe_get_uint8 in
134 |     let max = length b - 1 in
135 |     match get utf_8_decode_case' b0 with
136 |     | 0 (* 00..7F *) -> dec_ret 1 b0
137 |     | 1 (* C2..DF *) ->
138 |         let i = i + 1 in if i > max then dec_invalid 1 else
139 |         let b1 = get b i in if not_in_x80_to_xBF b1 then dec_invalid 1 else
140 |         dec_ret 2 (utf_8_uchar_2 b0 b1)
141 |     | 2 (* E0 *) ->
142 |         let i = i + 1 in if i > max then dec_invalid 1 else
143 |         let b1 = get b i in if not_in_xA0_to_xBF b1 then dec_invalid 1 else
144 |         let i = i + 1 in if i > max then dec_invalid 2 else
145 |         let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
146 |         dec_ret 3 (utf_8_uchar_3 b0 b1 b2)
147 |     | 3 -> (* E1..EC or EE..EF *)
148 |         let i = i + 1 in if i > max then dec_invalid 1 else
149 |         let b1 = get b i in if not_in_x80_to_xBF b1 then dec_invalid 1 else
150 |         let i = i + 1 in if i > max then dec_invalid 2 else
151 |         let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
152 |         dec_ret 3 (utf_8_uchar_3 b0 b1 b2)
153 |     | 4 -> (* ED *)
154 |         let i = i + 1 in if i > max then dec_invalid 1 else
155 |         let b1 = get b i in if not_in_x80_to_x9F b1 then dec_invalid 1 else
156 |         let i = i + 1 in if i > max then dec_invalid 2 else
157 |         let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
158 |         dec_ret 3 (utf_8_uchar_3 b0 b1 b2)
159 |     | 5 -> (* F0 *)
160 |         let i = i + 1 in if i > max then dec_invalid 1 else
161 |         let b1 = get b i in if not_in_x90_to_xBF b1 then dec_invalid 1 else
162 |         let i = i + 1 in if i > max then dec_invalid 2 else
163 |         let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
164 |         let i = i + 1 in if i > max then dec_invalid 3 else
165 |         let b3 = get b i in if not_in_x80_to_xBF b3 then dec_invalid 3 else
166 |         dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3)
167 |     | 6 -> (* F1..F3 *)
168 |         let i = i + 1 in if i > max then dec_invalid 1 else
169 |         let b1 = get b i in if not_in_x80_to_xBF b1 then dec_invalid 1 else
170 |         let i = i + 1 in if i > max then dec_invalid 2 else
171 |         let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
172 |         let i = i + 1 in if i > max then dec_invalid 3 else
173 |         let b3 = get b i in if not_in_x80_to_xBF b3 then dec_invalid 3 else
174 |         dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3)
175 |     | 7 -> (* F4 *)
176 |         let i = i + 1 in if i > max then dec_invalid 1 else
177 |         let b1 = get b i in if not_in_x80_to_x8F b1 then dec_invalid 1 else
178 |         let i = i + 1 in if i > max then dec_invalid 2 else
179 |         let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
180 |         let i = i + 1 in if i > max then dec_invalid 3 else
181 |         let b3 = get b i in if not_in_x80_to_xBF b3 then dec_invalid 3 else
182 |         dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3)
183 |     | 8 -> dec_invalid 1
184 |     | _ -> assert false
185 | 
186 |   let set_utf_8_uchar b i u =
187 |     let set = unsafe_set_uint8 in
188 |     let max = length b - 1 in
189 |     match Uchar.to_int u with
190 |     | u when u < 0 -> assert false
191 |     | u when u <= 0x007F ->
192 |         set_uint8 b i u;
193 |         1
194 |     | u when u <= 0x07FF ->
195 |         let last = i + 1 in
196 |         if last > max then 0 else
197 |         (set_uint8 b i (0xC0 lor (u lsr 6));
198 |          set b last (0x80 lor (u land 0x3F));
199 |          2)
200 |     | u when u <= 0xFFFF ->
201 |         let last = i + 2 in
202 |         if last > max then 0 else
203 |         (set_uint8 b i (0xE0 lor (u lsr 12));
204 |          set b (i + 1) (0x80 lor ((u lsr 6) land 0x3F));
205 |          set b last (0x80 lor (u land 0x3F));
206 |          3)
207 |     | u ->
208 |         let last = i + 3 in
209 |         if last > max then 0 else
210 |         (set_uint8 b i (0xF0 lor (u lsr 18));
211 |          set b (i + 1) (0x80 lor ((u lsr 12) land 0x3F));
212 |          set b (i + 2) (0x80 lor ((u lsr 6) land 0x3F));
213 |          set b last (0x80 lor (u land 0x3F));
214 |          4)
215 | 
216 |   let is_valid_utf_8 b =
217 |     let rec loop max b i =
218 |       if i > max then true else
219 |       let get = unsafe_get_uint8 in
220 |       match get utf_8_decode_case' (get b i) with
221 |       | 0 (* 00..7F *) -> loop max b (i + 1)
222 |       | 1 (* C2..DF *) ->
223 |           let last = i + 1 in
224 |           if last > max
225 |           || not_in_x80_to_xBF (get b last)
226 |           then false
227 |           else loop max b (last + 1)
228 |       | 2 (* E0 *) ->
229 |           let last = i + 2 in
230 |           if last > max
231 |           || not_in_xA0_to_xBF (get b (i + 1))
232 |           || not_in_x80_to_xBF (get b last)
233 |           then false
234 |           else loop max b (last + 1)
235 |       | 3 -> (* E1..EC or EE..EF *)
236 |           let last = i + 2 in
237 |           if last > max
238 |           || not_in_x80_to_xBF (get b (i + 1))
239 |           || not_in_x80_to_xBF (get b last)
240 |           then false
241 |           else loop max b (last + 1)
242 |       | 4 -> (* ED *)
243 |           let last = i + 2 in
244 |           if last > max
245 |           || not_in_x80_to_x9F (get b (i + 1))
246 |           || not_in_x80_to_xBF (get b last)
247 |           then false
248 |           else loop max b (last + 1)
249 |       | 5 -> (* F0 *)
250 |           let last = i + 3 in
251 |           if last > max
252 |           || not_in_x90_to_xBF (get b (i + 1))
253 |           || not_in_x80_to_xBF (get b (i + 2))
254 |           || not_in_x80_to_xBF (get b last)
255 |           then false
256 |           else loop max b (last + 1)
257 |       | 6 -> (* F1..F3 *)
258 |           let last = i + 3 in
259 |           if last > max
260 |           || not_in_x80_to_xBF (get b (i + 1))
261 |           || not_in_x80_to_xBF (get b (i + 2))
262 |           || not_in_x80_to_xBF (get b last)
263 |           then false
264 |           else loop max b (last + 1)
265 |       | 7 -> (* F4 *)
266 |           let last = i + 3 in
267 |           if last > max
268 |           || not_in_x80_to_x8F (get b (i + 1))
269 |           || not_in_x80_to_xBF (get b (i + 2))
270 |           || not_in_x80_to_xBF (get b last)
271 |           then false
272 |           else loop max b (last + 1)
273 |       | 8 -> false
274 |       | _ -> assert false
275 |     in
276 |     loop (length b - 1) b 0
277 | 
278 |   (* UTF-16BE *)
279 | 
280 |   let get_utf_16be_uchar b i =
281 |     let get = unsafe_get_uint16_be in
282 |     let max = length b - 1 in
283 |     if i < 0 || i > max then invalid_arg "index out of bounds" else
284 |     if i = max then dec_invalid 1 else
285 |     match get b i with
286 |     | u when u < 0xD800 || u > 0xDFFF -> dec_ret 2 u
287 |     | u when u > 0xDBFF -> dec_invalid 2
288 |     | hi -> (* combine [hi] with a low surrogate *)
289 |         let last = i + 3 in
290 |         if last > max then dec_invalid (max - i + 1) else
291 |         match get b (i + 2) with
292 |         | u when u < 0xDC00 || u > 0xDFFF -> dec_invalid 2 (* retry here *)
293 |         | lo ->
294 |             let u = (((hi land 0x3FF) lsl 10) lor (lo land 0x3FF)) + 0x10000 in
295 |             dec_ret 4 u
296 | 
297 |   let set_utf_16be_uchar b i u =
298 |     let set = unsafe_set_uint16_be in
299 |     let max = length b - 1 in
300 |     if i < 0 || i > max then invalid_arg "index out of bounds" else
301 |     match Uchar.to_int u with
302 |     | u when u < 0 -> assert false
303 |     | u when u <= 0xFFFF ->
304 |         let last = i + 1 in
305 |         if last > max then 0 else (set b i u; 2)
306 |     | u ->
307 |         let last = i + 3 in
308 |         if last > max then 0 else
309 |         let u' = u - 0x10000 in
310 |         let hi = (0xD800 lor (u' lsr 10)) in
311 |         let lo = (0xDC00 lor (u' land 0x3FF)) in
312 |         set b i hi; set b (i + 2) lo; 4
313 | 
314 |   let is_valid_utf_16be b =
315 |     let rec loop max b i =
316 |       let get = unsafe_get_uint16_be in
317 |       if i > max then true else
318 |       if i = max then false else
319 |       match get b i with
320 |       | u when u < 0xD800 || u > 0xDFFF -> loop max b (i + 2)
321 |       | u when u > 0xDBFF -> false
322 |       | hi ->
323 |           let last = i + 3 in
324 |           if last > max then false else
325 |           match get b (i + 2) with
326 |           | u when u < 0xDC00 || u > 0xDFFF -> false
327 |           | lo -> loop max b (i + 4)
328 |     in
329 |     loop (length b - 1) b 0
330 | 
331 |   (* UTF-16LE *)
332 | 
333 |   let get_utf_16le_uchar b i =
334 |     let get = unsafe_get_uint16_le in
335 |     let max = length b - 1 in
336 |     if i < 0 || i > max then invalid_arg "index out of bounds" else
337 |     if i = max then dec_invalid 1 else
338 |     match get b i with
339 |     | u when u < 0xD800 || u > 0xDFFF -> dec_ret 2 u
340 |     | u when u > 0xDBFF -> dec_invalid 2
341 |     | hi -> (* combine [hi] with a low surrogate *)
342 |         let last = i + 3 in
343 |         if last > max then dec_invalid (max - i + 1) else
344 |         match get b (i + 2) with
345 |         | u when u < 0xDC00 || u > 0xDFFF -> dec_invalid 2 (* retry here *)
346 |         | lo ->
347 |             let u = (((hi land 0x3FF) lsl 10) lor (lo land 0x3FF)) + 0x10000 in
348 |             dec_ret 4 u
349 | 
350 |   let set_utf_16le_uchar b i u =
351 |     let set = unsafe_set_uint16_le in
352 |     let max = length b - 1 in
353 |     if i < 0 || i > max then invalid_arg "index out of bounds" else
354 |     match Uchar.to_int u with
355 |     | u when u < 0 -> assert false
356 |     | u when u <= 0xFFFF ->
357 |         let last = i + 1 in
358 |         if last > max then -2 else (set b i u; 2)
359 |     | u ->
360 |         let last = i + 3 in
361 |         if last > max then -4 else
362 |         let u' = u - 0x10000 in
363 |         let hi = (0xD800 lor (u' lsr 10)) in
364 |         let lo = (0xDC00 lor (u' land 0x3FF)) in
365 |         set b i hi; set b (i + 2) lo; 4
366 | 
367 |   let is_valid_utf_16le b =
368 |     let rec loop max b i =
369 |       let get = unsafe_get_uint16_le in
370 |       if i > max then true else
371 |       if i = max then false else
372 |       match get b i with
373 |       | u when u < 0xD800 || u > 0xDFFF -> loop max b (i + 2)
374 |       | u when u > 0xDBFF -> false
375 |       | hi ->
376 |           let last = i + 3 in
377 |           if last > max then false else
378 |           match get b (i + 2) with
379 |           | u when u < 0xDC00 || u > 0xDFFF -> false
380 |           | lo -> loop max b (i + 4)
381 |     in
382 |     loop (length b - 1) b 0
383 | end
384 | 


--------------------------------------------------------------------------------
/utf_x_adhoc.mli:
--------------------------------------------------------------------------------
 1 | (**************************************************************************)
 2 | (*                                                                        *)
 3 | (*                                 OCaml                                  *)
 4 | (*                                                                        *)
 5 | (*                         The OCaml programmers                          *)
 6 | (*                                                                        *)
 7 | (*   Copyright 2021 Institut National de Recherche en Informatique et     *)
 8 | (*     en Automatique.                                                    *)
 9 | (*                                                                        *)
10 | (*   All rights reserved.  This file is distributed under the terms of    *)
11 | (*   the GNU Lesser General Public License version 2.1, with the          *)
12 | (*   special exception on linking described in the file LICENSE.          *)
13 | (*                                                                        *)
14 | (**************************************************************************)
15 | 
16 | include Utf_x.S
17 | 


--------------------------------------------------------------------------------
/utf_x_dfa.ml:
--------------------------------------------------------------------------------
  1 | (**************************************************************************)
  2 | (*                                                                        *)
  3 | (*                                 OCaml                                  *)
  4 | (*                                                                        *)
  5 | (*                         The OCaml programmers                          *)
  6 | (*                                                                        *)
  7 | (*   Copyright 2021 Institut National de Recherche en Informatique et     *)
  8 | (*     en Automatique.                                                    *)
  9 | (*                                                                        *)
 10 | (*   All rights reserved.  This file is distributed under the terms of    *)
 11 | (*   the GNU Lesser General Public License version 2.1, with the          *)
 12 | (*   special exception on linking described in the file LICENSE.          *)
 13 | (*                                                                        *)
 14 | (**************************************************************************)
 15 | 
 16 | module Uchar = Utf_uchar
 17 | module Bytes = struct
 18 |   include Bytes
 19 | 
 20 |   (* Unsafe internal additions *)
 21 | 
 22 |   external unsafe_set_uint8 : t -> int -> int -> unit = "%bytes_unsafe_set"
 23 |   external unsafe_get_uint8 : t -> int -> int = "%bytes_unsafe_get"
 24 | 
 25 |   (* UTF-X codecs and validations *)
 26 | 
 27 |   let dec_invalid = Uchar.utf_decode_invalid
 28 |   let[@inline] dec_ret n u = Uchar.utf_decode n (Uchar.unsafe_of_int u)
 29 | 
 30 |   (* UTF-8 *)
 31 | 
 32 |   let utf_8_dfa =
 33 |     (* The first 256 entries map bytes to their class. Then the
 34 |        DFA follows. The DFA automaton is MIT-licensed
 35 |        Copyright (c) 2008-2009 Bjoern Hoehrmann.
 36 |        See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/#variations*)
 37 |     unsafe_of_string
 38 |     "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\
 39 |      \000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\
 40 |      \000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\
 41 |      \000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\
 42 |      \000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\
 43 |      \000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\
 44 |      \000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\
 45 |      \000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\
 46 |      \001\001\001\001\001\001\001\001\001\001\001\001\001\001\001\001\
 47 |      \009\009\009\009\009\009\009\009\009\009\009\009\009\009\009\009\
 48 |      \007\007\007\007\007\007\007\007\007\007\007\007\007\007\007\007\
 49 |      \007\007\007\007\007\007\007\007\007\007\007\007\007\007\007\007\
 50 |      \008\008\002\002\002\002\002\002\002\002\002\002\002\002\002\002\
 51 |      \002\002\002\002\002\002\002\002\002\002\002\002\002\002\002\002\
 52 |      \010\003\003\003\003\003\003\003\003\003\003\003\003\004\003\003\
 53 |      \011\006\006\006\005\008\008\008\008\008\008\008\008\008\008\008\
 54 |      \000\012\024\036\060\096\084\012\012\012\048\072\
 55 |      \012\012\012\012\012\012\012\012\012\012\012\012\
 56 |      \012\000\012\012\012\012\012\000\012\000\012\012\
 57 |      \012\024\012\012\012\012\012\024\012\024\012\012\
 58 |      \012\012\012\012\012\012\012\024\012\012\012\012\
 59 |      \012\024\012\012\012\012\012\012\012\024\012\012\
 60 |      \012\012\012\012\012\012\012\036\012\036\012\012\
 61 |      \012\036\012\012\012\012\012\036\012\036\012\012\
 62 |      \012\036\012\012\012\012\012\012\012\012\012\012"
 63 | 
 64 |   let accept = 0
 65 |   let reject = 12
 66 | 
 67 | (* Rewriting the following as an imperative loop is uglier but
 68 |    more efficient.
 69 | 
 70 |   let get_utf_8_uchar b i =
 71 |     let max = length b - 1 in
 72 |     if i < 0 || i > max then invalid_arg "index out of bounds" else
 73 |     let rec loop st u j =
 74 |       if j > max then dec_err (j - i) else
 75 |       let byte = unsafe_get_uint8 b j in
 76 |       let class' = unsafe_get_uint8 utf_8_dfa byte in
 77 |       let u' =
 78 |         if st <> accept
 79 |         then (u lsl 6) lor (byte land 0x3F)
 80 |         else byte land (0xFF lsr class')
 81 |       in
 82 |       let st' = unsafe_get_uint8 utf_8_dfa (256 + st + class') in
 83 |       if st' = reject then dec_err (j - i + 1) else
 84 |       if st' = accept then dec_ret (j - i + 1) u' else
 85 |       loop st' u' (j + 1)
 86 |     in
 87 |     loop accept 0 i
 88 | *)
 89 | 
 90 |   let get_utf_8_uchar b i =
 91 |     let max = length b - 1 in
 92 |     if i < 0 || i > max then invalid_arg "index out of bounds" else
 93 |     let stop = ref false in
 94 |     let st = ref accept in
 95 |     let j = ref i in
 96 |     let u = ref 0 in
 97 |     while (not !stop) do
 98 |       let byte = unsafe_get_uint8 b !j in
 99 |       let class' = unsafe_get_uint8 utf_8_dfa byte in
100 |       u :=
101 |         (if !st <> accept
102 |          then (!u lsl 6) lor (byte land 0x3F)
103 |          else byte land (0xFF lsr class'));
104 |       st := unsafe_get_uint8 utf_8_dfa (256 + !st + class');
105 |       if (!st = reject || !st = accept)
106 |       then stop := true
107 |       else (incr j; if !j > max then stop := true);
108 |     done;
109 |     if !j > max then dec_invalid (!j - i) else
110 |     if !st = accept then dec_ret (!j - i + 1) !u else
111 |     if !st = reject then dec_invalid (if i = !j then 1 else (!j - i)) else
112 |     assert false
113 | 
114 |   let set_utf_8_uchar b i u =
115 |     let set = unsafe_set_uint8 in
116 |     let max = length b - 1 in
117 |     match Uchar.to_int u with
118 |     | u when u <= 0x007F ->
119 |         set_uint8 b i u;
120 |         1
121 |     | u when u <= 0x07FF ->
122 |         let last = i + 1 in
123 |         if last > max then 0 else
124 |         (set_uint8 b i (0xC0 lor (u lsr 6));
125 |          set b last (0x80 lor (u land 0x3F));
126 |          2)
127 |     | u when u <= 0xFFFF ->
128 |         let last = i + 2 in
129 |         if last > max then 0 else
130 |         (set_uint8 b i (0xE0 lor (u lsr 12));
131 |          set b (i + 1) (0x80 lor ((u lsr 6) land 0x3F));
132 |          set b last (0x80 lor (u land 0x3F));
133 |          3)
134 |     | u ->
135 |         let last = i + 3 in
136 |         if last > max then 0 else
137 |         (set_uint8 b i (0xF0 lor (u lsr 18));
138 |          set b (i + 1) (0x80 lor ((u lsr 12) land 0x3F));
139 |          set b (i + 2) (0x80 lor ((u lsr 6) land 0x3F));
140 |          set b last (0x80 lor (u land 0x3F));
141 |          4)
142 | 
143 |   let is_valid_utf_8 b =
144 |     (* This could be optimized using the state machine. We didn't
145 |        do it for now since it seems the adhoc implementation yields
146 |        better perfs. *)
147 |     let rec loop b i =
148 |       if i >= length b then true else
149 |       let dec = get_utf_8_uchar b i in
150 |       if Uchar.utf_decode_is_valid dec
151 |       then loop b (i + Uchar.utf_decode_length dec)
152 |       else false
153 |     in
154 |     loop b 0
155 | 
156 |   include Utf_16
157 | end
158 | 


--------------------------------------------------------------------------------
/utf_x_dfa.mli:
--------------------------------------------------------------------------------
 1 | (**************************************************************************)
 2 | (*                                                                        *)
 3 | (*                                 OCaml                                  *)
 4 | (*                                                                        *)
 5 | (*                         The OCaml programmers                          *)
 6 | (*                                                                        *)
 7 | (*   Copyright 2021 Institut National de Recherche en Informatique et     *)
 8 | (*     en Automatique.                                                    *)
 9 | (*                                                                        *)
10 | (*   All rights reserved.  This file is distributed under the terms of    *)
11 | (*   the GNU Lesser General Public License version 2.1, with the          *)
12 | (*   special exception on linking described in the file LICENSE.          *)
13 | (*                                                                        *)
14 | (**************************************************************************)
15 | 
16 | include Utf_x.S
17 | 


--------------------------------------------------------------------------------
/utf_x_if.ml:
--------------------------------------------------------------------------------
  1 | (**************************************************************************)
  2 | (*                                                                        *)
  3 | (*                                 OCaml                                  *)
  4 | (*                                                                        *)
  5 | (*                         The OCaml programmers                          *)
  6 | (*                                                                        *)
  7 | (*   Copyright 2021 Institut National de Recherche en Informatique et     *)
  8 | (*     en Automatique.                                                    *)
  9 | (*                                                                        *)
 10 | (*   All rights reserved.  This file is distributed under the terms of    *)
 11 | (*   the GNU Lesser General Public License version 2.1, with the          *)
 12 | (*   special exception on linking described in the file LICENSE.          *)
 13 | (*                                                                        *)
 14 | (**************************************************************************)
 15 | 
 16 | module Uchar = Utf_uchar
 17 | module Bytes = struct
 18 |   include Bytes
 19 | 
 20 |   (* Unsafe internal additions *)
 21 | 
 22 |   external unsafe_set_uint8 : t -> int -> int -> unit = "%bytes_unsafe_set"
 23 |   external unsafe_get_uint8 : t -> int -> int = "%bytes_unsafe_get"
 24 | 
 25 |   (* UTF-X codecs and validations *)
 26 | 
 27 |   let dec_invalid = Uchar.utf_decode_invalid
 28 |   let[@inline] dec_ret n u = Uchar.utf_decode n (Uchar.unsafe_of_int u)
 29 | 
 30 |   (* In case of decoding error, if we error on the first byte, we
 31 |      consume the byte, otherwise we consume the [n] bytes preceeding
 32 |      erroring byte.
 33 | 
 34 |      This means that if a client uses decodes without caring about
 35 |      validity it naturally replace bogus data with Uchar.rep according
 36 |      to the WHATWG Encoding standard. Other schemes are possible by
 37 |      consulting the number of used bytes on invalid decodes. For more
 38 |      details see https://hsivonen.fi/broken-utf-8/
 39 | 
 40 |      For this reason in [get_utf_8_uchar] we gradually check the next
 41 |      byte is available rather than doing it immediately after the
 42 |      first byte. Contrast with [is_valid_utf_8]. *)
 43 | 
 44 |   (* UTF-8 *)
 45 | 
 46 |   let[@inline] not_in_x80_to_xBF b = b lsr 6 <> 0b10
 47 |   let[@inline] not_in_xA0_to_xBF b = b lsr 5 <> 0b101
 48 |   let[@inline] not_in_x80_to_x9F b = b lsr 5 <> 0b100
 49 |   let[@inline] not_in_x90_to_xBF b = b < 0x90 || 0xBF < b
 50 |   let[@inline] not_in_x80_to_x8F b = b lsr 4 <> 0x8
 51 | 
 52 |   let[@inline] utf_8_uchar_2 b0 b1 =
 53 |     ((b0 land 0x1F) lsl 6) lor
 54 |     ((b1 land 0x3F))
 55 | 
 56 |   let[@inline] utf_8_uchar_3 b0 b1 b2 =
 57 |     ((b0 land 0x0F) lsl 12) lor
 58 |     ((b1 land 0x3F) lsl 6) lor
 59 |     ((b2 land 0x3F))
 60 | 
 61 |   let[@inline] utf_8_uchar_4 b0 b1 b2 b3 =
 62 |     ((b0 land 0x07) lsl 18) lor
 63 |     ((b1 land 0x3F) lsl 12) lor
 64 |     ((b2 land 0x3F) lsl 6) lor
 65 |     ((b3 land 0x3F))
 66 | 
 67 |   let get_utf_8_uchar b i =
 68 |     let b0 = get_uint8 b i in (* raises if [i] is not a valid index. *)
 69 |     let get = unsafe_get_uint8 in
 70 |     if b0 <= 0x7F then (* 00..7F *) dec_ret 1 b0 else
 71 |     if b0 land 0xE0 = 0xC0 && b0 >= 0xC2 then begin
 72 |       (* C2..DF *)
 73 |       let max = length b - 1 in
 74 |       let i = i + 1 in if i > max then dec_invalid 1 else
 75 |       let b1 = get b i in if not_in_x80_to_xBF b1 then dec_invalid 1 else
 76 |       dec_ret 2 (utf_8_uchar_2 b0 b1)
 77 |     end else if b0 land 0xF0 = 0xE0 then begin
 78 |       let max = length b - 1 in
 79 |       if b0 = 0xE0 then begin
 80 |         (* E0 *)
 81 |         let i = i + 1 in if i > max then dec_invalid 1 else
 82 |         let b1 = get b i in if not_in_xA0_to_xBF b1 then dec_invalid 1 else
 83 |         let i = i + 1 in if i > max then dec_invalid 2 else
 84 |         let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
 85 |         dec_ret 3 (utf_8_uchar_3 b0 b1 b2)
 86 |       end else if b0 = 0xED then begin
 87 |         (* ED *)
 88 |         let i = i + 1 in if i > max then dec_invalid 1 else
 89 |         let b1 = get b i in if not_in_x80_to_x9F b1 then dec_invalid 1 else
 90 |         let i = i + 1 in if i > max then dec_invalid 2 else
 91 |         let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
 92 |         dec_ret 3 (utf_8_uchar_3 b0 b1 b2)
 93 |       end else begin
 94 |         (* E1..EC or EE..EF *)
 95 |         let i = i + 1 in if i > max then dec_invalid 1 else
 96 |         let b1 = get b i in if not_in_x80_to_xBF b1 then dec_invalid 1 else
 97 |         let i = i + 1 in if i > max then dec_invalid 2 else
 98 |         let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
 99 |         dec_ret 3 (utf_8_uchar_3 b0 b1 b2)
100 |       end
101 |     end else if b0 land 0xF8 = 0xF0 && b0 <= 0xF4 then begin
102 |       let max = length b - 1 in
103 |       if b0 = 0xF0 then begin
104 |         (* F0 *)
105 |         let i = i + 1 in if i > max then dec_invalid 1 else
106 |         let b1 = get b i in if not_in_x90_to_xBF b1 then dec_invalid 1 else
107 |         let i = i + 1 in if i > max then dec_invalid 2 else
108 |         let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
109 |         let i = i + 1 in if i > max then dec_invalid 3 else
110 |         let b3 = get b i in if not_in_x80_to_xBF b3 then dec_invalid 3 else
111 |         dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3)
112 |       end else if b0 = 0xF4 then begin
113 |         (* F4 *)
114 |         let i = i + 1 in if i > max then dec_invalid 1 else
115 |         let b1 = get b i in if not_in_x80_to_x8F b1 then dec_invalid 1 else
116 |         let i = i + 1 in if i > max then dec_invalid 2 else
117 |         let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
118 |         let i = i + 1 in if i > max then dec_invalid 3 else
119 |         let b3 = get b i in if not_in_x80_to_xBF b3 then dec_invalid 3 else
120 |         dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3)
121 |       end else begin
122 |         let i = i + 1 in if i > max then dec_invalid 1 else
123 |         let b1 = get b i in if not_in_x80_to_xBF b1 then dec_invalid 1 else
124 |         let i = i + 1 in if i > max then dec_invalid 2 else
125 |         let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
126 |         let i = i + 1 in if i > max then dec_invalid 3 else
127 |         let b3 = get b i in if not_in_x80_to_xBF b3 then dec_invalid 3 else
128 |         dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3)
129 |       end
130 |     end else dec_invalid 1
131 | 
132 |   let set_utf_8_uchar b i u =
133 |     let set = unsafe_set_uint8 in
134 |     let max = length b - 1 in
135 |     match Uchar.to_int u with
136 |     | u when u < 0 -> assert false
137 |     | u when u <= 0x007F ->
138 |         set_uint8 b i u;
139 |         1
140 |     | u when u <= 0x07FF ->
141 |         let last = i + 1 in
142 |         if last > max then 0 else
143 |         (set_uint8 b i (0xC0 lor (u lsr 6));
144 |          set b last (0x80 lor (u land 0x3F));
145 |          2)
146 |     | u when u <= 0xFFFF ->
147 |         let last = i + 2 in
148 |         if last > max then 0 else
149 |         (set_uint8 b i (0xE0 lor (u lsr 12));
150 |          set b (i + 1) (0x80 lor ((u lsr 6) land 0x3F));
151 |          set b last (0x80 lor (u land 0x3F));
152 |          3)
153 |     | u ->
154 |         let last = i + 3 in
155 |         if last > max then 0 else
156 |         (set_uint8 b i (0xF0 lor (u lsr 18));
157 |          set b (i + 1) (0x80 lor ((u lsr 12) land 0x3F));
158 |          set b (i + 2) (0x80 lor ((u lsr 6) land 0x3F));
159 |          set b last (0x80 lor (u land 0x3F));
160 |          4)
161 | 
162 |   let is_valid_utf_8 b =
163 |     let rec loop max b i =
164 |       if i > max then true else
165 |       let get = unsafe_get_uint8 in
166 |       let b0 = get b i in
167 |       if b0 <= 0x7F then (* 00..7F *) loop max b (i + 1) else
168 |       if b0 land 0xE0 = 0xC0 && b0 >= 0xC2 then begin
169 |         (* C2..DF *)
170 |         let last = i + 1 in
171 |         if last > max
172 |         || not_in_x80_to_xBF (get b last)
173 |         then false
174 |         else loop max b (last + 1)
175 |       end else if b0 land 0xF0 = 0xE0 then begin
176 |         if b0 = 0xE0 then begin
177 |           (* E0 *)
178 |           let last = i + 2 in
179 |           if last > max
180 |           || not_in_xA0_to_xBF (get b (i + 1))
181 |           || not_in_x80_to_xBF (get b last)
182 |           then false
183 |           else loop max b (last + 1)
184 |         end else if b0 = 0xED then begin
185 |           let last = i + 2 in
186 |           if last > max
187 |           || not_in_x80_to_x9F (get b (i + 1))
188 |           || not_in_x80_to_xBF (get b last)
189 |           then false
190 |           else loop max b (last + 1)
191 |         end else begin
192 |           (* E1..EC or EE..EF *)
193 |           let last = i + 2 in
194 |           if last > max
195 |           || not_in_x80_to_xBF (get b (i + 1))
196 |           || not_in_x80_to_xBF (get b last)
197 |           then false
198 |           else loop max b (last + 1)
199 |         end
200 |       end else if b0 land 0xF8 = 0xF0 && b0 <= 0xF4 then begin
201 |         if b0 = 0xF0 then begin
202 |           (* F0 *)
203 |           let last = i + 3 in
204 |           if last > max
205 |           || not_in_x90_to_xBF (get b (i + 1))
206 |           || not_in_x80_to_xBF (get b (i + 2))
207 |           || not_in_x80_to_xBF (get b last)
208 |           then false
209 |           else loop max b (last + 1)
210 |         end else if b0 = 0xF4 then begin
211 |           (* F4 *)
212 |           let last = i + 3 in
213 |           if last > max
214 |           || not_in_x80_to_x8F (get b (i + 1))
215 |           || not_in_x80_to_xBF (get b (i + 2))
216 |           || not_in_x80_to_xBF (get b last)
217 |           then false
218 |           else loop max b (last + 1)
219 |         end else begin
220 |           (* F1..F3 *)
221 |           let last = i + 3 in
222 |           if last > max
223 |           || not_in_x80_to_xBF (get b (i + 1))
224 |           || not_in_x80_to_xBF (get b (i + 2))
225 |           || not_in_x80_to_xBF (get b last)
226 |           then false
227 |           else loop max b (last + 1)
228 |         end
229 |       end else false
230 |     in
231 |     loop (length b - 1) b 0
232 | 
233 |   include Utf_16
234 | end
235 | 


--------------------------------------------------------------------------------
/utf_x_if.mli:
--------------------------------------------------------------------------------
 1 | (**************************************************************************)
 2 | (*                                                                        *)
 3 | (*                                 OCaml                                  *)
 4 | (*                                                                        *)
 5 | (*                         The OCaml programmers                          *)
 6 | (*                                                                        *)
 7 | (*   Copyright 2021 Institut National de Recherche en Informatique et     *)
 8 | (*     en Automatique.                                                    *)
 9 | (*                                                                        *)
10 | (*   All rights reserved.  This file is distributed under the terms of    *)
11 | (*   the GNU Lesser General Public License version 2.1, with the          *)
12 | (*   special exception on linking described in the file LICENSE.          *)
13 | (*                                                                        *)
14 | (**************************************************************************)
15 | 
16 | include Utf_x.S
17 | 


--------------------------------------------------------------------------------
/utf_x_pat.ml:
--------------------------------------------------------------------------------
  1 | (**************************************************************************)
  2 | (*                                                                        *)
  3 | (*                                 OCaml                                  *)
  4 | (*                                                                        *)
  5 | (*                         The OCaml programmers                          *)
  6 | (*                                                                        *)
  7 | (*   Copyright 2021 Institut National de Recherche en Informatique et     *)
  8 | (*     en Automatique.                                                    *)
  9 | (*                                                                        *)
 10 | (*   All rights reserved.  This file is distributed under the terms of    *)
 11 | (*   the GNU Lesser General Public License version 2.1, with the          *)
 12 | (*   special exception on linking described in the file LICENSE.          *)
 13 | (*                                                                        *)
 14 | (**************************************************************************)
 15 | 
 16 | module Uchar = Utf_uchar
 17 | module Bytes = struct
 18 |   include Bytes
 19 | 
 20 |   (* Unsafe internal additions *)
 21 | 
 22 |   external unsafe_set_uint8 : t -> int -> int -> unit = "%bytes_unsafe_set"
 23 |   external unsafe_get_uint8 : t -> int -> int = "%bytes_unsafe_get"
 24 | 
 25 |   (* UTF-X codecs and validations *)
 26 | 
 27 |   let dec_invalid = Uchar.utf_decode_invalid
 28 |   let[@inline] dec_ret n u = Uchar.utf_decode n (Uchar.unsafe_of_int u)
 29 | 
 30 |   (* In case of decoding error, if we error on the first byte, we
 31 |      consume the byte, otherwise we consume the [n] bytes preceeding
 32 |      the erroring byte.
 33 | 
 34 |      This means that if a client uses decodes without caring about
 35 |      validity it naturally replace bogus data with Uchar.rep according
 36 |      to the WHATWG Encoding standard. Other schemes are possible by
 37 |      consulting the number of used bytes on invalid decodes. For more
 38 |      details see https://hsivonen.fi/broken-utf-8/
 39 | 
 40 |      For this reason in [get_utf_8_uchar] we gradually check the next
 41 |      byte is available rather than doing it immediately after the
 42 |      first byte. Contrast with [is_valid_utf_8]. *)
 43 | 
 44 |   (* UTF-8 *)
 45 | 
 46 |   let[@inline] not_in_x80_to_xBF b = b lsr 6 <> 0b10
 47 |   let[@inline] not_in_xA0_to_xBF b = b lsr 5 <> 0b101
 48 |   let[@inline] not_in_x80_to_x9F b = b lsr 5 <> 0b100
 49 |   let[@inline] not_in_x90_to_xBF b = b < 0x90 || 0xBF < b
 50 |   let[@inline] not_in_x80_to_x8F b = b lsr 4 <> 0x8
 51 | 
 52 |   let[@inline] utf_8_uchar_2 b0 b1 =
 53 |     ((b0 land 0x1F) lsl 6) lor
 54 |     ((b1 land 0x3F))
 55 | 
 56 |   let[@inline] utf_8_uchar_3 b0 b1 b2 =
 57 |     ((b0 land 0x0F) lsl 12) lor
 58 |     ((b1 land 0x3F) lsl 6) lor
 59 |     ((b2 land 0x3F))
 60 | 
 61 |   let[@inline] utf_8_uchar_4 b0 b1 b2 b3 =
 62 |     ((b0 land 0x07) lsl 18) lor
 63 |     ((b1 land 0x3F) lsl 12) lor
 64 |     ((b2 land 0x3F) lsl 6) lor
 65 |     ((b3 land 0x3F))
 66 | 
 67 |   let get_utf_8_uchar b i =
 68 |     let b0 = get_uint8 b i in (* raises if [i] is not a valid index. *)
 69 |     let get = unsafe_get_uint8 in
 70 |     let max = length b - 1 in
 71 |     match Char.unsafe_chr b0 with (* See The Unicode Standard, Table 3.7 *)
 72 |     | '\x00' .. '\x7F' -> dec_ret 1 b0
 73 |     | '\xC2' .. '\xDF' ->
 74 |         let i = i + 1 in if i > max then dec_invalid 1 else
 75 |         let b1 = get b i in if not_in_x80_to_xBF b1 then dec_invalid 1 else
 76 |         dec_ret 2 (utf_8_uchar_2 b0 b1)
 77 |     | '\xE0' ->
 78 |         let i = i + 1 in if i > max then dec_invalid 1 else
 79 |         let b1 = get b i in if not_in_xA0_to_xBF b1 then dec_invalid 1 else
 80 |         let i = i + 1 in if i > max then dec_invalid 2 else
 81 |         let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
 82 |         dec_ret 3 (utf_8_uchar_3 b0 b1 b2)
 83 |     | '\xE1' .. '\xEC' | '\xEE' .. '\xEF' ->
 84 |         let i = i + 1 in if i > max then dec_invalid 1 else
 85 |         let b1 = get b i in if not_in_x80_to_xBF b1 then dec_invalid 1 else
 86 |         let i = i + 1 in if i > max then dec_invalid 2 else
 87 |         let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
 88 |         dec_ret 3 (utf_8_uchar_3 b0 b1 b2)
 89 |     | '\xED' ->
 90 |         let i = i + 1 in if i > max then dec_invalid 1 else
 91 |         let b1 = get b i in if not_in_x80_to_x9F b1 then dec_invalid 1 else
 92 |         let i = i + 1 in if i > max then dec_invalid 2 else
 93 |         let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
 94 |         dec_ret 3 (utf_8_uchar_3 b0 b1 b2)
 95 |     | '\xF0' ->
 96 |         let i = i + 1 in if i > max then dec_invalid 1 else
 97 |         let b1 = get b i in if not_in_x90_to_xBF b1 then dec_invalid 1 else
 98 |         let i = i + 1 in if i > max then dec_invalid 2 else
 99 |         let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
100 |         let i = i + 1 in if i > max then dec_invalid 3 else
101 |         let b3 = get b i in if not_in_x80_to_xBF b3 then dec_invalid 3 else
102 |         dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3)
103 |     | '\xF1' .. '\xF3' ->
104 |         let i = i + 1 in if i > max then dec_invalid 1 else
105 |         let b1 = get b i in if not_in_x80_to_xBF b1 then dec_invalid 1 else
106 |         let i = i + 1 in if i > max then dec_invalid 2 else
107 |         let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
108 |         let i = i + 1 in if i > max then dec_invalid 3 else
109 |         let b3 = get b i in if not_in_x80_to_xBF b3 then dec_invalid 3 else
110 |         dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3)
111 |     | '\xF4' ->
112 |         let i = i + 1 in if i > max then dec_invalid 1 else
113 |         let b1 = get b i in if not_in_x80_to_x8F b1 then dec_invalid 1 else
114 |         let i = i + 1 in if i > max then dec_invalid 2 else
115 |         let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
116 |         let i = i + 1 in if i > max then dec_invalid 3 else
117 |         let b3 = get b i in if not_in_x80_to_xBF b3 then dec_invalid 3 else
118 |         dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3)
119 |     | _ -> dec_invalid 1
120 | 
121 |   let set_utf_8_uchar b i u =
122 |     let set = unsafe_set_uint8 in
123 |     let max = length b - 1 in
124 |     match Uchar.to_int u with
125 |     | u when u < 0 -> assert false
126 |     | u when u <= 0x007F ->
127 |         set_uint8 b i u;
128 |         1
129 |     | u when u <= 0x07FF ->
130 |         let last = i + 1 in
131 |         if last > max then 0 else
132 |         (set_uint8 b i (0xC0 lor (u lsr 6));
133 |          set b last (0x80 lor (u land 0x3F));
134 |          2)
135 |     | u when u <= 0xFFFF ->
136 |         let last = i + 2 in
137 |         if last > max then 0 else
138 |         (set_uint8 b i (0xE0 lor (u lsr 12));
139 |          set b (i + 1) (0x80 lor ((u lsr 6) land 0x3F));
140 |          set b last (0x80 lor (u land 0x3F));
141 |          3)
142 |     | u ->
143 |         let last = i + 3 in
144 |         if last > max then 0 else
145 |         (set_uint8 b i (0xF0 lor (u lsr 18));
146 |          set b (i + 1) (0x80 lor ((u lsr 12) land 0x3F));
147 |          set b (i + 2) (0x80 lor ((u lsr 6) land 0x3F));
148 |          set b last (0x80 lor (u land 0x3F));
149 |          4)
150 | 
151 |   let is_valid_utf_8 b =
152 |     let rec loop max b i =
153 |       if i > max then true else
154 |       let get = unsafe_get_uint8 in
155 |       match Char.unsafe_chr (get b i) with
156 |       | '\x00' .. '\x7F' -> loop max b (i + 1)
157 |       | '\xC2' .. '\xDF' ->
158 |           let last = i + 1 in
159 |           if last > max
160 |           || not_in_x80_to_xBF (get b last)
161 |           then false
162 |           else loop max b (last + 1)
163 |       | '\xE0' ->
164 |           let last = i + 2 in
165 |           if last > max
166 |           || not_in_xA0_to_xBF (get b (i + 1))
167 |           || not_in_x80_to_xBF (get b last)
168 |           then false
169 |           else loop max b (last + 1)
170 |       | '\xE1' .. '\xEC' | '\xEE' .. '\xEF' ->
171 |           let last = i + 2 in
172 |           if last > max
173 |           || not_in_x80_to_xBF (get b (i + 1))
174 |           || not_in_x80_to_xBF (get b last)
175 |           then false
176 |           else loop max b (last + 1)
177 |       | '\xED' ->
178 |           let last = i + 2 in
179 |           if last > max
180 |           || not_in_x80_to_x9F (get b (i + 1))
181 |           || not_in_x80_to_xBF (get b last)
182 |           then false
183 |           else loop max b (last + 1)
184 |       | '\xF0' ->
185 |           let last = i + 3 in
186 |           if last > max
187 |           || not_in_x90_to_xBF (get b (i + 1))
188 |           || not_in_x80_to_xBF (get b (i + 2))
189 |           || not_in_x80_to_xBF (get b last)
190 |           then false
191 |           else loop max b (last + 1)
192 |       | '\xF1' .. '\xF3' ->
193 |           let last = i + 3 in
194 |           if last > max
195 |           || not_in_x80_to_xBF (get b (i + 1))
196 |           || not_in_x80_to_xBF (get b (i + 2))
197 |           || not_in_x80_to_xBF (get b last)
198 |           then false
199 |           else loop max b (last + 1)
200 |       | '\xF4' ->
201 |           let last = i + 3 in
202 |           if last > max
203 |           || not_in_x80_to_x8F (get b (i + 1))
204 |           || not_in_x80_to_xBF (get b (i + 2))
205 |           || not_in_x80_to_xBF (get b last)
206 |           then false
207 |           else loop max b (last + 1)
208 |       | _ -> false
209 |     in
210 |     loop (length b - 1) b 0
211 | 
212 |   include Utf_16
213 | end
214 | 


--------------------------------------------------------------------------------
/utf_x_pat.mli:
--------------------------------------------------------------------------------
 1 | (**************************************************************************)
 2 | (*                                                                        *)
 3 | (*                                 OCaml                                  *)
 4 | (*                                                                        *)
 5 | (*                         The OCaml programmers                          *)
 6 | (*                                                                        *)
 7 | (*   Copyright 2021 Institut National de Recherche en Informatique et     *)
 8 | (*     en Automatique.                                                    *)
 9 | (*                                                                        *)
10 | (*   All rights reserved.  This file is distributed under the terms of    *)
11 | (*   the GNU Lesser General Public License version 2.1, with the          *)
12 | (*   special exception on linking described in the file LICENSE.          *)
13 | (*                                                                        *)
14 | (**************************************************************************)
15 | 
16 | include Utf_x.S
17 | 


--------------------------------------------------------------------------------