├── .gitignore
├── .merlin
├── B0.ml
├── BRZO
├── DEVEL.md
├── LICENSE.md
├── README.md
├── data
├── urep-spec.html
└── urep-test.html
├── examples.ml
├── old-proposal.md
├── perf8.ml
├── test.ml
├── test_long.ml
├── trip8.ml
├── utf_16.ml
├── utf_uchar.ml
├── utf_uchar.mli
├── utf_x.mli
├── utf_x_adhoc.ml
├── utf_x_adhoc.mli
├── utf_x_dfa.ml
├── utf_x_dfa.mli
├── utf_x_if.ml
├── utf_x_if.mli
├── utf_x_pat.ml
└── utf_x_pat.mli
/.gitignore:
--------------------------------------------------------------------------------
1 | _b0
2 | _build
3 | data
4 | tmp
5 | *.install
6 | CLOCK.org
7 |
--------------------------------------------------------------------------------
/.merlin:
--------------------------------------------------------------------------------
1 | PKG b0.kit uutf
2 | S ./**
3 | B _b0/b/**
4 |
--------------------------------------------------------------------------------
/B0.ml:
--------------------------------------------------------------------------------
1 | open B0_kit.V000
2 |
3 | let uutf = B0_ocaml.libname "uutf"
4 |
5 | let file v = `File (Fpath.v v)
6 | let mod_srcs n = file (n ^ ".mli"), file (n ^ ".ml")
7 |
8 | let iface = file "utf_x.mli"
9 | let uchar_mli, uchar_ml = mod_srcs "utf_uchar"
10 | let utf_16_ml = file "utf_16.ml"
11 | let adhoc_mli, adhoc_ml = mod_srcs "utf_x_adhoc"
12 | let if_mli, if_ml = mod_srcs "utf_x_if"
13 | let dfa_mli, dfa_ml = mod_srcs "utf_x_dfa"
14 | let pat_mli, pat_ml = mod_srcs "utf_x_pat"
15 |
16 | let base = [iface; uchar_mli; uchar_ml; utf_16_ml]
17 | let all_impls =
18 | [adhoc_mli; adhoc_ml; if_mli; if_ml; dfa_mli; dfa_ml; pat_mli; pat_ml ] @
19 | base
20 |
21 | let examples =
22 | let srcs = [adhoc_mli; adhoc_ml; file "examples.ml"] @ base in
23 | B0_ocaml.exe "examples" ~doc:"Sample code" ~srcs
24 |
25 | let test =
26 | let srcs = (file "test.ml" :: all_impls) in
27 | B0_ocaml.exe "test" ~doc:"Test suite" ~srcs
28 |
29 | let test =
30 | let srcs = (file "test_long.ml" :: all_impls) in
31 | B0_ocaml.exe "test_long" ~doc:"Long test suite" ~srcs
32 |
33 | let perf8 =
34 | let srcs = (file "perf8.ml" :: all_impls) in
35 | let requires = [uutf] in
36 | B0_ocaml.exe "perf8" ~doc:"Performance test tool for UTF-8" ~requires ~srcs
37 |
38 | let trip8 =
39 | let srcs = (file "trip8.ml" :: all_impls) in
40 | let requires = [uutf] in
41 | B0_ocaml.exe "trip8" ~doc:"Best-effort UTF-8 recode" ~requires ~srcs
42 |
--------------------------------------------------------------------------------
/BRZO:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dbuenzli/stdlib-utf/3b9a96e7530830bd5ea87c3c92dc9e9bd02d4dee/BRZO
--------------------------------------------------------------------------------
/DEVEL.md:
--------------------------------------------------------------------------------
1 | Running stuff.
2 |
3 | ```
4 | opam install uutf
5 | opam pin add b0 --dev
6 | curl -L https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-demo.txt \
7 | > data/UTF-8-demo.txt
8 | time b0 -a perf8 -- --adhoc data/UTF-8-demo.txt
9 | time b0 -a perf8 -- --dfa data/UTF-8-demo.txt
10 | time b0 -a perf8 -- --validate --adhoc data/UTF-8-demo.txt
11 |
12 | # If you are interested in absolute numbers avoid timing the build
13 |
14 | b0
15 | time $(b0 unit build-dir perf8)/perf8 ...
16 | ```
17 |
18 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | ```
2 | (**************************************************************************)
3 | (* *)
4 | (* OCaml *)
5 | (* *)
6 | (* The OCaml programmers *)
7 | (* *)
8 | (* Copyright 2021 Institut National de Recherche en Informatique et *)
9 | (* en Automatique. *)
10 | (* *)
11 | (* All rights reserved. This file is distributed under the terms of *)
12 | (* the GNU Lesser General Public License version 2.1, with the *)
13 | (* special exception on linking described in the file LICENSE. *)
14 | (* *)
15 | (**************************************************************************)
16 | ```
17 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Adding UTF-X decoding support to the OCaml Stdlib
2 | =================================================
3 |
4 | **This has been merged upstream and will be part of OCaml 4.14**
5 |
6 | [Upstream issue][upstream]. [Upstream PR][upstream-pr].
7 |
8 | Plays around improving [this old proposal][old-proposal]. Especially
9 | on [Alain's comment][alain-comment] to avoid exceptions.
10 |
11 | For now we simply focus on providing an API in `Bytes`, the rest can
12 | follow easily and/or later.
13 |
14 | We introduce the abstract type `Uchar.utf_decode` for decode results
15 | which is just an `int` value that has the decoded `Uchar.t` (or
16 | `Uchar.rep` in case of decode error) in the lower bits and information
17 | about the decode in bits above `Uchar.max`. The scheme is compatible
18 | with both 32-bit and 64-bit platforms, see the implementation for more
19 | details. No exception have to be introduced and the API does not
20 | allocate on decodes.
21 |
22 | The [resulting API](utf_x.mli) seems nice to
23 | use. [`examples.ml`](examples.ml) show how [`Uutf`-like
24 | folders][uutf-fold] and `Seq` based iterators can easily be derived in
25 | a few lines.
26 |
27 | It is also fool proof w.r.t. to best-effort decodes, `Uchar.rep` will
28 | be appropriately added on decoding errors as it should be for security
29 | reasons. See the `Seq` iterator example and more information on this
30 | below.
31 |
32 | A few implementation are provided which differ only in their UTF-8
33 | treatment:
34 |
35 | 1. [`utf_x_adhoc.ml`](utf_x_adhoc.ml), uses a 256 bytes string to index
36 | the cases mentionned in [TUS Table 3.7][tus] and then performs some
37 | ugly ad-hoc decoding. This ugly code is mine :-)
38 | 2. [`utf_x_if.ml`](utf_x_if.ml), is similar to adhoc, but uses `if`
39 | branches instead of a table, the result is a bit harder to comprehend.
40 | 3. [`utf_x_pat.ml`](utf_x_pat.ml), is similar to adhoc but lets my
41 | favourite compiler work out the dispatch table itself by using pattern
42 | matching on byte ranges. Yay !
43 | 4. [`utf_x_dfa.ml`](utf_x_dfa.ml), uses the UTF-8 decoding MIT-licensed
44 | DFA devised by Bjoern Hoehrmann [here][dfa]. This uses a 364 bytes
45 | string for the DFA and makes the decoding code much more elegant (if
46 | I hadn't to rewrite it to an imperative loop to make it more
47 | efficient).
48 |
49 | Rough benchmarks made with [`perf8.ml`](perf8.ml) and measured via
50 | `time` on my machine seems to indicate we have, ordered by most
51 | performant: `pat`, `if`, `adhoc` and `dfa`.
52 |
53 | The first three being quite close to each other (within
54 | 10%). Comparing the fastest to the slowest, the `pat` version takes
55 | the following percentage of the time taken by the `dfa` version.
56 |
57 | * 62% [Markus Kuhn's UTF-8 demo][kuhn-utf-8]. A mixture of
58 | things.
59 | * 62% [Hindi wikipedia articles multistream][hindi-wiki] dataset.
60 | Exercises ASCII and [Devanagari][devanagari] decodes which are
61 | three bytes long in UTF-8.
62 | * 58% [Croatian wikipedia articles multistream][hr-wiki] dataset.
63 | Exercises ASCII and [Latin Extended-A][latin-ext-A] decodes which
64 | are two bytes long in UTF-8.
65 | * 56% on this README, mainly ASCII.
66 |
67 | So I think the best candidate for upstreaming would be `pat`.
68 |
69 | ## Best-effort decodes and U+FFFD replacements
70 |
71 | The API also tries to take into account recommended strategies for
72 | U+FFFD replacement on best-effort decoding. For a discussion about
73 | these see [here][how-many-urep].
74 |
75 | If nothing special is done, e.g. like the `Seq` iterator in
76 | [`examples.ml`](examples.ml) the resulting API should behave like the
77 | "state machine view". Supposedly this is what is mandated by the
78 | WHATWG [Encoding][whatwg-encoding] standard, if you can infer it
79 | through their crazy prose code specifications – TUS has a description
80 | of it on page 126 of the [version 14.0.0 text][tus] if you can
81 | remember all the precise jargon definitions :-) I believe that a short and
82 | concise spec of all that is:
83 |
84 | > In case of decoding error return an U+FFFE, if you error on the
85 | > first byte, consume the byte, otherwise consume the bytes preceeding
86 | > the erroring byte.
87 |
88 | The API provides enough information on decodes to implement other
89 | replacement modes like one `Uchar.rep` per bogus byte.
90 |
91 | The [`urep-test.html`](data/urep-test.html) and
92 | [`urep-spec.html`](data/urep-spec.html) files taken from the above
93 | mentioned page can be used to test the WHATWG encoding standard
94 | behaviour (note that `Uutf` does not respect this).
95 |
96 | [upstream-pr]: https://github.com/ocaml/ocaml/pull/10710
97 | [upstream]: https://github.com/ocaml/ocaml/issues/10660
98 | [old-proposal]: https://gist.github.com/dbuenzli/211e1fb4d8dfce0d22c6d6616260cdd9
99 | [alain-comment]: https://gist.github.com/dbuenzli/211e1fb4d8dfce0d22c6d6616260cdd9#gistcomment-2574875
100 | [how-many-urep]: https://hsivonen.fi/broken-utf-8/
101 | [tus]: http://www.unicode.org/versions/Unicode14.0.0/ch03.pdf
102 | [whatwg-encoding]: https://encoding.spec.whatwg.org/
103 | [dfa]: http://bjoern.hoehrmann.de/utf-8/decoder/dfa/#variations
104 | [hindi-wiki]: https://dumps.wikimedia.org/hiwiki/20210920/
105 | [hr-wiki]: https://dumps.wikimedia.org/hrwiki/20210920/
106 | [kuhn-utf-8]: https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-demo.txt
107 | [uutf-fold]: https://erratique.ch/software/uutf/doc/Uutf.String.html#1_Stringfolders
108 | [devanagari]: http://www.unicode.org/charts/PDF/U0900.pdf
109 | [latin-ext-a]: https://www.unicode.org/charts/PDF/U0100.pdf
110 |
--------------------------------------------------------------------------------
/data/urep-spec.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | Broken UTF-8
6 |
11 |
12 |
13 | Broken UTF-8
14 | Any copyright to this file is dedicated to the Public Domain. https://creativecommons.org/publicdomain/zero/1.0/
15 | Five-byte and six-byte sequences were defined in RFC 2297 but are no longer part of the UTF-8 definition.
16 |
Non-shortest forms for lowest single-byte (U+0000)
17 |
18 | - Two-byte sequence (C0 80)
19 | - ��
20 | - Three-byte sequence (E0 80 80)
21 | - ���
22 | - Four-byte sequence (F0 80 80 80)
23 | - ����
24 | - Five-byte sequence (F8 80 80 80 80)
25 | - �����
26 | - Six-byte sequence (FC 80 80 80 80 80)
27 | - ������
28 |
29 | Non-shortest forms for highest single-byte (U+007F)
30 |
31 | - Two-byte sequence (C1 BF)
32 | - ��
33 | - Three-byte sequence (E0 81 BF)
34 | - ���
35 | - Four-byte sequence (F0 80 81 BF)
36 | - ����
37 | - Five-byte sequence (F8 80 80 81 BF)
38 | - �����
39 | - Six-byte sequence (FC 80 80 80 81 BF)
40 | - ������
41 |
42 | Non-shortest forms for lowest two-byte (U+0080)
43 |
44 | - Three-byte sequence (E0 82 80)
45 | - ���
46 | - Four-byte sequence (F0 80 82 80)
47 | - ����
48 | - Five-byte sequence (F8 80 80 82 80)
49 | - �����
50 | - Six-byte sequence (FC 80 80 80 82 80)
51 | - ������
52 |
53 | Non-shortest forms for highest two-byte (U+07FF)
54 |
55 | - Three-byte sequence (E0 9F BF)
56 | - ���
57 | - Four-byte sequence (F0 80 9F BF)
58 | - ����
59 | - Five-byte sequence (F8 80 80 9F BF)
60 | - �����
61 | - Six-byte sequence (FC 80 80 80 9F BF)
62 | - ������
63 |
64 | Non-shortest forms for lowest three-byte (U+0800)
65 |
66 | - Four-byte sequence (F0 80 A0 80)
67 | - ����
68 | - Five-byte sequence (F8 80 80 A0 80)
69 | - �����
70 | - Six-byte sequence (FC 80 80 80 A0 80)
71 | - ������
72 |
73 | Non-shortest forms for highest three-byte (U+FFFF)
74 |
75 | - Four-byte sequence (F0 8F BF BF)
76 | - ����
77 | - Five-byte sequence (F8 80 8F BF BF)
78 | - �����
79 | - Six-byte sequence (FC 80 80 8F BF BF)
80 | - ������
81 |
82 | Non-shortest forms for lowest four-byte (U+10000)
83 |
84 | - Five-byte sequence (F8 80 90 80 80)
85 | - �����
86 | - Six-byte sequence (FC 80 80 90 80 80)
87 | - ������
88 |
89 | Non-shortest forms for last Unicode (U+10FFFF)
90 |
91 | - Five-byte sequence (F8 84 8F BF BF)
92 | - �����
93 | - Six-byte sequence (FC 80 84 8F BF BF)
94 | - ������
95 |
96 | Out of range
97 |
98 | - One past Unicode (F4 90 80 80)
99 | - ����
100 | - Longest five-byte sequence (FB BF BF BF BF)
101 | - �����
102 | - Longest six-byte sequence (FD BF BF BF BF BF)
103 | - ������
104 | - First surrogate (ED A0 80)
105 | - ���
106 | - Last surrogate (ED BF BF)
107 | - ���
108 | - CESU-8 surrogate pair (ED A0 BD ED B2 A9)
109 | - ������
110 |
111 | Out of range and non-shortest
112 |
113 | - One past Unicode as five-byte sequence (F8 84 90 80 80)
114 | - �����
115 | - One past Unicode as six-byte sequence (FC 80 84 90 80 80)
116 | - ������
117 | - First surrogate as four-byte sequence (F0 8D A0 80)
118 | - ����
119 | - Last surrogate as four-byte sequence (F0 8D BF BF)
120 | - ����
121 | - CESU-8 surrogate pair as two four-byte overlongs (F0 8D A0 BD F0 8D B2 A9)
122 | - ��������
123 |
124 | Lone trails
125 |
126 | - One (80)
127 | - �
128 | - Two (80 80)
129 | - ��
130 | - Three (80 80 80)
131 | - ���
132 | - Four (80 80 80 80)
133 | - ����
134 | - Five (80 80 80 80 80)
135 | - �����
136 | - Six (80 80 80 80 80 80)
137 | - ������
138 | - Seven (80 80 80 80 80 80 80)
139 | - �������
140 | - After valid two-byte (C2 B6 80)
141 | - ¶�
142 | - After valid three-byte (E2 98 83 80)
143 | - ☃�
144 | - After valid four-byte (F0 9F 92 A9 80)
145 | - 💩�
146 | - After five-byte (FB BF BF BF BF 80)
147 | - ������
148 | - After six-byte (FD BF BF BF BF BF 80)
149 | - �������
150 |
151 | Truncated sequences
152 |
153 | - Two-byte lead (C2)
154 | - �
155 | - Three-byte lead (E2)
156 | - �
157 | - Three-byte lead and one trail (E2 98)
158 | - �
159 | - Four-byte lead (F0)
160 | - �
161 | - Four-byte lead and one trail (F0 9F)
162 | - �
163 | - Four-byte lead and two trails (F0 9F 92)
164 | - �
165 |
166 | Leftovers
167 |
168 | - FE (FE)
169 | - �
170 | - FE and trail (FE 80)
171 | - ��
172 | - FF (FF)
173 | - �
174 | - FF and trail (FF 80)
175 | - ��
176 |
177 |
178 |
179 |
--------------------------------------------------------------------------------
/data/urep-test.html:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dbuenzli/stdlib-utf/3b9a96e7530830bd5ea87c3c92dc9e9bd02d4dee/data/urep-test.html
--------------------------------------------------------------------------------
/examples.ml:
--------------------------------------------------------------------------------
1 | (**************************************************************************)
2 | (* *)
3 | (* OCaml *)
4 | (* *)
5 | (* The OCaml programmers *)
6 | (* *)
7 | (* Copyright 2021 Institut National de Recherche en Informatique et *)
8 | (* en Automatique. *)
9 | (* *)
10 | (* All rights reserved. This file is distributed under the terms of *)
11 | (* the GNU Lesser General Public License version 2.1, with the *)
12 | (* special exception on linking described in the file LICENSE. *)
13 | (* *)
14 | (**************************************************************************)
15 |
16 | open Utf_x_adhoc
17 |
18 | (* Uutf style folders
19 | https://erratique.ch/software/uutf/doc/Uutf.String.html *)
20 |
21 | type utf_decode = [ `Malformed of string | `Uchar of Uchar.t ]
22 | type 'a folder = 'a -> int -> utf_decode -> 'a
23 |
24 | let fold_utf_8 : 'a folder -> 'a -> string -> 'a = fun f acc s ->
25 | let rec loop b i acc =
26 | if i >= Bytes.length b then acc else
27 | let dec = Bytes.get_utf_8_uchar b i in
28 | let n = Uchar.utf_decode_length dec in
29 | let i' = i + n in
30 | match Uchar.utf_decode_is_valid dec with
31 | | false -> loop b i' (f acc i (`Malformed (String.sub s i n)))
32 | | true -> loop b i' (f acc i (`Uchar (Uchar.utf_decode_uchar dec)))
33 | in
34 | loop (Bytes.unsafe_of_string s) 0 acc
35 |
36 | (* Seq based iterators.
37 |
38 | Replace malformations by {!Uchar.rep} using the WHATWG Encoding
39 | standard, see https://hsivonen.fi/broken-utf-8/. It's nice that no
40 | particular logic has to be added for decode errors. *)
41 |
42 | let utf_8_string_to_uchar_seq : string -> Uchar.t Seq.t = fun s ->
43 | let rec uchar b i () =
44 | if i >= Bytes.length b then Seq.Nil else
45 | let dec = Bytes.get_utf_8_uchar b i in
46 | let u = Uchar.utf_decode_uchar dec in
47 | Seq.Cons (u, uchar b (i + Uchar.utf_decode_length dec))
48 | in
49 | uchar (Bytes.unsafe_of_string s) 0
50 |
--------------------------------------------------------------------------------
/old-proposal.md:
--------------------------------------------------------------------------------
1 | # Adding UTF-X decoding support to the stdlib
2 |
3 | Since 4.06 we have UTF-X encoding support via the `Buffer` module. The
4 | following is a proposal to add UTF-X *decoding* (and `bytes` encoding)
5 | support with the following goals:
6 |
7 | 1. Provide a low-level, allocation-free codec API in `bytes`. This API
8 | provides all the information needed to implement loops for making
9 | higher-level UTF-X codec APIs (e.g. `Uutf`'s [folding](http://erratique.ch/software/uutf/doc/Uutf.String.html#1_Stringfolders) functions)
10 | operating on `bytes` and `string` values. It's not geared towards the end-user
11 | but rather towards library programmers.
12 |
13 | 2. Provide a less performant and less precise (no way to detect
14 | decoding errors; `Uchar.rep` replacement) decoding convenience API
15 | based on the `Uchar.t Seq.t` datatype for both `Bytes` and `String`.
16 |
17 | ## High-level `Seq` based API (`Bytes` and `String`)
18 |
19 | In both `Bytes` and `String` we add the following high-level interface
20 | (implemented via the low-level interface):
21 |
22 | ```ocaml
23 | val utf_8_to_uchar_seq : ?start:int -> ?len:int -> t -> Uchar.t Seq.t
24 | val utf_16le_to_uchar_seq : ?start:int -> ?len:int -> t -> Uchar.t Seq.t
25 | val utf_16be_to_uchar_seq : ?start:int -> ?len:int -> t -> Uchar.t Seq.t
26 | ```
27 |
28 | These are "best-effort" functions: any decoding error is substituted by
29 | a `Uchar.rep` character until a new synchronizing byte is found.
30 |
31 | The `utf_{8,16le,16be}_of_uchar_seq` functions would be a bit difficult to
32 | provide due to dependencies (if implemented via `Buffer`), so
33 | we leave them out.
34 |
35 | We could add `Buffer.add_utf_{8,16le,16,be}_uchar_seq` but that's just
36 | a `Seq.iter` away with the existing `Buffer.add_utf_{8,16le,16be}_uchar`.
37 | So we stop here w.r.t. high-level API.
38 |
39 | ## Low-level `Bytes` API
40 |
41 | We begin with the following additions to the `Uchar` module:
42 |
43 | ```ocaml
44 | val utf_8_byte_length : Uchar.t -> int
45 | val utf_16_byte_length : Uchar.t -> int
46 | ```
47 |
48 | In the `Bytes` module we then add the following:
49 |
50 | ```ocaml
51 | exception Utf_error of int
52 | (** The control flow exception raised by
53 | [{get,set}_uchar_utf_{8,16le,16be}] functions. The integer is an
54 | index where a new decode/encode can be tried or the length of the
55 | buffer if the end was reached. {b Important.} This exception is used
56 | for control flow you must not let it uncaught, it is raised with notrace. *)
57 |
58 | val get_uchar_utf_8_length : t -> int -> int
59 | (** [get_uchar_utf_8_length b i] is the byte length an UTF-8 encoded
60 | character at index [i] in [b]. This is [0] if the byte is not a valid
61 | UTF-8 starter byte. *)
62 |
63 | val get_uchar_utf_8 : t -> int -> Uchar.t
64 | (** [get_uchar_utf_8 b i] is the UTF-8 encoded Unicode character at index
65 | [i] in [b]. The number of bytes consumed by the decode can be
66 | determined by [get_uchar_utf_8_length b i]. In case of errors
67 | @raise Utf_error [n] with [n] the next index were a new decode
68 | can be tried or [length b] if there is no such index. *)
69 |
70 | val get_uchar_utf_16le : t -> int -> Uchar.t
71 | (** [get_uchar_utf_16le b i] is the UTF-16LE encoded Unicode character at
72 | index [i] in [b]. The number of bytes consumed by the decode can
73 | be determined by using {!Uchar.utf_16_byte_length} on the
74 | result. @raise Utf_error [n] with [n] the next index were a new
75 | decode can be tried or [length b] if there is no such index. *)
76 |
77 | val get_uchar_utf_16be : t -> int -> Uchar.t
78 | (** [get_uchar_utf_16be] is like {!get_uchar_utf16_le} but decodes
79 | UTF-16BE. *)
80 |
81 | val set_uchar_utf_8 : t -> int -> Uchar.t -> int
82 | (** [set_uchar_utf_8 b i u] sets the UTF-8 encoding of [u] in [b]
83 | starting at [i] and returns the next index were an encoding can
84 | be performed or the buffer length if there is no such
85 | index. @raise Utf_error [n] if there's not enough space for the
86 | encode. *)
87 |
88 | val set_utf_16le : t -> int -> Uchar.t -> int
89 | (** [set_utf_16le] is like {!set_uchar_utf_8} but UTF-16LE encodes. *)
90 |
91 | val set_utf_16be : t -> int -> Uchar.t -> int
92 | (** [set_utf_16be] is like {!set_uchar_utf_8} but UTF-16BE encodes. *)
93 | ```
94 |
95 | For codecing sequences of `Uchar.t` as loops:
96 |
97 | 1. The `set` functions are good.
98 | 2. The `get` functions incur a bit of overhead to compute the next index (the invoked function has the information but we can't return it without allocating). For UTF-8 either we either get the next index via `get_uchar_utf_8_length` (assuming a table-based implementation: two memory accesses) or with `Uchar.utf_8_byte_length` on the result (up to three branches). For UTF-16 we use `Uchar.utf_16_byte_length` (one branch).
99 |
--------------------------------------------------------------------------------
/perf8.ml:
--------------------------------------------------------------------------------
1 | (**************************************************************************)
2 | (* *)
3 | (* OCaml *)
4 | (* *)
5 | (* The OCaml programmers *)
6 | (* *)
7 | (* Copyright 2021 Institut National de Recherche en Informatique et *)
8 | (* en Automatique. *)
9 | (* *)
10 | (* All rights reserved. This file is distributed under the terms of *)
11 | (* the GNU Lesser General Public License version 2.1, with the *)
12 | (* special exception on linking described in the file LICENSE. *)
13 | (* *)
14 | (**************************************************************************)
15 |
16 |
17 | let log fmt = Printf.printf (fmt ^^ "\n%!")
18 |
19 | let impl_to_string = function
20 | | `Adhoc -> "ADHOC" | `Dfa -> "DFA" | `If -> "IF" | `Pat -> "PAT"
21 | | `Uutf -> "UUTF"
22 |
23 | (* Data preparation. We take any file read it and duplicate
24 | its content until we get at least a 1GB string. *)
25 |
26 | let string_of_file file =
27 | let ic = open_in_bin file in
28 | let len = in_channel_length ic in
29 | let buf = Bytes.create len in
30 | really_input ic buf 0 len; close_in ic; Bytes.unsafe_to_string buf
31 |
32 | let make_it_approx_1GB s =
33 | let one_gb = 1024 * 1024 * 1024 in
34 | let len = String.length s in
35 | if len >= 1024 * 1024 * 1024 then s else
36 | let count = one_gb / len in
37 | let rec rep acc n = if n <= 0 then acc else rep (s :: acc) (n - 1) in
38 | String.concat "" (rep [] count)
39 |
40 | let get_data file =
41 | log "Preparing data with %s" file;
42 | let s = make_it_approx_1GB (string_of_file file) in
43 | let size_gb = float (String.length s) /. 1024. /. 1024. /. 1024. in
44 | s, size_gb
45 |
46 | (* Best-effort decode *)
47 |
48 | let adhoc_best_effort_decode s =
49 | let rec loop b i =
50 | if i >= Utf_x_adhoc.Bytes.length b then () else
51 | let d = Utf_x_adhoc.Bytes.get_utf_8_uchar b i in
52 | let used = Utf_x_adhoc.Uchar.utf_decode_length d in
53 | let u = Utf_x_adhoc.Uchar.utf_decode_uchar d in
54 | Sys.opaque_identity @@ (ignore u);
55 | loop b (i + used)
56 | in
57 | loop (Bytes.unsafe_of_string s) 0
58 |
59 | let dfa_best_effort_decode s =
60 | let rec loop b i =
61 | if i >= Utf_x_dfa.Bytes.length b then () else
62 | let d = Utf_x_dfa.Bytes.get_utf_8_uchar b i in
63 | let used = Utf_x_dfa.Uchar.utf_decode_length d in
64 | let u = Utf_x_dfa.Uchar.utf_decode_uchar d in
65 | Sys.opaque_identity @@ (ignore u);
66 | loop b (i + used)
67 | in
68 | loop (Bytes.unsafe_of_string s) 0
69 |
70 | let if_best_effort_decode s =
71 | let rec loop b i =
72 | if i >= Utf_x_if.Bytes.length b then () else
73 | let d = Utf_x_if.Bytes.get_utf_8_uchar b i in
74 | let used = Utf_x_if.Uchar.utf_decode_length d in
75 | let u = Utf_x_if.Uchar.utf_decode_uchar d in
76 | Sys.opaque_identity @@ (ignore u);
77 | loop b (i + used)
78 | in
79 | loop (Bytes.unsafe_of_string s) 0
80 |
81 | let pat_best_effort_decode s =
82 | let rec loop b i =
83 | if i >= Utf_x_pat.Bytes.length b then () else
84 | let d = Utf_x_pat.Bytes.get_utf_8_uchar b i in
85 | let used = Utf_x_pat.Uchar.utf_decode_length d in
86 | let u = Utf_x_pat.Uchar.utf_decode_uchar d in
87 | Sys.opaque_identity @@ (ignore u);
88 | loop b (i + used)
89 | in
90 | loop (Bytes.unsafe_of_string s) 0
91 |
92 | let uutf_best_effort_decode s =
93 | let f () _ = function
94 | | `Uchar u -> Sys.opaque_identity @@ ignore u
95 | | `Malformed u -> Sys.opaque_identity @@ ignore Uutf.u_rep
96 | in
97 | Uutf.String.fold_utf_8 f () s
98 |
99 | let best_effort_decode impl file =
100 | let s, size_gb = get_data file in
101 | Gc.full_major ();
102 | log "Decoding %.2fGB of data with %s" size_gb (impl_to_string impl);
103 | match impl with
104 | | `Adhoc -> adhoc_best_effort_decode s
105 | | `Dfa -> dfa_best_effort_decode s
106 | | `If -> if_best_effort_decode s
107 | | `Pat -> pat_best_effort_decode s
108 | | `Uutf -> uutf_best_effort_decode s
109 |
110 | (* Recode *)
111 |
112 | let adhoc_recode s =
113 | let rec loop b i b' j =
114 | if i >= Utf_x_adhoc.Bytes.length b then (Bytes.unsafe_to_string b') else
115 | let d = Utf_x_adhoc.Bytes.get_utf_8_uchar b i in
116 | match Utf_x_adhoc.Uchar.utf_decode_is_valid d with
117 | | false -> raise Exit
118 | | true ->
119 | let used = Utf_x_adhoc.Uchar.utf_decode_length d in
120 | let u = Utf_x_adhoc.Uchar.utf_decode_uchar d in
121 | match Utf_x_adhoc.Bytes.set_utf_8_uchar b' j u with
122 | | 0 -> raise Exit
123 | | used' -> loop b (i + used) b' (j + used')
124 | in
125 | try
126 | let b' = Bytes.create (String.length s) in
127 | Some (loop (Bytes.unsafe_of_string s) 0 b' 0)
128 | with Exit -> None
129 |
130 | let dfa_recode s =
131 | let rec loop b i b' j =
132 | if i >= Utf_x_dfa.Bytes.length b then (Bytes.unsafe_to_string b') else
133 | let d = Utf_x_dfa.Bytes.get_utf_8_uchar b i in
134 | match Utf_x_dfa.Uchar.utf_decode_is_valid d with
135 | | false -> raise Exit
136 | | true ->
137 | let used = Utf_x_dfa.Uchar.utf_decode_length d in
138 | let u = Utf_x_dfa.Uchar.utf_decode_uchar d in
139 | match Utf_x_dfa.Bytes.set_utf_8_uchar b' j u with
140 | | 0 -> raise Exit
141 | | used' -> loop b (i + used) b' (j + used')
142 | in
143 | try
144 | let b' = Bytes.create (String.length s) in
145 | Some (loop (Bytes.unsafe_of_string s) 0 b' 0)
146 | with Exit -> None
147 |
148 | let if_recode s =
149 | let rec loop b i b' j =
150 | if i >= Utf_x_if.Bytes.length b then (Bytes.unsafe_to_string b') else
151 | let d = Utf_x_if.Bytes.get_utf_8_uchar b i in
152 | match Utf_x_if.Uchar.utf_decode_is_valid d with
153 | | false -> raise Exit
154 | | true ->
155 | let used = Utf_x_if.Uchar.utf_decode_length d in
156 | let u = Utf_x_if.Uchar.utf_decode_uchar d in
157 | match Utf_x_if.Bytes.set_utf_8_uchar b' j u with
158 | | 0 -> raise Exit
159 | | used' -> loop b (i + used) b' (j + used')
160 | in
161 | try
162 | let b' = Bytes.create (String.length s) in
163 | Some (loop (Bytes.unsafe_of_string s) 0 b' 0)
164 | with Exit -> None
165 |
166 | let pat_recode s =
167 | let rec loop b i b' j =
168 | if i >= Utf_x_pat.Bytes.length b then (Bytes.unsafe_to_string b') else
169 | let d = Utf_x_pat.Bytes.get_utf_8_uchar b i in
170 | match Utf_x_pat.Uchar.utf_decode_is_valid d with
171 | | false -> raise Exit
172 | | true ->
173 | let used = Utf_x_pat.Uchar.utf_decode_length d in
174 | let u = Utf_x_pat.Uchar.utf_decode_uchar d in
175 | match Utf_x_pat.Bytes.set_utf_8_uchar b' j u with
176 | | 0 -> raise Exit
177 | | used' -> loop b (i + used) b' (j + used')
178 | in
179 | try
180 | let b' = Bytes.create (String.length s) in
181 | Some (loop (Bytes.unsafe_of_string s) 0 b' 0)
182 | with Exit -> None
183 |
184 | let uutf_recode s =
185 | let f b _ = function
186 | | `Uchar u -> Buffer.add_utf_8_uchar b u; b
187 | | `Malformed u -> raise Exit
188 | in
189 | try
190 | let b = Buffer.create (String.length s) in
191 | let b = Uutf.String.fold_utf_8 f b s in
192 | Some (Buffer.contents b)
193 | with Exit -> None
194 |
195 | let recode impl file =
196 | let s, size_gb = get_data file in
197 | Gc.full_major ();
198 | log "Recoding %.2fGB of data with %s decode" size_gb (impl_to_string impl);
199 | let s' = match impl with
200 | | `Adhoc -> adhoc_recode s
201 | | `Dfa -> dfa_recode s
202 | | `If -> if_recode s
203 | | `Pat -> pat_recode s
204 | | `Uutf -> uutf_recode s
205 | in
206 | match s' with
207 | | None -> log "Recode failure, non valid input data ?"
208 | | Some s' when String.equal s s' -> log "Recode success!"
209 | | Some _ -> assert false
210 |
211 | (* Validate *)
212 |
213 | let adhoc_validate = Utf_x_adhoc.Bytes.is_valid_utf_8
214 | let dfa_validate = Utf_x_dfa.Bytes.is_valid_utf_8
215 | let if_validate = Utf_x_if.Bytes.is_valid_utf_8
216 | let pat_validate = Utf_x_pat.Bytes.is_valid_utf_8
217 | let uutf_validate s =
218 | let f () _ = function
219 | | `Uchar u -> Sys.opaque_identity @@ ignore u
220 | | `Malformed u -> raise Exit
221 | in
222 | try Uutf.String.fold_utf_8 f () s; true with
223 | | Exit -> false
224 |
225 | let validate impl file =
226 | let s, size_gb = get_data file in
227 | log "Validating %.2fGB of data with %s" size_gb (impl_to_string impl);
228 | let valid = match impl with
229 | | `Adhoc -> adhoc_validate (Bytes.unsafe_of_string s)
230 | | `Dfa -> dfa_validate (Bytes.unsafe_of_string s)
231 | | `If -> if_validate (Bytes.unsafe_of_string s)
232 | | `Pat -> pat_validate (Bytes.unsafe_of_string s)
233 | | `Uutf -> uutf_validate s
234 | in
235 | log "Valid: %b" valid
236 |
237 | (* Command line interface *)
238 |
239 | let do_cmd cmd impl file = match cmd with
240 | | `Decode -> best_effort_decode impl file
241 | | `Recode -> recode impl file
242 | | `Validate -> validate impl file
243 |
244 | let main () =
245 | let usage = "Usage: perf8 [--adhoc | --dfa | --if | --pat | --uutf] FILE" in
246 | let impl = ref `Adhoc in
247 | let cmd = ref `Decode in
248 | let args =
249 | [ "--adhoc", Arg.Unit (fun () -> impl := `Adhoc),
250 | "Test the adhoc implementation (default).";
251 | "--dfa", Arg.Unit (fun () -> impl := `Dfa),
252 | "Test the DFA implementation";
253 | "--if", Arg.Unit (fun () -> impl := `If),
254 | "Test the if branches implementation.";
255 | "--pat", Arg.Unit (fun () -> impl := `Pat),
256 | "Test the pattern implementation.";
257 | "--uutf", Arg.Unit (fun () -> impl := `Uutf),
258 | "Test the Uutf implementation.";
259 | "--decode", Arg.Unit (fun () -> cmd := `Decode),
260 | "Decode the UTF-8 data.";
261 | "--validate", Arg.Unit (fun () -> cmd := `Validate),
262 | "Validate the UTF-8 data.";
263 | "--recode", Arg.Unit (fun () -> cmd := `Recode),
264 | "Decode and recode the UTF-8 data."]
265 | in
266 | let file = ref None in
267 | let set_file s = file := Some s in
268 | Arg.parse args set_file usage;
269 | match !file with
270 | | None -> Printf.eprintf "perf8: missing FILE argument\n%s\n" usage; exit 1
271 | | Some file -> do_cmd !cmd !impl file
272 |
273 | let () = if !Sys.interactive then () else main ()
274 |
--------------------------------------------------------------------------------
/test.ml:
--------------------------------------------------------------------------------
1 | (**************************************************************************)
2 | (* *)
3 | (* OCaml *)
4 | (* *)
5 | (* The OCaml programmers *)
6 | (* *)
7 | (* Copyright 2021 Institut National de Recherche en Informatique et *)
8 | (* en Automatique. *)
9 | (* *)
10 | (* All rights reserved. This file is distributed under the terms of *)
11 | (* the GNU Lesser General Public License version 2.1, with the *)
12 | (* special exception on linking described in the file LICENSE. *)
13 | (* *)
14 | (**************************************************************************)
15 |
16 | (* To add to the compiler test suite. *)
17 |
18 | open Utf_x_pat
19 |
20 | let fold_uchars f acc =
21 | let rec loop f acc u =
22 | let acc = f acc u in
23 | if Uchar.equal u Uchar.max then acc else loop f acc (Uchar.succ u)
24 | in
25 | loop f acc Uchar.min
26 |
27 | (* This tests that we encode and decode each character according
28 | to its specification. *)
29 |
30 | let utf_8_spec =
31 | (* UTF-8 byte sequences, cf. table 3.7 Unicode 14. *)
32 | [(0x0000,0x007F), [|(0x00,0x7F)|];
33 | (0x0080,0x07FF), [|(0xC2,0xDF); (0x80,0xBF)|];
34 | (0x0800,0x0FFF), [|(0xE0,0xE0); (0xA0,0xBF); (0x80,0xBF)|];
35 | (0x1000,0xCFFF), [|(0xE1,0xEC); (0x80,0xBF); (0x80,0xBF)|];
36 | (0xD000,0xD7FF), [|(0xED,0xED); (0x80,0x9F); (0x80,0xBF)|];
37 | (0xE000,0xFFFF), [|(0xEE,0xEF); (0x80,0xBF); (0x80,0xBF)|];
38 | (0x10000,0x3FFFF), [|(0xF0,0xF0); (0x90,0xBF); (0x80,0xBF); (0x80,0xBF)|];
39 | (0x40000,0xFFFFF), [|(0xF1,0xF3); (0x80,0xBF); (0x80,0xBF); (0x80,0xBF)|];
40 | (0x100000,0x10FFFF), [|(0xF4,0xF4); (0x80,0x8F); (0x80,0xBF); (0x80,0xBF)|]]
41 |
42 | let utf_16be_spec =
43 | (* UTF-16BE byte sequences, derived from table 3.5 Unicode 14. *)
44 | [(0x0000,0xD7FF), [|(0x00,0xD7); (0x00,0xFF)|];
45 | (0xE000,0xFFFF), [|(0xE0,0xFF); (0x00,0xFF)|];
46 | (0x10000,0x10FFFF), [|(0xD8,0xDB); (0x00,0xFF); (0xDC,0xDF); (0x00,0xFF)|]]
47 |
48 | let uchar_map_of_spec spec =
49 | (* array mapping Uchar.t as ints to byte sequences according to [spec]. *)
50 | let map = Array.make ((Uchar.to_int Uchar.max) + 1) Bytes.empty in
51 | let add_range ((umin, umax), bytes) =
52 | let len = Array.length bytes in
53 | let bmin i = if i < len then fst bytes.(i) else max_int in
54 | let bmax i = if i < len then snd bytes.(i) else min_int in
55 | let uchar = ref umin in
56 | let buf = Bytes.create len in
57 | let add len' = match len = len' with
58 | | false -> ()
59 | | true -> map.(!uchar) <- Bytes.copy buf; incr uchar
60 | in
61 | for b0 = bmin 0 to bmax 0 do Bytes.set_uint8 buf 0 b0;
62 | for b1 = bmin 1 to bmax 1 do Bytes.set_uint8 buf 1 b1;
63 | for b2 = bmin 2 to bmax 2 do Bytes.set_uint8 buf 2 b2;
64 | for b3 = bmin 3 to bmax 3 do Bytes.set_uint8 buf 3 b3; add 4
65 | done; add 3;
66 | done; add 2;
67 | done; add 1;
68 | done; assert (!uchar - 1 = umax)
69 | in
70 | List.iter add_range spec;
71 | map
72 |
73 | let uchar_map_get u map = map.(Uchar.to_int u)
74 | let utf_8 = uchar_map_of_spec utf_8_spec
75 | let utf_16be = uchar_map_of_spec utf_16be_spec
76 | let utf_16le =
77 | let swap u b =
78 | let len = Bytes.length b in
79 | if len = 0 then () else
80 | for i = 0 to Bytes.length b / 2 - 1 do
81 | let j = i * 2 in
82 | Bytes.set_uint16_le b j (Bytes.get_uint16_be b j);
83 | done;
84 | in
85 | let map = Array.map Bytes.copy utf_16be in
86 | Array.iteri swap map; map
87 |
88 | let test_utf utf utf_len get_utf set_utf utf_is_valid =
89 | (* Test codec and validation of each Uchar.t against the spec. *)
90 | let f () u =
91 | let utf_len = utf_len u in
92 | let buf = Bytes.create utf_len in
93 | assert (set_utf buf 0 u = utf_len);
94 | assert (Bytes.equal buf (uchar_map_get u utf));
95 | assert (Bytes.equal buf (uchar_map_get u utf));
96 | let dec = get_utf buf 0 in
97 | assert (Uchar.utf_decode_is_valid dec);
98 | assert (Uchar.utf_decode_length dec = utf_len);
99 | assert (Uchar.equal (Uchar.utf_decode_uchar dec) u);
100 | assert (utf_is_valid buf);
101 | ()
102 | in
103 | fold_uchars f ()
104 |
105 | let () =
106 | test_utf utf_8 Uchar.utf_8_byte_length
107 | Bytes.get_utf_8_uchar Bytes.set_utf_8_uchar Bytes.is_valid_utf_8
108 |
109 | let () =
110 | test_utf utf_16be Uchar.utf_16_byte_length
111 | Bytes.get_utf_16be_uchar Bytes.set_utf_16be_uchar Bytes.is_valid_utf_16be
112 |
113 | let () =
114 | test_utf utf_16le Uchar.utf_16_byte_length
115 | Bytes.get_utf_16le_uchar Bytes.set_utf_16le_uchar Bytes.is_valid_utf_16le
116 |
117 | let () =
118 | (* Test out of bounds *)
119 | let raises f = assert (try f (); false with Invalid_argument _ -> true) in
120 | (raises @@ fun () -> Bytes.get_utf_8_uchar Bytes.empty 0);
121 | (raises @@ fun () -> Bytes.set_utf_8_uchar Bytes.empty 0 Uchar.min);
122 | (raises @@ fun () -> Bytes.get_utf_16le_uchar Bytes.empty 0);
123 | (raises @@ fun () -> Bytes.set_utf_16le_uchar Bytes.empty 0 Uchar.min);
124 | (raises @@ fun () -> Bytes.get_utf_16be_uchar Bytes.empty 0);
125 | (raises @@ fun () -> Bytes.set_utf_16be_uchar Bytes.empty 0 Uchar.min);
126 | ()
127 |
128 | let () =
129 | (* Test lack of space encodes *)
130 | let b = Bytes.make 1 '\xab' in
131 | assert (Bytes.set_utf_8_uchar b 0 Uchar.max = 0 && Bytes.get b 0 = '\xab');
132 | assert (Bytes.set_utf_16be_uchar b 0 Uchar.max = 0 && Bytes.get b 0 = '\xab');
133 | assert (Bytes.set_utf_16le_uchar b 0 Uchar.max = 0 && Bytes.get b 0 = '\xab');
134 | ()
135 |
136 | let () =
137 | (* Test used bytes and replacement according to WHATWG recommendation.
138 | This is just a recommandation.
139 | These examples are from TUS p. 126-127 Unicode 14 *)
140 | let b = Bytes.of_string "\xC0\xAF\xE0\x80\xBF\xF0\x81\x82\x41" in
141 | let ok i = i = Bytes.length b - 1 in
142 | for i = 0 to Bytes.length b - 1 do
143 | let dec = Bytes.get_utf_8_uchar b i in
144 | if not (ok i) then begin
145 | assert (Uchar.utf_decode_is_valid dec = false);
146 | assert (Uchar.utf_decode_length dec = 1);
147 | assert (Uchar.equal (Uchar.utf_decode_uchar dec) Uchar.rep)
148 | end else begin
149 | assert (Uchar.utf_decode_is_valid dec = true);
150 | assert (Uchar.utf_decode_length dec = 1);
151 | assert (Uchar.equal (Uchar.utf_decode_uchar dec) (Uchar.of_int 0x0041))
152 | end
153 | done;
154 | let b = Bytes.of_string "\xED\xA0\x80\xED\xBF\xBF\xED\xAF\x41" in
155 | let ok i = i = Bytes.length b - 1 in
156 | for i = 0 to Bytes.length b - 1 do
157 | let dec = Bytes.get_utf_8_uchar b i in
158 | if not (ok i) then begin
159 | assert (Uchar.utf_decode_is_valid dec = false);
160 | assert (Uchar.utf_decode_length dec = 1);
161 | assert (Uchar.equal (Uchar.utf_decode_uchar dec) Uchar.rep)
162 | end else begin
163 | assert (Uchar.utf_decode_is_valid dec = true);
164 | assert (Uchar.utf_decode_length dec = 1);
165 | assert (Uchar.equal (Uchar.utf_decode_uchar dec) (Uchar.of_int 0x0041))
166 | end
167 | done;
168 | let b = Bytes.of_string "\xF4\x91\x92\x93\xFF\x41\x80\xBF\x42" in
169 | let ok i = i = 5 || i = 8 in
170 | for i = 0 to Bytes.length b - 1 do
171 | let dec = Bytes.get_utf_8_uchar b i in
172 | if not (ok i) then begin
173 | assert (Uchar.utf_decode_is_valid dec = false);
174 | assert (Uchar.utf_decode_length dec = 1);
175 | assert (Uchar.equal (Uchar.utf_decode_uchar dec) Uchar.rep)
176 | end else begin
177 | assert (Uchar.utf_decode_is_valid dec = true);
178 | assert (Uchar.utf_decode_length dec = 1);
179 | assert (Uchar.equal (Uchar.utf_decode_uchar dec)
180 | (Uchar.of_char (Bytes.get b i)))
181 | end
182 | done;
183 | let b = Bytes.of_string "\xE1\x80\xE2\xF0\x91\x92\xF1\xBF\x41" in
184 | let d0 = Bytes.get_utf_8_uchar b 0 in
185 | assert (Uchar.utf_decode_is_valid d0 = false);
186 | assert (Uchar.utf_decode_length d0 = 2);
187 | assert (Uchar.equal (Uchar.utf_decode_uchar d0) Uchar.rep);
188 | let d2 = Bytes.get_utf_8_uchar b 2 in
189 | assert (Uchar.utf_decode_is_valid d2 = false);
190 | assert (Uchar.utf_decode_length d2 = 1);
191 | assert (Uchar.equal (Uchar.utf_decode_uchar d2) Uchar.rep);
192 | let d3 = Bytes.get_utf_8_uchar b 3 in
193 | assert (Uchar.utf_decode_is_valid d3 = false);
194 | assert (Uchar.utf_decode_length d3 = 3);
195 | assert (Uchar.equal (Uchar.utf_decode_uchar d3) Uchar.rep);
196 | let d6 = Bytes.get_utf_8_uchar b 6 in
197 | assert (Uchar.utf_decode_is_valid d6 = false);
198 | assert (Uchar.utf_decode_length d6 = 2);
199 | assert (Uchar.equal (Uchar.utf_decode_uchar d6) Uchar.rep);
200 | let d8 = Bytes.get_utf_8_uchar b 8 in
201 | assert (Uchar.utf_decode_length d8 = 1);
202 | assert (Uchar.equal (Uchar.utf_decode_uchar d8) (Uchar.of_int 0x0041));
203 | ()
204 |
205 | let () = Printf.printf "All tests passed!\n"
206 |
--------------------------------------------------------------------------------
/test_long.ml:
--------------------------------------------------------------------------------
1 | (**************************************************************************)
2 | (* *)
3 | (* OCaml *)
4 | (* *)
5 | (* The OCaml programmers *)
6 | (* *)
7 | (* Copyright 2021 Institut National de Recherche en Informatique et *)
8 | (* en Automatique. *)
9 | (* *)
10 | (* All rights reserved. This file is distributed under the terms of *)
11 | (* the GNU Lesser General Public License version 2.1, with the *)
12 | (* special exception on linking described in the file LICENSE. *)
13 | (* *)
14 | (**************************************************************************)
15 |
16 | open Utf_x_pat
17 |
18 | (* Assumes the good encoding and decodes have been checked by test.ml
19 | Exhaustively tests all 1-4 bytes invalid sequences for decodes. This
20 | ensures we do not decode invalid sequence to uchars. *)
21 |
22 | let fold_uchars f acc =
23 | let rec loop f acc u =
24 | let acc = f acc u in
25 | if Uchar.equal u Uchar.max then acc else loop f acc (Uchar.succ u)
26 | in
27 | loop f acc Uchar.min
28 |
29 | module Sset = Set.Make (String)
30 |
31 | let utf_8_encs, utf_16be_encs, utf_16le_encs =
32 | Printf.printf "Building encoding sequence sets\n%!";
33 | let add (set8, set16be, set16le) u =
34 | let s = Bytes.unsafe_to_string in
35 | let e8 = Bytes.create (Uchar.utf_8_byte_length u) in
36 | let e16be = Bytes.create (Uchar.utf_16_byte_length u) in
37 | let e16le = Bytes.create (Uchar.utf_16_byte_length u) in
38 | ignore (Bytes.set_utf_8_uchar e8 0 u);
39 | ignore (Bytes.set_utf_16be_uchar e16be 0 u);
40 | ignore (Bytes.set_utf_16le_uchar e16le 0 u);
41 | Sset.add (s e8) set8, Sset.add (s e16be) set16be, Sset.add (s e16le) set16le
42 | in
43 | fold_uchars add (Sset.empty, Sset.empty, Sset.empty)
44 |
45 | let test_seqs utf utf_encs get_utf_char is_valid_utf =
46 | let test seq =
47 | let dec = get_utf_char seq 0 in
48 | let valid = Uchar.utf_decode_is_valid dec in
49 | let valid_seq = is_valid_utf seq in
50 | let is_enc = Sset.mem (Bytes.unsafe_to_string seq) utf_encs in
51 | if not ((valid && is_enc) || (not valid && not is_enc)) ||
52 | not ((valid_seq && is_enc) || (not valid_seq && not is_enc))
53 | then begin
54 | for i = 0 to Bytes.length seq - 1 do
55 | Printf.printf "%02X " (Bytes.get_uint8 seq i);
56 | done;
57 | Printf.printf "valid: %b is_encoding: %b decode: U+%04X is_valid:%b\n"
58 | valid is_enc (Uchar.to_int (Uchar.utf_decode_uchar dec)) valid_seq;
59 | assert false
60 | end;
61 | valid
62 | in
63 | let[@inline] set buf i b = Bytes.unsafe_set buf i (Char.unsafe_chr b) in
64 | let s1 = Bytes.create 1 and s2 = Bytes.create 2
65 | and s3 = Bytes.create 3 and s4 = Bytes.create 4 in
66 | Printf.printf "Testing %s invalid decodes…\n%!" utf;
67 | for b0 = 0x00 to 0xFF do
68 | set s1 0 b0;
69 | if test s1 then ((* this prefix decoded, stop here *)) else begin
70 | set s2 0 b0;
71 | for b1 = 0x00 to 0xFF do
72 | set s2 1 b1;
73 | if test s2 then ((* this prefix decoded, stop here *)) else begin
74 | set s3 0 b0;
75 | set s3 1 b1;
76 | for b2 = 0x00 to 0xFF do
77 | set s3 2 b2;
78 | if test s3 then ((* this prefix decoded, stop here *)) else begin
79 | set s4 0 b0;
80 | set s4 1 b1;
81 | set s4 2 b2;
82 | for b3 = 0x00 to 0xFF do set s4 3 b3; ignore (test s4) done;
83 | end
84 | done;
85 | end
86 | done;
87 | end
88 | done
89 |
90 | let () =
91 | test_seqs "UTF-8" utf_8_encs Bytes.get_utf_8_uchar Bytes.is_valid_utf_8;
92 | test_seqs "UTF-16BE"
93 | utf_16be_encs Bytes.get_utf_16be_uchar Bytes.is_valid_utf_16be;
94 | test_seqs "UTF-16LE"
95 | utf_16le_encs Bytes.get_utf_16le_uchar Bytes.is_valid_utf_16le;
96 | ()
97 |
98 | let () = Printf.printf "All tests passed!\n"
99 |
--------------------------------------------------------------------------------
/trip8.ml:
--------------------------------------------------------------------------------
1 | (**************************************************************************)
2 | (* *)
3 | (* OCaml *)
4 | (* *)
5 | (* The OCaml programmers *)
6 | (* *)
7 | (* Copyright 2021 Institut National de Recherche en Informatique et *)
8 | (* en Automatique. *)
9 | (* *)
10 | (* All rights reserved. This file is distributed under the terms of *)
11 | (* the GNU Lesser General Public License version 2.1, with the *)
12 | (* special exception on linking described in the file LICENSE. *)
13 | (* *)
14 | (**************************************************************************)
15 |
16 | let log fmt = Printf.printf (fmt ^^ "\n%!")
17 |
18 | let string_of_file file =
19 | let ic = open_in_bin file in
20 | let len = in_channel_length ic in
21 | let buf = Bytes.create len in
22 | really_input ic buf 0 len; close_in ic; Bytes.unsafe_to_string buf
23 |
24 | let adhoc_trip s b =
25 | let rec loop b i buf =
26 | if i >= Utf_x_adhoc.Bytes.length b then () else
27 | let d = Utf_x_adhoc.Bytes.get_utf_8_uchar b i in
28 | let used = Utf_x_adhoc.Uchar.utf_decode_length d in
29 | Buffer.add_utf_8_uchar buf (Utf_x_adhoc.Uchar.utf_decode_uchar d);
30 | loop b (i + used) buf
31 | in
32 | loop (Bytes.unsafe_of_string s) 0 b
33 |
34 | let dfa_trip s b =
35 | let rec loop b i buf =
36 | if i >= Utf_x_dfa.Bytes.length b then () else
37 | let d = Utf_x_dfa.Bytes.get_utf_8_uchar b i in
38 | let used = Utf_x_dfa.Uchar.utf_decode_length d in
39 | Buffer.add_utf_8_uchar buf (Utf_x_dfa.Uchar.utf_decode_uchar d);
40 | loop b (i + used) buf
41 | in
42 | loop (Bytes.unsafe_of_string s) 0 b
43 |
44 | let if_trip s b =
45 | let rec loop b i buf =
46 | if i >= Utf_x_if.Bytes.length b then () else
47 | let d = Utf_x_if.Bytes.get_utf_8_uchar b i in
48 | let used = Utf_x_if.Uchar.utf_decode_length d in
49 | Buffer.add_utf_8_uchar buf (Utf_x_if.Uchar.utf_decode_uchar d);
50 | loop b (i + used) buf
51 | in
52 | loop (Bytes.unsafe_of_string s) 0 b
53 |
54 | let pat_trip s b =
55 | let rec loop b i buf =
56 | if i >= Utf_x_pat.Bytes.length b then () else
57 | let d = Utf_x_pat.Bytes.get_utf_8_uchar b i in
58 | let used = Utf_x_pat.Uchar.utf_decode_length d in
59 | Buffer.add_utf_8_uchar buf (Utf_x_pat.Uchar.utf_decode_uchar d);
60 | loop b (i + used) buf
61 | in
62 | loop (Bytes.unsafe_of_string s) 0 b
63 |
64 | let uutf_trip s b =
65 | let f b _ = function
66 | | `Uchar u -> Buffer.add_utf_8_uchar b u; b
67 | | `Malformed u -> Buffer.add_utf_8_uchar b Uchar.rep; b
68 | in
69 | ignore (Uutf.String.fold_utf_8 f b s)
70 |
71 | let trip impl file =
72 | let s = string_of_file file in
73 | let b = Buffer.create (String.length s) in
74 | let () = match impl with
75 | | `Adhoc -> adhoc_trip s b
76 | | `Dfa -> dfa_trip s b
77 | | `If -> if_trip s b
78 | | `Pat -> pat_trip s b
79 | | `Uutf -> uutf_trip s b
80 | in
81 | print_string (Buffer.contents b)
82 |
83 | (* Command line interface *)
84 |
85 | let main () =
86 | let usage = "Usage: trip8 [--adhoc | --dfa | --if | --pat | --uutf] FILE" in
87 | let impl = ref `Adhoc in
88 | let args =
89 | [ "--adhoc", Arg.Unit (fun () -> impl := `Adhoc),
90 | "Use the adhoc implementation (default).";
91 | "--dfa", Arg.Unit (fun () -> impl := `Dfa),
92 | "Use the DFA implementation";
93 | "--if", Arg.Unit (fun () -> impl := `If),
94 | "Use the if branches implementation.";
95 | "--pat", Arg.Unit (fun () -> impl := `Pat),
96 | "Use the pattern implementation.";
97 | "--uutf", Arg.Unit (fun () -> impl := `Uutf),
98 | "Use the Uutf implementation." ]
99 | in
100 | let file = ref None in
101 | let set_file s = file := Some s in
102 | Arg.parse args set_file usage;
103 | match !file with
104 | | None -> Printf.eprintf "trip8: missing FILE argument\n%s\n" usage; exit 1
105 | | Some file -> trip !impl file
106 |
107 | let () = if !Sys.interactive then () else main ()
108 |
--------------------------------------------------------------------------------
/utf_16.ml:
--------------------------------------------------------------------------------
1 | (**************************************************************************)
2 | (* *)
3 | (* OCaml *)
4 | (* *)
5 | (* The OCaml programmers *)
6 | (* *)
7 | (* Copyright 2021 Institut National de Recherche en Informatique et *)
8 | (* en Automatique. *)
9 | (* *)
10 | (* All rights reserved. This file is distributed under the terms of *)
11 | (* the GNU Lesser General Public License version 2.1, with the *)
12 | (* special exception on linking described in the file LICENSE. *)
13 | (* *)
14 | (**************************************************************************)
15 |
16 | (* This won't be needed *)
17 |
18 | let dec_invalid = Utf_uchar.utf_decode_invalid
19 | let[@inline] dec_ret n u = Utf_uchar.utf_decode n (Uchar.unsafe_of_int u)
20 | external length : bytes -> int = "%bytes_length"
21 | type t = Bytes.t
22 | external swap16 : int -> int = "%bswap16" (* That one will be in Bytes *)
23 |
24 | (* That will be needed *)
25 |
26 | external unsafe_get_uint16_ne : t -> int -> int = "%caml_bytes_get16u"
27 | external unsafe_set_uint16_ne : t -> int -> int -> unit = "%caml_bytes_set16u"
28 |
29 | let unsafe_get_uint16_le b i =
30 | if Sys.big_endian
31 | then swap16 (unsafe_get_uint16_ne b i)
32 | else unsafe_get_uint16_ne b i
33 |
34 | let unsafe_get_uint16_be b i =
35 | if Sys.big_endian
36 | then unsafe_get_uint16_ne b i
37 | else swap16 (unsafe_get_uint16_ne b i)
38 |
39 | let unsafe_set_uint16_le b i x =
40 | if Sys.big_endian
41 | then unsafe_set_uint16_ne b i (swap16 x)
42 | else unsafe_set_uint16_ne b i x
43 |
44 | let unsafe_set_uint16_be b i x =
45 | if Sys.big_endian
46 | then unsafe_set_uint16_ne b i x
47 | else unsafe_set_uint16_ne b i (swap16 x)
48 |
49 | (* UTF-16BE *)
50 |
51 | let get_utf_16be_uchar b i =
52 | let get = unsafe_get_uint16_be in
53 | let max = length b - 1 in
54 | if i < 0 || i > max then invalid_arg "index out of bounds" else
55 | if i = max then dec_invalid 1 else
56 | match get b i with
57 | | u when u < 0xD800 || u > 0xDFFF -> dec_ret 2 u
58 | | u when u > 0xDBFF -> dec_invalid 2
59 | | hi -> (* combine [hi] with a low surrogate *)
60 | let last = i + 3 in
61 | if last > max then dec_invalid (max - i + 1) else
62 | match get b (i + 2) with
63 | | u when u < 0xDC00 || u > 0xDFFF -> dec_invalid 2 (* retry here *)
64 | | lo ->
65 | let u = (((hi land 0x3FF) lsl 10) lor (lo land 0x3FF)) + 0x10000 in
66 | dec_ret 4 u
67 |
68 | let set_utf_16be_uchar b i u =
69 | let set = unsafe_set_uint16_be in
70 | let max = length b - 1 in
71 | if i < 0 || i > max then invalid_arg "index out of bounds" else
72 | match Uchar.to_int u with
73 | | u when u < 0 -> assert false
74 | | u when u <= 0xFFFF ->
75 | let last = i + 1 in
76 | if last > max then 0 else (set b i u; 2)
77 | | u ->
78 | let last = i + 3 in
79 | if last > max then 0 else
80 | let u' = u - 0x10000 in
81 | let hi = (0xD800 lor (u' lsr 10)) in
82 | let lo = (0xDC00 lor (u' land 0x3FF)) in
83 | set b i hi; set b (i + 2) lo; 4
84 |
85 | let is_valid_utf_16be b =
86 | let rec loop max b i =
87 | let get = unsafe_get_uint16_be in
88 | if i > max then true else
89 | if i = max then false else
90 | match get b i with
91 | | u when u < 0xD800 || u > 0xDFFF -> loop max b (i + 2)
92 | | u when u > 0xDBFF -> false
93 | | hi ->
94 | let last = i + 3 in
95 | if last > max then false else
96 | match get b (i + 2) with
97 | | u when u < 0xDC00 || u > 0xDFFF -> false
98 | | lo -> loop max b (i + 4)
99 | in
100 | loop (length b - 1) b 0
101 |
102 | (* UTF-16LE *)
103 |
104 | let get_utf_16le_uchar b i =
105 | let get = unsafe_get_uint16_le in
106 | let max = length b - 1 in
107 | if i < 0 || i > max then invalid_arg "index out of bounds" else
108 | if i = max then dec_invalid 1 else
109 | match get b i with
110 | | u when u < 0xD800 || u > 0xDFFF -> dec_ret 2 u
111 | | u when u > 0xDBFF -> dec_invalid 2
112 | | hi -> (* combine [hi] with a low surrogate *)
113 | let last = i + 3 in
114 | if last > max then dec_invalid (max - i + 1) else
115 | match get b (i + 2) with
116 | | u when u < 0xDC00 || u > 0xDFFF -> dec_invalid 2 (* retry here *)
117 | | lo ->
118 | let u = (((hi land 0x3FF) lsl 10) lor (lo land 0x3FF)) + 0x10000 in
119 | dec_ret 4 u
120 |
121 | let set_utf_16le_uchar b i u =
122 | let set = unsafe_set_uint16_le in
123 | let max = length b - 1 in
124 | if i < 0 || i > max then invalid_arg "index out of bounds" else
125 | match Uchar.to_int u with
126 | | u when u < 0 -> assert false
127 | | u when u <= 0xFFFF ->
128 | let last = i + 1 in
129 | if last > max then 0 else (set b i u; 2)
130 | | u ->
131 | let last = i + 3 in
132 | if last > max then 0 else
133 | let u' = u - 0x10000 in
134 | let hi = (0xD800 lor (u' lsr 10)) in
135 | let lo = (0xDC00 lor (u' land 0x3FF)) in
136 | set b i hi; set b (i + 2) lo; 4
137 |
138 | let is_valid_utf_16le b =
139 | let rec loop max b i =
140 | let get = unsafe_get_uint16_le in
141 | if i > max then true else
142 | if i = max then false else
143 | match get b i with
144 | | u when u < 0xD800 || u > 0xDFFF -> loop max b (i + 2)
145 | | u when u > 0xDBFF -> false
146 | | hi ->
147 | let last = i + 3 in
148 | if last > max then false else
149 | match get b (i + 2) with
150 | | u when u < 0xDC00 || u > 0xDFFF -> false
151 | | lo -> loop max b (i + 4)
152 | in
153 | loop (length b - 1) b 0
154 |
--------------------------------------------------------------------------------
/utf_uchar.ml:
--------------------------------------------------------------------------------
1 | (**************************************************************************)
2 | (* *)
3 | (* OCaml *)
4 | (* *)
5 | (* The OCaml programmers *)
6 | (* *)
7 | (* Copyright 2021 Institut National de Recherche en Informatique et *)
8 | (* en Automatique. *)
9 | (* *)
10 | (* All rights reserved. This file is distributed under the terms of *)
11 | (* the GNU Lesser General Public License version 2.1, with the *)
12 | (* special exception on linking described in the file LICENSE. *)
13 | (* *)
14 | (**************************************************************************)
15 |
16 | include Uchar
17 |
18 | (* UTF codecs tools *)
19 |
20 | type utf_decode = int
21 | (* This is an int [0xDUUUUUU] decomposed as follows:
22 | - [D] is four bits for decode information, the highest bit is set if the
23 | decode is valid. The three lower bits indicate the number of elements
24 | from the source that were consumed by the decode.
25 | - [UUUUUU] is the decoded Unicode character or the Unicode replacement
26 | character U+FFFD if for invalid decodes. *)
27 |
28 | let valid_bit = 27
29 | let decode_bits = 24
30 |
31 | let[@inline] utf_decode_is_valid d = (d lsr valid_bit) = 1
32 | let[@inline] utf_decode_length d = (d lsr decode_bits) land 0b111
33 | let[@inline] utf_decode_uchar d = Uchar.unsafe_of_int (d land 0xFFFFFF)
34 | let[@inline] utf_decode n u = ((8 lor n) lsl decode_bits) lor (Uchar.to_int u)
35 | let[@inline] utf_decode_invalid n = (n lsl decode_bits) lor 0xFFFD (* rep *)
36 |
37 | let utf_8_byte_length u = match to_int u with
38 | | u when u < 0 -> assert false
39 | | u when u <= 0x007F -> 1
40 | | u when u <= 0x07FF -> 2
41 | | u when u <= 0xFFFF -> 3
42 | | u when u <= 0x10FFFF -> 4
43 | | _ -> assert false
44 |
45 | let utf_16_byte_length u = match to_int u with
46 | | u when u < 0 -> assert false
47 | | u when u <= 0xFFFF -> 2
48 | | u when u <= 0x10FFFF -> 4
49 | | _ -> assert false
50 |
--------------------------------------------------------------------------------
/utf_uchar.mli:
--------------------------------------------------------------------------------
1 | (**************************************************************************)
2 | (* *)
3 | (* OCaml *)
4 | (* *)
5 | (* The OCaml programmers *)
6 | (* *)
7 | (* Copyright 2021 Institut National de Recherche en Informatique et *)
8 | (* en Automatique. *)
9 | (* *)
10 | (* All rights reserved. This file is distributed under the terms of *)
11 | (* the GNU Lesser General Public License version 2.1, with the *)
12 | (* special exception on linking described in the file LICENSE. *)
13 | (* *)
14 | (**************************************************************************)
15 |
16 | include Utf_x.Uchar
17 |
--------------------------------------------------------------------------------
/utf_x.mli:
--------------------------------------------------------------------------------
1 | (**************************************************************************)
2 | (* *)
3 | (* OCaml *)
4 | (* *)
5 | (* The OCaml programmers *)
6 | (* *)
7 | (* Copyright 2021 Institut National de Recherche en Informatique et *)
8 | (* en Automatique. *)
9 | (* *)
10 | (* All rights reserved. This file is distributed under the terms of *)
11 | (* the GNU Lesser General Public License version 2.1, with the *)
12 | (* special exception on linking described in the file LICENSE. *)
13 | (* *)
14 | (**************************************************************************)
15 |
16 | (* Uchar additions *)
17 |
18 | module type Uchar = sig
19 | include module type of Uchar
20 |
21 | (** {1:utf_codec UTF codecs tools}
22 |
23 | @since 4.14 *)
24 |
25 | type utf_decode [@@immediate]
26 | (** The type for UTF decode results. Values of this type represent
27 | the result of a Unicode Transformation Format decoding attempt. *)
28 |
29 | val utf_decode_is_valid : utf_decode -> bool
30 | (** [utf_decode_is_valid d] is [true] if and only if [d] holds a valid
31 | decode. *)
32 |
33 | val utf_decode_uchar : utf_decode -> Uchar.t
34 | (** [utf_x_decode_uchar d] is the Unicode character decoded by [d] if
35 | [utf_x_decode_is_valid d] is [true] and {!Uchar.rep} otherwise. *)
36 |
37 | val utf_decode_length : utf_decode -> int
38 | (** [utf_decode_length d] is the number of elements from the source
39 | that were consumed by the decode [d]. This is always strictly
40 | positive and smaller or equal to [4]. The kind of source elements
41 | depends on the actual decoder; for the decoders of the standard
42 | library this function always returns a length in bytes. *)
43 |
44 | val utf_decode : int -> Uchar.t -> utf_decode
45 | (** [utf_decode n u] is a valid UTF decode for [u] that consumed [n]
46 | elements from the source for decoding. [n] must be positive and
47 | smaller or equal to [4] (this is not checked by the module). *)
48 |
49 | val utf_decode_invalid : int -> utf_decode
50 | (** [utf_decode_invalid n] is an invalid UTF decode that consumed [n]
51 | elements from the source to error. [n] must be positive and
52 | smaller or equal to [4] (this is not checked by the module). The
53 | resulting decode has {!rep} as the decoded Unicode character. *)
54 |
55 | val utf_8_byte_length : t -> int
56 | (** [utf_8_byte_length u] is the number of bytes needed to encode
57 | [u] in UTF-8. *)
58 |
59 | val utf_16_byte_length : t -> int
60 | (** [utf_16_byte_length u] is the number of bytes needed to encode
61 | [u] in UTF-16. *)
62 | end
63 |
64 | (* Bytes additions *)
65 |
66 | module type S = sig
67 | module Uchar : Uchar
68 | module Bytes : sig
69 | include module type of Bytes
70 |
71 | (** {1:utf_x_codecs UTF-X codecs and validations}
72 |
73 | @since 4.14 *)
74 |
75 | (** {2:utf_8 UTF-8} *)
76 |
77 | val get_utf_8_uchar : t -> int -> Uchar.utf_decode
78 | (** [get_utf_8_uchar b i] decodes an UTF-8 character at index [i] in
79 | [b]. *)
80 |
81 | val set_utf_8_uchar : t -> int -> Uchar.t -> int
82 | (** [set_utf_8_uchar b i u] UTF-8 encodes [u] at index [i] in [b]
83 | and returns the number of bytes [n] that were written starting
84 | at [i]. If [n] is [0] there was not enough space to encode [u]
85 | at [i] and [b] was left untouched. Otherwise a new character can
86 | be encoded at [i + n]. *)
87 |
88 | val is_valid_utf_8 : t -> bool
89 | (** [is_valid_utf_8 b] is [true] if and only if [b] contains valid
90 | UTF-8 data. *)
91 |
92 | (** {2:utf_16be UTF-16BE} *)
93 |
94 | val get_utf_16be_uchar : t -> int -> Uchar.utf_decode
95 | (** [get_utf_16be_uchar b i] decodes an UTF-16BE character at index
96 | [i] in [b]. *)
97 |
98 | val set_utf_16be_uchar : t -> int -> Uchar.t -> int
99 | (** [set_utf_16be_uchar b i u] UTF-16BE encodes [u] at index [i] in [b]
100 | and returns the number of bytes [n] that were written starting
101 | at [i]. If [n] is [0] there was not enough space to encode [u]
102 | at [i] and [b] was left untouched. Otherwise a new character can
103 | be encoded at [i + n]. *)
104 |
105 | val is_valid_utf_16be : t -> bool
106 | (** [is_valid_utf_16be b] is [true] if and only if [b] contains valid
107 | UTF-16BE data. *)
108 |
109 | (** {2:utf_16le UTF-16LE} *)
110 |
111 | val get_utf_16le_uchar : t -> int -> Uchar.utf_decode
112 | (** [get_utf_16le_uchar b i] decodes an UTF-16LE character at index
113 | [i] in [b]. *)
114 |
115 | val set_utf_16le_uchar : t -> int -> Uchar.t -> int
116 | (** [set_utf_16le_uchar b i u] UTF-16LE encodes [u] at index [i] in [b]
117 | and returns the number of bytes [n] that were written starting
118 | at [i]. If [n] is [0] there was not enough space to encode [u]
119 | at [i] and [b] was left untouched. Otherwise a new character can
120 | be encoded at [i + n]. *)
121 |
122 | val is_valid_utf_16le : t -> bool
123 | (** [is_valid_utf_16le b] is [true] if and only if [b] contains valid
124 | UTF-16LE data. *)
125 | end
126 | end
127 |
--------------------------------------------------------------------------------
/utf_x_adhoc.ml:
--------------------------------------------------------------------------------
1 | (**************************************************************************)
2 | (* *)
3 | (* OCaml *)
4 | (* *)
5 | (* The OCaml programmers *)
6 | (* *)
7 | (* Copyright 2021 Institut National de Recherche en Informatique et *)
8 | (* en Automatique. *)
9 | (* *)
10 | (* All rights reserved. This file is distributed under the terms of *)
11 | (* the GNU Lesser General Public License version 2.1, with the *)
12 | (* special exception on linking described in the file LICENSE. *)
13 | (* *)
14 | (**************************************************************************)
15 |
16 | module Uchar = Utf_uchar
17 | module Bytes = struct
18 | include Bytes
19 |
20 | external swap16 : int -> int = "%bswap16" (* That one will be in Bytes *)
21 |
22 | (* Unsafe internal additions *)
23 |
24 | external unsafe_set_uint8 : t -> int -> int -> unit = "%bytes_unsafe_set"
25 | external unsafe_get_uint8 : t -> int -> int = "%bytes_unsafe_get"
26 |
27 | external unsafe_get_uint16_ne : t -> int -> int = "%caml_bytes_get16u"
28 | external unsafe_set_uint16_ne : t -> int -> int -> unit = "%caml_bytes_set16u"
29 |
30 | let unsafe_get_uint16_le b i =
31 | if Sys.big_endian
32 | then swap16 (unsafe_get_uint16_ne b i)
33 | else unsafe_get_uint16_ne b i
34 |
35 | let unsafe_get_uint16_be b i =
36 | if Sys.big_endian
37 | then unsafe_get_uint16_ne b i
38 | else swap16 (unsafe_get_uint16_ne b i)
39 |
40 | let unsafe_set_uint16_le b i x =
41 | if Sys.big_endian
42 | then unsafe_set_uint16_ne b i (swap16 x)
43 | else unsafe_set_uint16_ne b i x
44 |
45 | let unsafe_set_uint16_be b i x =
46 | if Sys.big_endian
47 | then unsafe_set_uint16_ne b i x
48 | else unsafe_set_uint16_ne b i (swap16 x)
49 |
50 | (* UTF-X codecs and validations *)
51 |
52 | let dec_invalid = Uchar.utf_decode_invalid
53 | let[@inline] dec_ret n u = Uchar.utf_decode n (Uchar.unsafe_of_int u)
54 |
55 | (* In case of decoding error, if we error on the first byte, we
56 | consume the byte, otherwise we consume the [n] bytes preceeding
57 | erroring byte.
58 |
59 | This means that if a client uses decodes without caring about
60 | validity it naturally replace bogus data with Uchar.rep according
61 | to the WHATWG Encoding standard. Other schemes are possible by
62 | consulting the number of used bytes on invalid decodes. For more
63 | details see https://hsivonen.fi/broken-utf-8/
64 |
65 | For this reason in [get_utf_8_uchar] we gradually check the next
66 | byte is available rather than doing it immediately after the
67 | first byte. Contrast with [is_valid_utf_8]. *)
68 |
69 | (* UTF-8 *)
70 |
71 | let utf_8_decode_case =
72 | (* Case according to first byte. This corresponds to the the lines
73 | of The Unicode Standard Table 3.7 with lines E1..EC and EE..EF
74 | sharing the same case and unmentioned bytes mapped to 8.
75 |
76 | let dump_table () =
77 | let print_case c = Printf.printf "\\%03d" c in
78 | let cut i = if i mod 16 = 0 then (print_char '\\'; print_newline ()) in
79 | for i = 0x00 to 0x7F do (cut i; print_case 0) done;
80 | for i = 0x80 to 0xC1 do (cut i; print_case 8) done;
81 | for i = 0xC2 to 0xDF do (cut i; print_case 1) done;
82 | for i = 0xE0 to 0xE0 do (cut i; print_case 2) done;
83 | for i = 0xE1 to 0xEC do (cut i; print_case 3) done;
84 | for i = 0xED to 0xED do (cut i; print_case 4) done;
85 | for i = 0xEE to 0xEF do (cut i; print_case 3) done;
86 | for i = 0xF0 to 0xF0 do (cut i; print_case 5) done;
87 | for i = 0xF1 to 0xF3 do (cut i; print_case 6) done;
88 | for i = 0xF4 to 0xF4 do (cut i; print_case 7) done;
89 | for i = 0xF5 to 0xFF do (cut i; print_case 8) done;
90 | () *)
91 | "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\
92 | \000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\
93 | \000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\
94 | \000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\
95 | \000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\
96 | \000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\
97 | \000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\
98 | \000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\
99 | \008\008\008\008\008\008\008\008\008\008\008\008\008\008\008\008\
100 | \008\008\008\008\008\008\008\008\008\008\008\008\008\008\008\008\
101 | \008\008\008\008\008\008\008\008\008\008\008\008\008\008\008\008\
102 | \008\008\008\008\008\008\008\008\008\008\008\008\008\008\008\008\
103 | \008\008\001\001\001\001\001\001\001\001\001\001\001\001\001\001\
104 | \001\001\001\001\001\001\001\001\001\001\001\001\001\001\001\001\
105 | \002\003\003\003\003\003\003\003\003\003\003\003\003\004\003\003\
106 | \005\006\006\006\007\008\008\008\008\008\008\008\008\008\008\008"
107 |
108 | let utf_8_decode_case' = unsafe_of_string utf_8_decode_case
109 |
110 | let[@inline] not_in_x80_to_xBF b = b lsr 6 <> 0b10
111 | let[@inline] not_in_xA0_to_xBF b = b lsr 5 <> 0b101
112 | let[@inline] not_in_x80_to_x9F b = b lsr 5 <> 0b100
113 | let[@inline] not_in_x90_to_xBF b = b < 0x90 || 0xBF < b
114 | let[@inline] not_in_x80_to_x8F b = b lsr 4 <> 0x8
115 |
116 | let[@inline] utf_8_uchar_2 b0 b1 =
117 | ((b0 land 0x1F) lsl 6) lor
118 | ((b1 land 0x3F))
119 |
120 | let[@inline] utf_8_uchar_3 b0 b1 b2 =
121 | ((b0 land 0x0F) lsl 12) lor
122 | ((b1 land 0x3F) lsl 6) lor
123 | ((b2 land 0x3F))
124 |
125 | let[@inline] utf_8_uchar_4 b0 b1 b2 b3 =
126 | ((b0 land 0x07) lsl 18) lor
127 | ((b1 land 0x3F) lsl 12) lor
128 | ((b2 land 0x3F) lsl 6) lor
129 | ((b3 land 0x3F))
130 |
131 | let get_utf_8_uchar b i =
132 | let b0 = get_uint8 b i in (* raises if [i] is not a valid index. *)
133 | let get = unsafe_get_uint8 in
134 | let max = length b - 1 in
135 | match get utf_8_decode_case' b0 with
136 | | 0 (* 00..7F *) -> dec_ret 1 b0
137 | | 1 (* C2..DF *) ->
138 | let i = i + 1 in if i > max then dec_invalid 1 else
139 | let b1 = get b i in if not_in_x80_to_xBF b1 then dec_invalid 1 else
140 | dec_ret 2 (utf_8_uchar_2 b0 b1)
141 | | 2 (* E0 *) ->
142 | let i = i + 1 in if i > max then dec_invalid 1 else
143 | let b1 = get b i in if not_in_xA0_to_xBF b1 then dec_invalid 1 else
144 | let i = i + 1 in if i > max then dec_invalid 2 else
145 | let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
146 | dec_ret 3 (utf_8_uchar_3 b0 b1 b2)
147 | | 3 -> (* E1..EC or EE..EF *)
148 | let i = i + 1 in if i > max then dec_invalid 1 else
149 | let b1 = get b i in if not_in_x80_to_xBF b1 then dec_invalid 1 else
150 | let i = i + 1 in if i > max then dec_invalid 2 else
151 | let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
152 | dec_ret 3 (utf_8_uchar_3 b0 b1 b2)
153 | | 4 -> (* ED *)
154 | let i = i + 1 in if i > max then dec_invalid 1 else
155 | let b1 = get b i in if not_in_x80_to_x9F b1 then dec_invalid 1 else
156 | let i = i + 1 in if i > max then dec_invalid 2 else
157 | let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
158 | dec_ret 3 (utf_8_uchar_3 b0 b1 b2)
159 | | 5 -> (* F0 *)
160 | let i = i + 1 in if i > max then dec_invalid 1 else
161 | let b1 = get b i in if not_in_x90_to_xBF b1 then dec_invalid 1 else
162 | let i = i + 1 in if i > max then dec_invalid 2 else
163 | let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
164 | let i = i + 1 in if i > max then dec_invalid 3 else
165 | let b3 = get b i in if not_in_x80_to_xBF b3 then dec_invalid 3 else
166 | dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3)
167 | | 6 -> (* F1..F3 *)
168 | let i = i + 1 in if i > max then dec_invalid 1 else
169 | let b1 = get b i in if not_in_x80_to_xBF b1 then dec_invalid 1 else
170 | let i = i + 1 in if i > max then dec_invalid 2 else
171 | let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
172 | let i = i + 1 in if i > max then dec_invalid 3 else
173 | let b3 = get b i in if not_in_x80_to_xBF b3 then dec_invalid 3 else
174 | dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3)
175 | | 7 -> (* F4 *)
176 | let i = i + 1 in if i > max then dec_invalid 1 else
177 | let b1 = get b i in if not_in_x80_to_x8F b1 then dec_invalid 1 else
178 | let i = i + 1 in if i > max then dec_invalid 2 else
179 | let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
180 | let i = i + 1 in if i > max then dec_invalid 3 else
181 | let b3 = get b i in if not_in_x80_to_xBF b3 then dec_invalid 3 else
182 | dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3)
183 | | 8 -> dec_invalid 1
184 | | _ -> assert false
185 |
186 | let set_utf_8_uchar b i u =
187 | let set = unsafe_set_uint8 in
188 | let max = length b - 1 in
189 | match Uchar.to_int u with
190 | | u when u < 0 -> assert false
191 | | u when u <= 0x007F ->
192 | set_uint8 b i u;
193 | 1
194 | | u when u <= 0x07FF ->
195 | let last = i + 1 in
196 | if last > max then 0 else
197 | (set_uint8 b i (0xC0 lor (u lsr 6));
198 | set b last (0x80 lor (u land 0x3F));
199 | 2)
200 | | u when u <= 0xFFFF ->
201 | let last = i + 2 in
202 | if last > max then 0 else
203 | (set_uint8 b i (0xE0 lor (u lsr 12));
204 | set b (i + 1) (0x80 lor ((u lsr 6) land 0x3F));
205 | set b last (0x80 lor (u land 0x3F));
206 | 3)
207 | | u ->
208 | let last = i + 3 in
209 | if last > max then 0 else
210 | (set_uint8 b i (0xF0 lor (u lsr 18));
211 | set b (i + 1) (0x80 lor ((u lsr 12) land 0x3F));
212 | set b (i + 2) (0x80 lor ((u lsr 6) land 0x3F));
213 | set b last (0x80 lor (u land 0x3F));
214 | 4)
215 |
216 | let is_valid_utf_8 b =
217 | let rec loop max b i =
218 | if i > max then true else
219 | let get = unsafe_get_uint8 in
220 | match get utf_8_decode_case' (get b i) with
221 | | 0 (* 00..7F *) -> loop max b (i + 1)
222 | | 1 (* C2..DF *) ->
223 | let last = i + 1 in
224 | if last > max
225 | || not_in_x80_to_xBF (get b last)
226 | then false
227 | else loop max b (last + 1)
228 | | 2 (* E0 *) ->
229 | let last = i + 2 in
230 | if last > max
231 | || not_in_xA0_to_xBF (get b (i + 1))
232 | || not_in_x80_to_xBF (get b last)
233 | then false
234 | else loop max b (last + 1)
235 | | 3 -> (* E1..EC or EE..EF *)
236 | let last = i + 2 in
237 | if last > max
238 | || not_in_x80_to_xBF (get b (i + 1))
239 | || not_in_x80_to_xBF (get b last)
240 | then false
241 | else loop max b (last + 1)
242 | | 4 -> (* ED *)
243 | let last = i + 2 in
244 | if last > max
245 | || not_in_x80_to_x9F (get b (i + 1))
246 | || not_in_x80_to_xBF (get b last)
247 | then false
248 | else loop max b (last + 1)
249 | | 5 -> (* F0 *)
250 | let last = i + 3 in
251 | if last > max
252 | || not_in_x90_to_xBF (get b (i + 1))
253 | || not_in_x80_to_xBF (get b (i + 2))
254 | || not_in_x80_to_xBF (get b last)
255 | then false
256 | else loop max b (last + 1)
257 | | 6 -> (* F1..F3 *)
258 | let last = i + 3 in
259 | if last > max
260 | || not_in_x80_to_xBF (get b (i + 1))
261 | || not_in_x80_to_xBF (get b (i + 2))
262 | || not_in_x80_to_xBF (get b last)
263 | then false
264 | else loop max b (last + 1)
265 | | 7 -> (* F4 *)
266 | let last = i + 3 in
267 | if last > max
268 | || not_in_x80_to_x8F (get b (i + 1))
269 | || not_in_x80_to_xBF (get b (i + 2))
270 | || not_in_x80_to_xBF (get b last)
271 | then false
272 | else loop max b (last + 1)
273 | | 8 -> false
274 | | _ -> assert false
275 | in
276 | loop (length b - 1) b 0
277 |
278 | (* UTF-16BE *)
279 |
280 | let get_utf_16be_uchar b i =
281 | let get = unsafe_get_uint16_be in
282 | let max = length b - 1 in
283 | if i < 0 || i > max then invalid_arg "index out of bounds" else
284 | if i = max then dec_invalid 1 else
285 | match get b i with
286 | | u when u < 0xD800 || u > 0xDFFF -> dec_ret 2 u
287 | | u when u > 0xDBFF -> dec_invalid 2
288 | | hi -> (* combine [hi] with a low surrogate *)
289 | let last = i + 3 in
290 | if last > max then dec_invalid (max - i + 1) else
291 | match get b (i + 2) with
292 | | u when u < 0xDC00 || u > 0xDFFF -> dec_invalid 2 (* retry here *)
293 | | lo ->
294 | let u = (((hi land 0x3FF) lsl 10) lor (lo land 0x3FF)) + 0x10000 in
295 | dec_ret 4 u
296 |
297 | let set_utf_16be_uchar b i u =
298 | let set = unsafe_set_uint16_be in
299 | let max = length b - 1 in
300 | if i < 0 || i > max then invalid_arg "index out of bounds" else
301 | match Uchar.to_int u with
302 | | u when u < 0 -> assert false
303 | | u when u <= 0xFFFF ->
304 | let last = i + 1 in
305 | if last > max then 0 else (set b i u; 2)
306 | | u ->
307 | let last = i + 3 in
308 | if last > max then 0 else
309 | let u' = u - 0x10000 in
310 | let hi = (0xD800 lor (u' lsr 10)) in
311 | let lo = (0xDC00 lor (u' land 0x3FF)) in
312 | set b i hi; set b (i + 2) lo; 4
313 |
314 | let is_valid_utf_16be b =
315 | let rec loop max b i =
316 | let get = unsafe_get_uint16_be in
317 | if i > max then true else
318 | if i = max then false else
319 | match get b i with
320 | | u when u < 0xD800 || u > 0xDFFF -> loop max b (i + 2)
321 | | u when u > 0xDBFF -> false
322 | | hi ->
323 | let last = i + 3 in
324 | if last > max then false else
325 | match get b (i + 2) with
326 | | u when u < 0xDC00 || u > 0xDFFF -> false
327 | | lo -> loop max b (i + 4)
328 | in
329 | loop (length b - 1) b 0
330 |
331 | (* UTF-16LE *)
332 |
333 | let get_utf_16le_uchar b i =
334 | let get = unsafe_get_uint16_le in
335 | let max = length b - 1 in
336 | if i < 0 || i > max then invalid_arg "index out of bounds" else
337 | if i = max then dec_invalid 1 else
338 | match get b i with
339 | | u when u < 0xD800 || u > 0xDFFF -> dec_ret 2 u
340 | | u when u > 0xDBFF -> dec_invalid 2
341 | | hi -> (* combine [hi] with a low surrogate *)
342 | let last = i + 3 in
343 | if last > max then dec_invalid (max - i + 1) else
344 | match get b (i + 2) with
345 | | u when u < 0xDC00 || u > 0xDFFF -> dec_invalid 2 (* retry here *)
346 | | lo ->
347 | let u = (((hi land 0x3FF) lsl 10) lor (lo land 0x3FF)) + 0x10000 in
348 | dec_ret 4 u
349 |
350 | let set_utf_16le_uchar b i u =
351 | let set = unsafe_set_uint16_le in
352 | let max = length b - 1 in
353 | if i < 0 || i > max then invalid_arg "index out of bounds" else
354 | match Uchar.to_int u with
355 | | u when u < 0 -> assert false
356 | | u when u <= 0xFFFF ->
357 | let last = i + 1 in
358 | if last > max then -2 else (set b i u; 2)
359 | | u ->
360 | let last = i + 3 in
361 | if last > max then -4 else
362 | let u' = u - 0x10000 in
363 | let hi = (0xD800 lor (u' lsr 10)) in
364 | let lo = (0xDC00 lor (u' land 0x3FF)) in
365 | set b i hi; set b (i + 2) lo; 4
366 |
367 | let is_valid_utf_16le b =
368 | let rec loop max b i =
369 | let get = unsafe_get_uint16_le in
370 | if i > max then true else
371 | if i = max then false else
372 | match get b i with
373 | | u when u < 0xD800 || u > 0xDFFF -> loop max b (i + 2)
374 | | u when u > 0xDBFF -> false
375 | | hi ->
376 | let last = i + 3 in
377 | if last > max then false else
378 | match get b (i + 2) with
379 | | u when u < 0xDC00 || u > 0xDFFF -> false
380 | | lo -> loop max b (i + 4)
381 | in
382 | loop (length b - 1) b 0
383 | end
384 |
--------------------------------------------------------------------------------
/utf_x_adhoc.mli:
--------------------------------------------------------------------------------
1 | (**************************************************************************)
2 | (* *)
3 | (* OCaml *)
4 | (* *)
5 | (* The OCaml programmers *)
6 | (* *)
7 | (* Copyright 2021 Institut National de Recherche en Informatique et *)
8 | (* en Automatique. *)
9 | (* *)
10 | (* All rights reserved. This file is distributed under the terms of *)
11 | (* the GNU Lesser General Public License version 2.1, with the *)
12 | (* special exception on linking described in the file LICENSE. *)
13 | (* *)
14 | (**************************************************************************)
15 |
16 | include Utf_x.S
17 |
--------------------------------------------------------------------------------
/utf_x_dfa.ml:
--------------------------------------------------------------------------------
1 | (**************************************************************************)
2 | (* *)
3 | (* OCaml *)
4 | (* *)
5 | (* The OCaml programmers *)
6 | (* *)
7 | (* Copyright 2021 Institut National de Recherche en Informatique et *)
8 | (* en Automatique. *)
9 | (* *)
10 | (* All rights reserved. This file is distributed under the terms of *)
11 | (* the GNU Lesser General Public License version 2.1, with the *)
12 | (* special exception on linking described in the file LICENSE. *)
13 | (* *)
14 | (**************************************************************************)
15 |
16 | module Uchar = Utf_uchar
17 | module Bytes = struct
18 | include Bytes
19 |
20 | (* Unsafe internal additions *)
21 |
22 | external unsafe_set_uint8 : t -> int -> int -> unit = "%bytes_unsafe_set"
23 | external unsafe_get_uint8 : t -> int -> int = "%bytes_unsafe_get"
24 |
25 | (* UTF-X codecs and validations *)
26 |
27 | let dec_invalid = Uchar.utf_decode_invalid
28 | let[@inline] dec_ret n u = Uchar.utf_decode n (Uchar.unsafe_of_int u)
29 |
30 | (* UTF-8 *)
31 |
32 | let utf_8_dfa =
33 | (* The first 256 entries map bytes to their class. Then the
34 | DFA follows. The DFA automaton is MIT-licensed
35 | Copyright (c) 2008-2009 Bjoern Hoehrmann.
36 | See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/#variations*)
37 | unsafe_of_string
38 | "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\
39 | \000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\
40 | \000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\
41 | \000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\
42 | \000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\
43 | \000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\
44 | \000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\
45 | \000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\
46 | \001\001\001\001\001\001\001\001\001\001\001\001\001\001\001\001\
47 | \009\009\009\009\009\009\009\009\009\009\009\009\009\009\009\009\
48 | \007\007\007\007\007\007\007\007\007\007\007\007\007\007\007\007\
49 | \007\007\007\007\007\007\007\007\007\007\007\007\007\007\007\007\
50 | \008\008\002\002\002\002\002\002\002\002\002\002\002\002\002\002\
51 | \002\002\002\002\002\002\002\002\002\002\002\002\002\002\002\002\
52 | \010\003\003\003\003\003\003\003\003\003\003\003\003\004\003\003\
53 | \011\006\006\006\005\008\008\008\008\008\008\008\008\008\008\008\
54 | \000\012\024\036\060\096\084\012\012\012\048\072\
55 | \012\012\012\012\012\012\012\012\012\012\012\012\
56 | \012\000\012\012\012\012\012\000\012\000\012\012\
57 | \012\024\012\012\012\012\012\024\012\024\012\012\
58 | \012\012\012\012\012\012\012\024\012\012\012\012\
59 | \012\024\012\012\012\012\012\012\012\024\012\012\
60 | \012\012\012\012\012\012\012\036\012\036\012\012\
61 | \012\036\012\012\012\012\012\036\012\036\012\012\
62 | \012\036\012\012\012\012\012\012\012\012\012\012"
63 |
64 | let accept = 0
65 | let reject = 12
66 |
67 | (* Rewriting the following as an imperative loop is uglier but
68 | more efficient.
69 |
70 | let get_utf_8_uchar b i =
71 | let max = length b - 1 in
72 | if i < 0 || i > max then invalid_arg "index out of bounds" else
73 | let rec loop st u j =
74 | if j > max then dec_err (j - i) else
75 | let byte = unsafe_get_uint8 b j in
76 | let class' = unsafe_get_uint8 utf_8_dfa byte in
77 | let u' =
78 | if st <> accept
79 | then (u lsl 6) lor (byte land 0x3F)
80 | else byte land (0xFF lsr class')
81 | in
82 | let st' = unsafe_get_uint8 utf_8_dfa (256 + st + class') in
83 | if st' = reject then dec_err (j - i + 1) else
84 | if st' = accept then dec_ret (j - i + 1) u' else
85 | loop st' u' (j + 1)
86 | in
87 | loop accept 0 i
88 | *)
89 |
90 | let get_utf_8_uchar b i =
91 | let max = length b - 1 in
92 | if i < 0 || i > max then invalid_arg "index out of bounds" else
93 | let stop = ref false in
94 | let st = ref accept in
95 | let j = ref i in
96 | let u = ref 0 in
97 | while (not !stop) do
98 | let byte = unsafe_get_uint8 b !j in
99 | let class' = unsafe_get_uint8 utf_8_dfa byte in
100 | u :=
101 | (if !st <> accept
102 | then (!u lsl 6) lor (byte land 0x3F)
103 | else byte land (0xFF lsr class'));
104 | st := unsafe_get_uint8 utf_8_dfa (256 + !st + class');
105 | if (!st = reject || !st = accept)
106 | then stop := true
107 | else (incr j; if !j > max then stop := true);
108 | done;
109 | if !j > max then dec_invalid (!j - i) else
110 | if !st = accept then dec_ret (!j - i + 1) !u else
111 | if !st = reject then dec_invalid (if i = !j then 1 else (!j - i)) else
112 | assert false
113 |
114 | let set_utf_8_uchar b i u =
115 | let set = unsafe_set_uint8 in
116 | let max = length b - 1 in
117 | match Uchar.to_int u with
118 | | u when u <= 0x007F ->
119 | set_uint8 b i u;
120 | 1
121 | | u when u <= 0x07FF ->
122 | let last = i + 1 in
123 | if last > max then 0 else
124 | (set_uint8 b i (0xC0 lor (u lsr 6));
125 | set b last (0x80 lor (u land 0x3F));
126 | 2)
127 | | u when u <= 0xFFFF ->
128 | let last = i + 2 in
129 | if last > max then 0 else
130 | (set_uint8 b i (0xE0 lor (u lsr 12));
131 | set b (i + 1) (0x80 lor ((u lsr 6) land 0x3F));
132 | set b last (0x80 lor (u land 0x3F));
133 | 3)
134 | | u ->
135 | let last = i + 3 in
136 | if last > max then 0 else
137 | (set_uint8 b i (0xF0 lor (u lsr 18));
138 | set b (i + 1) (0x80 lor ((u lsr 12) land 0x3F));
139 | set b (i + 2) (0x80 lor ((u lsr 6) land 0x3F));
140 | set b last (0x80 lor (u land 0x3F));
141 | 4)
142 |
143 | let is_valid_utf_8 b =
144 | (* This could be optimized using the state machine. We didn't
145 | do it for now since it seems the adhoc implementation yields
146 | better perfs. *)
147 | let rec loop b i =
148 | if i >= length b then true else
149 | let dec = get_utf_8_uchar b i in
150 | if Uchar.utf_decode_is_valid dec
151 | then loop b (i + Uchar.utf_decode_length dec)
152 | else false
153 | in
154 | loop b 0
155 |
156 | include Utf_16
157 | end
158 |
--------------------------------------------------------------------------------
/utf_x_dfa.mli:
--------------------------------------------------------------------------------
1 | (**************************************************************************)
2 | (* *)
3 | (* OCaml *)
4 | (* *)
5 | (* The OCaml programmers *)
6 | (* *)
7 | (* Copyright 2021 Institut National de Recherche en Informatique et *)
8 | (* en Automatique. *)
9 | (* *)
10 | (* All rights reserved. This file is distributed under the terms of *)
11 | (* the GNU Lesser General Public License version 2.1, with the *)
12 | (* special exception on linking described in the file LICENSE. *)
13 | (* *)
14 | (**************************************************************************)
15 |
16 | include Utf_x.S
17 |
--------------------------------------------------------------------------------
/utf_x_if.ml:
--------------------------------------------------------------------------------
1 | (**************************************************************************)
2 | (* *)
3 | (* OCaml *)
4 | (* *)
5 | (* The OCaml programmers *)
6 | (* *)
7 | (* Copyright 2021 Institut National de Recherche en Informatique et *)
8 | (* en Automatique. *)
9 | (* *)
10 | (* All rights reserved. This file is distributed under the terms of *)
11 | (* the GNU Lesser General Public License version 2.1, with the *)
12 | (* special exception on linking described in the file LICENSE. *)
13 | (* *)
14 | (**************************************************************************)
15 |
16 | module Uchar = Utf_uchar
17 | module Bytes = struct
18 | include Bytes
19 |
20 | (* Unsafe internal additions *)
21 |
22 | external unsafe_set_uint8 : t -> int -> int -> unit = "%bytes_unsafe_set"
23 | external unsafe_get_uint8 : t -> int -> int = "%bytes_unsafe_get"
24 |
25 | (* UTF-X codecs and validations *)
26 |
27 | let dec_invalid = Uchar.utf_decode_invalid
28 | let[@inline] dec_ret n u = Uchar.utf_decode n (Uchar.unsafe_of_int u)
29 |
30 | (* In case of decoding error, if we error on the first byte, we
31 | consume the byte, otherwise we consume the [n] bytes preceeding
32 | erroring byte.
33 |
34 | This means that if a client uses decodes without caring about
35 | validity it naturally replace bogus data with Uchar.rep according
36 | to the WHATWG Encoding standard. Other schemes are possible by
37 | consulting the number of used bytes on invalid decodes. For more
38 | details see https://hsivonen.fi/broken-utf-8/
39 |
40 | For this reason in [get_utf_8_uchar] we gradually check the next
41 | byte is available rather than doing it immediately after the
42 | first byte. Contrast with [is_valid_utf_8]. *)
43 |
44 | (* UTF-8 *)
45 |
46 | let[@inline] not_in_x80_to_xBF b = b lsr 6 <> 0b10
47 | let[@inline] not_in_xA0_to_xBF b = b lsr 5 <> 0b101
48 | let[@inline] not_in_x80_to_x9F b = b lsr 5 <> 0b100
49 | let[@inline] not_in_x90_to_xBF b = b < 0x90 || 0xBF < b
50 | let[@inline] not_in_x80_to_x8F b = b lsr 4 <> 0x8
51 |
52 | let[@inline] utf_8_uchar_2 b0 b1 =
53 | ((b0 land 0x1F) lsl 6) lor
54 | ((b1 land 0x3F))
55 |
56 | let[@inline] utf_8_uchar_3 b0 b1 b2 =
57 | ((b0 land 0x0F) lsl 12) lor
58 | ((b1 land 0x3F) lsl 6) lor
59 | ((b2 land 0x3F))
60 |
61 | let[@inline] utf_8_uchar_4 b0 b1 b2 b3 =
62 | ((b0 land 0x07) lsl 18) lor
63 | ((b1 land 0x3F) lsl 12) lor
64 | ((b2 land 0x3F) lsl 6) lor
65 | ((b3 land 0x3F))
66 |
67 | let get_utf_8_uchar b i =
68 | let b0 = get_uint8 b i in (* raises if [i] is not a valid index. *)
69 | let get = unsafe_get_uint8 in
70 | if b0 <= 0x7F then (* 00..7F *) dec_ret 1 b0 else
71 | if b0 land 0xE0 = 0xC0 && b0 >= 0xC2 then begin
72 | (* C2..DF *)
73 | let max = length b - 1 in
74 | let i = i + 1 in if i > max then dec_invalid 1 else
75 | let b1 = get b i in if not_in_x80_to_xBF b1 then dec_invalid 1 else
76 | dec_ret 2 (utf_8_uchar_2 b0 b1)
77 | end else if b0 land 0xF0 = 0xE0 then begin
78 | let max = length b - 1 in
79 | if b0 = 0xE0 then begin
80 | (* E0 *)
81 | let i = i + 1 in if i > max then dec_invalid 1 else
82 | let b1 = get b i in if not_in_xA0_to_xBF b1 then dec_invalid 1 else
83 | let i = i + 1 in if i > max then dec_invalid 2 else
84 | let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
85 | dec_ret 3 (utf_8_uchar_3 b0 b1 b2)
86 | end else if b0 = 0xED then begin
87 | (* ED *)
88 | let i = i + 1 in if i > max then dec_invalid 1 else
89 | let b1 = get b i in if not_in_x80_to_x9F b1 then dec_invalid 1 else
90 | let i = i + 1 in if i > max then dec_invalid 2 else
91 | let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
92 | dec_ret 3 (utf_8_uchar_3 b0 b1 b2)
93 | end else begin
94 | (* E1..EC or EE..EF *)
95 | let i = i + 1 in if i > max then dec_invalid 1 else
96 | let b1 = get b i in if not_in_x80_to_xBF b1 then dec_invalid 1 else
97 | let i = i + 1 in if i > max then dec_invalid 2 else
98 | let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
99 | dec_ret 3 (utf_8_uchar_3 b0 b1 b2)
100 | end
101 | end else if b0 land 0xF8 = 0xF0 && b0 <= 0xF4 then begin
102 | let max = length b - 1 in
103 | if b0 = 0xF0 then begin
104 | (* F0 *)
105 | let i = i + 1 in if i > max then dec_invalid 1 else
106 | let b1 = get b i in if not_in_x90_to_xBF b1 then dec_invalid 1 else
107 | let i = i + 1 in if i > max then dec_invalid 2 else
108 | let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
109 | let i = i + 1 in if i > max then dec_invalid 3 else
110 | let b3 = get b i in if not_in_x80_to_xBF b3 then dec_invalid 3 else
111 | dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3)
112 | end else if b0 = 0xF4 then begin
113 | (* F4 *)
114 | let i = i + 1 in if i > max then dec_invalid 1 else
115 | let b1 = get b i in if not_in_x80_to_x8F b1 then dec_invalid 1 else
116 | let i = i + 1 in if i > max then dec_invalid 2 else
117 | let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
118 | let i = i + 1 in if i > max then dec_invalid 3 else
119 | let b3 = get b i in if not_in_x80_to_xBF b3 then dec_invalid 3 else
120 | dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3)
121 | end else begin
122 | let i = i + 1 in if i > max then dec_invalid 1 else
123 | let b1 = get b i in if not_in_x80_to_xBF b1 then dec_invalid 1 else
124 | let i = i + 1 in if i > max then dec_invalid 2 else
125 | let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
126 | let i = i + 1 in if i > max then dec_invalid 3 else
127 | let b3 = get b i in if not_in_x80_to_xBF b3 then dec_invalid 3 else
128 | dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3)
129 | end
130 | end else dec_invalid 1
131 |
132 | let set_utf_8_uchar b i u =
133 | let set = unsafe_set_uint8 in
134 | let max = length b - 1 in
135 | match Uchar.to_int u with
136 | | u when u < 0 -> assert false
137 | | u when u <= 0x007F ->
138 | set_uint8 b i u;
139 | 1
140 | | u when u <= 0x07FF ->
141 | let last = i + 1 in
142 | if last > max then 0 else
143 | (set_uint8 b i (0xC0 lor (u lsr 6));
144 | set b last (0x80 lor (u land 0x3F));
145 | 2)
146 | | u when u <= 0xFFFF ->
147 | let last = i + 2 in
148 | if last > max then 0 else
149 | (set_uint8 b i (0xE0 lor (u lsr 12));
150 | set b (i + 1) (0x80 lor ((u lsr 6) land 0x3F));
151 | set b last (0x80 lor (u land 0x3F));
152 | 3)
153 | | u ->
154 | let last = i + 3 in
155 | if last > max then 0 else
156 | (set_uint8 b i (0xF0 lor (u lsr 18));
157 | set b (i + 1) (0x80 lor ((u lsr 12) land 0x3F));
158 | set b (i + 2) (0x80 lor ((u lsr 6) land 0x3F));
159 | set b last (0x80 lor (u land 0x3F));
160 | 4)
161 |
162 | let is_valid_utf_8 b =
163 | let rec loop max b i =
164 | if i > max then true else
165 | let get = unsafe_get_uint8 in
166 | let b0 = get b i in
167 | if b0 <= 0x7F then (* 00..7F *) loop max b (i + 1) else
168 | if b0 land 0xE0 = 0xC0 && b0 >= 0xC2 then begin
169 | (* C2..DF *)
170 | let last = i + 1 in
171 | if last > max
172 | || not_in_x80_to_xBF (get b last)
173 | then false
174 | else loop max b (last + 1)
175 | end else if b0 land 0xF0 = 0xE0 then begin
176 | if b0 = 0xE0 then begin
177 | (* E0 *)
178 | let last = i + 2 in
179 | if last > max
180 | || not_in_xA0_to_xBF (get b (i + 1))
181 | || not_in_x80_to_xBF (get b last)
182 | then false
183 | else loop max b (last + 1)
184 | end else if b0 = 0xED then begin
185 | let last = i + 2 in
186 | if last > max
187 | || not_in_x80_to_x9F (get b (i + 1))
188 | || not_in_x80_to_xBF (get b last)
189 | then false
190 | else loop max b (last + 1)
191 | end else begin
192 | (* E1..EC or EE..EF *)
193 | let last = i + 2 in
194 | if last > max
195 | || not_in_x80_to_xBF (get b (i + 1))
196 | || not_in_x80_to_xBF (get b last)
197 | then false
198 | else loop max b (last + 1)
199 | end
200 | end else if b0 land 0xF8 = 0xF0 && b0 <= 0xF4 then begin
201 | if b0 = 0xF0 then begin
202 | (* F0 *)
203 | let last = i + 3 in
204 | if last > max
205 | || not_in_x90_to_xBF (get b (i + 1))
206 | || not_in_x80_to_xBF (get b (i + 2))
207 | || not_in_x80_to_xBF (get b last)
208 | then false
209 | else loop max b (last + 1)
210 | end else if b0 = 0xF4 then begin
211 | (* F4 *)
212 | let last = i + 3 in
213 | if last > max
214 | || not_in_x80_to_x8F (get b (i + 1))
215 | || not_in_x80_to_xBF (get b (i + 2))
216 | || not_in_x80_to_xBF (get b last)
217 | then false
218 | else loop max b (last + 1)
219 | end else begin
220 | (* F1..F3 *)
221 | let last = i + 3 in
222 | if last > max
223 | || not_in_x80_to_xBF (get b (i + 1))
224 | || not_in_x80_to_xBF (get b (i + 2))
225 | || not_in_x80_to_xBF (get b last)
226 | then false
227 | else loop max b (last + 1)
228 | end
229 | end else false
230 | in
231 | loop (length b - 1) b 0
232 |
233 | include Utf_16
234 | end
235 |
--------------------------------------------------------------------------------
/utf_x_if.mli:
--------------------------------------------------------------------------------
1 | (**************************************************************************)
2 | (* *)
3 | (* OCaml *)
4 | (* *)
5 | (* The OCaml programmers *)
6 | (* *)
7 | (* Copyright 2021 Institut National de Recherche en Informatique et *)
8 | (* en Automatique. *)
9 | (* *)
10 | (* All rights reserved. This file is distributed under the terms of *)
11 | (* the GNU Lesser General Public License version 2.1, with the *)
12 | (* special exception on linking described in the file LICENSE. *)
13 | (* *)
14 | (**************************************************************************)
15 |
16 | include Utf_x.S
17 |
--------------------------------------------------------------------------------
/utf_x_pat.ml:
--------------------------------------------------------------------------------
1 | (**************************************************************************)
2 | (* *)
3 | (* OCaml *)
4 | (* *)
5 | (* The OCaml programmers *)
6 | (* *)
7 | (* Copyright 2021 Institut National de Recherche en Informatique et *)
8 | (* en Automatique. *)
9 | (* *)
10 | (* All rights reserved. This file is distributed under the terms of *)
11 | (* the GNU Lesser General Public License version 2.1, with the *)
12 | (* special exception on linking described in the file LICENSE. *)
13 | (* *)
14 | (**************************************************************************)
15 |
16 | module Uchar = Utf_uchar
17 | module Bytes = struct
18 | include Bytes
19 |
20 | (* Unsafe internal additions *)
21 |
22 | external unsafe_set_uint8 : t -> int -> int -> unit = "%bytes_unsafe_set"
23 | external unsafe_get_uint8 : t -> int -> int = "%bytes_unsafe_get"
24 |
25 | (* UTF-X codecs and validations *)
26 |
27 | let dec_invalid = Uchar.utf_decode_invalid
28 | let[@inline] dec_ret n u = Uchar.utf_decode n (Uchar.unsafe_of_int u)
29 |
30 | (* In case of decoding error, if we error on the first byte, we
31 | consume the byte, otherwise we consume the [n] bytes preceeding
32 | the erroring byte.
33 |
34 | This means that if a client uses decodes without caring about
35 | validity it naturally replace bogus data with Uchar.rep according
36 | to the WHATWG Encoding standard. Other schemes are possible by
37 | consulting the number of used bytes on invalid decodes. For more
38 | details see https://hsivonen.fi/broken-utf-8/
39 |
40 | For this reason in [get_utf_8_uchar] we gradually check the next
41 | byte is available rather than doing it immediately after the
42 | first byte. Contrast with [is_valid_utf_8]. *)
43 |
44 | (* UTF-8 *)
45 |
46 | let[@inline] not_in_x80_to_xBF b = b lsr 6 <> 0b10
47 | let[@inline] not_in_xA0_to_xBF b = b lsr 5 <> 0b101
48 | let[@inline] not_in_x80_to_x9F b = b lsr 5 <> 0b100
49 | let[@inline] not_in_x90_to_xBF b = b < 0x90 || 0xBF < b
50 | let[@inline] not_in_x80_to_x8F b = b lsr 4 <> 0x8
51 |
52 | let[@inline] utf_8_uchar_2 b0 b1 =
53 | ((b0 land 0x1F) lsl 6) lor
54 | ((b1 land 0x3F))
55 |
56 | let[@inline] utf_8_uchar_3 b0 b1 b2 =
57 | ((b0 land 0x0F) lsl 12) lor
58 | ((b1 land 0x3F) lsl 6) lor
59 | ((b2 land 0x3F))
60 |
61 | let[@inline] utf_8_uchar_4 b0 b1 b2 b3 =
62 | ((b0 land 0x07) lsl 18) lor
63 | ((b1 land 0x3F) lsl 12) lor
64 | ((b2 land 0x3F) lsl 6) lor
65 | ((b3 land 0x3F))
66 |
67 | let get_utf_8_uchar b i =
68 | let b0 = get_uint8 b i in (* raises if [i] is not a valid index. *)
69 | let get = unsafe_get_uint8 in
70 | let max = length b - 1 in
71 | match Char.unsafe_chr b0 with (* See The Unicode Standard, Table 3.7 *)
72 | | '\x00' .. '\x7F' -> dec_ret 1 b0
73 | | '\xC2' .. '\xDF' ->
74 | let i = i + 1 in if i > max then dec_invalid 1 else
75 | let b1 = get b i in if not_in_x80_to_xBF b1 then dec_invalid 1 else
76 | dec_ret 2 (utf_8_uchar_2 b0 b1)
77 | | '\xE0' ->
78 | let i = i + 1 in if i > max then dec_invalid 1 else
79 | let b1 = get b i in if not_in_xA0_to_xBF b1 then dec_invalid 1 else
80 | let i = i + 1 in if i > max then dec_invalid 2 else
81 | let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
82 | dec_ret 3 (utf_8_uchar_3 b0 b1 b2)
83 | | '\xE1' .. '\xEC' | '\xEE' .. '\xEF' ->
84 | let i = i + 1 in if i > max then dec_invalid 1 else
85 | let b1 = get b i in if not_in_x80_to_xBF b1 then dec_invalid 1 else
86 | let i = i + 1 in if i > max then dec_invalid 2 else
87 | let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
88 | dec_ret 3 (utf_8_uchar_3 b0 b1 b2)
89 | | '\xED' ->
90 | let i = i + 1 in if i > max then dec_invalid 1 else
91 | let b1 = get b i in if not_in_x80_to_x9F b1 then dec_invalid 1 else
92 | let i = i + 1 in if i > max then dec_invalid 2 else
93 | let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
94 | dec_ret 3 (utf_8_uchar_3 b0 b1 b2)
95 | | '\xF0' ->
96 | let i = i + 1 in if i > max then dec_invalid 1 else
97 | let b1 = get b i in if not_in_x90_to_xBF b1 then dec_invalid 1 else
98 | let i = i + 1 in if i > max then dec_invalid 2 else
99 | let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
100 | let i = i + 1 in if i > max then dec_invalid 3 else
101 | let b3 = get b i in if not_in_x80_to_xBF b3 then dec_invalid 3 else
102 | dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3)
103 | | '\xF1' .. '\xF3' ->
104 | let i = i + 1 in if i > max then dec_invalid 1 else
105 | let b1 = get b i in if not_in_x80_to_xBF b1 then dec_invalid 1 else
106 | let i = i + 1 in if i > max then dec_invalid 2 else
107 | let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
108 | let i = i + 1 in if i > max then dec_invalid 3 else
109 | let b3 = get b i in if not_in_x80_to_xBF b3 then dec_invalid 3 else
110 | dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3)
111 | | '\xF4' ->
112 | let i = i + 1 in if i > max then dec_invalid 1 else
113 | let b1 = get b i in if not_in_x80_to_x8F b1 then dec_invalid 1 else
114 | let i = i + 1 in if i > max then dec_invalid 2 else
115 | let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
116 | let i = i + 1 in if i > max then dec_invalid 3 else
117 | let b3 = get b i in if not_in_x80_to_xBF b3 then dec_invalid 3 else
118 | dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3)
119 | | _ -> dec_invalid 1
120 |
121 | let set_utf_8_uchar b i u =
122 | let set = unsafe_set_uint8 in
123 | let max = length b - 1 in
124 | match Uchar.to_int u with
125 | | u when u < 0 -> assert false
126 | | u when u <= 0x007F ->
127 | set_uint8 b i u;
128 | 1
129 | | u when u <= 0x07FF ->
130 | let last = i + 1 in
131 | if last > max then 0 else
132 | (set_uint8 b i (0xC0 lor (u lsr 6));
133 | set b last (0x80 lor (u land 0x3F));
134 | 2)
135 | | u when u <= 0xFFFF ->
136 | let last = i + 2 in
137 | if last > max then 0 else
138 | (set_uint8 b i (0xE0 lor (u lsr 12));
139 | set b (i + 1) (0x80 lor ((u lsr 6) land 0x3F));
140 | set b last (0x80 lor (u land 0x3F));
141 | 3)
142 | | u ->
143 | let last = i + 3 in
144 | if last > max then 0 else
145 | (set_uint8 b i (0xF0 lor (u lsr 18));
146 | set b (i + 1) (0x80 lor ((u lsr 12) land 0x3F));
147 | set b (i + 2) (0x80 lor ((u lsr 6) land 0x3F));
148 | set b last (0x80 lor (u land 0x3F));
149 | 4)
150 |
151 | let is_valid_utf_8 b =
152 | let rec loop max b i =
153 | if i > max then true else
154 | let get = unsafe_get_uint8 in
155 | match Char.unsafe_chr (get b i) with
156 | | '\x00' .. '\x7F' -> loop max b (i + 1)
157 | | '\xC2' .. '\xDF' ->
158 | let last = i + 1 in
159 | if last > max
160 | || not_in_x80_to_xBF (get b last)
161 | then false
162 | else loop max b (last + 1)
163 | | '\xE0' ->
164 | let last = i + 2 in
165 | if last > max
166 | || not_in_xA0_to_xBF (get b (i + 1))
167 | || not_in_x80_to_xBF (get b last)
168 | then false
169 | else loop max b (last + 1)
170 | | '\xE1' .. '\xEC' | '\xEE' .. '\xEF' ->
171 | let last = i + 2 in
172 | if last > max
173 | || not_in_x80_to_xBF (get b (i + 1))
174 | || not_in_x80_to_xBF (get b last)
175 | then false
176 | else loop max b (last + 1)
177 | | '\xED' ->
178 | let last = i + 2 in
179 | if last > max
180 | || not_in_x80_to_x9F (get b (i + 1))
181 | || not_in_x80_to_xBF (get b last)
182 | then false
183 | else loop max b (last + 1)
184 | | '\xF0' ->
185 | let last = i + 3 in
186 | if last > max
187 | || not_in_x90_to_xBF (get b (i + 1))
188 | || not_in_x80_to_xBF (get b (i + 2))
189 | || not_in_x80_to_xBF (get b last)
190 | then false
191 | else loop max b (last + 1)
192 | | '\xF1' .. '\xF3' ->
193 | let last = i + 3 in
194 | if last > max
195 | || not_in_x80_to_xBF (get b (i + 1))
196 | || not_in_x80_to_xBF (get b (i + 2))
197 | || not_in_x80_to_xBF (get b last)
198 | then false
199 | else loop max b (last + 1)
200 | | '\xF4' ->
201 | let last = i + 3 in
202 | if last > max
203 | || not_in_x80_to_x8F (get b (i + 1))
204 | || not_in_x80_to_xBF (get b (i + 2))
205 | || not_in_x80_to_xBF (get b last)
206 | then false
207 | else loop max b (last + 1)
208 | | _ -> false
209 | in
210 | loop (length b - 1) b 0
211 |
212 | include Utf_16
213 | end
214 |
--------------------------------------------------------------------------------
/utf_x_pat.mli:
--------------------------------------------------------------------------------
1 | (**************************************************************************)
2 | (* *)
3 | (* OCaml *)
4 | (* *)
5 | (* The OCaml programmers *)
6 | (* *)
7 | (* Copyright 2021 Institut National de Recherche en Informatique et *)
8 | (* en Automatique. *)
9 | (* *)
10 | (* All rights reserved. This file is distributed under the terms of *)
11 | (* the GNU Lesser General Public License version 2.1, with the *)
12 | (* special exception on linking described in the file LICENSE. *)
13 | (* *)
14 | (**************************************************************************)
15 |
16 | include Utf_x.S
17 |
--------------------------------------------------------------------------------