├── .gitignore ├── .merlin ├── B0.ml ├── BRZO ├── DEVEL.md ├── LICENSE.md ├── README.md ├── data ├── urep-spec.html └── urep-test.html ├── examples.ml ├── old-proposal.md ├── perf8.ml ├── test.ml ├── test_long.ml ├── trip8.ml ├── utf_16.ml ├── utf_uchar.ml ├── utf_uchar.mli ├── utf_x.mli ├── utf_x_adhoc.ml ├── utf_x_adhoc.mli ├── utf_x_dfa.ml ├── utf_x_dfa.mli ├── utf_x_if.ml ├── utf_x_if.mli ├── utf_x_pat.ml └── utf_x_pat.mli /.gitignore: -------------------------------------------------------------------------------- 1 | _b0 2 | _build 3 | data 4 | tmp 5 | *.install 6 | CLOCK.org 7 | -------------------------------------------------------------------------------- /.merlin: -------------------------------------------------------------------------------- 1 | PKG b0.kit uutf 2 | S ./** 3 | B _b0/b/** 4 | -------------------------------------------------------------------------------- /B0.ml: -------------------------------------------------------------------------------- 1 | open B0_kit.V000 2 | 3 | let uutf = B0_ocaml.libname "uutf" 4 | 5 | let file v = `File (Fpath.v v) 6 | let mod_srcs n = file (n ^ ".mli"), file (n ^ ".ml") 7 | 8 | let iface = file "utf_x.mli" 9 | let uchar_mli, uchar_ml = mod_srcs "utf_uchar" 10 | let utf_16_ml = file "utf_16.ml" 11 | let adhoc_mli, adhoc_ml = mod_srcs "utf_x_adhoc" 12 | let if_mli, if_ml = mod_srcs "utf_x_if" 13 | let dfa_mli, dfa_ml = mod_srcs "utf_x_dfa" 14 | let pat_mli, pat_ml = mod_srcs "utf_x_pat" 15 | 16 | let base = [iface; uchar_mli; uchar_ml; utf_16_ml] 17 | let all_impls = 18 | [adhoc_mli; adhoc_ml; if_mli; if_ml; dfa_mli; dfa_ml; pat_mli; pat_ml ] @ 19 | base 20 | 21 | let examples = 22 | let srcs = [adhoc_mli; adhoc_ml; file "examples.ml"] @ base in 23 | B0_ocaml.exe "examples" ~doc:"Sample code" ~srcs 24 | 25 | let test = 26 | let srcs = (file "test.ml" :: all_impls) in 27 | B0_ocaml.exe "test" ~doc:"Test suite" ~srcs 28 | 29 | let test = 30 | let srcs = (file "test_long.ml" :: all_impls) in 31 | B0_ocaml.exe "test_long" ~doc:"Long test suite" ~srcs 32 | 33 | let perf8 = 34 | let srcs = (file "perf8.ml" :: all_impls) in 35 | let requires = [uutf] in 36 | B0_ocaml.exe "perf8" ~doc:"Performance test tool for UTF-8" ~requires ~srcs 37 | 38 | let trip8 = 39 | let srcs = (file "trip8.ml" :: all_impls) in 40 | let requires = [uutf] in 41 | B0_ocaml.exe "trip8" ~doc:"Best-effort UTF-8 recode" ~requires ~srcs 42 | -------------------------------------------------------------------------------- /BRZO: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dbuenzli/stdlib-utf/3b9a96e7530830bd5ea87c3c92dc9e9bd02d4dee/BRZO -------------------------------------------------------------------------------- /DEVEL.md: -------------------------------------------------------------------------------- 1 | Running stuff. 2 | 3 | ``` 4 | opam install uutf 5 | opam pin add b0 --dev 6 | curl -L https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-demo.txt \ 7 | > data/UTF-8-demo.txt 8 | time b0 -a perf8 -- --adhoc data/UTF-8-demo.txt 9 | time b0 -a perf8 -- --dfa data/UTF-8-demo.txt 10 | time b0 -a perf8 -- --validate --adhoc data/UTF-8-demo.txt 11 | 12 | # If you are interested in absolute numbers avoid timing the build 13 | 14 | b0 15 | time $(b0 unit build-dir perf8)/perf8 ... 16 | ``` 17 | 18 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | ``` 2 | (**************************************************************************) 3 | (* *) 4 | (* OCaml *) 5 | (* *) 6 | (* The OCaml programmers *) 7 | (* *) 8 | (* Copyright 2021 Institut National de Recherche en Informatique et *) 9 | (* en Automatique. *) 10 | (* *) 11 | (* All rights reserved. This file is distributed under the terms of *) 12 | (* the GNU Lesser General Public License version 2.1, with the *) 13 | (* special exception on linking described in the file LICENSE. *) 14 | (* *) 15 | (**************************************************************************) 16 | ``` 17 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Adding UTF-X decoding support to the OCaml Stdlib 2 | ================================================= 3 | 4 | **This has been merged upstream and will be part of OCaml 4.14** 5 | 6 | [Upstream issue][upstream]. [Upstream PR][upstream-pr]. 7 | 8 | Plays around improving [this old proposal][old-proposal]. Especially 9 | on [Alain's comment][alain-comment] to avoid exceptions. 10 | 11 | For now we simply focus on providing an API in `Bytes`, the rest can 12 | follow easily and/or later. 13 | 14 | We introduce the abstract type `Uchar.utf_decode` for decode results 15 | which is just an `int` value that has the decoded `Uchar.t` (or 16 | `Uchar.rep` in case of decode error) in the lower bits and information 17 | about the decode in bits above `Uchar.max`. The scheme is compatible 18 | with both 32-bit and 64-bit platforms, see the implementation for more 19 | details. No exception have to be introduced and the API does not 20 | allocate on decodes. 21 | 22 | The [resulting API](utf_x.mli) seems nice to 23 | use. [`examples.ml`](examples.ml) show how [`Uutf`-like 24 | folders][uutf-fold] and `Seq` based iterators can easily be derived in 25 | a few lines. 26 | 27 | It is also fool proof w.r.t. to best-effort decodes, `Uchar.rep` will 28 | be appropriately added on decoding errors as it should be for security 29 | reasons. See the `Seq` iterator example and more information on this 30 | below. 31 | 32 | A few implementation are provided which differ only in their UTF-8 33 | treatment: 34 | 35 | 1. [`utf_x_adhoc.ml`](utf_x_adhoc.ml), uses a 256 bytes string to index 36 | the cases mentionned in [TUS Table 3.7][tus] and then performs some 37 | ugly ad-hoc decoding. This ugly code is mine :-) 38 | 2. [`utf_x_if.ml`](utf_x_if.ml), is similar to adhoc, but uses `if` 39 | branches instead of a table, the result is a bit harder to comprehend. 40 | 3. [`utf_x_pat.ml`](utf_x_pat.ml), is similar to adhoc but lets my 41 | favourite compiler work out the dispatch table itself by using pattern 42 | matching on byte ranges. Yay ! 43 | 4. [`utf_x_dfa.ml`](utf_x_dfa.ml), uses the UTF-8 decoding MIT-licensed 44 | DFA devised by Bjoern Hoehrmann [here][dfa]. This uses a 364 bytes 45 | string for the DFA and makes the decoding code much more elegant (if 46 | I hadn't to rewrite it to an imperative loop to make it more 47 | efficient). 48 | 49 | Rough benchmarks made with [`perf8.ml`](perf8.ml) and measured via 50 | `time` on my machine seems to indicate we have, ordered by most 51 | performant: `pat`, `if`, `adhoc` and `dfa`. 52 | 53 | The first three being quite close to each other (within 54 | 10%). Comparing the fastest to the slowest, the `pat` version takes 55 | the following percentage of the time taken by the `dfa` version. 56 | 57 | * 62% [Markus Kuhn's UTF-8 demo][kuhn-utf-8]. A mixture of 58 | things. 59 | * 62% [Hindi wikipedia articles multistream][hindi-wiki] dataset. 60 | Exercises ASCII and [Devanagari][devanagari] decodes which are 61 | three bytes long in UTF-8. 62 | * 58% [Croatian wikipedia articles multistream][hr-wiki] dataset. 63 | Exercises ASCII and [Latin Extended-A][latin-ext-A] decodes which 64 | are two bytes long in UTF-8. 65 | * 56% on this README, mainly ASCII. 66 | 67 | So I think the best candidate for upstreaming would be `pat`. 68 | 69 | ## Best-effort decodes and U+FFFD replacements 70 | 71 | The API also tries to take into account recommended strategies for 72 | U+FFFD replacement on best-effort decoding. For a discussion about 73 | these see [here][how-many-urep]. 74 | 75 | If nothing special is done, e.g. like the `Seq` iterator in 76 | [`examples.ml`](examples.ml) the resulting API should behave like the 77 | "state machine view". Supposedly this is what is mandated by the 78 | WHATWG [Encoding][whatwg-encoding] standard, if you can infer it 79 | through their crazy prose code specifications – TUS has a description 80 | of it on page 126 of the [version 14.0.0 text][tus] if you can 81 | remember all the precise jargon definitions :-) I believe that a short and 82 | concise spec of all that is: 83 | 84 | > In case of decoding error return an U+FFFE, if you error on the 85 | > first byte, consume the byte, otherwise consume the bytes preceeding 86 | > the erroring byte. 87 | 88 | The API provides enough information on decodes to implement other 89 | replacement modes like one `Uchar.rep` per bogus byte. 90 | 91 | The [`urep-test.html`](data/urep-test.html) and 92 | [`urep-spec.html`](data/urep-spec.html) files taken from the above 93 | mentioned page can be used to test the WHATWG encoding standard 94 | behaviour (note that `Uutf` does not respect this). 95 | 96 | [upstream-pr]: https://github.com/ocaml/ocaml/pull/10710 97 | [upstream]: https://github.com/ocaml/ocaml/issues/10660 98 | [old-proposal]: https://gist.github.com/dbuenzli/211e1fb4d8dfce0d22c6d6616260cdd9 99 | [alain-comment]: https://gist.github.com/dbuenzli/211e1fb4d8dfce0d22c6d6616260cdd9#gistcomment-2574875 100 | [how-many-urep]: https://hsivonen.fi/broken-utf-8/ 101 | [tus]: http://www.unicode.org/versions/Unicode14.0.0/ch03.pdf 102 | [whatwg-encoding]: https://encoding.spec.whatwg.org/ 103 | [dfa]: http://bjoern.hoehrmann.de/utf-8/decoder/dfa/#variations 104 | [hindi-wiki]: https://dumps.wikimedia.org/hiwiki/20210920/ 105 | [hr-wiki]: https://dumps.wikimedia.org/hrwiki/20210920/ 106 | [kuhn-utf-8]: https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-demo.txt 107 | [uutf-fold]: https://erratique.ch/software/uutf/doc/Uutf.String.html#1_Stringfolders 108 | [devanagari]: http://www.unicode.org/charts/PDF/U0900.pdf 109 | [latin-ext-a]: https://www.unicode.org/charts/PDF/U0100.pdf 110 | -------------------------------------------------------------------------------- /data/urep-spec.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Broken UTF-8 6 | 11 | 12 | 13 |

Broken UTF-8

14 |

Any copyright to this file is dedicated to the Public Domain. https://creativecommons.org/publicdomain/zero/1.0/

15 |

Five-byte and six-byte sequences were defined in RFC 2297 but are no longer part of the UTF-8 definition. 16 |

Non-shortest forms for lowest single-byte (U+0000)

17 |
18 |
Two-byte sequence (C0 80)
19 |
��
20 |
Three-byte sequence (E0 80 80)
21 |
���
22 |
Four-byte sequence (F0 80 80 80)
23 |
����
24 |
Five-byte sequence (F8 80 80 80 80)
25 |
�����
26 |
Six-byte sequence (FC 80 80 80 80 80)
27 |
������
28 |
29 |

Non-shortest forms for highest single-byte (U+007F)

30 |
31 |
Two-byte sequence (C1 BF)
32 |
��
33 |
Three-byte sequence (E0 81 BF)
34 |
���
35 |
Four-byte sequence (F0 80 81 BF)
36 |
����
37 |
Five-byte sequence (F8 80 80 81 BF)
38 |
�����
39 |
Six-byte sequence (FC 80 80 80 81 BF)
40 |
������
41 |
42 |

Non-shortest forms for lowest two-byte (U+0080)

43 |
44 |
Three-byte sequence (E0 82 80)
45 |
���
46 |
Four-byte sequence (F0 80 82 80)
47 |
����
48 |
Five-byte sequence (F8 80 80 82 80)
49 |
�����
50 |
Six-byte sequence (FC 80 80 80 82 80)
51 |
������
52 |
53 |

Non-shortest forms for highest two-byte (U+07FF)

54 |
55 |
Three-byte sequence (E0 9F BF)
56 |
���
57 |
Four-byte sequence (F0 80 9F BF)
58 |
����
59 |
Five-byte sequence (F8 80 80 9F BF)
60 |
�����
61 |
Six-byte sequence (FC 80 80 80 9F BF)
62 |
������
63 |
64 |

Non-shortest forms for lowest three-byte (U+0800)

65 |
66 |
Four-byte sequence (F0 80 A0 80)
67 |
����
68 |
Five-byte sequence (F8 80 80 A0 80)
69 |
�����
70 |
Six-byte sequence (FC 80 80 80 A0 80)
71 |
������
72 |
73 |

Non-shortest forms for highest three-byte (U+FFFF)

74 |
75 |
Four-byte sequence (F0 8F BF BF)
76 |
����
77 |
Five-byte sequence (F8 80 8F BF BF)
78 |
�����
79 |
Six-byte sequence (FC 80 80 8F BF BF)
80 |
������
81 |
82 |

Non-shortest forms for lowest four-byte (U+10000)

83 |
84 |
Five-byte sequence (F8 80 90 80 80)
85 |
�����
86 |
Six-byte sequence (FC 80 80 90 80 80)
87 |
������
88 |
89 |

Non-shortest forms for last Unicode (U+10FFFF)

90 |
91 |
Five-byte sequence (F8 84 8F BF BF)
92 |
�����
93 |
Six-byte sequence (FC 80 84 8F BF BF)
94 |
������
95 |
96 |

Out of range

97 |
98 |
One past Unicode (F4 90 80 80)
99 |
����
100 |
Longest five-byte sequence (FB BF BF BF BF)
101 |
�����
102 |
Longest six-byte sequence (FD BF BF BF BF BF)
103 |
������
104 |
First surrogate (ED A0 80)
105 |
���
106 |
Last surrogate (ED BF BF)
107 |
���
108 |
CESU-8 surrogate pair (ED A0 BD ED B2 A9)
109 |
������
110 |
111 |

Out of range and non-shortest

112 |
113 |
One past Unicode as five-byte sequence (F8 84 90 80 80)
114 |
�����
115 |
One past Unicode as six-byte sequence (FC 80 84 90 80 80)
116 |
������
117 |
First surrogate as four-byte sequence (F0 8D A0 80)
118 |
����
119 |
Last surrogate as four-byte sequence (F0 8D BF BF)
120 |
����
121 |
CESU-8 surrogate pair as two four-byte overlongs (F0 8D A0 BD F0 8D B2 A9)
122 |
��������
123 |
124 |

Lone trails

125 |
126 |
One (80)
127 |
128 |
Two (80 80)
129 |
��
130 |
Three (80 80 80)
131 |
���
132 |
Four (80 80 80 80)
133 |
����
134 |
Five (80 80 80 80 80)
135 |
�����
136 |
Six (80 80 80 80 80 80)
137 |
������
138 |
Seven (80 80 80 80 80 80 80)
139 |
�������
140 |
After valid two-byte (C2 B6 80)
141 |
¶�
142 |
After valid three-byte (E2 98 83 80)
143 |
☃�
144 |
After valid four-byte (F0 9F 92 A9 80)
145 |
💩�
146 |
After five-byte (FB BF BF BF BF 80)
147 |
������
148 |
After six-byte (FD BF BF BF BF BF 80)
149 |
�������
150 |
151 |

Truncated sequences

152 |
153 |
Two-byte lead (C2)
154 |
155 |
Three-byte lead (E2)
156 |
157 |
Three-byte lead and one trail (E2 98)
158 |
159 |
Four-byte lead (F0)
160 |
161 |
Four-byte lead and one trail (F0 9F)
162 |
163 |
Four-byte lead and two trails (F0 9F 92)
164 |
165 |
166 |

Leftovers

167 |
168 |
FE (FE)
169 |
170 |
FE and trail (FE 80)
171 |
��
172 |
FF (FF)
173 |
174 |
FF and trail (FF 80)
175 |
��
176 |
177 | 178 | 179 | -------------------------------------------------------------------------------- /data/urep-test.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dbuenzli/stdlib-utf/3b9a96e7530830bd5ea87c3c92dc9e9bd02d4dee/data/urep-test.html -------------------------------------------------------------------------------- /examples.ml: -------------------------------------------------------------------------------- 1 | (**************************************************************************) 2 | (* *) 3 | (* OCaml *) 4 | (* *) 5 | (* The OCaml programmers *) 6 | (* *) 7 | (* Copyright 2021 Institut National de Recherche en Informatique et *) 8 | (* en Automatique. *) 9 | (* *) 10 | (* All rights reserved. This file is distributed under the terms of *) 11 | (* the GNU Lesser General Public License version 2.1, with the *) 12 | (* special exception on linking described in the file LICENSE. *) 13 | (* *) 14 | (**************************************************************************) 15 | 16 | open Utf_x_adhoc 17 | 18 | (* Uutf style folders 19 | https://erratique.ch/software/uutf/doc/Uutf.String.html *) 20 | 21 | type utf_decode = [ `Malformed of string | `Uchar of Uchar.t ] 22 | type 'a folder = 'a -> int -> utf_decode -> 'a 23 | 24 | let fold_utf_8 : 'a folder -> 'a -> string -> 'a = fun f acc s -> 25 | let rec loop b i acc = 26 | if i >= Bytes.length b then acc else 27 | let dec = Bytes.get_utf_8_uchar b i in 28 | let n = Uchar.utf_decode_length dec in 29 | let i' = i + n in 30 | match Uchar.utf_decode_is_valid dec with 31 | | false -> loop b i' (f acc i (`Malformed (String.sub s i n))) 32 | | true -> loop b i' (f acc i (`Uchar (Uchar.utf_decode_uchar dec))) 33 | in 34 | loop (Bytes.unsafe_of_string s) 0 acc 35 | 36 | (* Seq based iterators. 37 | 38 | Replace malformations by {!Uchar.rep} using the WHATWG Encoding 39 | standard, see https://hsivonen.fi/broken-utf-8/. It's nice that no 40 | particular logic has to be added for decode errors. *) 41 | 42 | let utf_8_string_to_uchar_seq : string -> Uchar.t Seq.t = fun s -> 43 | let rec uchar b i () = 44 | if i >= Bytes.length b then Seq.Nil else 45 | let dec = Bytes.get_utf_8_uchar b i in 46 | let u = Uchar.utf_decode_uchar dec in 47 | Seq.Cons (u, uchar b (i + Uchar.utf_decode_length dec)) 48 | in 49 | uchar (Bytes.unsafe_of_string s) 0 50 | -------------------------------------------------------------------------------- /old-proposal.md: -------------------------------------------------------------------------------- 1 | # Adding UTF-X decoding support to the stdlib 2 | 3 | Since 4.06 we have UTF-X encoding support via the `Buffer` module. The 4 | following is a proposal to add UTF-X *decoding* (and `bytes` encoding) 5 | support with the following goals: 6 | 7 | 1. Provide a low-level, allocation-free codec API in `bytes`. This API 8 | provides all the information needed to implement loops for making 9 |   higher-level UTF-X codec APIs (e.g. `Uutf`'s [folding](http://erratique.ch/software/uutf/doc/Uutf.String.html#1_Stringfolders) functions) 10 | operating on `bytes` and `string` values. It's not geared towards the end-user 11 | but rather towards library programmers. 12 | 13 | 2. Provide a less performant and less precise (no way to detect 14 | decoding errors; `Uchar.rep` replacement) decoding convenience API 15 | based on the `Uchar.t Seq.t` datatype for both `Bytes` and `String`. 16 | 17 | ## High-level `Seq` based API (`Bytes` and `String`) 18 | 19 | In both `Bytes` and `String` we add the following high-level interface 20 | (implemented via the low-level interface): 21 | 22 | ```ocaml 23 | val utf_8_to_uchar_seq : ?start:int -> ?len:int -> t -> Uchar.t Seq.t 24 | val utf_16le_to_uchar_seq : ?start:int -> ?len:int -> t -> Uchar.t Seq.t 25 | val utf_16be_to_uchar_seq : ?start:int -> ?len:int -> t -> Uchar.t Seq.t 26 | ``` 27 | 28 | These are "best-effort" functions: any decoding error is substituted by 29 | a `Uchar.rep` character until a new synchronizing byte is found. 30 | 31 | The `utf_{8,16le,16be}_of_uchar_seq` functions would be a bit difficult to 32 | provide due to dependencies (if implemented via `Buffer`), so 33 | we leave them out. 34 | 35 | We could add `Buffer.add_utf_{8,16le,16,be}_uchar_seq` but that's just 36 | a `Seq.iter` away with the existing `Buffer.add_utf_{8,16le,16be}_uchar`. 37 | So we stop here w.r.t. high-level API. 38 | 39 | ## Low-level `Bytes` API 40 | 41 | We begin with the following additions to the `Uchar` module: 42 | 43 | ```ocaml 44 | val utf_8_byte_length : Uchar.t -> int 45 | val utf_16_byte_length : Uchar.t -> int 46 | ``` 47 | 48 | In the `Bytes` module we then add the following: 49 | 50 | ```ocaml 51 | exception Utf_error of int 52 | (** The control flow exception raised by 53 | [{get,set}_uchar_utf_{8,16le,16be}] functions. The integer is an 54 | index where a new decode/encode can be tried or the length of the 55 | buffer if the end was reached. {b Important.} This exception is used 56 | for control flow you must not let it uncaught, it is raised with notrace. *) 57 | 58 | val get_uchar_utf_8_length : t -> int -> int 59 | (** [get_uchar_utf_8_length b i] is the byte length an UTF-8 encoded 60 | character at index [i] in [b]. This is [0] if the byte is not a valid 61 | UTF-8 starter byte. *) 62 | 63 | val get_uchar_utf_8 : t -> int -> Uchar.t 64 | (** [get_uchar_utf_8 b i] is the UTF-8 encoded Unicode character at index 65 | [i] in [b]. The number of bytes consumed by the decode can be 66 | determined by [get_uchar_utf_8_length b i]. In case of errors 67 | @raise Utf_error [n] with [n] the next index were a new decode 68 | can be tried or [length b] if there is no such index. *) 69 | 70 | val get_uchar_utf_16le : t -> int -> Uchar.t 71 | (** [get_uchar_utf_16le b i] is the UTF-16LE encoded Unicode character at 72 | index [i] in [b]. The number of bytes consumed by the decode can 73 | be determined by using {!Uchar.utf_16_byte_length} on the 74 | result. @raise Utf_error [n] with [n] the next index were a new 75 | decode can be tried or [length b] if there is no such index. *) 76 | 77 | val get_uchar_utf_16be : t -> int -> Uchar.t 78 | (** [get_uchar_utf_16be] is like {!get_uchar_utf16_le} but decodes 79 | UTF-16BE. *) 80 | 81 | val set_uchar_utf_8 : t -> int -> Uchar.t -> int 82 | (** [set_uchar_utf_8 b i u] sets the UTF-8 encoding of [u] in [b] 83 | starting at [i] and returns the next index were an encoding can 84 | be performed or the buffer length if there is no such 85 | index. @raise Utf_error [n] if there's not enough space for the 86 | encode. *) 87 | 88 | val set_utf_16le : t -> int -> Uchar.t -> int 89 | (** [set_utf_16le] is like {!set_uchar_utf_8} but UTF-16LE encodes. *) 90 | 91 | val set_utf_16be : t -> int -> Uchar.t -> int 92 | (** [set_utf_16be] is like {!set_uchar_utf_8} but UTF-16BE encodes. *) 93 | ``` 94 | 95 | For codecing sequences of `Uchar.t` as loops: 96 | 97 | 1. The `set` functions are good. 98 | 2. The `get` functions incur a bit of overhead to compute the next index (the invoked function has the information but we can't return it without allocating). For UTF-8 either we either get the next index via `get_uchar_utf_8_length` (assuming a table-based implementation: two memory accesses) or with `Uchar.utf_8_byte_length` on the result (up to three branches). For UTF-16 we use `Uchar.utf_16_byte_length` (one branch). 99 | -------------------------------------------------------------------------------- /perf8.ml: -------------------------------------------------------------------------------- 1 | (**************************************************************************) 2 | (* *) 3 | (* OCaml *) 4 | (* *) 5 | (* The OCaml programmers *) 6 | (* *) 7 | (* Copyright 2021 Institut National de Recherche en Informatique et *) 8 | (* en Automatique. *) 9 | (* *) 10 | (* All rights reserved. This file is distributed under the terms of *) 11 | (* the GNU Lesser General Public License version 2.1, with the *) 12 | (* special exception on linking described in the file LICENSE. *) 13 | (* *) 14 | (**************************************************************************) 15 | 16 | 17 | let log fmt = Printf.printf (fmt ^^ "\n%!") 18 | 19 | let impl_to_string = function 20 | | `Adhoc -> "ADHOC" | `Dfa -> "DFA" | `If -> "IF" | `Pat -> "PAT" 21 | | `Uutf -> "UUTF" 22 | 23 | (* Data preparation. We take any file read it and duplicate 24 | its content until we get at least a 1GB string. *) 25 | 26 | let string_of_file file = 27 | let ic = open_in_bin file in 28 | let len = in_channel_length ic in 29 | let buf = Bytes.create len in 30 | really_input ic buf 0 len; close_in ic; Bytes.unsafe_to_string buf 31 | 32 | let make_it_approx_1GB s = 33 | let one_gb = 1024 * 1024 * 1024 in 34 | let len = String.length s in 35 | if len >= 1024 * 1024 * 1024 then s else 36 | let count = one_gb / len in 37 | let rec rep acc n = if n <= 0 then acc else rep (s :: acc) (n - 1) in 38 | String.concat "" (rep [] count) 39 | 40 | let get_data file = 41 | log "Preparing data with %s" file; 42 | let s = make_it_approx_1GB (string_of_file file) in 43 | let size_gb = float (String.length s) /. 1024. /. 1024. /. 1024. in 44 | s, size_gb 45 | 46 | (* Best-effort decode *) 47 | 48 | let adhoc_best_effort_decode s = 49 | let rec loop b i = 50 | if i >= Utf_x_adhoc.Bytes.length b then () else 51 | let d = Utf_x_adhoc.Bytes.get_utf_8_uchar b i in 52 | let used = Utf_x_adhoc.Uchar.utf_decode_length d in 53 | let u = Utf_x_adhoc.Uchar.utf_decode_uchar d in 54 | Sys.opaque_identity @@ (ignore u); 55 | loop b (i + used) 56 | in 57 | loop (Bytes.unsafe_of_string s) 0 58 | 59 | let dfa_best_effort_decode s = 60 | let rec loop b i = 61 | if i >= Utf_x_dfa.Bytes.length b then () else 62 | let d = Utf_x_dfa.Bytes.get_utf_8_uchar b i in 63 | let used = Utf_x_dfa.Uchar.utf_decode_length d in 64 | let u = Utf_x_dfa.Uchar.utf_decode_uchar d in 65 | Sys.opaque_identity @@ (ignore u); 66 | loop b (i + used) 67 | in 68 | loop (Bytes.unsafe_of_string s) 0 69 | 70 | let if_best_effort_decode s = 71 | let rec loop b i = 72 | if i >= Utf_x_if.Bytes.length b then () else 73 | let d = Utf_x_if.Bytes.get_utf_8_uchar b i in 74 | let used = Utf_x_if.Uchar.utf_decode_length d in 75 | let u = Utf_x_if.Uchar.utf_decode_uchar d in 76 | Sys.opaque_identity @@ (ignore u); 77 | loop b (i + used) 78 | in 79 | loop (Bytes.unsafe_of_string s) 0 80 | 81 | let pat_best_effort_decode s = 82 | let rec loop b i = 83 | if i >= Utf_x_pat.Bytes.length b then () else 84 | let d = Utf_x_pat.Bytes.get_utf_8_uchar b i in 85 | let used = Utf_x_pat.Uchar.utf_decode_length d in 86 | let u = Utf_x_pat.Uchar.utf_decode_uchar d in 87 | Sys.opaque_identity @@ (ignore u); 88 | loop b (i + used) 89 | in 90 | loop (Bytes.unsafe_of_string s) 0 91 | 92 | let uutf_best_effort_decode s = 93 | let f () _ = function 94 | | `Uchar u -> Sys.opaque_identity @@ ignore u 95 | | `Malformed u -> Sys.opaque_identity @@ ignore Uutf.u_rep 96 | in 97 | Uutf.String.fold_utf_8 f () s 98 | 99 | let best_effort_decode impl file = 100 | let s, size_gb = get_data file in 101 | Gc.full_major (); 102 | log "Decoding %.2fGB of data with %s" size_gb (impl_to_string impl); 103 | match impl with 104 | | `Adhoc -> adhoc_best_effort_decode s 105 | | `Dfa -> dfa_best_effort_decode s 106 | | `If -> if_best_effort_decode s 107 | | `Pat -> pat_best_effort_decode s 108 | | `Uutf -> uutf_best_effort_decode s 109 | 110 | (* Recode *) 111 | 112 | let adhoc_recode s = 113 | let rec loop b i b' j = 114 | if i >= Utf_x_adhoc.Bytes.length b then (Bytes.unsafe_to_string b') else 115 | let d = Utf_x_adhoc.Bytes.get_utf_8_uchar b i in 116 | match Utf_x_adhoc.Uchar.utf_decode_is_valid d with 117 | | false -> raise Exit 118 | | true -> 119 | let used = Utf_x_adhoc.Uchar.utf_decode_length d in 120 | let u = Utf_x_adhoc.Uchar.utf_decode_uchar d in 121 | match Utf_x_adhoc.Bytes.set_utf_8_uchar b' j u with 122 | | 0 -> raise Exit 123 | | used' -> loop b (i + used) b' (j + used') 124 | in 125 | try 126 | let b' = Bytes.create (String.length s) in 127 | Some (loop (Bytes.unsafe_of_string s) 0 b' 0) 128 | with Exit -> None 129 | 130 | let dfa_recode s = 131 | let rec loop b i b' j = 132 | if i >= Utf_x_dfa.Bytes.length b then (Bytes.unsafe_to_string b') else 133 | let d = Utf_x_dfa.Bytes.get_utf_8_uchar b i in 134 | match Utf_x_dfa.Uchar.utf_decode_is_valid d with 135 | | false -> raise Exit 136 | | true -> 137 | let used = Utf_x_dfa.Uchar.utf_decode_length d in 138 | let u = Utf_x_dfa.Uchar.utf_decode_uchar d in 139 | match Utf_x_dfa.Bytes.set_utf_8_uchar b' j u with 140 | | 0 -> raise Exit 141 | | used' -> loop b (i + used) b' (j + used') 142 | in 143 | try 144 | let b' = Bytes.create (String.length s) in 145 | Some (loop (Bytes.unsafe_of_string s) 0 b' 0) 146 | with Exit -> None 147 | 148 | let if_recode s = 149 | let rec loop b i b' j = 150 | if i >= Utf_x_if.Bytes.length b then (Bytes.unsafe_to_string b') else 151 | let d = Utf_x_if.Bytes.get_utf_8_uchar b i in 152 | match Utf_x_if.Uchar.utf_decode_is_valid d with 153 | | false -> raise Exit 154 | | true -> 155 | let used = Utf_x_if.Uchar.utf_decode_length d in 156 | let u = Utf_x_if.Uchar.utf_decode_uchar d in 157 | match Utf_x_if.Bytes.set_utf_8_uchar b' j u with 158 | | 0 -> raise Exit 159 | | used' -> loop b (i + used) b' (j + used') 160 | in 161 | try 162 | let b' = Bytes.create (String.length s) in 163 | Some (loop (Bytes.unsafe_of_string s) 0 b' 0) 164 | with Exit -> None 165 | 166 | let pat_recode s = 167 | let rec loop b i b' j = 168 | if i >= Utf_x_pat.Bytes.length b then (Bytes.unsafe_to_string b') else 169 | let d = Utf_x_pat.Bytes.get_utf_8_uchar b i in 170 | match Utf_x_pat.Uchar.utf_decode_is_valid d with 171 | | false -> raise Exit 172 | | true -> 173 | let used = Utf_x_pat.Uchar.utf_decode_length d in 174 | let u = Utf_x_pat.Uchar.utf_decode_uchar d in 175 | match Utf_x_pat.Bytes.set_utf_8_uchar b' j u with 176 | | 0 -> raise Exit 177 | | used' -> loop b (i + used) b' (j + used') 178 | in 179 | try 180 | let b' = Bytes.create (String.length s) in 181 | Some (loop (Bytes.unsafe_of_string s) 0 b' 0) 182 | with Exit -> None 183 | 184 | let uutf_recode s = 185 | let f b _ = function 186 | | `Uchar u -> Buffer.add_utf_8_uchar b u; b 187 | | `Malformed u -> raise Exit 188 | in 189 | try 190 | let b = Buffer.create (String.length s) in 191 | let b = Uutf.String.fold_utf_8 f b s in 192 | Some (Buffer.contents b) 193 | with Exit -> None 194 | 195 | let recode impl file = 196 | let s, size_gb = get_data file in 197 | Gc.full_major (); 198 | log "Recoding %.2fGB of data with %s decode" size_gb (impl_to_string impl); 199 | let s' = match impl with 200 | | `Adhoc -> adhoc_recode s 201 | | `Dfa -> dfa_recode s 202 | | `If -> if_recode s 203 | | `Pat -> pat_recode s 204 | | `Uutf -> uutf_recode s 205 | in 206 | match s' with 207 | | None -> log "Recode failure, non valid input data ?" 208 | | Some s' when String.equal s s' -> log "Recode success!" 209 | | Some _ -> assert false 210 | 211 | (* Validate *) 212 | 213 | let adhoc_validate = Utf_x_adhoc.Bytes.is_valid_utf_8 214 | let dfa_validate = Utf_x_dfa.Bytes.is_valid_utf_8 215 | let if_validate = Utf_x_if.Bytes.is_valid_utf_8 216 | let pat_validate = Utf_x_pat.Bytes.is_valid_utf_8 217 | let uutf_validate s = 218 | let f () _ = function 219 | | `Uchar u -> Sys.opaque_identity @@ ignore u 220 | | `Malformed u -> raise Exit 221 | in 222 | try Uutf.String.fold_utf_8 f () s; true with 223 | | Exit -> false 224 | 225 | let validate impl file = 226 | let s, size_gb = get_data file in 227 | log "Validating %.2fGB of data with %s" size_gb (impl_to_string impl); 228 | let valid = match impl with 229 | | `Adhoc -> adhoc_validate (Bytes.unsafe_of_string s) 230 | | `Dfa -> dfa_validate (Bytes.unsafe_of_string s) 231 | | `If -> if_validate (Bytes.unsafe_of_string s) 232 | | `Pat -> pat_validate (Bytes.unsafe_of_string s) 233 | | `Uutf -> uutf_validate s 234 | in 235 | log "Valid: %b" valid 236 | 237 | (* Command line interface *) 238 | 239 | let do_cmd cmd impl file = match cmd with 240 | | `Decode -> best_effort_decode impl file 241 | | `Recode -> recode impl file 242 | | `Validate -> validate impl file 243 | 244 | let main () = 245 | let usage = "Usage: perf8 [--adhoc | --dfa | --if | --pat | --uutf] FILE" in 246 | let impl = ref `Adhoc in 247 | let cmd = ref `Decode in 248 | let args = 249 | [ "--adhoc", Arg.Unit (fun () -> impl := `Adhoc), 250 | "Test the adhoc implementation (default)."; 251 | "--dfa", Arg.Unit (fun () -> impl := `Dfa), 252 | "Test the DFA implementation"; 253 | "--if", Arg.Unit (fun () -> impl := `If), 254 | "Test the if branches implementation."; 255 | "--pat", Arg.Unit (fun () -> impl := `Pat), 256 | "Test the pattern implementation."; 257 | "--uutf", Arg.Unit (fun () -> impl := `Uutf), 258 | "Test the Uutf implementation."; 259 | "--decode", Arg.Unit (fun () -> cmd := `Decode), 260 | "Decode the UTF-8 data."; 261 | "--validate", Arg.Unit (fun () -> cmd := `Validate), 262 | "Validate the UTF-8 data."; 263 | "--recode", Arg.Unit (fun () -> cmd := `Recode), 264 | "Decode and recode the UTF-8 data."] 265 | in 266 | let file = ref None in 267 | let set_file s = file := Some s in 268 | Arg.parse args set_file usage; 269 | match !file with 270 | | None -> Printf.eprintf "perf8: missing FILE argument\n%s\n" usage; exit 1 271 | | Some file -> do_cmd !cmd !impl file 272 | 273 | let () = if !Sys.interactive then () else main () 274 | -------------------------------------------------------------------------------- /test.ml: -------------------------------------------------------------------------------- 1 | (**************************************************************************) 2 | (* *) 3 | (* OCaml *) 4 | (* *) 5 | (* The OCaml programmers *) 6 | (* *) 7 | (* Copyright 2021 Institut National de Recherche en Informatique et *) 8 | (* en Automatique. *) 9 | (* *) 10 | (* All rights reserved. This file is distributed under the terms of *) 11 | (* the GNU Lesser General Public License version 2.1, with the *) 12 | (* special exception on linking described in the file LICENSE. *) 13 | (* *) 14 | (**************************************************************************) 15 | 16 | (* To add to the compiler test suite. *) 17 | 18 | open Utf_x_pat 19 | 20 | let fold_uchars f acc = 21 | let rec loop f acc u = 22 | let acc = f acc u in 23 | if Uchar.equal u Uchar.max then acc else loop f acc (Uchar.succ u) 24 | in 25 | loop f acc Uchar.min 26 | 27 | (* This tests that we encode and decode each character according 28 | to its specification. *) 29 | 30 | let utf_8_spec = 31 | (* UTF-8 byte sequences, cf. table 3.7 Unicode 14. *) 32 | [(0x0000,0x007F), [|(0x00,0x7F)|]; 33 | (0x0080,0x07FF), [|(0xC2,0xDF); (0x80,0xBF)|]; 34 | (0x0800,0x0FFF), [|(0xE0,0xE0); (0xA0,0xBF); (0x80,0xBF)|]; 35 | (0x1000,0xCFFF), [|(0xE1,0xEC); (0x80,0xBF); (0x80,0xBF)|]; 36 | (0xD000,0xD7FF), [|(0xED,0xED); (0x80,0x9F); (0x80,0xBF)|]; 37 | (0xE000,0xFFFF), [|(0xEE,0xEF); (0x80,0xBF); (0x80,0xBF)|]; 38 | (0x10000,0x3FFFF), [|(0xF0,0xF0); (0x90,0xBF); (0x80,0xBF); (0x80,0xBF)|]; 39 | (0x40000,0xFFFFF), [|(0xF1,0xF3); (0x80,0xBF); (0x80,0xBF); (0x80,0xBF)|]; 40 | (0x100000,0x10FFFF), [|(0xF4,0xF4); (0x80,0x8F); (0x80,0xBF); (0x80,0xBF)|]] 41 | 42 | let utf_16be_spec = 43 | (* UTF-16BE byte sequences, derived from table 3.5 Unicode 14. *) 44 | [(0x0000,0xD7FF), [|(0x00,0xD7); (0x00,0xFF)|]; 45 | (0xE000,0xFFFF), [|(0xE0,0xFF); (0x00,0xFF)|]; 46 | (0x10000,0x10FFFF), [|(0xD8,0xDB); (0x00,0xFF); (0xDC,0xDF); (0x00,0xFF)|]] 47 | 48 | let uchar_map_of_spec spec = 49 | (* array mapping Uchar.t as ints to byte sequences according to [spec]. *) 50 | let map = Array.make ((Uchar.to_int Uchar.max) + 1) Bytes.empty in 51 | let add_range ((umin, umax), bytes) = 52 | let len = Array.length bytes in 53 | let bmin i = if i < len then fst bytes.(i) else max_int in 54 | let bmax i = if i < len then snd bytes.(i) else min_int in 55 | let uchar = ref umin in 56 | let buf = Bytes.create len in 57 | let add len' = match len = len' with 58 | | false -> () 59 | | true -> map.(!uchar) <- Bytes.copy buf; incr uchar 60 | in 61 | for b0 = bmin 0 to bmax 0 do Bytes.set_uint8 buf 0 b0; 62 | for b1 = bmin 1 to bmax 1 do Bytes.set_uint8 buf 1 b1; 63 | for b2 = bmin 2 to bmax 2 do Bytes.set_uint8 buf 2 b2; 64 | for b3 = bmin 3 to bmax 3 do Bytes.set_uint8 buf 3 b3; add 4 65 | done; add 3; 66 | done; add 2; 67 | done; add 1; 68 | done; assert (!uchar - 1 = umax) 69 | in 70 | List.iter add_range spec; 71 | map 72 | 73 | let uchar_map_get u map = map.(Uchar.to_int u) 74 | let utf_8 = uchar_map_of_spec utf_8_spec 75 | let utf_16be = uchar_map_of_spec utf_16be_spec 76 | let utf_16le = 77 | let swap u b = 78 | let len = Bytes.length b in 79 | if len = 0 then () else 80 | for i = 0 to Bytes.length b / 2 - 1 do 81 | let j = i * 2 in 82 | Bytes.set_uint16_le b j (Bytes.get_uint16_be b j); 83 | done; 84 | in 85 | let map = Array.map Bytes.copy utf_16be in 86 | Array.iteri swap map; map 87 | 88 | let test_utf utf utf_len get_utf set_utf utf_is_valid = 89 | (* Test codec and validation of each Uchar.t against the spec. *) 90 | let f () u = 91 | let utf_len = utf_len u in 92 | let buf = Bytes.create utf_len in 93 | assert (set_utf buf 0 u = utf_len); 94 | assert (Bytes.equal buf (uchar_map_get u utf)); 95 | assert (Bytes.equal buf (uchar_map_get u utf)); 96 | let dec = get_utf buf 0 in 97 | assert (Uchar.utf_decode_is_valid dec); 98 | assert (Uchar.utf_decode_length dec = utf_len); 99 | assert (Uchar.equal (Uchar.utf_decode_uchar dec) u); 100 | assert (utf_is_valid buf); 101 | () 102 | in 103 | fold_uchars f () 104 | 105 | let () = 106 | test_utf utf_8 Uchar.utf_8_byte_length 107 | Bytes.get_utf_8_uchar Bytes.set_utf_8_uchar Bytes.is_valid_utf_8 108 | 109 | let () = 110 | test_utf utf_16be Uchar.utf_16_byte_length 111 | Bytes.get_utf_16be_uchar Bytes.set_utf_16be_uchar Bytes.is_valid_utf_16be 112 | 113 | let () = 114 | test_utf utf_16le Uchar.utf_16_byte_length 115 | Bytes.get_utf_16le_uchar Bytes.set_utf_16le_uchar Bytes.is_valid_utf_16le 116 | 117 | let () = 118 | (* Test out of bounds *) 119 | let raises f = assert (try f (); false with Invalid_argument _ -> true) in 120 | (raises @@ fun () -> Bytes.get_utf_8_uchar Bytes.empty 0); 121 | (raises @@ fun () -> Bytes.set_utf_8_uchar Bytes.empty 0 Uchar.min); 122 | (raises @@ fun () -> Bytes.get_utf_16le_uchar Bytes.empty 0); 123 | (raises @@ fun () -> Bytes.set_utf_16le_uchar Bytes.empty 0 Uchar.min); 124 | (raises @@ fun () -> Bytes.get_utf_16be_uchar Bytes.empty 0); 125 | (raises @@ fun () -> Bytes.set_utf_16be_uchar Bytes.empty 0 Uchar.min); 126 | () 127 | 128 | let () = 129 | (* Test lack of space encodes *) 130 | let b = Bytes.make 1 '\xab' in 131 | assert (Bytes.set_utf_8_uchar b 0 Uchar.max = 0 && Bytes.get b 0 = '\xab'); 132 | assert (Bytes.set_utf_16be_uchar b 0 Uchar.max = 0 && Bytes.get b 0 = '\xab'); 133 | assert (Bytes.set_utf_16le_uchar b 0 Uchar.max = 0 && Bytes.get b 0 = '\xab'); 134 | () 135 | 136 | let () = 137 | (* Test used bytes and replacement according to WHATWG recommendation. 138 | This is just a recommandation. 139 | These examples are from TUS p. 126-127 Unicode 14 *) 140 | let b = Bytes.of_string "\xC0\xAF\xE0\x80\xBF\xF0\x81\x82\x41" in 141 | let ok i = i = Bytes.length b - 1 in 142 | for i = 0 to Bytes.length b - 1 do 143 | let dec = Bytes.get_utf_8_uchar b i in 144 | if not (ok i) then begin 145 | assert (Uchar.utf_decode_is_valid dec = false); 146 | assert (Uchar.utf_decode_length dec = 1); 147 | assert (Uchar.equal (Uchar.utf_decode_uchar dec) Uchar.rep) 148 | end else begin 149 | assert (Uchar.utf_decode_is_valid dec = true); 150 | assert (Uchar.utf_decode_length dec = 1); 151 | assert (Uchar.equal (Uchar.utf_decode_uchar dec) (Uchar.of_int 0x0041)) 152 | end 153 | done; 154 | let b = Bytes.of_string "\xED\xA0\x80\xED\xBF\xBF\xED\xAF\x41" in 155 | let ok i = i = Bytes.length b - 1 in 156 | for i = 0 to Bytes.length b - 1 do 157 | let dec = Bytes.get_utf_8_uchar b i in 158 | if not (ok i) then begin 159 | assert (Uchar.utf_decode_is_valid dec = false); 160 | assert (Uchar.utf_decode_length dec = 1); 161 | assert (Uchar.equal (Uchar.utf_decode_uchar dec) Uchar.rep) 162 | end else begin 163 | assert (Uchar.utf_decode_is_valid dec = true); 164 | assert (Uchar.utf_decode_length dec = 1); 165 | assert (Uchar.equal (Uchar.utf_decode_uchar dec) (Uchar.of_int 0x0041)) 166 | end 167 | done; 168 | let b = Bytes.of_string "\xF4\x91\x92\x93\xFF\x41\x80\xBF\x42" in 169 | let ok i = i = 5 || i = 8 in 170 | for i = 0 to Bytes.length b - 1 do 171 | let dec = Bytes.get_utf_8_uchar b i in 172 | if not (ok i) then begin 173 | assert (Uchar.utf_decode_is_valid dec = false); 174 | assert (Uchar.utf_decode_length dec = 1); 175 | assert (Uchar.equal (Uchar.utf_decode_uchar dec) Uchar.rep) 176 | end else begin 177 | assert (Uchar.utf_decode_is_valid dec = true); 178 | assert (Uchar.utf_decode_length dec = 1); 179 | assert (Uchar.equal (Uchar.utf_decode_uchar dec) 180 | (Uchar.of_char (Bytes.get b i))) 181 | end 182 | done; 183 | let b = Bytes.of_string "\xE1\x80\xE2\xF0\x91\x92\xF1\xBF\x41" in 184 | let d0 = Bytes.get_utf_8_uchar b 0 in 185 | assert (Uchar.utf_decode_is_valid d0 = false); 186 | assert (Uchar.utf_decode_length d0 = 2); 187 | assert (Uchar.equal (Uchar.utf_decode_uchar d0) Uchar.rep); 188 | let d2 = Bytes.get_utf_8_uchar b 2 in 189 | assert (Uchar.utf_decode_is_valid d2 = false); 190 | assert (Uchar.utf_decode_length d2 = 1); 191 | assert (Uchar.equal (Uchar.utf_decode_uchar d2) Uchar.rep); 192 | let d3 = Bytes.get_utf_8_uchar b 3 in 193 | assert (Uchar.utf_decode_is_valid d3 = false); 194 | assert (Uchar.utf_decode_length d3 = 3); 195 | assert (Uchar.equal (Uchar.utf_decode_uchar d3) Uchar.rep); 196 | let d6 = Bytes.get_utf_8_uchar b 6 in 197 | assert (Uchar.utf_decode_is_valid d6 = false); 198 | assert (Uchar.utf_decode_length d6 = 2); 199 | assert (Uchar.equal (Uchar.utf_decode_uchar d6) Uchar.rep); 200 | let d8 = Bytes.get_utf_8_uchar b 8 in 201 | assert (Uchar.utf_decode_length d8 = 1); 202 | assert (Uchar.equal (Uchar.utf_decode_uchar d8) (Uchar.of_int 0x0041)); 203 | () 204 | 205 | let () = Printf.printf "All tests passed!\n" 206 | -------------------------------------------------------------------------------- /test_long.ml: -------------------------------------------------------------------------------- 1 | (**************************************************************************) 2 | (* *) 3 | (* OCaml *) 4 | (* *) 5 | (* The OCaml programmers *) 6 | (* *) 7 | (* Copyright 2021 Institut National de Recherche en Informatique et *) 8 | (* en Automatique. *) 9 | (* *) 10 | (* All rights reserved. This file is distributed under the terms of *) 11 | (* the GNU Lesser General Public License version 2.1, with the *) 12 | (* special exception on linking described in the file LICENSE. *) 13 | (* *) 14 | (**************************************************************************) 15 | 16 | open Utf_x_pat 17 | 18 | (* Assumes the good encoding and decodes have been checked by test.ml 19 | Exhaustively tests all 1-4 bytes invalid sequences for decodes. This 20 | ensures we do not decode invalid sequence to uchars. *) 21 | 22 | let fold_uchars f acc = 23 | let rec loop f acc u = 24 | let acc = f acc u in 25 | if Uchar.equal u Uchar.max then acc else loop f acc (Uchar.succ u) 26 | in 27 | loop f acc Uchar.min 28 | 29 | module Sset = Set.Make (String) 30 | 31 | let utf_8_encs, utf_16be_encs, utf_16le_encs = 32 | Printf.printf "Building encoding sequence sets\n%!"; 33 | let add (set8, set16be, set16le) u = 34 | let s = Bytes.unsafe_to_string in 35 | let e8 = Bytes.create (Uchar.utf_8_byte_length u) in 36 | let e16be = Bytes.create (Uchar.utf_16_byte_length u) in 37 | let e16le = Bytes.create (Uchar.utf_16_byte_length u) in 38 | ignore (Bytes.set_utf_8_uchar e8 0 u); 39 | ignore (Bytes.set_utf_16be_uchar e16be 0 u); 40 | ignore (Bytes.set_utf_16le_uchar e16le 0 u); 41 | Sset.add (s e8) set8, Sset.add (s e16be) set16be, Sset.add (s e16le) set16le 42 | in 43 | fold_uchars add (Sset.empty, Sset.empty, Sset.empty) 44 | 45 | let test_seqs utf utf_encs get_utf_char is_valid_utf = 46 | let test seq = 47 | let dec = get_utf_char seq 0 in 48 | let valid = Uchar.utf_decode_is_valid dec in 49 | let valid_seq = is_valid_utf seq in 50 | let is_enc = Sset.mem (Bytes.unsafe_to_string seq) utf_encs in 51 | if not ((valid && is_enc) || (not valid && not is_enc)) || 52 | not ((valid_seq && is_enc) || (not valid_seq && not is_enc)) 53 | then begin 54 | for i = 0 to Bytes.length seq - 1 do 55 | Printf.printf "%02X " (Bytes.get_uint8 seq i); 56 | done; 57 | Printf.printf "valid: %b is_encoding: %b decode: U+%04X is_valid:%b\n" 58 | valid is_enc (Uchar.to_int (Uchar.utf_decode_uchar dec)) valid_seq; 59 | assert false 60 | end; 61 | valid 62 | in 63 | let[@inline] set buf i b = Bytes.unsafe_set buf i (Char.unsafe_chr b) in 64 | let s1 = Bytes.create 1 and s2 = Bytes.create 2 65 | and s3 = Bytes.create 3 and s4 = Bytes.create 4 in 66 | Printf.printf "Testing %s invalid decodes…\n%!" utf; 67 | for b0 = 0x00 to 0xFF do 68 | set s1 0 b0; 69 | if test s1 then ((* this prefix decoded, stop here *)) else begin 70 | set s2 0 b0; 71 | for b1 = 0x00 to 0xFF do 72 | set s2 1 b1; 73 | if test s2 then ((* this prefix decoded, stop here *)) else begin 74 | set s3 0 b0; 75 | set s3 1 b1; 76 | for b2 = 0x00 to 0xFF do 77 | set s3 2 b2; 78 | if test s3 then ((* this prefix decoded, stop here *)) else begin 79 | set s4 0 b0; 80 | set s4 1 b1; 81 | set s4 2 b2; 82 | for b3 = 0x00 to 0xFF do set s4 3 b3; ignore (test s4) done; 83 | end 84 | done; 85 | end 86 | done; 87 | end 88 | done 89 | 90 | let () = 91 | test_seqs "UTF-8" utf_8_encs Bytes.get_utf_8_uchar Bytes.is_valid_utf_8; 92 | test_seqs "UTF-16BE" 93 | utf_16be_encs Bytes.get_utf_16be_uchar Bytes.is_valid_utf_16be; 94 | test_seqs "UTF-16LE" 95 | utf_16le_encs Bytes.get_utf_16le_uchar Bytes.is_valid_utf_16le; 96 | () 97 | 98 | let () = Printf.printf "All tests passed!\n" 99 | -------------------------------------------------------------------------------- /trip8.ml: -------------------------------------------------------------------------------- 1 | (**************************************************************************) 2 | (* *) 3 | (* OCaml *) 4 | (* *) 5 | (* The OCaml programmers *) 6 | (* *) 7 | (* Copyright 2021 Institut National de Recherche en Informatique et *) 8 | (* en Automatique. *) 9 | (* *) 10 | (* All rights reserved. This file is distributed under the terms of *) 11 | (* the GNU Lesser General Public License version 2.1, with the *) 12 | (* special exception on linking described in the file LICENSE. *) 13 | (* *) 14 | (**************************************************************************) 15 | 16 | let log fmt = Printf.printf (fmt ^^ "\n%!") 17 | 18 | let string_of_file file = 19 | let ic = open_in_bin file in 20 | let len = in_channel_length ic in 21 | let buf = Bytes.create len in 22 | really_input ic buf 0 len; close_in ic; Bytes.unsafe_to_string buf 23 | 24 | let adhoc_trip s b = 25 | let rec loop b i buf = 26 | if i >= Utf_x_adhoc.Bytes.length b then () else 27 | let d = Utf_x_adhoc.Bytes.get_utf_8_uchar b i in 28 | let used = Utf_x_adhoc.Uchar.utf_decode_length d in 29 | Buffer.add_utf_8_uchar buf (Utf_x_adhoc.Uchar.utf_decode_uchar d); 30 | loop b (i + used) buf 31 | in 32 | loop (Bytes.unsafe_of_string s) 0 b 33 | 34 | let dfa_trip s b = 35 | let rec loop b i buf = 36 | if i >= Utf_x_dfa.Bytes.length b then () else 37 | let d = Utf_x_dfa.Bytes.get_utf_8_uchar b i in 38 | let used = Utf_x_dfa.Uchar.utf_decode_length d in 39 | Buffer.add_utf_8_uchar buf (Utf_x_dfa.Uchar.utf_decode_uchar d); 40 | loop b (i + used) buf 41 | in 42 | loop (Bytes.unsafe_of_string s) 0 b 43 | 44 | let if_trip s b = 45 | let rec loop b i buf = 46 | if i >= Utf_x_if.Bytes.length b then () else 47 | let d = Utf_x_if.Bytes.get_utf_8_uchar b i in 48 | let used = Utf_x_if.Uchar.utf_decode_length d in 49 | Buffer.add_utf_8_uchar buf (Utf_x_if.Uchar.utf_decode_uchar d); 50 | loop b (i + used) buf 51 | in 52 | loop (Bytes.unsafe_of_string s) 0 b 53 | 54 | let pat_trip s b = 55 | let rec loop b i buf = 56 | if i >= Utf_x_pat.Bytes.length b then () else 57 | let d = Utf_x_pat.Bytes.get_utf_8_uchar b i in 58 | let used = Utf_x_pat.Uchar.utf_decode_length d in 59 | Buffer.add_utf_8_uchar buf (Utf_x_pat.Uchar.utf_decode_uchar d); 60 | loop b (i + used) buf 61 | in 62 | loop (Bytes.unsafe_of_string s) 0 b 63 | 64 | let uutf_trip s b = 65 | let f b _ = function 66 | | `Uchar u -> Buffer.add_utf_8_uchar b u; b 67 | | `Malformed u -> Buffer.add_utf_8_uchar b Uchar.rep; b 68 | in 69 | ignore (Uutf.String.fold_utf_8 f b s) 70 | 71 | let trip impl file = 72 | let s = string_of_file file in 73 | let b = Buffer.create (String.length s) in 74 | let () = match impl with 75 | | `Adhoc -> adhoc_trip s b 76 | | `Dfa -> dfa_trip s b 77 | | `If -> if_trip s b 78 | | `Pat -> pat_trip s b 79 | | `Uutf -> uutf_trip s b 80 | in 81 | print_string (Buffer.contents b) 82 | 83 | (* Command line interface *) 84 | 85 | let main () = 86 | let usage = "Usage: trip8 [--adhoc | --dfa | --if | --pat | --uutf] FILE" in 87 | let impl = ref `Adhoc in 88 | let args = 89 | [ "--adhoc", Arg.Unit (fun () -> impl := `Adhoc), 90 | "Use the adhoc implementation (default)."; 91 | "--dfa", Arg.Unit (fun () -> impl := `Dfa), 92 | "Use the DFA implementation"; 93 | "--if", Arg.Unit (fun () -> impl := `If), 94 | "Use the if branches implementation."; 95 | "--pat", Arg.Unit (fun () -> impl := `Pat), 96 | "Use the pattern implementation."; 97 | "--uutf", Arg.Unit (fun () -> impl := `Uutf), 98 | "Use the Uutf implementation." ] 99 | in 100 | let file = ref None in 101 | let set_file s = file := Some s in 102 | Arg.parse args set_file usage; 103 | match !file with 104 | | None -> Printf.eprintf "trip8: missing FILE argument\n%s\n" usage; exit 1 105 | | Some file -> trip !impl file 106 | 107 | let () = if !Sys.interactive then () else main () 108 | -------------------------------------------------------------------------------- /utf_16.ml: -------------------------------------------------------------------------------- 1 | (**************************************************************************) 2 | (* *) 3 | (* OCaml *) 4 | (* *) 5 | (* The OCaml programmers *) 6 | (* *) 7 | (* Copyright 2021 Institut National de Recherche en Informatique et *) 8 | (* en Automatique. *) 9 | (* *) 10 | (* All rights reserved. This file is distributed under the terms of *) 11 | (* the GNU Lesser General Public License version 2.1, with the *) 12 | (* special exception on linking described in the file LICENSE. *) 13 | (* *) 14 | (**************************************************************************) 15 | 16 | (* This won't be needed *) 17 | 18 | let dec_invalid = Utf_uchar.utf_decode_invalid 19 | let[@inline] dec_ret n u = Utf_uchar.utf_decode n (Uchar.unsafe_of_int u) 20 | external length : bytes -> int = "%bytes_length" 21 | type t = Bytes.t 22 | external swap16 : int -> int = "%bswap16" (* That one will be in Bytes *) 23 | 24 | (* That will be needed *) 25 | 26 | external unsafe_get_uint16_ne : t -> int -> int = "%caml_bytes_get16u" 27 | external unsafe_set_uint16_ne : t -> int -> int -> unit = "%caml_bytes_set16u" 28 | 29 | let unsafe_get_uint16_le b i = 30 | if Sys.big_endian 31 | then swap16 (unsafe_get_uint16_ne b i) 32 | else unsafe_get_uint16_ne b i 33 | 34 | let unsafe_get_uint16_be b i = 35 | if Sys.big_endian 36 | then unsafe_get_uint16_ne b i 37 | else swap16 (unsafe_get_uint16_ne b i) 38 | 39 | let unsafe_set_uint16_le b i x = 40 | if Sys.big_endian 41 | then unsafe_set_uint16_ne b i (swap16 x) 42 | else unsafe_set_uint16_ne b i x 43 | 44 | let unsafe_set_uint16_be b i x = 45 | if Sys.big_endian 46 | then unsafe_set_uint16_ne b i x 47 | else unsafe_set_uint16_ne b i (swap16 x) 48 | 49 | (* UTF-16BE *) 50 | 51 | let get_utf_16be_uchar b i = 52 | let get = unsafe_get_uint16_be in 53 | let max = length b - 1 in 54 | if i < 0 || i > max then invalid_arg "index out of bounds" else 55 | if i = max then dec_invalid 1 else 56 | match get b i with 57 | | u when u < 0xD800 || u > 0xDFFF -> dec_ret 2 u 58 | | u when u > 0xDBFF -> dec_invalid 2 59 | | hi -> (* combine [hi] with a low surrogate *) 60 | let last = i + 3 in 61 | if last > max then dec_invalid (max - i + 1) else 62 | match get b (i + 2) with 63 | | u when u < 0xDC00 || u > 0xDFFF -> dec_invalid 2 (* retry here *) 64 | | lo -> 65 | let u = (((hi land 0x3FF) lsl 10) lor (lo land 0x3FF)) + 0x10000 in 66 | dec_ret 4 u 67 | 68 | let set_utf_16be_uchar b i u = 69 | let set = unsafe_set_uint16_be in 70 | let max = length b - 1 in 71 | if i < 0 || i > max then invalid_arg "index out of bounds" else 72 | match Uchar.to_int u with 73 | | u when u < 0 -> assert false 74 | | u when u <= 0xFFFF -> 75 | let last = i + 1 in 76 | if last > max then 0 else (set b i u; 2) 77 | | u -> 78 | let last = i + 3 in 79 | if last > max then 0 else 80 | let u' = u - 0x10000 in 81 | let hi = (0xD800 lor (u' lsr 10)) in 82 | let lo = (0xDC00 lor (u' land 0x3FF)) in 83 | set b i hi; set b (i + 2) lo; 4 84 | 85 | let is_valid_utf_16be b = 86 | let rec loop max b i = 87 | let get = unsafe_get_uint16_be in 88 | if i > max then true else 89 | if i = max then false else 90 | match get b i with 91 | | u when u < 0xD800 || u > 0xDFFF -> loop max b (i + 2) 92 | | u when u > 0xDBFF -> false 93 | | hi -> 94 | let last = i + 3 in 95 | if last > max then false else 96 | match get b (i + 2) with 97 | | u when u < 0xDC00 || u > 0xDFFF -> false 98 | | lo -> loop max b (i + 4) 99 | in 100 | loop (length b - 1) b 0 101 | 102 | (* UTF-16LE *) 103 | 104 | let get_utf_16le_uchar b i = 105 | let get = unsafe_get_uint16_le in 106 | let max = length b - 1 in 107 | if i < 0 || i > max then invalid_arg "index out of bounds" else 108 | if i = max then dec_invalid 1 else 109 | match get b i with 110 | | u when u < 0xD800 || u > 0xDFFF -> dec_ret 2 u 111 | | u when u > 0xDBFF -> dec_invalid 2 112 | | hi -> (* combine [hi] with a low surrogate *) 113 | let last = i + 3 in 114 | if last > max then dec_invalid (max - i + 1) else 115 | match get b (i + 2) with 116 | | u when u < 0xDC00 || u > 0xDFFF -> dec_invalid 2 (* retry here *) 117 | | lo -> 118 | let u = (((hi land 0x3FF) lsl 10) lor (lo land 0x3FF)) + 0x10000 in 119 | dec_ret 4 u 120 | 121 | let set_utf_16le_uchar b i u = 122 | let set = unsafe_set_uint16_le in 123 | let max = length b - 1 in 124 | if i < 0 || i > max then invalid_arg "index out of bounds" else 125 | match Uchar.to_int u with 126 | | u when u < 0 -> assert false 127 | | u when u <= 0xFFFF -> 128 | let last = i + 1 in 129 | if last > max then 0 else (set b i u; 2) 130 | | u -> 131 | let last = i + 3 in 132 | if last > max then 0 else 133 | let u' = u - 0x10000 in 134 | let hi = (0xD800 lor (u' lsr 10)) in 135 | let lo = (0xDC00 lor (u' land 0x3FF)) in 136 | set b i hi; set b (i + 2) lo; 4 137 | 138 | let is_valid_utf_16le b = 139 | let rec loop max b i = 140 | let get = unsafe_get_uint16_le in 141 | if i > max then true else 142 | if i = max then false else 143 | match get b i with 144 | | u when u < 0xD800 || u > 0xDFFF -> loop max b (i + 2) 145 | | u when u > 0xDBFF -> false 146 | | hi -> 147 | let last = i + 3 in 148 | if last > max then false else 149 | match get b (i + 2) with 150 | | u when u < 0xDC00 || u > 0xDFFF -> false 151 | | lo -> loop max b (i + 4) 152 | in 153 | loop (length b - 1) b 0 154 | -------------------------------------------------------------------------------- /utf_uchar.ml: -------------------------------------------------------------------------------- 1 | (**************************************************************************) 2 | (* *) 3 | (* OCaml *) 4 | (* *) 5 | (* The OCaml programmers *) 6 | (* *) 7 | (* Copyright 2021 Institut National de Recherche en Informatique et *) 8 | (* en Automatique. *) 9 | (* *) 10 | (* All rights reserved. This file is distributed under the terms of *) 11 | (* the GNU Lesser General Public License version 2.1, with the *) 12 | (* special exception on linking described in the file LICENSE. *) 13 | (* *) 14 | (**************************************************************************) 15 | 16 | include Uchar 17 | 18 | (* UTF codecs tools *) 19 | 20 | type utf_decode = int 21 | (* This is an int [0xDUUUUUU] decomposed as follows: 22 | - [D] is four bits for decode information, the highest bit is set if the 23 | decode is valid. The three lower bits indicate the number of elements 24 | from the source that were consumed by the decode. 25 | - [UUUUUU] is the decoded Unicode character or the Unicode replacement 26 | character U+FFFD if for invalid decodes. *) 27 | 28 | let valid_bit = 27 29 | let decode_bits = 24 30 | 31 | let[@inline] utf_decode_is_valid d = (d lsr valid_bit) = 1 32 | let[@inline] utf_decode_length d = (d lsr decode_bits) land 0b111 33 | let[@inline] utf_decode_uchar d = Uchar.unsafe_of_int (d land 0xFFFFFF) 34 | let[@inline] utf_decode n u = ((8 lor n) lsl decode_bits) lor (Uchar.to_int u) 35 | let[@inline] utf_decode_invalid n = (n lsl decode_bits) lor 0xFFFD (* rep *) 36 | 37 | let utf_8_byte_length u = match to_int u with 38 | | u when u < 0 -> assert false 39 | | u when u <= 0x007F -> 1 40 | | u when u <= 0x07FF -> 2 41 | | u when u <= 0xFFFF -> 3 42 | | u when u <= 0x10FFFF -> 4 43 | | _ -> assert false 44 | 45 | let utf_16_byte_length u = match to_int u with 46 | | u when u < 0 -> assert false 47 | | u when u <= 0xFFFF -> 2 48 | | u when u <= 0x10FFFF -> 4 49 | | _ -> assert false 50 | -------------------------------------------------------------------------------- /utf_uchar.mli: -------------------------------------------------------------------------------- 1 | (**************************************************************************) 2 | (* *) 3 | (* OCaml *) 4 | (* *) 5 | (* The OCaml programmers *) 6 | (* *) 7 | (* Copyright 2021 Institut National de Recherche en Informatique et *) 8 | (* en Automatique. *) 9 | (* *) 10 | (* All rights reserved. This file is distributed under the terms of *) 11 | (* the GNU Lesser General Public License version 2.1, with the *) 12 | (* special exception on linking described in the file LICENSE. *) 13 | (* *) 14 | (**************************************************************************) 15 | 16 | include Utf_x.Uchar 17 | -------------------------------------------------------------------------------- /utf_x.mli: -------------------------------------------------------------------------------- 1 | (**************************************************************************) 2 | (* *) 3 | (* OCaml *) 4 | (* *) 5 | (* The OCaml programmers *) 6 | (* *) 7 | (* Copyright 2021 Institut National de Recherche en Informatique et *) 8 | (* en Automatique. *) 9 | (* *) 10 | (* All rights reserved. This file is distributed under the terms of *) 11 | (* the GNU Lesser General Public License version 2.1, with the *) 12 | (* special exception on linking described in the file LICENSE. *) 13 | (* *) 14 | (**************************************************************************) 15 | 16 | (* Uchar additions *) 17 | 18 | module type Uchar = sig 19 | include module type of Uchar 20 | 21 | (** {1:utf_codec UTF codecs tools} 22 | 23 | @since 4.14 *) 24 | 25 | type utf_decode [@@immediate] 26 | (** The type for UTF decode results. Values of this type represent 27 | the result of a Unicode Transformation Format decoding attempt. *) 28 | 29 | val utf_decode_is_valid : utf_decode -> bool 30 | (** [utf_decode_is_valid d] is [true] if and only if [d] holds a valid 31 | decode. *) 32 | 33 | val utf_decode_uchar : utf_decode -> Uchar.t 34 | (** [utf_x_decode_uchar d] is the Unicode character decoded by [d] if 35 | [utf_x_decode_is_valid d] is [true] and {!Uchar.rep} otherwise. *) 36 | 37 | val utf_decode_length : utf_decode -> int 38 | (** [utf_decode_length d] is the number of elements from the source 39 | that were consumed by the decode [d]. This is always strictly 40 | positive and smaller or equal to [4]. The kind of source elements 41 | depends on the actual decoder; for the decoders of the standard 42 | library this function always returns a length in bytes. *) 43 | 44 | val utf_decode : int -> Uchar.t -> utf_decode 45 | (** [utf_decode n u] is a valid UTF decode for [u] that consumed [n] 46 | elements from the source for decoding. [n] must be positive and 47 | smaller or equal to [4] (this is not checked by the module). *) 48 | 49 | val utf_decode_invalid : int -> utf_decode 50 | (** [utf_decode_invalid n] is an invalid UTF decode that consumed [n] 51 | elements from the source to error. [n] must be positive and 52 | smaller or equal to [4] (this is not checked by the module). The 53 | resulting decode has {!rep} as the decoded Unicode character. *) 54 | 55 | val utf_8_byte_length : t -> int 56 | (** [utf_8_byte_length u] is the number of bytes needed to encode 57 | [u] in UTF-8. *) 58 | 59 | val utf_16_byte_length : t -> int 60 | (** [utf_16_byte_length u] is the number of bytes needed to encode 61 | [u] in UTF-16. *) 62 | end 63 | 64 | (* Bytes additions *) 65 | 66 | module type S = sig 67 | module Uchar : Uchar 68 | module Bytes : sig 69 | include module type of Bytes 70 | 71 | (** {1:utf_x_codecs UTF-X codecs and validations} 72 | 73 | @since 4.14 *) 74 | 75 | (** {2:utf_8 UTF-8} *) 76 | 77 | val get_utf_8_uchar : t -> int -> Uchar.utf_decode 78 | (** [get_utf_8_uchar b i] decodes an UTF-8 character at index [i] in 79 | [b]. *) 80 | 81 | val set_utf_8_uchar : t -> int -> Uchar.t -> int 82 | (** [set_utf_8_uchar b i u] UTF-8 encodes [u] at index [i] in [b] 83 | and returns the number of bytes [n] that were written starting 84 | at [i]. If [n] is [0] there was not enough space to encode [u] 85 | at [i] and [b] was left untouched. Otherwise a new character can 86 | be encoded at [i + n]. *) 87 | 88 | val is_valid_utf_8 : t -> bool 89 | (** [is_valid_utf_8 b] is [true] if and only if [b] contains valid 90 | UTF-8 data. *) 91 | 92 | (** {2:utf_16be UTF-16BE} *) 93 | 94 | val get_utf_16be_uchar : t -> int -> Uchar.utf_decode 95 | (** [get_utf_16be_uchar b i] decodes an UTF-16BE character at index 96 | [i] in [b]. *) 97 | 98 | val set_utf_16be_uchar : t -> int -> Uchar.t -> int 99 | (** [set_utf_16be_uchar b i u] UTF-16BE encodes [u] at index [i] in [b] 100 | and returns the number of bytes [n] that were written starting 101 | at [i]. If [n] is [0] there was not enough space to encode [u] 102 | at [i] and [b] was left untouched. Otherwise a new character can 103 | be encoded at [i + n]. *) 104 | 105 | val is_valid_utf_16be : t -> bool 106 | (** [is_valid_utf_16be b] is [true] if and only if [b] contains valid 107 | UTF-16BE data. *) 108 | 109 | (** {2:utf_16le UTF-16LE} *) 110 | 111 | val get_utf_16le_uchar : t -> int -> Uchar.utf_decode 112 | (** [get_utf_16le_uchar b i] decodes an UTF-16LE character at index 113 | [i] in [b]. *) 114 | 115 | val set_utf_16le_uchar : t -> int -> Uchar.t -> int 116 | (** [set_utf_16le_uchar b i u] UTF-16LE encodes [u] at index [i] in [b] 117 | and returns the number of bytes [n] that were written starting 118 | at [i]. If [n] is [0] there was not enough space to encode [u] 119 | at [i] and [b] was left untouched. Otherwise a new character can 120 | be encoded at [i + n]. *) 121 | 122 | val is_valid_utf_16le : t -> bool 123 | (** [is_valid_utf_16le b] is [true] if and only if [b] contains valid 124 | UTF-16LE data. *) 125 | end 126 | end 127 | -------------------------------------------------------------------------------- /utf_x_adhoc.ml: -------------------------------------------------------------------------------- 1 | (**************************************************************************) 2 | (* *) 3 | (* OCaml *) 4 | (* *) 5 | (* The OCaml programmers *) 6 | (* *) 7 | (* Copyright 2021 Institut National de Recherche en Informatique et *) 8 | (* en Automatique. *) 9 | (* *) 10 | (* All rights reserved. This file is distributed under the terms of *) 11 | (* the GNU Lesser General Public License version 2.1, with the *) 12 | (* special exception on linking described in the file LICENSE. *) 13 | (* *) 14 | (**************************************************************************) 15 | 16 | module Uchar = Utf_uchar 17 | module Bytes = struct 18 | include Bytes 19 | 20 | external swap16 : int -> int = "%bswap16" (* That one will be in Bytes *) 21 | 22 | (* Unsafe internal additions *) 23 | 24 | external unsafe_set_uint8 : t -> int -> int -> unit = "%bytes_unsafe_set" 25 | external unsafe_get_uint8 : t -> int -> int = "%bytes_unsafe_get" 26 | 27 | external unsafe_get_uint16_ne : t -> int -> int = "%caml_bytes_get16u" 28 | external unsafe_set_uint16_ne : t -> int -> int -> unit = "%caml_bytes_set16u" 29 | 30 | let unsafe_get_uint16_le b i = 31 | if Sys.big_endian 32 | then swap16 (unsafe_get_uint16_ne b i) 33 | else unsafe_get_uint16_ne b i 34 | 35 | let unsafe_get_uint16_be b i = 36 | if Sys.big_endian 37 | then unsafe_get_uint16_ne b i 38 | else swap16 (unsafe_get_uint16_ne b i) 39 | 40 | let unsafe_set_uint16_le b i x = 41 | if Sys.big_endian 42 | then unsafe_set_uint16_ne b i (swap16 x) 43 | else unsafe_set_uint16_ne b i x 44 | 45 | let unsafe_set_uint16_be b i x = 46 | if Sys.big_endian 47 | then unsafe_set_uint16_ne b i x 48 | else unsafe_set_uint16_ne b i (swap16 x) 49 | 50 | (* UTF-X codecs and validations *) 51 | 52 | let dec_invalid = Uchar.utf_decode_invalid 53 | let[@inline] dec_ret n u = Uchar.utf_decode n (Uchar.unsafe_of_int u) 54 | 55 | (* In case of decoding error, if we error on the first byte, we 56 | consume the byte, otherwise we consume the [n] bytes preceeding 57 | erroring byte. 58 | 59 | This means that if a client uses decodes without caring about 60 | validity it naturally replace bogus data with Uchar.rep according 61 | to the WHATWG Encoding standard. Other schemes are possible by 62 | consulting the number of used bytes on invalid decodes. For more 63 | details see https://hsivonen.fi/broken-utf-8/ 64 | 65 | For this reason in [get_utf_8_uchar] we gradually check the next 66 | byte is available rather than doing it immediately after the 67 | first byte. Contrast with [is_valid_utf_8]. *) 68 | 69 | (* UTF-8 *) 70 | 71 | let utf_8_decode_case = 72 | (* Case according to first byte. This corresponds to the the lines 73 | of The Unicode Standard Table 3.7 with lines E1..EC and EE..EF 74 | sharing the same case and unmentioned bytes mapped to 8. 75 | 76 | let dump_table () = 77 | let print_case c = Printf.printf "\\%03d" c in 78 | let cut i = if i mod 16 = 0 then (print_char '\\'; print_newline ()) in 79 | for i = 0x00 to 0x7F do (cut i; print_case 0) done; 80 | for i = 0x80 to 0xC1 do (cut i; print_case 8) done; 81 | for i = 0xC2 to 0xDF do (cut i; print_case 1) done; 82 | for i = 0xE0 to 0xE0 do (cut i; print_case 2) done; 83 | for i = 0xE1 to 0xEC do (cut i; print_case 3) done; 84 | for i = 0xED to 0xED do (cut i; print_case 4) done; 85 | for i = 0xEE to 0xEF do (cut i; print_case 3) done; 86 | for i = 0xF0 to 0xF0 do (cut i; print_case 5) done; 87 | for i = 0xF1 to 0xF3 do (cut i; print_case 6) done; 88 | for i = 0xF4 to 0xF4 do (cut i; print_case 7) done; 89 | for i = 0xF5 to 0xFF do (cut i; print_case 8) done; 90 | () *) 91 | "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\ 92 | \000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\ 93 | \000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\ 94 | \000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\ 95 | \000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\ 96 | \000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\ 97 | \000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\ 98 | \000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\ 99 | \008\008\008\008\008\008\008\008\008\008\008\008\008\008\008\008\ 100 | \008\008\008\008\008\008\008\008\008\008\008\008\008\008\008\008\ 101 | \008\008\008\008\008\008\008\008\008\008\008\008\008\008\008\008\ 102 | \008\008\008\008\008\008\008\008\008\008\008\008\008\008\008\008\ 103 | \008\008\001\001\001\001\001\001\001\001\001\001\001\001\001\001\ 104 | \001\001\001\001\001\001\001\001\001\001\001\001\001\001\001\001\ 105 | \002\003\003\003\003\003\003\003\003\003\003\003\003\004\003\003\ 106 | \005\006\006\006\007\008\008\008\008\008\008\008\008\008\008\008" 107 | 108 | let utf_8_decode_case' = unsafe_of_string utf_8_decode_case 109 | 110 | let[@inline] not_in_x80_to_xBF b = b lsr 6 <> 0b10 111 | let[@inline] not_in_xA0_to_xBF b = b lsr 5 <> 0b101 112 | let[@inline] not_in_x80_to_x9F b = b lsr 5 <> 0b100 113 | let[@inline] not_in_x90_to_xBF b = b < 0x90 || 0xBF < b 114 | let[@inline] not_in_x80_to_x8F b = b lsr 4 <> 0x8 115 | 116 | let[@inline] utf_8_uchar_2 b0 b1 = 117 | ((b0 land 0x1F) lsl 6) lor 118 | ((b1 land 0x3F)) 119 | 120 | let[@inline] utf_8_uchar_3 b0 b1 b2 = 121 | ((b0 land 0x0F) lsl 12) lor 122 | ((b1 land 0x3F) lsl 6) lor 123 | ((b2 land 0x3F)) 124 | 125 | let[@inline] utf_8_uchar_4 b0 b1 b2 b3 = 126 | ((b0 land 0x07) lsl 18) lor 127 | ((b1 land 0x3F) lsl 12) lor 128 | ((b2 land 0x3F) lsl 6) lor 129 | ((b3 land 0x3F)) 130 | 131 | let get_utf_8_uchar b i = 132 | let b0 = get_uint8 b i in (* raises if [i] is not a valid index. *) 133 | let get = unsafe_get_uint8 in 134 | let max = length b - 1 in 135 | match get utf_8_decode_case' b0 with 136 | | 0 (* 00..7F *) -> dec_ret 1 b0 137 | | 1 (* C2..DF *) -> 138 | let i = i + 1 in if i > max then dec_invalid 1 else 139 | let b1 = get b i in if not_in_x80_to_xBF b1 then dec_invalid 1 else 140 | dec_ret 2 (utf_8_uchar_2 b0 b1) 141 | | 2 (* E0 *) -> 142 | let i = i + 1 in if i > max then dec_invalid 1 else 143 | let b1 = get b i in if not_in_xA0_to_xBF b1 then dec_invalid 1 else 144 | let i = i + 1 in if i > max then dec_invalid 2 else 145 | let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else 146 | dec_ret 3 (utf_8_uchar_3 b0 b1 b2) 147 | | 3 -> (* E1..EC or EE..EF *) 148 | let i = i + 1 in if i > max then dec_invalid 1 else 149 | let b1 = get b i in if not_in_x80_to_xBF b1 then dec_invalid 1 else 150 | let i = i + 1 in if i > max then dec_invalid 2 else 151 | let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else 152 | dec_ret 3 (utf_8_uchar_3 b0 b1 b2) 153 | | 4 -> (* ED *) 154 | let i = i + 1 in if i > max then dec_invalid 1 else 155 | let b1 = get b i in if not_in_x80_to_x9F b1 then dec_invalid 1 else 156 | let i = i + 1 in if i > max then dec_invalid 2 else 157 | let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else 158 | dec_ret 3 (utf_8_uchar_3 b0 b1 b2) 159 | | 5 -> (* F0 *) 160 | let i = i + 1 in if i > max then dec_invalid 1 else 161 | let b1 = get b i in if not_in_x90_to_xBF b1 then dec_invalid 1 else 162 | let i = i + 1 in if i > max then dec_invalid 2 else 163 | let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else 164 | let i = i + 1 in if i > max then dec_invalid 3 else 165 | let b3 = get b i in if not_in_x80_to_xBF b3 then dec_invalid 3 else 166 | dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3) 167 | | 6 -> (* F1..F3 *) 168 | let i = i + 1 in if i > max then dec_invalid 1 else 169 | let b1 = get b i in if not_in_x80_to_xBF b1 then dec_invalid 1 else 170 | let i = i + 1 in if i > max then dec_invalid 2 else 171 | let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else 172 | let i = i + 1 in if i > max then dec_invalid 3 else 173 | let b3 = get b i in if not_in_x80_to_xBF b3 then dec_invalid 3 else 174 | dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3) 175 | | 7 -> (* F4 *) 176 | let i = i + 1 in if i > max then dec_invalid 1 else 177 | let b1 = get b i in if not_in_x80_to_x8F b1 then dec_invalid 1 else 178 | let i = i + 1 in if i > max then dec_invalid 2 else 179 | let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else 180 | let i = i + 1 in if i > max then dec_invalid 3 else 181 | let b3 = get b i in if not_in_x80_to_xBF b3 then dec_invalid 3 else 182 | dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3) 183 | | 8 -> dec_invalid 1 184 | | _ -> assert false 185 | 186 | let set_utf_8_uchar b i u = 187 | let set = unsafe_set_uint8 in 188 | let max = length b - 1 in 189 | match Uchar.to_int u with 190 | | u when u < 0 -> assert false 191 | | u when u <= 0x007F -> 192 | set_uint8 b i u; 193 | 1 194 | | u when u <= 0x07FF -> 195 | let last = i + 1 in 196 | if last > max then 0 else 197 | (set_uint8 b i (0xC0 lor (u lsr 6)); 198 | set b last (0x80 lor (u land 0x3F)); 199 | 2) 200 | | u when u <= 0xFFFF -> 201 | let last = i + 2 in 202 | if last > max then 0 else 203 | (set_uint8 b i (0xE0 lor (u lsr 12)); 204 | set b (i + 1) (0x80 lor ((u lsr 6) land 0x3F)); 205 | set b last (0x80 lor (u land 0x3F)); 206 | 3) 207 | | u -> 208 | let last = i + 3 in 209 | if last > max then 0 else 210 | (set_uint8 b i (0xF0 lor (u lsr 18)); 211 | set b (i + 1) (0x80 lor ((u lsr 12) land 0x3F)); 212 | set b (i + 2) (0x80 lor ((u lsr 6) land 0x3F)); 213 | set b last (0x80 lor (u land 0x3F)); 214 | 4) 215 | 216 | let is_valid_utf_8 b = 217 | let rec loop max b i = 218 | if i > max then true else 219 | let get = unsafe_get_uint8 in 220 | match get utf_8_decode_case' (get b i) with 221 | | 0 (* 00..7F *) -> loop max b (i + 1) 222 | | 1 (* C2..DF *) -> 223 | let last = i + 1 in 224 | if last > max 225 | || not_in_x80_to_xBF (get b last) 226 | then false 227 | else loop max b (last + 1) 228 | | 2 (* E0 *) -> 229 | let last = i + 2 in 230 | if last > max 231 | || not_in_xA0_to_xBF (get b (i + 1)) 232 | || not_in_x80_to_xBF (get b last) 233 | then false 234 | else loop max b (last + 1) 235 | | 3 -> (* E1..EC or EE..EF *) 236 | let last = i + 2 in 237 | if last > max 238 | || not_in_x80_to_xBF (get b (i + 1)) 239 | || not_in_x80_to_xBF (get b last) 240 | then false 241 | else loop max b (last + 1) 242 | | 4 -> (* ED *) 243 | let last = i + 2 in 244 | if last > max 245 | || not_in_x80_to_x9F (get b (i + 1)) 246 | || not_in_x80_to_xBF (get b last) 247 | then false 248 | else loop max b (last + 1) 249 | | 5 -> (* F0 *) 250 | let last = i + 3 in 251 | if last > max 252 | || not_in_x90_to_xBF (get b (i + 1)) 253 | || not_in_x80_to_xBF (get b (i + 2)) 254 | || not_in_x80_to_xBF (get b last) 255 | then false 256 | else loop max b (last + 1) 257 | | 6 -> (* F1..F3 *) 258 | let last = i + 3 in 259 | if last > max 260 | || not_in_x80_to_xBF (get b (i + 1)) 261 | || not_in_x80_to_xBF (get b (i + 2)) 262 | || not_in_x80_to_xBF (get b last) 263 | then false 264 | else loop max b (last + 1) 265 | | 7 -> (* F4 *) 266 | let last = i + 3 in 267 | if last > max 268 | || not_in_x80_to_x8F (get b (i + 1)) 269 | || not_in_x80_to_xBF (get b (i + 2)) 270 | || not_in_x80_to_xBF (get b last) 271 | then false 272 | else loop max b (last + 1) 273 | | 8 -> false 274 | | _ -> assert false 275 | in 276 | loop (length b - 1) b 0 277 | 278 | (* UTF-16BE *) 279 | 280 | let get_utf_16be_uchar b i = 281 | let get = unsafe_get_uint16_be in 282 | let max = length b - 1 in 283 | if i < 0 || i > max then invalid_arg "index out of bounds" else 284 | if i = max then dec_invalid 1 else 285 | match get b i with 286 | | u when u < 0xD800 || u > 0xDFFF -> dec_ret 2 u 287 | | u when u > 0xDBFF -> dec_invalid 2 288 | | hi -> (* combine [hi] with a low surrogate *) 289 | let last = i + 3 in 290 | if last > max then dec_invalid (max - i + 1) else 291 | match get b (i + 2) with 292 | | u when u < 0xDC00 || u > 0xDFFF -> dec_invalid 2 (* retry here *) 293 | | lo -> 294 | let u = (((hi land 0x3FF) lsl 10) lor (lo land 0x3FF)) + 0x10000 in 295 | dec_ret 4 u 296 | 297 | let set_utf_16be_uchar b i u = 298 | let set = unsafe_set_uint16_be in 299 | let max = length b - 1 in 300 | if i < 0 || i > max then invalid_arg "index out of bounds" else 301 | match Uchar.to_int u with 302 | | u when u < 0 -> assert false 303 | | u when u <= 0xFFFF -> 304 | let last = i + 1 in 305 | if last > max then 0 else (set b i u; 2) 306 | | u -> 307 | let last = i + 3 in 308 | if last > max then 0 else 309 | let u' = u - 0x10000 in 310 | let hi = (0xD800 lor (u' lsr 10)) in 311 | let lo = (0xDC00 lor (u' land 0x3FF)) in 312 | set b i hi; set b (i + 2) lo; 4 313 | 314 | let is_valid_utf_16be b = 315 | let rec loop max b i = 316 | let get = unsafe_get_uint16_be in 317 | if i > max then true else 318 | if i = max then false else 319 | match get b i with 320 | | u when u < 0xD800 || u > 0xDFFF -> loop max b (i + 2) 321 | | u when u > 0xDBFF -> false 322 | | hi -> 323 | let last = i + 3 in 324 | if last > max then false else 325 | match get b (i + 2) with 326 | | u when u < 0xDC00 || u > 0xDFFF -> false 327 | | lo -> loop max b (i + 4) 328 | in 329 | loop (length b - 1) b 0 330 | 331 | (* UTF-16LE *) 332 | 333 | let get_utf_16le_uchar b i = 334 | let get = unsafe_get_uint16_le in 335 | let max = length b - 1 in 336 | if i < 0 || i > max then invalid_arg "index out of bounds" else 337 | if i = max then dec_invalid 1 else 338 | match get b i with 339 | | u when u < 0xD800 || u > 0xDFFF -> dec_ret 2 u 340 | | u when u > 0xDBFF -> dec_invalid 2 341 | | hi -> (* combine [hi] with a low surrogate *) 342 | let last = i + 3 in 343 | if last > max then dec_invalid (max - i + 1) else 344 | match get b (i + 2) with 345 | | u when u < 0xDC00 || u > 0xDFFF -> dec_invalid 2 (* retry here *) 346 | | lo -> 347 | let u = (((hi land 0x3FF) lsl 10) lor (lo land 0x3FF)) + 0x10000 in 348 | dec_ret 4 u 349 | 350 | let set_utf_16le_uchar b i u = 351 | let set = unsafe_set_uint16_le in 352 | let max = length b - 1 in 353 | if i < 0 || i > max then invalid_arg "index out of bounds" else 354 | match Uchar.to_int u with 355 | | u when u < 0 -> assert false 356 | | u when u <= 0xFFFF -> 357 | let last = i + 1 in 358 | if last > max then -2 else (set b i u; 2) 359 | | u -> 360 | let last = i + 3 in 361 | if last > max then -4 else 362 | let u' = u - 0x10000 in 363 | let hi = (0xD800 lor (u' lsr 10)) in 364 | let lo = (0xDC00 lor (u' land 0x3FF)) in 365 | set b i hi; set b (i + 2) lo; 4 366 | 367 | let is_valid_utf_16le b = 368 | let rec loop max b i = 369 | let get = unsafe_get_uint16_le in 370 | if i > max then true else 371 | if i = max then false else 372 | match get b i with 373 | | u when u < 0xD800 || u > 0xDFFF -> loop max b (i + 2) 374 | | u when u > 0xDBFF -> false 375 | | hi -> 376 | let last = i + 3 in 377 | if last > max then false else 378 | match get b (i + 2) with 379 | | u when u < 0xDC00 || u > 0xDFFF -> false 380 | | lo -> loop max b (i + 4) 381 | in 382 | loop (length b - 1) b 0 383 | end 384 | -------------------------------------------------------------------------------- /utf_x_adhoc.mli: -------------------------------------------------------------------------------- 1 | (**************************************************************************) 2 | (* *) 3 | (* OCaml *) 4 | (* *) 5 | (* The OCaml programmers *) 6 | (* *) 7 | (* Copyright 2021 Institut National de Recherche en Informatique et *) 8 | (* en Automatique. *) 9 | (* *) 10 | (* All rights reserved. This file is distributed under the terms of *) 11 | (* the GNU Lesser General Public License version 2.1, with the *) 12 | (* special exception on linking described in the file LICENSE. *) 13 | (* *) 14 | (**************************************************************************) 15 | 16 | include Utf_x.S 17 | -------------------------------------------------------------------------------- /utf_x_dfa.ml: -------------------------------------------------------------------------------- 1 | (**************************************************************************) 2 | (* *) 3 | (* OCaml *) 4 | (* *) 5 | (* The OCaml programmers *) 6 | (* *) 7 | (* Copyright 2021 Institut National de Recherche en Informatique et *) 8 | (* en Automatique. *) 9 | (* *) 10 | (* All rights reserved. This file is distributed under the terms of *) 11 | (* the GNU Lesser General Public License version 2.1, with the *) 12 | (* special exception on linking described in the file LICENSE. *) 13 | (* *) 14 | (**************************************************************************) 15 | 16 | module Uchar = Utf_uchar 17 | module Bytes = struct 18 | include Bytes 19 | 20 | (* Unsafe internal additions *) 21 | 22 | external unsafe_set_uint8 : t -> int -> int -> unit = "%bytes_unsafe_set" 23 | external unsafe_get_uint8 : t -> int -> int = "%bytes_unsafe_get" 24 | 25 | (* UTF-X codecs and validations *) 26 | 27 | let dec_invalid = Uchar.utf_decode_invalid 28 | let[@inline] dec_ret n u = Uchar.utf_decode n (Uchar.unsafe_of_int u) 29 | 30 | (* UTF-8 *) 31 | 32 | let utf_8_dfa = 33 | (* The first 256 entries map bytes to their class. Then the 34 | DFA follows. The DFA automaton is MIT-licensed 35 | Copyright (c) 2008-2009 Bjoern Hoehrmann. 36 | See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/#variations*) 37 | unsafe_of_string 38 | "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\ 39 | \000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\ 40 | \000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\ 41 | \000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\ 42 | \000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\ 43 | \000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\ 44 | \000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\ 45 | \000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\ 46 | \001\001\001\001\001\001\001\001\001\001\001\001\001\001\001\001\ 47 | \009\009\009\009\009\009\009\009\009\009\009\009\009\009\009\009\ 48 | \007\007\007\007\007\007\007\007\007\007\007\007\007\007\007\007\ 49 | \007\007\007\007\007\007\007\007\007\007\007\007\007\007\007\007\ 50 | \008\008\002\002\002\002\002\002\002\002\002\002\002\002\002\002\ 51 | \002\002\002\002\002\002\002\002\002\002\002\002\002\002\002\002\ 52 | \010\003\003\003\003\003\003\003\003\003\003\003\003\004\003\003\ 53 | \011\006\006\006\005\008\008\008\008\008\008\008\008\008\008\008\ 54 | \000\012\024\036\060\096\084\012\012\012\048\072\ 55 | \012\012\012\012\012\012\012\012\012\012\012\012\ 56 | \012\000\012\012\012\012\012\000\012\000\012\012\ 57 | \012\024\012\012\012\012\012\024\012\024\012\012\ 58 | \012\012\012\012\012\012\012\024\012\012\012\012\ 59 | \012\024\012\012\012\012\012\012\012\024\012\012\ 60 | \012\012\012\012\012\012\012\036\012\036\012\012\ 61 | \012\036\012\012\012\012\012\036\012\036\012\012\ 62 | \012\036\012\012\012\012\012\012\012\012\012\012" 63 | 64 | let accept = 0 65 | let reject = 12 66 | 67 | (* Rewriting the following as an imperative loop is uglier but 68 | more efficient. 69 | 70 | let get_utf_8_uchar b i = 71 | let max = length b - 1 in 72 | if i < 0 || i > max then invalid_arg "index out of bounds" else 73 | let rec loop st u j = 74 | if j > max then dec_err (j - i) else 75 | let byte = unsafe_get_uint8 b j in 76 | let class' = unsafe_get_uint8 utf_8_dfa byte in 77 | let u' = 78 | if st <> accept 79 | then (u lsl 6) lor (byte land 0x3F) 80 | else byte land (0xFF lsr class') 81 | in 82 | let st' = unsafe_get_uint8 utf_8_dfa (256 + st + class') in 83 | if st' = reject then dec_err (j - i + 1) else 84 | if st' = accept then dec_ret (j - i + 1) u' else 85 | loop st' u' (j + 1) 86 | in 87 | loop accept 0 i 88 | *) 89 | 90 | let get_utf_8_uchar b i = 91 | let max = length b - 1 in 92 | if i < 0 || i > max then invalid_arg "index out of bounds" else 93 | let stop = ref false in 94 | let st = ref accept in 95 | let j = ref i in 96 | let u = ref 0 in 97 | while (not !stop) do 98 | let byte = unsafe_get_uint8 b !j in 99 | let class' = unsafe_get_uint8 utf_8_dfa byte in 100 | u := 101 | (if !st <> accept 102 | then (!u lsl 6) lor (byte land 0x3F) 103 | else byte land (0xFF lsr class')); 104 | st := unsafe_get_uint8 utf_8_dfa (256 + !st + class'); 105 | if (!st = reject || !st = accept) 106 | then stop := true 107 | else (incr j; if !j > max then stop := true); 108 | done; 109 | if !j > max then dec_invalid (!j - i) else 110 | if !st = accept then dec_ret (!j - i + 1) !u else 111 | if !st = reject then dec_invalid (if i = !j then 1 else (!j - i)) else 112 | assert false 113 | 114 | let set_utf_8_uchar b i u = 115 | let set = unsafe_set_uint8 in 116 | let max = length b - 1 in 117 | match Uchar.to_int u with 118 | | u when u <= 0x007F -> 119 | set_uint8 b i u; 120 | 1 121 | | u when u <= 0x07FF -> 122 | let last = i + 1 in 123 | if last > max then 0 else 124 | (set_uint8 b i (0xC0 lor (u lsr 6)); 125 | set b last (0x80 lor (u land 0x3F)); 126 | 2) 127 | | u when u <= 0xFFFF -> 128 | let last = i + 2 in 129 | if last > max then 0 else 130 | (set_uint8 b i (0xE0 lor (u lsr 12)); 131 | set b (i + 1) (0x80 lor ((u lsr 6) land 0x3F)); 132 | set b last (0x80 lor (u land 0x3F)); 133 | 3) 134 | | u -> 135 | let last = i + 3 in 136 | if last > max then 0 else 137 | (set_uint8 b i (0xF0 lor (u lsr 18)); 138 | set b (i + 1) (0x80 lor ((u lsr 12) land 0x3F)); 139 | set b (i + 2) (0x80 lor ((u lsr 6) land 0x3F)); 140 | set b last (0x80 lor (u land 0x3F)); 141 | 4) 142 | 143 | let is_valid_utf_8 b = 144 | (* This could be optimized using the state machine. We didn't 145 | do it for now since it seems the adhoc implementation yields 146 | better perfs. *) 147 | let rec loop b i = 148 | if i >= length b then true else 149 | let dec = get_utf_8_uchar b i in 150 | if Uchar.utf_decode_is_valid dec 151 | then loop b (i + Uchar.utf_decode_length dec) 152 | else false 153 | in 154 | loop b 0 155 | 156 | include Utf_16 157 | end 158 | -------------------------------------------------------------------------------- /utf_x_dfa.mli: -------------------------------------------------------------------------------- 1 | (**************************************************************************) 2 | (* *) 3 | (* OCaml *) 4 | (* *) 5 | (* The OCaml programmers *) 6 | (* *) 7 | (* Copyright 2021 Institut National de Recherche en Informatique et *) 8 | (* en Automatique. *) 9 | (* *) 10 | (* All rights reserved. This file is distributed under the terms of *) 11 | (* the GNU Lesser General Public License version 2.1, with the *) 12 | (* special exception on linking described in the file LICENSE. *) 13 | (* *) 14 | (**************************************************************************) 15 | 16 | include Utf_x.S 17 | -------------------------------------------------------------------------------- /utf_x_if.ml: -------------------------------------------------------------------------------- 1 | (**************************************************************************) 2 | (* *) 3 | (* OCaml *) 4 | (* *) 5 | (* The OCaml programmers *) 6 | (* *) 7 | (* Copyright 2021 Institut National de Recherche en Informatique et *) 8 | (* en Automatique. *) 9 | (* *) 10 | (* All rights reserved. This file is distributed under the terms of *) 11 | (* the GNU Lesser General Public License version 2.1, with the *) 12 | (* special exception on linking described in the file LICENSE. *) 13 | (* *) 14 | (**************************************************************************) 15 | 16 | module Uchar = Utf_uchar 17 | module Bytes = struct 18 | include Bytes 19 | 20 | (* Unsafe internal additions *) 21 | 22 | external unsafe_set_uint8 : t -> int -> int -> unit = "%bytes_unsafe_set" 23 | external unsafe_get_uint8 : t -> int -> int = "%bytes_unsafe_get" 24 | 25 | (* UTF-X codecs and validations *) 26 | 27 | let dec_invalid = Uchar.utf_decode_invalid 28 | let[@inline] dec_ret n u = Uchar.utf_decode n (Uchar.unsafe_of_int u) 29 | 30 | (* In case of decoding error, if we error on the first byte, we 31 | consume the byte, otherwise we consume the [n] bytes preceeding 32 | erroring byte. 33 | 34 | This means that if a client uses decodes without caring about 35 | validity it naturally replace bogus data with Uchar.rep according 36 | to the WHATWG Encoding standard. Other schemes are possible by 37 | consulting the number of used bytes on invalid decodes. For more 38 | details see https://hsivonen.fi/broken-utf-8/ 39 | 40 | For this reason in [get_utf_8_uchar] we gradually check the next 41 | byte is available rather than doing it immediately after the 42 | first byte. Contrast with [is_valid_utf_8]. *) 43 | 44 | (* UTF-8 *) 45 | 46 | let[@inline] not_in_x80_to_xBF b = b lsr 6 <> 0b10 47 | let[@inline] not_in_xA0_to_xBF b = b lsr 5 <> 0b101 48 | let[@inline] not_in_x80_to_x9F b = b lsr 5 <> 0b100 49 | let[@inline] not_in_x90_to_xBF b = b < 0x90 || 0xBF < b 50 | let[@inline] not_in_x80_to_x8F b = b lsr 4 <> 0x8 51 | 52 | let[@inline] utf_8_uchar_2 b0 b1 = 53 | ((b0 land 0x1F) lsl 6) lor 54 | ((b1 land 0x3F)) 55 | 56 | let[@inline] utf_8_uchar_3 b0 b1 b2 = 57 | ((b0 land 0x0F) lsl 12) lor 58 | ((b1 land 0x3F) lsl 6) lor 59 | ((b2 land 0x3F)) 60 | 61 | let[@inline] utf_8_uchar_4 b0 b1 b2 b3 = 62 | ((b0 land 0x07) lsl 18) lor 63 | ((b1 land 0x3F) lsl 12) lor 64 | ((b2 land 0x3F) lsl 6) lor 65 | ((b3 land 0x3F)) 66 | 67 | let get_utf_8_uchar b i = 68 | let b0 = get_uint8 b i in (* raises if [i] is not a valid index. *) 69 | let get = unsafe_get_uint8 in 70 | if b0 <= 0x7F then (* 00..7F *) dec_ret 1 b0 else 71 | if b0 land 0xE0 = 0xC0 && b0 >= 0xC2 then begin 72 | (* C2..DF *) 73 | let max = length b - 1 in 74 | let i = i + 1 in if i > max then dec_invalid 1 else 75 | let b1 = get b i in if not_in_x80_to_xBF b1 then dec_invalid 1 else 76 | dec_ret 2 (utf_8_uchar_2 b0 b1) 77 | end else if b0 land 0xF0 = 0xE0 then begin 78 | let max = length b - 1 in 79 | if b0 = 0xE0 then begin 80 | (* E0 *) 81 | let i = i + 1 in if i > max then dec_invalid 1 else 82 | let b1 = get b i in if not_in_xA0_to_xBF b1 then dec_invalid 1 else 83 | let i = i + 1 in if i > max then dec_invalid 2 else 84 | let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else 85 | dec_ret 3 (utf_8_uchar_3 b0 b1 b2) 86 | end else if b0 = 0xED then begin 87 | (* ED *) 88 | let i = i + 1 in if i > max then dec_invalid 1 else 89 | let b1 = get b i in if not_in_x80_to_x9F b1 then dec_invalid 1 else 90 | let i = i + 1 in if i > max then dec_invalid 2 else 91 | let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else 92 | dec_ret 3 (utf_8_uchar_3 b0 b1 b2) 93 | end else begin 94 | (* E1..EC or EE..EF *) 95 | let i = i + 1 in if i > max then dec_invalid 1 else 96 | let b1 = get b i in if not_in_x80_to_xBF b1 then dec_invalid 1 else 97 | let i = i + 1 in if i > max then dec_invalid 2 else 98 | let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else 99 | dec_ret 3 (utf_8_uchar_3 b0 b1 b2) 100 | end 101 | end else if b0 land 0xF8 = 0xF0 && b0 <= 0xF4 then begin 102 | let max = length b - 1 in 103 | if b0 = 0xF0 then begin 104 | (* F0 *) 105 | let i = i + 1 in if i > max then dec_invalid 1 else 106 | let b1 = get b i in if not_in_x90_to_xBF b1 then dec_invalid 1 else 107 | let i = i + 1 in if i > max then dec_invalid 2 else 108 | let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else 109 | let i = i + 1 in if i > max then dec_invalid 3 else 110 | let b3 = get b i in if not_in_x80_to_xBF b3 then dec_invalid 3 else 111 | dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3) 112 | end else if b0 = 0xF4 then begin 113 | (* F4 *) 114 | let i = i + 1 in if i > max then dec_invalid 1 else 115 | let b1 = get b i in if not_in_x80_to_x8F b1 then dec_invalid 1 else 116 | let i = i + 1 in if i > max then dec_invalid 2 else 117 | let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else 118 | let i = i + 1 in if i > max then dec_invalid 3 else 119 | let b3 = get b i in if not_in_x80_to_xBF b3 then dec_invalid 3 else 120 | dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3) 121 | end else begin 122 | let i = i + 1 in if i > max then dec_invalid 1 else 123 | let b1 = get b i in if not_in_x80_to_xBF b1 then dec_invalid 1 else 124 | let i = i + 1 in if i > max then dec_invalid 2 else 125 | let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else 126 | let i = i + 1 in if i > max then dec_invalid 3 else 127 | let b3 = get b i in if not_in_x80_to_xBF b3 then dec_invalid 3 else 128 | dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3) 129 | end 130 | end else dec_invalid 1 131 | 132 | let set_utf_8_uchar b i u = 133 | let set = unsafe_set_uint8 in 134 | let max = length b - 1 in 135 | match Uchar.to_int u with 136 | | u when u < 0 -> assert false 137 | | u when u <= 0x007F -> 138 | set_uint8 b i u; 139 | 1 140 | | u when u <= 0x07FF -> 141 | let last = i + 1 in 142 | if last > max then 0 else 143 | (set_uint8 b i (0xC0 lor (u lsr 6)); 144 | set b last (0x80 lor (u land 0x3F)); 145 | 2) 146 | | u when u <= 0xFFFF -> 147 | let last = i + 2 in 148 | if last > max then 0 else 149 | (set_uint8 b i (0xE0 lor (u lsr 12)); 150 | set b (i + 1) (0x80 lor ((u lsr 6) land 0x3F)); 151 | set b last (0x80 lor (u land 0x3F)); 152 | 3) 153 | | u -> 154 | let last = i + 3 in 155 | if last > max then 0 else 156 | (set_uint8 b i (0xF0 lor (u lsr 18)); 157 | set b (i + 1) (0x80 lor ((u lsr 12) land 0x3F)); 158 | set b (i + 2) (0x80 lor ((u lsr 6) land 0x3F)); 159 | set b last (0x80 lor (u land 0x3F)); 160 | 4) 161 | 162 | let is_valid_utf_8 b = 163 | let rec loop max b i = 164 | if i > max then true else 165 | let get = unsafe_get_uint8 in 166 | let b0 = get b i in 167 | if b0 <= 0x7F then (* 00..7F *) loop max b (i + 1) else 168 | if b0 land 0xE0 = 0xC0 && b0 >= 0xC2 then begin 169 | (* C2..DF *) 170 | let last = i + 1 in 171 | if last > max 172 | || not_in_x80_to_xBF (get b last) 173 | then false 174 | else loop max b (last + 1) 175 | end else if b0 land 0xF0 = 0xE0 then begin 176 | if b0 = 0xE0 then begin 177 | (* E0 *) 178 | let last = i + 2 in 179 | if last > max 180 | || not_in_xA0_to_xBF (get b (i + 1)) 181 | || not_in_x80_to_xBF (get b last) 182 | then false 183 | else loop max b (last + 1) 184 | end else if b0 = 0xED then begin 185 | let last = i + 2 in 186 | if last > max 187 | || not_in_x80_to_x9F (get b (i + 1)) 188 | || not_in_x80_to_xBF (get b last) 189 | then false 190 | else loop max b (last + 1) 191 | end else begin 192 | (* E1..EC or EE..EF *) 193 | let last = i + 2 in 194 | if last > max 195 | || not_in_x80_to_xBF (get b (i + 1)) 196 | || not_in_x80_to_xBF (get b last) 197 | then false 198 | else loop max b (last + 1) 199 | end 200 | end else if b0 land 0xF8 = 0xF0 && b0 <= 0xF4 then begin 201 | if b0 = 0xF0 then begin 202 | (* F0 *) 203 | let last = i + 3 in 204 | if last > max 205 | || not_in_x90_to_xBF (get b (i + 1)) 206 | || not_in_x80_to_xBF (get b (i + 2)) 207 | || not_in_x80_to_xBF (get b last) 208 | then false 209 | else loop max b (last + 1) 210 | end else if b0 = 0xF4 then begin 211 | (* F4 *) 212 | let last = i + 3 in 213 | if last > max 214 | || not_in_x80_to_x8F (get b (i + 1)) 215 | || not_in_x80_to_xBF (get b (i + 2)) 216 | || not_in_x80_to_xBF (get b last) 217 | then false 218 | else loop max b (last + 1) 219 | end else begin 220 | (* F1..F3 *) 221 | let last = i + 3 in 222 | if last > max 223 | || not_in_x80_to_xBF (get b (i + 1)) 224 | || not_in_x80_to_xBF (get b (i + 2)) 225 | || not_in_x80_to_xBF (get b last) 226 | then false 227 | else loop max b (last + 1) 228 | end 229 | end else false 230 | in 231 | loop (length b - 1) b 0 232 | 233 | include Utf_16 234 | end 235 | -------------------------------------------------------------------------------- /utf_x_if.mli: -------------------------------------------------------------------------------- 1 | (**************************************************************************) 2 | (* *) 3 | (* OCaml *) 4 | (* *) 5 | (* The OCaml programmers *) 6 | (* *) 7 | (* Copyright 2021 Institut National de Recherche en Informatique et *) 8 | (* en Automatique. *) 9 | (* *) 10 | (* All rights reserved. This file is distributed under the terms of *) 11 | (* the GNU Lesser General Public License version 2.1, with the *) 12 | (* special exception on linking described in the file LICENSE. *) 13 | (* *) 14 | (**************************************************************************) 15 | 16 | include Utf_x.S 17 | -------------------------------------------------------------------------------- /utf_x_pat.ml: -------------------------------------------------------------------------------- 1 | (**************************************************************************) 2 | (* *) 3 | (* OCaml *) 4 | (* *) 5 | (* The OCaml programmers *) 6 | (* *) 7 | (* Copyright 2021 Institut National de Recherche en Informatique et *) 8 | (* en Automatique. *) 9 | (* *) 10 | (* All rights reserved. This file is distributed under the terms of *) 11 | (* the GNU Lesser General Public License version 2.1, with the *) 12 | (* special exception on linking described in the file LICENSE. *) 13 | (* *) 14 | (**************************************************************************) 15 | 16 | module Uchar = Utf_uchar 17 | module Bytes = struct 18 | include Bytes 19 | 20 | (* Unsafe internal additions *) 21 | 22 | external unsafe_set_uint8 : t -> int -> int -> unit = "%bytes_unsafe_set" 23 | external unsafe_get_uint8 : t -> int -> int = "%bytes_unsafe_get" 24 | 25 | (* UTF-X codecs and validations *) 26 | 27 | let dec_invalid = Uchar.utf_decode_invalid 28 | let[@inline] dec_ret n u = Uchar.utf_decode n (Uchar.unsafe_of_int u) 29 | 30 | (* In case of decoding error, if we error on the first byte, we 31 | consume the byte, otherwise we consume the [n] bytes preceeding 32 | the erroring byte. 33 | 34 | This means that if a client uses decodes without caring about 35 | validity it naturally replace bogus data with Uchar.rep according 36 | to the WHATWG Encoding standard. Other schemes are possible by 37 | consulting the number of used bytes on invalid decodes. For more 38 | details see https://hsivonen.fi/broken-utf-8/ 39 | 40 | For this reason in [get_utf_8_uchar] we gradually check the next 41 | byte is available rather than doing it immediately after the 42 | first byte. Contrast with [is_valid_utf_8]. *) 43 | 44 | (* UTF-8 *) 45 | 46 | let[@inline] not_in_x80_to_xBF b = b lsr 6 <> 0b10 47 | let[@inline] not_in_xA0_to_xBF b = b lsr 5 <> 0b101 48 | let[@inline] not_in_x80_to_x9F b = b lsr 5 <> 0b100 49 | let[@inline] not_in_x90_to_xBF b = b < 0x90 || 0xBF < b 50 | let[@inline] not_in_x80_to_x8F b = b lsr 4 <> 0x8 51 | 52 | let[@inline] utf_8_uchar_2 b0 b1 = 53 | ((b0 land 0x1F) lsl 6) lor 54 | ((b1 land 0x3F)) 55 | 56 | let[@inline] utf_8_uchar_3 b0 b1 b2 = 57 | ((b0 land 0x0F) lsl 12) lor 58 | ((b1 land 0x3F) lsl 6) lor 59 | ((b2 land 0x3F)) 60 | 61 | let[@inline] utf_8_uchar_4 b0 b1 b2 b3 = 62 | ((b0 land 0x07) lsl 18) lor 63 | ((b1 land 0x3F) lsl 12) lor 64 | ((b2 land 0x3F) lsl 6) lor 65 | ((b3 land 0x3F)) 66 | 67 | let get_utf_8_uchar b i = 68 | let b0 = get_uint8 b i in (* raises if [i] is not a valid index. *) 69 | let get = unsafe_get_uint8 in 70 | let max = length b - 1 in 71 | match Char.unsafe_chr b0 with (* See The Unicode Standard, Table 3.7 *) 72 | | '\x00' .. '\x7F' -> dec_ret 1 b0 73 | | '\xC2' .. '\xDF' -> 74 | let i = i + 1 in if i > max then dec_invalid 1 else 75 | let b1 = get b i in if not_in_x80_to_xBF b1 then dec_invalid 1 else 76 | dec_ret 2 (utf_8_uchar_2 b0 b1) 77 | | '\xE0' -> 78 | let i = i + 1 in if i > max then dec_invalid 1 else 79 | let b1 = get b i in if not_in_xA0_to_xBF b1 then dec_invalid 1 else 80 | let i = i + 1 in if i > max then dec_invalid 2 else 81 | let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else 82 | dec_ret 3 (utf_8_uchar_3 b0 b1 b2) 83 | | '\xE1' .. '\xEC' | '\xEE' .. '\xEF' -> 84 | let i = i + 1 in if i > max then dec_invalid 1 else 85 | let b1 = get b i in if not_in_x80_to_xBF b1 then dec_invalid 1 else 86 | let i = i + 1 in if i > max then dec_invalid 2 else 87 | let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else 88 | dec_ret 3 (utf_8_uchar_3 b0 b1 b2) 89 | | '\xED' -> 90 | let i = i + 1 in if i > max then dec_invalid 1 else 91 | let b1 = get b i in if not_in_x80_to_x9F b1 then dec_invalid 1 else 92 | let i = i + 1 in if i > max then dec_invalid 2 else 93 | let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else 94 | dec_ret 3 (utf_8_uchar_3 b0 b1 b2) 95 | | '\xF0' -> 96 | let i = i + 1 in if i > max then dec_invalid 1 else 97 | let b1 = get b i in if not_in_x90_to_xBF b1 then dec_invalid 1 else 98 | let i = i + 1 in if i > max then dec_invalid 2 else 99 | let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else 100 | let i = i + 1 in if i > max then dec_invalid 3 else 101 | let b3 = get b i in if not_in_x80_to_xBF b3 then dec_invalid 3 else 102 | dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3) 103 | | '\xF1' .. '\xF3' -> 104 | let i = i + 1 in if i > max then dec_invalid 1 else 105 | let b1 = get b i in if not_in_x80_to_xBF b1 then dec_invalid 1 else 106 | let i = i + 1 in if i > max then dec_invalid 2 else 107 | let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else 108 | let i = i + 1 in if i > max then dec_invalid 3 else 109 | let b3 = get b i in if not_in_x80_to_xBF b3 then dec_invalid 3 else 110 | dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3) 111 | | '\xF4' -> 112 | let i = i + 1 in if i > max then dec_invalid 1 else 113 | let b1 = get b i in if not_in_x80_to_x8F b1 then dec_invalid 1 else 114 | let i = i + 1 in if i > max then dec_invalid 2 else 115 | let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else 116 | let i = i + 1 in if i > max then dec_invalid 3 else 117 | let b3 = get b i in if not_in_x80_to_xBF b3 then dec_invalid 3 else 118 | dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3) 119 | | _ -> dec_invalid 1 120 | 121 | let set_utf_8_uchar b i u = 122 | let set = unsafe_set_uint8 in 123 | let max = length b - 1 in 124 | match Uchar.to_int u with 125 | | u when u < 0 -> assert false 126 | | u when u <= 0x007F -> 127 | set_uint8 b i u; 128 | 1 129 | | u when u <= 0x07FF -> 130 | let last = i + 1 in 131 | if last > max then 0 else 132 | (set_uint8 b i (0xC0 lor (u lsr 6)); 133 | set b last (0x80 lor (u land 0x3F)); 134 | 2) 135 | | u when u <= 0xFFFF -> 136 | let last = i + 2 in 137 | if last > max then 0 else 138 | (set_uint8 b i (0xE0 lor (u lsr 12)); 139 | set b (i + 1) (0x80 lor ((u lsr 6) land 0x3F)); 140 | set b last (0x80 lor (u land 0x3F)); 141 | 3) 142 | | u -> 143 | let last = i + 3 in 144 | if last > max then 0 else 145 | (set_uint8 b i (0xF0 lor (u lsr 18)); 146 | set b (i + 1) (0x80 lor ((u lsr 12) land 0x3F)); 147 | set b (i + 2) (0x80 lor ((u lsr 6) land 0x3F)); 148 | set b last (0x80 lor (u land 0x3F)); 149 | 4) 150 | 151 | let is_valid_utf_8 b = 152 | let rec loop max b i = 153 | if i > max then true else 154 | let get = unsafe_get_uint8 in 155 | match Char.unsafe_chr (get b i) with 156 | | '\x00' .. '\x7F' -> loop max b (i + 1) 157 | | '\xC2' .. '\xDF' -> 158 | let last = i + 1 in 159 | if last > max 160 | || not_in_x80_to_xBF (get b last) 161 | then false 162 | else loop max b (last + 1) 163 | | '\xE0' -> 164 | let last = i + 2 in 165 | if last > max 166 | || not_in_xA0_to_xBF (get b (i + 1)) 167 | || not_in_x80_to_xBF (get b last) 168 | then false 169 | else loop max b (last + 1) 170 | | '\xE1' .. '\xEC' | '\xEE' .. '\xEF' -> 171 | let last = i + 2 in 172 | if last > max 173 | || not_in_x80_to_xBF (get b (i + 1)) 174 | || not_in_x80_to_xBF (get b last) 175 | then false 176 | else loop max b (last + 1) 177 | | '\xED' -> 178 | let last = i + 2 in 179 | if last > max 180 | || not_in_x80_to_x9F (get b (i + 1)) 181 | || not_in_x80_to_xBF (get b last) 182 | then false 183 | else loop max b (last + 1) 184 | | '\xF0' -> 185 | let last = i + 3 in 186 | if last > max 187 | || not_in_x90_to_xBF (get b (i + 1)) 188 | || not_in_x80_to_xBF (get b (i + 2)) 189 | || not_in_x80_to_xBF (get b last) 190 | then false 191 | else loop max b (last + 1) 192 | | '\xF1' .. '\xF3' -> 193 | let last = i + 3 in 194 | if last > max 195 | || not_in_x80_to_xBF (get b (i + 1)) 196 | || not_in_x80_to_xBF (get b (i + 2)) 197 | || not_in_x80_to_xBF (get b last) 198 | then false 199 | else loop max b (last + 1) 200 | | '\xF4' -> 201 | let last = i + 3 in 202 | if last > max 203 | || not_in_x80_to_x8F (get b (i + 1)) 204 | || not_in_x80_to_xBF (get b (i + 2)) 205 | || not_in_x80_to_xBF (get b last) 206 | then false 207 | else loop max b (last + 1) 208 | | _ -> false 209 | in 210 | loop (length b - 1) b 0 211 | 212 | include Utf_16 213 | end 214 | -------------------------------------------------------------------------------- /utf_x_pat.mli: -------------------------------------------------------------------------------- 1 | (**************************************************************************) 2 | (* *) 3 | (* OCaml *) 4 | (* *) 5 | (* The OCaml programmers *) 6 | (* *) 7 | (* Copyright 2021 Institut National de Recherche en Informatique et *) 8 | (* en Automatique. *) 9 | (* *) 10 | (* All rights reserved. This file is distributed under the terms of *) 11 | (* the GNU Lesser General Public License version 2.1, with the *) 12 | (* special exception on linking described in the file LICENSE. *) 13 | (* *) 14 | (**************************************************************************) 15 | 16 | include Utf_x.S 17 | --------------------------------------------------------------------------------