├── BRZO ├── .ocp-indent ├── .gitignore ├── .merlin ├── src ├── uuseg.mllib ├── uuseg_word.mli ├── uuseg_sentence.mli ├── uuseg_grapheme_cluster.mli ├── uuseg_line_break.mli ├── uuseg_buf.mli ├── uuseg_buf.ml ├── uuseg_base.mli ├── uuseg_base.ml ├── uuseg.ml ├── uuseg_string.mli ├── uuseg_string.ml ├── uuseg_grapheme_cluster.ml ├── uuseg_word.ml ├── uuseg_sentence.ml ├── uuseg.mli └── uuseg_line_break.ml ├── _tags ├── doc └── index.mld ├── pkg ├── META └── pkg.ml ├── LICENSE.md ├── test ├── examples.ml ├── usegtrip.ml └── test_uuseg.ml ├── DEVEL.md ├── README.md ├── opam ├── CHANGES.md └── B0.ml /BRZO: -------------------------------------------------------------------------------- 1 | (srcs-x pkg test) -------------------------------------------------------------------------------- /.ocp-indent: -------------------------------------------------------------------------------- 1 | strict_with=always,match_clause=4,strict_else=never -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | _b0 2 | _build 3 | tmp 4 | *.install 5 | test/*Test.txt -------------------------------------------------------------------------------- /.merlin: -------------------------------------------------------------------------------- 1 | PKG uutf uucp cmdliner b0.kit 2 | S src 3 | S test 4 | B _b0/** 5 | -------------------------------------------------------------------------------- /src/uuseg.mllib: -------------------------------------------------------------------------------- 1 | Uuseg_base 2 | Uuseg_buf 3 | Uuseg_grapheme_cluster 4 | Uuseg_word 5 | Uuseg_sentence 6 | Uuseg_line_break 7 | Uuseg 8 | Uuseg_string -------------------------------------------------------------------------------- /_tags: -------------------------------------------------------------------------------- 1 | true : bin_annot, safe_string 2 | 3 | <_b0> : -traverse 4 | : include 5 | : package(uucp) 6 | 7 | : include 8 | : package(uucp) 9 | : package(cmdliner uutf) 10 | -------------------------------------------------------------------------------- /doc/index.mld: -------------------------------------------------------------------------------- 1 | {0 Uuseg {%html: %%VERSION%%%}} 2 | 3 | Uuseg segments Unicode text. See {!Uuseg} for more details. 4 | 5 | {1:library_uuseg Library [uuseg]} 6 | 7 | {!modules: 8 | Uuseg 9 | Uuseg_string 10 | } 11 | -------------------------------------------------------------------------------- /pkg/META: -------------------------------------------------------------------------------- 1 | description = "Unicode text segmentation for OCaml" 2 | version = "%%VERSION_NUM%%" 3 | requires = "uucp" 4 | archive(byte) = "uuseg.cma" 5 | archive(native) = "uuseg.cmxa" 6 | plugin(byte) = "uuseg.cma" 7 | plugin(native) = "uuseg.cmxs" 8 | exists_if = "uuseg.cma uuseg.cmxa" 9 | 10 | package "string" ( 11 | description = "The uuseg.string library (deprecated)" 12 | version = "%%VERSION_NUM%%" 13 | requires = "uuseg" 14 | exports = "uuseg" 15 | warning = "Deprecated, use the uuseg library." 16 | ) 17 | -------------------------------------------------------------------------------- /src/uuseg_word.mli: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uuseg programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (** Word segmenter. *) 7 | 8 | (** {1 Segmenter} *) 9 | 10 | type t 11 | val create : unit -> t 12 | val copy : t -> t 13 | val equal : t -> t -> bool 14 | val add : t -> [ `Await | `End | `Uchar of Uchar.t ] -> Uuseg_base.ret 15 | -------------------------------------------------------------------------------- /src/uuseg_sentence.mli: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uuseg programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (** Sentence segmenter. *) 7 | 8 | (** {1 Segmenter} *) 9 | 10 | type t 11 | val create : unit -> t 12 | val copy : t -> t 13 | val equal : t -> t -> bool 14 | val add : t -> [ `Await | `End | `Uchar of Uchar.t ] -> Uuseg_base.ret 15 | -------------------------------------------------------------------------------- /src/uuseg_grapheme_cluster.mli: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uuseg programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (** Grapheme cluster segmenter. *) 7 | 8 | (** {1 Segmenter} *) 9 | 10 | type t 11 | val create : unit -> t 12 | val copy : t -> t 13 | val equal : t -> t -> bool 14 | val add : t -> [ `Await | `End | `Uchar of Uchar.t ] -> Uuseg_base.ret 15 | -------------------------------------------------------------------------------- /pkg/pkg.ml: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ocaml 2 | #use "topfind" 3 | #require "topkg" 4 | open Topkg 5 | 6 | let uutf = Conf.with_pkg "uutf" 7 | let cmdliner = Conf.with_pkg "cmdliner" 8 | 9 | let () = 10 | Pkg.describe "uuseg" @@ fun c -> 11 | let uutf = Conf.value c uutf in 12 | let cmdliner = Conf.value c cmdliner in 13 | Ok [ Pkg.mllib ~api:["Uuseg"; "Uuseg_string"] "src/uuseg.mllib"; 14 | Pkg.bin ~cond:(uutf && cmdliner) "test/usegtrip"; 15 | Pkg.doc "doc/index.mld" ~dst:"odoc-pages/index.mld"; 16 | Pkg.doc "test/examples.ml"; ] 17 | -------------------------------------------------------------------------------- /src/uuseg_line_break.mli: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uuseg programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (** Line break segmenter. *) 7 | 8 | (** {1 Segmenter} *) 9 | 10 | type t 11 | val create : unit -> t 12 | val copy : t -> t 13 | val equal : t -> t -> bool 14 | val mandatory : t -> bool 15 | val add : t -> [ `Await | `End | `Uchar of Uchar.t ] -> Uuseg_base.ret 16 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014 The uuseg programmers 2 | 3 | Permission to use, copy, modify, and/or distribute this software for any 4 | purpose with or without fee is hereby granted, provided that the above 5 | copyright notice and this permission notice appear in all copies. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 8 | WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 9 | MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 10 | ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 11 | WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 12 | ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 13 | OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 14 | -------------------------------------------------------------------------------- /src/uuseg_buf.mli: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uuseg programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (** [`Uchar] buffers *) 7 | 8 | (** {1 [`Uchar] buffers} *) 9 | 10 | type t 11 | (** The type for [`Uchar] buffers. *) 12 | 13 | val create : int -> t 14 | (** [create n] is a buffer of initial size [n]. *) 15 | 16 | val copy : t -> t 17 | (** [copy b] is a copy of [b]. *) 18 | 19 | val empty : t -> bool 20 | (** [empty b] is [true] iff [b] is empty. *) 21 | 22 | val len : t -> int 23 | (** [len b] is [b]'s length. *) 24 | 25 | val add : t -> [`Uchar of Uchar.t] -> unit 26 | (** [add b u] adds [u] at the end of [b]. *) 27 | 28 | val flush : t -> [> `Uchar of Uchar.t ] 29 | (** [flush b] is the first [`Uchar] of [b] and removes it from [b]. *) 30 | 31 | val get_first : t -> Uchar.t 32 | (** [get_first b] is the first [`Uchar] of [b], if any. *) 33 | -------------------------------------------------------------------------------- /test/examples.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uuseg programmers. All rights reserved. 3 | SPDX-License-Identifier: CC0-1.0 4 | ---------------------------------------------------------------------------*) 5 | 6 | let utf_8_segments seg s = 7 | let flush_segment buf acc = 8 | let segment = Buffer.contents buf in 9 | Buffer.clear buf; if segment = "" then acc else segment :: acc 10 | in 11 | let rec add buf acc segmenter v = match Uuseg.add segmenter v with 12 | | `Uchar u -> Buffer.add_utf_8_uchar buf u; add buf acc segmenter `Await 13 | | `Boundary -> add buf (flush_segment buf acc) segmenter `Await 14 | | `Await | `End -> acc 15 | in 16 | let rec loop buf acc s i max segmenter = 17 | if i > max then flush_segment buf (add buf acc segmenter `End) else 18 | let dec = String.get_utf_8_uchar s i in 19 | let acc = add buf acc segmenter (`Uchar (Uchar.utf_decode_uchar dec)) in 20 | loop buf acc s (i + Uchar.utf_decode_length dec) max segmenter 21 | in 22 | let buf = Buffer.create 42 in 23 | let segmenter = Uuseg.create seg in 24 | List.rev (loop buf [] s 0 (String.length s - 1) segmenter) 25 | -------------------------------------------------------------------------------- /src/uuseg_buf.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uuseg programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | let dummy_u = (* to initalize arrays, never read. *) 7 | `Uchar (Uchar.unsafe_of_int 0x0000) 8 | 9 | type t = 10 | { mutable buf : [`Uchar of Uchar.t] array; 11 | mutable first : int; 12 | mutable last : int; } 13 | 14 | let create n = { buf = Array.make n dummy_u; first = 0; last = -1; } 15 | let copy b = { b with buf = Array.copy b.buf; } 16 | let empty b = b.last = -1 17 | let len b = (b.last - b.first) + 1 18 | let grow b = 19 | let len = Array.length b.buf in 20 | let newbuf = Array.make (2 * len) dummy_u in 21 | Array.blit b.buf 0 newbuf 0 len; b.buf <- newbuf 22 | 23 | let add b add = 24 | let last = b.last + 1 in 25 | if last = Array.length b.buf then grow b; 26 | b.buf.(last) <- add; b.last <- last 27 | 28 | let flush b = 29 | let `Uchar _ as add = b.buf.(b.first) in 30 | b.first <- b.first + 1; 31 | if b.first > b.last then (b.first <- 0; b.last <- -1); 32 | add 33 | 34 | let get_first b = let `Uchar u = b.buf.(b.first) in u 35 | -------------------------------------------------------------------------------- /DEVEL.md: -------------------------------------------------------------------------------- 1 | # New Unicode release 2 | 3 | First you will an install of [`uucp`] that supports the new Unicode 4 | version installed. 5 | 6 | You can then bump the Unicode release number at the top of the `B0.ml` 7 | file. Veryify that everything is as expected with: 8 | 9 | b0 -- unicode-version 10 | 11 | Update the opam file with: 12 | 13 | b0 -- .opam file > opam 14 | 15 | Before the formal Unicode release date check the proposed update of 16 | [UAX29] and [UAX14]. If the rules need to be adjusted then do so. 17 | 18 | Compile Uuseg against the uucp and check the reference tests (see 19 | below). 20 | 21 | [UAX29]: https://www.unicode.org/reports/tr29/ 22 | [UAX14]: https://www.unicode.org/reports/tr14/ 23 | [`uucp`]: https://erratique.ch/software/uucp 24 | 25 | # Reference tests 26 | 27 | To test the package on the reference segmentation tests you must 28 | download a copy of the tests to: 29 | 30 | test/LineBreakTest.txt 31 | test/GraphemeBreakTest.txt 32 | test/WordBreakTest.txt 33 | test/SentenceBreakTest.txt 34 | 35 | These file are ignored by git. If you have `curl` in your `PATH` 36 | you can simply issue: 37 | 38 | b0 -- download-tests 39 | 40 | this downloads the tests for the Unicode version mentioned in `B0.ml`. 41 | 42 | You can then check them via: 43 | 44 | b0 test 45 | -------------------------------------------------------------------------------- /src/uuseg_base.mli: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uuseg programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (** Segmenter commonalities. *) 7 | 8 | (** {1 Common} *) 9 | 10 | type ret = [ `Await | `Boundary | `End | `Uchar of Uchar.t ] 11 | (** See {!Uuseg.ret}. *) 12 | 13 | val pp_ret : Format.formatter -> [< ret ] -> unit 14 | (** See {!Uuseg.pp_ret}. *) 15 | 16 | val err_exp_await : [< ret] -> 'a 17 | (** See {!Uuseg.err_exp_await}. *) 18 | 19 | val err_ended : [< ret] -> 'a 20 | (** See {!Uuseg.err_ended}. *) 21 | 22 | (** {1:tid Type identifiers} *) 23 | 24 | (** Type introspection. 25 | 26 | {b Note.} Available in 5.1. *) 27 | module Type : sig 28 | 29 | type (_, _) eq = Equal : ('a, 'a) eq (** *) 30 | (** The type for type quality testing. *) 31 | 32 | module Id : sig 33 | 34 | (** {1:typeids Type identifiers} *) 35 | 36 | type 'a t 37 | (** The type for type identifiers for a type ['a]. *) 38 | 39 | val make : unit -> 'a t 40 | (** [make ()] is a new type identifier. *) 41 | 42 | val provably_equal : 'a t -> 'b t -> ('a, 'b) eq option 43 | (** [provably_equal id0 id1] determines if [id0] and [id1] are equal. *) 44 | 45 | val uid : 'a t -> int 46 | (** [uid id] is a runtime unique identifier for [id]. *) 47 | end 48 | end 49 | -------------------------------------------------------------------------------- /src/uuseg_base.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uuseg programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | type ret = [ `Await | `Boundary | `End | `Uchar of Uchar.t ] 7 | 8 | let pp_ret ppf v = match (v :> ret) with 9 | | `Await -> Format.fprintf ppf "`Await" 10 | | `Boundary -> Format.fprintf ppf "`Boundary" 11 | | `End -> Format.fprintf ppf "`End" 12 | | `Uchar u -> Format.fprintf ppf "`Uchar U+%04X" (Uchar.to_int u) 13 | 14 | let err_exp_await add = 15 | invalid_arg (Format.asprintf "can't add %a, expected `Await" pp_ret add) 16 | 17 | let err_ended add = 18 | invalid_arg (Format.asprintf "can't add %a, `End already added" pp_ret add) 19 | 20 | (* Type identifiers *) 21 | 22 | module Type = struct 23 | (* See http://alan.petitepomme.net/cwn/2015.03.24.html#1 24 | In the stdlib since 5.1. *) 25 | 26 | type ('a, 'b) eq = Equal : ('a, 'a) eq 27 | 28 | module Id = struct 29 | type _ id = .. 30 | module type ID = sig 31 | type t 32 | type _ id += Id : t id 33 | end 34 | 35 | type 'a t = (module ID with type t = 'a) 36 | 37 | let make (type a) () : a t = 38 | (module struct type t = a type _ id += Id : t id end) 39 | 40 | let provably_equal 41 | (type a b) ((module A) : a t) ((module B) : b t) : (a, b) eq option 42 | = 43 | match A.Id with B.Id -> Some Equal | _ -> None 44 | 45 | let uid (type a) ((module A) : a t) = 46 | Obj.Extension_constructor.id (Obj.Extension_constructor.of_val A.Id) 47 | end 48 | end 49 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Uuseg — Unicode text segmentation for OCaml 2 | =============================================================================== 3 | %%VERSION%% 4 | 5 | Uuseg is an OCaml library for segmenting Unicode text. It implements 6 | the locale independent [Unicode text segmentation algorithms][1] to 7 | detect grapheme cluster, word and sentence boundaries and the [Unicode 8 | line breaking algorithm][2] to detect line break opportunities. 9 | 10 | The library is independent from any IO mechanism or Unicode text data 11 | structure and it can process text without a complete in-memory 12 | representation. 13 | 14 | Uuseg is distributed under the ISC license. It depends on [Uucp]. 15 | 16 | [1]: http://www.unicode.org/reports/tr29/ 17 | [2]: http://www.unicode.org/reports/tr14/ 18 | [Uucp]: http://erratique.ch/software/uucp 19 | 20 | Homepage: 21 | 22 | ## Installation 23 | 24 | Uuseg can be installed with `opam`: 25 | 26 | opam install uuseg 27 | opam install uuseg cmdliner uutf # For the usegtrip tool. 28 | 29 | If you don't use `opam` consult the [`opam`](opam) file for build 30 | instructions. 31 | 32 | ## Documentation 33 | 34 | The documentation can be consulted [online] or via `odig doc uuseg`. 35 | 36 | Questions are welcome but better asked on the [OCaml forum] than on 37 | the issue tracker. 38 | 39 | [online]: http://erratique.ch/software/uuseg/doc/ 40 | [OCaml forum]: https://discuss.ocaml.org/ 41 | 42 | 43 | ## Sample programs 44 | 45 | The [`usegtrip`] tool segments text provided on standard input. 46 | 47 | See also the [doc examples]. 48 | 49 | [`usegtrip`]: test/usegtrip.ml 50 | [doc examples]: test/examples.ml 51 | -------------------------------------------------------------------------------- /opam: -------------------------------------------------------------------------------- 1 | opam-version: "2.0" 2 | name: "uuseg" 3 | synopsis: "Unicode text segmentation for OCaml" 4 | description: """\ 5 | Uuseg is an OCaml library for segmenting Unicode text. It implements 6 | the locale independent [Unicode text segmentation algorithms][1] to 7 | detect grapheme cluster, word and sentence boundaries and the [Unicode 8 | line breaking algorithm][2] to detect line break opportunities. 9 | 10 | The library is independent from any IO mechanism or Unicode text data 11 | structure and it can process text without a complete in-memory 12 | representation. 13 | 14 | Uuseg is distributed under the ISC license. It depends on [Uucp]. 15 | 16 | [1]: http://www.unicode.org/reports/tr29/ 17 | [2]: http://www.unicode.org/reports/tr14/ 18 | [Uucp]: http://erratique.ch/software/uucp 19 | 20 | Homepage: """ 21 | maintainer: "Daniel Bünzli " 22 | authors: "The uuseg programmers" 23 | license: "ISC" 24 | tags: ["unicode" "text" "segmentation" "org:erratique"] 25 | homepage: "https://erratique.ch/software/uuseg" 26 | doc: "https://erratique.ch/software/uuseg/doc/" 27 | bug-reports: "https://github.com/dbuenzli/uuseg/issues" 28 | depends: [ 29 | "ocaml" {>= "4.14.0"} 30 | "ocamlfind" {build} 31 | "ocamlbuild" {build} 32 | "topkg" {build & >= "1.1.0"} 33 | "uucp" {>= "17.0.0" & < "18.0.0"} 34 | ] 35 | depopts: ["uutf" "cmdliner"] 36 | conflicts: [ 37 | "uutf" {< "1.0.0"} 38 | "cmdliner" {< "1.1.0"} 39 | ] 40 | build: [ 41 | "ocaml" 42 | "pkg/pkg.ml" 43 | "build" 44 | "--dev-pkg" 45 | "%{dev}%" 46 | "--with-uutf" 47 | "%{uutf:installed}%" 48 | "--with-cmdliner" 49 | "%{cmdliner:installed}%" 50 | ] 51 | dev-repo: "git+https://erratique.ch/repos/uuseg.git" 52 | x-maintenance-intent: ["(latest)"] 53 | -------------------------------------------------------------------------------- /CHANGES.md: -------------------------------------------------------------------------------- 1 | v17.0.0 2025-09-11 Zagreb 2 | ------------------------- 3 | 4 | - Unicode 17.0.0 support. 5 | - Add `Uuseg.equal` thanks to @lord for suggesting. 6 | 7 | v16.0.0 2024-09-11 Zagreb 8 | ------------------------- 9 | 10 | - Unicode 16.0.0 support. 11 | 12 | v15.1.0 2023-09-15 Zagreb 13 | ------------------------- 14 | 15 | - Unicode 15.1.0 support. 16 | - Requires OCaml 4.14.0 for the UTF decoders. 17 | - The `Uuseg_string` module was rewritten to use the standard library 18 | UTF decoders and was moved to the `uuseg` library. The `uuseg.string` 19 | library is deprecated, it warns on usage and simply requires `uuseg`. 20 | - The sample code was rewritten to use the standard library UTF 21 | decoders. 22 | 23 | v15.0.0 2022-09-15 Zagreb 24 | ------------------------- 25 | 26 | - Unicode 15.0.0 support. 27 | 28 | v14.0.0 2021-09-17 Zagreb 29 | ------------------------- 30 | 31 | - Unicode 14.0.0 support. 32 | 33 | v13.0.0 2020-03-11 La Forclaz (VS) 34 | ---------------------------------- 35 | 36 | - Unicode 13.0.0 support. 37 | - Grapheme clusters and word boundaries w.r.t. emojis are segmented 38 | according to the specification (#5 is closed). 39 | - Internal rewrite of word and line break boundaries. Implementations 40 | are less hairy, less ad-hoc (not there yet though) and more correct. 41 | - Require OCaml >= 4.03.0. 42 | 43 | v12.0.0 2019-03-08 La Forclaz (VS) 44 | ---------------------------------- 45 | 46 | - Unicode 12.0.0 support. Grapheme cluster and word boundaries 47 | w.r.t. emojis are still only partially according to the specification 48 | see issue #5 for details. 49 | 50 | v11.0.0 2018-06-06 Zürich 51 | ------------------------- 52 | 53 | - Unicode 11.0.0 support. Grapheme cluster and word boundaries 54 | w.r.t. emojis are only partially supported according to the 55 | specification see issue #5 for details. 56 | 57 | v10.0.0 2017-06-20 Cambridge (UK) 58 | --------------------------------- 59 | 60 | - Unicode 10.0.0 support. 61 | 62 | v1.0.1 2016-03-07 La Forclaz (VS) 63 | --------------------------------- 64 | 65 | - OCaml 4.05 compatibility (removal of `Uchar.dump`). 66 | 67 | v1.0.0 2016-11-23 Zagreb 68 | ------------------------ 69 | 70 | - Unicode 9.0.0 support. 71 | - OCaml standard library `Uchar.t` support. 72 | - Removes and substitutes `type Uuseg.uchar = int` by the (abstract) 73 | `Uchar.t` type. `Uchar.{of,to}_int` allows to recover the previous 74 | representation. 75 | - Removes the `Uuseg.is_uchar`. `Uchar.is_valid` can be used instead. 76 | - Safe string support. 77 | - Build depend on topkg. 78 | - Relicense from BSD3 to ISC. 79 | 80 | v0.9.0 2015-06-17 Cambridge (UK) 81 | -------------------------------- 82 | 83 | - Support for Unicode 8.0.0's new line breaking and sentence boundary rules. 84 | - `Uuseg.custom` add a unit argument. 85 | 86 | 87 | v0.8.0 2014-12-23 Cugy (VD) 88 | --------------------------- 89 | 90 | First release. 91 | -------------------------------------------------------------------------------- /src/uuseg.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uuseg programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | 7 | let unicode_version = Uucp.unicode_version 8 | 9 | (* Segmenters *) 10 | 11 | type 'a segmenter = 12 | { id : 'a Uuseg_base.Type.Id.t; 13 | name : string; 14 | create : unit -> 'a; 15 | copy : 'a -> 'a; 16 | equal : 'a -> 'a -> bool; 17 | mandatory : 'a -> bool; 18 | add : 'a -> [ `Uchar of Uchar.t | `Await | `End ] -> 19 | [ `Boundary | `Uchar of Uchar.t | `Await | `End ] } 20 | 21 | type custom = C : 'a segmenter -> custom 22 | 23 | type boundary = 24 | [ `Grapheme_cluster | `Word | `Sentence | `Line_break | `Custom of custom ] 25 | 26 | let pp_boundary ppf b = match (b :> boundary) with 27 | | `Grapheme_cluster -> Format.fprintf ppf "`Grapheme_cluster" 28 | | `Word -> Format.fprintf ppf "`Word" 29 | | `Sentence -> Format.fprintf ppf "`Sentence" 30 | | `Line_break -> Format.fprintf ppf "`Line_break" 31 | | `Custom (C s) -> Format.fprintf ppf "`Custom %s" s.name 32 | 33 | (* Built-in segmenters *) 34 | 35 | let mandatory_default _ = true 36 | 37 | let grapheme_cluster = 38 | C { id = Uuseg_base.Type.Id.make (); 39 | name = "Uuseg.grapheme_cluster"; 40 | create = Uuseg_grapheme_cluster.create; 41 | copy = Uuseg_grapheme_cluster.copy; 42 | equal = Uuseg_grapheme_cluster.equal; 43 | mandatory = mandatory_default; 44 | add = Uuseg_grapheme_cluster.add; } 45 | 46 | let word = 47 | C { id = Uuseg_base.Type.Id.make (); 48 | name = "Uuseg.word"; 49 | create = Uuseg_word.create; 50 | copy = Uuseg_word.copy; 51 | equal = Uuseg_word.equal; 52 | mandatory = mandatory_default; 53 | add = Uuseg_word.add; } 54 | 55 | let sentence = 56 | C { id = Uuseg_base.Type.Id.make (); 57 | name = "Uuseg.sentence"; 58 | create = Uuseg_sentence.create; 59 | copy = Uuseg_sentence.copy; 60 | equal = Uuseg_sentence.equal; 61 | mandatory = mandatory_default; 62 | add = Uuseg_sentence.add; } 63 | 64 | let line_break = 65 | C { id = Uuseg_base.Type.Id.make (); 66 | name = "Uuseg.line_break"; 67 | create = Uuseg_line_break.create; 68 | copy = Uuseg_line_break.copy; 69 | equal = Uuseg_line_break.equal; 70 | mandatory = Uuseg_line_break.mandatory; 71 | add = Uuseg_line_break.add; } 72 | 73 | (* Generic segmenter inteface *) 74 | 75 | type t = Seg : boundary * 'a * 'a segmenter -> t 76 | type ret = Uuseg_base.ret 77 | 78 | let create boundary = 79 | let (C seg) = match boundary with 80 | | `Grapheme_cluster -> grapheme_cluster 81 | | `Word -> word 82 | | `Sentence -> sentence 83 | | `Line_break -> line_break 84 | | `Custom c -> c 85 | in 86 | Seg ((boundary :> boundary), seg.create (), seg) 87 | 88 | let boundary (Seg (boundary, _, _)) = boundary 89 | let add (Seg (_, s, seg)) add = seg.add s add 90 | let mandatory (Seg (_, s, seg)) = seg.mandatory s 91 | let copy (Seg (b, s, seg)) = Seg (b, seg.copy s, seg) 92 | 93 | let equal (Seg (b0, s0, seg0)) (Seg (b1, s1, seg1)) = match b0, b1 with 94 | | `Custom _, _ | _, `Custom _ -> 95 | invalid_arg "Cannot test custom segmenters for equality" 96 | | _ -> 97 | match Uuseg_base.Type.Id.provably_equal seg0.id seg1.id with 98 | | None -> false 99 | | Some Uuseg_base.Type.Equal -> seg0.equal s0 s1 100 | 101 | let pp_ret = Uuseg_base.pp_ret 102 | 103 | (* Custom segmenters *) 104 | 105 | let custom ?(mandatory = mandatory_default) ~name ~create ~copy ~add () = 106 | (* N.B. when we require > 5.1 we can replace Uuseg_base.Type by 107 | Stdlib.Type and open up equality testing for custom *) 108 | let id = Uuseg_base.Type.Id.make () in 109 | let equal _ _ = assert false in 110 | C { id; name; create; copy; equal; mandatory; add } 111 | 112 | let err_exp_await = Uuseg_base.err_exp_await 113 | let err_ended = Uuseg_base.err_ended 114 | -------------------------------------------------------------------------------- /B0.ml: -------------------------------------------------------------------------------- 1 | open B0_kit.V000 2 | open Result.Syntax 3 | 4 | let unicode_version = 17, 0, 0, None (* Adjust on new releases *) 5 | let next_major = B0_version.next_major unicode_version 6 | 7 | (* OCaml library names *) 8 | 9 | let b0_std = B0_ocaml.libname "b0.std" 10 | let uucp = B0_ocaml.libname "uucp" 11 | let uutf = B0_ocaml.libname "uutf" 12 | let cmdliner = B0_ocaml.libname "cmdliner" 13 | 14 | let uuseg = B0_ocaml.libname "uuseg" 15 | 16 | (* Libraries *) 17 | 18 | let uuseg_lib = 19 | let srcs = [ `Dir ~/"src" ] in 20 | let requires = [ uucp ] in 21 | B0_ocaml.lib uuseg ~doc:"The uuseg library" ~srcs ~requires 22 | 23 | let uuseg_string_lib = 24 | let exports = [uuseg] in 25 | B0_ocaml.deprecated_lib ~exports (B0_ocaml.libname "uuseg.string") 26 | 27 | (* Tools *) 28 | 29 | let usegtrip = 30 | let srcs = [ `File ~/"test/usegtrip.ml" ] in 31 | let requires = [ cmdliner; uutf; uuseg ] in 32 | B0_ocaml.exe "usegtrip" ~public:true ~doc:"The usegtrip tool" ~srcs ~requires 33 | 34 | (* Tests *) 35 | 36 | let test = 37 | let srcs = [ `File ~/"test/test_uuseg.ml" ] in 38 | let meta = 39 | B0_meta.(empty |> tag test |> tag run |> ~~ B0_unit.Action.cwd `Scope_dir) 40 | in 41 | let requires = [ b0_std; uucp; uuseg; cmdliner ] in 42 | B0_ocaml.exe "test_uucp" ~doc:"Test segmentations" ~srcs ~meta ~requires 43 | 44 | let examples = 45 | let srcs = [ `File ~/"test/examples.ml" ] in 46 | let meta = B0_meta.(empty |> tag test) in 47 | let requires = [ uuseg ] in 48 | B0_ocaml.exe "examples" ~doc:"Doc samples" ~srcs ~meta ~requires 49 | 50 | (* Actions *) 51 | 52 | let show_version = 53 | B0_unit.of_action "unicode-version" ~doc:"Show supported unicode version" @@ 54 | fun _ _ ~args:_ -> 55 | Ok (Log.stdout (fun m -> m "%s" (B0_version.to_string unicode_version))) 56 | 57 | let test_url kind = 58 | Fmt.str "http://www.unicode.org/Public/%s/ucd/auxiliary/%sBreakTest.txt" 59 | (B0_version.to_string unicode_version) kind 60 | 61 | let download_tests = 62 | let doc = "Download the UCD break tests" in 63 | B0_unit.of_action "download-tests" ~doc @@ fun env _ ~args:_ -> 64 | let get kind = 65 | let test_url = test_url kind in 66 | let test_file = Fpath.v (Fmt.str "test/%sBreakTest.txt" kind) in 67 | let test_file = B0_env.in_scope_dir env test_file in 68 | (Log.stdout @@ fun m -> 69 | m "@[Downloading %s@,to %a@]" test_url Fpath.pp test_file); 70 | B0_action_kit.fetch_url env test_url test_file 71 | in 72 | let tests = ["Line"; "Grapheme"; "Word"; "Sentence"] in 73 | List.iter_iter_on_error ~error:(Log.if_error ~use:()) get tests; 74 | Ok () 75 | 76 | (* Packs *) 77 | 78 | let default = 79 | let meta = 80 | B0_meta.empty 81 | |> ~~ B0_meta.authors ["The uuseg programmers"] 82 | |> ~~ B0_meta.maintainers ["Daniel Bünzli "] 83 | |> ~~ B0_meta.homepage "https://erratique.ch/software/uuseg" 84 | |> ~~ B0_meta.online_doc "https://erratique.ch/software/uuseg/doc/" 85 | |> ~~ B0_meta.licenses ["ISC"] 86 | |> ~~ B0_meta.repo "git+https://erratique.ch/repos/uuseg.git" 87 | |> ~~ B0_meta.issues "https://github.com/dbuenzli/uuseg/issues" 88 | |> ~~ B0_meta.description_tags 89 | ["unicode"; "text"; "segmentation"; "org:erratique"] 90 | |> B0_meta.tag B0_opam.tag 91 | |> ~~ B0_opam.build 92 | {|[["ocaml" "pkg/pkg.ml" "build" "--dev-pkg" "%{dev}%" 93 | "--with-uutf" "%{uutf:installed}%" 94 | "--with-cmdliner" "%{cmdliner:installed}%" ]]|} 95 | |> ~~ B0_opam.depopts [ "uutf", ""; "cmdliner", ""] 96 | |> ~~ B0_opam.conflicts 97 | [ "uutf", {|< "1.0.0"|}; 98 | "cmdliner", {|< "1.1.0"|}] 99 | |> ~~ B0_opam.depends 100 | [ "ocaml", {|>= "4.14.0"|}; 101 | "ocamlfind", {|build|}; 102 | "ocamlbuild", {|build|}; 103 | "topkg", {|build & >= "1.1.0"|}; 104 | "uucp", 105 | Fmt.str {|>= "%s" & < "%s"|} 106 | (B0_version.to_string unicode_version) 107 | (B0_version.to_string next_major)] 108 | in 109 | B0_pack.make "default" ~doc:"uuseg package" ~meta ~locked:true @@ 110 | B0_unit.list () 111 | -------------------------------------------------------------------------------- /src/uuseg_string.mli: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uuseg programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (** Unicode text segmentation on UTF OCaml strings. 7 | 8 | {!Uuseg} functions acting directly on UTF encoded OCaml strings. 9 | 10 | {b Warning.} All these functions silently replace malformed encoded Unicode 11 | data by a {!Stdlib.Uchar.rep} character. *) 12 | 13 | (** {1:segment Segment} *) 14 | 15 | type 'a folder = 'a -> string -> 'a 16 | (** The type for segment folders. The function takes an accumulator 17 | and a segment. Segments are the UTF encoded characters delimited 18 | by two [`Boundary] occurences. If the segmenter has no initial or 19 | final [`Boundary], the folding function inserts an implicit 20 | one. Empty segments – which by definition do not happen with 21 | the default segmenters – are not reported. *) 22 | 23 | val fold_utf_8 : [< Uuseg.boundary] -> 'a folder -> 'a -> string -> 'a 24 | (** [fold_utf_8 b f acc s] folds over the [b] UTF-8 encoded segments of 25 | the UTF-8 encoded string [s] using [f] and [acc]. *) 26 | 27 | val fold_utf_16be : [< Uuseg.boundary] -> 'a folder -> 'a -> string -> 'a 28 | (** [fold_utf16be] is like {!fold_utf_8} but on UTF-16BE encoded strings. *) 29 | 30 | val fold_utf_16le : [< Uuseg.boundary] -> 'a folder -> 'a -> string -> 'a 31 | (** [fold_utf16le] is like {!fold_utf_8} but on UTF-16BE encoded 32 | strings. *) 33 | 34 | (** {1:pp Pretty-printers} 35 | 36 | Using OCaml's {!Format.pp_print_string} with Unicode encoded 37 | strings will most of the time derail the pretty-printing process 38 | for two reasons. First the Unicode encoding of a character may 39 | span more than one byte and [pp_print_string] considers one 40 | character to be one byte. Second there may be a discrepancy 41 | between the sequence of user-perceived characters (grapheme 42 | clusters e.g. é) and the actual sequences of Unicode characters in 43 | the data (e.g. é represented by the decomposition e + ´, 44 | ). 45 | 46 | The following formatters fix these problems for many (but not all) 47 | scripts. *) 48 | 49 | val pp_utf_8 : Format.formatter -> string -> unit 50 | (** [pp_utf8 ppf s] prints the UTF-8 encoded string [s]. Each grapheme 51 | cluster is considered as taking a length of 1. *) 52 | 53 | val pp_utf_8_text : Format.formatter -> string -> unit 54 | (** [pp_utf_8_text ppf s] prints the UTF-8 encoded string [s]. Each 55 | grapheme cluster is considered as taking a length of 1. Each 56 | line break opportunity is hinted with {!Format.pp_print_break} 57 | and mandatory line breaks issue a {!Format.pp_force_newline} call. 58 | 59 | Take into account the following points: 60 | {ul 61 | {- Any {{!Uucp.White.is_white_space}white space} Unicode character 62 | occuring before a break opportunity will be translated to a space 63 | (U+0020) in output if no break occurs.} 64 | {- The sequence CR LF (U+000D, U+000A) and all kind of mandatory 65 | line breaks are translated to whathever line separator is output 66 | by {!Format.pp_force_newline}. See {!pp_utf_8_lines} for the 67 | list of characters treated as mandatory line breaks.} 68 | {- Soft hyphens are handled but due to limitations in {!Format} are 69 | not replaced by hard ones on breaks.}} *) 70 | 71 | val pp_utf_8_lines : Format.formatter -> string -> unit 72 | (** [pp_utf_8_lines ppf s] prints the UTF-8 encoded string [s]. Each 73 | grapheme cluster is considered as taking a length of 1. Each 74 | mandatory line break (including the sequence CR LF (U+000D, 75 | U+000A)) issues a {!Format.pp_force_newline} and is translated to 76 | whathever line separator this function outputs. 77 | 78 | This function correctly handles all kinds of line ends present 79 | Unicode, as of 7.0.0 this is FORM FEED (U+000C), LINE TABULATION 80 | (U+000B), LINE SEPARATOR (U+2028), PARAGRAPH SEPARATOR (U+2020), 81 | NEXT LINE (U+085), LINE FEED (U+000A), CARRIAGE RETURN (U+000D), 82 | and the sequence CR LF (U+000D, U+000A). *) 83 | -------------------------------------------------------------------------------- /src/uuseg_string.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uuseg programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | type 'a folder = 'a -> string -> 'a 7 | 8 | let fold dec_uchar enc_uchar seg f acc0 s = 9 | let flush_segment buf acc = 10 | let segment = Buffer.contents buf in 11 | Buffer.clear buf; if segment = "" then acc else f acc segment 12 | in 13 | let rec add buf acc segmenter v = match Uuseg.add segmenter v with 14 | | `Uchar u -> enc_uchar buf u; add buf acc segmenter `Await 15 | | `Boundary -> add buf (flush_segment buf acc) segmenter `Await 16 | | `Await | `End -> acc 17 | in 18 | let rec loop buf acc s i max segmenter = 19 | if i > max then flush_segment buf (add buf acc segmenter `End) else 20 | let dec = dec_uchar s i in 21 | let acc = add buf acc segmenter (`Uchar (Uchar.utf_decode_uchar dec)) in 22 | loop buf acc s (i + Uchar.utf_decode_length dec) max segmenter 23 | in 24 | let buf = Buffer.create 42 in 25 | let segmenter = Uuseg.create seg in 26 | loop buf acc0 s 0 (String.length s - 1) segmenter 27 | 28 | let fold_utf_8 seg f acc0 s = 29 | fold String.get_utf_8_uchar Buffer.add_utf_8_uchar seg f acc0 s 30 | 31 | let fold_utf_16be seg f acc0 s = 32 | fold String.get_utf_16be_uchar Buffer.add_utf_16be_uchar seg f acc0 s 33 | 34 | let fold_utf_16le seg f acc0 s = 35 | fold String.get_utf_16le_uchar Buffer.add_utf_16le_uchar seg f acc0 s 36 | 37 | let pp_utf_8 ppf s = 38 | let flush buf = 39 | let gc = Buffer.contents buf in 40 | if gc = "" then () else (Format.fprintf ppf "@<1>%s" gc; Buffer.clear buf) 41 | in 42 | let rec add buf segmenter v = match Uuseg.add segmenter v with 43 | | `Uchar u -> Buffer.add_utf_8_uchar buf u; add buf segmenter `Await 44 | | `Boundary -> flush buf; add buf segmenter `Await 45 | | `Await | `End -> () 46 | in 47 | let rec loop buf s i max segmenter = 48 | if i > max then (add buf segmenter `End; flush buf) else 49 | let dec = String.get_utf_8_uchar s i in 50 | add buf segmenter (`Uchar (Uchar.utf_decode_uchar dec)); 51 | loop buf s (i + Uchar.utf_decode_length dec) max segmenter 52 | in 53 | let buf = Buffer.create 10 in 54 | let segmenter = Uuseg.create `Grapheme_cluster in 55 | loop buf s 0 (String.length s - 1) segmenter 56 | 57 | let pp_utf_8_text ~only_mandatory ppf s = 58 | let b = Buffer.create 10 in 59 | let buf_buf = ref None in (* buffer to handle CRLF and suppress white *) 60 | let buf_flush () = 61 | let gc = Buffer.contents b in 62 | if gc = "" then () else (Format.fprintf ppf "@<1>%s" gc; Buffer.clear b) 63 | in 64 | let buf_add u = match !buf_buf with 65 | | None -> buf_buf := Some u 66 | | Some last -> 67 | match Uchar.to_int last with 68 | | 0x000D when Uchar.to_int u = 0x000A -> buf_buf := Some u (* rem CR *) 69 | | _ -> Buffer.add_utf_8_uchar b last; buf_buf := Some u 70 | in 71 | let buf_cut mandatory = 72 | let bbuf = !buf_buf in 73 | buf_buf := None; 74 | match bbuf with 75 | | None -> buf_flush (); Format.pp_print_cut ppf () 76 | | Some u when mandatory && Uucp.White.is_white_space u -> 77 | buf_flush (); Format.pp_force_newline ppf () 78 | | Some u when mandatory -> (* should not happen *) 79 | Buffer.add_utf_8_uchar b u; buf_flush (); Format.pp_force_newline ppf () 80 | | Some u when Uucp.White.is_white_space u -> 81 | buf_flush (); Format.pp_print_break ppf 1 0; 82 | | Some u -> 83 | Buffer.add_utf_8_uchar b u; buf_flush (); Format.pp_print_cut ppf () 84 | in 85 | let gseg = Uuseg.create `Grapheme_cluster in 86 | let lseg = Uuseg.create `Line_break in 87 | let rec line_add a = match Uuseg.add lseg a with 88 | | `Uchar u -> buf_add u; line_add `Await 89 | | `Boundary -> 90 | let m = Uuseg.mandatory lseg in 91 | if (only_mandatory && m) || (not only_mandatory) then buf_cut m; 92 | line_add `Await 93 | | `Await | `End -> () 94 | in 95 | let rec add a = match Uuseg.add gseg a with 96 | | `Uchar _ as a -> line_add a; add `Await 97 | | `Boundary -> buf_flush (); add `Await 98 | | `Await -> () 99 | | `End -> line_add `End; () 100 | in 101 | let rec loop s i max = 102 | if i > max then add `End else 103 | let dec = String.get_utf_8_uchar s i in 104 | add (`Uchar (Uchar.utf_decode_uchar dec)); 105 | loop s (i + Uchar.utf_decode_length dec) max 106 | in 107 | loop s 0 (String.length s - 1) 108 | 109 | let pp_utf_8_lines = pp_utf_8_text ~only_mandatory:true 110 | let pp_utf_8_text = pp_utf_8_text ~only_mandatory:false 111 | -------------------------------------------------------------------------------- /src/uuseg_grapheme_cluster.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uuseg programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (* These are the rules as found in [1], with property values aliases [2] 7 | substituted. 8 | 9 | GB1. sot ÷ Any 10 | GB2. Any ÷ eot 11 | GB3. CR × LF 12 | GB4. (CN|CR|LF) ÷ 13 | GB5. ÷ (CN|CR|LF) 14 | GB6. L × (L|V|LV|LVT) 15 | GB7. (LV|V) × (V|T) 16 | GB8. (LVT|T) × T 17 | GB9. × (EX|ZWJ) 18 | GB9a. × SM 19 | GB9b. PP × 20 | GB9c. \p{InCB=Consonant} [\p{InCB=Extend}\p{InCB=Linker}]* 21 | \p{InCB=Linker} [\p{InCB=Extend}\p{InCB=Linker}]* 22 | × 23 | \p{InCB=Consonant} 24 | GB11. \p{Extended_Pictographic} EX* ZWJ x \p{Extended_Pictographic} 25 | GB12. sot (RI RI)* RI × RI 26 | GB13. [^RI] (RI RI)* × RI 27 | GB999. Any ÷ Any 28 | 29 | [1]: http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries 30 | [2]: http://www.unicode.org/Public/7.0.0/ucd/PropertyValueAliases.txt 31 | [3]: http://www.unicode.org/Public/7.0.0/ucd/auxiliary/GraphemeBreakTest.html 32 | 33 | By the structure of the rules we see that grapheme clusters 34 | boundaries can *mostly* be determined by simply looking at the 35 | grapheme cluster break property value of the character on the left 36 | and on the right of a boundary. The exceptions are GB9c, GB10 and GB12-13 37 | which are handled specially by enriching the segmenter state in 38 | a horribly ad-hoc fashion. *) 39 | 40 | type gcb = 41 | | CN | CR | EX | EB | EBG | EM | GAZ | L | LF | LV | LVT | PP | RI 42 | | SM | T | V | XX | ZWJ | Sot 43 | 44 | type incb = Consonant | Extend | Linker | None' 45 | 46 | (* WARNING. The indexes used here need to be synchronized with those 47 | assigned by uucp for Uucp.Break.Low.{grapheme_cluster,indic_conjunct_break} 48 | *) 49 | 50 | let byte_to_gcb = 51 | [| CN; CR; EX; EB; EBG; EM; GAZ; L; LF; LV; LVT; PP; RI; 52 | SM; T; V; XX; ZWJ; |] 53 | 54 | let gcb u = byte_to_gcb.(Uucp.Break.Low.grapheme_cluster u) 55 | 56 | let byte_to_incb = [| Consonant; Extend; Linker; None' |] 57 | let incb u = byte_to_incb.(Uucp.Break.Low.indic_conjunct_break u) 58 | 59 | type left_gb9c_state = (* Ad-hoc state for matching GB9c *) 60 | | Reset | Has_consonant | Has_linker 61 | 62 | type state = 63 | | Fill (* get next uchar to decide boundary. *) 64 | | Flush (* an uchar is buffered, client needs to get it out with `Await. *) 65 | | End (* `End was added. *) 66 | 67 | type t = 68 | { mutable state : state; (* current state. *) 69 | mutable left_gb9c : left_gb9c_state; (* state for matching gb9c. *) 70 | mutable left : gcb; (* break property value left of boundary. *) 71 | mutable left_odd_ri : bool; (* odd number of RI on the left. *) 72 | mutable left_emoji_seq : bool; (* emoji seq on the left. *) 73 | mutable buf : [ `Uchar of Uchar.t ] } (* bufferized add. *) 74 | 75 | let nul_buf = `Uchar (Uchar.unsafe_of_int 0x0000) 76 | 77 | let create () = 78 | { state = Fill; 79 | left_gb9c = Reset; 80 | left = Sot; left_odd_ri = false; left_emoji_seq = false; 81 | buf = nul_buf (* overwritten *); } 82 | 83 | let copy s = { s with state = s.state; } 84 | let equal = ( = ) 85 | 86 | let gb9c_match s right_incb = match s.left_gb9c, right_incb with 87 | | Has_linker, Consonant -> true 88 | | _, _ -> false 89 | 90 | let break s right right_incb right_u = match s.left, right with 91 | | (* GB1 *) Sot, _ -> true 92 | (* GB2 is handled by `End *) 93 | | (* GB3 *) CR, LF -> false 94 | | (* GB4 *) (CN|CR|LF), _ -> true 95 | | (* GB5 *) _, (CN|CR|LF) -> true 96 | | (* GB6 *) L, (L|V|LV|LVT) -> false 97 | | (* GB7 *) (LV|V), (V|T) -> false 98 | | (* GB8 *) (LVT|T), T -> false 99 | | (* GB9+a *) _, (EX|ZWJ|SM) -> false 100 | | (* GB9b *) PP, _ -> false 101 | | (* GB9c *) _, _ when gb9c_match s right_incb -> false 102 | | (* GB11 *) ZWJ, _ when s.left_emoji_seq && 103 | Uucp.Emoji.is_extended_pictographic right_u -> false 104 | | (* GB12+13 *) RI, RI when s.left_odd_ri -> false 105 | | (* GB999 *) _, _ -> true 106 | 107 | let update_left s right right_incb right_u = 108 | s.left <- right; 109 | begin match s.left with 110 | | EX | ZWJ -> 111 | s.left_odd_ri <- false 112 | (* keep s.left_emoji_seq as is *) 113 | | RI -> 114 | s.left_odd_ri <- not s.left_odd_ri; 115 | s.left_emoji_seq <- false; 116 | | _ when Uucp.Emoji.is_extended_pictographic right_u -> 117 | s.left_odd_ri <- false; 118 | s.left_emoji_seq <- true; 119 | | _ -> 120 | s.left_odd_ri <- false; 121 | s.left_emoji_seq <- false 122 | end; 123 | s.left_gb9c <- begin match right_incb with 124 | | None' -> Reset 125 | | Consonant -> Has_consonant 126 | | Linker when s.left_gb9c = Has_consonant -> Has_linker 127 | | Extend | Linker -> s.left_gb9c 128 | end 129 | 130 | let add s = function 131 | | `Uchar u as add -> 132 | begin match s.state with 133 | | Fill -> 134 | let right = gcb u in 135 | let right_incb = incb u in 136 | let break = break s right right_incb u in 137 | update_left s right right_incb u; 138 | if not break then add else 139 | (s.state <- Flush; s.buf <- add; `Boundary) 140 | | Flush -> Uuseg_base.err_exp_await add 141 | | End -> Uuseg_base.err_ended add 142 | end 143 | | `Await -> 144 | begin match s.state with 145 | | Flush -> s.state <- Fill; (s.buf :> Uuseg_base.ret) 146 | | End -> `End 147 | | Fill -> `Await 148 | end 149 | | `End -> 150 | begin match s.state with 151 | | Fill -> s.state <- End; if s.left = Sot then `End else `Boundary 152 | | Flush -> Uuseg_base.err_exp_await `End 153 | | End -> Uuseg_base.err_ended `End 154 | end 155 | -------------------------------------------------------------------------------- /test/usegtrip.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uuseg programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | let str = Printf.sprintf 7 | let pp = Format.fprintf 8 | let pp_pos ppf d = pp ppf "%d.%d:(%d,%06X) " 9 | (Uutf.decoder_line d) (Uutf.decoder_col d) (Uutf.decoder_count d) 10 | (Uutf.decoder_byte_count d) 11 | 12 | let pp_malformed ppf bs = 13 | let l = String.length bs in 14 | pp ppf "@[malformed bytes @[("; 15 | if l > 0 then pp ppf "%02X" (Char.code (bs.[0])); 16 | for i = 1 to l - 1 do pp ppf "@ %02X" (Char.code (bs.[i])) done; 17 | pp ppf ")@]@]" 18 | 19 | let exec = Filename.basename Sys.executable_name 20 | let log f = Format.eprintf ("%s: " ^^ f ^^ "@?") exec 21 | 22 | let input_malformed = ref false 23 | let log_malformed inf d bs = 24 | input_malformed := true; 25 | log "%s:%a: %a@." inf pp_pos d pp_malformed bs 26 | 27 | let u_rep = `Uchar Uutf.u_rep 28 | 29 | (* Output *) 30 | 31 | let uchar_ascii delim ppf = 32 | let last_was_u = ref false in 33 | function 34 | | `Uchar u -> 35 | if !last_was_u then (Format.pp_print_char ppf ' '); 36 | last_was_u := true; pp ppf "U+%04X" (Uchar.to_int u) 37 | | `Boundary -> 38 | last_was_u := false; pp ppf "%s" delim 39 | | `End -> () 40 | 41 | let uchar_encoder enc delim = 42 | let enc = match enc with 43 | | `ISO_8859_1 | `US_ASCII -> `UTF_8 44 | | #Uutf.encoding as enc -> enc 45 | in 46 | let delim = 47 | let add acc _ = function 48 | | `Uchar _ as u -> u :: acc 49 | | `Malformed bs -> 50 | log "delimiter: %a" pp_malformed bs; u_rep :: acc 51 | in 52 | List.rev (Uutf.String.fold_utf_8 add [] delim) 53 | in 54 | let e = Uutf.encoder enc (`Channel stdout) in 55 | function 56 | | `Uchar _ | `End as v -> ignore (Uutf.encode e v) 57 | | `Boundary -> List.iter (fun u -> ignore (Uutf.encode e u)) delim 58 | 59 | let out_fun delim ascii oe = 60 | if ascii then uchar_ascii delim Format.std_formatter else 61 | uchar_encoder oe delim 62 | 63 | (* Trip *) 64 | 65 | let segment boundary inf d first_dec out = 66 | let segmenter = Uuseg.create boundary in 67 | let rec add v = match Uuseg.add segmenter v with 68 | | `Uchar _ | `Boundary as v -> out v; add `Await 69 | | `Await | `End -> () 70 | in 71 | let rec loop d = function 72 | | `Uchar _ as v -> add v; loop d (Uutf.decode d) 73 | | `End as v -> add v; out `End 74 | | `Malformed bs -> log_malformed inf d bs; add u_rep; loop d (Uutf.decode d) 75 | | `Await -> assert false 76 | in 77 | if Uutf.decoder_removed_bom d then add (`Uchar Uutf.u_bom); 78 | loop d first_dec 79 | 80 | let trip seg inf enc delim ascii = 81 | try 82 | let ic = if inf = "-" then stdin else open_in inf in 83 | let d = Uutf.decoder ?encoding:enc (`Channel ic) in 84 | let first_dec = Uutf.decode d in (* guess encoding if needed. *) 85 | let out = out_fun delim ascii (Uutf.decoder_encoding d) in 86 | segment seg inf d first_dec out; 87 | if inf <> "-" then close_in ic; 88 | flush stdout 89 | with Sys_error e -> log "%s@." e; exit 1 90 | 91 | (* Version *) 92 | 93 | let unicode_version () = Format.printf "%s@." Uuseg.unicode_version 94 | 95 | (* Cmd *) 96 | 97 | let do_cmd cmd seg inf enc delim ascii = match cmd with 98 | | `Unicode_version -> unicode_version () 99 | | `Trip -> trip seg inf enc delim ascii 100 | 101 | (* Cmdline interface *) 102 | 103 | open Cmdliner 104 | 105 | let cmd = 106 | let doc = "Output supported Unicode version." in 107 | let unicode_version = `Unicode_version, Arg.info ["unicode-version"] ~doc in 108 | Arg.(value & vflag `Trip [unicode_version]) 109 | 110 | let seg_docs = "SEGMENTATION" 111 | let seg = 112 | let docs = seg_docs in 113 | let doc = "Line break opportunities boundaries." in 114 | let line = `Line_break, Arg.info ["l"; "line"] ~doc ~docs in 115 | let doc = "Grapheme cluster boundaries." in 116 | let gc = `Grapheme_cluster, Arg.info ["g"; "grapheme-cluster"] ~doc ~docs in 117 | let doc = "Word boundaries (default)." in 118 | let w = `Word, Arg.info ["w"; "word"] ~doc ~docs in 119 | let doc = "Sentence boundaries." in 120 | let s = `Sentence, Arg.info ["s"; "sentence"] ~doc ~docs in 121 | Arg.(value & vflag `Word [line; gc; w; s]) 122 | 123 | let file = 124 | let doc = "The input file. Reads from stdin if unspecified." in 125 | Arg.(value & pos 0 string "-" & info [] ~doc ~docv:"FILE") 126 | 127 | let enc = 128 | let enc = [ "UTF-8", `UTF_8; "UTF-16", `UTF_16; "UTF-16LE", `UTF_16LE; 129 | "UTF-16BE", `UTF_16BE; "ASCII", `US_ASCII; "latin1", `ISO_8859_1 ] 130 | in 131 | let doc = str "Input encoding, must %s. If unspecified the encoding is \ 132 | guessed. The output encoding is the same as the input \ 133 | encoding except for ASCII and latin1 where UTF-8 is output." 134 | (Arg.doc_alts_enum enc) 135 | in 136 | Arg.(value & opt (some (enum enc)) None & info [ "e"; "encoding" ] ~doc) 137 | 138 | let ascii = 139 | let doc = "Output the input text as space (U+0020) separated Unicode 140 | scalar values written in the US-ASCII charset." 141 | in 142 | Arg.(value & flag & info ["a"; "ascii"] ~doc) 143 | 144 | let delim = 145 | let doc = "The UTF-8 encoded delimiter used to denote boundaries." in 146 | Arg.(value & opt string "|" & Arg.info [ "d"; "delimiter" ] ~doc ~docv:"SEP") 147 | 148 | let cmd = 149 | let doc = "segment Unicode text" in 150 | let man = [ 151 | `S "DESCRIPTION"; 152 | `P "$(tname) inputs Unicode text from stdin and rewrites it 153 | to stdout with segment boundaries as determined according 154 | the locale independent specifications of UAX 29 and UAX 14. 155 | Boundaries are represented by the UTF-8 encoded delimiter string 156 | specified with the option $(b,-d) (defaults to `|')."; 157 | `P "Invalid byte sequences in the input are reported on stderr and 158 | replaced by the Unicode replacement character (U+FFFD) in the output."; 159 | `S seg_docs; 160 | `S "OPTIONS"; 161 | `S "EXIT STATUS"; 162 | `P "$(tname) exits with one of the following values:"; 163 | `I ("0", "no error occured"); 164 | `I ("1", "a command line parsing error occured"); 165 | `I ("2", "the input text was malformed"); 166 | `S "BUGS"; 167 | `P "This program is distributed with the Uuseg OCaml library. 168 | See http://erratique.ch/software/uuseg for contact 169 | information."; ] 170 | in 171 | Cmd.v (Cmd.info "usegtrip" ~version:"%%VERSION%%" ~doc ~man) 172 | Term.(const do_cmd $ cmd $ seg $ file $ enc $ delim $ ascii) 173 | 174 | let main () = match Cmd.eval cmd with 175 | | 0 -> if !input_malformed then exit 2 else exit 0 176 | | c when c = Cmd.Exit.cli_error -> exit 1 177 | | c -> exit c 178 | 179 | let () = if !Sys.interactive then () else main () 180 | -------------------------------------------------------------------------------- /src/uuseg_word.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uuseg programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (* These are the rules as found in [1], with property values aliases [2] 7 | substituted. 8 | 9 | WB1. sot ÷ Any 10 | WB2. Any ÷ eot 11 | WB3. CR × LF 12 | WB3a. (NL|CR|LF) ÷ 13 | WB3b. ÷ (NL|CR|LF) 14 | WB3c. ZWJ × \p{Extended_Pictographic} 15 | WB3d. WSegSpace × WSegSpace 16 | WB4. X (Extend|FO|ZWJ)* → X 17 | WB5. (LE|HL) × (LE|HL) 18 | WB6. (LE|HL) × (ML|MB|SQ) (LE|HL) 19 | WB7. (LE|HL) (ML|MB|SQ) × (LE|HL) 20 | WB7a. HL × SQ 21 | WB7b. HL × DQ HL 22 | WB7c. HL DQ × HL 23 | WB8. NU × NU 24 | WB9. (LE|HL) × NU 25 | WB10. NU × (LE|HL) 26 | WB11. NU (MN|MB|SQ) × NU 27 | WB12. NU × (MN|MB|SQ) NU 28 | WB13. KA × KA 29 | WB13a. (LE|HL|NU|KA|EX) × EX 30 | WB13b. EX × (LE|HL|NU|KA) 31 | WB15 sot (RI RI)* RI × RI 32 | WB15 [^RI] (RI RI)* RI × RI 33 | WB13c. RI × RI 34 | WB999. Any ÷ Any 35 | 36 | [1]: http://www.unicode.org/reports/tr29/#Word_boundaries 37 | [2]: http://www.unicode.org/Public/7.0.0/ucd/PropertyValueAliases.txt 38 | 39 | Given the structure of the rules we keep a window of four word 40 | break property value slots, two on the left, two on the right of a 41 | boundary and pattern match these slots to find the rule that 42 | applies. Because of WB4 these slots may actually correspond to more 43 | than one character and we need to bufferize the data for the slot r0. 44 | 45 | Besides we maintain two views of the window slots, one which has the 46 | word break property of concrete characters and another one that has 47 | the word break property as seen by the WB4 rewrite rule (for the 48 | right slots this coincides), see the *_wb4 fields in the state. 49 | 50 | ---??---> 51 | +----+----++----+----+ 52 | ...| l1 | l0 || r0 | r1 | 53 | +----+----++----+----+ 54 | already returned to client / \ buffered in segmenter *) 55 | 56 | type word = 57 | | CR | DQ | EX | EB | EBG | EM | Extend | FO | GAZ | HL | KA | LE | LF 58 | | MB | ML | MN | NL | NU | RI | SQ | WSegSpace | XX | ZWJ | Invalid | Sot | Eot 59 | 60 | (* WARNING. The indexes used here need to be synchronized with those 61 | assigned by uucp for Uucp.Break.Low.word. *) 62 | 63 | let byte_to_word = 64 | [| CR; DQ; EX; EB; EBG; EM; Extend; FO; GAZ; HL; KA; LE; LF; 65 | MB; ML; MN; NL; NU; RI; SQ; WSegSpace; XX; ZWJ |] 66 | 67 | let word u = byte_to_word.(Uucp.Break.Low.word u) 68 | 69 | type state = 70 | | Fill (* fill slots on the right of boundary. *) 71 | | Flush (* flush the first element of slot r0 to get to next boundary. *) 72 | | Decide (* decide boundary of slot r0 *) 73 | 74 | type t = 75 | { mutable state : state; 76 | mutable l1 : word; mutable l1_wb4 : word; (* l1 according to wb4 *) 77 | mutable l0 : word; mutable l0_wb4 : word; (* l0 according to wb4 *) 78 | mutable l0_odd_ri : bool; (* odd number of RI on left of break point. *) 79 | mutable r0 : word; (* of first element in r0_data. *) 80 | mutable r0_data : Uuseg_buf.t; (* data in r0 *) 81 | mutable r1 : word; 82 | mutable r1_data : [`Uchar of Uchar.t ] option; (* data in r1 (if any) *) 83 | mutable ended : bool; (* [true] if [`End] was added. *) } 84 | 85 | let create () = 86 | { state = Fill; 87 | l1 = Invalid; l1_wb4 = Invalid; 88 | l0 = Sot; l0_wb4 = Sot; 89 | l0_odd_ri = false; 90 | r0 = Invalid; 91 | r0_data = Uuseg_buf.create 13; 92 | r1 = Invalid; 93 | r1_data = None; 94 | ended = false; } 95 | 96 | let copy s = { s with r0_data = Uuseg_buf.copy s.r0_data; } 97 | let equal = ( = ) 98 | 99 | let has_break s = match s.l1, s.l0 (**),(**) s.r0, s.r1 with 100 | | (* WB1 *) _, Sot, _, _ -> true 101 | | (* WB2 *) _, _, Eot, _ -> true 102 | | (* WB3 *) _, CR, LF, _ -> false 103 | | (* WB3a *) _, (NL|CR|LF), _, _ -> true 104 | | (* WB3b *) _, _, (NL|CR|LF), _ -> true 105 | | (* WB3c *) _, ZWJ, _, _ when 106 | not (Uuseg_buf.empty s.r0_data) && 107 | Uucp.Emoji.is_extended_pictographic (Uuseg_buf.get_first s.r0_data) -> 108 | false 109 | | (* WB3d *) _, WSegSpace, WSegSpace, _ -> false 110 | | _ -> (* apply WB4 rewrite and match *) 111 | match s.l1_wb4, s.l0_wb4 (**),(**) s.r0, s.r1 with 112 | | (* WB4 *) _, _, (Extend|FO|ZWJ), _ -> false 113 | | (* WB5 *) _, (LE|HL), (LE|HL), _ -> false 114 | | (* WB6 *) _, (LE|HL), (ML|MB|SQ), (LE|HL) -> false 115 | | (* WB7 *) (LE|HL), (ML|MB|SQ), (LE|HL), _ -> false 116 | | (* WB7a *) _, HL, SQ, _ -> false 117 | | (* WB7b *) _, HL, DQ, HL -> false 118 | | (* WB7c *) HL, DQ, HL, _ -> false 119 | | (* WB8 *) _, NU, NU, _ -> false 120 | | (* WB9 *) _, (LE|HL), NU, _ -> false 121 | | (* WB10 *) _, NU, (LE|HL), _ -> false 122 | | (* WB11 *) NU, (MN|MB|SQ), NU, _ -> false 123 | | (* WB12 *) _, NU, (MN|MB|SQ), NU -> false 124 | | (* WB13 *) _, KA, KA, _ -> false 125 | | (* WB13a *) _, (LE|HL|NU|KA|EX), EX, _ -> false 126 | | (* WB13b *) _, EX, (LE|HL|NU|KA), _ -> false 127 | | (* WB15-16 *) _, RI, RI, _ when s.l0_odd_ri -> false 128 | | (* WB999 *) _, _, _, _ -> true 129 | 130 | let next s = (* moves to the next boundary and returns the char in r0 *) 131 | s.l1 <- s.l0; 132 | s.l0 <- s.r0; 133 | (* The lb4 window only moves when r0 is not one of the absorbed chars. *) 134 | begin match s.r0 with 135 | | Extend | FO | ZWJ -> () (* wb4 window doesn't move *) 136 | | _ -> 137 | s.l1_wb4 <- s.l0_wb4; 138 | s.l0_wb4 <- s.r0; 139 | s.l0_odd_ri <- (match s.l0_wb4 with RI -> not s.l0_odd_ri | _ -> false); 140 | end; 141 | let ret = Uuseg_buf.flush s.r0_data in 142 | match Uuseg_buf.empty s.r0_data with 143 | | false -> s.r0 <- word (Uuseg_buf.get_first s.r0_data); ret 144 | | true -> 145 | s.r0 <- s.r1; 146 | s.r1 <- Invalid; 147 | match s.r1_data with 148 | | None -> ret 149 | | Some u -> Uuseg_buf.add s.r0_data u; s.r1_data <- None; ret 150 | 151 | let need_fill s = Uuseg_buf.empty s.r0_data || s.r1_data = None 152 | let flush s = match s.ended with 153 | | false -> 154 | let ret = next s in 155 | (if need_fill s then s.state <- Fill else s.state <- Decide); 156 | ret 157 | | true -> 158 | match s.r0 with 159 | | Eot -> `End 160 | | _ -> s.state <- Decide; next s 161 | 162 | let decide s = if has_break s then (s.state <- Flush; `Boundary) else flush s 163 | 164 | let add s = function 165 | | `Uchar u as add -> 166 | if s.ended then Uuseg_base.err_ended add else 167 | begin match s.state with 168 | | Fill when Uuseg_buf.empty s.r0_data -> 169 | Uuseg_buf.add s.r0_data add; s.r0 <- word u; `Await 170 | | Fill -> 171 | begin match word u with 172 | | Extend | FO | ZWJ -> (* WB4 *) Uuseg_buf.add s.r0_data add; `Await 173 | | word -> s.r1_data <- Some add; s.r1 <- word; decide s 174 | end 175 | | Flush | Decide -> Uuseg_base.err_exp_await add 176 | end 177 | | `Await -> 178 | begin match s.state with 179 | | Flush -> flush s 180 | | Decide -> decide s 181 | | Fill -> `Await 182 | end 183 | | `End -> 184 | if s.ended then Uuseg_base.err_ended `End else 185 | begin match s.state with 186 | | Fill -> 187 | s.ended <- true; 188 | (if Uuseg_buf.empty s.r0_data then s.r0 <- Eot else s.r1 <- Eot); 189 | if s.l0 = Sot && s.r0 = Eot then (* empty string *) `End else 190 | decide s 191 | | Flush | Decide -> Uuseg_base.err_exp_await `End 192 | end 193 | -------------------------------------------------------------------------------- /test/test_uuseg.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uuseg programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (* Uuseg tests, including Unicode's Segmentation and Line break conformance 7 | tests. *) 8 | 9 | open B0_testing 10 | open B0_std 11 | 12 | let pp_boundary ppf = function 13 | | `Grapheme_cluster -> Format.fprintf ppf "Grapheme cluster" 14 | | `Word -> Format.fprintf ppf "Word" 15 | | `Sentence -> Format.fprintf ppf "Sentence" 16 | | `Line_break -> Format.fprintf ppf "Line break" 17 | 18 | let rec pp_spec ppf = function 19 | | [] -> () 20 | | `B :: spec -> Format.fprintf ppf "÷ "; pp_spec ppf spec 21 | | `U u :: spec -> 22 | Format.fprintf ppf "%04X " (Uchar.to_int u); 23 | (match spec with (`U _) :: _ -> Format.fprintf ppf "× " | _ -> ()); 24 | pp_spec ppf spec 25 | 26 | (* Conformance data decoding *) 27 | 28 | let cp_of_string v = (* parses a code point value. *) 29 | let is_hex c = (0x30 <= c && c <= 0x39) || (0x41 <= c && c <= 0x46) in 30 | let cp = ref 0 in 31 | for k = 0 to (String.length v) - 1 do 32 | let c = Char.code v.[k] in 33 | if not (is_hex c) then failwith v else 34 | cp := !cp * 16 + (if c <= 0x39 then c - 48 else c - 55) 35 | done; 36 | !cp 37 | 38 | let decode_conformance_specs ignores ic = 39 | let rec loop specs = 40 | match try Some (input_line ic) with End_of_file -> None with 41 | | None -> List.rev specs 42 | | Some l -> 43 | if String.length l > 0 && l.[0] = '#' then loop specs else 44 | try begin match String.split_on_char '#' l with 45 | | [comment] -> loop specs 46 | | test :: comment -> 47 | let spec = String.split_on_char ' ' test in 48 | begin try 49 | let rec to_spec acc = function 50 | | ( "\xC3\x97" (* × *) | "\xC3\x97\t" ) :: rest -> 51 | to_spec acc rest 52 | | ( "\xC3\xB7" (* ÷ *) | "\xC3\xB7\t") :: rest -> 53 | to_spec (`B :: acc) rest 54 | | uchar :: rest -> 55 | let u = cp_of_string uchar in 56 | if not (Uchar.is_valid u) then raise Exit else 57 | to_spec (`U (Uchar.of_int u) :: acc) rest 58 | | [] -> 59 | List.rev acc 60 | in 61 | let spec = to_spec [] spec in 62 | if ignores = [] then loop (spec :: specs) else 63 | try 64 | let reason = List.assoc spec ignores in 65 | Test.log "Skip test (%s): %s" reason test; 66 | loop (specs) 67 | with 68 | | Not_found -> loop (spec :: specs) 69 | with Exit -> 70 | Test.log 71 | "Skip test (surrogate not a scalar value): %s" test; 72 | loop specs 73 | end 74 | | [] -> failwith "" 75 | end 76 | with Failure f -> 77 | Test.fail "FAILURE: `%s'" f; 78 | Test.fail "Unable to parse line:\n`%s'\n" l; 79 | loop specs 80 | in 81 | loop [] 82 | 83 | let rec seq_of_spec acc = function 84 | | `U u :: rest -> seq_of_spec (u :: acc) rest 85 | | `B :: rest -> seq_of_spec acc rest 86 | | [] -> List.rev acc 87 | 88 | (* Conformance testing *) 89 | 90 | let test_spec ?__POS__ seg src spec = 91 | let n = Uuseg.create seg in 92 | let ended = ref false in 93 | let rec add acc v = match Uuseg.add n v with 94 | | `Uchar u -> add ((`U u) :: acc) `Await 95 | | `Boundary -> add (`B :: acc) `Await 96 | | `Await -> ended := false; acc 97 | | `End -> ended := true; acc 98 | in 99 | let add_uchar acc u = add acc (`Uchar u) in 100 | let nseq = List.rev (add (List.fold_left add_uchar [] src) `End) in 101 | if not !ended then 102 | Test.fail ?__POS__ "%a did not finish with `End." pp_boundary seg else 103 | if nseq <> spec then 104 | Test.fail ?__POS__ "@[%a mismatch:@,impl: %a@,spec: %a@]" 105 | pp_boundary seg pp_spec nseq pp_spec spec 106 | else Test.pass () 107 | 108 | let test_conformance seg name ignores inf = 109 | Test.test (Printf.sprintf "conformance of %s" name) @@ fun () -> 110 | let specs = 111 | try 112 | In_channel.with_open_bin (Fpath.to_string inf) @@ fun ic -> 113 | decode_conformance_specs ignores ic 114 | with Sys_error e -> Test.failstop "%s" e 115 | in 116 | let test spec = test_spec seg (seq_of_spec [] spec) spec in 117 | let fail ?__POS__ n ~assertions = 118 | Test.log_fail "%a assertions %a" 119 | Test.Fmt.fail_count_ratio (n, assertions) Test.Fmt.failed () 120 | in 121 | Test.block ~__POS__ ~fail @@ fun () -> 122 | List.iter test specs 123 | 124 | let test_others = 125 | let u = Uchar.of_int in 126 | Test.test "other specifications" @@ fun () -> 127 | let g = `Grapheme_cluster in 128 | let test_spec ?__POS__ seg src spec = 129 | ignore (test_spec ?__POS__ seg src spec) 130 | in 131 | test_spec ~__POS__ g [] []; 132 | test_spec ~__POS__ g [u 0x0020] [`B; `U (u 0x0020); `B;]; 133 | test_spec ~__POS__ g (* éa *) [u 0x0065; u 0x0301; u 0x0061;] 134 | [`B; `U (u 0x0065); `U (u 0x0301); `B; `U (u 0x0061); `B;]; 135 | let w = `Word in 136 | test_spec ~__POS__ w [] []; 137 | let s = `Sentence in 138 | test_spec ~__POS__ s [] []; 139 | let l = `Line_break in 140 | test_spec ~__POS__ l [] []; 141 | () 142 | 143 | let test_uuseg_string = 144 | Test.test "Uuseg_string" @@ fun () -> 145 | let rec pp_list ppf = function 146 | | [] -> () 147 | | s :: ss -> Format.fprintf ppf "%S;@ " s; pp_list ppf ss 148 | in 149 | let fold8 seg s = 150 | List.rev (Uuseg_string.fold_utf_8 seg (fun acc s -> s :: acc) [] s) 151 | in 152 | let test ?__POS__ l spec = 153 | if l = spec then () else 154 | Test.fail ?__POS__ 155 | "@[Mismatch:@,impl: @[[%a]@]@,spec: @[[%a]@]" pp_list l pp_list spec 156 | in 157 | test ~__POS__ (fold8 `Grapheme_cluster "") []; 158 | test ~__POS__ (fold8 `Grapheme_cluster "ab cd") ["a"; "b"; " "; "c"; "d"]; 159 | test ~__POS__ (fold8 `Word "") []; 160 | test ~__POS__ (fold8 `Word "ab cd") ["ab"; " "; "cd"]; 161 | test ~__POS__ (fold8 `Sentence "") []; 162 | test ~__POS__ (fold8 `Sentence "ab cd") ["ab cd"]; 163 | test ~__POS__ (fold8 `Line_break "") []; 164 | test ~__POS__ (fold8 `Line_break "ab cd") ["ab "; "cd"]; 165 | () 166 | 167 | let test_LB30b_assumption = 168 | Test.test "LB30b's data assumption" @@ fun () -> 169 | (* This is needed by our implementation of LB30b *) 170 | let rec loop u = 171 | let c = Uucp.Emoji.is_extended_pictographic u && 172 | Uucp.Gc.general_category u = `Cn 173 | in 174 | if c then begin 175 | if Uucp.Break.line u = `ID || 176 | Uucp.Break.line u = `XX (* Since Unicode 16.0.0 a few unassigned 177 | chars satisfy [c]. Those get transformed 178 | to AL *) 179 | then () else 180 | (Test.fail "LB30b assumption failure for U+%04X: %a" 181 | (Uchar.to_int u) Uucp.Break.pp_line (Uucp.Break.line u)) 182 | end; 183 | if Uchar.equal u Uchar.max then () else loop (Uchar.succ u) 184 | in 185 | loop Uchar.min 186 | 187 | let main () = 188 | let open Cmdliner in 189 | let g_file = 190 | Arg.(value & opt B0_std_cli.filepath (Fpath.v "test/GraphemeBreakTest.txt") 191 | & info ["g"] ~doc:"The GraphemeBreakTest.txt file") 192 | in 193 | let w_file = 194 | Arg.(value & opt B0_std_cli.filepath (Fpath.v "test/WordBreakTest.txt") 195 | & info ["w"] ~doc:"The WordBreakTest.txt file") 196 | in 197 | let s_file = 198 | Arg.(value & opt B0_std_cli.filepath (Fpath.v "test/SentenceBreakTest.txt") 199 | & info ["s"] ~doc:"The SentenceBreakTest.txt file") 200 | in 201 | let l_file = 202 | Arg.(value & opt B0_std_cli.filepath (Fpath.v "test/LineBreakTest.txt") 203 | & info ["lb"] ~doc:"The LineBreakTest.txt file") 204 | in 205 | let args = 206 | let open Term.Syntax in 207 | let+ g_file and+ w_file and+ s_file and+ l_file in 208 | (g_file, w_file, s_file, l_file) 209 | in 210 | let doc = "Run Unicode segmentation conformance tests" in 211 | Test.main' ~doc args @@ fun (g_file, w_file, s_file, l_file) -> 212 | test_LB30b_assumption (); 213 | test_conformance `Grapheme_cluster "grapheme cluster boundary" [] g_file (); 214 | test_conformance `Word "word boundary" [] w_file (); 215 | test_conformance `Sentence "sentence boundary" [] s_file (); 216 | test_conformance `Line_break "line break boundary" [] l_file (); 217 | test_others (); 218 | test_uuseg_string () 219 | 220 | let () = if !Sys.interactive then () else exit (main ()) 221 | -------------------------------------------------------------------------------- /src/uuseg_sentence.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uuseg programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (* These are the rules as found in [1], with property values aliases [2] 7 | substituted. 8 | 9 | SB1. sot ÷ Any 10 | SB2. Any ÷ eot 11 | SB3. CR × LF 12 | SB4. (SE|CR|LF) ÷ 13 | SB5. X (EX|FO)* → X 14 | SB6. AT × NU 15 | SB7. (UP|LO) AT × UP 16 | 17 | SB8. AT CL* SP* × (¬(LE|UP|LO|SE|CR|LF|ST|AT))* LO 18 | rewrite w.o. ¬ AT CL* SP* × (CL|NU|SC|SP|XX)* LO 19 | factorize AT CL* SP* × (NU|XX)? (CL|NU|SC|SP|XX)* LO 20 | SB8a. (ST|AT) CL* SP* × (SC|ST|AT) 21 | SB9. (ST|AT) CL* × (CL|SP|SE|CR|LF) 22 | SB10. (ST|AT) CL* SP* × (SP|SE|CR|LF) 23 | SB11. (ST|AT) CL* SP* (SE|CR|LF)? ÷ 24 | SB12. Any × Any 25 | 26 | 27 | [1]: http://www.unicode.org/reports/tr29/#Sentence_boundaries 28 | [2]: http://www.unicode.org/Public/7.0.0/ucd/PropertyValueAliases.txt 29 | 30 | Given the structure of the rules we keep a window of three break 31 | property value slots, two on the left, one on the right of a 32 | boundary and pattern match these slots to find the rule that 33 | applies. the rules SB8 to SB11 are handled separately, and we keep 34 | a buffer to handle the lookahead needed for SB8. 35 | ---??---> 36 | +----+----++----+-------... 37 | | l1 | l0 || r0 | SB8_buffer 38 | +----+----++----+-------... 39 | already returned to client / \ buffered in segmenter *) 40 | 41 | type sentence = 42 | | AT | CL | CR | EX | FO | LE | LF | LO | NU | SC | SE | SP | ST | UP | XX 43 | | Invalid | Sot | Eot 44 | 45 | (* WARNING. The indexes used here need to be synchronized with those 46 | assigned by uucp for Uucp.Break.Low.sentence. *) 47 | 48 | let byte_to_sentence = 49 | [| AT; CL; CR; EX; FO; LE; LF; LO; NU; SC; SE; SP; ST; UP; XX|] 50 | 51 | let sentence u = byte_to_sentence.(Uucp.Break.Low.sentence u) 52 | 53 | type state = 54 | | Fill (* fill slots on the right of boundary *) 55 | | Fill_CL_SP (* ad-hoc state to handle SB8 to SB11 *) 56 | | Fill_SB8 (* ad-hoc state to handle SB8 *) 57 | | Flush (* flush slot r0 to get to next boundary. *) 58 | | End (* `End was added. *) 59 | 60 | type t = 61 | { mutable state : state; (* current state. *) 62 | window : sentence array; (* break window. *) 63 | mutable l0 : int; (* index in [window] of [l0]. *) 64 | r0_buf : Uuseg_buf.t; (* buffer for r0. *) 65 | sb8_buf : Uuseg_buf.t; } (* buffer for resolving SB8. *) 66 | 67 | let create () = 68 | { state = Fill; 69 | window = [|Invalid; Sot; Invalid;|]; 70 | l0 = 1; 71 | r0_buf = Uuseg_buf.create 13; 72 | sb8_buf = Uuseg_buf.create 13; } 73 | 74 | let copy s = 75 | { s with window = Array.copy s.window; 76 | r0_buf = Uuseg_buf.copy s.r0_buf; 77 | sb8_buf = Uuseg_buf.copy s.sb8_buf; } 78 | 79 | let equal = ( = ) 80 | 81 | let l0_sentence s = s.window.(s.l0) 82 | let r0_sentence s = s.window.((s.l0 + 1) mod Array.length s.window) 83 | let r0_sentence_set s l = s.window.((s.l0 + 1) mod Array.length s.window) <- l 84 | let r0_add s add = Uuseg_buf.add s.r0_buf add 85 | let r0_empty s = Uuseg_buf.empty s.r0_buf 86 | let r0_len s = Uuseg_buf.len s.r0_buf 87 | let r0_flush s = Uuseg_buf.flush s.r0_buf 88 | let window_move s = 89 | s.l0 <- (s.l0 + 1) mod Array.length s.window; 90 | r0_sentence_set s Invalid 91 | 92 | (* WARNING. The code that follows is truly horrible, with more time a 93 | better way can certainly be found. *) 94 | 95 | let decide_sb8_sb11 s sentence (`Uchar _ as add) = 96 | (* AT or ST is in l0 and we have (AT|ST) CL* SP* and 97 | sentence <> CL | SP | EX | FO *) 98 | match sentence with 99 | | SC | ST | AT (* SB8a *) 100 | | SE | CR | LF (* SB9 SB10 *) -> 101 | s.state <- Flush; r0_sentence_set s sentence; add 102 | | LO when l0_sentence s = AT (* SB8 *) -> 103 | s.state <- Flush; r0_sentence_set s sentence; add 104 | | NU | XX when l0_sentence s = AT (* check SB8 *) -> 105 | s.state <- Fill_SB8; Uuseg_buf.add s.sb8_buf add; `Await 106 | | NU | XX | LE | UP | LO (* SB11 *) -> 107 | s.state <- Flush; r0_sentence_set s sentence; r0_add s add; `Boundary 108 | | CL | SP | EX | FO | Invalid | Sot | Eot -> 109 | assert false 110 | 111 | let decide s = 112 | let no_boundary s = r0_flush s in 113 | let wlen = Array.length s.window in 114 | let l0 = s.l0 in 115 | let r0 = (l0 + 1) mod wlen in 116 | let l1 = (l0 + 2) mod wlen in 117 | let w = s.window in 118 | match w.(l1), w.(l0) (**),(**) w.(r0) with 119 | | (* SB1 *) _, Sot, _ -> `Boundary 120 | (* SB2 is handled in [add]. *) 121 | | (* SB3 *) _, CR, LF -> no_boundary s 122 | | (* SB4 *) _, (SE|CR|LF), _ -> `Boundary 123 | (* SB5 is handled in [add]. *) 124 | | (* SB6 *) _, AT, NU -> no_boundary s 125 | | (* SB7 *) (UP|LO), AT, UP -> no_boundary s 126 | | (* SB8-SB11 is also handled in [add]. *) 127 | _, (AT|ST), sentence -> decide_sb8_sb11 s sentence (r0_flush s) 128 | | (* SB12 *) _, _, _ -> no_boundary s 129 | 130 | let rec add s = function 131 | | `Uchar u as addv -> 132 | begin match s.state with 133 | | Fill -> 134 | begin match sentence u with 135 | | EX | FO as sentence -> 136 | begin match l0_sentence s with 137 | | SE | CR | LF | Sot -> 138 | s.state <- Flush; r0_sentence_set s sentence; r0_add s addv; 139 | decide s 140 | | _ -> addv (* SB5 *) 141 | end 142 | | CL | SP as sentence -> 143 | begin match l0_sentence s with 144 | | AT | ST -> 145 | s.state <- Fill_CL_SP; r0_sentence_set s sentence; 146 | addv (* SB9, SB10 *) 147 | | _ -> 148 | s.state <- Flush; r0_sentence_set s sentence; r0_add s addv; 149 | decide s 150 | end 151 | | sentence -> 152 | s.state <- Flush; r0_sentence_set s sentence; r0_add s addv; 153 | decide s 154 | end 155 | | Fill_CL_SP (* we have AT or ST in l0 *) -> 156 | begin match sentence u with 157 | | EX | FO -> addv (* SB5 *) 158 | | CL -> 159 | begin match r0_sentence s with 160 | | CL -> addv (* SB9 (or eventually SB8) *) 161 | | SP -> 162 | begin match l0_sentence s with 163 | | ST -> (* SB11 *) 164 | s.state <- Flush; r0_sentence_set s CL; r0_add s addv; 165 | `Boundary 166 | | AT -> (* check SB8 *) 167 | s.state <- Fill_SB8; Uuseg_buf.add s.sb8_buf addv; `Await 168 | | _ -> assert false 169 | end 170 | | _ -> assert false 171 | end 172 | | SP -> 173 | begin match r0_sentence s with 174 | | CL -> r0_sentence_set s SP; addv (* SB10 (or eventually SB8 *) 175 | | SP -> addv 176 | | _ -> assert false 177 | end 178 | | sentence -> 179 | begin match r0_sentence s with 180 | | CL | SP -> decide_sb8_sb11 s sentence addv 181 | |_ -> assert false 182 | end 183 | end 184 | | Fill_SB8 -> 185 | begin match sentence u with 186 | | EX | FO (* SB5 *) 187 | | CL | NU | SC | SP | XX -> Uuseg_buf.add s.sb8_buf addv; `Await 188 | | LO -> (* SB8 *) 189 | Uuseg_buf.add s.sb8_buf addv; s.state <- Flush; add s `Await 190 | | _ -> (* SB11 *) 191 | Uuseg_buf.add s.sb8_buf addv; s.state <- Flush; `Boundary 192 | end 193 | | Flush -> Uuseg_base.err_exp_await addv 194 | | End -> Uuseg_base.err_ended addv 195 | end 196 | | `Await -> 197 | begin match s.state with 198 | | Flush -> 199 | if not (r0_empty s) then r0_flush s else 200 | begin 201 | match Uuseg_buf.len s.sb8_buf with 202 | | 0 -> s.state <- Fill; window_move s; `Await 203 | | 1 -> 204 | let u = Uuseg_buf.flush s.sb8_buf in 205 | s.state <- Fill; window_move s; add s u 206 | | _ -> 207 | let `Uchar uc as u = Uuseg_buf.flush s.sb8_buf in 208 | r0_sentence_set s (sentence uc); u 209 | end 210 | | End -> 211 | if not (Uuseg_buf.empty s.sb8_buf) 212 | then 213 | (* According to SB8 we bufferized only CL|NU|SC|SP|XX 214 | given the rules no new boundary will occur except the one 215 | due to SB2. So we just flush the sb8_buf. *) 216 | Uuseg_buf.flush s.sb8_buf 217 | else 218 | if r0_sentence s = Eot 219 | then (r0_sentence_set s Invalid; `Boundary (* SB2 *)) 220 | else `End 221 | | Fill | Fill_CL_SP | Fill_SB8 -> `Await 222 | end 223 | | `End -> 224 | begin match s.state with 225 | | Fill -> 226 | s.state <- End; 227 | if l0_sentence s = Sot then `End (* No boundary on empty seq *) else 228 | `Boundary (* SB1 and SB2 *) 229 | | Fill_CL_SP -> s.state <- End; `Boundary (* SB2 *) 230 | | Fill_SB8 -> s.state <- End; r0_sentence_set s Eot; `Boundary (* SB11 *) 231 | | Flush -> Uuseg_base.err_exp_await `End 232 | | End -> Uuseg_base.err_ended `End 233 | end 234 | -------------------------------------------------------------------------------- /src/uuseg.mli: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uuseg programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (** Unicode text segmentation. 7 | 8 | [Uuseg] segments Unicode text. It implements the locale 9 | independent Unicode text segmentation algorithms to detect 10 | grapheme cluster, word and sentence boundaries and the Unicode 11 | line breaking algorithm to detect line break opportunities. 12 | 13 | The module is independent from any IO mechanism or Unicode text 14 | data structure and it can process text without a complete 15 | in-memory representation. 16 | 17 | The supported Unicode version 18 | is determined by the {!unicode_version} value. 19 | 20 | Consult the {{!basics}basics}, {{!limits}limitations} and 21 | {{!examples}examples} of use. 22 | 23 | {3 References} 24 | {ul 25 | {- 26 | {e {{:http://www.unicode.org/versions/latest}The Unicode Standard}}. 27 | (latest version)} 28 | {- 29 | {e {{:http://www.unicode.org/reports/tr29/}UAX #29 Unicode Text 30 | Segmentation}}. (latest version)} 31 | {- 32 | {e {{:http://www.unicode.org/reports/tr14/}UAX #14 Unicode Line Breaking 33 | Algorithm}}. (latest version)} 34 | {- Web based {{:http://unicode.org/cldr/utility/breaks.jsp}ICU 35 | break utility}.}} 36 | *) 37 | 38 | (** {1 Segment} *) 39 | 40 | val unicode_version : string 41 | (** [unicode_version] is the Unicode version supported by [Uuseg]. *) 42 | 43 | type custom 44 | (** The type for custom segmenters. See {!val:custom}. *) 45 | 46 | type boundary = 47 | [ `Grapheme_cluster 48 | (** {{:http://www.unicode.org/glossary/#extended_grapheme_cluster} 49 | Extended grapheme clusters} according to 50 | {{:https://www.unicode.org/reports/tr29/#C1-1}UAX29-C1-1} *) 51 | | `Word 52 | (** Words according to 53 | {{:https://www.unicode.org/reports/tr29/#C2-1}UAX29-C2-1} *) 54 | | `Sentence 55 | (** Sentences according to 56 | {{:https://www.unicode.org/reports/tr29/#C3-1}UAX29-C3-1} *) 57 | | `Line_break 58 | (** Line breaks accordings to 59 | {{:http://www.unicode.org/reports/tr14/}UAX #14} *) 60 | | `Custom of custom ] 61 | (** The type for boundaries. *) 62 | 63 | val pp_boundary : Format.formatter -> boundary -> unit 64 | (** [pp_boundary ppf b] prints an unspecified representation of [b] 65 | on [ppf]. *) 66 | 67 | type t 68 | (** The type for Unicode text segmenters. *) 69 | 70 | type ret = [ `Boundary | `Uchar of Uchar.t | `Await | `End ] 71 | (** The type for segmenter results. See {!add}. *) 72 | 73 | val create : [< boundary ] -> t 74 | (** [create b] is an Unicode text segmenter for boundaries of type [b]. *) 75 | 76 | val boundary : t -> boundary 77 | (** [boundary s] is the type of boundaries detected by [s]. *) 78 | 79 | val add : t -> [ `Uchar of Uchar.t | `Await | `End ] -> ret 80 | (** [add s v] is: 81 | {ul 82 | {- [`Boundary] if there is a boundary at that point in the sequence of 83 | characters. The client must then call [add] with [`Await] 84 | until [`Await] is returned.} 85 | {- [`Uchar u] if [u] is the next character in the sequence. 86 | The client must then call [add] with [`Await] until [`Await] is 87 | returned.} 88 | {- [`Await] when the segmenter is ready to add a new [`Uchar] 89 | or [`End].} 90 | {- [`End] when [`End] was added and all [`Boundary] and [`Uchar] were 91 | output.}} 92 | 93 | For [v] use [`Uchar u] to add a new character to the sequence to 94 | segment and [`End] to signal the end of sequence. After adding one 95 | of these two values always call [add] with [`Await] until [`Await] 96 | or [`End] is returned. 97 | 98 | @raise Invalid_argument if [`Uchar] or [`End] is added while 99 | that last add did not return [`Await] or if an [`Uchar] or [`End] 100 | is added after an [`End] was already added. *) 101 | 102 | val mandatory : t -> bool 103 | (** [mandatory s] is [true] if the last [`Boundary] returned by {!add} 104 | was mandatory. This function only makes sense for [`Line_break] 105 | segmenters or [`Custom] segmenters that sport that notion. For 106 | other segmenters or if no [`Boundary] was returned so far, [true] 107 | is returned. *) 108 | 109 | val copy : t -> t 110 | (** [copy s] is a copy of [s] in its current state. Subsequent {!add}s on 111 | [s] do not affect the copy. *) 112 | 113 | val equal : t -> t -> bool 114 | (** [equal s0 s1] is [true] iff [s0] and [s1] are in the same state, 115 | that is any sequence of {!add}s made on both [s0] and [s1] produce 116 | the same outputs. 117 | 118 | @raise Invalid_argument on {{!custom}custom segmenters}. *) 119 | 120 | val pp_ret : Format.formatter -> [< ret] -> unit 121 | (** [pp_ret ppf v] prints an unspecified representation of [v] on [ppf]. *) 122 | 123 | (** {1:custom Custom segmenters} *) 124 | 125 | val custom : 126 | ?mandatory:('a -> bool) -> 127 | name:string -> 128 | create:(unit -> 'a) -> 129 | copy:('a -> 'a) -> 130 | add: ('a -> [ `Uchar of Uchar.t | `Await | `End ] -> ret) -> unit -> custom 131 | (** [create ~mandatory ~name ~create ~copy ~add] is a custom segmenter. 132 | {ul 133 | {- [name] is a name to identify the segmenter.} 134 | {- [create] is called when the segmenter is {{!create}created} 135 | it should return a custom segmenter value.} 136 | {- [copy] is called with the segmenter value whenever the 137 | segmenter is {{!copy}copied}. It should return a copy of the 138 | segmenter value.} 139 | {- [mandatory] is called with the segmenter value to define 140 | the result of the {!mandatory} function. Defaults always 141 | returns [true].} 142 | {- [add] is called with the segmenter value to define 143 | the result of the {!add} value. The returned value 144 | should respect the semantics of {!add}. Use the functions 145 | {!err_exp_await} and {!err_ended} to raise [Invalid_argument] 146 | exception in {!add}s error cases.}} *) 147 | 148 | val err_exp_await : [< ret] -> 'a 149 | (** [err_exp_await fnd] should be used by custom segmenters when 150 | the client tries to {!add} an [`Uchar] or [`End] while the last 151 | returned value was not an [`Await]. *) 152 | 153 | val err_ended : [< ret] -> 'a 154 | (** [err_ended ()] should be used by custom segmenter when the client 155 | tries to {!add} [`Uchar] or [`End] after [`End] was already added. *) 156 | 157 | (** {1:limits Limitations} 158 | 159 | A [`Grapheme_cluster] segmenter will always consume only a small 160 | bounded amount of memory on any text. Other segmenters will also 161 | do so on non-degenerate text, but it's possible to feed them with 162 | input that will make them buffer an arbitrary amount of 163 | characters. *) 164 | 165 | (** {1:basics Basics} 166 | 167 | A segmenter is a stateful filter that inputs a sequence of characters 168 | and outputs the same sequence except characters are interleaved 169 | with [`Boundary] values whenever the segmenter detects a boundary. 170 | 171 | The function {!create} returns a new segmenter for a given boundary 172 | type: 173 | {[ 174 | let words = Uuseg.create `Word 175 | ]} 176 | To add characters to the sequence to segment, call {!add} on 177 | [words] with [`Uchar _]. To end the sequence call {!add} on [words] 178 | with [`End]. The segmented sequence of characters is returned character 179 | by character, interleaved with [`Boundary] values at the appropriate 180 | places, by the successive calls to {!add}. 181 | 182 | The client and the segmenter must wait on each other to limit 183 | internal buffering: each time the client adds to the sequence 184 | by calling {!add} with [`Uchar] or [`End] it must continue to 185 | call {!add} with [`Await] until the segmenter returns [`Await] 186 | or [`End]. In practice this leads to the following kind of control flow: 187 | {[ 188 | let rec add acc v = match Uuseg.add words v with 189 | | `Uchar u -> add (`Uchar u :: acc) `Await 190 | | `Boundary -> add (`B :: acc) `Await 191 | | `Await | `End -> acc 192 | ]} 193 | For example to segment the sequence <[U+0041], [U+0020], [U+0042]> 194 | (["a b"]) to a list of characters interleaved with [`B] values on word 195 | boundaries we can write: 196 | {[ 197 | let uchar = `Uchar (Uchar.of_int u) 198 | let seq = [uchar 0x0041; uchar 0x0020; uchar 0x0042] 199 | let seq_words = List.rev (add (List.fold_left add [] seq) `End) 200 | ]} 201 | *) 202 | 203 | (** {1:examples Examples} 204 | 205 | [utf_8_segments seg s] is the list of UTF-8 encoded [seg] segments of 206 | the UTF-8 encoded string [s]. 207 | 208 | {[ 209 | let utf_8_segments seg s = 210 | let flush_segment buf acc = 211 | let segment = Buffer.contents buf in 212 | Buffer.clear buf; if segment = "" then acc else segment :: acc 213 | in 214 | let rec add buf acc segmenter v = match Uuseg.add segmenter v with 215 | | `Uchar u -> Buffer.add_utf_8_uchar buf u; add buf acc segmenter `Await 216 | | `Boundary -> add buf (flush_segment buf acc) segmenter `Await 217 | | `Await | `End -> acc 218 | in 219 | let rec loop buf acc s i max segmenter = 220 | if i > max then flush_segment buf (add buf acc segmenter `End) else 221 | let dec = String.get_utf_8_uchar s i in 222 | let acc = add buf acc segmenter (`Uchar (Uchar.utf_decode_uchar dec)) in 223 | loop buf acc s (i + Uchar.utf_decode_length dec) max segmenter 224 | in 225 | let buf = Buffer.create 42 in 226 | let segmenter = Uuseg.create seg in 227 | List.rev (loop buf [] s 0 (String.length s - 1) segmenter) 228 | ]} 229 | 230 | Note that this function can be derived directly from 231 | {!Uuseg_string.fold_utf_8}. *) 232 | -------------------------------------------------------------------------------- /src/uuseg_line_break.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2014 The uuseg programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (* These are the rules as found in [1] 7 | LB1 (per suggestion) 8 | (AI|SG|XX) → AL 9 | SA when (Mn|Mc) → CM 10 | SA → AL 11 | CJ → NS 12 | LB2 sot × 13 | LB3 ! eot 14 | LB4 BK ! 15 | LB5 CR × LF 16 | (CR|LF|NL) ! 17 | LB6 × (BK|CR|LF|NL) 18 | LB7 × (SP|ZW) 19 | LB8 ZW SP* ÷ 20 | LB8a ZWJ × 21 | LB9 ¬(BK|CR|LF|NL|SP|ZW as X) (CM|ZWJ) * → X 22 | LB10 (CM|ZWJ) → AL 23 | LB11 × WJ 24 | WJ × 25 | LB12 GL × 26 | LB12a ¬(SP|BA|HY|HH) × GL 27 | LB13 × (CL|CP|EX|SY) 28 | LB14 (OP|OP30) SP* × 29 | LB15 QU SP* × (OP|OP30) 30 | LB15a (sot|BK|CR|LF|NL|OP|QU|QU_Pi|QU_Pf|GL|SP|ZW) QU_Pi SP* × 31 | LB15b × QU_Pf (SP|GL|WJ|CL|QU|CP|EX|IS|SY|BK|CR|LF|NL|ZW|eot) 32 | LB15c SP ÷ IS NU 33 | LB15d × IS 34 | LB16 (CL|CP) SP* × NS 35 | LB17 B2 SP* × B2 36 | LB18 SP ÷ 37 | LB19 × (QU|QU_Pf) 38 | (QU|QU_Pi) × 39 | LB19a ¬EastAsian × QU_Pi 40 | × QU_Pi Eot 41 | × QU_Pi ¬EastAsian 42 | QU_Pf × ¬EastAsian 43 | Sot QU_Pf × 44 | ¬EastAsian QU_Pf × 45 | LB20 ÷ CB 46 | CB ÷ 47 | LB20a (sot|BK|CR|LF|NL|SP|ZW|CB|GL) (HY|HH) × (AL|AL_30b|AL_circ|HL) 48 | LB21 × (BA|HH|HY|NS) 49 | BB × 50 | LB21a HL (HY|HH) × ¬(HL) 51 | LB21b SY × HL 52 | LB22 × IN 53 | LB23 (AL|AL_30b|AL_circ|HL) × NU 54 | NU × (AL|AL_30b|AL_circ|HL) 55 | LB23a PR × (ID|ID_30b|EB|EM) 56 | (ID|ID_30b|EB|EM) × PO 57 | LB24 (PR|PO) × (AL|AL_30b|AL_circ|HL) 58 | (AL|AL_30b|AL_circ|HL) × (PR|PO) 59 | LB25 NU (SY|IS)* (CL|CP) × (PO|PR) 60 | NU (SY |IS)* × (PO|PR|NU) 61 | (PO|PR) × OP NU 62 | (PO|PR) × OP IS NU 63 | (PR|HY|IS|PO) × NU 64 | LB26 JL × (JL|JV|H2|H3) 65 | (JV|H2) × (JV|JT) 66 | (JT|H3) × JT 67 | LB27 (JL|JV|JT|H2|H3) × (IN|PO) 68 | PR × (JL|JV|JT|H2|H3) 69 | LB28 (AL|AL_30b|AL_circ|HL) × (AL|AL_30b|AL_circ|HL) 70 | LB28a AP × (AK|AL_circ|AS) 71 | (AK|AL_circ|AS) × (VF|VI) 72 | (AK|AL_circ|AS) VI × (AK|AL_circ) 73 | (AK|AL_circ|AS) × (AK|AL_circ|AS) VF 74 | LB29 IS × (AL|HL) 75 | LB30 (AL|AL_30b|AL_circ|HL|NU) × OP-EastAsian 76 | CP-EastAsian × (AL|AL_30b|AL_circ|HL|NU) 77 | LB30a sot (RI RI)* RI × RI 78 | [^RI] (RI RI)* RI × RI 79 | LB30b EB × EM 80 | ID_30b x EM 81 | LB31 ALL ÷ 82 | ÷ ALL 83 | 84 | [1]: http://www.unicode.org/reports/tr14/#Algorithm 85 | [2]: http://www.unicode.org/Public/7.0.0/ucd/PropertyValueAliases.txt 86 | 87 | Given the structure of the rules we keep a window of three line 88 | break property value slots, three on the left, two on the right of a 89 | boundary and pattern match these slots to find the rule that 90 | applies. Because of LB9 these slots may actually correspond to more 91 | than one character and we need to bufferize the data for the slots 92 | on the right. 93 | 94 | Besides we maintain two views of the window slots, one which has 95 | the word berak property of concrete characters and another one 96 | that has the word break property as seen by the LB9 rule and 97 | those that have SP* elements. 98 | 99 | 100 | ---??---> 101 | +----+----+----++----+----+ 102 | ... | l2 | l1 | l0 || r0 | r1 | 103 | +----+----+----++----+----+ 104 | already returned to client / \ buffered in segmenter *) 105 | 106 | type line = 107 | | AI | AK | AL | AP | AS | B2 | BA | BB | BK | CB | CJ | CL | CM | CP 108 | | CR | EX | EB | EM | GL | H2 | H3 | HH | HL | HY | ID | IN 109 | | IS | JL | JT | JV | LF | NL | NS | NU | OP | PO | PR 110 | | QU | RI | SA | SG | SP | SY | VF | VI | WJ | XX | ZW | ZWJ | Invalid | Sot 111 | | Eot 112 | | AL_circ (* Added to handle the U+255C constant in LB28a. We need to split 113 | AL (the class of U+255C), the full set is AL + AL_circ + 114 | AL_30b (see below) *) 115 | | QU_Pf (* Added to handle LB15{a,b} and LB19. We need to split QU 116 | (the full set is QU + QU_Pf + QU_Pi *) 117 | | QU_Pi 118 | (* Added to handle LB30b *) 119 | | AL_30b 120 | | ID_30b 121 | 122 | (* WARNING. The indexes used here need to be synchronized with those 123 | assigned by uucp for Uucp.Break.Low.line_break. *) 124 | 125 | let byte_to_line = 126 | [| AL (* LB1 AI → AL *); AK; AL; AP; AS; B2; BA; BB; BK; CB; 127 | NS (* LB1 CJ → NS *); CL; 128 | CM; CP; CR; EX; EB; EM; GL; H2; H3; HH; HL; HY; ID; IN; IS; JL; JT; JV; LF; 129 | NL; NS; NU; OP; PO; PR; QU; RI; SA; AL (* LB1 SG → AL *); SP; SY; VF; VI; 130 | WJ; XX; ZW; ZWJ |] 131 | 132 | let eastasian (`Uchar u) = match Uucp.Break.east_asian_width u with 133 | | `F | `W | `H -> true | _ -> false 134 | 135 | let line u = match byte_to_line.(Uucp.Break.Low.line u) with 136 | | SA -> (* LB1 for SA *) 137 | begin match Uucp.Gc.general_category u with 138 | | `Mn | `Mc -> CM 139 | | _ -> AL 140 | end 141 | | ID -> (* Decompose because of LB30b, this assumption is tested in test.ml *) 142 | if Uucp.Emoji.is_extended_pictographic u && 143 | Uucp.Gc.general_category u = `Cn then ID_30b else ID 144 | | AL -> (* Decompose because of LB28a *) 145 | if Uchar.to_int u = 0x25CC then AL_circ else AL 146 | | XX -> 147 | (* Some unassigned characters match the second rule of LB30b *) 148 | if Uucp.Emoji.is_extended_pictographic u && 149 | Uucp.Gc.general_category u = `Cn 150 | then AL_30b 151 | else AL (* LB1 XX → AL *) 152 | | QU -> (* Decompose because of LB15{a,b} *) 153 | begin match Uucp.Gc.general_category u with 154 | | `Pf -> QU_Pf 155 | | `Pi -> QU_Pi 156 | | _ -> QU 157 | end 158 | | l -> l 159 | 160 | type state = 161 | | Fill (* fill slot on the right of boundary. *) 162 | | Flush (* flush the first lement of slot r0 to get to next boundary. *) 163 | | Decide (* decide boundary of slot r0. *) 164 | 165 | (* XXX with the time the window has enlarged quite a bit, one day 166 | rewrite that with circular buffers. *) 167 | 168 | type t = 169 | { mutable state : state; (* current state. *) 170 | mutable l2 : line; mutable l2_rewrite : line; (* l2 according to LB9/LB10 *) 171 | mutable l1 : line; mutable l1_rewrite : line; (* l1 according to LB9/LB10 *) 172 | mutable l1_data : [`Uchar of Uchar.t ]; (* data in l1 *) 173 | mutable l0 : line; mutable l0_rewrite : line; (* l0 according to LB9/LB10 *) 174 | mutable l0_data : [`Uchar of Uchar.t ]; (* data in l0 *) 175 | mutable l0_odd_ri : bool; (* odd number of RI on the left of break point. *) 176 | mutable lb25_nu : bool; (* NU (SI|IS)* on l0 or l1 (for lb25) *) 177 | mutable r0 : line; (* of element in r0_data *) 178 | mutable r0_data : [`Uchar of Uchar.t ]; (* data in r0 *) 179 | mutable r1 : line; (* of element in r1_data *) 180 | mutable r1_data : [`Uchar of Uchar.t ]; (* data in r1 *) 181 | mutable r2 : line; (* of element in r2_data *) 182 | mutable r2_data : [`Uchar of Uchar.t ]; (* data in r2 *) 183 | mutable mandatory : bool; (* [true] if break is mandatory. *) 184 | mutable ended : bool; (* [true] if [`End was added]. *) } 185 | 186 | let nul_buf = `Uchar (Uchar.unsafe_of_int 0x0000) 187 | let create () = 188 | { state = Fill; 189 | l2 = Invalid; l2_rewrite = Invalid; 190 | l1 = Invalid; l1_rewrite = Invalid; l1_data = nul_buf; 191 | l0 = Sot; l0_rewrite = Sot; l0_data = nul_buf; 192 | l0_odd_ri = false; 193 | lb25_nu = false; 194 | r0 = Invalid; 195 | r0_data = nul_buf (* overwritten *); 196 | r1 = Invalid; 197 | r1_data = nul_buf (* overwritten *); 198 | r2 = Invalid; 199 | r2_data = nul_buf (* overwritten *); 200 | mandatory = false; 201 | ended = false } 202 | 203 | let mandatory s = s.mandatory 204 | let copy s = { s with state = s.state } 205 | let equal = ( = ) 206 | 207 | let lb10_rewrite = function CM | ZWJ -> AL | l -> l 208 | let is_lb9_X = function BK | CR | LF | NL | SP | ZW | Sot -> false | _ -> true 209 | let is_lb12_l0 = function SP | BA | HY | HH -> false | _ -> true 210 | 211 | let has_break s = (* N.B. sets s.mandatory by side effect. *) 212 | let mandatory s = s.mandatory <- true; true in 213 | s.mandatory <- false; 214 | (* NB. s.l2 and s.r1 are not needed here. *) 215 | match s.l1, s.l0 (**),(**) s.r0 with 216 | (* LB1 is handled by [byte_to_line] and [line]. *) 217 | | (* LB2 *) _, Sot, _ -> false 218 | | (* LB3 *) _, _, Eot -> mandatory s 219 | | (* LB4 *) _, BK, _ -> mandatory s 220 | | (* LB5 *) _, CR, LF -> false 221 | | _, (CR|LF|NL), _ -> mandatory s 222 | | (* LB6 *) _, _, (BK|CR|LF|NL) -> false 223 | | (* LB7 *) _, _, (SP|ZW) -> false 224 | | (* LB8 *) _, ZW, _ -> true 225 | | _(* ZW *), _(* SP* *), _ when s.l1_rewrite = ZW && 226 | s.l0_rewrite = SP -> true 227 | | (* LB8a *) _, ZWJ, _ -> false 228 | | (* LB9 implicitely entails ¬(BK|CR|LF|NL|SP|ZW as X) × (CM|ZWJ) *) 229 | _, x, (CM|ZWJ) when is_lb9_X s.l0_rewrite -> false 230 | | _ -> (* apply LB9/LB10 rewrite and match *) 231 | let l2m = lb10_rewrite s.l2_rewrite in 232 | let l1m = lb10_rewrite s.l1_rewrite in 233 | let l0m = lb10_rewrite s.l0_rewrite in 234 | let r0m = lb10_rewrite s.r0 in 235 | let r1m = lb10_rewrite s.r1 in 236 | let r2m = lb10_rewrite s.r2 in 237 | match l2m, l1m , l0m, (**) r0m, r1m, r2m with 238 | | (* LB11 *) _, _, _, 239 | WJ, _, _ -> false 240 | | _, _, WJ, 241 | _, _, _ -> false 242 | | (* LB12 *) _, _, GL, 243 | _, _, _ -> false 244 | | (* LB12a *) _, _, l0, 245 | GL, _, _ when is_lb12_l0 l0 -> false 246 | | (* LB13 *) _, _, _, 247 | (CL|CP|EX|SY), _, _ -> false 248 | | (* LB14 *) _, _, OP, 249 | _, _, _ -> false 250 | | _, OP, SP, 251 | _, _, _ -> false 252 | | (* LB15a *) _, (Sot|BK|CR|LF|NL|OP|QU|QU_Pi|QU_Pf|GL|SP|ZW), QU_Pi, 253 | _, _, _ -> false 254 | | (Sot|BK|CR|LF|NL|OP|QU|QU_Pi|QU_Pf|GL|SP|ZW), QU_Pi, SP, 255 | _, _, _ -> false 256 | | (* LB15b *) _, _, _, 257 | QU_Pf, (SP|GL|WJ|CL|QU|QU_Pi|QU_Pf|CP|EX|IS|SY|BK|CR| 258 | LF|NL|ZW|Eot), _ -> false 259 | | (* LB15c *) _, _, SP, 260 | IS, NU, _ -> true 261 | | (* LB15d *) _, _, _, 262 | IS, _, _ -> false 263 | | (* LB16 *) _, _, (CL|CP), 264 | NS, _, _ -> false 265 | | _, (CL|CP), 266 | SP, NS, _, _ -> false 267 | | (* LB17 *) _, _, B2, 268 | B2, _, _ -> false 269 | | _, B2, SP, 270 | B2, _, _ -> false 271 | | (* LB18 *) _, _, SP, 272 | _, _, _ -> true 273 | | (* LB19 *) _, _, _, 274 | (QU|QU_Pf), _, _ -> false 275 | | _, _, (QU|QU_Pi), 276 | _, _, _ -> false 277 | | (* LB19a *) _, _, _, 278 | QU_Pi, _, _ when not (eastasian s.l0_data) -> false 279 | | _, _, _, 280 | QU_Pi, Eot, _ -> false 281 | | _, _, _, 282 | QU_Pi, _, _ when not (eastasian s.r1_data) -> false 283 | | _, _, QU_Pf, 284 | _, _, _ when not (eastasian s.r0_data) -> false 285 | | _, Sot, QU_Pf, 286 | _, _, _ -> false 287 | | _, _, QU_Pf, 288 | _, _, _ when not (eastasian s.l1_data) -> false 289 | | (* LB20 *) _, _, _, 290 | CB, _, _ -> true 291 | | _, _, CB, 292 | _, _, _ -> true 293 | | (* LB20a *) _, (Sot|BK|CR|LF|NL|SP|ZW|CB|GL), (HY|HH), 294 | (AL|AL_30b|AL_circ|HL), _, _ -> false 295 | | (* LB21 *) _, _, _, 296 | (BA|HH|HY|NS), _, _ -> false 297 | | _, _, BB, 298 | _, _, _ -> false 299 | | (* LB21a *) _, HL, (HY|HH), 300 | r0, _, _ when r0 <> HL -> false 301 | | (* LB21b *) _, _, SY, 302 | HL, _, _ -> false 303 | | (* LB22 *) _, _, _, 304 | IN, _, _ -> false 305 | | (* LB23 *) _, _, (AL|AL_30b|AL_circ|HL), 306 | NU, _, _ -> false 307 | | _, _, NU, 308 | (AL|AL_30b|AL_circ|HL), _, _ -> false 309 | | (* LB23a *) _, _, PR, 310 | (ID|ID_30b|EB|EM), _, _ -> false 311 | | _, _, 312 | (ID|ID_30b|EB|EM), PO, _, _ -> false 313 | | (* LB24 *) _, _, (PR|PO), 314 | (AL|AL_30b|AL_circ|HL), _, _ -> false 315 | | _, _, (AL|AL_30b|AL_circ|HL), 316 | (PR|PO), _, _ -> false 317 | | (* LB25 *) _, _, (CL|CP), 318 | (PO|PR), _, _ when s.lb25_nu -> false 319 | | _, _, (NU|SY|IS), 320 | (PO|PR|NU), _, _ when s.lb25_nu -> false 321 | | _, _, (PO|PR), 322 | OP, IS, NU -> false 323 | | _, _, (PO|PR), 324 | OP, NU, _ -> false 325 | | _, _, (PR|HY|IS|PO), 326 | NU, _, _ -> false 327 | | (* LB26 *) _, _, JL, 328 | (JL|JV|H2|H3), _, _ -> false 329 | | _, _, (JV|H2), 330 | (JV|JT), _, _ -> false 331 | | _, _, (JT|H3), 332 | JT, _, _ -> false 333 | | (* LB27 *) _, _, (JL|JV|JT|H2|H3), 334 | PO, _, _ -> false 335 | | _, _, PR, 336 | (JL|JV|JT|H2|H3), _, _ -> false 337 | | (* LB28 *) _, _, (AL|AL_30b|AL_circ|HL), 338 | (AL|AL_30b|AL_circ|HL), _, _ -> false 339 | | (* LB28a *) _, _, AP, 340 | (AK|AL_circ|AS), _, _ -> false 341 | | _, _, (AK|AL_circ|AS), 342 | (VF|VI), _, _ -> false 343 | | _, (AK|AL_circ|AS), VI, 344 | (AK|AL_circ), _, _ -> false 345 | | _, _, (AK|AL_circ|AS), 346 | (AK|AL_circ|AS), VF, _ -> false 347 | | (* LB29 *) _, _, IS, 348 | (AL|AL_30b|AL_circ|HL), _, _ -> false 349 | | (* LB30 *) _, _, (AL|AL_30b|AL_circ|HL|NU), 350 | OP, _, _ when not (eastasian s.r0_data) -> false 351 | | _, _, CP, 352 | (AL|AL_30b|AL_circ|HL|NU), _, _ 353 | when not (eastasian s.l0_data) -> false 354 | | (* LB30a *) _, _, RI, 355 | RI, _, _ when s.l0_odd_ri -> false 356 | | (* LB30b *) _, _, EB, 357 | EM, _, _ -> false 358 | | _, _, (AL_30b|ID_30b), 359 | EM, _, _ -> false 360 | | (* LB31 *) _, _, _, 361 | _, _, _ -> true 362 | 363 | let next s = (* moves to the next boundary *) 364 | s.l2 <- s.l1; 365 | s.l1 <- s.l0; 366 | s.l1_data <- s.l0_data; 367 | s.l0 <- s.r0; 368 | s.l0_data <- s.r0_data; 369 | begin 370 | (* Check the regexp for LB25 *) 371 | if not s.lb25_nu then (if s.l0 = NU then s.lb25_nu <- true else ()) else 372 | (* We are matching check if we need to keep the matching *) 373 | if s.l0 = NU || s.l0 = SY || s.l0 = IS then () (* keep matching *) else 374 | if s.l1 = NU || s.l1 = SY || s.l1 = IS then () (* Keep matching *) else 375 | s.lb25_nu <- false 376 | end; 377 | (* Only move rewrite window if l0_rewrite doesn't absorb the char 378 | by rule LB9 or the various SP* *) 379 | begin match s.r0 with 380 | | CM | ZWJ when is_lb9_X s.l0_rewrite -> () 381 | | SP when s.l0_rewrite = SP -> () 382 | | _ -> 383 | s.l2_rewrite <- s.l1_rewrite; 384 | s.l1_rewrite <- s.l0_rewrite; 385 | s.l0_rewrite <- s.r0; 386 | s.l0_odd_ri <- 387 | (match s.l0_rewrite with RI -> not s.l0_odd_ri | _ -> false); 388 | end; 389 | let data = s.r0_data in 390 | s.r0 <- s.r1; 391 | s.r0_data <- s.r1_data; 392 | s.r1 <- s.r2; 393 | s.r1_data <- s.r2_data; 394 | s.r2 <- Invalid; 395 | s.r2_data <- nul_buf; 396 | (data :> Uuseg_base.ret) 397 | 398 | let need_fill s = s.r2 = Invalid || s.r1 = Invalid || s.r0 = Invalid 399 | 400 | let flush s = match s.ended with 401 | | false -> 402 | let ret = next s in 403 | (if need_fill s then s.state <- Fill else s.state <- Decide); 404 | ret 405 | | true -> 406 | match s.r0 with 407 | | Eot -> `End 408 | | _ -> s.state <- Decide; next s 409 | 410 | let decide s = if has_break s then (s.state <- Flush; `Boundary) else flush s 411 | 412 | let add s = function 413 | | `Uchar u as add -> 414 | if s.ended then Uuseg_base.err_ended add else 415 | begin match s.state with 416 | | Fill when s.r0 = Invalid -> s.r0_data <- add; s.r0 <- line u; `Await 417 | | Fill when s.r1 = Invalid -> s.r1_data <- add; s.r1 <- line u; `Await 418 | | Fill -> s.r2_data <- add; s.r2 <- line u; decide s 419 | | Flush | Decide -> Uuseg_base.err_exp_await add 420 | end 421 | | `Await -> 422 | begin match s.state with 423 | | Flush -> flush s 424 | | Decide -> decide s 425 | | Fill -> `Await 426 | end 427 | | `End -> 428 | if s.ended then Uuseg_base.err_ended `End else 429 | begin match s.state with 430 | | Fill -> 431 | s.ended <- true; 432 | (if s.r0 = Invalid then s.r0 <- Eot else 433 | if s.r1 = Invalid then s.r1 <- Eot else 434 | s.r2 <- Eot); 435 | if s.l0 = Sot && s.r0 = Eot then (* empty string *) `End else 436 | decide s 437 | | Flush | Decide -> Uuseg_base.err_exp_await `End 438 | end 439 | --------------------------------------------------------------------------------