├── BRZO
├── .ocp-indent
├── .gitignore
├── .merlin
├── src
    ├── uuseg.mllib
    ├── uuseg_word.mli
    ├── uuseg_sentence.mli
    ├── uuseg_grapheme_cluster.mli
    ├── uuseg_line_break.mli
    ├── uuseg_buf.mli
    ├── uuseg_buf.ml
    ├── uuseg_base.mli
    ├── uuseg_base.ml
    ├── uuseg.ml
    ├── uuseg_string.mli
    ├── uuseg_string.ml
    ├── uuseg_grapheme_cluster.ml
    ├── uuseg_word.ml
    ├── uuseg_sentence.ml
    ├── uuseg.mli
    └── uuseg_line_break.ml
├── _tags
├── doc
    └── index.mld
├── pkg
    ├── META
    └── pkg.ml
├── LICENSE.md
├── test
    ├── examples.ml
    ├── usegtrip.ml
    └── test_uuseg.ml
├── DEVEL.md
├── README.md
├── opam
├── CHANGES.md
└── B0.ml


/BRZO:
--------------------------------------------------------------------------------
1 | (srcs-x pkg test)


--------------------------------------------------------------------------------
/.ocp-indent:
--------------------------------------------------------------------------------
1 | strict_with=always,match_clause=4,strict_else=never


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | _b0
2 | _build
3 | tmp
4 | *.install
5 | test/*Test.txt


--------------------------------------------------------------------------------
/.merlin:
--------------------------------------------------------------------------------
1 | PKG uutf uucp cmdliner b0.kit
2 | S src
3 | S test
4 | B _b0/**
5 | 


--------------------------------------------------------------------------------
/src/uuseg.mllib:
--------------------------------------------------------------------------------
1 | Uuseg_base
2 | Uuseg_buf
3 | Uuseg_grapheme_cluster
4 | Uuseg_word
5 | Uuseg_sentence
6 | Uuseg_line_break
7 | Uuseg
8 | Uuseg_string


--------------------------------------------------------------------------------
/_tags:
--------------------------------------------------------------------------------
 1 | true : bin_annot, safe_string
 2 | 
 3 | <_b0> : -traverse
 4 | <src> : include
 5 | <src/**> : package(uucp)
 6 | 
 7 | <test> : include
 8 | <test/**> : package(uucp)
 9 | <test/usegtrip.*> : package(cmdliner uutf)
10 | 


--------------------------------------------------------------------------------
/doc/index.mld:
--------------------------------------------------------------------------------
 1 | {0 Uuseg {%html: <span class="version">%%VERSION%%</span>%}}
 2 | 
 3 | Uuseg segments Unicode text. See {!Uuseg} for more details.
 4 | 
 5 | {1:library_uuseg Library [uuseg]}
 6 | 
 7 | {!modules:
 8 | Uuseg
 9 | Uuseg_string
10 | }
11 | 


--------------------------------------------------------------------------------
/pkg/META:
--------------------------------------------------------------------------------
 1 | description = "Unicode text segmentation for OCaml"
 2 | version = "%%VERSION_NUM%%"
 3 | requires = "uucp"
 4 | archive(byte) = "uuseg.cma"
 5 | archive(native) = "uuseg.cmxa"
 6 | plugin(byte) = "uuseg.cma"
 7 | plugin(native) = "uuseg.cmxs"
 8 | exists_if = "uuseg.cma uuseg.cmxa"
 9 | 
10 | package "string" (
11 |   description = "The uuseg.string library (deprecated)"
12 |   version = "%%VERSION_NUM%%"
13 |   requires = "uuseg"
14 |   exports = "uuseg"
15 |   warning = "Deprecated, use the uuseg library."
16 | )
17 | 


--------------------------------------------------------------------------------
/src/uuseg_word.mli:
--------------------------------------------------------------------------------
 1 | (*---------------------------------------------------------------------------
 2 |    Copyright (c) 2014 The uuseg programmers. All rights reserved.
 3 |    SPDX-License-Identifier: ISC
 4 |   ---------------------------------------------------------------------------*)
 5 | 
 6 | (** Word segmenter. *)
 7 | 
 8 | (** {1 Segmenter} *)
 9 | 
10 | type t
11 | val create : unit -> t
12 | val copy : t -> t
13 | val equal : t -> t -> bool
14 | val add : t ->  [ `Await | `End | `Uchar of Uchar.t ] -> Uuseg_base.ret
15 | 


--------------------------------------------------------------------------------
/src/uuseg_sentence.mli:
--------------------------------------------------------------------------------
 1 | (*---------------------------------------------------------------------------
 2 |    Copyright (c) 2014 The uuseg programmers. All rights reserved.
 3 |    SPDX-License-Identifier: ISC
 4 |   ---------------------------------------------------------------------------*)
 5 | 
 6 | (** Sentence segmenter. *)
 7 | 
 8 | (** {1 Segmenter} *)
 9 | 
10 | type t
11 | val create : unit -> t
12 | val copy : t -> t
13 | val equal : t -> t -> bool
14 | val add : t ->  [ `Await | `End | `Uchar of Uchar.t ] -> Uuseg_base.ret
15 | 


--------------------------------------------------------------------------------
/src/uuseg_grapheme_cluster.mli:
--------------------------------------------------------------------------------
 1 | (*---------------------------------------------------------------------------
 2 |    Copyright (c) 2014 The uuseg programmers. All rights reserved.
 3 |    SPDX-License-Identifier: ISC
 4 |   ---------------------------------------------------------------------------*)
 5 | 
 6 | (** Grapheme cluster segmenter. *)
 7 | 
 8 | (** {1 Segmenter} *)
 9 | 
10 | type t
11 | val create : unit -> t
12 | val copy : t -> t
13 | val equal : t -> t -> bool
14 | val add : t -> [ `Await | `End | `Uchar of Uchar.t ] -> Uuseg_base.ret
15 | 


--------------------------------------------------------------------------------
/pkg/pkg.ml:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env ocaml
 2 | #use "topfind"
 3 | #require "topkg"
 4 | open Topkg
 5 | 
 6 | let uutf = Conf.with_pkg "uutf"
 7 | let cmdliner = Conf.with_pkg "cmdliner"
 8 | 
 9 | let () =
10 |   Pkg.describe "uuseg" @@ fun c ->
11 |   let uutf = Conf.value c uutf in
12 |   let cmdliner = Conf.value c cmdliner in
13 |   Ok [ Pkg.mllib ~api:["Uuseg"; "Uuseg_string"] "src/uuseg.mllib";
14 |        Pkg.bin ~cond:(uutf && cmdliner) "test/usegtrip";
15 |        Pkg.doc "doc/index.mld" ~dst:"odoc-pages/index.mld";
16 |        Pkg.doc "test/examples.ml"; ]
17 | 


--------------------------------------------------------------------------------
/src/uuseg_line_break.mli:
--------------------------------------------------------------------------------
 1 | (*---------------------------------------------------------------------------
 2 |    Copyright (c) 2014 The uuseg programmers. All rights reserved.
 3 |    SPDX-License-Identifier: ISC
 4 |   ---------------------------------------------------------------------------*)
 5 | 
 6 | (** Line break segmenter. *)
 7 | 
 8 | (** {1 Segmenter} *)
 9 | 
10 | type t
11 | val create : unit -> t
12 | val copy : t -> t
13 | val equal : t -> t -> bool
14 | val mandatory : t -> bool
15 | val add : t ->  [ `Await | `End | `Uchar of Uchar.t ] -> Uuseg_base.ret
16 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2014 The uuseg programmers
 2 | 
 3 | Permission to use, copy, modify, and/or distribute this software for any
 4 | purpose with or without fee is hereby granted, provided that the above
 5 | copyright notice and this permission notice appear in all copies.
 6 | 
 7 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 8 | WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 9 | MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
10 | ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
11 | WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
12 | ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
13 | OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
14 | 


--------------------------------------------------------------------------------
/src/uuseg_buf.mli:
--------------------------------------------------------------------------------
 1 | (*---------------------------------------------------------------------------
 2 |    Copyright (c) 2014 The uuseg programmers. All rights reserved.
 3 |    SPDX-License-Identifier: ISC
 4 |   ---------------------------------------------------------------------------*)
 5 | 
 6 | (** [`Uchar] buffers *)
 7 | 
 8 | (** {1 [`Uchar] buffers} *)
 9 | 
10 | type t
11 | (** The type for [`Uchar] buffers. *)
12 | 
13 | val create : int -> t
14 | (** [create n] is a buffer of initial size [n]. *)
15 | 
16 | val copy : t -> t
17 | (** [copy b] is a copy of [b]. *)
18 | 
19 | val empty : t -> bool
20 | (** [empty b] is [true] iff [b] is empty. *)
21 | 
22 | val len : t -> int
23 | (** [len b] is [b]'s length. *)
24 | 
25 | val add : t -> [`Uchar of Uchar.t] -> unit
26 | (** [add b u] adds [u] at the end of [b]. *)
27 | 
28 | val flush : t -> [> `Uchar of Uchar.t ]
29 | (** [flush b] is the first [`Uchar] of [b] and removes it from [b]. *)
30 | 
31 | val get_first : t -> Uchar.t
32 | (** [get_first b] is the first [`Uchar] of [b], if any. *)
33 | 


--------------------------------------------------------------------------------
/test/examples.ml:
--------------------------------------------------------------------------------
 1 | (*---------------------------------------------------------------------------
 2 |    Copyright (c) 2014 The uuseg programmers. All rights reserved.
 3 |    SPDX-License-Identifier: CC0-1.0
 4 |   ---------------------------------------------------------------------------*)
 5 | 
 6 | let utf_8_segments seg s =
 7 |   let flush_segment buf acc =
 8 |     let segment = Buffer.contents buf in
 9 |     Buffer.clear buf; if segment = "" then acc else segment :: acc
10 |   in
11 |   let rec add buf acc segmenter v = match Uuseg.add segmenter v with
12 |   | `Uchar u -> Buffer.add_utf_8_uchar buf u; add buf acc segmenter `Await
13 |   | `Boundary -> add buf (flush_segment buf acc) segmenter `Await
14 |   | `Await | `End -> acc
15 |   in
16 |   let rec loop buf acc s i max segmenter =
17 |     if i > max then flush_segment buf (add buf acc segmenter `End) else
18 |     let dec = String.get_utf_8_uchar s i in
19 |     let acc = add buf acc segmenter (`Uchar (Uchar.utf_decode_uchar dec)) in
20 |     loop buf acc s (i + Uchar.utf_decode_length dec) max segmenter
21 |   in
22 |   let buf = Buffer.create 42 in
23 |   let segmenter = Uuseg.create seg in
24 |   List.rev (loop buf [] s 0 (String.length s - 1) segmenter)
25 | 


--------------------------------------------------------------------------------
/src/uuseg_buf.ml:
--------------------------------------------------------------------------------
 1 | (*---------------------------------------------------------------------------
 2 |    Copyright (c) 2014 The uuseg programmers. All rights reserved.
 3 |    SPDX-License-Identifier: ISC
 4 |   ---------------------------------------------------------------------------*)
 5 | 
 6 | let dummy_u =                            (* to initalize arrays, never read. *)
 7 |   `Uchar (Uchar.unsafe_of_int 0x0000)
 8 | 
 9 | type t =
10 |   { mutable buf : [`Uchar of Uchar.t] array;
11 |     mutable first : int;
12 |     mutable last : int; }
13 | 
14 | let create n = { buf = Array.make n dummy_u; first = 0; last = -1; }
15 | let copy b = { b with buf = Array.copy b.buf; }
16 | let empty b = b.last = -1
17 | let len b = (b.last - b.first) + 1
18 | let grow b =
19 |   let len = Array.length b.buf in
20 |   let newbuf = Array.make (2 * len) dummy_u in
21 |   Array.blit b.buf 0 newbuf 0 len; b.buf <- newbuf
22 | 
23 | let add b add =
24 |   let last = b.last + 1 in
25 |   if last = Array.length b.buf then grow b;
26 |   b.buf.(last) <- add; b.last <- last
27 | 
28 | let flush b =
29 |   let `Uchar _ as add = b.buf.(b.first) in
30 |   b.first <- b.first + 1;
31 |   if b.first > b.last then (b.first <- 0; b.last <- -1);
32 |   add
33 | 
34 | let get_first b = let `Uchar u = b.buf.(b.first) in u
35 | 


--------------------------------------------------------------------------------
/DEVEL.md:
--------------------------------------------------------------------------------
 1 | # New Unicode release
 2 | 
 3 | First you will an install of [`uucp`] that supports the new Unicode
 4 | version installed.
 5 | 
 6 | You can then bump the Unicode release number at the top of the `B0.ml`
 7 | file. Veryify that everything is as expected with:
 8 | 
 9 |     b0 -- unicode-version
10 | 
11 | Update the opam file with: 
12 | 
13 |     b0 -- .opam file > opam
14 | 
15 | Before the formal Unicode release date check the proposed update of
16 | [UAX29] and [UAX14]. If the rules need to be adjusted then do so.
17 | 
18 | Compile Uuseg against the uucp and check the reference tests (see
19 | below).
20 | 
21 | [UAX29]: https://www.unicode.org/reports/tr29/
22 | [UAX14]: https://www.unicode.org/reports/tr14/
23 | [`uucp`]: https://erratique.ch/software/uucp
24 | 
25 | # Reference tests
26 | 
27 | To test the package on the reference segmentation tests you must 
28 | download a copy of the tests to:
29 | 
30 |     test/LineBreakTest.txt
31 |     test/GraphemeBreakTest.txt
32 |     test/WordBreakTest.txt
33 |     test/SentenceBreakTest.txt
34 | 
35 | These file are ignored by git. If you have `curl` in your `PATH`
36 | you can simply issue:
37 | 
38 |     b0 -- download-tests
39 | 
40 | this downloads the tests for the Unicode version mentioned in `B0.ml`. 
41 | 
42 | You can then check them via: 
43 | 
44 |     b0 test
45 | 


--------------------------------------------------------------------------------
/src/uuseg_base.mli:
--------------------------------------------------------------------------------
 1 | (*---------------------------------------------------------------------------
 2 |    Copyright (c) 2014 The uuseg programmers. All rights reserved.
 3 |    SPDX-License-Identifier: ISC
 4 |   ---------------------------------------------------------------------------*)
 5 | 
 6 | (** Segmenter commonalities. *)
 7 | 
 8 | (** {1 Common} *)
 9 | 
10 | type ret = [ `Await | `Boundary | `End | `Uchar of Uchar.t ]
11 | (** See {!Uuseg.ret}. *)
12 | 
13 | val pp_ret : Format.formatter -> [< ret ] -> unit
14 | (** See {!Uuseg.pp_ret}. *)
15 | 
16 | val err_exp_await : [< ret] -> 'a
17 | (** See {!Uuseg.err_exp_await}. *)
18 | 
19 | val err_ended : [< ret] -> 'a
20 | (** See {!Uuseg.err_ended}. *)
21 | 
22 | (** {1:tid Type identifiers} *)
23 | 
24 | (** Type introspection.
25 | 
26 |     {b Note.} Available in 5.1. *)
27 | module Type : sig
28 | 
29 |   type (_, _) eq = Equal : ('a, 'a) eq (** *)
30 |   (** The type for type quality testing. *)
31 | 
32 |   module Id : sig
33 | 
34 |     (** {1:typeids Type identifiers} *)
35 | 
36 |     type 'a t
37 |     (** The type for type identifiers for a type ['a]. *)
38 | 
39 |     val make : unit -> 'a t
40 |     (** [make ()] is a new type identifier. *)
41 | 
42 |     val provably_equal : 'a t -> 'b t -> ('a, 'b) eq option
43 |     (** [provably_equal id0 id1] determines if [id0] and [id1] are equal. *)
44 | 
45 |     val uid : 'a t -> int
46 |     (** [uid id] is a runtime unique identifier for [id]. *)
47 |   end
48 | end
49 | 


--------------------------------------------------------------------------------
/src/uuseg_base.ml:
--------------------------------------------------------------------------------
 1 | (*---------------------------------------------------------------------------
 2 |    Copyright (c) 2014 The uuseg programmers. All rights reserved.
 3 |    SPDX-License-Identifier: ISC
 4 |   ---------------------------------------------------------------------------*)
 5 | 
 6 | type ret = [ `Await | `Boundary | `End | `Uchar of Uchar.t ]
 7 | 
 8 | let pp_ret ppf v = match (v :> ret) with
 9 | | `Await -> Format.fprintf ppf "`Await"
10 | | `Boundary -> Format.fprintf ppf "`Boundary"
11 | | `End -> Format.fprintf ppf "`End"
12 | | `Uchar u -> Format.fprintf ppf "`Uchar U+%04X" (Uchar.to_int u)
13 | 
14 | let err_exp_await add =
15 |   invalid_arg (Format.asprintf "can't add %a, expected `Await" pp_ret add)
16 | 
17 | let err_ended add =
18 |   invalid_arg (Format.asprintf "can't add %a, `End already added" pp_ret add)
19 | 
20 | (* Type identifiers *)
21 | 
22 | module Type = struct
23 |   (* See http://alan.petitepomme.net/cwn/2015.03.24.html#1
24 |      In the stdlib since 5.1. *)
25 | 
26 |   type ('a, 'b) eq = Equal : ('a, 'a) eq
27 | 
28 |   module Id = struct
29 |     type _ id = ..
30 |     module type ID = sig
31 |       type t
32 |       type _ id += Id : t id
33 |     end
34 | 
35 |     type 'a t = (module ID with type t = 'a)
36 | 
37 |     let make (type a) () : a t =
38 |       (module struct type t = a type _ id += Id : t id end)
39 | 
40 |     let provably_equal
41 |         (type a b) ((module A) : a t) ((module B) : b t) : (a, b) eq option
42 |       =
43 |       match A.Id with B.Id -> Some Equal | _ -> None
44 | 
45 |     let uid (type a) ((module A) : a t) =
46 |       Obj.Extension_constructor.id (Obj.Extension_constructor.of_val A.Id)
47 |   end
48 | end
49 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | Uuseg — Unicode text segmentation for OCaml
 2 | ===============================================================================
 3 | %%VERSION%%
 4 | 
 5 | Uuseg is an OCaml library for segmenting Unicode text. It implements
 6 | the locale independent [Unicode text segmentation algorithms][1] to
 7 | detect grapheme cluster, word and sentence boundaries and the [Unicode
 8 | line breaking algorithm][2] to detect line break opportunities.
 9 | 
10 | The library is independent from any IO mechanism or Unicode text data
11 | structure and it can process text without a complete in-memory
12 | representation.
13 | 
14 | Uuseg is distributed under the ISC license. It depends on [Uucp].
15 | 
16 | [1]: http://www.unicode.org/reports/tr29/
17 | [2]: http://www.unicode.org/reports/tr14/
18 | [Uucp]: http://erratique.ch/software/uucp
19 | 
20 | Homepage: <http://erratique.ch/software/uuseg>
21 | 
22 | ## Installation
23 | 
24 | Uuseg can be installed with `opam`:
25 | 
26 |     opam install uuseg
27 |     opam install uuseg cmdliner uutf  # For the usegtrip tool. 
28 | 
29 | If you don't use `opam` consult the [`opam`](opam) file for build
30 | instructions.
31 | 
32 | ## Documentation
33 | 
34 | The documentation can be consulted [online] or via `odig doc uuseg`.
35 | 
36 | Questions are welcome but better asked on the [OCaml forum] than on 
37 | the issue tracker.
38 | 
39 | [online]: http://erratique.ch/software/uuseg/doc/
40 | [OCaml forum]: https://discuss.ocaml.org/
41 | 
42 | 
43 | ## Sample programs
44 | 
45 | The [`usegtrip`] tool segments text provided on standard input.
46 | 
47 | See also the [doc examples].
48 | 
49 | [`usegtrip`]: test/usegtrip.ml
50 | [doc examples]: test/examples.ml
51 | 


--------------------------------------------------------------------------------
/opam:
--------------------------------------------------------------------------------
 1 | opam-version: "2.0"
 2 | name: "uuseg"
 3 | synopsis: "Unicode text segmentation for OCaml"
 4 | description: """\
 5 | Uuseg is an OCaml library for segmenting Unicode text. It implements
 6 | the locale independent [Unicode text segmentation algorithms][1] to
 7 | detect grapheme cluster, word and sentence boundaries and the [Unicode
 8 | line breaking algorithm][2] to detect line break opportunities.
 9 | 
10 | The library is independent from any IO mechanism or Unicode text data
11 | structure and it can process text without a complete in-memory
12 | representation.
13 | 
14 | Uuseg is distributed under the ISC license. It depends on [Uucp].
15 | 
16 | [1]: http://www.unicode.org/reports/tr29/
17 | [2]: http://www.unicode.org/reports/tr14/
18 | [Uucp]: http://erratique.ch/software/uucp
19 | 
20 | Homepage: <http://erratique.ch/software/uuseg>"""
21 | maintainer: "Daniel Bünzli <daniel.buenzl i@erratique.ch>"
22 | authors: "The uuseg programmers"
23 | license: "ISC"
24 | tags: ["unicode" "text" "segmentation" "org:erratique"]
25 | homepage: "https://erratique.ch/software/uuseg"
26 | doc: "https://erratique.ch/software/uuseg/doc/"
27 | bug-reports: "https://github.com/dbuenzli/uuseg/issues"
28 | depends: [
29 |   "ocaml" {>= "4.14.0"}
30 |   "ocamlfind" {build}
31 |   "ocamlbuild" {build}
32 |   "topkg" {build & >= "1.1.0"}
33 |   "uucp" {>= "17.0.0" & < "18.0.0"}
34 | ]
35 | depopts: ["uutf" "cmdliner"]
36 | conflicts: [
37 |   "uutf" {< "1.0.0"}
38 |   "cmdliner" {< "1.1.0"}
39 | ]
40 | build: [
41 |   "ocaml"
42 |   "pkg/pkg.ml"
43 |   "build"
44 |   "--dev-pkg"
45 |   "%{dev}%"
46 |   "--with-uutf"
47 |   "%{uutf:installed}%"
48 |   "--with-cmdliner"
49 |   "%{cmdliner:installed}%"
50 | ]
51 | dev-repo: "git+https://erratique.ch/repos/uuseg.git"
52 | x-maintenance-intent: ["(latest)"]
53 | 


--------------------------------------------------------------------------------
/CHANGES.md:
--------------------------------------------------------------------------------
 1 | v17.0.0 2025-09-11 Zagreb
 2 | -------------------------
 3 | 
 4 | - Unicode 17.0.0 support.
 5 | - Add `Uuseg.equal` thanks to @lord for suggesting.
 6 | 
 7 | v16.0.0 2024-09-11 Zagreb
 8 | -------------------------
 9 | 
10 | - Unicode 16.0.0 support.
11 | 
12 | v15.1.0 2023-09-15 Zagreb
13 | -------------------------
14 | 
15 | - Unicode 15.1.0 support. 
16 | - Requires OCaml 4.14.0 for the UTF decoders.
17 | - The `Uuseg_string` module was rewritten to use the standard library
18 |   UTF decoders and was moved to the `uuseg` library. The `uuseg.string`
19 |   library is deprecated, it warns on usage and simply requires `uuseg`.
20 | - The sample code was rewritten to use the standard library UTF
21 |   decoders.
22 | 
23 | v15.0.0 2022-09-15 Zagreb
24 | -------------------------
25 | 
26 | - Unicode 15.0.0 support.
27 | 
28 | v14.0.0 2021-09-17 Zagreb
29 | -------------------------
30 | 
31 | - Unicode 14.0.0 support.
32 | 
33 | v13.0.0 2020-03-11 La Forclaz (VS)
34 | ----------------------------------
35 | 
36 | - Unicode 13.0.0 support.
37 | - Grapheme clusters and word boundaries w.r.t. emojis are segmented
38 |   according to the specification (#5 is closed).
39 | - Internal rewrite of word and line break boundaries. Implementations
40 |   are less hairy, less ad-hoc (not there yet though) and more correct.
41 | - Require OCaml >= 4.03.0.
42 | 
43 | v12.0.0 2019-03-08 La Forclaz (VS)
44 | ----------------------------------
45 | 
46 | - Unicode 12.0.0 support. Grapheme cluster and word boundaries
47 |   w.r.t. emojis are still only partially according to the specification
48 |   see issue #5 for details.
49 | 
50 | v11.0.0 2018-06-06 Zürich
51 | -------------------------
52 | 
53 | - Unicode 11.0.0 support. Grapheme cluster and word boundaries
54 |   w.r.t. emojis are only partially supported according to the
55 |   specification see issue #5 for details.
56 |   
57 | v10.0.0 2017-06-20 Cambridge (UK)
58 | ---------------------------------
59 | 
60 | - Unicode 10.0.0 support.
61 | 
62 | v1.0.1 2016-03-07 La Forclaz (VS)
63 | ---------------------------------
64 | 
65 | - OCaml 4.05 compatibility (removal of `Uchar.dump`).
66 | 
67 | v1.0.0 2016-11-23 Zagreb
68 | ------------------------
69 | 
70 | - Unicode 9.0.0 support.
71 | - OCaml standard library `Uchar.t` support.
72 |   - Removes and substitutes `type Uuseg.uchar = int` by the (abstract)
73 |     `Uchar.t` type. `Uchar.{of,to}_int` allows to recover the previous
74 |     representation.
75 |   - Removes the `Uuseg.is_uchar`. `Uchar.is_valid` can be used instead.
76 | - Safe string support.
77 | - Build depend on topkg.
78 | - Relicense from BSD3 to ISC.
79 | 
80 | v0.9.0 2015-06-17 Cambridge (UK)
81 | --------------------------------
82 | 
83 | - Support for Unicode 8.0.0's new line breaking and sentence boundary rules.
84 | - `Uuseg.custom` add a unit argument.
85 | 
86 | 
87 | v0.8.0 2014-12-23 Cugy (VD)
88 | ---------------------------
89 | 
90 | First release.
91 | 


--------------------------------------------------------------------------------
/src/uuseg.ml:
--------------------------------------------------------------------------------
  1 | (*---------------------------------------------------------------------------
  2 |    Copyright (c) 2014 The uuseg programmers. All rights reserved.
  3 |    SPDX-License-Identifier: ISC
  4 |   ---------------------------------------------------------------------------*)
  5 | 
  6 | 
  7 | let unicode_version = Uucp.unicode_version
  8 | 
  9 | (* Segmenters *)
 10 | 
 11 | type 'a segmenter =
 12 |   { id : 'a Uuseg_base.Type.Id.t;
 13 |     name : string;
 14 |     create : unit -> 'a;
 15 |     copy : 'a -> 'a;
 16 |     equal : 'a -> 'a -> bool;
 17 |     mandatory : 'a -> bool;
 18 |     add : 'a -> [ `Uchar of Uchar.t | `Await | `End ] ->
 19 |       [ `Boundary | `Uchar of Uchar.t | `Await | `End ] }
 20 | 
 21 | type custom = C : 'a segmenter -> custom
 22 | 
 23 | type boundary =
 24 |   [ `Grapheme_cluster | `Word | `Sentence | `Line_break | `Custom of custom ]
 25 | 
 26 | let pp_boundary ppf b = match (b :> boundary) with
 27 | | `Grapheme_cluster -> Format.fprintf ppf "`Grapheme_cluster"
 28 | | `Word -> Format.fprintf ppf "`Word"
 29 | | `Sentence -> Format.fprintf ppf "`Sentence"
 30 | | `Line_break -> Format.fprintf ppf "`Line_break"
 31 | | `Custom (C s) -> Format.fprintf ppf "`Custom %s" s.name
 32 | 
 33 | (* Built-in segmenters *)
 34 | 
 35 | let mandatory_default _ = true
 36 | 
 37 | let grapheme_cluster =
 38 |   C { id = Uuseg_base.Type.Id.make ();
 39 |       name = "Uuseg.grapheme_cluster";
 40 |       create = Uuseg_grapheme_cluster.create;
 41 |       copy = Uuseg_grapheme_cluster.copy;
 42 |       equal = Uuseg_grapheme_cluster.equal;
 43 |       mandatory = mandatory_default;
 44 |       add = Uuseg_grapheme_cluster.add; }
 45 | 
 46 | let word =
 47 |   C { id = Uuseg_base.Type.Id.make ();
 48 |       name = "Uuseg.word";
 49 |       create = Uuseg_word.create;
 50 |       copy = Uuseg_word.copy;
 51 |       equal = Uuseg_word.equal;
 52 |       mandatory = mandatory_default;
 53 |       add = Uuseg_word.add; }
 54 | 
 55 | let sentence =
 56 |   C { id = Uuseg_base.Type.Id.make ();
 57 |       name = "Uuseg.sentence";
 58 |       create = Uuseg_sentence.create;
 59 |       copy = Uuseg_sentence.copy;
 60 |       equal = Uuseg_sentence.equal;
 61 |       mandatory = mandatory_default;
 62 |       add = Uuseg_sentence.add; }
 63 | 
 64 | let line_break =
 65 |   C { id = Uuseg_base.Type.Id.make ();
 66 |       name = "Uuseg.line_break";
 67 |       create = Uuseg_line_break.create;
 68 |       copy = Uuseg_line_break.copy;
 69 |       equal = Uuseg_line_break.equal;
 70 |       mandatory = Uuseg_line_break.mandatory;
 71 |       add = Uuseg_line_break.add; }
 72 | 
 73 | (* Generic segmenter inteface *)
 74 | 
 75 | type t = Seg : boundary * 'a * 'a segmenter -> t
 76 | type ret = Uuseg_base.ret
 77 | 
 78 | let create boundary =
 79 |   let (C seg) = match boundary with
 80 |   | `Grapheme_cluster -> grapheme_cluster
 81 |   | `Word -> word
 82 |   | `Sentence -> sentence
 83 |   | `Line_break -> line_break
 84 |   | `Custom c -> c
 85 |   in
 86 |   Seg ((boundary :> boundary), seg.create (), seg)
 87 | 
 88 | let boundary (Seg (boundary, _, _)) = boundary
 89 | let add (Seg (_, s, seg)) add = seg.add s add
 90 | let mandatory (Seg (_, s, seg)) = seg.mandatory s
 91 | let copy (Seg (b, s, seg)) = Seg (b, seg.copy s, seg)
 92 | 
 93 | let equal (Seg (b0, s0, seg0)) (Seg (b1, s1, seg1)) = match b0, b1 with
 94 | | `Custom _, _ | _, `Custom _ ->
 95 |     invalid_arg "Cannot test custom segmenters for equality"
 96 | | _ ->
 97 |     match Uuseg_base.Type.Id.provably_equal seg0.id seg1.id with
 98 |     | None -> false
 99 |     | Some Uuseg_base.Type.Equal -> seg0.equal s0 s1
100 | 
101 | let pp_ret = Uuseg_base.pp_ret
102 | 
103 | (* Custom segmenters *)
104 | 
105 | let custom ?(mandatory = mandatory_default) ~name ~create ~copy ~add () =
106 |   (* N.B. when we require > 5.1 we can replace Uuseg_base.Type by
107 |      Stdlib.Type and open up equality testing for custom *)
108 |   let id = Uuseg_base.Type.Id.make () in
109 |   let equal _ _ = assert false in
110 |   C { id; name; create; copy; equal; mandatory; add }
111 | 
112 | let err_exp_await = Uuseg_base.err_exp_await
113 | let err_ended = Uuseg_base.err_ended
114 | 


--------------------------------------------------------------------------------
/B0.ml:
--------------------------------------------------------------------------------
  1 | open B0_kit.V000
  2 | open Result.Syntax
  3 | 
  4 | let unicode_version = 17, 0, 0, None (* Adjust on new releases *)
  5 | let next_major = B0_version.next_major unicode_version
  6 | 
  7 | (* OCaml library names *)
  8 | 
  9 | let b0_std = B0_ocaml.libname "b0.std"
 10 | let uucp = B0_ocaml.libname "uucp"
 11 | let uutf = B0_ocaml.libname "uutf"
 12 | let cmdliner = B0_ocaml.libname "cmdliner"
 13 | 
 14 | let uuseg = B0_ocaml.libname "uuseg"
 15 | 
 16 | (* Libraries *)
 17 | 
 18 | let uuseg_lib =
 19 |   let srcs = [ `Dir ~/"src" ] in
 20 |   let requires = [ uucp ] in
 21 |   B0_ocaml.lib uuseg ~doc:"The uuseg library" ~srcs ~requires
 22 | 
 23 | let uuseg_string_lib =
 24 |   let exports = [uuseg] in
 25 |   B0_ocaml.deprecated_lib ~exports (B0_ocaml.libname "uuseg.string")
 26 | 
 27 | (* Tools *)
 28 | 
 29 | let usegtrip =
 30 |   let srcs = [ `File ~/"test/usegtrip.ml" ] in
 31 |   let requires = [ cmdliner; uutf; uuseg ] in
 32 |   B0_ocaml.exe "usegtrip" ~public:true ~doc:"The usegtrip tool" ~srcs ~requires
 33 | 
 34 | (* Tests *)
 35 | 
 36 | let test =
 37 |   let srcs = [ `File ~/"test/test_uuseg.ml" ] in
 38 |   let meta =
 39 |     B0_meta.(empty |> tag test |> tag run |> ~~ B0_unit.Action.cwd `Scope_dir)
 40 |   in
 41 |   let requires = [ b0_std; uucp; uuseg; cmdliner ] in
 42 |   B0_ocaml.exe "test_uucp" ~doc:"Test segmentations" ~srcs ~meta ~requires
 43 | 
 44 | let examples =
 45 |   let srcs = [ `File ~/"test/examples.ml" ] in
 46 |   let meta = B0_meta.(empty |> tag test) in
 47 |   let requires = [ uuseg ] in
 48 |   B0_ocaml.exe "examples" ~doc:"Doc samples" ~srcs ~meta ~requires
 49 | 
 50 | (* Actions *)
 51 | 
 52 | let show_version =
 53 |   B0_unit.of_action "unicode-version" ~doc:"Show supported unicode version" @@
 54 |   fun _ _ ~args:_ ->
 55 |   Ok (Log.stdout (fun m -> m "%s" (B0_version.to_string unicode_version)))
 56 | 
 57 | let test_url kind =
 58 |   Fmt.str "http://www.unicode.org/Public/%s/ucd/auxiliary/%sBreakTest.txt"
 59 |     (B0_version.to_string unicode_version) kind
 60 | 
 61 | let download_tests =
 62 |   let doc = "Download the UCD break tests" in
 63 |   B0_unit.of_action "download-tests" ~doc @@ fun env _ ~args:_ ->
 64 |   let get kind =
 65 |     let test_url = test_url kind in
 66 |     let test_file = Fpath.v (Fmt.str "test/%sBreakTest.txt" kind) in
 67 |     let test_file = B0_env.in_scope_dir env test_file in
 68 |     (Log.stdout @@ fun m ->
 69 |      m "@[<v>Downloading %s@,to %a@]" test_url Fpath.pp test_file);
 70 |     B0_action_kit.fetch_url env test_url test_file
 71 |   in
 72 |   let tests = ["Line"; "Grapheme"; "Word"; "Sentence"] in
 73 |   List.iter_iter_on_error ~error:(Log.if_error ~use:()) get tests;
 74 |   Ok ()
 75 | 
 76 | (* Packs *)
 77 | 
 78 | let default =
 79 |   let meta =
 80 |     B0_meta.empty
 81 |     |> ~~ B0_meta.authors ["The uuseg programmers"]
 82 |     |> ~~ B0_meta.maintainers ["Daniel Bünzli <daniel.buenzl i@erratique.ch>"]
 83 |     |> ~~ B0_meta.homepage "https://erratique.ch/software/uuseg"
 84 |     |> ~~ B0_meta.online_doc "https://erratique.ch/software/uuseg/doc/"
 85 |     |> ~~ B0_meta.licenses ["ISC"]
 86 |     |> ~~ B0_meta.repo "git+https://erratique.ch/repos/uuseg.git"
 87 |     |> ~~ B0_meta.issues "https://github.com/dbuenzli/uuseg/issues"
 88 |     |> ~~ B0_meta.description_tags
 89 |       ["unicode"; "text"; "segmentation"; "org:erratique"]
 90 |     |> B0_meta.tag B0_opam.tag
 91 |     |> ~~ B0_opam.build
 92 |       {|[["ocaml" "pkg/pkg.ml" "build" "--dev-pkg" "%{dev}%"
 93 |          "--with-uutf" "%{uutf:installed}%"
 94 |          "--with-cmdliner" "%{cmdliner:installed}%" ]]|}
 95 |     |> ~~ B0_opam.depopts [ "uutf", ""; "cmdliner", ""]
 96 |     |> ~~ B0_opam.conflicts
 97 |       [ "uutf", {|< "1.0.0"|};
 98 |         "cmdliner", {|< "1.1.0"|}]
 99 |     |> ~~ B0_opam.depends
100 |       [ "ocaml", {|>= "4.14.0"|};
101 |         "ocamlfind", {|build|};
102 |         "ocamlbuild", {|build|};
103 |         "topkg", {|build & >= "1.1.0"|};
104 |         "uucp",
105 |         Fmt.str {|>= "%s" & < "%s"|}
106 |           (B0_version.to_string unicode_version)
107 |           (B0_version.to_string next_major)]
108 |   in
109 |   B0_pack.make "default" ~doc:"uuseg package" ~meta ~locked:true @@
110 |   B0_unit.list ()
111 | 


--------------------------------------------------------------------------------
/src/uuseg_string.mli:
--------------------------------------------------------------------------------
 1 | (*---------------------------------------------------------------------------
 2 |    Copyright (c) 2014 The uuseg programmers. All rights reserved.
 3 |    SPDX-License-Identifier: ISC
 4 |   ---------------------------------------------------------------------------*)
 5 | 
 6 | (** Unicode text segmentation on UTF OCaml strings.
 7 | 
 8 |     {!Uuseg} functions acting directly on UTF encoded OCaml strings.
 9 | 
10 |     {b Warning.} All these functions silently replace malformed encoded Unicode
11 |     data by a {!Stdlib.Uchar.rep} character. *)
12 | 
13 | (** {1:segment Segment} *)
14 | 
15 | type 'a folder = 'a -> string -> 'a
16 | (** The type for segment folders. The function takes an accumulator
17 |     and a segment. Segments are the UTF encoded characters delimited
18 |     by two [`Boundary] occurences. If the segmenter has no initial or
19 |     final [`Boundary], the folding function inserts an implicit
20 |     one. Empty segments – which by definition do not happen with
21 |     the default segmenters – are not reported. *)
22 | 
23 | val fold_utf_8 : [< Uuseg.boundary] -> 'a folder -> 'a -> string -> 'a
24 | (** [fold_utf_8 b f acc s] folds over the [b] UTF-8 encoded segments of
25 |     the UTF-8 encoded string [s] using [f] and [acc]. *)
26 | 
27 | val fold_utf_16be : [< Uuseg.boundary] -> 'a folder -> 'a -> string -> 'a
28 | (** [fold_utf16be] is like {!fold_utf_8} but on UTF-16BE encoded strings. *)
29 | 
30 | val fold_utf_16le : [< Uuseg.boundary] -> 'a folder -> 'a -> string -> 'a
31 | (** [fold_utf16le] is like {!fold_utf_8} but on UTF-16BE encoded
32 |     strings. *)
33 | 
34 | (** {1:pp Pretty-printers}
35 | 
36 |     Using OCaml's {!Format.pp_print_string} with Unicode encoded
37 |     strings will most of the time derail the pretty-printing process
38 |     for two reasons. First the Unicode encoding of a character may
39 |     span more than one byte and [pp_print_string] considers one
40 |     character to be one byte. Second there may be a discrepancy
41 |     between the sequence of user-perceived characters (grapheme
42 |     clusters e.g. é) and the actual sequences of Unicode characters in
43 |     the data (e.g. é represented by the decomposition e + ´,
44 |     <U+0065,U+0301>).
45 | 
46 |     The following formatters fix these problems for many (but not all)
47 |     scripts. *)
48 | 
49 | val pp_utf_8 : Format.formatter -> string -> unit
50 | (** [pp_utf8 ppf s] prints the UTF-8 encoded string [s]. Each grapheme
51 |     cluster is considered as taking a length of 1. *)
52 | 
53 | val pp_utf_8_text : Format.formatter -> string -> unit
54 | (** [pp_utf_8_text ppf s] prints the UTF-8 encoded string [s]. Each
55 |     grapheme cluster is considered as taking a length of 1. Each
56 |     line break opportunity is hinted with {!Format.pp_print_break}
57 |     and mandatory line breaks issue a {!Format.pp_force_newline} call.
58 | 
59 |     Take into account the following points:
60 |     {ul
61 |     {- Any {{!Uucp.White.is_white_space}white space} Unicode character
62 |        occuring before a break opportunity will be translated to a space
63 |        (U+0020) in output if no break occurs.}
64 |     {- The sequence CR LF (U+000D, U+000A) and all kind of mandatory
65 |        line breaks are translated to whathever line separator is output
66 |        by {!Format.pp_force_newline}. See {!pp_utf_8_lines} for the
67 |        list of characters treated as mandatory line breaks.}
68 |     {- Soft hyphens are handled but due to limitations in {!Format} are
69 |        not replaced by hard ones on breaks.}} *)
70 | 
71 | val pp_utf_8_lines : Format.formatter -> string -> unit
72 | (** [pp_utf_8_lines ppf s] prints the UTF-8 encoded string [s]. Each
73 |     grapheme cluster is considered as taking a length of 1. Each
74 |     mandatory line break (including the sequence CR LF (U+000D,
75 |     U+000A)) issues a {!Format.pp_force_newline} and is translated to
76 |     whathever line separator this function outputs.
77 | 
78 |     This function correctly handles all kinds of line ends present
79 |     Unicode, as of 7.0.0 this is FORM FEED (U+000C), LINE TABULATION
80 |     (U+000B), LINE SEPARATOR (U+2028), PARAGRAPH SEPARATOR (U+2020),
81 |     NEXT LINE (U+085), LINE FEED (U+000A), CARRIAGE RETURN (U+000D),
82 |     and the sequence CR LF (U+000D, U+000A). *)
83 | 


--------------------------------------------------------------------------------
/src/uuseg_string.ml:
--------------------------------------------------------------------------------
  1 | (*---------------------------------------------------------------------------
  2 |    Copyright (c) 2014 The uuseg programmers. All rights reserved.
  3 |    SPDX-License-Identifier: ISC
  4 |   ---------------------------------------------------------------------------*)
  5 | 
  6 | type 'a folder = 'a -> string -> 'a
  7 | 
  8 | let fold dec_uchar enc_uchar seg f acc0 s =
  9 |   let flush_segment buf acc =
 10 |     let segment = Buffer.contents buf in
 11 |     Buffer.clear buf; if segment = "" then acc else f acc segment
 12 |   in
 13 |   let rec add buf acc segmenter v = match Uuseg.add segmenter v with
 14 |   | `Uchar u -> enc_uchar buf u; add buf acc segmenter `Await
 15 |   | `Boundary -> add buf (flush_segment buf acc) segmenter `Await
 16 |   | `Await | `End -> acc
 17 |   in
 18 |   let rec loop buf acc s i max segmenter =
 19 |     if i > max then flush_segment buf (add buf acc segmenter `End) else
 20 |     let dec = dec_uchar s i in
 21 |     let acc = add buf acc segmenter (`Uchar (Uchar.utf_decode_uchar dec)) in
 22 |     loop buf acc s (i + Uchar.utf_decode_length dec) max segmenter
 23 |   in
 24 |   let buf = Buffer.create 42 in
 25 |   let segmenter = Uuseg.create seg in
 26 |   loop buf acc0 s 0 (String.length s - 1) segmenter
 27 | 
 28 | let fold_utf_8 seg f acc0 s =
 29 |   fold String.get_utf_8_uchar Buffer.add_utf_8_uchar seg f acc0 s
 30 | 
 31 | let fold_utf_16be seg f acc0 s =
 32 |   fold String.get_utf_16be_uchar Buffer.add_utf_16be_uchar seg f acc0 s
 33 | 
 34 | let fold_utf_16le seg f acc0 s =
 35 |   fold String.get_utf_16le_uchar Buffer.add_utf_16le_uchar seg f acc0 s
 36 | 
 37 | let pp_utf_8 ppf s =
 38 |   let flush buf =
 39 |     let gc = Buffer.contents buf in
 40 |     if gc = "" then () else (Format.fprintf ppf "@<1>%s" gc; Buffer.clear buf)
 41 |   in
 42 |   let rec add buf segmenter v = match Uuseg.add segmenter v with
 43 |   | `Uchar u -> Buffer.add_utf_8_uchar buf u; add buf segmenter `Await
 44 |   | `Boundary -> flush buf; add buf segmenter `Await
 45 |   | `Await | `End -> ()
 46 |   in
 47 |   let rec loop buf s i max segmenter =
 48 |     if i > max then (add buf segmenter `End; flush buf) else
 49 |     let dec = String.get_utf_8_uchar s i in
 50 |     add buf segmenter (`Uchar (Uchar.utf_decode_uchar dec));
 51 |     loop buf s (i + Uchar.utf_decode_length dec) max segmenter
 52 |   in
 53 |   let buf = Buffer.create 10 in
 54 |   let segmenter = Uuseg.create `Grapheme_cluster in
 55 |   loop buf s 0 (String.length s - 1) segmenter
 56 | 
 57 | let pp_utf_8_text ~only_mandatory ppf s =
 58 |   let b = Buffer.create 10 in
 59 |   let buf_buf = ref None in (* buffer to handle CRLF and suppress white *)
 60 |   let buf_flush () =
 61 |     let gc = Buffer.contents b in
 62 |     if gc = "" then () else (Format.fprintf ppf "@<1>%s" gc; Buffer.clear b)
 63 |   in
 64 |   let buf_add u = match !buf_buf with
 65 |   | None -> buf_buf := Some u
 66 |   | Some last ->
 67 |       match Uchar.to_int last with
 68 |       | 0x000D when Uchar.to_int u = 0x000A -> buf_buf := Some u (* rem CR *)
 69 |       | _ -> Buffer.add_utf_8_uchar b last; buf_buf := Some u
 70 |   in
 71 |   let buf_cut mandatory =
 72 |     let bbuf = !buf_buf in
 73 |     buf_buf := None;
 74 |     match bbuf with
 75 |     | None -> buf_flush (); Format.pp_print_cut ppf ()
 76 |     | Some u when mandatory && Uucp.White.is_white_space u ->
 77 |         buf_flush (); Format.pp_force_newline ppf ()
 78 |     | Some u when mandatory -> (* should not happen *)
 79 |         Buffer.add_utf_8_uchar b u; buf_flush (); Format.pp_force_newline ppf ()
 80 |     | Some u when Uucp.White.is_white_space u ->
 81 |         buf_flush (); Format.pp_print_break ppf 1 0;
 82 |     | Some u ->
 83 |         Buffer.add_utf_8_uchar b u; buf_flush (); Format.pp_print_cut ppf ()
 84 |   in
 85 |   let gseg = Uuseg.create `Grapheme_cluster in
 86 |   let lseg = Uuseg.create `Line_break in
 87 |   let rec line_add a = match Uuseg.add lseg a with
 88 |   | `Uchar u -> buf_add u; line_add `Await
 89 |   | `Boundary ->
 90 |       let m = Uuseg.mandatory lseg in
 91 |       if (only_mandatory && m) || (not only_mandatory) then buf_cut m;
 92 |       line_add `Await
 93 |   | `Await | `End -> ()
 94 |   in
 95 |   let rec add a = match Uuseg.add gseg a with
 96 |   | `Uchar _ as a -> line_add a; add `Await
 97 |   | `Boundary -> buf_flush (); add `Await
 98 |   | `Await -> ()
 99 |   | `End -> line_add `End; ()
100 |   in
101 |   let rec loop s i max =
102 |     if i > max then add `End else
103 |     let dec = String.get_utf_8_uchar s i in
104 |     add (`Uchar (Uchar.utf_decode_uchar dec));
105 |     loop s (i + Uchar.utf_decode_length dec) max
106 |   in
107 |   loop s 0 (String.length s - 1)
108 | 
109 | let pp_utf_8_lines = pp_utf_8_text ~only_mandatory:true
110 | let pp_utf_8_text = pp_utf_8_text ~only_mandatory:false
111 | 


--------------------------------------------------------------------------------
/src/uuseg_grapheme_cluster.ml:
--------------------------------------------------------------------------------
  1 | (*---------------------------------------------------------------------------
  2 |    Copyright (c) 2014 The uuseg programmers. All rights reserved.
  3 |    SPDX-License-Identifier: ISC
  4 |   ---------------------------------------------------------------------------*)
  5 | 
  6 | (* These are the rules as found in [1], with property values aliases [2]
  7 |    substituted.
  8 | 
  9 |    GB1.               sot ÷ Any
 10 |    GB2.               Any ÷ eot
 11 |    GB3.                CR × LF
 12 |    GB4.        (CN|CR|LF) ÷
 13 |    GB5.                   ÷ (CN|CR|LF)
 14 |    GB6.                 L × (L|V|LV|LVT)
 15 |    GB7.            (LV|V) × (V|T)
 16 |    GB8.           (LVT|T) × T
 17 |    GB9.                   × (EX|ZWJ)
 18 |    GB9a.                  × SM
 19 |    GB9b.               PP ×
 20 |    GB9c. \p{InCB=Consonant} [\p{InCB=Extend}\p{InCB=Linker}]*
 21 |          \p{InCB=Linker} [\p{InCB=Extend}\p{InCB=Linker}]*
 22 |          ×
 23 |          \p{InCB=Consonant}
 24 |    GB11. \p{Extended_Pictographic} EX* ZWJ x \p{Extended_Pictographic}
 25 |    GB12.  sot (RI RI)* RI × RI
 26 |    GB13.   [^RI] (RI RI)* × RI
 27 |    GB999.             Any ÷ Any
 28 | 
 29 |    [1]: http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
 30 |    [2]: http://www.unicode.org/Public/7.0.0/ucd/PropertyValueAliases.txt
 31 |    [3]: http://www.unicode.org/Public/7.0.0/ucd/auxiliary/GraphemeBreakTest.html
 32 | 
 33 |    By the structure of the rules we see that grapheme clusters
 34 |    boundaries can *mostly* be determined by simply looking at the
 35 |    grapheme cluster break property value of the character on the left
 36 |    and on the right of a boundary. The exceptions are GB9c, GB10 and GB12-13
 37 |    which are handled specially by enriching the segmenter state in
 38 |    a horribly ad-hoc fashion. *)
 39 | 
 40 | type gcb =
 41 |   | CN | CR | EX | EB | EBG | EM | GAZ | L | LF | LV | LVT | PP | RI
 42 |   | SM | T | V | XX | ZWJ | Sot
 43 | 
 44 | type incb = Consonant | Extend | Linker | None'
 45 | 
 46 | (* WARNING. The indexes used here need to be synchronized with those
 47 |    assigned by uucp for Uucp.Break.Low.{grapheme_cluster,indic_conjunct_break}
 48 | *)
 49 | 
 50 | let byte_to_gcb =
 51 |   [| CN; CR; EX; EB; EBG; EM; GAZ; L; LF; LV; LVT; PP; RI;
 52 |      SM; T; V; XX; ZWJ; |]
 53 | 
 54 | let gcb u = byte_to_gcb.(Uucp.Break.Low.grapheme_cluster u)
 55 | 
 56 | let byte_to_incb = [| Consonant; Extend; Linker; None' |]
 57 | let incb u = byte_to_incb.(Uucp.Break.Low.indic_conjunct_break u)
 58 | 
 59 | type left_gb9c_state = (* Ad-hoc state for matching GB9c *)
 60 | | Reset | Has_consonant | Has_linker
 61 | 
 62 | type state =
 63 | | Fill  (* get next uchar to decide boundary. *)
 64 | | Flush (* an uchar is buffered, client needs to get it out with `Await. *)
 65 | | End   (* `End was added. *)
 66 | 
 67 | type t =
 68 |   { mutable state : state;                                 (* current state. *)
 69 |     mutable left_gb9c : left_gb9c_state;         (* state for matching gb9c. *)
 70 |     mutable left : gcb;            (* break property value left of boundary. *)
 71 |     mutable left_odd_ri : bool;             (* odd number of RI on the left. *)
 72 |     mutable left_emoji_seq : bool;                 (* emoji seq on the left. *)
 73 |     mutable buf : [ `Uchar of Uchar.t ] }                 (* bufferized add. *)
 74 | 
 75 | let nul_buf = `Uchar (Uchar.unsafe_of_int 0x0000)
 76 | 
 77 | let create () =
 78 |   { state = Fill;
 79 |     left_gb9c = Reset;
 80 |     left = Sot; left_odd_ri = false; left_emoji_seq = false;
 81 |     buf = nul_buf (* overwritten *); }
 82 | 
 83 | let copy s = { s with state = s.state; }
 84 | let equal = ( = )
 85 | 
 86 | let gb9c_match s right_incb = match s.left_gb9c, right_incb with
 87 | | Has_linker, Consonant -> true
 88 | | _, _ -> false
 89 | 
 90 | let break s right right_incb right_u = match s.left, right with
 91 | | (* GB1 *)   Sot, _ -> true
 92 |   (* GB2 is handled by `End *)
 93 | | (* GB3 *)   CR, LF -> false
 94 | | (* GB4 *)   (CN|CR|LF), _ -> true
 95 | | (* GB5 *)   _, (CN|CR|LF) -> true
 96 | | (* GB6 *)   L, (L|V|LV|LVT) -> false
 97 | | (* GB7 *)   (LV|V), (V|T) -> false
 98 | | (* GB8 *)   (LVT|T), T -> false
 99 | | (* GB9+a *) _, (EX|ZWJ|SM) -> false
100 | | (* GB9b *)  PP, _ -> false
101 | | (* GB9c *)  _, _ when gb9c_match s right_incb -> false
102 | | (* GB11 *)  ZWJ, _ when s.left_emoji_seq &&
103 |                           Uucp.Emoji.is_extended_pictographic right_u -> false
104 | | (* GB12+13 *) RI, RI when s.left_odd_ri -> false
105 | | (* GB999 *) _, _ -> true
106 | 
107 | let update_left s right right_incb right_u =
108 |   s.left <- right;
109 |   begin match s.left with
110 |   | EX | ZWJ ->
111 |       s.left_odd_ri <- false
112 |       (* keep s.left_emoji_seq as is *)
113 |   | RI ->
114 |       s.left_odd_ri <- not s.left_odd_ri;
115 |       s.left_emoji_seq <- false;
116 |   | _ when Uucp.Emoji.is_extended_pictographic right_u ->
117 |       s.left_odd_ri <- false;
118 |       s.left_emoji_seq <- true;
119 |   | _ ->
120 |       s.left_odd_ri <- false;
121 |       s.left_emoji_seq <- false
122 |   end;
123 |   s.left_gb9c <- begin match right_incb with
124 |   | None' -> Reset
125 |   | Consonant -> Has_consonant
126 |   | Linker when s.left_gb9c = Has_consonant -> Has_linker
127 |   | Extend | Linker -> s.left_gb9c
128 |   end
129 | 
130 | let add s = function
131 | | `Uchar u as add ->
132 |     begin match s.state with
133 |     | Fill ->
134 |         let right = gcb u in
135 |         let right_incb = incb u in
136 |         let break = break s right right_incb u in
137 |         update_left s right right_incb u;
138 |         if not break then add else
139 |         (s.state <- Flush; s.buf <- add; `Boundary)
140 |     | Flush -> Uuseg_base.err_exp_await add
141 |     | End -> Uuseg_base.err_ended add
142 |     end
143 | | `Await ->
144 |     begin match s.state with
145 |     | Flush -> s.state <- Fill; (s.buf :> Uuseg_base.ret)
146 |     | End -> `End
147 |     | Fill -> `Await
148 |     end
149 | | `End ->
150 |     begin match s.state with
151 |     | Fill -> s.state <- End; if s.left = Sot then `End else `Boundary
152 |     | Flush -> Uuseg_base.err_exp_await `End
153 |     | End -> Uuseg_base.err_ended `End
154 |     end
155 | 


--------------------------------------------------------------------------------
/test/usegtrip.ml:
--------------------------------------------------------------------------------
  1 | (*---------------------------------------------------------------------------
  2 |    Copyright (c) 2014 The uuseg programmers. All rights reserved.
  3 |    SPDX-License-Identifier: ISC
  4 |   ---------------------------------------------------------------------------*)
  5 | 
  6 | let str = Printf.sprintf
  7 | let pp = Format.fprintf
  8 | let pp_pos ppf d = pp ppf "%d.%d:(%d,%06X) "
  9 |     (Uutf.decoder_line d) (Uutf.decoder_col d) (Uutf.decoder_count d)
 10 |     (Uutf.decoder_byte_count d)
 11 | 
 12 | let pp_malformed ppf bs =
 13 |   let l = String.length bs in
 14 |   pp ppf "@[malformed bytes @[(";
 15 |   if l > 0 then pp ppf "%02X" (Char.code (bs.[0]));
 16 |   for i = 1 to l - 1 do pp ppf "@ %02X" (Char.code (bs.[i])) done;
 17 |   pp ppf ")@]@]"
 18 | 
 19 | let exec = Filename.basename Sys.executable_name
 20 | let log f = Format.eprintf ("%s: " ^^ f ^^ "@?") exec
 21 | 
 22 | let input_malformed = ref false
 23 | let log_malformed inf d bs =
 24 |   input_malformed := true;
 25 |   log "%s:%a: %a@." inf pp_pos d pp_malformed bs
 26 | 
 27 | let u_rep = `Uchar Uutf.u_rep
 28 | 
 29 | (* Output *)
 30 | 
 31 | let uchar_ascii delim ppf =
 32 |   let last_was_u = ref false in
 33 |   function
 34 |   | `Uchar u ->
 35 |       if !last_was_u then (Format.pp_print_char ppf ' ');
 36 |       last_was_u := true; pp ppf "U+%04X" (Uchar.to_int u)
 37 |   | `Boundary ->
 38 |       last_was_u := false; pp ppf "%s" delim
 39 |   | `End -> ()
 40 | 
 41 | let uchar_encoder enc delim =
 42 |   let enc = match enc with
 43 |   | `ISO_8859_1 | `US_ASCII -> `UTF_8
 44 |   | #Uutf.encoding as enc -> enc
 45 |   in
 46 |   let delim =
 47 |     let add acc _ = function
 48 |     | `Uchar _ as u -> u :: acc
 49 |     | `Malformed bs ->
 50 |         log "delimiter: %a" pp_malformed bs; u_rep :: acc
 51 |     in
 52 |     List.rev (Uutf.String.fold_utf_8 add [] delim)
 53 |   in
 54 |   let e = Uutf.encoder enc (`Channel stdout) in
 55 |   function
 56 |   | `Uchar _ | `End as v -> ignore (Uutf.encode e v)
 57 |   | `Boundary -> List.iter (fun u -> ignore (Uutf.encode e u)) delim
 58 | 
 59 | let out_fun delim ascii oe =
 60 |   if ascii then uchar_ascii delim Format.std_formatter else
 61 |   uchar_encoder oe delim
 62 | 
 63 | (* Trip *)
 64 | 
 65 | let segment boundary inf d first_dec out =
 66 |   let segmenter = Uuseg.create boundary in
 67 |   let rec add v = match Uuseg.add segmenter v with
 68 |   | `Uchar _ | `Boundary as v -> out v; add `Await
 69 |   | `Await | `End -> ()
 70 |   in
 71 |   let rec loop d = function
 72 |   | `Uchar _ as v -> add v; loop d (Uutf.decode d)
 73 |   | `End as v -> add v; out `End
 74 |   | `Malformed bs -> log_malformed inf d bs; add u_rep; loop d (Uutf.decode d)
 75 |   | `Await -> assert false
 76 |   in
 77 |   if Uutf.decoder_removed_bom d then add (`Uchar Uutf.u_bom);
 78 |   loop d first_dec
 79 | 
 80 | let trip seg inf enc delim ascii =
 81 |   try
 82 |     let ic = if inf = "-" then stdin else open_in inf in
 83 |     let d = Uutf.decoder ?encoding:enc (`Channel ic) in
 84 |     let first_dec = Uutf.decode d in            (* guess encoding if needed. *)
 85 |     let out = out_fun delim ascii (Uutf.decoder_encoding d) in
 86 |     segment seg inf d first_dec out;
 87 |     if inf <> "-" then close_in ic;
 88 |     flush stdout
 89 |   with Sys_error e -> log "%s@." e; exit 1
 90 | 
 91 | (* Version *)
 92 | 
 93 | let unicode_version () = Format.printf "%s@." Uuseg.unicode_version
 94 | 
 95 | (* Cmd *)
 96 | 
 97 | let do_cmd cmd seg inf enc delim ascii = match cmd with
 98 | | `Unicode_version -> unicode_version ()
 99 | | `Trip -> trip seg inf enc delim ascii
100 | 
101 | (* Cmdline interface *)
102 | 
103 | open Cmdliner
104 | 
105 | let cmd =
106 |   let doc = "Output supported Unicode version." in
107 |   let unicode_version = `Unicode_version, Arg.info ["unicode-version"] ~doc in
108 |   Arg.(value & vflag `Trip [unicode_version])
109 | 
110 | let seg_docs = "SEGMENTATION"
111 | let seg =
112 |   let docs = seg_docs in
113 |   let doc = "Line break opportunities boundaries." in
114 |   let line = `Line_break, Arg.info ["l"; "line"] ~doc ~docs in
115 |   let doc = "Grapheme cluster boundaries." in
116 |   let gc = `Grapheme_cluster, Arg.info ["g"; "grapheme-cluster"] ~doc ~docs in
117 |   let doc = "Word boundaries (default)." in
118 |   let w = `Word, Arg.info ["w"; "word"] ~doc ~docs in
119 |   let doc = "Sentence boundaries." in
120 |   let s = `Sentence, Arg.info ["s"; "sentence"] ~doc ~docs in
121 |   Arg.(value & vflag `Word [line; gc; w; s])
122 | 
123 | let file =
124 |   let doc = "The input file. Reads from stdin if unspecified." in
125 |   Arg.(value & pos 0 string "-" & info [] ~doc ~docv:"FILE")
126 | 
127 | let enc =
128 |   let enc = [ "UTF-8", `UTF_8; "UTF-16", `UTF_16; "UTF-16LE", `UTF_16LE;
129 |               "UTF-16BE", `UTF_16BE; "ASCII", `US_ASCII; "latin1", `ISO_8859_1 ]
130 |   in
131 |   let doc = str "Input encoding, must %s. If unspecified the encoding is \
132 |                  guessed. The output encoding is the same as the input \
133 |                  encoding except for ASCII and latin1 where UTF-8 is output."
134 |                 (Arg.doc_alts_enum enc)
135 |   in
136 |   Arg.(value & opt (some (enum enc)) None & info [ "e"; "encoding" ] ~doc)
137 | 
138 | let ascii =
139 |   let doc = "Output the input text as space (U+0020) separated Unicode
140 |              scalar values written in the US-ASCII charset."
141 |   in
142 |   Arg.(value & flag & info ["a"; "ascii"] ~doc)
143 | 
144 | let delim =
145 |   let doc = "The UTF-8 encoded delimiter used to denote boundaries." in
146 |   Arg.(value & opt string "|" & Arg.info [ "d"; "delimiter" ] ~doc ~docv:"SEP")
147 | 
148 | let cmd =
149 |   let doc = "segment Unicode text" in
150 |   let man = [
151 |     `S "DESCRIPTION";
152 |     `P "$(tname) inputs Unicode text from stdin and rewrites it
153 |         to stdout with segment boundaries as determined according
154 |         the locale independent specifications of UAX 29 and UAX 14.
155 |         Boundaries are represented by the UTF-8 encoded delimiter string
156 |         specified with the option $(b,-d) (defaults to `|').";
157 |     `P "Invalid byte sequences in the input are reported on stderr and
158 |         replaced by the Unicode replacement character (U+FFFD) in the output.";
159 |     `S seg_docs;
160 |     `S "OPTIONS";
161 |     `S "EXIT STATUS";
162 |     `P "$(tname) exits with one of the following values:";
163 |     `I ("0", "no error occured");
164 |     `I ("1", "a command line parsing error occured");
165 |     `I ("2", "the input text was malformed");
166 |     `S "BUGS";
167 |     `P "This program is distributed with the Uuseg OCaml library.
168 |         See http://erratique.ch/software/uuseg for contact
169 |         information."; ]
170 |   in
171 |   Cmd.v (Cmd.info "usegtrip" ~version:"%%VERSION%%" ~doc ~man)
172 |     Term.(const do_cmd $ cmd $ seg $ file $ enc $ delim $ ascii)
173 | 
174 | let main () = match Cmd.eval cmd with
175 | | 0 -> if !input_malformed then exit 2 else exit 0
176 | | c when c = Cmd.Exit.cli_error -> exit 1
177 | | c -> exit c
178 | 
179 | let () = if !Sys.interactive then () else main ()
180 | 


--------------------------------------------------------------------------------
/src/uuseg_word.ml:
--------------------------------------------------------------------------------
  1 | (*---------------------------------------------------------------------------
  2 |    Copyright (c) 2014 The uuseg programmers. All rights reserved.
  3 |    SPDX-License-Identifier: ISC
  4 |   ---------------------------------------------------------------------------*)
  5 | 
  6 | (* These are the rules as found in [1], with property values aliases [2]
  7 |    substituted.
  8 | 
  9 |    WB1.                 sot ÷ Any
 10 |    WB2.                 Any ÷ eot
 11 |    WB3.                  CR × LF
 12 |    WB3a.         (NL|CR|LF) ÷
 13 |    WB3b.                    ÷ (NL|CR|LF)
 14 |    WB3c.                ZWJ × \p{Extended_Pictographic}
 15 |    WB3d.          WSegSpace × WSegSpace
 16 |    WB4.  X (Extend|FO|ZWJ)* → X
 17 |    WB5.             (LE|HL) × (LE|HL)
 18 |    WB6.             (LE|HL) × (ML|MB|SQ) (LE|HL)
 19 |    WB7.  (LE|HL) (ML|MB|SQ) × (LE|HL)
 20 |    WB7a.                 HL × SQ
 21 |    WB7b.                 HL × DQ HL
 22 |    WB7c.              HL DQ × HL
 23 |    WB8.                  NU × NU
 24 |    WB9.             (LE|HL) × NU
 25 |    WB10.                 NU × (LE|HL)
 26 |    WB11.      NU (MN|MB|SQ) × NU
 27 |    WB12.                 NU × (MN|MB|SQ) NU
 28 |    WB13.                 KA × KA
 29 |    WB13a.  (LE|HL|NU|KA|EX) × EX
 30 |    WB13b.                EX × (LE|HL|NU|KA)
 31 |    WB15     sot (RI RI)* RI × RI
 32 |    WB15   [^RI] (RI RI)* RI × RI
 33 |    WB13c.                RI × RI
 34 |    WB999.               Any ÷ Any
 35 | 
 36 |    [1]: http://www.unicode.org/reports/tr29/#Word_boundaries
 37 |    [2]: http://www.unicode.org/Public/7.0.0/ucd/PropertyValueAliases.txt
 38 | 
 39 |    Given the structure of the rules we keep a window of four word
 40 |    break property value slots, two on the left, two on the right of a
 41 |    boundary and pattern match these slots to find the rule that
 42 |    applies. Because of WB4 these slots may actually correspond to more
 43 |    than one character and we need to bufferize the data for the slot r0.
 44 | 
 45 |    Besides we maintain two views of the window slots, one which has the
 46 |    word break property of concrete characters and another one that has
 47 |    the word break property as seen by the WB4 rewrite rule (for the
 48 |    right slots this coincides), see the *_wb4 fields in the state.
 49 | 
 50 |                             ---??--->
 51 |                      +----+----++----+----+
 52 |                   ...| l1 | l0 || r0 | r1 |
 53 |                      +----+----++----+----+
 54 |    already returned to client /  \ buffered in segmenter *)
 55 | 
 56 | type word =
 57 | | CR | DQ | EX | EB | EBG | EM | Extend | FO | GAZ | HL | KA | LE | LF
 58 | | MB | ML | MN | NL | NU | RI | SQ | WSegSpace | XX | ZWJ | Invalid | Sot | Eot
 59 | 
 60 | (* WARNING. The indexes used here need to be synchronized with those
 61 |    assigned by uucp for Uucp.Break.Low.word. *)
 62 | 
 63 | let byte_to_word =
 64 |   [| CR; DQ; EX; EB; EBG; EM; Extend; FO; GAZ; HL; KA; LE; LF;
 65 |      MB; ML; MN; NL; NU; RI; SQ; WSegSpace; XX; ZWJ |]
 66 | 
 67 | let word u = byte_to_word.(Uucp.Break.Low.word u)
 68 | 
 69 | type state =
 70 | | Fill   (* fill slots on the right of boundary. *)
 71 | | Flush  (* flush the first element of slot r0 to get to next boundary. *)
 72 | | Decide (* decide boundary of slot r0 *)
 73 | 
 74 | type t =
 75 |   { mutable state : state;
 76 |     mutable l1 : word; mutable l1_wb4 : word; (* l1 according to wb4 *)
 77 |     mutable l0 : word; mutable l0_wb4 : word; (* l0 according to wb4 *)
 78 |     mutable l0_odd_ri : bool; (* odd number of RI on left of break point. *)
 79 |     mutable r0 : word; (* of first element in r0_data. *)
 80 |     mutable r0_data : Uuseg_buf.t; (* data in r0 *)
 81 |     mutable r1 : word;
 82 |     mutable r1_data : [`Uchar of Uchar.t ] option; (* data in r1 (if any) *)
 83 |     mutable ended : bool; (* [true] if [`End] was added. *) }
 84 | 
 85 | let create () =
 86 |   { state = Fill;
 87 |     l1 = Invalid; l1_wb4 = Invalid;
 88 |     l0 = Sot; l0_wb4 = Sot;
 89 |     l0_odd_ri = false;
 90 |     r0 = Invalid;
 91 |     r0_data = Uuseg_buf.create 13;
 92 |     r1 = Invalid;
 93 |     r1_data = None;
 94 |     ended = false; }
 95 | 
 96 | let copy s = { s with r0_data = Uuseg_buf.copy s.r0_data; }
 97 | let equal = ( = )
 98 | 
 99 | let has_break s = match s.l1, s.l0 (**),(**) s.r0, s.r1 with
100 | | (* WB1 *)  _, Sot, _, _ -> true
101 | | (* WB2 *)  _, _, Eot, _ -> true
102 | | (* WB3 *)  _, CR, LF, _ -> false
103 | | (* WB3a *) _, (NL|CR|LF), _, _ -> true
104 | | (* WB3b *) _, _, (NL|CR|LF), _ -> true
105 | | (* WB3c *) _, ZWJ, _, _ when
106 |     not (Uuseg_buf.empty s.r0_data) &&
107 |     Uucp.Emoji.is_extended_pictographic (Uuseg_buf.get_first s.r0_data) ->
108 |     false
109 | | (* WB3d *)  _, WSegSpace, WSegSpace, _ -> false
110 | | _ -> (* apply WB4 rewrite and match *)
111 |     match s.l1_wb4, s.l0_wb4 (**),(**) s.r0, s.r1 with
112 |     | (* WB4 *)   _, _, (Extend|FO|ZWJ), _ -> false
113 |     | (* WB5 *)   _, (LE|HL), (LE|HL), _ -> false
114 |     | (* WB6 *)   _, (LE|HL), (ML|MB|SQ), (LE|HL) -> false
115 |     | (* WB7 *)   (LE|HL), (ML|MB|SQ), (LE|HL), _ -> false
116 |     | (* WB7a *)  _, HL, SQ, _ -> false
117 |     | (* WB7b *)  _, HL, DQ, HL -> false
118 |     | (* WB7c *)  HL, DQ, HL, _ -> false
119 |     | (* WB8 *)   _, NU, NU, _ -> false
120 |     | (* WB9 *)   _, (LE|HL), NU, _ -> false
121 |     | (* WB10 *)  _, NU, (LE|HL), _ -> false
122 |     | (* WB11 *)  NU, (MN|MB|SQ), NU, _ -> false
123 |     | (* WB12 *)  _, NU, (MN|MB|SQ), NU -> false
124 |     | (* WB13 *)  _, KA, KA, _ -> false
125 |     | (* WB13a *) _, (LE|HL|NU|KA|EX), EX, _ -> false
126 |     | (* WB13b *) _, EX, (LE|HL|NU|KA), _ -> false
127 |     | (* WB15-16 *) _, RI, RI, _ when s.l0_odd_ri -> false
128 |     | (* WB999 *) _, _, _, _ -> true
129 | 
130 | let next s = (* moves to the next boundary and returns the char in r0 *)
131 |   s.l1 <- s.l0;
132 |   s.l0 <- s.r0;
133 |   (* The lb4 window only moves when r0 is not one of the absorbed chars. *)
134 |   begin match s.r0 with
135 |   | Extend | FO | ZWJ -> () (* wb4 window doesn't move *)
136 |   | _ ->
137 |       s.l1_wb4 <- s.l0_wb4;
138 |       s.l0_wb4 <- s.r0;
139 |       s.l0_odd_ri <- (match s.l0_wb4 with RI -> not s.l0_odd_ri | _ -> false);
140 |   end;
141 |   let ret = Uuseg_buf.flush s.r0_data in
142 |   match Uuseg_buf.empty s.r0_data with
143 |   | false -> s.r0 <- word (Uuseg_buf.get_first s.r0_data); ret
144 |   | true ->
145 |       s.r0 <- s.r1;
146 |       s.r1 <- Invalid;
147 |       match s.r1_data with
148 |       | None -> ret
149 |       | Some u -> Uuseg_buf.add s.r0_data u; s.r1_data <- None; ret
150 | 
151 | let need_fill s = Uuseg_buf.empty s.r0_data || s.r1_data = None
152 | let flush s = match s.ended with
153 | | false ->
154 |     let ret = next s in
155 |     (if need_fill s then s.state <- Fill else s.state <- Decide);
156 |     ret
157 | | true ->
158 |     match s.r0 with
159 |     | Eot -> `End
160 |     | _ -> s.state <- Decide; next s
161 | 
162 | let decide s = if has_break s then (s.state <- Flush; `Boundary) else flush s
163 | 
164 | let add s = function
165 | | `Uchar u as add ->
166 |     if s.ended then Uuseg_base.err_ended add else
167 |     begin match s.state with
168 |     | Fill when Uuseg_buf.empty s.r0_data ->
169 |         Uuseg_buf.add s.r0_data add; s.r0 <- word u; `Await
170 |     | Fill ->
171 |         begin match word u with
172 |         | Extend | FO | ZWJ -> (* WB4 *) Uuseg_buf.add s.r0_data add; `Await
173 |         | word -> s.r1_data <- Some add; s.r1 <- word; decide s
174 |         end
175 |     | Flush | Decide -> Uuseg_base.err_exp_await add
176 |     end
177 | | `Await ->
178 |     begin match s.state with
179 |     | Flush -> flush s
180 |     | Decide -> decide s
181 |     | Fill -> `Await
182 |     end
183 | | `End ->
184 |     if s.ended then Uuseg_base.err_ended `End else
185 |     begin match s.state with
186 |     | Fill ->
187 |         s.ended <- true;
188 |         (if Uuseg_buf.empty s.r0_data then s.r0 <- Eot else s.r1 <- Eot);
189 |         if s.l0 = Sot && s.r0 = Eot then (* empty string *) `End else
190 |         decide s
191 |     | Flush | Decide -> Uuseg_base.err_exp_await `End
192 |     end
193 | 


--------------------------------------------------------------------------------
/test/test_uuseg.ml:
--------------------------------------------------------------------------------
  1 | (*---------------------------------------------------------------------------
  2 |    Copyright (c) 2014 The uuseg programmers. All rights reserved.
  3 |    SPDX-License-Identifier: ISC
  4 |   ---------------------------------------------------------------------------*)
  5 | 
  6 | (* Uuseg tests, including Unicode's Segmentation and Line break conformance
  7 |    tests. *)
  8 | 
  9 | open B0_testing
 10 | open B0_std
 11 | 
 12 | let pp_boundary ppf = function
 13 | | `Grapheme_cluster -> Format.fprintf ppf "Grapheme cluster"
 14 | | `Word -> Format.fprintf ppf "Word"
 15 | | `Sentence -> Format.fprintf ppf "Sentence"
 16 | | `Line_break -> Format.fprintf ppf "Line break"
 17 | 
 18 | let rec pp_spec ppf = function
 19 | | [] -> ()
 20 | | `B :: spec -> Format.fprintf ppf "÷ "; pp_spec ppf spec
 21 | | `U u :: spec ->
 22 |     Format.fprintf ppf "%04X " (Uchar.to_int u);
 23 |     (match spec with (`U _) :: _ -> Format.fprintf ppf "× " | _ -> ());
 24 |     pp_spec ppf spec
 25 | 
 26 | (* Conformance data decoding *)
 27 | 
 28 | let cp_of_string v = (* parses a code point value. *)
 29 |   let is_hex c = (0x30 <= c && c <= 0x39) || (0x41 <= c && c <= 0x46) in
 30 |   let cp = ref 0 in
 31 |   for k = 0 to (String.length v) - 1 do
 32 |     let c = Char.code v.[k] in
 33 |     if not (is_hex c) then failwith v else
 34 |     cp := !cp * 16 + (if c <= 0x39 then c - 48 else c - 55)
 35 |   done;
 36 |   !cp
 37 | 
 38 | let decode_conformance_specs ignores ic =
 39 |   let rec loop specs =
 40 |     match try Some (input_line ic) with End_of_file -> None with
 41 |     | None -> List.rev specs
 42 |     | Some l ->
 43 |         if String.length l > 0 && l.[0] = '#' then loop specs else
 44 |         try begin match String.split_on_char '#' l with
 45 |         | [comment] -> loop specs
 46 |         | test :: comment ->
 47 |             let spec = String.split_on_char ' ' test in
 48 |             begin try
 49 |               let rec to_spec acc = function
 50 |               | ( "\xC3\x97" (* × *) | "\xC3\x97\t" ) :: rest ->
 51 |                   to_spec acc rest
 52 |               | ( "\xC3\xB7" (* ÷ *) | "\xC3\xB7\t") :: rest ->
 53 |                   to_spec (`B :: acc) rest
 54 |               | uchar :: rest ->
 55 |                   let u = cp_of_string uchar in
 56 |                   if not (Uchar.is_valid u) then raise Exit else
 57 |                   to_spec (`U (Uchar.of_int u) :: acc) rest
 58 |               | [] ->
 59 |                   List.rev acc
 60 |               in
 61 |               let spec = to_spec [] spec in
 62 |               if ignores = [] then loop (spec :: specs) else
 63 |               try
 64 |                 let reason = List.assoc spec ignores in
 65 |                 Test.log "Skip test (%s): %s" reason test;
 66 |                 loop (specs)
 67 |               with
 68 |               | Not_found -> loop (spec :: specs)
 69 |               with Exit ->
 70 |                 Test.log
 71 |                   "Skip test (surrogate not a scalar value): %s" test;
 72 |                   loop specs
 73 |             end
 74 |         | [] -> failwith ""
 75 |         end
 76 |         with Failure f ->
 77 |           Test.fail "FAILURE: `%s'" f;
 78 |           Test.fail "Unable to parse line:\n`%s'\n" l;
 79 |           loop specs
 80 |   in
 81 |   loop []
 82 | 
 83 | let rec seq_of_spec acc = function
 84 | | `U u :: rest -> seq_of_spec (u :: acc) rest
 85 | | `B :: rest -> seq_of_spec acc rest
 86 | | [] -> List.rev acc
 87 | 
 88 | (* Conformance testing *)
 89 | 
 90 | let test_spec ?__POS__ seg src spec =
 91 |   let n = Uuseg.create seg in
 92 |   let ended = ref false in
 93 |   let rec add acc v = match Uuseg.add n v with
 94 |   | `Uchar u -> add ((`U u) :: acc) `Await
 95 |   | `Boundary -> add (`B :: acc) `Await
 96 |   | `Await -> ended := false; acc
 97 |   | `End -> ended := true; acc
 98 |   in
 99 |   let add_uchar acc u = add acc (`Uchar u) in
100 |   let nseq = List.rev (add (List.fold_left add_uchar [] src) `End) in
101 |   if not !ended then
102 |     Test.fail ?__POS__ "%a did not finish with `End." pp_boundary seg else
103 |   if nseq <> spec then
104 |     Test.fail ?__POS__ "@[<v>%a mismatch:@,impl: %a@,spec: %a@]"
105 |       pp_boundary seg pp_spec nseq pp_spec spec
106 |   else Test.pass ()
107 | 
108 | let test_conformance seg name ignores inf =
109 |   Test.test (Printf.sprintf "conformance of %s" name) @@ fun () ->
110 |   let specs =
111 |     try
112 |       In_channel.with_open_bin (Fpath.to_string inf) @@ fun ic ->
113 |       decode_conformance_specs ignores ic
114 |     with Sys_error e -> Test.failstop "%s" e
115 |   in
116 |   let test spec = test_spec seg (seq_of_spec [] spec) spec in
117 |   let fail ?__POS__ n ~assertions =
118 |     Test.log_fail "%a assertions %a"
119 |       Test.Fmt.fail_count_ratio (n, assertions) Test.Fmt.failed ()
120 |   in
121 |   Test.block ~__POS__ ~fail @@ fun () ->
122 |   List.iter test specs
123 | 
124 | let test_others =
125 |   let u = Uchar.of_int in
126 |   Test.test "other specifications" @@ fun () ->
127 |   let g = `Grapheme_cluster in
128 |   let test_spec ?__POS__ seg src spec =
129 |     ignore (test_spec ?__POS__ seg src spec)
130 |   in
131 |   test_spec ~__POS__ g [] [];
132 |   test_spec ~__POS__ g [u 0x0020] [`B; `U (u 0x0020); `B;];
133 |   test_spec ~__POS__ g (* éa *) [u 0x0065; u 0x0301; u 0x0061;]
134 |     [`B; `U (u 0x0065); `U (u 0x0301); `B; `U (u 0x0061); `B;];
135 |   let w = `Word in
136 |   test_spec ~__POS__ w [] [];
137 |   let s = `Sentence in
138 |   test_spec ~__POS__ s [] [];
139 |   let l = `Line_break in
140 |   test_spec ~__POS__ l [] [];
141 |   ()
142 | 
143 | let test_uuseg_string =
144 |   Test.test "Uuseg_string" @@ fun () ->
145 |   let rec pp_list ppf = function
146 |   | [] -> ()
147 |   | s :: ss -> Format.fprintf ppf "%S;@ " s; pp_list ppf ss
148 |   in
149 |   let fold8 seg s =
150 |     List.rev (Uuseg_string.fold_utf_8 seg (fun acc s -> s :: acc) [] s)
151 |   in
152 |   let test ?__POS__ l spec =
153 |     if l = spec then () else
154 |     Test.fail ?__POS__
155 |       "@[<v>Mismatch:@,impl: @[[%a]@]@,spec: @[[%a]@]" pp_list l pp_list spec
156 |   in
157 |   test ~__POS__ (fold8 `Grapheme_cluster "") [];
158 |   test ~__POS__  (fold8 `Grapheme_cluster "ab cd") ["a"; "b"; " "; "c"; "d"];
159 |   test ~__POS__ (fold8 `Word "") [];
160 |   test ~__POS__ (fold8 `Word "ab cd") ["ab"; " "; "cd"];
161 |   test ~__POS__ (fold8 `Sentence "") [];
162 |   test ~__POS__ (fold8 `Sentence "ab cd") ["ab cd"];
163 |   test ~__POS__ (fold8 `Line_break "") [];
164 |   test ~__POS__ (fold8 `Line_break "ab cd") ["ab "; "cd"];
165 |   ()
166 | 
167 | let test_LB30b_assumption =
168 |   Test.test "LB30b's data assumption" @@ fun () ->
169 |   (* This is needed by our implementation of LB30b *)
170 |   let rec loop u =
171 |     let c = Uucp.Emoji.is_extended_pictographic u &&
172 |             Uucp.Gc.general_category u = `Cn
173 |     in
174 |     if c then begin
175 |       if Uucp.Break.line u = `ID ||
176 |          Uucp.Break.line u = `XX (* Since Unicode 16.0.0 a few unassigned
177 |                                     chars satisfy [c]. Those get transformed
178 |                                     to AL *)
179 |       then () else
180 |       (Test.fail "LB30b assumption failure for U+%04X: %a"
181 |          (Uchar.to_int u) Uucp.Break.pp_line (Uucp.Break.line u))
182 |     end;
183 |     if Uchar.equal u Uchar.max then () else loop (Uchar.succ u)
184 |   in
185 |   loop Uchar.min
186 | 
187 | let main () =
188 |   let open Cmdliner in
189 |   let g_file =
190 |     Arg.(value & opt B0_std_cli.filepath (Fpath.v "test/GraphemeBreakTest.txt")
191 |          & info ["g"] ~doc:"The GraphemeBreakTest.txt file")
192 |   in
193 |   let w_file =
194 |     Arg.(value & opt B0_std_cli.filepath (Fpath.v "test/WordBreakTest.txt")
195 |          & info ["w"] ~doc:"The WordBreakTest.txt file")
196 |   in
197 |   let s_file =
198 |     Arg.(value & opt B0_std_cli.filepath (Fpath.v "test/SentenceBreakTest.txt")
199 |          & info ["s"] ~doc:"The SentenceBreakTest.txt file")
200 |   in
201 |   let l_file =
202 |     Arg.(value & opt B0_std_cli.filepath (Fpath.v "test/LineBreakTest.txt")
203 |          & info ["lb"] ~doc:"The LineBreakTest.txt file")
204 |   in
205 |   let args =
206 |     let open Term.Syntax in
207 |     let+ g_file and+ w_file and+ s_file and+ l_file in
208 |     (g_file, w_file, s_file, l_file)
209 |   in
210 |   let doc = "Run Unicode segmentation conformance tests" in
211 |   Test.main' ~doc args @@ fun (g_file, w_file, s_file, l_file) ->
212 |   test_LB30b_assumption ();
213 |   test_conformance `Grapheme_cluster "grapheme cluster boundary" [] g_file ();
214 |   test_conformance `Word "word boundary" [] w_file ();
215 |   test_conformance `Sentence "sentence boundary" [] s_file ();
216 |   test_conformance `Line_break "line break boundary" [] l_file ();
217 |   test_others ();
218 |   test_uuseg_string ()
219 | 
220 | let () = if !Sys.interactive then () else exit (main ())
221 | 


--------------------------------------------------------------------------------
/src/uuseg_sentence.ml:
--------------------------------------------------------------------------------
  1 | (*---------------------------------------------------------------------------
  2 |    Copyright (c) 2014 The uuseg programmers. All rights reserved.
  3 |    SPDX-License-Identifier: ISC
  4 |   ---------------------------------------------------------------------------*)
  5 | 
  6 | (* These are the rules as found in [1], with property values aliases [2]
  7 |    substituted.
  8 | 
  9 |    SB1.                          sot ÷ Any
 10 |    SB2.                          Any ÷ eot
 11 |    SB3.                           CR × LF
 12 |    SB4.                   (SE|CR|LF) ÷
 13 |    SB5.                   X (EX|FO)* → X
 14 |    SB6.                           AT × NU
 15 |    SB7.                   (UP|LO) AT × UP
 16 | 
 17 |    SB8.                   AT CL* SP* × (¬(LE|UP|LO|SE|CR|LF|ST|AT))* LO
 18 |     rewrite w.o. ¬        AT CL* SP* × (CL|NU|SC|SP|XX)* LO
 19 |     factorize             AT CL* SP* × (NU|XX)? (CL|NU|SC|SP|XX)* LO
 20 |    SB8a.             (ST|AT) CL* SP* × (SC|ST|AT)
 21 |    SB9.                  (ST|AT) CL* × (CL|SP|SE|CR|LF)
 22 |    SB10.             (ST|AT) CL* SP* × (SP|SE|CR|LF)
 23 |    SB11. (ST|AT) CL* SP* (SE|CR|LF)? ÷
 24 |    SB12.                         Any × Any
 25 | 
 26 | 
 27 |    [1]: http://www.unicode.org/reports/tr29/#Sentence_boundaries
 28 |    [2]: http://www.unicode.org/Public/7.0.0/ucd/PropertyValueAliases.txt
 29 | 
 30 |    Given the structure of the rules we keep a window of three break
 31 |    property value slots, two on the left, one on the right of a
 32 |    boundary and pattern match these slots to find the rule that
 33 |    applies. the rules SB8 to SB11 are handled separately, and we keep
 34 |    a buffer to handle the lookahead needed for SB8.
 35 |                              ---??--->
 36 |                       +----+----++----+-------...
 37 |                       | l1 | l0 || r0 | SB8_buffer
 38 |                       +----+----++----+-------...
 39 |     already returned to client /  \ buffered in segmenter *)
 40 | 
 41 | type sentence =
 42 |   | AT | CL | CR | EX | FO | LE | LF | LO | NU | SC | SE | SP | ST | UP | XX
 43 |   | Invalid | Sot | Eot
 44 | 
 45 | (* WARNING. The indexes used here need to be synchronized with those
 46 |    assigned by uucp for Uucp.Break.Low.sentence. *)
 47 | 
 48 | let byte_to_sentence =
 49 |   [| AT; CL; CR; EX; FO; LE; LF; LO; NU; SC; SE; SP; ST; UP; XX|]
 50 | 
 51 | let sentence u = byte_to_sentence.(Uucp.Break.Low.sentence u)
 52 | 
 53 | type state =
 54 |   | Fill       (* fill slots on the right of boundary *)
 55 |   | Fill_CL_SP (* ad-hoc state to handle SB8 to SB11 *)
 56 |   | Fill_SB8   (* ad-hoc state to handle SB8 *)
 57 |   | Flush      (* flush slot r0 to get to next boundary. *)
 58 |   | End        (* `End was added. *)
 59 | 
 60 | type t =
 61 |   { mutable state : state;                                 (* current state. *)
 62 |     window : sentence array;                                (* break window. *)
 63 |     mutable l0 : int;                          (* index in [window] of [l0]. *)
 64 |     r0_buf : Uuseg_buf.t;                                  (* buffer for r0. *)
 65 |     sb8_buf : Uuseg_buf.t; }                    (* buffer for resolving SB8. *)
 66 | 
 67 | let create () =
 68 |   { state = Fill;
 69 |     window = [|Invalid; Sot; Invalid;|];
 70 |     l0 = 1;
 71 |     r0_buf = Uuseg_buf.create 13;
 72 |     sb8_buf = Uuseg_buf.create 13; }
 73 | 
 74 | let copy s =
 75 |   { s with window = Array.copy s.window;
 76 |            r0_buf = Uuseg_buf.copy s.r0_buf;
 77 |            sb8_buf = Uuseg_buf.copy s.sb8_buf; }
 78 | 
 79 | let equal = ( = )
 80 | 
 81 | let l0_sentence s = s.window.(s.l0)
 82 | let r0_sentence s = s.window.((s.l0 + 1) mod Array.length s.window)
 83 | let r0_sentence_set s l = s.window.((s.l0 + 1) mod Array.length s.window) <- l
 84 | let r0_add s add = Uuseg_buf.add s.r0_buf add
 85 | let r0_empty s = Uuseg_buf.empty s.r0_buf
 86 | let r0_len s = Uuseg_buf.len s.r0_buf
 87 | let r0_flush s = Uuseg_buf.flush s.r0_buf
 88 | let window_move s =
 89 |   s.l0 <- (s.l0 + 1) mod Array.length s.window;
 90 |   r0_sentence_set s Invalid
 91 | 
 92 | (* WARNING. The code that follows is truly horrible, with more time a
 93 |    better way can certainly be found. *)
 94 | 
 95 | let decide_sb8_sb11 s sentence (`Uchar _ as add) =
 96 |   (* AT or ST is in l0 and we have (AT|ST) CL* SP* and
 97 |      sentence <> CL | SP | EX | FO *)
 98 |   match sentence with
 99 |   | SC | ST | AT (* SB8a *)
100 |   | SE | CR | LF (* SB9 SB10 *) ->
101 |       s.state <- Flush; r0_sentence_set s sentence; add
102 |   | LO when l0_sentence s = AT (* SB8 *) ->
103 |       s.state <- Flush; r0_sentence_set s sentence; add
104 |   | NU | XX when l0_sentence s = AT (* check SB8 *) ->
105 |       s.state <- Fill_SB8; Uuseg_buf.add s.sb8_buf add; `Await
106 |   | NU | XX | LE | UP | LO (* SB11 *) ->
107 |       s.state <- Flush; r0_sentence_set s sentence; r0_add s add; `Boundary
108 |   | CL | SP | EX | FO | Invalid | Sot | Eot ->
109 |       assert false
110 | 
111 | let decide s =
112 |   let no_boundary s = r0_flush s in
113 |   let wlen = Array.length s.window in
114 |   let l0 = s.l0 in
115 |   let r0 = (l0 + 1) mod wlen in
116 |   let l1 = (l0 + 2) mod wlen in
117 |   let w = s.window in
118 |   match w.(l1), w.(l0) (**),(**) w.(r0) with
119 |   | (* SB1 *) _, Sot, _ -> `Boundary
120 |   (* SB2 is handled in [add]. *)
121 |   | (* SB3 *) _, CR, LF -> no_boundary s
122 |   | (* SB4 *) _, (SE|CR|LF), _ -> `Boundary
123 |   (* SB5 is handled in [add]. *)
124 |   | (* SB6 *)  _, AT, NU -> no_boundary s
125 |   | (* SB7 *) (UP|LO), AT, UP -> no_boundary s
126 |   | (* SB8-SB11 is also handled in [add]. *)
127 |                _, (AT|ST), sentence -> decide_sb8_sb11 s sentence (r0_flush s)
128 |   | (* SB12 *) _, _, _ -> no_boundary s
129 | 
130 | let rec add s = function
131 | | `Uchar u as addv ->
132 |     begin match s.state with
133 |     | Fill ->
134 |         begin match sentence u with
135 |         | EX | FO as sentence ->
136 |             begin match l0_sentence s with
137 |             | SE | CR | LF | Sot ->
138 |                 s.state <- Flush; r0_sentence_set s sentence; r0_add s addv;
139 |                 decide s
140 |             | _ -> addv (* SB5 *)
141 |             end
142 |         | CL | SP as sentence ->
143 |             begin match l0_sentence s with
144 |             | AT | ST ->
145 |                 s.state <- Fill_CL_SP; r0_sentence_set s sentence;
146 |                 addv (* SB9, SB10 *)
147 |             | _ ->
148 |                 s.state <- Flush; r0_sentence_set s sentence; r0_add s addv;
149 |                 decide s
150 |             end
151 |         | sentence ->
152 |             s.state <- Flush; r0_sentence_set s sentence; r0_add s addv;
153 |             decide s
154 |         end
155 |     | Fill_CL_SP (* we have AT or ST in l0 *) ->
156 |         begin match sentence u with
157 |         | EX | FO -> addv (* SB5 *)
158 |         | CL ->
159 |             begin match r0_sentence s with
160 |             | CL -> addv (* SB9 (or eventually SB8) *)
161 |             | SP ->
162 |                 begin match l0_sentence s with
163 |                 | ST -> (* SB11 *)
164 |                     s.state <- Flush; r0_sentence_set s CL; r0_add s addv;
165 |                     `Boundary
166 |                 | AT -> (* check SB8 *)
167 |                     s.state <- Fill_SB8; Uuseg_buf.add s.sb8_buf addv; `Await
168 |                 | _ -> assert false
169 |                 end
170 |             | _ -> assert false
171 |             end
172 |         | SP ->
173 |             begin match r0_sentence s with
174 |             | CL -> r0_sentence_set s SP; addv (* SB10 (or eventually SB8 *)
175 |             | SP -> addv
176 |             | _ -> assert false
177 |             end
178 |         | sentence ->
179 |             begin match r0_sentence s with
180 |             | CL | SP -> decide_sb8_sb11 s sentence addv
181 |             |_ -> assert false
182 |             end
183 |         end
184 |     | Fill_SB8 ->
185 |         begin match sentence u with
186 |         | EX | FO (* SB5 *)
187 |         | CL | NU | SC | SP | XX -> Uuseg_buf.add s.sb8_buf addv; `Await
188 |         | LO -> (* SB8 *)
189 |             Uuseg_buf.add s.sb8_buf addv; s.state <- Flush; add s `Await
190 |         | _ -> (* SB11 *)
191 |             Uuseg_buf.add s.sb8_buf addv; s.state <- Flush; `Boundary
192 |         end
193 |     | Flush -> Uuseg_base.err_exp_await addv
194 |     | End -> Uuseg_base.err_ended addv
195 |     end
196 | | `Await ->
197 |     begin match s.state with
198 |     | Flush ->
199 |         if not (r0_empty s) then r0_flush s else
200 |         begin
201 |           match Uuseg_buf.len s.sb8_buf with
202 |           | 0 ->  s.state <- Fill; window_move s; `Await
203 |           | 1 ->
204 |               let u = Uuseg_buf.flush s.sb8_buf in
205 |               s.state <- Fill; window_move s; add s u
206 |           | _ ->
207 |               let `Uchar uc as u = Uuseg_buf.flush s.sb8_buf in
208 |               r0_sentence_set s (sentence uc); u
209 |         end
210 |     | End ->
211 |         if not (Uuseg_buf.empty s.sb8_buf)
212 |         then
213 |           (* According to SB8 we bufferized only CL|NU|SC|SP|XX
214 |              given the rules no new boundary will occur except the one
215 |              due to SB2. So we just flush the sb8_buf. *)
216 |           Uuseg_buf.flush s.sb8_buf
217 |         else
218 |         if r0_sentence s = Eot
219 |         then (r0_sentence_set s Invalid; `Boundary (* SB2 *))
220 |         else `End
221 |     | Fill | Fill_CL_SP | Fill_SB8 -> `Await
222 |     end
223 | | `End ->
224 |     begin match s.state with
225 |     | Fill ->
226 |         s.state <- End;
227 |         if l0_sentence s = Sot then `End (* No boundary on empty seq *) else
228 |         `Boundary (* SB1 and SB2 *)
229 |     | Fill_CL_SP -> s.state <- End; `Boundary (* SB2 *)
230 |     | Fill_SB8 -> s.state <- End; r0_sentence_set s Eot; `Boundary (* SB11 *)
231 |     | Flush -> Uuseg_base.err_exp_await `End
232 |     | End -> Uuseg_base.err_ended `End
233 |     end
234 | 


--------------------------------------------------------------------------------
/src/uuseg.mli:
--------------------------------------------------------------------------------
  1 | (*---------------------------------------------------------------------------
  2 |    Copyright (c) 2014 The uuseg programmers. All rights reserved.
  3 |    SPDX-License-Identifier: ISC
  4 |   ---------------------------------------------------------------------------*)
  5 | 
  6 | (** Unicode text segmentation.
  7 | 
  8 |     [Uuseg] segments Unicode text. It implements the locale
  9 |     independent Unicode text segmentation algorithms to detect
 10 |     grapheme cluster, word and sentence boundaries and the Unicode
 11 |     line breaking algorithm to detect line break opportunities.
 12 | 
 13 |     The module is independent from any IO mechanism or Unicode text
 14 |     data structure and it can process text without a complete
 15 |     in-memory representation.
 16 | 
 17 |     The supported Unicode version
 18 |     is determined by the {!unicode_version} value.
 19 | 
 20 |     Consult the {{!basics}basics}, {{!limits}limitations} and
 21 |     {{!examples}examples} of use.
 22 | 
 23 |     {3 References}
 24 |     {ul
 25 |     {-
 26 |     {e {{:http://www.unicode.org/versions/latest}The Unicode Standard}}.
 27 |     (latest version)}
 28 |     {-
 29 |     {e {{:http://www.unicode.org/reports/tr29/}UAX #29 Unicode Text
 30 |     Segmentation}}. (latest version)}
 31 |     {-
 32 |     {e {{:http://www.unicode.org/reports/tr14/}UAX #14 Unicode Line Breaking
 33 |     Algorithm}}. (latest version)}
 34 |     {- Web based {{:http://unicode.org/cldr/utility/breaks.jsp}ICU
 35 |        break utility}.}}
 36 | *)
 37 | 
 38 | (** {1 Segment} *)
 39 | 
 40 | val unicode_version : string
 41 | (** [unicode_version] is the Unicode version supported by [Uuseg]. *)
 42 | 
 43 | type custom
 44 | (** The type for custom segmenters. See {!val:custom}. *)
 45 | 
 46 | type boundary =
 47 |   [ `Grapheme_cluster
 48 |        (** {{:http://www.unicode.org/glossary/#extended_grapheme_cluster}
 49 |            Extended grapheme clusters} according to
 50 |            {{:https://www.unicode.org/reports/tr29/#C1-1}UAX29-C1-1} *)
 51 |   | `Word
 52 |       (** Words according to
 53 |           {{:https://www.unicode.org/reports/tr29/#C2-1}UAX29-C2-1} *)
 54 |   | `Sentence
 55 |       (** Sentences according to
 56 |           {{:https://www.unicode.org/reports/tr29/#C3-1}UAX29-C3-1} *)
 57 |   | `Line_break
 58 |       (** Line breaks accordings to
 59 |           {{:http://www.unicode.org/reports/tr14/}UAX #14} *)
 60 |   | `Custom of custom ]
 61 | (** The type for boundaries. *)
 62 | 
 63 | val pp_boundary : Format.formatter -> boundary -> unit
 64 | (** [pp_boundary ppf b] prints an unspecified representation of [b]
 65 |     on [ppf]. *)
 66 | 
 67 | type t
 68 | (** The type for Unicode text segmenters. *)
 69 | 
 70 | type ret = [ `Boundary | `Uchar of Uchar.t | `Await | `End ]
 71 | (** The type for segmenter results. See {!add}. *)
 72 | 
 73 | val create : [< boundary ] -> t
 74 | (** [create b] is an Unicode text segmenter for boundaries of type [b]. *)
 75 | 
 76 | val boundary : t -> boundary
 77 | (** [boundary s] is the type of boundaries detected by [s]. *)
 78 | 
 79 | val add : t -> [ `Uchar of Uchar.t | `Await | `End ] -> ret
 80 | (** [add s v] is:
 81 |     {ul
 82 |     {- [`Boundary] if there is a boundary at that point in the sequence of
 83 |        characters. The client must then call [add] with [`Await]
 84 |        until [`Await] is returned.}
 85 |     {- [`Uchar u] if [u] is the next character in the sequence.
 86 |        The client must then call [add] with [`Await] until [`Await] is
 87 |        returned.}
 88 |     {- [`Await] when the segmenter is ready to add a new [`Uchar]
 89 |        or [`End].}
 90 |     {- [`End] when [`End] was added and all [`Boundary] and [`Uchar] were
 91 |        output.}}
 92 | 
 93 |     For [v] use [`Uchar u] to add a new character to the sequence to
 94 |     segment and [`End] to signal the end of sequence. After adding one
 95 |     of these two values always call [add] with [`Await] until [`Await]
 96 |     or [`End] is returned.
 97 | 
 98 |     @raise Invalid_argument if [`Uchar] or [`End] is added while
 99 |     that last add did not return [`Await] or if an [`Uchar] or [`End]
100 |     is added after an [`End] was already added. *)
101 | 
102 | val mandatory : t -> bool
103 | (** [mandatory s] is [true] if the last [`Boundary] returned by {!add}
104 |     was mandatory. This function only makes sense for [`Line_break]
105 |     segmenters or [`Custom] segmenters that sport that notion. For
106 |     other segmenters or if no [`Boundary] was returned so far, [true]
107 |     is returned. *)
108 | 
109 | val copy : t -> t
110 | (** [copy s] is a copy of [s] in its current state. Subsequent {!add}s on
111 |     [s] do not affect the copy. *)
112 | 
113 | val equal : t -> t -> bool
114 | (** [equal s0 s1] is [true] iff [s0] and [s1] are in the same state,
115 |     that is any sequence of {!add}s made on both [s0] and [s1] produce
116 |     the same outputs.
117 | 
118 |     @raise Invalid_argument on {{!custom}custom segmenters}.  *)
119 | 
120 | val pp_ret : Format.formatter -> [< ret] -> unit
121 | (** [pp_ret ppf v] prints an unspecified representation of [v] on [ppf]. *)
122 | 
123 | (** {1:custom Custom segmenters} *)
124 | 
125 | val custom :
126 |   ?mandatory:('a -> bool) ->
127 |   name:string ->
128 |   create:(unit -> 'a) ->
129 |   copy:('a -> 'a) ->
130 |   add: ('a -> [ `Uchar of Uchar.t | `Await | `End ] -> ret) -> unit -> custom
131 |   (** [create ~mandatory ~name ~create ~copy ~add] is a custom segmenter.
132 |       {ul
133 |       {- [name] is a name to identify the segmenter.}
134 |       {- [create] is called when the segmenter is {{!create}created}
135 |          it should return a custom segmenter value.}
136 |       {- [copy] is called with the segmenter value whenever the
137 |          segmenter is {{!copy}copied}. It should return a copy of the
138 |          segmenter value.}
139 |       {- [mandatory] is called with the segmenter value to define
140 |          the result of the {!mandatory} function. Defaults always
141 |          returns [true].}
142 |       {- [add] is called with the segmenter value to define
143 |          the result of the {!add} value. The returned value
144 |          should respect the semantics of {!add}. Use the functions
145 |          {!err_exp_await} and {!err_ended} to raise [Invalid_argument]
146 |          exception in {!add}s error cases.}} *)
147 | 
148 | val err_exp_await : [< ret] -> 'a
149 | (** [err_exp_await fnd] should be used by custom segmenters when
150 |     the client tries to {!add} an [`Uchar] or [`End] while the last
151 |     returned value was not an [`Await]. *)
152 | 
153 | val err_ended : [< ret] -> 'a
154 | (** [err_ended ()] should be used by custom segmenter when the client
155 |     tries to {!add} [`Uchar] or [`End] after [`End] was already added. *)
156 | 
157 | (** {1:limits Limitations}
158 | 
159 |     A [`Grapheme_cluster] segmenter will always consume only a small
160 |     bounded amount of memory on any text. Other segmenters will also
161 |     do so on non-degenerate text, but it's possible to feed them with
162 |     input that will make them buffer an arbitrary amount of
163 |     characters. *)
164 | 
165 | (** {1:basics Basics}
166 | 
167 |     A segmenter is a stateful filter that inputs a sequence of characters
168 |     and outputs the same sequence except characters are interleaved
169 |     with [`Boundary] values whenever the segmenter detects a boundary.
170 | 
171 |     The function {!create} returns a new segmenter for a given boundary
172 |     type:
173 | {[
174 | let words = Uuseg.create `Word
175 | ]}
176 |     To add characters to the sequence to segment, call {!add} on
177 |     [words] with [`Uchar _]. To end the sequence call {!add} on [words]
178 |     with [`End]. The segmented sequence of characters is returned character
179 |     by character, interleaved with [`Boundary] values at the appropriate
180 |     places, by the successive calls to {!add}.
181 | 
182 |     The client and the segmenter must wait on each other to limit
183 |     internal buffering: each time the client adds to the sequence
184 |     by calling {!add} with [`Uchar] or [`End] it must continue to
185 |     call {!add} with [`Await] until the segmenter returns [`Await]
186 |     or [`End]. In practice this leads to the following kind of control flow:
187 | {[
188 | let rec add acc v = match Uuseg.add words v with
189 | | `Uchar u -> add (`Uchar u :: acc) `Await
190 | | `Boundary -> add (`B :: acc) `Await
191 | | `Await | `End -> acc
192 | ]}
193 |     For example to segment the sequence <[U+0041], [U+0020], [U+0042]>
194 |     (["a b"]) to a list of characters interleaved with [`B] values on word
195 |     boundaries we can write:
196 | {[
197 | let uchar = `Uchar (Uchar.of_int u)
198 | let seq = [uchar 0x0041; uchar 0x0020; uchar 0x0042]
199 | let seq_words = List.rev (add (List.fold_left add [] seq) `End)
200 | ]}
201 | *)
202 | 
203 | (** {1:examples Examples}
204 | 
205 | [utf_8_segments seg s] is the list of UTF-8 encoded [seg] segments of
206 | the UTF-8 encoded string [s].
207 | 
208 | {[
209 | let utf_8_segments seg s =
210 |   let flush_segment buf acc =
211 |     let segment = Buffer.contents buf in
212 |     Buffer.clear buf; if segment = "" then acc else segment :: acc
213 |   in
214 |   let rec add buf acc segmenter v = match Uuseg.add segmenter v with
215 |   | `Uchar u -> Buffer.add_utf_8_uchar buf u; add buf acc segmenter `Await
216 |   | `Boundary -> add buf (flush_segment buf acc) segmenter `Await
217 |   | `Await | `End -> acc
218 |   in
219 |   let rec loop buf acc s i max segmenter =
220 |     if i > max then flush_segment buf (add buf acc segmenter `End) else
221 |     let dec = String.get_utf_8_uchar s i in
222 |     let acc = add buf acc segmenter (`Uchar (Uchar.utf_decode_uchar dec)) in
223 |     loop buf acc s (i + Uchar.utf_decode_length dec) max segmenter
224 |   in
225 |   let buf = Buffer.create 42 in
226 |   let segmenter = Uuseg.create seg in
227 |   List.rev (loop buf [] s 0 (String.length s - 1) segmenter)
228 | ]}
229 | 
230 | Note that this function can be derived directly from
231 | {!Uuseg_string.fold_utf_8}. *)
232 | 


--------------------------------------------------------------------------------
/src/uuseg_line_break.ml:
--------------------------------------------------------------------------------
  1 | (*---------------------------------------------------------------------------
  2 |    Copyright (c) 2014 The uuseg programmers. All rights reserved.
  3 |    SPDX-License-Identifier: ISC
  4 |   ---------------------------------------------------------------------------*)
  5 | 
  6 | (* These are the rules as found in [1]
  7 |    LB1 (per suggestion)
  8 |                            (AI|SG|XX) → AL
  9 |                       SA when (Mn|Mc) → CM
 10 |                                    SA → AL
 11 |                                    CJ → NS
 12 |    LB2                            sot ×
 13 |    LB3                                ! eot
 14 |    LB4                             BK !
 15 |    LB5                             CR × LF
 16 |                            (CR|LF|NL) !
 17 |    LB6                                × (BK|CR|LF|NL)
 18 |    LB7                                × (SP|ZW)
 19 |    LB8                         ZW SP* ÷
 20 |    LB8a                           ZWJ ×
 21 |    LB9  ¬(BK|CR|LF|NL|SP|ZW as X) (CM|ZWJ) * → X
 22 |    LB10                      (CM|ZWJ) → AL
 23 |    LB11                               × WJ
 24 |                                    WJ ×
 25 |    LB12                            GL ×
 26 |    LB12a               ¬(SP|BA|HY|HH) × GL
 27 |    LB13                               × (CL|CP|EX|SY)
 28 |    LB14                 (OP|OP30) SP* ×
 29 |    LB15                        QU SP* × (OP|OP30)
 30 |    LB15a  (sot|BK|CR|LF|NL|OP|QU|QU_Pi|QU_Pf|GL|SP|ZW) QU_Pi SP* ×
 31 |    LB15b  × QU_Pf (SP|GL|WJ|CL|QU|CP|EX|IS|SY|BK|CR|LF|NL|ZW|eot)
 32 |    LB15c                           SP ÷ IS NU
 33 |    LB15d                              × IS
 34 |    LB16                   (CL|CP) SP* × NS
 35 |    LB17                        B2 SP* × B2
 36 |    LB18                            SP ÷
 37 |    LB19                               × (QU|QU_Pf)
 38 |                            (QU|QU_Pi) ×
 39 |    LB19a                   ¬EastAsian × QU_Pi
 40 |                                       × QU_Pi Eot
 41 |                                       × QU_Pi ¬EastAsian
 42 |                                 QU_Pf × ¬EastAsian
 43 |                             Sot QU_Pf ×
 44 |                      ¬EastAsian QU_Pf ×
 45 |    LB20                               ÷ CB
 46 |                                    CB ÷
 47 |    LB20a  (sot|BK|CR|LF|NL|SP|ZW|CB|GL) (HY|HH) × (AL|AL_30b|AL_circ|HL)
 48 |    LB21                               × (BA|HH|HY|NS)
 49 |                                    BB ×
 50 |    LB21a                   HL (HY|HH) × ¬(HL)
 51 |    LB21b                           SY × HL
 52 |    LB22                               × IN
 53 |    LB23        (AL|AL_30b|AL_circ|HL) × NU
 54 |                                    NU × (AL|AL_30b|AL_circ|HL)
 55 |    LB23a                           PR × (ID|ID_30b|EB|EM)
 56 |                     (ID|ID_30b|EB|EM) × PO
 57 |    LB24                       (PR|PO) × (AL|AL_30b|AL_circ|HL)
 58 |                (AL|AL_30b|AL_circ|HL) × (PR|PO)
 59 |    LB25           NU (SY|IS)* (CL|CP) × (PO|PR)
 60 |                          NU (SY |IS)* × (PO|PR|NU)
 61 |                               (PO|PR) × OP NU
 62 |                               (PO|PR) × OP IS NU
 63 |                         (PR|HY|IS|PO) × NU
 64 |    LB26                            JL × (JL|JV|H2|H3)
 65 |                               (JV|H2) × (JV|JT)
 66 |                               (JT|H3) × JT
 67 |    LB27              (JL|JV|JT|H2|H3) × (IN|PO)
 68 |                                    PR × (JL|JV|JT|H2|H3)
 69 |    LB28        (AL|AL_30b|AL_circ|HL) × (AL|AL_30b|AL_circ|HL)
 70 |    LB28a                           AP × (AK|AL_circ|AS)
 71 |                       (AK|AL_circ|AS) × (VF|VI)
 72 |                    (AK|AL_circ|AS) VI × (AK|AL_circ)
 73 |                       (AK|AL_circ|AS) × (AK|AL_circ|AS) VF
 74 |    LB29                            IS × (AL|HL)
 75 |    LB30     (AL|AL_30b|AL_circ|HL|NU) × OP-EastAsian
 76 |                          CP-EastAsian × (AL|AL_30b|AL_circ|HL|NU)
 77 |    LB30a              sot (RI RI)* RI × RI
 78 |                     [^RI] (RI RI)* RI × RI
 79 |    LB30b                           EB × EM
 80 |                                ID_30b x EM
 81 |    LB31                           ALL ÷
 82 |                                       ÷ ALL
 83 | 
 84 |    [1]: http://www.unicode.org/reports/tr14/#Algorithm
 85 |    [2]: http://www.unicode.org/Public/7.0.0/ucd/PropertyValueAliases.txt
 86 | 
 87 |    Given the structure of the rules we keep a window of three line
 88 |    break property value slots, three on the left, two on the right of a
 89 |    boundary and pattern match these slots to find the rule that
 90 |    applies. Because of LB9 these slots may actually correspond to more
 91 |    than one character and we need to bufferize the data for the slots
 92 |    on the right.
 93 | 
 94 |    Besides we maintain two views of the window slots, one which has
 95 |    the word berak property of concrete characters and another one
 96 |    that has the word break property as seen by the LB9 rule and
 97 |    those that have SP* elements.
 98 | 
 99 | 
100 |                             ---??--->
101 |                 +----+----+----++----+----+
102 |             ... | l2 | l1 | l0 || r0 | r1 |
103 |                 +----+----+----++----+----+
104 |    already returned to client /  \ buffered in segmenter *)
105 | 
106 | type line =
107 |   | AI | AK | AL | AP | AS | B2 | BA | BB | BK | CB | CJ | CL | CM | CP
108 |   | CR | EX | EB | EM | GL | H2 | H3 | HH | HL | HY | ID | IN
109 |   | IS | JL | JT | JV | LF | NL | NS | NU | OP | PO | PR
110 |   | QU | RI | SA | SG | SP | SY | VF | VI | WJ | XX | ZW | ZWJ | Invalid | Sot
111 |   | Eot
112 |   | AL_circ (* Added to handle the U+255C constant in LB28a. We need to split
113 |                AL (the class of U+255C), the full set is AL + AL_circ +
114 |                AL_30b (see below) *)
115 |   | QU_Pf (* Added to handle LB15{a,b} and LB19. We need to split QU
116 |              (the full set is QU + QU_Pf + QU_Pi *)
117 |   | QU_Pi
118 |   (* Added to handle LB30b *)
119 |   | AL_30b
120 |   | ID_30b
121 | 
122 | (* WARNING. The indexes used here need to be synchronized with those
123 |    assigned by uucp for Uucp.Break.Low.line_break. *)
124 | 
125 | let byte_to_line =
126 |   [| AL (* LB1 AI → AL *); AK; AL; AP; AS; B2; BA; BB; BK; CB;
127 |      NS (* LB1 CJ → NS *); CL;
128 |      CM; CP; CR; EX; EB; EM; GL; H2; H3; HH; HL; HY; ID; IN; IS; JL; JT; JV; LF;
129 |      NL; NS; NU; OP; PO; PR; QU; RI; SA; AL (* LB1 SG → AL *); SP; SY; VF; VI;
130 |      WJ; XX; ZW; ZWJ |]
131 | 
132 | let eastasian (`Uchar u) = match Uucp.Break.east_asian_width u with
133 | | `F | `W | `H -> true | _ -> false
134 | 
135 | let line u = match byte_to_line.(Uucp.Break.Low.line u) with
136 | | SA -> (* LB1 for SA *)
137 |     begin match Uucp.Gc.general_category u with
138 |     | `Mn | `Mc -> CM
139 |     | _ -> AL
140 |     end
141 | | ID -> (* Decompose because of LB30b, this assumption is tested in test.ml *)
142 |     if Uucp.Emoji.is_extended_pictographic u &&
143 |        Uucp.Gc.general_category u = `Cn then ID_30b else ID
144 | | AL -> (* Decompose because of LB28a *)
145 |     if Uchar.to_int u = 0x25CC then AL_circ else AL
146 | | XX ->
147 |     (* Some unassigned characters match the second rule of LB30b *)
148 |     if Uucp.Emoji.is_extended_pictographic u &&
149 |        Uucp.Gc.general_category u = `Cn
150 |     then AL_30b
151 |     else AL (* LB1 XX → AL *)
152 | | QU -> (* Decompose because of LB15{a,b} *)
153 |     begin match Uucp.Gc.general_category u with
154 |     | `Pf -> QU_Pf
155 |     | `Pi -> QU_Pi
156 |     | _ -> QU
157 |     end
158 | | l -> l
159 | 
160 | type state =
161 | | Fill (* fill slot on the right of boundary. *)
162 | | Flush (* flush the first lement of slot r0 to get to next boundary. *)
163 | | Decide (* decide boundary of slot r0. *)
164 | 
165 | (* XXX with the time the window has enlarged quite a bit, one day
166 |    rewrite that with circular buffers. *)
167 | 
168 | type t =
169 |   { mutable state : state;                                  (* current state. *)
170 |     mutable l2 : line; mutable l2_rewrite : line; (* l2 according to LB9/LB10 *)
171 |     mutable l1 : line; mutable l1_rewrite : line; (* l1 according to LB9/LB10 *)
172 |     mutable l1_data : [`Uchar of Uchar.t ]; (* data in l1 *)
173 |     mutable l0 : line; mutable l0_rewrite : line; (* l0 according to LB9/LB10 *)
174 |     mutable l0_data : [`Uchar of Uchar.t ]; (* data in l0 *)
175 |     mutable l0_odd_ri : bool; (* odd number of RI on the left of break point. *)
176 |     mutable lb25_nu : bool; (* NU (SI|IS)* on l0 or l1 (for lb25) *)
177 |     mutable r0 : line; (* of element in r0_data *)
178 |     mutable r0_data : [`Uchar of Uchar.t ]; (* data in r0 *)
179 |     mutable r1 : line; (* of element in r1_data *)
180 |     mutable r1_data : [`Uchar of Uchar.t ]; (* data in r1 *)
181 |     mutable r2 : line; (* of element in r2_data *)
182 |     mutable r2_data : [`Uchar of Uchar.t ]; (* data in r2 *)
183 |     mutable mandatory : bool; (* [true] if break is mandatory. *)
184 |     mutable ended : bool; (* [true] if [`End was added]. *) }
185 | 
186 | let nul_buf = `Uchar (Uchar.unsafe_of_int 0x0000)
187 | let create () =
188 |   { state = Fill;
189 |     l2 = Invalid; l2_rewrite = Invalid;
190 |     l1 = Invalid; l1_rewrite = Invalid; l1_data = nul_buf;
191 |     l0 = Sot; l0_rewrite = Sot; l0_data = nul_buf;
192 |     l0_odd_ri = false;
193 |     lb25_nu = false;
194 |     r0 = Invalid;
195 |     r0_data = nul_buf (* overwritten *);
196 |     r1 = Invalid;
197 |     r1_data = nul_buf (* overwritten *);
198 |     r2 = Invalid;
199 |     r2_data = nul_buf (* overwritten *);
200 |     mandatory = false;
201 |     ended = false }
202 | 
203 | let mandatory s = s.mandatory
204 | let copy s = { s with state = s.state }
205 | let equal = ( = )
206 | 
207 | let lb10_rewrite = function CM | ZWJ -> AL | l -> l
208 | let is_lb9_X = function  BK | CR | LF | NL | SP | ZW | Sot -> false | _ -> true
209 | let is_lb12_l0 = function SP | BA | HY | HH -> false | _ -> true
210 | 
211 | let has_break s = (* N.B. sets s.mandatory by side effect. *)
212 |   let mandatory s = s.mandatory <- true; true in
213 |   s.mandatory <- false;
214 |   (* NB. s.l2 and s.r1 are not needed here. *)
215 |   match s.l1, s.l0 (**),(**) s.r0 with
216 |   (* LB1 is handled by [byte_to_line] and [line]. *)
217 |   | (* LB2 *)  _, Sot, _ -> false
218 |   | (* LB3 *)  _, _, Eot -> mandatory s
219 |   | (* LB4 *)  _, BK, _ -> mandatory s
220 |   | (* LB5 *)  _, CR, LF -> false
221 |   |            _, (CR|LF|NL), _ -> mandatory s
222 |   | (* LB6 *)   _, _, (BK|CR|LF|NL) -> false
223 |   | (* LB7 *)   _, _, (SP|ZW) -> false
224 |   | (* LB8 *)  _, ZW, _ -> true
225 |   |            _(* ZW *), _(* SP* *), _ when s.l1_rewrite = ZW &&
226 |                                              s.l0_rewrite = SP -> true
227 |   | (* LB8a *) _, ZWJ, _ -> false
228 |   | (* LB9 implicitely entails  ¬(BK|CR|LF|NL|SP|ZW as X) × (CM|ZWJ) *)
229 |                _, x, (CM|ZWJ) when is_lb9_X s.l0_rewrite -> false
230 |   | _ -> (* apply LB9/LB10 rewrite and match *)
231 |       let l2m = lb10_rewrite s.l2_rewrite in
232 |       let l1m = lb10_rewrite s.l1_rewrite in
233 |       let l0m = lb10_rewrite s.l0_rewrite in
234 |       let r0m = lb10_rewrite s.r0 in
235 |       let r1m = lb10_rewrite s.r1 in
236 |       let r2m = lb10_rewrite s.r2 in
237 |       match l2m, l1m , l0m, (**) r0m, r1m, r2m with
238 |       | (* LB11 *)  _, _, _,
239 |                     WJ, _, _ -> false
240 |       |             _, _, WJ,
241 |                     _, _, _ -> false
242 |       | (* LB12 *)  _, _, GL,
243 |                     _, _, _ -> false
244 |       | (* LB12a *) _, _, l0,
245 |                     GL, _, _ when is_lb12_l0 l0 -> false
246 |       | (* LB13 *)  _, _, _,
247 |                     (CL|CP|EX|SY), _, _ -> false
248 |       | (* LB14 *)  _, _, OP,
249 |                     _, _, _ -> false
250 |       |             _, OP, SP,
251 |                     _, _, _ -> false
252 |       | (* LB15a *) _, (Sot|BK|CR|LF|NL|OP|QU|QU_Pi|QU_Pf|GL|SP|ZW), QU_Pi,
253 |                     _, _, _ -> false
254 |       |            (Sot|BK|CR|LF|NL|OP|QU|QU_Pi|QU_Pf|GL|SP|ZW), QU_Pi, SP,
255 |                    _, _, _ -> false
256 |       | (* LB15b *) _, _, _,
257 |                     QU_Pf, (SP|GL|WJ|CL|QU|QU_Pi|QU_Pf|CP|EX|IS|SY|BK|CR|
258 |                             LF|NL|ZW|Eot), _ -> false
259 |       | (* LB15c *) _, _, SP,
260 |                     IS, NU, _ -> true
261 |       | (* LB15d *) _, _, _,
262 |                     IS, _, _ -> false
263 |       | (* LB16 *)  _, _, (CL|CP),
264 |                     NS, _, _ -> false
265 |       |             _, (CL|CP),
266 |                     SP, NS, _, _ -> false
267 |       | (* LB17 *)  _, _, B2,
268 |                     B2, _, _ -> false
269 |       |             _, B2, SP,
270 |                     B2, _, _ -> false
271 |       | (* LB18 *)  _, _, SP,
272 |                     _, _, _ -> true
273 |       | (* LB19 *)  _, _, _,
274 |                     (QU|QU_Pf), _, _ -> false
275 |       |             _, _, (QU|QU_Pi),
276 |                     _, _, _ -> false
277 |       | (* LB19a *) _, _, _,
278 |                     QU_Pi, _, _ when not (eastasian s.l0_data) -> false
279 |       |             _, _, _,
280 |                     QU_Pi, Eot, _ -> false
281 |       |             _, _, _,
282 |                     QU_Pi, _, _ when not (eastasian s.r1_data) -> false
283 |       |             _, _, QU_Pf,
284 |                     _, _, _ when not (eastasian s.r0_data) -> false
285 |       |             _, Sot, QU_Pf,
286 |                     _, _, _ -> false
287 |       |             _, _, QU_Pf,
288 |                     _, _, _ when not (eastasian s.l1_data) -> false
289 |       | (* LB20 *)  _, _, _,
290 |                     CB, _, _ -> true
291 |       |             _, _, CB,
292 |                     _, _, _ -> true
293 |       | (* LB20a *) _, (Sot|BK|CR|LF|NL|SP|ZW|CB|GL), (HY|HH),
294 |                     (AL|AL_30b|AL_circ|HL), _, _  -> false
295 |       | (* LB21 *)  _, _, _,
296 |                     (BA|HH|HY|NS), _, _ -> false
297 |       |             _, _, BB,
298 |                     _, _, _ -> false
299 |       | (* LB21a *) _, HL, (HY|HH),
300 |                     r0, _, _ when r0 <> HL -> false
301 |       | (* LB21b *) _, _, SY,
302 |                     HL, _, _ -> false
303 |       | (* LB22 *)  _, _, _,
304 |                     IN, _, _ -> false
305 |       | (* LB23 *)  _, _, (AL|AL_30b|AL_circ|HL),
306 |                     NU, _, _ -> false
307 |       |             _, _, NU,
308 |                     (AL|AL_30b|AL_circ|HL), _, _ -> false
309 |       | (* LB23a *) _, _, PR,
310 |                     (ID|ID_30b|EB|EM), _, _ -> false
311 |       |             _, _,
312 |                     (ID|ID_30b|EB|EM), PO, _, _ -> false
313 |       | (* LB24 *)  _, _, (PR|PO),
314 |                     (AL|AL_30b|AL_circ|HL), _, _ -> false
315 |       |             _, _, (AL|AL_30b|AL_circ|HL),
316 |                     (PR|PO), _, _ -> false
317 |       | (* LB25 *)   _, _, (CL|CP),
318 |                      (PO|PR), _, _ when s.lb25_nu -> false
319 |       |              _, _, (NU|SY|IS),
320 |                      (PO|PR|NU), _, _  when s.lb25_nu -> false
321 |       |              _, _, (PO|PR),
322 |                     OP, IS, NU -> false
323 |       |             _, _, (PO|PR),
324 |                     OP, NU, _ -> false
325 |       |             _, _, (PR|HY|IS|PO),
326 |                     NU, _, _ -> false
327 |       | (* LB26 *)  _, _, JL,
328 |                     (JL|JV|H2|H3), _, _ -> false
329 |       |             _, _, (JV|H2),
330 |                     (JV|JT), _, _ -> false
331 |       |             _, _, (JT|H3),
332 |                     JT, _, _ -> false
333 |       | (* LB27 *)  _, _, (JL|JV|JT|H2|H3),
334 |                     PO, _, _ -> false
335 |       |             _, _, PR,
336 |                     (JL|JV|JT|H2|H3), _, _ -> false
337 |       | (* LB28 *)  _, _, (AL|AL_30b|AL_circ|HL),
338 |                     (AL|AL_30b|AL_circ|HL), _, _ -> false
339 |       | (* LB28a *) _, _, AP,
340 |                     (AK|AL_circ|AS), _, _ -> false
341 |       |             _, _, (AK|AL_circ|AS),
342 |                     (VF|VI), _, _ -> false
343 |       |             _, (AK|AL_circ|AS), VI,
344 |                     (AK|AL_circ), _, _ -> false
345 |       |             _, _, (AK|AL_circ|AS),
346 |                     (AK|AL_circ|AS), VF, _ -> false
347 |       | (* LB29 *)  _, _, IS,
348 |                     (AL|AL_30b|AL_circ|HL), _, _ -> false
349 |       | (* LB30 *)  _, _, (AL|AL_30b|AL_circ|HL|NU),
350 |                     OP, _, _ when not (eastasian s.r0_data) -> false
351 |       |             _, _, CP,
352 |                     (AL|AL_30b|AL_circ|HL|NU), _, _
353 |         when not (eastasian s.l0_data) -> false
354 |       | (* LB30a *) _, _, RI,
355 |                     RI, _, _ when s.l0_odd_ri -> false
356 |       | (* LB30b *) _, _, EB,
357 |                     EM, _, _ -> false
358 |       |             _, _, (AL_30b|ID_30b),
359 |                     EM, _, _ -> false
360 |       | (* LB31 *)  _, _, _,
361 |                     _, _, _ -> true
362 | 
363 | let next s = (* moves to the next boundary *)
364 |   s.l2 <- s.l1;
365 |   s.l1 <- s.l0;
366 |   s.l1_data <- s.l0_data;
367 |   s.l0 <- s.r0;
368 |   s.l0_data <- s.r0_data;
369 |   begin
370 |     (* Check the regexp for LB25 *)
371 |     if not s.lb25_nu then (if s.l0 = NU then s.lb25_nu <- true else ()) else
372 |     (* We are matching check if we need to keep the matching *)
373 |     if s.l0 = NU || s.l0 = SY || s.l0 = IS then () (* keep matching *) else
374 |     if s.l1 = NU || s.l1 = SY || s.l1 = IS then () (* Keep matching *) else
375 |     s.lb25_nu <- false
376 |   end;
377 |   (* Only move rewrite window if l0_rewrite doesn't absorb the char
378 |      by rule LB9 or the various SP* *)
379 |   begin match s.r0 with
380 |   | CM | ZWJ when is_lb9_X s.l0_rewrite -> ()
381 |   | SP when s.l0_rewrite = SP -> ()
382 |   | _ ->
383 |       s.l2_rewrite <- s.l1_rewrite;
384 |       s.l1_rewrite <- s.l0_rewrite;
385 |       s.l0_rewrite <- s.r0;
386 |       s.l0_odd_ri <-
387 |         (match s.l0_rewrite with RI -> not s.l0_odd_ri | _ -> false);
388 |   end;
389 |   let data = s.r0_data in
390 |   s.r0 <- s.r1;
391 |   s.r0_data <- s.r1_data;
392 |   s.r1 <- s.r2;
393 |   s.r1_data <- s.r2_data;
394 |   s.r2 <- Invalid;
395 |   s.r2_data <- nul_buf;
396 |   (data :> Uuseg_base.ret)
397 | 
398 | let need_fill s = s.r2 = Invalid || s.r1 = Invalid || s.r0 = Invalid
399 | 
400 | let flush s = match s.ended with
401 | | false ->
402 |     let ret = next s in
403 |     (if need_fill s then s.state <- Fill else s.state <- Decide);
404 |     ret
405 | | true ->
406 |     match s.r0 with
407 |     | Eot -> `End
408 |     | _ -> s.state <- Decide; next s
409 | 
410 | let decide s = if has_break s then (s.state <- Flush; `Boundary) else flush s
411 | 
412 | let add s = function
413 | | `Uchar u as add ->
414 |     if s.ended then Uuseg_base.err_ended add else
415 |     begin match s.state with
416 |     | Fill when s.r0 = Invalid -> s.r0_data <- add; s.r0 <- line u; `Await
417 |     | Fill when s.r1 = Invalid -> s.r1_data <- add; s.r1 <- line u; `Await
418 |     | Fill -> s.r2_data <- add; s.r2 <- line u; decide s
419 |     | Flush | Decide -> Uuseg_base.err_exp_await add
420 |     end
421 | | `Await ->
422 |     begin match s.state with
423 |     | Flush -> flush s
424 |     | Decide -> decide s
425 |     | Fill -> `Await
426 |     end
427 | | `End ->
428 |     if s.ended then Uuseg_base.err_ended `End else
429 |     begin match s.state with
430 |     | Fill ->
431 |         s.ended <- true;
432 |         (if s.r0 = Invalid then s.r0 <- Eot else
433 |          if s.r1 = Invalid then s.r1 <- Eot else
434 |          s.r2 <- Eot);
435 |         if s.l0 = Sot && s.r0 = Eot then (* empty string *) `End else
436 |         decide s
437 |     | Flush | Decide -> Uuseg_base.err_exp_await `End
438 |     end
439 | 


--------------------------------------------------------------------------------