├── .gitignore
├── .merlin
├── .ocp-indent
├── CHANGES.md
├── LICENSE.md
├── README.md
├── _tags
├── doc
    ├── api.odocl
    └── dev.odocl
├── opam
├── pkg
    ├── META
    └── pkg.ml
├── src
    ├── sexpm.ml
    ├── sexpm.mli
    └── sexpm.mllib
└── test
    ├── sexptrip.ml
    ├── test.ml
    └── test_codec.ml


/.gitignore:
--------------------------------------------------------------------------------
1 | _build
2 | tmp
3 | *~
4 | \.\#*
5 | \#*#
6 | *.install
7 | *.native
8 | *.byte


--------------------------------------------------------------------------------
/.merlin:
--------------------------------------------------------------------------------
1 | PKG bytes uchar uutf cmdliner bos bos.setup
2 | S src
3 | S test
4 | B _build/**
5 | 


--------------------------------------------------------------------------------
/.ocp-indent:
--------------------------------------------------------------------------------
1 | strict_with=always,match_clause=4,strict_else=never
2 | 


--------------------------------------------------------------------------------
/CHANGES.md:
--------------------------------------------------------------------------------
1 | vX.Y.Z YYYY-MM-DD Location
2 | --------------------------
3 | 
4 | First release. 
5 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2016 Daniel C. Bünzli
 2 | 
 3 | Permission to use, copy, modify, and/or distribute this software for any
 4 | purpose with or without fee is hereby granted, provided that the above
 5 | copyright notice and this permission notice appear in all copies.
 6 | 
 7 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 8 | WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 9 | MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
10 | ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
11 | WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
12 | ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
13 | OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
14 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | sexpm — s-expression codecs and updates for OCaml
 2 | -------------------------------------------------------------------------------
 3 | %%VERSION%%
 4 | 
 5 | sexpm codecs sequences of s-expressions to different representations.
 6 | 
 7 | sexpm is distributed under the ISC license.
 8 | 
 9 | Homepage: http://erratique.ch/software/sexpm  
10 | 
11 | ## Installation
12 | 
13 | sexpm can be installed with `opam`:
14 | 
15 |     opam install sexpm
16 | 
17 | If you don't use `opam` consult the [`opam`](opam) file for build
18 | instructions.
19 | 
20 | ## Documentation
21 | 
22 | The documentation and API reference is automatically generated from
23 | the source interfaces. It can be consulted [online][doc] or via
24 | `odig doc sexpm`.
25 | 
26 | [doc]: http://erratique.ch/software/sexpm/doc
27 | 
28 | ## Sample programs
29 | 
30 | If you installed sexpm with `opam` sample programs are located in
31 | the directory `opam config var sexpm:doc`.
32 | 
33 | In the distribution sample programs and tests are located in the
34 | [`test`](test) directory of the distribution. They can be built and run
35 | with:
36 | 
37 |     topkg build --tests true && topkg test 
38 | 


--------------------------------------------------------------------------------
/_tags:
--------------------------------------------------------------------------------
1 | true : bin_annot, safe_string, package(uutf), package(fmt)
2 | <src> : include
3 | <test> : include
4 | <test/sexptrip*> : package(cmdliner), package(bos.setup)
5 | <test/test_codec*> : package(cmdliner)


--------------------------------------------------------------------------------
/doc/api.odocl:
--------------------------------------------------------------------------------
1 | Sexpm


--------------------------------------------------------------------------------
/doc/dev.odocl:
--------------------------------------------------------------------------------
1 | Sexpm


--------------------------------------------------------------------------------
/opam:
--------------------------------------------------------------------------------
 1 | opam-version: "1.2"
 2 | maintainer: "Daniel Bünzli <daniel.buenzl i@erratique.ch>"
 3 | authors: ["Daniel Bünzli <daniel.buenzl i@erratique.ch>"]
 4 | homepage: "http://erratique.ch/software/sexpm"
 5 | doc: "http://erratique.ch/software/sexpm/doc"
 6 | license: "ISC"
 7 | dev-repo: "http://erratique.ch/repos/sexpm.git"
 8 | bug-reports: "https://github.com/dbuenzli/sexpm/issues"
 9 | tags: [ "org:erratique" "codec" "sexp" ]
10 | available: [ ocaml-version >= "4.03.0"]
11 | depends: [
12 |   "ocamlfind" {build}
13 |   "ocamlbuild" {build}
14 |   "topkg" {build}
15 |   "uutf"
16 | ]
17 | depopts: [
18 |    "bos"
19 | g   "cmdliner"
20 | ]
21 | build: [
22 |   "ocaml" "pkg/pkg.ml" "build"
23 |           "--pinned" pinned ]
24 | 


--------------------------------------------------------------------------------
/pkg/META:
--------------------------------------------------------------------------------
1 | description = "s-expression codecs and updates for OCaml"
2 | version = "%%VERSION_NUM%%"
3 | requires = ""
4 | archive(byte) = "sexpm.cma"
5 | archive(native) = "sexpm.cmxa"
6 | plugin(byte) = "sexpm.cma"
7 | plugin(native) = "sexpm.cmxs"
8 | 


--------------------------------------------------------------------------------
/pkg/pkg.ml:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env ocaml
 2 | #use "topfind"
 3 | #require "topkg"
 4 | open Topkg
 5 | 
 6 | let bos = Conf.with_pkg "bos"
 7 | let cmdliner = Conf.with_pkg "cmdliner"
 8 | 
 9 | let () =
10 |   Pkg.describe "sexpm" @@ fun c ->
11 |   let cmdliner = Conf.value c cmdliner in
12 |   let bos = Conf.value c bos in
13 |   Ok [ Pkg.mllib "src/sexpm.mllib";
14 |        Pkg.bin ~cond:(cmdliner && bos) "test/sexptrip";
15 |        Pkg.test "test/test";
16 |        Pkg.test "test/test_codec"; ]
17 | 


--------------------------------------------------------------------------------
/src/sexpm.ml:
--------------------------------------------------------------------------------
   1 | (*---------------------------------------------------------------------------
   2 |    Copyright (c) 2016 Daniel C. Bünzli. All rights reserved.
   3 |    Distributed under the ISC license, see terms at the end of the file.
   4 |    %%NAME%% %%VERSION%%
   5 |   ---------------------------------------------------------------------------*)
   6 | 
   7 | let pf = Format.fprintf
   8 | 
   9 | (* Unsafe string and bytes manipulations. If you don't believe the authors's
  10 |    invariants, replacing with safe versions makes everything safe in the
  11 |    module. He won't be upset. *)
  12 | 
  13 | let unsafe_string_get = String.unsafe_get
  14 | let unsafe_array_get = Array.unsafe_get
  15 | let unsafe_bytes_set = Bytes.unsafe_set
  16 | let unsafe_bytes_set_byte s j byte = Bytes.unsafe_set s j (Char.unsafe_chr byte)
  17 | let unsafe_bytes_blit s soff d doff =
  18 |   Bytes.unsafe_blit (Bytes.unsafe_of_string s) soff d doff
  19 | 
  20 | (* Invalid argument strings *)
  21 | 
  22 | let err_empty_buf = "buffer size can't be 0"
  23 | let err_enc_unclosed = "unclosed list encoded"
  24 | let err_enc_unopened = "closing unopened list"
  25 | 
  26 | (* IO *)
  27 | 
  28 | let io_buffer_size = 65536                          (* IO_BUFFER_SIZE 4.0.0 *)
  29 | let bytes_buf = function
  30 | | None -> Bytes.create io_buffer_size
  31 | | Some bytes ->
  32 |     if Bytes.length bytes <> 0 then bytes else
  33 |     invalid_arg err_empty_buf
  34 | 
  35 | type input = unit -> (Bytes.t * int * int) option
  36 | 
  37 | let input_of_string s =
  38 |   let buf = ref (Some (Bytes.unsafe_of_string s, 0, String.length s)) in
  39 |   fun () -> let r = !buf in buf := None; r
  40 | 
  41 | let input_of_in_channel ?bytes ic =
  42 |   let bs = bytes_buf bytes in
  43 |   fun () ->
  44 |     let len = input ic bs 0 (Bytes.length bs) in
  45 |     if len = 0 then None else Some (bs, 0, len)
  46 | 
  47 | type output = (Bytes.t * int * int) option -> unit
  48 | 
  49 | let output_of_buffer buf = function
  50 | | None -> ()
  51 | | Some (bs, pos, len) -> Buffer.add_subbytes buf bs pos len
  52 | 
  53 | let output_of_out_channel oc = function
  54 | | None -> ()
  55 | | Some (bs, pos, len) -> output oc bs pos len
  56 | 
  57 | (* Characters and their classes *)
  58 | 
  59 | let ux_eoi = -1 (* end of input, oustide unicode range *)
  60 | let ux_soi = -2 (* start of input, outside unicode range. *)
  61 | 
  62 | let u_rep  = Uchar.to_int Uutf.u_rep
  63 | let u_tab    = 0x0009 (* '\t' *)
  64 | let u_lf     = 0x000A (* '\n' *)
  65 | let u_cr     = 0x000D (* '\r' *)
  66 | let u_sp     = 0x0020 (* ' ' *)
  67 | let u_quot   = 0x0022 (* '"' *)
  68 | let u_lpar   = 0x0028 (* '(' *)
  69 | let u_rpar   = 0x0029 (* ')' *)
  70 | let u_semi   = 0x003B (* ';' *)
  71 | let u_bslash = 0x005C (* '\\' *)
  72 | 
  73 | let uchar_is_illegal = function
  74 | | 0x0020 | 0x000A | 0x000D | 0x0009 -> false
  75 | | u when u <= 0x001F -> true
  76 | | _ -> false
  77 | 
  78 | (* The following predicates assume that illegal were filtered out. *)
  79 | 
  80 | let uchar_is_white = function
  81 | | 0x0020 | 0x000A | 0x000D | 0x0009 -> true
  82 | | _ -> false
  83 | 
  84 | let uchar_is_hex u =
  85 |    (0x0030 <= u && u <= 0x0039)
  86 | || (0x0041 <= u && u <= 0x0046)
  87 | || (0x0061 <= u && u <= 0x0066)
  88 | 
  89 | let uchar_is_token u =
  90 |    (0x0023 <= u && u <= 0x0027)
  91 | || (0x002A <= u && u <= 0x003A)
  92 | || (0x003C <= u && u <= 0x005B)
  93 | || (0x005D <= u)
  94 | || (0x0021 = u)
  95 | 
  96 | let uchar_is_qtoken u =
  97 |    (0x0023 <= u && u <= 0x005B)
  98 | || (0x005D <= u)
  99 | || uchar_is_white u
 100 | || (0x0021 = u)
 101 | 
 102 | let uchar_is_comment = function
 103 | | 0x000A | 0x000D -> false
 104 | | u when u = ux_eoi -> false
 105 | | u -> true
 106 | 
 107 | let pp_dump_uchar ppf u = pf ppf "%04X" (Uchar.to_int u)
 108 | let pp_uchar ppf u =
 109 |   if Uchar.to_int u <= 0x1F (* most control chars *) then pp_dump_uchar ppf u else
 110 |   let b = Buffer.create 4 in
 111 |   Uutf.Buffer.add_utf_8 b u;
 112 |   pf ppf "'%s' (%a)" (Buffer.contents b) pp_dump_uchar u
 113 | 
 114 | (* Lexemes *)
 115 | 
 116 | type lexeme =
 117 | | White of string
 118 | | Comment of string
 119 | | Ls | Le
 120 | | Atom of string * string option
 121 | 
 122 | let pp_raw ppf = function
 123 | | None -> pf ppf "None"
 124 | | Some s -> pf ppf "@[Some %S@]" s
 125 | 
 126 | let pp_lexeme ppf = function
 127 | | White s -> pf ppf "@[White %S@]" s
 128 | | Comment s -> pf ppf "@[Comment %S@]" s
 129 | | Ls -> pf ppf "Ls"
 130 | | Le -> pf ppf "Le"
 131 | | Atom (a, raw) -> pf ppf "@[Atom @[<1>(%S,@ %a)@]@]" a pp_raw raw
 132 | 
 133 | (* Positions *)
 134 | 
 135 | type pos = int * int
 136 | type range = pos * pos
 137 | let pp_range ppf ((l0, c0), (l1, c1)) = pf ppf "%d.%d-%d.%d" l0 c0 l1 c1
 138 | 
 139 | (* Decode errors *)
 140 | 
 141 | type error =
 142 | [ `Illegal_bytes of string
 143 | | `Illegal_uchar of Uchar.t
 144 | | `Illegal_escape of
 145 |     [ `Exp_lbrace of Uchar.t
 146 |     | `Exp_rbrace of Uchar.t
 147 |     | `Exp_hex of Uchar.t
 148 |     | `Exp_hex_rbrace of Uchar.t
 149 |     | `Not_esc of Uchar.t
 150 |     | `Not_uchar of int ]
 151 | | `Unclosed of [ `Quoted_token | `List | `Escape ]
 152 | | `Unexpected_le
 153 | | `Codec of string (* FIXME *) ]
 154 | 
 155 | let pp_illegal_escape ppf = function
 156 | | `Exp_lbrace u -> pf ppf "expected '{' found %a" pp_uchar u
 157 | | `Exp_rbrace u -> pf ppf "expected '}' found %a" pp_uchar u
 158 | | `Exp_hex u -> pf ppf "expected hex digit found %a" pp_uchar u
 159 | | `Exp_hex_rbrace u -> pf ppf "expected hex digit or '}' found %a" pp_uchar u
 160 | | `Not_esc u -> pf ppf "%a is not a valid escape letter" pp_uchar u
 161 | | `Not_uchar i -> pf ppf "escape 0x%X is not a Unicode scalar value" i
 162 | 
 163 | let pp_error ppf = function
 164 | | `Illegal_bytes b -> pf ppf "illegal byte sequence %S in input" b
 165 | | `Illegal_uchar u -> pf ppf "illegal Unicode character %a in input" pp_uchar u
 166 | | `Illegal_escape esc -> pp_illegal_escape ppf esc
 167 | | `Unclosed `Quoted_token -> pf ppf "unclosed quoted token"
 168 | | `Unclosed `List -> pf ppf "unclosed list"
 169 | | `Unclosed `Escape -> pf ppf "unfinished escape"
 170 | | `Unexpected_le -> pf ppf "unexpected list end ')'"
 171 | | `Codec s -> pf ppf "codec error: %s" s
 172 | 
 173 | exception Err of error * range (* only used internally *)
 174 | 
 175 | (* Decoder *)
 176 | 
 177 | type decoder_state =
 178 | | Decode_next
 179 | | Decoding_qtoken
 180 | | Decoding_token
 181 | | Decoding_comment
 182 | | Decoding_white
 183 | | Close_qtoken
 184 | | Close_list
 185 | 
 186 | type decoder =
 187 |   { input : input; (* input byte stream. *)
 188 |     layout : bool; (* [true] to keep whitespace and raw tokens. *)
 189 |     u : Uutf.decoder; (* unicode character decoder. *)
 190 |     atom : Buffer.t; (* atom accumulation buffer. *)
 191 |     raw : Buffer.t; (* raw accumulation buffer. *)
 192 |     mutable c : int; (* character lookahead. *)
 193 |     mutable ss_line : int; (* last sexp start line. *)
 194 |     mutable ss_col : int; (* last sexp start column. *)
 195 |     mutable s_line : int; (* last saved start line. *)
 196 |     mutable s_col : int; (* last saved start column. *)
 197 |     mutable e_line : int; (* last saved end line. *)
 198 |     mutable e_col : int; (* last saved end column. *)
 199 |     mutable stack : pos list; (* stack of open lists and their start pos. *)
 200 |     mutable state : decoder_state; (* decoder state. *)
 201 |     mutable peek : (lexeme option, error * range) result option
 202 |  }
 203 | 
 204 | let decoder ?(layout = true) input =
 205 |   let u = Uutf.decoder ~encoding:`UTF_8 `Manual in
 206 |   let buf_size = 1024 in
 207 |   let atom = Buffer.create buf_size in
 208 |   let raw = Buffer.create buf_size in
 209 |   { input; layout; u; atom; raw; c = ux_soi; (* overwritten *)
 210 |     ss_line = 1; ss_col = 0; s_line = 1; s_col = 0; e_line = 1; e_col = 0;
 211 |     stack = []; state = Decode_next; peek = None }
 212 | 
 213 | let decoder_layout d = d.layout
 214 | let decoder_sspos d = d.ss_line, d.ss_col
 215 | let decoder_spos d = d.s_line, d.s_col
 216 | let decoder_epos d = d.e_line, d.e_col
 217 | let decoded_range d = (decoder_spos d, decoder_epos d)
 218 | let decoded_sexp_range d = (decoder_sspos d, decoder_epos d)
 219 | 
 220 | let sspos d = Uutf.(d.ss_line <- decoder_line d.u; d.ss_col <- decoder_col d.u)
 221 | let spos d = Uutf.(d.s_line <- decoder_line d.u; d.s_col <- decoder_col d.u)
 222 | let epos d = Uutf.(d.e_line <- decoder_line d.u; d.e_col <- decoder_col d.u)
 223 | 
 224 | let buffer_flush b = let s = Buffer.contents b in (Buffer.clear b; s)
 225 | let buffer_add b c = Uutf.Buffer.add_utf_8 b (Uchar.unsafe_of_int c)
 226 | 
 227 | let raw_clear d = Buffer.clear d.raw
 228 | let raw_add d = if d.c <> ux_soi && d.layout then buffer_add d.raw d.c else ()
 229 | let raw d = buffer_flush d.raw
 230 | let raw_is_empty d = Buffer.length d.raw = 0
 231 | 
 232 | let atom_add d c = buffer_add d.atom c
 233 | let atom d =
 234 |   let raw = if d.layout then (Some (buffer_flush d.raw)) else None in
 235 |   Atom (buffer_flush d.atom, raw)
 236 | 
 237 | let err d e = epos d; raise_notrace (Err (e, decoded_range d))
 238 | 
 239 | let refill d = match d.input () with
 240 | | Some (b, s, l) -> Uutf.Manual.src d.u b s l
 241 | | None -> Uutf.Manual.src d.u Bytes.empty 0 0
 242 | 
 243 | let rec readc d = match Uutf.decode d.u with
 244 | | `Uchar u ->
 245 |     raw_add d;
 246 |     let i = Uchar.to_int u in
 247 |     begin match uchar_is_illegal i with
 248 |     | false -> d.c <- i
 249 |     | true -> d.c <- u_rep; err d (`Illegal_uchar u)
 250 |     end
 251 | | `End -> if d.c <> ux_eoi then (raw_add d; d.c <- ux_eoi) else ()
 252 | | `Await -> refill d; readc d
 253 | | `Malformed b -> d.c <- u_rep; err d (`Illegal_bytes b)
 254 | 
 255 | let err_illegal_esc d e = atom_add d u_rep; readc d; err d (`Illegal_escape e)
 256 | 
 257 | let d_uchar_esc d =
 258 |   let rec loop d acc count = match (readc d; d.c) with
 259 |   | u when uchar_is_hex u ->
 260 |       let count = count + 1 in
 261 |       if count > 6 then err_illegal_esc d (`Exp_rbrace (Uchar.of_int d.c)) else
 262 |       let acc = acc * 16 + (if d.c <= 0x39 (* 9 *) then d.c - 0x30 else
 263 |                             if d.c <= 0x46 (* F *) then d.c - 0x37 else
 264 |                             d.c - 0x57)
 265 |       in
 266 |       loop d acc count
 267 |   | 0x007D (* '}' *) ->
 268 |       if count = 0 then err_illegal_esc d (`Exp_hex (Uchar.of_int d.c)) else
 269 |       if not (Uchar.is_valid acc) then err_illegal_esc d (`Not_uchar acc) else
 270 |       (readc d; atom_add d acc)
 271 |   | u when u = ux_eoi -> err d (`Unclosed `Escape)
 272 |   | u ->
 273 |       if count = 0 then err_illegal_esc d (`Exp_hex (Uchar.of_int d.c)) else
 274 |       err_illegal_esc d (`Exp_hex_rbrace (Uchar.of_int d.c))
 275 |   in
 276 |   match d.c with
 277 |   | 0x007B (* '{' *) -> loop d 0 0
 278 |   | u when u = ux_eoi -> err d (`Unclosed `Escape)
 279 |   | u -> err_illegal_esc d (`Exp_lbrace (Uchar.of_int d.c))
 280 | 
 281 | let rec d_cont ~lf d = match (readc d; d.c) with
 282 | | 0x000A when lf -> d_cont ~lf:false d
 283 | | (0x0020 | 0x0009) -> d_cont ~lf:false d
 284 | | 0x005C (* '\\' *) -> d_escape ~or_cont:true d
 285 | | u -> ()
 286 | 
 287 | and d_escape ~or_cont d = match (readc d; d.c) with
 288 | | 0x0020 (* ' ' *)  -> atom_add d u_sp; readc d
 289 | | 0x0022 (* '"' *)  -> atom_add d u_quot; readc d
 290 | | 0x0028 (* '(' *)  -> atom_add d u_lpar; readc d
 291 | | 0x0029 (* ')' *)  -> atom_add d u_rpar; readc d
 292 | | 0x003B (* ';' *)  -> atom_add d u_semi; readc d
 293 | | 0x005C (* '\\' *) -> atom_add d u_bslash; readc d
 294 | | 0x006E (* 'n' *)  -> atom_add d u_lf; readc d
 295 | | 0x0072 (* 'r' *)  -> atom_add d u_cr; readc d
 296 | | 0x0074 (* 't' *)  -> atom_add d u_tab; readc d
 297 | | 0x0075 (* 'u' *)  -> readc d; d_uchar_esc d
 298 | | (0x000A | 0x000D) ->
 299 |     if or_cont then (d_cont ~lf:(d.c = u_cr) d) else
 300 |     err_illegal_esc d (`Not_esc (Uchar.of_int d.c))
 301 | | u when u = ux_eoi -> if not or_cont then err d (`Unclosed `Escape)
 302 | | u -> err_illegal_esc d (`Not_esc (Uchar.of_int u))
 303 | 
 304 | let d_token d =
 305 |   let rec loop ~or_cont d = match d.c with
 306 |   | 0x005C (* \\ *) -> d_escape ~or_cont d; loop ~or_cont:true d
 307 |   | u when uchar_is_token u ->
 308 |       atom_add d u; epos d; readc d; loop ~or_cont:true d
 309 |   | u ->
 310 |       let a = atom d in
 311 |       d.state <- Decode_next; raw_clear d; Some a
 312 |   in
 313 |   if (d.state <> Decoding_token) then (sspos d; spos d);
 314 |   d.state <- Decoding_token;
 315 |   loop ~or_cont:false d
 316 | 
 317 | let d_qtoken d =
 318 |   let rec loop d u = match u with
 319 |   | 0x005C (* '\\' *) -> d_escape ~or_cont:true d; loop d d.c
 320 |   | 0x0022 (* '\"' *) ->
 321 |       epos d; readc d;
 322 |       let a = atom d in
 323 |       d.state <- Decode_next; raw_clear d; Some a
 324 |   | u when u = ux_eoi ->
 325 |       d.state <- Close_qtoken; err d (`Unclosed `Quoted_token)
 326 |   | u -> atom_add d u; readc d; loop d d.c
 327 |   in
 328 |   if d.state = Close_qtoken then (d.state <- Decoding_qtoken; loop d u_quot)
 329 |   else begin
 330 |     if d.state <> Decoding_qtoken then (sspos d; spos d; readc d);
 331 |     d.state <- Decoding_qtoken; loop d d.c
 332 |   end
 333 | 
 334 | let d_ls d =
 335 |   d.state <- Decode_next;
 336 |   spos d; epos d; readc d; raw_clear d;
 337 |   d.stack <- (decoder_spos d) :: d.stack;
 338 |   Some Ls
 339 | 
 340 | let d_le d =
 341 |   d.state <- Decode_next;
 342 |   spos d; epos d; readc d; raw_clear d;
 343 |   match d.stack with
 344 |   | (l, c) :: stack -> d.ss_line <- l; d.ss_col <- c; d.stack <- stack; Some Le
 345 |   | [] -> err d (`Unexpected_le)
 346 | 
 347 | let d_eoi d = match d.stack with
 348 | | [] -> None
 349 | | (l, c) :: stack ->
 350 |     d.s_line <- l; d.s_col <- c; d.state <- Close_list;
 351 |     err d (`Unclosed `List)
 352 | 
 353 | let rec d_comment d =
 354 |   let rec loop d = match d.c with
 355 |   | u when uchar_is_comment u -> epos d; readc d; loop d
 356 |   | u ->
 357 |       d.state <- Decode_next;
 358 |       if not d.layout then d_next d else Some (Comment (raw d))
 359 |   in
 360 |   if d.state <> Decoding_comment then (spos d; readc d; raw_clear d);
 361 |   d.state <- Decoding_comment;
 362 |   loop d
 363 | 
 364 | and d_white d =
 365 |   let rec loop d = match (readc d; d.c) with
 366 |   | u when uchar_is_white u -> epos d; loop d
 367 |   | u ->
 368 |       d.state <- Decode_next;
 369 |       if raw_is_empty d then d_next d else Some (White (raw d))
 370 |   in
 371 |   if d.state <> Decoding_white then (spos d; epos d);
 372 |   d.state <- Decoding_white;
 373 |   loop d
 374 | 
 375 | and d_next d = match d.c with
 376 | | 0x0028 (* '(' *) -> d_ls d
 377 | | 0x0029 (* ')' *) -> d_le d
 378 | | 0x0022 (* '"' *) -> d_qtoken d
 379 | | 0x003B (* ';' *) -> d_comment d
 380 | | u when uchar_is_white u -> d_white d
 381 | | u when u = ux_eoi -> d_eoi d
 382 | | u when u = ux_soi -> readc d; d_next d
 383 | | u -> d_token d
 384 | 
 385 | and d_lexeme d = match d.state with
 386 | | Decode_next -> d_next d
 387 | (* The following cases occur after an error was decoded. *)
 388 | | Decoding_token -> d_token d
 389 | | Decoding_qtoken -> d_qtoken d
 390 | | Decoding_comment -> d_comment d
 391 | | Decoding_white -> d_white d
 392 | | Close_qtoken -> d_qtoken d
 393 | | Close_list -> d_le d
 394 | 
 395 | let decode_lexeme d = match d.peek with
 396 | | Some dec -> d.peek <- None; dec
 397 | | None ->
 398 |     try Ok (d_lexeme d) with
 399 |     | Err (e, r) -> Error (e, r)
 400 | 
 401 | let peek_lexeme d = match d.peek with
 402 | | Some dec -> dec
 403 | | None ->
 404 |     let dec = decode_lexeme d in
 405 |     d.peek <- Some dec;
 406 |     dec
 407 | 
 408 | let dec_lexeme d = match d.peek with
 409 | | None -> d_lexeme d
 410 | | Some dec ->
 411 |     d.peek <- None;
 412 |     match dec with
 413 |     | Ok l -> l
 414 |     | Error (e, r) -> raise_notrace (Err (e, r))
 415 | 
 416 | let pek_lexeme d = match d.peek with
 417 | | None ->
 418 |     let dec = d_lexeme d in
 419 |     d.peek <- Some (Ok dec);
 420 |     dec
 421 | | Some dec ->
 422 |     match dec with
 423 |     | Ok l -> l
 424 |     | Error (e, r) -> raise_notrace (Err (e, r))
 425 | 
 426 | (* Encoder *)
 427 | 
 428 | let hex_digit =
 429 |   [|'0';'1';'2';'3';'4';'5';'6';'7';'8';'9';'A';'B';'C';'D';'E';'F'|]
 430 | 
 431 | type encoder_style = [ `Pp | `Minify | `Raw ]
 432 | type last_encoded = Comment | Token (* unquoted *) | Other
 433 | type encoder =
 434 |   { output : output; (* output byte stream. *)
 435 |     style : encoder_style; (* encoder style. *)
 436 |     quote : bool; (* quote atoms that need escaping. *)
 437 |     nl : bool; (* final line feed. *)
 438 |     o : Bytes.t; (* output buffer. *)
 439 |     mutable o_pos : int; (* next output pos to write. *)
 440 |     mutable nest : int; (* nesting level *)
 441 |     mutable last : last_encoded;
 442 |     mutable col : int; (* approx we don't decode the UTF-8 *) }
 443 | 
 444 | let encoder ?bytes ?(nl = false) ?(quote = true) ?(style = `Raw) output =
 445 |   let o = bytes_buf bytes in
 446 |   { output; style; quote; nl; o; o_pos = 0; nest = 0; last = Other; col = 0; }
 447 | 
 448 | let flush e = e.output (Some (e.o, 0, e.o_pos)); e.o_pos <- 0
 449 | 
 450 | let o_rem e = Bytes.length e.o - e.o_pos    (* rem bytes to write in [e.o]. *)
 451 | 
 452 | let rec o_byte e b = match o_rem e = 0 with
 453 | | true -> flush e; o_byte e b
 454 | | false -> unsafe_bytes_set_byte e.o e.o_pos b; e.o_pos <- e.o_pos + 1
 455 | 
 456 | let rec o_string e s pos l =    (* write [l] bytes of [s] starting at [pos]. *)
 457 |   let rem = o_rem e in
 458 |   match rem >= l with
 459 |   | true -> unsafe_bytes_blit s pos e.o e.o_pos l; e.o_pos <- e.o_pos + l
 460 |   | false ->
 461 |       unsafe_bytes_blit s pos e.o e.o_pos rem; e.o_pos <- e.o_pos + rem;
 462 |       flush e;
 463 |       o_string e s (pos + rem) (l - rem)
 464 | 
 465 | let rec e_atom e a =
 466 |   if a = "" then (o_string e "\"\"" 0 2; e.last <- Other) else
 467 |   let flush ~quoted a spos len =
 468 |     if spos = 0 then begin match quoted with
 469 |     | true -> o_byte e u_quot
 470 |     | false -> if e.last = Token then o_byte e u_sp
 471 |     end;
 472 |     o_string e a spos len
 473 |   in
 474 |   let rec loop ~quoted a spos epos max = match epos > max with
 475 |   | true ->
 476 |       if not (spos > max) then (flush ~quoted a spos (epos - spos));
 477 |       if quoted then (o_byte e u_quot; e.last <- Other) else e.last <- Token
 478 |   | false ->
 479 |       let next = epos + 1 in
 480 |       let escape esc =
 481 |         flush ~quoted:e.quote a spos (epos - spos);
 482 |         o_string e esc 0 (String.length esc);
 483 |         loop e.quote a next next max
 484 |       in
 485 |       let may_escape esc = match e.quote with
 486 |       | true -> loop ~quoted:e.quote a spos next max
 487 |       | false -> escape esc
 488 |       in
 489 |       match unsafe_string_get a epos with
 490 |       | ' ' -> may_escape "\\ "
 491 |       | '(' -> may_escape "\\("
 492 |       | ')' -> may_escape "\\)"
 493 |       | ';' -> may_escape "\\;"
 494 |       | '\\' -> escape "\\\\"
 495 |       | '"'  -> escape "\\\""
 496 |       | '\t' -> escape "\\t"
 497 |       | '\n' -> escape "\\n"
 498 |       | '\r' -> escape "\\r"
 499 |       | c when c <= '\x1F' ->
 500 |           flush ~quoted:e.quote a spos (epos - spos);
 501 |           let byte = Char.code c in
 502 |           let hi = byte / 16 in
 503 |           let lo = byte mod 16 in
 504 |           o_string e "\\u00" 0 4;
 505 |           o_byte e (Char.code @@ unsafe_array_get hex_digit hi);
 506 |           o_byte e (Char.code @@ unsafe_array_get hex_digit lo);
 507 |           loop e.quote a next next max
 508 |       | c -> loop ~quoted a spos next max
 509 |   in
 510 |   loop ~quoted:false a 0 0 (String.length a - 1)
 511 | 
 512 | let e_raw_atom e r =
 513 |   begin match r.[0] = '"' (* raw is quoted *) with
 514 |   | true ->  e.last <- Other
 515 |   | false -> if e.last = Token then o_byte e u_sp;
 516 |   end;
 517 |   o_string e r 0 (String.length r)
 518 | 
 519 | let e_ls e =
 520 |   if e.last = Comment then o_byte e u_lf;
 521 |   o_byte e u_lpar; e.nest <- e.nest + 1; e.last <- Other
 522 | 
 523 | let e_le e =
 524 |   if e.nest = 0 then invalid_arg err_enc_unopened;
 525 |   if e.last = Comment then o_byte e u_lf;
 526 |   o_byte e u_rpar; e.nest <- e.nest - 1; e.last <- Other
 527 | 
 528 | let e_white e w =
 529 |   let len = String.length w in
 530 |   if e.style <> `Raw || len = 0 then () else
 531 |   begin
 532 |     if not (w.[0] = '\n' || w.[0] = '\r') && e.last = Comment
 533 |     then o_byte e u_lf;
 534 |     o_string e w 0 len;
 535 |     e.last <- Other
 536 |   end
 537 | 
 538 | let e_comment e c =
 539 |   if e.style <> `Raw then () else
 540 |   begin
 541 |     if e.last = Comment then o_byte e u_lf;
 542 |     o_byte e u_semi;
 543 |     o_string e c 0 (String.length c);
 544 |     e.last <- Comment
 545 |   end
 546 | 
 547 | let e_lexeme e = function
 548 | | Atom (a, raw) ->
 549 |     if e.last = Comment then o_byte e u_lf;
 550 |     if e.style <> `Raw then e_atom e a else
 551 |     begin match raw with
 552 |     | None -> e_atom e a;
 553 |     | Some r -> e_raw_atom e r
 554 |     end
 555 | | Ls -> e_ls e
 556 | | Le -> e_le e
 557 | | White w -> e_white e w
 558 | | Comment c -> e_comment e c
 559 | 
 560 | let e_end e = match e.nest <> 0 with
 561 | | true -> invalid_arg err_enc_unclosed
 562 | | false ->
 563 |     if e.nl then o_byte e u_lf;
 564 |     if e.o_pos <> 0 then flush e;
 565 |     e.output None
 566 | 
 567 | let encode_lexeme e = function
 568 | | Some l -> e_lexeme e l
 569 | | None -> e_end e
 570 | 
 571 | (* Custom s-expression codec *)
 572 | 
 573 | let rec decode_custom ~atom ~list d =
 574 |   let rec loop stack acc = match d_lexeme d with
 575 |   | None -> err d (`Unclosed `List)
 576 |   | Some v ->
 577 |       match v with
 578 |       | Ls -> loop (acc :: stack) []
 579 |       | Atom (a, _) -> loop stack (atom d a :: acc)
 580 |       | White _ | Comment _ -> loop stack acc
 581 |       | Le ->
 582 |           let list = list d (List.rev acc) in
 583 |           match stack with
 584 |           | [] -> Ok (Some list)
 585 |           | up :: stack -> loop stack (list :: up)
 586 |   in
 587 |   try match dec_lexeme d with
 588 |   | None -> Ok None
 589 |   | Some v ->
 590 |       match v with
 591 |       | Ls -> loop [] []
 592 |       | Atom (a, _) -> Ok (Some (atom d a))
 593 |       | White _ | Comment _ -> decode_custom ~atom ~list d
 594 |       | Le -> err d (`Unexpected_le)
 595 |   with Err (e, r) -> Error (e, r)
 596 | 
 597 | let encode_custom unfold e v =
 598 |   let rec loop stack = function
 599 |   | [] ->
 600 |       e_lexeme e Le;
 601 |       if stack = [] then () else loop (List.tl stack) (List.hd stack)
 602 |   | v :: vs ->
 603 |       match unfold v with
 604 |       | `Atom a -> e_lexeme e (Atom (a, None)); loop stack vs
 605 |       | `List l -> e_lexeme e Ls; loop (vs :: stack) l
 606 |   in
 607 |   match v with
 608 |   | None -> e_end e
 609 |   | Some v ->
 610 |       match unfold v with
 611 |       | `Atom a -> e_lexeme e (Atom (a, None))
 612 |       | `List l -> e_lexeme e Ls; loop [] l
 613 | 
 614 | (* Tagged s-expression codec *)
 615 | 
 616 | type 'a t = [ `Atom of string | `List of 'a t list ] * 'a
 617 | type 'a tagger = decoder -> [`Atom of string | `List of 'a t list] -> 'a t
 618 | 
 619 | let unit_tag _ se = se, ()
 620 | let range_tag d se = se, decoded_sexp_range d
 621 | 
 622 | let decode_tagged ~tag d =
 623 |   let atom d a = tag d (`Atom a) in
 624 |   let list d l = tag d (`List l) in
 625 |   decode_custom ~atom ~list d
 626 | 
 627 | let encode_tagged e se =
 628 |   let unfold (se, _) = se in
 629 |   encode_custom unfold e se
 630 | 
 631 | (* Escaping *)
 632 | 
 633 | let escape_info ~quote s = (* determines escape length, if needed *)
 634 |   let max = String.length s - 1 in
 635 |   let rec loop raw i l =
 636 |     if i > max then (if raw then None else Some l) else
 637 |     let next = i + 1 in
 638 |     match unsafe_string_get s i with
 639 |     | ' ' | '(' | ')' | ';' -> loop false next (l + if quote then 1 else 2)
 640 |     | '\\' | '"' | '\t' | '\n' | '\r' -> loop false next (l + 2)
 641 |     | c when c <= '\x1F' -> loop false next (l + 6)
 642 |     | c -> loop raw next (l + 1)
 643 |   in
 644 |   loop true 0 0
 645 | 
 646 | let escape_char b k c =
 647 |   unsafe_bytes_set b k '\\';
 648 |   unsafe_bytes_set b (k + 1) c;
 649 |   ()
 650 | 
 651 | let escape ?(quote = true) s =
 652 |   if s = "" then "\"\"" else
 653 |   match escape_info ~quote s with
 654 |   | None -> s
 655 |   | Some escaped_len ->
 656 |       let b, start = match quote with
 657 |       | false -> Bytes.create escaped_len, 0
 658 |       | true ->
 659 |           let escaped_len = escaped_len + 2 in
 660 |           let b = Bytes.create escaped_len in
 661 |           unsafe_bytes_set b 0 '"'; unsafe_bytes_set b (escaped_len - 1) '"';
 662 |           b, 1
 663 |       in
 664 |       let max = String.length s - 1 in
 665 |       let rec loop i k =
 666 |         if i > max then Bytes.unsafe_to_string b else
 667 |         let next = i + 1 in
 668 |         match unsafe_string_get s i with
 669 |         | ' ' ->
 670 |             if quote then (unsafe_bytes_set b k ' '; loop next (k + 1)) else
 671 |             (escape_char b k ' '; loop next (k + 2))
 672 |         | '(' ->
 673 |             if quote then (unsafe_bytes_set b k '('; loop next (k + 1)) else
 674 |             (escape_char b k '('; loop next (k + 2))
 675 |         | ')' ->
 676 |             if quote then (unsafe_bytes_set b k ')'; loop next (k + 1)) else
 677 |             (escape_char b k ')'; loop next (k + 2))
 678 |         | ';' ->
 679 |             if quote then (unsafe_bytes_set b k ';'; loop next (k + 1)) else
 680 |             (escape_char b k ';'; loop next (k + 2))
 681 |         | '\\' -> escape_char b k '\\'; loop next (k + 2)
 682 |         | '"'  -> escape_char b k '"'; loop next (k + 2)
 683 |         | '\t' -> escape_char b k 't'; loop next (k + 2)
 684 |         | '\n' -> escape_char b k 'n'; loop next (k + 2)
 685 |         | '\r' -> escape_char b k 'r'; loop next (k + 2)
 686 |         | c when c <= '\x1F' ->
 687 |             let byte = Char.code c in
 688 |             let hi = byte / 16 in
 689 |             let lo = byte mod 16 in
 690 |             unsafe_bytes_set b (k    )'\\';
 691 |             unsafe_bytes_set b (k + 1) 'u';
 692 |             unsafe_bytes_set b (k + 2) '{';
 693 |             unsafe_bytes_set b (k + 3) (unsafe_array_get hex_digit hi);
 694 |             unsafe_bytes_set b (k + 4) (unsafe_array_get hex_digit lo);
 695 |             unsafe_bytes_set b (k + 5) '}';
 696 |             loop next (k + 6)
 697 |         | c -> unsafe_bytes_set b k c; loop next (k + 1)
 698 |       in
 699 |       loop 0 start
 700 | 
 701 | (* Traces *)
 702 | 
 703 | type trace = lexeme list
 704 | type trace_rev = lexeme list
 705 | 
 706 | let dec_trace_to_peek_sexp d rt =
 707 |   let rec loop d rt = match d_lexeme d with
 708 |   | Some (Ls | Atom _) as dec -> d.peek <- Some (Ok dec); true, rt
 709 |   | Some l -> loop d (l :: rt)
 710 |   | None -> d.peek <- Some (Ok None); false, rt
 711 |   in
 712 |   match d.peek with
 713 |   | None -> loop d rt
 714 |   | Some (Ok (Some (Ls | Atom _))) -> true, rt
 715 |   | Some (Ok (Some l)) -> d.peek <- None; loop d (l :: rt)
 716 |   | Some (Ok None) -> false, rt
 717 |   | Some (Error (e, r)) -> raise_notrace (Err (e, r))
 718 | 
 719 | let dec_trace_ls d rt =
 720 |   let rec loop d rt = match dec_lexeme d with
 721 |   | Some (White _ | Comment _ as w) -> loop d (w :: rt)
 722 |   | Some Ls -> Ls :: rt
 723 |   | Some (Le | Atom _) -> err d (`Codec "expected start of list")
 724 |   | None -> err d (`Codec "expected start of list")
 725 |   in
 726 |   loop d rt
 727 | 
 728 | let dec_trace_le d rt =
 729 |   let rec loop d rt = match dec_lexeme d with
 730 |   | Some (White _ | Comment _ as w) -> loop d (w :: rt)
 731 |   | Some Le -> Le :: rt
 732 |   | Some (Ls | Atom _) -> err d (`Codec "expected end of list")
 733 |   | None -> err d (`Codec "expected end of list")
 734 |   in
 735 |   loop d rt
 736 | 
 737 | let dec_trace_atom d rt =
 738 |   let rec loop d rt = match dec_lexeme d with
 739 |   | Some (White _ | Comment _ as w) -> loop d (w :: rt)
 740 |   | Some (Atom (a, _) as at) -> a, (at :: rt)
 741 |   | Some (Ls | Le) -> err d (`Codec "expected atom") (* fixme more prec. *)
 742 |   | None -> err d (`Codec "expected atom")
 743 |   in
 744 |   loop d rt
 745 | 
 746 | let patch_layout (t, rt) =
 747 |   let rec loop rt = function
 748 |   | [] -> [], rt
 749 |   | (White _ | Comment _ as w) :: t -> loop (w :: rt) t
 750 |   | (Atom _ | Le | Ls) :: _ as t -> t, rt
 751 |   in
 752 |   loop rt t
 753 | 
 754 | let patch_ls (t, rt) =
 755 |   let rec loop rt = function
 756 |   | [] -> [], Ls :: rt
 757 |   | (White _ | Comment _ as w) :: t -> loop (w :: rt) t
 758 |   | Ls :: t -> t, Ls :: rt
 759 |   | (Le | Atom _) :: _ -> assert false
 760 |   in
 761 |   loop rt t
 762 | 
 763 | let patch_le (t, rt) =
 764 |   let rec loop rt = function
 765 |   | [] -> [], Le :: rt
 766 |   | (White _ | Comment _ as w) :: t -> loop (w :: rt) t
 767 |   | Le :: t -> t, Le :: rt
 768 |   | (Ls | Atom _) :: _ -> assert false
 769 |   in
 770 |   loop rt t
 771 | 
 772 | let patch_atom a (t, rt) =
 773 |   let rec loop rt = function
 774 |   | [] -> [], a :: rt
 775 |   | (White _ | Comment _ as w) :: t -> loop (w :: rt) t
 776 |   | Atom _ :: t -> t, a :: rt
 777 |   | (Ls | Le) :: _ -> assert false
 778 |   in
 779 |   loop rt t
 780 | 
 781 | let patch_drop_atom (t, rt) = (* keeps layout before atom (if any). *)
 782 |   let rec loop rt = function
 783 |   | [] -> [], rt
 784 |   | (White _ | Comment _ as w) :: t -> loop (w :: rt) t
 785 |   | Atom _ :: t -> t, rt
 786 |   | (Ls | Le) :: _ -> assert false
 787 |   in
 788 |   loop rt t
 789 | 
 790 | let patch_drop_sexp (t, _ as recode) = (* keeps layout before sexp (if any). *)
 791 |   if t = [] then recode else
 792 |   let t, rt = patch_layout recode in
 793 |   let rec loop depth = function
 794 |   | Atom _ :: t when depth = -1 -> t
 795 |   | Ls :: t -> loop (depth + 1) t
 796 |   | Le :: t -> if depth = 0 then t else loop (depth - 1) t
 797 |   | l :: t -> loop depth t
 798 |   | [] -> assert false
 799 |   in
 800 |   loop (-1) t, rt
 801 | 
 802 | let patch_drop_until_le recode = (* keeps layout at the Le level *)
 803 |   let rec loop = function
 804 |   | (Le :: t, _ as recode) -> recode
 805 |   | (_ :: _, _ as recode) -> recode |> patch_drop_sexp |> patch_layout |> loop
 806 |   | ([], _) -> assert false
 807 |   in
 808 |   recode |> patch_layout |> loop
 809 | 
 810 | let patch_keep_atom (t, rt) =
 811 |   let rec loop rt = function
 812 |   | (White _ | Comment _ as w) :: t -> loop (w :: rt) t
 813 |   | Atom _ as a :: t -> t, a :: rt
 814 |   | (Ls | Le) :: _ | [] -> assert false
 815 |   in
 816 |   loop rt t
 817 | 
 818 | let patch_keep_list recoder =
 819 |   let rec loop depth rt = function
 820 |   | Ls :: t -> loop (depth + 1) (Ls :: rt) t
 821 |   | Le :: t ->
 822 |       begin match depth with
 823 |       | 0 -> t, Le :: rt
 824 |       | depth -> loop (depth - 1) (Le :: rt) t
 825 |       end
 826 |   | l :: t -> loop depth (l :: rt) t
 827 |   | [] -> assert false
 828 |   in
 829 |   let t, rt = patch_layout recoder in
 830 |   match t with
 831 |   | Ls :: t -> loop 0 (Ls :: rt) t
 832 |   | (Atom _ | White _ | Comment _ | Le) :: _ | [] -> assert false
 833 | 
 834 | let patch_keep_sexp (t, rt) =
 835 |   let rec loop depth rt = function
 836 |   | Atom _ as a :: t when depth = 0 -> t, a :: rt
 837 |   | Ls :: t -> loop (depth + 1) (Ls :: rt) t
 838 |   | Le :: t ->
 839 |       begin match depth with
 840 |       | 0 -> assert false
 841 |       | 1 -> t, Le :: rt
 842 |       | depth -> loop (depth - 1) (Le :: rt) t
 843 |       end
 844 |   | l :: t -> loop depth (l :: rt) t
 845 |   | [] -> assert false
 846 |   in
 847 |   loop 0 rt t
 848 | 
 849 | let add_atom a (t, rt) = (t, Atom (a, None) :: rt)
 850 | let add_ls (t, rt) = (t, Ls :: rt)
 851 | let add_le (t, rt) = (t, Le :: rt)
 852 | 
 853 | (* Codecs
 854 | 
 855 |    A few notes about encoder updates.
 856 | 
 857 |    * Fundamentally we don't need the old value as we could decode it
 858 |      back from the trace as needed when we encode, it just seems a bit
 859 |      cleaner that way.
 860 | 
 861 |    * Encoders see all the remaining data trace rather that just the data
 862 |      that concern themselves. We could chunk the trace so that sub-encoders
 863 |      see only what is needed but this involves a lot of list processing.
 864 | 
 865 |    * Regarding the last point we could also work on a sexp representation
 866 |      this would make chunking easier; but would also necessisate a
 867 |      clarification of white and comments are associated to tree nodes.
 868 |      Currently this is somehow left to the judgement of encoders
 869 |      (which may not be a bad idea). *)
 870 | 
 871 | type 'a codec_encoder =
 872 |   'a option -> 'a -> (trace * trace_rev) -> trace * trace_rev
 873 |   (* trace must be left untouched if option is None. *)
 874 | 
 875 | type 'a codec_decoder = decoder -> trace_rev -> 'a * trace_rev
 876 | type 'a codec =
 877 |   { eq : 'a -> 'a -> bool;
 878 |     default : 'a;
 879 |     enc : 'a codec_encoder;
 880 |     dec : 'a codec_decoder; }
 881 | 
 882 | let codec ~eq ~default enc dec = { eq; default; enc; dec; }
 883 | let codec_with_default default c = { c with default }
 884 | let codec_with_codec ?eq ?default c =
 885 |   let eq = match eq with None -> c.eq | Some eq -> eq in
 886 |   let default = match default with None -> c.default | Some d -> d in
 887 |   { c with eq; default }
 888 | 
 889 | let rec enc_lexemes e = function
 890 | | [] -> ()
 891 | | l :: ls -> encode_lexeme e (Some l); enc_lexemes e ls
 892 | 
 893 | let encode_traced_value c e (v, t) = match v with
 894 | | None -> enc_lexemes e t; encode_lexeme e None
 895 | | Some (o, n) -> enc_lexemes e (List.rev @@ snd @@ c.enc (Some o) n (t, []))
 896 | 
 897 | let decode_traced_value c d =
 898 |   try
 899 |     let has_sexp, rt = dec_trace_to_peek_sexp d [] in
 900 |     if not has_sexp then Ok (None, List.rev rt) else
 901 |     let v, rt = c.dec d rt in
 902 |     Ok (Some v, List.rev rt)
 903 |   with
 904 |   | Err (e, r) -> Error (e, r)
 905 | 
 906 | let encode_value c e v = match v with
 907 | | None -> encode_lexeme e None
 908 | | Some v -> enc_lexemes e (List.rev @@ snd @@ c.enc None v ([], []))
 909 | 
 910 | let decode_value c d = match decode_traced_value c d with
 911 | | Ok (v, _) -> Ok v
 912 | | Error _ as e -> e
 913 | 
 914 | (* Codecs *)
 915 | 
 916 | let atom_codec ~eq ~default enc dec =
 917 |   let rec atom_enc enc o n recode = match o with
 918 |   | None -> recode |> add_atom (enc n)
 919 |   | Some o when eq o n -> recode |> patch_keep_atom
 920 |   | Some o -> recode |> patch_drop_atom |> atom_enc enc None n
 921 |   in
 922 |   let atom_dec dec d rt =
 923 |     let rec loop d rt = match dec_lexeme d with
 924 |     | Some (Atom (a, _) as l) -> dec d a, (l :: rt)
 925 |     | Some (Comment _ | White _ as l) -> loop d (l :: rt)
 926 |     | Some (Ls | Le) | None  -> err d (`Codec "expected atom") (* FIXME *)
 927 |     in
 928 |     loop d rt
 929 |   in
 930 |   { eq; default; enc = atom_enc enc; dec = atom_dec dec; }
 931 | 
 932 | let unit =
 933 |   let eq = (( = ) : unit -> unit -> bool) in
 934 |   let default = () in
 935 |   let unit_enc () = "unit" in
 936 |   let unit_dec d = function
 937 |   | "unit" -> ()
 938 |   | _ -> err d (`Codec "expected `unit'")
 939 |   in
 940 |   atom_codec ~eq ~default unit_enc unit_dec
 941 | 
 942 | let bool =
 943 |   let eq = (( = ) : bool -> bool -> bool) in
 944 |   let default = false in
 945 |   let bool_enc = string_of_bool in
 946 |   let bool_dec d a = try bool_of_string a with
 947 |   | Invalid_argument _ -> err d (`Codec "expected boolean")
 948 |   in
 949 |   atom_codec ~eq ~default bool_enc bool_dec
 950 | 
 951 | let int =
 952 |   let eq = (( = ) : int -> int -> bool) in
 953 |   let default = 0 in
 954 |   let int_enc = string_of_int in
 955 |   let int_dec d a = try int_of_string a with
 956 |   | Failure _ -> err d (`Codec "expected integer")
 957 |   in
 958 |   atom_codec ~eq ~default int_enc int_dec
 959 | 
 960 | let float_fmt fmt =
 961 |   let eq = (( = ) : float -> float -> bool) in
 962 |   let default = 0. in
 963 |   let float_enc f = Printf.sprintf fmt f in
 964 |   let float_dec d a = try float_of_string a with
 965 |   | Failure _ -> err d (`Codec "expected float")
 966 |   in
 967 |   atom_codec ~eq ~default float_enc float_dec
 968 | 
 969 | let float = float_fmt "%g"
 970 | let float_hex = float_fmt "%h"
 971 | 
 972 | let string =
 973 |   let eq = String.equal in
 974 |   let default = "" in
 975 |   let string_enc x = x in
 976 |   let string_dec d x = x in
 977 |   atom_codec ~eq ~default string_enc string_dec
 978 | 
 979 | let result ~ok ~error =
 980 |   let eq r0 r1 = match r0, r1 with
 981 |   | Ok _, Error _ | Error _, Ok _ -> false
 982 |   | Ok v0, Ok v1 -> ok.eq v0 v1
 983 |   | Error e0, Error e1 -> error.eq e0 e1
 984 |   in
 985 |   let default = Ok ok.default in
 986 |   let rec enc o n recode = match o with
 987 |   | None ->
 988 |       let add_case c at n recode =
 989 |         recode |> add_ls |> add_atom at |> c.enc None n |> add_le
 990 |       in
 991 |       begin match n with
 992 |       | Ok n -> recode |> add_case ok "Ok" n
 993 |       | Error n -> recode |> add_case error "Error" n
 994 |       end
 995 |   | Some o when eq o n -> recode |> patch_keep_list
 996 |   | Some o ->
 997 |       let patch_case c o n recode =
 998 |         recode |> patch_ls |> patch_keep_atom |> c.enc o n |> patch_le
 999 |       in
1000 |       match o, n with
1001 |       | Ok o, Ok n -> recode |> patch_case ok (Some o) n
1002 |       | Error o, Error n -> recode |> patch_case error (Some o) n
1003 |       | _ -> recode |> patch_layout |> patch_drop_sexp |> enc None n
1004 |   in
1005 |   let dec d rt =
1006 |     let rt = dec_trace_ls d rt in
1007 |     let atom, rt = dec_trace_atom d rt in
1008 |     let v, rt = match atom with
1009 |     | "Ok" -> let v, rt = ok.dec d rt in Ok v, rt
1010 |     | "Error" -> let e, rt = error.dec d rt in Error e, rt
1011 |     | atom -> err d (`Codec "expected `Ok' or `Error'") (* FIXME *)
1012 |     in
1013 |     let rt = dec_trace_le d rt in
1014 |     v, rt
1015 |   in
1016 |   codec ~eq ~default enc dec
1017 | 
1018 | let t2 c0 c1 =
1019 |   let eq (l0, l1) (r0, r1) = c0.eq l0 r0 && c1.eq l1 r1 in
1020 |   let default = c0.default, c1.default in
1021 |   let enc o (n0, n1 as n) recode = match o with
1022 |   | None ->
1023 |       recode |> add_ls |> c0.enc None n0 |> c1.enc None n1 |> add_le
1024 |   | Some o when eq o n -> recode |> patch_keep_list
1025 |   | Some (o0, o1) ->
1026 |       recode
1027 |       |> patch_ls |> c0.enc (Some o0) n0 |> c1.enc (Some o1) n1 |> patch_le
1028 |   in
1029 |   let dec d rt =
1030 |     let rt = dec_trace_ls d rt in
1031 |     let v0, rt = c0.dec d rt in
1032 |     let v1, rt = c1.dec d rt in
1033 |     let rt = dec_trace_le d rt in
1034 |     (v0, v1), rt
1035 |   in
1036 |   codec ~eq ~default enc dec
1037 | 
1038 | let list c =
1039 |   let rec eq l0 l1 = match l0, l1 with
1040 |   | v0 :: l0, v1 :: l1 when c.eq v0 v1 -> eq l0 l1
1041 |   | [], [] -> true
1042 |   | _, _ -> false
1043 |   in
1044 |   let default = [] in
1045 |   let enc o n recode =
1046 |     let rec add_els n recode = match n with
1047 |     | nv :: n -> add_els n (c.enc None nv recode)
1048 |     | [] -> recode
1049 |     in
1050 |     match o with
1051 |     | None -> recode |> add_ls |> add_els n |> add_le
1052 |     | Some o when eq o n -> recode |> patch_keep_list
1053 |     | Some o ->
1054 |         let rec els o n recode = match o, n with
1055 |         | ov :: o, nv :: n -> els o n (c.enc (Some ov) nv recode)
1056 |         | [], [] -> recode
1057 |         | [], n -> recode |> patch_layout |> add_els n
1058 |         | o, [] -> patch_drop_until_le recode
1059 |         in
1060 |         recode |> patch_ls |> els o n |> patch_le
1061 |   in
1062 |   let dec d rt =
1063 |     let rt = dec_trace_ls d rt in
1064 |     let rec loop d acc rt = match pek_lexeme d with
1065 |     | None -> err d (`Codec "unexpected end of input")
1066 |     | Some Le -> ignore (dec_lexeme d); List.rev acc, (Le :: rt)
1067 |     | Some (Ls | Atom _) ->
1068 |         let v, rt = c.dec d rt in
1069 |         loop d (v :: acc) rt
1070 |     | Some (White _ | Comment _ as l) ->
1071 |         ignore (dec_lexeme d); loop d acc (l :: rt)
1072 |     in
1073 |     loop d [] rt
1074 |   in
1075 |   codec ~eq ~default enc dec
1076 | 
1077 | let set
1078 |     (type e) (type s) (module Set : Set.S with type elt = e and type t = s)
1079 |     c  =
1080 |   let eq = Set.equal in
1081 |   let default = Set.empty in
1082 |   let enc o n recode =
1083 |     let rec add_els n recode =
1084 |       if Set.is_empty n then recode else
1085 |       let v = Set.choose n in
1086 |       let n = Set.remove v n in
1087 |       add_els n (c.enc None v recode)
1088 |     in
1089 |     match o with
1090 |     | None -> recode |> add_ls |> add_els n |> add_le
1091 |     | Some o when eq o n -> recode |> patch_keep_list
1092 |     | Some o ->
1093 |         let rec els o n recode =
1094 |           let ov = Set.choose o in
1095 |           let _ = Set.remove ov o in
1096 |           failwith "TODO"
1097 |         in
1098 |         recode |> patch_ls |> els o n |> patch_le
1099 |   in
1100 |   let dec d rt =
1101 |     let rt = dec_trace_ls d rt in
1102 |     let rec loop d acc rt = match pek_lexeme d with
1103 |     | None -> err d (`Codec "unexpected end of input")
1104 |     | Some Le -> ignore (dec_lexeme d); acc, (Le :: rt)
1105 |     | Some (Ls | Atom _) ->
1106 |         let v, rt = c.dec d rt in
1107 |         if Set.mem v acc then err d (`Codec "not a set") else
1108 |         loop d (Set.add v acc) rt
1109 |     | Some (White _ | Comment _ as l) ->
1110 |         ignore (dec_lexeme d); loop d acc (l :: rt)
1111 |     in
1112 |     loop d Set.empty rt
1113 |   in
1114 |   codec ~eq ~default enc dec
1115 | 
1116 | type 'a atom = ('a -> string) * (string -> 'a)
1117 | 
1118 | and 'a variant_const = string * 'a
1119 | and ('a, 'b) variant_valued = string * 'b c * ('b -> 'a)
1120 | 
1121 | and 'a variant_proj =
1122 | | Pconst : 'a variant_const -> 'a variant_proj
1123 | | Pvalued : ('a, 'b) variant_valued * 'b -> 'a variant_proj
1124 | 
1125 | and 'a variant_inj =
1126 | | Iconst : 'a variant_const -> 'a variant_inj
1127 | | Ivalued : ('a, 'b) variant_valued -> 'a variant_inj
1128 | 
1129 | and 'a variant_projs = 'a -> 'a variant_proj
1130 | and 'a variant_injs = 'a variant_inj list
1131 | 
1132 | and 'a repr =
1133 | | Ground : 'a atom -> 'a repr
1134 | | List : 'a c -> 'a list repr
1135 | | Pair : 'a c * 'b c -> ('a * 'b) repr
1136 | | Variant : 'a variant_projs * 'a variant_injs -> 'a repr
1137 | 
1138 | and 'a c =
1139 |   { name : string;
1140 |     eq : 'a -> 'a -> bool;
1141 |     default : 'a;
1142 |     repr : 'a repr }
1143 | 
1144 | let encode_value c e v = match v with
1145 | | None -> encode_lexeme e None
1146 | | Some v ->
1147 |     let rec loop : type a. a c -> encoder -> a -> unit = (* FIXME not t.r. *)
1148 |     fun c e v -> match c.repr with
1149 |     | Ground (enc, _) -> encode_lexeme e (Some (Atom ((enc v), None)))
1150 |     | List c ->
1151 |         encode_lexeme e (Some Ls);
1152 |         List.iter (loop c e) v;
1153 |         encode_lexeme e (Some Le)
1154 |     | Pair (c0, c1) ->
1155 |         encode_lexeme e (Some Ls);
1156 |         loop c0 e (fst v);
1157 |         loop c1 e (snd v);
1158 |         encode_lexeme e (Some Le)
1159 |     | Variant (proj, _) ->
1160 |         begin match proj v with
1161 |         | Pconst (a, _) -> encode_lexeme e (Some (Atom (a, None)))
1162 |         | Pvalued ((a, c, _), v) ->
1163 |             encode_lexeme e (Some Ls);
1164 |             loop c e v;
1165 |             encode_lexeme e (Some Le);
1166 |         end
1167 |     in
1168 |     loop c e v
1169 | 
1170 | let rec decode_ground : type a. a c -> decoder -> (string -> a) -> a =
1171 | fun c d dec ->
1172 |   let at, _ = dec_trace_atom d [] in
1173 |   try dec at with
1174 |   | e -> err d (`Codec ("error decoding  " ^ c.name))
1175 | 
1176 | and decode_list : type a. a c -> decoder -> a list =
1177 | fun c d ->
1178 |   let _ = dec_trace_ls d [] in
1179 |   let rec els d acc = match pek_lexeme d with
1180 |   | None -> err d (`Codec "unexpected end of input")
1181 |   | Some Le -> ignore (dec_lexeme d); List.rev acc
1182 |   | Some (Ls | Atom _) ->
1183 |       let v = decode_value c d in
1184 |       els d (v :: acc)
1185 |   | Some (White _ | Comment _ as l) -> ignore (dec_lexeme d); els d acc
1186 |   in
1187 |   els d []
1188 | 
1189 | and decode_pair : type a b. a c -> b c -> decoder -> a * b =
1190 | fun c0 c1 d ->
1191 |   let _ = dec_trace_ls d [] in
1192 |   let v0 = decode_value c0 d in
1193 |   let v1 = decode_value c1 d in
1194 |   ignore (dec_trace_le d []);
1195 |   v0, v1
1196 | 
1197 | and decode_value : type a. a c -> decoder -> a = (* FIXME not t.r. *)
1198 | fun c d -> match c.repr with
1199 | | Ground (_, dec) -> decode_ground c d dec
1200 | | List c -> decode_list c d
1201 | | Pair (c0, c1) -> decode_pair c0 c1 d
1202 | | Variant (_, inj) ->
1203 |     let has_sexp, _ = dec_trace_to_peek_sexp d [] in
1204 |     if not has_sexp then err d (`Codec ("error decoding " ^ c.name)) else
1205 |     begin match dec_lexeme d with
1206 |     | Some Ls ->
1207 |         let at, _ = dec_trace_atom d [] in
1208 |         let rec dec_valued at = function
1209 |         | Ivalued (id, c, inj) :: _ when id = at ->
1210 |             let v = inj (decode_value c d) in
1211 |             let _ = dec_trace_le d in
1212 |             v
1213 |         | _ :: injs -> dec_valued at injs
1214 |               | [] -> err d (`Codec ("error decoding " ^ c.name))
1215 |         in
1216 |         dec_valued at inj
1217 |     | Some (Atom (at, _)) ->
1218 |         let rec find_const at = function
1219 |         | Iconst (id, v) :: _ when id = at -> v
1220 |         | _ :: injs -> find_const at injs
1221 |         | [] -> err d (`Codec ("error decoding " ^ c.name))
1222 |         in
1223 |         find_const at inj
1224 |     | _ -> assert false
1225 |     end
1226 | 
1227 | let decode_value c d =
1228 |   try match pek_lexeme d with
1229 |   | None -> ignore (dec_lexeme d); Ok None
1230 |   | Some _ -> decode_value c d
1231 |   with
1232 |   | Err (e, r) -> Error (e, r)
1233 | 
1234 | let c name ~eq ~default ~repr = { name; eq; default; repr }
1235 | 
1236 | let c_int =
1237 |   let eq = (( = ) : int -> int -> bool) in
1238 |   let default = 0 in
1239 |   let int_enc = string_of_int in
1240 |   let int_dec = int_of_string in
1241 |   c "integer" ~eq ~default ~repr:(Ground (int_enc, int_dec))
1242 | 
1243 | let c_t2 c0 c1 =
1244 |   let eq (l0, l1) (r0, r1) = c0.eq l0 r0 && c1.eq l1 r1 in
1245 |   let default = c0.default, c1.default in
1246 |   c "pair" ~eq ~default ~repr:(Pair (c0, c1))
1247 | 
1248 | let c_result ~ok ~error =
1249 |   let eq r0 r1 = match r0, r1 with
1250 |   | Ok _, Error _ | Error _, Ok _ -> false
1251 |   | Ok v0, Ok v1 -> ok.eq v0 v1
1252 |   | Error e0, Error e1 -> error.eq e0 e1
1253 |   in
1254 |   let default = Ok ok.default in
1255 |   let ok = "Ok", ok, fun v -> Ok v in
1256 |   let error = "Error", error, fun v -> Error v in
1257 |   let select = function
1258 |   | Ok v -> Pvalued (ok, v)
1259 |   | Error v -> Pvalued (error, v)
1260 |   in
1261 |   let result = Variant (select, [Ivalued ok; Ivalued error]) in
1262 |   c "result" ~eq ~default ~repr:result
1263 | 
1264 | let none = "None", None
1265 | let none_proj = Pconst none
1266 | let none_inj = Iconst none
1267 | let c_option some =
1268 |   let eq o0 o1 = match o0, o1 with
1269 |   | Some v0, Some v1 -> some.eq v0 v1
1270 |   | None, None -> true
1271 |   | Some _, None
1272 |   | None, Some _ -> false
1273 |   in
1274 |   let default = None in
1275 |   let some = "Some", some, fun v -> Some v in
1276 |   let select = function
1277 |   | None -> none_proj
1278 |   | Some v -> Pvalued (some, v)
1279 |   in
1280 |   let option = Variant (select, [none_inj; Ivalued some]) in
1281 |   c "option" ~eq ~default ~repr:option
1282 | 
1283 | (*
1284 | type t = A | B | C of string
1285 | 
1286 | let t cc =
1287 |   let eq v0 v1 = v0 = v1 in
1288 |   let default = A in
1289 |   let a = "A", A in
1290 |   let b = "B", B in
1291 |   let cc = "C", cc, (fun v -> C v) in
1292 |   let select = function
1293 |   | A -> Pconst a
1294 |   | B -> Pconst b
1295 |   | C v -> Pvalued (cc, v)
1296 |   in
1297 |   let t = Variant (select, [Iconst a; Iconst b; Ivalued cc]) in
1298 |   c "t" ~eq ~default ~repr:t
1299 | *)
1300 | 
1301 | 
1302 | 
1303 | 
1304 | 
1305 | (*---------------------------------------------------------------------------
1306 |    Copyright (c) 2016 Daniel C. Bünzli
1307 | 
1308 |    Permission to use, copy, modify, and/or distribute this software for any
1309 |    purpose with or without fee is hereby granted, provided that the above
1310 |    copyright notice and this permission notice appear in all copies.
1311 | 
1312 |    THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
1313 |    WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
1314 |    MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
1315 |    ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
1316 |    WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
1317 |    ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
1318 |    OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
1319 |   ---------------------------------------------------------------------------*)
1320 | 


--------------------------------------------------------------------------------
/src/sexpm.mli:
--------------------------------------------------------------------------------
  1 | (*---------------------------------------------------------------------------
  2 |    Copyright (c) 2016 Daniel C. Bünzli. All rights reserved.
  3 |    Distributed under the ISC license, see terms at the end of the file.
  4 |    %%NAME%% %%VERSION%%
  5 |   ---------------------------------------------------------------------------*)
  6 | 
  7 | (** S-expression codecs and updates.
  8 | 
  9 |     [Sexpm] codecs sequences of s-expressions to different
 10 |     representations.  Support is provided for:
 11 |     {ul
 12 |     {- Generic {{!custom_codec}custom} or {{!tagged_codec}tagged}
 13 |        s-expressions.}
 14 |     {- Typed OCaml {{!typed_codecs}values codecs} with layout preserving
 15 |        concrete syntax updates.}
 16 |     {- Low-level sequences of {{!lexeme_codec}s-expression lexemes} with
 17 |        concrete syntax layout preservation.}}
 18 |     Consult the {{!syntax}syntax} of s-expressions,
 19 |     {{!tips_lims}tips and limitations} and {{!examples}examples} of use.
 20 | 
 21 |     {e %%VERSION%% — {{:%%PKG_HOMEPAGE%% }homepage}} *)
 22 | 
 23 | (** {1:io IO} *)
 24 | 
 25 | type input = unit -> (Bytes.t * int * int) option
 26 | (** The type for input byte streams. On [Some (b, pos, len)] the bytes
 27 |     of [b] from [pos] to [pos+len] are readable until the next call to
 28 |     the function. On [None] the end of stream is reached. *)
 29 | 
 30 | val input_of_string : string -> input
 31 | (** [input_of_string s] is a byte stream reading from [s]. *)
 32 | 
 33 | val input_of_in_channel : ?bytes:Bytes.t -> in_channel -> input
 34 | (** [input_of_in_channel ~bytes ic] is a byte stream reading from [ic]
 35 |     using [bytes] to read the data.
 36 | 
 37 |     @raise Invalid_argument if the length of [bytes] is [0]. *)
 38 | 
 39 | type output = (Bytes.t * int * int) option -> unit
 40 | (** The type for output byte streams. Given [Some (b, pos, len)] the bytes
 41 |     of [b] from [pos] to [pos+len] are readable until the function
 42 |     returns. On [None] the end of stream is signalled to the function. *)
 43 | 
 44 | val output_of_buffer : Buffer.t -> output
 45 | (** [output_to_buffer b] is an output stream writing to buffer [b]. *)
 46 | 
 47 | val output_of_out_channel : out_channel -> output
 48 | (** [output_of_out_channel oc] is an output stream writing to [oc].
 49 |     It is the client's duty to flush it or close it. *)
 50 | 
 51 | (** {1:decoder Decoders} *)
 52 | 
 53 | type pos = int * int
 54 | (** The type for character positions. A one-based line and zero-based
 55 |     column number. *)
 56 | 
 57 | type range = pos * pos
 58 | (** The type for character ranges. *)
 59 | 
 60 | val pp_range : Format.formatter -> range -> unit
 61 | (** [pp_range ppf r] prints [r] on [ppf] according to
 62 |     {{:https://www.gnu.org/prep/standards/standards.html#Errors}GNU
 63 |     conventions}. *)
 64 | 
 65 | type error =
 66 | [ `Illegal_bytes of string
 67 | | `Illegal_uchar of Uchar.t
 68 | | `Illegal_escape of
 69 |     [ `Exp_lbrace of Uchar.t
 70 |     | `Exp_rbrace of Uchar.t
 71 |     | `Exp_hex of Uchar.t
 72 |     | `Exp_hex_rbrace of Uchar.t
 73 |     | `Not_esc of Uchar.t
 74 |     | `Not_uchar of int ]
 75 | | `Unclosed of [ `Quoted_token | `List | `Escape ]
 76 | | `Unexpected_le
 77 | | `Codec of string (* FIXME *) ]
 78 | (** The type for decode errors. *)
 79 | 
 80 | val pp_error : Format.formatter -> error -> unit
 81 | (** [pp_error ppf e] prints an unspecified representation of [e]
 82 |     on [ppf]. *)
 83 | 
 84 | type decoder
 85 | (** The type for UTF-8 s-expression decoders. *)
 86 | 
 87 | val decoder : ?layout:bool -> input -> decoder
 88 | (** [decoder i] is a decoder decoding s-expressions from [i] the UTF-8
 89 |     encoded byte stream [i].
 90 | 
 91 |     If [layout] is [true] (defaults to [false]), whitespace, comments and
 92 |     raw atoms are preserved by the underlying {{!lexeme} lexeme decoder}. This
 93 |     allows the layout of the decoded sequence's concrete syntax to be
 94 |     preserved. *)
 95 | 
 96 | val decoder_layout : decoder -> bool
 97 | (** [decoder_layout d] is the value of [layout], see {!decoder}. *)
 98 | 
 99 | val decoded_range : decoder -> range
100 | (** [decoded_range d] is the range of characters spanning the last
101 |     decoded lexeme or error. *)
102 | 
103 | val decoded_sexp_range : decoder -> range
104 | (** [decoder_sexp_range d] is the range of characters spanning the
105 |     last fully parsed s-expression. The value is accurate only after
106 |     [Atom _] and [Le] lexemes. *)
107 | 
108 | (** {1:encoders Encoders} *)
109 | 
110 | type encoder
111 | (** The type for UTF-8 s-expression encoders. *)
112 | 
113 | type encoder_style = [ `Pp | `Minify | `Raw ]
114 | (** The type for the encoder output style.
115 |     {ul
116 |     {- [`Pp], pretty printed output. If {{!lexeme}lexemes}
117 |        are encoded, whitespace and raw atoms are ignored.}
118 |     {- [`Minify], compact output. If {{!lexeme}lexemes}
119 |        are encoded, whitespace and raw atoms lexemes are ignored.}
120 |     {- [`Raw], raw output. Like [`Minify] except whitespace and
121 |        raw atoms lexemes are not ignored.}} *)
122 | 
123 | val encoder :
124 |   ?bytes:Bytes.t -> ?nl:bool -> ?quote:bool -> ?style:encoder_style ->
125 |   output -> encoder
126 | (** [encoder ~nl ~bytes ~minify o] is an encoder encoding s-expressions on [o].
127 |     {ul
128 |     {- [style] indicates the {{!encoder_style}encoder style}, defaults
129 |        to [`Pp].}
130 |     {- [quote], quoting behaviour on non-raw atoms. If [true] (default)
131 |        atoms are quoted iff they need escapes. If [false] no atoms except
132 |        the empty string are quoted.}
133 |     {- [nl] indicates whether the encoder makes sure a final line feed
134 |        U+000A is encoded at the end of the sequence, defaults to [false].}
135 |     {- [bytes] indicates the buffer to pass to [o].}}
136 | 
137 |     @raise Invalid_argument if the length of [bytes] is [0]. *)
138 | 
139 | (** {1:custom_codec Custom s-expression codec}
140 | 
141 |     Codecs {{!sexp}sequences} of s-expressions by mapping them
142 |     to a custom s-expression type. See an
143 |     {{!custom_example}example.} *)
144 | 
145 | val decode_custom :
146 |   atom:(decoder -> string -> 'a) ->
147 |   list:(decoder -> 'a list -> 'a) ->
148 |   decoder -> ('a option, error * range) result
149 | (** [decode_custom ~atom ~list d] is [Some v], constructed using [atom]
150 |     and [list] by folding over a single s-expression decoded from [d]
151 |     or [None] if the end stream was reached. *)
152 | 
153 | val encode_custom :
154 |   ('a -> [ `Atom of string | `List of 'a list ]) ->
155 |   encoder -> 'a option -> unit
156 | (** [encode_custom unfold e v] is encodes a single s-expression on [e]
157 |     by unfolding [v] using [unfold]. Use with [None] to signal the
158 |     end of the sequence. *)
159 | 
160 | (** {1:tagged_codec Tagged s-expression codec}
161 | 
162 |     Codecs {{!sexp}sequences} of s-expressions to a generic
163 |     tagged representation of s-expressions. *)
164 | 
165 | type 'a t = [ `Atom of string | `List of 'a t list ] * 'a
166 | (** The type for s-expressions tagged with values of type ['a].
167 |     Either an unescaped, UTF-8 encoded, atom or a list of
168 |     s-expressions. *)
169 | 
170 | type 'a tagger =
171 |   decoder -> [ `Atom of string | `List of 'a t list ] -> 'a t
172 | (** The type for s-expression taggers. Given the decoder and the last
173 |     decoded s-expression, returns a tagged expression. *)
174 | 
175 | val unit_tag : unit tagger
176 | (** [unit_tag] is [fun _ se -> se, ()]. *)
177 | 
178 | val range_tag : range tagger
179 | (** [range_tag] is [fun d se -> se, Sexpm.decoded_sexp_range d]. *)
180 | 
181 | val decode_tagged :
182 |   tag:'a tagger -> decoder -> ('a t option, error * range) result
183 | (** [decode_tagged tag d] is an s-expression decoded from [d] and tagged
184 |     according to [tag] or [None] if the end of stream was reached. *)
185 | 
186 | val encode_tagged : encoder -> 'a t option -> unit
187 | (** [encode_tagged e se] encodes the s-expression [se] on [e].
188 |     Use with [None] to signal the end of sequence. *)
189 | 
190 | (** {1:lexeme_codec Lexeme sequence codec}
191 | 
192 |     Codecs {{!sexp}sequences} of s-expressions as layout preserving,
193 |     {{!lexeme}well-formed}, sequences of lexemes. *)
194 | 
195 | type lexeme =
196 |   | White of string
197 |   | Comment of string
198 |   | Ls | Le | Atom of string * string option (** *)
199 |   (** The type for s-expression lexemes.
200 | 
201 |       {ul
202 |       {- [White s] is {{!whitespace}whitespace}.}
203 |       {- [Comment s] is a {{!comments}comment} without the starting
204 |          semi-colon and ending new line.}
205 |       {- [Ls]tart and [Le]nd delimit lists.}
206 |       {- [Atom (a, raw)] is for the unescaped atom [a] that was parsed
207 |          from the raw string [raw].}}
208 | 
209 |       On decoding, whitespace, comments and raw atoms are reported
210 |       only if the decoder has {{!decoder_layout}layout}. No sequence of
211 |       two [White] can be decoded from an input sequence.
212 | 
213 |       A {e well-formed} sequence of lexemes belongs to the language of the
214 |       following {{:https://tools.ietf.org/html/rfc5234}RFC 5234} ABNF
215 |       grammar defined on a sequence of lexemes.
216 | {v
217 | sexp-seq = *(White _ / Comment _ / sexp)
218 |     sexp = Atom _ / Ls *(sexp-seq) Le
219 | v}
220 |       A {!decoder} returns only well-formed sequences of lexemes even
221 |       when {!error}s are returned (see {!error_recovery}). The lexemes
222 |       strings are always UTF-8 encoded and in these strings characters
223 |       originally escaped in the input are in their unescaped representation.
224 |       [White _], [Commment _] and raw atom representations are only ever
225 |       returned if the decoder has {{!decoder_layout}layout}.
226 | 
227 |       An {!encoder} accepts only well-formed sequences of lexemes or
228 |       [Invalid_argument] is raised. The strings given to the lexemes
229 |       are assumed to be UTF-8 encoded, this is {b not} checked by the
230 |       module. For atoms if [raw] is given this raw string will be used
231 |       to output the token it is the client's duty to make sure it is
232 |       properly quoted and escaped. If [raw] is not provided the module
233 |       will handle this. For comments if the string does not end with a
234 |       new line and further lexemes are encoded a new line is added. *)
235 | 
236 | val pp_lexeme : Format.formatter -> lexeme -> unit
237 | (** [pp_lexeme ppf l] prints an unspecified non-sexp representation of
238 |     [l] on [ppf]. *)
239 | 
240 | val decode_lexeme : decoder -> (lexeme option, error * range) result
241 | (** [decode_lexeme d] is:
242 |     {ul
243 |     {- [Ok (Some l)] if a lexeme [l] was decoded.}
244 |     {- [Ok None] if the end of input was reached.}
245 |     {- [Error e] if a decoding error occurred. If the client is
246 |        interested in a best-effort decoding it can still continue to
247 |        decode afer an error (see {!error_recovery}) altough the
248 |        resulting sequence of lexemes is undefined and may not be
249 |        well-formed.}}
250 | 
251 |     {b Note.} Repeated invocation eventually always returns [Ok None], even
252 |     in case of error. *)
253 | 
254 | val peek_lexeme : decoder -> (lexeme option, error * range) result
255 | (** [peek_lexeme d] is like {!decode_lexeme} but doesn't remove
256 |     the lexeme from the input sequence. *)
257 | 
258 | val encode_lexeme : encoder -> lexeme option -> unit
259 | (** [encode_lexeme e l] encodes [l] on [e]. [None] should always
260 |     be eventually encoded.
261 | 
262 |     @raise Invalid_argument if a non {{!lexeme}well-formed} sequence
263 |     of lexemes is encoded.
264 | 
265 |     {b Warning.} The encoder does not check the following properties
266 |     and  for the output to be valid it is the client's duty to make sure
267 |     that in encoding:
268 |     {ul
269 |     {- [White s], [s] is only made of {{!whitespace}whitespace}}
270 |     {- [Comment s], [s] is UTF-8 encoded, does not contain US-ASCII control
271 |        characters except spaces or tabs and does not contain any end of
272 |        line except possibly at the end.}
273 |     {- [Atom (a, None)], [a] is UTF-8 encoded.}
274 |     {- [Atom (_, Some r)], [r] is UTF-8 encoded and, if needed, properly
275 |        quoted and escaped. {!escape} can be used for this.}} *)
276 | 
277 | (** {1:escaping Atom escaping} *)
278 | 
279 | val escape : ?quote:bool -> string -> string
280 | (** [escape ~quote s] escapes the UTF-8 encoded string [s] to an
281 |     s-expression {{!atoms}token or quoted token}. If [quote] is [true]
282 |     (default) the token is quoted iff [s] needs escapes or is the empty
283 |     string. If [quote] is [false] the token is never quoted except if [s]
284 |     is the empty string. *)
285 | 
286 | (** {1:typed_codecs OCaml values codecs} *)
287 | 
288 | type trace = lexeme list
289 | (** The type for the list of lexemes that described a value. *)
290 | 
291 | type 'a codec
292 | (** The type for codecs for values of type ['a]. *)
293 | 
294 | val codec_with_default : 'a -> 'a codec -> 'a codec
295 | (** [codec_with_default v c] is [c] with default value [c]. *)
296 | 
297 | val codec_with_codec :
298 |   ?eq:('a -> 'a -> bool) -> ?default:'a -> 'a codec -> 'a codec
299 | (** [codec_with_codec ~eq ~default c] is [c] with equality [eq] (unchanged
300 |     if unspecified) and default value [c] *)
301 | 
302 | (** {2:codec_values Encoding and decoding values} *)
303 | 
304 | type 'a c
305 | 
306 | val encode_value : 'a c -> encoder -> 'a option -> unit
307 | 
308 | (*
309 | val decode_value  : 'a codec -> decoder -> ('a option, error * range) result
310 | val encode_values : 'a codec -> encoder -> 'a list -> unit
311 | val decode_values : 'a codec -> encoder -> ('a list, error * range) result
312 | *)
313 | 
314 | val encode_traced_value :
315 |   'a codec -> encoder -> ('a * 'a) option * trace -> unit
316 | 
317 | val decode_traced_value :
318 |   'a codec -> decoder -> ('a option * trace, error * range) result
319 | 
320 | (** {2:std_type_codecs OCaml standard library type codecs} *)
321 | 
322 | val unit : unit codec
323 | val bool : bool codec
324 | val int : int codec
325 | val float : float codec
326 | val float_fmt : (float -> string, unit, string) format -> float codec
327 | val float_hex : float codec
328 | val string : string codec
329 | val result : ok:'a codec -> error:'b codec -> ('a, 'b) result codec
330 | val t2 : 'a codec -> 'b codec -> ('a * 'b) codec
331 | val list : 'a codec -> 'a list codec
332 | val set :
333 |   (module Set.S with type elt = 'a and type t = 'b) -> 'a codec -> 'b codec
334 | 
335 | (** {1:syntax S-expression syntax}
336 | 
337 |     We define the syntax of s-expressions over a sequence of
338 |    {{:http://unicode.org/glossary/#unicode_scalar_value}Unicode
339 |    characters} in which all US-ASCII control characters except
340 |    {{!whitespace}whitespace} are forbidden in unescaped form.
341 | 
342 |     {2:sexp S-expressions and sequences thereof}
343 | 
344 |     A {e s-expression} is either an {{!atoms}{e atom}} or a
345 |     {{!lists}{e list}} of s-expressions interspaced with
346 |     {{!whitespace}{e whitespace}} and {{!comments}{e comments}}. A {e
347 |     sequence of s-expressions} is simply a succession of s-expressions
348 |     interspaced with whitespace and comments.
349 | 
350 |     These elements are informally described below and finally made
351 |     precise via an ABNF {{!grammar}grammar}.
352 | 
353 |     {2:whitespace Whitespace}
354 | 
355 |     Whitespace is a sequence of whitespace characters, namely, space
356 |     (U+0020), tab (U+0009), line feed (U+000A) or carriage return
357 |     (U+000D).
358 | 
359 |     {2:comments Comments}
360 | 
361 |     Unless escaped or inside quoted tokens (see below) anything that
362 |     follows a semicolon (U+003B) is ignored until the next {e end of line},
363 |     that is either a line feed
364 |     (U+000A), a carriage return (U+000D) or a carriage return and a line
365 |     feed (<U+000D,U+000A>).
366 | {v
367 | (this is not a comment) ; This is a comment
368 | (this is not a comment)
369 | v}
370 | 
371 |     {2:atoms Atoms}
372 | 
373 |     An atom represents ground data as a string of Unicode characters.
374 |     It can, via escapes, represent any sequence of Unicode characters,
375 |     including control characters and U+0000. It cannot represent an
376 |     arbitrary byte sequence except via a client-defined encoding
377 |     convention (e.g. Base64, OCaml byte escape codes, etc.).
378 | 
379 |     An atom is specified either via a token or a quoted token.  Quoted
380 |     tokens can represent atoms with spaces, parentheses and semicolons
381 |     without having to escape them.
382 | {v
383 | abc        ; a token for the atom "abc"
384 | abc\;\ \(d ; a token for the atom "abc; (d"
385 | "abc"      ; a quoted token for the atom "abc"
386 | "abc; (d"  ; a quoted token for the atom "abc; (d"
387 | ""         ; the quoted token for the atom ""
388 | v}
389 | 
390 |     A token and its quoted form represent the same atom. All atoms
391 |     except the empty atom [""] can be represented in both forms.
392 | 
393 |     Escapes are introduced by a backslash. Double quotes and
394 |     backslashes have to be escaped everywhere except in comments.
395 | {v
396 | "\u\{1F42B\}\n\"\\" ; atom with U+1F42B, a new line, a quote and a backslash
397 | \u\{1F42B\}\n\"\\   ; idem
398 | v}
399 |     The following escape sequences are recognized:
400 |     {ul
401 |     {- ["\""] (<U+005C,U+0022>) for double quote (U+0022) (mandatory)}
402 |     {- ["\\"] (<U+005C,U+005C>) for backslash (U+005C) (mandatory)}
403 |     {- ["\("] (<U+005C,U+0028>) for left parenthesis (U+0028)}
404 |     {- ["\)"] (<U+005C,U+0029>) for right parenthesis (U+0029)}
405 |     {- ["\;"] (<U+005C,U+003B>) for semicolon (U+003B)}
406 |     {- ["\ "] (<U+005C,U+0020>) for space (U+0020)}
407 |     {- ["\t"] (<U+005C,U+0074>) for tab (U+0009)}
408 |     {- ["\n"] (<U+005C,U+006E>) for line feed (U+000A)}
409 |     {- ["\r"] (<U+005C,U+0072>) for carriage return (U+000D)}
410 |     {- ["\u{X}"] with [X] is from 1 to at most 6 upper or lower case
411 |        hexadecimal digits standing for the corresponding
412 |        {{:http://unicode.org/glossary/#unicode_scalar_value}Unicode character}
413 |        U+X.}
414 |     {- Any other character except line feed (U+000A) or carriage return
415 |        (U+000D), following
416 |        a backslash is an illegal sequence of characters.}}
417 | 
418 |     Tokens and quoted tokens can be split across lines by using a backslash
419 |     (U+005C) followed by a line feed (<U+000A>), a carriage return
420 |     (U+000D) or a carriage return and a line feed
421 |     (<U+000D,U+000A>). Any space (U+0020) or tab (U+0009) character
422 |     at the beginning of the new line is ignored.
423 | {v
424 | "\
425 |   a\
426 |   \ " ; the atom "a "
427 |   a\
428 |   \   ; the atom "a "
429 | v}
430 | 
431 |     {2:lists Lists}
432 | 
433 |     Lists are delimited by left and right parentheses (U+0028, U+0029)
434 |     and their elements are s-expressions separated by optional
435 |     whitespace and comments. For example:
436 | {v
437 | (a list (of four) expressions)
438 | (a list(of four)expressions)
439 | ("a"list("of"four)expressions)
440 | (a list (of ; This is a comment
441 | four) expressions)
442 | () ; the empty list
443 | v}
444 | 
445 |     {2:grammar S-expression grammar}
446 | 
447 |     The following {{:https://tools.ietf.org/html/rfc5234}RFC 5234}
448 |     ABNF grammar is defined on a sequence of
449 |     {{:http://unicode.org/glossary/#unicode_scalar_value}Unicode characters}.
450 | {v
451 |  sexp-seq = *(ws / comment / sexp)
452 |      sexp = atom / list
453 |      list = %x0028 sexp-seq %x0029
454 |      atom = token / qtoken
455 |     token = (t-char / escape) *(t-char / escape / cont)
456 |    qtoken = %x0022 *(q-char / escape / cont) %x0022
457 |    escape = %x005C (%x0020 / %x0022 / %x0028 / %x0029 / 0x003B / %x005C /
458 |                     %x006E / %x0072 / %x0074 / %x0075 %x007B unum %x007D)
459 |      unum = 1*6(HEXDIG)
460 |      cont = %x005C nl
461 |        ws = *(ws-char)
462 |   comment = %x003B *(c-char) nl
463 |        nl = %x000A / %x000D / %x000D %x000A
464 |    t-char = %x0021 / %x0023-0027 / %x002A-%x003A / %x003C-005B /
465 |             %x005D-D7FF / %xE000-10FFFF
466 |    q-char = t-char / ws-char / %x0028 / %x0029 / %x003B
467 |   ws-char = %x0020 / %x0009 / %x000A / %x000D
468 |    c-char = %x0009 / %x0020-D7FF / %xE000-10FFFF
469 | v}
470 |     A few additional constraints not expressed by the grammar:
471 |     {ul
472 |     {- If an initial {{:http://unicode.org/glossary/#byte_order_mark}BOM
473 |        character} (U+FEFF) is present it is discarded.}
474 |     {- On parsing [token] and [qtoken], any [%x0020] and [%x0009] following
475 |        a [cont] is discarded and not part of the parsed atom.}
476 |     {- [unum] once interpreted as an hexadecimal number must be a
477 |        {{:http://unicode.org/glossary/#unicode_scalar_value}Unicode scalar
478 |        value.}}
479 |     {- A comment can be ended by the end of the character sequence.}} *)
480 | 
481 | (** {1:error_recovery Error recovery}
482 | 
483 |     After a decoding error, if best-effort decoding is performed, the
484 |     following happens before continuing:
485 |     {ul
486 |     {- [`Illegal_bytes _], [`Illegal_escape _], [`Illegal_uchar _], a Unicode
487 |        replacement character (U+FFFD) is substituted to the illegal byte
488 |        sequence, bogus escape or Unicode character}
489 |     {- [`Unexpected_le], the right parenthesis is skipped.}
490 |     {- [`Unclosed `Escape], the escape is skipped and the atom returned.}
491 |     {- [`Unclosed `Quoted_token], the token is closed and the atom returned.}
492 |     {- [`Unclosed `List], a [Le] token is returned.}}
493 |     Due to the U+FFFD insertions layout cannot be assumed to be
494 |     preserved when error recovery is performed, however outputing an
495 |     erroring stream of lexemes results in a valid sequence of s-expressions. *)
496 | 
497 | (** {1:tips_lims Tips and limitations}
498 | 
499 | {ul
500 | {- Usage of decoding and encoding functions of the various
501 |    representations on decoders and encoders is not mutually exclusive.
502 |    However if you are for example using the lexeme decoder you need
503 |    to make sure you are at the beginning of a sexp before invoking
504 |    the higher-level decoders.}
505 | {- Concrete syntax layout preservation will in general be slower.}} *)
506 | 
507 | (** {1:examples Examples}
508 | 
509 |     {2:custom_example Custom representation}
510 | 
511 |     The following codecs a sequence of s-expressions to a custom
512 |     definition of s-expressions. Note that for illustration purposes
513 |     this representation is exactly the tagged s-expression
514 |     representation of the {{!tagged_example}next example}.
515 | {[
516 | type sexp = [ `Atom of string | `List of sexp list ] * Sexpm.range
517 | 
518 | let of_string : string -> (sexp list, Sexpm.error * Sexpm.range) result =
519 | fun s ->
520 |   let d = Sexpm.(decoder @@ input_of_string s) in
521 |   let atom d a = `Atom a, Sexpm.decoded_sexp_range d in
522 |   let list d l = `List l, Sexpm.decoded_sexp_range d in
523 |   let rec loop acc = match Sexpm.decode_custom ~atom ~list d with
524 |   | Ok Some v -> loop (v :: acc)
525 |   | Ok None -> Ok (List.rev acc)
526 |   | Error _ as e -> e
527 |   in
528 |   loop []
529 | 
530 | let to_string : ?style:Sexpm.encoder_style -> sexp list -> string
531 | fun ses ->
532 |   let unfold = fst in
533 |   let b = Buffer.create 256 in
534 |   let d = Sexpm.(encoder ?style @@ output_to_buffer b) in
535 |   let rec loop = function
536 |   | se :: ses -> Sexpm.encode_custom unfold d (Some se); loop ses
537 |   | [] -> Sexpm.encode_custom d None; Buffer.contents b
538 |   in
539 |   loop ses
540 | ]}
541 | 
542 |    {2:tagged_example Tagged representation}
543 | 
544 | {[
545 | type sexp = Sexpm.range Sexpm.t
546 | 
547 | let of_string : string -> (sexp list, Sexpm.error * Sexpm.range) result =
548 | fun s ->
549 |   let d = Sexpm.(decoder @@ input_of_string s) in
550 |   let rec loop acc = match Sexpm.decode_tagged ~tag:Sexpm.range_tag d with
551 |   | Ok Some v -> loop (v :: acc)
552 |   | Ok None -> Ok (List.rev acc)
553 |   | Error _ as e -> e
554 |   in
555 |   loop []
556 | 
557 | let to_string : ?style:Sexpm.encoder_style -> sexp list -> string
558 | fun ses ->
559 |   let b = Buffer.create 256 in
560 |   let d = Sexpm.(encoder ?style @@ output_to_buffer b) in
561 |   let rec loop = function
562 |   | se :: ses -> Sexpm.encode_tagged d (Some se); loop ses
563 |   | [] -> Sexpm.encode_tagged d None; Buffer.contents b
564 |   in
565 |   loop ss
566 | ]}
567 | *)
568 | 
569 | (*---------------------------------------------------------------------------
570 |    Copyright (c) 2016 Daniel C. Bünzli
571 | 
572 |    Permission to use, copy, modify, and/or distribute this software for any
573 |    purpose with or without fee is hereby granted, provided that the above
574 |    copyright notice and this permission notice appear in all copies.
575 | 
576 |    THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
577 |    WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
578 |    MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
579 |    ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
580 |    WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
581 |    ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
582 |    OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
583 |   ---------------------------------------------------------------------------*)
584 | 


--------------------------------------------------------------------------------
/src/sexpm.mllib:
--------------------------------------------------------------------------------
1 | Sexpm
2 | 


--------------------------------------------------------------------------------
/test/sexptrip.ml:
--------------------------------------------------------------------------------
  1 | (*---------------------------------------------------------------------------
  2 |    Copyright (c) 2017 Daniel C. Bünzli. All rights reserved.
  3 |    Distributed under the ISC license, see terms at the end of the file.
  4 |    %%NAME%% %%VERSION%%
  5 |   ---------------------------------------------------------------------------*)
  6 | 
  7 | open Bos_setup
  8 | 
  9 | (* Input *)
 10 | 
 11 | type input =
 12 |   { buf_size : int;
 13 |     inf : Fpath.t;
 14 |     input_string : bool;
 15 |     layout : bool; }
 16 | 
 17 | let with_input_decoder i f v =
 18 |   let wrap input () =
 19 |     let dec = Sexpm.decoder ~layout:i.layout input in
 20 |     f i dec v
 21 |   in
 22 |   match i.input_string with
 23 |   | true ->
 24 |       OS.File.read i.inf
 25 |       >>= fun s -> Ok (wrap (Sexpm.input_of_string s) ())
 26 |   | false ->
 27 |       let bytes = Bytes.create i.buf_size in
 28 |       OS.File.with_input ~bytes i.inf wrap ()
 29 | 
 30 | (* Output *)
 31 | 
 32 | type output =
 33 |   { buf_size : int;
 34 |     outf : Fpath.t;
 35 |     output_string : bool;
 36 |     style : Sexpm.encoder_style;
 37 |     lex : bool; }
 38 | 
 39 | let with_output_encoder o f v =
 40 |   let wrap output () =
 41 |     let bytes = Bytes.create o.buf_size in
 42 |     let enc = Sexpm.encoder ~bytes ~style:o.style output in
 43 |     let r = f o enc v in
 44 |     r
 45 |   in
 46 |   match o.output_string with
 47 |   | true ->
 48 |       let buf = Buffer.create o.buf_size in
 49 |       let r = wrap (Sexpm.output_of_buffer buf) () in
 50 |       OS.File.write o.outf (Buffer.contents buf) >>| fun () -> r
 51 |   | false ->
 52 |       OS.File.with_output o.outf wrap ()
 53 | 
 54 | let with_output_formatter o f v = match o.output_string with
 55 | | true ->
 56 |     let buf = Buffer.create o.buf_size in
 57 |     let ppf = Format.formatter_of_buffer buf in
 58 |     let r = f o ppf v in
 59 |     (Format.pp_print_flush ppf (); OS.File.write o.outf (Buffer.contents buf))
 60 |     >>= fun () -> Ok (Ok r)
 61 | | false ->
 62 |     let wrap oc () =
 63 |       let ppf = Format.formatter_of_out_channel oc in
 64 |       let r = f o ppf v in
 65 |       (Format.pp_print_flush ppf (); Ok r)
 66 |     in
 67 |     OS.File.with_oc o.outf wrap ()
 68 | 
 69 | (* Pretty printers *)
 70 | 
 71 | let pp_decode i dec ppf = function
 72 | | Ok None ->
 73 |     Format.fprintf ppf "%a:%a: EOI@,"
 74 |       Fpath.pp i.inf Sexpm.pp_range (Sexpm.decoded_range dec)
 75 | | Ok (Some l) ->
 76 |     Format.fprintf ppf "%a:%a: @[%a@]@,"
 77 |       Fpath.pp i.inf Sexpm.pp_range (Sexpm.decoded_range dec)
 78 |       Sexpm.pp_lexeme l
 79 | | Error (e, r) ->
 80 |     Format.fprintf ppf "%a:%a: @[%a@]@,"
 81 |       Fpath.pp i.inf Sexpm.pp_range (Sexpm.decoded_range dec)
 82 |       Sexpm.pp_error e
 83 | 
 84 | (* Decode *)
 85 | 
 86 | let decode err_ppf i =
 87 |   let decode i dec () =
 88 |     let rec loop () = match Sexpm.decode_lexeme dec with
 89 |     | Ok None -> ()
 90 |     | Ok (Some l) -> loop ()
 91 |     | Error _ as err -> pp_decode i dec err_ppf err; loop ()
 92 |     in
 93 |     Format.(fprintf err_ppf "@[<v>"; loop (); fprintf err_ppf "@]@?")
 94 |   in
 95 |   with_input_decoder i decode ()
 96 | 
 97 | (* Lex *)
 98 | 
 99 | let lex i o =
100 |   let lex i dec o ppf () =
101 |     let rec loop () = match Sexpm.decode_lexeme dec with
102 |     | Ok None as d -> pp_decode i dec ppf d
103 |     | d -> pp_decode i dec ppf d; loop ()
104 |     in
105 |     Format.(fprintf ppf "@[<v>"; loop (); fprintf ppf "@]@?")
106 |   in
107 |   R.join @@ with_input_decoder i
108 |     (fun i dec () -> R.join @@ with_output_formatter o (lex i dec) ()) ()
109 | 
110 | (* Trip *)
111 | 
112 | let trip err_ppf i o =
113 |   let trip i dec o enc () =
114 |     let rec loop () = match Sexpm.decode_lexeme dec with
115 |     | Ok None -> Sexpm.encode_lexeme enc None
116 |     | Ok d -> Sexpm.encode_lexeme enc d; loop ()
117 |     | Error _ as err -> pp_decode i dec err_ppf err; loop ()
118 |     in
119 |     Format.(fprintf err_ppf "@[<v>"; loop (); fprintf err_ppf "@]@?"; Ok ())
120 |   in
121 |   R.join @@ with_input_decoder i
122 |     (fun i dec () -> R.join @@ with_output_encoder o (trip i dec) ()) ()
123 | 
124 | 
125 | (* Random encode *)
126 | 
127 | type rand =
128 |   { seed : int;
129 |     count : int; (* number of sexps to generate. *)
130 |     maxd : int; (* max depth of lists. *)
131 |     maxl : int; (* max length of lists. *)
132 |     maxa : int; (* max length of atoms. *) }
133 | 
134 | let r_ascii_lowercase_letter () =
135 |   Uchar.unsafe_of_int (0x0061 (* a *) + Random.int 26)
136 | 
137 | let r_general_scripts_minus_ctrl () =
138 |   Uchar.unsafe_of_int (0x0020 + Random.int 0x1FE0) (* < U+2000 *)
139 | 
140 | let r_white encode r buf =
141 |   Buffer.clear buf;
142 |   for i = 0 to Random.int 3 do match Random.int 100 with
143 |   | n when n < 90 -> Buffer.add_char buf ' '
144 |   | n when n < 94 -> Buffer.add_char buf '\t'
145 |   | n when n < 98 -> Buffer.add_char buf '\n'
146 |   | n when n < 100 -> Buffer.add_char buf '\r'
147 |   | n -> assert false
148 |   done;
149 |   encode (Sexpm.White (Buffer.contents buf))
150 | 
151 | let r_comment encode r buf =
152 |   Buffer.clear buf;
153 |   for i = 0 to Random.int 15 do
154 |     Uutf.Buffer.add_utf_8 buf (r_general_scripts_minus_ctrl ())
155 |   done;
156 |   encode (Sexpm.Comment (Buffer.contents buf))
157 | 
158 | let r_atom encode r buf =
159 |   Buffer.clear buf;
160 |   for i = 0 to Random.int (r.maxa + 1)
161 |   do Uutf.Buffer.add_utf_8 buf (r_ascii_lowercase_letter ()) done;
162 |   encode (Sexpm.Atom (Buffer.contents buf, None))
163 | 
164 | let rec r_list encode r buf depth count =
165 |   let depth = Random.int (depth + 1) in
166 |   encode (Sexpm.Ls);
167 |   let count = r_sexp_seq encode r buf depth (count - 1) in
168 |   encode (Sexpm.Le);
169 |   count
170 | 
171 | and r_sexp_seq encode r buf depth count =
172 |   let rec loop len count = match len = 0 || count = 0 with
173 |   | true -> count
174 |   | false ->
175 |       begin match Random.int 100 with
176 |       | n when n < 80 -> ()
177 |       | n when n < 90 -> r_white encode r buf
178 |       | n when n < 100 -> r_comment encode r buf
179 |       | n -> assert false
180 |       end;
181 |       match depth = 0 || Random.bool () with
182 |       | true -> r_atom encode r buf; loop (len - 1) (count - 1)
183 |       | false ->
184 |           let count = r_list encode r buf (depth - 1) count in
185 |           loop (len - 1) count
186 |   in
187 |   loop (Random.int (r.maxl + 1)) count
188 | 
189 | let random o r =
190 |   let random o e () =
191 |     let buf = Buffer.create 255 in
192 |     let encode l = Sexpm.encode_lexeme e (Some l) in
193 |     let rec loop = function
194 |     | 0 -> Sexpm.encode_lexeme e None; Ok ()
195 |     | n -> loop (r_sexp_seq encode r buf r.maxd n)
196 |     in
197 |     loop r.count
198 |   in
199 |   R.join @@ with_output_encoder o random ()
200 | 
201 | (* Main command *)
202 | 
203 | let do_cmd (i, o) map rand = function
204 | | `Decode -> decode Format.err_formatter i
205 | | `Random -> random o rand
206 | | `Trip when o.lex -> lex i o
207 | | `Trip -> trip Format.err_formatter i o
208 | 
209 | (* Command line interface *)
210 | 
211 | open Cmdliner
212 | 
213 | let nat_arg =
214 |   let kind = "positive integer "in
215 |   let parse s = match Arg.parser_of_kind_of_string ~kind String.to_int s with
216 |   | Ok n when n > 0 -> `Ok n
217 |   | Ok n -> `Error (strf "%d is not positive" n)
218 |   | Error (`Msg e) -> `Error e
219 |   in
220 |   Arg.pconv (parse, Format.pp_print_int) (* FIXME use Arg.conv *)
221 | 
222 | let fpath_arg =
223 |   let parse s = match Fpath.of_string s with
224 |   | Ok v -> `Ok v
225 |   | Error (`Msg e) -> `Error e
226 |   in
227 |   Arg.pconv (parse, Fpath.pp) (* FIXME use Arg.conf *)
228 | 
229 | let buf_size =
230 |   let doc = "Size of internal buffers." in
231 |   Arg.(value & opt nat_arg 65536 & info ["buffer-size"] ~doc ~docv:"INT")
232 | 
233 | (* Input *)
234 | 
235 | let inf =
236 |   let doc = "The input file. Reads from standard input if unspecified." in
237 |   Arg.(value & pos 0 fpath_arg OS.File.dash & info [] ~doc ~docv:"FILE")
238 | 
239 | let input_section = "INPUT OPTIONS"
240 | let docs = input_section
241 | 
242 | let input_string =
243 |   let doc = "Input everything in a string and decode the string." in
244 |   Arg.(value & flag & info ["input-string"] ~doc ~docs)
245 | 
246 | let no_layout =
247 |   let doc = "Do not preserve concrete syntax layout on decode." in
248 |   Arg.(value & flag & info ["no-layout"] ~doc ~docs)
249 | 
250 | let input =
251 |   let input buf_size inf input_string no_layout =
252 |     { buf_size; inf; input_string; layout = not no_layout }
253 |   in
254 |   Term.(const input $ buf_size $ inf $ input_string $ no_layout)
255 | 
256 | (* Output options. *)
257 | 
258 | let output_section = "OUTPUT OPTIONS"
259 | let docs = output_section
260 | 
261 | let lex =
262 |   let doc = "Output the decoded s-expression lexemes or errors, one per line
263 |              with their position."
264 |   in
265 |   Arg.(value & flag & info ["lex"] ~docs ~doc)
266 | 
267 | let style =
268 |   let doc = "Minify output." in
269 |   let minify = `Minify, Arg.info ["m"; "minify"] ~docs ~doc in
270 |   let doc = "Pretty-print output." in
271 |   let pp = `Pp, Arg.info ["p"; "pretty-print"] ~docs ~doc in
272 |   Arg.(value & vflag `Raw [minify; pp])
273 | 
274 | let output_string =
275 |   let doc = "Encode everything in a string and output the string." in
276 |   Arg.(value & flag & info ["output-string"] ~doc ~docs)
277 | 
278 | let outf =
279 |   let doc = "Output result to $(docv)." in
280 |   Arg.(value & opt (some fpath_arg) None & info ["o"] ~docs ~doc ~docv:"FILE")
281 | 
282 | let in_place =
283 |   let doc = "Atomically write output, on success, to the file specified as
284 |              input."
285 |   in
286 |   Arg.(value & flag & info ["i"; "in-place"] ~docs ~doc)
287 | 
288 | let io =
289 |   let err_excl o0 o1 =
290 |     R.error_msgf "option %s and %s are mutually exclusive" o0 o1
291 |   in
292 |   let io input lex output_string style outf in_place =
293 |     if in_place && lex then err_excl "--in-place" "--lex" else
294 |     let buf_size = (input : input).buf_size in
295 |     let outf = match outf with
296 |     | None when in_place ->
297 |         if Fpath.equal input.inf OS.File.dash
298 |         then R.error_msg ("option --in-place cannot be used with stdin")
299 |         else Ok input.inf
300 |     | Some _ when in_place -> err_excl "--in-place" "-o"
301 |     | None -> Ok (OS.File.dash)
302 |     | Some f -> Ok f
303 |     in
304 |     outf >>| fun outf ->
305 |     (input, { buf_size; lex; output_string; style; outf })
306 |   in
307 |   Term.(cli_parse_result
308 |          (const io $ input $ lex $ output_string $ style $ outf $ in_place))
309 | 
310 | (* Map lookup and update. *)
311 | 
312 | let map_section = "S-EXPRESSION MAP LOOKUP AND UPDATE OPTIONS"
313 | let docs = map_section
314 | let docv = "KEY"
315 | 
316 | type map =
317 |   { allow_undef : bool;
318 |     exists : string option;
319 |     delete : string option;
320 |     get : string option;
321 |     set : string option;
322 |     value : string option; }
323 | 
324 | let allow_undef =
325 |   let doc = "For $(b,--get), $(b,--set) and $(b,--delete), do not error if
326 |              the key $(i,KEY) is undefined." in
327 |   Arg.(value & flag & info ["u"; "allow-undefined"] ~docs ~doc)
328 | 
329 | let exists =
330 |   let doc = "Check for the existence of key $(docv). See section EXIT CODE." in
331 |   Arg.(value & opt (some string) None & info ["e"; "exists"] ~docs ~doc ~docv)
332 | 
333 | let delete =
334 |   let doc = "Output the input without the key $(docv) and its value." in
335 |   Arg.(value & opt (some string) None & info ["d"; "delete"] ~docs ~doc ~docv)
336 | 
337 | let get =
338 |   let doc = "Output the value of key $(docv) of the input." in
339 |   Arg.(value & opt (some string) None & info ["g"; "get"] ~docs ~doc ~docv)
340 | 
341 | let set =
342 |   let doc = "Output the input with the key $(docv) set to the value specified
343 |              by $(b,--value)."
344 |   in
345 |   Arg.(value & opt (some string) None & info ["s"; "set"] ~docs ~doc ~docv)
346 | 
347 | let value =
348 |   let doc = "s-expression $(docv) bound to the $(i,KEY) of $(b,--set).
349 |              If unspecified a s-expression is read from standard input."
350 |   in
351 |   let docv = "SEXP" in
352 |   Arg.(value & opt (some string) None & info ["v"; "value"] ~docs ~doc ~docv)
353 | 
354 | let map =
355 |   let map allow_undef exists delete get set value =
356 |     { allow_undef; exists; delete; get; set; value; }
357 |   in
358 |   Term.(const map $ allow_undef $ exists $ delete $ get $ set $ value)
359 | 
360 | (* Random generation parameters. *)
361 | 
362 | let rand_section = "RANDOM S-EXPRESSIONS OPTIONS"
363 | let docs = rand_section
364 | 
365 | let rseed =
366 |   let doc = "Random seed." in
367 |   Arg.(value & opt (some nat_arg) None & info ["rseed"] ~docs ~doc ~docv:"INT")
368 | 
369 | let rcount =
370 |   let doc = "Number of random s-expressions (atom or list) generated." in
371 |   let c = 100 in
372 |   Arg.(value & opt int c & info ["rcount"] ~docs ~doc ~docv:"INT")
373 | 
374 | let maxd =
375 |   let doc = "Maximal depth of random s-expressions lists." in
376 |   let d = 5 in
377 |   Arg.(value & opt int d & info ["max-depth"] ~docs ~doc ~docv:"INT")
378 | 
379 | let maxl =
380 |   let doc = "Maximal length of random s-expression lists." in
381 |   let l = 20 in
382 |   Arg.(value & opt int l & info ["max-list-len"] ~docs ~doc ~docv:"INT")
383 | 
384 | let maxa =
385 |   let doc = "Maximal length of random s-expression atoms." in
386 |   let l = 20 in
387 |   Arg.(value & opt int l & info ["max-atom-len"] ~docs ~doc ~docv:"INT")
388 | 
389 | let rand =
390 |   let rand seed count maxd maxl maxa =
391 |     let seed = match seed with
392 |     | None -> Random.self_init (); Random.int (1 lsl 30 - 1)
393 |     | Some s -> s
394 |     in
395 |     { seed; count; maxd; maxl; maxa }
396 |   in
397 |   Term.(const rand $ rseed $ rcount $ maxd $ maxl $ maxa)
398 | 
399 | (* Command *)
400 | 
401 | let cmd =
402 |   let doc = "Decode only, no encoding." in
403 |   let dec = `Decode, Arg.info ["decode"] ~doc in
404 |   let doc =
405 |       strf "Output random s-expressions, no decoding. See section %s."
406 |         rand_section
407 |   in
408 |   let rand = `Random, Arg.info ["random"] ~doc in
409 |   Arg.(value & vflag `Trip [dec; rand])
410 | 
411 | let doc = "operate on sequences of s-expressions"
412 | let exits =
413 |   (Term.exit_info 1 ~doc:"the input was malformed") ::
414 |   (Term.exit_info 2 ~doc:"the key does not exist, see the s-expression \
415 |                           map options.") ::
416 |   Term.default_exits
417 | 
418 | let man =
419 |   [ `S "DESCRIPTION";
420 |     `P "$(mname) is a command line tool to operate on sequences of
421 |         s-expressions.";
422 |     `P "Use '$(mname)' to decode and recode s-expressions."; `Noblank;
423 |     `P "Use '$(mname) --get KEY' to get a value in a s-expression."; `Noblank;
424 |     `P "Use '$(mname) --set KEY' to set a value in a s-expression."; `Noblank;
425 |     `P "Use '$(mname) --random' to generate random s-expressions.";
426 |     `S Manpage.s_options;
427 |     `S input_section;
428 |     `P "These options control the input process.";
429 |     `S output_section;
430 |     `P "These options control the output process.";
431 |     `S map_section;
432 |     `P "These options query or modify the input by interpreting
433 |       s-expressions as key-value maps.";
434 |     `S rand_section;
435 |     `P "These options control the s-expressions randomly generated by
436 |       the $(b,--random) option.";
437 |     `S "S-EXPRESSION SYNTAX";
438 |     `P "A more gentle introduction to s-expression is given
439 |         in the API documentation available at:";
440 |     `Pre "  %%PKG_DOC%%/Sexpm.html#syntax or $(b,odig doc sexpm)";
441 |     `P "The following is the parsed syntax of s-expressions expressed
442 |         as a RFC 5234 ABNF grammar:";
443 |     `Pre "\
444 |  sexp-seq = *(ws / comment / sexp)
445 |      sexp = atom / list
446 |      list = %x0028 sexp-seq %x0029
447 |      atom = token / qtoken
448 |     token = (t-char / escape) *(t-char / escape / cont)
449 |    qtoken = %x0022 *(q-char / escape / cont) %x0022
450 |    escape = %x005C (%x0020 / %x0022 / %x0028 / %x0029 / 0x003B / %x005C /
451 |                     %x006E / %x0072 / %x0074 / %x0075 %x007B unum %x007D)
452 |      unum = 1*6(HEXDIG)
453 |      cont = %x005C nl
454 |        ws = *(ws-char)
455 |   comment = %x003B *(c-char) nl
456 |        nl = %x000A / %x000D / %x000D %x000A
457 |    t-char = %x0021 / %x0023-0027 / %x002A-%x003A / %x003C-005B /
458 |             %x005D-D7FF / %xE000-10FFFF
459 |    q-char = t-char / ws-char / %x0028 / %x0029 / %x003B
460 |   ws-char = %x0020 / %x0009 / %x000A / %x000D
461 |    c-char = %x0009 / %x0020-D7FF / %xE000-10FFFF
462 | ";
463 |     `S "BUGS";
464 |     `P "This program is distributed with the sexpm OCaml library.
465 |         See %%HOMEPAGE%% for contact information." ]
466 | 
467 | let cmd =
468 |   Term.(term_result (const do_cmd $ io $ map $ rand $ cmd)),
469 |   Term.info "sexptrip" ~version:"%%VERSION%%" ~doc ~exits ~man
470 | 
471 | let () = Term.(exit @@ eval cmd)
472 | 
473 | (*---------------------------------------------------------------------------
474 |    Copyright (c) 2017 Daniel C. Bünzli
475 | 
476 |    Permission to use, copy, modify, and/or distribute this software for any
477 |    purpose with or without fee is hereby granted, provided that the above
478 |    copyright notice and this permission notice appear in all copies.
479 | 
480 |    THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
481 |    WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
482 |    MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
483 |    ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
484 |    WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
485 |    ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
486 |    OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
487 |   ---------------------------------------------------------------------------*)
488 | 


--------------------------------------------------------------------------------
/test/test.ml:
--------------------------------------------------------------------------------
  1 | (*---------------------------------------------------------------------------
  2 |    Copyright (c) 2016 Daniel C. Bünzli. All rights reserved.
  3 |    Distributed under the ISC license, see terms at the end of the file.
  4 |    %%NAME%% %%VERSION%%
  5 |   ---------------------------------------------------------------------------*)
  6 | 
  7 | let strf = Format.asprintf
  8 | let log f = Format.printf (f ^^ "@?")
  9 | let fail fmt =
 10 |   let fail _ = failwith (Format.flush_str_formatter ()) in
 11 |   Format.kfprintf fail Format.str_formatter fmt
 12 | 
 13 | let pp_range ppf ((sc, sl), (ec, el)) =
 14 |   Format.fprintf ppf "(%d,%d),(%d,%d)" sc sl ec el
 15 | 
 16 | let pp_decode ppf = function
 17 | | Ok None -> Format.fprintf ppf "Ok None"
 18 | | Ok Some l -> Format.fprintf ppf "Ok (Some (%a))" Sexpm.pp_lexeme l
 19 | | Error (e, r) ->
 20 |     Format.fprintf ppf "Error (%a,(%a))" Sexpm.pp_error e pp_range r
 21 | 
 22 | let test_decode fnd exp = match fnd, exp with
 23 | | Ok _, Ok _
 24 | | Error _, Ok _
 25 | | Ok _, Error _ when fnd <> exp ->
 26 |     fail "found: %a expected: %a" pp_decode fnd pp_decode exp
 27 | | Error (e0, _), Error (e1, _) when e0 <> e1 ->
 28 |     fail "found: %a expected: %a" pp_decode fnd pp_decode exp
 29 | | _, _ -> ()
 30 | 
 31 | let test_dseq ?layout src seq =
 32 |   let d = Sexpm.(decoder ?layout (input_of_string src)) in
 33 |   let rec loop d = function
 34 |   | l :: ls -> test_decode (Sexpm.decode_lexeme d) l; loop d ls
 35 |   | [] ->
 36 |       match Sexpm.decode_lexeme d with
 37 |       | Ok None -> ()
 38 |       | d -> fail "decoder not at the end of stream (%a)" pp_decode d
 39 |   in
 40 |   loop d seq
 41 | 
 42 | let test_eseq ?nl ?quote ?style seq result =
 43 |   let b = Buffer.create 255 in
 44 |   let o = Sexpm.output_of_buffer b in
 45 |   let e = Sexpm.(encoder ?nl ?quote ?style o) in
 46 |   let rec loop = function
 47 |   | (Ok l) :: ls -> Sexpm.encode_lexeme e l; loop ls
 48 |   | (Error _) :: ls -> assert false
 49 |   | [] ->
 50 |       Sexpm.encode_lexeme e None;
 51 |       let r = Buffer.contents b in
 52 |       if r <> result then fail "encoded : %S\n\
 53 |                                 expected: %S" r result
 54 |   in
 55 |   loop seq
 56 | 
 57 | let test_trip src =
 58 |   let b = Buffer.create 255 in
 59 |   let o = Sexpm.output_of_buffer b in
 60 |   let e = Sexpm.(encoder ~style:`Raw o ) in
 61 |   let d = Sexpm.(decoder ~layout:true (input_of_string src)) in
 62 |   let rec loop () = match Sexpm.decode_lexeme d with
 63 |   | Ok (Some _ as dec) -> Sexpm.encode_lexeme e dec; loop ()
 64 |   | Error _ -> assert false
 65 |   | Ok None ->
 66 |       Sexpm.encode_lexeme e None;
 67 |       let r = Buffer.contents b in
 68 |       if r <> src then fail "src : %S\n\
 69 |                              trip: %S" src r
 70 |   in
 71 |   loop ()
 72 | 
 73 | let ok v = Ok (Some v)
 74 | let w w = ok (Sexpm.White w)
 75 | let c c = ok (Sexpm.Comment c)
 76 | let a a r = ok (Sexpm.Atom (a, r))
 77 | let ls = ok Sexpm.Ls
 78 | let le = ok Sexpm.Le
 79 | let e err = Error (err, ((0,0),(0,0)))
 80 | 
 81 | let decode_whites () =
 82 |   log "Decoder whites.\n";
 83 |   let test_white s =
 84 |     test_dseq ~layout:false s [];
 85 |     test_dseq ~layout:true s [ w s ];
 86 |     test_dseq ~layout:false (strf "%sbu" s) [a "bu" None];
 87 |     test_dseq ~layout:false (strf "bu%s" s) [a "bu" None];
 88 |     test_dseq ~layout:true (strf "%sbu" s) [w s; a "bu" (Some "bu")];
 89 |     test_dseq ~layout:true (strf "bu%s" s) [a "bu" (Some "bu"); w s];
 90 |     test_dseq ~layout:false (strf "(%s)" s) [ls; le];
 91 |     test_dseq ~layout:true (strf "(%s)" s) [ls; w s; le];
 92 |     test_dseq ~layout:true (strf "(%s)%s" s s) [ls; w s; le; w s]
 93 |   in
 94 |   test_white " ";
 95 |   test_white "\t";
 96 |   test_white "\n";
 97 |   test_white "\r";
 98 |   test_white "\r\n";
 99 |   test_white "  \n \t\t";
100 |   test_white " \n  ";
101 |   test_white " \n  ";
102 |   ()
103 | 
104 | let decode_comments () =
105 |   log "Decoder comments.\n";
106 |   let test_comment ?(context = true) s =
107 |     let cs = String.sub s 1 (String.length s - 1) in
108 |     test_dseq ~layout:false s [];
109 |     test_dseq ~layout:true s [ c cs ];
110 |     test_dseq ~layout:true (strf " %s\n " s) [ w " "; c cs; w "\n " ];
111 |     test_dseq ~layout:true (strf " %s\r\n" s) [ w " "; c cs; w "\r\n" ];
112 |     test_dseq ~layout:true (strf " %s" s) [ w " "; c cs ];
113 |     test_dseq ~layout:true (strf " a%s" s) [ w " "; a "a" (Some "a"); c cs ];
114 |     test_dseq ~layout:true (strf " a\\\n%s" s)
115 |       [ w " "; a "a" (Some "a\\\n"); c cs ];
116 |     test_dseq ~layout:true (strf " a\\\n%s\nb" s)
117 |       [ w " "; a "a" (Some "a\\\n"); c cs; w "\n"; a "b" (Some "b") ];
118 |     test_dseq ~layout:true (strf "(a%s\nb)" s)
119 |       [ ls; a "a" (Some "a"); c cs; w "\n"; a "b" (Some "b"); le ];
120 |   in
121 |   test_dseq ~layout:true ";" [ c "" ];
122 |   test_comment "; adfj";
123 |   test_comment ";; adfj";
124 |   test_comment "; adfj \" ; ()";
125 |   test_comment ";; adfj \" ; ()";
126 |   test_comment ";; adfj \\n \" ; ()";
127 |   test_comment ";; adfj \\u{} \" ; ()";
128 |   test_comment ";; adfj \\u{23422342324} \" ; ()";
129 |   ()
130 | 
131 | let decode_atoms () =
132 |   log "Decoder atoms.\n";
133 |   let test_atom s at =
134 |     test_dseq ~layout:false s [ a at None ];
135 |     test_dseq ~layout:true s [ a at (Some s) ];
136 |     test_dseq ~layout:true (strf "\n%s;b" s) [ w "\n"; a at (Some s); c "b" ];
137 |     test_dseq ~layout:false (strf "\n%s;b" s) [ a at None ];
138 |     test_dseq ~layout:true (strf "()%s()" s) [ ls; le; a at (Some s); ls; le ];
139 |   in
140 |   test_dseq ~layout:true "bla\\" [ a "bla" (Some "bla\\")];
141 |   test_dseq ~layout:true "bla\\\r\n\\" [ a "bla" (Some "bla\\\r\n\\") ];
142 |   test_dseq ~layout:true "bla\"bla\""
143 |     [ a "bla" (Some "bla");a "bla" (Some "\"bla\"") ];
144 |   test_atom "\"\"" "";
145 |   test_atom "bla" "bla";
146 |   test_atom "b\\\nla" "bla";
147 |   test_atom "b\\\n  \tla" "bla";
148 |   test_atom "b\\nla" "b\nla";
149 |   test_atom "\"bla\"" "bla";
150 |   test_atom "\"b la\"" "b la";
151 |   test_atom "\"b\\\n la\"" "bla";
152 |   test_atom "\"b\\\n la\"" "bla";
153 |   test_atom "\"b\\nla\"" "b\nla";
154 |   test_atom "\"b\\\n\\nla\"" "b\nla";
155 |   test_atom "\\ " " ";
156 |   test_atom "\\\"" "\"";
157 |   test_atom "\\(" "(";
158 |   test_atom "\\)" ")";
159 |   test_atom "\\;" ";";
160 |   test_atom "\\\\" "\\";
161 |   test_atom "\\n" "\n";
162 |   test_atom "\\r" "\r";
163 |   test_atom "\\t" "\t";
164 |   test_atom "\\ \\\n\\ \\\n\\n" "  \n";
165 |   test_atom "\\u{0000}" "\x00";
166 |   test_atom "\\u{0001}" "\x01";
167 |   test_atom "\\u{D7FF}" "\xED\x9F\xBF";
168 |   test_atom "\\u{E000}" "\xEE\x80\x80";
169 |   test_atom "\\u{1F42B}" "\xF0\x9F\x90\xAB";
170 |   test_atom "\\u{10FFFF}" "\xF4\x8F\xBF\xBF";
171 |   ()
172 | 
173 | let decode_lists () =
174 |   log "Decoder lists.\n";
175 |   test_dseq ~layout:true "()()(a)" [ls; le; ls; le; ls; a "a" (Some "a"); le];
176 |   test_dseq ~layout:true "()a()" [ls; le; a "a" (Some "a"); ls; le];
177 |   test_dseq ~layout:true "(((a)))" [ls; ls; ls; a "a" (Some "a"); le; le; le];
178 |   test_dseq ~layout:true "(())()" [ls; ls; le; le; ls; le];
179 |   ()
180 | 
181 | let u_rep_utf8 = "\xEF\xBF\xBD"
182 | let decode_errors () =
183 |   log "Decoder errors.\n";
184 |   test_dseq ~layout:false "\xFF"
185 |     [e (`Illegal_bytes "\xFF"); a u_rep_utf8 None];
186 |   test_dseq ~layout:false "a\xFFb"
187 |     [e (`Illegal_bytes "\xFF"); a (strf "a%sb" u_rep_utf8) None ];
188 |   test_dseq ~layout:false "a\x00b"
189 |     [e (`Illegal_uchar (Uchar.of_int 0x0000)); a (strf "a%sb" u_rep_utf8) None];
190 |   test_dseq ~layout:false "a \\"
191 |     [a "a" None; e (`Unclosed `Escape); a "" None];
192 |   test_dseq ~layout:false "a\\u"
193 |     [e (`Unclosed `Escape); a "a" None ];
194 |   test_dseq ~layout:false "\\A"
195 |     [e (`Illegal_escape (`Not_esc (Uchar.of_int 0x41)));
196 |        a u_rep_utf8 None ];
197 |   test_dseq ~layout:false "\\\n"
198 |     [e (`Illegal_escape (`Not_esc (Uchar.of_int 0x0A)));
199 |      a u_rep_utf8 None ];
200 |   test_dseq ~layout:false "\\uA"
201 |     [e (`Illegal_escape (`Exp_lbrace (Uchar.of_int 0x41)));
202 |      a u_rep_utf8 None ];
203 |   test_dseq ~layout:false "\\u{}"
204 |     [e (`Illegal_escape (`Exp_hex (Uchar.of_int 0x7D)));
205 |      a u_rep_utf8 None ];
206 |   test_dseq ~layout:false "\\u{+"
207 |     [e (`Illegal_escape (`Exp_hex (Uchar.of_int 0x2B)));
208 |      a u_rep_utf8 None ];
209 |   test_dseq ~layout:false "\\u{A+}"
210 |     [e (`Illegal_escape (`Exp_hex_rbrace (Uchar.of_int 0x2B)));
211 |      a (strf "%s}" u_rep_utf8) None ];
212 |   test_dseq ~layout:false "\\u{1234567}"
213 |     [e (`Illegal_escape (`Exp_rbrace (Uchar.of_int 0x37)));
214 |      a (strf "%s}" u_rep_utf8) None ];
215 |   test_dseq ~layout:false "\\u{110000}"
216 |     [e (`Illegal_escape (`Not_uchar 0x110000)); a u_rep_utf8 None ];
217 |   test_dseq ~layout:false "\\u{D800}"
218 |     [e (`Illegal_escape (`Not_uchar 0xD800)); a u_rep_utf8 None ];
219 |   test_dseq ~layout:false "\\u{DFFF}"
220 |     [e (`Illegal_escape (`Not_uchar 0xDFFF)); a u_rep_utf8 None ];
221 |   test_dseq ~layout:false "())()"
222 |     [ls; le; e (`Unexpected_le); ls; le ];
223 |   test_dseq ~layout:false "()("
224 |     [ls; le; ls; e (`Unclosed `List); le ];
225 |   test_dseq ~layout:false "()((\""
226 |     [ls; le; ls; ls; e (`Unclosed `Quoted_token); a "" None;
227 |      e (`Unclosed `List); le; e (`Unclosed `List); le; ];
228 |   test_dseq ~layout:false "(\"\\u{0000"
229 |     [ls; e (`Unclosed `Escape); e (`Unclosed `Quoted_token); a "" None;
230 |      e (`Unclosed `List); le; ];
231 |   ()
232 | 
233 | let encode_raw () =
234 |   log "Raw encoder.\n";
235 |   let test_eseq ?quote = test_eseq ?quote ~style:`Raw in
236 |   test_eseq ~quote:true [ a "" None; a "bla" None] "\"\"bla";
237 |   test_eseq ~quote:false [ a "" None; a "bla" None] "\"\"bla";
238 |   test_eseq ~quote:true [ a "" None; a "b la" None] "\"\"\"b la\"";
239 |   test_eseq ~quote:false [ a "" None; a "b la" None] "\"\"b\\ la";
240 |   test_eseq [ a "bla" None; a "bla" None] "bla bla";
241 |   test_eseq ~quote:true [ a "b la" None; a "bla" None] "\"b la\"bla";
242 |   test_eseq ~quote:true [ a "bla" None; a "b la" None] "bla\"b la\"";
243 |   test_eseq ~quote:true [ a "b la" None; a "b la" None] "\"b la\"\"b la\"";
244 |   test_eseq ~quote:false [ a "b la" None; a "bla" None] "b\\ la bla";
245 |   test_eseq ~quote:false [ a "b la" None; a "b la" None] "b\\ la b\\ la";
246 |   test_eseq [ a "bla" None; c ";"; a "bla" None] "bla;;\nbla";
247 |   test_eseq [ a "bla" None; c ";" ] "bla;;";
248 |   test_eseq [ c ""; a "bla" None; ] ";\nbla";
249 |   test_eseq [ c ""; ls; le] ";\n()";
250 |   test_eseq [ ls; c ""; le] "(;\n)";
251 |   test_eseq [ ls; le; c ""] "();";
252 |   test_eseq [ ls; c "a"; w "\r\n"; le ] "(;a\r\n)";
253 |   test_eseq [ ls; c "a"; w "\n\r"; le ] "(;a\n\r)";
254 |   ()
255 | 
256 | let encode_min () =
257 |   log "Min encoder.\n";
258 |   let test_eseq ?quote = test_eseq ?quote ~style:`Minify in
259 |   test_eseq [ a "bla" None; a "bla" None] "bla bla";
260 |   test_eseq [ a "bla" None; a "bla" None; w " "; c "bla"] "bla bla";
261 |   test_eseq [ ls; le; w " "; ls; c "bla"; a "a" None; le; a "a" None] "()(a)a";
262 |   test_eseq [ a "" None; a "bla" None] "\"\"bla";
263 |   ()
264 | 
265 | let trip () =
266 |   log "Trips.\n";
267 |   test_trip "\\u{10FFFF}";
268 |   test_trip "()\n\r   \\u{10FFFF}";
269 |   test_trip "(  () () a a \" a   \" \n (bla ))";
270 |   ()
271 | 
272 | let escape () =
273 |   log "Sexpm.escape.\n";
274 |   let escape s qesc esc =
275 |     let qesc' = Sexpm.escape ~quote:true s in
276 |     let esc' = Sexpm.escape ~quote:false s in
277 |     if qesc' <> qesc then fail "found: %S exp: %S" qesc' qesc;
278 |     if esc' <> esc then fail "found: %S exp: %S" esc' esc
279 |   in
280 |   escape "" "\"\"" "\"\"";
281 |   escape " " "\" \"" "\\ ";
282 |   escape ";" "\";\"" "\\;";
283 |   escape "(" "\"(\"" "\\(";
284 |   escape ")" "\")\"" "\\)";
285 |   for i = 0x0000 to 0x001F do match i with
286 |   | 0x0009 -> escape "\t" "\"\\t\"" "\\t"
287 |   | 0x000A -> escape "\n" "\"\\n\"" "\\n"
288 |   | 0x000D -> escape "\r" "\"\\r\"" "\\r"
289 |   | u ->
290 |       escape (strf "%c" @@ Char.chr u)
291 |         (strf "\"\\u{%02X}\"" u) (strf "\\u{%02X}" u)
292 |   done;
293 |   escape "\x00hey ho ; () bla \n"
294 |     "\"\\u{00}hey ho ; () bla \\n\""
295 |     "\\u{00}hey\\ ho\\ \\;\\ \\(\\)\\ bla\\ \\n";
296 |   ()
297 | 
298 | let test () =
299 |   decode_whites ();
300 |   decode_comments ();
301 |   decode_atoms ();
302 |   decode_lists ();
303 |   decode_errors ();
304 |   encode_raw ();
305 |   encode_min ();
306 |   trip ();
307 |   escape ();
308 |   log "All tests suceeded.\n"
309 | 
310 | let () = if not (!Sys.interactive) then test ()
311 | 
312 | (*---------------------------------------------------------------------------
313 |    Copyright (c) 2016 Daniel C. Bünzli
314 | 
315 |    Permission to use, copy, modify, and/or distribute this software for any
316 |    purpose with or without fee is hereby granted, provided that the above
317 |    copyright notice and this permission notice appear in all copies.
318 | 
319 |    THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
320 |    WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
321 |    MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
322 |    ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
323 |    WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
324 |    ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
325 |    OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
326 |   ---------------------------------------------------------------------------*)
327 | 


--------------------------------------------------------------------------------
/test/test_codec.ml:
--------------------------------------------------------------------------------
  1 | (*---------------------------------------------------------------------------
  2 |    Copyright (c) 2016 Daniel C. Bünzli. All rights reserved.
  3 |    Distributed under the ISC license, see terms at the end of the file.
  4 |    %%NAME%% %%VERSION%%
  5 |   ---------------------------------------------------------------------------*)
  6 | 
  7 | let strf = Format.asprintf
  8 | let log f = Format.printf (f ^^ "@.")
  9 | let fail fmt =
 10 |   let fail ppf = Format.fprintf ppf "@."; failwith "test" in
 11 |   Format.kfprintf fail Format.err_formatter fmt
 12 | 
 13 | let trip c src =
 14 |   let b = Buffer.create 255 in
 15 |   let o = Sexpm.output_of_buffer b in
 16 |   let e = Sexpm.(encoder ~style:`Raw o ) in
 17 |   let d = Sexpm.(decoder ~layout:true (input_of_string src)) in
 18 |   let rec loop () = match Sexpm.decode_traced_value c d with
 19 |   | Error (e, _) -> fail "%a" Sexpm.pp_error e
 20 |   | Ok (Some v, t) ->
 21 |       Sexpm.encode_traced_value c e (Some (v, v), t); loop ()
 22 |   | Ok (None, t) ->
 23 |       Sexpm.encode_traced_value c e (None, t);
 24 |       let r = Buffer.contents b in
 25 |       if r <> src then fail "src : %S@\n\
 26 |                              trip: %S" src r
 27 |   in
 28 |   loop ()
 29 | 
 30 | let test_trip () =
 31 |   log "Testing trips.";
 32 |   trip Sexpm.unit ";hu\nunit ; ha\nunit";
 33 |   trip Sexpm.bool "true \"false\" ; hey\n true ; blue";
 34 |   trip Sexpm.int  "1 \"2\" ; hey\n  \"0b11\" ; blue";
 35 |   trip Sexpm.(t2 int bool) "(2 false)\n( 3 true   )(\"4\" false)\n;ha\n  ";
 36 |   let r = Sexpm.(result ~ok:int ~error:(t2 int bool)) in
 37 |   trip r " (\\u{4f}k 3) (Error ;hey\n (2 true);ho\n)";
 38 |   let l = Sexpm.(list (t2 int int)) in
 39 |   trip l " ((3 2) (6 3) ; bla blo blu ()\n (4 2)) ()";
 40 |   ()
 41 | 
 42 | let upd c src o n =
 43 |   let dst = Printf.sprintf src n in
 44 |   let src = Printf.sprintf src o in
 45 |   let b = Buffer.create 255 in
 46 |   let o = Sexpm.output_of_buffer b in
 47 |   let e = Sexpm.(encoder ~style:`Raw o) in
 48 |   let deco = Sexpm.(decoder ~layout:true (input_of_string src)) in
 49 |   let decn = Sexpm.(decoder ~layout:true (input_of_string dst)) in
 50 |   let rec loop () =
 51 |     match Sexpm.decode_traced_value c deco,
 52 |           Sexpm.decode_traced_value c decn with
 53 |   | Ok (Some o, t), Ok (Some n, _) ->
 54 |       Sexpm.encode_traced_value c e (Some (o, n), t); loop ()
 55 |   | Ok (None, t), Ok (None, _) ->
 56 |       Sexpm.encode_traced_value c e (None, t);
 57 |       let r = Buffer.contents b in
 58 |       if r <> dst then fail "dst : %S@\n\
 59 |                              updt: %S" dst r
 60 |   | Error (e, _), _ -> fail "%a" Sexpm.pp_error e
 61 |   | _, Error (e, _) -> fail "%a" Sexpm.pp_error e
 62 |   | _ -> fail "fail"
 63 |   in
 64 |   loop ();
 65 |   ()
 66 | 
 67 | let test_update () =
 68 |   log "Testing update.";
 69 |   upd Sexpm.bool " %s; hehyo" "true" "false";
 70 |   upd Sexpm.bool " %s; hehyo" "false" "true";
 71 |   let r = Sexpm.(result ~ok:int ~error:(t2 int bool)) in
 72 |   upd r " (\\u{4f}k %s)" "2" "3";
 73 |   upd r " (\\u{4f}k %s) (Error ;hey\n (2 ;bla\n true);ho\n)" "2" "3";
 74 |   upd r " (\\u{4f}k 1) (Error ;hey\n (%s ;bla\n true);ho\n)" "2" "3";
 75 |   upd r " (\\u{4f}k 1) (Error ;hey\n (2 ;bla\n %s);ho\n)" "false" "true";
 76 |   upd r " (\\u{4f}k 1) ;heyho\n  %s;hu\n  "
 77 |     "(Error ;hey\n (2 true);bla\n)" "(Ok 3)";
 78 |   let l = Sexpm.(list (t2 int int)) in
 79 |   upd l " (%s) " "(2 ;a\n 3) (4 3)" "(2 ;a\n 4) (5 6)";
 80 |   upd l " (%s) " "(2 ;a\n 3) (4 3) (3 2)  ;bla\n" "(2 ;a\n 4) (5 6)   ;bla\n";
 81 |   upd l " (%s) " "(2 ;a\n 3) (4 3) ;bla\n(3 2)" "(2 ;a\n 4) (5 6) ;bla\n";
 82 |   upd l " (%s) "
 83 |     "(2 ;a\n 3) ;bla\n(3 2);hey\n" "(2 ;a\n 4) ;bla\n(5 6);hey\n(3 2)";
 84 |   ()
 85 | 
 86 | let test () =
 87 |   test_trip ();
 88 |   test_update ();
 89 |   log "All tests suceeded.\n"
 90 | 
 91 | let () = if not (!Sys.interactive) then test ()
 92 | 
 93 | (*---------------------------------------------------------------------------
 94 |    Copyright (c) 2016 Daniel C. Bünzli
 95 | 
 96 |    Permission to use, copy, modify, and/or distribute this software
 97 |    for any purpose with or without fee is hereby granted, provided
 98 |    that the above copyright notice and this permission notice appear
 99 |    in all copies.
100 | 
101 |    THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
102 |    WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
103 |    MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
104 |    ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
105 |    WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
106 |    ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
107 |    OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
108 |   ---------------------------------------------------------------------------*)
109 | 


--------------------------------------------------------------------------------