├── .gitignore ├── .merlin ├── .ocp-indent ├── CHANGES.md ├── LICENSE.md ├── README.md ├── _tags ├── doc ├── api.odocl └── dev.odocl ├── opam ├── pkg ├── META └── pkg.ml ├── src ├── sexpm.ml ├── sexpm.mli └── sexpm.mllib └── test ├── sexptrip.ml ├── test.ml └── test_codec.ml /.gitignore: -------------------------------------------------------------------------------- 1 | _build 2 | tmp 3 | *~ 4 | \.\#* 5 | \#*# 6 | *.install 7 | *.native 8 | *.byte -------------------------------------------------------------------------------- /.merlin: -------------------------------------------------------------------------------- 1 | PKG bytes uchar uutf cmdliner bos bos.setup 2 | S src 3 | S test 4 | B _build/** 5 | -------------------------------------------------------------------------------- /.ocp-indent: -------------------------------------------------------------------------------- 1 | strict_with=always,match_clause=4,strict_else=never 2 | -------------------------------------------------------------------------------- /CHANGES.md: -------------------------------------------------------------------------------- 1 | vX.Y.Z YYYY-MM-DD Location 2 | -------------------------- 3 | 4 | First release. 5 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016 Daniel C. Bünzli 2 | 3 | Permission to use, copy, modify, and/or distribute this software for any 4 | purpose with or without fee is hereby granted, provided that the above 5 | copyright notice and this permission notice appear in all copies. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 8 | WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 9 | MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 10 | ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 11 | WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 12 | ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 13 | OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | sexpm — s-expression codecs and updates for OCaml 2 | ------------------------------------------------------------------------------- 3 | %%VERSION%% 4 | 5 | sexpm codecs sequences of s-expressions to different representations. 6 | 7 | sexpm is distributed under the ISC license. 8 | 9 | Homepage: http://erratique.ch/software/sexpm 10 | 11 | ## Installation 12 | 13 | sexpm can be installed with `opam`: 14 | 15 | opam install sexpm 16 | 17 | If you don't use `opam` consult the [`opam`](opam) file for build 18 | instructions. 19 | 20 | ## Documentation 21 | 22 | The documentation and API reference is automatically generated from 23 | the source interfaces. It can be consulted [online][doc] or via 24 | `odig doc sexpm`. 25 | 26 | [doc]: http://erratique.ch/software/sexpm/doc 27 | 28 | ## Sample programs 29 | 30 | If you installed sexpm with `opam` sample programs are located in 31 | the directory `opam config var sexpm:doc`. 32 | 33 | In the distribution sample programs and tests are located in the 34 | [`test`](test) directory of the distribution. They can be built and run 35 | with: 36 | 37 | topkg build --tests true && topkg test 38 | -------------------------------------------------------------------------------- /_tags: -------------------------------------------------------------------------------- 1 | true : bin_annot, safe_string, package(uutf), package(fmt) 2 | : include 3 | : include 4 | : package(cmdliner), package(bos.setup) 5 | : package(cmdliner) -------------------------------------------------------------------------------- /doc/api.odocl: -------------------------------------------------------------------------------- 1 | Sexpm -------------------------------------------------------------------------------- /doc/dev.odocl: -------------------------------------------------------------------------------- 1 | Sexpm -------------------------------------------------------------------------------- /opam: -------------------------------------------------------------------------------- 1 | opam-version: "1.2" 2 | maintainer: "Daniel Bünzli " 3 | authors: ["Daniel Bünzli "] 4 | homepage: "http://erratique.ch/software/sexpm" 5 | doc: "http://erratique.ch/software/sexpm/doc" 6 | license: "ISC" 7 | dev-repo: "http://erratique.ch/repos/sexpm.git" 8 | bug-reports: "https://github.com/dbuenzli/sexpm/issues" 9 | tags: [ "org:erratique" "codec" "sexp" ] 10 | available: [ ocaml-version >= "4.03.0"] 11 | depends: [ 12 | "ocamlfind" {build} 13 | "ocamlbuild" {build} 14 | "topkg" {build} 15 | "uutf" 16 | ] 17 | depopts: [ 18 | "bos" 19 | g "cmdliner" 20 | ] 21 | build: [ 22 | "ocaml" "pkg/pkg.ml" "build" 23 | "--pinned" pinned ] 24 | -------------------------------------------------------------------------------- /pkg/META: -------------------------------------------------------------------------------- 1 | description = "s-expression codecs and updates for OCaml" 2 | version = "%%VERSION_NUM%%" 3 | requires = "" 4 | archive(byte) = "sexpm.cma" 5 | archive(native) = "sexpm.cmxa" 6 | plugin(byte) = "sexpm.cma" 7 | plugin(native) = "sexpm.cmxs" 8 | -------------------------------------------------------------------------------- /pkg/pkg.ml: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ocaml 2 | #use "topfind" 3 | #require "topkg" 4 | open Topkg 5 | 6 | let bos = Conf.with_pkg "bos" 7 | let cmdliner = Conf.with_pkg "cmdliner" 8 | 9 | let () = 10 | Pkg.describe "sexpm" @@ fun c -> 11 | let cmdliner = Conf.value c cmdliner in 12 | let bos = Conf.value c bos in 13 | Ok [ Pkg.mllib "src/sexpm.mllib"; 14 | Pkg.bin ~cond:(cmdliner && bos) "test/sexptrip"; 15 | Pkg.test "test/test"; 16 | Pkg.test "test/test_codec"; ] 17 | -------------------------------------------------------------------------------- /src/sexpm.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2016 Daniel C. Bünzli. All rights reserved. 3 | Distributed under the ISC license, see terms at the end of the file. 4 | %%NAME%% %%VERSION%% 5 | ---------------------------------------------------------------------------*) 6 | 7 | let pf = Format.fprintf 8 | 9 | (* Unsafe string and bytes manipulations. If you don't believe the authors's 10 | invariants, replacing with safe versions makes everything safe in the 11 | module. He won't be upset. *) 12 | 13 | let unsafe_string_get = String.unsafe_get 14 | let unsafe_array_get = Array.unsafe_get 15 | let unsafe_bytes_set = Bytes.unsafe_set 16 | let unsafe_bytes_set_byte s j byte = Bytes.unsafe_set s j (Char.unsafe_chr byte) 17 | let unsafe_bytes_blit s soff d doff = 18 | Bytes.unsafe_blit (Bytes.unsafe_of_string s) soff d doff 19 | 20 | (* Invalid argument strings *) 21 | 22 | let err_empty_buf = "buffer size can't be 0" 23 | let err_enc_unclosed = "unclosed list encoded" 24 | let err_enc_unopened = "closing unopened list" 25 | 26 | (* IO *) 27 | 28 | let io_buffer_size = 65536 (* IO_BUFFER_SIZE 4.0.0 *) 29 | let bytes_buf = function 30 | | None -> Bytes.create io_buffer_size 31 | | Some bytes -> 32 | if Bytes.length bytes <> 0 then bytes else 33 | invalid_arg err_empty_buf 34 | 35 | type input = unit -> (Bytes.t * int * int) option 36 | 37 | let input_of_string s = 38 | let buf = ref (Some (Bytes.unsafe_of_string s, 0, String.length s)) in 39 | fun () -> let r = !buf in buf := None; r 40 | 41 | let input_of_in_channel ?bytes ic = 42 | let bs = bytes_buf bytes in 43 | fun () -> 44 | let len = input ic bs 0 (Bytes.length bs) in 45 | if len = 0 then None else Some (bs, 0, len) 46 | 47 | type output = (Bytes.t * int * int) option -> unit 48 | 49 | let output_of_buffer buf = function 50 | | None -> () 51 | | Some (bs, pos, len) -> Buffer.add_subbytes buf bs pos len 52 | 53 | let output_of_out_channel oc = function 54 | | None -> () 55 | | Some (bs, pos, len) -> output oc bs pos len 56 | 57 | (* Characters and their classes *) 58 | 59 | let ux_eoi = -1 (* end of input, oustide unicode range *) 60 | let ux_soi = -2 (* start of input, outside unicode range. *) 61 | 62 | let u_rep = Uchar.to_int Uutf.u_rep 63 | let u_tab = 0x0009 (* '\t' *) 64 | let u_lf = 0x000A (* '\n' *) 65 | let u_cr = 0x000D (* '\r' *) 66 | let u_sp = 0x0020 (* ' ' *) 67 | let u_quot = 0x0022 (* '"' *) 68 | let u_lpar = 0x0028 (* '(' *) 69 | let u_rpar = 0x0029 (* ')' *) 70 | let u_semi = 0x003B (* ';' *) 71 | let u_bslash = 0x005C (* '\\' *) 72 | 73 | let uchar_is_illegal = function 74 | | 0x0020 | 0x000A | 0x000D | 0x0009 -> false 75 | | u when u <= 0x001F -> true 76 | | _ -> false 77 | 78 | (* The following predicates assume that illegal were filtered out. *) 79 | 80 | let uchar_is_white = function 81 | | 0x0020 | 0x000A | 0x000D | 0x0009 -> true 82 | | _ -> false 83 | 84 | let uchar_is_hex u = 85 | (0x0030 <= u && u <= 0x0039) 86 | || (0x0041 <= u && u <= 0x0046) 87 | || (0x0061 <= u && u <= 0x0066) 88 | 89 | let uchar_is_token u = 90 | (0x0023 <= u && u <= 0x0027) 91 | || (0x002A <= u && u <= 0x003A) 92 | || (0x003C <= u && u <= 0x005B) 93 | || (0x005D <= u) 94 | || (0x0021 = u) 95 | 96 | let uchar_is_qtoken u = 97 | (0x0023 <= u && u <= 0x005B) 98 | || (0x005D <= u) 99 | || uchar_is_white u 100 | || (0x0021 = u) 101 | 102 | let uchar_is_comment = function 103 | | 0x000A | 0x000D -> false 104 | | u when u = ux_eoi -> false 105 | | u -> true 106 | 107 | let pp_dump_uchar ppf u = pf ppf "%04X" (Uchar.to_int u) 108 | let pp_uchar ppf u = 109 | if Uchar.to_int u <= 0x1F (* most control chars *) then pp_dump_uchar ppf u else 110 | let b = Buffer.create 4 in 111 | Uutf.Buffer.add_utf_8 b u; 112 | pf ppf "'%s' (%a)" (Buffer.contents b) pp_dump_uchar u 113 | 114 | (* Lexemes *) 115 | 116 | type lexeme = 117 | | White of string 118 | | Comment of string 119 | | Ls | Le 120 | | Atom of string * string option 121 | 122 | let pp_raw ppf = function 123 | | None -> pf ppf "None" 124 | | Some s -> pf ppf "@[Some %S@]" s 125 | 126 | let pp_lexeme ppf = function 127 | | White s -> pf ppf "@[White %S@]" s 128 | | Comment s -> pf ppf "@[Comment %S@]" s 129 | | Ls -> pf ppf "Ls" 130 | | Le -> pf ppf "Le" 131 | | Atom (a, raw) -> pf ppf "@[Atom @[<1>(%S,@ %a)@]@]" a pp_raw raw 132 | 133 | (* Positions *) 134 | 135 | type pos = int * int 136 | type range = pos * pos 137 | let pp_range ppf ((l0, c0), (l1, c1)) = pf ppf "%d.%d-%d.%d" l0 c0 l1 c1 138 | 139 | (* Decode errors *) 140 | 141 | type error = 142 | [ `Illegal_bytes of string 143 | | `Illegal_uchar of Uchar.t 144 | | `Illegal_escape of 145 | [ `Exp_lbrace of Uchar.t 146 | | `Exp_rbrace of Uchar.t 147 | | `Exp_hex of Uchar.t 148 | | `Exp_hex_rbrace of Uchar.t 149 | | `Not_esc of Uchar.t 150 | | `Not_uchar of int ] 151 | | `Unclosed of [ `Quoted_token | `List | `Escape ] 152 | | `Unexpected_le 153 | | `Codec of string (* FIXME *) ] 154 | 155 | let pp_illegal_escape ppf = function 156 | | `Exp_lbrace u -> pf ppf "expected '{' found %a" pp_uchar u 157 | | `Exp_rbrace u -> pf ppf "expected '}' found %a" pp_uchar u 158 | | `Exp_hex u -> pf ppf "expected hex digit found %a" pp_uchar u 159 | | `Exp_hex_rbrace u -> pf ppf "expected hex digit or '}' found %a" pp_uchar u 160 | | `Not_esc u -> pf ppf "%a is not a valid escape letter" pp_uchar u 161 | | `Not_uchar i -> pf ppf "escape 0x%X is not a Unicode scalar value" i 162 | 163 | let pp_error ppf = function 164 | | `Illegal_bytes b -> pf ppf "illegal byte sequence %S in input" b 165 | | `Illegal_uchar u -> pf ppf "illegal Unicode character %a in input" pp_uchar u 166 | | `Illegal_escape esc -> pp_illegal_escape ppf esc 167 | | `Unclosed `Quoted_token -> pf ppf "unclosed quoted token" 168 | | `Unclosed `List -> pf ppf "unclosed list" 169 | | `Unclosed `Escape -> pf ppf "unfinished escape" 170 | | `Unexpected_le -> pf ppf "unexpected list end ')'" 171 | | `Codec s -> pf ppf "codec error: %s" s 172 | 173 | exception Err of error * range (* only used internally *) 174 | 175 | (* Decoder *) 176 | 177 | type decoder_state = 178 | | Decode_next 179 | | Decoding_qtoken 180 | | Decoding_token 181 | | Decoding_comment 182 | | Decoding_white 183 | | Close_qtoken 184 | | Close_list 185 | 186 | type decoder = 187 | { input : input; (* input byte stream. *) 188 | layout : bool; (* [true] to keep whitespace and raw tokens. *) 189 | u : Uutf.decoder; (* unicode character decoder. *) 190 | atom : Buffer.t; (* atom accumulation buffer. *) 191 | raw : Buffer.t; (* raw accumulation buffer. *) 192 | mutable c : int; (* character lookahead. *) 193 | mutable ss_line : int; (* last sexp start line. *) 194 | mutable ss_col : int; (* last sexp start column. *) 195 | mutable s_line : int; (* last saved start line. *) 196 | mutable s_col : int; (* last saved start column. *) 197 | mutable e_line : int; (* last saved end line. *) 198 | mutable e_col : int; (* last saved end column. *) 199 | mutable stack : pos list; (* stack of open lists and their start pos. *) 200 | mutable state : decoder_state; (* decoder state. *) 201 | mutable peek : (lexeme option, error * range) result option 202 | } 203 | 204 | let decoder ?(layout = true) input = 205 | let u = Uutf.decoder ~encoding:`UTF_8 `Manual in 206 | let buf_size = 1024 in 207 | let atom = Buffer.create buf_size in 208 | let raw = Buffer.create buf_size in 209 | { input; layout; u; atom; raw; c = ux_soi; (* overwritten *) 210 | ss_line = 1; ss_col = 0; s_line = 1; s_col = 0; e_line = 1; e_col = 0; 211 | stack = []; state = Decode_next; peek = None } 212 | 213 | let decoder_layout d = d.layout 214 | let decoder_sspos d = d.ss_line, d.ss_col 215 | let decoder_spos d = d.s_line, d.s_col 216 | let decoder_epos d = d.e_line, d.e_col 217 | let decoded_range d = (decoder_spos d, decoder_epos d) 218 | let decoded_sexp_range d = (decoder_sspos d, decoder_epos d) 219 | 220 | let sspos d = Uutf.(d.ss_line <- decoder_line d.u; d.ss_col <- decoder_col d.u) 221 | let spos d = Uutf.(d.s_line <- decoder_line d.u; d.s_col <- decoder_col d.u) 222 | let epos d = Uutf.(d.e_line <- decoder_line d.u; d.e_col <- decoder_col d.u) 223 | 224 | let buffer_flush b = let s = Buffer.contents b in (Buffer.clear b; s) 225 | let buffer_add b c = Uutf.Buffer.add_utf_8 b (Uchar.unsafe_of_int c) 226 | 227 | let raw_clear d = Buffer.clear d.raw 228 | let raw_add d = if d.c <> ux_soi && d.layout then buffer_add d.raw d.c else () 229 | let raw d = buffer_flush d.raw 230 | let raw_is_empty d = Buffer.length d.raw = 0 231 | 232 | let atom_add d c = buffer_add d.atom c 233 | let atom d = 234 | let raw = if d.layout then (Some (buffer_flush d.raw)) else None in 235 | Atom (buffer_flush d.atom, raw) 236 | 237 | let err d e = epos d; raise_notrace (Err (e, decoded_range d)) 238 | 239 | let refill d = match d.input () with 240 | | Some (b, s, l) -> Uutf.Manual.src d.u b s l 241 | | None -> Uutf.Manual.src d.u Bytes.empty 0 0 242 | 243 | let rec readc d = match Uutf.decode d.u with 244 | | `Uchar u -> 245 | raw_add d; 246 | let i = Uchar.to_int u in 247 | begin match uchar_is_illegal i with 248 | | false -> d.c <- i 249 | | true -> d.c <- u_rep; err d (`Illegal_uchar u) 250 | end 251 | | `End -> if d.c <> ux_eoi then (raw_add d; d.c <- ux_eoi) else () 252 | | `Await -> refill d; readc d 253 | | `Malformed b -> d.c <- u_rep; err d (`Illegal_bytes b) 254 | 255 | let err_illegal_esc d e = atom_add d u_rep; readc d; err d (`Illegal_escape e) 256 | 257 | let d_uchar_esc d = 258 | let rec loop d acc count = match (readc d; d.c) with 259 | | u when uchar_is_hex u -> 260 | let count = count + 1 in 261 | if count > 6 then err_illegal_esc d (`Exp_rbrace (Uchar.of_int d.c)) else 262 | let acc = acc * 16 + (if d.c <= 0x39 (* 9 *) then d.c - 0x30 else 263 | if d.c <= 0x46 (* F *) then d.c - 0x37 else 264 | d.c - 0x57) 265 | in 266 | loop d acc count 267 | | 0x007D (* '}' *) -> 268 | if count = 0 then err_illegal_esc d (`Exp_hex (Uchar.of_int d.c)) else 269 | if not (Uchar.is_valid acc) then err_illegal_esc d (`Not_uchar acc) else 270 | (readc d; atom_add d acc) 271 | | u when u = ux_eoi -> err d (`Unclosed `Escape) 272 | | u -> 273 | if count = 0 then err_illegal_esc d (`Exp_hex (Uchar.of_int d.c)) else 274 | err_illegal_esc d (`Exp_hex_rbrace (Uchar.of_int d.c)) 275 | in 276 | match d.c with 277 | | 0x007B (* '{' *) -> loop d 0 0 278 | | u when u = ux_eoi -> err d (`Unclosed `Escape) 279 | | u -> err_illegal_esc d (`Exp_lbrace (Uchar.of_int d.c)) 280 | 281 | let rec d_cont ~lf d = match (readc d; d.c) with 282 | | 0x000A when lf -> d_cont ~lf:false d 283 | | (0x0020 | 0x0009) -> d_cont ~lf:false d 284 | | 0x005C (* '\\' *) -> d_escape ~or_cont:true d 285 | | u -> () 286 | 287 | and d_escape ~or_cont d = match (readc d; d.c) with 288 | | 0x0020 (* ' ' *) -> atom_add d u_sp; readc d 289 | | 0x0022 (* '"' *) -> atom_add d u_quot; readc d 290 | | 0x0028 (* '(' *) -> atom_add d u_lpar; readc d 291 | | 0x0029 (* ')' *) -> atom_add d u_rpar; readc d 292 | | 0x003B (* ';' *) -> atom_add d u_semi; readc d 293 | | 0x005C (* '\\' *) -> atom_add d u_bslash; readc d 294 | | 0x006E (* 'n' *) -> atom_add d u_lf; readc d 295 | | 0x0072 (* 'r' *) -> atom_add d u_cr; readc d 296 | | 0x0074 (* 't' *) -> atom_add d u_tab; readc d 297 | | 0x0075 (* 'u' *) -> readc d; d_uchar_esc d 298 | | (0x000A | 0x000D) -> 299 | if or_cont then (d_cont ~lf:(d.c = u_cr) d) else 300 | err_illegal_esc d (`Not_esc (Uchar.of_int d.c)) 301 | | u when u = ux_eoi -> if not or_cont then err d (`Unclosed `Escape) 302 | | u -> err_illegal_esc d (`Not_esc (Uchar.of_int u)) 303 | 304 | let d_token d = 305 | let rec loop ~or_cont d = match d.c with 306 | | 0x005C (* \\ *) -> d_escape ~or_cont d; loop ~or_cont:true d 307 | | u when uchar_is_token u -> 308 | atom_add d u; epos d; readc d; loop ~or_cont:true d 309 | | u -> 310 | let a = atom d in 311 | d.state <- Decode_next; raw_clear d; Some a 312 | in 313 | if (d.state <> Decoding_token) then (sspos d; spos d); 314 | d.state <- Decoding_token; 315 | loop ~or_cont:false d 316 | 317 | let d_qtoken d = 318 | let rec loop d u = match u with 319 | | 0x005C (* '\\' *) -> d_escape ~or_cont:true d; loop d d.c 320 | | 0x0022 (* '\"' *) -> 321 | epos d; readc d; 322 | let a = atom d in 323 | d.state <- Decode_next; raw_clear d; Some a 324 | | u when u = ux_eoi -> 325 | d.state <- Close_qtoken; err d (`Unclosed `Quoted_token) 326 | | u -> atom_add d u; readc d; loop d d.c 327 | in 328 | if d.state = Close_qtoken then (d.state <- Decoding_qtoken; loop d u_quot) 329 | else begin 330 | if d.state <> Decoding_qtoken then (sspos d; spos d; readc d); 331 | d.state <- Decoding_qtoken; loop d d.c 332 | end 333 | 334 | let d_ls d = 335 | d.state <- Decode_next; 336 | spos d; epos d; readc d; raw_clear d; 337 | d.stack <- (decoder_spos d) :: d.stack; 338 | Some Ls 339 | 340 | let d_le d = 341 | d.state <- Decode_next; 342 | spos d; epos d; readc d; raw_clear d; 343 | match d.stack with 344 | | (l, c) :: stack -> d.ss_line <- l; d.ss_col <- c; d.stack <- stack; Some Le 345 | | [] -> err d (`Unexpected_le) 346 | 347 | let d_eoi d = match d.stack with 348 | | [] -> None 349 | | (l, c) :: stack -> 350 | d.s_line <- l; d.s_col <- c; d.state <- Close_list; 351 | err d (`Unclosed `List) 352 | 353 | let rec d_comment d = 354 | let rec loop d = match d.c with 355 | | u when uchar_is_comment u -> epos d; readc d; loop d 356 | | u -> 357 | d.state <- Decode_next; 358 | if not d.layout then d_next d else Some (Comment (raw d)) 359 | in 360 | if d.state <> Decoding_comment then (spos d; readc d; raw_clear d); 361 | d.state <- Decoding_comment; 362 | loop d 363 | 364 | and d_white d = 365 | let rec loop d = match (readc d; d.c) with 366 | | u when uchar_is_white u -> epos d; loop d 367 | | u -> 368 | d.state <- Decode_next; 369 | if raw_is_empty d then d_next d else Some (White (raw d)) 370 | in 371 | if d.state <> Decoding_white then (spos d; epos d); 372 | d.state <- Decoding_white; 373 | loop d 374 | 375 | and d_next d = match d.c with 376 | | 0x0028 (* '(' *) -> d_ls d 377 | | 0x0029 (* ')' *) -> d_le d 378 | | 0x0022 (* '"' *) -> d_qtoken d 379 | | 0x003B (* ';' *) -> d_comment d 380 | | u when uchar_is_white u -> d_white d 381 | | u when u = ux_eoi -> d_eoi d 382 | | u when u = ux_soi -> readc d; d_next d 383 | | u -> d_token d 384 | 385 | and d_lexeme d = match d.state with 386 | | Decode_next -> d_next d 387 | (* The following cases occur after an error was decoded. *) 388 | | Decoding_token -> d_token d 389 | | Decoding_qtoken -> d_qtoken d 390 | | Decoding_comment -> d_comment d 391 | | Decoding_white -> d_white d 392 | | Close_qtoken -> d_qtoken d 393 | | Close_list -> d_le d 394 | 395 | let decode_lexeme d = match d.peek with 396 | | Some dec -> d.peek <- None; dec 397 | | None -> 398 | try Ok (d_lexeme d) with 399 | | Err (e, r) -> Error (e, r) 400 | 401 | let peek_lexeme d = match d.peek with 402 | | Some dec -> dec 403 | | None -> 404 | let dec = decode_lexeme d in 405 | d.peek <- Some dec; 406 | dec 407 | 408 | let dec_lexeme d = match d.peek with 409 | | None -> d_lexeme d 410 | | Some dec -> 411 | d.peek <- None; 412 | match dec with 413 | | Ok l -> l 414 | | Error (e, r) -> raise_notrace (Err (e, r)) 415 | 416 | let pek_lexeme d = match d.peek with 417 | | None -> 418 | let dec = d_lexeme d in 419 | d.peek <- Some (Ok dec); 420 | dec 421 | | Some dec -> 422 | match dec with 423 | | Ok l -> l 424 | | Error (e, r) -> raise_notrace (Err (e, r)) 425 | 426 | (* Encoder *) 427 | 428 | let hex_digit = 429 | [|'0';'1';'2';'3';'4';'5';'6';'7';'8';'9';'A';'B';'C';'D';'E';'F'|] 430 | 431 | type encoder_style = [ `Pp | `Minify | `Raw ] 432 | type last_encoded = Comment | Token (* unquoted *) | Other 433 | type encoder = 434 | { output : output; (* output byte stream. *) 435 | style : encoder_style; (* encoder style. *) 436 | quote : bool; (* quote atoms that need escaping. *) 437 | nl : bool; (* final line feed. *) 438 | o : Bytes.t; (* output buffer. *) 439 | mutable o_pos : int; (* next output pos to write. *) 440 | mutable nest : int; (* nesting level *) 441 | mutable last : last_encoded; 442 | mutable col : int; (* approx we don't decode the UTF-8 *) } 443 | 444 | let encoder ?bytes ?(nl = false) ?(quote = true) ?(style = `Raw) output = 445 | let o = bytes_buf bytes in 446 | { output; style; quote; nl; o; o_pos = 0; nest = 0; last = Other; col = 0; } 447 | 448 | let flush e = e.output (Some (e.o, 0, e.o_pos)); e.o_pos <- 0 449 | 450 | let o_rem e = Bytes.length e.o - e.o_pos (* rem bytes to write in [e.o]. *) 451 | 452 | let rec o_byte e b = match o_rem e = 0 with 453 | | true -> flush e; o_byte e b 454 | | false -> unsafe_bytes_set_byte e.o e.o_pos b; e.o_pos <- e.o_pos + 1 455 | 456 | let rec o_string e s pos l = (* write [l] bytes of [s] starting at [pos]. *) 457 | let rem = o_rem e in 458 | match rem >= l with 459 | | true -> unsafe_bytes_blit s pos e.o e.o_pos l; e.o_pos <- e.o_pos + l 460 | | false -> 461 | unsafe_bytes_blit s pos e.o e.o_pos rem; e.o_pos <- e.o_pos + rem; 462 | flush e; 463 | o_string e s (pos + rem) (l - rem) 464 | 465 | let rec e_atom e a = 466 | if a = "" then (o_string e "\"\"" 0 2; e.last <- Other) else 467 | let flush ~quoted a spos len = 468 | if spos = 0 then begin match quoted with 469 | | true -> o_byte e u_quot 470 | | false -> if e.last = Token then o_byte e u_sp 471 | end; 472 | o_string e a spos len 473 | in 474 | let rec loop ~quoted a spos epos max = match epos > max with 475 | | true -> 476 | if not (spos > max) then (flush ~quoted a spos (epos - spos)); 477 | if quoted then (o_byte e u_quot; e.last <- Other) else e.last <- Token 478 | | false -> 479 | let next = epos + 1 in 480 | let escape esc = 481 | flush ~quoted:e.quote a spos (epos - spos); 482 | o_string e esc 0 (String.length esc); 483 | loop e.quote a next next max 484 | in 485 | let may_escape esc = match e.quote with 486 | | true -> loop ~quoted:e.quote a spos next max 487 | | false -> escape esc 488 | in 489 | match unsafe_string_get a epos with 490 | | ' ' -> may_escape "\\ " 491 | | '(' -> may_escape "\\(" 492 | | ')' -> may_escape "\\)" 493 | | ';' -> may_escape "\\;" 494 | | '\\' -> escape "\\\\" 495 | | '"' -> escape "\\\"" 496 | | '\t' -> escape "\\t" 497 | | '\n' -> escape "\\n" 498 | | '\r' -> escape "\\r" 499 | | c when c <= '\x1F' -> 500 | flush ~quoted:e.quote a spos (epos - spos); 501 | let byte = Char.code c in 502 | let hi = byte / 16 in 503 | let lo = byte mod 16 in 504 | o_string e "\\u00" 0 4; 505 | o_byte e (Char.code @@ unsafe_array_get hex_digit hi); 506 | o_byte e (Char.code @@ unsafe_array_get hex_digit lo); 507 | loop e.quote a next next max 508 | | c -> loop ~quoted a spos next max 509 | in 510 | loop ~quoted:false a 0 0 (String.length a - 1) 511 | 512 | let e_raw_atom e r = 513 | begin match r.[0] = '"' (* raw is quoted *) with 514 | | true -> e.last <- Other 515 | | false -> if e.last = Token then o_byte e u_sp; 516 | end; 517 | o_string e r 0 (String.length r) 518 | 519 | let e_ls e = 520 | if e.last = Comment then o_byte e u_lf; 521 | o_byte e u_lpar; e.nest <- e.nest + 1; e.last <- Other 522 | 523 | let e_le e = 524 | if e.nest = 0 then invalid_arg err_enc_unopened; 525 | if e.last = Comment then o_byte e u_lf; 526 | o_byte e u_rpar; e.nest <- e.nest - 1; e.last <- Other 527 | 528 | let e_white e w = 529 | let len = String.length w in 530 | if e.style <> `Raw || len = 0 then () else 531 | begin 532 | if not (w.[0] = '\n' || w.[0] = '\r') && e.last = Comment 533 | then o_byte e u_lf; 534 | o_string e w 0 len; 535 | e.last <- Other 536 | end 537 | 538 | let e_comment e c = 539 | if e.style <> `Raw then () else 540 | begin 541 | if e.last = Comment then o_byte e u_lf; 542 | o_byte e u_semi; 543 | o_string e c 0 (String.length c); 544 | e.last <- Comment 545 | end 546 | 547 | let e_lexeme e = function 548 | | Atom (a, raw) -> 549 | if e.last = Comment then o_byte e u_lf; 550 | if e.style <> `Raw then e_atom e a else 551 | begin match raw with 552 | | None -> e_atom e a; 553 | | Some r -> e_raw_atom e r 554 | end 555 | | Ls -> e_ls e 556 | | Le -> e_le e 557 | | White w -> e_white e w 558 | | Comment c -> e_comment e c 559 | 560 | let e_end e = match e.nest <> 0 with 561 | | true -> invalid_arg err_enc_unclosed 562 | | false -> 563 | if e.nl then o_byte e u_lf; 564 | if e.o_pos <> 0 then flush e; 565 | e.output None 566 | 567 | let encode_lexeme e = function 568 | | Some l -> e_lexeme e l 569 | | None -> e_end e 570 | 571 | (* Custom s-expression codec *) 572 | 573 | let rec decode_custom ~atom ~list d = 574 | let rec loop stack acc = match d_lexeme d with 575 | | None -> err d (`Unclosed `List) 576 | | Some v -> 577 | match v with 578 | | Ls -> loop (acc :: stack) [] 579 | | Atom (a, _) -> loop stack (atom d a :: acc) 580 | | White _ | Comment _ -> loop stack acc 581 | | Le -> 582 | let list = list d (List.rev acc) in 583 | match stack with 584 | | [] -> Ok (Some list) 585 | | up :: stack -> loop stack (list :: up) 586 | in 587 | try match dec_lexeme d with 588 | | None -> Ok None 589 | | Some v -> 590 | match v with 591 | | Ls -> loop [] [] 592 | | Atom (a, _) -> Ok (Some (atom d a)) 593 | | White _ | Comment _ -> decode_custom ~atom ~list d 594 | | Le -> err d (`Unexpected_le) 595 | with Err (e, r) -> Error (e, r) 596 | 597 | let encode_custom unfold e v = 598 | let rec loop stack = function 599 | | [] -> 600 | e_lexeme e Le; 601 | if stack = [] then () else loop (List.tl stack) (List.hd stack) 602 | | v :: vs -> 603 | match unfold v with 604 | | `Atom a -> e_lexeme e (Atom (a, None)); loop stack vs 605 | | `List l -> e_lexeme e Ls; loop (vs :: stack) l 606 | in 607 | match v with 608 | | None -> e_end e 609 | | Some v -> 610 | match unfold v with 611 | | `Atom a -> e_lexeme e (Atom (a, None)) 612 | | `List l -> e_lexeme e Ls; loop [] l 613 | 614 | (* Tagged s-expression codec *) 615 | 616 | type 'a t = [ `Atom of string | `List of 'a t list ] * 'a 617 | type 'a tagger = decoder -> [`Atom of string | `List of 'a t list] -> 'a t 618 | 619 | let unit_tag _ se = se, () 620 | let range_tag d se = se, decoded_sexp_range d 621 | 622 | let decode_tagged ~tag d = 623 | let atom d a = tag d (`Atom a) in 624 | let list d l = tag d (`List l) in 625 | decode_custom ~atom ~list d 626 | 627 | let encode_tagged e se = 628 | let unfold (se, _) = se in 629 | encode_custom unfold e se 630 | 631 | (* Escaping *) 632 | 633 | let escape_info ~quote s = (* determines escape length, if needed *) 634 | let max = String.length s - 1 in 635 | let rec loop raw i l = 636 | if i > max then (if raw then None else Some l) else 637 | let next = i + 1 in 638 | match unsafe_string_get s i with 639 | | ' ' | '(' | ')' | ';' -> loop false next (l + if quote then 1 else 2) 640 | | '\\' | '"' | '\t' | '\n' | '\r' -> loop false next (l + 2) 641 | | c when c <= '\x1F' -> loop false next (l + 6) 642 | | c -> loop raw next (l + 1) 643 | in 644 | loop true 0 0 645 | 646 | let escape_char b k c = 647 | unsafe_bytes_set b k '\\'; 648 | unsafe_bytes_set b (k + 1) c; 649 | () 650 | 651 | let escape ?(quote = true) s = 652 | if s = "" then "\"\"" else 653 | match escape_info ~quote s with 654 | | None -> s 655 | | Some escaped_len -> 656 | let b, start = match quote with 657 | | false -> Bytes.create escaped_len, 0 658 | | true -> 659 | let escaped_len = escaped_len + 2 in 660 | let b = Bytes.create escaped_len in 661 | unsafe_bytes_set b 0 '"'; unsafe_bytes_set b (escaped_len - 1) '"'; 662 | b, 1 663 | in 664 | let max = String.length s - 1 in 665 | let rec loop i k = 666 | if i > max then Bytes.unsafe_to_string b else 667 | let next = i + 1 in 668 | match unsafe_string_get s i with 669 | | ' ' -> 670 | if quote then (unsafe_bytes_set b k ' '; loop next (k + 1)) else 671 | (escape_char b k ' '; loop next (k + 2)) 672 | | '(' -> 673 | if quote then (unsafe_bytes_set b k '('; loop next (k + 1)) else 674 | (escape_char b k '('; loop next (k + 2)) 675 | | ')' -> 676 | if quote then (unsafe_bytes_set b k ')'; loop next (k + 1)) else 677 | (escape_char b k ')'; loop next (k + 2)) 678 | | ';' -> 679 | if quote then (unsafe_bytes_set b k ';'; loop next (k + 1)) else 680 | (escape_char b k ';'; loop next (k + 2)) 681 | | '\\' -> escape_char b k '\\'; loop next (k + 2) 682 | | '"' -> escape_char b k '"'; loop next (k + 2) 683 | | '\t' -> escape_char b k 't'; loop next (k + 2) 684 | | '\n' -> escape_char b k 'n'; loop next (k + 2) 685 | | '\r' -> escape_char b k 'r'; loop next (k + 2) 686 | | c when c <= '\x1F' -> 687 | let byte = Char.code c in 688 | let hi = byte / 16 in 689 | let lo = byte mod 16 in 690 | unsafe_bytes_set b (k )'\\'; 691 | unsafe_bytes_set b (k + 1) 'u'; 692 | unsafe_bytes_set b (k + 2) '{'; 693 | unsafe_bytes_set b (k + 3) (unsafe_array_get hex_digit hi); 694 | unsafe_bytes_set b (k + 4) (unsafe_array_get hex_digit lo); 695 | unsafe_bytes_set b (k + 5) '}'; 696 | loop next (k + 6) 697 | | c -> unsafe_bytes_set b k c; loop next (k + 1) 698 | in 699 | loop 0 start 700 | 701 | (* Traces *) 702 | 703 | type trace = lexeme list 704 | type trace_rev = lexeme list 705 | 706 | let dec_trace_to_peek_sexp d rt = 707 | let rec loop d rt = match d_lexeme d with 708 | | Some (Ls | Atom _) as dec -> d.peek <- Some (Ok dec); true, rt 709 | | Some l -> loop d (l :: rt) 710 | | None -> d.peek <- Some (Ok None); false, rt 711 | in 712 | match d.peek with 713 | | None -> loop d rt 714 | | Some (Ok (Some (Ls | Atom _))) -> true, rt 715 | | Some (Ok (Some l)) -> d.peek <- None; loop d (l :: rt) 716 | | Some (Ok None) -> false, rt 717 | | Some (Error (e, r)) -> raise_notrace (Err (e, r)) 718 | 719 | let dec_trace_ls d rt = 720 | let rec loop d rt = match dec_lexeme d with 721 | | Some (White _ | Comment _ as w) -> loop d (w :: rt) 722 | | Some Ls -> Ls :: rt 723 | | Some (Le | Atom _) -> err d (`Codec "expected start of list") 724 | | None -> err d (`Codec "expected start of list") 725 | in 726 | loop d rt 727 | 728 | let dec_trace_le d rt = 729 | let rec loop d rt = match dec_lexeme d with 730 | | Some (White _ | Comment _ as w) -> loop d (w :: rt) 731 | | Some Le -> Le :: rt 732 | | Some (Ls | Atom _) -> err d (`Codec "expected end of list") 733 | | None -> err d (`Codec "expected end of list") 734 | in 735 | loop d rt 736 | 737 | let dec_trace_atom d rt = 738 | let rec loop d rt = match dec_lexeme d with 739 | | Some (White _ | Comment _ as w) -> loop d (w :: rt) 740 | | Some (Atom (a, _) as at) -> a, (at :: rt) 741 | | Some (Ls | Le) -> err d (`Codec "expected atom") (* fixme more prec. *) 742 | | None -> err d (`Codec "expected atom") 743 | in 744 | loop d rt 745 | 746 | let patch_layout (t, rt) = 747 | let rec loop rt = function 748 | | [] -> [], rt 749 | | (White _ | Comment _ as w) :: t -> loop (w :: rt) t 750 | | (Atom _ | Le | Ls) :: _ as t -> t, rt 751 | in 752 | loop rt t 753 | 754 | let patch_ls (t, rt) = 755 | let rec loop rt = function 756 | | [] -> [], Ls :: rt 757 | | (White _ | Comment _ as w) :: t -> loop (w :: rt) t 758 | | Ls :: t -> t, Ls :: rt 759 | | (Le | Atom _) :: _ -> assert false 760 | in 761 | loop rt t 762 | 763 | let patch_le (t, rt) = 764 | let rec loop rt = function 765 | | [] -> [], Le :: rt 766 | | (White _ | Comment _ as w) :: t -> loop (w :: rt) t 767 | | Le :: t -> t, Le :: rt 768 | | (Ls | Atom _) :: _ -> assert false 769 | in 770 | loop rt t 771 | 772 | let patch_atom a (t, rt) = 773 | let rec loop rt = function 774 | | [] -> [], a :: rt 775 | | (White _ | Comment _ as w) :: t -> loop (w :: rt) t 776 | | Atom _ :: t -> t, a :: rt 777 | | (Ls | Le) :: _ -> assert false 778 | in 779 | loop rt t 780 | 781 | let patch_drop_atom (t, rt) = (* keeps layout before atom (if any). *) 782 | let rec loop rt = function 783 | | [] -> [], rt 784 | | (White _ | Comment _ as w) :: t -> loop (w :: rt) t 785 | | Atom _ :: t -> t, rt 786 | | (Ls | Le) :: _ -> assert false 787 | in 788 | loop rt t 789 | 790 | let patch_drop_sexp (t, _ as recode) = (* keeps layout before sexp (if any). *) 791 | if t = [] then recode else 792 | let t, rt = patch_layout recode in 793 | let rec loop depth = function 794 | | Atom _ :: t when depth = -1 -> t 795 | | Ls :: t -> loop (depth + 1) t 796 | | Le :: t -> if depth = 0 then t else loop (depth - 1) t 797 | | l :: t -> loop depth t 798 | | [] -> assert false 799 | in 800 | loop (-1) t, rt 801 | 802 | let patch_drop_until_le recode = (* keeps layout at the Le level *) 803 | let rec loop = function 804 | | (Le :: t, _ as recode) -> recode 805 | | (_ :: _, _ as recode) -> recode |> patch_drop_sexp |> patch_layout |> loop 806 | | ([], _) -> assert false 807 | in 808 | recode |> patch_layout |> loop 809 | 810 | let patch_keep_atom (t, rt) = 811 | let rec loop rt = function 812 | | (White _ | Comment _ as w) :: t -> loop (w :: rt) t 813 | | Atom _ as a :: t -> t, a :: rt 814 | | (Ls | Le) :: _ | [] -> assert false 815 | in 816 | loop rt t 817 | 818 | let patch_keep_list recoder = 819 | let rec loop depth rt = function 820 | | Ls :: t -> loop (depth + 1) (Ls :: rt) t 821 | | Le :: t -> 822 | begin match depth with 823 | | 0 -> t, Le :: rt 824 | | depth -> loop (depth - 1) (Le :: rt) t 825 | end 826 | | l :: t -> loop depth (l :: rt) t 827 | | [] -> assert false 828 | in 829 | let t, rt = patch_layout recoder in 830 | match t with 831 | | Ls :: t -> loop 0 (Ls :: rt) t 832 | | (Atom _ | White _ | Comment _ | Le) :: _ | [] -> assert false 833 | 834 | let patch_keep_sexp (t, rt) = 835 | let rec loop depth rt = function 836 | | Atom _ as a :: t when depth = 0 -> t, a :: rt 837 | | Ls :: t -> loop (depth + 1) (Ls :: rt) t 838 | | Le :: t -> 839 | begin match depth with 840 | | 0 -> assert false 841 | | 1 -> t, Le :: rt 842 | | depth -> loop (depth - 1) (Le :: rt) t 843 | end 844 | | l :: t -> loop depth (l :: rt) t 845 | | [] -> assert false 846 | in 847 | loop 0 rt t 848 | 849 | let add_atom a (t, rt) = (t, Atom (a, None) :: rt) 850 | let add_ls (t, rt) = (t, Ls :: rt) 851 | let add_le (t, rt) = (t, Le :: rt) 852 | 853 | (* Codecs 854 | 855 | A few notes about encoder updates. 856 | 857 | * Fundamentally we don't need the old value as we could decode it 858 | back from the trace as needed when we encode, it just seems a bit 859 | cleaner that way. 860 | 861 | * Encoders see all the remaining data trace rather that just the data 862 | that concern themselves. We could chunk the trace so that sub-encoders 863 | see only what is needed but this involves a lot of list processing. 864 | 865 | * Regarding the last point we could also work on a sexp representation 866 | this would make chunking easier; but would also necessisate a 867 | clarification of white and comments are associated to tree nodes. 868 | Currently this is somehow left to the judgement of encoders 869 | (which may not be a bad idea). *) 870 | 871 | type 'a codec_encoder = 872 | 'a option -> 'a -> (trace * trace_rev) -> trace * trace_rev 873 | (* trace must be left untouched if option is None. *) 874 | 875 | type 'a codec_decoder = decoder -> trace_rev -> 'a * trace_rev 876 | type 'a codec = 877 | { eq : 'a -> 'a -> bool; 878 | default : 'a; 879 | enc : 'a codec_encoder; 880 | dec : 'a codec_decoder; } 881 | 882 | let codec ~eq ~default enc dec = { eq; default; enc; dec; } 883 | let codec_with_default default c = { c with default } 884 | let codec_with_codec ?eq ?default c = 885 | let eq = match eq with None -> c.eq | Some eq -> eq in 886 | let default = match default with None -> c.default | Some d -> d in 887 | { c with eq; default } 888 | 889 | let rec enc_lexemes e = function 890 | | [] -> () 891 | | l :: ls -> encode_lexeme e (Some l); enc_lexemes e ls 892 | 893 | let encode_traced_value c e (v, t) = match v with 894 | | None -> enc_lexemes e t; encode_lexeme e None 895 | | Some (o, n) -> enc_lexemes e (List.rev @@ snd @@ c.enc (Some o) n (t, [])) 896 | 897 | let decode_traced_value c d = 898 | try 899 | let has_sexp, rt = dec_trace_to_peek_sexp d [] in 900 | if not has_sexp then Ok (None, List.rev rt) else 901 | let v, rt = c.dec d rt in 902 | Ok (Some v, List.rev rt) 903 | with 904 | | Err (e, r) -> Error (e, r) 905 | 906 | let encode_value c e v = match v with 907 | | None -> encode_lexeme e None 908 | | Some v -> enc_lexemes e (List.rev @@ snd @@ c.enc None v ([], [])) 909 | 910 | let decode_value c d = match decode_traced_value c d with 911 | | Ok (v, _) -> Ok v 912 | | Error _ as e -> e 913 | 914 | (* Codecs *) 915 | 916 | let atom_codec ~eq ~default enc dec = 917 | let rec atom_enc enc o n recode = match o with 918 | | None -> recode |> add_atom (enc n) 919 | | Some o when eq o n -> recode |> patch_keep_atom 920 | | Some o -> recode |> patch_drop_atom |> atom_enc enc None n 921 | in 922 | let atom_dec dec d rt = 923 | let rec loop d rt = match dec_lexeme d with 924 | | Some (Atom (a, _) as l) -> dec d a, (l :: rt) 925 | | Some (Comment _ | White _ as l) -> loop d (l :: rt) 926 | | Some (Ls | Le) | None -> err d (`Codec "expected atom") (* FIXME *) 927 | in 928 | loop d rt 929 | in 930 | { eq; default; enc = atom_enc enc; dec = atom_dec dec; } 931 | 932 | let unit = 933 | let eq = (( = ) : unit -> unit -> bool) in 934 | let default = () in 935 | let unit_enc () = "unit" in 936 | let unit_dec d = function 937 | | "unit" -> () 938 | | _ -> err d (`Codec "expected `unit'") 939 | in 940 | atom_codec ~eq ~default unit_enc unit_dec 941 | 942 | let bool = 943 | let eq = (( = ) : bool -> bool -> bool) in 944 | let default = false in 945 | let bool_enc = string_of_bool in 946 | let bool_dec d a = try bool_of_string a with 947 | | Invalid_argument _ -> err d (`Codec "expected boolean") 948 | in 949 | atom_codec ~eq ~default bool_enc bool_dec 950 | 951 | let int = 952 | let eq = (( = ) : int -> int -> bool) in 953 | let default = 0 in 954 | let int_enc = string_of_int in 955 | let int_dec d a = try int_of_string a with 956 | | Failure _ -> err d (`Codec "expected integer") 957 | in 958 | atom_codec ~eq ~default int_enc int_dec 959 | 960 | let float_fmt fmt = 961 | let eq = (( = ) : float -> float -> bool) in 962 | let default = 0. in 963 | let float_enc f = Printf.sprintf fmt f in 964 | let float_dec d a = try float_of_string a with 965 | | Failure _ -> err d (`Codec "expected float") 966 | in 967 | atom_codec ~eq ~default float_enc float_dec 968 | 969 | let float = float_fmt "%g" 970 | let float_hex = float_fmt "%h" 971 | 972 | let string = 973 | let eq = String.equal in 974 | let default = "" in 975 | let string_enc x = x in 976 | let string_dec d x = x in 977 | atom_codec ~eq ~default string_enc string_dec 978 | 979 | let result ~ok ~error = 980 | let eq r0 r1 = match r0, r1 with 981 | | Ok _, Error _ | Error _, Ok _ -> false 982 | | Ok v0, Ok v1 -> ok.eq v0 v1 983 | | Error e0, Error e1 -> error.eq e0 e1 984 | in 985 | let default = Ok ok.default in 986 | let rec enc o n recode = match o with 987 | | None -> 988 | let add_case c at n recode = 989 | recode |> add_ls |> add_atom at |> c.enc None n |> add_le 990 | in 991 | begin match n with 992 | | Ok n -> recode |> add_case ok "Ok" n 993 | | Error n -> recode |> add_case error "Error" n 994 | end 995 | | Some o when eq o n -> recode |> patch_keep_list 996 | | Some o -> 997 | let patch_case c o n recode = 998 | recode |> patch_ls |> patch_keep_atom |> c.enc o n |> patch_le 999 | in 1000 | match o, n with 1001 | | Ok o, Ok n -> recode |> patch_case ok (Some o) n 1002 | | Error o, Error n -> recode |> patch_case error (Some o) n 1003 | | _ -> recode |> patch_layout |> patch_drop_sexp |> enc None n 1004 | in 1005 | let dec d rt = 1006 | let rt = dec_trace_ls d rt in 1007 | let atom, rt = dec_trace_atom d rt in 1008 | let v, rt = match atom with 1009 | | "Ok" -> let v, rt = ok.dec d rt in Ok v, rt 1010 | | "Error" -> let e, rt = error.dec d rt in Error e, rt 1011 | | atom -> err d (`Codec "expected `Ok' or `Error'") (* FIXME *) 1012 | in 1013 | let rt = dec_trace_le d rt in 1014 | v, rt 1015 | in 1016 | codec ~eq ~default enc dec 1017 | 1018 | let t2 c0 c1 = 1019 | let eq (l0, l1) (r0, r1) = c0.eq l0 r0 && c1.eq l1 r1 in 1020 | let default = c0.default, c1.default in 1021 | let enc o (n0, n1 as n) recode = match o with 1022 | | None -> 1023 | recode |> add_ls |> c0.enc None n0 |> c1.enc None n1 |> add_le 1024 | | Some o when eq o n -> recode |> patch_keep_list 1025 | | Some (o0, o1) -> 1026 | recode 1027 | |> patch_ls |> c0.enc (Some o0) n0 |> c1.enc (Some o1) n1 |> patch_le 1028 | in 1029 | let dec d rt = 1030 | let rt = dec_trace_ls d rt in 1031 | let v0, rt = c0.dec d rt in 1032 | let v1, rt = c1.dec d rt in 1033 | let rt = dec_trace_le d rt in 1034 | (v0, v1), rt 1035 | in 1036 | codec ~eq ~default enc dec 1037 | 1038 | let list c = 1039 | let rec eq l0 l1 = match l0, l1 with 1040 | | v0 :: l0, v1 :: l1 when c.eq v0 v1 -> eq l0 l1 1041 | | [], [] -> true 1042 | | _, _ -> false 1043 | in 1044 | let default = [] in 1045 | let enc o n recode = 1046 | let rec add_els n recode = match n with 1047 | | nv :: n -> add_els n (c.enc None nv recode) 1048 | | [] -> recode 1049 | in 1050 | match o with 1051 | | None -> recode |> add_ls |> add_els n |> add_le 1052 | | Some o when eq o n -> recode |> patch_keep_list 1053 | | Some o -> 1054 | let rec els o n recode = match o, n with 1055 | | ov :: o, nv :: n -> els o n (c.enc (Some ov) nv recode) 1056 | | [], [] -> recode 1057 | | [], n -> recode |> patch_layout |> add_els n 1058 | | o, [] -> patch_drop_until_le recode 1059 | in 1060 | recode |> patch_ls |> els o n |> patch_le 1061 | in 1062 | let dec d rt = 1063 | let rt = dec_trace_ls d rt in 1064 | let rec loop d acc rt = match pek_lexeme d with 1065 | | None -> err d (`Codec "unexpected end of input") 1066 | | Some Le -> ignore (dec_lexeme d); List.rev acc, (Le :: rt) 1067 | | Some (Ls | Atom _) -> 1068 | let v, rt = c.dec d rt in 1069 | loop d (v :: acc) rt 1070 | | Some (White _ | Comment _ as l) -> 1071 | ignore (dec_lexeme d); loop d acc (l :: rt) 1072 | in 1073 | loop d [] rt 1074 | in 1075 | codec ~eq ~default enc dec 1076 | 1077 | let set 1078 | (type e) (type s) (module Set : Set.S with type elt = e and type t = s) 1079 | c = 1080 | let eq = Set.equal in 1081 | let default = Set.empty in 1082 | let enc o n recode = 1083 | let rec add_els n recode = 1084 | if Set.is_empty n then recode else 1085 | let v = Set.choose n in 1086 | let n = Set.remove v n in 1087 | add_els n (c.enc None v recode) 1088 | in 1089 | match o with 1090 | | None -> recode |> add_ls |> add_els n |> add_le 1091 | | Some o when eq o n -> recode |> patch_keep_list 1092 | | Some o -> 1093 | let rec els o n recode = 1094 | let ov = Set.choose o in 1095 | let _ = Set.remove ov o in 1096 | failwith "TODO" 1097 | in 1098 | recode |> patch_ls |> els o n |> patch_le 1099 | in 1100 | let dec d rt = 1101 | let rt = dec_trace_ls d rt in 1102 | let rec loop d acc rt = match pek_lexeme d with 1103 | | None -> err d (`Codec "unexpected end of input") 1104 | | Some Le -> ignore (dec_lexeme d); acc, (Le :: rt) 1105 | | Some (Ls | Atom _) -> 1106 | let v, rt = c.dec d rt in 1107 | if Set.mem v acc then err d (`Codec "not a set") else 1108 | loop d (Set.add v acc) rt 1109 | | Some (White _ | Comment _ as l) -> 1110 | ignore (dec_lexeme d); loop d acc (l :: rt) 1111 | in 1112 | loop d Set.empty rt 1113 | in 1114 | codec ~eq ~default enc dec 1115 | 1116 | type 'a atom = ('a -> string) * (string -> 'a) 1117 | 1118 | and 'a variant_const = string * 'a 1119 | and ('a, 'b) variant_valued = string * 'b c * ('b -> 'a) 1120 | 1121 | and 'a variant_proj = 1122 | | Pconst : 'a variant_const -> 'a variant_proj 1123 | | Pvalued : ('a, 'b) variant_valued * 'b -> 'a variant_proj 1124 | 1125 | and 'a variant_inj = 1126 | | Iconst : 'a variant_const -> 'a variant_inj 1127 | | Ivalued : ('a, 'b) variant_valued -> 'a variant_inj 1128 | 1129 | and 'a variant_projs = 'a -> 'a variant_proj 1130 | and 'a variant_injs = 'a variant_inj list 1131 | 1132 | and 'a repr = 1133 | | Ground : 'a atom -> 'a repr 1134 | | List : 'a c -> 'a list repr 1135 | | Pair : 'a c * 'b c -> ('a * 'b) repr 1136 | | Variant : 'a variant_projs * 'a variant_injs -> 'a repr 1137 | 1138 | and 'a c = 1139 | { name : string; 1140 | eq : 'a -> 'a -> bool; 1141 | default : 'a; 1142 | repr : 'a repr } 1143 | 1144 | let encode_value c e v = match v with 1145 | | None -> encode_lexeme e None 1146 | | Some v -> 1147 | let rec loop : type a. a c -> encoder -> a -> unit = (* FIXME not t.r. *) 1148 | fun c e v -> match c.repr with 1149 | | Ground (enc, _) -> encode_lexeme e (Some (Atom ((enc v), None))) 1150 | | List c -> 1151 | encode_lexeme e (Some Ls); 1152 | List.iter (loop c e) v; 1153 | encode_lexeme e (Some Le) 1154 | | Pair (c0, c1) -> 1155 | encode_lexeme e (Some Ls); 1156 | loop c0 e (fst v); 1157 | loop c1 e (snd v); 1158 | encode_lexeme e (Some Le) 1159 | | Variant (proj, _) -> 1160 | begin match proj v with 1161 | | Pconst (a, _) -> encode_lexeme e (Some (Atom (a, None))) 1162 | | Pvalued ((a, c, _), v) -> 1163 | encode_lexeme e (Some Ls); 1164 | loop c e v; 1165 | encode_lexeme e (Some Le); 1166 | end 1167 | in 1168 | loop c e v 1169 | 1170 | let rec decode_ground : type a. a c -> decoder -> (string -> a) -> a = 1171 | fun c d dec -> 1172 | let at, _ = dec_trace_atom d [] in 1173 | try dec at with 1174 | | e -> err d (`Codec ("error decoding " ^ c.name)) 1175 | 1176 | and decode_list : type a. a c -> decoder -> a list = 1177 | fun c d -> 1178 | let _ = dec_trace_ls d [] in 1179 | let rec els d acc = match pek_lexeme d with 1180 | | None -> err d (`Codec "unexpected end of input") 1181 | | Some Le -> ignore (dec_lexeme d); List.rev acc 1182 | | Some (Ls | Atom _) -> 1183 | let v = decode_value c d in 1184 | els d (v :: acc) 1185 | | Some (White _ | Comment _ as l) -> ignore (dec_lexeme d); els d acc 1186 | in 1187 | els d [] 1188 | 1189 | and decode_pair : type a b. a c -> b c -> decoder -> a * b = 1190 | fun c0 c1 d -> 1191 | let _ = dec_trace_ls d [] in 1192 | let v0 = decode_value c0 d in 1193 | let v1 = decode_value c1 d in 1194 | ignore (dec_trace_le d []); 1195 | v0, v1 1196 | 1197 | and decode_value : type a. a c -> decoder -> a = (* FIXME not t.r. *) 1198 | fun c d -> match c.repr with 1199 | | Ground (_, dec) -> decode_ground c d dec 1200 | | List c -> decode_list c d 1201 | | Pair (c0, c1) -> decode_pair c0 c1 d 1202 | | Variant (_, inj) -> 1203 | let has_sexp, _ = dec_trace_to_peek_sexp d [] in 1204 | if not has_sexp then err d (`Codec ("error decoding " ^ c.name)) else 1205 | begin match dec_lexeme d with 1206 | | Some Ls -> 1207 | let at, _ = dec_trace_atom d [] in 1208 | let rec dec_valued at = function 1209 | | Ivalued (id, c, inj) :: _ when id = at -> 1210 | let v = inj (decode_value c d) in 1211 | let _ = dec_trace_le d in 1212 | v 1213 | | _ :: injs -> dec_valued at injs 1214 | | [] -> err d (`Codec ("error decoding " ^ c.name)) 1215 | in 1216 | dec_valued at inj 1217 | | Some (Atom (at, _)) -> 1218 | let rec find_const at = function 1219 | | Iconst (id, v) :: _ when id = at -> v 1220 | | _ :: injs -> find_const at injs 1221 | | [] -> err d (`Codec ("error decoding " ^ c.name)) 1222 | in 1223 | find_const at inj 1224 | | _ -> assert false 1225 | end 1226 | 1227 | let decode_value c d = 1228 | try match pek_lexeme d with 1229 | | None -> ignore (dec_lexeme d); Ok None 1230 | | Some _ -> decode_value c d 1231 | with 1232 | | Err (e, r) -> Error (e, r) 1233 | 1234 | let c name ~eq ~default ~repr = { name; eq; default; repr } 1235 | 1236 | let c_int = 1237 | let eq = (( = ) : int -> int -> bool) in 1238 | let default = 0 in 1239 | let int_enc = string_of_int in 1240 | let int_dec = int_of_string in 1241 | c "integer" ~eq ~default ~repr:(Ground (int_enc, int_dec)) 1242 | 1243 | let c_t2 c0 c1 = 1244 | let eq (l0, l1) (r0, r1) = c0.eq l0 r0 && c1.eq l1 r1 in 1245 | let default = c0.default, c1.default in 1246 | c "pair" ~eq ~default ~repr:(Pair (c0, c1)) 1247 | 1248 | let c_result ~ok ~error = 1249 | let eq r0 r1 = match r0, r1 with 1250 | | Ok _, Error _ | Error _, Ok _ -> false 1251 | | Ok v0, Ok v1 -> ok.eq v0 v1 1252 | | Error e0, Error e1 -> error.eq e0 e1 1253 | in 1254 | let default = Ok ok.default in 1255 | let ok = "Ok", ok, fun v -> Ok v in 1256 | let error = "Error", error, fun v -> Error v in 1257 | let select = function 1258 | | Ok v -> Pvalued (ok, v) 1259 | | Error v -> Pvalued (error, v) 1260 | in 1261 | let result = Variant (select, [Ivalued ok; Ivalued error]) in 1262 | c "result" ~eq ~default ~repr:result 1263 | 1264 | let none = "None", None 1265 | let none_proj = Pconst none 1266 | let none_inj = Iconst none 1267 | let c_option some = 1268 | let eq o0 o1 = match o0, o1 with 1269 | | Some v0, Some v1 -> some.eq v0 v1 1270 | | None, None -> true 1271 | | Some _, None 1272 | | None, Some _ -> false 1273 | in 1274 | let default = None in 1275 | let some = "Some", some, fun v -> Some v in 1276 | let select = function 1277 | | None -> none_proj 1278 | | Some v -> Pvalued (some, v) 1279 | in 1280 | let option = Variant (select, [none_inj; Ivalued some]) in 1281 | c "option" ~eq ~default ~repr:option 1282 | 1283 | (* 1284 | type t = A | B | C of string 1285 | 1286 | let t cc = 1287 | let eq v0 v1 = v0 = v1 in 1288 | let default = A in 1289 | let a = "A", A in 1290 | let b = "B", B in 1291 | let cc = "C", cc, (fun v -> C v) in 1292 | let select = function 1293 | | A -> Pconst a 1294 | | B -> Pconst b 1295 | | C v -> Pvalued (cc, v) 1296 | in 1297 | let t = Variant (select, [Iconst a; Iconst b; Ivalued cc]) in 1298 | c "t" ~eq ~default ~repr:t 1299 | *) 1300 | 1301 | 1302 | 1303 | 1304 | 1305 | (*--------------------------------------------------------------------------- 1306 | Copyright (c) 2016 Daniel C. Bünzli 1307 | 1308 | Permission to use, copy, modify, and/or distribute this software for any 1309 | purpose with or without fee is hereby granted, provided that the above 1310 | copyright notice and this permission notice appear in all copies. 1311 | 1312 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 1313 | WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 1314 | MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 1315 | ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 1316 | WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 1317 | ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 1318 | OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 1319 | ---------------------------------------------------------------------------*) 1320 | -------------------------------------------------------------------------------- /src/sexpm.mli: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2016 Daniel C. Bünzli. All rights reserved. 3 | Distributed under the ISC license, see terms at the end of the file. 4 | %%NAME%% %%VERSION%% 5 | ---------------------------------------------------------------------------*) 6 | 7 | (** S-expression codecs and updates. 8 | 9 | [Sexpm] codecs sequences of s-expressions to different 10 | representations. Support is provided for: 11 | {ul 12 | {- Generic {{!custom_codec}custom} or {{!tagged_codec}tagged} 13 | s-expressions.} 14 | {- Typed OCaml {{!typed_codecs}values codecs} with layout preserving 15 | concrete syntax updates.} 16 | {- Low-level sequences of {{!lexeme_codec}s-expression lexemes} with 17 | concrete syntax layout preservation.}} 18 | Consult the {{!syntax}syntax} of s-expressions, 19 | {{!tips_lims}tips and limitations} and {{!examples}examples} of use. 20 | 21 | {e %%VERSION%% — {{:%%PKG_HOMEPAGE%% }homepage}} *) 22 | 23 | (** {1:io IO} *) 24 | 25 | type input = unit -> (Bytes.t * int * int) option 26 | (** The type for input byte streams. On [Some (b, pos, len)] the bytes 27 | of [b] from [pos] to [pos+len] are readable until the next call to 28 | the function. On [None] the end of stream is reached. *) 29 | 30 | val input_of_string : string -> input 31 | (** [input_of_string s] is a byte stream reading from [s]. *) 32 | 33 | val input_of_in_channel : ?bytes:Bytes.t -> in_channel -> input 34 | (** [input_of_in_channel ~bytes ic] is a byte stream reading from [ic] 35 | using [bytes] to read the data. 36 | 37 | @raise Invalid_argument if the length of [bytes] is [0]. *) 38 | 39 | type output = (Bytes.t * int * int) option -> unit 40 | (** The type for output byte streams. Given [Some (b, pos, len)] the bytes 41 | of [b] from [pos] to [pos+len] are readable until the function 42 | returns. On [None] the end of stream is signalled to the function. *) 43 | 44 | val output_of_buffer : Buffer.t -> output 45 | (** [output_to_buffer b] is an output stream writing to buffer [b]. *) 46 | 47 | val output_of_out_channel : out_channel -> output 48 | (** [output_of_out_channel oc] is an output stream writing to [oc]. 49 | It is the client's duty to flush it or close it. *) 50 | 51 | (** {1:decoder Decoders} *) 52 | 53 | type pos = int * int 54 | (** The type for character positions. A one-based line and zero-based 55 | column number. *) 56 | 57 | type range = pos * pos 58 | (** The type for character ranges. *) 59 | 60 | val pp_range : Format.formatter -> range -> unit 61 | (** [pp_range ppf r] prints [r] on [ppf] according to 62 | {{:https://www.gnu.org/prep/standards/standards.html#Errors}GNU 63 | conventions}. *) 64 | 65 | type error = 66 | [ `Illegal_bytes of string 67 | | `Illegal_uchar of Uchar.t 68 | | `Illegal_escape of 69 | [ `Exp_lbrace of Uchar.t 70 | | `Exp_rbrace of Uchar.t 71 | | `Exp_hex of Uchar.t 72 | | `Exp_hex_rbrace of Uchar.t 73 | | `Not_esc of Uchar.t 74 | | `Not_uchar of int ] 75 | | `Unclosed of [ `Quoted_token | `List | `Escape ] 76 | | `Unexpected_le 77 | | `Codec of string (* FIXME *) ] 78 | (** The type for decode errors. *) 79 | 80 | val pp_error : Format.formatter -> error -> unit 81 | (** [pp_error ppf e] prints an unspecified representation of [e] 82 | on [ppf]. *) 83 | 84 | type decoder 85 | (** The type for UTF-8 s-expression decoders. *) 86 | 87 | val decoder : ?layout:bool -> input -> decoder 88 | (** [decoder i] is a decoder decoding s-expressions from [i] the UTF-8 89 | encoded byte stream [i]. 90 | 91 | If [layout] is [true] (defaults to [false]), whitespace, comments and 92 | raw atoms are preserved by the underlying {{!lexeme} lexeme decoder}. This 93 | allows the layout of the decoded sequence's concrete syntax to be 94 | preserved. *) 95 | 96 | val decoder_layout : decoder -> bool 97 | (** [decoder_layout d] is the value of [layout], see {!decoder}. *) 98 | 99 | val decoded_range : decoder -> range 100 | (** [decoded_range d] is the range of characters spanning the last 101 | decoded lexeme or error. *) 102 | 103 | val decoded_sexp_range : decoder -> range 104 | (** [decoder_sexp_range d] is the range of characters spanning the 105 | last fully parsed s-expression. The value is accurate only after 106 | [Atom _] and [Le] lexemes. *) 107 | 108 | (** {1:encoders Encoders} *) 109 | 110 | type encoder 111 | (** The type for UTF-8 s-expression encoders. *) 112 | 113 | type encoder_style = [ `Pp | `Minify | `Raw ] 114 | (** The type for the encoder output style. 115 | {ul 116 | {- [`Pp], pretty printed output. If {{!lexeme}lexemes} 117 | are encoded, whitespace and raw atoms are ignored.} 118 | {- [`Minify], compact output. If {{!lexeme}lexemes} 119 | are encoded, whitespace and raw atoms lexemes are ignored.} 120 | {- [`Raw], raw output. Like [`Minify] except whitespace and 121 | raw atoms lexemes are not ignored.}} *) 122 | 123 | val encoder : 124 | ?bytes:Bytes.t -> ?nl:bool -> ?quote:bool -> ?style:encoder_style -> 125 | output -> encoder 126 | (** [encoder ~nl ~bytes ~minify o] is an encoder encoding s-expressions on [o]. 127 | {ul 128 | {- [style] indicates the {{!encoder_style}encoder style}, defaults 129 | to [`Pp].} 130 | {- [quote], quoting behaviour on non-raw atoms. If [true] (default) 131 | atoms are quoted iff they need escapes. If [false] no atoms except 132 | the empty string are quoted.} 133 | {- [nl] indicates whether the encoder makes sure a final line feed 134 | U+000A is encoded at the end of the sequence, defaults to [false].} 135 | {- [bytes] indicates the buffer to pass to [o].}} 136 | 137 | @raise Invalid_argument if the length of [bytes] is [0]. *) 138 | 139 | (** {1:custom_codec Custom s-expression codec} 140 | 141 | Codecs {{!sexp}sequences} of s-expressions by mapping them 142 | to a custom s-expression type. See an 143 | {{!custom_example}example.} *) 144 | 145 | val decode_custom : 146 | atom:(decoder -> string -> 'a) -> 147 | list:(decoder -> 'a list -> 'a) -> 148 | decoder -> ('a option, error * range) result 149 | (** [decode_custom ~atom ~list d] is [Some v], constructed using [atom] 150 | and [list] by folding over a single s-expression decoded from [d] 151 | or [None] if the end stream was reached. *) 152 | 153 | val encode_custom : 154 | ('a -> [ `Atom of string | `List of 'a list ]) -> 155 | encoder -> 'a option -> unit 156 | (** [encode_custom unfold e v] is encodes a single s-expression on [e] 157 | by unfolding [v] using [unfold]. Use with [None] to signal the 158 | end of the sequence. *) 159 | 160 | (** {1:tagged_codec Tagged s-expression codec} 161 | 162 | Codecs {{!sexp}sequences} of s-expressions to a generic 163 | tagged representation of s-expressions. *) 164 | 165 | type 'a t = [ `Atom of string | `List of 'a t list ] * 'a 166 | (** The type for s-expressions tagged with values of type ['a]. 167 | Either an unescaped, UTF-8 encoded, atom or a list of 168 | s-expressions. *) 169 | 170 | type 'a tagger = 171 | decoder -> [ `Atom of string | `List of 'a t list ] -> 'a t 172 | (** The type for s-expression taggers. Given the decoder and the last 173 | decoded s-expression, returns a tagged expression. *) 174 | 175 | val unit_tag : unit tagger 176 | (** [unit_tag] is [fun _ se -> se, ()]. *) 177 | 178 | val range_tag : range tagger 179 | (** [range_tag] is [fun d se -> se, Sexpm.decoded_sexp_range d]. *) 180 | 181 | val decode_tagged : 182 | tag:'a tagger -> decoder -> ('a t option, error * range) result 183 | (** [decode_tagged tag d] is an s-expression decoded from [d] and tagged 184 | according to [tag] or [None] if the end of stream was reached. *) 185 | 186 | val encode_tagged : encoder -> 'a t option -> unit 187 | (** [encode_tagged e se] encodes the s-expression [se] on [e]. 188 | Use with [None] to signal the end of sequence. *) 189 | 190 | (** {1:lexeme_codec Lexeme sequence codec} 191 | 192 | Codecs {{!sexp}sequences} of s-expressions as layout preserving, 193 | {{!lexeme}well-formed}, sequences of lexemes. *) 194 | 195 | type lexeme = 196 | | White of string 197 | | Comment of string 198 | | Ls | Le | Atom of string * string option (** *) 199 | (** The type for s-expression lexemes. 200 | 201 | {ul 202 | {- [White s] is {{!whitespace}whitespace}.} 203 | {- [Comment s] is a {{!comments}comment} without the starting 204 | semi-colon and ending new line.} 205 | {- [Ls]tart and [Le]nd delimit lists.} 206 | {- [Atom (a, raw)] is for the unescaped atom [a] that was parsed 207 | from the raw string [raw].}} 208 | 209 | On decoding, whitespace, comments and raw atoms are reported 210 | only if the decoder has {{!decoder_layout}layout}. No sequence of 211 | two [White] can be decoded from an input sequence. 212 | 213 | A {e well-formed} sequence of lexemes belongs to the language of the 214 | following {{:https://tools.ietf.org/html/rfc5234}RFC 5234} ABNF 215 | grammar defined on a sequence of lexemes. 216 | {v 217 | sexp-seq = *(White _ / Comment _ / sexp) 218 | sexp = Atom _ / Ls *(sexp-seq) Le 219 | v} 220 | A {!decoder} returns only well-formed sequences of lexemes even 221 | when {!error}s are returned (see {!error_recovery}). The lexemes 222 | strings are always UTF-8 encoded and in these strings characters 223 | originally escaped in the input are in their unescaped representation. 224 | [White _], [Commment _] and raw atom representations are only ever 225 | returned if the decoder has {{!decoder_layout}layout}. 226 | 227 | An {!encoder} accepts only well-formed sequences of lexemes or 228 | [Invalid_argument] is raised. The strings given to the lexemes 229 | are assumed to be UTF-8 encoded, this is {b not} checked by the 230 | module. For atoms if [raw] is given this raw string will be used 231 | to output the token it is the client's duty to make sure it is 232 | properly quoted and escaped. If [raw] is not provided the module 233 | will handle this. For comments if the string does not end with a 234 | new line and further lexemes are encoded a new line is added. *) 235 | 236 | val pp_lexeme : Format.formatter -> lexeme -> unit 237 | (** [pp_lexeme ppf l] prints an unspecified non-sexp representation of 238 | [l] on [ppf]. *) 239 | 240 | val decode_lexeme : decoder -> (lexeme option, error * range) result 241 | (** [decode_lexeme d] is: 242 | {ul 243 | {- [Ok (Some l)] if a lexeme [l] was decoded.} 244 | {- [Ok None] if the end of input was reached.} 245 | {- [Error e] if a decoding error occurred. If the client is 246 | interested in a best-effort decoding it can still continue to 247 | decode afer an error (see {!error_recovery}) altough the 248 | resulting sequence of lexemes is undefined and may not be 249 | well-formed.}} 250 | 251 | {b Note.} Repeated invocation eventually always returns [Ok None], even 252 | in case of error. *) 253 | 254 | val peek_lexeme : decoder -> (lexeme option, error * range) result 255 | (** [peek_lexeme d] is like {!decode_lexeme} but doesn't remove 256 | the lexeme from the input sequence. *) 257 | 258 | val encode_lexeme : encoder -> lexeme option -> unit 259 | (** [encode_lexeme e l] encodes [l] on [e]. [None] should always 260 | be eventually encoded. 261 | 262 | @raise Invalid_argument if a non {{!lexeme}well-formed} sequence 263 | of lexemes is encoded. 264 | 265 | {b Warning.} The encoder does not check the following properties 266 | and for the output to be valid it is the client's duty to make sure 267 | that in encoding: 268 | {ul 269 | {- [White s], [s] is only made of {{!whitespace}whitespace}} 270 | {- [Comment s], [s] is UTF-8 encoded, does not contain US-ASCII control 271 | characters except spaces or tabs and does not contain any end of 272 | line except possibly at the end.} 273 | {- [Atom (a, None)], [a] is UTF-8 encoded.} 274 | {- [Atom (_, Some r)], [r] is UTF-8 encoded and, if needed, properly 275 | quoted and escaped. {!escape} can be used for this.}} *) 276 | 277 | (** {1:escaping Atom escaping} *) 278 | 279 | val escape : ?quote:bool -> string -> string 280 | (** [escape ~quote s] escapes the UTF-8 encoded string [s] to an 281 | s-expression {{!atoms}token or quoted token}. If [quote] is [true] 282 | (default) the token is quoted iff [s] needs escapes or is the empty 283 | string. If [quote] is [false] the token is never quoted except if [s] 284 | is the empty string. *) 285 | 286 | (** {1:typed_codecs OCaml values codecs} *) 287 | 288 | type trace = lexeme list 289 | (** The type for the list of lexemes that described a value. *) 290 | 291 | type 'a codec 292 | (** The type for codecs for values of type ['a]. *) 293 | 294 | val codec_with_default : 'a -> 'a codec -> 'a codec 295 | (** [codec_with_default v c] is [c] with default value [c]. *) 296 | 297 | val codec_with_codec : 298 | ?eq:('a -> 'a -> bool) -> ?default:'a -> 'a codec -> 'a codec 299 | (** [codec_with_codec ~eq ~default c] is [c] with equality [eq] (unchanged 300 | if unspecified) and default value [c] *) 301 | 302 | (** {2:codec_values Encoding and decoding values} *) 303 | 304 | type 'a c 305 | 306 | val encode_value : 'a c -> encoder -> 'a option -> unit 307 | 308 | (* 309 | val decode_value : 'a codec -> decoder -> ('a option, error * range) result 310 | val encode_values : 'a codec -> encoder -> 'a list -> unit 311 | val decode_values : 'a codec -> encoder -> ('a list, error * range) result 312 | *) 313 | 314 | val encode_traced_value : 315 | 'a codec -> encoder -> ('a * 'a) option * trace -> unit 316 | 317 | val decode_traced_value : 318 | 'a codec -> decoder -> ('a option * trace, error * range) result 319 | 320 | (** {2:std_type_codecs OCaml standard library type codecs} *) 321 | 322 | val unit : unit codec 323 | val bool : bool codec 324 | val int : int codec 325 | val float : float codec 326 | val float_fmt : (float -> string, unit, string) format -> float codec 327 | val float_hex : float codec 328 | val string : string codec 329 | val result : ok:'a codec -> error:'b codec -> ('a, 'b) result codec 330 | val t2 : 'a codec -> 'b codec -> ('a * 'b) codec 331 | val list : 'a codec -> 'a list codec 332 | val set : 333 | (module Set.S with type elt = 'a and type t = 'b) -> 'a codec -> 'b codec 334 | 335 | (** {1:syntax S-expression syntax} 336 | 337 | We define the syntax of s-expressions over a sequence of 338 | {{:http://unicode.org/glossary/#unicode_scalar_value}Unicode 339 | characters} in which all US-ASCII control characters except 340 | {{!whitespace}whitespace} are forbidden in unescaped form. 341 | 342 | {2:sexp S-expressions and sequences thereof} 343 | 344 | A {e s-expression} is either an {{!atoms}{e atom}} or a 345 | {{!lists}{e list}} of s-expressions interspaced with 346 | {{!whitespace}{e whitespace}} and {{!comments}{e comments}}. A {e 347 | sequence of s-expressions} is simply a succession of s-expressions 348 | interspaced with whitespace and comments. 349 | 350 | These elements are informally described below and finally made 351 | precise via an ABNF {{!grammar}grammar}. 352 | 353 | {2:whitespace Whitespace} 354 | 355 | Whitespace is a sequence of whitespace characters, namely, space 356 | (U+0020), tab (U+0009), line feed (U+000A) or carriage return 357 | (U+000D). 358 | 359 | {2:comments Comments} 360 | 361 | Unless escaped or inside quoted tokens (see below) anything that 362 | follows a semicolon (U+003B) is ignored until the next {e end of line}, 363 | that is either a line feed 364 | (U+000A), a carriage return (U+000D) or a carriage return and a line 365 | feed (). 366 | {v 367 | (this is not a comment) ; This is a comment 368 | (this is not a comment) 369 | v} 370 | 371 | {2:atoms Atoms} 372 | 373 | An atom represents ground data as a string of Unicode characters. 374 | It can, via escapes, represent any sequence of Unicode characters, 375 | including control characters and U+0000. It cannot represent an 376 | arbitrary byte sequence except via a client-defined encoding 377 | convention (e.g. Base64, OCaml byte escape codes, etc.). 378 | 379 | An atom is specified either via a token or a quoted token. Quoted 380 | tokens can represent atoms with spaces, parentheses and semicolons 381 | without having to escape them. 382 | {v 383 | abc ; a token for the atom "abc" 384 | abc\;\ \(d ; a token for the atom "abc; (d" 385 | "abc" ; a quoted token for the atom "abc" 386 | "abc; (d" ; a quoted token for the atom "abc; (d" 387 | "" ; the quoted token for the atom "" 388 | v} 389 | 390 | A token and its quoted form represent the same atom. All atoms 391 | except the empty atom [""] can be represented in both forms. 392 | 393 | Escapes are introduced by a backslash. Double quotes and 394 | backslashes have to be escaped everywhere except in comments. 395 | {v 396 | "\u\{1F42B\}\n\"\\" ; atom with U+1F42B, a new line, a quote and a backslash 397 | \u\{1F42B\}\n\"\\ ; idem 398 | v} 399 | The following escape sequences are recognized: 400 | {ul 401 | {- ["\""] () for double quote (U+0022) (mandatory)} 402 | {- ["\\"] () for backslash (U+005C) (mandatory)} 403 | {- ["\("] () for left parenthesis (U+0028)} 404 | {- ["\)"] () for right parenthesis (U+0029)} 405 | {- ["\;"] () for semicolon (U+003B)} 406 | {- ["\ "] () for space (U+0020)} 407 | {- ["\t"] () for tab (U+0009)} 408 | {- ["\n"] () for line feed (U+000A)} 409 | {- ["\r"] () for carriage return (U+000D)} 410 | {- ["\u{X}"] with [X] is from 1 to at most 6 upper or lower case 411 | hexadecimal digits standing for the corresponding 412 | {{:http://unicode.org/glossary/#unicode_scalar_value}Unicode character} 413 | U+X.} 414 | {- Any other character except line feed (U+000A) or carriage return 415 | (U+000D), following 416 | a backslash is an illegal sequence of characters.}} 417 | 418 | Tokens and quoted tokens can be split across lines by using a backslash 419 | (U+005C) followed by a line feed (), a carriage return 420 | (U+000D) or a carriage return and a line feed 421 | (). Any space (U+0020) or tab (U+0009) character 422 | at the beginning of the new line is ignored. 423 | {v 424 | "\ 425 | a\ 426 | \ " ; the atom "a " 427 | a\ 428 | \ ; the atom "a " 429 | v} 430 | 431 | {2:lists Lists} 432 | 433 | Lists are delimited by left and right parentheses (U+0028, U+0029) 434 | and their elements are s-expressions separated by optional 435 | whitespace and comments. For example: 436 | {v 437 | (a list (of four) expressions) 438 | (a list(of four)expressions) 439 | ("a"list("of"four)expressions) 440 | (a list (of ; This is a comment 441 | four) expressions) 442 | () ; the empty list 443 | v} 444 | 445 | {2:grammar S-expression grammar} 446 | 447 | The following {{:https://tools.ietf.org/html/rfc5234}RFC 5234} 448 | ABNF grammar is defined on a sequence of 449 | {{:http://unicode.org/glossary/#unicode_scalar_value}Unicode characters}. 450 | {v 451 | sexp-seq = *(ws / comment / sexp) 452 | sexp = atom / list 453 | list = %x0028 sexp-seq %x0029 454 | atom = token / qtoken 455 | token = (t-char / escape) *(t-char / escape / cont) 456 | qtoken = %x0022 *(q-char / escape / cont) %x0022 457 | escape = %x005C (%x0020 / %x0022 / %x0028 / %x0029 / 0x003B / %x005C / 458 | %x006E / %x0072 / %x0074 / %x0075 %x007B unum %x007D) 459 | unum = 1*6(HEXDIG) 460 | cont = %x005C nl 461 | ws = *(ws-char) 462 | comment = %x003B *(c-char) nl 463 | nl = %x000A / %x000D / %x000D %x000A 464 | t-char = %x0021 / %x0023-0027 / %x002A-%x003A / %x003C-005B / 465 | %x005D-D7FF / %xE000-10FFFF 466 | q-char = t-char / ws-char / %x0028 / %x0029 / %x003B 467 | ws-char = %x0020 / %x0009 / %x000A / %x000D 468 | c-char = %x0009 / %x0020-D7FF / %xE000-10FFFF 469 | v} 470 | A few additional constraints not expressed by the grammar: 471 | {ul 472 | {- If an initial {{:http://unicode.org/glossary/#byte_order_mark}BOM 473 | character} (U+FEFF) is present it is discarded.} 474 | {- On parsing [token] and [qtoken], any [%x0020] and [%x0009] following 475 | a [cont] is discarded and not part of the parsed atom.} 476 | {- [unum] once interpreted as an hexadecimal number must be a 477 | {{:http://unicode.org/glossary/#unicode_scalar_value}Unicode scalar 478 | value.}} 479 | {- A comment can be ended by the end of the character sequence.}} *) 480 | 481 | (** {1:error_recovery Error recovery} 482 | 483 | After a decoding error, if best-effort decoding is performed, the 484 | following happens before continuing: 485 | {ul 486 | {- [`Illegal_bytes _], [`Illegal_escape _], [`Illegal_uchar _], a Unicode 487 | replacement character (U+FFFD) is substituted to the illegal byte 488 | sequence, bogus escape or Unicode character} 489 | {- [`Unexpected_le], the right parenthesis is skipped.} 490 | {- [`Unclosed `Escape], the escape is skipped and the atom returned.} 491 | {- [`Unclosed `Quoted_token], the token is closed and the atom returned.} 492 | {- [`Unclosed `List], a [Le] token is returned.}} 493 | Due to the U+FFFD insertions layout cannot be assumed to be 494 | preserved when error recovery is performed, however outputing an 495 | erroring stream of lexemes results in a valid sequence of s-expressions. *) 496 | 497 | (** {1:tips_lims Tips and limitations} 498 | 499 | {ul 500 | {- Usage of decoding and encoding functions of the various 501 | representations on decoders and encoders is not mutually exclusive. 502 | However if you are for example using the lexeme decoder you need 503 | to make sure you are at the beginning of a sexp before invoking 504 | the higher-level decoders.} 505 | {- Concrete syntax layout preservation will in general be slower.}} *) 506 | 507 | (** {1:examples Examples} 508 | 509 | {2:custom_example Custom representation} 510 | 511 | The following codecs a sequence of s-expressions to a custom 512 | definition of s-expressions. Note that for illustration purposes 513 | this representation is exactly the tagged s-expression 514 | representation of the {{!tagged_example}next example}. 515 | {[ 516 | type sexp = [ `Atom of string | `List of sexp list ] * Sexpm.range 517 | 518 | let of_string : string -> (sexp list, Sexpm.error * Sexpm.range) result = 519 | fun s -> 520 | let d = Sexpm.(decoder @@ input_of_string s) in 521 | let atom d a = `Atom a, Sexpm.decoded_sexp_range d in 522 | let list d l = `List l, Sexpm.decoded_sexp_range d in 523 | let rec loop acc = match Sexpm.decode_custom ~atom ~list d with 524 | | Ok Some v -> loop (v :: acc) 525 | | Ok None -> Ok (List.rev acc) 526 | | Error _ as e -> e 527 | in 528 | loop [] 529 | 530 | let to_string : ?style:Sexpm.encoder_style -> sexp list -> string 531 | fun ses -> 532 | let unfold = fst in 533 | let b = Buffer.create 256 in 534 | let d = Sexpm.(encoder ?style @@ output_to_buffer b) in 535 | let rec loop = function 536 | | se :: ses -> Sexpm.encode_custom unfold d (Some se); loop ses 537 | | [] -> Sexpm.encode_custom d None; Buffer.contents b 538 | in 539 | loop ses 540 | ]} 541 | 542 | {2:tagged_example Tagged representation} 543 | 544 | {[ 545 | type sexp = Sexpm.range Sexpm.t 546 | 547 | let of_string : string -> (sexp list, Sexpm.error * Sexpm.range) result = 548 | fun s -> 549 | let d = Sexpm.(decoder @@ input_of_string s) in 550 | let rec loop acc = match Sexpm.decode_tagged ~tag:Sexpm.range_tag d with 551 | | Ok Some v -> loop (v :: acc) 552 | | Ok None -> Ok (List.rev acc) 553 | | Error _ as e -> e 554 | in 555 | loop [] 556 | 557 | let to_string : ?style:Sexpm.encoder_style -> sexp list -> string 558 | fun ses -> 559 | let b = Buffer.create 256 in 560 | let d = Sexpm.(encoder ?style @@ output_to_buffer b) in 561 | let rec loop = function 562 | | se :: ses -> Sexpm.encode_tagged d (Some se); loop ses 563 | | [] -> Sexpm.encode_tagged d None; Buffer.contents b 564 | in 565 | loop ss 566 | ]} 567 | *) 568 | 569 | (*--------------------------------------------------------------------------- 570 | Copyright (c) 2016 Daniel C. Bünzli 571 | 572 | Permission to use, copy, modify, and/or distribute this software for any 573 | purpose with or without fee is hereby granted, provided that the above 574 | copyright notice and this permission notice appear in all copies. 575 | 576 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 577 | WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 578 | MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 579 | ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 580 | WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 581 | ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 582 | OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 583 | ---------------------------------------------------------------------------*) 584 | -------------------------------------------------------------------------------- /src/sexpm.mllib: -------------------------------------------------------------------------------- 1 | Sexpm 2 | -------------------------------------------------------------------------------- /test/sexptrip.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2017 Daniel C. Bünzli. All rights reserved. 3 | Distributed under the ISC license, see terms at the end of the file. 4 | %%NAME%% %%VERSION%% 5 | ---------------------------------------------------------------------------*) 6 | 7 | open Bos_setup 8 | 9 | (* Input *) 10 | 11 | type input = 12 | { buf_size : int; 13 | inf : Fpath.t; 14 | input_string : bool; 15 | layout : bool; } 16 | 17 | let with_input_decoder i f v = 18 | let wrap input () = 19 | let dec = Sexpm.decoder ~layout:i.layout input in 20 | f i dec v 21 | in 22 | match i.input_string with 23 | | true -> 24 | OS.File.read i.inf 25 | >>= fun s -> Ok (wrap (Sexpm.input_of_string s) ()) 26 | | false -> 27 | let bytes = Bytes.create i.buf_size in 28 | OS.File.with_input ~bytes i.inf wrap () 29 | 30 | (* Output *) 31 | 32 | type output = 33 | { buf_size : int; 34 | outf : Fpath.t; 35 | output_string : bool; 36 | style : Sexpm.encoder_style; 37 | lex : bool; } 38 | 39 | let with_output_encoder o f v = 40 | let wrap output () = 41 | let bytes = Bytes.create o.buf_size in 42 | let enc = Sexpm.encoder ~bytes ~style:o.style output in 43 | let r = f o enc v in 44 | r 45 | in 46 | match o.output_string with 47 | | true -> 48 | let buf = Buffer.create o.buf_size in 49 | let r = wrap (Sexpm.output_of_buffer buf) () in 50 | OS.File.write o.outf (Buffer.contents buf) >>| fun () -> r 51 | | false -> 52 | OS.File.with_output o.outf wrap () 53 | 54 | let with_output_formatter o f v = match o.output_string with 55 | | true -> 56 | let buf = Buffer.create o.buf_size in 57 | let ppf = Format.formatter_of_buffer buf in 58 | let r = f o ppf v in 59 | (Format.pp_print_flush ppf (); OS.File.write o.outf (Buffer.contents buf)) 60 | >>= fun () -> Ok (Ok r) 61 | | false -> 62 | let wrap oc () = 63 | let ppf = Format.formatter_of_out_channel oc in 64 | let r = f o ppf v in 65 | (Format.pp_print_flush ppf (); Ok r) 66 | in 67 | OS.File.with_oc o.outf wrap () 68 | 69 | (* Pretty printers *) 70 | 71 | let pp_decode i dec ppf = function 72 | | Ok None -> 73 | Format.fprintf ppf "%a:%a: EOI@," 74 | Fpath.pp i.inf Sexpm.pp_range (Sexpm.decoded_range dec) 75 | | Ok (Some l) -> 76 | Format.fprintf ppf "%a:%a: @[%a@]@," 77 | Fpath.pp i.inf Sexpm.pp_range (Sexpm.decoded_range dec) 78 | Sexpm.pp_lexeme l 79 | | Error (e, r) -> 80 | Format.fprintf ppf "%a:%a: @[%a@]@," 81 | Fpath.pp i.inf Sexpm.pp_range (Sexpm.decoded_range dec) 82 | Sexpm.pp_error e 83 | 84 | (* Decode *) 85 | 86 | let decode err_ppf i = 87 | let decode i dec () = 88 | let rec loop () = match Sexpm.decode_lexeme dec with 89 | | Ok None -> () 90 | | Ok (Some l) -> loop () 91 | | Error _ as err -> pp_decode i dec err_ppf err; loop () 92 | in 93 | Format.(fprintf err_ppf "@["; loop (); fprintf err_ppf "@]@?") 94 | in 95 | with_input_decoder i decode () 96 | 97 | (* Lex *) 98 | 99 | let lex i o = 100 | let lex i dec o ppf () = 101 | let rec loop () = match Sexpm.decode_lexeme dec with 102 | | Ok None as d -> pp_decode i dec ppf d 103 | | d -> pp_decode i dec ppf d; loop () 104 | in 105 | Format.(fprintf ppf "@["; loop (); fprintf ppf "@]@?") 106 | in 107 | R.join @@ with_input_decoder i 108 | (fun i dec () -> R.join @@ with_output_formatter o (lex i dec) ()) () 109 | 110 | (* Trip *) 111 | 112 | let trip err_ppf i o = 113 | let trip i dec o enc () = 114 | let rec loop () = match Sexpm.decode_lexeme dec with 115 | | Ok None -> Sexpm.encode_lexeme enc None 116 | | Ok d -> Sexpm.encode_lexeme enc d; loop () 117 | | Error _ as err -> pp_decode i dec err_ppf err; loop () 118 | in 119 | Format.(fprintf err_ppf "@["; loop (); fprintf err_ppf "@]@?"; Ok ()) 120 | in 121 | R.join @@ with_input_decoder i 122 | (fun i dec () -> R.join @@ with_output_encoder o (trip i dec) ()) () 123 | 124 | 125 | (* Random encode *) 126 | 127 | type rand = 128 | { seed : int; 129 | count : int; (* number of sexps to generate. *) 130 | maxd : int; (* max depth of lists. *) 131 | maxl : int; (* max length of lists. *) 132 | maxa : int; (* max length of atoms. *) } 133 | 134 | let r_ascii_lowercase_letter () = 135 | Uchar.unsafe_of_int (0x0061 (* a *) + Random.int 26) 136 | 137 | let r_general_scripts_minus_ctrl () = 138 | Uchar.unsafe_of_int (0x0020 + Random.int 0x1FE0) (* < U+2000 *) 139 | 140 | let r_white encode r buf = 141 | Buffer.clear buf; 142 | for i = 0 to Random.int 3 do match Random.int 100 with 143 | | n when n < 90 -> Buffer.add_char buf ' ' 144 | | n when n < 94 -> Buffer.add_char buf '\t' 145 | | n when n < 98 -> Buffer.add_char buf '\n' 146 | | n when n < 100 -> Buffer.add_char buf '\r' 147 | | n -> assert false 148 | done; 149 | encode (Sexpm.White (Buffer.contents buf)) 150 | 151 | let r_comment encode r buf = 152 | Buffer.clear buf; 153 | for i = 0 to Random.int 15 do 154 | Uutf.Buffer.add_utf_8 buf (r_general_scripts_minus_ctrl ()) 155 | done; 156 | encode (Sexpm.Comment (Buffer.contents buf)) 157 | 158 | let r_atom encode r buf = 159 | Buffer.clear buf; 160 | for i = 0 to Random.int (r.maxa + 1) 161 | do Uutf.Buffer.add_utf_8 buf (r_ascii_lowercase_letter ()) done; 162 | encode (Sexpm.Atom (Buffer.contents buf, None)) 163 | 164 | let rec r_list encode r buf depth count = 165 | let depth = Random.int (depth + 1) in 166 | encode (Sexpm.Ls); 167 | let count = r_sexp_seq encode r buf depth (count - 1) in 168 | encode (Sexpm.Le); 169 | count 170 | 171 | and r_sexp_seq encode r buf depth count = 172 | let rec loop len count = match len = 0 || count = 0 with 173 | | true -> count 174 | | false -> 175 | begin match Random.int 100 with 176 | | n when n < 80 -> () 177 | | n when n < 90 -> r_white encode r buf 178 | | n when n < 100 -> r_comment encode r buf 179 | | n -> assert false 180 | end; 181 | match depth = 0 || Random.bool () with 182 | | true -> r_atom encode r buf; loop (len - 1) (count - 1) 183 | | false -> 184 | let count = r_list encode r buf (depth - 1) count in 185 | loop (len - 1) count 186 | in 187 | loop (Random.int (r.maxl + 1)) count 188 | 189 | let random o r = 190 | let random o e () = 191 | let buf = Buffer.create 255 in 192 | let encode l = Sexpm.encode_lexeme e (Some l) in 193 | let rec loop = function 194 | | 0 -> Sexpm.encode_lexeme e None; Ok () 195 | | n -> loop (r_sexp_seq encode r buf r.maxd n) 196 | in 197 | loop r.count 198 | in 199 | R.join @@ with_output_encoder o random () 200 | 201 | (* Main command *) 202 | 203 | let do_cmd (i, o) map rand = function 204 | | `Decode -> decode Format.err_formatter i 205 | | `Random -> random o rand 206 | | `Trip when o.lex -> lex i o 207 | | `Trip -> trip Format.err_formatter i o 208 | 209 | (* Command line interface *) 210 | 211 | open Cmdliner 212 | 213 | let nat_arg = 214 | let kind = "positive integer "in 215 | let parse s = match Arg.parser_of_kind_of_string ~kind String.to_int s with 216 | | Ok n when n > 0 -> `Ok n 217 | | Ok n -> `Error (strf "%d is not positive" n) 218 | | Error (`Msg e) -> `Error e 219 | in 220 | Arg.pconv (parse, Format.pp_print_int) (* FIXME use Arg.conv *) 221 | 222 | let fpath_arg = 223 | let parse s = match Fpath.of_string s with 224 | | Ok v -> `Ok v 225 | | Error (`Msg e) -> `Error e 226 | in 227 | Arg.pconv (parse, Fpath.pp) (* FIXME use Arg.conf *) 228 | 229 | let buf_size = 230 | let doc = "Size of internal buffers." in 231 | Arg.(value & opt nat_arg 65536 & info ["buffer-size"] ~doc ~docv:"INT") 232 | 233 | (* Input *) 234 | 235 | let inf = 236 | let doc = "The input file. Reads from standard input if unspecified." in 237 | Arg.(value & pos 0 fpath_arg OS.File.dash & info [] ~doc ~docv:"FILE") 238 | 239 | let input_section = "INPUT OPTIONS" 240 | let docs = input_section 241 | 242 | let input_string = 243 | let doc = "Input everything in a string and decode the string." in 244 | Arg.(value & flag & info ["input-string"] ~doc ~docs) 245 | 246 | let no_layout = 247 | let doc = "Do not preserve concrete syntax layout on decode." in 248 | Arg.(value & flag & info ["no-layout"] ~doc ~docs) 249 | 250 | let input = 251 | let input buf_size inf input_string no_layout = 252 | { buf_size; inf; input_string; layout = not no_layout } 253 | in 254 | Term.(const input $ buf_size $ inf $ input_string $ no_layout) 255 | 256 | (* Output options. *) 257 | 258 | let output_section = "OUTPUT OPTIONS" 259 | let docs = output_section 260 | 261 | let lex = 262 | let doc = "Output the decoded s-expression lexemes or errors, one per line 263 | with their position." 264 | in 265 | Arg.(value & flag & info ["lex"] ~docs ~doc) 266 | 267 | let style = 268 | let doc = "Minify output." in 269 | let minify = `Minify, Arg.info ["m"; "minify"] ~docs ~doc in 270 | let doc = "Pretty-print output." in 271 | let pp = `Pp, Arg.info ["p"; "pretty-print"] ~docs ~doc in 272 | Arg.(value & vflag `Raw [minify; pp]) 273 | 274 | let output_string = 275 | let doc = "Encode everything in a string and output the string." in 276 | Arg.(value & flag & info ["output-string"] ~doc ~docs) 277 | 278 | let outf = 279 | let doc = "Output result to $(docv)." in 280 | Arg.(value & opt (some fpath_arg) None & info ["o"] ~docs ~doc ~docv:"FILE") 281 | 282 | let in_place = 283 | let doc = "Atomically write output, on success, to the file specified as 284 | input." 285 | in 286 | Arg.(value & flag & info ["i"; "in-place"] ~docs ~doc) 287 | 288 | let io = 289 | let err_excl o0 o1 = 290 | R.error_msgf "option %s and %s are mutually exclusive" o0 o1 291 | in 292 | let io input lex output_string style outf in_place = 293 | if in_place && lex then err_excl "--in-place" "--lex" else 294 | let buf_size = (input : input).buf_size in 295 | let outf = match outf with 296 | | None when in_place -> 297 | if Fpath.equal input.inf OS.File.dash 298 | then R.error_msg ("option --in-place cannot be used with stdin") 299 | else Ok input.inf 300 | | Some _ when in_place -> err_excl "--in-place" "-o" 301 | | None -> Ok (OS.File.dash) 302 | | Some f -> Ok f 303 | in 304 | outf >>| fun outf -> 305 | (input, { buf_size; lex; output_string; style; outf }) 306 | in 307 | Term.(cli_parse_result 308 | (const io $ input $ lex $ output_string $ style $ outf $ in_place)) 309 | 310 | (* Map lookup and update. *) 311 | 312 | let map_section = "S-EXPRESSION MAP LOOKUP AND UPDATE OPTIONS" 313 | let docs = map_section 314 | let docv = "KEY" 315 | 316 | type map = 317 | { allow_undef : bool; 318 | exists : string option; 319 | delete : string option; 320 | get : string option; 321 | set : string option; 322 | value : string option; } 323 | 324 | let allow_undef = 325 | let doc = "For $(b,--get), $(b,--set) and $(b,--delete), do not error if 326 | the key $(i,KEY) is undefined." in 327 | Arg.(value & flag & info ["u"; "allow-undefined"] ~docs ~doc) 328 | 329 | let exists = 330 | let doc = "Check for the existence of key $(docv). See section EXIT CODE." in 331 | Arg.(value & opt (some string) None & info ["e"; "exists"] ~docs ~doc ~docv) 332 | 333 | let delete = 334 | let doc = "Output the input without the key $(docv) and its value." in 335 | Arg.(value & opt (some string) None & info ["d"; "delete"] ~docs ~doc ~docv) 336 | 337 | let get = 338 | let doc = "Output the value of key $(docv) of the input." in 339 | Arg.(value & opt (some string) None & info ["g"; "get"] ~docs ~doc ~docv) 340 | 341 | let set = 342 | let doc = "Output the input with the key $(docv) set to the value specified 343 | by $(b,--value)." 344 | in 345 | Arg.(value & opt (some string) None & info ["s"; "set"] ~docs ~doc ~docv) 346 | 347 | let value = 348 | let doc = "s-expression $(docv) bound to the $(i,KEY) of $(b,--set). 349 | If unspecified a s-expression is read from standard input." 350 | in 351 | let docv = "SEXP" in 352 | Arg.(value & opt (some string) None & info ["v"; "value"] ~docs ~doc ~docv) 353 | 354 | let map = 355 | let map allow_undef exists delete get set value = 356 | { allow_undef; exists; delete; get; set; value; } 357 | in 358 | Term.(const map $ allow_undef $ exists $ delete $ get $ set $ value) 359 | 360 | (* Random generation parameters. *) 361 | 362 | let rand_section = "RANDOM S-EXPRESSIONS OPTIONS" 363 | let docs = rand_section 364 | 365 | let rseed = 366 | let doc = "Random seed." in 367 | Arg.(value & opt (some nat_arg) None & info ["rseed"] ~docs ~doc ~docv:"INT") 368 | 369 | let rcount = 370 | let doc = "Number of random s-expressions (atom or list) generated." in 371 | let c = 100 in 372 | Arg.(value & opt int c & info ["rcount"] ~docs ~doc ~docv:"INT") 373 | 374 | let maxd = 375 | let doc = "Maximal depth of random s-expressions lists." in 376 | let d = 5 in 377 | Arg.(value & opt int d & info ["max-depth"] ~docs ~doc ~docv:"INT") 378 | 379 | let maxl = 380 | let doc = "Maximal length of random s-expression lists." in 381 | let l = 20 in 382 | Arg.(value & opt int l & info ["max-list-len"] ~docs ~doc ~docv:"INT") 383 | 384 | let maxa = 385 | let doc = "Maximal length of random s-expression atoms." in 386 | let l = 20 in 387 | Arg.(value & opt int l & info ["max-atom-len"] ~docs ~doc ~docv:"INT") 388 | 389 | let rand = 390 | let rand seed count maxd maxl maxa = 391 | let seed = match seed with 392 | | None -> Random.self_init (); Random.int (1 lsl 30 - 1) 393 | | Some s -> s 394 | in 395 | { seed; count; maxd; maxl; maxa } 396 | in 397 | Term.(const rand $ rseed $ rcount $ maxd $ maxl $ maxa) 398 | 399 | (* Command *) 400 | 401 | let cmd = 402 | let doc = "Decode only, no encoding." in 403 | let dec = `Decode, Arg.info ["decode"] ~doc in 404 | let doc = 405 | strf "Output random s-expressions, no decoding. See section %s." 406 | rand_section 407 | in 408 | let rand = `Random, Arg.info ["random"] ~doc in 409 | Arg.(value & vflag `Trip [dec; rand]) 410 | 411 | let doc = "operate on sequences of s-expressions" 412 | let exits = 413 | (Term.exit_info 1 ~doc:"the input was malformed") :: 414 | (Term.exit_info 2 ~doc:"the key does not exist, see the s-expression \ 415 | map options.") :: 416 | Term.default_exits 417 | 418 | let man = 419 | [ `S "DESCRIPTION"; 420 | `P "$(mname) is a command line tool to operate on sequences of 421 | s-expressions."; 422 | `P "Use '$(mname)' to decode and recode s-expressions."; `Noblank; 423 | `P "Use '$(mname) --get KEY' to get a value in a s-expression."; `Noblank; 424 | `P "Use '$(mname) --set KEY' to set a value in a s-expression."; `Noblank; 425 | `P "Use '$(mname) --random' to generate random s-expressions."; 426 | `S Manpage.s_options; 427 | `S input_section; 428 | `P "These options control the input process."; 429 | `S output_section; 430 | `P "These options control the output process."; 431 | `S map_section; 432 | `P "These options query or modify the input by interpreting 433 | s-expressions as key-value maps."; 434 | `S rand_section; 435 | `P "These options control the s-expressions randomly generated by 436 | the $(b,--random) option."; 437 | `S "S-EXPRESSION SYNTAX"; 438 | `P "A more gentle introduction to s-expression is given 439 | in the API documentation available at:"; 440 | `Pre " %%PKG_DOC%%/Sexpm.html#syntax or $(b,odig doc sexpm)"; 441 | `P "The following is the parsed syntax of s-expressions expressed 442 | as a RFC 5234 ABNF grammar:"; 443 | `Pre "\ 444 | sexp-seq = *(ws / comment / sexp) 445 | sexp = atom / list 446 | list = %x0028 sexp-seq %x0029 447 | atom = token / qtoken 448 | token = (t-char / escape) *(t-char / escape / cont) 449 | qtoken = %x0022 *(q-char / escape / cont) %x0022 450 | escape = %x005C (%x0020 / %x0022 / %x0028 / %x0029 / 0x003B / %x005C / 451 | %x006E / %x0072 / %x0074 / %x0075 %x007B unum %x007D) 452 | unum = 1*6(HEXDIG) 453 | cont = %x005C nl 454 | ws = *(ws-char) 455 | comment = %x003B *(c-char) nl 456 | nl = %x000A / %x000D / %x000D %x000A 457 | t-char = %x0021 / %x0023-0027 / %x002A-%x003A / %x003C-005B / 458 | %x005D-D7FF / %xE000-10FFFF 459 | q-char = t-char / ws-char / %x0028 / %x0029 / %x003B 460 | ws-char = %x0020 / %x0009 / %x000A / %x000D 461 | c-char = %x0009 / %x0020-D7FF / %xE000-10FFFF 462 | "; 463 | `S "BUGS"; 464 | `P "This program is distributed with the sexpm OCaml library. 465 | See %%HOMEPAGE%% for contact information." ] 466 | 467 | let cmd = 468 | Term.(term_result (const do_cmd $ io $ map $ rand $ cmd)), 469 | Term.info "sexptrip" ~version:"%%VERSION%%" ~doc ~exits ~man 470 | 471 | let () = Term.(exit @@ eval cmd) 472 | 473 | (*--------------------------------------------------------------------------- 474 | Copyright (c) 2017 Daniel C. Bünzli 475 | 476 | Permission to use, copy, modify, and/or distribute this software for any 477 | purpose with or without fee is hereby granted, provided that the above 478 | copyright notice and this permission notice appear in all copies. 479 | 480 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 481 | WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 482 | MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 483 | ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 484 | WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 485 | ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 486 | OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 487 | ---------------------------------------------------------------------------*) 488 | -------------------------------------------------------------------------------- /test/test.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2016 Daniel C. Bünzli. All rights reserved. 3 | Distributed under the ISC license, see terms at the end of the file. 4 | %%NAME%% %%VERSION%% 5 | ---------------------------------------------------------------------------*) 6 | 7 | let strf = Format.asprintf 8 | let log f = Format.printf (f ^^ "@?") 9 | let fail fmt = 10 | let fail _ = failwith (Format.flush_str_formatter ()) in 11 | Format.kfprintf fail Format.str_formatter fmt 12 | 13 | let pp_range ppf ((sc, sl), (ec, el)) = 14 | Format.fprintf ppf "(%d,%d),(%d,%d)" sc sl ec el 15 | 16 | let pp_decode ppf = function 17 | | Ok None -> Format.fprintf ppf "Ok None" 18 | | Ok Some l -> Format.fprintf ppf "Ok (Some (%a))" Sexpm.pp_lexeme l 19 | | Error (e, r) -> 20 | Format.fprintf ppf "Error (%a,(%a))" Sexpm.pp_error e pp_range r 21 | 22 | let test_decode fnd exp = match fnd, exp with 23 | | Ok _, Ok _ 24 | | Error _, Ok _ 25 | | Ok _, Error _ when fnd <> exp -> 26 | fail "found: %a expected: %a" pp_decode fnd pp_decode exp 27 | | Error (e0, _), Error (e1, _) when e0 <> e1 -> 28 | fail "found: %a expected: %a" pp_decode fnd pp_decode exp 29 | | _, _ -> () 30 | 31 | let test_dseq ?layout src seq = 32 | let d = Sexpm.(decoder ?layout (input_of_string src)) in 33 | let rec loop d = function 34 | | l :: ls -> test_decode (Sexpm.decode_lexeme d) l; loop d ls 35 | | [] -> 36 | match Sexpm.decode_lexeme d with 37 | | Ok None -> () 38 | | d -> fail "decoder not at the end of stream (%a)" pp_decode d 39 | in 40 | loop d seq 41 | 42 | let test_eseq ?nl ?quote ?style seq result = 43 | let b = Buffer.create 255 in 44 | let o = Sexpm.output_of_buffer b in 45 | let e = Sexpm.(encoder ?nl ?quote ?style o) in 46 | let rec loop = function 47 | | (Ok l) :: ls -> Sexpm.encode_lexeme e l; loop ls 48 | | (Error _) :: ls -> assert false 49 | | [] -> 50 | Sexpm.encode_lexeme e None; 51 | let r = Buffer.contents b in 52 | if r <> result then fail "encoded : %S\n\ 53 | expected: %S" r result 54 | in 55 | loop seq 56 | 57 | let test_trip src = 58 | let b = Buffer.create 255 in 59 | let o = Sexpm.output_of_buffer b in 60 | let e = Sexpm.(encoder ~style:`Raw o ) in 61 | let d = Sexpm.(decoder ~layout:true (input_of_string src)) in 62 | let rec loop () = match Sexpm.decode_lexeme d with 63 | | Ok (Some _ as dec) -> Sexpm.encode_lexeme e dec; loop () 64 | | Error _ -> assert false 65 | | Ok None -> 66 | Sexpm.encode_lexeme e None; 67 | let r = Buffer.contents b in 68 | if r <> src then fail "src : %S\n\ 69 | trip: %S" src r 70 | in 71 | loop () 72 | 73 | let ok v = Ok (Some v) 74 | let w w = ok (Sexpm.White w) 75 | let c c = ok (Sexpm.Comment c) 76 | let a a r = ok (Sexpm.Atom (a, r)) 77 | let ls = ok Sexpm.Ls 78 | let le = ok Sexpm.Le 79 | let e err = Error (err, ((0,0),(0,0))) 80 | 81 | let decode_whites () = 82 | log "Decoder whites.\n"; 83 | let test_white s = 84 | test_dseq ~layout:false s []; 85 | test_dseq ~layout:true s [ w s ]; 86 | test_dseq ~layout:false (strf "%sbu" s) [a "bu" None]; 87 | test_dseq ~layout:false (strf "bu%s" s) [a "bu" None]; 88 | test_dseq ~layout:true (strf "%sbu" s) [w s; a "bu" (Some "bu")]; 89 | test_dseq ~layout:true (strf "bu%s" s) [a "bu" (Some "bu"); w s]; 90 | test_dseq ~layout:false (strf "(%s)" s) [ls; le]; 91 | test_dseq ~layout:true (strf "(%s)" s) [ls; w s; le]; 92 | test_dseq ~layout:true (strf "(%s)%s" s s) [ls; w s; le; w s] 93 | in 94 | test_white " "; 95 | test_white "\t"; 96 | test_white "\n"; 97 | test_white "\r"; 98 | test_white "\r\n"; 99 | test_white " \n \t\t"; 100 | test_white " \n "; 101 | test_white " \n "; 102 | () 103 | 104 | let decode_comments () = 105 | log "Decoder comments.\n"; 106 | let test_comment ?(context = true) s = 107 | let cs = String.sub s 1 (String.length s - 1) in 108 | test_dseq ~layout:false s []; 109 | test_dseq ~layout:true s [ c cs ]; 110 | test_dseq ~layout:true (strf " %s\n " s) [ w " "; c cs; w "\n " ]; 111 | test_dseq ~layout:true (strf " %s\r\n" s) [ w " "; c cs; w "\r\n" ]; 112 | test_dseq ~layout:true (strf " %s" s) [ w " "; c cs ]; 113 | test_dseq ~layout:true (strf " a%s" s) [ w " "; a "a" (Some "a"); c cs ]; 114 | test_dseq ~layout:true (strf " a\\\n%s" s) 115 | [ w " "; a "a" (Some "a\\\n"); c cs ]; 116 | test_dseq ~layout:true (strf " a\\\n%s\nb" s) 117 | [ w " "; a "a" (Some "a\\\n"); c cs; w "\n"; a "b" (Some "b") ]; 118 | test_dseq ~layout:true (strf "(a%s\nb)" s) 119 | [ ls; a "a" (Some "a"); c cs; w "\n"; a "b" (Some "b"); le ]; 120 | in 121 | test_dseq ~layout:true ";" [ c "" ]; 122 | test_comment "; adfj"; 123 | test_comment ";; adfj"; 124 | test_comment "; adfj \" ; ()"; 125 | test_comment ";; adfj \" ; ()"; 126 | test_comment ";; adfj \\n \" ; ()"; 127 | test_comment ";; adfj \\u{} \" ; ()"; 128 | test_comment ";; adfj \\u{23422342324} \" ; ()"; 129 | () 130 | 131 | let decode_atoms () = 132 | log "Decoder atoms.\n"; 133 | let test_atom s at = 134 | test_dseq ~layout:false s [ a at None ]; 135 | test_dseq ~layout:true s [ a at (Some s) ]; 136 | test_dseq ~layout:true (strf "\n%s;b" s) [ w "\n"; a at (Some s); c "b" ]; 137 | test_dseq ~layout:false (strf "\n%s;b" s) [ a at None ]; 138 | test_dseq ~layout:true (strf "()%s()" s) [ ls; le; a at (Some s); ls; le ]; 139 | in 140 | test_dseq ~layout:true "bla\\" [ a "bla" (Some "bla\\")]; 141 | test_dseq ~layout:true "bla\\\r\n\\" [ a "bla" (Some "bla\\\r\n\\") ]; 142 | test_dseq ~layout:true "bla\"bla\"" 143 | [ a "bla" (Some "bla");a "bla" (Some "\"bla\"") ]; 144 | test_atom "\"\"" ""; 145 | test_atom "bla" "bla"; 146 | test_atom "b\\\nla" "bla"; 147 | test_atom "b\\\n \tla" "bla"; 148 | test_atom "b\\nla" "b\nla"; 149 | test_atom "\"bla\"" "bla"; 150 | test_atom "\"b la\"" "b la"; 151 | test_atom "\"b\\\n la\"" "bla"; 152 | test_atom "\"b\\\n la\"" "bla"; 153 | test_atom "\"b\\nla\"" "b\nla"; 154 | test_atom "\"b\\\n\\nla\"" "b\nla"; 155 | test_atom "\\ " " "; 156 | test_atom "\\\"" "\""; 157 | test_atom "\\(" "("; 158 | test_atom "\\)" ")"; 159 | test_atom "\\;" ";"; 160 | test_atom "\\\\" "\\"; 161 | test_atom "\\n" "\n"; 162 | test_atom "\\r" "\r"; 163 | test_atom "\\t" "\t"; 164 | test_atom "\\ \\\n\\ \\\n\\n" " \n"; 165 | test_atom "\\u{0000}" "\x00"; 166 | test_atom "\\u{0001}" "\x01"; 167 | test_atom "\\u{D7FF}" "\xED\x9F\xBF"; 168 | test_atom "\\u{E000}" "\xEE\x80\x80"; 169 | test_atom "\\u{1F42B}" "\xF0\x9F\x90\xAB"; 170 | test_atom "\\u{10FFFF}" "\xF4\x8F\xBF\xBF"; 171 | () 172 | 173 | let decode_lists () = 174 | log "Decoder lists.\n"; 175 | test_dseq ~layout:true "()()(a)" [ls; le; ls; le; ls; a "a" (Some "a"); le]; 176 | test_dseq ~layout:true "()a()" [ls; le; a "a" (Some "a"); ls; le]; 177 | test_dseq ~layout:true "(((a)))" [ls; ls; ls; a "a" (Some "a"); le; le; le]; 178 | test_dseq ~layout:true "(())()" [ls; ls; le; le; ls; le]; 179 | () 180 | 181 | let u_rep_utf8 = "\xEF\xBF\xBD" 182 | let decode_errors () = 183 | log "Decoder errors.\n"; 184 | test_dseq ~layout:false "\xFF" 185 | [e (`Illegal_bytes "\xFF"); a u_rep_utf8 None]; 186 | test_dseq ~layout:false "a\xFFb" 187 | [e (`Illegal_bytes "\xFF"); a (strf "a%sb" u_rep_utf8) None ]; 188 | test_dseq ~layout:false "a\x00b" 189 | [e (`Illegal_uchar (Uchar.of_int 0x0000)); a (strf "a%sb" u_rep_utf8) None]; 190 | test_dseq ~layout:false "a \\" 191 | [a "a" None; e (`Unclosed `Escape); a "" None]; 192 | test_dseq ~layout:false "a\\u" 193 | [e (`Unclosed `Escape); a "a" None ]; 194 | test_dseq ~layout:false "\\A" 195 | [e (`Illegal_escape (`Not_esc (Uchar.of_int 0x41))); 196 | a u_rep_utf8 None ]; 197 | test_dseq ~layout:false "\\\n" 198 | [e (`Illegal_escape (`Not_esc (Uchar.of_int 0x0A))); 199 | a u_rep_utf8 None ]; 200 | test_dseq ~layout:false "\\uA" 201 | [e (`Illegal_escape (`Exp_lbrace (Uchar.of_int 0x41))); 202 | a u_rep_utf8 None ]; 203 | test_dseq ~layout:false "\\u{}" 204 | [e (`Illegal_escape (`Exp_hex (Uchar.of_int 0x7D))); 205 | a u_rep_utf8 None ]; 206 | test_dseq ~layout:false "\\u{+" 207 | [e (`Illegal_escape (`Exp_hex (Uchar.of_int 0x2B))); 208 | a u_rep_utf8 None ]; 209 | test_dseq ~layout:false "\\u{A+}" 210 | [e (`Illegal_escape (`Exp_hex_rbrace (Uchar.of_int 0x2B))); 211 | a (strf "%s}" u_rep_utf8) None ]; 212 | test_dseq ~layout:false "\\u{1234567}" 213 | [e (`Illegal_escape (`Exp_rbrace (Uchar.of_int 0x37))); 214 | a (strf "%s}" u_rep_utf8) None ]; 215 | test_dseq ~layout:false "\\u{110000}" 216 | [e (`Illegal_escape (`Not_uchar 0x110000)); a u_rep_utf8 None ]; 217 | test_dseq ~layout:false "\\u{D800}" 218 | [e (`Illegal_escape (`Not_uchar 0xD800)); a u_rep_utf8 None ]; 219 | test_dseq ~layout:false "\\u{DFFF}" 220 | [e (`Illegal_escape (`Not_uchar 0xDFFF)); a u_rep_utf8 None ]; 221 | test_dseq ~layout:false "())()" 222 | [ls; le; e (`Unexpected_le); ls; le ]; 223 | test_dseq ~layout:false "()(" 224 | [ls; le; ls; e (`Unclosed `List); le ]; 225 | test_dseq ~layout:false "()((\"" 226 | [ls; le; ls; ls; e (`Unclosed `Quoted_token); a "" None; 227 | e (`Unclosed `List); le; e (`Unclosed `List); le; ]; 228 | test_dseq ~layout:false "(\"\\u{0000" 229 | [ls; e (`Unclosed `Escape); e (`Unclosed `Quoted_token); a "" None; 230 | e (`Unclosed `List); le; ]; 231 | () 232 | 233 | let encode_raw () = 234 | log "Raw encoder.\n"; 235 | let test_eseq ?quote = test_eseq ?quote ~style:`Raw in 236 | test_eseq ~quote:true [ a "" None; a "bla" None] "\"\"bla"; 237 | test_eseq ~quote:false [ a "" None; a "bla" None] "\"\"bla"; 238 | test_eseq ~quote:true [ a "" None; a "b la" None] "\"\"\"b la\""; 239 | test_eseq ~quote:false [ a "" None; a "b la" None] "\"\"b\\ la"; 240 | test_eseq [ a "bla" None; a "bla" None] "bla bla"; 241 | test_eseq ~quote:true [ a "b la" None; a "bla" None] "\"b la\"bla"; 242 | test_eseq ~quote:true [ a "bla" None; a "b la" None] "bla\"b la\""; 243 | test_eseq ~quote:true [ a "b la" None; a "b la" None] "\"b la\"\"b la\""; 244 | test_eseq ~quote:false [ a "b la" None; a "bla" None] "b\\ la bla"; 245 | test_eseq ~quote:false [ a "b la" None; a "b la" None] "b\\ la b\\ la"; 246 | test_eseq [ a "bla" None; c ";"; a "bla" None] "bla;;\nbla"; 247 | test_eseq [ a "bla" None; c ";" ] "bla;;"; 248 | test_eseq [ c ""; a "bla" None; ] ";\nbla"; 249 | test_eseq [ c ""; ls; le] ";\n()"; 250 | test_eseq [ ls; c ""; le] "(;\n)"; 251 | test_eseq [ ls; le; c ""] "();"; 252 | test_eseq [ ls; c "a"; w "\r\n"; le ] "(;a\r\n)"; 253 | test_eseq [ ls; c "a"; w "\n\r"; le ] "(;a\n\r)"; 254 | () 255 | 256 | let encode_min () = 257 | log "Min encoder.\n"; 258 | let test_eseq ?quote = test_eseq ?quote ~style:`Minify in 259 | test_eseq [ a "bla" None; a "bla" None] "bla bla"; 260 | test_eseq [ a "bla" None; a "bla" None; w " "; c "bla"] "bla bla"; 261 | test_eseq [ ls; le; w " "; ls; c "bla"; a "a" None; le; a "a" None] "()(a)a"; 262 | test_eseq [ a "" None; a "bla" None] "\"\"bla"; 263 | () 264 | 265 | let trip () = 266 | log "Trips.\n"; 267 | test_trip "\\u{10FFFF}"; 268 | test_trip "()\n\r \\u{10FFFF}"; 269 | test_trip "( () () a a \" a \" \n (bla ))"; 270 | () 271 | 272 | let escape () = 273 | log "Sexpm.escape.\n"; 274 | let escape s qesc esc = 275 | let qesc' = Sexpm.escape ~quote:true s in 276 | let esc' = Sexpm.escape ~quote:false s in 277 | if qesc' <> qesc then fail "found: %S exp: %S" qesc' qesc; 278 | if esc' <> esc then fail "found: %S exp: %S" esc' esc 279 | in 280 | escape "" "\"\"" "\"\""; 281 | escape " " "\" \"" "\\ "; 282 | escape ";" "\";\"" "\\;"; 283 | escape "(" "\"(\"" "\\("; 284 | escape ")" "\")\"" "\\)"; 285 | for i = 0x0000 to 0x001F do match i with 286 | | 0x0009 -> escape "\t" "\"\\t\"" "\\t" 287 | | 0x000A -> escape "\n" "\"\\n\"" "\\n" 288 | | 0x000D -> escape "\r" "\"\\r\"" "\\r" 289 | | u -> 290 | escape (strf "%c" @@ Char.chr u) 291 | (strf "\"\\u{%02X}\"" u) (strf "\\u{%02X}" u) 292 | done; 293 | escape "\x00hey ho ; () bla \n" 294 | "\"\\u{00}hey ho ; () bla \\n\"" 295 | "\\u{00}hey\\ ho\\ \\;\\ \\(\\)\\ bla\\ \\n"; 296 | () 297 | 298 | let test () = 299 | decode_whites (); 300 | decode_comments (); 301 | decode_atoms (); 302 | decode_lists (); 303 | decode_errors (); 304 | encode_raw (); 305 | encode_min (); 306 | trip (); 307 | escape (); 308 | log "All tests suceeded.\n" 309 | 310 | let () = if not (!Sys.interactive) then test () 311 | 312 | (*--------------------------------------------------------------------------- 313 | Copyright (c) 2016 Daniel C. Bünzli 314 | 315 | Permission to use, copy, modify, and/or distribute this software for any 316 | purpose with or without fee is hereby granted, provided that the above 317 | copyright notice and this permission notice appear in all copies. 318 | 319 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 320 | WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 321 | MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 322 | ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 323 | WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 324 | ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 325 | OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 326 | ---------------------------------------------------------------------------*) 327 | -------------------------------------------------------------------------------- /test/test_codec.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2016 Daniel C. Bünzli. All rights reserved. 3 | Distributed under the ISC license, see terms at the end of the file. 4 | %%NAME%% %%VERSION%% 5 | ---------------------------------------------------------------------------*) 6 | 7 | let strf = Format.asprintf 8 | let log f = Format.printf (f ^^ "@.") 9 | let fail fmt = 10 | let fail ppf = Format.fprintf ppf "@."; failwith "test" in 11 | Format.kfprintf fail Format.err_formatter fmt 12 | 13 | let trip c src = 14 | let b = Buffer.create 255 in 15 | let o = Sexpm.output_of_buffer b in 16 | let e = Sexpm.(encoder ~style:`Raw o ) in 17 | let d = Sexpm.(decoder ~layout:true (input_of_string src)) in 18 | let rec loop () = match Sexpm.decode_traced_value c d with 19 | | Error (e, _) -> fail "%a" Sexpm.pp_error e 20 | | Ok (Some v, t) -> 21 | Sexpm.encode_traced_value c e (Some (v, v), t); loop () 22 | | Ok (None, t) -> 23 | Sexpm.encode_traced_value c e (None, t); 24 | let r = Buffer.contents b in 25 | if r <> src then fail "src : %S@\n\ 26 | trip: %S" src r 27 | in 28 | loop () 29 | 30 | let test_trip () = 31 | log "Testing trips."; 32 | trip Sexpm.unit ";hu\nunit ; ha\nunit"; 33 | trip Sexpm.bool "true \"false\" ; hey\n true ; blue"; 34 | trip Sexpm.int "1 \"2\" ; hey\n \"0b11\" ; blue"; 35 | trip Sexpm.(t2 int bool) "(2 false)\n( 3 true )(\"4\" false)\n;ha\n "; 36 | let r = Sexpm.(result ~ok:int ~error:(t2 int bool)) in 37 | trip r " (\\u{4f}k 3) (Error ;hey\n (2 true);ho\n)"; 38 | let l = Sexpm.(list (t2 int int)) in 39 | trip l " ((3 2) (6 3) ; bla blo blu ()\n (4 2)) ()"; 40 | () 41 | 42 | let upd c src o n = 43 | let dst = Printf.sprintf src n in 44 | let src = Printf.sprintf src o in 45 | let b = Buffer.create 255 in 46 | let o = Sexpm.output_of_buffer b in 47 | let e = Sexpm.(encoder ~style:`Raw o) in 48 | let deco = Sexpm.(decoder ~layout:true (input_of_string src)) in 49 | let decn = Sexpm.(decoder ~layout:true (input_of_string dst)) in 50 | let rec loop () = 51 | match Sexpm.decode_traced_value c deco, 52 | Sexpm.decode_traced_value c decn with 53 | | Ok (Some o, t), Ok (Some n, _) -> 54 | Sexpm.encode_traced_value c e (Some (o, n), t); loop () 55 | | Ok (None, t), Ok (None, _) -> 56 | Sexpm.encode_traced_value c e (None, t); 57 | let r = Buffer.contents b in 58 | if r <> dst then fail "dst : %S@\n\ 59 | updt: %S" dst r 60 | | Error (e, _), _ -> fail "%a" Sexpm.pp_error e 61 | | _, Error (e, _) -> fail "%a" Sexpm.pp_error e 62 | | _ -> fail "fail" 63 | in 64 | loop (); 65 | () 66 | 67 | let test_update () = 68 | log "Testing update."; 69 | upd Sexpm.bool " %s; hehyo" "true" "false"; 70 | upd Sexpm.bool " %s; hehyo" "false" "true"; 71 | let r = Sexpm.(result ~ok:int ~error:(t2 int bool)) in 72 | upd r " (\\u{4f}k %s)" "2" "3"; 73 | upd r " (\\u{4f}k %s) (Error ;hey\n (2 ;bla\n true);ho\n)" "2" "3"; 74 | upd r " (\\u{4f}k 1) (Error ;hey\n (%s ;bla\n true);ho\n)" "2" "3"; 75 | upd r " (\\u{4f}k 1) (Error ;hey\n (2 ;bla\n %s);ho\n)" "false" "true"; 76 | upd r " (\\u{4f}k 1) ;heyho\n %s;hu\n " 77 | "(Error ;hey\n (2 true);bla\n)" "(Ok 3)"; 78 | let l = Sexpm.(list (t2 int int)) in 79 | upd l " (%s) " "(2 ;a\n 3) (4 3)" "(2 ;a\n 4) (5 6)"; 80 | upd l " (%s) " "(2 ;a\n 3) (4 3) (3 2) ;bla\n" "(2 ;a\n 4) (5 6) ;bla\n"; 81 | upd l " (%s) " "(2 ;a\n 3) (4 3) ;bla\n(3 2)" "(2 ;a\n 4) (5 6) ;bla\n"; 82 | upd l " (%s) " 83 | "(2 ;a\n 3) ;bla\n(3 2);hey\n" "(2 ;a\n 4) ;bla\n(5 6);hey\n(3 2)"; 84 | () 85 | 86 | let test () = 87 | test_trip (); 88 | test_update (); 89 | log "All tests suceeded.\n" 90 | 91 | let () = if not (!Sys.interactive) then test () 92 | 93 | (*--------------------------------------------------------------------------- 94 | Copyright (c) 2016 Daniel C. Bünzli 95 | 96 | Permission to use, copy, modify, and/or distribute this software 97 | for any purpose with or without fee is hereby granted, provided 98 | that the above copyright notice and this permission notice appear 99 | in all copies. 100 | 101 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 102 | WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 103 | MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 104 | ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 105 | WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 106 | ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 107 | OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 108 | ---------------------------------------------------------------------------*) 109 | --------------------------------------------------------------------------------