├── .gitignore ├── .merlin ├── .ocp-indent ├── B0.ml ├── BRZO ├── CHANGES.md ├── DEVEL.md ├── LICENSE.md ├── NOTES.md ├── README.md ├── TODO.md ├── _tags ├── abnf ├── json.abnf ├── sexp.abnf └── toml.abnf ├── doc └── index.mld ├── opam ├── pkg ├── META └── pkg.ml ├── src ├── serialkit.mllib ├── serialkit_json.ml ├── serialkit_json.mli ├── serialkit_sexp.ml ├── serialkit_sexp.mli ├── serialkit_text.ml ├── serialkit_text.mli ├── serialkit_toml.ml └── serialkit_toml.mli ├── test ├── expect │ ├── test.sexp │ └── test.toml ├── test.ml └── test_toml.ml └── tool ├── cmd_main.ml ├── cmd_main.mli ├── cmd_sexp.ml ├── cmd_sexp.mli ├── cmd_toml.ml ├── cmd_toml.mli ├── std.ml └── std.mli /.gitignore: -------------------------------------------------------------------------------- 1 | _b0 2 | _build 3 | tmp 4 | *~ 5 | \.\#* 6 | \#*# 7 | *.install 8 | *.native 9 | *.byte -------------------------------------------------------------------------------- /.merlin: -------------------------------------------------------------------------------- 1 | PKG cmdliner b0.kit 2 | S src 3 | S test 4 | S tool 5 | B _b0/** 6 | -------------------------------------------------------------------------------- /.ocp-indent: -------------------------------------------------------------------------------- 1 | strict_with=always,match_clause=4,strict_else=never 2 | -------------------------------------------------------------------------------- /B0.ml: -------------------------------------------------------------------------------- 1 | open B0_kit.V000 2 | open Result.Syntax 3 | 4 | (* OCaml library names *) 5 | 6 | let cmdliner = B0_ocaml.libname "cmdliner" 7 | let serialkit = B0_ocaml.libname "serialkit" 8 | 9 | (* Libraries *) 10 | 11 | let serialkit_lib = 12 | let srcs = [ `Dir (Fpath.v "src") ] in 13 | let name = "serialkit-lib" in 14 | B0_ocaml.lib ~name serialkit ~doc:"serialkit library" ~srcs ~requires:[] 15 | 16 | (* Tools *) 17 | 18 | let serialkit_tool = 19 | let srcs = Fpath.[`Dir (v "tool")] in 20 | let requires = [cmdliner; serialkit] in 21 | B0_ocaml.exe "serialkit" ~public:true ~doc:"serialkit tool" ~srcs ~requires 22 | 23 | (* Tests *) 24 | 25 | let test_spec = 26 | let srcs = Fpath.[`File (v "test/test.ml")] in 27 | let requires = [serialkit] in 28 | B0_ocaml.exe "test" ~doc:"Tests" ~srcs ~requires 29 | 30 | let test_toml = 31 | let srcs = Fpath.[`File (v "test/test_toml.ml")] in 32 | let requires = [serialkit] in 33 | B0_ocaml.exe "test-toml" ~doc:"TOML Tests" ~srcs ~requires 34 | 35 | (* Expectation tests *) 36 | 37 | let expect_serialk_runs ctx = 38 | let runs cmd = (* command, output suffix *) 39 | [ Cmd.(arg cmd % "locs"), ".locs"; ] 40 | in 41 | let test_run ctx serialk file (cmd, ext) = 42 | let cwd = B0_expect.base ctx and stdout = Fpath.(file -+ ext) in 43 | B0_expect.stdout ctx ~cwd ~stdout Cmd.(serialk %% cmd) 44 | in 45 | let test_file ctx serialk file = 46 | let cmd = String.subrange ~first:1 (Fpath.get_ext file) in 47 | List.iter (test_run ctx serialk file) (runs cmd) 48 | in 49 | let serialk = B0_expect.get_unit_exe_file_cmd ctx serialkit_tool in 50 | let test_files = 51 | let base_files = B0_expect.base_files ctx ~rel:true ~recurse:false in 52 | let input f = match Fpath.get_ext ~multi:true f with 53 | | ".json" | ".sexp" | ".toml" | ".cbor" | ".xml" -> true 54 | | _ -> false 55 | in 56 | List.filter input base_files 57 | in 58 | List.iter (test_file ctx serialk) test_files 59 | 60 | let expect = 61 | B0_unit.of_action' 62 | "expect" ~units:[serialkit_tool] ~doc:"Test expectations" @@ 63 | B0_expect.action_func ~base:(Fpath.v "test/expect") @@ fun ctx -> 64 | expect_serialk_runs ctx; 65 | () 66 | 67 | (* Packs *) 68 | 69 | let default = 70 | let meta = 71 | B0_meta.empty 72 | |> ~~ B0_meta.authors ["The serialkit programmers"] 73 | |> ~~ B0_meta.maintainers ["Daniel Bünzli "] 74 | |> ~~ B0_meta.homepage "https://erratique.ch/software/serialkit" 75 | |> ~~ B0_meta.online_doc "https://erratique.ch/software/serialkit/doc" 76 | |> ~~ B0_meta.licenses ["ISC"] 77 | |> ~~ B0_meta.repo "git+https://erratique.ch/repos/serialkit.git" 78 | |> ~~ B0_meta.issues "https://github.com/dbuenzli/serialkit/issues" 79 | |> ~~ B0_meta.description_tags 80 | ["codec"; "json"; "sexp"; "toml"; "query"; "org:erratique"; ] 81 | |> ~~ B0_opam.build 82 | {|[["ocaml" "pkg/pkg.ml" "build" "--dev-pkg" "%{dev}%"]]|} 83 | |> ~~ B0_opam.depends [ 84 | "ocaml", {|>= "4.14.0"|}; 85 | "ocamlfind", {|build|}; 86 | "ocamlbuild", {|build|}; 87 | "topkg", {|build & >= "1.0.3"|}; 88 | "cmdliner", {|>= "1.1.0"|}] 89 | |> B0_meta.tag B0_release.tag 90 | |> B0_meta.tag B0_opam.tag 91 | in 92 | B0_pack.make "default" ~doc:"serialkit package" ~meta ~locked:true @@ 93 | B0_unit.list () 94 | -------------------------------------------------------------------------------- /BRZO: -------------------------------------------------------------------------------- 1 | (srcs-x pkg test B0.ml tool) -------------------------------------------------------------------------------- /CHANGES.md: -------------------------------------------------------------------------------- 1 | vX.Y.Z YYYY-MM-DD Location 2 | -------------------------- 3 | 4 | First release. 5 | -------------------------------------------------------------------------------- /DEVEL.md: -------------------------------------------------------------------------------- 1 | # Expectation tests 2 | 3 | To add a new test add a `.{json,sexp,toml,xml,cbor}` file in `test/expect`, 4 | run the tests and add the new generated files to the repo. 5 | 6 | ```sh 7 | b0 -- expect 8 | b0 -- expect --help 9 | ``` 10 | 11 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright (c) 2019 The serialkit programmers 2 | 3 | Permission to use, copy, modify, and/or distribute this software for any 4 | purpose with or without fee is hereby granted, provided that the above 5 | copyright notice and this permission notice appear in all copies. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 8 | WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 9 | MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 10 | ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 11 | WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 12 | ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 13 | OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 14 | -------------------------------------------------------------------------------- /NOTES.md: -------------------------------------------------------------------------------- 1 | Layout preserving edits via AST 2 | ------------------------------- 3 | 4 | Goal, have a natural, high-level data structure for processing. But 5 | still allow user layout preserving edits. 6 | 7 | * In sexpm the idea was to patch/diff sequences of lexemes with a lexeme 8 | for whitespace. Not sure this is a good approach. It was not fully 9 | pursued but it seems hard to reconcline in the high-level API. 10 | 11 | * In serialk for now we keep very precise AST location and we do raw string 12 | surgery. Though this can be encapsulated in higher-level abstractions it 13 | remains rather brittle and subject to bugs in location tracking. In the 14 | latter case it makes it easy to generate syntax errors. 15 | 16 | * A better approach may be to define a notion of whitespace 17 | attachement and replace the `Sexp.loc` type in the `Sexp.t` case by 18 | a `Sexp.src` which has both the location and the attached whitespace 19 | (and the info whether an atom was quoted or not). Now we can we have 20 | a `preserve`ing output procedure that spits out the attached 21 | whitespace when it serializes. 22 | 23 | Whitespace attachement likely remains before/after lexeme based. The only 24 | problem is how to disambuigate them so that they are uniquely attached: 25 | ``` 26 | (a b c ; here's a comment 27 | ) 28 | ``` 29 | Tentative: 30 | 1. Before/After '(' -> list before/after lsep 31 | 2. Before ')' -> list before rsep 32 | 3. Before atom -> before atom 33 | 34 | * Look into `ocamlformat` 35 | 36 | * Having syntactically inexisting lists e.g. toplevel sexp list and or 37 | sexp key bindings as defined as serialk make things more difficult. 38 | But they seem more natural to the end user. 39 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | serialkit — Serialization formats toolkit for OCaml 2 | =================================================== 3 | %%VERSION%% 4 | 5 | Serialkit provides easy to use OCaml APIs to query, update and 6 | generate data in generic serialization data formats. 7 | 8 | The supported data formats are JSON text, s-expressions and TOML. 9 | 10 | Serialkit is distributed under the ISC license. It has no dependencies. 11 | 12 | Homepage: 13 | 14 | ## Installation 15 | 16 | serialkit can be installed with `opam`: 17 | 18 | opam install serialkit 19 | opam install serialkit cmdliner # for the serialkit tool 20 | 21 | If you don't use `opam` consult the [`opam`](opam) file for build 22 | instructions. 23 | 24 | ## Documentation 25 | 26 | The documentation can be consulted [online] or via `odig doc serialkit`. 27 | 28 | Questions are welcome but better asked on the [OCaml forum] than on 29 | the issue tracker. 30 | 31 | [online]: https://erratique.ch/software/serialkit/doc 32 | [OCaml forum]: https://discuss.ocaml.org/ 33 | -------------------------------------------------------------------------------- /TODO.md: -------------------------------------------------------------------------------- 1 | Change `opt_mem` to 2 | 3 | val mem' : absent:'a -> string -> 'a t -> 'a t 4 | -------------------------------------------------------------------------------- /_tags: -------------------------------------------------------------------------------- 1 | true : bin_annot, safe_string 2 | : include 3 | : include 4 | : package(cmdliner) 5 | : package(cmdliner) 6 | <_b0> : -traverse -------------------------------------------------------------------------------- /abnf/json.abnf: -------------------------------------------------------------------------------- 1 | ;; As found in https://www.rfc-editor.org/rfc/rfc8259 2 | 3 | JSON-text = ws value ws 4 | 5 | begin-array = ws %x5B ws ; [ left square bracket 6 | begin-object = ws %x7B ws ; { left curly bracket 7 | end-array = ws %x5D ws ; ] right square bracket 8 | end-object = ws %x7D ws ; } right curly bracket 9 | name-separator = ws %x3A ws ; : colon 10 | value-separator = ws %x2C ws ; , comma 11 | 12 | ws = *(%x20 / ; Space 13 | %x09 / ; Horizontal tab 14 | %x0A / ; Line feed or New line 15 | %x0D ) ; Carriage return 16 | 17 | ; Values 18 | 19 | value = false / null / true / object / array / number / string 20 | false = %x66.61.6c.73.65 ; false 21 | null = %x6e.75.6c.6c ; null 22 | true = %x74.72.75.65 ; true 23 | 24 | ; Objects 25 | 26 | object = begin-object [ member *( value-separator member ) ] 27 | end-object 28 | 29 | member = string name-separator value 30 | 31 | ; Arrays 32 | 33 | array = begin-array [ value *( value-separator value ) ] end-array 34 | 35 | ; Numbers 36 | 37 | number = [ minus ] int [ frac ] [ exp ] 38 | decimal-point = %x2E ; . 39 | digit1-9 = %x31-39 ; 1-9 40 | e = %x65 / %x45 ; e E 41 | exp = e [ minus / plus ] 1*DIGIT 42 | frac = decimal-point 1*DIGIT 43 | int = zero / ( digit1-9 *DIGIT ) 44 | minus = %x2D ; - 45 | plus = %x2B ; + 46 | zero = %x30 ; 0 47 | 48 | ; Strings 49 | 50 | string = quotation-mark *char quotation-mark 51 | char = unescaped / 52 | escape ( 53 | %x22 / ; " quotation mark U+0022 54 | %x5C / ; \ reverse solidus U+005C 55 | %x2F / ; / solidus U+002F 56 | %x62 / ; b backspace U+0008 57 | %x66 / ; f form feed U+000C 58 | %x6E / ; n line feed U+000A 59 | %x72 / ; r carriage return U+000D 60 | %x74 / ; t tab U+0009 61 | %x75 4HEXDIG ) ; uXXXX U+XXXX 62 | 63 | escape = %x5C ; \ 64 | quotation-mark = %x22 ; " 65 | unescaped = %x20-21 / %x23-5B / %x5D-10FFFF -------------------------------------------------------------------------------- /abnf/sexp.abnf: -------------------------------------------------------------------------------- 1 | ;; This document describes the s-expression syntax parsed by 2 | ;; serialk using the ABNF format of RFC 5234. 3 | ;; 4 | ;; A few constraints are not expressed by this grammar: 5 | ;; 6 | ;; 1. `unum` once interpreted as an hexadecimal number must 7 | ;; be a Unicode scalar value 8 | ;; 2. A `comment can` be ended by the end of input 9 | ;; 10 | 11 | sexp-seq = *(ws / comment / sexp) 12 | sexp = atom / list 13 | list = %x0028 sexp-seq %x0029 14 | atom = token / qtoken 15 | token = t-char *(t-char) 16 | qtoken = %x0022 *(q-char / escape / cont) %x0022 17 | escape = %x005E (%x0020 / %x0022 / %x005E / %x006E / %x0072 / 18 | %x0075 %x007B unum %x007D) 19 | unum = 1*6(HEXDIG) 20 | cont = %x005E nl ws 21 | ws = *(ws-char) 22 | comment = %x003B *(c-char) nl 23 | nl = %x000A / %x000D / %x000D %x000A 24 | t-char = %x0021 / %x0023-0027 / %x002A-%x003A / %x003C-%x005D / 25 | %x005F-%x007E / %x0080-D7FF / %xE000-10FFFF 26 | q-char = t-char / ws-char / %x0028 / %x0029 / %x003B 27 | ws-char = %x0020 / %x0009 / %x000A / %x000B / %x000C / %x000D 28 | c-char = %x0009 / %x000B / %x000C / %x0020-D7FF / %xE000-10FFFF 29 | -------------------------------------------------------------------------------- /abnf/toml.abnf: -------------------------------------------------------------------------------- 1 | ;; This document describes TOML's syntax, using the ABNF format (defined in 2 | ;; RFC 5234 -- https://www.ietf.org/rfc/rfc5234.txt). 3 | ;; 4 | ;; All valid TOML documents will match this description, however certain 5 | ;; invalid documents would need to be rejected as per the semantics described 6 | ;; in the supporting text description. 7 | 8 | ;; It is possible to try this grammar interactively, using instaparse. 9 | ;; http://instaparse.mojombo.com/ 10 | ;; 11 | ;; To do so, in the lower right, click on Options and change `:input-format` to 12 | ;; ':abnf'. Then paste this entire ABNF document into the grammar entry box 13 | ;; (above the options). Then you can type or paste a sample TOML document into 14 | ;; the beige box on the left. Tada! 15 | 16 | ;; Overall Structure 17 | 18 | toml = expression *( newline expression ) 19 | 20 | expression = ws [ comment ] 21 | expression =/ ws keyval ws [ comment ] 22 | expression =/ ws table ws [ comment ] 23 | 24 | ;; Whitespace 25 | 26 | ws = *wschar 27 | wschar = %x20 ; Space 28 | wschar =/ %x09 ; Horizontal tab 29 | 30 | ;; Newline 31 | 32 | newline = %x0A ; LF 33 | newline =/ %x0D.0A ; CRLF 34 | 35 | ;; Comment 36 | 37 | comment-start-symbol = %x23 ; # 38 | non-ascii = %x80-D7FF / %xE000-10FFFF 39 | non-eol = %x09 / %x20-7E / non-ascii 40 | 41 | comment = comment-start-symbol *non-eol 42 | 43 | ;; Key-Value pairs 44 | 45 | keyval = key keyval-sep val 46 | 47 | key = simple-key / dotted-key 48 | simple-key = quoted-key / unquoted-key 49 | 50 | unquoted-key = 1*( ALPHA / DIGIT / %x2D / %x5F ) ; A-Z / a-z / 0-9 / - / _ 51 | quoted-key = basic-string / literal-string 52 | dotted-key = simple-key 1*( dot-sep simple-key ) 53 | 54 | dot-sep = ws %x2E ws ; . Period 55 | keyval-sep = ws %x3D ws ; = 56 | 57 | val = string / boolean / array / inline-table / date-time / float / integer 58 | 59 | ;; String 60 | 61 | string = ml-basic-string / basic-string / ml-literal-string / literal-string 62 | 63 | ;; Basic String 64 | 65 | basic-string = quotation-mark *basic-char quotation-mark 66 | 67 | quotation-mark = %x22 ; " 68 | 69 | basic-char = basic-unescaped / escaped 70 | basic-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii 71 | escaped = escape escape-seq-char 72 | 73 | escape = %x5C ; \ 74 | escape-seq-char = %x22 ; " quotation mark U+0022 75 | escape-seq-char =/ %x5C ; \ reverse solidus U+005C 76 | escape-seq-char =/ %x62 ; b backspace U+0008 77 | escape-seq-char =/ %x66 ; f form feed U+000C 78 | escape-seq-char =/ %x6E ; n line feed U+000A 79 | escape-seq-char =/ %x72 ; r carriage return U+000D 80 | escape-seq-char =/ %x74 ; t tab U+0009 81 | escape-seq-char =/ %x75 4HEXDIG ; uXXXX U+XXXX 82 | escape-seq-char =/ %x55 8HEXDIG ; UXXXXXXXX U+XXXXXXXX 83 | 84 | ;; Multiline Basic String 85 | 86 | ml-basic-string = ml-basic-string-delim [ newline ] ml-basic-body 87 | ml-basic-string-delim 88 | ml-basic-string-delim = 3quotation-mark 89 | ml-basic-body = *mlb-content *( mlb-quotes 1*mlb-content ) [ mlb-quotes ] 90 | 91 | mlb-content = mlb-char / newline / mlb-escaped-nl 92 | mlb-char = mlb-unescaped / escaped 93 | mlb-quotes = 1*2quotation-mark 94 | mlb-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii 95 | mlb-escaped-nl = escape ws newline *( wschar / newline ) 96 | 97 | ;; Literal String 98 | 99 | literal-string = apostrophe *literal-char apostrophe 100 | 101 | apostrophe = %x27 ; ' apostrophe 102 | 103 | literal-char = %x09 / %x20-26 / %x28-7E / non-ascii 104 | 105 | ;; Multiline Literal String 106 | 107 | ml-literal-string = ml-literal-string-delim [ newline ] ml-literal-body 108 | ml-literal-string-delim 109 | ml-literal-string-delim = 3apostrophe 110 | ml-literal-body = *mll-content *( mll-quotes 1*mll-content ) [ mll-quotes ] 111 | 112 | mll-content = mll-char / newline 113 | mll-char = %x09 / %x20-26 / %x28-7E / non-ascii 114 | mll-quotes = 1*2apostrophe 115 | 116 | ;; Integer 117 | 118 | integer = dec-int / hex-int / oct-int / bin-int 119 | 120 | minus = %x2D ; - 121 | plus = %x2B ; + 122 | underscore = %x5F ; _ 123 | digit1-9 = %x31-39 ; 1-9 124 | digit0-7 = %x30-37 ; 0-7 125 | digit0-1 = %x30-31 ; 0-1 126 | 127 | hex-prefix = %x30.78 ; 0x 128 | oct-prefix = %x30.6F ; 0o 129 | bin-prefix = %x30.62 ; 0b 130 | 131 | dec-int = [ minus / plus ] unsigned-dec-int 132 | unsigned-dec-int = DIGIT / digit1-9 1*( DIGIT / underscore DIGIT ) 133 | 134 | hex-int = hex-prefix HEXDIG *( HEXDIG / underscore HEXDIG ) 135 | oct-int = oct-prefix digit0-7 *( digit0-7 / underscore digit0-7 ) 136 | bin-int = bin-prefix digit0-1 *( digit0-1 / underscore digit0-1 ) 137 | 138 | ;; Float 139 | 140 | float = float-int-part ( exp / frac [ exp ] ) 141 | float =/ special-float 142 | 143 | float-int-part = dec-int 144 | frac = decimal-point zero-prefixable-int 145 | decimal-point = %x2E ; . 146 | zero-prefixable-int = DIGIT *( DIGIT / underscore DIGIT ) 147 | 148 | exp = "e" float-exp-part 149 | float-exp-part = [ minus / plus ] zero-prefixable-int 150 | 151 | special-float = [ minus / plus ] ( inf / nan ) 152 | inf = %x69.6e.66 ; inf 153 | nan = %x6e.61.6e ; nan 154 | 155 | ;; Boolean 156 | 157 | boolean = true / false 158 | 159 | true = %x74.72.75.65 ; true 160 | false = %x66.61.6C.73.65 ; false 161 | 162 | ;; Date and Time (as defined in RFC 3339) 163 | 164 | date-time = offset-date-time / local-date-time / local-date / local-time 165 | 166 | date-fullyear = 4DIGIT 167 | date-month = 2DIGIT ; 01-12 168 | date-mday = 2DIGIT ; 01-28, 01-29, 01-30, 01-31 based on month/year 169 | time-delim = "T" / %x20 ; T, t, or space 170 | time-hour = 2DIGIT ; 00-23 171 | time-minute = 2DIGIT ; 00-59 172 | time-second = 2DIGIT ; 00-58, 00-59, 00-60 based on leap second rules 173 | time-secfrac = "." 1*DIGIT 174 | time-numoffset = ( "+" / "-" ) time-hour ":" time-minute 175 | time-offset = "Z" / time-numoffset 176 | 177 | partial-time = time-hour ":" time-minute ":" time-second [ time-secfrac ] 178 | full-date = date-fullyear "-" date-month "-" date-mday 179 | full-time = partial-time time-offset 180 | 181 | ;; Offset Date-Time 182 | 183 | offset-date-time = full-date time-delim full-time 184 | 185 | ;; Local Date-Time 186 | 187 | local-date-time = full-date time-delim partial-time 188 | 189 | ;; Local Date 190 | 191 | local-date = full-date 192 | 193 | ;; Local Time 194 | 195 | local-time = partial-time 196 | 197 | ;; Array 198 | 199 | array = array-open [ array-values ] ws-comment-newline array-close 200 | 201 | array-open = %x5B ; [ 202 | array-close = %x5D ; ] 203 | 204 | array-values = ws-comment-newline val ws-comment-newline array-sep array-values 205 | array-values =/ ws-comment-newline val ws-comment-newline [ array-sep ] 206 | 207 | array-sep = %x2C ; , Comma 208 | 209 | ws-comment-newline = *( wschar / [ comment ] newline ) 210 | 211 | ;; Table 212 | 213 | table = std-table / array-table 214 | 215 | ;; Standard Table 216 | 217 | std-table = std-table-open key std-table-close 218 | 219 | std-table-open = %x5B ws ; [ Left square bracket 220 | std-table-close = ws %x5D ; ] Right square bracket 221 | 222 | ;; Inline Table 223 | 224 | inline-table = inline-table-open [ inline-table-keyvals ] inline-table-close 225 | 226 | inline-table-open = %x7B ws ; { 227 | inline-table-close = ws %x7D ; } 228 | inline-table-sep = ws %x2C ws ; , Comma 229 | 230 | inline-table-keyvals = keyval [ inline-table-sep inline-table-keyvals ] 231 | 232 | ;; Array Table 233 | 234 | array-table = array-table-open key array-table-close 235 | 236 | array-table-open = %x5B.5B ws ; [[ Double left square bracket 237 | array-table-close = ws %x5D.5D ; ]] Double right square bracket 238 | 239 | ;; Built-in ABNF terms, reproduced here for clarity 240 | 241 | ALPHA = %x41-5A / %x61-7A ; A-Z / a-z 242 | DIGIT = %x30-39 ; 0-9 243 | HEXDIG = DIGIT / "A" / "B" / "C" / "D" / "E" / "F" 244 | -------------------------------------------------------------------------------- /doc/index.mld: -------------------------------------------------------------------------------- 1 | {0 Serialkit {%html: %%VERSION%%%}} 2 | 3 | Serialkit provides easy to use APIs to query, edit and (de)serialize 4 | data serialisation formats. 5 | 6 | {b Warning.} Serialkit codecs work on complete, in-memory, 7 | representations of the data. If you need streaming support pass your 8 | way. Besides at the moment IO is performed directly from and to OCaml 9 | strings but that constraint may be lifted in the future. 10 | 11 | {1:lib Library [serialkit]} 12 | 13 | {!modules: 14 | Serialkit_text 15 | Serialkit_json 16 | Serialkit_sexp 17 | Serialkit_toml 18 | } 19 | -------------------------------------------------------------------------------- /opam: -------------------------------------------------------------------------------- 1 | opam-version: "2.0" 2 | name: "serialkit" 3 | synopsis: "Serialization formats toolkit for OCaml" 4 | description: """\ 5 | Serialkit provides easy to use OCaml APIs to query, update and 6 | generate data in generic serialization data formats. 7 | 8 | The supported data formats are JSON text, s-expressions and TOML. 9 | 10 | Serialkit is distributed under the ISC license. It has no dependencies. 11 | 12 | Homepage: """ 13 | maintainer: "Daniel Bünzli " 14 | authors: "The serialkit programmers" 15 | license: "ISC" 16 | tags: ["codec" "json" "sexp" "toml" "query" "org:erratique"] 17 | homepage: "https://erratique.ch/software/serialkit" 18 | doc: "https://erratique.ch/software/serialkit/doc" 19 | bug-reports: "https://github.com/dbuenzli/serialkit/issues" 20 | depends: [ 21 | "ocaml" {>= "4.14.0"} 22 | "ocamlfind" {build} 23 | "ocamlbuild" {build} 24 | "topkg" {build & >= "1.0.3"} 25 | "cmdliner" {>= "1.1.0"} 26 | ] 27 | build: ["ocaml" "pkg/pkg.ml" "build" "--dev-pkg" "%{dev}%"] 28 | dev-repo: "git+https://erratique.ch/repos/serialkit.git" 29 | -------------------------------------------------------------------------------- /pkg/META: -------------------------------------------------------------------------------- 1 | description = "Serialization formats toolkit for OCaml" 2 | version = "%%VERSION_NUM%%" 3 | requires = "" 4 | archive(byte) = "serialkit.cma" 5 | archive(native) = "serialkit.cmxa" 6 | plugin(byte) = "serialkit.cma" 7 | plugin(native) = "serialkit.cmxs" 8 | exists_if = "serialkit.cma serialkit.cmxa" 9 | -------------------------------------------------------------------------------- /pkg/pkg.ml: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ocaml 2 | #use "topfind" 3 | #require "topkg" 4 | open Topkg 5 | 6 | let () = 7 | Pkg.describe "serialkit" @@ fun c -> 8 | Ok [ Pkg.mllib "src/serialkit.mllib"; 9 | Pkg.test "test/test"; 10 | Pkg.bin "tool/cmd_main" ~dst:"serialkit" ] 11 | -------------------------------------------------------------------------------- /src/serialkit.mllib: -------------------------------------------------------------------------------- 1 | Serialkit_text 2 | Serialkit_json 3 | Serialkit_sexp 4 | Serialkit_toml 5 | -------------------------------------------------------------------------------- /src/serialkit_json.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2016 The serialkit programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | open Serialkit_text 7 | 8 | module Json = struct 9 | 10 | (* JSON text *) 11 | 12 | type loc = Textloc.t 13 | type mem = (string * loc) * t 14 | and t = 15 | [ `Null of loc 16 | | `Bool of bool * loc 17 | | `Float of float * loc 18 | | `String of string * loc 19 | | `A of t list * loc 20 | | `O of mem list * loc ] 21 | 22 | let loc_nil = Textloc.none 23 | let loc = function 24 | | `Null l | `Bool (_, l) | `Float (_, l) | `String (_, l) | `A (_, l) 25 | | `O (_, l) -> l 26 | 27 | (* Constructors *) 28 | 29 | let null = `Null loc_nil 30 | let bool b = `Bool (b, loc_nil) 31 | let float f = `Float (f, loc_nil) 32 | let string s = `String (s, loc_nil) 33 | let array vs = `A (vs, loc_nil) 34 | let mem n v = ((n, loc_nil), v) 35 | let obj mems = `O (mems, loc_nil) 36 | 37 | (* Accessors *) 38 | 39 | let kind = function 40 | | `Null _ -> "null" | `Bool _ -> "bool" | `Float _ -> "float" 41 | | `String _ -> "string" | `A _ -> "array" | `O _ -> "object" 42 | 43 | let err_exp exp fnd = 44 | Format.asprintf "%a: %s but expected %s" Textloc.pp (loc fnd) (kind fnd) exp 45 | 46 | let err_exp_null = err_exp "null" 47 | let err_exp_bool = err_exp "bool" 48 | let err_exp_float = err_exp "number" 49 | let err_exp_string = err_exp "string" 50 | let err_exp_array = err_exp "array" 51 | let err_exp_obj = err_exp "object" 52 | 53 | let err e = Error e 54 | let to_null = function `Null _ -> Ok () | j -> err (err_exp_null j) 55 | let to_bool = function `Bool (b, _) -> Ok b | j -> err (err_exp_bool j) 56 | let to_float = function `Float (f, _) -> Ok f | j -> err (err_exp_float j) 57 | let to_string = function `String (s,_) -> Ok s | j -> err (err_exp_string j) 58 | let to_array = function `A (vs, _) -> Ok vs | j -> err (err_exp_array j) 59 | let to_obj = function `O (mems, _) -> Ok mems | j -> err (err_exp_obj j) 60 | 61 | let err = invalid_arg 62 | let get_null = function `Null _ -> () | j -> err (err_exp_null j) 63 | let get_bool = function `Bool (b, _) -> b | j -> err (err_exp_bool j) 64 | let get_float = function `Float (f, _) -> f | j -> err (err_exp_float j) 65 | let get_string = function `String (s,_) -> s | j -> err (err_exp_string j) 66 | let get_array = function `A (vs, _) -> vs | j -> err (err_exp_array j) 67 | let get_obj = function `O (mems, _) -> mems | j -> err (err_exp_obj j) 68 | 69 | (* Decode *) 70 | 71 | (* FIXME add positions and reuse Tlex. *) 72 | 73 | type decoder = { t : Buffer.t; i : string; mutable pos : int; } 74 | let decoder s = { t = Buffer.create 255; i = s; pos = 0 } 75 | let accept d = d.pos <- d.pos + 1 [@@ ocaml.inline] 76 | let treset d = Buffer.reset d.t [@@ ocaml.inline] 77 | let taccept d = Buffer.add_char d.t d.i.[d.pos]; accept d; [@@ ocaml.inline] 78 | let taddc d c = Buffer.add_char d.t c [@@ ocaml.inline] 79 | let taddu d u = Buffer.add_utf_8_uchar d.t u 80 | let token d = Buffer.contents d.t [@@ ocaml.inline] 81 | let eoi d = d.pos = String.length d.i [@@ ocaml.inline] 82 | let byte d = match eoi d with 83 | | true -> 0xFFF 84 | | false -> Char.code d.i.[d.pos] 85 | [@@ ocaml.inline] 86 | 87 | let err d fmt = 88 | Format.kasprintf (fun s -> raise_notrace (Failure s)) ("%d: " ^^ fmt) d.pos 89 | 90 | let pp_byte ppf d = match byte d with 91 | | 0xFFF -> Format.fprintf ppf "end of input" 92 | | b -> Format.fprintf ppf "%C" (Char.chr b) 93 | 94 | type utf_8_case = 95 | | L1 | L2 | L3_E0 | L3_E1_EC_or_EE_EF | L3_ED | L4_F0 | L4_F1_F3 | L4_F4 | E 96 | 97 | let utf_8_case = 98 | (* 99 | (* See https://tools.ietf.org/html/rfc3629#section-4 *) 100 | Printf.printf "[|"; 101 | for i = 0 to 255 do 102 | if i mod 16 = 0 then Printf.printf "\n"; 103 | if 0x00 <= i && i <= 0x7F then Printf.printf "L1; " else 104 | if 0xC2 <= i && i <= 0xDF then Printf.printf "L2; " else 105 | if 0xE0 = i then Printf.printf "L3_E0; " else 106 | if 0xE1 <= i && i <= 0xEC || 0xEE <= i && i <= 0xEF 107 | then Printf.printf "L3_E1_EC_or_EE_EF; " else 108 | if 0xED = i then Printf.printf "L3_ED;" else 109 | if 0xF0 = i then Printf.printf "L4_F0; " else 110 | if 0xF1 <= i && i <= 0xF3 then Printf.printf "L4_F1_F3; " else 111 | if 0xF4 = i then Printf.printf "L4_F4; " else 112 | Printf.printf "E; " 113 | done; 114 | Printf.printf "\n|]" 115 | *) 116 | [| 117 | L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; 118 | L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; 119 | L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; 120 | L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; 121 | L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; 122 | L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; 123 | L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; 124 | L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; 125 | E; E; E; E; E; E; E; E; E; E; E; E; E; E; E; E; 126 | E; E; E; E; E; E; E; E; E; E; E; E; E; E; E; E; 127 | E; E; E; E; E; E; E; E; E; E; E; E; E; E; E; E; 128 | E; E; E; E; E; E; E; E; E; E; E; E; E; E; E; E; 129 | E; E; L2; L2; L2; L2; L2; L2; L2; L2; L2; L2; L2; L2; L2; L2; 130 | L2; L2; L2; L2; L2; L2; L2; L2; L2; L2; L2; L2; L2; L2; L2; L2; 131 | L3_E0; L3_E1_EC_or_EE_EF; L3_E1_EC_or_EE_EF; L3_E1_EC_or_EE_EF; 132 | L3_E1_EC_or_EE_EF; L3_E1_EC_or_EE_EF; L3_E1_EC_or_EE_EF; L3_E1_EC_or_EE_EF; 133 | L3_E1_EC_or_EE_EF; L3_E1_EC_or_EE_EF; L3_E1_EC_or_EE_EF; L3_E1_EC_or_EE_EF; 134 | L3_E1_EC_or_EE_EF; L3_ED;L3_E1_EC_or_EE_EF; L3_E1_EC_or_EE_EF; 135 | L4_F0; L4_F1_F3; L4_F1_F3; L4_F1_F3; L4_F4; E; E; E; E; E; E; E; E; E; E; E; 136 | |] 137 | 138 | let taccept_utf_8 d = 139 | let err d = err d "expected UTF-8 byte found: %a" pp_byte d in 140 | let b = byte d in 141 | let accept_tail d = 142 | if (byte d lsr 6 = 0b10) then taccept d else err d [@@ocaml.inline] 143 | in 144 | match utf_8_case.(b) with 145 | | L1 -> taccept d 146 | | L2 -> taccept d; accept_tail d 147 | | L3_E0 -> 148 | taccept d; 149 | if (byte d - 0xA0 < 0xBF - 0xA0) then taccept d else err d; 150 | accept_tail d 151 | | L3_E1_EC_or_EE_EF -> taccept d; accept_tail d; accept_tail d 152 | | L3_ED -> 153 | taccept d; 154 | if (byte d - 0x80 < 0x9F - 0x80) then taccept d else err d; 155 | accept_tail d 156 | | L4_F0 -> 157 | taccept d; 158 | if (byte d - 0x90 < 0xBF - 0x90) then taccept d else err d; 159 | accept_tail d; accept_tail d 160 | | L4_F1_F3 -> taccept d; accept_tail d; accept_tail d; accept_tail d; 161 | | L4_F4 -> 162 | taccept d; 163 | if (byte d - 0x80 < 0x8F - 0x80) then taccept d else err d; 164 | | E -> err d 165 | 166 | let accept_bytes d bytes = (* first byte already checked *) 167 | let max = String.length bytes - 1 in 168 | let rec loop i = match i > max with 169 | | true -> () 170 | | false -> 171 | match Char.code bytes.[i] = byte d with 172 | | true -> accept d; loop (i + 1) 173 | | false -> 174 | err d "expected %C found: %a while parsing '%s'" 175 | bytes.[i] pp_byte d bytes 176 | in 177 | accept d; loop 1 178 | 179 | let rec skip_ws d = match byte d with 180 | | 0x20 | 0x09 | 0x0A | 0x0D -> accept d; skip_ws d 181 | | _ -> () 182 | 183 | let parse_true d = accept_bytes d "true"; `Bool (true, loc_nil) 184 | let parse_false d = accept_bytes d "false"; `Bool (false, loc_nil) 185 | let parse_null d = accept_bytes d "null"; `Null loc_nil 186 | let parse_number d = (* not fully compliant *) 187 | let conv d = try `Float (float_of_string (token d), loc_nil) with 188 | | Failure e -> err d "could not parse a float from: %S" (token d) 189 | in 190 | let rec taccept_non_sep d = match byte d with 191 | | 0x20 | 0x09 | 0x0A | 0x0D | 0x2C | 0x5D | 0x7D | 0xFFF -> conv d 192 | | _ -> taccept d; taccept_non_sep d 193 | in 194 | treset d; taccept d; taccept_non_sep d 195 | 196 | let rec parse_uescape d hi u count = 197 | let pp_ucp ppf d = Format.fprintf ppf "U+%04X" d in 198 | let err_not_lo d u = err d "not a low surrogate %a" pp_ucp u in 199 | let err_lo d u = err d "lone low surrogate %a" pp_ucp u in 200 | let err_hi d u = err d "lone high surrogate %a" pp_ucp u in 201 | match count > 0 with 202 | | true -> 203 | begin match byte d with 204 | | c when 0x30 <= c && c <= 0x39 -> 205 | accept d; parse_uescape d hi (u * 16 + c - 0x30) (count - 1) 206 | | c when 0x41 <= c && c <= 0x46 -> 207 | accept d; parse_uescape d hi (u * 16 + c - 0x37) (count - 1) 208 | | c when 0x61 <= c && c <= 0x66 -> 209 | accept d; parse_uescape d hi (u * 16 + c - 0x57) (count - 1) 210 | | c -> 211 | err d "expected hex digit found: %C" (Char.chr c) 212 | end 213 | | false -> 214 | match hi with 215 | | Some hi -> (* combine high and low surrogate into scalar value. *) 216 | if u < 0xDC00 || u > 0xDFFF then err_not_lo d u else 217 | let u = ((((hi land 0x3FF) lsl 10) lor (u land 0x3FF)) + 0x10000) in 218 | taddu d (Uchar.unsafe_of_int u) 219 | | None -> 220 | if u < 0xD800 || u > 0xDFFF then taddu d (Uchar.unsafe_of_int u) 221 | else if u > 0xDBFF then err_lo d u else 222 | match byte d with 223 | | 0x5C -> 224 | accept d; 225 | begin match byte d with 226 | | 0x75 -> accept d; parse_uescape d (Some u) 0 4 227 | | _ -> err_hi d u 228 | end 229 | | _ -> err_hi d u 230 | 231 | let parse_string d = 232 | let parse_escape d = match byte d with 233 | | (0x22 | 0x5C | 0x2F as b) -> taddc d (Char.chr b); accept d; 234 | | 0x62 -> taddc d '\x08'; accept d; 235 | | 0x66 -> taddc d '\x0C'; accept d; 236 | | 0x6E -> taddc d '\x0A'; accept d; 237 | | 0x72 -> taddc d '\x0D'; accept d; 238 | | 0x74 -> taddc d '\x09'; accept d; 239 | | 0x75 -> 240 | accept d; parse_uescape d None 0 4 241 | | _ -> err d "expected escape found: %a" pp_byte d 242 | in 243 | let rec loop d = match byte d with 244 | | 0x5C (* '\' *) -> accept d; parse_escape d; loop d 245 | | 0x22 (* '"' *) -> accept d; `String ((token d), loc_nil) 246 | | 0xFFF -> err d "unclosed string" 247 | | _ -> taccept_utf_8 d; loop d 248 | in 249 | accept d; treset d; loop d 250 | 251 | let rec parse_object d = match (accept d; skip_ws d; byte d) with 252 | | 0x7D (* '}' *) -> accept d; `O ([], loc_nil) 253 | | _ -> 254 | let parse_name d = 255 | let `String name = match (skip_ws d; byte d) with 256 | | 0x22 (* '"' *) -> parse_string d 257 | | _ -> err d "expected '\"' found: %a" pp_byte d 258 | in 259 | skip_ws d; name 260 | in 261 | let rec loop acc d = 262 | let name = parse_name d in 263 | match byte d with 264 | | 0x3A (* ':' *) -> 265 | let v = (accept d; parse_value d) in 266 | begin match byte d with 267 | | 0x2C (* ',' *) -> accept d; loop ((name, v) :: acc) d 268 | | 0x7D (* '}' *) -> accept d; `O (List.rev ((name, v) :: acc), 269 | loc_nil) 270 | | _ -> err d "expected ',' or '}' found: %a" pp_byte d 271 | end 272 | | _ -> err d "expected ':' found: %a" pp_byte d 273 | in 274 | loop [] d 275 | 276 | and parse_array d = match (accept d; skip_ws d; byte d) with 277 | | 0x5D (* ']' *) -> accept d; `A ([], loc_nil) 278 | | _ -> 279 | let rec loop acc d = 280 | let v = parse_value d in 281 | match byte d with 282 | | 0x2C (* ',' *) -> accept d; loop (v :: acc) d 283 | | 0x5D (* ']' *) -> accept d; `A (List.rev (v :: acc), loc_nil) 284 | | _ -> err d "expected ',' or ']' found: %a" pp_byte d 285 | in 286 | loop [] d 287 | 288 | and parse_value d : t = 289 | let v = match (skip_ws d; byte d) with 290 | | 0x22 (* '"' *) -> parse_string d 291 | | 0x74 (* 't' *) -> parse_true d 292 | | 0x66 (* 'f' *) -> parse_false d 293 | | 0x6E (* 'n' *) -> parse_null d 294 | | 0x7B (* '{' *) -> parse_object d 295 | | 0x5B (* '[' *) -> parse_array d 296 | | 0x2D (* '-' *) -> parse_number d 297 | | b when 0x30 (* '0' *) <= b && b <= 0x39 (* '9' *) -> parse_number d 298 | | _ -> err d "expected a JSON value found: %a" pp_byte d 299 | in 300 | skip_ws d; 301 | v 302 | 303 | let of_string ?(file = Textloc.file_none) s = 304 | try 305 | let d = decoder s in 306 | let v = parse_value d in 307 | match byte d with 308 | | 0xFFF (* eoi *) -> Ok v 309 | | _ -> err d "expected end of input found: %a" pp_byte d 310 | with 311 | | Failure e -> Error e 312 | 313 | (* JSON generation *) 314 | 315 | module G = struct 316 | (* Not T.R. we could CPS. *) 317 | 318 | type enc = { mutable sep : bool; b : Buffer.t } 319 | type t = enc -> unit 320 | 321 | let addc c enc = Buffer.add_char enc.b c 322 | let adds s enc = Buffer.add_string enc.b s 323 | let adds_esc s enc = 324 | let is_control = 325 | function '\x00' .. '\x1F' | '\x7F' -> true | _ -> false 326 | in 327 | let len = String.length s in 328 | let max_idx = len - 1 in 329 | let flush b start i = 330 | if start < len then Buffer.add_substring b s start (i - start); 331 | in 332 | let rec loop start i = match i > max_idx with 333 | | true -> flush enc.b start i 334 | | false -> 335 | let next = i + 1 in 336 | match String.get s i with 337 | | '"' -> flush enc.b start i; adds "\\\"" enc; loop next next 338 | | '\\' -> flush enc.b start i; adds "\\\\" enc; loop next next 339 | | c when is_control c -> 340 | flush enc.b start i; 341 | adds (Format.asprintf "\\u%04X" (Char.code c)) enc; 342 | loop next next 343 | | c -> loop start next 344 | in 345 | loop 0 0 346 | 347 | let null enc = adds "null" enc 348 | let bool b enc = adds (if b then "true" else "false") enc 349 | let int i enc = adds (string_of_int i) enc 350 | let float f enc = adds (Format.asprintf "%.16g" f) enc 351 | let string s enc = addc '"' enc; adds_esc s enc; addc '"' enc 352 | 353 | let nosep enc = enc.sep <- false 354 | let sep enc = enc.sep 355 | let set_sep sep enc = enc.sep <- sep 356 | let if_sep enc = if not enc.sep then enc.sep <- true else addc ',' enc 357 | 358 | type array = t 359 | let array enc = () 360 | let array_end els enc = 361 | let sep = sep enc in 362 | addc '[' enc; nosep enc; els enc; addc ']' enc; set_sep sep enc 363 | 364 | let el e arr enc = arr enc; if_sep enc; e enc 365 | let el_if c e arr enc = if c then el (e ()) arr enc else arr enc 366 | 367 | type obj = t 368 | let obj enc = () 369 | let obj_end mems enc = 370 | let sep = sep enc in 371 | addc '{' enc; nosep enc; mems enc; addc '}' enc; set_sep sep enc 372 | 373 | let mem m v obj enc = obj enc; if_sep enc; string m enc; addc ':' enc; v enc 374 | let mem_if c m v obj enc = if c then mem m (v ()) obj enc else obj enc 375 | 376 | (* Derived generators *) 377 | 378 | let strf fmt = Format.kasprintf string fmt 379 | let list elv l = 380 | array_end (List.fold_left (fun a v -> el (elv v) a) array l) 381 | 382 | let option some o = match o with None -> null | Some v -> some v 383 | let rec json = function 384 | | `Null _ -> null 385 | | `Bool (b, _) -> bool b 386 | | `Float (f, _) -> float f 387 | | `String (s, _) -> string s 388 | | `A (a, _) -> 389 | array_end @@ List.fold_left (fun a e -> el (json e) a) array a 390 | | `O (o, _) -> 391 | obj_end @@ List.fold_left (fun o ((m, _), v) -> mem m (json v) o) obj o 392 | 393 | (* Output generated values *) 394 | 395 | let buffer_add b g = g { sep = true; b } 396 | let to_string g = 397 | let b = Buffer.create 65535 in 398 | (buffer_add b g; Buffer.contents b) 399 | end 400 | 401 | let to_string v = G.to_string (G.json v) 402 | 403 | let pp ppf (v : t) = (* FIXME not T.R. *) 404 | let pp_string ppf s = (* FIXME quick & dirty escaping *) 405 | Format.pp_print_string ppf (G.to_string (G.json ((`String (s, loc_nil))))) 406 | in 407 | let pp_comma ppf () = 408 | Format.(pp_print_char ppf ','; pp_print_space ppf ()) 409 | in 410 | let rec loop ppf = function 411 | | `Null _ -> Format.pp_print_string ppf "null" 412 | | `Bool (b,_ ) -> Format.pp_print_string ppf (if b then "true" else "false") 413 | | `Float (f, _) -> Format.fprintf ppf "%.16g" f 414 | | `String (s, _) -> pp_string ppf s 415 | | `A (a, _) -> 416 | Format.pp_open_box ppf 1; 417 | Format.pp_print_char ppf '['; 418 | Format.pp_print_list ~pp_sep:pp_comma loop ppf a; 419 | Format.pp_print_char ppf ']'; 420 | Format.pp_close_box ppf (); 421 | | `O (o, _) -> 422 | let pp_mem ppf ((m, _), v) = 423 | Format.pp_open_box ppf 1; 424 | pp_string ppf m; 425 | Format.pp_print_char ppf ':'; Format.pp_print_space ppf (); 426 | loop ppf v; 427 | Format.pp_close_box ppf (); 428 | in 429 | Format.pp_open_vbox ppf 1; 430 | Format.pp_print_char ppf '{'; 431 | Format.pp_print_list ~pp_sep:pp_comma pp_mem ppf o; 432 | Format.pp_print_char ppf '}'; 433 | Format.pp_close_box ppf (); 434 | in 435 | loop ppf v 436 | end 437 | 438 | module Jsong = Json.G 439 | module Jsonq = struct 440 | 441 | module Sset = Set.Make (String) 442 | module Smap = Map.Make (String) 443 | 444 | let pp_quote ppf s = Format.fprintf ppf "'%s'" s 445 | let pp_mem = pp_quote 446 | 447 | type path = (* Paths in JSON values, array and object member traversals. *) 448 | ([`A | `O of string] * Json.loc) list (* in reverse order *) 449 | 450 | let path_to_string p = 451 | let seg = function `A -> "[]" | `O n -> "." ^ n in 452 | String.concat "" (List.rev_map seg p) 453 | 454 | let path_to_trace ?(pp_mem = pp_mem) p = 455 | let seg = function 456 | | `A, l -> Format.asprintf "%a: in array" Textloc.pp l 457 | | `O m, l -> Format.asprintf "%a: in key %a" Textloc.pp l pp_mem m 458 | in 459 | String.concat "\n" (List.map seg p) 460 | 461 | (* Errors *) 462 | 463 | exception Err of path * Textloc.t * string 464 | 465 | let err p l msg = raise_notrace (Err (p, l, msg)) 466 | let errf p l fmt = Format.kasprintf (err p l) fmt 467 | let err_exp exp p fnd = 468 | errf p (Json.loc fnd) "found %s but expected %s" (Json.kind fnd) exp 469 | 470 | let err_exp_null = err_exp "null" 471 | let err_exp_bool = err_exp "bool" 472 | let err_exp_float = err_exp "number" 473 | let err_exp_string = err_exp "string" 474 | let err_exp_array = err_exp "array" 475 | let err_exp_obj = err_exp "object" 476 | let err_empty_array p l = errf p l "unexpected empty array" 477 | let err_miss_mem p l n = errf p l "member %a unbound in object" pp_mem n 478 | let err_to_string ?pp_mem p loc msg = 479 | let pp_lines ppf s = 480 | Format.fprintf ppf "@[%a@]" 481 | (Format.pp_print_list Format.pp_print_string) 482 | (String.split_on_char '\n' s) 483 | in 484 | match p with 485 | | [] -> Format.asprintf "%a:@\n%a" Textloc.pp loc pp_lines msg 486 | | p -> 487 | Format.asprintf "%a:@\n%a@\n @[%a@]" 488 | Textloc.pp loc pp_lines msg pp_lines (path_to_trace p) 489 | 490 | (* Queries *) 491 | 492 | type 'a t = path -> Json.t -> 'a 493 | 494 | let query q s = try Ok (q [] s) with 495 | | Err (p, l, m) -> Error (err_to_string p l m) 496 | 497 | (* Succeeding and failing queries *) 498 | 499 | let succeed v p j = v 500 | let fail msg p j = err p (Json.loc j) msg 501 | let failf fmt = Format.kasprintf fail fmt 502 | 503 | (* Query combinators *) 504 | 505 | let app fq q p j = fq p j (q p j) 506 | let ( $ ) = app 507 | let pair q0 q1 p j = let v0 = q0 p j in v0, q1 p j 508 | let bind q f p j = f (q p j) p j 509 | let map f q p j = f (q p j) 510 | let some q p j = Some (q p j) 511 | 512 | (* JSON queries *) 513 | 514 | let fold ~null ~bool ~float ~string ~array ~obj p = function 515 | | `Null _ as j -> null p j 516 | | `Bool _ as j -> bool p j 517 | | `Float _ as j -> float p j 518 | | `String _ as j -> string p j 519 | | `A _ as j -> array p j 520 | | `O _ as j -> obj p j 521 | 522 | let partial_fold ?null ?bool ?float ?string ?array ?obj () p j = 523 | let with_q q p j = match q with 524 | | None -> 525 | let kind k = function None -> "" | Some _ -> k in 526 | let kinds = [ kind "null" null; kind "bool" bool; 527 | kind "number" float; kind "string" string; 528 | kind "array" array; kind "obj" obj ] 529 | in 530 | let kinds = List.filter (fun s -> s <> "") kinds in 531 | let kinds = String.concat ", " kinds in 532 | (* FIXME use error messages from Err_msg *) 533 | let kinds = if kinds = "" then "nothing" else "one of " ^ kinds in 534 | err_exp kinds p j 535 | | Some q -> q p j 536 | in 537 | match j with 538 | | `Null _ as j -> with_q null p j 539 | | `Bool _ as j -> with_q bool p j 540 | | `Float _ as j -> with_q float p j 541 | | `String _ as j -> with_q string p j 542 | | `A _ as j -> with_q array p j 543 | | `O _ as j -> with_q obj p j 544 | 545 | let json p s = s 546 | let loc p s = Json.loc s 547 | let with_loc q p s = (q p s), Json.loc s 548 | 549 | (* Nulls *) 550 | 551 | let is_null p = function `Null _ -> true | j -> false 552 | let null p = function `Null _ -> () | j -> err_exp_null p j 553 | let nullable q p = function `Null _ -> None | j -> Some (q p j) 554 | 555 | (* Atomic values *) 556 | 557 | let bool p = function `Bool (b, _) -> b | j -> err_exp_bool p j 558 | let float p = function `Float (f, _) -> f | j -> err_exp_float p j 559 | let int = map truncate float 560 | let string p = function `String (s, _) -> s | j -> err_exp_string p j 561 | 562 | let string_to ~kind parse p = function 563 | | `String (s, _) as j -> 564 | (match parse s with Ok v -> v | Error m -> fail m p j) 565 | | j -> err_exp kind p j 566 | 567 | let enum ~kind ss p = function 568 | | `String (s, _) when Sset.mem s ss -> s 569 | | `String (s, l) -> 570 | let ss = Sset.elements ss in 571 | let hint, ss = match Textdec.err_suggest ss s with 572 | | [] -> Textdec.pp_must_be, ss 573 | | ss -> Textdec.pp_did_you_mean, ss 574 | in 575 | let kind ppf () = Format.pp_print_string ppf kind in 576 | let pp_v = Format.pp_print_string in 577 | errf p l "%a" (Textdec.pp_unknown' ~kind pp_v ~hint) (s, ss) 578 | | j -> err_exp kind p j 579 | 580 | let enum_map ~kind sm p = function 581 | | `String (s, l) -> 582 | begin match Smap.find s sm with 583 | | v -> v 584 | | exception Not_found -> 585 | let ss = Smap.fold (fun k _ acc -> k :: acc) sm [] in 586 | let hint, ss = match Textdec.err_suggest ss s with 587 | | [] -> Textdec.pp_must_be, ss 588 | | ss -> Textdec.pp_did_you_mean, ss 589 | in 590 | let kind ppf () = Format.pp_print_string ppf kind in 591 | let pp_v = Format.pp_print_string in 592 | errf p l "%a" (Textdec.pp_unknown' ~kind pp_v ~hint) (s, ss) 593 | end 594 | | j -> err_exp kind p j 595 | 596 | (* Array *) 597 | 598 | let is_empty_array p = function `A (a, _) -> a = [] | j -> err_exp_array p j 599 | let hd q p = function 600 | | `A ([], l) -> err_empty_array p l 601 | | `A (v :: _, l) -> q ((`A, l) :: p) v 602 | | j -> err_exp_array p j 603 | 604 | let tl q p = function 605 | | `A ([], l) -> err_empty_array p l 606 | | `A (_ :: [], l) -> q p (`A ([], Textloc.to_last l)) 607 | | `A (_ :: (v :: _ as a), l) -> 608 | let l = Textloc.reloc ~first:(Textloc.to_first (Json.loc v)) ~last:l in 609 | q p (`A (a, l)) 610 | | j -> err_exp_array p j 611 | 612 | let nth ?absent n q p = function 613 | | `A (vs, l) -> 614 | let p = (`A, l) :: p in 615 | let k, vs = if n < 0 then - n - 1, List.rev vs else n, vs in 616 | let rec loop k = function 617 | | v :: vs when k = 0 -> q p v 618 | | _ :: vs -> loop (k - 1) vs 619 | | [] -> 620 | match absent with 621 | | None -> errf p l "%d: no such index in array" n 622 | | Some absent -> absent 623 | in 624 | loop k vs 625 | | j -> err_exp_array p j 626 | 627 | let fold_array f q acc p = function 628 | | `A (vs, l) -> 629 | let p = (`A, l) :: p in 630 | let add p acc v = f (q p v) acc in 631 | List.fold_left (add p) acc vs 632 | | j -> err_exp_array p j 633 | 634 | let array qv = map List.rev (fold_array (fun v acc -> v :: acc) qv []) 635 | 636 | (* Objects *) 637 | 638 | let rec mem_find n = function 639 | | ((n', _), j) :: ms when String.equal n' n -> Some j 640 | | _ :: ms -> mem_find n ms 641 | | [] -> None 642 | 643 | let mem : string -> 'a t -> 'a t = fun n q p -> function 644 | | `O (ms, l) -> 645 | begin match mem_find n ms with 646 | | None -> err_miss_mem p l n 647 | | Some j -> q ((`O n, l) :: p) j 648 | end 649 | | j -> err_exp_obj p j 650 | 651 | let opt_mem n q ~absent p = function 652 | | `O (ms, l) -> 653 | begin match mem_find n ms with 654 | | None -> absent 655 | | Some j -> q ((`O n, l) :: p) j 656 | end 657 | | j -> err_exp_obj p j 658 | 659 | let mem_dom ~validate p = function 660 | | `O (ms, l) -> 661 | let add_mem = match validate with 662 | | None -> fun acc ((n, _), _) -> Sset.add n acc 663 | | Some dom -> 664 | fun acc ((n, _), _) -> match Sset.mem n dom with 665 | | true -> Sset.add n acc 666 | | false -> 667 | let ns = Sset.elements dom in 668 | let hint, ss = match Textdec.err_suggest ns n with 669 | | [] -> Textdec.pp_must_be, ns 670 | | ss -> Textdec.pp_did_you_mean, ss 671 | in 672 | let kind ppf () = Format.pp_print_string ppf "member" in 673 | let pp_v = Format.pp_print_string in 674 | errf p l "%a" (Textdec.pp_unknown' ~kind pp_v ~hint) (n, ss) 675 | in 676 | List.fold_left add_mem Sset.empty ms 677 | | j -> err_exp_obj p j 678 | 679 | let fold_obj f mem q acc p = function 680 | | `O (ms, l) -> 681 | let f acc ((n, nl), j) = match mem n with 682 | | Ok k -> f k (q ((`O n, l) :: p) j) acc 683 | | Error e -> err ((`O n, l) :: p) nl e 684 | in 685 | List.fold_left f acc ms 686 | | j -> err_exp_obj p j 687 | end 688 | -------------------------------------------------------------------------------- /src/serialkit_json.mli: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2016 The serialkit programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (** JSON text support. 7 | 8 | As specified in {{:https://tools.ietf.org/html/rfc8259}RFC8259}. 9 | 10 | Open this module to use it, this only introduces modules in your scope. *) 11 | 12 | open Serialkit_text 13 | 14 | (** JSON text definitions and codec. *) 15 | module Json : sig 16 | 17 | (** {1:json JSON text} *) 18 | 19 | type loc = Textloc.t 20 | (** The type for text locations. *) 21 | 22 | val loc_nil : loc 23 | (** [loc_nil] is an invalid input location. *) 24 | 25 | type mem = (string * loc) * t 26 | (** The type for JSON object members. *) 27 | 28 | and t = 29 | [ `Null of loc 30 | | `Bool of bool * loc 31 | | `Float of float * loc 32 | | `String of string * loc 33 | | `A of t list * loc 34 | | `O of mem list * loc ] 35 | (** The type for generic JSON text representations. *) 36 | 37 | val loc : t -> loc 38 | (** [loc j] is [j]'s input location. *) 39 | 40 | (** {1:cons Constructors} *) 41 | 42 | val null : t 43 | (** [null] is [`Null loc_nil]. *) 44 | 45 | val bool : bool -> t 46 | (** [bool b] is [`Bool (b, loc_nil)]. *) 47 | 48 | val float : float -> t 49 | (** [float b] is [`Float (f, loc_nil)]. *) 50 | 51 | val string : string -> t 52 | (** [string s] is [`String (s, loc_nil)]. *) 53 | 54 | val array : t list -> t 55 | (** [a vs] is [`A (vs, loc_nil)]. *) 56 | 57 | val mem : string -> t -> mem 58 | (** [mem n v] is [((n, loc_nil), v)]. *) 59 | 60 | val obj : mem list -> t 61 | (** [obj mems] is [`O (mems, loc_nil)]. *) 62 | 63 | (** {1:access Accessors} *) 64 | 65 | val to_null : t -> (unit, string) result 66 | (** [to_null j] extracts a null from [j]. If [j] is not a null an 67 | error with the location formatted according to {!Tloc.pp} 68 | is returned. *) 69 | 70 | val to_bool : t -> (bool, string) result 71 | (** [to_bool j] extracts a bool from [j]. If [j] is not a bool an 72 | error with the location formatted according to {!Tloc.pp} 73 | is returned. *) 74 | 75 | val to_float : t -> (float, string) result 76 | (** [to_float j] extracts a float from [j]. If [j] is not a float an 77 | error with the location formatted according to {!Tloc.pp} 78 | is returned. *) 79 | 80 | val to_string : t -> (string, string) result 81 | (** [to_string j] extracts a string from [j]. If [j] is not a string an 82 | error with the location formatted according to {!Tloc.pp} 83 | is returned. *) 84 | 85 | val to_array : t -> (t list, string) result 86 | (** [to_array j] extracts a array from [j]. If [j] is not a array an 87 | error with the location formatted according to {!Tloc.pp} 88 | is returned. *) 89 | 90 | val to_obj : t -> (mem list, string) result 91 | (** [to_obj j] extracts a array from [j]. If [j] is not a array an 92 | error with the location formatted according to {!Tloc.pp} 93 | is returned. *) 94 | 95 | val get_null : t -> unit 96 | (** [get_null j] is like {!to_null} but raises {!Invalid_argument} 97 | if [j] is not a null. *) 98 | 99 | val get_bool : t -> bool 100 | (** [get_bool j] is like {!to_bool} but raises {!Invalid_argument} 101 | if [j] is not a bool. *) 102 | 103 | val get_float : t -> float 104 | (** [get_float j] is like {!to_float} but raises {!Invalid_argument} 105 | if [j] is not a float. *) 106 | 107 | val get_string : t -> string 108 | (** [get_string j] is like {!to_string} but raises {!Invalid_argument} 109 | if [j] is not a string. *) 110 | 111 | val get_array : t -> t list 112 | (** [get_array j] is like {!to_array} but raises {!Invalid_argument} 113 | if [j] is not a array. *) 114 | 115 | val get_obj : t -> mem list 116 | (** [get_obj j] is like {!to_obj} but raises {!Invalid_argument} 117 | if [j] is not a array. *) 118 | 119 | (** {1:fmt Formatters} *) 120 | 121 | val pp : Format.formatter -> t -> unit 122 | (** [pp] formats JSON text. 123 | 124 | {b Warning.} Assumes all OCaml strings in the formatted value 125 | are UTF-8 encoded. *) 126 | 127 | (** {1:codec Codec} *) 128 | 129 | val of_string : ?file:Textloc.fpath -> string -> (t, string) result 130 | (** [of_string s] parses JSON text from [s] according to 131 | {{:https://tools.ietf.org/html/rfc8259}RFC8259} with the following 132 | limitations: 133 | {ul 134 | {- Numbers are parsed with [string_of_float] which is not 135 | compliant.} 136 | {- TODO Unicode escapes are left unparsed (this will not round trip 137 | with {!to_string}).}} 138 | 139 | {b Note.} All OCaml strings returned by this function are UTF-8 140 | encoded. *) 141 | 142 | val to_string : t -> string 143 | (** [to_string j] is [j] as JSON text, encoded according to 144 | {{:https://tools.ietf.org/html/rfc8259}RFC8259}. 145 | 146 | {b Warning.} Assumes all OCaml strings in [j] are UTF-8 encoded. *) 147 | end 148 | 149 | (** JSON value generation. *) 150 | module Jsong : sig 151 | 152 | (** {1:gen Generation} *) 153 | 154 | type t 155 | (** The type for generated JSON values. *) 156 | 157 | val null : t 158 | (** [null] is the generated JSON null value. *) 159 | 160 | val bool : bool -> t 161 | (** [bool b] is [b] as a generated JSON boolean value. *) 162 | 163 | val int : int -> t 164 | (** [int i] is [i] as a generated JSON number. *) 165 | 166 | val float : float -> t 167 | (** [float f] is [f] as a generated JSON number. *) 168 | 169 | val string : string -> t 170 | (** [str s] is [s] as a generated JSON string value. *) 171 | 172 | type array 173 | (** The type for generated JSON arrays. *) 174 | 175 | val array : array 176 | (** [array] is an empty array. *) 177 | 178 | val array_end : array -> t 179 | (** [array_end els] is arr a a generated JSON value. *) 180 | 181 | val el : t -> array -> array 182 | (** [el e arr] is array [arr] wit [e] added at the end. *) 183 | 184 | val el_if : bool -> (unit -> t) -> array -> array 185 | (** [el cond v arr] is [el (v ()) arr] if [cond] is [true] and 186 | [arr] otherwise. *) 187 | 188 | type obj 189 | (** The type for generated JSON objects. *) 190 | 191 | val obj : obj 192 | (** [obj] is an empty object. *) 193 | 194 | val obj_end : obj -> t 195 | (** [obj_end o] is [o] as a generated JSON value. *) 196 | 197 | val mem : string -> t -> obj -> obj 198 | (** [mem name v o] is [o] with member [name] bound to value [v] 199 | added. *) 200 | 201 | val mem_if : bool -> string -> (unit -> t) -> obj -> obj 202 | (** [mem_if cond name v o] is [mem name (v ()) o] if [cond] is [true] 203 | and [o] otherwise. *) 204 | 205 | (** {1:derived Derived generators} *) 206 | 207 | val strf : ('a, Format.formatter, unit, t) format4 -> 'a 208 | (** [strf fmt ...] is a JSON string generated value formatted according 209 | to [fmt]. *) 210 | 211 | val option : ('a -> t) -> 'a option -> t 212 | (** [option some o] is [o] as a generated JSON value which is 213 | {!null} if [o] is [None] and [some v] if [o] is [some v]. *) 214 | 215 | val list : ('a -> t) -> 'a list -> t 216 | (** [list el l] is [l] as a generated JSON array whose elements 217 | are generated using [el]. *) 218 | 219 | val json : Json.t -> t 220 | (** [of_json v] is the JSON value [v] as a generated value. *) 221 | 222 | (** {1:output Output} *) 223 | 224 | val buffer_add : Buffer.t -> t -> unit 225 | (** [buffer_add b g] adds the generated JSON value [g] to [b]. *) 226 | 227 | val to_string : t -> string 228 | (** [to_string g] is the generated JSON value [g] as a string. *) 229 | end 230 | 231 | (** JSON value queries. 232 | 233 | {b TODO} maybe we could expose a bit more options for error 234 | reporting. In particular the internal [path] type and a combinator 235 | in the vein of {!loc} to report back the path trace. Basically 236 | see {!Serialk_sexp}. *) 237 | module Jsonq : sig 238 | 239 | (** {1:query Queries} *) 240 | 241 | type 'a t 242 | (** The type JSON value queries. A query either fails or succeeds against 243 | a JSON value returning a value of type ['a]. *) 244 | 245 | val query : 'a t -> Json.t -> ('a, string) result 246 | (** [query q j] is [Ok v] if que query [q] succeeds on [s] and 247 | a (multiline) [Error e] otherwise. *) 248 | 249 | (** {1:success Success and failure} *) 250 | 251 | val succeed : 'a -> 'a t 252 | (** [succeed v] is a query that succeeds with value [v] on any 253 | JSON value. *) 254 | 255 | val fail : string -> 'a t 256 | (** [fail msg] is a query that fails on any JSON value with message 257 | [msg]. Do not include position information in [msg], this 258 | is automatically handled by the module. *) 259 | 260 | val failf : ('a, Format.formatter, unit, 'b t) format4 -> 'a 261 | (** [failf fmt ...] is like {!fail} but formats the message 262 | according to [fmt]. *) 263 | 264 | (** {1:qcomb Query combinators} *) 265 | 266 | val app : ('a -> 'b) t -> 'a t -> 'b t 267 | (** [app fq q] queries a s-expression first with [fq] and then with [q] 268 | and applies the result of latter to the former. *) 269 | 270 | val ( $ ) : ('a -> 'b) t -> 'a t -> 'b t 271 | (** [f $ v] is [app f v]. *) 272 | 273 | val pair : 'a t -> 'b t -> ('a * 'b) t 274 | (** [pair q0 q1] queries first with [q0] and then with [q1] and returns 275 | the pair of their result. *) 276 | 277 | val bind : 'a t -> ('a -> 'b t) -> 'b t 278 | (** [bind q f] queries a s-expression with [q], applies the result to 279 | [f] and re-queries the s-expression with the result. *) 280 | 281 | val map : ('a -> 'b) -> 'a t -> 'b t 282 | (** [map f q] is [app (succeed f) q]. *) 283 | 284 | val some : 'a t -> 'a option t 285 | (** [some q] is [map Option.some q]. *) 286 | 287 | (** {1:json JSON queries} *) 288 | 289 | val fold : 290 | null:'a t -> bool:'a t -> float:'a t -> string:'a t -> array:'a t -> 291 | obj:'a t -> 'a t 292 | (** [fold] queries JSON values according to their kind using the 293 | provided queries. *) 294 | 295 | val partial_fold : 296 | ?null:'a t -> ?bool:'a t -> ?float:'a t -> ?string:'a t -> ?array:'a t -> 297 | ?obj:'a t -> unit -> 'a t 298 | (** [partial_fold] is like {!fold} but only queries the kinds that 299 | are explicitely specified. It errors on other kinds. *) 300 | 301 | val json : Json.t t 302 | (** [json] queries any JSON value and returns it. *) 303 | 304 | val loc : Json.loc t 305 | (** [loc]is [map Sexp.loc sexp]. *) 306 | 307 | val with_loc : 'a t -> ('a * Json.loc) t 308 | (** [with_loc q] queries with [q] and returns the result with the 309 | location of the queried JSON value. *) 310 | 311 | (** {1:nulls Nulls} *) 312 | 313 | val is_null : bool t 314 | (** [is_null] tests for a JSON null value. *) 315 | 316 | val null : unit t 317 | (** [null] queries JSON null as unit and fails otherwise. *) 318 | 319 | val nullable : 'a t -> 'a option t 320 | (** [nullable q] is None on JSON null and otherwise queries the value 321 | with [q]. *) 322 | 323 | (** {1:atoms Atomic values} *) 324 | 325 | val bool : bool t 326 | (** [bool] queries JSON bool values as a [bool] value and fails otherwise. *) 327 | 328 | val float : float t 329 | (** [float] queries JSON number values as a [float] value and fails 330 | otherwise. *) 331 | 332 | val int : int t 333 | (** [int] is [map truncate float]. *) 334 | 335 | val string : string t 336 | (** [string] queries JSON string values as a [string] value and 337 | fails otherwise. *) 338 | 339 | val string_to : kind:string -> (string -> ('a, string) result) -> 'a t 340 | (** [string_to ~kind parse] queries a JSON string and parses it 341 | with [p]. In case of [Error m] error {!fail}s with [m]. [kind] 342 | is the kind of value parsed, it is used for the error in case no 343 | JSON string is found. *) 344 | 345 | val enum : kind:string -> Set.Make(String).t -> string t 346 | (** [enum ~kind ss] queries a JSON string for one of the elements of [ss] 347 | and fails otherwise. [kind] is for the kind of elements in [ss], 348 | it used for error reporting. *) 349 | 350 | val enum_map : kind:string -> 'a Map.Make(String).t -> 'a t 351 | (** [enum_map ~kind sm] queries a string for it's map in [sm] and fails 352 | if the string is not bound in [sm]. [kind] is for the kind elements 353 | in [sm], it is used for error reporting. *) 354 | 355 | (** {1:arrays Arrays} 356 | 357 | These queries only succeed on JSON array values. *) 358 | 359 | val is_empty_array : bool t 360 | (** [is_empty_array] queries an array for emptyness. *) 361 | 362 | val hd : 'a t -> 'a t 363 | (** [hd q] queries the first element of an array with [q]. Fails on empty 364 | arrays. *) 365 | 366 | val tl : 'a t -> 'a t 367 | (** [tail q] queries the tail of an array with [q]. Fails on empty 368 | arrays. *) 369 | 370 | val fold_array : ('a -> 'b -> 'b) -> 'a t -> 'b -> 'b t 371 | (** [fold_array f q acc] queries the elements of an array from left to 372 | right with [q] and folds the result with [f] starting with [acc]. *) 373 | 374 | val array : 'a t -> 'a list t 375 | (** [array q] queries the elements of an array with [q]. *) 376 | 377 | (** {2:array_index Array index queries} *) 378 | 379 | val nth : ?absent:'a -> int -> 'a t -> 'a t 380 | (** [nth ?absent n q] queries the [n]th element of an array with [q]. If 381 | [n] is negative counts from the end of the array, so [-1] is the 382 | last array element. If the element does not exist this fails if 383 | [absent] is [None] and succeeds with [v] if [absent] is [Some v]. *) 384 | 385 | (** {1:objects Objects} 386 | 387 | These queries only succeed on JSON object values. *) 388 | 389 | val mem : string -> 'a t -> 'a t 390 | (** [mem n q] queries the member [n] of a JSON object with [q]. The 391 | query fails if [n] is unbound in the object. *) 392 | 393 | val opt_mem : string -> 'a t -> absent:'a -> 'a t 394 | (** [opt_mem n q ~absent] queries the member [n] of a JSON object with [q]. 395 | absent is returned if [n] is unbound in the object. *) 396 | 397 | val mem_dom : validate:Set.Make(String).t option -> Set.Make(String).t t 398 | (** [mem_dom ~validate] queries the member domain of a JSON object. 399 | If [validate] is [Some dom], the query fails if a member name is not in 400 | [dom]. *) 401 | 402 | val fold_obj : 403 | ('m -> 'a -> 'b -> 'b) -> (string -> ('m, string) result) -> 404 | 'a t -> 'b -> 'b t 405 | (** [fold_obj f m q acc] parses each member with [m], queries its 406 | value with [q] and folds the result with [f] starting with [acc]. *) 407 | end 408 | -------------------------------------------------------------------------------- /src/serialkit_sexp.mli: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2019 The serialkit programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (** S-expression support. 7 | 8 | The module {!Sexp} has an s-expression codec and general 9 | definitions for working with them. {!Sexpg} generates 10 | s-expressions without going through a generic 11 | representation. {!Sexpq} queries and updates generic 12 | representations with combinators. 13 | 14 | Consult a {{!sexp_syntax}short introduction} to s-expressions and 15 | the syntax parsed by the codec, the encoding of 16 | {{!sexp_dict}key-value dictionaries} supported by this module and 17 | the end-user syntax for {{!sexp_path_caret}addressing and 18 | updating} s-expressions. 19 | 20 | Open this module to use it, this only introduces modules in your 21 | scope. 22 | 23 | {b Warning.} Serialization functions always assumes all OCaml 24 | strings in the data you provide is UTF-8 encoded. This is not 25 | checked by the module. *) 26 | 27 | (** {1:api API} *) 28 | 29 | open Serialkit_text 30 | 31 | (** S-expression definitions and codec. *) 32 | module Sexp : sig 33 | 34 | (** {1:meta Meta information} *) 35 | 36 | type 'a fmt = Format.formatter -> 'a -> unit 37 | (** The type for formatting functions. *) 38 | 39 | type loc = Textloc.t 40 | (** The type for source text locations. *) 41 | 42 | val loc_nil : Textloc.t 43 | (** [loc_nil] is a source text location for non-parsed s-expressions. *) 44 | 45 | val pp_loc : loc fmt 46 | (** [pp_loc] is {!Tloc.pp}. *) 47 | 48 | type a_meta 49 | (** The type for meta information about atoms. *) 50 | 51 | type l_meta 52 | (** The type for meta information about lists. *) 53 | 54 | val a_meta_nil : a_meta 55 | (** [a_meta_nil] is parse information for non-parsed atoms. *) 56 | 57 | val l_meta_nil : l_meta 58 | (** [l_meta_nil] is parse information for non-parsed lists. *) 59 | 60 | (** {1:sexp S-expressions} *) 61 | 62 | type t = [ `A of string * a_meta | `L of t list * l_meta ] 63 | (** The type for generic s-expression representations. Either an 64 | atom or a list. *) 65 | 66 | val atom : string -> t 67 | (** [atom a] is [`A (a, a_meta_nil)]. *) 68 | 69 | val list : t list -> t 70 | (** [list l] is [`L (l, l_meta_nil)]. *) 71 | 72 | (** {1:access Accessors} *) 73 | 74 | val loc : t -> loc 75 | (** [loc s] is [s]'s source text location. *) 76 | 77 | val to_atom : t -> (string, string) result 78 | (** [to_atom s] extracts an atom from [s]. If [s] is a list an error 79 | with the location formatted according to {!Tloc.pp} is 80 | returned. *) 81 | 82 | val get_atom : t -> string 83 | (** [get_atom s] is like {!to_atom} but raises {!Invalid_argument} 84 | if [s] is not an atom. *) 85 | 86 | val to_list : t -> (t list, string) result 87 | (** [to_list s] extracts a list from [s]. If [s] is an atom an error 88 | with the location formatted according to {!Tloc.pp} is 89 | returned. *) 90 | 91 | val get_list : t -> t list 92 | (** [get_atom s] is like {!to_list} but raises {!Invalid_argument} 93 | if [s] is not an list. *) 94 | 95 | val to_splice : t -> t list 96 | (** [to_splice s] is the either the list of [s] if [s] is or or 97 | [[s]] if [s] is an atom. *) 98 | 99 | (** {1:fmt Formatting} *) 100 | 101 | val pp : t fmt 102 | (** [pp] formats an s-expression. *) 103 | 104 | val pp_layout : t fmt 105 | (** [pp_layout ppf l] is like {!pp} but uses layout information. *) 106 | 107 | val pp_seq : t fmt 108 | (** [pp_seq] formats an s-expression but if it is a list the 109 | outer list separators are not formatted in the output. 110 | 111 | {b Warning.} Assumes all OCaml strings in the formatted value are 112 | UTF-8 encoded. *) 113 | 114 | val pp_seq_layout : t fmt 115 | (** [pp_seq_layout] is like {!pp_seq} but uses layout information. *) 116 | 117 | (** {1:codec Codec} *) 118 | 119 | type error_kind 120 | (** The type for kinds of decoding error. *) 121 | 122 | val pp_error_kind : unit -> error_kind fmt 123 | (** [pp_error_kind ()] formats an error kind. *) 124 | 125 | type error = error_kind * loc 126 | (** The type for decoding errors. *) 127 | 128 | val pp_error : 129 | ?pp_loc:loc fmt -> ?pp_error_kind:error_kind fmt -> 130 | ?pp_prefix:unit fmt -> unit -> error fmt 131 | (** [pp_error ~pp_loc ~pp_error_kind ~pp_prefix ()] formats errors 132 | using [pp_loc] (defaults to {!pp_loc}), [pp_error_kind] 133 | (defaults to {!pp_error_kind}) and [pp_prefix] (defaults formats 134 | ["Error: "]). *) 135 | 136 | val error_to_string : 137 | ?pp_error:error fmt -> ('a, error) result -> ('a, string) result 138 | (** [error_to_string r] converts an error to a string using [pp_error] 139 | (defaults to {!pp_error}). *) 140 | 141 | val seq_of_string : ?file:Textloc.fpath -> string -> (t, error) result 142 | (** [seq_of_string ?file s] parses a {e sequence} of s-expressions from 143 | [s]. [file] is the file for locations, defaults to ["-"]. The 144 | sequence is returned as a fake s-expression list that spans from 145 | the start of the first s-expression in [s] to the end of the 146 | last one; note that this list does not exist syntactically in 147 | [s]. 148 | 149 | If there are no s-expression in [s] the list is empty its 150 | location has both start and end positions at byte [0] (which may 151 | not exist). 152 | 153 | {b Note.} All OCaml strings returned by this function are UTF-8 154 | encoded. *) 155 | 156 | val seq_of_string' : 157 | ?pp_error:error fmt -> ?file:Textloc.fpath -> string -> (t, string) result 158 | (** [seq_of_string'] s {!seq_of_string} composed with {!error_to_string}. *) 159 | 160 | val seq_to_string : t -> string 161 | (** [seq_to_string s] encodes [s] to a sequence of s-expressions. If [s] 162 | is an s-expression list this wrapping list is not syntactically 163 | represented in the output (see also {!seq_of_string}), use 164 | [to_string (list [l])] if you want to output [l] as a list. 165 | 166 | {b Warning.} Assumes all OCaml strings in [s] are UTF-8 encoded. *) 167 | 168 | (** {1:sexp_index S-expression indices} *) 169 | 170 | type index = 171 | | Nth of int (** *) 172 | | Key of string (** *) 173 | (** The type for s-expression indexing operations. 174 | {ul 175 | {- [Nth n], lookup zero-based element [n] in a list. If [n] is 176 | negative, counts the number of elements from the end of the 177 | list, i.e. [-1] is the last list element.} 178 | {- [Key k], lookup binding [k] in an s-expression 179 | {{!sexp_dict}dictionary.}}} *) 180 | 181 | val pp_key : string fmt 182 | (** [pp_key] formats a key, this is {!Format.pp_print_string}. *) 183 | 184 | val pp_index : ?pp_key:string fmt -> unit -> index fmt 185 | (** [pp_index] formats indices. Keys are unbracketed and formatted 186 | with [pp_key], defaults to {!pp_key}. *) 187 | 188 | (** {1:sexp_path S-expression paths} *) 189 | 190 | type path = index list 191 | (** The type for paths, a sequence of indexing operations in {b reverse} 192 | order. *) 193 | 194 | val path_of_string : string -> (path, string) result 195 | (** [path_of_string] parses a path from [s] according to the syntax 196 | {{!sexp_path_caret}given here}. *) 197 | 198 | val pp_path : ?pp_key:string fmt -> unit -> path fmt 199 | (** [pp_path ?pp_key ()] is a formatter for paths using [pp_key] to 200 | format keys (defaults to {!pp_key}). *) 201 | 202 | (** {1:carets Carets} *) 203 | 204 | type caret_loc = 205 | | Before (** The void before the s-expression found by the path. *) 206 | | Over (** The s-expression found by the path. *) 207 | | After (** The void after the s-expression found by the path. *) 208 | (** The type for caret locations. *) 209 | 210 | type caret = caret_loc * path 211 | (** The type for carets. A caret location and the path at which it 212 | applies. *) 213 | 214 | val caret_of_string : string -> (caret, string) result 215 | (** [caret_of_string s] parses a caret from [s] according to the 216 | syntax {{!sexp_path_caret}given here}. *) 217 | 218 | val pp_caret : ?pp_key:string fmt -> unit -> caret fmt 219 | (** [pp_caret ?pp_key ()] is a formatter for carets using [pp_key] 220 | to format keys (defaults to {!pp_key}). *) 221 | end 222 | 223 | (** S-expression generation. *) 224 | module Sexpg : sig 225 | 226 | (** {1:gen Generation} *) 227 | 228 | type t 229 | (** The type for generated s-expressions. *) 230 | 231 | val atom : string -> t 232 | (** [atom s] is [s] as an atom. *) 233 | 234 | type lyst 235 | (** The type for generated s-expression lists. *) 236 | 237 | val ls : lyst 238 | (** [ls] starts a list. *) 239 | 240 | val le : lyst -> t 241 | (** [le l] ends lists [l]. *) 242 | 243 | val el : t -> lyst -> lyst 244 | (** [el e l] is list [l] with [e] added at the end. *) 245 | 246 | val el_if : bool -> (unit -> t) -> lyst -> lyst 247 | (** [el cond v l] is [el (v ()) l] if [cond] is [true] and 248 | [l] otherwise. *) 249 | 250 | (** {1:derived Derived generators} *) 251 | 252 | val atomf : ('a, Format.formatter, unit, t) format4 -> 'a 253 | (** [atomf fmt ...] is an atom formatted according to [fmt]. *) 254 | 255 | val bool : bool -> t 256 | (** [bool b] is [atomf "%b" b]. *) 257 | 258 | val int : int -> t 259 | (** [int i] is [atomf "%d" i]. *) 260 | 261 | val float : float -> t 262 | (** [float f] is [atomf "%g" f]. *) 263 | 264 | val float_hex : float -> t 265 | (** [float_hex f] is [atomf "%h" f]. *) 266 | 267 | val string : string -> t 268 | (** [string s] is {!atom}. *) 269 | 270 | val option : ('a -> t) -> 'a option -> t 271 | (** [option some o] is [o] as the [none] atom if [o] is 272 | [none] and a list starting with [some] atom followed by [some v] 273 | if [o] is [Some v]. *) 274 | 275 | val list : ('a -> t) -> 'a list -> t 276 | (** [list el l] is [l] as a list whose elements are generated using 277 | [el]. *) 278 | 279 | val sexp : Sexp.t -> t 280 | (** [sexp s] is the s-expression [s] as a generated value. *) 281 | 282 | (** {1:output Output} *) 283 | 284 | val buffer_add : Buffer.t -> t -> unit 285 | (** [buffer_add b g] adds the generated s-expression value [g] to [b]. *) 286 | 287 | val to_string : t -> string 288 | (** [to_string g] is the generated s-expression value [g] as a string. *) 289 | end 290 | 291 | (** S-expression queries. *) 292 | module Sexpq : sig 293 | 294 | (** {1:query_results Result paths} *) 295 | 296 | type path = (Sexp.index * Sexp.loc) list 297 | (** The type for result paths. This is a sequence of indexing 298 | operations tupled with the source text location of the indexed 299 | s-expression in {b reverse} order. *) 300 | 301 | val pp_path : 302 | ?pp_loc:Sexp.loc Sexp.fmt -> ?pp_key:string Sexp.fmt -> unit -> 303 | path Sexp.fmt 304 | (** [pp_path ~pp_loc ~pp_key ()] formats paths using [pp_loc] 305 | (defaults to {!Sexp.pp_loc}) and [pp_key] to format the keys 306 | (defaults to {!Sexp.pp_key}). *) 307 | 308 | (** {1:query_errors Query errors} *) 309 | 310 | type error_kind = 311 | [ `Key_unbound of string * string list 312 | | `Msg of string 313 | | `Nth_unbound of int * int 314 | | `Out_of_dom of string * string * string list ] 315 | (** The type for kinds of errors. 316 | {ul 317 | {- [`Key_unbound (k, dom)] on [k] that should have been in [dom] 318 | (if not empty).} 319 | {- [`Msg m] an arbitrary message [m] (should not 320 | include position information).} 321 | {- [`Nth_unbound (n, len)] on [n] an out of bound index in a list 322 | of length [len].} 323 | {- [`Out_of_dom (kind, v, dom)] on [v] of kind [kind] that 324 | should have been in [dom]}} *) 325 | 326 | val pp_error_kind : 327 | ?pp_em:string Sexp.fmt -> ?pp_key:string Sexp.fmt -> unit -> 328 | error_kind Sexp.fmt 329 | (** [pp_error_kind ~pp_loc ~pp_em ~pp_key ()] formats error kinds 330 | using [pp_loc] for locations, [pp_em] for emphasis and [pp_key] 331 | for keys. *) 332 | 333 | type error = error_kind * (path * Sexp.loc) 334 | (** The type for query errors. The error kind tupled with the path 335 | to the offending s-expression and the location of the 336 | s-expression. *) 337 | 338 | val pp_error : 339 | ?pp_loc:Sexp.loc Sexp.fmt -> ?pp_path:path Sexp.fmt -> 340 | ?pp_error_kind:error_kind Sexp.fmt -> ?pp_prefix:unit Sexp.fmt -> unit -> 341 | error Sexp.fmt 342 | (** [pp_error ~pp_loc ~pp_path ~pp_error_kind ~pp_prefix ()] formats 343 | errors using [pp_loc], [pp_path] (defaults to {!pp_path}), 344 | [pp_error_kind] (defaults to {!pp_error_kind}) and [pp_prefix] 345 | (defaults formats ["Error: "]). *) 346 | 347 | val error_to_string : 348 | ?pp_error:error Sexp.fmt -> ('a, error) result -> ('a, string) result 349 | (** [error_to_string ~pp_error r] converts an error in [r] to a string using 350 | [pp_error], defaults to {!pp_error}. *) 351 | 352 | (** {1:queries Queries} *) 353 | 354 | type 'a t 355 | (** The type for s-expression queries. A query either succeeds 356 | against an s-expression with a value of type ['a] or it 357 | fails. *) 358 | 359 | val query : 'a t -> Sexp.t -> ('a, error) result 360 | (** [query q s] is [Ok v] if the query [q] succeeds on [s] and 361 | [Error e] otherwise. *) 362 | 363 | val query_at_path : 'a t -> (Sexp.t * path) -> ('a, error) result 364 | (** [query_at_path q (s, p)] is like {!query} except it assumes [s] 365 | is at path [p]. Use to further query s-expressions obtained with 366 | {!sexp_with_path} so that errors return the full path to errors. *) 367 | 368 | val query' : ?pp_error:error Sexp.fmt -> 'a t -> Sexp.t -> ('a, string) result 369 | (** [query' q s] is like {!query} except the result is composed with 370 | {!error_to_string}. *) 371 | 372 | (** {1:outcome Success and failure} *) 373 | 374 | val succeed : 'a -> 'a t 375 | (** [succeed v] is a query that succeeds with value [v] on any 376 | s-expression. *) 377 | 378 | val fail : error_kind -> 'a t 379 | (** [fail k] is a query that fails on any s-expression with error 380 | kind [k]. *) 381 | 382 | val failf : ('a, Format.formatter, unit, 'b t) format4 -> 'a 383 | (** [failf fmt ...] is [fail (`Msg m)] with [m] formatted according 384 | to [fmt]. *) 385 | 386 | (** {1:qcomb Query combinators} *) 387 | 388 | val app : ('a -> 'b) t -> 'a t -> 'b t 389 | (** [app fq q] queries an s-expression first with [fq] and then with [q] 390 | and applies the result of latter to the former. *) 391 | 392 | val ( $ ) : ('a -> 'b) t -> 'a t -> 'b t 393 | (** [f $ v] is [app f v]. *) 394 | 395 | val pair : 'a t -> 'b t -> ('a * 'b) t 396 | (** [pair q0 q1] queries an s-expression first with [q0] and then with [q1] 397 | and returns the pair of their result. *) 398 | 399 | val bind : 'a t -> ('a -> 'b t) -> 'b t 400 | (** [bind q f] queries an s-expression with [q], applies the result to 401 | [f] and re-queries the s-expression with the result. *) 402 | 403 | val map : ('a -> 'b) -> 'a t -> 'b t 404 | (** [map f q] is [app (succeed f) q]. *) 405 | 406 | val some : 'a t -> 'a option t 407 | (** [some q] is [map Option.some q]. *) 408 | 409 | val loc : 'a t -> ('a * (path * Sexp.loc)) t 410 | (** [loc q] queries with [q] an returns the result with the 411 | query path and source text location to the queried 412 | s-expression. *) 413 | 414 | (** {1:qsexp S-expression queries} 415 | 416 | Queries for s-expressions. These queries never fail. *) 417 | 418 | val fold : atom:'a t -> list:'a t -> 'a t 419 | (** [fold ~atom ~list] queries atoms with [atom] and lists with [list]. *) 420 | 421 | val sexp : Sexp.t t 422 | (** [sexp] queries any s-expression and returns its generic representation. *) 423 | 424 | val sexp_with_path : (Sexp.t * path) t 425 | (** [sexp_with_path] is like {!sexp} but also returns the path to 426 | s-expression. *) 427 | 428 | (** {1:qatom Atom queries} 429 | 430 | Queries for atoms. These queries fail on lists. *) 431 | 432 | val atom : string t 433 | (** [atom] queries an atom as a string. *) 434 | 435 | val atom_to : kind:string -> (string -> ('a, string) result) -> 'a t 436 | (** [atom_to ~kind p] queries an atom and parses it with [p]. In 437 | case of [Error m] fails with message [m]. [kind] is the kind of 438 | value parsed, used for the error in case a list is found. *) 439 | 440 | (** {b TODO.} Maybe combinators to devise an approriate parse 441 | function for {!atom_to} are a better idea than the following 442 | two combinators. *) 443 | 444 | val enum : kind:string -> Set.Make(String).t -> string t 445 | (** [enum ~kind ss] queries an atom for one of the element of [ss] 446 | and fails otherwise. [kind] is for the kind of elements in [ss], 447 | it used for error reporting. *) 448 | 449 | val enum_map : kind:string -> 'a Map.Make(String).t -> 'a t 450 | (** [enum_map ~pp_elt ~kind sm] queries an atom for it's map in [sm] 451 | and fails if the atom is not bound in [sm]. [kind] is for the 452 | kind of elements in [sm], it used for error reporting. *) 453 | 454 | val bool : bool t 455 | (** [bool] queries an atom for one of [true] or [false]. *) 456 | 457 | val int : int t 458 | (** [int] queries an atom for an integer value parsed with 459 | {!int_of_string}. *) 460 | 461 | val int32 : int32 t 462 | (** [int32] queries an atom for an integer value parsed with 463 | {!Int32.of_string}. *) 464 | 465 | val int64 : int64 t 466 | (** [int64] queries an atom for an integer value parsed with 467 | {!Int64.of_string}. *) 468 | 469 | val float : float t 470 | (** [float] queries an atom for a float value parsed with 471 | {!float_of_string}. *) 472 | 473 | (** {1:qlist List queries} 474 | 475 | Queries for s-expression lists. These queries fail on atoms. *) 476 | 477 | val is_empty : bool t 478 | (** [is_empty] queries a list for emptyness. *) 479 | 480 | val hd : 'a t -> 'a t 481 | (** [hd q] queries the head of a list with [q]. Fails on empty lists. *) 482 | 483 | val tl : 'a t -> 'a t 484 | (** [tail q] queries the tail of a list with [q]. Fails on empty lists. *) 485 | 486 | val fold_list : ('a -> 'b -> 'b) -> 'a t -> 'b -> 'b t 487 | (** [fold_list f q acc] queries the elements of a list from left to 488 | right with [q] and folds the result with [f] starting with 489 | [acc]. *) 490 | 491 | val list : 'a t -> 'a list t 492 | (** [list q] queries the elements of a list with [q]. *) 493 | 494 | (** {2:qlist List index queries} *) 495 | 496 | val nth : ?absent:'a -> int -> 'a t -> 'a t 497 | (** [nth ?absent n q] queries the [n]th index of a list with [q]. If 498 | [n] is negative counts from the end of the list, so [-1] is the 499 | last list element. If the element does not exist this fails if 500 | [absent] is [None] and succeeds with [v] if [absent] is [Some 501 | v]. *) 502 | 503 | val delete_nth : must_exist:bool -> int -> Sexp.t t 504 | (** [delete_nth ~must_exist n] deletes the [n]th element of the 505 | list. If the element does not exist this fails when [must_exist] 506 | is [true] or returns the list unchanged when [must_exist] is 507 | [false]. *) 508 | 509 | (** {1:qdict Dictionary queries} 510 | 511 | Queries for s-expression {{!sexp}dictionaries}. These queries 512 | fail on atoms. *) 513 | 514 | val key : ?absent:'a -> string -> 'a t -> 'a t 515 | (** [key ?absent k q] queries the value of key [k] of a dictionary 516 | with [q]. If [k] is not bound this fails if [absent] is [None] 517 | and succeeds with [v] if [absent] is [Some v]. *) 518 | 519 | val delete_key : must_exist:bool -> string -> Sexp.t t 520 | (** [delete_key ~must_exist k] deletes key [k] from the dictionary. 521 | If [k] is not bound this fails when [must_exist] is [true] or 522 | returns the dictionary unchanged when [must_exist] is 523 | [false]. *) 524 | 525 | val key_dom : validate:Set.Make(String).t option -> Set.Make(String).t t 526 | (** [key_dom validate] queries the key domain of a list of bindings. 527 | If [validate] is [Some dom], the query fails if a key is not in 528 | [dom]. The query also fails if a binding is not well-formed. 529 | [pp_key] is used to format keys. 530 | 531 | {b TODO.} Not really happy about this function, we lose the key 532 | locations which is useful for further deconstruction. Also maybe 533 | we rather want binding folds. *) 534 | 535 | val atomic : 'a t -> 'a t 536 | (** [atomic q] queries an atom or the atom of a singleton list with 537 | [q]. It fails on empty lists or non-singleton lists. 538 | 539 | This is useful for singleton {{!sexp_dict}dictionary} 540 | bindings. In error reporting treats the list as if it doesn't 541 | exist syntactically which is the case in dictionary bindings. *) 542 | 543 | (** {1:indices Index queries} *) 544 | 545 | val index : ?absent:'a -> Sexp.index -> 'a t -> 'a t 546 | (** [index ?absent i q] queries the s-expression index [i] with [q] using 547 | {!nth} or {!key} according to [i]. Fails on atoms. *) 548 | 549 | val delete_index : must_exist:bool -> Sexp.index -> Sexp.t t 550 | (** [delete_index ~must_exist i] deletes the s-expression index [i] 551 | using {!delete_nth} or {!delete_key} according to [i]. *) 552 | 553 | (** {1:path_caret Path and caret queries} 554 | 555 | These queries fail on indexing errors, that is if an atom gets 556 | indexed. *) 557 | 558 | val path : ?absent:'a -> Sexp.path -> 'a t -> 'a t 559 | (** [path p q] queries the s-expression found by [p] using [q]. If 560 | [p] can't be found this fails if [absent] is [None] and succeeds 561 | with [v] if [absent] is [Some v]. *) 562 | 563 | val probe_path : Sexp.path -> (path * Sexp.t * Sexp.path) t 564 | (** [probe_path p] is a query that probes for [p]'s existence. Except 565 | for indexing errors it always succeeds with [(sp, s, rem)]: 566 | {ul 567 | {- If [p] is found, this is the path to 568 | [sp] to the found expression [s] and [rem] is empty.} 569 | {- If [p] is not found, this is the path [sp] that leads 570 | to the s-expression [s] that could not be indexed and [rem] 571 | has the indexes that could not be performed.}} *) 572 | 573 | val delete_at_path : must_exist:bool -> Sexp.path -> Sexp.t t 574 | (** [delete_at_path ~must_exist p] deletes the s-expression found by 575 | [p] from the queried s-expression. If the path does not exist 576 | this fails if [must_exist] is [true] and returns the 577 | s-expression itself if [must_exist] is [false]. *) 578 | 579 | val splice_at_path : 580 | ?stub:Sexp.t -> must_exist:bool -> Sexp.path -> rep:Sexp.t -> Sexp.t t 581 | (** [splice_at_path ?stub ~must_exist p ~rep] replaces the s-expression 582 | found at [p] by splicing [rep]. If the path does not exist this fails 583 | if [must_exist] is [true] and the non-existing part of 584 | the path is created if [must_exist] is [false]. If elements need 585 | to be created [stub] (defaults to [Sexp.atom ""]) is used. *) 586 | 587 | val splice_at_caret : 588 | ?stub:Sexp.t -> must_exist:bool -> Sexp.caret -> rep:Sexp.t -> Sexp.t t 589 | (** [splice_caret ?stub ~must_exist p rep] splices the s-expression [rep] 590 | at the caret [p] of the s-expression. If path of the caret does not 591 | exist this fails if [must_exist] is [true] and the non-existing part of 592 | the path is created if [must_exist] is [false]. If atoms 593 | need to be create [stub] (defaults to [Sexp.atom ""]) is used. *) 594 | 595 | (** {1:ocaml OCaml datatype encoding queries} *) 596 | 597 | val option : 'a t -> 'a option t 598 | (** [option q] queries with [q] the value of an option represented 599 | according the encoding of {!Sexpg.option}. *) 600 | end 601 | 602 | (** {1:sexp_dict Dictionaries} 603 | 604 | An s-expression {e dictionary} is a list of bindings. A {e 605 | binding} is a list that starts with a {e key} and the remaining 606 | elements of the list are the binding's {e value}. For example in this 607 | binding: 608 | {v 609 | (key v0 v1 ...) 610 | v} 611 | The key is [key] and the value the possibly empty list [v0], [v1], 612 | ... of s-expressions. The {{!Sexpq.qdict}API} for dictionaries 613 | represents the value by a fake (doesn't exist syntactically) 614 | s-expression list whose text location starts at the first element 615 | of the value. 616 | 617 | {1:sexp_path_caret Path & caret syntax} 618 | 619 | Path and carets provide a way for end users to address 620 | s-expressions and edit locations. 621 | 622 | A {e path} is a sequence of {{!sexp_dict}key} and list indexing 623 | operations. Applying the path to an s-expression leads to an 624 | s-expression or nothing if one of the indices does not exist, or 625 | an error if ones tries to index an atom. 626 | 627 | A {e caret} is a path and a spatial specification for the 628 | s-expression found by the path. The caret indicates either the 629 | void before that expression, the expression itself (over caret) or the 630 | void after it. 631 | 632 | Here are a few examples of paths and carets, syntactically the 633 | charater ['v'] is used to denote the caret's insertion point before or 634 | after a path. There's no distinction between a path an over caret. 635 | 636 | {v 637 | Ocaml.libs # value of key 'libs' of dictionary 'ocaml' 638 | ocaml.v[libs] # before the key binding (if any) 639 | ocaml.[libs]v # after the key binding (if any) 640 | 641 | ocaml.libs.[0] # first element of key 'libs' of dictionary 'ocaml' 642 | ocaml.libs.v[0] # before first element (if any) 643 | ocaml.libs.[0]v # after first element (if any) 644 | 645 | ocaml.libs.[-1] # last element of key 'libs' of dictionary 'ocaml' 646 | ocaml.libs.v[-1] # before last element (if any) 647 | ocaml.libs.[-1]v # after last element (if any) 648 | v} 649 | 650 | More formally a {e path} is a [.] seperated list of indices. 651 | 652 | An {e index} is written [[i]]. [i] can a zero-based list index 653 | with negative indices counting from the end of the list ([-1] is 654 | the last element). Or [i] can be a dictionary key [key]. If there 655 | is no ambiguity, the surrounding brackets can be dropped. 656 | 657 | A caret is a path whose last index brackets can be prefixed or 658 | suffixed by an insertion point, represented by the character 659 | ['v']. This respectively denote the void before or after the 660 | s-expression found by the path. 661 | 662 | {b Note.} The syntax has no form of quoting at the moment this 663 | means key names can't contain, [\[], [\]], or start with a number. 664 | 665 | {1:sexp_syntax S-expression syntax} 666 | 667 | S-expressions are a general way of describing data via atoms 668 | (sequences of characters) and lists delimited by parentheses. 669 | Here are a few examples of s-expressions and their syntax: 670 | 671 | {v 672 | this-is-an_atom 673 | (this is a list of seven atoms) 674 | (this list contains (a nested) list) 675 | 676 | ; This is a comment 677 | ; Anything that follows a semi-colon is ignored until the next line 678 | 679 | (this list ; has three atoms and an embeded () 680 | comment) 681 | 682 | "this is a quoted atom, it can contain spaces ; and ()" 683 | 684 | "quoted atoms can be split ^ 685 | across lines or contain Unicode esc^u{0061}pes" 686 | v} 687 | 688 | We define the syntax of s-expressions over a sequence of 689 | {{:http://unicode.org/glossary/#unicode_scalar_value}Unicode 690 | characters} in which all US-ASCII control characters 691 | (U+0000..U+001F and U+007F) except {{!whitespace}whitespace} are 692 | forbidden in unescaped form. 693 | 694 | {2:sexp S-expressions} 695 | 696 | An {e s-expression} is either an {{!atoms}{e atom}} or a 697 | {{!lists}{e list}} of s-expressions interspaced with 698 | {{!whitespace}{e whitespace}} and {{!comments}{e comments}}. A {e 699 | sequence of s-expressions} is a succession of s-expressions 700 | interspaced with whitespace and comments. 701 | 702 | These elements are informally described below and finally made 703 | precise via an ABNF {{!grammar}grammar}. 704 | 705 | {2:whitespace Whitespace} 706 | 707 | Whitespace is a sequence of whitespace characters, namely, space 708 | [' '] (U+0020), tab ['\t'] (U+0009), line feed ['\n'] (U+000A), 709 | vertical tab ['\t'] (U+000B), form feed (U+000C) and carriage return 710 | ['\r'] (U+000D). 711 | 712 | {2:comments Comments} 713 | 714 | Unless it occurs inside an atom in quoted form (see below) 715 | anything that follows a semicolon [';'] (U+003B) is ignored until 716 | the next {e end of line}, that is either a line feed ['\n'] (U+000A), a 717 | carriage return ['\r'] (U+000D) or a carriage return and a line feed 718 | ["\r\n"] (). 719 | 720 | {v 721 | (this is not a comment) ; This is a comment 722 | (this is not a comment) 723 | v} 724 | 725 | {2:atoms Atoms} 726 | 727 | An atom represents ground data as a string of Unicode characters. 728 | It can, via escapes, represent any sequence of Unicode characters, 729 | including control characters and U+0000. It cannot represent an 730 | arbitrary byte sequence except via a client-defined encoding 731 | convention (e.g. Base64 or hex encoding). 732 | 733 | Atoms can be specified either via an unquoted or a quoted form. In 734 | unquoted form the atom is written without delimiters. In quoted 735 | form the atom is delimited by double quote ['"'] (U+0022) 736 | characters, it is mandatory for atoms that contain 737 | {{!whitespace}whitespace}, parentheses ['('] [')'], semicolons 738 | [';'], quotes ['"'], carets ['^'] or characters that need to be 739 | escaped. 740 | 741 | {v 742 | abc ; a token for the atom "abc" 743 | "abc" ; a quoted token for the atom "abc" 744 | "abc; (d" ; a quoted token for the atom "abc; (d" 745 | "" ; the quoted token for the atom "" 746 | v} 747 | 748 | For atoms that do not need to be quoted, both their unquoted and 749 | quoted form represent the same string; e.g. the string ["true"] 750 | can be represented both by the atoms {e true} and {e "true"}. The 751 | empty string can only be represented in quoted form by {e ""}. 752 | 753 | In quoted form escapes are introduced by a caret ['^']. Double 754 | quotes ['"'] and carets ['^'] must always be escaped. 755 | 756 | {v 757 | "^^" ; atom for ^ 758 | "^n" ; atom for line feed U+000A 759 | "^u{0000}" ; atom for U+0000 760 | "^"^u{1F42B}^"" ; atom with a quote, U+1F42B and a quote 761 | v} 762 | 763 | The following escape sequences are recognized: 764 | {ul 765 | {- ["^ "] () for space [' '] (U+0020)} 766 | {- ["^\""] () for double quote ['"'] (U+0022) 767 | {b mandatory}} 768 | {- ["^^"] () for caret ['^'] (U+005E) {b mandatory}} 769 | {- ["^n"] () for line feed ['\n'] (U+000A)} 770 | {- ["^r"] () for carriage return ['\r'] (U+000D)} 771 | {- ["^u{X}"] with [X] is from 1 to at most 6 upper or lower case 772 | hexadecimal digits standing for the corresponding 773 | {{:http://unicode.org/glossary/#unicode_scalar_value}Unicode character} 774 | U+X.} 775 | {- Any other character except line feed ['\n'] (U+000A) or 776 | carriage return ['\r'] (U+000D), following a caret is an 777 | illegal sequence of characters. In the two former cases the 778 | atom continues on the next line and white space is ignored.}} 779 | 780 | An atom in quoted form can be split across lines by using a caret 781 | ['^'] (U+005E) followed by a line feed ['\n'] (U+000A) or a 782 | carriage return ['\r'] (U+000D); any subsequent 783 | {{!whitespace}whitespace} is ignored. 784 | 785 | {v 786 | "^ 787 | a^ 788 | ^ " ; the atom "a " 789 | v} 790 | 791 | The character ^ (U+005E) is used as an escape character rather 792 | than the usual \ (U+005C) in order to make quoted Windows® 793 | file paths decently readable and, not the least, utterly please 794 | DKM. 795 | 796 | {2:lists Lists} 797 | 798 | Lists are delimited by left ['('] (U+0028) and right [')'] 799 | (U+0029) parentheses. Their elements are s-expressions separated 800 | by optional {{!whitespace}whitespace} and 801 | {{!comments}comments}. For example: 802 | 803 | {v 804 | (a list (of four) expressions) 805 | (a list(of four)expressions) 806 | ("a"list("of"four)expressions) 807 | (a list (of ; This is a comment 808 | four) expressions) 809 | () ; the empty list 810 | v} 811 | 812 | {2:grammar Formal grammar} 813 | 814 | The following {{:https://tools.ietf.org/html/rfc5234}RFC 5234} 815 | ABNF grammar is defined on a sequence of 816 | {{:http://unicode.org/glossary/#unicode_scalar_value}Unicode characters}. 817 | 818 | {v 819 | sexp-seq = *(ws / comment / sexp) 820 | sexp = atom / list 821 | list = %x0028 sexp-seq %x0029 822 | atom = token / qtoken 823 | token = t-char *(t-char) 824 | qtoken = %x0022 *(q-char / escape / cont) %x0022 825 | escape = %x005E (%x0020 / %x0022 / %x005E / %x006E / %x0072 / 826 | %x0075 %x007B unum %x007D) 827 | unum = 1*6(HEXDIG) 828 | cont = %x005E nl ws 829 | ws = *(ws-char) 830 | comment = %x003B *(c-char) nl 831 | nl = %x000A / %x000D / %x000D %x000A 832 | t-char = %x0021 / %x0023-0027 / %x002A-%x003A / %x003C-%x005D / 833 | %x005F-%x007E / %x0080-D7FF / %xE000-10FFFF 834 | q-char = t-char / ws-char / %x0028 / %x0029 / %x003B 835 | ws-char = %x0020 / %x0009 / %x000A / %x000B / %x000C / %x000D 836 | c-char = %x0009 / %x000B / %x000C / %x0020-D7FF / %xE000-10FFFF 837 | v} 838 | 839 | A few additional constraints not expressed by the grammar: 840 | {ul 841 | {- [unum] once interpreted as an hexadecimal number must be a 842 | {{:http://unicode.org/glossary/#unicode_scalar_value}Unicode scalar 843 | value.}} 844 | {- A comment can be ended by the end of the character sequence rather 845 | than [nl]. }} 846 | *) 847 | -------------------------------------------------------------------------------- /src/serialkit_text.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2019 The serialkit programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | 7 | let unsafe_get = String.unsafe_get 8 | module String_set = Set.Make (String) 9 | 10 | (* Heterogeneous dictionaries *) 11 | 12 | module Dict = struct 13 | (* Type identifiers, can be deleted once we require 5.1 *) 14 | module Type = struct 15 | type (_, _) eq = Equal : ('a, 'a) eq 16 | module Id = struct 17 | type _ id = .. 18 | module type ID = sig type t type _ id += Id : t id end 19 | type 'a t = (module ID with type t = 'a) 20 | 21 | let make (type a) () : a t = 22 | (module struct type t = a type _ id += Id : t id end) 23 | 24 | let provably_equal 25 | (type a b) ((module A) : a t) ((module B) : b t) : (a, b) eq option 26 | = 27 | match A.Id with B.Id -> Some Equal | _ -> None 28 | 29 | let uid (type a) ((module A) : a t) = 30 | Obj.Extension_constructor.id (Obj.Extension_constructor.of_val A.Id) 31 | end 32 | end 33 | 34 | module M = Map.Make (Int) 35 | type 'a key = 'a Type.Id.t 36 | type binding = B : 'a key * 'a -> binding 37 | type t = binding M.t 38 | 39 | let key = Type.Id.make 40 | let empty = M.empty 41 | let mem k m = M.mem (Type.Id.uid k) m 42 | let add k v m = M.add (Type.Id.uid k) (B (k, v)) m 43 | let tag k m = add k () m 44 | let remove k m = M.remove (Type.Id.uid k) m 45 | let find : type a. a key -> t -> a option = 46 | fun k m -> match M.find_opt (Type.Id.uid k) m with 47 | | None -> None 48 | | Some B (k', v) -> 49 | match Type.Id.provably_equal k k' with 50 | | None -> assert false | Some Type.Equal -> Some v 51 | end 52 | 53 | (* Text locations *) 54 | 55 | module Textloc = struct 56 | 57 | (* File paths *) 58 | 59 | type fpath = string 60 | let file_none = "-" 61 | let pp_path = Format.pp_print_string 62 | 63 | (* Byte positions *) 64 | 65 | type byte_pos = int (* zero-based *) 66 | let byte_pos_none = -1 67 | 68 | (* Lines *) 69 | 70 | type line_num = int (* one-based *) 71 | let line_num_none = -1 72 | 73 | (* Line positions 74 | 75 | We keep the byte position of the first element on the line. This 76 | first element may not exist and be equal to the text length if 77 | the input ends with a newline. Editors expect tools to compute 78 | visual columns (not a very good idea). By keeping these byte 79 | positions we can approximate columns by subtracting the line byte 80 | position data byte location. This will only be correct on 81 | US-ASCII data. *) 82 | 83 | type line_pos = line_num * byte_pos 84 | let line_pos_first = 1, 0 85 | let line_pos_none = line_num_none, byte_pos_none 86 | 87 | (* Text locations *) 88 | 89 | type t = 90 | { file : fpath; 91 | first_byte : byte_pos; last_byte : byte_pos; 92 | first_line : line_pos; last_line : line_pos } 93 | 94 | let v ~file ~first_byte ~last_byte ~first_line ~last_line = 95 | { file; first_byte; last_byte; first_line; last_line } 96 | 97 | let file l = l.file 98 | let first_byte l = l.first_byte 99 | let last_byte l = l.last_byte 100 | let first_line l = l.first_line 101 | let last_line l = l.last_line 102 | let none = 103 | let first_byte = byte_pos_none and last_byte = byte_pos_none in 104 | let first_line = line_pos_none and last_line = line_pos_none in 105 | v ~file:file_none ~first_byte ~last_byte ~first_line ~last_line 106 | 107 | (* Predicates and comparisons *) 108 | 109 | let is_none l = l.first_byte < 0 110 | let is_empty l = l.first_byte > l.last_byte 111 | let equal l0 l1 = 112 | String.equal l0.file l1.file && 113 | Int.equal l0.first_byte l1.first_byte && 114 | Int.equal l0.last_byte l1.last_byte 115 | 116 | let compare l0 l1 = 117 | let c = String.compare l0.file l1.file in 118 | if c <> 0 then c else 119 | let c = Int.compare l0.first_byte l1.first_byte in 120 | if c <> 0 then c else 121 | Int.compare l0.last_byte l1.last_byte 122 | 123 | (* Shrink and stretch *) 124 | 125 | let set_first l ~first_byte ~first_line = { l with first_byte; first_line } 126 | let set_last l ~last_byte ~last_line = { l with last_byte; last_line } 127 | 128 | [@@@warning "-6"] 129 | let to_first l = v l.file l.first_byte l.first_byte l.first_line l.first_line 130 | let to_last l = v l.file l.last_byte l.last_byte l.last_line l.last_line 131 | let before l = v l.file l.first_byte byte_pos_none l.first_line line_pos_none 132 | let after l = 133 | v l.file (l.first_byte + 1) byte_pos_none l.last_line line_pos_none 134 | [@@@warning "+6"] 135 | 136 | let span l0 l1 = 137 | let first_byte, first_line = 138 | if l0.first_byte < l1.first_byte 139 | then l0.first_byte, l0.first_line 140 | else l1.first_byte, l1.first_line 141 | in 142 | let last_byte, last_line, file = 143 | if l0.last_byte < l1.last_byte 144 | then l1.last_byte, l1.last_line, l1.file 145 | else l0.last_byte, l0.last_line, l0.file 146 | in 147 | v ~file ~first_byte ~first_line ~last_byte ~last_line 148 | 149 | [@@@warning "-6"] 150 | let reloc ~first ~last = 151 | v last.file first.first_byte last.last_byte first.first_line last.last_line 152 | [@@@warning "+6"] 153 | (* Formatters *) 154 | 155 | let pf = Format.fprintf 156 | let pp_ocaml ppf l = match is_none l with 157 | | true -> pf ppf "File \"%a\"" pp_path l.file 158 | | false -> 159 | let pp_lines ppf l = match fst l.first_line = fst l.last_line with 160 | | true -> pf ppf "line %d" (fst l.first_line) 161 | | false -> pf ppf "lines %d-%d" (fst l.first_line) (fst l.last_line) 162 | in 163 | (* "characters" represent positions (insertion points) not columns *) 164 | let pos_s = l.first_byte - snd l.first_line in 165 | let pos_e = l.last_byte - snd l.last_line + 1 in 166 | if pos_s = 0 && pos_e = 0 167 | then pf ppf "File \"%a\", %a" pp_path l.file pp_lines l 168 | else pf ppf "File \"%a\", %a, characters %d-%d" 169 | pp_path l.file pp_lines l pos_s pos_e 170 | 171 | let pp_gnu ppf l = match is_none l with 172 | | true -> pf ppf "%a:" pp_path l.file 173 | | false -> 174 | let pp_lines ppf l = 175 | let col_s = l.first_byte - snd l.first_line + 1 in 176 | let col_e = l.last_byte - snd l.last_line + 1 in 177 | match fst l.first_line = fst l.last_line with 178 | | true -> pf ppf "%d.%d-%d" (fst l.first_line) col_s col_e 179 | | false -> 180 | pf ppf "%d.%d-%d.%d" 181 | (fst l.first_line) col_s (fst l.last_line) col_e 182 | in 183 | pf ppf "%a:%a" pp_path l.file pp_lines l 184 | 185 | let pp = pp_gnu 186 | 187 | let pp_dump ppf l = 188 | pf ppf "file:%s bytes:%d-%d lines:%d-%d lines-bytes:%d-%d]" 189 | l.file l.first_byte l.last_byte (fst l.first_line) (fst l.last_line) 190 | (snd l.first_line) (snd l.last_line) 191 | end 192 | 193 | (* Node meta data *) 194 | 195 | module Meta = struct 196 | type id = int 197 | type t = { textloc : Textloc.t; id : id; dict : Dict.t } 198 | 199 | let new_id = let id = Atomic.make 0 in fun () -> Atomic.fetch_and_add id 1 200 | let make ?(textloc = Textloc.none) () = 201 | { textloc; id = new_id (); dict = Dict.empty } 202 | 203 | let none = make () 204 | let id m = m.id 205 | let textloc m = m.textloc 206 | let with_textloc ~keep_id m textloc = match keep_id with 207 | | true -> { m with textloc } 208 | | false -> { m with textloc; id = new_id () } 209 | 210 | let equal m0 m1 = Int.equal m0.id m1.id 211 | let compare m0 m1 = Int.compare m0.id m1.id 212 | let is_none m = equal none m 213 | 214 | type 'a key = 'a Dict.key 215 | let key = Dict.key 216 | let mem k m = Dict.mem k m.dict 217 | let add k v m = { m with dict = Dict.add k v m.dict } 218 | let tag k m = add k () m 219 | let remove k m = { m with dict = Dict.remove k m.dict } 220 | let find k m = Dict.find k m.dict 221 | end 222 | 223 | (* UTF tools 224 | 225 | This can be killed once we require OCaml 4.14. 226 | This is stricly equivalent to what was upstream to the Stdlib. *) 227 | 228 | module Uchar = struct 229 | include Uchar 230 | 231 | (* UTF codecs tools *) 232 | 233 | type utf_decode = int 234 | (* This is an int [0xDUUUUUU] decomposed as follows: 235 | - [D] is four bits for decode information, the highest bit is set if the 236 | decode is valid. The three lower bits indicate the number of elements 237 | from the source that were consumed by the decode. 238 | - [UUUUUU] is the decoded Unicode character or the Unicode replacement 239 | character U+FFFD if for invalid decodes. *) 240 | 241 | let valid_bit = 27 242 | let decode_bits = 24 243 | 244 | let[@inline] utf_decode_is_valid d = (d lsr valid_bit) = 1 245 | let[@inline] utf_decode_length d = (d lsr decode_bits) land 0b111 246 | let[@inline] utf_decode_uchar d = unsafe_of_int (d land 0xFFFFFF) 247 | let[@inline] utf_decode n u = ((8 lor n) lsl decode_bits) lor (to_int u) 248 | let[@inline] utf_decode_invalid n = (n lsl decode_bits) lor (Uchar.to_int rep) 249 | end 250 | 251 | (* UTF-8 *) 252 | 253 | let dec_invalid = Uchar.utf_decode_invalid 254 | let[@inline] dec_ret n u = Uchar.utf_decode n (Uchar.unsafe_of_int u) 255 | 256 | let[@inline] not_in_x80_to_xBF b = b lsr 6 <> 0b10 257 | let[@inline] not_in_xA0_to_xBF b = b lsr 5 <> 0b101 258 | let[@inline] not_in_x80_to_x9F b = b lsr 5 <> 0b100 259 | let[@inline] not_in_x90_to_xBF b = b < 0x90 || 0xBF < b 260 | let[@inline] not_in_x80_to_x8F b = b lsr 4 <> 0x8 261 | 262 | let[@inline] utf_8_uchar_2 b0 b1 = 263 | ((b0 land 0x1F) lsl 6) lor 264 | ((b1 land 0x3F)) 265 | 266 | let[@inline] utf_8_uchar_3 b0 b1 b2 = 267 | ((b0 land 0x0F) lsl 12) lor 268 | ((b1 land 0x3F) lsl 6) lor 269 | ((b2 land 0x3F)) 270 | 271 | let[@inline] utf_8_uchar_4 b0 b1 b2 b3 = 272 | ((b0 land 0x07) lsl 18) lor 273 | ((b1 land 0x3F) lsl 12) lor 274 | ((b2 land 0x3F) lsl 6) lor 275 | ((b3 land 0x3F)) 276 | 277 | external unsafe_get_uint8 : string -> int -> int = "%bytes_unsafe_get" 278 | 279 | let string_get_utf_8_uchar b i = 280 | let b0 = Bytes.get_uint8 (Bytes.unsafe_of_string b) i in 281 | let get = unsafe_get_uint8 in 282 | let max = String.length b - 1 in 283 | match Char.unsafe_chr b0 with (* See The Unicode Standard, Table 3.7 *) 284 | | '\x00' .. '\x7F' -> dec_ret 1 b0 285 | | '\xC2' .. '\xDF' -> 286 | let i = i + 1 in if i > max then dec_invalid 1 else 287 | let b1 = get b i in if not_in_x80_to_xBF b1 then dec_invalid 1 else 288 | dec_ret 2 (utf_8_uchar_2 b0 b1) 289 | | '\xE0' -> 290 | let i = i + 1 in if i > max then dec_invalid 1 else 291 | let b1 = get b i in if not_in_xA0_to_xBF b1 then dec_invalid 1 else 292 | let i = i + 1 in if i > max then dec_invalid 2 else 293 | let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else 294 | dec_ret 3 (utf_8_uchar_3 b0 b1 b2) 295 | | '\xE1' .. '\xEC' | '\xEE' .. '\xEF' -> 296 | let i = i + 1 in if i > max then dec_invalid 1 else 297 | let b1 = get b i in if not_in_x80_to_xBF b1 then dec_invalid 1 else 298 | let i = i + 1 in if i > max then dec_invalid 2 else 299 | let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else 300 | dec_ret 3 (utf_8_uchar_3 b0 b1 b2) 301 | | '\xED' -> 302 | let i = i + 1 in if i > max then dec_invalid 1 else 303 | let b1 = get b i in if not_in_x80_to_x9F b1 then dec_invalid 1 else 304 | let i = i + 1 in if i > max then dec_invalid 2 else 305 | let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else 306 | dec_ret 3 (utf_8_uchar_3 b0 b1 b2) 307 | | '\xF0' -> 308 | let i = i + 1 in if i > max then dec_invalid 1 else 309 | let b1 = get b i in if not_in_x90_to_xBF b1 then dec_invalid 1 else 310 | let i = i + 1 in if i > max then dec_invalid 2 else 311 | let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else 312 | let i = i + 1 in if i > max then dec_invalid 3 else 313 | let b3 = get b i in if not_in_x80_to_xBF b3 then dec_invalid 3 else 314 | dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3) 315 | | '\xF1' .. '\xF3' -> 316 | let i = i + 1 in if i > max then dec_invalid 1 else 317 | let b1 = get b i in if not_in_x80_to_xBF b1 then dec_invalid 1 else 318 | let i = i + 1 in if i > max then dec_invalid 2 else 319 | let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else 320 | let i = i + 1 in if i > max then dec_invalid 3 else 321 | let b3 = get b i in if not_in_x80_to_xBF b3 then dec_invalid 3 else 322 | dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3) 323 | | '\xF4' -> 324 | let i = i + 1 in if i > max then dec_invalid 1 else 325 | let b1 = get b i in if not_in_x80_to_x8F b1 then dec_invalid 1 else 326 | let i = i + 1 in if i > max then dec_invalid 2 else 327 | let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else 328 | let i = i + 1 in if i > max then dec_invalid 3 else 329 | let b3 = get b i in if not_in_x80_to_xBF b3 then dec_invalid 3 else 330 | dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3) 331 | | _ -> dec_invalid 1 332 | 333 | 334 | (* Error message helpers. *) 335 | 336 | module Err_msg = struct 337 | let pf = Format.fprintf 338 | let pp_sp = Format.pp_print_space 339 | let pp_nop _ () = () 340 | let pp_any fmt ppf _ = pf ppf fmt 341 | let pp_op_enum op ?(empty = pp_nop) pp_v ppf = function 342 | | [] -> empty ppf () 343 | | [v] -> pp_v ppf v 344 | | _ as vs -> 345 | let rec loop ppf = function 346 | | [v0; v1] -> pf ppf "%a@ %s@ %a" pp_v v0 op pp_v v1 347 | | v :: vs -> pf ppf "%a,@ " pp_v v; loop ppf vs 348 | | [] -> assert false 349 | in 350 | loop ppf vs 351 | 352 | let pp_and_enum ?empty pp_v ppf vs = pp_op_enum "and" ?empty pp_v ppf vs 353 | let pp_or_enum ?empty pp_v ppf vs = pp_op_enum "or" ?empty pp_v ppf vs 354 | let pp_did_you_mean pp_v ppf = function 355 | | [] -> () | vs -> pf ppf "Did@ you@ mean %a ?" (pp_or_enum pp_v) vs 356 | 357 | let pp_must_be pp_v ppf = function 358 | | [] -> () | vs -> pf ppf "Must be %a." (pp_or_enum pp_v) vs 359 | 360 | let pp_unknown ~kind pp_v ppf v = pf ppf "Unknown %a %a." kind () pp_v v 361 | let pp_unknown' ~kind pp_v ~hint ppf (v, hints) = match hints with 362 | | [] -> pp_unknown ~kind pp_v ppf v 363 | | hints -> pp_unknown ~kind pp_v ppf v; pp_sp ppf (); (hint pp_v) ppf hints 364 | 365 | let edit_distance s0 s1 = 366 | let minimum (a : int) (b : int) (c : int) : int = min a (min b c) in 367 | let s0,s1 = if String.length s0 <= String.length s1 then s0,s1 else s1,s0 in 368 | let m = String.length s0 and n = String.length s1 in 369 | let rec rows row0 row i = match i > n with 370 | | true -> row0.(m) 371 | | false -> 372 | row.(0) <- i; 373 | for j = 1 to m do 374 | if s0.[j - 1] = s1.[i - 1] then row.(j) <- row0.(j - 1) else 375 | row.(j) <- minimum (row0.(j - 1) + 1) (row0.(j) + 1) (row.(j - 1) + 1) 376 | done; 377 | rows row row0 (i + 1) 378 | in 379 | rows (Array.init (m + 1) (fun x -> x)) (Array.make (m + 1) 0) 1 380 | 381 | let suggest ?(dist = 2) candidates s = 382 | let add (min, acc) name = 383 | let d = edit_distance s name in 384 | if d = min then min, (name :: acc) else 385 | if d < min then d, [name] else 386 | min, acc 387 | in 388 | let d, suggs = List.fold_left add (max_int, []) candidates in 389 | if d <= dist (* suggest only if not too far *) then List.rev suggs else [] 390 | end 391 | 392 | (* UTF-8 decoding table. *) 393 | 394 | module Utf_8 = struct 395 | type case = 396 | | L1 | L2 | L3_E0 | L3_E1_EC_or_EE_EF | L3_ED | L4_F0 | L4_F1_F3 | L4_F4 | E 397 | 398 | let case = 399 | (* 400 | (* See https://tools.ietf.org/html/rfc3629#section-4 *) 401 | Printf.printf "[|"; 402 | for i = 0 to 255 do 403 | if i mod 16 = 0 then Printf.printf "\n"; 404 | if 0x00 <= i && i <= 0x7F then Printf.printf "L1; " else 405 | if 0xC2 <= i && i <= 0xDF then Printf.printf "L2; " else 406 | if 0xE0 = i then Printf.printf "L3_E0; " else 407 | if 0xE1 <= i && i <= 0xEC || 0xEE <= i && i <= 0xEF 408 | then Printf.printf "L3_E1_EC_or_EE_EF; " else 409 | if 0xED = i then Printf.printf "L3_ED;" else 410 | if 0xF0 = i then Printf.printf "L4_F0; " else 411 | if 0xF1 <= i && i <= 0xF3 then Printf.printf "L4_F1_F3; " else 412 | if 0xF4 = i then Printf.printf "L4_F4; " else 413 | Printf.printf "E; " 414 | done; 415 | Printf.printf "\n|]" 416 | *) 417 | [| 418 | L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; 419 | L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; 420 | L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; 421 | L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; 422 | L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; 423 | L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; 424 | L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; 425 | L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; 426 | E; E; E; E; E; E; E; E; E; E; E; E; E; E; E; E; 427 | E; E; E; E; E; E; E; E; E; E; E; E; E; E; E; E; 428 | E; E; E; E; E; E; E; E; E; E; E; E; E; E; E; E; 429 | E; E; E; E; E; E; E; E; E; E; E; E; E; E; E; E; 430 | E; E; L2; L2; L2; L2; L2; L2; L2; L2; L2; L2; L2; L2; L2; L2; 431 | L2; L2; L2; L2; L2; L2; L2; L2; L2; L2; L2; L2; L2; L2; L2; L2; 432 | L3_E0; L3_E1_EC_or_EE_EF; L3_E1_EC_or_EE_EF; L3_E1_EC_or_EE_EF; 433 | L3_E1_EC_or_EE_EF; L3_E1_EC_or_EE_EF; L3_E1_EC_or_EE_EF; L3_E1_EC_or_EE_EF; 434 | L3_E1_EC_or_EE_EF; L3_E1_EC_or_EE_EF; L3_E1_EC_or_EE_EF; L3_E1_EC_or_EE_EF; 435 | L3_E1_EC_or_EE_EF; L3_ED;L3_E1_EC_or_EE_EF; L3_E1_EC_or_EE_EF; 436 | L4_F0; L4_F1_F3; L4_F1_F3; L4_F1_F3; L4_F4; E; E; E; E; E; E; E; E; E; E; E; 437 | |] 438 | end 439 | 440 | (* UTF-8 text decoder *) 441 | 442 | module Textdec = struct 443 | type 'a fmt = Format.formatter -> 'a -> unit 444 | let pp_did_you_mean = Err_msg.pp_did_you_mean 445 | let pp_and_enum = Err_msg.pp_and_enum 446 | let pp_or_enum = Err_msg.pp_or_enum 447 | let pp_did_you_mean = Err_msg.pp_did_you_mean 448 | let pp_must_be = Err_msg.pp_must_be 449 | let pp_unknown = Err_msg.pp_unknown 450 | let pp_unknown' = Err_msg.pp_unknown' 451 | 452 | (* Decoders *) 453 | 454 | type t = 455 | { file : Textloc.fpath; i : string; tok : Buffer.t; 456 | mutable pos : int; mutable line : int; mutable line_pos : int; } 457 | 458 | let from_string ?(file = Textloc.file_none) i = 459 | { file; i; tok = Buffer.create 255; pos = 0; line = 1; line_pos = 0 } 460 | 461 | (* Location *) 462 | 463 | let file d = d.file 464 | let pos d = d.pos 465 | let line d = d.line, d.line_pos 466 | let loc d ~first_byte ~last_byte ~first_line ~last_line = 467 | Textloc.v ~file:d.file ~first_byte ~last_byte ~first_line ~last_line 468 | 469 | let loc_to_here d ~first_byte ~first_line = 470 | let last_line = (d.line, d.line_pos) in 471 | loc d ~first_byte ~last_byte:d.pos ~first_line ~last_line 472 | 473 | let loc_here d = 474 | loc_to_here d ~first_byte:d.pos ~first_line:(d.line, d.line_pos) 475 | 476 | (* Errors *) 477 | 478 | exception Err of Textloc.t * string 479 | 480 | let err loc msg = raise_notrace (Err (loc, msg)) 481 | let err_to_here d ~first_byte ~first_line fmt = 482 | Format.kasprintf (err (loc_to_here d ~first_byte ~first_line)) fmt 483 | 484 | let err_here d fmt = Format.kasprintf (err (loc_here d)) fmt 485 | let err_suggest = Err_msg.suggest 486 | 487 | (* Lexing *) 488 | 489 | let[@inline] incr_line d = match d.i.[d.pos] with (* assert (not (eoi d)) *) 490 | | '\r' -> d.line <- d.line + 1; d.line_pos <- d.pos + 1 491 | | '\n' -> 492 | (if d.pos = 0 || d.i.[d.pos - 1] <> '\r' then d.line <- d.line + 1); 493 | d.line_pos <- d.pos + 1; 494 | | _ -> () 495 | 496 | let[@inline] eoi d = d.pos >= String.length d.i 497 | let[@inline] byte d = if eoi d then 0xFFFF else Char.code d.i.[d.pos] 498 | let[@inline] accept_byte d = incr_line d; d.pos <- d.pos + 1 499 | 500 | let accept_utf_8 accept d = 501 | let err d = match byte d with 502 | | 0xFFFF -> err_here d "UTF-8 decoding error: unexpected end of input" 503 | | b -> err_here d "UTF-8 decoding error: byte %02x illegal here" b 504 | in 505 | let accept_tail d = if (byte d lsr 6 = 0b10) then accept d else err d in 506 | match byte d with 507 | | 0xFFFF -> err d 508 | | b -> 509 | (* If a subsequent [byte d] invocation is 0xFFFF we get to [err]. *) 510 | match Utf_8.case.(b) with 511 | | L1 -> accept d 512 | | L2 -> accept d; accept_tail d 513 | | L3_E0 -> 514 | accept d; 515 | if (byte d - 0xA0 < 0xBF - 0xA0) then accept d else err d; 516 | accept_tail d 517 | | L3_E1_EC_or_EE_EF -> 518 | accept d; accept_tail d; accept_tail d 519 | | L3_ED -> 520 | accept d; 521 | if (byte d - 0x80 < 0x9F - 0x80) then accept d else err d; 522 | accept_tail d 523 | | L4_F0 -> 524 | accept d; 525 | if (byte d - 0x90 < 0xBF - 0x90) then accept d else err d; 526 | accept_tail d; accept_tail d 527 | | L4_F1_F3 -> 528 | accept d; 529 | accept_tail d; accept_tail d; accept_tail d; 530 | | L4_F4 -> 531 | accept d; 532 | if (byte d - 0x80 < 0x8F - 0x80) then accept d else err d; 533 | | E -> err d 534 | 535 | let accept_uchar d = accept_utf_8 accept_byte d 536 | 537 | (* Tokenizer *) 538 | 539 | let[@inline] lex_clear d = Buffer.clear d.tok 540 | let[@inline] lex_pop d = let t = Buffer.contents d.tok in lex_clear d; t 541 | let[@inline] lex_add_byte d b = Buffer.add_char d.tok (Char.chr b) 542 | let[@inline] lex_add_bytes d s = Buffer.add_string d.tok s 543 | let[@inline] lex_add_char d c = Buffer.add_char d.tok c 544 | let[@inline] lex_add_uchar d u = Buffer.add_utf_8_uchar d.tok u 545 | let[@inline] lex_accept_byte d = 546 | Buffer.add_char d.tok d.i.[d.pos]; accept_byte d 547 | 548 | let[@inline] lex_accept_uchar d = accept_utf_8 lex_accept_byte d 549 | 550 | (* Insertions and substitutions *) 551 | 552 | let string_subrange ?(first = 0) ?last s = 553 | let max = String.length s - 1 in 554 | let last = match last with 555 | | None -> max 556 | | Some l when l > max -> max 557 | | Some l -> l 558 | in 559 | let first = if first < 0 then 0 else first in 560 | if first > last then "" else 561 | String.sub s first (last - first + 1) 562 | 563 | let string_replace ~start ~stop ~rep s = 564 | let len = String.length s in 565 | if stop < start || start < 0 || start > len || stop < 0 || stop > len 566 | then invalid_arg (Printf.sprintf "invalid start:%d stop:%d" start stop) else 567 | let b = String.sub s 0 start in 568 | let a = String.sub s stop (len - stop) in 569 | String.concat "" [b; rep; a] 570 | end 571 | -------------------------------------------------------------------------------- /src/serialkit_text.mli: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2019 The serialkit programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (** UTF-8 text based formats tools. 7 | 8 | Open this module to use it defines only module in your scope. *) 9 | 10 | (**/**) 11 | module Dict : sig 12 | type 'a key 13 | val key : unit -> 'a key 14 | type t 15 | val empty : t 16 | val mem : 'a key -> t -> bool 17 | val add : 'a key -> 'a -> t -> t 18 | val tag : unit key -> t -> t 19 | val remove : 'a key -> t -> t 20 | val find : 'a key -> t -> 'a option 21 | end 22 | (**/**) 23 | 24 | (** Text locations. 25 | 26 | A text location identifies a text span in a given UTF-8 encoded file 27 | by an inclusive range of absolute {{!Textloc.type-byte_pos}byte} positions 28 | and the {{!Textloc.type-line_pos}line positions} on which those occur. *) 29 | module Textloc : sig 30 | 31 | (** {1:fpath File paths} *) 32 | 33 | type fpath = string 34 | (** The type for file paths. *) 35 | 36 | val file_none : fpath 37 | (** [file_none] is ["-"]. A file path to use when there is none. *) 38 | 39 | (** {1:pos Positions} *) 40 | 41 | (** {2:byte_pos Byte positions} *) 42 | 43 | type byte_pos = int 44 | (** The type for zero-based, absolute, byte positions in text. If 45 | the text has [n] bytes, [0] is the first position and [n-1] is 46 | the last position. *) 47 | 48 | val byte_pos_none : byte_pos 49 | (** [byte_pos_none] is [-1]. A position to use when there is none. *) 50 | 51 | (** {2:lines Lines} *) 52 | 53 | type line_num = int 54 | (** The type for one-based, line numbers in the text. Lines 55 | increment after a {e newline} which is either a line feed ['\n'] 56 | (U+000A), a carriage return ['\r'] (U+000D) or a carriage return 57 | and a line feed ["\r\n"] (). *) 58 | 59 | val line_num_none : line_num 60 | (** [line_num_none] is [-1]. A line number to use when there is none. *) 61 | 62 | (** {2:line_pos Line positions} *) 63 | 64 | type line_pos = line_num * byte_pos 65 | (** The type for line positions. This identifies a line by its line 66 | number and the absolute byte position following its newline 67 | (or the start of text for the first line). That byte position: 68 | {ul 69 | {- Indexes the first byte of text of the line if the line is non-empty.} 70 | {- Indexes the first byte of the next newline if the line is empty.} 71 | {- Is out of bounds and equal to the text's length for a last empty 72 | line (this includes when the text is empty).}} *) 73 | 74 | val line_pos_first : line_pos 75 | (** [line_pos_first] is [1, 0]. Note that this is the only line position 76 | of the empty text. *) 77 | 78 | val line_pos_none : line_pos 79 | (** [line_pos_none] is [(line_none, pos_none)]. *) 80 | 81 | (** {1:tloc Text locations} *) 82 | 83 | type t 84 | (** The type for text locations. A text location identifies a text 85 | span in an UTF-8 encoded file by an inclusive range of absolute 86 | {{!type-byte_pos}byte positions} and the {{!type-line_pos}line positions} 87 | on which they occur. 88 | 89 | If the first byte equals the last byte the range contains 90 | exactly that byte. If the first byte is greater than the last 91 | byte this represents an insertion point before the first byte. In 92 | this case information about the last position should be ignored: 93 | it can contain anything. *) 94 | 95 | val none : t 96 | (** [none] is a position to use when there is none. *) 97 | 98 | val v : 99 | file:fpath -> first_byte:byte_pos -> last_byte:byte_pos -> 100 | first_line:line_pos -> last_line:line_pos -> t 101 | (** [v ~file ~first_byte ~last_byte ~first_line ~last_line] is a text 102 | location with the given arguments, see corresponding accessors for 103 | the semantics. If you don't have a file use {!file_none}. *) 104 | 105 | val file : t -> fpath 106 | (** [file l] is [l]'s file. *) 107 | 108 | val first_byte : t -> byte_pos 109 | (** [first_byte l] is [l]'s first byte. Irrelevant if {!is_none} is 110 | [true]. *) 111 | 112 | val last_byte : t -> byte_pos 113 | (** [last_byte l] is [l]'s last byte. Irrelevant if {!is_none} or {!is_empty} 114 | is [true]. *) 115 | 116 | val first_line : t -> line_pos 117 | (** [first_line l] is the line position on which [first_byte l] lies. 118 | Irrelevant if {!is_none} is [true].*) 119 | 120 | val last_line : t -> line_pos 121 | (** [last_line l] is the line position on which [last_byte l] lies. 122 | Irrelevant if {!is_none} or {!is_empty} is [true].*) 123 | 124 | (** {2:preds Predicates and comparisons} *) 125 | 126 | val is_none : t -> bool 127 | (** [is_none t] is [true] iff [first_byte < 0]. *) 128 | 129 | val is_empty : t -> bool 130 | (** [is_empty t] is [true] iff [first_byte t > last_byte t]. *) 131 | 132 | val equal : t -> t -> bool 133 | (** [equal t0 t1] is [true] iff [t0] and [t1] are equal. This checks 134 | that {!file}, {!first_byte} and {!last_byte} are equal. Line information 135 | is ignored. *) 136 | 137 | val compare : t -> t -> int 138 | (** [compare t0 t1] orders [t0] and [t1]. The order is compatible 139 | with {!equal}. Comparison starts with {!file}, follows with {!first_byte} 140 | and ends, if needed, with {!last_byte}. Line information is ignored. *) 141 | 142 | (** {2:shrink_and_stretch Shrink and stretch} *) 143 | 144 | val set_first : t -> first_byte:byte_pos -> first_line:line_pos -> t 145 | (** [set_first l ~first_byte ~first_line] sets the the first position of 146 | [l] to given values. *) 147 | 148 | val set_last : t -> last_byte:byte_pos -> last_line:line_pos -> t 149 | (** [set_last l ~last_byte ~last_line] sets the last position of [l] 150 | to given values. *) 151 | 152 | val to_first : t -> t 153 | (** [to_first l] has both first and last positions set to [l]'s first 154 | position. The range spans {!first_byte}. See also {!before}. *) 155 | 156 | val to_last : t -> t 157 | (** [to_last l] has both first and last positions set to [l]'s last 158 | position. The range spans {!last_byte}. See also {!after}. *) 159 | 160 | val before : t -> t 161 | (** [before t] is the {{!is_empty}empty} text location starting at 162 | {!first_byte}. *) 163 | 164 | val after : t -> t 165 | (** [after t] is the empty {{!is_empty}empty} location starting at 166 | [last_byte t + 1]; note that at the end of input this may be an 167 | invalid byte {e index}. The {!first_line} and {!last_line} of the 168 | result is [last_line t]. *) 169 | 170 | val span : t -> t -> t 171 | (** [span l0 l1] is the span from the smallest byte position of [l0] and 172 | [l1] to the largest byte position of [l0] and [l1]. The file path is 173 | taken from the greatest byte position. *) 174 | 175 | val reloc : first:t -> last:t -> t 176 | (** [reloc ~first ~last] uses the first position of [first], the 177 | last position of [last] and the file of [last]. *) 178 | 179 | (** {2:fmt Formatting} *) 180 | 181 | val pp_ocaml : Format.formatter -> t -> unit 182 | (** [pp_ocaml] formats text locations like the OCaml compiler. *) 183 | 184 | val pp_gnu : Format.formatter -> t -> unit 185 | (** [pp_gnu] formats text locations according to the 186 | {{:https://www.gnu.org/prep/standards/standards.html#Errors}GNU 187 | convention}. *) 188 | 189 | val pp : Format.formatter -> t -> unit 190 | (** [pp] is {!pp_gnu}. *) 191 | 192 | val pp_dump : Format.formatter -> t -> unit 193 | (** [pp_dump] formats raw data for debugging. *) 194 | end 195 | 196 | (** Text node metadata. 197 | 198 | Holds text locations and custom, client-defined metadata. *) 199 | module Meta : sig 200 | 201 | type id = int 202 | (** The type for non-negative metadata identifiers. *) 203 | 204 | type t 205 | (** The type for abstract syntax tree node metadata. *) 206 | 207 | val none : t 208 | (** [none] is metadata for when there is none, its {!textloc} is 209 | {!Textloc.none}. *) 210 | 211 | val make : ?textloc:Textloc.t -> unit -> t 212 | (** [make textloc] is metadata with text location [textloc] (defaults 213 | to {!Textloc.none}) and a fresh identifier (see {!val-id}). *) 214 | 215 | val id : t -> id 216 | (** [id m] is an identifier for the metadata. Depending on how you 217 | process the abstract syntax tree this may become non-unique but 218 | the metadata values in an abstract syntax tree returned by 219 | {!Doc.of_string} with [locs:true] have distinct identifiers. *) 220 | 221 | val textloc : t -> Textloc.t 222 | (** [textloc m] is the source location of the syntactic construct [m] 223 | is attached to. *) 224 | 225 | val with_textloc : keep_id:bool -> t -> Textloc.t -> t 226 | (** [with_textloc ~keep_id m textloc] is metadata [m] with text location 227 | [textloc] and a fresh id, unless [keep_id] is [true]. *) 228 | 229 | (** {1:preds Predicates and comparisons} *) 230 | 231 | val equal : t -> t -> bool 232 | (** [equal m0 m1] is [true] if [m0] and [m1] have the same {!val-id}. 233 | Note that they may have different {{!custom}metadata.} *) 234 | 235 | val compare : t -> t -> int 236 | (** [compare m0 m1] is a total order on metadata {!val-id}s compatible with 237 | {!equal}. *) 238 | 239 | val is_none : t -> bool 240 | (** [is_none m] is [equal none m]. *) 241 | 242 | (** {1:custom Custom metadata} 243 | 244 | {b Warning.} Operating on custom metadata never changes 245 | {!val-id}. It is possible for two meta values to have the same 246 | id and different metadata. *) 247 | 248 | type 'a key 249 | (** The type for custom metadata keys. *) 250 | 251 | val key : unit -> 'a key 252 | (** [key ()] is a new metadata key. *) 253 | 254 | val mem : 'a key -> t -> bool 255 | (** [mem k m] is [true] iff [k] is bound in [m]. *) 256 | 257 | val add : 'a key -> 'a -> t -> t 258 | (** [add k v m] is [m] with key [k] bound to [v]. *) 259 | 260 | val tag : unit key -> t -> t 261 | (** [tag k m] is [add k () m]. *) 262 | 263 | val remove : 'a key -> t -> t 264 | (** [remove k m] is [m] with key [k] unbound in [v]. *) 265 | 266 | val find : 'a key -> t -> 'a option 267 | (** [find k m] the value of [k] in [m], if any. *) 268 | end 269 | 270 | (** UTF-8 text decoder. 271 | 272 | A decoder inputs {e valid} UTF-8 text, maintains {{!Textloc}text locations} 273 | according to advances in the input and has a {{!Textdec.lex}lexeme buffer} 274 | for lexing. *) 275 | module Textdec : sig 276 | 277 | (** {1:dec Decoder} *) 278 | 279 | type t 280 | (** The type for UTF-8 text decoders. *) 281 | 282 | val from_string : ?file:Textloc.fpath -> string -> t 283 | (** [from_string ~file s] decodes [s] using [file] (defaults to 284 | {!Tloc.no_file}) for text location. *) 285 | 286 | (** {1:loc Locations} *) 287 | 288 | val file : t -> Textloc.fpath 289 | (** [file d] is the input file. *) 290 | 291 | val pos : t -> Textloc.byte_pos 292 | (** [pos d] is the current decoding byte position. *) 293 | 294 | val line : t -> Textloc.line_pos 295 | (** [line d] is the current decoding line position. Lines increment as 296 | described {{!Tloc.line}here}. *) 297 | 298 | val loc : 299 | t -> first_byte:Textloc.byte_pos -> last_byte:Textloc.byte_pos -> 300 | first_line:Textloc.line_pos -> last_line:Textloc.line_pos -> Textloc.t 301 | (** [loc d ~first_byte ~last_bytex ~first_line ~last_line] is 302 | {!Tloc.v} using [file d] for the file. *) 303 | 304 | val loc_to_here : 305 | t -> first_byte:Textloc.byte_pos -> first_line:Textloc.line_pos -> Textloc.t 306 | (** [loc_to_here d ~first_byte ~first_line] is a location that starts at 307 | [~first_byte] and [~first_line] and ends at the current decoding 308 | position as determined by {!pos} and {!line}. *) 309 | 310 | val loc_here : t -> Textloc.t 311 | (** [loc_here d] is like {!loc_to_here} with the start position 312 | at the current decoding position as determined by 313 | {!pos} and {!line}. *) 314 | 315 | (** {1:err Errors} *) 316 | 317 | exception Err of Textloc.t * string 318 | (** The exception for errors. A location and an english error message *) 319 | 320 | val err : Textloc.t -> string -> 'b 321 | (** [err loc msg] raises [Err (loc, msg)] with no trace. *) 322 | 323 | val err_to_here : 324 | t -> first_byte:Textloc.byte_pos -> first_line:Textloc.line_pos -> 325 | ('a, Format.formatter, unit, 'b) format4 -> 'a 326 | (** [err_to_here d ~first_byte ~first_line fmt ...] is 327 | [err d (loc_to_here d ~first_byte ~first_line) fmt ...] *) 328 | 329 | val err_here : t -> ('a, Format.formatter, unit, 'b) format4 -> 'a 330 | (** [err_here d] is [err d (loc_here d) fmt ...]. *) 331 | 332 | (** {2:err_msg Error message helpers} *) 333 | 334 | val err_suggest : ?dist:int -> string list -> string -> string list 335 | (** [err_suggest ~dist candidates s] are the elements of 336 | [candidates] whose edit distance is the smallest to [s] and at 337 | most at a distance of [dist] of [s] (defaults to [2]). If 338 | multiple results are returned the order of [candidates] is 339 | preserved. *) 340 | 341 | type 'a fmt = Format.formatter -> 'a -> unit 342 | (** The type for formatters. *) 343 | 344 | val pp_and_enum : ?empty:unit fmt -> 'a fmt -> 'a list fmt 345 | (** [and_enum ~empty pp_v ppf l] formats [l] according to its length. 346 | {ul 347 | {- [0], formats {!empty} (defaults to {!nop}).} 348 | {- [1], formats the element with [pp_v].} 349 | {- [2], formats ["%a and %a"] with the list elements} 350 | {- [n], formats ["%a, ... and %a"] with the list elements}} *) 351 | 352 | val pp_or_enum : ?empty:unit fmt -> 'a fmt -> 'a list fmt 353 | (** [or_enum] is like {!and_enum} but uses "or" instead of "and". *) 354 | 355 | val pp_did_you_mean : 'a fmt -> 'a list fmt 356 | (** [did_you_mean pp_v] formats ["Did you mean %a ?"] with {!or_enum} 357 | if the list is non-empty and {!nop} otherwise. *) 358 | 359 | val pp_must_be : 'a fmt -> 'a list fmt 360 | (** [must_be pp_v] formats ["Must be %a."] with {!or_enum} if the list 361 | is non-empty and {!nop} otherwise. *) 362 | 363 | val pp_unknown : kind:unit fmt -> 'a fmt -> 'a fmt 364 | (** [unknown ~kind pp_v] formats ["Unknown %a %a." kind () pp_v]. *) 365 | 366 | val pp_unknown' : 367 | kind:unit fmt -> 'a fmt -> hint:('a fmt -> 'a list fmt) -> 368 | ('a * 'a list) fmt 369 | (** [unknown ~kind pp_v ~hint (v, hints)] formats {!unknown} followed 370 | by a space and [hint pp_v hints] if [hints] is non-empty. *) 371 | 372 | (** {1:dec Decoding} *) 373 | 374 | val eoi : t -> bool 375 | (** [eoi d] is [true] iff the decoder is at the end of input. *) 376 | 377 | val byte : t -> int 378 | (** [byte d] is the byte at current position or [0xFFFF] if 379 | [eoi d] is [true]. *) 380 | 381 | val accept_uchar : t -> unit 382 | (** [accept_uchar d] accepts an UTF-8 encoded character starting at 383 | the current position and moves to the byte location after it. Raises 384 | {!Err} in case of UTF-8 decoding error. *) 385 | 386 | val accept_byte : t -> unit 387 | (** [accept_byte d] accepts the byte at the current position and 388 | moves to the byte location after it. {b Warning.} Faster than 389 | {!accept_uchar} but the client needs to make sure it's not 390 | accepting invalid UTF-8 data, i.e. that [byte d] is an US-ASCII 391 | encoded character (i.e. [<= 0x7F]). *) 392 | 393 | (** {1:lex Lexeme buffer} *) 394 | 395 | val lex_clear : t -> unit 396 | (** [lex_clear d] sets the lexeme to the empty string. *) 397 | 398 | val lex_pop : t -> string 399 | (** [lex_pop d] is the lexeme and {!lex_clear}s it. *) 400 | 401 | val lex_add_byte : t -> int -> unit 402 | (** [lex_add_byte d b] adds byte [b] to the lexen. *) 403 | 404 | val lex_add_bytes : t -> string -> unit 405 | (** [lex_add_byte d s] adds bytes [s] to the lexen. *) 406 | 407 | val lex_add_char : t -> char -> unit 408 | (** [lex_add_char d c] adds character [c] to the lexen. *) 409 | 410 | val lex_add_uchar : t -> Uchar.t -> unit 411 | (** [lex_add_uchar t u] adds the UTF-8 encoding of character [u] 412 | to the lexen. *) 413 | 414 | val lex_accept_uchar : t -> unit 415 | (** [lex_accept_uchar d] is like {!accept_uchar} but also adds the 416 | UTF-8 byte sequence to the lexeme. *) 417 | 418 | val lex_accept_byte : t -> unit 419 | (** [lex_accept_byte d] is like {!accept_byte} but also adds the 420 | byte to the lexeme. {b Warning.} {!accept_byte}'s warning 421 | applies. *) 422 | 423 | (** {1:text String substitutions and insertions} 424 | 425 | Strictly speaking this doesn't belong here but here you go. *) 426 | 427 | val string_subrange : ?first:int -> ?last:int -> string -> string 428 | (** [string_subrange ~first ~last s] are the consecutive bytes of [s] 429 | whose indices exist in the range \[[first];[last]\]. 430 | 431 | [first] defaults to [0] and last to [String.length s - 1]. 432 | 433 | Note that both [first] and [last] can be any integer. If 434 | [first > last] the interval is empty and the empty string is 435 | returned. *) 436 | 437 | val string_replace : start:int -> stop:int -> rep:string -> string -> string 438 | (** [string_replace ~start ~stop ~rep s] replaces the index range 439 | \[[start];stop-1\] of [s] with [rep] as follows. If [start = stop] 440 | the [rep] is inserted before [start]. [start] and [stop] must be 441 | in range \[[0];[String.length s]\] and [start <= stop] or 442 | [Invalid_argument] is raised. *) 443 | end 444 | -------------------------------------------------------------------------------- /src/serialkit_toml.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2021 The serialkit programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | open Serialkit_text 7 | 8 | let max_month_day = (* max day number in a given year's month. *) 9 | let is_leap_year y = (y mod 4 = 0) && (y mod 100 <> 0 || y mod 400 = 0) in 10 | let mlen = [|31; 28 (* or not *); 31; 30; 31; 30; 31; 31; 30; 31; 30; 31|] in 11 | fun y m -> if (m = 2 && is_leap_year y) then 29 else mlen.(m - 1) 12 | 13 | let is_date_valid (y, m, d) = 14 | 0 <= y && y <= 9999 && 15 | 1 <= m && m <= 12 && 16 | 1 <= d && d <= max_month_day y m 17 | 18 | let is_time_valid (hh, mm, ss) = 19 | 0 <= hh && hh <= 23 && 20 | 0 <= mm && mm <= 59 && 21 | 0 <= ss && ss <= 60 22 | 23 | module Toml = struct 24 | let file_ext = ".toml" 25 | let mime_type = "application/toml" 26 | let version = "1.0.0" 27 | 28 | module Textloc = Serialkit_text.Textloc 29 | module Meta = Serialkit_text.Meta 30 | 31 | type 'a node = 'a * Meta.t 32 | 33 | type tz_offset_s = int 34 | type date = int * int * int 35 | 36 | type time = (int * int * int) * float option * tz_offset_s option 37 | type date_time = date option * time option 38 | type table = (string node * value) node list 39 | 40 | and value = 41 | [ `Boolean of bool node 42 | | `Integer of int64 node 43 | | `Float of float node 44 | | `String of string node 45 | | `Array of value list node 46 | | `Date_time of date_time node 47 | | `Table of table node ] 48 | 49 | type t = table node 50 | 51 | (* Formatting *) 52 | 53 | type 'a fmt = Format.formatter -> 'a -> unit 54 | let pf = Format.fprintf 55 | let pp_char = Format.pp_print_char 56 | let pp_str = Format.pp_print_string 57 | let pp_sp = Format.pp_print_space 58 | let pp_date ppf (y, m, d) = pf ppf "%04d-%02d-%02d" y m d 59 | let pp_time ppf ((hh, mm, ss), frac, tz) = failwith "TODO" 60 | let pp_date_time ppf (date, time) = match date, time with 61 | | None, None -> assert false 62 | | Some d, None -> pp_date ppf d 63 | | None, Some t -> pp_time ppf t 64 | | Some d, Some t -> pp_date ppf d; pp_char ppf ' '; pp_time ppf t 65 | 66 | let rec pp_value : value fmt = fun ppf -> function 67 | | `Boolean (b, _) -> Format.pp_print_bool ppf b 68 | | `Integer (i, _) -> pf ppf "%Ld" i 69 | | `Float (f, _) -> pf ppf "%.16g" f 70 | | `String (s, _) -> pf ppf "%S" s 71 | | `Array (l, _) -> 72 | let pp_sep ppf () = pp_char ppf ','; pp_sp ppf () in 73 | Format.pp_open_box ppf 2; 74 | Format.pp_print_list ~pp_sep pp_value ppf l; 75 | Format.pp_close_box ppf () 76 | | `Date_time (dt, _) -> pp_date_time ppf dt 77 | | `Table (t, _) -> 78 | let pp_binding ppf (((k, _), v), _) = 79 | pf ppf "@[%s =@ %a@]" k pp_value v 80 | in 81 | Format.pp_print_list ~pp_sep:Format.pp_print_cut pp_binding ppf t 82 | 83 | let pp ppf v = pp_value ppf (`Table v) 84 | let pp_layout = (* TODO *) pp 85 | 86 | (* Decode errors *) 87 | 88 | module Error = struct 89 | type expected = 90 | | Basic_char 91 | | Bin_digit 92 | | Char of char 93 | | Comment 94 | | Comment_char 95 | | Date_time 96 | | Dec_digit 97 | | Eoi 98 | | Equal_char 99 | | False 100 | | Float 101 | | Float_exp 102 | | Float_frac 103 | | Hex_digit 104 | | Inf 105 | | Integer 106 | | Key 107 | | Keyval 108 | | Lf 109 | | Literal_char 110 | | Nan 111 | | Newline 112 | | Oct_digit 113 | | Table 114 | | Tz_offset 115 | | True 116 | | Value 117 | 118 | let pp_expected ppf = function 119 | | Basic_char -> pp_str ppf "a basic string character" 120 | | Bin_digit -> pp_str ppf "a binary digit" 121 | | Char c -> pp_str ppf "a '%C' character" 122 | | Comment -> pp_str ppf "a comment" 123 | | Comment_char -> pp_str ppf "a comment character" 124 | | Date_time -> pp_str ppf "a date-time" 125 | | Dec_digit -> pp_str ppf "a decimal digit" 126 | | Eoi -> pp_str ppf "end of input" 127 | | Equal_char -> pp_str ppf "an equal character ('=')" 128 | | Hex_digit -> pp_str ppf "an hexadecimal digit" 129 | | False -> pp_str ppf "'true'" 130 | | Float -> pp_str ppf "float" 131 | | Float_exp -> pp_str ppf "float exponent" 132 | | Float_frac -> pp_str ppf "float fractional part" 133 | | Inf -> pp_str ppf "'inf'" 134 | | Integer -> pp_str ppf "integer" 135 | | Key -> pp_str ppf "a key" 136 | | Keyval -> pp_str ppf "a key/value pair" 137 | | Lf -> pp_str ppf "a line feed (\\n)" 138 | | Literal_char -> pp_str ppf "a string literal character" 139 | | Nan -> pp_str ppf "'nan'" 140 | | Newline -> pp_str ppf "a newline ('\\n')" 141 | | Oct_digit -> pp_str ppf "an octal digit" 142 | | Table -> pp_str ppf "a table" 143 | | Tz_offset -> pp_str ppf "a timezone offset" 144 | | True -> pp_str ppf "'true'" 145 | | Value -> pp_str ppf "a value" 146 | 147 | type unclosed = 148 | | Basic_string | Basic_multiline_string | Literal_string 149 | | Literal_multiline_string 150 | 151 | let pp_unclosed ppf = function 152 | | Basic_string -> pp_str ppf "Unclosed basic string" 153 | | Basic_multiline_string -> pp_str ppf "Unclosed multiline basic string" 154 | | Literal_string -> pp_str ppf "Unclosed literal string" 155 | | Literal_multiline_string -> pp_str ppf "Unclosed multiline literal string" 156 | 157 | type escape = 158 | | Not_escape_char of char 159 | | Not_uchar_value of int 160 | | Invalid_uchar_escape 161 | 162 | let pp_escape ppf = function 163 | | Not_escape_char c -> pf ppf "%C is not a valid escape character" c 164 | | Not_uchar_value i -> pf ppf "%04X is not a Unicode character" i 165 | | Invalid_uchar_escape -> pf ppf "Invalid Unicode character escape" 166 | 167 | type invalid = 168 | | Leading_zeros 169 | | Int64_overflow 170 | | Date of date 171 | | Hour of int 172 | | Minute of int 173 | | Second of int 174 | 175 | let pp_invalid ppf = function 176 | | Leading_zeros -> pf ppf "Leading zeros are not allowed" 177 | | Int64_overflow -> pf ppf "Integer overflows 64-bit integers" 178 | | Date (y, m, d) -> pf ppf "%04d-%02d-%02d is not a valid date" y m d 179 | | Hour i -> pf ppf "%02d is not a valid hour (00-23)" i 180 | | Minute i -> pf ppf "%02d is not a valid minute (00-59)" i 181 | | Second i -> pf ppf "%02d is not a valid second (00-60)" i 182 | 183 | type kind = 184 | | Escape of escape 185 | | Expected of expected list 186 | | Invalid of invalid 187 | | Unclosed of unclosed 188 | 189 | let pp_kind () ppf = function 190 | | Escape esc -> pp_escape ppf esc 191 | | Expected exps -> Textdec.pp_or_enum pp_expected ppf exps 192 | | Invalid inv -> pp_invalid ppf inv 193 | | Unclosed uncl -> pp_unclosed ppf uncl 194 | 195 | type t = kind * Textloc.t 196 | let pp_prefix ppf () = Format.pp_print_string ppf "Error: " 197 | let pp 198 | ?(pp_loc = Textloc.pp) ?(pp_kind = pp_kind ()) 199 | ?(pp_prefix = pp_prefix) () ppf (k, l) 200 | = 201 | pf ppf "@[%a:@,%a%a@]" pp_loc l pp_prefix () pp_kind k 202 | 203 | let to_string ?(pp = pp ()) = function 204 | | Ok _ as v -> v | Error e -> Error (Format.asprintf "%a" pp e) 205 | end 206 | 207 | (* Decoder *) 208 | 209 | type decoder = 210 | { file : Textloc.fpath; 211 | s : string; 212 | nolocs : bool; 213 | nolayout : bool; 214 | max_pos : int; (* String.length i - 1 *) 215 | mutable pos : int; (* current byte position. *) 216 | mutable line_pos : Textloc.line_pos; (* line position of [pos] *) 217 | buf : Buffer.t; } 218 | 219 | let decoder ?(layout = false) ?(locs = true) ?(file = Textloc.file_none) s = 220 | let nolocs = not locs and nolayout = not layout in 221 | 222 | { file; s; nolocs; nolayout; max_pos = String.length s - 1; 223 | pos = 0; line_pos = Textloc.line_pos_first; 224 | buf = Buffer.create 255 } 225 | 226 | let tloc_line_span d ~first ~last = 227 | Textloc.v ~file:d.file ~first_byte:first ~last_byte:last 228 | ~first_line:d.line_pos ~last_line:d.line_pos 229 | 230 | (* Decoding errors *) 231 | 232 | exception Err of Error.t 233 | 234 | let dec_error_escape d ~first ~last esc = 235 | let tloc = tloc_line_span d ~first ~last in 236 | raise (Err (Escape esc, tloc)) 237 | 238 | let dec_error_expected d ~first ?(last = first) exps = 239 | let tloc = tloc_line_span d ~first ~last in 240 | raise (Err (Expected exps, tloc)) 241 | 242 | let dec_error_invalid d ~first ?(last = first) inv = 243 | let tloc = tloc_line_span d ~first ~last in 244 | raise (Err (Invalid inv, tloc)) 245 | 246 | let dec_error_unclosed d ~first uncl = 247 | let tloc = tloc_line_span d ~first ~last:d.max_pos in 248 | raise (Err (Unclosed uncl, tloc)) 249 | 250 | let dec_error_eoi d ~first exps = 251 | let tloc = tloc_line_span d ~first ~last:d.max_pos in 252 | raise (Err (Expected exps, tloc)) 253 | 254 | (* AST *) 255 | 256 | (* The type structure below closely mirrors the TOML ABNF. *) 257 | 258 | type comment = Textloc.t 259 | (* if non empty includes both the # and the newline *) 260 | type ws = Textloc.t (* may include newlines if there's no comment or data. *) 261 | type ws_comment_nl = ws * comment (* either has the newline, or none if eoi *) 262 | type basic_string = string 263 | type literal_string = string 264 | 265 | type simple_key = [`Bare | `Quoted_basic | `Quoted_literal ] * string 266 | type key = (ws * simple_key * ws) list 267 | 268 | type keyval = key * ws * tval 269 | and tval = 270 | | String 271 | | Boolean of bool 272 | | Array of ws_comment_nl * tval 273 | | Inline_table of ws * (keyval * ws) list 274 | | Date_time of date_time 275 | | Float of float 276 | | Integer of int64 277 | 278 | type expression = 279 | | Whitespace of ws_comment_nl 280 | | Keyval' of keyval * ws_comment_nl 281 | | Table' of ws * [`Std | `Array] * key * ws_comment_nl 282 | 283 | type toml = expression list 284 | 285 | (* Decoding functions *) 286 | 287 | let dec_ws d = (* ws *) 288 | let rec loop s max i = 289 | if i > max then i else match s.[i] with 290 | | ' ' | '\t' -> loop s max (i + 1) 291 | | _ -> i 292 | in 293 | let next = loop d.s d.max_pos d.pos in 294 | let ws = tloc_line_span d ~first:d.pos ~last:(next - 1) in 295 | d.pos <- next; 296 | ws 297 | 298 | let dec_newline d ws = (* newline, first newline checked, added to ws *) 299 | let next = 300 | if d.s.[d.pos] = '\n' then d.pos + 1 else (* '\r' *) 301 | let next = d.pos + 1 in 302 | if next > d.max_pos || d.s.[next] <> '\n' 303 | then dec_error_expected d ~first:next [Lf] 304 | else next + 1 305 | in 306 | let line_pos = fst d.line_pos + 1, next in 307 | let ws = tloc_line_span d ~first:(Textloc.first_byte ws) ~last:(next - 1) in 308 | d.pos <- next; 309 | d.line_pos <- line_pos; 310 | ws 311 | 312 | let dec_comment_newline d = (* [comment] newline, # checked *) 313 | let rec loop s max i = 314 | if i > max then i else match s.[i] with 315 | | '\n' -> i + 1 316 | | '\r' -> 317 | let i = i + 1 in 318 | if i > d.max_pos || d.s.[i] <> '\n' 319 | then dec_error_expected ~first:i d [Lf] 320 | else i + 1 321 | | '\x09' | '\x20' .. '\x7E' -> loop s max (i + 1) 322 | | '\x00' .. '\x08' | '\x0A' .. '\x1F' | '\x7F' -> 323 | dec_error_expected d ~first:i [Comment_char] 324 | | _ -> 325 | let udec = String.get_utf_8_uchar s i in 326 | match Uchar.utf_decode_is_valid udec with 327 | | true -> loop s max (i + Uchar.utf_decode_length udec) 328 | | false -> dec_error_expected d ~first:i [Comment_char] 329 | in 330 | let next = loop d.s d.max_pos d.pos in 331 | let line_pos = 332 | if next > d.max_pos then d.line_pos else 333 | fst d.line_pos + 1, next 334 | in 335 | let comment = tloc_line_span d ~first:d.pos ~last:(next - 1) in 336 | d.pos <- next; 337 | d.line_pos <- line_pos; 338 | comment 339 | 340 | let dec_ws_comment_newline d = 341 | if d.pos > d.max_pos then Textloc.none, Textloc.none else 342 | let ws = dec_ws d in 343 | if d.pos > d.max_pos then ws, Textloc.none else 344 | match d.s.[d.pos] with 345 | | '#' -> ws, dec_comment_newline d 346 | | '\n' | '\r' -> dec_newline d ws, Textloc.none 347 | | _ -> dec_error_expected d ~first:d.pos [Comment; Newline] 348 | 349 | let dec_table d ws = failwith "TODO" 350 | 351 | let dec_literal_string d = (* apostrophe checked. *) 352 | let rec loop s max i = 353 | if i > max then dec_error_unclosed d ~first:d.pos Literal_string else 354 | match s.[i] with 355 | | '\'' -> 356 | let first = d.pos + 1 and last = i - 1 in 357 | let s = String.sub s first (last - first + 1) in 358 | d.pos <- i + 1; 359 | s 360 | | '\x09' | '\x20' .. '\x26' | '\x28' .. '\x7E' -> loop s max (i + 1) 361 | | '\x00' .. '\x08' | '\x0A' .. '\x1F' | '\x7F' -> 362 | dec_error_expected d ~first:i [Literal_char] 363 | | _ -> 364 | let udec = String.get_utf_8_uchar s i in 365 | match Uchar.utf_decode_is_valid udec with 366 | | true -> loop s max (i + Uchar.utf_decode_length udec) 367 | | false -> dec_error_expected d ~first:i [Literal_char] 368 | in 369 | loop d.s d.max_pos (d.pos + 1) 370 | 371 | let rec dec_uchar_escape d ~first count u i = match count with 372 | | 0 -> 373 | if Uchar.is_valid u then Uchar.unsafe_of_int u, i else 374 | dec_error_escape d ~first ~last:(i - 1) (Not_uchar_value u) 375 | | count when i > d.max_pos -> 376 | dec_error_escape d ~first ~last:(i - 1) Invalid_uchar_escape 377 | | count -> 378 | match d.s.[i] with 379 | | '0' .. '9' as c -> 380 | let u = u * 16 + (Char.code c - 0x30) in 381 | dec_uchar_escape d ~first (count - 1) u (i + 1) 382 | | 'A' .. 'F' as c -> 383 | let u = u * 16 + (Char.code c - 0x37) in 384 | dec_uchar_escape d ~first (count - 1) u (i + 1) 385 | | c -> 386 | dec_error_escape d ~first ~last:(i - 1) Invalid_uchar_escape 387 | 388 | let dec_basic_string_with_escapes d first_esc = 389 | let flush b s max start i = 390 | if start <= max then Buffer.add_substring b s start (i - start) 391 | in 392 | let rec loop b s max start i = 393 | if i > max then dec_error_unclosed d ~first:d.pos Basic_string else 394 | match s.[i] with 395 | | '\"' -> 396 | flush b s max start i; d.pos <- i + 1; Buffer.contents b 397 | | '\x09' | '\x20' .. '\x21' | '\x23' .. '\x5B' | '\x5D' .. '\x7E' -> 398 | loop b s max start (i + 1) 399 | | '\x00' .. '\x08' | '\x0A' .. '\x1F' | '\x7F' -> 400 | dec_error_expected d ~first:i [Basic_char] 401 | | '\x5C' -> 402 | flush b s max start i; 403 | let i = i + 1 in 404 | if i > max then dec_error_unclosed d ~first:d.pos Basic_string else 405 | begin match s.[i] with 406 | | '\x22' | '\x5C' | '\x62' | '\x66' | '\x6E' | '\x72' | '\x74' as c -> 407 | let next = i + 1 in 408 | Buffer.add_char b c; loop b s max next next 409 | | '\x75' -> 410 | let u, next = dec_uchar_escape d ~first:(i - 1) 4 0 i in 411 | Buffer.add_utf_8_uchar b u; loop b s max next next 412 | | '\x55' -> 413 | let u, next = dec_uchar_escape d ~first:(i - 1) 8 0 i in 414 | Buffer.add_utf_8_uchar b u; loop b s max next next 415 | | c -> 416 | let first = i - 1 and last = i + 1 in 417 | dec_error_escape d ~first ~last (Not_escape_char c) 418 | end 419 | | _ -> 420 | let udec = String.get_utf_8_uchar s i in 421 | match Uchar.utf_decode_is_valid udec with 422 | | true -> loop b s max start (i + Uchar.utf_decode_length udec) 423 | | false -> dec_error_expected d ~first:i [Literal_char] 424 | in 425 | Buffer.reset d.buf; 426 | flush d.buf d.s d.max_pos d.pos first_esc; 427 | loop d.buf d.s d.max_pos first_esc first_esc 428 | 429 | let dec_basic_string d = (* quote checked *) 430 | let rec loop s max i = 431 | if i > max then dec_error_unclosed d ~first:d.pos Basic_string else 432 | match s.[i] with 433 | | '\"' -> 434 | let first = d.pos + 1 and last = i - 1 in 435 | let s = String.sub s first (last - first + 1) in 436 | d.pos <- i + 1; 437 | s 438 | | '\x09' | '\x20' .. '\x21' | '\x23' .. '\x5B' | '\x5D' .. '\x7E' -> 439 | loop s max (i + 1) 440 | | '\x00' .. '\x08' | '\x0A' .. '\x1F' | '\x7F' -> 441 | dec_error_expected d ~first:i [Basic_char] 442 | | '\x5C' -> 443 | dec_basic_string_with_escapes d i 444 | | _ -> 445 | let udec = String.get_utf_8_uchar s i in 446 | match Uchar.utf_decode_is_valid udec with 447 | | true -> loop s max (i + Uchar.utf_decode_length udec) 448 | | false -> dec_error_expected d ~first:i [Literal_char] 449 | in 450 | loop d.s d.max_pos d.pos 451 | 452 | let is_unquoted_key_char = function 453 | | 'a' .. 'z' | 'A' .. 'Z' | '0' .. '9' | '-' | '_' -> true | _ -> false 454 | 455 | let dec_unquoted_key d = 456 | let rec loop s max i = 457 | if i > max || not (is_unquoted_key_char s.[i]) then i else 458 | loop s max (i + 1) 459 | in 460 | let next = loop d.s d.max_pos d.pos in 461 | let first = d.pos and last = next - 1 in 462 | let s = String.sub d.s first (last - first + 1) in 463 | d.pos <- next; 464 | s 465 | 466 | let dec_simple_key d = 467 | if d.pos > d.max_pos then dec_error_expected d ~first:d.pos [Key] else 468 | match d.s.[d.pos] with 469 | | '\'' -> let s = dec_literal_string d in `Quoted_literal, s 470 | | '\"' -> let s = dec_basic_string d in `Quoted_basic, s 471 | | c when is_unquoted_key_char c -> `Bare, dec_unquoted_key d 472 | | _ -> dec_error_expected d ~first:d.pos [Key] 473 | 474 | let dec_key d ws = 475 | let rec loop d acc wsl = 476 | let k = dec_simple_key d in 477 | let wsr = dec_ws d in 478 | let acc = (wsl, k, wsr) :: acc in 479 | if d.pos > d.max_pos || d.s.[d.pos] <> '.' then List.rev acc else 480 | let ws = dec_ws d in 481 | loop d acc ws 482 | in 483 | loop d [] ws 484 | 485 | let dec_true d = (* 't' checked *) 486 | let last = d.pos + 3 in 487 | if last > d.max_pos then dec_error_eoi d ~first:d.pos [True] else 488 | if d.s.[d.pos + 1] = 'r' && d.s.[d.pos + 2] = 'u' && d.s.[d.pos + 3] = 'e' 489 | then (d.pos <- last + 1; Boolean true) 490 | else dec_error_expected d ~first:d.pos ~last [True] 491 | 492 | let dec_false d = (* 'f' checked *) 493 | let last = d.pos + 4 in 494 | if last > d.max_pos then dec_error_eoi d ~first:d.pos [False] else 495 | if d.s.[d.pos + 1] = 'a' && d.s.[d.pos + 2] = 'l' && 496 | d.s.[d.pos + 3] = 's' && d.s.[d.pos + 4] = 'e' 497 | then (d.pos <- last + 1; Boolean false) 498 | else dec_error_expected d ~first:d.pos ~last [False] 499 | 500 | let parse_float d ~first ~last = 501 | (* [float_of_string] is compatible with what we accept. *) 502 | try Float (float_of_string (String.sub d.s first (last - first + 1))) with 503 | | Failure _ -> assert false 504 | 505 | let parse_int64 d ~first ~last = 506 | try Integer (Int64.of_string (String.sub d.s first (last - first + 1))) with 507 | | Failure _ -> 508 | (* [Int64.of_string] is compatible with what our parser checked. If 509 | we get here, it should be only because of overflow. *) 510 | dec_error_invalid d ~first ~last Int64_overflow 511 | 512 | let dec_inf d ~first = (* 'i' checked *) 513 | let last = d.pos + 2 in 514 | if last > d.max_pos then dec_error_eoi d ~first [Inf] else 515 | if d.s.[d.pos + 1] = 'n' && d.s.[d.pos + 2] = 'f' 516 | then (d.pos <- last + 1; parse_float d ~first ~last) 517 | else dec_error_expected d ~first ~last [Inf] 518 | 519 | let dec_nan d ~first = (* 'n' checked *) 520 | let last = d.pos + 2 in 521 | if last > d.max_pos then dec_error_eoi d ~first [Nan] else 522 | if d.s.[d.pos + 1] = 'a' && d.s.[d.pos + 2] = 'n' 523 | then (d.pos <- last + 1; parse_float d ~first ~last) 524 | else dec_error_expected d ~first ~last [Nan] 525 | 526 | let rec dec_float_exp d ~first = (* 'e' checked *) 527 | let rec loop d ~last_is_digit ~first pos = 528 | if pos > d.max_pos then parse_float d ~first ~last:(pos - 1) else 529 | match d.s.[pos] with 530 | | '0' .. '9' -> loop d ~last_is_digit:true ~first (pos + 1) 531 | | '_' when last_is_digit -> loop d ~last_is_digit:false ~first (pos + 1) 532 | | _ when last_is_digit -> 533 | d.pos <- pos; parse_float d ~first ~last:(pos - 1) 534 | | _ -> dec_error_expected d ~first:pos [Dec_digit] 535 | in 536 | let next = d.pos + 1 in 537 | if next > d.max_pos then dec_error_eoi d ~first [Float_exp] else 538 | match d.s.[next] with 539 | | '-' | '+' -> d.pos <- next; dec_float_exp d ~first 540 | | _ -> loop d ~last_is_digit:false ~first next 541 | 542 | let dec_float_frac d ~first = (* '.' checked *) 543 | let rec loop d ~last_is_digit ~first pos = 544 | if pos > d.max_pos then parse_float d ~first ~last:(pos - 1) else 545 | match d.s.[pos] with 546 | | '0' .. '9' -> loop d ~last_is_digit:true ~first (pos + 1) 547 | | 'e' -> d.pos <- pos; dec_float_exp d ~first 548 | | '_' when last_is_digit -> loop d ~last_is_digit:false ~first (pos + 1) 549 | | _ when last_is_digit -> 550 | d.pos <- pos; parse_float d ~first ~last:(pos - 1) 551 | | _ -> dec_error_expected d ~first:pos [Dec_digit] 552 | in 553 | let next = d.pos + 1 in 554 | if next > d.max_pos then dec_error_eoi d ~first [Float_frac] else 555 | loop d ~last_is_digit:false ~first next 556 | 557 | let rec dec_dec_integer_or_float d ~first = 558 | if d.pos > d.max_pos then dec_error_eoi d ~first [Integer; Float] else 559 | match d.s.[d.pos] with 560 | | 'i' -> dec_inf d ~first 561 | | 'n' -> dec_nan d ~first 562 | | '0' -> 563 | let next = d.pos + 1 in 564 | begin match d.s.[next] with 565 | | '.' -> d.pos <- next; dec_float_frac d ~first 566 | | 'e' -> d.pos <- next; dec_float_exp d ~first 567 | | '0' .. '9' -> dec_error_invalid d ~first Leading_zeros 568 | | _ -> Integer 0L 569 | end 570 | | '1' .. '9' -> 571 | let rec loop d ~last_is_digit ~first pos = 572 | if pos > d.max_pos then parse_int64 d ~first ~last:(pos - 1) else 573 | match d.s.[pos] with 574 | | '0' .. '9' -> loop d ~last_is_digit:true ~first (pos + 1) 575 | | '.' -> d.pos <- pos; dec_float_frac d ~first 576 | | 'e' -> d.pos <- pos; dec_float_exp d ~first 577 | | '_' when last_is_digit -> 578 | loop d ~last_is_digit:false ~first (pos + 1) 579 | | _ when last_is_digit -> 580 | d.pos <- pos; parse_int64 d ~first ~last:(pos - 1) 581 | | _ -> dec_error_expected d ~first:pos [Dec_digit] 582 | in 583 | loop d ~last_is_digit:false ~first d.pos 584 | | _ -> dec_error_expected d ~first:d.pos [Integer; Float] 585 | 586 | let dec_hex_integer d ~first = 587 | if d.pos > d.max_pos then dec_error_eoi d ~first [Hex_digit] else 588 | let rec loop d ~last_is_digit ~first pos = 589 | if pos > d.max_pos then parse_int64 d ~first ~last:(pos - 1) else 590 | match d.s.[pos] with 591 | | '0' .. '9' | 'A' .. 'F' -> loop d ~last_is_digit:true ~first (pos + 1) 592 | | '_' when last_is_digit -> loop d ~last_is_digit:false ~first (pos + 1) 593 | | _ when last_is_digit -> 594 | d.pos <- pos; parse_int64 d ~first ~last:(pos - 1) 595 | | _ -> dec_error_expected d ~first:pos [Hex_digit] 596 | in 597 | loop d ~last_is_digit:false ~first d.pos 598 | 599 | let dec_oct_integer d ~first = 600 | if d.pos > d.max_pos then dec_error_eoi d ~first [Oct_digit] else 601 | let rec loop d ~last_is_digit ~first pos = 602 | if pos > d.max_pos then parse_int64 d ~first ~last:(pos - 1) else 603 | match d.s.[pos] with 604 | | '0' .. '7' -> loop d ~last_is_digit:true ~first (pos + 1) 605 | | '_' when last_is_digit -> loop d ~last_is_digit:false ~first (pos + 1) 606 | | _ when last_is_digit -> 607 | d.pos <- pos; parse_int64 d ~first ~last:(pos - 1) 608 | | _ -> dec_error_expected d ~first:pos [Oct_digit] 609 | in 610 | loop d ~last_is_digit:false ~first d.pos 611 | 612 | let dec_bin_integer d ~first = 613 | if d.pos > d.max_pos then dec_error_eoi d ~first [Bin_digit] else 614 | let rec loop d ~last_is_digit ~first pos = 615 | if pos > d.max_pos then parse_int64 d ~first ~last:(pos - 1) else 616 | match d.s.[pos] with 617 | | '0' .. '1' -> loop d ~last_is_digit:true ~first (pos + 1) 618 | | '_' when last_is_digit -> loop d ~last_is_digit:false ~first (pos + 1) 619 | | _ when last_is_digit -> 620 | d.pos <- pos; parse_int64 d ~first ~last:(pos - 1) 621 | | _ -> dec_error_expected d ~first:pos [Bin_digit] 622 | in 623 | loop d ~last_is_digit:false ~first d.pos 624 | 625 | let dec_integer_or_float d ~first = (* first digit checked *) 626 | d.pos <- d.pos + 1; 627 | if d.pos > d.max_pos then parse_int64 d ~first ~last:(d.pos - 1) else 628 | match d.s.[d.pos] with 629 | | 'x' -> d.pos <- d.pos + 1; dec_hex_integer d ~first 630 | | 'o' -> d.pos <- d.pos + 1; dec_oct_integer d ~first 631 | | 'b' -> d.pos <- d.pos + 1; dec_bin_integer d ~first 632 | | '0' .. '9' -> dec_dec_integer_or_float d ~first 633 | | _ -> dec_error_expected d ~first ~last:d.pos [Integer] 634 | 635 | let n_digits d ~n = 636 | let rec loop d n i v = 637 | if n = 0 then (d.pos <- i; v) else 638 | if i > d.max_pos then dec_error_eoi d ~first:i [Dec_digit] else 639 | match d.s.[i] with 640 | | '0' .. '9' as c -> 641 | loop d (n - 1) (i + 1) (v * 10 + (Char.code c - 0x30)) 642 | | _ -> dec_error_expected d ~first:i [Dec_digit] 643 | in 644 | loop d n d.pos 0 645 | 646 | let dec_char d ~c = 647 | if d.pos > d.max_pos then dec_error_eoi d ~first:d.pos [Char c] else 648 | if d.s.[d.pos] = c then (d.pos <- d.pos + 1) else 649 | dec_error_expected d ~first:d.pos [Char c] 650 | 651 | let dec_full_date d = 652 | let first = d.pos in 653 | let yyyy = n_digits d ~n:4 in 654 | dec_char d ~c:'-'; 655 | let mm = n_digits d ~n:2 in 656 | dec_char d ~c:'-'; 657 | let dd = n_digits d ~n:2 in 658 | let date = yyyy, mm, dd in 659 | if is_date_valid date then date else 660 | dec_error_invalid d ~first ~last:(d.pos - 1) (Date date) 661 | 662 | let dec_hour d = 663 | let first = d.pos in 664 | let hh = n_digits d ~n:2 in 665 | if 00 <= hh && hh <= 23 then hh else 666 | dec_error_invalid d ~first ~last:(d.pos - 1) (Hour hh) 667 | 668 | let dec_minute d = 669 | let first = d.pos in 670 | let mm = n_digits d ~n:2 in 671 | if 00 <= mm && mm <= 59 then mm else 672 | dec_error_invalid d ~first ~last:(d.pos - 1) (Minute mm) 673 | 674 | let dec_sec d = 675 | let first = d.pos in 676 | let ss = n_digits d ~n:2 in 677 | if 00 <= ss && ss <= 60 then ss else 678 | dec_error_invalid d ~first ~last:(d.pos - 1) (Second ss) 679 | 680 | let dec_secfrac d = (* . checked *) 681 | d.pos <- d.pos + 1; 682 | let rec loop d first i = 683 | if i > d.max_pos || 684 | match d.s.[i] with '0' .. '9' -> false | _ -> true 685 | then (float_of_string ("0." ^ String.sub d.s first (i - first))) 686 | else loop d first (i + 1) 687 | in 688 | loop d d.pos d.pos 689 | 690 | let dec_partial_time d = 691 | let hh = dec_hour d in 692 | dec_char d ~c:':'; 693 | let mm = dec_minute d in 694 | dec_char d ~c:':'; 695 | let ss = n_digits d ~n:2 in 696 | match d.s.[d.pos] with 697 | | '.' -> (hh, mm, ss), Some (dec_secfrac d) 698 | | _ -> (hh, mm, ss), None 699 | 700 | let dec_time_offset d = 701 | if d.pos > d.max_pos then dec_error_eoi d ~first:d.pos [Tz_offset] else 702 | match d.s.[d.pos] with 703 | | 'Z' -> d.pos <- d.pos + 1; 0 704 | | '+' | '-' as sign -> 705 | d.pos <- d.pos + 1; 706 | let hh = dec_hour d in 707 | dec_char d ~c:':'; 708 | let mm = dec_minute d in 709 | (if sign = '+' then 1 else -1) * (hh * 3600 + mm * 60) 710 | | _ -> 711 | dec_error_eoi d ~first:d.pos [Tz_offset] 712 | 713 | let dec_full_time d = 714 | let time, frac = dec_partial_time d in 715 | time, frac, Some (dec_time_offset d) 716 | 717 | let dec_local_time d = 718 | let time, frac = dec_partial_time d in 719 | Date_time (None, Some (time, frac, None)) 720 | 721 | let dec_date_time d = 722 | let date = dec_full_date d in 723 | if d.pos > d.max_pos then Date_time (Some date, None) else 724 | match d.s.[d.pos] with 725 | | 'T' -> 726 | d.pos <- d.pos + 1; 727 | let time = dec_full_time d in 728 | Date_time (Some date, Some time) 729 | | ' ' -> 730 | d.pos <- d.pos + 1; 731 | begin match d.s.[d.pos] with 732 | | '0' .. '9' -> Date_time (Some date, Some (dec_full_time d)) 733 | | _ -> Date_time (Some date, None) 734 | end 735 | | _ -> 736 | Date_time (Some date, None) 737 | 738 | let dec_val d = 739 | if d.pos > d.max_pos then dec_error_expected d ~first:d.pos [Value] else 740 | match d.s.[d.pos] with 741 | | '+' | '-' -> 742 | let first = d.pos in 743 | d.pos <- d.pos + 1; dec_dec_integer_or_float d ~first 744 | | 'i' -> dec_inf d ~first:d.pos 745 | | 'n' -> dec_nan d ~first:d.pos 746 | | '0' .. '9' -> 747 | let colon_pos = d.pos + 2 in 748 | if colon_pos > d.max_pos then dec_integer_or_float d ~first:d.pos else 749 | if d.s.[colon_pos] = ':' then dec_local_time d else 750 | let dash_pos = d.pos + 4 in 751 | if dash_pos > d.max_pos then dec_integer_or_float d ~first:d.pos else 752 | if d.s.[dash_pos] = '-' then dec_date_time d else 753 | dec_integer_or_float d ~first:d.pos 754 | | '\'' -> failwith "TODO" 755 | | '\"' -> failwith "TODO" 756 | | 't' -> dec_true d 757 | | 'f' -> dec_false d 758 | | '[' -> failwith "TODO" 759 | | _ -> dec_error_expected d ~first:d.pos [Value] 760 | 761 | let dec_keyval_sep d = 762 | if d.pos > d.max_pos || d.s.[d.pos] <> '=' 763 | then dec_error_expected d ~first:d.pos [Equal_char] 764 | else dec_ws d 765 | 766 | let dec_keyval d ws = 767 | let key = dec_key d ws in 768 | let ws = dec_keyval_sep d in 769 | let v = dec_val d in 770 | key, ws, v 771 | 772 | let dec_keyval_expression d ws = 773 | let kv = dec_keyval d ws in 774 | Keyval' (kv, dec_ws_comment_newline d) 775 | 776 | let dec_expression d = 777 | let ws = dec_ws d in 778 | if d.pos > d.max_pos then Whitespace (ws, Textloc.none) else 779 | match d.s.[d.pos] with 780 | | '#' -> Whitespace (ws, dec_comment_newline d) 781 | | '\n' | '\r' -> Whitespace (dec_newline d ws, Textloc.none) 782 | | '[' -> dec_table d ws 783 | | '\'' | '\"' -> dec_keyval_expression d ws 784 | | c when is_unquoted_key_char c -> dec_keyval_expression d ws 785 | | _ -> 786 | let exps = Error.[Keyval; Table; Comment; Newline; Eoi] in 787 | dec_error_expected d ~first:d.pos exps 788 | 789 | let dec_toml d = 790 | let rec loop d acc = 791 | if d.pos > d.max_pos then List.rev acc else 792 | loop d (dec_expression d :: acc) 793 | in 794 | loop d [] 795 | 796 | let of_string ?layout ?locs ?file i = 797 | let d = decoder ?layout ?locs ?file i in 798 | match dec_toml d with 799 | | _ -> Ok ([], Meta.none) 800 | | exception Err (err, loc) -> Error (err, loc) 801 | 802 | let of_string' ?pp_error ?file i = 803 | Error.to_string ?pp:pp_error (of_string ?file i) 804 | 805 | let to_string t = failwith "TODO" 806 | 807 | 808 | (* FIXME cut and paste from sexp *) 809 | 810 | (* Indices *) 811 | 812 | type index = Nth of int | Key of string 813 | 814 | let pp_key = Format.pp_print_string 815 | let pp_index ?(pp_key = pp_key) () ppf = function 816 | | Nth n -> pf ppf "[%d]" n 817 | | Key k -> pp_key ppf k 818 | 819 | let pp_bracketed_index ?(pp_key = pp_key) () ppf = function 820 | | Nth n -> pf ppf "[%d]" n 821 | | Key k -> pf ppf "[%a]" pp_key k 822 | 823 | (* Paths *) 824 | 825 | type path = index list (* reversed *) 826 | 827 | let path_err i fmt = Format.kasprintf failwith ("%d: " ^^ fmt) i 828 | let path_err_unexp_eoi i = path_err i "unexpected end of input" 829 | let path_err_unexp_char i s = path_err i "unexpected character: %C" s.[i] 830 | let path_err_illegal_char i s = path_err i "illegal character here: %C" s.[i] 831 | let err_unexp i s = 832 | path_err i "unexpected input: %S" (Textdec.string_subrange ~first:i s) 833 | 834 | let path_parse_eoi s i max = if i > max then () else err_unexp i s 835 | let path_parse_index p s i max = 836 | let first, stop = match s.[i] with '[' -> i + 1, ']' | _ -> i, '.' in 837 | let last, next = 838 | let rec loop stop s i max = match i > max with 839 | | true -> if stop = ']' then path_err_unexp_eoi i else (i - 1), i 840 | | false -> 841 | let illegal = s.[i] = '[' || (s.[i] = ']' && stop = '.') in 842 | if illegal then path_err_illegal_char i s else 843 | if s.[i] <> stop then loop stop s (i + 1) max else 844 | (i - 1), if stop = ']' then i + 1 else i 845 | in 846 | loop stop s first max 847 | in 848 | let idx = Textdec.string_subrange ~first ~last s in 849 | if idx = "" then path_err first "illegal empty index" else 850 | match int_of_string idx with 851 | | exception Failure _ -> next, (Key idx) :: p 852 | | idx -> next, (Nth idx) :: p 853 | 854 | let path_of_string s = 855 | let rec loop p s i max = 856 | if i > max then p else 857 | let next, p = path_parse_index p s i max in 858 | if next > max then p else 859 | if s.[next] <> '.' then path_err_unexp_char next s else 860 | if next + 1 <= max then loop p s (next + 1) max else 861 | path_err_unexp_eoi next 862 | in 863 | try 864 | if s = "" then Ok [] else 865 | let start = if s.[0] = '.' then 1 else 0 in 866 | Ok (loop [] s start (String.length s - 1)) 867 | with Failure e -> Error e 868 | 869 | let pp_path ?pp_key () ppf is = 870 | let pp_sep ppf () = Format.pp_print_char ppf '.' in 871 | Format.pp_print_list ~pp_sep (pp_index ?pp_key ()) ppf (List.rev is) 872 | 873 | (* Carets *) 874 | 875 | type caret_loc = Before | Over | After 876 | type caret = caret_loc * path 877 | 878 | let caret_path (_, p) = p 879 | let caret_of_string s = 880 | let rec loop p s i max = 881 | if i > max then Over, p else 882 | let next = i + 1 in 883 | match s.[i] with 884 | | 'v' when next <= max && s.[next] = '[' -> 885 | let next, p = path_parse_index p s next max in 886 | path_parse_eoi s next max; Before, p 887 | | c -> 888 | let next, p = path_parse_index p s i max in 889 | if next > max then Over, p else 890 | if s.[next] = 'v' 891 | then (path_parse_eoi s (next + 1) max; After, p) else 892 | if s.[next] <> '.' then path_err_unexp_char next s else 893 | if next + 1 <= max then loop p s (next + 1) max else 894 | path_err_unexp_eoi next 895 | in 896 | try 897 | if s = "" then Ok (Over, []) else 898 | let start = if s.[0] = '.' then 1 else 0 in 899 | Ok (loop [] s start (String.length s - 1)) 900 | with Failure e -> Error e 901 | 902 | let pp_caret ?pp_key () ppf = function 903 | | Over, p -> (pp_path ?pp_key ()) ppf p 904 | | Before, (c :: p) -> 905 | (pp_path ?pp_key ()) ppf p; 906 | (if p <> [] then Format.pp_print_char ppf '.'); 907 | Format.pp_print_char ppf 'v'; (pp_bracketed_index ?pp_key ()) ppf c 908 | | After, (c :: p) -> 909 | (pp_path ?pp_key ()) ppf p; 910 | (if p <> [] then Format.pp_print_char ppf '.'); 911 | (pp_bracketed_index ?pp_key ()) ppf c; Format.pp_print_char ppf 'v' 912 | | _ -> () 913 | end 914 | -------------------------------------------------------------------------------- /src/serialkit_toml.mli: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2021 The serialkit programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (** TOML support. 7 | 8 | As specified in {{:https://toml.io/en/v1.0.0}TOML v1.0.0}. 9 | 10 | Open this module to use it, this only introduces modules in your scope. *) 11 | 12 | open Serialkit_text 13 | 14 | (** TOML definitions and codec. 15 | 16 | {b References} 17 | {ul 18 | {- Tom Preston-Werner et al, 19 | {:https://toml.io/en/v1.0.0}{e TOML v1.0.0}}.}} *) 20 | module Toml : sig 21 | 22 | (** {1:toml TOML} *) 23 | 24 | val file_ext : string 25 | (** [file_ext] is [".toml"]. *) 26 | 27 | val mime_type : string 28 | (** [mime_type] is ["application/toml"]. *) 29 | 30 | val version : string 31 | (** [version] is the supported TOML version. *) 32 | 33 | (** Text locations. *) 34 | module Textloc = Serialkit_text.Textloc 35 | 36 | (** Node metadata. *) 37 | module Meta = Serialkit_text.Meta 38 | 39 | type 'a node = 'a * Meta.t 40 | (** The type for abstract syntax tree nodes. The data of type ['a] and its 41 | metadata. *) 42 | 43 | type tz_offset_s = int 44 | (** The type for time zone offsets between local and UTC timelines 45 | in seconds. This is the signed difference in seconds between 46 | the local timeline and the UTC timeline. *) 47 | 48 | type date = int * int * int 49 | (** The type dates. A year, a month and a day. *) 50 | 51 | type time = (int * int * int) * float option * tz_offset_s option 52 | type date_time = date option * time option 53 | type table = (string node * value) node list 54 | 55 | and value = 56 | [ `Boolean of bool node 57 | | `Integer of int64 node 58 | | `Float of float node 59 | | `String of string node 60 | | `Array of value list node 61 | | `Date_time of date_time node 62 | | `Table of table node ] 63 | 64 | type t = table node 65 | (** The type for TOML documents. *) 66 | 67 | (** {1:fmt Formatting} *) 68 | 69 | type 'a fmt = Format.formatter -> 'a -> unit 70 | (** The type for formatting functions. *) 71 | 72 | val pp : t fmt 73 | (** [pp] formats TOML. *) 74 | 75 | val pp_layout : t fmt 76 | (** [pp_layout] is like [pp] but uses layout information. *) 77 | 78 | val pp_value : value fmt 79 | (** [pp_value] formats TOML values. *) 80 | 81 | (** {1:codec Codec} *) 82 | 83 | (** Decoding errors. *) 84 | module Error : sig 85 | 86 | type kind 87 | (** The type for kinds of decoding errors. *) 88 | 89 | val pp_kind : unit -> kind fmt 90 | (** [pp_error_kind ()] formats an error kind. *) 91 | 92 | type t = kind * Textloc.t 93 | (** The type for decoding errors. The error kind and its location in 94 | text. *) 95 | 96 | val pp : 97 | ?pp_loc:Textloc.t fmt -> ?pp_kind:kind fmt -> 98 | ?pp_prefix:unit fmt -> unit -> t fmt 99 | (** [pp ~pp_loc ~pp_error_kind ~pp_prefix ()] formats errors 100 | using [pp_loc] (defaults to {!pp_loc}), [pp_error_kind] 101 | (defaults to {!pp_error_kind}) and [pp_prefix] (defaults formats 102 | ["Error: "]). *) 103 | 104 | val to_string : ?pp:t fmt -> ('a, t) result -> ('a, string) result 105 | (** [to_string ~pp r] converts an error to a string using [pp] 106 | (defaults to {!pp}). *) 107 | end 108 | 109 | val of_string : 110 | ?layout:bool -> ?locs:bool -> ?file:Textloc.fpath -> string -> 111 | (t, Error.t) result 112 | (** [of_string s] parses TOML from [s]. 113 | 114 | {ul 115 | {- [file] is the file path from which [s] is assumed to have been 116 | read (defaults to {!Textloc.none}).} 117 | {- If [locs] is [true] (default) locations are stored in nodes of the 118 | abstract syntax tree in {{!Meta.id}individually identifified} {!Meta.t} 119 | values. If [false] node meta values are all {!Meta.none} whose text 120 | location is {!Textloc.none}.} 121 | {- If [layout] is [false] (default) layout values cannot be relied 122 | and do not in general represent source layout.}} 123 | 124 | The parser has the following limitations. 125 | 126 | {ul 127 | {- The ranges of date and time {e fields} are checked but 128 | dates are not checked for validity. Use your favourite 129 | date-time library to validate them.}} 130 | 131 | {b Note.} All OCaml strings returned by this function are UTF-8 132 | encoded. *) 133 | 134 | val of_string' : 135 | ?pp_error:Error.t fmt -> ?file:Textloc.fpath -> string -> 136 | (t, string) result 137 | (** [of_string'] is [of_string] composed with {!error_to_string}. *) 138 | 139 | val to_string : t -> string 140 | (** [to_string t] is [t] as TOML. 141 | 142 | {b Warning.} Assumes all OCaml strings in [t] are UTF-8 encoded. *) 143 | 144 | (** {1:toml_index TOML indices} *) 145 | 146 | type index = 147 | | Nth of int (** *) 148 | | Key of string (** *) 149 | (** The type for TOML indexing operations. 150 | {ul 151 | {- [Nth n], lookup zero-based element [n] in a list. If [n] is 152 | negative, counts the number of elements from the end of the 153 | list, i.e. [-1] is the last list element.} 154 | {- [Key k], lookup binding [k] in an s-expression 155 | {{!sexp_dict}dictionary.}}} *) 156 | 157 | val pp_key : string fmt 158 | (** [pp_key] formats a key, this is {!Format.pp_print_string}. *) 159 | 160 | val pp_index : ?pp_key:string fmt -> unit -> index fmt 161 | (** [pp_index] formats indices. Keys are unbracketed and formatted 162 | with [pp_key], defaults to {!pp_key}. *) 163 | 164 | (** {1:toml_path TOML paths} *) 165 | 166 | type path = index list 167 | (** The type for paths, a sequence of indexing operations in {b reverse} 168 | order. *) 169 | 170 | val path_of_string : string -> (path, string) result 171 | (** [path_of_string] parses a path from [s] according to the syntax 172 | {{!sexp_path_caret}given here}. *) 173 | 174 | val pp_path : ?pp_key:string fmt -> unit -> path fmt 175 | (** [pp_path ?pp_key ()] is a formatter for paths using [pp_key] to 176 | format keys (defaults to {!pp_key}). *) 177 | 178 | (** {1:carets Carets} *) 179 | 180 | type caret_loc = 181 | | Before (** The void before the TOML found by the path. *) 182 | | Over (** The TOML found by the path. *) 183 | | After (** The void after the TOML found by the path. *) 184 | (** The type for caret locations. *) 185 | 186 | type caret = caret_loc * path 187 | (** The type for carets. A caret location and the path at which it 188 | applies. *) 189 | 190 | val caret_of_string : string -> (caret, string) result 191 | (** [caret_of_string s] parses a caret from [s] according to the 192 | syntax {{!sexp_path_caret}given here}. *) 193 | 194 | val pp_caret : ?pp_key:string fmt -> unit -> caret fmt 195 | (** [pp_caret ?pp_key ()] is a formatter for carets using [pp_key] 196 | to format keys (defaults to {!pp_key}). *) 197 | end 198 | -------------------------------------------------------------------------------- /test/expect/test.sexp: -------------------------------------------------------------------------------- 1 | ;; 2 | 3 | testabc adf 4 | (goet-list' bla bli blo blu) 5 | (hey 6 | gluu 7 | (hi blu)) 8 | 9 | (get-atom atom) 10 | 11 | (l0 12 | (l00 a b c d) 13 | (l00) 14 | (l01 a b c d) 15 | (l02 a 16 | (l03 bla bli) ; Hahahahaha 17 | );asdf 18 | ; 19 | ) 20 | 21 | ;; asdf 22 | -------------------------------------------------------------------------------- /test/expect/test.toml: -------------------------------------------------------------------------------- 1 | # This is a TOML document 2 | 3 | title = "TOML Example" 4 | 5 | [owner] 6 | name = "Tom Preston-Werner" 7 | dob = 1979-05-27T07:32:00-08:00 8 | 9 | [database] 10 | enabled = true 11 | ports = [ 8000, 8001, 8002 ] 12 | data = [ ["delta", "phi"], [3.14] ] 13 | temp_targets = { cpu = 79.5, case = 72.0 } 14 | 15 | [servers] 16 | 17 | [servers.alpha] 18 | ip = "10.0.0.1" 19 | role = "frontend" 20 | 21 | [servers.beta] 22 | ip = "10.0.0.2" 23 | role = "backend" 24 | -------------------------------------------------------------------------------- /test/test.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2019 The serialkit programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | -------------------------------------------------------------------------------- /test/test_toml.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2021 The serialkit programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | let log fmt = Format.kfprintf (Fun.const ()) Format.std_formatter (fmt ^^ "@.") 7 | let log_fail pp ~exp ~fnd = 8 | log "@[Expected: @[%a@]@,Found: @[%a@]@]" pp exp pp fnd 9 | 10 | let main () = 11 | log "All tests succeeded." 12 | 13 | let () = if !Sys.interactive then () else main () 14 | -------------------------------------------------------------------------------- /tool/cmd_main.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2023 The cmarkit programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | open Std 7 | open Cmdliner 8 | 9 | let cmds = [ Cmd_sexp.v; Cmd_toml.v; ] 10 | 11 | let serialkit = 12 | let doc = "Process serialization formats" in 13 | let exits = Exit.exits_with_err_diff in 14 | let man = [ 15 | `S Manpage.s_description; 16 | `P "$(mname) process various serialization formats"; 17 | `Blocks Cli.common_man; ] 18 | in 19 | Cmd.group (Cmd.info "serialkit" ~version:"%%VERSION%%" ~doc ~exits ~man) @@ 20 | cmds 21 | 22 | let main () = exit (Cmd.eval' serialkit) 23 | let () = if !Sys.interactive then () else main () 24 | -------------------------------------------------------------------------------- /tool/cmd_main.mli: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2023 The cmarkit programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | -------------------------------------------------------------------------------- /tool/cmd_sexp.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2019 The serialkit programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | open Std 7 | open Serialkit_sexp 8 | 9 | let err_file = 1 10 | let err_sexp = 2 11 | let err_query = 3 12 | 13 | let exec = Filename.basename Sys.executable_name 14 | let log_err fmt = 15 | Format.fprintf Format.err_formatter ("%s: @[" ^^ fmt ^^ "@]@.") exec 16 | 17 | let log_on_error ~exit:code r f = match r with 18 | | Error e -> log_err "%s" e; code | Ok v -> f v 19 | 20 | let delete file path = 21 | let query = Sexpq.delete_at_path ~must_exist:false path in 22 | log_on_error ~exit:err_file (Os.read_file file) @@ fun content -> 23 | log_on_error ~exit:err_sexp (Sexp.seq_of_string' ~file content) @@ fun sexp -> 24 | log_on_error ~exit:err_query (Sexpq.query' query sexp) @@ fun sexp -> 25 | Format.printf "@[%a@]" Sexp.pp_seq_layout sexp; 0 26 | 27 | let get file path = 28 | log_on_error ~exit:err_file (Os.read_file file) @@ fun content -> 29 | log_on_error ~exit:err_sexp (Sexp.seq_of_string' ~file content) @@ fun sexp -> 30 | match path with 31 | | None -> Format.printf "@[%a@]" Sexp.pp_seq_layout sexp; 0 32 | | Some path -> 33 | let query = Sexpq.path path Sexpq.sexp in 34 | log_on_error ~exit:err_query (Sexpq.query' query sexp) @@ function 35 | | `A (a, _) | `L ([`A (a, _)], _) -> Format.printf "%s@." a; 0 36 | | `L _ as l -> Format.printf "@[%a@]@." Sexp.pp l; 0 37 | 38 | let locs file = 39 | let pp_loc ppf l = 40 | Serialkit_text.Textloc.pp ppf l; Format.pp_print_char ppf ':' 41 | in 42 | let rec pp_locs ppf = function 43 | | `A (_, _) as s -> pp_loc ppf (Sexp.loc s) 44 | | `L (vs, _) as s -> 45 | pp_loc ppf (Sexp.loc s); Format.pp_print_cut ppf (); 46 | Format.pp_print_list pp_locs ppf vs 47 | in 48 | log_on_error ~exit:err_file (Os.read_file file) @@ fun content -> 49 | log_on_error ~exit:err_sexp (Sexp.seq_of_string' ~file content) @@ fun sexp -> 50 | Format.printf "@[%a@]@." pp_locs sexp; 0 51 | 52 | let set file caret v = 53 | log_on_error ~exit:err_file (Os.read_file file) @@ fun content -> 54 | log_on_error ~exit:err_sexp (Sexp.seq_of_string' ~file content) @@ fun sexp -> 55 | log_on_error ~exit:err_sexp (Sexp.seq_of_string' v) @@ fun v -> 56 | let query = Sexpq.splice_at_caret ~must_exist:false caret ~rep:v in 57 | log_on_error ~exit:err_query (Sexpq.query' query sexp) @@ fun sexp -> 58 | Format.printf "@[%a@]" Sexp.pp_seq_layout sexp; 0 59 | 60 | (* Command line interface *) 61 | 62 | open Cmdliner 63 | 64 | let file_arg = 65 | let doc = "$(docv) is the s-expression file. Use $(b,-) for stdin." in 66 | Arg.(required & pos 0 (some string) None & info [] ~doc ~docv:"FILE") 67 | 68 | let path_arg = 69 | let parse s = match Sexp.path_of_string s with 70 | | Ok _ as v -> v | Error e -> Error (`Msg e) 71 | in 72 | Arg.conv ~docv:"SPATH" (parse, Sexp.pp_path ()) 73 | 74 | let caret_arg = 75 | let parse s = match Sexp.caret_of_string s with 76 | | Ok _ as v -> v | Error e -> Error (`Msg e) 77 | in 78 | Arg.conv ~docv:"CARET" (parse, Sexp.pp_caret ()) 79 | 80 | let exits = 81 | Cmd.Exit.info err_file ~doc:"on file read errors" :: 82 | Cmd.Exit.info err_sexp ~doc:"on s-expression parse errors" :: 83 | Cmd.Exit.info err_query ~doc:"on path query errors" :: 84 | Cmd.Exit.defaults 85 | 86 | let common_man = 87 | [ `S Manpage.s_bugs; 88 | `P "This program is distributed with the serialk OCaml library. \ 89 | See $(i,%%PKG_HOMEPAGE%%) for contact information."; ] 90 | 91 | let delete_cmd = 92 | let doc = "Delete an s-expression path" in 93 | let sdocs = Manpage.s_common_options in 94 | let man = [ 95 | `S Manpage.s_description; 96 | `P "$(tname) returns the value of an s-expression file key."; 97 | `S Manpage.s_examples; 98 | `P "TODO"; 99 | `Blocks common_man; ] 100 | in 101 | let path_arg = 102 | let doc = "Delete s-expression path $(docv)." in 103 | Arg.(required & pos 1 (some path_arg) None & info [] ~doc ~docv:"SPATH") 104 | in 105 | Cmd.v (Cmd.info "delete" ~doc ~sdocs ~exits ~man) 106 | Term.(const delete $ file_arg $ path_arg) 107 | 108 | let get_cmd, get_term = 109 | let doc = "Extract an s-expression path (default)" in 110 | let sdocs = Manpage.s_common_options in 111 | let man = [ 112 | `S Manpage.s_description; 113 | `P "$(tname) returns the value of an s-expression file key."; 114 | `S Manpage.s_examples; 115 | `P "TODO"; 116 | `Blocks common_man; ] 117 | in 118 | let key_opt_arg = 119 | let doc = "Extract s-expression path $(docv)." in 120 | Arg.(value & pos 1 (some path_arg) None & info [] ~doc ~docv:"SPATH") 121 | in 122 | let term = Term.(const get $ file_arg $ key_opt_arg) in 123 | Cmd.v (Cmd.info "get" ~doc ~sdocs ~exits ~man) term, term 124 | 125 | let set_cmd = 126 | let doc = "Edit an s-expression path" in 127 | let sdocs = Manpage.s_common_options in 128 | let man = [ 129 | `S Manpage.s_description; 130 | `P "$(tname) sets the value of an s-expression file key."; 131 | `S Manpage.s_examples; 132 | `P "TODO"; 133 | `Blocks common_man; ] 134 | in 135 | let caret_arg = 136 | let doc = "Set caret $(docv)." in 137 | Arg.(required & pos 1 (some caret_arg) None & info [] ~doc ~docv:"CARET") 138 | in 139 | let sexp = 140 | let doc = "$(docv) to insert or substitute" in 141 | Arg.(required & pos 2 (some string) None & info [] ~doc ~docv:"SEXP") 142 | in 143 | Cmd.v (Cmd.info "set" ~doc ~sdocs ~exits ~man) 144 | Term.(const set $ file_arg $ caret_arg $ sexp) 145 | 146 | let locs_cmd = 147 | let doc = "Show s-expression parse locations" in 148 | let sdocs = Manpage.s_common_options in 149 | let man = [ 150 | `S Manpage.s_description; 151 | `P "$(tname) outputs s-expression parse locations."; 152 | `Blocks common_man; ] 153 | in 154 | Cmd.v (Cmd.info "locs" ~doc ~sdocs ~exits ~man) 155 | Term.(const locs $ file_arg) 156 | 157 | let v = 158 | let doc = "Process s-expressions" in 159 | let sdocs = Manpage.s_common_options in 160 | let man = [ 161 | `S Manpage.s_description; 162 | `P "$(mname) edits s-expression files"; 163 | `S Manpage.s_examples; 164 | `S Manpage.s_bugs; 165 | `P "This program is distributed with the serialk OCaml library. \ 166 | See $(i,%%PKG_HOMEPAGE%%) for contact information."; ] 167 | in 168 | Cmd.group 169 | (Cmd.info "sexp" ~version:"%%VERSION%%" ~doc ~sdocs ~exits ~man) 170 | ~default:get_term [get_cmd; delete_cmd; locs_cmd; set_cmd;] 171 | -------------------------------------------------------------------------------- /tool/cmd_sexp.mli: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2023 The cmarkit programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | val v : Cmdliner.Cmd.Exit.code Cmdliner.Cmd.t 7 | (** [v] is the command line for [commonmark]. *) 8 | -------------------------------------------------------------------------------- /tool/cmd_toml.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2021 The serialkit programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | open Std 7 | open Serialkit_toml 8 | 9 | let err_file = 1 10 | let err_toml = 2 11 | let err_query = 3 12 | 13 | let exec = Filename.basename Sys.executable_name 14 | let log_err fmt = 15 | Format.fprintf Format.err_formatter ("%s: @[" ^^ fmt ^^ "@]@.") exec 16 | 17 | let log_on_error ~exit:code r f = match r with 18 | | Error e -> log_err "%s" e; code | Ok v -> f v 19 | 20 | let delete file path = 21 | failwith "TODO" 22 | (* 23 | let query = Sexpq.delete_at_path ~must_exist:false path in 24 | log_on_error ~exit:err_file (File.read file) @@ fun content -> 25 | log_on_error ~exit:err_sexp (Sexp.seq_of_string' ~file content) @@ fun sexp -> 26 | log_on_error ~exit:err_query (Sexpq.query' query sexp) @@ fun sexp -> 27 | Format.printf "@[%a@]" Sexp.pp_seq_layout sexp; 0 28 | *) 29 | 30 | let get file path = 31 | log_on_error ~exit:err_file (Os.read_file file) @@ fun content -> 32 | log_on_error ~exit:err_toml (Toml.of_string' ~file content) @@ fun toml -> 33 | match path with 34 | | None -> Format.printf "@[%a@]" Toml.pp_layout toml; 0 35 | | Some path -> 36 | failwith "TODO" 37 | (* 38 | let query = Sexpq.path path Sexpq.sexp in 39 | log_on_error ~exit:err_query (Sexpq.query' query sexp) @@ function 40 | | `A (a, _) | `L ([`A (a, _)], _) -> Format.printf "%s@." a; 0 41 | | `L _ as l -> Format.printf "@[%a@]@." Sexp.pp l; 0 42 | *) 43 | 44 | let locs file = 45 | failwith "TODO" 46 | (* 47 | let pp_loc ppf l = Serialk_text.Tloc.pp ppf l; Format.pp_print_char ppf ':' in 48 | let rec pp_locs ppf = function 49 | | `A (_, _) as s -> pp_loc ppf (Sexp.loc s) 50 | | `L (vs, _) as s -> 51 | pp_loc ppf (Sexp.loc s); Format.pp_print_cut ppf (); 52 | Format.pp_print_list pp_locs ppf vs 53 | in 54 | log_on_error ~exit:err_file (File.read file) @@ fun content -> 55 | log_on_error ~exit:err_sexp (Sexp.seq_of_string' ~file content) @@ fun sexp -> 56 | Format.printf "@[%a@]@." pp_locs sexp; 0 57 | *) 58 | 59 | let set file caret v = 60 | failwith "TODO" 61 | (* 62 | log_on_error ~exit:err_file (File.read file) @@ fun content -> 63 | log_on_error ~exit:err_sexp (Sexp.seq_of_string' ~file content) @@ fun sexp -> 64 | log_on_error ~exit:err_sexp (Sexp.seq_of_string' v) @@ fun v -> 65 | let query = Sexpq.splice_at_caret ~must_exist:false caret ~rep:v in 66 | log_on_error ~exit:err_query (Sexpq.query' query sexp) @@ fun sexp -> 67 | Format.printf "@[%a@]" Sexp.pp_seq_layout sexp; 0 68 | *) 69 | 70 | (* Command line interface *) 71 | 72 | open Cmdliner 73 | 74 | let file_arg = 75 | let doc = "$(docv) is the TOML file. Use $(b,-) for stdin." in 76 | Arg.(required & pos 0 (some string) None & info [] ~doc ~docv:"FILE") 77 | 78 | let path_arg = 79 | let parse s = match Toml.path_of_string s with 80 | | Ok _ as v -> v | Error e -> Error (`Msg e) 81 | in 82 | Arg.conv ~docv:"TPATH" (parse, Toml.pp_path ()) 83 | 84 | let caret_arg = 85 | let parse s = match Toml.caret_of_string s with 86 | | Ok _ as v -> v | Error e -> Error (`Msg e) 87 | in 88 | Arg.conv ~docv:"CARET" (parse, Toml.pp_caret ()) 89 | 90 | let exits = 91 | Cmd.Exit.info err_file ~doc:"on file read errors" :: 92 | Cmd.Exit.info err_toml ~doc:"on TOML parse errors" :: 93 | Cmd.Exit.info err_query ~doc:"on path query errors" :: 94 | Cmd.Exit.defaults 95 | 96 | let common_man = 97 | [ `S Manpage.s_bugs; 98 | `P "This program is distributed with the serialk OCaml library. 99 | See $(i,%%PKG_HOMEPAGE%%) for contact information."; ] 100 | 101 | let delete_cmd = 102 | let doc = "Delete an s-expression path" in 103 | let sdocs = Manpage.s_common_options in 104 | let man = [ 105 | `S Manpage.s_description; 106 | `P "$(tname) returns the value of an s-expression file key."; 107 | `S Manpage.s_examples; 108 | `P "TODO"; 109 | `Blocks common_man; ] 110 | in 111 | let path_arg = 112 | let doc = "Delete s-expression path $(docv)." in 113 | Arg.(required & pos 1 (some path_arg) None & info [] ~doc ~docv:"SPATH") 114 | in 115 | Cmd.v (Cmd.info "delete" ~doc ~sdocs ~exits ~man) @@ 116 | Term.(const delete $ file_arg $ path_arg) 117 | 118 | let get_cmd = 119 | let doc = "Extract a TOML path (default)" in 120 | let sdocs = Manpage.s_common_options in 121 | let man = [ 122 | `S Manpage.s_description; 123 | `P "$(tname) returns the value of a TOML file key."; 124 | `S Manpage.s_examples; 125 | `P "TODO"; 126 | `Blocks common_man; ] 127 | in 128 | let key_opt_arg = 129 | let doc = "Extract TOML path $(docv)." in 130 | Arg.(value & pos 1 (some path_arg) None & info [] ~doc ~docv:"SPATH") 131 | in 132 | Cmd.v (Cmd.info "get" ~doc ~sdocs ~exits ~man) @@ 133 | Term.(const get $ file_arg $ key_opt_arg) 134 | 135 | let set_cmd = 136 | let doc = "Edit a TOML file key" in 137 | let sdocs = Manpage.s_common_options in 138 | let man = [ 139 | `S Manpage.s_description; 140 | `P "$(tname) sets the value of an TOML file key."; 141 | `S Manpage.s_examples; 142 | `P "TODO"; 143 | `Blocks common_man; ] 144 | in 145 | let caret_arg = 146 | let doc = "Set caret $(docv)." in 147 | Arg.(required & pos 1 (some caret_arg) None & info [] ~doc ~docv:"CARET") 148 | in 149 | let toml = 150 | let doc = "$(docv) to insert or substitute" in 151 | Arg.(required & pos 2 (some string) None & info [] ~doc ~docv:"TOML") 152 | in 153 | Cmd.v (Cmd.info "set" ~doc ~sdocs ~exits ~man) @@ 154 | Term.(const set $ file_arg $ caret_arg $ toml) 155 | 156 | let locs_cmd = 157 | let doc = "Show TOML parse locations" in 158 | let sdocs = Manpage.s_common_options in 159 | let man = [ 160 | `S Manpage.s_description; 161 | `P "$(tname) outputs TOML parse locations."; 162 | `Blocks common_man; ] 163 | in 164 | Cmd.v (Cmd.info "locs" ~doc ~sdocs ~exits ~man) @@ 165 | Term.(const locs $ file_arg) 166 | 167 | let cmds = [get_cmd; delete_cmd; locs_cmd; set_cmd;] 168 | let v = 169 | let doc = "Process TOML" in 170 | let sdocs = Manpage.s_common_options in 171 | let man = [ 172 | `S Manpage.s_description; 173 | `P "$(mname) edits TOML files"; 174 | `S Manpage.s_examples; 175 | `S Manpage.s_bugs; 176 | `P "This program is distributed with the serialk OCaml library. 177 | See $(i,%%PKG_HOMEPAGE%%) for contact information."; ] 178 | in 179 | Cmd.group 180 | (Cmd.info "toml" ~version:"%%VERSION%%" ~doc ~sdocs ~exits ~man) cmds 181 | -------------------------------------------------------------------------------- /tool/cmd_toml.mli: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2023 The cmarkit programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | val v : Cmdliner.Cmd.Exit.code Cmdliner.Cmd.t 7 | (** [v] is the command line for [toml]. *) 8 | -------------------------------------------------------------------------------- /tool/std.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2023 The cmarkit programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | type fpath = string 7 | 8 | module Result = struct 9 | include Result 10 | let error_to_failure = function Ok v -> v | Error err -> failwith err 11 | module Syntax = struct 12 | let ( let* ) = Result.bind 13 | end 14 | end 15 | 16 | module Log = struct 17 | let exec = Filename.basename Sys.executable_name 18 | 19 | let err fmt = 20 | Format.fprintf Format.err_formatter ("%s: @[" ^^ fmt ^^ "@]@.") exec 21 | 22 | let warn fmt = 23 | Format.fprintf Format.err_formatter ("@[" ^^ fmt ^^ "@]@.") 24 | 25 | let on_error ~use r f = match r with 26 | | Ok v -> f v | Error e -> err "%s" e; use 27 | end 28 | 29 | module Os = struct 30 | 31 | (* Emulate B0_std.Os functionality to eschew the dep *) 32 | 33 | let read_file file = 34 | try 35 | let ic = if file = "-" then stdin else open_in_bin file in 36 | let finally () = if file = "-" then () else close_in_noerr ic in 37 | Fun.protect ~finally @@ fun () -> Ok (In_channel.input_all ic) 38 | with 39 | | Sys_error err -> Error err 40 | 41 | let write_file file s = 42 | try 43 | let oc = if file = "-" then stdout else open_out_bin file in 44 | let finally () = if file = "-" then () else close_out_noerr oc in 45 | Fun.protect ~finally @@ fun () -> Ok (Out_channel.output_string oc s) 46 | with 47 | | Sys_error err -> Error err 48 | 49 | let with_tmp_dir f = 50 | try 51 | let tmpdir = 52 | let file = Filename.temp_file "cmarkit" "dir" in 53 | (Sys.remove file; Sys.mkdir file 0o700; file) 54 | in 55 | let finally () = try Sys.rmdir tmpdir with Sys_error _ -> () in 56 | Fun.protect ~finally @@ fun () -> Ok (f tmpdir) 57 | with 58 | | Sys_error err -> Error ("Making temporary dir: " ^ err) 59 | 60 | let with_cwd cwd f = 61 | try 62 | let curr = Sys.getcwd () in 63 | let () = Sys.chdir cwd in 64 | let finally () = try Sys.chdir curr with Sys_error _ -> () in 65 | Fun.protect ~finally @@ fun () -> Ok (f ()) 66 | with 67 | | Sys_error err -> Error ("With cwd: " ^ err) 68 | end 69 | 70 | module Exit = struct 71 | open Cmdliner 72 | 73 | type code = Cmdliner.Cmd.Exit.code 74 | let err_file = 1 75 | let err_diff = 2 76 | 77 | let exits = 78 | Cmd.Exit.info err_file ~doc:"on file read errors." :: 79 | Cmd.Exit.defaults 80 | 81 | let exits_with_err_diff = 82 | Cmd.Exit.info err_diff ~doc:"on render differences." :: exits 83 | end 84 | 85 | let process_files f files = 86 | let rec loop = function 87 | | [] -> 0 88 | | file :: files -> 89 | Log.on_error ~use:Exit.err_file (Os.read_file file) @@ fun content -> 90 | f ~file content; loop files 91 | in 92 | loop files 93 | 94 | module Cli = struct 95 | open Cmdliner 96 | 97 | let common_man = 98 | [ `S Manpage.s_bugs; 99 | `P "This program is distributed with the $(b,cmarkit) OCaml library. \ 100 | See $(i,https://erratique.ch/software/cmarkit) for contact \ 101 | information."; 102 | `S Manpage.s_see_also; 103 | `P "More information about the renderers can be found in the \ 104 | documentation of the $(b,cmarkit) OCaml library. Consult \ 105 | $(b,odig doc cmarkit) or the online documentation." ] 106 | end 107 | -------------------------------------------------------------------------------- /tool/std.mli: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2023 The cmarkit programmers. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | type fpath = string 7 | 8 | module Result : sig 9 | include module type of Result 10 | val error_to_failure : ('a, string) result -> 'a 11 | 12 | module Syntax : sig 13 | val (let*) : ('a, 'e) result -> ('a -> ('b, 'e) result) -> ('b, 'e) result 14 | end 15 | end 16 | 17 | module Log : sig 18 | val err : ('a, Format.formatter, unit, unit) format4 -> 'a 19 | val warn : ('a, Format.formatter, unit, unit) format4 -> 'a 20 | val on_error : use:'a -> ('b, string) result -> ('b -> 'a) -> 'a 21 | end 22 | 23 | module Os : sig 24 | val read_file : fpath -> (string, string) result 25 | val write_file : fpath -> string -> (unit, string) result 26 | val with_tmp_dir : (fpath -> 'a) -> ('a, string) result 27 | val with_cwd : fpath -> (unit -> 'a) -> ('a, string) result 28 | end 29 | 30 | module Exit : sig 31 | type code = Cmdliner.Cmd.Exit.code 32 | val err_file : code 33 | val err_diff : code 34 | val exits : Cmdliner.Cmd.Exit.info list 35 | val exits_with_err_diff : Cmdliner.Cmd.Exit.info list 36 | end 37 | 38 | val process_files : (file:fpath -> string -> 'a) -> string list -> Exit.code 39 | 40 | module Cli : sig 41 | open Cmdliner 42 | val common_man : Manpage.block list 43 | end 44 | --------------------------------------------------------------------------------