├── .gitignore ├── Cargo.lock ├── Cargo.toml ├── README.md └── src └── lib.rs /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | -------------------------------------------------------------------------------- /Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 3 | version = 3 4 | 5 | [[package]] 6 | name = "escape8259" 7 | version = "0.5.1" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "8edd65c008c6e97290e463c336e0c3fe109a91accb0e6b3e9e353d1605bd58b8" 10 | dependencies = [ 11 | "rustversion", 12 | ] 13 | 14 | [[package]] 15 | name = "json-parser-toy" 16 | version = "0.1.0" 17 | dependencies = [ 18 | "escape8259", 19 | "nom", 20 | "thiserror", 21 | ] 22 | 23 | [[package]] 24 | name = "memchr" 25 | version = "2.4.1" 26 | source = "registry+https://github.com/rust-lang/crates.io-index" 27 | checksum = "308cc39be01b73d0d18f82a0e7b2a3df85245f84af96fdddc5d202d27e47b86a" 28 | 29 | [[package]] 30 | name = "minimal-lexical" 31 | version = "0.2.1" 32 | source = "registry+https://github.com/rust-lang/crates.io-index" 33 | checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" 34 | 35 | [[package]] 36 | name = "nom" 37 | version = "7.1.0" 38 | source = "registry+https://github.com/rust-lang/crates.io-index" 39 | checksum = "1b1d11e1ef389c76fe5b81bcaf2ea32cf88b62bc494e19f493d0b30e7a930109" 40 | dependencies = [ 41 | "memchr", 42 | "minimal-lexical", 43 | "version_check", 44 | ] 45 | 46 | [[package]] 47 | name = "proc-macro2" 48 | version = "1.0.36" 49 | source = "registry+https://github.com/rust-lang/crates.io-index" 50 | checksum = "c7342d5883fbccae1cc37a2353b09c87c9b0f3afd73f5fb9bba687a1f733b029" 51 | dependencies = [ 52 | "unicode-xid", 53 | ] 54 | 55 | [[package]] 56 | name = "quote" 57 | version = "1.0.14" 58 | source = "registry+https://github.com/rust-lang/crates.io-index" 59 | checksum = "47aa80447ce4daf1717500037052af176af5d38cc3e571d9ec1c7353fc10c87d" 60 | dependencies = [ 61 | "proc-macro2", 62 | ] 63 | 64 | [[package]] 65 | name = "rustversion" 66 | version = "1.0.6" 67 | source = "registry+https://github.com/rust-lang/crates.io-index" 68 | checksum = "f2cc38e8fa666e2de3c4aba7edeb5ffc5246c1c2ed0e3d17e560aeeba736b23f" 69 | 70 | [[package]] 71 | name = "syn" 72 | version = "1.0.85" 73 | source = "registry+https://github.com/rust-lang/crates.io-index" 74 | checksum = "a684ac3dcd8913827e18cd09a68384ee66c1de24157e3c556c9ab16d85695fb7" 75 | dependencies = [ 76 | "proc-macro2", 77 | "quote", 78 | "unicode-xid", 79 | ] 80 | 81 | [[package]] 82 | name = "thiserror" 83 | version = "1.0.30" 84 | source = "registry+https://github.com/rust-lang/crates.io-index" 85 | checksum = "854babe52e4df1653706b98fcfc05843010039b406875930a70e4d9644e5c417" 86 | dependencies = [ 87 | "thiserror-impl", 88 | ] 89 | 90 | [[package]] 91 | name = "thiserror-impl" 92 | version = "1.0.30" 93 | source = "registry+https://github.com/rust-lang/crates.io-index" 94 | checksum = "aa32fd3f627f367fe16f893e2597ae3c05020f8bba2666a4e6ea73d377e5714b" 95 | dependencies = [ 96 | "proc-macro2", 97 | "quote", 98 | "syn", 99 | ] 100 | 101 | [[package]] 102 | name = "unicode-xid" 103 | version = "0.2.2" 104 | source = "registry+https://github.com/rust-lang/crates.io-index" 105 | checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3" 106 | 107 | [[package]] 108 | 
name = "version_check" 109 | version = "0.9.4" 110 | source = "registry+https://github.com/rust-lang/crates.io-index" 111 | checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" 112 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "json-parser-toy" 3 | version = "0.1.0" 4 | authors = ["Eric Seppanen "] 5 | edition = "2018" 6 | 7 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 8 | 9 | [dependencies] 10 | nom = "7.1.0" 11 | escape8259 = "0.5" 12 | thiserror = "1.0" 13 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Let's build a parser! 2 | 3 | [*Read the original blog post at* **codeandbitters**.](https://codeandbitters.com/lets-build-a-parser/) 4 | 5 | > Updated 10/2021 to use `nom 7.0`! 6 | 7 | This is a demonstration of building a parser in Rust using the 8 | [`nom`](https://docs.rs/nom/) crate. I recently built a parser for 9 | the [`cddl-cat`](https://docs.rs/cddl-cat/) crate using nom, 10 | and I found it a surprisingly pleasant experience, much better than my past 11 | experiences with other parser-generators in other languages. 12 | 13 | Since I like Rust a lot, and I need an excuse to do more writing about Rust, I 14 | thought I'd do another demonstration project. I decided to choose a simple 15 | syntax, to keep this a short project. So I'm going to build a parser for JSON. 16 | 17 | 18 | 19 | There are a million JSON parsers in the world already, so I don't expect this 20 | code to have much non-educational value. But, hey, you never know. 21 | 22 | All of the source code and markdown source for this post is 23 | [available on GitHub](https://github.com/ericseppanen/json-parser-toy). If you 24 | see anything wrong, please let me know by raising an issue there. 25 | 26 | ## Part 1. Introduction. 27 | 28 | A few details, before I write the first lines of code: 29 | 30 | 1. I'm going to use [RFC8259](https://tools.ietf.org/html/rfc8259) as my 31 | authoritative reference for the JSON grammar. 32 | 2. I'm not going to build a JSON serializer. My goal will only be to consume 33 | JSON text and output a structured tree containing the data (a lot like 34 | [`serde_json::Value`](https://docs.serde.rs/serde_json/value/enum.Value.html) ). 35 | 3. I'll be using [`nom` 7.0](https://docs.rs/nom/7.0/nom/). I'll try to keep 36 | this post updated when new major versions are released. 37 | 4. Some of the code I write will violate the usual `rustfmt` style. This isn't 38 | because I hate `rustfmt`; far from it! But as you'll see, `nom` code can look a 39 | little weird, so it's sometimes more readable if we bend the styling rules a 40 | little bit. Do what you like in your own code. 41 | 5. All of my source code will be 42 | [available on GitHub](https://github.com/ericseppanen/json-parser-toy). If you 43 | have comments or suggestions, or see a bug or something wrong in this post, 44 | please open an issue there. 45 | 46 | Let's start with a few words about `nom`. It can take a little bit of time to 47 | adjust to writing a parser with `nom`, because it doesn't work by first 48 | tokenizing the input and then parsing those tokens. Both of those steps can be 49 | tackled at once. 50 | 51 | Older versions of `nom` used a lot of macros. 
Starting with `nom 7.0`, the 52 | macros are gone, and the only way to use nom is with the function combinators. 53 | This is a nice change, because while `nom` combinators can be tricky, the 54 | function-based style is a lot friendlier to work with than the old macros. 55 | 56 | A bit of advice for reading the 57 | [`nom` documentation](https://docs.rs/nom/7.0.0/nom/), if you're following 58 | along with this implementation: 59 | - Start from the [modules](https://docs.rs/nom/7.0.0/nom/#modules) section of 60 | the documentation. 61 | - We'll be starting with the 62 | [character](https://docs.rs/nom/7.0.0/nom/character/index.html) and 63 | [number](https://docs.rs/nom/7.0.0/nom/number/index.html) modules. 64 | - We'll use the 65 | [combinator](https://docs.rs/nom/7.0.0/nom/combinator/index.html), 66 | [multi](https://docs.rs/nom/7.0.0/nom/multi/index.html), 67 | [sequence](https://docs.rs/nom/7.0.0/nom/sequence/index.html), 68 | and [branch](https://docs.rs/nom/7.0.0/nom/branch/index.html) modules to tie 69 | things together. I'll try to link to the relevant documentation as we go. 70 | 71 | ## Part 2. Our first bit of parser code. 72 | 73 | I've started a new library project (`cargo init --lib json-parser-toy`), and 74 | added the `nom 7.0` dependency in `Cargo.toml`. Let's add a very simple parser 75 | function, just to verify that we can build and test our code. We'll try to 76 | parse the strings "true" and "false". In other words, the grammar for our json 77 | subset is: 78 | 79 | ```txt 80 | value = "false" / "true" 81 | ``` 82 | 83 | Here's our first bit of code: 84 | ```rust 85 | use nom::{branch::alt, bytes::complete::tag, IResult}; 86 | 87 | fn json_bool(input: &str) -> IResult<&str, &str> { 88 | alt(( 89 | tag("false"), 90 | tag("true") 91 | )) 92 | (input) 93 | } 94 | 95 | #[test] 96 | fn test_bool() { 97 | assert_eq!(json_bool("false"), Ok(("", "false"))); 98 | assert_eq!(json_bool("true"), Ok(("", "true"))); 99 | assert!(json_bool("foo").is_err()); 100 | } 101 | ``` 102 | 103 | I got the [`tag`](https://docs.rs/nom/7.0.0/nom/bytes/complete/fn.tag.html) 104 | function from `nom::bytes`, though it's not specific to byte-arrays; it works 105 | just fine with text strings as well. It's not a big deal; it's just a minor 106 | quirk of the way `nom` is organized. 107 | 108 | We use [`alt`](https://docs.rs/nom/7.0.0/nom/branch/fn.alt.html) to express 109 | "one of these choices". This is a common style in `nom`, and we'll see it 110 | again when we use other combinators from `nom::sequence`. 111 | 112 | There are a few other things that should be explained. 113 | 114 | [`IResult`](https://docs.rs/nom/7.0.0/nom/type.IResult.html) is an important 115 | part of working with `nom`. It's a specialized `Result`, where an `Ok` always 116 | returns a tuple of two values. In this case, `IResult<&str, &str>` returns two 117 | string slices. The first is the "remainder": this is everything that wasn't 118 | parsed. The second part is the output from a successful parse; in this case we 119 | just return the string we matched. For example, I could add this to my test, 120 | and it would work: 121 | 122 | ```rust 123 | assert_eq!(json_bool("false more"), Ok((" more", "false"))); 124 | ``` 125 | 126 | The `json_bool` function consumed the `false` part of the string, and left the 127 | rest for somebody else to deal with. 128 | 129 | When `json_bool` returns an error, that doesn't necessarily mean that something 130 | is wrong. Our top-level parser isn't going to give up. 
It just means that 131 | this particular bit of grammar didn't match. Depending on how we write our 132 | code, other parser functions might be called instead. You can actually see 133 | this in action if you look at how the `alt` combinator works. It first calls a 134 | parser function `tag("false")`, and if that returns an error, it instead feeds 135 | the same input into `tag("true")`, to see if it might succeed instead. 136 | 137 | This probably still looks kind of strange, because `tag("false")` isn't a 138 | complete parser function; it's a function that returns a parser function. See 139 | how our code calls `alt` and `tag` (twice)? The return value from that code is 140 | another function, and that function gets called with the argument `(input)`. 141 | 142 | Don't be scared off by the intimidating-looking parameters of the `tag` 143 | function in the documentation— look at the 144 | [examples](https://docs.rs/nom/7.0.0/nom/bytes/complete/fn.tag.html#example). 145 | Despite the extra layer of indirection, it's still pretty easy to use. 146 | 147 | ## Part 3. Returning structs. 148 | 149 | We don't want to just return the strings that we matched; we want to return 150 | some Rust structs that we can put into a tree form. 151 | 152 | We could copy the previous function to add another simple JSON element: 153 | ```rust 154 | fn json_null(input: &str) -> IResult<&str, &str> { 155 | tag("null") 156 | (input) 157 | } 158 | ``` 159 | 160 | That would work, but let's rewrite our two parser functions to return enums or 161 | structs instead. 162 | ```rust 163 | use nom::combinator::map; 164 | 165 | #[derive(PartialEq, Debug)] 166 | pub enum JsonBool { 167 | False, 168 | True, 169 | } 170 | 171 | #[derive(PartialEq, Debug)] 172 | pub struct JsonNull {} 173 | 174 | fn json_bool(input: &str) -> IResult<&str, JsonBool> { 175 | let parser = alt(( 176 | tag("false"), 177 | tag("true") 178 | )); 179 | map(parser, |s| { 180 | match s { 181 | "false" => JsonBool::False, 182 | "true" => JsonBool::True, 183 | _ => unreachable!(), 184 | } 185 | }) 186 | (input) 187 | } 188 | 189 | fn json_null(input: &str) -> IResult<&str, JsonNull> { 190 | map(tag("null"), |_| JsonNull {}) 191 | (input) 192 | } 193 | 194 | #[test] 195 | fn test_bool() { 196 | assert_eq!(json_bool("false"), Ok(("", JsonBool::False))); 197 | assert_eq!(json_bool("true"), Ok(("", JsonBool::True))); 198 | assert!(json_bool("foo").is_err()); 199 | } 200 | 201 | #[test] 202 | fn test_null() { 203 | assert_eq!(json_null("null"), Ok(("", JsonNull {}))); 204 | } 205 | 206 | ``` 207 | 208 | First, notice that the parser functions' return value has changed. The first 209 | part of the `IResult` tuple is still the remainder, so it's still `&str`. But 210 | the second part now returns one of our new data structures. 211 | 212 | To change the return value, we use `nom`'s 213 | [`map`](https://docs.rs/nom/7.0.0/nom/combinator/fn.map.html) combinator 214 | function. It allows us to apply a closure to convert the matched string into 215 | something else: in the `json_bool` case, one of the `JsonBool` variants. You 216 | will probably smell something funny about that code, though: we already matched 217 | the `"true"` and `"false"` strings once in the parser generated by the `tag` 218 | function, so why are we doing it again? Your instincts are right on— we should 219 | probably back up and fix that, but let's wrap up this discussion first. 
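If you want to convince yourself that the remainder behavior from earlier hasn't changed, an extra assertion like this should still pass (a quick sketch against the map-based `json_bool` above):

```rust
// The output type has changed, but the remainder is still returned untouched.
assert_eq!(json_bool("false more"), Ok((" more", JsonBool::False)));
```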
220 | 221 | The `json_null` function does almost exactly the same thing, though it doesn't 222 | need a `match` because it could only have matched one thing. 223 | 224 | We need to derive `PartialEq` and `Debug` for our structs and enums so that the 225 | `assert_eq!` will work. Our tests are now using the new data structures 226 | `JsonBool` and `JsonNull`. 227 | 228 | ## Part 4. Another way of doing the same thing. 229 | 230 | In `nom`, there are often multiple ways of achieving the same goal. In our 231 | case, `map` is a little bit overkill for this use case. Let's instead use the 232 | [`value`](https://docs.rs/nom/7.0.0/nom/combinator/fn.value.html) combinator 233 | instead, which is specialized for the case where we only care that the child 234 | parser succeeded. 235 | 236 | We'll also refactor `json_bool` so that we don't need to do extra work: we'll 237 | apply our combinator a little earlier, before we lose track of which branch 238 | we're on. 239 | 240 | ```rust 241 | use nom::combinator::value; 242 | 243 | #[derive(PartialEq, Debug, Clone, Copy)] 244 | pub enum JsonBool { 245 | False, 246 | True, 247 | } 248 | 249 | #[derive(PartialEq, Debug, Clone, Copy)] 250 | pub struct JsonNull {} 251 | 252 | fn json_bool(input: &str) -> IResult<&str, JsonBool> { 253 | alt(( 254 | value(JsonBool::False, tag("false")), 255 | value(JsonBool::True, tag("true")), 256 | )) 257 | (input) 258 | } 259 | 260 | fn json_null(input: &str) -> IResult<&str, JsonNull> { 261 | value(JsonNull {}, tag("null")) 262 | (input) 263 | } 264 | ``` 265 | 266 | Hopefully this is pretty straightforward. The `value` combinator returns its 267 | first argument (e.g. `JsonNull {}`), if the second argument succeeds 268 | (`tag("null")`). That description is a bit of a lazy mental shortcut, 269 | because `value` doesn't do any parsing itself. Remember, it's a function that 270 | consumes one parser function and returns another parser function. But because 271 | `nom` makes things so easy, it's sometimes a lot easier to use the lazy way of 272 | thinking when you're plugging combinators together like Lego bricks. 273 | 274 | Note that I added `Clone` to the data structures, because `value` requires it. 275 | I also added `Copy` because these are trivially small structs & enums. 276 | 277 | ## Part 5. Prepare to tree. 278 | 279 | Our final output should be some tree-like data structure, similar to 280 | [`serde_json::Value`](https://docs.serde.rs/serde_json/value/enum.Value.html). 281 | I'm partial to the word "node" to describe the parts of a tree, so let's start 282 | here: 283 | 284 | ```rust 285 | pub enum Node { 286 | Null(JsonNull), 287 | Bool(JsonBool), 288 | } 289 | ``` 290 | 291 | Right away, I don't like where this is going. Here are all the things I'm 292 | unhappy with: 293 | 294 | 1. The redundant naming. I have `Node::Null` and `JsonNull`, for a value that 295 | contains no additional data. 296 | 2. The null and bool types don't really seem like they need their own data 297 | structure name, outside of the tree node. If this were a complex value type 298 | that I might want to pass around on its own, sure. 
But for this simple case, I 299 | think this is a lot simpler: 300 | 301 | ```rust 302 | #[derive(PartialEq, Debug, Clone)] 303 | pub enum Node { 304 | Null, 305 | Bool(bool), 306 | } 307 | 308 | fn json_bool(input: &str) -> IResult<&str, Node> { 309 | alt(( 310 | value(Node::Bool(false), tag("false")), 311 | value(Node::Bool(true), tag("true")), 312 | )) 313 | (input) 314 | } 315 | 316 | fn json_null(input: &str) -> IResult<&str, Node> { 317 | value(Node::Null, tag("null")) 318 | (input) 319 | } 320 | 321 | #[test] 322 | fn test_bool() { 323 | assert_eq!(json_bool("false"), Ok(("", Node::Bool(false)))); 324 | assert_eq!(json_bool("true"), Ok(("", Node::Bool(true)))); 325 | assert!(json_bool("foo").is_err()); 326 | } 327 | 328 | #[test] 329 | fn test_null() { 330 | assert_eq!(json_null("null"), Ok(("", Node::Null))); 331 | } 332 | ``` 333 | 334 | We got rid of JsonNull and JsonBool entirely. For your parser you can choose 335 | any output structure that makes sense; different grammars have different 336 | properties, and they may not map easily onto Rust's prelude types. 337 | 338 | ## Part 6. Parsing numbers is hard. 339 | 340 | The other remaining literal types in JSON are strings and numbers. Let's 341 | tackle numbers first. Referring to 342 | [RFC8259](https://tools.ietf.org/html/rfc8259), the grammar for a JSON number 343 | is: 344 | 345 | ```txt 346 | number = [ minus ] int [ frac ] [ exp ] 347 | 348 | decimal-point = %x2E ; . 349 | digit1-9 = %x31-39 ; 1-9 350 | e = %x65 / %x45 ; e E 351 | exp = e [ minus / plus ] 1*DIGIT 352 | frac = decimal-point 1*DIGIT 353 | int = zero / ( digit1-9 *DIGIT ) 354 | minus = %x2D ; - 355 | plus = %x2B ; + 356 | zero = %x30 ; 0 357 | ``` 358 | 359 | That grammar can represent any integer or floating point value; it would be 360 | grammatically correct to have an integer a thousand digits long, or a floating 361 | point value with huge exponent. It's our decision how to handle these values. 362 | 363 | JSON (like JavaScript) is a bit unusual in not distinguishing integers from 364 | floating-point values. To make this tutorial a little more widely useful, 365 | let's output integers and floats as separate types: 366 | 367 | ```rust 368 | pub enum Node { 369 | Null, 370 | Bool(bool), 371 | Integer(i64), 372 | Float(f64), 373 | } 374 | ``` 375 | 376 | We'll need to do something when we encounter values that are grammatically 377 | correct (e.g. 1000 digits), that we can't handle. This is a common problem, 378 | since most grammars don't attempt to set limits on the size of numbers. Often 379 | there will be a limit set somewhere, but it's not part of the formal grammar. 380 | JSON doesn't set such limits, which can lead to compatibility problems between 381 | implementations. 382 | 383 | It will be important in most parsers to set limits and make sure things fail 384 | gracefully. In Rust you're not likely to have problems with buffer overruns, 385 | but it might be possible to trigger a denial of service, or perhaps even a 386 | crash by triggering excessive recursion. 387 | 388 | Let's start by making the parser functions we need, and we'll see where we need 389 | error handling. 390 | 391 | Let's build a little helper function for the `digit1-9` part, since `nom` only 392 | offers `digit`, which includes `0-9`. 
393 | 394 | ```rust 395 | fn digit1to9(input: &str) -> IResult<&str, &str> { 396 | one_of("123456789") 397 | (input) 398 | } 399 | ``` 400 | 401 | Unfortunately, it doesn't compile: 402 | ```txt 403 | error[E0308]: mismatched types 404 | --> src/lib.rs:21:5 405 | | 406 | 21 | / one_of("123456789") 407 | 22 | | (input) 408 | | |___________^ expected `&str`, found `char` 409 | | 410 | = note: expected enum `std::result::Result<(&str, &str), nom::internal::Err<(&str, nom::error::ErrorKind)>>` 411 | found enum `std::result::Result<(&str, char), nom::internal::Err<_>>` 412 | ``` 413 | 414 | This is a pretty easy mistake to make— we tried to create a parser function 415 | that returns a string slice, but it's returning `char` instead, because, well, 416 | that's how `one_of` works. It's not a big problem for us; just fix the return 417 | type to match: 418 | 419 | ```rust 420 | fn digit1to9(input: &str) -> IResult<&str, char> { 421 | one_of("123456789") 422 | (input) 423 | } 424 | ``` 425 | 426 | We can now build the next function, one that recognizes integers: 427 | ```rust 428 | fn uint(input: &str) -> IResult<&str, &str> { 429 | alt(( 430 | tag("0"), 431 | recognize( 432 | pair( 433 | digit1to9, 434 | digit0 435 | ) 436 | ) 437 | )) 438 | (input) 439 | } 440 | ``` 441 | 442 | Again, we use `alt` to specify that an integer is either `0`, or a nonzero 443 | digit, possibly followed by more additional digits. 444 | 445 | The new combinator here is `recognize`. Let's back up and look at the return 446 | type of this hypothetical function: 447 | 448 | ```rust 449 | fn nonzero_integer(input: &str) -> IResult<&str, ____> { 450 | pair( 451 | digit1to9, 452 | digit0 453 | ) 454 | (input) 455 | } 456 | ``` 457 | 458 | Because we used `pair`, the return type would be a 2-tuple. The first element 459 | would be a `char` (because that's what we returned from `digit1to9`), and the 460 | other element would be a `&str`. So the blank above would be filled in like 461 | this: 462 | 463 | ```rust 464 | fn nonzero_integer(input: &str) -> IResult<&str, (char, &str)> { 465 | ... 466 | } 467 | ``` 468 | 469 | In this context, not very helpful. What we'd like to say is, "match this bunch 470 | of stuff, but just return the string slice that covers what we matched." 471 | That's exactly what `recognize` does. 472 | 473 | Because we're going to store integers in a different `Node` variant, we should 474 | also do one last call to `map`. But that might make life difficult if we want 475 | to re-use this code as part of a token that's representing a floating-point 476 | number. 477 | 478 | So let's leave the `uint` function alone; we'll use it as a building block of 479 | another function. 480 | 481 | Note also that we can't finish parsing an integer until we've consumed the 482 | optional leading "minus" symbol. 483 | 484 | ```rust 485 | fn json_integer(input: &str) -> IResult<&str, &str> { 486 | recognize( 487 | pair( 488 | opt(tag("-")), 489 | uint 490 | ) 491 | ) 492 | (input) 493 | } 494 | ``` 495 | 496 | The `opt` function is another `nom` combinator; it means "optional", and 497 | unsurprisingly it will return an `Option` where `T` in this case is `&str` 498 | (because that's what `tag("-")` will returns. But that return type is ignored; 499 | `recognize` will throw it away and just give us back the characters that were 500 | consumed by the successful match. 501 | 502 | Let's add one more step to our function: convert the resulting string into a 503 | `Node::Integer`. 
504 | 505 | ```rust 506 | fn json_integer(input: &str) -> IResult<&str, Node> { 507 | let parser = recognize( 508 | pair( 509 | opt(tag("-")), 510 | uint 511 | ) 512 | ); 513 | map(parser, |s| { 514 | // FIXME: unwrap() may panic if the value is out of range 515 | let n = s.parse::().unwrap(); 516 | Node::Integer(n) 517 | }) 518 | (input) 519 | } 520 | ``` 521 | 522 | Finally, we discover a point where we'll need some error handling. 523 | [`str::parse`](https://doc.rust-lang.org/std/primitive.str.html#method.parse) 524 | returns a `Result`, and will certainly return `Err` if we try to parse 525 | something too big. 526 | 527 | I am going to leave proper error handling until the end, so for now I will just 528 | `unwrap` the result. This means the parser will panic if we give it a huge 529 | integer, so we definitely need to come back and fix this later. 530 | 531 | For now we'll finish up this section with a few unit tests: 532 | 533 | ```rust 534 | #[test] 535 | fn test_integer() { 536 | assert_eq!(json_integer("42"), Ok(("", Node::Integer(42)))); 537 | assert_eq!(json_integer("-123"), Ok(("", Node::Integer(-123)))); 538 | assert_eq!(json_integer("0"), Ok(("", Node::Integer(0)))); 539 | assert_eq!(json_integer("01"), Ok(("1", Node::Integer(0)))); 540 | } 541 | ``` 542 | 543 | Note the fourth test case— this might not be what you expected. We know that 544 | integers with a leading zero aren't allowed by this grammar— so why did the 545 | call to `json_integer` succeed? It has to do with the way `nom` operates. Each 546 | parser only consumes the part of the string it matches, and leaves the rest for 547 | some other parser. So attempting to parse `01` results in a success, returning 548 | a result `Node::Integer(0)` along with a remainder string `1`. 549 | 550 | `nom` does have ways for parsers to trigger a fatal error if they're unhappy 551 | with the sequence of characters, but this grammar probably won't need them. 552 | 553 | ## Part 7. Parsing numbers some more. 554 | 555 | Let's piece together the bits we need to parse floating point numbers. 556 | 557 | ```rust 558 | fn frac(input: &str) -> IResult<&str, &str> { 559 | recognize( 560 | pair( 561 | tag("."), 562 | digit1 563 | ) 564 | ) 565 | (input) 566 | } 567 | 568 | fn exp(input: &str) -> IResult<&str, &str> { 569 | recognize( 570 | tuple(( 571 | tag("e"), 572 | opt(alt(( 573 | tag("-"), 574 | tag("+") 575 | ))), 576 | digit1 577 | )) 578 | ) 579 | (input) 580 | } 581 | 582 | fn json_float(input: &str) -> IResult<&str, Node> { 583 | let parser = recognize( 584 | tuple(( 585 | opt(tag("-")), 586 | uint, 587 | opt(frac), 588 | opt(exp) 589 | )) 590 | ); 591 | map(parser, |s| { 592 | // FIXME: unwrap() may panic if the value is out of range 593 | let n = s.parse::().unwrap(); 594 | Node::Float(n) 595 | }) 596 | (input) 597 | } 598 | ``` 599 | 600 | The only new parts here are: 601 | - `nom::character::complete::digit1`: just like `digit0`, except this matches 602 | one-or-more digits. 603 | - `nom::sequence::tuple` is a lot like `pair`, but accepts an arbitrary number 604 | of other parsers. Each sub-parser must match in sequence, and the return value 605 | is a tuple of results. 606 | 607 | I added some straightforward unit tests here, and they all pass. Despite that, 608 | I've made a significant mistake, but one that we won't notice until we start 609 | stitching the various parts together. Let's do that now. 
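For reference, the straightforward tests mentioned above might look something like this sketch (the exact cases in the repository may differ), and they do all pass:

```rust
#[test]
fn test_float() {
    // These exercise the optional sign, frac, and exp pieces.
    assert_eq!(json_float("42.0"), Ok(("", Node::Float(42.0))));
    assert_eq!(json_float("-1.5e3"), Ok(("", Node::Float(-1500.0))));
    assert_eq!(json_float("2e-2"), Ok(("", Node::Float(0.02))));
}
```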
610 | 611 | When a parser executes, it obviously won't know which elements are arriving in 612 | which order, so we need a parser function to handle everything we've built so 613 | far. Thanks to the magic of `nom`, this part is really easy. 614 | 615 | ```rust 616 | fn json_literal(input: &str) -> IResult<&str, Node> { 617 | alt(( 618 | json_integer, 619 | json_float, 620 | json_bool, 621 | json_null 622 | )) 623 | (input) 624 | } 625 | ``` 626 | 627 | And now we discover that something is wrong: 628 | 629 | ```rust 630 | #[test] 631 | fn test_literal() { 632 | assert_eq!(json_literal("56"), Ok(("", Node::Integer(56)))); 633 | assert_eq!(json_literal("78.0"), Ok(("", Node::Float(78.0)))); 634 | } 635 | ``` 636 | 637 | ```txt 638 | test test_literal ... FAILED 639 | 640 | failures: 641 | 642 | ---- test_literal stdout ---- 643 | thread 'test_literal' panicked at 'assertion failed: `(left == right)` 644 | left: `Ok((".0", Integer(78)))`, 645 | right: `Ok(("", Float(78.0)))`', src/lib.rs:163:5 646 | ``` 647 | 648 | Because we put `json_integer` first, it grabbed the `78` part and declared 649 | success, leaving `.0` for someone else to deal with. Not so big a deal, 650 | right? Let's just swap the order of the parsers: 651 | 652 | ```rust 653 | fn json_literal(input: &str) -> IResult<&str, Node> { 654 | alt(( 655 | json_float, 656 | json_integer, 657 | json_bool, 658 | json_null 659 | )) 660 | (input) 661 | } 662 | ``` 663 | 664 | ```txt 665 | test test_literal ... FAILED 666 | 667 | failures: 668 | 669 | ---- test_literal stdout ---- 670 | thread 'test_literal' panicked at 'assertion failed: `(left == right)` 671 | left: `Ok(("", Float(56.0)))`, 672 | right: `Ok(("", Integer(56)))`', src/lib.rs:162:5 673 | ``` 674 | 675 | We've traded one problem for another. This time, `json_float` runs first, 676 | consumes the input `56` input and declares success, returning `Float(56.0)`. 677 | This isn't wrong, exactly. Had we decided at the beginning to treat all 678 | numbers as floating-point (as JavaScript does) this would be the expected 679 | outcome. But since we committed to storing integers and floats as separate 680 | tree nodes, we have a problem. 681 | 682 | Since we can't allow either the `json_float` parser or the `json_integer` 683 | parser to run first (at least as currently written), let's imagine what we'd 684 | like to see happen. Ideally, we would start parsing the `[ minus ] int` part 685 | of the grammar, and if that succeeds we have a possible integer-or-float 686 | match. We should then continue on, trying to match the `[ frac ] [ exp ]` 687 | part, and if _either of those_ succeeds, we have a float. 688 | 689 | There are a few different ways to implement that logic. 690 | 691 | One way would be to get `json_float` to fail if the next character after the 692 | integer part is _not_ a `.` or `e` character— without that it can't possibly be 693 | a valid float (according to our grammar), so if `json_float` fails at that 694 | point we know the `json_integer` parser will run next (and succeed). 695 | 696 | ```rust 697 | fn json_float(input: &str) -> IResult<&str, Node> { 698 | let parser = recognize( 699 | tuple(( 700 | opt(tag("-")), 701 | uint, 702 | peek(alt(( 703 | tag("."), 704 | tag("e"), 705 | ))), 706 | opt(frac), 707 | opt(exp) 708 | )) 709 | ); 710 | map(parser, |s| { 711 | let n = s.parse::().unwrap(); 712 | Node::Float(n) 713 | }) 714 | (input) 715 | } 716 | ``` 717 | 718 | This code has one small annoyance, though it's not a problem in the overall 719 | JSON context. 
Imagine that we took this `json_float` parser code, and tried to 720 | reuse it in another language, where this other language's grammar would allow 721 | the input `123.size()`. This code would `peek` ahead and see the `.` 722 | character, and because of that it would parse `123` as a float rather than an 723 | integer. In other words, this `json_float` implementation decides that this 724 | input is a float before it's actually finished parsing all the characters 725 | making up that float. 726 | 727 | There is a slightly better way, though. Remember, our original problem is that 728 | `json_float` will succeed in all of the following cases: 729 | - `123` 730 | - `123.0` 731 | - `123e9` 732 | - `123.0e9` 733 | What we'd rather have is a parser that succeeds at the last three, but not the 734 | first. There isn't a combinator in `nom` that implements "A or B or AB", but 735 | it's not that hard to implement ourselves: 736 | 737 | ```rust 738 | fn json_float(input: &str) -> IResult<&str, Node> { 739 | let parser = recognize( 740 | tuple(( 741 | opt(tag("-")), 742 | uint, 743 | alt(( 744 | recognize(pair( 745 | frac, 746 | opt(exp) 747 | )), 748 | exp 749 | )), 750 | )) 751 | ); 752 | map(parser, |s| { 753 | let n = s.parse::().unwrap(); 754 | Node::Float(n) 755 | }) 756 | (input) 757 | } 758 | ``` 759 | 760 | This new logic uses `alt` to allow two choices: either a `frac` must be present 761 | (with an optional `exp`) following, or an `exp` must be present by itself. An 762 | input with neither a valid `frac` or `exp` will now fail, which makes 763 | everything work the way we want it to. 764 | 765 | ## Part 8. Handling string literals 766 | 767 | So far we support literal null, boolean, integer, and float types. There's 768 | only one more literal type left to handle: strings. 769 | 770 | In the JSON grammar, a string is basically a series of Unicode characters that 771 | starts and ends with a quote, plus a few extra rules: 772 | 773 | 1. Certain characters must be escaped (ASCII control characters, quotes, and 774 | backslashes) 775 | 2. Any character may be escaped, using `\u` plus 4 hexadecimal digits, e.g. 776 | `\uF903`. 777 | 3. A small number of common characters have two-character escapes: 778 | `\"` `\\` `\/` `\b` `\f` `\n` `\r` `\t`. 779 | 780 | That's how RFC 8259 does things, anyway. Different implementations may have 781 | subtle differences. 782 | 783 | This means there are many possible ways to represent a certain string. We're 784 | only building a parser, so we just need to make sure we can parse all the valid 785 | JSON representations (and hopefully return an error on all the invalid ones). 786 | 787 | The presence of escape characters makes our job more difficult. There are 788 | different ways we might choose to address this. I'm going to choose to break 789 | escape handling into a separate phase. This means we will only use `nom` to do 790 | the lexing part (finding the bounds of the string literal), and we'll follow up 791 | with an "un-escaping" pass to decode the escaped characters. 792 | 793 | Bad inputs must be rejected by one of the two phases, but we don't care which 794 | one. For example, `"\ud800"` looks like a valid JSON string, but can't be 795 | decoded because U+D800 is a magic "surrogate" character, meaning it's half of a 796 | character that needs more than 16 bits to encode. 
We should also reject things 797 | like `"\x"` (a nonexistent escape), `"\u001"` (not enough hex digits), and 798 | `"\"` (which is unterminated because the trailing quote is escaped). We also 799 | need to reject "naked" (non-escaped) control characters (ASCII 0x00-0x1F), 800 | though for some reason 0x7F (ASCII DELETE) is legal. 801 | 802 | Let's begin by building a parser for "a string of valid non-escaped 803 | characters": everything except control characters, backslash, and quote. We 804 | don't need to check the upper limit 0x10FFFF because those characters will 805 | never appear in a Rust `char`. 806 | 807 | ```rust 808 | use nom::bytes::complete::take_while1; 809 | 810 | fn is_nonescaped_string_char(c: char) -> bool { 811 | let cv = c as u32; 812 | (cv >= 0x20) && (cv != 0x22) && (cv != 0x5C) 813 | } 814 | 815 | // One or more unescaped text characters 816 | fn nonescaped_string(input: &str) -> IResult<&str, &str> { 817 | take_while1(is_nonescaped_string_char) 818 | (input) 819 | } 820 | ``` 821 | 822 | The `take_while1` function comes from the nom `bytes` module (which, remember, 823 | isn't specific to byte sequences). `nom` offers a few different `take` 824 | functions in this module; `take_while1` consumes characters that match some 825 | condition, requiring at least 1 matching character. 826 | 827 | Next, let's add a parser that can detect one escape sequence. Actually, we're 828 | going to be even lazier than that; we'll pretend that `\u` is an escape 829 | sequence all by itself, and let the unescape function determine whether the 830 | characters that follow make sense. We could easily do it differently, but 831 | since the unescape code will need to look at those characters in detail later, 832 | we won't waste time doing that work twice. 833 | 834 | ```rust 835 | fn escape_code(input: &str) -> IResult<&str, &str> { 836 | recognize( 837 | pair( 838 | tag("\\"), 839 | alt(( 840 | tag("\""), 841 | tag("\\"), 842 | tag("/"), 843 | tag("b"), 844 | tag("f"), 845 | tag("n"), 846 | tag("r"), 847 | tag("t"), 848 | tag("u"), 849 | )) 850 | ) 851 | ) 852 | (input) 853 | } 854 | ``` 855 | 856 | Using those two pieces, we can now connect them together to parse the entire 857 | body of a JSON string (minus the quotes that surround it): 858 | 859 | ```rust 860 | use nom::multi::many0; 861 | 862 | fn string_body(input: &str) -> IResult<&str, &str> { 863 | recognize( 864 | many0( 865 | alt(( 866 | nonescaped_string, 867 | escape_code 868 | )) 869 | ) 870 | ) 871 | (input) 872 | } 873 | ``` 874 | 875 | We've seen most of the pieces here before. 876 | 877 | `many0` tries to apply a parser function repeatedly, gathering all of the 878 | results into a vector. This version gathers "zero or more" of whatever we were 879 | searching for (which is desirable because `""` is a valid JSON string). There 880 | is also a `many1`, (if you want "one or more") and several other variations. 881 | 882 | The final `recognize` throws away the output of `many0` (a vector), and instead 883 | just returns to us the string that was matched. It's a little unfortunate that 884 | we're throwing away the information we developed about where escapes appear— 885 | perhaps another implementation could do the unescaping work right here. It 886 | seems pretty typical (in my limited experience) to have to make tradeoffs like 887 | this. We're breaking the work into multiple phases, which may require a little 888 | bit of redundant effort, but our code gets a little simpler as a result. 
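A couple of quick checks (a sketch, not necessarily tests from the repository) show how these building blocks behave: `string_body` stops at the first unescaped quote, and it accepts `\u` without validating the hex digits that follow, leaving that to the later unescaping pass.

```rust
// Stops before the unescaped quote, leaving it in the remainder.
assert_eq!(string_body(r#"ab\"cd" tail"#), Ok((r#"" tail"#, r#"ab\"cd"#)));
// Lexing only: the bogus `\u` escape is accepted here and rejected later.
assert_eq!(string_body(r#"ab\ucd"#), Ok(("", r#"ab\ucd"#)));
```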
889 | 890 | There's one subtle thing about these two layers that should be pointed out. 891 | Both `nonescaped_string` and `escape_code` are parsers that return "one or more 892 | characters". And then we use those to build a parser that returns "zero or 893 | more characters". In fact, you can't build a "zero or more" parser using other 894 | "zero or more" components, because that could trigger an infinite loop: the 895 | outer parser could try to gather an infinite number of empty subparser 896 | successes. Typically `nom` combinators will return an error instead of going 897 | into an infinite loop. 898 | 899 | The next step is pretty simple: the string body must be wrapped in quotes. 900 | 901 | ```rust 902 | use nom::sequence::delimited; 903 | 904 | fn json_string(input: &str) -> IResult<&str, &str> { 905 | delimited( 906 | tag("\""), 907 | string_body, 908 | tag("\"") 909 | ) 910 | (input) 911 | } 912 | ``` 913 | 914 | This is the first time we've used `delimited`. It runs three sub-parsers, 915 | returning the result of the middle one. The result from the first and third 916 | arguments (the quote characters) are discarded. 917 | 918 | At this point I should plug in some code to do un-escaping. Because this code 919 | doesn't use `nom` and doesn't really help us understand how to write a `nom` 920 | parser, I'm going to skip the explanation and just pull the 921 | [escape8259](https://docs.rs/escape8259/0.5.0/escape8259/) crate that does this 922 | part. A call to un-escape a string is pretty simple: 923 | 924 | ```rust 925 | pub fn unescape(s: &str) -> Result 926 | ``` 927 | 928 | So all we need to do is plug that into `json_string`. We earlier used `nom`'s 929 | `map` combinator to do this sort of thing, but here we need something a little 930 | different because `unescape` may fail. We need to use `map_res` to handle 931 | `Result::Err`. 932 | 933 | ```rust 934 | use nom::combinator::map_res; 935 | use escape8259::unescape; 936 | 937 | fn string_literal(input: &str) -> IResult<&str, String> { 938 | let parser = delimited( 939 | tag("\""), 940 | string_body, 941 | tag("\"") 942 | ); 943 | map_res(parser, |s| { 944 | unescape(s) 945 | }) 946 | (input) 947 | } 948 | ``` 949 | 950 | We also need to update our `Node` enum to include a string variant (we'll call 951 | this `Str`), and make that our final output. 952 | 953 | ```rust 954 | pub enum Node { 955 | Null, 956 | Bool(bool), 957 | Integer(i64), 958 | Float(f64), 959 | Str(String), 960 | } 961 | 962 | fn json_string(input: &str) -> IResult<&str, Node> { 963 | map(string_literal, |s| { 964 | Node::Str(s) 965 | }) 966 | (input) 967 | } 968 | ``` 969 | 970 | Finally, we should write some tests to make sure this is working correctly. 
971 | 972 | ```rust 973 | #[test] 974 | fn test_string() { 975 | // Plain Unicode strings with no escaping 976 | assert_eq!(json_string(r#""""#), Ok(("", Node::Str("".into())))); 977 | assert_eq!(json_string(r#""Hello""#), Ok(("", Node::Str("Hello".into())))); 978 | assert_eq!(json_string(r#""の""#), Ok(("", Node::Str("の".into())))); 979 | assert_eq!(json_string(r#""𝄞""#), Ok(("", Node::Str("𝄞".into())))); 980 | 981 | // valid 2-character escapes 982 | assert_eq!(json_string(r#"" \\ ""#), Ok(("", Node::Str(" \\ ".into())))); 983 | assert_eq!(json_string(r#"" \" ""#), Ok(("", Node::Str(" \" ".into())))); 984 | 985 | // valid 6-character escapes 986 | assert_eq!(json_string(r#""\u0000""#), Ok(("", Node::Str("\x00".into())))); 987 | assert_eq!(json_string(r#""\u00DF""#), Ok(("", Node::Str("ß".into())))); 988 | assert_eq!(json_string(r#""\uD834\uDD1E""#), Ok(("", Node::Str("𝄞".into())))); 989 | 990 | // Invalid because surrogate characters must come in pairs 991 | assert!(json_string(r#""\ud800""#).is_err()); 992 | // Unknown 2-character escape 993 | assert!(json_string(r#""\x""#).is_err()); 994 | // Not enough hex digits 995 | assert!(json_string(r#""\u""#).is_err()); 996 | assert!(json_string(r#""\u001""#).is_err()); 997 | // Naked control character 998 | assert!(json_string(r#""\x0a""#).is_err()); 999 | // Not a JSON string because it's not wrapped in quotes 1000 | assert!(json_string("abc").is_err()); 1001 | } 1002 | ``` 1003 | 1004 | ## Part 9. Arrays and Objects 1005 | 1006 | Finally, all of the hard parts are complete, and we get to the fun parts: 1007 | arrays and objects (maps or dictionaries in other languages). 1008 | 1009 | Let's start with the changes to our `Node` enum, to give us a little better 1010 | idea how these recursive data structures should work. 1011 | 1012 | ```rust 1013 | pub enum Node { 1014 | Null, 1015 | Bool(bool), 1016 | Integer(i64), 1017 | Float(f64), 1018 | Str(String), 1019 | Array(Vec<Node>), 1020 | Object(Vec<(String, Node)>), 1021 | } 1022 | ``` 1023 | 1024 | Since `Node` now includes types other than literal values, let's rename 1025 | `json_literal` to `json_value`: 1026 | 1027 | ```rust 1028 | fn json_value(input: &str) -> IResult<&str, Node> { 1029 | spacey(alt(( 1030 | json_array, 1031 | json_object, 1032 | json_string, 1033 | json_float, 1034 | json_integer, 1035 | json_bool, 1036 | json_null 1037 | ))) 1038 | (input) 1039 | } 1040 | ``` 1041 | 1042 | An array can be heterogeneous (different value types, e.g. `[1, "foo", true]`). 1043 | Each object member must have a string for its key, and may have any value 1044 | type. An object might be `{"a": 1, "b": false}`. Arrays and objects can be 1045 | nested arbitrarily. 1046 | 1047 | Let's implement arrays first. 1048 | 1049 | ```rust 1050 | use nom::multi::separated_list0; 1051 | 1052 | fn json_array(input: &str) -> IResult<&str, Node> { 1053 | let parser = delimited( 1054 | tag("["), 1055 | separated_list0(tag(","), json_value), 1056 | tag("]") 1057 | ); 1058 | map(parser, |v| { 1059 | Node::Array(v) 1060 | }) 1061 | (input) 1062 | } 1063 | ``` 1064 | 1065 | That was surprisingly easy. The only new thing we needed was `separated_list0`, 1066 | which alternates between two subparsers. The first argument is the 1067 | "separator", and its result is thrown away; we get a vector of results from the 1068 | second parser. It will match zero or more elements; `nom` has a 1069 | `separated_list1` if you want one-or-more.
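Because `json_array` recurses through `json_value`, nested arrays already work. Here's a quick check (a sketch, not one of the repository's tests):

```rust
let expected = Node::Array(vec![
    Node::Array(vec![Node::Integer(1)]),
    Node::Array(vec![Node::Integer(2), Node::Bool(true)]),
]);
assert_eq!(json_array("[[1],[2,true]]"), Ok(("", expected)));
```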
1070 | 1071 | Objects are up next; they're a little more complicated so let's implement them 1072 | as two separate functions. 1073 | 1074 | ```rust 1075 | use nom::sequence::separated_pair; 1076 | 1077 | fn object_member(input: &str) -> IResult<&str, (String, Node)> { 1078 | separated_pair(string_literal, tag(":"), json_value) 1079 | (input) 1080 | } 1081 | 1082 | fn json_object(input: &str) -> IResult<&str, Node> { 1083 | let parser = delimited( 1084 | tag("{"), 1085 | separated_list0( 1086 | tag(","), 1087 | object_member 1088 | ), 1089 | tag("}") 1090 | ); 1091 | map(parser, |v| { 1092 | Node::Object(v) 1093 | }) 1094 | (input) 1095 | } 1096 | ``` 1097 | 1098 | This looks a lot like the array implementation. The only difference (other 1099 | than the braces) is that where an array looks for a single value, the object 1100 | looks for a quoted string literal, then a `:` character, and then a value. 1101 | 1102 | And we have a JSON parser! 1103 | 1104 | ## Part 10. Spacing out 1105 | 1106 | Well, we almost have a JSON parser. We might start testing arrays like this: 1107 | 1108 | ```rust 1109 | #[test] 1110 | fn test_array() { 1111 | assert_eq!(json_array("[]"), Ok(("", Node::Array(vec![])))); 1112 | assert_eq!(json_array("[1]"), Ok(("", Node::Array(vec![Node::Integer(1)])))); 1113 | 1114 | let expected = Node::Array(vec![Node::Integer(1), Node::Integer(2)]); 1115 | assert_eq!(json_array("[1,2]"), Ok(("", expected))); 1116 | } 1117 | ``` 1118 | 1119 | But it doesn't work if we write: 1120 | 1121 | ```rust 1122 | assert_eq!(json_array("[1, 2]"), Ok(("", expected))); 1123 | ``` 1124 | 1125 | The only difference is the space character after the comma. We forgot to 1126 | handle whitespace. 1127 | 1128 | In fact, we haven't handled whitespace anywhere. Whitespace could appear 1129 | anywhere: before or after values or any punctuation (braces, brackets, comma, 1130 | or colon). 1131 | 1132 | To ignore whitespace, we need a parser function that matches whitespace. We 1133 | could easily build one, but `nom` includes one that matches our needs exactly: 1134 | `nom::character::complete::multispace0`. 1135 | 1136 | That means we need to do a bunch of substitutions, things like: 1137 | ```rust 1138 | tag("[") 1139 | ``` 1140 | need to become 1141 | ```rust 1142 | delimited(multispace0, tag("["), multispace0) 1143 | ``` 1144 | 1145 | Which adds a lot of clutter, and is kind of hard to read. Maybe instead we 1146 | should write a combinator of our own to make this a little more compact. This 1147 | isn't absolutely necessary— the cluttered version is perfectly functional. The 1148 | only reason I'm going to tackle this is it provides a little bit of insight 1149 | into the pile of generic parameters you see if you look at the documentation 1150 | for `nom` combinators. If you don't care, feel free to skip this section. 1151 | 1152 | First, let's write a combinator that does nothing, other than apply a parser we 1153 | specify. 1154 | 1155 | ```rust 1156 | fn identity<F, I, O, E>(f: F) -> impl FnMut(I) -> IResult<I, O, E> 1157 | where 1158 | F: FnMut(I) -> IResult<I, O, E>, 1159 | { 1160 | f 1161 | } 1162 | ``` 1163 | 1164 | That looks pretty intimidating. But so do most of the built-in `nom` 1165 | combinators, so if we can understand this combinator function, we'll have a 1166 | little easier time understanding other parts of `nom`. 1167 | 1168 | Let's see if we can make some sense of all those generic parameters. 1169 | 1170 | `F` is the type of the parser we pass in.
It could be any `nom`-style parser, 1171 | and we already know what those look like; they accept one input parameter, and 1172 | return an `IResult<I, O, E>`. This `IResult` has three generic parameters, and we've 1173 | always used two— the third has a default value, and we've been omitting it. 1174 | 1175 | So our `F` is a function that accepts one `I` and returns `IResult<I, O, E>`. 1176 | `I` is our input parameter (which has been `&str` so far everywhere). `O` is 1177 | our output type (and we've used a bunch of different ones; `&str`, `Node`, 1178 | etc.) The `E` is the parser error type, and we can continue ignoring that for 1179 | now since we've only used the default. 1180 | 1181 | Our combinator returns a closure. So its return type is 1182 | `FnMut(I) -> IResult<I, O, E>`. That looks the same as `F`, but for all cases 1183 | other than `identity` we'll return a different closure than the input, so we 1184 | will need to spell out the return type. 1185 | 1186 | A lot of `nom` combinators have even more complex type signatures 1187 | (`separated_pair` has 8 generic parameters!) but picking them apart is usually 1188 | pretty straightforward if you're patient. You'll probably only need to know 1189 | when something fails to compile. 1190 | 1191 | Anyway, let's write a combinator that wraps its input in a `delimited` with 1192 | `multispace0` on both sides. 1193 | 1194 | ```rust 1195 | fn spacey<F, I, O, E>(f: F) -> impl FnMut(I) -> IResult<I, O, E> 1196 | where 1197 | F: FnMut(I) -> IResult<I, O, E>, 1198 | { 1199 | delimited(multispace0, f, multispace0) 1200 | } 1201 | ``` 1202 | 1203 | This explodes with a huge pile of errors; many complaints about trait bounds 1204 | that aren't met for `I` and `E`. But it turns out that this is just because 1205 | `multispace0` requires those on its `I` and `E`, so we have to guarantee those 1206 | trait bounds as well. Copying those trait bounds over to our function will 1207 | work: 1208 | 1209 | ```rust 1210 | fn spacey<F, I, O, E>(f: F) -> impl FnMut(I) -> IResult<I, O, E> 1211 | where 1212 | F: FnMut(I) -> IResult<I, O, E>, 1213 | I: nom::InputTakeAtPosition, 1214 | <I as nom::InputTakeAtPosition>::Item: nom::AsChar + Clone, 1215 | E: nom::error::ParseError<I>, 1216 | { 1217 | delimited(multispace0, f, multispace0) 1218 | } 1219 | ``` 1220 | 1221 | Was that worth it? Maybe not for this program. But it's interesting to see 1222 | what's involved in building our own combinators. Maybe the `nom` function 1223 | documentation will look a little less scary, too. 1224 | 1225 | Now that we have a useful multispace-handling combinator, we can sprinkle it 1226 | around all the places where we need to ignore whitespace. For example: 1227 | 1228 | ```rust 1229 | fn json_array(input: &str) -> IResult<&str, Node> { 1230 | let parser = delimited( 1231 | spacey(tag("[")), 1232 | separated_list0(spacey(tag(",")), json_value), 1233 | spacey(tag("]")), 1234 | ); 1235 | map(parser, |v| { 1236 | Node::Array(v) 1237 | }) 1238 | (input) 1239 | } 1240 | ``` 1241 | 1242 | ## Part 11. Error handling. 1243 | 1244 | We skipped over a few places where proper error handling is needed. For 1245 | example, numbers that are out of bounds (e.g. `1e99999`) should return some 1246 | kind of parse error. 1247 | 1248 | Currently we are using the `IResult` default error type, which is 1249 | `nom::internal::Err<(&str, nom::error::ErrorKind)>`. That doesn't look 1250 | promising— we can't realistically expect to be able to extend that type with 1251 | our own error variants. 1252 | 1253 | So let's build our own error type.
We'll use macros from the 1254 | [`thiserror`](https://docs.rs/thiserror/1.0/thiserror/) crate to automatically 1255 | generate some of the boilerplate that's necessary for error types. 1256 | 1257 | ```rust 1258 | #[derive(thiserror::Error, Debug, PartialEq)] 1259 | pub enum JSONParseError { 1260 | #[error("bad integer")] 1261 | BadInt, 1262 | #[error("bad float")] 1263 | BadFloat, 1264 | #[error("bad escape sequence")] 1265 | BadEscape, 1266 | #[error("unknown parser error")] 1267 | Unparseable, 1268 | } 1269 | ``` 1270 | 1271 | Because `nom` error handling uses generic parameters, it can be difficult to 1272 | see how to best implement a custom error type. There is a good minimal example 1273 | of custom error types in the nom 7.0 sources 1274 | ([examples/custom_error.rs](https://github.com/Geal/nom/blob/7.0.0/examples/custom_error.rs)) 1275 | that shows the steps needed to make things work gracefully: 1276 | 1277 | 1. Figure out how to map a `nom` error into your error type. Usually this will 1278 | be with a dedicated enum variant. 1279 | 2. Implement the trait `nom::error::ParseError` for your error type. This 1280 | will allow all of the `nom` combinators to generate your custom error type when 1281 | needed. 1282 | 3. Use the 3-argument form of `IResult`, specifying your error type. You will 1283 | probably want to do this on most or all of your parser functions so combinators 1284 | work gracefully. 1285 | 1286 | When building a custom error type that will be generated by nom parsers, 1287 | consider how far you want to propagate the error metadata (`ErrorKind` and 1288 | input slice). If the error type is only visible inside a crate, it can 1289 | be useful to preserve all the nom metadata (the `input` and `kind` parameters 1290 | to `ParseError::from_error_kind`) for debugging. In a public error struct, it 1291 | may be wiser to discard that information, as a user of your crate probably 1292 | doesn't care about `nom` error metadata. I will assume `JSONParseError` is 1293 | public, so I will discard the `nom` error parameters. 1294 | 1295 | ```rust 1296 | use nom::error::{ErrorKind, ParseError}; 1297 | 1298 | impl<I> ParseError<I> for JSONParseError { 1299 | fn from_error_kind(_input: I, _kind: ErrorKind) -> Self { 1300 | JSONParseError::Unparseable 1301 | } 1302 | 1303 | fn append(_: I, _: ErrorKind, other: Self) -> Self { 1304 | other 1305 | } 1306 | } 1307 | ``` 1308 | 1309 | For error handling on integers, we'll split the function into two parts to make 1310 | it easier to read: 1311 | 1312 | ```rust 1313 | fn integer_body(input: &str) -> IResult<&str, &str, JSONParseError> { 1314 | recognize( 1315 | pair( 1316 | opt(tag("-")), 1317 | uint 1318 | ) 1319 | ) 1320 | (input) 1321 | } 1322 | 1323 | fn json_integer(input: &str) -> IResult<&str, Node, JSONParseError> { 1324 | let (remain, raw_int) = integer_body(input)?; 1325 | match raw_int.parse::<i64>() { 1326 | Ok(i) => Ok((remain, Node::Integer(i))), 1327 | Err(_) => Err(nom::Err::Failure(JSONParseError::BadInt)), 1328 | } 1329 | } 1330 | ``` 1331 | 1332 | Note that `json_integer` works differently from all the other parsers we've 1333 | written so far: instead of composing parsers using combinators, we actually run 1334 | the `integer_body` parser and capture its result (the remainder and the matched 1335 | string slice). We then attempt to parse the string slice into an integer, and 1336 | assemble the `IResult` by hand.
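With that in place, an out-of-range integer now produces a clean error instead of a panic. A quick check (a sketch, not necessarily a test from the repository):

```rust
#[test]
fn test_bad_integer() {
    // Too big for an i64, so json_integer should fail with our custom error.
    let result = json_integer("99999999999999999999999");
    assert!(matches!(result, Err(nom::Err::Failure(JSONParseError::BadInt))));
}
```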
1337 | 1338 | This can be a useful technique when the `nom` combinators don't supply exactly 1339 | what you need. Here, I first tried using `map_res` to parse the int, but it 1340 | turns out that `map_res` always throws away the error value returned by the 1341 | closure, and substitutes its own error (with kind `MapRes`). 1342 | 1343 | The same approach works for string escaping errors and float parsing errors, 1344 | though float overflow in Rust results in infinity, not an error. This means we 1345 | will never actually return `BadFloat` because there are no 1346 | grammatically-correct floats that can't be parsed into an `f64`. 1347 | (Though Rust versions older than 1.55 had some problems parsing 1348 | [certain edge cases](https://github.com/rust-lang/rust/issues/31407).) 1349 | 1350 | ## Part 12. Finalization. 1351 | 1352 | There's one more `nom`-specific step that we probably want. Assuming our code 1353 | is a library, meant to be used by other programs, we don't want `nom::IResult` 1354 | to show up as our public result type. We should instead return 1355 | `Result<Node, JSONParseError>`. 1356 | 1357 | We can use `all_consuming` to ensure that all input was matched. Unfortunately, 1358 | there doesn't seem to be a simple `nom` shortcut for translating the error. We 1359 | can do this ourselves: 1360 | 1361 | ```rust 1362 | 1363 | use nom::combinator::all_consuming; 1364 | 1365 | pub fn parse_json(input: &str) -> Result<Node, JSONParseError> { 1366 | let (_, result) = all_consuming(json_value)(input).map_err(|nom_err| { 1367 | match nom_err { 1368 | nom::Err::Incomplete(_) => unreachable!(), 1369 | nom::Err::Error(e) => e, 1370 | nom::Err::Failure(e) => e, 1371 | } 1372 | })?; 1373 | Ok(result) 1374 | } 1375 | ``` 1376 | 1377 | We haven't talked yet about the three 1378 | [`nom::Err`](https://docs.rs/nom/7.0.0/nom/enum.Err.html) variants. 1379 | 1380 | - `Incomplete` is only used by `nom` streaming parsers. We don't use those, so 1381 | we can just mark that branch `unreachable!` (which would panic). 1382 | - `Error` is what we usually see when a parser has a problem. Something didn't 1383 | match the expected grammar. 1384 | - `Failure` appears less often. It means that the input could only be parsed 1385 | one way, but a parser decided that it was invalid. Unlike `Error`, this error 1386 | is propagated upward without trying any alternative paths (if something like 1387 | `alt` is present). 1388 | 1389 | Our code does use `Failure` in a few places: that's what we return when there 1390 | is a numeric conversion error or a bad escape code. If we use `Error` instead, 1391 | the parsers could return the wrong error type. The reason is that the nom 1392 | `alt` parser would keep trying other parsers, and if all of them fail, there's 1393 | no way for `alt` to know which error is the right one— it usually just returns 1394 | the last error. 1395 | 1396 | ## Thanks for reading! 1397 | 1398 | This ended up being a lot longer than I originally planned, and along the way I 1399 | discovered several things that I'd been doing wrong in my own parsers.

## Thanks for reading!

This ended up being a lot longer than I originally planned, and along the way I
discovered several things that I'd been doing wrong in my own parsers. There
are probably a few things that I've still missed; if you notice something, feel
free to open an issue at this page's
[GitHub repo](https://github.com/ericseppanen/json-parser-toy), or get in touch
on [twitter: @codeandbitters](https://twitter.com/codeandbitters).

--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
use nom::{branch::alt, IResult};
use nom::bytes::complete::{tag, take_while1};
use nom::character::complete::{one_of, digit0, digit1, multispace0};
use nom::combinator::{all_consuming, map, opt, recognize, value};
use nom::error::{ErrorKind, ParseError};
use nom::multi::{many0, separated_list0};
use nom::sequence::{delimited, pair, separated_pair, tuple};
use escape8259::unescape;

#[derive(thiserror::Error, Debug, PartialEq)]
pub enum JSONParseError {
    #[error("bad integer")]
    BadInt,
    #[error("bad float")]
    BadFloat,
    #[error("bad escape sequence")]
    BadEscape,
    #[error("unknown parser error")]
    Unparseable,
}

impl<I> ParseError<I> for JSONParseError {
    fn from_error_kind(_input: I, _kind: ErrorKind) -> Self {
        // Because JSONParseError is a simplified public error type,
        // we discard the nom error parameters.
        JSONParseError::Unparseable
    }

    fn append(_: I, _: ErrorKind, other: Self) -> Self {
        other
    }
}

#[derive(PartialEq, Debug, Clone)]
pub enum Node {
    Null,
    Bool(bool),
    Integer(i64),
    Float(f64),
    Str(String),
    Array(Vec<Node>),
    Object(Vec<(String, Node)>),
}

pub fn parse_json(input: &str) -> Result<Node, JSONParseError> {
    let (_, result) = all_consuming(json_value)(input).map_err(|nom_err| {
        match nom_err {
            nom::Err::Incomplete(_) => unreachable!(),
            nom::Err::Error(e) => e,
            nom::Err::Failure(e) => e,
        }
    })?;
    Ok(result)
}

fn json_value(input: &str) -> IResult<&str, Node, JSONParseError> {
    spacey(alt((
        json_array,
        json_object,
        json_string,
        json_float,
        json_integer,
        json_bool,
        json_null
    )))
    (input)
}

fn spacey<F, I, O, E>(f: F) -> impl FnMut(I) -> IResult<I, O, E>
where
    F: FnMut(I) -> IResult<I, O, E>,
    I: nom::InputTakeAtPosition,
    <I as nom::InputTakeAtPosition>::Item: nom::AsChar + Clone,
    E: nom::error::ParseError<I>,
{
    delimited(multispace0, f, multispace0)
}

fn json_array(input: &str) -> IResult<&str, Node, JSONParseError> {
    let parser = delimited(
        spacey(tag("[")),
        separated_list0(spacey(tag(",")), json_value),
        spacey(tag("]")),
    );
    map(parser, |v| {
        Node::Array(v)
    })
    (input)
}

// "key: value", where key and value are any JSON type.
fn object_member(input: &str) -> IResult<&str, (String, Node), JSONParseError> {
    separated_pair(string_literal, spacey(tag(":")), json_value)
    (input)
}

fn json_object(input: &str) -> IResult<&str, Node, JSONParseError> {
    let parser = delimited(
        spacey(tag("{")),
        separated_list0(
            spacey(tag(",")),
            object_member
        ),
        spacey(tag("}")),
    );
    map(parser, |v| {
        Node::Object(v)
    })
    (input)
}

// A character that is:
// NOT a control character (0x00 - 0x1F)
// NOT a quote character (0x22)
// NOT a backslash character (0x5C)
// Is within the unicode range (< 0x10FFFF) (this is already guaranteed by Rust char)
fn is_nonescaped_string_char(c: char) -> bool {
    let cv = c as u32;
    (cv >= 0x20) && (cv != 0x22) && (cv != 0x5C)
}

// One or more unescaped text characters
fn nonescaped_string(input: &str) -> IResult<&str, &str, JSONParseError> {
    take_while1(is_nonescaped_string_char)
    (input)
}

// There are only two types of escape allowed by RFC 8259.
// - single-character escapes \" \\ \/ \b \f \n \r \t
// - general-purpose \uXXXX
// Note: we don't enforce that escape codes are valid here.
// There must be a decoder later on.
fn escape_code(input: &str) -> IResult<&str, &str, JSONParseError> {
    recognize(
        pair(
            tag("\\"),
            alt((
                tag("\""),
                tag("\\"),
                tag("/"),
                tag("b"),
                tag("f"),
                tag("n"),
                tag("r"),
                tag("t"),
                tag("u"),
            ))
        )
    )
    (input)
}

// Zero or more text characters
fn string_body(input: &str) -> IResult<&str, &str, JSONParseError> {
    recognize(
        many0(
            alt((
                nonescaped_string,
                escape_code
            ))
        )
    )
    (input)
}

fn string_literal(input: &str) -> IResult<&str, String, JSONParseError> {
    let (remain, raw_string) = delimited(
        tag("\""),
        string_body,
        tag("\"")
    )
    (input)?;

    match unescape(raw_string) {
        Ok(s) => Ok((remain, s)),
        Err(_) => Err(nom::Err::Failure(JSONParseError::BadEscape)),
    }
}

fn json_string(input: &str) -> IResult<&str, Node, JSONParseError> {
    map(string_literal, |s| {
        Node::Str(s)
    })
    (input)
}

// This can be done a few different ways:
//   one_of("123456789"),
//   anychar("0123456789"),
// we could also extract the character value as u32 and do range checks...

fn digit1to9(input: &str) -> IResult<&str, char, JSONParseError> {
    one_of("123456789")
    (input)
}

// unsigned_integer = zero / ( digit1-9 *DIGIT )
fn uint(input: &str) -> IResult<&str, &str, JSONParseError> {
    alt((
        tag("0"),
        recognize(
            pair(
                digit1to9,
                digit0
            )
        )
    ))
    (input)
}

fn integer_body(input: &str) -> IResult<&str, &str, JSONParseError> {
    recognize(
        pair(
            opt(tag("-")),
            uint
        )
    )
    (input)
}

fn json_integer(input: &str) -> IResult<&str, Node, JSONParseError> {
    let (remain, raw_int) = integer_body(input)?;
    match raw_int.parse::<i64>() {
        Ok(i) => Ok((remain, Node::Integer(i))),
        Err(_) => Err(nom::Err::Failure(JSONParseError::BadInt)),
    }
}

// number = [ minus ] int [ frac ] [ exp ]
//
// decimal-point = %x2E ; .
// digit1-9 = %x31-39 ; 1-9
// e = %x65 / %x45 ; e E
// exp = e [ minus / plus ] 1*DIGIT
// frac = decimal-point 1*DIGIT
// int = zero / ( digit1-9 *DIGIT )
// minus = %x2D ; -
// plus = %x2B ; +
// zero = %x30 ; 0

fn frac(input: &str) -> IResult<&str, &str, JSONParseError> {
    recognize(
        pair(
            tag("."),
            digit1
        )
    )
    (input)
}

fn exp(input: &str) -> IResult<&str, &str, JSONParseError> {
    recognize(
        tuple((
            tag("e"),
            opt(alt((
                tag("-"),
                tag("+")
            ))),
            digit1
        ))
    )
    (input)
}

fn float_body(input: &str) -> IResult<&str, &str, JSONParseError> {
    recognize(
        tuple((
            opt(tag("-")),
            uint,
            alt((
                recognize(pair(
                    frac,
                    opt(exp)
                )),
                exp
            )),
        ))
    )
    (input)
}

fn json_float(input: &str) -> IResult<&str, Node, JSONParseError> {
    let (remain, raw_float) = float_body(input)?;
    match raw_float.parse::<f64>() {
        Ok(f) => Ok((remain, Node::Float(f))),
        Err(_) => Err(nom::Err::Failure(JSONParseError::BadFloat)),
    }
}

fn json_bool(input: &str) -> IResult<&str, Node, JSONParseError> {
    alt((
        value(Node::Bool(false), tag("false")),
        value(Node::Bool(true), tag("true")),
    ))
    (input)
}

fn json_null(input: &str) -> IResult<&str, Node, JSONParseError> {
    value(Node::Null, tag("null"))
    (input)
}

#[test]
fn test_bool() {
    assert_eq!(json_bool("false"), Ok(("", Node::Bool(false))));
    assert_eq!(json_bool("true"), Ok(("", Node::Bool(true))));
    assert!(json_bool("foo").is_err());
}

#[test]
fn test_null() {
    assert_eq!(json_null("null"), Ok(("", Node::Null)));
}

#[test]
fn test_integer() {
    assert_eq!(json_integer("42"), Ok(("", Node::Integer(42))));
    assert_eq!(json_integer("-123"), Ok(("", Node::Integer(-123))));
    assert_eq!(json_integer("0"), Ok(("", Node::Integer(0))));
    assert_eq!(json_integer("01"), Ok(("1", Node::Integer(0))));
    assert_eq!(json_integer("9999999999999999999"), Err(nom::Err::Failure(JSONParseError::BadInt)));
}

#[test]
fn test_float() {
    assert_eq!(json_float("42.0"), Ok(("", Node::Float(42.0))));
    assert_eq!(json_float("-123.99"), Ok(("", Node::Float(-123.99))));
    assert_eq!(json_float("6.02214086e23"), Ok(("", Node::Float(6.02214086e23))));
    assert_eq!(json_float("-1e6"), Ok(("", Node::Float(-1000000.0))));
    assert_eq!(json_float("1.0e+3"), Ok(("", Node::Float(1000.0))));


    // f64::from_str overflows to infinity instead of throwing an error
    assert_eq!(json_float("1e9999"), Ok(("", Node::Float(f64::INFINITY))));

    // odd looking but still valid.
    assert_eq!(json_float("0e+42949672970"), Ok(("", Node::Float(0.0))));
    assert_eq!(json_float("9e00010"), Ok(("", Node::Float(90000000000.0))));
    assert_eq!(json_float("-0.0e-99999999999999999999999999"), Ok(("", Node::Float(-0.0))));
}

#[test]
fn test_string() {
    // Plain Unicode strings with no escaping
    assert_eq!(json_string(r#""""#), Ok(("", Node::Str("".into()))));
    assert_eq!(json_string(r#""Hello""#), Ok(("", Node::Str("Hello".into()))));
    assert_eq!(json_string(r#""の""#), Ok(("", Node::Str("の".into()))));
    assert_eq!(json_string(r#""𝄞""#), Ok(("", Node::Str("𝄞".into()))));

    // valid 2-character escapes
    assert_eq!(json_string(r#"" \\ ""#), Ok(("", Node::Str(" \\ ".into()))));
    assert_eq!(json_string(r#"" \" ""#), Ok(("", Node::Str(" \" ".into()))));

    // valid 6-character escapes
    assert_eq!(json_string(r#""\u0000""#), Ok(("", Node::Str("\x00".into()))));
    assert_eq!(json_string(r#""\u00DF""#), Ok(("", Node::Str("ß".into()))));
    assert_eq!(json_string(r#""\uD834\uDD1E""#), Ok(("", Node::Str("𝄞".into()))));

    // Invalid because surrogate characters must come in pairs
    assert!(json_string(r#""\ud800""#).is_err());
    // Unknown 2-character escape
    assert!(json_string(r#""\x""#).is_err());
    // Not enough hex digits
    assert!(json_string(r#""\u""#).is_err());
    assert!(json_string(r#""\u001""#).is_err());
    // Naked control character
    assert!(json_string(r#""\x0a""#).is_err());
    // Not a JSON string because it's not wrapped in quotes
    assert!(json_string("abc").is_err());
    // An unterminated string (because the trailing quote is escaped)
    assert!(json_string(r#""\""#).is_err());

    // Parses correctly but has escape errors due to incomplete surrogate pair.
    assert_eq!(json_string(r#""\ud800""#), Err(nom::Err::Failure(JSONParseError::BadEscape)));
}

#[test]
fn test_array() {
    assert_eq!(json_array("[ ]"), Ok(("", Node::Array(vec![]))));
    assert_eq!(json_array("[ 1 ]"), Ok(("", Node::Array(vec![Node::Integer(1)]))));

    let expected = Node::Array(vec![Node::Integer(1), Node::Str("x".into())]);
    assert_eq!(json_array(r#" [ 1 , "x" ] "#), Ok(("", expected)));
}

#[test]
fn test_object() {
    assert_eq!(json_object("{ }"), Ok(("", Node::Object(vec![]))));
    let expected = Node::Object(vec![("1".into(), Node::Integer(2))]);
    assert_eq!(json_object(r#" { "1" : 2 } "#), Ok(("", expected)));
}

#[test]
fn test_values() {
    assert_eq!(parse_json(" 56 "), Ok(Node::Integer(56)));
    assert_eq!(parse_json(" 78.0 "), Ok(Node::Float(78.0)));
    assert_eq!(parse_json(r#" "Hello" "#), Ok(Node::Str("Hello".into())));
    // These two tests aren't relevant for JSON. They verify that `json_float`
    // will never mistake integers for floats in other grammars that might
    // allow a `.` or `e` character after a literal integer.
    assert_eq!(json_value("123else"), Ok(("else", Node::Integer(123))));
    assert_eq!(json_value("123.x"), Ok((".x", Node::Integer(123))));

    assert_eq!(parse_json("123else"), Err(JSONParseError::Unparseable));
    assert_eq!(parse_json("123.x"), Err(JSONParseError::Unparseable));
    assert_eq!(parse_json("[ 56, "), Err(JSONParseError::Unparseable));
    assert_eq!(parse_json(r#"{ "a": "b" "#), Err(JSONParseError::Unparseable));
    assert_eq!(parse_json(" 56 a"), Err(JSONParseError::Unparseable));

    assert_eq!(parse_json("9999999999999999999"), Err(JSONParseError::BadInt));
    assert_eq!(parse_json(r#""\ud800""#), Err(JSONParseError::BadEscape));
}

--------------------------------------------------------------------------------