├── .gitignore ├── Cargo.lock ├── Cargo.toml ├── README.md └── src └── lib.rs /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | -------------------------------------------------------------------------------- /Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 3 | version = 3 4 | 5 | [[package]] 6 | name = "escape8259" 7 | version = "0.5.1" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "8edd65c008c6e97290e463c336e0c3fe109a91accb0e6b3e9e353d1605bd58b8" 10 | dependencies = [ 11 | "rustversion", 12 | ] 13 | 14 | [[package]] 15 | name = "json-parser-toy" 16 | version = "0.1.0" 17 | dependencies = [ 18 | "escape8259", 19 | "nom", 20 | "thiserror", 21 | ] 22 | 23 | [[package]] 24 | name = "memchr" 25 | version = "2.4.1" 26 | source = "registry+https://github.com/rust-lang/crates.io-index" 27 | checksum = "308cc39be01b73d0d18f82a0e7b2a3df85245f84af96fdddc5d202d27e47b86a" 28 | 29 | [[package]] 30 | name = "minimal-lexical" 31 | version = "0.2.1" 32 | source = "registry+https://github.com/rust-lang/crates.io-index" 33 | checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" 34 | 35 | [[package]] 36 | name = "nom" 37 | version = "7.1.0" 38 | source = "registry+https://github.com/rust-lang/crates.io-index" 39 | checksum = "1b1d11e1ef389c76fe5b81bcaf2ea32cf88b62bc494e19f493d0b30e7a930109" 40 | dependencies = [ 41 | "memchr", 42 | "minimal-lexical", 43 | "version_check", 44 | ] 45 | 46 | [[package]] 47 | name = "proc-macro2" 48 | version = "1.0.36" 49 | source = "registry+https://github.com/rust-lang/crates.io-index" 50 | checksum = "c7342d5883fbccae1cc37a2353b09c87c9b0f3afd73f5fb9bba687a1f733b029" 51 | dependencies = [ 52 | "unicode-xid", 53 | ] 54 | 55 | [[package]] 56 | name = "quote" 57 | version = "1.0.14" 58 | source = "registry+https://github.com/rust-lang/crates.io-index" 59 | checksum = "47aa80447ce4daf1717500037052af176af5d38cc3e571d9ec1c7353fc10c87d" 60 | dependencies = [ 61 | "proc-macro2", 62 | ] 63 | 64 | [[package]] 65 | name = "rustversion" 66 | version = "1.0.6" 67 | source = "registry+https://github.com/rust-lang/crates.io-index" 68 | checksum = "f2cc38e8fa666e2de3c4aba7edeb5ffc5246c1c2ed0e3d17e560aeeba736b23f" 69 | 70 | [[package]] 71 | name = "syn" 72 | version = "1.0.85" 73 | source = "registry+https://github.com/rust-lang/crates.io-index" 74 | checksum = "a684ac3dcd8913827e18cd09a68384ee66c1de24157e3c556c9ab16d85695fb7" 75 | dependencies = [ 76 | "proc-macro2", 77 | "quote", 78 | "unicode-xid", 79 | ] 80 | 81 | [[package]] 82 | name = "thiserror" 83 | version = "1.0.30" 84 | source = "registry+https://github.com/rust-lang/crates.io-index" 85 | checksum = "854babe52e4df1653706b98fcfc05843010039b406875930a70e4d9644e5c417" 86 | dependencies = [ 87 | "thiserror-impl", 88 | ] 89 | 90 | [[package]] 91 | name = "thiserror-impl" 92 | version = "1.0.30" 93 | source = "registry+https://github.com/rust-lang/crates.io-index" 94 | checksum = "aa32fd3f627f367fe16f893e2597ae3c05020f8bba2666a4e6ea73d377e5714b" 95 | dependencies = [ 96 | "proc-macro2", 97 | "quote", 98 | "syn", 99 | ] 100 | 101 | [[package]] 102 | name = "unicode-xid" 103 | version = "0.2.2" 104 | source = "registry+https://github.com/rust-lang/crates.io-index" 105 | checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3" 106 | 107 | [[package]] 108 | 
name = "version_check" 109 | version = "0.9.4" 110 | source = "registry+https://github.com/rust-lang/crates.io-index" 111 | checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" 112 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "json-parser-toy" 3 | version = "0.1.0" 4 | authors = ["Eric Seppanen "] 5 | edition = "2018" 6 | 7 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 8 | 9 | [dependencies] 10 | nom = "7.1.0" 11 | escape8259 = "0.5" 12 | thiserror = "1.0" 13 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Let's build a parser! 2 | 3 | [*Read the original blog post at* **codeandbitters**.](https://codeandbitters.com/lets-build-a-parser/) 4 | 5 | > Updated 10/2021 to use `nom 7.0`! 6 | 7 | This is a demonstration of building a parser in Rust using the 8 | [`nom`](https://docs.rs/nom/) crate. I recently built a parser for 9 | the [`cddl-cat`](https://docs.rs/cddl-cat/) crate using nom, 10 | and I found it a surprisingly pleasant experience, much better than my past 11 | experiences with other parser-generators in other languages. 12 | 13 | Since I like Rust a lot, and I need an excuse to do more writing about Rust, I 14 | thought I'd do another demonstration project. I decided to choose a simple 15 | syntax, to keep this a short project. So I'm going to build a parser for JSON. 16 | 17 | 18 | 19 | There are a million JSON parsers in the world already, so I don't expect this 20 | code to have much non-educational value. But, hey, you never know. 21 | 22 | All of the source code and markdown source for this post is 23 | [available on GitHub](https://github.com/ericseppanen/json-parser-toy). If you 24 | see anything wrong, please let me know by raising an issue there. 25 | 26 | ## Part 1. Introduction. 27 | 28 | A few details, before I write the first lines of code: 29 | 30 | 1. I'm going to use [RFC8259](https://tools.ietf.org/html/rfc8259) as my 31 | authoritative reference for the JSON grammar. 32 | 2. I'm not going to build a JSON serializer. My goal will only be to consume 33 | JSON text and output a structured tree containing the data (a lot like 34 | [`serde_json::Value`](https://docs.serde.rs/serde_json/value/enum.Value.html) ). 35 | 3. I'll be using [`nom` 7.0](https://docs.rs/nom/7.0/nom/). I'll try to keep 36 | this post updated when new major versions are released. 37 | 4. Some of the code I write will violate the usual `rustfmt` style. This isn't 38 | because I hate `rustfmt`; far from it! But as you'll see, `nom` code can look a 39 | little weird, so it's sometimes more readable if we bend the styling rules a 40 | little bit. Do what you like in your own code. 41 | 5. All of my source code will be 42 | [available on GitHub](https://github.com/ericseppanen/json-parser-toy). If you 43 | have comments or suggestions, or see a bug or something wrong in this post, 44 | please open an issue there. 45 | 46 | Let's start with a few words about `nom`. It can take a little bit of time to 47 | adjust to writing a parser with `nom`, because it doesn't work by first 48 | tokenizing the input and then parsing those tokens. Both of those steps can be 49 | tackled at once. 50 | 51 | Older versions of `nom` used a lot of macros. 
Starting with `nom 7.0`, the 52 | macros are gone, and the only way to use nom is with the function combinators. 53 | This is a nice change, because while `nom` combinators can be tricky, the 54 | function-based style is a lot friendlier to work with than the old macros. 55 | 56 | A bit of advice for reading the 57 | [`nom` documentation](https://docs.rs/nom/7.0.0/nom/), if you're following 58 | along with this implementation: 59 | - Start from the [modules](https://docs.rs/nom/7.0.0/nom/#modules) section of 60 | the documentation. 61 | - We'll be starting with the 62 | [character](https://docs.rs/nom/7.0.0/nom/character/index.html) and 63 | [number](https://docs.rs/nom/7.0.0/nom/number/index.html) modules. 64 | - We'll use the 65 | [combinator](https://docs.rs/nom/7.0.0/nom/combinator/index.html), 66 | [multi](https://docs.rs/nom/7.0.0/nom/multi/index.html), 67 | [sequence](https://docs.rs/nom/7.0.0/nom/sequence/index.html), 68 | and [branch](https://docs.rs/nom/7.0.0/nom/branch/index.html) modules to tie 69 | things together. I'll try to link to the relevant documentation as we go. 70 | 71 | ## Part 2. Our first bit of parser code. 72 | 73 | I've started a new library project (`cargo init --lib json-parser-toy`), and 74 | added the `nom 7.0` dependency in `Cargo.toml`. Let's add a very simple parser 75 | function, just to verify that we can build and test our code. We'll try to 76 | parse the strings "true" and "false". In other words, the grammar for our json 77 | subset is: 78 | 79 | ```txt 80 | value = "false" / "true" 81 | ``` 82 | 83 | Here's our first bit of code: 84 | ```rust 85 | use nom::{branch::alt, bytes::complete::tag, IResult}; 86 | 87 | fn json_bool(input: &str) -> IResult<&str, &str> { 88 | alt(( 89 | tag("false"), 90 | tag("true") 91 | )) 92 | (input) 93 | } 94 | 95 | #[test] 96 | fn test_bool() { 97 | assert_eq!(json_bool("false"), Ok(("", "false"))); 98 | assert_eq!(json_bool("true"), Ok(("", "true"))); 99 | assert!(json_bool("foo").is_err()); 100 | } 101 | ``` 102 | 103 | I got the [`tag`](https://docs.rs/nom/7.0.0/nom/bytes/complete/fn.tag.html) 104 | function from `nom::bytes`, though it's not specific to byte-arrays; it works 105 | just fine with text strings as well. It's not a big deal; it's just a minor 106 | quirk of the way `nom` is organized. 107 | 108 | We use [`alt`](https://docs.rs/nom/7.0.0/nom/branch/fn.alt.html) to express 109 | "one of these choices". This is a common style in `nom`, and we'll see it 110 | again when we use other combinators from `nom::sequence`. 111 | 112 | There are a few other things that should be explained. 113 | 114 | [`IResult`](https://docs.rs/nom/7.0.0/nom/type.IResult.html) is an important 115 | part of working with `nom`. It's a specialized `Result`, where an `Ok` always 116 | returns a tuple of two values. In this case, `IResult<&str, &str>` returns two 117 | string slices. The first is the "remainder": this is everything that wasn't 118 | parsed. The second part is the output from a successful parse; in this case we 119 | just return the string we matched. For example, I could add this to my test, 120 | and it would work: 121 | 122 | ```rust 123 | assert_eq!(json_bool("false more"), Ok((" more", "false"))); 124 | ``` 125 | 126 | The `json_bool` function consumed the `false` part of the string, and left the 127 | rest for somebody else to deal with. 128 | 129 | When `json_bool` returns an error, that doesn't necessarily mean that something 130 | is wrong. Our top-level parser isn't going to give up. 
It just means that 131 | this particular bit of grammar didn't match. Depending on how we write our 132 | code, other parser functions might be called instead. You can actually see 133 | this in action if you look at how the `alt` combinator works. It first calls a 134 | parser function `tag("false")`, and if that returns an error, it instead feeds 135 | the same input into `tag("true")`, to see if it might succeed instead. 136 | 137 | This probably still looks kind of strange, because `tag("false")` isn't a 138 | complete parser function; it's a function that returns a parser function. See 139 | how our code calls `alt` and `tag` (twice)? The return value from that code is 140 | another function, and that function gets called with the argument `(input)`. 141 | 142 | Don't be scared off by the intimidating-looking parameters of the `tag` 143 | function in the documentation— look at the 144 | [examples](https://docs.rs/nom/7.0.0/nom/bytes/complete/fn.tag.html#example). 145 | Despite the extra layer of indirection, it's still pretty easy to use. 146 | 147 | ## Part 3. Returning structs. 148 | 149 | We don't want to just return the strings that we matched; we want to return 150 | some Rust structs that we can put into a tree form. 151 | 152 | We could copy the previous function to add another simple JSON element: 153 | ```rust 154 | fn json_null(input: &str) -> IResult<&str, &str> { 155 | tag("null") 156 | (input) 157 | } 158 | ``` 159 | 160 | That would work, but let's rewrite our two parser functions to return enums or 161 | structs instead. 162 | ```rust 163 | use nom::combinator::map; 164 | 165 | #[derive(PartialEq, Debug)] 166 | pub enum JsonBool { 167 | False, 168 | True, 169 | } 170 | 171 | #[derive(PartialEq, Debug)] 172 | pub struct JsonNull {} 173 | 174 | fn json_bool(input: &str) -> IResult<&str, JsonBool> { 175 | let parser = alt(( 176 | tag("false"), 177 | tag("true") 178 | )); 179 | map(parser, |s| { 180 | match s { 181 | "false" => JsonBool::False, 182 | "true" => JsonBool::True, 183 | _ => unreachable!(), 184 | } 185 | }) 186 | (input) 187 | } 188 | 189 | fn json_null(input: &str) -> IResult<&str, JsonNull> { 190 | map(tag("null"), |_| JsonNull {}) 191 | (input) 192 | } 193 | 194 | #[test] 195 | fn test_bool() { 196 | assert_eq!(json_bool("false"), Ok(("", JsonBool::False))); 197 | assert_eq!(json_bool("true"), Ok(("", JsonBool::True))); 198 | assert!(json_bool("foo").is_err()); 199 | } 200 | 201 | #[test] 202 | fn test_null() { 203 | assert_eq!(json_null("null"), Ok(("", JsonNull {}))); 204 | } 205 | 206 | ``` 207 | 208 | First, notice that the parser functions' return value has changed. The first 209 | part of the `IResult` tuple is still the remainder, so it's still `&str`. But 210 | the second part now returns one of our new data structures. 211 | 212 | To change the return value, we use `nom`'s 213 | [`map`](https://docs.rs/nom/7.0.0/nom/combinator/fn.map.html) combinator 214 | function. It allows us to apply a closure to convert the matched string into 215 | something else: in the `json_bool` case, one of the `JsonBool` variants. You 216 | will probably smell something funny about that code, though: we already matched 217 | the `"true"` and `"false"` strings once in the parser generated by the `tag` 218 | function, so why are we doing it again? Your instincts are right on— we should 219 | probably back up and fix that, but let's wrap up this discussion first. 
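If you want to convince yourself that the remainder behavior from earlier hasn't changed, an extra assertion like this should still pass (a quick sketch against the map-based `json_bool` above):

```rust
// The output type has changed, but the remainder is still returned untouched.
assert_eq!(json_bool("false more"), Ok((" more", JsonBool::False)));
```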
220 | 221 | The `json_null` function does almost exactly the same thing, though it doesn't 222 | need a `match` because it could only have matched one thing. 223 | 224 | We need to derive `PartialEq` and `Debug` for our structs and enums so that the 225 | `assert_eq!` will work. Our tests are now using the new data structures 226 | `JsonBool` and `JsonNull`. 227 | 228 | ## Part 4. Another way of doing the same thing. 229 | 230 | In `nom`, there are often multiple ways of achieving the same goal. In our 231 | case, `map` is a little bit overkill for this use case. Let's instead use the 232 | [`value`](https://docs.rs/nom/7.0.0/nom/combinator/fn.value.html) combinator 233 | instead, which is specialized for the case where we only care that the child 234 | parser succeeded. 235 | 236 | We'll also refactor `json_bool` so that we don't need to do extra work: we'll 237 | apply our combinator a little earlier, before we lose track of which branch 238 | we're on. 239 | 240 | ```rust 241 | use nom::combinator::value; 242 | 243 | #[derive(PartialEq, Debug, Clone, Copy)] 244 | pub enum JsonBool { 245 | False, 246 | True, 247 | } 248 | 249 | #[derive(PartialEq, Debug, Clone, Copy)] 250 | pub struct JsonNull {} 251 | 252 | fn json_bool(input: &str) -> IResult<&str, JsonBool> { 253 | alt(( 254 | value(JsonBool::False, tag("false")), 255 | value(JsonBool::True, tag("true")), 256 | )) 257 | (input) 258 | } 259 | 260 | fn json_null(input: &str) -> IResult<&str, JsonNull> { 261 | value(JsonNull {}, tag("null")) 262 | (input) 263 | } 264 | ``` 265 | 266 | Hopefully this is pretty straightforward. The `value` combinator returns its 267 | first argument (e.g. `JsonNull {}`), if the second argument succeeds 268 | (`tag("null")`). That description is a bit of a lazy mental shortcut, 269 | because `value` doesn't do any parsing itself. Remember, it's a function that 270 | consumes one parser function and returns another parser function. But because 271 | `nom` makes things so easy, it's sometimes a lot easier to use the lazy way of 272 | thinking when you're plugging combinators together like Lego bricks. 273 | 274 | Note that I added `Clone` to the data structures, because `value` requires it. 275 | I also added `Copy` because these are trivially small structs & enums. 276 | 277 | ## Part 5. Prepare to tree. 278 | 279 | Our final output should be some tree-like data structure, similar to 280 | [`serde_json::Value`](https://docs.serde.rs/serde_json/value/enum.Value.html). 281 | I'm partial to the word "node" to describe the parts of a tree, so let's start 282 | here: 283 | 284 | ```rust 285 | pub enum Node { 286 | Null(JsonNull), 287 | Bool(JsonBool), 288 | } 289 | ``` 290 | 291 | Right away, I don't like where this is going. Here are all the things I'm 292 | unhappy with: 293 | 294 | 1. The redundant naming. I have `Node::Null` and `JsonNull`, for a value that 295 | contains no additional data. 296 | 2. The null and bool types don't really seem like they need their own data 297 | structure name, outside of the tree node. If this were a complex value type 298 | that I might want to pass around on its own, sure. 
But for this simple case, I 299 | think this is a lot simpler: 300 | 301 | ```rust 302 | #[derive(PartialEq, Debug, Clone)] 303 | pub enum Node { 304 | Null, 305 | Bool(bool), 306 | } 307 | 308 | fn json_bool(input: &str) -> IResult<&str, Node> { 309 | alt(( 310 | value(Node::Bool(false), tag("false")), 311 | value(Node::Bool(true), tag("true")), 312 | )) 313 | (input) 314 | } 315 | 316 | fn json_null(input: &str) -> IResult<&str, Node> { 317 | value(Node::Null, tag("null")) 318 | (input) 319 | } 320 | 321 | #[test] 322 | fn test_bool() { 323 | assert_eq!(json_bool("false"), Ok(("", Node::Bool(false)))); 324 | assert_eq!(json_bool("true"), Ok(("", Node::Bool(true)))); 325 | assert!(json_bool("foo").is_err()); 326 | } 327 | 328 | #[test] 329 | fn test_null() { 330 | assert_eq!(json_null("null"), Ok(("", Node::Null))); 331 | } 332 | ``` 333 | 334 | We got rid of JsonNull and JsonBool entirely. For your parser you can choose 335 | any output structure that makes sense; different grammars have different 336 | properties, and they may not map easily onto Rust's prelude types. 337 | 338 | ## Part 6. Parsing numbers is hard. 339 | 340 | The other remaining literal types in JSON are strings and numbers. Let's 341 | tackle numbers first. Referring to 342 | [RFC8259](https://tools.ietf.org/html/rfc8259), the grammar for a JSON number 343 | is: 344 | 345 | ```txt 346 | number = [ minus ] int [ frac ] [ exp ] 347 | 348 | decimal-point = %x2E ; . 349 | digit1-9 = %x31-39 ; 1-9 350 | e = %x65 / %x45 ; e E 351 | exp = e [ minus / plus ] 1*DIGIT 352 | frac = decimal-point 1*DIGIT 353 | int = zero / ( digit1-9 *DIGIT ) 354 | minus = %x2D ; - 355 | plus = %x2B ; + 356 | zero = %x30 ; 0 357 | ``` 358 | 359 | That grammar can represent any integer or floating point value; it would be 360 | grammatically correct to have an integer a thousand digits long, or a floating 361 | point value with huge exponent. It's our decision how to handle these values. 362 | 363 | JSON (like JavaScript) is a bit unusual in not distinguishing integers from 364 | floating-point values. To make this tutorial a little more widely useful, 365 | let's output integers and floats as separate types: 366 | 367 | ```rust 368 | pub enum Node { 369 | Null, 370 | Bool(bool), 371 | Integer(i64), 372 | Float(f64), 373 | } 374 | ``` 375 | 376 | We'll need to do something when we encounter values that are grammatically 377 | correct (e.g. 1000 digits), that we can't handle. This is a common problem, 378 | since most grammars don't attempt to set limits on the size of numbers. Often 379 | there will be a limit set somewhere, but it's not part of the formal grammar. 380 | JSON doesn't set such limits, which can lead to compatibility problems between 381 | implementations. 382 | 383 | It will be important in most parsers to set limits and make sure things fail 384 | gracefully. In Rust you're not likely to have problems with buffer overruns, 385 | but it might be possible to trigger a denial of service, or perhaps even a 386 | crash by triggering excessive recursion. 387 | 388 | Let's start by making the parser functions we need, and we'll see where we need 389 | error handling. 390 | 391 | Let's build a little helper function for the `digit1-9` part, since `nom` only 392 | offers `digit`, which includes `0-9`. 
393 | 394 | ```rust 395 | fn digit1to9(input: &str) -> IResult<&str, &str> { 396 | one_of("123456789") 397 | (input) 398 | } 399 | ``` 400 | 401 | Unfortunately, it doesn't compile: 402 | ```txt 403 | error[E0308]: mismatched types 404 | --> src/lib.rs:21:5 405 | | 406 | 21 | / one_of("123456789") 407 | 22 | | (input) 408 | | |___________^ expected `&str`, found `char` 409 | | 410 | = note: expected enum `std::result::Result<(&str, &str), nom::internal::Err<(&str, nom::error::ErrorKind)>>` 411 | found enum `std::result::Result<(&str, char), nom::internal::Err<_>>` 412 | ``` 413 | 414 | This is a pretty easy mistake to make— we tried to create a parser function 415 | that returns a string slice, but it's returning `char` instead, because, well, 416 | that's how `one_of` works. It's not a big problem for us; just fix the return 417 | type to match: 418 | 419 | ```rust 420 | fn digit1to9(input: &str) -> IResult<&str, char> { 421 | one_of("123456789") 422 | (input) 423 | } 424 | ``` 425 | 426 | We can now build the next function, one that recognizes integers: 427 | ```rust 428 | fn uint(input: &str) -> IResult<&str, &str> { 429 | alt(( 430 | tag("0"), 431 | recognize( 432 | pair( 433 | digit1to9, 434 | digit0 435 | ) 436 | ) 437 | )) 438 | (input) 439 | } 440 | ``` 441 | 442 | Again, we use `alt` to specify that an integer is either `0`, or a nonzero 443 | digit, possibly followed by more additional digits. 444 | 445 | The new combinator here is `recognize`. Let's back up and look at the return 446 | type of this hypothetical function: 447 | 448 | ```rust 449 | fn nonzero_integer(input: &str) -> IResult<&str, ____> { 450 | pair( 451 | digit1to9, 452 | digit0 453 | ) 454 | (input) 455 | } 456 | ``` 457 | 458 | Because we used `pair`, the return type would be a 2-tuple. The first element 459 | would be a `char` (because that's what we returned from `digit1to9`), and the 460 | other element would be a `&str`. So the blank above would be filled in like 461 | this: 462 | 463 | ```rust 464 | fn nonzero_integer(input: &str) -> IResult<&str, (char, &str)> { 465 | ... 466 | } 467 | ``` 468 | 469 | In this context, not very helpful. What we'd like to say is, "match this bunch 470 | of stuff, but just return the string slice that covers what we matched." 471 | That's exactly what `recognize` does. 472 | 473 | Because we're going to store integers in a different `Node` variant, we should 474 | also do one last call to `map`. But that might make life difficult if we want 475 | to re-use this code as part of a token that's representing a floating-point 476 | number. 477 | 478 | So let's leave the `uint` function alone; we'll use it as a building block of 479 | another function. 480 | 481 | Note also that we can't finish parsing an integer until we've consumed the 482 | optional leading "minus" symbol. 483 | 484 | ```rust 485 | fn json_integer(input: &str) -> IResult<&str, &str> { 486 | recognize( 487 | pair( 488 | opt(tag("-")), 489 | uint 490 | ) 491 | ) 492 | (input) 493 | } 494 | ``` 495 | 496 | The `opt` function is another `nom` combinator; it means "optional", and 497 | unsurprisingly it will return an `Option` where `T` in this case is `&str` 498 | (because that's what `tag("-")` will returns. But that return type is ignored; 499 | `recognize` will throw it away and just give us back the characters that were 500 | consumed by the successful match. 501 | 502 | Let's add one more step to our function: convert the resulting string into a 503 | `Node::Integer`. 
504 | 505 | ```rust 506 | fn json_integer(input: &str) -> IResult<&str, Node> { 507 | let parser = recognize( 508 | pair( 509 | opt(tag("-")), 510 | uint 511 | ) 512 | ); 513 | map(parser, |s| { 514 | // FIXME: unwrap() may panic if the value is out of range 515 | let n = s.parse::().unwrap(); 516 | Node::Integer(n) 517 | }) 518 | (input) 519 | } 520 | ``` 521 | 522 | Finally, we discover a point where we'll need some error handling. 523 | [`str::parse`](https://doc.rust-lang.org/std/primitive.str.html#method.parse) 524 | returns a `Result`, and will certainly return `Err` if we try to parse 525 | something too big. 526 | 527 | I am going to leave proper error handling until the end, so for now I will just 528 | `unwrap` the result. This means the parser will panic if we give it a huge 529 | integer, so we definitely need to come back and fix this later. 530 | 531 | For now we'll finish up this section with a few unit tests: 532 | 533 | ```rust 534 | #[test] 535 | fn test_integer() { 536 | assert_eq!(json_integer("42"), Ok(("", Node::Integer(42)))); 537 | assert_eq!(json_integer("-123"), Ok(("", Node::Integer(-123)))); 538 | assert_eq!(json_integer("0"), Ok(("", Node::Integer(0)))); 539 | assert_eq!(json_integer("01"), Ok(("1", Node::Integer(0)))); 540 | } 541 | ``` 542 | 543 | Note the fourth test case— this might not be what you expected. We know that 544 | integers with a leading zero aren't allowed by this grammar— so why did the 545 | call to `json_integer` succeed? It has to do with the way `nom` operates. Each 546 | parser only consumes the part of the string it matches, and leaves the rest for 547 | some other parser. So attempting to parse `01` results in a success, returning 548 | a result `Node::Integer(0)` along with a remainder string `1`. 549 | 550 | `nom` does have ways for parsers to trigger a fatal error if they're unhappy 551 | with the sequence of characters, but this grammar probably won't need them. 552 | 553 | ## Part 7. Parsing numbers some more. 554 | 555 | Let's piece together the bits we need to parse floating point numbers. 556 | 557 | ```rust 558 | fn frac(input: &str) -> IResult<&str, &str> { 559 | recognize( 560 | pair( 561 | tag("."), 562 | digit1 563 | ) 564 | ) 565 | (input) 566 | } 567 | 568 | fn exp(input: &str) -> IResult<&str, &str> { 569 | recognize( 570 | tuple(( 571 | tag("e"), 572 | opt(alt(( 573 | tag("-"), 574 | tag("+") 575 | ))), 576 | digit1 577 | )) 578 | ) 579 | (input) 580 | } 581 | 582 | fn json_float(input: &str) -> IResult<&str, Node> { 583 | let parser = recognize( 584 | tuple(( 585 | opt(tag("-")), 586 | uint, 587 | opt(frac), 588 | opt(exp) 589 | )) 590 | ); 591 | map(parser, |s| { 592 | // FIXME: unwrap() may panic if the value is out of range 593 | let n = s.parse::().unwrap(); 594 | Node::Float(n) 595 | }) 596 | (input) 597 | } 598 | ``` 599 | 600 | The only new parts here are: 601 | - `nom::character::complete::digit1`: just like `digit0`, except this matches 602 | one-or-more digits. 603 | - `nom::sequence::tuple` is a lot like `pair`, but accepts an arbitrary number 604 | of other parsers. Each sub-parser must match in sequence, and the return value 605 | is a tuple of results. 606 | 607 | I added some straightforward unit tests here, and they all pass. Despite that, 608 | I've made a significant mistake, but one that we won't notice until we start 609 | stitching the various parts together. Let's do that now. 
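For reference, the straightforward tests mentioned above might look something like this sketch (the exact cases in the repository may differ), and they do all pass:

```rust
#[test]
fn test_float() {
    // These exercise the optional sign, frac, and exp pieces.
    assert_eq!(json_float("42.0"), Ok(("", Node::Float(42.0))));
    assert_eq!(json_float("-1.5e3"), Ok(("", Node::Float(-1500.0))));
    assert_eq!(json_float("2e-2"), Ok(("", Node::Float(0.02))));
}
```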
610 | 611 | When a parser executes, it obviously won't know which elements are arriving in 612 | which order, so we need a parser function to handle everything we've built so 613 | far. Thanks to the magic of `nom`, this part is really easy. 614 | 615 | ```rust 616 | fn json_literal(input: &str) -> IResult<&str, Node> { 617 | alt(( 618 | json_integer, 619 | json_float, 620 | json_bool, 621 | json_null 622 | )) 623 | (input) 624 | } 625 | ``` 626 | 627 | And now we discover that something is wrong: 628 | 629 | ```rust 630 | #[test] 631 | fn test_literal() { 632 | assert_eq!(json_literal("56"), Ok(("", Node::Integer(56)))); 633 | assert_eq!(json_literal("78.0"), Ok(("", Node::Float(78.0)))); 634 | } 635 | ``` 636 | 637 | ```txt 638 | test test_literal ... FAILED 639 | 640 | failures: 641 | 642 | ---- test_literal stdout ---- 643 | thread 'test_literal' panicked at 'assertion failed: `(left == right)` 644 | left: `Ok((".0", Integer(78)))`, 645 | right: `Ok(("", Float(78.0)))`', src/lib.rs:163:5 646 | ``` 647 | 648 | Because we put `json_integer` first, it grabbed the `78` part and declared 649 | success, leaving `.0` for someone else to deal with. Not so big a deal, 650 | right? Let's just swap the order of the parsers: 651 | 652 | ```rust 653 | fn json_literal(input: &str) -> IResult<&str, Node> { 654 | alt(( 655 | json_float, 656 | json_integer, 657 | json_bool, 658 | json_null 659 | )) 660 | (input) 661 | } 662 | ``` 663 | 664 | ```txt 665 | test test_literal ... FAILED 666 | 667 | failures: 668 | 669 | ---- test_literal stdout ---- 670 | thread 'test_literal' panicked at 'assertion failed: `(left == right)` 671 | left: `Ok(("", Float(56.0)))`, 672 | right: `Ok(("", Integer(56)))`', src/lib.rs:162:5 673 | ``` 674 | 675 | We've traded one problem for another. This time, `json_float` runs first, 676 | consumes the input `56` input and declares success, returning `Float(56.0)`. 677 | This isn't wrong, exactly. Had we decided at the beginning to treat all 678 | numbers as floating-point (as JavaScript does) this would be the expected 679 | outcome. But since we committed to storing integers and floats as separate 680 | tree nodes, we have a problem. 681 | 682 | Since we can't allow either the `json_float` parser or the `json_integer` 683 | parser to run first (at least as currently written), let's imagine what we'd 684 | like to see happen. Ideally, we would start parsing the `[ minus ] int` part 685 | of the grammar, and if that succeeds we have a possible integer-or-float 686 | match. We should then continue on, trying to match the `[ frac ] [ exp ]` 687 | part, and if _either of those_ succeeds, we have a float. 688 | 689 | There are a few different ways to implement that logic. 690 | 691 | One way would be to get `json_float` to fail if the next character after the 692 | integer part is _not_ a `.` or `e` character— without that it can't possibly be 693 | a valid float (according to our grammar), so if `json_float` fails at that 694 | point we know the `json_integer` parser will run next (and succeed). 695 | 696 | ```rust 697 | fn json_float(input: &str) -> IResult<&str, Node> { 698 | let parser = recognize( 699 | tuple(( 700 | opt(tag("-")), 701 | uint, 702 | peek(alt(( 703 | tag("."), 704 | tag("e"), 705 | ))), 706 | opt(frac), 707 | opt(exp) 708 | )) 709 | ); 710 | map(parser, |s| { 711 | let n = s.parse::().unwrap(); 712 | Node::Float(n) 713 | }) 714 | (input) 715 | } 716 | ``` 717 | 718 | This code has one small annoyance, though it's not a problem in the overall 719 | JSON context. 
Imagine that we took this `json_float` parser code, and tried to 720 | reuse it in another language, where this other language's grammar would allow 721 | the input `123.size()`. This code would `peek` ahead and see the `.` 722 | character, and because of that it would parse `123` as a float rather than an 723 | integer. In other words, this `json_float` implementation decides that this 724 | input is a float before it's actually finished parsing all the characters 725 | making up that float. 726 | 727 | There is a slightly better way, though. Remember, our original problem is that 728 | `json_float` will succeed in all of the following cases: 729 | - `123` 730 | - `123.0` 731 | - `123e9` 732 | - `123.0e9` 733 | What we'd rather have is a parser that succeeds at the last three, but not the 734 | first. There isn't a combinator in `nom` that implements "A or B or AB", but 735 | it's not that hard to implement ourselves: 736 | 737 | ```rust 738 | fn json_float(input: &str) -> IResult<&str, Node> { 739 | let parser = recognize( 740 | tuple(( 741 | opt(tag("-")), 742 | uint, 743 | alt(( 744 | recognize(pair( 745 | frac, 746 | opt(exp) 747 | )), 748 | exp 749 | )), 750 | )) 751 | ); 752 | map(parser, |s| { 753 | let n = s.parse::().unwrap(); 754 | Node::Float(n) 755 | }) 756 | (input) 757 | } 758 | ``` 759 | 760 | This new logic uses `alt` to allow two choices: either a `frac` must be present 761 | (with an optional `exp`) following, or an `exp` must be present by itself. An 762 | input with neither a valid `frac` or `exp` will now fail, which makes 763 | everything work the way we want it to. 764 | 765 | ## Part 8. Handling string literals 766 | 767 | So far we support literal null, boolean, integer, and float types. There's 768 | only one more literal type left to handle: strings. 769 | 770 | In the JSON grammar, a string is basically a series of Unicode characters that 771 | starts and ends with a quote, plus a few extra rules: 772 | 773 | 1. Certain characters must be escaped (ASCII control characters, quotes, and 774 | backslashes) 775 | 2. Any character may be escaped, using `\u` plus 4 hexadecimal digits, e.g. 776 | `\uF903`. 777 | 3. A small number of common characters have two-character escapes: 778 | `\"` `\\` `\/` `\b` `\f` `\n` `\r` `\t`. 779 | 780 | That's how RFC 8259 does things, anyway. Different implementations may have 781 | subtle differences. 782 | 783 | This means there are many possible ways to represent a certain string. We're 784 | only building a parser, so we just need to make sure we can parse all the valid 785 | JSON representations (and hopefully return an error on all the invalid ones). 786 | 787 | The presence of escape characters makes our job more difficult. There are 788 | different ways we might choose to address this. I'm going to choose to break 789 | escape handling into a separate phase. This means we will only use `nom` to do 790 | the lexing part (finding the bounds of the string literal), and we'll follow up 791 | with an "un-escaping" pass to decode the escaped characters. 792 | 793 | Bad inputs must be rejected by one of the two phases, but we don't care which 794 | one. For example, `"\ud800"` looks like a valid JSON string, but can't be 795 | decoded because U+D800 is a magic "surrogate" character, meaning it's half of a 796 | character that needs more than 16 bits to encode. 
We should also reject things 797 | like `"\x"` (a nonexistent escape), `"\u001"` (not enough hex digits), and 798 | `"\"` (which is unterminated because the trailing quote is escaped). We also 799 | need to reject "naked" (non-escaped) control characters (ASCII 0x00-0x1F), 800 | though for some reason 0x7F (ASCII DELETE) is legal. 801 | 802 | Let's begin by building a parser for "a string of valid non-escaped 803 | characters": everything except control characters, backslash, and quote. We 804 | don't need to check the upper limit 0x10FFFF because those characters will 805 | never appear in a Rust `char`. 806 | 807 | ```rust 808 | use nom::bytes::complete::take_while1; 809 | 810 | fn is_nonescaped_string_char(c: char) -> bool { 811 | let cv = c as u32; 812 | (cv >= 0x20) && (cv != 0x22) && (cv != 0x5C) 813 | } 814 | 815 | // One or more unescaped text characters 816 | fn nonescaped_string(input: &str) -> IResult<&str, &str> { 817 | take_while1(is_nonescaped_string_char) 818 | (input) 819 | } 820 | ``` 821 | 822 | The `take_while1` function comes from the nom `bytes` module (which, remember, 823 | isn't specific to byte sequences). `nom` offers a few different `take` 824 | functions in this module; `take_while1` consumes characters that match some 825 | condition, requiring at least 1 matching character. 826 | 827 | Next, let's add a parser that can detect one escape sequence. Actually, we're 828 | going to be even lazier than that; we'll pretend that `\u` is an escape 829 | sequence all by itself, and let the unescape function determine whether the 830 | characters that follow make sense. We could easily do it differently, but 831 | since the unescape code will need to look at those characters in detail later, 832 | we won't waste time doing that work twice. 833 | 834 | ```rust 835 | fn escape_code(input: &str) -> IResult<&str, &str> { 836 | recognize( 837 | pair( 838 | tag("\\"), 839 | alt(( 840 | tag("\""), 841 | tag("\\"), 842 | tag("/"), 843 | tag("b"), 844 | tag("f"), 845 | tag("n"), 846 | tag("r"), 847 | tag("t"), 848 | tag("u"), 849 | )) 850 | ) 851 | ) 852 | (input) 853 | } 854 | ``` 855 | 856 | Using those two pieces, we can now connect them together to parse the entire 857 | body of a JSON string (minus the quotes that surround it): 858 | 859 | ```rust 860 | use nom::multi::many0; 861 | 862 | fn string_body(input: &str) -> IResult<&str, &str> { 863 | recognize( 864 | many0( 865 | alt(( 866 | nonescaped_string, 867 | escape_code 868 | )) 869 | ) 870 | ) 871 | (input) 872 | } 873 | ``` 874 | 875 | We've seen most of the pieces here before. 876 | 877 | `many0` tries to apply a parser function repeatedly, gathering all of the 878 | results into a vector. This version gathers "zero or more" of whatever we were 879 | searching for (which is desirable because `""` is a valid JSON string). There 880 | is also a `many1`, (if you want "one or more") and several other variations. 881 | 882 | The final `recognize` throws away the output of `many0` (a vector), and instead 883 | just returns to us the string that was matched. It's a little unfortunate that 884 | we're throwing away the information we developed about where escapes appear— 885 | perhaps another implementation could do the unescaping work right here. It 886 | seems pretty typical (in my limited experience) to have to make tradeoffs like 887 | this. We're breaking the work into multiple phases, which may require a little 888 | bit of redundant effort, but our code gets a little simpler as a result. 
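A couple of quick checks (a sketch, not necessarily tests from the repository) show how these building blocks behave: `string_body` stops at the first unescaped quote, and it accepts `\u` without validating the hex digits that follow, leaving that to the later unescaping pass.

```rust
// Stops before the unescaped quote, leaving it in the remainder.
assert_eq!(string_body(r#"ab\"cd" tail"#), Ok((r#"" tail"#, r#"ab\"cd"#)));
// Lexing only: the bogus `\u` escape is accepted here and rejected later.
assert_eq!(string_body(r#"ab\ucd"#), Ok(("", r#"ab\ucd"#)));
```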
889 | 890 | There's one subtle thing about these two layers that should be pointed out. 891 | Both `nonescaped_string` and `escape_code` are parsers that return "one or more 892 | characters". And then we use those to build a parser that returns "zero or 893 | more characters". In fact, you can't build a "zero or more" parser using other 894 | "zero or more" components, because that could trigger an infinite loop: the 895 | outer parser could try to gather an infinite number of empty subparser 896 | successes. Typically `nom` combinators will return an error instead of going 897 | into an infinite loop. 898 | 899 | The next step is pretty simple: the string body must be wrapped in quotes. 900 | 901 | ```rust 902 | use nom::sequence::delimited; 903 | 904 | fn json_string(input: &str) -> IResult<&str, &str> { 905 | delimited( 906 | tag("\""), 907 | string_body, 908 | tag("\"") 909 | ) 910 | (input) 911 | } 912 | ``` 913 | 914 | This is the first time we've used `delimited`. It runs three sub-parsers, 915 | returning the result of the middle one. The result from the first and third 916 | arguments (the quote characters) are discarded. 917 | 918 | At this point I should plug in some code to do un-escaping. Because this code 919 | doesn't use `nom` and doesn't really help us understand how to write a `nom` 920 | parser, I'm going to skip the explanation and just pull the 921 | [escape8259](https://docs.rs/escape8259/0.5.0/escape8259/) crate that does this 922 | part. A call to un-escape a string is pretty simple: 923 | 924 | ```rust 925 | pub fn unescape(s: &str) -> Result 926 | ``` 927 | 928 | So all we need to do is plug that into `json_string`. We earlier used `nom`'s 929 | `map` combinator to do this sort of thing, but here we need something a little 930 | different because `unescape` may fail. We need to use `map_res` to handle 931 | `Result::Err`. 932 | 933 | ```rust 934 | use nom::combinator::map_res; 935 | use escape8259::unescape; 936 | 937 | fn string_literal(input: &str) -> IResult<&str, String> { 938 | let parser = delimited( 939 | tag("\""), 940 | string_body, 941 | tag("\"") 942 | ); 943 | map_res(parser, |s| { 944 | unescape(s) 945 | }) 946 | (input) 947 | } 948 | ``` 949 | 950 | We also need to update our `Node` enum to include a string variant (we'll call 951 | this `Str`), and make that our final output. 952 | 953 | ```rust 954 | pub enum Node { 955 | Null, 956 | Bool(bool), 957 | Integer(i64), 958 | Float(f64), 959 | Str(String), 960 | } 961 | 962 | fn json_string(input: &str) -> IResult<&str, Node> { 963 | map(string_literal, |s| { 964 | Node::Str(s) 965 | }) 966 | (input) 967 | } 968 | ``` 969 | 970 | Finally, we should write some tests to make sure this is working correctly. 
971 | 972 | ```rust 973 | #[test] 974 | fn test_string() { 975 | // Plain Unicode strings with no escaping 976 | assert_eq!(json_string(r#""""#), Ok(("", Node::Str("".into())))); 977 | assert_eq!(json_string(r#""Hello""#), Ok(("", Node::Str("Hello".into())))); 978 | assert_eq!(json_string(r#""の""#), Ok(("", Node::Str("の".into())))); 979 | assert_eq!(json_string(r#""𝄞""#), Ok(("", Node::Str("𝄞".into())))); 980 | 981 | // valid 2-character escapes 982 | assert_eq!(json_string(r#"" \\ ""#), Ok(("", Node::Str(" \\ ".into())))); 983 | assert_eq!(json_string(r#"" \" ""#), Ok(("", Node::Str(" \" ".into())))); 984 | 985 | // valid 6-character escapes 986 | assert_eq!(json_string(r#""\u0000""#), Ok(("", Node::Str("\x00".into())))); 987 | assert_eq!(json_string(r#""\u00DF""#), Ok(("", Node::Str("ß".into())))); 988 | assert_eq!(json_string(r#""\uD834\uDD1E""#), Ok(("", Node::Str("𝄞".into())))); 989 | 990 | // Invalid because surrogate characters must come in pairs 991 | assert!(json_string(r#""\ud800""#).is_err()); 992 | // Unknown 2-character escape 993 | assert!(json_string(r#""\x""#).is_err()); 994 | // Not enough hex digits 995 | assert!(json_string(r#""\u""#).is_err()); 996 | assert!(json_string(r#""\u001""#).is_err()); 997 | // Naked control character 998 | assert!(json_string(r#""\x0a""#).is_err()); 999 | // Not a JSON string because it's not wrapped in quotes 1000 | assert!(json_string("abc").is_err()); 1001 | } 1002 | ``` 1003 | 1004 | ## Part 9. Arrays and Objects 1005 | 1006 | Finally, all of the hard parts are complete, and we get to the fun parts: 1007 | arrays and objects (maps or dictionaries in other languages). 1008 | 1009 | Let's start with the changes to our `Node` enum, to give us a little better 1010 | idea how these recursive data structures should work. 1011 | 1012 | ```rust 1013 | pub enum Node { 1014 | Null, 1015 | Bool(bool), 1016 | Integer(i64), 1017 | Float(f64), 1018 | Str(String), 1019 | Array(Vec<Node>), 1020 | Object(Vec<(String, Node)>), 1021 | } 1022 | ``` 1023 | 1024 | Since `Node` now includes types other than literal values, let's rename 1025 | `json_literal` to `json_value`: 1026 | 1027 | ```rust 1028 | fn json_value(input: &str) -> IResult<&str, Node> { 1029 | spacey(alt(( 1030 | json_array, 1031 | json_object, 1032 | json_string, 1033 | json_float, 1034 | json_integer, 1035 | json_bool, 1036 | json_null 1037 | ))) 1038 | (input) 1039 | } 1040 | ``` 1041 | 1042 | An array can be heterogeneous (different value types, e.g. `[1, "foo", true]`). 1043 | Each object member must have a string for its key, and may have any value 1044 | type. An object might be `{"a": 1, "b": false}`. Arrays and objects can be 1045 | nested arbitrarily. 1046 | 1047 | Let's implement arrays first. 1048 | 1049 | ```rust 1050 | use nom::multi::separated_list0; 1051 | 1052 | fn json_array(input: &str) -> IResult<&str, Node> { 1053 | let parser = delimited( 1054 | tag("["), 1055 | separated_list0(tag(","), json_value), 1056 | tag("]") 1057 | ); 1058 | map(parser, |v| { 1059 | Node::Array(v) 1060 | }) 1061 | (input) 1062 | } 1063 | ``` 1064 | 1065 | That was surprisingly easy. The only new thing we needed was `separated_list0`, 1066 | which alternates between two subparsers. The first argument is the 1067 | "separator", and its result is thrown away; we get a vector of results from the 1068 | second parser. It will match zero or more elements; `nom` has a 1069 | `separated_list1` if you want one-or-more.
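Because `json_array` recurses through `json_value`, nested arrays already work. Here's a quick check (a sketch, not one of the repository's tests):

```rust
let expected = Node::Array(vec![
    Node::Array(vec![Node::Integer(1)]),
    Node::Array(vec![Node::Integer(2), Node::Bool(true)]),
]);
assert_eq!(json_array("[[1],[2,true]]"), Ok(("", expected)));
```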
1070 | 1071 | Objects are up next; they're a little more complicated so let's implement them 1072 | as two separate functions. 1073 | 1074 | ```rust 1075 | use nom::sequence::separated_pair; 1076 | 1077 | fn object_member(input: &str) -> IResult<&str, (String, Node)> { 1078 | separated_pair(string_literal, tag(":"), json_value) 1079 | (input) 1080 | } 1081 | 1082 | fn json_object(input: &str) -> IResult<&str, Node> { 1083 | let parser = delimited( 1084 | tag("{"), 1085 | separated_list0( 1086 | tag(","), 1087 | object_member 1088 | ), 1089 | tag("}") 1090 | ); 1091 | map(parser, |v| { 1092 | Node::Object(v) 1093 | }) 1094 | (input) 1095 | } 1096 | ``` 1097 | 1098 | This looks a lot like the array implementation. The only difference (other 1099 | than the braces) is that where an array looks for a single value, the object 1100 | looks for a quoted string literal, then a `:` character, and then a value. 1101 | 1102 | And we have a JSON parser! 1103 | 1104 | ## Part 10. Spacing out 1105 | 1106 | Well, we almost have a JSON parser. We might start testing arrays like this: 1107 | 1108 | ```rust 1109 | #[test] 1110 | fn test_array() { 1111 | assert_eq!(json_array("[]"), Ok(("", Node::Array(vec![])))); 1112 | assert_eq!(json_array("[1]"), Ok(("", Node::Array(vec![Node::Integer(1)])))); 1113 | 1114 | let expected = Node::Array(vec![Node::Integer(1), Node::Integer(2)]); 1115 | assert_eq!(json_array("[1,2]"), Ok(("", expected))); 1116 | } 1117 | ``` 1118 | 1119 | But it doesn't work if we write: 1120 | 1121 | ```rust 1122 | assert_eq!(json_array("[1, 2]"), Ok(("", expected))); 1123 | ``` 1124 | 1125 | The only difference is the space character after the comma. We forgot to 1126 | handle whitespace. 1127 | 1128 | In fact, we haven't handled whitespace anywhere. Whitespace could appear 1129 | anywhere: before or after values or any punctuation (braces, brackets, comma, 1130 | or colon). 1131 | 1132 | To ignore whitespace, we need a parser function that matches whitespace. We 1133 | could easily build one, but `nom` includes one that matches our needs exactly: 1134 | `nom::character::complete::multispace0`. 1135 | 1136 | That means we need to do a bunch of substitutions, things like: 1137 | ```rust 1138 | tag("[") 1139 | ``` 1140 | need to become 1141 | ```rust 1142 | delimited(multispace0, tag("["), multispace0) 1143 | ``` 1144 | 1145 | Which adds a lot of clutter, and is kind of hard to read. Maybe instead we 1146 | should write a combinator of our own to make this a little more compact. This 1147 | isn't absolutely necessary— the cluttered version is perfectly functional. The 1148 | only reason I'm going to tackle this is it provides a little bit of insight 1149 | into the pile of generic parameters you see if you look at the documentation 1150 | for `nom` combinators. If you don't care, feel free to skip this section. 1151 | 1152 | First, let's write a combinator that does nothing, other than apply a parser we 1153 | specify. 1154 | 1155 | ```rust 1156 | fn identity<F, I, O, E>(f: F) -> impl FnMut(I) -> IResult<I, O, E> 1157 | where 1158 | F: FnMut(I) -> IResult<I, O, E>, 1159 | { 1160 | f 1161 | } 1162 | ``` 1163 | 1164 | That looks pretty intimidating. But so do most of the built-in `nom` 1165 | combinators, so if we can understand this combinator function, we'll have a 1166 | little easier time understanding other parts of `nom`. 1167 | 1168 | Let's see if we can make some sense of all those generic parameters. 1169 | 1170 | `F` is the type of the parser we pass in.
It could be any `nom`-style parser, 1171 | and we already know what those look like; they accept one input parameter, and 1172 | return an `IResult<I, O, E>`. This `IResult` has three generic parameters, and we've 1173 | always used two— the third has a default value, and we've been omitting it. 1174 | 1175 | So our `F` is a function that accepts one `I` and returns `IResult<I, O, E>`. 1176 | `I` is our input parameter (which has been `&str` so far everywhere). `O` is 1177 | our output type (and we've used a bunch of different ones; `&str`, `Node`, 1178 | etc.) The `E` is the parser error type, and we can continue ignoring that for 1179 | now since we've only used the default. 1180 | 1181 | Our combinator returns a closure. So its return type is 1182 | `FnMut(I) -> IResult<I, O, E>`. That looks the same as `F`, but for all cases 1183 | other than `identity` we'll return a different closure than the input, so we 1184 | will need to spell out the return type. 1185 | 1186 | A lot of `nom` combinators have even more complex type signatures 1187 | (`separated_pair` has 8 generic parameters!) but picking them apart is usually 1188 | pretty straightforward if you're patient. You'll probably only need to know 1189 | when something fails to compile. 1190 | 1191 | Anyway, let's write a combinator that wraps its input in a `delimited` with 1192 | `multispace0` on both sides. 1193 | 1194 | ```rust 1195 | fn spacey<F, I, O, E>(f: F) -> impl FnMut(I) -> IResult<I, O, E> 1196 | where 1197 | F: FnMut(I) -> IResult<I, O, E>, 1198 | { 1199 | delimited(multispace0, f, multispace0) 1200 | } 1201 | ``` 1202 | 1203 | This explodes with a huge pile of errors; many complaints about trait bounds 1204 | that aren't met for `I` and `E`. But it turns out that this is just because 1205 | `multispace0` requires those on its `I` and `E`, so we have to guarantee those 1206 | trait bounds as well. Copying those trait bounds over to our function will 1207 | work: 1208 | 1209 | ```rust 1210 | fn spacey<F, I, O, E>(f: F) -> impl FnMut(I) -> IResult<I, O, E> 1211 | where 1212 | F: FnMut(I) -> IResult<I, O, E>, 1213 | I: nom::InputTakeAtPosition, 1214 | <I as nom::InputTakeAtPosition>::Item: nom::AsChar + Clone, 1215 | E: nom::error::ParseError<I>, 1216 | { 1217 | delimited(multispace0, f, multispace0) 1218 | } 1219 | ``` 1220 | 1221 | Was that worth it? Maybe not for this program. But it's interesting to see 1222 | what's involved in building our own combinators. Maybe the `nom` function 1223 | documentation will look a little less scary, too. 1224 | 1225 | Now that we have a useful multispace-handling combinator, we can sprinkle it 1226 | around all the places where we need to ignore whitespace. For example: 1227 | 1228 | ```rust 1229 | fn json_array(input: &str) -> IResult<&str, Node> { 1230 | let parser = delimited( 1231 | spacey(tag("[")), 1232 | separated_list0(spacey(tag(",")), json_value), 1233 | spacey(tag("]")), 1234 | ); 1235 | map(parser, |v| { 1236 | Node::Array(v) 1237 | }) 1238 | (input) 1239 | } 1240 | ``` 1241 | 1242 | ## Part 11. Error handling. 1243 | 1244 | We skipped over a few places where proper error handling is needed. For 1245 | example, numbers that are out of bounds (e.g. `1e99999`) should return some 1246 | kind of parse error. 1247 | 1248 | Currently we are using the `IResult` default error type, which is 1249 | `nom::internal::Err<(&str, nom::error::ErrorKind)>`. That doesn't look 1250 | promising— we can't realistically expect to be able to extend that type with 1251 | our own error variants. 1252 | 1253 | So let's build our own error type.
We'll use macros from the 1254 | [`thiserror`](https://docs.rs/thiserror/1.0/thiserror/) crate to automatically 1255 | generate some of the boilerplate that's necessary for error types. 1256 | 1257 | ```rust 1258 | #[derive(thiserror::Error, Debug, PartialEq)] 1259 | pub enum JSONParseError { 1260 | #[error("bad integer")] 1261 | BadInt, 1262 | #[error("bad float")] 1263 | BadFloat, 1264 | #[error("bad escape sequence")] 1265 | BadEscape, 1266 | #[error("unknown parser error")] 1267 | Unparseable, 1268 | } 1269 | ``` 1270 | 1271 | Because `nom` error handling uses generic parameters, it can be difficult to 1272 | see how to best implement a custom error type. There is a good minimal example 1273 | of custom error types in the nom 7.0 sources 1274 | ([examples/custom_error.rs](https://github.com/Geal/nom/blob/7.0.0/examples/custom_error.rs)) 1275 | that shows the steps needed to make things work gracefully: 1276 | 1277 | 1. Figure out how to map a `nom` error into your error type. Usually this will 1278 | be with a dedicated enum variant. 1279 | 2. Implement the trait `nom::error::ParseError` for your error type. This 1280 | will allow all of the `nom` combinators to generate your custom error type when 1281 | needed. 1282 | 3. Use the 3-argument form of `IResult`, specifying your error type. You will 1283 | probably want to do this on most or all of your parser functions so combinators 1284 | work gracefully. 1285 | 1286 | When building a custom error type that will be generated by nom parsers, 1287 | consider how far you want to propagate the error metadata (`ErrorKind` and 1288 | input slice). If the error type is only visible inside a crate, it can 1289 | be useful to preserve all the nom metadata (the `input` and `kind` parameters 1290 | to `ParseError::from_error_kind`) for debugging. In a public error struct, it 1291 | may be wiser to discard that information, as a user of your crate probably 1292 | doesn't care about `nom` error metadata. I will assume `JSONParseError` is 1293 | public, so I will discard the `nom` error parameters. 1294 | 1295 | ```rust 1296 | use nom::error::{ErrorKind, ParseError}; 1297 | 1298 | impl<I> ParseError<I> for JSONParseError { 1299 | fn from_error_kind(_input: I, _kind: ErrorKind) -> Self { 1300 | JSONParseError::Unparseable 1301 | } 1302 | 1303 | fn append(_: I, _: ErrorKind, other: Self) -> Self { 1304 | other 1305 | } 1306 | } 1307 | ``` 1308 | 1309 | For error handling on integers, we'll split the function into two parts to make 1310 | it easier to read: 1311 | 1312 | ```rust 1313 | fn integer_body(input: &str) -> IResult<&str, &str, JSONParseError> { 1314 | recognize( 1315 | pair( 1316 | opt(tag("-")), 1317 | uint 1318 | ) 1319 | ) 1320 | (input) 1321 | } 1322 | 1323 | fn json_integer(input: &str) -> IResult<&str, Node, JSONParseError> { 1324 | let (remain, raw_int) = integer_body(input)?; 1325 | match raw_int.parse::<i64>() { 1326 | Ok(i) => Ok((remain, Node::Integer(i))), 1327 | Err(_) => Err(nom::Err::Failure(JSONParseError::BadInt)), 1328 | } 1329 | } 1330 | ``` 1331 | 1332 | Note that `json_integer` works differently from all the other parsers we've 1333 | written so far: instead of composing parsers using combinators, we actually run 1334 | the `integer_body` parser and capture its result (the remainder and the matched 1335 | string slice). We then attempt to parse the string slice into an integer, and 1336 | assemble the `IResult` by hand.
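With that in place, an out-of-range integer now produces a clean error instead of a panic. A quick check (a sketch, not necessarily a test from the repository):

```rust
#[test]
fn test_bad_integer() {
    // Too big for an i64, so json_integer should fail with our custom error.
    let result = json_integer("99999999999999999999999");
    assert!(matches!(result, Err(nom::Err::Failure(JSONParseError::BadInt))));
}
```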
1337 | 1338 | This can be a useful technique when the `nom` combinators don't supply exactly 1339 | what you need. Here, I first tried using `map_res` to parse the int, but it 1340 | turns out that `map_res` always throws away the error value returned by the 1341 | closure, and substitutes its own error (with kind `MapRes`). 1342 | 1343 | The same approach works for string escaping errors and float parsing errors, 1344 | though float overflow in Rust results in infinity, not an error. This means we 1345 | will never actually return `BadFloat` because there are no 1346 | grammatically-correct floats that can't be parsed into an `f64`. 1347 | (Though Rust versions older than 1.55 had some problems parsing 1348 | [certain edge cases](https://github.com/rust-lang/rust/issues/31407).) 1349 | 1350 | ## Part 12. Finalization. 1351 | 1352 | There's one more `nom`-specific step that we probably want. Assuming our code 1353 | is a library, meant to be used by other programs, we don't want `nom::IResult` 1354 | to show up as our public result type. We should instead return 1355 | `Result<Node, JSONParseError>`. 1356 | 1357 | We can use `all_consuming` to ensure that all input was matched. Unfortunately, 1358 | there doesn't seem to be a simple `nom` shortcut for translating the error. We 1359 | can do this ourselves: 1360 | 1361 | ```rust 1362 | 1363 | use nom::combinator::all_consuming; 1364 | 1365 | pub fn parse_json(input: &str) -> Result<Node, JSONParseError> { 1366 | let (_, result) = all_consuming(json_value)(input).map_err(|nom_err| { 1367 | match nom_err { 1368 | nom::Err::Incomplete(_) => unreachable!(), 1369 | nom::Err::Error(e) => e, 1370 | nom::Err::Failure(e) => e, 1371 | } 1372 | })?; 1373 | Ok(result) 1374 | } 1375 | ``` 1376 | 1377 | We haven't talked yet about the three 1378 | [`nom::Err`](https://docs.rs/nom/7.0.0/nom/enum.Err.html) variants. 1379 | 1380 | - `Incomplete` is only used by `nom` streaming parsers. We don't use those, so 1381 | we can just mark that branch `unreachable!` (which would panic). 1382 | - `Error` is what we usually see when a parser has a problem. Something didn't 1383 | match the expected grammar. 1384 | - `Failure` appears less often. It means that the input could only be parsed 1385 | one way, but a parser decided that it was invalid. Unlike `Error`, this error 1386 | is propagated upward without trying any alternative paths (if something like 1387 | `alt` is present). 1388 | 1389 | Our code does use `Failure` in a few places: that's what we return when there 1390 | is a numeric conversion error or a bad escape code. If we use `Error` instead, 1391 | the parsers could return the wrong error type. The reason is that the nom 1392 | `alt` parser would keep trying other parsers, and if all of them fail, there's 1393 | no way for `alt` to know which error is the right one— it usually just returns 1394 | the last error. 1395 | 1396 | ## Thanks for reading! 1397 | 1398 | This ended up being a lot longer than I originally planned, and along the way I 1399 | discovered several things that I'd been doing wrong in my own parsers.

## Thanks for reading!

This ended up being a lot longer than I originally planned, and along the way I
discovered several things that I'd been doing wrong in my own parsers. There
are probably a few things that I've still missed; if you notice something, feel
free to open an issue at this page's
[GitHub repo](https://github.com/ericseppanen/json-parser-toy), or get in touch
on [twitter: @codeandbitters](https://twitter.com/codeandbitters).

--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
use nom::{branch::alt, IResult};
use nom::bytes::complete::{tag, take_while1};
use nom::character::complete::{one_of, digit0, digit1, multispace0};
use nom::combinator::{all_consuming, map, opt, recognize, value};
use nom::error::{ErrorKind, ParseError};
use nom::multi::{many0, separated_list0};
use nom::sequence::{delimited, pair, separated_pair, tuple};
use escape8259::unescape;

#[derive(thiserror::Error, Debug, PartialEq)]
pub enum JSONParseError {
    #[error("bad integer")]
    BadInt,
    #[error("bad float")]
    BadFloat,
    #[error("bad escape sequence")]
    BadEscape,
    #[error("unknown parser error")]
    Unparseable,
}

impl<I> ParseError<I> for JSONParseError {
    fn from_error_kind(_input: I, _kind: ErrorKind) -> Self {
        // Because JSONParseError is a simplified public error type,
        // we discard the nom error parameters.
        JSONParseError::Unparseable
    }

    fn append(_: I, _: ErrorKind, other: Self) -> Self {
        other
    }
}

#[derive(PartialEq, Debug, Clone)]
pub enum Node {
    Null,
    Bool(bool),
    Integer(i64),
    Float(f64),
    Str(String),
    Array(Vec<Node>),
    Object(Vec<(String, Node)>),
}

pub fn parse_json(input: &str) -> Result<Node, JSONParseError> {
    let (_, result) = all_consuming(json_value)(input).map_err(|nom_err| {
        match nom_err {
            nom::Err::Incomplete(_) => unreachable!(),
            nom::Err::Error(e) => e,
            nom::Err::Failure(e) => e,
        }
    })?;
    Ok(result)
}

fn json_value(input: &str) -> IResult<&str, Node, JSONParseError> {
    spacey(alt((
        json_array,
        json_object,
        json_string,
        json_float,
        json_integer,
        json_bool,
        json_null
    )))
    (input)
}

fn spacey<F, I, O, E>(f: F) -> impl FnMut(I) -> IResult<I, O, E>
where
    F: FnMut(I) -> IResult<I, O, E>,
    I: nom::InputTakeAtPosition,
    <I as nom::InputTakeAtPosition>::Item: nom::AsChar + Clone,
    E: nom::error::ParseError<I>,
{
    delimited(multispace0, f, multispace0)
}

fn json_array(input: &str) -> IResult<&str, Node, JSONParseError> {
    let parser = delimited(
        spacey(tag("[")),
        separated_list0(spacey(tag(",")), json_value),
        spacey(tag("]")),
    );
    map(parser, |v| {
        Node::Array(v)
    })
    (input)
}

// "key: value", where key and value are any JSON type.
fn object_member(input: &str) -> IResult<&str, (String, Node), JSONParseError> {
    separated_pair(string_literal, spacey(tag(":")), json_value)
    (input)
}

fn json_object(input: &str) -> IResult<&str, Node, JSONParseError> {
    let parser = delimited(
        spacey(tag("{")),
        separated_list0(
            spacey(tag(",")),
            object_member
        ),
        spacey(tag("}")),
    );
    map(parser, |v| {
        Node::Object(v)
    })
    (input)
}

// A character that is:
// NOT a control character (0x00 - 0x1F)
// NOT a quote character (0x22)
// NOT a backslash character (0x5C)
// Is within the unicode range (< 0x10FFFF) (this is already guaranteed by Rust char)
fn is_nonescaped_string_char(c: char) -> bool {
    let cv = c as u32;
    (cv >= 0x20) && (cv != 0x22) && (cv != 0x5C)
}

// One or more unescaped text characters
fn nonescaped_string(input: &str) -> IResult<&str, &str, JSONParseError> {
    take_while1(is_nonescaped_string_char)
    (input)
}

// There are only two types of escape allowed by RFC 8259.
// - single-character escapes \" \\ \/ \b \f \n \r \t
// - general-purpose \uXXXX
// Note: we don't enforce that escape codes are valid here.
// There must be a decoder later on.
fn escape_code(input: &str) -> IResult<&str, &str, JSONParseError> {
    recognize(
        pair(
            tag("\\"),
            alt((
                tag("\""),
                tag("\\"),
                tag("/"),
                tag("b"),
                tag("f"),
                tag("n"),
                tag("r"),
                tag("t"),
                tag("u"),
            ))
        )
    )
    (input)
}

// Zero or more text characters
fn string_body(input: &str) -> IResult<&str, &str, JSONParseError> {
    recognize(
        many0(
            alt((
                nonescaped_string,
                escape_code
            ))
        )
    )
    (input)
}

fn string_literal(input: &str) -> IResult<&str, String, JSONParseError> {
    let (remain, raw_string) = delimited(
        tag("\""),
        string_body,
        tag("\"")
    )
    (input)?;

    match unescape(raw_string) {
        Ok(s) => Ok((remain, s)),
        Err(_) => Err(nom::Err::Failure(JSONParseError::BadEscape)),
    }
}

fn json_string(input: &str) -> IResult<&str, Node, JSONParseError> {
    map(string_literal, |s| {
        Node::Str(s)
    })
    (input)
}

// This can be done a few different ways:
//   one_of("123456789"),
//   anychar("0123456789"),
// we could also extract the character value as u32 and do range checks...

fn digit1to9(input: &str) -> IResult<&str, char, JSONParseError> {
    one_of("123456789")
    (input)
}

// unsigned_integer = zero / ( digit1-9 *DIGIT )
fn uint(input: &str) -> IResult<&str, &str, JSONParseError> {
    alt((
        tag("0"),
        recognize(
            pair(
                digit1to9,
                digit0
            )
        )
    ))
    (input)
}

fn integer_body(input: &str) -> IResult<&str, &str, JSONParseError> {
    recognize(
        pair(
            opt(tag("-")),
            uint
        )
    )
    (input)
}

fn json_integer(input: &str) -> IResult<&str, Node, JSONParseError> {
    let (remain, raw_int) = integer_body(input)?;
    match raw_int.parse::<i64>() {
        Ok(i) => Ok((remain, Node::Integer(i))),
        Err(_) => Err(nom::Err::Failure(JSONParseError::BadInt)),
    }
}

// number = [ minus ] int [ frac ] [ exp ]
//
// decimal-point = %x2E ; .
// digit1-9 = %x31-39 ; 1-9
// e = %x65 / %x45 ; e E
// exp = e [ minus / plus ] 1*DIGIT
// frac = decimal-point 1*DIGIT
// int = zero / ( digit1-9 *DIGIT )
// minus = %x2D ; -
// plus = %x2B ; +
// zero = %x30 ; 0

fn frac(input: &str) -> IResult<&str, &str, JSONParseError> {
    recognize(
        pair(
            tag("."),
            digit1
        )
    )
    (input)
}

fn exp(input: &str) -> IResult<&str, &str, JSONParseError> {
    recognize(
        tuple((
            tag("e"),
            opt(alt((
                tag("-"),
                tag("+")
            ))),
            digit1
        ))
    )
    (input)
}

fn float_body(input: &str) -> IResult<&str, &str, JSONParseError> {
    recognize(
        tuple((
            opt(tag("-")),
            uint,
            alt((
                recognize(pair(
                    frac,
                    opt(exp)
                )),
                exp
            )),
        ))
    )
    (input)
}

fn json_float(input: &str) -> IResult<&str, Node, JSONParseError> {
    let (remain, raw_float) = float_body(input)?;
    match raw_float.parse::<f64>() {
        Ok(f) => Ok((remain, Node::Float(f))),
        Err(_) => Err(nom::Err::Failure(JSONParseError::BadFloat)),
    }
}

fn json_bool(input: &str) -> IResult<&str, Node, JSONParseError> {
    alt((
        value(Node::Bool(false), tag("false")),
        value(Node::Bool(true), tag("true")),
    ))
    (input)
}

fn json_null(input: &str) -> IResult<&str, Node, JSONParseError> {
    value(Node::Null, tag("null"))
    (input)
}

#[test]
fn test_bool() {
    assert_eq!(json_bool("false"), Ok(("", Node::Bool(false))));
    assert_eq!(json_bool("true"), Ok(("", Node::Bool(true))));
    assert!(json_bool("foo").is_err());
}

#[test]
fn test_null() {
    assert_eq!(json_null("null"), Ok(("", Node::Null)));
}

#[test]
fn test_integer() {
    assert_eq!(json_integer("42"), Ok(("", Node::Integer(42))));
    assert_eq!(json_integer("-123"), Ok(("", Node::Integer(-123))));
    assert_eq!(json_integer("0"), Ok(("", Node::Integer(0))));
    assert_eq!(json_integer("01"), Ok(("1", Node::Integer(0))));
    assert_eq!(json_integer("9999999999999999999"), Err(nom::Err::Failure(JSONParseError::BadInt)));
}

#[test]
fn test_float() {
    assert_eq!(json_float("42.0"), Ok(("", Node::Float(42.0))));
    assert_eq!(json_float("-123.99"), Ok(("", Node::Float(-123.99))));
    assert_eq!(json_float("6.02214086e23"), Ok(("", Node::Float(6.02214086e23))));
    assert_eq!(json_float("-1e6"), Ok(("", Node::Float(-1000000.0))));
    assert_eq!(json_float("1.0e+3"), Ok(("", Node::Float(1000.0))));


    // f64::from_str overflows to infinity instead of throwing an error
    assert_eq!(json_float("1e9999"), Ok(("", Node::Float(f64::INFINITY))));

    // odd looking but still valid.
    assert_eq!(json_float("0e+42949672970"), Ok(("", Node::Float(0.0))));
    assert_eq!(json_float("9e00010"), Ok(("", Node::Float(90000000000.0))));
    assert_eq!(json_float("-0.0e-99999999999999999999999999"), Ok(("", Node::Float(-0.0))));
}

#[test]
fn test_string() {
    // Plain Unicode strings with no escaping
    assert_eq!(json_string(r#""""#), Ok(("", Node::Str("".into()))));
    assert_eq!(json_string(r#""Hello""#), Ok(("", Node::Str("Hello".into()))));
    assert_eq!(json_string(r#""の""#), Ok(("", Node::Str("の".into()))));
    assert_eq!(json_string(r#""𝄞""#), Ok(("", Node::Str("𝄞".into()))));

    // valid 2-character escapes
    assert_eq!(json_string(r#"" \\ ""#), Ok(("", Node::Str(" \\ ".into()))));
    assert_eq!(json_string(r#"" \" ""#), Ok(("", Node::Str(" \" ".into()))));

    // valid 6-character escapes
    assert_eq!(json_string(r#""\u0000""#), Ok(("", Node::Str("\x00".into()))));
    assert_eq!(json_string(r#""\u00DF""#), Ok(("", Node::Str("ß".into()))));
    assert_eq!(json_string(r#""\uD834\uDD1E""#), Ok(("", Node::Str("𝄞".into()))));

    // Invalid because surrogate characters must come in pairs
    assert!(json_string(r#""\ud800""#).is_err());
    // Unknown 2-character escape
    assert!(json_string(r#""\x""#).is_err());
    // Not enough hex digits
    assert!(json_string(r#""\u""#).is_err());
    assert!(json_string(r#""\u001""#).is_err());
    // Naked control character
    assert!(json_string(r#""\x0a""#).is_err());
    // Not a JSON string because it's not wrapped in quotes
    assert!(json_string("abc").is_err());
    // An unterminated string (because the trailing quote is escaped)
    assert!(json_string(r#""\""#).is_err());

    // Parses correctly but has escape errors due to incomplete surrogate pair.
    assert_eq!(json_string(r#""\ud800""#), Err(nom::Err::Failure(JSONParseError::BadEscape)));
}

#[test]
fn test_array() {
    assert_eq!(json_array("[ ]"), Ok(("", Node::Array(vec![]))));
    assert_eq!(json_array("[ 1 ]"), Ok(("", Node::Array(vec![Node::Integer(1)]))));

    let expected = Node::Array(vec![Node::Integer(1), Node::Str("x".into())]);
    assert_eq!(json_array(r#" [ 1 , "x" ] "#), Ok(("", expected)));
}

#[test]
fn test_object() {
    assert_eq!(json_object("{ }"), Ok(("", Node::Object(vec![]))));
    let expected = Node::Object(vec![("1".into(), Node::Integer(2))]);
    assert_eq!(json_object(r#" { "1" : 2 } "#), Ok(("", expected)));
}

#[test]
fn test_values() {
    assert_eq!(parse_json(" 56 "), Ok(Node::Integer(56)));
    assert_eq!(parse_json(" 78.0 "), Ok(Node::Float(78.0)));
    assert_eq!(parse_json(r#" "Hello" "#), Ok(Node::Str("Hello".into())));
    // These two tests aren't relevant for JSON. They verify that `json_float`
    // will never mistake integers for floats in other grammars that might
    // allow a `.` or `e` character after a literal integer.
    assert_eq!(json_value("123else"), Ok(("else", Node::Integer(123))));
    assert_eq!(json_value("123.x"), Ok((".x", Node::Integer(123))));

    assert_eq!(parse_json("123else"), Err(JSONParseError::Unparseable));
    assert_eq!(parse_json("123.x"), Err(JSONParseError::Unparseable));
    assert_eq!(parse_json("[ 56, "), Err(JSONParseError::Unparseable));
    assert_eq!(parse_json(r#"{ "a": "b" "#), Err(JSONParseError::Unparseable));
    assert_eq!(parse_json(" 56 a"), Err(JSONParseError::Unparseable));

    assert_eq!(parse_json("9999999999999999999"), Err(JSONParseError::BadInt));
    assert_eq!(parse_json(r#""\ud800""#), Err(JSONParseError::BadEscape));
}

--------------------------------------------------------------------------------