├── .gitignore ├── Cargo.toml ├── LICENSE ├── README.md └── src ├── bin └── ojc.rs ├── internals.rs ├── lib.rs └── tests.rs /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | Cargo.lock 3 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "oxidized-json-checker" 3 | version = "0.3.2" 4 | description = "A pushdown automaton low memory JSON bytes stream checker" 5 | authors = ["Kerollmops "] 6 | edition = "2018" 7 | license-file = "LICENSE" 8 | repository = "https://github.com/Kerollmops/oxidized-json-checker" 9 | homepage = "https://github.com/Kerollmops/oxidized-json-checker" 10 | readme = "README.md" 11 | default-run = "ojc" 12 | 13 | [dependencies] 14 | packed_simd = { version = "0.3.3", optional = true } 15 | 16 | [dev-dependencies] 17 | snap = "1.0.0" # for the lib.rs example 18 | 19 | [features] 20 | default = [] 21 | nightly = ["packed_simd"] 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2005 JSON.org 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | The Software shall be used for Good, not Evil. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # oxidized-json-checker 2 | 3 | This is a pure Rust version of [the JSON_checker library](http://www.json.org/JSON_checker/). 4 | 5 | This is a Pushdown Automaton that very quickly determines if a JSON text is syntactically correct. It could be used to filter inputs to a system, or to verify that the outputs of a system are syntactically correct. 6 | 7 | You can use it with [the `std::io::Read` Rust trait](https://doc.rust-lang.org/std/io/trait.Read.html) to check whether a JSON document is valid without having to keep it in memory. 8 | 9 | ## Performances 10 | 11 | I ran some tests against `jq` to make sure the library was within reasonable bounds. 12 | I used a big JSON lines file (8.3GB) that I converted to JSON using `jq -cs '.'` 😜 13 | 14 | You can find those Wikipedia articles on [the benchmark repository of Paul Masurel's Tantivy](https://github.com/tantivy-search/search-benchmark-game#running). 15 | 16 | ### `jq type` 17 | 18 | How much time does `jq` take when it comes to checking and determining the type of a JSON document? 19 | Probably too much, and also a little bit of memory: 12GB! 20 | 21 | ```bash 22 | $ time cat ../wiki-articles.json | jq type 23 | "array" 24 | 25 | real 1m55.064s 26 | user 1m37.335s 27 | sys 0m21.935s 28 | ``` 29 | 30 | ### `ojc` 31 | 32 | How much time does it take `ojc`?
Just a little bit less! It also consumes 0kb of memory. 33 | 34 | ```bash 35 | $ time cat ../wiki-articles.json | ojc 36 | Array 37 | 38 | real 0m56.780s 39 | user 0m47.487s 40 | sys 0m12.628s 41 | ``` 42 | 43 | ### `ojc` with SIMD 44 | 45 | How many times does it takes to `ojc` already? 56s, that can't be true, we are in 2020... 46 | What about enabling some SIMD optimizations? Compile the binary with the `nightly` feature and here we go! 47 | 48 | ```bash 49 | $ cargo build --release --features nightly 50 | $ time cat ../wiki-articles.json | ojc 51 | Array 52 | 53 | real 0m15.818s 54 | user 0m10.892s 55 | sys 0m10.721s 56 | ``` 57 | -------------------------------------------------------------------------------- /src/bin/ojc.rs: -------------------------------------------------------------------------------- 1 | use std::io; 2 | use oxidized_json_checker::JsonChecker; 3 | 4 | fn fmain() -> io::Result<()> { 5 | let stdin = io::stdin(); 6 | let mut checker = JsonChecker::new(stdin.lock()); 7 | io::copy(&mut checker, &mut io::sink())?; 8 | let outer_type = checker.finish()?; 9 | println!("{:?}", outer_type); 10 | Ok(()) 11 | } 12 | 13 | fn main() { 14 | if let Err(e) = fmain() { 15 | eprintln!("{}", e); 16 | std::process::exit(1); 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/internals.rs: -------------------------------------------------------------------------------- 1 | use self::Class::*; 2 | use self::State::*; 3 | 4 | const ___: Class = Class::Invalid; 5 | const __: State = State::Invalid; 6 | 7 | #[derive(Debug, Copy, Clone, PartialEq, Eq)] 8 | pub enum Class { 9 | CSpace, // space 10 | CWhite, // other whitespace 11 | CLcurb, // { 12 | CRcurb, // } 13 | CLsqrb, // [ 14 | CRsqrb, // ] 15 | CColon, // : 16 | CComma, // , 17 | CQuote, // " 18 | CBacks, // \ 19 | CSlash, // / 20 | CPlus, // + 21 | CMinus, // - 22 | CPoint, // . 
23 | CZero , // 0 24 | CDigit, // 123456789 25 | CLowA, // a 26 | CLowB, // b 27 | CLowC, // c 28 | CLowD, // d 29 | CLowE, // e 30 | CLowF, // f 31 | CLowL, // l 32 | CLowN, // n 33 | CLowR, // r 34 | CLowS, // s 35 | CLowT, // t 36 | CLowU, // u 37 | CAbcdf, // ABCDF 38 | CE, // E 39 | CEtc, // everything else 40 | Invalid, 41 | } 42 | 43 | /// This array maps the 128 ASCII characters into character classes. 44 | /// The remaining Unicode characters should be mapped to C_ETC. 45 | /// Non-whitespace control characters are errors. 46 | pub const ASCII_CLASS: [Class; 128] = [ 47 | ___, ___, ___, ___, ___, ___, ___, ___, 48 | ___, CWhite, CWhite, ___, ___, CWhite, ___, ___, 49 | ___, ___, ___, ___, ___, ___, ___, ___, 50 | ___, ___, ___, ___, ___, ___, ___, ___, 51 | 52 | CSpace, CEtc, CQuote, CEtc, CEtc, CEtc, CEtc, CEtc, 53 | CEtc, CEtc, CEtc, CPlus, CComma, CMinus, CPoint, CSlash, 54 | CZero, CDigit, CDigit, CDigit, CDigit, CDigit, CDigit, CDigit, 55 | CDigit, CDigit, CColon, CEtc, CEtc, CEtc, CEtc, CEtc, 56 | 57 | CEtc, CAbcdf, CAbcdf, CAbcdf, CAbcdf, CE, CAbcdf, CEtc, 58 | CEtc, CEtc, CEtc, CEtc, CEtc, CEtc, CEtc, CEtc, 59 | CEtc, CEtc, CEtc, CEtc, CEtc, CEtc, CEtc, CEtc, 60 | CEtc, CEtc, CEtc, CLsqrb, CBacks, CRsqrb, CEtc, CEtc, 61 | 62 | CEtc, CLowA, CLowB, CLowC, CLowD, CLowE, CLowF, CEtc, 63 | CEtc, CEtc, CEtc, CEtc, CLowL, CEtc, CLowN, CEtc, 64 | CEtc, CEtc, CLowR, CLowS, CLowT, CLowU, CEtc, CEtc, 65 | CEtc, CEtc, CEtc, CLcurb, CEtc, CRcurb, CEtc, CEtc 66 | ]; 67 | 68 | /// The state codes. 
69 | #[derive(Debug, Copy, Clone, PartialEq, Eq)] 70 | pub enum State { 71 | Go, // start 72 | Ok, // ok 73 | Ob, // object 74 | Ke, // key 75 | Co, // colon 76 | Va, // value 77 | Ar, // array 78 | St, // string 79 | Es, // escape 80 | U1, // u1 81 | U2, // u2 82 | U3, // u3 83 | U4, // u4 84 | Mi, // minus 85 | Ze, // zero 86 | In, // integer 87 | Fr, // fraction 88 | Fs, // fraction 89 | E1, // e 90 | E2, // ex 91 | E3, // exp 92 | T1, // tr 93 | T2, // tru 94 | T3, // true 95 | F1, // fa 96 | F2, // fal 97 | F3, // fals 98 | F4, // false 99 | N1, // nu 100 | N2, // nul 101 | N3, // null 102 | Wcl, // Wrong Colon : (-2) 103 | Wcm, // Wrong Comma , (-3) 104 | Wq, // Wrong Quote " (-4) 105 | Wos, // Wrong Opening Squared [ (-5) 106 | Woc, // Wrong Opening Curly { (-6) 107 | Ws, // Wrong Squared ] (-7) 108 | Wcu, // Wrong Curly } (-8) 109 | Wec, // Wrong Empty curly } (-9) 110 | Invalid, 111 | } 112 | 113 | // Number of states by number of classes 114 | pub const STATE_TRANSITION_TABLE: [[State; 31]; 31] = [ 115 | /* 116 | The state transition table takes the current state and the current symbol, 117 | and returns either a new state or an action. An action is represented as a 118 | negative number. A JSON text is accepted if at the end of the text the 119 | state is OK and if the mode is MODE_DONE. 120 | 121 | white 1-9 ABCDF etc 122 | space | { } [ ] : , " \ / + - . 
0 | a b c d e f l n r s t u | E | */ 123 | /*start GO*/ [Go, Go,Woc, __,Wos, __, __, __, Wq, __, __, __, Mi, __, In, In, __, __, __, __, __, F1, __, N1, __, __, T1, __, __, __, __], 124 | /*ok OK*/ [Ok, Ok, __,Wcu, __, Ws, __, Wcm,__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __], 125 | /*object OB*/ [Ob, Ob, __,Wec, __, __, __, __, St, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __], 126 | /*key KE*/ [Ke, Ke, __, __, __, __, __, __, St, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __], 127 | /*colon CO*/ [Co, Co, __, __, __, __,Wcl, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __], 128 | /*value VA*/ [Va, Va,Woc, __,Wos, __, __, __, St, __, __, __, Mi, __, Ze, In, __, __, __, __, __, F1, __, N1, __, __, T1, __, __, __, __], 129 | /*array AR*/ [Ar, Ar,Woc, __,Wos, Ws, __, __, St, __, __, __, Mi, __, Ze, In, __, __, __, __, __, F1, __, N1, __, __, T1, __, __, __, __], 130 | /*string ST*/ [St, __, St, St, St, St, St, St, Wq, Es, St, St, St, St, St, St, St, St, St, St, St, St, St, St, St, St, St, St, St, St, St], 131 | /*escape ES*/ [__, __, __, __, __, __, __, __, St, St, St, __, __, __, __, __, __, St, __, __, __, St, __, St, St, __, St, U1, __, __, __], 132 | /*u1 U1*/ [__, __, __, __, __, __, __, __, __, __, __, __, __, __, U2, U2, U2, U2, U2, U2, U2, U2, __, __, __, __, __, __, U2, U2, __], 133 | /*u2 U2*/ [__, __, __, __, __, __, __, __, __, __, __, __, __, __, U3, U3, U3, U3, U3, U3, U3, U3, __, __, __, __, __, __, U3, U3, __], 134 | /*u3 U3*/ [__, __, __, __, __, __, __, __, __, __, __, __, __, __, U4, U4, U4, U4, U4, U4, U4, U4, __, __, __, __, __, __, U4, U4, __], 135 | /*u4 U4*/ [__, __, __, __, __, __, __, __, __, __, __, __, __, __, St, St, St, St, St, St, St, St, __, __, __, __, __, __, St, St, __], 136 | /*minus MI*/ [__, __, __, __, __, __, __, __, __, __, __, __, __, __, Ze, In, __, __, 
__, __, __, __, __, __, __, __, __, __, __, __, __], 137 | /*zero ZE*/ [Ok, Ok, __,Wcu, __, Ws, __,Wcm, __, __, __, __, __, Fr, __, __, __, __, __, __, E1, __, __, __, __, __, __, __, __, E1, __], 138 | /*int IN*/ [Ok, Ok, __,Wcu, __, Ws, __,Wcm, __, __, __, __, __, Fr, In, In, __, __, __, __, E1, __, __, __, __, __, __, __, __, E1, __], 139 | /*frac FR*/ [__, __, __, __, __, __, __, __, __, __, __, __, __, __, Fs, Fs, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __], 140 | /*fracs FS*/ [Ok, Ok, __,Wcu, __, Ws, __,Wcm, __, __, __, __, __, __, Fs, Fs, __, __, __, __, E1, __, __, __, __, __, __, __, __, E1, __], 141 | /*e E1*/ [__, __, __, __, __, __, __, __, __, __, __, E2, E2, __, E3, E3, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __], 142 | /*ex E2*/ [__, __, __, __, __, __, __, __, __, __, __, __, __, __, E3, E3, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __], 143 | /*exp E3*/ [Ok, Ok, __,Wcu, __, Ws, __,Wcm, __, __, __, __, __, __, E3, E3, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __], 144 | /*tr T1*/ [__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, T2, __, __, __, __, __, __], 145 | /*tru T2*/ [__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, T3, __, __, __], 146 | /*true T3*/ [__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, Ok, __, __, __, __, __, __, __, __, __, __], 147 | /*fa F1*/ [__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, F2, __, __, __, __, __, __, __, __, __, __, __, __, __, __], 148 | /*fal F2*/ [__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, F3, __, __, __, __, __, __, __, __], 149 | /*fals F3*/ [__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, F4, __, __, __, __, __], 150 | /*false F4*/ [__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, 
__, __, __, __, Ok, __, __, __, __, __, __, __, __, __, __], 151 | /*nu N1*/ [__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, N2, __, __, __], 152 | /*nul N2*/ [__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, N3, __, __, __, __, __, __, __, __], 153 | /*null N3*/ [__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, Ok, __, __, __, __, __, __, __, __], 154 | ]; 155 | 156 | /// These modes can be pushed on the stack. 157 | #[derive(Debug, Copy, Clone, PartialEq, Eq)] 158 | pub enum Mode { 159 | Array, 160 | Done, 161 | Key, 162 | Object, 163 | String, 164 | } 165 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | //! `oxidized-json-checker` is a library that provides JSON validation without 2 | //! keeping the stream of bytes in memory, it streams the bytes and validate it 3 | //! on the fly using a pushdown automaton. 4 | //! 5 | //! The original library has been retrieved from [json.org](http://www.json.org/JSON_checker/) 6 | //! and improved to accept every valid JSON element has a valid JSOn document. 7 | //! 8 | //! Therefore this library accepts a single string or single integer as a valid JSON document, 9 | //! this way we follow the [`serde_json`](https://docs.rs/serde_json) rules. 10 | //! 11 | //! # Example: validate some bytes 12 | //! 13 | //! This example shows how you can give the library a simple slice 14 | //! of bytes and validate that it is a valid JSON document. 15 | //! 16 | //! ``` 17 | //! # fn fmain() -> Result<(), Box> { 18 | //! let text = r#"["I", "am", "a", "valid", "JSON", "array"]"#; 19 | //! let bytes = text.as_bytes(); 20 | //! 21 | //! oxidized_json_checker::validate(bytes)?; 22 | //! # Ok(()) } 23 | //! # fmain().unwrap() 24 | //! ``` 25 | //! 26 | //! 
# Example: validate a stream of bytes 27 | //! 28 | //! This example shows that you can pass any type that implements `io::Read` 29 | //! to the `JsonChecker` and validate that it is valid JSON. 30 | //! 31 | //! ``` 32 | //! # const json_bytes: &[u8] = b"null"; 33 | //! # fn streaming_from_the_web() -> std::io::Result<&'static [u8]> { 34 | //! # Ok(json_bytes) 35 | //! # } 36 | //! # fn fmain() -> Result<(), Box<dyn std::error::Error>> { 37 | //! let stream = streaming_from_the_web()?; 38 | //! 39 | //! oxidized_json_checker::validate(stream)?; 40 | //! # Ok(()) } 41 | //! # fmain().unwrap() 42 | //! ``` 43 | //! 44 | //! # Example: complex compositions 45 | //! 46 | //! This example shows how you can use the `JsonChecker` type to check 47 | //! a compressed stream of bytes. 48 | //! 49 | //! You can decompress the stream, check it using the `JsonChecker`, and compress it 50 | //! again to pipe it elsewhere. All of that without much memory impact. 51 | //! 52 | //! ```no_run 53 | //! # fn fmain() -> Result<(), Box<dyn std::error::Error>> { 54 | //! use std::io; 55 | //! use oxidized_json_checker::JsonChecker; 56 | //! 57 | //! let stdin = io::stdin(); 58 | //! let stdout = io::stdout(); 59 | //! 60 | //! // Wrap the stdin reader in a Snappy reader 61 | //! // then wrap it in a JsonChecker reader. 62 | //! let rdr = snap::read::FrameDecoder::new(stdin.lock()); 63 | //! let mut rdr = JsonChecker::new(rdr); 64 | //! 65 | //! // Wrap the stdout writer in a Snappy writer. 66 | //! let mut wtr = snap::write::FrameEncoder::new(stdout.lock()); 67 | //! 68 | //! // The copy function will return any io error thrown by any of the readers, 69 | //! // the JsonChecker throws errors when invalid JSON is encountered. 70 | //! io::copy(&mut rdr, &mut wtr)?; 71 | //! 72 | //! // We must check that the final bytes were valid. 73 | //! rdr.finish()?; 74 | //! # Ok(()) } 75 | //! # fmain().unwrap() 76 | //! ``` 77 | //!
78 | 79 | use std::{fmt, io}; 80 | use crate::internals::{State, Class, Mode}; 81 | use crate::internals::{STATE_TRANSITION_TABLE, ASCII_CLASS}; 82 | 83 | #[cfg(test)] 84 | mod tests; 85 | mod internals; 86 | 87 | /// The error type returned by the `JsonChecker` type. 88 | #[derive(Copy, Clone, Debug)] 89 | pub enum Error { 90 | InvalidCharacter, 91 | EmptyCurlyBraces, 92 | OrphanCurlyBrace, 93 | OrphanSquareBrace, 94 | MaxDepthReached, 95 | InvalidQuote, 96 | InvalidComma, 97 | InvalidColon, 98 | InvalidState, 99 | IncompleteElement, 100 | } 101 | 102 | impl From for io::Error { 103 | fn from(err: Error) -> io::Error { 104 | io::Error::new(io::ErrorKind::Other, err) 105 | } 106 | } 107 | 108 | impl std::error::Error for Error {} 109 | 110 | impl fmt::Display for Error { 111 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 112 | match self { 113 | Error::InvalidCharacter => f.write_str("invalid character"), 114 | Error::EmptyCurlyBraces => f.write_str("empty curly braces"), 115 | Error::OrphanCurlyBrace => f.write_str("orphan curly brace"), 116 | Error::OrphanSquareBrace => f.write_str("orphan square brace"), 117 | Error::MaxDepthReached => f.write_str("max depth reached"), 118 | Error::InvalidQuote => f.write_str("invalid quote"), 119 | Error::InvalidComma => f.write_str("invalid comma"), 120 | Error::InvalidColon => f.write_str("invalid colon"), 121 | Error::InvalidState => f.write_str("invalid state"), 122 | Error::IncompleteElement => f.write_str("incomplete element"), 123 | } 124 | } 125 | } 126 | 127 | /// Represents any valid JSON type. 128 | #[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)] 129 | pub enum JsonType { 130 | Null, 131 | Bool, 132 | Number, 133 | String, 134 | Array, 135 | Object, 136 | } 137 | 138 | /// A convenient method to check and consume JSON from a stream of bytes. 
139 | /// 140 | /// # Example 141 | /// 142 | /// ``` 143 | /// # fn fmain() -> Result<(), Box> { 144 | /// use oxidized_json_checker::{validate, JsonType}; 145 | /// let text = r#""I am a simple string!""#; 146 | /// let bytes = text.as_bytes(); 147 | /// 148 | /// let json_type = validate(bytes)?; 149 | /// assert_eq!(json_type, JsonType::String); 150 | /// # Ok(()) } 151 | /// # fmain().unwrap() 152 | /// ``` 153 | pub fn validate(reader: R) -> io::Result { 154 | let mut checker = JsonChecker::new(reader); 155 | io::copy(&mut checker, &mut io::sink())?; 156 | let outer_type = checker.finish()?; 157 | Ok(outer_type) 158 | } 159 | 160 | /// A convenient method to check and consume JSON from an `str`. 161 | pub fn validate_str(string: &str) -> Result { 162 | validate_bytes(string.as_bytes()) 163 | } 164 | 165 | /// A convenient method to check and consume JSON from a bytes slice. 166 | pub fn validate_bytes(bytes: &[u8]) -> Result { 167 | let mut checker = JsonChecker::new(()); 168 | checker.next_bytes(bytes)?; 169 | checker.finish() 170 | } 171 | 172 | /// The `JsonChecker` is a `io::Read` adapter, it can be used like a pipe, 173 | /// reading bytes, checkings those and output the same bytes. 174 | /// 175 | /// If an error is encountered, a JSON syntax error or an `io::Error` 176 | /// it is returned by the `io::Read::read` method. 177 | /// 178 | /// # Safety 179 | /// 180 | /// An error encountered while reading bytes will invalidate the checker. 
181 | /// 182 | /// # Example: read from a slice 183 | /// 184 | /// ``` 185 | /// # fn fmain() -> Result<(), Box> { 186 | /// use std::io; 187 | /// use oxidized_json_checker::JsonChecker; 188 | /// 189 | /// let text = r#"{"I am": "an object"}"#; 190 | /// let bytes = text.as_bytes(); 191 | /// 192 | /// let mut checker = JsonChecker::new(bytes); 193 | /// io::copy(&mut checker, &mut io::sink())?; 194 | /// checker.finish()?; 195 | /// # Ok(()) } 196 | /// # fmain().unwrap() 197 | /// ``` 198 | pub struct JsonChecker { 199 | state: State, 200 | error: Option, 201 | outer_type: Option, 202 | max_depth: usize, 203 | stack: Vec, 204 | reader: R, 205 | } 206 | 207 | impl fmt::Debug for JsonChecker { 208 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 209 | f.debug_struct("JsonChecker").finish() 210 | } 211 | } 212 | 213 | impl JsonChecker { 214 | /// Construct a `JsonChecker. To continue the process, write to the `JsonChecker` 215 | /// like a sink, and then call `JsonChecker::finish` to obtain the final result. 216 | pub fn new(reader: R) -> JsonChecker { 217 | JsonChecker::with_max_depth(reader, usize::max_value()) 218 | } 219 | 220 | /// Construct a `JsonChecker` and restrict the level of maximum nesting. 221 | /// 222 | /// For more information read the `JsonChecker::new` documentation. 223 | pub fn with_max_depth(reader: R, max_depth: usize) -> JsonChecker { 224 | JsonChecker { 225 | state: State::Go, 226 | error: None, 227 | outer_type: None, 228 | max_depth, 229 | stack: vec![Mode::Done], 230 | reader, 231 | } 232 | } 233 | 234 | #[inline] 235 | #[cfg(feature = "nightly")] 236 | fn next_bytes(&mut self, bytes: &[u8]) -> Result<(), Error> { 237 | use packed_simd::u8x8; 238 | 239 | // TODO use chunks_exact instead? 240 | // By using u8x8 instead of u8x16 we lost 2s on 16s but 241 | // we are less prone to find state change requirements. 
242 | for chunk in bytes.chunks(u8x8::lanes()) { 243 | if chunk.len() == u8x8::lanes() && self.state == State::St { 244 | // Load the bytes into a SIMD type 245 | let bytes = u8x8::from_slice_unaligned(chunk); 246 | 247 | // According to the state STATE_TRANSITION_TABLE we are in the `St` state 248 | // and *none of those bytes* are in the `CWhite`, `CQuote` or `CBacks` ascci class 249 | // we can avoid processing them at all because they will not change the current state. 250 | 251 | let cquotes = u8x8::splat(b'"'); 252 | let cbacks = u8x8::splat(b'\\'); 253 | 254 | let cwhites1 = u8x8::splat(b'\t'); 255 | let cwhites2 = u8x8::splat(b'\n'); 256 | let cwhites3 = u8x8::splat(b'\r'); 257 | 258 | // We first compare with quotes because this is the most 259 | // common character we can encounter in valid JSON strings 260 | // and this way we are able to skip other comparisons faster 261 | if bytes.eq(cquotes).any() || 262 | bytes.eq(cbacks).any() || 263 | bytes.eq(cwhites1).any() || 264 | bytes.eq(cwhites2).any() || 265 | bytes.eq(cwhites3).any() 266 | { 267 | chunk.iter().try_for_each(|b| self.next_byte(*b))?; 268 | } 269 | 270 | // Now that we checked that these bytes will not change 271 | // the state we can continue to the next chunk and ignore them 272 | 273 | } else { 274 | chunk.iter().try_for_each(|b| self.next_byte(*b))?; 275 | } 276 | } 277 | 278 | Ok(()) 279 | } 280 | 281 | #[inline] 282 | #[cfg(not(feature = "nightly"))] 283 | fn next_bytes(&mut self, bytes: &[u8]) -> Result<(), Error> { 284 | bytes.iter().try_for_each(|b| self.next_byte(*b)) 285 | } 286 | 287 | #[inline] 288 | fn next_byte(&mut self, next_byte: u8) -> Result<(), Error> { 289 | if let Some(error) = self.error { 290 | return Err(error); 291 | } 292 | 293 | // We can potentially use try_blocks in the future. 294 | fn internal_next_byte(jc: &mut JsonChecker, next_byte: u8) -> Result<(), Error> { 295 | // Determine the character's class. 
296 | let next_class = if next_byte >= 128 { 297 | Class::CEtc 298 | } else { 299 | ASCII_CLASS[next_byte as usize] 300 | }; 301 | 302 | if next_class == Class::Invalid { 303 | return Err(Error::InvalidCharacter); 304 | } 305 | 306 | // Get the next state from the state transition table and 307 | // perform one of the actions. 308 | let next_state = STATE_TRANSITION_TABLE[jc.state as usize][next_class as usize]; 309 | 310 | // Save the type we met if not already saved. 311 | if jc.outer_type.is_none() { 312 | match next_state { 313 | State::N1 => jc.outer_type = Some(JsonType::Null), 314 | State::T1 | State::F1 => jc.outer_type = Some(JsonType::Bool), 315 | State::In => jc.outer_type = Some(JsonType::Number), 316 | State::Wq => jc.outer_type = Some(JsonType::String), 317 | State::Wos => jc.outer_type = Some(JsonType::Array), 318 | State::Woc => jc.outer_type = Some(JsonType::Object), 319 | _ => (), 320 | } 321 | } 322 | 323 | match next_state { 324 | State::Wec => { // Empty } 325 | if !jc.pop(Mode::Key) { 326 | return Err(Error::EmptyCurlyBraces); 327 | } 328 | jc.state = State::Ok; 329 | }, 330 | State::Wcu => { // } 331 | if !jc.pop(Mode::Object) { 332 | return Err(Error::OrphanCurlyBrace); 333 | } 334 | jc.state = State::Ok; 335 | }, 336 | State::Ws => { // ] 337 | if !jc.pop(Mode::Array) { 338 | return Err(Error::OrphanSquareBrace); 339 | } 340 | jc.state = State::Ok; 341 | }, 342 | State::Woc => { // { 343 | if !jc.push(Mode::Key) { 344 | return Err(Error::MaxDepthReached); 345 | } 346 | jc.state = State::Ob; 347 | }, 348 | State::Wos => { // [ 349 | if !jc.push(Mode::Array) { 350 | return Err(Error::MaxDepthReached); 351 | } 352 | jc.state = State::Ar; 353 | } 354 | State::Wq => { // " 355 | match jc.stack.last() { 356 | Some(Mode::Done) => { 357 | if !jc.push(Mode::String) { 358 | return Err(Error::MaxDepthReached); 359 | } 360 | jc.state = State::St; 361 | }, 362 | Some(Mode::String) => { 363 | jc.pop(Mode::String); 364 | jc.state = State::Ok; 365 | }, 366 
| Some(Mode::Key) => jc.state = State::Co, 367 | Some(Mode::Array) | 368 | Some(Mode::Object) => jc.state = State::Ok, 369 | _ => return Err(Error::InvalidQuote), 370 | } 371 | }, 372 | State::Wcm => { // , 373 | match jc.stack.last() { 374 | Some(Mode::Object) => { 375 | // A comma causes a flip from object mode to key mode. 376 | if !jc.pop(Mode::Object) || !jc.push(Mode::Key) { 377 | return Err(Error::InvalidComma); 378 | } 379 | jc.state = State::Ke; 380 | } 381 | Some(Mode::Array) => jc.state = State::Va, 382 | _ => return Err(Error::InvalidComma), 383 | } 384 | }, 385 | State::Wcl => { // : 386 | // A colon causes a flip from key mode to object mode. 387 | if !jc.pop(Mode::Key) || !jc.push(Mode::Object) { 388 | return Err(Error::InvalidColon); 389 | } 390 | jc.state = State::Va; 391 | }, 392 | State::Invalid => { 393 | return Err(Error::InvalidState) 394 | }, 395 | 396 | // Or change the state. 397 | state => jc.state = state, 398 | } 399 | 400 | Ok(()) 401 | } 402 | 403 | // By catching returned errors when this `JsonChecker` is used we *fuse* 404 | // the checker and ensure the user don't use a checker in an invalid state. 405 | if let Err(error) = internal_next_byte(self, next_byte) { 406 | self.error = Some(error); 407 | return Err(error); 408 | } 409 | 410 | Ok(()) 411 | } 412 | 413 | /// The `JsonChecker::finish` method must be called after all of the characters 414 | /// have been processed. 415 | /// 416 | /// This function consumes the `JsonChecker` and returns `Ok(JsonType)` if the 417 | /// JSON text was accepted and the JSON type guessed. 418 | pub fn finish(self) -> Result { 419 | self.into_inner().map(|(_, t)| t) 420 | } 421 | 422 | /// The `JsonChecker::into_inner` does the same as the `JsonChecker::finish` 423 | /// method but returns the internal reader along with the JSON type guessed. 
424 | pub fn into_inner(mut self) -> Result<(R, JsonType), Error> { 425 | let is_state_valid = match self.state { 426 | State::Ok | State::In | State::Fr | State::Fs | State::E3 => true, 427 | _ => false, 428 | }; 429 | 430 | if is_state_valid && self.pop(Mode::Done) { 431 | let outer_type = self.outer_type.expect("BUG: the outer type must have been guessed"); 432 | return Ok((self.reader, outer_type)) 433 | } 434 | 435 | // We do not need to catch this error to *fuse* the checker because this method 436 | // consumes the checker, it cannot be reused after an error has been thrown. 437 | Err(Error::IncompleteElement) 438 | } 439 | 440 | /// Push a mode onto the stack. Returns false if max depth is reached. 441 | fn push(&mut self, mode: Mode) -> bool { 442 | if self.stack.len() + 1 >= self.max_depth { 443 | return false; 444 | } 445 | self.stack.push(mode); 446 | return true; 447 | } 448 | 449 | /// Pop the stack, assuring that the current mode matches the expectation. 450 | /// Return false if the stack is empty or if the modes mismatch. 451 | fn pop(&mut self, mode: Mode) -> bool { 452 | self.stack.pop() == Some(mode) 453 | } 454 | } 455 | 456 | impl io::Read for JsonChecker { 457 | fn read(&mut self, buf: &mut [u8]) -> io::Result { 458 | // If an error have already been encountered we return it, 459 | // this *fuses* the JsonChecker. 460 | if let Some(error) = self.error { 461 | return Err(error.into()); 462 | } 463 | 464 | let len = match self.reader.read(buf) { 465 | Err(error) => { 466 | // We do not store the io::Error in the JsonChecker Error 467 | // type instead we use the IncompleteElement error. 
468 | self.error = Some(Error::IncompleteElement); 469 | return Err(error); 470 | }, 471 | Ok(len) => len, 472 | }; 473 | 474 | self.next_bytes(&buf[..len])?; 475 | 476 | Ok(len) 477 | } 478 | } 479 | -------------------------------------------------------------------------------- /src/tests.rs: -------------------------------------------------------------------------------- 1 | use std::io::Read; 2 | use crate::*; 3 | 4 | fn parse(text: &str) -> io::Result { 5 | let mut string = String::new(); 6 | let mut checker = JsonChecker::new(text.as_bytes()); 7 | checker.read_to_string(&mut string)?; 8 | let outer_type = checker.finish()?; 9 | Ok(outer_type) 10 | } 11 | 12 | #[test] 13 | fn it_works() { 14 | let json = r#"{"hello": "girls 😜 ❤️"}"#; 15 | 16 | let mut string = String::new(); 17 | let mut checker = JsonChecker::new(json.as_bytes()); 18 | checker.read_to_string(&mut string).unwrap(); 19 | checker.finish().unwrap(); 20 | 21 | assert_eq!(&*string, json); 22 | } 23 | 24 | #[test] 25 | fn it_does_not_work() { 26 | let json = r#"{"hello": "boys}"#; // missing quote 27 | 28 | let mut string = String::new(); 29 | let mut checker = JsonChecker::new(json.as_bytes()); 30 | checker.read_to_string(&mut string).unwrap(); 31 | 32 | // This should fail 33 | checker.finish().unwrap_err(); 34 | 35 | assert_eq!(&*string, json); 36 | } 37 | 38 | #[test] 39 | fn unclosed_array() { 40 | assert!(parse(r#"["Unclosed array""#).is_err()); 41 | } 42 | 43 | #[test] 44 | fn unquoted_key() { 45 | assert!(parse(r#"{unquoted_key: "keys must be quoted"}"#).is_err()); 46 | } 47 | 48 | #[test] 49 | fn extra_comma_arr() { 50 | assert!(parse(r#"["extra comma",]"#).is_err()); 51 | } 52 | 53 | #[test] 54 | fn double_extra_comma() { 55 | assert!(parse(r#"["double extra comma",,]"#).is_err()); 56 | } 57 | 58 | #[test] 59 | fn missing_value() { 60 | assert!(parse(r#"[ , "<-- missing value"]"#).is_err()); 61 | } 62 | 63 | #[test] 64 | fn comma_after_close() { 65 | assert!(parse(r#"["Comma after the 
close"],"#).is_err()); 66 | } 67 | 68 | #[test] 69 | fn extra_close() { 70 | assert!(parse(r#"["Extra close"]]"#).is_err()); 71 | } 72 | 73 | #[test] 74 | fn extra_comma_obj() { 75 | assert!(parse(r#"{"Extra comma": true,}"#).is_err()); 76 | } 77 | 78 | #[test] 79 | fn extra_value_after_close() { 80 | assert!(parse(r#"{"Extra value after close": true} "misplaced quoted value""#).is_err()); 81 | } 82 | 83 | #[test] 84 | fn illegal_expression() { 85 | assert!(parse(r#"{"Illegal expression": 1 + 2}"#).is_err()); 86 | } 87 | 88 | #[test] 89 | fn illegal_invocation() { 90 | assert!(parse(r#"{"Illegal invocation": alert()}"#).is_err()); 91 | } 92 | 93 | #[test] 94 | fn numbers_cannot_have_leading_zeroes() { 95 | assert!(parse(r#"{"Numbers cannot have leading zeroes": 013}"#).is_err()); 96 | } 97 | 98 | #[test] 99 | fn numbers_cannot_be_hex() { 100 | assert!(parse(r#"{"Numbers cannot be hex": 0x14}"#).is_err()); 101 | } 102 | 103 | #[test] 104 | fn illegal_backslash_escape() { 105 | assert!(parse(r#"["Illegal backslash escape: \x15"]"#).is_err()); 106 | } 107 | 108 | #[test] 109 | fn naked() { 110 | assert!(parse(r#"[\naked]"#).is_err()); 111 | } 112 | 113 | #[test] 114 | fn illegal_backslash_escape_2() { 115 | assert!(parse(r#"["Illegal backslash escape: \017"]"#).is_err()); 116 | } 117 | 118 | #[test] 119 | fn missing_colon() { 120 | assert!(parse(r#"{"Missing colon" null}"#).is_err()); 121 | } 122 | 123 | #[test] 124 | fn double_colon() { 125 | assert!(parse(r#"{"Double colon":: null}"#).is_err()); 126 | } 127 | 128 | #[test] 129 | fn comma_instead_of_colon() { 130 | assert!(parse(r#"{"Comma instead of colon", null}"#).is_err()); 131 | } 132 | 133 | #[test] 134 | fn colon_instead_of_comma() { 135 | assert!(parse(r#"["Colon instead of comma": false]"#).is_err()); 136 | } 137 | 138 | #[test] 139 | fn bad_value() { 140 | assert!(parse(r#"["Bad value", truth]"#).is_err()); 141 | } 142 | 143 | #[test] 144 | fn single_quote() { 145 | assert!(parse(r#"['single 
quote']"#).is_err()); 146 | } 147 | 148 | #[test] 149 | fn tab_character_in_string() { 150 | assert!(parse("[\"\ttab\tcharacter\tin\tstring\t\"]").is_err()); 151 | } 152 | 153 | #[test] 154 | fn tab_character_in_string_esc() { 155 | assert!(parse("[\"tab\\\tcharacter\\\tin\\\tstring\\\t\"]").is_err()); 156 | } 157 | 158 | #[test] 159 | fn line_break() { 160 | assert!(parse("[\"line\nbreak\"]").is_err()); 161 | } 162 | 163 | #[test] 164 | fn line_break_escaped() { 165 | assert!(parse("[\"line\\\nbreak\"]").is_err()); 166 | } 167 | 168 | #[test] 169 | fn no_exponent() { 170 | assert!(parse(r#"[0e]"#).is_err()); 171 | } 172 | 173 | #[test] 174 | fn no_exponent_plus() { 175 | assert!(parse(r#"[0e+]"#).is_err()); 176 | } 177 | 178 | #[test] 179 | fn exponent_both_signs() { 180 | assert!(parse(r#"[0e+-1]"#).is_err()); 181 | } 182 | 183 | #[test] 184 | fn comma_instead_of_closing_brace() { 185 | assert!(parse(r#"{"Comma instead if closing brace": true,"#).is_err()); 186 | } 187 | 188 | #[test] 189 | fn missmatch() { 190 | assert!(parse(r#"["mismatch"}"#).is_err()); 191 | } 192 | 193 | #[test] 194 | fn unclosed_single_string() { 195 | assert!(parse(r#""hello"#).is_err()) 196 | } 197 | 198 | #[test] 199 | fn unfinished_single_boolean() { 200 | assert!(parse(r#"tru"#).is_err()); 201 | assert!(parse(r#"fa"#).is_err()); 202 | } 203 | 204 | #[test] 205 | fn unfinished_single_null() { 206 | assert!(parse(r#"nul"#).is_err()); 207 | } 208 | 209 | #[test] 210 | fn pass_single_string() { 211 | assert_eq!(parse(r#""hello""#).unwrap(), JsonType::String); 212 | } 213 | 214 | #[test] 215 | fn pass_single_integer() { 216 | assert_eq!(parse(r#"235896"#).unwrap(), JsonType::Number); 217 | assert_eq!(parse(r#"-235896"#).unwrap(), JsonType::Number); 218 | } 219 | 220 | #[test] 221 | fn pass_single_float() { 222 | assert_eq!(parse(r#"235896.789076"#).unwrap(), JsonType::Number); 223 | assert_eq!(parse(r#"-235896.0"#).unwrap(), JsonType::Number); 224 | } 225 | 226 | #[test] 227 | fn 
pass_single_exponent() { 228 | assert_eq!(parse(r#"235896.10e-56"#).unwrap(), JsonType::Number); 229 | } 230 | 231 | #[test] 232 | fn pass_single_fraction() { 233 | assert_eq!(parse(r#"235896."#).unwrap(), JsonType::Number); 234 | } 235 | 236 | #[test] 237 | fn pass_single_boolean() { 238 | assert_eq!(parse(r#"true"#).unwrap(), JsonType::Bool); 239 | assert_eq!(parse(r#"false"#).unwrap(), JsonType::Bool); 240 | } 241 | 242 | #[test] 243 | fn pass_single_null() { 244 | assert_eq!(parse(r#"null"#).unwrap(), JsonType::Null); 245 | } 246 | 247 | #[test] 248 | fn pass_1() { 249 | let outer_type = parse(r##" 250 | 251 | [ 252 | "JSON Test Pattern pass1", 253 | {"object with 1 member":["array with 1 element"]}, 254 | {}, 255 | [], 256 | -42, 257 | true, 258 | false, 259 | null, 260 | { 261 | "integer": 1234567890, 262 | "real": -9876.543210, 263 | "e": 0.123456789e-12, 264 | "E": 1.234567890E+34, 265 | "": 23456789012E66, 266 | "zero": 0, 267 | "one": 1, 268 | "space": " ", 269 | "quote": "\"", 270 | "backslash": "\\", 271 | "controls": "\b\f\n\r\t", 272 | "slash": "/ & \/", 273 | "alpha": "abcdefghijklmnopqrstuvwyz", 274 | "ALPHA": "ABCDEFGHIJKLMNOPQRSTUVWYZ", 275 | "digit": "0123456789", 276 | "0123456789": "digit", 277 | "special": "`1~!@#$%^&*()_+-={':[,]}|;.?", 278 | "hex": "\u0123\u4567\u89AB\uCDEF\uabcd\uef4A", 279 | "true": true, 280 | "false": false, 281 | "null": null, 282 | "array":[ ], 283 | "object":{ }, 284 | "address": "50 St. James Street", 285 | "url": "http://www.JSON.org/", 286 | "comment": "// /* */": " ", 288 | " s p a c e d " :[1,2 , 3 289 | 290 | , 291 | 292 | 4 , 5 , 6 ,7 ],"compact":[1,2,3,4,5,6,7], 293 | "jsontext": "{\"object with 1 member\":[\"array with 1 element\"]}", 294 | "quotes": "" \u0022 %22 0x22 034 "", 295 | "\/\\\"\uCAFE\uBABE\uAB98\uFCDE\ubcda\uef4A\b\f\n\r\t`1~!@#$%^&*()_+-=[]{}|;:',./<>?" 
296 | : "A key can be any string" 297 | }, 298 | 0.5 ,98.6 299 | , 300 | 99.44 301 | , 302 | 303 | 1066, 304 | 1e1, 305 | 0.1e1, 306 | 1e-1, 307 | 1e00,2e+00,2e-00 308 | ,"rosebud"] 309 | 310 | "##).unwrap(); 311 | 312 | assert_eq!(outer_type, JsonType::Array); 313 | } 314 | 315 | #[test] 316 | fn pass_2() { 317 | let outer_type = parse(r#"[[[[[[[[[[[[[[[[[[["Not too deep"]]]]]]]]]]]]]]]]]]]"#).unwrap(); 318 | 319 | assert_eq!(outer_type, JsonType::Array); 320 | } 321 | 322 | #[test] 323 | fn pass_3() { 324 | let outer_type = parse(r#" 325 | 326 | { 327 | "JSON Test Pattern pass3": { 328 | "The outermost value": "must be an object or array.", 329 | "In this test": "It is an object." 330 | } 331 | } 332 | 333 | "#).unwrap(); 334 | 335 | assert_eq!(outer_type, JsonType::Object); 336 | } 337 | --------------------------------------------------------------------------------