├── .gitignore ├── CHANGELOG.md ├── .github └── workflows │ ├── rust.yml │ └── codspeed_bench.yml ├── Cargo.toml ├── tests ├── data │ ├── json_gen.py │ └── bench.sh └── test_builder.rs ├── src ├── strategy │ ├── base.rs │ ├── scalar.rs │ ├── mod.rs │ ├── object.rs │ └── array.rs ├── builder.rs ├── main.rs ├── lib.rs └── node.rs ├── README.md ├── benches └── build_schema_bench.rs ├── LICENSE └── Cargo.lock /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | *.svg 3 | tests/data/*.json -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # CHANGELOG 2 | 3 | # v0.2.0 4 | - Support generating schema from multiple JSON files 5 | 6 | # v0.1.0 7 | - Initial release 🚀 -------------------------------------------------------------------------------- /.github/workflows/rust.yml: -------------------------------------------------------------------------------- 1 | name: rust build & test 2 | 3 | on: 4 | push: 5 | branches: [ "master" ] 6 | pull_request: 7 | branches: [ "master" ] 8 | 9 | env: 10 | CARGO_TERM_COLOR: always 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | - uses: actions/checkout@v4 19 | - name: Build 20 | run: cargo build --verbose 21 | - name: Run tests 22 | run: cargo test --verbose 23 | -------------------------------------------------------------------------------- /.github/workflows/codspeed_bench.yml: -------------------------------------------------------------------------------- 1 | name: codspeed benchmarks 2 | 3 | on: 4 | push: 5 | branches: 6 | - "master" 7 | pull_request: 8 | # `workflow_dispatch` allows CodSpeed to trigger backtest 9 | # performance analysis in order to generate initial data. 
10 | workflow_dispatch: 11 | 12 | jobs: 13 | benchmarks: 14 | runs-on: ubuntu-latest 15 | steps: 16 | - uses: actions/checkout@v3 17 | 18 | - name: Setup rust toolchain, cache and cargo-codspeed binary 19 | uses: moonrepo/setup-rust@v0 20 | with: 21 | channel: stable 22 | cache-target: release 23 | bins: cargo-codspeed 24 | 25 | - name: Build the benchmark target(s) 26 | run: cargo codspeed build 27 | 28 | - name: Run the benchmarks 29 | uses: CodSpeedHQ/action@v2 30 | with: 31 | token: ${{ secrets.CODSPEED_TOKEN }} 32 | run: cargo codspeed run 33 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "genson-rs" 3 | authors = ["Junyu Wang "] 4 | version = "0.2.0" 5 | edition = "2021" 6 | description = "Extremely fast JSON Schema inference engine built in Rust" 7 | homepage = "https://github.com/junyu-w/genson-rs" 8 | repository = "https://github.com/junyu-w/genson-rs" 9 | readme = "README.md" 10 | license-file = "LICENSE" 11 | keywords = ["json", "schema", "inference", "genson"] 12 | categories = ["command-line-utilities", "parser-implementations"] 13 | 14 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 15 | 16 | [dependencies] 17 | clap = { version = "4.5.4", features = ["derive"] } 18 | mimalloc = "0.1.41" 19 | rayon = "1.10.0" 20 | regex = "1.10.4" 21 | serde_json = "1.0.116" 22 | simd-json = "0.13.10" 23 | 24 | [dev-dependencies] 25 | codspeed-criterion-compat = "2.6.0" 26 | criterion = "0.3" 27 | 28 | [[bench]] 29 | name = "build_schema_bench" 30 | harness = false 31 | -------------------------------------------------------------------------------- /tests/data/json_gen.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script generates a JSON file with a size of 50MB, 500MB, 1GB and 3GB. 
3 | """ 4 | import json 5 | 6 | three_gb_json_count = 10000000 7 | five_hundred_mb_json_count = three_gb_json_count // 6 8 | fifty_mb_json_count = three_gb_json_count // 60 9 | one_gb_json_count = five_hundred_mb_json_count * 2 10 | 11 | data = [] 12 | for i in range(one_gb_json_count): 13 | charge = { 14 | "status": "success", 15 | "data": { 16 | "order_id": "123456789", 17 | "customer": { 18 | "name": "John Doe", 19 | "email": "johndoe@example.com", 20 | "address": { 21 | "street": "123 Main St", 22 | "city": "San Francisco", 23 | "state": "CA", 24 | "zip": "94101" 25 | } 26 | }, 27 | "items": [ 28 | { 29 | "id": "item1", 30 | "name": "Product 1", 31 | "price": 99.99 32 | }, 33 | { 34 | "id": "item2", 35 | "name": "Product 2", 36 | "price": 49.99 37 | } 38 | ], 39 | "total_amount": 149.98 40 | } 41 | } 42 | if i % 2 == 0: 43 | charge["status"] = "failed" 44 | charge["data"].pop("items") 45 | if i % 3 == 0: 46 | charge["metadata"] = { 47 | "notes": "This is a test charge" 48 | } 49 | data.append(charge) 50 | 51 | # Write the JSON data to a file 52 | with open("test_large_1gb_full_json_array.json", "w") as file: 53 | # json_text = "\n".join([json.dumps(charge) for charge in data]) 54 | # file.write(json_text) 55 | json.dump(data, file) 56 | 57 | -------------------------------------------------------------------------------- /src/strategy/base.rs: -------------------------------------------------------------------------------- 1 | use serde_json::Value; 2 | use simd_json; 3 | 4 | /// base schema strategy trait 5 | pub trait SchemaStrategy { 6 | 7 | fn match_schema(schema: &Value) -> bool; 8 | fn match_object(object: &simd_json::BorrowedValue) -> bool; 9 | 10 | fn add_schema(&mut self, schema: &Value) { 11 | self.add_extra_keywords(schema) 12 | } 13 | 14 | fn add_object(&mut self, _object: &simd_json::BorrowedValue); 15 | 16 | fn to_schema(&self) -> Value { 17 | self.get_extra_keywords().clone() 18 | } 19 | 20 | fn add_extra_keywords(&mut self, schema: &Value) { 21 
| if let Value::Object(schema) = schema { 22 | schema.iter().for_each(|(key, value)| { 23 | let keywords = self.get_extra_keywords_mut(); 24 | match keywords { 25 | Value::Object(keywords) => { 26 | if key == "type" { 27 | return; 28 | } else if !keywords.contains_key(key) { 29 | // add the property from the input schema if it doesn't already exist 30 | keywords.insert(key.to_string(), value.clone()); 31 | } 32 | }, 33 | _ => () 34 | } 35 | }); 36 | } 37 | } 38 | 39 | fn get_extra_keywords_mut(&mut self) -> &mut Value; 40 | 41 | fn get_extra_keywords(&self) -> &Value; 42 | } 43 | 44 | 45 | /// base schema strategy trait for scalar types 46 | pub trait ScalarSchemaStrategy: SchemaStrategy { 47 | fn js_type() -> &'static str; 48 | 49 | fn to_schema(&self) -> Value { 50 | let mut schema = SchemaStrategy::to_schema(self); 51 | schema["type"] = Value::String(Self::js_type().to_string()); 52 | schema 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /tests/data/bench.sh: -------------------------------------------------------------------------------- 1 | echo "Benching genson-rs" 2 | 3 | echo "[genson-rs] Running benchmarks for 50MB file" 4 | time ./target/release/genson-rs ./tests/data/test_small_50mb.json -d newline 5 | time ./target/release/genson-rs ./tests/data/test_small_50mb.json -d newline 6 | time ./target/release/genson-rs ./tests/data/test_small_50mb.json -d newline 7 | 8 | echo "[genson-rs] Running benchmarks for 500MB file" 9 | time ./target/release/genson-rs ./tests/data/test_medium_500mb.json -d newline 10 | time ./target/release/genson-rs ./tests/data/test_medium_500mb.json -d newline 11 | time ./target/release/genson-rs ./tests/data/test_medium_500mb.json -d newline 12 | 13 | echo "[genson-rs] Running benchmarks for 1GB file" 14 | time ./target/release/genson-rs ./tests/data/test_large_1gb.json -d newline 15 | time ./target/release/genson-rs ./tests/data/test_large_1gb.json -d newline 16 | time 
./target/release/genson-rs ./tests/data/test_large_1gb.json -d newline 17 | 18 | echo "[genson-rs] Running benchmarks for 3GB file" 19 | time ./target/release/genson-rs ./tests/data/test_large_3gb.json -d newline 20 | time ./target/release/genson-rs ./tests/data/test_large_3gb.json -d newline 21 | time ./target/release/genson-rs ./tests/data/test_large_3gb.json -d newline 22 | 23 | echo "[genson-rs] Running benchmarks for 3GB large JSON array file" 24 | time ./target/release/genson-rs ./tests/data/test_large_3gb_full_json_array.json 25 | time ./target/release/genson-rs ./tests/data/test_large_3gb_full_json_array.json 26 | time ./target/release/genson-rs ./tests/data/test_large_3gb_full_json_array.json 27 | 28 | echo "Benching GenSON (Python)" 29 | 30 | echo "[GenSON] Running benchmarks for 50MB file" 31 | time genson ./tests/data/test_small_50mb.json 32 | time genson ./tests/data/test_small_50mb.json 33 | time genson ./tests/data/test_small_50mb.json 34 | 35 | echo "[GenSON] Running benchmarks for 500MB file" 36 | time genson ./tests/data/test_medium_500mb.json 37 | time genson ./tests/data/test_medium_500mb.json 38 | time genson ./tests/data/test_medium_500mb.json 39 | 40 | echo "[GenSON] Running benchmarks for 1GB file" 41 | time genson ./tests/data/test_large_1gb.json 42 | time genson ./tests/data/test_large_1gb.json 43 | time genson ./tests/data/test_large_1gb.json 44 | 45 | echo "[GenSON] Running benchmarks for 3GB large JSON array file" 46 | time genson ./tests/data/test_large_3gb_full_json_array.json 47 | time genson ./tests/data/test_large_3gb_full_json_array.json 48 | time genson ./tests/data/test_large_3gb_full_json_array.json -------------------------------------------------------------------------------- /src/builder.rs: -------------------------------------------------------------------------------- 1 | use serde_json::{Value, json}; 2 | use simd_json; 3 | 4 | use crate::node::{DataType, SchemaNode}; 5 | 6 | const DEFAULT_SCHEMA_URI: &str = 
"http://json-schema.org/schema#"; 7 | const NULL_SCHEMA_URI: &str = "NULL"; 8 | 9 | 10 | pub struct SchemaBuilder { 11 | schema_uri: Option, 12 | root_node: SchemaNode, 13 | } 14 | 15 | impl SchemaBuilder { 16 | /// Create a new SchemaBuilder object. The schema_uri parameter is optional, a value 17 | /// of "AUTO" will automatically detect the schema URI based on the input schema, if no 18 | /// schema URI was detected, a default URI of "http://json-schema.org/schema#" will be used. 19 | /// A value of None will leave out the "$schema" keyword in the output schema. 20 | pub fn new(schema_uri: Option<&str>) -> Self { 21 | // TODO: the functionality to allow non-default node class with extended 22 | // strategies is not supported yet 23 | let root_node: SchemaNode = SchemaNode::new(); 24 | 25 | if let Some(uri) = schema_uri { 26 | if uri == "AUTO" { 27 | SchemaBuilder { schema_uri: None, root_node } 28 | } else { 29 | SchemaBuilder { schema_uri: Some(uri.to_string()), root_node } 30 | } 31 | } else { 32 | SchemaBuilder { schema_uri: Some(NULL_SCHEMA_URI.to_string()), root_node } 33 | } 34 | } 35 | 36 | /// Merge in raw JSON schema object 37 | pub fn add_schema(&mut self, mut schema: Value) { 38 | if let Value::Object(ref mut schema_obj) = schema { 39 | if schema_obj.contains_key("$schema") && self.schema_uri.is_none() { 40 | self.schema_uri = Some(schema_obj["$schema"].as_str().unwrap().to_string()); 41 | schema_obj.remove("$schema"); 42 | } 43 | self.root_node.add_schema(DataType::Schema(&schema)); 44 | } else { 45 | panic!("Invalid schema type - must be a valid JSON object") 46 | } 47 | } 48 | 49 | /// Merge in another SchemaNode object 50 | pub fn add_schema_node(&mut self, node: SchemaNode) { 51 | self.root_node.add_schema(DataType::SchemaNode(&node)); 52 | } 53 | 54 | /// Modify the schema to accomodate the input object 55 | pub fn add_object(&mut self, object: &simd_json::BorrowedValue) { 56 | self.root_node.add_object(DataType::Object(object)); 57 | } 58 | 59 
| /// Export the currently constructed schema as a JSON object 60 | pub fn to_schema(&self) -> Value { 61 | let mut base_schema = self.get_base_schema(); 62 | 63 | let base_schema_map = base_schema.as_object_mut().unwrap(); 64 | let node_schema = self.root_node.to_schema(); 65 | let node_schema_map = node_schema.as_object().unwrap(); 66 | 67 | for (key, value) in node_schema_map.iter() { 68 | base_schema_map.insert(key.to_string(), value.clone()); 69 | } 70 | return base_schema; 71 | } 72 | 73 | fn get_base_schema(&self) -> Value { 74 | if let Some(uri) = &self.schema_uri { 75 | if uri == NULL_SCHEMA_URI { 76 | return json!({}); 77 | } else { 78 | return json!({"$schema": uri}); 79 | } 80 | } else { 81 | return json!({"$schema": DEFAULT_SCHEMA_URI}); 82 | } 83 | } 84 | 85 | /// Serialize the currently constructed schema to a JSON string 86 | pub fn to_json(&self) -> String { 87 | let schema = self.to_schema(); 88 | return schema.to_string(); 89 | } 90 | 91 | 92 | } 93 | -------------------------------------------------------------------------------- /src/main.rs: -------------------------------------------------------------------------------- 1 | use std::process; 2 | 3 | use clap::{ArgAction, Parser}; 4 | use genson_rs::*; 5 | use rayon::iter::{IntoParallelRefIterator, ParallelIterator}; 6 | use std::mem; 7 | 8 | #[derive(Parser)] 9 | #[command(name = "genson-rs")] 10 | #[command(version = "0.2")] 11 | #[command(about = "Generate one, unified JSON Schema from JSON objects. Compatible with JSON Schema Draft-4 and above.", long_about = None)] 12 | #[command(author = "Junyu Wang ")] 13 | #[command(arg_required_else_help = true)] 14 | struct Cli { 15 | #[arg(short, long)] 16 | /// Must be one of "newline", "tab", "space". 17 | /// Specifying a delimiter is optional, but will improve the performance 18 | /// if your input is multiple JSON objects concatenated together (e.g. 
each object on a newline) 19 | delimiter: Option, 20 | 21 | #[arg(short, long, action=ArgAction::SetTrue, default_value="false")] 22 | /// Only applicable if you JSON file is one JSON arrary, and 23 | /// you only care about the schema of the JSON objects inside of it 24 | ignore_outer_array: bool, 25 | 26 | /// Path to the JSON file(s) to generate the schema from. The generated schema will 27 | /// accomodate all the JSON objects in the file(s). 28 | json_files: Option>, 29 | } 30 | 31 | /// Get the delimiter byte from the CLI arguments 32 | fn get_delimiter(cli: &Cli) -> Option { 33 | let d = cli.delimiter.as_deref(); 34 | if let Some(delimiter) = d { 35 | let delimiter = match delimiter { 36 | "newline" => Some(b'\n'), 37 | "tab" => Some(b'\t'), 38 | "space" => Some(b' '), 39 | _ => { 40 | panic!("Invalid delimiter: {}, must be one of \"newline\", \"tab\", \"space\"", delimiter); 41 | }, 42 | }; 43 | delimiter 44 | } else { 45 | None 46 | } 47 | } 48 | 49 | /// Generate a JSON Schema from a JSON file 50 | fn build_schema( 51 | builder: &mut SchemaBuilder, file_path: &str, config: BuildConfig 52 | ) { 53 | let mut object_slice = std::fs::read(file_path).unwrap(); 54 | build_json_schema(builder, &mut object_slice, &config); 55 | // NOTE: avoid dropping the object_slice to improve performance 56 | // the effect is more siginificant for larger JSON files 57 | mem::forget(object_slice); 58 | } 59 | 60 | fn main() { 61 | let cli = Cli::parse(); 62 | let delimiter = get_delimiter(&cli); 63 | 64 | if let Some(json_files) = cli.json_files.as_deref() { 65 | // parallelize the schema building process for multiple JSON files 66 | let aggregated_builder = json_files.par_iter() 67 | .fold( 68 | || get_builder(Some("AUTO")), 69 | |mut builder, file_path| { 70 | build_schema(&mut builder, file_path, BuildConfig { 71 | delimiter, 72 | ignore_outer_array: cli.ignore_outer_array, 73 | }); 74 | return builder; 75 | }).reduce( 76 | || get_builder(Some("AUTO")), 77 | |mut builder, 
other_builder| { 78 | builder.add_schema(other_builder.to_schema()); 79 | return builder; 80 | }); 81 | 82 | let schema = aggregated_builder.to_schema(); 83 | println!("{}", schema.to_string()); 84 | 85 | // NOTE: early exit here to avoid dropping of the `object` variable 86 | // which takes about 15~35% of the total runtime (depending on the size of the object) 87 | process::exit(0); 88 | } else { 89 | println!("No JSON files provided. Use --help for more information."); 90 | process::exit(1); 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # genson-rs 2 | 3 | [![CodSpeed Badge](https://img.shields.io/endpoint?url=https://codspeed.io/badge.json)](https://codspeed.io/junyu-w/genson-rs) 4 | [![crates.io](https://img.shields.io/crates/v/genson-rs.svg)](https://crates.io/crates/genson-rs) 5 | [![CI](https://github.com/junyu-w/genson-rs/actions/workflows/rust.yml/badge.svg)](https://github.com/junyu-w/genson-rs/actions/workflows/rust.yml) 6 | 7 | *-- 🔥 Generate JSON Schema from Gigabytes of JSON data in seconds* 8 | 9 | `genson-rs` is a Rust rewrite of the [GenSON](https://github.com/wolverdude/genson/) Python library, which can be used to generate [JSON schema](https://json-schema.org/) (Draft-04 and after) from one or multiple JSON objects. 10 | 11 | While not having full feature parity yet, `genson-rs` focuses on **speed** ⚡️. It offers MUCH better performance (**25x ~ 75x faster**) compared to the Python `GenSON` library, and is generally a lot faster than other open source schema inference tools as well. Its high performance makes it a viable choice for online schema inference for large JSON datasets at scale. Check out the [benchmark](#benchmark) section for performance benchmark comparisons. 
12 | 13 | ## Install 14 | Installation via [Cargo](https://doc.rust-lang.org/cargo/getting-started/installation.html) is the easiest. If you don't have it already, follow the link to set up Cargo (one simple command), then run: 15 | ``` 16 | cargo install genson-rs 17 | ``` 18 | Installing via `brew` will be supported soon. 19 | 20 | ## Usage 21 | ``` 22 | genson-rs