├── .circleci
│   └── config.yml
├── .gitignore
├── CHANGELOG.md
├── CODE_OF_CONDUCT.md
├── Cargo.lock
├── Cargo.toml
├── LICENSE
├── README.md
├── build.rs
├── docs
│   ├── development.md
│   └── examples.md
├── scripts
│   ├── README.md
│   ├── mps-download-sampled-data.py
│   ├── mps-download-schemas.sh
│   ├── mps-generate-avro-data-helper.py
│   ├── mps-generate-avro-data.sh
│   ├── mps-generate-schemas.sh
│   ├── mps-load-avro-bq.sh
│   ├── mps-validate-avro-schemas.py
│   ├── mps-verify-nested-list.sh
│   └── mps-verify-tuple-struct.sh
├── src
│   ├── ast.rs
│   ├── avro.rs
│   ├── bigquery.rs
│   ├── casing.rs
│   ├── jsonschema.rs
│   ├── lib.rs
│   ├── main.rs
│   └── traits.rs
└── tests
    ├── force_nullable.rs
    ├── normalize_case.rs
    ├── resolve_method.rs
    ├── resources
    │   ├── casing
    │   │   ├── alphanum_3.csv
    │   │   ├── mps-diff-integration.csv
    │   │   └── word_4.csv
    │   └── translate
    │       ├── array.json
    │       ├── atomic.json
    │       ├── json_column.json
    │       ├── map.json
    │       ├── object.json
    │       └── oneof.json
    ├── transpile_avro.rs
    ├── transpile_bigquery.rs
    └── tuple_struct.rs

/.circleci/config.yml:
--------------------------------------------------------------------------------
 1 | version: 2
 2 | jobs:
 3 |   build:
 4 |     docker:
 5 |       - image: rust:1
 6 |     steps:
 7 |       - checkout
 8 |       - run:
 9 |           name: Install dependencies
10 |           command: |
11 |             apt update
12 |             apt install --yes --no-install-recommends libclang-dev
13 |       - run:
14 |           name: Version information
15 |           command: rustc --version; cargo --version; rustup --version
16 |       - run:
17 |           name: Linting dependencies
18 |           command: rustup component add rustfmt clippy
19 |       - run:
20 |           name: Calculate Dependencies
21 |           command: cargo generate-lockfile
22 |       - restore_cache:
23 |           keys:
24 |             - deps-{{ arch }}-{{ checksum "Cargo.lock" }}
25 |       - run:
26 |           name: Build all
27 |           command: cargo build --all
28 |       - save_cache:
29 |           paths:
30 |             - /usr/local/cargo/registry
31 |             - target/debug/.fingerprint
32 |             - target/debug/build
33 |             - target/debug/deps
34 |           key: deps-{{ arch }}-{{ checksum "Cargo.lock" }}
35 |       - run:
36 |           name: Assert compiled tests were committed
37 |           command: git diff --exit-code -- tests/
38 |       - run:
39 |           name: Run linting
40 |           command: |
41 |             set -eo pipefail
42 |             cargo fmt -- --check
43 |             cargo clippy
44 |       - run:
45 |           name: Run all tests
46 |           command: |
47 |             cargo test --all
48 |             # also run tests with oniguruma
49 |             cargo test --all --features oniguruma
50 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | __pycache__/
3 | 
4 | /target
5 | **/*.rs.bk
6 | 
7 | /schemas
8 | /test_tuple_results
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # v2.0.0 (2024-02-08)
2 | 
3 | * Dependency updates
4 | * BigQuery: Convert `json` atoms to a JSON column type if configured in `mozPipelineMetadata`
5 | 
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Community Participation Guidelines
2 | 
3 | This repository is governed by Mozilla's code of conduct and etiquette guidelines.
4 | For more details, please read the
5 | [Mozilla Community Participation Guidelines](https://www.mozilla.org/about/governance/policies/participation/).
6 | 7 | ## How to Report 8 | For more information on how to report violations of the Community Participation Guidelines, please read our '[How to Report](https://www.mozilla.org/about/governance/policies/participation/reporting/)' page. 9 | -------------------------------------------------------------------------------- /Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 3 | version = 3 4 | 5 | [[package]] 6 | name = "aho-corasick" 7 | version = "1.1.2" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0" 10 | dependencies = [ 11 | "memchr", 12 | ] 13 | 14 | [[package]] 15 | name = "ansi_term" 16 | version = "0.12.1" 17 | source = "registry+https://github.com/rust-lang/crates.io-index" 18 | checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2" 19 | dependencies = [ 20 | "winapi", 21 | ] 22 | 23 | [[package]] 24 | name = "anstream" 25 | version = "0.6.11" 26 | source = "registry+https://github.com/rust-lang/crates.io-index" 27 | checksum = "6e2e1ebcb11de5c03c67de28a7df593d32191b44939c482e97702baaaa6ab6a5" 28 | dependencies = [ 29 | "anstyle", 30 | "anstyle-parse", 31 | "anstyle-query", 32 | "anstyle-wincon", 33 | "colorchoice", 34 | "utf8parse", 35 | ] 36 | 37 | [[package]] 38 | name = "anstyle" 39 | version = "1.0.6" 40 | source = "registry+https://github.com/rust-lang/crates.io-index" 41 | checksum = "8901269c6307e8d93993578286ac0edf7f195079ffff5ebdeea6a59ffb7e36bc" 42 | 43 | [[package]] 44 | name = "anstyle-parse" 45 | version = "0.2.3" 46 | source = "registry+https://github.com/rust-lang/crates.io-index" 47 | checksum = "c75ac65da39e5fe5ab759307499ddad880d724eed2f6ce5b5e8a26f4f387928c" 48 | dependencies = [ 49 | "utf8parse", 50 | ] 51 | 52 | [[package]] 53 | name = "anstyle-query" 54 | version = "1.0.2" 55 | source = "registry+https://github.com/rust-lang/crates.io-index" 56 | checksum = "e28923312444cdd728e4738b3f9c9cac739500909bb3d3c94b43551b16517648" 57 | dependencies = [ 58 | "windows-sys 0.52.0", 59 | ] 60 | 61 | [[package]] 62 | name = "anstyle-wincon" 63 | version = "3.0.2" 64 | source = "registry+https://github.com/rust-lang/crates.io-index" 65 | checksum = "1cd54b81ec8d6180e24654d0b371ad22fc3dd083b6ff8ba325b72e00c87660a7" 66 | dependencies = [ 67 | "anstyle", 68 | "windows-sys 0.52.0", 69 | ] 70 | 71 | [[package]] 72 | name = "atty" 73 | version = "0.2.14" 74 | source = "registry+https://github.com/rust-lang/crates.io-index" 75 | checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" 76 | dependencies = [ 77 | "hermit-abi 0.1.19", 78 | "libc", 79 | "winapi", 80 | ] 81 | 82 | [[package]] 83 | name = "bindgen" 84 | version = "0.59.2" 85 | source = "registry+https://github.com/rust-lang/crates.io-index" 86 | checksum = "2bd2a9a458e8f4304c52c43ebb0cfbd520289f8379a52e329a38afda99bf8eb8" 87 | dependencies = [ 88 | "bitflags 1.3.2", 89 | "cexpr", 90 | "clang-sys", 91 | "clap 2.34.0", 92 | "env_logger 0.9.3", 93 | "lazy_static", 94 | "lazycell", 95 | "log", 96 | "peeking_take_while", 97 | "proc-macro2", 98 | "quote", 99 | "regex", 100 | "rustc-hash", 101 | "shlex", 102 | "which", 103 | ] 104 | 105 | [[package]] 106 | name = "bitflags" 107 | version = "1.3.2" 108 | source = "registry+https://github.com/rust-lang/crates.io-index" 109 | checksum = 
"bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" 110 | 111 | [[package]] 112 | name = "bitflags" 113 | version = "2.4.2" 114 | source = "registry+https://github.com/rust-lang/crates.io-index" 115 | checksum = "ed570934406eb16438a4e976b1b4500774099c13b8cb96eec99f620f05090ddf" 116 | 117 | [[package]] 118 | name = "cc" 119 | version = "1.0.83" 120 | source = "registry+https://github.com/rust-lang/crates.io-index" 121 | checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0" 122 | dependencies = [ 123 | "libc", 124 | ] 125 | 126 | [[package]] 127 | name = "cexpr" 128 | version = "0.6.0" 129 | source = "registry+https://github.com/rust-lang/crates.io-index" 130 | checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" 131 | dependencies = [ 132 | "nom", 133 | ] 134 | 135 | [[package]] 136 | name = "cfg-if" 137 | version = "1.0.0" 138 | source = "registry+https://github.com/rust-lang/crates.io-index" 139 | checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" 140 | 141 | [[package]] 142 | name = "clang-sys" 143 | version = "1.7.0" 144 | source = "registry+https://github.com/rust-lang/crates.io-index" 145 | checksum = "67523a3b4be3ce1989d607a828d036249522dd9c1c8de7f4dd2dae43a37369d1" 146 | dependencies = [ 147 | "glob", 148 | "libc", 149 | "libloading", 150 | ] 151 | 152 | [[package]] 153 | name = "clap" 154 | version = "2.34.0" 155 | source = "registry+https://github.com/rust-lang/crates.io-index" 156 | checksum = "a0610544180c38b88101fecf2dd634b174a62eef6946f84dfc6a7127512b381c" 157 | dependencies = [ 158 | "ansi_term", 159 | "atty", 160 | "bitflags 1.3.2", 161 | "strsim 0.8.0", 162 | "textwrap", 163 | "unicode-width", 164 | "vec_map", 165 | ] 166 | 167 | [[package]] 168 | name = "clap" 169 | version = "4.4.18" 170 | source = "registry+https://github.com/rust-lang/crates.io-index" 171 | checksum = "1e578d6ec4194633722ccf9544794b71b1385c3c027efe0c55db226fc880865c" 172 | dependencies = [ 173 | "clap_builder", 174 | "clap_derive", 175 | ] 176 | 177 | [[package]] 178 | name = "clap_builder" 179 | version = "4.4.18" 180 | source = "registry+https://github.com/rust-lang/crates.io-index" 181 | checksum = "4df4df40ec50c46000231c914968278b1eb05098cf8f1b3a518a95030e71d1c7" 182 | dependencies = [ 183 | "anstream", 184 | "anstyle", 185 | "clap_lex", 186 | "strsim 0.10.0", 187 | ] 188 | 189 | [[package]] 190 | name = "clap_derive" 191 | version = "4.4.7" 192 | source = "registry+https://github.com/rust-lang/crates.io-index" 193 | checksum = "cf9804afaaf59a91e75b022a30fb7229a7901f60c755489cc61c9b423b836442" 194 | dependencies = [ 195 | "heck", 196 | "proc-macro2", 197 | "quote", 198 | "syn", 199 | ] 200 | 201 | [[package]] 202 | name = "clap_lex" 203 | version = "0.6.0" 204 | source = "registry+https://github.com/rust-lang/crates.io-index" 205 | checksum = "702fc72eb24e5a1e48ce58027a675bc24edd52096d5397d4aea7c6dd9eca0bd1" 206 | 207 | [[package]] 208 | name = "colorchoice" 209 | version = "1.0.0" 210 | source = "registry+https://github.com/rust-lang/crates.io-index" 211 | checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" 212 | 213 | [[package]] 214 | name = "diff" 215 | version = "0.1.13" 216 | source = "registry+https://github.com/rust-lang/crates.io-index" 217 | checksum = "56254986775e3233ffa9c4d7d3faaf6d36a2c09d30b20687e9f88bc8bafc16c8" 218 | 219 | [[package]] 220 | name = "either" 221 | version = "1.9.0" 222 | source = "registry+https://github.com/rust-lang/crates.io-index" 223 | checksum = 
"a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07" 224 | 225 | [[package]] 226 | name = "env_logger" 227 | version = "0.9.3" 228 | source = "registry+https://github.com/rust-lang/crates.io-index" 229 | checksum = "a12e6657c4c97ebab115a42dcee77225f7f482cdd841cf7088c657a42e9e00e7" 230 | dependencies = [ 231 | "atty", 232 | "humantime", 233 | "log", 234 | "regex", 235 | "termcolor", 236 | ] 237 | 238 | [[package]] 239 | name = "env_logger" 240 | version = "0.10.2" 241 | source = "registry+https://github.com/rust-lang/crates.io-index" 242 | checksum = "4cd405aab171cb85d6735e5c8d9db038c17d3ca007a4d2c25f337935c3d90580" 243 | dependencies = [ 244 | "humantime", 245 | "is-terminal", 246 | "log", 247 | "regex", 248 | "termcolor", 249 | ] 250 | 251 | [[package]] 252 | name = "errno" 253 | version = "0.3.8" 254 | source = "registry+https://github.com/rust-lang/crates.io-index" 255 | checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" 256 | dependencies = [ 257 | "libc", 258 | "windows-sys 0.52.0", 259 | ] 260 | 261 | [[package]] 262 | name = "glob" 263 | version = "0.3.1" 264 | source = "registry+https://github.com/rust-lang/crates.io-index" 265 | checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" 266 | 267 | [[package]] 268 | name = "heck" 269 | version = "0.4.1" 270 | source = "registry+https://github.com/rust-lang/crates.io-index" 271 | checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" 272 | 273 | [[package]] 274 | name = "hermit-abi" 275 | version = "0.1.19" 276 | source = "registry+https://github.com/rust-lang/crates.io-index" 277 | checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" 278 | dependencies = [ 279 | "libc", 280 | ] 281 | 282 | [[package]] 283 | name = "hermit-abi" 284 | version = "0.3.5" 285 | source = "registry+https://github.com/rust-lang/crates.io-index" 286 | checksum = "d0c62115964e08cb8039170eb33c1d0e2388a256930279edca206fff675f82c3" 287 | 288 | [[package]] 289 | name = "home" 290 | version = "0.5.9" 291 | source = "registry+https://github.com/rust-lang/crates.io-index" 292 | checksum = "e3d1354bf6b7235cb4a0576c2619fd4ed18183f689b12b006a0ee7329eeff9a5" 293 | dependencies = [ 294 | "windows-sys 0.52.0", 295 | ] 296 | 297 | [[package]] 298 | name = "humantime" 299 | version = "2.1.0" 300 | source = "registry+https://github.com/rust-lang/crates.io-index" 301 | checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" 302 | 303 | [[package]] 304 | name = "is-terminal" 305 | version = "0.4.10" 306 | source = "registry+https://github.com/rust-lang/crates.io-index" 307 | checksum = "0bad00257d07be169d870ab665980b06cdb366d792ad690bf2e76876dc503455" 308 | dependencies = [ 309 | "hermit-abi 0.3.5", 310 | "rustix", 311 | "windows-sys 0.52.0", 312 | ] 313 | 314 | [[package]] 315 | name = "itoa" 316 | version = "1.0.10" 317 | source = "registry+https://github.com/rust-lang/crates.io-index" 318 | checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c" 319 | 320 | [[package]] 321 | name = "jsonschema-transpiler" 322 | version = "2.0.0" 323 | dependencies = [ 324 | "clap 4.4.18", 325 | "env_logger 0.10.2", 326 | "heck", 327 | "lazy_static", 328 | "log", 329 | "maplit", 330 | "onig", 331 | "pretty_assertions", 332 | "regex", 333 | "serde", 334 | "serde_json", 335 | ] 336 | 337 | [[package]] 338 | name = "lazy_static" 339 | version = "1.4.0" 340 | source = "registry+https://github.com/rust-lang/crates.io-index" 341 | checksum = 
"e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" 342 | 343 | [[package]] 344 | name = "lazycell" 345 | version = "1.3.0" 346 | source = "registry+https://github.com/rust-lang/crates.io-index" 347 | checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" 348 | 349 | [[package]] 350 | name = "libc" 351 | version = "0.2.153" 352 | source = "registry+https://github.com/rust-lang/crates.io-index" 353 | checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" 354 | 355 | [[package]] 356 | name = "libloading" 357 | version = "0.8.1" 358 | source = "registry+https://github.com/rust-lang/crates.io-index" 359 | checksum = "c571b676ddfc9a8c12f1f3d3085a7b163966a8fd8098a90640953ce5f6170161" 360 | dependencies = [ 361 | "cfg-if", 362 | "windows-sys 0.48.0", 363 | ] 364 | 365 | [[package]] 366 | name = "linux-raw-sys" 367 | version = "0.4.13" 368 | source = "registry+https://github.com/rust-lang/crates.io-index" 369 | checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" 370 | 371 | [[package]] 372 | name = "log" 373 | version = "0.4.20" 374 | source = "registry+https://github.com/rust-lang/crates.io-index" 375 | checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" 376 | 377 | [[package]] 378 | name = "maplit" 379 | version = "1.0.2" 380 | source = "registry+https://github.com/rust-lang/crates.io-index" 381 | checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d" 382 | 383 | [[package]] 384 | name = "memchr" 385 | version = "2.7.1" 386 | source = "registry+https://github.com/rust-lang/crates.io-index" 387 | checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" 388 | 389 | [[package]] 390 | name = "minimal-lexical" 391 | version = "0.2.1" 392 | source = "registry+https://github.com/rust-lang/crates.io-index" 393 | checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" 394 | 395 | [[package]] 396 | name = "nom" 397 | version = "7.1.3" 398 | source = "registry+https://github.com/rust-lang/crates.io-index" 399 | checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" 400 | dependencies = [ 401 | "memchr", 402 | "minimal-lexical", 403 | ] 404 | 405 | [[package]] 406 | name = "once_cell" 407 | version = "1.19.0" 408 | source = "registry+https://github.com/rust-lang/crates.io-index" 409 | checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" 410 | 411 | [[package]] 412 | name = "onig" 413 | version = "6.4.0" 414 | source = "registry+https://github.com/rust-lang/crates.io-index" 415 | checksum = "8c4b31c8722ad9171c6d77d3557db078cab2bd50afcc9d09c8b315c59df8ca4f" 416 | dependencies = [ 417 | "bitflags 1.3.2", 418 | "libc", 419 | "once_cell", 420 | "onig_sys", 421 | ] 422 | 423 | [[package]] 424 | name = "onig_sys" 425 | version = "69.8.1" 426 | source = "registry+https://github.com/rust-lang/crates.io-index" 427 | checksum = "7b829e3d7e9cc74c7e315ee8edb185bf4190da5acde74afd7fc59c35b1f086e7" 428 | dependencies = [ 429 | "bindgen", 430 | "cc", 431 | "pkg-config", 432 | ] 433 | 434 | [[package]] 435 | name = "peeking_take_while" 436 | version = "0.1.2" 437 | source = "registry+https://github.com/rust-lang/crates.io-index" 438 | checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" 439 | 440 | [[package]] 441 | name = "pkg-config" 442 | version = "0.3.29" 443 | source = "registry+https://github.com/rust-lang/crates.io-index" 444 | checksum = 
"2900ede94e305130c13ddd391e0ab7cbaeb783945ae07a279c268cb05109c6cb" 445 | 446 | [[package]] 447 | name = "pretty_assertions" 448 | version = "1.4.0" 449 | source = "registry+https://github.com/rust-lang/crates.io-index" 450 | checksum = "af7cee1a6c8a5b9208b3cb1061f10c0cb689087b3d8ce85fb9d2dd7a29b6ba66" 451 | dependencies = [ 452 | "diff", 453 | "yansi", 454 | ] 455 | 456 | [[package]] 457 | name = "proc-macro2" 458 | version = "1.0.78" 459 | source = "registry+https://github.com/rust-lang/crates.io-index" 460 | checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae" 461 | dependencies = [ 462 | "unicode-ident", 463 | ] 464 | 465 | [[package]] 466 | name = "quote" 467 | version = "1.0.35" 468 | source = "registry+https://github.com/rust-lang/crates.io-index" 469 | checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" 470 | dependencies = [ 471 | "proc-macro2", 472 | ] 473 | 474 | [[package]] 475 | name = "regex" 476 | version = "1.10.3" 477 | source = "registry+https://github.com/rust-lang/crates.io-index" 478 | checksum = "b62dbe01f0b06f9d8dc7d49e05a0785f153b00b2c227856282f671e0318c9b15" 479 | dependencies = [ 480 | "aho-corasick", 481 | "memchr", 482 | "regex-automata", 483 | "regex-syntax", 484 | ] 485 | 486 | [[package]] 487 | name = "regex-automata" 488 | version = "0.4.5" 489 | source = "registry+https://github.com/rust-lang/crates.io-index" 490 | checksum = "5bb987efffd3c6d0d8f5f89510bb458559eab11e4f869acb20bf845e016259cd" 491 | dependencies = [ 492 | "aho-corasick", 493 | "memchr", 494 | "regex-syntax", 495 | ] 496 | 497 | [[package]] 498 | name = "regex-syntax" 499 | version = "0.8.2" 500 | source = "registry+https://github.com/rust-lang/crates.io-index" 501 | checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" 502 | 503 | [[package]] 504 | name = "rustc-hash" 505 | version = "1.1.0" 506 | source = "registry+https://github.com/rust-lang/crates.io-index" 507 | checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" 508 | 509 | [[package]] 510 | name = "rustix" 511 | version = "0.38.31" 512 | source = "registry+https://github.com/rust-lang/crates.io-index" 513 | checksum = "6ea3e1a662af26cd7a3ba09c0297a31af215563ecf42817c98df621387f4e949" 514 | dependencies = [ 515 | "bitflags 2.4.2", 516 | "errno", 517 | "libc", 518 | "linux-raw-sys", 519 | "windows-sys 0.52.0", 520 | ] 521 | 522 | [[package]] 523 | name = "ryu" 524 | version = "1.0.16" 525 | source = "registry+https://github.com/rust-lang/crates.io-index" 526 | checksum = "f98d2aa92eebf49b69786be48e4477826b256916e84a57ff2a4f21923b48eb4c" 527 | 528 | [[package]] 529 | name = "serde" 530 | version = "1.0.196" 531 | source = "registry+https://github.com/rust-lang/crates.io-index" 532 | checksum = "870026e60fa08c69f064aa766c10f10b1d62db9ccd4d0abb206472bee0ce3b32" 533 | dependencies = [ 534 | "serde_derive", 535 | ] 536 | 537 | [[package]] 538 | name = "serde_derive" 539 | version = "1.0.196" 540 | source = "registry+https://github.com/rust-lang/crates.io-index" 541 | checksum = "33c85360c95e7d137454dc81d9a4ed2b8efd8fbe19cee57357b32b9771fccb67" 542 | dependencies = [ 543 | "proc-macro2", 544 | "quote", 545 | "syn", 546 | ] 547 | 548 | [[package]] 549 | name = "serde_json" 550 | version = "1.0.113" 551 | source = "registry+https://github.com/rust-lang/crates.io-index" 552 | checksum = "69801b70b1c3dac963ecb03a364ba0ceda9cf60c71cfe475e99864759c8b8a79" 553 | dependencies = [ 554 | "itoa", 555 | "ryu", 556 | "serde", 557 | ] 558 | 559 | [[package]] 
560 | name = "shlex" 561 | version = "1.3.0" 562 | source = "registry+https://github.com/rust-lang/crates.io-index" 563 | checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" 564 | 565 | [[package]] 566 | name = "strsim" 567 | version = "0.8.0" 568 | source = "registry+https://github.com/rust-lang/crates.io-index" 569 | checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" 570 | 571 | [[package]] 572 | name = "strsim" 573 | version = "0.10.0" 574 | source = "registry+https://github.com/rust-lang/crates.io-index" 575 | checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" 576 | 577 | [[package]] 578 | name = "syn" 579 | version = "2.0.48" 580 | source = "registry+https://github.com/rust-lang/crates.io-index" 581 | checksum = "0f3531638e407dfc0814761abb7c00a5b54992b849452a0646b7f65c9f770f3f" 582 | dependencies = [ 583 | "proc-macro2", 584 | "quote", 585 | "unicode-ident", 586 | ] 587 | 588 | [[package]] 589 | name = "termcolor" 590 | version = "1.4.1" 591 | source = "registry+https://github.com/rust-lang/crates.io-index" 592 | checksum = "06794f8f6c5c898b3275aebefa6b8a1cb24cd2c6c79397ab15774837a0bc5755" 593 | dependencies = [ 594 | "winapi-util", 595 | ] 596 | 597 | [[package]] 598 | name = "textwrap" 599 | version = "0.11.0" 600 | source = "registry+https://github.com/rust-lang/crates.io-index" 601 | checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" 602 | dependencies = [ 603 | "unicode-width", 604 | ] 605 | 606 | [[package]] 607 | name = "unicode-ident" 608 | version = "1.0.12" 609 | source = "registry+https://github.com/rust-lang/crates.io-index" 610 | checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" 611 | 612 | [[package]] 613 | name = "unicode-width" 614 | version = "0.1.11" 615 | source = "registry+https://github.com/rust-lang/crates.io-index" 616 | checksum = "e51733f11c9c4f72aa0c160008246859e340b00807569a0da0e7a1079b27ba85" 617 | 618 | [[package]] 619 | name = "utf8parse" 620 | version = "0.2.1" 621 | source = "registry+https://github.com/rust-lang/crates.io-index" 622 | checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" 623 | 624 | [[package]] 625 | name = "vec_map" 626 | version = "0.8.2" 627 | source = "registry+https://github.com/rust-lang/crates.io-index" 628 | checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" 629 | 630 | [[package]] 631 | name = "which" 632 | version = "4.4.2" 633 | source = "registry+https://github.com/rust-lang/crates.io-index" 634 | checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7" 635 | dependencies = [ 636 | "either", 637 | "home", 638 | "once_cell", 639 | "rustix", 640 | ] 641 | 642 | [[package]] 643 | name = "winapi" 644 | version = "0.3.9" 645 | source = "registry+https://github.com/rust-lang/crates.io-index" 646 | checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" 647 | dependencies = [ 648 | "winapi-i686-pc-windows-gnu", 649 | "winapi-x86_64-pc-windows-gnu", 650 | ] 651 | 652 | [[package]] 653 | name = "winapi-i686-pc-windows-gnu" 654 | version = "0.4.0" 655 | source = "registry+https://github.com/rust-lang/crates.io-index" 656 | checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" 657 | 658 | [[package]] 659 | name = "winapi-util" 660 | version = "0.1.6" 661 | source = "registry+https://github.com/rust-lang/crates.io-index" 662 | checksum = 
"f29e6f9198ba0d26b4c9f07dbe6f9ed633e1f3d5b8b414090084349e46a52596" 663 | dependencies = [ 664 | "winapi", 665 | ] 666 | 667 | [[package]] 668 | name = "winapi-x86_64-pc-windows-gnu" 669 | version = "0.4.0" 670 | source = "registry+https://github.com/rust-lang/crates.io-index" 671 | checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" 672 | 673 | [[package]] 674 | name = "windows-sys" 675 | version = "0.48.0" 676 | source = "registry+https://github.com/rust-lang/crates.io-index" 677 | checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" 678 | dependencies = [ 679 | "windows-targets 0.48.5", 680 | ] 681 | 682 | [[package]] 683 | name = "windows-sys" 684 | version = "0.52.0" 685 | source = "registry+https://github.com/rust-lang/crates.io-index" 686 | checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" 687 | dependencies = [ 688 | "windows-targets 0.52.0", 689 | ] 690 | 691 | [[package]] 692 | name = "windows-targets" 693 | version = "0.48.5" 694 | source = "registry+https://github.com/rust-lang/crates.io-index" 695 | checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" 696 | dependencies = [ 697 | "windows_aarch64_gnullvm 0.48.5", 698 | "windows_aarch64_msvc 0.48.5", 699 | "windows_i686_gnu 0.48.5", 700 | "windows_i686_msvc 0.48.5", 701 | "windows_x86_64_gnu 0.48.5", 702 | "windows_x86_64_gnullvm 0.48.5", 703 | "windows_x86_64_msvc 0.48.5", 704 | ] 705 | 706 | [[package]] 707 | name = "windows-targets" 708 | version = "0.52.0" 709 | source = "registry+https://github.com/rust-lang/crates.io-index" 710 | checksum = "8a18201040b24831fbb9e4eb208f8892e1f50a37feb53cc7ff887feb8f50e7cd" 711 | dependencies = [ 712 | "windows_aarch64_gnullvm 0.52.0", 713 | "windows_aarch64_msvc 0.52.0", 714 | "windows_i686_gnu 0.52.0", 715 | "windows_i686_msvc 0.52.0", 716 | "windows_x86_64_gnu 0.52.0", 717 | "windows_x86_64_gnullvm 0.52.0", 718 | "windows_x86_64_msvc 0.52.0", 719 | ] 720 | 721 | [[package]] 722 | name = "windows_aarch64_gnullvm" 723 | version = "0.48.5" 724 | source = "registry+https://github.com/rust-lang/crates.io-index" 725 | checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" 726 | 727 | [[package]] 728 | name = "windows_aarch64_gnullvm" 729 | version = "0.52.0" 730 | source = "registry+https://github.com/rust-lang/crates.io-index" 731 | checksum = "cb7764e35d4db8a7921e09562a0304bf2f93e0a51bfccee0bd0bb0b666b015ea" 732 | 733 | [[package]] 734 | name = "windows_aarch64_msvc" 735 | version = "0.48.5" 736 | source = "registry+https://github.com/rust-lang/crates.io-index" 737 | checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" 738 | 739 | [[package]] 740 | name = "windows_aarch64_msvc" 741 | version = "0.52.0" 742 | source = "registry+https://github.com/rust-lang/crates.io-index" 743 | checksum = "bbaa0368d4f1d2aaefc55b6fcfee13f41544ddf36801e793edbbfd7d7df075ef" 744 | 745 | [[package]] 746 | name = "windows_i686_gnu" 747 | version = "0.48.5" 748 | source = "registry+https://github.com/rust-lang/crates.io-index" 749 | checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" 750 | 751 | [[package]] 752 | name = "windows_i686_gnu" 753 | version = "0.52.0" 754 | source = "registry+https://github.com/rust-lang/crates.io-index" 755 | checksum = "a28637cb1fa3560a16915793afb20081aba2c92ee8af57b4d5f28e4b3e7df313" 756 | 757 | [[package]] 758 | name = "windows_i686_msvc" 759 | version = "0.48.5" 760 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 761 | checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" 762 | 763 | [[package]] 764 | name = "windows_i686_msvc" 765 | version = "0.52.0" 766 | source = "registry+https://github.com/rust-lang/crates.io-index" 767 | checksum = "ffe5e8e31046ce6230cc7215707b816e339ff4d4d67c65dffa206fd0f7aa7b9a" 768 | 769 | [[package]] 770 | name = "windows_x86_64_gnu" 771 | version = "0.48.5" 772 | source = "registry+https://github.com/rust-lang/crates.io-index" 773 | checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" 774 | 775 | [[package]] 776 | name = "windows_x86_64_gnu" 777 | version = "0.52.0" 778 | source = "registry+https://github.com/rust-lang/crates.io-index" 779 | checksum = "3d6fa32db2bc4a2f5abeacf2b69f7992cd09dca97498da74a151a3132c26befd" 780 | 781 | [[package]] 782 | name = "windows_x86_64_gnullvm" 783 | version = "0.48.5" 784 | source = "registry+https://github.com/rust-lang/crates.io-index" 785 | checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" 786 | 787 | [[package]] 788 | name = "windows_x86_64_gnullvm" 789 | version = "0.52.0" 790 | source = "registry+https://github.com/rust-lang/crates.io-index" 791 | checksum = "1a657e1e9d3f514745a572a6846d3c7aa7dbe1658c056ed9c3344c4109a6949e" 792 | 793 | [[package]] 794 | name = "windows_x86_64_msvc" 795 | version = "0.48.5" 796 | source = "registry+https://github.com/rust-lang/crates.io-index" 797 | checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" 798 | 799 | [[package]] 800 | name = "windows_x86_64_msvc" 801 | version = "0.52.0" 802 | source = "registry+https://github.com/rust-lang/crates.io-index" 803 | checksum = "dff9641d1cd4be8d1a070daf9e3773c5f67e78b4d9d42263020c057706765c04" 804 | 805 | [[package]] 806 | name = "yansi" 807 | version = "0.5.1" 808 | source = "registry+https://github.com/rust-lang/crates.io-index" 809 | checksum = "09041cd90cf85f7f8b2df60c646f853b7f535ce68f85244eb6731cf89fa498ec" 810 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | authors = ["Anthony Miyaguchi "] 3 | description = "A tool to transpile JSON Schema into schemas for data processing" 4 | edition = "2021" 5 | license = "MPL-2.0" 6 | name = "jsonschema-transpiler" 7 | readme = "README.md" 8 | repository = "https://github.com/mozilla/jsonschema-transpiler" 9 | version = "2.0.0" 10 | 11 | [lib] 12 | name = "jst" 13 | 14 | [dependencies] 15 | clap = { version = "4.4", features = ["derive"] } 16 | env_logger = "0.10.0" 17 | heck = "0.4.1" 18 | lazy_static = {version = "1.3.0", optional = true} 19 | log = "0.4" 20 | maplit = "1.0.2" 21 | onig = {version = "6.4", optional = true} 22 | regex = "1.5.4" 23 | serde = {version = "1.0", features = ["derive"]} 24 | serde_json = "1.0" 25 | 26 | [build-dependencies] 27 | serde = {version = "1.0", features = ["derive"]} 28 | serde_json = "1.0" 29 | 30 | [dev-dependencies] 31 | pretty_assertions = "1.4.0" 32 | 33 | [features] 34 | oniguruma = ["onig", "lazy_static"] 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Mozilla Public License Version 2.0 2 | ================================== 3 | 4 | 1. Definitions 5 | -------------- 6 | 7 | 1.1. 
"Contributor" 8 | means each individual or legal entity that creates, contributes to 9 | the creation of, or owns Covered Software. 10 | 11 | 1.2. "Contributor Version" 12 | means the combination of the Contributions of others (if any) used 13 | by a Contributor and that particular Contributor's Contribution. 14 | 15 | 1.3. "Contribution" 16 | means Covered Software of a particular Contributor. 17 | 18 | 1.4. "Covered Software" 19 | means Source Code Form to which the initial Contributor has attached 20 | the notice in Exhibit A, the Executable Form of such Source Code 21 | Form, and Modifications of such Source Code Form, in each case 22 | including portions thereof. 23 | 24 | 1.5. "Incompatible With Secondary Licenses" 25 | means 26 | 27 | (a) that the initial Contributor has attached the notice described 28 | in Exhibit B to the Covered Software; or 29 | 30 | (b) that the Covered Software was made available under the terms of 31 | version 1.1 or earlier of the License, but not also under the 32 | terms of a Secondary License. 33 | 34 | 1.6. "Executable Form" 35 | means any form of the work other than Source Code Form. 36 | 37 | 1.7. "Larger Work" 38 | means a work that combines Covered Software with other material, in 39 | a separate file or files, that is not Covered Software. 40 | 41 | 1.8. "License" 42 | means this document. 43 | 44 | 1.9. "Licensable" 45 | means having the right to grant, to the maximum extent possible, 46 | whether at the time of the initial grant or subsequently, any and 47 | all of the rights conveyed by this License. 48 | 49 | 1.10. "Modifications" 50 | means any of the following: 51 | 52 | (a) any file in Source Code Form that results from an addition to, 53 | deletion from, or modification of the contents of Covered 54 | Software; or 55 | 56 | (b) any new file in Source Code Form that contains any Covered 57 | Software. 58 | 59 | 1.11. "Patent Claims" of a Contributor 60 | means any patent claim(s), including without limitation, method, 61 | process, and apparatus claims, in any patent Licensable by such 62 | Contributor that would be infringed, but for the grant of the 63 | License, by the making, using, selling, offering for sale, having 64 | made, import, or transfer of either its Contributions or its 65 | Contributor Version. 66 | 67 | 1.12. "Secondary License" 68 | means either the GNU General Public License, Version 2.0, the GNU 69 | Lesser General Public License, Version 2.1, the GNU Affero General 70 | Public License, Version 3.0, or any later versions of those 71 | licenses. 72 | 73 | 1.13. "Source Code Form" 74 | means the form of the work preferred for making modifications. 75 | 76 | 1.14. "You" (or "Your") 77 | means an individual or a legal entity exercising rights under this 78 | License. For legal entities, "You" includes any entity that 79 | controls, is controlled by, or is under common control with You. For 80 | purposes of this definition, "control" means (a) the power, direct 81 | or indirect, to cause the direction or management of such entity, 82 | whether by contract or otherwise, or (b) ownership of more than 83 | fifty percent (50%) of the outstanding shares or beneficial 84 | ownership of such entity. 85 | 86 | 2. License Grants and Conditions 87 | -------------------------------- 88 | 89 | 2.1. 
Grants 90 | 91 | Each Contributor hereby grants You a world-wide, royalty-free, 92 | non-exclusive license: 93 | 94 | (a) under intellectual property rights (other than patent or trademark) 95 | Licensable by such Contributor to use, reproduce, make available, 96 | modify, display, perform, distribute, and otherwise exploit its 97 | Contributions, either on an unmodified basis, with Modifications, or 98 | as part of a Larger Work; and 99 | 100 | (b) under Patent Claims of such Contributor to make, use, sell, offer 101 | for sale, have made, import, and otherwise transfer either its 102 | Contributions or its Contributor Version. 103 | 104 | 2.2. Effective Date 105 | 106 | The licenses granted in Section 2.1 with respect to any Contribution 107 | become effective for each Contribution on the date the Contributor first 108 | distributes such Contribution. 109 | 110 | 2.3. Limitations on Grant Scope 111 | 112 | The licenses granted in this Section 2 are the only rights granted under 113 | this License. No additional rights or licenses will be implied from the 114 | distribution or licensing of Covered Software under this License. 115 | Notwithstanding Section 2.1(b) above, no patent license is granted by a 116 | Contributor: 117 | 118 | (a) for any code that a Contributor has removed from Covered Software; 119 | or 120 | 121 | (b) for infringements caused by: (i) Your and any other third party's 122 | modifications of Covered Software, or (ii) the combination of its 123 | Contributions with other software (except as part of its Contributor 124 | Version); or 125 | 126 | (c) under Patent Claims infringed by Covered Software in the absence of 127 | its Contributions. 128 | 129 | This License does not grant any rights in the trademarks, service marks, 130 | or logos of any Contributor (except as may be necessary to comply with 131 | the notice requirements in Section 3.4). 132 | 133 | 2.4. Subsequent Licenses 134 | 135 | No Contributor makes additional grants as a result of Your choice to 136 | distribute the Covered Software under a subsequent version of this 137 | License (see Section 10.2) or under the terms of a Secondary License (if 138 | permitted under the terms of Section 3.3). 139 | 140 | 2.5. Representation 141 | 142 | Each Contributor represents that the Contributor believes its 143 | Contributions are its original creation(s) or it has sufficient rights 144 | to grant the rights to its Contributions conveyed by this License. 145 | 146 | 2.6. Fair Use 147 | 148 | This License is not intended to limit any rights You have under 149 | applicable copyright doctrines of fair use, fair dealing, or other 150 | equivalents. 151 | 152 | 2.7. Conditions 153 | 154 | Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted 155 | in Section 2.1. 156 | 157 | 3. Responsibilities 158 | ------------------- 159 | 160 | 3.1. Distribution of Source Form 161 | 162 | All distribution of Covered Software in Source Code Form, including any 163 | Modifications that You create or to which You contribute, must be under 164 | the terms of this License. You must inform recipients that the Source 165 | Code Form of the Covered Software is governed by the terms of this 166 | License, and how they can obtain a copy of this License. You may not 167 | attempt to alter or restrict the recipients' rights in the Source Code 168 | Form. 169 | 170 | 3.2. 
Distribution of Executable Form 171 | 172 | If You distribute Covered Software in Executable Form then: 173 | 174 | (a) such Covered Software must also be made available in Source Code 175 | Form, as described in Section 3.1, and You must inform recipients of 176 | the Executable Form how they can obtain a copy of such Source Code 177 | Form by reasonable means in a timely manner, at a charge no more 178 | than the cost of distribution to the recipient; and 179 | 180 | (b) You may distribute such Executable Form under the terms of this 181 | License, or sublicense it under different terms, provided that the 182 | license for the Executable Form does not attempt to limit or alter 183 | the recipients' rights in the Source Code Form under this License. 184 | 185 | 3.3. Distribution of a Larger Work 186 | 187 | You may create and distribute a Larger Work under terms of Your choice, 188 | provided that You also comply with the requirements of this License for 189 | the Covered Software. If the Larger Work is a combination of Covered 190 | Software with a work governed by one or more Secondary Licenses, and the 191 | Covered Software is not Incompatible With Secondary Licenses, this 192 | License permits You to additionally distribute such Covered Software 193 | under the terms of such Secondary License(s), so that the recipient of 194 | the Larger Work may, at their option, further distribute the Covered 195 | Software under the terms of either this License or such Secondary 196 | License(s). 197 | 198 | 3.4. Notices 199 | 200 | You may not remove or alter the substance of any license notices 201 | (including copyright notices, patent notices, disclaimers of warranty, 202 | or limitations of liability) contained within the Source Code Form of 203 | the Covered Software, except that You may alter any license notices to 204 | the extent required to remedy known factual inaccuracies. 205 | 206 | 3.5. Application of Additional Terms 207 | 208 | You may choose to offer, and to charge a fee for, warranty, support, 209 | indemnity or liability obligations to one or more recipients of Covered 210 | Software. However, You may do so only on Your own behalf, and not on 211 | behalf of any Contributor. You must make it absolutely clear that any 212 | such warranty, support, indemnity, or liability obligation is offered by 213 | You alone, and You hereby agree to indemnify every Contributor for any 214 | liability incurred by such Contributor as a result of warranty, support, 215 | indemnity or liability terms You offer. You may include additional 216 | disclaimers of warranty and limitations of liability specific to any 217 | jurisdiction. 218 | 219 | 4. Inability to Comply Due to Statute or Regulation 220 | --------------------------------------------------- 221 | 222 | If it is impossible for You to comply with any of the terms of this 223 | License with respect to some or all of the Covered Software due to 224 | statute, judicial order, or regulation then You must: (a) comply with 225 | the terms of this License to the maximum extent possible; and (b) 226 | describe the limitations and the code they affect. Such description must 227 | be placed in a text file included with all distributions of the Covered 228 | Software under this License. Except to the extent prohibited by statute 229 | or regulation, such description must be sufficiently detailed for a 230 | recipient of ordinary skill to be able to understand it. 231 | 232 | 5. Termination 233 | -------------- 234 | 235 | 5.1. 
The rights granted under this License will terminate automatically 236 | if You fail to comply with any of its terms. However, if You become 237 | compliant, then the rights granted under this License from a particular 238 | Contributor are reinstated (a) provisionally, unless and until such 239 | Contributor explicitly and finally terminates Your grants, and (b) on an 240 | ongoing basis, if such Contributor fails to notify You of the 241 | non-compliance by some reasonable means prior to 60 days after You have 242 | come back into compliance. Moreover, Your grants from a particular 243 | Contributor are reinstated on an ongoing basis if such Contributor 244 | notifies You of the non-compliance by some reasonable means, this is the 245 | first time You have received notice of non-compliance with this License 246 | from such Contributor, and You become compliant prior to 30 days after 247 | Your receipt of the notice. 248 | 249 | 5.2. If You initiate litigation against any entity by asserting a patent 250 | infringement claim (excluding declaratory judgment actions, 251 | counter-claims, and cross-claims) alleging that a Contributor Version 252 | directly or indirectly infringes any patent, then the rights granted to 253 | You by any and all Contributors for the Covered Software under Section 254 | 2.1 of this License shall terminate. 255 | 256 | 5.3. In the event of termination under Sections 5.1 or 5.2 above, all 257 | end user license agreements (excluding distributors and resellers) which 258 | have been validly granted by You or Your distributors under this License 259 | prior to termination shall survive termination. 260 | 261 | ************************************************************************ 262 | * * 263 | * 6. Disclaimer of Warranty * 264 | * ------------------------- * 265 | * * 266 | * Covered Software is provided under this License on an "as is" * 267 | * basis, without warranty of any kind, either expressed, implied, or * 268 | * statutory, including, without limitation, warranties that the * 269 | * Covered Software is free of defects, merchantable, fit for a * 270 | * particular purpose or non-infringing. The entire risk as to the * 271 | * quality and performance of the Covered Software is with You. * 272 | * Should any Covered Software prove defective in any respect, You * 273 | * (not any Contributor) assume the cost of any necessary servicing, * 274 | * repair, or correction. This disclaimer of warranty constitutes an * 275 | * essential part of this License. No use of any Covered Software is * 276 | * authorized under this License except under this disclaimer. * 277 | * * 278 | ************************************************************************ 279 | 280 | ************************************************************************ 281 | * * 282 | * 7. Limitation of Liability * 283 | * -------------------------- * 284 | * * 285 | * Under no circumstances and under no legal theory, whether tort * 286 | * (including negligence), contract, or otherwise, shall any * 287 | * Contributor, or anyone who distributes Covered Software as * 288 | * permitted above, be liable to You for any direct, indirect, * 289 | * special, incidental, or consequential damages of any character * 290 | * including, without limitation, damages for lost profits, loss of * 291 | * goodwill, work stoppage, computer failure or malfunction, or any * 292 | * and all other commercial damages or losses, even if such party * 293 | * shall have been informed of the possibility of such damages. 
This * 294 | * limitation of liability shall not apply to liability for death or * 295 | * personal injury resulting from such party's negligence to the * 296 | * extent applicable law prohibits such limitation. Some * 297 | * jurisdictions do not allow the exclusion or limitation of * 298 | * incidental or consequential damages, so this exclusion and * 299 | * limitation may not apply to You. * 300 | * * 301 | ************************************************************************ 302 | 303 | 8. Litigation 304 | ------------- 305 | 306 | Any litigation relating to this License may be brought only in the 307 | courts of a jurisdiction where the defendant maintains its principal 308 | place of business and such litigation shall be governed by laws of that 309 | jurisdiction, without reference to its conflict-of-law provisions. 310 | Nothing in this Section shall prevent a party's ability to bring 311 | cross-claims or counter-claims. 312 | 313 | 9. Miscellaneous 314 | ---------------- 315 | 316 | This License represents the complete agreement concerning the subject 317 | matter hereof. If any provision of this License is held to be 318 | unenforceable, such provision shall be reformed only to the extent 319 | necessary to make it enforceable. Any law or regulation which provides 320 | that the language of a contract shall be construed against the drafter 321 | shall not be used to construe this License against a Contributor. 322 | 323 | 10. Versions of the License 324 | --------------------------- 325 | 326 | 10.1. New Versions 327 | 328 | Mozilla Foundation is the license steward. Except as provided in Section 329 | 10.3, no one other than the license steward has the right to modify or 330 | publish new versions of this License. Each version will be given a 331 | distinguishing version number. 332 | 333 | 10.2. Effect of New Versions 334 | 335 | You may distribute the Covered Software under the terms of the version 336 | of the License under which You originally received the Covered Software, 337 | or under the terms of any subsequent version published by the license 338 | steward. 339 | 340 | 10.3. Modified Versions 341 | 342 | If you create software not governed by this License, and you want to 343 | create a new license for such software, you may create and use a 344 | modified version of this License if you rename the license and remove 345 | any references to the name of the license steward (except to note that 346 | such modified license differs from this License). 347 | 348 | 10.4. Distributing Source Code Form that is Incompatible With Secondary 349 | Licenses 350 | 351 | If You choose to distribute Source Code Form that is Incompatible With 352 | Secondary Licenses under the terms of this version of the License, the 353 | notice described in Exhibit B of this License must be attached. 354 | 355 | Exhibit A - Source Code Form License Notice 356 | ------------------------------------------- 357 | 358 | This Source Code Form is subject to the terms of the Mozilla Public 359 | License, v. 2.0. If a copy of the MPL was not distributed with this 360 | file, You can obtain one at http://mozilla.org/MPL/2.0/. 361 | 362 | If it is not possible or desirable to put the notice in a particular 363 | file, then You may include the notice in a location (such as a LICENSE 364 | file in a relevant directory) where a recipient would be likely to look 365 | for such a notice. 366 | 367 | You may add additional accurate notices of copyright ownership. 
368 | 
369 | Exhibit B - "Incompatible With Secondary Licenses" Notice
370 | ---------------------------------------------------------
371 | 
372 | This Source Code Form is "Incompatible With Secondary Licenses", as
373 | defined by the Mozilla Public License, v. 2.0.
374 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # jsonschema-transpiler
 2 | 
 3 | [![CircleCI](https://circleci.com/gh/mozilla/jsonschema-transpiler.svg?style=svg)](https://circleci.com/gh/mozilla/jsonschema-transpiler)
 4 | 
 5 | A tool for transpiling [JSON Schema](https://json-schema.org/) into schemas for
 6 | [Avro](https://avro.apache.org/docs/current/index.html#schemas) and
 7 | [BigQuery](https://cloud.google.com/bigquery/docs/schemas).
 8 | 
 9 | JSON Schema is primarily used to validate incoming data, but contains enough
10 | information to describe the structure of the data. The transpiler encodes the
11 | schema for use with data serialization and processing frameworks. The main
12 | use case is to enable ingestion of JSON documents into BigQuery through an Avro
13 | intermediary.
14 | 
15 | This tool can handle many of the composite types seen in modern data processing
16 | tools that support a SQL interface, such as lists, structures, key-value
17 | maps, and type variants.
18 | 
19 | This tool is designed for generating new schemas from
20 | [`mozilla-pipeline-schemas`](https://github.com/mozilla-services/mozilla-pipeline-schemas),
21 | the canonical source of truth for JSON schemas in the Firefox Data Platform.
22 | 
23 | ## Installation
24 | 
25 | ```bash
26 | cargo install jsonschema-transpiler
27 | ```
28 | 
29 | ## Usage
30 | 
31 | ```bash
32 | A tool to transpile JSON Schema into schemas for data processing
33 | 
34 | Usage: jsonschema-transpiler [OPTIONS] [FILE]
35 | 
36 | Arguments:
37 |   [FILE]
38 |           Sets the input file to use
39 | 
40 | Options:
41 |   -t, --type <TYPE>
42 |           The output schema format
43 | 
44 |           [default: avro]
45 | 
46 |           Possible values:
47 |           - avro:     Avro format
48 |           - bigquery: BigQuery format
49 | 
50 |   -r, --resolve <RESOLVE>
51 |           The resolution strategy for incompatible or under-specified schema
52 | 
53 |           [default: cast]
54 | 
55 |           Possible values:
56 |           - cast:  Cast incompatible/under-specified schemas
57 |           - panic: Panic on incompatible/under-specified schemas
58 |           - drop:  Drop incompatible/under-specified schemas
59 | 
60 |   -c, --normalize-case
61 |           snake_case column-names for consistent behavior between SQL engines
62 | 
63 |   -n, --force-nullable
64 |           Treats all columns as NULLABLE, ignoring the required section in the JSON Schema object
65 | 
66 |   --tuple-struct
67 |           Treats tuple validation as an anonymous struct
68 | 
69 |   -w, --allow-maps-without-value
70 |           Produces maps without a value field for incompatible or under-specified value schema
71 | 
72 |   -h, --help
73 |           Print help (see a summary with '-h')
74 | 
75 |   -V, --version
76 |           Print version
77 | ```
78 | 
79 | JSON Schemas can be read from stdin or from a file.
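For example, to read a schema from a file (a hypothetical `schema.json`) rather than piping it through stdin, pass the path as the positional argument:

```bash
jsonschema-transpiler --type bigquery schema.json
```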
80 | 
81 | ### Example usage
82 | 
83 | ```bash
84 | # An object with a single, optional boolean field
85 | $ schema='{"type": "object", "properties": {"foo": {"type": "boolean"}}}'
86 | 
87 | $ echo $schema | jq
88 | {
89 |   "type": "object",
90 |   "properties": {
91 |     "foo": {
92 |       "type": "boolean"
93 |     }
94 |   }
95 | }
96 | 
97 | $ echo $schema | jsonschema-transpiler --type avro
98 | {
99 |   "fields": [
100 |     {
101 |       "default": null,
102 |       "name": "foo",
103 |       "type": [
104 |         {
105 |           "type": "null"
106 |         },
107 |         {
108 |           "type": "boolean"
109 |         }
110 |       ]
111 |     }
112 |   ],
113 |   "name": "root",
114 |   "type": "record"
115 | }
116 | 
117 | $ echo $schema | jsonschema-transpiler --type bigquery
118 | [
119 |   {
120 |     "mode": "NULLABLE",
121 |     "name": "foo",
122 |     "type": "BOOL"
123 |   }
124 | ]
125 | ```
126 | 
127 | ## Building
128 | 
129 | To build and test the package:
130 | 
131 | ```bash
132 | cargo build
133 | cargo test
134 | ```
135 | 
136 | Older versions of the package (<= 1.9) relied on the use of oniguruma for
137 | performing snake-casing logic. To enable the use of this module, add a feature
138 | flag:
139 | 
140 | ```bash
141 | cargo test --features oniguruma
142 | ```
143 | 
144 | ## Contributing
145 | 
146 | Contributions are welcome. The API may change significantly, but the
147 | transformation between various source formats should remain consistent. To aid
148 | in the development of the transpiler, test cases are generated from a
149 | language-agnostic format under `tests/resources`; a complete example appears at the end of this README.
150 | 
151 | ```json
152 | {
153 |     "name": "test-suite",
154 |     "tests": [
155 |         {
156 |             "name": "test-case",
157 |             "description": [
158 |                 "A short description of the test case."
159 |             ],
160 |             "test": {
161 |                 "avro": {...},
162 |                 "bigquery": {...},
163 |                 "json": {...}
164 |             }
165 |         },
166 |         ...
167 |     ]
168 | }
169 | ```
170 | 
171 | Schemas provide a type system for data structures. Most schema languages support
172 | a similar set of primitives. There are atomic data types like booleans,
173 | integers, and floats. These atomic data types can form compound units of
174 | structure, such as objects, arrays, and maps. The absence of a value is usually
175 | denoted by a null type. There are type modifiers, like the union of two types.
176 | 
177 | The following schemas are currently supported:
178 | 
179 | - JSON Schema
180 | - Avro
181 | - BigQuery
182 | 
183 | In the future, it may be possible to support schemas from similar systems like
184 | Parquet and Spark, or into various interface definition languages (IDL) like
185 | Avro IDL.
186 | 
187 | ## Publishing
188 | 
189 | The jsonschema-transpiler is distributed as a crate via Cargo. Follow this
190 | checklist for deploying to [crates.io](https://crates.io/crates/jsonschema-transpiler).
191 | 
192 | 1. Bump the version number in the `Cargo.toml`, as per [Semantic Versioning](https://semver.org/).
193 | 2. Double-check that `cargo test` and CI succeed.
194 | 3. Run `cargo publish`. It must be run with the `--no-verify` flag due to issue #59.
195 | 4. Draft a new release in GitHub corresponding with the version bump.
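To make the test-case format described under Contributing concrete, here is a minimal suite sketch (the file and test names are hypothetical) assembled from the boolean example earlier in this README; the `json` entry is the input schema, and the `avro` and `bigquery` entries are the transpiler outputs shown above. Per `build.rs`, `compatible: true` marks the schema as unambiguous, so the generated test is not annotated with `#[should_panic]`.

```json
{
  "name": "boolean-example",
  "tests": [
    {
      "name": "test_optional_boolean",
      "description": ["An object with a single, optional boolean field."],
      "compatible": true,
      "test": {
        "json": {
          "type": "object",
          "properties": {"foo": {"type": "boolean"}}
        },
        "avro": {
          "name": "root",
          "type": "record",
          "fields": [
            {
              "default": null,
              "name": "foo",
              "type": [{"type": "null"}, {"type": "boolean"}]
            }
          ]
        },
        "bigquery": [{"mode": "NULLABLE", "name": "foo", "type": "BOOL"}]
      }
    }
  ]
}
```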
196 | 
--------------------------------------------------------------------------------
/build.rs:
--------------------------------------------------------------------------------
  1 | #[macro_use]
  2 | extern crate serde;
  3 | extern crate serde_json;
  4 | 
  5 | use serde_json::Value;
  6 | use std::env;
  7 | use std::ffi::OsString;
  8 | use std::fs::{self, File};
  9 | use std::io::{BufReader, Write};
 10 | 
 11 | #[derive(Serialize, Deserialize, Debug)]
 12 | struct TestData {
 13 |     avro: Value,
 14 |     bigquery: Value,
 15 |     json: Value,
 16 |     #[serde(default, skip_serializing_if = "Value::is_null")]
 17 |     context: Value,
 18 | }
 19 | 
 20 | #[derive(Serialize, Deserialize, Debug)]
 21 | struct TestCase {
 22 |     #[serde(default, skip_serializing_if = "Value::is_null")]
 23 |     description: Value,
 24 |     name: String,
 25 |     // True if the schema does not involve ambiguous sections
 26 |     #[serde(default)]
 27 |     compatible: bool,
 28 |     test: TestData,
 29 | }
 30 | 
 31 | #[derive(Serialize, Deserialize, Debug)]
 32 | struct TestSuite {
 33 |     #[serde(default, skip_serializing_if = "Value::is_null")]
 34 |     description: Value,
 35 |     name: String,
 36 |     tests: Vec<TestCase>,
 37 | }
 38 | 
 39 | const TRUTHY_ENV_VALUES: [&str; 5] = ["y", "yes", "t", "true", "1"];
 40 | 
 41 | fn get_env_var_as_bool(var: &str, default: bool) -> bool {
 42 |     match env::var(var) {
 43 |         Ok(val) => TRUTHY_ENV_VALUES.contains(&val.to_lowercase().as_ref()),
 44 |         _ => default,
 45 |     }
 46 | }
 47 | 
 48 | fn format_json(obj: Value) -> String {
 49 |     let pretty = serde_json::to_string_pretty(&obj).unwrap();
 50 |     // 4 spaces
 51 |     pretty.replace('\n', "\n    ")
 52 | }
 53 | 
 54 | fn write_backup(path: &std::path::PathBuf) {
 55 |     let mut backup = path.to_path_buf();
 56 |     let mut extension = OsString::new();
 57 |     if let Some(s) = backup.extension() {
 58 |         extension.push(s);
 59 |         extension.push(".");
 60 |     };
 61 |     extension.push("bak");
 62 |     backup.set_extension(extension);
 63 |     println!("Backing up: {:?} -> {:?}", path, backup);
 64 |     fs::copy(path, backup).unwrap();
 65 | }
 66 | 
 67 | fn write_formatted_test(path: &std::path::PathBuf, suite: &TestSuite) {
 68 |     println!("Formatting test: {:?}", path);
 69 |     let formatted = serde_json::to_string_pretty(suite).unwrap();
 70 |     let fp_write = File::create(path).unwrap();
 71 |     writeln!(&fp_write, "{}", formatted).unwrap()
 72 | }
 73 | 
 74 | fn write_avro_tests(mut outfile: &File, suite: &TestSuite) {
 75 |     for case in &suite.tests {
 76 |         let formatted = format!(
 77 |             r##"
 78 | #[test]{should_panic}
 79 | fn avro_{name}() {{
 80 |     let input_data = r#"
 81 | {input_data}
 82 |     "#;
 83 |     let expected_data = r#"
 84 | {expected}
 85 |     "#;
 86 |     let mut context = Context {{
 87 |         ..Default::default()
 88 |     }};
 89 |     let input: Value = serde_json::from_str(input_data).unwrap();
 90 |     let expected: Value = serde_json::from_str(expected_data).unwrap();
 91 |     if expected.is_null() {{
 92 |         // No expected data = no avro support
 93 |         return;
 94 |     }}
 95 | 
 96 |     assert_eq!(expected, convert_avro(&input, context.clone()));
 97 | 
 98 |     context.resolve_method = ResolveMethod::Panic;
 99 |     convert_avro(&input, context);
100 | }}
101 | "##,
102 |             name = case.name,
103 |             should_panic = if case.compatible {
104 |                 ""
105 |             } else {
106 |                 "\n#[should_panic]"
107 |             },
108 |             input_data = format_json(case.test.json.clone()),
109 |             expected = format_json(case.test.avro.clone()),
110 |         );
111 |         write!(outfile, "{}", formatted).unwrap()
112 |     }
113 | }
114 | 
115 | fn write_bigquery_tests(mut outfile: &File, suite: &TestSuite) {
116 |     for case in &suite.tests {
117 |         let formatted = format!(
118 |             r##"
119 | #[test]{should_panic}
120 | fn bigquery_{name}() {{
121 |     let input_data = r#"
122 | {input_data}
123 |     "#;
124 |     let expected_data = r#"
125 | {expected}
126 |     "#;
127 |     let context_data = r#"
128 | {context}
129 |     "#;
130 |     let context: Value = serde_json::from_str(context_data).unwrap();
131 |     let mut context: Context = if context.is_null() {{
132 |         Default::default()
133 |     }} else {{
134 |         serde_json::from_value(context).unwrap()
135 |     }};
136 |     let input: Value = serde_json::from_str(input_data).unwrap();
137 |     let expected: Value = serde_json::from_str(expected_data).unwrap();
138 |     assert_eq!(expected, convert_bigquery(&input, context.clone()));
139 | 
140 |     context.resolve_method = ResolveMethod::Panic;
141 |     convert_bigquery(&input, context);
142 | }}
143 | "##,
144 |             name = case.name,
145 |             should_panic = if case.compatible {
146 |                 ""
147 |             } else {
148 |                 "\n#[should_panic]"
149 |             },
150 |             input_data = format_json(case.test.json.clone()),
151 |             expected = format_json(case.test.bigquery.clone()),
152 |             context = format_json(case.test.context.clone()),
153 |         );
154 |         write!(outfile, "{}", formatted).unwrap()
155 |     }
156 | }
157 | 
158 | fn main() {
159 |     let test_cases = "tests/resources/translate";
160 |     let mut avro_fp = File::create("tests/transpile_avro.rs").unwrap();
161 |     let mut bq_fp = File::create("tests/transpile_bigquery.rs").unwrap();
162 |     let format_tests = get_env_var_as_bool("FORMAT_TESTS", true);
163 |     let backup = get_env_var_as_bool("FORMAT_TESTS_BACKUP", false);
164 | 
165 |     write!(
166 |         avro_fp,
167 |         r#"use jst::convert_avro;
168 | use jst::{{Context, ResolveMethod}};
169 | use pretty_assertions::assert_eq;
170 | use serde_json::Value;
171 | "#
172 |     )
173 |     .unwrap();
174 | 
175 |     write!(
176 |         bq_fp,
177 |         r#"use jst::convert_bigquery;
178 | use jst::{{Context, ResolveMethod}};
179 | use pretty_assertions::assert_eq;
180 | use serde_json::Value;
181 | "#
182 |     )
183 |     .unwrap();
184 | 
185 |     let mut paths: Vec<_> = fs::read_dir(test_cases)
186 |         .unwrap()
187 |         .map(|e| e.unwrap().path())
188 |         .filter(|e| match e.file_name() {
189 |             Some(os_str) => !os_str.to_str().unwrap().starts_with('.'),
190 |             None => false,
191 |         })
192 |         .collect();
193 |     paths.sort();
194 |     for path in paths {
195 |         println!("Test file: {:?}", path);
196 |         let file = File::open(&path).unwrap();
197 |         let reader = BufReader::new(file);
198 |         let suite: TestSuite = serde_json::from_reader(reader).unwrap();
199 |         write_avro_tests(&avro_fp, &suite);
200 |         write_bigquery_tests(&bq_fp, &suite);
201 |         if backup {
202 |             write_backup(&path);
203 |         }
204 |         if format_tests {
205 |             write_formatted_test(&path, &suite);
206 |         }
207 |     }
208 | }
--------------------------------------------------------------------------------
/docs/development.md:
--------------------------------------------------------------------------------
 1 | # Development Notes
 2 | 
 3 | This section contains miscellaneous notes around the development of the
 4 | transpiler.
 5 | 
 6 | 
 7 | ### Representation of schemas
 8 | Currently, schemas are deserialized directly from their JSON counterparts into
 9 | Rust structs and enums using `serde_json`. Enums in Rust are similar to algebraic
10 | data types in functional languages and support robust pattern matching. As such,
11 | a common pattern is to abstract a schema into a type and a tag.
12 | 
13 | The type forms a set of symbols and the rules for producing a sequence of those
14 | symbols. A simple type could be defined as follows:
A simple type could be defined as follows:
15 | 
16 | ```rust
17 | enum Atom {
18 |     Boolean,
19 |     Integer
20 | }
21 | 
22 | enum Type {
23 |     Null,
24 |     Atom(Atom),
25 |     List(Vec<Type>)
26 | }
27 | 
28 | // [null, true, [null, -1]]
29 | let root = Type::List(vec![
30 |     Type::Null,
31 |     Type::Atom(Atom::Boolean),
32 |     Type::List(vec![
33 |         Type::Null,
34 |         Type::Atom(Atom::Integer)
35 |     ])
36 | ]);
37 | ```
38 | 
39 | While it is possible to generate a schema for a document tree where the ordering
40 | of elements is fixed (by traversing the tree top-down, left-right), schema
41 | validators often assert other properties about the data structure. We may be
42 | interested in asserting the existence of names in a document; to support naming,
43 | we associate each type with a tag.
44 | 
45 | A tag is attribute data associated with a type. A tag is used as a proxy in the
46 | recursive definition of a type. Traversing a schema can be done by iterating
47 | through all of the tags in order. Tags may also reference other parts of the
48 | tree, which would typically not be possible by directly defining a recursive
49 | enum.
50 | 
51 | 
52 | ```rust
53 | enum Type {
54 |     Atom,
55 |     List(Vec<Tag>)
56 | }
57 | 
58 | struct Tag {
59 |     dtype: Type,
60 |     name: String
61 | }
62 | 
63 | let root = Tag {
64 |     dtype: Type::List(vec![
65 |         Tag { dtype: Type::Atom, name: "foo".into() },
66 |         Tag { dtype: Type::Atom, name: "bar".into() },
67 |     ]),
68 |     name: "object".into()
69 | };
70 | ```
71 | 
72 | By annotating this with the appropriate `serde` attributes, we are able to obtain
73 | the following schema for free:
74 | 
75 | ```json
76 | {
77 |   "name": "object",
78 |   "type": [
79 |     {"name": "foo", "type": "atom"},
80 |     {"name": "bar", "type": "atom"}
81 |   ]
82 | }
83 | ``` -------------------------------------------------------------------------------- /docs/examples.md: --------------------------------------------------------------------------------
1 | # Examples
2 | 
3 | This section contains an extended set of examples that you may modify to
4 | understand the differences between different schema formats.
5 | 
6 | ### `atom.json`
7 | 
8 | ```json
9 | {
10 |   "type": "object",
11 |   "properties": {
12 |     "flag": {
13 |       "type": "boolean"
14 |     }
15 |   },
16 |   "required": ["flag"]
17 | }
18 | ```
19 | 
20 | ### `atom-nullable.json`
21 | 
22 | ```json
23 | {
24 |   "type": "object",
25 |   "properties": {
26 |     "flag": {
27 |       "type": "boolean"
28 |     }
29 |   }
30 | }
31 | ```
32 | 
33 | ### `list.json`
34 | 
35 | ```json
36 | {
37 |   "type": "object",
38 |   "properties": {
39 |     "feature-vector": {
40 |       "type": "array",
41 |       "items": {
42 |         "type": "integer"
43 |       }
44 |     }
45 |   },
46 |   "required": [
47 |     "feature-vector"
48 |   ]
49 | }
50 | ```
51 | 
52 | ### `map.json`
53 | 
54 | ```json
55 | {
56 |   "type": "object",
57 |   "properties": {
58 |     "histogram": {
59 |       "type": "object",
60 |       "additionalProperties": {
61 |         "type": "integer"
62 |       }
63 |     }
64 |   },
65 |   "required": [
66 |     "histogram"
67 |   ]
68 | }
69 | ```
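70 | 
71 | ### Transpiled `map.json`
72 | 
73 | For orientation, this is roughly what the transpiler emits for the `map.json`
74 | example above with default settings, e.g. after saving the example to a file
75 | named `map.json` and running `cargo run -- --type avro map.json`. The exact
76 | output may vary across versions and with flags such as `--force-nullable` or
77 | `--normalize-case`.
78 | 
79 | Avro:
80 | 
81 | ```json
82 | {
83 |   "fields": [
84 |     {
85 |       "name": "histogram",
86 |       "type": {
87 |         "type": "map",
88 |         "values": {
89 |           "type": "long"
90 |         }
91 |       }
92 |     }
93 |   ],
94 |   "name": "root",
95 |   "type": "record"
96 | }
97 | ```
98 | 
99 | BigQuery (`--type bigquery`), where the map becomes a repeated key/value record:
100 | 
101 | ```json
102 | [
103 |   {
104 |     "fields": [
105 |       {
106 |         "mode": "REQUIRED",
107 |         "name": "key",
108 |         "type": "STRING"
109 |       },
110 |       {
111 |         "mode": "REQUIRED",
112 |         "name": "value",
113 |         "type": "INT64"
114 |       }
115 |     ],
116 |     "mode": "REPEATED",
117 |     "name": "histogram",
118 |     "type": "RECORD"
119 |   }
120 | ]
121 | ```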
122 | -------------------------------------------------------------------------------- /scripts/README.md: --------------------------------------------------------------------------------
1 | # Scripts
2 | 
3 | This directory contains miscellaneous scripts that supplement the development of
4 | this repository.
5 | 
6 | ## Integration Scripts
7 | 
8 | The jsonschema-transpiler was developed for the GCP-ingestion pipeline that
9 | ingests Firefox Telemetry directly into BigQuery. Because of the complex
10 | structure of the documents, the JSON payloads cannot be loaded directly into a
11 | table. Instead, documents are first decoded into Avro using generated schemas to
12 | guide serialization. This handles renaming columns and disambiguating structures
13 | like objects from maps.
14 | 
15 | The mozilla-pipeline-schemas (mps) repo is the canonical source of schemas for
16 | the Mozilla data platform. These JSON Schemas are used by the ingestion pipeline
17 | to validate incoming data for correctness. These scripts generate the necessary
18 | Avro schemas, download sampled pipeline data, and load the data into a GCP
19 | project. This allows manual inspection of the data.
20 | 
21 | You will need valid AWS credentials for accessing `s3://telemetry-parquet`, as
22 | well as credentials for uploading to a new GCP project. Some scripts require
23 | modification, so use at your own risk.
24 | 
25 | Run these scripts from the root of the project.
26 | 
27 | ```bash
28 | 
29 | # Check that the correct tools are installed
30 | $ python3 --version
31 | Python 3.7.2
32 | 
33 | $ gsutil --version
34 | gsutil version: 4.37
35 | 
36 | $ bq version
37 | This is BigQuery CLI 2.0.42
38 | 
39 | # Test AWS credentials
40 | $ aws s3 ls s3://telemetry-parquet
41 | 
42 | # Install python dependencies
43 | $ pip3 install --user avro-python3 python-rapidjson boto3
44 | 
45 | # Generates a folder schemas/
46 | $ ./scripts/mps-download-schemas.sh
47 | 
48 | # Generates a folder data/
49 | $ ./scripts/mps-download-sampled-data.py
50 | 
51 | # Generates a folder avro/
52 | $ ./scripts/mps-generate-schemas.sh
53 | 
54 | # Alternatively, specify a folder and pass flags
55 | $ ./scripts/mps-generate-schemas.sh \
56 |     bq_schemas \
57 |     --type bigquery \
58 |     --resolve drop \
59 |     --normalize-case
60 | 
61 | # Generates a folder avro-data/
62 | $ ./scripts/mps-generate-avro-data.sh
63 | 
64 | # Uploads data to a GCP project: gs://test-avro-ingest/data/
65 | # Creates multiple BigQuery datasets and tables
66 | $ ./scripts/mps-load-avro-bq.sh
67 | ``` -------------------------------------------------------------------------------- /scripts/mps-download-sampled-data.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | 
3 | import base64
4 | import logging
5 | import json
6 | import os
7 | 
8 | import boto3
9 | 
10 | # python-rapidjson
11 | import rapidjson
12 | 
13 | 
14 | def parse_schema_name(path):
15 |     """Given a directory path to a json schema in the mps directory, generate
16 |     the fully qualified name in the form `{namespace}.{doctype}.{docver}`."""
17 |     elements = path.split("/")
18 |     doctype, docver = elements[-1].split(".")[:-2]
19 |     namespace = elements[-3]
20 |     return f"{namespace}.{doctype}.{docver}"
21 | 
22 | 
23 | def load_schemas(path):
24 |     """Return a dictionary mapping "{namespace}.{doctype}.{docver}" to a validator."""
25 |     schemas = {}
26 |     for root, _, files in os.walk(path):
27 |         for name in files:
28 |             if name.endswith(".schema.json"):
29 |                 schemafile = os.path.join(root, name)
30 |                 name = parse_schema_name(schemafile)
31 |                 with open(schemafile, "r") as f:
32 |                     schemas[name] = rapidjson.Validator(f.read())
33 |     return schemas
34 | 
35 | 
36 | def get_schema_name(key):
37 |     # Example:
38 |     # sanitized-landfill-sample/v3/submission_date_s3=20190308/namespace=webpagetest/doc_type=webpagetest-run/doc_version=1/part-00122-tid-2954272513278013416-c06a39af-9979-41a5-8459-76412a4554b3-650.c000.json
39 |     params = dict([x.split("=") for x
in key.split("/") if "=" in x]) 40 | return ".".join(map(params.get, ["namespace", "doc_type", "doc_version"])) 41 | 42 | 43 | if __name__ == "__main__": 44 | logging.basicConfig(level=logging.INFO) 45 | 46 | root = os.path.join(os.path.dirname(__file__), "..") 47 | os.chdir(root) 48 | 49 | # current directory 50 | schemas = load_schemas("schemas") 51 | 52 | output_folder = "data" 53 | if not os.path.exists(output_folder): 54 | os.mkdir(output_folder) 55 | 56 | bucket = "telemetry-parquet" 57 | prefix = "sanitized-landfill-sample/v3/submission_date_s3=20190310" 58 | s3 = boto3.client("s3") 59 | 60 | objs = s3.list_objects(Bucket=bucket, Prefix=prefix) 61 | keys = [obj["Key"] for obj in objs["Contents"] if obj["Key"].endswith(".json")] 62 | 63 | for key in keys: 64 | schema_name = get_schema_name(key) 65 | if not schema_name in schemas: 66 | logging.info(f"schema does not exist: {schema_name}") 67 | continue 68 | 69 | data = ( 70 | s3.get_object(Bucket=bucket, Key=key)["Body"].read().decode("utf-8").strip() 71 | ) 72 | lines = data.split("\n") 73 | 74 | with open(f"{output_folder}/{schema_name}.ndjson", "w") as fp: 75 | errors = 0 76 | for line in lines: 77 | # each of the lines contains metadata with a content field 78 | content = json.loads(line).get("content") 79 | try: 80 | schemas[schema_name](content) 81 | except ValueError: 82 | errors += 1 83 | continue 84 | fp.write(json.dumps(json.loads(content)) + "\n") 85 | logging.info( 86 | f"wrote {len(lines)-errors}, skipped {errors} documents: {schema_name}" 87 | ) 88 | logging.info("Done!") 89 | -------------------------------------------------------------------------------- /scripts/mps-download-schemas.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Download production mozilla-pipeline-schemas into a schema folder 3 | 4 | cd "$(dirname "$0")/.." 
|| exit 5 | curl -o schemas.tar.gz -L https://github.com/mozilla-services/mozilla-pipeline-schemas/archive/master.tar.gz 6 | tar --strip-components=1 -xvf schemas.tar.gz mozilla-pipeline-schemas-master/schemas 7 | rm schemas.tar.gz 8 | -------------------------------------------------------------------------------- /scripts/mps-generate-avro-data-helper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import io 3 | import json 4 | import os 5 | import sys 6 | 7 | import avro.datafile 8 | import avro.io 9 | import avro.schema 10 | from fastavro import parse_schema, validation 11 | 12 | if len(sys.argv) > 1: 13 | # formatted as {namespace}.{doctype}.{docver} 14 | document = sys.argv[1] 15 | else: 16 | sys.exit("Error: missing argument for document") 17 | 18 | assert os.path.isdir("data") 19 | assert any( 20 | [document in name for name in os.listdir("data")] 21 | ), f"{document} not found in data" 22 | assert any( 23 | [document in name for name in os.listdir("avro")] 24 | ), f"{document} not found in avro schemas" 25 | 26 | 27 | def format_key(key): 28 | if not key: 29 | raise ValueError("empty key not allowed") 30 | key = key.replace("-", "_").replace(".", "_") 31 | if key[0].isdigit(): 32 | key = "_" + key 33 | return key 34 | 35 | 36 | def convert(data, schema): 37 | if schema.type == "string": 38 | if not isinstance(data, str): 39 | return json.dumps(data) 40 | 41 | if schema.type == "record": 42 | # iterate over all keys 43 | out = {} 44 | if not data: 45 | return out 46 | 47 | # convert a nested 48 | if isinstance(data, list) and set(schema.field_map.keys()) == {"list"}: 49 | data = {"list": data} 50 | # cast tuple into an object before continuing 51 | if isinstance(data, list): 52 | data = {f"f{i}_": v for i, v in enumerate(data)} 53 | for key, value in data.items(): 54 | # apply the appropriate transformations on the key 55 | key = format_key(key) 56 | field = schema.field_map.get(key) 57 | if not field: 58 | continue 59 | out[key] = convert(value, field.type) 60 | return out 61 | 62 | if schema.type == "union": 63 | for sub in schema.schemas: 64 | if sub.type == "null": 65 | continue 66 | out = convert(data, sub) 67 | return out 68 | 69 | if schema.type == "array": 70 | out = [] 71 | if not data: 72 | return out 73 | for item in data: 74 | out.append(convert(item, schema.items)) 75 | return out 76 | 77 | if schema.type == "map": 78 | out = {} 79 | for key, value in data.items(): 80 | out[key] = convert(value, schema.values) 81 | return out 82 | 83 | # terminal node, do nothing 84 | return data 85 | 86 | 87 | outdir = "avro-data" 88 | if not os.path.exists(outdir): 89 | os.makedirs(outdir) 90 | 91 | with open(f"avro/{document}.schema.json", "r") as f: 92 | schema_data = f.read() 93 | schema = avro.schema.Parse(schema_data) 94 | 95 | outfile = open(f"{outdir}/{document}.avro", "wb") 96 | writer = avro.datafile.DataFileWriter(outfile, avro.io.DatumWriter(), schema) 97 | 98 | with open(f"data/{document}.ndjson", "r") as f: 99 | data = f.readlines() 100 | 101 | try: 102 | orig = None 103 | for line in data: 104 | orig = json.loads(line) 105 | out = convert(orig, schema) 106 | writer.append(out) 107 | except: 108 | with open("test.json", "w") as f: 109 | json.dump(orig, f) 110 | with open("test-schema.json", "w") as f: 111 | json.dump(schema.to_json(), f, indent=2) 112 | validation.validate(out, parse_schema(schema.to_json())) 113 | 114 | writer.close() 115 | 
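# Example invocation, assuming data/ and avro/ exist (see mps-download-sampled-data.py
116 | # and mps-generate-schemas.sh); documents are named {namespace}.{doctype}.{docver}:
117 | #   python3 scripts/mps-generate-avro-data-helper.py telemetry.main.4
118 | 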
-------------------------------------------------------------------------------- /scripts/mps-generate-avro-data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd "$(dirname "$0")/.." || exit 4 | 5 | documents=$(ls data | sed 's/.ndjson//') 6 | 7 | total=0 8 | failed=0 9 | for document in $documents; do 10 | if ! python3 scripts/mps-generate-avro-data-helper.py $document; then 11 | echo "failed to write $document" 12 | rm "avro-data/$document.avro" 13 | ((failed++)) 14 | fi 15 | ((total++)) 16 | done 17 | 18 | echo "$((total - failed))/$total succeeded" -------------------------------------------------------------------------------- /scripts/mps-generate-schemas.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Test the jsonschema transpiler against documents in mozilla-pipeline-schemas. 3 | 4 | cd "$(dirname "$0")/.." || exit 5 | 6 | if [[ ! -d "schemas/" ]]; then 7 | echo "Run scripts/mps-download-schemas.sh to retrieve schemas" 8 | exit 1 9 | fi 10 | 11 | cargo build 12 | bin="target/debug/jsonschema-transpiler" 13 | 14 | schemas=$(find schemas/ -name "*.schema.json") 15 | 16 | # create a new folder for avro schemas 17 | outdir=${1:-"avro"} 18 | if [[ -d $outdir ]]; then 19 | rm -r $outdir 20 | fi 21 | shift; 22 | 23 | mkdir $outdir 24 | 25 | total=0 26 | failed=0 27 | for schema in $schemas; do 28 | namespace=$(basename $(dirname $(dirname $schema))) 29 | schema_filename=$(basename $schema) 30 | outfile="$outdir/$namespace.$schema_filename" 31 | 32 | if ! $bin "$@" "$schema" > $outfile; then 33 | echo "Failed on $schema" 34 | rm $outfile 35 | ((failed++)) 36 | fi 37 | ((total++)) 38 | done 39 | 40 | echo "$((total - failed))/$total succeeded" 41 | -------------------------------------------------------------------------------- /scripts/mps-load-avro-bq.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd "$(dirname "$0")/.." || exit 4 | 5 | project_id=$(gcloud config get-value project) 6 | dataset_id="test_avro" 7 | 8 | gsutil -m cp avro-data/* gs://${project_id}/data 9 | bq rm -rf $dataset_id 10 | bq mk $dataset_id 11 | 12 | total=0 13 | error=0 14 | skip=0 15 | 16 | trap "exit" INT 17 | for document in $(ls avro-data | sed 's/.avro//'); do 18 | # downcase hyphens to underscores before generating names 19 | bq_document=$(echo $document | sed 's/-/_/g') 20 | namespace=$(echo $bq_document | cut -d. -f1) 21 | doctype=$(echo $bq_document | cut -d. -f2) 22 | docver=$(echo $bq_document | cut -d. -f3) 23 | 24 | table_exists=$(bq ls ${dataset_id} | grep ${namespace}__${doctype}_v${docver}) 25 | 26 | if [[ ! -z ${SKIP_EXISTING+x} ]] && [[ ! -z ${table_exists} ]]; then 27 | echo "skipping bq load for ${document}" 28 | ((skip++)) 29 | continue 30 | fi 31 | 32 | echo "running bq load for ${document}" 33 | bq load --source_format=AVRO \ 34 | --replace \ 35 | ${dataset_id}.${namespace}__${doctype}_v${docver} \ 36 | gs://${project_id}/data/${document}.avro 37 | 38 | if [[ $? 
-ne 0 ]]; then
39 |         ((error++))
40 |     fi
41 |     ((total++))
42 | done
43 | 
44 | echo "$((total-error))/$total loaded successfully, $skip skipped" -------------------------------------------------------------------------------- /scripts/mps-validate-avro-schemas.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | 
3 | """A one-off script to test whether mozilla-pipeline-schemas are generated
4 | as valid avro schemas. This script requires avro to be installed, as well
5 | as running the `mps-download-schemas.sh` script."""
6 | 
7 | import json
8 | import logging
9 | import os
10 | import subprocess
11 | import avro.schema
12 | 
13 | 
14 | ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
15 | BIN = "target/debug/jsonschema-transpiler"
16 | 
17 | 
18 | def build():
19 |     """Initialize a build of the transpiler."""
20 |     subprocess.call(["cargo", "build"])
21 | 
22 | 
23 | def avro_schema(path):
24 |     """Return an avro schema in json if valid, None otherwise."""
25 |     schema = None
26 |     try:
27 |         data = subprocess.check_output(
28 |             [BIN, "--type", "avro", path], stderr=subprocess.DEVNULL
29 |         )
30 |         schema = json.loads(data)
31 |     except subprocess.CalledProcessError:
32 |         pass
33 |     return schema
34 | 
35 | 
36 | def parse_schema_name(path):
37 |     """Given a directory path to a json schema in the mps directory, generate
38 |     the fully qualified name in the form `{namespace}.{doctype}.{docver}`."""
39 |     elements = path.split("/")
40 |     doctype, docver = elements[-1].split(".")[:-2]
41 |     namespace = elements[-3]
42 |     return f"{namespace}.{doctype}.{docver}"
43 | 
44 | 
45 | def test_documents(mps_path):
46 |     """Walk the schemas directory, generate the document, and parse it."""
47 |     total = 0
48 |     error = 0
49 |     skipped = 0
50 |     for root, _, files in os.walk(mps_path):
51 |         for name in files:
52 |             if name.endswith(".schema.json"):
53 |                 path = os.path.join(root, name)
54 |                 schema_name = parse_schema_name(path)
55 |                 schema = avro_schema(path)
56 |                 if not schema:
57 |                     logging.info(f"failed to convert {schema_name}")
58 |                     skipped += 1
59 |                     continue
60 |                 total += 1
61 |                 try:
62 |                     avro.schema.Parse(json.dumps(schema))
63 |                 except Exception as e:
64 |                     logging.info(f"failed to parse {schema_name}")
65 |                     error += 1
66 |                     logging.debug(json.dumps(schema))
67 |                     logging.exception(e)
68 | 
69 |     logging.info(f"{error}/{total} parsing errors, {skipped} skipped")
70 | 
71 | 
72 | if __name__ == "__main__":
73 |     logging.basicConfig(level=logging.DEBUG)
74 |     build()
75 |     test_documents(os.path.join(ROOT, "schemas"))
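76 | # Example run, assuming schemas/ was populated by scripts/mps-download-schemas.sh:
77 | #   python3 scripts/mps-validate-avro-schemas.py 2> validation.log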
78 | -------------------------------------------------------------------------------- /scripts/mps-verify-nested-list.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | cd "$(dirname "$0")/.."
4 | 
5 | datadir=$(mktemp -d -t tmp.XXXXXXXXXX)
6 | function cleanup {
7 |     echo "Running cleanup!"
8 |     rm -rf "$datadir"
9 | }
10 | trap cleanup EXIT
11 | 
12 | scripts/mps-download-schemas.sh
13 | 
14 | avro_no_tuple_control=$datadir/avro-no-tuple-control
15 | avro_tuple_control=$datadir/avro-tuple-control
16 | avro_no_tuple=$datadir/avro-no-tuple
17 | avro_tuple=$datadir/avro-tuple
18 | 
19 | bq_no_tuple_control=$datadir/bq-no-tuple-control
20 | bq_tuple_control=$datadir/bq-tuple-control
21 | bq_no_tuple=$datadir/bq-no-tuple
22 | bq_tuple=$datadir/bq-tuple
23 | 
24 | 
25 | # get control values
26 | git checkout v1.5.0
27 | 
28 | scripts/mps-generate-schemas.sh $avro_no_tuple_control --type avro --resolve drop
29 | scripts/mps-generate-schemas.sh $avro_tuple_control --type avro --resolve drop --tuple-struct
30 | scripts/mps-generate-schemas.sh $bq_no_tuple_control --type bigquery --resolve drop
31 | scripts/mps-generate-schemas.sh $bq_tuple_control --type bigquery --resolve drop --tuple-struct
32 | 
33 | git checkout -
34 | 
35 | # get values for tuple/no-tuple
36 | scripts/mps-generate-schemas.sh $avro_no_tuple --type avro --resolve drop
37 | scripts/mps-generate-schemas.sh $avro_tuple --type avro --resolve drop --tuple-struct
38 | scripts/mps-generate-schemas.sh $bq_no_tuple --type bigquery --resolve drop
39 | scripts/mps-generate-schemas.sh $bq_tuple --type bigquery --resolve drop --tuple-struct
40 | 
41 | outdir="test_nested_list_results"
42 | mkdir -p $outdir
43 | 
44 | diff -r $avro_no_tuple_control $avro_no_tuple > $outdir/avro-no-tuple.diff
45 | diff -r $avro_tuple_control $avro_tuple > $outdir/avro-tuple.diff
46 | diff -r $bq_no_tuple_control $bq_no_tuple > $outdir/bq-no-tuple.diff
47 | diff -r $bq_tuple_control $bq_tuple > $outdir/bq-tuple.diff
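48 | 
49 | # Each diff compares current output against the v1.5.0 control; an empty diff
50 | # file means nested-list handling is unchanged for that format and flag combination.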
51 | -------------------------------------------------------------------------------- /scripts/mps-verify-tuple-struct.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | cd "$(dirname "$0")/.."
4 | 
5 | datadir=$(mktemp -d -t tmp.XXXXXXXXXX)
6 | function cleanup {
7 |     echo "Running cleanup!"
8 |     rm -rf "$datadir"
9 | }
10 | trap cleanup EXIT
11 | 
12 | scripts/mps-download-schemas.sh
13 | 
14 | avro_control=$datadir/avro-control
15 | avro_no_tuple=$datadir/avro-no-tuple
16 | avro_tuple=$datadir/avro-tuple
17 | 
18 | bq_control=$datadir/bq-control
19 | bq_no_tuple=$datadir/bq-no-tuple
20 | bq_tuple=$datadir/bq-tuple
21 | 
22 | 
23 | # get control values
24 | git checkout v1.4.1
25 | 
26 | scripts/mps-generate-schemas.sh $avro_control --type avro --resolve drop
27 | scripts/mps-generate-schemas.sh $bq_control --type bigquery --resolve drop
28 | 
29 | git checkout -
30 | 
31 | # get values for tuple/no-tuple
32 | scripts/mps-generate-schemas.sh $avro_no_tuple --type avro --resolve drop
33 | scripts/mps-generate-schemas.sh $avro_tuple --type avro --resolve drop --tuple-struct
34 | scripts/mps-generate-schemas.sh $bq_no_tuple --type bigquery --resolve drop
35 | scripts/mps-generate-schemas.sh $bq_tuple --type bigquery --resolve drop --tuple-struct
36 | 
37 | outdir="test_tuple_results"
38 | mkdir -p $outdir
39 | 
40 | diff -r $avro_control $avro_no_tuple > $outdir/avro-no-tuple.diff
41 | diff -r $bq_control $bq_no_tuple > $outdir/bq-no-tuple.diff
42 | diff -r $avro_no_tuple $avro_tuple > $outdir/avro-tuple.diff
43 | diff -r $bq_no_tuple $bq_tuple > $outdir/bq-tuple.diff
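44 | 
45 | # The no-tuple diffs were expected to be empty against the v1.4.1 control; the
46 | # tuple diffs capture what the --tuple-struct flag changes relative to the default.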
47 | -------------------------------------------------------------------------------- /src/avro.rs: --------------------------------------------------------------------------------
1 | /// https://avro.apache.org/docs/current/spec.html
2 | use super::ast;
3 | use super::TranslateFrom;
4 | use super::{Context, ResolveMethod};
5 | use serde_json::{json, Value};
6 | 
7 | #[derive(Serialize, Deserialize, Debug)]
8 | #[serde(rename_all = "lowercase", tag = "type")]
9 | pub enum Primitive {
10 |     Null,
11 |     Boolean,
12 |     Int,
13 |     Long,
14 |     Float,
15 |     Double,
16 |     Bytes,
17 |     String,
18 | }
19 | 
20 | #[derive(Serialize, Deserialize, Debug, Default)]
21 | struct CommonAttributes {
22 |     name: String,
23 |     #[serde(skip_serializing_if = "Option::is_none")]
24 |     namespace: Option<String>,
25 |     #[serde(skip_serializing_if = "Option::is_none")]
26 |     doc: Option<String>,
27 |     #[serde(skip_serializing_if = "Option::is_none")]
28 |     aliases: Option<Vec<String>>,
29 | }
30 | 
31 | #[derive(Serialize, Deserialize, Debug)]
32 | pub struct Record {
33 |     #[serde(flatten)]
34 |     common: CommonAttributes,
35 |     fields: Vec<Field>,
36 | }
37 | 
38 | // The field doesn't handle the canonical form natively e.g. a null record
39 | // `{"name": "foo", "type": "null"}` must explicitly nest the type in the
40 | // following form: `{"name": "foo", "type": {"type": "null"}}`. Applying
41 | // flattening at this level will produce the wrong results for nested objects.
42 | // We may apply an extra layer of indirection in code by using a `FieldType`,
43 | // but this does not affect correctness of the schema.
44 | #[derive(Serialize, Deserialize, Debug, Default)]
45 | #[serde(tag = "type")]
46 | struct Field {
47 |     name: String,
48 |     #[serde(skip_serializing_if = "Option::is_none")]
49 |     doc: Option<String>,
50 |     #[serde(rename = "type")]
51 |     data_type: Type,
52 |     #[serde(skip_serializing_if = "Option::is_none")]
53 |     default: Option<Value>,
54 | }
55 | 
56 | #[derive(Serialize, Deserialize, Debug)]
57 | pub struct Enum {
58 |     #[serde(flatten)]
59 |     common: CommonAttributes,
60 |     symbols: Vec<String>,
61 | }
62 | 
63 | #[derive(Serialize, Deserialize, Debug)]
64 | pub struct Array {
65 |     items: Box<Type>,
66 | }
67 | 
68 | #[derive(Serialize, Deserialize, Debug)]
69 | pub struct Map {
70 |     values: Box<Type>,
71 | }
72 | 
73 | #[derive(Serialize, Deserialize, Debug)]
74 | pub struct Fixed {
75 |     // this field, however, does not support the doc attribute
76 |     #[serde(flatten)]
77 |     common: CommonAttributes,
78 |     size: usize,
79 | }
80 | 
81 | #[derive(Serialize, Deserialize, Debug)]
82 | #[serde(rename_all = "lowercase", tag = "type")]
83 | pub enum Complex {
84 |     Record(Record),
85 |     Enum(Enum),
86 |     Array(Array),
87 |     Map(Map),
88 |     Fixed(Fixed),
89 | }
90 | 
91 | #[derive(Serialize, Deserialize, Debug)]
92 | #[serde(untagged)]
93 | pub enum Type {
94 |     Primitive(Primitive),
95 |     Complex(Complex),
96 |     // A union is categorized as a complex type, but acts as a top-level type.
97 |     // It is delineated by the presence of a JSON array in the type field. This
98 |     // particular definition allows for nested unions, which is not valid avro.
99 |     Union(Vec<Type>),
100 | }
101 | 
102 | impl Default for Type {
103 |     fn default() -> Self {
104 |         Type::Primitive(Primitive::Null)
105 |     }
106 | }
107 | 
108 | impl TranslateFrom<ast::Tag> for Type {
109 |     type Error = String;
110 | 
111 |     fn translate_from(tag: ast::Tag, context: &Context) -> Result<Self, Self::Error> {
112 |         let mut tag = tag;
113 |         if tag.is_root {
114 |             // Name inference is run only from the root for the proper
115 |             // construction of the namespace. Fully qualified names require a
116 |             // top-down approach.
117 |             tag.collapse();
118 |             tag.name = Some("root".into());
119 |             tag.infer_name(context.normalize_case);
120 |         }
121 |         tag.infer_nullability(context.force_nullable);
122 | 
123 |         let fmt_reason =
124 |             |reason: &str| -> String { format!("{} - {}", tag.fully_qualified_name(), reason) };
125 |         let handle_error = |reason: &str| -> Result<Type, String> {
126 |             let message = fmt_reason(reason);
127 |             match context.resolve_method {
128 |                 ResolveMethod::Cast => {
129 |                     warn!("{}", message);
130 |                     Ok(Type::Primitive(Primitive::String))
131 |                 }
132 |                 ResolveMethod::Drop => Err(message),
133 |                 ResolveMethod::Panic => panic!("{}", message),
134 |             }
135 |         };
136 | 
137 |         let data_type = match &tag.data_type {
138 |             ast::Type::Null => Type::Primitive(Primitive::Null),
139 |             ast::Type::Atom(atom) => Type::Primitive(match atom {
140 |                 ast::Atom::Boolean => Primitive::Boolean,
141 |                 ast::Atom::Integer => Primitive::Long,
142 |                 ast::Atom::Number => Primitive::Double,
143 |                 ast::Atom::String => Primitive::String,
144 |                 ast::Atom::Datetime => Primitive::String,
145 |                 ast::Atom::Bytes => Primitive::Bytes,
146 |                 ast::Atom::Json => match handle_error("json atom") {
147 |                     Ok(_) => Primitive::String,
148 |                     Err(reason) => return Err(reason),
149 |                 },
150 |             }),
151 |             ast::Type::Object(object) => {
152 |                 let mut fields: Vec<Field> = if object.fields.is_empty() {
153 |                     Vec::new()
154 |                 } else {
155 |                     object
156 |                         .fields
157 |                         .iter()
158 |                         .map(|(k, v)| {
159 |                             let default = if v.nullable { Some(json!(null)) } else { None };
160 |                             (
161 |                                 k.to_string(),
162 |                                 Type::translate_from(*v.clone(), context),
163 |                                 default,
164 |                             )
165 |                         })
166 |                         .filter(|(_, v, _)| v.is_ok())
167 |                         .map(|(name, data_type, default)| Field {
168 |                             name,
169 |                             data_type: data_type.unwrap(),
170 |                             default,
171 |                             ..Default::default()
172 |                         })
173 |                         .collect()
174 |                 };
175 | 
176 |                 if fields.is_empty() {
177 |                     handle_error("empty object")?
178 | } else { 179 | fields.sort_by_key(|v| v.name.to_string()); 180 | let record = Record { 181 | common: CommonAttributes { 182 | // This is not a safe assumption 183 | name: tag.name.clone().unwrap_or_else(|| "__UNNAMED__".into()), 184 | namespace: tag.namespace.clone(), 185 | ..Default::default() 186 | }, 187 | fields, 188 | }; 189 | if record.common.name == "__UNNAMED__" { 190 | warn!("{} - Unnamed field", tag.fully_qualified_name()); 191 | } 192 | Type::Complex(Complex::Record(record)) 193 | } 194 | } 195 | ast::Type::Tuple(tuple) => { 196 | let fields = tuple 197 | .items 198 | .iter() 199 | .enumerate() 200 | .map(|(i, v)| { 201 | let default = if v.nullable { Some(json!(null)) } else { None }; 202 | ( 203 | format!("f{}_", i), 204 | Type::translate_from(v.clone(), context), 205 | default, 206 | ) 207 | }) 208 | .filter(|(_, v, _)| v.is_ok()) 209 | .map(|(name, data_type, default)| Field { 210 | name, 211 | data_type: data_type.unwrap(), 212 | default, 213 | ..Default::default() 214 | }) 215 | .collect(); 216 | let record = Record { 217 | common: CommonAttributes { 218 | name: tag.name.clone().unwrap_or_else(|| "__UNNAMED__".into()), 219 | namespace: tag.namespace.clone(), 220 | ..Default::default() 221 | }, 222 | fields, 223 | }; 224 | if record.common.name == "__UNNAMED__" { 225 | warn!("{} - Unnamed field", tag.fully_qualified_name()); 226 | } 227 | Type::Complex(Complex::Record(record)) 228 | } 229 | ast::Type::Array(array) => { 230 | let child_is_array = matches!(&array.items.data_type, ast::Type::Array(_)); 231 | match Type::translate_from(*array.items.clone(), context) { 232 | Ok(data_type) => { 233 | if child_is_array { 234 | Type::Complex(Complex::Array(Array { 235 | items: Box::new(Type::Complex(Complex::Record(Record { 236 | common: CommonAttributes { 237 | name: tag 238 | .name 239 | .clone() 240 | .unwrap_or_else(|| "__UNNAMED__".into()), 241 | namespace: tag.namespace.clone(), 242 | ..Default::default() 243 | }, 244 | fields: vec![Field { 245 | name: "list".into(), 246 | data_type, 247 | ..Default::default() 248 | }], 249 | }))), 250 | })) 251 | } else { 252 | Type::Complex(Complex::Array(Array { 253 | items: Box::new(data_type), 254 | })) 255 | } 256 | } 257 | Err(_) => return Err(fmt_reason("untyped array")), 258 | } 259 | } 260 | ast::Type::Map(map) => match Type::translate_from(*map.value.clone(), context) { 261 | Ok(data_type) => Type::Complex(Complex::Map(Map { 262 | values: Box::new(data_type), 263 | })), 264 | // Err is only reachable when context.resolve_method is Drop 265 | Err(_) => { 266 | return if context.allow_maps_without_value { 267 | Err(fmt_reason("map value cannot be dropped in avro")) 268 | } else { 269 | Err(fmt_reason("untyped map value")) 270 | } 271 | } 272 | }, 273 | _ => handle_error("unknown type")?, 274 | }; 275 | if tag.nullable && !tag.is_null() { 276 | Ok(Type::Union(vec![ 277 | Type::Primitive(Primitive::Null), 278 | data_type, 279 | ])) 280 | } else { 281 | Ok(data_type) 282 | } 283 | } 284 | } 285 | 286 | #[cfg(test)] 287 | mod tests { 288 | use super::*; 289 | use pretty_assertions::assert_eq; 290 | use serde_json::json; 291 | 292 | fn assert_serialize(expect: Value, schema: Type) { 293 | assert_eq!(expect, json!(schema)) 294 | } 295 | 296 | fn type_from_value(value: Value) -> Type { 297 | serde_json::from_value(value).unwrap() 298 | } 299 | 300 | fn assert_from_ast_eq(ast: Value, avro: Value) { 301 | let context = Context { 302 | ..Default::default() 303 | }; 304 | let tag: ast::Tag = serde_json::from_value(ast).unwrap(); 305 | let 
from_tag = Type::translate_from(tag, &context).unwrap(); 306 | assert_eq!(avro, json!(from_tag)) 307 | } 308 | 309 | #[test] 310 | fn serialize_primitive() { 311 | let schema = Type::Primitive(Primitive::Null); 312 | let expect = json!({"type": "null"}); 313 | assert_serialize(expect, schema); 314 | } 315 | 316 | #[test] 317 | fn serialize_complex_record() { 318 | let fields = vec![ 319 | Field { 320 | name: "test-bool".into(), 321 | data_type: Type::Primitive(Primitive::Boolean), 322 | ..Default::default() 323 | }, 324 | Field { 325 | name: "test-int".into(), 326 | data_type: Type::Primitive(Primitive::Int), 327 | ..Default::default() 328 | }, 329 | Field { 330 | name: "test-string".into(), 331 | data_type: Type::Primitive(Primitive::String), 332 | ..Default::default() 333 | }, 334 | ]; 335 | 336 | let schema = Type::Complex(Complex::Record(Record { 337 | common: CommonAttributes { 338 | name: "test-record".into(), 339 | ..Default::default() 340 | }, 341 | fields, 342 | })); 343 | 344 | let expect = json!({ 345 | "type": "record", 346 | "name": "test-record", 347 | "fields": [ 348 | {"name": "test-bool", "type": {"type": "boolean"}}, 349 | {"name": "test-int", "type": {"type": "int"}}, 350 | {"name": "test-string", "type": {"type": "string"}}, 351 | ] 352 | }); 353 | 354 | assert_serialize(expect, schema); 355 | } 356 | 357 | #[test] 358 | fn serialize_complex_enum() { 359 | let schema = Type::Complex(Complex::Enum(Enum { 360 | common: CommonAttributes { 361 | name: "test-enum".into(), 362 | ..Default::default() 363 | }, 364 | symbols: vec!["A".into(), "B".into(), "C".into()], 365 | })); 366 | let expect = json!({ 367 | "type": "enum", 368 | "name": "test-enum", 369 | "symbols": ["A", "B", "C"] 370 | }); 371 | assert_serialize(expect, schema); 372 | } 373 | 374 | #[test] 375 | fn serialize_complex_array() { 376 | let schema = Type::Complex(Complex::Array(Array { 377 | items: Box::new(Type::Primitive(Primitive::String)), 378 | })); 379 | let expect = json!({ 380 | "type": "array", 381 | "items": { 382 | "type": "string" 383 | } 384 | }); 385 | assert_serialize(expect, schema); 386 | } 387 | 388 | #[test] 389 | fn serialize_complex_map() { 390 | let schema = Type::Complex(Complex::Map(Map { 391 | values: Box::new(Type::Primitive(Primitive::Long)), 392 | })); 393 | let expect = json!({ 394 | "type": "map", 395 | "values": { 396 | "type": "long" 397 | } 398 | }); 399 | assert_serialize(expect, schema); 400 | } 401 | 402 | #[test] 403 | fn serialize_complex_union() { 404 | let schema = Type::Union(vec![ 405 | Type::Primitive(Primitive::Null), 406 | Type::Primitive(Primitive::Long), 407 | ]); 408 | let expect = json!([ 409 | {"type": "null"}, 410 | {"type": "long"}, 411 | ]); 412 | assert_serialize(expect, schema); 413 | } 414 | 415 | #[test] 416 | fn serialize_complex_fixed() { 417 | let schema = Type::Complex(Complex::Fixed(Fixed { 418 | common: CommonAttributes { 419 | name: "md5".into(), 420 | ..Default::default() 421 | }, 422 | size: 16, 423 | })); 424 | let expect = json!({ 425 | "type": "fixed", 426 | "size": 16, 427 | "name": "md5" 428 | }); 429 | assert_serialize(expect, schema); 430 | } 431 | 432 | #[test] 433 | fn deserialize_primitive() { 434 | let data = json!({ 435 | "type": "int" 436 | }); 437 | match type_from_value(data) { 438 | Type::Primitive(Primitive::Int) => (), 439 | _ => panic!(), 440 | } 441 | } 442 | 443 | #[test] 444 | fn deserialize_complex_record() { 445 | let data = json!({ 446 | "type": "record", 447 | "name": "test-record", 448 | "fields": [ 449 | {"name": 
"test-bool", "type": {"type": "boolean"}}, 450 | {"name": "test-int", "type": {"type": "int"}}, 451 | {"name": "test-string", "type": {"type": "string"}}, 452 | ] 453 | }); 454 | match type_from_value(data) { 455 | Type::Complex(Complex::Record(record)) => { 456 | assert_eq!(record.fields[0].name, "test-bool"); 457 | assert_eq!(record.fields[1].name, "test-int"); 458 | assert_eq!(record.fields[2].name, "test-string"); 459 | } 460 | _ => panic!(), 461 | } 462 | } 463 | 464 | #[test] 465 | fn deserialize_complex_enum() { 466 | let data = json!({ 467 | "type": "enum", 468 | "name": "test-enum", 469 | "symbols": ["A", "B", "C"] 470 | }); 471 | match type_from_value(data) { 472 | Type::Complex(Complex::Enum(enum_type)) => { 473 | assert_eq!(enum_type.symbols, vec!["A", "B", "C"]); 474 | } 475 | _ => panic!(), 476 | } 477 | } 478 | 479 | #[test] 480 | fn deserialize_complex_array() { 481 | let data = json!({ 482 | "type": "array", 483 | "items": { 484 | "type": "string" 485 | } 486 | }); 487 | match type_from_value(data) { 488 | Type::Complex(Complex::Array(array)) => match *array.items { 489 | Type::Primitive(Primitive::String) => (), 490 | _ => panic!(), 491 | }, 492 | _ => panic!(), 493 | } 494 | } 495 | 496 | #[test] 497 | fn deserialize_complex_map() { 498 | let data = json!({ 499 | "type": "map", 500 | "values": { 501 | "type": "long" 502 | } 503 | }); 504 | match type_from_value(data) { 505 | Type::Complex(Complex::Map(map)) => match *map.values { 506 | Type::Primitive(Primitive::Long) => (), 507 | _ => panic!(), 508 | }, 509 | _ => panic!(), 510 | } 511 | } 512 | 513 | #[test] 514 | fn deserialize_complex_union() { 515 | let data = json!([ 516 | {"type": "null"}, 517 | {"type": "long"}, 518 | ]); 519 | match type_from_value(data) { 520 | Type::Union(union) => { 521 | match union[0] { 522 | Type::Primitive(Primitive::Null) => (), 523 | _ => panic!(), 524 | }; 525 | match union[1] { 526 | Type::Primitive(Primitive::Long) => (), 527 | _ => panic!(), 528 | }; 529 | } 530 | _ => panic!(), 531 | } 532 | } 533 | 534 | #[test] 535 | fn deserialize_complex_fixed() { 536 | let data = json!({ 537 | "type": "fixed", 538 | "size": 16, 539 | "name": "md5" 540 | }); 541 | match type_from_value(data) { 542 | Type::Complex(Complex::Fixed(fixed)) => { 543 | assert_eq!(fixed.common.name, "md5"); 544 | assert_eq!(fixed.size, 16); 545 | } 546 | _ => panic!(), 547 | } 548 | } 549 | 550 | #[test] 551 | fn from_ast_null() { 552 | let ast = json!({"type": "null"}); 553 | let avro = json!({"type": "null"}); 554 | assert_from_ast_eq(ast, avro); 555 | } 556 | 557 | #[test] 558 | fn from_ast_atom() { 559 | let ast = json!({"type": {"atom": "integer"}}); 560 | let avro = json!({"type": "long"}); 561 | assert_from_ast_eq(ast, avro); 562 | } 563 | 564 | #[test] 565 | fn from_ast_object() { 566 | // Note the inclusion of `is_root`, which is required for proper 567 | // namespace resolution. For testing purposes, this property is only 568 | // included when testing deeply nested data-structures. 569 | let ast = json!({ 570 | "is_root": true, 571 | "type": {"object": { 572 | // An additional case could be made for the behavior of nested 573 | // structs and nested arrays. A nullable array for example may 574 | // not be a valid structure in bigquery. 
575 | "required": ["1-test-int", "3-test-nested", "4-test-array"], 576 | "fields": { 577 | "0-test-null": {"type": "null"}, 578 | "1-test-int": {"type": {"atom": "integer"}}, 579 | "2-test-null-int": {"type": {"atom": "integer"}, "nullable": true}, 580 | "3-test-nested": {"type": {"object": {"fields": { 581 | "test-bool": { 582 | "type": {"atom": "boolean"}, 583 | "nullable": true 584 | }}}}}, 585 | "4-test-array": {"type": {"array": { 586 | "items": {"type": {"atom": "integer"}}}}}, 587 | "$invalid-name": {"type": "null"} 588 | }}} 589 | }); 590 | let avro = json!({ 591 | "type": "record", 592 | "name": "root", 593 | "fields": [ 594 | {"name": "_0_test_null", "type": {"type": "null"}, "default": null}, 595 | {"name": "_1_test_int", "type": {"type": "long"}}, 596 | {"name": "_2_test_null_int", 597 | "type": [ 598 | {"type": "null"}, 599 | {"type": "long"}, 600 | ], 601 | "default": null, 602 | }, 603 | {"name": "_3_test_nested", "type": { 604 | "name": "_3_test_nested", 605 | "namespace": "root", 606 | "type": "record", 607 | "fields": [ 608 | {"name": "test_bool", 609 | "type": [ 610 | {"type": "null"}, 611 | {"type": "boolean"}, 612 | ], 613 | "default": null, 614 | }, 615 | ]}}, 616 | {"name": "_4_test_array", "type": { 617 | "type": "array", 618 | "items": {"type": "long"} 619 | }} 620 | ] 621 | }); 622 | assert_from_ast_eq(ast, avro); 623 | } 624 | 625 | #[test] 626 | fn from_ast_map() { 627 | let ast = json!({ 628 | "type": {"map": { 629 | "key": {"type": {"atom": "string"}}, 630 | "value": {"type": {"atom": "integer"}} 631 | }}}); 632 | let avro = json!({ 633 | "type": "map", 634 | "values": {"type": "long"} 635 | }); 636 | assert_from_ast_eq(ast, avro); 637 | } 638 | 639 | #[test] 640 | fn from_ast_array() { 641 | let ast = json!({ 642 | "type": {"array": {"items": { 643 | "type": {"atom": "string"}}}} 644 | }); 645 | let avro = json!({ 646 | "type": "array", 647 | "items": {"type": "string"} 648 | }); 649 | assert_from_ast_eq(ast, avro); 650 | } 651 | 652 | #[test] 653 | fn from_ast_array_array() { 654 | let ast = json!({ 655 | "is_root": true, 656 | "type": {"array": {"items": { 657 | "type": {"array": {"items": 658 | {"type": {"atom": "integer"}}}}}}} 659 | }); 660 | let avro = json!({ 661 | "type": "array", 662 | "items": 663 | { 664 | "type": "record", 665 | "name": "root", 666 | "fields": [ 667 | { 668 | "name": "list", 669 | "type": { 670 | "type": "array", 671 | "items": { 672 | "type": "long" 673 | } 674 | } 675 | } 676 | ] 677 | } 678 | }); 679 | assert_from_ast_eq(ast, avro); 680 | } 681 | 682 | #[test] 683 | fn from_ast_tuple() { 684 | // This case is not handled and instead converted into an object 685 | let ast = json!({ 686 | "type": { 687 | "tuple": { 688 | "items": [ 689 | {"type": {"atom": "boolean"}}, 690 | {"type": {"atom": "integer"}}, 691 | ] 692 | } 693 | } 694 | }); 695 | let avro = json!({ 696 | "type": "record", 697 | "name": "__UNNAMED__", 698 | "fields": [ 699 | {"name": "f0_", "type": {"type": "boolean"}}, 700 | {"name": "f1_", "type": {"type": "long"}} 701 | ] 702 | }); 703 | assert_from_ast_eq(ast, avro); 704 | } 705 | 706 | #[test] 707 | /// The union type is collapsed before being reconstructed 708 | fn from_ast_union() { 709 | let ast = json!({ 710 | // test this document as if it were root 711 | "is_root": true, 712 | "type": {"union": {"items": [ 713 | {"type": "null"}, 714 | {"type": {"atom": "boolean"}}, 715 | ]}} 716 | }); 717 | let avro = json!([ 718 | {"type": "null"}, 719 | {"type": "boolean"} 720 | ]); 721 | assert_from_ast_eq(ast, avro); 
722 | } 723 | 724 | #[test] 725 | fn from_ast_datetime() { 726 | let ast = json!({"type": {"atom": "datetime"}}); 727 | let avro = json!({"type": "string"}); 728 | assert_from_ast_eq(ast, avro); 729 | } 730 | 731 | #[test] 732 | fn from_ast_bytes() { 733 | let ast = json!({"type": {"atom": "bytes"}}); 734 | let avro = json!({"type": "bytes"}); 735 | assert_from_ast_eq(ast, avro); 736 | } 737 | } 738 | -------------------------------------------------------------------------------- /src/casing.rs: -------------------------------------------------------------------------------- 1 | #[cfg(not(feature = "oniguruma"))] 2 | use heck::ToSnakeCase; 3 | #[cfg(feature = "oniguruma")] 4 | use lazy_static::lazy_static; 5 | #[cfg(feature = "oniguruma")] 6 | use onig::Regex; 7 | 8 | /// Normalize the case of a string to be `snake_case`. 9 | /// 10 | /// This function produces internally-consistent snake-casing that performs well 11 | /// in many situations. The rule-set for word boundaries are consistent with the 12 | /// withoutboats/heck crate. Several benefits include treating embedded 13 | /// mnemonics like `RAM` and `XMLHttpRequest` in an intuitive fashion. See 14 | /// `tests/resources/casing/mps-diff-integration.csv` in the test sources for 15 | /// empirical use of this casing logic. 16 | /// 17 | /// Underscores are considered word boundaries alongside the standard `\b` 18 | /// pattern. Boundaries in `camelCasing` are found by instances of a lowercase 19 | /// followed by an uppercase. Digits can be either lowercase or uppercase 20 | /// depending on the case of the most recent letter. Sequences of underscores 21 | /// are not significant and therefore cannot be used to encode other characters 22 | /// e.g. `-` cannot be represented via `__` because `_` is a word boundary. 23 | /// 24 | /// ## References 25 | /// 26 | /// * [Reference Python3 implementation](https://github.com/acmiyaguchi/test-casing/blob/8ca3d68db512fd3a17868c0b08cc84909ebebbc7/src/main.py#L1-L34) 27 | /// * [[withoutboats/heck] - Definition of a word boundary](https://github.com/withoutboats/heck/blob/093d56fbf001e1506e56dbfa38631d99b1066df1/src/lib.rs#L7-L17) 28 | /// * [[RexEgg] - Regex Boundaries and Delimiters—Standard and Advanced](https://www.rexegg.com/regex-boundaries.html) 29 | /// * [[StackOverflow] - RegEx to split camelCase or TitleCase (advanced)](https://stackoverflow.com/a/7599674) 30 | /// * [[StackOverflow] - What's the technical reason for “lookbehind assertion MUST be fixed length” in regex?](https://stackoverflow.com/a/40078049) 31 | #[cfg(not(feature = "oniguruma"))] 32 | pub fn to_snake_case(input: &str) -> String { 33 | input.to_snake_case() 34 | } 35 | #[cfg(feature = "oniguruma")] 36 | pub fn to_snake_case(input: &str) -> String { 37 | lazy_static! { 38 | static ref EXTRA_SYMBOL: Regex = Regex::new(r"[^\w]|_").unwrap(); 39 | // This regex matches camelCase in reverse, since the lookbehind 40 | // operation only accepts patterns of fixed length. This "inverted" 41 | // lookahead can help determine whether a digit is lowercase or 42 | // uppercase. 43 | static ref REV_WORD_BOUNDARY: Regex = Regex::new( 44 | r"(?x) 45 | \b # standard word boundary 46 | |(?<=[a-z][A-Z])(?=\d*[A-Z]) # break on runs of uppercase e.g. A7Aa -> A7|Aa 47 | |(?<=[a-z][A-Z])(?=\d*[a-z]) # break on runs of lowercase e.g a7Aa -> a7|Aa 48 | |(?<=[A-Z])(?=\d*[a-z]) # break on final uppercase e.g. 
a7A -> a7|A 49 | ", 50 | ) 51 | .unwrap(); 52 | } 53 | let subbed: String = EXTRA_SYMBOL.replace_all(input, " ").chars().rev().collect(); 54 | let words: Vec<&str> = REV_WORD_BOUNDARY 55 | .split(&subbed) 56 | .filter(|s| !s.trim().is_empty()) 57 | .collect(); 58 | words.join("_").to_lowercase().chars().rev().collect() 59 | } 60 | 61 | #[cfg(test)] 62 | mod tests { 63 | use super::*; 64 | 65 | macro_rules! case { 66 | ($test:expr, $expect:expr) => { 67 | assert_eq!(to_snake_case($test), $expect) 68 | }; 69 | } 70 | #[test] 71 | fn test_to_snake_case() { 72 | // one word 73 | case!("Aa", "aa"); 74 | // two words 75 | case!("aA", "a_a"); 76 | // underscores are word boundaries 77 | case!("_a__a_", "a_a"); 78 | // mnemonics are considered words 79 | case!("RAM", "ram"); 80 | // numbers can be lowercase 81 | case!("a7aAa", "a7a_aa"); 82 | // numbers can be uppercase 83 | case!("A7AAa", "a7a_aa"); 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | #![recursion_limit = "128"] 2 | #[macro_use] 3 | extern crate log; 4 | #[macro_use] 5 | extern crate serde; 6 | #[macro_use] 7 | extern crate maplit; 8 | 9 | mod ast; 10 | mod avro; 11 | mod bigquery; 12 | pub mod casing; 13 | mod jsonschema; 14 | mod traits; 15 | 16 | use regex::Regex; 17 | use serde_json::{json, Value}; 18 | use traits::TranslateFrom; 19 | 20 | /// The error resolution method in the [`TranslateFrom`] and [`TranslateInto`] 21 | /// interfaces when converting between schema formats. 22 | /// 23 | /// The `Cast` method will represent under-specified (e.g. empty objects) and 24 | /// incompatible (e.g. variant-types or conflicting oneOf definitions) as 25 | /// strings. This behavior is useful for compacting complex types into a single 26 | /// column. In Spark and BigQuery, a casted column can be processed via a user 27 | /// defined function that works on JSON. However, this method may cause issues 28 | /// with schema evolution, for example when adding properties to empty objects. 29 | /// 30 | /// The `Drop` method will drop fields if they do not fall neatly into one of 31 | /// the supported types. This method ensures forward compatibility with schemas, 32 | /// but it can lose large portions of nested data. Support from the data 33 | /// processing side can recover dropped data from the structured section of the 34 | /// schema. 35 | /// 36 | /// The `Panic` method will panic if the JSON Schema is inconsistent or uses 37 | /// unsupported features. This method is a useful way to test for incompatible 38 | /// schemas. 39 | #[derive(Copy, Clone, Default, Deserialize)] 40 | pub enum ResolveMethod { 41 | #[default] 42 | Cast, 43 | Drop, 44 | Panic, 45 | } 46 | 47 | /// Options for modifying the behavior of translating between two schema 48 | /// formats. 49 | /// 50 | /// This structure passes context from the command-line interface into the 51 | /// translation logic between the various schema types in the project. In 52 | /// particular, the context is useful for resolving edge-cases in ambiguous 53 | /// situations. This can includes situations like casting or dropping an empty 54 | /// object. 
55 | #[derive(Clone, Default, Deserialize)] 56 | #[serde(default)] 57 | pub struct Context { 58 | pub resolve_method: ResolveMethod, 59 | pub normalize_case: bool, 60 | pub force_nullable: bool, 61 | pub tuple_struct: bool, 62 | pub allow_maps_without_value: bool, 63 | pub json_object_path_regex: Option, 64 | } 65 | 66 | impl Context { 67 | /// Determine whether the given fully qualified name matches the configured json object path. 68 | fn is_json_object_path(&self, fqn: &str) -> bool { 69 | // would need to be passed in _somehow_. 70 | self.json_object_path_regex 71 | .as_ref() 72 | .map(|object_regex| { 73 | // Ensure we match from the beginning of the string 74 | let re = format!(r"\A{}", object_regex); 75 | let json_object_path_re = Regex::new(&re).unwrap(); 76 | json_object_path_re.is_match(fqn) 77 | }) 78 | .unwrap_or(false) 79 | } 80 | } 81 | 82 | fn into_ast(input: &Value, context: &mut Context) -> ast::Tag { 83 | let jsonschema: jsonschema::Tag = match serde_json::from_value(json!(input)) { 84 | Ok(tag) => tag, 85 | Err(e) => panic!("{:#?}", e), 86 | }; 87 | 88 | // The only special thing this crates knows about the schema: 89 | // Every sub-tree id matching the regex in `mozPipelineMetadata.json_object_path_regex` is dumped as a JSON 90 | // column without peeking further into that subtree. 91 | let metadata = jsonschema 92 | .extra 93 | .get("mozPipelineMetadata") 94 | .and_then(|obj| obj["json_object_path_regex"].as_str()); 95 | if let Some(json_object_path_regex) = metadata { 96 | context.json_object_path_regex = Some(json_object_path_regex.to_string()); 97 | } 98 | 99 | ast::Tag::translate_from(jsonschema, context).unwrap() 100 | } 101 | 102 | /// Convert JSON Schema into an Avro compatible schema 103 | pub fn convert_avro(input: &Value, mut context: Context) -> Value { 104 | let ast = into_ast(input, &mut context); 105 | let avro = avro::Type::translate_from(ast, &context).unwrap(); 106 | json!(avro) 107 | } 108 | 109 | /// Convert JSON Schema into a BigQuery compatible schema 110 | pub fn convert_bigquery(input: &Value, mut context: Context) -> Value { 111 | let ast = into_ast(input, &mut context); 112 | let bq = bigquery::Schema::translate_from(ast, &context).unwrap(); 113 | json!(bq) 114 | } 115 | -------------------------------------------------------------------------------- /src/main.rs: -------------------------------------------------------------------------------- 1 | use clap::{Parser, ValueEnum}; 2 | use jst::{Context, ResolveMethod}; 3 | use serde_json::Value; 4 | use std::fs::File; 5 | use std::io::{self, BufReader}; 6 | 7 | /// Output schema format 8 | #[derive(Copy, Clone, Default, Debug, PartialEq, Eq, PartialOrd, Ord, ValueEnum)] 9 | enum Type { 10 | /// Avro format 11 | #[default] 12 | Avro, 13 | /// BigQuery format 14 | Bigquery, 15 | } 16 | 17 | /// Resolution strategy 18 | #[derive(Copy, Clone, Default, Debug, PartialEq, Eq, PartialOrd, Ord, ValueEnum)] 19 | enum Resolve { 20 | /// Cast incompatible/under-specified schemas 21 | #[default] 22 | Cast, 23 | /// Panic on incompatible/under-specified schemas 24 | Panic, 25 | /// Drop incompatible/under-specified schemas 26 | Drop, 27 | } 28 | 29 | #[derive(Parser, Debug)] 30 | #[command(version, about, long_about = None)] 31 | struct Args { 32 | /// Sets the input file to use 33 | file: Option, 34 | 35 | /// The output schema format 36 | #[arg(short, long = "type", value_enum, default_value_t = Type::Avro, value_name = "TYPE")] 37 | typ: Type, 38 | 39 | /// The resolution strategy for incompatible or 
under-specified schema 40 | #[arg(short, long, value_enum, default_value_t = Resolve::Cast)] 41 | resolve: Resolve, 42 | 43 | /// snake_case column-names for consistent behavior between SQL engines 44 | #[arg(short = 'c', long)] 45 | normalize_case: bool, 46 | 47 | /// Treats all columns as NULLABLE, ignoring the required section in the JSON Schema object 48 | #[arg(short = 'n', long)] 49 | force_nullable: bool, 50 | 51 | /// Treats tuple validation as an anonymous struct 52 | #[arg(long)] 53 | tuple_struct: bool, 54 | 55 | /// Produces maps without a value field for incompatible or under-specified value schema 56 | #[arg(short = 'w', long)] 57 | allow_maps_without_value: bool, 58 | } 59 | 60 | fn main() { 61 | env_logger::init(); 62 | 63 | let args = Args::parse(); 64 | 65 | let reader: Box = match &args.file { 66 | Some(path) if path == "-" => Box::new(io::stdin()), 67 | Some(path) => { 68 | let file = File::open(path).unwrap(); 69 | Box::new(BufReader::new(file)) 70 | } 71 | None => Box::new(io::stdin()), 72 | }; 73 | let data: Value = serde_json::from_reader(reader).unwrap(); 74 | let context = Context { 75 | resolve_method: match args.resolve { 76 | Resolve::Cast => ResolveMethod::Cast, 77 | Resolve::Panic => ResolveMethod::Panic, 78 | Resolve::Drop => ResolveMethod::Drop, 79 | }, 80 | normalize_case: args.normalize_case, 81 | force_nullable: args.force_nullable, 82 | tuple_struct: args.tuple_struct, 83 | allow_maps_without_value: args.allow_maps_without_value, 84 | json_object_path_regex: None, 85 | }; 86 | 87 | let output = match args.typ { 88 | Type::Avro => jst::convert_avro(&data, context), 89 | Type::Bigquery => jst::convert_bigquery(&data, context), 90 | }; 91 | let pretty = serde_json::to_string_pretty(&output).unwrap(); 92 | println!("{}", pretty); 93 | } 94 | -------------------------------------------------------------------------------- /src/traits.rs: -------------------------------------------------------------------------------- 1 | use super::Context; 2 | 3 | /// A translation between two schema formats that may fail under certain 4 | /// conditions. 5 | /// 6 | /// This is similar to the `TryFrom` trait, but requires the implementor to pass 7 | /// a Context struct for runtime modifications to the schema. A concrete use 8 | /// context is to decide on an appropriate error handling mechanism when a JSON 9 | /// Schema contains an empty field. Given a use-case, it may be more appropriate 10 | /// to fail fast and panic over dropping or casting the field. 11 | /// 12 | /// https://doc.rust-lang.org/src/core/convert.rs.html#478-486 13 | pub trait TranslateFrom: Sized { 14 | type Error; 15 | 16 | fn translate_from(value: T, context: &Context) -> Result; 17 | } 18 | 19 | /// A translation between two schema formats. 
19 | /// A translation between two schema formats. It is the reciprocal of
20 | /// [`TranslateFrom`].
21 | pub trait TranslateInto<T>: Sized {
22 | type Error;
23 | 
24 | fn translate_into(self, context: &Context) -> Result<T, Self::Error>;
25 | }
26 | 
27 | // TranslateFrom implies TranslateInto
28 | impl<T, U> TranslateInto<U> for T
29 | where
30 | U: TranslateFrom<T>,
31 | {
32 | type Error = U::Error;
33 | fn translate_into(self, context: &Context) -> Result<U, Self::Error> {
34 | U::translate_from(self, context)
35 | }
36 | }
37 | 
-------------------------------------------------------------------------------- /tests/force_nullable.rs: --------------------------------------------------------------------------------
1 | use jst::Context;
2 | use jst::{convert_avro, convert_bigquery};
3 | use pretty_assertions::assert_eq;
4 | use serde_json::Value;
5 | 
6 | fn test_data() -> Value {
7 | serde_json::from_str(
8 | r#"
9 | {
10 | "type": "object",
11 | "properties": {
12 | "array": {
13 | "type": "array",
14 | "items": {
15 | "type": "object",
16 | "properties": {
17 | "a": {"type": "boolean"}
18 | },
19 | "required": ["a"]
20 | }
21 | },
22 | "atom": {"type": "integer"},
23 | "map": {
24 | "type": "object",
25 | "additionalProperties": {
26 | "type": "object",
27 | "properties": {
28 | "b": {"type": "boolean"}
29 | },
30 | "required": ["b"]
31 | }
32 | },
33 | "object": {
34 | "type": "object",
35 | "properties": {
36 | "c": {"type": "boolean"},
37 | "d": {"type": "boolean"}
38 | },
39 | "required": ["c", "d"]
40 | },
41 | "union": {
42 | "oneOf": [
43 | {
44 | "type": "object",
45 | "properties": {
46 | "e": {"type": "boolean"}
47 | },
48 | "required": ["e"]
49 | },
50 | {
51 | "type": "object",
52 | "properties": {
53 | "f": {"type": "boolean"}
54 | },
55 | "required": ["f"]
56 | }
57 | ]
58 | },
59 | "tuple": {
60 | "type": "array",
61 | "items": [
62 | {"type": "boolean"}
63 | ],
64 | "maxItems": 1
65 | }
66 | },
67 | "required": ["atom", "object", "map", "array", "union", "tuple"]
68 | }
69 | "#,
70 | )
71 | .unwrap()
72 | }
73 | 
74 | #[test]
75 | fn test_bigquery_force_nullable() {
76 | let context = Context {
77 | force_nullable: true,
78 | tuple_struct: true,
79 | ..Default::default()
80 | };
81 | 
82 | let expected: Value = serde_json::from_str(
83 | r#"
84 | [
85 | {
86 | "fields": [
87 | {
88 | "mode": "NULLABLE",
89 | "name": "a",
90 | "type": "BOOL"
91 | }
92 | ],
93 | "mode": "REPEATED",
94 | "name": "array",
95 | "type": "RECORD"
96 | },
97 | {
98 | "mode": "NULLABLE",
99 | "name": "atom",
100 | "type": "INT64"
101 | },
102 | {
103 | "fields": [
104 | {
105 | "mode": "NULLABLE",
106 | "name": "key",
107 | "type": "STRING"
108 | },
109 | {
110 | "fields": [
111 | {
112 | "mode": "NULLABLE",
113 | "name": "b",
114 | "type": "BOOL"
115 | }
116 | ],
117 | "mode": "NULLABLE",
118 | "name": "value",
119 | "type": "RECORD"
120 | }
121 | ],
122 | "mode": "REPEATED",
123 | "name": "map",
124 | "type": "RECORD"
125 | },
126 | {
127 | "fields": [
128 | {
129 | "mode": "NULLABLE",
130 | "name": "c",
131 | "type": "BOOL"
132 | },
133 | {
134 | "mode": "NULLABLE",
135 | "name": "d",
136 | "type": "BOOL"
137 | }
138 | ],
139 | "mode": "NULLABLE",
140 | "name": "object",
141 | "type": "RECORD"
142 | },
143 | {
144 | "fields": [
145 | {
146 | "mode": "NULLABLE",
147 | "name": "f0_",
148 | "type": "BOOL"
149 | }
150 | ],
151 | "mode": "NULLABLE",
152 | "name": "tuple",
153 | "type": "RECORD"
154 | },
155 | {
156 | "fields": [
157 | {
158 | "mode": "NULLABLE",
159 | "name": "e",
160 | "type": "BOOL"
161 | },
162 | {
163 | "mode": "NULLABLE",
164 | "name": "f",
165 | "type": "BOOL"
166 | }
167 
| ], 168 | "mode": "NULLABLE", 169 | "name": "union", 170 | "type": "RECORD" 171 | } 172 | ] 173 | "#, 174 | ) 175 | .unwrap(); 176 | 177 | assert_eq!(expected, convert_bigquery(&test_data(), context)); 178 | } 179 | 180 | #[test] 181 | fn test_avro_force_nullable() { 182 | let context = Context { 183 | force_nullable: true, 184 | tuple_struct: true, 185 | ..Default::default() 186 | }; 187 | let expected: Value = serde_json::from_str( 188 | r#" 189 | [ 190 | {"type": "null"}, 191 | { 192 | "fields": [ 193 | { 194 | "default": null, 195 | "name": "array", 196 | "type": [ 197 | {"type": "null"}, 198 | { 199 | "items": [ 200 | {"type": "null"}, 201 | { 202 | "fields": [ 203 | { 204 | "default": null, 205 | "name": "a", 206 | "type": [ 207 | {"type": "null"}, 208 | {"type": "boolean"} 209 | ] 210 | } 211 | ], 212 | "name": "list", 213 | "namespace": "root.array", 214 | "type": "record" 215 | } 216 | ], 217 | "type": "array" 218 | } 219 | ] 220 | }, 221 | { 222 | "default": null, 223 | "name": "atom", 224 | "type": [ 225 | {"type": "null"}, 226 | {"type": "long"} 227 | ] 228 | }, 229 | { 230 | "default": null, 231 | "name": "map", 232 | "type": [ 233 | {"type": "null"}, 234 | { 235 | "type": "map", 236 | "values": [ 237 | {"type": "null"}, 238 | { 239 | "fields": [ 240 | { 241 | "default": null, 242 | "name": "b", 243 | "type": [ 244 | {"type": "null"}, 245 | {"type": "boolean"} 246 | ] 247 | } 248 | ], 249 | "name": "value", 250 | "namespace": "root.map", 251 | "type": "record" 252 | } 253 | ] 254 | } 255 | ] 256 | }, 257 | { 258 | "default": null, 259 | "name": "object", 260 | "type": [ 261 | {"type": "null"}, 262 | { 263 | "fields": [ 264 | { 265 | "default": null, 266 | "name": "c", 267 | "type": [ 268 | {"type": "null"}, 269 | {"type": "boolean"} 270 | ] 271 | }, 272 | { 273 | "default": null, 274 | "name": "d", 275 | "type": [ 276 | {"type": "null"}, 277 | {"type": "boolean"} 278 | ] 279 | } 280 | ], 281 | "name": "object", 282 | "namespace": "root", 283 | "type": "record" 284 | } 285 | ] 286 | }, 287 | { 288 | "default": null, 289 | "name": "tuple", 290 | "type": [ 291 | {"type": "null"}, 292 | { 293 | "name": "tuple", 294 | "namespace": "root", 295 | "type": "record", 296 | "fields": [ 297 | { 298 | "default": null, 299 | "name": "f0_", 300 | "type": [ 301 | {"type": "null"}, 302 | {"type": "boolean"} 303 | ] 304 | } 305 | ] 306 | } 307 | ] 308 | }, 309 | { 310 | "default": null, 311 | "name": "union", 312 | "type": [ 313 | {"type": "null"}, 314 | { 315 | "fields": [ 316 | { 317 | "default": null, 318 | "name": "e", 319 | "type": [ 320 | {"type": "null"}, 321 | {"type": "boolean"} 322 | ] 323 | }, 324 | { 325 | "default": null, 326 | "name": "f", 327 | "type": [ 328 | {"type": "null"}, 329 | {"type": "boolean"} 330 | ] 331 | } 332 | ], 333 | "name": "union", 334 | "namespace": "root", 335 | "type": "record" 336 | } 337 | ] 338 | } 339 | ], 340 | "name": "root", 341 | "type": "record" 342 | } 343 | ] 344 | "#, 345 | ) 346 | .unwrap(); 347 | 348 | assert_eq!(expected, convert_avro(&test_data(), context)); 349 | } 350 | -------------------------------------------------------------------------------- /tests/normalize_case.rs: -------------------------------------------------------------------------------- 1 | use std::fs::File; 2 | use std::io::{BufRead, BufReader}; 3 | use std::path::PathBuf; 4 | 5 | use pretty_assertions::assert_eq; 6 | use serde_json::Value; 7 | 8 | use jst::casing::to_snake_case; 9 | use jst::{convert_avro, convert_bigquery}; 10 | use jst::{Context, ResolveMethod}; 11 | 
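// The fixtures below exercise `to_snake_case` end to end. Representative
// mappings, drawn from the test data in this file and from
// tests/resources/casing/alphanum_3.csv:
//
//   testCamelCase             -> test_camel_case
//   TEST_SCREAMING_SNAKE_CASE -> test_screaming_snake_case
//   aA7                       -> a_a7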
12 | fn test_data() -> Value { 13 | serde_json::from_str( 14 | r#" 15 | { 16 | "type": "object", 17 | "properties": { 18 | "test_snake_case": {"type": "boolean"}, 19 | "testCamelCase": {"type": "boolean"}, 20 | "TestPascalCase": {"type": "boolean"}, 21 | "TEST_SCREAMING_SNAKE_CASE": {"type": "boolean"} 22 | }, 23 | "required": [ 24 | "test_snake_case", 25 | "testCamelCase", 26 | "TestPascalCase", 27 | "TEST_SCREAMING_SNAKE_CASE" 28 | ] 29 | } 30 | "#, 31 | ) 32 | .unwrap() 33 | } 34 | 35 | /// Get the resource path for all the casing tests 36 | fn resource_path() -> PathBuf { 37 | let mut path = PathBuf::from(env!("CARGO_MANIFEST_DIR")); 38 | path.push("tests/resources/casing"); 39 | path 40 | } 41 | 42 | /// Test the `to_snake_case` method against a test file in the format 43 | /// `reference,expected` 44 | fn snake_case_test(case_name: &str) { 45 | let mut path = resource_path(); 46 | path.push(case_name); 47 | let file = File::open(&path).unwrap(); 48 | let reader = BufReader::new(file); 49 | for line in reader.lines() { 50 | let line = line.unwrap().to_string(); 51 | let cols: Vec<&str> = line.split(',').collect(); 52 | assert_eq!(cols.len(), 2); 53 | assert_eq!(to_snake_case(cols[0]), cols[1]); 54 | } 55 | } 56 | 57 | #[test] 58 | fn test_snake_casing_alphanum_3() { 59 | // all strings of length 3 drawn from the alphabet "aA7" 60 | snake_case_test("alphanum_3.csv"); 61 | } 62 | 63 | #[test] 64 | fn test_snake_casing_word_4() { 65 | // all strings of length 4 drawn from the alphabet "aA7_" 66 | snake_case_test("word_4.csv"); 67 | } 68 | 69 | #[test] 70 | fn test_snake_casing_mps_diff_integration() { 71 | // all column names from mozilla-pipeline-schemas affected by snake_casing 72 | // https://github.com/mozilla/jsonschema-transpiler/pull/79#issuecomment-509839572 73 | // https://gist.github.com/acmiyaguchi/3f526c440b67ebe469bcb6ab2da5123f#file-readme-md 74 | snake_case_test("mps-diff-integration.csv"); 75 | } 76 | 77 | #[test] 78 | fn test_bigquery_normalize_snake_casing() { 79 | let context = Context { 80 | normalize_case: true, 81 | resolve_method: ResolveMethod::Panic, 82 | ..Default::default() 83 | }; 84 | let expected: Value = serde_json::from_str( 85 | r#" 86 | [ 87 | { 88 | "mode": "REQUIRED", 89 | "name": "test_camel_case", 90 | "type": "BOOL" 91 | }, 92 | { 93 | "mode": "REQUIRED", 94 | "name": "test_pascal_case", 95 | "type": "BOOL" 96 | }, 97 | { 98 | "mode": "REQUIRED", 99 | "name": "test_screaming_snake_case", 100 | "type": "BOOL" 101 | }, 102 | { 103 | "mode": "REQUIRED", 104 | "name": "test_snake_case", 105 | "type": "BOOL" 106 | } 107 | ] 108 | "#, 109 | ) 110 | .unwrap(); 111 | 112 | assert_eq!(expected, convert_bigquery(&test_data(), context)); 113 | } 114 | 115 | #[test] 116 | fn test_avro_normalize_snake_casing() { 117 | let context = Context { 118 | normalize_case: true, 119 | resolve_method: ResolveMethod::Panic, 120 | ..Default::default() 121 | }; 122 | let expected: Value = serde_json::from_str( 123 | r#" 124 | { 125 | "fields": [ 126 | { 127 | "name": "test_camel_case", 128 | "type": {"type": "boolean"} 129 | }, 130 | { 131 | "name": "test_pascal_case", 132 | "type": {"type": "boolean"} 133 | }, 134 | { 135 | "name": "test_screaming_snake_case", 136 | "type": {"type": "boolean"} 137 | }, 138 | { 139 | "name": "test_snake_case", 140 | "type": {"type": "boolean"} 141 | } 142 | ], 143 | "name": "root", 144 | "type": "record" 145 | } 146 | "#, 147 | ) 148 | .unwrap(); 149 | 150 | assert_eq!(expected, convert_avro(&test_data(), context)); 151 | } 152 | 
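// Note: the expected outputs above list fields in sorted order; the
// transpiler sorts record fields so that output schemas are deterministic
// (see the description in tests/resources/translate/object.json).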
-------------------------------------------------------------------------------- /tests/resolve_method.rs: -------------------------------------------------------------------------------- 1 | use jst::{convert_avro, convert_bigquery}; 2 | use jst::{Context, ResolveMethod}; 3 | use serde_json::Value; 4 | 5 | fn test_data() -> Value { 6 | serde_json::from_str( 7 | r#" 8 | { 9 | "type": "object", 10 | "properties": { 11 | "empty": {}, 12 | "int": {"type": "integer"} 13 | } 14 | } 15 | "#, 16 | ) 17 | .unwrap() 18 | } 19 | 20 | #[test] 21 | fn test_bigquery_resolve_error_cast() { 22 | let context = Context { 23 | resolve_method: ResolveMethod::Cast, 24 | ..Default::default() 25 | }; 26 | let expected: Value = serde_json::from_str( 27 | r#" 28 | [ 29 | { 30 | "mode": "NULLABLE", 31 | "name": "empty", 32 | "type": "STRING" 33 | }, 34 | { 35 | "mode": "NULLABLE", 36 | "name": "int", 37 | "type": "INT64" 38 | } 39 | ] 40 | "#, 41 | ) 42 | .unwrap(); 43 | 44 | assert_eq!(expected, convert_bigquery(&test_data(), context)); 45 | } 46 | 47 | #[test] 48 | fn test_bigquery_resolve_error_drop() { 49 | let context = Context { 50 | resolve_method: ResolveMethod::Drop, 51 | ..Default::default() 52 | }; 53 | let expected: Value = serde_json::from_str( 54 | r#" 55 | [ 56 | { 57 | "mode": "NULLABLE", 58 | "name": "int", 59 | "type": "INT64" 60 | } 61 | ] 62 | "#, 63 | ) 64 | .unwrap(); 65 | assert_eq!(expected, convert_bigquery(&test_data(), context)); 66 | } 67 | 68 | #[test] 69 | #[should_panic] 70 | fn test_bigquery_resolve_error_panic() { 71 | let context = Context { 72 | resolve_method: ResolveMethod::Panic, 73 | ..Default::default() 74 | }; 75 | convert_bigquery(&test_data(), context); 76 | } 77 | 78 | #[test] 79 | fn test_avro_resolve_error_cast() { 80 | let context = Context { 81 | resolve_method: ResolveMethod::Cast, 82 | ..Default::default() 83 | }; 84 | let expected: Value = serde_json::from_str( 85 | r#" 86 | { 87 | "fields": [ 88 | { 89 | "default": null, 90 | "name": "empty", 91 | "type": [ 92 | {"type": "null"}, 93 | {"type": "string"} 94 | ] 95 | }, 96 | { 97 | "default": null, 98 | "name": "int", 99 | "type": [ 100 | {"type": "null"}, 101 | {"type": "long"} 102 | ] 103 | } 104 | ], 105 | "name": "root", 106 | "type": "record" 107 | } 108 | "#, 109 | ) 110 | .unwrap(); 111 | 112 | assert_eq!(expected, convert_avro(&test_data(), context)); 113 | } 114 | 115 | #[test] 116 | fn test_avro_resolve_error_drop() { 117 | let context = Context { 118 | resolve_method: ResolveMethod::Drop, 119 | ..Default::default() 120 | }; 121 | let expected: Value = serde_json::from_str( 122 | r#" 123 | { 124 | "fields": [ 125 | { 126 | "default": null, 127 | "name": "int", 128 | "type": [ 129 | {"type": "null"}, 130 | {"type": "long"} 131 | ] 132 | } 133 | ], 134 | "name": "root", 135 | "type": "record" 136 | } 137 | "#, 138 | ) 139 | .unwrap(); 140 | assert_eq!(expected, convert_avro(&test_data(), context)); 141 | } 142 | 143 | #[test] 144 | #[should_panic] 145 | fn test_avro_resolve_error_panic() { 146 | let context = Context { 147 | resolve_method: ResolveMethod::Panic, 148 | ..Default::default() 149 | }; 150 | convert_avro(&test_data(), context); 151 | } 152 | -------------------------------------------------------------------------------- /tests/resources/casing/alphanum_3.csv: -------------------------------------------------------------------------------- 1 | AAA,aaa 2 | AAa,a_aa 3 | AA7,aa7 4 | AaA,aa_a 5 | Aaa,aaa 6 | Aa7,aa7 7 | A7A,a7a 8 | A7a,a7a 9 | A77,a77 10 | aAA,a_aa 11 | aAa,a_aa 12 | aA7,a_a7 13 | 
aaA,aa_a 14 | aaa,aaa 15 | aa7,aa7 16 | a7A,a7_a 17 | a7a,a7a 18 | a77,a77 19 | 7AA,7aa 20 | 7Aa,7aa 21 | 7A7,7a7 22 | 7aA,7a_a 23 | 7aa,7aa 24 | 7a7,7a7 25 | 77A,77a 26 | 77a,77a 27 | 777,777 28 | -------------------------------------------------------------------------------- /tests/resources/casing/mps-diff-integration.csv: -------------------------------------------------------------------------------- 1 | AvailablePageFile,available_page_file 2 | AvailablePhysicalMemory,available_physical_memory 3 | AvailableVirtualMemory,available_virtual_memory 4 | BuildID,build_id 5 | D2DEnabled,d2d_enabled 6 | DWriteEnabled,d_write_enabled 7 | GPUActive,gpu_active 8 | Headless,headless 9 | IsGarbageCollecting,is_garbage_collecting 10 | LowEndMachine,low_end_machine 11 | ProductID,product_id 12 | ProductName,product_name 13 | RAM,ram 14 | ReleaseChannel,release_channel 15 | SecondsSinceLastCrash,seconds_since_last_crash 16 | StartupCrash,startup_crash 17 | SystemMemoryUsePercentage,system_memory_use_percentage 18 | TotalPageFile,total_page_file 19 | TotalPhysicalMemory,total_physical_memory 20 | TotalVirtualMemory,total_virtual_memory 21 | Version,version 22 | acceptLanguages,accept_languages 23 | accessibilityServices,accessibility_services 24 | activeAddons,active_addons 25 | activeExperiment,active_experiment 26 | activeGMPlugins,active_gm_plugins 27 | activePlugins,active_plugins 28 | adHocTablesDir,ad_hoc_tables_dir 29 | additionalProperties,additional_properties 30 | addonCompatibilityCheckEnabled,addon_compatibility_check_enabled 31 | addonId,addon_id 32 | addonVersion,addon_version 33 | advancedLayers,advanced_layers 34 | allowAutoplay,allow_autoplay 35 | apiCall,api_call 36 | apiVersion,api_version 37 | appDisabled,app_disabled 38 | appLocales,app_locales 39 | appName,app_name 40 | appUpdateChannel,app_update_channel 41 | appVersion,app_version 42 | appleModelId,apple_model_id 43 | applicationId,application_id 44 | applicationName,application_name 45 | architecturesInBinary,architectures_in_binary 46 | autoDownload,auto_download 47 | availableLocales,available_locales 48 | baseAddress,base_address 49 | blocklistEnabled,blocklist_enabled 50 | buildId,build_id 51 | certSubject,cert_subject 52 | changedFiles,changed_files 53 | changesetID,changeset_id 54 | clientId,client_id 55 | closedTS,closed_ts 56 | connType,conn_type 57 | crashDate,crash_date 58 | createdDate,created_date 59 | createdTimestamp,created_timestamp 60 | creationDate,creation_date 61 | debugID,debug_id 62 | debugName,debug_name 63 | defaultBrowser,default_browser 64 | defaultSearch,default_search 65 | defaultSearchEngine,default_search_engine 66 | defaultSearchEngineData,default_search_engine_data 67 | description,description 68 | detectedUri,detected_uri 69 | detectedVersion,detected_version 70 | deviceID,device_id 71 | displayVersion,display_version 72 | distributionId,distribution_id 73 | distributionVersion,distribution_version 74 | distributorChannel,distributor_channel 75 | driver,driver 76 | driverDate,driver_date 77 | driverVersion,driver_version 78 | e10sCohort,e10s_cohort 79 | e10sEnabled,e10s_enabled 80 | ecosystemClientId,ecosystem_client_id 81 | effectiveContentProcessLevel,effective_content_process_level 82 | encryptedData,encrypted_data 83 | encryptionKeyId,encryption_key_id 84 | engagedTS,engaged_ts 85 | engagementType,engagement_type 86 | errorModules,error_modules 87 | eventId,event_id 88 | expiredTS,expired_ts 89 | fileSize,file_size 90 | fileVersion,file_version 91 | firstUseDate,first_use_date 92 | 
firstView,first_view 93 | flashUsage,flash_usage 94 | flowId,flow_id 95 | globalSettings,global_settings 96 | gpuProcess,gpu_process 97 | hasBinaryComponents,has_binary_components 98 | hasCrashEnvironment,has_crash_environment 99 | hasSync,has_sync 100 | hotfixVersion,hotfix_version 101 | installDay,install_day 102 | installYear,install_year 103 | ipc_channel_error,ipc_channel_error 104 | isDefaultBrowser,is_default_browser 105 | isStartup,is_startup 106 | isStubProfile,is_stub_profile 107 | isSystem,is_system 108 | isTablet,is_tablet 109 | isWow64,is_wow64 110 | kernelVersion,kernel_version 111 | keyedHistograms,keyed_histograms 112 | l2cacheKB,l2cache_kb 113 | l3cacheKB,l3cache_kb 114 | landingSystem,landing_system 115 | lastBuildId,last_build_id 116 | lastVersion,last_version 117 | launcherProcessState,launcher_process_state 118 | learnMoreTS,learn_more_ts 119 | loadDurationMS,load_duration_ms 120 | loadPath,load_path 121 | loaderName,loader_name 122 | lostEventsCount,lost_events_count 123 | memoryMB,memory_mb 124 | mimeTypes,mime_types 125 | moduleName,module_name 126 | moduleTrustFlags,module_trust_flags 127 | offeredTS,offered_ts 128 | osName,os_name 129 | osVersion,os_version 130 | packetVersion,packet_version 131 | pageId,page_id 132 | pageSpecific,page_specific 133 | partnerId,partner_id 134 | partnerNames,partner_names 135 | pingDiscardedForSize,ping_discarded_for_size 136 | pioneerAddonMetadata,pioneer_addon_metadata 137 | pioneerId,pioneer_id 138 | pioneerUtilsVersion,pioneer_utils_version 139 | placesBookmarksCount,places_bookmarks_count 140 | placesPagesCount,places_pages_count 141 | platformVersion,platform_version 142 | pocketId,pocket_id 143 | previousBuildId,previous_build_id 144 | previousChannel,previous_channel 145 | previousVersion,previous_version 146 | prioData,prio_data 147 | processStartTimestamp,process_start_timestamp 148 | processType,process_type 149 | processUptimeMS,process_uptime_ms 150 | profileCreationDate,profile_creation_date 151 | profileDate,profile_date 152 | profileSubsessionCounter,profile_subsession_counter 153 | promptResponse,prompt_response 154 | pseudoDisplay,pseudo_display 155 | pushDate,push_date 156 | refreshRate,refresh_rate 157 | regionalPrefsLocales,regional_prefs_locales 158 | rememberCheckbox,remember_checkbox 159 | requestedLocales,requested_locales 160 | resetDate,reset_date 161 | responseTime,response_time 162 | reviewSystemUsed,review_system_used 163 | runId,run_id 164 | schemaName,schema_name 165 | schemaVersion,schema_version 166 | screenHeight,screen_height 167 | screenWidth,screen_width 168 | searchCohort,search_cohort 169 | searchCounts,search_counts 170 | sendFailure,send_failure 171 | servicePackMajor,service_pack_major 172 | servicePackMinor,service_pack_minor 173 | sessionId,session_id 174 | sessionState,session_state 175 | settingsChanged,settings_changed 176 | showTrackerStatsShare,show_tracker_stats_share 177 | signedState,signed_state 178 | sourcesJson,sources_json 179 | spbeMaxConcurrentTabCount,spbe_max_concurrent_tab_count 180 | spbeMaxConcurrentWindowCount,spbe_max_concurrent_window_count 181 | spbeNavigationAboutNewtab,spbe_navigation_about_newtab 182 | spbeNavigationContextmenu,spbe_navigation_contextmenu 183 | spbeNavigationSearchbar,spbe_navigation_searchbar 184 | spbeNavigationUrlbar,spbe_navigation_urlbar 185 | spbeTabOpenEventCount,spbe_tab_open_event_count 186 | spbeTotalUriCount,spbe_total_uri_count 187 | spbeUnfilteredUriCount,spbe_unfiltered_uri_count 188 | 
spbeUniqueDomainsCount,spbe_unique_domains_count 189 | spbeWindowOpenEventCount,spbe_window_open_event_count 190 | speedMHz,speed_m_hz 191 | sqlTableName,sql_table_name 192 | standardDeviation,standard_deviation 193 | structVersion,struct_version 194 | studyName,study_name 195 | submissionURL,submission_url 196 | subsessionId,subsession_id 197 | subsessionLength,subsession_length 198 | subsessionStartDate,subsession_start_date 199 | subsysID,subsys_id 200 | surveyId,survey_id 201 | surveyVersion,survey_version 202 | systemCpuCores,system_cpu_cores 203 | systemCpuSpeedMhz,system_cpu_speed_mhz 204 | systemGfxMonitors1ScreenWidth,system_gfx_monitors1_screen_width 205 | systemGfxMonitors1ScreenWidthZeroIndexed,system_gfx_monitors1_screen_width_zero_indexed 206 | systemLocales,system_locales 207 | systemMemoryMb,system_memory_mb 208 | tableName,table_name 209 | targetBuildId,target_build_id 210 | targetChannel,target_channel 211 | targetDisplayVersion,target_display_version 212 | targetVersion,target_version 213 | telemetryEnabled,telemetry_enabled 214 | textureSharing,texture_sharing 215 | threadID,thread_id 216 | threadName,thread_name 217 | timezoneOffest,timezone_offest 218 | totalBlockedAudibleMedia,total_blocked_audible_media 219 | totalPages,total_pages 220 | totalPagesAM,total_pages_am 221 | totalTime,total_time 222 | updateDay,update_day 223 | updaterAvailable,updater_available 224 | userDisabled,user_disabled 225 | vendorID,vendor_id 226 | virtualMaxMB,virtual_max_mb 227 | votedTS,voted_ts 228 | windowClosedTS,window_closed_ts 229 | windowsBuildNumber,windows_build_number 230 | windowsUBR,windows_ubr 231 | xpcomAbi,xpcom_abi 232 | xulLoadDurationMS,xul_load_duration_ms 233 | -------------------------------------------------------------------------------- /tests/resources/casing/word_4.csv: -------------------------------------------------------------------------------- 1 | AAAA,aaaa 2 | AAAa,aa_aa 3 | AAA7,aaa7 4 | AAA_,aaa 5 | AAaA,a_aa_a 6 | AAaa,a_aaa 7 | AAa7,a_aa7 8 | AAa_,a_aa 9 | AA7A,aa7a 10 | AA7a,aa7a 11 | AA77,aa77 12 | AA7_,aa7 13 | AA_A,aa_a 14 | AA_a,aa_a 15 | AA_7,aa_7 16 | AA__,aa 17 | AaAA,aa_aa 18 | AaAa,aa_aa 19 | AaA7,aa_a7 20 | AaA_,aa_a 21 | AaaA,aaa_a 22 | Aaaa,aaaa 23 | Aaa7,aaa7 24 | Aaa_,aaa 25 | Aa7A,aa7_a 26 | Aa7a,aa7a 27 | Aa77,aa77 28 | Aa7_,aa7 29 | Aa_A,aa_a 30 | Aa_a,aa_a 31 | Aa_7,aa_7 32 | Aa__,aa 33 | A7AA,a7aa 34 | A7Aa,a7_aa 35 | A7A7,a7a7 36 | A7A_,a7a 37 | A7aA,a7a_a 38 | A7aa,a7aa 39 | A7a7,a7a7 40 | A7a_,a7a 41 | A77A,a77a 42 | A77a,a77a 43 | A777,a777 44 | A77_,a77 45 | A7_A,a7_a 46 | A7_a,a7_a 47 | A7_7,a7_7 48 | A7__,a7 49 | A_AA,a_aa 50 | A_Aa,a_aa 51 | A_A7,a_a7 52 | A_A_,a_a 53 | A_aA,a_a_a 54 | A_aa,a_aa 55 | A_a7,a_a7 56 | A_a_,a_a 57 | A_7A,a_7a 58 | A_7a,a_7a 59 | A_77,a_77 60 | A_7_,a_7 61 | A__A,a_a 62 | A__a,a_a 63 | A__7,a_7 64 | A___,a 65 | aAAA,a_aaa 66 | aAAa,a_a_aa 67 | aAA7,a_aa7 68 | aAA_,a_aa 69 | aAaA,a_aa_a 70 | aAaa,a_aaa 71 | aAa7,a_aa7 72 | aAa_,a_aa 73 | aA7A,a_a7a 74 | aA7a,a_a7a 75 | aA77,a_a77 76 | aA7_,a_a7 77 | aA_A,a_a_a 78 | aA_a,a_a_a 79 | aA_7,a_a_7 80 | aA__,a_a 81 | aaAA,aa_aa 82 | aaAa,aa_aa 83 | aaA7,aa_a7 84 | aaA_,aa_a 85 | aaaA,aaa_a 86 | aaaa,aaaa 87 | aaa7,aaa7 88 | aaa_,aaa 89 | aa7A,aa7_a 90 | aa7a,aa7a 91 | aa77,aa77 92 | aa7_,aa7 93 | aa_A,aa_a 94 | aa_a,aa_a 95 | aa_7,aa_7 96 | aa__,aa 97 | a7AA,a7_aa 98 | a7Aa,a7_aa 99 | a7A7,a7_a7 100 | a7A_,a7_a 101 | a7aA,a7a_a 102 | a7aa,a7aa 103 | a7a7,a7a7 104 | a7a_,a7a 105 | a77A,a77_a 106 | a77a,a77a 107 | a777,a777 108 | a77_,a77 109 | a7_A,a7_a 110 
| a7_a,a7_a 111 | a7_7,a7_7 112 | a7__,a7 113 | a_AA,a_aa 114 | a_Aa,a_aa 115 | a_A7,a_a7 116 | a_A_,a_a 117 | a_aA,a_a_a 118 | a_aa,a_aa 119 | a_a7,a_a7 120 | a_a_,a_a 121 | a_7A,a_7a 122 | a_7a,a_7a 123 | a_77,a_77 124 | a_7_,a_7 125 | a__A,a_a 126 | a__a,a_a 127 | a__7,a_7 128 | a___,a 129 | 7AAA,7aaa 130 | 7AAa,7a_aa 131 | 7AA7,7aa7 132 | 7AA_,7aa 133 | 7AaA,7aa_a 134 | 7Aaa,7aaa 135 | 7Aa7,7aa7 136 | 7Aa_,7aa 137 | 7A7A,7a7a 138 | 7A7a,7a7a 139 | 7A77,7a77 140 | 7A7_,7a7 141 | 7A_A,7a_a 142 | 7A_a,7a_a 143 | 7A_7,7a_7 144 | 7A__,7a 145 | 7aAA,7a_aa 146 | 7aAa,7a_aa 147 | 7aA7,7a_a7 148 | 7aA_,7a_a 149 | 7aaA,7aa_a 150 | 7aaa,7aaa 151 | 7aa7,7aa7 152 | 7aa_,7aa 153 | 7a7A,7a7_a 154 | 7a7a,7a7a 155 | 7a77,7a77 156 | 7a7_,7a7 157 | 7a_A,7a_a 158 | 7a_a,7a_a 159 | 7a_7,7a_7 160 | 7a__,7a 161 | 77AA,77aa 162 | 77Aa,77aa 163 | 77A7,77a7 164 | 77A_,77a 165 | 77aA,77a_a 166 | 77aa,77aa 167 | 77a7,77a7 168 | 77a_,77a 169 | 777A,777a 170 | 777a,777a 171 | 7777,7777 172 | 777_,777 173 | 77_A,77_a 174 | 77_a,77_a 175 | 77_7,77_7 176 | 77__,77 177 | 7_AA,7_aa 178 | 7_Aa,7_aa 179 | 7_A7,7_a7 180 | 7_A_,7_a 181 | 7_aA,7_a_a 182 | 7_aa,7_aa 183 | 7_a7,7_a7 184 | 7_a_,7_a 185 | 7_7A,7_7a 186 | 7_7a,7_7a 187 | 7_77,7_77 188 | 7_7_,7_7 189 | 7__A,7_a 190 | 7__a,7_a 191 | 7__7,7_7 192 | 7___,7 193 | _AAA,aaa 194 | _AAa,a_aa 195 | _AA7,aa7 196 | _AA_,aa 197 | _AaA,aa_a 198 | _Aaa,aaa 199 | _Aa7,aa7 200 | _Aa_,aa 201 | _A7A,a7a 202 | _A7a,a7a 203 | _A77,a77 204 | _A7_,a7 205 | _A_A,a_a 206 | _A_a,a_a 207 | _A_7,a_7 208 | _A__,a 209 | _aAA,a_aa 210 | _aAa,a_aa 211 | _aA7,a_a7 212 | _aA_,a_a 213 | _aaA,aa_a 214 | _aaa,aaa 215 | _aa7,aa7 216 | _aa_,aa 217 | _a7A,a7_a 218 | _a7a,a7a 219 | _a77,a77 220 | _a7_,a7 221 | _a_A,a_a 222 | _a_a,a_a 223 | _a_7,a_7 224 | _a__,a 225 | _7AA,7aa 226 | _7Aa,7aa 227 | _7A7,7a7 228 | _7A_,7a 229 | _7aA,7a_a 230 | _7aa,7aa 231 | _7a7,7a7 232 | _7a_,7a 233 | _77A,77a 234 | _77a,77a 235 | _777,777 236 | _77_,77 237 | _7_A,7_a 238 | _7_a,7_a 239 | _7_7,7_7 240 | _7__,7 241 | __AA,aa 242 | __Aa,aa 243 | __A7,a7 244 | __A_,a 245 | __aA,a_a 246 | __aa,aa 247 | __a7,a7 248 | __a_,a 249 | __7A,7a 250 | __7a,7a 251 | __77,77 252 | __7_,7 253 | ___A,a 254 | ___a,a 255 | ___7,7 256 | ____, 257 | -------------------------------------------------------------------------------- /tests/resources/translate/array.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "array", 3 | "tests": [ 4 | { 5 | "name": "test_array_with_atomics", 6 | "compatible": true, 7 | "test": { 8 | "avro": { 9 | "items": { 10 | "type": "long" 11 | }, 12 | "type": "array" 13 | }, 14 | "bigquery": [ 15 | { 16 | "mode": "REPEATED", 17 | "name": "root", 18 | "type": "INT64" 19 | } 20 | ], 21 | "json": { 22 | "items": { 23 | "type": "integer" 24 | }, 25 | "type": "array" 26 | } 27 | } 28 | }, 29 | { 30 | "name": "test_array_with_complex", 31 | "compatible": true, 32 | "test": { 33 | "avro": { 34 | "items": { 35 | "fields": [ 36 | { 37 | "default": null, 38 | "name": "field_1", 39 | "type": [ 40 | { 41 | "type": "null" 42 | }, 43 | { 44 | "type": "string" 45 | } 46 | ] 47 | }, 48 | { 49 | "default": null, 50 | "name": "field_2", 51 | "type": [ 52 | { 53 | "type": "null" 54 | }, 55 | { 56 | "type": "long" 57 | } 58 | ] 59 | } 60 | ], 61 | "name": "list", 62 | "namespace": "root", 63 | "type": "record" 64 | }, 65 | "type": "array" 66 | }, 67 | "bigquery": [ 68 | { 69 | "fields": [ 70 | { 71 | "mode": "NULLABLE", 72 | "name": "field_1", 73 | "type": "STRING" 74 | }, 75 | { 76 
| "mode": "NULLABLE", 77 | "name": "field_2", 78 | "type": "INT64" 79 | } 80 | ], 81 | "mode": "REPEATED", 82 | "name": "root", 83 | "type": "RECORD" 84 | } 85 | ], 86 | "json": { 87 | "items": { 88 | "properties": { 89 | "field_1": { 90 | "type": "string" 91 | }, 92 | "field_2": { 93 | "type": "integer" 94 | } 95 | }, 96 | "type": "object" 97 | }, 98 | "type": "array" 99 | } 100 | } 101 | }, 102 | { 103 | "name": "test_array_of_array", 104 | "compatible": true, 105 | "test": { 106 | "avro": { 107 | "fields": [ 108 | { 109 | "name": "array", 110 | "type": { 111 | "items": { 112 | "fields": [ 113 | { 114 | "name": "list", 115 | "type": { 116 | "items": { 117 | "fields": [ 118 | { 119 | "name": "list", 120 | "type": { 121 | "items": { 122 | "type": "long" 123 | }, 124 | "type": "array" 125 | } 126 | } 127 | ], 128 | "name": "list", 129 | "namespace": "root.array", 130 | "type": "record" 131 | }, 132 | "type": "array" 133 | } 134 | } 135 | ], 136 | "name": "array", 137 | "namespace": "root", 138 | "type": "record" 139 | }, 140 | "type": "array" 141 | } 142 | } 143 | ], 144 | "name": "root", 145 | "type": "record" 146 | }, 147 | "bigquery": [ 148 | { 149 | "fields": [ 150 | { 151 | "fields": [ 152 | { 153 | "mode": "REPEATED", 154 | "name": "list", 155 | "type": "INT64" 156 | } 157 | ], 158 | "mode": "REPEATED", 159 | "name": "list", 160 | "type": "RECORD" 161 | } 162 | ], 163 | "mode": "REPEATED", 164 | "name": "array", 165 | "type": "RECORD" 166 | } 167 | ], 168 | "json": { 169 | "properties": { 170 | "array": { 171 | "items": { 172 | "items": { 173 | "items": { 174 | "type": "integer" 175 | }, 176 | "type": "array" 177 | }, 178 | "type": "array" 179 | }, 180 | "type": "array" 181 | } 182 | }, 183 | "required": [ 184 | "array" 185 | ], 186 | "type": "object" 187 | } 188 | } 189 | } 190 | ] 191 | } 192 | -------------------------------------------------------------------------------- /tests/resources/translate/atomic.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "atomic", 3 | "tests": [ 4 | { 5 | "name": "test_atomic", 6 | "compatible": true, 7 | "test": { 8 | "avro": { 9 | "type": "long" 10 | }, 11 | "bigquery": [ 12 | { 13 | "mode": "REQUIRED", 14 | "name": "root", 15 | "type": "INT64" 16 | } 17 | ], 18 | "json": { 19 | "type": "integer" 20 | } 21 | } 22 | }, 23 | { 24 | "name": "test_atomic_with_null", 25 | "compatible": true, 26 | "test": { 27 | "avro": [ 28 | { 29 | "type": "null" 30 | }, 31 | { 32 | "type": "long" 33 | } 34 | ], 35 | "bigquery": [ 36 | { 37 | "mode": "NULLABLE", 38 | "name": "root", 39 | "type": "INT64" 40 | } 41 | ], 42 | "json": { 43 | "type": [ 44 | "integer", 45 | "null" 46 | ] 47 | } 48 | } 49 | }, 50 | { 51 | "description": [ 52 | "Test that overlapping types are treated as json blobs." 53 | ], 54 | "name": "test_incompatible_atomic_multitype", 55 | "compatible": false, 56 | "test": { 57 | "avro": { 58 | "type": "string" 59 | }, 60 | "bigquery": [ 61 | { 62 | "mode": "REQUIRED", 63 | "name": "root", 64 | "type": "STRING" 65 | } 66 | ], 67 | "json": { 68 | "type": [ 69 | "boolean", 70 | "integer" 71 | ] 72 | } 73 | } 74 | }, 75 | { 76 | "description": [ 77 | "Test that overlapping types that can be null are nullable json blobs.", 78 | "A field is null if any of it's types are null." 
79 | ], 80 | "name": "test_incompatible_atomic_multitype_with_null", 81 | "compatible": false, 82 | "test": { 83 | "avro": [ 84 | { 85 | "type": "null" 86 | }, 87 | { 88 | "type": "string" 89 | } 90 | ], 91 | "bigquery": [ 92 | { 93 | "mode": "NULLABLE", 94 | "name": "root", 95 | "type": "STRING" 96 | } 97 | ], 98 | "json": { 99 | "type": [ 100 | "boolean", 101 | "integer", 102 | "null" 103 | ] 104 | } 105 | } 106 | }, 107 | { 108 | "name": "test_datetime", 109 | "compatible": true, 110 | "test": { 111 | "avro": { 112 | "type": "string" 113 | }, 114 | "bigquery": [ 115 | { 116 | "mode": "REQUIRED", 117 | "name": "root", 118 | "type": "TIMESTAMP" 119 | } 120 | ], 121 | "json": { 122 | "format": "date-time", 123 | "type": "string" 124 | } 125 | } 126 | }, 127 | { 128 | "description": [ 129 | "Test that strings can be cast into byte strings" 130 | ], 131 | "name": "test_bytes_format", 132 | "compatible": true, 133 | "test": { 134 | "avro": { 135 | "type": "bytes" 136 | }, 137 | "bigquery": [ 138 | { 139 | "mode": "REQUIRED", 140 | "name": "root", 141 | "type": "BYTES" 142 | } 143 | ], 144 | "json": { 145 | "format": "bytes", 146 | "type": "string" 147 | } 148 | } 149 | }, 150 | { 151 | "name": "test_atomic_with_description", 152 | "compatible": true, 153 | "test": { 154 | "avro": { 155 | "type": "long" 156 | }, 157 | "bigquery": [ 158 | { 159 | "description": "test description", 160 | "mode": "REQUIRED", 161 | "name": "root", 162 | "type": "INT64" 163 | } 164 | ], 165 | "json": { 166 | "description": "test description", 167 | "type": "integer" 168 | } 169 | } 170 | }, 171 | { 172 | "name": "test_atomic_with_description_and_title", 173 | "compatible": true, 174 | "test": { 175 | "avro": { 176 | "type": "long" 177 | }, 178 | "bigquery": [ 179 | { 180 | "description": "test title - test description", 181 | "mode": "REQUIRED", 182 | "name": "root", 183 | "type": "INT64" 184 | } 185 | ], 186 | "json": { 187 | "description": "test description", 188 | "title": "test title", 189 | "type": "integer" 190 | } 191 | } 192 | }, 193 | { 194 | "name": "test_atomic_with_title", 195 | "compatible": true, 196 | "test": { 197 | "avro": { 198 | "type": "long" 199 | }, 200 | "bigquery": [ 201 | { 202 | "description": "test title", 203 | "mode": "REQUIRED", 204 | "name": "root", 205 | "type": "INT64" 206 | } 207 | ], 208 | "json": { 209 | "title": "test title", 210 | "type": "integer" 211 | } 212 | } 213 | } 214 | ] 215 | } 216 | -------------------------------------------------------------------------------- /tests/resources/translate/json_column.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "json", 3 | "tests": [ 4 | { 5 | "name": "test_json_object", 6 | "compatible": true, 7 | "test": { 8 | "avro": null, 9 | "bigquery": [ 10 | { 11 | "mode": "NULLABLE", 12 | "name": "an_object_name", 13 | "type": "JSON" 14 | } 15 | ], 16 | "json": { 17 | "properties": { 18 | "an_object_name": { 19 | "items": {}, 20 | "type": [ 21 | "object", 22 | "array" 23 | ] 24 | } 25 | } 26 | }, 27 | "context": { 28 | "json_object_path_regex": "an_object_name" 29 | } 30 | } 31 | }, 32 | { 33 | "name": "test_json_object_no_avro_support", 34 | "compatible": false, 35 | "test": { 36 | "avro": { 37 | "fields": [ 38 | { 39 | "default": null, 40 | "name": "an_object_name", 41 | "type": [ 42 | { 43 | "type": "null" 44 | }, 45 | { 46 | "type": "string" 47 | } 48 | ] 49 | } 50 | ], 51 | "name": "root", 52 | "type": "record" 53 | }, 54 | "bigquery": "no schema -- we need it to panic", 55 | 
"json": { 56 | "properties": { 57 | "an_object_name": { 58 | "items": {}, 59 | "type": [ 60 | "object", 61 | "array" 62 | ] 63 | } 64 | } 65 | }, 66 | "context": { 67 | "json_object_path_regex": "an_object_name" 68 | } 69 | } 70 | }, 71 | { 72 | "name": "test_json_object_nested", 73 | "compatible": true, 74 | "test": { 75 | "avro": null, 76 | "bigquery": [ 77 | { 78 | "fields": [ 79 | { 80 | "mode": "NULLABLE", 81 | "name": "an_object_name", 82 | "type": "JSON" 83 | } 84 | ], 85 | "mode": "NULLABLE", 86 | "name": "object", 87 | "type": "RECORD" 88 | } 89 | ], 90 | "json": { 91 | "properties": { 92 | "object": { 93 | "properties": { 94 | "an_object_name": { 95 | "items": {}, 96 | "type": [ 97 | "object", 98 | "array" 99 | ] 100 | } 101 | }, 102 | "type": "object" 103 | } 104 | } 105 | }, 106 | "context": { 107 | "json_object_path_regex": "object\\..*" 108 | } 109 | } 110 | } 111 | ] 112 | } 113 | -------------------------------------------------------------------------------- /tests/resources/translate/map.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": [ 3 | "Test the behavior of repeated key-value structures.", 4 | "This is influenced strongly by the data-structures used in collecting metrics.", 5 | "They have different names but common structure.", 6 | "This type of output structure can be handled efficiently with the use of `UNNEST` and projections.", 7 | "An alternative is to dump the entire structure to JSON and use javascript UDFs to handle processing." 8 | ], 9 | "name": "map", 10 | "tests": [ 11 | { 12 | "name": "test_map_with_atomics", 13 | "compatible": true, 14 | "test": { 15 | "avro": { 16 | "type": "map", 17 | "values": { 18 | "type": "long" 19 | } 20 | }, 21 | "bigquery": [ 22 | { 23 | "fields": [ 24 | { 25 | "mode": "REQUIRED", 26 | "name": "key", 27 | "type": "STRING" 28 | }, 29 | { 30 | "mode": "REQUIRED", 31 | "name": "value", 32 | "type": "INT64" 33 | } 34 | ], 35 | "mode": "REPEATED", 36 | "name": "root", 37 | "type": "RECORD" 38 | } 39 | ], 40 | "json": { 41 | "additionalProperties": { 42 | "type": "integer" 43 | }, 44 | "type": "object" 45 | } 46 | } 47 | }, 48 | { 49 | "name": "test_map_with_complex", 50 | "compatible": true, 51 | "test": { 52 | "avro": { 53 | "type": "map", 54 | "values": { 55 | "fields": [ 56 | { 57 | "default": null, 58 | "name": "field_1", 59 | "type": [ 60 | { 61 | "type": "null" 62 | }, 63 | { 64 | "type": "string" 65 | } 66 | ] 67 | }, 68 | { 69 | "default": null, 70 | "name": "field_2", 71 | "type": [ 72 | { 73 | "type": "null" 74 | }, 75 | { 76 | "type": "long" 77 | } 78 | ] 79 | } 80 | ], 81 | "name": "value", 82 | "namespace": "root", 83 | "type": "record" 84 | } 85 | }, 86 | "bigquery": [ 87 | { 88 | "description": "root description", 89 | "fields": [ 90 | { 91 | "mode": "REQUIRED", 92 | "name": "key", 93 | "type": "STRING" 94 | }, 95 | { 96 | "description": "object description", 97 | "fields": [ 98 | { 99 | "description": "field description", 100 | "mode": "NULLABLE", 101 | "name": "field_1", 102 | "type": "STRING" 103 | }, 104 | { 105 | "mode": "NULLABLE", 106 | "name": "field_2", 107 | "type": "INT64" 108 | } 109 | ], 110 | "mode": "REQUIRED", 111 | "name": "value", 112 | "type": "RECORD" 113 | } 114 | ], 115 | "mode": "REPEATED", 116 | "name": "root", 117 | "type": "RECORD" 118 | } 119 | ], 120 | "json": { 121 | "additionalProperties": { 122 | "description": "object description", 123 | "properties": { 124 | "field_1": { 125 | "description": "field description", 126 | 
"type": "string" 127 | }, 128 | "field_2": { 129 | "type": "integer" 130 | } 131 | }, 132 | "type": "object" 133 | }, 134 | "description": "root description", 135 | "type": "object" 136 | } 137 | } 138 | }, 139 | { 140 | "name": "test_map_with_pattern_properties", 141 | "compatible": true, 142 | "test": { 143 | "avro": { 144 | "type": "map", 145 | "values": { 146 | "type": "long" 147 | } 148 | }, 149 | "bigquery": [ 150 | { 151 | "fields": [ 152 | { 153 | "mode": "REQUIRED", 154 | "name": "key", 155 | "type": "STRING" 156 | }, 157 | { 158 | "mode": "REQUIRED", 159 | "name": "value", 160 | "type": "INT64" 161 | } 162 | ], 163 | "mode": "REPEATED", 164 | "name": "root", 165 | "type": "RECORD" 166 | } 167 | ], 168 | "json": { 169 | "additionalProperties": false, 170 | "patternProperties": { 171 | ".+": { 172 | "type": "integer" 173 | } 174 | }, 175 | "type": "object" 176 | } 177 | } 178 | }, 179 | { 180 | "name": "test_map_with_pattern_and_additional_properties", 181 | "compatible": true, 182 | "test": { 183 | "avro": { 184 | "type": "map", 185 | "values": { 186 | "type": "long" 187 | } 188 | }, 189 | "bigquery": [ 190 | { 191 | "fields": [ 192 | { 193 | "mode": "REQUIRED", 194 | "name": "key", 195 | "type": "STRING" 196 | }, 197 | { 198 | "mode": "REQUIRED", 199 | "name": "value", 200 | "type": "INT64" 201 | } 202 | ], 203 | "mode": "REPEATED", 204 | "name": "root", 205 | "type": "RECORD" 206 | } 207 | ], 208 | "json": { 209 | "additionalProperties": { 210 | "type": "integer" 211 | }, 212 | "patternProperties": { 213 | ".+": { 214 | "type": "integer" 215 | } 216 | }, 217 | "type": "object" 218 | } 219 | } 220 | }, 221 | { 222 | "name": "test_incompatible_map_with_pattern_properties", 223 | "compatible": false, 224 | "test": { 225 | "avro": { 226 | "type": "map", 227 | "values": { 228 | "type": "string" 229 | } 230 | }, 231 | "bigquery": [ 232 | { 233 | "fields": [ 234 | { 235 | "mode": "REQUIRED", 236 | "name": "key", 237 | "type": "STRING" 238 | }, 239 | { 240 | "mode": "REQUIRED", 241 | "name": "value", 242 | "type": "STRING" 243 | } 244 | ], 245 | "mode": "REPEATED", 246 | "name": "root", 247 | "type": "RECORD" 248 | } 249 | ], 250 | "json": { 251 | "additionalProperties": false, 252 | "patternProperties": { 253 | "^I_": { 254 | "type": "integer" 255 | }, 256 | "^S_": { 257 | "type": "string" 258 | } 259 | }, 260 | "type": "object" 261 | } 262 | } 263 | }, 264 | { 265 | "name": "test_incompatible_map_with_pattern_and_additional_properties", 266 | "compatible": false, 267 | "test": { 268 | "avro": { 269 | "type": "map", 270 | "values": { 271 | "type": "string" 272 | } 273 | }, 274 | "bigquery": [ 275 | { 276 | "fields": [ 277 | { 278 | "mode": "REQUIRED", 279 | "name": "key", 280 | "type": "STRING" 281 | }, 282 | { 283 | "mode": "REQUIRED", 284 | "name": "value", 285 | "type": "STRING" 286 | } 287 | ], 288 | "mode": "REPEATED", 289 | "name": "root", 290 | "type": "RECORD" 291 | } 292 | ], 293 | "json": { 294 | "additionalProperties": { 295 | "type": "integer" 296 | }, 297 | "patternProperties": { 298 | ".+": { 299 | "type": "string" 300 | } 301 | }, 302 | "type": "object" 303 | } 304 | } 305 | } 306 | ] 307 | } 308 | -------------------------------------------------------------------------------- /tests/resources/translate/object.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "object", 3 | "tests": [ 4 | { 5 | "description": [ 6 | "Test that fields are sorted in a record.", 7 | "Sorting makes the output schema deterministic." 
8 | ], 9 | "name": "test_object_with_atomics_is_sorted", 10 | "compatible": true, 11 | "test": { 12 | "avro": { 13 | "fields": [ 14 | { 15 | "default": null, 16 | "name": "field_1", 17 | "type": [ 18 | { 19 | "type": "null" 20 | }, 21 | { 22 | "type": "long" 23 | } 24 | ] 25 | }, 26 | { 27 | "default": null, 28 | "name": "field_2", 29 | "type": [ 30 | { 31 | "type": "null" 32 | }, 33 | { 34 | "type": "string" 35 | } 36 | ] 37 | }, 38 | { 39 | "default": null, 40 | "name": "field_3", 41 | "type": [ 42 | { 43 | "type": "null" 44 | }, 45 | { 46 | "type": "boolean" 47 | } 48 | ] 49 | }, 50 | { 51 | "default": null, 52 | "name": "field_4", 53 | "type": [ 54 | { 55 | "type": "null" 56 | }, 57 | { 58 | "type": "double" 59 | } 60 | ] 61 | } 62 | ], 63 | "name": "root", 64 | "type": "record" 65 | }, 66 | "bigquery": [ 67 | { 68 | "mode": "NULLABLE", 69 | "name": "field_1", 70 | "type": "INT64" 71 | }, 72 | { 73 | "mode": "NULLABLE", 74 | "name": "field_2", 75 | "type": "STRING" 76 | }, 77 | { 78 | "mode": "NULLABLE", 79 | "name": "field_3", 80 | "type": "BOOL" 81 | }, 82 | { 83 | "mode": "NULLABLE", 84 | "name": "field_4", 85 | "type": "FLOAT64" 86 | } 87 | ], 88 | "json": { 89 | "properties": { 90 | "field_1": { 91 | "type": "integer" 92 | }, 93 | "field_2": { 94 | "type": "string" 95 | }, 96 | "field_3": { 97 | "type": "boolean" 98 | }, 99 | "field_4": { 100 | "type": "number" 101 | } 102 | }, 103 | "type": "object" 104 | } 105 | } 106 | }, 107 | { 108 | "description": [ 109 | "Test that required fields have the required mode.", 110 | "This changes the mode of the underlying atomic field." 111 | ], 112 | "name": "test_object_with_atomics_required", 113 | "compatible": true, 114 | "test": { 115 | "avro": { 116 | "fields": [ 117 | { 118 | "name": "field_1", 119 | "type": { 120 | "type": "long" 121 | } 122 | }, 123 | { 124 | "default": null, 125 | "name": "field_2", 126 | "type": [ 127 | { 128 | "type": "null" 129 | }, 130 | { 131 | "type": "string" 132 | } 133 | ] 134 | }, 135 | { 136 | "name": "field_3", 137 | "type": { 138 | "type": "boolean" 139 | } 140 | } 141 | ], 142 | "name": "root", 143 | "type": "record" 144 | }, 145 | "bigquery": [ 146 | { 147 | "mode": "REQUIRED", 148 | "name": "field_1", 149 | "type": "INT64" 150 | }, 151 | { 152 | "mode": "NULLABLE", 153 | "name": "field_2", 154 | "type": "STRING" 155 | }, 156 | { 157 | "mode": "REQUIRED", 158 | "name": "field_3", 159 | "type": "BOOL" 160 | } 161 | ], 162 | "json": { 163 | "properties": { 164 | "field_1": { 165 | "type": "integer" 166 | }, 167 | "field_2": { 168 | "type": "string" 169 | }, 170 | "field_3": { 171 | "type": "boolean" 172 | } 173 | }, 174 | "required": [ 175 | "field_1", 176 | "field_3" 177 | ], 178 | "type": "object" 179 | } 180 | } 181 | }, 182 | { 183 | "description": [ 184 | "Test the output of a nullable required field.", 185 | "The field is casted from nullable to required at the object level.", 186 | "Since the underlying field is null, the field is then casted back to nullable." 
187 | ], 188 | "name": "test_object_with_atomics_required_with_null", 189 | "compatible": true, 190 | "test": { 191 | "avro": { 192 | "fields": [ 193 | { 194 | "default": null, 195 | "name": "field_1", 196 | "type": [ 197 | { 198 | "type": "null" 199 | }, 200 | { 201 | "type": "long" 202 | } 203 | ] 204 | }, 205 | { 206 | "default": null, 207 | "name": "field_2", 208 | "type": [ 209 | { 210 | "type": "null" 211 | }, 212 | { 213 | "type": "string" 214 | } 215 | ] 216 | }, 217 | { 218 | "name": "field_3", 219 | "type": { 220 | "type": "boolean" 221 | } 222 | } 223 | ], 224 | "name": "root", 225 | "type": "record" 226 | }, 227 | "bigquery": [ 228 | { 229 | "mode": "NULLABLE", 230 | "name": "field_1", 231 | "type": "INT64" 232 | }, 233 | { 234 | "mode": "NULLABLE", 235 | "name": "field_2", 236 | "type": "STRING" 237 | }, 238 | { 239 | "mode": "REQUIRED", 240 | "name": "field_3", 241 | "type": "BOOL" 242 | } 243 | ], 244 | "json": { 245 | "properties": { 246 | "field_1": { 247 | "type": [ 248 | "integer", 249 | "null" 250 | ] 251 | }, 252 | "field_2": { 253 | "type": "string" 254 | }, 255 | "field_3": { 256 | "type": "boolean" 257 | } 258 | }, 259 | "required": [ 260 | "field_1", 261 | "field_3" 262 | ], 263 | "type": "object" 264 | } 265 | } 266 | }, 267 | { 268 | "name": "test_object_with_complex", 269 | "compatible": true, 270 | "test": { 271 | "avro": { 272 | "fields": [ 273 | { 274 | "default": null, 275 | "name": "namespace_1", 276 | "type": [ 277 | { 278 | "type": "null" 279 | }, 280 | { 281 | "fields": [ 282 | { 283 | "default": null, 284 | "name": "field_1", 285 | "type": [ 286 | { 287 | "type": "null" 288 | }, 289 | { 290 | "type": "string" 291 | } 292 | ] 293 | }, 294 | { 295 | "default": null, 296 | "name": "field_2", 297 | "type": [ 298 | { 299 | "type": "null" 300 | }, 301 | { 302 | "type": "long" 303 | } 304 | ] 305 | } 306 | ], 307 | "name": "namespace_1", 308 | "namespace": "root", 309 | "type": "record" 310 | } 311 | ] 312 | } 313 | ], 314 | "name": "root", 315 | "type": "record" 316 | }, 317 | "bigquery": [ 318 | { 319 | "fields": [ 320 | { 321 | "description": "field description", 322 | "mode": "NULLABLE", 323 | "name": "field_1", 324 | "type": "STRING" 325 | }, 326 | { 327 | "mode": "NULLABLE", 328 | "name": "field_2", 329 | "type": "INT64" 330 | } 331 | ], 332 | "mode": "NULLABLE", 333 | "name": "namespace_1", 334 | "type": "RECORD" 335 | } 336 | ], 337 | "json": { 338 | "properties": { 339 | "namespace_1": { 340 | "properties": { 341 | "field_1": { 342 | "description": "field description", 343 | "type": "string" 344 | }, 345 | "field_2": { 346 | "type": "integer" 347 | } 348 | }, 349 | "type": "object" 350 | } 351 | }, 352 | "type": "object" 353 | } 354 | } 355 | }, 356 | { 357 | "description": [ 358 | "Empty structs are not supported in BigQuery, so we treat them ", 359 | "as empty documents." 
360 | ], 361 | "name": "test_object_empty_record", 362 | "compatible": false, 363 | "test": { 364 | "avro": { 365 | "type": "string" 366 | }, 367 | "bigquery": [ 368 | { 369 | "mode": "REQUIRED", 370 | "name": "root", 371 | "type": "STRING" 372 | } 373 | ], 374 | "json": { 375 | "properties": {}, 376 | "type": "object" 377 | } 378 | } 379 | } 380 | ] 381 | } 382 | -------------------------------------------------------------------------------- /tests/resources/translate/oneof.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "oneof", 3 | "tests": [ 4 | { 5 | "name": "test_oneof_atomic", 6 | "compatible": true, 7 | "test": { 8 | "avro": { 9 | "type": "long" 10 | }, 11 | "bigquery": [ 12 | { 13 | "mode": "REQUIRED", 14 | "name": "root", 15 | "type": "INT64" 16 | } 17 | ], 18 | "json": { 19 | "oneOf": [ 20 | { 21 | "type": "integer" 22 | }, 23 | { 24 | "type": "integer" 25 | } 26 | ] 27 | } 28 | } 29 | }, 30 | { 31 | "name": "test_oneof_atomic_with_null", 32 | "compatible": true, 33 | "test": { 34 | "avro": [ 35 | { 36 | "type": "null" 37 | }, 38 | { 39 | "type": "long" 40 | } 41 | ], 42 | "bigquery": [ 43 | { 44 | "mode": "NULLABLE", 45 | "name": "root", 46 | "type": "INT64" 47 | } 48 | ], 49 | "json": { 50 | "oneOf": [ 51 | { 52 | "type": "integer" 53 | }, 54 | { 55 | "type": "null" 56 | } 57 | ] 58 | } 59 | } 60 | }, 61 | { 62 | "name": "test_incompatible_oneof_atomic", 63 | "compatible": false, 64 | "test": { 65 | "avro": { 66 | "type": "string" 67 | }, 68 | "bigquery": [ 69 | { 70 | "mode": "REQUIRED", 71 | "name": "root", 72 | "type": "STRING" 73 | } 74 | ], 75 | "json": { 76 | "oneOf": [ 77 | { 78 | "type": "integer" 79 | }, 80 | { 81 | "type": "boolean" 82 | } 83 | ] 84 | } 85 | } 86 | }, 87 | { 88 | "description": [ 89 | "Test a oneOf clause and verify that the mode is NULLABLE.", 90 | "`null` has a logical-OR like behavior when there are choices of types." 
91 | ], 92 | "name": "test_incompatible_oneof_atomic_with_null", 93 | "compatible": false, 94 | "test": { 95 | "avro": [ 96 | { 97 | "type": "null" 98 | }, 99 | { 100 | "type": "string" 101 | } 102 | ], 103 | "bigquery": [ 104 | { 105 | "mode": "NULLABLE", 106 | "name": "root", 107 | "type": "STRING" 108 | } 109 | ], 110 | "json": { 111 | "oneOf": [ 112 | { 113 | "type": [ 114 | "integer", 115 | "null" 116 | ] 117 | }, 118 | { 119 | "type": "boolean" 120 | } 121 | ] 122 | } 123 | } 124 | }, 125 | { 126 | "name": "test_oneof_object_with_atomics", 127 | "compatible": true, 128 | "test": { 129 | "avro": { 130 | "fields": [ 131 | { 132 | "default": null, 133 | "name": "field_1", 134 | "type": [ 135 | { 136 | "type": "null" 137 | }, 138 | { 139 | "type": "long" 140 | } 141 | ] 142 | }, 143 | { 144 | "default": null, 145 | "name": "field_2", 146 | "type": [ 147 | { 148 | "type": "null" 149 | }, 150 | { 151 | "type": "long" 152 | } 153 | ] 154 | } 155 | ], 156 | "name": "root", 157 | "type": "record" 158 | }, 159 | "bigquery": [ 160 | { 161 | "mode": "NULLABLE", 162 | "name": "field_1", 163 | "type": "INT64" 164 | }, 165 | { 166 | "mode": "NULLABLE", 167 | "name": "field_2", 168 | "type": "INT64" 169 | } 170 | ], 171 | "json": { 172 | "oneOf": [ 173 | { 174 | "properties": { 175 | "field_1": { 176 | "type": "integer" 177 | }, 178 | "field_2": { 179 | "type": "integer" 180 | } 181 | }, 182 | "type": "object" 183 | }, 184 | { 185 | "properties": { 186 | "field_1": { 187 | "type": "integer" 188 | }, 189 | "field_2": { 190 | "type": "integer" 191 | } 192 | }, 193 | "type": "object" 194 | } 195 | ] 196 | } 197 | } 198 | }, 199 | { 200 | "description": [ 201 | "Test schemas that share common structure" 202 | ], 203 | "name": "test_oneof_object_merge", 204 | "compatible": true, 205 | "test": { 206 | "avro": { 207 | "fields": [ 208 | { 209 | "default": null, 210 | "name": "field_1", 211 | "type": [ 212 | { 213 | "type": "null" 214 | }, 215 | { 216 | "type": "long" 217 | } 218 | ] 219 | }, 220 | { 221 | "default": null, 222 | "name": "field_2", 223 | "type": [ 224 | { 225 | "type": "null" 226 | }, 227 | { 228 | "type": "boolean" 229 | } 230 | ] 231 | }, 232 | { 233 | "default": null, 234 | "name": "field_3", 235 | "type": [ 236 | { 237 | "type": "null" 238 | }, 239 | { 240 | "type": "double" 241 | } 242 | ] 243 | } 244 | ], 245 | "name": "root", 246 | "type": "record" 247 | }, 248 | "bigquery": [ 249 | { 250 | "mode": "NULLABLE", 251 | "name": "field_1", 252 | "type": "INT64" 253 | }, 254 | { 255 | "mode": "NULLABLE", 256 | "name": "field_2", 257 | "type": "BOOL" 258 | }, 259 | { 260 | "mode": "NULLABLE", 261 | "name": "field_3", 262 | "type": "FLOAT64" 263 | } 264 | ], 265 | "json": { 266 | "oneOf": [ 267 | { 268 | "properties": { 269 | "field_1": { 270 | "type": "integer" 271 | }, 272 | "field_3": { 273 | "type": "number" 274 | } 275 | }, 276 | "type": "object" 277 | }, 278 | { 279 | "properties": { 280 | "field_2": { 281 | "type": "boolean" 282 | }, 283 | "field_3": { 284 | "type": "number" 285 | } 286 | }, 287 | "type": "object" 288 | } 289 | ] 290 | } 291 | } 292 | }, 293 | { 294 | "name": "test_oneof_object_merge_with_complex", 295 | "compatible": true, 296 | "test": { 297 | "avro": { 298 | "fields": [ 299 | { 300 | "default": null, 301 | "name": "field_4", 302 | "type": [ 303 | { 304 | "type": "null" 305 | }, 306 | { 307 | "type": "boolean" 308 | } 309 | ] 310 | }, 311 | { 312 | "default": null, 313 | "name": "field_5", 314 | "type": [ 315 | { 316 | "type": "null" 317 | }, 318 | { 319 | "type": 
"double" 320 | } 321 | ] 322 | }, 323 | { 324 | "default": null, 325 | "name": "namespace_1", 326 | "type": [ 327 | { 328 | "type": "null" 329 | }, 330 | { 331 | "fields": [ 332 | { 333 | "default": null, 334 | "name": "field_1", 335 | "type": [ 336 | { 337 | "type": "null" 338 | }, 339 | { 340 | "type": "long" 341 | } 342 | ] 343 | }, 344 | { 345 | "default": null, 346 | "name": "field_2", 347 | "type": [ 348 | { 349 | "type": "null" 350 | }, 351 | { 352 | "type": "boolean" 353 | } 354 | ] 355 | }, 356 | { 357 | "default": null, 358 | "name": "field_3", 359 | "type": [ 360 | { 361 | "type": "null" 362 | }, 363 | { 364 | "type": "double" 365 | } 366 | ] 367 | } 368 | ], 369 | "name": "namespace_1", 370 | "namespace": "root", 371 | "type": "record" 372 | } 373 | ] 374 | } 375 | ], 376 | "name": "root", 377 | "type": "record" 378 | }, 379 | "bigquery": [ 380 | { 381 | "mode": "NULLABLE", 382 | "name": "field_4", 383 | "type": "BOOL" 384 | }, 385 | { 386 | "mode": "NULLABLE", 387 | "name": "field_5", 388 | "type": "FLOAT64" 389 | }, 390 | { 391 | "fields": [ 392 | { 393 | "mode": "NULLABLE", 394 | "name": "field_1", 395 | "type": "INT64" 396 | }, 397 | { 398 | "mode": "NULLABLE", 399 | "name": "field_2", 400 | "type": "BOOL" 401 | }, 402 | { 403 | "mode": "NULLABLE", 404 | "name": "field_3", 405 | "type": "FLOAT64" 406 | } 407 | ], 408 | "mode": "NULLABLE", 409 | "name": "namespace_1", 410 | "type": "RECORD" 411 | } 412 | ], 413 | "json": { 414 | "oneOf": [ 415 | { 416 | "properties": { 417 | "namespace_1": { 418 | "properties": { 419 | "field_1": { 420 | "type": "integer" 421 | }, 422 | "field_3": { 423 | "type": "number" 424 | } 425 | }, 426 | "type": "object" 427 | } 428 | }, 429 | "type": "object" 430 | }, 431 | { 432 | "properties": { 433 | "namespace_1": { 434 | "properties": { 435 | "field_2": { 436 | "type": "boolean" 437 | }, 438 | "field_3": { 439 | "type": "number" 440 | } 441 | }, 442 | "type": "object" 443 | } 444 | }, 445 | "type": "object" 446 | }, 447 | { 448 | "properties": { 449 | "field_4": { 450 | "type": "boolean" 451 | }, 452 | "field_5": { 453 | "type": "number" 454 | } 455 | }, 456 | "type": "object" 457 | } 458 | ] 459 | } 460 | } 461 | }, 462 | { 463 | "name": "test_incompatible_oneof_atomic_and_object", 464 | "compatible": false, 465 | "test": { 466 | "avro": { 467 | "type": "string" 468 | }, 469 | "bigquery": [ 470 | { 471 | "mode": "REQUIRED", 472 | "name": "root", 473 | "type": "STRING" 474 | } 475 | ], 476 | "json": { 477 | "oneOf": [ 478 | { 479 | "type": "integer" 480 | }, 481 | { 482 | "properties": { 483 | "field_1": { 484 | "type": "integer" 485 | } 486 | }, 487 | "type": "object" 488 | } 489 | ] 490 | } 491 | } 492 | }, 493 | { 494 | "name": "test_incompatible_oneof_object", 495 | "compatible": false, 496 | "test": { 497 | "avro": { 498 | "type": "string" 499 | }, 500 | "bigquery": [ 501 | { 502 | "mode": "REQUIRED", 503 | "name": "root", 504 | "type": "STRING" 505 | } 506 | ], 507 | "json": { 508 | "oneOf": [ 509 | { 510 | "properties": { 511 | "field_1": { 512 | "type": "integer" 513 | } 514 | }, 515 | "type": "object" 516 | }, 517 | { 518 | "properties": { 519 | "field_1": { 520 | "type": "boolean" 521 | } 522 | }, 523 | "type": "object" 524 | } 525 | ] 526 | } 527 | } 528 | }, 529 | { 530 | "description": [ 531 | "Test behavior of creating an incompatible leaf on a complex object.", 532 | "NOTE: A conflict at a node invalidates the entire tree. 
Another ", 533 | "conflict resolution method is to treat diffs as json blobs while ", 534 | "retaining as much structure as possible." 535 | ], 536 | "name": "test_incompatible_oneof_object_with_complex", 537 | "compatible": false, 538 | "test": { 539 | "avro": { 540 | "type": "string" 541 | }, 542 | "bigquery": [ 543 | { 544 | "mode": "REQUIRED", 545 | "name": "root", 546 | "type": "STRING" 547 | } 548 | ], 549 | "json": { 550 | "oneOf": [ 551 | { 552 | "properties": { 553 | "namespace_1": { 554 | "properties": { 555 | "field_1": { 556 | "type": "string" 557 | }, 558 | "field_2": { 559 | "type": "integer" 560 | } 561 | }, 562 | "type": "object" 563 | } 564 | }, 565 | "type": "object" 566 | }, 567 | { 568 | "properties": { 569 | "namespace_1": { 570 | "properties": { 571 | "field_1": { 572 | "type": "boolean" 573 | }, 574 | "field_2": { 575 | "type": "integer" 576 | } 577 | }, 578 | "type": "object" 579 | } 580 | }, 581 | "type": "object" 582 | } 583 | ] 584 | } 585 | } 586 | }, 587 | { 588 | "description": [ 589 | "Test for nullability across two variants with required fields" 590 | ], 591 | "name": "test_oneof_object_merge_nullability", 592 | "compatible": true, 593 | "test": { 594 | "avro": { 595 | "fields": [ 596 | { 597 | "name": "shared", 598 | "type": { 599 | "type": "long" 600 | } 601 | }, 602 | { 603 | "default": null, 604 | "name": "type_a", 605 | "type": [ 606 | { 607 | "type": "null" 608 | }, 609 | { 610 | "type": "long" 611 | } 612 | ] 613 | }, 614 | { 615 | "default": null, 616 | "name": "type_b", 617 | "type": [ 618 | { 619 | "type": "null" 620 | }, 621 | { 622 | "type": "long" 623 | } 624 | ] 625 | } 626 | ], 627 | "name": "root", 628 | "type": "record" 629 | }, 630 | "bigquery": [ 631 | { 632 | "mode": "REQUIRED", 633 | "name": "shared", 634 | "type": "INT64" 635 | }, 636 | { 637 | "mode": "NULLABLE", 638 | "name": "type_a", 639 | "type": "INT64" 640 | }, 641 | { 642 | "mode": "NULLABLE", 643 | "name": "type_b", 644 | "type": "INT64" 645 | } 646 | ], 647 | "json": { 648 | "oneOf": [ 649 | { 650 | "properties": { 651 | "shared": { 652 | "type": "integer" 653 | }, 654 | "type_a": { 655 | "type": "integer" 656 | } 657 | }, 658 | "required": [ 659 | "shared", 660 | "type_a" 661 | ] 662 | }, 663 | { 664 | "properties": { 665 | "shared": { 666 | "type": "integer" 667 | }, 668 | "type_b": { 669 | "type": "integer" 670 | } 671 | }, 672 | "required": [ 673 | "shared", 674 | "type_b" 675 | ] 676 | } 677 | ] 678 | } 679 | } 680 | } 681 | ] 682 | } 683 | -------------------------------------------------------------------------------- /tests/tuple_struct.rs: -------------------------------------------------------------------------------- 1 | use jst::{convert_avro, convert_bigquery}; 2 | use jst::{Context, ResolveMethod}; 3 | use pretty_assertions::assert_eq; 4 | use serde_json::Value; 5 | 6 | fn data_atomic() -> Value { 7 | serde_json::from_str( 8 | r#" 9 | { 10 | "additionalItems": false, 11 | "items": [ 12 | {"type": "boolean"}, 13 | {"type": "string"} 14 | ], 15 | "type": "array" 16 | } 17 | "#, 18 | ) 19 | .unwrap() 20 | } 21 | 22 | fn data_atomic_with_additional_properties() -> Value { 23 | serde_json::from_str( 24 | r#" 25 | { 26 | "additionalItems": { 27 | "type": "integer" 28 | }, 29 | "items": [ 30 | {"type": "boolean"}, 31 | {"type": "string"} 32 | ], 33 | "maxItems": 4, 34 | "type": "array" 35 | } 36 | "#, 37 | ) 38 | .unwrap() 39 | } 40 | 41 | fn data_object_missing() -> Value { 42 | // The second item has an incompatible field, but will be dropped. 
43 | serde_json::from_str( 44 | r#" 45 | { 46 | "additionalItems": false, 47 | "items": [ 48 | {"type": "integer"}, 49 | { 50 | "type": "object", 51 | "properties": { 52 | "first": {"type": "string"}, 53 | "second": {"type": ["string", "object"]} 54 | }, 55 | "required": ["first"] 56 | } 57 | ], 58 | "type": "array" 59 | } 60 | "#, 61 | ) 62 | .unwrap() 63 | } 64 | 65 | fn data_incompatible() -> Value { 66 | serde_json::from_str( 67 | r#" 68 | { 69 | "additionalItems": false, 70 | "items": [ 71 | {"type": ["string", "integer"]} 72 | ] 73 | } 74 | "#, 75 | ) 76 | .unwrap() 77 | } 78 | 79 | #[test] 80 | fn test_avro_tuple_atomic() { 81 | let context = Context { 82 | tuple_struct: true, 83 | ..Default::default() 84 | }; 85 | let expected: Value = serde_json::from_str( 86 | r#" 87 | { 88 | "fields": [ 89 | { 90 | "name": "f0_", 91 | "type": {"type": "boolean"} 92 | }, 93 | { 94 | "name": "f1_", 95 | "type": {"type": "string"} 96 | } 97 | ], 98 | "name": "root", 99 | "type": "record" 100 | } 101 | "#, 102 | ) 103 | .unwrap(); 104 | assert_eq!(expected, convert_avro(&data_atomic(), context)); 105 | } 106 | 107 | #[test] 108 | fn test_bigquery_tuple_atomic() { 109 | let context = Context { 110 | tuple_struct: true, 111 | ..Default::default() 112 | }; 113 | let expected: Value = serde_json::from_str( 114 | r#" 115 | [ 116 | { 117 | "mode": "REQUIRED", 118 | "name": "f0_", 119 | "type": "BOOL" 120 | }, 121 | { 122 | "mode": "REQUIRED", 123 | "name": "f1_", 124 | "type": "STRING" 125 | } 126 | ] 127 | "#, 128 | ) 129 | .unwrap(); 130 | assert_eq!(expected, convert_bigquery(&data_atomic(), context)); 131 | } 132 | 133 | #[test] 134 | fn test_avro_tuple_atomic_with_additional_items() { 135 | let context = Context { 136 | tuple_struct: true, 137 | ..Default::default() 138 | }; 139 | let expected: Value = serde_json::from_str( 140 | r#" 141 | { 142 | "fields": [ 143 | { 144 | "name": "f0_", 145 | "type": {"type": "boolean"} 146 | }, 147 | { 148 | "name": "f1_", 149 | "type": {"type": "string"} 150 | }, 151 | { 152 | "name": "f2_", 153 | "default": null, 154 | "type": [{"type": "null"}, {"type": "long"}] 155 | }, 156 | { 157 | "name": "f3_", 158 | "default": null, 159 | "type": [{"type": "null"}, {"type": "long"}] 160 | } 161 | ], 162 | "name": "root", 163 | "type": "record" 164 | } 165 | "#, 166 | ) 167 | .unwrap(); 168 | assert_eq!( 169 | expected, 170 | convert_avro(&data_atomic_with_additional_properties(), context) 171 | ); 172 | } 173 | 174 | #[test] 175 | fn test_bigquery_tuple_atomic_with_additional_items() { 176 | let context = Context { 177 | tuple_struct: true, 178 | ..Default::default() 179 | }; 180 | let expected: Value = serde_json::from_str( 181 | r#" 182 | [ 183 | { 184 | "mode": "REQUIRED", 185 | "name": "f0_", 186 | "type": "BOOL" 187 | }, 188 | { 189 | "mode": "REQUIRED", 190 | "name": "f1_", 191 | "type": "STRING" 192 | }, 193 | { 194 | "mode": "NULLABLE", 195 | "name": "f2_", 196 | "type": "INT64" 197 | }, 198 | { 199 | "mode": "NULLABLE", 200 | "name": "f3_", 201 | "type": "INT64" 202 | } 203 | ] 204 | "#, 205 | ) 206 | .unwrap(); 207 | assert_eq!( 208 | expected, 209 | convert_bigquery(&data_atomic_with_additional_properties(), context) 210 | ); 211 | } 212 | 213 | /// Objects within tuples are allowed to have extra fields. The decoding tool 214 | /// should preserve the ordering of the items in the tuples. 
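    | /// For example, data_object_missing() converts to fields "f0_" and "f1_" in
    | /// the same order as the schema's "items" list, even though the incompatible
    | /// "second" property inside the second item is dropped.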
215 | #[test] 216 | fn test_avro_tuple_object_drop() { 217 | let context = Context { 218 | tuple_struct: true, 219 | resolve_method: ResolveMethod::Drop, 220 | ..Default::default() 221 | }; 222 | 223 | let expected: Value = serde_json::from_str( 224 | r#" 225 | { 226 | "fields": [ 227 | { 228 | "name": "f0_", 229 | "type": {"type": "long"} 230 | }, 231 | { 232 | "name": "f1_", 233 | "type": { 234 | "name": "f1_", 235 | "namespace": "root", 236 | "type": "record", 237 | "fields": [ 238 | {"name": "first", "type": {"type": "string"}} 239 | ] 240 | } 241 | } 242 | ], 243 | "name": "root", 244 | "type": "record" 245 | } 246 | "#, 247 | ) 248 | .unwrap(); 249 | assert_eq!(expected, convert_avro(&data_object_missing(), context)); 250 | } 251 | 252 | #[test] 253 | fn test_bigquery_tuple_object_drop() { 254 | let context = Context { 255 | tuple_struct: true, 256 | resolve_method: ResolveMethod::Drop, 257 | ..Default::default() 258 | }; 259 | 260 | let expected: Value = serde_json::from_str( 261 | r#" 262 | [ 263 | { 264 | "mode": "REQUIRED", 265 | "name": "f0_", 266 | "type": "INT64" 267 | }, 268 | { 269 | "mode": "REQUIRED", 270 | "name": "f1_", 271 | "type": "RECORD", 272 | "fields": [ 273 | {"name": "first", "type": "STRING", "mode": "REQUIRED"} 274 | ] 275 | } 276 | ] 277 | "#, 278 | ) 279 | .unwrap(); 280 | assert_eq!(expected, convert_bigquery(&data_object_missing(), context)); 281 | } 282 | 283 | #[test] 284 | #[should_panic] 285 | fn test_avro_tuple_object_incompatible() { 286 | let context = Context { 287 | tuple_struct: true, 288 | resolve_method: ResolveMethod::Drop, 289 | ..Default::default() 290 | }; 291 | convert_avro(&data_incompatible(), context); 292 | } 293 | 294 | #[test] 295 | #[should_panic] 296 | fn test_bigquery_tuple_object_incompatible() { 297 | let context = Context { 298 | tuple_struct: true, 299 | resolve_method: ResolveMethod::Drop, 300 | ..Default::default() 301 | }; 302 | convert_bigquery(&data_incompatible(), context); 303 | } 304 | --------------------------------------------------------------------------------
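A minimal sketch of driving the same conversion API outside the test harness, assuming only the jst exports imported in tests/tuple_struct.rs (convert_avro, convert_bigquery, Context, ResolveMethod); the example schema and the main() wiring are illustrative, not taken from the repository:

    use jst::{convert_avro, convert_bigquery, Context};
    use serde_json::Value;

    fn main() {
        // Tuple-style JSON Schema: two fixed items, extra items disallowed.
        let schema: Value = serde_json::from_str(
            r#"{
                "additionalItems": false,
                "items": [{"type": "boolean"}, {"type": "string"}],
                "type": "array"
            }"#,
        )
        .unwrap();

        // Build a fresh Context per call rather than reusing one value, since
        // the tests above only ever pass Context by value to a single
        // conversion and Clone is not assumed here.
        let tuple_context = || Context {
            tuple_struct: true,
            ..Default::default()
        };

        // Expect a record with fields f0_ (boolean) and f1_ (string), matching
        // test_avro_tuple_atomic and test_bigquery_tuple_atomic above.
        let avro = convert_avro(&schema, tuple_context());
        let bigquery = convert_bigquery(&schema, tuple_context());
        println!("{}", serde_json::to_string_pretty(&avro).unwrap());
        println!("{}", serde_json::to_string_pretty(&bigquery).unwrap());
    }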