├── .gitignore ├── CHANGELOG.md ├── Cargo.lock ├── Cargo.toml ├── LICENSE ├── README.md ├── benches ├── bench_wikipedia.rs └── wikipedia-2020-12-21.html ├── examples ├── get_all_href │ ├── index.html │ └── main.rs └── simple_parser │ └── main.rs ├── src ├── dom │ ├── element.rs │ ├── formatting.rs │ ├── mod.rs │ ├── node.rs │ └── span.rs ├── error.rs ├── grammar │ ├── mod.rs │ └── rules.pest └── lib.rs └── tests ├── bin.rs ├── comments.rs ├── document.rs ├── document_empty.rs ├── document_fragment.rs ├── element.rs ├── element_attributes.rs ├── node_iter.rs ├── output.rs ├── snapshots ├── comments__it_can_parse_document_with_just_comments.snap ├── comments__it_can_parse_document_with_just_one_comment.snap ├── document__it_can_parse_document_with_comments.snap ├── document__it_can_parse_minimal_document.snap ├── document_empty__it_can_parse_empty_document.snap ├── document_fragment__it_can_parse_single_div_as_fragment.snap ├── document_fragment__it_can_parse_single_text_as_fragment.snap ├── document_fragment__it_can_parse_text_comment_element_as_fragment.snap ├── element__it_can_clone_node.snap ├── element__it_can_deal_with_weird_whitespaces.snap ├── element__it_can_parse_broken_html.snap ├── element__it_can_parse_deeply_nested.snap ├── element__it_can_parse_multiple_elements.snap ├── element__it_can_parse_multiple_open_elements.snap ├── element__it_can_parse_nested_elements.snap ├── element__it_can_parse_nested_elements_mixed_children.snap ├── element__it_can_parse_one_element.snap ├── element__it_can_parse_one_element_mixed_case.snap ├── element__it_can_parse_one_element_mixed_case_numbers.snap ├── element__it_can_parse_one_element_mixed_case_numbers_symbols.snap ├── element__it_can_parse_one_element_upper_case.snap ├── element__it_can_parse_script_with_content.snap ├── element__it_can_parse_style_with_content.snap ├── element__it_errors_when_multiple_nested_elements_dont_match.snap ├── element__it_skips_dangling_elements.snap ├── 
element_attributes__it_can_parse_attribute_key_mixed_case_symbols.snap ├── element_attributes__it_can_parse_attribute_multiple_values_double_quote.snap ├── element_attributes__it_can_parse_attribute_multiple_values_single_quote.snap ├── element_attributes__it_can_parse_attribute_with_empty_value.snap ├── element_attributes__it_can_parse_classes.snap ├── element_attributes__it_can_parse_double_quote.snap ├── element_attributes__it_can_parse_id.snap ├── element_attributes__it_can_parse_multiple_attributes_double_quote.snap ├── element_attributes__it_can_parse_multiple_attributes_no_quote.snap ├── element_attributes__it_can_parse_multiple_attributes_single_quote.snap ├── element_attributes__it_can_parse_multiple_attributes_where_whitespace_does_not_matter_for_keys.snap ├── element_attributes__it_can_parse_no_quote.snap ├── element_attributes__it_can_parse_single_quote.snap ├── element_attributes__it_keeps_spaces_for_non_classes.snap ├── output__it_can_output_complex_html_as_json.snap ├── source_span__it_can_generate_source_span.snap ├── svg__it_can_parse_svg.snap ├── text__it_can_parse_document_with_just_text.snap ├── text__it_can_parse_document_with_multiple_text_elements.snap ├── text__it_can_parse_document_with_text_and_line_breaks.snap ├── text__it_can_parse_text_in_paragraph_with_weird_formatting.snap └── text__it_can_parse_text_with_chevron.snap ├── source_span.rs ├── svg.rs ├── text.rs └── websites.rs /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | *.temp 3 | 4 | # IDE 5 | .idea/ 6 | *.iml -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## 0.7.0 4 | 5 | - Updated all dependencies 6 | - Removed structopt for clap 7 | - Add source_span to Element; by [bennyboer](https://github.com/bennyboer) 8 | - Improve whitespace; by 
[bennyboer](https://github.com/bennyboer) 9 | - Fix typo in docs; by [c-git](https://github.com/c-git) 10 | 11 | ## Older versions 12 | 13 | - See commit history 14 | -------------------------------------------------------------------------------- /Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 3 | version = 3 4 | 5 | [[package]] 6 | name = "anes" 7 | version = "0.1.6" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" 10 | 11 | [[package]] 12 | name = "anstream" 13 | version = "0.3.2" 14 | source = "registry+https://github.com/rust-lang/crates.io-index" 15 | checksum = "0ca84f3628370c59db74ee214b3263d58f9aadd9b4fe7e711fd87dc452b7f163" 16 | dependencies = [ 17 | "anstyle", 18 | "anstyle-parse", 19 | "anstyle-query", 20 | "anstyle-wincon", 21 | "colorchoice", 22 | "is-terminal", 23 | "utf8parse", 24 | ] 25 | 26 | [[package]] 27 | name = "anstyle" 28 | version = "1.0.0" 29 | source = "registry+https://github.com/rust-lang/crates.io-index" 30 | checksum = "41ed9a86bf92ae6580e0a31281f65a1b1d867c0cc68d5346e2ae128dddfa6a7d" 31 | 32 | [[package]] 33 | name = "anstyle-parse" 34 | version = "0.2.0" 35 | source = "registry+https://github.com/rust-lang/crates.io-index" 36 | checksum = "e765fd216e48e067936442276d1d57399e37bce53c264d6fefbe298080cb57ee" 37 | dependencies = [ 38 | "utf8parse", 39 | ] 40 | 41 | [[package]] 42 | name = "anstyle-query" 43 | version = "1.0.0" 44 | source = "registry+https://github.com/rust-lang/crates.io-index" 45 | checksum = "5ca11d4be1bab0c8bc8734a9aa7bf4ee8316d462a08c6ac5052f888fef5b494b" 46 | dependencies = [ 47 | "windows-sys 0.48.0", 48 | ] 49 | 50 | [[package]] 51 | name = "anstyle-wincon" 52 | version = "1.0.1" 53 | source = "registry+https://github.com/rust-lang/crates.io-index" 54 | checksum = 
"180abfa45703aebe0093f79badacc01b8fd4ea2e35118747e5811127f926e188" 55 | dependencies = [ 56 | "anstyle", 57 | "windows-sys 0.48.0", 58 | ] 59 | 60 | [[package]] 61 | name = "atty" 62 | version = "0.2.14" 63 | source = "registry+https://github.com/rust-lang/crates.io-index" 64 | checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" 65 | dependencies = [ 66 | "hermit-abi 0.1.19", 67 | "libc", 68 | "winapi", 69 | ] 70 | 71 | [[package]] 72 | name = "autocfg" 73 | version = "1.1.0" 74 | source = "registry+https://github.com/rust-lang/crates.io-index" 75 | checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" 76 | 77 | [[package]] 78 | name = "base64" 79 | version = "0.21.0" 80 | source = "registry+https://github.com/rust-lang/crates.io-index" 81 | checksum = "a4a4ddaa51a5bc52a6948f74c06d20aaaddb71924eab79b8c97a8c556e942d6a" 82 | 83 | [[package]] 84 | name = "bitflags" 85 | version = "1.3.2" 86 | source = "registry+https://github.com/rust-lang/crates.io-index" 87 | checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" 88 | 89 | [[package]] 90 | name = "block-buffer" 91 | version = "0.10.4" 92 | source = "registry+https://github.com/rust-lang/crates.io-index" 93 | checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" 94 | dependencies = [ 95 | "generic-array", 96 | ] 97 | 98 | [[package]] 99 | name = "bumpalo" 100 | version = "3.12.2" 101 | source = "registry+https://github.com/rust-lang/crates.io-index" 102 | checksum = "3c6ed94e98ecff0c12dd1b04c15ec0d7d9458ca8fe806cea6f12954efe74c63b" 103 | 104 | [[package]] 105 | name = "bytes" 106 | version = "1.4.0" 107 | source = "registry+https://github.com/rust-lang/crates.io-index" 108 | checksum = "89b2fd2a0dcf38d7971e2194b6b6eebab45ae01067456a7fd93d5547a61b70be" 109 | 110 | [[package]] 111 | name = "cast" 112 | version = "0.3.0" 113 | source = "registry+https://github.com/rust-lang/crates.io-index" 114 | checksum = 
"37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" 115 | 116 | [[package]] 117 | name = "cc" 118 | version = "1.0.79" 119 | source = "registry+https://github.com/rust-lang/crates.io-index" 120 | checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f" 121 | 122 | [[package]] 123 | name = "cfg-if" 124 | version = "1.0.0" 125 | source = "registry+https://github.com/rust-lang/crates.io-index" 126 | checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" 127 | 128 | [[package]] 129 | name = "ciborium" 130 | version = "0.2.1" 131 | source = "registry+https://github.com/rust-lang/crates.io-index" 132 | checksum = "effd91f6c78e5a4ace8a5d3c0b6bfaec9e2baaef55f3efc00e45fb2e477ee926" 133 | dependencies = [ 134 | "ciborium-io", 135 | "ciborium-ll", 136 | "serde", 137 | ] 138 | 139 | [[package]] 140 | name = "ciborium-io" 141 | version = "0.2.1" 142 | source = "registry+https://github.com/rust-lang/crates.io-index" 143 | checksum = "cdf919175532b369853f5d5e20b26b43112613fd6fe7aee757e35f7a44642656" 144 | 145 | [[package]] 146 | name = "ciborium-ll" 147 | version = "0.2.1" 148 | source = "registry+https://github.com/rust-lang/crates.io-index" 149 | checksum = "defaa24ecc093c77630e6c15e17c51f5e187bf35ee514f4e2d67baaa96dae22b" 150 | dependencies = [ 151 | "ciborium-io", 152 | "half", 153 | ] 154 | 155 | [[package]] 156 | name = "clap" 157 | version = "3.2.25" 158 | source = "registry+https://github.com/rust-lang/crates.io-index" 159 | checksum = "4ea181bf566f71cb9a5d17a59e1871af638180a18fb0035c92ae62b705207123" 160 | dependencies = [ 161 | "bitflags", 162 | "clap_lex 0.2.4", 163 | "indexmap", 164 | "textwrap", 165 | ] 166 | 167 | [[package]] 168 | name = "clap" 169 | version = "4.2.7" 170 | source = "registry+https://github.com/rust-lang/crates.io-index" 171 | checksum = "34d21f9bf1b425d2968943631ec91202fe5e837264063503708b83013f8fc938" 172 | dependencies = [ 173 | "clap_builder", 174 | "clap_derive", 175 | "once_cell", 176 
| ] 177 | 178 | [[package]] 179 | name = "clap_builder" 180 | version = "4.2.7" 181 | source = "registry+https://github.com/rust-lang/crates.io-index" 182 | checksum = "914c8c79fb560f238ef6429439a30023c862f7a28e688c58f7203f12b29970bd" 183 | dependencies = [ 184 | "anstream", 185 | "anstyle", 186 | "bitflags", 187 | "clap_lex 0.4.1", 188 | "strsim", 189 | ] 190 | 191 | [[package]] 192 | name = "clap_derive" 193 | version = "4.2.0" 194 | source = "registry+https://github.com/rust-lang/crates.io-index" 195 | checksum = "3f9644cd56d6b87dbe899ef8b053e331c0637664e9e21a33dfcdc36093f5c5c4" 196 | dependencies = [ 197 | "heck", 198 | "proc-macro2", 199 | "quote", 200 | "syn", 201 | ] 202 | 203 | [[package]] 204 | name = "clap_lex" 205 | version = "0.2.4" 206 | source = "registry+https://github.com/rust-lang/crates.io-index" 207 | checksum = "2850f2f5a82cbf437dd5af4d49848fbdfc27c157c3d010345776f952765261c5" 208 | dependencies = [ 209 | "os_str_bytes", 210 | ] 211 | 212 | [[package]] 213 | name = "clap_lex" 214 | version = "0.4.1" 215 | source = "registry+https://github.com/rust-lang/crates.io-index" 216 | checksum = "8a2dd5a6fe8c6e3502f568a6353e5273bbb15193ad9a89e457b9970798efbea1" 217 | 218 | [[package]] 219 | name = "colorchoice" 220 | version = "1.0.0" 221 | source = "registry+https://github.com/rust-lang/crates.io-index" 222 | checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" 223 | 224 | [[package]] 225 | name = "console" 226 | version = "0.15.5" 227 | source = "registry+https://github.com/rust-lang/crates.io-index" 228 | checksum = "c3d79fbe8970a77e3e34151cc13d3b3e248aa0faaecb9f6091fa07ebefe5ad60" 229 | dependencies = [ 230 | "encode_unicode", 231 | "lazy_static", 232 | "libc", 233 | "windows-sys 0.42.0", 234 | ] 235 | 236 | [[package]] 237 | name = "core-foundation" 238 | version = "0.9.3" 239 | source = "registry+https://github.com/rust-lang/crates.io-index" 240 | checksum = "194a7a9e6de53fa55116934067c844d9d749312f75c6f6d0980e8c252f8c2146" 
241 | dependencies = [ 242 | "core-foundation-sys", 243 | "libc", 244 | ] 245 | 246 | [[package]] 247 | name = "core-foundation-sys" 248 | version = "0.8.4" 249 | source = "registry+https://github.com/rust-lang/crates.io-index" 250 | checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" 251 | 252 | [[package]] 253 | name = "cpufeatures" 254 | version = "0.2.7" 255 | source = "registry+https://github.com/rust-lang/crates.io-index" 256 | checksum = "3e4c1eaa2012c47becbbad2ab175484c2a84d1185b566fb2cc5b8707343dfe58" 257 | dependencies = [ 258 | "libc", 259 | ] 260 | 261 | [[package]] 262 | name = "criterion" 263 | version = "0.4.0" 264 | source = "registry+https://github.com/rust-lang/crates.io-index" 265 | checksum = "e7c76e09c1aae2bc52b3d2f29e13c6572553b30c4aa1b8a49fd70de6412654cb" 266 | dependencies = [ 267 | "anes", 268 | "atty", 269 | "cast", 270 | "ciborium", 271 | "clap 3.2.25", 272 | "criterion-plot", 273 | "itertools", 274 | "lazy_static", 275 | "num-traits", 276 | "oorandom", 277 | "plotters", 278 | "rayon", 279 | "regex", 280 | "serde", 281 | "serde_derive", 282 | "serde_json", 283 | "tinytemplate", 284 | "walkdir", 285 | ] 286 | 287 | [[package]] 288 | name = "criterion-plot" 289 | version = "0.5.0" 290 | source = "registry+https://github.com/rust-lang/crates.io-index" 291 | checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" 292 | dependencies = [ 293 | "cast", 294 | "itertools", 295 | ] 296 | 297 | [[package]] 298 | name = "crossbeam-channel" 299 | version = "0.5.8" 300 | source = "registry+https://github.com/rust-lang/crates.io-index" 301 | checksum = "a33c2bf77f2df06183c3aa30d1e96c0695a313d4f9c453cc3762a6db39f99200" 302 | dependencies = [ 303 | "cfg-if", 304 | "crossbeam-utils", 305 | ] 306 | 307 | [[package]] 308 | name = "crossbeam-deque" 309 | version = "0.8.3" 310 | source = "registry+https://github.com/rust-lang/crates.io-index" 311 | checksum = 
"ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef" 312 | dependencies = [ 313 | "cfg-if", 314 | "crossbeam-epoch", 315 | "crossbeam-utils", 316 | ] 317 | 318 | [[package]] 319 | name = "crossbeam-epoch" 320 | version = "0.9.14" 321 | source = "registry+https://github.com/rust-lang/crates.io-index" 322 | checksum = "46bd5f3f85273295a9d14aedfb86f6aadbff6d8f5295c4a9edb08e819dcf5695" 323 | dependencies = [ 324 | "autocfg", 325 | "cfg-if", 326 | "crossbeam-utils", 327 | "memoffset", 328 | "scopeguard", 329 | ] 330 | 331 | [[package]] 332 | name = "crossbeam-utils" 333 | version = "0.8.15" 334 | source = "registry+https://github.com/rust-lang/crates.io-index" 335 | checksum = "3c063cd8cc95f5c377ed0d4b49a4b21f632396ff690e8470c29b3359b346984b" 336 | dependencies = [ 337 | "cfg-if", 338 | ] 339 | 340 | [[package]] 341 | name = "crypto-common" 342 | version = "0.1.6" 343 | source = "registry+https://github.com/rust-lang/crates.io-index" 344 | checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" 345 | dependencies = [ 346 | "generic-array", 347 | "typenum", 348 | ] 349 | 350 | [[package]] 351 | name = "digest" 352 | version = "0.10.6" 353 | source = "registry+https://github.com/rust-lang/crates.io-index" 354 | checksum = "8168378f4e5023e7218c89c891c0fd8ecdb5e5e4f18cb78f38cf245dd021e76f" 355 | dependencies = [ 356 | "block-buffer", 357 | "crypto-common", 358 | ] 359 | 360 | [[package]] 361 | name = "doc-comment" 362 | version = "0.3.3" 363 | source = "registry+https://github.com/rust-lang/crates.io-index" 364 | checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10" 365 | 366 | [[package]] 367 | name = "either" 368 | version = "1.8.1" 369 | source = "registry+https://github.com/rust-lang/crates.io-index" 370 | checksum = "7fcaabb2fef8c910e7f4c7ce9f67a1283a1715879a7c230ca9d6d1ae31f16d91" 371 | 372 | [[package]] 373 | name = "encode_unicode" 374 | version = "0.3.6" 375 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 376 | checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f" 377 | 378 | [[package]] 379 | name = "encoding_rs" 380 | version = "0.8.32" 381 | source = "registry+https://github.com/rust-lang/crates.io-index" 382 | checksum = "071a31f4ee85403370b58aca746f01041ede6f0da2730960ad001edc2b71b394" 383 | dependencies = [ 384 | "cfg-if", 385 | ] 386 | 387 | [[package]] 388 | name = "errno" 389 | version = "0.3.1" 390 | source = "registry+https://github.com/rust-lang/crates.io-index" 391 | checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a" 392 | dependencies = [ 393 | "errno-dragonfly", 394 | "libc", 395 | "windows-sys 0.48.0", 396 | ] 397 | 398 | [[package]] 399 | name = "errno-dragonfly" 400 | version = "0.1.2" 401 | source = "registry+https://github.com/rust-lang/crates.io-index" 402 | checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" 403 | dependencies = [ 404 | "cc", 405 | "libc", 406 | ] 407 | 408 | [[package]] 409 | name = "fastrand" 410 | version = "1.9.0" 411 | source = "registry+https://github.com/rust-lang/crates.io-index" 412 | checksum = "e51093e27b0797c359783294ca4f0a911c270184cb10f85783b118614a1501be" 413 | dependencies = [ 414 | "instant", 415 | ] 416 | 417 | [[package]] 418 | name = "fnv" 419 | version = "1.0.7" 420 | source = "registry+https://github.com/rust-lang/crates.io-index" 421 | checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" 422 | 423 | [[package]] 424 | name = "foreign-types" 425 | version = "0.3.2" 426 | source = "registry+https://github.com/rust-lang/crates.io-index" 427 | checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" 428 | dependencies = [ 429 | "foreign-types-shared", 430 | ] 431 | 432 | [[package]] 433 | name = "foreign-types-shared" 434 | version = "0.1.1" 435 | source = "registry+https://github.com/rust-lang/crates.io-index" 436 | checksum = 
"00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" 437 | 438 | [[package]] 439 | name = "form_urlencoded" 440 | version = "1.1.0" 441 | source = "registry+https://github.com/rust-lang/crates.io-index" 442 | checksum = "a9c384f161156f5260c24a097c56119f9be8c798586aecc13afbcbe7b7e26bf8" 443 | dependencies = [ 444 | "percent-encoding", 445 | ] 446 | 447 | [[package]] 448 | name = "futures-channel" 449 | version = "0.3.28" 450 | source = "registry+https://github.com/rust-lang/crates.io-index" 451 | checksum = "955518d47e09b25bbebc7a18df10b81f0c766eaf4c4f1cccef2fca5f2a4fb5f2" 452 | dependencies = [ 453 | "futures-core", 454 | ] 455 | 456 | [[package]] 457 | name = "futures-core" 458 | version = "0.3.28" 459 | source = "registry+https://github.com/rust-lang/crates.io-index" 460 | checksum = "4bca583b7e26f571124fe5b7561d49cb2868d79116cfa0eefce955557c6fee8c" 461 | 462 | [[package]] 463 | name = "futures-io" 464 | version = "0.3.28" 465 | source = "registry+https://github.com/rust-lang/crates.io-index" 466 | checksum = "4fff74096e71ed47f8e023204cfd0aa1289cd54ae5430a9523be060cdb849964" 467 | 468 | [[package]] 469 | name = "futures-sink" 470 | version = "0.3.28" 471 | source = "registry+https://github.com/rust-lang/crates.io-index" 472 | checksum = "f43be4fe21a13b9781a69afa4985b0f6ee0e1afab2c6f454a8cf30e2b2237b6e" 473 | 474 | [[package]] 475 | name = "futures-task" 476 | version = "0.3.28" 477 | source = "registry+https://github.com/rust-lang/crates.io-index" 478 | checksum = "76d3d132be6c0e6aa1534069c705a74a5997a356c0dc2f86a47765e5617c5b65" 479 | 480 | [[package]] 481 | name = "futures-util" 482 | version = "0.3.28" 483 | source = "registry+https://github.com/rust-lang/crates.io-index" 484 | checksum = "26b01e40b772d54cf6c6d721c1d1abd0647a0106a12ecaa1c186273392a69533" 485 | dependencies = [ 486 | "futures-core", 487 | "futures-io", 488 | "futures-task", 489 | "memchr", 490 | "pin-project-lite", 491 | "pin-utils", 492 | "slab", 493 | ] 494 | 495 | [[package]] 
496 | name = "generic-array" 497 | version = "0.14.7" 498 | source = "registry+https://github.com/rust-lang/crates.io-index" 499 | checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" 500 | dependencies = [ 501 | "typenum", 502 | "version_check", 503 | ] 504 | 505 | [[package]] 506 | name = "h2" 507 | version = "0.3.18" 508 | source = "registry+https://github.com/rust-lang/crates.io-index" 509 | checksum = "17f8a914c2987b688368b5138aa05321db91f4090cf26118185672ad588bce21" 510 | dependencies = [ 511 | "bytes", 512 | "fnv", 513 | "futures-core", 514 | "futures-sink", 515 | "futures-util", 516 | "http", 517 | "indexmap", 518 | "slab", 519 | "tokio", 520 | "tokio-util", 521 | "tracing", 522 | ] 523 | 524 | [[package]] 525 | name = "half" 526 | version = "1.8.2" 527 | source = "registry+https://github.com/rust-lang/crates.io-index" 528 | checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" 529 | 530 | [[package]] 531 | name = "hashbrown" 532 | version = "0.12.3" 533 | source = "registry+https://github.com/rust-lang/crates.io-index" 534 | checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" 535 | 536 | [[package]] 537 | name = "heck" 538 | version = "0.4.1" 539 | source = "registry+https://github.com/rust-lang/crates.io-index" 540 | checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" 541 | 542 | [[package]] 543 | name = "hermit-abi" 544 | version = "0.1.19" 545 | source = "registry+https://github.com/rust-lang/crates.io-index" 546 | checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" 547 | dependencies = [ 548 | "libc", 549 | ] 550 | 551 | [[package]] 552 | name = "hermit-abi" 553 | version = "0.2.6" 554 | source = "registry+https://github.com/rust-lang/crates.io-index" 555 | checksum = "ee512640fe35acbfb4bb779db6f0d80704c2cacfa2e39b601ef3e3f47d1ae4c7" 556 | dependencies = [ 557 | "libc", 558 | ] 559 | 560 | [[package]] 561 | name = "hermit-abi" 
562 | version = "0.3.1" 563 | source = "registry+https://github.com/rust-lang/crates.io-index" 564 | checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286" 565 | 566 | [[package]] 567 | name = "html_parser" 568 | version = "0.7.0" 569 | dependencies = [ 570 | "clap 4.2.7", 571 | "criterion", 572 | "doc-comment", 573 | "indoc", 574 | "insta", 575 | "pest", 576 | "pest_derive", 577 | "reqwest", 578 | "serde", 579 | "serde_derive", 580 | "serde_json", 581 | "tempfile", 582 | "thiserror", 583 | ] 584 | 585 | [[package]] 586 | name = "http" 587 | version = "0.2.9" 588 | source = "registry+https://github.com/rust-lang/crates.io-index" 589 | checksum = "bd6effc99afb63425aff9b05836f029929e345a6148a14b7ecd5ab67af944482" 590 | dependencies = [ 591 | "bytes", 592 | "fnv", 593 | "itoa", 594 | ] 595 | 596 | [[package]] 597 | name = "http-body" 598 | version = "0.4.5" 599 | source = "registry+https://github.com/rust-lang/crates.io-index" 600 | checksum = "d5f38f16d184e36f2408a55281cd658ecbd3ca05cce6d6510a176eca393e26d1" 601 | dependencies = [ 602 | "bytes", 603 | "http", 604 | "pin-project-lite", 605 | ] 606 | 607 | [[package]] 608 | name = "httparse" 609 | version = "1.8.0" 610 | source = "registry+https://github.com/rust-lang/crates.io-index" 611 | checksum = "d897f394bad6a705d5f4104762e116a75639e470d80901eed05a860a95cb1904" 612 | 613 | [[package]] 614 | name = "httpdate" 615 | version = "1.0.2" 616 | source = "registry+https://github.com/rust-lang/crates.io-index" 617 | checksum = "c4a1e36c821dbe04574f602848a19f742f4fb3c98d40449f11bcad18d6b17421" 618 | 619 | [[package]] 620 | name = "hyper" 621 | version = "0.14.26" 622 | source = "registry+https://github.com/rust-lang/crates.io-index" 623 | checksum = "ab302d72a6f11a3b910431ff93aae7e773078c769f0a3ef15fb9ec692ed147d4" 624 | dependencies = [ 625 | "bytes", 626 | "futures-channel", 627 | "futures-core", 628 | "futures-util", 629 | "h2", 630 | "http", 631 | "http-body", 632 | "httparse", 633 | "httpdate", 
634 | "itoa", 635 | "pin-project-lite", 636 | "socket2", 637 | "tokio", 638 | "tower-service", 639 | "tracing", 640 | "want", 641 | ] 642 | 643 | [[package]] 644 | name = "hyper-rustls" 645 | version = "0.23.2" 646 | source = "registry+https://github.com/rust-lang/crates.io-index" 647 | checksum = "1788965e61b367cd03a62950836d5cd41560c3577d90e40e0819373194d1661c" 648 | dependencies = [ 649 | "http", 650 | "hyper", 651 | "rustls", 652 | "tokio", 653 | "tokio-rustls", 654 | ] 655 | 656 | [[package]] 657 | name = "hyper-tls" 658 | version = "0.5.0" 659 | source = "registry+https://github.com/rust-lang/crates.io-index" 660 | checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905" 661 | dependencies = [ 662 | "bytes", 663 | "hyper", 664 | "native-tls", 665 | "tokio", 666 | "tokio-native-tls", 667 | ] 668 | 669 | [[package]] 670 | name = "idna" 671 | version = "0.3.0" 672 | source = "registry+https://github.com/rust-lang/crates.io-index" 673 | checksum = "e14ddfc70884202db2244c223200c204c2bda1bc6e0998d11b5e024d657209e6" 674 | dependencies = [ 675 | "unicode-bidi", 676 | "unicode-normalization", 677 | ] 678 | 679 | [[package]] 680 | name = "indexmap" 681 | version = "1.9.3" 682 | source = "registry+https://github.com/rust-lang/crates.io-index" 683 | checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" 684 | dependencies = [ 685 | "autocfg", 686 | "hashbrown", 687 | ] 688 | 689 | [[package]] 690 | name = "indoc" 691 | version = "2.0.1" 692 | source = "registry+https://github.com/rust-lang/crates.io-index" 693 | checksum = "9f2cb48b81b1dc9f39676bf99f5499babfec7cd8fe14307f7b3d747208fb5690" 694 | 695 | [[package]] 696 | name = "insta" 697 | version = "1.29.0" 698 | source = "registry+https://github.com/rust-lang/crates.io-index" 699 | checksum = "9a28d25139df397cbca21408bb742cf6837e04cdbebf1b07b760caf971d6a972" 700 | dependencies = [ 701 | "console", 702 | "lazy_static", 703 | "linked-hash-map", 704 | "serde", 705 | "similar", 
706 | "yaml-rust", 707 | ] 708 | 709 | [[package]] 710 | name = "instant" 711 | version = "0.1.12" 712 | source = "registry+https://github.com/rust-lang/crates.io-index" 713 | checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" 714 | dependencies = [ 715 | "cfg-if", 716 | ] 717 | 718 | [[package]] 719 | name = "io-lifetimes" 720 | version = "1.0.10" 721 | source = "registry+https://github.com/rust-lang/crates.io-index" 722 | checksum = "9c66c74d2ae7e79a5a8f7ac924adbe38ee42a859c6539ad869eb51f0b52dc220" 723 | dependencies = [ 724 | "hermit-abi 0.3.1", 725 | "libc", 726 | "windows-sys 0.48.0", 727 | ] 728 | 729 | [[package]] 730 | name = "ipnet" 731 | version = "2.7.2" 732 | source = "registry+https://github.com/rust-lang/crates.io-index" 733 | checksum = "12b6ee2129af8d4fb011108c73d99a1b83a85977f23b82460c0ae2e25bb4b57f" 734 | 735 | [[package]] 736 | name = "is-terminal" 737 | version = "0.4.7" 738 | source = "registry+https://github.com/rust-lang/crates.io-index" 739 | checksum = "adcf93614601c8129ddf72e2d5633df827ba6551541c6d8c59520a371475be1f" 740 | dependencies = [ 741 | "hermit-abi 0.3.1", 742 | "io-lifetimes", 743 | "rustix", 744 | "windows-sys 0.48.0", 745 | ] 746 | 747 | [[package]] 748 | name = "itertools" 749 | version = "0.10.5" 750 | source = "registry+https://github.com/rust-lang/crates.io-index" 751 | checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" 752 | dependencies = [ 753 | "either", 754 | ] 755 | 756 | [[package]] 757 | name = "itoa" 758 | version = "1.0.6" 759 | source = "registry+https://github.com/rust-lang/crates.io-index" 760 | checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6" 761 | 762 | [[package]] 763 | name = "js-sys" 764 | version = "0.3.62" 765 | source = "registry+https://github.com/rust-lang/crates.io-index" 766 | checksum = "68c16e1bfd491478ab155fd8b4896b86f9ede344949b641e61501e07c2b8b4d5" 767 | dependencies = [ 768 | "wasm-bindgen", 769 | ] 770 | 771 | 
[[package]] 772 | name = "lazy_static" 773 | version = "1.4.0" 774 | source = "registry+https://github.com/rust-lang/crates.io-index" 775 | checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" 776 | 777 | [[package]] 778 | name = "libc" 779 | version = "0.2.144" 780 | source = "registry+https://github.com/rust-lang/crates.io-index" 781 | checksum = "2b00cc1c228a6782d0f076e7b232802e0c5689d41bb5df366f2a6b6621cfdfe1" 782 | 783 | [[package]] 784 | name = "linked-hash-map" 785 | version = "0.5.6" 786 | source = "registry+https://github.com/rust-lang/crates.io-index" 787 | checksum = "0717cef1bc8b636c6e1c1bbdefc09e6322da8a9321966e8928ef80d20f7f770f" 788 | 789 | [[package]] 790 | name = "linux-raw-sys" 791 | version = "0.3.7" 792 | source = "registry+https://github.com/rust-lang/crates.io-index" 793 | checksum = "ece97ea872ece730aed82664c424eb4c8291e1ff2480247ccf7409044bc6479f" 794 | 795 | [[package]] 796 | name = "log" 797 | version = "0.4.17" 798 | source = "registry+https://github.com/rust-lang/crates.io-index" 799 | checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e" 800 | dependencies = [ 801 | "cfg-if", 802 | ] 803 | 804 | [[package]] 805 | name = "memchr" 806 | version = "2.5.0" 807 | source = "registry+https://github.com/rust-lang/crates.io-index" 808 | checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" 809 | 810 | [[package]] 811 | name = "memoffset" 812 | version = "0.8.0" 813 | source = "registry+https://github.com/rust-lang/crates.io-index" 814 | checksum = "d61c719bcfbcf5d62b3a09efa6088de8c54bc0bfcd3ea7ae39fcc186108b8de1" 815 | dependencies = [ 816 | "autocfg", 817 | ] 818 | 819 | [[package]] 820 | name = "mime" 821 | version = "0.3.17" 822 | source = "registry+https://github.com/rust-lang/crates.io-index" 823 | checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" 824 | 825 | [[package]] 826 | name = "mio" 827 | version = "0.8.6" 828 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 829 | checksum = "5b9d9a46eff5b4ff64b45a9e316a6d1e0bc719ef429cbec4dc630684212bfdf9" 830 | dependencies = [ 831 | "libc", 832 | "log", 833 | "wasi", 834 | "windows-sys 0.45.0", 835 | ] 836 | 837 | [[package]] 838 | name = "native-tls" 839 | version = "0.2.11" 840 | source = "registry+https://github.com/rust-lang/crates.io-index" 841 | checksum = "07226173c32f2926027b63cce4bcd8076c3552846cbe7925f3aaffeac0a3b92e" 842 | dependencies = [ 843 | "lazy_static", 844 | "libc", 845 | "log", 846 | "openssl", 847 | "openssl-probe", 848 | "openssl-sys", 849 | "schannel", 850 | "security-framework", 851 | "security-framework-sys", 852 | "tempfile", 853 | ] 854 | 855 | [[package]] 856 | name = "num-traits" 857 | version = "0.2.15" 858 | source = "registry+https://github.com/rust-lang/crates.io-index" 859 | checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd" 860 | dependencies = [ 861 | "autocfg", 862 | ] 863 | 864 | [[package]] 865 | name = "num_cpus" 866 | version = "1.15.0" 867 | source = "registry+https://github.com/rust-lang/crates.io-index" 868 | checksum = "0fac9e2da13b5eb447a6ce3d392f23a29d8694bff781bf03a16cd9ac8697593b" 869 | dependencies = [ 870 | "hermit-abi 0.2.6", 871 | "libc", 872 | ] 873 | 874 | [[package]] 875 | name = "once_cell" 876 | version = "1.17.1" 877 | source = "registry+https://github.com/rust-lang/crates.io-index" 878 | checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3" 879 | 880 | [[package]] 881 | name = "oorandom" 882 | version = "11.1.3" 883 | source = "registry+https://github.com/rust-lang/crates.io-index" 884 | checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" 885 | 886 | [[package]] 887 | name = "openssl" 888 | version = "0.10.52" 889 | source = "registry+https://github.com/rust-lang/crates.io-index" 890 | checksum = "01b8574602df80f7b85fdfc5392fa884a4e3b3f4f35402c070ab34c3d3f78d56" 891 | dependencies = [ 892 | 
"bitflags", 893 | "cfg-if", 894 | "foreign-types", 895 | "libc", 896 | "once_cell", 897 | "openssl-macros", 898 | "openssl-sys", 899 | ] 900 | 901 | [[package]] 902 | name = "openssl-macros" 903 | version = "0.1.1" 904 | source = "registry+https://github.com/rust-lang/crates.io-index" 905 | checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" 906 | dependencies = [ 907 | "proc-macro2", 908 | "quote", 909 | "syn", 910 | ] 911 | 912 | [[package]] 913 | name = "openssl-probe" 914 | version = "0.1.5" 915 | source = "registry+https://github.com/rust-lang/crates.io-index" 916 | checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" 917 | 918 | [[package]] 919 | name = "openssl-sys" 920 | version = "0.9.87" 921 | source = "registry+https://github.com/rust-lang/crates.io-index" 922 | checksum = "8e17f59264b2809d77ae94f0e1ebabc434773f370d6ca667bd223ea10e06cc7e" 923 | dependencies = [ 924 | "cc", 925 | "libc", 926 | "pkg-config", 927 | "vcpkg", 928 | ] 929 | 930 | [[package]] 931 | name = "os_str_bytes" 932 | version = "6.5.0" 933 | source = "registry+https://github.com/rust-lang/crates.io-index" 934 | checksum = "ceedf44fb00f2d1984b0bc98102627ce622e083e49a5bacdb3e514fa4238e267" 935 | 936 | [[package]] 937 | name = "percent-encoding" 938 | version = "2.2.0" 939 | source = "registry+https://github.com/rust-lang/crates.io-index" 940 | checksum = "478c572c3d73181ff3c2539045f6eb99e5491218eae919370993b890cdbdd98e" 941 | 942 | [[package]] 943 | name = "pest" 944 | version = "2.6.0" 945 | source = "registry+https://github.com/rust-lang/crates.io-index" 946 | checksum = "e68e84bfb01f0507134eac1e9b410a12ba379d064eab48c50ba4ce329a527b70" 947 | dependencies = [ 948 | "thiserror", 949 | "ucd-trie", 950 | ] 951 | 952 | [[package]] 953 | name = "pest_derive" 954 | version = "2.6.0" 955 | source = "registry+https://github.com/rust-lang/crates.io-index" 956 | checksum = "6b79d4c71c865a25a4322296122e3924d30bc8ee0834c8bfc8b95f7f054afbfb" 957 | 
dependencies = [ 958 | "pest", 959 | "pest_generator", 960 | ] 961 | 962 | [[package]] 963 | name = "pest_generator" 964 | version = "2.6.0" 965 | source = "registry+https://github.com/rust-lang/crates.io-index" 966 | checksum = "6c435bf1076437b851ebc8edc3a18442796b30f1728ffea6262d59bbe28b077e" 967 | dependencies = [ 968 | "pest", 969 | "pest_meta", 970 | "proc-macro2", 971 | "quote", 972 | "syn", 973 | ] 974 | 975 | [[package]] 976 | name = "pest_meta" 977 | version = "2.6.0" 978 | source = "registry+https://github.com/rust-lang/crates.io-index" 979 | checksum = "745a452f8eb71e39ffd8ee32b3c5f51d03845f99786fa9b68db6ff509c505411" 980 | dependencies = [ 981 | "once_cell", 982 | "pest", 983 | "sha2", 984 | ] 985 | 986 | [[package]] 987 | name = "pin-project-lite" 988 | version = "0.2.9" 989 | source = "registry+https://github.com/rust-lang/crates.io-index" 990 | checksum = "e0a7ae3ac2f1173085d398531c705756c94a4c56843785df85a60c1a0afac116" 991 | 992 | [[package]] 993 | name = "pin-utils" 994 | version = "0.1.0" 995 | source = "registry+https://github.com/rust-lang/crates.io-index" 996 | checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" 997 | 998 | [[package]] 999 | name = "pkg-config" 1000 | version = "0.3.27" 1001 | source = "registry+https://github.com/rust-lang/crates.io-index" 1002 | checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964" 1003 | 1004 | [[package]] 1005 | name = "plotters" 1006 | version = "0.3.4" 1007 | source = "registry+https://github.com/rust-lang/crates.io-index" 1008 | checksum = "2538b639e642295546c50fcd545198c9d64ee2a38620a628724a3b266d5fbf97" 1009 | dependencies = [ 1010 | "num-traits", 1011 | "plotters-backend", 1012 | "plotters-svg", 1013 | "wasm-bindgen", 1014 | "web-sys", 1015 | ] 1016 | 1017 | [[package]] 1018 | name = "plotters-backend" 1019 | version = "0.3.4" 1020 | source = "registry+https://github.com/rust-lang/crates.io-index" 1021 | checksum = 
"193228616381fecdc1224c62e96946dfbc73ff4384fba576e052ff8c1bea8142" 1022 | 1023 | [[package]] 1024 | name = "plotters-svg" 1025 | version = "0.3.3" 1026 | source = "registry+https://github.com/rust-lang/crates.io-index" 1027 | checksum = "f9a81d2759aae1dae668f783c308bc5c8ebd191ff4184aaa1b37f65a6ae5a56f" 1028 | dependencies = [ 1029 | "plotters-backend", 1030 | ] 1031 | 1032 | [[package]] 1033 | name = "proc-macro2" 1034 | version = "1.0.56" 1035 | source = "registry+https://github.com/rust-lang/crates.io-index" 1036 | checksum = "2b63bdb0cd06f1f4dedf69b254734f9b45af66e4a031e42a7480257d9898b435" 1037 | dependencies = [ 1038 | "unicode-ident", 1039 | ] 1040 | 1041 | [[package]] 1042 | name = "quote" 1043 | version = "1.0.27" 1044 | source = "registry+https://github.com/rust-lang/crates.io-index" 1045 | checksum = "8f4f29d145265ec1c483c7c654450edde0bfe043d3938d6972630663356d9500" 1046 | dependencies = [ 1047 | "proc-macro2", 1048 | ] 1049 | 1050 | [[package]] 1051 | name = "rayon" 1052 | version = "1.7.0" 1053 | source = "registry+https://github.com/rust-lang/crates.io-index" 1054 | checksum = "1d2df5196e37bcc87abebc0053e20787d73847bb33134a69841207dd0a47f03b" 1055 | dependencies = [ 1056 | "either", 1057 | "rayon-core", 1058 | ] 1059 | 1060 | [[package]] 1061 | name = "rayon-core" 1062 | version = "1.11.0" 1063 | source = "registry+https://github.com/rust-lang/crates.io-index" 1064 | checksum = "4b8f95bd6966f5c87776639160a66bd8ab9895d9d4ab01ddba9fc60661aebe8d" 1065 | dependencies = [ 1066 | "crossbeam-channel", 1067 | "crossbeam-deque", 1068 | "crossbeam-utils", 1069 | "num_cpus", 1070 | ] 1071 | 1072 | [[package]] 1073 | name = "redox_syscall" 1074 | version = "0.3.5" 1075 | source = "registry+https://github.com/rust-lang/crates.io-index" 1076 | checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" 1077 | dependencies = [ 1078 | "bitflags", 1079 | ] 1080 | 1081 | [[package]] 1082 | name = "regex" 1083 | version = "1.8.1" 1084 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 1085 | checksum = "af83e617f331cc6ae2da5443c602dfa5af81e517212d9d611a5b3ba1777b5370" 1086 | dependencies = [ 1087 | "regex-syntax", 1088 | ] 1089 | 1090 | [[package]] 1091 | name = "regex-syntax" 1092 | version = "0.7.1" 1093 | source = "registry+https://github.com/rust-lang/crates.io-index" 1094 | checksum = "a5996294f19bd3aae0453a862ad728f60e6600695733dd5df01da90c54363a3c" 1095 | 1096 | [[package]] 1097 | name = "reqwest" 1098 | version = "0.11.17" 1099 | source = "registry+https://github.com/rust-lang/crates.io-index" 1100 | checksum = "13293b639a097af28fc8a90f22add145a9c954e49d77da06263d58cf44d5fb91" 1101 | dependencies = [ 1102 | "base64", 1103 | "bytes", 1104 | "encoding_rs", 1105 | "futures-core", 1106 | "futures-util", 1107 | "h2", 1108 | "http", 1109 | "http-body", 1110 | "hyper", 1111 | "hyper-rustls", 1112 | "hyper-tls", 1113 | "ipnet", 1114 | "js-sys", 1115 | "log", 1116 | "mime", 1117 | "native-tls", 1118 | "once_cell", 1119 | "percent-encoding", 1120 | "pin-project-lite", 1121 | "rustls", 1122 | "rustls-pemfile", 1123 | "serde", 1124 | "serde_json", 1125 | "serde_urlencoded", 1126 | "tokio", 1127 | "tokio-native-tls", 1128 | "tokio-rustls", 1129 | "tower-service", 1130 | "url", 1131 | "wasm-bindgen", 1132 | "wasm-bindgen-futures", 1133 | "web-sys", 1134 | "webpki-roots", 1135 | "winreg", 1136 | ] 1137 | 1138 | [[package]] 1139 | name = "ring" 1140 | version = "0.16.20" 1141 | source = "registry+https://github.com/rust-lang/crates.io-index" 1142 | checksum = "3053cf52e236a3ed746dfc745aa9cacf1b791d846bdaf412f60a8d7d6e17c8fc" 1143 | dependencies = [ 1144 | "cc", 1145 | "libc", 1146 | "once_cell", 1147 | "spin", 1148 | "untrusted", 1149 | "web-sys", 1150 | "winapi", 1151 | ] 1152 | 1153 | [[package]] 1154 | name = "rustix" 1155 | version = "0.37.19" 1156 | source = "registry+https://github.com/rust-lang/crates.io-index" 1157 | checksum = 
"acf8729d8542766f1b2cf77eb034d52f40d375bb8b615d0b147089946e16613d" 1158 | dependencies = [ 1159 | "bitflags", 1160 | "errno", 1161 | "io-lifetimes", 1162 | "libc", 1163 | "linux-raw-sys", 1164 | "windows-sys 0.48.0", 1165 | ] 1166 | 1167 | [[package]] 1168 | name = "rustls" 1169 | version = "0.20.8" 1170 | source = "registry+https://github.com/rust-lang/crates.io-index" 1171 | checksum = "fff78fc74d175294f4e83b28343315ffcfb114b156f0185e9741cb5570f50e2f" 1172 | dependencies = [ 1173 | "log", 1174 | "ring", 1175 | "sct", 1176 | "webpki", 1177 | ] 1178 | 1179 | [[package]] 1180 | name = "rustls-pemfile" 1181 | version = "1.0.2" 1182 | source = "registry+https://github.com/rust-lang/crates.io-index" 1183 | checksum = "d194b56d58803a43635bdc398cd17e383d6f71f9182b9a192c127ca42494a59b" 1184 | dependencies = [ 1185 | "base64", 1186 | ] 1187 | 1188 | [[package]] 1189 | name = "ryu" 1190 | version = "1.0.13" 1191 | source = "registry+https://github.com/rust-lang/crates.io-index" 1192 | checksum = "f91339c0467de62360649f8d3e185ca8de4224ff281f66000de5eb2a77a79041" 1193 | 1194 | [[package]] 1195 | name = "same-file" 1196 | version = "1.0.6" 1197 | source = "registry+https://github.com/rust-lang/crates.io-index" 1198 | checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" 1199 | dependencies = [ 1200 | "winapi-util", 1201 | ] 1202 | 1203 | [[package]] 1204 | name = "schannel" 1205 | version = "0.1.21" 1206 | source = "registry+https://github.com/rust-lang/crates.io-index" 1207 | checksum = "713cfb06c7059f3588fb8044c0fad1d09e3c01d225e25b9220dbfdcf16dbb1b3" 1208 | dependencies = [ 1209 | "windows-sys 0.42.0", 1210 | ] 1211 | 1212 | [[package]] 1213 | name = "scopeguard" 1214 | version = "1.1.0" 1215 | source = "registry+https://github.com/rust-lang/crates.io-index" 1216 | checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" 1217 | 1218 | [[package]] 1219 | name = "sct" 1220 | version = "0.7.0" 1221 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 1222 | checksum = "d53dcdb7c9f8158937a7981b48accfd39a43af418591a5d008c7b22b5e1b7ca4" 1223 | dependencies = [ 1224 | "ring", 1225 | "untrusted", 1226 | ] 1227 | 1228 | [[package]] 1229 | name = "security-framework" 1230 | version = "2.8.2" 1231 | source = "registry+https://github.com/rust-lang/crates.io-index" 1232 | checksum = "a332be01508d814fed64bf28f798a146d73792121129962fdf335bb3c49a4254" 1233 | dependencies = [ 1234 | "bitflags", 1235 | "core-foundation", 1236 | "core-foundation-sys", 1237 | "libc", 1238 | "security-framework-sys", 1239 | ] 1240 | 1241 | [[package]] 1242 | name = "security-framework-sys" 1243 | version = "2.8.0" 1244 | source = "registry+https://github.com/rust-lang/crates.io-index" 1245 | checksum = "31c9bb296072e961fcbd8853511dd39c2d8be2deb1e17c6860b1d30732b323b4" 1246 | dependencies = [ 1247 | "core-foundation-sys", 1248 | "libc", 1249 | ] 1250 | 1251 | [[package]] 1252 | name = "serde" 1253 | version = "1.0.163" 1254 | source = "registry+https://github.com/rust-lang/crates.io-index" 1255 | checksum = "2113ab51b87a539ae008b5c6c02dc020ffa39afd2d83cffcb3f4eb2722cebec2" 1256 | dependencies = [ 1257 | "serde_derive", 1258 | ] 1259 | 1260 | [[package]] 1261 | name = "serde_derive" 1262 | version = "1.0.163" 1263 | source = "registry+https://github.com/rust-lang/crates.io-index" 1264 | checksum = "8c805777e3930c8883389c602315a24224bcc738b63905ef87cd1420353ea93e" 1265 | dependencies = [ 1266 | "proc-macro2", 1267 | "quote", 1268 | "syn", 1269 | ] 1270 | 1271 | [[package]] 1272 | name = "serde_json" 1273 | version = "1.0.96" 1274 | source = "registry+https://github.com/rust-lang/crates.io-index" 1275 | checksum = "057d394a50403bcac12672b2b18fb387ab6d289d957dab67dd201875391e52f1" 1276 | dependencies = [ 1277 | "itoa", 1278 | "ryu", 1279 | "serde", 1280 | ] 1281 | 1282 | [[package]] 1283 | name = "serde_urlencoded" 1284 | version = "0.7.1" 1285 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 1286 | checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" 1287 | dependencies = [ 1288 | "form_urlencoded", 1289 | "itoa", 1290 | "ryu", 1291 | "serde", 1292 | ] 1293 | 1294 | [[package]] 1295 | name = "sha2" 1296 | version = "0.10.6" 1297 | source = "registry+https://github.com/rust-lang/crates.io-index" 1298 | checksum = "82e6b795fe2e3b1e845bafcb27aa35405c4d47cdfc92af5fc8d3002f76cebdc0" 1299 | dependencies = [ 1300 | "cfg-if", 1301 | "cpufeatures", 1302 | "digest", 1303 | ] 1304 | 1305 | [[package]] 1306 | name = "similar" 1307 | version = "2.2.1" 1308 | source = "registry+https://github.com/rust-lang/crates.io-index" 1309 | checksum = "420acb44afdae038210c99e69aae24109f32f15500aa708e81d46c9f29d55fcf" 1310 | 1311 | [[package]] 1312 | name = "slab" 1313 | version = "0.4.8" 1314 | source = "registry+https://github.com/rust-lang/crates.io-index" 1315 | checksum = "6528351c9bc8ab22353f9d776db39a20288e8d6c37ef8cfe3317cf875eecfc2d" 1316 | dependencies = [ 1317 | "autocfg", 1318 | ] 1319 | 1320 | [[package]] 1321 | name = "socket2" 1322 | version = "0.4.9" 1323 | source = "registry+https://github.com/rust-lang/crates.io-index" 1324 | checksum = "64a4a911eed85daf18834cfaa86a79b7d266ff93ff5ba14005426219480ed662" 1325 | dependencies = [ 1326 | "libc", 1327 | "winapi", 1328 | ] 1329 | 1330 | [[package]] 1331 | name = "spin" 1332 | version = "0.5.2" 1333 | source = "registry+https://github.com/rust-lang/crates.io-index" 1334 | checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" 1335 | 1336 | [[package]] 1337 | name = "strsim" 1338 | version = "0.10.0" 1339 | source = "registry+https://github.com/rust-lang/crates.io-index" 1340 | checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" 1341 | 1342 | [[package]] 1343 | name = "syn" 1344 | version = "2.0.15" 1345 | source = "registry+https://github.com/rust-lang/crates.io-index" 1346 | checksum = 
"a34fcf3e8b60f57e6a14301a2e916d323af98b0ea63c599441eec8558660c822" 1347 | dependencies = [ 1348 | "proc-macro2", 1349 | "quote", 1350 | "unicode-ident", 1351 | ] 1352 | 1353 | [[package]] 1354 | name = "tempfile" 1355 | version = "3.5.0" 1356 | source = "registry+https://github.com/rust-lang/crates.io-index" 1357 | checksum = "b9fbec84f381d5795b08656e4912bec604d162bff9291d6189a78f4c8ab87998" 1358 | dependencies = [ 1359 | "cfg-if", 1360 | "fastrand", 1361 | "redox_syscall", 1362 | "rustix", 1363 | "windows-sys 0.45.0", 1364 | ] 1365 | 1366 | [[package]] 1367 | name = "textwrap" 1368 | version = "0.16.0" 1369 | source = "registry+https://github.com/rust-lang/crates.io-index" 1370 | checksum = "222a222a5bfe1bba4a77b45ec488a741b3cb8872e5e499451fd7d0129c9c7c3d" 1371 | 1372 | [[package]] 1373 | name = "thiserror" 1374 | version = "1.0.40" 1375 | source = "registry+https://github.com/rust-lang/crates.io-index" 1376 | checksum = "978c9a314bd8dc99be594bc3c175faaa9794be04a5a5e153caba6915336cebac" 1377 | dependencies = [ 1378 | "thiserror-impl", 1379 | ] 1380 | 1381 | [[package]] 1382 | name = "thiserror-impl" 1383 | version = "1.0.40" 1384 | source = "registry+https://github.com/rust-lang/crates.io-index" 1385 | checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f" 1386 | dependencies = [ 1387 | "proc-macro2", 1388 | "quote", 1389 | "syn", 1390 | ] 1391 | 1392 | [[package]] 1393 | name = "tinytemplate" 1394 | version = "1.2.1" 1395 | source = "registry+https://github.com/rust-lang/crates.io-index" 1396 | checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" 1397 | dependencies = [ 1398 | "serde", 1399 | "serde_json", 1400 | ] 1401 | 1402 | [[package]] 1403 | name = "tinyvec" 1404 | version = "1.6.0" 1405 | source = "registry+https://github.com/rust-lang/crates.io-index" 1406 | checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" 1407 | dependencies = [ 1408 | "tinyvec_macros", 1409 | ] 1410 | 1411 | 
[[package]] 1412 | name = "tinyvec_macros" 1413 | version = "0.1.1" 1414 | source = "registry+https://github.com/rust-lang/crates.io-index" 1415 | checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" 1416 | 1417 | [[package]] 1418 | name = "tokio" 1419 | version = "1.28.1" 1420 | source = "registry+https://github.com/rust-lang/crates.io-index" 1421 | checksum = "0aa32867d44e6f2ce3385e89dceb990188b8bb0fb25b0cf576647a6f98ac5105" 1422 | dependencies = [ 1423 | "autocfg", 1424 | "bytes", 1425 | "libc", 1426 | "mio", 1427 | "num_cpus", 1428 | "pin-project-lite", 1429 | "socket2", 1430 | "windows-sys 0.48.0", 1431 | ] 1432 | 1433 | [[package]] 1434 | name = "tokio-native-tls" 1435 | version = "0.3.1" 1436 | source = "registry+https://github.com/rust-lang/crates.io-index" 1437 | checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2" 1438 | dependencies = [ 1439 | "native-tls", 1440 | "tokio", 1441 | ] 1442 | 1443 | [[package]] 1444 | name = "tokio-rustls" 1445 | version = "0.23.4" 1446 | source = "registry+https://github.com/rust-lang/crates.io-index" 1447 | checksum = "c43ee83903113e03984cb9e5cebe6c04a5116269e900e3ddba8f068a62adda59" 1448 | dependencies = [ 1449 | "rustls", 1450 | "tokio", 1451 | "webpki", 1452 | ] 1453 | 1454 | [[package]] 1455 | name = "tokio-util" 1456 | version = "0.7.8" 1457 | source = "registry+https://github.com/rust-lang/crates.io-index" 1458 | checksum = "806fe8c2c87eccc8b3267cbae29ed3ab2d0bd37fca70ab622e46aaa9375ddb7d" 1459 | dependencies = [ 1460 | "bytes", 1461 | "futures-core", 1462 | "futures-sink", 1463 | "pin-project-lite", 1464 | "tokio", 1465 | "tracing", 1466 | ] 1467 | 1468 | [[package]] 1469 | name = "tower-service" 1470 | version = "0.3.2" 1471 | source = "registry+https://github.com/rust-lang/crates.io-index" 1472 | checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" 1473 | 1474 | [[package]] 1475 | name = "tracing" 1476 | version = "0.1.37" 1477 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 1478 | checksum = "8ce8c33a8d48bd45d624a6e523445fd21ec13d3653cd51f681abf67418f54eb8" 1479 | dependencies = [ 1480 | "cfg-if", 1481 | "pin-project-lite", 1482 | "tracing-core", 1483 | ] 1484 | 1485 | [[package]] 1486 | name = "tracing-core" 1487 | version = "0.1.30" 1488 | source = "registry+https://github.com/rust-lang/crates.io-index" 1489 | checksum = "24eb03ba0eab1fd845050058ce5e616558e8f8d8fca633e6b163fe25c797213a" 1490 | dependencies = [ 1491 | "once_cell", 1492 | ] 1493 | 1494 | [[package]] 1495 | name = "try-lock" 1496 | version = "0.2.4" 1497 | source = "registry+https://github.com/rust-lang/crates.io-index" 1498 | checksum = "3528ecfd12c466c6f163363caf2d02a71161dd5e1cc6ae7b34207ea2d42d81ed" 1499 | 1500 | [[package]] 1501 | name = "typenum" 1502 | version = "1.16.0" 1503 | source = "registry+https://github.com/rust-lang/crates.io-index" 1504 | checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba" 1505 | 1506 | [[package]] 1507 | name = "ucd-trie" 1508 | version = "0.1.5" 1509 | source = "registry+https://github.com/rust-lang/crates.io-index" 1510 | checksum = "9e79c4d996edb816c91e4308506774452e55e95c3c9de07b6729e17e15a5ef81" 1511 | 1512 | [[package]] 1513 | name = "unicode-bidi" 1514 | version = "0.3.13" 1515 | source = "registry+https://github.com/rust-lang/crates.io-index" 1516 | checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460" 1517 | 1518 | [[package]] 1519 | name = "unicode-ident" 1520 | version = "1.0.8" 1521 | source = "registry+https://github.com/rust-lang/crates.io-index" 1522 | checksum = "e5464a87b239f13a63a501f2701565754bae92d243d4bb7eb12f6d57d2269bf4" 1523 | 1524 | [[package]] 1525 | name = "unicode-normalization" 1526 | version = "0.1.22" 1527 | source = "registry+https://github.com/rust-lang/crates.io-index" 1528 | checksum = "5c5713f0fc4b5db668a2ac63cdb7bb4469d8c9fed047b1d0292cc7b0ce2ba921" 1529 | dependencies = [ 1530 | "tinyvec", 
1531 | ] 1532 | 1533 | [[package]] 1534 | name = "untrusted" 1535 | version = "0.7.1" 1536 | source = "registry+https://github.com/rust-lang/crates.io-index" 1537 | checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a" 1538 | 1539 | [[package]] 1540 | name = "url" 1541 | version = "2.3.1" 1542 | source = "registry+https://github.com/rust-lang/crates.io-index" 1543 | checksum = "0d68c799ae75762b8c3fe375feb6600ef5602c883c5d21eb51c09f22b83c4643" 1544 | dependencies = [ 1545 | "form_urlencoded", 1546 | "idna", 1547 | "percent-encoding", 1548 | ] 1549 | 1550 | [[package]] 1551 | name = "utf8parse" 1552 | version = "0.2.1" 1553 | source = "registry+https://github.com/rust-lang/crates.io-index" 1554 | checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" 1555 | 1556 | [[package]] 1557 | name = "vcpkg" 1558 | version = "0.2.15" 1559 | source = "registry+https://github.com/rust-lang/crates.io-index" 1560 | checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" 1561 | 1562 | [[package]] 1563 | name = "version_check" 1564 | version = "0.9.4" 1565 | source = "registry+https://github.com/rust-lang/crates.io-index" 1566 | checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" 1567 | 1568 | [[package]] 1569 | name = "walkdir" 1570 | version = "2.3.3" 1571 | source = "registry+https://github.com/rust-lang/crates.io-index" 1572 | checksum = "36df944cda56c7d8d8b7496af378e6b16de9284591917d307c9b4d313c44e698" 1573 | dependencies = [ 1574 | "same-file", 1575 | "winapi-util", 1576 | ] 1577 | 1578 | [[package]] 1579 | name = "want" 1580 | version = "0.3.0" 1581 | source = "registry+https://github.com/rust-lang/crates.io-index" 1582 | checksum = "1ce8a968cb1cd110d136ff8b819a556d6fb6d919363c61534f6860c7eb172ba0" 1583 | dependencies = [ 1584 | "log", 1585 | "try-lock", 1586 | ] 1587 | 1588 | [[package]] 1589 | name = "wasi" 1590 | version = "0.11.0+wasi-snapshot-preview1" 1591 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 1592 | checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" 1593 | 1594 | [[package]] 1595 | name = "wasm-bindgen" 1596 | version = "0.2.85" 1597 | source = "registry+https://github.com/rust-lang/crates.io-index" 1598 | checksum = "5b6cb788c4e39112fbe1822277ef6fb3c55cd86b95cb3d3c4c1c9597e4ac74b4" 1599 | dependencies = [ 1600 | "cfg-if", 1601 | "wasm-bindgen-macro", 1602 | ] 1603 | 1604 | [[package]] 1605 | name = "wasm-bindgen-backend" 1606 | version = "0.2.85" 1607 | source = "registry+https://github.com/rust-lang/crates.io-index" 1608 | checksum = "35e522ed4105a9d626d885b35d62501b30d9666283a5c8be12c14a8bdafe7822" 1609 | dependencies = [ 1610 | "bumpalo", 1611 | "log", 1612 | "once_cell", 1613 | "proc-macro2", 1614 | "quote", 1615 | "syn", 1616 | "wasm-bindgen-shared", 1617 | ] 1618 | 1619 | [[package]] 1620 | name = "wasm-bindgen-futures" 1621 | version = "0.4.35" 1622 | source = "registry+https://github.com/rust-lang/crates.io-index" 1623 | checksum = "083abe15c5d88556b77bdf7aef403625be9e327ad37c62c4e4129af740168163" 1624 | dependencies = [ 1625 | "cfg-if", 1626 | "js-sys", 1627 | "wasm-bindgen", 1628 | "web-sys", 1629 | ] 1630 | 1631 | [[package]] 1632 | name = "wasm-bindgen-macro" 1633 | version = "0.2.85" 1634 | source = "registry+https://github.com/rust-lang/crates.io-index" 1635 | checksum = "358a79a0cb89d21db8120cbfb91392335913e4890665b1a7981d9e956903b434" 1636 | dependencies = [ 1637 | "quote", 1638 | "wasm-bindgen-macro-support", 1639 | ] 1640 | 1641 | [[package]] 1642 | name = "wasm-bindgen-macro-support" 1643 | version = "0.2.85" 1644 | source = "registry+https://github.com/rust-lang/crates.io-index" 1645 | checksum = "4783ce29f09b9d93134d41297aded3a712b7b979e9c6f28c32cb88c973a94869" 1646 | dependencies = [ 1647 | "proc-macro2", 1648 | "quote", 1649 | "syn", 1650 | "wasm-bindgen-backend", 1651 | "wasm-bindgen-shared", 1652 | ] 1653 | 1654 | [[package]] 1655 | name = 
"wasm-bindgen-shared" 1656 | version = "0.2.85" 1657 | source = "registry+https://github.com/rust-lang/crates.io-index" 1658 | checksum = "a901d592cafaa4d711bc324edfaff879ac700b19c3dfd60058d2b445be2691eb" 1659 | 1660 | [[package]] 1661 | name = "web-sys" 1662 | version = "0.3.62" 1663 | source = "registry+https://github.com/rust-lang/crates.io-index" 1664 | checksum = "16b5f940c7edfdc6d12126d98c9ef4d1b3d470011c47c76a6581df47ad9ba721" 1665 | dependencies = [ 1666 | "js-sys", 1667 | "wasm-bindgen", 1668 | ] 1669 | 1670 | [[package]] 1671 | name = "webpki" 1672 | version = "0.22.0" 1673 | source = "registry+https://github.com/rust-lang/crates.io-index" 1674 | checksum = "f095d78192e208183081cc07bc5515ef55216397af48b873e5edcd72637fa1bd" 1675 | dependencies = [ 1676 | "ring", 1677 | "untrusted", 1678 | ] 1679 | 1680 | [[package]] 1681 | name = "webpki-roots" 1682 | version = "0.22.6" 1683 | source = "registry+https://github.com/rust-lang/crates.io-index" 1684 | checksum = "b6c71e40d7d2c34a5106301fb632274ca37242cd0c9d3e64dbece371a40a2d87" 1685 | dependencies = [ 1686 | "webpki", 1687 | ] 1688 | 1689 | [[package]] 1690 | name = "winapi" 1691 | version = "0.3.9" 1692 | source = "registry+https://github.com/rust-lang/crates.io-index" 1693 | checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" 1694 | dependencies = [ 1695 | "winapi-i686-pc-windows-gnu", 1696 | "winapi-x86_64-pc-windows-gnu", 1697 | ] 1698 | 1699 | [[package]] 1700 | name = "winapi-i686-pc-windows-gnu" 1701 | version = "0.4.0" 1702 | source = "registry+https://github.com/rust-lang/crates.io-index" 1703 | checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" 1704 | 1705 | [[package]] 1706 | name = "winapi-util" 1707 | version = "0.1.5" 1708 | source = "registry+https://github.com/rust-lang/crates.io-index" 1709 | checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" 1710 | dependencies = [ 1711 | "winapi", 1712 | ] 1713 | 1714 | 
[[package]] 1715 | name = "winapi-x86_64-pc-windows-gnu" 1716 | version = "0.4.0" 1717 | source = "registry+https://github.com/rust-lang/crates.io-index" 1718 | checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" 1719 | 1720 | [[package]] 1721 | name = "windows-sys" 1722 | version = "0.42.0" 1723 | source = "registry+https://github.com/rust-lang/crates.io-index" 1724 | checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7" 1725 | dependencies = [ 1726 | "windows_aarch64_gnullvm 0.42.2", 1727 | "windows_aarch64_msvc 0.42.2", 1728 | "windows_i686_gnu 0.42.2", 1729 | "windows_i686_msvc 0.42.2", 1730 | "windows_x86_64_gnu 0.42.2", 1731 | "windows_x86_64_gnullvm 0.42.2", 1732 | "windows_x86_64_msvc 0.42.2", 1733 | ] 1734 | 1735 | [[package]] 1736 | name = "windows-sys" 1737 | version = "0.45.0" 1738 | source = "registry+https://github.com/rust-lang/crates.io-index" 1739 | checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0" 1740 | dependencies = [ 1741 | "windows-targets 0.42.2", 1742 | ] 1743 | 1744 | [[package]] 1745 | name = "windows-sys" 1746 | version = "0.48.0" 1747 | source = "registry+https://github.com/rust-lang/crates.io-index" 1748 | checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" 1749 | dependencies = [ 1750 | "windows-targets 0.48.0", 1751 | ] 1752 | 1753 | [[package]] 1754 | name = "windows-targets" 1755 | version = "0.42.2" 1756 | source = "registry+https://github.com/rust-lang/crates.io-index" 1757 | checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071" 1758 | dependencies = [ 1759 | "windows_aarch64_gnullvm 0.42.2", 1760 | "windows_aarch64_msvc 0.42.2", 1761 | "windows_i686_gnu 0.42.2", 1762 | "windows_i686_msvc 0.42.2", 1763 | "windows_x86_64_gnu 0.42.2", 1764 | "windows_x86_64_gnullvm 0.42.2", 1765 | "windows_x86_64_msvc 0.42.2", 1766 | ] 1767 | 1768 | [[package]] 1769 | name = "windows-targets" 1770 | version = "0.48.0" 1771 
| source = "registry+https://github.com/rust-lang/crates.io-index" 1772 | checksum = "7b1eb6f0cd7c80c79759c929114ef071b87354ce476d9d94271031c0497adfd5" 1773 | dependencies = [ 1774 | "windows_aarch64_gnullvm 0.48.0", 1775 | "windows_aarch64_msvc 0.48.0", 1776 | "windows_i686_gnu 0.48.0", 1777 | "windows_i686_msvc 0.48.0", 1778 | "windows_x86_64_gnu 0.48.0", 1779 | "windows_x86_64_gnullvm 0.48.0", 1780 | "windows_x86_64_msvc 0.48.0", 1781 | ] 1782 | 1783 | [[package]] 1784 | name = "windows_aarch64_gnullvm" 1785 | version = "0.42.2" 1786 | source = "registry+https://github.com/rust-lang/crates.io-index" 1787 | checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8" 1788 | 1789 | [[package]] 1790 | name = "windows_aarch64_gnullvm" 1791 | version = "0.48.0" 1792 | source = "registry+https://github.com/rust-lang/crates.io-index" 1793 | checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" 1794 | 1795 | [[package]] 1796 | name = "windows_aarch64_msvc" 1797 | version = "0.42.2" 1798 | source = "registry+https://github.com/rust-lang/crates.io-index" 1799 | checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43" 1800 | 1801 | [[package]] 1802 | name = "windows_aarch64_msvc" 1803 | version = "0.48.0" 1804 | source = "registry+https://github.com/rust-lang/crates.io-index" 1805 | checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" 1806 | 1807 | [[package]] 1808 | name = "windows_i686_gnu" 1809 | version = "0.42.2" 1810 | source = "registry+https://github.com/rust-lang/crates.io-index" 1811 | checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f" 1812 | 1813 | [[package]] 1814 | name = "windows_i686_gnu" 1815 | version = "0.48.0" 1816 | source = "registry+https://github.com/rust-lang/crates.io-index" 1817 | checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" 1818 | 1819 | [[package]] 1820 | name = "windows_i686_msvc" 1821 | version = 
"0.42.2" 1822 | source = "registry+https://github.com/rust-lang/crates.io-index" 1823 | checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060" 1824 | 1825 | [[package]] 1826 | name = "windows_i686_msvc" 1827 | version = "0.48.0" 1828 | source = "registry+https://github.com/rust-lang/crates.io-index" 1829 | checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" 1830 | 1831 | [[package]] 1832 | name = "windows_x86_64_gnu" 1833 | version = "0.42.2" 1834 | source = "registry+https://github.com/rust-lang/crates.io-index" 1835 | checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36" 1836 | 1837 | [[package]] 1838 | name = "windows_x86_64_gnu" 1839 | version = "0.48.0" 1840 | source = "registry+https://github.com/rust-lang/crates.io-index" 1841 | checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" 1842 | 1843 | [[package]] 1844 | name = "windows_x86_64_gnullvm" 1845 | version = "0.42.2" 1846 | source = "registry+https://github.com/rust-lang/crates.io-index" 1847 | checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3" 1848 | 1849 | [[package]] 1850 | name = "windows_x86_64_gnullvm" 1851 | version = "0.48.0" 1852 | source = "registry+https://github.com/rust-lang/crates.io-index" 1853 | checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" 1854 | 1855 | [[package]] 1856 | name = "windows_x86_64_msvc" 1857 | version = "0.42.2" 1858 | source = "registry+https://github.com/rust-lang/crates.io-index" 1859 | checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0" 1860 | 1861 | [[package]] 1862 | name = "windows_x86_64_msvc" 1863 | version = "0.48.0" 1864 | source = "registry+https://github.com/rust-lang/crates.io-index" 1865 | checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" 1866 | 1867 | [[package]] 1868 | name = "winreg" 1869 | version = "0.10.1" 1870 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 1871 | checksum = "80d0f4e272c85def139476380b12f9ac60926689dd2e01d4923222f40580869d" 1872 | dependencies = [ 1873 | "winapi", 1874 | ] 1875 | 1876 | [[package]] 1877 | name = "yaml-rust" 1878 | version = "0.4.5" 1879 | source = "registry+https://github.com/rust-lang/crates.io-index" 1880 | checksum = "56c1936c4cc7a1c9ab21a1ebb602eb942ba868cbd44a99cb7cdc5892335e1c85" 1881 | dependencies = [ 1882 | "linked-hash-map", 1883 | ] 1884 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "html_parser" 3 | version = "0.7.0" 4 | authors = ["Mathias Iversen "] 5 | edition = "2018" 6 | repository = "https://github.com/mathiversen/html-parser" 7 | license = "MIT" 8 | description = "A simple and general purpose html/xhtml parser" 9 | keywords = ["html", "parser", "json", "pest", "dom"] 10 | categories = ["parsing", "web-programming"] 11 | readme = "README.md" 12 | 13 | [dependencies] 14 | pest = "2.5.7" 15 | pest_derive = "2.5.7" 16 | thiserror = "1.0.40" 17 | serde = { version = "1.0.159", features = ["derive"] } 18 | serde_derive = "1.0.159" 19 | serde_json = "1.0.95" 20 | doc-comment = "0.3.3" 21 | 22 | [dev-dependencies] 23 | indoc = "2.0.1" 24 | insta = { version = "1.29.0", features = ["json"]} 25 | tempfile = "3.5.0" 26 | criterion = "0.4.0" 27 | reqwest = { version = "0.11.16", features = ["blocking", "rustls-tls"] } 28 | clap = { version = "4.2.1", features = ["derive"] } 29 | 30 | [[example]] 31 | name = "get_all_href" 32 | path = "examples/get_all_href/main.rs" 33 | 34 | [[example]] 35 | name = "simple_parser" 36 | path = "examples/simple_parser/main.rs" 37 | 38 | [[bench]] 39 | name = "bench_wikipedia" 40 | harness = false 41 | -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Mathias Iversen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Html parser 2 | 3 | A simple and general purpose html/xhtml parser lib/bin, using [Pest](https://pest.rs/). 
4 | 5 | ## Features 6 | 7 | - Parse html & xhtml (not xml processing instructions) 8 | - Parse html-documents 9 | - Parse html-fragments 10 | - Parse empty documents 11 | - Parse with the same api for both documents and fragments 12 | - Parse custom, non-standard, elements; `<cat/>`, `<Cat/>` and `<C4-t/>` 13 | - Removes comments 14 | - Removes dangling elements 15 | - Iterate over all nodes in the dom tree 16 | 17 | ## What is it not 18 | 19 | - It's not a high-performance browser-grade parser 20 | - It's not suitable for html validation 21 | - It's not a parser that includes element selection or dom manipulation 22 | 23 | If your requirements match any of the above, then you're most likely looking for one of the crates below: 24 | 25 | - [html5ever](https://crates.io/crates/html5ever) 26 | - [kuchiki](https://crates.io/crates/kuchiki) 27 | - [scraper](https://crates.io/crates/scraper) 28 | - or other crates using the `html5ever` parser 29 | 30 | ## Examples bin 31 | 32 | Parse html file 33 | 34 | ```shell 35 | html_parser index.html 36 | 37 | ``` 38 | 39 | Parse stdin with pretty output 40 | 41 | ```shell 42 | curl <website> | html_parser -p 43 | ``` 44 | 45 | ## Examples lib 46 | 47 | Parse html document 48 | 49 | ```rust 50 | use html_parser::Dom; 51 | 52 | fn main() { 53 | let html = r#" 54 | 55 | 56 | 57 | 58 | Html parser 59 | 60 | 61 |

Hello world

62 | 63 | 64 | "#; 65 | 66 | assert!(Dom::parse(html).is_ok()); 67 | } 68 | ``` 69 | 70 | Parse html fragment 71 | 72 | ```rust 73 | use html_parser::Dom; 74 | 75 | fn main() { 76 | let html = "
"; 77 | assert!(Dom::parse(html).is_ok()); 78 | } 79 | ``` 80 | 81 | Print to json 82 | 83 | ```rust 84 | use html_parser::{Dom, Result}; 85 | 86 | fn main() -> Result<()> { 87 | let html = "
"; 88 | let json = Dom::parse(html)?.to_json_pretty()?; 89 | println!("{}", json); 90 | Ok(()) 91 | } 92 | ``` 93 | -------------------------------------------------------------------------------- /benches/bench_wikipedia.rs: -------------------------------------------------------------------------------- 1 | use criterion::{criterion_group, criterion_main, Criterion}; 2 | use html_parser::Dom; 3 | 4 | static HTML: &'static str = include_str!("./wikipedia-2020-12-21.html"); 5 | 6 | fn wikipedia(c: &mut Criterion) { 7 | c.bench_function("wikipedia", |b| b.iter(|| Dom::parse(HTML).unwrap())); 8 | } 9 | 10 | criterion_group!(benches, wikipedia); 11 | criterion_main!(benches); 12 | -------------------------------------------------------------------------------- /examples/get_all_href/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Can you get all the links? 7 | 8 | 9 |
10 | 16 |
17 | 18 | -------------------------------------------------------------------------------- /examples/get_all_href/main.rs: -------------------------------------------------------------------------------- 1 | use html_parser::{Dom, Node, Result}; 2 | 3 | // This example illustrates how to use the library to get all of the anchor-hrefs from a document. 4 | 5 | fn main() -> Result<()> { 6 | let html = include_str!("./index.html"); 7 | let dom = Dom::parse(html)?; 8 | let iter = dom.children.get(0).unwrap().into_iter(); 9 | 10 | let hrefs = iter.filter_map(|item| match item { 11 | Node::Element(ref element) if element.name == "a" => element.attributes["href"].clone(), 12 | _ => None, 13 | }); 14 | 15 | println!("\nThe following links where found:"); 16 | for (index, href) in hrefs.enumerate() { 17 | println!("{}: {}", index + 1, href) 18 | } 19 | 20 | Ok(()) 21 | } 22 | -------------------------------------------------------------------------------- /examples/simple_parser/main.rs: -------------------------------------------------------------------------------- 1 | use clap::Parser; 2 | use html_parser::{Dom, Result}; 3 | use std::{ 4 | fs::File, 5 | io::{self, Read}, 6 | path::PathBuf, 7 | }; 8 | 9 | #[derive(Debug, Parser)] 10 | /// A simple and general purpose html/xhtml parser. 11 | struct Opt { 12 | #[arg(short, long)] 13 | /// Pretty-print the output. 14 | pretty_print: bool, 15 | 16 | #[arg(short, long)] 17 | /// Debug the parser, this will print errors to the console. 18 | debug: bool, 19 | 20 | /// Path to the file, or stdin (piped content). 21 | /// 22 | /// This argument can either be a path to the html-file that you would like to parse or the 23 | /// result of stdin. Note: Content over stdin needs to be finite, for now, as it is collected 24 | /// into a string and then processed by the parser. 
25 | input: Option, 26 | } 27 | 28 | fn main() -> Result<()> { 29 | let opt = Opt::parse(); 30 | 31 | let mut content = String::with_capacity(100_000); 32 | 33 | // If input is provided then use that as a path 34 | if let Some(path) = opt.input { 35 | let mut file = File::open(path)?; 36 | file.read_to_string(&mut content)?; 37 | 38 | // Else read from stdin, this enables piping 39 | // ex: `cat index.html | html_parser` 40 | } else { 41 | let stdin = io::stdin(); 42 | let mut handle = stdin.lock(); 43 | handle.read_to_string(&mut content)?; 44 | }; 45 | 46 | let dom = Dom::parse(&content)?; 47 | 48 | if opt.debug { 49 | for error in &dom.errors { 50 | println!("# {}", error); 51 | } 52 | } 53 | 54 | if opt.pretty_print { 55 | println!("{}", dom.to_json_pretty()?); 56 | } else { 57 | println!("{}", dom.to_json()?); 58 | } 59 | 60 | Ok(()) 61 | } 62 | -------------------------------------------------------------------------------- /src/dom/element.rs: -------------------------------------------------------------------------------- 1 | use super::node::Node; 2 | use super::span::SourceSpan; 3 | use serde::{Serialize, Serializer}; 4 | use std::collections::{BTreeMap, HashMap}; 5 | use std::default::Default; 6 | use std::result::Result; 7 | 8 | /// Normal: `
` or Void: ``and `` 9 | #[derive(Debug, Clone, Serialize, PartialEq)] 10 | #[serde(rename_all = "camelCase")] 11 | // TODO: Align with: https://html.spec.whatwg.org/multipage/syntax.html#elements-2 12 | pub enum ElementVariant { 13 | /// A normal element can have children, ex:
. 14 | Normal, 15 | /// A void element can't have children, ex: and 16 | Void, 17 | } 18 | 19 | pub type Attributes = HashMap>; 20 | 21 | /// Most of the parsed html nodes are elements, except for text 22 | #[derive(Debug, Clone, Serialize, PartialEq)] 23 | #[serde(rename_all = "camelCase")] 24 | pub struct Element { 25 | /// The id of the element 26 | #[serde(skip_serializing_if = "Option::is_none")] 27 | pub id: Option, 28 | 29 | /// The name / tag of the element 30 | pub name: String, 31 | 32 | /// The element variant, if it is of type void or not 33 | pub variant: ElementVariant, 34 | 35 | /// All of the elements attributes, except id and class 36 | #[serde(skip_serializing_if = "HashMap::is_empty")] 37 | #[serde(serialize_with = "ordered_map")] 38 | pub attributes: Attributes, 39 | 40 | /// All of the elements classes 41 | #[serde(skip_serializing_if = "Vec::is_empty")] 42 | pub classes: Vec, 43 | 44 | /// All of the elements child nodes 45 | #[serde(skip_serializing_if = "Vec::is_empty")] 46 | pub children: Vec, 47 | 48 | /// Span of the element in the parsed source 49 | #[serde(skip)] 50 | pub source_span: SourceSpan 51 | } 52 | 53 | impl Default for Element { 54 | fn default() -> Self { 55 | Self { 56 | id: None, 57 | name: "".to_string(), 58 | variant: ElementVariant::Void, 59 | classes: vec![], 60 | attributes: HashMap::new(), 61 | children: vec![], 62 | source_span: SourceSpan::default() 63 | } 64 | } 65 | } 66 | 67 | fn ordered_map(value: &Attributes, serializer: S) -> Result { 68 | let ordered: BTreeMap<_, _> = value.iter().collect(); 69 | ordered.serialize(serializer) 70 | } 71 | -------------------------------------------------------------------------------- /src/dom/formatting.rs: -------------------------------------------------------------------------------- 1 | use crate::error::Error; 2 | use crate::Result; 3 | use crate::Rule; 4 | use pest::error::Error as PestError; 5 | 6 | /// This function abstracts the formatting of errors away from the 
core logic inside parser, 7 | /// so that the file is easier to read. 8 | pub fn error_msg(error: PestError) -> Result { 9 | let message = error.renamed_rules(|rule| match *rule { 10 | Rule::EOI => "end of input".to_string(), 11 | Rule::doctype => "doctype element".to_string(), 12 | Rule::node_text => "text node".to_string(), 13 | Rule::node_element => "element node".to_string(), 14 | Rule::el_void => "void element".to_string(), 15 | Rule::el_void_xml => "void element with xml ending (/>)".to_string(), 16 | Rule::el_process_instruct => "xml processing instruction".to_string(), 17 | Rule::el_raw_text => "element with raw text (style or script)".to_string(), 18 | Rule::el_normal => "normal element".to_string(), 19 | Rule::el_dangling => "".to_string(), 20 | Rule::attr => "attribute (key=\"value\")".to_string(), 21 | Rule::attr_key => "attribute key".to_string(), 22 | Rule::attr_value => "attribute value".to_string(), 23 | Rule::el_name => "element name".to_string(), 24 | Rule::el_void_name_html => "void element name".to_string(), 25 | // TODO: Continue with this 26 | x => format!("{:?} ", x), 27 | }); 28 | Err(Error::Parsing(message.to_string())) 29 | } 30 | -------------------------------------------------------------------------------- /src/dom/mod.rs: -------------------------------------------------------------------------------- 1 | use crate::Result; 2 | use pest::{iterators::Pair, iterators::Pairs, Parser}; 3 | use serde::Serialize; 4 | use std::default::Default; 5 | 6 | use crate::error::Error; 7 | use crate::grammar::Grammar; 8 | use crate::Rule; 9 | 10 | pub mod element; 11 | pub mod formatting; 12 | pub mod node; 13 | pub mod span; 14 | 15 | use crate::dom::span::SourceSpan; 16 | use element::{Element, ElementVariant}; 17 | use node::Node; 18 | 19 | /// Document, DocumentFragment or Empty 20 | #[derive(Debug, Clone, PartialEq, Serialize)] 21 | #[serde(rename_all = "camelCase")] 22 | pub enum DomVariant { 23 | /// This means that the parsed html had the 
representation of an html document. The doctype is optional but a document should only have one root node with the name of html. 24 | /// Example: 25 | /// ```text 26 | /// 27 | /// 28 | /// 29 | /// 30 | ///

Hello world

31 | /// 32 | /// 33 | /// ``` 34 | Document, 35 | /// A document fragment means that the parsed html did not have the representation of a document. A fragment can have multiple root children of any name except html, body or head. 36 | /// Example: 37 | /// ```text 38 | ///

Hello world

39 | /// ``` 40 | DocumentFragment, 41 | /// An empty dom means that the input was empty 42 | Empty, 43 | } 44 | 45 | /// **The main struct** & the result of the parsed html 46 | #[derive(Debug, Clone, Serialize, PartialEq)] 47 | #[serde(rename_all = "camelCase")] 48 | pub struct Dom { 49 | /// The type of the tree that was parsed 50 | pub tree_type: DomVariant, 51 | 52 | /// All of the root children in the tree 53 | #[serde(skip_serializing_if = "Vec::is_empty")] 54 | pub children: Vec, 55 | 56 | /// A collection of all errors during parsing 57 | #[serde(skip_serializing)] 58 | pub errors: Vec, 59 | } 60 | 61 | impl Default for Dom { 62 | fn default() -> Self { 63 | Self { 64 | tree_type: DomVariant::Empty, 65 | children: vec![], 66 | errors: vec![], 67 | } 68 | } 69 | } 70 | 71 | impl Dom { 72 | pub fn parse(input: &str) -> Result { 73 | let pairs = match Grammar::parse(Rule::html, input) { 74 | Ok(pairs) => pairs, 75 | Err(error) => return formatting::error_msg(error), 76 | }; 77 | Self::build_dom(pairs) 78 | } 79 | 80 | pub fn to_json(&self) -> Result { 81 | Ok(serde_json::to_string(self)?) 82 | } 83 | 84 | pub fn to_json_pretty(&self) -> Result { 85 | Ok(serde_json::to_string_pretty(self)?) 86 | } 87 | 88 | fn build_dom(pairs: Pairs) -> Result { 89 | let mut dom = Self::default(); 90 | 91 | // NOTE: The logic is roughly as follows: 92 | // 1) A document containing nothing but comments is DomVariant::Empty even though it will have 93 | // children in this first pass. We fix this in the next section. This allows us to use 94 | // DomVariant::Empty to indicate "we haven't decided the type yet". 95 | // 2) If the type is DomVariant::Empty _so far_, then it can be changed to DomVariant::Document 96 | // or DomVariant::DocumentFragment. DomVariant is only selected in this stage if we see a 97 | // DOCTYPE tag. Comments do not change the type. 98 | // 3) If the type is non-empty, we don't re-set the type. 
We do look for conflicts between 99 | // the type and the tokens in the next stage. 100 | for pair in pairs { 101 | match pair.as_rule() { 102 | // A tag means a full-fledged document. Note that because of the way 103 | // the grammar is written, we will only get this token if the occurs 104 | // before any other tag; otherwise it will be parsed as a custom tag. 105 | Rule::doctype => { 106 | if dom.tree_type == DomVariant::Empty { 107 | dom.tree_type = DomVariant::Document; 108 | } 109 | } 110 | 111 | // If we see an element, build the sub-tree and add it as a child. If we don't 112 | // have a document type yet (i.e. "empty"), select DocumentFragment 113 | Rule::node_element => match Self::build_node_element(pair, &mut dom) { 114 | Ok(el) => { 115 | if let Some(node) = el { 116 | if dom.tree_type == DomVariant::Empty { 117 | dom.tree_type = DomVariant::DocumentFragment; 118 | }; 119 | dom.children.push(node); 120 | } 121 | } 122 | Err(error) => { 123 | dom.errors.push(format!("{}", error)); 124 | } 125 | }, 126 | 127 | // Similar to an element, we add it as a child and select DocumentFragment if we 128 | // don't already have a document type. 129 | Rule::node_text => { 130 | if dom.tree_type == DomVariant::Empty { 131 | dom.tree_type = DomVariant::DocumentFragment; 132 | } 133 | let text = pair.as_str().to_string(); 134 | if !text.trim().is_empty() { 135 | dom.children.push(Node::Text(text)); 136 | } 137 | } 138 | 139 | // Store comments as a child, but it doesn't affect the document type selection 140 | // until the next phase (validation). 141 | Rule::node_comment => { 142 | dom.children 143 | .push(Node::Comment(pair.into_inner().as_str().to_string())); 144 | } 145 | 146 | // Ignore 'end of input', which then allows the catch-all unreachable!() arm to 147 | // function properly. 
148 | Rule::EOI => (), 149 | 150 | // This should be unreachable, due to the way the grammar is written 151 | _ => unreachable!("[build dom] unknown rule: {:?}", pair.as_rule()), 152 | }; 153 | } 154 | 155 | // Implement some checks on the generated dom's data and initial type. The type may be 156 | // modified in this section. 157 | match dom.tree_type { 158 | // A DomVariant::Empty can only have comments. Anything else is an error. 159 | DomVariant::Empty => { 160 | for node in &dom.children { 161 | if let Node::Comment(_) = node { 162 | // An "empty" document, but it has comments - this is where we cleanup the 163 | // earlier assumption that a document with only comments is "empty". 164 | // Really, it is a "fragment". 165 | dom.tree_type = DomVariant::DocumentFragment 166 | } else { 167 | // Anything else (i.e. Text() or Element() ) can't happen at the top level; 168 | // if we had seen one, we would have set the document type above 169 | unreachable!("[build dom] empty document with an Element {:?}", node) 170 | } 171 | } 172 | } 173 | 174 | // A DomVariant::Document can only have comments and an node at the top level. 175 | // Only one tag is permitted. 176 | DomVariant::Document => { 177 | if dom 178 | .children 179 | .iter() 180 | .filter(|x| match x { 181 | Node::Element(el) if el.name.to_lowercase() == "html" => true, 182 | _ => false, 183 | }) 184 | .count() 185 | > 1 186 | { 187 | return Err(Error::Parsing(format!("Document with multiple HTML tags",))); 188 | } 189 | } 190 | 191 | // A DomVariant::DocumentFragment should not have , or tags at the 192 | // top-level. If we find an tag, then we consider this a Document instead (if 193 | // it comes before any other elements, and if there is only one tag). 
194 | DomVariant::DocumentFragment => { 195 | let mut seen_html = false; 196 | let mut seen_elements = false; 197 | 198 | for node in &dom.children { 199 | match node { 200 | // Nodes other than - reject and 201 | Node::Element(ref el) if el.name.clone().to_lowercase() != "html" => { 202 | if el.name == "head" || el.name == "body" { 203 | return Err(Error::Parsing(format!( 204 | "A document fragment should not include {}", 205 | el.name 206 | ))); 207 | } 208 | seen_elements = true; 209 | } 210 | // Nodes - one (before any other elements) is okay 211 | Node::Element(ref el) if el.name.clone().to_lowercase() == "html" => { 212 | if seen_html || seen_elements { 213 | return Err(Error::Parsing(format!( 214 | "A document fragment should not include {}", 215 | el.name 216 | ))); 217 | }; 218 | 219 | // A fragment with just an tag is a document 220 | dom.tree_type = DomVariant::Document; 221 | seen_html = true; 222 | } 223 | // Comment() and Text() nodes are permitted at the top-level of a 224 | // DocumentFragment 225 | _ => (), 226 | } 227 | } 228 | } 229 | } 230 | 231 | // The result is the validated tree 232 | Ok(dom) 233 | } 234 | 235 | fn build_node_element(pair: Pair, dom: &mut Dom) -> Result> { 236 | let source_span = { 237 | let pair_span = pair.as_span(); 238 | let (start_line, start_column) = pair_span.start_pos().line_col(); 239 | let (end_line, end_column) = pair_span.end_pos().line_col(); 240 | 241 | SourceSpan::new( 242 | String::from(pair_span.as_str()), 243 | start_line, 244 | end_line, 245 | start_column, 246 | end_column, 247 | ) 248 | }; 249 | 250 | let mut element = Element { 251 | source_span, 252 | ..Element::default() 253 | }; 254 | 255 | for pair in pair.into_inner() { 256 | match pair.as_rule() { 257 | Rule::node_element | Rule::el_raw_text => { 258 | match Self::build_node_element(pair, dom) { 259 | Ok(el) => { 260 | if let Some(child_element) = el { 261 | element.children.push(child_element) 262 | } 263 | } 264 | Err(error) => { 265 | 
dom.errors.push(format!("{}", error)); 266 | } 267 | } 268 | } 269 | Rule::node_text | Rule::el_raw_text_content => { 270 | let text = pair.as_str().to_string(); 271 | if !text.trim().is_empty() { 272 | element.children.push(Node::Text(text)); 273 | } 274 | } 275 | Rule::node_comment => { 276 | element 277 | .children 278 | .push(Node::Comment(pair.into_inner().as_str().to_string())); 279 | } 280 | // TODO: To enable some kind of validation we should probably align this with 281 | // https://html.spec.whatwg.org/multipage/syntax.html#elements-2 282 | // Also see element variants 283 | Rule::el_name | Rule::el_void_name | Rule::el_raw_text_name => { 284 | element.name = pair.as_str().to_string(); 285 | } 286 | Rule::attr => match Self::build_attribute(pair.into_inner()) { 287 | Ok((attr_key, attr_value)) => { 288 | match attr_key.as_str() { 289 | "id" => element.id = attr_value, 290 | "class" => { 291 | if let Some(classes) = attr_value { 292 | let classes = classes.split_whitespace().collect::>(); 293 | for class in classes { 294 | element.classes.push(class.to_string()); 295 | } 296 | } 297 | } 298 | _ => { 299 | element.attributes.insert(attr_key, attr_value); 300 | } 301 | }; 302 | } 303 | Err(error) => { 304 | dom.errors.push(format!("{}", error)); 305 | } 306 | }, 307 | Rule::el_normal_end | Rule::el_raw_text_end => { 308 | element.variant = ElementVariant::Normal; 309 | break; 310 | } 311 | Rule::el_dangling => (), 312 | Rule::EOI => (), 313 | _ => { 314 | return Err(Error::Parsing(format!( 315 | "Failed to create element at rule: {:?}", 316 | pair.as_rule() 317 | ))) 318 | } 319 | } 320 | } 321 | if element.name != "" { 322 | Ok(Some(Node::Element(element))) 323 | } else { 324 | Ok(None) 325 | } 326 | } 327 | 328 | fn build_attribute(pairs: Pairs) -> Result<(String, Option)> { 329 | let mut attribute = ("".to_string(), None); 330 | for pair in pairs { 331 | match pair.as_rule() { 332 | Rule::attr_key => { 333 | attribute.0 = pair.as_str().trim().to_string(); 
334 | } 335 | Rule::attr_non_quoted => { 336 | attribute.1 = Some(pair.as_str().trim().to_string()); 337 | } 338 | Rule::attr_quoted => { 339 | let inner_pair = pair 340 | .into_inner() 341 | .into_iter() 342 | .next() 343 | .expect("attribute value"); 344 | 345 | match inner_pair.as_rule() { 346 | Rule::attr_value => attribute.1 = Some(inner_pair.as_str().to_string()), 347 | _ => { 348 | return Err(Error::Parsing(format!( 349 | "Failed to parse attr value: {:?}", 350 | inner_pair.as_rule() 351 | ))) 352 | } 353 | } 354 | } 355 | _ => { 356 | return Err(Error::Parsing(format!( 357 | "Failed to parse attr: {:?}", 358 | pair.as_rule() 359 | ))) 360 | } 361 | } 362 | } 363 | Ok(attribute) 364 | } 365 | } 366 | -------------------------------------------------------------------------------- /src/dom/node.rs: -------------------------------------------------------------------------------- 1 | use super::element::Element; 2 | use serde::Serialize; 3 | 4 | #[derive(Debug, Clone, Serialize, PartialEq)] 5 | #[serde(untagged)] 6 | pub enum Node { 7 | Text(String), 8 | Element(Element), 9 | Comment(String), 10 | } 11 | 12 | impl Node { 13 | pub fn text(&self) -> Option<&str> { 14 | match self { 15 | Node::Text(t) => Some(t.as_str()), 16 | _ => None, 17 | } 18 | } 19 | 20 | pub fn element(&self) -> Option<&Element> { 21 | match self { 22 | Node::Element(e) => Some(e), 23 | _ => None, 24 | } 25 | } 26 | 27 | pub fn comment(&self) -> Option<&str> { 28 | match self { 29 | Node::Comment(t) => Some(t.as_str()), 30 | _ => None, 31 | } 32 | } 33 | } 34 | 35 | impl<'a> IntoIterator for &'a Node { 36 | type Item = &'a Node; 37 | type IntoIter = NodeIntoIterator<'a>; 38 | 39 | fn into_iter(self) -> Self::IntoIter { 40 | NodeIntoIterator { 41 | node: self, 42 | index: vec![], 43 | } 44 | } 45 | } 46 | 47 | pub struct NodeIntoIterator<'a> { 48 | node: &'a Node, 49 | // We add/remove to this vec each time we go up/down a node three 50 | index: Vec<(usize, &'a Node)>, 51 | } 52 | 53 | 
impl<'a> Iterator for NodeIntoIterator<'a> { 54 | type Item = &'a Node; 55 | 56 | fn next(&mut self) -> Option { 57 | // Get first child 58 | let child = match self.node { 59 | Node::Element(ref e) => e.children.get(0), 60 | _ => None, 61 | }; 62 | 63 | let result = match child { 64 | // If element has child, return child 65 | Some(child) => { 66 | self.index.push((0, self.node)); 67 | self.node = child; 68 | Some(child) 69 | } 70 | // If element doesn't have a child, but is a child of another node 71 | None if self.index.len() > 0 => { 72 | let mut has_finished = false; 73 | let mut next_node = None; 74 | 75 | while !has_finished { 76 | // Try to get the next sibling of the parent node 77 | if let Some((sibling_index, parent)) = self.index.pop() { 78 | let next_sibling = sibling_index + 1; 79 | let sibling = if let Node::Element(ref e) = parent { 80 | e.children.get(next_sibling) 81 | } else { 82 | None 83 | }; 84 | if sibling.is_some() { 85 | has_finished = true; 86 | self.index.push((next_sibling, parent)); 87 | next_node = sibling; 88 | } else { 89 | continue; 90 | } 91 | // Break of there are no more parents 92 | } else { 93 | has_finished = true; 94 | } 95 | } 96 | 97 | if let Some(next_node) = next_node { 98 | self.node = next_node; 99 | } 100 | 101 | next_node 102 | } 103 | _ => None, 104 | }; 105 | 106 | result 107 | } 108 | } 109 | 110 | #[cfg(test)] 111 | mod tests { 112 | use super::*; 113 | 114 | #[test] 115 | fn node_utillity_functions() { 116 | let node = Node::Text("test".to_string()); 117 | 118 | assert_eq!(node.text(), Some("test")); 119 | assert_eq!(node.element(), None); 120 | assert_eq!(node.comment(), None); 121 | 122 | let node = Node::Element(Element::default()); 123 | 124 | assert_eq!(node.text(), None); 125 | assert_eq!(node.element(), Some(&Element::default())); 126 | assert_eq!(node.comment(), None); 127 | 128 | let node = Node::Comment("test".to_string()); 129 | 130 | assert_eq!(node.text(), None); 131 | assert_eq!(node.element(), 
None); 132 | assert_eq!(node.comment(), Some("test")); 133 | } 134 | } -------------------------------------------------------------------------------- /src/dom/span.rs: -------------------------------------------------------------------------------- 1 | use serde::{Serialize}; 2 | 3 | /// Span of the information in the parsed source. 4 | #[derive(Debug, Default, Clone, Serialize, PartialEq)] 5 | #[serde(rename_all = "camelCase")] 6 | pub struct SourceSpan { 7 | pub text: String, 8 | pub start_line: usize, 9 | pub end_line: usize, 10 | pub start_column: usize, 11 | pub end_column: usize, 12 | } 13 | 14 | impl SourceSpan { 15 | pub fn new( 16 | text: String, 17 | start_line: usize, 18 | end_line: usize, 19 | start_column: usize, 20 | end_column: usize, 21 | ) -> Self { 22 | Self { 23 | text, 24 | start_line, 25 | end_line, 26 | start_column, 27 | end_column, 28 | } 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/error.rs: -------------------------------------------------------------------------------- 1 | use thiserror::Error as ThisError; 2 | 3 | #[derive(ThisError, Debug)] 4 | pub enum Error { 5 | #[error("{0}")] 6 | Parsing(String), 7 | #[error("{0}")] 8 | Cli(String), 9 | #[error("{0}")] 10 | IO(#[from] std::io::Error), 11 | #[error("{0}")] 12 | Serde(#[from] serde_json::Error), 13 | } 14 | 15 | pub type Result = std::result::Result; 16 | -------------------------------------------------------------------------------- /src/grammar/mod.rs: -------------------------------------------------------------------------------- 1 | use pest_derive::Parser; 2 | 3 | #[derive(Parser)] 4 | #[grammar = "grammar/rules.pest"] 5 | pub struct Grammar; 6 | -------------------------------------------------------------------------------- /src/grammar/rules.pest: -------------------------------------------------------------------------------- 1 | // 2 | // HTML 3 | // 4 | html = _{ 5 | SOI 6 | ~ node_comment* 7 | ~ doctype? 
8 | ~ node* 9 | ~ EOI 10 | } 11 | 12 | // 13 | // DOCTYPE 14 | // 15 | doctype = { WSP* ~ chevron_left_bang ~ ^"doctype" ~ WSP* ~ attr* ~ WSP* ~ chevron_right_normal} 16 | 17 | // 18 | // NODES 19 | // 20 | node = _{ node_comment | node_element | node_text } 21 | node_comment = { WSP* ~ (comment_if | comment_normal) ~ WSP* } 22 | node_text = { (!(node_element | comment_tag_start | chevron_left_bang) ~ ANY)+ } 23 | node_element = { el_void | el_void_xml | el_process_instruct | el_raw_text | el_normal | el_dangling } 24 | 25 | // 26 | // COMMENTS 27 | // 28 | comment_normal = _{ comment_tag_start ~ comment_body ~ comment_tag_end } 29 | comment_body = { (!comment_tag_end ~ ANY)* } 30 | comment_tag_start = _{ chevron_left_bang ~ "--" ~ WSP* } 31 | comment_tag_end = _{ WSP* ~ "--" ~ chevron_right_normal } 32 | 33 | // Compatibility with old IE browsers... This is not necessary for newer browsers 34 | comment_if = _{ comment_if_start ~ comment_if_body ~ comment_if_end } 35 | comment_if_body = { (!comment_if_end ~ ANY)* } 36 | comment_if_start = _{ comment_tag_start ~ "[" ~ ^"if" } 37 | comment_if_end = _{ chevron_left_bang ~ "[" ~ ^"endif" ~ "]" ~ comment_tag_end } 38 | 39 | // 40 | // ATTRIBUTES 41 | // 42 | attr = { attr_key ~ (equal ~ WSP* ~ (attr_non_quoted | attr_quoted ))? } 43 | attr_quoted = ${PUSH(quote) ~ attr_value ~ POP } 44 | attr_non_quoted = @{ !quote ~ (!(WSP | chevron_right) ~ ANY)* } 45 | attr_key = { WSP* ~ ASCII_ALPHA ~ text_chars* ~ WSP* } 46 | attr_value = { WSP* ~ (!PEEK ~ ANY)* ~ WSP* } 47 | 48 | // 49 | // ELEMENTS 50 | // 51 | el_name = @{ ASCII_ALPHA ~ text_chars* } 52 | 53 | // Void element aka self-closing element 54 | // Ex:
55 | el_void_name_html = @{ 56 | ^"area" 57 | | ^"base" 58 | | ^"br" 59 | | ^"col" 60 | | ^"command" 61 | | ^"embed" 62 | | ^"hr" 63 | | ^"img" 64 | | ^"input" 65 | | ^"keygen" 66 | | ^"link" 67 | | ^"meta" 68 | | ^"param" 69 | | ^"source" 70 | | ^"track" 71 | | ^"wbr" 72 | | ^"meta" 73 | } 74 | // NOTE: This should not have to be a rule, but people don't know what void elements are... 75 | el_void_name_svg = @{ 76 | ^"path" 77 | | ^"polygon" 78 | | ^"rect" 79 | | ^"circle" 80 | } 81 | el_void_name = @{ el_void_name_html | el_void_name_svg } 82 | el_void = _{ chevron_left_normal ~ WSP* ~ el_void_name ~ WSP* ~ attr* ~ WSP* ~ (chevron_right_normal | chevron_right_closed) } 83 | el_void_xml = _{ chevron_left_normal ~ WSP* ~ el_name ~ WSP* ~ attr* ~ WSP* ~ chevron_right_closed } 84 | 85 | // Open elements are the default elements that can take children 86 | // and have both a start tag and an end tag 87 | // Ex: 88 | el_normal = _{ el_normal_start ~ (!el_normal_end ~ node)* ~ el_normal_end } 89 | el_normal_start = _{ chevron_left_normal ~ WSP* ~ PUSH(el_name) ~ WSP* ~ attr* ~ WSP* ~ chevron_right_normal} 90 | el_normal_end = { chevron_left_closed ~ WSP* ~ POP ~ WSP* ~ chevron_right_normal} 91 | 92 | // Raw text elements are elements with text/script content that 93 | // might interfere with the normal html syntax 94 | el_raw_text_name = { 95 | ^"style" 96 | | ^"script" 97 | | ^"title" 98 | | ^"textarea" 99 | } 100 | el_raw_text_content = { (!el_raw_text_end ~ ANY)* } 101 | el_raw_text = _{ el_raw_text_start ~ el_raw_text_content ~ el_raw_text_end } 102 | el_raw_text_start = _{ chevron_left_normal ~ WSP* ~ PUSH(el_raw_text_name) ~ WSP* ~ attr* ~ WSP* ~ chevron_right_normal ~ WSP*} 103 | el_raw_text_end = { WSP* ~ chevron_left_closed ~ WSP* ~ POP ~ WSP* ~ chevron_right_normal} 104 | 105 | // XML processing instruction 106 | // Ex: 107 | el_process_instruct = { chevron_left_question ~ WSP* ~ el_name? 
~ WSP* ~ attr* ~ WSP* ~ chevron_right_question } 108 | 109 | // Catch dangling elements 110 | // Ex:
111 | el_dangling = { chevron_left_closed ~ WSP* ~ el_name ~ WSP* ~ chevron_right_normal} 112 | 113 | // 114 | // SYMBOLS / CHARACTERS 115 | // 116 | text_chars = _{'a'..'z' | 'A'..'Z' | "_" | "-" | ":" |'0'..'9'} 117 | 118 | chevron_left_normal = _{ "<" } 119 | chevron_left_closed = _{ "" } 124 | chevron_right_closed = _{ "/>" } 125 | chevron_right_question = _{ "?>" } 126 | chevron_right = _{ 127 | chevron_right_normal 128 | | chevron_right_closed 129 | | chevron_right_question 130 | } 131 | 132 | equal = _{ "=" } 133 | quote_dubble = _{ "\"" } 134 | quote_single = _{ "'" } 135 | quote = _{ quote_dubble | quote_single } 136 | WSP = _{ " " | "\t" | "\r" | "\n" } 137 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | //! [![github]](https://github.com/mathiversen/html-parser) 2 | //! 3 | //! [github]: https://img.shields.io/badge/github-8da0cb?style=for-the-badge&labelColor=555555&logo=github 4 | //! 5 | //! # Html parser 6 | //! 7 | //! A simple and general purpose html/xhtml parser lib/bin, using [Pest](https://pest.rs/). 8 | //! 9 | //! ## Features 10 | //! - Parse html & xhtml (not xml processing instructions) 11 | //! - Parse html-documents 12 | //! - Parse html-fragments 13 | //! - Parse empty documents 14 | //! - Parse with the same api for both documents and fragments 15 | //! - Parse custom, non-standard, elements; `<cat/>`, `<Cat/>` and `<C4-t/>` 16 | //! - Removes comments 17 | //! - Removes dangling elements 18 | //! - Iterate over all nodes in the dom tree 19 | //! 20 | //! ## What is it not 21 | //! 22 | //! - It's not a high-performance browser-grade parser 23 | //! - It's not suitable for html validation 24 | //! - It's not a parser that includes element selection or dom manipulation 25 | //! 26 | //! If your requirements match any of the above, then you're most likely looking for one of the crates below: 27 | //! 28 | //! 
- [html5ever](https://crates.io/crates/html5ever) 29 | //! - [kuchiki](https://crates.io/crates/kuchiki) 30 | //! - [scraper](https://crates.io/crates/scraper) 31 | //! - or other crates using the `html5ever` parser 32 | //! 33 | //! ## Examples bin 34 | //! 35 | //! Parse html file 36 | //! 37 | //! ```shell 38 | //! html_parser index.html 39 | //! 40 | //! ``` 41 | //! 42 | //! Parse stdin with pretty output 43 | //! 44 | //! ```shell 45 | //! curl <website> | html_parser -p 46 | //! ``` 47 | //! 48 | //! ## Examples lib 49 | //! 50 | //! Parse html document 51 | //! 52 | //! ```rust 53 | //! use html_parser::Dom; 54 | //! 55 | //! fn main() { 56 | //! let html = r#" 57 | //! 58 | //! 59 | //! 60 | //! 61 | //! Html parser 62 | //! 63 | //! 64 | //!

Hello world

65 | //! 66 | //! 67 | //! "#; 68 | //! 69 | //! assert!(Dom::parse(html).is_ok()); 70 | //! } 71 | //! ``` 72 | //! 73 | //! Parse html fragment 74 | //! 75 | //! ```rust 76 | //! use html_parser::Dom; 77 | //! 78 | //! fn main() { 79 | //! let html = "
"; 80 | //! assert!(Dom::parse(html).is_ok()); 81 | //! } 82 | //! ``` 83 | //! 84 | //! Print to json 85 | //! 86 | //! ```rust 87 | //! use html_parser::{Dom, Result}; 88 | //! 89 | //! fn main() -> Result<()> { 90 | //! let html = "
"; 91 | //! let json = Dom::parse(html)?.to_json_pretty()?; 92 | //! println!("{}", json); 93 | //! Ok(()) 94 | //! } 95 | //! ``` 96 | 97 | #![allow(clippy::needless_doctest_main)] 98 | 99 | mod dom; 100 | mod error; 101 | mod grammar; 102 | 103 | use grammar::Rule; 104 | 105 | pub use crate::dom::element::{Element, ElementVariant}; 106 | pub use crate::dom::node::Node; 107 | pub use crate::dom::Dom; 108 | pub use crate::dom::DomVariant; 109 | pub use crate::error::Error; 110 | pub use crate::error::Result; 111 | -------------------------------------------------------------------------------- /tests/bin.rs: -------------------------------------------------------------------------------- 1 | use html_parser::Result; 2 | use indoc::indoc; 3 | use std::io::Write; 4 | use std::process::Command; 5 | use tempfile::NamedTempFile; 6 | 7 | #[test] 8 | fn it_prints_out_processing_error() -> Result<()> { 9 | let html = indoc!( 10 | r#" 11 | 12 |
13 | "# 14 | ); 15 | 16 | let mut file = NamedTempFile::new()?; 17 | file.write_all(html.as_bytes())?; 18 | 19 | let output = Command::new("./target/debug/examples/simple_parser") 20 | .arg("-d") 21 | .arg(file.path()) 22 | .output() 23 | .unwrap(); 24 | 25 | let stdout = String::from_utf8(output.stdout).unwrap(); 26 | 27 | assert!(stdout.starts_with("# Failed to create element at rule: el_process_instruct")); 28 | Ok(()) 29 | } 30 | -------------------------------------------------------------------------------- /tests/comments.rs: -------------------------------------------------------------------------------- 1 | use html_parser::{Dom, Result}; 2 | use insta::assert_json_snapshot; 3 | 4 | #[test] 5 | fn it_can_parse_document_with_just_one_comment() -> Result<()> { 6 | let html = ""; 7 | let ast = Dom::parse(html)?; 8 | assert_json_snapshot!(ast); 9 | Ok(()) 10 | } 11 | #[test] 12 | fn it_can_parse_document_with_just_comments() -> Result<()> { 13 | let html = ""; 14 | let ast = Dom::parse(html)?; 15 | assert_json_snapshot!(ast); 16 | Ok(()) 17 | } 18 | -------------------------------------------------------------------------------- /tests/document.rs: -------------------------------------------------------------------------------- 1 | use html_parser::{Dom, Result}; 2 | use indoc::indoc; 3 | use insta::assert_json_snapshot; 4 | 5 | #[test] 6 | fn it_can_parse_minimal_document() -> Result<()> { 7 | let html = ""; 8 | let dom = Dom::parse(html)?; 9 | assert_json_snapshot!(dom); 10 | Ok(()) 11 | } 12 | #[test] 13 | fn it_can_parse_document_with_comments() -> Result<()> { 14 | let html = indoc!( 15 | r#" 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | "# 27 | ); 28 | let dom = Dom::parse(html)?; 29 | assert_json_snapshot!(dom); 30 | Ok(()) 31 | } 32 | #[test] 33 | fn it_error_when_doctype_and_multiple_html() { 34 | let html = ""; 35 | assert!(Dom::parse(html).is_err()); 36 | } 37 | 
-------------------------------------------------------------------------------- /tests/document_empty.rs: -------------------------------------------------------------------------------- 1 | use html_parser::{Dom, Result}; 2 | use insta::assert_json_snapshot; 3 | 4 | #[test] 5 | fn it_can_parse_empty_document() -> Result<()> { 6 | let html = ""; 7 | let dom = Dom::parse(html)?; 8 | assert_json_snapshot!(dom); 9 | Ok(()) 10 | } 11 | -------------------------------------------------------------------------------- /tests/document_fragment.rs: -------------------------------------------------------------------------------- 1 | use html_parser::{Dom, Result}; 2 | use insta::assert_json_snapshot; 3 | 4 | #[test] 5 | fn it_can_parse_single_div_as_fragment() -> Result<()> { 6 | let html = "
"; 7 | let dom = Dom::parse(html)?; 8 | assert_json_snapshot!(dom); 9 | Ok(()) 10 | } 11 | #[test] 12 | fn it_can_parse_single_text_as_fragment() -> Result<()> { 13 | let html = "hello"; 14 | let dom = Dom::parse(html)?; 15 | assert_json_snapshot!(dom); 16 | Ok(()) 17 | } 18 | #[test] 19 | fn it_can_parse_text_comment_element_as_fragment() -> Result<()> { 20 | let html = "hello
"; 21 | let dom = Dom::parse(html)?; 22 | assert_json_snapshot!(dom); 23 | Ok(()) 24 | } 25 | #[test] 26 | fn it_error_when_body_is_used_in_fragment_root() { 27 | let html = "
"; 28 | assert!(Dom::parse(html).is_err()); 29 | } 30 | #[test] 31 | fn it_error_when_head_is_used_in_fragment_root() { 32 | let html = "
"; 33 | assert!(Dom::parse(html).is_err()); 34 | } 35 | #[test] 36 | fn it_error_when_html_is_used_in_fragment_root() { 37 | let html = "
"; 38 | assert!(Dom::parse(html).is_err()); 39 | } 40 | -------------------------------------------------------------------------------- /tests/element.rs: -------------------------------------------------------------------------------- 1 | use html_parser::{Dom, Result}; 2 | use indoc::indoc; 3 | use insta::assert_json_snapshot; 4 | 5 | #[test] 6 | fn it_can_parse_one_element() -> Result<()> { 7 | let html = ""; 8 | let dom = Dom::parse(html)?; 9 | assert_json_snapshot!(dom); 10 | Ok(()) 11 | } 12 | #[test] 13 | fn it_can_parse_one_element_upper_case() -> Result<()> { 14 | let html = ""; 15 | let dom = Dom::parse(html)?; 16 | assert_json_snapshot!(dom); 17 | Ok(()) 18 | } 19 | #[test] 20 | fn it_can_parse_one_element_mixed_case() -> Result<()> { 21 | let html = ""; 22 | let dom = Dom::parse(html)?; 23 | assert_json_snapshot!(dom); 24 | Ok(()) 25 | } 26 | #[test] 27 | fn it_can_parse_one_element_mixed_case_numbers() -> Result<()> { 28 | let html = ""; 29 | let dom = Dom::parse(html)?; 30 | assert_json_snapshot!(dom); 31 | Ok(()) 32 | } 33 | #[test] 34 | fn it_can_parse_one_element_mixed_case_numbers_symbols() -> Result<()> { 35 | let html = ""; 36 | let dom = Dom::parse(html)?; 37 | assert_json_snapshot!(dom); 38 | Ok(()) 39 | } 40 | #[test] 41 | fn it_can_parse_multiple_elements() -> Result<()> { 42 | let html = "
"; 43 | let dom = Dom::parse(html)?; 44 | assert_json_snapshot!(dom); 45 | Ok(()) 46 | } 47 | #[test] 48 | fn it_can_parse_multiple_open_elements() -> Result<()> { 49 | let html = "
"; 50 | let dom = Dom::parse(html)?; 51 | assert_json_snapshot!(dom); 52 | Ok(()) 53 | } 54 | #[test] 55 | fn it_can_parse_nested_elements() -> Result<()> { 56 | let html = indoc!( 57 | r" 58 |
59 |
60 |
61 | " 62 | ); 63 | let dom = Dom::parse(html)?; 64 | assert_json_snapshot!(dom); 65 | Ok(()) 66 | } 67 | #[test] 68 | fn it_can_parse_nested_elements_mixed_children() -> Result<()> { 69 | let html = indoc!( 70 | r" 71 |
72 | 73 |
74 | Hello 75 |
76 | World 77 |
78 |
79 | " 80 | ); 81 | let dom = Dom::parse(html)?; 82 | assert_json_snapshot!(dom); 83 | Ok(()) 84 | } 85 | #[test] 86 | fn it_can_parse_deeply_nested() -> Result<()> { 87 | let html = indoc!( 88 | r#" 89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 | 98 | hello world 99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 | "# 108 | ); 109 | let dom = Dom::parse(html)?; 110 | assert_json_snapshot!(dom); 111 | Ok(()) 112 | } 113 | #[test] 114 | fn it_can_parse_script_with_content() -> Result<()> { 115 | let html = indoc!( 116 | r#" 117 | 142 | "# 143 | ); 144 | let dom = Dom::parse(html)?; 145 | assert_json_snapshot!(dom); 146 | Ok(()) 147 | } 148 | #[test] 149 | fn it_can_parse_style_with_content() -> Result<()> { 150 | let html = indoc!( 151 | r#" 152 | 162 | "# 163 | ); 164 | let dom = Dom::parse(html)?; 165 | assert_json_snapshot!(dom); 166 | Ok(()) 167 | } 168 | #[test] 169 | fn it_skips_dangling_elements() -> Result<()> { 170 | let html = indoc!( 171 | " 172 |
173 |
174 |
175 | " 176 | ); 177 | let dom = Dom::parse(html)?; 178 | assert_json_snapshot!(dom); 179 | Ok(()) 180 | } 181 | #[test] 182 | fn it_can_parse_broken_html() -> Result<()> { 183 | let html = "
"; 184 | let dom = Dom::parse(html)?; 185 | assert_json_snapshot!(dom); 186 | Ok(()) 187 | } 188 | #[test] 189 | fn it_errors_when_multiple_nested_elements_dont_match() -> Result<()> { 190 | let html = "
"; 191 | let dom = Dom::parse(html)?; 192 | assert_json_snapshot!(dom); 193 | Ok(()) 194 | } 195 | #[test] 196 | fn it_can_clone_node() { 197 | let html = indoc!( 198 | " 199 |
one
200 |
two
201 | " 202 | ); 203 | let dom = Dom::parse(html).unwrap(); 204 | let one = dom.children[0].clone(); 205 | assert_json_snapshot!(one); 206 | } 207 | #[test] 208 | fn it_can_clone_dom() { 209 | let html = indoc!( 210 | " 211 | 212 | 213 | Title 214 | 215 | 216 |

Hello world

217 | 218 | 219 | " 220 | ); 221 | let dom = Dom::parse(html).unwrap(); 222 | let dom_clone = dom.clone(); 223 | assert_eq!(dom, dom_clone); 224 | } 225 | 226 | #[test] 227 | fn it_can_deal_with_weird_whitespaces() { 228 | let html = indoc!( 229 | " 230 | 231 |
Text
232 | 233 | 234 | < div> Text
235 | 236 | 237 |
Text
238 | 239 | 240 |
Text < /div> 241 | 242 | 243 |
Text
244 | 245 | 246 | < div > Text < / div > 247 | " 248 | ); 249 | let dom = Dom::parse(html).unwrap(); 250 | assert_json_snapshot!(dom); 251 | } 252 | -------------------------------------------------------------------------------- /tests/element_attributes.rs: -------------------------------------------------------------------------------- 1 | use html_parser::{Dom, Result}; 2 | use insta::assert_json_snapshot; 3 | 4 | #[test] 5 | fn it_can_parse_double_quote() -> Result<()> { 6 | let html = "
"; 7 | let dom = Dom::parse(html)?; 8 | assert_json_snapshot!(dom); 9 | Ok(()) 10 | } 11 | #[test] 12 | fn it_can_parse_single_quote() -> Result<()> { 13 | let html = "
"; 14 | let dom = Dom::parse(html)?; 15 | assert_json_snapshot!(dom); 16 | Ok(()) 17 | } 18 | #[test] 19 | fn it_can_parse_no_quote() -> Result<()> { 20 | let html = "
"; 21 | let dom = Dom::parse(html)?; 22 | assert_json_snapshot!(dom); 23 | Ok(()) 24 | } 25 | #[test] 26 | fn it_can_parse_attribute_key_mixed_case_symbols() -> Result<()> { 27 | let html = "
"; 28 | let dom = Dom::parse(html)?; 29 | assert_json_snapshot!(dom); 30 | Ok(()) 31 | } 32 | #[test] 33 | fn it_can_parse_multiple_attributes_single_quote() -> Result<()> { 34 | let html = "
"; 35 | let dom = Dom::parse(html)?; 36 | assert_json_snapshot!(dom); 37 | Ok(()) 38 | } 39 | #[test] 40 | fn it_can_parse_multiple_attributes_where_whitespace_does_not_matter_for_keys() -> Result<()> { 41 | let html = "
"; 42 | let dom = Dom::parse(html)?; 43 | assert_json_snapshot!(dom); 44 | Ok(()) 45 | } 46 | #[test] 47 | fn it_can_parse_multiple_attributes_double_quote() -> Result<()> { 48 | let html = "
"; 49 | let dom = Dom::parse(html)?; 50 | assert_json_snapshot!(dom); 51 | Ok(()) 52 | } 53 | #[test] 54 | fn it_can_parse_multiple_attributes_no_quote() -> Result<()> { 55 | let html = "
"; 56 | let dom = Dom::parse(html)?; 57 | assert_json_snapshot!(dom); 58 | Ok(()) 59 | } 60 | #[test] 61 | fn it_can_parse_attribute_multiple_values_single_quote() -> Result<()> { 62 | let html = "
"; 63 | let dom = Dom::parse(html)?; 64 | assert_json_snapshot!(dom); 65 | Ok(()) 66 | } 67 | #[test] 68 | fn it_can_parse_attribute_multiple_values_double_quote() -> Result<()> { 69 | let html = "
"; 70 | let dom = Dom::parse(html)?; 71 | assert_json_snapshot!(dom); 72 | Ok(()) 73 | } 74 | #[test] 75 | fn it_can_parse_attribute_with_empty_value() -> Result<()> { 76 | let html = ""; 77 | let dom = Dom::parse(html)?; 78 | assert_json_snapshot!(dom); 79 | Ok(()) 80 | } 81 | 82 | #[test] 83 | fn it_can_parse_id() -> Result<()> { 84 | let html = ""; 85 | let dom = Dom::parse(html)?; 86 | assert_json_snapshot!(dom); 87 | Ok(()) 88 | } 89 | #[test] 90 | fn it_can_parse_classes() -> Result<()> { 91 | let html = ""; 92 | let dom = Dom::parse(html)?; 93 | assert_json_snapshot!(dom); 94 | Ok(()) 95 | } 96 | #[test] 97 | fn it_keeps_spaces_for_non_classes() -> Result<()> { 98 | let html = ""; 99 | let dom = Dom::parse(html)?; 100 | assert_json_snapshot!(dom); 101 | Ok(()) 102 | } 103 | -------------------------------------------------------------------------------- /tests/node_iter.rs: -------------------------------------------------------------------------------- 1 | use html_parser::{Dom, Node, Result}; 2 | use indoc::indoc; 3 | 4 | #[test] 5 | fn it_can_iter_1() -> Result<()> { 6 | let html = indoc! {" 7 | 8 | 9 | title 10 | 11 | 12 |
    13 |
  • 14 |
  • 15 |
  • 16 |
17 | 18 | 19 | "}; 20 | let dom = Dom::parse(&html)?; 21 | let root = dom.children.get(0).unwrap().into_iter(); 22 | let num_li = root.into_iter().fold(0, |mut acc, curr| match curr { 23 | Node::Element(ref e) => { 24 | if e.name == "li" { 25 | acc += 1; 26 | } 27 | acc 28 | } 29 | _ => acc, 30 | }); 31 | assert_eq!(num_li, 3); 32 | Ok(()) 33 | } 34 | -------------------------------------------------------------------------------- /tests/output.rs: -------------------------------------------------------------------------------- 1 | use html_parser::{Dom, Result}; 2 | use indoc::indoc; 3 | use insta::assert_json_snapshot; 4 | 5 | #[test] 6 | fn it_can_output_json() -> Result<()> { 7 | assert!(Dom::parse("
")?.to_json().is_ok()); 8 | Ok(()) 9 | } 10 | 11 | #[test] 12 | fn it_can_output_json_pretty() -> Result<()> { 13 | assert!(Dom::parse("
")?.to_json_pretty().is_ok()); 14 | Ok(()) 15 | } 16 | 17 | #[test] 18 | fn it_can_output_complex_html_as_json() -> Result<()> { 19 | let html = indoc!( 20 | " 21 | 22 | Här kan man va 23 | 24 | 25 |

Tjena världen!

26 |

Tänkte bara informera om att Sverige är bättre än Finland i ishockey.

27 | 28 | " 29 | ); 30 | let dom = Dom::parse(html)?; 31 | assert_json_snapshot!(dom); 32 | Ok(()) 33 | } 34 | -------------------------------------------------------------------------------- /tests/snapshots/comments__it_can_parse_document_with_just_comments.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: tests/comments.rs 3 | expression: ast 4 | --- 5 | { 6 | "treeType": "documentFragment", 7 | "children": [ 8 | "x", 9 | "y", 10 | "z" 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /tests/snapshots/comments__it_can_parse_document_with_just_one_comment.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: tests/comments.rs 3 | expression: ast 4 | --- 5 | { 6 | "treeType": "documentFragment", 7 | "children": [ 8 | "hello !\"#/()=" 9 | ] 10 | } 11 | -------------------------------------------------------------------------------- /tests/snapshots/document__it_can_parse_document_with_comments.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: tests/document.rs 3 | expression: dom 4 | --- 5 | { 6 | "treeType": "document", 7 | "children": [ 8 | "comment", 9 | "comment", 10 | "comment", 11 | "comment", 12 | { 13 | "name": "html", 14 | "variant": "normal", 15 | "children": [ 16 | "comment" 17 | ] 18 | }, 19 | "comment", 20 | "comment" 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- /tests/snapshots/document__it_can_parse_minimal_document.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: tests/document.rs 3 | expression: dom 4 | --- 5 | { 6 | "treeType": "document", 7 | "children": [ 8 | { 9 | "name": "html", 10 | "variant": "normal" 11 | } 12 | ] 13 | } 14 | -------------------------------------------------------------------------------- 
/tests/snapshots/document_empty__it_can_parse_empty_document.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: tests/document_empty.rs 3 | expression: dom 4 | --- 5 | { 6 | "treeType": "empty" 7 | } 8 | -------------------------------------------------------------------------------- /tests/snapshots/document_fragment__it_can_parse_single_div_as_fragment.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: tests/document_fragment.rs 3 | expression: dom 4 | --- 5 | { 6 | "treeType": "documentFragment", 7 | "children": [ 8 | { 9 | "name": "div", 10 | "variant": "void" 11 | } 12 | ] 13 | } 14 | -------------------------------------------------------------------------------- /tests/snapshots/document_fragment__it_can_parse_single_text_as_fragment.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: tests/document_fragment.rs 3 | expression: dom 4 | --- 5 | { 6 | "treeType": "documentFragment", 7 | "children": [ 8 | "hello" 9 | ] 10 | } 11 | -------------------------------------------------------------------------------- /tests/snapshots/document_fragment__it_can_parse_text_comment_element_as_fragment.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: tests/document_fragment.rs 3 | expression: dom 4 | --- 5 | { 6 | "treeType": "documentFragment", 7 | "children": [ 8 | "hello", 9 | "world?", 10 | { 11 | "name": "div", 12 | "variant": "void" 13 | } 14 | ] 15 | } 16 | -------------------------------------------------------------------------------- /tests/snapshots/element__it_can_clone_node.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: tests/element.rs 3 | expression: one 4 | --- 5 | { 6 | "name": "div", 7 | "variant": "normal", 8 | "children": [ 9 | "one" 10 | ] 11 | } 12 | 
-------------------------------------------------------------------------------- /tests/snapshots/element__it_can_deal_with_weird_whitespaces.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: tests/element.rs 3 | expression: dom 4 | --- 5 | { 6 | "treeType": "documentFragment", 7 | "children": [ 8 | "Normal case", 9 | { 10 | "name": "div", 11 | "variant": "normal", 12 | "children": [ 13 | " Text " 14 | ] 15 | }, 16 | "Whitespaces in opening tag to the left", 17 | { 18 | "name": "div", 19 | "variant": "normal", 20 | "children": [ 21 | " Text " 22 | ] 23 | }, 24 | "Whitespaces in opening tag to the right", 25 | { 26 | "name": "div", 27 | "variant": "normal", 28 | "children": [ 29 | " Text " 30 | ] 31 | }, 32 | "Whitespaces in closing tag to the left (should not work)", 33 | "
Text < /div>\n\n", 34 | "Whitespaces in closing tag to the right", 35 | { 36 | "name": "div", 37 | "variant": "normal", 38 | "children": [ 39 | " Text " 40 | ] 41 | }, 42 | "Whitespaces everywhere (should not work)", 43 | "< div > Text < / div >\n" 44 | ] 45 | } 46 | -------------------------------------------------------------------------------- /tests/snapshots/element__it_can_parse_broken_html.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: tests/element.rs 3 | expression: dom 4 | --- 5 | { 6 | "treeType": "documentFragment", 7 | "children": [ 8 | "
", 9 | { 10 | "name": "div", 11 | "variant": "normal" 12 | } 13 | ] 14 | } 15 | -------------------------------------------------------------------------------- /tests/snapshots/element__it_can_parse_deeply_nested.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: tests/element.rs 3 | expression: dom 4 | --- 5 | { 6 | "treeType": "documentFragment", 7 | "children": [ 8 | { 9 | "name": "div", 10 | "variant": "normal", 11 | "classes": [ 12 | "1" 13 | ], 14 | "children": [ 15 | { 16 | "name": "div", 17 | "variant": "normal", 18 | "classes": [ 19 | "1" 20 | ], 21 | "children": [ 22 | { 23 | "name": "div", 24 | "variant": "normal", 25 | "classes": [ 26 | "1" 27 | ], 28 | "children": [ 29 | { 30 | "name": "div", 31 | "variant": "normal", 32 | "classes": [ 33 | "1" 34 | ], 35 | "children": [ 36 | { 37 | "name": "div", 38 | "variant": "normal", 39 | "classes": [ 40 | "1" 41 | ], 42 | "children": [ 43 | { 44 | "name": "div", 45 | "variant": "normal", 46 | "classes": [ 47 | "1" 48 | ], 49 | "children": [ 50 | { 51 | "name": "div", 52 | "variant": "normal", 53 | "classes": [ 54 | "1" 55 | ], 56 | "children": [ 57 | { 58 | "name": "div", 59 | "variant": "normal", 60 | "classes": [ 61 | "1" 62 | ], 63 | "children": [ 64 | "this is deep", 65 | "hello world\n " 66 | ] 67 | } 68 | ] 69 | } 70 | ] 71 | } 72 | ] 73 | } 74 | ] 75 | } 76 | ] 77 | } 78 | ] 79 | } 80 | ] 81 | } 82 | ] 83 | } 84 | -------------------------------------------------------------------------------- /tests/snapshots/element__it_can_parse_multiple_elements.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: tests/element.rs 3 | expression: dom 4 | --- 5 | { 6 | "treeType": "documentFragment", 7 | "children": [ 8 | { 9 | "name": "div", 10 | "variant": "void" 11 | }, 12 | { 13 | "name": "div", 14 | "variant": "void" 15 | } 16 | ] 17 | } 18 | 
-------------------------------------------------------------------------------- /tests/snapshots/element__it_can_parse_multiple_open_elements.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: tests/element.rs 3 | expression: dom 4 | --- 5 | { 6 | "treeType": "documentFragment", 7 | "children": [ 8 | { 9 | "name": "div", 10 | "variant": "normal" 11 | }, 12 | { 13 | "name": "div", 14 | "variant": "normal" 15 | } 16 | ] 17 | } 18 | -------------------------------------------------------------------------------- /tests/snapshots/element__it_can_parse_nested_elements.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: tests/element.rs 3 | expression: dom 4 | --- 5 | { 6 | "treeType": "documentFragment", 7 | "children": [ 8 | { 9 | "name": "div", 10 | "variant": "normal", 11 | "children": [ 12 | { 13 | "name": "div", 14 | "variant": "void" 15 | } 16 | ] 17 | } 18 | ] 19 | } 20 | -------------------------------------------------------------------------------- /tests/snapshots/element__it_can_parse_nested_elements_mixed_children.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: tests/element.rs 3 | expression: dom 4 | --- 5 | { 6 | "treeType": "documentFragment", 7 | "children": [ 8 | { 9 | "name": "div", 10 | "variant": "normal", 11 | "children": [ 12 | "comment", 13 | { 14 | "name": "div", 15 | "variant": "void" 16 | }, 17 | "\n Hello\n ", 18 | { 19 | "name": "div", 20 | "variant": "normal", 21 | "children": [ 22 | "\n World\n " 23 | ] 24 | } 25 | ] 26 | } 27 | ] 28 | } 29 | -------------------------------------------------------------------------------- /tests/snapshots/element__it_can_parse_one_element.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: tests/element.rs 3 | expression: dom 4 | --- 5 | { 6 | "treeType": "document", 7 | "children": [ 
8 | { 9 | "name": "html", 10 | "variant": "normal" 11 | } 12 | ] 13 | } 14 | -------------------------------------------------------------------------------- /tests/snapshots/element__it_can_parse_one_element_mixed_case.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: tests/element.rs 3 | expression: dom 4 | --- 5 | { 6 | "treeType": "document", 7 | "children": [ 8 | { 9 | "name": "Html", 10 | "variant": "normal" 11 | } 12 | ] 13 | } 14 | -------------------------------------------------------------------------------- /tests/snapshots/element__it_can_parse_one_element_mixed_case_numbers.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: tests/element.rs 3 | expression: dom 4 | --- 5 | { 6 | "treeType": "documentFragment", 7 | "children": [ 8 | { 9 | "name": "Header1", 10 | "variant": "normal" 11 | } 12 | ] 13 | } 14 | -------------------------------------------------------------------------------- /tests/snapshots/element__it_can_parse_one_element_mixed_case_numbers_symbols.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: tests/element.rs 3 | expression: dom 4 | --- 5 | { 6 | "treeType": "documentFragment", 7 | "children": [ 8 | { 9 | "name": "Head_Er-1", 10 | "variant": "normal" 11 | } 12 | ] 13 | } 14 | -------------------------------------------------------------------------------- /tests/snapshots/element__it_can_parse_one_element_upper_case.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: tests/element.rs 3 | expression: dom 4 | --- 5 | { 6 | "treeType": "document", 7 | "children": [ 8 | { 9 | "name": "HTML", 10 | "variant": "normal" 11 | } 12 | ] 13 | } 14 | -------------------------------------------------------------------------------- /tests/snapshots/element__it_can_parse_script_with_content.snap: 
-------------------------------------------------------------------------------- 1 | --- 2 | source: tests/element.rs 3 | expression: dom 4 | --- 5 | { 6 | "treeType": "documentFragment", 7 | "children": [ 8 | { 9 | "name": "script", 10 | "variant": "normal", 11 | "children": [ 12 | "const person_creator = ({ name, symtoms }) => {\n let person = {}\n person.name = name\n person.symtoms = {}\n for (symtom of symtoms) {\n person.symtoms[symtom] = true\n }\n return person\n }\n \n const main = () => {\n let name = 'mathias'\n let symtoms = ['Dunning-Kruger', 'ACDC', 'Slacker']\n \n setTimeout(() => {\n let person = person_creator({ name, symtoms })\n if (person.symtoms.hasOwnProperty('Dunning-Kruger')) {\n console.log('yeah buddy, that\\'s right')\n }\n }, 1337)\n }\n \n main()" 13 | ] 14 | } 15 | ] 16 | } 17 | -------------------------------------------------------------------------------- /tests/snapshots/element__it_can_parse_style_with_content.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: tests/element.rs 3 | expression: dom 4 | --- 5 | { 6 | "treeType": "documentFragment", 7 | "children": [ 8 | { 9 | "name": "style", 10 | "variant": "normal", 11 | "children": [ 12 | ":root {\n --background-color: black;\n --text-color: white;\n }\n body {\n background: var(--background-color);\n color: var(--text-color);\n }" 13 | ] 14 | } 15 | ] 16 | } 17 | -------------------------------------------------------------------------------- /tests/snapshots/element__it_errors_when_multiple_nested_elements_dont_match.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: tests/element.rs 3 | expression: dom 4 | --- 5 | { 6 | "treeType": "documentFragment", 7 | "children": [ 8 | "
", 9 | { 10 | "name": "div", 11 | "variant": "normal", 12 | "children": [ 13 | { 14 | "name": "div", 15 | "variant": "normal", 16 | "children": [ 17 | { 18 | "name": "div", 19 | "variant": "normal" 20 | } 21 | ] 22 | } 23 | ] 24 | } 25 | ] 26 | } 27 | -------------------------------------------------------------------------------- /tests/snapshots/element__it_skips_dangling_elements.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: tests/element.rs 3 | expression: dom 4 | --- 5 | { 6 | "treeType": "documentFragment", 7 | "children": [ 8 | { 9 | "id": "123", 10 | "name": "div", 11 | "variant": "normal" 12 | }, 13 | { 14 | "id": "321", 15 | "name": "div", 16 | "variant": "normal" 17 | } 18 | ] 19 | } 20 | -------------------------------------------------------------------------------- /tests/snapshots/element_attributes__it_can_parse_attribute_key_mixed_case_symbols.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: tests/element_attributes.rs 3 | expression: dom 4 | --- 5 | { 6 | "treeType": "documentFragment", 7 | "children": [ 8 | { 9 | "name": "div", 10 | "variant": "normal", 11 | "attributes": { 12 | "data-cat": "morris" 13 | } 14 | } 15 | ] 16 | } 17 | -------------------------------------------------------------------------------- /tests/snapshots/element_attributes__it_can_parse_attribute_multiple_values_double_quote.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: tests/element_attributes.rs 3 | expression: dom 4 | --- 5 | { 6 | "treeType": "documentFragment", 7 | "children": [ 8 | { 9 | "name": "div", 10 | "variant": "void", 11 | "attributes": { 12 | "cat": "mjau mjau" 13 | } 14 | } 15 | ] 16 | } 17 | -------------------------------------------------------------------------------- /tests/snapshots/element_attributes__it_can_parse_attribute_multiple_values_single_quote.snap: 
-------------------------------------------------------------------------------- 1 | --- 2 | source: tests/element_attributes.rs 3 | expression: dom 4 | --- 5 | { 6 | "treeType": "documentFragment", 7 | "children": [ 8 | { 9 | "name": "div", 10 | "variant": "void", 11 | "attributes": { 12 | "cat": "mjau mjau" 13 | } 14 | } 15 | ] 16 | } 17 | -------------------------------------------------------------------------------- /tests/snapshots/element_attributes__it_can_parse_attribute_with_empty_value.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: tests/element_attributes.rs 3 | expression: dom 4 | --- 5 | { 6 | "treeType": "documentFragment", 7 | "children": [ 8 | { 9 | "name": "img", 10 | "variant": "void", 11 | "attributes": { 12 | "hidden": null 13 | } 14 | } 15 | ] 16 | } 17 | -------------------------------------------------------------------------------- /tests/snapshots/element_attributes__it_can_parse_classes.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: tests/element_attributes.rs 3 | expression: dom 4 | --- 5 | { 6 | "treeType": "documentFragment", 7 | "children": [ 8 | { 9 | "name": "img", 10 | "variant": "void", 11 | "classes": [ 12 | "a", 13 | "b", 14 | "c" 15 | ] 16 | } 17 | ] 18 | } 19 | -------------------------------------------------------------------------------- /tests/snapshots/element_attributes__it_can_parse_double_quote.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: tests/element_attributes.rs 3 | expression: dom 4 | --- 5 | { 6 | "treeType": "documentFragment", 7 | "children": [ 8 | { 9 | "id": "one", 10 | "name": "div", 11 | "variant": "normal" 12 | } 13 | ] 14 | } 15 | -------------------------------------------------------------------------------- /tests/snapshots/element_attributes__it_can_parse_id.snap: 
-------------------------------------------------------------------------------- 1 | --- 2 | source: tests/element_attributes.rs 3 | expression: dom 4 | --- 5 | { 6 | "treeType": "documentFragment", 7 | "children": [ 8 | { 9 | "id": "a", 10 | "name": "img", 11 | "variant": "void" 12 | } 13 | ] 14 | } 15 | -------------------------------------------------------------------------------- /tests/snapshots/element_attributes__it_can_parse_multiple_attributes_double_quote.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: tests/element_attributes.rs 3 | expression: dom 4 | --- 5 | { 6 | "treeType": "documentFragment", 7 | "children": [ 8 | { 9 | "name": "div", 10 | "variant": "normal", 11 | "attributes": { 12 | "ape": "oh", 13 | "cat": "mjau", 14 | "dog": "woff" 15 | } 16 | } 17 | ] 18 | } 19 | -------------------------------------------------------------------------------- /tests/snapshots/element_attributes__it_can_parse_multiple_attributes_no_quote.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: tests/element_attributes.rs 3 | expression: dom 4 | --- 5 | { 6 | "treeType": "documentFragment", 7 | "children": [ 8 | { 9 | "name": "div", 10 | "variant": "normal", 11 | "attributes": { 12 | "ape": "oh", 13 | "cat": "mjau", 14 | "dog": "woff" 15 | } 16 | } 17 | ] 18 | } 19 | -------------------------------------------------------------------------------- /tests/snapshots/element_attributes__it_can_parse_multiple_attributes_single_quote.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: tests/element_attributes.rs 3 | expression: dom 4 | --- 5 | { 6 | "treeType": "documentFragment", 7 | "children": [ 8 | { 9 | "name": "div", 10 | "variant": "normal", 11 | "attributes": { 12 | "ape": "oh", 13 | "cat": "mjau", 14 | "dog": "woff" 15 | } 16 | } 17 | ] 18 | } 19 | 
-------------------------------------------------------------------------------- /tests/snapshots/element_attributes__it_can_parse_multiple_attributes_where_whitespace_does_not_matter_for_keys.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: tests/element_attributes.rs 3 | expression: dom 4 | --- 5 | { 6 | "treeType": "documentFragment", 7 | "children": [ 8 | { 9 | "name": "div", 10 | "variant": "normal", 11 | "attributes": { 12 | "ape": "oh", 13 | "cat": "mjau", 14 | "dog": " woff " 15 | } 16 | } 17 | ] 18 | } 19 | -------------------------------------------------------------------------------- /tests/snapshots/element_attributes__it_can_parse_no_quote.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: tests/element_attributes.rs 3 | expression: dom 4 | --- 5 | { 6 | "treeType": "documentFragment", 7 | "children": [ 8 | { 9 | "id": "one", 10 | "name": "div", 11 | "variant": "normal" 12 | } 13 | ] 14 | } 15 | -------------------------------------------------------------------------------- /tests/snapshots/element_attributes__it_can_parse_single_quote.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: tests/element_attributes.rs 3 | expression: dom 4 | --- 5 | { 6 | "treeType": "documentFragment", 7 | "children": [ 8 | { 9 | "id": "one", 10 | "name": "div", 11 | "variant": "normal" 12 | } 13 | ] 14 | } 15 | -------------------------------------------------------------------------------- /tests/snapshots/element_attributes__it_keeps_spaces_for_non_classes.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: tests/element_attributes.rs 3 | expression: dom 4 | --- 5 | { 6 | "treeType": "documentFragment", 7 | "children": [ 8 | { 9 | "name": "img", 10 | "variant": "void", 11 | "attributes": { 12 | "attr": " a b \n\t" 13 | } 14 | } 15 | ] 16 | } 17 | 
-------------------------------------------------------------------------------- /tests/snapshots/output__it_can_output_complex_html_as_json.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: tests/output.rs 3 | expression: dom 4 | --- 5 | { 6 | "treeType": "document", 7 | "children": [ 8 | { 9 | "name": "html", 10 | "variant": "normal", 11 | "attributes": { 12 | "lang": "sv" 13 | }, 14 | "children": [ 15 | { 16 | "name": "head", 17 | "variant": "normal", 18 | "children": [ 19 | { 20 | "name": "title", 21 | "variant": "normal", 22 | "children": [ 23 | "Här kan man va" 24 | ] 25 | } 26 | ] 27 | }, 28 | { 29 | "name": "body", 30 | "variant": "normal", 31 | "children": [ 32 | { 33 | "name": "h1", 34 | "variant": "normal", 35 | "children": [ 36 | "Tjena världen!" 37 | ] 38 | }, 39 | { 40 | "name": "p", 41 | "variant": "normal", 42 | "children": [ 43 | "Tänkte bara informera om att Sverige är bättre än Finland i ishockey." 44 | ] 45 | } 46 | ] 47 | } 48 | ] 49 | } 50 | ] 51 | } 52 | -------------------------------------------------------------------------------- /tests/snapshots/source_span__it_can_generate_source_span.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: tests/source_span.rs 3 | expression: dom 4 | --- 5 | Dom { 6 | tree_type: DocumentFragment, 7 | children: [ 8 | Element( 9 | Element { 10 | id: None, 11 | name: "template", 12 | variant: Normal, 13 | attributes: {}, 14 | classes: [], 15 | children: [ 16 | Element( 17 | Element { 18 | id: None, 19 | name: "h1", 20 | variant: Normal, 21 | attributes: {}, 22 | classes: [], 23 | children: [ 24 | Text( 25 | "Header", 26 | ), 27 | ], 28 | source_span: SourceSpan { 29 | text: "

Header

", 30 | start_line: 2, 31 | end_line: 2, 32 | start_column: 5, 33 | end_column: 20, 34 | }, 35 | }, 36 | ), 37 | Element( 38 | Element { 39 | id: None, 40 | name: "p", 41 | variant: Normal, 42 | attributes: {}, 43 | classes: [], 44 | children: [ 45 | Text( 46 | "Paragraph", 47 | ), 48 | ], 49 | source_span: SourceSpan { 50 | text: "

Paragraph

", 51 | start_line: 3, 52 | end_line: 3, 53 | start_column: 5, 54 | end_column: 21, 55 | }, 56 | }, 57 | ), 58 | ], 59 | source_span: SourceSpan { 60 | text: "", 61 | start_line: 1, 62 | end_line: 4, 63 | start_column: 1, 64 | end_column: 12, 65 | }, 66 | }, 67 | ), 68 | ], 69 | errors: [], 70 | } 71 | -------------------------------------------------------------------------------- /tests/snapshots/svg__it_can_parse_svg.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: tests/svg.rs 3 | expression: dom 4 | --- 5 | { 6 | "treeType": "documentFragment", 7 | "children": [ 8 | { 9 | "name": "svg", 10 | "variant": "normal", 11 | "attributes": { 12 | "xmlns": "http://www.w3.org/2000/svg", 13 | "xmlns:xlink": "http://www.w3.org/1999/xlink" 14 | }, 15 | "children": [ 16 | { 17 | "name": "rect", 18 | "variant": "void", 19 | "attributes": { 20 | "height": "100", 21 | "style": "stroke:#ff0000; fill: #0000ff", 22 | "width": "100", 23 | "x": "10", 24 | "y": "10" 25 | } 26 | } 27 | ] 28 | } 29 | ] 30 | } 31 | -------------------------------------------------------------------------------- /tests/snapshots/text__it_can_parse_document_with_just_text.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: tests/text.rs 3 | expression: dom 4 | --- 5 | { 6 | "treeType": "documentFragment", 7 | "children": [ 8 | "hello world" 9 | ] 10 | } 11 | -------------------------------------------------------------------------------- /tests/snapshots/text__it_can_parse_document_with_multiple_text_elements.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: tests/text.rs 3 | expression: dom 4 | --- 5 | { 6 | "treeType": "documentFragment", 7 | "children": [ 8 | "hello world\nhere's another line for you!\n", 9 | { 10 | "name": "div", 11 | "variant": "void" 12 | }, 13 | "\nThe end\n" 14 | ] 15 | } 16 | 
-------------------------------------------------------------------------------- /tests/snapshots/text__it_can_parse_document_with_text_and_line_breaks.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: tests/text.rs 3 | expression: dom 4 | --- 5 | { 6 | "treeType": "documentFragment", 7 | "children": [ 8 | "hello world\nhere's another line for you!\nThe end\n" 9 | ] 10 | } 11 | -------------------------------------------------------------------------------- /tests/snapshots/text__it_can_parse_text_in_paragraph_with_weird_formatting.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: tests/text.rs 3 | expression: dom 4 | --- 5 | { 6 | "treeType": "documentFragment", 7 | "children": [ 8 | { 9 | "name": "p", 10 | "variant": "normal", 11 | "children": [ 12 | "\n This is a ", 13 | { 14 | "name": "b", 15 | "variant": "normal", 16 | "children": [ 17 | "para" 18 | ] 19 | }, 20 | "gra", 21 | { 22 | "name": "b", 23 | "variant": "normal", 24 | "children": [ 25 | "ph" 26 | ] 27 | }, 28 | " with some", 29 | { 30 | "name": "i", 31 | "variant": "normal", 32 | "children": [ 33 | " weird " 34 | ] 35 | }, 36 | " formatting.\n" 37 | ] 38 | } 39 | ] 40 | } 41 | -------------------------------------------------------------------------------- /tests/snapshots/text__it_can_parse_text_with_chevron.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: tests/text.rs 3 | expression: dom 4 | --- 5 | { 6 | "treeType": "documentFragment", 7 | "children": [ 8 | "hello <> world" 9 | ] 10 | } 11 | -------------------------------------------------------------------------------- /tests/source_span.rs: -------------------------------------------------------------------------------- 1 | use html_parser::{Dom, Result}; 2 | use indoc::indoc; 3 | use insta::assert_debug_snapshot; 4 | 5 | #[test] 6 | fn it_can_generate_source_span() -> 
Result<()> { 7 | let html = indoc! {" 8 | 12 | "}; 13 | let dom = Dom::parse(html)?; 14 | assert_debug_snapshot!(dom); 15 | Ok(()) 16 | } 17 | -------------------------------------------------------------------------------- /tests/svg.rs: -------------------------------------------------------------------------------- 1 | use html_parser::{Dom, Result}; 2 | use indoc::indoc; 3 | use insta::assert_json_snapshot; 4 | 5 | #[test] 6 | fn it_can_parse_svg() -> Result<()> { 7 | let html = indoc!( 8 | r#" 9 | 10 | 11 | 12 | "# 13 | ); 14 | let dom = Dom::parse(html)?; 15 | assert_json_snapshot!(dom); 16 | Ok(()) 17 | } 18 | 19 | #[test] 20 | fn it_can_parse_complex_svg() { 21 | let svg = indoc!( 22 | r#" 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 39 | 40 | Sorry, your browser does not support inline SVG. 41 | 42 | "# 43 | ); 44 | assert!(Dom::parse(&svg).is_ok()); 45 | } 46 | -------------------------------------------------------------------------------- /tests/text.rs: -------------------------------------------------------------------------------- 1 | use html_parser::{Dom, Result}; 2 | use indoc::indoc; 3 | use insta::assert_json_snapshot; 4 | 5 | #[test] 6 | fn it_can_parse_document_with_just_text() -> Result<()> { 7 | let html = "hello world"; 8 | let dom = Dom::parse(html)?; 9 | assert_json_snapshot!(dom); 10 | Ok(()) 11 | } 12 | 13 | #[test] 14 | fn it_can_parse_document_with_text_and_line_breaks() -> Result<()> { 15 | let html = indoc!( 16 | r" 17 | hello world 18 | here's another line for you! 19 | The end 20 | " 21 | ); 22 | let dom = Dom::parse(html)?; 23 | assert_json_snapshot!(dom); 24 | Ok(()) 25 | } 26 | 27 | #[test] 28 | fn it_can_parse_document_with_multiple_text_elements() -> Result<()> { 29 | let html = indoc!( 30 | r" 31 | hello world 32 | here's another line for you! 33 |
34 | The end 35 | " 36 | ); 37 | let dom = Dom::parse(html)?; 38 | assert_json_snapshot!(dom); 39 | Ok(()) 40 | } 41 | 42 | #[test] 43 | fn it_can_parse_text_with_chevron() -> Result<()> { 44 | let html = indoc!(r"hello <> world"); 45 | let dom = Dom::parse(html)?; 46 | assert_json_snapshot!(dom); 47 | Ok(()) 48 | } 49 | 50 | #[test] 51 | fn it_can_parse_text_in_paragraph_with_weird_formatting() -> Result<()> { 52 | let html = indoc!(r" 53 |

54 | This is a paragraph with some weird formatting. 55 |

56 | "); 57 | let dom = Dom::parse(html)?; 58 | assert_json_snapshot!(dom); 59 | Ok(()) 60 | } 61 | -------------------------------------------------------------------------------- /tests/websites.rs: -------------------------------------------------------------------------------- 1 | use html_parser::Dom; 2 | use indoc::indoc; 3 | 4 | #[test] 5 | fn it_can_parse_simple() { 6 | let html = indoc!( 7 | r#" 8 | 9 | 10 | 11 | 12 | 13 | Document 14 | 23 | 24 | 25 |

Hello world

26 | 27 | 31 | 32 | 33 | "# 34 | ); 35 | assert!(Dom::parse(html).is_ok()); 36 | } 37 | 38 | #[test] 39 | fn it_can_parse_spotify() { 40 | let resp = reqwest::blocking::get("https://www.spotify.com/se") 41 | .unwrap() 42 | .text() 43 | .unwrap(); 44 | assert!(Dom::parse(&resp).is_ok()); 45 | } 46 | 47 | #[ignore] 48 | #[test] 49 | fn it_can_parse_facebook() { 50 | let resp = reqwest::blocking::get("https://www.facebook.com/") 51 | .unwrap() 52 | .text() 53 | .unwrap(); 54 | assert!(Dom::parse(&resp).is_ok()); 55 | } 56 | 57 | #[ignore] 58 | #[test] 59 | fn it_can_parse_amazon() { 60 | let resp = reqwest::blocking::get("https://www.amazon.com/") 61 | .unwrap() 62 | .text() 63 | .unwrap(); 64 | assert!(Dom::parse(&resp).is_ok()); 65 | } 66 | 67 | #[ignore] 68 | #[test] 69 | fn it_can_parse_apple() { 70 | let resp = reqwest::blocking::get("https://www.apple.com/") 71 | .unwrap() 72 | .text() 73 | .unwrap(); 74 | assert!(Dom::parse(&resp).is_ok()); 75 | } 76 | 77 | #[ignore] 78 | #[test] 79 | fn it_can_parse_nytimes() { 80 | let resp = reqwest::blocking::get("https://www.nytimes.com/") 81 | .unwrap() 82 | .text() 83 | .unwrap(); 84 | assert!(Dom::parse(&resp).is_ok()); 85 | } 86 | 87 | #[ignore] 88 | #[test] 89 | fn it_can_parse_wikipedia() { 90 | let resp = reqwest::blocking::get("https://en.wikipedia.org/wiki/Main_Page") 91 | .unwrap() 92 | .text() 93 | .unwrap(); 94 | assert!(Dom::parse(&resp).is_ok()); 95 | } 96 | --------------------------------------------------------------------------------