├── .editorconfig ├── .github ├── dependabot.yml └── workflows │ └── test.yml ├── .gitignore ├── Cargo.lock ├── Cargo.toml ├── LICENSE ├── README.md └── scraper ├── Cargo.lock ├── Cargo.toml ├── LICENSE ├── README.md ├── examples ├── document.rs └── fragment.rs ├── scraper.1 └── src ├── element_ref ├── element.rs ├── mod.rs └── serializable.rs ├── error.rs ├── error └── utils.rs ├── html ├── mod.rs ├── serializable.rs └── tree_sink.rs ├── lib.rs ├── main.rs ├── node.rs ├── node └── serializable.rs ├── selectable.rs ├── selector.rs └── test.rs /.editorconfig: -------------------------------------------------------------------------------- 1 | # EditorConfig 2 | root = true 3 | 4 | [*] 5 | end_of_line = lf 6 | charset = utf-8 7 | trim_trailing_whitespace = true 8 | insert_final_newline = true 9 | indent_style = space 10 | indent_size = 4 11 | 12 | [*.md] 13 | trim_trailing_whitespace = false 14 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 
3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "cargo" 9 | directory: "/" 10 | schedule: 11 | interval: "weekly" 12 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: 7 | - "master" 8 | 9 | jobs: 10 | format: 11 | name: Format code 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v4 15 | - uses: dtolnay/rust-toolchain@stable 16 | with: 17 | components: rustfmt 18 | - run: cargo fmt -- --check 19 | 20 | clippy: 21 | name: Clippy check 22 | runs-on: ubuntu-latest 23 | steps: 24 | - uses: actions/checkout@v4 25 | - uses: dtolnay/rust-toolchain@stable 26 | with: 27 | components: clippy 28 | - uses: Swatinem/rust-cache@v2 29 | - run: cargo clippy --all-targets --all-features -- --deny warnings 30 | 31 | test: 32 | name: Test code 33 | runs-on: ubuntu-latest 34 | strategy: 35 | matrix: 36 | rust_version: [stable, beta, nightly] 37 | fail-fast: false 38 | steps: 39 | - uses: actions/checkout@v3 40 | - uses: dtolnay/rust-toolchain@master 41 | with: 42 | toolchain: ${{matrix.rust_version}} 43 | - uses: Swatinem/rust-cache@v2 44 | with: 45 | key: ${{matrix.rust_version}} 46 | - run: cargo update 47 | - run: cargo test --all-features 48 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | *.bk 3 | .idea 4 | -------------------------------------------------------------------------------- /Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by 
Cargo. 2 | # It is not intended for manual editing. 3 | version = 4 4 | 5 | [[package]] 6 | name = "autocfg" 7 | version = "1.4.0" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" 10 | 11 | [[package]] 12 | name = "bitflags" 13 | version = "2.9.0" 14 | source = "registry+https://github.com/rust-lang/crates.io-index" 15 | checksum = "5c8214115b7bf84099f1309324e63141d4c5d7cc26862f97a0a857dbefe165bd" 16 | 17 | [[package]] 18 | name = "byteorder" 19 | version = "1.5.0" 20 | source = "registry+https://github.com/rust-lang/crates.io-index" 21 | checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" 22 | 23 | [[package]] 24 | name = "cfg-if" 25 | version = "1.0.0" 26 | source = "registry+https://github.com/rust-lang/crates.io-index" 27 | checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" 28 | 29 | [[package]] 30 | name = "cssparser" 31 | version = "0.35.0" 32 | source = "registry+https://github.com/rust-lang/crates.io-index" 33 | checksum = "4e901edd733a1472f944a45116df3f846f54d37e67e68640ac8bb69689aca2aa" 34 | dependencies = [ 35 | "cssparser-macros", 36 | "dtoa-short", 37 | "itoa", 38 | "phf", 39 | "smallvec", 40 | ] 41 | 42 | [[package]] 43 | name = "cssparser-macros" 44 | version = "0.6.1" 45 | source = "registry+https://github.com/rust-lang/crates.io-index" 46 | checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331" 47 | dependencies = [ 48 | "quote", 49 | "syn", 50 | ] 51 | 52 | [[package]] 53 | name = "derive_more" 54 | version = "0.99.19" 55 | source = "registry+https://github.com/rust-lang/crates.io-index" 56 | checksum = "3da29a38df43d6f156149c9b43ded5e018ddff2a855cf2cfd62e8cd7d079c69f" 57 | dependencies = [ 58 | "proc-macro2", 59 | "quote", 60 | "syn", 61 | ] 62 | 63 | [[package]] 64 | name = "dtoa" 65 | version = "1.0.10" 66 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 67 | checksum = "d6add3b8cff394282be81f3fc1a0605db594ed69890078ca6e2cab1c408bcf04" 68 | 69 | [[package]] 70 | name = "dtoa-short" 71 | version = "0.3.5" 72 | source = "registry+https://github.com/rust-lang/crates.io-index" 73 | checksum = "cd1511a7b6a56299bd043a9c167a6d2bfb37bf84a6dfceaba651168adfb43c87" 74 | dependencies = [ 75 | "dtoa", 76 | ] 77 | 78 | [[package]] 79 | name = "ego-tree" 80 | version = "0.10.0" 81 | source = "registry+https://github.com/rust-lang/crates.io-index" 82 | checksum = "b2972feb8dffe7bc8c5463b1dacda1b0dfbed3710e50f977d965429692d74cd8" 83 | 84 | [[package]] 85 | name = "equivalent" 86 | version = "1.0.2" 87 | source = "registry+https://github.com/rust-lang/crates.io-index" 88 | checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" 89 | 90 | [[package]] 91 | name = "futf" 92 | version = "0.1.5" 93 | source = "registry+https://github.com/rust-lang/crates.io-index" 94 | checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843" 95 | dependencies = [ 96 | "mac", 97 | "new_debug_unreachable", 98 | ] 99 | 100 | [[package]] 101 | name = "fxhash" 102 | version = "0.2.1" 103 | source = "registry+https://github.com/rust-lang/crates.io-index" 104 | checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" 105 | dependencies = [ 106 | "byteorder", 107 | ] 108 | 109 | [[package]] 110 | name = "getopts" 111 | version = "0.2.21" 112 | source = "registry+https://github.com/rust-lang/crates.io-index" 113 | checksum = "14dbbfd5c71d70241ecf9e6f13737f7b5ce823821063188d7e46c41d371eebd5" 114 | dependencies = [ 115 | "unicode-width", 116 | ] 117 | 118 | [[package]] 119 | name = "hashbrown" 120 | version = "0.15.2" 121 | source = "registry+https://github.com/rust-lang/crates.io-index" 122 | checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289" 123 | 124 | [[package]] 125 | name = "html5ever" 126 | version = "0.31.0" 127 
| source = "registry+https://github.com/rust-lang/crates.io-index" 128 | checksum = "953cbbe631aae7fc0a112702ad5d3aaf09da38beaf45ea84610d6e1c358f569c" 129 | dependencies = [ 130 | "log", 131 | "mac", 132 | "markup5ever", 133 | "match_token", 134 | ] 135 | 136 | [[package]] 137 | name = "indexmap" 138 | version = "2.9.0" 139 | source = "registry+https://github.com/rust-lang/crates.io-index" 140 | checksum = "cea70ddb795996207ad57735b50c5982d8844f38ba9ee5f1aedcfb708a2aa11e" 141 | dependencies = [ 142 | "equivalent", 143 | "hashbrown", 144 | ] 145 | 146 | [[package]] 147 | name = "itoa" 148 | version = "1.0.15" 149 | source = "registry+https://github.com/rust-lang/crates.io-index" 150 | checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" 151 | 152 | [[package]] 153 | name = "libc" 154 | version = "0.2.171" 155 | source = "registry+https://github.com/rust-lang/crates.io-index" 156 | checksum = "c19937216e9d3aa9956d9bb8dfc0b0c8beb6058fc4f7a4dc4d850edf86a237d6" 157 | 158 | [[package]] 159 | name = "lock_api" 160 | version = "0.4.12" 161 | source = "registry+https://github.com/rust-lang/crates.io-index" 162 | checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17" 163 | dependencies = [ 164 | "autocfg", 165 | "scopeguard", 166 | ] 167 | 168 | [[package]] 169 | name = "log" 170 | version = "0.4.27" 171 | source = "registry+https://github.com/rust-lang/crates.io-index" 172 | checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" 173 | 174 | [[package]] 175 | name = "mac" 176 | version = "0.1.1" 177 | source = "registry+https://github.com/rust-lang/crates.io-index" 178 | checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" 179 | 180 | [[package]] 181 | name = "markup5ever" 182 | version = "0.16.0" 183 | source = "registry+https://github.com/rust-lang/crates.io-index" 184 | checksum = "0ba2225413ed418d540a2c8247d794f4b0527a021da36f69c05344d716dc44c1" 185 | dependencies = [ 186 | 
"log", 187 | "phf", 188 | "phf_codegen", 189 | "string_cache", 190 | "string_cache_codegen", 191 | "tendril", 192 | ] 193 | 194 | [[package]] 195 | name = "match_token" 196 | version = "0.1.0" 197 | source = "registry+https://github.com/rust-lang/crates.io-index" 198 | checksum = "88a9689d8d44bf9964484516275f5cd4c9b59457a6940c1d5d0ecbb94510a36b" 199 | dependencies = [ 200 | "proc-macro2", 201 | "quote", 202 | "syn", 203 | ] 204 | 205 | [[package]] 206 | name = "new_debug_unreachable" 207 | version = "1.0.6" 208 | source = "registry+https://github.com/rust-lang/crates.io-index" 209 | checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086" 210 | 211 | [[package]] 212 | name = "parking_lot" 213 | version = "0.12.3" 214 | source = "registry+https://github.com/rust-lang/crates.io-index" 215 | checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27" 216 | dependencies = [ 217 | "lock_api", 218 | "parking_lot_core", 219 | ] 220 | 221 | [[package]] 222 | name = "parking_lot_core" 223 | version = "0.9.10" 224 | source = "registry+https://github.com/rust-lang/crates.io-index" 225 | checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" 226 | dependencies = [ 227 | "cfg-if", 228 | "libc", 229 | "redox_syscall", 230 | "smallvec", 231 | "windows-targets", 232 | ] 233 | 234 | [[package]] 235 | name = "phf" 236 | version = "0.11.3" 237 | source = "registry+https://github.com/rust-lang/crates.io-index" 238 | checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078" 239 | dependencies = [ 240 | "phf_macros", 241 | "phf_shared", 242 | ] 243 | 244 | [[package]] 245 | name = "phf_codegen" 246 | version = "0.11.3" 247 | source = "registry+https://github.com/rust-lang/crates.io-index" 248 | checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a" 249 | dependencies = [ 250 | "phf_generator", 251 | "phf_shared", 252 | ] 253 | 254 | [[package]] 255 | name = "phf_generator" 256 | 
version = "0.11.3" 257 | source = "registry+https://github.com/rust-lang/crates.io-index" 258 | checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d" 259 | dependencies = [ 260 | "phf_shared", 261 | "rand", 262 | ] 263 | 264 | [[package]] 265 | name = "phf_macros" 266 | version = "0.11.3" 267 | source = "registry+https://github.com/rust-lang/crates.io-index" 268 | checksum = "f84ac04429c13a7ff43785d75ad27569f2951ce0ffd30a3321230db2fc727216" 269 | dependencies = [ 270 | "phf_generator", 271 | "phf_shared", 272 | "proc-macro2", 273 | "quote", 274 | "syn", 275 | ] 276 | 277 | [[package]] 278 | name = "phf_shared" 279 | version = "0.11.3" 280 | source = "registry+https://github.com/rust-lang/crates.io-index" 281 | checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5" 282 | dependencies = [ 283 | "siphasher", 284 | ] 285 | 286 | [[package]] 287 | name = "precomputed-hash" 288 | version = "0.1.1" 289 | source = "registry+https://github.com/rust-lang/crates.io-index" 290 | checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" 291 | 292 | [[package]] 293 | name = "proc-macro2" 294 | version = "1.0.94" 295 | source = "registry+https://github.com/rust-lang/crates.io-index" 296 | checksum = "a31971752e70b8b2686d7e46ec17fb38dad4051d94024c88df49b667caea9c84" 297 | dependencies = [ 298 | "unicode-ident", 299 | ] 300 | 301 | [[package]] 302 | name = "quote" 303 | version = "1.0.40" 304 | source = "registry+https://github.com/rust-lang/crates.io-index" 305 | checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" 306 | dependencies = [ 307 | "proc-macro2", 308 | ] 309 | 310 | [[package]] 311 | name = "rand" 312 | version = "0.8.5" 313 | source = "registry+https://github.com/rust-lang/crates.io-index" 314 | checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" 315 | dependencies = [ 316 | "rand_core", 317 | ] 318 | 319 | [[package]] 320 | name = "rand_core" 321 | 
version = "0.6.4" 322 | source = "registry+https://github.com/rust-lang/crates.io-index" 323 | checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" 324 | 325 | [[package]] 326 | name = "redox_syscall" 327 | version = "0.5.11" 328 | source = "registry+https://github.com/rust-lang/crates.io-index" 329 | checksum = "d2f103c6d277498fbceb16e84d317e2a400f160f46904d5f5410848c829511a3" 330 | dependencies = [ 331 | "bitflags", 332 | ] 333 | 334 | [[package]] 335 | name = "scopeguard" 336 | version = "1.2.0" 337 | source = "registry+https://github.com/rust-lang/crates.io-index" 338 | checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" 339 | 340 | [[package]] 341 | name = "scraper" 342 | version = "0.23.1" 343 | dependencies = [ 344 | "cssparser", 345 | "ego-tree", 346 | "getopts", 347 | "html5ever", 348 | "indexmap", 349 | "precomputed-hash", 350 | "selectors", 351 | "serde", 352 | "tendril", 353 | ] 354 | 355 | [[package]] 356 | name = "selectors" 357 | version = "0.28.0" 358 | source = "registry+https://github.com/rust-lang/crates.io-index" 359 | checksum = "db3079aef7a4383aff1e60eca2818995d3de8168e85ae4b6ea8fb2804b182c54" 360 | dependencies = [ 361 | "bitflags", 362 | "cssparser", 363 | "derive_more", 364 | "fxhash", 365 | "log", 366 | "new_debug_unreachable", 367 | "phf", 368 | "phf_codegen", 369 | "precomputed-hash", 370 | "servo_arc", 371 | "smallvec", 372 | ] 373 | 374 | [[package]] 375 | name = "serde" 376 | version = "1.0.219" 377 | source = "registry+https://github.com/rust-lang/crates.io-index" 378 | checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" 379 | dependencies = [ 380 | "serde_derive", 381 | ] 382 | 383 | [[package]] 384 | name = "serde_derive" 385 | version = "1.0.219" 386 | source = "registry+https://github.com/rust-lang/crates.io-index" 387 | checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" 388 | dependencies = [ 389 | "proc-macro2", 390 | 
"quote", 391 | "syn", 392 | ] 393 | 394 | [[package]] 395 | name = "servo_arc" 396 | version = "0.4.0" 397 | source = "registry+https://github.com/rust-lang/crates.io-index" 398 | checksum = "ae65c4249478a2647db249fb43e23cec56a2c8974a427e7bd8cb5a1d0964921a" 399 | dependencies = [ 400 | "stable_deref_trait", 401 | ] 402 | 403 | [[package]] 404 | name = "siphasher" 405 | version = "1.0.1" 406 | source = "registry+https://github.com/rust-lang/crates.io-index" 407 | checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d" 408 | 409 | [[package]] 410 | name = "smallvec" 411 | version = "1.15.0" 412 | source = "registry+https://github.com/rust-lang/crates.io-index" 413 | checksum = "8917285742e9f3e1683f0a9c4e6b57960b7314d0b08d30d1ecd426713ee2eee9" 414 | 415 | [[package]] 416 | name = "stable_deref_trait" 417 | version = "1.2.0" 418 | source = "registry+https://github.com/rust-lang/crates.io-index" 419 | checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" 420 | 421 | [[package]] 422 | name = "string_cache" 423 | version = "0.8.9" 424 | source = "registry+https://github.com/rust-lang/crates.io-index" 425 | checksum = "bf776ba3fa74f83bf4b63c3dcbbf82173db2632ed8452cb2d891d33f459de70f" 426 | dependencies = [ 427 | "new_debug_unreachable", 428 | "parking_lot", 429 | "phf_shared", 430 | "precomputed-hash", 431 | "serde", 432 | ] 433 | 434 | [[package]] 435 | name = "string_cache_codegen" 436 | version = "0.5.4" 437 | source = "registry+https://github.com/rust-lang/crates.io-index" 438 | checksum = "c711928715f1fe0fe509c53b43e993a9a557babc2d0a3567d0a3006f1ac931a0" 439 | dependencies = [ 440 | "phf_generator", 441 | "phf_shared", 442 | "proc-macro2", 443 | "quote", 444 | ] 445 | 446 | [[package]] 447 | name = "syn" 448 | version = "2.0.100" 449 | source = "registry+https://github.com/rust-lang/crates.io-index" 450 | checksum = "b09a44accad81e1ba1cd74a32461ba89dee89095ba17b32f5d03683b1b1fc2a0" 451 | dependencies = [ 452 | 
"proc-macro2", 453 | "quote", 454 | "unicode-ident", 455 | ] 456 | 457 | [[package]] 458 | name = "tendril" 459 | version = "0.4.3" 460 | source = "registry+https://github.com/rust-lang/crates.io-index" 461 | checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0" 462 | dependencies = [ 463 | "futf", 464 | "mac", 465 | "utf-8", 466 | ] 467 | 468 | [[package]] 469 | name = "unicode-ident" 470 | version = "1.0.18" 471 | source = "registry+https://github.com/rust-lang/crates.io-index" 472 | checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" 473 | 474 | [[package]] 475 | name = "unicode-width" 476 | version = "0.1.14" 477 | source = "registry+https://github.com/rust-lang/crates.io-index" 478 | checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af" 479 | 480 | [[package]] 481 | name = "utf-8" 482 | version = "0.7.6" 483 | source = "registry+https://github.com/rust-lang/crates.io-index" 484 | checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" 485 | 486 | [[package]] 487 | name = "windows-targets" 488 | version = "0.52.6" 489 | source = "registry+https://github.com/rust-lang/crates.io-index" 490 | checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" 491 | dependencies = [ 492 | "windows_aarch64_gnullvm", 493 | "windows_aarch64_msvc", 494 | "windows_i686_gnu", 495 | "windows_i686_gnullvm", 496 | "windows_i686_msvc", 497 | "windows_x86_64_gnu", 498 | "windows_x86_64_gnullvm", 499 | "windows_x86_64_msvc", 500 | ] 501 | 502 | [[package]] 503 | name = "windows_aarch64_gnullvm" 504 | version = "0.52.6" 505 | source = "registry+https://github.com/rust-lang/crates.io-index" 506 | checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" 507 | 508 | [[package]] 509 | name = "windows_aarch64_msvc" 510 | version = "0.52.6" 511 | source = "registry+https://github.com/rust-lang/crates.io-index" 512 | checksum = 
"09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" 513 | 514 | [[package]] 515 | name = "windows_i686_gnu" 516 | version = "0.52.6" 517 | source = "registry+https://github.com/rust-lang/crates.io-index" 518 | checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" 519 | 520 | [[package]] 521 | name = "windows_i686_gnullvm" 522 | version = "0.52.6" 523 | source = "registry+https://github.com/rust-lang/crates.io-index" 524 | checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" 525 | 526 | [[package]] 527 | name = "windows_i686_msvc" 528 | version = "0.52.6" 529 | source = "registry+https://github.com/rust-lang/crates.io-index" 530 | checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" 531 | 532 | [[package]] 533 | name = "windows_x86_64_gnu" 534 | version = "0.52.6" 535 | source = "registry+https://github.com/rust-lang/crates.io-index" 536 | checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" 537 | 538 | [[package]] 539 | name = "windows_x86_64_gnullvm" 540 | version = "0.52.6" 541 | source = "registry+https://github.com/rust-lang/crates.io-index" 542 | checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" 543 | 544 | [[package]] 545 | name = "windows_x86_64_msvc" 546 | version = "0.52.6" 547 | source = "registry+https://github.com/rust-lang/crates.io-index" 548 | checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" 549 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | resolver = "2" 3 | 4 | members = ["scraper"] 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright © 2016, June McEnroe 2 | Copyright © 2017, Vivek Kushwaha 3 | 
Copyright © 2024-2025, rust-scraper Contributors 4 | 5 | Permission to use, copy, modify, and/or distribute this software for any 6 | purpose with or without fee is hereby granted, provided that the above 7 | copyright notice and this permission notice appear in all copies. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 10 | WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 11 | MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 12 | ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 13 | WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 14 | ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 15 | OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 16 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | scraper/README.md -------------------------------------------------------------------------------- /scraper/Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 
3 | version = 3 4 | 5 | [[package]] 6 | name = "ahash" 7 | version = "0.8.11" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" 10 | dependencies = [ 11 | "cfg-if", 12 | "getrandom", 13 | "once_cell", 14 | "version_check", 15 | "zerocopy", 16 | ] 17 | 18 | [[package]] 19 | name = "autocfg" 20 | version = "1.1.0" 21 | source = "registry+https://github.com/rust-lang/crates.io-index" 22 | checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" 23 | 24 | [[package]] 25 | name = "bitflags" 26 | version = "1.3.2" 27 | source = "registry+https://github.com/rust-lang/crates.io-index" 28 | checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" 29 | 30 | [[package]] 31 | name = "bitflags" 32 | version = "2.4.1" 33 | source = "registry+https://github.com/rust-lang/crates.io-index" 34 | checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07" 35 | 36 | [[package]] 37 | name = "byteorder" 38 | version = "1.5.0" 39 | source = "registry+https://github.com/rust-lang/crates.io-index" 40 | checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" 41 | 42 | [[package]] 43 | name = "cfg-if" 44 | version = "1.0.0" 45 | source = "registry+https://github.com/rust-lang/crates.io-index" 46 | checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" 47 | 48 | [[package]] 49 | name = "cssparser" 50 | version = "0.31.2" 51 | source = "registry+https://github.com/rust-lang/crates.io-index" 52 | checksum = "5b3df4f93e5fbbe73ec01ec8d3f68bba73107993a5b1e7519273c32db9b0d5be" 53 | dependencies = [ 54 | "cssparser-macros", 55 | "dtoa-short", 56 | "itoa", 57 | "phf 0.11.2", 58 | "smallvec", 59 | ] 60 | 61 | [[package]] 62 | name = "cssparser-macros" 63 | version = "0.6.1" 64 | source = "registry+https://github.com/rust-lang/crates.io-index" 65 | checksum = 
"13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331" 66 | dependencies = [ 67 | "quote", 68 | "syn 2.0.38", 69 | ] 70 | 71 | [[package]] 72 | name = "derive_more" 73 | version = "0.99.17" 74 | source = "registry+https://github.com/rust-lang/crates.io-index" 75 | checksum = "4fb810d30a7c1953f91334de7244731fc3f3c10d7fe163338a35b9f640960321" 76 | dependencies = [ 77 | "proc-macro2", 78 | "quote", 79 | "syn 1.0.109", 80 | ] 81 | 82 | [[package]] 83 | name = "dtoa" 84 | version = "1.0.9" 85 | source = "registry+https://github.com/rust-lang/crates.io-index" 86 | checksum = "dcbb2bf8e87535c23f7a8a321e364ce21462d0ff10cb6407820e8e96dfff6653" 87 | 88 | [[package]] 89 | name = "dtoa-short" 90 | version = "0.3.4" 91 | source = "registry+https://github.com/rust-lang/crates.io-index" 92 | checksum = "dbaceec3c6e4211c79e7b1800fb9680527106beb2f9c51904a3210c03a448c74" 93 | dependencies = [ 94 | "dtoa", 95 | ] 96 | 97 | [[package]] 98 | name = "ego-tree" 99 | version = "0.7.0" 100 | source = "registry+https://github.com/rust-lang/crates.io-index" 101 | checksum = "870d6b5641b5e3e9baf0f79d6d5ccb5b171ec31038dd74fc4c93662756a3d2b1" 102 | 103 | [[package]] 104 | name = "equivalent" 105 | version = "1.0.1" 106 | source = "registry+https://github.com/rust-lang/crates.io-index" 107 | checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" 108 | 109 | [[package]] 110 | name = "futf" 111 | version = "0.1.5" 112 | source = "registry+https://github.com/rust-lang/crates.io-index" 113 | checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843" 114 | dependencies = [ 115 | "mac", 116 | "new_debug_unreachable", 117 | ] 118 | 119 | [[package]] 120 | name = "fxhash" 121 | version = "0.2.1" 122 | source = "registry+https://github.com/rust-lang/crates.io-index" 123 | checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" 124 | dependencies = [ 125 | "byteorder", 126 | ] 127 | 128 | [[package]] 129 | name = "getopts" 130 | 
version = "0.2.21" 131 | source = "registry+https://github.com/rust-lang/crates.io-index" 132 | checksum = "14dbbfd5c71d70241ecf9e6f13737f7b5ce823821063188d7e46c41d371eebd5" 133 | dependencies = [ 134 | "unicode-width", 135 | ] 136 | 137 | [[package]] 138 | name = "getrandom" 139 | version = "0.2.10" 140 | source = "registry+https://github.com/rust-lang/crates.io-index" 141 | checksum = "be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427" 142 | dependencies = [ 143 | "cfg-if", 144 | "libc", 145 | "wasi", 146 | ] 147 | 148 | [[package]] 149 | name = "hashbrown" 150 | version = "0.14.2" 151 | source = "registry+https://github.com/rust-lang/crates.io-index" 152 | checksum = "f93e7192158dbcda357bdec5fb5788eebf8bbac027f3f33e719d29135ae84156" 153 | 154 | [[package]] 155 | name = "html5ever" 156 | version = "0.27.0" 157 | source = "registry+https://github.com/rust-lang/crates.io-index" 158 | checksum = "c13771afe0e6e846f1e67d038d4cb29998a6779f93c809212e4e9c32efd244d4" 159 | dependencies = [ 160 | "log", 161 | "mac", 162 | "markup5ever", 163 | "proc-macro2", 164 | "quote", 165 | "syn 2.0.38", 166 | ] 167 | 168 | [[package]] 169 | name = "indexmap" 170 | version = "2.4.0" 171 | source = "registry+https://github.com/rust-lang/crates.io-index" 172 | checksum = "93ead53efc7ea8ed3cfb0c79fc8023fbb782a5432b52830b6518941cebe6505c" 173 | dependencies = [ 174 | "equivalent", 175 | "hashbrown", 176 | ] 177 | 178 | [[package]] 179 | name = "itoa" 180 | version = "1.0.9" 181 | source = "registry+https://github.com/rust-lang/crates.io-index" 182 | checksum = "af150ab688ff2122fcef229be89cb50dd66af9e01a4ff320cc137eecc9bacc38" 183 | 184 | [[package]] 185 | name = "libc" 186 | version = "0.2.149" 187 | source = "registry+https://github.com/rust-lang/crates.io-index" 188 | checksum = "a08173bc88b7955d1b3145aa561539096c421ac8debde8cbc3612ec635fee29b" 189 | 190 | [[package]] 191 | name = "lock_api" 192 | version = "0.4.11" 193 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 194 | checksum = "3c168f8615b12bc01f9c17e2eb0cc07dcae1940121185446edc3744920e8ef45" 195 | dependencies = [ 196 | "autocfg", 197 | "scopeguard", 198 | ] 199 | 200 | [[package]] 201 | name = "log" 202 | version = "0.4.20" 203 | source = "registry+https://github.com/rust-lang/crates.io-index" 204 | checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" 205 | 206 | [[package]] 207 | name = "mac" 208 | version = "0.1.1" 209 | source = "registry+https://github.com/rust-lang/crates.io-index" 210 | checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" 211 | 212 | [[package]] 213 | name = "markup5ever" 214 | version = "0.12.0" 215 | source = "registry+https://github.com/rust-lang/crates.io-index" 216 | checksum = "c7940b09815a02810a42b9e1bc41c069880a87de68e9b1dcbe754a3ba3b47c20" 217 | dependencies = [ 218 | "log", 219 | "phf 0.11.2", 220 | "phf_codegen 0.11.2", 221 | "string_cache", 222 | "string_cache_codegen", 223 | "tendril", 224 | ] 225 | 226 | [[package]] 227 | name = "new_debug_unreachable" 228 | version = "1.0.4" 229 | source = "registry+https://github.com/rust-lang/crates.io-index" 230 | checksum = "e4a24736216ec316047a1fc4252e27dabb04218aa4a3f37c6e7ddbf1f9782b54" 231 | 232 | [[package]] 233 | name = "once_cell" 234 | version = "1.19.0" 235 | source = "registry+https://github.com/rust-lang/crates.io-index" 236 | checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" 237 | 238 | [[package]] 239 | name = "parking_lot" 240 | version = "0.12.1" 241 | source = "registry+https://github.com/rust-lang/crates.io-index" 242 | checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" 243 | dependencies = [ 244 | "lock_api", 245 | "parking_lot_core", 246 | ] 247 | 248 | [[package]] 249 | name = "parking_lot_core" 250 | version = "0.9.9" 251 | source = "registry+https://github.com/rust-lang/crates.io-index" 252 | checksum = 
"4c42a9226546d68acdd9c0a280d17ce19bfe27a46bf68784e4066115788d008e" 253 | dependencies = [ 254 | "cfg-if", 255 | "libc", 256 | "redox_syscall", 257 | "smallvec", 258 | "windows-targets", 259 | ] 260 | 261 | [[package]] 262 | name = "phf" 263 | version = "0.10.1" 264 | source = "registry+https://github.com/rust-lang/crates.io-index" 265 | checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259" 266 | dependencies = [ 267 | "phf_shared 0.10.0", 268 | ] 269 | 270 | [[package]] 271 | name = "phf" 272 | version = "0.11.2" 273 | source = "registry+https://github.com/rust-lang/crates.io-index" 274 | checksum = "ade2d8b8f33c7333b51bcf0428d37e217e9f32192ae4772156f65063b8ce03dc" 275 | dependencies = [ 276 | "phf_macros", 277 | "phf_shared 0.11.2", 278 | ] 279 | 280 | [[package]] 281 | name = "phf_codegen" 282 | version = "0.10.0" 283 | source = "registry+https://github.com/rust-lang/crates.io-index" 284 | checksum = "4fb1c3a8bc4dd4e5cfce29b44ffc14bedd2ee294559a294e2a4d4c9e9a6a13cd" 285 | dependencies = [ 286 | "phf_generator 0.10.0", 287 | "phf_shared 0.10.0", 288 | ] 289 | 290 | [[package]] 291 | name = "phf_codegen" 292 | version = "0.11.2" 293 | source = "registry+https://github.com/rust-lang/crates.io-index" 294 | checksum = "e8d39688d359e6b34654d328e262234662d16cc0f60ec8dcbe5e718709342a5a" 295 | dependencies = [ 296 | "phf_generator 0.11.2", 297 | "phf_shared 0.11.2", 298 | ] 299 | 300 | [[package]] 301 | name = "phf_generator" 302 | version = "0.10.0" 303 | source = "registry+https://github.com/rust-lang/crates.io-index" 304 | checksum = "5d5285893bb5eb82e6aaf5d59ee909a06a16737a8970984dd7746ba9283498d6" 305 | dependencies = [ 306 | "phf_shared 0.10.0", 307 | "rand", 308 | ] 309 | 310 | [[package]] 311 | name = "phf_generator" 312 | version = "0.11.2" 313 | source = "registry+https://github.com/rust-lang/crates.io-index" 314 | checksum = "48e4cc64c2ad9ebe670cb8fd69dd50ae301650392e81c05f9bfcb2d5bdbc24b0" 315 | dependencies = [ 316 | "phf_shared 
0.11.2", 317 | "rand", 318 | ] 319 | 320 | [[package]] 321 | name = "phf_macros" 322 | version = "0.11.2" 323 | source = "registry+https://github.com/rust-lang/crates.io-index" 324 | checksum = "3444646e286606587e49f3bcf1679b8cef1dc2c5ecc29ddacaffc305180d464b" 325 | dependencies = [ 326 | "phf_generator 0.11.2", 327 | "phf_shared 0.11.2", 328 | "proc-macro2", 329 | "quote", 330 | "syn 2.0.38", 331 | ] 332 | 333 | [[package]] 334 | name = "phf_shared" 335 | version = "0.10.0" 336 | source = "registry+https://github.com/rust-lang/crates.io-index" 337 | checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096" 338 | dependencies = [ 339 | "siphasher", 340 | ] 341 | 342 | [[package]] 343 | name = "phf_shared" 344 | version = "0.11.2" 345 | source = "registry+https://github.com/rust-lang/crates.io-index" 346 | checksum = "90fcb95eef784c2ac79119d1dd819e162b5da872ce6f3c3abe1e8ca1c082f72b" 347 | dependencies = [ 348 | "siphasher", 349 | ] 350 | 351 | [[package]] 352 | name = "ppv-lite86" 353 | version = "0.2.17" 354 | source = "registry+https://github.com/rust-lang/crates.io-index" 355 | checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" 356 | 357 | [[package]] 358 | name = "precomputed-hash" 359 | version = "0.1.1" 360 | source = "registry+https://github.com/rust-lang/crates.io-index" 361 | checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" 362 | 363 | [[package]] 364 | name = "proc-macro2" 365 | version = "1.0.69" 366 | source = "registry+https://github.com/rust-lang/crates.io-index" 367 | checksum = "134c189feb4956b20f6f547d2cf727d4c0fe06722b20a0eec87ed445a97f92da" 368 | dependencies = [ 369 | "unicode-ident", 370 | ] 371 | 372 | [[package]] 373 | name = "quote" 374 | version = "1.0.33" 375 | source = "registry+https://github.com/rust-lang/crates.io-index" 376 | checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae" 377 | dependencies = [ 378 | "proc-macro2", 379 | ] 380 | 
381 | [[package]] 382 | name = "rand" 383 | version = "0.8.5" 384 | source = "registry+https://github.com/rust-lang/crates.io-index" 385 | checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" 386 | dependencies = [ 387 | "libc", 388 | "rand_chacha", 389 | "rand_core", 390 | ] 391 | 392 | [[package]] 393 | name = "rand_chacha" 394 | version = "0.3.1" 395 | source = "registry+https://github.com/rust-lang/crates.io-index" 396 | checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" 397 | dependencies = [ 398 | "ppv-lite86", 399 | "rand_core", 400 | ] 401 | 402 | [[package]] 403 | name = "rand_core" 404 | version = "0.6.4" 405 | source = "registry+https://github.com/rust-lang/crates.io-index" 406 | checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" 407 | dependencies = [ 408 | "getrandom", 409 | ] 410 | 411 | [[package]] 412 | name = "redox_syscall" 413 | version = "0.4.1" 414 | source = "registry+https://github.com/rust-lang/crates.io-index" 415 | checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" 416 | dependencies = [ 417 | "bitflags 1.3.2", 418 | ] 419 | 420 | [[package]] 421 | name = "scopeguard" 422 | version = "1.2.0" 423 | source = "registry+https://github.com/rust-lang/crates.io-index" 424 | checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" 425 | 426 | [[package]] 427 | name = "scraper" 428 | version = "0.20.0" 429 | dependencies = [ 430 | "ahash", 431 | "cssparser", 432 | "ego-tree", 433 | "getopts", 434 | "html5ever", 435 | "indexmap", 436 | "selectors", 437 | "tendril", 438 | ] 439 | 440 | [[package]] 441 | name = "selectors" 442 | version = "0.25.0" 443 | source = "registry+https://github.com/rust-lang/crates.io-index" 444 | checksum = "4eb30575f3638fc8f6815f448d50cb1a2e255b0897985c8c59f4d37b72a07b06" 445 | dependencies = [ 446 | "bitflags 2.4.1", 447 | "cssparser", 448 | "derive_more", 449 | "fxhash", 450 | "log", 451 | 
"new_debug_unreachable", 452 | "phf 0.10.1", 453 | "phf_codegen 0.10.0", 454 | "precomputed-hash", 455 | "servo_arc", 456 | "smallvec", 457 | ] 458 | 459 | [[package]] 460 | name = "serde" 461 | version = "1.0.190" 462 | source = "registry+https://github.com/rust-lang/crates.io-index" 463 | checksum = "91d3c334ca1ee894a2c6f6ad698fe8c435b76d504b13d436f0685d648d6d96f7" 464 | dependencies = [ 465 | "serde_derive", 466 | ] 467 | 468 | [[package]] 469 | name = "serde_derive" 470 | version = "1.0.190" 471 | source = "registry+https://github.com/rust-lang/crates.io-index" 472 | checksum = "67c5609f394e5c2bd7fc51efda478004ea80ef42fee983d5c67a65e34f32c0e3" 473 | dependencies = [ 474 | "proc-macro2", 475 | "quote", 476 | "syn 2.0.38", 477 | ] 478 | 479 | [[package]] 480 | name = "servo_arc" 481 | version = "0.3.0" 482 | source = "registry+https://github.com/rust-lang/crates.io-index" 483 | checksum = "d036d71a959e00c77a63538b90a6c2390969f9772b096ea837205c6bd0491a44" 484 | dependencies = [ 485 | "stable_deref_trait", 486 | ] 487 | 488 | [[package]] 489 | name = "siphasher" 490 | version = "0.3.11" 491 | source = "registry+https://github.com/rust-lang/crates.io-index" 492 | checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d" 493 | 494 | [[package]] 495 | name = "smallvec" 496 | version = "1.11.1" 497 | source = "registry+https://github.com/rust-lang/crates.io-index" 498 | checksum = "942b4a808e05215192e39f4ab80813e599068285906cc91aa64f923db842bd5a" 499 | 500 | [[package]] 501 | name = "stable_deref_trait" 502 | version = "1.2.0" 503 | source = "registry+https://github.com/rust-lang/crates.io-index" 504 | checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" 505 | 506 | [[package]] 507 | name = "string_cache" 508 | version = "0.8.7" 509 | source = "registry+https://github.com/rust-lang/crates.io-index" 510 | checksum = "f91138e76242f575eb1d3b38b4f1362f10d3a43f47d182a5b359af488a02293b" 511 | dependencies = [ 512 | 
"new_debug_unreachable", 513 | "once_cell", 514 | "parking_lot", 515 | "phf_shared 0.10.0", 516 | "precomputed-hash", 517 | "serde", 518 | ] 519 | 520 | [[package]] 521 | name = "string_cache_codegen" 522 | version = "0.5.2" 523 | source = "registry+https://github.com/rust-lang/crates.io-index" 524 | checksum = "6bb30289b722be4ff74a408c3cc27edeaad656e06cb1fe8fa9231fa59c728988" 525 | dependencies = [ 526 | "phf_generator 0.10.0", 527 | "phf_shared 0.10.0", 528 | "proc-macro2", 529 | "quote", 530 | ] 531 | 532 | [[package]] 533 | name = "syn" 534 | version = "1.0.109" 535 | source = "registry+https://github.com/rust-lang/crates.io-index" 536 | checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" 537 | dependencies = [ 538 | "proc-macro2", 539 | "quote", 540 | "unicode-ident", 541 | ] 542 | 543 | [[package]] 544 | name = "syn" 545 | version = "2.0.38" 546 | source = "registry+https://github.com/rust-lang/crates.io-index" 547 | checksum = "e96b79aaa137db8f61e26363a0c9b47d8b4ec75da28b7d1d614c2303e232408b" 548 | dependencies = [ 549 | "proc-macro2", 550 | "quote", 551 | "unicode-ident", 552 | ] 553 | 554 | [[package]] 555 | name = "tendril" 556 | version = "0.4.3" 557 | source = "registry+https://github.com/rust-lang/crates.io-index" 558 | checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0" 559 | dependencies = [ 560 | "futf", 561 | "mac", 562 | "utf-8", 563 | ] 564 | 565 | [[package]] 566 | name = "unicode-ident" 567 | version = "1.0.12" 568 | source = "registry+https://github.com/rust-lang/crates.io-index" 569 | checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" 570 | 571 | [[package]] 572 | name = "unicode-width" 573 | version = "0.1.11" 574 | source = "registry+https://github.com/rust-lang/crates.io-index" 575 | checksum = "e51733f11c9c4f72aa0c160008246859e340b00807569a0da0e7a1079b27ba85" 576 | 577 | [[package]] 578 | name = "utf-8" 579 | version = "0.7.6" 580 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 581 | checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" 582 | 583 | [[package]] 584 | name = "version_check" 585 | version = "0.9.4" 586 | source = "registry+https://github.com/rust-lang/crates.io-index" 587 | checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" 588 | 589 | [[package]] 590 | name = "wasi" 591 | version = "0.11.0+wasi-snapshot-preview1" 592 | source = "registry+https://github.com/rust-lang/crates.io-index" 593 | checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" 594 | 595 | [[package]] 596 | name = "windows-targets" 597 | version = "0.48.5" 598 | source = "registry+https://github.com/rust-lang/crates.io-index" 599 | checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" 600 | dependencies = [ 601 | "windows_aarch64_gnullvm", 602 | "windows_aarch64_msvc", 603 | "windows_i686_gnu", 604 | "windows_i686_msvc", 605 | "windows_x86_64_gnu", 606 | "windows_x86_64_gnullvm", 607 | "windows_x86_64_msvc", 608 | ] 609 | 610 | [[package]] 611 | name = "windows_aarch64_gnullvm" 612 | version = "0.48.5" 613 | source = "registry+https://github.com/rust-lang/crates.io-index" 614 | checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" 615 | 616 | [[package]] 617 | name = "windows_aarch64_msvc" 618 | version = "0.48.5" 619 | source = "registry+https://github.com/rust-lang/crates.io-index" 620 | checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" 621 | 622 | [[package]] 623 | name = "windows_i686_gnu" 624 | version = "0.48.5" 625 | source = "registry+https://github.com/rust-lang/crates.io-index" 626 | checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" 627 | 628 | [[package]] 629 | name = "windows_i686_msvc" 630 | version = "0.48.5" 631 | source = "registry+https://github.com/rust-lang/crates.io-index" 632 | checksum = 
"8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" 633 | 634 | [[package]] 635 | name = "windows_x86_64_gnu" 636 | version = "0.48.5" 637 | source = "registry+https://github.com/rust-lang/crates.io-index" 638 | checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" 639 | 640 | [[package]] 641 | name = "windows_x86_64_gnullvm" 642 | version = "0.48.5" 643 | source = "registry+https://github.com/rust-lang/crates.io-index" 644 | checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" 645 | 646 | [[package]] 647 | name = "windows_x86_64_msvc" 648 | version = "0.48.5" 649 | source = "registry+https://github.com/rust-lang/crates.io-index" 650 | checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" 651 | 652 | [[package]] 653 | name = "zerocopy" 654 | version = "0.7.32" 655 | source = "registry+https://github.com/rust-lang/crates.io-index" 656 | checksum = "74d4d3961e53fa4c9a25a8637fc2bfaf2595b3d3ae34875568a5cf64787716be" 657 | dependencies = [ 658 | "zerocopy-derive", 659 | ] 660 | 661 | [[package]] 662 | name = "zerocopy-derive" 663 | version = "0.7.32" 664 | source = "registry+https://github.com/rust-lang/crates.io-index" 665 | checksum = "9ce1b18ccd8e73a9321186f97e46f9f04b778851177567b1975109d26a08d2a6" 666 | dependencies = [ 667 | "proc-macro2", 668 | "quote", 669 | "syn 2.0.38", 670 | ] 671 | -------------------------------------------------------------------------------- /scraper/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "scraper" 3 | version = "0.23.1" 4 | edition = "2021" 5 | 6 | description = "HTML parsing and querying with CSS selectors" 7 | keywords = ["html", "css", "selector", "scraping"] 8 | 9 | authors = ["June McEnroe "] 10 | license = "ISC" 11 | 12 | repository = "https://github.com/causal-agent/scraper" 13 | readme = "README.md" 14 | 15 | [dependencies] 16 | cssparser = "0.35.0" 17 | ego-tree = 
"0.10.0" 18 | html5ever = "0.31.0" 19 | indexmap = { version = "2.7.1", optional = true } 20 | precomputed-hash = "0.1.1" 21 | selectors = "0.28.0" 22 | serde = { version = "1.0.218", optional = true } 23 | tendril = "0.4.3" 24 | 25 | [dependencies.getopts] 26 | version = "0.2.21" 27 | optional = true 28 | 29 | [features] 30 | default = ["main", "errors"] 31 | deterministic = ["indexmap"] 32 | main = ["getopts"] 33 | atomic = [] 34 | errors = [] 35 | serde = ["dep:serde"] 36 | 37 | [[bin]] 38 | name = "scraper" 39 | path = "src/main.rs" 40 | required-features = ["main"] 41 | -------------------------------------------------------------------------------- /scraper/LICENSE: -------------------------------------------------------------------------------- 1 | ./../LICENSE -------------------------------------------------------------------------------- /scraper/README.md: -------------------------------------------------------------------------------- 1 | # scraper 2 | 3 | [![crates.io](https://img.shields.io/crates/v/scraper?color=dark-green)][crate] 4 | [![downloads](https://img.shields.io/crates/d/scraper)][crate] 5 | [![test](https://github.com/causal-agent/scraper/actions/workflows/test.yml/badge.svg)][tests] 6 | 7 | HTML parsing and querying with CSS selectors. 8 | 9 | `scraper` is on [Crates.io][crate] and [GitHub][github]. 10 | 11 | [crate]: https://crates.io/crates/scraper 12 | [github]: https://github.com/causal-agent/scraper 13 | [tests]: https://github.com/causal-agent/scraper/actions/workflows/test.yml 14 | 15 | Scraper provides an interface to Servo's `html5ever` and `selectors` crates, for browser-grade parsing and querying. 16 | 17 | ## Examples 18 | 19 | ### Parsing a document 20 | 21 | ```rust 22 | use scraper::Html; 23 | 24 | let html = r#" 25 | 26 | 27 | Hello, world! 28 |

<h1 class="foo">Hello, <i>world!</i></h1>

29 | "#; 30 | 31 | let document = Html::parse_document(html); 32 | ``` 33 | 34 | ### Parsing a fragment 35 | 36 | ```rust 37 | use scraper::Html; 38 | let fragment = Html::parse_fragment("

<h1>Hello, <i>world!</i></h1>

"); 39 | ``` 40 | 41 | ### Parsing a selector 42 | 43 | ```rust 44 | use scraper::Selector; 45 | let selector = Selector::parse("h1.foo").unwrap(); 46 | ``` 47 | 48 | ### Selecting elements 49 | 50 | ```rust 51 | use scraper::{Html, Selector}; 52 | 53 | let html = r#" 54 |
    <ul>
 55 |         <li>Foo</li>
 56 |         <li>Bar</li>
 57 |         <li>Baz</li>
 58 |     </ul>
59 | "#; 60 | 61 | let fragment = Html::parse_fragment(html); 62 | let selector = Selector::parse("li").unwrap(); 63 | 64 | for element in fragment.select(&selector) { 65 | assert_eq!("li", element.value().name()); 66 | } 67 | ``` 68 | 69 | ### Selecting descendent elements 70 | 71 | ```rust 72 | use scraper::{Html, Selector}; 73 | 74 | let html = r#" 75 |
    <ul>
 76 |         <li>Foo</li>
 77 |         <li>Bar</li>
 78 |         <li>Baz</li>
 79 |     </ul>
80 | "#; 81 | 82 | let fragment = Html::parse_fragment(html); 83 | let ul_selector = Selector::parse("ul").unwrap(); 84 | let li_selector = Selector::parse("li").unwrap(); 85 | 86 | let ul = fragment.select(&ul_selector).next().unwrap(); 87 | for element in ul.select(&li_selector) { 88 | assert_eq!("li", element.value().name()); 89 | } 90 | ``` 91 | 92 | ### Accessing element attributes 93 | 94 | ```rust 95 | use scraper::{Html, Selector}; 96 | 97 | let fragment = Html::parse_fragment(r#"<input name="foo" value="bar">"#); 98 | let selector = Selector::parse(r#"input[name="foo"]"#).unwrap(); 99 | 100 | let input = fragment.select(&selector).next().unwrap(); 101 | assert_eq!(Some("bar"), input.value().attr("value")); 102 | ``` 103 | 104 | ### Serializing HTML and inner HTML 105 | 106 | ```rust 107 | use scraper::{Html, Selector}; 108 | 109 | let fragment = Html::parse_fragment("

<h1>Hello, <i>world!</i></h1>

"); 110 | let selector = Selector::parse("h1").unwrap(); 111 | 112 | let h1 = fragment.select(&selector).next().unwrap(); 113 | 114 | assert_eq!("

<h1>Hello, <i>world!</i></h1>

", h1.html()); 115 | assert_eq!("Hello, <i>world!</i>", h1.inner_html()); 116 | ``` 117 | 118 | ### Accessing descendent text 119 | 120 | ```rust 121 | use scraper::{Html, Selector}; 122 | 123 | let fragment = Html::parse_fragment("

<h1>Hello, <i>world!</i></h1>

"); 124 | let selector = Selector::parse("h1").unwrap(); 125 | 126 | let h1 = fragment.select(&selector).next().unwrap(); 127 | let text = h1.text().collect::<Vec<_>>(); 128 | 129 | assert_eq!(vec!["Hello, ", "world!"], text); 130 | ``` 131 | 132 | ### Manipulating the DOM 133 | 134 | ```rust 135 | use html5ever::tree_builder::TreeSink; 136 | use scraper::{Html, Selector}; 137 | 138 | let html = "hello

<p class=\"hello\">REMOVE ME</p>

"; 139 | let selector = Selector::parse(".hello").unwrap(); 140 | let mut document = Html::parse_document(html); 141 | let node_ids: Vec<_> = document.select(&selector).map(|x| x.id()).collect(); 142 | for id in node_ids { 143 | document.remove_from_parent(&id); 144 | } 145 | assert_eq!(document.html(), "hello"); 146 | ``` 147 | 148 | ## Contributing 149 | 150 | Please feel free to open pull requests. If you're planning on implementing 151 | something big (i.e. not fixing a typo, a small bug fix, minor refactor, etc) 152 | then please open an issue first. 153 | -------------------------------------------------------------------------------- /scraper/examples/document.rs: -------------------------------------------------------------------------------- 1 | extern crate scraper; 2 | 3 | use std::io::{self, Read, Write}; 4 | 5 | use scraper::{Html, Selector}; 6 | 7 | fn main() { 8 | let mut input = String::new(); 9 | let mut stdout = io::stdout(); 10 | let mut stdin = io::stdin(); 11 | 12 | write!(stdout, "CSS selector: ").unwrap(); 13 | stdout.flush().unwrap(); 14 | stdin.read_line(&mut input).unwrap(); 15 | let selector = Selector::parse(&input).unwrap(); 16 | 17 | writeln!(stdout, "HTML document:").unwrap(); 18 | stdout.flush().unwrap(); 19 | input.clear(); 20 | stdin.read_to_string(&mut input).unwrap(); 21 | let document = Html::parse_document(&input); 22 | 23 | println!("{:#?}", document); 24 | 25 | for node in document.select(&selector) { 26 | println!("{:?}", node.value()); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /scraper/examples/fragment.rs: -------------------------------------------------------------------------------- 1 | extern crate scraper; 2 | 3 | use std::io::{self, Read, Write}; 4 | 5 | use scraper::{Html, Selector}; 6 | 7 | fn main() { 8 | let mut input = String::new(); 9 | let mut stdout = io::stdout(); 10 | let mut stdin = io::stdin(); 11 | 12 | write!(stdout, "CSS selector: ").unwrap(); 
13 | stdout.flush().unwrap(); 14 | stdin.read_line(&mut input).unwrap(); 15 | let selector = Selector::parse(&input).unwrap(); 16 | 17 | writeln!(stdout, "HTML fragment:").unwrap(); 18 | stdout.flush().unwrap(); 19 | input.clear(); 20 | stdin.read_to_string(&mut input).unwrap(); 21 | let fragment = Html::parse_fragment(&input); 22 | 23 | println!("{:#?}", fragment); 24 | 25 | for node in fragment.select(&selector) { 26 | println!("{:?}", node.value()); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /scraper/scraper.1: -------------------------------------------------------------------------------- 1 | .Dd October 29, 2018 2 | .Dt SCRAPER 1 3 | .Os 4 | . 5 | .Sh NAME 6 | .Nm scraper 7 | .Nd HTML querying with CSS selectors 8 | . 9 | .Sh SYNOPSIS 10 | .Nm 11 | .Op Fl HIcint 12 | .Op Fl a Ar attr 13 | .Op Fl d | f 14 | .Ar selector 15 | .Op Ar 16 | . 17 | .Sh DESCRIPTION 18 | The 19 | .Nm 20 | utility parses HTML and outputs elements matching CSS selectors. 21 | . 22 | .Pp 23 | The arguments are as follows: 24 | .Bl -tag -width Ds 25 | . 26 | .It Fl H , Fl \-html 27 | Output the HTML of the matching elements. 28 | This is the default. 29 | . 30 | .It Fl I , Fl \-inner\-html 31 | Output the inner HTML of the matching elements. 32 | . 33 | .It Fl a Ar attr , Fl \-attr Ar attr 34 | Output the value of the attribute 35 | .Ar attr 36 | of the matching elements. 37 | . 38 | .It Fl c , Fl \-classes 39 | Output the classes of the matching elements. 40 | . 41 | .It Fl d , Fl \-document 42 | Parse the input as HTML documents. 43 | This is the default. 44 | . 45 | .It Fl f , Fl \-fragment 46 | Parse the input as HTML fragments. 47 | . 48 | .It Fl i , Fl \-id 49 | Output the IDs of the matching elements. 50 | . 51 | .It Fl n , Fl \-name 52 | Output the names of the matching elements. 53 | . 54 | .It Fl t , Fl \-text 55 | Output the text of the matching elements. 56 | .El 57 | . 
58 | .Sh EXIT STATUS 59 | The 60 | .Nm 61 | utility exits 0 on success, 62 | 1 if no elements match, 63 | and >1 if an error occurs. 64 | . 65 | .Sh AUTHORS 66 | .An June McEnroe Aq Mt june@causal.agency 67 | .An Vivek Kushwaha Aq Mt yoursvivek@gmail.com 68 | .Pp 69 | The 70 | .Nm 71 | utility relies heavily on code from the 72 | .Lk https://servo.org "Servo project" . 73 | -------------------------------------------------------------------------------- /scraper/src/element_ref/element.rs: -------------------------------------------------------------------------------- 1 | use html5ever::Namespace; 2 | use selectors::{ 3 | attr::{AttrSelectorOperation, CaseSensitivity, NamespaceConstraint}, 4 | bloom::BloomFilter, 5 | matching, Element, OpaqueElement, 6 | }; 7 | 8 | use super::ElementRef; 9 | use crate::selector::{CssLocalName, CssString, NonTSPseudoClass, PseudoElement, Simple}; 10 | 11 | /// Note: will never match against non-tree-structure pseudo-classes. 12 | impl Element for ElementRef<'_> { 13 | type Impl = Simple; 14 | 15 | fn opaque(&self) -> OpaqueElement { 16 | OpaqueElement::new(self.node.value()) 17 | } 18 | 19 | fn parent_element(&self) -> Option { 20 | self.parent().and_then(ElementRef::wrap) 21 | } 22 | 23 | fn parent_node_is_shadow_root(&self) -> bool { 24 | false 25 | } 26 | 27 | fn containing_shadow_host(&self) -> Option { 28 | None 29 | } 30 | 31 | fn is_pseudo_element(&self) -> bool { 32 | false 33 | } 34 | 35 | fn is_part(&self, _name: &CssLocalName) -> bool { 36 | false 37 | } 38 | 39 | fn is_same_type(&self, other: &Self) -> bool { 40 | self.value().name == other.value().name 41 | } 42 | 43 | fn imported_part(&self, _: &CssLocalName) -> Option { 44 | None 45 | } 46 | 47 | fn prev_sibling_element(&self) -> Option { 48 | self.prev_siblings() 49 | .find(|sibling| sibling.value().is_element()) 50 | .map(ElementRef::new) 51 | } 52 | 53 | fn next_sibling_element(&self) -> Option { 54 | self.next_siblings() 55 | .find(|sibling| 
sibling.value().is_element()) 56 | .map(ElementRef::new) 57 | } 58 | 59 | fn first_element_child(&self) -> Option { 60 | self.children() 61 | .find(|child| child.value().is_element()) 62 | .map(ElementRef::new) 63 | } 64 | 65 | fn is_html_element_in_html_document(&self) -> bool { 66 | // FIXME: Is there more to this? 67 | self.value().name.ns == ns!(html) 68 | } 69 | 70 | fn has_local_name(&self, name: &CssLocalName) -> bool { 71 | self.value().name.local == name.0 72 | } 73 | 74 | fn has_namespace(&self, namespace: &Namespace) -> bool { 75 | &self.value().name.ns == namespace 76 | } 77 | 78 | fn attr_matches( 79 | &self, 80 | ns: &NamespaceConstraint<&Namespace>, 81 | local_name: &CssLocalName, 82 | operation: &AttrSelectorOperation<&CssString>, 83 | ) -> bool { 84 | self.value().attrs.iter().any(|(key, value)| { 85 | !matches!(*ns, NamespaceConstraint::Specific(url) if *url != key.ns) 86 | && local_name.0 == key.local 87 | && operation.eval_str(value) 88 | }) 89 | } 90 | 91 | fn match_non_ts_pseudo_class( 92 | &self, 93 | _pc: &NonTSPseudoClass, 94 | _context: &mut matching::MatchingContext<'_, Self::Impl>, 95 | ) -> bool { 96 | false 97 | } 98 | 99 | fn match_pseudo_element( 100 | &self, 101 | _pe: &PseudoElement, 102 | _context: &mut matching::MatchingContext, 103 | ) -> bool { 104 | false 105 | } 106 | 107 | fn is_link(&self) -> bool { 108 | self.value().name() == "link" 109 | } 110 | 111 | fn is_html_slot_element(&self) -> bool { 112 | true 113 | } 114 | 115 | fn has_id(&self, id: &CssLocalName, case_sensitivity: CaseSensitivity) -> bool { 116 | match self.value().id() { 117 | Some(val) => case_sensitivity.eq(id.0.as_bytes(), val.as_bytes()), 118 | None => false, 119 | } 120 | } 121 | 122 | fn has_class(&self, name: &CssLocalName, case_sensitivity: CaseSensitivity) -> bool { 123 | self.value().has_class(&name.0, case_sensitivity) 124 | } 125 | 126 | fn has_custom_state(&self, _name: &CssLocalName) -> bool { 127 | false 128 | } 129 | 130 | fn is_empty(&self) 
-> bool { 131 | !self 132 | .children() 133 | .any(|child| child.value().is_element() || child.value().is_text()) 134 | } 135 | 136 | fn is_root(&self) -> bool { 137 | self.parent() 138 | .is_some_and(|parent| parent.value().is_document()) 139 | } 140 | 141 | fn apply_selector_flags(&self, _flags: matching::ElementSelectorFlags) {} 142 | 143 | fn add_element_unique_hashes(&self, _filter: &mut BloomFilter) -> bool { 144 | // FIXME: Do we want to add `self.node.id()` here? 145 | false 146 | } 147 | } 148 | 149 | #[cfg(test)] 150 | mod tests { 151 | use crate::html::Html; 152 | use crate::selector::{CssLocalName, Selector}; 153 | use selectors::attr::CaseSensitivity; 154 | use selectors::Element; 155 | 156 | #[test] 157 | fn test_has_id() { 158 | let html = ""; 159 | let fragment = Html::parse_fragment(html); 160 | let sel = Selector::parse("p").unwrap(); 161 | 162 | let element = fragment.select(&sel).next().unwrap(); 163 | assert!(element.has_id( 164 | &CssLocalName::from("link_id_456"), 165 | CaseSensitivity::CaseSensitive 166 | )); 167 | 168 | let html = "

hey there

"; 169 | let fragment = Html::parse_fragment(html); 170 | let element = fragment.select(&sel).next().unwrap(); 171 | assert!(!element.has_id( 172 | &CssLocalName::from("any_link_id"), 173 | CaseSensitivity::CaseSensitive 174 | )); 175 | } 176 | 177 | #[test] 178 | fn test_is_link() { 179 | let html = ""; 180 | let fragment = Html::parse_fragment(html); 181 | let sel = Selector::parse("link").unwrap(); 182 | let element = fragment.select(&sel).next().unwrap(); 183 | assert!(element.is_link()); 184 | 185 | let html = "

hey there

"; 186 | let fragment = Html::parse_fragment(html); 187 | let sel = Selector::parse("p").unwrap(); 188 | let element = fragment.select(&sel).next().unwrap(); 189 | assert!(!element.is_link()); 190 | } 191 | 192 | #[test] 193 | fn test_has_class() { 194 | let html = "

hey there

"; 195 | let fragment = Html::parse_fragment(html); 196 | let sel = Selector::parse("p").unwrap(); 197 | let element = fragment.select(&sel).next().unwrap(); 198 | assert!(element.has_class( 199 | &CssLocalName::from("my_class"), 200 | CaseSensitivity::CaseSensitive 201 | )); 202 | 203 | let html = "

hey there

"; 204 | let fragment = Html::parse_fragment(html); 205 | let sel = Selector::parse("p").unwrap(); 206 | let element = fragment.select(&sel).next().unwrap(); 207 | assert!(!element.has_class( 208 | &CssLocalName::from("my_class"), 209 | CaseSensitivity::CaseSensitive 210 | )); 211 | } 212 | } 213 | -------------------------------------------------------------------------------- /scraper/src/element_ref/mod.rs: -------------------------------------------------------------------------------- 1 | //! Element references. 2 | 3 | use std::fmt::{self, Debug}; 4 | use std::iter::FusedIterator; 5 | use std::ops::Deref; 6 | 7 | use ego_tree::iter::{Edge, Traverse}; 8 | use ego_tree::NodeRef; 9 | use html5ever::serialize::{serialize, SerializeOpts, TraversalScope}; 10 | use selectors::matching::SelectorCaches; 11 | 12 | use crate::node::Element; 13 | use crate::{Node, Selector}; 14 | 15 | /// Wrapper around a reference to an element node. 16 | /// 17 | /// This wrapper implements the `Element` trait from the `selectors` crate, which allows it to be 18 | /// matched against CSS selectors. 19 | #[derive(Clone, Copy, PartialEq, Eq)] 20 | pub struct ElementRef<'a> { 21 | node: NodeRef<'a, Node>, 22 | } 23 | 24 | impl<'a> ElementRef<'a> { 25 | fn new(node: NodeRef<'a, Node>) -> Self { 26 | ElementRef { node } 27 | } 28 | 29 | /// Wraps a `NodeRef` only if it references a `Node::Element`. 30 | pub fn wrap(node: NodeRef<'a, Node>) -> Option { 31 | if node.value().is_element() { 32 | Some(ElementRef::new(node)) 33 | } else { 34 | None 35 | } 36 | } 37 | 38 | /// Returns the `Element` referenced by `self`. 39 | pub fn value(&self) -> &'a Element { 40 | self.node.value().as_element().unwrap() 41 | } 42 | 43 | /// Returns an iterator over descendent elements matching a selector. 44 | pub fn select<'b>(&self, selector: &'b Selector) -> Select<'a, 'b> { 45 | let mut inner = self.traverse(); 46 | inner.next(); // Skip Edge::Open(self). 
47 | 48 | Select { 49 | scope: *self, 50 | inner, 51 | selector, 52 | caches: Default::default(), 53 | } 54 | } 55 | 56 | fn serialize(&self, traversal_scope: TraversalScope) -> String { 57 | let opts = SerializeOpts { 58 | scripting_enabled: false, // It's not clear what this does. 59 | traversal_scope, 60 | create_missing_parent: false, 61 | }; 62 | let mut buf = Vec::new(); 63 | serialize(&mut buf, self, opts).unwrap(); 64 | String::from_utf8(buf).unwrap() 65 | } 66 | 67 | /// Returns the HTML of this element. 68 | pub fn html(&self) -> String { 69 | self.serialize(TraversalScope::IncludeNode) 70 | } 71 | 72 | /// Returns the inner HTML of this element. 73 | pub fn inner_html(&self) -> String { 74 | self.serialize(TraversalScope::ChildrenOnly(None)) 75 | } 76 | 77 | /// Returns the value of an attribute. 78 | pub fn attr(&self, attr: &str) -> Option<&'a str> { 79 | self.value().attr(attr) 80 | } 81 | 82 | /// Returns an iterator over descendent text nodes. 83 | pub fn text(&self) -> Text<'a> { 84 | Text { 85 | inner: self.traverse(), 86 | } 87 | } 88 | 89 | /// Iterate over all child nodes which are elements 90 | /// 91 | /// # Example 92 | /// 93 | /// ``` 94 | /// # use scraper::Html; 95 | /// let fragment = Html::parse_fragment("foobarbazqux"); 96 | /// 97 | /// let children = fragment.root_element().child_elements().map(|element| element.value().name()).collect::>(); 98 | /// assert_eq!(children, ["span", "a"]); 99 | /// ``` 100 | pub fn child_elements(&self) -> impl Iterator> { 101 | self.children().filter_map(ElementRef::wrap) 102 | } 103 | 104 | /// Iterate over all descendent nodes which are elements 105 | /// 106 | /// # Example 107 | /// 108 | /// ``` 109 | /// # use scraper::Html; 110 | /// let fragment = Html::parse_fragment("foobarbazqux"); 111 | /// 112 | /// let descendants = fragment.root_element().descendent_elements().map(|element| element.value().name()).collect::>(); 113 | /// assert_eq!(descendants, ["html", "span", "b", "a", "i"]); 114 | 
/// ``` 115 | pub fn descendent_elements(&self) -> impl Iterator> { 116 | self.descendants().filter_map(ElementRef::wrap) 117 | } 118 | } 119 | 120 | impl Debug for ElementRef<'_> { 121 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 122 | Debug::fmt(self.value(), f) 123 | } 124 | } 125 | 126 | impl<'a> Deref for ElementRef<'a> { 127 | type Target = NodeRef<'a, Node>; 128 | fn deref(&self) -> &NodeRef<'a, Node> { 129 | &self.node 130 | } 131 | } 132 | 133 | /// Iterator over descendent elements matching a selector. 134 | pub struct Select<'a, 'b> { 135 | scope: ElementRef<'a>, 136 | inner: Traverse<'a, Node>, 137 | selector: &'b Selector, 138 | caches: SelectorCaches, 139 | } 140 | 141 | impl Debug for Select<'_, '_> { 142 | fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { 143 | fmt.debug_struct("Select") 144 | .field("scope", &self.scope) 145 | .field("inner", &self.inner) 146 | .field("selector", &self.selector) 147 | .field("caches", &"..") 148 | .finish() 149 | } 150 | } 151 | 152 | impl Clone for Select<'_, '_> { 153 | fn clone(&self) -> Self { 154 | Self { 155 | scope: self.scope, 156 | inner: self.inner.clone(), 157 | selector: self.selector, 158 | caches: Default::default(), 159 | } 160 | } 161 | } 162 | 163 | impl<'a> Iterator for Select<'a, '_> { 164 | type Item = ElementRef<'a>; 165 | 166 | fn next(&mut self) -> Option> { 167 | for edge in &mut self.inner { 168 | if let Edge::Open(node) = edge { 169 | if let Some(element) = ElementRef::wrap(node) { 170 | if self.selector.matches_with_scope_and_cache( 171 | &element, 172 | Some(self.scope), 173 | &mut self.caches, 174 | ) { 175 | return Some(element); 176 | } 177 | } 178 | } 179 | } 180 | None 181 | } 182 | } 183 | 184 | impl FusedIterator for Select<'_, '_> {} 185 | 186 | /// Iterator over descendent text nodes. 
187 | #[derive(Debug, Clone)] 188 | pub struct Text<'a> { 189 | inner: Traverse<'a, Node>, 190 | } 191 | 192 | impl<'a> Iterator for Text<'a> { 193 | type Item = &'a str; 194 | 195 | fn next(&mut self) -> Option<&'a str> { 196 | for edge in &mut self.inner { 197 | if let Edge::Open(node) = edge { 198 | if let Node::Text(ref text) = node.value() { 199 | return Some(&**text); 200 | } 201 | } 202 | } 203 | None 204 | } 205 | } 206 | 207 | impl FusedIterator for Text<'_> {} 208 | 209 | mod element; 210 | mod serializable; 211 | 212 | #[cfg(test)] 213 | mod tests { 214 | use crate::html::Html; 215 | use crate::selector::Selector; 216 | 217 | #[test] 218 | fn test_scope() { 219 | let html = r" 220 |
221 | 1 222 | 223 | 2 224 | 3 225 | 226 |
227 | "; 228 | let fragment = Html::parse_fragment(html); 229 | let sel1 = Selector::parse("div > span").unwrap(); 230 | let sel2 = Selector::parse(":scope > b").unwrap(); 231 | 232 | let element1 = fragment.select(&sel1).next().unwrap(); 233 | let element2 = element1.select(&sel2).next().unwrap(); 234 | assert_eq!(element2.inner_html(), "3"); 235 | } 236 | } 237 | -------------------------------------------------------------------------------- /scraper/src/element_ref/serializable.rs: -------------------------------------------------------------------------------- 1 | use std::io::Error; 2 | 3 | use html5ever::serialize::{Serialize, Serializer, TraversalScope}; 4 | 5 | use crate::ElementRef; 6 | 7 | impl Serialize for ElementRef<'_> { 8 | fn serialize( 9 | &self, 10 | serializer: &mut S, 11 | traversal_scope: TraversalScope, 12 | ) -> Result<(), Error> { 13 | crate::node::serializable::serialize(**self, serializer, traversal_scope) 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /scraper/src/error.rs: -------------------------------------------------------------------------------- 1 | //! Custom error types for diagnostics 2 | //! 
Includes re-exported error types from dependencies 3 | 4 | mod utils; 5 | 6 | use std::{error::Error, fmt::Display}; 7 | 8 | use cssparser::{BasicParseErrorKind, ParseErrorKind, Token}; 9 | use selectors::parser::SelectorParseErrorKind; 10 | 11 | /// Error type that is returned when calling `Selector::parse` 12 | #[derive(Debug, Clone)] 13 | pub enum SelectorErrorKind<'a> { 14 | /// A `Token` was not expected 15 | UnexpectedToken(Token<'a>), 16 | 17 | /// End-Of-Line was unexpected 18 | EndOfLine, 19 | 20 | /// `@` rule is invalid 21 | InvalidAtRule(String), 22 | 23 | /// The body of an `@` rule is invalid 24 | InvalidAtRuleBody, 25 | 26 | /// The qualified rule is invalid 27 | QualRuleInvalid, 28 | 29 | /// Expected a `::` for a pseudoelement 30 | ExpectedColonOnPseudoElement(Token<'a>), 31 | 32 | /// Expected an identity for a pseudoelement 33 | ExpectedIdentityOnPseudoElement(Token<'a>), 34 | 35 | /// A `SelectorParseErrorKind` error that isn't really supposed to happen did 36 | UnexpectedSelectorParseError(SelectorParseErrorKind<'a>), 37 | } 38 | 39 | impl<'a> From>> for SelectorErrorKind<'a> { 40 | fn from(original: cssparser::ParseError<'a, SelectorParseErrorKind<'a>>) -> Self { 41 | // NOTE: This could be improved, but I dont 42 | // exactly know how 43 | match original.kind { 44 | ParseErrorKind::Basic(err) => SelectorErrorKind::from(err), 45 | ParseErrorKind::Custom(err) => SelectorErrorKind::from(err), 46 | } 47 | } 48 | } 49 | 50 | impl<'a> From> for SelectorErrorKind<'a> { 51 | fn from(err: BasicParseErrorKind<'a>) -> Self { 52 | match err { 53 | BasicParseErrorKind::UnexpectedToken(token) => Self::UnexpectedToken(token), 54 | BasicParseErrorKind::EndOfInput => Self::EndOfLine, 55 | BasicParseErrorKind::AtRuleInvalid(rule) => Self::InvalidAtRule(rule.to_string()), 56 | BasicParseErrorKind::AtRuleBodyInvalid => Self::InvalidAtRuleBody, 57 | BasicParseErrorKind::QualifiedRuleInvalid => Self::QualRuleInvalid, 58 | } 59 | } 60 | } 61 | 62 | impl<'a> From> 
for SelectorErrorKind<'a> { 63 | fn from(err: SelectorParseErrorKind<'a>) -> Self { 64 | match err { 65 | SelectorParseErrorKind::PseudoElementExpectedColon(token) => { 66 | Self::ExpectedColonOnPseudoElement(token) 67 | } 68 | SelectorParseErrorKind::PseudoElementExpectedIdent(token) => { 69 | Self::ExpectedIdentityOnPseudoElement(token) 70 | } 71 | other => Self::UnexpectedSelectorParseError(other), 72 | } 73 | } 74 | } 75 | 76 | impl Display for SelectorErrorKind<'_> { 77 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 78 | write!( 79 | f, 80 | "{}", 81 | match self { 82 | Self::UnexpectedToken(token) => { 83 | format!("Token {:?} was not expected", utils::render_token(token)) 84 | } 85 | Self::EndOfLine => "Unexpected EOL".to_string(), 86 | Self::InvalidAtRule(rule) => format!("Invalid @-rule {:?}", rule), 87 | Self::InvalidAtRuleBody => "The body of an @-rule was invalid".to_string(), 88 | Self::QualRuleInvalid => "The qualified name was invalid".to_string(), 89 | Self::ExpectedColonOnPseudoElement(token) => format!( 90 | "Expected a ':' token for pseudoelement, got {:?} instead", 91 | utils::render_token(token) 92 | ), 93 | Self::ExpectedIdentityOnPseudoElement(token) => format!( 94 | "Expected identity for pseudoelement, got {:?} instead", 95 | utils::render_token(token) 96 | ), 97 | Self::UnexpectedSelectorParseError(err) => format!( 98 | "Unexpected error occurred. 
Please report this to the developer\n{:#?}", 99 | err 100 | ), 101 | } 102 | ) 103 | } 104 | } 105 | 106 | impl Error for SelectorErrorKind<'_> { 107 | fn description(&self) -> &str { 108 | match self { 109 | Self::UnexpectedToken(_) => "Token was not expected", 110 | Self::EndOfLine => "Unexpected EOL", 111 | Self::InvalidAtRule(_) => "Invalid @-rule", 112 | Self::InvalidAtRuleBody => "The body of an @-rule was invalid", 113 | Self::QualRuleInvalid => "The qualified name was invalid", 114 | Self::ExpectedColonOnPseudoElement(_) => "Missing colon character on pseudoelement", 115 | Self::ExpectedIdentityOnPseudoElement(_) => "Missing pseudoelement identity", 116 | Self::UnexpectedSelectorParseError(_) => "Unexpected error", 117 | } 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /scraper/src/error/utils.rs: -------------------------------------------------------------------------------- 1 | use cssparser::Token; 2 | 3 | pub(crate) fn render_token(token: &Token<'_>) -> String { 4 | match token { 5 | Token::Ident(ident) => ident.to_string(), 6 | Token::AtKeyword(value) => format!("@{}", value), 7 | Token::Hash(name) | Token::IDHash(name) => format!("#{}", name), 8 | Token::QuotedString(value) => format!("\"{}\"", value), 9 | Token::UnquotedUrl(value) => value.to_string(), 10 | Token::Number { 11 | has_sign: signed, 12 | value: num, 13 | int_value: _, 14 | } 15 | | Token::Percentage { 16 | has_sign: signed, 17 | unit_value: num, 18 | int_value: _, 19 | } => render_number(*signed, *num, token), 20 | Token::Dimension { 21 | has_sign: signed, 22 | value: num, 23 | int_value: _, 24 | unit, 25 | } => format!("{}{}", render_int(*signed, *num), unit), 26 | Token::WhiteSpace(_) => String::from(" "), 27 | Token::Comment(comment) => format!("/* {} */", comment), 28 | Token::Function(name) => format!("{}()", name), 29 | Token::BadString(string) => format!("", string), 30 | Token::BadUrl(url) => format!("", url), 31 | // 
Single-character token 32 | Token::Colon => ":".into(), 33 | Token::Semicolon => ";".into(), 34 | Token::Comma => ",".into(), 35 | Token::IncludeMatch => "~=".into(), 36 | Token::DashMatch => "|=".into(), 37 | Token::PrefixMatch => "^=".into(), 38 | Token::SuffixMatch => "$=".into(), 39 | Token::SubstringMatch => "*=".into(), 40 | Token::CDO => "".into(), 42 | Token::ParenthesisBlock => "<(".into(), 43 | Token::SquareBracketBlock => "<[".into(), 44 | Token::CurlyBracketBlock => "<{".into(), 45 | Token::CloseParenthesis => "<)".into(), 46 | Token::CloseSquareBracket => "<]".into(), 47 | Token::CloseCurlyBracket => "<}".into(), 48 | Token::Delim(delim) => (*delim).into(), 49 | } 50 | } 51 | 52 | fn render_number(signed: bool, num: f32, token: &Token) -> String { 53 | let num = render_int(signed, num); 54 | 55 | match token { 56 | Token::Number { .. } => num, 57 | Token::Percentage { .. } => format!("{}%", num), 58 | _ => panic!("render_number is not supposed to be called on a non-numerical token"), 59 | } 60 | } 61 | 62 | fn render_int(signed: bool, num: f32) -> String { 63 | if signed { 64 | render_int_signed(num) 65 | } else { 66 | render_int_unsigned(num) 67 | } 68 | } 69 | 70 | fn render_int_signed(num: f32) -> String { 71 | // A signed token must render with exactly one sign character. `{:+}` 72 | // prints `+` in front of positive values and a single `-` in front of 73 | // negative ones; prefixing `-` by hand used to turn `-5` into `--5` 74 | // and an explicitly signed `+0` into `-0`. 75 | format!("{:+}", num) 76 | } 77 | 78 | fn render_int_unsigned(num: f32) -> String { 79 | format!("{}", num) 80 | } 81 | 82 | #[cfg(test)] 83 | mod tests { 84 | use crate::Selector; 85 | 86 | #[test] 87 | fn regression_test_issue212() { 88 | let err = Selector::parse("div138293@!#@!!@#").unwrap_err(); 89 | assert_eq!(err.to_string(), "Token \"@\" was not expected"); 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /scraper/src/html/mod.rs: -------------------------------------------------------------------------------- 1 | //! HTML documents and fragments.
2 | 3 | #[cfg(feature = "errors")] 4 | use std::borrow::Cow; 5 | use std::fmt; 6 | use std::iter::FusedIterator; 7 | 8 | use ego_tree::iter::Nodes; 9 | use ego_tree::Tree; 10 | use html5ever::serialize::SerializeOpts; 11 | use html5ever::tree_builder::QuirksMode; 12 | use html5ever::{driver, serialize, QualName}; 13 | use selectors::matching::SelectorCaches; 14 | use tendril::TendrilSink; 15 | 16 | use crate::selector::Selector; 17 | use crate::{ElementRef, Node}; 18 | 19 | pub use tree_sink::HtmlTreeSink; 20 | 21 | /// An HTML tree. 22 | /// 23 | /// Parsing does not fail hard. Instead, the `quirks_mode` is set and errors are added to the 24 | /// `errors` field. The `tree` will still be populated as best as possible. 25 | /// 26 | /// Implements the `TreeSink` trait from the `html5ever` crate, which allows HTML to be parsed. 27 | #[derive(Debug, Clone, PartialEq, Eq)] 28 | pub struct Html { 29 | #[cfg(feature = "errors")] 30 | /// Parse errors. 31 | pub errors: Vec>, 32 | 33 | /// The quirks mode. 34 | pub quirks_mode: QuirksMode, 35 | 36 | /// The node tree. 37 | pub tree: Tree, 38 | } 39 | 40 | impl Html { 41 | /// Creates an empty HTML document. 42 | pub fn new_document() -> Self { 43 | Html { 44 | #[cfg(feature = "errors")] 45 | errors: Vec::new(), 46 | quirks_mode: QuirksMode::NoQuirks, 47 | tree: Tree::new(Node::Document), 48 | } 49 | } 50 | 51 | /// Creates an empty HTML fragment. 52 | pub fn new_fragment() -> Self { 53 | Html { 54 | #[cfg(feature = "errors")] 55 | errors: Vec::new(), 56 | quirks_mode: QuirksMode::NoQuirks, 57 | tree: Tree::new(Node::Fragment), 58 | } 59 | } 60 | 61 | /// Parses a string of HTML as a document. 
62 | /// 63 | /// This is a convenience method for the following: 64 | /// 65 | /// ``` 66 | /// # extern crate html5ever; 67 | /// # extern crate scraper; 68 | /// # extern crate tendril; 69 | /// # fn main() { 70 | /// # let document = ""; 71 | /// use html5ever::driver::{self, ParseOpts}; 72 | /// use scraper::{Html, HtmlTreeSink}; 73 | /// use tendril::TendrilSink; 74 | /// 75 | /// let parser = driver::parse_document(HtmlTreeSink::new(Html::new_document()), ParseOpts::default()); 76 | /// let html = parser.one(document); 77 | /// # } 78 | /// ``` 79 | pub fn parse_document(document: &str) -> Self { 80 | let parser = 81 | driver::parse_document(HtmlTreeSink::new(Self::new_document()), Default::default()); 82 | parser.one(document) 83 | } 84 | 85 | /// Parses a string of HTML as a fragment. 86 | pub fn parse_fragment(fragment: &str) -> Self { 87 | let parser = driver::parse_fragment( 88 | HtmlTreeSink::new(Self::new_fragment()), 89 | Default::default(), 90 | QualName::new(None, ns!(html), local_name!("body")), 91 | Vec::new(), 92 | ); 93 | parser.one(fragment) 94 | } 95 | 96 | /// Returns an iterator over elements matching a selector. 97 | pub fn select<'a, 'b>(&'a self, selector: &'b Selector) -> Select<'a, 'b> { 98 | Select { 99 | inner: self.tree.nodes(), 100 | selector, 101 | caches: Default::default(), 102 | } 103 | } 104 | 105 | /// Returns the root `` element. 106 | pub fn root_element(&self) -> ElementRef { 107 | let root_node = self 108 | .tree 109 | .root() 110 | .children() 111 | .find(|child| child.value().is_element()) 112 | .expect("html node missing"); 113 | ElementRef::wrap(root_node).unwrap() 114 | } 115 | 116 | /// Serialize entire document into HTML. 117 | pub fn html(&self) -> String { 118 | let opts = SerializeOpts { 119 | scripting_enabled: false, // It's not clear what this does. 
120 | traversal_scope: serialize::TraversalScope::IncludeNode, 121 | create_missing_parent: false, 122 | }; 123 | let mut buf = Vec::new(); 124 | serialize(&mut buf, self, opts).unwrap(); 125 | String::from_utf8(buf).unwrap() 126 | } 127 | } 128 | 129 | /// Iterator over elements matching a selector. 130 | pub struct Select<'a, 'b> { 131 | inner: Nodes<'a, Node>, 132 | selector: &'b Selector, 133 | caches: SelectorCaches, 134 | } 135 | 136 | impl fmt::Debug for Select<'_, '_> { 137 | fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { 138 | fmt.debug_struct("Select") 139 | .field("inner", &self.inner) 140 | .field("selector", &self.selector) 141 | .field("caches", &"..") 142 | .finish() 143 | } 144 | } 145 | 146 | impl Clone for Select<'_, '_> { 147 | fn clone(&self) -> Self { 148 | Self { 149 | inner: self.inner.clone(), 150 | selector: self.selector, 151 | caches: Default::default(), 152 | } 153 | } 154 | } 155 | 156 | impl<'a> Iterator for Select<'a, '_> { 157 | type Item = ElementRef<'a>; 158 | 159 | fn next(&mut self) -> Option> { 160 | for node in self.inner.by_ref() { 161 | if let Some(element) = ElementRef::wrap(node) { 162 | if element.parent().is_some() 163 | && self 164 | .selector 165 | .matches_with_scope_and_cache(&element, None, &mut self.caches) 166 | { 167 | return Some(element); 168 | } 169 | } 170 | } 171 | None 172 | } 173 | 174 | fn size_hint(&self) -> (usize, Option) { 175 | let (_lower, upper) = self.inner.size_hint(); 176 | 177 | (0, upper) 178 | } 179 | } 180 | 181 | impl DoubleEndedIterator for Select<'_, '_> { 182 | fn next_back(&mut self) -> Option { 183 | for node in self.inner.by_ref().rev() { 184 | if let Some(element) = ElementRef::wrap(node) { 185 | if element.parent().is_some() 186 | && self 187 | .selector 188 | .matches_with_scope_and_cache(&element, None, &mut self.caches) 189 | { 190 | return Some(element); 191 | } 192 | } 193 | } 194 | None 195 | } 196 | } 197 | 198 | impl FusedIterator for Select<'_, '_> {} 199 | 200 
| mod serializable; 201 | mod tree_sink; 202 | 203 | #[cfg(test)] 204 | mod tests { 205 | use super::Html; 206 | use super::Selector; 207 | 208 | #[test] 209 | fn root_element_fragment() { 210 | let html = Html::parse_fragment(r#"1"#); 211 | let root_ref = html.root_element(); 212 | let href = root_ref 213 | .select(&Selector::parse("a").unwrap()) 214 | .next() 215 | .unwrap(); 216 | assert_eq!(href.inner_html(), "1"); 217 | assert_eq!(href.value().attr("href").unwrap(), "http://github.com"); 218 | } 219 | 220 | #[test] 221 | fn root_element_document_doctype() { 222 | let html = Html::parse_document("\nabc"); 223 | let root_ref = html.root_element(); 224 | let title = root_ref 225 | .select(&Selector::parse("title").unwrap()) 226 | .next() 227 | .unwrap(); 228 | assert_eq!(title.inner_html(), "abc"); 229 | } 230 | 231 | #[test] 232 | fn root_element_document_comment() { 233 | let html = Html::parse_document("abc"); 234 | let root_ref = html.root_element(); 235 | let title = root_ref 236 | .select(&Selector::parse("title").unwrap()) 237 | .next() 238 | .unwrap(); 239 | assert_eq!(title.inner_html(), "abc"); 240 | } 241 | 242 | #[test] 243 | fn select_is_reversible() { 244 | let html = Html::parse_document("

element1

element2

element3

"); 245 | let selector = Selector::parse("p").unwrap(); 246 | let result: Vec<_> = html 247 | .select(&selector) 248 | .rev() 249 | .map(|e| e.inner_html()) 250 | .collect(); 251 | assert_eq!(result, vec!["element3", "element2", "element1"]); 252 | } 253 | 254 | #[test] 255 | fn select_has_a_size_hint() { 256 | let html = Html::parse_document("

element1

element2

element3

"); 257 | let selector = Selector::parse("p").unwrap(); 258 | let (lower, upper) = html.select(&selector).size_hint(); 259 | assert_eq!(lower, 0); 260 | assert_eq!(upper, Some(10)); 261 | } 262 | 263 | #[cfg(feature = "atomic")] 264 | #[test] 265 | fn html_is_send() { 266 | fn send_sync() {} 267 | send_sync::(); 268 | } 269 | } 270 | -------------------------------------------------------------------------------- /scraper/src/html/serializable.rs: -------------------------------------------------------------------------------- 1 | use std::io::Error; 2 | 3 | use html5ever::serialize::{Serialize, Serializer, TraversalScope}; 4 | 5 | use crate::Html; 6 | 7 | impl Serialize for Html { 8 | fn serialize( 9 | &self, 10 | serializer: &mut S, 11 | traversal_scope: TraversalScope, 12 | ) -> Result<(), Error> { 13 | crate::node::serializable::serialize(self.tree.root(), serializer, traversal_scope) 14 | } 15 | } 16 | 17 | #[cfg(test)] 18 | mod tests { 19 | use crate::Html; 20 | 21 | #[test] 22 | fn test_serialize() { 23 | let src = r#"

Hello world!

"#; 24 | let html = Html::parse_document(src); 25 | assert_eq!(html.html(), src); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /scraper/src/html/tree_sink.rs: -------------------------------------------------------------------------------- 1 | use super::Html; 2 | use crate::node::{Comment, Doctype, Element, Node, ProcessingInstruction, Text}; 3 | use crate::tendril_util::make as make_tendril; 4 | use ego_tree::NodeId; 5 | use html5ever::tendril::StrTendril; 6 | use html5ever::tree_builder::{ElementFlags, NodeOrText, QuirksMode, TreeSink}; 7 | use html5ever::Attribute; 8 | use html5ever::QualName; 9 | use std::borrow::Cow; 10 | use std::cell::{Ref, RefCell}; 11 | 12 | /// Wraps `Html` instances as sinks to drive parsing 13 | #[derive(Debug)] 14 | pub struct HtmlTreeSink(pub RefCell); 15 | 16 | impl HtmlTreeSink { 17 | /// Wrap a `Html`instance as a sink to drive parsing 18 | pub fn new(html: Html) -> Self { 19 | Self(RefCell::new(html)) 20 | } 21 | } 22 | 23 | /// Note: does not support the `