├── .github └── workflows │ └── build.yml ├── .gitignore ├── Cargo.lock ├── Cargo.toml ├── LICENSE.md ├── README.md ├── flake.lock ├── flake.nix ├── src ├── link.rs ├── main.rs └── pretty_print.rs └── tests └── cli.rs /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Build binaries 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | jobs: 8 | release: 9 | permissions: 10 | contents: write 11 | runs-on: ${{ matrix.os }} 12 | strategy: 13 | matrix: 14 | os: [ubuntu-latest, macos-latest, windows-latest] 15 | steps: 16 | - uses: actions/checkout@v2 17 | - uses: actions-rs/toolchain@v1 18 | with: 19 | toolchain: stable 20 | - uses: actions/cache@v2 21 | with: 22 | path: | 23 | ~/.cargo/bin/ 24 | ~/.cargo/registry/index/ 25 | ~/.cargo/registry/cache/ 26 | ~/.cargo/git/db/ 27 | target/ 28 | key: ${{ runner.os }}-cargo-${{ hashFiles('Cargo.lock') }} 29 | - name: Build release 30 | run: cargo build --release 31 | - name: Archive as .tar.gz (Linux) 32 | if: matrix.os == 'ubuntu-latest' 33 | run: tar cfz htmlq-x86_64-linux.tar.gz -C target/release htmlq 34 | - name: Archive as .tar.gz (macOS) 35 | if: matrix.os == 'macos-latest' 36 | run: tar cfz htmlq-x86_64-darwin.tar.gz -C target/release htmlq 37 | - name: Archive as .zip (Windows) 38 | if: matrix.os == 'windows-latest' 39 | shell: bash 40 | run: 7z a -tzip -mm=Deflate htmlq-x86_64-windows.zip ./target/release/htmlq.exe 41 | - name: Publish 42 | uses: softprops/action-gh-release@v1 43 | with: 44 | files: | 45 | htmlq*.tar.gz 46 | htmlq*.zip -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | **/*.rs.bk 3 | -------------------------------------------------------------------------------- /Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically 
@generated by Cargo. 2 | # It is not intended for manual editing. 3 | version = 3 4 | 5 | [[package]] 6 | name = "ansi_term" 7 | version = "0.12.1" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2" 10 | dependencies = [ 11 | "winapi", 12 | ] 13 | 14 | [[package]] 15 | name = "atty" 16 | version = "0.2.14" 17 | source = "registry+https://github.com/rust-lang/crates.io-index" 18 | checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" 19 | dependencies = [ 20 | "hermit-abi", 21 | "libc", 22 | "winapi", 23 | ] 24 | 25 | [[package]] 26 | name = "autocfg" 27 | version = "1.1.0" 28 | source = "registry+https://github.com/rust-lang/crates.io-index" 29 | checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" 30 | 31 | [[package]] 32 | name = "bitflags" 33 | version = "1.3.2" 34 | source = "registry+https://github.com/rust-lang/crates.io-index" 35 | checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" 36 | 37 | [[package]] 38 | name = "byteorder" 39 | version = "1.4.3" 40 | source = "registry+https://github.com/rust-lang/crates.io-index" 41 | checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" 42 | 43 | [[package]] 44 | name = "cfg-if" 45 | version = "1.0.0" 46 | source = "registry+https://github.com/rust-lang/crates.io-index" 47 | checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" 48 | 49 | [[package]] 50 | name = "clap" 51 | version = "2.34.0" 52 | source = "registry+https://github.com/rust-lang/crates.io-index" 53 | checksum = "a0610544180c38b88101fecf2dd634b174a62eef6946f84dfc6a7127512b381c" 54 | dependencies = [ 55 | "ansi_term", 56 | "atty", 57 | "bitflags", 58 | "strsim", 59 | "textwrap", 60 | "unicode-width", 61 | "vec_map", 62 | ] 63 | 64 | [[package]] 65 | name = "convert_case" 66 | version = "0.4.0" 67 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 68 | checksum = "6245d59a3e82a7fc217c5828a6692dbc6dfb63a0c8c90495621f7b9d79704a0e" 69 | 70 | [[package]] 71 | name = "cssparser" 72 | version = "0.27.2" 73 | source = "registry+https://github.com/rust-lang/crates.io-index" 74 | checksum = "754b69d351cdc2d8ee09ae203db831e005560fc6030da058f86ad60c92a9cb0a" 75 | dependencies = [ 76 | "cssparser-macros", 77 | "dtoa-short", 78 | "itoa", 79 | "matches", 80 | "phf", 81 | "proc-macro2", 82 | "quote", 83 | "smallvec", 84 | "syn", 85 | ] 86 | 87 | [[package]] 88 | name = "cssparser-macros" 89 | version = "0.6.0" 90 | source = "registry+https://github.com/rust-lang/crates.io-index" 91 | checksum = "dfae75de57f2b2e85e8768c3ea840fd159c8f33e2b6522c7835b7abac81be16e" 92 | dependencies = [ 93 | "quote", 94 | "syn", 95 | ] 96 | 97 | [[package]] 98 | name = "derive_more" 99 | version = "0.99.17" 100 | source = "registry+https://github.com/rust-lang/crates.io-index" 101 | checksum = "4fb810d30a7c1953f91334de7244731fc3f3c10d7fe163338a35b9f640960321" 102 | dependencies = [ 103 | "convert_case", 104 | "proc-macro2", 105 | "quote", 106 | "rustc_version", 107 | "syn", 108 | ] 109 | 110 | [[package]] 111 | name = "dtoa" 112 | version = "0.4.8" 113 | source = "registry+https://github.com/rust-lang/crates.io-index" 114 | checksum = "56899898ce76aaf4a0f24d914c97ea6ed976d42fec6ad33fcbb0a1103e07b2b0" 115 | 116 | [[package]] 117 | name = "dtoa-short" 118 | version = "0.3.3" 119 | source = "registry+https://github.com/rust-lang/crates.io-index" 120 | checksum = "bde03329ae10e79ede66c9ce4dc930aa8599043b0743008548680f25b91502d6" 121 | dependencies = [ 122 | "dtoa", 123 | ] 124 | 125 | [[package]] 126 | name = "form_urlencoded" 127 | version = "1.1.0" 128 | source = "registry+https://github.com/rust-lang/crates.io-index" 129 | checksum = "a9c384f161156f5260c24a097c56119f9be8c798586aecc13afbcbe7b7e26bf8" 130 | dependencies = [ 131 | "percent-encoding", 132 | ] 133 | 134 | [[package]] 135 | name 
= "futf" 136 | version = "0.1.5" 137 | source = "registry+https://github.com/rust-lang/crates.io-index" 138 | checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843" 139 | dependencies = [ 140 | "mac", 141 | "new_debug_unreachable", 142 | ] 143 | 144 | [[package]] 145 | name = "fxhash" 146 | version = "0.2.1" 147 | source = "registry+https://github.com/rust-lang/crates.io-index" 148 | checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" 149 | dependencies = [ 150 | "byteorder", 151 | ] 152 | 153 | [[package]] 154 | name = "getrandom" 155 | version = "0.1.16" 156 | source = "registry+https://github.com/rust-lang/crates.io-index" 157 | checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce" 158 | dependencies = [ 159 | "cfg-if", 160 | "libc", 161 | "wasi 0.9.0+wasi-snapshot-preview1", 162 | ] 163 | 164 | [[package]] 165 | name = "getrandom" 166 | version = "0.2.8" 167 | source = "registry+https://github.com/rust-lang/crates.io-index" 168 | checksum = "c05aeb6a22b8f62540c194aac980f2115af067bfe15a0734d7277a768d396b31" 169 | dependencies = [ 170 | "cfg-if", 171 | "libc", 172 | "wasi 0.11.0+wasi-snapshot-preview1", 173 | ] 174 | 175 | [[package]] 176 | name = "hermit-abi" 177 | version = "0.1.19" 178 | source = "registry+https://github.com/rust-lang/crates.io-index" 179 | checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" 180 | dependencies = [ 181 | "libc", 182 | ] 183 | 184 | [[package]] 185 | name = "html5ever" 186 | version = "0.25.2" 187 | source = "registry+https://github.com/rust-lang/crates.io-index" 188 | checksum = "e5c13fb08e5d4dfc151ee5e88bae63f7773d61852f3bdc73c9f4b9e1bde03148" 189 | dependencies = [ 190 | "log", 191 | "mac", 192 | "markup5ever", 193 | "proc-macro2", 194 | "quote", 195 | "syn", 196 | ] 197 | 198 | [[package]] 199 | name = "htmlq" 200 | version = "0.4.0" 201 | dependencies = [ 202 | "clap", 203 | "html5ever", 204 | "kuchiki", 205 | "lazy_static", 
206 | "url", 207 | ] 208 | 209 | [[package]] 210 | name = "idna" 211 | version = "0.3.0" 212 | source = "registry+https://github.com/rust-lang/crates.io-index" 213 | checksum = "e14ddfc70884202db2244c223200c204c2bda1bc6e0998d11b5e024d657209e6" 214 | dependencies = [ 215 | "unicode-bidi", 216 | "unicode-normalization", 217 | ] 218 | 219 | [[package]] 220 | name = "itoa" 221 | version = "0.4.8" 222 | source = "registry+https://github.com/rust-lang/crates.io-index" 223 | checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4" 224 | 225 | [[package]] 226 | name = "kuchiki" 227 | version = "0.8.1" 228 | source = "registry+https://github.com/rust-lang/crates.io-index" 229 | checksum = "1ea8e9c6e031377cff82ee3001dc8026cdf431ed4e2e6b51f98ab8c73484a358" 230 | dependencies = [ 231 | "cssparser", 232 | "html5ever", 233 | "matches", 234 | "selectors", 235 | ] 236 | 237 | [[package]] 238 | name = "lazy_static" 239 | version = "1.4.0" 240 | source = "registry+https://github.com/rust-lang/crates.io-index" 241 | checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" 242 | 243 | [[package]] 244 | name = "libc" 245 | version = "0.2.137" 246 | source = "registry+https://github.com/rust-lang/crates.io-index" 247 | checksum = "fc7fcc620a3bff7cdd7a365be3376c97191aeaccc2a603e600951e452615bf89" 248 | 249 | [[package]] 250 | name = "lock_api" 251 | version = "0.4.9" 252 | source = "registry+https://github.com/rust-lang/crates.io-index" 253 | checksum = "435011366fe56583b16cf956f9df0095b405b82d76425bc8981c0e22e60ec4df" 254 | dependencies = [ 255 | "autocfg", 256 | "scopeguard", 257 | ] 258 | 259 | [[package]] 260 | name = "log" 261 | version = "0.4.17" 262 | source = "registry+https://github.com/rust-lang/crates.io-index" 263 | checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e" 264 | dependencies = [ 265 | "cfg-if", 266 | ] 267 | 268 | [[package]] 269 | name = "mac" 270 | version = "0.1.1" 271 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 272 | checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" 273 | 274 | [[package]] 275 | name = "markup5ever" 276 | version = "0.10.1" 277 | source = "registry+https://github.com/rust-lang/crates.io-index" 278 | checksum = "a24f40fb03852d1cdd84330cddcaf98e9ec08a7b7768e952fad3b4cf048ec8fd" 279 | dependencies = [ 280 | "log", 281 | "phf", 282 | "phf_codegen", 283 | "string_cache", 284 | "string_cache_codegen", 285 | "tendril", 286 | ] 287 | 288 | [[package]] 289 | name = "matches" 290 | version = "0.1.9" 291 | source = "registry+https://github.com/rust-lang/crates.io-index" 292 | checksum = "a3e378b66a060d48947b590737b30a1be76706c8dd7b8ba0f2fe3989c68a853f" 293 | 294 | [[package]] 295 | name = "new_debug_unreachable" 296 | version = "1.0.4" 297 | source = "registry+https://github.com/rust-lang/crates.io-index" 298 | checksum = "e4a24736216ec316047a1fc4252e27dabb04218aa4a3f37c6e7ddbf1f9782b54" 299 | 300 | [[package]] 301 | name = "nodrop" 302 | version = "0.1.14" 303 | source = "registry+https://github.com/rust-lang/crates.io-index" 304 | checksum = "72ef4a56884ca558e5ddb05a1d1e7e1bfd9a68d9ed024c21704cc98872dae1bb" 305 | 306 | [[package]] 307 | name = "once_cell" 308 | version = "1.16.0" 309 | source = "registry+https://github.com/rust-lang/crates.io-index" 310 | checksum = "86f0b0d4bf799edbc74508c1e8bf170ff5f41238e5f8225603ca7caaae2b7860" 311 | 312 | [[package]] 313 | name = "parking_lot" 314 | version = "0.12.1" 315 | source = "registry+https://github.com/rust-lang/crates.io-index" 316 | checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" 317 | dependencies = [ 318 | "lock_api", 319 | "parking_lot_core", 320 | ] 321 | 322 | [[package]] 323 | name = "parking_lot_core" 324 | version = "0.9.5" 325 | source = "registry+https://github.com/rust-lang/crates.io-index" 326 | checksum = "7ff9f3fef3968a3ec5945535ed654cb38ff72d7495a25619e2247fb15a2ed9ba" 327 | dependencies 
= [ 328 | "cfg-if", 329 | "libc", 330 | "redox_syscall", 331 | "smallvec", 332 | "windows-sys", 333 | ] 334 | 335 | [[package]] 336 | name = "percent-encoding" 337 | version = "2.2.0" 338 | source = "registry+https://github.com/rust-lang/crates.io-index" 339 | checksum = "478c572c3d73181ff3c2539045f6eb99e5491218eae919370993b890cdbdd98e" 340 | 341 | [[package]] 342 | name = "phf" 343 | version = "0.8.0" 344 | source = "registry+https://github.com/rust-lang/crates.io-index" 345 | checksum = "3dfb61232e34fcb633f43d12c58f83c1df82962dcdfa565a4e866ffc17dafe12" 346 | dependencies = [ 347 | "phf_macros", 348 | "phf_shared 0.8.0", 349 | "proc-macro-hack", 350 | ] 351 | 352 | [[package]] 353 | name = "phf_codegen" 354 | version = "0.8.0" 355 | source = "registry+https://github.com/rust-lang/crates.io-index" 356 | checksum = "cbffee61585b0411840d3ece935cce9cb6321f01c45477d30066498cd5e1a815" 357 | dependencies = [ 358 | "phf_generator 0.8.0", 359 | "phf_shared 0.8.0", 360 | ] 361 | 362 | [[package]] 363 | name = "phf_generator" 364 | version = "0.8.0" 365 | source = "registry+https://github.com/rust-lang/crates.io-index" 366 | checksum = "17367f0cc86f2d25802b2c26ee58a7b23faeccf78a396094c13dced0d0182526" 367 | dependencies = [ 368 | "phf_shared 0.8.0", 369 | "rand 0.7.3", 370 | ] 371 | 372 | [[package]] 373 | name = "phf_generator" 374 | version = "0.10.0" 375 | source = "registry+https://github.com/rust-lang/crates.io-index" 376 | checksum = "5d5285893bb5eb82e6aaf5d59ee909a06a16737a8970984dd7746ba9283498d6" 377 | dependencies = [ 378 | "phf_shared 0.10.0", 379 | "rand 0.8.5", 380 | ] 381 | 382 | [[package]] 383 | name = "phf_macros" 384 | version = "0.8.0" 385 | source = "registry+https://github.com/rust-lang/crates.io-index" 386 | checksum = "7f6fde18ff429ffc8fe78e2bf7f8b7a5a5a6e2a8b58bc5a9ac69198bbda9189c" 387 | dependencies = [ 388 | "phf_generator 0.8.0", 389 | "phf_shared 0.8.0", 390 | "proc-macro-hack", 391 | "proc-macro2", 392 | "quote", 393 | "syn", 394 | ] 395 | 396 | 
[[package]] 397 | name = "phf_shared" 398 | version = "0.8.0" 399 | source = "registry+https://github.com/rust-lang/crates.io-index" 400 | checksum = "c00cf8b9eafe68dde5e9eaa2cef8ee84a9336a47d566ec55ca16589633b65af7" 401 | dependencies = [ 402 | "siphasher", 403 | ] 404 | 405 | [[package]] 406 | name = "phf_shared" 407 | version = "0.10.0" 408 | source = "registry+https://github.com/rust-lang/crates.io-index" 409 | checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096" 410 | dependencies = [ 411 | "siphasher", 412 | ] 413 | 414 | [[package]] 415 | name = "ppv-lite86" 416 | version = "0.2.17" 417 | source = "registry+https://github.com/rust-lang/crates.io-index" 418 | checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" 419 | 420 | [[package]] 421 | name = "precomputed-hash" 422 | version = "0.1.1" 423 | source = "registry+https://github.com/rust-lang/crates.io-index" 424 | checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" 425 | 426 | [[package]] 427 | name = "proc-macro-hack" 428 | version = "0.5.19" 429 | source = "registry+https://github.com/rust-lang/crates.io-index" 430 | checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5" 431 | 432 | [[package]] 433 | name = "proc-macro2" 434 | version = "1.0.47" 435 | source = "registry+https://github.com/rust-lang/crates.io-index" 436 | checksum = "5ea3d908b0e36316caf9e9e2c4625cdde190a7e6f440d794667ed17a1855e725" 437 | dependencies = [ 438 | "unicode-ident", 439 | ] 440 | 441 | [[package]] 442 | name = "quote" 443 | version = "1.0.21" 444 | source = "registry+https://github.com/rust-lang/crates.io-index" 445 | checksum = "bbe448f377a7d6961e30f5955f9b8d106c3f5e449d493ee1b125c1d43c2b5179" 446 | dependencies = [ 447 | "proc-macro2", 448 | ] 449 | 450 | [[package]] 451 | name = "rand" 452 | version = "0.7.3" 453 | source = "registry+https://github.com/rust-lang/crates.io-index" 454 | checksum = 
"6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" 455 | dependencies = [ 456 | "getrandom 0.1.16", 457 | "libc", 458 | "rand_chacha 0.2.2", 459 | "rand_core 0.5.1", 460 | "rand_hc", 461 | "rand_pcg", 462 | ] 463 | 464 | [[package]] 465 | name = "rand" 466 | version = "0.8.5" 467 | source = "registry+https://github.com/rust-lang/crates.io-index" 468 | checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" 469 | dependencies = [ 470 | "libc", 471 | "rand_chacha 0.3.1", 472 | "rand_core 0.6.4", 473 | ] 474 | 475 | [[package]] 476 | name = "rand_chacha" 477 | version = "0.2.2" 478 | source = "registry+https://github.com/rust-lang/crates.io-index" 479 | checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402" 480 | dependencies = [ 481 | "ppv-lite86", 482 | "rand_core 0.5.1", 483 | ] 484 | 485 | [[package]] 486 | name = "rand_chacha" 487 | version = "0.3.1" 488 | source = "registry+https://github.com/rust-lang/crates.io-index" 489 | checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" 490 | dependencies = [ 491 | "ppv-lite86", 492 | "rand_core 0.6.4", 493 | ] 494 | 495 | [[package]] 496 | name = "rand_core" 497 | version = "0.5.1" 498 | source = "registry+https://github.com/rust-lang/crates.io-index" 499 | checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" 500 | dependencies = [ 501 | "getrandom 0.1.16", 502 | ] 503 | 504 | [[package]] 505 | name = "rand_core" 506 | version = "0.6.4" 507 | source = "registry+https://github.com/rust-lang/crates.io-index" 508 | checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" 509 | dependencies = [ 510 | "getrandom 0.2.8", 511 | ] 512 | 513 | [[package]] 514 | name = "rand_hc" 515 | version = "0.2.0" 516 | source = "registry+https://github.com/rust-lang/crates.io-index" 517 | checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c" 518 | dependencies = [ 519 | "rand_core 0.5.1", 
520 | ] 521 | 522 | [[package]] 523 | name = "rand_pcg" 524 | version = "0.2.1" 525 | source = "registry+https://github.com/rust-lang/crates.io-index" 526 | checksum = "16abd0c1b639e9eb4d7c50c0b8100b0d0f849be2349829c740fe8e6eb4816429" 527 | dependencies = [ 528 | "rand_core 0.5.1", 529 | ] 530 | 531 | [[package]] 532 | name = "redox_syscall" 533 | version = "0.2.16" 534 | source = "registry+https://github.com/rust-lang/crates.io-index" 535 | checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a" 536 | dependencies = [ 537 | "bitflags", 538 | ] 539 | 540 | [[package]] 541 | name = "rustc_version" 542 | version = "0.4.0" 543 | source = "registry+https://github.com/rust-lang/crates.io-index" 544 | checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" 545 | dependencies = [ 546 | "semver", 547 | ] 548 | 549 | [[package]] 550 | name = "scopeguard" 551 | version = "1.1.0" 552 | source = "registry+https://github.com/rust-lang/crates.io-index" 553 | checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" 554 | 555 | [[package]] 556 | name = "selectors" 557 | version = "0.22.0" 558 | source = "registry+https://github.com/rust-lang/crates.io-index" 559 | checksum = "df320f1889ac4ba6bc0cdc9c9af7af4bd64bb927bccdf32d81140dc1f9be12fe" 560 | dependencies = [ 561 | "bitflags", 562 | "cssparser", 563 | "derive_more", 564 | "fxhash", 565 | "log", 566 | "matches", 567 | "phf", 568 | "phf_codegen", 569 | "precomputed-hash", 570 | "servo_arc", 571 | "smallvec", 572 | "thin-slice", 573 | ] 574 | 575 | [[package]] 576 | name = "semver" 577 | version = "1.0.14" 578 | source = "registry+https://github.com/rust-lang/crates.io-index" 579 | checksum = "e25dfac463d778e353db5be2449d1cce89bd6fd23c9f1ea21310ce6e5a1b29c4" 580 | 581 | [[package]] 582 | name = "serde" 583 | version = "1.0.148" 584 | source = "registry+https://github.com/rust-lang/crates.io-index" 585 | checksum = 
"e53f64bb4ba0191d6d0676e1b141ca55047d83b74f5607e6d8eb88126c52c2dc" 586 | 587 | [[package]] 588 | name = "servo_arc" 589 | version = "0.1.1" 590 | source = "registry+https://github.com/rust-lang/crates.io-index" 591 | checksum = "d98238b800e0d1576d8b6e3de32827c2d74bee68bb97748dcf5071fb53965432" 592 | dependencies = [ 593 | "nodrop", 594 | "stable_deref_trait", 595 | ] 596 | 597 | [[package]] 598 | name = "siphasher" 599 | version = "0.3.10" 600 | source = "registry+https://github.com/rust-lang/crates.io-index" 601 | checksum = "7bd3e3206899af3f8b12af284fafc038cc1dc2b41d1b89dd17297221c5d225de" 602 | 603 | [[package]] 604 | name = "smallvec" 605 | version = "1.10.0" 606 | source = "registry+https://github.com/rust-lang/crates.io-index" 607 | checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0" 608 | 609 | [[package]] 610 | name = "stable_deref_trait" 611 | version = "1.2.0" 612 | source = "registry+https://github.com/rust-lang/crates.io-index" 613 | checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" 614 | 615 | [[package]] 616 | name = "string_cache" 617 | version = "0.8.4" 618 | source = "registry+https://github.com/rust-lang/crates.io-index" 619 | checksum = "213494b7a2b503146286049378ce02b482200519accc31872ee8be91fa820a08" 620 | dependencies = [ 621 | "new_debug_unreachable", 622 | "once_cell", 623 | "parking_lot", 624 | "phf_shared 0.10.0", 625 | "precomputed-hash", 626 | "serde", 627 | ] 628 | 629 | [[package]] 630 | name = "string_cache_codegen" 631 | version = "0.5.2" 632 | source = "registry+https://github.com/rust-lang/crates.io-index" 633 | checksum = "6bb30289b722be4ff74a408c3cc27edeaad656e06cb1fe8fa9231fa59c728988" 634 | dependencies = [ 635 | "phf_generator 0.10.0", 636 | "phf_shared 0.10.0", 637 | "proc-macro2", 638 | "quote", 639 | ] 640 | 641 | [[package]] 642 | name = "strsim" 643 | version = "0.8.0" 644 | source = "registry+https://github.com/rust-lang/crates.io-index" 645 | checksum = 
"8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" 646 | 647 | [[package]] 648 | name = "syn" 649 | version = "1.0.105" 650 | source = "registry+https://github.com/rust-lang/crates.io-index" 651 | checksum = "60b9b43d45702de4c839cb9b51d9f529c5dd26a4aff255b42b1ebc03e88ee908" 652 | dependencies = [ 653 | "proc-macro2", 654 | "quote", 655 | "unicode-ident", 656 | ] 657 | 658 | [[package]] 659 | name = "tendril" 660 | version = "0.4.3" 661 | source = "registry+https://github.com/rust-lang/crates.io-index" 662 | checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0" 663 | dependencies = [ 664 | "futf", 665 | "mac", 666 | "utf-8", 667 | ] 668 | 669 | [[package]] 670 | name = "textwrap" 671 | version = "0.11.0" 672 | source = "registry+https://github.com/rust-lang/crates.io-index" 673 | checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" 674 | dependencies = [ 675 | "unicode-width", 676 | ] 677 | 678 | [[package]] 679 | name = "thin-slice" 680 | version = "0.1.1" 681 | source = "registry+https://github.com/rust-lang/crates.io-index" 682 | checksum = "8eaa81235c7058867fa8c0e7314f33dcce9c215f535d1913822a2b3f5e289f3c" 683 | 684 | [[package]] 685 | name = "tinyvec" 686 | version = "1.6.0" 687 | source = "registry+https://github.com/rust-lang/crates.io-index" 688 | checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" 689 | dependencies = [ 690 | "tinyvec_macros", 691 | ] 692 | 693 | [[package]] 694 | name = "tinyvec_macros" 695 | version = "0.1.0" 696 | source = "registry+https://github.com/rust-lang/crates.io-index" 697 | checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" 698 | 699 | [[package]] 700 | name = "unicode-bidi" 701 | version = "0.3.8" 702 | source = "registry+https://github.com/rust-lang/crates.io-index" 703 | checksum = "099b7128301d285f79ddd55b9a83d5e6b9e97c92e0ea0daebee7263e932de992" 704 | 705 | [[package]] 706 | name = "unicode-ident" 707 | 
version = "1.0.5" 708 | source = "registry+https://github.com/rust-lang/crates.io-index" 709 | checksum = "6ceab39d59e4c9499d4e5a8ee0e2735b891bb7308ac83dfb4e80cad195c9f6f3" 710 | 711 | [[package]] 712 | name = "unicode-normalization" 713 | version = "0.1.22" 714 | source = "registry+https://github.com/rust-lang/crates.io-index" 715 | checksum = "5c5713f0fc4b5db668a2ac63cdb7bb4469d8c9fed047b1d0292cc7b0ce2ba921" 716 | dependencies = [ 717 | "tinyvec", 718 | ] 719 | 720 | [[package]] 721 | name = "unicode-width" 722 | version = "0.1.10" 723 | source = "registry+https://github.com/rust-lang/crates.io-index" 724 | checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b" 725 | 726 | [[package]] 727 | name = "url" 728 | version = "2.3.1" 729 | source = "registry+https://github.com/rust-lang/crates.io-index" 730 | checksum = "0d68c799ae75762b8c3fe375feb6600ef5602c883c5d21eb51c09f22b83c4643" 731 | dependencies = [ 732 | "form_urlencoded", 733 | "idna", 734 | "percent-encoding", 735 | ] 736 | 737 | [[package]] 738 | name = "utf-8" 739 | version = "0.7.6" 740 | source = "registry+https://github.com/rust-lang/crates.io-index" 741 | checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" 742 | 743 | [[package]] 744 | name = "vec_map" 745 | version = "0.8.2" 746 | source = "registry+https://github.com/rust-lang/crates.io-index" 747 | checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" 748 | 749 | [[package]] 750 | name = "wasi" 751 | version = "0.9.0+wasi-snapshot-preview1" 752 | source = "registry+https://github.com/rust-lang/crates.io-index" 753 | checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" 754 | 755 | [[package]] 756 | name = "wasi" 757 | version = "0.11.0+wasi-snapshot-preview1" 758 | source = "registry+https://github.com/rust-lang/crates.io-index" 759 | checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" 760 | 761 | [[package]] 762 | name = 
"winapi" 763 | version = "0.3.9" 764 | source = "registry+https://github.com/rust-lang/crates.io-index" 765 | checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" 766 | dependencies = [ 767 | "winapi-i686-pc-windows-gnu", 768 | "winapi-x86_64-pc-windows-gnu", 769 | ] 770 | 771 | [[package]] 772 | name = "winapi-i686-pc-windows-gnu" 773 | version = "0.4.0" 774 | source = "registry+https://github.com/rust-lang/crates.io-index" 775 | checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" 776 | 777 | [[package]] 778 | name = "winapi-x86_64-pc-windows-gnu" 779 | version = "0.4.0" 780 | source = "registry+https://github.com/rust-lang/crates.io-index" 781 | checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" 782 | 783 | [[package]] 784 | name = "windows-sys" 785 | version = "0.42.0" 786 | source = "registry+https://github.com/rust-lang/crates.io-index" 787 | checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7" 788 | dependencies = [ 789 | "windows_aarch64_gnullvm", 790 | "windows_aarch64_msvc", 791 | "windows_i686_gnu", 792 | "windows_i686_msvc", 793 | "windows_x86_64_gnu", 794 | "windows_x86_64_gnullvm", 795 | "windows_x86_64_msvc", 796 | ] 797 | 798 | [[package]] 799 | name = "windows_aarch64_gnullvm" 800 | version = "0.42.0" 801 | source = "registry+https://github.com/rust-lang/crates.io-index" 802 | checksum = "41d2aa71f6f0cbe00ae5167d90ef3cfe66527d6f613ca78ac8024c3ccab9a19e" 803 | 804 | [[package]] 805 | name = "windows_aarch64_msvc" 806 | version = "0.42.0" 807 | source = "registry+https://github.com/rust-lang/crates.io-index" 808 | checksum = "dd0f252f5a35cac83d6311b2e795981f5ee6e67eb1f9a7f64eb4500fbc4dcdb4" 809 | 810 | [[package]] 811 | name = "windows_i686_gnu" 812 | version = "0.42.0" 813 | source = "registry+https://github.com/rust-lang/crates.io-index" 814 | checksum = "fbeae19f6716841636c28d695375df17562ca208b2b7d0dc47635a50ae6c5de7" 815 | 816 | [[package]] 817 
| name = "windows_i686_msvc" 818 | version = "0.42.0" 819 | source = "registry+https://github.com/rust-lang/crates.io-index" 820 | checksum = "84c12f65daa39dd2babe6e442988fc329d6243fdce47d7d2d155b8d874862246" 821 | 822 | [[package]] 823 | name = "windows_x86_64_gnu" 824 | version = "0.42.0" 825 | source = "registry+https://github.com/rust-lang/crates.io-index" 826 | checksum = "bf7b1b21b5362cbc318f686150e5bcea75ecedc74dd157d874d754a2ca44b0ed" 827 | 828 | [[package]] 829 | name = "windows_x86_64_gnullvm" 830 | version = "0.42.0" 831 | source = "registry+https://github.com/rust-lang/crates.io-index" 832 | checksum = "09d525d2ba30eeb3297665bd434a54297e4170c7f1a44cad4ef58095b4cd2028" 833 | 834 | [[package]] 835 | name = "windows_x86_64_msvc" 836 | version = "0.42.0" 837 | source = "registry+https://github.com/rust-lang/crates.io-index" 838 | checksum = "f40009d85759725a34da6d89a94e63d7bdc50a862acf0dbc7c8e488f1edcb6f5" 839 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "htmlq" 3 | description = "Like jq, but for HTML." 
4 | categories = ["command-line-utilities"] 5 | keywords = ["CSS", "HTML", "query"] 6 | repository = "https://github.com/mgdm/htmlq" 7 | documentation = "https://github.com/mgdm/htmlq/blob/master/README.md" 8 | readme = "README.md" 9 | license = "MIT" 10 | license-file = "LICENSE.md" 11 | version = "0.4.0" 12 | authors = ["Michael Maclean "] 13 | edition = "2021" 14 | exclude = ["/.github"] 15 | 16 | [dependencies] 17 | kuchiki = "0.8.1" 18 | html5ever = "0.25.1" 19 | clap = "2.33.3" 20 | lazy_static = "1.4.0" 21 | url = "2.2.2" 22 | 23 | [dev-dependencies] 24 | assert_cmd = "2.0" 25 | predicates = "2.1" 26 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Michael Maclean 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # htmlq 2 | Like [`jq`](https://stedolan.github.io/jq/), but for HTML. Uses [CSS selectors](https://developer.mozilla.org/en-US/docs/Learn/CSS/Introduction_to_CSS/Selectors) to extract bits of content from HTML files. 3 | 4 | ## Installation 5 | 6 | ### [Cargo](https://crates.io/crates/htmlq) 7 | 8 | ```sh 9 | cargo install htmlq 10 | ``` 11 | 12 | ### [FreeBSD pkg](https://www.freshports.org/textproc/htmlq) 13 | 14 | ```sh 15 | pkg install htmlq 16 | ``` 17 | 18 | ### [Homebrew](https://formulae.brew.sh/formula/htmlq) 19 | 20 | ```sh 21 | brew install htmlq 22 | ``` 23 | 24 | ### [Scoop](https://scoop.sh/) 25 | 26 | ```sh 27 | scoop install htmlq 28 | ``` 29 | 30 | ## Usage 31 | 32 | ```console 33 | $ htmlq -h 34 | htmlq 0.4.0 35 | Michael Maclean 36 | Runs CSS selectors on HTML 37 | 38 | USAGE: 39 | htmlq [FLAGS] [OPTIONS] [--] [selector]... 40 | 41 | FLAGS: 42 | -B, --detect-base Try to detect the base URL from the <base> tag in the document. If not found, default to 43 | the value of --base, if supplied 44 | -h, --help Prints help information 45 | -w, --ignore-whitespace When printing text nodes, ignore those that consist entirely of whitespace 46 | -p, --pretty Pretty-print the serialised output 47 | -t, --text Output only the contents of text nodes inside selected elements 48 | -V, --version Prints version information 49 | 50 | OPTIONS: 51 | -a, --attribute Only return this attribute (if present) from selected elements 52 | -b, --base Use this URL as the base for links 53 | -f, --filename The input file. Defaults to stdin 54 | -o, --output The output file. Defaults to stdout 55 | -r, --remove-nodes ... Remove nodes matching this expression before output. May be specified multiple 56 | times 57 | 58 | ARGS: 59 | <selector>...
The CSS expression to select [default: html] 60 | $ 61 | ``` 62 | 63 | ## Examples 64 | 65 | ### Using with cURL to find part of a page by ID 66 | 67 | ```console 68 | $ curl --silent https://www.rust-lang.org/ | htmlq '#get-help' 69 |
70 |

Get help!

71 | 76 |
77 | 78 | 84 |
85 |
86 | ``` 87 | 88 | ### Find all the links in a page 89 | 90 | ```console 91 | $ curl --silent https://www.rust-lang.org/ | htmlq --attribute href a 92 | / 93 | /tools/install 94 | /learn 95 | /tools 96 | /governance 97 | /community 98 | https://blog.rust-lang.org/ 99 | /learn/get-started 100 | https://blog.rust-lang.org/2019/04/25/Rust-1.34.1.html 101 | https://blog.rust-lang.org/2018/12/06/Rust-1.31-and-rust-2018.html 102 | [...] 103 | ``` 104 | 105 | ### Get the text content of a post 106 | 107 | ```console 108 | $ curl --silent https://nixos.org/nixos/about.html | htmlq --text .main 109 | 110 | About NixOS 111 | 112 | NixOS is a GNU/Linux distribution that aims to 113 | improve the state of the art in system configuration management. In 114 | existing distributions, actions such as upgrades are dangerous: 115 | upgrading a package can cause other packages to break, upgrading an 116 | entire system is much less reliable than reinstalling from scratch, 117 | you can’t safely test what the results of a configuration change will 118 | be, you cannot easily undo changes to the system, and so on. We want 119 | to change that. NixOS has many innovative features: 120 | 121 | [...] 122 | ``` 123 | 124 | ### Remove a node before output 125 | 126 | There's a big SVG image in this page that I don't need, so here's how to remove it. 127 | 128 | ```console 129 | $ curl --silent https://nixos.org/ | ./target/debug/htmlq '.whynix' --remove-nodes svg 130 |
    131 |
  • 132 | 133 |

    Reproducible

    134 |

    135 | Nix builds packages in isolation from each other. This ensures that they 136 | are reproducible and don't have undeclared dependencies, so if a 137 | package works on one machine, it will also work on another. 138 |

    139 |
  • 140 |
  • 141 | 142 |

    Declarative

    143 |

    144 | Nix makes it trivial to share development and build 145 | environments for your projects, regardless of what programming 146 | languages and tools you’re using. 147 |

    148 |
  • 149 |
  • 150 | 151 |

    Reliable

    152 |

    153 | Nix ensures that installing or upgrading one package cannot 154 | break other packages. It allows you to roll back to 155 | previous versions, and ensures that no package is in an 156 | inconsistent state during an upgrade. 157 |

    158 |
  • 159 |
160 | ``` 161 | 162 | ### Pretty print HTML 163 | 164 | (This is a bit of a work in progress) 165 | 166 | ```console 167 | $ curl --silent https://mgdm.net | htmlq --pretty '#posts' 168 |
169 |

I write about... 170 |

171 |
    172 |
  • 173 | 175 |

    Debugging network connections on macOS with nettop 176 |

    177 |

    Using nettop to find out what network connections a program is trying to make. 178 |

    179 |
  • 180 | [...] 181 | ``` 182 | 183 | ### Syntax highlighting with [`bat`](https://github.com/sharkdp/bat) 184 | 185 | ```console 186 | $ curl --silent example.com | htmlq 'body' | bat --language html 187 | ``` 188 | 189 | > Syntax highlighted output 190 | -------------------------------------------------------------------------------- /flake.lock: -------------------------------------------------------------------------------- 1 | { 2 | "nodes": { 3 | "crate2nix": { 4 | "flake": false, 5 | "locked": { 6 | "lastModified": 1665362239, 7 | "narHash": "sha256-vNMu88WotPaOfuzubfOsLcaHB9WwDfV5/drEzY8tpFo=", 8 | "owner": "kolloch", 9 | "repo": "crate2nix", 10 | "rev": "cdcdd4950cc6ef1133b5f866a7c20dc06c130a84", 11 | "type": "github" 12 | }, 13 | "original": { 14 | "owner": "kolloch", 15 | "repo": "crate2nix", 16 | "type": "github" 17 | } 18 | }, 19 | "flake-utils": { 20 | "locked": { 21 | "lastModified": 1659877975, 22 | "narHash": "sha256-zllb8aq3YO3h8B/U0/J1WBgAL8EX5yWf5pMj3G0NAmc=", 23 | "owner": "numtide", 24 | "repo": "flake-utils", 25 | "rev": "c0e246b9b83f637f4681389ecabcb2681b4f3af0", 26 | "type": "github" 27 | }, 28 | "original": { 29 | "owner": "numtide", 30 | "repo": "flake-utils", 31 | "type": "github" 32 | } 33 | }, 34 | "nixpkgs": { 35 | "locked": { 36 | "lastModified": 1665634984, 37 | "narHash": "sha256-zwXeMc96BD9iFxSB/SLr3dI8iYpqM+seX9qy6bGV+cw=", 38 | "owner": "NixOS", 39 | "repo": "nixpkgs", 40 | "rev": "cfea568da97a2668ef3cb3fc42eaacfb0e706807", 41 | "type": "github" 42 | }, 43 | "original": { 44 | "id": "nixpkgs", 45 | "type": "indirect" 46 | } 47 | }, 48 | "root": { 49 | "inputs": { 50 | "crate2nix": "crate2nix", 51 | "flake-utils": "flake-utils", 52 | "nixpkgs": "nixpkgs" 53 | } 54 | } 55 | }, 56 | "root": "root", 57 | "version": 7 58 | } 59 | -------------------------------------------------------------------------------- /flake.nix: -------------------------------------------------------------------------------- 1 | { 2 | description = "like jq, 
but for HTML."; 3 | 4 | inputs = { 5 | nixpkgs.url = "nixpkgs"; # Resolves to github:NixOS/nixpkgs 6 | # Helpers for system-specific outputs 7 | flake-utils.url = "github:numtide/flake-utils"; 8 | crate2nix = { 9 | url = "github:kolloch/crate2nix"; 10 | flake = false; 11 | }; 12 | }; 13 | 14 | outputs = { self, nixpkgs, crate2nix, flake-utils }: 15 | # Create system-specific outputs for the standard Nix systems 16 | # https://github.com/numtide/flake-utils/blob/master/default.nix#L3-L9 17 | flake-utils.lib.eachDefaultSystem (system: 18 | let 19 | pkgs = nixpkgs.legacyPackages.${system}; 20 | crateName = "htmlq"; 21 | 22 | inherit (import "${crate2nix}/tools.nix" { inherit pkgs; }) 23 | generatedCargoNix; 24 | 25 | project = pkgs.callPackage (generatedCargoNix { 26 | name = crateName; 27 | src = ./.; 28 | }) { 29 | defaultCrateOverrides = pkgs.defaultCrateOverrides // { 30 | # Crate dependency overrides go here 31 | }; 32 | }; 33 | in { 34 | packages.${crateName} = project.rootCrate.build; 35 | 36 | defaultPackage = self.packages.${system}.${crateName}; 37 | 38 | devShell = pkgs.mkShell { 39 | inputsFrom = builtins.attrValues self.packages.${system}; 40 | buildInputs = [ pkgs.cargo pkgs.rust-analyzer pkgs.clippy ]; 41 | }; 42 | }); 43 | } 44 | -------------------------------------------------------------------------------- /src/link.rs: -------------------------------------------------------------------------------- 1 | use html5ever::local_name; 2 | use kuchiki::NodeRef; 3 | use url::Url; 4 | 5 | pub fn rewrite_relative_url(node: &NodeRef, base: &Url) { 6 | let Some(elem) = node.as_element() else { 7 | return 8 | }; 9 | if !(local_name!("a") == elem.name.local 10 | || local_name!("link") == elem.name.local 11 | || local_name!("area") == elem.name.local) 12 | { 13 | return; 14 | }; 15 | let mut attrs = elem.attributes.borrow_mut(); 16 | 17 | if attrs.contains("href") { 18 | let Some(url) = attrs.get_mut("href") else { 19 | return 20 | }; 21 | if 
url.starts_with("////") { 22 | *url = url.trim_start_matches('/').to_string(); 23 | return; 24 | } 25 | let new_url = base.join(url).ok().unwrap_or_else(|| base.to_owned()); 26 | attrs.insert("href", new_url.to_string()); 27 | } 28 | } 29 | 30 | pub fn detect_base(document: &NodeRef) -> Option { 31 | let Ok(node) = document.select_first("base") else { 32 | return None 33 | }; 34 | 35 | let attrs = node.attributes.borrow(); 36 | 37 | if attrs.contains("href") { 38 | let href = attrs 39 | .get("href") 40 | .expect("should have retrieved href from node attributes"); 41 | return match Url::parse(href) { 42 | Ok(url) => Some(url), 43 | _ => None, 44 | }; 45 | } 46 | 47 | None 48 | } 49 | 50 | #[cfg(test)] 51 | mod tests { 52 | use html5ever::tendril::TendrilSink; 53 | 54 | use super::*; 55 | 56 | macro_rules! rewrite_tests { 57 | ($($name:ident: $value:expr,)*) => { 58 | $( 59 | #[test] 60 | fn $name() { 61 | let (mut input, expected) = $value; 62 | let base = Url::parse("https://mgdm.net").unwrap(); 63 | let doc = make_doc(&mut input); 64 | for css_match in doc 65 | .select("a, area, link") 66 | .expect("Failed to parse CSS selector while doing link rewriting") 67 | { 68 | let node = css_match.as_node(); 69 | rewrite_relative_url(&node, &base); 70 | } 71 | 72 | let result = serialize_doc(&doc); 73 | assert_eq!(expected, result); 74 | } 75 | )* 76 | } 77 | } 78 | 79 | macro_rules! 
detect_base_tests { 80 | ($($name:ident: $value:expr,)*) => { 81 | $( 82 | #[test] 83 | fn $name() { 84 | let (mut input, expected) = $value; 85 | let doc = make_doc(&mut input); 86 | let result = detect_base(&doc); 87 | assert_eq!(expected, result); 88 | } 89 | )* 90 | } 91 | } 92 | 93 | fn make_doc(html: &mut String) -> NodeRef { 94 | kuchiki::parse_html() 95 | .from_utf8() 96 | .read_from(&mut html.as_bytes()) 97 | .unwrap() 98 | } 99 | 100 | fn serialize_doc(doc: &NodeRef) -> String { 101 | let mut content: Vec = Vec::new(); 102 | doc.serialize(&mut content).unwrap(); 103 | std::str::from_utf8(&content).unwrap().to_string() 104 | } 105 | 106 | rewrite_tests! { 107 | rewrite_a_href: ( 108 | "Hello".to_string(), 109 | "Hello".to_string(), 110 | ), 111 | rewrite_link_href: ( 112 | "Hello".to_string(), 113 | "Hello".to_string(), 114 | ), 115 | rewrite_map_area_href: ( 116 | "".to_string(), 117 | "".to_string() 118 | ), 119 | do_not_rewrite_absolute_url: ( 120 | "Hello".to_string(), 121 | "Hello".to_string(), 122 | ), 123 | } 124 | 125 | detect_base_tests! 
{ 126 | base_ok: ( 127 | "Hello".to_string(), 128 | Some(Url::parse("https://example.org").unwrap()) 129 | ), 130 | base_not_found: ( 131 | "Hello".to_string(), 132 | None 133 | ), 134 | } 135 | } 136 | -------------------------------------------------------------------------------- /src/main.rs: -------------------------------------------------------------------------------- 1 | extern crate html5ever; 2 | extern crate kuchiki; 3 | 4 | #[macro_use] 5 | extern crate lazy_static; 6 | 7 | mod link; 8 | mod pretty_print; 9 | 10 | use clap::{App, Arg, ArgMatches}; 11 | use kuchiki::traits::*; 12 | use kuchiki::NodeRef; 13 | use std::borrow::BorrowMut; 14 | use std::error::Error; 15 | use std::fs::File; 16 | use std::io; 17 | use std::str; 18 | use url::Url; 19 | 20 | #[derive(Debug, Clone)] 21 | struct Config { 22 | input_path: String, 23 | output_path: String, 24 | selector: String, 25 | base: Option, 26 | detect_base: bool, 27 | text_only: bool, 28 | ignore_whitespace: bool, 29 | pretty_print: bool, 30 | remove_nodes: Option>, 31 | attributes: Option>, 32 | } 33 | 34 | impl Config { 35 | fn from_args(matches: ArgMatches) -> Option { 36 | let attributes = matches 37 | .values_of("attribute") 38 | .map(|values| values.map(String::from).collect()); 39 | 40 | let remove_nodes = matches 41 | .values_of("remove_nodes") 42 | .map(|values| values.map(String::from).collect()); 43 | 44 | let selector: String = match matches.values_of("selector") { 45 | Some(values) => values.collect::>().join(" "), 46 | None => String::from("html"), 47 | }; 48 | 49 | let base = matches.value_of("base").map(|b| b.to_owned()); 50 | 51 | Some(Config { 52 | input_path: String::from(matches.value_of("filename").unwrap_or("-")), 53 | output_path: String::from(matches.value_of("output").unwrap_or("-")), 54 | base, 55 | detect_base: matches.is_present("detect_base"), 56 | text_only: matches.is_present("text_only"), 57 | ignore_whitespace: matches.is_present("ignore_whitespace"), 58 | pretty_print: 
matches.is_present("pretty_print"), 59 | remove_nodes, 60 | attributes, 61 | selector, 62 | }) 63 | } 64 | } 65 | 66 | impl Default for Config { 67 | fn default() -> Self { 68 | Self { 69 | input_path: "-".to_string(), 70 | output_path: "-".to_string(), 71 | selector: "html".to_string(), 72 | base: None, 73 | detect_base: false, 74 | ignore_whitespace: true, 75 | pretty_print: true, 76 | text_only: false, 77 | remove_nodes: None, 78 | attributes: Some(vec![]), 79 | } 80 | } 81 | } 82 | 83 | fn select_attributes(node: &NodeRef, attributes: &[String], output: &mut dyn io::Write) { 84 | if let Some(as_element) = node.as_element() { 85 | for attr in attributes { 86 | if let Ok(elem_atts) = as_element.attributes.try_borrow() { 87 | if let Some(val) = elem_atts.get(attr.as_str()) { 88 | writeln!(output, "{}", val).ok(); 89 | } 90 | } 91 | } 92 | } 93 | } 94 | 95 | fn serialize_text(node: &NodeRef, ignore_whitespace: bool) -> String { 96 | let mut result = String::new(); 97 | for text_node in node.inclusive_descendants().text_nodes() { 98 | if ignore_whitespace && text_node.borrow().trim().is_empty() { 99 | continue; 100 | } 101 | 102 | result.push_str(&text_node.borrow()); 103 | 104 | if ignore_whitespace { 105 | result.push('\n'); 106 | } 107 | } 108 | 109 | result 110 | } 111 | 112 | fn get_config<'a, 'b>() -> App<'a, 'b> { 113 | App::new("htmlq") 114 | .version("0.4.0") 115 | .author("Michael Maclean ") 116 | .about("Runs CSS selectors on HTML") 117 | .arg( 118 | Arg::with_name("filename") 119 | .short("f") 120 | .long("filename") 121 | .value_name("FILE") 122 | .help("The input file. Defaults to stdin") 123 | .takes_value(true), 124 | ) 125 | .arg( 126 | Arg::with_name("output") 127 | .short("o") 128 | .long("output") 129 | .value_name("FILE") 130 | .help("The output file. 
Defaults to stdout") 131 | .takes_value(true), 132 | ) 133 | .arg( 134 | Arg::with_name("pretty_print") 135 | .short("p") 136 | .long("pretty") 137 | .help("Pretty-print the serialised output"), 138 | ) 139 | .arg( 140 | Arg::with_name("text_only") 141 | .short("t") 142 | .long("text") 143 | .help("Output only the contents of text nodes inside selected elements"), 144 | ) 145 | .arg( 146 | Arg::with_name("ignore_whitespace") 147 | .short("w") 148 | .long("ignore-whitespace") 149 | .help("When printing text nodes, ignore those that consist entirely of whitespace"), 150 | ) 151 | .arg( 152 | Arg::with_name("attribute") 153 | .short("a") 154 | .long("attribute") 155 | .takes_value(true) 156 | .help("Only return this attribute (if present) from selected elements"), 157 | ) 158 | .arg( 159 | Arg::with_name("base") 160 | .short("b") 161 | .long("base") 162 | .takes_value(true) 163 | .help("Use this URL as the base for links"), 164 | ) 165 | .arg( 166 | Arg::with_name("detect_base") 167 | .short("B") 168 | .long("detect-base") 169 | .help("Try to detect the base URL from the tag in the document. If not found, default to the value of --base, if supplied"), 170 | ) 171 | .arg( 172 | Arg::with_name("remove_nodes") 173 | .long("remove-nodes") 174 | .short("r") 175 | .multiple(true) 176 | .number_of_values(1) 177 | .takes_value(true) 178 | .value_name("SELECTOR") 179 | .help("Remove nodes matching this expression before output. 
May be specified multiple times") 180 | ) 181 | .arg( 182 | Arg::with_name("selector") 183 | .default_value("html") 184 | .multiple(true) 185 | .help("The CSS expression to select"), 186 | ) 187 | } 188 | 189 | fn main() -> Result<(), Box> { 190 | let config = get_config(); 191 | let matches = config.get_matches(); 192 | let config = Config::from_args(matches).unwrap_or_default(); 193 | 194 | let mut input: Box = match config.input_path.as_ref() { 195 | "-" => Box::new(std::io::stdin()), 196 | f => Box::new(File::open(f).expect("should have opened input file")), 197 | }; 198 | 199 | let stdout = std::io::stdout(); 200 | let mut output: Box = match config.output_path.as_ref() { 201 | "-" => Box::new(stdout.lock()), 202 | f => Box::new(File::create(f).expect("should have created output file")), 203 | }; 204 | 205 | let document = kuchiki::parse_html().from_utf8().read_from(&mut input)?; 206 | 207 | let base: Option = match (&config.base, &config.detect_base) { 208 | (Some(base), true) => link::detect_base(&document).or(Url::parse(&base).ok()), 209 | (Some(base), false) => Url::parse(&base).ok(), 210 | (None, true) => link::detect_base(&document), 211 | _ => None, 212 | }; 213 | 214 | let remove_node_selector = match config.remove_nodes { 215 | Some(ref remove_node_selectors) => remove_node_selectors.join(","), 216 | None => Default::default(), 217 | }; 218 | 219 | document 220 | .select(&config.selector) 221 | .expect("Failed to parse CSS selector") 222 | .filter(|noderef| { 223 | if let Ok(mut node) = noderef.as_node().select_first(&remove_node_selector) { 224 | node.borrow_mut().as_node().detach(); 225 | false 226 | } else { 227 | true 228 | } 229 | }) 230 | .map(|node| { 231 | if let Some(base) = &base { 232 | link::rewrite_relative_url(node.as_node(), &base) 233 | } 234 | node 235 | }) 236 | .for_each(|matched_noderef| { 237 | let node = matched_noderef.as_node(); 238 | 239 | if let Some(attributes) = &config.attributes { 240 | select_attributes(node, attributes, 
&mut output); 241 | return; 242 | } 243 | 244 | if config.text_only { 245 | // let content = serialize_text(node, config.ignore_whitespace); 246 | // output.write_all(format!("{}\n", content).as_ref()).ok(); 247 | writeln!(output, "{}", serialize_text(node, config.ignore_whitespace)).ok(); 248 | return; 249 | } 250 | 251 | if config.pretty_print { 252 | // let content = pretty_print::pretty_print(node); 253 | // output.write_all(content.as_ref()).ok(); 254 | writeln!(output, "{}", pretty_print::pretty_print(node)).ok(); 255 | return; 256 | } 257 | 258 | writeln!(output, "{}", node.to_string()).ok(); 259 | // let mut content: Vec = Vec::new(); 260 | // let Ok(_) = node.serialize(&mut content) else { 261 | // return 262 | // }; 263 | // output.write_all(format!("{}\n", content).as_ref()).ok(); 264 | }); 265 | 266 | Ok(()) 267 | } 268 | -------------------------------------------------------------------------------- /src/pretty_print.rs: -------------------------------------------------------------------------------- 1 | use html5ever::serialize::AttrRef; 2 | use html5ever::serialize::HtmlSerializer; 3 | use html5ever::serialize::Serialize; 4 | use html5ever::serialize::SerializeOpts; 5 | use html5ever::serialize::Serializer; 6 | use html5ever::serialize::TraversalScope; 7 | use html5ever::QualName; 8 | // use kuchiki::traits::TendrilSink; 9 | use kuchiki::NodeRef; 10 | use std::collections::HashSet; 11 | use std::io; 12 | use std::io::Write; 13 | use std::str; 14 | 15 | lazy_static! 
{ 16 | static ref INLINE_ELEMENTS: HashSet<&'static str> = vec![ 17 | "a", "abbr", "acronym", "audio", "b", "bdi", "bdo", "big", "button", "canvas", "cite", 18 | "code", "data", "datalist", "del", "dfn", "em", "embed", "i", "iframe", "img", "input", 19 | "ins", "kbd", "label", "map", "mark", "meter", "noscript", "object", "output", "picture", 20 | "progress", "q", "ruby", "s", "samp", "script", "select", "slot", "small", "span", 21 | "strong", "sub", "sup", "svg", "template", "textarea", "time", "u", "tt", "var", "video", 22 | "wbr", 23 | ] 24 | .into_iter() 25 | .collect(); 26 | } 27 | 28 | fn is_inline(name: &str) -> bool { 29 | INLINE_ELEMENTS.contains(name) 30 | } 31 | 32 | struct PrettyPrint { 33 | indent: usize, 34 | previous_was_block: bool, 35 | inner: HtmlSerializer, 36 | } 37 | 38 | impl Serializer for PrettyPrint { 39 | fn start_elem<'a, AttrIter>(&mut self, name: QualName, attrs: AttrIter) -> io::Result<()> 40 | where 41 | AttrIter: Iterator>, 42 | { 43 | let inline = is_inline(&name.local); 44 | if !inline || self.previous_was_block { 45 | self.inner.writer.write_all(b"\n")?; 46 | self.inner.writer.write_all(&vec![b' '; self.indent])?; 47 | } 48 | 49 | self.indent += 2; 50 | self.inner.start_elem(name, attrs)?; 51 | 52 | Ok(()) 53 | } 54 | 55 | fn end_elem(&mut self, name: QualName) -> io::Result<()> { 56 | self.indent -= 2; 57 | 58 | if is_inline(&name.local) { 59 | self.previous_was_block = false; 60 | } else { 61 | self.inner.writer.write_all(b"\n")?; 62 | self.inner.writer.write_all(&vec![b' '; self.indent])?; 63 | self.previous_was_block = true; 64 | } 65 | 66 | self.inner.end_elem(name) 67 | } 68 | 69 | fn write_text(&mut self, text: &str) -> io::Result<()> { 70 | if text.trim().is_empty() { 71 | Ok(()) 72 | } else { 73 | if self.previous_was_block { 74 | self.inner.writer.write_all(b"\n")?; 75 | self.inner.writer.write_all(&vec![b' '; self.indent])?; 76 | } 77 | 78 | self.previous_was_block = false; 79 | self.inner.write_text(text) 80 | } 81 | } 
82 | 83 | fn write_comment(&mut self, text: &str) -> io::Result<()> { 84 | self.inner.write_comment(text) 85 | } 86 | 87 | fn write_doctype(&mut self, name: &str) -> io::Result<()> { 88 | self.inner.write_doctype(name) 89 | } 90 | 91 | fn write_processing_instruction(&mut self, target: &str, data: &str) -> io::Result<()> { 92 | self.inner.write_processing_instruction(target, data) 93 | } 94 | } 95 | 96 | pub fn pretty_print(node: &NodeRef) -> String { 97 | let mut content: Vec = Vec::new(); 98 | let mut pp = PrettyPrint { 99 | indent: 0, 100 | previous_was_block: false, 101 | inner: HtmlSerializer::new( 102 | &mut content, 103 | SerializeOpts { 104 | traversal_scope: TraversalScope::IncludeNode, 105 | ..Default::default() 106 | }, 107 | ), 108 | }; 109 | Serialize::serialize(node, &mut pp, TraversalScope::IncludeNode).unwrap(); 110 | str::from_utf8(content.as_ref()).unwrap().to_owned() 111 | } 112 | -------------------------------------------------------------------------------- /tests/cli.rs: -------------------------------------------------------------------------------- 1 | use assert_cmd::Command; 2 | use predicates::prelude::*; 3 | 4 | macro_rules! cmd_success_tests { 5 | ($($name:ident: $value:expr,)*) => { 6 | $( 7 | #[test] 8 | fn $name(){ 9 | let (stdin, args, expected) = $value; 10 | Command::cargo_bin("htmlq") 11 | .unwrap() 12 | .args(args) 13 | .write_stdin(stdin) 14 | .assert() 15 | .success() 16 | .stdout(predicate::str::diff(expected)); 17 | } 18 | )* 19 | } 20 | } 21 | 22 | cmd_success_tests!( 23 | find_by_class: ( 24 | "", 25 | [".hi"], 26 | "\n" 27 | ), 28 | find_by_id: ( 29 | "", 30 | ["#my-id"], 31 | "\n" 32 | ), 33 | remove_links: ( 34 | "", 35 | ["#my-id", "--remove-nodes", "a"], 36 | "
    \n", 37 | ), 38 | ); 39 | --------------------------------------------------------------------------------