├── .gitattributes ├── .github └── workflows │ └── ci.yml ├── .gitignore ├── Cargo.lock ├── Cargo.toml ├── LICENSE-APACHE ├── LICENSE-MIT ├── README.md ├── ammonia-compare ├── Cargo.lock ├── Cargo.toml ├── README.md ├── samples │ └── github-dekellum-frag.html └── src │ └── lib.rs ├── marked-cli ├── CHANGELOG.md ├── Cargo.toml ├── README.md ├── build.rs ├── clippy.toml └── src │ └── main.rs ├── marked-sanitizer ├── CHANGELOG.md ├── Cargo.lock ├── Cargo.toml ├── README.md ├── build.rs └── src │ └── lib.rs └── marked ├── CHANGELOG.md ├── Cargo.toml ├── README.md ├── benches └── round_trip.rs ├── build.rs ├── build ├── attributes ├── generate.rb ├── meta.rs.erb └── tags ├── clippy.toml ├── samples ├── documento_utf16be_bom.html ├── documento_utf16be_meta_utf16le.html ├── documento_utf16le.html ├── documento_utf16le_bom.html ├── documento_utf16le_meta_utf8.html ├── documento_utf8.html ├── documento_utf8_bom.html ├── documento_utf8_meta.html ├── documento_utf8_meta_utf16.html ├── documento_windows1252_meta.html ├── github-dekellum.html ├── iro0094_shiftjis_meta.html ├── matsunami_eucjp_meta.html └── russez_windows1251_meta.html └── src ├── chars.rs ├── decode.rs ├── decode └── encoding_hint.rs ├── dom.rs ├── dom ├── filter.rs ├── html.rs ├── html │ └── meta.rs ├── node_ref.rs ├── serializer.rs ├── tests.rs └── xml.rs ├── lib.rs └── logger.rs /.gitattributes: -------------------------------------------------------------------------------- 1 | * linguist-vendored 2 | *.rs linguist-vendored=false 3 | *.rb linguist-vendored=false 4 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: 3 | pull_request: 4 | push: 5 | schedule: 6 | - cron: '05 16 * * 1,4' 7 | 8 | env: 9 | RUSTFLAGS: -Dwarnings 10 | 11 | jobs: 12 | 13 | test: 14 | name: ${{ matrix.rust }} ${{ matrix.os }} ${{ join(matrix.extras) }} 15 | runs-on: ${{ 
matrix.os }} 16 | strategy: 17 | fail-fast: false 18 | matrix: 19 | include: 20 | - rust: 1.38.0 21 | os: ubuntu-20.04 22 | - rust: 1.38.0 23 | os: ubuntu-20.04 24 | extras: [update] 25 | - rust: 1.38.0 26 | os: windows-latest 27 | - rust: 1.38.0 28 | os: windows-latest 29 | extras: [update] 30 | - rust: stable 31 | os: ubuntu-20.04 32 | extras: [update] 33 | - rust: nightly 34 | os: ubuntu-20.04 35 | - rust: nightly 36 | os: ubuntu-20.04 37 | extras: [update] 38 | 39 | steps: 40 | - name: Checkout 41 | uses: actions/checkout@v2 42 | 43 | - name: Install rust (${{ matrix.rust }}) 44 | uses: actions-rs/toolchain@v1 45 | with: 46 | profile: minimal 47 | toolchain: ${{ matrix.rust }} 48 | override: true 49 | 50 | - name: Update deps 51 | if: ${{ contains(matrix.extras, 'update') }} 52 | run: cargo update 53 | 54 | - name: Downgrade xml-rs for MSRV 55 | if: ${{ matrix.rust == '1.38.0' }} 56 | run: cargo update -p xml-rs --precise 0.8.0 57 | 58 | - name: Test 59 | run: cargo test 60 | 61 | - name: Test all features 62 | run: cargo test --all-features 63 | 64 | - name: Build all features/targets 65 | if: ${{ matrix.rust == 'nightly' }} 66 | run: cargo build --all-features --all-targets 67 | 68 | - name: Build marked-cli (all features) 69 | working-directory: marked-cli 70 | run: cargo build --all-features 71 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | -------------------------------------------------------------------------------- /Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 
3 | [[package]] 4 | name = "bitflags" 5 | version = "1.2.1" 6 | source = "registry+https://github.com/rust-lang/crates.io-index" 7 | checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693" 8 | 9 | [[package]] 10 | name = "cfg-if" 11 | version = "1.0.0" 12 | source = "registry+https://github.com/rust-lang/crates.io-index" 13 | checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" 14 | 15 | [[package]] 16 | name = "clap" 17 | version = "2.33.3" 18 | source = "registry+https://github.com/rust-lang/crates.io-index" 19 | checksum = "37e58ac78573c40708d45522f0d80fa2f01cc4f9b4e2bf749807255454312002" 20 | dependencies = [ 21 | "bitflags", 22 | "term_size", 23 | "textwrap", 24 | "unicode-width", 25 | ] 26 | 27 | [[package]] 28 | name = "encoding_rs" 29 | version = "0.8.26" 30 | source = "registry+https://github.com/rust-lang/crates.io-index" 31 | checksum = "801bbab217d7f79c0062f4f7205b5d4427c6d1a7bd7aafdd1475f7c59d62b283" 32 | dependencies = [ 33 | "cfg-if", 34 | ] 35 | 36 | [[package]] 37 | name = "futf" 38 | version = "0.1.4" 39 | source = "registry+https://github.com/rust-lang/crates.io-index" 40 | checksum = "7c9c1ce3fa9336301af935ab852c437817d14cd33690446569392e65170aac3b" 41 | dependencies = [ 42 | "mac", 43 | "new_debug_unreachable", 44 | ] 45 | 46 | [[package]] 47 | name = "getrandom" 48 | version = "0.1.16" 49 | source = "registry+https://github.com/rust-lang/crates.io-index" 50 | checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce" 51 | dependencies = [ 52 | "cfg-if", 53 | "libc", 54 | "wasi", 55 | ] 56 | 57 | [[package]] 58 | name = "html5ever" 59 | version = "0.25.1" 60 | source = "registry+https://github.com/rust-lang/crates.io-index" 61 | checksum = "aafcf38a1a36118242d29b92e1b08ef84e67e4a5ed06e0a80be20e6a32bfed6b" 62 | dependencies = [ 63 | "log", 64 | "mac", 65 | "markup5ever", 66 | "proc-macro2", 67 | "quote", 68 | "syn", 69 | ] 70 | 71 | [[package]] 72 | name = "itoa" 73 | version = 
"0.4.7" 74 | source = "registry+https://github.com/rust-lang/crates.io-index" 75 | checksum = "dd25036021b0de88a0aff6b850051563c6516d0bf53f8638938edbb9de732736" 76 | 77 | [[package]] 78 | name = "lazy_static" 79 | version = "1.4.0" 80 | source = "registry+https://github.com/rust-lang/crates.io-index" 81 | checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" 82 | 83 | [[package]] 84 | name = "libc" 85 | version = "0.2.82" 86 | source = "registry+https://github.com/rust-lang/crates.io-index" 87 | checksum = "89203f3fba0a3795506acaad8ebce3c80c0af93f994d5a1d7a0b1eeb23271929" 88 | 89 | [[package]] 90 | name = "log" 91 | version = "0.4.14" 92 | source = "registry+https://github.com/rust-lang/crates.io-index" 93 | checksum = "51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710" 94 | dependencies = [ 95 | "cfg-if", 96 | ] 97 | 98 | [[package]] 99 | name = "mac" 100 | version = "0.1.1" 101 | source = "registry+https://github.com/rust-lang/crates.io-index" 102 | checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" 103 | 104 | [[package]] 105 | name = "marked" 106 | version = "0.3.0" 107 | dependencies = [ 108 | "encoding_rs", 109 | "html5ever", 110 | "lazy_static", 111 | "log", 112 | "markup5ever_rcdom", 113 | "mime", 114 | "rand", 115 | "string_cache", 116 | "tendril", 117 | "xml-rs", 118 | ] 119 | 120 | [[package]] 121 | name = "marked-cli" 122 | version = "0.3.1" 123 | dependencies = [ 124 | "clap", 125 | "encoding_rs", 126 | "html5ever", 127 | "log", 128 | "marked", 129 | ] 130 | 131 | [[package]] 132 | name = "markup5ever" 133 | version = "0.10.0" 134 | source = "registry+https://github.com/rust-lang/crates.io-index" 135 | checksum = "aae38d669396ca9b707bfc3db254bc382ddb94f57cc5c235f34623a669a01dab" 136 | dependencies = [ 137 | "log", 138 | "phf", 139 | "phf_codegen", 140 | "serde", 141 | "serde_derive", 142 | "serde_json", 143 | "string_cache", 144 | "string_cache_codegen", 145 | "tendril", 146 | ] 147 | 
148 | [[package]] 149 | name = "markup5ever_rcdom" 150 | version = "0.1.0" 151 | source = "git+https://github.com/dekellum/html5ever?branch=rcdom#14e6e4be4299d3940bed8b91de88241cbbc81d56" 152 | dependencies = [ 153 | "markup5ever", 154 | "tendril", 155 | ] 156 | 157 | [[package]] 158 | name = "mime" 159 | version = "0.3.16" 160 | source = "registry+https://github.com/rust-lang/crates.io-index" 161 | checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d" 162 | 163 | [[package]] 164 | name = "new_debug_unreachable" 165 | version = "1.0.4" 166 | source = "registry+https://github.com/rust-lang/crates.io-index" 167 | checksum = "e4a24736216ec316047a1fc4252e27dabb04218aa4a3f37c6e7ddbf1f9782b54" 168 | 169 | [[package]] 170 | name = "phf" 171 | version = "0.8.0" 172 | source = "registry+https://github.com/rust-lang/crates.io-index" 173 | checksum = "3dfb61232e34fcb633f43d12c58f83c1df82962dcdfa565a4e866ffc17dafe12" 174 | dependencies = [ 175 | "phf_shared", 176 | ] 177 | 178 | [[package]] 179 | name = "phf_codegen" 180 | version = "0.8.0" 181 | source = "registry+https://github.com/rust-lang/crates.io-index" 182 | checksum = "cbffee61585b0411840d3ece935cce9cb6321f01c45477d30066498cd5e1a815" 183 | dependencies = [ 184 | "phf_generator", 185 | "phf_shared", 186 | ] 187 | 188 | [[package]] 189 | name = "phf_generator" 190 | version = "0.8.0" 191 | source = "registry+https://github.com/rust-lang/crates.io-index" 192 | checksum = "17367f0cc86f2d25802b2c26ee58a7b23faeccf78a396094c13dced0d0182526" 193 | dependencies = [ 194 | "phf_shared", 195 | "rand", 196 | ] 197 | 198 | [[package]] 199 | name = "phf_shared" 200 | version = "0.8.0" 201 | source = "registry+https://github.com/rust-lang/crates.io-index" 202 | checksum = "c00cf8b9eafe68dde5e9eaa2cef8ee84a9336a47d566ec55ca16589633b65af7" 203 | dependencies = [ 204 | "siphasher", 205 | ] 206 | 207 | [[package]] 208 | name = "ppv-lite86" 209 | version = "0.2.10" 210 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 211 | checksum = "ac74c624d6b2d21f425f752262f42188365d7b8ff1aff74c82e45136510a4857" 212 | 213 | [[package]] 214 | name = "precomputed-hash" 215 | version = "0.1.1" 216 | source = "registry+https://github.com/rust-lang/crates.io-index" 217 | checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" 218 | 219 | [[package]] 220 | name = "proc-macro2" 221 | version = "1.0.24" 222 | source = "registry+https://github.com/rust-lang/crates.io-index" 223 | checksum = "1e0704ee1a7e00d7bb417d0770ea303c1bccbabf0ef1667dae92b5967f5f8a71" 224 | dependencies = [ 225 | "unicode-xid", 226 | ] 227 | 228 | [[package]] 229 | name = "quote" 230 | version = "1.0.8" 231 | source = "registry+https://github.com/rust-lang/crates.io-index" 232 | checksum = "991431c3519a3f36861882da93630ce66b52918dcf1b8e2fd66b397fc96f28df" 233 | dependencies = [ 234 | "proc-macro2", 235 | ] 236 | 237 | [[package]] 238 | name = "rand" 239 | version = "0.7.3" 240 | source = "registry+https://github.com/rust-lang/crates.io-index" 241 | checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" 242 | dependencies = [ 243 | "getrandom", 244 | "libc", 245 | "rand_chacha", 246 | "rand_core", 247 | "rand_hc", 248 | "rand_pcg", 249 | ] 250 | 251 | [[package]] 252 | name = "rand_chacha" 253 | version = "0.2.2" 254 | source = "registry+https://github.com/rust-lang/crates.io-index" 255 | checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402" 256 | dependencies = [ 257 | "ppv-lite86", 258 | "rand_core", 259 | ] 260 | 261 | [[package]] 262 | name = "rand_core" 263 | version = "0.5.1" 264 | source = "registry+https://github.com/rust-lang/crates.io-index" 265 | checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" 266 | dependencies = [ 267 | "getrandom", 268 | ] 269 | 270 | [[package]] 271 | name = "rand_hc" 272 | version = "0.2.0" 273 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 274 | checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c" 275 | dependencies = [ 276 | "rand_core", 277 | ] 278 | 279 | [[package]] 280 | name = "rand_pcg" 281 | version = "0.2.1" 282 | source = "registry+https://github.com/rust-lang/crates.io-index" 283 | checksum = "16abd0c1b639e9eb4d7c50c0b8100b0d0f849be2349829c740fe8e6eb4816429" 284 | dependencies = [ 285 | "rand_core", 286 | ] 287 | 288 | [[package]] 289 | name = "ryu" 290 | version = "1.0.5" 291 | source = "registry+https://github.com/rust-lang/crates.io-index" 292 | checksum = "71d301d4193d031abdd79ff7e3dd721168a9572ef3fe51a1517aba235bd8f86e" 293 | 294 | [[package]] 295 | name = "serde" 296 | version = "1.0.118" 297 | source = "registry+https://github.com/rust-lang/crates.io-index" 298 | checksum = "06c64263859d87aa2eb554587e2d23183398d617427327cf2b3d0ed8c69e4800" 299 | 300 | [[package]] 301 | name = "serde_derive" 302 | version = "1.0.118" 303 | source = "registry+https://github.com/rust-lang/crates.io-index" 304 | checksum = "c84d3526699cd55261af4b941e4e725444df67aa4f9e6a3564f18030d12672df" 305 | dependencies = [ 306 | "proc-macro2", 307 | "quote", 308 | "syn", 309 | ] 310 | 311 | [[package]] 312 | name = "serde_json" 313 | version = "1.0.61" 314 | source = "registry+https://github.com/rust-lang/crates.io-index" 315 | checksum = "4fceb2595057b6891a4ee808f70054bd2d12f0e97f1cbb78689b59f676df325a" 316 | dependencies = [ 317 | "itoa", 318 | "ryu", 319 | "serde", 320 | ] 321 | 322 | [[package]] 323 | name = "siphasher" 324 | version = "0.3.3" 325 | source = "registry+https://github.com/rust-lang/crates.io-index" 326 | checksum = "fa8f3741c7372e75519bd9346068370c9cdaabcc1f9599cbcf2a2719352286b7" 327 | 328 | [[package]] 329 | name = "string_cache" 330 | version = "0.8.1" 331 | source = "registry+https://github.com/rust-lang/crates.io-index" 332 | checksum = "8ddb1139b5353f96e429e1a5e19fbaf663bddedaa06d1dbd49f82e352601209a" 333 | 
dependencies = [ 334 | "lazy_static", 335 | "new_debug_unreachable", 336 | "phf_shared", 337 | "precomputed-hash", 338 | "serde", 339 | ] 340 | 341 | [[package]] 342 | name = "string_cache_codegen" 343 | version = "0.5.1" 344 | source = "registry+https://github.com/rust-lang/crates.io-index" 345 | checksum = "f24c8e5e19d22a726626f1a5e16fe15b132dcf21d10177fa5a45ce7962996b97" 346 | dependencies = [ 347 | "phf_generator", 348 | "phf_shared", 349 | "proc-macro2", 350 | "quote", 351 | ] 352 | 353 | [[package]] 354 | name = "syn" 355 | version = "1.0.58" 356 | source = "registry+https://github.com/rust-lang/crates.io-index" 357 | checksum = "cc60a3d73ea6594cd712d830cc1f0390fd71542d8c8cd24e70cc54cdfd5e05d5" 358 | dependencies = [ 359 | "proc-macro2", 360 | "quote", 361 | "unicode-xid", 362 | ] 363 | 364 | [[package]] 365 | name = "tendril" 366 | version = "0.4.2" 367 | source = "registry+https://github.com/rust-lang/crates.io-index" 368 | checksum = "a9ef557cb397a4f0a5a3a628f06515f78563f2209e64d47055d9dc6052bf5e33" 369 | dependencies = [ 370 | "encoding_rs", 371 | "futf", 372 | "mac", 373 | "utf-8", 374 | ] 375 | 376 | [[package]] 377 | name = "term_size" 378 | version = "0.3.2" 379 | source = "registry+https://github.com/rust-lang/crates.io-index" 380 | checksum = "1e4129646ca0ed8f45d09b929036bafad5377103edd06e50bf574b353d2b08d9" 381 | dependencies = [ 382 | "libc", 383 | "winapi", 384 | ] 385 | 386 | [[package]] 387 | name = "textwrap" 388 | version = "0.11.0" 389 | source = "registry+https://github.com/rust-lang/crates.io-index" 390 | checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" 391 | dependencies = [ 392 | "term_size", 393 | "unicode-width", 394 | ] 395 | 396 | [[package]] 397 | name = "unicode-width" 398 | version = "0.1.8" 399 | source = "registry+https://github.com/rust-lang/crates.io-index" 400 | checksum = "9337591893a19b88d8d87f2cec1e73fad5cdfd10e5a6f349f498ad6ea2ffb1e3" 401 | 402 | [[package]] 403 | name = "unicode-xid" 404 | 
version = "0.2.1" 405 | source = "registry+https://github.com/rust-lang/crates.io-index" 406 | checksum = "f7fe0bb3479651439c9112f72b6c505038574c9fbb575ed1bf3b797fa39dd564" 407 | 408 | [[package]] 409 | name = "utf-8" 410 | version = "0.7.5" 411 | source = "registry+https://github.com/rust-lang/crates.io-index" 412 | checksum = "05e42f7c18b8f902290b009cde6d651262f956c98bc51bca4cd1d511c9cd85c7" 413 | 414 | [[package]] 415 | name = "wasi" 416 | version = "0.9.0+wasi-snapshot-preview1" 417 | source = "registry+https://github.com/rust-lang/crates.io-index" 418 | checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" 419 | 420 | [[package]] 421 | name = "winapi" 422 | version = "0.3.9" 423 | source = "registry+https://github.com/rust-lang/crates.io-index" 424 | checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" 425 | dependencies = [ 426 | "winapi-i686-pc-windows-gnu", 427 | "winapi-x86_64-pc-windows-gnu", 428 | ] 429 | 430 | [[package]] 431 | name = "winapi-i686-pc-windows-gnu" 432 | version = "0.4.0" 433 | source = "registry+https://github.com/rust-lang/crates.io-index" 434 | checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" 435 | 436 | [[package]] 437 | name = "winapi-x86_64-pc-windows-gnu" 438 | version = "0.4.0" 439 | source = "registry+https://github.com/rust-lang/crates.io-index" 440 | checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" 441 | 442 | [[package]] 443 | name = "xml-rs" 444 | version = "0.8.3" 445 | source = "registry+https://github.com/rust-lang/crates.io-index" 446 | checksum = "b07db065a5cf61a7e4ba64f29e67db906fb1787316516c4e6e5ff0fea1efcd8a" 447 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | members = [ "marked", "marked-cli" ] 3 | exclude = [ "ammonia-compare", "marked-sanitizer" ] 4 | 5 | 
[patch.crates-io] 6 | "marked" = { path = "marked" } 7 | 8 | [profile.release] 9 | lto = "thin" 10 | incremental = false 11 | 12 | [profile.bench] 13 | lto = "thin" 14 | incremental = false 15 | -------------------------------------------------------------------------------- /LICENSE-APACHE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 
35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | Copyright © 2020 David Kellum 2 | Copyright © 2019 the Märkəd authors 3 | Copyright © 2018 (the Kuchiki authors) 4 | Copyright © 2017 (the Victor authors) 5 | Copyright © 2014-2017 the html5ever project developers 6 | 7 | Permission is hereby granted, free of charge, to any 8 | person obtaining a copy of this software and associated 9 | documentation files (the "Software"), to deal in the 10 | Software without restriction, including without 11 | limitation the rights to use, copy, modify, merge, 12 | publish, distribute, sublicense, and/or sell copies of 13 | the Software, and to permit persons to whom the Software 14 | is furnished to do so, subject to the following 15 | conditions: 16 | 17 | The above copyright notice and this permission notice 18 | shall be included in all copies or substantial portions 19 | of the Software. 
20 | 21 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF 22 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 23 | TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 24 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT 25 | SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 26 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 27 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR 28 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 29 | DEALINGS IN THE SOFTWARE. 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # The Märkəd Project 2 | 3 | [![deps status](https://deps.rs/repo/github/dekellum/marked/status.svg)](https://deps.rs/repo/github/dekellum/marked) 4 | [![CI Status](https://github.com/dekellum/marked/workflows/CI/badge.svg?branch=main)](https://github.com/dekellum/marked/actions?query=workflow%3ACI) 5 | 6 | A rust language project for parsing, filtering, selecting and serializing HTML 7 | and XML mark-up. 8 | 9 | See the _[marked]_ crate or _[marked-cli]_ crates or the README(s) and 10 | CHANGELOG(s) under this ([github hosted]) source tree and cargo workspace. 11 | 12 | ## Feature Overview 13 | 14 | Currently implemented features: 15 | 16 | ### A vector-allocated, indexed, DOM-like tree structure 17 | 18 | The `marked::Document` is a DOM-like tree structure suitable for HTML and 19 | XML. This was forked from the _[victor]_ project (same author as _html5ever_) 20 | and further optimized. It is implemented as a (std) `Vec` of `Node` types, 21 | which references parent, siblings and children via (std) `NonZeroU32` indexes 22 | for space efficiency. 23 | 24 | ### _html5ever_ integration 25 | 26 | Including HTML5 document and fragment parsing and HTML5 serialization (mark-up 27 | output). 
With the `marked::Document` (DOM), parsing and serialization are 28 | measurably faster (see benchmarks in source tree) than the `RcDom` previously 29 | included with *html5ever* associated crates, and mutating the `Document` is 30 | more straightforward, via a mutable reference. 31 | 32 | ### _xml-rs_ integration 33 | 34 | Strict, UTF-8 XML parsing to `marked::Document` is currently supported by 35 | integration of the _[xml-rs]_ crate. 36 | 37 | ### Legacy character encoding support 38 | 39 | An estimated 5% of the web remains in encodings other than UTF-8; too common to 40 | be treated as an error. Via `marked::html::parse_buffered`: 41 | 42 | * Decoding via _encoding_rs_ which implements _[The Encoding Standard]_ including 43 | alternative names (labels) for supported encodings. 44 | 45 | * HTML5 parsing restart from initial (4k) buffer with new encoding hints 46 | obtained from `<head>`/`<meta>` `charset` or an `http-equiv` `content-type` with 47 | charset. 48 | 49 | * Byte-Order-Mark BOM sniffing as high priority `EncodingHint` for UTF-8, UTF-16 50 | Big-Endian and UTF-16 Little-Endian. 51 | 52 | * "Impossible" hints from the above are ignored. For example, if we read a hint 53 | from UTF-8 that says it's UTF-16LE (which would make it impossible to 54 | read the same hint if it was used). 55 | 56 | (Note that the _detection_ features are not currently provided by _html5ever_ and 57 | associated crates.) 58 | 59 | ### Rust "selectors" API 60 | 61 | A `NodeRef` type with "CSS selectors"-like methods to recursively `select` and 62 | `find` elements using closure predicates. We prefer direct rust language 63 | compiler support for writing such selection logic, over CSS or other 64 | interpreted DSL. 65 | 66 | ### HTML tag and attribute metadata 67 | 68 | See `marked::html::t` (tags) and `marked::html::a` (attributes) modules.
69 | 70 | ### Tree walking filters API 71 | 72 | Bulk modifications to the DOM are easily and efficiently achieved with mutating 73 | filter functions/closures and a tree walker (depth or breadth-first) 74 | implementation in _marked_. This style of interface is sometimes called the 75 | "visitor pattern". See `Document::filter_at` for details. The crate also 76 | includes the following built-in filters (a partial list): 77 | 78 | `detach_banned_element` 79 | : `Detach` known banned (via metadata) and unknown elements 80 | 81 | `retain_basic_attributes` 82 | : Remove all attributes that are not part of the "basic" logical set (via metadata) 83 | 84 | `fold_empty_inline` 85 | : `Fold` empty or meaninglessly "inline" elements 86 | 87 | `text_normalize` 88 | : Normalize text nodes by merging, replacing control characters and minimizing white-space. 89 | 90 | An unreleased example, compatibility test and benchmark of _ammonia_ crate 91 | equivalent filtering (for hygiene and safety) is included in the source tree 92 | ([./ammonia-compare]). 93 | 94 | ## Roadmap 95 | 96 | Features incomplete or unstarted which may be included in this project in the 97 | future (PRs welcome): 98 | 99 | * Complete (faster, more correct, legacy encodings) strict-mode XML parsing 100 | 101 | * Lenient-mode XML parsing 102 | 103 | * Optional (opt-in) direct charset detection (initial read buffer or entire 104 | document) via something like [chardet], integrated as high priority 105 | _EncodingHint_.
106 | 107 | * XML/HTML pretty-indenting serialization (combines well with the existing white-space 108 | normalization features) 109 | 110 | * XML (and XHTML) serialization 111 | 112 | ## License 113 | 114 | This project is dual licensed under either of the following: 115 | 116 | * The Apache License, version 2.0 117 | ([LICENSE-APACHE] or http://www.apache.org/licenses/LICENSE-2.0) 118 | 119 | * The MIT License 120 | ([LICENSE-MIT] or http://opensource.org/licenses/MIT) 121 | 122 | ### Contribution 123 | 124 | Unless you explicitly state otherwise, any contribution intentionally submitted 125 | for inclusion in the _märkəd_ project by you, as defined by the Apache License, 126 | shall be dual licensed as above, without any additional terms or conditions. 127 | 128 | [github hosted]: https://github.com/dekellum/marked 129 | [marked]: https://docs.rs/crate/marked 130 | [marked-cli]: https://crates.io/crates/marked-cli 131 | [The Encoding Standard]: https://encoding.spec.whatwg.org/ 132 | [./ammonia-compare]: https://github.com/dekellum/marked/tree/main/ammonia-compare 133 | [victor]: https://github.com/SimonSapin/victor 134 | [chardet]: https://crates.io/crates/chardet 135 | [xml-rs]: https://crates.io/crates/xml-rs 136 | [LICENSE-APACHE]: https://github.com/dekellum/marked/tree/main/LICENSE-APACHE 137 | [LICENSE-MIT]: https://github.com/dekellum/marked/tree/main/LICENSE-MIT 138 | -------------------------------------------------------------------------------- /ammonia-compare/Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing.
3 | [[package]] 4 | name = "ammonia" 5 | version = "3.1.0" 6 | source = "registry+https://github.com/rust-lang/crates.io-index" 7 | checksum = "89eac85170f4b3fb3dc5e442c1cfb036cb8eecf9dbbd431a161ffad15d90ea3b" 8 | dependencies = [ 9 | "html5ever", 10 | "lazy_static", 11 | "maplit", 12 | "markup5ever_rcdom", 13 | "matches", 14 | "tendril", 15 | "url", 16 | ] 17 | 18 | [[package]] 19 | name = "ammonia-compare" 20 | version = "0.0.1" 21 | dependencies = [ 22 | "ammonia", 23 | "marked", 24 | ] 25 | 26 | [[package]] 27 | name = "cfg-if" 28 | version = "0.1.10" 29 | source = "registry+https://github.com/rust-lang/crates.io-index" 30 | checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" 31 | 32 | [[package]] 33 | name = "cfg-if" 34 | version = "1.0.0" 35 | source = "registry+https://github.com/rust-lang/crates.io-index" 36 | checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" 37 | 38 | [[package]] 39 | name = "encoding_rs" 40 | version = "0.8.26" 41 | source = "registry+https://github.com/rust-lang/crates.io-index" 42 | checksum = "801bbab217d7f79c0062f4f7205b5d4427c6d1a7bd7aafdd1475f7c59d62b283" 43 | dependencies = [ 44 | "cfg-if 1.0.0", 45 | ] 46 | 47 | [[package]] 48 | name = "form_urlencoded" 49 | version = "1.0.0" 50 | source = "registry+https://github.com/rust-lang/crates.io-index" 51 | checksum = "ece68d15c92e84fa4f19d3780f1294e5ca82a78a6d515f1efaabcc144688be00" 52 | dependencies = [ 53 | "matches", 54 | "percent-encoding", 55 | ] 56 | 57 | [[package]] 58 | name = "futf" 59 | version = "0.1.4" 60 | source = "registry+https://github.com/rust-lang/crates.io-index" 61 | checksum = "7c9c1ce3fa9336301af935ab852c437817d14cd33690446569392e65170aac3b" 62 | dependencies = [ 63 | "mac", 64 | "new_debug_unreachable", 65 | ] 66 | 67 | [[package]] 68 | name = "getrandom" 69 | version = "0.1.16" 70 | source = "registry+https://github.com/rust-lang/crates.io-index" 71 | checksum = 
"8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce" 72 | dependencies = [ 73 | "cfg-if 1.0.0", 74 | "libc", 75 | "wasi 0.9.0+wasi-snapshot-preview1", 76 | ] 77 | 78 | [[package]] 79 | name = "html5ever" 80 | version = "0.25.1" 81 | source = "registry+https://github.com/rust-lang/crates.io-index" 82 | checksum = "aafcf38a1a36118242d29b92e1b08ef84e67e4a5ed06e0a80be20e6a32bfed6b" 83 | dependencies = [ 84 | "log", 85 | "mac", 86 | "markup5ever", 87 | "proc-macro2", 88 | "quote", 89 | "syn", 90 | ] 91 | 92 | [[package]] 93 | name = "idna" 94 | version = "0.2.0" 95 | source = "registry+https://github.com/rust-lang/crates.io-index" 96 | checksum = "02e2673c30ee86b5b96a9cb52ad15718aa1f966f5ab9ad54a8b95d5ca33120a9" 97 | dependencies = [ 98 | "matches", 99 | "unicode-bidi", 100 | "unicode-normalization", 101 | ] 102 | 103 | [[package]] 104 | name = "itoa" 105 | version = "0.4.7" 106 | source = "registry+https://github.com/rust-lang/crates.io-index" 107 | checksum = "dd25036021b0de88a0aff6b850051563c6516d0bf53f8638938edbb9de732736" 108 | 109 | [[package]] 110 | name = "lazy_static" 111 | version = "1.4.0" 112 | source = "registry+https://github.com/rust-lang/crates.io-index" 113 | checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" 114 | 115 | [[package]] 116 | name = "libc" 117 | version = "0.2.82" 118 | source = "registry+https://github.com/rust-lang/crates.io-index" 119 | checksum = "89203f3fba0a3795506acaad8ebce3c80c0af93f994d5a1d7a0b1eeb23271929" 120 | 121 | [[package]] 122 | name = "log" 123 | version = "0.4.11" 124 | source = "registry+https://github.com/rust-lang/crates.io-index" 125 | checksum = "4fabed175da42fed1fa0746b0ea71f412aa9d35e76e95e59b192c64b9dc2bf8b" 126 | dependencies = [ 127 | "cfg-if 0.1.10", 128 | ] 129 | 130 | [[package]] 131 | name = "mac" 132 | version = "0.1.1" 133 | source = "registry+https://github.com/rust-lang/crates.io-index" 134 | checksum = 
"c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" 135 | 136 | [[package]] 137 | name = "maplit" 138 | version = "1.0.2" 139 | source = "registry+https://github.com/rust-lang/crates.io-index" 140 | checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d" 141 | 142 | [[package]] 143 | name = "marked" 144 | version = "0.3.0" 145 | dependencies = [ 146 | "encoding_rs", 147 | "html5ever", 148 | "lazy_static", 149 | "log", 150 | "mime", 151 | "string_cache", 152 | "tendril", 153 | ] 154 | 155 | [[package]] 156 | name = "markup5ever" 157 | version = "0.10.0" 158 | source = "registry+https://github.com/rust-lang/crates.io-index" 159 | checksum = "aae38d669396ca9b707bfc3db254bc382ddb94f57cc5c235f34623a669a01dab" 160 | dependencies = [ 161 | "log", 162 | "phf", 163 | "phf_codegen", 164 | "serde", 165 | "serde_derive", 166 | "serde_json", 167 | "string_cache", 168 | "string_cache_codegen", 169 | "tendril", 170 | ] 171 | 172 | [[package]] 173 | name = "markup5ever_rcdom" 174 | version = "0.1.0" 175 | source = "registry+https://github.com/rust-lang/crates.io-index" 176 | checksum = "f015da43bcd8d4f144559a3423f4591d69b8ce0652c905374da7205df336ae2b" 177 | dependencies = [ 178 | "html5ever", 179 | "markup5ever", 180 | "tendril", 181 | "xml5ever", 182 | ] 183 | 184 | [[package]] 185 | name = "matches" 186 | version = "0.1.8" 187 | source = "registry+https://github.com/rust-lang/crates.io-index" 188 | checksum = "7ffc5c5338469d4d3ea17d269fa8ea3512ad247247c30bd2df69e68309ed0a08" 189 | 190 | [[package]] 191 | name = "mime" 192 | version = "0.3.16" 193 | source = "registry+https://github.com/rust-lang/crates.io-index" 194 | checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d" 195 | 196 | [[package]] 197 | name = "new_debug_unreachable" 198 | version = "1.0.4" 199 | source = "registry+https://github.com/rust-lang/crates.io-index" 200 | checksum = "e4a24736216ec316047a1fc4252e27dabb04218aa4a3f37c6e7ddbf1f9782b54" 201 | 202 | 
[[package]] 203 | name = "percent-encoding" 204 | version = "2.1.0" 205 | source = "registry+https://github.com/rust-lang/crates.io-index" 206 | checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e" 207 | 208 | [[package]] 209 | name = "phf" 210 | version = "0.8.0" 211 | source = "registry+https://github.com/rust-lang/crates.io-index" 212 | checksum = "3dfb61232e34fcb633f43d12c58f83c1df82962dcdfa565a4e866ffc17dafe12" 213 | dependencies = [ 214 | "phf_shared", 215 | ] 216 | 217 | [[package]] 218 | name = "phf_codegen" 219 | version = "0.8.0" 220 | source = "registry+https://github.com/rust-lang/crates.io-index" 221 | checksum = "cbffee61585b0411840d3ece935cce9cb6321f01c45477d30066498cd5e1a815" 222 | dependencies = [ 223 | "phf_generator", 224 | "phf_shared", 225 | ] 226 | 227 | [[package]] 228 | name = "phf_generator" 229 | version = "0.8.0" 230 | source = "registry+https://github.com/rust-lang/crates.io-index" 231 | checksum = "17367f0cc86f2d25802b2c26ee58a7b23faeccf78a396094c13dced0d0182526" 232 | dependencies = [ 233 | "phf_shared", 234 | "rand", 235 | ] 236 | 237 | [[package]] 238 | name = "phf_shared" 239 | version = "0.8.0" 240 | source = "registry+https://github.com/rust-lang/crates.io-index" 241 | checksum = "c00cf8b9eafe68dde5e9eaa2cef8ee84a9336a47d566ec55ca16589633b65af7" 242 | dependencies = [ 243 | "siphasher", 244 | ] 245 | 246 | [[package]] 247 | name = "ppv-lite86" 248 | version = "0.2.10" 249 | source = "registry+https://github.com/rust-lang/crates.io-index" 250 | checksum = "ac74c624d6b2d21f425f752262f42188365d7b8ff1aff74c82e45136510a4857" 251 | 252 | [[package]] 253 | name = "precomputed-hash" 254 | version = "0.1.1" 255 | source = "registry+https://github.com/rust-lang/crates.io-index" 256 | checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" 257 | 258 | [[package]] 259 | name = "proc-macro2" 260 | version = "1.0.24" 261 | source = "registry+https://github.com/rust-lang/crates.io-index" 262 | 
checksum = "1e0704ee1a7e00d7bb417d0770ea303c1bccbabf0ef1667dae92b5967f5f8a71" 263 | dependencies = [ 264 | "unicode-xid", 265 | ] 266 | 267 | [[package]] 268 | name = "quote" 269 | version = "1.0.8" 270 | source = "registry+https://github.com/rust-lang/crates.io-index" 271 | checksum = "991431c3519a3f36861882da93630ce66b52918dcf1b8e2fd66b397fc96f28df" 272 | dependencies = [ 273 | "proc-macro2", 274 | ] 275 | 276 | [[package]] 277 | name = "rand" 278 | version = "0.7.3" 279 | source = "registry+https://github.com/rust-lang/crates.io-index" 280 | checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" 281 | dependencies = [ 282 | "getrandom", 283 | "libc", 284 | "rand_chacha", 285 | "rand_core", 286 | "rand_hc", 287 | "rand_pcg", 288 | ] 289 | 290 | [[package]] 291 | name = "rand_chacha" 292 | version = "0.2.2" 293 | source = "registry+https://github.com/rust-lang/crates.io-index" 294 | checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402" 295 | dependencies = [ 296 | "ppv-lite86", 297 | "rand_core", 298 | ] 299 | 300 | [[package]] 301 | name = "rand_core" 302 | version = "0.5.1" 303 | source = "registry+https://github.com/rust-lang/crates.io-index" 304 | checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" 305 | dependencies = [ 306 | "getrandom", 307 | ] 308 | 309 | [[package]] 310 | name = "rand_hc" 311 | version = "0.2.0" 312 | source = "registry+https://github.com/rust-lang/crates.io-index" 313 | checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c" 314 | dependencies = [ 315 | "rand_core", 316 | ] 317 | 318 | [[package]] 319 | name = "rand_pcg" 320 | version = "0.2.1" 321 | source = "registry+https://github.com/rust-lang/crates.io-index" 322 | checksum = "16abd0c1b639e9eb4d7c50c0b8100b0d0f849be2349829c740fe8e6eb4816429" 323 | dependencies = [ 324 | "rand_core", 325 | ] 326 | 327 | [[package]] 328 | name = "ryu" 329 | version = "1.0.5" 330 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 331 | checksum = "71d301d4193d031abdd79ff7e3dd721168a9572ef3fe51a1517aba235bd8f86e" 332 | 333 | [[package]] 334 | name = "serde" 335 | version = "1.0.118" 336 | source = "registry+https://github.com/rust-lang/crates.io-index" 337 | checksum = "06c64263859d87aa2eb554587e2d23183398d617427327cf2b3d0ed8c69e4800" 338 | 339 | [[package]] 340 | name = "serde_derive" 341 | version = "1.0.118" 342 | source = "registry+https://github.com/rust-lang/crates.io-index" 343 | checksum = "c84d3526699cd55261af4b941e4e725444df67aa4f9e6a3564f18030d12672df" 344 | dependencies = [ 345 | "proc-macro2", 346 | "quote", 347 | "syn", 348 | ] 349 | 350 | [[package]] 351 | name = "serde_json" 352 | version = "1.0.61" 353 | source = "registry+https://github.com/rust-lang/crates.io-index" 354 | checksum = "4fceb2595057b6891a4ee808f70054bd2d12f0e97f1cbb78689b59f676df325a" 355 | dependencies = [ 356 | "itoa", 357 | "ryu", 358 | "serde", 359 | ] 360 | 361 | [[package]] 362 | name = "siphasher" 363 | version = "0.3.3" 364 | source = "registry+https://github.com/rust-lang/crates.io-index" 365 | checksum = "fa8f3741c7372e75519bd9346068370c9cdaabcc1f9599cbcf2a2719352286b7" 366 | 367 | [[package]] 368 | name = "string_cache" 369 | version = "0.8.1" 370 | source = "registry+https://github.com/rust-lang/crates.io-index" 371 | checksum = "8ddb1139b5353f96e429e1a5e19fbaf663bddedaa06d1dbd49f82e352601209a" 372 | dependencies = [ 373 | "lazy_static", 374 | "new_debug_unreachable", 375 | "phf_shared", 376 | "precomputed-hash", 377 | "serde", 378 | ] 379 | 380 | [[package]] 381 | name = "string_cache_codegen" 382 | version = "0.5.1" 383 | source = "registry+https://github.com/rust-lang/crates.io-index" 384 | checksum = "f24c8e5e19d22a726626f1a5e16fe15b132dcf21d10177fa5a45ce7962996b97" 385 | dependencies = [ 386 | "phf_generator", 387 | "phf_shared", 388 | "proc-macro2", 389 | "quote", 390 | ] 391 | 392 | [[package]] 393 | name = "syn" 394 | version = 
"1.0.58" 395 | source = "registry+https://github.com/rust-lang/crates.io-index" 396 | checksum = "cc60a3d73ea6594cd712d830cc1f0390fd71542d8c8cd24e70cc54cdfd5e05d5" 397 | dependencies = [ 398 | "proc-macro2", 399 | "quote", 400 | "unicode-xid", 401 | ] 402 | 403 | [[package]] 404 | name = "tendril" 405 | version = "0.4.1" 406 | source = "registry+https://github.com/rust-lang/crates.io-index" 407 | checksum = "707feda9f2582d5d680d733e38755547a3e8fb471e7ba11452ecfd9ce93a5d3b" 408 | dependencies = [ 409 | "encoding_rs", 410 | "futf", 411 | "mac", 412 | "utf-8", 413 | ] 414 | 415 | [[package]] 416 | name = "time" 417 | version = "0.1.44" 418 | source = "registry+https://github.com/rust-lang/crates.io-index" 419 | checksum = "6db9e6914ab8b1ae1c260a4ae7a49b6c5611b40328a735b21862567685e73255" 420 | dependencies = [ 421 | "libc", 422 | "wasi 0.10.0+wasi-snapshot-preview1", 423 | "winapi", 424 | ] 425 | 426 | [[package]] 427 | name = "tinyvec" 428 | version = "1.1.0" 429 | source = "registry+https://github.com/rust-lang/crates.io-index" 430 | checksum = "ccf8dbc19eb42fba10e8feaaec282fb50e2c14b2726d6301dbfeed0f73306a6f" 431 | dependencies = [ 432 | "tinyvec_macros", 433 | ] 434 | 435 | [[package]] 436 | name = "tinyvec_macros" 437 | version = "0.1.0" 438 | source = "registry+https://github.com/rust-lang/crates.io-index" 439 | checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" 440 | 441 | [[package]] 442 | name = "unicode-bidi" 443 | version = "0.3.4" 444 | source = "registry+https://github.com/rust-lang/crates.io-index" 445 | checksum = "49f2bd0c6468a8230e1db229cff8029217cf623c767ea5d60bfbd42729ea54d5" 446 | dependencies = [ 447 | "matches", 448 | ] 449 | 450 | [[package]] 451 | name = "unicode-normalization" 452 | version = "0.1.16" 453 | source = "registry+https://github.com/rust-lang/crates.io-index" 454 | checksum = "a13e63ab62dbe32aeee58d1c5408d35c36c392bba5d9d3142287219721afe606" 455 | dependencies = [ 456 | "tinyvec", 457 | ] 458 | 459 | 
[[package]] 460 | name = "unicode-xid" 461 | version = "0.2.1" 462 | source = "registry+https://github.com/rust-lang/crates.io-index" 463 | checksum = "f7fe0bb3479651439c9112f72b6c505038574c9fbb575ed1bf3b797fa39dd564" 464 | 465 | [[package]] 466 | name = "url" 467 | version = "2.2.0" 468 | source = "registry+https://github.com/rust-lang/crates.io-index" 469 | checksum = "5909f2b0817350449ed73e8bcd81c8c3c8d9a7a5d8acba4b27db277f1868976e" 470 | dependencies = [ 471 | "form_urlencoded", 472 | "idna", 473 | "matches", 474 | "percent-encoding", 475 | ] 476 | 477 | [[package]] 478 | name = "utf-8" 479 | version = "0.7.5" 480 | source = "registry+https://github.com/rust-lang/crates.io-index" 481 | checksum = "05e42f7c18b8f902290b009cde6d651262f956c98bc51bca4cd1d511c9cd85c7" 482 | 483 | [[package]] 484 | name = "wasi" 485 | version = "0.9.0+wasi-snapshot-preview1" 486 | source = "registry+https://github.com/rust-lang/crates.io-index" 487 | checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" 488 | 489 | [[package]] 490 | name = "wasi" 491 | version = "0.10.0+wasi-snapshot-preview1" 492 | source = "registry+https://github.com/rust-lang/crates.io-index" 493 | checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f" 494 | 495 | [[package]] 496 | name = "winapi" 497 | version = "0.3.9" 498 | source = "registry+https://github.com/rust-lang/crates.io-index" 499 | checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" 500 | dependencies = [ 501 | "winapi-i686-pc-windows-gnu", 502 | "winapi-x86_64-pc-windows-gnu", 503 | ] 504 | 505 | [[package]] 506 | name = "winapi-i686-pc-windows-gnu" 507 | version = "0.4.0" 508 | source = "registry+https://github.com/rust-lang/crates.io-index" 509 | checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" 510 | 511 | [[package]] 512 | name = "winapi-x86_64-pc-windows-gnu" 513 | version = "0.4.0" 514 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 515 | checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" 516 | 517 | [[package]] 518 | name = "xml5ever" 519 | version = "0.16.1" 520 | source = "registry+https://github.com/rust-lang/crates.io-index" 521 | checksum = "0b1b52e6e8614d4a58b8e70cf51ec0cc21b256ad8206708bcff8139b5bbd6a59" 522 | dependencies = [ 523 | "log", 524 | "mac", 525 | "markup5ever", 526 | "time", 527 | ] 528 | -------------------------------------------------------------------------------- /ammonia-compare/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "ammonia-compare" 3 | publish = false 4 | version = "0.0.1" 5 | authors = ["David Kellum "] 6 | edition = "2018" 7 | 8 | [lib] 9 | doctest = false 10 | 11 | [dependencies] 12 | marked = { version=">=0.3.0", path="../marked" } 13 | ammonia = { version=">=3.1.0, <3.2" } 14 | -------------------------------------------------------------------------------- /ammonia-compare/README.md: -------------------------------------------------------------------------------- 1 | # ammonia-compare 2 | 3 | This non-released sub-module includes compatible filtering with the default 4 | settings of the _[ammonia]_ crate (as of version 3.1.0) and comparative 5 | benchmarks with a single sample. Its split out into its own sub-module because 6 | we can't have an optional ammonia dev-dependency. 
7 | 8 | ## License 9 | 10 | This project is dual licensed under either of following: 11 | 12 | * The Apache License, version 2.0 ([../LICENSE-APACHE]) 13 | or http://www.apache.org/licenses/LICENSE-2.0) 14 | 15 | * The MIT License ([../LICENSE-MIT]) 16 | or http://opensource.org/licenses/MIT) 17 | 18 | ### Contribution 19 | 20 | Unless you explicitly state otherwise, any contribution intentionally submitted 21 | for inclusion in _märkəd_ (ammonia-compare) by you, as defined by the Apache 22 | License, shall be dual licensed as above, without any additional terms or 23 | conditions. 24 | 25 | [ammonia]: https://crates.io/crates/ammonia 26 | [../LICENSE-APACHE]: https://github.com/dekellum/marked/tree/main/LICENSE-APACHE 27 | [../LICENSE-MIT]: https://github.com/dekellum/marked/tree/main/LICENSE-MIT 28 | -------------------------------------------------------------------------------- /ammonia-compare/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![warn(rust_2018_idioms)] 2 | #![feature(test)] 3 | #![cfg(test)] 4 | 5 | extern crate test; // Still required, see rust-lang/rust#55133 6 | 7 | use std::default::Default; 8 | use std::io; 9 | use std::io::Read; 10 | use std::fs::File; 11 | 12 | use test::Bencher; 13 | 14 | use marked::{ 15 | chain_filters, filter, filter::Action, 16 | html::{a, t, parse_utf8_fragment}, 17 | NodeData, NodeRef 18 | }; 19 | 20 | // Detach tags for which content should not be retained 21 | // 22 | // This includes: 23 | // * The Ammonia::Builder::clean_content_tags default: STYLE and 24 | // SCRIPT, despite conflicting rustdoc. 25 | // * DOMs like *5ever rcdom have specific handling for TEMPLATE, 26 | // so we detach that as well. 
27 | pub fn detach_clean_content_tags(_p: NodeRef<'_>, data: &mut NodeData) -> Action { 28 | if let Some(ref elm) = data.as_element() { 29 | match elm.name.local { 30 | t::STYLE | t::SCRIPT | t::TEMPLATE => return Action::Detach, 31 | _ => () 32 | } 33 | } 34 | Action::Continue 35 | } 36 | 37 | // Fold elements that are not in the default Ammonia::Builder::tags whitelist. 38 | pub fn fold_non_whitelist_tags(_p: NodeRef<'_>, data: &mut NodeData) -> Action { 39 | if let Some(ref elm) = data.as_element() { 40 | match elm.name.local { 41 | // The default Ammonia::Builder::tags whitelist should be kept 42 | t::A | t::ABBR | t::ACRONYM | t::AREA | t::ARTICLE | t::ASIDE | 43 | t::B | t::BDI | t::BDO | t::BLOCKQUOTE | t::BR | t::CAPTION | 44 | t::CENTER | t::CITE | t::CODE | t::COL | t::COLGROUP | t::DATA | 45 | t::DD | t::DEL | t::DETAILS | t::DFN | t::DIV | t::DL | t::DT | 46 | t::EM | t::FIGCAPTION | t::FIGURE | t::FOOTER | 47 | t::H1 | t::H2 | t::H3 | t::H4 | t::H5 | t::H6 | 48 | t::HEADER | t::HGROUP | t::HR | t::I | t::IMG | t::INS | t::KBD | 49 | t::LI | t::MAP | t::MARK | t::NAV | t::OL | t::P | t::PRE | t::Q | 50 | t::RP | t::RT | t::RTC | t::RUBY | t::S | t::SAMP | t::SMALL | 51 | t::SPAN | t::STRIKE | t::STRONG | t::SUB | t::SUMMARY | t::SUP | 52 | t::TABLE | t::TBODY | t::TD | t::TH | t::THEAD | t::TIME | t::TR | 53 | t::TT | t::U | t::UL | t::VAR | t::WBR 54 | => (), 55 | _ => return Action::Fold, 56 | } 57 | } 58 | Action::Continue 59 | } 60 | 61 | // Set the `` `rel` attribute based on default Ammonia::Builder::link_rel 62 | // 63 | // Ammonia removes the rel attribute before adding this at end of attributes, 64 | // so do the same here. 
65 | fn link_rel(_p: NodeRef<'_>, data: &mut NodeData) -> Action { 66 | if let Some(elm) = data.as_element_mut() { 67 | if elm.is_elem(t::A) { 68 | // Ensure one rel attribute at end by removing first 69 | elm.remove_attr(a::REL); 70 | elm.set_attr(a::REL, "noopener noreferrer"); 71 | } 72 | } 73 | Action::Continue 74 | } 75 | 76 | #[bench] 77 | fn b40_marked_parse_only(b: &mut Bencher) { 78 | 79 | let mut frag = String::new(); 80 | sample_file("github-dekellum-frag.html") 81 | .expect("sample_file") 82 | .read_to_string(&mut frag) 83 | .expect("read_to_string"); 84 | let frag = frag.trim(); 85 | b.iter(|| { 86 | parse_utf8_fragment(frag.as_bytes()); 87 | }); 88 | } 89 | 90 | #[bench] 91 | fn b41_marked_clean(b: &mut Bencher) { 92 | 93 | let mut frag = String::new(); 94 | sample_file("github-dekellum-frag.html") 95 | .expect("sample_file") 96 | .read_to_string(&mut frag) 97 | .expect("read_to_string"); 98 | let frag = frag.trim(); 99 | b.iter(|| { 100 | let mut doc = parse_utf8_fragment(frag.as_bytes()); 101 | doc.filter_breadth(chain_filters!( 102 | detach_clean_content_tags, 103 | filter::detach_comments, 104 | filter::detach_pis, 105 | fold_non_whitelist_tags, 106 | // This is sufficient for this sample (with link_rel) but isn't the 107 | // exact same config as Ammonia defaults: 108 | filter::retain_basic_attributes, 109 | link_rel 110 | )); 111 | 112 | let out = doc.to_string(); 113 | assert_eq!(out.len(), 52062, /*"[[[{}]]]", out*/); 114 | }); 115 | } 116 | 117 | #[bench] 118 | fn b42_ammonia_clean(b: &mut Bencher) { 119 | let mut frag = String::new(); 120 | sample_file("github-dekellum-frag.html") 121 | .expect("sample_file") 122 | .read_to_string(&mut frag) 123 | .expect("read_to_string"); 124 | let frag = frag.trim(); 125 | let amm = ammonia::Builder::default(); 126 | b.iter(|| { 127 | let doc = amm.clean(&frag); 128 | let out = doc.to_string(); 129 | assert_eq!(out.len(), 52062, /*"[[[{}]]]", out*/); 130 | }); 131 | } 132 | 133 | fn sample_file(fname: 
&str) -> Result { 134 | let root = env!("CARGO_MANIFEST_DIR"); 135 | let fpath = format!("{}/samples/{}", root, fname); 136 | File::open(fpath) 137 | } 138 | -------------------------------------------------------------------------------- /marked-cli/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## 0.3.1 (2021-2-1) 2 | * Broaden log dependency to include 0.4.14. 3 | 4 | * Add clippy config for primordial MSRV build.rs and for current MSRV. 5 | 6 | ## 0.3.0 (2021-1-3) 7 | 8 | * Update to _marked_ 0.3.0. 9 | 10 | ## 0.2.0 (2020-4-12) 11 | 12 | * Update to _marked_ 0.2.0. 13 | 14 | * Properly constrain dependencies (#1) 15 | 16 | ## 0.1.0 (2020-3-15) 17 | 18 | * Initial release. 19 | -------------------------------------------------------------------------------- /marked-cli/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "marked-cli" 3 | version = "0.3.1" 4 | authors = ["David Kellum "] 5 | license = "MIT/Apache-2.0" 6 | description = "Command line tool for markup I/O processing" 7 | repository = "https://github.com/dekellum/marked" 8 | readme = "README.md" 9 | keywords = ["html", "sanitization"] 10 | categories = ["web-programming", "text-processing"] 11 | build = "build.rs" 12 | 13 | edition = "2018" 14 | 15 | [dependencies] 16 | marked = { version=">=0.3.0, <0.4.0" } 17 | html5ever = { version=">=0.25.1, <0.26" } 18 | encoding_rs = { version=">=0.8.13, <0.9" } 19 | clap = { version=">=2.33.0, <2.34", default-features=false, features=["wrap_help"] } 20 | log = { version=">=0.4.4, <0.4.15", features = ["std"] } 21 | 22 | bitflags = { version=">=1.0.0, <1.3", default-features=false } #max transitive 23 | 24 | [[bin]] 25 | name = "marked" 26 | path = "src/main.rs" 27 | doctest = false 28 | bench = false 29 | doc = false 30 | test = false 31 | -------------------------------------------------------------------------------- /marked-cli/README.md: 
-------------------------------------------------------------------------------- 1 | # marked-cli 2 | 3 | [![Change Log](https://img.shields.io/crates/v/marked-cli.svg?maxAge=3600&label=change%20log&color=9cf)](https://github.com/dekellum/marked/blob/main/marked-cli/CHANGELOG.md) 4 | [![Crates.io](https://img.shields.io/crates/v/marked.svg?maxAge=3600)](https://crates.io/crates/marked-cli) 5 | [![CI Status](https://github.com/dekellum/marked/workflows/CI/badge.svg?branch=main)](https://github.com/dekellum/marked/actions?query=workflow%3ACI) 6 | 7 | A command line tool for HTML/XML markup I/O processing, using the _[marked]_ library crate. 8 | 9 | See the linked rustdoc or the project workspace root [../README] for an 10 | overview. 11 | 12 | ## Minimum supported rust version 13 | 14 | MSRV := 1.38.0 15 | 16 | The crate will fail fast on any lower rustc (via a build.rs version 17 | check) and is also CI tested on this version. 18 | 19 | ## License 20 | 21 | This project is dual licensed under either of the following: 22 | 23 | * The Apache License, version 2.0 24 | ([../LICENSE-APACHE] or http://www.apache.org/licenses/LICENSE-2.0) 25 | 26 | * The MIT License 27 | ([../LICENSE-MIT] or http://opensource.org/licenses/MIT) 28 | 29 | ### Contribution 30 | 31 | Unless you explicitly state otherwise, any contribution intentionally submitted 32 | for inclusion in _marked_(-_cli_) by you, as defined by the Apache License, 33 | shall be dual licensed as above, without any additional terms or conditions.
34 | 35 | [marked]: https://docs.rs/crate/marked 36 | [../README]: https://github.com/dekellum/marked#readme 37 | [../LICENSE-APACHE]: https://github.com/dekellum/marked/tree/main/LICENSE-APACHE 38 | [../LICENSE-MIT]: https://github.com/dekellum/marked/tree/main/LICENSE-MIT 39 | -------------------------------------------------------------------------------- /marked-cli/build.rs: -------------------------------------------------------------------------------- 1 | #![cfg_attr(feature = "cargo-clippy", allow(clippy::all))] 2 | 3 | use std::env; 4 | use std::process::Command; 5 | 6 | fn main() { 7 | static PACKAGE: &'static str = "marked-cli"; 8 | let msrv = vec![1, 38]; 9 | 10 | static VERSION: &'static str = env!("CARGO_PKG_VERSION"); 11 | static M_V: &'static str = "minimum supported rust version (MSRV)"; 12 | 13 | let rustv = rustc_version(); 14 | 15 | if rustv < msrv { 16 | panic!( 17 | "{} v{} {} is {} > {} (this rustc)", 18 | PACKAGE, VERSION, M_V, join(&msrv), join(&rustv)); 19 | } 20 | } 21 | 22 | fn join(ver: &[u16]) -> String { 23 | let mut out = String::new(); 24 | for v in ver { 25 | if !out.is_empty() { out.push('.'); } 26 | out.push_str(&v.to_string()); 27 | } 28 | out 29 | } 30 | 31 | // Parse `rustc --version` and return as vector of integers, or panic. 
32 | fn rustc_version() -> Vec { 33 | let rustc = env::var("RUSTC").unwrap_or("rustc".to_owned()); 34 | let out = Command::new(rustc).arg("--version").output().unwrap(); 35 | let out = String::from_utf8(out.stdout).unwrap(); 36 | for l in out.lines() { 37 | if l.starts_with("rustc ") { 38 | let mut v = &l[6..]; 39 | if let Some(e) = v.find(" ") { 40 | v = &v[..e]; 41 | } 42 | let mut vp = v.split("-"); 43 | if let Some(v) = vp.next() { 44 | let vs: Vec = v.split(".") 45 | .filter_map(|vss| vss.parse().ok()) 46 | .collect(); 47 | if !vs.is_empty() { 48 | return vs; 49 | } 50 | } 51 | } 52 | } 53 | panic!("rustc version not found") 54 | } 55 | -------------------------------------------------------------------------------- /marked-cli/clippy.toml: -------------------------------------------------------------------------------- 1 | msrv = "1.38.0" 2 | -------------------------------------------------------------------------------- /marked-cli/src/main.rs: -------------------------------------------------------------------------------- 1 | #![warn(rust_2018_idioms)] 2 | 3 | use std::error::Error as StdError; 4 | use std::fmt; 5 | use std::io; 6 | use std::process; 7 | use std::fs::File; 8 | 9 | use encoding_rs as enc; 10 | 11 | use marked::{ 12 | chain_filters, 13 | filter, 14 | html::parse_buffered, 15 | logger::setup_logger, 16 | EncodingHint, 17 | }; 18 | 19 | use clap::{ 20 | crate_version, 21 | Arg, App, AppSettings, SubCommand, 22 | }; 23 | 24 | use log::{debug, error}; 25 | 26 | // Conveniently compact type alias for dyn Trait `std::error::Error`. 27 | type Flaw = Box; 28 | 29 | #[derive(Debug)] 30 | pub(crate) struct ClError(String); 31 | 32 | impl fmt::Display for ClError { 33 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 34 | fmt::Display::fmt(&self.0, f) 35 | } 36 | } 37 | 38 | impl StdError for ClError {} 39 | 40 | macro_rules! 
quit { 41 | ($($args:tt)+) => { 42 | return Err($crate::ClError(format!($($args)+)).into()) 43 | }; 44 | } 45 | 46 | fn main() { 47 | let r = run(); 48 | if let Err(e) = r { 49 | error!("{}", e); 50 | process::exit(2); 51 | } 52 | } 53 | 54 | fn run() -> Result<(), Flaw> { 55 | let html = SubCommand::with_name("html") 56 | .setting(AppSettings::DeriveDisplayOrder) 57 | .about("HTML processing") 58 | .after_help( 59 | "Parses input, applies filters, and serializes to output.") 60 | .args(&[ 61 | Arg::with_name("output") 62 | .short("o") 63 | .long("output") 64 | .number_of_values(1) 65 | .help("Output to specified file (default: STDOUT)"), 66 | Arg::with_name("encoding") 67 | .short("e") 68 | .long("encoding") 69 | .number_of_values(1) 70 | .multiple(true) 71 | .help("Hint at input encoding label (default: UTF-8)"), 72 | Arg::with_name("filter-banned") 73 | .short("f") 74 | .long("filter-banned") 75 | .help("Filter banned tags and attributes"), 76 | Arg::with_name("text-normalize") 77 | .short("t") 78 | .long("text-normalize") 79 | .help("Aggressively normalize document text"), 80 | Arg::with_name("file") 81 | .required(false) 82 | .value_name("INPUT-FILE") 83 | .help("File path to read (default: STDIN)") 84 | ]); 85 | 86 | let app = App::new("marked") 87 | .version(crate_version!()) 88 | .about("Tool for *ML I/O filtering") 89 | .setting(AppSettings::SubcommandRequired) 90 | .setting(AppSettings::DeriveDisplayOrder) 91 | .max_term_width(100) 92 | .arg(Arg::with_name("debug") 93 | .short("d") 94 | .long("debug") 95 | .multiple(true) 96 | .help("Enable more logging, and up to `-dddd`") 97 | .global(true)) 98 | .subcommand(html); 99 | 100 | let mtch = app.get_matches(); 101 | setup_logger(mtch.occurrences_of("debug") as u32)?; 102 | 103 | let scname = mtch.subcommand_name().unwrap(); // required 104 | if scname != "html" { 105 | quit!("only html (command) processing is supported") 106 | } 107 | let mtch = mtch.subcommand_matches(scname).unwrap(); 108 | 109 | let eh = 
EncodingHint::shared_default(enc::UTF_8); 110 | 111 | if let Some(vals) = mtch.values_of("encoding") { 112 | for enc in vals { 113 | eh.borrow_mut().add_label_hint(&enc, 0.11); 114 | } 115 | debug!("encoding hint {:?}", eh.borrow()); 116 | } 117 | 118 | let fin = mtch.value_of("file"); 119 | let mut input: Box = if let Some(fin) = fin { 120 | Box::new(File::open(fin)?) 121 | } else { 122 | Box::new(io::stdin()) 123 | }; 124 | 125 | let mut doc = parse_buffered(eh, &mut input)?; 126 | 127 | // FIXME: report non-fatal errors? 128 | 129 | if mtch.is_present("filter-banned") { 130 | doc.filter_breadth(chain_filters!( 131 | filter::detach_banned_elements, 132 | filter::detach_comments, 133 | filter::detach_pis, 134 | filter::retain_basic_attributes, 135 | filter::xmp_to_pre, 136 | )); 137 | } 138 | 139 | if mtch.is_present("text-normalize") { 140 | doc.filter(filter::fold_empty_inline); 141 | doc.filter(filter::text_normalize); // Always use new pass. 142 | } 143 | 144 | let fout = mtch.value_of("output"); 145 | let mut output: Box = if let Some(fout) = fout { 146 | if Some(fout) != fin { 147 | Box::new(File::create(fout)?) 148 | } else { 149 | quit!( 150 | "input {} same as output {} not supported", 151 | fin.unwrap(), fout); 152 | } 153 | } else { 154 | Box::new(io::stdout()) 155 | }; 156 | 157 | doc.serialize(&mut output)?; 158 | 159 | Ok(()) 160 | } 161 | -------------------------------------------------------------------------------- /marked-sanitizer/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## 0.0.0 (2020-3-16) 2 | 3 | * Just a name reservation. 4 | -------------------------------------------------------------------------------- /marked-sanitizer/Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 
3 | [[package]] 4 | name = "ammonia" 5 | version = "3.1.0" 6 | source = "registry+https://github.com/rust-lang/crates.io-index" 7 | checksum = "89eac85170f4b3fb3dc5e442c1cfb036cb8eecf9dbbd431a161ffad15d90ea3b" 8 | dependencies = [ 9 | "html5ever", 10 | "lazy_static", 11 | "maplit", 12 | "markup5ever_rcdom", 13 | "matches", 14 | "tendril", 15 | "url", 16 | ] 17 | 18 | [[package]] 19 | name = "cfg-if" 20 | version = "0.1.10" 21 | source = "registry+https://github.com/rust-lang/crates.io-index" 22 | checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" 23 | 24 | [[package]] 25 | name = "cfg-if" 26 | version = "1.0.0" 27 | source = "registry+https://github.com/rust-lang/crates.io-index" 28 | checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" 29 | 30 | [[package]] 31 | name = "encoding_rs" 32 | version = "0.8.26" 33 | source = "registry+https://github.com/rust-lang/crates.io-index" 34 | checksum = "801bbab217d7f79c0062f4f7205b5d4427c6d1a7bd7aafdd1475f7c59d62b283" 35 | dependencies = [ 36 | "cfg-if 1.0.0", 37 | ] 38 | 39 | [[package]] 40 | name = "form_urlencoded" 41 | version = "1.0.0" 42 | source = "registry+https://github.com/rust-lang/crates.io-index" 43 | checksum = "ece68d15c92e84fa4f19d3780f1294e5ca82a78a6d515f1efaabcc144688be00" 44 | dependencies = [ 45 | "matches", 46 | "percent-encoding", 47 | ] 48 | 49 | [[package]] 50 | name = "futf" 51 | version = "0.1.4" 52 | source = "registry+https://github.com/rust-lang/crates.io-index" 53 | checksum = "7c9c1ce3fa9336301af935ab852c437817d14cd33690446569392e65170aac3b" 54 | dependencies = [ 55 | "mac", 56 | "new_debug_unreachable", 57 | ] 58 | 59 | [[package]] 60 | name = "getrandom" 61 | version = "0.1.16" 62 | source = "registry+https://github.com/rust-lang/crates.io-index" 63 | checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce" 64 | dependencies = [ 65 | "cfg-if 1.0.0", 66 | "libc", 67 | "wasi 0.9.0+wasi-snapshot-preview1", 68 | ] 69 | 70 | 
[[package]] 71 | name = "html5ever" 72 | version = "0.25.1" 73 | source = "registry+https://github.com/rust-lang/crates.io-index" 74 | checksum = "aafcf38a1a36118242d29b92e1b08ef84e67e4a5ed06e0a80be20e6a32bfed6b" 75 | dependencies = [ 76 | "log", 77 | "mac", 78 | "markup5ever", 79 | "proc-macro2", 80 | "quote", 81 | "syn", 82 | ] 83 | 84 | [[package]] 85 | name = "idna" 86 | version = "0.2.0" 87 | source = "registry+https://github.com/rust-lang/crates.io-index" 88 | checksum = "02e2673c30ee86b5b96a9cb52ad15718aa1f966f5ab9ad54a8b95d5ca33120a9" 89 | dependencies = [ 90 | "matches", 91 | "unicode-bidi", 92 | "unicode-normalization", 93 | ] 94 | 95 | [[package]] 96 | name = "itoa" 97 | version = "0.4.7" 98 | source = "registry+https://github.com/rust-lang/crates.io-index" 99 | checksum = "dd25036021b0de88a0aff6b850051563c6516d0bf53f8638938edbb9de732736" 100 | 101 | [[package]] 102 | name = "lazy_static" 103 | version = "1.4.0" 104 | source = "registry+https://github.com/rust-lang/crates.io-index" 105 | checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" 106 | 107 | [[package]] 108 | name = "libc" 109 | version = "0.2.82" 110 | source = "registry+https://github.com/rust-lang/crates.io-index" 111 | checksum = "89203f3fba0a3795506acaad8ebce3c80c0af93f994d5a1d7a0b1eeb23271929" 112 | 113 | [[package]] 114 | name = "log" 115 | version = "0.4.11" 116 | source = "registry+https://github.com/rust-lang/crates.io-index" 117 | checksum = "4fabed175da42fed1fa0746b0ea71f412aa9d35e76e95e59b192c64b9dc2bf8b" 118 | dependencies = [ 119 | "cfg-if 0.1.10", 120 | ] 121 | 122 | [[package]] 123 | name = "mac" 124 | version = "0.1.1" 125 | source = "registry+https://github.com/rust-lang/crates.io-index" 126 | checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" 127 | 128 | [[package]] 129 | name = "maplit" 130 | version = "1.0.2" 131 | source = "registry+https://github.com/rust-lang/crates.io-index" 132 | checksum = 
"3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d" 133 | 134 | [[package]] 135 | name = "marked" 136 | version = "0.3.0" 137 | dependencies = [ 138 | "encoding_rs", 139 | "html5ever", 140 | "lazy_static", 141 | "log", 142 | "mime", 143 | "string_cache", 144 | "tendril", 145 | ] 146 | 147 | [[package]] 148 | name = "marked-sanitizer" 149 | version = "0.0.0" 150 | dependencies = [ 151 | "ammonia", 152 | "marked", 153 | ] 154 | 155 | [[package]] 156 | name = "markup5ever" 157 | version = "0.10.0" 158 | source = "registry+https://github.com/rust-lang/crates.io-index" 159 | checksum = "aae38d669396ca9b707bfc3db254bc382ddb94f57cc5c235f34623a669a01dab" 160 | dependencies = [ 161 | "log", 162 | "phf", 163 | "phf_codegen", 164 | "serde", 165 | "serde_derive", 166 | "serde_json", 167 | "string_cache", 168 | "string_cache_codegen", 169 | "tendril", 170 | ] 171 | 172 | [[package]] 173 | name = "markup5ever_rcdom" 174 | version = "0.1.0" 175 | source = "registry+https://github.com/rust-lang/crates.io-index" 176 | checksum = "f015da43bcd8d4f144559a3423f4591d69b8ce0652c905374da7205df336ae2b" 177 | dependencies = [ 178 | "html5ever", 179 | "markup5ever", 180 | "tendril", 181 | "xml5ever", 182 | ] 183 | 184 | [[package]] 185 | name = "matches" 186 | version = "0.1.8" 187 | source = "registry+https://github.com/rust-lang/crates.io-index" 188 | checksum = "7ffc5c5338469d4d3ea17d269fa8ea3512ad247247c30bd2df69e68309ed0a08" 189 | 190 | [[package]] 191 | name = "mime" 192 | version = "0.3.16" 193 | source = "registry+https://github.com/rust-lang/crates.io-index" 194 | checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d" 195 | 196 | [[package]] 197 | name = "new_debug_unreachable" 198 | version = "1.0.4" 199 | source = "registry+https://github.com/rust-lang/crates.io-index" 200 | checksum = "e4a24736216ec316047a1fc4252e27dabb04218aa4a3f37c6e7ddbf1f9782b54" 201 | 202 | [[package]] 203 | name = "percent-encoding" 204 | version = "2.1.0" 205 | source 
= "registry+https://github.com/rust-lang/crates.io-index" 206 | checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e" 207 | 208 | [[package]] 209 | name = "phf" 210 | version = "0.8.0" 211 | source = "registry+https://github.com/rust-lang/crates.io-index" 212 | checksum = "3dfb61232e34fcb633f43d12c58f83c1df82962dcdfa565a4e866ffc17dafe12" 213 | dependencies = [ 214 | "phf_shared", 215 | ] 216 | 217 | [[package]] 218 | name = "phf_codegen" 219 | version = "0.8.0" 220 | source = "registry+https://github.com/rust-lang/crates.io-index" 221 | checksum = "cbffee61585b0411840d3ece935cce9cb6321f01c45477d30066498cd5e1a815" 222 | dependencies = [ 223 | "phf_generator", 224 | "phf_shared", 225 | ] 226 | 227 | [[package]] 228 | name = "phf_generator" 229 | version = "0.8.0" 230 | source = "registry+https://github.com/rust-lang/crates.io-index" 231 | checksum = "17367f0cc86f2d25802b2c26ee58a7b23faeccf78a396094c13dced0d0182526" 232 | dependencies = [ 233 | "phf_shared", 234 | "rand", 235 | ] 236 | 237 | [[package]] 238 | name = "phf_shared" 239 | version = "0.8.0" 240 | source = "registry+https://github.com/rust-lang/crates.io-index" 241 | checksum = "c00cf8b9eafe68dde5e9eaa2cef8ee84a9336a47d566ec55ca16589633b65af7" 242 | dependencies = [ 243 | "siphasher", 244 | ] 245 | 246 | [[package]] 247 | name = "ppv-lite86" 248 | version = "0.2.10" 249 | source = "registry+https://github.com/rust-lang/crates.io-index" 250 | checksum = "ac74c624d6b2d21f425f752262f42188365d7b8ff1aff74c82e45136510a4857" 251 | 252 | [[package]] 253 | name = "precomputed-hash" 254 | version = "0.1.1" 255 | source = "registry+https://github.com/rust-lang/crates.io-index" 256 | checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" 257 | 258 | [[package]] 259 | name = "proc-macro2" 260 | version = "1.0.24" 261 | source = "registry+https://github.com/rust-lang/crates.io-index" 262 | checksum = "1e0704ee1a7e00d7bb417d0770ea303c1bccbabf0ef1667dae92b5967f5f8a71" 263 | 
dependencies = [ 264 | "unicode-xid", 265 | ] 266 | 267 | [[package]] 268 | name = "quote" 269 | version = "1.0.8" 270 | source = "registry+https://github.com/rust-lang/crates.io-index" 271 | checksum = "991431c3519a3f36861882da93630ce66b52918dcf1b8e2fd66b397fc96f28df" 272 | dependencies = [ 273 | "proc-macro2", 274 | ] 275 | 276 | [[package]] 277 | name = "rand" 278 | version = "0.7.3" 279 | source = "registry+https://github.com/rust-lang/crates.io-index" 280 | checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" 281 | dependencies = [ 282 | "getrandom", 283 | "libc", 284 | "rand_chacha", 285 | "rand_core", 286 | "rand_hc", 287 | "rand_pcg", 288 | ] 289 | 290 | [[package]] 291 | name = "rand_chacha" 292 | version = "0.2.2" 293 | source = "registry+https://github.com/rust-lang/crates.io-index" 294 | checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402" 295 | dependencies = [ 296 | "ppv-lite86", 297 | "rand_core", 298 | ] 299 | 300 | [[package]] 301 | name = "rand_core" 302 | version = "0.5.1" 303 | source = "registry+https://github.com/rust-lang/crates.io-index" 304 | checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" 305 | dependencies = [ 306 | "getrandom", 307 | ] 308 | 309 | [[package]] 310 | name = "rand_hc" 311 | version = "0.2.0" 312 | source = "registry+https://github.com/rust-lang/crates.io-index" 313 | checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c" 314 | dependencies = [ 315 | "rand_core", 316 | ] 317 | 318 | [[package]] 319 | name = "rand_pcg" 320 | version = "0.2.1" 321 | source = "registry+https://github.com/rust-lang/crates.io-index" 322 | checksum = "16abd0c1b639e9eb4d7c50c0b8100b0d0f849be2349829c740fe8e6eb4816429" 323 | dependencies = [ 324 | "rand_core", 325 | ] 326 | 327 | [[package]] 328 | name = "ryu" 329 | version = "1.0.5" 330 | source = "registry+https://github.com/rust-lang/crates.io-index" 331 | checksum = 
"71d301d4193d031abdd79ff7e3dd721168a9572ef3fe51a1517aba235bd8f86e" 332 | 333 | [[package]] 334 | name = "serde" 335 | version = "1.0.118" 336 | source = "registry+https://github.com/rust-lang/crates.io-index" 337 | checksum = "06c64263859d87aa2eb554587e2d23183398d617427327cf2b3d0ed8c69e4800" 338 | 339 | [[package]] 340 | name = "serde_derive" 341 | version = "1.0.118" 342 | source = "registry+https://github.com/rust-lang/crates.io-index" 343 | checksum = "c84d3526699cd55261af4b941e4e725444df67aa4f9e6a3564f18030d12672df" 344 | dependencies = [ 345 | "proc-macro2", 346 | "quote", 347 | "syn", 348 | ] 349 | 350 | [[package]] 351 | name = "serde_json" 352 | version = "1.0.61" 353 | source = "registry+https://github.com/rust-lang/crates.io-index" 354 | checksum = "4fceb2595057b6891a4ee808f70054bd2d12f0e97f1cbb78689b59f676df325a" 355 | dependencies = [ 356 | "itoa", 357 | "ryu", 358 | "serde", 359 | ] 360 | 361 | [[package]] 362 | name = "siphasher" 363 | version = "0.3.3" 364 | source = "registry+https://github.com/rust-lang/crates.io-index" 365 | checksum = "fa8f3741c7372e75519bd9346068370c9cdaabcc1f9599cbcf2a2719352286b7" 366 | 367 | [[package]] 368 | name = "string_cache" 369 | version = "0.8.1" 370 | source = "registry+https://github.com/rust-lang/crates.io-index" 371 | checksum = "8ddb1139b5353f96e429e1a5e19fbaf663bddedaa06d1dbd49f82e352601209a" 372 | dependencies = [ 373 | "lazy_static", 374 | "new_debug_unreachable", 375 | "phf_shared", 376 | "precomputed-hash", 377 | "serde", 378 | ] 379 | 380 | [[package]] 381 | name = "string_cache_codegen" 382 | version = "0.5.1" 383 | source = "registry+https://github.com/rust-lang/crates.io-index" 384 | checksum = "f24c8e5e19d22a726626f1a5e16fe15b132dcf21d10177fa5a45ce7962996b97" 385 | dependencies = [ 386 | "phf_generator", 387 | "phf_shared", 388 | "proc-macro2", 389 | "quote", 390 | ] 391 | 392 | [[package]] 393 | name = "syn" 394 | version = "1.0.58" 395 | source = "registry+https://github.com/rust-lang/crates.io-index" 
396 | checksum = "cc60a3d73ea6594cd712d830cc1f0390fd71542d8c8cd24e70cc54cdfd5e05d5" 397 | dependencies = [ 398 | "proc-macro2", 399 | "quote", 400 | "unicode-xid", 401 | ] 402 | 403 | [[package]] 404 | name = "tendril" 405 | version = "0.4.1" 406 | source = "registry+https://github.com/rust-lang/crates.io-index" 407 | checksum = "707feda9f2582d5d680d733e38755547a3e8fb471e7ba11452ecfd9ce93a5d3b" 408 | dependencies = [ 409 | "encoding_rs", 410 | "futf", 411 | "mac", 412 | "utf-8", 413 | ] 414 | 415 | [[package]] 416 | name = "time" 417 | version = "0.1.44" 418 | source = "registry+https://github.com/rust-lang/crates.io-index" 419 | checksum = "6db9e6914ab8b1ae1c260a4ae7a49b6c5611b40328a735b21862567685e73255" 420 | dependencies = [ 421 | "libc", 422 | "wasi 0.10.0+wasi-snapshot-preview1", 423 | "winapi", 424 | ] 425 | 426 | [[package]] 427 | name = "tinyvec" 428 | version = "1.1.0" 429 | source = "registry+https://github.com/rust-lang/crates.io-index" 430 | checksum = "ccf8dbc19eb42fba10e8feaaec282fb50e2c14b2726d6301dbfeed0f73306a6f" 431 | dependencies = [ 432 | "tinyvec_macros", 433 | ] 434 | 435 | [[package]] 436 | name = "tinyvec_macros" 437 | version = "0.1.0" 438 | source = "registry+https://github.com/rust-lang/crates.io-index" 439 | checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" 440 | 441 | [[package]] 442 | name = "unicode-bidi" 443 | version = "0.3.4" 444 | source = "registry+https://github.com/rust-lang/crates.io-index" 445 | checksum = "49f2bd0c6468a8230e1db229cff8029217cf623c767ea5d60bfbd42729ea54d5" 446 | dependencies = [ 447 | "matches", 448 | ] 449 | 450 | [[package]] 451 | name = "unicode-normalization" 452 | version = "0.1.16" 453 | source = "registry+https://github.com/rust-lang/crates.io-index" 454 | checksum = "a13e63ab62dbe32aeee58d1c5408d35c36c392bba5d9d3142287219721afe606" 455 | dependencies = [ 456 | "tinyvec", 457 | ] 458 | 459 | [[package]] 460 | name = "unicode-xid" 461 | version = "0.2.1" 462 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 463 | checksum = "f7fe0bb3479651439c9112f72b6c505038574c9fbb575ed1bf3b797fa39dd564" 464 | 465 | [[package]] 466 | name = "url" 467 | version = "2.2.0" 468 | source = "registry+https://github.com/rust-lang/crates.io-index" 469 | checksum = "5909f2b0817350449ed73e8bcd81c8c3c8d9a7a5d8acba4b27db277f1868976e" 470 | dependencies = [ 471 | "form_urlencoded", 472 | "idna", 473 | "matches", 474 | "percent-encoding", 475 | ] 476 | 477 | [[package]] 478 | name = "utf-8" 479 | version = "0.7.5" 480 | source = "registry+https://github.com/rust-lang/crates.io-index" 481 | checksum = "05e42f7c18b8f902290b009cde6d651262f956c98bc51bca4cd1d511c9cd85c7" 482 | 483 | [[package]] 484 | name = "wasi" 485 | version = "0.9.0+wasi-snapshot-preview1" 486 | source = "registry+https://github.com/rust-lang/crates.io-index" 487 | checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" 488 | 489 | [[package]] 490 | name = "wasi" 491 | version = "0.10.0+wasi-snapshot-preview1" 492 | source = "registry+https://github.com/rust-lang/crates.io-index" 493 | checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f" 494 | 495 | [[package]] 496 | name = "winapi" 497 | version = "0.3.9" 498 | source = "registry+https://github.com/rust-lang/crates.io-index" 499 | checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" 500 | dependencies = [ 501 | "winapi-i686-pc-windows-gnu", 502 | "winapi-x86_64-pc-windows-gnu", 503 | ] 504 | 505 | [[package]] 506 | name = "winapi-i686-pc-windows-gnu" 507 | version = "0.4.0" 508 | source = "registry+https://github.com/rust-lang/crates.io-index" 509 | checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" 510 | 511 | [[package]] 512 | name = "winapi-x86_64-pc-windows-gnu" 513 | version = "0.4.0" 514 | source = "registry+https://github.com/rust-lang/crates.io-index" 515 | checksum = 
"712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" 516 | 517 | [[package]] 518 | name = "xml5ever" 519 | version = "0.16.1" 520 | source = "registry+https://github.com/rust-lang/crates.io-index" 521 | checksum = "0b1b52e6e8614d4a58b8e70cf51ec0cc21b256ad8206708bcff8139b5bbd6a59" 522 | dependencies = [ 523 | "log", 524 | "mac", 525 | "markup5ever", 526 | "time", 527 | ] 528 | -------------------------------------------------------------------------------- /marked-sanitizer/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "marked-sanitizer" 3 | version = "0.0.0" 4 | authors = ["David Kellum "] 5 | edition = "2018" 6 | license = "MIT/Apache-2.0" 7 | description = "Sanitizer for Märkəd" 8 | repository = "https://github.com/dekellum/marked" 9 | readme = "README.md" 10 | keywords = ["html", "sanitization"] 11 | categories = ["web-programming", "text-processing"] 12 | build = "build.rs" 13 | 14 | [lib] 15 | doctest = false 16 | 17 | [dependencies] 18 | marked = { version=">=0.3.0", path="../marked" } 19 | ammonia = { version=">=3.1.0, <3.2" } 20 | -------------------------------------------------------------------------------- /marked-sanitizer/README.md: -------------------------------------------------------------------------------- 1 | # marked-sanitizer 2 | 3 | [![Rustdoc](https://docs.rs/marked-sanitizer/badge.svg)](https://docs.rs/marked-sanitizer) 4 | [![Change Log](https://img.shields.io/crates/v/marked-sanitizer.svg?maxAge=3600&label=change%20log&color=9cf)](https://github.com/dekellum/marked/blob/main/marked-sanitizer/CHANGELOG.md) 5 | [![Crates.io](https://img.shields.io/crates/v/marked-sanitizer.svg?maxAge=3600)](https://crates.io/crates/marked-sanitizer) 6 | [![CI Status](https://github.com/dekellum/marked/workflows/CI/badge.svg?branch=main)](https://github.com/dekellum/marked/actions?query=workflow%3ACI) 7 | 8 | For now, just reserving the name for potential use as a 
dedicated sanitizer 9 | crate. Such would presumably be (`Builder`) API compatible with the _[ammonia]_ 10 | crate. See source tree [../ammonia-compare] for a working prototype of the 11 | compatible filtering. 12 | 13 | ## License 14 | 15 | This project is dual licensed under either of the following: 16 | 17 | * The Apache License, version 2.0 ([../LICENSE-APACHE] 18 | or http://www.apache.org/licenses/LICENSE-2.0) 19 | 20 | * The MIT License ([../LICENSE-MIT] 21 | or http://opensource.org/licenses/MIT) 22 | 23 | ### Contribution 24 | 25 | Unless you explicitly state otherwise, any contribution intentionally submitted 26 | for inclusion in _märkəd_ (marked-sanitizer) by you, as defined by the Apache 27 | License, shall be dual licensed as above, without any additional terms or 28 | conditions. 29 | 30 | [ammonia]: https://crates.io/crates/ammonia 31 | [../ammonia-compare]: https://github.com/dekellum/marked/tree/main/ammonia-compare 32 | [../LICENSE-APACHE]: https://github.com/dekellum/marked/tree/main/LICENSE-APACHE 33 | [../LICENSE-MIT]: https://github.com/dekellum/marked/tree/main/LICENSE-MIT 34 | -------------------------------------------------------------------------------- /marked-sanitizer/build.rs: -------------------------------------------------------------------------------- 1 | use std::env; 2 | use std::process::Command; 3 | 4 | fn main() { 5 | static PACKAGE: &'static str = "marked-sanitizer"; 6 | let msrv = vec![1, 38]; 7 | 8 | static VERSION: &'static str = env!("CARGO_PKG_VERSION"); 9 | static M_V: &'static str = "minimum supported rust version (MSRV)"; 10 | 11 | let rustv = rustc_version(); 12 | 13 | if rustv < msrv { 14 | panic!( 15 | "{} v{} {} is {} > {} (this rustc)", 16 | PACKAGE, VERSION, M_V, join(&msrv), join(&rustv)); 17 | } 18 | } 19 | 20 | fn join(ver: &[u16]) -> String { 21 | let mut out = String::new(); 22 | for v in ver { 23 | if !out.is_empty() { out.push('.'); } 24 | out.push_str(&v.to_string()); 25 | } 26 | out 27 | } 28 | 29 | //
Parse `rustc --version` and return as vector of integers, or panic. 30 | fn rustc_version() -> Vec { 31 | let rustc = env::var("RUSTC").unwrap_or("rustc".to_owned()); 32 | let out = Command::new(rustc).arg("--version").output().unwrap(); 33 | let out = String::from_utf8(out.stdout).unwrap(); 34 | for l in out.lines() { 35 | if l.starts_with("rustc ") { 36 | let mut v = &l[6..]; 37 | if let Some(e) = v.find(" ") { 38 | v = &v[..e]; 39 | } 40 | let mut vp = v.split("-"); 41 | if let Some(v) = vp.next() { 42 | let vs: Vec = v.split(".") 43 | .filter_map(|vss| vss.parse().ok()) 44 | .collect(); 45 | if !vs.is_empty() { 46 | return vs; 47 | } 48 | } 49 | } 50 | } 51 | panic!("rustc version not found") 52 | } 53 | -------------------------------------------------------------------------------- /marked-sanitizer/src/lib.rs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dekellum/marked/1010a62cfd57e8ecd0ce505f57a3a4e744b1cba1/marked-sanitizer/src/lib.rs -------------------------------------------------------------------------------- /marked/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## 0.3.0 (2021-1-3) 2 | 3 | * `Document::len()` now returns u32 for compatibility to `with_capacity`, etc. 4 | 5 | * `Document::fold()` now replaces the node's data with `NodeData::Hole` and 6 | returns the original `NodeData`. 7 | 8 | * `Document::detach()` now replaces all descendant node data with 9 | `NodeData::Hole` and returns a new independent `Document` fragment which must 10 | be used. Added `Document::unlink()` for cases where a returned fragment is not 11 | required. 12 | 13 | * Added `Document::attach_child()` and `attach_before_sibling()` as logical 14 | inverses to `detach()`. 15 | 16 | * Added `Document::descendants()` and `NodeRef::descendants()`, as a more general 17 | form of `nodes()`.
18 | 19 | * Made `&self` lifetimes more lenient for many `NodeRef` methods. 20 | 21 | * Misc memory use optimizations in the form of better capacity guesses and 22 | selective application of `shrink_to_fit` based on tested cost, and the 23 | likelihood of the latter causing a memory move by the allocator. 24 | 25 | * Broaden log dependency to include later 0.4.x patch releases. 26 | 27 | ## 0.2.0 (2020-4-12) 28 | 29 | * The `marked::xml` module and xml-rs dependency are now under a non-default 30 | _xml_ feature. The xml-rs crate appears to not manage or test MSRV. Patch 31 | release 0.8.1 of xml-rs no longer builds on rust 1.38.0 (our MSRV). A 32 | workaround for our users is to constrain to xml-rs 0.8.0. 33 | 34 | * Replace unnamed `NodeData` enum structs with `DocumentType` and 35 | `ProcessingInstruction`. To these types and `Element`, add a private 36 | zero-size member for future proofing. 37 | 38 | * Add `Document::with_capacity` constructor, `Document::len` and 39 | `Document::is_empty` for visibility to capacity and occupied length of 40 | `Node`s. 41 | 42 | * Expose `Document::append_deep_clone` for appending a sub-graph from another 43 | document. 44 | 45 | * Add in-place `Document::compact`, found more efficient than `deep_clone` and 46 | drop of the original. 47 | 48 | * `Node` now implements `Clone` including parent/child/sibling references. 49 | 50 | * Add `Document::bulk_clone` for a faster clone without removing non-reachable 51 | nodes. 52 | 53 | * Properly constrain dependencies (#1) 54 | 55 | * Added various structural debug asserts. For example, for which `NodeData` 56 | variants can have children nodes, or where `NodeData::Document` and `Hole` 57 | nodes are applicable. 58 | 59 | * XML `Whitespace` events (a subcase of text) are now ignored on parse. 60 | 61 | ## 0.1.0 (2020-3-15) 62 | 63 | * Initial release.
64 | -------------------------------------------------------------------------------- /marked/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "marked" 3 | version = "0.3.0" 4 | authors = ["David Kellum "] 5 | edition = "2018" 6 | license = "MIT/Apache-2.0" 7 | description = "Parsing, filtering, selecting and serializing HTML/XML markup." 8 | repository = "https://github.com/dekellum/marked" 9 | readme = "README.md" 10 | keywords = ["html", "sanitization"] 11 | categories = ["web-programming", "text-processing"] 12 | build = "build.rs" 13 | 14 | [lib] 15 | doctest = false 16 | 17 | [dependencies] 18 | html5ever = { version=">=0.25.1, <0.26" } 19 | tendril = { version=">=0.4.1, <0.5", features=["encoding_rs"] } 20 | encoding_rs = { version=">=0.8.13, <0.9" } 21 | xml-rs = { version=">=0.8, <0.9", package="xml-rs", optional=true } 22 | string_cache = { version=">=0.8.0, <0.9" } 23 | mime = { version=">=0.3.14, <0.4" } 24 | log = { version=">=0.4.4, <0.5", features = ["std"] } 25 | lazy_static = { version=">=1.3.0, <1.5" } 26 | 27 | [features] 28 | default = [] 29 | xml = ["xml-rs"] 30 | 31 | [dev-dependencies] 32 | rand = { version=">=0.7.0, <0.8" } 33 | markup5ever_rcdom = { git="https://github.com/dekellum/html5ever", branch="rcdom" } 34 | 35 | [package.metadata.docs.rs] 36 | features = ["xml"] 37 | -------------------------------------------------------------------------------- /marked/README.md: -------------------------------------------------------------------------------- 1 | # marked 2 | 3 | [![Rustdoc](https://docs.rs/marked/badge.svg)](https://docs.rs/marked) 4 | [![Change Log](https://img.shields.io/crates/v/marked.svg?maxAge=3600&label=change%20log&color=9cf)](https://github.com/dekellum/marked/blob/main/marked/CHANGELOG.md) 5 | [![Crates.io](https://img.shields.io/crates/v/marked.svg?maxAge=3600)](https://crates.io/crates/marked) 6 | [![CI 
Status](https://github.com/dekellum/marked/workflows/CI/badge.svg?branch=main)](https://github.com/dekellum/marked/actions?query=workflow%3ACI) 7 | 8 | Parsing, filtering, selecting and serializing HTML/XML markup. 9 | 10 | See the above linked rustdoc or The Märkəd Project [../README] for a feature 11 | overview. 12 | 13 | ## Optional Features 14 | 15 | The following features may be enabled at build time. **All are disabled by 16 | default, unless otherwise noted.** 17 | 18 | _xml_ 19 | : Includes the `marked::xml` module for XML support via the _xml-rs_ crate. 20 | 21 | ## Minimum supported rust version 22 | 23 | MSRV := 1.38.0 24 | 25 | The crate will fail fast on any lower rustc (via a build.rs version 26 | check) and is also CI tested on this version. 27 | 28 | Certain non-default features (e.g. _xml_) may include dependencies which have 29 | higher MSRV requirements. 30 | 31 | ## License 32 | 33 | This project is dual licensed under either of the following: 34 | 35 | * The Apache License, version 2.0 36 | ([../LICENSE-APACHE] or http://www.apache.org/licenses/LICENSE-2.0) 37 | 38 | * The MIT License 39 | ([../LICENSE-MIT] or http://opensource.org/licenses/MIT) 40 | 41 | ### Contribution 42 | 43 | Unless you explicitly state otherwise, any contribution intentionally submitted 44 | for inclusion in _märkəd_ by you, as defined by the Apache License, shall be 45 | dual licensed as above, without any additional terms or conditions.
46 | 47 | [../README]: https://github.com/dekellum/marked#readme 48 | [../LICENSE-APACHE]: https://github.com/dekellum/marked/tree/main/LICENSE-APACHE 49 | [../LICENSE-MIT]: https://github.com/dekellum/marked/tree/main/LICENSE-MIT 50 | -------------------------------------------------------------------------------- /marked/benches/round_trip.rs: -------------------------------------------------------------------------------- 1 | #![warn(rust_2018_idioms)] 2 | 3 | #![feature(test)] 4 | extern crate test; // Still required, see rust-lang/rust#55133 5 | 6 | use std::default::Default; 7 | use std::io; 8 | use std::fs::File; 9 | 10 | use test::Bencher; 11 | 12 | use encoding_rs as enc; 13 | use html5ever::driver::ParseOpts; 14 | use html5ever::parse_document; 15 | use markup5ever_rcdom::{SerializableHandle, RcDom}; 16 | use html5ever::serialize as rc_serialize; 17 | 18 | use marked; 19 | use marked::{Decoder, Document, EncodingHint}; 20 | use marked::chain_filters; 21 | use marked::filter; 22 | use marked::html::parse_buffered; 23 | 24 | #[bench] 25 | fn b00_round_trip_rcdom(b: &mut Bencher) { 26 | b.iter(|| { 27 | let parser_sink = 28 | parse_document(RcDom::default(), ParseOpts::default()); 29 | let decoder = Decoder::new(enc::UTF_8, parser_sink); 30 | let mut fin = sample_file("github-dekellum.html") 31 | .expect("sample_file"); 32 | let doc = decoder.read_to_end(&mut fin).expect("parse"); 33 | let mut out = Vec::with_capacity(273108); 34 | let ser_handle: SerializableHandle = doc.document.clone().into(); 35 | rc_serialize(&mut out, &ser_handle, Default::default()) 36 | .expect("serialization"); 37 | assert_eq!(out.len(), 272273); 38 | }); 39 | } 40 | 41 | #[bench] 42 | fn b01_round_trip_marked(b: &mut Bencher) { 43 | b.iter(|| { 44 | let mut fin = sample_file("github-dekellum.html") 45 | .expect("sample_file"); 46 | let eh = EncodingHint::shared_default(enc::UTF_8); 47 | let doc = parse_buffered(eh, &mut fin).expect("parse"); 48 | let mut out = 
Vec::with_capacity(273108); 49 | doc.serialize(&mut out).expect("serialization"); 50 | assert_eq!(out.len(), 273108); 51 | }); 52 | } 53 | 54 | #[bench] 55 | fn b11_decode_eucjp_parse_marked(b: &mut Bencher) { 56 | b.iter(|| { 57 | let mut fin = sample_file("matsunami_eucjp_meta.html") 58 | .expect("sample_file"); 59 | let eh = EncodingHint::shared_default(enc::UTF_8); 60 | parse_buffered(eh, &mut fin).expect("parse"); 61 | }); 62 | } 63 | 64 | #[bench] 65 | fn b12_decode_windows1251_parse_marked(b: &mut Bencher) { 66 | b.iter(|| { 67 | let mut fin = sample_file("russez_windows1251_meta.html") 68 | .expect("sample_file"); 69 | let eh = EncodingHint::shared_default(enc::UTF_8); 70 | parse_buffered(eh, &mut fin).expect("parse"); 71 | }); 72 | } 73 | 74 | #[bench] 75 | fn b13_utf8_parse_marked(b: &mut Bencher) { 76 | b.iter(|| { 77 | let mut fin = sample_file("github-dekellum.html") 78 | .expect("sample_file"); 79 | let eh = EncodingHint::shared_default(enc::UTF_8); 80 | parse_buffered(eh, &mut fin).expect("parse"); 81 | }); 82 | } 83 | 84 | #[bench] 85 | fn b20_text_content(b: &mut Bencher) { 86 | let mut fin = sample_file("github-dekellum.html") 87 | .expect("sample_file"); 88 | let eh = EncodingHint::shared_default(enc::UTF_8); 89 | let doc = parse_buffered(eh, &mut fin).expect("parse"); 90 | 91 | b.iter(|| { 92 | let out = doc.document_node_ref().text(); 93 | assert_eq!(out.unwrap().len32(), 31637); 94 | }); 95 | } 96 | 97 | #[bench] 98 | fn b30_text_normalize_content(b: &mut Bencher) { 99 | let mut fin = sample_file("github-dekellum.html") 100 | .expect("sample_file"); 101 | let eh = EncodingHint::shared_default(enc::UTF_8); 102 | let doc = parse_buffered(eh, &mut fin).expect("parse"); 103 | b.iter(|| { 104 | let mut doc = doc.deep_clone(doc.root_element().unwrap()); 105 | filter_all(&mut doc); 106 | let out = doc.document_node_ref().text().unwrap(); 107 | assert_eq!(out.len32(), 3257, "txt: {}", out.as_ref()); 108 | }); 109 | } 110 | 111 | #[bench] 112 | fn 
b31_text_normalize_content_identity(b: &mut Bencher) { 113 | let mut fin = sample_file("github-dekellum.html") 114 | .expect("sample_file"); 115 | let eh = EncodingHint::shared_default(enc::UTF_8); 116 | let mut doc = parse_buffered(eh, &mut fin).expect("parse"); 117 | doc.filter(chain_filters!( 118 | filter::detach_banned_elements, 119 | filter::fold_empty_inline, 120 | filter::detach_comments, 121 | filter::detach_pis, 122 | filter::retain_basic_attributes, 123 | filter::xmp_to_pre, 124 | )); 125 | doc.filter(filter::text_normalize); // Always use new pass. 126 | 127 | b.iter(|| { 128 | doc.filter(chain_filters!( 129 | filter::detach_banned_elements, 130 | filter::fold_empty_inline, 131 | filter::detach_comments, 132 | filter::detach_pis, 133 | filter::retain_basic_attributes, 134 | filter::xmp_to_pre, 135 | )); 136 | doc.filter(filter::text_normalize); // New pass is realistic 137 | let out = doc.document_node_ref().text().unwrap(); 138 | assert_eq!(out.len32(), 3257, "txt: {}", out.as_ref()); 139 | }); 140 | } 141 | 142 | // In b5*_ benches below, compare with in-place `compact()`, so need to parse 143 | // and filter each time to produce a new "sparse" document. 
144 | 145 | #[bench] 146 | fn b50_sparse_bulk_clone(b: &mut Bencher) { 147 | let mut fin = sample_file("github-dekellum.html") 148 | .expect("sample_file"); 149 | let eh = EncodingHint::shared_default(enc::UTF_8); 150 | let mut doc = parse_buffered(eh, &mut fin).expect("parse"); 151 | filter_all(&mut doc); 152 | b.iter(|| { 153 | let doc = doc.bulk_clone(); 154 | assert_eq!(5500, doc.len()); 155 | }); 156 | } 157 | 158 | #[bench] 159 | fn b51_sparse_bulk_clone_compact(b: &mut Bencher) { 160 | let mut fin = sample_file("github-dekellum.html") 161 | .expect("sample_file"); 162 | let eh = EncodingHint::shared_default(enc::UTF_8); 163 | let mut doc = parse_buffered(eh, &mut fin).expect("parse"); 164 | filter_all(&mut doc); 165 | b.iter(|| { 166 | let mut doc = doc.bulk_clone(); 167 | doc.compact(); 168 | assert_eq!(1497, doc.len()); 169 | }); 170 | } 171 | 172 | #[bench] 173 | fn b52_sparse_bulk_clone_deep_clone(b: &mut Bencher) { 174 | let mut fin = sample_file("github-dekellum.html") 175 | .expect("sample_file"); 176 | let eh = EncodingHint::shared_default(enc::UTF_8); 177 | let mut doc = parse_buffered(eh, &mut fin).expect("parse"); 178 | filter_all(&mut doc); 179 | b.iter(|| { 180 | let doc = doc.bulk_clone(); 181 | let dc = doc.deep_clone(Document::DOCUMENT_NODE_ID); 182 | assert_eq!(1497, dc.len()); 183 | }); 184 | } 185 | 186 | #[bench] 187 | fn b60_bulk_clone_unlink(b: &mut Bencher) { 188 | let mut fin = sample_file("github-dekellum.html") 189 | .expect("sample_file"); 190 | let eh = EncodingHint::shared_default(enc::UTF_8); 191 | let doc = parse_buffered(eh, &mut fin).expect("parse"); 192 | 193 | b.iter(|| { 194 | let mut doc = doc.bulk_clone(); 195 | let rid = doc.root_element().expect("root"); 196 | doc.unlink(rid); 197 | assert_eq!(5500, doc.len()); 198 | assert_eq!(2, doc.nodes().count()); 199 | }); 200 | } 201 | 202 | #[bench] 203 | fn b61_bulk_clone_detach(b: &mut Bencher) { 204 | let mut fin = sample_file("github-dekellum.html") 205 | 
.expect("sample_file"); 206 | let eh = EncodingHint::shared_default(enc::UTF_8); 207 | let doc = parse_buffered(eh, &mut fin).expect("parse"); 208 | 209 | b.iter(|| { 210 | let mut doc = doc.bulk_clone(); 211 | let rid = doc.root_element().expect("root"); 212 | let det = doc.detach(rid); 213 | assert_eq!(5499, det.len()); 214 | assert_eq!(5500, doc.len()); 215 | assert_eq!(2, doc.nodes().count()); 216 | }); 217 | } 218 | 219 | #[bench] 220 | fn b70_count(b: &mut Bencher) { 221 | let mut fin = sample_file("github-dekellum.html") 222 | .expect("sample_file"); 223 | let eh = EncodingHint::shared_default(enc::UTF_8); 224 | let doc = parse_buffered(eh, &mut fin).expect("parse"); 225 | 226 | b.iter(|| { 227 | assert_eq!(5500, doc.nodes().count()) 228 | }); 229 | } 230 | 231 | fn sample_file(fname: &str) -> Result { 232 | let root = env!("CARGO_MANIFEST_DIR"); 233 | let fpath = format!("{}/samples/{}", root, fname); 234 | File::open(fpath) 235 | } 236 | 237 | fn filter_all(doc: &mut Document) { 238 | doc.filter_breadth(chain_filters!( 239 | filter::detach_banned_elements, 240 | filter::detach_comments, 241 | filter::detach_pis, 242 | filter::retain_basic_attributes, 243 | filter::xmp_to_pre, 244 | )); 245 | 246 | doc.filter(filter::fold_empty_inline); 247 | doc.filter(filter::text_normalize); // Always use new pass. 
248 | } 249 | -------------------------------------------------------------------------------- /marked/build.rs: -------------------------------------------------------------------------------- 1 | #![cfg_attr(feature = "cargo-clippy", allow(clippy::all))] 2 | 3 | use std::env; 4 | use std::process::Command; 5 | 6 | fn main() { 7 | static PACKAGE: &'static str = "marked"; 8 | let msrv = vec![1, 38]; 9 | 10 | static VERSION: &'static str = env!("CARGO_PKG_VERSION"); 11 | static M_V: &'static str = "minimum supported rust version (MSRV)"; 12 | 13 | let rustv = rustc_version(); 14 | 15 | if rustv < msrv { 16 | panic!( 17 | "{} v{} {} is {} > {} (this rustc)", 18 | PACKAGE, VERSION, M_V, join(&msrv), join(&rustv)); 19 | } 20 | } 21 | 22 | fn join(ver: &[u16]) -> String { 23 | let mut out = String::new(); 24 | for v in ver { 25 | if !out.is_empty() { out.push('.'); } 26 | out.push_str(&v.to_string()); 27 | } 28 | out 29 | } 30 | 31 | // Parse `rustc --version` and return as vector of integers, or panic. 
32 | fn rustc_version() -> Vec { 33 | let rustc = env::var("RUSTC").unwrap_or("rustc".to_owned()); 34 | let out = Command::new(rustc).arg("--version").output().unwrap(); 35 | let out = String::from_utf8(out.stdout).unwrap(); 36 | for l in out.lines() { 37 | if l.starts_with("rustc ") { 38 | let mut v = &l[6..]; 39 | if let Some(e) = v.find(" ") { 40 | v = &v[..e]; 41 | } 42 | let mut vp = v.split("-"); 43 | if let Some(v) = vp.next() { 44 | let vs: Vec = v.split(".") 45 | .filter_map(|vss| vss.parse().ok()) 46 | .collect(); 47 | if !vs.is_empty() { 48 | return vs; 49 | } 50 | } 51 | } 52 | } 53 | panic!("rustc version not found") 54 | } 55 | -------------------------------------------------------------------------------- /marked/build/attributes: -------------------------------------------------------------------------------- 1 | # HTML Attributes 2 | # 3 | # Format: 4 | # Lines prefixed with `#` are comments, uninterpreted 5 | # Lines matching ([A-Z]+) `::` ALL (except: tags)? define groups of tags 6 | # Comma delimited columns: name, tags, description, flags 7 | # Tags marked with asterisk (*): attribute is for style purposes only. 
8 | # 9 | # Flag codes: 10 | # U :: Currently undefined by our HTML parser provider 11 | 12 | # Sources 13 | # https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes 14 | # https://html.spec.whatwg.org/ 15 | # https://www.w3.org/TR/xhtml11/ 16 | # https://www.w3.org/TR/html4/ 17 | # https://www.w3schools.com/tags/ref_attributes.asp 18 | 19 | CORE :: ALL except: base head html meta param script style title 20 | class ,*CORE 21 | id ,*CORE a 22 | style ,*CORE 23 | hidden ,*CORE, hidden element 24 | title ,CORE, extra title 25 | 26 | LANG :: ALL except: base br frame frameset hr iframe param 27 | dir ,LANG, Text direction; ltr or rtl 28 | lang ,LANG, language_code; also xml:lang 29 | 30 | GLOBAL :: ALL except: 31 | base ,GLOBAL, inherited from xml:base (deprecated) 32 | 33 | # Meta tag attributes 34 | http-equiv ,meta, HTTP Header name 35 | content ,meta, text 36 | scheme ,meta, format URI 37 | 38 | # Anchor and link attributes 39 | charset ,a link meta, encoding of link or (meta) document 40 | coords ,*a, coordinates; i.e. 
image map 41 | hreflang ,a link, language_code of referent 42 | href ,a base link, URL 43 | media ,a area link 44 | name ,a param, section_name anchor 45 | rel ,a link 46 | rev ,a link 47 | shape ,*a 48 | target ,*a *base *link 49 | type ,a link embed object source 50 | 51 | # Image and some frame attributes 52 | src ,frame img audio embed source 53 | data ,object 54 | alt ,img area input 55 | height ,img picture embed video svg *tr *th *td *iframe *object 56 | width ,img picture embed video svg *table *tr *th *td *iframe *object 57 | decoding ,img, preferred method to decode, U 58 | # Table specific attributes 59 | abbr ,tr th 60 | align ,table tbody tfoot thead tr td th iframe object 61 | axis ,tr th 62 | bgcolor ,*table *tbody *tfoot *tr *td *th *col *colgroup *body 63 | border ,*table *img *object 64 | cellpadding ,*table 65 | cellspacing ,*table 66 | char ,*tr *td *th 67 | charoff ,*tr *td *th 68 | colspan ,tr td th 69 | frame ,*table 70 | headers ,tr td 71 | nowrap ,*tr *td *th 72 | rowspan ,tr td th 73 | rules ,*table 74 | scope ,tr td th 75 | span ,col colgroup 76 | summary ,table 77 | valign ,*tr *td 78 | value ,data param 79 | 80 | # Purposefully omitted 81 | # -- The event attributes on*, onmouse*, onkey*, etc. 
82 | # -- data-* 83 | 84 | accept ,form input, (file) types accepted 85 | accept-charset ,form 86 | 87 | cite ,blockquote del ins q 88 | 89 | color ,*basefont *font *hr 90 | controls ,*audio *video 91 | datetime ,del ins time 92 | label ,option optgroup 93 | -------------------------------------------------------------------------------- /marked/build/generate.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require 'erb' 4 | require 'ostruct' 5 | 6 | # Generator for HTML.rs tags/attribute static metadata 7 | class Generator 8 | 9 | attr_reader :tags, :attributes 10 | 11 | BASEDIR = File.dirname(__FILE__) 12 | 13 | OUT_FILE = File.join(BASEDIR, '../src/dom/html/meta.rs') 14 | 15 | def run(out_file = OUT_FILE) 16 | parse_tags 17 | parse_attributes 18 | map_basic_attributes 19 | generate(out_file) 20 | end 21 | 22 | FLAGS = { 23 | 'E' => 'empty', 24 | 'D' => 'deprecated', 25 | 'I' => 'inline', 26 | 'M' => 'meta', 27 | 'B' => 'banned', 28 | 'U' => 'undefined' 29 | } 30 | 31 | def parse_tags 32 | @tags = [] 33 | 34 | open(File.join(BASEDIR, 'tags'), 'r') do |fin| 35 | fin.each do |line| 36 | case line 37 | when /^\s*#/, /^\s*$/ 38 | # ignore comment, empty lines 39 | when /^\s*[^\s,]+\s*,[^,]*,[^,]*$/ 40 | r = line.split(',').map { |c| c.strip } 41 | r = r.compact.reject { |c| c.empty? } 42 | flags = r[1].split(' ').map { |f| FLAGS[f] }.compact 43 | @tags << OpenStruct.new( 44 | :name => r[0], 45 | :flags => flags, 46 | :desc => r[2] 47 | ) 48 | else 49 | raise "Parse ERROR: line [#{line}]" 50 | end 51 | end 52 | end 53 | 54 | @tags.sort_by! { |o| o.name } 55 | @tags.uniq! 
{ |o| o.name } 56 | @tag_max_len = @tags.map { |t| t.name.length }.max 57 | end 58 | 59 | def parse_attributes 60 | @attributes = [] 61 | tagsets = {} 62 | 63 | open(File.join(BASEDIR, 'attributes'), 'r') do |fin| 64 | fin.each do |line| 65 | case line 66 | when /^\s*#/, /^\s*$/ 67 | # ignore comment, empty lines 68 | when /^\s*([A-Z]+)\s*::\s*ALL\s+except:(.*)$/ 69 | sname = $1 70 | except = $2.split(' ').compact.reject { |t| t.empty? } 71 | tset = @tags.reject { |t| except.include?(t.name) } 72 | tset.map! { |t| t.name } 73 | tagsets[sname] = tset 74 | when /^\s*[^\s,]+\s*,/ 75 | r = line.split(',').map { |c| c.strip } 76 | r = r.compact.reject { |c| c.empty? } 77 | if r[3] 78 | flags = r[3].split(' ').map { |f| FLAGS[f] }.compact 79 | else 80 | flags = [] 81 | end 82 | # FIXME: Handle attributes, desc. 83 | 84 | btags = r[1].split(' ').compact.reject { |t| t.empty? || t =~ /^\*/ } 85 | btags = btags.map { |t| tagsets[ t ] || t }.flatten 86 | 87 | @attributes << OpenStruct.new( 88 | :name => r[0], 89 | :basic_tags => btags, 90 | :flags => flags, 91 | :desc => r[2] 92 | ) 93 | else 94 | raise "Parse ERROR: line [#{line}]" 95 | end 96 | end 97 | end 98 | 99 | @attributes.sort_by! { |o| o.name } 100 | @attributes.uniq! 
{ |o| o.name } 101 | 102 | @attr_max_len = @attributes.map { |t| t.name.length }.max 103 | end 104 | 105 | def map_basic_attributes 106 | @tags.each do |tag| 107 | tag.basic_atts = 108 | @attributes.select { |attr| attr.basic_tags.include?(tag.name) } 109 | end 110 | end 111 | 112 | def twidth(val, extra = 0) 113 | val + (' ' * (@tag_max_len - val.length + extra)) 114 | end 115 | 116 | def awidth(val, extra = 0) 117 | val + (' ' * (@attr_max_len - val.length + extra)) 118 | end 119 | 120 | def const(val) 121 | val.gsub(/\-/, '_') 122 | end 123 | 124 | def clone_if(o, val) 125 | if o.flags.include?('undefined') 126 | "#{val}.clone()" 127 | else 128 | val 129 | end 130 | end 131 | 132 | def tags_with(flag) 133 | @tags 134 | .select {|t| t.flags.include?(flag) } 135 | .map { |t| t.name } 136 | .join(' ') 137 | end 138 | 139 | def map_flags(tag) 140 | tag.flags 141 | .reject { |f| f == "undefined" } 142 | .map { |f| "is_#{f}: true" } 143 | end 144 | 145 | def generate(out_file) 146 | erb_file = File.join(BASEDIR, 'meta.rs.erb') 147 | template = ERB.new(IO.read(erb_file), nil, '%') 148 | 149 | open(out_file, 'w') do |fout| 150 | fout << template.result(binding) 151 | end 152 | end 153 | 154 | end 155 | 156 | if $0 == __FILE__ 157 | Generator.new.run(*ARGV) 158 | end 159 | -------------------------------------------------------------------------------- /marked/build/meta.rs.erb: -------------------------------------------------------------------------------- 1 | //! Static metadata for HTML elements and attributes. 2 | //! 3 | //! This file is generated via build/generate.rb and the build/meta.rs.erb 4 | //! template. It should not be manually edited. To avoid any rust build-time 5 | //! dependency however, the resulting source file (src/dom/html/meta.rs) is 6 | //! also checked in. 7 | 8 | use std::collections::HashMap; 9 | 10 | use lazy_static::lazy_static; 11 | 12 | use crate::dom::LocalName; 13 | 14 | lazy_static! 
{ 15 | /// A static lookup table for metadata on known HTML tags. 16 | pub static ref TAG_META: HashMap = init_tag_metadata(); 17 | } 18 | 19 | /// Metadata about HTML tags and their attributes. 20 | pub struct TagMeta { 21 | is_empty: bool, 22 | is_deprecated: bool, 23 | is_inline: bool, 24 | is_meta: bool, 25 | is_banned: bool, 26 | basic_attrs: Vec, 27 | } 28 | 29 | impl TagMeta { 30 | /// Return true if the element is defined to be empty: having no contents 31 | /// or end tag. 32 | /// 33 | /// Tags include: `<%=tags_with('empty')%>`. 34 | pub fn is_empty(&self) -> bool { 35 | self.is_empty 36 | } 37 | 38 | /// Return true if the tag is deprecated as of html5. 39 | /// 40 | /// Tags include: `<%=tags_with('deprecated')%>`. 41 | pub fn is_deprecated(&self) -> bool { 42 | self.is_deprecated 43 | } 44 | 45 | /// Return true if the tag represents an _inline_ element: is not a block 46 | /// layout producing element under normal use. 47 | /// 48 | /// Because HTML 5 no longer specifies this property, this is a 49 | /// somewhat arbitrary distinction maintained here, loosely based on HTML 4 50 | /// but extending for new tags. One noteworthy exception is that `<br>
` is 51 | /// not considered inline. 52 | /// 53 | /// Tags include: `<%=tags_with('inline')%>`. 54 | pub fn is_inline(&self) -> bool { 55 | self.is_inline 56 | } 57 | 58 | /// Return true if the tag represents metadata only, where any content is 59 | /// not displayed text. e.g. ``. 60 | /// 61 | /// Tags include: `<%=tags_with('meta')%>`. 62 | pub fn is_meta(&self) -> bool { 63 | self.is_meta 64 | } 65 | 66 | /// Return true if the tag is banned/blacklisted: where no content should 67 | /// be extracted, displayed, or otherwise used. 68 | /// 69 | /// Tags include: `<%=tags_with('banned')%>`. 70 | pub fn is_banned(&self) -> bool { 71 | self.is_banned 72 | } 73 | 74 | /// Return true if the given name is part of the _basic_ set of known 75 | /// attributes for this element. 76 | /// 77 | /// This _basic set_ of attributes excludes, among other things, attributes 78 | /// that are used exclusively for styling purposes. 79 | pub fn has_basic_attr(&self, name: &LocalName) -> bool { 80 | self.basic_attrs.binary_search(name).is_ok() 81 | } 82 | } 83 | 84 | impl Default for TagMeta { 85 | fn default() -> TagMeta { 86 | TagMeta { 87 | is_empty: false, 88 | is_deprecated: false, 89 | is_inline: false, 90 | is_meta: false, 91 | is_banned: false, 92 | basic_attrs: vec![], 93 | } 94 | } 95 | } 96 | 97 | /// `Namespace` constants 98 | pub mod ns { 99 | use html5ever::ns; 100 | use crate::dom::Namespace; 101 | 102 | pub const HTML: Namespace = ns!(html); 103 | } 104 | 105 | /// HTML tag constants 106 | pub mod t { 107 | use html5ever::local_name as lname; 108 | use crate::dom::LocalName; 109 | 110 | % tags.each do |tag| 111 | % if tag.flags.include?('undefined') 112 | lazy_static::lazy_static! { 113 | % if tag.desc 114 | /// Tag `<<%= tag.name %>>`: <%= tag.desc %>. 115 | % unless tag.flags.empty? 116 | /// (meta: <%= tag.flags.join(' ') %>) 117 | % end 118 | /// 119 | /// This is a lazy static (struct) as its not defined by html5ever. 
120 | % end 121 | pub static ref <%=const(tag.name.upcase)%>: LocalName = "<%=tag.name%>".into(); 122 | } 123 | % else 124 | % if tag.desc 125 | /// Tag `<<%= tag.name %>>`: <%= tag.desc %>. 126 | % unless tag.flags.empty? 127 | /// (meta: <%= tag.flags.join(' ') %>) 128 | % end 129 | % end 130 | pub const <%=twidth(const(tag.name.upcase) + ':', 4)%> LocalName = lname!("<%=tag.name%>"); 131 | % end 132 | % end 133 | } 134 | 135 | /// HTML attribute constants 136 | pub mod a { 137 | use html5ever::local_name as lname; 138 | use crate::dom::LocalName; 139 | 140 | % attributes.each do |a| 141 | % if a.flags.include?('undefined') 142 | lazy_static::lazy_static! { 143 | % if a.desc 144 | /// Attribute <%= a.name %>: <%= a.desc %>. 145 | /// 146 | /// This is a lazy static (struct) as its not defined by html5ever. 147 | % end 148 | pub static ref <%=const(a.name.upcase)%>: LocalName = "<%=a.name%>".into(); 149 | } 150 | % else 151 | % if a.desc 152 | /// Attribute <%= a.name %>: <%= a.desc %>. 153 | % end 154 | pub const <%=awidth(const(a.name.upcase) + ':', 4)%> LocalName = lname!("<%=a.name%>"); 155 | % end 156 | % end 157 | } 158 | 159 | fn init_tag_metadata() -> HashMap { 160 | let mut tag_meta = HashMap::with_capacity(<%= tags.length() %>); 161 | 162 | % tags.each do |tag| 163 | tag_meta.insert(t::<%=clone_if(tag, const(tag.name.upcase))%>, TagMeta { 164 | % map_flags(tag).each do |f| 165 | <%=f%>, 166 | % end 167 | basic_attrs: vec![ 168 | <%= tag.basic_atts.map { |a| clone_if(a, 'a::' + const(a.name.upcase)) }.join(', ') %> 169 | ], 170 | .. 
TagMeta::default() 171 | }); 172 | % end 173 | 174 | tag_meta 175 | } 176 | -------------------------------------------------------------------------------- /marked/build/tags: -------------------------------------------------------------------------------- 1 | # HTML Tags 2 | # 3 | # Format: 4 | # Lines prefixed with `#` are comments, uninterpreted 5 | # Comma delimited columns: name, flags (see codes below), description 6 | # 7 | # Flag codes: 8 | # E :: Empty Tag 9 | # S :: In Strict HTML 4.01/XHTML 1.0 10 | # T :: In Transitional HTML 4.01/XHTML 1.0 11 | # F :: In frameset annex 12 | # 5 :: In HTML5 13 | # D :: Deprecated 14 | # I :: Inline elements (Note
<br> is not labeled inline.) 15 | # M :: Metadata elements (content not visible text), e.g. head 16 | # B :: Banned/blacklisted elements from which text should not be extracted. 17 | # U :: Currently undefined by our HTML parser provider 18 | # 19 | # Sources 20 | # https://developer.mozilla.org/en-US/docs/Web/HTML/Element 21 | # https://html.spec.whatwg.org/ 22 | # https://www.w3.org/TR/html4/ 23 | # https://www.w3.org/TR/xhtml11/ 24 | 25 | a , S T F 5 I , anchor 26 | abbr , S T F 5 I , abbreviation 27 | acronym , S T F D I , acronym 28 | address , S T F 5 , contact information for the author or owner 29 | applet , T F D , embedded applet 30 | area ,E S T F 5 , area inside an image-map 31 | article , 5 , Structure: an independent content element 32 | aside , 5 , Structure: tangentially related content 33 | audio , 5 I , Sound content 34 | b , S T F 5 I , bold text 35 | base ,E S T F 5 M , default address or target for all links on a page 36 | basefont ,E T F D I M , default font; color; or size for the text in a page 37 | bdi , 5 I , Text isolated from surrounding for BIDI formatting 38 | bdo , S T F 5 I , the text direction 39 | big , S T F D I , big text 40 | blink , D I , blinking text 41 | blockquote , S T F 5 , long quotation 42 | body , S T F 5 , the document's body 43 | br ,E S T F 5 , single line break 44 | button , S T F 5 I B, push button 45 | canvas , 5 I , canvas for drawing graphics and animations 46 | caption , S T F 5 , table caption 47 | center , T F D , centered text 48 | cite , S T F 5 I , citation 49 | code , S T F 5 I , computer code text 50 | col ,E S T F 5 , attribute values for one or more columns in a table 51 | colgroup , S T F 5 , group of columns in a table for formatting 52 | content , D B, Shadow DOM content placeholder element 53 | data , 5 I , adds machine-oriented data representation 54 | datalist , 5 I B, container for option elements 55 | dd , S T F 5 , description of a term in a definition list 56 | del , S T F 5 I , deleted text 57 | 
details , 5 , optional additional details (also: summary) 58 | dfn , S T F 5 I , definition term 59 | dialog , 5 , dialog box or other interactive component 60 | dir , T F D , directory list 61 | div , S T F 5 , section in a document 62 | dl , S T F 5 , definition list 63 | dt , S T F 5 , term (an item) in a definition list 64 | em , S T F 5 I , emphasized text 65 | embed ,E 5 I , embed content by external app or plug-in 66 | fieldset , S T F 5 B, border around elements in a form 67 | figcaption , 5 , Structure: a figure caption 68 | figure , 5 , Structure: self-contained content that can be moved 69 | font , T F D I , font; color; or size for text 70 | footer , 5 , Structure: a footer of a section 71 | form , S T F 5 , form for user input 72 | frame ,E F D B, window (a frame) in a frameset 73 | frameset , F D B, set of frames 74 | h1 , S T F 5 , heading level 1 75 | h2 , S T F 5 , heading level 2 76 | h3 , S T F 5 , heading level 3 77 | h4 , S T F 5 , heading level 4 78 | h5 , S T F 5 , heading level 5 79 | h6 , S T F 5 , heading level 6 80 | head , S T F 5 M , information about the document 81 | header , 5 , Structure: a header of a section 82 | hgroup , 5 , Structure: a group of headings 83 | hr ,E S T F 5 , horizontal line 84 | html , S T F 5 , document 85 | i , S T F 5 I , italic text 86 | iframe , T F 5 I , inline frame 87 | img ,E S T F 5 I , image 88 | input ,E S T F 5 I B, input control 89 | ins , S T F 5 I , inserted text 90 | isindex , T F D , searchable index related to a document 91 | kbd , S T F 5 I , keyboard text 92 | label , S T F 5 I B, label for input or other element 93 | legend , S T F 5 B, caption for a fieldset element 94 | li , S T F 5 , list item 95 | link ,E S T F 5 M , relationship with an external resource 96 | listing , D , preformatted text 97 | main , 5 , identify central topic/functional content 98 | map , S T F 5 I , image-map 99 | mark , 5 I , Text marked/highlighted for reference purposes 100 | menu , T F 5 D , menu list 101 | 
menuitem ,E 5 D , a command in a menu 102 | meta ,E S T F 5 M , metadata 103 | meter , 5 I , a linear gauge for a scalar value 104 | nav , 5 , Structure: container for navigational links 105 | nobr , D I , contained text; white-space: nowrap 106 | noframes , T F D B, alternate content where frames not supported 107 | noscript , S T F 5 I B, alternate content script not supported 108 | object , S T F 5 I B, embedded object 109 | ol , S T F 5 , ordered list 110 | optgroup , S T F 5 B, group of related options in a select list 111 | option , S T F 5 B, option in a select list 112 | output , 5 I , content is (scripted) outcome of a user action. 113 | p , S T F 5 , paragraph 114 | param ,E S T F 5 , parameter for an object 115 | picture , 5 I , container for multiple img/source DPI 116 | plaintext , D , like xmp; no close tag 117 | pre , S T F 5 , preformatted text 118 | progress , 5 I , a progress bar 119 | q , S T F 5 I , short quotation 120 | rb , 5 , ruby base text 121 | rbc , 5 U, ruby base container (complex) 122 | rp , 5 , ruby simple text container 123 | rt , 5 , ruby annotation text 124 | rtc , 5 , ruby text container (complex) 125 | ruby , 5 I , ruby pronunciation aid 126 | s , T F 5 D I , strikethrough text 127 | samp , S T F 5 I , sample computer code 128 | script , S T F 5 I B, client-side script 129 | section , 5 , Structure: generic document/application section 130 | select , S T F 5 I B, select list (drop-down list) 131 | slot , 5 I B, (Shadow) DOM placeholder element 132 | small , S T F 5 I , small text 133 | source ,E 5 , source for picture/audio/video elements 134 | span , S T F 5 I , section in a document 135 | strike , T F D I , strikethrough text 136 | strong , S T F 5 I , strong text 137 | style , S T F 5 B, style information for a document 138 | sub , S T F 5 I , subscripted text 139 | summary , 5 , summary of details element 140 | sup , S T F 5 I , superscripted text 141 | svg , 5 , inline scalable vector graphics 142 | table , S T F 5 , table 
143 | tbody , S T F 5 , Groups the body content in a table 144 | td , S T F 5 , cell in a table 145 | template , 5 B, html sub-tree not rendered except by script 146 | textarea , S T F 5 I B, multi-line text input control 147 | tfoot , S T F 5 , Groups the footer content in a table 148 | th , S T F 5 , header cell in a table 149 | thead , S T F 5 , Groups the header content in a table 150 | time , 5 I , A date or time 151 | title , S T F 5 M , the title of a document 152 | tr , S T F 5 , row in a table 153 | tt , S T F D I , teletype text 154 | u , T F 5 D I , underlined text 155 | ul , S T F 5 , unordered list 156 | var , S T F 5 I , variable part of a text 157 | video , 5 I , video container 158 | wbr ,E 5 I , A line break opportunity 159 | xmp , D , preformatted text 160 | -------------------------------------------------------------------------------- /marked/clippy.toml: -------------------------------------------------------------------------------- 1 | msrv = "1.38.0" 2 | -------------------------------------------------------------------------------- /marked/samples/documento_utf16be_bom.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dekellum/marked/1010a62cfd57e8ecd0ce505f57a3a4e744b1cba1/marked/samples/documento_utf16be_bom.html -------------------------------------------------------------------------------- /marked/samples/documento_utf16be_meta_utf16le.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dekellum/marked/1010a62cfd57e8ecd0ce505f57a3a4e744b1cba1/marked/samples/documento_utf16be_meta_utf16le.html -------------------------------------------------------------------------------- /marked/samples/documento_utf16le.html: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dekellum/marked/1010a62cfd57e8ecd0ce505f57a3a4e744b1cba1/marked/samples/documento_utf16le.html -------------------------------------------------------------------------------- /marked/samples/documento_utf16le_bom.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dekellum/marked/1010a62cfd57e8ecd0ce505f57a3a4e744b1cba1/marked/samples/documento_utf16le_bom.html -------------------------------------------------------------------------------- /marked/samples/documento_utf16le_meta_utf8.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dekellum/marked/1010a62cfd57e8ecd0ce505f57a3a4e744b1cba1/marked/samples/documento_utf16le_meta_utf8.html -------------------------------------------------------------------------------- /marked/samples/documento_utf8.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Un documento electronica (titulo en ASCII) 4 | 5 | 6 |

¿De donde eres tú?

7 | 8 | 9 | -------------------------------------------------------------------------------- /marked/samples/documento_utf8_bom.html: -------------------------------------------------------------------------------- 1 |  2 | 3 | Un documento electronica (titulo en ASCII) 4 | 5 | 6 |

¿De donde eres tú?

7 | 8 | 9 | -------------------------------------------------------------------------------- /marked/samples/documento_utf8_meta.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Un documento electronica (titulo en ASCII) 4 | 5 | 6 | 7 |

¿De donde eres tú?

8 | 9 | 10 | -------------------------------------------------------------------------------- /marked/samples/documento_utf8_meta_utf16.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Un documento electronica (titulo en ASCII) 5 | 6 | 7 | 8 |

¿De donde eres tú?

9 | 10 | 11 | -------------------------------------------------------------------------------- /marked/samples/documento_windows1252_meta.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dekellum/marked/1010a62cfd57e8ecd0ce505f57a3a4e744b1cba1/marked/samples/documento_windows1252_meta.html -------------------------------------------------------------------------------- /marked/samples/iro0094_shiftjis_meta.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dekellum/marked/1010a62cfd57e8ecd0ce505f57a3a4e744b1cba1/marked/samples/iro0094_shiftjis_meta.html -------------------------------------------------------------------------------- /marked/samples/matsunami_eucjp_meta.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dekellum/marked/1010a62cfd57e8ecd0ce505f57a3a4e744b1cba1/marked/samples/matsunami_eucjp_meta.html -------------------------------------------------------------------------------- /marked/samples/russez_windows1251_meta.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dekellum/marked/1010a62cfd57e8ecd0ce505f57a3a4e744b1cba1/marked/samples/russez_windows1251_meta.html -------------------------------------------------------------------------------- /marked/src/chars.rs: -------------------------------------------------------------------------------- 1 | use tendril::StrTendril; 2 | 3 | /// Replace or remove sequences of white-space and/or control characters, and 4 | /// optionally remove leading/trailing spaces. 5 | /// 6 | /// _What_ char classes to replace is given via `ws` and `ctrl` flags. If a 7 | /// sequence is _all_ control or zero-width spaces, then it is simple removed 8 | /// (without replacement). 
If there is at least one non-zero width white-space 9 | /// character then the sequence is replaces with U+0020 SPACE. The string (st) 10 | /// is only lazily re-allocated (replaced) if a change is required. 11 | pub(crate) fn replace_chars( 12 | st: &mut StrTendril, 13 | ws: bool, 14 | ctrl: bool, 15 | trim_start: bool, 16 | trim_end: bool) 17 | { 18 | let mut last = 0; 19 | let mut ost = None; // output lazy allocated 20 | let mut replacing = 0u8; 21 | 22 | let ins = st.as_ref(); 23 | for (i, ch) in ins.char_indices() { 24 | let rmask = replace_mask(ch, ws, ctrl); 25 | if rmask > 0 { 26 | if replacing == 0 { 27 | if ost.is_none() { 28 | ost = Some(StrTendril::new()); 29 | } 30 | ost.as_mut().unwrap().push_slice(&ins[last..i]); 31 | } 32 | replacing |= rmask; 33 | } else if replacing > 0 { 34 | if replacing >= 2 && 35 | (ost.as_ref().unwrap().len32() > 0 || !trim_start) 36 | { 37 | ost.as_mut().unwrap().push_char(' '); 38 | } 39 | last = i; 40 | replacing = 0; 41 | } 42 | } 43 | if replacing > 0 { 44 | if replacing >= 2 && !trim_end { 45 | ost.as_mut().unwrap().push_char(' '); 46 | } 47 | } else if ost.is_some() { 48 | ost.as_mut().unwrap().push_slice(&ins[last..]); 49 | } 50 | if ost.is_some() { 51 | *st = ost.take().unwrap(); 52 | } 53 | } 54 | 55 | // Compare CharClass to flags and return bit-1 (control or zero-width) or bit-2 56 | // (whitespace). 57 | fn replace_mask(c: char, ws: bool, ctrl: bool) -> u8 { 58 | use CharClass::*; 59 | match char_class(c) { 60 | ZeroSpace | Control if ctrl => 1, 61 | WhiteSpace if ws => 2, 62 | _ => 0, 63 | } 64 | } 65 | 66 | // Character classes of internal interest (not the same as Unicode classes). 67 | #[derive(Debug, Eq, PartialEq)] 68 | enum CharClass { 69 | Unclassified, 70 | WhiteSpace, 71 | ZeroSpace, 72 | Control, 73 | } 74 | 75 | /// True if all contained characters are classified as whitespace or controls. 
76 | pub(crate) fn is_all_ctrl_ws(st: &StrTendril) -> bool { 77 | st.as_ref().chars().all(|c| char_class(c) != CharClass::Unclassified) 78 | } 79 | 80 | // Return CharClass for a char 81 | fn char_class(c: char) -> CharClass { 82 | use CharClass::*; 83 | match c { 84 | '\u{0000}'..='\u{0008}' => Control, // C0 (XML disallowed) 85 | '\u{0009}' | // HT 86 | '\u{000A}' | // LF 87 | '\u{000B}' => WhiteSpace, // VT 88 | '\u{000C}' => Control, // FF (C0) 89 | '\u{000D}' => WhiteSpace, // CR 90 | '\u{000E}'..='\u{001F}' => Control, // C0 91 | '\u{0020}' => WhiteSpace, // SPACE 92 | 93 | '\u{007F}' | // DEL (C0) 94 | '\u{0080}'..='\u{009F}' => Control, // C1 (XML disallowed) 95 | '\u{00A0}' => WhiteSpace, // NO-BREAK SPACE (NBSP) 96 | 97 | // Not always (zero) white; shows hypen when line is wrapped. 98 | // '\u{00AD}' => Un- // SOFT HYPHEN, 99 | 100 | // Not white, rendered with a line: 101 | // '\u{1680}' => Un- // OGHAM SPACE MARK 102 | 103 | // Effects subsequent characters in Mongolian: 104 | // '\u{180E}' => Un- // MONGOLIAN VOWEL SEPARATOR 105 | 106 | '\u{2000}'..='\u{200A}' => WhiteSpace, // EN QUAD..HAIR SPACE 107 | '\u{200B}' | // ZERO WIDTH SPACE 108 | '\u{200C}' => ZeroSpace, // ZERO WIDTH NON-JOINER 109 | 110 | '\u{2028}' | // LINE SEPARATOR 111 | '\u{2029}' | // PARAGRAPH SEPARATOR 112 | 113 | '\u{202F}' | // NARROW NO-BREAK SPACE 114 | 115 | '\u{205F}' => WhiteSpace, // MEDIUM MATHEMATICAL SPACE 116 | '\u{2060}' => ZeroSpace, // WORD JOINER 117 | 118 | '\u{3000}' => WhiteSpace, // IDEOGRAPHIC SPACE 119 | 120 | '\u{FEFF}' => ZeroSpace, // BOM or ZERO WIDTH NON-BREAKING 121 | '\u{FFFE}' | // Bad BOM (not assigned) 122 | '\u{FFFF}' => Control, // Not assigned (invalid) 123 | _ => Unclassified, 124 | } 125 | 126 | // FIXME: see markup5ever/data/mod.rs: C1_REPLACEMENTS replaced with 127 | // alt higher unicode characters. This should occur _before_ above 128 | // transform, at least for HTML? 
129 | } 130 | 131 | #[cfg(test)] 132 | mod tests { 133 | use super::*; 134 | use tendril::SliceExt; 135 | 136 | #[test] 137 | fn test_char_class() { 138 | use CharClass::*; 139 | assert_eq!(Unclassified, char_class('x')); 140 | assert_eq!(Control, char_class('\u{0008}')); 141 | assert_eq!(ZeroSpace, char_class('\u{2060}')); 142 | assert_eq!(WhiteSpace, char_class('\n')); 143 | assert_eq!(WhiteSpace, char_class('\n')); 144 | } 145 | 146 | #[test] 147 | fn replace() { 148 | assert_clean("", "" ); 149 | assert_clean("", "\u{2060}" ); 150 | assert_clean(" ", " "); 151 | assert_clean(" ", "\t \r\n"); 152 | 153 | assert_clean("x", "x" ); 154 | assert_clean(" x ", " x "); 155 | assert_clean(" x", " x\u{2060}" ); 156 | assert_clean("x ", "x " ); 157 | 158 | assert_clean("aa b ", "\u{009F}a\u{009F}a b " ); 159 | 160 | assert_clean("aa b c ", "aa b c " ); 161 | assert_clean("aa b c", "aa \t b c" ); 162 | assert_clean(" aa b c", "\t aa \t b c"); 163 | } 164 | 165 | // Assert that super-ASCII character boundaries are properly observed 166 | #[test] 167 | fn replace_multibyte() { 168 | assert_clean("Ψ", "Ψ" ); 169 | assert_clean(" Ψ ", " Ψ "); 170 | assert_clean(" Ψ", " Ψ\u{2060}" ); 171 | assert_clean("Ψ ", "Ψ " ); 172 | 173 | assert_clean("αα β ", "\u{009F}α\u{009F}α β " ); 174 | 175 | assert_clean("αα β γ ", "αα β γ " ); 176 | assert_clean("αα β γ", "αα \t β γ" ); 177 | assert_clean(" αα β γ", "\t αα \t β γ"); 178 | } 179 | 180 | #[test] 181 | fn replace_ctrl_only() { 182 | assert_clean_ctrl("", "" ); 183 | assert_clean_ctrl("", "\u{2060}" ); 184 | assert_clean_ctrl(" ", " "); 185 | 186 | assert_clean_ctrl("x", "x" ); 187 | assert_clean_ctrl(" x ", " x "); 188 | assert_clean_ctrl(" x", " x\u{2060}" ); 189 | assert_clean_ctrl("x ", "x " ); 190 | 191 | assert_clean_ctrl("aaa β ", "\u{009F}a\u{009F}aa β " ); 192 | 193 | assert_clean_ctrl("aa β c ", "aa β c " ); 194 | assert_clean_ctrl("aa \t β c", "aa \t β c" ); 195 | assert_clean_ctrl("\t aa \t β c", "\t aa \t β c"); 196 | } 
197 | 198 | #[test] 199 | fn replace_trim() { 200 | assert_clean_trim("", ""); 201 | assert_clean_trim("", "\t \r\n"); 202 | assert_clean_trim("", "\u{0000}"); //NUL 203 | assert_clean_trim("", "\u{FFFE}"); //BAD BOM 204 | assert_clean_trim("", "\u{00A0}\u{2007}\u{202F}"); 205 | 206 | assert_clean_trim("x", "x" ); 207 | assert_clean_trim("x", " x "); 208 | assert_clean_trim("x", " x" ); 209 | assert_clean_trim("x", "x " ); 210 | 211 | assert_clean_trim("aa b", " a\u{009F}a\u{009F} b " ); 212 | 213 | assert_clean_trim("aa b c", "aa b c " ); 214 | assert_clean_trim("aa b c", "aa \t b c" ); 215 | assert_clean_trim("aa b c", "\t aa \t b c"); 216 | } 217 | 218 | #[test] 219 | fn replace_trim_left() { 220 | assert_clean_trim_l("", ""); 221 | assert_clean_trim_l(" ", " "); 222 | assert_clean_trim_l(" ", "\t \r\n"); 223 | } 224 | 225 | #[test] 226 | fn replace_trim_right() { 227 | assert_clean_trim_r("", ""); 228 | assert_clean_trim_r("", " "); 229 | assert_clean_trim_r("", "\t \r\n"); 230 | } 231 | 232 | fn assert_clean_trim(exp: &str, src: &str) { 233 | let mut st = src.to_tendril(); 234 | replace_chars(&mut st, true, true, true, true); 235 | assert_eq!(exp, st.as_ref()); 236 | } 237 | 238 | fn assert_clean_trim_l(exp: &str, src: &str) { 239 | let mut st = src.to_tendril(); 240 | replace_chars(&mut st, true, true, true, false); 241 | assert_eq!(exp, st.as_ref()); 242 | } 243 | 244 | fn assert_clean_trim_r(exp: &str, src: &str) { 245 | let mut st = src.to_tendril(); 246 | replace_chars(&mut st, true, true, false, true); 247 | assert_eq!(exp, st.as_ref()); 248 | } 249 | 250 | fn assert_clean(exp: &str, src: &str) { 251 | let mut st = src.to_tendril(); 252 | replace_chars(&mut st, true, true, false, false); 253 | assert_eq!(exp, st.as_ref()); 254 | } 255 | 256 | fn assert_clean_ctrl(exp: &str, src: &str) { 257 | let mut st = src.to_tendril(); 258 | replace_chars(&mut st, false, true, false, false); 259 | assert_eq!(exp, st.as_ref()); 260 | } 261 | } 262 | 
-------------------------------------------------------------------------------- /marked/src/decode.rs: -------------------------------------------------------------------------------- 1 | // Copyright © 2019 David Kellum 2 | // 3 | // The `Decoder` implemented here was originally derived from 4 | // `tendril::stream::LossyDecoder` and `tendril::stream::TendrilSink` 5 | // (version 0.4.1) source as found: 6 | // 7 | // https://github.com/servo/tendril 8 | // Copyright © 2015 Keegan McAllister 9 | // Licensed under the Apache license v2.0, or the MIT license 10 | 11 | //! Support for streaming charset decoding. 12 | 13 | use std::borrow::Cow; 14 | use std::io; 15 | 16 | use log::trace; 17 | use encoding_rs as enc; 18 | use enc::DecoderResult; 19 | 20 | use tendril::{Tendril, TendrilSink, Atomicity, NonAtomic}; 21 | use tendril::fmt as form; 22 | use tendril::stream::Utf8LossyDecoder; 23 | 24 | mod encoding_hint; 25 | 26 | pub use encoding_hint::{ 27 | EncodingHint, SharedEncodingHint, 28 | }; 29 | 30 | use crate::READ_BUFFER_SIZE; 31 | 32 | /// A `TendrilSink` adaptor that takes bytes, decodes them as the given 33 | /// character encoding, while replacing any ill-formed byte sequences with 34 | /// U+FFFD replacement characters, and emits Unicode (`StrTendril`). 35 | /// 36 | /// This allocates new tendrils for encodings other than UTF-8. 37 | pub struct Decoder 38 | where Sink: TendrilSink, A: Atomicity 39 | { 40 | mode: Mode, 41 | } 42 | 43 | enum Mode 44 | where Sink: TendrilSink, A: Atomicity 45 | { 46 | Utf8(Utf8LossyDecoder), 47 | Other(enc::Decoder, Sink), 48 | } 49 | 50 | impl Decoder 51 | where Sink: TendrilSink, A: Atomicity 52 | { 53 | pub fn new(encoding: &'static enc::Encoding, sink: Sink) -> Self { 54 | 55 | let mode = if encoding == enc::UTF_8 { 56 | Mode::Utf8(Utf8LossyDecoder::new(sink)) 57 | } else { 58 | Mode::Other(encoding.new_decoder(), sink) 59 | }; 60 | 61 | Decoder { mode } 62 | } 63 | 64 | /// Return reference to the inner sink. 
65 | pub fn inner_sink(&self) -> &Sink { 66 | match self.mode { 67 | Mode::Utf8(ref utf8) => &utf8.inner_sink, 68 | Mode::Other(_, ref inner_sink) => inner_sink, 69 | } 70 | } 71 | 72 | /// Read until EOF of stream, processing each buffer, and finish this 73 | /// decoder. Returns the sink output or any io::Error. 74 | pub fn read_to_end(mut self, r: &mut R) 75 | -> Result 76 | where Self: Sized, R: io::Read 77 | { 78 | // Adapted from TendrilSink::read_from 79 | loop { 80 | let mut tendril = Tendril::::new(); 81 | unsafe { 82 | tendril.push_uninitialized(READ_BUFFER_SIZE); 83 | } 84 | loop { 85 | match r.read(&mut tendril) { 86 | Ok(0) => return Ok(self.finish()), 87 | Ok(n) => { 88 | tendril.pop_back(READ_BUFFER_SIZE - n as u32); 89 | self.process(tendril); 90 | break; 91 | } 92 | Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {} 93 | Err(e) => return Err(e) 94 | } 95 | } // repeat on interrupt 96 | } // repeat until EOF (0) or Err 97 | } 98 | } 99 | 100 | impl TendrilSink for Decoder 101 | where Sink: TendrilSink, A: Atomicity 102 | { 103 | type Output = Sink::Output; 104 | 105 | fn process(&mut self, t: Tendril) { 106 | match self.mode { 107 | Mode::Utf8(ref mut utf8) => utf8.process(t), 108 | Mode::Other(ref mut decoder, ref mut sink) => { 109 | if t.is_empty() { 110 | return; 111 | } 112 | decode_to_sink(t, decoder, sink, false); 113 | }, 114 | } 115 | } 116 | 117 | fn error(&mut self, desc: Cow<'static, str>) { 118 | match self.mode { 119 | Mode::Utf8(ref mut utf8) => utf8.error(desc), 120 | Mode::Other(_, ref mut sink) => sink.error(desc), 121 | } 122 | } 123 | 124 | fn finish(self) -> Sink::Output { 125 | match self.mode { 126 | Mode::Utf8(utf8) => utf8.finish(), 127 | Mode::Other(mut decoder, mut sink) => { 128 | decode_to_sink(Tendril::new(), &mut decoder, &mut sink, true); 129 | sink.finish() 130 | } 131 | } 132 | } 133 | } 134 | 135 | fn decode_to_sink( 136 | mut inpt: Tendril, 137 | decoder: &mut enc::Decoder, 138 | sink: &mut Sink, 139 | 
last: bool) 140 | where Sink: TendrilSink, A: Atomicity 141 | { 142 | loop { 143 | let mut outt = >::new(); 144 | let len = decoder 145 | .max_utf8_buffer_length(inpt.len()) 146 | .unwrap_or(READ_BUFFER_SIZE as usize); 147 | let len = std::cmp::min(len as u32, READ_BUFFER_SIZE); 148 | trace!("decode buffer len {}", len); 149 | unsafe { outt.push_uninitialized(len); } 150 | 151 | let (result, bytes_read, bytes_written) = 152 | decoder.decode_to_utf8_without_replacement(&inpt, &mut outt, last); 153 | if bytes_written > 0 { 154 | sink.process(unsafe { 155 | outt.subtendril(0, bytes_written as u32) 156 | .reinterpret_without_validating() 157 | }); 158 | } 159 | match result { 160 | DecoderResult::InputEmpty => break, 161 | DecoderResult::OutputFull => { 162 | trace!("decode OutputFull"); 163 | }, 164 | DecoderResult::Malformed(_, _) => { 165 | // String matched in Sink, don't change 166 | sink.error(Cow::Borrowed("invalid byte sequence")); 167 | sink.process("\u{FFFD}".into()); 168 | }, 169 | } 170 | inpt.pop_front(bytes_read as u32); 171 | if inpt.is_empty() { 172 | break; 173 | } 174 | } 175 | } 176 | 177 | #[cfg(test)] 178 | mod tests { 179 | use super::*; 180 | use tendril::SliceExt; 181 | 182 | struct Accumulate
183 | where A: Atomicity 184 | { 185 | tendrils: Vec>, 186 | errors: Vec, 187 | } 188 | 189 | impl Accumulate 190 | where A: Atomicity 191 | { 192 | fn new() -> Accumulate { 193 | Accumulate { 194 | tendrils: vec![], 195 | errors: vec![], 196 | } 197 | } 198 | } 199 | 200 | impl TendrilSink for Accumulate 201 | where A: Atomicity 202 | { 203 | type Output = (Vec>, Vec); 204 | 205 | fn process(&mut self, t: Tendril) { 206 | self.tendrils.push(t); 207 | } 208 | 209 | fn error(&mut self, desc: Cow<'static, str>) { 210 | self.errors.push(desc.into_owned()); 211 | } 212 | 213 | fn finish(self) -> Self::Output { 214 | (self.tendrils, self.errors) 215 | } 216 | } 217 | 218 | fn check_decode( 219 | mut decoder: Decoder>, 220 | input: &[&[u8]], 221 | expected: &str, 222 | errs: usize) 223 | { 224 | for x in input { 225 | decoder.process(x.to_tendril()); 226 | } 227 | let (tendrils, errors) = decoder.finish(); 228 | let mut tendril: Tendril = Tendril::new(); 229 | for t in tendrils { 230 | tendril.push_tendril(&t); 231 | } 232 | assert_eq!(expected, &*tendril); 233 | assert_eq!(errs, errors.len()); 234 | } 235 | 236 | pub type Tests = &'static [(&'static [&'static [u8]], &'static str, usize)]; 237 | 238 | const UTF_8: Tests = &[ 239 | (&[], "", 0), 240 | (&[b""], "", 0), 241 | (&[b"xyz"], "xyz", 0), 242 | (&[b"x", b"y", b"z"], "xyz", 0), 243 | 244 | (&[b"\xEA\x99\xAE"], "\u{a66e}", 0), 245 | (&[b"\xEA", b"\x99\xAE"], "\u{a66e}", 0), 246 | (&[b"\xEA\x99", b"\xAE"], "\u{a66e}", 0), 247 | (&[b"\xEA", b"\x99", b"\xAE"], "\u{a66e}", 0), 248 | (&[b"\xEA", b"", b"\x99", b"", b"\xAE"], "\u{a66e}", 0), 249 | (&[b"", b"\xEA", b"", b"\x99", b"", b"\xAE", b""], "\u{a66e}", 0), 250 | 251 | (&[b"xy\xEA", b"\x99\xAEz"], "xy\u{a66e}z", 0), 252 | (&[b"xy\xEA", b"\xFF", b"\x99\xAEz"], 253 | "xy\u{fffd}\u{fffd}\u{fffd}\u{fffd}z", 4), 254 | (&[b"xy\xEA\x99", b"\xFFz"], "xy\u{fffd}\u{fffd}z", 2), 255 | 256 | // incomplete char at end of input 257 | (&[b"\xC0"], "\u{fffd}", 1), 258 | 
(&[b"\xEA\x99"], "\u{fffd}", 1), 259 | ]; 260 | 261 | #[test] 262 | fn decode_utf8_encoding_rs() { 263 | for &(input, expected, errs) in UTF_8 { 264 | let decoder = Decoder::new(enc::UTF_8, Accumulate::new()); 265 | check_decode(decoder, input, expected, errs); 266 | } 267 | } 268 | 269 | const KOI8_U: Tests = &[ 270 | (&[b"\xfc\xce\xc5\xd2\xc7\xc9\xd1"], "Энергия", 0), 271 | (&[b"\xfc\xce", b"\xc5\xd2\xc7\xc9\xd1"], "Энергия", 0), 272 | (&[b"\xfc\xce", b"\xc5\xd2\xc7", b"\xc9\xd1"], "Энергия", 0), 273 | (&[b"\xfc\xce", b"", b"\xc5\xd2\xc7", b"\xc9\xd1", b""], "Энергия", 0), 274 | ]; 275 | 276 | #[test] 277 | fn decode_koi8_u_encoding_rs() { 278 | for &(input, expected, errs) in KOI8_U { 279 | let decoder = Decoder::new(enc::KOI8_U, Accumulate::new()); 280 | check_decode(decoder, input, expected, errs); 281 | } 282 | } 283 | 284 | const WINDOWS_949: Tests = &[ 285 | (&[], "", 0), 286 | (&[b""], "", 0), 287 | (&[b"\xbe\xc8\xb3\xe7"], "안녕", 0), 288 | (&[b"\xbe", b"\xc8\xb3\xe7"], "안녕", 0), 289 | (&[b"\xbe", b"", b"\xc8\xb3\xe7"], "안녕", 0), 290 | (&[b"\xbe\xc8\xb3\xe7\xc7\xcf\xbc\xbc\xbf\xe4"], "안녕하세요", 0), 291 | (&[b"\xbe\xc8\xb3\xe7\xc7"], "안녕\u{fffd}", 1), 292 | 293 | (&[b"\xbe", b"", b"\xc8\xb3"], "안\u{fffd}", 1), 294 | (&[b"\xbe\x28\xb3\xe7"], "\u{fffd}(녕", 1), 295 | ]; 296 | 297 | #[test] 298 | fn decode_windows_949_encoding_rs() { 299 | for &(input, expected, errs) in WINDOWS_949 { 300 | let decoder = Decoder::new(enc::EUC_KR, Accumulate::new()); 301 | check_decode(decoder, input, expected, errs); 302 | } 303 | } 304 | } 305 | -------------------------------------------------------------------------------- /marked/src/decode/encoding_hint.rs: -------------------------------------------------------------------------------- 1 | 2 | use std::cell::RefCell; 3 | use std::collections::HashMap; 4 | use std::rc::Rc; 5 | 6 | use encoding_rs as enc; 7 | 8 | use crate::DEFAULT_CONF; 9 | 10 | /// A set of confidence-weighted evidence that a text document is in a 11 | /// 
particular encoding. 12 | #[derive(Debug)] 13 | pub struct EncodingHint { 14 | encodings: HashMap<&'static enc::Encoding, f32>, 15 | top: Option<&'static enc::Encoding>, 16 | confidence: f32, 17 | errors: u32, 18 | changed: bool, 19 | } 20 | 21 | /// An `EncodingHint` that can be shared between `Decoder` and `Sink`, by 22 | /// reference on the same thread, and internally mutated. The type is neither 23 | /// `Send` nor `Sync`. 24 | pub type SharedEncodingHint = Rc>; 25 | 26 | impl EncodingHint { 27 | /// Construct new, empty EncodingHint. 28 | fn new() -> EncodingHint { 29 | EncodingHint { 30 | encodings: HashMap::new(), 31 | top: None, 32 | confidence: 0.0, 33 | errors: 0, 34 | changed: false, 35 | } 36 | } 37 | 38 | /// Construct a new Encoding hint with the specified encoding at 39 | /// [`DEFAULT_CONF`] confidence, wrapped for sharing. 40 | pub fn shared_default(enc: &'static enc::Encoding) -> SharedEncodingHint { 41 | let mut eh = EncodingHint::new(); 42 | eh.add_hint(enc, DEFAULT_CONF); 43 | eh.clear_changed(); 44 | Rc::new(RefCell::new(eh)) 45 | } 46 | 47 | /// Construct a new Encoding hint with the specified encoding and 48 | /// confidence, wrapped for sharing. 49 | pub fn shared_with_hint(enc: &'static enc::Encoding, confidence: f32) 50 | -> SharedEncodingHint 51 | { 52 | let mut eh = EncodingHint::new(); 53 | eh.add_hint(enc, confidence); 54 | eh.clear_changed(); 55 | Rc::new(RefCell::new(eh)) 56 | } 57 | 58 | /// Add a hint for an encoding, by label ASCII-intepreted bytes, and some 59 | /// positive confidence value. If no encoding (or applicable replacement) 60 | /// is found for the specified label, returns false. Return true if an 61 | /// encoding is found _and_ this hint changes the top confidence encoding. 
62 | pub fn add_label_hint(&mut self, enc: L, confidence: f32) 63 | -> bool 64 | where L: AsRef<[u8]> 65 | { 66 | if let Some(enc) = enc::Encoding::for_label(enc.as_ref()) { 67 | self.add_hint(enc, confidence) 68 | } else { 69 | false 70 | } 71 | } 72 | 73 | /// Add a hint for the specified encoding and some positive confidence 74 | /// value. Return true if this hint changes the top most confident 75 | /// encoding. 76 | pub fn add_hint(&mut self, enc: &'static enc::Encoding, confidence: f32) 77 | -> bool 78 | { 79 | assert!(confidence > 0.0); 80 | 81 | let new_conf = *( 82 | self.encodings.entry(enc) 83 | .and_modify(|c| *c += confidence) 84 | .or_insert(confidence) 85 | ); 86 | 87 | if new_conf > self.confidence { 88 | self.confidence = new_conf; 89 | if self.top == Some(enc) { 90 | false 91 | } else { 92 | self.top = Some(enc); 93 | self.changed = true; 94 | true 95 | } 96 | } else { 97 | false 98 | } 99 | } 100 | 101 | /// Return true if the given encoding name could be read with _both_ any 102 | /// current top encoding and from the provided encoding, from the same 103 | /// source bytes. 104 | /// 105 | /// All supported encoding names are ASCII, so any _parsed_ name encoding 106 | /// hint that would transition from an ASCII-compatible encoding 107 | /// (e.g. Windows-1252, UTF-8) to an ASCII-incompatible encoding 108 | /// (e.g. UTF-16, REPLACEMENT), or vice-versa, or to different 109 | /// ASCII-incompatible encodings (e.g. UTF-16BE to UTF-16LE) is nonsensical 110 | /// and should be ignored. 111 | /// 112 | /// Note that this check should only be applied to text hints in the 113 | /// document itself, and not applied to hints from Byte-Order-Marks since 114 | /// they aren't ASCII names, or the HTTP Content-Type header since it isn't 115 | /// part of the document body. 
116 | pub fn could_read_from(&self, enc: &'static enc::Encoding) -> bool { 117 | if let Some(t) = self.top { 118 | if ( includes_ascii(t) && !includes_ascii(enc)) || 119 | (!includes_ascii(t) && t != enc) 120 | { 121 | return false; 122 | } 123 | } 124 | true 125 | } 126 | 127 | /// Return the top (most confident) encoding, if at least one encoding has 128 | /// been hinted. 129 | pub fn top(&self) -> Option<&'static enc::Encoding> { 130 | self.top 131 | } 132 | 133 | /// Return the summed confidence value for the top (most confident) 134 | /// encoding. Returns 0.0 if no hint has been provided. 135 | pub fn confidence(&self) -> f32 { 136 | self.confidence 137 | } 138 | 139 | /// Return the total errors accumulated since construction or the last call 140 | /// to `clear_errors`. 141 | pub fn errors(&self) -> u32 { 142 | self.errors 143 | } 144 | 145 | /// Increment errors count by one. 146 | pub fn increment_error(&mut self) { 147 | self.errors += 1 148 | } 149 | 150 | /// Return the latest top encoding if the top has changed since 151 | /// construction or the last call to `clear_changed`. 152 | pub fn changed(&self) -> Option<&'static enc::Encoding> { 153 | if self.changed { 154 | self.top 155 | } else { 156 | None 157 | } 158 | } 159 | 160 | /// Clear `changed` flag. 161 | pub fn clear_changed(&mut self) { 162 | self.changed = false; 163 | } 164 | /// Clear `errors` count. 165 | pub fn clear_errors(&mut self) { 166 | self.errors = 0; 167 | } 168 | } 169 | 170 | // Could the encoding include an ASCII text encoding name? 171 | // This is slightly more lenient then Encoding::is_ascii_compatible in that it 172 | // grants that ISO-2022-JP _could be_ in ASCII mode. 
173 | fn includes_ascii(enc: &'static enc::Encoding) -> bool { 174 | !(enc == enc::UTF_16BE || enc == enc::UTF_16LE || enc == enc::REPLACEMENT) 175 | } 176 | 177 | #[cfg(test)] 178 | mod tests { 179 | use super::*; 180 | 181 | fn is_send() -> bool { true } 182 | fn is_sync() -> bool { true } 183 | 184 | #[test] 185 | fn test_send_sync() { 186 | assert!(is_send::()); 187 | assert!(is_sync::()); 188 | } 189 | 190 | // Adapted from static_asserts 1.1.0 `assert_not_impl_any` macro 191 | // MIT/Apache licensed 192 | 193 | trait AmbiguousIfImpl { 194 | fn some_f() -> bool { true } 195 | } 196 | impl AmbiguousIfImpl<()> for T {} 197 | 198 | #[allow(unused)] struct NotSync; 199 | impl AmbiguousIfImpl for T {} 200 | 201 | #[allow(unused)] struct NotSend; 202 | impl AmbiguousIfImpl for T {} 203 | 204 | #[test] 205 | fn test_not_send_nor_sync() { 206 | assert!(>::some_f()); 207 | } 208 | 209 | #[test] 210 | fn encoding_hint() { 211 | let mut encs = EncodingHint::new(); 212 | assert!( encs.add_label_hint("LATIN1", 0.3)); 213 | assert!(!encs.add_label_hint("iso-8859-1", 0.4)); 214 | assert!(!encs.add_label_hint("utf-8", 0.5)); 215 | assert_eq!( 216 | "windows-1252", encs.top().unwrap().name(), 217 | "desired replacement for first two hints" 218 | ); 219 | assert_eq!(0.3 + 0.4, encs.confidence()); 220 | } 221 | 222 | #[test] 223 | fn could_read_from() { 224 | let mut eh = EncodingHint::new(); 225 | eh.add_hint(enc::UTF_8, 0.5); 226 | assert!( eh.could_read_from(enc::UTF_8)); 227 | assert!( eh.could_read_from(enc::WINDOWS_1252)); 228 | assert!( eh.could_read_from(enc::ISO_2022_JP)); 229 | assert!(!eh.could_read_from(enc::UTF_16LE)); 230 | assert!(!eh.could_read_from(enc::UTF_16BE)); 231 | } 232 | 233 | #[test] 234 | fn could_read_from_multi_byte() { 235 | let mut eh = EncodingHint::new(); 236 | eh.add_hint(enc::UTF_16LE, 0.5); 237 | assert!( eh.could_read_from(enc::UTF_16LE)); 238 | assert!(!eh.could_read_from(enc::UTF_16BE)); 239 | assert!(!eh.could_read_from(enc::ISO_2022_JP)); 
240 | assert!(!eh.could_read_from(enc::UTF_8)); 241 | } 242 | } 243 | -------------------------------------------------------------------------------- /marked/src/dom/filter.rs: -------------------------------------------------------------------------------- 1 | //! Mutating visitor support for `Document`. 2 | 3 | use std::cell::RefCell; 4 | 5 | use log::debug; 6 | 7 | use crate::chars::{is_all_ctrl_ws, replace_chars}; 8 | use crate::dom::{ 9 | html::{t, TAG_META}, 10 | Document, Element, NodeData, NodeId, NodeRef, StrTendril 11 | }; 12 | 13 | /// An instruction returned by the `Fn` closure used by [`Document::filter`]. 14 | #[derive(Debug, PartialEq, Eq)] 15 | pub enum Action { 16 | /// Continue filtering, without further changes to this `Node`. 17 | Continue, 18 | 19 | /// Detach (unlink) this `Node`, and its children, from the tree. 20 | /// 21 | /// This remove references and replaces node data with `NodeData::Hole`. To 22 | /// free up the `Vec` slots for the node and any children, use 23 | /// [`Document::compact`]. 24 | Detach, 25 | 26 | /// Replace this `Node` with its children. Equivalent to `Detach` if 27 | /// returned for a `Node` with no children. 28 | Fold, 29 | } 30 | 31 | /// Mutating filter methods. 32 | impl Document { 33 | /// Perform a depth-first (children before parent nodes) walk of the entire 34 | /// `Document`, including the synthetic document node, applying the 35 | /// provided function. 36 | /// 37 | /// See [`Document::filter_at`] for additional details. 38 | pub fn filter(&mut self, mut f: F) 39 | where F: Fn(NodeRef<'_>, &mut NodeData) -> Action 40 | { 41 | self.filter_at_ref(Document::DOCUMENT_NODE_ID, true, &mut f); 42 | } 43 | 44 | /// Perform a breadth-first (children after parent nodes) walk of the 45 | /// entire `Document`, including the synthetic document node, applying the 46 | /// provided function. 47 | /// 48 | /// See [`Document::filter_at`] for additional details. 
49 | pub fn filter_breadth(&mut self, mut f: F) 50 | where F: Fn(NodeRef<'_>, &mut NodeData) -> Action 51 | { 52 | self.filter_at_ref(Document::DOCUMENT_NODE_ID, false, &mut f); 53 | } 54 | 55 | /// Perform a depth-first (children before parent nodes) walk from the 56 | /// specified node ID, applying the provided function. 57 | /// 58 | /// ### Traversal order 59 | /// 60 | /// This variant performs a depth-first (children before parent nodes) tree 61 | /// walk, but there is also [`Document::filter_at_breadth`], a 62 | /// breadth-first (parent before children) variant. Filter functions such 63 | /// as [`detach_banned_elements`] may perform better breadth-first (but are 64 | /// _compatible_ with both traversal orders). Other functions such as 65 | /// [`fold_empty_inline`] and [`text_normalize`] will only yield complete 66 | /// results when run depth-first. See individual functions for 67 | /// compatibility with traversal orders. 68 | /// 69 | /// ### Filter functions 70 | /// 71 | /// The `f` parameter can be a closure or free-function in the form: 72 | /// 73 | /// ```no_run 74 | /// fn a_filter_fn(pos: NodeRef<'_>, data: &mut NodeData) -> Action; 75 | /// ``` 76 | /// 77 | /// Where `data` provides read-write access to the the `NodeData` of the 78 | /// current node being visited, and `pos` gives a read-only view to the 79 | /// remainder of the `Document`, e.g. parent, children, and siblings of the 80 | /// current node. Note that to avoid aliasing issues, the `NodeData` is 81 | /// actually moved out of the `Document` and replaced with a 82 | /// `NodeData::Hole` value which could be observed via `pos`. The 83 | /// potentially modified `NodeData` is moved back to the `Document` if the 84 | /// function returns `Action::Continue`. The function may also modify the 85 | /// `Document` by returning other [`Action`] values. 
86 | /// 87 | /// For convenience and efficiency, multiple filter functions can be 88 | /// combined via the [`chain_filters`] macro and run in one pass. See also 89 | /// the [`filter`][crate::filter] module for included functions. 90 | /// 91 | /// Note that to free up all memory associated with filtered `Node`s that 92 | /// have been unlinked (`Action::Detach` or `Action::Fold`), use 93 | /// [`Document::compact`], or [`Document::deep_clone`] and drop the 94 | /// original `Document`. 95 | pub fn filter_at(&mut self, id: NodeId, mut f: F) 96 | where F: Fn(NodeRef<'_>, &mut NodeData) -> Action 97 | { 98 | self.filter_at_ref(id, true, &mut f); 99 | } 100 | 101 | /// Perform a breadth-first (children after parent nodes) walk from the 102 | /// specified node ID, applying the provided function. 103 | /// 104 | /// See [`Document::filter_at`] for additional details. 105 | pub fn filter_at_breadth(&mut self, id: NodeId, mut f: F) 106 | where F: Fn(NodeRef<'_>, &mut NodeData) -> Action 107 | { 108 | self.filter_at_ref(id, false, &mut f); 109 | } 110 | 111 | fn filter_at_ref(&mut self, id: NodeId, depth_first: bool, f: &mut F) 112 | -> Action 113 | where F: Fn(NodeRef<'_>, &mut NodeData) -> Action 114 | { 115 | let res = if depth_first { 116 | self.walk_depth(id, f) 117 | } else { 118 | self.walk_breadth(id, f) 119 | }; 120 | 121 | match res { 122 | Action::Continue => {}, 123 | Action::Fold => { 124 | self.fold_only(id); 125 | } 126 | Action::Detach => { 127 | self.unlink_only(id); 128 | } 129 | } 130 | res 131 | } 132 | 133 | fn walk_depth(&mut self, id: NodeId, f: &mut F) -> Action 134 | where F: Fn(NodeRef<'_>, &mut NodeData) -> Action 135 | { 136 | // Children first, recursively 137 | let mut next_child = self[id].first_child; 138 | while let Some(child) = next_child { 139 | // set before possible loss by filter action 140 | next_child = self[child].next_sibling; 141 | self.filter_at_ref(child, true, f); 142 | } 143 | 144 | self.filter_node(id, f) 145 | } 146 | 
// Breadth-first step: filter `id` first and, only if the filter returned
// `Action::Continue`, filter its children (recursively, through
// `filter_at_ref`). Any other action is returned immediately, without
// visiting the children.
fn walk_breadth<F>(&mut self, id: NodeId, f: &mut F) -> Action
    where F: Fn(NodeRef<'_>, &mut NodeData) -> Action
{
    let res = self.filter_node(id, f);
    if res != Action::Continue {
        return res;
    }

    // Children after, recursively
    let mut next_child = self[id].first_child;
    while let Some(child) = next_child {
        // set before possible loss by filter action
        next_child = self[child].next_sibling;
        let prev = self[child].prev_sibling;
        let parent = self[child].parent;

        let res = self.filter_at_ref(child, false, f);

        // After a fold, resume from whatever now follows the previous
        // sibling, or from the parent's first child when there was no
        // previous sibling, instead of from the sibling recorded above.
        // NOTE(review): presumably `fold_only` leaves the folded node's
        // children in its place, so this is what gets them visited —
        // confirm against `fold_only`.
        if res == Action::Fold {
            if let Some(p) = prev {
                next_child = self[p].next_sibling;
            } else if let Some(p) = parent {
                next_child = self[p].first_child;
            }
        }
    }

    Action::Continue
}

// Apply the filter function to the single node `id` and return its action.
// The node's data is put back only when the action is `Action::Continue`.
fn filter_node<F>(&mut self, id: NodeId, f: &mut F) -> Action
    where F: Fn(NodeRef<'_>, &mut NodeData) -> Action
{
    // We need to temporarily replace node.data with a placeholder (Hole)
    // to appease the borrow checker. Otherwise there would be an aliasing
    // problem where the Document (&self) reference could see the same
    // NodeData passed as &mut.
    let mut ndata = self[id].take_data();

    let res = f(NodeRef::new(self, id), &mut ndata);

    // We only need to reset the potentially mutated node.data if the
    // action is to continue, as all other cases result in the node
    // being detached.
    if res == Action::Continue {
        let node = &mut self[id];
        match ndata {
            NodeData::Document | NodeData::Elem(_) => {}
            NodeData::Hole => {
                // The filter must not hand back the placeholder itself.
                debug_assert!(false, "Filter changed to {:?}", ndata);
            }
            _ => {
                // Any other data kind is only accepted (in debug builds)
                // on a node that has no children.
                debug_assert!(
                    node.first_child.is_none() && node.last_child.is_none(),
                    "Filter changed node {:?} with children to {:?}",
                    id, ndata);
            }
        }
        node.data = ndata;
    }
    res
}
} // closes the enclosing `impl` block, whose header precedes this chunk

/// Compose a new filter closure, by chaining a list of 1 to many closures or
/// function paths. Each is executed in order, while the returned action remains
/// `Action::Continue`, or otherwise terminated early.
#[macro_export]
macro_rules! chain_filters {
    // A single filter: wrap it in a closure of the expected signature.
    ($solo:expr $(,)?) => (
        |pos: $crate::NodeRef<'_>, data: &mut $crate::NodeData| {
            $solo(pos, data)
        }
    );
    // Two or more filters: run each one only while the action so far is
    // still `Action::Continue`.
    ($first:expr $(, $subs:expr)+ $(,)?) => (
        |pos: $crate::NodeRef<'_>, data: &mut $crate::NodeData| {
            let mut action: $crate::filter::Action = $first(pos, data);
            $(
                if action == $crate::filter::Action::Continue {
                    action = $subs(pos, data);
                }
            )*
            action
        }
    );
}

/// Detach known banned elements
/// ([`TagMeta::is_banned`](crate::html::TagMeta::is_banned)) and any elements
/// which are unknown.
///
/// Compatible with depth or breadth-first filtering, but more efficiently
/// executed breadth-first.
240 | pub fn detach_banned_elements(_p: NodeRef<'_>, data: &mut NodeData) -> Action { 241 | if let Some(ref mut elm) = data.as_element_mut() { 242 | if let Some(tmeta) = TAG_META.get(&elm.name.local) { 243 | if tmeta.is_banned() { 244 | return Action::Detach; 245 | } 246 | } else { 247 | debug!("Detaching unknown element tag {}", &elm.name.local); 248 | return Action::Detach; 249 | } 250 | } 251 | Action::Continue 252 | } 253 | 254 | /// Fold meaningless inline elements, which are empty or contain only logical 255 | /// whitespace. 256 | /// 257 | /// Logical whitespace is defined as all Unicode whitespace or control chars in 258 | /// child text, or the `
` element. Non-text oriented inline elements like 259 | /// `` and `