├── .gitattributes ├── .github └── workflows │ └── ci.yml ├── .gitignore ├── Cargo.lock ├── Cargo.toml ├── LICENSE-APACHE ├── LICENSE-MIT ├── README.md ├── ammonia-compare ├── Cargo.lock ├── Cargo.toml ├── README.md ├── samples │ └── github-dekellum-frag.html └── src │ └── lib.rs ├── marked-cli ├── CHANGELOG.md ├── Cargo.toml ├── README.md ├── build.rs ├── clippy.toml └── src │ └── main.rs ├── marked-sanitizer ├── CHANGELOG.md ├── Cargo.lock ├── Cargo.toml ├── README.md ├── build.rs └── src │ └── lib.rs └── marked ├── CHANGELOG.md ├── Cargo.toml ├── README.md ├── benches └── round_trip.rs ├── build.rs ├── build ├── attributes ├── generate.rb ├── meta.rs.erb └── tags ├── clippy.toml ├── samples ├── documento_utf16be_bom.html ├── documento_utf16be_meta_utf16le.html ├── documento_utf16le.html ├── documento_utf16le_bom.html ├── documento_utf16le_meta_utf8.html ├── documento_utf8.html ├── documento_utf8_bom.html ├── documento_utf8_meta.html ├── documento_utf8_meta_utf16.html ├── documento_windows1252_meta.html ├── github-dekellum.html ├── iro0094_shiftjis_meta.html ├── matsunami_eucjp_meta.html └── russez_windows1251_meta.html └── src ├── chars.rs ├── decode.rs ├── decode └── encoding_hint.rs ├── dom.rs ├── dom ├── filter.rs ├── html.rs ├── html │ └── meta.rs ├── node_ref.rs ├── serializer.rs ├── tests.rs └── xml.rs ├── lib.rs └── logger.rs /.gitattributes: -------------------------------------------------------------------------------- 1 | * linguist-vendored 2 | *.rs linguist-vendored=false 3 | *.rb linguist-vendored=false 4 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: 3 | pull_request: 4 | push: 5 | schedule: 6 | - cron: '05 16 * * 1,4' 7 | 8 | env: 9 | RUSTFLAGS: -Dwarnings 10 | 11 | jobs: 12 | 13 | test: 14 | name: ${{ matrix.rust }} ${{ matrix.os }} ${{ join(matrix.extras) }} 15 | runs-on: ${{ 
matrix.os }} 16 | strategy: 17 | fail-fast: false 18 | matrix: 19 | include: 20 | - rust: 1.38.0 21 | os: ubuntu-20.04 22 | - rust: 1.38.0 23 | os: ubuntu-20.04 24 | extras: [update] 25 | - rust: 1.38.0 26 | os: windows-latest 27 | - rust: 1.38.0 28 | os: windows-latest 29 | extras: [update] 30 | - rust: stable 31 | os: ubuntu-20.04 32 | extras: [update] 33 | - rust: nightly 34 | os: ubuntu-20.04 35 | - rust: nightly 36 | os: ubuntu-20.04 37 | extras: [update] 38 | 39 | steps: 40 | - name: Checkout 41 | uses: actions/checkout@v2 42 | 43 | - name: Install rust (${{ matrix.rust }}) 44 | uses: actions-rs/toolchain@v1 45 | with: 46 | profile: minimal 47 | toolchain: ${{ matrix.rust }} 48 | override: true 49 | 50 | - name: Update deps 51 | if: ${{ contains(matrix.extras, 'update') }} 52 | run: cargo update 53 | 54 | - name: Downgrade xml-rs for MSRV 55 | if: ${{ matrix.rust == '1.38.0' }} 56 | run: cargo update -p xml-rs --precise 0.8.0 57 | 58 | - name: Test 59 | run: cargo test 60 | 61 | - name: Test all features 62 | run: cargo test --all-features 63 | 64 | - name: Build all features/targets 65 | if: ${{ matrix.rust == 'nightly' }} 66 | run: cargo build --all-features --all-targets 67 | 68 | - name: Build marked-cli (all features) 69 | working-directory: marked-cli 70 | run: cargo build --all-features 71 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | -------------------------------------------------------------------------------- /Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 
3 | [[package]] 4 | name = "bitflags" 5 | version = "1.2.1" 6 | source = "registry+https://github.com/rust-lang/crates.io-index" 7 | checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693" 8 | 9 | [[package]] 10 | name = "cfg-if" 11 | version = "1.0.0" 12 | source = "registry+https://github.com/rust-lang/crates.io-index" 13 | checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" 14 | 15 | [[package]] 16 | name = "clap" 17 | version = "2.33.3" 18 | source = "registry+https://github.com/rust-lang/crates.io-index" 19 | checksum = "37e58ac78573c40708d45522f0d80fa2f01cc4f9b4e2bf749807255454312002" 20 | dependencies = [ 21 | "bitflags", 22 | "term_size", 23 | "textwrap", 24 | "unicode-width", 25 | ] 26 | 27 | [[package]] 28 | name = "encoding_rs" 29 | version = "0.8.26" 30 | source = "registry+https://github.com/rust-lang/crates.io-index" 31 | checksum = "801bbab217d7f79c0062f4f7205b5d4427c6d1a7bd7aafdd1475f7c59d62b283" 32 | dependencies = [ 33 | "cfg-if", 34 | ] 35 | 36 | [[package]] 37 | name = "futf" 38 | version = "0.1.4" 39 | source = "registry+https://github.com/rust-lang/crates.io-index" 40 | checksum = "7c9c1ce3fa9336301af935ab852c437817d14cd33690446569392e65170aac3b" 41 | dependencies = [ 42 | "mac", 43 | "new_debug_unreachable", 44 | ] 45 | 46 | [[package]] 47 | name = "getrandom" 48 | version = "0.1.16" 49 | source = "registry+https://github.com/rust-lang/crates.io-index" 50 | checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce" 51 | dependencies = [ 52 | "cfg-if", 53 | "libc", 54 | "wasi", 55 | ] 56 | 57 | [[package]] 58 | name = "html5ever" 59 | version = "0.25.1" 60 | source = "registry+https://github.com/rust-lang/crates.io-index" 61 | checksum = "aafcf38a1a36118242d29b92e1b08ef84e67e4a5ed06e0a80be20e6a32bfed6b" 62 | dependencies = [ 63 | "log", 64 | "mac", 65 | "markup5ever", 66 | "proc-macro2", 67 | "quote", 68 | "syn", 69 | ] 70 | 71 | [[package]] 72 | name = "itoa" 73 | version = 
"0.4.7" 74 | source = "registry+https://github.com/rust-lang/crates.io-index" 75 | checksum = "dd25036021b0de88a0aff6b850051563c6516d0bf53f8638938edbb9de732736" 76 | 77 | [[package]] 78 | name = "lazy_static" 79 | version = "1.4.0" 80 | source = "registry+https://github.com/rust-lang/crates.io-index" 81 | checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" 82 | 83 | [[package]] 84 | name = "libc" 85 | version = "0.2.82" 86 | source = "registry+https://github.com/rust-lang/crates.io-index" 87 | checksum = "89203f3fba0a3795506acaad8ebce3c80c0af93f994d5a1d7a0b1eeb23271929" 88 | 89 | [[package]] 90 | name = "log" 91 | version = "0.4.14" 92 | source = "registry+https://github.com/rust-lang/crates.io-index" 93 | checksum = "51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710" 94 | dependencies = [ 95 | "cfg-if", 96 | ] 97 | 98 | [[package]] 99 | name = "mac" 100 | version = "0.1.1" 101 | source = "registry+https://github.com/rust-lang/crates.io-index" 102 | checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" 103 | 104 | [[package]] 105 | name = "marked" 106 | version = "0.3.0" 107 | dependencies = [ 108 | "encoding_rs", 109 | "html5ever", 110 | "lazy_static", 111 | "log", 112 | "markup5ever_rcdom", 113 | "mime", 114 | "rand", 115 | "string_cache", 116 | "tendril", 117 | "xml-rs", 118 | ] 119 | 120 | [[package]] 121 | name = "marked-cli" 122 | version = "0.3.1" 123 | dependencies = [ 124 | "clap", 125 | "encoding_rs", 126 | "html5ever", 127 | "log", 128 | "marked", 129 | ] 130 | 131 | [[package]] 132 | name = "markup5ever" 133 | version = "0.10.0" 134 | source = "registry+https://github.com/rust-lang/crates.io-index" 135 | checksum = "aae38d669396ca9b707bfc3db254bc382ddb94f57cc5c235f34623a669a01dab" 136 | dependencies = [ 137 | "log", 138 | "phf", 139 | "phf_codegen", 140 | "serde", 141 | "serde_derive", 142 | "serde_json", 143 | "string_cache", 144 | "string_cache_codegen", 145 | "tendril", 146 | ] 147 | 
148 | [[package]] 149 | name = "markup5ever_rcdom" 150 | version = "0.1.0" 151 | source = "git+https://github.com/dekellum/html5ever?branch=rcdom#14e6e4be4299d3940bed8b91de88241cbbc81d56" 152 | dependencies = [ 153 | "markup5ever", 154 | "tendril", 155 | ] 156 | 157 | [[package]] 158 | name = "mime" 159 | version = "0.3.16" 160 | source = "registry+https://github.com/rust-lang/crates.io-index" 161 | checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d" 162 | 163 | [[package]] 164 | name = "new_debug_unreachable" 165 | version = "1.0.4" 166 | source = "registry+https://github.com/rust-lang/crates.io-index" 167 | checksum = "e4a24736216ec316047a1fc4252e27dabb04218aa4a3f37c6e7ddbf1f9782b54" 168 | 169 | [[package]] 170 | name = "phf" 171 | version = "0.8.0" 172 | source = "registry+https://github.com/rust-lang/crates.io-index" 173 | checksum = "3dfb61232e34fcb633f43d12c58f83c1df82962dcdfa565a4e866ffc17dafe12" 174 | dependencies = [ 175 | "phf_shared", 176 | ] 177 | 178 | [[package]] 179 | name = "phf_codegen" 180 | version = "0.8.0" 181 | source = "registry+https://github.com/rust-lang/crates.io-index" 182 | checksum = "cbffee61585b0411840d3ece935cce9cb6321f01c45477d30066498cd5e1a815" 183 | dependencies = [ 184 | "phf_generator", 185 | "phf_shared", 186 | ] 187 | 188 | [[package]] 189 | name = "phf_generator" 190 | version = "0.8.0" 191 | source = "registry+https://github.com/rust-lang/crates.io-index" 192 | checksum = "17367f0cc86f2d25802b2c26ee58a7b23faeccf78a396094c13dced0d0182526" 193 | dependencies = [ 194 | "phf_shared", 195 | "rand", 196 | ] 197 | 198 | [[package]] 199 | name = "phf_shared" 200 | version = "0.8.0" 201 | source = "registry+https://github.com/rust-lang/crates.io-index" 202 | checksum = "c00cf8b9eafe68dde5e9eaa2cef8ee84a9336a47d566ec55ca16589633b65af7" 203 | dependencies = [ 204 | "siphasher", 205 | ] 206 | 207 | [[package]] 208 | name = "ppv-lite86" 209 | version = "0.2.10" 210 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 211 | checksum = "ac74c624d6b2d21f425f752262f42188365d7b8ff1aff74c82e45136510a4857" 212 | 213 | [[package]] 214 | name = "precomputed-hash" 215 | version = "0.1.1" 216 | source = "registry+https://github.com/rust-lang/crates.io-index" 217 | checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" 218 | 219 | [[package]] 220 | name = "proc-macro2" 221 | version = "1.0.24" 222 | source = "registry+https://github.com/rust-lang/crates.io-index" 223 | checksum = "1e0704ee1a7e00d7bb417d0770ea303c1bccbabf0ef1667dae92b5967f5f8a71" 224 | dependencies = [ 225 | "unicode-xid", 226 | ] 227 | 228 | [[package]] 229 | name = "quote" 230 | version = "1.0.8" 231 | source = "registry+https://github.com/rust-lang/crates.io-index" 232 | checksum = "991431c3519a3f36861882da93630ce66b52918dcf1b8e2fd66b397fc96f28df" 233 | dependencies = [ 234 | "proc-macro2", 235 | ] 236 | 237 | [[package]] 238 | name = "rand" 239 | version = "0.7.3" 240 | source = "registry+https://github.com/rust-lang/crates.io-index" 241 | checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" 242 | dependencies = [ 243 | "getrandom", 244 | "libc", 245 | "rand_chacha", 246 | "rand_core", 247 | "rand_hc", 248 | "rand_pcg", 249 | ] 250 | 251 | [[package]] 252 | name = "rand_chacha" 253 | version = "0.2.2" 254 | source = "registry+https://github.com/rust-lang/crates.io-index" 255 | checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402" 256 | dependencies = [ 257 | "ppv-lite86", 258 | "rand_core", 259 | ] 260 | 261 | [[package]] 262 | name = "rand_core" 263 | version = "0.5.1" 264 | source = "registry+https://github.com/rust-lang/crates.io-index" 265 | checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" 266 | dependencies = [ 267 | "getrandom", 268 | ] 269 | 270 | [[package]] 271 | name = "rand_hc" 272 | version = "0.2.0" 273 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 274 | checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c" 275 | dependencies = [ 276 | "rand_core", 277 | ] 278 | 279 | [[package]] 280 | name = "rand_pcg" 281 | version = "0.2.1" 282 | source = "registry+https://github.com/rust-lang/crates.io-index" 283 | checksum = "16abd0c1b639e9eb4d7c50c0b8100b0d0f849be2349829c740fe8e6eb4816429" 284 | dependencies = [ 285 | "rand_core", 286 | ] 287 | 288 | [[package]] 289 | name = "ryu" 290 | version = "1.0.5" 291 | source = "registry+https://github.com/rust-lang/crates.io-index" 292 | checksum = "71d301d4193d031abdd79ff7e3dd721168a9572ef3fe51a1517aba235bd8f86e" 293 | 294 | [[package]] 295 | name = "serde" 296 | version = "1.0.118" 297 | source = "registry+https://github.com/rust-lang/crates.io-index" 298 | checksum = "06c64263859d87aa2eb554587e2d23183398d617427327cf2b3d0ed8c69e4800" 299 | 300 | [[package]] 301 | name = "serde_derive" 302 | version = "1.0.118" 303 | source = "registry+https://github.com/rust-lang/crates.io-index" 304 | checksum = "c84d3526699cd55261af4b941e4e725444df67aa4f9e6a3564f18030d12672df" 305 | dependencies = [ 306 | "proc-macro2", 307 | "quote", 308 | "syn", 309 | ] 310 | 311 | [[package]] 312 | name = "serde_json" 313 | version = "1.0.61" 314 | source = "registry+https://github.com/rust-lang/crates.io-index" 315 | checksum = "4fceb2595057b6891a4ee808f70054bd2d12f0e97f1cbb78689b59f676df325a" 316 | dependencies = [ 317 | "itoa", 318 | "ryu", 319 | "serde", 320 | ] 321 | 322 | [[package]] 323 | name = "siphasher" 324 | version = "0.3.3" 325 | source = "registry+https://github.com/rust-lang/crates.io-index" 326 | checksum = "fa8f3741c7372e75519bd9346068370c9cdaabcc1f9599cbcf2a2719352286b7" 327 | 328 | [[package]] 329 | name = "string_cache" 330 | version = "0.8.1" 331 | source = "registry+https://github.com/rust-lang/crates.io-index" 332 | checksum = "8ddb1139b5353f96e429e1a5e19fbaf663bddedaa06d1dbd49f82e352601209a" 333 | 
dependencies = [ 334 | "lazy_static", 335 | "new_debug_unreachable", 336 | "phf_shared", 337 | "precomputed-hash", 338 | "serde", 339 | ] 340 | 341 | [[package]] 342 | name = "string_cache_codegen" 343 | version = "0.5.1" 344 | source = "registry+https://github.com/rust-lang/crates.io-index" 345 | checksum = "f24c8e5e19d22a726626f1a5e16fe15b132dcf21d10177fa5a45ce7962996b97" 346 | dependencies = [ 347 | "phf_generator", 348 | "phf_shared", 349 | "proc-macro2", 350 | "quote", 351 | ] 352 | 353 | [[package]] 354 | name = "syn" 355 | version = "1.0.58" 356 | source = "registry+https://github.com/rust-lang/crates.io-index" 357 | checksum = "cc60a3d73ea6594cd712d830cc1f0390fd71542d8c8cd24e70cc54cdfd5e05d5" 358 | dependencies = [ 359 | "proc-macro2", 360 | "quote", 361 | "unicode-xid", 362 | ] 363 | 364 | [[package]] 365 | name = "tendril" 366 | version = "0.4.2" 367 | source = "registry+https://github.com/rust-lang/crates.io-index" 368 | checksum = "a9ef557cb397a4f0a5a3a628f06515f78563f2209e64d47055d9dc6052bf5e33" 369 | dependencies = [ 370 | "encoding_rs", 371 | "futf", 372 | "mac", 373 | "utf-8", 374 | ] 375 | 376 | [[package]] 377 | name = "term_size" 378 | version = "0.3.2" 379 | source = "registry+https://github.com/rust-lang/crates.io-index" 380 | checksum = "1e4129646ca0ed8f45d09b929036bafad5377103edd06e50bf574b353d2b08d9" 381 | dependencies = [ 382 | "libc", 383 | "winapi", 384 | ] 385 | 386 | [[package]] 387 | name = "textwrap" 388 | version = "0.11.0" 389 | source = "registry+https://github.com/rust-lang/crates.io-index" 390 | checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" 391 | dependencies = [ 392 | "term_size", 393 | "unicode-width", 394 | ] 395 | 396 | [[package]] 397 | name = "unicode-width" 398 | version = "0.1.8" 399 | source = "registry+https://github.com/rust-lang/crates.io-index" 400 | checksum = "9337591893a19b88d8d87f2cec1e73fad5cdfd10e5a6f349f498ad6ea2ffb1e3" 401 | 402 | [[package]] 403 | name = "unicode-xid" 404 | 
version = "0.2.1" 405 | source = "registry+https://github.com/rust-lang/crates.io-index" 406 | checksum = "f7fe0bb3479651439c9112f72b6c505038574c9fbb575ed1bf3b797fa39dd564" 407 | 408 | [[package]] 409 | name = "utf-8" 410 | version = "0.7.5" 411 | source = "registry+https://github.com/rust-lang/crates.io-index" 412 | checksum = "05e42f7c18b8f902290b009cde6d651262f956c98bc51bca4cd1d511c9cd85c7" 413 | 414 | [[package]] 415 | name = "wasi" 416 | version = "0.9.0+wasi-snapshot-preview1" 417 | source = "registry+https://github.com/rust-lang/crates.io-index" 418 | checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" 419 | 420 | [[package]] 421 | name = "winapi" 422 | version = "0.3.9" 423 | source = "registry+https://github.com/rust-lang/crates.io-index" 424 | checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" 425 | dependencies = [ 426 | "winapi-i686-pc-windows-gnu", 427 | "winapi-x86_64-pc-windows-gnu", 428 | ] 429 | 430 | [[package]] 431 | name = "winapi-i686-pc-windows-gnu" 432 | version = "0.4.0" 433 | source = "registry+https://github.com/rust-lang/crates.io-index" 434 | checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" 435 | 436 | [[package]] 437 | name = "winapi-x86_64-pc-windows-gnu" 438 | version = "0.4.0" 439 | source = "registry+https://github.com/rust-lang/crates.io-index" 440 | checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" 441 | 442 | [[package]] 443 | name = "xml-rs" 444 | version = "0.8.3" 445 | source = "registry+https://github.com/rust-lang/crates.io-index" 446 | checksum = "b07db065a5cf61a7e4ba64f29e67db906fb1787316516c4e6e5ff0fea1efcd8a" 447 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | members = [ "marked", "marked-cli" ] 3 | exclude = [ "ammonia-compare", "marked-sanitizer" ] 4 | 5 | 
[patch.crates-io] 6 | "marked" = { path = "marked" } 7 | 8 | [profile.release] 9 | lto = "thin" 10 | incremental = false 11 | 12 | [profile.bench] 13 | lto = "thin" 14 | incremental = false 15 | -------------------------------------------------------------------------------- /LICENSE-APACHE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 
35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | Copyright © 2020 David Kellum 2 | Copyright © 2019 the Märkəd authors 3 | Copyright © 2018 (the Kuchiki authors) 4 | Copyright © 2017 (the Victor authors) 5 | Copyright © 2014-2017 the html5ever project developers 6 | 7 | Permission is hereby granted, free of charge, to any 8 | person obtaining a copy of this software and associated 9 | documentation files (the "Software"), to deal in the 10 | Software without restriction, including without 11 | limitation the rights to use, copy, modify, merge, 12 | publish, distribute, sublicense, and/or sell copies of 13 | the Software, and to permit persons to whom the Software 14 | is furnished to do so, subject to the following 15 | conditions: 16 | 17 | The above copyright notice and this permission notice 18 | shall be included in all copies or substantial portions 19 | of the Software. 
20 | 21 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF 22 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 23 | TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 24 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT 25 | SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 26 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 27 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR 28 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 29 | DEALINGS IN THE SOFTWARE. 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # The Märkəd Project 2 | 3 | [](https://deps.rs/repo/github/dekellum/marked) 4 | [](https://github.com/dekellum/marked/actions?query=workflow%3ACI) 5 | 6 | A Rust language project for parsing, filtering, selecting and serializing HTML 7 | and XML mark-up. 8 | 9 | See the _[marked]_ or _[marked-cli]_ crates, or the README(s) and 10 | CHANGELOG(s) under this ([github hosted]) source tree and cargo workspace. 11 | 12 | ## Feature Overview 13 | 14 | Currently implemented features: 15 | 16 | ### A vector-allocated, indexed, DOM-like tree structure 17 | 18 | The `marked::Document` is a DOM-like tree structure suitable for HTML and 19 | XML. This was forked from the _[victor]_ project (same author as _html5ever_) 20 | and further optimized. It is implemented as a (std) `Vec` of `Node` types, 21 | which reference parent, siblings and children via (std) `NonZeroU32` indexes 22 | for space efficiency. 23 | 24 | ### _html5ever_ integration 25 | 26 | Including HTML5 document and fragment parsing and HTML5 serialization (mark-up 27 | output). 
With the `marked::Document` (DOM), parsing and serialization are 28 | measurably faster (see benchmarks in source tree) than the `RcDom` previously 29 | included with *html5ever* associated crates, and mutating the `Document` is 30 | more straightforward, via a mutable reference. 31 | 32 | ### _xml-rs_ integration 33 | 34 | Strict, UTF-8 XML parsing to `marked::Document` is currently supported by 35 | integration of the _[xml-rs]_ crate. 36 | 37 | ### Legacy character encoding support 38 | 39 | An estimated 5% of the web remains in encodings other than UTF-8; too common to 40 | be treated as an error. Via `marked::html::parse_buffered`: 41 | 42 | * Decoding via _encoding_rs_ which implements _[The Encoding Standard]_ including 43 | alternative names (labels) for supported encodings. 44 | 45 | * HTML5 parsing restart from initial (4k) buffer with new encoding hints 46 | obtained from \<head\>/\<meta\>
`charset` or an `http-equiv` `content-type` with 47 | charset. 48 | 49 | * Byte-Order-Mark BOM sniffing as high priority `EncodingHint` for UTF-8, UTF-16 50 | Big-Endian and UTF-16 Little-Endian. 51 | 52 | * "Impossible" hints from the above are ignored. For example, if we read a hint 53 | from UTF-8 that says it's UTF-16LE (which would make it impossible to 54 | read the same hint if it was used). 55 | 56 | (Note that the _detection_ features are not currently provided by _html5ever_ and 57 | associated crates.) 58 | 59 | ### Rust "selectors" API 60 | 61 | A `NodeRef` type with "CSS selectors"-like methods to recursively `select` and 62 | `find` elements using closure predicates. We prefer direct rust language 63 | compiler support for writing such selection logic, over CSS or other 64 | interpreted DSL. 65 | 66 | ### HTML tag and attribute metadata 67 | 68 | See `marked::html::t` (tags) and `marked::html::a` (attributes) modules. 69 | 70 | ### Tree walking filters API 71 | 72 | Bulk modifications to the DOM are easily and efficiently achieved with mutating 73 | filter functions/closures and a tree walker (depth or breadth-first) 74 | implementation in _marked_. This style of interface is sometimes called the 75 | "visitor pattern". See `Document::filter_at` for details. The crate also 76 | includes the following built-in filters (a partial list): 77 | 78 | `detach_banned_element` 79 | : `Detach` known banned (via metadata) and unknown elements 80 | 81 | `retain_basic_attributes` 82 | : Remove all attributes that are not part of the "basic" logical set (via metadata) 83 | 84 | `fold_empty_inline` 85 | : `Fold` empty or meaninglessly "inline" elements 86 | 87 | `text_normalize` 88 | : Normalize text nodes by merging, replacing control characters and minimizing white-space.
89 | 90 | An unreleased example, compatibility test and benchmark of _ammonia_ crate 91 | equivalent filtering (for hygiene and safety) is included in the source tree 92 | ([./ammonia-compare]). 93 | 94 | ## Roadmap 95 | 96 | Features incomplete or unstarted which may be included in this project in the 97 | future (PRs welcome): 98 | 99 | * Complete (faster, more correct, legacy encodings) strict-mode XML parsing 100 | 101 | * Lenient-mode XML parsing 102 | 103 | * Optional (opt-in) direct charset detection (initial read buffer or entire 104 | document) via something like [chardet], integrated as high priority 105 | _EncodingHint_. 106 | 107 | * XML/HTML pretty-indenting serialization (combines well with the existing white-space 108 | normalization features) 109 | 110 | * XML (and XHTML) serialization 111 | 112 | ## License 113 | 114 | This project is dual licensed under either of the following: 115 | 116 | * The Apache License, version 2.0 117 | ([LICENSE-APACHE] or http://www.apache.org/licenses/LICENSE-2.0) 118 | 119 | * The MIT License 120 | ([LICENSE-MIT] or http://opensource.org/licenses/MIT) 121 | 122 | ### Contribution 123 | 124 | Unless you explicitly state otherwise, any contribution intentionally submitted 125 | for inclusion in the _märkəd_ project by you, as defined by the Apache License, 126 | shall be dual licensed as above, without any additional terms or conditions.
127 | 128 | [github hosted]: https://github.com/dekellum/marked 129 | [marked]: https://docs.rs/crate/marked 130 | [marked-cli]: https://crates.io/crates/marked-cli 131 | [The Encoding Standard]: https://encoding.spec.whatwg.org/ 132 | [./ammonia-compare]: https://github.com/dekellum/marked/tree/main/ammonia-compare 133 | [victor]: https://github.com/SimonSapin/victor 134 | [chardet]: https://crates.io/crates/chardet 135 | [xml-rs]: https://crates.io/crates/xml-rs 136 | [LICENSE-APACHE]: https://github.com/dekellum/marked/tree/main/LICENSE-APACHE 137 | [LICENSE-MIT]: https://github.com/dekellum/marked/tree/main/LICENSE-MIT 138 | -------------------------------------------------------------------------------- /ammonia-compare/Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 3 | [[package]] 4 | name = "ammonia" 5 | version = "3.1.0" 6 | source = "registry+https://github.com/rust-lang/crates.io-index" 7 | checksum = "89eac85170f4b3fb3dc5e442c1cfb036cb8eecf9dbbd431a161ffad15d90ea3b" 8 | dependencies = [ 9 | "html5ever", 10 | "lazy_static", 11 | "maplit", 12 | "markup5ever_rcdom", 13 | "matches", 14 | "tendril", 15 | "url", 16 | ] 17 | 18 | [[package]] 19 | name = "ammonia-compare" 20 | version = "0.0.1" 21 | dependencies = [ 22 | "ammonia", 23 | "marked", 24 | ] 25 | 26 | [[package]] 27 | name = "cfg-if" 28 | version = "0.1.10" 29 | source = "registry+https://github.com/rust-lang/crates.io-index" 30 | checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" 31 | 32 | [[package]] 33 | name = "cfg-if" 34 | version = "1.0.0" 35 | source = "registry+https://github.com/rust-lang/crates.io-index" 36 | checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" 37 | 38 | [[package]] 39 | name = "encoding_rs" 40 | version = "0.8.26" 41 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 42 | checksum = "801bbab217d7f79c0062f4f7205b5d4427c6d1a7bd7aafdd1475f7c59d62b283" 43 | dependencies = [ 44 | "cfg-if 1.0.0", 45 | ] 46 | 47 | [[package]] 48 | name = "form_urlencoded" 49 | version = "1.0.0" 50 | source = "registry+https://github.com/rust-lang/crates.io-index" 51 | checksum = "ece68d15c92e84fa4f19d3780f1294e5ca82a78a6d515f1efaabcc144688be00" 52 | dependencies = [ 53 | "matches", 54 | "percent-encoding", 55 | ] 56 | 57 | [[package]] 58 | name = "futf" 59 | version = "0.1.4" 60 | source = "registry+https://github.com/rust-lang/crates.io-index" 61 | checksum = "7c9c1ce3fa9336301af935ab852c437817d14cd33690446569392e65170aac3b" 62 | dependencies = [ 63 | "mac", 64 | "new_debug_unreachable", 65 | ] 66 | 67 | [[package]] 68 | name = "getrandom" 69 | version = "0.1.16" 70 | source = "registry+https://github.com/rust-lang/crates.io-index" 71 | checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce" 72 | dependencies = [ 73 | "cfg-if 1.0.0", 74 | "libc", 75 | "wasi 0.9.0+wasi-snapshot-preview1", 76 | ] 77 | 78 | [[package]] 79 | name = "html5ever" 80 | version = "0.25.1" 81 | source = "registry+https://github.com/rust-lang/crates.io-index" 82 | checksum = "aafcf38a1a36118242d29b92e1b08ef84e67e4a5ed06e0a80be20e6a32bfed6b" 83 | dependencies = [ 84 | "log", 85 | "mac", 86 | "markup5ever", 87 | "proc-macro2", 88 | "quote", 89 | "syn", 90 | ] 91 | 92 | [[package]] 93 | name = "idna" 94 | version = "0.2.0" 95 | source = "registry+https://github.com/rust-lang/crates.io-index" 96 | checksum = "02e2673c30ee86b5b96a9cb52ad15718aa1f966f5ab9ad54a8b95d5ca33120a9" 97 | dependencies = [ 98 | "matches", 99 | "unicode-bidi", 100 | "unicode-normalization", 101 | ] 102 | 103 | [[package]] 104 | name = "itoa" 105 | version = "0.4.7" 106 | source = "registry+https://github.com/rust-lang/crates.io-index" 107 | checksum = "dd25036021b0de88a0aff6b850051563c6516d0bf53f8638938edbb9de732736" 108 | 109 | 
[[package]] 110 | name = "lazy_static" 111 | version = "1.4.0" 112 | source = "registry+https://github.com/rust-lang/crates.io-index" 113 | checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" 114 | 115 | [[package]] 116 | name = "libc" 117 | version = "0.2.82" 118 | source = "registry+https://github.com/rust-lang/crates.io-index" 119 | checksum = "89203f3fba0a3795506acaad8ebce3c80c0af93f994d5a1d7a0b1eeb23271929" 120 | 121 | [[package]] 122 | name = "log" 123 | version = "0.4.11" 124 | source = "registry+https://github.com/rust-lang/crates.io-index" 125 | checksum = "4fabed175da42fed1fa0746b0ea71f412aa9d35e76e95e59b192c64b9dc2bf8b" 126 | dependencies = [ 127 | "cfg-if 0.1.10", 128 | ] 129 | 130 | [[package]] 131 | name = "mac" 132 | version = "0.1.1" 133 | source = "registry+https://github.com/rust-lang/crates.io-index" 134 | checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" 135 | 136 | [[package]] 137 | name = "maplit" 138 | version = "1.0.2" 139 | source = "registry+https://github.com/rust-lang/crates.io-index" 140 | checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d" 141 | 142 | [[package]] 143 | name = "marked" 144 | version = "0.3.0" 145 | dependencies = [ 146 | "encoding_rs", 147 | "html5ever", 148 | "lazy_static", 149 | "log", 150 | "mime", 151 | "string_cache", 152 | "tendril", 153 | ] 154 | 155 | [[package]] 156 | name = "markup5ever" 157 | version = "0.10.0" 158 | source = "registry+https://github.com/rust-lang/crates.io-index" 159 | checksum = "aae38d669396ca9b707bfc3db254bc382ddb94f57cc5c235f34623a669a01dab" 160 | dependencies = [ 161 | "log", 162 | "phf", 163 | "phf_codegen", 164 | "serde", 165 | "serde_derive", 166 | "serde_json", 167 | "string_cache", 168 | "string_cache_codegen", 169 | "tendril", 170 | ] 171 | 172 | [[package]] 173 | name = "markup5ever_rcdom" 174 | version = "0.1.0" 175 | source = "registry+https://github.com/rust-lang/crates.io-index" 176 | checksum = 
"f015da43bcd8d4f144559a3423f4591d69b8ce0652c905374da7205df336ae2b" 177 | dependencies = [ 178 | "html5ever", 179 | "markup5ever", 180 | "tendril", 181 | "xml5ever", 182 | ] 183 | 184 | [[package]] 185 | name = "matches" 186 | version = "0.1.8" 187 | source = "registry+https://github.com/rust-lang/crates.io-index" 188 | checksum = "7ffc5c5338469d4d3ea17d269fa8ea3512ad247247c30bd2df69e68309ed0a08" 189 | 190 | [[package]] 191 | name = "mime" 192 | version = "0.3.16" 193 | source = "registry+https://github.com/rust-lang/crates.io-index" 194 | checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d" 195 | 196 | [[package]] 197 | name = "new_debug_unreachable" 198 | version = "1.0.4" 199 | source = "registry+https://github.com/rust-lang/crates.io-index" 200 | checksum = "e4a24736216ec316047a1fc4252e27dabb04218aa4a3f37c6e7ddbf1f9782b54" 201 | 202 | [[package]] 203 | name = "percent-encoding" 204 | version = "2.1.0" 205 | source = "registry+https://github.com/rust-lang/crates.io-index" 206 | checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e" 207 | 208 | [[package]] 209 | name = "phf" 210 | version = "0.8.0" 211 | source = "registry+https://github.com/rust-lang/crates.io-index" 212 | checksum = "3dfb61232e34fcb633f43d12c58f83c1df82962dcdfa565a4e866ffc17dafe12" 213 | dependencies = [ 214 | "phf_shared", 215 | ] 216 | 217 | [[package]] 218 | name = "phf_codegen" 219 | version = "0.8.0" 220 | source = "registry+https://github.com/rust-lang/crates.io-index" 221 | checksum = "cbffee61585b0411840d3ece935cce9cb6321f01c45477d30066498cd5e1a815" 222 | dependencies = [ 223 | "phf_generator", 224 | "phf_shared", 225 | ] 226 | 227 | [[package]] 228 | name = "phf_generator" 229 | version = "0.8.0" 230 | source = "registry+https://github.com/rust-lang/crates.io-index" 231 | checksum = "17367f0cc86f2d25802b2c26ee58a7b23faeccf78a396094c13dced0d0182526" 232 | dependencies = [ 233 | "phf_shared", 234 | "rand", 235 | ] 236 | 237 | [[package]] 238 
| name = "phf_shared" 239 | version = "0.8.0" 240 | source = "registry+https://github.com/rust-lang/crates.io-index" 241 | checksum = "c00cf8b9eafe68dde5e9eaa2cef8ee84a9336a47d566ec55ca16589633b65af7" 242 | dependencies = [ 243 | "siphasher", 244 | ] 245 | 246 | [[package]] 247 | name = "ppv-lite86" 248 | version = "0.2.10" 249 | source = "registry+https://github.com/rust-lang/crates.io-index" 250 | checksum = "ac74c624d6b2d21f425f752262f42188365d7b8ff1aff74c82e45136510a4857" 251 | 252 | [[package]] 253 | name = "precomputed-hash" 254 | version = "0.1.1" 255 | source = "registry+https://github.com/rust-lang/crates.io-index" 256 | checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" 257 | 258 | [[package]] 259 | name = "proc-macro2" 260 | version = "1.0.24" 261 | source = "registry+https://github.com/rust-lang/crates.io-index" 262 | checksum = "1e0704ee1a7e00d7bb417d0770ea303c1bccbabf0ef1667dae92b5967f5f8a71" 263 | dependencies = [ 264 | "unicode-xid", 265 | ] 266 | 267 | [[package]] 268 | name = "quote" 269 | version = "1.0.8" 270 | source = "registry+https://github.com/rust-lang/crates.io-index" 271 | checksum = "991431c3519a3f36861882da93630ce66b52918dcf1b8e2fd66b397fc96f28df" 272 | dependencies = [ 273 | "proc-macro2", 274 | ] 275 | 276 | [[package]] 277 | name = "rand" 278 | version = "0.7.3" 279 | source = "registry+https://github.com/rust-lang/crates.io-index" 280 | checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" 281 | dependencies = [ 282 | "getrandom", 283 | "libc", 284 | "rand_chacha", 285 | "rand_core", 286 | "rand_hc", 287 | "rand_pcg", 288 | ] 289 | 290 | [[package]] 291 | name = "rand_chacha" 292 | version = "0.2.2" 293 | source = "registry+https://github.com/rust-lang/crates.io-index" 294 | checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402" 295 | dependencies = [ 296 | "ppv-lite86", 297 | "rand_core", 298 | ] 299 | 300 | [[package]] 301 | name = "rand_core" 302 | version 
= "0.5.1" 303 | source = "registry+https://github.com/rust-lang/crates.io-index" 304 | checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" 305 | dependencies = [ 306 | "getrandom", 307 | ] 308 | 309 | [[package]] 310 | name = "rand_hc" 311 | version = "0.2.0" 312 | source = "registry+https://github.com/rust-lang/crates.io-index" 313 | checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c" 314 | dependencies = [ 315 | "rand_core", 316 | ] 317 | 318 | [[package]] 319 | name = "rand_pcg" 320 | version = "0.2.1" 321 | source = "registry+https://github.com/rust-lang/crates.io-index" 322 | checksum = "16abd0c1b639e9eb4d7c50c0b8100b0d0f849be2349829c740fe8e6eb4816429" 323 | dependencies = [ 324 | "rand_core", 325 | ] 326 | 327 | [[package]] 328 | name = "ryu" 329 | version = "1.0.5" 330 | source = "registry+https://github.com/rust-lang/crates.io-index" 331 | checksum = "71d301d4193d031abdd79ff7e3dd721168a9572ef3fe51a1517aba235bd8f86e" 332 | 333 | [[package]] 334 | name = "serde" 335 | version = "1.0.118" 336 | source = "registry+https://github.com/rust-lang/crates.io-index" 337 | checksum = "06c64263859d87aa2eb554587e2d23183398d617427327cf2b3d0ed8c69e4800" 338 | 339 | [[package]] 340 | name = "serde_derive" 341 | version = "1.0.118" 342 | source = "registry+https://github.com/rust-lang/crates.io-index" 343 | checksum = "c84d3526699cd55261af4b941e4e725444df67aa4f9e6a3564f18030d12672df" 344 | dependencies = [ 345 | "proc-macro2", 346 | "quote", 347 | "syn", 348 | ] 349 | 350 | [[package]] 351 | name = "serde_json" 352 | version = "1.0.61" 353 | source = "registry+https://github.com/rust-lang/crates.io-index" 354 | checksum = "4fceb2595057b6891a4ee808f70054bd2d12f0e97f1cbb78689b59f676df325a" 355 | dependencies = [ 356 | "itoa", 357 | "ryu", 358 | "serde", 359 | ] 360 | 361 | [[package]] 362 | name = "siphasher" 363 | version = "0.3.3" 364 | source = "registry+https://github.com/rust-lang/crates.io-index" 365 | checksum = 
"fa8f3741c7372e75519bd9346068370c9cdaabcc1f9599cbcf2a2719352286b7" 366 | 367 | [[package]] 368 | name = "string_cache" 369 | version = "0.8.1" 370 | source = "registry+https://github.com/rust-lang/crates.io-index" 371 | checksum = "8ddb1139b5353f96e429e1a5e19fbaf663bddedaa06d1dbd49f82e352601209a" 372 | dependencies = [ 373 | "lazy_static", 374 | "new_debug_unreachable", 375 | "phf_shared", 376 | "precomputed-hash", 377 | "serde", 378 | ] 379 | 380 | [[package]] 381 | name = "string_cache_codegen" 382 | version = "0.5.1" 383 | source = "registry+https://github.com/rust-lang/crates.io-index" 384 | checksum = "f24c8e5e19d22a726626f1a5e16fe15b132dcf21d10177fa5a45ce7962996b97" 385 | dependencies = [ 386 | "phf_generator", 387 | "phf_shared", 388 | "proc-macro2", 389 | "quote", 390 | ] 391 | 392 | [[package]] 393 | name = "syn" 394 | version = "1.0.58" 395 | source = "registry+https://github.com/rust-lang/crates.io-index" 396 | checksum = "cc60a3d73ea6594cd712d830cc1f0390fd71542d8c8cd24e70cc54cdfd5e05d5" 397 | dependencies = [ 398 | "proc-macro2", 399 | "quote", 400 | "unicode-xid", 401 | ] 402 | 403 | [[package]] 404 | name = "tendril" 405 | version = "0.4.1" 406 | source = "registry+https://github.com/rust-lang/crates.io-index" 407 | checksum = "707feda9f2582d5d680d733e38755547a3e8fb471e7ba11452ecfd9ce93a5d3b" 408 | dependencies = [ 409 | "encoding_rs", 410 | "futf", 411 | "mac", 412 | "utf-8", 413 | ] 414 | 415 | [[package]] 416 | name = "time" 417 | version = "0.1.44" 418 | source = "registry+https://github.com/rust-lang/crates.io-index" 419 | checksum = "6db9e6914ab8b1ae1c260a4ae7a49b6c5611b40328a735b21862567685e73255" 420 | dependencies = [ 421 | "libc", 422 | "wasi 0.10.0+wasi-snapshot-preview1", 423 | "winapi", 424 | ] 425 | 426 | [[package]] 427 | name = "tinyvec" 428 | version = "1.1.0" 429 | source = "registry+https://github.com/rust-lang/crates.io-index" 430 | checksum = "ccf8dbc19eb42fba10e8feaaec282fb50e2c14b2726d6301dbfeed0f73306a6f" 431 | dependencies = [ 
432 | "tinyvec_macros", 433 | ] 434 | 435 | [[package]] 436 | name = "tinyvec_macros" 437 | version = "0.1.0" 438 | source = "registry+https://github.com/rust-lang/crates.io-index" 439 | checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" 440 | 441 | [[package]] 442 | name = "unicode-bidi" 443 | version = "0.3.4" 444 | source = "registry+https://github.com/rust-lang/crates.io-index" 445 | checksum = "49f2bd0c6468a8230e1db229cff8029217cf623c767ea5d60bfbd42729ea54d5" 446 | dependencies = [ 447 | "matches", 448 | ] 449 | 450 | [[package]] 451 | name = "unicode-normalization" 452 | version = "0.1.16" 453 | source = "registry+https://github.com/rust-lang/crates.io-index" 454 | checksum = "a13e63ab62dbe32aeee58d1c5408d35c36c392bba5d9d3142287219721afe606" 455 | dependencies = [ 456 | "tinyvec", 457 | ] 458 | 459 | [[package]] 460 | name = "unicode-xid" 461 | version = "0.2.1" 462 | source = "registry+https://github.com/rust-lang/crates.io-index" 463 | checksum = "f7fe0bb3479651439c9112f72b6c505038574c9fbb575ed1bf3b797fa39dd564" 464 | 465 | [[package]] 466 | name = "url" 467 | version = "2.2.0" 468 | source = "registry+https://github.com/rust-lang/crates.io-index" 469 | checksum = "5909f2b0817350449ed73e8bcd81c8c3c8d9a7a5d8acba4b27db277f1868976e" 470 | dependencies = [ 471 | "form_urlencoded", 472 | "idna", 473 | "matches", 474 | "percent-encoding", 475 | ] 476 | 477 | [[package]] 478 | name = "utf-8" 479 | version = "0.7.5" 480 | source = "registry+https://github.com/rust-lang/crates.io-index" 481 | checksum = "05e42f7c18b8f902290b009cde6d651262f956c98bc51bca4cd1d511c9cd85c7" 482 | 483 | [[package]] 484 | name = "wasi" 485 | version = "0.9.0+wasi-snapshot-preview1" 486 | source = "registry+https://github.com/rust-lang/crates.io-index" 487 | checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" 488 | 489 | [[package]] 490 | name = "wasi" 491 | version = "0.10.0+wasi-snapshot-preview1" 492 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 493 | checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f" 494 | 495 | [[package]] 496 | name = "winapi" 497 | version = "0.3.9" 498 | source = "registry+https://github.com/rust-lang/crates.io-index" 499 | checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" 500 | dependencies = [ 501 | "winapi-i686-pc-windows-gnu", 502 | "winapi-x86_64-pc-windows-gnu", 503 | ] 504 | 505 | [[package]] 506 | name = "winapi-i686-pc-windows-gnu" 507 | version = "0.4.0" 508 | source = "registry+https://github.com/rust-lang/crates.io-index" 509 | checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" 510 | 511 | [[package]] 512 | name = "winapi-x86_64-pc-windows-gnu" 513 | version = "0.4.0" 514 | source = "registry+https://github.com/rust-lang/crates.io-index" 515 | checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" 516 | 517 | [[package]] 518 | name = "xml5ever" 519 | version = "0.16.1" 520 | source = "registry+https://github.com/rust-lang/crates.io-index" 521 | checksum = "0b1b52e6e8614d4a58b8e70cf51ec0cc21b256ad8206708bcff8139b5bbd6a59" 522 | dependencies = [ 523 | "log", 524 | "mac", 525 | "markup5ever", 526 | "time", 527 | ] 528 | -------------------------------------------------------------------------------- /ammonia-compare/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "ammonia-compare" 3 | publish = false 4 | version = "0.0.1" 5 | authors = ["David Kellum¿De donde eres tú?
7 | 8 | 9 | -------------------------------------------------------------------------------- /marked/samples/documento_utf8_bom.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |¿De donde eres tú?
7 | 8 | 9 | -------------------------------------------------------------------------------- /marked/samples/documento_utf8_meta.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |¿De donde eres tú?
8 | 9 | 10 | -------------------------------------------------------------------------------- /marked/samples/documento_utf8_meta_utf16.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |¿De donde eres tú?
9 | 10 | 11 | -------------------------------------------------------------------------------- /marked/samples/documento_windows1252_meta.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dekellum/marked/1010a62cfd57e8ecd0ce505f57a3a4e744b1cba1/marked/samples/documento_windows1252_meta.html -------------------------------------------------------------------------------- /marked/samples/iro0094_shiftjis_meta.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dekellum/marked/1010a62cfd57e8ecd0ce505f57a3a4e744b1cba1/marked/samples/iro0094_shiftjis_meta.html -------------------------------------------------------------------------------- /marked/samples/matsunami_eucjp_meta.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dekellum/marked/1010a62cfd57e8ecd0ce505f57a3a4e744b1cba1/marked/samples/matsunami_eucjp_meta.html -------------------------------------------------------------------------------- /marked/samples/russez_windows1251_meta.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dekellum/marked/1010a62cfd57e8ecd0ce505f57a3a4e744b1cba1/marked/samples/russez_windows1251_meta.html -------------------------------------------------------------------------------- /marked/src/chars.rs: -------------------------------------------------------------------------------- 1 | use tendril::StrTendril; 2 | 3 | /// Replace or remove sequences of white-space and/or control characters, and 4 | /// optionally remove leading/trailing spaces. 5 | /// 6 | /// _What_ char classes to replace is given via `ws` and `ctrl` flags. If a 7 | /// sequence is _all_ control or zero-width spaces, then it is simply removed 8 | /// (without replacement).
If there is at least one non-zero width white-space 9 | /// character then the sequence is replaced with U+0020 SPACE. The string (st) 10 | /// is only lazily re-allocated (replaced) if a change is required. 11 | pub(crate) fn replace_chars( 12 | st: &mut StrTendril, 13 | ws: bool, 14 | ctrl: bool, 15 | trim_start: bool, 16 | trim_end: bool) 17 | { 18 | let mut last = 0; 19 | let mut ost = None; // output, lazily allocated on the first replaced char 20 | let mut replacing = 0u8; 21 | 22 | let ins = st.as_ref(); 23 | for (i, ch) in ins.char_indices() { 24 | let rmask = replace_mask(ch, ws, ctrl); 25 | if rmask > 0 { 26 | if replacing == 0 { 27 | if ost.is_none() { 28 | ost = Some(StrTendril::new()); 29 | } 30 | ost.as_mut().unwrap().push_slice(&ins[last..i]); 31 | } 32 | replacing |= rmask; 33 | } else if replacing > 0 { 34 | if replacing >= 2 && 35 | (ost.as_ref().unwrap().len32() > 0 || !trim_start) 36 | { 37 | ost.as_mut().unwrap().push_char(' '); 38 | } 39 | last = i; 40 | replacing = 0; 41 | } 42 | } 43 | if replacing > 0 { 44 | if replacing >= 2 && !trim_end { 45 | ost.as_mut().unwrap().push_char(' '); 46 | } 47 | } else if ost.is_some() { 48 | ost.as_mut().unwrap().push_slice(&ins[last..]); 49 | } 50 | if ost.is_some() { 51 | *st = ost.take().unwrap(); 52 | } 53 | } 54 | 55 | // Compare CharClass to flags and return 1 (control or zero-width), 2 56 | // (whitespace), or 0 when the char is not to be replaced. 57 | fn replace_mask(c: char, ws: bool, ctrl: bool) -> u8 { 58 | use CharClass::*; 59 | match char_class(c) { 60 | ZeroSpace | Control if ctrl => 1, 61 | WhiteSpace if ws => 2, 62 | _ => 0, 63 | } 64 | } 65 | 66 | // Character classes of internal interest (not the same as Unicode classes). 67 | #[derive(Debug, Eq, PartialEq)] 68 | enum CharClass { 69 | Unclassified, 70 | WhiteSpace, 71 | ZeroSpace, 72 | Control, 73 | } 74 | 75 | /// True if all contained characters are classified as whitespace, zero-width or controls.
76 | pub(crate) fn is_all_ctrl_ws(st: &StrTendril) -> bool { 77 | st.as_ref().chars().all(|c| char_class(c) != CharClass::Unclassified) 78 | } 79 | 80 | // Return the CharClass for a char; any char not listed is Unclassified 81 | fn char_class(c: char) -> CharClass { 82 | use CharClass::*; 83 | match c { 84 | '\u{0000}'..='\u{0008}' => Control, // C0 (XML disallowed) 85 | '\u{0009}' | // HT 86 | '\u{000A}' | // LF 87 | '\u{000B}' => WhiteSpace, // VT 88 | '\u{000C}' => Control, // FF (C0) 89 | '\u{000D}' => WhiteSpace, // CR 90 | '\u{000E}'..='\u{001F}' => Control, // C0 91 | '\u{0020}' => WhiteSpace, // SPACE 92 | 93 | '\u{007F}' | // DEL (C0) 94 | '\u{0080}'..='\u{009F}' => Control, // C1 (XML disallowed) 95 | '\u{00A0}' => WhiteSpace, // NO-BREAK SPACE (NBSP) 96 | 97 | // Not always (zero) white; shows hyphen when line is wrapped. 98 | // '\u{00AD}' => Un- // SOFT HYPHEN, 99 | 100 | // Not white, rendered with a line: 101 | // '\u{1680}' => Un- // OGHAM SPACE MARK 102 | 103 | // Affects subsequent characters in Mongolian: 104 | // '\u{180E}' => Un- // MONGOLIAN VOWEL SEPARATOR 105 | 106 | '\u{2000}'..='\u{200A}' => WhiteSpace, // EN QUAD..HAIR SPACE 107 | '\u{200B}' | // ZERO WIDTH SPACE 108 | '\u{200C}' => ZeroSpace, // ZERO WIDTH NON-JOINER 109 | 110 | '\u{2028}' | // LINE SEPARATOR 111 | '\u{2029}' | // PARAGRAPH SEPARATOR 112 | 113 | '\u{202F}' | // NARROW NO-BREAK SPACE 114 | 115 | '\u{205F}' => WhiteSpace, // MEDIUM MATHEMATICAL SPACE 116 | '\u{2060}' => ZeroSpace, // WORD JOINER 117 | 118 | '\u{3000}' => WhiteSpace, // IDEOGRAPHIC SPACE 119 | 120 | '\u{FEFF}' => ZeroSpace, // BOM or ZERO WIDTH NON-BREAKING 121 | '\u{FFFE}' | // Bad BOM (not assigned) 122 | '\u{FFFF}' => Control, // Not assigned (invalid) 123 | _ => Unclassified, 124 | } 125 | 126 | // FIXME: see markup5ever/data/mod.rs: C1_REPLACEMENTS replaced with 127 | // alt higher unicode characters. This should occur _before_ above 128 | // transform, at least for HTML?
129 | } 130 | 131 | #[cfg(test)] 132 | mod tests { 133 | use super::*; 134 | use tendril::SliceExt; 135 | 136 | #[test] 137 | fn test_char_class() { 138 | use CharClass::*; 139 | assert_eq!(Unclassified, char_class('x')); 140 | assert_eq!(Control, char_class('\u{0008}')); 141 | assert_eq!(ZeroSpace, char_class('\u{2060}')); 142 | assert_eq!(WhiteSpace, char_class('\n')); 143 | assert_eq!(WhiteSpace, char_class('\n')); 144 | } 145 | 146 | #[test] 147 | fn replace() { 148 | assert_clean("", "" ); 149 | assert_clean("", "\u{2060}" ); 150 | assert_clean(" ", " "); 151 | assert_clean(" ", "\t \r\n"); 152 | 153 | assert_clean("x", "x" ); 154 | assert_clean(" x ", " x "); 155 | assert_clean(" x", " x\u{2060}" ); 156 | assert_clean("x ", "x " ); 157 | 158 | assert_clean("aa b ", "\u{009F}a\u{009F}a b " ); 159 | 160 | assert_clean("aa b c ", "aa b c " ); 161 | assert_clean("aa b c", "aa \t b c" ); 162 | assert_clean(" aa b c", "\t aa \t b c"); 163 | } 164 | 165 | // Assert that super-ASCII character boundaries are properly observed 166 | #[test] 167 | fn replace_multibyte() { 168 | assert_clean("Ψ", "Ψ" ); 169 | assert_clean(" Ψ ", " Ψ "); 170 | assert_clean(" Ψ", " Ψ\u{2060}" ); 171 | assert_clean("Ψ ", "Ψ " ); 172 | 173 | assert_clean("αα β ", "\u{009F}α\u{009F}α β " ); 174 | 175 | assert_clean("αα β γ ", "αα β γ " ); 176 | assert_clean("αα β γ", "αα \t β γ" ); 177 | assert_clean(" αα β γ", "\t αα \t β γ"); 178 | } 179 | 180 | #[test] 181 | fn replace_ctrl_only() { 182 | assert_clean_ctrl("", "" ); 183 | assert_clean_ctrl("", "\u{2060}" ); 184 | assert_clean_ctrl(" ", " "); 185 | 186 | assert_clean_ctrl("x", "x" ); 187 | assert_clean_ctrl(" x ", " x "); 188 | assert_clean_ctrl(" x", " x\u{2060}" ); 189 | assert_clean_ctrl("x ", "x " ); 190 | 191 | assert_clean_ctrl("aaa β ", "\u{009F}a\u{009F}aa β " ); 192 | 193 | assert_clean_ctrl("aa β c ", "aa β c " ); 194 | assert_clean_ctrl("aa \t β c", "aa \t β c" ); 195 | assert_clean_ctrl("\t aa \t β c", "\t aa \t β c"); 196 | } 
197 | 198 | #[test] 199 | fn replace_trim() { 200 | assert_clean_trim("", ""); 201 | assert_clean_trim("", "\t \r\n"); 202 | assert_clean_trim("", "\u{0000}"); //NUL 203 | assert_clean_trim("", "\u{FFFE}"); //BAD BOM 204 | assert_clean_trim("", "\u{00A0}\u{2007}\u{202F}"); 205 | 206 | assert_clean_trim("x", "x" ); 207 | assert_clean_trim("x", " x "); 208 | assert_clean_trim("x", " x" ); 209 | assert_clean_trim("x", "x " ); 210 | 211 | assert_clean_trim("aa b", " a\u{009F}a\u{009F} b " ); 212 | 213 | assert_clean_trim("aa b c", "aa b c " ); 214 | assert_clean_trim("aa b c", "aa \t b c" ); 215 | assert_clean_trim("aa b c", "\t aa \t b c"); 216 | } 217 | 218 | #[test] 219 | fn replace_trim_left() { 220 | assert_clean_trim_l("", ""); 221 | assert_clean_trim_l(" ", " "); 222 | assert_clean_trim_l(" ", "\t \r\n"); 223 | } 224 | 225 | #[test] 226 | fn replace_trim_right() { 227 | assert_clean_trim_r("", ""); 228 | assert_clean_trim_r("", " "); 229 | assert_clean_trim_r("", "\t \r\n"); 230 | } 231 | 232 | fn assert_clean_trim(exp: &str, src: &str) { 233 | let mut st = src.to_tendril(); 234 | replace_chars(&mut st, true, true, true, true); 235 | assert_eq!(exp, st.as_ref()); 236 | } 237 | 238 | fn assert_clean_trim_l(exp: &str, src: &str) { 239 | let mut st = src.to_tendril(); 240 | replace_chars(&mut st, true, true, true, false); 241 | assert_eq!(exp, st.as_ref()); 242 | } 243 | 244 | fn assert_clean_trim_r(exp: &str, src: &str) { 245 | let mut st = src.to_tendril(); 246 | replace_chars(&mut st, true, true, false, true); 247 | assert_eq!(exp, st.as_ref()); 248 | } 249 | 250 | fn assert_clean(exp: &str, src: &str) { 251 | let mut st = src.to_tendril(); 252 | replace_chars(&mut st, true, true, false, false); 253 | assert_eq!(exp, st.as_ref()); 254 | } 255 | 256 | fn assert_clean_ctrl(exp: &str, src: &str) { 257 | let mut st = src.to_tendril(); 258 | replace_chars(&mut st, false, true, false, false); 259 | assert_eq!(exp, st.as_ref()); 260 | } 261 | } 262 | 
-------------------------------------------------------------------------------- /marked/src/decode.rs: -------------------------------------------------------------------------------- 1 | // Copyright © 2019 David Kellum 2 | // 3 | // The `Decoder` implemented here was originally derived from 4 | // `tendril::stream::LossyDecoder` and `tendril::stream::TendrilSink` 5 | // (version 0.4.1) source as found: 6 | // 7 | // https://github.com/servo/tendril 8 | // Copyright © 2015 Keegan McAllister 9 | // Licensed under the Apache license v2.0, or the MIT license 10 | 11 | //! Support for streaming charset decoding. 12 | 13 | use std::borrow::Cow; 14 | use std::io; 15 | 16 | use log::trace; 17 | use encoding_rs as enc; 18 | use enc::DecoderResult; 19 | 20 | use tendril::{Tendril, TendrilSink, Atomicity, NonAtomic}; 21 | use tendril::fmt as form; 22 | use tendril::stream::Utf8LossyDecoder; 23 | 24 | mod encoding_hint; 25 | 26 | pub use encoding_hint::{ 27 | EncodingHint, SharedEncodingHint, 28 | }; 29 | 30 | use crate::READ_BUFFER_SIZE; 31 | 32 | /// A `TendrilSink` adaptor that takes bytes, decodes them as the given 33 | /// character encoding, while replacing any ill-formed byte sequences with 34 | /// U+FFFD replacement characters, and emits Unicode (`StrTendril`). 35 | /// 36 | /// This allocates new tendrils for encodings other than UTF-8. 37 | pub struct Decoder