├── crates ├── js │ ├── test-pages │ ├── .gitignore │ ├── src │ │ └── utils.rs │ ├── tests │ │ └── web.rs │ ├── LICENSE_MIT │ └── Cargo.toml ├── bench │ ├── test-pages │ ├── Cargo.toml │ └── benches │ │ └── parse.rs ├── lua │ └── Cargo.toml └── cli │ └── Cargo.toml ├── test-pages ├── alt │ ├── arxiv │ └── arstechnica │ │ └── expected-metadata.json ├── ok │ ├── 005-unescape-html-entities │ │ ├── expected.html │ │ ├── expected-metadata.json │ │ └── source.html │ ├── js-link-replacement │ │ ├── expected.html │ │ ├── expected-metadata.json │ │ └── source.html │ ├── ietf-1 │ │ └── expected-metadata.json │ ├── replace-brs │ │ ├── expected-metadata.json │ │ ├── expected.html │ │ └── source.html │ ├── qq │ │ ├── expected-metadata.json │ │ └── expected.html │ ├── hukumusume │ │ └── expected-metadata.json │ ├── 004-metadata-space-separated-properties │ │ ├── expected-metadata.json │ │ ├── expected.html │ │ └── source.html │ ├── gmw │ │ └── expected-metadata.json │ ├── 003-metadata-preferred │ │ ├── expected-metadata.json │ │ ├── expected.html │ │ └── source.html │ ├── table-style-attributes │ │ └── expected-metadata.json │ ├── 001 │ │ └── expected-metadata.json │ ├── v8-blog │ │ └── expected-metadata.json │ ├── keep-tabular-data │ │ └── expected-metadata.json │ ├── 002 │ │ └── expected-metadata.json │ ├── archive-of-our-own │ │ └── expected-metadata.json │ ├── dev418 │ │ └── expected-metadata.json │ ├── wikipedia-2 │ │ └── expected-metadata.json │ ├── la-nacion │ │ └── expected-metadata.json │ ├── ars-1 │ │ └── expected-metadata.json │ ├── cnn │ │ └── expected-metadata.json │ ├── aktualne │ │ └── expected-metadata.json │ ├── citylab-1 │ │ └── expected-metadata.json │ ├── videos-1 │ │ └── expected-metadata.json │ ├── tmz-1 │ │ ├── expected-metadata.json │ │ └── expected.html │ ├── clean-links │ │ └── expected-metadata.json │ ├── breitbart │ │ └── expected-metadata.json │ ├── medium-3 │ │ └── expected-metadata.json │ ├── medicalnewstoday │ │ └── expected-metadata.json │ ├── 
wikipedia-3 │ │ └── expected-metadata.json │ ├── wikia │ │ └── expected-metadata.json │ ├── wikipedia │ │ └── expected-metadata.json │ ├── aclu │ │ └── expected-metadata.json │ ├── lwn-1 │ │ └── expected-metadata.json │ ├── base-url │ │ ├── expected-metadata.json │ │ ├── expected.html │ │ └── source.html │ ├── replace-font-tags │ │ ├── expected-metadata.json │ │ ├── expected.html │ │ └── source.html │ ├── toc-missing │ │ └── expected-metadata.json │ ├── social-buttons │ │ ├── expected-metadata.json │ │ └── expected.html │ ├── base-url-base-element-relative │ │ ├── expected-metadata.json │ │ ├── expected.html │ │ └── source.html │ ├── ehow-1 │ │ └── expected-metadata.json │ └── engadget │ │ └── expected-metadata.json ├── readability │ ├── invalid-attributes │ │ ├── expected.html │ │ ├── expected-metadata.json │ │ └── source.html │ ├── ol │ │ ├── source.html │ │ ├── expected.html │ │ └── expected-metadata.json │ ├── lazy-image-3 │ │ ├── expected-metadata.json │ │ ├── expected.html │ │ └── source.html │ ├── rtl-1 │ │ ├── expected-metadata.json │ │ ├── expected.html │ │ └── source.html │ ├── rtl-2 │ │ ├── expected-metadata.json │ │ ├── expected.html │ │ └── source.html │ ├── rtl-3 │ │ ├── expected-metadata.json │ │ ├── expected.html │ │ └── source.html │ ├── rtl-4 │ │ ├── expected-metadata.json │ │ ├── expected.html │ │ └── source.html │ ├── metadata-content-missing │ │ ├── expected-metadata.json │ │ ├── expected.html │ │ └── source.html │ ├── mathjax │ │ └── expected-metadata.json │ ├── mercurial │ │ └── expected-metadata.json │ ├── daringfireball-1 │ │ └── expected-metadata.json │ ├── webmd-2 │ │ └── expected-metadata.json │ ├── basic-tags-cleaning │ │ ├── expected-metadata.json │ │ ├── expected.html │ │ └── source.html │ ├── remove-extra-brs │ │ ├── expected-metadata.json │ │ ├── expected.html │ │ └── source.html │ ├── remove-script-tags │ │ ├── expected-metadata.json │ │ ├── expected.html │ │ └── source.html │ ├── comment-inside-script-parsing │ │ ├── 
expected-metadata.json │ │ ├── expected.html │ │ └── source.html │ ├── remove-aria-hidden │ │ ├── expected.html │ │ ├── expected-metadata.json │ │ └── source.html │ ├── remove-extra-paragraphs │ │ ├── expected-metadata.json │ │ ├── expected.html │ │ └── source.html │ ├── wikipedia-4 │ │ └── expected-metadata.json │ ├── visibility-hidden │ │ ├── expected-metadata.json │ │ ├── expected.html │ │ └── source.html │ ├── youth │ │ └── expected-metadata.json │ ├── pixnet │ │ └── expected-metadata.json │ ├── mozilla-2 │ │ └── expected-metadata.json │ ├── yahoo-4 │ │ ├── expected-metadata.json │ │ └── expected.html │ ├── keep-images │ │ └── expected-metadata.json │ ├── medium-2 │ │ └── expected-metadata.json │ ├── salon-1 │ │ └── expected-metadata.json │ ├── ebb-org │ │ └── expected-metadata.json │ ├── nytimes-1 │ │ └── expected-metadata.json │ ├── yahoo-2 │ │ └── expected-metadata.json │ ├── medium-1 │ │ └── expected-metadata.json │ ├── liberation-1 │ │ └── expected-metadata.json │ ├── quanta-1 │ │ └── expected-metadata.json │ ├── msn │ │ └── expected-metadata.json │ ├── folha │ │ └── expected-metadata.json │ ├── lazy-image-1 │ │ └── expected-metadata.json │ ├── bbc-1 │ │ └── expected-metadata.json │ ├── wapo-2 │ │ └── expected-metadata.json │ ├── heise │ │ ├── expected-metadata.json │ │ └── expected.html │ ├── nytimes-2 │ │ └── expected-metadata.json │ ├── webmd-1 │ │ └── expected-metadata.json │ ├── royal-road │ │ └── expected-metadata.json │ ├── nytimes-4 │ │ └── expected-metadata.json │ ├── blogger │ │ └── expected-metadata.json │ ├── cnet-svg-classes │ │ └── expected-metadata.json │ ├── cnet │ │ └── expected-metadata.json │ ├── tumblr │ │ └── expected-metadata.json │ ├── links-in-tables │ │ └── expected-metadata.json │ ├── buzzfeed-1 │ │ └── expected-metadata.json │ ├── google-sre-book-1 │ │ └── expected-metadata.json │ ├── nytimes-3 │ │ └── expected-metadata.json │ ├── wapo-1 │ │ └── expected-metadata.json │ ├── firefox-nightly-blog │ │ └── expected-metadata.json │ 
├── spiceworks │ │ └── expected-metadata.json │ ├── lemonde-1 │ │ └── expected-metadata.json │ ├── nytimes-5 │ │ └── expected-metadata.json │ ├── mozilla-1 │ │ └── expected-metadata.json │ ├── gitlab-blog │ │ └── expected-metadata.json │ ├── topicseed-1 │ │ └── expected-metadata.json │ ├── herald-sun-1 │ │ └── expected-metadata.json │ ├── theverge │ │ └── expected-metadata.json │ ├── svg-parsing │ │ └── expected-metadata.json │ ├── videos-2 │ │ └── expected-metadata.json │ ├── schema-org-context-object │ │ └── expected-metadata.json │ ├── wordpress │ │ └── expected-metadata.json │ ├── embedded-videos │ │ ├── expected-metadata.json │ │ ├── expected.html │ │ └── source.html │ ├── style-tags-removal │ │ ├── expected-metadata.json │ │ ├── expected.html │ │ └── source.html │ ├── base-url-base-element │ │ ├── expected-metadata.json │ │ ├── expected.html │ │ └── source.html │ ├── lifehacker-working │ │ └── expected-metadata.json │ ├── title-en-dash │ │ ├── expected-metadata.json │ │ ├── expected.html │ │ └── source.html │ ├── normalize-spaces │ │ ├── expected-metadata.json │ │ ├── expected.html │ │ └── source.html │ ├── parsely-metadata │ │ ├── expected-metadata.json │ │ ├── expected.html │ │ └── source.html │ ├── lifehacker-post-comment-load │ │ └── expected-metadata.json │ ├── seattletimes-1 │ │ └── expected-metadata.json │ ├── article-author-tag │ │ └── expected-metadata.json │ ├── bug-1255978 │ │ └── expected-metadata.json │ ├── iab-1 │ │ └── expected-metadata.json │ ├── title-and-h1-discrepancy │ │ ├── expected-metadata.json │ │ ├── expected.html │ │ └── source.html │ ├── telegraph │ │ └── expected-metadata.json │ ├── guardian-1 │ │ └── expected-metadata.json │ ├── yahoo-1 │ │ └── expected-metadata.json │ ├── simplyfound-1 │ │ ├── expected-metadata.json │ │ └── expected.html │ ├── dropbox-blog │ │ └── expected-metadata.json │ ├── ehow-2 │ │ └── expected-metadata.json │ ├── reordering-paragraphs │ │ ├── expected-metadata.json │ │ ├── expected.html │ │ └── source.html 
│ ├── lazy-image-2 │ │ └── expected-metadata.json │ ├── data-url-image │ │ └── expected-metadata.json │ ├── hidden-nodes │ │ ├── expected.html │ │ └── expected-metadata.json │ ├── yahoo-3 │ │ └── expected-metadata.json │ ├── missing-paragraphs │ │ └── expected-metadata.json │ └── arxiv │ │ └── expected-metadata.json ├── aclu_ld_meta.json └── ld.json ├── .gitignore ├── .gitattributes ├── .cargo └── config.toml ├── src ├── serde_helpers.rs ├── ac_automat.rs ├── lib.rs ├── grab_flags.rs └── readable.rs ├── .github ├── dependabot.yml └── workflows │ ├── rust.yml │ ├── coverage.yml │ ├── wasm.yml │ ├── audit.yml │ ├── release.yml │ └── benchmark.yml ├── tests ├── metadata.rs ├── alt.rs ├── readability.rs ├── bad.rs └── parse_policy.rs ├── LICENSE └── Cargo.toml /crates/js/test-pages: -------------------------------------------------------------------------------- 1 | ../../test-pages -------------------------------------------------------------------------------- /crates/bench/test-pages: -------------------------------------------------------------------------------- 1 | ../../test-pages -------------------------------------------------------------------------------- /test-pages/alt/arxiv: -------------------------------------------------------------------------------- 1 | ../readability/arxiv -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /examples 3 | *.js 4 | **/draft -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | 2 | test-pages/* linguist-vendored 3 | **/*.html linguist-detectable=false -------------------------------------------------------------------------------- /.cargo/config.toml: -------------------------------------------------------------------------------- 1 | 
[target.wasm32-unknown-unknown] 2 | runner = 'wasm-bindgen-test-runner' 3 | -------------------------------------------------------------------------------- /crates/js/.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | **/*.rs.bk 3 | Cargo.lock 4 | bin/ 5 | pkg/ 6 | wasm-pack.log 7 | -------------------------------------------------------------------------------- /test-pages/ok/005-unescape-html-entities/expected.html: -------------------------------------------------------------------------------- 1 |
Test
-------------------------------------------------------------------------------- /test-pages/ok/js-link-replacement/expected.html: -------------------------------------------------------------------------------- 1 |
2 | 3 |

abc

4 |

def

ghi 5 |
6 |
-------------------------------------------------------------------------------- /test-pages/readability/invalid-attributes/expected.html: -------------------------------------------------------------------------------- 1 |
2 |
3 |

4 | Lorem ipsum dolor sit amet, consectetur adipiscing elit. 5 |

6 |
7 |
8 | -------------------------------------------------------------------------------- /test-pages/ok/ietf-1/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "remoteStorage", 3 | "byline": "Jong, Michiel de", 4 | "dir": null, 5 | "lang": "en", 6 | "siteName": null, 7 | "publishedTime": null, 8 | "readerable": true 9 | } 10 | -------------------------------------------------------------------------------- /test-pages/readability/ol/source.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |
    4 |
  1. AI hasn’t meaningfully changed anything in cybersecurity so far. Deep fake phishing is still rare, L

  2. 5 |
6 | 7 | -------------------------------------------------------------------------------- /test-pages/readability/lazy-image-3/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Lazy Load with Alt includes jpg/png/webp extensions", 3 | "byline": null, 4 | "dir": null, 5 | "siteName": null, 6 | "publishedTime": null, 7 | "readerable": false 8 | } 9 | -------------------------------------------------------------------------------- /test-pages/ok/js-link-replacement/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Replace javascript: links", 3 | "byline": null, 4 | "dir": null, 5 | "excerpt": "abc", 6 | "siteName": null, 7 | "publishedTime": null, 8 | "readerable": false 9 | } 10 | -------------------------------------------------------------------------------- /test-pages/ok/replace-brs/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Replace brs test", 3 | "byline": null, 4 | "dir": null, 5 | "excerpt": "Lorem ipsumdolor sit", 6 | "siteName": null, 7 | "publishedTime": null, 8 | "readerable": true 9 | } 10 | -------------------------------------------------------------------------------- /test-pages/readability/rtl-1/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "RTL Test", 3 | "byline": null, 4 | "dir": "rtl", 5 | "excerpt": "Lorem ipsum dolor sit amet.", 6 | "siteName": null, 7 | "publishedTime": null, 8 | "readerable": true 9 | } 10 | -------------------------------------------------------------------------------- /test-pages/readability/rtl-2/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "RTL Test", 3 | "byline": null, 4 | "dir": "rtl", 5 | "excerpt": "Lorem ipsum dolor sit amet.", 6 | 
"siteName": null, 7 | "publishedTime": null, 8 | "readerable": true 9 | } 10 | -------------------------------------------------------------------------------- /test-pages/readability/rtl-3/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "RTL Test", 3 | "byline": null, 4 | "dir": "rtl", 5 | "excerpt": "Lorem ipsum dolor sit amet.", 6 | "siteName": null, 7 | "publishedTime": null, 8 | "readerable": true 9 | } 10 | -------------------------------------------------------------------------------- /test-pages/readability/rtl-4/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "RTL Test", 3 | "byline": null, 4 | "dir": null, 5 | "excerpt": "Lorem ipsum dolor sit amet.", 6 | "siteName": null, 7 | "publishedTime": null, 8 | "readerable": true 9 | } 10 | -------------------------------------------------------------------------------- /test-pages/ok/005-unescape-html-entities/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "", 3 | "byline": null, 4 | "dir": null, 5 | "excerpt": "&#xg; 😭 😭 � \u0000", 6 | "siteName": null, 7 | "publishedTime": null, 8 | "readerable": false 9 | } 10 | -------------------------------------------------------------------------------- /test-pages/readability/ol/expected.html: -------------------------------------------------------------------------------- 1 |
2 |
    3 |
  1. 4 |

    AI hasn’t meaningfully changed anything in cybersecurity so far. Deep fake phishing is still rare, L

    5 |
  2. 6 |
7 |
-------------------------------------------------------------------------------- /test-pages/ok/qq/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "DeepMind新电脑已可利用记忆自学 人工智能迈上新台阶_科技_腾讯网", 3 | "byline": null, 4 | "dir": null, 5 | "excerpt": "DeepMind新电脑已可利用记忆自学 人工智能迈上新台阶", 6 | "siteName": null, 7 | "publishedTime": null, 8 | "readerable": true 9 | } 10 | -------------------------------------------------------------------------------- /test-pages/ok/hukumusume/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "欲張りなイヌ <福娘童話集 きょうのイソップ童話>", 3 | "byline": null, 4 | "dir": null, 5 | "excerpt": "福娘童話集 > きょうのイソップ童話 > 1月のイソップ童話 > 欲張りなイヌ", 6 | "siteName": null, 7 | "publishedTime": null, 8 | "readerable": true 9 | } 10 | -------------------------------------------------------------------------------- /test-pages/readability/metadata-content-missing/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "My title", 3 | "byline": "Creator Name", 4 | "dir": null, 5 | "excerpt": "Preferred description", 6 | "siteName": null, 7 | "publishedTime": null, 8 | "readerable": true 9 | } 10 | -------------------------------------------------------------------------------- /test-pages/ok/004-metadata-space-separated-properties/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Preferred title", 3 | "byline": "Creator Name", 4 | "dir": null, 5 | "excerpt": "Preferred description", 6 | "siteName": null, 7 | "publishedTime": null, 8 | "readerable": true 9 | } 10 | -------------------------------------------------------------------------------- /test-pages/ok/005-unescape-html-entities/source.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 
Test 8 | 9 | -------------------------------------------------------------------------------- /test-pages/ok/gmw/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "宇航员在太空中喝酒会怎么样?后果很严重 _探索者 _光明网", 3 | "byline": "肖春芳", 4 | "dir": null, 5 | "excerpt": "不幸的是,对于希望能喝上一杯的太空探险者,那些将他们送上太空的政府机构普遍禁止他们染指包括酒在内的含酒精饮料。", 6 | "siteName": null, 7 | "publishedTime": null, 8 | "readerable": true 9 | } 10 | -------------------------------------------------------------------------------- /test-pages/readability/mathjax/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "MathJax v3 with MathML input and HTML output", 3 | "byline": null, 4 | "dir": null, 5 | "lang": "en", 6 | "excerpt": "When", 7 | "siteName": null, 8 | "publishedTime": null, 9 | "readerable": false 10 | } 11 | -------------------------------------------------------------------------------- /test-pages/readability/mercurial/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Evolve: Shared Mutable History — evolve extension for Mercurial", 3 | "byline": null, 4 | "dir": null, 5 | "excerpt": "Contents", 6 | "siteName": null, 7 | "publishedTime": null, 8 | "readerable": true 9 | } 10 | -------------------------------------------------------------------------------- /test-pages/readability/invalid-attributes/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Lorem Ipsum", 3 | "byline": null, 4 | "dir": null, 5 | "excerpt": "Lorem ipsum dolor sit amet, consectetur adipiscing elit.", 6 | "siteName": null, 7 | "publishedTime": null, 8 | "readerable": false 9 | } 10 | -------------------------------------------------------------------------------- /test-pages/ok/003-metadata-preferred/expected-metadata.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "title": "Dublin Core property title", 3 | "byline": "Dublin Core property author", 4 | "dir": null, 5 | "excerpt": "Dublin Core property description", 6 | "siteName": null, 7 | "publishedTime": null, 8 | "readerable": true 9 | } 10 | -------------------------------------------------------------------------------- /test-pages/ok/js-link-replacement/source.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Replace javascript: links 6 | 7 | 8 | 9 |

abc

10 |

def

11 | ghi 12 |
13 | 14 | -------------------------------------------------------------------------------- /test-pages/ok/table-style-attributes/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "linux video", 3 | "byline": null, 4 | "dir": null, 5 | "excerpt": "linux usability\n ...or, why do I bother. © 2002, 2003\n Jamie Zawinski", 6 | "siteName": null, 7 | "publishedTime": null, 8 | "readerable": true 9 | } 10 | -------------------------------------------------------------------------------- /test-pages/readability/ol/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "", 3 | "byline": null, 4 | "dir": null, 5 | "excerpt": "AI hasn’t meaningfully changed anything in cybersecurity so far. Deep fake phishing is still rare, L", 6 | "siteName": null, 7 | "publishedTime": null, 8 | "readerable": false 9 | } 10 | -------------------------------------------------------------------------------- /test-pages/ok/001/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Get your Frontend JavaScript Code Covered | Code", 3 | "byline": "Nicolas Perriault", 4 | "dir": null, 5 | "lang": "en", 6 | "excerpt": "Nicolas Perriault's homepage.", 7 | "siteName": null, 8 | "publishedTime": null, 9 | "readerable": true 10 | } 11 | -------------------------------------------------------------------------------- /test-pages/readability/daringfireball-1/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Daring Fireball: Colophon", 3 | "byline": null, 4 | "dir": null, 5 | "lang": "en", 6 | "excerpt": "Daring Fireball is written and produced by John Gruber.", 7 | "siteName": null, 8 | "publishedTime": null, 9 | "readerable": true 10 | } 11 | 
-------------------------------------------------------------------------------- /test-pages/readability/webmd-2/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Superbugs: What They Are and How You Get Them", 3 | "byline": "By Kelli Miller WebMD Health News", 4 | "dir": null, 5 | "lang": "en", 6 | "excerpt": "Drug-resistant bacteria, dubbed", 7 | "siteName": null, 8 | "publishedTime": null, 9 | "readerable": true 10 | } 11 | -------------------------------------------------------------------------------- /test-pages/ok/v8-blog/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "standalone WebAssembly binaries using Emscripten · V8", 3 | "byline": null, 4 | "dir": null, 5 | "lang": "en", 6 | "excerpt": "Emscripten now supports standalone Wasm files, which do not need JavaScript.", 7 | "siteName": null, 8 | "publishedTime": null, 9 | "readerable": true 10 | } 11 | -------------------------------------------------------------------------------- /test-pages/readability/basic-tags-cleaning/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Basic tag cleaning test", 3 | "byline": null, 4 | "dir": null, 5 | "excerpt": "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod\n tempor incididunt ut labore et dolore magna aliqua.", 6 | "siteName": null, 7 | "publishedTime": null, 8 | "readerable": true 9 | } 10 | -------------------------------------------------------------------------------- /test-pages/readability/remove-extra-brs/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Remove trailing brs test", 3 | "byline": null, 4 | "dir": null, 5 | "excerpt": "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod\n tempor incididunt ut labore et 
dolore magna aliqua.", 6 | "siteName": null, 7 | "publishedTime": null, 8 | "readerable": true 9 | } 10 | -------------------------------------------------------------------------------- /test-pages/readability/remove-script-tags/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Remove script tags test", 3 | "byline": null, 4 | "dir": null, 5 | "excerpt": "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod\n tempor incididunt ut labore et dolore magna aliqua.", 6 | "siteName": null, 7 | "publishedTime": null, 8 | "readerable": true 9 | } 10 | -------------------------------------------------------------------------------- /test-pages/readability/comment-inside-script-parsing/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Test script parsing", 3 | "byline": null, 4 | "dir": null, 5 | "excerpt": "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod\n tempor incididunt ut labore et dolore magna aliqua.", 6 | "siteName": null, 7 | "publishedTime": null, 8 | "readerable": true 9 | } 10 | -------------------------------------------------------------------------------- /test-pages/readability/remove-aria-hidden/expected.html: -------------------------------------------------------------------------------- 1 |
2 |
3 |

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.

4 |

Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.

5 |
6 |
-------------------------------------------------------------------------------- /test-pages/readability/remove-extra-paragraphs/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Replace font tags test", 3 | "byline": null, 4 | "dir": null, 5 | "excerpt": "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod\n tempor incididunt ut labore et dolore magna aliqua.", 6 | "siteName": null, 7 | "publishedTime": null, 8 | "readerable": true 9 | } 10 | -------------------------------------------------------------------------------- /test-pages/readability/wikipedia-4/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "List of films featuring time loops", 3 | "byline": "Contributors to Wikimedia projects", 4 | "dir": "ltr", 5 | "lang": "en", 6 | "excerpt": "From Wikipedia, the free encyclopedia", 7 | "siteName": "Wikimedia Foundation, Inc.", 8 | "publishedTime": "2014-03-27T19:11:24Z", 9 | "readerable": true 10 | } 11 | -------------------------------------------------------------------------------- /test-pages/readability/remove-aria-hidden/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Remove aria-hidden elements test", 3 | "byline": null, 4 | "dir": null, 5 | "excerpt": "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod\n tempor incididunt ut labore et dolore magna aliqua.", 6 | "siteName": null, 7 | "publishedTime": null, 8 | "readerable": false 9 | } 10 | -------------------------------------------------------------------------------- /test-pages/alt/arstechnica/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Camera owner asks Canon, skies: Why is it $5/month for webcam software?", 3 | "byline": "Kevin Purdy", 4 | "dir": null, 5 
| "lang": "en-US", 6 | "excerpt": "Just because it’s a good rig doesn’t mean you can use it on Zoom.", 7 | "siteName": "Ars Technica", 8 | "publishedTime": "2025-01-17T19:36:17+00:00", 9 | "readerable": true 10 | } 11 | -------------------------------------------------------------------------------- /test-pages/ok/keep-tabular-data/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Friday Facts #282 - 0.17 in sight | Factorio", 3 | "byline": null, 4 | "dir": null, 5 | "excerpt": "Posted by kovarex, TOGos, Ernestas, Albert on 2019-02-15, all posts", 6 | "siteName": "Factorio.com", 7 | "publishedTime": null, 8 | "readerable": true, 9 | "image": "http://www.factorio.com/static/img/factorio-wheel.png" 10 | } 11 | -------------------------------------------------------------------------------- /test-pages/readability/visibility-hidden/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Visibility hidden test", 3 | "byline": null, 4 | "dir": null, 5 | "excerpt": "Tempor incididunt ut labore et dolore magna aliqua. 
Ut enim ad minim veniam,\n quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo\n consequat.", 6 | "siteName": null, 7 | "publishedTime": null, 8 | "readerable": true 9 | } 10 | -------------------------------------------------------------------------------- /test-pages/readability/youth/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "海外留学生看两会:出国前后关注点大不同_教育频道_中国青年网", 3 | "byline": "青网校园崔宁宁", 4 | "dir": null, 5 | "excerpt": "图为马素湘在澳大利亚悉尼游玩时的近影。出国前后关注点大不同出国前:政治科目会出啥考题?出国后:国家未来将如何发展?在采访中,我们了解到不少学子在出国前就每年守在电脑前观看两会直播。但是,随着年龄和阅历的增长,学子对两会的关注点在出国前后发生了很大的变化。在法国里昂国立应用科学院留学的卢宇表示,他还是个中学生时,就开始关注两会了。“我高中毕业后就出国留学了。", 6 | "siteName": null, 7 | "publishedTime": null, 8 | "readerable": true 9 | } 10 | -------------------------------------------------------------------------------- /test-pages/ok/002/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "This API is so Fetching!", 3 | "byline": "Nikhil Marathe", 4 | "dir": null, 5 | "lang": "en-US", 6 | "excerpt": "For more than a decade the Web has used XMLHttpRequest (XHR) to achieve asynchronous requests in JavaScript. 
While very useful, XHR is not a very ...", 7 | "siteName": "Mozilla Hacks – the Web developer blog", 8 | "publishedTime": null, 9 | "readerable": true 10 | } 11 | -------------------------------------------------------------------------------- /test-pages/ok/archive-of-our-own/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Conversations with a Cryptid - Chapter 1 - AMournfulHowlInTheNight - 僕のヒーローアカデミア | Boku no Hero Academia", 3 | "byline": "Organization for Transformative Works", 4 | "dir": null, 5 | "lang": "en", 6 | "excerpt": "An Archive of Our Own, a project of the Organization for Transformative Works", 7 | "siteName": null, 8 | "publishedTime": null, 9 | "readerable": true 10 | } 11 | -------------------------------------------------------------------------------- /test-pages/ok/dev418/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Readability Test", 3 | "byline": null, 4 | "dir": null, 5 | "excerpt": "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. 
Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.", 6 | "siteName": null, 7 | "publishedTime": null, 8 | "readerable": true 9 | } 10 | -------------------------------------------------------------------------------- /test-pages/ok/wikipedia-2/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "New Zealand", 3 | "byline": "Contributors to Wikimedia projects", 4 | "dir": "ltr", 5 | "lang": "en", 6 | "excerpt": "Coordinates: 42°S 174°E / 42°S 174°E", 7 | "siteName": "Wikimedia Foundation, Inc.", 8 | "publishedTime": "2001-10-29T01:59:14Z", 9 | "readerable": true, 10 | "image": "https://upload.wikimedia.org/wikipedia/commons/3/3e/Flag_of_New_Zealand.svg" 11 | } 12 | -------------------------------------------------------------------------------- /crates/bench/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "dom-smoothie-bench" 3 | version.workspace = true 4 | edition.workspace = true 5 | license.workspace = true 6 | rust-version.workspace = true 7 | repository.workspace = true 8 | authors.workspace = true 9 | publish = false 10 | 11 | [dependencies] 12 | dom_smoothie = { path = "../.." 
} 13 | 14 | 15 | [dev-dependencies] 16 | criterion = { version = "0.7.0" } 17 | 18 | [[bench]] 19 | name = "parse" 20 | harness = false -------------------------------------------------------------------------------- /crates/lua/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "dom-smoothie-lua" 3 | version.workspace = true 4 | edition.workspace = true 5 | license.workspace = true 6 | rust-version.workspace = true 7 | repository.workspace = true 8 | authors.workspace = true 9 | publish = false 10 | 11 | [lib] 12 | crate-type = ["cdylib"] 13 | 14 | [dependencies] 15 | dom_smoothie = { path = "../..", features = ["serde"] } 16 | mlua = { version = "0.11.5", features = ["lua54", "module", "serde"] } -------------------------------------------------------------------------------- /test-pages/readability/pixnet/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "新竹尖石_美樹營地賞楓 (2) @ 史蒂文的家_藍天 :: 痞客邦 PIXNET ::", 3 | "byline": "史蒂文的家_藍天 (stevenhgm)", 4 | "dir": null, 5 | "lang": "zh-TW", 6 | "excerpt": "一波波接續性低溫寒流報到 已將新竹尖石鄉後山一帶層層山巒披上嫣紅的彩衣 玉峰道路一路上雲氣山嵐滯留山頭 順路下切蜿蜒道路後不久即抵達來到\"玉峰國小\" \"美樹\"美", 7 | "siteName": "史蒂文的家_藍天", 8 | "publishedTime": null, 9 | "readerable": true, 10 | "image": "http://pic.pimg.tw/stevenhgm/1387895093-631461272.jpg" 11 | } 12 | -------------------------------------------------------------------------------- /test-pages/ok/la-nacion/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Una solución no violenta para la cuestión mapuche", 3 | "byline": null, 4 | "dir": null, 5 | "excerpt": "Los pueblos indígenas reclaman por derechos que permanecen incumplidos, por eso es más eficiente canalizar la protesta que reprimirla", 6 | "siteName": null, 7 | "publishedTime": null, 8 | "readerable": true, 9 | "image": 
"http://bucket.glanacion.com/anexos/fotos/77/2585177.jpg" 10 | } 11 | -------------------------------------------------------------------------------- /test-pages/readability/mozilla-2/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Welcome to Firefox Developer Edition", 3 | "byline": null, 4 | "dir": "ltr", 5 | "lang": "en", 6 | "excerpt": "Built for those who build the Web. Introducing the only browser made for developers.", 7 | "siteName": "Mozilla", 8 | "publishedTime": null, 9 | "readerable": false, 10 | "image": "https://mozorg.cdn.mozilla.net/media/img/firefox/developer/page-image.03bbe7da3199.png" 11 | } 12 | -------------------------------------------------------------------------------- /test-pages/ok/ars-1/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Just-released Minecraft exploit makes it easy to crash game servers", 3 | "byline": "Dan Goodin", 4 | "dir": null, 5 | "lang": "en-us", 6 | "excerpt": "Two-year-old bug exposes thousands of servers to crippling attack.", 7 | "siteName": "Ars Technica", 8 | "publishedTime": null, 9 | "readerable": true, 10 | "image": "https://cdn.arstechnica.net/wp-content/uploads/2015/04/server-crash-640x215.jpg" 11 | } 12 | -------------------------------------------------------------------------------- /test-pages/readability/yahoo-4/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "トレンドマイクロ、公衆無線LANを安全に使うためのアプリ「フリーWi-Fiプロテクション」(CNET Japan) - Yahoo!ニュース", 3 | "byline": "個人", 4 | "dir": null, 5 | "lang": "ja", 6 | "excerpt": "トレンドマイクロは3月9日、Wi-Fi利用時の通信を暗号化し保護するスマホ・タブレット - Yahoo!ニュース(CNET Japan)", 7 | "siteName": "Yahoo!ニュース", 8 | "publishedTime": null, 9 | "readerable": true, 10 | "image": "http://i.yimg.jp/images/jpnews/cre/common/all/images/fbico_ogp_600x600.png" 11 | } 12 | 
-------------------------------------------------------------------------------- /test-pages/ok/cnn/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "The 'birth lottery' and economic mobility", 3 | "byline": "Ahiza Garcia", 4 | "dir": null, 5 | "excerpt": "A recently-released report on poverty and inequality found that the U.S. ranks the lowest among countries with welfare states.", 6 | "siteName": "CNNMoney", 7 | "publishedTime": null, 8 | "readerable": true, 9 | "image": "http://i2.cdn.turner.com/money/dam/assets/141103182938-income-inequality-780x439.png" 10 | } 11 | -------------------------------------------------------------------------------- /test-pages/readability/keep-images/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Inside the Deep Web Drug Lab", 3 | "byline": "Joseph Cox", 4 | "dir": null, 5 | "excerpt": "Welcome to DoctorX’s Barcelona lab, where the drugs you bought online are tested for safety and purity. 
No questions ask…", 6 | "siteName": "Medium", 7 | "publishedTime": "2015-03-27T13:07:55.096Z", 8 | "readerable": true, 9 | "image": "https://d262ilb51hltx0.cloudfront.net/max/800/1*sLDnS1UWEFIS33uLMxq3cw.jpeg" 10 | } 11 | -------------------------------------------------------------------------------- /test-pages/readability/medium-2/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "On Behalf of “Literally”", 3 | "byline": "Courtney Kirchoff", 4 | "dir": null, 5 | "excerpt": "In defense of the word “literally” and why you or someone you know should stop misusing the word, lest they drive us fig…", 6 | "siteName": "Medium", 7 | "publishedTime": "2015-02-24T19:56:33.374Z", 8 | "readerable": true, 9 | "image": "https://d262ilb51hltx0.cloudfront.net/max/1600/1*eR_J8DurqygbhrwDg-WPnQ.png" 10 | } 11 | -------------------------------------------------------------------------------- /test-pages/readability/salon-1/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "The sharing economy is a lie: Uber, Ayn Rand and the truth about tech and libertarians", 3 | "byline": "Joanna Rothkopf", 4 | "dir": null, 5 | "excerpt": "Disruptive companies talk a good game about sharing. Uber's really just an under-regulated company making riches", 6 | "siteName": null, 7 | "publishedTime": null, 8 | "readerable": true, 9 | "image": "http://media.salon.com/2014/12/uber_rand_paul.jpg" 10 | } 11 | -------------------------------------------------------------------------------- /test-pages/ok/aktualne/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "West Ham hrozí gigantům, okouzlil i Linekera. Součka je snadné přehlédnout", 3 | "byline": "Aleš Vávra", 4 | "dir": null, 5 | "lang": "cs", 6 | "excerpt": "Zázrak jedné sezony? 
West Ham United dává pochybovačům stále pádnější odpovědi.", 7 | "siteName": "Aktuálně.cz", 8 | "publishedTime": "2021-11-01T10:52:50+0100", 9 | "readerable": true, 10 | "image": "https://cdn.xsd.cz/original/9b6b601b440031309c748f864087a138.jpg" 11 | } 12 | -------------------------------------------------------------------------------- /test-pages/readability/ebb-org/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "On Recent Controversial Events - Bradley M. Kuhn ( Brad ) ( bkuhn )", 3 | "byline": "Bradley M. Kuhn (http://ebb.org/bkuhn/)", 4 | "dir": null, 5 | "lang": "en-US", 6 | "excerpt": "The website of Bradley M. Kuhn, aka Brad, aka bkuhn. This site includes his GPG keys, resume, blog, projects list, software, interviews, speeches and writing.", 7 | "siteName": null, 8 | "publishedTime": null, 9 | "readerable": true 10 | } 11 | -------------------------------------------------------------------------------- /test-pages/readability/nytimes-1/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "United States to Lift Sudan Sanctions", 3 | "byline": "Jeffrey Gettleman", 4 | "dir": null, 5 | "lang": "en", 6 | "excerpt": "For the first time since the 1990s, the country will be able to trade extensively with the United States.", 7 | "siteName": null, 8 | "publishedTime": null, 9 | "readerable": true, 10 | "image": "https://static01.nyt.com/images/2017/01/14/world/13SUDAN-1/13SUDAN-1-facebookJumbo.jpg" 11 | } 12 | -------------------------------------------------------------------------------- /test-pages/ok/citylab-1/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "The Modern Ambitions Behind Neon", 3 | "byline": "Sarah Archer", 4 | "dir": null, 5 | "lang": "en", 6 | "excerpt": "The once-ubiquitous form of lighting was novel when it first emerged 
in the early 1900s, though it has since come to represent decline.", 7 | "siteName": "CityLab", 8 | "publishedTime": null, 9 | "readerable": true, 10 | "image": "https://cdn.citylab.com/media/img/citylab/2019/04/mr1/facebook.jpg?1556645448" 11 | } 12 | -------------------------------------------------------------------------------- /test-pages/readability/yahoo-2/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Yahoo News - Latest News & Headlines", 3 | "byline": "NATALIYA VASILYEVA", 4 | "dir": null, 5 | "lang": "en-US", 6 | "excerpt": "The latest news and headlines from Yahoo! News. Get breaking news stories and in-depth coverage with videos and photos.", 7 | "siteName": null, 8 | "publishedTime": null, 9 | "readerable": true, 10 | "image": "https://s.yimg.com/os/mit/media/m/social/images/social_default_logo-1481777.png" 11 | } 12 | -------------------------------------------------------------------------------- /test-pages/ok/videos-1/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "How to watch the 21 best films of 2017", 3 | "byline": "Alissa Wilkinson", 4 | "dir": null, 5 | "excerpt": "It was an extraordinary year for movies.", 6 | "siteName": "Vox", 7 | "publishedTime": "2017-12-15T08:50:02-05:00", 8 | "readerable": true, 9 | "image": "https://cdn.vox-cdn.com/thumbor/7WyjCLC7n6i1IG2eJB06qi1o7kQ=/0x148:2300x1352/fit-in/1200x630/cdn.vox-cdn.com/uploads/chorus_asset/file/9871033/Movies_end_of_year_2017.jpg" 10 | } 11 | -------------------------------------------------------------------------------- /test-pages/readability/medium-1/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "The Open Journalism Project: Better Student Journalism", 3 | "byline": "Pippin Lee", 4 | "dir": null, 5 | "excerpt": "We pushed out the first version of the Open 
Journalism site in January. Here’s what we’ve learned about student journali…", 6 | "siteName": "Medium", 7 | "publishedTime": "2015-03-17T16:27:40.294Z", 8 | "readerable": true, 9 | "image": "https://d262ilb51hltx0.cloudfront.net/max/800/1*oBWUXtszDsiv_-Qq2bFLTQ.png" 10 | } 11 | -------------------------------------------------------------------------------- /test-pages/ok/tmz-1/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Lupita Nyong'o's $150K Pearl Oscar Dress -- STOLEN!!!", 3 | "byline": null, 4 | "dir": null, 5 | "excerpt": "Lupita Nyong'o's now-famous Oscar dress -- adorned in pearls -- was stolen right out of her hotel room ... TMZ has learned. Law enforcement sources tell…", 6 | "siteName": "http://www.tmz.com", 7 | "publishedTime": null, 8 | "readerable": true, 9 | "image": "http://ll-media.tmz.com/2015/02/26/0225-lupita-nyongo-getty-01-1200x630.jpg" 10 | } 11 | -------------------------------------------------------------------------------- /test-pages/readability/liberation-1/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Un troisième Français mort dans le séisme au Népal", 3 | "byline": "AFP", 4 | "dir": null, 5 | "lang": "fr", 6 | "excerpt": "Laurent Fabius a accueilli jeudi matin à Roissy un premier avion spécial ramenant des rescapés.", 7 | "siteName": "Libération.fr", 8 | "publishedTime": "2015-04-30T07:19:58", 9 | "readerable": true, 10 | "image": "http://md1.libe.com/photo/755923-000_hkg10175905.jpg?modified_at=1430371146&width=750" 11 | } 12 | -------------------------------------------------------------------------------- /crates/cli/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "dom_smoothie_cli" 3 | version.workspace = true 4 | edition.workspace = true 5 | license.workspace = true 6 | rust-version.workspace = true 7 
| authors.workspace = true 8 | description = "A reference implementation of a CLI tool for the `dom_smoothie`" 9 | publish = false 10 | 11 | 12 | [dependencies] 13 | dom_smoothie = { path = "../.." } 14 | clap = {version = "4.5.53", features = ["derive"]} 15 | serde = {version = "1.0", features = ["derive"]} 16 | serde_json = {version = "1.0"} 17 | -------------------------------------------------------------------------------- /crates/js/src/utils.rs: -------------------------------------------------------------------------------- 1 | #![allow(dead_code)] 2 | pub fn set_panic_hook() { 3 | // When the `console_error_panic_hook` feature is enabled, we can call the 4 | // `set_panic_hook` function at least once during initialization, and then 5 | // we will get better error messages if our code ever panics. 6 | // 7 | // For more details see 8 | // https://github.com/rustwasm/console_error_panic_hook#readme 9 | #[cfg(feature = "console_error_panic_hook")] 10 | console_error_panic_hook::set_once(); 11 | } 12 | -------------------------------------------------------------------------------- /test-pages/readability/visibility-hidden/expected.html: -------------------------------------------------------------------------------- 1 |
2 |
3 |

Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.

4 |

Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

5 |
6 |
-------------------------------------------------------------------------------- /test-pages/readability/quanta-1/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "The Hidden Heroines of Chaos", 3 | "byline": "By Joshua Sokol", 4 | "dir": null, 5 | "lang": "en", 6 | "excerpt": "Two women programmers played a pivotal role in the birth of chaos theory. Their previously untold story illustrates the changing status of computation in", 7 | "siteName": "Quanta Magazine", 8 | "publishedTime": null, 9 | "readerable": true, 10 | "image": "https://d2r55xnwy6nx47.cloudfront.net/uploads/2019/05/LHF_1200_O_Social.jpg" 11 | } 12 | -------------------------------------------------------------------------------- /src/serde_helpers.rs: -------------------------------------------------------------------------------- 1 | use serde::{Deserializer, Serializer}; 2 | use tendril::StrTendril; 3 | 4 | pub fn serialize_str_tendril<S>(value: &StrTendril, serializer: S) -> Result<S::Ok, S::Error> 5 | where 6 | S: Serializer, 7 | { 8 | serializer.serialize_str(value.as_ref()) 9 | } 10 | 11 | pub fn deserialize_str_tendril<'de, D>(deserializer: D) -> Result<StrTendril, D::Error> 12 | where 13 | D: Deserializer<'de>, 14 | { 15 | let s: String = serde::Deserialize::deserialize(deserializer)?; 16 | Ok(StrTendril::from(s)) 17 | } 18 | -------------------------------------------------------------------------------- /test-pages/aclu_ld_meta.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Facebook Is Tracking Me Even Though I’m Not on Facebook", 3 | "byline": "Daniel Kahn Gillmor", 4 | "excerpt": "Facebook collects data about people who have never even opted in.
But there are ways these non-users can protect themselves.", 5 | "siteName": "American Civil Liberties Union", 6 | "publishedTime": "2018-04-05T06:00", 7 | "modifiedTime": "2018-04-11", 8 | "url": "https://www.aclu.org/blog/privacy-technology/internet-privacy/facebook-tracking-me-even-though-im-not-facebook" 9 | } 10 | -------------------------------------------------------------------------------- /test-pages/readability/msn/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Nintendo's first iPhone game will launch in December for $10", 3 | "byline": "Alex Perry", 4 | "dir": "ltr", 5 | "lang": "en-US", 6 | "excerpt": "Nintendo and Apple shocked the world earlier this year by announcing \"Super Mario Run,\" the legendary gaming company's first foray into mobile gaming.\u00A0", 7 | "siteName": "MSN", 8 | "publishedTime": null, 9 | "readerable": true, 10 | "image": "http://img-s-msn-com.akamaized.net/tenant/amp/entityid/AAkk5fh.img" 11 | } 12 | -------------------------------------------------------------------------------- /test-pages/readability/folha/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Tite diz que errou ao levar taça da Libertadores a Lula em 2012", 3 | "byline": "Bruno (Henrique Zecchin) Rodrigues", 4 | "dir": null, 5 | "lang": "pt-BR", 6 | "excerpt": "Na ocasião, técnico do Corinthians entregou réplica do troféu ao ex-presidente", 7 | "siteName": "Folha de S.Paulo", 8 | "publishedTime": "2018-12-21T12:55:00Z", 9 | "readerable": true, 10 | "image": "https://f.i.uol.com.br/fotografia/2018/12/21/15454034955c1cfc67131dc_1545403495_3x2_rt.jpg" 11 | } 12 | -------------------------------------------------------------------------------- /test-pages/readability/lazy-image-1/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Node.js and 
CPU profiling on production (in real-time without downtime)", 3 | "byline": "Vincent Vallet", 4 | "dir": null, 5 | "lang": "en", 6 | "excerpt": "How to run a CPU profiling with Node.js on your production in real-time and without interruption of service.", 7 | "siteName": "Voodoo Engineering", 8 | "publishedTime": "2019-10-18T17:23:34.816Z", 9 | "readerable": true, 10 | "image": "https://miro.medium.com/max/1200/1*EO-pr4RolgcAOj_Uk1rpDA.png" 11 | } 12 | -------------------------------------------------------------------------------- /test-pages/readability/bbc-1/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Obama admits US gun laws are his 'biggest frustration'", 3 | "byline": "BBC News", 4 | "dir": null, 5 | "lang": "en", 6 | "excerpt": "President Barack Obama tells the BBC his failure to pass \"common sense gun safety laws\" is the greatest frustration of his presidency.", 7 | "siteName": "BBC News", 8 | "publishedTime": "2015-07-24T05:36:09+01:00", 9 | "readerable": true, 10 | "image": "http://ichef.bbci.co.uk/news/1024/cpsprodpb/3D8B/production/_84455751_84455749.jpg" 11 | } 12 | -------------------------------------------------------------------------------- /test-pages/readability/invalid-attributes/source.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Lorem Ipsum 5 | 6 | 7 |
8 |
9 |
10 |
11 |
12 | Lorem ipsum dolor sit amet, consectetur adipiscing elit. 13 |
14 |
15 |
16 |
17 |
18 | 19 | 20 | -------------------------------------------------------------------------------- /test-pages/readability/wapo-2/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Where do strained U.S.-Israeli relations go after Netanyahu’s victory?", 3 | "byline": "By Steven Mufson", 4 | "dir": null, 5 | "excerpt": "Few foreign leaders have so brazenly stood up to President Obama and the relationship could face its next test this month.", 6 | "siteName": "Washington Post", 7 | "publishedTime": null, 8 | "readerable": true, 9 | "image": "http://img.washingtonpost.com/rw/2010-2019/WashingtonPost/2015/03/18/National-Economy/Images/Nic6429750-1138.jpg" 10 | } 11 | -------------------------------------------------------------------------------- /test-pages/ok/clean-links/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Bartleby the Scrivener Web Study Text", 3 | "byline": null, 4 | "dir": null, 5 | "excerpt": "Ere introducing the scrivener, as he first appeared to me, it is fit \n I make some mention of myself, my employees, my business, my chambers, \n and general surroundings; because some such description is indispensable \n to an adequate understanding of the chief character about to be presented.", 6 | "siteName": null, 7 | "publishedTime": null, 8 | "readerable": true 9 | } 10 | -------------------------------------------------------------------------------- /test-pages/readability/heise/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "1Password für Mac generiert Einmal-Passwörter", 3 | "byline": "Mac & i", 4 | "dir": null, 5 | "lang": "de", 6 | "excerpt": "Das in der iOS-Version bereits enthaltene TOTP-Feature ist nun auch für OS X 10.10 verfügbar. 
Zudem gibt es neue Zusatzfelder in der Datenbank und weitere Verbesserungen.", 7 | "siteName": "Mac & i", 8 | "publishedTime": null, 9 | "readerable": true, 10 | "image": "http://www.heise.de/imgs/18/1/4/6/2/3/5/1/Barcode-Scanner-With-Border-f0c62350bd8d9d96.jpeg" 11 | } 12 | -------------------------------------------------------------------------------- /test-pages/readability/nytimes-2/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Yahoo’s Sale to Verizon Leaves Shareholders With Little Say", 3 | "byline": "Steven Davidoff Solomon", 4 | "dir": null, 5 | "lang": "en", 6 | "excerpt": "The internet giant’s decision to sell its business is plagued with challenges that reveal how unusual deal structures can affect shareholders.", 7 | "siteName": null, 8 | "publishedTime": null, 9 | "readerable": true, 10 | "image": "https://static01.nyt.com/images/2016/07/30/business/db-dealprof/db-dealprof-facebookJumbo.jpg" 11 | } 12 | -------------------------------------------------------------------------------- /test-pages/readability/webmd-1/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Babies Who Eat Peanuts Early May Avoid Allergy", 3 | "byline": "By Brenda Goodman, MA WebMD Health News", 4 | "dir": null, 5 | "lang": "en", 6 | "excerpt": "Life-threatening peanut allergies have mysteriously been on the rise in the past decade, with little hope for a cure. 
But a groundbreaking new study may offer a way to stem that rise, while another may offer some hope for those who are already allergic.", 7 | "siteName": null, 8 | "publishedTime": null, 9 | "readerable": true 10 | } 11 | -------------------------------------------------------------------------------- /test-pages/readability/royal-road/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "ONE HUNDRED TWO: What kind of wordchain? - Super Supportive", 3 | "byline": "Follow Author", 4 | "dir": null, 5 | "lang": "en", 6 | "excerpt": "102 “Were you expecting the competition for the showers to be the highest drama part of gym class?” Alden asked Haoyu as the two of them headed (...)", 7 | "siteName": "Royal Road", 8 | "publishedTime": null, 9 | "readerable": true, 10 | "image": "https://www.royalroadcdn.com/public/covers-large/63759-super-supportive.jpg?time=1691780497" 11 | } 12 | -------------------------------------------------------------------------------- /test-pages/readability/nytimes-4/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "As Debt Rises, the Government Will Soon Spend More on Interest Than on the Military", 3 | "byline": "Nelson D. 
Schwartz", 4 | "dir": null, 5 | "lang": "en", 6 | "excerpt": "Tax cuts, spending increases and higher interest rates could make it harder to respond to future recessions and deal with other needs.", 7 | "siteName": null, 8 | "publishedTime": null, 9 | "readerable": true, 10 | "image": "https://static01.nyt.com/images/2018/09/15/business/15DEBTS01/15DEBTS01-facebookJumbo.jpg" 11 | } 12 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "cargo" # See documentation for possible values 9 | directory: "/" # Location of package manifests 10 | schedule: 11 | interval: "weekly" 12 | -------------------------------------------------------------------------------- /test-pages/ok/breitbart/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "'Neutral' Snopes Fact-Checker David Emery: 'Are There Any Un-Angry Trump Supporters?' 
- Breitbart", 3 | "byline": "by Lucas Nolan22 Dec 2016651", 4 | "dir": "ltr", 5 | "lang": "en", 6 | "excerpt": "Snopes fact checker and staff writer David Emery posted to Twitter asking if there were “any un-angry Trump supporters?”", 7 | "siteName": "Breitbart", 8 | "publishedTime": "2016-12-22T10:43:37-08:00", 9 | "readerable": true, 10 | "image": "http://media.breitbart.com/media/2016/11/GettyImages-621866810.jpg" 11 | } 12 | -------------------------------------------------------------------------------- /test-pages/readability/blogger/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Open Verilog flow for Silego GreenPak4 programmable logic devices", 3 | "byline": "Andrew Zonenberg", 4 | "dir": "ltr", 5 | "excerpt": "I've written a couple of posts in the past few months but they were all for the blog at work so I figured I'm long overdue for one on Silic...", 6 | "siteName": null, 7 | "publishedTime": null, 8 | "readerable": true, 9 | "image": "https://1.bp.blogspot.com/-YIPC5jkXkDE/Vy7YPSqFKWI/AAAAAAAAAxI/a7D6Ji2GxoUvcrwUkI4RLZcr2LFQEJCTACLcB/w1200-h630-p-nu/block-diagram.png" 10 | } 11 | -------------------------------------------------------------------------------- /test-pages/readability/cnet-svg-classes/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Twitter Lite se estrena en México, Venezuela y otros nueve países", 3 | "byline": "César Salza", 4 | "dir": null, 5 | "lang": "es", 6 | "excerpt": "Twitter Lite llega a 11 países de América Latina, para ayudar a los usuarios con mala señal de sus redes móviles.", 7 | "siteName": "CNET en Español", 8 | "publishedTime": null, 9 | "readerable": true, 10 | "image": "https://cdn1.cnet.com/img/JumVcu1ZSLtPP8ui0UWaSlgi5RU=/670x503/2017/12/01/b36ce794-e0b8-495c-a198-184923a8f4e9/twitter-lite.jpg" 11 | } 12 | 
-------------------------------------------------------------------------------- /test-pages/readability/cnet/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Zuckerberg offers peek at Facebook's acquisition strategies", 3 | "byline": "Steven Musil", 4 | "dir": null, 5 | "lang": "en", 6 | "excerpt": "Facebook CEO says be a friend and have a shared vision, but scare them when you have to and move fast.", 7 | "siteName": "CNET", 8 | "publishedTime": null, 9 | "readerable": true, 10 | "image": "https://cnet3.cbsistatic.com/img/1JaRRjqhoGxDVkFxTRRWkZgyK2Q=/670x503/2014/03/21/863df5d9-e8b8-4b38-851b-5e3f77f2cf0e/mark-zuckerberg-facebook-home-10671610x407.jpg" 11 | } 12 | -------------------------------------------------------------------------------- /test-pages/readability/tumblr/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Minecraft 1.8 - The Bountiful Update", 3 | "byline": null, 4 | "dir": null, 5 | "lang": "en", 6 | "excerpt": "+ Added Granite, Andesite, and Diorite stone blocks, with smooth versions\n+ Added Slime Block\n+ Added Iron Trapdoor\n+ Added Prismarine and Sea Lantern blocks\n+ Added the Ocean Monument\n+ Added Red...", 7 | "siteName": "Minecraft Update News", 8 | "publishedTime": "2014-09-02T08:35:27-04:00", 9 | "readerable": true, 10 | "image": "http://assets.tumblr.com/images/og/fb_landscape_share.png" 11 | } 12 | -------------------------------------------------------------------------------- /test-pages/readability/links-in-tables/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Saving Data: Reducing the size of App Updates by 65%", 3 | "byline": "Posted by Android Developers", 4 | "dir": "ltr", 5 | "excerpt": "Posted by Andrew Hayden, Software Engineer on Google Play Android users are downloading tens of billions of apps 
and games on Google Pla...", 6 | "siteName": null, 7 | "publishedTime": null, 8 | "readerable": true, 9 | "image": "https://2.bp.blogspot.com/-chCZZinlUTg/WEcxvJo9gdI/AAAAAAAADnk/3ND_BspqN6Y2j5xxkLFW3RyS2Ig0NHZpQCLcB/w1200-h630-p-k-nu/ipsum-opsum.gif" 10 | } 11 | -------------------------------------------------------------------------------- /test-pages/ok/medium-3/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Samantha and The Great Big Lie - John C. Welch - Medium", 3 | "byline": "John C. Welch", 4 | "dir": null, 5 | "lang": "en", 6 | "excerpt": "(EDIT: removed the link to Samantha’s post, because the arments and the grubers and the rest of The Deck Clique got what they wanted: a non-proper person driven off the internet lightly capped with a…", 7 | "siteName": "Medium", 8 | "publishedTime": "2015-10-15T02:19:15.607Z", 9 | "readerable": true, 10 | "image": "https://miro.medium.com/max/398/1*kbPh7V97eyRodSOw2-ALDw.png" 11 | } 12 | -------------------------------------------------------------------------------- /test-pages/readability/buzzfeed-1/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Student Dies After Diet Pills She Bought Online \"Burned Her Up From Within\"", 3 | "byline": "Mark Di Stefano BuzzFeed News Reporter", 4 | "dir": null, 5 | "lang": "en", 6 | "excerpt": "An inquest into Eloise Parry's death has been adjourned until July.", 7 | "siteName": "BuzzFeed", 8 | "publishedTime": null, 9 | "readerable": true, 10 | "image": "http://s3-static-ak.buzzfed.com/static/2015-04/22/5/campaign_images/webdr03/student-dies-after-diet-pills-she-bought-online-b-2-28712-1429696299-24_dblbig.jpg" 11 | } 12 | -------------------------------------------------------------------------------- /test-pages/ok/medicalnewstoday/expected-metadata.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "title": "How does the brain turn unconscious information into conscious thought?", 3 | "byline": "By Ana Sandoiu", 4 | "dir": null, 5 | "lang": "en", 6 | "excerpt": "New research investigates the neurobiological timing of the so-called a-ha! moment that occurs we have come up with the solution to a complex problem.", 7 | "siteName": "Medical News Today", 8 | "publishedTime": null, 9 | "readerable": true, 10 | "image": "https://cdn1.medicalnewstoday.com/content/images/headlines/318/318674/hand-holding-brain-lightbulb.jpg" 11 | } 12 | -------------------------------------------------------------------------------- /test-pages/readability/google-sre-book-1/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Google - Site Reliability Engineering", 3 | "byline": "Written by Rob Ewaschuk Edited by Betsy Beyer", 4 | "dir": null, 5 | "lang": "en", 6 | "excerpt": "Google’s SRE teams have some basic principles and best practices for building successful monitoring and alerting systems. 
This chapter offers guidelines for what issues should interrupt a human via a page, and how to deal with issues that aren’t serious enough to trigger a page.", 7 | "siteName": null, 8 | "publishedTime": null, 9 | "readerable": true 10 | } 11 | -------------------------------------------------------------------------------- /test-pages/readability/nytimes-3/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Manhole Fires and Burst Pipes: How Winter Wreaks Havoc on What’s Underneath N.Y.C.", 3 | "byline": "Corey Kilgannon", 4 | "dir": null, 5 | "lang": "en", 6 | "excerpt": "New York’s aging below-street infrastructure is tough to maintain, and the corrosive rock salt and “freeze-thaw” cycles of winter make it even worse.", 7 | "siteName": null, 8 | "publishedTime": null, 9 | "readerable": true, 10 | "image": "https://static01.nyt.com/images/2019/02/21/nyregion/21winterutilities1/00winterutilities1-facebookJumbo.jpg" 11 | } 12 | -------------------------------------------------------------------------------- /test-pages/readability/wapo-1/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Attack stokes instability fears in North Africa", 3 | "byline": "By Erin Cunningham", 4 | "dir": null, 5 | "excerpt": "The assault on Tunisia’s most renowned museum, in which gunmen killed at least 19 people, could heighten tensions in a nation that has become deeply divided between pro- and anti-Islamist factions.", 6 | "siteName": "Washington Post", 7 | "publishedTime": null, 8 | "readerable": true, 9 | "image": "http://img.washingtonpost.com/rw/2010-2019/WashingtonPost/2015/03/18/Foreign/Images/Nic6429927.jpg" 10 | } 11 | -------------------------------------------------------------------------------- /test-pages/ok/wikipedia-3/expected-metadata.json: -------------------------------------------------------------------------------- 1 | 
{ 2 | "title": "Hermitian matrix", 3 | "byline": "Contributors to Wikimedia projects", 4 | "dir": "ltr", 5 | "lang": "en", 6 | "excerpt": "In mathematics, a Hermitian matrix (or self-adjoint matrix) is a complex square matrix that is equal to its own conjugate transpose—that is, the element in the i-th row and j-th column is equal to the complex conjugate of the element in the j-th row and i-th column, for all indices i and j:", 7 | "siteName": "Wikimedia Foundation, Inc.", 8 | "publishedTime": "2003-02-28T21:51:08Z", 9 | "readerable": true 10 | } 11 | -------------------------------------------------------------------------------- /test-pages/ok/wikia/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "'Star Wars' Original Cuts Might Get Released for 40th Anniversary", 3 | "byline": "James Akinaka", 4 | "dir": null, 5 | "lang": "en", 6 | "excerpt": "As a 40th birthday present to the Star Wars Saga and its fans, Lucasfilm could re-release the original versions of the original trilogy films.", 7 | "siteName": "Fandom powered by Wikia", 8 | "publishedTime": "2017-02-23T17:18:13-08:00", 9 | "readerable": true, 10 | "image": "https://vignette.wikia.nocookie.net/1fb5ee36-d9ae-4125-96d9-f52eb403f1c9/thumbnail-down/width/1280/height/720" 11 | } 12 | -------------------------------------------------------------------------------- /test-pages/readability/firefox-nightly-blog/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "These Weeks in Firefox: Issue 85 – Firefox Nightly News", 3 | "byline": "Mike Conley", 4 | "dir": "ltr", 5 | "lang": "en-US", 6 | "excerpt": "Highlights Here's our Firefox Year in Review! Here’s our Performance Year in Review! 
We've just landed Bug 1553982, which aims to prevent starting an update while another Firefox instance ...", 7 | "siteName": "Firefox Nightly News", 8 | "publishedTime": null, 9 | "readerable": true, 10 | "image": "https://blog.nightly.mozilla.org/files/2019/07/nightly-blog-header.jpg" 11 | } 12 | -------------------------------------------------------------------------------- /test-pages/readability/spiceworks/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Rewriting Rules of Engagement with Video in 2020: Vidyard Introduces New Features on its Video Platform", 3 | "byline": "Last Updated: January 7, 2025", 4 | "dir": null, 5 | "lang": "en-US", 6 | "excerpt": "Vidyard launches new tools and features on its platform to help professionals collaborate and share videos on the go.", 7 | "siteName": "Spiceworks", 8 | "publishedTime": "2020-07-10T14:15:42+00:00", 9 | "readerable": true, 10 | "image": "https://images.spiceworks.com/ad/93/9f099ec74374a67121155d25f38a/viyard.jpg" 11 | } 12 | -------------------------------------------------------------------------------- /test-pages/ok/wikipedia/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Mozilla - Wikipedia", 3 | "byline": null, 4 | "dir": "ltr", 5 | "lang": "en", 6 | "excerpt": "Mozilla is a free-software community, created in 1998 by members of Netscape. 
The Mozilla community uses, develops, spreads and supports Mozilla products, thereby promoting exclusively free software and open standards, with only minor exceptions.[1] The community is supported institutionally by the Mozilla Foundation and its tax-paying subsidiary, the Mozilla Corporation.[2]", 7 | "siteName": null, 8 | "publishedTime": null, 9 | "readerable": true 10 | } 11 | -------------------------------------------------------------------------------- /test-pages/readability/lemonde-1/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Le projet de loi sur le renseignement massivement approuvé à l'Assemblée", 3 | "byline": "Martin Untersinger (avec Damien Leloup et Morgane Tual)", 4 | "dir": null, 5 | "lang": "fr", 6 | "excerpt": "Largement approuvé par les députés, le texte sera désormais examiné par le Sénat, puis le Conseil constitutionnel.", 7 | "siteName": "Le Monde.fr", 8 | "publishedTime": null, 9 | "readerable": true, 10 | "image": "http://s1.lemde.fr/image/2015/05/05/600x315/4628128_3_47fc_projet-de-loi-renseignement_aeba800424730d672d1bd08faf203438.jpg" 11 | } 12 | -------------------------------------------------------------------------------- /test-pages/readability/nytimes-5/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "The New York Times en Español", 3 | "byline": "Tariq Panja", 4 | "dir": null, 5 | "lang": "es", 6 | "excerpt": "Entérate de lo que está pasando en el mundo y de las noticias económicas, de negocios, tecnología, arte, estilos de vida, deporte, ciencia y opiniones. 
No importa cuáles sean tus intereses: el Times lo cubre con inmejorable calidad, profundidad e independencia.", 7 | "siteName": null, 8 | "publishedTime": null, 9 | "readerable": true, 10 | "image": "https://static01.nyt.com/newsgraphics/images/icons/defaultPromoCrop.png" 11 | } 12 | -------------------------------------------------------------------------------- /test-pages/readability/mozilla-1/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Firefox — Customize and make it your own — The most flexible browser on the Web", 3 | "byline": null, 4 | "dir": "ltr", 5 | "lang": "en", 6 | "excerpt": "It’s easier than ever to personalize Firefox and make it work the way\n you do.\n No other browser gives you so much choice and flexibility.", 7 | "siteName": "Mozilla", 8 | "publishedTime": null, 9 | "readerable": true, 10 | "image": "https://mozorg.cdn.mozilla.net/media/img/firefox/template/page-image.af8027a425de.png" 11 | } 12 | -------------------------------------------------------------------------------- /test-pages/ok/aclu/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Facebook Is Tracking Me Even Though I’m Not on Facebook", 3 | "byline": "Daniel Kahn Gillmor", 4 | "dir": "ltr", 5 | "lang": "en", 6 | "excerpt": "Facebook collects data about people who have never even opted in. 
But there are ways these non-users can protect themselves.", 7 | "siteName": "American Civil Liberties Union", 8 | "publishedTime": "2018-04-05T06:00", 9 | "readerable": true, 10 | "image": "https://www.aclu.org/sites/default/files/styles/metatag_og_image_1200x630/public/field_share_image/web18-facebook-socialshare-1200x628-v02.png?itok=p77cQjOm" 11 | } 12 | -------------------------------------------------------------------------------- /test-pages/ok/lwn-1/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "LWN.net Weekly Edition for March 26, 2015 [LWN.net]", 3 | "byline": "By Nathan Willis March 25, 2015", 4 | "dir": null, 5 | "excerpt": "The Arduino has been one of the biggest success stories of the open-hardware movement, but that success does not protect it from internal conflict. In recent months, two of the project's founders have come into conflict about the direction of future efforts—and that conflict has turned into a legal dispute about who owns the rights to the Arduino trademark.", 6 | "siteName": null, 7 | "publishedTime": null, 8 | "readerable": true 9 | } 10 | -------------------------------------------------------------------------------- /test-pages/readability/gitlab-blog/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "3 surprising findings from our 2024 Global DevSecOps Survey", 3 | "byline": "Dave Steer", 4 | "dir": null, 5 | "lang": "en-us", 6 | "excerpt": "This year, our survey revealed changes in organizations' investment priorities in the wake of AI — and how AI is shaping the way teams work.", 7 | "siteName": "GitLab", 8 | "publishedTime": "2024-06-25", 9 | "readerable": true, 10 | "image": "https://images.ctfassets.net/r9o86ar0p03f/Wz5s9ag9lbHesTOe6DEpF/64e9498cf34ee867e5fa5f6876733782/fy25-global-devsecops-report-blog-image.png?fm=webp&w=820&h=500" 11 | } 12 | 
-------------------------------------------------------------------------------- /test-pages/readability/topicseed-1/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Content Depth — Write Comprehensively About Your Core Topics", 3 | "byline": null, 4 | "dir": null, 5 | "excerpt": "Content writers and marketers find it hard to write a lot of content about a very specific topic. They lose a lot of points on their content depth because they would rather focus on pushing thin content about plenty of topics.", 6 | "siteName": "topicseed", 7 | "publishedTime": "2018-06-12T23:00:00.000Z", 8 | "readerable": true, 9 | "image": "https://topicseed.com/static/9c97da26f6eeee98fc2e628ca3416226/57090/content-depth-seo.png" 10 | } 11 | -------------------------------------------------------------------------------- /test-pages/readability/herald-sun-1/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Angry media won’t buckle over new surveillance laws", 3 | "byline": "Laurie Oakes", 4 | "dir": null, 5 | "lang": "en-au", 6 | "excerpt": "A HIGH-powered federal government team has been doing the rounds of media organisations in the past few days in an attempt to allay concerns about the impact of new surveillance legislation on press freedom. 
It failed.", 7 | "siteName": "HeraldSun", 8 | "publishedTime": null, 9 | "readerable": true, 10 | "image": "http://api.news.com.au/content/1.0/heraldsun/images/1227261885862?format=jpg&group=iphone&size=medium" 11 | } 12 | -------------------------------------------------------------------------------- /tests/metadata.rs: -------------------------------------------------------------------------------- 1 | use std::fs; 2 | 3 | mod common; 4 | 5 | use common::test_metadata; 6 | 7 | #[test] 8 | fn test_metadata_last_fail() { 9 | test_metadata( 10 | "./test-pages/readability/title-en-dash", 11 | Some("http://fakehost/test/"), 12 | ); 13 | } 14 | 15 | #[test] 16 | fn table_test_metadata() { 17 | let source_dirs = ["./test-pages/readability", "./test-pages/ok"]; 18 | for d in source_dirs { 19 | let paths = fs::read_dir(d).unwrap(); 20 | for p in paths { 21 | let pp = p.unwrap().path(); 22 | test_metadata(pp, Some("http://fakehost/test/")); 23 | } 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /test-pages/readability/remove-aria-hidden/source.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Remove aria-hidden elements test 6 | 7 | 8 |
9 |

Lorem

10 |
11 |

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod 12 | tempor incididunt ut labore et dolore magna aliqua.

13 |

Ut enim ad minim veniam, 14 | quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo 15 | consequat.

16 |
17 |
18 | 19 | 20 | -------------------------------------------------------------------------------- /test-pages/readability/theverge/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Apple’s Vision Pro hands-on: the Retina display moment for headsets", 3 | "byline": "Alex Heath", 4 | "dir": null, 5 | "lang": "en", 6 | "excerpt": "I tried Apple’s new Vision Pro headset, and just like the introduction of the iPhone 4 over a decade ago, there’s no going back from here.", 7 | "siteName": "The Verge", 8 | "publishedTime": "2023-06-07T20:54:26.829Z", 9 | "readerable": true, 10 | "image": "https://cdn.vox-cdn.com/thumbor/5UHVh564x9p5TRFclZJ-bcWqfzo=/0x0:2040x1360/1200x628/filters:focal(1046x614:1047x615)/cdn.vox-cdn.com/uploads/chorus_asset/file/24709755/DSC00889.jpg" 11 | } 12 | -------------------------------------------------------------------------------- /test-pages/readability/svg-parsing/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "SVG parsing", 3 | "byline": null, 4 | "dir": null, 5 | "excerpt": "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod\ntempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,\nquis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo\nconsequat. Duis aute irure dolor in reprehenderit in voluptate velit esse\ncillum dolore eu fugiat nulla pariatur. 
Excepteur sint occaecat cupidatat non\nproident, sunt in culpa qui officia deserunt mollit anim id est laborum.", 6 | "siteName": null, 7 | "publishedTime": null, 8 | "readerable": true 9 | } 10 | -------------------------------------------------------------------------------- /test-pages/readability/videos-2/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Screenshot : «Vape Wave», «6 Days», «Alphonse Président»…", 3 | "byline": "Alexandre Hervaud, Jérémy Piette", 4 | "dir": null, 5 | "lang": "fr", 6 | "excerpt": "Séries, documentaires, programmes jeunesse… Retrouvez les recommandations de Libération pour savoir quoi regarder sur vos écrans cette semaine.\nPour dépasser...", 7 | "siteName": "Libération", 8 | "publishedTime": "2017-11-24T18:42:20.314667", 9 | "readerable": true, 10 | "image": "https://medias.liberation.fr/photo/1075029-screenshot-alphonse-vape-wave-6-days.jpg?modified_at=1511536242&picto=fb&ratio_x=191&ratio_y=100&width=600" 11 | } 12 | -------------------------------------------------------------------------------- /test-pages/ok/base-url/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Base URL test", 3 | "byline": null, 4 | "dir": null, 5 | "excerpt": "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod\n tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,\n quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo\n consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse\n cillum dolore eu fugiat nulla pariatur. 
Excepteur sint occaecat cupidatat non\n proident, sunt in culpa qui officia deserunt mollit anim id est laborum.", 6 | "siteName": null, 7 | "publishedTime": null, 8 | "readerable": true 9 | } 10 | -------------------------------------------------------------------------------- /test-pages/readability/schema-org-context-object/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "South Korean president’s impeachment fails after ruling party lawmakers walk out", 3 | "byline": "Stella Kim, Jennifer Jett", 4 | "dir": null, 5 | "lang": "en", 6 | "excerpt": "South Korean President Yoon Suk Yeol apologized on Saturday for declaring martial law but did not say he would resign as he faces an impeachment vote.", 7 | "siteName": "NBC News", 8 | "publishedTime": "2024-12-06T22:00:40.000Z", 9 | "readerable": true, 10 | "image": "https://media-cldnry.s-nbcnews.com/image/upload/t_nbcnews-fp-1200-630,f_auto,q_auto:best/rockcms/2024-12/241206-yoon-south-korea-mb-0824-656188.jpg" 11 | } 12 | -------------------------------------------------------------------------------- /test-pages/readability/wordpress/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Stack Overflow Jobs Data Shows ReactJS Skills in High Demand, WordPress Market Oversaturated with Developers", 3 | "byline": "Sarah Gooding", 4 | "dir": "ltr", 5 | "lang": "en-US", 6 | "excerpt": "Stack Overflow published its analysis of 2017 hiring trends based on the targeting options employers selected when posting to Stack Overflow Jobs. 
The report, which compares data from 200 companies…", 7 | "siteName": "WordPress Tavern", 8 | "publishedTime": "2017-03-09T23:16:02+00:00", 9 | "readerable": true, 10 | "image": "https://i0.wp.com/wptavern.com/wp-content/uploads/2016/07/stack-overflow.png?fit=1200%2C470&ssl=1" 11 | } 12 | -------------------------------------------------------------------------------- /test-pages/ok/replace-font-tags/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Replace font tags test", 3 | "byline": null, 4 | "dir": null, 5 | "excerpt": "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod\n tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,\n quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo\n consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse\n cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non\n proident, sunt in culpa qui officia deserunt mollit anim id est laborum.", 6 | "siteName": null, 7 | "publishedTime": null, 8 | "readerable": true 9 | } 10 | -------------------------------------------------------------------------------- /test-pages/ok/toc-missing/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Simple Anomaly Detection Using Plain SQL", 3 | "byline": "Haki Benita", 4 | "dir": null, 5 | "lang": "en", 6 | "excerpt": "Many developers think that having a critical bug in their code is the worse thing that can happen. Well, there is something much worst than that: Having a critical bug in your code and not knowing about it! 
Using some high school level statistics and a fair knowledge of SQL, I implemented a very simple anomaly detection system.", 7 | "siteName": "Haki Benita", 8 | "publishedTime": "2020-09-21", 9 | "readerable": true, 10 | "image": "https://hakibenita.com/images/00-sql-anomaly-detection-scatter-plot.png" 11 | } 12 | -------------------------------------------------------------------------------- /test-pages/ok/social-buttons/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Share buttons removal test", 3 | "byline": null, 4 | "dir": null, 5 | "excerpt": "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod\n tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,\n quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo\n consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse\n cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non\n proident, sunt in culpa qui officia deserunt mollit anim id est laborum.", 6 | "siteName": null, 7 | "publishedTime": null, 8 | "readerable": true 9 | } 10 | -------------------------------------------------------------------------------- /test-pages/readability/embedded-videos/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Embedded videos test", 3 | "byline": null, 4 | "dir": null, 5 | "excerpt": "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod\n tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,\n quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo\n consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse\n cillum dolore eu fugiat nulla pariatur. 
Excepteur sint occaecat cupidatat non\n proident, sunt in culpa qui officia deserunt mollit anim id est laborum.", 6 | "siteName": null, 7 | "publishedTime": null, 8 | "readerable": true 9 | } 10 | -------------------------------------------------------------------------------- /test-pages/readability/style-tags-removal/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Style tags removal", 3 | "byline": null, 4 | "dir": null, 5 | "excerpt": "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod\n tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,\n quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo\n consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse\n cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non\n proident, sunt in culpa qui officia deserunt mollit anim id est laborum.", 6 | "siteName": null, 7 | "publishedTime": null, 8 | "readerable": true 9 | } 10 | -------------------------------------------------------------------------------- /test-pages/readability/base-url-base-element/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Base URL with base test", 3 | "byline": null, 4 | "dir": null, 5 | "excerpt": "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod\n tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,\n quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo\n consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse\n cillum dolore eu fugiat nulla pariatur. 
Excepteur sint occaecat cupidatat non\n proident, sunt in culpa qui officia deserunt mollit anim id est laborum.", 6 | "siteName": null, 7 | "publishedTime": null, 8 | "readerable": true 9 | } 10 | -------------------------------------------------------------------------------- /test-pages/readability/lifehacker-working/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "How to Program Your Mind to Stop Buying Crap You Don’t Need", 3 | "byline": "Patrick Allan", 4 | "dir": null, 5 | "lang": "en-US", 6 | "excerpt": "We all buy things from time to time that we don't really need. It's okay to appeal to your wants every once in a while, as long as you're in control. If you struggle with clutter, impulse buys, and buyer's remorse, here's how to put your mind in the right place before you even set foot in a store.", 7 | "siteName": "Lifehacker", 8 | "publishedTime": null, 9 | "readerable": true, 10 | "image": "http://i.kinja-img.com/gawker-media/image/upload/s--hqqO9fze--/n1s6c2m6kc07iqdyllj6.jpg" 11 | } 12 | -------------------------------------------------------------------------------- /test-pages/readability/title-en-dash/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Lorem ipsum dolor sit amet", 3 | "byline": null, 4 | "dir": null, 5 | "excerpt": "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod\n incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,\n quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo\n consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse\n cillum dolore eu fugiat nulla pariatur. 
Excepteur sint occaecat cupidatat non\n proident, sunt in culpa qui officia deserunt mollit anim id est laborum.", 6 | "siteName": null, 7 | "publishedTime": null, 8 | "readerable": true 9 | } 10 | -------------------------------------------------------------------------------- /test-pages/readability/normalize-spaces/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Normalize space test", 3 | "byline": null, 4 | "dir": null, 5 | "excerpt": "Lorem\n ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod\n\ttab here\n incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,\n quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo\n consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse\n cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non\n proident, sunt in culpa qui officia deserunt mollit anim id est laborum.", 6 | "siteName": null, 7 | "publishedTime": null, 8 | "readerable": true 9 | } 10 | -------------------------------------------------------------------------------- /test-pages/readability/parsely-metadata/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Some Other Title", 3 | "byline": "Jane Doe", 4 | "dir": null, 5 | "lang": null, 6 | "excerpt": "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. 
Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.", 7 | "siteName": null, 8 | "publishedTime": "2024-04-20T04:20:00.000Z", 9 | "readerable": true 10 | } 11 | -------------------------------------------------------------------------------- /test-pages/ok/base-url-base-element-relative/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Base URL with base relative test", 3 | "byline": null, 4 | "dir": null, 5 | "excerpt": "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod\n tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,\n quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo\n consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse\n cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non\n proident, sunt in culpa qui officia deserunt mollit anim id est laborum.", 6 | "siteName": null, 7 | "publishedTime": null, 8 | "readerable": true 9 | } 10 | -------------------------------------------------------------------------------- /test-pages/readability/lifehacker-post-comment-load/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "How to Program Your Mind to Stop Buying Crap You Don’t Need", 3 | "byline": "Patrick Allan", 4 | "dir": null, 5 | "lang": "en-US", 6 | "excerpt": "We all buy things from time to time that we don't really need. It's okay to appeal to your wants every once in a while, as long as you're in control. 
If you struggle with clutter, impulse buys, and buyer's remorse, here's how to put your mind in the right place before you even set foot in a store.", 7 | "siteName": "Lifehacker", 8 | "publishedTime": null, 9 | "readerable": true, 10 | "image": "http://i.kinja-img.com/gawker-media/image/upload/s--hqqO9fze--/n1s6c2m6kc07iqdyllj6.jpg" 11 | } 12 | -------------------------------------------------------------------------------- /test-pages/readability/seattletimes-1/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Alaskan halibut, caught by a century-old Seattle boat, provides a glimpse of Amazon’s strategy with Whole Foods", 3 | "byline": "April 28, 2019 at 6:01 am Updated April 29, 2019 at 3:33 pm", 4 | "dir": null, 5 | "lang": "en-US", 6 | "excerpt": "The story of Whole Foods’ halibut deal opens a window into Amazon’s grocery strategy and draws a line from a Seattle industry with roots in the 19th century to the dominant economic force of the 21st.", 7 | "siteName": "The Seattle Times", 8 | "publishedTime": "2019-04-28 06:01:07", 9 | "readerable": true, 10 | "image": "https://static.seattletimes.com/wp-content/uploads/2019/04/120028-1200x630.jpg" 11 | } 12 | -------------------------------------------------------------------------------- /test-pages/ok/ehow-1/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "How to Build a Terrarium | eHow", 3 | "byline": "Lucy Akins", 4 | "dir": null, 5 | "lang": "en-US", 6 | "excerpt": "Glass cloche terrariums are not only appealing to the eye, but they also preserve a bit of nature in your home and serve as a simple, yet beautiful, piece of art. Closed terrariums are easy to care for, as they retain much of their own moisture and provide a warm environment with a consistent level of humidity. 
You won’t have to water the...", 7 | "siteName": "eHow", 8 | "publishedTime": null, 9 | "readerable": true, 10 | "image": "http://img-aws.ehowcdn.com/200x200/cme/photography.prod.demandstudios.com/16149374-814f-40bc-baf3-ca20f149f0ba.jpg" 11 | } 12 | -------------------------------------------------------------------------------- /test-pages/readability/article-author-tag/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "The Deck of Cards That Made Tarot A Global Phenomenon", 3 | "byline": "Laura June Topolsky", 4 | "dir": null, 5 | "lang": "en", 6 | "excerpt": "Picture a deck of tarot cards. What do you see? Maybe the Magician in his rich red robes, right arm raised high above him. Or the skeleton on horseback for...", 7 | "siteName": "Atlas Obscura", 8 | "publishedTime": "2015-07-10T09:53:00-04:00", 9 | "readerable": true, 10 | "image": "https://img.atlasobscura.com/5wW1rzVrUruU4w1FCmJf-zrZn1UZIsOgSvVqRkVjLAo/rt:fit/w:600/q:81/sm:1/scp:1/ar:1/aHR0cHM6Ly9hdGxh/cy1kZXYuczMuYW1h/em9uYXdzLmNvbS91/cGxvYWRzL2Fzc2V0/cy8wMzc0MWMyZGYx/MWJmNTFjOTdfSU1H/XzI3MDB2MS5qcGc.jpg" 11 | } 12 | -------------------------------------------------------------------------------- /test-pages/readability/bug-1255978/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Seven secrets that hotel owners don't want you to know", 3 | "byline": "Hazel Sheffield", 4 | "dir": null, 5 | "excerpt": "Most people go to hotels for the pleasure of sleeping in a giant bed with clean white sheets and waking up to fresh towels in the morning. 
But those towels and sheets might not be as clean as they look, according to the hotel bosses that responded to an online thread about the things hotel owners don’t want you to know.", 6 | "siteName": "The Independent", 7 | "publishedTime": "2015-09-17T16:57:43+01:00", 8 | "readerable": true, 9 | "image": "https://static.independent.co.uk/s3fs-public/thumbnails/image/2015/12/06/10/bed-hotel-room.jpg" 10 | } 11 | -------------------------------------------------------------------------------- /test-pages/readability/lazy-image-3/expected.html: -------------------------------------------------------------------------------- 1 |
2 |
3 |

Test Case 1

4 | performance.jpg 5 |

Test Case 2

6 | performance.jpg 7 |
8 |
-------------------------------------------------------------------------------- /test-pages/readability/iab-1/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Getting LEAN with Digital Ad UX | IAB", 3 | "byline": "By Scott Cunningham", 4 | "dir": null, 5 | "lang": "en-US", 6 | "excerpt": "We messed up. As technologists, tasked with delivering content and services to users, we lost track of the user experience. Twenty years ago we saw an explosion of websites, built by developers around the world, providing all forms of content. This was the beginning of an age of enlightenment, the intersection of content and technology. … Continued", 7 | "siteName": "IAB", 8 | "publishedTime": "2015-10-15T08:00:26+00:00", 9 | "readerable": true, 10 | "image": "http://www.iab.com/wp-content/uploads/2015/10/getting-lean-with-digital-ad-ux-300x250.jpg" 11 | } 12 | -------------------------------------------------------------------------------- /test-pages/readability/title-and-h1-discrepancy/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "This is a long title with a colon: Hello there", 3 | "byline": null, 4 | "dir": null, 5 | "excerpt": "Lorem\n ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod\n incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,\n quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo\n consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse\n cillum dolore eu fugiat nulla pariatur. 
Excepteur sint occaecat cupidatat non\n proident, sunt in culpa qui officia deserunt mollit anim id est laborum.", 6 | "siteName": null, 7 | "publishedTime": null, 8 | "readerable": true 9 | } 10 | -------------------------------------------------------------------------------- /.github/workflows/rust.yml: -------------------------------------------------------------------------------- 1 | name: Rust CI 2 | 3 | on: 4 | push: 5 | branches: [ "main" ] 6 | pull_request: 7 | branches: [ "main" ] 8 | 9 | env: 10 | CARGO_TERM_COLOR: always 11 | RUST_BACKTRACE: 1 12 | 13 | jobs: 14 | build: 15 | 16 | runs-on: ubuntu-latest 17 | strategy: 18 | matrix: 19 | rust: [ stable, nightly, 1.75.0 ] 20 | 21 | steps: 22 | - uses: actions/checkout@v4 23 | - uses: dtolnay/rust-toolchain@master 24 | with: 25 | toolchain: ${{ matrix.rust }} 26 | - name: Build 27 | run: cargo build --verbose 28 | - name: Run tests 29 | run: cargo test --verbose --all-targets 30 | - name: Run tests with all features 31 | run: cargo test --verbose --all-targets --all-features 32 | -------------------------------------------------------------------------------- /test-pages/readability/lazy-image-3/source.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Lazy Load with Alt includes jpg/png/webp extensions 6 | 7 | 8 |
9 |

Test Case 1

10 | performance.jpg 14 |

Test Case 2

15 | performance.jpg 19 |
20 | 21 | -------------------------------------------------------------------------------- /test-pages/readability/telegraph/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Zimbabwe coup: Robert Mugabe and wife Grace 'insisting he finishes his term', as priest steps in to mediate", 3 | "byline": "Our Foreign Staff", 4 | "dir": null, 5 | "lang": "en-GB", 6 | "excerpt": "Zimbabwe President Robert Mugabe, his wife Grace and two key figures from her G40 political faction are under house arrest at Mugabe's \"Blue House\" compound in Harare and are insisting the 93 year-old finishes his presidential term, a source said.", 7 | "siteName": "The Telegraph", 8 | "publishedTime": null, 9 | "readerable": true, 10 | "image": "http://www.telegraph.co.uk/content/dam/news/2017/11/16/TELEMMGLPICT000146889449-xlarge_trans_NvBQzQNjv4BqySoB6nTCgtc7U4LQ_FPO4hKi2sT3vi7ux2-RDZwC4QA.jpeg" 11 | } 12 | -------------------------------------------------------------------------------- /test-pages/readability/guardian-1/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "'What is the sea telling us?': Māori tribes fearful over whale strandings | Eleanor Ainge Roy", 3 | "byline": "Eleanor Ainge Roy", 4 | "dir": null, 5 | "lang": "en", 6 | "excerpt": "New Zealand’s whale whisperers worry that manmade changes in the ocean are behind the spike in beachings", 7 | "siteName": "the Guardian", 8 | "publishedTime": "2019-01-03T07:00:02.000Z", 9 | "readerable": true, 10 | "image": "https://i.guim.co.uk/img/media/df84b519a877d652e950ecd4248320eec985934e/0_320_4800_2880/master/4800.jpg?width=1200&height=630&quality=85&auto=format&fit=crop&overlay-align=bottom%2Cleft&overlay-width=100p&overlay-base64=L2ltZy9zdGF0aWMvb3ZlcmxheXMvdGctZGVmYXVsdC5wbmc&s=af41545b21b557e4f57dd4221b6a7f89" 11 | } 12 | 
-------------------------------------------------------------------------------- /.github/workflows/coverage.yml: -------------------------------------------------------------------------------- 1 | name: Coverage 2 | 3 | on: 4 | push: 5 | branches: [ "main" ] 6 | pull_request: 7 | branches: [ "main" ] 8 | 9 | jobs: 10 | coverage: 11 | runs-on: ubuntu-24.04 12 | env: 13 | CARGO_TERM_COLOR: always 14 | steps: 15 | - uses: actions/checkout@v4 16 | - name: Update Rust 17 | run: rustup update stable 18 | - name: Install cargo-llvm-cov 19 | uses: taiki-e/install-action@cargo-llvm-cov 20 | - name: Generate code coverage 21 | run: cargo llvm-cov --lcov --output-path lcov.info 22 | - name: Upload coverage to Codecov 23 | uses: codecov/codecov-action@v5 24 | with: 25 | token: ${{ secrets.CODECOV_TOKEN }} 26 | files: lcov.info 27 | fail_ci_if_error: true -------------------------------------------------------------------------------- /test-pages/readability/yahoo-1/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "These are the 8 coolest PlayStation VR games", 3 | "byline": "Ben Silverman", 4 | "dir": null, 5 | "lang": "en-US", 6 | "excerpt": "To help you decide what’s what, I’ve put together this list of the 8 PSVR games worth considering. Beloved cult hit “Rez” gets the VR treatment to help launch the PSVR, and the results are terrific. 
Chaos, for sure, and also “Thumper.” Called a “violent rhythm game” by its creators, “Thumper” is, well", 7 | "siteName": null, 8 | "publishedTime": null, 9 | "readerable": true, 10 | "image": "http://l3.yimg.com/uu/api/res/1.2/4eRCPf9lJt_3q29.outekQ--/aD02Njk7dz03NDQ7c209MTthcHBpZD15dGFjaHlvbg--/http://media.zenfs.com/en/homerun/feed_manager_auto_publish_494/4406ef57dcb40376c513903b03bef048" 11 | } 12 | -------------------------------------------------------------------------------- /test-pages/readability/simplyfound-1/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Raspberry Pi 3 - The credit card sized PC that cost only $35 - All-time bestselling computer in UK", 3 | "byline": "Joe Wee Monday, February 29, 2016 @ 11:10 PM UTC", 4 | "dir": null, 5 | "lang": "en", 6 | "excerpt": "The Raspberry Pi Foundation started by a handful of volunteers in 2012 when they released the original Raspberry Pi 256MB Model B without knowing what to expect. In a short four-year period they have grown to over sixty full-time employees and ha...", 7 | "siteName": "SIMPLYFOUND.COM | BY: JOE WEE", 8 | "publishedTime": null, 9 | "readerable": true, 10 | "image": "https://d34hb2g9mvfppu.cloudfront.net/m/images/cache/images/2016/02/29/apcnews2012raspberry_pi_logo_mainimage8_jpg8_322_27630a8388eb_lg.jpg" 11 | } 12 | -------------------------------------------------------------------------------- /crates/js/tests/web.rs: -------------------------------------------------------------------------------- 1 | //! Test suite for the Web and headless browsers. 
2 | 3 | #![cfg(target_arch = "wasm32")] 4 | 5 | extern crate wasm_bindgen_test; 6 | use wasm_bindgen::JsValue; 7 | use wasm_bindgen_test::*; 8 | 9 | //wasm_bindgen_test_configure!(run_in_browser); 10 | 11 | #[wasm_bindgen_test] 12 | fn test_parse() { 13 | let contents = include_str!("../test-pages/rustwiki_2024.html"); 14 | let res = dom_smoothie_js::parse(contents); 15 | assert!(res.is_ok()); 16 | } 17 | 18 | #[wasm_bindgen_test] 19 | fn test_parse_constructor() { 20 | let contents = include_str!("../test-pages/rustwiki_2024.html"); 21 | 22 | let mut ra = 23 | dom_smoothie_js::Readability::new(contents.to_string(), None, JsValue::null()).unwrap(); 24 | 25 | let article = ra.parse(); 26 | assert!(article.is_ok()); 27 | } 28 | -------------------------------------------------------------------------------- /src/ac_automat.rs: -------------------------------------------------------------------------------- 1 | use aho_corasick::{AhoCorasick, AhoCorasickKind}; 2 | use once_cell::sync::Lazy; 3 | 4 | use crate::glob::{CLASSES_NEGATIVE, CLASSES_POSITIVE, MAYBE_CANDIDATES, UNLIKELY_CANDIDATES}; 5 | 6 | pub(crate) static AC_UNLIKELY: Lazy<AhoCorasick> = Lazy::new(|| ac_automaton(UNLIKELY_CANDIDATES)); 7 | pub(crate) static AC_MAYBE: Lazy<AhoCorasick> = Lazy::new(|| ac_automaton(MAYBE_CANDIDATES)); 8 | pub(crate) static AC_CLASSES_NEGATIVE: Lazy<AhoCorasick> = 9 | Lazy::new(|| ac_automaton(CLASSES_NEGATIVE)); 10 | pub(crate) static AC_CLASSES_POSITIVE: Lazy<AhoCorasick> = 11 | Lazy::new(|| ac_automaton(CLASSES_POSITIVE)); 12 | 13 | fn ac_automaton(patterns: &[&str]) -> AhoCorasick { 14 | AhoCorasick::builder() 15 | .kind(Some(AhoCorasickKind::ContiguousNFA)) 16 | .build(patterns) 17 | .unwrap() 18 | } 19 | -------------------------------------------------------------------------------- /test-pages/readability/dropbox-blog/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "How we designed Dropbox’s ATF - an async task framework", 3 | "byline": "Arun Sai
Krishnan", 4 | "dir": null, 5 | "lang": "en", 6 | "excerpt": "I joined Dropbox not long after graduating with a Master’s degree in computer science. Aside from an internship, this was my first big-league engineering job. My team had already begun designing a critical internal service that most of our software would use: It would handle asynchronous computing requests behind the scenes, powering everything from dragging a file into a Dropbox folder to scheduling a marketing campaign.", 7 | "siteName": null, 8 | "publishedTime": null, 9 | "readerable": true, 10 | "image": "https://aem.dropbox.com/cms/content/dam/dropbox/tech-blog/en-us/2020/11/atf/diagrams/Techblog-ATF-Social.png" 11 | } 12 | -------------------------------------------------------------------------------- /test-pages/readability/ehow-2/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "How to Throw a Graduation Party on a Budget | eHow", 3 | "byline": "Gina Roberts-Grey", 4 | "dir": null, 5 | "lang": "en-US", 6 | "excerpt": "Graduation parties are a great way to commemorate the years of hard work teens and college co-eds devote to education. They’re also costly for mom and dad.The average cost of a graduation party in 2013 was a whopping $1,200, according to Graduationparty.com; $700 of that was allocated for food. 
However that budget was based on Midwestern...", 7 | "siteName": "eHow", 8 | "publishedTime": null, 9 | "readerable": true, 10 | "image": "http://img-aws.ehowcdn.com/200x200/cme/cme_public_images/www_ehow_com/cdn-write.demandstudios.com/upload/image/2F/86/5547EF62-EAF5-4256-945D-0496F61C862F/5547EF62-EAF5-4256-945D-0496F61C862F.jpg" 11 | } 12 | -------------------------------------------------------------------------------- /test-pages/readability/reordering-paragraphs/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "", 3 | "byline": null, 4 | "dir": null, 5 | "excerpt": "Regarding item# 11111, under sufficiently extreme conditions, quarks may\n become deconfined and exist as free particles. In the course of asymptotic\n freedom, the strong interaction becomes weaker at higher temperatures.\n Eventually, color confinement would be lost and an extremely hot plasma\n of freely moving quarks and gluons would be formed. 
This theoretical phase\n of matter is called quark-gluon plasma.[81] The exact conditions needed\n to give rise to this state are unknown and have been the subject of a great\n deal of speculation and experimentation.", 6 | "siteName": null, 7 | "publishedTime": null, 8 | "readerable": true 9 | } 10 | -------------------------------------------------------------------------------- /tests/alt.rs: -------------------------------------------------------------------------------- 1 | mod common; 2 | 3 | use std::fs; 4 | 5 | use common::test_alt_text; 6 | 7 | #[test] 8 | fn test_alt_formatted_last_fail() { 9 | test_alt_text( 10 | "./test-pages/alt/arxiv", 11 | dom_smoothie::TextMode::Formatted, 12 | "expected_alt.txt", 13 | ); 14 | } 15 | 16 | #[test] 17 | fn table_test_alt_formatted_text() { 18 | let paths = fs::read_dir("./test-pages/alt").unwrap(); 19 | for p in paths { 20 | let pp = p.unwrap().path(); 21 | test_alt_text(pp, dom_smoothie::TextMode::Formatted, "expected_alt.txt"); 22 | } 23 | } 24 | 25 | #[test] 26 | fn table_test_alt_markdown() { 27 | let paths = fs::read_dir("./test-pages/alt").unwrap(); 28 | for p in paths { 29 | let pp = p.unwrap().path(); 30 | test_alt_text(pp, dom_smoothie::TextMode::Markdown, "expected.md"); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /test-pages/readability/lazy-image-2/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "The Spectacular Story Of Metroid, One Of Gaming's Richest Universes", 3 | "byline": "Mama Robotnik", 4 | "dir": null, 5 | "lang": "en-us", 6 | "excerpt": "Nothing beats the passion of a true fan writing about something they love. That's what you're about to see here: one of the richest, most amazing tributes to a great gaming series that we've ever run on Kotaku. Warning #1: this one might make your browser chug, so close your other tabs. 
Warning #2: This piece might make it hurt a little more than there are no new Metroid games from Nintendo on the horizon.", 7 | "siteName": "Kotaku", 8 | "publishedTime": "2013-09-11T10:00:00-04:00", 9 | "readerable": true, 10 | "image": "https://i.kinja-img.com/gawker-media/image/upload/c_fill,f_auto,fl_progressive,g_center,h_675,pg_1,q_80,w_1200/18zu12g5xzyxojpg.jpg" 11 | } 12 | -------------------------------------------------------------------------------- /.github/workflows/wasm.yml: -------------------------------------------------------------------------------- 1 | name: wasm ci 2 | 3 | on: 4 | push: 5 | branches: [ "main", "feature/*" ] 6 | pull_request: 7 | branches: [ "main" ] 8 | 9 | env: 10 | CARGO_TERM_COLOR: always 11 | RUST_BACKTRACE: 1 12 | CARGO_TARGET_WASM32_UNKNOWN_UNKNOWN_RUNNER: wasm-bindgen-test-runner 13 | 14 | jobs: 15 | test-wasm: 16 | 17 | runs-on: ubuntu-24.04 18 | steps: 19 | - uses: actions/checkout@v4 20 | - name: Install stable rust 21 | uses: dtolnay/rust-toolchain@master 22 | with: 23 | toolchain: stable 24 | targets: wasm32-unknown-unknown 25 | - name: Install wasm-bindgen-cli 26 | uses: taiki-e/install-action@v2 27 | with: 28 | tool: wasm-pack 29 | - uses: Swatinem/rust-cache@v2 30 | with: 31 | workspaces: . 
32 | - name: Run tests 33 | working-directory: crates/js 34 | run: wasm-pack test --node -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | mod config; 2 | mod glob; 3 | mod grab; 4 | mod grab_flags; 5 | mod helpers; 6 | mod matching; 7 | mod prep_article; 8 | mod readability; 9 | mod readable; 10 | mod score; 11 | #[cfg(feature = "serde")] 12 | mod serde_helpers; 13 | mod url_helpers; 14 | 15 | #[cfg(feature = "aho-corasick")] 16 | mod ac_automat; 17 | 18 | pub use config::{CandidateSelectMode, Config, ParsePolicy, TextMode}; 19 | pub use readability::Article; 20 | pub use readability::Metadata; 21 | pub use readability::Readability; 22 | pub use readable::is_probably_readable; 23 | 24 | use thiserror::Error; 25 | 26 | #[derive(Error, Debug)] 27 | pub enum ReadabilityError { 28 | #[error("the document URL must be absolute")] 29 | BadDocumentURL, 30 | #[error("failed to grab the article")] 31 | GrabFailed, 32 | #[error("too many elements in the document to parse (found {0}, maximum {1})")] 33 | TooManyElements(usize, usize), 34 | } 35 | -------------------------------------------------------------------------------- /src/grab_flags.rs: -------------------------------------------------------------------------------- 1 | use flagset::flags; 2 | 3 | flags! { 4 | /// Flags for the grab function, controlling different heuristics for content extraction. 5 | pub enum GrabFlags: u8 { 6 | /// Removes elements that are unlikely to be part of the main content. 7 | StripUnlikelys, 8 | /// Considers element class and id attributes when calculating content scores. 9 | WeightClasses, 10 | /// Applies additional content cleaning after identifying the main content. 
11 | CleanConditionally, 12 | } 13 | } 14 | 15 | #[cfg(test)] 16 | mod tests { 17 | use flagset::FlagSet; 18 | 19 | use super::*; 20 | 21 | #[test] 22 | fn test_grab_flags() { 23 | let mut flags: FlagSet<GrabFlags> = FlagSet::full(); 24 | assert!(flags.contains(GrabFlags::StripUnlikelys)); 25 | flags -= GrabFlags::StripUnlikelys; 26 | assert!(!flags.contains(GrabFlags::StripUnlikelys)); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /test-pages/ld.json: -------------------------------------------------------------------------------- 1 | { 2 | "@context": "https://schema.org", 3 | "@type": "Article", 4 | "name": "Rust (programming language)", 5 | "url": "https://en.wikipedia.org/wiki/Rust_(programming_language)", 6 | "sameAs": "http://www.wikidata.org/entity/Q575650", 7 | "mainEntity": "http://www.wikidata.org/entity/Q575650", 8 | "author": { 9 | "@type": "Organization", 10 | "name": "Contributors to Wikimedia projects" 11 | }, 12 | "publisher": { 13 | "@type": "Organization", 14 | "name": "Wikimedia Foundation, Inc.", 15 | "logo": { 16 | "@type": "ImageObject", 17 | "url": "https://www.wikimedia.org/static/images/wmf-hor-googpub.png" 18 | } 19 | }, 20 | "datePublished": "2010-10-30T22:30:54Z", 21 | "dateModified": "2024-10-03T02:23:08Z", 22 | "image": "https://upload.wikimedia.org/wikipedia/commons/d/d5/Rust_programming_language_black_logo.svg", 23 | "headline": "memory-safe programming language without garbage collection" 24 | } 25 | -------------------------------------------------------------------------------- /test-pages/ok/engadget/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Xbox One X review: A console that keeps up with gaming PCs", 3 | "byline": "Devindra Hardawar", 4 | "dir": null, 5 | "lang": "en", 6 | "excerpt": "The Xbox One X is the most powerful gaming console ever, but it's not for everyone yet.", 7 | "siteName": "Engadget", 8 |
"publishedTime": "2017-11-03 03:01:00.000000", 9 | "readerable": true, 10 | "image": "https://o.aolcdn.com/images/dims?thumbnail=1200%2C630&quality=80&image_uri=https%3A%2F%2Fo.aolcdn.com%2Fimages%2Fdims%3Fcrop%3D1600%252C943%252C0%252C0%26quality%3D85%26format%3Djpg%26resize%3D1600%252C943%26image_uri%3Dhttp%253A%252F%252Fo.aolcdn.com%252Fhss%252Fstorage%252Fmidas%252Fd457f269d1400106a402302a310de800%252F205826071%252FXbox%252BOne%252BX%252Breview%252Bgallery%252B1.jpg%26client%3Da1acac3e1b3290917d92%26signature%3D9fa7b29452763464190c6edff6822489df516d1b&client=cbc79c14efcebee57402&signature=ab988e814a2686e0d93dd7ae306d241356fb7b9c" 11 | } 12 | -------------------------------------------------------------------------------- /tests/readability.rs: -------------------------------------------------------------------------------- 1 | use std::fs; 2 | 3 | mod common; 4 | 5 | use common::test_readability; 6 | 7 | #[test] 8 | fn table_test_readability() { 9 | let paths = fs::read_dir("./test-pages/readability").unwrap(); 10 | 11 | for p in paths { 12 | test_readability(p.unwrap().path()); 13 | } 14 | } 15 | 16 | #[test] 17 | #[cfg(feature = "serde")] 18 | fn test_serde() { 19 | let contents = include_str!("../test-pages/ok/base-url-base-element-relative/source.html"); 20 | let document_url = Some("http://fakehost/test/"); 21 | let mut ra = dom_smoothie::Readability::new(contents, document_url, None).unwrap(); 22 | let article = ra.parse().unwrap(); 23 | let article_json = serde_json::to_string(&article); 24 | assert!(article_json.is_ok()); 25 | 26 | let article_json = article_json.unwrap(); 27 | let article_copy: dom_smoothie::Article = serde_json::from_str(&article_json).unwrap(); 28 | assert_eq!(article.content, article_copy.content); 29 | } 30 | -------------------------------------------------------------------------------- /test-pages/readability/data-url-image/expected-metadata.json: -------------------------------------------------------------------------------- 1 
| { 2 | "title": "Document", 3 | "byline": null, 4 | "dir": null, 5 | "lang": "en", 6 | "excerpt": "Lorem ipsum dolor sit amet consectetur adipisicing elit. Natus eaque totam provident obcaecati nisi praesentium iusto velit fuga debitis quidem ut repellat corrupti, eligendi inventore quibusdam perspiciatis delectus omnis pariatur excepturi quasi fugit? A adipisci natus nostrum, qui aperiam, at culpa corrupti autem enim earum vitae. Nostrum et officiis facere ex recusandae tenetur, delectus odit provident soluta id perferendis ducimus quibusdam corporis rerum voluptatem architecto sequi beatae quod mollitia voluptatibus earum tempora inventore ut. Deserunt reprehenderit recusandae nostrum, eaque fuga cum, repellat, perspiciatis ducimus in non consequatur ratione. Sint rerum necessitatibus deleniti odio earum voluptatum eos modi ab dolor minus.", 7 | "siteName": null, 8 | "publishedTime": null, 9 | "readerable": true 10 | } 11 | -------------------------------------------------------------------------------- /test-pages/readability/normalize-spaces/expected.html: -------------------------------------------------------------------------------- 1 |
2 |
3 |

Lorem

4 |

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tab here incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

5 |

Foo

6 |

Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

7 |
8 |
-------------------------------------------------------------------------------- /test-pages/readability/style-tags-removal/expected.html: -------------------------------------------------------------------------------- 1 |
2 |
3 |

Lorem

4 |

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

5 |

Foo

6 |

Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

7 |
8 |
-------------------------------------------------------------------------------- /test-pages/readability/title-en-dash/expected.html: -------------------------------------------------------------------------------- 1 |
2 |
3 |

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

4 |

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

5 |
6 |
-------------------------------------------------------------------------------- /test-pages/readability/hidden-nodes/expected.html: -------------------------------------------------------------------------------- 1 |
2 |

Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.

3 |

Secondary header

4 |

Third header

5 |
-------------------------------------------------------------------------------- /.github/workflows/audit.yml: -------------------------------------------------------------------------------- 1 | name: Rust Audit 2 | 3 | on: 4 | push: 5 | branches: [ "main" ] 6 | pull_request: 7 | branches: [ "main" ] 8 | 9 | env: 10 | CARGO_TERM_COLOR: always 11 | RUST_BACKTRACE: 1 12 | 13 | jobs: 14 | 15 | msrv-verify: 16 | runs-on: ubuntu-24.04 17 | steps: 18 | - uses: actions/checkout@v4 19 | - uses: dtolnay/rust-toolchain@master 20 | with: 21 | toolchain: 1.75.0 22 | - name: Install cargo msrv 23 | uses: taiki-e/install-action@v2 24 | with: 25 | tool: cargo-msrv 26 | - name: Verify MSRV 27 | run: cargo msrv verify 28 | 29 | audit: 30 | runs-on: ubuntu-24.04 31 | steps: 32 | - uses: actions/checkout@v4 33 | - name: Run clippy 34 | run: cargo clippy --verbose --all-targets --all-features -- -D warnings 35 | - name: Install cargo deny 36 | uses: taiki-e/install-action@cargo-deny 37 | - name: Check advisories 38 | run: cargo deny check advisories 39 | - name: Check bans 40 | run: cargo deny check bans -------------------------------------------------------------------------------- /test-pages/readability/remove-extra-brs/expected.html: -------------------------------------------------------------------------------- 1 |
2 |
3 |

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.

4 |

Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.

5 |

Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

6 |
7 |
8 |

Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.

9 |

Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

10 |
11 |
-------------------------------------------------------------------------------- /test-pages/readability/remove-script-tags/expected.html: -------------------------------------------------------------------------------- 1 |
2 |
3 |

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.

4 |

Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.

5 |

Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

6 |
7 |
8 |

Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.

9 |

Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

10 |
11 |
-------------------------------------------------------------------------------- /test-pages/readability/basic-tags-cleaning/expected.html: -------------------------------------------------------------------------------- 1 |
2 |
3 |

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.

4 |

Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.

5 |

Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

6 |
7 |
8 |

Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.

9 |

Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

10 |
11 |
-------------------------------------------------------------------------------- /test-pages/readability/remove-extra-paragraphs/expected.html: -------------------------------------------------------------------------------- 1 |
2 |
3 |

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.

4 |

Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.

5 |

Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

6 |
7 |
8 |

Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.

9 |

Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

10 |
11 |
-------------------------------------------------------------------------------- /test-pages/readability/comment-inside-script-parsing/expected.html: -------------------------------------------------------------------------------- 1 |
2 |
3 |

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.

4 |

Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.

5 |

Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

6 |
7 |
8 |

Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.

9 |

Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

10 |
11 |
-------------------------------------------------------------------------------- /test-pages/ok/003-metadata-preferred/expected.html: -------------------------------------------------------------------------------- 1 |
2 |
3 |

Test document title

4 |

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

5 |

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

6 |
7 |
-------------------------------------------------------------------------------- /test-pages/readability/parsely-metadata/expected.html: -------------------------------------------------------------------------------- 1 |
2 |
3 |

Test document title

4 |

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

5 |

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

6 |
7 |
-------------------------------------------------------------------------------- /test-pages/readability/metadata-content-missing/expected.html: -------------------------------------------------------------------------------- 1 |
2 |
3 |

Test document title

4 |

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

5 |

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

6 |
7 |
-------------------------------------------------------------------------------- /test-pages/ok/004-metadata-space-separated-properties/expected.html: -------------------------------------------------------------------------------- 1 |
2 |
3 |

Test document title

4 |

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

5 |

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

6 |
7 |
-------------------------------------------------------------------------------- /test-pages/readability/rtl-1/expected.html: -------------------------------------------------------------------------------- 1 |
2 |
3 |
4 |

Lorem

5 |

Lorem ipsum dolor sit amet.

6 |

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

7 |

Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

8 |
9 |
10 |
-------------------------------------------------------------------------------- /test-pages/readability/rtl-2/expected.html: -------------------------------------------------------------------------------- 1 |
2 |
3 |
4 |

Lorem

5 |

Lorem ipsum dolor sit amet.

6 |

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

7 |

Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

8 |
9 |
10 |
-------------------------------------------------------------------------------- /test-pages/readability/yahoo-3/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Veteran Wraps Baby in American Flag, Photo Sparks Controversy", 3 | "byline": "By GILLIAN MOHNEY March 11, 2015 3:46 PM", 4 | "dir": "ltr", 5 | "lang": "en-US", 6 | "excerpt": "A photographer and Navy veteran is fighting back after a photo she posted to Facebook started an online backlash. Vanessa Hicks said she had no idea her photo would be considered controversial. The photo, from a military family’s newborn photo shoot, showed a newborn infant wrapped in an American flag held by his father, who was in his military uniform. Hicks, a Navy veteran herself and the wife of an active-duty Navy member, said her intention was to honor the flag as well as her clients, who wanted to incorporate their military service in the photo shoot.", 7 | "siteName": "Yahoo", 8 | "publishedTime": null, 9 | "readerable": true, 10 | "image": "https://s.yimg.com/bt/api/res/1.2/qZaM9MLUOrxLg4IfXt_Niw--/YXBwaWQ9eW5ld3NfbGVnbztxPTc1O3c9NjAw/http://media.zenfs.com/en-US/video/video.abcnewsplus.com/559ecdbafdb839129816b5c79a996975.cf.png" 11 | } 12 | -------------------------------------------------------------------------------- /test-pages/readability/rtl-3/expected.html: -------------------------------------------------------------------------------- 1 |
2 |
3 |
4 |

Lorem

5 |

Lorem ipsum dolor sit amet.

6 |

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

7 |

Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

8 |
9 |
10 |
-------------------------------------------------------------------------------- /test-pages/readability/rtl-4/expected.html: -------------------------------------------------------------------------------- 1 |
2 |
3 |
4 |

Lorem

5 |

Lorem ipsum dolor sit amet.

6 |

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

7 |

Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

8 |
9 |
10 |
-------------------------------------------------------------------------------- /test-pages/ok/replace-brs/expected.html: -------------------------------------------------------------------------------- 1 |
2 |
3 |

Lorem ipsum
dolor sit

4 |

amet, consectetur adipisicing elit, sed do eiusmod
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo

5 |

consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

6 |
7 |
8 |

Tempor

9 |

incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse

10 |

cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

11 |
12 |
-------------------------------------------------------------------------------- /test-pages/readability/title-and-h1-discrepancy/expected.html: -------------------------------------------------------------------------------- 1 |
2 |
3 |

This is a long title with a colon: But the final text here is different

4 |

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

5 |

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

6 |
7 |
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Mykola Humanov 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /crates/js/LICENSE_MIT: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Mykola Humanov 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /test-pages/ok/replace-font-tags/expected.html: -------------------------------------------------------------------------------- 1 |
2 |
3 |

Lorem

4 |

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

5 |

Foo

6 |

Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

7 |
8 |
-------------------------------------------------------------------------------- /test-pages/readability/hidden-nodes/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt", 3 | "byline": null, 4 | "dir": null, 5 | "lang": "en", 6 | "excerpt": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.", 7 | "siteName": null, 8 | "publishedTime": null, 9 | "readerable": true 10 | } 11 | -------------------------------------------------------------------------------- /test-pages/readability/normalize-spaces/source.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Normalize space test 6 | 7 | 8 |
9 |

Lorem

10 |
11 | Lorem 12 | ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod 13 | tab here 14 | incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, 15 | quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo 16 | consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse 17 | cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non 18 | proident, sunt in culpa qui officia deserunt mollit anim id est laborum. 19 |
20 |

Foo

21 |
22 | Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, 23 | quis nostrud exercitation 24 | 25 | 26 | 27 | 28 | ullamco laboris nisi ut aliquip ex ea commodo 29 | consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse 30 | cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non 31 | proident, sunt in culpa qui officia deserunt mollit anim id est laborum. 32 |
33 |
34 | 35 | 36 | -------------------------------------------------------------------------------- /test-pages/ok/replace-brs/source.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Replace brs test 6 | 7 | 8 |
9 |

Lorem

10 |
11 | Lorem ipsum
dolor sit


amet, consectetur adipisicing elit, sed do eiusmod
12 | tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
13 | quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo

14 | consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
15 | cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
16 | proident, sunt in culpa qui officia deserunt mollit anim id est laborum. 17 |
18 |

Foo

19 |
20 | Tempor

incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
21 | quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
22 | consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse

23 | cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
24 | proident, sunt in culpa qui officia deserunt mollit anim id est laborum. 25 |
26 |
27 | 28 | 29 | -------------------------------------------------------------------------------- /test-pages/readability/title-en-dash/source.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Lorem ipsum dolor sit amet – My website 6 | 7 | 8 |
9 |
10 | Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod 11 | incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, 12 | quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo 13 | consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse 14 | cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non 15 | proident, sunt in culpa qui officia deserunt mollit anim id est laborum. 16 |
17 |
18 | Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod 19 | incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, 20 | quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo 21 | consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse 22 | cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non 23 | proident, sunt in culpa qui officia deserunt mollit anim id est laborum. 24 |
25 |
26 | 27 | 28 | -------------------------------------------------------------------------------- /test-pages/readability/rtl-1/source.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | RTL Test 7 | 8 | 9 | 10 |
11 |
12 |

Lorem

13 |

14 | Lorem ipsum dolor sit amet. 15 |

16 |

17 | Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. 18 |

19 |

20 | Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. 21 |

22 |
23 |
24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /test-pages/readability/rtl-3/source.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | RTL Test 7 | 8 | 9 | 10 |
11 |
12 |

Lorem

13 |

14 | Lorem ipsum dolor sit amet. 15 |

16 |

17 | Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. 18 |

19 |

20 | Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. 21 |

22 |
23 |
24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /test-pages/readability/rtl-4/source.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | RTL Test 7 | 8 | 9 | 10 |
11 |
12 |

Lorem

13 |

14 | Lorem ipsum dolor sit amet. 15 |

16 |

17 | Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. 18 |

19 |

20 | Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. 21 |

22 |
23 |
24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /test-pages/readability/rtl-2/source.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | RTL Test 7 | 8 | 9 | 10 |
11 |
12 |

Lorem

13 |

14 | Lorem ipsum dolor sit amet. 15 |

16 |

17 | Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. 18 |

19 |

20 | Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. 21 |

22 |
23 |
24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /test-pages/readability/remove-extra-brs/source.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Remove trailing brs test 6 | 7 | 8 |
9 |

Lorem

10 |
11 |
12 |

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod 13 | tempor incididunt ut labore et dolore magna aliqua.

14 |

Ut enim ad minim veniam, 15 | quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo 16 | consequat.


17 |

Duis aute irure dolor in reprehenderit in voluptate velit esse 18 | cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non 19 | proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

20 |
21 |

Foo

22 |
23 |

Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, 24 | quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo 25 | consequat.

26 |

Duis aute irure dolor in reprehenderit in voluptate velit esse 27 | cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non 28 | proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

29 |
30 |
31 | 32 | 33 | -------------------------------------------------------------------------------- /test-pages/readability/missing-paragraphs/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt", 3 | "byline": "Henri Sivonen", 4 | "dir": null, 5 | "lang": "en", 6 | "excerpt": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy\n eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam\n voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet\n clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit\n amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam\n nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat,\n sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum.\n Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor\n sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed\n diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat,\n sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum.\n Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor\n sit amet.", 7 | "siteName": null, 8 | "publishedTime": null, 9 | "readerable": true 10 | } 11 | -------------------------------------------------------------------------------- /test-pages/readability/comment-inside-script-parsing/source.html: -------------------------------------------------------------------------------- 1 | 2 | Test script parsing 3 | 4 | 10 |
11 |

Lorem

12 |
13 |

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod 14 | tempor incididunt ut labore et dolore magna aliqua.

15 |

Ut enim ad minim veniam, 16 | quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo 17 | consequat.

18 |

Duis aute irure dolor in reprehenderit in voluptate velit esse 19 | cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non 20 | proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

21 |
22 |

Foo

23 |
24 |

Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, 25 | quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo 26 | consequat.

27 |

Duis aute irure dolor in reprehenderit in voluptate velit esse 28 | cillum dolore eu fugiat nulla pariatur. 29 | Excepteur sint occaecat cupidatat non 30 | proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

31 |
32 |
33 | 34 | 35 | -------------------------------------------------------------------------------- /test-pages/ok/replace-font-tags/source.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Replace font tags test 6 | 7 | 8 |
9 |

Lorem

10 |
11 | Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod 12 | tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, 13 | quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo 14 | consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse 15 | cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non 16 | proident, sunt in culpa qui officia deserunt mollit anim id est laborum. 17 |
18 |

Foo

19 |
20 | Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, 21 | quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo 22 | consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse 23 | cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non 24 | proident, sunt in culpa qui officia deserunt mollit anim id est laborum. 25 |
26 |
27 | 28 | 29 | -------------------------------------------------------------------------------- /test-pages/readability/title-and-h1-discrepancy/source.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | This is a long title with a colon: Hello there 6 | 7 | 8 |
9 |

This is a long title with a colon: But the final text here is different

10 |
11 | Lorem 12 | ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod 13 | incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, 14 | quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo 15 | consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse 16 | cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non 17 | proident, sunt in culpa qui officia deserunt mollit anim id est laborum. 18 |
19 |
20 | Lorem 21 | ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod 22 | incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, 23 | quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo 24 | consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse 25 | cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non 26 | proident, sunt in culpa qui officia deserunt mollit anim id est laborum. 27 |
28 |
29 | 30 | 31 | -------------------------------------------------------------------------------- /test-pages/readability/remove-extra-paragraphs/source.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Replace font tags test 6 | 7 | 8 |
9 |

Lorem

10 |
11 |

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod 12 | tempor incididunt ut labore et dolore magna aliqua.

13 |

14 |

Ut enim ad minim veniam, 15 | quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo 16 | consequat.

17 |

18 |

19 |

Duis aute irure dolor in reprehenderit in voluptate velit esse 20 | cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non 21 | proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

22 |

23 |
24 |

Foo

25 |
26 |

Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, 27 | quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo 28 | consequat.

29 |

30 |

31 |

Duis aute irure dolor in reprehenderit in voluptate velit esse 32 | cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non 33 | proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

34 |

35 | 36 | 37 |

38 |
39 |
40 | 41 | 42 | -------------------------------------------------------------------------------- /crates/js/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "dom-smoothie-js" 3 | version.workspace = true 4 | edition.workspace = true 5 | license.workspace = true 6 | rust-version.workspace = true 7 | authors.workspace = true 8 | description = "A wrapper around the `dom_smoothie` crate for extracting relevant content from web pages" 9 | repository = "https://github.com/niklak/dom_smoothie" 10 | publish = false 11 | 12 | keywords = ["html", "readability"] 13 | 14 | 15 | [lib] 16 | crate-type = ["cdylib", "rlib"] 17 | 18 | 19 | [dependencies] 20 | wasm-bindgen = "0.2.84" 21 | dom_smoothie = { path = "../..", features = ["serde"] } 22 | 23 | serde-wasm-bindgen = "0.6.5" 24 | lol_alloc = {version = "0.4.1", optional = true} 25 | cfg-if = "1.0.4" 26 | # The `console_error_panic_hook` crate provides better debugging of panics by 27 | # logging them with `console.error`. This is great for development, but requires 28 | # all the `std::fmt` and `std::panicking` infrastructure, so isn't great for 29 | # code size when deploying. 30 | console_error_panic_hook = { version = "0.1.7", optional = true } 31 | 32 | [dev-dependencies] 33 | wasm-bindgen-test = "0.3.34" 34 | 35 | 36 | # `wasm-opt` is on by default for the release profile, but it can be 37 | # disabled by setting it to `false` 38 | [package.metadata.wasm-pack.profile.release] 39 | wasm-opt = ['-Oz'] 40 | 41 | 42 | [features] 43 | default = ["console_error_panic_hook", "lol_alloc"] 44 | lol_alloc = ["dep:lol_alloc"] 45 | console_error_panic_hook = ["dep:console_error_panic_hook"] -------------------------------------------------------------------------------- /test-pages/readability/style-tags-removal/source.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Style tags removal 6 | 7 | 8 | 9 |
10 |

Lorem

11 | 14 |
15 | Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod 16 | tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, 17 | quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo 18 | consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse 19 | cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non 20 | proident, sunt in culpa qui officia deserunt mollit anim id est laborum. 21 |
22 | 27 |

Foo

28 |
29 | Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, 30 | quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo 31 | consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse 32 | cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non 33 | proident, sunt in culpa qui officia deserunt mollit anim id est laborum. 34 |
35 |
36 | 41 | 42 | 43 | -------------------------------------------------------------------------------- /test-pages/readability/basic-tags-cleaning/source.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Basic tag cleaning test 6 | 7 | 8 |
9 |

Lorem

10 |
11 |

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod 12 | tempor incididunt ut labore et dolore magna aliqua.

13 |

Ut enim ad minim veniam, 14 | quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo 15 | consequat.

16 | 17 |

Duis aute irure dolor in reprehenderit in voluptate velit esse 18 | cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non 19 | proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

20 |
21 |

Foo

22 |
23 |

Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, 24 | quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo 25 | consequat.

26 | 27 | 28 | 29 | 30 |

Duis aute irure dolor in reprehenderit in voluptate velit esse 31 | cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non 32 | proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

33 |
34 |
35 | 36 | 37 | -------------------------------------------------------------------------------- /test-pages/readability/yahoo-4/expected.html: -------------------------------------------------------------------------------- 1 |
2 |
3 |

トレンドマイクロは3月9日、Wi-Fi利用時の通信を暗号化し保護するスマホ・タブレット向けのセキュリティアプリ「フリーWi-Fiプロテクション」(iOS/Android)の発売を開始すると発表した。1年版ライセンスは2900円(税込)で、2年版ライセンスは5000円(税込)。

4 |

 フリーWi-Fiプロテクションは、App Storeおよび、Google Playにて販売され、既に提供しているスマホ・タブレット向け総合セキュリティ対策アプリ「ウイルスバスター モバイル」と併用することで、不正アプリや危険なウェブサイトからの保護に加え、通信の盗み見を防ぐことができる。

5 |

 2020年の東京オリンピック・パラリンピックの開催などを見据え、フリーWi-Fi(公衆無線LAN)の設置が促進され、フリーWi-Fiの利用者も増加している。

6 |

 一方で、脆弱な設定のフリーWi-Fiや攻撃者が設置した偽のフリーWi-Fiへの接続などによる情報漏えい、通信の盗み見などのセキュリティリスクが危惧されているという。

7 |

 正規事業者が提供する安全性の高いフリーWi-Fiのほかにも、通信を暗号化していない安全性の低いフリーWi-Fi、さらにはサイバー犯罪者が設置したフリーWi-Fiなどさまざまなものが混在している。また、利用者は、接続する前にひとつひとつ安全性を確認するのは難しい状況だとしている。

8 |

 トレンドマイクロがスマートフォン保持者でフリーWi-Fiの利用経験がある人に実施した調査では、回答者の約85%が安全なフリーWi-Fiと危険なフリーWi-Fiは「見分けられない」と回答。さらに、約65%がフリーWi-Fiの利用に不安を感じていると回答している。

9 |

 こうした環境の変化やユーザの状況を鑑み、フリーWi-Fiプロテクションの提供を開始する。同アプリをインストールすることで利用者は、万が一安全性の低いフリーWi-Fiのアクセスポイントに接続してしまった場合でも、その通信を暗号化でき、通信の盗み見やそれによる情報漏えいのリスクを低減できるようになる。

10 |

 具体的には、フリーWi-Fi利用時に、スマートフォンがフリーWi-Fiプロテクションインフラに接続することにより、フリーWi-Fiのアクセスポイントを介した通信がVPN(Virtual Private Network)で暗号化される。これにより利用者は、第三者から通信を傍受されることやデータの情報漏えいを防ぐことが可能。さらに、かんたん自動接続の機能により、通信を暗号化していない安全性が低いフリーWi-Fi接続時や利用者が指定したWi-Fiへ接続する際に、自動的に通信を暗号化し、利用者の通信を保護する。

11 |

 また、フリーWi-Fiプロテクションインフラと、莫大なセキュリティ情報のビッグデータを保有するクラウド型セキュリティ技術基盤「Trend Micro Smart Protection Network」(SPN)が連携することで、フリーWi-Fiプロテクションインフラを経由してインターネットを利用する際に、利用者がフィッシング詐欺サイトや偽サイトなどへの不正サイトへアクセスすることをブロックできるという。

12 |
13 |
-------------------------------------------------------------------------------- /test-pages/readability/visibility-hidden/source.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Visibility hidden test 6 | 7 | 8 |
9 |

Lorem

10 |
11 |

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod 12 | tempor incididunt ut labore et dolore magna aliqua.

13 |

Ut enim ad minim veniam, 14 | quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo 15 | consequat.

16 | 17 |

Duis aute irure dolor in reprehenderit in voluptate velit esse 18 | cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non 19 | proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

20 |
21 |

Foo

22 |
23 |

Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, 24 | quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo 25 | consequat.

26 | 27 | 28 | 29 | 30 |

Duis aute irure dolor in reprehenderit in voluptate velit esse 31 | cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non 32 | proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

33 |
34 |
35 | 36 | 37 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | 2 | [workspace.package] 3 | version = "0.14.0" 4 | edition = "2021" 5 | license = "MIT" 6 | rust-version = "1.75" 7 | repository = "https://github.com/niklak/dom_smoothie" 8 | authors = ["niklak "] 9 | 10 | 11 | [package] 12 | name = "dom_smoothie" 13 | version.workspace = true 14 | license.workspace = true 15 | repository.workspace = true 16 | edition.workspace = true 17 | authors.workspace = true 18 | rust-version.workspace = true 19 | description = "A Rust crate for extracting relevant content from web pages" 20 | documentation = "https://docs.rs/dom_smoothie/latest" 21 | keywords = ["html", "readability"] 22 | readme = "README.md" 23 | exclude = [".*", "test-pages", "deny.toml"] 24 | 25 | [dependencies] 26 | dom_query = {version = "0.24.0", features = ["mini_selector", "markdown"]} 27 | tendril = {version = "0.4.3"} 28 | once_cell = { version = "1" } 29 | serde = {version = "1.0", features = ["derive"], optional = true} 30 | gjson = {version = "0.8.1"} 31 | html-escape = "0.2.13" 32 | flagset = "0.4.7" 33 | unicode-segmentation = "1.12.0" 34 | thiserror = "2.0" 35 | phf = { version = "0.13.1", features = ["macros"] } 36 | foldhash = "0.2.0" 37 | aho-corasick = { version = "1.1.4", optional = true} 38 | 39 | [dev-dependencies] 40 | serde_json = {version = "1.0"} 41 | serde = {version = "1.0", features = ["derive"]} 42 | 43 | 44 | [features] 45 | serde = ["dep:serde"] 46 | aho-corasick = ["dep:aho-corasick"] 47 | 48 | [workspace] 49 | members = [ 50 | "crates/cli", 51 | "crates/js", 52 | "crates/bench", 53 | "crates/lua", 54 | ] 55 | 56 | [profile.bench] 57 | codegen-units = 1 58 | lto = "fat" 59 | incremental = false 60 | opt-level = 3 61 | -------------------------------------------------------------------------------- 
/test-pages/readability/metadata-content-missing/source.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Title Element 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 |
14 |

Test document title

15 |

16 | Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod 17 | tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, 18 | quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo 19 | consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse 20 | cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non 21 | proident, sunt in culpa qui officia deserunt mollit anim id est laborum. 22 |

23 |

24 | Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod 25 | tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, 26 | quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo 27 | consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse 28 | cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non 29 | proident, sunt in culpa qui officia deserunt mollit anim id est laborum. 30 |

31 |
32 | 33 | 34 | -------------------------------------------------------------------------------- /test-pages/readability/remove-script-tags/source.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Remove script tags test 6 | 7 | 8 | 9 |
10 |

Lorem

11 |
12 |

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod 13 | tempor incididunt ut labore et dolore magna aliqua.

14 |

Ut enim ad minim veniam, 15 | quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo 16 | consequat.

17 | 20 |

Duis aute irure dolor in reprehenderit in voluptate velit esse 21 | cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non 22 | proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

23 |
24 | 27 |

Foo

28 |
29 |

Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, 30 | quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo 31 | consequat.

32 | 33 |

Duis aute irure dolor in reprehenderit in voluptate velit esse 34 | cillum dolore eu fugiat nulla pariatur. 35 | 38 | Excepteur sint occaecat cupidatat non 39 | proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

40 |
41 |
42 | 43 | 44 | -------------------------------------------------------------------------------- /tests/bad.rs: -------------------------------------------------------------------------------- 1 | use dom_smoothie::Readability; 2 | 3 | #[test] 4 | fn test_skip_body_ancestor() { 5 | let contents = r#" 6 | 7 | 8 | 9 | 10 | 11 | Some Title 12 | 13 | 14 | 15 |

Sign Up for Live Updates!

16 | 17 | 18 | "#; 19 | 20 | let mut ra = Readability::new(contents, None, None).unwrap(); 21 | let res = ra.parse().unwrap(); 22 | let expected: String = r#""# 25 | .split_whitespace() 26 | .collect(); 27 | let got: String = res.content.split_whitespace().collect(); 28 | assert_eq!(got, expected); 29 | } 30 | 31 | #[test] 32 | fn test_skip_body_ancestor_fragment() { 33 | let contents = r#" 34 | 37 | "#; 38 | 39 | let mut ra = Readability::new(contents, None, None).unwrap(); 40 | let res = ra.parse().unwrap(); 41 | let expected: String = r#""# 44 | .split_whitespace() 45 | .collect(); 46 | let got: String = res.content.split_whitespace().collect(); 47 | assert_eq!(got, expected); 48 | } 49 | -------------------------------------------------------------------------------- /test-pages/readability/reordering-paragraphs/expected.html: -------------------------------------------------------------------------------- 1 |
2 |

Regarding item# 11111, under sufficiently extreme conditions, quarks may become deconfined and exist as free particles. In the course of asymptotic freedom, the strong interaction becomes weaker at higher temperatures. Eventually, color confinement would be lost and an extremely hot plasma of freely moving quarks and gluons would be formed. This theoretical phase of matter is called quark-gluon plasma.[81] The exact conditions needed to give rise to this state are unknown and have been the subject of a great deal of speculation and experimentation.

3 |

Regarding item# 22222, under sufficiently extreme conditions, quarks may become deconfined and exist as free particles. In the course of asymptotic freedom, the strong interaction becomes weaker at higher temperatures. Eventually, color confinement would be lost and an extremely hot plasma of freely moving quarks and gluons would be formed. This theoretical phase of matter is called quark-gluon plasma.[81] The exact conditions needed to give rise to this state are unknown and have been the subject of a great deal of speculation and experimentation.

4 |

Regarding item# 33333, under sufficiently extreme conditions, quarks may become deconfined and exist as free particles. In the course of asymptotic freedom, the strong interaction becomes weaker at higher temperatures. Eventually, color confinement would be lost and an extremely hot plasma of freely moving quarks and gluons would be formed. This theoretical phase of matter is called quark-gluon plasma.[81] The exact conditions needed to give rise to this state are unknown and have been the subject of a great deal of speculation and experimentation.

5 |
6 |
-------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Build and Release Binaries 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | workflow_dispatch: 8 | 9 | permissions: 10 | contents: write 11 | 12 | jobs: 13 | build-and-upload: 14 | name: Build and upload 15 | 16 | strategy: 17 | matrix: 18 | include: 19 | - os: ubuntu-latest 20 | target: x86_64-unknown-linux-gnu 21 | src_file: dom_smoothie_cli 22 | dst_file: dom_smoothie_cli-x86_64-unknown-linux-gnu-${{ github.ref_name }} 23 | 24 | - os: macos-latest 25 | target: x86_64-apple-darwin 26 | src_file: dom_smoothie_cli 27 | dst_file: dom_smoothie_cli-x86_64-apple-darwin-${{ github.ref_name }} 28 | 29 | - os: windows-latest 30 | target: x86_64-pc-windows-gnu 31 | src_file: dom_smoothie_cli.exe 32 | dst_file: dom_smoothie_cli-x86_64-pc-windows-gnu-${{ github.ref_name }} 33 | 34 | runs-on: ${{ matrix.os }} 35 | 36 | steps: 37 | - uses: actions/checkout@v4 38 | 39 | - name: Install Rust 40 | uses: dtolnay/rust-toolchain@stable 41 | with: 42 | targets: ${{ matrix.target }} 43 | 44 | - name: Build Binary 45 | run: cargo build --release --target ${{ matrix.target }} 46 | working-directory: crates/cli 47 | 48 | - name: Install zip (Windows) 49 | if: matrix.os == 'windows-latest' 50 | run: choco install zip -y 51 | 52 | - name: Prepare Archive 53 | run: | 54 | zip -j ${{ matrix.dst_file }}.zip target/${{ matrix.target }}/release/${{ matrix.src_file }} 55 | 56 | - name: Release 57 | uses: softprops/action-gh-release@v1 58 | with: 59 | files: | 60 | ${{ matrix.dst_file }}.zip 61 | -------------------------------------------------------------------------------- /tests/parse_policy.rs: -------------------------------------------------------------------------------- 1 | use dom_smoothie::{ParsePolicy, Readability}; 2 | 3 | use std::collections::hash_map::DefaultHasher; 4 | use 
std::error::Error; 5 | use std::hash::{Hash, Hasher}; 6 | 7 | fn hash_text<T: Hash>(text: &T) -> u64 { 8 | let mut hasher = DefaultHasher::new(); 9 | text.hash(&mut hasher); 10 | hasher.finish() 11 | } 12 | 13 | #[test] 14 | pub(crate) fn test_parse_with_policy() -> Result<(), Box<dyn Error>> { 15 | // this is a case when each policy produces a different result 16 | let source_contents = include_str!("../test-pages/ok/wikipedia-2/source.html"); 17 | 18 | let policies: [ParsePolicy; 4] = [ 19 | ParsePolicy::Strict, 20 | ParsePolicy::Moderate, 21 | ParsePolicy::Clean, 22 | ParsePolicy::Raw, 23 | ]; 24 | let mut results = vec![]; 25 | 26 | for policy in policies { 27 | let mut r = Readability::new(source_contents, None, None)?; 28 | let article = r.parse_with_policy(policy)?; 29 | let content_hash = hash_text(&article.content.trim()); 30 | if !results.contains(&content_hash) { 31 | results.push(content_hash); 32 | } 33 | } 34 | assert_eq!(results.len(), policies.len()); 35 | Ok(()) 36 | } 37 | 38 | #[test] 39 | pub(crate) fn test_parse_with_policy_fail() -> Result<(), Box<dyn Error>> { 40 | // Test that problematic HTML fails with Strict policy 41 | let source_contents = include_str!("../test-pages/readability/lazy-image-3/source.html"); 42 | let mut r = Readability::new(source_contents, None, None)?; 43 | let article = r.parse_with_policy(ParsePolicy::Strict); 44 | assert!(article.is_err()); 45 | 46 | let mut r = Readability::new(source_contents, None, None)?; 47 | let article = r.parse_with_policy(ParsePolicy::Raw); 48 | assert!(article.is_ok()); 49 | Ok(()) 50 | } 51 | -------------------------------------------------------------------------------- /test-pages/readability/parsely-metadata/source.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 15 | 16 | 17 |
18 |

Test document title

19 |

20 | Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. 21 |

22 |

23 | Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. 24 |

25 |
26 | 27 | 28 | -------------------------------------------------------------------------------- /test-pages/ok/tmz-1/expected.html: -------------------------------------------------------------------------------- 1 |
2 |
3 |

4 |

Lupita Nyong'o

5 |

$150K Pearl Oscar Dress ... STOLEN!!!!

6 |

7 |
2/26/2015 7:11 AM PST BY TMZ STAFF
8 |
9 |

EXCLUSIVE 10 |

11 |

12 | 0225-lupita-nyongo-getty-01Lupita Nyong'o's now-famous Oscar dress -- adorned in pearls -- was stolen right out of her hotel room ... TMZ has learned. 13 |

14 |

Law enforcement sources tell TMZ ... the dress was taken out of Lupita's room at The London West Hollywood. The dress is made of pearls ... 6,000 white Akoya pearls. It's valued at $150,000.

15 |

Our sources say Lupita told cops it was taken from her room sometime between 8 AM and 9 PM Wednesday ... while she was gone.  

16 |

We're told there is security footage that cops are looking at that could catch the culprit right in the act. 

17 |

18 | update_graphic_red_bar12:00 PM PT -- Sheriff's deputies were at The London Thursday morning.  We know they were in the manager's office and we're told they have looked at security footage to determine if they can ID the culprit. 19 |

20 |

21 | 0226-SUB-london-hotel-swipe-tmz-02 22 |

23 |
24 |
25 |
-------------------------------------------------------------------------------- /test-pages/ok/004-metadata-space-separated-properties/source.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Title Element 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 |
16 |

Test document title

17 |

18 | Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod 19 | tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, 20 | quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo 21 | consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse 22 | cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non 23 | proident, sunt in culpa qui officia deserunt mollit anim id est laborum. 24 |

25 |

26 | Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod 27 | tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, 28 | quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo 29 | consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse 30 | cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non 31 | proident, sunt in culpa qui officia deserunt mollit anim id est laborum. 32 |

33 |
34 | 35 | -------------------------------------------------------------------------------- /test-pages/readability/embedded-videos/expected.html: -------------------------------------------------------------------------------- 1 |
2 |
3 |

Lorem

4 |

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

5 |

At root

6 | 7 | 8 | 9 |

In a paragraph

10 |

11 |

In a div

12 |

13 |

Foo

14 |

Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

15 |
16 |
-------------------------------------------------------------------------------- /test-pages/ok/base-url/expected.html: -------------------------------------------------------------------------------- 1 |
2 |
3 |

Lorem

4 |

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

5 |

Links

6 |

link

7 |

link

8 |

link

9 |

link

10 |

link

11 |

link

12 |

link

13 |

link

14 |

Images

15 |

16 |

17 |

18 |

19 |

20 |

Foo

21 |

Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

22 |
23 |
-------------------------------------------------------------------------------- /test-pages/readability/base-url-base-element/expected.html: -------------------------------------------------------------------------------- 1 |
2 |
3 |

Lorem

4 |

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

5 |

Links

6 |

link

7 |

link

8 |

link

9 |

link

10 |

link

11 |

link

12 |

link

13 |

link

14 |

Images

15 |

16 |

17 |

18 |

19 |

20 |

Foo

21 |

Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

22 |
23 |
-------------------------------------------------------------------------------- /test-pages/ok/base-url/source.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Base URL test 6 | 7 | 8 |
9 |

Lorem

10 |
11 | Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod 12 | tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, 13 | quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo 14 | consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse 15 | cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non 16 | proident, sunt in culpa qui officia deserunt mollit anim id est laborum. 17 |
18 |

Links

19 |

link

20 |

link

21 |

link

22 |

link

23 |

link

24 |

link

25 |

link

26 |

link

27 |

Images

28 |

29 |

30 |

31 |

32 |

33 |

Foo

34 |
35 | Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, 36 | quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo 37 | consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse 38 | cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non 39 | proident, sunt in culpa qui officia deserunt mollit anim id est laborum. 40 |
41 |
42 | 43 | 44 | -------------------------------------------------------------------------------- /crates/bench/benches/parse.rs: -------------------------------------------------------------------------------- 1 | use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion}; 2 | use std::hint::black_box; 3 | 4 | use dom_smoothie::{Article, Config, Readability, ReadabilityError}; 5 | 6 | fn dom_smoothie_parse(contents: &str, cfg: &Config) -> Result<Article, ReadabilityError> { 7 | let mut readability = Readability::new(contents, None, Some(cfg.clone()))?; 8 | readability.parse() 9 | } 10 | 11 | fn bench_dom_smoothie_parse(c: &mut Criterion) { 12 | let mut group = c.benchmark_group("dom_smoothie"); 13 | 14 | let small = include_str!("../test-pages/ok/ehow-1/source.html"); 15 | let medium = include_str!("../test-pages/ok/engadget/source.html"); 16 | let large = include_str!("../test-pages/ok/wikipedia-2/source.html"); 17 | 18 | // Test different sizes/types of content 19 | let test_cases = vec![ 20 | ("small", small, 5.0f32), 21 | ("medium", medium, 5.0f32), 22 | ("large", large, 5.0f32), 23 | ("small, min score to adjust 10", small, 10.0f32), 24 | ("medium, min score to adjust 10", medium, 10.0f32), 25 | ("large, min score to adjust 10", large, 10.0f32), 26 | ]; 27 | 28 | for (name, contents, min_score_to_adjust) in test_cases { 29 | let cfg = Config { 30 | min_score_to_adjust, 31 | ..Default::default() 32 | }; 33 | group.bench_with_input(BenchmarkId::new("parse", name), contents, |b, contents| { 34 | b.iter(|| { 35 | let res = dom_smoothie_parse(black_box(contents), black_box(&cfg)) 36 | .expect("Parsing failed"); 37 | black_box(res) 38 | }) 39 | }); 40 | } 41 | group.finish(); 42 | } 43 | 44 | fn configure_criterion() -> Criterion { 45 | Criterion::default() 46 | } 47 | 48 | criterion_group!
{ name = benches; config = configure_criterion(); targets = bench_dom_smoothie_parse } 49 | criterion_main!(benches); 50 | -------------------------------------------------------------------------------- /test-pages/ok/base-url-base-element-relative/expected.html: -------------------------------------------------------------------------------- 1 |
2 |
3 |

Lorem

4 |

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

5 |

Links

6 |

link

7 |

link

8 |

link

9 |

link

10 |

link

11 |

link

12 |

link

13 |

link

14 |

Images

15 |

16 |

17 |

18 |

19 |

20 |

Foo

21 |

Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

22 |
23 |
-------------------------------------------------------------------------------- /test-pages/readability/base-url-base-element/source.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Base URL with base test 7 | 8 | 9 |
10 |

Lorem

11 |
12 | Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod 13 | tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, 14 | quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo 15 | consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse 16 | cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non 17 | proident, sunt in culpa qui officia deserunt mollit anim id est laborum. 18 |
19 |

Links

20 |

link

21 |

link

22 |

link

23 |

link

24 |

link

25 |

link

26 |

link

27 |

link

28 |

Images

29 |

30 |

31 |

32 |

33 |

34 |

Foo

35 |
36 | Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, 37 | quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo 38 | consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse 39 | cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non 40 | proident, sunt in culpa qui officia deserunt mollit anim id est laborum. 41 |
42 |
43 | 44 | 45 | -------------------------------------------------------------------------------- /test-pages/ok/base-url-base-element-relative/source.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Base URL with base relative test 7 | 8 | 9 |
10 |

Lorem

11 |
12 | Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod 13 | tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, 14 | quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo 15 | consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse 16 | cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non 17 | proident, sunt in culpa qui officia deserunt mollit anim id est laborum. 18 |
19 |

Links

20 |

link

21 |

link

22 |

link

23 |

link

24 |

link

25 |

link

26 |

link

27 |

link

28 |

Images

29 |

30 |

31 |

32 |

33 |

34 |

Foo

35 |
36 | Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, 37 | quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo 38 | consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse 39 | cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non 40 | proident, sunt in culpa qui officia deserunt mollit anim id est laborum. 41 |
42 |
43 | 44 | 45 | -------------------------------------------------------------------------------- /.github/workflows/benchmark.yml: -------------------------------------------------------------------------------- 1 | name: Benchmark 2 | 3 | on: 4 | push: 5 | branches: [main] 6 | 7 | permissions: 8 | # deployments permission to deploy GitHub pages website 9 | deployments: write 10 | # contents permission to update benchmark contents in gh-pages branch 11 | contents: write 12 | 13 | jobs: 14 | benchmark: 15 | name: Performance regression check 16 | runs-on: ubuntu-24.04 17 | steps: 18 | - uses: actions/checkout@v4 19 | 20 | - name: Install Rust 21 | uses: dtolnay/rust-toolchain@master 22 | with: 23 | toolchain: 1.89.0 24 | 25 | # Cache dependencies to speed up build 26 | - uses: Swatinem/rust-cache@v2 27 | 28 | # Run benchmark 29 | - name: Run benchmark 30 | run: cargo bench -p dom-smoothie-bench --bench parse -- --output-format bencher | tee benchmark-results.txt 31 | 32 | # Store benchmark results 33 | - name: Store benchmark result 34 | uses: benchmark-action/github-action-benchmark@v1 35 | with: 36 | name: Rust Benchmark 37 | tool: "cargo" 38 | output-file-path: benchmark-results.txt 39 | # Save the results as GitHub Pages 40 | github-token: ${{ secrets.GITHUB_TOKEN }} 41 | auto-push: true 42 | # Show alert with commit comment on detecting possible performance regression 43 | alert-threshold: "150%" 44 | comment-on-alert: true 45 | # Optional: Alert only when changes are made to specific files 46 | alert-comment-cc-users: "@niklak" 47 | # Optional: Enable failure when performance regresses 48 | # fail-on-alert: true 49 | 50 | # Configure GitHub Pages 51 | gh-pages-branch: gh-pages 52 | benchmark-data-dir-path: docs/dev/bench/ 53 | 54 | # Optional: Upload the results as artifacts 55 | - name: Upload benchmark results 56 | uses: actions/upload-artifact@v4 57 | with: 58 | name: benchmark-results 59 | path: benchmark-results.txt 60 | 
-------------------------------------------------------------------------------- /src/readable.rs: -------------------------------------------------------------------------------- 1 | use dom_query::Document; 2 | 3 | use tendril::format_tendril; 4 | 5 | #[allow(clippy::wildcard_imports)] 6 | use crate::glob::*; 7 | use crate::helpers::is_probably_visible; 8 | 9 | /// Estimates whether the document is readable in a *quick-and-dirty* way. 10 | /// 11 | /// 12 | /// # Arguments 13 | /// 14 | /// * `doc` - The reference to the [`dom_query::Document`] to check. 15 | /// * `min_score` - The minimum score required for the document to be considered readable. Defaults to 20.0. 16 | /// * `min_content_length` - The minimum content length required for the document to be considered readable. Defaults to 140. 17 | /// 18 | /// # Returns 19 | /// 20 | /// True if the document is readable, false otherwise. 21 | pub fn is_probably_readable( 22 | doc: &Document, 23 | min_score: Option, 24 | min_content_length: Option, 25 | ) -> bool { 26 | let min_score = min_score.unwrap_or(MIN_SCORE); 27 | let min_content_length = min_content_length.unwrap_or(MIN_CONTENT_LENGTH); 28 | 29 | let mut nodes = doc.select("p,pre,article").nodes().to_vec(); 30 | 31 | let br_parent_sel = doc.select("div > br").parent(); 32 | let br_parent_nodes = br_parent_sel.nodes(); 33 | nodes.extend_from_slice(br_parent_nodes); 34 | 35 | let mut score: f32 = 0.0; 36 | 37 | nodes.iter().any(|node| { 38 | if !is_probably_visible(node) { 39 | return false; 40 | } 41 | let match_string = 42 | format_tendril!("{} {}", node.attr_or("class", ""), node.attr_or("id", "")); 43 | 44 | if UNLIKELY_CANDIDATES.iter().any(|p| match_string.contains(p)) 45 | && !MAYBE_CANDIDATES.iter().any(|p| match_string.contains(p)) 46 | { 47 | return false; 48 | } 49 | 50 | if MATCHER_LI_P.match_element(node) { 51 | return false; 52 | } 53 | 54 | let text_content_length = node.text().trim().chars().count(); 55 | if text_content_length < 
min_content_length { 56 | return false; 57 | } 58 | 59 | score += ((text_content_length - min_content_length) as f32).sqrt(); 60 | if score > min_score { 61 | return true; 62 | } 63 | false 64 | }) 65 | } 66 | -------------------------------------------------------------------------------- /test-pages/readability/embedded-videos/source.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Embedded videos test 6 | 7 | 8 |
9 |

Lorem

10 |
11 |

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod 12 | tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, 13 | quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo 14 | consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse 15 | cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non 16 | proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

17 |
18 |

Videos

19 |

At root

20 | 22 | 24 | 27 |

In a paragraph

28 |

30 |

In a div

31 |
33 |

Foo

34 |
35 | Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, 36 | quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo 37 | consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse 38 | cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non 39 | proident, sunt in culpa qui officia deserunt mollit anim id est laborum. 40 |
41 |
42 | 43 | 44 | -------------------------------------------------------------------------------- /test-pages/readability/reordering-paragraphs/source.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 |
7 |

Regarding item# 11111, under sufficiently extreme conditions, quarks may 8 | become deconfined and exist as free particles. In the course of asymptotic 9 | freedom, the strong interaction becomes weaker at higher temperatures. 10 | Eventually, color confinement would be lost and an extremely hot plasma 11 | of freely moving quarks and gluons would be formed. This theoretical phase 12 | of matter is called quark-gluon plasma.[81] The exact conditions needed 13 | to give rise to this state are unknown and have been the subject of a great 14 | deal of speculation and experimentation.

15 |

Regarding item# 22222, under sufficiently extreme conditions, quarks may 16 | become deconfined and exist as free particles. In the course of asymptotic 17 | freedom, the strong interaction becomes weaker at higher temperatures. 18 | Eventually, color confinement would be lost and an extremely hot plasma 19 | of freely moving quarks and gluons would be formed. This theoretical phase 20 | of matter is called quark-gluon plasma.[81] The exact conditions needed 21 | to give rise to this state are unknown and have been the subject of a great 22 | deal of speculation and experimentation.

23 |

Regarding item# 33333, under sufficiently extreme conditions, quarks may 24 | become deconfined and exist as free particles. In the course of asymptotic 25 | freedom, the strong interaction becomes weaker at higher temperatures. 26 | Eventually, color confinement would be lost and an extremely hot plasma 27 | of freely moving quarks and gluons would be formed. This theoretical phase 28 | of matter is called quark-gluon plasma.[81] The exact conditions needed 29 | to give rise to this state are unknown and have been the subject of a great 30 | deal of speculation and experimentation.

31 |
32 | 33 | 34 | -------------------------------------------------------------------------------- /test-pages/readability/arxiv/expected-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "360Brew: A Decoder-only Foundation Model for Personalized Ranking and Recommendation", 3 | "byline": "[Submitted on 27 Jan 2025]", 4 | "dir": null, 5 | "lang": "en", 6 | "excerpt": "Ranking and recommendation systems are the foundation for numerous online experiences, ranging from search results to personalized content delivery. These systems have evolved into complex, multilayered architectures that leverage vast datasets and often incorporate thousands of predictive models. The maintenance and enhancement of these models is a labor intensive process that requires extensive feature engineering. This approach not only exacerbates technical debt but also hampers innovation in extending these systems to emerging problem domains. In this report, we present our research to address these challenges by utilizing a large foundation model with a textual interface for ranking and recommendation tasks. We illustrate several key advantages of our approach: (1) a single model can manage multiple predictive tasks involved in ranking and recommendation, (2) decoder models with textual interface due to their comprehension of reasoning capabilities, can generalize to new recommendation surfaces and out-of-domain problems, and (3) by employing natural language interfaces for task definitions and verbalizing member behaviors and their social connections, we eliminate the need for feature engineering and the maintenance of complex directed acyclic graphs of model dependencies. We introduce our research pre-production model, 360Brew V1.0, a 150B parameter, decoder-only model that has been trained and fine-tuned on LinkedIn's data and tasks. 
This model is capable of solving over 30 predictive tasks across various segments of the LinkedIn platform, achieving performance levels comparable to or exceeding those of current production systems based on offline metrics, without task-specific fine-tuning. Notably, each of these tasks is conventionally addressed by dedicated models that have been developed and maintained over multiple years by teams of a similar or larger size than our own.", 7 | "siteName": "arXiv.org", 8 | "publishedTime": null, 9 | "readerable": false, 10 | "image": "/static/browse/0.3.4/images/arxiv-logo-fb.png" 11 | } 12 | -------------------------------------------------------------------------------- /test-pages/readability/simplyfound-1/expected.html: -------------------------------------------------------------------------------- 1 |
2 |
3 |

The Raspberry Pi Foundation started by a handful of volunteers in 2012 when they released the original Raspberry Pi 256MB Model B without knowing what to expect.  In a short four-year period they have grown to over sixty full-time employees and have shipped over eight million units to-date.  Raspberry Pi has achieved new heights by being shipped to the International Space Station for research and by being an affordable computing platforms used by teachers throughout the world.  "It has become the all-time best-selling computer in the UK".

4 |

Raspberry Pi 3 - A credit card sized PC that only costs $35 - Image: Raspberry Pi Foundation

5 |

Raspberry Pi Foundation is charity organization that pushes for a digital revolution with a mission to inspire kids to learn by creating computer-powered objects.  The foundation also helps teachers learn computing  skills through free training and readily available tutorials & example code for creating cool things such as music.

6 |

Raspberry Pi in educations - Image: Raspberry Pi Foundation

7 |

In celebration of their 4th year anniversary, the foundation has released Raspberry Pi 3 with the same price tag of $35 USD.  The 3rd revision features a 1.2GHz 64-bit quad-core ARM CPU with integrated Bluetooth 4.1 and 802.11n wireless LAN chipsets.  The ARM Cortex-A53 CPU along with other architectural enhancements making it the fastest Raspberry Pi to-date.  The 3rd revision is reportedly about 50-60% times faster than its predecessor Raspberry Pi 2 and about 10 times faster then the original Raspberry PI.

8 |

Raspberry Pi - Various Usage

9 |

Raspberry Pi 3 is now available via many online resellers.  At this time, you should use a recent 32-bit NOOBS or Raspbian image from their downloads page with a promise of a switch to a 64-bit version only if further investigation proves that there is indeed some value in moving to 64-bit mode.

10 |
11 |
-------------------------------------------------------------------------------- /test-pages/ok/social-buttons/expected.html: -------------------------------------------------------------------------------- 1 |
2 |
3 |

Lorem ipsum dolor

4 |

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

5 |

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

6 |

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

7 |

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

8 |

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

9 |
10 |
-------------------------------------------------------------------------------- /test-pages/ok/003-metadata-preferred/source.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Title Element 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 |
26 |

Test document title

27 |

28 | Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod 29 | tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, 30 | quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo 31 | consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse 32 | cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non 33 | proident, sunt in culpa qui officia deserunt mollit anim id est laborum. 34 |

35 |

36 | Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod 37 | tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, 38 | quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo 39 | consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse 40 | cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non 41 | proident, sunt in culpa qui officia deserunt mollit anim id est laborum. 42 |

43 |
44 | 45 | -------------------------------------------------------------------------------- /test-pages/ok/qq/expected.html: -------------------------------------------------------------------------------- 1 |
2 |
3 |
4 |

TNW中文站2016年10月14日07:17

5 |
6 |
7 |
8 |

转播到腾讯微博

9 |

DeepMind新电脑已可利用记忆自学 人工智能迈上新台阶

10 |
11 |

TNW中文站 10月14日报道

12 |

13 | 谷歌(微博) 在2014年收购的人工智能公司DeepMind开发出一款能够用自己的记忆学习新知识并利用这些知识来回答问题的计算机。 14 |

15 |

这款产品具有极其重要的意义,因为这意味着未来的人工智能技术可能不需要人类来教它就能回答人类提出的问题。

16 |

DeepMind表示,这款名为DNC(可微神经计算机)的AI模型可以接受家谱和伦敦地铁网络地图这样的信息,还可以回答与那些数据结构中的不同项目之间的关系有关的复杂问题。

17 |

例如,它可以回答“从邦德街开始,沿着中央线坐一站,环线坐四站,然后转朱比利线坐两站,你会到达哪个站?”这样的问题。

18 |

DeepMind称,DNC还可以帮你规划从沼泽门到皮卡迪利广场的最佳路线。

19 |

同样,它还可以理解和回答某个大家族中的成员之间的关系这样的复杂问题,比如“张三的大舅是谁?”。

20 |

DNC建立在神经网络的概念之上,神经网络可以模拟人类思想活动的方式。这种AI技术很适合与机器习得配套使用。

21 |

DeepMind的AlphaGo AI能够打败围棋冠军也跟这些神经网络有很大关系。但是AlphaGo必须进行训练才行,开发人员向AlphaGo提供了历史对弈中的大约3000万记录。让人工智能技术具备通过记忆学习的能力,就可以让它独自完成更复杂的任务。

22 |

DeepMind希望DNC可以推动计算行业实现更多突破。DeepMind已将其研究结果发表在科学刊物《自然》(Nature)上。(编译/林靖东)

23 |

精彩视频推荐 24 |

25 |
26 |

转播到腾讯微博

27 |

28 |
29 |

【美国The Next Web作品的中文相关权益归腾讯公司独家所有。未经授权,不得转载、摘编等。】

30 |
31 |
32 |

[责任编辑:alonliu]

33 |
34 |

您认为这篇文章与"新一网(08008.HK)"相关度高吗?

35 |
36 |
37 |
38 |
-------------------------------------------------------------------------------- /test-pages/readability/heise/expected.html: -------------------------------------------------------------------------------- 1 |
2 |
3 |
4 | 5 |
6 |

1Password scannt auch QR-Codes.

7 |

(Bild: Hersteller)

8 |
9 |
10 |

Das in der iOS-Version bereits enthaltene TOTP-Feature ist nun auch für OS X 10.10 verfügbar. Zudem gibt es neue Zusatzfelder in der Datenbank und weitere Verbesserungen.

11 |

AgileBits hat Version 5.3 seines bekannten Passwortmanagers 1Password für OS X freigegeben. Mit dem Update wird eine praktische Funktion nachgereicht, die die iOS-Version der Anwendung bereits seit längerem beherrscht: Das direkte Erstellen von Einmal-Passwörtern. Unterstützt wird dabei der TOTP-Standard (Time-Based One-Time Passwords), den unter anderem Firmen wie Evernote, Dropbox oder Google einsetzen, um ihre Zugänge besser abzusichern. Neben Account und regulärem Passwort wird dabei dann ein Zusatzcode verlangt, der nur kurze Zeit gilt.

12 |

Zur TOTP-Nutzung muss zunächst ein Startwert an 1Password übergeben werden. Das geht unter anderem per QR-Code, den die App über ein neues Scanfenster selbst einlesen kann – etwa aus dem Webbrowser. Eine Einführung in die Technik gibt ein kurzes Video. Die TOTP-Unterstützung in 1Password erlaubt es, auf ein zusätzliches Gerät (z.B. ein iPhone) neben dem Mac zu verzichten, das den Code liefert – was allerdings auch die Sicherheit verringert, weil es keinen "echten" zweiten Faktor mehr gibt.

13 |

Update 5.3 des Passwortmanagers liefert auch noch weitere Verbesserungen. So gibt es die Möglichkeit, FaceTime-Audio- oder Skype-Anrufe aus 1Password zu starten, die Zahl der Zusatzfelder in der Datenbank wurde erweitert und der Umgang mit unterschiedlichen Zeitzonen klappt besser. Die Engine zur Passworteingabe im Browser soll beschleunigt worden sein.

14 |

1Password kostet aktuell knapp 50 Euro im Mac App Store und setzt in seiner aktuellen Version mindestens OS X 10.10 voraus. (bsc) 15 |
16 |

17 |
18 |
--------------------------------------------------------------------------------