├── .github
└── workflows
│ └── evaluate.yml
├── .gitignore
├── LICENSE
├── README.rst
├── evaluate.py
├── extractors
├── go_domdistiller.py
├── go_domdistiller
│ ├── README.rst
│ ├── cli.go
│ ├── go.mod
│ └── go.sum
├── go_readability
│ ├── README.rst
│ ├── cli.go
│ ├── go.mod
│ └── go.sum
├── run_beautifulsoup.py
├── run_boilerpipe.py
├── run_dragnet.py
├── run_go_readability.py
├── run_goose3.py
├── run_html2text.py
├── run_html_text.py
├── run_inscriptis.py
├── run_justext.py
├── run_news_please.py
├── run_newspaper.py
├── run_readability.py
├── run_readability_js.py
├── run_trafilatura.py
└── run_xpath_text.py
├── ground-truth.json
├── html
├── 042bb7b5fedab6eac7db576522b89b93904c237d344bcbe14a6a5ab7f7335856.html.gz
├── 04a6711caa7c687592777718866e781e976e0fe684faebe8b3cedcef8cd0ea34.html.gz
├── 05844573ca7e1fba714d715bb11ca08c26e25328999c74a1cb3bc8a0e4399f0f.html.gz
├── 06e5123e4ef7cfb4533250dc45d1e03d0838fc66223f45c583c4d12f48b4da85.html.gz
├── 06ee193de4bd611f7fafbab0c59b0f6fe3495093516720632cd093b24c7a0e98.html.gz
├── 076f4f33bf75059db581bedf36e76fb65e89a8f7752db3339aa3ea11c5122f32.html.gz
├── 08f793762792bd252c75fb57544cdf506ffcc04785136cb87503f02364b82b56.html.gz
├── 098bb3e96c0acdf36efdcde45fb9cca3f8c82c7cb2071b76097a1b96155f1eb2.html.gz
├── 0d46122928b6f468cc4bbc694051d0dbae5702bc75a16dab82a99b58daf150a0.html.gz
├── 0dd1357045727799a447563fd8851f4ebe79f042073ea16991a9b67aa595f81a.html.gz
├── 0e014df693f182824fe5e24030ddbe1d0b96ddb9685cf20d5766457ed32ffa2d.html.gz
├── 0ec95c7261d122f304728e90c983450ef1ce1e0b423546835c397d50aaf0d0f2.html.gz
├── 11ea381ad92b5448cf66eae62f52ac565361a244c8881615fc6a7bb523cc0c32.html.gz
├── 14cc2a0ca59c62a8c9f205a171e9ccf4ef4cf69b0c642f51c8c65c051b39024f.html.gz
├── 156770d676ce79905198e1c8407f81e5ecfb617d9aa44712718707eb7e3b8e38.html.gz
├── 16c30add7e96315e9cc957d85aa876ccb6b70055f0ddab51547a586117cc1f56.html.gz
├── 1ace8c85aaee21b9d4505eca506d50c4721c29db62848b567a9703bfe0583892.html.gz
├── 1ee91d1fce65e09be8b8d2d29eab771546d98ca2ba5c862941e660e9fec12432.html.gz
├── 1f765c48780665e89cc3af1f7c9af47876e9fae9b5be4a936b0649e10f5e3198.html.gz
├── 20b2b64916b00b25203c9f1bf14248922f4d522f18328e9f876cce116df0083e.html.gz
├── 21486419bb109c5a62a68957f528e6ff29c92f58d8d3c1f2837c86ff3f3e11f9.html.gz
├── 232a43fb15abde807427b2a7bf4f772e27b8760554370956d8291df4e8166dbf.html.gz
├── 23aaecd14171f96cfd201a8a46666097e286ad71f74f29347a78c5ecba50da1e.html.gz
├── 264dc3ae31249cb1f50c50986e0952a4708c2e705d18a2d8bf0e525da6e2b485.html.gz
├── 287e4d9f4af31733aad6534aefb2bd00fb344ec8d6ebf1ac99dbc4d762da0ca4.html.gz
├── 291a8bf33ee49074f33dcff37544ac40506cae450db83b6cb63f02b9920b51c2.html.gz
├── 2c46804d9db4a85e8f8d31128ce0e11d02f25c7120c2faa5ec0664c604a47717.html.gz
├── 2f42ef1d3ea0c96e56355d3db93d0e06b47e760b74f6f4261278b8cd1c246dd6.html.gz
├── 30b771a40a4e96156d398716c877deef54b05d091770d2717c98e4c6b670010c.html.gz
├── 3252222e61fe78982cffe0b0bad2b089c27b32f65852d1c5d3951517f3c2e295.html.gz
├── 33fe2471fd553c6570f93997f208b4f39bf30be5947c3cfa620ee8eff3355ab9.html.gz
├── 34a7328535ad4e60b059f81d37eec5d25c2bc8de759ce9a7b5e47ac7dc6fd1b0.html.gz
├── 358cc4a080456476b0f883c56bdce796874c286ed6efab25f5718dd95fab42a8.html.gz
├── 359fee228518d55b921194561e9ca88e428df81940246f8fac7a75398377daea.html.gz
├── 35b158918c676ff2c74445517db76c83db70a805cc50b64e1369b354a027fcbd.html.gz
├── 360c732d1fdbfc6895d7096c0c0b8c0d581bb1af80160f4c6a0f1fd9ff85e469.html.gz
├── 374ac9a59a85196cdacc1679fb8993521a7b7d9d6533720f102300be1c7face4.html.gz
├── 39d5c43beb60605c3eec760c99500e62e7bd71ebbe4ae05edf382125e1b0b80a.html.gz
├── 3c5bf8db4272925bf1dd5713fc325e179fd0d1cc6fb8c77aa2d917cfd2518a32.html.gz
├── 3c6d3381ef52ca26be2fbde19c1b0fe17d85682b726dfecf5e300c1ca34546b1.html.gz
├── 3cb22bfabed8de715c0813a7bb5052363c96bd71ccce3bb2dfb3ab9d1d7a9bbc.html.gz
├── 3cb5e2f46626d5bb0345759453036f7eabc0b0c7796b796513606bf693060ced.html.gz
├── 3ce1c8fdf6ad2ded9e48a68be71eb069fc453ef1b75f47698428a1fdda0deb24.html.gz
├── 3d8f3404cf975af824d7866b7679bc45189c3eea6adb32f0a125a0904b1abbb2.html.gz
├── 3f65af7b6b98b1c9ae9a3e0d8a09a85600cdc44e26e4b3a6db96a31f4b1767e3.html.gz
├── 4219d096902dad9fd9d57e881e7928ca66bdf5334c2bc7dfddaa264887777a7a.html.gz
├── 42aad16bde9288623543642a9ce1a396be83e2db44aa2ff8cbbfe46e14abd7cc.html.gz
├── 432362af0be43f6da757ea778bd7f2f000094a565bdebac5af7442987a5372f3.html.gz
├── 4648a420af9984d45b76a4afedf4f74965f8a2e0bf1c69bd3da2dc189020f3c9.html.gz
├── 4a44ab3e4c41d56ce9b79eb07acb06aed1bc52aba68a950f06e7de7ef848400a.html.gz
├── 51374560f40088e227f0053ff1bb0b8525d10a8d7bfbff1cd6033f42347fd85b.html.gz
├── 51d066b0602c9421d8d6410bc4b931700978409a3faa2a984e8fbde519ad7241.html.gz
├── 5211188428849a31e309ef2475746563ff788b1591c89818c08d5abedec4ef5e.html.gz
├── 55bb6340e3d7dd8632ba45179ae43c39f8ad0cfcecb4719e3b9cf6106ffb70a3.html.gz
├── 57b4dafd18cfd0531b69f81e87158648227c673ef159f8d8c87d34e34bdb21f2.html.gz
├── 57d46c9d751e3fd3ffaf3ede7ac20cebd30eacb5ea78e1a6aa0a72059244e7ca.html.gz
├── 57e2e98887a1965689955921208e32f410b10e2b95c907e74e57982d3edf3cc6.html.gz
├── 5a822960e9a2cb1e664d334b6c936c5cb6e41fb5331877538c2c8339cb59d57e.html.gz
├── 5ae11e580afc12d3ba1a12944281e6a7a5dded5c98b4efcf24aedcb28f0d5b22.html.gz
├── 5caf91b8a4423735f866b089d2611ea14503584cf3b6f487c6d26eb7b9521fca.html.gz
├── 5f03fc173ebc6abdfae50b96ce0b05a6137b7d3f2ef379be35a9bb8ca9f49e87.html.gz
├── 5f9c5ed5d64dfe682d9bde13b9b4f032a3ebdbf165c06ec49c0705bcbe106e3b.html.gz
├── 5fa3154ec031ab35411a457d78eb5aa92c0e803c5329bd05c001e6d64009e206.html.gz
├── 5fa5679de56c43edf70685762c2d1f2de296432ae53aa46e075b552fee17cab8.html.gz
├── 5fbc7ccb504c755ae23a85499a17518483d7862b74b4a5c34d86ede1a1a4448e.html.gz
├── 612cd29826624e68ce96789c8049e16279dfd2fceb27434eea7943b2aaf84e90.html.gz
├── 624fcd903d56fc7055fa7097b330629450c095ad6937318deb027be7803bbf35.html.gz
├── 63db31a161b3c5b64e88c2978635cbc38d342ba82fd2c5335321203dcc55c76f.html.gz
├── 65408257dbe4b41f71a35ade24e30243265095fc1d4988a35b9a6ca52f2b4eab.html.gz
├── 65bf3048b500bbd84928d9122f99617ca898216b91add1d8b2ac09c670484a5c.html.gz
├── 65ce3a4577a0306994efa190a0d96e84014f9d4257ad54753e807ede518f02c0.html.gz
├── 680c2848e94a96f961a0964631de94ac572f83c45bfd0bec2deafa893bcfe15c.html.gz
├── 686bb170effe273eaff1c0f88e412172e8d972518a6d1454c896f52aafaa9643.html.gz
├── 6a72de37e8f98f4eee6c0821e593b35ce536cef6c8b424c5e1dd747ebe6621ba.html.gz
├── 6ebac05f637ece8aa57c298a2a5e3a8047f546f855d0f29cc683cea60ce85c85.html.gz
├── 702d1da63b8e064cb70617620e45c2d116b4912c9bc9d518dcf5ce54bb8057ed.html.gz
├── 70cb2d5bca75ab5a8f6bb378a38a52f882f6bda508de93b12502e74936d86ff2.html.gz
├── 776a1c046798b474e410f6edf3225d6a27fecd0de6aac22aef7b7f64fe87caaf.html.gz
├── 7837c9d66c815b9a21dd669a3dc21677c3f084b1b7dd603d56e87867d8970dd3.html.gz
├── 785affa2c34e6e4844ef080e98e1a1e532eeeb671bdacebfb9e98ad7320ff382.html.gz
├── 7916ecca969ffdd8f6fc32d171fbe0dd63db40fe4c1d2ade02b1dec5929a162f.html.gz
├── 7a457a4f71735c17b8b34fafc88835d225cf879b2d812311857a64cfc891eee9.html.gz
├── 7a664e40d256470fdb12d10c3f8d1c6db0581e9b080c71765e55f273a3ac7d03.html.gz
├── 7ab16ade32386ece353b8d31fc3bb7e660189efd5bf3c8549aaba101ad3f5ef5.html.gz
├── 7bb1ca90354313840329d2f569ea9fb3a582df2aa0a5e3669f8fc567eb6ea61b.html.gz
├── 7de5241947a5f7147fe9787c6f6fa16685bfe66e6c35510a68780f27690dc4f0.html.gz
├── 7dfc3e359d7c0ca48ac9046ae5759286cedf80abe7526fc6c6e6546b9ba43e33.html.gz
├── 7f93c1944a41d01960f8a16fdfda6c562e86f04ead8375ab796c4278402df9a8.html.gz
├── 8267acacb9e4a109b1f7ee7bafe735b73e9c94180b703b131f9e90c9be044f39.html.gz
├── 82b6d780c792df78dcfb00484d50c86fbc7f324a9eb5835b7615f028edb9a574.html.gz
├── 833caf3bdba53dcf48de273cf646370eebe9ac565744b0d0e941e298e1b79730.html.gz
├── 8380689f358c1e3a0f6fca6e11ed13e5304a74060139f7a584347db213950446.html.gz
├── 851498a2b9f4f0b578ac9700c245253dbc147a06c0fb3499adebf1c2d5663c29.html.gz
├── 85439e26c41c75901820d01a13e8cea7836abb58635ea3986f71a163ab0311d3.html.gz
├── 8634d1211c3f2b73041e6cadd5d59676619838949999a83a23c51a3195b44892.html.gz
├── 87438a0dacbeb979e72522f42b9020048da13dc5a079477114190c8855701b7f.html.gz
├── 87bf60570e6e2e33cb1f0fdb5600d6c85012e60be25ba6fa587b8f90eb9a3770.html.gz
├── 88c328b68b038a625b4b3f8c322215caa30b0e88af0754bd71056ffc15c7b4b7.html.gz
├── 8b194530308204139d9c8f7d495a26b117c78756ac1802cfc3c0a8bfdf2c0d50.html.gz
├── 8cad00dc22de45ba42e9540421b5f78333f7ac57b385d69acb27a53b9fd69f0c.html.gz
├── 8e3efab59f48fd29a1e1e7aa135880c4251a9f090f94999668cdbaec59d30b5a.html.gz
├── 921019755f4a96ac4abf9dbcb4ef9d5ac202624a542d5ea70912330aa6fcc71f.html.gz
├── 94fbcc26772088646cb977cecf1abc4012847a1f6927d09505cbf0c3d417ba07.html.gz
├── 95301fb7883e0ee5214d1111554d30dd97e08c6380d7699369c0b9c15f42e6aa.html.gz
├── 961bd85ca85aaf791b278cc4a60058e92d57c4f32a3411cf8e7d802af183c926.html.gz
├── 9a440270bf8625d586039dfae1b8df409b467524e075124cd7a5424a5806901b.html.gz
├── 9cb8224b660f36c932823ab613fb76a07928fcbc41956c4c1f96f4ecab9202aa.html.gz
├── 9da36ae4714bfccc72374c6c146e9d1cd3cca39e2110bd67ccdbcc806f4cf139.html.gz
├── 9e8c9f082a8d77c58c17bda03b6b4bb6a1d6883fe196c252db4ca83b9991e0d3.html.gz
├── 9ebb3af65694a953005df5bd3869b2cefc263e1dea0471e3ef361c66a264cdd3.html.gz
├── 9eef8162bbb67b0bd73792313b91b87dc9304f43f85f479e67e71c166417451e.html.gz
├── a078b3656adc0295d0e37bd4f599342f4a0894da2451e0ef3038ac045434fef3.html.gz
├── a1fca19b884e0e946ad3fbe2a7f5031e5e3b23372702a76db302b6143c77cb31.html.gz
├── a6968f427cdb786531cfb326518e674bd8b48af94df7c5c6165cdf40e944357a.html.gz
├── a860fb5eda1ac75df3bc95ba096ade649fdbb1bb566adb9fee3cb13e59f37604.html.gz
├── aadb38e527d5379306de3b910ec62cb2447cc1035686b2b2d152580f8f8a1ea2.html.gz
├── aade2ec8d1e7b0919aef1001c3ef0573f8a239e22d4d751d8e664f04ea77ef0d.html.gz
├── abd9d6291b6bfae0c3ffad8ab7623b482c6da46face0271dc42af6324d8f0ce5.html.gz
├── ac1bfdd4c510f679c58f1b62101630d40fda20a16703235ae0f56b65a465e423.html.gz
├── ac3c035520461017a7c5b248d8e39ef063cad4c0c7d7b7ecd68aff8f15099485.html.gz
├── ad826691a8a2f9c4ce50cf0b885af933c4b5119c1f6235cd7df1dfb83f255bcc.html.gz
├── ad9e9e596f21a6812fae27b5d9d622359826c368e471d7d5ff9ac4676eaac9cd.html.gz
├── aec5deeaada8b2fb81b55349da0229d5c77a4dc9605c1aaa31e5ce8b71358bc9.html.gz
├── b0cf2bbf0192315eec95ede9c59bbf4ae58699275739d590edc24b012e3e8800.html.gz
├── b37be3535e1fb61e5a238b7fa1ead1ad98b651cb09f138efadac3d54a122fb21.html.gz
├── b3c19dd5f0612d098788fa5173e491b3280da6226b492f8fe110f4ab1896cca8.html.gz
├── b6906ca016bbfc64c90426e098c75b3e8c84457a77f51f1e7ea6941cb80c2147.html.gz
├── b6fb53e9fb043c98eb1e6530a1074c40922e29025f5454809f3938a7c174faa3.html.gz
├── ba07d1e64775f4090e39116c382111f5a2cfe9528dd179673f4e9bfcea370c15.html.gz
├── ba4dfe2d3e817ff7b8b01172ccc307850fc5469bcdd26c3c48ca046cb88ab7cf.html.gz
├── bc13ff87b2630ffbebc33bc37b11178b14f03109055e1d17bf644f804b63d98a.html.gz
├── bd673bd7988144f0ab7b9c5e19fed140fb5aaa30d8894cb045b72d3b79a7dc54.html.gz
├── bdb56ac83513635db1d8b9eb46b2da4c0de8da2f1f28f5bf5163df3eb3d3ec06.html.gz
├── c00962aabe7bdd1fca78f5360ea7fa93cd7674863b05157e00827506a7aa58c4.html.gz
├── c13b9c0e04fb28d445d22e92bff6ab7f7800a429930677c28c4dad89f3269869.html.gz
├── c467d507551a836efa9cfe843ba5d7bafe519750e04d0c9ff0decf44f013f829.html.gz
├── c4a3637c6696f238cf9fe1c7fbb17bbb6731a71d4f5fe399b9b4fc3294a96a6b.html.gz
├── c50845a7158af12ee75acea301a3ea0dad1e848d6b9dbdb43ba7f2d825b2528b.html.gz
├── c582d3b772578e8feaa3cfd8f5ae8100bb6f0bc66048204a9a398395841c1164.html.gz
├── c58aa507c4deebd660f69905f9abb8f96d935f6e7210f597ed4cd32b3f39f7f7.html.gz
├── c69e539d689a8335a69042727f1b58edab09d5d99fb607ec625a63151a537dc2.html.gz
├── c7e39ac49fa1235f5d50f83bf2444248bd3aa4e6df044377916c812dd109ba23.html.gz
├── c81e134ed49902bcf69b551426b4a346c5a77ae993cac8bda68b5541a664ef4c.html.gz
├── c82b3d1d540bbbd6081bdfb78b4c068c583aa766bcaaefe7ad16d24e5413a829.html.gz
├── c90731f051d033e49e4cfcc920895051bbc3b54ef1a11519abcf22a115c3aa79.html.gz
├── cc03ddb5ef7d5f1fdb8a87f5e6dfd058a2a70acedf2551655a898dc5c18eb79e.html.gz
├── cc4aa22b8212aec7d289667c0a965569e6f06b9e9196ff8b02219bf2bc1b90d0.html.gz
├── d0382c0d9573a0a7beb1e649012d04ec7275ac23513ca6ca59e51477b028283c.html.gz
├── d1c57d7821e5a5b27fb468c59489601bb2a042b1c05221166e3221d2b5dc217f.html.gz
├── d48aeb9cf2f2ff15769a57513249b4a6a669159f3e50b335e741d4206a824e88.html.gz
├── d605bdef2cde7308a9f2fbd1484d4a9c3da0167177245d346da61e455f42208d.html.gz
├── d90bda7ed14df19574f4ca8b1ccde5752a78f40058af1393e81cc99adb3e8756.html.gz
├── db6b0816c612296c7f1f001c6df874214fcca0da0fc86fb3aea9358c7f681754.html.gz
├── dc7ccccc1f34eb2928cb238739aaf18c712d59d8d34b41acfb29178aeba65356.html.gz
├── dfd43bc0d46e7aaa78ba10fbcb5b9fdfe78771d36cb4c7497e17fb6f69170ec5.html.gz
├── e100c9612ad8495db03b2a9f968952d0eaa4853d9b32ded6a29f8e313a974873.html.gz
├── e1c7023ee2148901b086256fdd30a0893d10b0720b510d5ff07a021109347266.html.gz
├── e1cd54e5577d077df83a12a4753c3c8bf2d88d68cd709cc4c442874777581c4a.html.gz
├── e372e42c0a3df7b86e1c0bacf7bc14d042144a01e88833bc5a643d61b3547090.html.gz
├── e4c6a3b482403a8f60190ba27248cd52b250b86f5d4a8a10edcf7062c64fc3f5.html.gz
├── e593d7fe88f9f5cd6587ac172be2db6055d40b6f071023f97ab1ce373534261e.html.gz
├── e7301133baab43596f19076beab32096f6405b868e0a69bcfc3349e595d62475.html.gz
├── e7994d5500875202d93e736e8f0c8a0436107d10add94ce3789001b8c5c32358.html.gz
├── e7d77f1869803e24667fa0b985cff27fb4139951a5ffa494bc9ba810df48fb30.html.gz
├── ea25dd7edff4d27973600f35728f20aed5a3eedcc23257d9c3afc3d3e840c3de.html.gz
├── eb62ac8425e5573947ecde962d14433d18e5725cc4a8c908fe22f678e96a65a1.html.gz
├── ec3878db7e49b1ed354c511b132e3de5f773ff4fc8014163df58c22fffd93d2f.html.gz
├── ec7fc408c5ce66c22692a3f696c682f3de794bacfaca405d9a0dac5957051e5a.html.gz
├── ecb46e3e489d2aac92b2563112e1801077b4219a6db9751f18e228bcaf457802.html.gz
├── eecd2575093b85933997521d6babddd397599419588d7096c5c19dc4ffe2ea72.html.gz
├── ef2b3f268a67950c16563de9ca3209163c7618868c0216739e1e794e7884cc20.html.gz
├── ef4e67b66d63b5facef55c06a94d85f2ae01a0a1a4a3a1bcfe2499c8c8a7dacf.html.gz
├── f105de6e63ca91ea482f60193f6252092557f969f2fd128ff68c0d4d6b90dd7d.html.gz
├── f344ca5fb36e130f4344235fa22726f3367e09c211c120f21d9ae92effe902db.html.gz
├── f5c90a6d5253c3a21ff3168c64bea4b5ffade7a1ba5bed952a59ebee0d648d98.html.gz
├── f6ac15a4d98511396da23e4428deb5605422b1c8bbc8284e771f6896bdccf57f.html.gz
├── f81c6c05d9cbc93316992fa23ef74ec405194e292611f2e94f6a814868903665.html.gz
├── f8ff621a0b9b7646cc0d57d37416feabba2bf78ef5dd0bfc5b080f9f97bbe584.html.gz
├── fde930b01859de8311c6a14f8aa8c72be0659b551367803deb6736cf3526cf2e.html.gz
├── ff0f958ade714ebfaf5c0b42b1c0152a62063f4e6f72141406ccefc4a2677f21.html.gz
└── ffc109d474fdee1a59fa554df8b09643f4a7d45b23eceabad66f0712c3f7daed.html.gz
├── output
├── AutoExtract.json
├── Diffbot.json
├── beautifulsoup.json
├── boilerpipe.json
├── dragnet.json
├── go_domdistiller.json
├── go_readability.json
├── goose3.json
├── html-text.json
├── html2text.json
├── inscriptis.json
├── justext.json
├── news_please.json
├── newspaper.json
├── readability.json
├── readability_js.json
├── trafilatura.json
└── xpath-text.json
├── requirements.txt
└── tests.py
/.github/workflows/evaluate.yml:
--------------------------------------------------------------------------------
# Runs the benchmark evaluation on every pull request and on pushes to master,
# and exposes the printed report as the ``result`` output of the
# ``evaluation`` step.
name: Evaluate

on:
  pull_request:
  push:
    branches:
      - master

jobs:
  evaluator:
    runs-on: ubuntu-latest

    steps:
    - uses: actions/checkout@v2

    - name: Set up Python 3.7
      uses: actions/setup-python@v2
      with:
        python-version: '3.7'

    - name: Run Evaluation
      id: evaluation
      run: |
        RESULT=$(python3 evaluate.py)
        echo "${RESULT}"
        # The ``::set-output`` workflow command is deprecated; write the
        # multiline value to the GITHUB_OUTPUT file with a delimiter instead.
        # No %-escaping of newlines is needed (or wanted) with this form.
        {
          echo 'result<<EVALUATION_RESULT_EOF'
          echo "${RESULT}"
          echo 'EVALUATION_RESULT_EOF'
        } >> "${GITHUB_OUTPUT}"
31 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .mypy_cache/
2 | __pycache__/
3 | extractors/go_readability/go_readability_cli
4 | extractors/go_domdistiller/go_domdistiller_cli
5 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 Scrapinghub
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | Article extraction benchmark: open-source libraries and commercial services
2 | ===========================================================================
3 |
4 | We evaluate the quality of article body
5 | extraction for commercial services
6 | `Zyte Automatic Extraction (ours) `_,
7 | `Diffbot `_
8 | and open-source libraries
9 | `newspaper3k `_,
10 | `readability-lxml `_,
11 | `dragnet `_,
12 | `boilerpipe `_,
13 | `html-text `_,
14 | `trafilatura `_,
15 | `go-readability `_,
16 | `Readability.js `_,
17 | `Go-DomDistiller `_,
18 | `news-please `_,
19 | `Goose3 `_,
20 | `inscriptis `_,
21 | `html2text `_,
22 | `jusText `_,
23 | `BeautifulSoup `_.
24 | We release evaluation datasets and scripts,
25 | and provide more details in a whitepaper.
26 |
27 | Article extraction is a task of extracting certain fields of an article
28 | (e.g. news or blog post), such as headline, article body, publication date,
29 | authors, etc. Article extraction systems must work on any web-site.
30 | Here we evaluate only the article body field, as this is one of the most important fields
31 | and one of the hardest to get right.
32 |
33 | .. contents::
34 |
35 | Results
36 | -------
37 |
38 | Results of the initial evaluation, done in November 2019::
39 |
40 | version F1 precision recall accuracy
41 | AutoExtract Nov 2019 0.970 ± 0.005 0.984 ± 0.002 0.956 ± 0.010 0.470 ± 0.037
42 | Diffbot Nov 2019 0.951 ± 0.010 0.958 ± 0.009 0.944 ± 0.013 0.348 ± 0.038
43 | boilerpipe ab3694d 0.860 ± 0.016 0.850 ± 0.016 0.870 ± 0.020 0.006 ± 0.006
44 | dragnet 1b65e7b 0.907 ± 0.014 0.925 ± 0.013 0.889 ± 0.019 0.221 ± 0.030
45 | html-text 0.5.1 0.665 ± 0.015 0.500 ± 0.017 0.994 ± 0.001 0.000 ± 0.000
46 | newspaper3k 0.2.8 0.912 ± 0.014 0.917 ± 0.014 0.906 ± 0.018 0.260 ± 0.032
47 | readability-lxml 0.7.1 0.922 ± 0.014 0.913 ± 0.014 0.931 ± 0.016 0.315 ± 0.035
48 | xpath-text 4.4.2 0.394 ± 0.020 0.246 ± 0.016 0.992 ± 0.001 0.000 ± 0.000
49 |
50 | Results for packages added after the original evaluation::
51 |
52 | version F1 precision recall accuracy
53 | trafilatura 0.5.1 0.945 ± 0.009 0.925 ± 0.011 0.966 ± 0.009 0.221 ± 0.031
54 | go_readability bdc8717 0.943 ± 0.007 0.912 ± 0.009 0.975 ± 0.007 0.210 ± 0.030
55 | readability_js Feb 2021 0.887 ± 0.012 0.853 ± 0.013 0.924 ± 0.012 0.149 ± 0.026
56 | go_domdistiller 1c90a88 0.927 ± 0.007 0.901 ± 0.010 0.956 ± 0.010 0.066 ± 0.018
57 | news_please 1.5.17 0.911 ± 0.014 0.917 ± 0.013 0.906 ± 0.018 0.249 ± 0.032
58 | goose3 3.1.8 0.887 ± 0.016 0.930 ± 0.015 0.847 ± 0.021 0.227 ± 0.032
59 | inscriptis 1.1.2 0.679 ± 0.015 0.517 ± 0.017 0.993 ± 0.001 0.000 ± 0.000
60 | html2text 2020.1.16 0.662 ± 0.015 0.499 ± 0.017 0.983 ± 0.002 0.000 ± 0.000
61 | justext 2.2.0 0.802 ± 0.018 0.858 ± 0.017 0.754 ± 0.028 0.088 ± 0.021
62 | beautifulsoup 4.9.3 0.665 ± 0.015 0.499 ± 0.017 0.994 ± 0.001 0.000 ± 0.000
63 |
64 | Below you can find more details about the packages and result reproduction.
65 |
66 | More details
67 | ------------
68 |
69 | More details are available:
70 |
71 | - In the whitepaper at https://www.zyte.com/whitepaper-ebook/in-depth-analysis-and-evaluation-on-the-quality-of-article-body-extraction/
72 | - In a technical report attached to the v1.0.0 release at
73 | https://github.com/scrapinghub/article-extraction-benchmark/releases/tag/v1.0.0
74 |
75 | Installation
76 | ------------
77 |
78 | Clone this repo, and use Python 3.6+.
79 |
80 | Evaluation does not require any dependencies.
81 | Dependencies listed in ``requirements.txt`` are only for re-generating
82 | output files for open-source article extraction libraries.
83 | See below for their installation details.
84 |
85 | Data
86 | ----
87 |
88 | JSON data format: a dictionary which maps item ids to dictionaries,
89 | with the following fields:
90 |
91 | - ``articleBody``: text of the article
92 | - ``url``: page url (optional)
93 |
94 | All files should have the same keys.
95 | Ground truth is in ``ground-truth.json``,
96 | predictions from different systems are in ``output/*.json`` files.
97 |
98 | HTML files are in ``html`` folder. They were fetched with Splash headless
99 | browser with JS disabled by default. They are gzip-compressed and utf-8 encoded.
100 |
101 | Screenshots of all pages are not in the repo, they are available on github
102 | in the "Releases" section: https://github.com/scrapinghub/article-extraction-benchmark/releases
103 |
104 | Open-source libraries
105 | ---------------------
106 |
107 | In addition to benchmarking AutoExtract and Diffbot services, we also benchmark several
108 | open-source libraries that work directly on HTML files without a need for rendering
109 | or external resources:
110 |
111 | - newspaper3k: https://github.com/codelucas/newspaper
112 | - readability-lxml: https://github.com/buriy/python-readability
113 | - dragnet: https://github.com/dragnet-org/dragnet
114 | - boilerpipe: https://github.com/misja/python-boilerpipe
115 | - html-text: https://github.com/TeamHG-Memex/html-text -
116 | this is a baseline which extracts the full text of HTML page
117 | - trafilatura: https://github.com/adbar/trafilatura contributed by the author
118 | at https://github.com/scrapinghub/article-extraction-benchmark/pull/4
119 | - go-readability: https://github.com/go-shiori/go-readability
120 | - Readability.js: https://github.com/mozilla/readability
121 | - Go-DomDistiller: https://github.com/markusmobius/go-domdistiller
122 | - news-please: https://github.com/fhamborg/news-please
123 | - Goose3: https://github.com/goose3/goose3
124 | - inscriptis: https://github.com/weblyzard/inscriptis -
125 | converts HTML to text with a particular emphasis on nested tables
126 | - html2text: https://github.com/Alir3z4/html2text -
127 | converts HTML pages to Markdown
128 | - jusText: https://github.com/miso-belica/jusText -
129 | Heuristic based boilerplate removal tool
130 | - BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/bs4/doc/ -
131 | Python library for pulling data out of HTML and XML files.
132 |
133 | Output from these libraries is already present in the repo in ``output/*.json`` files.
134 | They were generated with ``extractors/run_*.py`` files.
135 |
136 | All dependencies are in ``requirements.txt``.
137 | Note that dragnet may fail to install at first try, as
138 | you need to have ``numpy`` and ``Cython`` installed, and have ``libxml2`` headers
139 | (``libxml2-dev`` on Ubuntu).
140 |
141 | boilerpipe requires a custom installation: use python2, you also need Java
142 | (e.g. install ``default-jre`` in Ubuntu), install it with
143 | ``pip install -e git+https://github.com/misja/python-boilerpipe.git@ab3694d7bf695b73f0684a028e70aa816d63e6cb#egg=boilerpipe``
144 |
145 | go-readability requires a custom installation: see README in ``extractors/go_readability``.
146 |
147 | Readability.js requires a custom installation: install Node.js, then install the CLI tool:
148 | ``npm install -g readability-cli@2.2.1-pre``
149 |
150 | Go-DomDistiller requires a custom installation: see README in ``extractors/go_domdistiller``.
151 |
152 | Evaluation
153 | ----------
154 |
155 | For evaluation, run::
156 |
157 | python3 evaluate.py
158 |
159 | We report precision, recall, F1, accuracy and their standard deviation estimated with bootstrap.
160 | Please refer to the technical report for more details.
161 |
162 | License
163 | -------
164 |
165 | License is MIT.
166 |
--------------------------------------------------------------------------------
/evaluate.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import argparse
3 | from collections import Counter
4 | import json
5 | from pathlib import Path
6 | import random
7 | import re
8 | import statistics
9 | from typing import Any, Dict, Tuple, List
10 |
11 |
def main():
    """ Evaluate every ``output/*.json`` prediction file against the
    ground truth loaded from ``ground-truth.json``.

    One summary line is printed per system. With ``--bootstrap-differences``
    each pair of systems is additionally compared via bootstrap, and with
    ``--output`` all metrics are written to the given path as JSON.
    Python3.6+ is required.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--n-bootstrap', type=int, default=1000)
    arg_parser.add_argument('--bootstrap-differences', action='store_true',
                            help='run bootstrap for differences')
    arg_parser.add_argument('--output', type=Path,
                            help='output results as json')
    args = arg_parser.parse_args()

    ground_truth = load_json(Path('ground-truth.json'))
    summary_template = (
        '{name:<20} '
        'precision={precision:.3f} ± {precision_std:.3f} '
        'recall={recall:.3f} ± {recall_std:.3f} '
        'F1={f1:.3f} ± {f1_std:.3f} '
        'accuracy={accuracy:.3f} ± {accuracy_std:.3f} '
    )
    metrics_by_name = {}
    for prediction_path in sorted(Path('output').glob('*.json')):
        system = prediction_path.stem
        system_metrics = evaluate(
            ground_truth, load_json(prediction_path), args.n_bootstrap)
        print(summary_template.format(name=system, **system_metrics))
        metrics_by_name[system] = system_metrics

    if args.bootstrap_differences:
        # Visit every unordered pair of systems exactly once,
        # in alphabetical order of their names.
        ordered_names = sorted(metrics_by_name)
        for position, name in enumerate(ordered_names):
            for other_name in ordered_names[position + 1:]:
                print(f'Comparison: {name} minus {other_name}')
                print_metrics_diff(
                    metrics_by_name[name]['tp_fp_fns'],
                    metrics_by_name[other_name]['tp_fp_fns'],
                    args.n_bootstrap)

    if args.output:
        serialized = json.dumps(metrics_by_name, indent=4, sort_keys=True)
        args.output.write_text(serialized)
50 |
51 |
def evaluate(
        ground_truth: Dict[str, Dict],
        prediction: Dict[str, Dict],
        n_bootstrap: int,
        ) -> Dict[str, Any]:
    """ Score the predictions of one system against the ground truth.

    Both arguments map item ids to dictionaries; the ``articleBody`` field
    of each item is compared (a missing field is treated as empty text).

    Returns a dictionary with ``f1``, ``precision``, ``recall`` and
    ``accuracy``, a bootstrap standard deviation ``<metric>_std`` for each
    of them, and the raw per-item ``tp_fp_fns`` list.

    Raises ``ValueError`` if the two dictionaries have different keys.
    """
    if ground_truth.keys() != prediction.keys():
        raise ValueError('prediction keys do not match ground truth')

    tp_fp_fns = []
    accuracies = []
    for item_id, true_item in ground_truth.items():
        true = true_item.get('articleBody', '')
        pred = prediction[item_id].get('articleBody', '')
        tp_fp_fns.append(string_shingle_matching(true=true, pred=pred))
        accuracies.append(get_accuracy(true=true, pred=pred))

    metrics: Dict[str, Any] = metrics_from_tp_fp_fns(tp_fp_fns)
    metrics['tp_fp_fns'] = tp_fp_fns
    metrics['accuracy'] = statistics.mean(accuracies)

    # Bootstrap estimates of the spread of each metric: resample the items
    # with replacement and recompute the metrics on every resample.
    n_items = len(tp_fp_fns)
    b_values: Dict[str, List[float]] = {}
    for _ in range(n_bootstrap):
        sample = [random.randint(0, n_items - 1) for _ in range(n_items)]
        b_metrics = metrics_from_tp_fp_fns([tp_fp_fns[i] for i in sample])
        for metric_name, value in b_metrics.items():
            b_values.setdefault(metric_name, []).append(value)
        b_values.setdefault('accuracy', []).append(
            statistics.mean([accuracies[i] for i in sample]))
    for metric_name in sorted(b_values):
        metrics[f'{metric_name}_std'] = statistics.stdev(
            b_values[metric_name])

    return metrics
84 |
85 |
def print_metrics_diff(tp_fp_fns, other_tp_fp_fns, n_bootstrap):
    """ Print the bootstrap mean and standard deviation of the metric
    differences (first system minus the other one).

    Both lists must be aligned item by item: every resample draws the same
    item indices from each of them.
    """
    n_items = len(tp_fp_fns)
    diffs = {}
    for _ in range(n_bootstrap):
        sample = [random.randint(0, n_items - 1) for _ in range(n_items)]
        metrics = metrics_from_tp_fp_fns([tp_fp_fns[i] for i in sample])
        other_metrics = metrics_from_tp_fp_fns(
            [other_tp_fp_fns[i] for i in sample])
        for key, value in metrics.items():
            diffs.setdefault(key, []).append(value - other_metrics[key])
    for key in sorted(diffs):
        values = diffs[key]
        mean = statistics.mean(values)
        std = statistics.stdev(values)
        print(f'{key:<10} {mean:.3f} ± {std:.3f}')
100 |
101 |
# (true positives, false positives, false negatives) for a single item.
TP_FP_FN = Tuple[float, float, float]


def metrics_from_tp_fp_fns(tp_fp_fns: List[TP_FP_FN]) -> Dict[str, float]:
    """ Aggregate per-item TP/FP/FN triples into precision, recall and F1.

    Precision is averaged over the items which have any predicted content
    (``tp + fp > 0``) and recall over the items which have any expected
    content (``tp + fn > 0``); F1 is the harmonic mean of the two averages.

    When no item qualifies for an average, that average is reported as 0.0,
    and F1 is 0.0 when both precision and recall are zero, so that a system
    which extracts nothing useful gets zero scores instead of an exception.
    """
    precisions = [
        precision_score(tp, fp, fn) for tp, fp, fn in tp_fp_fns
        if tp + fp > 0]
    recalls = [
        recall_score(tp, fp, fn) for tp, fp, fn in tp_fp_fns
        if tp + fn > 0]
    # statistics.mean raises StatisticsError on an empty list.
    precision = statistics.mean(precisions) if precisions else 0.
    recall = statistics.mean(recalls) if recalls else 0.
    denominator = precision + recall
    # Guard against 0 / 0 when every item was extracted completely wrong.
    f1 = 2 * precision * recall / denominator if denominator > 0 else 0.
    return {
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }


def precision_score(tp: float, fp: float, fn: float) -> float:
    """ Precision for a single item.

    Returns 1.0 when there are no errors at all (``fp == fn == 0``),
    0.0 when nothing was predicted (``tp == fp == 0``) but something was
    expected, and ``tp / (tp + fp)`` otherwise.
    """
    if fp == fn == 0:
        return 1.
    if tp == fp == 0:
        return 0.
    return tp / (tp + fp)


def recall_score(tp: float, fp: float, fn: float) -> float:
    """ Recall for a single item.

    Returns 1.0 when there are no errors at all (``fp == fn == 0``),
    0.0 when nothing was expected (``tp == fn == 0``) but something was
    predicted, and ``tp / (tp + fn)`` otherwise.
    """
    if fp == fn == 0:
        return 1.
    if tp == fn == 0:
        return 0.
    return tp / (tp + fn)
134 |
135 |
def get_accuracy(true: str, pred: str) -> float:
    """ Return 1.0 if both texts produce exactly the same token sequence,
    and 0.0 otherwise.
    """
    true_tokens = _tokenize(true)
    pred_tokens = _tokenize(pred)
    return 1.0 if true_tokens == pred_tokens else 0.0
138 |
139 |
def string_shingle_matching(
        true: str, pred: str, ngram_n: int = 4,
        ) -> TP_FP_FN:
    """ Compute TP/FP/FN across shingles (joined ngrams).

    Shingle occurrences present in both texts count as true positives,
    surplus occurrences in ``pred`` as false positives and occurrences
    missing from ``pred`` as false negatives. The three values are divided
    by their sum, unless the sum is zero.

    Intended to be used for articleBody comparison,
    similar to the one used here (with shingles instead of tokens):
    https://moz.com/devblog/benchmarking-python-content-extraction-algorithms-dragnet-readability-goose-and-eatiht/
    """
    expected = _all_shingles(true, ngram_n)
    predicted = _all_shingles(pred, ngram_n)
    tp = fp = fn = 0.
    for shingle in set(expected).union(predicted):
        n_true = expected.get(shingle, 0)
        n_pred = predicted.get(shingle, 0)
        if n_pred >= n_true:
            tp += n_true
            fp += n_pred - n_true
        else:
            tp += n_pred
            fn += n_true - n_pred
    total = tp + fp + fn
    # Normalize metrics so that longer texts do not have more weight.
    if total > 0:
        return (tp / total, fp / total, fn / total)
    return (tp, fp, fn)
163 |
164 |
def _all_shingles(text: str, ngram_n: int) -> Dict[Tuple[str, ...], int]:
    """ Map every shingle of ``text`` to its number of occurrences. """
    counts: Dict[Tuple[str, ...], int] = {}
    for shingle in _ngrams(text, ngram_n):
        counts[shingle] = counts.get(shingle, 0) + 1
    return counts
167 |
168 |
169 | _TOKEN_RE = re.compile(
170 | r'\w+', re.UNICODE | re.MULTILINE | re.IGNORECASE | re.DOTALL)
171 |
172 |
173 | def _tokenize(text: str) -> List[str]:
174 | # Note that such simple tokenization will work ok for any language,
175 | # even if several words will be clumped together, as we expect
176 | # that extra predicted text will still be separated.
177 | return _TOKEN_RE.findall(text or '')
178 |
179 |
def _ngrams(text: str, n: int) -> List[Tuple[str, ...]]:
    """ Return all windows of ``n`` consecutive tokens of ``text``.

    A text with fewer than ``n`` tokens gives a single, shorter shingle;
    a text without tokens gives an empty list.
    """
    tokens = _tokenize(text)
    # Always try at least one window, so that short texts are not dropped.
    n_windows = max(1, len(tokens) - n + 1)
    windows = (tuple(tokens[start: start + n]) for start in range(n_windows))
    return [window for window in windows if window]
188 |
189 |
190 | def load_json(path: Path):
191 | with path.open('rt', encoding='utf8') as f:
192 | return json.load(f)
193 |
194 |
# Call main() only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
197 |
--------------------------------------------------------------------------------
/extractors/go_domdistiller.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import gzip
3 | import json
4 | import os
5 | import subprocess
6 | from pathlib import Path
7 | from tempfile import mkstemp
8 |
9 |
# Location of the built go-domdistiller CLI executable. The path is relative,
# so it is resolved against the directory the script is started from.
CLI_PATH = Path('extractors/go_domdistiller/go_domdistiller_cli')
12 |
13 |
def main():
    """ Run the go-domdistiller CLI on every page in ``html/``.

    Each ``html/*.html.gz`` file is decompressed, written to a temporary
    file and passed to the executable at ``CLI_PATH``. Whatever the
    executable prints to stdout is stored as the ``articleBody`` of that
    item, and all items are saved to ``output/go_domdistiller.json``.
    """
    output = {}
    for path in Path('html').glob('*.html.gz'):
        with gzip.open(path, 'rt', encoding='utf8') as f:
            html = f.read()
        item_id = path.stem.split('.')[0]

        # mkstemp returns an already open descriptor together with the path:
        # write through that descriptor, so that it is closed afterwards
        # instead of leaking one descriptor per page.
        fd, temp_filepath = mkstemp()
        try:
            # save html to temp file, always as utf8 regardless of the locale
            with os.fdopen(fd, 'wt', encoding='utf8') as fw:
                fw.write(html)

            # get extracted content from go-domdistiller; the exit status
            # is not checked, so a failed run contributes only what it
            # managed to print to stdout
            result = subprocess.run(
                [CLI_PATH, temp_filepath], stdout=subprocess.PIPE)
        finally:
            # destroy temp file even if writing or running the CLI failed
            os.remove(temp_filepath)

        output[item_id] = {'articleBody': result.stdout.decode('utf-8')}
    (Path('output') / 'go_domdistiller.json').write_text(
        json.dumps(output, sort_keys=True, ensure_ascii=False, indent=4),
        encoding='utf8')


if __name__ == '__main__':
    main()
40 |
--------------------------------------------------------------------------------
/extractors/go_domdistiller/README.rst:
--------------------------------------------------------------------------------
1 | Go-DomDistiller
2 | ===============
3 |
4 | Open Source article extractor written in Go: https://github.com/markusmobius/go-domdistiller.
5 | Based on `DOM Distiller <https://github.com/chromium/dom-distiller>`_, which is part of the Chromium project.
6 | The structure of this package follows the structure of the original Java code.
7 |
8 | Usage
9 | -----
10 |
11 | To use the library, I wrote a simple CLI module that reads the contents of the file passed as an argument and prints the parsing result to stdout.
12 |
13 |
14 | Installation
15 | ------------
16 |
17 | 1. Install Go (I used version ``1.15.8``)
18 | 2. Go to the folder containing this file
19 | 3. Build an executable file:
20 |
21 | go build -o go_domdistiller_cli
22 |
--------------------------------------------------------------------------------
/extractors/go_domdistiller/cli.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "fmt"
5 | "os"
6 |
7 | distiller "github.com/markusmobius/go-domdistiller"
8 | )
9 |
10 | func main() {
11 | if len(os.Args) < 2 {
12 | panic("Input file not provided in args")
13 | }
14 | if len(os.Args) > 2 {
15 | panic("Args accept only one argument")
16 | }
17 | input := os.Args[1]
18 |
19 | opts := &distiller.Options{
20 | ExtractTextOnly: true,
21 | SkipPagination: true,
22 | }
23 |
24 | article, err := distiller.ApplyForFile(input, opts)
25 | if err != nil {
26 | panic(err)
27 | }
28 |
29 | fmt.Print(article.HTML)
30 | }
31 |
--------------------------------------------------------------------------------
/extractors/go_domdistiller/go.mod:
--------------------------------------------------------------------------------
1 | module cli
2 |
3 | go 1.15
4 |
5 | require github.com/markusmobius/go-domdistiller v0.0.0-20201222130639-1c90a88d11c2
6 |
--------------------------------------------------------------------------------
/extractors/go_domdistiller/go.sum:
--------------------------------------------------------------------------------
1 | github.com/andybalholm/cascadia v1.2.0 h1:vuRCkM5Ozh/BfmsaTm26kbjm0mIOM3yS5Ek/F5h18aE=
2 | github.com/andybalholm/cascadia v1.2.0/go.mod h1:YCyR8vOZT9aZ1CHEd8ap0gMVm2aFgxBp0T0eFw1RUQY=
3 | github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
4 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
5 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
6 | github.com/go-shiori/dom v0.0.0-20201011032054-d6b74a54fe52 h1:wEe9mu6BOmGYT5yQ9ag5E38LHUMUv7/AFx0J8YNR8HI=
7 | github.com/go-shiori/dom v0.0.0-20201011032054-d6b74a54fe52/go.mod h1:aLEd5DGjh1qYKnJJ/tC5OL0f3CV4CMcreDOn4RpCmUc=
8 | github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
9 | github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
10 | github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
11 | github.com/markusmobius/go-domdistiller v0.0.0-20201222130639-1c90a88d11c2 h1:Zq0OEILmCXTWQdMd1p8a7wk0RvuEfF70ON859jM1n7g=
12 | github.com/markusmobius/go-domdistiller v0.0.0-20201222130639-1c90a88d11c2/go.mod h1:EjE7+WYAL0k+KQX8viF0oy/MH7uKMXmhasAdedoSC3o=
13 | github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno=
14 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
15 | github.com/sirupsen/logrus v1.7.0 h1:ShrD1U9pZB12TX0cVy0DtePoCH97K8EtX+mg7ZARUtM=
16 | github.com/sirupsen/logrus v1.7.0/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0=
17 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
18 | github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
19 | github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
20 | github.com/yosssi/gohtml v0.0.0-20201013000340-ee4748c638f4/go.mod h1:+ccdNT0xMY1dtc5XBxumbYfOUhmduiGudqaDgD2rVRE=
21 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
22 | golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
23 | golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
24 | golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
25 | golang.org/x/net v0.0.0-20201010224723-4f7140c49acb/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
26 | golang.org/x/net v0.0.0-20201031054903-ff519b6c9102 h1:42cLlJJdEh+ySyeUUbEQ5bsTiq8voBeTuweGVkY6Puw=
27 | golang.org/x/net v0.0.0-20201031054903-ff519b6c9102/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
28 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
29 | golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
30 | golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
31 | golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f h1:+Nyd8tzPX9R7BWHguqsrbFdRx3WQ/1ib8I44HXV5yTA=
32 | golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
33 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
34 | golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
35 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
36 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
37 | gopkg.in/check.v1 v1.0.0-20200902074654-038fdea0a05b/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
38 | gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
39 | gopkg.in/yaml.v3 v3.0.0-20200615113413-eeeca48fe776/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
40 |
--------------------------------------------------------------------------------
/extractors/go_readability/README.rst:
--------------------------------------------------------------------------------
1 | Go-Readability
2 | ==============
3 |
4 | Open Source article extractor written in Go: https://github.com/go-shiori/go-readability . Based on `Readability.js <https://github.com/mozilla/readability>`_ by Mozilla, and ported line by line to make sure it looks and works as similarly to the original as possible.
5 |
6 | Usage
7 | -----
8 |
9 | To use the library, I wrote a simple CLI module that reads the contents of the file passed as an argument and prints the parsing result to stdout.
10 |
11 |
12 | Installation
13 | ------------
14 |
15 | 1. Install Go (I used version ``1.15.8``)
16 | 2. Go to the folder containing this file
17 | 3. Build an executable file:
18 |
19 | go build -o go_readability_cli
20 |
--------------------------------------------------------------------------------
/extractors/go_readability/cli.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "fmt"
5 | "os"
6 |
7 | readability "github.com/go-shiori/go-readability"
8 | )
9 |
10 | func main() {
11 | if len(os.Args) < 2 {
12 | panic("Input file not provided in args")
13 | }
14 | if len(os.Args) > 2 {
15 | panic("Args accept only one argument")
16 | }
17 | input := os.Args[1]
18 |
19 | fSrc, err := os.Open(input)
20 | defer fSrc.Close()
21 | if err != nil {
22 | panic(err)
23 | }
24 |
25 | article, err := readability.FromReader(fSrc, "https://fake-url.com")
26 | if err != nil {
27 | panic(err)
28 | }
29 |
30 | fmt.Print(article.TextContent)
31 | }
32 |
--------------------------------------------------------------------------------
/extractors/go_readability/go.mod:
--------------------------------------------------------------------------------
1 | module go_readability_cli
2 |
3 | go 1.15
4 |
5 | require github.com/go-shiori/go-readability v0.0.0-20201011032228-bdc871772408
6 |
--------------------------------------------------------------------------------
/extractors/go_readability/go.sum:
--------------------------------------------------------------------------------
1 | cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw=
2 | github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
3 | github.com/OneOfOne/xxhash v1.2.2/go.mod h1:HSdplMjZKSmBqAxg5vPj2TmRDmfkzw+cTzAElWljhcU=
4 | github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc=
5 | github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0=
6 | github.com/andybalholm/cascadia v1.2.0 h1:vuRCkM5Ozh/BfmsaTm26kbjm0mIOM3yS5Ek/F5h18aE=
7 | github.com/andybalholm/cascadia v1.2.0/go.mod h1:YCyR8vOZT9aZ1CHEd8ap0gMVm2aFgxBp0T0eFw1RUQY=
8 | github.com/armon/consul-api v0.0.0-20180202201655-eb2c6b5be1b6/go.mod h1:grANhF5doyWs3UAsr3K4I6qtAmlQcZDesFNEHPZAzj8=
9 | github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q=
10 | github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8=
11 | github.com/cespare/xxhash v1.1.0/go.mod h1:XrSqR1VqqWfGrhpAt58auRo0WTKS1nRRg3ghfAqPWnc=
12 | github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw=
13 | github.com/coreos/bbolt v1.3.2/go.mod h1:iRUV2dpdMOn7Bo10OQBFzIJO9kkE559Wcmn+qkEiiKk=
14 | github.com/coreos/etcd v3.3.10+incompatible/go.mod h1:uF7uidLiAD3TWHmW31ZFd/JWoc32PjwdhPthX9715RE=
15 | github.com/coreos/go-semver v0.2.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3EedlOD2RNk=
16 | github.com/coreos/go-systemd v0.0.0-20190321100706-95778dfbb74e/go.mod h1:F5haX7vjVVG0kc13fIWeqUViNPyEJxv/OmvnBo0Yme4=
17 | github.com/coreos/pkg v0.0.0-20180928190104-399ea9e2e55f/go.mod h1:E3G3o1h8I7cfcXa63jLwjI0eiQQMgzzUDFVpN/nH/eA=
18 | github.com/cpuguy83/go-md2man/v2 v2.0.0 h1:EoUDS0afbrsXAZ9YQ9jdu/mZ2sXgT1/2yyNng4PGlyM=
19 | github.com/cpuguy83/go-md2man/v2 v2.0.0/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU=
20 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
21 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
22 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
23 | github.com/dgrijalva/jwt-go v3.2.0+incompatible/go.mod h1:E3ru+11k8xSBh+hMPgOLZmtrrCbhqsmaPHjLKYnJCaQ=
24 | github.com/dgryski/go-sip13 v0.0.0-20181026042036-e10d5fee7954/go.mod h1:vAd38F8PWV+bWy6jNmig1y/TA+kYO4g3RSRF0IAv0no=
25 | github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo=
26 | github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04=
27 | github.com/go-kit/kit v0.8.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2as=
28 | github.com/go-logfmt/logfmt v0.3.0/go.mod h1:Qt1PoO58o5twSAckw1HlFXLmHsOX5/0LbT9GBnD5lWE=
29 | github.com/go-logfmt/logfmt v0.4.0/go.mod h1:3RMwSq7FuexP4Kalkev3ejPJsZTpXXBr9+V4qmtdjCk=
30 | github.com/go-shiori/dom v0.0.0-20201011032054-d6b74a54fe52 h1:wEe9mu6BOmGYT5yQ9ag5E38LHUMUv7/AFx0J8YNR8HI=
31 | github.com/go-shiori/dom v0.0.0-20201011032054-d6b74a54fe52/go.mod h1:aLEd5DGjh1qYKnJJ/tC5OL0f3CV4CMcreDOn4RpCmUc=
32 | github.com/go-shiori/go-readability v0.0.0-20201011032228-bdc871772408 h1:xq7Sck0bwvgp/WWw6tHFDn3dUTCQwWRWLudr+inH/gs=
33 | github.com/go-shiori/go-readability v0.0.0-20201011032228-bdc871772408/go.mod h1:sz+ASCdyPdgLAjKpovPn+u+IZjBoGp7vWa1w1yZfi3Y=
34 | github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY=
35 | github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ=
36 | github.com/gogo/protobuf v1.2.1/go.mod h1:hp+jE20tsWTFYpLwKvXlhS1hjn+gTNwPg2I6zVXpSg4=
37 | github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q=
38 | github.com/golang/groupcache v0.0.0-20190129154638-5b532d6fd5ef/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
39 | github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A=
40 | github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
41 | github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
42 | github.com/google/btree v1.0.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ=
43 | github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M=
44 | github.com/gorilla/websocket v1.4.0/go.mod h1:E7qHFY5m1UJ88s3WnNqhKjPHQ0heANvMoAMk2YaljkQ=
45 | github.com/grpc-ecosystem/go-grpc-middleware v1.0.0/go.mod h1:FiyG127CGDf3tlThmgyCl78X/SZQqEOJBCDaAfeWzPs=
46 | github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0/go.mod h1:8NvIoxWQoOIhqOTXgfV/d3M/q6VIi02HzZEHgUlZvzk=
47 | github.com/grpc-ecosystem/grpc-gateway v1.9.0/go.mod h1:vNeuVxBJEsws4ogUvrchl83t/GYV9WGTSLVdBhOQFDY=
48 | github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ=
49 | github.com/inconshreveable/mousetrap v1.0.0/go.mod h1:PxqpIevigyE2G7u3NXJIT2ANytuPF1OarO4DADm73n8=
50 | github.com/jonboulle/clockwork v0.1.0/go.mod h1:Ii8DK3G1RaLaWxj9trq07+26W01tbo22gdxWY5EU2bo=
51 | github.com/julienschmidt/httprouter v1.2.0/go.mod h1:SYymIcj16QtmaHHD7aYtjjsJG7VTCxuUUipMqKk8s4w=
52 | github.com/kisielk/errcheck v1.1.0/go.mod h1:EZBBE59ingxPouuu3KfxchcWSUPOHkagtvWXihfKN4Q=
53 | github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
54 | github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
55 | github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFBFZlji/RkVcI2GknAs/DXo4wKdlNEc=
56 | github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI=
57 | github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
58 | github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
59 | github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE=
60 | github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
61 | github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ=
62 | github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0=
63 | github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0=
64 | github.com/mitchellh/mapstructure v1.1.2/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y=
65 | github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U=
66 | github.com/oklog/ulid v1.3.1/go.mod h1:CirwcVhetQ6Lv90oh/F+FBtV6XMibvdAFo93nm5qn4U=
67 | github.com/pelletier/go-toml v1.2.0/go.mod h1:5z9KED0ma1S8pY6P1sdut58dfprrGBbd/94hg7ilaic=
68 | github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
69 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
70 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
71 | github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw=
72 | github.com/prometheus/client_golang v0.9.3/go.mod h1:/TN21ttK/J9q6uSwhBd54HahCDft0ttaMvbicHlPoso=
73 | github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo=
74 | github.com/prometheus/client_model v0.0.0-20190129233127-fd36f4220a90/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
75 | github.com/prometheus/common v0.0.0-20181113130724-41aa239b4cce/go.mod h1:daVV7qP5qjZbuso7PdcryaAu0sAZbrN9i7WWcTMWvro=
76 | github.com/prometheus/common v0.4.0/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y86RQel1bk4=
77 | github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk=
78 | github.com/prometheus/procfs v0.0.0-20190507164030-5867b95ac084/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA=
79 | github.com/prometheus/tsdb v0.7.1/go.mod h1:qhTCs0VvXwvX/y3TZrWD7rabWM+ijKTux40TwIPHuXU=
80 | github.com/rogpeppe/fastuuid v0.0.0-20150106093220-6724a57986af/go.mod h1:XWv6SoW27p1b0cqNHllgS5HIMJraePCO15w5zCzIWYg=
81 | github.com/russross/blackfriday/v2 v2.0.1 h1:lPqVAte+HuHNfhJ/0LC98ESWRz8afy9tM/0RK8m9o+Q=
82 | github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
83 | github.com/sergi/go-diff v1.1.0 h1:we8PVUC3FE2uYfodKH/nBHMSetSfHDR6scGdBi+erh0=
84 | github.com/sergi/go-diff v1.1.0/go.mod h1:STckp+ISIX8hZLjrqAeVduY0gWCT9IjLuqbuNXdaHfM=
85 | github.com/shurcooL/sanitized_anchor_name v1.0.0 h1:PdmoCO6wvbs+7yrJyMORt4/BmY5IYyJwS/kOiWx8mHo=
86 | github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc=
87 | github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo=
88 | github.com/sirupsen/logrus v1.7.0 h1:ShrD1U9pZB12TX0cVy0DtePoCH97K8EtX+mg7ZARUtM=
89 | github.com/sirupsen/logrus v1.7.0/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0=
90 | github.com/soheilhy/cmux v0.1.4/go.mod h1:IM3LyeVVIOuxMH7sFAkER9+bJ4dT7Ms6E4xg4kGIyLM=
91 | github.com/spaolacci/murmur3 v0.0.0-20180118202830-f09979ecbc72/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA=
92 | github.com/spf13/afero v1.1.2/go.mod h1:j4pytiNVoe2o6bmDsKpLACNPDBIoEAkihy7loJ1B0CQ=
93 | github.com/spf13/cast v1.3.0/go.mod h1:Qx5cxh0v+4UWYiBimWS+eyWzqEqokIECu5etghLkUJE=
94 | github.com/spf13/cobra v1.0.0/go.mod h1:/6GTrnGXV9HjY+aR4k0oJ5tcvakLuG6EuKReYlHNrgE=
95 | github.com/spf13/jwalterweatherman v1.0.0/go.mod h1:cQK4TGJAtQXfYWX+Ddv3mKDzgVb68N+wFjFa4jdeBTo=
96 | github.com/spf13/pflag v1.0.3/go.mod h1:DYY7MBk1bdzusC3SYhjObp+wFpr4gzcvqqNjLnInEg4=
97 | github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
98 | github.com/spf13/viper v1.4.0/go.mod h1:PTJ7Z/lr49W6bUbkmS1V3by4uWynFiR9p7+dSq/yZzE=
99 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
100 | github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
101 | github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
102 | github.com/stretchr/testify v1.4.0 h1:2E4SXV/wtOkTonXsotYi4li6zVWxYlZuYNCXe9XRJyk=
103 | github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
104 | github.com/tmc/grpc-websocket-proxy v0.0.0-20190109142713-0ad062ec5ee5/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U=
105 | github.com/ugorji/go v1.1.4/go.mod h1:uQMGLiO92mf5W77hV/PUCpI3pbzQx3CRekS0kk+RGrc=
106 | github.com/xiang90/probing v0.0.0-20190116061207-43a291ad63a2/go.mod h1:UETIi67q53MR2AWcXfiuqkDkRtnGDLqkBTpCHuJHxtU=
107 | github.com/xordataexchange/crypt v0.0.3-0.20170626215501-b2862e3d0a77/go.mod h1:aYKd//L2LvnjZzWKhF00oedf4jCCReLcmhLdhm1A27Q=
108 | go.etcd.io/bbolt v1.3.2/go.mod h1:IbVyRI1SCnLcuJnV2u8VeU0CEYM7e686BmAb1XKL+uU=
109 | go.uber.org/atomic v1.4.0/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE=
110 | go.uber.org/multierr v1.1.0/go.mod h1:wR5kodmAFQ0UK8QlbwjlSNy0Z68gJhDJUG5sjR94q/0=
111 | go.uber.org/zap v1.10.0/go.mod h1:vwi/ZaCAaUcBkycHslxD9B2zi4UTXhF60s6SWpuDF0Q=
112 | golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
113 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
114 | golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
115 | golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE=
116 | golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc=
117 | golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
118 | golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
119 | golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
120 | golang.org/x/net v0.0.0-20181220203305-927f97764cc3/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
121 | golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
122 | golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
123 | golang.org/x/net v0.0.0-20190522155817-f3200d17e092/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks=
124 | golang.org/x/net v0.0.0-20201010224723-4f7140c49acb h1:mUVeFHoDKis5nxCAzoAi7E8Ghb86EXh/RK6wtvJIqRY=
125 | golang.org/x/net v0.0.0-20201010224723-4f7140c49acb/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
126 | golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
127 | golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
128 | golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
129 | golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
130 | golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
131 | golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
132 | golang.org/x/sys v0.0.0-20181107165924-66b7b1311ac8/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
133 | golang.org/x/sys v0.0.0-20181116152217-5ac8a444bdc5/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
134 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
135 | golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
136 | golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
137 | golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f h1:+Nyd8tzPX9R7BWHguqsrbFdRx3WQ/1ib8I44HXV5yTA=
138 | golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
139 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
140 | golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
141 | golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
142 | golang.org/x/tools v0.0.0-20180221164845-07fd8470d635/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
143 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
144 | golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
145 | golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs=
146 | google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM=
147 | google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc=
148 | google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c=
149 | google.golang.org/grpc v1.21.0/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM=
150 | gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw=
151 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
152 | gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
153 | gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo=
154 | gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
155 | gopkg.in/resty.v1 v1.12.0/go.mod h1:mDo4pnntr5jdWRML875a/NmxYqAlA73dVijT2AXvQQo=
156 | gopkg.in/yaml.v2 v2.0.0-20170812160011-eb3733d160e7/go.mod h1:JAlM8MvJe8wmxCU4Bli9HhUf9+ttbYbLASfIpnQbh74=
157 | gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
158 | gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
159 | gopkg.in/yaml.v2 v2.2.4 h1:/eiJrUcujPVeJ3xlSWaiNi3uSVmDGBK1pDHUHAnao1I=
160 | gopkg.in/yaml.v2 v2.2.4/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
161 | honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
162 |
--------------------------------------------------------------------------------
/extractors/run_beautifulsoup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import gzip
3 | import json
4 | from pathlib import Path
5 |
6 | from bs4 import BeautifulSoup
7 |
8 |
def main():
    """ Extract text from every page in ``html/`` with BeautifulSoup.

    For each ``html/*.html.gz`` file the whole text of the document is taken
    (text nodes stripped and joined with a single space) and stored as
    ``articleBody``; the result is saved to ``output/beautifulsoup.json``.
    """
    articles = {}
    for gz_path in Path('html').glob('*.html.gz'):
        with gzip.open(gz_path, 'rt', encoding='utf8') as source:
            markup = source.read()
        # file name up to the first dot identifies the item
        key = gz_path.stem.split('.')[0]
        soup = BeautifulSoup(markup, 'html.parser')
        articles[key] = {
            'articleBody': soup.get_text(separator=' ', strip=True),
        }
    serialized = json.dumps(
        articles, sort_keys=True, ensure_ascii=False, indent=4)
    target = Path('output') / 'beautifulsoup.json'
    target.write_text(serialized, encoding='utf8')


if __name__ == '__main__':
    main()
25 |
--------------------------------------------------------------------------------
/extractors/run_boilerpipe.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python2
2 | import codecs
3 | import gzip
4 | import json
5 | import glob
6 | import os.path
7 |
8 | from boilerpipe.extract import Extractor
9 |
10 |
def main():
    """ Extract article text from every page in ``html/`` with boilerpipe.

    Each ``html/*.html.gz`` file is decompressed and decoded as utf8, then
    passed to boilerpipe's ``ArticleExtractor``; the extracted text is
    stored as ``articleBody`` and everything is written to
    ``output/boilerpipe.json``.
    """
    articles = {}
    for gz_path in glob.glob('html/*.html.gz'):
        # file name up to the first dot identifies the item
        key = os.path.basename(gz_path).split('.')[0]
        with gzip.open(gz_path, 'rb') as source:
            markup = source.read().decode('utf8')
        extractor = Extractor(extractor='ArticleExtractor', html=markup)
        articles[key] = {'articleBody': extractor.getText()}
    target = os.path.join('output', 'boilerpipe.json')
    with codecs.open(target, 'wt', encoding='utf8') as sink:
        json.dump(
            articles, sink, sort_keys=True, ensure_ascii=False, indent=4)


if __name__ == '__main__':
    main()
26 |
--------------------------------------------------------------------------------
/extractors/run_dragnet.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import gzip
3 | import json
4 | from pathlib import Path
5 |
6 | from dragnet import extract_content
7 |
8 |
def main():
    """ Extract main content from every page in ``html/`` with dragnet.

    The result of ``extract_content`` for each ``html/*.html.gz`` file is
    stored as ``articleBody``; all items are saved to
    ``output/dragnet.json``.
    """
    articles = {}
    for gz_path in Path('html').glob('*.html.gz'):
        with gzip.open(gz_path, 'rt', encoding='utf8') as source:
            markup = source.read()
        # file name up to the first dot identifies the item
        key = gz_path.stem.split('.')[0]
        articles[key] = {
            'articleBody': extract_content(markup, encoding='utf8'),
        }
    serialized = json.dumps(
        articles, sort_keys=True, ensure_ascii=False, indent=4)
    target = Path('output') / 'dragnet.json'
    target.write_text(serialized, encoding='utf8')


if __name__ == '__main__':
    main()
24 |
--------------------------------------------------------------------------------
/extractors/run_go_readability.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import gzip
3 | import json
4 | import os
5 | import subprocess
6 | from pathlib import Path
7 | from tempfile import mkstemp
8 |
9 |
# Location of the built go-readability CLI executable. The path is relative,
# so it is resolved against the directory the script is started from.
CLI_PATH = Path('extractors/go_readability/go_readability_cli')
12 |
13 |
def main():
    """ Run the go-readability CLI on every page in ``html/``.

    Each ``html/*.html.gz`` file is decompressed, written to a temporary
    file and passed to the executable at ``CLI_PATH``. Whatever the
    executable prints to stdout is stored as the ``articleBody`` of that
    item, and all items are saved to ``output/go_readability.json``.
    """
    output = {}
    for path in Path('html').glob('*.html.gz'):
        with gzip.open(path, 'rt', encoding='utf8') as f:
            html = f.read()
        item_id = path.stem.split('.')[0]

        # mkstemp returns an already open descriptor together with the path:
        # write through that descriptor, so that it is closed afterwards
        # instead of leaking one descriptor per page.
        fd, temp_filepath = mkstemp()
        try:
            # save html to temp file, always as utf8 regardless of the locale
            with os.fdopen(fd, 'wt', encoding='utf8') as fw:
                fw.write(html)

            # get extracted content from go-readability; the exit status
            # is not checked, so a failed run contributes only what it
            # managed to print to stdout
            result = subprocess.run(
                [CLI_PATH, temp_filepath], stdout=subprocess.PIPE)
        finally:
            # destroy temp file even if writing or running the CLI failed
            os.remove(temp_filepath)

        output[item_id] = {'articleBody': result.stdout.decode('utf-8')}
    (Path('output') / 'go_readability.json').write_text(
        json.dumps(output, sort_keys=True, ensure_ascii=False, indent=4),
        encoding='utf8')


if __name__ == '__main__':
    main()
40 |
--------------------------------------------------------------------------------
/extractors/run_goose3.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import gzip
3 | import json
4 | from pathlib import Path
5 |
6 | from goose3 import Goose
7 |
8 |
def main():
    """ Extract article text from every page in ``html/`` with goose3.

    The ``cleaned_text`` of the article extracted from each
    ``html/*.html.gz`` file is stored as ``articleBody``; all items are
    saved to ``output/goose3.json``.
    """
    output = {}
    # A single extractor is shared by all pages instead of building a new
    # one per page, and the context manager closes it once all pages are
    # processed.
    with Goose() as g:
        for path in Path('html').glob('*.html.gz'):
            with gzip.open(path, 'rt', encoding='utf8') as f:
                html = f.read()
            item_id = path.stem.split('.')[0]
            article = g.extract(raw_html=html)
            output[item_id] = {'articleBody': article.cleaned_text}
    (Path('output') / 'goose3.json').write_text(
        json.dumps(output, sort_keys=True, ensure_ascii=False, indent=4),
        encoding='utf8')


if __name__ == '__main__':
    main()
25 |
--------------------------------------------------------------------------------
/extractors/run_html2text.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import gzip
3 | import json
4 | from pathlib import Path
5 |
6 | from html2text import HTML2Text
7 |
8 |
def main():
    """ Convert every page in ``html/`` to text with html2text.

    Each ``html/*.html.gz`` file is handled by a fresh ``HTML2Text``
    converter with links and images ignored; the converted text is stored
    as ``articleBody`` and all items are saved to ``output/html2text.json``.
    """
    articles = {}
    for gz_path in Path('html').glob('*.html.gz'):
        with gzip.open(gz_path, 'rt', encoding='utf8') as source:
            markup = source.read()
        # file name up to the first dot identifies the item
        key = gz_path.stem.split('.')[0]
        converter = HTML2Text()
        converter.ignore_links = True
        converter.ignore_images = True
        articles[key] = {'articleBody': converter.handle(markup)}
    serialized = json.dumps(
        articles, sort_keys=True, ensure_ascii=False, indent=4)
    target = Path('output') / 'html2text.json'
    target.write_text(serialized, encoding='utf8')


if __name__ == '__main__':
    main()
27 |
--------------------------------------------------------------------------------
/extractors/run_html_text.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import gzip
3 | import json
4 | from pathlib import Path
5 |
6 | import html_text
7 |
8 |
def main():
    """Extract text from every gzipped page in ``html/`` with html_text.

    The result is written to ``output/html-text.json``, keyed by item id
    (file name up to its first dot).
    """
    results = {}
    for archive in Path('html').glob('*.html.gz'):
        with gzip.open(archive, 'rt', encoding='utf8') as fh:
            markup = fh.read()
        key = archive.stem.partition('.')[0]
        body = html_text.extract_text(markup)
        results[key] = {'articleBody': body}
    serialized = json.dumps(results, sort_keys=True, ensure_ascii=False, indent=4)
    (Path('output') / 'html-text.json').write_text(serialized, encoding='utf8')
19 |
20 |
21 | if __name__ == '__main__':
22 | main()
23 |
--------------------------------------------------------------------------------
/extractors/run_inscriptis.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import gzip
3 | import json
4 | from pathlib import Path
5 |
6 | from inscriptis import get_text
7 |
8 |
def main():
    """Render every gzipped page in ``html/`` to text with inscriptis.

    The result is written to ``output/inscriptis.json``, keyed by item id
    (file name up to its first dot).
    """
    results = {}
    for archive in Path('html').glob('*.html.gz'):
        with gzip.open(archive, 'rt', encoding='utf8') as fh:
            markup = fh.read()
        key = archive.stem.partition('.')[0]
        results[key] = {'articleBody': get_text(markup)}
    serialized = json.dumps(results, sort_keys=True, ensure_ascii=False, indent=4)
    (Path('output') / 'inscriptis.json').write_text(serialized, encoding='utf8')
20 |
21 |
22 | if __name__ == '__main__':
23 | main()
24 |
--------------------------------------------------------------------------------
/extractors/run_justext.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import gzip
3 | import json
4 | from pathlib import Path
5 |
6 | import justext
7 |
8 |
def main():
    """Run jusText over every gzipped page in ``html/`` and save the text.

    Paragraphs that jusText does not flag as boilerplate are joined with
    single spaces. The result is written to ``output/justext.json``, keyed by
    item id (file name up to its first dot).
    """
    output = {}
    # The stoplist is the same for every page, so load it once up front.
    stoplist = justext.get_stoplist("English")
    for path in Path('html').glob('*.html.gz'):
        with gzip.open(path, 'rt', encoding='utf8') as f:
            html = f.read()
        item_id = path.stem.split('.')[0]
        # Same tuning values as before, now named so each one is identifiable.
        paragraphs = justext.justext(
            html, stoplist,
            length_low=50,
            length_high=200,
            stopwords_low=0.1,
            stopwords_high=0.2,
            max_link_density=0.2,
            max_heading_distance=200,
            no_headings=True)
        article = ' '.join(p.text for p in paragraphs if not p.is_boilerplate)
        output[item_id] = {'articleBody': article}
    (Path('output') / 'justext.json').write_text(
        json.dumps(output, sort_keys=True, ensure_ascii=False, indent=4),
        encoding='utf8')
22 |
23 |
24 | if __name__ == '__main__':
25 | main()
26 |
--------------------------------------------------------------------------------
/extractors/run_news_please.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import gzip
3 | import json
4 | from pathlib import Path
5 |
6 | from newsplease import NewsPlease
7 |
8 |
def main():
    """Run news-please over every gzipped page in ``html/`` and save the text.

    The result is written to ``output/news_please.json``, keyed by item id
    (file name up to its first dot).
    """
    output = {}
    for path in Path('html').glob('*.html.gz'):
        with gzip.open(path, 'rt', encoding='utf8') as f:
            html = f.read()
        item_id = path.stem.split('.')[0]
        article = NewsPlease.from_html(html, url=None)
        # Fall back to an empty string when no main text was found, so
        # 'articleBody' is always a string rather than JSON null.
        output[item_id] = {'articleBody': article.maintext or ''}
    (Path('output') / 'news_please.json').write_text(
        json.dumps(output, sort_keys=True, ensure_ascii=False, indent=4),
        encoding='utf8')
20 |
21 |
22 | if __name__ == '__main__':
23 | main()
24 |
--------------------------------------------------------------------------------
/extractors/run_newspaper.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import gzip
3 | import json
4 | from pathlib import Path
5 |
6 | from newspaper import Article
7 |
8 |
def main():
    """Run newspaper over every gzipped page in ``html/`` and save the text.

    Each ``Article`` is created with the page URL recorded in
    ``ground-truth.json`` and fed the stored HTML. The result is written to
    ``output/newspaper.json``, keyed by item id (file name up to its first
    dot).
    """
    ground_truth = json.loads(Path('ground-truth.json').read_text('utf8'))
    urls = {}
    for key, entry in ground_truth.items():
        urls[key] = entry['url']

    results = {}
    for archive in Path('html').glob('*.html.gz'):
        with gzip.open(archive, 'rt', encoding='utf8') as fh:
            markup = fh.read()
        key = archive.stem.partition('.')[0]
        parsed = Article(urls[key])
        parsed.set_html(markup)
        parsed.parse()
        results[key] = {'articleBody': parsed.text}
    serialized = json.dumps(results, sort_keys=True, ensure_ascii=False, indent=4)
    (Path('output') / 'newspaper.json').write_text(serialized, encoding='utf8')
24 |
25 |
26 | if __name__ == '__main__':
27 | main()
28 |
--------------------------------------------------------------------------------
/extractors/run_readability.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import gzip
3 | import json
4 | from pathlib import Path
5 |
6 | import html_text
7 | from readability import Document
8 |
9 |
def main():
    """Run readability over every gzipped page in ``html/`` and save the text.

    The partial-HTML summary produced by readability is converted to plain
    text with html_text. The result is written to
    ``output/readability.json``, keyed by item id (file name up to its first
    dot).
    """
    results = {}
    for archive in Path('html').glob('*.html.gz'):
        with gzip.open(archive, 'rt', encoding='utf8') as fh:
            markup = fh.read()
        key = archive.stem.partition('.')[0]
        summary_html = Document(markup).summary(html_partial=True)
        results[key] = {'articleBody': html_text.extract_text(summary_html)}
    serialized = json.dumps(results, sort_keys=True, ensure_ascii=False, indent=4)
    (Path('output') / 'readability.json').write_text(serialized, encoding='utf8')
22 |
23 |
24 | if __name__ == '__main__':
25 | main()
26 |
--------------------------------------------------------------------------------
/extractors/run_readability_js.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import gzip
3 | import json
4 | import os
5 | import subprocess
6 | from pathlib import Path
7 | from tempfile import mkstemp
8 |
9 |
10 | # executable file from `readability-cli` package
11 | CLI_PATH = Path('/usr/local/bin/readable')
12 |
13 |
def main():
    """Run the ``readable`` CLI over every gzipped page in ``html/``.

    Each page is written to a temporary file, passed to the executable at
    ``CLI_PATH``, and the captured stdout is stored as the article body. The
    result is written to ``output/readability_js.json``, keyed by item id
    (file name up to its first dot).
    """
    output = {}
    for path in Path('html').glob('*.html.gz'):
        with gzip.open(path, 'rt', encoding='utf8') as f:
            html = f.read()
        item_id = path.stem.split('.')[0]

        # save html to temp file; mkstemp returns an already-open descriptor,
        # so wrap it rather than opening the path a second time and leaking it
        fd, temp_filepath = mkstemp()
        try:
            # write UTF-8 explicitly so the result does not depend on the locale
            with os.fdopen(fd, 'w', encoding='utf8') as fw:
                fw.write(html)

            # get extracted content from Readability.js (use readability-cli)
            result = subprocess.run(
                [CLI_PATH, temp_filepath, '--properties=text-content', '--low-confidence=force'],
                stdout=subprocess.PIPE
            )
        finally:
            # destroy temp file even if writing or the CLI call raised
            os.remove(temp_filepath)

        output[item_id] = {'articleBody': result.stdout.decode('utf-8')}
    (Path('output') / 'readability_js.json').write_text(
        json.dumps(output, sort_keys=True, ensure_ascii=False, indent=4),
        encoding='utf8')
39 |
40 |
41 | if __name__ == '__main__':
42 | main()
43 |
--------------------------------------------------------------------------------
/extractors/run_trafilatura.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import gzip
3 | import json
4 | from pathlib import Path
5 |
6 | import trafilatura
7 |
8 |
def main():
    """Run trafilatura over every gzipped page in ``html/`` and save the text.

    Comments are excluded from extraction. The result is written to
    ``output/trafilatura.json``, keyed by item id (file name up to its first
    dot).
    """
    output = {}
    for path in Path('html').glob('*.html.gz'):
        with gzip.open(path, 'rt', encoding='utf8') as f:
            html = f.read()
        item_id = path.stem.split('.')[0]
        # extract() may yield None when nothing is found; store an empty
        # string so 'articleBody' is always a string rather than JSON null.
        text = trafilatura.extract(html, include_comments=False) or ''
        output[item_id] = {'articleBody': text}
    (Path('output') / 'trafilatura.json').write_text(
        json.dumps(output, sort_keys=True, ensure_ascii=False, indent=4),
        encoding='utf8')
19 |
20 |
21 | if __name__ == '__main__':
22 | main()
23 |
--------------------------------------------------------------------------------
/extractors/run_xpath_text.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import gzip
3 | import json
4 | from pathlib import Path
5 |
6 | import lxml.html
7 |
8 |
def xpath_text(html: str) -> str:
    """Return every text node of the page joined by single spaces.

    Text is collected from the first ``<body>`` element when the parsed tree
    contains one, otherwise from the root of the parsed tree.
    """
    tree = lxml.html.fromstring(html)
    body_nodes = tree.xpath('//body')
    scope = body_nodes[0] if body_nodes else tree
    text_nodes = scope.xpath('.//text()')
    return ' '.join(text_nodes)
15 |
16 |
def main():
    """Apply ``xpath_text`` to every gzipped page in ``html/``.

    The result is written to ``output/xpath-text.json``, keyed by item id
    (file name up to its first dot).
    """
    results = {}
    for archive in Path('html').glob('*.html.gz'):
        with gzip.open(archive, 'rt', encoding='utf8') as fh:
            markup = fh.read()
        key = archive.stem.partition('.')[0]
        results[key] = {'articleBody': xpath_text(markup)}
    serialized = json.dumps(results, sort_keys=True, ensure_ascii=False, indent=4)
    (Path('output') / 'xpath-text.json').write_text(serialized, encoding='utf8')
27 |
28 |
29 | if __name__ == '__main__':
30 | main()
31 |
--------------------------------------------------------------------------------
/html/042bb7b5fedab6eac7db576522b89b93904c237d344bcbe14a6a5ab7f7335856.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/042bb7b5fedab6eac7db576522b89b93904c237d344bcbe14a6a5ab7f7335856.html.gz
--------------------------------------------------------------------------------
/html/04a6711caa7c687592777718866e781e976e0fe684faebe8b3cedcef8cd0ea34.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/04a6711caa7c687592777718866e781e976e0fe684faebe8b3cedcef8cd0ea34.html.gz
--------------------------------------------------------------------------------
/html/05844573ca7e1fba714d715bb11ca08c26e25328999c74a1cb3bc8a0e4399f0f.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/05844573ca7e1fba714d715bb11ca08c26e25328999c74a1cb3bc8a0e4399f0f.html.gz
--------------------------------------------------------------------------------
/html/06e5123e4ef7cfb4533250dc45d1e03d0838fc66223f45c583c4d12f48b4da85.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/06e5123e4ef7cfb4533250dc45d1e03d0838fc66223f45c583c4d12f48b4da85.html.gz
--------------------------------------------------------------------------------
/html/06ee193de4bd611f7fafbab0c59b0f6fe3495093516720632cd093b24c7a0e98.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/06ee193de4bd611f7fafbab0c59b0f6fe3495093516720632cd093b24c7a0e98.html.gz
--------------------------------------------------------------------------------
/html/076f4f33bf75059db581bedf36e76fb65e89a8f7752db3339aa3ea11c5122f32.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/076f4f33bf75059db581bedf36e76fb65e89a8f7752db3339aa3ea11c5122f32.html.gz
--------------------------------------------------------------------------------
/html/08f793762792bd252c75fb57544cdf506ffcc04785136cb87503f02364b82b56.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/08f793762792bd252c75fb57544cdf506ffcc04785136cb87503f02364b82b56.html.gz
--------------------------------------------------------------------------------
/html/098bb3e96c0acdf36efdcde45fb9cca3f8c82c7cb2071b76097a1b96155f1eb2.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/098bb3e96c0acdf36efdcde45fb9cca3f8c82c7cb2071b76097a1b96155f1eb2.html.gz
--------------------------------------------------------------------------------
/html/0d46122928b6f468cc4bbc694051d0dbae5702bc75a16dab82a99b58daf150a0.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/0d46122928b6f468cc4bbc694051d0dbae5702bc75a16dab82a99b58daf150a0.html.gz
--------------------------------------------------------------------------------
/html/0dd1357045727799a447563fd8851f4ebe79f042073ea16991a9b67aa595f81a.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/0dd1357045727799a447563fd8851f4ebe79f042073ea16991a9b67aa595f81a.html.gz
--------------------------------------------------------------------------------
/html/0e014df693f182824fe5e24030ddbe1d0b96ddb9685cf20d5766457ed32ffa2d.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/0e014df693f182824fe5e24030ddbe1d0b96ddb9685cf20d5766457ed32ffa2d.html.gz
--------------------------------------------------------------------------------
/html/0ec95c7261d122f304728e90c983450ef1ce1e0b423546835c397d50aaf0d0f2.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/0ec95c7261d122f304728e90c983450ef1ce1e0b423546835c397d50aaf0d0f2.html.gz
--------------------------------------------------------------------------------
/html/11ea381ad92b5448cf66eae62f52ac565361a244c8881615fc6a7bb523cc0c32.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/11ea381ad92b5448cf66eae62f52ac565361a244c8881615fc6a7bb523cc0c32.html.gz
--------------------------------------------------------------------------------
/html/14cc2a0ca59c62a8c9f205a171e9ccf4ef4cf69b0c642f51c8c65c051b39024f.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/14cc2a0ca59c62a8c9f205a171e9ccf4ef4cf69b0c642f51c8c65c051b39024f.html.gz
--------------------------------------------------------------------------------
/html/156770d676ce79905198e1c8407f81e5ecfb617d9aa44712718707eb7e3b8e38.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/156770d676ce79905198e1c8407f81e5ecfb617d9aa44712718707eb7e3b8e38.html.gz
--------------------------------------------------------------------------------
/html/16c30add7e96315e9cc957d85aa876ccb6b70055f0ddab51547a586117cc1f56.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/16c30add7e96315e9cc957d85aa876ccb6b70055f0ddab51547a586117cc1f56.html.gz
--------------------------------------------------------------------------------
/html/1ace8c85aaee21b9d4505eca506d50c4721c29db62848b567a9703bfe0583892.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/1ace8c85aaee21b9d4505eca506d50c4721c29db62848b567a9703bfe0583892.html.gz
--------------------------------------------------------------------------------
/html/1ee91d1fce65e09be8b8d2d29eab771546d98ca2ba5c862941e660e9fec12432.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/1ee91d1fce65e09be8b8d2d29eab771546d98ca2ba5c862941e660e9fec12432.html.gz
--------------------------------------------------------------------------------
/html/1f765c48780665e89cc3af1f7c9af47876e9fae9b5be4a936b0649e10f5e3198.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/1f765c48780665e89cc3af1f7c9af47876e9fae9b5be4a936b0649e10f5e3198.html.gz
--------------------------------------------------------------------------------
/html/20b2b64916b00b25203c9f1bf14248922f4d522f18328e9f876cce116df0083e.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/20b2b64916b00b25203c9f1bf14248922f4d522f18328e9f876cce116df0083e.html.gz
--------------------------------------------------------------------------------
/html/21486419bb109c5a62a68957f528e6ff29c92f58d8d3c1f2837c86ff3f3e11f9.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/21486419bb109c5a62a68957f528e6ff29c92f58d8d3c1f2837c86ff3f3e11f9.html.gz
--------------------------------------------------------------------------------
/html/232a43fb15abde807427b2a7bf4f772e27b8760554370956d8291df4e8166dbf.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/232a43fb15abde807427b2a7bf4f772e27b8760554370956d8291df4e8166dbf.html.gz
--------------------------------------------------------------------------------
/html/23aaecd14171f96cfd201a8a46666097e286ad71f74f29347a78c5ecba50da1e.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/23aaecd14171f96cfd201a8a46666097e286ad71f74f29347a78c5ecba50da1e.html.gz
--------------------------------------------------------------------------------
/html/264dc3ae31249cb1f50c50986e0952a4708c2e705d18a2d8bf0e525da6e2b485.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/264dc3ae31249cb1f50c50986e0952a4708c2e705d18a2d8bf0e525da6e2b485.html.gz
--------------------------------------------------------------------------------
/html/287e4d9f4af31733aad6534aefb2bd00fb344ec8d6ebf1ac99dbc4d762da0ca4.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/287e4d9f4af31733aad6534aefb2bd00fb344ec8d6ebf1ac99dbc4d762da0ca4.html.gz
--------------------------------------------------------------------------------
/html/291a8bf33ee49074f33dcff37544ac40506cae450db83b6cb63f02b9920b51c2.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/291a8bf33ee49074f33dcff37544ac40506cae450db83b6cb63f02b9920b51c2.html.gz
--------------------------------------------------------------------------------
/html/2c46804d9db4a85e8f8d31128ce0e11d02f25c7120c2faa5ec0664c604a47717.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/2c46804d9db4a85e8f8d31128ce0e11d02f25c7120c2faa5ec0664c604a47717.html.gz
--------------------------------------------------------------------------------
/html/2f42ef1d3ea0c96e56355d3db93d0e06b47e760b74f6f4261278b8cd1c246dd6.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/2f42ef1d3ea0c96e56355d3db93d0e06b47e760b74f6f4261278b8cd1c246dd6.html.gz
--------------------------------------------------------------------------------
/html/30b771a40a4e96156d398716c877deef54b05d091770d2717c98e4c6b670010c.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/30b771a40a4e96156d398716c877deef54b05d091770d2717c98e4c6b670010c.html.gz
--------------------------------------------------------------------------------
/html/3252222e61fe78982cffe0b0bad2b089c27b32f65852d1c5d3951517f3c2e295.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/3252222e61fe78982cffe0b0bad2b089c27b32f65852d1c5d3951517f3c2e295.html.gz
--------------------------------------------------------------------------------
/html/33fe2471fd553c6570f93997f208b4f39bf30be5947c3cfa620ee8eff3355ab9.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/33fe2471fd553c6570f93997f208b4f39bf30be5947c3cfa620ee8eff3355ab9.html.gz
--------------------------------------------------------------------------------
/html/34a7328535ad4e60b059f81d37eec5d25c2bc8de759ce9a7b5e47ac7dc6fd1b0.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/34a7328535ad4e60b059f81d37eec5d25c2bc8de759ce9a7b5e47ac7dc6fd1b0.html.gz
--------------------------------------------------------------------------------
/html/358cc4a080456476b0f883c56bdce796874c286ed6efab25f5718dd95fab42a8.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/358cc4a080456476b0f883c56bdce796874c286ed6efab25f5718dd95fab42a8.html.gz
--------------------------------------------------------------------------------
/html/359fee228518d55b921194561e9ca88e428df81940246f8fac7a75398377daea.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/359fee228518d55b921194561e9ca88e428df81940246f8fac7a75398377daea.html.gz
--------------------------------------------------------------------------------
/html/35b158918c676ff2c74445517db76c83db70a805cc50b64e1369b354a027fcbd.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/35b158918c676ff2c74445517db76c83db70a805cc50b64e1369b354a027fcbd.html.gz
--------------------------------------------------------------------------------
/html/360c732d1fdbfc6895d7096c0c0b8c0d581bb1af80160f4c6a0f1fd9ff85e469.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/360c732d1fdbfc6895d7096c0c0b8c0d581bb1af80160f4c6a0f1fd9ff85e469.html.gz
--------------------------------------------------------------------------------
/html/374ac9a59a85196cdacc1679fb8993521a7b7d9d6533720f102300be1c7face4.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/374ac9a59a85196cdacc1679fb8993521a7b7d9d6533720f102300be1c7face4.html.gz
--------------------------------------------------------------------------------
/html/39d5c43beb60605c3eec760c99500e62e7bd71ebbe4ae05edf382125e1b0b80a.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/39d5c43beb60605c3eec760c99500e62e7bd71ebbe4ae05edf382125e1b0b80a.html.gz
--------------------------------------------------------------------------------
/html/3c5bf8db4272925bf1dd5713fc325e179fd0d1cc6fb8c77aa2d917cfd2518a32.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/3c5bf8db4272925bf1dd5713fc325e179fd0d1cc6fb8c77aa2d917cfd2518a32.html.gz
--------------------------------------------------------------------------------
/html/3c6d3381ef52ca26be2fbde19c1b0fe17d85682b726dfecf5e300c1ca34546b1.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/3c6d3381ef52ca26be2fbde19c1b0fe17d85682b726dfecf5e300c1ca34546b1.html.gz
--------------------------------------------------------------------------------
/html/3cb22bfabed8de715c0813a7bb5052363c96bd71ccce3bb2dfb3ab9d1d7a9bbc.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/3cb22bfabed8de715c0813a7bb5052363c96bd71ccce3bb2dfb3ab9d1d7a9bbc.html.gz
--------------------------------------------------------------------------------
/html/3cb5e2f46626d5bb0345759453036f7eabc0b0c7796b796513606bf693060ced.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/3cb5e2f46626d5bb0345759453036f7eabc0b0c7796b796513606bf693060ced.html.gz
--------------------------------------------------------------------------------
/html/3ce1c8fdf6ad2ded9e48a68be71eb069fc453ef1b75f47698428a1fdda0deb24.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/3ce1c8fdf6ad2ded9e48a68be71eb069fc453ef1b75f47698428a1fdda0deb24.html.gz
--------------------------------------------------------------------------------
/html/3d8f3404cf975af824d7866b7679bc45189c3eea6adb32f0a125a0904b1abbb2.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/3d8f3404cf975af824d7866b7679bc45189c3eea6adb32f0a125a0904b1abbb2.html.gz
--------------------------------------------------------------------------------
/html/3f65af7b6b98b1c9ae9a3e0d8a09a85600cdc44e26e4b3a6db96a31f4b1767e3.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/3f65af7b6b98b1c9ae9a3e0d8a09a85600cdc44e26e4b3a6db96a31f4b1767e3.html.gz
--------------------------------------------------------------------------------
/html/4219d096902dad9fd9d57e881e7928ca66bdf5334c2bc7dfddaa264887777a7a.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/4219d096902dad9fd9d57e881e7928ca66bdf5334c2bc7dfddaa264887777a7a.html.gz
--------------------------------------------------------------------------------
/html/42aad16bde9288623543642a9ce1a396be83e2db44aa2ff8cbbfe46e14abd7cc.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/42aad16bde9288623543642a9ce1a396be83e2db44aa2ff8cbbfe46e14abd7cc.html.gz
--------------------------------------------------------------------------------
/html/432362af0be43f6da757ea778bd7f2f000094a565bdebac5af7442987a5372f3.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/432362af0be43f6da757ea778bd7f2f000094a565bdebac5af7442987a5372f3.html.gz
--------------------------------------------------------------------------------
/html/4648a420af9984d45b76a4afedf4f74965f8a2e0bf1c69bd3da2dc189020f3c9.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/4648a420af9984d45b76a4afedf4f74965f8a2e0bf1c69bd3da2dc189020f3c9.html.gz
--------------------------------------------------------------------------------
/html/4a44ab3e4c41d56ce9b79eb07acb06aed1bc52aba68a950f06e7de7ef848400a.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/4a44ab3e4c41d56ce9b79eb07acb06aed1bc52aba68a950f06e7de7ef848400a.html.gz
--------------------------------------------------------------------------------
/html/51374560f40088e227f0053ff1bb0b8525d10a8d7bfbff1cd6033f42347fd85b.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/51374560f40088e227f0053ff1bb0b8525d10a8d7bfbff1cd6033f42347fd85b.html.gz
--------------------------------------------------------------------------------
/html/51d066b0602c9421d8d6410bc4b931700978409a3faa2a984e8fbde519ad7241.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/51d066b0602c9421d8d6410bc4b931700978409a3faa2a984e8fbde519ad7241.html.gz
--------------------------------------------------------------------------------
/html/5211188428849a31e309ef2475746563ff788b1591c89818c08d5abedec4ef5e.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/5211188428849a31e309ef2475746563ff788b1591c89818c08d5abedec4ef5e.html.gz
--------------------------------------------------------------------------------
/html/55bb6340e3d7dd8632ba45179ae43c39f8ad0cfcecb4719e3b9cf6106ffb70a3.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/55bb6340e3d7dd8632ba45179ae43c39f8ad0cfcecb4719e3b9cf6106ffb70a3.html.gz
--------------------------------------------------------------------------------
/html/57b4dafd18cfd0531b69f81e87158648227c673ef159f8d8c87d34e34bdb21f2.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/57b4dafd18cfd0531b69f81e87158648227c673ef159f8d8c87d34e34bdb21f2.html.gz
--------------------------------------------------------------------------------
/html/57d46c9d751e3fd3ffaf3ede7ac20cebd30eacb5ea78e1a6aa0a72059244e7ca.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/57d46c9d751e3fd3ffaf3ede7ac20cebd30eacb5ea78e1a6aa0a72059244e7ca.html.gz
--------------------------------------------------------------------------------
/html/57e2e98887a1965689955921208e32f410b10e2b95c907e74e57982d3edf3cc6.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/57e2e98887a1965689955921208e32f410b10e2b95c907e74e57982d3edf3cc6.html.gz
--------------------------------------------------------------------------------
/html/5a822960e9a2cb1e664d334b6c936c5cb6e41fb5331877538c2c8339cb59d57e.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/5a822960e9a2cb1e664d334b6c936c5cb6e41fb5331877538c2c8339cb59d57e.html.gz
--------------------------------------------------------------------------------
/html/5ae11e580afc12d3ba1a12944281e6a7a5dded5c98b4efcf24aedcb28f0d5b22.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/5ae11e580afc12d3ba1a12944281e6a7a5dded5c98b4efcf24aedcb28f0d5b22.html.gz
--------------------------------------------------------------------------------
/html/5caf91b8a4423735f866b089d2611ea14503584cf3b6f487c6d26eb7b9521fca.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/5caf91b8a4423735f866b089d2611ea14503584cf3b6f487c6d26eb7b9521fca.html.gz
--------------------------------------------------------------------------------
/html/5f03fc173ebc6abdfae50b96ce0b05a6137b7d3f2ef379be35a9bb8ca9f49e87.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/5f03fc173ebc6abdfae50b96ce0b05a6137b7d3f2ef379be35a9bb8ca9f49e87.html.gz
--------------------------------------------------------------------------------
/html/5f9c5ed5d64dfe682d9bde13b9b4f032a3ebdbf165c06ec49c0705bcbe106e3b.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/5f9c5ed5d64dfe682d9bde13b9b4f032a3ebdbf165c06ec49c0705bcbe106e3b.html.gz
--------------------------------------------------------------------------------
/html/5fa3154ec031ab35411a457d78eb5aa92c0e803c5329bd05c001e6d64009e206.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/5fa3154ec031ab35411a457d78eb5aa92c0e803c5329bd05c001e6d64009e206.html.gz
--------------------------------------------------------------------------------
/html/5fa5679de56c43edf70685762c2d1f2de296432ae53aa46e075b552fee17cab8.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/5fa5679de56c43edf70685762c2d1f2de296432ae53aa46e075b552fee17cab8.html.gz
--------------------------------------------------------------------------------
/html/5fbc7ccb504c755ae23a85499a17518483d7862b74b4a5c34d86ede1a1a4448e.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/5fbc7ccb504c755ae23a85499a17518483d7862b74b4a5c34d86ede1a1a4448e.html.gz
--------------------------------------------------------------------------------
/html/612cd29826624e68ce96789c8049e16279dfd2fceb27434eea7943b2aaf84e90.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/612cd29826624e68ce96789c8049e16279dfd2fceb27434eea7943b2aaf84e90.html.gz
--------------------------------------------------------------------------------
/html/624fcd903d56fc7055fa7097b330629450c095ad6937318deb027be7803bbf35.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/624fcd903d56fc7055fa7097b330629450c095ad6937318deb027be7803bbf35.html.gz
--------------------------------------------------------------------------------
/html/63db31a161b3c5b64e88c2978635cbc38d342ba82fd2c5335321203dcc55c76f.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/63db31a161b3c5b64e88c2978635cbc38d342ba82fd2c5335321203dcc55c76f.html.gz
--------------------------------------------------------------------------------
/html/65408257dbe4b41f71a35ade24e30243265095fc1d4988a35b9a6ca52f2b4eab.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/65408257dbe4b41f71a35ade24e30243265095fc1d4988a35b9a6ca52f2b4eab.html.gz
--------------------------------------------------------------------------------
/html/65bf3048b500bbd84928d9122f99617ca898216b91add1d8b2ac09c670484a5c.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/65bf3048b500bbd84928d9122f99617ca898216b91add1d8b2ac09c670484a5c.html.gz
--------------------------------------------------------------------------------
/html/65ce3a4577a0306994efa190a0d96e84014f9d4257ad54753e807ede518f02c0.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/65ce3a4577a0306994efa190a0d96e84014f9d4257ad54753e807ede518f02c0.html.gz
--------------------------------------------------------------------------------
/html/680c2848e94a96f961a0964631de94ac572f83c45bfd0bec2deafa893bcfe15c.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/680c2848e94a96f961a0964631de94ac572f83c45bfd0bec2deafa893bcfe15c.html.gz
--------------------------------------------------------------------------------
/html/686bb170effe273eaff1c0f88e412172e8d972518a6d1454c896f52aafaa9643.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/686bb170effe273eaff1c0f88e412172e8d972518a6d1454c896f52aafaa9643.html.gz
--------------------------------------------------------------------------------
/html/6a72de37e8f98f4eee6c0821e593b35ce536cef6c8b424c5e1dd747ebe6621ba.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/6a72de37e8f98f4eee6c0821e593b35ce536cef6c8b424c5e1dd747ebe6621ba.html.gz
--------------------------------------------------------------------------------
/html/6ebac05f637ece8aa57c298a2a5e3a8047f546f855d0f29cc683cea60ce85c85.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/6ebac05f637ece8aa57c298a2a5e3a8047f546f855d0f29cc683cea60ce85c85.html.gz
--------------------------------------------------------------------------------
/html/702d1da63b8e064cb70617620e45c2d116b4912c9bc9d518dcf5ce54bb8057ed.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/702d1da63b8e064cb70617620e45c2d116b4912c9bc9d518dcf5ce54bb8057ed.html.gz
--------------------------------------------------------------------------------
/html/70cb2d5bca75ab5a8f6bb378a38a52f882f6bda508de93b12502e74936d86ff2.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/70cb2d5bca75ab5a8f6bb378a38a52f882f6bda508de93b12502e74936d86ff2.html.gz
--------------------------------------------------------------------------------
/html/776a1c046798b474e410f6edf3225d6a27fecd0de6aac22aef7b7f64fe87caaf.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/776a1c046798b474e410f6edf3225d6a27fecd0de6aac22aef7b7f64fe87caaf.html.gz
--------------------------------------------------------------------------------
/html/7837c9d66c815b9a21dd669a3dc21677c3f084b1b7dd603d56e87867d8970dd3.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/7837c9d66c815b9a21dd669a3dc21677c3f084b1b7dd603d56e87867d8970dd3.html.gz
--------------------------------------------------------------------------------
/html/785affa2c34e6e4844ef080e98e1a1e532eeeb671bdacebfb9e98ad7320ff382.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/785affa2c34e6e4844ef080e98e1a1e532eeeb671bdacebfb9e98ad7320ff382.html.gz
--------------------------------------------------------------------------------
/html/7916ecca969ffdd8f6fc32d171fbe0dd63db40fe4c1d2ade02b1dec5929a162f.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/7916ecca969ffdd8f6fc32d171fbe0dd63db40fe4c1d2ade02b1dec5929a162f.html.gz
--------------------------------------------------------------------------------
/html/7a457a4f71735c17b8b34fafc88835d225cf879b2d812311857a64cfc891eee9.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/7a457a4f71735c17b8b34fafc88835d225cf879b2d812311857a64cfc891eee9.html.gz
--------------------------------------------------------------------------------
/html/7a664e40d256470fdb12d10c3f8d1c6db0581e9b080c71765e55f273a3ac7d03.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/7a664e40d256470fdb12d10c3f8d1c6db0581e9b080c71765e55f273a3ac7d03.html.gz
--------------------------------------------------------------------------------
/html/7ab16ade32386ece353b8d31fc3bb7e660189efd5bf3c8549aaba101ad3f5ef5.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/7ab16ade32386ece353b8d31fc3bb7e660189efd5bf3c8549aaba101ad3f5ef5.html.gz
--------------------------------------------------------------------------------
/html/7bb1ca90354313840329d2f569ea9fb3a582df2aa0a5e3669f8fc567eb6ea61b.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/7bb1ca90354313840329d2f569ea9fb3a582df2aa0a5e3669f8fc567eb6ea61b.html.gz
--------------------------------------------------------------------------------
/html/7de5241947a5f7147fe9787c6f6fa16685bfe66e6c35510a68780f27690dc4f0.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/7de5241947a5f7147fe9787c6f6fa16685bfe66e6c35510a68780f27690dc4f0.html.gz
--------------------------------------------------------------------------------
/html/7dfc3e359d7c0ca48ac9046ae5759286cedf80abe7526fc6c6e6546b9ba43e33.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/7dfc3e359d7c0ca48ac9046ae5759286cedf80abe7526fc6c6e6546b9ba43e33.html.gz
--------------------------------------------------------------------------------
/html/7f93c1944a41d01960f8a16fdfda6c562e86f04ead8375ab796c4278402df9a8.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/7f93c1944a41d01960f8a16fdfda6c562e86f04ead8375ab796c4278402df9a8.html.gz
--------------------------------------------------------------------------------
/html/8267acacb9e4a109b1f7ee7bafe735b73e9c94180b703b131f9e90c9be044f39.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/8267acacb9e4a109b1f7ee7bafe735b73e9c94180b703b131f9e90c9be044f39.html.gz
--------------------------------------------------------------------------------
/html/82b6d780c792df78dcfb00484d50c86fbc7f324a9eb5835b7615f028edb9a574.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/82b6d780c792df78dcfb00484d50c86fbc7f324a9eb5835b7615f028edb9a574.html.gz
--------------------------------------------------------------------------------
/html/833caf3bdba53dcf48de273cf646370eebe9ac565744b0d0e941e298e1b79730.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/833caf3bdba53dcf48de273cf646370eebe9ac565744b0d0e941e298e1b79730.html.gz
--------------------------------------------------------------------------------
/html/8380689f358c1e3a0f6fca6e11ed13e5304a74060139f7a584347db213950446.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/8380689f358c1e3a0f6fca6e11ed13e5304a74060139f7a584347db213950446.html.gz
--------------------------------------------------------------------------------
/html/851498a2b9f4f0b578ac9700c245253dbc147a06c0fb3499adebf1c2d5663c29.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/851498a2b9f4f0b578ac9700c245253dbc147a06c0fb3499adebf1c2d5663c29.html.gz
--------------------------------------------------------------------------------
/html/85439e26c41c75901820d01a13e8cea7836abb58635ea3986f71a163ab0311d3.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/85439e26c41c75901820d01a13e8cea7836abb58635ea3986f71a163ab0311d3.html.gz
--------------------------------------------------------------------------------
/html/8634d1211c3f2b73041e6cadd5d59676619838949999a83a23c51a3195b44892.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/8634d1211c3f2b73041e6cadd5d59676619838949999a83a23c51a3195b44892.html.gz
--------------------------------------------------------------------------------
/html/87438a0dacbeb979e72522f42b9020048da13dc5a079477114190c8855701b7f.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/87438a0dacbeb979e72522f42b9020048da13dc5a079477114190c8855701b7f.html.gz
--------------------------------------------------------------------------------
/html/87bf60570e6e2e33cb1f0fdb5600d6c85012e60be25ba6fa587b8f90eb9a3770.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/87bf60570e6e2e33cb1f0fdb5600d6c85012e60be25ba6fa587b8f90eb9a3770.html.gz
--------------------------------------------------------------------------------
/html/88c328b68b038a625b4b3f8c322215caa30b0e88af0754bd71056ffc15c7b4b7.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/88c328b68b038a625b4b3f8c322215caa30b0e88af0754bd71056ffc15c7b4b7.html.gz
--------------------------------------------------------------------------------
/html/8b194530308204139d9c8f7d495a26b117c78756ac1802cfc3c0a8bfdf2c0d50.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/8b194530308204139d9c8f7d495a26b117c78756ac1802cfc3c0a8bfdf2c0d50.html.gz
--------------------------------------------------------------------------------
/html/8cad00dc22de45ba42e9540421b5f78333f7ac57b385d69acb27a53b9fd69f0c.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/8cad00dc22de45ba42e9540421b5f78333f7ac57b385d69acb27a53b9fd69f0c.html.gz
--------------------------------------------------------------------------------
/html/8e3efab59f48fd29a1e1e7aa135880c4251a9f090f94999668cdbaec59d30b5a.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/8e3efab59f48fd29a1e1e7aa135880c4251a9f090f94999668cdbaec59d30b5a.html.gz
--------------------------------------------------------------------------------
/html/921019755f4a96ac4abf9dbcb4ef9d5ac202624a542d5ea70912330aa6fcc71f.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/921019755f4a96ac4abf9dbcb4ef9d5ac202624a542d5ea70912330aa6fcc71f.html.gz
--------------------------------------------------------------------------------
/html/94fbcc26772088646cb977cecf1abc4012847a1f6927d09505cbf0c3d417ba07.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/94fbcc26772088646cb977cecf1abc4012847a1f6927d09505cbf0c3d417ba07.html.gz
--------------------------------------------------------------------------------
/html/95301fb7883e0ee5214d1111554d30dd97e08c6380d7699369c0b9c15f42e6aa.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/95301fb7883e0ee5214d1111554d30dd97e08c6380d7699369c0b9c15f42e6aa.html.gz
--------------------------------------------------------------------------------
/html/961bd85ca85aaf791b278cc4a60058e92d57c4f32a3411cf8e7d802af183c926.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/961bd85ca85aaf791b278cc4a60058e92d57c4f32a3411cf8e7d802af183c926.html.gz
--------------------------------------------------------------------------------
/html/9a440270bf8625d586039dfae1b8df409b467524e075124cd7a5424a5806901b.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/9a440270bf8625d586039dfae1b8df409b467524e075124cd7a5424a5806901b.html.gz
--------------------------------------------------------------------------------
/html/9cb8224b660f36c932823ab613fb76a07928fcbc41956c4c1f96f4ecab9202aa.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/9cb8224b660f36c932823ab613fb76a07928fcbc41956c4c1f96f4ecab9202aa.html.gz
--------------------------------------------------------------------------------
/html/9da36ae4714bfccc72374c6c146e9d1cd3cca39e2110bd67ccdbcc806f4cf139.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/9da36ae4714bfccc72374c6c146e9d1cd3cca39e2110bd67ccdbcc806f4cf139.html.gz
--------------------------------------------------------------------------------
/html/9e8c9f082a8d77c58c17bda03b6b4bb6a1d6883fe196c252db4ca83b9991e0d3.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/9e8c9f082a8d77c58c17bda03b6b4bb6a1d6883fe196c252db4ca83b9991e0d3.html.gz
--------------------------------------------------------------------------------
/html/9ebb3af65694a953005df5bd3869b2cefc263e1dea0471e3ef361c66a264cdd3.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/9ebb3af65694a953005df5bd3869b2cefc263e1dea0471e3ef361c66a264cdd3.html.gz
--------------------------------------------------------------------------------
/html/9eef8162bbb67b0bd73792313b91b87dc9304f43f85f479e67e71c166417451e.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/9eef8162bbb67b0bd73792313b91b87dc9304f43f85f479e67e71c166417451e.html.gz
--------------------------------------------------------------------------------
/html/a078b3656adc0295d0e37bd4f599342f4a0894da2451e0ef3038ac045434fef3.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/a078b3656adc0295d0e37bd4f599342f4a0894da2451e0ef3038ac045434fef3.html.gz
--------------------------------------------------------------------------------
/html/a1fca19b884e0e946ad3fbe2a7f5031e5e3b23372702a76db302b6143c77cb31.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/a1fca19b884e0e946ad3fbe2a7f5031e5e3b23372702a76db302b6143c77cb31.html.gz
--------------------------------------------------------------------------------
/html/a6968f427cdb786531cfb326518e674bd8b48af94df7c5c6165cdf40e944357a.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/a6968f427cdb786531cfb326518e674bd8b48af94df7c5c6165cdf40e944357a.html.gz
--------------------------------------------------------------------------------
/html/a860fb5eda1ac75df3bc95ba096ade649fdbb1bb566adb9fee3cb13e59f37604.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/a860fb5eda1ac75df3bc95ba096ade649fdbb1bb566adb9fee3cb13e59f37604.html.gz
--------------------------------------------------------------------------------
/html/aadb38e527d5379306de3b910ec62cb2447cc1035686b2b2d152580f8f8a1ea2.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/aadb38e527d5379306de3b910ec62cb2447cc1035686b2b2d152580f8f8a1ea2.html.gz
--------------------------------------------------------------------------------
/html/aade2ec8d1e7b0919aef1001c3ef0573f8a239e22d4d751d8e664f04ea77ef0d.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/aade2ec8d1e7b0919aef1001c3ef0573f8a239e22d4d751d8e664f04ea77ef0d.html.gz
--------------------------------------------------------------------------------
/html/abd9d6291b6bfae0c3ffad8ab7623b482c6da46face0271dc42af6324d8f0ce5.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/abd9d6291b6bfae0c3ffad8ab7623b482c6da46face0271dc42af6324d8f0ce5.html.gz
--------------------------------------------------------------------------------
/html/ac1bfdd4c510f679c58f1b62101630d40fda20a16703235ae0f56b65a465e423.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/ac1bfdd4c510f679c58f1b62101630d40fda20a16703235ae0f56b65a465e423.html.gz
--------------------------------------------------------------------------------
/html/ac3c035520461017a7c5b248d8e39ef063cad4c0c7d7b7ecd68aff8f15099485.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/ac3c035520461017a7c5b248d8e39ef063cad4c0c7d7b7ecd68aff8f15099485.html.gz
--------------------------------------------------------------------------------
/html/ad826691a8a2f9c4ce50cf0b885af933c4b5119c1f6235cd7df1dfb83f255bcc.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/ad826691a8a2f9c4ce50cf0b885af933c4b5119c1f6235cd7df1dfb83f255bcc.html.gz
--------------------------------------------------------------------------------
/html/ad9e9e596f21a6812fae27b5d9d622359826c368e471d7d5ff9ac4676eaac9cd.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/ad9e9e596f21a6812fae27b5d9d622359826c368e471d7d5ff9ac4676eaac9cd.html.gz
--------------------------------------------------------------------------------
/html/aec5deeaada8b2fb81b55349da0229d5c77a4dc9605c1aaa31e5ce8b71358bc9.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/aec5deeaada8b2fb81b55349da0229d5c77a4dc9605c1aaa31e5ce8b71358bc9.html.gz
--------------------------------------------------------------------------------
/html/b0cf2bbf0192315eec95ede9c59bbf4ae58699275739d590edc24b012e3e8800.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/b0cf2bbf0192315eec95ede9c59bbf4ae58699275739d590edc24b012e3e8800.html.gz
--------------------------------------------------------------------------------
/html/b37be3535e1fb61e5a238b7fa1ead1ad98b651cb09f138efadac3d54a122fb21.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/b37be3535e1fb61e5a238b7fa1ead1ad98b651cb09f138efadac3d54a122fb21.html.gz
--------------------------------------------------------------------------------
/html/b3c19dd5f0612d098788fa5173e491b3280da6226b492f8fe110f4ab1896cca8.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/b3c19dd5f0612d098788fa5173e491b3280da6226b492f8fe110f4ab1896cca8.html.gz
--------------------------------------------------------------------------------
/html/b6906ca016bbfc64c90426e098c75b3e8c84457a77f51f1e7ea6941cb80c2147.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/b6906ca016bbfc64c90426e098c75b3e8c84457a77f51f1e7ea6941cb80c2147.html.gz
--------------------------------------------------------------------------------
/html/b6fb53e9fb043c98eb1e6530a1074c40922e29025f5454809f3938a7c174faa3.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/b6fb53e9fb043c98eb1e6530a1074c40922e29025f5454809f3938a7c174faa3.html.gz
--------------------------------------------------------------------------------
/html/ba07d1e64775f4090e39116c382111f5a2cfe9528dd179673f4e9bfcea370c15.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/ba07d1e64775f4090e39116c382111f5a2cfe9528dd179673f4e9bfcea370c15.html.gz
--------------------------------------------------------------------------------
/html/ba4dfe2d3e817ff7b8b01172ccc307850fc5469bcdd26c3c48ca046cb88ab7cf.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/ba4dfe2d3e817ff7b8b01172ccc307850fc5469bcdd26c3c48ca046cb88ab7cf.html.gz
--------------------------------------------------------------------------------
/html/bc13ff87b2630ffbebc33bc37b11178b14f03109055e1d17bf644f804b63d98a.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/bc13ff87b2630ffbebc33bc37b11178b14f03109055e1d17bf644f804b63d98a.html.gz
--------------------------------------------------------------------------------
/html/bd673bd7988144f0ab7b9c5e19fed140fb5aaa30d8894cb045b72d3b79a7dc54.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/bd673bd7988144f0ab7b9c5e19fed140fb5aaa30d8894cb045b72d3b79a7dc54.html.gz
--------------------------------------------------------------------------------
/html/bdb56ac83513635db1d8b9eb46b2da4c0de8da2f1f28f5bf5163df3eb3d3ec06.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/bdb56ac83513635db1d8b9eb46b2da4c0de8da2f1f28f5bf5163df3eb3d3ec06.html.gz
--------------------------------------------------------------------------------
/html/c00962aabe7bdd1fca78f5360ea7fa93cd7674863b05157e00827506a7aa58c4.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/c00962aabe7bdd1fca78f5360ea7fa93cd7674863b05157e00827506a7aa58c4.html.gz
--------------------------------------------------------------------------------
/html/c13b9c0e04fb28d445d22e92bff6ab7f7800a429930677c28c4dad89f3269869.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/c13b9c0e04fb28d445d22e92bff6ab7f7800a429930677c28c4dad89f3269869.html.gz
--------------------------------------------------------------------------------
/html/c467d507551a836efa9cfe843ba5d7bafe519750e04d0c9ff0decf44f013f829.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/c467d507551a836efa9cfe843ba5d7bafe519750e04d0c9ff0decf44f013f829.html.gz
--------------------------------------------------------------------------------
/html/c4a3637c6696f238cf9fe1c7fbb17bbb6731a71d4f5fe399b9b4fc3294a96a6b.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/c4a3637c6696f238cf9fe1c7fbb17bbb6731a71d4f5fe399b9b4fc3294a96a6b.html.gz
--------------------------------------------------------------------------------
/html/c50845a7158af12ee75acea301a3ea0dad1e848d6b9dbdb43ba7f2d825b2528b.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/c50845a7158af12ee75acea301a3ea0dad1e848d6b9dbdb43ba7f2d825b2528b.html.gz
--------------------------------------------------------------------------------
/html/c582d3b772578e8feaa3cfd8f5ae8100bb6f0bc66048204a9a398395841c1164.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/c582d3b772578e8feaa3cfd8f5ae8100bb6f0bc66048204a9a398395841c1164.html.gz
--------------------------------------------------------------------------------
/html/c58aa507c4deebd660f69905f9abb8f96d935f6e7210f597ed4cd32b3f39f7f7.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/c58aa507c4deebd660f69905f9abb8f96d935f6e7210f597ed4cd32b3f39f7f7.html.gz
--------------------------------------------------------------------------------
/html/c69e539d689a8335a69042727f1b58edab09d5d99fb607ec625a63151a537dc2.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/c69e539d689a8335a69042727f1b58edab09d5d99fb607ec625a63151a537dc2.html.gz
--------------------------------------------------------------------------------
/html/c7e39ac49fa1235f5d50f83bf2444248bd3aa4e6df044377916c812dd109ba23.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/c7e39ac49fa1235f5d50f83bf2444248bd3aa4e6df044377916c812dd109ba23.html.gz
--------------------------------------------------------------------------------
/html/c81e134ed49902bcf69b551426b4a346c5a77ae993cac8bda68b5541a664ef4c.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/c81e134ed49902bcf69b551426b4a346c5a77ae993cac8bda68b5541a664ef4c.html.gz
--------------------------------------------------------------------------------
/html/c82b3d1d540bbbd6081bdfb78b4c068c583aa766bcaaefe7ad16d24e5413a829.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/c82b3d1d540bbbd6081bdfb78b4c068c583aa766bcaaefe7ad16d24e5413a829.html.gz
--------------------------------------------------------------------------------
/html/c90731f051d033e49e4cfcc920895051bbc3b54ef1a11519abcf22a115c3aa79.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/c90731f051d033e49e4cfcc920895051bbc3b54ef1a11519abcf22a115c3aa79.html.gz
--------------------------------------------------------------------------------
/html/cc03ddb5ef7d5f1fdb8a87f5e6dfd058a2a70acedf2551655a898dc5c18eb79e.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/cc03ddb5ef7d5f1fdb8a87f5e6dfd058a2a70acedf2551655a898dc5c18eb79e.html.gz
--------------------------------------------------------------------------------
/html/cc4aa22b8212aec7d289667c0a965569e6f06b9e9196ff8b02219bf2bc1b90d0.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/cc4aa22b8212aec7d289667c0a965569e6f06b9e9196ff8b02219bf2bc1b90d0.html.gz
--------------------------------------------------------------------------------
/html/d0382c0d9573a0a7beb1e649012d04ec7275ac23513ca6ca59e51477b028283c.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/d0382c0d9573a0a7beb1e649012d04ec7275ac23513ca6ca59e51477b028283c.html.gz
--------------------------------------------------------------------------------
/html/d1c57d7821e5a5b27fb468c59489601bb2a042b1c05221166e3221d2b5dc217f.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/d1c57d7821e5a5b27fb468c59489601bb2a042b1c05221166e3221d2b5dc217f.html.gz
--------------------------------------------------------------------------------
/html/d48aeb9cf2f2ff15769a57513249b4a6a669159f3e50b335e741d4206a824e88.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/d48aeb9cf2f2ff15769a57513249b4a6a669159f3e50b335e741d4206a824e88.html.gz
--------------------------------------------------------------------------------
/html/d605bdef2cde7308a9f2fbd1484d4a9c3da0167177245d346da61e455f42208d.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/d605bdef2cde7308a9f2fbd1484d4a9c3da0167177245d346da61e455f42208d.html.gz
--------------------------------------------------------------------------------
/html/d90bda7ed14df19574f4ca8b1ccde5752a78f40058af1393e81cc99adb3e8756.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/d90bda7ed14df19574f4ca8b1ccde5752a78f40058af1393e81cc99adb3e8756.html.gz
--------------------------------------------------------------------------------
/html/db6b0816c612296c7f1f001c6df874214fcca0da0fc86fb3aea9358c7f681754.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/db6b0816c612296c7f1f001c6df874214fcca0da0fc86fb3aea9358c7f681754.html.gz
--------------------------------------------------------------------------------
/html/dc7ccccc1f34eb2928cb238739aaf18c712d59d8d34b41acfb29178aeba65356.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/dc7ccccc1f34eb2928cb238739aaf18c712d59d8d34b41acfb29178aeba65356.html.gz
--------------------------------------------------------------------------------
/html/dfd43bc0d46e7aaa78ba10fbcb5b9fdfe78771d36cb4c7497e17fb6f69170ec5.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/dfd43bc0d46e7aaa78ba10fbcb5b9fdfe78771d36cb4c7497e17fb6f69170ec5.html.gz
--------------------------------------------------------------------------------
/html/e100c9612ad8495db03b2a9f968952d0eaa4853d9b32ded6a29f8e313a974873.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/e100c9612ad8495db03b2a9f968952d0eaa4853d9b32ded6a29f8e313a974873.html.gz
--------------------------------------------------------------------------------
/html/e1c7023ee2148901b086256fdd30a0893d10b0720b510d5ff07a021109347266.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/e1c7023ee2148901b086256fdd30a0893d10b0720b510d5ff07a021109347266.html.gz
--------------------------------------------------------------------------------
/html/e1cd54e5577d077df83a12a4753c3c8bf2d88d68cd709cc4c442874777581c4a.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/e1cd54e5577d077df83a12a4753c3c8bf2d88d68cd709cc4c442874777581c4a.html.gz
--------------------------------------------------------------------------------
/html/e372e42c0a3df7b86e1c0bacf7bc14d042144a01e88833bc5a643d61b3547090.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/e372e42c0a3df7b86e1c0bacf7bc14d042144a01e88833bc5a643d61b3547090.html.gz
--------------------------------------------------------------------------------
/html/e4c6a3b482403a8f60190ba27248cd52b250b86f5d4a8a10edcf7062c64fc3f5.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/e4c6a3b482403a8f60190ba27248cd52b250b86f5d4a8a10edcf7062c64fc3f5.html.gz
--------------------------------------------------------------------------------
/html/e593d7fe88f9f5cd6587ac172be2db6055d40b6f071023f97ab1ce373534261e.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/e593d7fe88f9f5cd6587ac172be2db6055d40b6f071023f97ab1ce373534261e.html.gz
--------------------------------------------------------------------------------
/html/e7301133baab43596f19076beab32096f6405b868e0a69bcfc3349e595d62475.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/e7301133baab43596f19076beab32096f6405b868e0a69bcfc3349e595d62475.html.gz
--------------------------------------------------------------------------------
/html/e7994d5500875202d93e736e8f0c8a0436107d10add94ce3789001b8c5c32358.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/e7994d5500875202d93e736e8f0c8a0436107d10add94ce3789001b8c5c32358.html.gz
--------------------------------------------------------------------------------
/html/e7d77f1869803e24667fa0b985cff27fb4139951a5ffa494bc9ba810df48fb30.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/e7d77f1869803e24667fa0b985cff27fb4139951a5ffa494bc9ba810df48fb30.html.gz
--------------------------------------------------------------------------------
/html/ea25dd7edff4d27973600f35728f20aed5a3eedcc23257d9c3afc3d3e840c3de.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/ea25dd7edff4d27973600f35728f20aed5a3eedcc23257d9c3afc3d3e840c3de.html.gz
--------------------------------------------------------------------------------
/html/eb62ac8425e5573947ecde962d14433d18e5725cc4a8c908fe22f678e96a65a1.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/eb62ac8425e5573947ecde962d14433d18e5725cc4a8c908fe22f678e96a65a1.html.gz
--------------------------------------------------------------------------------
/html/ec3878db7e49b1ed354c511b132e3de5f773ff4fc8014163df58c22fffd93d2f.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/ec3878db7e49b1ed354c511b132e3de5f773ff4fc8014163df58c22fffd93d2f.html.gz
--------------------------------------------------------------------------------
/html/ec7fc408c5ce66c22692a3f696c682f3de794bacfaca405d9a0dac5957051e5a.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/ec7fc408c5ce66c22692a3f696c682f3de794bacfaca405d9a0dac5957051e5a.html.gz
--------------------------------------------------------------------------------
/html/ecb46e3e489d2aac92b2563112e1801077b4219a6db9751f18e228bcaf457802.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/ecb46e3e489d2aac92b2563112e1801077b4219a6db9751f18e228bcaf457802.html.gz
--------------------------------------------------------------------------------
/html/eecd2575093b85933997521d6babddd397599419588d7096c5c19dc4ffe2ea72.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/eecd2575093b85933997521d6babddd397599419588d7096c5c19dc4ffe2ea72.html.gz
--------------------------------------------------------------------------------
/html/ef2b3f268a67950c16563de9ca3209163c7618868c0216739e1e794e7884cc20.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/ef2b3f268a67950c16563de9ca3209163c7618868c0216739e1e794e7884cc20.html.gz
--------------------------------------------------------------------------------
/html/ef4e67b66d63b5facef55c06a94d85f2ae01a0a1a4a3a1bcfe2499c8c8a7dacf.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/ef4e67b66d63b5facef55c06a94d85f2ae01a0a1a4a3a1bcfe2499c8c8a7dacf.html.gz
--------------------------------------------------------------------------------
/html/f105de6e63ca91ea482f60193f6252092557f969f2fd128ff68c0d4d6b90dd7d.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/f105de6e63ca91ea482f60193f6252092557f969f2fd128ff68c0d4d6b90dd7d.html.gz
--------------------------------------------------------------------------------
/html/f344ca5fb36e130f4344235fa22726f3367e09c211c120f21d9ae92effe902db.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/f344ca5fb36e130f4344235fa22726f3367e09c211c120f21d9ae92effe902db.html.gz
--------------------------------------------------------------------------------
/html/f5c90a6d5253c3a21ff3168c64bea4b5ffade7a1ba5bed952a59ebee0d648d98.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/f5c90a6d5253c3a21ff3168c64bea4b5ffade7a1ba5bed952a59ebee0d648d98.html.gz
--------------------------------------------------------------------------------
/html/f6ac15a4d98511396da23e4428deb5605422b1c8bbc8284e771f6896bdccf57f.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/f6ac15a4d98511396da23e4428deb5605422b1c8bbc8284e771f6896bdccf57f.html.gz
--------------------------------------------------------------------------------
/html/f81c6c05d9cbc93316992fa23ef74ec405194e292611f2e94f6a814868903665.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/f81c6c05d9cbc93316992fa23ef74ec405194e292611f2e94f6a814868903665.html.gz
--------------------------------------------------------------------------------
/html/f8ff621a0b9b7646cc0d57d37416feabba2bf78ef5dd0bfc5b080f9f97bbe584.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/f8ff621a0b9b7646cc0d57d37416feabba2bf78ef5dd0bfc5b080f9f97bbe584.html.gz
--------------------------------------------------------------------------------
/html/fde930b01859de8311c6a14f8aa8c72be0659b551367803deb6736cf3526cf2e.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/fde930b01859de8311c6a14f8aa8c72be0659b551367803deb6736cf3526cf2e.html.gz
--------------------------------------------------------------------------------
/html/ff0f958ade714ebfaf5c0b42b1c0152a62063f4e6f72141406ccefc4a2677f21.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/ff0f958ade714ebfaf5c0b42b1c0152a62063f4e6f72141406ccefc4a2677f21.html.gz
--------------------------------------------------------------------------------
/html/ffc109d474fdee1a59fa554df8b09643f4a7d45b23eceabad66f0712c3f7daed.html.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/ffc109d474fdee1a59fa554df8b09643f4a7d45b23eceabad66f0712c3f7daed.html.gz
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
# Pinned dependencies for the extractor scripts and the evaluation code.
# Every package must be pinned exactly once: pip refuses a requirements
# file that lists the same project with two different versions.
chardet==3.0.4
cssselect==1.1.0
cython==0.29.14
-e git+git@github.com:dragnet-org/dragnet.git@1b65e7b0897ca061b5c90b3eefffbfc156a0cc3b#egg=dragnet
feedfinder2==0.0.4
feedparser==5.2.1
jieba3k==0.35.1
html-text==0.5.1
lxml==4.4.2
newspaper3k==0.2.8
nltk==3.4.5
numpy==1.18.1
readability-lxml==0.7.1
soupsieve==1.9.5
scikit-learn==0.19.1  # same version as used by dragnet
scipy==1.4.1
tinysegmenter==0.3
tldextract==2.2.2
trafilatura==0.5.1
news-please==1.5.17  # pulls in a very large number of PyPI dependencies
cchardet==2.1.7  # dependency missing from news-please's own requirements
goose3==3.1.8
inscriptis==1.1.2
html2text==2020.1.16
justext==2.2.0
# NOTE(review): this file previously also pinned beautifulsoup4==4.8.2 at the
# top; the duplicate, conflicting pin was removed. Confirm 4.9.3 is the
# version the benchmark results were produced with.
beautifulsoup4==4.9.3
28 |
--------------------------------------------------------------------------------
/tests.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from evaluate import string_shingle_matching, _ngrams, _tokenize
4 |
5 |
def test_tokenize():
    """Check `_tokenize` on a sample mixing spaces, commas, a colon and parentheses.

    Only the word-like pieces are expected back, in their original order;
    the separators and the trailing space must not produce tokens.
    """
    sample = 'a b,cd:e(foo,bar) '
    expected_tokens = ['a', 'b', 'cd', 'e', 'foo', 'bar']
    actual_tokens = _tokenize(sample)
    assert actual_tokens == expected_tokens
9 |
10 |
@pytest.mark.parametrize(
    'text,n,expected',
    [
        # Punctuation only: nothing to build an n-gram from.
        ('!', 4, []),
        # Fewer tokens than n: a single, shorter tuple is expected.
        ('a,b c ', 5, [('a', 'b', 'c')]),
        # Sliding window of size n over the token sequence.
        ('aa 11 c 22', 3, [('aa', '11', 'c'), ('11', 'c', '22')]),
        # Repeated n-grams are kept, not de-duplicated.
        (
            'a b c a b c',
            3,
            [
                ('a', 'b', 'c'),
                ('b', 'c', 'a'),
                ('c', 'a', 'b'),
                ('a', 'b', 'c'),
            ],
        ),
    ],
)
def test_ngrams(text, n, expected):
    """Check that `_ngrams(text, n)` returns the expected list of token tuples."""
    produced = _ngrams(text, n)
    assert produced == expected
21 |
22 |
@pytest.mark.parametrize(
    ['true', 'pred', 'tp_fp_fn'],
    [('a b c', 'a b c', (1, 0, 0)),
     ('a b c d', 'a b c', (0.5, 0, 0.5)),
     ('a b c', 'a b c d', (0.5, 0.5, 0)),
     ('', '', (0, 0, 0)),
     ('a', '', (0, 0, 1)),
     ('', 'a', (0, 1, 0)),
     ('a b c a b c', 'a b c', (0.25, 0, 0.75)),
     ('a b c', 'a b c a b c', (0.25, 0.75, 0)),
     ])
def test_string_shingle_matching(true, pred, tp_fp_fn):
    """Check `string_shingle_matching` on small ground-truth/prediction pairs.

    `tp_fp_fn` is the expected 3-tuple result for 3-gram shingles
    (presumably true-positive, false-positive and false-negative shares,
    judging by the parameter name -- confirm against `evaluate.py`).

    The components are fractional, so they are compared with
    `pytest.approx` rather than exact float equality; this keeps the test
    stable if the implementation changes the order of its arithmetic.
    """
    result = string_shingle_matching(true, pred, ngram_n=3)
    assert result == pytest.approx(tp_fp_fn)
36 |
--------------------------------------------------------------------------------