├── .github └── workflows │ └── evaluate.yml ├── .gitignore ├── LICENSE ├── README.rst ├── evaluate.py ├── extractors ├── go_domdistiller.py ├── go_domdistiller │ ├── README.rst │ ├── cli.go │ ├── go.mod │ └── go.sum ├── go_readability │ ├── README.rst │ ├── cli.go │ ├── go.mod │ └── go.sum ├── run_beautifulsoup.py ├── run_boilerpipe.py ├── run_dragnet.py ├── run_go_readability.py ├── run_goose3.py ├── run_html2text.py ├── run_html_text.py ├── run_inscriptis.py ├── run_justext.py ├── run_news_please.py ├── run_newspaper.py ├── run_readability.py ├── run_readability_js.py ├── run_trafilatura.py └── run_xpath_text.py ├── ground-truth.json ├── html ├── 042bb7b5fedab6eac7db576522b89b93904c237d344bcbe14a6a5ab7f7335856.html.gz ├── 04a6711caa7c687592777718866e781e976e0fe684faebe8b3cedcef8cd0ea34.html.gz ├── 05844573ca7e1fba714d715bb11ca08c26e25328999c74a1cb3bc8a0e4399f0f.html.gz ├── 06e5123e4ef7cfb4533250dc45d1e03d0838fc66223f45c583c4d12f48b4da85.html.gz ├── 06ee193de4bd611f7fafbab0c59b0f6fe3495093516720632cd093b24c7a0e98.html.gz ├── 076f4f33bf75059db581bedf36e76fb65e89a8f7752db3339aa3ea11c5122f32.html.gz ├── 08f793762792bd252c75fb57544cdf506ffcc04785136cb87503f02364b82b56.html.gz ├── 098bb3e96c0acdf36efdcde45fb9cca3f8c82c7cb2071b76097a1b96155f1eb2.html.gz ├── 0d46122928b6f468cc4bbc694051d0dbae5702bc75a16dab82a99b58daf150a0.html.gz ├── 0dd1357045727799a447563fd8851f4ebe79f042073ea16991a9b67aa595f81a.html.gz ├── 0e014df693f182824fe5e24030ddbe1d0b96ddb9685cf20d5766457ed32ffa2d.html.gz ├── 0ec95c7261d122f304728e90c983450ef1ce1e0b423546835c397d50aaf0d0f2.html.gz ├── 11ea381ad92b5448cf66eae62f52ac565361a244c8881615fc6a7bb523cc0c32.html.gz ├── 14cc2a0ca59c62a8c9f205a171e9ccf4ef4cf69b0c642f51c8c65c051b39024f.html.gz ├── 156770d676ce79905198e1c8407f81e5ecfb617d9aa44712718707eb7e3b8e38.html.gz ├── 16c30add7e96315e9cc957d85aa876ccb6b70055f0ddab51547a586117cc1f56.html.gz ├── 1ace8c85aaee21b9d4505eca506d50c4721c29db62848b567a9703bfe0583892.html.gz ├── 1ee91d1fce65e09be8b8d2d29eab771546d98ca2ba5c862941e660e9fec12432.html.gz ├── 1f765c48780665e89cc3af1f7c9af47876e9fae9b5be4a936b0649e10f5e3198.html.gz ├── 20b2b64916b00b25203c9f1bf14248922f4d522f18328e9f876cce116df0083e.html.gz ├── 21486419bb109c5a62a68957f528e6ff29c92f58d8d3c1f2837c86ff3f3e11f9.html.gz ├── 232a43fb15abde807427b2a7bf4f772e27b8760554370956d8291df4e8166dbf.html.gz ├── 23aaecd14171f96cfd201a8a46666097e286ad71f74f29347a78c5ecba50da1e.html.gz ├── 264dc3ae31249cb1f50c50986e0952a4708c2e705d18a2d8bf0e525da6e2b485.html.gz ├── 287e4d9f4af31733aad6534aefb2bd00fb344ec8d6ebf1ac99dbc4d762da0ca4.html.gz ├── 291a8bf33ee49074f33dcff37544ac40506cae450db83b6cb63f02b9920b51c2.html.gz ├── 2c46804d9db4a85e8f8d31128ce0e11d02f25c7120c2faa5ec0664c604a47717.html.gz ├── 2f42ef1d3ea0c96e56355d3db93d0e06b47e760b74f6f4261278b8cd1c246dd6.html.gz ├── 30b771a40a4e96156d398716c877deef54b05d091770d2717c98e4c6b670010c.html.gz ├── 3252222e61fe78982cffe0b0bad2b089c27b32f65852d1c5d3951517f3c2e295.html.gz ├── 33fe2471fd553c6570f93997f208b4f39bf30be5947c3cfa620ee8eff3355ab9.html.gz ├── 34a7328535ad4e60b059f81d37eec5d25c2bc8de759ce9a7b5e47ac7dc6fd1b0.html.gz ├── 358cc4a080456476b0f883c56bdce796874c286ed6efab25f5718dd95fab42a8.html.gz ├── 359fee228518d55b921194561e9ca88e428df81940246f8fac7a75398377daea.html.gz ├── 35b158918c676ff2c74445517db76c83db70a805cc50b64e1369b354a027fcbd.html.gz ├── 360c732d1fdbfc6895d7096c0c0b8c0d581bb1af80160f4c6a0f1fd9ff85e469.html.gz ├── 374ac9a59a85196cdacc1679fb8993521a7b7d9d6533720f102300be1c7face4.html.gz ├── 39d5c43beb60605c3eec760c99500e62e7bd71ebbe4ae05edf382125e1b0b80a.html.gz ├── 3c5bf8db4272925bf1dd5713fc325e179fd0d1cc6fb8c77aa2d917cfd2518a32.html.gz ├── 3c6d3381ef52ca26be2fbde19c1b0fe17d85682b726dfecf5e300c1ca34546b1.html.gz ├── 3cb22bfabed8de715c0813a7bb5052363c96bd71ccce3bb2dfb3ab9d1d7a9bbc.html.gz ├── 3cb5e2f46626d5bb0345759453036f7eabc0b0c7796b796513606bf693060ced.html.gz ├── 3ce1c8fdf6ad2ded9e48a68be71eb069fc453ef1b75f47698428a1fdda0deb24.html.gz ├── 3d8f3404cf975af824d7866b7679bc45189c3eea6adb32f0a125a0904b1abbb2.html.gz ├── 3f65af7b6b98b1c9ae9a3e0d8a09a85600cdc44e26e4b3a6db96a31f4b1767e3.html.gz ├── 4219d096902dad9fd9d57e881e7928ca66bdf5334c2bc7dfddaa264887777a7a.html.gz ├── 42aad16bde9288623543642a9ce1a396be83e2db44aa2ff8cbbfe46e14abd7cc.html.gz ├── 432362af0be43f6da757ea778bd7f2f000094a565bdebac5af7442987a5372f3.html.gz ├── 4648a420af9984d45b76a4afedf4f74965f8a2e0bf1c69bd3da2dc189020f3c9.html.gz ├── 4a44ab3e4c41d56ce9b79eb07acb06aed1bc52aba68a950f06e7de7ef848400a.html.gz ├── 51374560f40088e227f0053ff1bb0b8525d10a8d7bfbff1cd6033f42347fd85b.html.gz ├── 51d066b0602c9421d8d6410bc4b931700978409a3faa2a984e8fbde519ad7241.html.gz ├── 5211188428849a31e309ef2475746563ff788b1591c89818c08d5abedec4ef5e.html.gz ├── 55bb6340e3d7dd8632ba45179ae43c39f8ad0cfcecb4719e3b9cf6106ffb70a3.html.gz ├── 57b4dafd18cfd0531b69f81e87158648227c673ef159f8d8c87d34e34bdb21f2.html.gz ├── 57d46c9d751e3fd3ffaf3ede7ac20cebd30eacb5ea78e1a6aa0a72059244e7ca.html.gz ├── 57e2e98887a1965689955921208e32f410b10e2b95c907e74e57982d3edf3cc6.html.gz ├── 5a822960e9a2cb1e664d334b6c936c5cb6e41fb5331877538c2c8339cb59d57e.html.gz ├── 5ae11e580afc12d3ba1a12944281e6a7a5dded5c98b4efcf24aedcb28f0d5b22.html.gz ├── 5caf91b8a4423735f866b089d2611ea14503584cf3b6f487c6d26eb7b9521fca.html.gz ├── 5f03fc173ebc6abdfae50b96ce0b05a6137b7d3f2ef379be35a9bb8ca9f49e87.html.gz ├── 5f9c5ed5d64dfe682d9bde13b9b4f032a3ebdbf165c06ec49c0705bcbe106e3b.html.gz ├── 5fa3154ec031ab35411a457d78eb5aa92c0e803c5329bd05c001e6d64009e206.html.gz ├── 5fa5679de56c43edf70685762c2d1f2de296432ae53aa46e075b552fee17cab8.html.gz ├── 5fbc7ccb504c755ae23a85499a17518483d7862b74b4a5c34d86ede1a1a4448e.html.gz ├── 612cd29826624e68ce96789c8049e16279dfd2fceb27434eea7943b2aaf84e90.html.gz ├── 624fcd903d56fc7055fa7097b330629450c095ad6937318deb027be7803bbf35.html.gz ├── 63db31a161b3c5b64e88c2978635cbc38d342ba82fd2c5335321203dcc55c76f.html.gz ├── 65408257dbe4b41f71a35ade24e30243265095fc1d4988a35b9a6ca52f2b4eab.html.gz ├── 65bf3048b500bbd84928d9122f99617ca898216b91add1d8b2ac09c670484a5c.html.gz ├── 65ce3a4577a0306994efa190a0d96e84014f9d4257ad54753e807ede518f02c0.html.gz ├── 680c2848e94a96f961a0964631de94ac572f83c45bfd0bec2deafa893bcfe15c.html.gz ├── 686bb170effe273eaff1c0f88e412172e8d972518a6d1454c896f52aafaa9643.html.gz ├── 6a72de37e8f98f4eee6c0821e593b35ce536cef6c8b424c5e1dd747ebe6621ba.html.gz ├── 6ebac05f637ece8aa57c298a2a5e3a8047f546f855d0f29cc683cea60ce85c85.html.gz ├── 702d1da63b8e064cb70617620e45c2d116b4912c9bc9d518dcf5ce54bb8057ed.html.gz ├── 70cb2d5bca75ab5a8f6bb378a38a52f882f6bda508de93b12502e74936d86ff2.html.gz ├── 776a1c046798b474e410f6edf3225d6a27fecd0de6aac22aef7b7f64fe87caaf.html.gz ├── 7837c9d66c815b9a21dd669a3dc21677c3f084b1b7dd603d56e87867d8970dd3.html.gz ├── 785affa2c34e6e4844ef080e98e1a1e532eeeb671bdacebfb9e98ad7320ff382.html.gz ├── 7916ecca969ffdd8f6fc32d171fbe0dd63db40fe4c1d2ade02b1dec5929a162f.html.gz ├── 7a457a4f71735c17b8b34fafc88835d225cf879b2d812311857a64cfc891eee9.html.gz ├── 7a664e40d256470fdb12d10c3f8d1c6db0581e9b080c71765e55f273a3ac7d03.html.gz ├── 7ab16ade32386ece353b8d31fc3bb7e660189efd5bf3c8549aaba101ad3f5ef5.html.gz ├── 7bb1ca90354313840329d2f569ea9fb3a582df2aa0a5e3669f8fc567eb6ea61b.html.gz ├── 7de5241947a5f7147fe9787c6f6fa16685bfe66e6c35510a68780f27690dc4f0.html.gz ├── 7dfc3e359d7c0ca48ac9046ae5759286cedf80abe7526fc6c6e6546b9ba43e33.html.gz ├── 7f93c1944a41d01960f8a16fdfda6c562e86f04ead8375ab796c4278402df9a8.html.gz ├── 8267acacb9e4a109b1f7ee7bafe735b73e9c94180b703b131f9e90c9be044f39.html.gz ├── 82b6d780c792df78dcfb00484d50c86fbc7f324a9eb5835b7615f028edb9a574.html.gz ├── 833caf3bdba53dcf48de273cf646370eebe9ac565744b0d0e941e298e1b79730.html.gz ├── 8380689f358c1e3a0f6fca6e11ed13e5304a74060139f7a584347db213950446.html.gz ├── 851498a2b9f4f0b578ac9700c245253dbc147a06c0fb3499adebf1c2d5663c29.html.gz ├── 85439e26c41c75901820d01a13e8cea7836abb58635ea3986f71a163ab0311d3.html.gz ├── 8634d1211c3f2b73041e6cadd5d59676619838949999a83a23c51a3195b44892.html.gz ├── 87438a0dacbeb979e72522f42b9020048da13dc5a079477114190c8855701b7f.html.gz ├── 87bf60570e6e2e33cb1f0fdb5600d6c85012e60be25ba6fa587b8f90eb9a3770.html.gz ├── 88c328b68b038a625b4b3f8c322215caa30b0e88af0754bd71056ffc15c7b4b7.html.gz ├── 8b194530308204139d9c8f7d495a26b117c78756ac1802cfc3c0a8bfdf2c0d50.html.gz ├── 8cad00dc22de45ba42e9540421b5f78333f7ac57b385d69acb27a53b9fd69f0c.html.gz ├── 8e3efab59f48fd29a1e1e7aa135880c4251a9f090f94999668cdbaec59d30b5a.html.gz ├── 921019755f4a96ac4abf9dbcb4ef9d5ac202624a542d5ea70912330aa6fcc71f.html.gz ├── 94fbcc26772088646cb977cecf1abc4012847a1f6927d09505cbf0c3d417ba07.html.gz ├── 95301fb7883e0ee5214d1111554d30dd97e08c6380d7699369c0b9c15f42e6aa.html.gz ├── 961bd85ca85aaf791b278cc4a60058e92d57c4f32a3411cf8e7d802af183c926.html.gz ├── 9a440270bf8625d586039dfae1b8df409b467524e075124cd7a5424a5806901b.html.gz ├── 9cb8224b660f36c932823ab613fb76a07928fcbc41956c4c1f96f4ecab9202aa.html.gz ├── 9da36ae4714bfccc72374c6c146e9d1cd3cca39e2110bd67ccdbcc806f4cf139.html.gz ├── 9e8c9f082a8d77c58c17bda03b6b4bb6a1d6883fe196c252db4ca83b9991e0d3.html.gz ├── 9ebb3af65694a953005df5bd3869b2cefc263e1dea0471e3ef361c66a264cdd3.html.gz ├── 9eef8162bbb67b0bd73792313b91b87dc9304f43f85f479e67e71c166417451e.html.gz ├── a078b3656adc0295d0e37bd4f599342f4a0894da2451e0ef3038ac045434fef3.html.gz ├── a1fca19b884e0e946ad3fbe2a7f5031e5e3b23372702a76db302b6143c77cb31.html.gz ├── a6968f427cdb786531cfb326518e674bd8b48af94df7c5c6165cdf40e944357a.html.gz ├── a860fb5eda1ac75df3bc95ba096ade649fdbb1bb566adb9fee3cb13e59f37604.html.gz ├── aadb38e527d5379306de3b910ec62cb2447cc1035686b2b2d152580f8f8a1ea2.html.gz ├── aade2ec8d1e7b0919aef1001c3ef0573f8a239e22d4d751d8e664f04ea77ef0d.html.gz ├── abd9d6291b6bfae0c3ffad8ab7623b482c6da46face0271dc42af6324d8f0ce5.html.gz ├── ac1bfdd4c510f679c58f1b62101630d40fda20a16703235ae0f56b65a465e423.html.gz ├── ac3c035520461017a7c5b248d8e39ef063cad4c0c7d7b7ecd68aff8f15099485.html.gz ├── ad826691a8a2f9c4ce50cf0b885af933c4b5119c1f6235cd7df1dfb83f255bcc.html.gz ├── ad9e9e596f21a6812fae27b5d9d622359826c368e471d7d5ff9ac4676eaac9cd.html.gz ├── aec5deeaada8b2fb81b55349da0229d5c77a4dc9605c1aaa31e5ce8b71358bc9.html.gz ├── b0cf2bbf0192315eec95ede9c59bbf4ae58699275739d590edc24b012e3e8800.html.gz ├── b37be3535e1fb61e5a238b7fa1ead1ad98b651cb09f138efadac3d54a122fb21.html.gz ├── b3c19dd5f0612d098788fa5173e491b3280da6226b492f8fe110f4ab1896cca8.html.gz ├── b6906ca016bbfc64c90426e098c75b3e8c84457a77f51f1e7ea6941cb80c2147.html.gz ├── b6fb53e9fb043c98eb1e6530a1074c40922e29025f5454809f3938a7c174faa3.html.gz ├── ba07d1e64775f4090e39116c382111f5a2cfe9528dd179673f4e9bfcea370c15.html.gz ├── ba4dfe2d3e817ff7b8b01172ccc307850fc5469bcdd26c3c48ca046cb88ab7cf.html.gz ├── bc13ff87b2630ffbebc33bc37b11178b14f03109055e1d17bf644f804b63d98a.html.gz ├── bd673bd7988144f0ab7b9c5e19fed140fb5aaa30d8894cb045b72d3b79a7dc54.html.gz ├── bdb56ac83513635db1d8b9eb46b2da4c0de8da2f1f28f5bf5163df3eb3d3ec06.html.gz ├── c00962aabe7bdd1fca78f5360ea7fa93cd7674863b05157e00827506a7aa58c4.html.gz ├── c13b9c0e04fb28d445d22e92bff6ab7f7800a429930677c28c4dad89f3269869.html.gz ├── c467d507551a836efa9cfe843ba5d7bafe519750e04d0c9ff0decf44f013f829.html.gz ├── c4a3637c6696f238cf9fe1c7fbb17bbb6731a71d4f5fe399b9b4fc3294a96a6b.html.gz ├── c50845a7158af12ee75acea301a3ea0dad1e848d6b9dbdb43ba7f2d825b2528b.html.gz ├── c582d3b772578e8feaa3cfd8f5ae8100bb6f0bc66048204a9a398395841c1164.html.gz ├── c58aa507c4deebd660f69905f9abb8f96d935f6e7210f597ed4cd32b3f39f7f7.html.gz ├── c69e539d689a8335a69042727f1b58edab09d5d99fb607ec625a63151a537dc2.html.gz ├── c7e39ac49fa1235f5d50f83bf2444248bd3aa4e6df044377916c812dd109ba23.html.gz ├── c81e134ed49902bcf69b551426b4a346c5a77ae993cac8bda68b5541a664ef4c.html.gz ├── c82b3d1d540bbbd6081bdfb78b4c068c583aa766bcaaefe7ad16d24e5413a829.html.gz ├── c90731f051d033e49e4cfcc920895051bbc3b54ef1a11519abcf22a115c3aa79.html.gz ├── cc03ddb5ef7d5f1fdb8a87f5e6dfd058a2a70acedf2551655a898dc5c18eb79e.html.gz ├── cc4aa22b8212aec7d289667c0a965569e6f06b9e9196ff8b02219bf2bc1b90d0.html.gz ├── d0382c0d9573a0a7beb1e649012d04ec7275ac23513ca6ca59e51477b028283c.html.gz ├── d1c57d7821e5a5b27fb468c59489601bb2a042b1c05221166e3221d2b5dc217f.html.gz ├── d48aeb9cf2f2ff15769a57513249b4a6a669159f3e50b335e741d4206a824e88.html.gz ├── d605bdef2cde7308a9f2fbd1484d4a9c3da0167177245d346da61e455f42208d.html.gz ├── d90bda7ed14df19574f4ca8b1ccde5752a78f40058af1393e81cc99adb3e8756.html.gz ├── db6b0816c612296c7f1f001c6df874214fcca0da0fc86fb3aea9358c7f681754.html.gz ├── dc7ccccc1f34eb2928cb238739aaf18c712d59d8d34b41acfb29178aeba65356.html.gz ├── dfd43bc0d46e7aaa78ba10fbcb5b9fdfe78771d36cb4c7497e17fb6f69170ec5.html.gz ├── e100c9612ad8495db03b2a9f968952d0eaa4853d9b32ded6a29f8e313a974873.html.gz ├── e1c7023ee2148901b086256fdd30a0893d10b0720b510d5ff07a021109347266.html.gz ├── e1cd54e5577d077df83a12a4753c3c8bf2d88d68cd709cc4c442874777581c4a.html.gz ├── e372e42c0a3df7b86e1c0bacf7bc14d042144a01e88833bc5a643d61b3547090.html.gz ├── e4c6a3b482403a8f60190ba27248cd52b250b86f5d4a8a10edcf7062c64fc3f5.html.gz ├── e593d7fe88f9f5cd6587ac172be2db6055d40b6f071023f97ab1ce373534261e.html.gz ├── e7301133baab43596f19076beab32096f6405b868e0a69bcfc3349e595d62475.html.gz ├── e7994d5500875202d93e736e8f0c8a0436107d10add94ce3789001b8c5c32358.html.gz ├── e7d77f1869803e24667fa0b985cff27fb4139951a5ffa494bc9ba810df48fb30.html.gz ├── ea25dd7edff4d27973600f35728f20aed5a3eedcc23257d9c3afc3d3e840c3de.html.gz ├── eb62ac8425e5573947ecde962d14433d18e5725cc4a8c908fe22f678e96a65a1.html.gz ├── ec3878db7e49b1ed354c511b132e3de5f773ff4fc8014163df58c22fffd93d2f.html.gz ├── ec7fc408c5ce66c22692a3f696c682f3de794bacfaca405d9a0dac5957051e5a.html.gz ├── ecb46e3e489d2aac92b2563112e1801077b4219a6db9751f18e228bcaf457802.html.gz ├── eecd2575093b85933997521d6babddd397599419588d7096c5c19dc4ffe2ea72.html.gz ├── ef2b3f268a67950c16563de9ca3209163c7618868c0216739e1e794e7884cc20.html.gz ├── ef4e67b66d63b5facef55c06a94d85f2ae01a0a1a4a3a1bcfe2499c8c8a7dacf.html.gz ├── f105de6e63ca91ea482f60193f6252092557f969f2fd128ff68c0d4d6b90dd7d.html.gz ├── f344ca5fb36e130f4344235fa22726f3367e09c211c120f21d9ae92effe902db.html.gz ├── f5c90a6d5253c3a21ff3168c64bea4b5ffade7a1ba5bed952a59ebee0d648d98.html.gz ├── f6ac15a4d98511396da23e4428deb5605422b1c8bbc8284e771f6896bdccf57f.html.gz ├── f81c6c05d9cbc93316992fa23ef74ec405194e292611f2e94f6a814868903665.html.gz ├── f8ff621a0b9b7646cc0d57d37416feabba2bf78ef5dd0bfc5b080f9f97bbe584.html.gz ├── fde930b01859de8311c6a14f8aa8c72be0659b551367803deb6736cf3526cf2e.html.gz ├── ff0f958ade714ebfaf5c0b42b1c0152a62063f4e6f72141406ccefc4a2677f21.html.gz └── ffc109d474fdee1a59fa554df8b09643f4a7d45b23eceabad66f0712c3f7daed.html.gz ├── output ├── AutoExtract.json ├── Diffbot.json ├── beautifulsoup.json ├── boilerpipe.json ├── dragnet.json ├── go_domdistiller.json ├── go_readability.json ├── goose3.json ├── html-text.json ├── html2text.json ├── inscriptis.json ├── justext.json ├── news_please.json ├── newspaper.json ├── readability.json ├── readability_js.json ├── trafilatura.json └── xpath-text.json ├── requirements.txt └── tests.py /.github/workflows/evaluate.yml: -------------------------------------------------------------------------------- 1 | name: Evaluate 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: 7 | - master 8 | 9 | jobs: 10 | evaluator: 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | - uses: actions/checkout@v2 15 | 16 | - name: Set up Python 3.7 17 | uses: actions/setup-python@v2 18 | with: 19 | python-version: '3.7' 20 | 21 | - name: Run Evaluation 22 | id: evaluation 23 | run: | 24 | RESULT=$(python3 evaluate.py) 25 | echo "${RESULT}" 26 | # hack for multiline output 27 | RESULT="${RESULT//'%'/'%25'}" 28 | RESULT="${RESULT//$'\n'/'%0A'}" 29 | RESULT="${RESULT//$'\r'/'%0D'}" 30 | echo "::set-output name=result::${RESULT}" 31 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .mypy_cache/ 2 | __pycache__/ 3 | extractors/go_readability/go_readability_cli 4 | extractors/go_domdistiller/go_domdistiller_cli 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Scrapinghub 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Article extraction benchmark: open-source libraries and commercial services 2 | =========================================================================== 3 | 4 | We evaluate the quality of article body 5 | extraction for commercial services 6 | `Zyte Automatic Extraction (ours) `_, 7 | `Diffbot `_ 8 | and open-source libraries 9 | `newspaper3k `_, 10 | `readability-lxml `_, 11 | `dragnet `_, 12 | `boilerpipe `_, 13 | `html-text `_, 14 | `trafilatura `_, 15 | `go-readability `_, 16 | `Readability.js `_, 17 | `Go-DomDistiller `_. 18 | `news-please `_. 19 | `Goose3 `_, 20 | `inscriptis `_, 21 | `html2text `_, 22 | `jusText `_, 23 | `BeautifulSoup `_. 24 | We release evaluation datasets and scripts, 25 | and provide more details in a whitepaper. 26 | 27 | Article extraction is a task of extracting certain fields of an article 28 | (e.g. news or blog post), such as headline, article body, publication date, 29 | authors, etc. Article extraction systems must work on any web-site. 30 | Here we evaluate only the article body field, as this is one of the most important fields 31 | and one of the hardest to get right. 32 | 33 | .. contents:: 34 | 35 | Results 36 | ------- 37 | 38 | Results of the initial evaluation, done in November 2019:: 39 | 40 | version F1 precision recall accuracy 41 | AutoExtract Nov 2019 0.970 ± 0.005 0.984 ± 0.002 0.956 ± 0.010 0.470 ± 0.037 42 | Diffbot Nov 2019 0.951 ± 0.010 0.958 ± 0.009 0.944 ± 0.013 0.348 ± 0.038 43 | boilerpipe ab3694d 0.860 ± 0.016 0.850 ± 0.016 0.870 ± 0.020 0.006 ± 0.006 44 | dragnet 1b65e7b 0.907 ± 0.014 0.925 ± 0.013 0.889 ± 0.019 0.221 ± 0.030 45 | html-text 0.5.1 0.665 ± 0.015 0.500 ± 0.017 0.994 ± 0.001 0.000 ± 0.000 46 | newspaper3k 0.2.8 0.912 ± 0.014 0.917 ± 0.014 0.906 ± 0.018 0.260 ± 0.032 47 | readability-lxml 0.7.1 0.922 ± 0.014 0.913 ± 0.014 0.931 ± 0.016 0.315 ± 0.035 48 | xpath-text 4.4.2 0.394 ± 0.020 0.246 ± 0.016 0.992 ± 0.001 0.000 ± 0.000 49 | 50 | Result of packages added after original evaluation:: 51 | 52 | version F1 precision recall accuracy 53 | trafilatura 0.5.1 0.945 ± 0.009 0.925 ± 0.011 0.966 ± 0.009 0.221 ± 0.031 54 | go_readability bdc8717 0.943 ± 0.007 0.912 ± 0.009 0.975 ± 0.007 0.210 ± 0.030 55 | readability_js Feb 2021 0.887 ± 0.012 0.853 ± 0.013 0.924 ± 0.012 0.149 ± 0.026 56 | go_domdistiller 1c90a88 0.927 ± 0.007 0.901 ± 0.010 0.956 ± 0.010 0.066 ± 0.018 57 | news_please 1.5.17 0.911 ± 0.014 0.917 ± 0.013 0.906 ± 0.018 0.249 ± 0.032 58 | goose3 3.1.8 0.887 ± 0.016 0.930 ± 0.015 0.847 ± 0.021 0.227 ± 0.032 59 | inscriptis 1.1.2 0.679 ± 0.015 0.517 ± 0.017 0.993 ± 0.001 0.000 ± 0.000 60 | html2text 2020.1.16 0.662 ± 0.015 0.499 ± 0.017 0.983 ± 0.002 0.000 ± 0.000 61 | justext 2.2.0 0.802 ± 0.018 0.858 ± 0.017 0.754 ± 0.028 0.088 ± 0.021 62 | beautifulsoup 4.9.3 0.665 ± 0.015 0.499 ± 0.017 0.994 ± 0.001 0.000 ± 0.000 63 | 64 | Below you can find more details about the packages and result reproduction. 65 | 66 | More details 67 | ------------ 68 | 69 | More details are available: 70 | 71 | - In the whitepaper at https://www.zyte.com/whitepaper-ebook/in-depth-analysis-and-evaluation-on-the-quality-of-article-body-extraction/ 72 | - In a technical report attached to the v1.0.0 release at 73 | https://github.com/scrapinghub/article-extraction-benchmark/releases/tag/v1.0.0 74 | 75 | Installation 76 | ------------ 77 | 78 | Clone this repo, and use Python 3.6+. 79 | 80 | Evaluation does not require any dependencies. 81 | Dependencies listed in ``requirements.txt`` are only for re-generating 82 | output files for open-source article extraction libraries. 83 | See below for their installation details. 84 | 85 | Data 86 | ---- 87 | 88 | JSON data format: a dictionary which maps item ids to dictionaries, 89 | with the following fields: 90 | 91 | - ``articleBody``: text of the article 92 | - ``url``: page url (optional) 93 | 94 | All files should have the same keys. 95 | Ground truth is in ``ground-truth.json``, 96 | predictions from different systems is in ``output/*.json`` files. 97 | 98 | HTML files are in ``html`` folder. They were fetched with Splash headless 99 | browser with JS disabled by default. They are gzip-compressed and utf-8 encoded. 100 | 101 | Screenshots of all pages are not in the repo, they are available on github 102 | in the "Releases" section: https://github.com/scrapinghub/article-extraction-benchmark/releases 103 | 104 | Open-source libraries 105 | --------------------- 106 | 107 | In addition to benchmarking AutoExtract and Diffbot services, we also benchmark several 108 | open-source libraries that work directly on HTML files without a need for rendering 109 | or external resources: 110 | 111 | - newspaper3k: https://github.com/codelucas/newspaper 112 | - readability-lxml: https://github.com/buriy/python-readability 113 | - dragnet: https://github.com/dragnet-org/dragnet 114 | - boilerpipe: https://github.com/misja/python-boilerpipe 115 | - html-text: https://github.com/TeamHG-Memex/html-text - 116 | this is a baseline which extracts the full text of HTML page 117 | - trafilatura: https://github.com/adbar/trafilatura contributed by the author 118 | at https://github.com/scrapinghub/article-extraction-benchmark/pull/4 119 | - go-readability: https://github.com/go-shiori/go-readability 120 | - Readability.js: https://github.com/mozilla/readability 121 | - Go-DomDistiller: https://github.com/markusmobius/go-domdistiller 122 | - news-please: https://github.com/fhamborg/news-please 123 | - Goose3: https://github.com/goose3/goose3 124 | - inscriptis: https://github.com/weblyzard/inscriptis - 125 | converts HTML to text with a particular emphasis on nested tables 126 | - html2text: https://github.com/Alir3z4/html2text - 127 | converts HTML pages to Markup language 128 | - jusText: https://github.com/miso-belica/jusText - 129 | Heuristic based boilerplate removal tool 130 | - BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/bs4/doc/ - 131 | Python library for pulling data out of HTML and XML files. 132 | 133 | Output from these libraries is already present in the repo in ``output/*.json`` files. 134 | They were generated with ``extractors/run_*.py`` files. 135 | 136 | All dependencies are in ``requirements.txt``. 137 | Note that dragnet may fail to install at first try, as 138 | you need to have ``numpy`` and ``Cython`` installed, and have ``libxml2`` headers 139 | (``libxml2-dev`` on Ubuntu). 140 | 141 | boilerpipe requires a custom installation: use python2, you also need Java 142 | (e.g. install ``default-jre`` in Ubuntu), install it with 143 | ``pip install -e git+https://github.com/misja/python-boilerpipe.git@ab3694d7bf695b73f0684a028e70aa816d63e6cb#egg=boilerpipe`` 144 | 145 | go-readability requires a custom installation: see README in ``extractors/go_readability``. 146 | 147 | Readability.js require a custom installation: install nodejs and install cli tool: 148 | ``npm install -g readability-cli@2.2.1-pre`` 149 | 150 | Go-DomDistiller requires a custom installation: see README in ``extractors/go_domdistiller``. 151 | 152 | Evaluation 153 | ---------- 154 | 155 | For evaluation, run:: 156 | 157 | python3 evaluate.py 158 | 159 | We report precision, recall, F1, accuracy and their standard deviation estimated with bootstrap. 160 | Please refer to the technical report for more details. 161 | 162 | License 163 | ------- 164 | 165 | License is MIT. 166 | -------------------------------------------------------------------------------- /evaluate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | from collections import Counter 4 | import json 5 | from pathlib import Path 6 | import random 7 | import re 8 | import statistics 9 | from typing import Any, Dict, Tuple, List 10 | 11 | 12 | def main(): 13 | """ Perform evaluation for all ``output/*.json`` files, 14 | loading ground truth from ``groud-truth.json``. 15 | Python3.6+ is required. 16 | """ 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--n-bootstrap', type=int, default=1000) 19 | parser.add_argument('--bootstrap-differences', action='store_true', 20 | help='run bootstrap for differences') 21 | parser.add_argument('--output', type=Path, help='output results as json') 22 | args = parser.parse_args() 23 | ground_truth = load_json(Path('ground-truth.json')) 24 | metrics_by_name = {} 25 | for path in sorted(Path('output').glob('*.json')): 26 | name = path.stem 27 | metrics = evaluate(ground_truth, load_json(path), args.n_bootstrap) 28 | print('{name:<20} ' 29 | 'precision={precision:.3f} ± {precision_std:.3f} ' 30 | 'recall={recall:.3f} ± {recall_std:.3f} ' 31 | 'F1={f1:.3f} ± {f1_std:.3f} ' 32 | 'accuracy={accuracy:.3f} ± {accuracy_std:.3f} ' 33 | .format(name=name, **metrics)) 34 | metrics_by_name[name] = metrics 35 | 36 | if args.bootstrap_differences: 37 | # check differences with bootstrap 38 | for name, metrics in sorted(metrics_by_name.items()): 39 | tp_fp_fns = metrics['tp_fp_fns'] 40 | for other_name, other_metrics in sorted(metrics_by_name.items()): 41 | if name >= other_name: 42 | continue 43 | print(f'Comparison: {name} minus {other_name}') 44 | other_tp_fp_fns = other_metrics['tp_fp_fns'] 45 | print_metrics_diff(tp_fp_fns, other_tp_fp_fns, args.n_bootstrap) 46 | 47 | if args.output: 48 | args.output.write_text( 49 | json.dumps(metrics_by_name, indent=4, sort_keys=True)) 50 | 51 | 52 | def evaluate( 53 | ground_truth: Dict[str, Dict], 54 | prediction: Dict[str, Dict], 55 | n_bootstrap: int, 56 | ) -> Dict[str, Any]: 57 | if ground_truth.keys() != prediction.keys(): 58 | raise ValueError('prediction keys do not match ground truth') 59 | tp_fp_fns = [] 60 | accuracies = [] 61 | for key in ground_truth.keys(): 62 | true = ground_truth[key].get('articleBody', '') 63 | pred = prediction[key].get('articleBody', '') 64 | tp_fp_fns.append(string_shingle_matching(true=true, pred=pred)) 65 | accuracies.append(get_accuracy(true=true, pred=pred)) 66 | metrics: Dict[str, Any] = metrics_from_tp_fp_fns(tp_fp_fns) 67 | metrics['tp_fp_fns'] = tp_fp_fns 68 | metrics['accuracy'] = statistics.mean(accuracies) 69 | 70 | # add bootstrap estimates of condifence intervals 71 | b_values: Dict[str, List[float]] = {} 72 | for _ in range(n_bootstrap): 73 | n = len(tp_fp_fns) 74 | indices = [random.randint(0, n - 1) for _ in range(n)] 75 | b_metrics = metrics_from_tp_fp_fns([tp_fp_fns[i] for i in indices]) 76 | for key in b_metrics: 77 | b_values.setdefault(key, []).append(b_metrics[key]) 78 | b_values.setdefault('accuracy', []).append( 79 | statistics.mean([accuracies[i] for i in indices])) 80 | for key, values in sorted(b_values.items()): 81 | metrics[f'{key}_std'] = statistics.stdev(values) 82 | 83 | return metrics 84 | 85 | 86 | def print_metrics_diff(tp_fp_fns, other_tp_fp_fns, n_bootstrap): 87 | diffs = {} 88 | for _ in range(n_bootstrap): 89 | n = len(tp_fp_fns) 90 | indices = [random.randint(0, n - 1) for _ in range(n)] 91 | metrics = metrics_from_tp_fp_fns([tp_fp_fns[i] for i in indices]) 92 | other_metrics = metrics_from_tp_fp_fns( 93 | [other_tp_fp_fns[i] for i in indices]) 94 | for key in metrics: 95 | diffs.setdefault(key, []).append(metrics[key] - other_metrics[key]) 96 | for key, values in sorted(diffs.items()): 97 | mean = statistics.mean(values) 98 | std = statistics.stdev(values) 99 | print(f'{key:<10} {mean:.3f} ± {std:.3f}') 100 | 101 | 102 | TP_FP_FN = Tuple[float, float, float] 103 | 104 | 105 | def metrics_from_tp_fp_fns(tp_fp_fns: List[TP_FP_FN]) -> Dict[str, float]: 106 | precision = statistics.mean([ 107 | precision_score(tp, fp, fn) for tp, fp, fn in tp_fp_fns 108 | if tp + fp > 0]) 109 | recall = statistics.mean([ 110 | recall_score(tp, fp, fn) for tp, fp, fn in tp_fp_fns 111 | if tp + fn > 0]) 112 | f1 = 2 * precision * recall / (precision + recall) 113 | return { 114 | 'f1': f1, 115 | 'precision': precision, 116 | 'recall': recall, 117 | } 118 | 119 | 120 | def precision_score(tp: float, fp: float, fn: float) -> float: 121 | if fp == fn == 0: 122 | return 1. 123 | if tp == fp == 0: 124 | return 0. 125 | return tp / (tp + fp) 126 | 127 | 128 | def recall_score(tp: float, fp: float, fn: float) -> float: 129 | if fp == fn == 0: 130 | return 1. 131 | if tp == fn == 0: 132 | return 0. 133 | return tp / (tp + fn) 134 | 135 | 136 | def get_accuracy(true: str, pred: str) -> float: 137 | return float(_tokenize(true) == _tokenize(pred)) 138 | 139 | 140 | def string_shingle_matching( 141 | true: str, pred: str, ngram_n: int = 4, 142 | ) -> TP_FP_FN: 143 | """ Compute TP/FP/FN across shingles (joined ngrams). 144 | Intended to be used for articleBody comparison, 145 | similar to the one used here (with shingles instead of tokens): 146 | https://moz.com/devblog/benchmarking-python-content-extraction-algorithms-dragnet-readability-goose-and-eatiht/ 147 | """ 148 | true_shingles = _all_shingles(true, ngram_n) 149 | pred_shingles = _all_shingles(pred, ngram_n) 150 | tp = fp = fn = 0. 151 | for key in (set(true_shingles) | set(pred_shingles)): 152 | true_count = true_shingles.get(key, 0) 153 | pred_count = pred_shingles.get(key, 0) 154 | tp += min(true_count, pred_count) 155 | fp += max(0, pred_count - true_count) 156 | fn += max(0, true_count - pred_count) 157 | tp_fp_fn = [tp, fp, fn] 158 | s = sum(tp_fp_fn) 159 | # Normalize metrics so that longer texts do not have more weight. 160 | if s > 0: 161 | tp_fp_fn = [x / s for x in tp_fp_fn] 162 | return tuple(tp_fp_fn) # type: ignore 163 | 164 | 165 | def _all_shingles(text: str, ngram_n: int) -> Dict[Tuple[str, ...], int]: 166 | return dict(Counter(_ngrams(text, ngram_n))) 167 | 168 | 169 | _TOKEN_RE = re.compile( 170 | r'\w+', re.UNICODE | re.MULTILINE | re.IGNORECASE | re.DOTALL) 171 | 172 | 173 | def _tokenize(text: str) -> List[str]: 174 | # Note that such simple tokenization will work ok for any language, 175 | # even if several words will be clumped together, as we expect 176 | # that extra predicted text will still be separated. 177 | return _TOKEN_RE.findall(text or '') 178 | 179 | 180 | def _ngrams(text: str, n: int) -> List[Tuple[str, ...]]: 181 | tokens = _tokenize(text) 182 | result = [] 183 | for i in range(0, max(1, len(tokens) - n + 1)): 184 | shingle = tuple(tokens[i: i + n]) 185 | if shingle: 186 | result.append(shingle) 187 | return result 188 | 189 | 190 | def load_json(path: Path): 191 | with path.open('rt', encoding='utf8') as f: 192 | return json.load(f) 193 | 194 | 195 | if __name__ == '__main__': 196 | main() 197 | -------------------------------------------------------------------------------- /extractors/go_domdistiller.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import gzip 3 | import json 4 | import os 5 | import subprocess 6 | from pathlib import Path 7 | from tempfile import mkstemp 8 | 9 | 10 | # built executable file 11 | CLI_PATH = Path('extractors/go_domdistiller/go_domdistiller_cli') 12 | 13 | 14 | def main(): 15 | output = {} 16 | for path in Path('html').glob('*.html.gz'): 17 | with gzip.open(path, 'rt', encoding='utf8') as f: 18 | html = f.read() 19 | item_id = path.stem.split('.')[0] 20 | 21 | # save html to temp file 22 | temp_filepath = mkstemp()[1] 23 | with open(temp_filepath, 'wt') as fw: 24 | fw.write(html) 25 | 26 | # get extracted content from go-domdistiller 27 | result = subprocess.run([CLI_PATH, temp_filepath], stdout=subprocess.PIPE) 28 | 29 | # destroy temp file 30 | os.remove(temp_filepath) 31 | 32 | output[item_id] = {'articleBody': result.stdout.decode('utf-8')} 33 | (Path('output') / 'go_domdistiller.json').write_text( 34 | json.dumps(output, sort_keys=True, ensure_ascii=False, indent=4), 35 | encoding='utf8') 36 | 37 | 38 | if __name__ == '__main__': 39 | main() 40 | -------------------------------------------------------------------------------- /extractors/go_domdistiller/README.rst: -------------------------------------------------------------------------------- 1 | Go-DomDistiller 2 | =============== 3 | 4 | Open Source article extractor written on golang: https://github.com/markusmobius/go-domdistiller. 5 | Based on `DOM Distiller `_ which is part of the Chromium project. 6 | The structure of this package follows the structure of the original Java code. 7 | 8 | Usage 9 | ----- 10 | 11 | To use the library I'm wrote a simple cli-module that reads the contents of the file passed in the arguments and outputs the parsing result to stdout. 12 | 13 | 14 | Installation 15 | ------------ 16 | 17 | 1. Install golang (I'm used version ``1.15.8``) 18 | 2. Go to the folder containing this file 19 | 3. Build an executable file: 20 | 21 | go build -o go_domdistiller_cli 22 | -------------------------------------------------------------------------------- /extractors/go_domdistiller/cli.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | 7 | distiller "github.com/markusmobius/go-domdistiller" 8 | ) 9 | 10 | func main() { 11 | if len(os.Args) < 2 { 12 | panic("Input file not provided in args") 13 | } 14 | if len(os.Args) > 2 { 15 | panic("Args accept only one argument") 16 | } 17 | input := os.Args[1] 18 | 19 | opts := &distiller.Options{ 20 | ExtractTextOnly: true, 21 | SkipPagination: true, 22 | } 23 | 24 | article, err := distiller.ApplyForFile(input, opts) 25 | if err != nil { 26 | panic(err) 27 | } 28 | 29 | fmt.Print(article.HTML) 30 | } 31 | -------------------------------------------------------------------------------- /extractors/go_domdistiller/go.mod: -------------------------------------------------------------------------------- 1 | module cli 2 | 3 | go 1.15 4 | 5 | require github.com/markusmobius/go-domdistiller v0.0.0-20201222130639-1c90a88d11c2 6 | -------------------------------------------------------------------------------- /extractors/go_domdistiller/go.sum: -------------------------------------------------------------------------------- 1 | github.com/andybalholm/cascadia v1.2.0 h1:vuRCkM5Ozh/BfmsaTm26kbjm0mIOM3yS5Ek/F5h18aE= 2 | github.com/andybalholm/cascadia v1.2.0/go.mod h1:YCyR8vOZT9aZ1CHEd8ap0gMVm2aFgxBp0T0eFw1RUQY= 3 | github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= 4 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 5 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 6 | github.com/go-shiori/dom v0.0.0-20201011032054-d6b74a54fe52 h1:wEe9mu6BOmGYT5yQ9ag5E38LHUMUv7/AFx0J8YNR8HI= 7 | github.com/go-shiori/dom v0.0.0-20201011032054-d6b74a54fe52/go.mod h1:aLEd5DGjh1qYKnJJ/tC5OL0f3CV4CMcreDOn4RpCmUc= 8 | github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= 9 | github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= 10 | github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= 11 | github.com/markusmobius/go-domdistiller v0.0.0-20201222130639-1c90a88d11c2 h1:Zq0OEILmCXTWQdMd1p8a7wk0RvuEfF70ON859jM1n7g= 12 | github.com/markusmobius/go-domdistiller v0.0.0-20201222130639-1c90a88d11c2/go.mod h1:EjE7+WYAL0k+KQX8viF0oy/MH7uKMXmhasAdedoSC3o= 13 | github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= 14 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 15 | github.com/sirupsen/logrus v1.7.0 h1:ShrD1U9pZB12TX0cVy0DtePoCH97K8EtX+mg7ZARUtM= 16 | github.com/sirupsen/logrus v1.7.0/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0= 17 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 18 | github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= 19 | github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= 20 | github.com/yosssi/gohtml v0.0.0-20201013000340-ee4748c638f4/go.mod h1:+ccdNT0xMY1dtc5XBxumbYfOUhmduiGudqaDgD2rVRE= 21 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= 22 | golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= 23 | golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= 24 | golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= 25 | golang.org/x/net v0.0.0-20201010224723-4f7140c49acb/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= 26 | golang.org/x/net v0.0.0-20201031054903-ff519b6c9102 h1:42cLlJJdEh+ySyeUUbEQ5bsTiq8voBeTuweGVkY6Puw= 27 | golang.org/x/net v0.0.0-20201031054903-ff519b6c9102/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= 28 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 29 | golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 30 | golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 31 | golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f h1:+Nyd8tzPX9R7BWHguqsrbFdRx3WQ/1ib8I44HXV5yTA= 32 | golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 33 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= 34 | golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= 35 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 36 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 37 | gopkg.in/check.v1 v1.0.0-20200902074654-038fdea0a05b/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 38 | gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 39 | gopkg.in/yaml.v3 v3.0.0-20200615113413-eeeca48fe776/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 40 | -------------------------------------------------------------------------------- /extractors/go_readability/README.rst: -------------------------------------------------------------------------------- 1 | Go-Readability 2 | ============== 3 | 4 | Open Source article extractor written on golang: https://github.com/go-shiori/go-readability . Based from `Readability.js `_ by Mozilla, and written line by line to make sure it looks and works as similar as possible. 5 | 6 | Usage 7 | ----- 8 | 9 | To use the library I'm wrote a simple cli-module that reads the contents of the file passed in the arguments and outputs the parsing result to stdout. 10 | 11 | 12 | Installation 13 | ------------ 14 | 15 | 1. Install golang (I'm used version ``1.15.8``) 16 | 2. Go to the folder containing this file 17 | 3. Build an executable file: 18 | 19 | go build -o go_readability_cli 20 | -------------------------------------------------------------------------------- /extractors/go_readability/cli.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | 7 | readability "github.com/go-shiori/go-readability" 8 | ) 9 | 10 | func main() { 11 | if len(os.Args) < 2 { 12 | panic("Input file not provided in args") 13 | } 14 | if len(os.Args) > 2 { 15 | panic("Args accept only one argument") 16 | } 17 | input := os.Args[1] 18 | 19 | fSrc, err := os.Open(input) 20 | defer fSrc.Close() 21 | if err != nil { 22 | panic(err) 23 | } 24 | 25 | article, err := readability.FromReader(fSrc, "https://fake-url.com") 26 | if err != nil { 27 | panic(err) 28 | } 29 | 30 | fmt.Print(article.TextContent) 31 | } 32 | -------------------------------------------------------------------------------- /extractors/go_readability/go.mod: -------------------------------------------------------------------------------- 1 | module go_readability_cli 2 | 3 | go 1.15 4 | 5 | require github.com/go-shiori/go-readability v0.0.0-20201011032228-bdc871772408 6 | -------------------------------------------------------------------------------- /extractors/go_readability/go.sum: -------------------------------------------------------------------------------- 1 | cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= 2 | github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= 3 | github.com/OneOfOne/xxhash v1.2.2/go.mod h1:HSdplMjZKSmBqAxg5vPj2TmRDmfkzw+cTzAElWljhcU= 4 | github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= 5 | github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= 6 | github.com/andybalholm/cascadia v1.2.0 h1:vuRCkM5Ozh/BfmsaTm26kbjm0mIOM3yS5Ek/F5h18aE= 7 | github.com/andybalholm/cascadia v1.2.0/go.mod h1:YCyR8vOZT9aZ1CHEd8ap0gMVm2aFgxBp0T0eFw1RUQY= 8 | github.com/armon/consul-api v0.0.0-20180202201655-eb2c6b5be1b6/go.mod h1:grANhF5doyWs3UAsr3K4I6qtAmlQcZDesFNEHPZAzj8= 9 | github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q= 10 | github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8= 11 | github.com/cespare/xxhash v1.1.0/go.mod h1:XrSqR1VqqWfGrhpAt58auRo0WTKS1nRRg3ghfAqPWnc= 12 | github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= 13 | github.com/coreos/bbolt v1.3.2/go.mod h1:iRUV2dpdMOn7Bo10OQBFzIJO9kkE559Wcmn+qkEiiKk= 14 | github.com/coreos/etcd v3.3.10+incompatible/go.mod h1:uF7uidLiAD3TWHmW31ZFd/JWoc32PjwdhPthX9715RE= 15 | github.com/coreos/go-semver v0.2.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3EedlOD2RNk= 16 | github.com/coreos/go-systemd v0.0.0-20190321100706-95778dfbb74e/go.mod h1:F5haX7vjVVG0kc13fIWeqUViNPyEJxv/OmvnBo0Yme4= 17 | github.com/coreos/pkg v0.0.0-20180928190104-399ea9e2e55f/go.mod h1:E3G3o1h8I7cfcXa63jLwjI0eiQQMgzzUDFVpN/nH/eA= 18 | github.com/cpuguy83/go-md2man/v2 v2.0.0 h1:EoUDS0afbrsXAZ9YQ9jdu/mZ2sXgT1/2yyNng4PGlyM= 19 | github.com/cpuguy83/go-md2man/v2 v2.0.0/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU= 20 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 21 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 22 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 23 | github.com/dgrijalva/jwt-go v3.2.0+incompatible/go.mod h1:E3ru+11k8xSBh+hMPgOLZmtrrCbhqsmaPHjLKYnJCaQ= 24 | github.com/dgryski/go-sip13 v0.0.0-20181026042036-e10d5fee7954/go.mod h1:vAd38F8PWV+bWy6jNmig1y/TA+kYO4g3RSRF0IAv0no= 25 | github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= 26 | github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04= 27 | github.com/go-kit/kit v0.8.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2as= 28 | github.com/go-logfmt/logfmt v0.3.0/go.mod h1:Qt1PoO58o5twSAckw1HlFXLmHsOX5/0LbT9GBnD5lWE= 29 | github.com/go-logfmt/logfmt v0.4.0/go.mod h1:3RMwSq7FuexP4Kalkev3ejPJsZTpXXBr9+V4qmtdjCk= 30 | github.com/go-shiori/dom v0.0.0-20201011032054-d6b74a54fe52 h1:wEe9mu6BOmGYT5yQ9ag5E38LHUMUv7/AFx0J8YNR8HI= 31 | github.com/go-shiori/dom v0.0.0-20201011032054-d6b74a54fe52/go.mod h1:aLEd5DGjh1qYKnJJ/tC5OL0f3CV4CMcreDOn4RpCmUc= 32 | github.com/go-shiori/go-readability v0.0.0-20201011032228-bdc871772408 h1:xq7Sck0bwvgp/WWw6tHFDn3dUTCQwWRWLudr+inH/gs= 33 | github.com/go-shiori/go-readability v0.0.0-20201011032228-bdc871772408/go.mod h1:sz+ASCdyPdgLAjKpovPn+u+IZjBoGp7vWa1w1yZfi3Y= 34 | github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY= 35 | github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ= 36 | github.com/gogo/protobuf v1.2.1/go.mod h1:hp+jE20tsWTFYpLwKvXlhS1hjn+gTNwPg2I6zVXpSg4= 37 | github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= 38 | github.com/golang/groupcache v0.0.0-20190129154638-5b532d6fd5ef/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= 39 | github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= 40 | github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= 41 | github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= 42 | github.com/google/btree v1.0.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= 43 | github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= 44 | github.com/gorilla/websocket v1.4.0/go.mod h1:E7qHFY5m1UJ88s3WnNqhKjPHQ0heANvMoAMk2YaljkQ= 45 | github.com/grpc-ecosystem/go-grpc-middleware v1.0.0/go.mod h1:FiyG127CGDf3tlThmgyCl78X/SZQqEOJBCDaAfeWzPs= 46 | github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0/go.mod h1:8NvIoxWQoOIhqOTXgfV/d3M/q6VIi02HzZEHgUlZvzk= 47 | github.com/grpc-ecosystem/grpc-gateway v1.9.0/go.mod h1:vNeuVxBJEsws4ogUvrchl83t/GYV9WGTSLVdBhOQFDY= 48 | github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ= 49 | github.com/inconshreveable/mousetrap v1.0.0/go.mod h1:PxqpIevigyE2G7u3NXJIT2ANytuPF1OarO4DADm73n8= 50 | github.com/jonboulle/clockwork v0.1.0/go.mod h1:Ii8DK3G1RaLaWxj9trq07+26W01tbo22gdxWY5EU2bo= 51 | github.com/julienschmidt/httprouter v1.2.0/go.mod h1:SYymIcj16QtmaHHD7aYtjjsJG7VTCxuUUipMqKk8s4w= 52 | github.com/kisielk/errcheck v1.1.0/go.mod h1:EZBBE59ingxPouuu3KfxchcWSUPOHkagtvWXihfKN4Q= 53 | github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= 54 | github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= 55 | github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFBFZlji/RkVcI2GknAs/DXo4wKdlNEc= 56 | github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI= 57 | github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= 58 | github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= 59 | github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE= 60 | github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= 61 | github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ= 62 | github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0= 63 | github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0= 64 | github.com/mitchellh/mapstructure v1.1.2/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y= 65 | github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= 66 | github.com/oklog/ulid v1.3.1/go.mod h1:CirwcVhetQ6Lv90oh/F+FBtV6XMibvdAFo93nm5qn4U= 67 | github.com/pelletier/go-toml v1.2.0/go.mod h1:5z9KED0ma1S8pY6P1sdut58dfprrGBbd/94hg7ilaic= 68 | github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= 69 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 70 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 71 | github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw= 72 | github.com/prometheus/client_golang v0.9.3/go.mod h1:/TN21ttK/J9q6uSwhBd54HahCDft0ttaMvbicHlPoso= 73 | github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo= 74 | github.com/prometheus/client_model v0.0.0-20190129233127-fd36f4220a90/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= 75 | github.com/prometheus/common v0.0.0-20181113130724-41aa239b4cce/go.mod h1:daVV7qP5qjZbuso7PdcryaAu0sAZbrN9i7WWcTMWvro= 76 | github.com/prometheus/common v0.4.0/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y86RQel1bk4= 77 | github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk= 78 | github.com/prometheus/procfs v0.0.0-20190507164030-5867b95ac084/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA= 79 | github.com/prometheus/tsdb v0.7.1/go.mod h1:qhTCs0VvXwvX/y3TZrWD7rabWM+ijKTux40TwIPHuXU= 80 | github.com/rogpeppe/fastuuid v0.0.0-20150106093220-6724a57986af/go.mod h1:XWv6SoW27p1b0cqNHllgS5HIMJraePCO15w5zCzIWYg= 81 | github.com/russross/blackfriday/v2 v2.0.1 h1:lPqVAte+HuHNfhJ/0LC98ESWRz8afy9tM/0RK8m9o+Q= 82 | github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= 83 | github.com/sergi/go-diff v1.1.0 h1:we8PVUC3FE2uYfodKH/nBHMSetSfHDR6scGdBi+erh0= 84 | github.com/sergi/go-diff v1.1.0/go.mod h1:STckp+ISIX8hZLjrqAeVduY0gWCT9IjLuqbuNXdaHfM= 85 | github.com/shurcooL/sanitized_anchor_name v1.0.0 h1:PdmoCO6wvbs+7yrJyMORt4/BmY5IYyJwS/kOiWx8mHo= 86 | github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc= 87 | github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo= 88 | github.com/sirupsen/logrus v1.7.0 h1:ShrD1U9pZB12TX0cVy0DtePoCH97K8EtX+mg7ZARUtM= 89 | github.com/sirupsen/logrus v1.7.0/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0= 90 | github.com/soheilhy/cmux v0.1.4/go.mod h1:IM3LyeVVIOuxMH7sFAkER9+bJ4dT7Ms6E4xg4kGIyLM= 91 | github.com/spaolacci/murmur3 v0.0.0-20180118202830-f09979ecbc72/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA= 92 | github.com/spf13/afero v1.1.2/go.mod h1:j4pytiNVoe2o6bmDsKpLACNPDBIoEAkihy7loJ1B0CQ= 93 | github.com/spf13/cast v1.3.0/go.mod h1:Qx5cxh0v+4UWYiBimWS+eyWzqEqokIECu5etghLkUJE= 94 | github.com/spf13/cobra v1.0.0/go.mod h1:/6GTrnGXV9HjY+aR4k0oJ5tcvakLuG6EuKReYlHNrgE= 95 | github.com/spf13/jwalterweatherman v1.0.0/go.mod h1:cQK4TGJAtQXfYWX+Ddv3mKDzgVb68N+wFjFa4jdeBTo= 96 | github.com/spf13/pflag v1.0.3/go.mod h1:DYY7MBk1bdzusC3SYhjObp+wFpr4gzcvqqNjLnInEg4= 97 | github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= 98 | github.com/spf13/viper v1.4.0/go.mod h1:PTJ7Z/lr49W6bUbkmS1V3by4uWynFiR9p7+dSq/yZzE= 99 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 100 | github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 101 | github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= 102 | github.com/stretchr/testify v1.4.0 h1:2E4SXV/wtOkTonXsotYi4li6zVWxYlZuYNCXe9XRJyk= 103 | github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= 104 | github.com/tmc/grpc-websocket-proxy v0.0.0-20190109142713-0ad062ec5ee5/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U= 105 | github.com/ugorji/go v1.1.4/go.mod h1:uQMGLiO92mf5W77hV/PUCpI3pbzQx3CRekS0kk+RGrc= 106 | github.com/xiang90/probing v0.0.0-20190116061207-43a291ad63a2/go.mod h1:UETIi67q53MR2AWcXfiuqkDkRtnGDLqkBTpCHuJHxtU= 107 | github.com/xordataexchange/crypt v0.0.3-0.20170626215501-b2862e3d0a77/go.mod h1:aYKd//L2LvnjZzWKhF00oedf4jCCReLcmhLdhm1A27Q= 108 | go.etcd.io/bbolt v1.3.2/go.mod h1:IbVyRI1SCnLcuJnV2u8VeU0CEYM7e686BmAb1XKL+uU= 109 | go.uber.org/atomic v1.4.0/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE= 110 | go.uber.org/multierr v1.1.0/go.mod h1:wR5kodmAFQ0UK8QlbwjlSNy0Z68gJhDJUG5sjR94q/0= 111 | go.uber.org/zap v1.10.0/go.mod h1:vwi/ZaCAaUcBkycHslxD9B2zi4UTXhF60s6SWpuDF0Q= 112 | golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= 113 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= 114 | golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= 115 | golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= 116 | golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= 117 | golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= 118 | golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= 119 | golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= 120 | golang.org/x/net v0.0.0-20181220203305-927f97764cc3/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= 121 | golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= 122 | golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= 123 | golang.org/x/net v0.0.0-20190522155817-f3200d17e092/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= 124 | golang.org/x/net v0.0.0-20201010224723-4f7140c49acb h1:mUVeFHoDKis5nxCAzoAi7E8Ghb86EXh/RK6wtvJIqRY= 125 | golang.org/x/net v0.0.0-20201010224723-4f7140c49acb/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= 126 | golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= 127 | golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 128 | golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 129 | golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 130 | golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 131 | golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 132 | golang.org/x/sys v0.0.0-20181107165924-66b7b1311ac8/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 133 | golang.org/x/sys v0.0.0-20181116152217-5ac8a444bdc5/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 134 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 135 | golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 136 | golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 137 | golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f h1:+Nyd8tzPX9R7BWHguqsrbFdRx3WQ/1ib8I44HXV5yTA= 138 | golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 139 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= 140 | golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= 141 | golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= 142 | golang.org/x/tools v0.0.0-20180221164845-07fd8470d635/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 143 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 144 | golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 145 | golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= 146 | google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= 147 | google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= 148 | google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= 149 | google.golang.org/grpc v1.21.0/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM= 150 | gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw= 151 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 152 | gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 153 | gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo= 154 | gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 155 | gopkg.in/resty.v1 v1.12.0/go.mod h1:mDo4pnntr5jdWRML875a/NmxYqAlA73dVijT2AXvQQo= 156 | gopkg.in/yaml.v2 v2.0.0-20170812160011-eb3733d160e7/go.mod h1:JAlM8MvJe8wmxCU4Bli9HhUf9+ttbYbLASfIpnQbh74= 157 | gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= 158 | gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= 159 | gopkg.in/yaml.v2 v2.2.4 h1:/eiJrUcujPVeJ3xlSWaiNi3uSVmDGBK1pDHUHAnao1I= 160 | gopkg.in/yaml.v2 v2.2.4/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= 161 | honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= 162 | -------------------------------------------------------------------------------- /extractors/run_beautifulsoup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import gzip 3 | import json 4 | from pathlib import Path 5 | 6 | from bs4 import BeautifulSoup 7 | 8 | 9 | def main(): 10 | output = {} 11 | for path in Path('html').glob('*.html.gz'): 12 | with gzip.open(path, 'rt', encoding='utf8') as f: 13 | html = f.read() 14 | item_id = path.stem.split('.')[0] 15 | bs = BeautifulSoup(html, 'html.parser') 16 | article = bs.get_text(separator=' ', strip=True) 17 | output[item_id] = {'articleBody': article} 18 | (Path('output') / 'beautifulsoup.json').write_text( 19 | json.dumps(output, sort_keys=True, ensure_ascii=False, indent=4), 20 | encoding='utf8') 21 | 22 | 23 | if __name__ == '__main__': 24 | main() 25 | -------------------------------------------------------------------------------- /extractors/run_boilerpipe.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | import codecs 3 | import gzip 4 | import json 5 | import glob 6 | import os.path 7 | 8 | from boilerpipe.extract import Extractor 9 | 10 | 11 | def main(): 12 | output = {} 13 | for path in glob.glob('html/*.html.gz'): 14 | with gzip.open(path, 'rb') as f: 15 | html = f.read().decode('utf8') 16 | item_id = os.path.basename(path).split('.')[0] 17 | extractor = Extractor(extractor='ArticleExtractor', html=html) 18 | output[item_id] = {'articleBody': extractor.getText()} 19 | with codecs.open(os.path.join('output', 'boilerpipe.json'), 20 | 'wt', encoding='utf8') as f: 21 | json.dump(output, f, sort_keys=True, ensure_ascii=False, indent=4) 22 | 23 | 24 | if __name__ == '__main__': 25 | main() 26 | -------------------------------------------------------------------------------- /extractors/run_dragnet.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import gzip 3 | import json 4 | from pathlib import Path 5 | 6 | from dragnet import extract_content 7 | 8 | 9 | def main(): 10 | output = {} 11 | for path in Path('html').glob('*.html.gz'): 12 | with gzip.open(path, 'rt', encoding='utf8') as f: 13 | html = f.read() 14 | item_id = path.stem.split('.')[0] 15 | content = extract_content(html, encoding='utf8') 16 | output[item_id] = {'articleBody': content} 17 | (Path('output') / 'dragnet.json').write_text( 18 | json.dumps(output, sort_keys=True, ensure_ascii=False, indent=4), 19 | encoding='utf8') 20 | 21 | 22 | if __name__ == '__main__': 23 | main() 24 | -------------------------------------------------------------------------------- /extractors/run_go_readability.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import gzip 3 | import json 4 | import os 5 | import subprocess 6 | from pathlib import Path 7 | from tempfile import mkstemp 8 | 9 | 10 | # built executable file 11 | CLI_PATH = Path('extractors/go_readability/go_readability_cli') 12 | 13 | 14 | def main(): 15 | output = {} 16 | for path in Path('html').glob('*.html.gz'): 17 | with gzip.open(path, 'rt', encoding='utf8') as f: 18 | html = f.read() 19 | item_id = path.stem.split('.')[0] 20 | 21 | # save html to temp file 22 | temp_filepath = mkstemp()[1] 23 | with open(temp_filepath, 'wt') as fw: 24 | fw.write(html) 25 | 26 | # get extracted content from go-readadbility 27 | result = subprocess.run([CLI_PATH, temp_filepath], stdout=subprocess.PIPE) 28 | 29 | # destroy temp file 30 | os.remove(temp_filepath) 31 | 32 | output[item_id] = {'articleBody': result.stdout.decode('utf-8')} 33 | (Path('output') / 'go_readability.json').write_text( 34 | json.dumps(output, sort_keys=True, ensure_ascii=False, indent=4), 35 | encoding='utf8') 36 | 37 | 38 | if __name__ == '__main__': 39 | main() 40 | -------------------------------------------------------------------------------- /extractors/run_goose3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import gzip 3 | import json 4 | from pathlib import Path 5 | 6 | from goose3 import Goose 7 | 8 | 9 | def main(): 10 | output = {} 11 | for path in Path('html').glob('*.html.gz'): 12 | with gzip.open(path, 'rt', encoding='utf8') as f: 13 | html = f.read() 14 | item_id = path.stem.split('.')[0] 15 | g = Goose() 16 | article = g.extract(raw_html=html) 17 | output[item_id] = {'articleBody': article.cleaned_text} 18 | (Path('output') / 'goose3.json').write_text( 19 | json.dumps(output, sort_keys=True, ensure_ascii=False, indent=4), 20 | encoding='utf8') 21 | 22 | 23 | if __name__ == '__main__': 24 | main() 25 | -------------------------------------------------------------------------------- /extractors/run_html2text.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import gzip 3 | import json 4 | from pathlib import Path 5 | 6 | from html2text import HTML2Text 7 | 8 | 9 | def main(): 10 | output = {} 11 | for path in Path('html').glob('*.html.gz'): 12 | with gzip.open(path, 'rt', encoding='utf8') as f: 13 | html = f.read() 14 | item_id = path.stem.split('.')[0] 15 | h = HTML2Text() 16 | h.ignore_links = True 17 | h.ignore_images = True 18 | content = h.handle(html) 19 | output[item_id] = {'articleBody': content} 20 | (Path('output') / 'html2text.json').write_text( 21 | json.dumps(output, sort_keys=True, ensure_ascii=False, indent=4), 22 | encoding='utf8') 23 | 24 | 25 | if __name__ == '__main__': 26 | main() 27 | -------------------------------------------------------------------------------- /extractors/run_html_text.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import gzip 3 | import json 4 | from pathlib import Path 5 | 6 | import html_text 7 | 8 | 9 | def main(): 10 | output = {} 11 | for path in Path('html').glob('*.html.gz'): 12 | with gzip.open(path, 'rt', encoding='utf8') as f: 13 | html = f.read() 14 | item_id = path.stem.split('.')[0] 15 | output[item_id] = {'articleBody': html_text.extract_text(html)} 16 | (Path('output') / 'html-text.json').write_text( 17 | json.dumps(output, sort_keys=True, ensure_ascii=False, indent=4), 18 | encoding='utf8') 19 | 20 | 21 | if __name__ == '__main__': 22 | main() 23 | -------------------------------------------------------------------------------- /extractors/run_inscriptis.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import gzip 3 | import json 4 | from pathlib import Path 5 | 6 | from inscriptis import get_text 7 | 8 | 9 | def main(): 10 | output = {} 11 | for path in Path('html').glob('*.html.gz'): 12 | with gzip.open(path, 'rt', encoding='utf8') as f: 13 | html = f.read() 14 | item_id = path.stem.split('.')[0] 15 | content = get_text(html) 16 | output[item_id] = {'articleBody': content} 17 | (Path('output') / 'inscriptis.json').write_text( 18 | json.dumps(output, sort_keys=True, ensure_ascii=False, indent=4), 19 | encoding='utf8') 20 | 21 | 22 | if __name__ == '__main__': 23 | main() 24 | -------------------------------------------------------------------------------- /extractors/run_justext.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import gzip 3 | import json 4 | from pathlib import Path 5 | 6 | import justext 7 | 8 | 9 | def main(): 10 | output = {} 11 | for path in Path('html').glob('*.html.gz'): 12 | with gzip.open(path, 'rt', encoding='utf8') as f: 13 | html = f.read() 14 | item_id = path.stem.split('.')[0] 15 | article = ' '.join( 16 | [p.text for p in justext.justext(html, justext.get_stoplist("English"), 50, 200, 0.1, 0.2, 0.2, 200, True) 17 | if not p.is_boilerplate]) 18 | output[item_id] = {'articleBody': article} 19 | (Path('output') / 'justext.json').write_text( 20 | json.dumps(output, sort_keys=True, ensure_ascii=False, indent=4), 21 | encoding='utf8') 22 | 23 | 24 | if __name__ == '__main__': 25 | main() 26 | -------------------------------------------------------------------------------- /extractors/run_news_please.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import gzip 3 | import json 4 | from pathlib import Path 5 | 6 | from newsplease import NewsPlease 7 | 8 | 9 | def main(): 10 | output = {} 11 | for path in Path('html').glob('*.html.gz'): 12 | with gzip.open(path, 'rt', encoding='utf8') as f: 13 | html = f.read() 14 | item_id = path.stem.split('.')[0] 15 | article = NewsPlease.from_html(html, url=None) 16 | output[item_id] = {'articleBody': article.maintext} 17 | (Path('output') / 'news_please.json').write_text( 18 | json.dumps(output, sort_keys=True, ensure_ascii=False, indent=4), 19 | encoding='utf8') 20 | 21 | 22 | if __name__ == '__main__': 23 | main() 24 | -------------------------------------------------------------------------------- /extractors/run_newspaper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import gzip 3 | import json 4 | from pathlib import Path 5 | 6 | from newspaper import Article 7 | 8 | 9 | def main(): 10 | output = {} 11 | url_by_item_id = {item_id: item['url'] for item_id, item in json.loads( 12 | Path('ground-truth.json').read_text('utf8')).items()} 13 | for path in Path('html').glob('*.html.gz'): 14 | with gzip.open(path, 'rt', encoding='utf8') as f: 15 | html = f.read() 16 | item_id = path.stem.split('.')[0] 17 | article = Article(url_by_item_id[item_id]) 18 | article.set_html(html) 19 | article.parse() 20 | output[item_id] = {'articleBody': article.text} 21 | (Path('output') / 'newspaper.json').write_text( 22 | json.dumps(output, sort_keys=True, ensure_ascii=False, indent=4), 23 | encoding='utf8') 24 | 25 | 26 | if __name__ == '__main__': 27 | main() 28 | -------------------------------------------------------------------------------- /extractors/run_readability.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import gzip 3 | import json 4 | from pathlib import Path 5 | 6 | import html_text 7 | from readability import Document 8 | 9 | 10 | def main(): 11 | output = {} 12 | for path in Path('html').glob('*.html.gz'): 13 | with gzip.open(path, 'rt', encoding='utf8') as f: 14 | html = f.read() 15 | item_id = path.stem.split('.')[0] 16 | doc = Document(html) 17 | text = html_text.extract_text(doc.summary(html_partial=True)) 18 | output[item_id] = {'articleBody': text} 19 | (Path('output') / 'readability.json').write_text( 20 | json.dumps(output, sort_keys=True, ensure_ascii=False, indent=4), 21 | encoding='utf8') 22 | 23 | 24 | if __name__ == '__main__': 25 | main() 26 | -------------------------------------------------------------------------------- /extractors/run_readability_js.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import gzip 3 | import json 4 | import os 5 | import subprocess 6 | from pathlib import Path 7 | from tempfile import mkstemp 8 | 9 | 10 | # executable file from `readability-cli` package 11 | CLI_PATH = Path('/usr/local/bin/readable') 12 | 13 | 14 | def main(): 15 | output = {} 16 | for path in Path('html').glob('*.html.gz'): 17 | with gzip.open(path, 'rt', encoding='utf8') as f: 18 | html = f.read() 19 | item_id = path.stem.split('.')[0] 20 | 21 | # save html to temp file 22 | temp_filepath = mkstemp()[1] 23 | with open(temp_filepath, 'wt') as fw: 24 | fw.write(html) 25 | 26 | # get extracted content from Readability.js (use readability-cli) 27 | result = subprocess.run( 28 | [CLI_PATH, temp_filepath, '--properties=text-content', '--low-confidence=force'], 29 | stdout=subprocess.PIPE 30 | ) 31 | 32 | # destroy temp file 33 | os.remove(temp_filepath) 34 | 35 | output[item_id] = {'articleBody': result.stdout.decode('utf-8')} 36 | (Path('output') / 'readability_js.json').write_text( 37 | json.dumps(output, sort_keys=True, ensure_ascii=False, indent=4), 38 | encoding='utf8') 39 | 40 | 41 | if __name__ == '__main__': 42 | main() 43 | -------------------------------------------------------------------------------- /extractors/run_trafilatura.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import gzip 3 | import json 4 | from pathlib import Path 5 | 6 | import trafilatura 7 | 8 | 9 | def main(): 10 | output = {} 11 | for path in Path('html').glob('*.html.gz'): 12 | with gzip.open(path, 'rt', encoding='utf8') as f: 13 | html = f.read() 14 | item_id = path.stem.split('.')[0] 15 | output[item_id] = {'articleBody': trafilatura.extract(html, include_comments=False)} 16 | (Path('output') / 'trafilatura.json').write_text( 17 | json.dumps(output, sort_keys=True, ensure_ascii=False, indent=4), 18 | encoding='utf8') 19 | 20 | 21 | if __name__ == '__main__': 22 | main() 23 | -------------------------------------------------------------------------------- /extractors/run_xpath_text.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import gzip 3 | import json 4 | from pathlib import Path 5 | 6 | import lxml.html 7 | 8 | 9 | def xpath_text(html: str) -> str: 10 | root = lxml.html.fromstring(html) 11 | bodies = root.xpath('//body') 12 | if bodies: 13 | root = bodies[0] 14 | return ' '.join(root.xpath('.//text()')) 15 | 16 | 17 | def main(): 18 | output = {} 19 | for path in Path('html').glob('*.html.gz'): 20 | with gzip.open(path, 'rt', encoding='utf8') as f: 21 | html = f.read() 22 | item_id = path.stem.split('.')[0] 23 | output[item_id] = {'articleBody': xpath_text(html)} 24 | (Path('output') / 'xpath-text.json').write_text( 25 | json.dumps(output, sort_keys=True, ensure_ascii=False, indent=4), 26 | encoding='utf8') 27 | 28 | 29 | if __name__ == '__main__': 30 | main() 31 | -------------------------------------------------------------------------------- /html/042bb7b5fedab6eac7db576522b89b93904c237d344bcbe14a6a5ab7f7335856.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/042bb7b5fedab6eac7db576522b89b93904c237d344bcbe14a6a5ab7f7335856.html.gz -------------------------------------------------------------------------------- /html/04a6711caa7c687592777718866e781e976e0fe684faebe8b3cedcef8cd0ea34.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/04a6711caa7c687592777718866e781e976e0fe684faebe8b3cedcef8cd0ea34.html.gz -------------------------------------------------------------------------------- /html/05844573ca7e1fba714d715bb11ca08c26e25328999c74a1cb3bc8a0e4399f0f.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/05844573ca7e1fba714d715bb11ca08c26e25328999c74a1cb3bc8a0e4399f0f.html.gz -------------------------------------------------------------------------------- /html/06e5123e4ef7cfb4533250dc45d1e03d0838fc66223f45c583c4d12f48b4da85.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/06e5123e4ef7cfb4533250dc45d1e03d0838fc66223f45c583c4d12f48b4da85.html.gz -------------------------------------------------------------------------------- /html/06ee193de4bd611f7fafbab0c59b0f6fe3495093516720632cd093b24c7a0e98.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/06ee193de4bd611f7fafbab0c59b0f6fe3495093516720632cd093b24c7a0e98.html.gz -------------------------------------------------------------------------------- /html/076f4f33bf75059db581bedf36e76fb65e89a8f7752db3339aa3ea11c5122f32.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/076f4f33bf75059db581bedf36e76fb65e89a8f7752db3339aa3ea11c5122f32.html.gz -------------------------------------------------------------------------------- /html/08f793762792bd252c75fb57544cdf506ffcc04785136cb87503f02364b82b56.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/08f793762792bd252c75fb57544cdf506ffcc04785136cb87503f02364b82b56.html.gz -------------------------------------------------------------------------------- /html/098bb3e96c0acdf36efdcde45fb9cca3f8c82c7cb2071b76097a1b96155f1eb2.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/098bb3e96c0acdf36efdcde45fb9cca3f8c82c7cb2071b76097a1b96155f1eb2.html.gz -------------------------------------------------------------------------------- /html/0d46122928b6f468cc4bbc694051d0dbae5702bc75a16dab82a99b58daf150a0.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/0d46122928b6f468cc4bbc694051d0dbae5702bc75a16dab82a99b58daf150a0.html.gz -------------------------------------------------------------------------------- /html/0dd1357045727799a447563fd8851f4ebe79f042073ea16991a9b67aa595f81a.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/0dd1357045727799a447563fd8851f4ebe79f042073ea16991a9b67aa595f81a.html.gz -------------------------------------------------------------------------------- /html/0e014df693f182824fe5e24030ddbe1d0b96ddb9685cf20d5766457ed32ffa2d.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/0e014df693f182824fe5e24030ddbe1d0b96ddb9685cf20d5766457ed32ffa2d.html.gz -------------------------------------------------------------------------------- /html/0ec95c7261d122f304728e90c983450ef1ce1e0b423546835c397d50aaf0d0f2.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/0ec95c7261d122f304728e90c983450ef1ce1e0b423546835c397d50aaf0d0f2.html.gz -------------------------------------------------------------------------------- /html/11ea381ad92b5448cf66eae62f52ac565361a244c8881615fc6a7bb523cc0c32.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/11ea381ad92b5448cf66eae62f52ac565361a244c8881615fc6a7bb523cc0c32.html.gz -------------------------------------------------------------------------------- /html/14cc2a0ca59c62a8c9f205a171e9ccf4ef4cf69b0c642f51c8c65c051b39024f.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/14cc2a0ca59c62a8c9f205a171e9ccf4ef4cf69b0c642f51c8c65c051b39024f.html.gz -------------------------------------------------------------------------------- /html/156770d676ce79905198e1c8407f81e5ecfb617d9aa44712718707eb7e3b8e38.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/156770d676ce79905198e1c8407f81e5ecfb617d9aa44712718707eb7e3b8e38.html.gz -------------------------------------------------------------------------------- /html/16c30add7e96315e9cc957d85aa876ccb6b70055f0ddab51547a586117cc1f56.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/16c30add7e96315e9cc957d85aa876ccb6b70055f0ddab51547a586117cc1f56.html.gz -------------------------------------------------------------------------------- /html/1ace8c85aaee21b9d4505eca506d50c4721c29db62848b567a9703bfe0583892.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/1ace8c85aaee21b9d4505eca506d50c4721c29db62848b567a9703bfe0583892.html.gz -------------------------------------------------------------------------------- /html/1ee91d1fce65e09be8b8d2d29eab771546d98ca2ba5c862941e660e9fec12432.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/1ee91d1fce65e09be8b8d2d29eab771546d98ca2ba5c862941e660e9fec12432.html.gz -------------------------------------------------------------------------------- /html/1f765c48780665e89cc3af1f7c9af47876e9fae9b5be4a936b0649e10f5e3198.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/1f765c48780665e89cc3af1f7c9af47876e9fae9b5be4a936b0649e10f5e3198.html.gz -------------------------------------------------------------------------------- /html/20b2b64916b00b25203c9f1bf14248922f4d522f18328e9f876cce116df0083e.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/20b2b64916b00b25203c9f1bf14248922f4d522f18328e9f876cce116df0083e.html.gz -------------------------------------------------------------------------------- /html/21486419bb109c5a62a68957f528e6ff29c92f58d8d3c1f2837c86ff3f3e11f9.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/21486419bb109c5a62a68957f528e6ff29c92f58d8d3c1f2837c86ff3f3e11f9.html.gz -------------------------------------------------------------------------------- /html/232a43fb15abde807427b2a7bf4f772e27b8760554370956d8291df4e8166dbf.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/232a43fb15abde807427b2a7bf4f772e27b8760554370956d8291df4e8166dbf.html.gz -------------------------------------------------------------------------------- /html/23aaecd14171f96cfd201a8a46666097e286ad71f74f29347a78c5ecba50da1e.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/23aaecd14171f96cfd201a8a46666097e286ad71f74f29347a78c5ecba50da1e.html.gz -------------------------------------------------------------------------------- /html/264dc3ae31249cb1f50c50986e0952a4708c2e705d18a2d8bf0e525da6e2b485.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/264dc3ae31249cb1f50c50986e0952a4708c2e705d18a2d8bf0e525da6e2b485.html.gz -------------------------------------------------------------------------------- /html/287e4d9f4af31733aad6534aefb2bd00fb344ec8d6ebf1ac99dbc4d762da0ca4.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/287e4d9f4af31733aad6534aefb2bd00fb344ec8d6ebf1ac99dbc4d762da0ca4.html.gz -------------------------------------------------------------------------------- /html/291a8bf33ee49074f33dcff37544ac40506cae450db83b6cb63f02b9920b51c2.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/291a8bf33ee49074f33dcff37544ac40506cae450db83b6cb63f02b9920b51c2.html.gz -------------------------------------------------------------------------------- /html/2c46804d9db4a85e8f8d31128ce0e11d02f25c7120c2faa5ec0664c604a47717.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/2c46804d9db4a85e8f8d31128ce0e11d02f25c7120c2faa5ec0664c604a47717.html.gz -------------------------------------------------------------------------------- /html/2f42ef1d3ea0c96e56355d3db93d0e06b47e760b74f6f4261278b8cd1c246dd6.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/2f42ef1d3ea0c96e56355d3db93d0e06b47e760b74f6f4261278b8cd1c246dd6.html.gz -------------------------------------------------------------------------------- /html/30b771a40a4e96156d398716c877deef54b05d091770d2717c98e4c6b670010c.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/30b771a40a4e96156d398716c877deef54b05d091770d2717c98e4c6b670010c.html.gz -------------------------------------------------------------------------------- /html/3252222e61fe78982cffe0b0bad2b089c27b32f65852d1c5d3951517f3c2e295.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/3252222e61fe78982cffe0b0bad2b089c27b32f65852d1c5d3951517f3c2e295.html.gz -------------------------------------------------------------------------------- /html/33fe2471fd553c6570f93997f208b4f39bf30be5947c3cfa620ee8eff3355ab9.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/33fe2471fd553c6570f93997f208b4f39bf30be5947c3cfa620ee8eff3355ab9.html.gz -------------------------------------------------------------------------------- /html/34a7328535ad4e60b059f81d37eec5d25c2bc8de759ce9a7b5e47ac7dc6fd1b0.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/34a7328535ad4e60b059f81d37eec5d25c2bc8de759ce9a7b5e47ac7dc6fd1b0.html.gz -------------------------------------------------------------------------------- /html/358cc4a080456476b0f883c56bdce796874c286ed6efab25f5718dd95fab42a8.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/358cc4a080456476b0f883c56bdce796874c286ed6efab25f5718dd95fab42a8.html.gz -------------------------------------------------------------------------------- /html/359fee228518d55b921194561e9ca88e428df81940246f8fac7a75398377daea.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/359fee228518d55b921194561e9ca88e428df81940246f8fac7a75398377daea.html.gz -------------------------------------------------------------------------------- /html/35b158918c676ff2c74445517db76c83db70a805cc50b64e1369b354a027fcbd.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/35b158918c676ff2c74445517db76c83db70a805cc50b64e1369b354a027fcbd.html.gz -------------------------------------------------------------------------------- /html/360c732d1fdbfc6895d7096c0c0b8c0d581bb1af80160f4c6a0f1fd9ff85e469.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/360c732d1fdbfc6895d7096c0c0b8c0d581bb1af80160f4c6a0f1fd9ff85e469.html.gz -------------------------------------------------------------------------------- /html/374ac9a59a85196cdacc1679fb8993521a7b7d9d6533720f102300be1c7face4.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/374ac9a59a85196cdacc1679fb8993521a7b7d9d6533720f102300be1c7face4.html.gz -------------------------------------------------------------------------------- /html/39d5c43beb60605c3eec760c99500e62e7bd71ebbe4ae05edf382125e1b0b80a.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/39d5c43beb60605c3eec760c99500e62e7bd71ebbe4ae05edf382125e1b0b80a.html.gz -------------------------------------------------------------------------------- /html/3c5bf8db4272925bf1dd5713fc325e179fd0d1cc6fb8c77aa2d917cfd2518a32.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/3c5bf8db4272925bf1dd5713fc325e179fd0d1cc6fb8c77aa2d917cfd2518a32.html.gz -------------------------------------------------------------------------------- /html/3c6d3381ef52ca26be2fbde19c1b0fe17d85682b726dfecf5e300c1ca34546b1.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/3c6d3381ef52ca26be2fbde19c1b0fe17d85682b726dfecf5e300c1ca34546b1.html.gz -------------------------------------------------------------------------------- /html/3cb22bfabed8de715c0813a7bb5052363c96bd71ccce3bb2dfb3ab9d1d7a9bbc.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/3cb22bfabed8de715c0813a7bb5052363c96bd71ccce3bb2dfb3ab9d1d7a9bbc.html.gz -------------------------------------------------------------------------------- /html/3cb5e2f46626d5bb0345759453036f7eabc0b0c7796b796513606bf693060ced.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/3cb5e2f46626d5bb0345759453036f7eabc0b0c7796b796513606bf693060ced.html.gz -------------------------------------------------------------------------------- /html/3ce1c8fdf6ad2ded9e48a68be71eb069fc453ef1b75f47698428a1fdda0deb24.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/3ce1c8fdf6ad2ded9e48a68be71eb069fc453ef1b75f47698428a1fdda0deb24.html.gz -------------------------------------------------------------------------------- /html/3d8f3404cf975af824d7866b7679bc45189c3eea6adb32f0a125a0904b1abbb2.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/3d8f3404cf975af824d7866b7679bc45189c3eea6adb32f0a125a0904b1abbb2.html.gz -------------------------------------------------------------------------------- /html/3f65af7b6b98b1c9ae9a3e0d8a09a85600cdc44e26e4b3a6db96a31f4b1767e3.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/3f65af7b6b98b1c9ae9a3e0d8a09a85600cdc44e26e4b3a6db96a31f4b1767e3.html.gz -------------------------------------------------------------------------------- /html/4219d096902dad9fd9d57e881e7928ca66bdf5334c2bc7dfddaa264887777a7a.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/4219d096902dad9fd9d57e881e7928ca66bdf5334c2bc7dfddaa264887777a7a.html.gz -------------------------------------------------------------------------------- /html/42aad16bde9288623543642a9ce1a396be83e2db44aa2ff8cbbfe46e14abd7cc.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/42aad16bde9288623543642a9ce1a396be83e2db44aa2ff8cbbfe46e14abd7cc.html.gz -------------------------------------------------------------------------------- /html/432362af0be43f6da757ea778bd7f2f000094a565bdebac5af7442987a5372f3.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/432362af0be43f6da757ea778bd7f2f000094a565bdebac5af7442987a5372f3.html.gz -------------------------------------------------------------------------------- /html/4648a420af9984d45b76a4afedf4f74965f8a2e0bf1c69bd3da2dc189020f3c9.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/4648a420af9984d45b76a4afedf4f74965f8a2e0bf1c69bd3da2dc189020f3c9.html.gz -------------------------------------------------------------------------------- /html/4a44ab3e4c41d56ce9b79eb07acb06aed1bc52aba68a950f06e7de7ef848400a.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/4a44ab3e4c41d56ce9b79eb07acb06aed1bc52aba68a950f06e7de7ef848400a.html.gz -------------------------------------------------------------------------------- /html/51374560f40088e227f0053ff1bb0b8525d10a8d7bfbff1cd6033f42347fd85b.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/51374560f40088e227f0053ff1bb0b8525d10a8d7bfbff1cd6033f42347fd85b.html.gz -------------------------------------------------------------------------------- /html/51d066b0602c9421d8d6410bc4b931700978409a3faa2a984e8fbde519ad7241.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/51d066b0602c9421d8d6410bc4b931700978409a3faa2a984e8fbde519ad7241.html.gz -------------------------------------------------------------------------------- /html/5211188428849a31e309ef2475746563ff788b1591c89818c08d5abedec4ef5e.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/5211188428849a31e309ef2475746563ff788b1591c89818c08d5abedec4ef5e.html.gz -------------------------------------------------------------------------------- /html/55bb6340e3d7dd8632ba45179ae43c39f8ad0cfcecb4719e3b9cf6106ffb70a3.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/55bb6340e3d7dd8632ba45179ae43c39f8ad0cfcecb4719e3b9cf6106ffb70a3.html.gz -------------------------------------------------------------------------------- /html/57b4dafd18cfd0531b69f81e87158648227c673ef159f8d8c87d34e34bdb21f2.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/57b4dafd18cfd0531b69f81e87158648227c673ef159f8d8c87d34e34bdb21f2.html.gz -------------------------------------------------------------------------------- /html/57d46c9d751e3fd3ffaf3ede7ac20cebd30eacb5ea78e1a6aa0a72059244e7ca.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/57d46c9d751e3fd3ffaf3ede7ac20cebd30eacb5ea78e1a6aa0a72059244e7ca.html.gz -------------------------------------------------------------------------------- /html/57e2e98887a1965689955921208e32f410b10e2b95c907e74e57982d3edf3cc6.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/57e2e98887a1965689955921208e32f410b10e2b95c907e74e57982d3edf3cc6.html.gz -------------------------------------------------------------------------------- /html/5a822960e9a2cb1e664d334b6c936c5cb6e41fb5331877538c2c8339cb59d57e.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/5a822960e9a2cb1e664d334b6c936c5cb6e41fb5331877538c2c8339cb59d57e.html.gz -------------------------------------------------------------------------------- /html/5ae11e580afc12d3ba1a12944281e6a7a5dded5c98b4efcf24aedcb28f0d5b22.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/5ae11e580afc12d3ba1a12944281e6a7a5dded5c98b4efcf24aedcb28f0d5b22.html.gz -------------------------------------------------------------------------------- /html/5caf91b8a4423735f866b089d2611ea14503584cf3b6f487c6d26eb7b9521fca.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/5caf91b8a4423735f866b089d2611ea14503584cf3b6f487c6d26eb7b9521fca.html.gz -------------------------------------------------------------------------------- /html/5f03fc173ebc6abdfae50b96ce0b05a6137b7d3f2ef379be35a9bb8ca9f49e87.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/5f03fc173ebc6abdfae50b96ce0b05a6137b7d3f2ef379be35a9bb8ca9f49e87.html.gz -------------------------------------------------------------------------------- /html/5f9c5ed5d64dfe682d9bde13b9b4f032a3ebdbf165c06ec49c0705bcbe106e3b.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/5f9c5ed5d64dfe682d9bde13b9b4f032a3ebdbf165c06ec49c0705bcbe106e3b.html.gz -------------------------------------------------------------------------------- /html/5fa3154ec031ab35411a457d78eb5aa92c0e803c5329bd05c001e6d64009e206.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/5fa3154ec031ab35411a457d78eb5aa92c0e803c5329bd05c001e6d64009e206.html.gz -------------------------------------------------------------------------------- /html/5fa5679de56c43edf70685762c2d1f2de296432ae53aa46e075b552fee17cab8.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/5fa5679de56c43edf70685762c2d1f2de296432ae53aa46e075b552fee17cab8.html.gz -------------------------------------------------------------------------------- /html/5fbc7ccb504c755ae23a85499a17518483d7862b74b4a5c34d86ede1a1a4448e.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/5fbc7ccb504c755ae23a85499a17518483d7862b74b4a5c34d86ede1a1a4448e.html.gz -------------------------------------------------------------------------------- /html/612cd29826624e68ce96789c8049e16279dfd2fceb27434eea7943b2aaf84e90.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/612cd29826624e68ce96789c8049e16279dfd2fceb27434eea7943b2aaf84e90.html.gz -------------------------------------------------------------------------------- /html/624fcd903d56fc7055fa7097b330629450c095ad6937318deb027be7803bbf35.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/624fcd903d56fc7055fa7097b330629450c095ad6937318deb027be7803bbf35.html.gz -------------------------------------------------------------------------------- /html/63db31a161b3c5b64e88c2978635cbc38d342ba82fd2c5335321203dcc55c76f.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/63db31a161b3c5b64e88c2978635cbc38d342ba82fd2c5335321203dcc55c76f.html.gz -------------------------------------------------------------------------------- /html/65408257dbe4b41f71a35ade24e30243265095fc1d4988a35b9a6ca52f2b4eab.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/65408257dbe4b41f71a35ade24e30243265095fc1d4988a35b9a6ca52f2b4eab.html.gz -------------------------------------------------------------------------------- /html/65bf3048b500bbd84928d9122f99617ca898216b91add1d8b2ac09c670484a5c.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/65bf3048b500bbd84928d9122f99617ca898216b91add1d8b2ac09c670484a5c.html.gz -------------------------------------------------------------------------------- /html/65ce3a4577a0306994efa190a0d96e84014f9d4257ad54753e807ede518f02c0.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/65ce3a4577a0306994efa190a0d96e84014f9d4257ad54753e807ede518f02c0.html.gz -------------------------------------------------------------------------------- /html/680c2848e94a96f961a0964631de94ac572f83c45bfd0bec2deafa893bcfe15c.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/680c2848e94a96f961a0964631de94ac572f83c45bfd0bec2deafa893bcfe15c.html.gz -------------------------------------------------------------------------------- /html/686bb170effe273eaff1c0f88e412172e8d972518a6d1454c896f52aafaa9643.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/686bb170effe273eaff1c0f88e412172e8d972518a6d1454c896f52aafaa9643.html.gz -------------------------------------------------------------------------------- /html/6a72de37e8f98f4eee6c0821e593b35ce536cef6c8b424c5e1dd747ebe6621ba.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/6a72de37e8f98f4eee6c0821e593b35ce536cef6c8b424c5e1dd747ebe6621ba.html.gz -------------------------------------------------------------------------------- /html/6ebac05f637ece8aa57c298a2a5e3a8047f546f855d0f29cc683cea60ce85c85.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/6ebac05f637ece8aa57c298a2a5e3a8047f546f855d0f29cc683cea60ce85c85.html.gz -------------------------------------------------------------------------------- /html/702d1da63b8e064cb70617620e45c2d116b4912c9bc9d518dcf5ce54bb8057ed.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/702d1da63b8e064cb70617620e45c2d116b4912c9bc9d518dcf5ce54bb8057ed.html.gz -------------------------------------------------------------------------------- /html/70cb2d5bca75ab5a8f6bb378a38a52f882f6bda508de93b12502e74936d86ff2.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/70cb2d5bca75ab5a8f6bb378a38a52f882f6bda508de93b12502e74936d86ff2.html.gz -------------------------------------------------------------------------------- /html/776a1c046798b474e410f6edf3225d6a27fecd0de6aac22aef7b7f64fe87caaf.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/776a1c046798b474e410f6edf3225d6a27fecd0de6aac22aef7b7f64fe87caaf.html.gz -------------------------------------------------------------------------------- /html/7837c9d66c815b9a21dd669a3dc21677c3f084b1b7dd603d56e87867d8970dd3.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/7837c9d66c815b9a21dd669a3dc21677c3f084b1b7dd603d56e87867d8970dd3.html.gz -------------------------------------------------------------------------------- /html/785affa2c34e6e4844ef080e98e1a1e532eeeb671bdacebfb9e98ad7320ff382.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/785affa2c34e6e4844ef080e98e1a1e532eeeb671bdacebfb9e98ad7320ff382.html.gz -------------------------------------------------------------------------------- /html/7916ecca969ffdd8f6fc32d171fbe0dd63db40fe4c1d2ade02b1dec5929a162f.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/7916ecca969ffdd8f6fc32d171fbe0dd63db40fe4c1d2ade02b1dec5929a162f.html.gz -------------------------------------------------------------------------------- /html/7a457a4f71735c17b8b34fafc88835d225cf879b2d812311857a64cfc891eee9.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/7a457a4f71735c17b8b34fafc88835d225cf879b2d812311857a64cfc891eee9.html.gz -------------------------------------------------------------------------------- /html/7a664e40d256470fdb12d10c3f8d1c6db0581e9b080c71765e55f273a3ac7d03.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/7a664e40d256470fdb12d10c3f8d1c6db0581e9b080c71765e55f273a3ac7d03.html.gz -------------------------------------------------------------------------------- /html/7ab16ade32386ece353b8d31fc3bb7e660189efd5bf3c8549aaba101ad3f5ef5.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/7ab16ade32386ece353b8d31fc3bb7e660189efd5bf3c8549aaba101ad3f5ef5.html.gz -------------------------------------------------------------------------------- /html/7bb1ca90354313840329d2f569ea9fb3a582df2aa0a5e3669f8fc567eb6ea61b.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/7bb1ca90354313840329d2f569ea9fb3a582df2aa0a5e3669f8fc567eb6ea61b.html.gz -------------------------------------------------------------------------------- /html/7de5241947a5f7147fe9787c6f6fa16685bfe66e6c35510a68780f27690dc4f0.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/7de5241947a5f7147fe9787c6f6fa16685bfe66e6c35510a68780f27690dc4f0.html.gz -------------------------------------------------------------------------------- /html/7dfc3e359d7c0ca48ac9046ae5759286cedf80abe7526fc6c6e6546b9ba43e33.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/7dfc3e359d7c0ca48ac9046ae5759286cedf80abe7526fc6c6e6546b9ba43e33.html.gz -------------------------------------------------------------------------------- /html/7f93c1944a41d01960f8a16fdfda6c562e86f04ead8375ab796c4278402df9a8.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/7f93c1944a41d01960f8a16fdfda6c562e86f04ead8375ab796c4278402df9a8.html.gz -------------------------------------------------------------------------------- /html/8267acacb9e4a109b1f7ee7bafe735b73e9c94180b703b131f9e90c9be044f39.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/8267acacb9e4a109b1f7ee7bafe735b73e9c94180b703b131f9e90c9be044f39.html.gz -------------------------------------------------------------------------------- /html/82b6d780c792df78dcfb00484d50c86fbc7f324a9eb5835b7615f028edb9a574.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/82b6d780c792df78dcfb00484d50c86fbc7f324a9eb5835b7615f028edb9a574.html.gz -------------------------------------------------------------------------------- /html/833caf3bdba53dcf48de273cf646370eebe9ac565744b0d0e941e298e1b79730.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/833caf3bdba53dcf48de273cf646370eebe9ac565744b0d0e941e298e1b79730.html.gz -------------------------------------------------------------------------------- /html/8380689f358c1e3a0f6fca6e11ed13e5304a74060139f7a584347db213950446.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/8380689f358c1e3a0f6fca6e11ed13e5304a74060139f7a584347db213950446.html.gz -------------------------------------------------------------------------------- /html/851498a2b9f4f0b578ac9700c245253dbc147a06c0fb3499adebf1c2d5663c29.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/851498a2b9f4f0b578ac9700c245253dbc147a06c0fb3499adebf1c2d5663c29.html.gz -------------------------------------------------------------------------------- /html/85439e26c41c75901820d01a13e8cea7836abb58635ea3986f71a163ab0311d3.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/85439e26c41c75901820d01a13e8cea7836abb58635ea3986f71a163ab0311d3.html.gz -------------------------------------------------------------------------------- /html/8634d1211c3f2b73041e6cadd5d59676619838949999a83a23c51a3195b44892.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/8634d1211c3f2b73041e6cadd5d59676619838949999a83a23c51a3195b44892.html.gz -------------------------------------------------------------------------------- /html/87438a0dacbeb979e72522f42b9020048da13dc5a079477114190c8855701b7f.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/87438a0dacbeb979e72522f42b9020048da13dc5a079477114190c8855701b7f.html.gz -------------------------------------------------------------------------------- /html/87bf60570e6e2e33cb1f0fdb5600d6c85012e60be25ba6fa587b8f90eb9a3770.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/87bf60570e6e2e33cb1f0fdb5600d6c85012e60be25ba6fa587b8f90eb9a3770.html.gz -------------------------------------------------------------------------------- /html/88c328b68b038a625b4b3f8c322215caa30b0e88af0754bd71056ffc15c7b4b7.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/88c328b68b038a625b4b3f8c322215caa30b0e88af0754bd71056ffc15c7b4b7.html.gz -------------------------------------------------------------------------------- /html/8b194530308204139d9c8f7d495a26b117c78756ac1802cfc3c0a8bfdf2c0d50.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/8b194530308204139d9c8f7d495a26b117c78756ac1802cfc3c0a8bfdf2c0d50.html.gz -------------------------------------------------------------------------------- /html/8cad00dc22de45ba42e9540421b5f78333f7ac57b385d69acb27a53b9fd69f0c.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/8cad00dc22de45ba42e9540421b5f78333f7ac57b385d69acb27a53b9fd69f0c.html.gz -------------------------------------------------------------------------------- /html/8e3efab59f48fd29a1e1e7aa135880c4251a9f090f94999668cdbaec59d30b5a.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/8e3efab59f48fd29a1e1e7aa135880c4251a9f090f94999668cdbaec59d30b5a.html.gz -------------------------------------------------------------------------------- /html/921019755f4a96ac4abf9dbcb4ef9d5ac202624a542d5ea70912330aa6fcc71f.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/921019755f4a96ac4abf9dbcb4ef9d5ac202624a542d5ea70912330aa6fcc71f.html.gz -------------------------------------------------------------------------------- /html/94fbcc26772088646cb977cecf1abc4012847a1f6927d09505cbf0c3d417ba07.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/94fbcc26772088646cb977cecf1abc4012847a1f6927d09505cbf0c3d417ba07.html.gz -------------------------------------------------------------------------------- /html/95301fb7883e0ee5214d1111554d30dd97e08c6380d7699369c0b9c15f42e6aa.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/95301fb7883e0ee5214d1111554d30dd97e08c6380d7699369c0b9c15f42e6aa.html.gz -------------------------------------------------------------------------------- /html/961bd85ca85aaf791b278cc4a60058e92d57c4f32a3411cf8e7d802af183c926.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/961bd85ca85aaf791b278cc4a60058e92d57c4f32a3411cf8e7d802af183c926.html.gz -------------------------------------------------------------------------------- /html/9a440270bf8625d586039dfae1b8df409b467524e075124cd7a5424a5806901b.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/9a440270bf8625d586039dfae1b8df409b467524e075124cd7a5424a5806901b.html.gz -------------------------------------------------------------------------------- /html/9cb8224b660f36c932823ab613fb76a07928fcbc41956c4c1f96f4ecab9202aa.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/9cb8224b660f36c932823ab613fb76a07928fcbc41956c4c1f96f4ecab9202aa.html.gz -------------------------------------------------------------------------------- /html/9da36ae4714bfccc72374c6c146e9d1cd3cca39e2110bd67ccdbcc806f4cf139.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/9da36ae4714bfccc72374c6c146e9d1cd3cca39e2110bd67ccdbcc806f4cf139.html.gz -------------------------------------------------------------------------------- /html/9e8c9f082a8d77c58c17bda03b6b4bb6a1d6883fe196c252db4ca83b9991e0d3.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/9e8c9f082a8d77c58c17bda03b6b4bb6a1d6883fe196c252db4ca83b9991e0d3.html.gz -------------------------------------------------------------------------------- /html/9ebb3af65694a953005df5bd3869b2cefc263e1dea0471e3ef361c66a264cdd3.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/9ebb3af65694a953005df5bd3869b2cefc263e1dea0471e3ef361c66a264cdd3.html.gz -------------------------------------------------------------------------------- /html/9eef8162bbb67b0bd73792313b91b87dc9304f43f85f479e67e71c166417451e.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/9eef8162bbb67b0bd73792313b91b87dc9304f43f85f479e67e71c166417451e.html.gz -------------------------------------------------------------------------------- /html/a078b3656adc0295d0e37bd4f599342f4a0894da2451e0ef3038ac045434fef3.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/a078b3656adc0295d0e37bd4f599342f4a0894da2451e0ef3038ac045434fef3.html.gz -------------------------------------------------------------------------------- /html/a1fca19b884e0e946ad3fbe2a7f5031e5e3b23372702a76db302b6143c77cb31.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/a1fca19b884e0e946ad3fbe2a7f5031e5e3b23372702a76db302b6143c77cb31.html.gz -------------------------------------------------------------------------------- /html/a6968f427cdb786531cfb326518e674bd8b48af94df7c5c6165cdf40e944357a.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/a6968f427cdb786531cfb326518e674bd8b48af94df7c5c6165cdf40e944357a.html.gz -------------------------------------------------------------------------------- /html/a860fb5eda1ac75df3bc95ba096ade649fdbb1bb566adb9fee3cb13e59f37604.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/a860fb5eda1ac75df3bc95ba096ade649fdbb1bb566adb9fee3cb13e59f37604.html.gz -------------------------------------------------------------------------------- /html/aadb38e527d5379306de3b910ec62cb2447cc1035686b2b2d152580f8f8a1ea2.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/aadb38e527d5379306de3b910ec62cb2447cc1035686b2b2d152580f8f8a1ea2.html.gz -------------------------------------------------------------------------------- /html/aade2ec8d1e7b0919aef1001c3ef0573f8a239e22d4d751d8e664f04ea77ef0d.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/aade2ec8d1e7b0919aef1001c3ef0573f8a239e22d4d751d8e664f04ea77ef0d.html.gz -------------------------------------------------------------------------------- /html/abd9d6291b6bfae0c3ffad8ab7623b482c6da46face0271dc42af6324d8f0ce5.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/abd9d6291b6bfae0c3ffad8ab7623b482c6da46face0271dc42af6324d8f0ce5.html.gz -------------------------------------------------------------------------------- /html/ac1bfdd4c510f679c58f1b62101630d40fda20a16703235ae0f56b65a465e423.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/ac1bfdd4c510f679c58f1b62101630d40fda20a16703235ae0f56b65a465e423.html.gz -------------------------------------------------------------------------------- /html/ac3c035520461017a7c5b248d8e39ef063cad4c0c7d7b7ecd68aff8f15099485.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/ac3c035520461017a7c5b248d8e39ef063cad4c0c7d7b7ecd68aff8f15099485.html.gz -------------------------------------------------------------------------------- /html/ad826691a8a2f9c4ce50cf0b885af933c4b5119c1f6235cd7df1dfb83f255bcc.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/ad826691a8a2f9c4ce50cf0b885af933c4b5119c1f6235cd7df1dfb83f255bcc.html.gz -------------------------------------------------------------------------------- /html/ad9e9e596f21a6812fae27b5d9d622359826c368e471d7d5ff9ac4676eaac9cd.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/ad9e9e596f21a6812fae27b5d9d622359826c368e471d7d5ff9ac4676eaac9cd.html.gz -------------------------------------------------------------------------------- /html/aec5deeaada8b2fb81b55349da0229d5c77a4dc9605c1aaa31e5ce8b71358bc9.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/aec5deeaada8b2fb81b55349da0229d5c77a4dc9605c1aaa31e5ce8b71358bc9.html.gz -------------------------------------------------------------------------------- /html/b0cf2bbf0192315eec95ede9c59bbf4ae58699275739d590edc24b012e3e8800.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/b0cf2bbf0192315eec95ede9c59bbf4ae58699275739d590edc24b012e3e8800.html.gz -------------------------------------------------------------------------------- /html/b37be3535e1fb61e5a238b7fa1ead1ad98b651cb09f138efadac3d54a122fb21.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/b37be3535e1fb61e5a238b7fa1ead1ad98b651cb09f138efadac3d54a122fb21.html.gz -------------------------------------------------------------------------------- /html/b3c19dd5f0612d098788fa5173e491b3280da6226b492f8fe110f4ab1896cca8.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/b3c19dd5f0612d098788fa5173e491b3280da6226b492f8fe110f4ab1896cca8.html.gz -------------------------------------------------------------------------------- /html/b6906ca016bbfc64c90426e098c75b3e8c84457a77f51f1e7ea6941cb80c2147.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/b6906ca016bbfc64c90426e098c75b3e8c84457a77f51f1e7ea6941cb80c2147.html.gz -------------------------------------------------------------------------------- /html/b6fb53e9fb043c98eb1e6530a1074c40922e29025f5454809f3938a7c174faa3.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/b6fb53e9fb043c98eb1e6530a1074c40922e29025f5454809f3938a7c174faa3.html.gz -------------------------------------------------------------------------------- /html/ba07d1e64775f4090e39116c382111f5a2cfe9528dd179673f4e9bfcea370c15.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/ba07d1e64775f4090e39116c382111f5a2cfe9528dd179673f4e9bfcea370c15.html.gz -------------------------------------------------------------------------------- /html/ba4dfe2d3e817ff7b8b01172ccc307850fc5469bcdd26c3c48ca046cb88ab7cf.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/ba4dfe2d3e817ff7b8b01172ccc307850fc5469bcdd26c3c48ca046cb88ab7cf.html.gz -------------------------------------------------------------------------------- /html/bc13ff87b2630ffbebc33bc37b11178b14f03109055e1d17bf644f804b63d98a.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/bc13ff87b2630ffbebc33bc37b11178b14f03109055e1d17bf644f804b63d98a.html.gz -------------------------------------------------------------------------------- /html/bd673bd7988144f0ab7b9c5e19fed140fb5aaa30d8894cb045b72d3b79a7dc54.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/bd673bd7988144f0ab7b9c5e19fed140fb5aaa30d8894cb045b72d3b79a7dc54.html.gz -------------------------------------------------------------------------------- /html/bdb56ac83513635db1d8b9eb46b2da4c0de8da2f1f28f5bf5163df3eb3d3ec06.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/bdb56ac83513635db1d8b9eb46b2da4c0de8da2f1f28f5bf5163df3eb3d3ec06.html.gz -------------------------------------------------------------------------------- /html/c00962aabe7bdd1fca78f5360ea7fa93cd7674863b05157e00827506a7aa58c4.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/c00962aabe7bdd1fca78f5360ea7fa93cd7674863b05157e00827506a7aa58c4.html.gz -------------------------------------------------------------------------------- /html/c13b9c0e04fb28d445d22e92bff6ab7f7800a429930677c28c4dad89f3269869.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/c13b9c0e04fb28d445d22e92bff6ab7f7800a429930677c28c4dad89f3269869.html.gz -------------------------------------------------------------------------------- /html/c467d507551a836efa9cfe843ba5d7bafe519750e04d0c9ff0decf44f013f829.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/c467d507551a836efa9cfe843ba5d7bafe519750e04d0c9ff0decf44f013f829.html.gz -------------------------------------------------------------------------------- /html/c4a3637c6696f238cf9fe1c7fbb17bbb6731a71d4f5fe399b9b4fc3294a96a6b.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/c4a3637c6696f238cf9fe1c7fbb17bbb6731a71d4f5fe399b9b4fc3294a96a6b.html.gz -------------------------------------------------------------------------------- /html/c50845a7158af12ee75acea301a3ea0dad1e848d6b9dbdb43ba7f2d825b2528b.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/c50845a7158af12ee75acea301a3ea0dad1e848d6b9dbdb43ba7f2d825b2528b.html.gz -------------------------------------------------------------------------------- /html/c582d3b772578e8feaa3cfd8f5ae8100bb6f0bc66048204a9a398395841c1164.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/c582d3b772578e8feaa3cfd8f5ae8100bb6f0bc66048204a9a398395841c1164.html.gz -------------------------------------------------------------------------------- /html/c58aa507c4deebd660f69905f9abb8f96d935f6e7210f597ed4cd32b3f39f7f7.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/c58aa507c4deebd660f69905f9abb8f96d935f6e7210f597ed4cd32b3f39f7f7.html.gz -------------------------------------------------------------------------------- /html/c69e539d689a8335a69042727f1b58edab09d5d99fb607ec625a63151a537dc2.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/c69e539d689a8335a69042727f1b58edab09d5d99fb607ec625a63151a537dc2.html.gz -------------------------------------------------------------------------------- /html/c7e39ac49fa1235f5d50f83bf2444248bd3aa4e6df044377916c812dd109ba23.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/c7e39ac49fa1235f5d50f83bf2444248bd3aa4e6df044377916c812dd109ba23.html.gz -------------------------------------------------------------------------------- /html/c81e134ed49902bcf69b551426b4a346c5a77ae993cac8bda68b5541a664ef4c.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/c81e134ed49902bcf69b551426b4a346c5a77ae993cac8bda68b5541a664ef4c.html.gz -------------------------------------------------------------------------------- /html/c82b3d1d540bbbd6081bdfb78b4c068c583aa766bcaaefe7ad16d24e5413a829.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/c82b3d1d540bbbd6081bdfb78b4c068c583aa766bcaaefe7ad16d24e5413a829.html.gz -------------------------------------------------------------------------------- /html/c90731f051d033e49e4cfcc920895051bbc3b54ef1a11519abcf22a115c3aa79.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/c90731f051d033e49e4cfcc920895051bbc3b54ef1a11519abcf22a115c3aa79.html.gz -------------------------------------------------------------------------------- /html/cc03ddb5ef7d5f1fdb8a87f5e6dfd058a2a70acedf2551655a898dc5c18eb79e.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/cc03ddb5ef7d5f1fdb8a87f5e6dfd058a2a70acedf2551655a898dc5c18eb79e.html.gz -------------------------------------------------------------------------------- /html/cc4aa22b8212aec7d289667c0a965569e6f06b9e9196ff8b02219bf2bc1b90d0.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/cc4aa22b8212aec7d289667c0a965569e6f06b9e9196ff8b02219bf2bc1b90d0.html.gz -------------------------------------------------------------------------------- /html/d0382c0d9573a0a7beb1e649012d04ec7275ac23513ca6ca59e51477b028283c.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/d0382c0d9573a0a7beb1e649012d04ec7275ac23513ca6ca59e51477b028283c.html.gz -------------------------------------------------------------------------------- /html/d1c57d7821e5a5b27fb468c59489601bb2a042b1c05221166e3221d2b5dc217f.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/d1c57d7821e5a5b27fb468c59489601bb2a042b1c05221166e3221d2b5dc217f.html.gz -------------------------------------------------------------------------------- /html/d48aeb9cf2f2ff15769a57513249b4a6a669159f3e50b335e741d4206a824e88.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/d48aeb9cf2f2ff15769a57513249b4a6a669159f3e50b335e741d4206a824e88.html.gz -------------------------------------------------------------------------------- /html/d605bdef2cde7308a9f2fbd1484d4a9c3da0167177245d346da61e455f42208d.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/d605bdef2cde7308a9f2fbd1484d4a9c3da0167177245d346da61e455f42208d.html.gz -------------------------------------------------------------------------------- /html/d90bda7ed14df19574f4ca8b1ccde5752a78f40058af1393e81cc99adb3e8756.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/d90bda7ed14df19574f4ca8b1ccde5752a78f40058af1393e81cc99adb3e8756.html.gz -------------------------------------------------------------------------------- /html/db6b0816c612296c7f1f001c6df874214fcca0da0fc86fb3aea9358c7f681754.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/db6b0816c612296c7f1f001c6df874214fcca0da0fc86fb3aea9358c7f681754.html.gz -------------------------------------------------------------------------------- /html/dc7ccccc1f34eb2928cb238739aaf18c712d59d8d34b41acfb29178aeba65356.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/dc7ccccc1f34eb2928cb238739aaf18c712d59d8d34b41acfb29178aeba65356.html.gz -------------------------------------------------------------------------------- /html/dfd43bc0d46e7aaa78ba10fbcb5b9fdfe78771d36cb4c7497e17fb6f69170ec5.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/dfd43bc0d46e7aaa78ba10fbcb5b9fdfe78771d36cb4c7497e17fb6f69170ec5.html.gz -------------------------------------------------------------------------------- /html/e100c9612ad8495db03b2a9f968952d0eaa4853d9b32ded6a29f8e313a974873.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/e100c9612ad8495db03b2a9f968952d0eaa4853d9b32ded6a29f8e313a974873.html.gz -------------------------------------------------------------------------------- /html/e1c7023ee2148901b086256fdd30a0893d10b0720b510d5ff07a021109347266.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/e1c7023ee2148901b086256fdd30a0893d10b0720b510d5ff07a021109347266.html.gz -------------------------------------------------------------------------------- /html/e1cd54e5577d077df83a12a4753c3c8bf2d88d68cd709cc4c442874777581c4a.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/e1cd54e5577d077df83a12a4753c3c8bf2d88d68cd709cc4c442874777581c4a.html.gz -------------------------------------------------------------------------------- /html/e372e42c0a3df7b86e1c0bacf7bc14d042144a01e88833bc5a643d61b3547090.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/e372e42c0a3df7b86e1c0bacf7bc14d042144a01e88833bc5a643d61b3547090.html.gz -------------------------------------------------------------------------------- /html/e4c6a3b482403a8f60190ba27248cd52b250b86f5d4a8a10edcf7062c64fc3f5.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/e4c6a3b482403a8f60190ba27248cd52b250b86f5d4a8a10edcf7062c64fc3f5.html.gz -------------------------------------------------------------------------------- /html/e593d7fe88f9f5cd6587ac172be2db6055d40b6f071023f97ab1ce373534261e.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/e593d7fe88f9f5cd6587ac172be2db6055d40b6f071023f97ab1ce373534261e.html.gz -------------------------------------------------------------------------------- /html/e7301133baab43596f19076beab32096f6405b868e0a69bcfc3349e595d62475.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/e7301133baab43596f19076beab32096f6405b868e0a69bcfc3349e595d62475.html.gz -------------------------------------------------------------------------------- /html/e7994d5500875202d93e736e8f0c8a0436107d10add94ce3789001b8c5c32358.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/e7994d5500875202d93e736e8f0c8a0436107d10add94ce3789001b8c5c32358.html.gz -------------------------------------------------------------------------------- /html/e7d77f1869803e24667fa0b985cff27fb4139951a5ffa494bc9ba810df48fb30.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/e7d77f1869803e24667fa0b985cff27fb4139951a5ffa494bc9ba810df48fb30.html.gz -------------------------------------------------------------------------------- /html/ea25dd7edff4d27973600f35728f20aed5a3eedcc23257d9c3afc3d3e840c3de.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/ea25dd7edff4d27973600f35728f20aed5a3eedcc23257d9c3afc3d3e840c3de.html.gz -------------------------------------------------------------------------------- /html/eb62ac8425e5573947ecde962d14433d18e5725cc4a8c908fe22f678e96a65a1.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/eb62ac8425e5573947ecde962d14433d18e5725cc4a8c908fe22f678e96a65a1.html.gz -------------------------------------------------------------------------------- /html/ec3878db7e49b1ed354c511b132e3de5f773ff4fc8014163df58c22fffd93d2f.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/ec3878db7e49b1ed354c511b132e3de5f773ff4fc8014163df58c22fffd93d2f.html.gz -------------------------------------------------------------------------------- /html/ec7fc408c5ce66c22692a3f696c682f3de794bacfaca405d9a0dac5957051e5a.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/ec7fc408c5ce66c22692a3f696c682f3de794bacfaca405d9a0dac5957051e5a.html.gz -------------------------------------------------------------------------------- /html/ecb46e3e489d2aac92b2563112e1801077b4219a6db9751f18e228bcaf457802.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/ecb46e3e489d2aac92b2563112e1801077b4219a6db9751f18e228bcaf457802.html.gz -------------------------------------------------------------------------------- /html/eecd2575093b85933997521d6babddd397599419588d7096c5c19dc4ffe2ea72.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/eecd2575093b85933997521d6babddd397599419588d7096c5c19dc4ffe2ea72.html.gz -------------------------------------------------------------------------------- /html/ef2b3f268a67950c16563de9ca3209163c7618868c0216739e1e794e7884cc20.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/ef2b3f268a67950c16563de9ca3209163c7618868c0216739e1e794e7884cc20.html.gz -------------------------------------------------------------------------------- /html/ef4e67b66d63b5facef55c06a94d85f2ae01a0a1a4a3a1bcfe2499c8c8a7dacf.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/ef4e67b66d63b5facef55c06a94d85f2ae01a0a1a4a3a1bcfe2499c8c8a7dacf.html.gz -------------------------------------------------------------------------------- /html/f105de6e63ca91ea482f60193f6252092557f969f2fd128ff68c0d4d6b90dd7d.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/f105de6e63ca91ea482f60193f6252092557f969f2fd128ff68c0d4d6b90dd7d.html.gz -------------------------------------------------------------------------------- /html/f344ca5fb36e130f4344235fa22726f3367e09c211c120f21d9ae92effe902db.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/f344ca5fb36e130f4344235fa22726f3367e09c211c120f21d9ae92effe902db.html.gz -------------------------------------------------------------------------------- /html/f5c90a6d5253c3a21ff3168c64bea4b5ffade7a1ba5bed952a59ebee0d648d98.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/f5c90a6d5253c3a21ff3168c64bea4b5ffade7a1ba5bed952a59ebee0d648d98.html.gz -------------------------------------------------------------------------------- /html/f6ac15a4d98511396da23e4428deb5605422b1c8bbc8284e771f6896bdccf57f.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/f6ac15a4d98511396da23e4428deb5605422b1c8bbc8284e771f6896bdccf57f.html.gz -------------------------------------------------------------------------------- /html/f81c6c05d9cbc93316992fa23ef74ec405194e292611f2e94f6a814868903665.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/f81c6c05d9cbc93316992fa23ef74ec405194e292611f2e94f6a814868903665.html.gz -------------------------------------------------------------------------------- /html/f8ff621a0b9b7646cc0d57d37416feabba2bf78ef5dd0bfc5b080f9f97bbe584.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/f8ff621a0b9b7646cc0d57d37416feabba2bf78ef5dd0bfc5b080f9f97bbe584.html.gz -------------------------------------------------------------------------------- /html/fde930b01859de8311c6a14f8aa8c72be0659b551367803deb6736cf3526cf2e.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/fde930b01859de8311c6a14f8aa8c72be0659b551367803deb6736cf3526cf2e.html.gz -------------------------------------------------------------------------------- /html/ff0f958ade714ebfaf5c0b42b1c0152a62063f4e6f72141406ccefc4a2677f21.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/ff0f958ade714ebfaf5c0b42b1c0152a62063f4e6f72141406ccefc4a2677f21.html.gz -------------------------------------------------------------------------------- /html/ffc109d474fdee1a59fa554df8b09643f4a7d45b23eceabad66f0712c3f7daed.html.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/7c60d682959d3140fae3df45121c24e9e4614009/html/ffc109d474fdee1a59fa554df8b09643f4a7d45b23eceabad66f0712c3f7daed.html.gz -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4==4.8.2 2 | chardet==3.0.4 3 | cssselect==1.1.0 4 | cython==0.29.14 5 | -e git+git@github.com:dragnet-org/dragnet.git@1b65e7b0897ca061b5c90b3eefffbfc156a0cc3b#egg=dragnet 6 | feedfinder2==0.0.4 7 | feedparser==5.2.1 8 | jieba3k==0.35.1 9 | html-text==0.5.1 10 | lxml==4.4.2 11 | newspaper3k==0.2.8 12 | nltk==3.4.5 13 | numpy==1.18.1 14 | readability-lxml==0.7.1 15 | soupsieve==1.9.5 16 | scikit-learn==0.19.1 # same version as used by dragnet 17 | scipy==1.4.1 18 | tinysegmenter==0.3 19 | tldextract==2.2.2 20 | trafilatura==0.5.1 21 | news-please==1.5.17 # depend on half pypi projects... 22 | cchardet==2.1.7 # lost dependency in news-please 23 | goose3==3.1.8 24 | inscriptis==1.1.2 25 | html2text==2020.1.16 26 | justext==2.2.0 27 | beautifulsoup4==4.9.3 28 | -------------------------------------------------------------------------------- /tests.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from evaluate import string_shingle_matching, _ngrams, _tokenize 4 | 5 | 6 | def test_tokenize(): 7 | assert _tokenize('a b,cd:e(foo,bar) ') == \ 8 | ['a', 'b', 'cd', 'e', 'foo', 'bar'] 9 | 10 | 11 | @pytest.mark.parametrize( 12 | ['text', 'n', 'expected'], 13 | [('!', 4, []), 14 | ('a,b c ', 5, [('a', 'b', 'c')]), 15 | ('aa 11 c 22', 3, [('aa', '11', 'c'), ('11', 'c', '22')]), 16 | ('a b c a b c', 3, [('a', 'b', 'c'), ('b', 'c', 'a'), 17 | ('c', 'a', 'b'), ('a', 'b', 'c')]), 18 | ]) 19 | def test_ngrams(text, n, expected): 20 | assert _ngrams(text, n) == expected 21 | 22 | 23 | @pytest.mark.parametrize( 24 | ['true', 'pred', 'tp_fp_fn'], 25 | [('a b c', 'a b c', (1, 0, 0)), 26 | ('a b c d', 'a b c', (0.5, 0, 0.5)), 27 | ('a b c', 'a b c d', (0.5, 0.5, 0)), 28 | ('', '', (0, 0, 0)), 29 | ('a', '', (0, 0, 1)), 30 | ('', 'a', (0, 1, 0)), 31 | ('a b c a b c', 'a b c', (0.25, 0, 0.75)), 32 | ('a b c', 'a b c a b c', (0.25, 0.75, 0)), 33 | ]) 34 | def test_string_shingle_matching(true, pred, tp_fp_fn): 35 | assert string_shingle_matching(true, pred, ngram_n=3) == tp_fp_fn 36 | --------------------------------------------------------------------------------