├── .gitattributes ├── .gitignore ├── LICENSE ├── README.md ├── benchmark ├── __init__.py ├── data │ ├── article │ │ ├── base.json │ │ ├── htmls │ │ │ ├── 00f02be72b94daeea50a3a9bbc3fd8e3d1b9f1d42a5ef808bc180f59d7b9c964.html │ │ │ ├── 0293c0b3ec71d0c7897d31e82ba46d9bf3bb505d1b5850a41052bbd974051b00.html │ │ │ ├── 06586406af5a4d947b537aecdc40979b40694823be02302e015413cab28d50e0.html │ │ │ ├── 090704b0cb2e2ae4e435507f38afe591b641a672afa133d7065f48992a444d9b.html │ │ │ ├── 0987b0ff61a8eca73758b6c328af71dca752bdfa9e7e0139dad6f08c29252960.html │ │ │ ├── 0bf3df388798c5db6ba173b2084d35eb84804edfa218d779a4e8a5569b02cc4a.html │ │ │ ├── 0c3fcbaa26a30637563e2061cabb06474fcf536829c25ae0f4164a42883eb6d3.html │ │ │ ├── 0d18823b30fc212dd2efa8b2afa5ac1b2a9f224bd050adff44b4371289d3db33.html │ │ │ ├── 0eebc3efd179794935e495b0c4da98dde249ca4e5df52da18e5fc1009c09f2f2.html │ │ │ ├── 10cacf9b6be1ac305d35d6069583518b84371a2fe7a38f07c02a9dfbfb55359b.html │ │ │ ├── 1427dd7f392330083bf15a0c088cc8a699e1204ac2fa924b3ec79ea6ee7ca1fc.html │ │ │ ├── 14b709df543c9c22a524bd7e4989c463a5613d7c7b402b19ffc2826156a73b72.html │ │ │ ├── 153c5ed5508e61555a5906671d48f6ee01700778837d80469ffbf49ba2e44ca3.html │ │ │ ├── 1786ad30159462d3108b203b0915f93280327ab6938cf1ed6c6ec8adf5873be5.html │ │ │ ├── 18191aeed0c5136af8791bfbf47afad9bc18cdbfead793ed5d283feeeb2d4acd.html │ │ │ ├── 19851fb2dac0a2fa7f055d66aae86bea916167bdad780d0baccb38be1fe6407e.html │ │ │ ├── 1b929fec5b591f3870c961d4f7e01609c48bf9b67605fab26216c21a0f1de10e.html │ │ │ ├── 1d7210cc086d92569dbd4dedbe0a2434cb0a5579d1ce6b68df20dd0326abf9a9.html │ │ │ ├── 1e20e0271231de85fba26cef6db2bedebf916402368a4602c5adb5b2acf722e1.html │ │ │ ├── 1ed561e5a954dd7db5286140b150b38549760719ec382e57ccc785ccc1108f63.html │ │ │ ├── 1f2f22aa6812fd2c4a8035670cb73b509ce622248a0d4081011a980b0bdeaa0a.html │ │ │ ├── 1f930df31eb35178b5438a4a86c3992c4f0a379662807778e5f8b0084ccc5849.html │ │ │ ├── 201807fb8181aaf90d179b60ba22f8d788d9be79a3e3e27b6e497e88f2dea59d.html │ │ │ ├── 
21475f4bb9752111b7c3964f02c945d5e940953abf63132b29e2f9ce4e80c322.html │ │ │ ├── 21e4badbb560413a20f0e852dd85d5bbb420a4ccc31b11ee292a14b7f286b1b2.html │ │ │ ├── 224fc4b884907786988533a241425aca0405623c60d4296d94879bb8fa353087.html │ │ │ ├── 2338d64a03dcd33cdc0e7351495b53b979b8e612813020d58265f6669aa8a979.html │ │ │ ├── 23942aabb8b28ada0c651d540090e165c09947188f1f0ae466bbef1ac6cec8e7.html │ │ │ ├── 2681eeacec6ab049817f3908bab3ae67a01396cedc3ede29df29ecc94ecd1265.html │ │ │ ├── 2d5228514fd49969ab1b9dbbf0df063702fcbb50120f8b690a9049867d0f5e8f.html │ │ │ ├── 2d9cd036643e54129373cbdc5382b0fded35abbd6ccf8db86354cc7a51b55e5f.html │ │ │ ├── 2dd9425ba67be652e3e2f1dcc20797998897fe06008a80bf069338e3772494c8.html │ │ │ ├── 2faac35ae091220ba2dd2b93156196f927b225adbd9d21390d9705a8e48ce4e6.html │ │ │ ├── 38d28e676470df9665643349c5fa2582a481267b19818f3910319b6b11c4bdb2.html │ │ │ ├── 392414950a06b741929db379e4c471fc9149f80a8d23d7d0734407dfdb882f53.html │ │ │ ├── 39ab22e747cb77c150ef8d240435df0451b102a778502d29d75dbbebf3a5331f.html │ │ │ ├── 39dd62c5ba88ab41c15c92ceab61ed4606fc6e996d2ab0d2e10af0e315be94de.html │ │ │ ├── 3cd271958a1dbf0a1fbe55d7b70d8c1d3b1a55a9cdd71d9e7eb262a4f43dbb16.html │ │ │ ├── 3e1c692ab0ba2ddfc43b7f03d6370bb3d4dab89f1955d0dd3fb5ecff0fbbef2f.html │ │ │ ├── 3e863fa9ac3416188abb30733c0f2cfc54da8e55360d2a3747ba76cc638bb469.html │ │ │ ├── 3ef3b4e65fab1f387fb5ec483a13a88da9b0bc1533c2d9bf894532322b8e3860.html │ │ │ ├── 3f58e21331d4102b375a8551c7d6323dbe147f15945a78f85bfe9e3f60ba1452.html │ │ │ ├── 42076fe0b7e334620dd25c6df7186a69e4c28c084800e3a7a46e99c3b1cf51b4.html │ │ │ ├── 42823abb58e8e160a35fafb52f254f6c6e1d92f1916ecc3254ae492e6082b3d4.html │ │ │ ├── 43ff9f6d91fead23d667da4b79525543b8af915405528e2db5788cb73d6cb074.html │ │ │ ├── 452b8128babdcbc570f7b2a9e1f0c33fdafaf93f6bf07a3ed60b7aa3c2dd79cb.html │ │ │ ├── 4571254ba0d43a5698f25dae0f0276c9a5c5369fa96c0b6899ce3f32fb8f106a.html │ │ │ ├── 45fcfa7d6741f4361646905a7fdbd1928f6c00e8d4ff29c44c9a51e40b6b2607.html │ │ │ ├── 
48b8426ce96bb8c741453a8f970301c01e6c3cc32661d2541513425873b9f117.html │ │ │ ├── 4cd29b7d9856e0451dd951d35e88257c3817c3659ed2108165a29eacab9a1276.html │ │ │ ├── 4e55eac42f97a6d97d14795891d2e24b715bec8c057c592a81c3d7ca08aafa10.html │ │ │ ├── 4f37a475ee044a12c0cf366ef9a00789b95f098a99f367d0a72de8552f3f1d06.html │ │ │ ├── 4fb489fade64edb35cdd847b5dc418879d52173c4d489fe345246e707733e178.html │ │ │ ├── 5319d33132365821e1eb47cd9431ab5fb2b34362a100905eecc4760224e164a8.html │ │ │ ├── 58dcad784309d2ef4a2db872b2f0f35902017346e2211d6d0bba3ea74bb6d6b2.html │ │ │ ├── 596a368b2153310d09ddff7ae579106c61bb85e9ba1d37e493f547e3dddc7cc9.html │ │ │ ├── 5ac62498cec0a03ef7e07ac47969b0093fc00c01a8d44d1af459f53a78ac3085.html │ │ │ ├── 5c427eeeb30fce808c9d4c89ffe85b0c96c79e326c9a2ce139bc4fefd698f9b3.html │ │ │ ├── 5da9b27b0b9b53ae469cda7615c39dd9001c7e8f942eb3efc63e86988712d9fc.html │ │ │ ├── 61d6985209086e37a9264092befd79c5355a6494a2cd6ee81edce56c5c1beedf.html │ │ │ ├── 631e7829b8fb596a2246ad0a56077720dc1674ae55a906b8d19d525cd1707c04.html │ │ │ ├── 6558a82cd7af98f97783a125f84d6aefccd7ab723f58c5f4ee58c2e11523ac29.html │ │ │ ├── 66211fc426fc70162d7560b0ebb6497e0cc76a3218f087612a0f92640af93746.html │ │ │ ├── 665b6857803ab81b73da628c35cf544684ace5d3112eccd181b130dbc1070b98.html │ │ │ ├── 6679b81a5620855a0e9661d2dac7d7c76e346e28e28bd8b1119ef5a1afe25b1e.html │ │ │ ├── 6683bc60a59873f472f6f5601ba16925d7b3b8be032ac67b918a4ebfccdce212.html │ │ │ ├── 6967b7f7863542e446d690f5b76cbbbe0f5797d638191c298db4efdf27ecc0ba.html │ │ │ ├── 6a096b630c676dd12a0d3d8b6c5c6cb3cf18371d5f8b5b23181a7e7f63b09d10.html │ │ │ ├── 6e6649bbcb2e4a759cd7057bf9c87c8bd35b3c0e88bbe39fce77d1da85e2de6a.html │ │ │ ├── 6fa03f75b5b2c0d701cdcfb45d07d423380e461819dbe42296ac3f39dc480154.html │ │ │ ├── 71bfee3a89353841d8b1f6f483b856afafd5347e880732214e8dad523dcdb592.html │ │ │ ├── 777ddd63e01e0fb13f4ab1f4bc6a2ff0c15e68c50ef6707d800e13d17fb5e99e.html │ │ │ ├── 780ac670d45ea4bfe7b445280c774c311847997219ab9520b4f5db9d3e4b717d.html │ │ │ ├── 
7b9d87d30cf17c28b5e14e0e64857b8464c041d6485f92def6adcc6e7b338146.html │ │ │ ├── 7e99760d0920deb099946fd607f580080345cf332fae8ac7eb15fd78f38d701a.html │ │ │ ├── 7f67d45fb12c02e403bf4b561dc92fb80d413d4129a9f47a5aec9d15724e1814.html │ │ │ ├── 7fbc6805c3fb26a9f494fa373e9a6f414b3bdd095548179565fea99564b27013.html │ │ │ ├── 804c6ccd20eecf7542052bb1f97f8455ca7311dbf10f863f740a0c8be0ae9f33.html │ │ │ ├── 82da43b1a1c4425adca2b88af754420dc59a9d35849f67607e0f75e09d1d1cfa.html │ │ │ ├── 8330092ac9ab728b1625bda84ba449ef95daa36dd5a5c9d8f5138de5cf01128d.html │ │ │ ├── 83bd4dfc1e736b9a3885adc1ba5e74148b888887f494dc4e3f375a335e419a94.html │ │ │ ├── 84fc492060f77163e1d1ba54e2fafafb81f6ca777848a034da78305736e3397f.html │ │ │ ├── 89338bf8792c5bef1d4d886b674079ae8fb1be5223104346f570c155520287c0.html │ │ │ ├── 8a5d9673b0c29ddbbf294f1b851c5739e186ad9d2417604d620ed4656e4a942a.html │ │ │ ├── 8b007155e6d31b9ea9c1ce891fdd08f83f20b596656c980a945eb3d247dc5893.html │ │ │ ├── 8dc285368a405bf44e592339f829742529b8d889bb2e8e22f3b945ce72f223a3.html │ │ │ ├── 8eb264e24d5fc6cf2b59820810e24592d0eb26934235730e0dd22a7f0a98b2c9.html │ │ │ ├── 8f203971ba54bca9876108bb3e4ba6f53e915d33c792a93d77e8fdb032e1a376.html │ │ │ ├── 8f8e788c3abb73656291f8a3a43956862f5d16bc3bb737476c920378f4630a61.html │ │ │ ├── 9055e5d5b83ee055ee8e73c03c7c1c39d1fc226f9497f2fee1f8376bb573d688.html │ │ │ ├── 922d49eb9be017161c1a0867e8a5579590753b9ef5d4bc684eda3b8524def664.html │ │ │ ├── 931ec5cf06d4cf66a2b8f8bedbd9b85f4787b5e9178c18b20f070441859f2cbb.html │ │ │ ├── 97a2a5818ca17b036363f57674f442867ba73d941a8b436456c8cc528d161aee.html │ │ │ ├── 9a0e481ae0f33a2e75eeac05672316e8bc87a156db31a995700e458e85c3ba02.html │ │ │ ├── 9af148c8013298ffc245ded6c4321e98f0a876eb9a4c3aa5abc137ef00683896.html │ │ │ ├── 9c642bd57e7c44cd40fdd9025696a5bea933556f643513e59d9965b48ef3b7e0.html │ │ │ ├── 9c74ca6f161b0284b90e2d61944a9cf520a2fc686dc8e5220d3cd6717d63d76b.html │ │ │ ├── 9c962e02099b01b20b1e92b0872e704753fc9905f6ea435449248ae98dd70145.html │ │ │ ├── 
9cb6bab22ea459ab0e4942b2c5d3221b54ff7b5ae5940cb0006de5feed4016e0.html │ │ │ ├── 9f4c1adb9f61d954f33a02ef1919248e69c007dcbe9d903a0e77b739779951da.html │ │ │ ├── a28ede747c822c44a927d649abc3073302cb1f6da099050f151f95dece6d42ae.html │ │ │ ├── a4034e1788ff94224f9b7c0565ecac881964b7b10aab50925f65b6bfcb83dd3f.html │ │ │ ├── a50621c231c9da7cc082a0054efd9b2a8c4fb4adb5444bcf3258461282f08a5f.html │ │ │ ├── a7348b7f0de11a3faa45f57d740272f416bded8edc19e2bbde4d47532b44ca7d.html │ │ │ ├── a9aaae6596a59b6472952d728c3cbc5411546695b934ba16c4fb06e8e95e67ff.html │ │ │ ├── aa1432d872173fe865badf36b3687cdc114c8e8b2a1186c90d6117160201377a.html │ │ │ ├── abb1b4fa25baf541535a63a40a5a437d46944291e59d0c77812faa173277d2fb.html │ │ │ ├── acce09a6bb58986ce483a40aa3fcb700e4ee0a380f2efe1f2caeaccebc9069f7.html │ │ │ ├── ad550d301fee29927706b6339b01a98389935cc42a35b824ea971b3427205f70.html │ │ │ ├── ae46e665b2bac91e8c575eabf6a9dc4f2fd33f470f0185c25dea8ddd826c976f.html │ │ │ ├── b03d2ddeb5568f044e9c4e50c6fb563e04f502a2c8c57cf3e0951304a907c123.html │ │ │ ├── b0fb85abe7a4b9adca8d258d6bbd636680fed4545c4a8c956d9be78018d2e8c4.html │ │ │ ├── b119d73de88d5a79d3a0ca8442bacc40aa70a66f60442af11a5f8a90dd242fae.html │ │ │ ├── b5b26f9fb331612dd636a7eff9fcd2503e81124b237d916bbabdef0ad0713285.html │ │ │ ├── b6c37cfc2024121dd3b52369dad725b47f65c1cd1136258ad7fa140414b1cfb8.html │ │ │ ├── b8b6fa9925cd8cdb3dfcf66e6d7192af6bed496c4473b335db159da117373702.html │ │ │ ├── b931df70657ce14d1f0c763407b325356a972d9befa68455afe9520c8bea0aab.html │ │ │ ├── bb5f5ce0c70bc6237b33747bc969ba58fc3a9ad8c93a77cb99abc4118d02a8a5.html │ │ │ ├── bc20eb5862bf84701625f749c1ff5673ce906a89e89cb602a45a7cf094708ab3.html │ │ │ ├── bf09df62b7499937512ee1ff34c6774cc0b3499ad383fd266a18bccd7024eb6d.html │ │ │ ├── c2020fd56735f8a698535b20c512c2978fcae1836ea12fa95c0587c60488af42.html │ │ │ ├── c3286475bd8bbbd7496306a93891ef4b8d5b968a031429764deeb633d6af5a02.html │ │ │ ├── c4053203f04f40ae3bbfb3bce50a390b3b28812e927067cc82bbdd4a2a319609.html │ │ │ ├── 
c4b9e096babb9f4b020032015760a261c85ac5cd8620c18abdbf745964edabf9.html │ │ │ ├── c6154d2bd427327540ef334370f8dd1cd2847c0df2254f5685882bb4a5bf3450.html │ │ │ ├── c76d772245a8446653badd54b5349f0c55dbbc42af357aa29b7ea2d4aa607bc5.html │ │ │ ├── c8cb0eb63bf9692ff790617d35958fd3b117773244736f6a13b4ae71060355d2.html │ │ │ ├── c9d7202a66cd79fcb61bccdc7911d10618ff02bd32296b4c939e44e1a42055fa.html │ │ │ ├── cadf847f87179b48a97e55f6a092167eb8e5fd2df9d4c58d746de09a340361b2.html │ │ │ ├── cb221f0267eb28eefacb7e7d3ccf4a02b24ca8287a6ce7b34dad4aa57c6c6b81.html │ │ │ ├── cb4a1e416d42539a35410102f06ad79e5dab614edd4c53251fb123081c69c2c8.html │ │ │ ├── cc1897c3d68906d2f86bee15a3405d309c486b777d40a391f45c8578e8e5bb3f.html │ │ │ ├── cf92f4271dfc45e262ac643fd9cb56d0a350a87c5d0c23c803023b5ff67b69bc.html │ │ │ ├── d231b17088855d546e63d626119f38fcb324a015767a0a3f002e068fd584c393.html │ │ │ ├── d429bb779a67f2ef64a8ece46d1ca8eba192a26e173ba25534d386a56b84088e.html │ │ │ ├── d5bf38e73afa7a9c95c924a31c3d27ef466ec10fdca8b3e8956e626f465a1ee9.html │ │ │ ├── d98d4324d257abcd18ed4cc55d04b9030b71f3941d1606d7693309910ef97c54.html │ │ │ ├── d997d758e164d7642a106dce06f0b68df60a1b48ac1bdb7cb9cbeec61607d798.html │ │ │ ├── ded29d8508239773f29f5bd4e5653df916d593d2a188fe0e876224b29bacb028.html │ │ │ ├── df2e456100350d66c4981ef890ed8601235e09c6073e2e77a074c54685744b28.html │ │ │ ├── e25d0dd87c529dc7434ef3ecf25af39d56b09547d1a7d72ff31ed70a952d8874.html │ │ │ ├── e3adaf980a5bb4811780fa868f2d4f4d942c0162957afa871fc3fc51b14c7917.html │ │ │ ├── e5c5921e7c711e759e89e988fe587e99c6ab31b516d8b32922468b8551ec2873.html │ │ │ ├── e751800204690e100c8e8802128e886fbd6bf7360903dedc419a69246be2cb6a.html │ │ │ ├── e870e9deb7e33c9675736e8876172d3b0ca3cc001365cb580c4be79d8b09e8f7.html │ │ │ ├── e8dff24871ec675824595dd6c09f523abdff2134977ab766e47ef6c74bd6aa13.html │ │ │ ├── e95f478204362affdd3b41ed598d95ff5a08596cf79048371e4a053c387784b2.html │ │ │ ├── ea5bdb58d64b345dd80afd12651010fd442d4c8bdf45b540e61c4ff7c37fb30f.html │ │ │ ├── 
eba2fde503ef846ebcfec14ea819dbda05c779dd8f957db23b4f6951cfc7fe39.html │ │ │ ├── ee33f16b5c119fc1986c79ffbf290ec9d11b3d41014fe933005a3e2a33b84674.html │ │ │ ├── eee73319935536d6bfdd70c3ec8cd1ff406aec151b06be086462f35a899138a0.html │ │ │ ├── f01449a294a0ef14d81ddaa7f7fc9a445054d0c2be1d18ff2c1ad19e05f6cacf.html │ │ │ ├── f1e9572d5e8ee235fb28cb652f5fa64dcc4b298e36d884e37149ec25edda5fc4.html │ │ │ ├── f6d0929419b4ce71ce70630a8fac43dcee3735f62f87d22a62f74c45b8103d76.html │ │ │ ├── f88779ee9ff9c5d1eaf8c0d1023a0722418a7023bc4ed2f90d3cf8bbab0d1fc9.html │ │ │ ├── fb45fc029ee1244a433fb44a580c5a4ccbd88a87238d01047979c106f1ac98be.html │ │ │ ├── fd0a94a0e3bc8c58f5f29464d9704583da51914abe528a14411de80ffffae785.html │ │ │ └── ff25a6d974197f41e97db5014e0f2414bde5af2959a49a8906b46e42c83f828a.html │ │ └── scores │ │ │ ├── 2024-11-21_16-24-13.json │ │ │ └── ori.json │ └── forum │ │ ├── base.json │ │ ├── htmls │ │ ├── 009ca1a442c9e2568c7fb2842385a1a14cc5ab32ca5fcaef948a46e7647e20bb.html │ │ ├── 044e68e44f3c616bdd2509955ebdabc82f7fe370c0ce5af373d47edfcffbe2e9.html │ │ ├── 064630183c160b4e071152e52acc93690ed04f8e3747ace8c948aeca4d9f4259.html │ │ ├── 07e4af23103e8528ec3de33bfdea48b88c5c6d5f05d4fc6abd9d97f39918d695.html │ │ ├── 08412f7a050466793eb2952d83f8c051dd01dd3cd863b93175b0a3ff93eb5089.html │ │ ├── 0995d7a4e47d2c61a5ef0a631b38fb3f7f8d640e91f27862e448b4c3432832c4.html │ │ ├── 0f5b479e1d93a104e24a309acacb34582838cefd59e17dc629a87bab2c2df280.html │ │ ├── 12de85ce952723f18b2f14e587594cd5130e9eb24fb68e21177380fe8b4efb62.html │ │ ├── 13e5e1dc5565ced7bc89bcfecafdf56dfad212144e0f411b8119833025875f99.html │ │ ├── 177560436e1c0ec0ad88fd0b029bcde03c23e536532ddd24d7e3543d8deaca2a.html │ │ ├── 19618876358be36995003aae24d8332aca122ec44ab3eccf17e2e40a25768272.html │ │ ├── 1f95e1499390a2bc1bc80263ddba93402009d5d50fa72c79fb8bf2ebf9a9e07e.html │ │ ├── 20373e03defc0ad0abaa47a78a3f3b13def41e9e3991f693b21562bf281063e8.html │ │ ├── 20bf4bbd323160ebf940f01132a4fa12d573b9de8f3f45c4c55b45e72cb46466.html │ │ 
├── 21e9f215eb21da5670b2f0dea05f3a37dab4fbf22d266fa0f8bc6d62b1be9257.html │ │ ├── 22b2f108fb7d918779d530dc9371b1f4746b9ad714b9484a66b3587a85041b63.html │ │ ├── 2c983edd919e457e220e2d746b65b32f9ce035fc074957e90473b4b0f15c948e.html │ │ ├── 2d327bf740c37765cc655d3aa1d77f8dd10ad21c264f847d1b515667afac35b3.html │ │ ├── 39c13f85b5bd4e98188dd723bc4d58ec8c47d41b01c44d2dce336ceb3fb2dee2.html │ │ ├── 3a43c342657cab390d70029ef10b0ceecd3ea9272b8acfd1d0c2d8cb398a9f01.html │ │ ├── 3bea25afda1671509d82cee7e6529ae362a60b7c1f5817d4627a455cc9b23ab3.html │ │ ├── 3c7a89566edecb803f20309cfb53e939ec5d9baf6325b8bdd73eba2e6aa8cd5e.html │ │ ├── 4010bd17c4aa2acc80ab187026d7ed9f3e80f89f3d1af5bcde85fbaec4de3ae7.html │ │ ├── 43d2dcb628b6b99c23deb7478628c00645100ecd5025ddd5e91a9d89f04af35d.html │ │ ├── 44996889451f80a931c07eeb3501b64901c806abc4922f25940fe2c78a4408c6.html │ │ ├── 468076813c81be09f5fb7ab7b5f4654f248a5e5e13f2da7d29939e3cb3b7aa75.html │ │ ├── 4d0c2c013be04bb5d96ec57cfc23c4e74779bbd2ebd9fae93a26b010114ed9fd.html │ │ ├── 4fba6a0488f756b9f12f832fe61bc535963226455a931e9dc5f5761892efc9c6.html │ │ ├── 539e02538322863397072cdcde81556a05cf5aae1864d41e4dca9fc119df6241.html │ │ ├── 566030b8de34488d9e3be67dff3736021ba5733f50ef5a6a788680889a7c2e0c.html │ │ ├── 58167e71f3004e3b0ad13ea820a1b2c5e5dea66c555461c9b13ab30adba31870.html │ │ ├── 5b0ef856b2df5dff20276b5c23b10e7d2553249c020c86bfc8bb5b3b5ccbbb84.html │ │ ├── 5d5c1f0e282cdb389f98b0fd7b9f13de8f2f367de259dbd2f27b1a6b86dfdc09.html │ │ ├── 60136ae2420aa9f7bdf390c5a90d982cea8756c2d2820b6ebdc963b3bb7e566e.html │ │ ├── 60dacbf71199f6594a070af5f5b13cd8e82f2b993e1934749446ce6132a914f4.html │ │ ├── 62846b492e1b2af2ad655c497ff23534759185398463ce18c4fd60bfd70104a0.html │ │ ├── 65899f42a8d21be98aefe5be44608b948de30c7f8bd9a47291e22af99a303321.html │ │ ├── 660f1afdd0b5d2c4ea33e0e22710e13e5bce0a3c936cd3cfb5dc409a1207031d.html │ │ ├── 6897d6ef37e9d5306649c363dc4b5bc774c7011603ac1e1082681aec2c2bd6b5.html │ │ ├── 
6d0accd9dc90227d72b3d732ae3c47a108de0250917e20b3e3a8f5943928e97e.html │ │ ├── 6db2e2d672d8b4fe7cb13ce5a5520444734b2ea0641a4a82546d2af4c0466cd0.html │ │ ├── 717e9d39a4c7c938c24eb16eec0def5a55574d15e759824f1f2cda16f2a0c76b.html │ │ ├── 74b533483d3592a9e8242465bc60efa73d93da9f323ce5081c07580b7928f718.html │ │ ├── 756fed7005a6618308777794e395f670183707540b7d72c0ab9fa900c2b21c8d.html │ │ ├── 77d118aedba20777037422e6748c758d6f7013a2e1f108fd336204a551f68991.html │ │ ├── 79ca5580a26a5093246756487dd277ea1e1c3d67606c7957dcf829e193c7237d.html │ │ ├── 7abbb70c1b321e1afbbf8b117b1095dc64a4fbda187f6bd0d6ded9da8612b617.html │ │ ├── 7b01ccf33237e2a5bf89ffaad72f1792f58c5efca1df84077f8d4a23c013f199.html │ │ ├── 7ecdac9a0d50cb49c80d5f38e2ac2732b4515821fa672228bfb5feec5e11374c.html │ │ ├── 81abaa708d7b56f8aa077fc5111669f90289e376a182ffa3f63b16881b7c3dc6.html │ │ ├── 87344ed138bfc3b574e178eed253f5a39b9720afed6a02fbbca884eb00611c70.html │ │ ├── 8cf0af42df35f70b52aaa5a62143146bf0743093a3cef6e94c25b218c89d6640.html │ │ ├── 8eea20685c1505c4e3662ea8e8bf713d2dd8efe5496dae7c862af45dc051fd20.html │ │ ├── 9864ca4dc1b299638a96ccc3fd6e175fc861bd96f9b71d588689e9475f7aec94.html │ │ ├── 9bc65d367d3b3bdad827f0722209d66c4dc0e8f1c67b42194ad709d01bbf83e8.html │ │ ├── a055a6fb9c7c43ccf85157c3d2632cba2ebe9fb367018ccf58701524275efd46.html │ │ ├── a11ddefa8af004ec2f50799231071470d5535e24f09c560b3dc90c7367dde6b3.html │ │ ├── a2323d652d0eaf1bb2d361d9165fc4b56131dc8e0c9bb1a680386defaf4fa2a4.html │ │ ├── a4c329f2524d1262dff5ed35a267a1c6ac25d7fd2b775ce2a7a59d762251a089.html │ │ ├── a6b698cf8edc93e22f1eab9e32596d0b9859d7d288bde50477199527ae90a5cd.html │ │ ├── a7f7b83e8874683971c02a7d88e07acf346dc4c939a0ee310ab94879f45fd55c.html │ │ ├── ad412ab23b75646d54e363b4d813c35b33b353be18fa8c80fcb6adba87ab1b32.html │ │ ├── b1ed184853c385778644ac71605bc978d8365d8c59bdd22f1a0f2b137ccf379f.html │ │ ├── b5bb37137a346f6d341184bf16defecdaa7c0d0011da3f66ae729bca830e98a5.html │ │ ├── 
b679ffe3d623720a935250ff9aa6f5cfc230685e60182f18285d0984289f4eb1.html │ │ ├── b7baf39d30ba3d4da269b3a0d2704c79c1f8eb1e60d34dd0546c296f42b5d2da.html │ │ ├── b9b9c688999f6f670cdae43eaa749602bf7d12fb7695ad52ad6e6b1111c16aba.html │ │ ├── baece742e02c5670371f0399f57ec8b17879d0b99e09d2c52b2a264227878058.html │ │ ├── bba250a462daa9c49237a544c65be2cf0a22ff7eb04c14f578cba9f8b8c177b1.html │ │ ├── bc87ae6202bd353c4738b45a6645a304ccc465ccc7b457afd6806afa8373fc26.html │ │ ├── bcf122d31e28283b7c1ddccf424679e67ca547d59d8a2c7dadaca7475452b3e0.html │ │ ├── be673eda133de3729a144d296b9adfb8b2e1178da8ff8e5fc3d5f72d8a67b807.html │ │ ├── bf03d4e9a96bd6b8cee833f2b9816608faac4c00137d6f4bc40934f22a7f9cdb.html │ │ ├── c06b9eed5a1fb6f692c195cc1c7f2302c452a0cf2edb7115514c9c2dffb0327d.html │ │ ├── c0caa08e1dce04b2eea78815db66fca4aa0cbbf3e78051e401d9bd218df41dc0.html │ │ ├── c2229829ebfbbc94606db24e88a81a3049cb30a54861898637ceb8281b59fe80.html │ │ ├── c229997de4d0385331ade49903aa2a6096d82be1ca0507aa22133aa1de006690.html │ │ ├── c48a8fd6d23b4583a9cfb2ba25f25e7e2a2c0c660ee9dd49546c41a7a1156c86.html │ │ ├── c5feac45ec7e84a1798d38f46ede8f91fa0ef794b87843510311f2bc080ebb76.html │ │ ├── cb3f140868ff7df8db5050d890bd601d6af3a67762b4d9ed2f7c10aeef3cd6ab.html │ │ ├── cbfa92fd5fb8ea8ca75cf15b02f5511b5bd04791897dea0210c2ff1f7fb5e810.html │ │ ├── cc252d928ca428fcfbdf144d57de749390f1c3b535a1e913e137f62aab059559.html │ │ ├── cd3b71a55c7f1211d9da95025e60c51029e7e345c5dd4441fe1ad00fde3232cc.html │ │ ├── cd66147a62b65a9b9bf37fe2006d42a60d539d5a1b67c6047c495583826353ea.html │ │ ├── d4654175c7dfe1fd7a599c880cad070fc3f861945fea77cece948599ba2bd548.html │ │ ├── d676aafbd914fbd3c728501c9f467c600d066927e32b31cc076ed740626fe22d.html │ │ ├── d6d39f43bca5709252e776c2efbc4fc741505cf2607c7c198611eb469e156c31.html │ │ ├── d6f33ea298eb113185aa213608d14de7cfc5daf65cd6638856724460e2b82ed4.html │ │ ├── dd58d570743b864f0b91a2db873aa7d4100959948993e364033889951b15e22c.html │ │ ├── 
df28ea9a48811ae7caca36f1c9c952179e0ba8cd3240280648bb89c273ee6262.html │ │ ├── e25631d558f6a211f7fb8a186f0468c016b034360e441e70fe95e0e7b12b3f2c.html │ │ ├── e4662f72940f9f9bb0b43ae057f0f533c2da3e1e93b0ee789408b983fcaba260.html │ │ ├── eaddae4e0149832f898381e65268fd32a07196c3f1d450022f273721c854fdf0.html │ │ ├── ebc78eae1107eb46ebe2b7e1c5fb0c984abdafac3bad5c6ad0204a4e9c91a1bb.html │ │ ├── ed1ddec0eeff48776e851f62b0c9e5ffe917695010c7949c545ca284f672331f.html │ │ ├── ee410831250e8edad3706b17590f23de4ff9f2d3153459cf2ec15c36574e5fc2.html │ │ ├── f1f0c9cca11ad20446dc3208242aac13865591303c1284955d891e6a29325184.html │ │ ├── f297ca438db75f6d058306992436ad6338135ce001f10e181b5decc3a3b20602.html │ │ ├── f488fea4ba42184e6346de139372670264d3661412518fe30141bb61b308b65a.html │ │ ├── f7682fdba3ae6ccb4caafaf5d4fa608b119324861f90aa375f164c0d6acff76d.html │ │ ├── f8a495be9c3382fbbcc20e071940923af6b9c32502fbb1c6b0f9b2bbd16fbd38.html │ │ ├── fbe48a1911e4d3a026a6124acfb9537debd6149dc8ec6fe8663133a67df43f48.html │ │ └── fd153ac6f97f6ec0e9e004972159c745b3b0f817edd85830f1e1e10a2997ac21.html │ │ └── scores │ │ ├── 2024-11-21_16-22-06.json │ │ └── ori.json ├── eval-requirements.txt ├── evaluate_articles.py └── evaluate_forums.py ├── magic_html ├── __init__.py ├── config.py ├── extractors │ ├── __init__.py │ ├── article_extractor.py │ ├── base_extractor.py │ ├── custom_extractor.py │ ├── forum_extractor.py │ ├── title_extractor.py │ └── weixin_extractor.py ├── mmltex │ ├── README │ ├── cmarkup.xsl │ ├── entities.xsl │ ├── glayout.xsl │ ├── mmltex.xsl │ ├── scripts.xsl │ ├── tables.xsl │ └── tokens.xsl ├── readability_plus.py └── utils.py ├── requirements.txt ├── setup.py └── tests └── __init__.py /.gitattributes: -------------------------------------------------------------------------------- 1 | tests/** linguist-detectable=false 2 | benchmark/** linguist-detectable=false 3 | *.xsl linguist-documentation=true 4 | 5 | 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by https://www.toptal.com/developers/gitignore/api/python 2 | # Edit at https://www.toptal.com/developers/gitignore?templates=python 3 | 4 | ### Python ### 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | cover/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | .pybuilder/ 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | # For a library or package, you might want to ignore these files since the code is 91 | # intended to run in multiple environments; otherwise, check them in: 92 | # .python-version 93 | 94 | # pipenv 95 | # According to 
pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 98 | # install all needed dependencies. 99 | #Pipfile.lock 100 | 101 | # poetry 102 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 103 | # This is especially recommended for binary packages to ensure reproducibility, and is more 104 | # commonly ignored for libraries. 105 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 106 | #poetry.lock 107 | 108 | # pdm 109 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 110 | #pdm.lock 111 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 112 | # in version control. 113 | # https://pdm.fming.dev/#use-with-ide 114 | .pdm.toml 115 | 116 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 117 | __pypackages__/ 118 | 119 | # Celery stuff 120 | celerybeat-schedule 121 | celerybeat.pid 122 | 123 | # SageMath parsed files 124 | *.sage.py 125 | 126 | # Environments 127 | .env 128 | .venv 129 | env/ 130 | venv/ 131 | ENV/ 132 | env.bak/ 133 | venv.bak/ 134 | 135 | # Spyder project settings 136 | .spyderproject 137 | .spyproject 138 | 139 | # Rope project settings 140 | .ropeproject 141 | 142 | # mkdocs documentation 143 | /site 144 | 145 | # mypy 146 | .mypy_cache/ 147 | .dmypy.json 148 | dmypy.json 149 | 150 | # Pyre type checker 151 | .pyre/ 152 | 153 | # pytype static type analyzer 154 | .pytype/ 155 | 156 | # Cython debug symbols 157 | cython_debug/ 158 | 159 | # PyCharm 160 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 161 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 162 | # and can be added to the global gitignore or merged into this file. For a more nuclear 163 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 164 | .idea/ 165 | 166 | ### Python Patch ### 167 | # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration 168 | poetry.toml 169 | 170 | # ruff 171 | .ruff_cache/ 172 | 173 | # LSP config files 174 | pyrightconfig.json 175 | 176 | # End of https://www.toptal.com/developers/gitignore/api/python 177 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 
11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # magic-html - 通用HTML数据提取器 2 | 3 | 欢迎使用magic-html,这是一个旨在简化从HTML中提取主体区域内容的Python库。 4 | 5 | 6 | 7 | ## 项目描述 8 | 9 | magic-html提供了一套工具,能够轻松地从HTML中提取主体区域内容。无论您处理的是复杂的HTML结构还是简单的网页,这个库都旨在为您的HTML抽取需求提供一个便捷高效的接口。 10 | 11 | 12 | 13 | ## 特点 14 | 15 | - 返回主体区域html结构,可自定义输出纯文本/markdown 16 | - 支持多模态抽取 17 | - 支持多种版面extractor,文章/论坛 18 | - 支持latex公式提取转换 19 | 20 | 21 | 22 | ## 安装 23 | 24 | ```shell 25 | pip install https://github.com/opendatalab/magic-html/releases/download/magic_html-0.1.2-released/magic_html-0.1.2-py3-none-any.whl 26 | ``` 27 | 28 | 29 | 30 | ## 使用 31 | 32 | ```python 33 | from magic_html import GeneralExtractor 34 | 35 | # 初始化提取器 36 | extractor = GeneralExtractor() 37 | 38 | url = "http://example.com/" 39 | html = """ 40 | 41 | 42 | 43 | 44 | Example Domain 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 |
53 |

Example Domain

54 |

This domain is for use in illustrative examples in documents. You may use this 55 | domain in literature without prior coordination or asking for permission.

56 |

More information...

57 |
58 | 59 | 60 | """ 61 | 62 | # 文章类型HTML提取数据 63 | data = extractor.extract(html, base_url=url) 64 | 65 | # 论坛类型HTML提取数据 66 | # data = extractor.extract(html, base_url=url, html_type="forum") 67 | 68 | # 微信文章HTML提取数据 69 | # data = extractor.extract(html, base_url=url, html_type="weixin") 70 | 71 | print(data) 72 | ``` 73 | 74 | 75 | 76 | ## benchmark report 77 | 78 | 根据html页面类型,文章/论坛,对比不同开源通用抽取框架抽取准确性 79 | 80 | 文章类型:选取头部新闻、博客站点共标注158个html页面 81 | 82 | ```Python 83 | ╒══════════════════════╤═════════════╤════════════╤═══════════╕ 84 | │ func │ prec_mean │ rec_mean │ f1_mean │ 85 | ╞══════════════════════╪═════════════╪════════════╪═══════════╡ 86 | │ magic_html │ 0.908865 │ 0.95032 │ 0.92913 │ 87 | ├──────────────────────┼─────────────┼────────────┼───────────┤ 88 | │ trafilatura │ 0.833434 │ 0.912384 │ 0.871124 │ 89 | ├──────────────────────┼─────────────┼────────────┼───────────┤ 90 | │ trafilatura_fallback │ 0.831229 │ 0.933713 │ 0.879496 │ 91 | ├──────────────────────┼─────────────┼────────────┼───────────┤ 92 | │ readability-lxml │ 0.86587 │ 0.861391 │ 0.863625 │ 93 | ├──────────────────────┼─────────────┼────────────┼───────────┤ 94 | │ newspaper3k │ 0.409585 │ 0.372083 │ 0.389935 │ 95 | ├──────────────────────┼─────────────┼────────────┼───────────┤ 96 | │ goose3 │ 0.525717 │ 0.457669 │ 0.489339 │ 97 | ├──────────────────────┼─────────────┼────────────┼───────────┤ 98 | │ justext │ 0.224945 │ 0.117092 │ 0.154014 │ 99 | ├──────────────────────┼─────────────┼────────────┼───────────┤ 100 | │ gne │ 0.828849 │ 0.629112 │ 0.715299 │ 101 | ╘══════════════════════╧═════════════╧════════════╧═══════════╛ 102 | ``` 103 | 104 | 105 | 106 | 论坛类型:选取头部论坛、问答站点与开源建站框架搭建站点共103个html页面 107 | 108 | ```Python 109 | ╒══════════════════════╤═════════════╤════════════╤═══════════╕ 110 | │ func │ prec_mean │ rec_mean │ f1_mean │ 111 | ╞══════════════════════╪═════════════╪════════════╪═══════════╡ 112 | │ magic_html │ 0.796252 │ 0.826819 │ 0.811248 │ 113 | 
├──────────────────────┼─────────────┼────────────┼───────────┤ 114 | │ trafilatura │ 0.716009 │ 0.695947 │ 0.705835 │ 115 | ├──────────────────────┼─────────────┼────────────┼───────────┤ 116 | │ trafilatura_fallback │ 0.730304 │ 0.691328 │ 0.710282 │ 117 | ├──────────────────────┼─────────────┼────────────┼───────────┤ 118 | │ readability-lxml │ 0.788018 │ 0.445087 │ 0.568867 │ 119 | ├──────────────────────┼─────────────┼────────────┼───────────┤ 120 | │ newspaper3k │ 0.596976 │ 0.298322 │ 0.397837 │ 121 | ├──────────────────────┼─────────────┼────────────┼───────────┤ 122 | │ goose3 │ 0.675835 │ 0.312969 │ 0.427821 │ 123 | ├──────────────────────┼─────────────┼────────────┼───────────┤ 124 | │ justext │ 0.175889 │ 0.0517628 │ 0.0799863 │ 125 | ├──────────────────────┼─────────────┼────────────┼───────────┤ 126 | │ gne │ 0.81003 │ 0.389709 │ 0.526241 │ 127 | ╘══════════════════════╧═════════════╧════════════╧═══════════╛ 128 | ``` 129 | 130 | 131 | 132 | ## 许可 133 | 134 | 本项目代码采用[Apache 2.0 license](https://www.apache.org/licenses/LICENSE-2.0.html)授权。 135 | 136 | 137 | 138 | ## 鸣谢 139 | 140 | - [trafilatura](https://github.com/adbar/trafilatura) 141 | - [readability-lxml](https://github.com/buriy/python-readability) 142 | 143 | -------------------------------------------------------------------------------- /benchmark/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- -------------------------------------------------------------------------------- /benchmark/data/article/htmls/42823abb58e8e160a35fafb52f254f6c6e1d92f1916ecc3254ae492e6082b3d4.html: -------------------------------------------------------------------------------- 1 |
2 |
3 | Home>> 4 |
5 |

Monitoring system aids wild animal protection efforts in NE China

6 |
(People's Daily Online) 11:35, November 23, 2022
7 | 8 |

9 | Since it was established in October 2021, the Northeast China Tiger and Leopard National Park, which has a forest coverage rate of 97.74 percent, has seen steady growth in its distribution area and number of Siberian tigers and Amur leopards, as well as other wild animals living inside it.

10 |

11 |

12 |

13 | Photo shows a Siberian tiger in the Northeast China Tiger and Leopard National Park. (People’s Daily Online/courtesy of the National Forestry and Grassland Administration)

14 |

15 | Thanks to a monitoring network which covers air and land, staff members working inside the national park can obtain data on the distribution of wild animals inside the park conveniently via their smartphones.

16 |

17 | "The monitoring system can send videos of wild Siberian tigers and Amur leopards in real time. This informationized and intelligent management platform can help facilitate construction of the national park," said Feng Limin, deputy director of the monitoring and research center of Siberian tigers and Amur leopards under the National Forestry and Grassland Administration, and who is also in charge of fieldwork at the research center for Siberian tigers and Amur leopards at Beijing Normal University.

18 |

19 | In the 10 years after 2005, Feng's team installed a total of 3,000 infrared cameras deep in the mountains and forests of Jilin Province and Heilongjiang Province, where Siberian tigers and Amur leopards are mainly distributed, to monitor their activities, Feng explained.

20 |

21 | In autumn of 2016, Ge Jianping, a professor from Beijing Normal University and also founder of the university's research center for Siberian tigers and Amur leopards, together with members of his research team, came to the area where the Northeast China Tiger and Leopard National Park is currently located to look for more suitable technological support for their wild animal protection efforts.

22 |

23 | After spending seven days searching for a solution, they eventually found that the place had wide optical fiber network coverage. The researchers thought they could use these facilities to promote informatization of their protection efforts.

24 |

25 |

26 |

27 | Photo shows an Amur leopard in the Northeast China Tiger and Leopard National Park. (People’s Daily Online/courtesy of the National Forestry and Grassland Administration)

28 |

29 | The research team then worked together with media companies and more than 10 domestic high-tech companies, jointly developing a 700 MHz frequency band suitable for the forest environment. They also developed key components and equipment including real-time monitoring equipment, terminal communication modules and connectors, and a management platform, among other equipment. In February 2018, a natural resources monitoring system was put into operation in the area.

30 |

31 | There are 95 base stations inside the national park, of which over 60 were built at the sites of former forestry fire monitoring towers. This practice has not only saved on costs, but also protected the vegetation.

32 |

33 | More than 20,000 monitoring terminals, including infrared cameras, have been set up inside the national park, capturing and then transmitting videos and images of wild Siberian tigers and Amur leopards over 30,000 times.

34 |

35 | At present, more than 6,800 forest rangers are engaged in the work of protecting wild animals inside the Northeast China Tiger and Leopard National Park. In addition to the rangers, more and more local villagers have started to join the protection drive. Han Changxue, a farmer from Hunchun city, Jilin Province, became a forest ranger in 2021. After receiving training, he can now skilfully remove traps and identify tracks of wild animals in the forest.

36 |

37 | The monitoring system has greatly improved patrol and management efficiency for forest rangers.

38 |

39 | "In recent years, we've participated in more than 580 maintenance tasks, repairing and replacing the relevant equipment to ensure that the monitoring system operates smoothly," said a person in charge of maintenance of the monitoring system.

40 |

41 | Thanks to this technology, the number of wild Siberian tigers inside the Northeast China Tiger and Leopard National Park has gone up to 50 from 27 in 2015, and the number of Amur leopards has exceeded 60 from 42 in 2015.

42 | 43 |
(Web editor: Hongyu, Du Mingming)
44 |
-------------------------------------------------------------------------------- /benchmark/data/article/htmls/4cd29b7d9856e0451dd951d35e88257c3817c3659ed2108165a29eacab9a1276.html: -------------------------------------------------------------------------------- 1 |
2 |
3 | 4 |
5 | 6 |
Fiji secured their first win in South Africa since 2009 as they thrashed the United States 29-15 in the final of the HSBC Cape Town Sevens on Sunday.
7 |
However, despite their loss in the final, the Americans still managed to take the lead in the World Series standings with New Zealand just one point behind in second with Fiji and England in third and fourth respectively.
8 |
In the much-anticipated final in the legislative capital of South Africa, Fiji captain Kalione Nasoko opened the scoring with a solo run after some excellent Fiji defending. Fiji were then handed two gift tries: first, Vilimoni Botitu intercepted a Madison Hughes pass and was free to run from deep in his own half; then Sevuloni Mocenacagi exploited a Perry Baker howler to extend his side's lead to 17-0 at halftime.
9 |
Botitu, who won HSBC Player of the Final, capitalized on a gap after halftime and despite Ben Pinkelman's two valiant tries, Fiji cruised through to the title victory.
10 |
11 | 12 |
13 |
USA's Perry Baker (L) is tackled during the HSBC World Rugby Sevens tournament final match between the United States and Fiji at the Cape Town Stadium in Cape Town, South Africa, December 9, 2018. /VCG Photo
14 |
15 |
16 |
17 |

USA's Perry Baker (L) is tackled during the HSBC World Rugby Sevens tournament final match between the United States and Fiji at the Cape Town Stadium in Cape Town, South Africa, December 9, 2018. /VCG Photo

18 |
19 |
20 |
21 | 22 |
Nasoko's line break allowed his second and Carlin Isles capped the final with a magnificent run to the corner, but the United States left the African country on a high note as they finished second for the second consecutive event.
23 |
After his side's emphatic triumph, Fiji head coach Gareth Baber said, “It's been a long time coming for the Fijians to win in Cape Town. We were disappointed last week that we didn't manage to perform to the level that we've done this week. 
24 |
“It makes me most proud that these players have done everything for their families back home and the country of Fiji. It's great. Fijian fans are everywhere in the world, wherever we travel, and we say a big thank you to all those who make a big effort to come and support us.”
25 |
Earlier in the day, hosts South Africa edged past arch-rivals New Zealand 10-5 to clinch the bronze medal while England finished fifth with a 14-7 win over Spain.
26 |
Argentina won the Challenge Trophy. The next stop on the HSBC World Rugby Sevens Series is Hamilton and the two-day event will be staged on January 26-27.
27 | 28 | 29 | 30 |
31 |
32 |
-------------------------------------------------------------------------------- /benchmark/data/article/htmls/83bd4dfc1e736b9a3885adc1ba5e74148b888887f494dc4e3f375a335e419a94.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 科普中国-辟谣文章 6 | 10 | 14 | 18 | 19 | 20 | 21 | 26 | 30 | 40 | 41 | 42 | 43 | 44 |
45 |
46 | 47 | 48 | 49 | 62 | 116 |
117 | 118 | 食品安全 122 | 暴雨 126 | 溺水 130 | 矿泉水 134 | 蚊子 138 |
139 |
140 |
141 | 142 | 143 | 144 |
145 |
146 |
147 |
148 | 徒手抓子弹真的存在吗?
149 |
150 | 来源: 科普中国-科学原理 153 | 154 |
155 |
156 |
157 |
158 |
159 |

看过电影《功夫》的人可能会记得火云邪神徒手抓子弹的剧情。那么在现实中,徒手抓子弹可不可以?首先介绍几个物理概念,在了解了物理知识后,再来考虑徒手接子弹是否行得通。

第一个是相对运动,它指的是一个物体对于另一个物体相对位置的改变。例如,坐在一辆行进的火车上,从车窗向外望去,会发现火车外的景象会朝自己的身后运动,最后消失在远处。这是因为我们选取地面作为静止参考系,而我们在向前运动,所以地面上的物体都会相对于我们而言向后运动;换句话说,如果我们在一辆向后开的车上,那么窗外的物体则是向前运动,这就是相对运动。

这都是假设某一个物体静止的情况,如果两者都在运动,如两辆并头前进的火车呢?这又要介绍另一个物理名词:相对速度。在物理学上“相对”可以划分多种情况,在这里我们讨论的是简单的例子,即在平动惯性参考系下的相对速度,假设两者都是匀速或匀加速运动,二者并没有相对加速度。例如,在接力赛交接棒的时候为什么接棒的人要先跑一段距离才接棒,而不是站在原地等棒到?就是为了在接棒的时候两个人的相对速度能小一点。如果两个人用相同的速度跑步,那么他们之间的速度就等于零,两个人相对于另一个人静止的,对于交接棒来说就非常方便。这就是在相对速度等于零的情况下的相对运动。

接下来讲讲徒手抓子弹的可行性。从上述的例子和说明中,我们可以猜出如果要徒手抓子弹的条件是什么。理论上,只要让你的速度和子弹的速度一样快的时候,子弹相对于你来说就是静止的,就可以轻松的抓住它。这就是徒手抓飞行中的子弹的唯一条件。

但是,这只是假设理想情况下,毕竟子弹的速度非常快,大概每秒四百多米左右。一般来说能达到这样的速度的话,只有坐在飞机上才能满足条件。而在这样的条件下应该没有人敢伸出手去抓子弹。所以,在理论上,当两者速度一样或区别不大的时候,物体对我们而言是静止或者以非常小的速度运行,我们便可以对它进行操作,例如航天器的对接,就是将二者的相对速度不断减小,最后在相对速度非常小或者几乎为零的情况下进行对接。不然以航天器每秒十几公里的速度,撞上去就是一个大灾难。在理想条件成立之后还要考虑现实意义,想要徒手抓子弹,不仅要和子弹保持相同速度,还能够伸出手而不被其他条件所干扰,具体实施起来非常困难。所以徒手接子弹只是电视特效而已,在现实生活中并不存在。

4vCKVZcfzM239uuCdaSbADa6FEg1rXU1xkwg.png

本作品为“科普中国-科学原理一点通”原创,转载时务请注明出处。

160 |
161 |
162 |
163 |
164 | 165 |
166 |
167 |  子弹 170 |  相对论 173 |
174 |
所属分类:数理化 
175 |
176 |
177 |
178 |
辟谣
179 |
180 | 181 |
182 | 183 |
184 |
185 | 186 |
187 |
复核
188 |
189 |
190 |
191 |
192 |
193 | 194 | 195 | 时间:2019-09-24 17:08 197 | 204 | 215 |
216 |
217 | 218 |
219 |
相关推荐
220 | 244 |
245 |
246 |
247 |
248 | 257 | 258 | 288 | 289 | 298 | 299 | 300 | -------------------------------------------------------------------------------- /benchmark/data/article/htmls/9c962e02099b01b20b1e92b0872e704753fc9905f6ea435449248ae98dd70145.html: -------------------------------------------------------------------------------- 1 |
A girl is comforted before a candle light vigil in Raleigh, North Carolina, on Oct. 15, 2022. The vigil is held to honor the five victims killed in a shooting two days earlier. [Photo/cfp.cn]

Multiple schools in the United States have been subject to school shooting hoaxes. 

Students or others will call the police to misreport that a school is under attack. Sometimes they say they witnessed a shooter. Sometimes they say they are the shooter. Sometimes they say they planted a bomb. Either way, it has a terroristic effect and results in the police being involved and the school being shut down.

Fake school shootings have real effects.

In a country like the U.S., where mass bloodshed at schools and in public squares is a weekly occurrence, police chiefs say they can't ignore any reported threat, no matter how unreliable it may seem. These false reports tie up the phones and waste police resources.

Then there is the effect on the students. Students say they are traumatized even by these shootings that didn't happen. They are told to go into lockdown. They are, from the time it is reported, under the notion that they are under attack. Since they have seen so many torn-flesh, broken-bones and lead-to-skin shootings play out, they have no reason to think otherwise. Some people have been hurt fleeing a school after a false report.

In fact, Marjory Stoneman Douglas High in Parkland, Florida, the subject of one of the worst shootings in American history, which is still fresh in students' memories as the trial of the shooter who perpetrated it just wrapped up, was barraged with threats during the trial's closing phase. The defamation trial against radio shock jock Alex Jones, who falsely claimed that the Parkland shooting never happened, had also wrapped up around the same time. Jones was found guilty and ordered to pay millions of dollars, while he encouraged his crazed listeners to take action.

It is evident from this and other cases that there is an intent to sow fear in some of the bomb hoaxes. The conspiracy theorists are particularly outraged against the Parkland victims for being victims of brutality that they think never happened. Some of them have vandalized the graves of the dead. They think they are supporting Alex Jones by making new threats against Marjory Stoneman Douglas High.

There have also been threats against historically black colleges and universities (HBCUs) that reek of racial terrorism. For decades, America's major universities were segregated. The white majority didn't allow blacks to attend school with whites. So black Americans established schools of their own. 

While schools have been desegregated, racial justice advocates still point to issues of inequality at elite white institutions. HBCUs still have an important role to serve. But their existence angers white racists. 

Howard University reports receiving eight hoax bomb threats this year. Howard is the alma mater of U.S. Vice President Kamala Harris, who is the first black and first female to hold such a high office in American history. 

However, not all of the hoax shootings reported are cases of terrorism. Sometimes students just want to miss school. According to Newsweek, "School shooting hoaxes were also the result of a TikTok trend late last year, during which some social media posts encouraged students to call in a fake school shooting incident in order to get out of school early."

America's unregulated social media wasteland has allowed hate speech, lies and propaganda to spread unabated. Now it is a breeding ground for false reports of school shootings. It seems that it is just the latest cool thing to do, like eating tide pods or playing the "fire challenge," other trends that spread on social media. 

America's twin passions of unrestricted access to guns and shameless social media provocation are hence united in a truly depraved trend.

Mitchell Blatt is a columnist with China.org.cn. For more information please visit:

http://www.china.org.cn/opinion/MitchellBlatt.htm

Opinion articles reflect the views of their authors, not necessarily those of China.org.cn.

If you would like to contribute, please contact us at opinion@china.org.cn.

2 |
Follow China.org.cn on Twitter and Facebook to join the conversation.
ChinaNews App Download
-------------------------------------------------------------------------------- /benchmark/data/article/htmls/9f4c1adb9f61d954f33a02ef1919248e69c007dcbe9d903a0e77b739779951da.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 夜雨_原文、翻译及赏析_白居易诗词_读古诗词网 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 24 | 25 | 26 |
27 |
28 | 29 | 37 | 38 | 43 | 44 |
45 | 46 |
47 |
48 |
49 | 首页 50 | 51 | 唐诗三百首 52 | 53 | 宋词三百首 54 | 55 | 古诗十九首 56 | 57 | 诗词名句 58 | 59 | 文言文 60 | 61 | 62 | 63 | 词牌名 64 |
65 |
66 |
67 |
68 | 69 |
70 |
71 | 先秦 72 | 两汉 73 | 魏晋 74 | 南北朝 75 | 隋代 76 | 唐代 77 | 五代 78 | 宋代 79 | 金朝 80 | 元代 81 | 明代 82 | 清代 83 | 现代 84 | 近代 85 | 86 |
87 |
88 | 91 |
92 | 93 | 94 | 129 | 130 | 131 |
132 | 133 |
134 | 135 | yè yǔ 136 | 137 |

夜雨

138 |
139 | 140 |
141 |

朝代:唐代

142 | 143 |

原文:

144 |
145 | 我有所念人,隔在远远乡。
我有所感事,结在深深肠。
乡远去不得,无日不瞻望。
肠深解不得,无夕不思量。
况此残灯夜,独宿在空堂。
秋天殊未晓,风雨正苍苍。
不学头陀法,前心安可忘。 146 |
147 |
148 | 149 | 150 | 151 | 152 |
153 |

译文及注释

154 | 更多 155 |
156 | 译文
我有着深深思念的人,却相隔在远远的异乡。
我有所感怀的事情,深深的刻在心上。
故乡遥远回不去,我没有一天不遥望它。
内心痛苦万分却无处化解,日日夜夜未曾停止思念。
我的前途似乎也迷茫无望,孤独的在空空的屋子里睡觉。
秋天尚未来临,却已风雨纷纷。
不曾学过苦行僧的佛法,如何忘记曾经的过往!
注释
①乡:家乡。
②瞻望:往远处或高处看,敬仰并寄以希望。
③夕:日落的时候:夕阳。夕照。朝夕相处。无夕:日日夜夜。
④残灯:不好的事。夜:黑夜,代指前途的... 157 |
158 | 159 | 160 |
161 | 162 |
163 |

创作背景

164 | 更多 165 |
166 |

作者:佚名
此诗写于元和六年,时年白居易四十岁。从“独宿在空堂”可以看出,此诗是为一个与作者相爱的女子而写。且学者周相录考证,这个女子就是白居易几度在诗作中提到过的“东邻婵娟子”湘灵。

167 |
168 | 169 | 170 |
171 |
172 |

赏析

173 | 更多 174 |
175 |

作者:佚名
这首诗不是唐代所流行的工整的今体诗,它共有七句话,前四句大量的重复用字,也并不合乎诗歌的习惯。意像的描写被放在了叙事之后,全诗没有比喻、没有用典,也没有大量的兴、比之作,可以说是完全没有格律的羁绊,用最直白的语言,抒发了最真挚的情感。
开头开门见山的写出了所要记叙的事情,好像是憋闷了许久的言语冲口而出,强烈的感情色彩也在无需意像的渲染的情况下,立刻展现在读者面前。四句话形式上两两相同,但情感上却是层层深入的,第一句提到了人的思念和远,都是平平的概述,点到为止,并没有提及是一个什么样的人,如何的思念她。第二句则重点写到了思念,写到思念的程度,也并没有深写。行文...

176 |
177 |
178 | 179 |
转载请注明:原文链接 - https://dugushici.com/ancient_proses/21337
180 |
181 | 182 |
183 |
184 |
猜你喜欢:
185 |
186 | 思念 187 | 爱情 188 |
189 |
190 |
191 |

白居易

192 | 193 | 194 | 195 | 196 | 197 |
白居易    白居易(772~846),字乐天,晚年又号称香山居士,河南郑州新郑人,是我国唐代伟大的现实主义诗人,他的诗歌题材广泛,形式多样,语言平易通俗,有“诗魔”和“诗王”之称。官至翰林学士、左赞善大夫。有《白氏长庆集》传世,代表诗作有《长恨歌》、《卖炭翁》、《琵琶行》等。白居易祖籍山西、陕西、出生于河南郑州新郑,葬于洛阳。白居易故居纪念馆坐落于洛阳市郊。白园(白居易墓)坐落在洛阳城南香山的琵琶峰。
198 |
199 | 200 |
201 |

白居易其他诗词更多

202 | 214 |
215 | 216 | 217 | 218 | 219 |
220 | 221 | 222 | 223 |
224 | 225 |
226 |
227 | 唐诗三百首全集  |   228 | 宋词三百首全集  |   229 | 古诗十九首全集  |   230 | 诗词名句 231 | |   232 | 词牌名大全 233 |
234 |
235 | 友情链接: 236 | 手Q阅读 237 |
238 | Copyright © 2017 读古诗词网 - dugushici.com  |  免责声明  |  站务邮箱:lang681502@126.com 239 |
240 | 241 | 250 | 251 | 259 | 260 | 261 | 262 | 263 | -------------------------------------------------------------------------------- /benchmark/data/article/htmls/c2020fd56735f8a698535b20c512c2978fcae1836ea12fa95c0587c60488af42.html: -------------------------------------------------------------------------------- 1 |

2 |

Guests attend a ceremony to mark the resumption of operation of the Hong Kong section of the Guangzhou-Shenzhen-Hong Kong Express Rail Link at the Hong Kong West Kowloon Station in south China's Hong Kong, Jan. 15, 2023.

3 |

After nearly three years of service suspension due to the COVID-19 pandemic, the Hong Kong section of the Guangzhou-Shenzhen-Hong Kong Express Rail Link resumed operation Sunday, with an average of 38.5 pairs of high-speed trains on a daily basis running from stations in Guangzhou and Shenzhen to the Hong Kong West Kowloon Station at the initial stage. (Xinhua/Li Gang)

4 |

HONG KONG/GUANGZHOU, Jan. 15 (Xinhua) -- It was a little past dawn Sunday when the Hong Kong West Kowloon Station welcomed its first batch of travelers, who couldn't hide their excitement behind the masks as they were about to catch the very first high-speed train in nearly three years to the Chinese mainland.

5 |

"My son can't wait to get a taste of the special dishes of our hometown," said a traveler surnamed Fu, who hasn't been back to northeastern China for two years. Soon she and her son would get onboard the first train in the morning to Shenzhen, where she would then transfer to her hometown for the upcoming Chinese Lunar New Year holiday.

6 |

After nearly three years of service suspension due to the COVID-19 pandemic, the Hong Kong section of the Guangzhou-Shenzhen-Hong Kong Express Rail Link resumed operation Sunday, with an average of 38.5 pairs of high-speed trains on a daily basis running from stations in Guangzhou and Shenzhen to the Hong Kong West Kowloon Station at the initial stage.

7 |

Liao Jun, a student from Jiangxi Province who studies in Hong Kong, was excited to catch the first train back to the mainland.

8 |

"Today will be a day to remember. The resumption of high-speed rail services is of great significance to the overall connectivity between the Hong Kong Special Administrative Region and the mainland," he said.

9 |

The first train left for Shenzhen at 7:03 a.m. local time. In only 18 minutes, the train arrived at Shenzhenbei station, where a southbound high-speed train was ready to take passengers to Hong Kong.

10 |

The windows of the train were decorated with Chinese paper-cuts that read "happy new year" and wishes for the upcoming Year of the Rabbit, which falls on Jan. 22.

11 |

At the Hong Kong West Kowloon Station, inbound passengers were greeted with gift bags that read "Hello, Hong Kong." The arrival hall was decorated with potted flowers that signal good fortune for a new year.

12 |

"I printed out the ticket today as a souvenir," said a businessman surnamed Bu, who just took the train from Shenzhen to Hong Kong to visit his clients.

13 |

"It would be of great help to be able to talk to my clients face to face," he said.

14 |

By 10:00 a.m. Sunday morning, nine northbound bullet trains and seven southbound bullet trains had taken around 1,400 passengers to their destinations. Tickets were sold out for most of the trains.

15 |

16 |

Passengers are greeted with gift bags at the Hong Kong West Kowloon Station in south China's Hong Kong, Jan. 15, 2023.

17 |

After nearly three years of service suspension due to the COVID-19 pandemic, the Hong Kong section of the Guangzhou-Shenzhen-Hong Kong Express Rail Link resumed operation Sunday, with an average of 38.5 pairs of high-speed trains on a daily basis running from stations in Guangzhou and Shenzhen to the Hong Kong West Kowloon Station at the initial stage. (Xinhua/Li Gang)

18 |

19 |

Passengers get their tickets checked at the Hong Kong West Kowloon Station in south China's Hong Kong, Jan. 15, 2023.

20 |

After nearly three years of service suspension due to the COVID-19 pandemic, the Hong Kong section of the Guangzhou-Shenzhen-Hong Kong Express Rail Link resumed operation Sunday, with an average of 38.5 pairs of high-speed trains on a daily basis running from stations in Guangzhou and Shenzhen to the Hong Kong West Kowloon Station at the initial stage. (Xinhua/Li Gang)

21 |

22 |

Passengers are greeted with gift bags at the Hong Kong West Kowloon Station in south China's Hong Kong, Jan. 15, 2023.

23 |

After nearly three years of service suspension due to the COVID-19 pandemic, the Hong Kong section of the Guangzhou-Shenzhen-Hong Kong Express Rail Link resumed operation Sunday, with an average of 38.5 pairs of high-speed trains on a daily basis running from stations in Guangzhou and Shenzhen to the Hong Kong West Kowloon Station at the initial stage. (Xinhua/Li Gang)

24 |

25 |

Passengers pose for photos aboard the train G5607 bound for Hong Kong at the Shenzhen North Railway Station in Shenzhen, south China's Guangdong Province, Jan. 15, 2023.

26 |

After nearly three years of service suspension due to the COVID-19 pandemic, the Hong Kong section of the Guangzhou-Shenzhen-Hong Kong Express Rail Link resumed operation Sunday, with an average of 38.5 pairs of high-speed trains on a daily basis running from stations in Guangzhou and Shenzhen to the Hong Kong West Kowloon Station at the initial stage. (Xinhua/Mao Siqian)

27 |

28 |

Crew members of the train G5607 bound for Hong Kong welcome passengers at the Shenzhen North Railway Station in Shenzhen, south China's Guangdong Province, Jan. 15, 2023.

29 |

After nearly three years of service suspension due to the COVID-19 pandemic, the Hong Kong section of the Guangzhou-Shenzhen-Hong Kong Express Rail Link resumed operation Sunday, with an average of 38.5 pairs of high-speed trains on a daily basis running from stations in Guangzhou and Shenzhen to the Hong Kong West Kowloon Station at the initial stage. (Xinhua/Mao Siqian)

30 |

31 |

Passengers get their tickets checked at the Shenzhen North Railway Station in Shenzhen, south China's Guangdong Province, Jan. 15, 2023.

32 |

After nearly three years of service suspension due to the COVID-19 pandemic, the Hong Kong section of the Guangzhou-Shenzhen-Hong Kong Express Rail Link resumed operation Sunday, with an average of 38.5 pairs of high-speed trains on a daily basis running from stations in Guangzhou and Shenzhen to the Hong Kong West Kowloon Station at the initial stage. (Xinhua/Mao Siqian)

33 |

34 |

The train G5607 bound for Hong Kong is pictured at the Shenzhen North Railway Station in Shenzhen, south China's Guangdong Province, Jan. 15, 2023.

35 |

After nearly three years of service suspension due to the COVID-19 pandemic, the Hong Kong section of the Guangzhou-Shenzhen-Hong Kong Express Rail Link resumed operation Sunday, with an average of 38.5 pairs of high-speed trains on a daily basis running from stations in Guangzhou and Shenzhen to the Hong Kong West Kowloon Station at the initial stage. (Xinhua/Mao Siqian)

-------------------------------------------------------------------------------- /benchmark/data/article/htmls/c9d7202a66cd79fcb61bccdc7911d10618ff02bd32296b4c939e44e1a42055fa.html: -------------------------------------------------------------------------------- 1 |
2 |

 

"We have strongly advocated in favor of public support using public money to support the airline industry because we think that first of all we are absolutely key for the economic development."

That's what Alexandre De Juniac, IATA Director-General and CEO, told CGTN Europe today, as we mark the middle of Aviation Week on CGTN. 

In today's interview we hear from another key voice in commercial flying, though one you might not have heard from before – Annette Groeneveld of the European Cabin Crew Association joined us from Amsterdam.

But first, today's headlines...

There is good news for foreign investors looking towards China, as the country's commerce minister announced plans to reverse a drop in foreign direct investment. 

But European exporters to the U.S. may be facing a challenging time, with Washington reportedly mulling over even more taxes on goods like olives and decaf coffee (though I admit I was not aware such a thing existed – typical journalist, I know). 

Finally, my colleague Nilay Syam, whom the attentive readers among you will recognize from this very email when I'm away, has written an explanation of the "hub and spoke" model used by major airlines the world over. Do give it a read, here.

Happy reading, 

Patrick Atack

Digital business correspondent 

P.S. Did you know we send this briefing by email, too? Sign up here.

3 |
4 | 5 |
6 | 7 |

Foreign investors will be given more freedom in the service, manufacturing and agricultural sectors based on the newly released negative list for 2020, according to China's economic planner on Wednesday. China will beef up efforts to maintain stable foreign trade and investment, Commerce Minister Zhong Shan told media. 

Despite a second month in a row of climbing consumer confidence in the Republic of Ireland, shoppers' sentiments are still yet to recover from the country's record drop, which followed the COVID-19 lockdown. But the uptick is a good sign for the Irish economy, which is on a renewed trajectory to reopen. 

The International Monetary Fund says the global economy faces a deeper recession than it previously thought. The organisation predicted the pandemic will cause global growth to fall by nearly five percent this year – a worsening of around one-and-a-half percentage points compared with its forecast in April. The IMF expects the economies of countries such as the United Kingdom, France and Mexico to experience falls of more than 10 percent.

The U.S. is reported to be considering new levies on European goods, such as olives, as the dispute between the EU and Washington over state aid to Airbus rumbles on. An additional $3.1 billion of taxes is being mooted, according to a Trade Representative document, listing products from France, Germany, Spain and the UK. 

The Wirecard saga continues with a global edge, as authorities in the Philippines confirmed they are looking for the German firm's second-in-command, Jan Marsalek, following German media reports the former chief of operations had flown to the country after news broke of the firm's missing billions. 

Swissport, which supplies services to 21 airports in the UK, is to halve its workforce due to market conditions following the pandemic and lockdown. The firm said it was consulting on cutting up to 4,556 jobs, which unions said was "devastating" to UK regional airports. 

The International Labour Organization's Manuela Tomei has repeated the body's warning that millions of migrant workers could be "very easy prey for labor market exploitation" amid a global downturn. In some countries, residency rights are linked to work, so if a migrant worker loses their job, they may be forced out of the nation. 

Chinese liquor brand Kweichow Moutai is now worth $260 billion according to Refinitiv data, making it the world's most valuable refreshment company, overtaking Western stalwarts like Coca-Cola. In terms of alcohol manufacturers, Moutai is now bigger than AB InBev, Diageo and Heineken combined. 

Air quality across European cities has dropped again, after months of lockdown, as vehicles return to the roads. The European Environment Agency said Paris, Brussels and Milan have seen higher nitrogen oxide levels since lockdown was partially eased. 

Indian tech sector workers have expressed their anger at the suspension of the U.S. H1-B visa, with South Asian immigrants to the U.S. tech and IT sectors making up more than 70 percent of the H1-B recipients. Many of those affected have made lives in the U.S. but cannot return after taking holiday or travelling to see relatives. 

Mercedes-Benz's journey towards an autonomous vehicle has taken a step forward, as the German brand's parent company Daimler signed a deal with semiconductor maker Nvidia Corp. "We want to launch a ground-breaking software-defined computer architecture for driving assistance and autonomous drive," Daimler CEO Ola Kaellenius said. 

Premier Foods, makers of stock cubes and instant gravy, saw profits and revenues exceed its expectations, as lockdown has driven people to cook at home rather than ordering in or going out to eat. Though first quarter revenues were up 20 percent, the firm warned that it did not know how the next two quarters would look.

And the Majestic Wines-owned online retailer Naked Wines saw an 81 percent surge in orders in April, and had to stop taking orders for a period in May to ensure stocks would last. Again, though, the future is uncertain with pubs and restaurants opening on 4 July in the UK.

8 |

Video: Luring back sunseekers

Benidorm has become synonymous with low-cost beach holidays since it first boomed as a tourist resort in the 1960s. Its mix of sun, sea and sand at an affordable price has been bringing in 16 million people a year – but the COVID-19 outbreak has changed that.

With the state of alarm now lifted, Spain is eager to get its tourism sector up and running again.

 

9 | 10 | 11 | 12 |
13 | 14 |
15 |
16 |
17 |
18 | 19 |
20 | 21 |
22 | 23 |
24 | 25 |
26 | 02:05 27 |
28 | 29 | 30 | 31 |
32 |
33 |
34 |
35 |
36 | 37 |

 

Annette Groeneveld is the President of the European Cabin Crew Association, and she spoke to CGTN Europe about the reaction of airlines to the pandemic – and how she sees the global industry changing after COVID-19. 

What is it like flying as cabin crew now, in June 2020? 

It's a very different experience. What you usually do is you try to take care of your passengers the best way you can, have a lot of contact with them, try to show your interests and talk to them. And now actually what you're doing is you try to keep your distance, and that dynamic on-board is so different that it leaves you a bit stunned when you finish the flight, because you hardly know how your passengers have experienced your flight. So it's quite unnerving to see how things have changed in so short a time.  

Looking long-term, which changes do you think we will see in global aviation?  

Well, I think it will be a long time before global aviation will be the same again, if ever. I think what we will see, is that we will have to think very hard on how we want things to be on-board. Airlines have been densifying their cabins quite drastically over the last years because the aviation industry is a very marginal business. And to make the company sustainable, they have to take more passengers into the cabin, breaking down galleys, putting in more seats, which is what we can see now is very counter-productive – because social distancing of one and a half meters is, let me be clear, not possible onboard an aircraft.  

But we have airlines that have been densifying to offer tickets for low prices, and I think what we see now is the consequences of that movement within the aviation industry. If you want to fly all over Europe or maybe even globally for very low prices, like 19 euros from Amsterdam to Malaga or for 350 euros from Amsterdam to Bangkok, then I think the situation that we're in right now will recur and recur.  

Airlines are stuffing too many passengers in their cabins. There is not enough space for cabin crew. We have aisles that are as wide as our trolleys. There is nowhere anyone can go. So I think we really have to rethink how we want to travel.  

Has there been enough industry leadership seen through this crisis?  

Well, what we've seen is that I think it caught everybody by surprise. And airlines have been scrambling all over Europe to take their measures. But because there was no clear protocol within the European Union, airlines were basically left to fend for themselves and find out the best way to control things. And I think that should have been better managed. I think we should have had a comprehensive guideline, where the European Union would make sure that airlines would treat this crisis in the same way, because we've seen everything between no measures at all to airlines who were very quick to react and to provide their cabin crew with all kinds of materials to protect themselves and the crew. So I think that could have been managed a lot better.  

We need comprehensive rules to ensure that all airlines approach these problems in the same way. 

38 |

This week is Aviation Week on CGTN. The below graphic is taken from my colleague Nilay's long read on the Hub and Spoke model, employed by global airlines. But could it be under threat from COVID-19 and "ultra long haul" flights? Read more here. 

39 |
40 | 41 |
42 | 43 | 44 | 45 | 46 |
47 |
48 |
-------------------------------------------------------------------------------- /benchmark/data/forum/htmls/a6b698cf8edc93e22f1eab9e32596d0b9859d7d288bde50477199527ae90a5cd.html: -------------------------------------------------------------------------------- 1 | 2 | Regex match line containing string 3 | 4 | 5 | 6 | 7 | 327 | status 328 | 333 |
334 |

Regex match line containing string

mariorojas - 2 years ago (last modified 2 years ago) 335 |

336 |

^.*substring.*$
337 | 
338 |

339 | I usually use this regex to filter specific lines in a huge log file. 340 |

341 | Please change substring with the text you're trying to filter. 342 |

343 | -------------------------------------------------------------------------------- /benchmark/data/forum/htmls/e4662f72940f9f9bb0b43ae057f0f533c2da3e1e93b0ee789408b983fcaba260.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Nim forum 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 21 | 22 | 23 | 24 |

Search results

shirleyquirk

...In JS or Ruby there's no compiler at all, yet programmers are able to reason about the code.

25 |

compiler/interpreter/transpiler tomato/tomato. the point, i believe, is about when it's justified for the <box that turns words into electrons> to impose it...

tcheran
...close() 26 | ``` 27 | 28 | The output is like this: 29 | 30 | 100K_RAT;Species;Rattus norvegicus (Rat) 31 | 104K_THEPA;Species;Theileria parva 32 | 108_LYCES;Species;Lycopersicon esculentum (**Tomato**) 33 | 10KD_VIGUN;Species;Vigna unguiculata (Cowpea) 34 | 110K_PLAKN;Species;Plasmodium knowlesi 35 | 11S3_HELAN;Species;Helianthus annuus (Common sunflower) 36 | 37 | (...) 38 | 39 | However I'm not...
40 | 41 | 42 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /benchmark/data/forum/scores/2024-11-21_16-22-06.json: -------------------------------------------------------------------------------- 1 | {"https://movie.douban.com/subject/36151693/comments?status=P": {"f1": 0.8217433888344761}, "https://stackoverflow.com/questions/1077347/hello-world-in-python": {"f1": 0.9469767441860466}, "https://forums.bellaonline.com/ubbthreads.php/topics/888393": {"f1": 0.8598265895953758}, "https://community.familysearch.org/en/discussion/118004/'%20+%20moreLink%20+%20'": {"f1": 0.7195357833655706}, "https://careercup.com/question?id=2780": {"f1": 0.8971553610503281}, "https://zhidao.baidu.com/question/1247295277789522099.html?fr=search&word=%E5%9C%B0%E9%9C%87%E5%8F%B0": {"f1": 0.8055045871559633}, "https://zhidao.baidu.com/question/2127549042326254267.html?qbl=relate_question_2": {"f1": 0.8970189701897019}, "https://tieba.baidu.com/p/8057560733": {"f1": 0.9777306468716861}, "https://www.zhihu.com/question/586816837/answer/2916374892": {"f1": 0.9349453080023029}, "https://www.reddit.com/r/BoomersBeingFools/comments/1cq6hci/boomer_did_not_like_the_fact_i_took_down_the/": {"f1": 0.6962320773591197}, "https://www.reddit.com/r/BoomersBeingFools/comments/1cqcfkk/boomer_took_offense_to_my_teacher_appreciation/?chainedPosts=t3_1cq6hci": {"f1": 0.6100656455142232}, "https://www.zhihu.com/question/651439951/answer/3452833521": {"f1": 0.9988683515654471}, "https://www.codeproject.com/Questions/5381388/How-do-I-detect-spots-on-tablets": {"f1": 0.7193763919821826}, "https://twitter.com/PKU1898/status/1785126929055805894": {"f1": 0.8076923076923077}, "https://x.com/PKU1898/status/1734240373080572056": {"f1": 0.9}, "https://weibo.com/1618051664/Oe6wIynDn?refer_flag=1001030103_": {"f1": 0.504225352112676}, "https://www.pinterest.com/pin/unique-and-funny-mothers-day-cards-that-will-make-mom-lol--57280226501887680/": {"f1": 0.5893416927899686}, 
"https://ohnotheydidnt.livejournal.com/128386917.html": {"f1": 0.9738219895287958}, "https://www.kaskus.co.id/thread/6640f533517176a36c034afa/seremmm-banget-pernah-denger-gantung-jodoh-jangan-sampe-deh?ref=homelanding&med=hot_thread&style=thumb": {"f1": 0.8936970837253058}, "https://www.kaskus.co.id/thread/65f0495ce12575606b0c9640/saya-kumpulkan-komunitas-anak-dan-orang-tua-untuk-jual-beli-mainan-tak-terpakai?ref=postlist-193&med=hot_thread": {"f1": 0.8894009216589862}, "https://www.lucianne.com/2024/05/21/rfk_jr_vows_to_give_black_farmers_5_billion_in_reparations_if_he_wins_the_white_house_and_says_funds_are_not_money_that_is_entitlement_despite_white_counterparts_labeling_it_racist_128894.html": {"f1": 0.39837847798046805}, "https://www.worldaffairsboard.com/forum/political-general-discussion/science-technology/1586800-a-small-victory-dance": {"f1": 0.7429443173150267}, "https://www.worldaffairsboard.com/forum/wab-community-information/wab-information-center/48939-thread-title-correction": {"f1": 0.5960264900662252}, "https://www.scienceforums.net/topic/133914-photon-absorption-and-electron-transition-levels%C2%A0/": {"f1": 0.40017436791630345}, "https://forum.uipath.com/t/uipath-load-testing/739028/3": {"f1": 0.9431438127090301}, "https://forum.uipath.com/t/how-to-run-uipath-studio-and-studiox-on-same-computer/421350": {"f1": 0.508108108108108}, "https://ruby-china.org/topics/43661": {"f1": 0.7836456558773425}, "https://community.sunrise.ch/d/36387-abbonamento-in-pausa-upc-in-sunrise-no": {"f1": 0.989247311827957}, "https://forums.debian.net/viewtopic.php?t=159270": {"f1": 0.9735576923076924}, "https://theultimatetone.com/thread-214.html": {"f1": 0.982}, "https://theultimatetone.com/thread-268.html": {"f1": 0.9992779783393501}, "https://forums.debian.net/viewtopic.php?t=158836": {"f1": 0.9715061058344641}, "https://what.thedailywtf.com/topic/29130/the-10th-anniversary-of-the-rocky-lobster-incident/5": {"f1": 0.9662551440329219}, 
"https://forum.squarespace.com/topic/296584-how-can-i-have-the-same-image-at-the-top-of-all-my-blog-posts/": {"f1": 0.8886597938144329}, "https://forum.squarespace.com/topic/298933-how-to-hide-specific-product-variant-options-from-product-detail-page/": {"f1": 0.9578544061302682}, "https://www.marshallforum.com/media/for-hire-in-new-zealand-tony-tone-hire-2-x-jcm800-2203-heads-on-a-jcm800-bass-cabinet-1-x-76-super-bass-1-x-2555-half-stack-1-x-jcm800-combo-10.7130/": {"f1": 1}, "https://csolbbs.tiancity.com/forum.php?mod=viewthread&tid=38672#lastpost": {"f1": 0.6271604938271604}, "https://csolbbs.tiancity.com/forum.php?mod=viewthread&tid=38733&extra=": {"f1": 0.6517857142857143}, "https://english.stackexchange.com/questions/339859/what-do-you-call-a-person-who-is-a-mentor-to-someone-but-he-is-not-aware-of-it": {"f1": 0.5154894671623297}, "https://www.marshallforum.com/threads/extending-pilot-light-lifespan-with-a-resistor.138154/": {"f1": 0.8931552587646077}, "https://forums.flaskbb.org/topic/2295-need-some-help-to-install-flaskbb-on-windows": {"f1": 0.9959514170040485}, "https://forums.flaskbb.org/topic/2322-can-t-see-any-routes": {"f1": 0.9951219512195122}, "https://meta.answer.dev/questions/D1JI2/how-to-add-new-api": {"f1": 0.8767123287671234}, "https://meta.answer.dev/questions/D1l2/there-is-also-a-problem-with-the-smtp-mailbox": {"f1": 0.9508196721311475}, "https://patent.bbscloud.com/info/2497?csr=1": {"f1": 0.793103448275862}, "https://mlog.club/article/6208943": {"f1": 0.9207317073170732}, "https://www.paopao.info/#/post?id=1080035365": {"f1": 0.4813805631244324}, "https://mlog.club/topic/1666": {"f1": 0.888888888888889}, "https://www.developers.pub/faq/1274706": {"f1": 0.8520179372197308}, "https://bbs.nanshengbbs.top/detail/171": {"f1": 0.9939462426345952}, "https://misago-project.org/t/lost-connection-with-the-application/1289/#post-6344": {"f1": 0.9732620320855615}, "https://learnku.com/articles/86852": {"f1": 0.9428129829984544}, 
"https://learnku.com/articles/77685": {"f1": 0.7832167832167832}, "https://www.kunena.org/forum/general-questions-and-how-tos/168417-social-login-options-like-facebook": {"f1": 0.5249457700650759}, "https://insightful.demo.talkyard.io/-5/how-to-estimate-how-long-an-unfamiliar-task-will-take": {"f1": 0.8892892892892893}, "https://insightful.demo.talkyard.io/-252/i-have-an-issue": {"f1": 0.7586206896551725}, "https://discuss.flarum.org.cn/d/15448/3": {"f1": 1}, "https://discuss.flarum.org.cn/d/15648": {"f1": 0.9129886506935686}, "https://devdojo.com/question/remove-page-from-query-string": {"f1": 0.9657320872274143}, "https://devdojo.com/question/new-upcoming-elements-1": {"f1": 0.9446808510638297}, "https://www.kunena.org/forum/k-6-3-0-support/168431-forum-statistics-bug-return-value-must-be-of-type-array,-none-returned": {"f1": 0.5353982300884956}, "https://forum.agreper.com/thread/1/": {"f1": 0.9372384937238493}, "https://forum.agreper.com/thread/13/": {"f1": 0.7764705882352941}, "https://forum.nim-lang.org/search?q=tomato": {"f1": 0.9511400651465798}, "https://cycling74.com/forums/gen-patch-spacial-arrangement-different-performance": {"f1": 0.9860992907801418}, "https://cycling74.com/forums/somebody-help-me": {"f1": 0.9360613810741688}, "https://css-tricks.com/forums/topic/last-child-or/": {"f1": 0.9163120567375886}, "https://tabletennis.ph/forum/international-events/john-russel-misal-started-strong-at-southeast-asian-regional-olympic-qualification/": {"f1": 0.8614800759013284}, "https://tabletennis.ph/forum/international-events/john-russel-misal-started-strong-at-southeast-asian-regional-olympic-qualification/#post-147": {"f1": 0.8614800759013284}, "https://boards.weddingbee.com/topic/fav-nail-colour-style-for-your-ring/": {"f1": 0.8904933814681107}, "https://tudodetectores.com/community/confira-ultimas-descobertas-dos-detectoristas/nao-se-iludam-com-pepitas-de-ouro-nao-e-facil-encontrar/#post-301": {"f1": 0.9695652173913044}, 
"https://tudodetectores.com/community/detectores-de-pepitas-de-ouro/qual-e-melhor-nokta-gold-finder-2000-ou-gold-monster-1000/": {"f1": 0.9549702633814783}, "https://demo.sabaidiscuss.com/questions/question/is-there-any-cms-better-than-wordpress-or-should-i-roll-my-own": {"f1": 0.9340727048675291}, "http://demo.designwall.com/dwqa/question/a-few-quick-pre-sale-questions/": {"f1": 0.9686520376175548}, "https://forumengine.enginethemes.com/thread/new-packages-directoryengine-and-freelanceengine/": {"f1": 0.9697933227344993}, "https://forumengine.enginethemes.com/thread/announcement-delay-in-support-from-april-28th-to-may-03rd-2/": {"f1": 0.9446254071661238}, "http://demo.designwall.com/dwqa/question/can-i-custom-the-page-template-of-single-question/": {"f1": 1}, "https://asgaros.com/support/topic/blocke-nur-ab-bestimmter-benutzerrolle-nebeneinander/": {"f1": 0.6469760900140646}, "https://demo.wpdiscussionboard.com/discussion-topics/how-do-i-start-getting-art-commissions/": {"f1": 0.9903083700440528}, "https://demo.wpdiscussionboard.com/discussion-topics/kaj-mislite-kupimo-flow/": {"f1": 0.8387096774193548}, "https://www.speakoutwireless.ca/speak/phone-features/need-some-help-setting-up-data-on-android-phone/": {"f1": 0.8843899840170485}, "https://www.speakoutwireless.ca/speak/7-eleven-rates-and-plans/july-august-2023-25-account-bonus-with-100-top-up/": {"f1": 0.819548872180451}, "https://asgaros.com/support/topic/design-colors-and-seo-in-the-asgaros-forum/": {"f1": 0.6941775014132279}, "https://clevious.com/questions/question/when-should-one-switch-from-shared-hosting-to-a-vps/": {"f1": 0.9857612267250823}, "https://forumwpplugin.com/topic/mark-as-read-read-content-etc/": {"f1": 0.6445497630331753}, "https://www.hahn-tech.com/ans/how-to-export-trade-alerts-from-thinkorswim/": {"f1": 0.886509635974304}, "https://www.hahn-tech.com/tradestation-solutions/": {"f1": 0.914572864321608}, "https://www.quora.com/What-is-the-value-of-sport-culture-participation-in-school": 
{"f1": 0.9653379549393414}, "https://www.quora.com/Do-you-support-pyramid-schemes": {"f1": 0.2783563186039966}, "https://engineering.stackexchange.com/questions/60671/how-to-build-apartment-floors-ceilings-to-not-transfer-sound": {"f1": 0.9072815533980583}, "https://www.codeproject.com/Questions/5382597/How-I-can-integrate-EID-reader-with-tablet": {"f1": 0.36649214659685864}, "https://imgur.com/gallery/c6eFtrZ": {"f1": 0.7506053268765133}, "https://imgur.com/gallery/comet-portugal-HqvSwNh": {"f1": 0.6415094339622641}, "https://www.lucianne.com/2024/05/12/antisemitism_awareness_act_is_the_antidote_to_dei_128386.html": {"f1": 0.1436599071585634}, "https://www.scienceforums.net/topic/131993-sum-of-prime-numbers/": {"f1": 0.7304785894206549}, "https://what.thedailywtf.com/topic/11267/about-the-side-bar-wtf-category/2": {"f1": 0.1834862385321101}, "https://puzzling.stackexchange.com/questions/126764/which-duplo-digit-is-6-or-9": {"f1": 0.6946721311475409}, "https://patent.bbscloud.com/info/1454?csr=1": {"f1": 0.2608695652173913}, "https://www.developers.pub/faq/1312719": {"f1": 0.04}, "https://bbs.nanshengbbs.top/topic/detail/1102": {"f1": 0.44943820224719105}, "https://css-tricks.com/forums/topic/grid-items-appear-to-float-left/": {"f1": 0.8571428571428571}, "https://magiciansandmagic.com/forums/topic/top-3-mentalists-time/": {"f1": 0.1864406779661017}, "https://magiciansandmagic.com/forums/topic/how-to-use-the-forums/": {"f1": 0.0878048780487805}} -------------------------------------------------------------------------------- /benchmark/data/forum/scores/ori.json: -------------------------------------------------------------------------------- 1 | { 2 | "https://movie.douban.com/subject/36151693/comments?status=P": { 3 | "f1": 0.8217433888344761 4 | }, 5 | "https://stackoverflow.com/questions/1077347/hello-world-in-python": { 6 | "f1": 0.9258266309204647 7 | }, 8 | "https://forums.bellaonline.com/ubbthreads.php/topics/888393": { 9 | "f1": 0.8569299552906109 10 | }, 
11 | "https://community.familysearch.org/en/discussion/118004/'%20+%20moreLink%20+%20'": { 12 | "f1": 0.7081218274111675 13 | }, 14 | "https://careercup.com/question?id=2780": { 15 | "f1": 0.8971553610503281 16 | }, 17 | "https://zhidao.baidu.com/question/1247295277789522099.html?fr=search&word=%E5%9C%B0%E9%9C%87%E5%8F%B0": { 18 | "f1": 0.8077276908923643 19 | }, 20 | "https://zhidao.baidu.com/question/2127549042326254267.html?qbl=relate_question_2": { 21 | "f1": 0.9006802721088434 22 | }, 23 | "https://tieba.baidu.com/p/8057560733": { 24 | "f1": 0.9386098427194319 25 | }, 26 | "https://www.zhihu.com/question/586816837/answer/2916374892": { 27 | "f1": 0.9349453080023029 28 | }, 29 | "https://www.reddit.com/r/BoomersBeingFools/comments/1cq6hci/boomer_did_not_like_the_fact_i_took_down_the/": { 30 | "f1": 0.7061486231610713 31 | }, 32 | "https://www.reddit.com/r/BoomersBeingFools/comments/1cqcfkk/boomer_took_offense_to_my_teacher_appreciation/?chainedPosts=t3_1cq6hci": { 33 | "f1": 0.5872773536895673 34 | }, 35 | "https://www.zhihu.com/question/651439951/answer/3452833521": { 36 | "f1": 0.9988683515654471 37 | }, 38 | "https://www.codeproject.com/Questions/5381388/How-do-I-detect-spots-on-tablets": { 39 | "f1": 0.7225950782997763 40 | }, 41 | "https://twitter.com/PKU1898/status/1785126929055805894": { 42 | "f1": 0.8076923076923077 43 | }, 44 | "https://x.com/PKU1898/status/1734240373080572056": { 45 | "f1": 0.9 46 | }, 47 | "https://weibo.com/1618051664/Oe6wIynDn?refer_flag=1001030103_": { 48 | "f1": 0.50997150997151 49 | }, 50 | "https://www.pinterest.com/pin/unique-and-funny-mothers-day-cards-that-will-make-mom-lol--57280226501887680/": { 51 | "f1": 0.5893416927899686 52 | }, 53 | "https://ohnotheydidnt.livejournal.com/128386917.html": { 54 | "f1": 0.9327354260089686 55 | }, 56 | "https://www.kaskus.co.id/thread/6640f533517176a36c034afa/seremmm-banget-pernah-denger-gantung-jodoh-jangan-sampe-deh?ref=homelanding&med=hot_thread&style=thumb": { 57 | "f1": 
0.8936970837253058 58 | }, 59 | "https://www.kaskus.co.id/thread/65f0495ce12575606b0c9640/saya-kumpulkan-komunitas-anak-dan-orang-tua-untuk-jual-beli-mainan-tak-terpakai?ref=postlist-193&med=hot_thread": { 60 | "f1": 0.8894009216589862 61 | }, 62 | "https://www.lucianne.com/2024/05/21/rfk_jr_vows_to_give_black_farmers_5_billion_in_reparations_if_he_wins_the_white_house_and_says_funds_are_not_money_that_is_entitlement_despite_white_counterparts_labeling_it_racist_128894.html": { 63 | "f1": 0.4189526184538653 64 | }, 65 | "https://www.worldaffairsboard.com/forum/political-general-discussion/science-technology/1586800-a-small-victory-dance": { 66 | "f1": 0.7429443173150267 67 | }, 68 | "https://www.worldaffairsboard.com/forum/wab-community-information/wab-information-center/48939-thread-title-correction": { 69 | "f1": 0.5960264900662252 70 | }, 71 | "https://www.scienceforums.net/topic/133914-photon-absorption-and-electron-transition-levels%C2%A0/": { 72 | "f1": 0.40017436791630345 73 | }, 74 | "https://forum.uipath.com/t/uipath-load-testing/739028/3": { 75 | "f1": 0.9431438127090301 76 | }, 77 | "https://forum.uipath.com/t/how-to-run-uipath-studio-and-studiox-on-same-computer/421350": { 78 | "f1": 0.508108108108108 79 | }, 80 | "https://ruby-china.org/topics/43661": { 81 | "f1": 0.957592339261286 82 | }, 83 | "https://community.sunrise.ch/d/36387-abbonamento-in-pausa-upc-in-sunrise-no": { 84 | "f1": 0.989247311827957 85 | }, 86 | "https://forums.debian.net/viewtopic.php?t=159270": { 87 | "f1": 0.9688995215311005 88 | }, 89 | "https://theultimatetone.com/thread-214.html": { 90 | "f1": 0.9800399201596807 91 | }, 92 | "https://theultimatetone.com/thread-268.html": { 93 | "f1": 0.9921146953405018 94 | }, 95 | "https://forums.debian.net/viewtopic.php?t=158836": { 96 | "f1": 0.9521276595744681 97 | }, 98 | "https://what.thedailywtf.com/topic/29130/the-10th-anniversary-of-the-rocky-lobster-incident/5": { 99 | "f1": 0.9662551440329219 100 | }, 101 | 
"https://forum.squarespace.com/topic/296584-how-can-i-have-the-same-image-at-the-top-of-all-my-blog-posts/": { 102 | "f1": 0.8886597938144329 103 | }, 104 | "https://forum.squarespace.com/topic/298933-how-to-hide-specific-product-variant-options-from-product-detail-page/": { 105 | "f1": 0.9578544061302682 106 | }, 107 | "https://www.marshallforum.com/media/for-hire-in-new-zealand-tony-tone-hire-2-x-jcm800-2203-heads-on-a-jcm800-bass-cabinet-1-x-76-super-bass-1-x-2555-half-stack-1-x-jcm800-combo-10.7130/": { 108 | "f1": 0.8524590163934427 109 | }, 110 | "https://csolbbs.tiancity.com/forum.php?mod=viewthread&tid=38672#lastpost": { 111 | "f1": 0.6939890710382515 112 | }, 113 | "https://csolbbs.tiancity.com/forum.php?mod=viewthread&tid=38733&extra=": { 114 | "f1": 0.6919431279620853 115 | }, 116 | "https://english.stackexchange.com/questions/339859/what-do-you-call-a-person-who-is-a-mentor-to-someone-but-he-is-not-aware-of-it": { 117 | "f1": 0.491725768321513 118 | }, 119 | "https://www.marshallforum.com/threads/extending-pilot-light-lifespan-with-a-resistor.138154/": { 120 | "f1": 0.8592000000000001 121 | }, 122 | "https://forums.flaskbb.org/topic/2295-need-some-help-to-install-flaskbb-on-windows": { 123 | "f1": 0.9959514170040485 124 | }, 125 | "https://forums.flaskbb.org/topic/2322-can-t-see-any-routes": { 126 | "f1": 0.9951219512195122 127 | }, 128 | "https://meta.answer.dev/questions/D1JI2/how-to-add-new-api": { 129 | "f1": 0.8795180722891566 130 | }, 131 | "https://meta.answer.dev/questions/D1l2/there-is-also-a-problem-with-the-smtp-mailbox": { 132 | "f1": 0.8990825688073395 133 | }, 134 | "https://patent.bbscloud.com/info/2497?csr=1": { 135 | "f1": 0.793103448275862 136 | }, 137 | "https://mlog.club/article/6208943": { 138 | "f1": 0.9437500000000001 139 | }, 140 | "https://www.paopao.info/#/post?id=1080035365": { 141 | "f1": 0.4635036496350365 142 | }, 143 | "https://mlog.club/topic/1666": { 144 | "f1": 0.888888888888889 145 | }, 146 | 
"https://www.developers.pub/faq/1274706": { 147 | "f1": 0.8597285067873303 148 | }, 149 | "https://bbs.nanshengbbs.top/detail/171": { 150 | "f1": 0.9939462426345952 151 | }, 152 | "https://misago-project.org/t/lost-connection-with-the-application/1289/#post-6344": { 153 | "f1": 0.9732620320855615 154 | }, 155 | "https://learnku.com/articles/86852": { 156 | "f1": 0.9298780487804877 157 | }, 158 | "https://learnku.com/articles/77685": { 159 | "f1": 0.7435897435897435 160 | }, 161 | "https://www.kunena.org/forum/general-questions-and-how-tos/168417-social-login-options-like-facebook": { 162 | "f1": 0.5249457700650759 163 | }, 164 | "https://insightful.demo.talkyard.io/-5/how-to-estimate-how-long-an-unfamiliar-task-will-take": { 165 | "f1": 0.8892892892892893 166 | }, 167 | "https://insightful.demo.talkyard.io/-252/i-have-an-issue": { 168 | "f1": 0.7586206896551725 169 | }, 170 | "https://discuss.flarum.org.cn/d/15448/3": { 171 | "f1": 1 172 | }, 173 | "https://discuss.flarum.org.cn/d/15648": { 174 | "f1": 0.9129886506935686 175 | }, 176 | "https://devdojo.com/question/remove-page-from-query-string": { 177 | "f1": 0.9646569646569646 178 | }, 179 | "https://devdojo.com/question/new-upcoming-elements-1": { 180 | "f1": 0.9446808510638297 181 | }, 182 | "https://www.kunena.org/forum/k-6-3-0-support/168431-forum-statistics-bug-return-value-must-be-of-type-array,-none-returned": { 183 | "f1": 0.5353982300884956 184 | }, 185 | "https://forum.agreper.com/thread/1/": { 186 | "f1": 0.9372384937238493 187 | }, 188 | "https://forum.agreper.com/thread/13/": { 189 | "f1": 0.7764705882352941 190 | }, 191 | "https://forum.nim-lang.org/search?q=tomato": { 192 | "f1": 0.9511400651465798 193 | }, 194 | "https://cycling74.com/forums/gen-patch-spacial-arrangement-different-performance": { 195 | "f1": 0.9860992907801418 196 | }, 197 | "https://cycling74.com/forums/somebody-help-me": { 198 | "f1": 0.9360613810741688 199 | }, 200 | "https://css-tricks.com/forums/topic/last-child-or/": { 201 | 
"f1": 0.9163120567375886 202 | }, 203 | "https://tabletennis.ph/forum/international-events/john-russel-misal-started-strong-at-southeast-asian-regional-olympic-qualification/": { 204 | "f1": 0.8614800759013284 205 | }, 206 | "https://tabletennis.ph/forum/international-events/john-russel-misal-started-strong-at-southeast-asian-regional-olympic-qualification/#post-147": { 207 | "f1": 0.8614800759013284 208 | }, 209 | "https://boards.weddingbee.com/topic/fav-nail-colour-style-for-your-ring/": { 210 | "f1": 0.8904933814681107 211 | }, 212 | "https://tudodetectores.com/community/confira-ultimas-descobertas-dos-detectoristas/nao-se-iludam-com-pepitas-de-ouro-nao-e-facil-encontrar/#post-301": { 213 | "f1": 0.9695652173913044 214 | }, 215 | "https://tudodetectores.com/community/detectores-de-pepitas-de-ouro/qual-e-melhor-nokta-gold-finder-2000-ou-gold-monster-1000/": { 216 | "f1": 0.9549702633814783 217 | }, 218 | "https://demo.sabaidiscuss.com/questions/question/is-there-any-cms-better-than-wordpress-or-should-i-roll-my-own": { 219 | "f1": 0.920461445051609 220 | }, 221 | "http://demo.designwall.com/dwqa/question/a-few-quick-pre-sale-questions/": { 222 | "f1": 0.9686520376175548 223 | }, 224 | "https://forumengine.enginethemes.com/thread/new-packages-directoryengine-and-freelanceengine/": { 225 | "f1": 0.9516380655226209 226 | }, 227 | "https://forumengine.enginethemes.com/thread/announcement-delay-in-support-from-april-28th-to-may-03rd-2/": { 228 | "f1": 0.9329073482428115 229 | }, 230 | "http://demo.designwall.com/dwqa/question/can-i-custom-the-page-template-of-single-question/": { 231 | "f1": 0.8918918918918919 232 | }, 233 | "https://asgaros.com/support/topic/blocke-nur-ab-bestimmter-benutzerrolle-nebeneinander/": { 234 | "f1": 0.6469760900140646 235 | }, 236 | "https://demo.wpdiscussionboard.com/discussion-topics/how-do-i-start-getting-art-commissions/": { 237 | "f1": 0.9903083700440528 238 | }, 239 | 
"https://demo.wpdiscussionboard.com/discussion-topics/kaj-mislite-kupimo-flow/": { 240 | "f1": 0.8387096774193548 241 | }, 242 | "https://www.speakoutwireless.ca/speak/phone-features/need-some-help-setting-up-data-on-android-phone/": { 243 | "f1": 0.8843899840170485 244 | }, 245 | "https://www.speakoutwireless.ca/speak/7-eleven-rates-and-plans/july-august-2023-25-account-bonus-with-100-top-up/": { 246 | "f1": 0.819548872180451 247 | }, 248 | "https://asgaros.com/support/topic/design-colors-and-seo-in-the-asgaros-forum/": { 249 | "f1": 0.6941775014132279 250 | }, 251 | "https://clevious.com/questions/question/when-should-one-switch-from-shared-hosting-to-a-vps/": { 252 | "f1": 0.9857612267250823 253 | }, 254 | "https://forumwpplugin.com/topic/mark-as-read-read-content-etc/": { 255 | "f1": 0.6445497630331753 256 | }, 257 | "https://www.hahn-tech.com/ans/how-to-export-trade-alerts-from-thinkorswim/": { 258 | "f1": 0.8903225806451613 259 | }, 260 | "https://www.hahn-tech.com/tradestation-solutions/": { 261 | "f1": 0.914572864321608 262 | }, 263 | "https://www.quora.com/What-is-the-value-of-sport-culture-participation-in-school": { 264 | "f1": 0.969538729329852 265 | }, 266 | "https://www.quora.com/Do-you-support-pyramid-schemes": { 267 | "f1": 0.2817415730337079 268 | }, 269 | "https://engineering.stackexchange.com/questions/60671/how-to-build-apartment-floors-ceilings-to-not-transfer-sound": { 270 | "f1": 0.8962874251497006 271 | }, 272 | "https://www.codeproject.com/Questions/5382597/How-I-can-integrate-EID-reader-with-tablet": { 273 | "f1": 0.3703703703703703 274 | }, 275 | "https://imgur.com/gallery/c6eFtrZ": { 276 | "f1": 0.7506053268765133 277 | }, 278 | "https://imgur.com/gallery/comet-portugal-HqvSwNh": { 279 | "f1": 0.6415094339622641 280 | }, 281 | "https://www.lucianne.com/2024/05/12/antisemitism_awareness_act_is_the_antidote_to_dei_128386.html": { 282 | "f1": 0.1941747572815534 283 | }, 284 | 
"https://www.scienceforums.net/topic/131993-sum-of-prime-numbers/": { 285 | "f1": 0.7472035794183445 286 | }, 287 | "https://what.thedailywtf.com/topic/11267/about-the-side-bar-wtf-category/2": { 288 | "f1": 0.1834862385321101 289 | }, 290 | "https://puzzling.stackexchange.com/questions/126764/which-duplo-digit-is-6-or-9": { 291 | "f1": 0.6576139670223085 292 | }, 293 | "https://patent.bbscloud.com/info/1454?csr=1": { 294 | "f1": 0.2608695652173913 295 | }, 296 | "https://www.developers.pub/faq/1312719": { 297 | "f1": 0 298 | }, 299 | "https://bbs.nanshengbbs.top/topic/detail/1102": { 300 | "f1": 0.0 301 | }, 302 | "https://css-tricks.com/forums/topic/grid-items-appear-to-float-left/": { 303 | "f1": 0.0819672131147541 304 | }, 305 | "https://magiciansandmagic.com/forums/topic/top-3-mentalists-time/": { 306 | "f1": 0.1864406779661017 307 | }, 308 | "https://magiciansandmagic.com/forums/topic/how-to-use-the-forums/": { 309 | "f1": 0.0878048780487805 310 | } 311 | } -------------------------------------------------------------------------------- /benchmark/eval-requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4 2 | jieba 3 | ltp 4 | numpy 5 | rouge_score 6 | tabulate 7 | trafilatura 8 | readability-lxml 9 | newspaper3k 10 | goose3 11 | justext 12 | gne -------------------------------------------------------------------------------- /benchmark/evaluate_articles.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | logging.basicConfig(level=logging.INFO) 5 | 6 | import json 7 | import jieba 8 | 9 | jieba.setLogLevel(logging.INFO) 10 | from datetime import datetime 11 | from tabulate import tabulate 12 | import numpy as np 13 | 14 | from copy import deepcopy 15 | from bs4 import BeautifulSoup 16 | from ltp import StnSplit 17 | from rouge_score.rouge_scorer import _summary_level_lcs 18 | 19 | stp = StnSplit() 20 | 21 | 22 
| def get_score(target, prediction): 23 | def get_sents(text): 24 | # 分句 25 | sents = stp.split(text) 26 | sents = [x for x in sents if len(x)] 27 | return sents 28 | 29 | target_tokens_list = [ 30 | [x for x in jieba.lcut(s) if x != " "] for s in get_sents(target) 31 | ] 32 | prediction_tokens_list = [ 33 | [x for x in jieba.lcut(s) if x != " "] for s in get_sents(prediction) 34 | ] 35 | 36 | scoress = _summary_level_lcs(target_tokens_list, prediction_tokens_list) 37 | return scoress 38 | 39 | 40 | def rouge_eval(ref, cand): 41 | """ 42 | 计算给定的参考文本和候选文本之间的rouge-L的precision,recall and F1 score 43 | :param ref: str, reference_txt, 即true label 44 | :param cand: str, candidate_text, 即pred label 45 | :return: 列表,元素是字典 46 | """ 47 | t = {"prec": 1, "rec": 1, "f1": 1} 48 | if ref == cand: 49 | return t 50 | score = get_score(ref, cand) 51 | t["prec"] = score.precision 52 | t["rec"] = score.recall 53 | t["f1"] = score.fmeasure 54 | return t 55 | 56 | 57 | def evaluate_result(datas, name=""): 58 | scores = [] 59 | prec = [] 60 | rec = [] 61 | current_scores = {} 62 | down_num = 0 63 | up_dum = 0 64 | for x in datas: 65 | scores.append(rouge_eval(x["content"], x["extract_content"])) 66 | for idx, item in enumerate(scores): 67 | prec.append(item["prec"]) 68 | rec.append(item["rec"]) 69 | if name == "magic_html": 70 | if ori_scores.get(datas[idx]["url"], ""): 71 | if item['f1'] > ori_scores[datas[idx]["url"]]['f1']: 72 | up_dum += 1 73 | # print(datas[idx]["url"], item['f1'], 'up', item['f1'] - ori_scores[datas[idx]["url"]]['f1']) 74 | elif item['f1'] < ori_scores[datas[idx]["url"]]['f1']: 75 | down_num += 1 76 | # print(datas[idx]["url"], item['f1'], 'down', item['f1'] - ori_scores[datas[idx]["url"]]['f1']) 77 | current_scores[datas[idx]["url"]] = { 78 | "f1": item['f1'] 79 | } 80 | if item['f1'] < 0.5: 81 | print(datas[idx]["url"], item['f1']) 82 | if name == "magic_html": 83 | print('magic_html', 'up:', up_dum, 'down:', down_num) 84 | now = datetime.now() 85 | current_time 
= now.strftime("%Y-%m-%d_%H-%M-%S") 86 | with open(f'data/article/scores/{current_time}.json', 'w', encoding='utf-8') as f: 87 | f.write(json.dumps(current_scores, ensure_ascii=False)) 88 | 89 | prec_mean = np.array(prec).mean() 90 | rec_mean = np.array(rec).mean() 91 | f1_mean = 2 * prec_mean * rec_mean / (prec_mean + rec_mean) 92 | global_info["prec_mean"].append(prec_mean) 93 | global_info["rec_mean"].append(rec_mean) 94 | global_info["f1_mean"].append(f1_mean) 95 | 96 | 97 | def get_content_text(html: str) -> str: 98 | soup = BeautifulSoup(html, "lxml") 99 | # 使用get_text()方法抽取所有文本内容,参数"\n"作为不同标签间的分隔符,strip=True去除多余空白 100 | text_content = soup.get_text("\n", strip=True) 101 | return text_content 102 | 103 | 104 | global_datas = [] 105 | 106 | global_info = { 107 | "func": [], 108 | "prec_mean": [], 109 | "rec_mean": [], 110 | "f1_mean": [], 111 | } 112 | 113 | with open("data/article/base.json", "r", encoding="utf-8") as f: 114 | for k, v in json.loads(f.read()).items(): 115 | html_str = "" 116 | with open(f"data/article/htmls/{k}.html", "r", encoding="utf-8") as ff: 117 | html_str = ff.read() 118 | v["html"] = html_str 119 | global_datas.append(v) 120 | 121 | 122 | def run_magic_html(name): 123 | from magic_html import GeneralExtractor 124 | 125 | datas = deepcopy(global_datas) 126 | extractor = GeneralExtractor() 127 | for x in datas: 128 | x["extract_content"] = get_content_text( 129 | extractor.extract(html=x["html"], base_url=x["url"])["html"] 130 | ) 131 | global_info["func"].append(name) 132 | evaluate_result(datas, name=name) 133 | 134 | 135 | def run_trafilatura(name): 136 | from trafilatura import extract 137 | 138 | datas = deepcopy(global_datas) 139 | for x in datas: 140 | x["extract_content"] = extract( 141 | x["html"], include_comments=False, no_fallback=True 142 | ) 143 | global_info["func"].append(name) 144 | evaluate_result(datas) 145 | 146 | 147 | def run_trafilatura_fallback(name): 148 | from trafilatura import extract 149 | 150 | datas = 
deepcopy(global_datas) 151 | for x in datas: 152 | x["extract_content"] = extract( 153 | x["html"], include_comments=False, no_fallback=False 154 | ) 155 | global_info["func"].append(name) 156 | evaluate_result(datas) 157 | 158 | 159 | def run_readability_lxml(name): 160 | from readability import Document 161 | 162 | datas = deepcopy(global_datas) 163 | for x in datas: 164 | x["extract_content"] = get_content_text(Document(x["html"]).summary()) 165 | global_info["func"].append(name) 166 | evaluate_result(datas) 167 | 168 | 169 | def run_newspaper3k(name): 170 | from newspaper import fulltext 171 | 172 | datas = deepcopy(global_datas) 173 | for x in datas: 174 | try: 175 | x["extract_content"] = fulltext(x["html"]) 176 | except: 177 | x["extract_content"] = "" 178 | global_info["func"].append(name) 179 | evaluate_result(datas) 180 | 181 | 182 | def run_goose3(name): 183 | from goose3 import Goose 184 | 185 | g = Goose() 186 | datas = deepcopy(global_datas) 187 | for x in datas: 188 | try: 189 | x["extract_content"] = g.extract(raw_html=x["html"]).cleaned_text 190 | except: 191 | x["extract_content"] = "" 192 | global_info["func"].append(name) 193 | evaluate_result(datas) 194 | 195 | 196 | def run_justext(name): 197 | import justext 198 | 199 | datas = deepcopy(global_datas) 200 | for x in datas: 201 | paragraphs = justext.justext(x["html"], justext.get_stoplist("German"), 50, 200, 0.1, 0.2, 0.2, 200, 202 | True) # stop_words 203 | valid = [ 204 | paragraph.text 205 | for paragraph in paragraphs 206 | if not paragraph.is_boilerplate 207 | ] 208 | 209 | x["extract_content"] = ' '.join(valid) 210 | global_info["func"].append(name) 211 | evaluate_result(datas) 212 | 213 | 214 | def run_gne(name): 215 | from gne import GeneralNewsExtractor 216 | 217 | extractor = GeneralNewsExtractor() 218 | datas = deepcopy(global_datas) 219 | for x in datas: 220 | x["extract_content"] = extractor.extract(x["html"])["content"] 221 | global_info["func"].append(name) 222 | 
evaluate_result(datas) 223 | 224 | 225 | # magic_html每条测试数据分数变化 226 | ori_scores = {} 227 | try: 228 | with open('data/article/scores/ori.json', 'r', encoding='utf-8') as f: 229 | ori_scores = json.loads(f.read()) 230 | except: 231 | pass 232 | 233 | # 自定义需要对比的方法 234 | all_funcs = { 235 | "magic_html": run_magic_html, 236 | "trafilatura": run_trafilatura, 237 | "trafilatura_fallback": run_trafilatura_fallback, 238 | "readability-lxml": run_readability_lxml, 239 | "newspaper3k": run_newspaper3k, 240 | "goose3": run_goose3, 241 | "justext": run_justext, 242 | "gne": run_gne 243 | } 244 | 245 | for k, v in all_funcs.items(): 246 | v(k) 247 | 248 | print("文章类型网页") 249 | print("当前结果") 250 | print(tabulate(global_info, headers="keys", tablefmt="fancy_grid")) 251 | print("基准结果") 252 | print( 253 | """ 254 | ╒══════════════════════╤═════════════╤════════════╤═══════════╕ 255 | │ func │ prec_mean │ rec_mean │ f1_mean │ 256 | ╞══════════════════════╪═════════════╪════════════╪═══════════╡ 257 | │ magic_html │ 0.908865 │ 0.95032 │ 0.92913 │ 258 | ├──────────────────────┼─────────────┼────────────┼───────────┤ 259 | │ trafilatura │ 0.833434 │ 0.912384 │ 0.871124 │ 260 | ├──────────────────────┼─────────────┼────────────┼───────────┤ 261 | │ trafilatura_fallback │ 0.831229 │ 0.933713 │ 0.879496 │ 262 | ├──────────────────────┼─────────────┼────────────┼───────────┤ 263 | │ readability-lxml │ 0.86587 │ 0.861391 │ 0.863625 │ 264 | ├──────────────────────┼─────────────┼────────────┼───────────┤ 265 | │ newspaper3k │ 0.409585 │ 0.372083 │ 0.389935 │ 266 | ├──────────────────────┼─────────────┼────────────┼───────────┤ 267 | │ goose3 │ 0.525717 │ 0.457669 │ 0.489339 │ 268 | ├──────────────────────┼─────────────┼────────────┼───────────┤ 269 | │ justext │ 0.224945 │ 0.117092 │ 0.154014 │ 270 | ├──────────────────────┼─────────────┼────────────┼───────────┤ 271 | │ gne │ 0.828849 │ 0.629112 │ 0.715299 │ 272 | ╘══════════════════════╧═════════════╧════════════╧═══════════╛ 273 | 
""".strip() 274 | ) 275 | -------------------------------------------------------------------------------- /benchmark/evaluate_forums.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | logging.basicConfig(level=logging.INFO) 5 | 6 | import json 7 | import jieba 8 | 9 | jieba.setLogLevel(logging.INFO) 10 | from datetime import datetime 11 | from tabulate import tabulate 12 | import numpy as np 13 | 14 | from copy import deepcopy 15 | from bs4 import BeautifulSoup 16 | from ltp import StnSplit 17 | from rouge_score.rouge_scorer import _summary_level_lcs 18 | 19 | stp = StnSplit() 20 | 21 | 22 | def get_score(target, prediction): 23 | def get_sents(text): 24 | # 分句 25 | sents = stp.split(text) 26 | sents = [x for x in sents if len(x)] 27 | return sents 28 | 29 | target_tokens_list = [ 30 | [x for x in jieba.lcut(s) if x != " "] for s in get_sents(target) 31 | ] 32 | prediction_tokens_list = [ 33 | [x for x in jieba.lcut(s) if x != " "] for s in get_sents(prediction) 34 | ] 35 | 36 | scoress = _summary_level_lcs(target_tokens_list, prediction_tokens_list) 37 | return scoress 38 | 39 | 40 | def rouge_eval(ref, cand): 41 | """ 42 | 计算给定的参考文本和候选文本之间的rouge-L的precision,recall and F1 score 43 | :param ref: str, reference_txt, 即true label 44 | :param cand: str, candidate_text, 即pred label 45 | :return: 列表,元素是字典 46 | """ 47 | t = {"prec": 1, "rec": 1, "f1": 1} 48 | if ref == cand: 49 | return t 50 | score = get_score(ref, cand) 51 | t["prec"] = score.precision 52 | t["rec"] = score.recall 53 | t["f1"] = score.fmeasure 54 | return t 55 | 56 | 57 | def evaluate_result(datas, name=""): 58 | scores = [] 59 | prec = [] 60 | rec = [] 61 | current_scores = {} 62 | down_num = 0 63 | up_dum = 0 64 | for x in datas: 65 | scores.append(rouge_eval(x["content"], x["extract_content"])) 66 | for idx, item in enumerate(scores): 67 | prec.append(item["prec"]) 68 | rec.append(item["rec"]) 69 | if name == 
"magic_html": 70 | if ori_scores.get(datas[idx]["url"], ""): 71 | if item['f1'] > ori_scores[datas[idx]["url"]]['f1']: 72 | up_dum += 1 73 | # print(datas[idx]["url"], item['f1'], 'up', item['f1'] - ori_scores[datas[idx]["url"]]['f1']) 74 | elif item['f1'] < ori_scores[datas[idx]["url"]]['f1']: 75 | down_num += 1 76 | # print(datas[idx]["url"], item['f1'], 'down', item['f1'] - ori_scores[datas[idx]["url"]]['f1']) 77 | current_scores[datas[idx]["url"]] = { 78 | "f1": item['f1'] 79 | } 80 | if item['f1'] < 0.5: 81 | print(datas[idx]["url"], item['f1']) 82 | if name == "magic_html": 83 | print('magic_html', 'up:', up_dum, 'down:', down_num) 84 | now = datetime.now() 85 | current_time = now.strftime("%Y-%m-%d_%H-%M-%S") 86 | with open(f'data/forum/scores/{current_time}.json', 'w', encoding='utf-8') as f: 87 | f.write(json.dumps(current_scores, ensure_ascii=False)) 88 | 89 | prec_mean = np.array(prec).mean() 90 | rec_mean = np.array(rec).mean() 91 | f1_mean = 2 * prec_mean * rec_mean / (prec_mean + rec_mean) 92 | global_info["prec_mean"].append(prec_mean) 93 | global_info["rec_mean"].append(rec_mean) 94 | global_info["f1_mean"].append(f1_mean) 95 | 96 | 97 | def get_content_text(html: str) -> str: 98 | soup = BeautifulSoup(html, "lxml") 99 | # 使用get_text()方法抽取所有文本内容,参数"\n"作为不同标签间的分隔符,strip=True去除多余空白 100 | text_content = soup.get_text("\n", strip=True) 101 | return text_content 102 | 103 | 104 | global_datas = [] 105 | 106 | global_info = { 107 | "func": [], 108 | "prec_mean": [], 109 | "rec_mean": [], 110 | "f1_mean": [], 111 | } 112 | 113 | with open("data/forum/base.json", "r", encoding="utf-8") as f: 114 | for k, v in json.loads(f.read()).items(): 115 | html_str = "" 116 | with open(f"data/forum/htmls/{k}.html", "r", encoding="utf-8") as ff: 117 | html_str = ff.read() 118 | v["html"] = html_str 119 | global_datas.append(v) 120 | 121 | 122 | def run_magic_html(name): 123 | from magic_html import GeneralExtractor 124 | 125 | datas = deepcopy(global_datas) 126 | 
extractor = GeneralExtractor() 127 | for x in datas: 128 | x["extract_content"] = get_content_text( 129 | extractor.extract(html=x["html"], base_url=x["url"], html_type="forum")[ 130 | "html" 131 | ] 132 | ) 133 | global_info["func"].append(name) 134 | evaluate_result(datas, name=name) 135 | 136 | 137 | def run_trafilatura(name): 138 | from trafilatura import extract 139 | 140 | datas = deepcopy(global_datas) 141 | for x in datas: 142 | x["extract_content"] = extract( 143 | x["html"], include_comments=True, no_fallback=True 144 | ) 145 | global_info["func"].append(name) 146 | evaluate_result(datas) 147 | 148 | 149 | def run_trafilatura_fallback(name): 150 | from trafilatura import extract 151 | 152 | datas = deepcopy(global_datas) 153 | for x in datas: 154 | x["extract_content"] = extract( 155 | x["html"], include_comments=True, no_fallback=False 156 | ) 157 | global_info["func"].append(name) 158 | evaluate_result(datas) 159 | 160 | 161 | def run_readability_lxml(name): 162 | from readability import Document 163 | 164 | datas = deepcopy(global_datas) 165 | for x in datas: 166 | x["extract_content"] = get_content_text(Document(x["html"]).summary()) 167 | global_info["func"].append(name) 168 | evaluate_result(datas) 169 | 170 | 171 | def run_newspaper3k(name): 172 | from newspaper import fulltext 173 | 174 | datas = deepcopy(global_datas) 175 | for x in datas: 176 | try: 177 | x["extract_content"] = fulltext(x["html"]) 178 | except: 179 | x["extract_content"] = "" 180 | global_info["func"].append(name) 181 | evaluate_result(datas) 182 | 183 | 184 | def run_goose3(name): 185 | from goose3 import Goose 186 | 187 | g = Goose() 188 | datas = deepcopy(global_datas) 189 | for x in datas: 190 | try: 191 | x["extract_content"] = g.extract(raw_html=x["html"]).cleaned_text 192 | except: 193 | x["extract_content"] = "" 194 | global_info["func"].append(name) 195 | evaluate_result(datas) 196 | 197 | 198 | def run_justext(name): 199 | import justext 200 | 201 | datas = 
deepcopy(global_datas) 202 | for x in datas: 203 | paragraphs = justext.justext(x["html"], justext.get_stoplist("German"), 50, 200, 0.1, 0.2, 0.2, 200, 204 | True) # stop_words 205 | valid = [ 206 | paragraph.text 207 | for paragraph in paragraphs 208 | if not paragraph.is_boilerplate 209 | ] 210 | 211 | x["extract_content"] = ' '.join(valid) 212 | global_info["func"].append(name) 213 | evaluate_result(datas) 214 | 215 | 216 | def run_gne(name): 217 | from gne import GeneralNewsExtractor 218 | 219 | extractor = GeneralNewsExtractor() 220 | datas = deepcopy(global_datas) 221 | for x in datas: 222 | x["extract_content"] = extractor.extract(x["html"])["content"] 223 | global_info["func"].append(name) 224 | evaluate_result(datas) 225 | 226 | 227 | # magic_html每条测试数据分数变化 228 | ori_scores = {} 229 | try: 230 | with open('data/forum/scores/ori.json', 'r', encoding='utf-8') as f: 231 | ori_scores = json.loads(f.read()) 232 | except: 233 | pass 234 | 235 | # 自定义需要对比的方法 236 | all_funcs = { 237 | "magic_html": run_magic_html, 238 | "trafilatura": run_trafilatura, 239 | "trafilatura_fallback": run_trafilatura_fallback, 240 | "readability-lxml": run_readability_lxml, 241 | "newspaper3k": run_newspaper3k, 242 | "goose3": run_goose3, 243 | "justext": run_justext, 244 | "gne": run_gne 245 | } 246 | 247 | for k, v in all_funcs.items(): 248 | v(k) 249 | 250 | print("论坛类型网页") 251 | print("当前结果") 252 | print(tabulate(global_info, headers="keys", tablefmt="fancy_grid")) 253 | print("基准结果") 254 | print(''' 255 | ╒══════════════════════╤═════════════╤════════════╤═══════════╕ 256 | │ func │ prec_mean │ rec_mean │ f1_mean │ 257 | ╞══════════════════════╪═════════════╪════════════╪═══════════╡ 258 | │ magic_html │ 0.796252 │ 0.826819 │ 0.811248 │ 259 | ├──────────────────────┼─────────────┼────────────┼───────────┤ 260 | │ trafilatura │ 0.716009 │ 0.695947 │ 0.705835 │ 261 | ├──────────────────────┼─────────────┼────────────┼───────────┤ 262 | │ trafilatura_fallback │ 0.730304 │ 0.691328 │ 
0.710282 │ 263 | ├──────────────────────┼─────────────┼────────────┼───────────┤ 264 | │ readability-lxml │ 0.788018 │ 0.445087 │ 0.568867 │ 265 | ├──────────────────────┼─────────────┼────────────┼───────────┤ 266 | │ newspaper3k │ 0.596976 │ 0.298322 │ 0.397837 │ 267 | ├──────────────────────┼─────────────┼────────────┼───────────┤ 268 | │ goose3 │ 0.675835 │ 0.312969 │ 0.427821 │ 269 | ├──────────────────────┼─────────────┼────────────┼───────────┤ 270 | │ justext │ 0.175889 │ 0.0517628 │ 0.0799863 │ 271 | ├──────────────────────┼─────────────┼────────────┼───────────┤ 272 | │ gne │ 0.81003 │ 0.389709 │ 0.526241 │ 273 | ╘══════════════════════╧═════════════╧════════════╧═══════════╛ 274 | '''.strip()) 275 | -------------------------------------------------------------------------------- /magic_html/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | from urllib.parse import urlparse 4 | from magic_html.extractors.article_extractor import ArticleExtractor 5 | from magic_html.extractors.weixin_extractor import WeixinExtractor 6 | from magic_html.extractors.forum_extractor import ForumExtractor 7 | from magic_html.extractors.custom_extractor import CustomExtractor 8 | 9 | 10 | class GeneralExtractor: 11 | def __init__(self, config_path=""): 12 | if config_path: 13 | """ 14 | demo rule config file json: 15 | { 16 | "www.***.com": { 17 | "clean": ["//script", "//style"], 18 | "title": { 19 | "mode": "xpath", 20 | "value": "//div[@class='media-body']/h4/text()" 21 | }, 22 | "content": { 23 | "mode": "xpath", 24 | "value": "//div[@class='message break-all']" 25 | } 26 | } 27 | } 28 | """ 29 | try: 30 | with open(config_path, 'r', encoding='utf-8') as f: 31 | self.rule = json.loads(f.read()) 32 | except: 33 | pass 34 | else: 35 | self.rule = {} 36 | 37 | def extract(self, html="", **kwargs) -> dict: 38 | base_url = kwargs.get("base_url", "") 39 | html_type = kwargs.pop("html_type", 
None) 40 | if html_type: 41 | if html_type == "forum": 42 | return ForumExtractor().extract(html=html, **kwargs) 43 | elif html_type == "weixin": 44 | return WeixinExtractor().extract(html=html, **kwargs) 45 | if base_url: 46 | netloc = urlparse(base_url).netloc 47 | if netloc in self.rule: 48 | try: 49 | new_kwargs = dict() 50 | new_kwargs["rule"] = self.rule[netloc] 51 | new_kwargs.update(kwargs) 52 | return CustomExtractor().extract(html=html, **new_kwargs) 53 | except: 54 | # 当自定义规则不能覆盖站点所有板块时,使用 55 | return ArticleExtractor().extract(html=html, **kwargs) 56 | if netloc == "mp.weixin.qq.com": 57 | return WeixinExtractor().extract(html=html, **kwargs) 58 | return ArticleExtractor().extract(html=html, **kwargs) 59 | -------------------------------------------------------------------------------- /magic_html/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | Unique_ID = "all_ids_pjtest_20300101_921b9a" 4 | 5 | PAYWALL_DISCARD_XPATH = [ 6 | """.//*[(self::div or self::p)][ 7 | contains(@id, "paywall") or contains(@id, "premium") or 8 | contains(@class, "paid-content") or contains(@class, "paidcontent") or 9 | contains(@class, "obfuscated") or contains(@class, "blurred") or 10 | contains(@class, "restricted") or contains(@class, "overlay") 11 | ]""", 12 | ] 13 | 14 | OVERALL_DISCARD_XPATH = [ 15 | # navigation + footers, news outlets related posts, sharing, jp-post-flair jp-relatedposts 16 | """.//*[(self::div or self::item or self::ul 17 | or self::p or self::section or self::span)][ 18 | contains(translate(@id, "F","f"), "footer") or contains(translate(@class, "F","f"), "footer") 19 | or contains(@id, "related") or contains(translate(@class, "R", "r"), "related") or 20 | contains(@id, "viral") or contains(@class, "viral") or 21 | starts-with(@id, "shar") or starts-with(@class, "shar") or 22 | contains(@class, "share-") or 23 | contains(translate(@id, "S", "s"), "share") or 24 | 
contains(@id, "social") or contains(@class, "social") or contains(@class, "sociable") or 25 | contains(@id, "syndication") or contains(@class, "syndication") or 26 | starts-with(@id, "jp-") or starts-with(@id, "dpsp-content") or 27 | contains(@class, "embedded") or contains(@class, "embed") 28 | or contains(@id, "newsletter") or contains(@class, "newsletter") 29 | or contains(@class, "subnav") or 30 | contains(@id, "cookie") or contains(@class, "cookie") or contains(@id, "tags") 31 | or contains(@class, "tags") or contains(@id, "sidebar") or 32 | contains(@class, "sidebar") or contains(@id, "banner") or contains(@class, "banner") 33 | or contains(@class, "meta") or 34 | contains(@id, "menu") or contains(@class, "menu") or 35 | contains(translate(@id, "N", "n"), "nav") or contains(translate(@role, "N", "n"), "nav") 36 | or starts-with(@class, "nav") or contains(translate(@class, "N", "n"), "navigation") or 37 | contains(@class, "navbar") or contains(@class, "navbox") or starts-with(@class, "post-nav") 38 | or contains(@id, "breadcrumb") or contains(@class, "breadcrumb") or 39 | contains(@id, "bread-crumb") or contains(@class, "bread-crumb") or 40 | contains(@id, "author") or contains(@class, "author") or 41 | contains(@id, "button") or contains(@class, "button") 42 | or contains(translate(@class, "B", "b"), "byline") 43 | or contains(@class, "rating") or starts-with(@class, "widget") or 44 | contains(@class, "attachment") or contains(@class, "timestamp") or 45 | contains(@class, "user-info") or contains(@class, "user-profile") or 46 | contains(@class, "-ad-") or contains(@class, "-icon") 47 | or contains(@class, "article-infos") or 48 | contains(translate(@class, "I", "i"), "infoline") 49 | or contains(@data-component, "MostPopularStories") 50 | or contains(@class, "outbrain") or contains(@class, "taboola") 51 | or contains(@class, "criteo") or contains(@class, "options") 52 | or contains(@class, "consent") or contains(@class, "modal-content") 53 | or 
contains(@class, "paid-content") or contains(@class, "paidcontent") 54 | or contains(@id, "premium-") or contains(@id, "paywall") 55 | or contains(@class, "obfuscated") or contains(@class, "blurred") 56 | or contains(@class, " ad ") 57 | or contains(@class, "next-post") 58 | or contains(@class, "yin") or contains(@class, "zlylin") or 59 | contains(@class, "xg1") or contains(@id, "bmdh") 60 | or @data-lp-replacement-content]""", 61 | # hidden parts 62 | """.//*[starts-with(@class, "hide-") or contains(@class, "hide-print") or contains(@id, "hidden") 63 | or contains(@style, "hidden") or contains(@hidden, "hidden") or contains(@class, "noprint") 64 | or contains(@style, "display:none") or contains(@class, " hidden") or @aria-hidden="true" 65 | or contains(@class, "notloaded")]""", 66 | # comment debris 67 | # or contains(@class, "message-container") or contains(@id, "message_container") 68 | """.//*[@class="comments-title" or contains(@class, "comments-title") or 69 | contains(@class, "nocomments") or starts-with(@id, "reply-") or starts-with(@class, "reply-") or 70 | contains(@class, "-reply-") or contains(@class, "message") or contains(@id, "message_container") 71 | or contains(@id, "akismet") or contains(@class, "akismet")] """, 72 | ] 73 | 74 | TEASER_DISCARD_XPATH = [ 75 | """.//*[(self::div or self::item or self::ul 76 | or self::p or self::section or self::span)][ 77 | contains(translate(@id, "T", "t"), "teaser") or contains(translate(@class, "T", "t"), "teaser") 78 | ]""", 79 | ] 80 | 81 | PRECISION_DISCARD_XPATH = [ 82 | ".//header", 83 | """.//*[(self::div or self::item or self::ul 84 | or self::p or self::section or self::span)][ 85 | contains(@id, "bottom") or contains(@class, "bottom") or 86 | contains(@id, "link") or contains(@class, "link") 87 | or contains(@style, "border") 88 | ]""", 89 | ] 90 | 91 | DISCARD_IMAGE_ELEMENTS = [ 92 | """.//*[(self::div or self::item or self::ul 93 | or self::p or self::section or self::span)][ 94 | contains(@id, 
"caption") or contains(@class, "caption") 95 | ] 96 | """ 97 | ] 98 | 99 | REMOVE_COMMENTS_XPATH = [ 100 | """.//*[(self::div or self::ul or self::section)][ 101 | starts-with(translate(@id, "C","c"), 'comment') or 102 | starts-with(translate(@class, "C","c"), 'comment') or starts-with(translate(@name, "C","c"), 'comment') or 103 | contains(@class, 'article-comments') or contains(@class, 'post-comments') 104 | or starts-with(@id, 'comol') or starts-with(@id, 'disqus_thread') 105 | or starts-with(@id, 'dsq-comments') 106 | ]""" 107 | ] 108 | 109 | CONTENT_EXTRACTOR_NOISE_XPATHS = [ 110 | # '//div[contains(@class, "comment") or contains(@name, "comment") or contains(@id, "comment")]', 111 | '//div[starts-with(@class, "advert") or starts-with(@name, "advert") or starts-with(@id, "advert")]', 112 | '//div[contains(@style, "display: none")]', 113 | '//div[contains(@style, "display:none")]', 114 | ] 115 | 116 | # 保留图片,音频,视频 117 | MANUALLY_CLEANED = [ 118 | "aside", 119 | "embed", 120 | "footer", 121 | "head", 122 | "iframe", 123 | "menu", 124 | "object", 125 | "script", 126 | "applet", 127 | "canvas", 128 | "map", 129 | "svg", 130 | "area", 131 | "blink", 132 | "button", 133 | "datalist", 134 | "dialog", 135 | "frame", 136 | "frameset", 137 | "fieldset", 138 | "link", 139 | "input", 140 | "ins", 141 | "label", 142 | "legend", 143 | "marquee", 144 | "menuitem", 145 | "nav", 146 | "noscript", 147 | "optgroup", 148 | "option", 149 | "output", 150 | "param", 151 | "progress", 152 | "rp", 153 | "rt", 154 | "rtc", 155 | "select", 156 | "style", 157 | "track", 158 | "textarea", 159 | "time", 160 | "use", 161 | ] 162 | 163 | MANUALLY_STRIPPED = [ 164 | "abbr", 165 | "acronym", 166 | "address", 167 | "bdi", 168 | "bdo", 169 | "big", 170 | "cite", 171 | "data", 172 | "dfn", 173 | "font", 174 | "hgroup", 175 | "ins", 176 | "mark", 177 | "meta", 178 | "ruby", 179 | "small", 180 | "tbody", 181 | "template", 182 | "tfoot", 183 | "thead", 184 | ] 185 | 186 | CUT_EMPTY_ELEMS = { 187 | 
"article", 188 | "b", 189 | "blockquote", 190 | "dd", 191 | "div", 192 | "dt", 193 | "em", 194 | "h1", 195 | "h2", 196 | "h3", 197 | "h4", 198 | "h5", 199 | "h6", 200 | "i", 201 | "li", 202 | "main", 203 | "p", 204 | "pre", 205 | "q", 206 | "section", 207 | "span", 208 | "strong", 209 | } 210 | 211 | USELESS_ATTR = [ 212 | "share", 213 | "contribution", 214 | "copyright", 215 | "copy-right", 216 | "disclaimer", 217 | "recommend", 218 | "related", 219 | "footer", 220 | "social", 221 | "submeta", 222 | "report-infor", 223 | ] 224 | 225 | BODY_XPATH = [ 226 | """.//*[(self::article or self::div or self::main or self::section)][ 227 | @class="post" or @class="entry" or 228 | contains(@class, "post-text") or contains(@class, "post_text") or 229 | contains(@class, "post-body") or contains(@class, "post-entry") or contains(@class, "postentry") or 230 | contains(@class, "post-content") or contains(@class, "post_content") or 231 | contains(@class, "postcontent") or contains(@class, "postContent") or 232 | contains(@class, "article-text") or contains(@class, "articletext") or contains(@class, "articleText") 233 | or contains(@id, "entry-content") or 234 | contains(@class, "entry-content") or contains(@id, "article-content") or 235 | contains(@class, "article-content") or contains(@id, "article__content") or 236 | contains(@class, "article__content") or contains(@id, "article-body") or 237 | contains(@class, "article-body") or contains(@id, "article__body") or 238 | contains(@class, "article__body") or @itemprop="articleBody" or 239 | contains(translate(@id, "B", "b"), "articlebody") or contains(translate(@class, "B", "b"), "articlebody") 240 | or @id="articleContent" or contains(@class, "ArticleContent") or 241 | contains(@class, "page-content") or contains(@class, "text-content") or 242 | contains(@id, "body-text") or contains(@class, "body-text") or contains(@class, "body-content") or contains(translate(@class, "B", "b"), "textbody") or 243 | contains(@class, 
"article__container") or contains(@id, "art-content") or contains(@class, "art-content")][1]""", 244 | "(.//article)[1]", 245 | """(.//*[(self::article or self::div or self::main or self::section)][ 246 | contains(@class, 'post-bodycopy') or 247 | contains(@class, 'storycontent') or contains(@class, 'story-content') or 248 | @class='postarea' or @class='art-postcontent' or 249 | contains(@class, 'theme-content') or contains(@class, 'blog-content') or 250 | contains(@class, 'section-content') or contains(@class, 'single-content') or 251 | contains(@class, 'single-post') or 252 | contains(@class, 'main-column') or contains(@class, 'wpb_text_column') or 253 | starts-with(@id, 'primary') or starts-with(@class, 'article ') or @class="text" or 254 | @id="article" or @class="cell" or @id="story" or @class="story" or 255 | contains(@class, "story-body") or contains(@class, "field-body") or 256 | contains(translate(@class, "FULTEX","fultex"), "fulltext") 257 | or @role='article'])[1]""", 258 | """(.//*[(self::article or self::div or self::main or self::section)][ 259 | contains(@id, "content-main") or contains(@class, "content-main") or contains(@class, "content_main") or 260 | contains(@id, "content-body") or contains(@class, "content-body") or contains(@id, "contentBody") 261 | or contains(@class, "content__body") or contains(translate(@id, "CM","cm"), "main-content") or contains(translate(@class, "CM","cm"), "main-content") 262 | or contains(translate(@class, "CP","cp"), "page-content") or 263 | @id="content" or @class="content"])[1]""", 264 | '(.//*[(self::article or self::div or self::section)][starts-with(@class, "main") or starts-with(@id, "main") or starts-with(@role, "main")])[1]|(.//main)[1]', 265 | ] 266 | 267 | Forum_XPATH = [ 268 | """.//*[(self::article or self::div or self::main or self::section or self::li or self::tr)][ 269 | contains(@id, 'question') or contains(@class, 'question')]""", 270 | """.//*[(self::article or self::div or self::main or 
self::section or self::li or self::tr)][ 271 | contains(@id, 'answer') or contains(@class, 'answer')]""", 272 | """.//*[(self::article or self::div or self::main or self::section or self::li or self::tr)][ 273 | contains(@id, 'comment') or contains(@class, 'comment') or contains(@class, 'Comment')]""", 274 | """.//*[(self::article or self::div or self::main or self::section or self::li or self::tr)][contains(@class, "message-container") or contains(@id, "message_container") or contains(@class, "Messages_container")]""", 275 | """.//*[(self::article or self::div or self::main or self::section or self::p or self::span or self::li or self::tr)][ 276 | contains(@id, 'comment-content') or contains(@class, 'comment-content') or contains(@class, 'comment-body') or contains(@class, 'comment-body') or contains(@class, "post-reply") or contains(@class, "reply_content") or contains(@class, "reply-content") or contains(@class, "reply_post") or contains(@class, "post-reply") or contains(@id, "reply") or contains(@class, "post-text") or contains(@class, "post_text") or 277 | contains(@class, "post-body") or contains(@class, "postbody") or contains(@class, "post-entry") or contains(@class, "postentry") or contains(@component, 'post') or 278 | contains(@class, "post-content") or contains(@class, "post_content") or contains(@class, "p_content") or contains(@class, "Post_content") or contains(@class, "message-post") or contains(@class, "js-post")]""", 279 | # id 包含post-加数字组成的形式 280 | """.//*[(self::article or self::div or self::main or self::section or self::p or self::span or self::li or self::tr)][contains(@id, 'post-') or contains(@id, 'post_')]""" 281 | ] 282 | 283 | METAS = [ 284 | '//meta[starts-with(@property, "og:title")]/@content', 285 | '//meta[starts-with(@name, "og:title")]/@content', 286 | '//meta[starts-with(@property, "title")]/@content', 287 | '//meta[starts-with(@name, "title")]/@content', 288 | '//meta[starts-with(@property, "page:title")]/@content', 289 | 
class ArticleExtractor(BaseExtractor):
    """Extractor for generic article pages.

    All heavy lifting (tag conversion, cleaning, body location, pruning and
    serialisation) is delegated to helpers inherited from ``BaseExtractor``;
    this class only fixes the order in which they are applied.
    """

    def __init__(self) -> None:
        super().__init__()

    def extract(self, html="", base_url="") -> dict:
        """Extract the title and main content HTML of an article page.

        Args:
            html: Raw HTML source of the page.
            base_url: URL of the page. It is replaced by the document's own
                ``<base href>`` when that value contains ``"http"``.

        Returns:
            A dict with the keys ``xp_num``, ``drop_list``, ``html``,
            ``title`` and ``base_url``.

        Raises:
            ValueError: If ``load_html`` cannot build a tree from ``html``.
        """
        # Normalise every spelling of a non-breaking space to a plain space.
        # The targets are written as entities/escapes on purpose: a literal
        # U+00A0 is indistinguishable from " " when reading the source.
        html = (
            html.replace("&nbsp;", " ")
            .replace("&#160;", " ")
            .replace("\u00a0", " ")
        )
        tree = load_html(html)
        if tree is None:
            raise ValueError("load_html() could not parse the given HTML")

        title = TitleExtractor().process(tree)

        # Prefer the base URL declared by the document itself.
        base_href = tree.xpath("//base/@href")
        if base_href and "http" in base_href[0]:
            base_url = base_href[0]

        # CSDN blog pages: drop the "pre-numbering" lists found inside the
        # content container before any further processing.
        if "://blog.csdn.net/" in base_url:
            for dtree in tree.xpath(
                '//div[@id="content_views"]//ul[@class="pre-numbering"]'
            ):
                self.remove_node(dtree)

        # Tag conversion, including handling of math tags.
        format_tree = self.convert_tags(tree, base_url=base_url)

        # Remove script/style and similar tags together with their content.
        normal_tree = self.clean_tags(format_tree)

        subtree, xp_num, drop_list = self.xp_1_5(normal_tree)
        if xp_num == "others":
            # xp_1_5 reported "others": fall back to pruning the cleaned tree.
            subtree, drop_list = self.prune_unwanted_sections(normal_tree)
        body_html = self.get_content_html(subtree, xp_num, base_url)

        return {
            "xp_num": xp_num,
            "drop_list": drop_list,
            "html": body_html,
            "title": title,
            "base_url": base_url,
        }
class CustomExtractor(BaseExtractor):
    """Extractor driven by caller-supplied XPath rules.

    A rule set is a dict with an optional ``"clean"`` list of XPath
    expressions, an optional ``"title"`` rule and a required ``"content"``
    rule. Title and content rules are dicts whose ``"value"`` entry is an
    XPath expression.
    """

    def __init__(self) -> None:
        super().__init__()

    def use_clean_rule(self, tree, clean_rules):
        """Remove every node matched by any XPath in ``clean_rules``.

        Returns:
            The same ``tree`` object, after the removals.
        """
        for clean_rule in clean_rules:
            for node in tree.xpath(clean_rule):
                self.remove_node(node)
        return tree

    def use_extract_rule(self, tree, extract_rule):
        """Evaluate ``extract_rule["value"]`` against ``tree``.

        Returns:
            For expressions containing ``/text()``, the matched text pieces
            joined and stripped; otherwise the first matched result.

        Raises:
            IndexError: If a non-text expression matches nothing.
        """
        expression = extract_rule["value"]
        matches = tree.xpath(expression)
        if "/text()" in expression:
            return "".join(matches).strip()
        return matches[0]

    def extract(self, html="", base_url="", rule=None) -> dict:
        """Extract title and content HTML according to ``rule``.

        Args:
            html: Raw HTML source of the page.
            base_url: URL of the page. It is replaced by the document's own
                ``<base href>`` when that value contains ``"http"``.
            rule: Rule set as described on the class. ``None`` is treated as
                an empty rule set.

        Returns:
            A dict with the keys ``xp_num`` (always ``"custom"``),
            ``drop_list`` (always ``False``), ``html``, ``title`` and
            ``base_url``.

        Raises:
            ValueError: If the HTML cannot be parsed, or if the content rule
                is missing or cannot be applied.
        """
        # Avoid a shared mutable default argument.
        if rule is None:
            rule = {}

        tree = load_html(html)
        if tree is None:
            raise ValueError("load_html() could not parse the given HTML")

        # Prefer the base URL declared by the document itself.
        base_href = tree.xpath("//base/@href")
        if base_href and "http" in base_href[0]:
            base_url = base_href[0]

        if "clean" in rule:
            tree = self.use_clean_rule(tree, rule["clean"])

        # Title: explicit rule if given, generic title extraction otherwise.
        if "title" in rule:
            title = self.use_extract_rule(tree, rule["title"])
        else:
            title = TitleExtractor().process(tree)

        # Article area. Any failure here (missing rule, no match, invalid
        # XPath) is reported as ValueError, with the cause chained.
        try:
            body_tree = self.use_extract_rule(tree, rule["content"])
        except Exception as err:
            raise ValueError("the content rule could not be applied") from err
        # NOTE(review): a content rule containing "/text()" yields a str
        # here, which tostring() presumably rejects - confirm intended usage.
        body_html = tostring(body_tree, encoding=str)

        return {
            "xp_num": "custom",
            "drop_list": False,
            "html": body_html,
            "title": title,
            "base_url": base_url,
        }
class ForumExtractor(BaseExtractor):
    """Extractor for forum / Q&A pages.

    The main content is located the same way as for articles; afterwards
    nodes matching ``Forum_XPATH`` (questions, answers, comments, replies,
    posts) are moved from the rest of the page into the result.
    """

    def __init__(self) -> None:
        super().__init__()

    def extract(self, html="", base_url="") -> dict:
        """Extract the title, main content and discussion nodes of a page.

        Args:
            html: Raw HTML source of the page.
            base_url: URL of the page. It is replaced by the document's own
                ``<base href>`` when that value contains ``"http"``.

        Returns:
            A dict with the keys ``xp_num``, ``drop_list``, ``html``,
            ``title`` and ``base_url``. ``html`` has every ``Unique_ID``
            attribute stripped again.

        Raises:
            ValueError: If ``load_html`` cannot build a tree from ``html``.
        """
        self.need_comment = True
        # Normalise every spelling of a non-breaking space to a plain space.
        # The targets are written as entities/escapes on purpose: a literal
        # U+00A0 is indistinguishable from " " when reading the source.
        html = (
            html.replace("&nbsp;", " ")
            .replace("&#160;", " ")
            .replace("\u00a0", " ")
        )
        tree = load_html(html)
        if tree is None:
            raise ValueError("load_html() could not parse the given HTML")

        # Title.
        title = TitleExtractor().process(tree)

        # Prefer the base URL declared by the document itself.
        base_href = tree.xpath("//base/@href")
        if base_href and "http" in base_href[0]:
            base_url = base_href[0]

        # Tag the nodes with Unique_ID attributes; the id comparisons further
        # down rely on them.
        self.generate_unique_id(tree)

        format_tree = self.convert_tags(tree, base_url=base_url)
        normal_tree = self.clean_tags(format_tree)

        subtree, xp_num, drop_list = self.xp_1_5(normal_tree)
        if xp_num == "others":
            subtree, drop_list = self.prune_unwanted_sections(normal_tree)
        body_html = self.get_content_html(subtree, xp_num, base_url)

        # ---- forum-specific post-processing ----
        body_html_tree = fromstring(body_html)
        try:
            body_tree = body_html_tree.body
        except Exception:
            # No usable <body>: wrap the parsed children in a fresh one.
            body_tree = Element("body")
            body_tree.extend(body_html_tree)
        main_ids = body_tree.xpath(f".//@{Unique_ID}")

        # Nodes already present in the main content are removed from the
        # cleaned tree so they are not collected a second time below.
        for main_id in main_ids:
            main_tree = normal_tree.xpath(f".//*[@{Unique_ID}={main_id}]")
            if main_tree:
                self.remove_node(main_tree[0])
        if not main_ids:
            main_ids = [-1]

        if xp_num != "others":
            normal_tree, _ = self.prune_unwanted_sections(normal_tree)

        for c_xpath in Forum_XPATH:
            while True:
                matches = normal_tree.xpath(c_xpath)
                if not matches:
                    break
                x = matches[0]
                # Detach first so the same node is never matched again.
                self.remove_node(x)

                node_id = x.attrib.get("id", "").lower()
                # The generic 'post-' / 'post_' rule only counts when the id
                # is "post-" or "post_" followed by digits.
                if "'post-'" in c_xpath and not re.search(
                    r"post[-_]\d+", node_id
                ):
                    continue
                if (
                    "header" in x.attrib.get("class", "").lower()
                    or "header" in node_id
                ):
                    continue

                # Best effort: a node that cannot be placed is dropped.
                try:
                    node_uid = int(x.attrib.get(Unique_ID, "0"))
                    last_main_id = int(main_ids[-1])
                    if node_uid > last_main_id:
                        # Id is greater than the last main-content id:
                        # append the node after the main content.
                        body_tree.append(x)
                    else:
                        # Otherwise move its descendants individually:
                        # greater ids go to a trailing <div>, smaller ids
                        # to a leading <div>.
                        after_xpath = (
                            f".//*[number(@{Unique_ID}) > {last_main_id}]"
                        )
                        before_xpath = (
                            f".//*[number(@{Unique_ID}) < {last_main_id}]"
                        )
                        prefix_div = Element("div")
                        suffix_div = Element("div")
                        need_prefix = False
                        need_suffix = False

                        found = x.xpath(after_xpath)
                        while found:
                            tmp_x = found[0]
                            self.remove_node(tmp_x)
                            suffix_div.append(tmp_x)
                            need_suffix = True
                            found = x.xpath(after_xpath)

                        found = x.xpath(before_xpath)
                        while found:
                            tmp_x = found[0]
                            self.remove_node(tmp_x)
                            prefix_div.append(tmp_x)
                            need_prefix = True
                            found = x.xpath(before_xpath)

                        if need_prefix:
                            body_tree.insert(0, prefix_div)
                        if need_suffix:
                            body_tree.append(suffix_div)
                except Exception:
                    pass

        # Strip the helper id attributes from the serialised result.
        body_html = re.sub(
            rf' {Unique_ID}="\d+"',
            "",
            tostring(body_tree, encoding=str),
        )

        return {
            "xp_num": xp_num,
            "drop_list": drop_list,
            "html": body_html,
            "title": title,
            "base_url": base_url,
        }
class WeixinExtractor(BaseExtractor):
    """Extractor for WeChat (Weixin) official-account article pages."""

    def __init__(self) -> None:
        super().__init__()

    def extract(self, html="", base_url="") -> dict:
        """Extract the title and cleaned article HTML of a WeChat page.

        Args:
            html: Raw HTML source of the page.
            base_url: URL of the page. It is replaced by the document's own
                ``<base href>`` when that value contains ``"http"``.

        Returns:
            A dict with the keys ``xp_num`` (always ``"weixin"``),
            ``drop_list`` (always ``False``), ``html``, ``title`` and
            ``base_url``.

        Raises:
            ValueError: If the HTML cannot be parsed, or if neither the
                ``img-content`` nor the ``js_content`` element exists.
        """
        # Normalise every spelling of a non-breaking space to a plain space.
        # The targets are written as entities/escapes on purpose: a literal
        # U+00A0 is indistinguishable from " " when reading the source.
        html = (
            html.replace("&nbsp;", " ")
            .replace("&#160;", " ")
            .replace("\u00a0", " ")
        )
        tree = load_html(html)
        if tree is None:
            raise ValueError("load_html() could not parse the given HTML")

        # Title.
        title = TitleExtractor().process(tree)

        # Prefer the base URL declared by the document itself.
        base_href = tree.xpath("//base/@href")
        if base_href and "http" in base_href[0]:
            base_url = base_href[0]

        # Article area: "img-content" first, "js_content" as fallback.
        body_tree_match = tree.xpath('.//*[@id="img-content"]') or tree.xpath(
            './/*[@id="js_content"]'
        )
        if not body_tree_match:
            raise ValueError(
                'no element with id "img-content" or "js_content" found'
            )
        body_tree = body_tree_match[0]

        # Applied in order; each expression is evaluated only after the
        # previous removals have happened.
        removal_xpaths = (
            # Scripts, styles and comments.
            ".//script",
            ".//style",
            ".//comment()",
            # Official-account introduction blocks.
            './/div[@id="meta_content"]',
            './/div[@id="js_tags"]',
            './/div[@class="original_area_primary"]',
            # Hidden profile card.
            './/section[@class="wx_profile_card_inner"]',
            # Special WeChat profile message card.
            ".//section[contains(@class, 'wx_profile_msg_inner')]",
        )
        for removal_xpath in removal_xpaths:
            for node in body_tree.xpath(removal_xpath):
                self.remove_node(node)

        # Clutter: elements styled with a fully transparent white colour are
        # removed when ensure_have_color_rgb confirms the style.
        transparent_nodes = body_tree.xpath(
            ".//*[contains(@style, 'color: rgba(255, 255, 255, 0)')] | .//*[contains(@style, 'color: rgba(255 255 255 0)')]"
        )
        for node in transparent_nodes:
            has_color_rgb, _ = self.ensure_have_color_rgb(node.attrib["style"])
            if has_color_rgb:
                self.remove_node(node)

        # Copy "data-src" into "src" wherever it is present.
        for img in body_tree.xpath(".//img"):
            if "data-src" not in img.attrib:
                continue
            try:
                img.set("src", img.attrib["data-src"])
            except Exception:
                # Best effort: leave this image untouched.
                continue

        # Strip newlines and surrounding whitespace from heading text.
        for h1 in body_tree.xpath(".//h1"):
            if h1.text:
                h1.text = h1.text.replace("\n", "").strip()

        body_html = tostring(body_tree, encoding=str)

        return {
            "xp_num": "weixin",
            "drop_list": False,
            "html": body_html,
            "title": title,
            "base_url": base_url,
        }
98 | 99 | @staticmethod 100 | def ensure_have_color_rgb(htmlstr): 101 | pattern = r"(? 39 | 40 | * Import or include either the main stylesheet, or the 41 | stylesheet module you wish to use, directly from the library 42 | website; http://www.raleigh.ru/MathML/mmltex/. For example: 43 | 44 | 45 | 46 | Obtaining The Library 47 | --------------------- 48 | 49 | The XSLT MathML Library is available for download as: 50 | 51 | * Zip file: http://www.raleigh.ru/MathML/mmltex/mmltex.zip 52 | 53 | Copyright 54 | --------- 55 | 56 | Copyright (C) 2001, 2002 Vasil Yaroshevich 57 | 58 | Permission is hereby granted, free of charge, to any person 59 | obtaining a copy of this software and associated documentation 60 | files (the ``Software''), to deal in the Software without 61 | restriction, including without limitation the rights to use, 62 | copy, modify, merge, publish, distribute, sublicense, and/or 63 | sell copies of the Software, and to permit persons to whom the 64 | Software is furnished to do so, subject to the following 65 | conditions: 66 | 67 | The above copyright notice and this permission notice shall be 68 | included in all copies or substantial portions of the Software. 69 | 70 | Except as contained in this notice, the names of individuals 71 | credited with contribution to this software shall not be used in 72 | advertising or otherwise to promote the sale, use or other 73 | dealings in this Software without prior written authorization 74 | from the individuals in question. 75 | 76 | Any stylesheet derived from this Software that is publically 77 | distributed will be identified with a different name and the 78 | version strings in any derived Software will be changed so that 79 | no possibility of confusion between the derived package and this 80 | Software will exist. 
81 | 82 | Warranty 83 | -------- 84 | 85 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 86 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 87 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 88 | NONINFRINGEMENT. IN NO EVENT SHALL NORMAN WALSH OR ANY OTHER 89 | CONTRIBUTOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 90 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 91 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 92 | OTHER DEALINGS IN THE SOFTWARE. 93 | 94 | Contacting the Author 95 | --------------------- 96 | 97 | These stylesheets are maintained by Vasil Yaroshevich, . 98 | -------------------------------------------------------------------------------- /magic_html/mmltex/glayout.xsl: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | 7 | 11 | 12 | 13 | 14 | 15 | 16 | 21 | 22 | 23 | \genfrac{}{}{ 24 | 25 | 26 | 27 | ex 28 | 29 | 30 | .05ex 31 | 32 | 33 | 34 | .2ex 35 | 36 | 37 | 38 | 39 | 40 | }{}{ 41 | 42 | 43 | \frac{ 44 | 45 | 46 | 47 | \hfill 48 | 49 | 50 | 51 | \hfill 52 | 53 | }{ 54 | 55 | \hfill 56 | 57 | 58 | 59 | \hfill 60 | 61 | } 62 | 63 | 64 | 65 | 66 | 67 | \sqrt[ 68 | 69 | ]{ 70 | 71 | } 72 | 73 | 74 | 75 | exception 25: 76 | \text{exception 25:} 77 | 78 | 79 | 80 | 81 | 82 | \sqrt{ 83 | 84 | } 85 | 86 | 87 | 88 | 89 | 90 | 91 | \left 92 | 93 | 94 | \ 95 | 96 | 97 | 98 | \left( 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | , 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | \right 134 | 135 | 136 | \ 137 | 138 | 139 | 140 | \right) 141 | 142 | 143 | 144 | 145 | \phantom{ 146 | 147 | } 148 | 149 | 150 | 151 | 152 | 153 | \overline{ 154 | 155 | \hspace{.2em}|} 156 | 157 | 158 | \sqrt{ 159 | 160 | } 161 | 162 | 163 | \overline{) 164 | 165 | } 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 
| \colorbox[rgb]{ 177 | 178 | 179 | 180 | }{$ 181 | 182 | 183 | \textcolor[rgb]{ 184 | 185 | 186 | 187 | }{ 188 | 189 | 190 | 191 | } 192 | 193 | 194 | $} 195 | 196 | 197 | 215 | 216 | 217 | 218 | 219 | 220 | -------------------------------------------------------------------------------- /magic_html/mmltex/mmltex.xsl: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | 7 | 8 | 9 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | $ 41 | 42 | $ 43 | 44 | 45 | -------------------------------------------------------------------------------- /magic_html/mmltex/scripts.xsl: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | 7 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | \overline{ 33 | 34 | 35 | 36 | 37 | } 38 | 39 | 40 | \overbrace{ 41 | 42 | 43 | 44 | 45 | } 46 | 47 | 48 | \underline{ 49 | 50 | 51 | 52 | 53 | 54 | } 55 | 56 | 57 | \underbrace{ 58 | 59 | 60 | 61 | 62 | 63 | } 64 | 65 | 67 | 75 | 76 | _{ 77 | 78 | }^{ 79 | 80 | } 81 | 82 | 83 | \underset{ 84 | 85 | }{\overset{ 86 | 87 | }{ 88 | 89 | }} 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | \overline{ 131 | 132 | } 133 | 134 | 135 | \overbrace{ 136 | 137 | } 138 | 139 | 141 | 149 | 150 | ^{ 151 | 152 | } 153 | 154 | 155 | \stackrel{ 156 | 157 | }{ 158 | 159 | } 160 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | \underline{ 176 | 177 | } 178 | 179 | 180 | \underbrace{ 181 | 182 | } 183 | 184 | 186 | 194 | 195 | _{ 196 | 197 | } 198 | 199 | 200 | \underset{ 201 | 202 | }{ 203 | 204 | } 205 | 206 | 207 | 208 | 209 | 210 | { 211 | 212 | }_{ 213 | 214 | }^{ 215 | 216 | } 
217 | 218 | 219 | 220 | { 221 | 222 | }^{ 223 | 224 | } 225 | 226 | 227 | 228 | { 229 | 230 | }_{ 231 | 232 | } 233 | 234 | 235 | 236 | 237 | 238 | {}_{ 239 | 240 | } 241 | 242 | 243 | {}^{ 244 | 245 | } 246 | 247 | 248 | 249 | 250 | 251 | {} 252 | 253 | 254 | _{ 255 | 256 | } 257 | 258 | 259 | ^{ 260 | 261 | } 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | {} 276 | 277 | 278 | _{ 279 | 280 | } 281 | 282 | 283 | ^{ 284 | 285 | } 286 | 287 | 288 | 289 | 290 | 291 | 292 | -------------------------------------------------------------------------------- /magic_html/mmltex/tables.xsl: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | 7 | 11 | 12 | 13 | 14 | \multicolumn{ 15 | 16 | }{c}{ 17 | 18 | } 19 | 20 | & 21 | 22 | 23 | 24 | 25 | 26 | 27 | \hfill 28 | 29 | 30 | 31 | \hfill 32 | 33 | 34 | 36 | & 37 | 38 | 39 | 40 | 41 | 42 | 43 | \\ 44 | 45 | 46 | 47 | 48 | \begin{array}{ 49 | 50 | | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | | 85 | 86 | } 87 | 88 | \hline 89 | 90 | 91 | 92 | \\ \hline 93 | 94 | \end{array} 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | -------------------------------------------------------------------------------- /magic_html/mmltex/tokens.xsl: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | 7 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | \mathrm{ 21 | 22 | } 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | \text{ 45 | 46 | } 47 | 48 | 49 | 50 | \phantom{\rule 51 | 52 | [- 53 | 54 | ] 55 | 56 | { 57 | 58 | 0ex 59 | 60 | 61 | }{ 62 | 63 | 0ex 64 | 65 | 66 | }} 67 | 
68 | 69 | 70 | 71 | 72 | " 73 | 74 | 75 | " 76 | 77 | 78 | 79 | 80 | 81 | \colorbox[rgb]{ 82 | 83 | 84 | 85 | }{$ 86 | 87 | 88 | \textcolor[rgb]{ 89 | 90 | 91 | 92 | }{ 93 | 94 | 95 | 96 | 97 | \mathrm{ 98 | 99 | 100 | \mathbf{ 101 | 102 | 103 | \mathit{ 104 | 105 | 106 | \mathbit{ 107 | 108 | 109 | \mathbb{ 110 | 111 | 112 | { 113 | 114 | 115 | \mathcal{ 116 | 117 | 118 | \mathsc{ 119 | 120 | 121 | \mathfrak{ 122 | 123 | 124 | \mathsf{ 125 | 126 | 127 | \mathbsf{ 128 | 129 | 130 | \mathsfit{ 131 | 132 | 133 | \mathbsfit{ 134 | 135 | 136 | \mathtt{ 137 | 138 | 139 | { 140 | 141 | 142 | 143 | 144 | 145 | } 146 | 147 | 148 | } 149 | 150 | 151 | $} 152 | 153 | 154 | 155 | 156 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | , 189 | 190 | 191 | 192 | 193 | 194 | , 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | , 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | , 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 0,1,1 239 | 0,0,0 240 | 0,0,1 241 | 1,0,1 242 | .5,.5,.5 243 | 0,.5,0 244 | 0,1,0 245 | .5,0,0 246 | 0,0,.5 247 | .5,.5,0 248 | .5,0,.5 249 | 1,0,0 250 | .75,.75,.75 251 | 0,.5,.5 252 | 1,1,1 253 | 1,1,0 254 | 255 | Exception at color template 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | 285 | Exception at Hex2Decimal template 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Brotli 2 | cchardet==2.2.0a2 3 | charset_normalizer 4 | lxml<5.2.0 5 | numpy 6 | py_asciimath 7 | urllib3 
def parse_requirements(filename):
    """Read a pip-style requirements file into a list of requirement strings.

    Blank lines and ``#`` comment lines are skipped. A line containing an
    ``http://`` or ``https://`` URL is reduced to the part of its last path
    segment that precedes the first ``-`` (for a wheel or sdist URL this is
    the distribution name). Every other line is returned as-is, without
    surrounding whitespace.

    Args:
        filename: Path of the requirements file.

    Returns:
        The requirements in file order.
    """
    requires = []
    with open(filename, encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.strip()
            # Blank lines and comments are not requirements.
            if not line or line.startswith("#"):
                continue
            # Test for an actual URL scheme: a bare "http" substring would
            # also hit ordinary names such as "requests-http-signature" and
            # truncate them at the first hyphen.
            if "http://" in line or "https://" in line:
                requires.append(line.split("/")[-1].split("-")[0])
            else:
                requires.append(line)
    return requires