├── .gitignore ├── readme.md ├── readme.md.template ├── render-readme.sh ├── requirements.txt ├── scripts ├── article.py ├── download.py └── extract.py └── urls.tsv /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | 134 | # pytype static type analyzer 135 | .pytype/ 136 | 137 | # Cython debug symbols 138 | cython_debug/ 139 | 140 | .vscode/ 141 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # MassiveSumm: a very large-scale, very multilingual, news summarisation dataset 2 | This repository contains links to data and code to fetch and reproduce the data described in our EMNLP 2021 paper titled "[MassiveSumm: a very large-scale, very multilingual, news summarisation dataset](https://aclanthology.org/2021.emnlp-main.797/)". A (massive) multilingual dataset consisting of 92 diverse languages, across 35 writing scripts. With this work we attempt to take the first steps towards providing a diverse data foundation for summarisation in many languages. 3 | 4 | > *Disclaimer: The data is noisy and recall-oriented.
In fact, we highly recommend reading our analysis on the efficacy of this type of methods for data collection.* 5 | 6 | 7 | ## Get the Data 8 | Redistributing data from web is a tricky matter. We are working on providing efficient access to the entire dataset, as well as expanding it even further. For the time being we only provide links to reproduce subsets of the entire dataset through either common crawl and the wayback machine. The dataset is also available upon request ([djam@itu.dk](mailto:djam@itu.dk)). 9 | 10 | 11 | In the table below is a listing of files containing URLs and metadata required to fetch data from common crawl. 12 | lang | wayback | cc 13 | ------|----------------------------------------------------------------------------------|-------------------------------------------------------------------------------- 14 | afr | [link](https://drive.google.com/file/d/1m7ctoWs5or8HsFbuW5pBu_PodpND0J3e/view) | - 15 | amh | [link](https://drive.google.com/file/d/1k0_65Zb00VGm5i-hnFYuY7lNzjVUiWvl/view) | [link](https://drive.google.com/file/d/1_awz_-B0iWtaPdKih8H8Kz4HGnJSwvRq/view) 16 | ara | [link](https://drive.google.com/file/d/1raYOtsrpmD-yAGo50Kr917Ns3tXEJBci/view) | [link](https://drive.google.com/file/d/1HvCeJ3p59sdhb1xLFGNVHr10hHEg8phA/view) 17 | asm | [link](https://drive.google.com/file/d/1iGPZdk-PKQn0M_q8ENl-sJY861sT0kaE/view) | - 18 | aym | [link](https://drive.google.com/file/d/12XzyUHfrOi317OLU0QsOgl6eZvVOK-bF/view) | - 19 | aze | [link](https://drive.google.com/file/d/1JIqaeoNJt3VATqqjzP27fUBlTAlCvagz/view) | [link](https://drive.google.com/file/d/1CftuzziqiR5QezH9oYL-bCE_KpKdKQgK/view) 20 | bam | [link](https://drive.google.com/file/d/1Yb9YOENj0Kf8FK19eXHD-iHu5nuqR_3-/view) | [link](https://drive.google.com/file/d/1MWQVJMBLmc_8qktep7FGohVbHdXR0iKx/view) 21 | ben | [link](https://drive.google.com/file/d/1lOso52ouqtddUGF5RGOkIlVfL3kD3oTB/view) | [link](https://drive.google.com/file/d/1wK6YTRkXuc4df8C-Ko-PaWB1pIeDfY8q/view) 22 | bod | 
[link](https://drive.google.com/file/d/1RmonaYMfzj-sw5uM1crJvJxn1FEWfDNi/view) | [link](https://drive.google.com/file/d/1vnZb9PUjRCX6E__OlCAqBGxdu3-19Q8W/view) 23 | bos | [link](https://drive.google.com/file/d/1alV_CwZpxzAcEfuBCp5TCeMr3qOPZZHW/view) | [link](https://drive.google.com/file/d/1TTQVPZ4G7TGy7mFnN21XC3ZDlJlTpGhM/view) 24 | bul | [link](https://drive.google.com/file/d/1XU56P9Jd4Meo7YCEedRPu3qd3nOZRvSx/view) | [link](https://drive.google.com/file/d/13MJzUdrCLz-lo_c4IZOOupJY_50zvZHd/view) 25 | cat | [link](https://drive.google.com/file/d/1OqPLjlsUI-ldg6z2eEe_hA1tM1MCAfEt/view) | - 26 | ces | [link](https://drive.google.com/file/d/1na5Wx9P4SyVfHgIhRhFJ8RH0WpP2qAwt/view) | [link](https://drive.google.com/file/d/1tKzsoGFdDo93aKfEpkY5sSLsuN1hL4LV/view) 27 | cym | [link](https://drive.google.com/file/d/1wqb_fsyw9GBouoHkGq353nWXJZLAL4Ax/view) | [link](https://drive.google.com/file/d/1ewLaDdoC1An4hYr6LVLnPGPsY5h0ZiqS/view) 28 | dan | [link](https://drive.google.com/file/d/10Isyjz0Lw9F2JU3Lw4msLma49_i3CVJo/view) | [link](https://drive.google.com/file/d/1-VcQxG_YDngEaNNRBMn9vl6L3sIEnL_8/view) 29 | deu | [link](https://drive.google.com/file/d/1dguGPFKXkTvSVn2Yuyv7kBqWROouo7nM/view) | [link](https://drive.google.com/file/d/1LfBPlYTbmjWnZTM_e6twUVgzrOjY2kfp/view) 30 | ell | [link](https://drive.google.com/file/d/1hyQJHMMTP0WPaEMPeK5WpoToGuMU1m1C/view) | [link](https://drive.google.com/file/d/1dzbQc2K_rTIrkpcw9UgQyZ4PPk_ZYF5i/view) 31 | eng | [link](https://drive.google.com/file/d/1WumR27bj54A_ObzbM1FX7aR0Xiv80g8n/view) | [link](https://drive.google.com/file/d/1u-Zt56FKrJ9zVZRRSPiwqIoGaKHkTOuY/view) 32 | epo | [link](https://drive.google.com/file/d/1akp7L7cE9J75hdmxkjXIhLO30c1uZzV3/view) | - 33 | fas | [link](https://drive.google.com/file/d/1feVopBcYgz6TybgpYJjNma8Q8v1V36nT/view) | [link](https://drive.google.com/file/d/1AMz5xhJaR9Ud-oic4LA4-VoBfcT-cqWH/view) 34 | fil | [link](https://drive.google.com/file/d/16PBhI9DJZxju2du56u3OTgga-L7ImdKY/view) | - 35 | fra | 
[link](https://drive.google.com/file/d/19PCGH6Hxt2YiIiEcP224qSQ94D2f49lc/view) | [link](https://drive.google.com/file/d/1UQitwbOPwbbaXFb8xtV0chjeLvWzm3LN/view) 36 | ful | [link](https://drive.google.com/file/d/1glT_e_2kO9bb3mTRCYWUs4n0zGFBS8q9/view) | [link](https://drive.google.com/file/d/1eku0kULX4ZE9wQnUJMcqHy65Jsu21FFp/view) 37 | gle | [link](https://drive.google.com/file/d/1o078h9dEo2bJdmSmex2NA31yt491Cx_X/view) | [link](https://drive.google.com/file/d/1HNVxzYdmc1l_q4UOwNV6Yh2_QMfmlSaf/view) 38 | guj | [link](https://drive.google.com/file/d/19s9xs6DPeplFv3ME3V1Lhk7Zr_YJ3SMD/view) | [link](https://drive.google.com/file/d/1PWFIVGeCRuzAHH-w2UwVOANSvyeEXgFk/view) 39 | hat | [link](https://drive.google.com/file/d/1ioS9mTDjMlNOl8Z7by9F_YIT4BwZnEn-/view) | [link](https://drive.google.com/file/d/1yDorOERjCNFdDRt9viZyWpcO7gmXDyvr/view) 40 | hau | [link](https://drive.google.com/file/d/1oSLe6bPcfqkOtarZ5f5l_jBFLO2_Tafb/view) | [link](https://drive.google.com/file/d/1cYkwEYclvHnN8BLZf6z-DEyINGAHy34L/view) 41 | heb | [link](https://drive.google.com/file/d/1tHlRd6bg5zS7xvaEp5JOST7Ngb-DHecX/view) | - 42 | hin | [link](https://drive.google.com/file/d/1RDbFOOMV3FC71R_1QKxocwDgxP8csmqz/view) | [link](https://drive.google.com/file/d/1ZNcCqUV15Bv2FlY3qkMYyBWBm0hO4LKI/view) 43 | hrv | [link](https://drive.google.com/file/d/13PlLYJmEbZAc8mgMHbZH58-rLvU8bLSY/view) | - 44 | hun | [link](https://drive.google.com/file/d/157CC5cPhpWg5aM4iMjNtX0CdVyL1yO-J/view) | [link](https://drive.google.com/file/d/1R52kqwahdPHFkGpsGdAJE6Wkq38UGgFS/view) 45 | hye | [link](https://drive.google.com/file/d/1ZX0FmoSAmC_QJdwNo-KlqjrLG8ALup5L/view) | [link](https://drive.google.com/file/d/1ciACol27dN07_omNInoU_NqUvYmwXo6C/view) 46 | ibo | [link](https://drive.google.com/file/d/11cmywemBJuNeHkdwn_a4rPyKqbOM7zYF/view) | [link](https://drive.google.com/file/d/1oYOHwATB0PWNYvEv-azy_8MUgkxzFZCY/view) 47 | ind | [link](https://drive.google.com/file/d/1Cb0sJ-2cLYQdg3hKG7yC4bCziYjtpFRo/view) | 
[link](https://drive.google.com/file/d/1Sch920J5PqJbhpEQHNTjJ1ojiMU46tiQ/view) 48 | isl | [link](https://drive.google.com/file/d/183aUjkvgPtyafmAh3fvUj1OjFloa-nC6/view) | [link](https://drive.google.com/file/d/1wzccfq0RAN7c5c2BGhNMySp2yMYfj9ep/view) 49 | ita | [link](https://drive.google.com/file/d/1eGrviIr8FiRPaIK51l9mFbKEpr_RryuN/view) | [link](https://drive.google.com/file/d/123eRVzORxPIQnp75RMf0LsXWL21l76IH/view) 50 | jpn | [link](https://drive.google.com/file/d/16wRlWIwPIl3tBLJbrWRxDbnehLHHOWEt/view) | [link](https://drive.google.com/file/d/1vjYBbEmWg8PoztrcSqDjUe7ClCUNKHAL/view) 51 | kan | [link](https://drive.google.com/file/d/1J7jD8MjKkR0c_7OIZ7ahw4Bq_2jTgrYX/view) | [link](https://drive.google.com/file/d/18rBERL7l4zBupWwVHXasPu3jlegCM31B/view) 52 | kat | [link](https://drive.google.com/file/d/1S-CYer6Yu02tMRLBbxHtKFYCp33gXBzc/view) | [link](https://drive.google.com/file/d/1GSpqPf87onRlKHu4yoLzxQkAOSIE1GVW/view) 53 | khm | [link](https://drive.google.com/file/d/11OL9JKSTT8_zVQl77avrEVQiqXqV1J2p/view) | [link](https://drive.google.com/file/d/1-0m54dcSjGyBST9bodw1RJYqICsZCwuS/view) 54 | kin | [link](https://drive.google.com/file/d/1DnRV2pUU-b-f9DT27AtNLRJcx31AuRy4/view) | - 55 | kir | [link](https://drive.google.com/file/d/1DoBBN_nb_V-Ogl94KL-nM6iJH2WaGOpK/view) | [link](https://drive.google.com/file/d/1ncixaRUVSGcgTrMPhibN1Pfd4yIJ8c15/view) 56 | kor | [link](https://drive.google.com/file/d/1L3RY0coCdd-1HX4r0VkU2kQYdFOMuN_9/view) | [link](https://drive.google.com/file/d/14-QZft00ab2KAtjT1-p1fvaA45qKt3JJ/view) 57 | kur | [link](https://drive.google.com/file/d/15a_TBIEC1jYNVOTh_wKpKUrW8w_p3FoW/view) | [link](https://drive.google.com/file/d/1g3WTVRxMo5M5HOBuNJLU7KdSw1RQRdTx/view) 58 | lao | [link](https://drive.google.com/file/d/1oO7L92P1XUD6cNdh5MlDv9R-6jLjaYix/view) | [link](https://drive.google.com/file/d/1IOcXBGMoaA859RXzrXSV1WS2qMOweRUn/view) 59 | lav | [link](https://drive.google.com/file/d/1K6Z0RLc0yvyqHXIy3wYh8Elr3QlIMltz/view) | 
[link](https://drive.google.com/file/d/1AdXmbWraGH_Dh9_f2CcQhhqP5hIqnJXu/view) 60 | lin | [link](https://drive.google.com/file/d/1JTgwLaQgMSqOvdARhw82zrCwZpv5OFZV/view) | [link](https://drive.google.com/file/d/1QDYxfhMQDZeGVVRUsZjUzf7C2RUMYDMR/view) 61 | lit | [link](https://drive.google.com/file/d/1df6oV_UxxqZQnYRtmkVxUZe-2b4Bsiay/view) | [link](https://drive.google.com/file/d/1WjIJ-LZN0ZdqtE_NnoEiAiNhXm3eRGHk/view) 62 | mal | [link](https://drive.google.com/file/d/1hqp4OmL27HPBMVhYwZLf28Syha7mDmqx/view) | [link](https://drive.google.com/file/d/1tvsdnjRAiBFHc0Py-duJoqPlSwYlokie/view) 63 | mar | [link](https://drive.google.com/file/d/1BRcMJL_Zk1rq0hNcCYZAbZ3nMF0qmM1I/view) | [link](https://drive.google.com/file/d/1Z-ui3IipNQy3jpQqeNzrQiVZcXnUFs2e/view) 64 | mkd | [link](https://drive.google.com/file/d/1-UzcYkog_TjAnk9DjTR6vN29R3qjHICN/view) | [link](https://drive.google.com/file/d/1xpE3nPcs-m5WdbPyOX1H4wt9k06NMKN3/view) 65 | mlg | [link](https://drive.google.com/file/d/1dhzWeA8-JKFbhoLpbV1Yli5M4XHkrXdu/view) | [link](https://drive.google.com/file/d/11mpExgMv7VSdejMXUQLFnnPwitLUNudw/view) 66 | mon | [link](https://drive.google.com/file/d/1bPHLSMKtCI927f-I_skf0T98HAp-jN-A/view) | [link](https://drive.google.com/file/d/1rejguZ0HNNMZdV_9g6qXT6Si6QVXhuge/view) 67 | mya | [link](https://drive.google.com/file/d/1fXyMEKX8sz8-wCOKoLLXuqLXMsFZs9Ld/view) | [link](https://drive.google.com/file/d/1cLre9C9f1lm2Ds_8hv6f7h2R4phrXtVd/view) 68 | nde | [link](https://drive.google.com/file/d/1b_UekJ498qQv2DzeXjUWL20VMftds0ec/view) | [link](https://drive.google.com/file/d/1KxB5RLGMlteQOqBYu2DOhIXVzCUkhvDM/view) 69 | nep | [link](https://drive.google.com/file/d/1g-tRWW1dweVZMtkWnkm4j5-Qnbwzlh2P/view) | [link](https://drive.google.com/file/d/1jw_P1wenbskDfG8iQD3dYRb0oWnRba9N/view) 70 | nld | [link](https://drive.google.com/file/d/1JwV508z5Bx_3dHjCW0lMAhp0-8ykF7sF/view) | - 71 | ori | [link](https://drive.google.com/file/d/1eWnnCigfd8HmMueSvyPe7x1JcvGFReg2/view) | 
[link](https://drive.google.com/file/d/1a3t0X7PfphDZJiyJSL-FzsoZsDH9KIu4/view) 72 | orm | [link](https://drive.google.com/file/d/1oZ4S71rijKd32IL9Ww8VR-vr1mzh4WVT/view) | [link](https://drive.google.com/file/d/1-SopeFs8niXlmwWSe117-YDQ6ECK8xTh/view) 73 | pan | [link](https://drive.google.com/file/d/1Yr6Cy_gaJrbWNHz5khkjDR4mKZT7_TMO/view) | [link](https://drive.google.com/file/d/1t3sUOR_m4blOj8iIU1q8ohxUWPTWImcw/view) 74 | pol | [link](https://drive.google.com/file/d/1BSX_LcGIaDQOWXDqC3YmMFmRM2DQbUYb/view) | [link](https://drive.google.com/file/d/1pNOSculzyCNMjrQOarVhG_SDg1IbMpBr/view) 75 | por | [link](https://drive.google.com/file/d/1KWnsUKgIb2fJlRcOq0WhCzyE8cR0LhfB/view) | [link](https://drive.google.com/file/d/13ET2tIsrzFTzlb9Rd2KAp-FF7Y6R2Ker/view) 76 | prs | [link](https://drive.google.com/file/d/1izhl77L8R2r7YM4-Usu0VMAQtoO5sn7R/view) | [link](https://drive.google.com/file/d/11QMxXjH9vN0V6-UZXT2omxb7lm8zphqC/view) 77 | pus | [link](https://drive.google.com/file/d/1nJ0hBzj0z51htnwftGyd_I0DQbFdeypS/view) | [link](https://drive.google.com/file/d/1nccq6pEsvUhe1zvPTDoDKEob6cRKPpWP/view) 78 | ron | [link](https://drive.google.com/file/d/1XxDdroLJdAQZwtGmhPedZ-YZq_G9T-tr/view) | - 79 | run | [link](https://drive.google.com/file/d/1FVLjZI_oj6bGwP6tIMUTY-8yj4pjJm-5/view) | [link](https://drive.google.com/file/d/165N8Wh_TeTo7N_el6eWGmZ5ts3KBKU9Q/view) 80 | rus | [link](https://drive.google.com/file/d/17RWgFR6mIvxGr6RGhuRZspLtnlWZk1Ul/view) | [link](https://drive.google.com/file/d/15Cqcrbl_lG_oSED_hTyR_mb-dwpvw9J8/view) 81 | sin | [link](https://drive.google.com/file/d/158EtvATjJ39G7vThM69h4shcRqRXdT9K/view) | [link](https://drive.google.com/file/d/1gvqSIOkL7RDX-yg7O1VwHTdRUcNGcf_F/view) 82 | slk | [link](https://drive.google.com/file/d/1IGtxqiLlJqBhsfgbAUlyQAwQir2QmYGb/view) | [link](https://drive.google.com/file/d/1GDzuxd-KhBA_fHrDlO8HcCayLsG4-UkX/view) 83 | slv | [link](https://drive.google.com/file/d/1gWb-pImthObUPJ16hIO5HZif3XSvjIrK/view) | 
[link](https://drive.google.com/file/d/1T71uVRLX-wB-qeWFMyqtxlI91Dr2u7pn/view) 84 | sna | [link](https://drive.google.com/file/d/1FOacYT0S5sPVxmBbH4mHsH4uyrbQwOoM/view) | [link](https://drive.google.com/file/d/1wCysDOCUvsA3H-9CmrItwU7GHIxQt1rU/view) 85 | som | [link](https://drive.google.com/file/d/1IHxrknewcaTTQKrCH6K00lz3dPT5lIUc/view) | [link](https://drive.google.com/file/d/1oXDsB76ViX9ri5W_2xVEQqLWDqIQAh5O/view) 86 | spa | [link](https://drive.google.com/file/d/1y3iDoCDfT19MXgQtQ_z_dF4q8xgLK4Eq/view) | [link](https://drive.google.com/file/d/14dX8cePpcb-E7nS8brupu9lQkoLUPLSf/view) 87 | sqi | [link](https://drive.google.com/file/d/1rpOjaE3mjyl8LsLcU5QiEt1xmkxF0ntW/view) | [link](https://drive.google.com/file/d/1jz5sXn8JeHhZHjXb7r0wLqyynva7c8Al/view) 88 | srp | [link](https://drive.google.com/file/d/1wpp_f10LNB4Qb0F8-EMmdKWLMHb4cGuw/view) | [link](https://drive.google.com/file/d/1XJCyan1OL3UTI9_tNbvZX2ngQ2bhoy0U/view) 89 | swa | [link](https://drive.google.com/file/d/1L3DHVdngIRSd8eCjx7qyp5NBwziz6I6B/view) | [link](https://drive.google.com/file/d/1ukbOFz_dHYaIQnD4ub0hE9DqF_1mwVke/view) 90 | swe | [link](https://drive.google.com/file/d/1BgVrlj40Mlg4yOOMy1yPV8iyFbZTrRLE/view) | - 91 | tam | [link](https://drive.google.com/file/d/1VrmX5egg4zaZKPBA0Ic3jlMg_ovYkeW9/view) | [link](https://drive.google.com/file/d/1DWv-hkU0P2B0AysTFwOZ3aghW6lxF16A/view) 92 | tel | [link](https://drive.google.com/file/d/1zo3gNIH2sMczXpnWh-vVty3pr4-tOaBy/view) | [link](https://drive.google.com/file/d/1e0KIPqcKYHXmSjgLpxLJkEFumw8Bae_g/view) 93 | tet | [link](https://drive.google.com/file/d/1n-PVdlyti6wGtlGUalYeHOmGwZtG6xDe/view) | - 94 | tgk | [link](https://drive.google.com/file/d/1g6_1YKJbv7-5glBsreqspPP_VnBsRSXW/view) | - 95 | tha | [link](https://drive.google.com/file/d/1vTBPYxmkyWCqnboX3cVxxNHcfRcXAo_7/view) | [link](https://drive.google.com/file/d/197vyuI2JzOGczeVRqnUGu78G3T2WWRit/view) 96 | tir | [link](https://drive.google.com/file/d/1vkt2SRGiSPIJKzgU-XagWmx6rnQLtmZZ/view) | 
[link](https://drive.google.com/file/d/1lDazmqixV4Gem96O-c-gulqSNKUEcjnR/view) 97 | tur | [link](https://drive.google.com/file/d/1_39Hk7K-IKzvSiRLmue1mxANuPXQg9p5/view) | [link](https://drive.google.com/file/d/1Kole41CnnNArIt_rxfNlimk9EMQZFa8Y/view) 98 | ukr | [link](https://drive.google.com/file/d/1h2I-yan3WcVEyJfeyJD_fFFviJWK93N3/view) | [link](https://drive.google.com/file/d/1H8TUR73sJs_bvjJuLNB4szqiCIvTq5sp/view) 99 | urd | [link](https://drive.google.com/file/d/1p-lG1vEDp838GRzuPWc9hjfZeqjfeMh3/view) | [link](https://drive.google.com/file/d/1HDwEMuaULkZr6Mm39CifS2szyI_vql-G/view) 100 | uzb | [link](https://drive.google.com/file/d/134swKYwYcfCFbMSXe16hvmLVGqS7pOMb/view) | [link](https://drive.google.com/file/d/1nYOLG5UlV-YDeex8Tvi37hK4pM9wD7Wg/view) 101 | vie | [link](https://drive.google.com/file/d/1zm1AjKpOhEeaZgs2MeJsrFVjxT7kbyFL/view) | [link](https://drive.google.com/file/d/1uts1nSGwWNxEFZnsJi6SdamG2DAVPq-q/view) 102 | xho | [link](https://drive.google.com/file/d/1Gkq4cLknzh_cY9HBlWAGqkZez31vIarY/view) | [link](https://drive.google.com/file/d/1P31PeL7cVJ9eNE-YZ0ofH0pozT9ta5bP/view) 103 | yor | [link](https://drive.google.com/file/d/1KhCZk7wBsFkKsmU4XWffVTS37w1FguwE/view) | [link](https://drive.google.com/file/d/17ifvygtGzaIgDuqiFK0QDK1Jd7SOnkNd/view) 104 | yue | [link](https://drive.google.com/file/d/1u1ScUMdlOfyIyUOIcZBonIUb7rXJwGH8/view) | [link](https://drive.google.com/file/d/1blW_lXnUFa3poUwR6YuHd3N2fVoGuHhG/view) 105 | zho | [link](https://drive.google.com/file/d/10ipmN3CgNFXc6OQst6-Iasa5CaZLT93M/view) | [link](https://drive.google.com/file/d/13-qysDM2uAiT_E9KjAKsQZAQP_-0pyRL/view) 106 | bis | - | [link](https://drive.google.com/file/d/1zUn6LDov0zi_hxYbs9hX63UwLhcNcrsa/view) 107 | gla | - | [link](https://drive.google.com/file/d/1rIYRlZQ0Sl6By45hdOozAS_37Dv6LFQp/view) 108 | 109 | ## Cite Us! 
110 | Please cite us if you use our data or methodology 111 | ``` 112 | @inproceedings{varab-schluter-2021-massivesumm, 113 | title = "{M}assive{S}umm: a very large-scale, very multilingual, news summarisation dataset", 114 | author = "Varab, Daniel and 115 | Schluter, Natalie", 116 | booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing", 117 | month = nov, 118 | year = "2021", 119 | address = "Online and Punta Cana, Dominican Republic", 120 | publisher = "Association for Computational Linguistics", 121 | url = "https://aclanthology.org/2021.emnlp-main.797", 122 | pages = "10150--10161", 123 | abstract = "Current research in automatic summarisation is unapologetically anglo-centered{--}a persistent state-of-affairs, which also predates neural net approaches. High-quality automatic summarisation datasets are notoriously expensive to create, posing a challenge for any language. However, with digitalisation, archiving, and social media advertising of newswire articles, recent work has shown how, with careful methodology application, large-scale datasets can now be simply gathered instead of written. In this paper, we present a large-scale multilingual summarisation dataset containing articles in 92 languages, spread across 28.8 million articles, in more than 35 writing scripts. This is both the largest, most inclusive, existing automatic summarisation dataset, as well as one of the largest, most inclusive, ever published datasets for any NLP task. We present the first investigation on the efficacy of resource building from news platforms in the low-resource language setting. 
Finally, we provide some first insight on how low-resource language settings impact state-of-the-art automatic summarisation system performance.", 124 | } 125 | ``` -------------------------------------------------------------------------------- /readme.md.template: -------------------------------------------------------------------------------- 1 | # MassiveSumm: a very large-scale, very multilingual, news summarisation dataset 2 | This repository contains links to data and code to fetch and reproduce the data described in our EMNLP 2021 paper titled "[MassiveSumm: a very large-scale, very multilingual, news summarisation dataset](https://aclanthology.org/2021.emnlp-main.797/)". A (massive) multilingual dataset consisting of 92 diverse languages, across 35 writing scripts. With this work we attempt to take the first steps towards providing a diverse data foundation for summarisation in many languages. 3 | 4 | > *Disclaimer: The data is noisy and recall-oriented. In fact, we highly recommend reading our analysis on the efficacy of this type of method for data collection.* 5 | 6 | 7 | ## Get the Data 8 | Redistributing data from the web is a tricky matter. We are working on providing efficient access to the entire dataset, as well as expanding it even further. For the time being we only provide links to reproduce subsets of the entire dataset through either common crawl or the wayback machine. The dataset is also available upon request ([djam@itu.dk](mailto:djam@itu.dk)). 9 | 10 | 11 | In the table below is a listing of files containing URLs and metadata required to fetch data from common crawl. 12 | ${LANGS} 13 | 14 | ## Cite Us!
15 | Please cite us if you use our data or methodology 16 | ``` 17 | @inproceedings{varab-schluter-2021-massivesumm, 18 | title = "{M}assive{S}umm: a very large-scale, very multilingual, news summarisation dataset", 19 | author = "Varab, Daniel and 20 | Schluter, Natalie", 21 | booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing", 22 | month = nov, 23 | year = "2021", 24 | address = "Online and Punta Cana, Dominican Republic", 25 | publisher = "Association for Computational Linguistics", 26 | url = "https://aclanthology.org/2021.emnlp-main.797", 27 | pages = "10150--10161", 28 | abstract = "Current research in automatic summarisation is unapologetically anglo-centered{--}a persistent state-of-affairs, which also predates neural net approaches. High-quality automatic summarisation datasets are notoriously expensive to create, posing a challenge for any language. However, with digitalisation, archiving, and social media advertising of newswire articles, recent work has shown how, with careful methodology application, large-scale datasets can now be simply gathered instead of written. In this paper, we present a large-scale multilingual summarisation dataset containing articles in 92 languages, spread across 28.8 million articles, in more than 35 writing scripts. This is both the largest, most inclusive, existing automatic summarisation dataset, as well as one of the largest, most inclusive, ever published datasets for any NLP task. We present the first investigation on the efficacy of resource building from news platforms in the low-resource language setting. 
Finally, we provide some first insight on how low-resource language settings impact state-of-the-art automatic summarisation system performance.", 29 | } 30 | ``` -------------------------------------------------------------------------------- /render-readme.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | LANGS=$(csvtomd urls.tsv -d "$(echo '\t')") 3 | export LANGS 4 | 5 | cat readme.md.template | envsubst > readme.md 6 | echo readme.md is updated -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | csvtomd 2 | requests 3 | orjson 4 | beautifulsoup4 5 | readability-lxml 6 | -------------------------------------------------------------------------------- /scripts/article.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2018 Max Grusky 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | 16 | This file has been altered and adapted. Specifically, it is modified to accommodate 17 | the common crawl archive instead of the Wayback Machine.
18 | """ 19 | 20 | import re 21 | 22 | from urllib.parse import quote, urlparse, urljoin 23 | from bs4 import BeautifulSoup 24 | from readability import Document 25 | 26 | 27 | _whitespace = re.compile(r"\s+") 28 | 29 | 30 | class Article(object): 31 | 32 | """ 33 | Reads in a {url: "", html: ""} archive entry from the downloader script. 34 | This will scrape the provided HTML and extract the summary and text. Note 35 | that the provided URL in this case is actually the ARCHIVE url (Maybe this 36 | should be made clearer in the downloader script?). 37 | """ 38 | 39 | def __init__(self, archive, html): 40 | 41 | self.archive = archive 42 | self.html = html if html is not None else "" 43 | 44 | # @djam my doing 45 | self.url = archive 46 | self.date = None 47 | # self._parse_archive() 48 | self._parse_html() 49 | 50 | def _parse_archive(self): 51 | 52 | *splits, url = self.archive.split("id_/") 53 | *_, date = splits[0].split("/") 54 | 55 | self.url = self.normalize_url(url) 56 | self.date = date 57 | 58 | def _parse_html(self): 59 | 60 | self._load_html() 61 | self._find_canonical_url() 62 | 63 | self._extract_text() 64 | self._extract_summary() 65 | 66 | def _extract_summary(self): 67 | 68 | self.all_summaries = {} 69 | 70 | for meta in self.soup.findAll("meta"): 71 | for attr, value in meta.attrs.items(): 72 | 73 | if attr in ("name", "property") and "description" in value: 74 | 75 | # Extract the tag content. If we can't find anything, 76 | # ignore it and move onto the next tag. 
77 | 78 | try: 79 | 80 | self.all_summaries[value] = meta.get("content").strip() 81 | 82 | except Exception: 83 | 84 | continue 85 | 86 | if len(self.all_summaries) == 0: 87 | 88 | self.summary = None 89 | return 90 | 91 | for kind in ("og:description", "twitter:description", "description"): 92 | 93 | if kind in self.all_summaries: 94 | 95 | self.summary = self.all_summaries[kind] 96 | break 97 | 98 | else: 99 | 100 | random_pick = sorted(self.all_summaries)[0] 101 | self.summary = self.all_summaries[random_pick] 102 | 103 | def _extract_text(self): 104 | 105 | """ 106 | Uses Readability to extract the body text and titles of the articles. 107 | """ 108 | 109 | # Confusingly, the Readability package calls the body text of the article 110 | # its "summary." We want to create a plain text document from the body text, 111 | # so we need to extract the text from Readability's HTML version. 112 | 113 | body_soup = BeautifulSoup(self.readability.summary(), "lxml") 114 | 115 | # Now go through and extract each paragraph (in order). 116 | 117 | paragraph_text = [] 118 | for paragraph in body_soup.findAll("p"): 119 | 120 | # Very short pieces of text tend not to be article body text, but 121 | # captions, attributions, and advertising. It seems that excluding 122 | # paragraphs shorter than five words removes most of this. 123 | 124 | if len(paragraph.text.split()) >= 5: 125 | 126 | paragraph_body = _whitespace.sub(" ", paragraph.text).strip() 127 | paragraph_text.append(paragraph_body) 128 | 129 | # We join the plain text paragraphs of the article with double new lines. 130 | 131 | self.text = "\n\n".join(paragraph_text) 132 | 133 | # "Short title" uses in-page heuristics to remove cruft from