├── .gitattributes ├── .gitignore ├── LICENSE ├── README.md ├── assets ├── logo.png └── teasor.png ├── code_pipeline └── .keep └── web_pipeline ├── README.md ├── download ├── download.md └── process_listings │ ├── download_cc_list.py │ ├── split_listing.py │ ├── split_listing_for_hash_generation.sh │ ├── split_listing_for_local_deduplication.sh │ ├── split_listing_for_quality_filtering.sh │ ├── split_listing_for_re-organizing_data_merge.sh │ ├── split_listing_for_re-organizing_data_split.sh │ └── split_listing_for_text_extraction.sh ├── mathml2latex ├── LICENSE ├── README.md ├── input.md ├── mathml2latex.py ├── mmltex │ ├── README │ ├── README2 │ ├── cmarkup.xsl │ ├── entities.xsl │ ├── glayout.xsl │ ├── mmltex.xsl │ ├── scripts.xsl │ ├── tables.xsl │ └── tokens.xsl ├── output.md ├── output.png └── unicode_map.py ├── requirements.txt ├── stage1_download_and_extract.py ├── url_filtering ├── url_filter.py └── urls │ ├── adult │ ├── domains │ ├── expressions │ ├── urls │ └── usage │ ├── blocklist │ ├── adult.tar.gz │ ├── adult │ │ ├── domains │ │ ├── expressions │ │ ├── urls │ │ └── usage │ ├── agressif.tar.gz │ ├── agressif │ │ ├── domains │ │ ├── expressions │ │ ├── urls │ │ └── usage │ ├── arjel.tar.gz │ ├── arjel │ │ ├── domains │ │ ├── urls │ │ └── usage │ ├── chat.tar.gz │ ├── chat │ │ ├── domains │ │ ├── urls │ │ └── usage │ ├── dating.tar.gz │ ├── dating │ │ ├── domains │ │ ├── urls │ │ └── usage │ ├── ddos.tar.gz │ ├── ddos │ │ ├── domains │ │ ├── urls │ │ └── usage │ ├── download.sh │ ├── filehosting.tar.gz │ ├── filehosting │ │ ├── domains │ │ ├── urls │ │ └── usage │ ├── gambling.tar.gz │ ├── gambling │ │ ├── domains │ │ ├── urls │ │ └── usage │ ├── mixed_adult.tar.gz │ ├── mixed_adult │ │ ├── domains │ │ ├── urls │ │ └── usage │ ├── phishing.tar.gz │ └── phishing │ │ ├── domains │ │ ├── urls │ │ └── usage │ ├── curated │ └── domains │ ├── url_blocklist_refinedweb_manual_inspection.csv │ └── whitelist │ ├── domains │ └── urls └── utils ├── bad_url_words.py 
├── datatrove_utils.py ├── decont_utils ├── data │ ├── aime24.jsonl │ ├── aime25.jsonl │ ├── amc.jsonl │ ├── asdiv.jsonl │ ├── gsm8k.jsonl │ ├── math.jsonl │ ├── mathqa.jsonl │ ├── mawps.jsonl │ ├── mmlu_stem.jsonl │ ├── ocw.jsonl │ ├── sat.jsonl │ └── svamp.jsonl ├── datatrove_helper.py └── downstream_datasets.py ├── file_utils.py ├── latex_parsing.py └── math_fasttext.py /.gitattributes: -------------------------------------------------------------------------------- 1 | web_pipeline/url_filtering/urls/blocklist/adult/domains filter=lfs diff=lfs merge=lfs -text 2 | web_pipeline/url_filtering/urls/url_blocklist_refinedweb_manual_inspection.csv filter=lfs diff=lfs merge=lfs -text 3 | web_pipeline/url_filtering/urls/adult/domains filter=lfs diff=lfs merge=lfs -text 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # UV 98 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | #uv.lock 102 | 103 | # poetry 104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 105 | # This is especially recommended for binary packages to ensure reproducibility, and is more 106 | # commonly ignored for libraries. 
107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 108 | #poetry.lock 109 | 110 | # pdm 111 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 112 | #pdm.lock 113 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 114 | # in version control. 115 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 116 | .pdm.toml 117 | .pdm-python 118 | .pdm-build/ 119 | 120 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 121 | __pypackages__/ 122 | 123 | # Celery stuff 124 | celerybeat-schedule 125 | celerybeat.pid 126 | 127 | # SageMath parsed files 128 | *.sage.py 129 | 130 | # Environments 131 | .env 132 | .venv 133 | env/ 134 | venv/ 135 | ENV/ 136 | env.bak/ 137 | venv.bak/ 138 | 139 | # Spyder project settings 140 | .spyderproject 141 | .spyproject 142 | 143 | # Rope project settings 144 | .ropeproject 145 | 146 | # mkdocs documentation 147 | /site 148 | 149 | # mypy 150 | .mypy_cache/ 151 | .dmypy.json 152 | dmypy.json 153 | 154 | # Pyre type checker 155 | .pyre/ 156 | 157 | # pytype static type analyzer 158 | .pytype/ 159 | 160 | # Cython debug symbols 161 | cython_debug/ 162 | 163 | # PyCharm 164 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 165 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 166 | # and can be added to the global gitignore or merged into this file. For a more nuclear 167 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
168 | #.idea/ 169 | 170 | # Ruff stuff: 171 | .ruff_cache/ 172 | 173 | # PyPI configuration file 174 | .pypirc 175 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 |

Logo MegaMath: An Open Math Pre-training Dataset with 370B Tokens.

3 | 4 | [![Dataset](https://img.shields.io/badge/Datasets-4d8cd8?style=for-the-badge&logo=huggingface&logoColor=white)](https://huggingface.co/datasets/LLM360/MegaMath) 5 | [![Tech Report](https://img.shields.io/badge/Tech%20Report-5f16a8?style=for-the-badge&logo=arxiv&logoColor=white)](https://arxiv.org/pdf/2504.02807) 6 |
7 | 8 | ## About MegaMath 9 | 10 |
11 | Overview 12 |
13 | 14 | MegaMath is a large-scale pre-training dataset for math. 15 | It is curated via the following three efforts: 16 | 17 | - **Revisiting web data**: We re-extracted mathematical documents from Common Crawl with math-oriented HTML optimizations, fasttext-based filtering and deduplication, all for acquiring higher-quality data on the Internet. 18 | - **Recalling Math-related code data**: We identified high quality math-related code from large code training corpus, Stack-V2, further enhancing data diversity. 19 | - **Exploring Synthetic data**: We synthesized QA-style text, math-related code, and interleaved text-code blocks from web data or code data. 20 | 21 | ## How to Use 22 | 23 | MegaMath includes many different data variants which is tailored for different training demands. 24 | 25 | If you are training your LLM from scratch, we recommend you to use the full set of our web data. 26 | ```python 27 | from huggingface_hub import snapshot_download 28 | snapshot_download( 29 | repo_id="LLM360/MegaMath", 30 | local_dir="./", 31 | repo_type="dataset", 32 | allow_patterns=["megamath-web/*"] 33 | ) 34 | ``` 35 | 36 | If you are performing continual pre-training from strong base models, **MegaMath-Web-Pro** may be your best choice. 37 | ```python 38 | from huggingface_hub import snapshot_download 39 | snapshot_download( 40 | repo_id="LLM360/MegaMath", 41 | local_dir="./", 42 | repo_type="dataset", 43 | allow_patterns=["megamath-web-pro/*"] 44 | ) 45 | ``` 46 | 47 | We also provide **MegaMath-Code** which can enhance the performance of your LLM on solving math-related tasks via Python code. Moreover, MegaMath contains over 80B tokens of synthetic data, which can be used to further enhance the performance of your LLM on solving math-related tasks. 
48 | 49 | ```python 50 | from huggingface_hub import snapshot_download 51 | snapshot_download( 52 | repo_id="LLM360/MegaMath", 53 | local_dir="./", 54 | repo_type="dataset", 55 | allow_patterns=[ 56 | "megamath-qa/*", 57 | "megamath-translated-code/*", 58 | "megamath-text-code-block/*", 59 | "megamath-code/*" 60 | ] 61 | ) 62 | ``` 63 | 64 | ## Data Pipeline 65 | 66 | Please refer to the [web_pipeline](./web_pipeline) for more details. We are actively working on the code pipeline and will update the README soon. 67 | 68 | 69 | ## Citation 70 | If you use our dataset or find our work useful, please cite 71 | ```bibtex 72 | @article{zhou2025megamath, 73 | title = {MegaMath: Pushing the Limits of Open Math Corpora}, 74 | author = {Zhou, Fan and Wang, Zengzhi and Ranjan, Nikhil and Cheng, Zhoujun and Tang, Liping and He, Guowei and Liu, Zhengzhong and Xing, Eric P.}, 75 | journal = {arXiv preprint arXiv:2504.02807}, 76 | year = {2025}, 77 | note = {Preprint} 78 | } 79 | ``` -------------------------------------------------------------------------------- /assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLM360/MegaMath/52471d65673c784868629e2165470cd05f6c8975/assets/logo.png -------------------------------------------------------------------------------- /assets/teasor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLM360/MegaMath/52471d65673c784868629e2165470cd05f6c8975/assets/teasor.png -------------------------------------------------------------------------------- /code_pipeline/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLM360/MegaMath/52471d65673c784868629e2165470cd05f6c8975/code_pipeline/.keep -------------------------------------------------------------------------------- /web_pipeline/README.md: 
"""Download Common Crawl WARC path listings for every available crawl dump.

Scrapes the Common Crawl "get started" page for the list of published crawl
dumps, then downloads each dump's ``warc.paths.gz`` listing into
``save_path``.  Names of newly downloaded dumps are recorded in
``dumplist.txt`` inside ``save_path``.
"""

import argparse
import os


def str2bool(value):
    """Convert a command-line string into a bool.

    ``argparse`` with ``type=bool`` treats ANY non-empty string — including
    "False" — as True; this converter parses the usual boolean spellings
    instead.  Accepts an actual bool unchanged so defaults pass through.

    Raises:
        argparse.ArgumentTypeError: if the string is not a recognised spelling.
    """
    if isinstance(value, bool):
        return value
    lowered = value.lower()
    if lowered in ("yes", "true", "t", "y", "1"):
        return True
    if lowered in ("no", "false", "f", "n", "0"):
        return False
    raise argparse.ArgumentTypeError(f"Boolean value expected, got {value!r}")


def parse_args():
    """Parse and return command line arguments."""
    parser = argparse.ArgumentParser(description="Download Common Crawl index dumps.")
    parser.add_argument('--save_path', type=str, default="./commoncrawlList/",
                        help="Path to save the downloaded Common Crawl list.")
    # BUG FIX: the original used type=bool, which parses "False" as True.
    # str2bool keeps `--skip_existing_dumps True` working exactly as before
    # while making `--skip_existing_dumps False` actually disable skipping.
    parser.add_argument('--skip_existing_dumps', type=str2bool, default=True,
                        help="Whether to skip dumps already downloaded to 'save_path'.")
    return parser.parse_args()


def get_available_dumps(url):
    """Fetch and parse the web page to list available Common Crawl dumps.

    Returns the ``<a class="crawl-link w-inline-block">`` anchor tags found on
    the page, one per published crawl dump.
    """
    # Imported lazily so the module can be imported (e.g. for testing
    # str2bool) without requests/bs4 installed.
    import requests
    from bs4 import BeautifulSoup

    response = requests.get(url)
    # Fail loudly on HTTP errors instead of silently parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup.find_all('a', attrs={'class': "crawl-link w-inline-block"})


def main():
    """Download the WARC path listing for each dump not yet present on disk."""
    args = parse_args()

    # Ensure the directory exists where the dumps will be stored.
    os.makedirs(args.save_path, exist_ok=True)

    # Get all the available dumps from Common Crawl's start page.
    url = 'https://commoncrawl.org/get-started'
    dump_links = get_available_dumps(url)

    # Prepare to track already downloaded dumps if skipping is enabled.
    existing_dumps = set(os.listdir(args.save_path)) if args.skip_existing_dumps else set()

    # Dumps to skip due to different file formats which are not supported for now.
    skip_list = {'CC-MAIN-2012', 'CC-MAIN-2009-2010', 'CC-MAIN-2008-2009'}

    import wget  # lazy: only needed when a download actually happens

    # File to record names of newly downloaded dumps.
    with open(os.path.join(args.save_path, 'dumplist.txt'), 'w') as dump_file:
        for link in dump_links:
            dump_url = link.get('href')
            dump_name = dump_url.split('/')[-2]  # Format: 'CC-MAIN-2024-30'

            # Skip dumps either in skip list or already downloaded.
            if dump_name in skip_list or dump_name in existing_dumps:
                continue

            # Construct download URL and local save path.
            dump_list_url = dump_url.split('index.html')[0] + 'warc.paths.gz'
            dump_save_path = os.path.join(args.save_path, dump_name)

            # Ensure dump directory exists and download the dump.
            os.makedirs(dump_save_path, exist_ok=True)
            wget.download(dump_list_url, out=dump_save_path)
            print(f"\n Successfully downloaded {dump_name}")
            dump_file.write(dump_name + '\n')


if __name__ == '__main__':
    main()
"""Split Common Crawl dump file listings into multiple shard files."""

import argparse
import gzip
import io
import os
import random


def str2bool(value):
    """Convert a command-line string into a bool.

    ``argparse`` with ``type=bool`` treats ANY non-empty string — including
    "False" — as True; this converter parses the usual boolean spellings
    instead.  Accepts an actual bool unchanged so defaults pass through.
    """
    if isinstance(value, bool):
        return value
    lowered = value.lower()
    if lowered in ("yes", "true", "t", "y", "1"):
        return True
    if lowered in ("no", "false", "f", "n", "0"):
        return False
    raise argparse.ArgumentTypeError(f"Boolean value expected, got {value!r}")


def start_split(dump_file_name_paths, files_num, store_dir, shuffle, consecutive):
    """
    Splits the dump file listings into multiple files, either randomly or consecutively,
    depending on the flags provided.

    Args:
        dump_file_name_paths (list of str): Paths to the input dump file listings
            (plain ``.txt`` or gzip-compressed ``.gz``; other extensions are ignored).
        files_num (int): Number of output files to generate.
        store_dir (str): Directory (when ending in "/") or file-name prefix for
            the output split files.
        shuffle (bool): Whether to shuffle the records before splitting.
        consecutive (bool): If True, store records consecutively across output
            files; otherwise distribute them round-robin.
    """
    file_name_list = []

    # Load every listing; each record keeps its trailing newline so it can be
    # written back verbatim with writelines().
    for listing_path in dump_file_name_paths:
        if listing_path.endswith(".txt"):
            with open(listing_path) as records:
                file_name_list.extend(records)
        elif listing_path.endswith(".gz"):
            with gzip.open(listing_path, "rb") as stream:
                file_name_list.extend(io.TextIOWrapper(stream, encoding="utf-8"))

    # Optionally shuffle the file name list.
    if shuffle:
        random.shuffle(file_name_list)

    print(f"Total records: {len(file_name_list)}")

    # A trailing "/" means store_dir is a directory to create; otherwise it is
    # a file prefix whose parent directory must exist.  Guarding on `parent`
    # avoids os.makedirs("") blowing up when the prefix has no directory part.
    parent = os.path.dirname(store_dir)
    if store_dir.endswith("/"):
        os.makedirs(store_dir, exist_ok=True)
    elif parent and not os.path.exists(parent):
        os.makedirs(parent, exist_ok=True)

    if consecutive:
        start_index = 0  # running cursor into file_name_list
        # First `extra` shards get one record more than the rest.
        base_lines_per_file, extra_lines = divmod(len(file_name_list), files_num)

    for i in range(files_num):
        output_file = (os.path.join(store_dir, f"Split_{i:03d}.txt")
                       if store_dir.endswith("/")
                       else f"{store_dir}.Split_{i:03d}.txt")
        with open(output_file, "w") as f:
            if consecutive:
                # Write this shard's contiguous slice and advance the cursor.
                end_index = start_index + base_lines_per_file + (1 if i < extra_lines else 0)
                f.writelines(file_name_list[start_index:end_index])
                start_index = end_index
            else:
                # Round-robin: record j goes to shard j % files_num.
                f.writelines(file_name_list[i::files_num])


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Split dump file listings into multiple files.")
    parser.add_argument('--file_path', nargs='+', help='Paths to the dump file listings')
    parser.add_argument('--files_num', type=int, default=99, help='Number of output files to generate')
    parser.add_argument('--store_dir', type=str, default='./split_files', help='Output directory for split files')
    # BUG FIX: the original used type=bool, which parses the string "False" as
    # True.  str2bool keeps `--shuffle True` / `--consecutive True` (as used by
    # the companion shell scripts) behaving identically while making explicit
    # "False" arguments actually work.
    parser.add_argument('--shuffle', type=str2bool, default=False, help='Shuffle the records before splitting')
    parser.add_argument('--consecutive', type=str2bool, default=False, help='Store consecutive listings in one file')

    args = parser.parse_args()
    start_split(args.file_path, args.files_num, args.store_dir, args.shuffle, args.consecutive)
#!/bin/bash
# ---------------------------------------------------------------------------
# split_listing_for_hash_generation.sh
# Split every dump's WARC listing into 20 shuffled shards for hash generation.
# ---------------------------------------------------------------------------

# Collect the names of new dumps recorded in dumplist.txt by download_cc_list.py.
ALL_DUMPS=()
while IFS= read -r dump_name; do
    ALL_DUMPS+=("$dump_name")
done < "commoncrawlList/dumplist.txt"

for dump_date in "${ALL_DUMPS[@]}"; do
    listing_path="commoncrawlList/$dump_date/warc.paths.gz"
    echo "$listing_path"
    # --files_num: number of output shards to generate
    # --store_dir: destination directory for the split listings
    # --file_path: WARC listing(s) to split
    # --shuffle:   shuffle the WARC paths before splitting
    python3 split_listing.py \
        --files_num 20 \
        --store_dir "../../listings/hash_generation/run-1/$dump_date" \
        --file_path "$listing_path" \
        --shuffle True
done

# ---------------------------------------------------------------------------
# split_listing_for_local_deduplication.sh
# Split every dump's WARC listing into 70 consecutive shards for local dedup.
# ---------------------------------------------------------------------------

ALL_DUMPS=()
while IFS= read -r dump_name; do
    ALL_DUMPS+=("$dump_name")
done < "commoncrawlList/dumplist.txt"

for dump_date in "${ALL_DUMPS[@]}"; do
    listing_path="commoncrawlList/$dump_date/warc.paths.gz"
    echo "$listing_path"
    # --consecutive: keep the original listing order, contiguous per shard
    python3 split_listing.py \
        --files_num 70 \
        --store_dir "../../listings/local_deduplication/run-1/$dump_date" \
        --file_path "$listing_path" \
        --consecutive True
done

# ---------------------------------------------------------------------------
# split_listing_for_quality_filtering.sh
# Split every dump's WARC listing into 20 shuffled shards for quality filtering.
# The store_dir strips the "CC-MAIN-" prefix from the dump name.
# ---------------------------------------------------------------------------

ALL_DUMPS=()
while IFS= read -r dump_name; do
    ALL_DUMPS+=("$dump_name")
done < "commoncrawlList/dumplist.txt"

for dump_date in "${ALL_DUMPS[@]}"; do
    listing_path="commoncrawlList/$dump_date/warc.paths.gz"
    echo "$listing_path"
    python3 split_listing.py \
        --files_num 20 \
        --store_dir "../../listings/quality_filtering/run-1/${dump_date#CC-MAIN-}" \
        --file_path "$listing_path" \
        --shuffle True
done
24 | --files_num 10 \ 25 | --store_dir ../../listings/re-organizing_data/merge/run-1/1-1/${dump_date#CC-MAIN-} \ 26 | --file_path "${all_file_paths[@]}" \ 27 | --shuffle True 28 | done 29 | 30 | 31 | for dump_date in "${ALL_DUMPS[@]}"; do 32 | all_file_paths=() 33 | file_path="commoncrawlList/$dump_date/warc.paths.gz" 34 | all_file_paths+=("$file_path") 35 | 36 | # Execute a Python script to process the listed WARC file paths 37 | # --files_num: Specifies the number of output files to generate 38 | # --store_dir: Defines the directory where the split listings will be stored 39 | # --file_path: Passes the array of WARC file paths to the Python script 40 | # --shuffle: Enables shuffling file paths in the listings 41 | echo $file_path 42 | python3 split_listing.py \ 43 | --files_num 10 \ 44 | --store_dir ../../listings/re-organizing_data/merge/run-1/2-5/${dump_date#CC-MAIN-} \ 45 | --file_path "${all_file_paths[@]}" \ 46 | --shuffle True 47 | done 48 | 49 | 50 | for dump_date in "${ALL_DUMPS[@]}"; do 51 | all_file_paths=() 52 | file_path="commoncrawlList/$dump_date/warc.paths.gz" 53 | all_file_paths+=("$file_path") 54 | 55 | # Execute a Python script to process the listed WARC file paths 56 | # --files_num: Specifies the number of output files to generate 57 | # --store_dir: Defines the directory where the split listings will be stored 58 | # --file_path: Passes the array of WARC file paths to the Python script 59 | # --shuffle: Enables shuffling file paths in the listings 60 | echo $file_path 61 | python3 split_listing.py \ 62 | --files_num 7 \ 63 | --store_dir ../../listings/re-organizing_data/merge/run-1/6-10/${dump_date#CC-MAIN-} \ 64 | --file_path "${all_file_paths[@]}" \ 65 | --shuffle True 66 | done 67 | 68 | 69 | for dump_date in "${ALL_DUMPS[@]}"; do 70 | all_file_paths=() 71 | file_path="commoncrawlList/$dump_date/warc.paths.gz" 72 | all_file_paths+=("$file_path") 73 | 74 | # Execute a Python script to process the listed WARC file paths 75 | # --files_num: 
Specifies the number of output files to generate 76 | # --store_dir: Defines the directory where the split listings will be stored 77 | # --file_path: Passes the array of WARC file paths to the Python script 78 | # --shuffle: Enables shuffling file paths in the listings 79 | echo $file_path 80 | python3 split_listing.py \ 81 | --files_num 5 \ 82 | --store_dir ../../listings/re-organizing_data/merge/run-1/11-100/${dump_date#CC-MAIN-} \ 83 | --file_path "${all_file_paths[@]}" \ 84 | --shuffle True 85 | done 86 | 87 | 88 | for dump_date in "${ALL_DUMPS[@]}"; do 89 | all_file_paths=() 90 | file_path="commoncrawlList/$dump_date/warc.paths.gz" 91 | all_file_paths+=("$file_path") 92 | 93 | # Execute a Python script to process the listed WARC file paths 94 | # --files_num: Specifies the number of output files to generate 95 | # --store_dir: Defines the directory where the split listings will be stored 96 | # --file_path: Passes the array of WARC file paths to the Python script 97 | # --shuffle: Enables shuffling file paths in the listings 98 | echo $file_path 99 | python3 split_listing.py \ 100 | --files_num 3 \ 101 | --store_dir ../../listings/re-organizing_data/merge/run-1/101-1000/${dump_date#CC-MAIN-} \ 102 | --file_path "${all_file_paths[@]}" \ 103 | --shuffle True 104 | done 105 | 106 | 107 | for dump_date in "${ALL_DUMPS[@]}"; do 108 | all_file_paths=() 109 | file_path="commoncrawlList/$dump_date/warc.paths.gz" 110 | all_file_paths+=("$file_path") 111 | 112 | # Execute a Python script to process the listed WARC file paths 113 | # --files_num: Specifies the number of output files to generate 114 | # --store_dir: Defines the directory where the split listings will be stored 115 | # --file_path: Passes the array of WARC file paths to the Python script 116 | # --consecutive: Enables consecutive listings of the file paths 117 | echo $file_path 118 | python3 split_listing.py \ 119 | --files_num 2 \ 120 | --store_dir 
../../listings/re-organizing_data/merge/run-1/1001-inf/${dump_date#CC-MAIN-} \ 121 | --file_path "${all_file_paths[@]}" \ 122 | --shuffle True 123 | done -------------------------------------------------------------------------------- /web_pipeline/download/process_listings/split_listing_for_re-organizing_data_split.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Initialize an array to store the names of new dumps 4 | declare -a ALL_DUMPS 5 | 6 | # Read each line from the dumplist.txt file and append to the ALL_DUMPS array 7 | # This file contains a list of new dump directories 8 | while IFS= read -r line; do 9 | ALL_DUMPS+=("$line") 10 | done < "commoncrawlList/dumplist.txt" 11 | 12 | all_file_paths=() 13 | for dump_date in "${ALL_DUMPS[@]}"; do 14 | file_path="commoncrawlList/$dump_date/warc.paths.gz" 15 | all_file_paths+=("$file_path") 16 | done 17 | 18 | # Execute a Python script to process the listed WARC file paths 19 | # --files_num: Specifies the number of output files to generate 20 | # --store_dir: Defines the directory where the split listings will be stored 21 | # --file_path: Passes the array of WARC file paths to the Python script 22 | # --shuffle: Enables shuffling file paths in the listings 23 | python3 split_listing.py \ 24 | --files_num 20 \ 25 | --store_dir ../../listings/re-organizing_data/split/run-1/ \ 26 | --file_path "${all_file_paths[@]}" \ 27 | --shuffle True -------------------------------------------------------------------------------- /web_pipeline/download/process_listings/split_listing_for_text_extraction.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Initialize an array to store the names of new dumps 4 | declare -a ALL_DUMPS 5 | 6 | # Read each line from the dumplist.txt file and append to the ALL_DUMPS array 7 | # This file contains a list of new dump directories 8 | while IFS= read -r line; do 9 | 
ALL_DUMPS+=("$line") 10 | done < "commoncrawlList/dumplist.txt" 11 | 12 | for dump_date in "${ALL_DUMPS[@]}"; do 13 | all_file_paths=() 14 | file_path="commoncrawlList/$dump_date/warc.paths.gz" 15 | all_file_paths+=("$file_path") 16 | 17 | # Execute a Python script to process the listed WARC file paths 18 | # --files_num: Specifies the number of output files to generate 19 | # --store_dir: Defines the directory where the split listings will be stored 20 | # --file_path: Passes the array of WARC file paths to the Python script 21 | # --shuffle: Enables shuffling file paths in the listings 22 | echo $file_path 23 | python3 download/process_listings/split_listing.py \ 24 | --files_num 200 \ 25 | --store_dir listings/text_extraction/run-1/${dump_date#CC-MAIN-} \ 26 | --file_path "${all_file_paths[@]}" \ 27 | --shuffle True 28 | done -------------------------------------------------------------------------------- /web_pipeline/mathml2latex/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Bo Wang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /web_pipeline/mathml2latex/README.md: -------------------------------------------------------------------------------- 1 | # Convert MathML to Latex 2 | 3 | ## Introduction 4 | 5 | This project provides a Python script to convert MathML into Latex. 6 | 7 | ## Usage 8 | 9 | ```bash 10 | ./mathml2latex input.md output.md 11 | ``` 12 | 13 | Example `input.md`: 14 | 15 | ``` 16 | Gradient 17 | 18 | Let be a scalar field. The gradient is 19 | 20 | 21 | ``` 22 | 23 | Example `output.md`: 24 | 25 | ``` 26 | Gradient 27 | Let $f:{\mathbb{R}}^{n}\to \mathbb{R}$ be a scalar field. The gradient is 28 | $$\nabla f\left(x\right)=\left[\begin{array}{c}\frac{\mathit{\partial}f}{\mathit{\partial}{x}_{1}}\\ \vdots \\ \frac{\mathit{\partial}f}{\mathit{\partial}{x}_{n}}\end{array}\right]$$ 29 | ``` 30 | 31 | `output.md` rendered as: 32 | 33 | ![Rendered Latex Math](output.png) 34 | 35 | ## Background 36 | 37 | I started this little project when attempting to migrate from OneNote to Markdown. I have a large number of math notes with heavy equations, which makes my journey much bumpier. 38 | 39 | In OneNote, equations are stored in MathML format; while in Markdown, equations are in the form of Latex. 40 | 41 | As this may help others in similar situations, I decided to jot down the approaches to convert OneNote to Markdown below. 42 | 43 | There exist at least three ways to do the conversion. 44 | 45 | 1. **OneNote --> Word --> Markdown** 46 | This [method](https://github.com/SjoerdV/ConvertOneNote2MarkDown) appears to be the most popular. I found several similar repos on Github. 
47 | There are two steps in this approach: 48 | * **Step 1**: Export OneNote documents in Word format, i.e. `.docx` 49 | * This export function is supported in the [standalone version](https://www.onenote.com/download) of OneNote on Windows. I have not found it available on Mac or on the version installed from the Microsoft Store. 50 | * **Step 2**: Convert Word documents to Markdown with [Pandoc](https://pandoc.org/) 51 | * This approach has the advantage of being able to export all OneNote documents with a single PowerShell script. 52 | * However, it is a disaster for my equations. When exporting to `.docx`, all equations are converted into images, which not only breaks the line alignment but also loses the capability of editing equations in the future. 53 |
54 | 55 | 2. **OneNote --> HTML --> Markdown** 56 | This approach is based on the [one2html](https://github.com/msiemens/one2html) project which utilizes the [onenote.rs](https://github.com/msiemens/onenote.rs) parser. 57 | 58 | The converted equations are in the HTML format, rather than the image format. However, the converter does not support MathML which renders the converted equation garbled. 59 |
60 | 61 | 3. **OneMark --> Markdown (with MathML) --> Markdown (with Latex)** 62 | [OneMark](http://neux.studio/) is a great plugin that enables writing OneNote with Markdown syntax. It also comes with a handy function to export OneNote into Markdown. 63 | This approach consists of two steps: 64 | * **Step 1**: Export OneNote to Markdown with OneMark 65 | * Since OneMark only has Windows version, you need to do this on a Windows machine with the standalone version of OneNote. 66 | * One inefficiency here is that OneMark currently only supports exporting one page at a time. Thus it may be laborious if you have a large number of notes like me. 67 | * **Step 2**: Convert MathML to Latex in Markdown 68 | * Equations in the Markdown generated by OneMark are in form of MathML which is not edit-friendly and cannot be displayed in many Markdown editors. 69 | * To convert MathML to Latex, I write a Python script which results in this repo. 70 | 71 | ## Mechanism 72 | 73 | `mathml2latex.py` detects MathML blocks denoted by ``. This wrapper block is generated by OneMark. 74 | 75 | The conversion of a MathML block is conducted in two phases: 76 | 77 | 1. Invoke [XSLT MathML](http://xsltml.sourceforge.net/) library to transform MathML structures to Latex markups 78 |
79 | 80 | 2. Convert UTF characters to Latex markups 81 | * XSLT MathML only converts the math structures to Latex markups while leaving UTF symbols like `π` in the literal form. 82 | * Many Markdown editors fail to recognize these UTF symbols which results in a failure of rendering. 83 | * Thus, `unicode2latex()` utilizes a lookup table to convert these UTF symbols to Latex markups, e.g. `\pi` 84 | 85 | ## Related 86 | 87 | * [mathconverter](https://github.com/oerpub/mathconverter/) : a nice math converter that inspires this project. Unfortunately it is not Markdown friendly. 88 | -------------------------------------------------------------------------------- /web_pipeline/mathml2latex/input.md: -------------------------------------------------------------------------------- 1 | Gradient 2 | 3 | Let be a scalar field. The gradient is 4 | 5 | 6 | -------------------------------------------------------------------------------- /web_pipeline/mathml2latex/mathml2latex.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import re 5 | import os 6 | import sys 7 | from lxml import etree 8 | from mathml2latex.unicode_map import unicode_map 9 | 10 | # MathML to LaTeX conversion with XSLT from Vasil Yaroshevich 11 | base_path = os.path.dirname(os.path.realpath(__file__)) 12 | xslt_file = os.path.join(base_path, 'mmltex', 'mmltex.xsl') 13 | xslt = etree.parse(xslt_file) 14 | transform = etree.XSLT(xslt) 15 | 16 | 17 | # add by zzwang 18 | 19 | def preprocess_and_parse_xml(xml_content): 20 | # 替换常见的 HTML 实体 21 | # entity_replacements = { 22 | # ' ': ' ', # 非断空格 23 | # '<': '<', # 小于号 24 | # '>': '>', # 大于号 25 | # '&': '&', # &符号 26 | # '"': '"', # 双引号 27 | # ''': ''', # 单引号 28 | # } 29 | 30 | # for entity, replacement in entity_replacements.items(): 31 | # xml_content = xml_content.replace(entity, replacement) 32 | 33 | # # 移除或替换其他可能导致问题的字符 34 | # xml_content = 
re.sub(r'&#x([0-9a-fA-F]+);', lambda m: chr(int(m.group(1), 16)), xml_content) 35 | # xml_content = re.sub(r'&#([0-9]+);', lambda m: chr(int(m.group(1))), xml_content) 36 | 37 | # 尝试解析预处理后的内容 38 | try: 39 | return etree.fromstring(xml_content) 40 | except etree.XMLSyntaxError as e: 41 | print(f"解析错误: {e}") 42 | # 如果仍然失败,可以尝试使用更宽松的解析器 43 | parser = etree.XMLParser(recover=True) 44 | return etree.fromstring(xml_content, parser) 45 | 46 | def mathml2latex(mathml_block): 47 | # Preprocess to remove aliases 48 | mathml_block = mathml_block.replace('<<', '<<').replace('>>', '>>') 49 | # dom = etree.fromstring(mathml_block) 50 | dom = preprocess_and_parse_xml(mathml_block) 51 | return transform(dom) 52 | 53 | def unicode2latex(latex_block): 54 | latex_text = str(latex_block, 'utf-8').encode('ascii', 'backslashreplace') 55 | for utf_code, latex_code in unicode_map.items(): 56 | latex_text = str(latex_text).replace(utf_code, latex_code) 57 | latex_text = latex_text.replace('\\\\', '\\') # "\\" --> "\" 58 | latex_text = re.sub(r'\\textcolor\[rgb\]\{[0-9.,]+\}', '', latex_text) # "\textcolor[rgb]{...}" --> "" 59 | latex_text = latex_text.replace('\\ ~\\ ', '{\\sim}') # " ~ " --> "{\sim}" 60 | latex_text = latex_text[len('b\''):][:-len('\'')] # b'...' --> ... 
61 | latex_text = re.sub(r'^\$ ', '$', latex_text) # "$ " --> "$" 62 | latex_text = latex_text.replace('{\\ }', '\\ ') # "{ }" --> " " 63 | latex_text = re.sub(r' \}', '}', latex_text) # " }" --> "}" 64 | latex_text = latex_text.replace('\\n\\[\\n\\t', '$$').replace('\\n\\]', '$$') 65 | return latex_text 66 | 67 | def convert(text): 68 | mathml_blocks = re.findall(r"", text) 69 | for mathml_block in mathml_blocks: 70 | latex_block = mathml2latex(mathml_block) 71 | latex_text = unicode2latex(latex_block) 72 | text = text.replace('', latex_text) 73 | # Remove multiple consecutive blank lines 74 | for _ in range(2): 75 | text = re.sub(r'\n\n', '\n', text) 76 | return text 77 | 78 | def main(): 79 | input_file = open(sys.argv[1], "r", encoding="utf-8") 80 | input = input_file.read() 81 | input_file.close() 82 | output = convert(input) 83 | output_file = open(sys.argv[2], "w", encoding="utf-8") 84 | output_file.write(output) 85 | output_file.close() 86 | 87 | # if __name__ == "__main__": 88 | # main() 89 | -------------------------------------------------------------------------------- /web_pipeline/mathml2latex/mmltex/README: -------------------------------------------------------------------------------- 1 | README for the XSLT MathML Library 2.1.2 2 | 3 | XSLT MathML Library is a set of XSLT stylesheets to transform 4 | MathML 2.0 to LaTeX. 5 | 6 | For more information, see 7 | http://www.raleigh.ru/MathML/mmltex/index.php?lang=en 8 | 9 | Manifest 10 | -------- 11 | 12 | README this file 13 | mmltex.xsl 14 | tokens.xsl 15 | glayout.xsl 16 | scripts.xsl 17 | tables.xsl 18 | entities.xsl 19 | cmarkup.xsl 20 | 21 | Use 22 | --- 23 | 24 | There are two ways of using the library: 25 | 26 | * Use a local copy of the library. 27 | 28 | 1. Download the distribution (see below). 29 | 30 | 2. Unpack the distribution, using unzip. 31 | 32 | 3. 
In your stylesheet import or include either the main 33 | stylesheet, mmltex.xsl, or the stylesheet module you 34 | wish to use, such as tokens.xsl. This example assumes 35 | that the distribution has been extracted into the same 36 | directory as your own stylesheet: 37 | 38 | 39 | 40 | * Import or include either the main stylesheet, or the 41 | stylesheet module you wish to use, directly from the library 42 | website; http://www.raleigh.ru/MathML/mmltex/. For example: 43 | 44 | 45 | 46 | Obtaining The Library 47 | --------------------- 48 | 49 | The XSLT MathML Library is available for download as: 50 | 51 | * Zip file: http://www.raleigh.ru/MathML/mmltex/xsltml_2.1.2.zip 52 | 53 | Copyright 54 | --------- 55 | 56 | Copyright (C) 2001-2003 Vasil Yaroshevich 57 | 58 | Permission is hereby granted, free of charge, to any person 59 | obtaining a copy of this software and associated documentation 60 | files (the ``Software''), to deal in the Software without 61 | restriction, including without limitation the rights to use, 62 | copy, modify, merge, publish, distribute, sublicense, and/or 63 | sell copies of the Software, and to permit persons to whom the 64 | Software is furnished to do so, subject to the following 65 | conditions: 66 | 67 | The above copyright notice and this permission notice shall be 68 | included in all copies or substantial portions of the Software. 69 | 70 | Except as contained in this notice, the names of individuals 71 | credited with contribution to this software shall not be used in 72 | advertising or otherwise to promote the sale, use or other 73 | dealings in this Software without prior written authorization 74 | from the individuals in question. 
75 | 76 | Any stylesheet derived from this Software that is publically 77 | distributed will be identified with a different name and the 78 | version strings in any derived Software will be changed so that 79 | no possibility of confusion between the derived package and this 80 | Software will exist. 81 | 82 | Warranty 83 | -------- 84 | 85 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 86 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 87 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 88 | NONINFRINGEMENT. IN NO EVENT SHALL NORMAN WALSH OR ANY OTHER 89 | CONTRIBUTOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 90 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 91 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 92 | OTHER DEALINGS IN THE SOFTWARE. 93 | 94 | Contacting the Author 95 | --------------------- 96 | 97 | These stylesheets are maintained by Vasil Yaroshevich, . 98 | -------------------------------------------------------------------------------- /web_pipeline/mathml2latex/mmltex/README2: -------------------------------------------------------------------------------- 1 | This file is not part of the original source code. 
2 | 3 | Researched links to archived web page and Sourceforge project: 4 | 5 | https://sourceforge.net/projects/xsltml/files/xsltml/ 6 | 7 | https://web.archive.org/web/20160109063934/http://www.raleigh.ru/MathML/mmltex/index.php 8 | 9 | Google Translated to English: 10 | https://translate.google.com/translate?sl=ru&tl=en&u=https%3A%2F%2Fweb.archive.org%2Fweb%2F20160114170851%2Fhttp%3A%2F%2Fwww.raleigh.ru%2FMathML%2Fmmltex%2Findex.php 11 | -------------------------------------------------------------------------------- /web_pipeline/mathml2latex/mmltex/glayout.xsl: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | 7 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | \genfrac{}{}{ 18 | 19 | 20 | 21 | ex 22 | 23 | 24 | 0ex 25 | 26 | 27 | .05ex 28 | 29 | 30 | 31 | .2ex 32 | 33 | 34 | 35 | 36 | 37 | }{}{ 38 | 39 | 40 | \frac{ 41 | 42 | 43 | 44 | \hfill 45 | 46 | 47 | 48 | \hfill 49 | 50 | }{ 51 | 52 | \hfill 53 | 54 | 55 | 56 | \hfill 57 | 58 | } 59 | 60 | 61 | 62 | \raisebox{1ex}{$ 63 | 64 | $}\!\left/ \!\raisebox{-1ex}{$ 65 | 66 | $}\right. 67 | 68 | 69 | 70 | 71 | 72 | 73 | \sqrt[ 74 | 75 | ]{ 76 | 77 | } 78 | 79 | 80 | 81 | exception 25: 82 | \text{exception 25:} 83 | 84 | 85 | 86 | 87 | 88 | \sqrt{ 89 | 90 | } 91 | 92 | 93 | 94 | 95 | 96 | 97 | \left 98 | 99 | 100 | \ 101 | 102 | 103 | \left. 104 | 105 | 106 | 107 | \left( 108 | 109 | 110 | 111 | 112 | 113 | 114 | , 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | \right 134 | 135 | 136 | \ 137 | 138 | 139 | \right. 
140 | 141 | 142 | 143 | \right) 144 | 145 | 146 | 147 | 148 | \phantom{ 149 | 150 | } 151 | 152 | 153 | 154 | 155 | 156 | \overline{ 157 | 158 | \hspace{.2em}|} 159 | 160 | 161 | \sqrt{ 162 | 163 | } 164 | 165 | 166 | \overline{) 167 | 168 | } 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | {\displaystyle 180 | 181 | 182 | { 183 | 184 | \textstyle 185 | \scriptstyle 186 | \scriptscriptstyle 187 | 188 | 189 | 190 | \colorbox[rgb]{ 191 | 192 | 193 | 194 | }{$ 195 | 196 | 197 | \textcolor[rgb]{ 198 | 199 | 200 | 201 | }{ 202 | 203 | 204 | 205 | } 206 | 207 | 208 | $} 209 | 210 | 211 | } 212 | 213 | 214 | } 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | -------------------------------------------------------------------------------- /web_pipeline/mathml2latex/mmltex/mmltex.xsl: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | 7 | 8 | 9 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | $ 26 | 27 | $ 28 | 29 | 30 | 31 | \[ 32 | 33 | \] 34 | 35 | 36 | -------------------------------------------------------------------------------- /web_pipeline/mathml2latex/mmltex/scripts.xsl: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | 7 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | \overline{ 20 | 21 | 22 | 23 | 24 | } 25 | 26 | 27 | \overbrace{ 28 | 29 | 30 | 31 | 32 | } 33 | 34 | 35 | \overleftarrow{ 36 | 37 | 38 | 39 | 40 | } 41 | 42 | 43 | \overrightarrow{ 44 | 45 | 46 | 47 | 48 | } 49 | 50 | 51 | \overleftrightarrow{ 52 | 53 | 54 | 55 | 56 | } 57 | 58 | 59 | \underline{ 60 | 61 | 62 | 63 | 64 | 65 | } 66 | 67 | 68 | \underbrace{ 69 | 70 | 71 | 72 | 73 | 74 | } 75 | 76 | 77 | \underleftarrow{ 78 | 79 | 80 | 81 | 82 | 83 | } 84 | 85 | 86 | \underrightarrow{ 87 | 88 | 89 | 90 | 91 | 92 | } 93 | 94 | 95 | \underleftrightarrow{ 96 | 97 | 98 | 99 | 100 | 101 | } 102 | 103 | 105 | 113 | 114 | _{ 115 | 116 | }^{ 117 | 118 | } 119 | 120 | 121 | \underset{ 
122 | 123 | }{\overset{ 124 | 125 | }{ 126 | 127 | }} 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | \overline{ 153 | 154 | } 155 | 156 | 157 | \overbrace{ 158 | 159 | } 160 | 161 | 162 | \overleftarrow{ 163 | 164 | } 165 | 166 | 167 | \overrightarrow{ 168 | 169 | } 170 | 171 | 172 | \overleftrightarrow{ 173 | 174 | } 175 | 176 | 177 | \tilde{ 178 | 179 | } 180 | 181 | 182 | \check{ 183 | 184 | } 185 | 186 | 187 | \dot{ 188 | 189 | } 190 | 191 | 192 | \ddot{ 193 | 194 | } 195 | 196 | 197 | 198 | 199 | \widehat{ 200 | 201 | 202 | \hat{ 203 | 204 | 205 | } 206 | 207 | 209 | 217 | 218 | ^{ 219 | 220 | } 221 | 222 | 223 | \stackrel{ 224 | 225 | }{ 226 | 227 | } 228 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | \underline{ 244 | 245 | } 246 | 247 | 248 | \underbrace{ 249 | 250 | } 251 | 252 | 253 | \underleftarrow{ 254 | 255 | } 256 | 257 | 258 | \underrightarrow{ 259 | 260 | } 261 | 262 | 263 | \underleftrightarrow{ 264 | 265 | } 266 | 267 | 269 | 277 | 278 | _{ 279 | 280 | } 281 | 282 | 283 | \underset{ 284 | 285 | }{ 286 | 287 | } 288 | 289 | 290 | 291 | 292 | 293 | { 294 | 295 | }_{ 296 | 297 | }^{ 298 | 299 | } 300 | 301 | 302 | 303 | { 304 | 305 | }^{ 306 | 307 | } 308 | 309 | 310 | 311 | { 312 | 313 | }_{ 314 | 315 | } 316 | 317 | 318 | 319 | 320 | 321 | {}_{ 322 | 323 | } 324 | 325 | 326 | {}^{ 327 | 328 | } 329 | 330 | 331 | 332 | 333 | 334 | {} 335 | 336 | 337 | _{ 338 | 339 | } 340 | 341 | 342 | ^{ 343 | 344 | } 345 | 346 | 347 | 348 | 349 | 350 | 351 | 352 | 353 | 354 | 355 | 356 | 357 | 358 | {} 359 | 360 | 361 | _{ 362 | 363 | } 364 | 365 | 366 | ^{ 367 | 368 | } 369 | 370 | 371 | 372 | 373 | 374 | 375 | -------------------------------------------------------------------------------- /web_pipeline/mathml2latex/mmltex/tables.xsl: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | 7 | 11 | 12 | 
13 | 14 | \multicolumn{ 15 | 16 | }{c}{ 17 | 18 | } 19 | 20 | & 21 | 22 | 23 | 24 | 25 | 26 | 27 | \hfill 28 | 29 | 30 | 31 | \hfill 32 | 33 | 34 | 36 | & 37 | 38 | 39 | 40 | 41 | 42 | 43 | \\ 44 | 45 | 46 | 47 | 48 | \begin{array}{ 49 | 50 | | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | | 85 | 86 | } 87 | 88 | \hline 89 | 90 | 91 | 92 | \\ \hline 93 | 94 | \end{array} 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | -------------------------------------------------------------------------------- /web_pipeline/mathml2latex/mmltex/tokens.xsl: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | 7 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | \textcolor{red}{ 20 | 21 | } 22 | 23 | 24 | 25 | 26 | 27 | \mathrm{ 28 | 29 | } 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | \mathrm{ 41 | 42 | } 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | \left 56 | 57 | 58 | \right 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | \text{ 72 | 73 | } 74 | 75 | 76 | 77 | \phantom{\rule 78 | 79 | [- 80 | 81 | ] 82 | 83 | { 84 | 85 | 0ex 86 | 87 | 88 | }{ 89 | 90 | 0ex 91 | 92 | 93 | }} 94 | 95 | 96 | 97 | 98 | 99 | '' 100 | 101 | 102 | '' 103 | 104 | 105 | 106 | 107 | 108 | \colorbox[rgb]{ 109 | 110 | 111 | 112 | }{$ 113 | 114 | 115 | \textcolor[rgb]{ 116 | 117 | 118 | 119 | }{ 120 | 121 | 122 | 123 | 124 | \mathrm{ 125 | 126 | 127 | \mathbf{ 128 | 129 | 130 | \mathit{ 131 | 132 | 133 | \mathit{ 134 | The value bold-italic for mathvariant is not supported 135 | 136 | 137 | \mathbb{ 138 | 139 | 140 | \mathfrak{ 141 | The value bold-fraktur for mathvariant is not supported 142 | 143 | 144 | \mathcal{ 145 | 146 | 147 | 
\mathcal{ 148 | The value bold-script for mathvariant is not supported 149 | 150 | 151 | \mathfrak{ 152 | 153 | 154 | \mathsf{ 155 | 156 | 157 | \mathsf{ 158 | The value bold-sans-serif for mathvariant is not supported 159 | 160 | 161 | \mathsf{ 162 | The value sans-serif-italic for mathvariant is not supported 163 | 164 | 165 | \mathsf{ 166 | The value sans-serif-bold-italic for mathvariant is not supported 167 | 168 | 169 | \mathtt{ 170 | 171 | 172 | { 173 | Error at mathvariant attribute 174 | 175 | 176 | 177 | 178 | 179 | } 180 | 181 | 182 | } 183 | 184 | 185 | $} 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | , 221 | 222 | 223 | 224 | 225 | 226 | , 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | , 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | , 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 0,1,1 271 | 0,0,0 272 | 0,0,1 273 | 1,0,1 274 | .5,.5,.5 275 | 0,.5,0 276 | 0,1,0 277 | .5,0,0 278 | 0,0,.5 279 | .5,.5,0 280 | .5,0,.5 281 | 1,0,0 282 | .75,.75,.75 283 | 0,.5,.5 284 | 1,1,1 285 | 1,1,0 286 | 287 | Exception at color template 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | 302 | 303 | 304 | 305 | 306 | 307 | 308 | 309 | 310 | 311 | 312 | 313 | 314 | 315 | 316 | 317 | Exception at Hex2Decimal template 318 | 319 | 320 | 321 | 322 | 323 | 324 | 325 | 326 | 327 | 328 | -------------------------------------------------------------------------------- /web_pipeline/mathml2latex/output.md: -------------------------------------------------------------------------------- 1 | Gradient 2 | Let $f:{\mathbb{R}}^{n}\to \mathbb{R}$ be a scalar field. 
The gradient is 3 | $$\nabla f\left(x\right)=\left[\begin{array}{c}\frac{\mathit{\partial}f}{\mathit{\partial}{x}_{1}}\\ \vdots \\ \frac{\mathit{\partial}f}{\mathit{\partial}{x}_{n}}\end{array}\right]$$ 4 | -------------------------------------------------------------------------------- /web_pipeline/mathml2latex/output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLM360/MegaMath/52471d65673c784868629e2165470cd05f6c8975/web_pipeline/mathml2latex/output.png -------------------------------------------------------------------------------- /web_pipeline/requirements.txt: -------------------------------------------------------------------------------- 1 | resiliparse 2 | datatrove 3 | fasttext 4 | nltk 5 | tqdm 6 | bs4 7 | wget 8 | pyahocorasick 9 | fasteners 10 | tldextract -------------------------------------------------------------------------------- /web_pipeline/url_filtering/url_filter.py: -------------------------------------------------------------------------------- 1 | from typing import Iterable 2 | from datatrove.pipeline.filters.url_filter import URLFilter 3 | from datatrove.pipeline.writers.disk_base import DiskWriter 4 | from datatrove.data import Document 5 | from typing import Iterable 6 | import os 7 | import time 8 | import re 9 | import os 10 | 11 | ASSETS_PATH = "url_filtering" 12 | 13 | normalizer = re.compile(r"[^a-zA-Z0-9]+") 14 | 15 | def normalize(text, replace=""): 16 | return normalizer.sub(replace, text).lower() 17 | 18 | def parse_list(line, do_normalize=True): 19 | return {normalize(x) if do_normalize else x.strip() for x in line if x[0] != "#"} 20 | 21 | def get_list(abs_path: str, file_name: str, extra: set, do_normalize: bool = True): 22 | with open(os.path.join(abs_path, file_name)) as f: 23 | return parse_list(f, do_normalize).union(extra) 24 | 25 | class CustomURLFilterWithWhitelist(URLFilter): 26 | """ 27 | Extends URLFilter to include a whitelist 
functionality. 28 | URLs from whitelisted domains or exact whitelisted URLs will bypass all other filters. 29 | """ 30 | name = "😈Custom Url-filter With Whitelist" 31 | _requires_dependencies = ["tldextract", "fasteners", ("ahocorasick", "pyahocorasick")] 32 | 33 | def __init__( 34 | self, 35 | use_whitelist: bool = True, 36 | whitelist_domains: Iterable = None, 37 | whitelist_urls: Iterable = None, 38 | do_remove_curated_sources: bool = False, 39 | curated_domains: Iterable = None, 40 | do_load_from_cache: bool = True, 41 | do_add_extra_domain_and_urls: bool = False, 42 | exclusion_writer: DiskWriter = None, 43 | *args, 44 | **kwargs 45 | ): 46 | if do_add_extra_domain_and_urls: 47 | extra_domains, extra_urls = set(), set() 48 | blocklist_dir = os.path.join(ASSETS_PATH, "urls", "blocklist") 49 | for dirname in os.listdir(blocklist_dir): 50 | if not os.path.isdir(os.path.join(blocklist_dir, dirname)): 51 | continue 52 | extra_domains = get_list(os.path.join(blocklist_dir, dirname), "domains", extra_domains , do_normalize=False) 53 | print(f"domain size: {len(extra_domains)}") 54 | extra_urls = get_list(os.path.join(blocklist_dir, dirname), "urls", extra_urls, do_normalize=False) 55 | print(f"domain size: {len(extra_urls)}") 56 | 57 | print(f"Extra domains ({len(extra_domains)}) and urls ({len(extra_urls)})") 58 | super().__init__( 59 | extra_domains = extra_domains, 60 | extra_urls = extra_urls, 61 | exclusion_writer = exclusion_writer 62 | ) 63 | print("use extra domains and urls") 64 | else: 65 | super().__init__( 66 | exclusion_writer = exclusion_writer 67 | ) 68 | self.whitelist_domains = set(whitelist_domains or []) 69 | self.whitelist_urls = set(whitelist_urls or []) 70 | self.use_whitelist = use_whitelist 71 | self.do_remove_curated_sources = do_remove_curated_sources 72 | self.curated_domains = set(curated_domains or []) 73 | 74 | if do_load_from_cache: 75 | whitelist_dir = os.path.join(ASSETS_PATH, "urls", "whitelist") 76 | self.whitelist_domains = 
get_list(whitelist_dir, "domains", self.whitelist_domains, do_normalize=False) 77 | self.whitelist_urls = get_list(whitelist_dir, "urls", self.whitelist_urls, do_normalize=False) 78 | 79 | curated_dir = os.path.join(ASSETS_PATH, "urls", "curated") 80 | self.curated_domains = get_list(curated_dir, "domains", self.curated_domains, do_normalize=False) 81 | 82 | if not self.use_whitelist: 83 | self.whitelist_domains = set() 84 | self.whitelist_urls = set() 85 | if not self.do_remove_curated_sources: 86 | self.curated_domains = set() 87 | 88 | def filter(self, document: Document) -> bool | tuple[bool, str]: 89 | self.download_data() 90 | url = document.metadata.get("url") 91 | 92 | assert url, "Document does not have url in its metadata" 93 | url_info = self.tldextractor(url) 94 | 95 | # Check if the URL or its domain is in the whitelist 96 | if url in self.whitelist_urls or url_info.registered_domain in self.whitelist_domains or url_info.fqdn in self.whitelist_domains: 97 | return True 98 | 99 | if url_info.registered_domain in self.curated_domains or url_info.fqdn in self.curated_domains: 100 | if not self.do_remove_curated_sources: 101 | assert self.curated_domains == set() 102 | return False, "curated" 103 | 104 | # If not whitelisted, proceed with the original filtering logic 105 | return super().filter(document) 106 | -------------------------------------------------------------------------------- /web_pipeline/url_filtering/urls/adult/domains: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:d6c71c68acd2f7d28103f4a61614cfe73569060ca776b5bfa1bec5bf2843db62 3 | size 122607801 4 | -------------------------------------------------------------------------------- /web_pipeline/url_filtering/urls/adult/expressions: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/LLM360/MegaMath/52471d65673c784868629e2165470cd05f6c8975/web_pipeline/url_filtering/urls/adult/expressions -------------------------------------------------------------------------------- /web_pipeline/url_filtering/urls/adult/usage: -------------------------------------------------------------------------------- 1 | black 2 | adult 3 | porn 4 | -------------------------------------------------------------------------------- /web_pipeline/url_filtering/urls/blocklist/adult.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLM360/MegaMath/52471d65673c784868629e2165470cd05f6c8975/web_pipeline/url_filtering/urls/blocklist/adult.tar.gz -------------------------------------------------------------------------------- /web_pipeline/url_filtering/urls/blocklist/adult/domains: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:bbc3b59a265a9bda95b601d28f0a1a5524eff6276f70cdc23dd054c5a0bc1a9d 3 | size 122806347 4 | -------------------------------------------------------------------------------- /web_pipeline/url_filtering/urls/blocklist/adult/expressions: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLM360/MegaMath/52471d65673c784868629e2165470cd05f6c8975/web_pipeline/url_filtering/urls/blocklist/adult/expressions -------------------------------------------------------------------------------- /web_pipeline/url_filtering/urls/blocklist/adult/usage: -------------------------------------------------------------------------------- 1 | black 2 | adult 3 | porn 4 | -------------------------------------------------------------------------------- /web_pipeline/url_filtering/urls/blocklist/agressif.tar.gz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/LLM360/MegaMath/52471d65673c784868629e2165470cd05f6c8975/web_pipeline/url_filtering/urls/blocklist/agressif.tar.gz -------------------------------------------------------------------------------- /web_pipeline/url_filtering/urls/blocklist/agressif/domains: -------------------------------------------------------------------------------- 1 | 118.123.4.224 2 | 128.121.249.189 3 | 14words.com 4 | 163.177.220.59 5 | 183.61.166.187 6 | 192.67.198.4 7 | 192.67.198.49 8 | 193.96.188.143 9 | 195.4.52.48 10 | 195.63.211.202 11 | 199.93.70.2 12 | 203.2.124.18 13 | 204.181.176.53 14 | 204.50.24.185 15 | 205.160.14.21 16 | 205.160.14.22 17 | 205.167.142.107 18 | 205.167.142.6 19 | 205.241.44.90 20 | 206.113.230.2 21 | 206.160.0.11 22 | 206.160.0.248 23 | 206.160.0.252 24 | 206.168.114.50 25 | 206.168.114.52 26 | 206.244.69.51 27 | 206.31.204.150 28 | 207.201.162.40 29 | 207.231.72.88 30 | 207.36.45.220 31 | 207.70.7.168 32 | 207.71.8.68 33 | 208.185.127.162 34 | 208.185.127.163 35 | 208.48.246.80 36 | 208.55.206.181 37 | 209.103.172.199 38 | 209.123.16.9 39 | 209.126.159.27 40 | 209.15.74.4 41 | 209.15.84.106 42 | 209.161.0.32 43 | 209.189.198.102 44 | 209.195.130.178 45 | 209.196.188.172 46 | 209.197.123.166 47 | 209.204.200.140 48 | 209.204.217.45 49 | 209.240.128.4 50 | 209.250.128.7 51 | 209.35.194.183 52 | 209.61.200.137 53 | 212.114.150.187 54 | 212.227.118.69 55 | 212.227.174.218 56 | 212.38.173.23 57 | 213.130.63.232 58 | 216.100.98.13 59 | 216.100.98.17 60 | 216.100.98.24 61 | 216.100.99.13 62 | 216.110.132.232 63 | 216.110.143.153 64 | 216.126.73.71 65 | 216.127.68.84 66 | 216.131.71.161 67 | 216.150.67.66 68 | 216.169.106.11 69 | 216.218.248.244 70 | 216.219.253.193 71 | 216.40.195.47 72 | 216.40.213.201 73 | 216.43.175.114 74 | 62.116.137.145 75 | 62.116.137.155 76 | 62.116.138.132 77 | 62.116.138.135 78 | 62.116.138.140 79 | 62.116.138.142 80 | 62.116.140.196 81 | 63.218.152.42 82 | 63.236.214.203 83 | 63.249.227.174 84 | 
64.156.139.229 85 | 64.239.80.146 86 | 64.239.87.165 87 | 64.70.225.193 88 | 64.82.99.102 89 | 66.175.2.27 90 | 66.255.14.41 91 | 66.28.60.94 92 | 66.78.36.83 93 | 6killshit.tumblr.com 94 | 81.88.35.41 95 | 81.88.35.42 96 | _video-bagarre.com 97 | aaargh-international.org 98 | aaargh.com.mx 99 | abandonfear.tumblr.com 100 | abbc.com 101 | adelaideinstitute.org 102 | aevasitcomno.tk 103 | aime-et-sers.com 104 | air-photo.com 105 | al-jinan.org 106 | algerie-francaise.org 107 | americandefenseleague.com 108 | americannaziparty.com 109 | americanskinheads.com 110 | amren.com 111 | anp14.com 112 | anu.org 113 | anus.com 114 | aryan-nation.org 115 | aryan-nations.org 116 | aryan88.com 117 | aryannations88.com 118 | aryannationsknightskkk.org 119 | aryanwear.com 120 | aufmarsch.de 121 | auslaenderstopp.net 122 | azelin.files.wordpress.com 123 | bagarres.be 124 | bagarres.com 125 | barnesreview.org 126 | bayouknights.org 127 | bestgore.com 128 | bhsweden.tsx.org 129 | blacksandjews.com 130 | blancheurope.com 131 | blogdemariepauledarchicourt.hautetfort.com 132 | bloodandhonour.com 133 | bloodandhonour.de 134 | bloodshows.com 135 | bnp.net 136 | buchanan.org 137 | buendnis-rechts.de 138 | bulldog88.tsx.org 139 | burks.de 140 | cadaver.org 141 | campaign.davidduke.com 142 | cenobite.com 143 | christiangallery.com 144 | christianseparatist.org 145 | churchfliers.com 146 | civil-liberties.com 147 | codoh.com 148 | codoh.org 149 | cofcc.org 150 | compuserb.com 151 | contrelislam.org 152 | creator.org 153 | crusader.net 154 | daaargh.narod.ru 155 | dailyrotten.com 156 | dakingnv.cn 157 | dakingnv.com 158 | deadhouse.org 159 | deadhouse.xyz 160 | dealer-lejeu.com 161 | deathnet.com 162 | democratie-participative.com 163 | democratie-participative.fr 164 | democratie-participative.net 165 | democratie.participative.com 166 | democratie.participative.fr 167 | democratie.participative.net 168 | democratieparticipative.biz 169 | democratieparticipative.com 170 | 
democratieparticipative.fun 171 | democratieparticipative.host 172 | democratieparticipative.lol 173 | democratieparticipative.net 174 | democratieparticipative.online 175 | democratieparticipative.org 176 | democratieparticipative.site 177 | democratieparticipative.space 178 | democratieparticipative.website 179 | democrativeparticipative.link 180 | der-fuehrer.org 181 | der-stuermer.org 182 | deutsches-rechtsbuero.de 183 | deutsches-reich.de 184 | deviantsockpuppet.com 185 | dsz-verlag.de 186 | duke.org 187 | ety.com 188 | fa.federation-anarchiste.org 189 | faem.com 190 | fkun.de 191 | flawlesslogic.com 192 | forumpatriote.org 193 | fpp.co.uk 194 | france-avenir.com 195 | franceavenir.free.fr 196 | frank-rennicke.de 197 | freedomsite.org 198 | freikorps.com 199 | freimaurer.org 200 | front-comtois.com 201 | gaelle.hautetfort.com 202 | globalfire.tv 203 | godhatesfags.com 204 | gorecenter.com 205 | goresee.com 206 | gudeian.50megs.com 207 | guderian.ds4a.com 208 | hammerskins.com 209 | hangemhighrecords.com 210 | hanse-records.de 211 | harold-covington.org 212 | heathenfront.org 213 | heimatkunde.tsx.org 214 | heimattreue-jugend.de 215 | hitler.org 216 | hitlerisgod.com 217 | hoffman-info.com 218 | holywar.org 219 | iahushua.com 220 | ihr.org 221 | innerdepravity.com 222 | internationalknights.de 223 | intransigeants.com 224 | jdo.org 225 | jeffsarchive.com 226 | jesus-is-lord.com 227 | jetueunami.com 228 | jewwatch.com 229 | jihadology.net 230 | johnsack.com 231 | jungefreiheit.de 232 | k-k-k.com 233 | kamellia.com 234 | kekma.net 235 | killerkomics.com 236 | kingidentity.com 237 | kkk.bz 238 | kkk.com 239 | kkkk.net 240 | kriegsfront.tsx.org 241 | kukluxklan.net 242 | kukluxklan.org 243 | kulmbacher.freeservers.com 244 | le-projet-juif.com 245 | libreopinion.com 246 | louisbeam.com 247 | mankind.org 248 | melvig.org 249 | metapedia.org 250 | micetrap.net 251 | midgaard.org 252 | milgear.fi 253 | missiontoisrael.org 254 | mnsf.info 255 | modelguns-worldwide.com 
256 | musicalterrorists.com 257 | mysticknights.org 258 | n-a-f.com 259 | naawp.com 260 | natall.com 261 | natvan.com 262 | nazi-lauck-nsdapao.com 263 | nazi.org 264 | newgrounds.com 265 | neworderknights.com 266 | nit.de 267 | nizkor.org 268 | nocturnevideoculte.com 269 | noontidepress.com 270 | nordfront.de.cx 271 | nordland.net 272 | nordzeit.de 273 | normanfinkelstein.com 274 | npd.net 275 | nsbm.org 276 | nseuropa.org 277 | nsm88.com 278 | nswpp.org 279 | nukeisrael.com 280 | oeuvre-francaise.com 281 | oeuvrefrancaise.com 282 | oikrach.com 283 | ostara.org 284 | ourhero.com 285 | paaargh.blogspot.com 286 | panzerfaust.com 287 | pathcom.com 288 | patriot.dk 289 | pornhulknews.com 290 | posse-comitatus.org 291 | propatria.org 292 | queinsania.com 293 | racnet.tsx.org 294 | radioislam.net 295 | radioislam.org 296 | rahowa.com 297 | rahowa.us 298 | resist.com 299 | resistance.com 300 | revilo-oliver.com 301 | revisionism.com 302 | revisionists.com 303 | rotten.com 304 | rudolf-hess.org 305 | sanctioned-suicide.net 306 | sanctioned-suicide.org 307 | school-fights.com 308 | scripturesforamerica.org 309 | seegore.com 310 | seek-info.com 311 | siegener-baerensturm.de 312 | signal-online.de 313 | sigrdrifa.com 314 | site88.8m.com 315 | skinheadxx.tsx.org 316 | sogore.com 317 | sos-racaille.org 318 | sosfrance.com 319 | splcenter.org 320 | spotlight.org 321 | ssenterprises.com 322 | ssman.com 323 | stormfront.org 324 | thiazi.net 325 | thinkmasa.org 326 | thulenet.com 327 | thulepublications.com 328 | tightrope.cc 329 | trashercorpse.free.fr 330 | tt-v.de 331 | ukar.org 332 | ungraindesable.the-savoisien.com 333 | unitedskins.com 334 | unitedstrike.com 335 | vanguardnewsnetwork.com 336 | vho.org 337 | volkermord.com 338 | volksgemeinschaft.org 339 | wakeupordie.com 340 | wckkkk.com 341 | wcotc.com 342 | webresistant.over-blog.com 343 | whemporium.com 344 | whitehonor.com 345 | whitepower.com 346 | whitepride.com 347 | whitepride.net 348 | whiterace.com 349 | 
whiteracist.com 350 | whitesingles.com 351 | whiteunitypress.com 352 | widerstand.com 353 | williscarto.com 354 | wno.org 355 | wotansdungeon.tsx.org 356 | wpww.com 357 | x-guns.com 358 | yoderanium.com 359 | -------------------------------------------------------------------------------- /web_pipeline/url_filtering/urls/blocklist/agressif/expressions: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLM360/MegaMath/52471d65673c784868629e2165470cd05f6c8975/web_pipeline/url_filtering/urls/blocklist/agressif/expressions -------------------------------------------------------------------------------- /web_pipeline/url_filtering/urls/blocklist/agressif/urls: -------------------------------------------------------------------------------- 1 | <<<<<<< HEAD 2 | version https://git-lfs.github.com/spec/v1 3 | oid sha256:c32c45f03148918256626cb9bf1a0a230e83a873230baa1617a12605471a26d8 4 | size 837 5 | ======= 6 | 129.105.212.34/~abutz 7 | 193.195.1.1/natofeur 8 | 204.71.88.20/ygg 9 | 204.71.88.21/ygg 10 | 204.71.88.22/ygg 11 | 204.71.88.23/ygg 12 | 64.15.239.150/~wikinger-versand 13 | adl.org/poisoning_web 14 | anjora.de/nwo 15 | archive.org/details/qatal3 16 | bigfoot.com/~wikinger-versand 17 | come.to/bah 18 | come.to/heilkroiter 19 | come.to/ndj 20 | concentric.net/~nwk 21 | corax.org/revisionism 22 | cri.univ-tlse1.fr/tools/test_filtrage/agressif/ 23 | cycad.com/cgi-bin/upstream 24 | ddc.net/ygg 25 | demon.co.uk/natofeur 26 | encyclopediadramatica.se/aborigines 27 | fortunecity.com/boozers/whitehart 28 | geocities.com/allo03714 29 | geocities.com/blaaargh8864 30 | go.to/bloodandhonour 31 | imbris.net/~fourteenwords 32 | members.theglobe.com/klanman1 33 | nidlink.com/~aryanvic 34 | nidlink.com/~fourteenwords 35 | ozemail.com.au/~drumbeat 36 | pubweb.acns.nwu.edu/~abutz 37 | relaypoint.net/~lsf 38 | ruspatriot.com/skinhead 39 | twitter.com/anp14 40 | website.yabz.net/chaaargh 41 | ziplink.net/~bright 42 | 
>>>>>>> 72ba839f1967cae87f00a57fe724930140efc90a 43 | -------------------------------------------------------------------------------- /web_pipeline/url_filtering/urls/blocklist/agressif/usage: -------------------------------------------------------------------------------- 1 | black 2 | aggressive 3 | -------------------------------------------------------------------------------- /web_pipeline/url_filtering/urls/blocklist/arjel.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLM360/MegaMath/52471d65673c784868629e2165470cd05f6c8975/web_pipeline/url_filtering/urls/blocklist/arjel.tar.gz -------------------------------------------------------------------------------- /web_pipeline/url_filtering/urls/blocklist/arjel/domains: -------------------------------------------------------------------------------- 1 | 200poker.fr 2 | 200pour100.fr 3 | 200pour100poker.fr 4 | 200pourcent.fr 5 | 200pourcentpoker.fr 6 | 888.fr 7 | 888poker.fr 8 | acfpoker.fr 9 | barrierepoker.fr 10 | betclic-mobile.fr 11 | betclic.fr 12 | betclick-mobile.fr 13 | betclickmobile.fr 14 | betclicmobile.fr 15 | betnet.fr 16 | bwin.fr 17 | chilipari.fr 18 | chilipoker.fr 19 | coupedumonde-pari.fr 20 | eurosportbet.fr 21 | everestpoker.fr 22 | football-pari.fr 23 | football365.fr 24 | france-pari.fr 25 | friendbet.fr 26 | fulltiltpoker.fr 27 | gamebookers.fr 28 | genybet.fr 29 | intralot.fr 30 | intralotpari.fr 31 | jechope.com 32 | jeux365.fr 33 | joa-club.fr 34 | joa-online.fr 35 | joaclub.fr 36 | joaonline.fr 37 | leturf.fr 38 | luckyjeux.fr 39 | mypok.fr 40 | pacificpoker.fr 41 | parions974.fr 42 | parionsweb.fdj.fr 43 | parionsweb.fr 44 | paris365.fr 45 | partouche.fr 46 | partybets.fr 47 | partypoker.fr 48 | peoplesbet.fr 49 | peoplesnetwork.fr 50 | pkr.fr 51 | placedesparis.fr 52 | pmu.fr 53 | poker365.fr 54 | poker83.fr 55 | pokerstars.fr 56 | pokersubito.fr 57 | pokerxtrem.fr 58 | sajoo.fr 59 | sportnco.fr 60 | titan.fr 
61 | titanpartners.fr 62 | tranchant-poker.fr 63 | tranchantpoker.fr 64 | unibet.fr 65 | winamax.fr 66 | winga.fr 67 | wpt.fr 68 | wsop.fr 69 | zeturf.fr 70 | -------------------------------------------------------------------------------- /web_pipeline/url_filtering/urls/blocklist/arjel/urls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLM360/MegaMath/52471d65673c784868629e2165470cd05f6c8975/web_pipeline/url_filtering/urls/blocklist/arjel/urls -------------------------------------------------------------------------------- /web_pipeline/url_filtering/urls/blocklist/arjel/usage: -------------------------------------------------------------------------------- 1 | white 2 | black 3 | -------------------------------------------------------------------------------- /web_pipeline/url_filtering/urls/blocklist/chat.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLM360/MegaMath/52471d65673c784868629e2165470cd05f6c8975/web_pipeline/url_filtering/urls/blocklist/chat.tar.gz -------------------------------------------------------------------------------- /web_pipeline/url_filtering/urls/blocklist/chat/domains: -------------------------------------------------------------------------------- 1 | 12buzz.com 2 | 193.238.160.62 3 | 193.238.162.21 4 | 194.130.106.132 5 | 195.33.103.52 6 | 207.46.110.254 7 | 207.46.110.48 8 | 207.68.178.239 9 | 208.81.191.110 10 | 209.67.215.236 11 | 213.199.154.11 12 | 213.199.154.54 13 | 213.91.8.214 14 | 216.129.112.65 15 | 216.129.112.66 16 | 216.129.112.67 17 | 216.129.112.68 18 | 216.129.112.69 19 | 216.129.112.88 20 | 216.129.126.66 21 | 216.178.160.34 22 | 216.32.66.235 23 | 216.32.67.212 24 | 216.32.68.171 25 | 216.32.84.236 26 | 321chat.com 27 | 47.91.114.71 28 | 47.91.122.46 29 | 64.13.152.67 30 | 64.92.173.122 31 | 69.36.226.107 32 | 69.36.226.108 33 | 69.36.226.109 34 | 69.36.226.134 35 | 
69.36.226.135 36 | 69.36.226.141 37 | 69.36.226.142 38 | 69.36.226.143 39 | 69.36.226.144 40 | 69.36.226.145 41 | 69.36.226.146 42 | 69.36.226.147 43 | 69.36.226.148 44 | 69.36.226.149 45 | 69.36.250.11 46 | 69.36.250.12 47 | 69.36.250.13 48 | 69.36.250.14 49 | 69.36.250.15 50 | 69.36.250.16 51 | 69.36.250.17 52 | 69.36.250.18 53 | 69.36.250.19 54 | 69.36.250.20 55 | 69.36.250.21 56 | 69.36.250.22 57 | 69.36.250.23 58 | 69.36.250.24 59 | 69.36.250.25 60 | 69.36.250.26 61 | 69.36.250.27 62 | 69.36.250.28 63 | 69.36.250.29 64 | 69.36.250.30 65 | 69.36.250.31 66 | 69.36.250.32 67 | 69.36.250.33 68 | 69.36.250.35 69 | 69.36.250.36 70 | 69.36.250.37 71 | 69.36.250.68 72 | 69.36.250.9 73 | 72.21.57.84 74 | 72.232.63.35 75 | 85.184.4.4 76 | 8ch.net 77 | aimexpress.oscar.aol.com 78 | airaim.com 79 | ajaxim.org 80 | api.msn.com 81 | assets.msn.com 82 | azarlive.com 83 | aznstar.free.fr 84 | babel.com 85 | bantu.com 86 | batepapo.uol.com.br 87 | bazoocam.org 88 | big-kiss.com 89 | blockedsuks.co.nr 90 | bloochat.com 91 | bonplanchat.com 92 | chaat.fr 93 | chapatiz.com 94 | chat-paradise.com 95 | chat.nrj.fr 96 | chat.org 97 | chat.ru 98 | chat.voila.fr 99 | chateagratis.net 100 | chateandogratis.org 101 | chateaya.org 102 | chatenabled.mail.google.com 103 | chatiw.me 104 | chatroom.conexionplacer.com 105 | chatroulette.com 106 | chatteurs.com 107 | clientless.net 108 | coco.fr 109 | communicationtube.com 110 | communicationtube.net 111 | crisp.chat 112 | discord.com 113 | discordapp.com 114 | e-messenger.net 115 | e-messenget.net 116 | easymessage.net 117 | easymessenger.net 118 | ebuddy.com 119 | emessenger.cl 120 | express.instan-t.com 121 | filter.msn.com 122 | gateway.messenger.live.com 123 | gazzag.com 124 | hopster.com 125 | i3connect.com 126 | ibypass.com 127 | icq.com 128 | iloveim.co.uk 129 | iloveim.com 130 | imaginarlo.com 131 | imhaha.com 132 | imo.im 133 | imtiger.com 134 | imunitive.com 135 | imvu.com 136 | interactiveni.com 137 | inversas.jazztel.es 138 | 
izuz.net 139 | jeempo.com 140 | jivochat.com 141 | jpager.yahoo.com 142 | jwchat.org 143 | kiwibox.com 144 | kmess.sourceforge.ne 145 | kolikoli.tk 146 | koolim.com 147 | laffer.sourceforge.net 148 | livechat.com 149 | livechatinc.com 150 | livechatinc.net 151 | liveperson.com 152 | loovchat.com 153 | mabber.com 154 | mangeloo.com 155 | mastaline.com 156 | mbm3550nl1n3.siteburg.com 157 | meebo.com 158 | meebo.com.br 159 | meebo.cust.layer42.net 160 | meebome.com 161 | mercury.to 162 | mess.be 163 | messbrasil.cidadeinternet.com.br 164 | messbrasil.com.br 165 | messenger.com 166 | messenger.hotmail.com 167 | messenger.msn.com 168 | messenger.sapo.pt 169 | messenger.services.live.com 170 | messenger.uol.com.br 171 | messenger.yahoo.com 172 | messengerfx.com 173 | messengerfx.com.br 174 | messfreak.be 175 | messplaza.nl 176 | mijnmessenger.nl 177 | mingle.pt 178 | miranda-im.org 179 | msecnd.net 180 | msedge.net 181 | msgweb.nl 182 | msn.audiowatcher.com 183 | msn2go.com 184 | msn2go.com.br 185 | msnanywhere.com 186 | msnfanatic.com 187 | msnger.com 188 | msnskins.be 189 | myoms.net 190 | ninemsn.com 191 | nootmobile.com 192 | ntp.msn.com 193 | omegle.com 194 | onlinemessenger.nl 195 | orkut.com 196 | orkut.com.br 197 | phonefox.com 198 | picachat.com 199 | pidgin.im 200 | piglet-im.com 201 | piglet.0900provider.nl 202 | plugoo.com 203 | polysolve.com 204 | pucinhell.mine.nu 205 | racewarkingdoms.com 206 | radiusim.com 207 | reverse.layeredtech.com 208 | screenname.aol.com 209 | skype.com 210 | skype.net 211 | skypeassets.com 212 | skypeassets.net 213 | smail.fr 214 | snapchat.com 215 | snapchatweb.com 216 | snimmer.com 217 | spidermessenger.com 218 | stopennui.com 219 | sweetim.com 220 | t-messenger.com 221 | talk.google.com 222 | talkgadget.google.com 223 | tchat-enligne.fr 224 | tchatgratuit.eu 225 | tchatteur.com 226 | thevirtualbrowser.com 227 | threema.ch 228 | toc.oscar.aol.com 229 | toperkut.com 230 | trillian.cc 231 | trouter.io 232 | userapi.com 233 | vk.com 
234 | vypress.com 235 | wablet.com 236 | wbmsn.net 237 | web2messenger.com 238 | webgama.789mb.com 239 | webmessenger.com 240 | webmessenger.com.br 241 | webmessenger.msn.com 242 | webmessenger.msn.es 243 | webmessenger.yahoo.com 244 | webuzztogether.com 245 | whatsapp.net 246 | x-chat.fr 247 | your-freedom.net 248 | -------------------------------------------------------------------------------- /web_pipeline/url_filtering/urls/blocklist/chat/urls: -------------------------------------------------------------------------------- 1 | <<<<<<< HEAD 2 | version https://git-lfs.github.com/spec/v1 3 | oid sha256:b92ae419a1c37bd77c7f65466291bf17efaf9cd4a27105e331f49d6af732c411 4 | size 476 5 | ======= 6 | 207.46.5.10/gateway/gateway.dll 7 | 212.19.193.108/servlet/login 8 | 72.36.146.44/servlets/login 9 | chrishemple.co.uk/proxy 10 | cri.univ-tlse1.fr/tools/test_filtrage/chat/ 11 | douradina.pr.gov.br/jacare 12 | ec.rdn.it/sms.asp 13 | facebook.com/ajax/chat 14 | fleo.com.ar/chatbox/ 15 | freepgs.com/defilter 16 | google.com/talk 17 | jabber.meta.net.nz/webmsg/register.php 18 | leamonde.net/im/index.php 19 | mail.google.com/mail/channel/bind 20 | messenger-online.com/emessenger.php 21 | researchhaven.com/chat.htm 22 | webtal.com.br/imagens/msn.html 23 | >>>>>>> 72ba839f1967cae87f00a57fe724930140efc90a 24 | -------------------------------------------------------------------------------- /web_pipeline/url_filtering/urls/blocklist/chat/usage: -------------------------------------------------------------------------------- 1 | black 2 | white 3 | -------------------------------------------------------------------------------- /web_pipeline/url_filtering/urls/blocklist/dating.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLM360/MegaMath/52471d65673c784868629e2165470cd05f6c8975/web_pipeline/url_filtering/urls/blocklist/dating.tar.gz 
-------------------------------------------------------------------------------- /web_pipeline/url_filtering/urls/blocklist/dating/urls: -------------------------------------------------------------------------------- 1 | a-deux.net/rencontre 2 | askmen.com/dating/ 3 | betolerant.fr/rencontre-ados-jeunes-lesbiennes/1.html 4 | cri.univ-tlse1.fr/tools/test_filtrage/dating/ 5 | divorceoumonop.a-deux.net/rencontre 6 | forum.ados.fr/love/amour/sites-rencontres-sujet_50847_1.htm 7 | gran-angular.net/categoria/citas/ 8 | habitamos.com/list/418/ 9 | malianteo.com/foros/f25/ 10 | skyrock.com/rencontres 11 | -------------------------------------------------------------------------------- /web_pipeline/url_filtering/urls/blocklist/dating/usage: -------------------------------------------------------------------------------- 1 | black 2 | dating 3 | -------------------------------------------------------------------------------- /web_pipeline/url_filtering/urls/blocklist/ddos.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLM360/MegaMath/52471d65673c784868629e2165470cd05f6c8975/web_pipeline/url_filtering/urls/blocklist/ddos.tar.gz -------------------------------------------------------------------------------- /web_pipeline/url_filtering/urls/blocklist/ddos/domains: -------------------------------------------------------------------------------- 1 | 4bidden.info 2 | agebooter.com 3 | alphastress.com 4 | anonboot.com 5 | anonsecurityteam.com 6 | anonymous-stresser.com 7 | anonymous-stresser.net 8 | api-stresser.me 9 | apocalypse-solutions.com 10 | arkbooter.fr 11 | assasinsbooter.altervista.org 12 | astrostress.com 13 | atom-stresser.com 14 | atomstress.org 15 | aurastresser.com
16 | avengestresser.com 17 | b-h.us 18 | battle.pw 19 | begayage-stresser.com 20 | bemybooter.eu 21 | best-ddos-tool.ilovecassola.it 22 | beststresser.com 23 | blink-stresser.000webhostapp.com 24 | blunter.xyz 25 | boot-stresser.avpop37.com 26 | boot-stresser.duri46.com 27 | boot.lu 28 | boot.ml 29 | boot4free.com 30 | booter-panel.cnn-02.com 31 | booter-panel.sunlyfc.com 32 | booter-sales.hourb.com 33 | booter-vip.infonhadat.org 34 | booter.club 35 | booter.eu 36 | booter.im 37 | booter.in 38 | booter.is 39 | booter.ninja 40 | booter.org 41 | booter.pw 42 | booter.sx 43 | booter.vip 44 | booter.xyz 45 | bootme.pro 46 | bootr.org 47 | bootyou.net 48 | botstress.com 49 | bullstresser.com 50 | bullstresser.net 51 | bullstresser.to 52 | buybooters.com 53 | buyddos.com 54 | buzzbooter.info 55 | celerystresser.com 56 | celeste-stresser.xyz 57 | chargen.cf 58 | city-stresser.alwaysdata.net 59 | city-stressing.alwaysdata.net 60 | cloud-hosts.tk 61 | cnstresser.com 62 | connectionstresser.com 63 | crazyamp.me 64 | critical-boot.com 65 | cstress.net 66 | cyber-hub.pw 67 | cyber-sst.com 68 | cyberstresser.org 69 | cybervm.io 70 | darkbooter.com 71 | darkstresser.info 72 | darkstresser.net 73 | darkstresser.nl 74 | darlingstress.com 75 | databooter.com 76 | ddgu.ddos-guard.net 77 | ddos-fighter.com 78 | ddos-him.com 79 | ddos-ip.com 80 | ddos-ovh-v6.000webhostapp.com 81 | ddos-stress.eu 82 | ddos-stress.strayrounds.com 83 | ddos.city 84 | ddos.kr 85 | ddos.tools 86 | ddosbooter.link 87 | ddosbreak.com 88 | ddosclub.com 89 | ddoser-online.studyfund.com 90 | ddoser.xyz 91 | ddosforhire.net 92 | ddosit.net 93 | ddosit.us 94 | ddossite.com 95 | ddostheworld.com 96 | deadlyboot.net 97 | defcon.pro 98 | defconpro.net 99 | defianceprotocol.com 100 | dejabooter.com 101 | destressbooter.com 102 | destressnetworks.com 103 | deucalion.us 104 | diamond-stresser.net 105 | diamond-stresser.pw 106 | diebooter.com 107 | diebooter.net 108 | divinestresser.com 109 | dosarrest.com 110 | 
doshackers.tk 111 | down-stresser.alwaysdata.net 112 | down-stresser.com 113 | down-stresser.us 114 | downed.io 115 | downed.sx 116 | downthem.org 117 | dreamstresser.com 118 | ebolastresser.com 119 | emaizstresser.net 120 | emo-stresser.com 121 | energy-stresser.000webhostapp.com 122 | energy-stresser.alwaysdata.net 123 | equinoxstresser.net 124 | equivalentstresser.net 125 | etwork-stressing.net 126 | every-stresser.000webhostapp.com 127 | every-stresser.com 128 | evil-stress.xyz 129 | evilbooter.net 130 | exercy-stresser.alwaysdata.net 131 | exile-stresser.net 132 | exitus.to 133 | exostress.in 134 | expressdown.com 135 | fagstresser.net 136 | fiberstresser.com 137 | flood.to 138 | foreverinfamous.com 139 | formalitystresser.com 140 | free-boot.to 141 | free-boot.xyz 142 | free-ip-grabber.ilovecassola.it 143 | free-ip-puller.ilovecassola.it 144 | free-ip-stresser.sushinarii.com 145 | free-stresser.authenticbrownsstore.com 146 | free-stresser.ilovecassola.it 147 | free-stresser.sweetaires.com 148 | freeboot.pw 149 | freebooter4.me 150 | freeipstress.com 151 | freeipstresser.net 152 | freestresser.to 153 | freestresser.xyz 154 | freezystresser.nl 155 | getsmack.de 156 | grimbooter.com 157 | hardstresser.com 158 | havoc-security.pw 159 | hazebooter.com 160 | heavystresser.com 161 | hestresser.com 162 | heydos.cc 163 | hornystress.me 164 | howtoddosattack.com 165 | hydrostress.com 166 | hydrostress.net 167 | hyperstresser.com 168 | i-b.co 169 | iddos.net 170 | igbangbooter.com 171 | imsocool.info 172 | inboot.me 173 | infectedstresser.com 174 | infectedstresser.net 175 | instabooter.com 176 | instant-stresser.com 177 | instant-stresser.surverybot.com 178 | instantdown-stresser.alwaysdata.net 179 | instinctproducts.com 180 | invalid.pw 181 | ionbooter.com 182 | ip-booter-me.ilovecassola.it 183 | ip-booter-net.play3nvvip.com 184 | ip-stresser-tor.yonkersbridal.com 185 | ip-stresser-xbox.hdxba.com 186 | ip-stresser.icee-pdrp.com 187 | ip-stresser.pst-2020.com 188 | 
ipboot.xyz 189 | ipstress.in 190 | ipstresser.co 191 | ipstresser.com 192 | ipstresser.lidamorgenstein.net 193 | ipstresser.pw 194 | ipstresser.wtf 195 | ipstresstest.com 196 | iridiumstresser.net 197 | isitdownyet.com 198 | jitterstresser.com 199 | k-stress.pw 200 | kryptonic.pw 201 | kth-stress.tk 202 | last-day.xyz 203 | layer-4.com 204 | layer-stresser.alwaysdata.net 205 | layer7-security.net 206 | layer7-stresser.com 207 | layer7-stresser.xyz 208 | layer7.pw 209 | legion.cm 210 | legionboot.com 211 | lifetimeboot.com 212 | lightstress.in 213 | lizardstresser.su 214 | logicstresser.net 215 | loic-sourceforge-net.ilovecassola.it 216 | masterboot.net 217 | maxidown.com 218 | mega-stresser.us 219 | mercilesstresser.com 220 | meteor-stresser.com 221 | meteor-stresser.to 222 | minecraftstresser.com 223 | mini-booter.com 224 | moscow-stress.xyz 225 | mystresser.com 226 | mythicalstress.xyz 227 | narcos-stresser.000webhostapp.com 228 | national-stresser.com 229 | national-stresser.net 230 | netbreak.ec 231 | netspoof.com 232 | netspoof.net 233 | netstress.net 234 | netstress.org 235 | network-stresser.alwaysdata.net 236 | network-stressing.net 237 | network.rip 238 | networkstress.com 239 | networkstress.xyz 240 | networkstresser.com 241 | networkstresser.net 242 | neverddos.com 243 | nice-stresser.alwaysdata.net 244 | nightlystresser.ml 245 | nightmarestresser.com 246 | ninjastresser.com 247 | nismitstresser.net 248 | nodestress.tw 249 | nonymousbooter.com 250 | nstress.com 251 | nstresser.net 252 | nuke.pe.hu 253 | obeystresser.com 254 | obliterateproducts.com 255 | olympusstresser.org 256 | omegastresser.com 257 | onestress.com 258 | onestresser.net 259 | onionstresser.com 260 | ooter.io 261 | optimusstresser.com 262 | orcahub.com 263 | orphicsecurityteam.com 264 | ovh-booter.com 265 | ovh-ip-test.ilovecassola.it 266 | ozzy-stresser.000webhostapp.com 267 | paid-booter.operainlove.it 268 | parabooter.com 269 | penis-stresser.000webhostapp.com 270 | 
phoenixstresser.com 271 | pineapple-stresser.com 272 | pokent.com 273 | power-ddoser.ilovecassola.it 274 | power-ddoser.resoluteshoppingsite.com 275 | power-stress.pw 276 | powerapi.info 277 | powerapiv2.com 278 | powerdos.co.uk 279 | powerstress.com 280 | powerstresser.com 281 | privateroot.fr 282 | psn-ddos.alwaysdata.net 283 | psn-stress.alwaysdata.net 284 | pstresser.com 285 | purestress.net 286 | quantumbooter.net 287 | quantumstress.net 288 | quez.in 289 | quezstresser.com 290 | quezstresser.in 291 | rackstress.pw 292 | ragebooter.com 293 | ragebooter.net 294 | rapidstresser.com 295 | rawlayer.com 296 | rcahub.com 297 | reafstresser.ga 298 | realstresser.com 299 | rebellionstresser.com 300 | relevantstress.com 301 | renegade-products.net 302 | request.rip 303 | respawn.ca 304 | restricted-stresser.info 305 | riotstresser.com 306 | ripstresser.net 307 | routerslap.com 308 | royalbooter.de 309 | ryptonic.pw 310 | securestress.pw 311 | sharkstresser.com 312 | shawty.club 313 | signalstresser.com 314 | silence-stresser.com 315 | silentstress.wtf 316 | skidbooter.info 317 | sleek.to 318 | smack.rip 319 | snowstresser.net 320 | spacejump.xyz 321 | spboot.net 322 | specialistsservers.tk 323 | speed-stresser.com 324 | sst.wtf 325 | stagestresser.com 326 | stormstresser.net 327 | str3ssed.co 328 | str3ssed.me 329 | stress-analysis.ilovecassola.it 330 | stress-me.io 331 | stress-me.net 332 | stress.alwaysdata.net 333 | stressboss.net 334 | stressed.pw 335 | stresser-ip-booter.icon188.asia 336 | stresser-ip-booter.taj999exch.com 337 | stresser-ip-booter.xero.news 338 | stresser.app 339 | stresser.cc 340 | stresser.club 341 | stresser.in 342 | stresser.net 343 | stresser.network 344 | stresser.nstats.pw 345 | stresser.org 346 | stresser.ovh 347 | stresser.ru 348 | stresser.world 349 | stresserit.com 350 | stressit.club 351 | stressthem.to 352 | stressy.org 353 | strong-stresser.000webhostapp.com 354 | strong-stresser.com 355 | strongest-booter.hq7899.com 356 | 
stuxstresser.com 357 | superstresser.com 358 | supremesecurityteam.com 359 | sylumstresser.com 360 | synstress.net 361 | syrix-stresser.xyz 362 | thebestbooters.com 363 | thunderstresser.me 364 | time-stresser.pw 365 | titaniumbooter.net 366 | titaniumstresser.net 367 | top-10-booters.ilovecassola.it 368 | top-booter.com 369 | topstresser.io 370 | topstressers.com 371 | torsecurityteam.org 372 | tressed.pw 373 | tresser.info 374 | ts3booter.net 375 | ufa-booters-tools.com 376 | umbstresser.net 377 | unknownbooter.co 378 | unknownbooter.com 379 | unseenbooter.com 380 | vbooter.com 381 | vbooter.org 382 | vdos-s.co 383 | vdos-s.com 384 | vdoss.net 385 | vex-stresser.net 386 | vtoxicity.net 387 | wavestresser.wtf 388 | webbooter.com 389 | webstress.cc 390 | webstress.net 391 | webstresser-free.hostthegame.com 392 | webstresser-free.sunnymoring.com 393 | webstresser.biz 394 | webstresser.co 395 | webstresser.com 396 | weeabooter.com 397 | wifilefgrosdp-stresser.000webhostapp.com 398 | wifistruggles.com 399 | wifistruggles.net 400 | wriz-v2-booter.sfbaywhalewatching.com 401 | xblunter.co 402 | xblunter.net 403 | xboot.net 404 | xbox-xuid-booter.ilovecassola.it 405 | xenon-stresser.com 406 | xkovxboot.co 407 | xr8edstresser.com 408 | xrshellbooter.com 409 | xrstresser.net 410 | xstress.xyz 411 | xtreme.cc 412 | xtremebooter.com 413 | xyzbooter.net 414 | yakuzastresser.com 415 | ydrostress.com 416 | youboot.net 417 | z-shadow.co 418 | z7inc.com 419 | zdstresser.net 420 | zeus-net.pw 421 | zodiac-stresser.com 422 | -------------------------------------------------------------------------------- /web_pipeline/url_filtering/urls/blocklist/ddos/urls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLM360/MegaMath/52471d65673c784868629e2165470cd05f6c8975/web_pipeline/url_filtering/urls/blocklist/ddos/urls -------------------------------------------------------------------------------- 
/web_pipeline/url_filtering/urls/blocklist/ddos/usage: -------------------------------------------------------------------------------- 1 | black 2 | ddos 3 | -------------------------------------------------------------------------------- /web_pipeline/url_filtering/urls/blocklist/download.sh: -------------------------------------------------------------------------------- 1 | 2 | # wget https://dsi.ut-capitole.fr/blacklists/download/adult.tar.gz 3 | 4 | # wget https://dsi.ut-capitole.fr/blacklists/download/phishing.tar.gz 5 | 6 | # wget https://dsi.ut-capitole.fr/blacklists/download/dating.tar.gz 7 | 8 | # wget https://dsi.ut-capitole.fr/blacklists/download/gambling.tar.gz 9 | 10 | # wget https://dsi.ut-capitole.fr/blacklists/download/filehosting.tar.gz 11 | 12 | # wget https://dsi.ut-capitole.fr/blacklists/download/ddos.tar.gz 13 | 14 | # wget https://dsi.ut-capitole.fr/blacklists/download/agressif.tar.gz 15 | 16 | # wget https://dsi.ut-capitole.fr/blacklists/download/chat.tar.gz 17 | 18 | # wget https://dsi.ut-capitole.fr/blacklists/download/mixed_adult.tar.gz 19 | 20 | # wget https://dsi.ut-capitole.fr/blacklists/download/arjel.tar.gz 21 | 22 | 23 | #!/bin/bash 24 | 25 | # 遍历当前目录中所有的 .tar.gz 文件 26 | for file in *.tar.gz 27 | do 28 | # 检查文件是否存在(以防止没有匹配的文件时的错误) 29 | if [ -f "$file" ]; then 30 | echo "Extracting $file..." 31 | 32 | # 获取文件名(不包括 .tar.gz 扩展名) 33 | filename="${file%.tar.gz}" 34 | 35 | # 创建一个与文件同名的目录 36 | # mkdir -p "$filename" 37 | 38 | # 解压文件到这个新目录 39 | tar -xzf "$file" 40 | # -C "$filename" 41 | 42 | echo "Finished extracting $file" 43 | fi 44 | done 45 | 46 | echo "All .tar.gz files have been extracted." 
-------------------------------------------------------------------------------- /web_pipeline/url_filtering/urls/blocklist/filehosting.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLM360/MegaMath/52471d65673c784868629e2165470cd05f6c8975/web_pipeline/url_filtering/urls/blocklist/filehosting.tar.gz -------------------------------------------------------------------------------- /web_pipeline/url_filtering/urls/blocklist/filehosting/domains: -------------------------------------------------------------------------------- 1 | 1000go.fr 2 | 11mbit.de 3 | 123-reg.co.uk 4 | 1fichier.co 5 | 1fichier.com 6 | 1und1.de 7 | 2and2.net 8 | 2big.hu 9 | 2img.net 10 | 2shared.com 11 | 30mb.com 12 | 35mb.com 13 | 450mb.com 14 | 4file.net 15 | 4filehosting.com 16 | 4freeimagehost.com 17 | 4megaupload.com 18 | 4shared.com 19 | 4sync.com 20 | 64k.it 21 | 88.191.122.25 22 | 9divx.com 23 | 9down.com 24 | 9giga.sfr.fr 25 | abload.de 26 | abrutis-videos.com 27 | adrive.com 28 | ahstatic.com 29 | airset.com 30 | alamy.com 31 | alamyimages.fr 32 | alfafile.net 33 | alfon.org 34 | alkk.net 35 | alldrives.ge 36 | allomegavideo.com 37 | alloseven.com 38 | allourls.com 39 | alojarfotos.com 40 | anonfile.com 41 | anyhub.net 42 | anzwers.org 43 | arcadeupload.com 44 | arcadya.net 45 | archivosgrandes.com 46 | arribalafoto.com 47 | axifile.com 48 | b4uphotos.com 49 | backupmyfiles.net 50 | badongo.com 51 | bajapics.tk 52 | battleofdragon.com 53 | baycdn.com 54 | bayfiles.com 55 | bayimg.com 56 | bbhistory.info 57 | behance.net 58 | bestsharing.com 59 | bestupload.com 60 | bfgfile.com 61 | bigandfree.com 62 | bigfileupload.com 63 | bigfilez.com 64 | bigpichost.com 65 | bigupload.com 66 | bigvault.com 67 | birnchen.ath.cx 68 | bitshare.com 69 | bitshare.de 70 | blackbeard.ws 71 | bladimix.com 72 | blinkyou.com 73 | blogsimages.skynet.be 74 | bluehost.to 75 | boardplus.org 76 | bonpoo.com 77 | boomp3.com 78 | boomycloud.com 
79 | box.com 80 | box.net 81 | briefcase.yahoo.com 82 | browsl.com 83 | bubbleshare.com 84 | busyupload.com 85 | casimages.com 86 | chomikuj.pl 87 | chungo.net 88 | cld.pt 89 | clicknupload.org 90 | clipser.com 91 | clipupload.com 92 | cloudshare.cz 93 | cocoimage.com 94 | cocoshare.cc 95 | coedproxy.info 96 | come2store.com 97 | comparte.pe 98 | content-type.com 99 | coolstreaming.us 100 | cpbild.co 101 | cramit.in 102 | cramitin.net 103 | crazefiles.com 104 | crazysharing.com 105 | cutedrive.com 106 | cyberdan.fr 107 | cybernetic1995.acc.de 108 | dada.net 109 | dailyuploads.net 110 | dardarkom.com 111 | datafilehost.com 112 | datapickup.com 113 | ddl-share.org 114 | deaddrops.com 115 | debrid-link.fr 116 | debridarea.com 117 | debrideurstreaming.com 118 | deeload.com 119 | demo.ovh.com 120 | depositfiles.com 121 | descargar.traducegratis.com 122 | descargasfull.com 123 | desearch.net 124 | digitalimagehosting.com 125 | diinoweb.com 126 | disiami.net 127 | divshare.com 128 | dl-b.free.fr 129 | dl-more.eu 130 | dl.free.fr 131 | dominiord.com 132 | douploads.net 133 | download.anim-area11.net 134 | downloaddelivery.com 135 | downloads.nl 136 | downloads.phpnuke.org 137 | dpstream.pw 138 | dpstream.tv 139 | drivehq.com 140 | driveway.com 141 | drop.io 142 | dropapk.to 143 | dropbox.com 144 | dropboxusercontent.com 145 | dropfiles.net 146 | dropload.com 147 | dumpanimage.com 148 | dumparump.com 149 | dweb.link 150 | easy-share.com 151 | easy-sharee.com 152 | easy-sharing.com 153 | easyupload.io 154 | eatlime.com 155 | eazyshare.net 156 | eazyupload.net 157 | egoshare.com 158 | egydown.com 159 | ei-pictures.com 160 | electronicfiles.net 161 | elephantdrive.com 162 | emstorage.fr 163 | enablebrowser.info 164 | enterupload.com 165 | epload.co 166 | esgens.com 167 | esnips.com 168 | evilshare.com 169 | ex-load.com 170 | exbyte.net 171 | extremeshare.net 172 | ez-files.net 173 | ezyfile.net 174 | facebook-proxy.info 175 | falcon.o-wh.com 176 | fast-debrid.com 177 | 
fast-load.net 178 | fasterupload.com 179 | fastpic.ru 180 | fastshare.org 181 | favshare.com 182 | faylyukle.com 183 | fboom.me 184 | fdrop.com 185 | fhqhosting.com 186 | fifostream.com 187 | fifostream.net 188 | fifostream.org 189 | fifostream.tv 190 | fifostreaming.com 191 | fifostreaming.net 192 | fifostreaming.org 193 | fifostreaming.tv 194 | file-rack.com 195 | file-up.org 196 | file.thfree.com 197 | file123.com 198 | file2share.biz 199 | file2upload.net 200 | filebase.to 201 | filebin.net 202 | filebuffer.net 203 | fileburst.com 204 | filecabin.com 205 | filecache.de 206 | fileclick.eu 207 | fileclick.net 208 | filecloud.com 209 | fileden.com 210 | filedip.com 211 | filedn.com 212 | filedropper.com 213 | filedwon.info 214 | filefactory.com 215 | filefishing.com 216 | filefront.com 217 | filegone.com 218 | filehd.com 219 | filehigh.com 220 | fileho.com 221 | filehost.ro 222 | filehosting.cc 223 | filehosting.org 224 | filejungle.com 225 | filekicker.com 226 | filelodge.bolt.com 227 | filemashine.com 228 | filenext.com 229 | fileparadox.in 230 | filepost.com 231 | filepost.us 232 | filepub.com 233 | fileqube.com 234 | files-express.com 235 | files-upload.com 236 | files.bz 237 | files.catbox.moe 238 | files.fm 239 | files.mail.ru 240 | files.to 241 | files.ww.com 242 | files6.com 243 | filesanywhere.com 244 | filesavr.com 245 | fileseasy.com 246 | filesend.net 247 | fileserve.com 248 | fileservices.me.com 249 | fileshit.net 250 | fileskip.com 251 | filesmap.com 252 | filesmore.com 253 | filesonic.com 254 | filespoint.com 255 | filespump.com 256 | filestage.net 257 | filestube.com 258 | filesupload.com 259 | filesusr.com 260 | filetransfer.io 261 | fileup.org 262 | fileupyours.com 263 | fileurls.com 264 | filezzz.com 265 | film-2-streaming.com 266 | film-exclue.net 267 | filthimagehost.com 268 | filthpics.com 269 | filthspace.com 270 | flixya.com 271 | flypicture.com 272 | flyupload.com 273 | foreverhide.com 274 | foroxd.com 275 | fotazas.com 276 | fotolog.pl 
277 | fotonons.ru 278 | fotop.net 279 | fotosik.pl 280 | fotosupload.com 281 | freakshare.com 282 | freakshare.net 283 | free-hoster.cc 284 | free-transfer.de 285 | free-webhosts.com 286 | freedebrid.fr 287 | freedrive.com 288 | freefileupload.net 289 | freeimagehost.eu 290 | freeimagehosting.net 291 | freeimgshost.com 292 | freepik.com 293 | freeuploader.com 294 | freshlap.com 295 | friendlyfiles.net 296 | friendlyshare.de 297 | fromsmash.co 298 | fromsmash.com 299 | ftpz.us 300 | fullfotos.com.ar 301 | fupload.com 302 | gayimagehost.com 303 | geekimages.com 304 | geralink.in 305 | get-mu.com 306 | getfile.biz 307 | getkeepsafe.com 308 | gigabyteupload.com 309 | gigafilehost.net 310 | gigallery.com 311 | gigapeta.com 312 | gigashare.com 313 | gigasize.com 314 | gigaup.fr 315 | glintfiles.net 316 | glowfoto.com 317 | go.zummm.com 318 | gofilego.com 319 | goldfile.eu 320 | gopro4vn.com 321 | grablinksby.us 322 | grabme.net 323 | greek-fun.com 324 | grosfichiers.ch 325 | grosfichiers.com 326 | guba.com 327 | gximages.com 328 | gypsi.info 329 | harepix.com 330 | hemenpaylas.com 331 | hexupload.net 332 | hiboox.com 333 | hiboox.es 334 | hiboox.fr 335 | hitfile.net 336 | hjfile.cn 337 | hlusoe.info 338 | host-image.com.ar 339 | hostfiles.org 340 | hosting-test.net 341 | hotchyx.com 342 | hotfile.com 343 | hotlinkfiles.com 344 | hotlinkimage.com 345 | hotshare.net 346 | htpicturetrail.com 347 | hugedrive.com 348 | hulkload.com 349 | hulkshare.com 350 | humyo.com 351 | hyperfileshare.com 352 | hyperupload.com 353 | ibb.co 354 | icefile.com 355 | icefile.net 356 | icerbox.com 357 | idivimage.com 358 | idrive.com 359 | ifdnrg.com 360 | ifile.it 361 | ifolder.ru 362 | ifunpix.com 363 | igest.org 364 | iimmgg.com 365 | illhostit.com 366 | image.ohozaa.com 367 | image2host.com 368 | imagearn.com 369 | imagebam.com 370 | imageban.ru 371 | imagebanana.com 372 | imagebmp.com 373 | imagebor.com 374 | imagecabin.com 375 | imagecave.com 376 | imagechicken.com 377 | imagechile.net 
378 | imagecloset.com 379 | imagefiasco.com 380 | imagehigh.com 381 | imagehost.es 382 | imagehost.org 383 | imagehosting.com 384 | imagehosting.us 385 | imagehostxp.com 386 | imagemule.com 387 | imagenchile.com 388 | imagengratis.org 389 | imagepremium.com 390 | imageshack.gr 391 | imageshack.us 392 | imageshadow.com 393 | imagesocket.com 394 | imageunload.com 395 | imageupload.se 396 | imageupper.com 397 | imageurlhost.com 398 | imagevenue.com 399 | imageviper.com 400 | imagewaste.com 401 | imagexa.com 402 | imaxenes.com 403 | img-vidiklub.com 404 | img.adoosimg.com 405 | img.godlike.cl 406 | img.nattawat.org 407 | img.tomatone.net 408 | img.xxfx.org 409 | imgarchive.info 410 | imghost.sk 411 | imgkk.com 412 | imgupload.adoosimg.com 413 | imgur.com 414 | imm.io 415 | immagini.p2pforum.it 416 | incredidl.com 417 | infierno-files.com 418 | internetfiles.org 419 | intoupload.net 420 | ipicture.ru 421 | ipswitch.com 422 | isarapix.com 423 | istockphoto.com 424 | iwastemytime.com 425 | jawcloud.co 426 | jeux.com 427 | jigsawshare.com 428 | jpghosting.com 429 | jumbofiles.com 430 | justupit.com 431 | k2s.cc 432 | katfile.com 433 | keepeek.com 434 | keepmyfile.com 435 | keepmyfiles.com 436 | keepmyimages.com 437 | kepfeltoltes.hu 438 | kewlshare.com 439 | krakenfiles.com 440 | ksaupload.com 441 | kytec.com 442 | largeimagehost.com 443 | leechking.com 444 | leetleech.org 445 | letitbit.net 446 | letsupload.com 447 | likeimg.com 448 | limelinx.com 449 | linkcyb.org 450 | linkspoof.com 451 | littlebyte.net 452 | livefilestore.com 453 | llnwd.net 454 | load.to 455 | loaderx.com 456 | looler.com 457 | lulzimg.com 458 | macle.voila.fr 459 | macleusb.net 460 | magicvortex.com 461 | mailbigfile.com 462 | maloxy.info 463 | mannequinat2000.chez-alice.fr 464 | maxishare.net 465 | mediafire.com 466 | mediapix.ru 467 | mega-debrid.eu 468 | mega-debrideur.tk 469 | mega-films.net 470 | mega.co.nz 471 | megadescarga.net 472 | megadl.info 473 | megadownload.net 474 | megafast.info 475 | 
megaftp.com 476 | megahotserved.com 477 | megaleech.eu 478 | megapid.com 479 | megashare.co.uk 480 | megashare.com 481 | megashares.com 482 | megaup.net 483 | megaupload-premium.com 484 | megaupload.com 485 | megaupload.de 486 | megavideo.com 487 | mesh.com 488 | mexa.sh 489 | mh2img.net 490 | migaload.com 491 | mihd.net 492 | mipony.net 493 | mirorii.com 494 | miroriii.com 495 | mixturecloud.com 496 | mj.am 497 | modovideo.com 498 | mofile.com 499 | momoshare.com 500 | momupload.com 501 | mon-nuage.com 502 | moncloshare.com 503 | monova.org 504 | mooload.com 505 | motionbox.com 506 | movshare.net 507 | mp3y.download 508 | muack.net 509 | mugrab.com 510 | multidl.com 511 | multipics.net 512 | multiply.com 513 | multiup.org 514 | mundo-descargas.com 515 | mundoimg.com 516 | mybloop.com 517 | myfilehut.com 518 | myfileshack.com 519 | myfilestash.com 520 | myfreefilehosting.com 521 | mynox.fr 522 | myotherdrive.com 523 | mypeopledoc.com 524 | mysave.in 525 | mysharebox.com 526 | mysharefile.com 527 | myspacegod.info 528 | myspacepro.info 529 | mytempdir.com 530 | myvideosharing.info 531 | myvirtualdisk.permissionresearch.com 532 | nakido.com 533 | navigator.ed.mu 534 | nbe-media.com 535 | ndfreehost.com 536 | netload.in 537 | netstorage.xosn.com 538 | netu.cam 539 | netu.io 540 | netu.tv 541 | neufgiga.com 542 | newgrounds.com 543 | nexmicrosystems.com 544 | nitroflare.com 545 | notblocked.hu.tl 546 | novamov.com 547 | nowvideo.eu 548 | nukeuploads.com 549 | onfinite.com 550 | oniva.com 551 | onlinedisk.ru 552 | onlinehome.fr 553 | onlinestuffs.com 554 | onwardhost.com 555 | opendrive.com 556 | openfile.ru 557 | openupload.com 558 | orbitfiles.com 559 | orgfree.com 560 | oron.com 561 | oxyshare.com 562 | ozerki.net 563 | paid4share.com 564 | paid4share.net 565 | paintedover.com 566 | partage-fichiers.com 567 | pbase.com 568 | peejeshare.com 569 | peerfactor.fr 570 | perushare.com 571 | photo-host.org 572 | photobucket.com 573 | photofile.es 574 | photofile.ru 575 | 
photoimagenes.com 576 | photojerk.com 577 | photos.cx 578 | photosamigos.com 579 | photoserver.ws 580 | phyrefile.com 581 | pic4you.ru 582 | picapic.net 583 | picbase.net 584 | picfoco.com 585 | picfor.me 586 | picfront.com 587 | picfront.de 588 | picfront.org 589 | picfu.net 590 | picoload.com 591 | picoodle.com 592 | picscrazy.com 593 | picsec.com 594 | pict.com 595 | picture-hosting.net 596 | picturedumper.com 597 | picturetrail.com 598 | picupl.com 599 | picvalley.net 600 | pimpandhost.com 601 | pixagogo.com 602 | pixali.com 603 | pixdaus.com 604 | pixelup.net 605 | pixhost.com 606 | pixhost.me 607 | pixhost.org 608 | pixhost.ws 609 | pixpond.com 610 | pixshock.net 611 | pixslam.com 612 | pixsy.com 613 | pixxtra.com 614 | plunder.com 615 | pornpicer.com 616 | postimage.org 617 | premify.com 618 | premiumbyleo.co.cc 619 | premiumlinkgens.blogspot.com 620 | profile.myspace.com 621 | przeklej.pl 622 | psychohost.com 623 | pushfile.net 624 | putfile.com 625 | putlocker.com 626 | qfile.de 627 | qshare.com 628 | quickdump.com 629 | quickshareit.com 630 | r25725.ovh.net 631 | r26538.ovh.net 632 | r27369.ovh.net 633 | r28052.ovh.net 634 | radikal.ru 635 | rapid-photo.com 636 | rapid4free.com 637 | rapid4me.com 638 | rapid8.com 639 | rapidechange.com 640 | rapideo.pl 641 | rapidfile.fr 642 | rapidforum.com 643 | rapidgator.net 644 | rapidl.com 645 | rapidmoviez.com 646 | rapidrar.com 647 | rapidshare.com 648 | rapidshare.de 649 | rapidshare.se 650 | rapidsharewarezmegaupload.com 651 | rapidsharing.com 652 | rapidsharings.com 653 | rapidupload.com 654 | rarhost.com 655 | redlist.be 656 | refrozen.com 657 | reliableimage.com 658 | revver.com 659 | rhost.cz 660 | ringo.com 661 | riprapid.net 662 | ripway.com 663 | rockdizfile.com 664 | rockfile.co 665 | sadew.com 666 | safe-access.com 667 | saleno.privateme.info 668 | sandisk.com 669 | savefile.com 670 | scambia.com 671 | school4.uyou.info 672 | send-file.co.uk 673 | sendbox.fr 674 | sendmefile.com 675 | sendover.com 676 | 
sendspace.com 677 | sendthisfile.com 678 | series-megaupload.com 679 | servimg.com 680 | sex.beohost.com 681 | seyvet.com 682 | share-online.biz 683 | share.am 684 | share.live.com 685 | shareapic.net 686 | sharebase.to 687 | sharebig.com 688 | sharebigfile.com 689 | sharedzilla.com 690 | sharefiles.ru 691 | shareiffic.com 692 | shareimages.com 693 | sharelor.com 694 | sharemods.com 695 | sharenxs.com 696 | sharingmatrix.com 697 | sharingzone.net 698 | sharovar.com 699 | shinyhosting.net 700 | shitore.com 701 | shop2all.biz 702 | sinpremium.net 703 | skodengz.com 704 | skydrive.live.com 705 | slack-files.com 706 | slibe.com 707 | slide.com 708 | slil.ru 709 | slwatch.co 710 | snaggys.com 711 | snap.com 712 | snapdrive.net 713 | sockshare.com 714 | solidfiles.com 715 | speed-downloading.com 716 | speed4up.net 717 | speedshare.org 718 | spideroak.com 719 | spread-it.com 720 | spymac.com 721 | ssl0d.com 722 | stage6.com 723 | steekr.com 724 | stickypix.net 725 | storage.live.com 726 | storage.yandexcloud.net 727 | storagefun.com 728 | storeandserve.com 729 | streamiz-filmze.fr 730 | streamiz.com 731 | streamlare.com 732 | streamload.com 733 | streamupload.com 734 | sube.la 735 | subefotos.com 736 | subeimagenes.com.ar 737 | subelas.com 738 | subetela.com 739 | subir-archivos.com.ar 740 | subirimagen.es 741 | subirimagen.net 742 | subirimagenes.com 743 | subiteya.com 744 | sugarsync.com 745 | superphotospace.com 746 | supload.com 747 | surfban.info 748 | surfblocked.co.cc 749 | swfcabin.com 750 | swiftdesk.com 751 | swoopshare.com 752 | sxc.hu 753 | syncplicity.com 754 | tabulas.com 755 | tagstat.com 756 | takefile.link 757 | tempfile.ru 758 | terabox.telefonica.com.ar 759 | tezfiles.com 760 | thefilebucket.com 761 | thefilehut.com 762 | thefreesite.com 763 | theimagehosting.com 764 | theonlinedatastorage.com 765 | thepictures.us 766 | theupload.com 767 | thumbhoster.com 768 | thumblogger.com 769 | tinydot.co.cc 770 | tinypic.com 771 | tmpfiles.org 772 | topdebrid.com 
773 | torrentreactor.net 774 | toutbox.fr 775 | trackerx.com.ar 776 | tradownload.uk 777 | transfer.sh 778 | transferbigfiles.com 779 | turbobit.net 780 | turbobit.ru 781 | turboupload.com 782 | twilight.ws 783 | ucantblockme.info 784 | ugotfile.com 785 | uloz.to 786 | ultimbox.com 787 | ultrashare.de 788 | ultrashare.net 789 | unblock.nevercatch.com 790 | unblockya.com 791 | unibytes.com 792 | universalhoster.net 793 | universitriat.co.cc 794 | up-4ever.org 795 | up-file.com 796 | up.li.ru 797 | up4net.com 798 | updownloadserver.com 799 | updownloadserver.de 800 | upken.jp 801 | uplo4d.com 802 | upload-file.net 803 | upload.ac 804 | upload.digiex.net 805 | upload.dj 806 | upload.sc 807 | upload.seeitworks.com 808 | upload2.net 809 | uploadarmy.com 810 | uploadbox.com 811 | uploadbuzz.cc 812 | uploadchan.org 813 | uploaded.net 814 | uploaded.to 815 | uploadev.org 816 | uploadfile.info 817 | uploadfiles.io 818 | uploadgalaxy.com 819 | uploadgeek.com 820 | uploadhouse.com 821 | uploadhut.com 822 | uploading.com 823 | uploadingit.com 824 | uploadit.biz 825 | uploadjar.com 826 | uploadking.net 827 | uploadmachine.com 828 | uploadocean.com 829 | uploadrack.com 830 | uploadspy.com 831 | uploadstation.com 832 | uploadstore.com 833 | uploadtemple.com 834 | uploadtown.com 835 | uploadwave.com 836 | uploadx.net 837 | uploadyourfiles.de 838 | uploadyourimages.com 839 | uppit.com 840 | upsara.com 841 | upshare.eu 842 | uptobox.com 843 | upvid.co 844 | uqload.com 845 | usaupload.net 846 | useeimage.com 847 | usersdrive.com 848 | usershare.net 849 | userupload.net 850 | ushareit.com 851 | vagoimagen.com 852 | verysong.co.cc 853 | verzend.be 854 | vgroupnetwork.com.ar 855 | vidcloud.co 856 | videobb.com 857 | videotribe.com 858 | videoucrania.com 859 | videoweed.com 860 | videozer.com 861 | vidoza.net 862 | villagephotos.com 863 | vleech.net 864 | vobbo.com 865 | vodpod.com 866 | volafile.io 867 | vosfichiers.com 868 | vpx.pl 869 | vuestrasfotos.com 870 | wantpremium.net 871 | 
wantrapid.com 872 | warezlinkers.com 873 | way2tube.com 874 | wdfiles.ru 875 | wdupload.com 876 | we.tl 877 | webfile.ru 878 | webfilehost.com 879 | webshots.com 880 | weshare.me 881 | wetransfer.com 882 | wetransfer.net 883 | wi.to 884 | wikiupload.com 885 | wirefiles.com 886 | with2ch.net 887 | wonderfile.net 888 | woo55.com 889 | workupload.com 890 | wtfhost.com 891 | wuala.com 892 | wudeo.com 893 | wupfile.com 894 | wupload.com 895 | wupload.fr 896 | wyslijto.pl 897 | x7.to 898 | xbinary.com 899 | xdrive.com 900 | xinony.com 901 | xmages.net 902 | xooimage.com 903 | xs.to 904 | xshare.us 905 | xtraupload.de 906 | xzshare.com 907 | yabadaba.ru 908 | yatahonga.com 909 | yourfile.net 910 | yourfile.org 911 | yourfilehost.com 912 | yourfilelink.com 913 | yousendit.com 914 | youshareit.com 915 | zalil.ru 916 | zettapremium.com 917 | ziddu.com 918 | zikuka.com 919 | zikuka.ru 920 | zippyshare.com 921 | zizfile.com 922 | znail.com 923 | zonadd.net 924 | zonaimagen.es 925 | zone-videos.net 926 | zoto.com 927 | zotzoo.com 928 | zshare.net 929 | zumodrive.com 930 | zupload.com 931 | -------------------------------------------------------------------------------- /web_pipeline/url_filtering/urls/blocklist/filehosting/urls: -------------------------------------------------------------------------------- 1 | <<<<<<< HEAD 2 | version https://git-lfs.github.com/spec/v1 3 | oid sha256:aa7f07f95eb8626f72fb9025ba298a87f670c6033a370b839e1b099d5dde3d02 4 | size 200 5 | ======= 6 | cri.univ-tlse1.fr/tools/test_filtrage/filehosting/ 7 | howcast.com/upload/ 8 | me.com/idisk 9 | media.filecabi.net/upload.html 10 | nhjm.net/~pcdthebum/Upload/ 11 | stupidvideos.com/upload/ 12 | sumo.tv/upload/ 13 | voila.fr/macle/ 14 | >>>>>>> 72ba839f1967cae87f00a57fe724930140efc90a 15 | -------------------------------------------------------------------------------- /web_pipeline/url_filtering/urls/blocklist/filehosting/usage: 
-------------------------------------------------------------------------------- 1 | black 2 | filehosting 3 | stockage 4 | -------------------------------------------------------------------------------- /web_pipeline/url_filtering/urls/blocklist/gambling.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLM360/MegaMath/52471d65673c784868629e2165470cd05f6c8975/web_pipeline/url_filtering/urls/blocklist/gambling.tar.gz -------------------------------------------------------------------------------- /web_pipeline/url_filtering/urls/blocklist/gambling/urls: -------------------------------------------------------------------------------- 1 | <<<<<<< HEAD 2 | version https://git-lfs.github.com/spec/v1 3 | oid sha256:acb511832e0271d2d385cef1aae58a657a11278a3b2c2f667292115e6ab3d2d2 4 | size 182 5 | ======= 6 | astrolabio.net/casino/ 7 | cri.univ-tlse1.fr/tools/test_filtrage/gambling/ 8 | sd579.sivit.org/lesgrandscasinos/ 9 | thestreetwearmagazine.com/www.thestreetwearmagazine.com/ 10 | top-lasvegas.com/en 11 | >>>>>>> 72ba839f1967cae87f00a57fe724930140efc90a 12 | -------------------------------------------------------------------------------- /web_pipeline/url_filtering/urls/blocklist/gambling/usage: -------------------------------------------------------------------------------- 1 | black 2 | -------------------------------------------------------------------------------- /web_pipeline/url_filtering/urls/blocklist/mixed_adult.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLM360/MegaMath/52471d65673c784868629e2165470cd05f6c8975/web_pipeline/url_filtering/urls/blocklist/mixed_adult.tar.gz -------------------------------------------------------------------------------- /web_pipeline/url_filtering/urls/blocklist/mixed_adult/domains: -------------------------------------------------------------------------------- 1 | 10putes.com 
2 | 2and2.net 3 | 321chat.com 4 | 4chan.org 5 | 4freeimagehost.com 6 | abrutis-videos.com 7 | agq.qc.ca 8 | albums-photo.net 9 | allyoucanupload.webshots.com 10 | archiveofourown.org 11 | artdeseduire.com 12 | artistic-nude-images.com 13 | bayimg.com 14 | bazoocam.org 15 | bdamateur.com 16 | beautyandboost.com 17 | blablagues.net 18 | blaguesenstock.com 19 | blinkyou.com 20 | blogsimages.skynet.be 21 | bookfoto.com 22 | bookspace.fr 23 | buzzmaniac.fr 24 | camroulette.biz 25 | camroulette.co.uk 26 | captice.net 27 | carsandgirls.hu 28 | caught.com 29 | cazzateonline.it 30 | cduniverse.com 31 | chatroulette.com 32 | chatroulettefr.com 33 | chatteroulette.fr 34 | dada.net 35 | daniel-bauer.com 36 | debono.club.fr 37 | deskbeauty.net 38 | deviantart.com 39 | dianeetlesexedesanges.ch 40 | digitalimagehosting.com 41 | dumpanimage.com 42 | dvdrama.com 43 | ecranlarge.com 44 | entrevue.fr 45 | filecloud.com 46 | filehigh.com 47 | filelodge.bolt.com 48 | free-webhosts.com 49 | freegamesforgirls.org 50 | freeimagehosting.net 51 | ftw.generation.no 52 | gael-l.com 53 | galerias.ojodigital.com 54 | geekimages.com 55 | girlicious.free.fr 56 | glennbwellmanphoto.com 57 | glowfoto.com 58 | gougoule.com 59 | guba.com 60 | gwendoline.book.fr 61 | hotlinkimage.com 62 | htpicturetrail.com 63 | humour-blague.com 64 | image2host.com 65 | imagecabin.com 66 | imagecave.com 67 | imagecloset.com 68 | imagefiasco.com 69 | imagehigh.com 70 | imagehosting.com 71 | imagehosting.us 72 | imagemule.com 73 | imageshack.us 74 | imagevenue.com 75 | immagini.p2pforum.it 76 | imvu.com 77 | istockphoto.com 78 | izismile.com 79 | jellyfields.com 80 | jeux.com 81 | kamroulette.com 82 | keepmyfile.com 83 | keepmyfiles.com 84 | keepmyimages.com 85 | le-trombi.com 86 | lebest.fr 87 | lecture-en-ligne.com 88 | leslivresdesarah.canalblog.com 89 | libertatea.ro 90 | loofok.com 91 | mademoiselleagency.fr 92 | mandatory.com 93 | mediafire.com 94 | megaupload.com 95 | metacafe.com 96 | newgrounds.com 97 | 
ohmybuzz.net 98 | onfinite.com 99 | oniva.com 100 | pbh2.com 101 | photofile.ru 102 | photojerk.com 103 | picphotos.net 104 | picturedumper.com 105 | picturetrail.com 106 | pitchu.fr 107 | pixpond.com 108 | profile.myspace.com 109 | quedesconneries.com 110 | rapidforum.com 111 | rapidshare.com 112 | rapidshare.de 113 | rutube.ru 114 | sauna-lotus.fr 115 | sex.beohost.com 116 | slibe.com 117 | slide.com 118 | spymac.com 119 | t45ol.com 120 | tabulas.com 121 | thechive.com 122 | thefreesite.com 123 | theimagehosting.com 124 | thoughtcatalog.com 125 | thumblogger.com 126 | tinypic.com 127 | tmz.com 128 | torrentmv.com 129 | unitedhumor.com 130 | up-file.com 131 | uploadfile.info 132 | uploadyourimages.com 133 | vice.com 134 | videofilia.com 135 | villagephotos.com 136 | wallpapers-paradise.com 137 | weedoroulette.com 138 | weheartit.com 139 | widouf.com 140 | wizzcam.fr 141 | y-top.com 142 | yatahonga.com 143 | youmadeo.com 144 | zinio.com 145 | zone-videos.net 146 | zoto.com 147 | zshare.net 148 | zvraceny.cz 149 | -------------------------------------------------------------------------------- /web_pipeline/url_filtering/urls/blocklist/mixed_adult/urls: -------------------------------------------------------------------------------- 1 | <<<<<<< HEAD 2 | version https://git-lfs.github.com/spec/v1 3 | oid sha256:d02965aee171c36a9d159af0f047f972ccd5596305f6a13d7db93c895e29a683 4 | size 232 5 | ======= 6 | chatrandom.com/fr/ 7 | cri.univ-tlse1.fr/tools/test_filtrage/mixed_adult/ 8 | images.live.com/videos/thumbnail.aspx 9 | montreal.craigslist.ca/search/tlg 10 | montreal.craigslist.ca/tlg 11 | olhares.aeiou.pt/galeriasprivadas/ 12 | weedochat.com/chatroulette/ 13 | >>>>>>> 72ba839f1967cae87f00a57fe724930140efc90a 14 | -------------------------------------------------------------------------------- /web_pipeline/url_filtering/urls/blocklist/mixed_adult/usage: -------------------------------------------------------------------------------- 1 | black 2 | 
-------------------------------------------------------------------------------- /web_pipeline/url_filtering/urls/blocklist/phishing.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLM360/MegaMath/52471d65673c784868629e2165470cd05f6c8975/web_pipeline/url_filtering/urls/blocklist/phishing.tar.gz -------------------------------------------------------------------------------- /web_pipeline/url_filtering/urls/blocklist/phishing/usage: -------------------------------------------------------------------------------- 1 | # This list is no longer maintained. 2 | # It's only a copy of malware category 3 | black 4 | phishing 5 | -------------------------------------------------------------------------------- /web_pipeline/url_filtering/urls/curated/domains: -------------------------------------------------------------------------------- 1 | stackexchange.com 2 | ncbi.nlm.nih.gov/pmc 3 | arxiv.org 4 | github.com 5 | storage.courtlistener.com 6 | bulkdata.uspto.gov 7 | pubmed.ncbi.nlm.nih.gov 8 | gutenberg.org 9 | opensubtitles.org 10 | wikipedia.org 11 | irclogs.ubuntu.com 12 | statmt.org 13 | news.ycombinator.com 14 | youtube.com 15 | philpapers.org -------------------------------------------------------------------------------- /web_pipeline/url_filtering/urls/url_blocklist_refinedweb_manual_inspection.csv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:4db972247738dd99bedc51488debb705ff954e230c39fa8434ecc1398bcd349b 3 | size 123585639 4 | -------------------------------------------------------------------------------- /web_pipeline/url_filtering/urls/whitelist/domains: -------------------------------------------------------------------------------- 1 | bust.com 2 | chicagoreader.com 3 | discord.com 4 | jungefreiheit.de 5 | marktplaza.nl 6 | telegra.ph 
from typing import Callable, Literal

from datatrove.io import DataFileLike, DataFolderLike
from datatrove.pipeline.readers.base import BaseDiskReader
from datatrove.utils.logging import logger


class TxtReader(BaseDiskReader):
    """Disk reader that turns every line of a TXT file into one document.

    Args:
        data_folder: a str, tuple or DataFolder object representing a path/filesystem
        paths_file: optional file listing one path per line (without the `data_folder` prefix) to read
        compression: compression scheme to open files with (default: "infer")
        limit: maximum number of documents to read (-1 = no limit); useful for debugging
        skip: number of initial rows to skip
        file_progress: display a per-file progress bar
        doc_progress: display a per-document progress bar
        adapter: callable adapting the raw source dict into a Document dict.
            Signature: (self, data: dict, path: str, id_in_file: int | str);
            `self` gives access to self.text_key and self.id_key.
            Must return a dict containing at least "text" and "id" keys.
        text_key: dict key holding the text payload (default: "text")
        id_key: dict key holding the sample id (default: "id")
        default_metadata: metadata dict merged into every sample's metadata
        recursive: recurse into subfolders when listing files. Ignored if paths_file is provided
        glob_pattern: only include files matching this pattern (relative to data_folder).
            Ignored if paths_file is provided
        shuffle_files: shuffle the files within the returned shard. Mostly for data viz;
            do not combine with dedup blocks
    """

    name = "🐿 Txt"

    def __init__(
        self,
        data_folder: DataFolderLike,
        paths_file: DataFileLike | None = None,
        compression: Literal["infer", "gzip", "zstd"] | None = "infer",
        limit: int = -1,
        skip: int = 0,
        file_progress: bool = False,
        doc_progress: bool = False,
        adapter: Callable = None,
        text_key: str = "text",
        id_key: str = "id",
        default_metadata: dict = None,
        recursive: bool = True,
        glob_pattern: str | None = None,
        shuffle_files: bool = False,
    ):
        super().__init__(
            data_folder,
            paths_file,
            limit,
            skip,
            file_progress,
            doc_progress,
            adapter,
            text_key,
            id_key,
            default_metadata,
            recursive,
            glob_pattern,
            shuffle_files,
        )
        self.compression = compression

    def read_file(self, filepath: str):
        # Each line of the text file becomes its own document.
        with self.data_folder.open(filepath, "r", compression=self.compression) as f:
            try:
                for line_index, line in enumerate(f):
                    yield self.get_document_from_dict({"text": line}, filepath, line_index)
            except UnicodeDecodeError as err:
                logger.warning(
                    f"File `{filepath}` may be corrupted: raised UnicodeDecodeError ({err})"
                )
$\\triangle ABC$, points $A$, $D$, $E$, and $B$ lie in that order on side $\\overline{AB}$ with $AD = 4$, $DE = 16$, and $EB = 8$. Points $A$, $F$, $G$, and $C$ lie in that order on side $\\overline{AC}$ with $AF = 13$, $FG = 52$, and $GC = 26$. Let $M$ be the reflection of $D$ through $F$, and let $N$ be the reflection of $G$ through $E$. Quadrilateral $DEGF$ has area 288. Find the area of heptagon $AFNBCEM$.","solution":"","answer":"588","url":""} 3 | {"id":"I-3","problem":"The 9 members of a baseball team went to an ice-cream parlor after their game. Each player had a singlescoop cone of chocolate, vanilla, or strawberry ice cream. At least one player chose each flavor, and the number of players who chose chocolate was greater than the number of players who chose vanilla, which was greater than the number of players who chose strawberry. Let $N$ be the number of different assignments of flavors to players that meet these conditions. Find the remainder when $N$ is divided by 1000.","solution":"","answer":"16","url":""} 4 | {"id":"I-4","problem":"Find the number of ordered pairs $(x,y)$, where both $x$ and $y$ are integers between -100 and 100, inclusive, such that $12x^2 - xy - 6y^2 = 0$.","solution":"","answer":"117","url":""} 5 | {"id":"I-5","problem":"There are $8! = 40320$ eight-digit positive integers that use each of the digits $1,2,3,4,5,6,7,8$ exactly once. Let $N$ be the number of these integers that are divisible by 22. Find the difference between $N$ and 2025.","solution":"","answer":"279","url":""} 6 | {"id":"I-6","problem":"An isosceles trapezoid has an inscribed circle tangent to each of its four sides. The radius of the circle is 3, and the area of the trapezoid is 72. Let the parallel sides of the trapezoid have lengths $r$ and $s$, with $r \\neq s$. 
Find $r^2 + s^2$.","solution":"","answer":"504","url":""} 7 | {"id":"I-7","problem":"The twelve letters $A$, $B$, $C$, $D$, $E$, $F$, $G$, $H$, $I$, $J$, $K$, and $L$ are randomly grouped into six pairs of letters. The two letters in each pair are placed next to each other in alphabetical order to form six two-letter words, and then those six words are listed alphabetically. For example, a possible result is $AB,CJ,DG,EK,FL,HI$. The probability that the last word listed contains $G$ is $\\frac{m}{n}$, where $m$ and $n$ are relatively prime positive integers. Find $m + n$.","solution":"","answer":"821","url":""} 8 | {"id":"I-8","problem":"Let $k$ be a real number such that the system\n$|25 + 20i - z| = 5$\n$|z - 4 - k| = |z - 3i - k|$\nhas exactly one complex solution $z$. The sum of all possible values of $k$ can be written as $\\frac{m}{n}$, where $m$ and $n$ are relatively prime positive integers. Find $m + n$. Here $i = \\sqrt{-1}$.","solution":"","answer":"77","url":""} 9 | {"id":"I-9","problem":"The parabola with equation $y = x^2 - 4$ is rotated $60°$ counterclockwise around the origin. The unique point in the fourth quadrant where the original parabola and its image intersect has $y$-coordinate $\\frac{a-\\sqrt{b}}{c}$, where $a$, $b$, and $c$ are positive integers, and $a$ and $c$ are relatively prime. Find $a + b + c$.","solution":"","answer":"62","url":""} 10 | {"id":"I-10","problem":"The 27 cells of a $3 \\times 9$ grid are filled in using the numbers 1 through 9 so that each row contains 9 different numbers, and each of the three $3 \\times 3$ blocks heavily outlined in the example contains 9 different numbers, as in the first three rows of a Sudoku puzzle. The number of different ways to fill such a grid can be written as $p^a \\cdot q^b \\cdot r^c \\cdot s^d$, where $p,q,r,$ and $s$ are distinct prime numbers and $a$, $b$, $c$, and $d$ are positive integers. 
Find $p \\cdot a + q \\cdot b + r \\cdot c + s \\cdot d$.","solution":"","answer":"81","url":""} 11 | {"id":"I-11","problem":"A piecewise linear periodic function is defined by\n$f(x)=\\begin{cases} x & \\text{if } x \\in [-1,1) \\\\ 2-x & \\text{if } x \\in [1,3) \\end{cases}$\nand $f(x+4)=f(x)$ for all real numbers $x$. The parabola $x=34y^2$ intersects the graph of $f(x)$ at finitely many points. The sum of the $y$-coordinates of these intersection points can be expressed in the form $\\frac{a+b\\sqrt{c}}{d}$, where $a$, $b$, $c$, and $d$ are positive integers, $a$, $b$, and $d$ have greatest common divisor equal to 1, and $c$ is not divisible by the square of any prime. Find $a + b + c + d$.","solution":"","answer":"259","url":""} 12 | {"id":"I-12","problem":"The set of points in 3-dimensional coordinate space that lie in the plane $x + y + z = 75$ whose coordinates satisfy the inequalities\n$x - yz < y - zx < z - xy$\nforms three disjoint convex regions. Exactly one of those regions has finite area. The area of this finite region can be expressed in the form $a\\sqrt{b}$, where $a$ and $b$ are positive integers and $b$ is not divisible by the square of any prime. Find $a + b$.","solution":"","answer":"510","url":""} 13 | {"id":"I-13","problem":"Alex divides a disk into four quadrants with two perpendicular diameters intersecting at the center of the disk. He draws 25 more line segments through the disk, drawing each segment by selecting two points at random on the perimeter of the disk in different quadrants and connecting those two points. Find the expected number of regions into which these 27 line segments divide the disk.","solution":"","answer":"204","url":""} 14 | {"id":"I-14","problem":"Let $ABCDE$ be a convex pentagon with $AB = 14$, $BC = 7$, $CD = 24$, $DE = 13$, $EA = 26$, and $\\angle ZBA = 60°$. For each point $X$ in the plane, define $f(X) = AX + BX + CX + DX + EX$. 
The least possible value of $f(X)$ can be expressed as $m + n\\sqrt{p}$, where $m$ and $n$ are positive integers and $p$ is not divisible by the square of any prime. Find $m + n + p$.","solution":"","answer":"60","url":""} 15 | {"id":"I-15","problem":"Let $N$ denote the number of ordered triples of positive integers $(a,b,c)$ such that $a,b,c \\leq 3^6$ and $a^3 + b^3 + c^3$ is a multiple of $3^7$. Find the remainder when $N$ is divided by 1000.","solution":"","answer":"735","url":""} -------------------------------------------------------------------------------- /web_pipeline/utils/decont_utils/data/sat.jsonl: -------------------------------------------------------------------------------- 1 | {"id": "0", "question": "The graph of the polynomial function $f$, where $y=f(x)$, has $x$-intercepts of $(-6,0)$ and $(6,0)$. Which of the following must be true?", "options": "A) $f(-6)=0$ B) $f(6)=-6$ C) $f(-6)=6$ D) $f(0)=-6$", "Answer": "A"} 2 | {"id": "1", "question": "$$\\begin{gathered} y=4 x+6 \\\\-5 x-y=21\\end{gathered}$$ What is the solution $(x, y)$ to the given system of equations?", "options": "A) $(-3,-6)$ B) $\\left(-\\frac{5}{3},-\\frac{2}{3}\\right)$ C) $(3,18)$ D) $(15,66)$", "Answer": "A"} 3 | {"id": "2", "question": "$\\lvert x-10 \\rvert = 0$ What are all the possible solutions to the given equation?", "options": "A) -10 B) 0 C) 10 D) -10 and 10", "Answer": "C"} 4 | {"id": "3", "question": "$$q=s(r-1)^2$$ The given equation relates the positive numbers $q, r$, and $s$. Which equation gives $r$ in terms of $q$ and $s$, when $r>1$?", "options": "A) $r=1+\\sqrt{\\frac{q}{s}}$ B) $r=1+\\frac{\\sqrt{q}}{s}$ C) $r=-1-\\sqrt{\\frac{q}{s}}$ D) $r=-1-\\frac{\\sqrt{q}}{s}$", "Answer": "A"} 5 | {"id": "4", "question": "In the relationship between variables $x$ and $y$, each increase of $1$ in the value of $x$ decreases the value of $y$ by 2. When $x=0$, $y=5$. 
Which equation represents this relationship?", "options": "A) $y=-\\frac{1}{2}x+5$ B) $y=-\\frac{1}{2}x-5$ C) $y=-2x-5$ D) $y=-2x+5$", "Answer": "D"} 6 | {"id": "5", "question": "An isosceles right triangle has a hypotenuse of length 4 inches. What is the perimeter, in inches, of this triangle?", "options": "A) $2\\sqrt{2}$ B) $4\\sqrt{2}$ C) $4+4\\sqrt{2}$ D) $4+8\\sqrt{2}$", "Answer": "C"} 7 | {"id": "6", "question": "How many solutions does the equation $4(x-2) = -2(x+4)$ have?", "options": "A) Zero B) Exactly one C) Exactly two D) Infinitely many", "Answer": "B"} 8 | {"id": "7", "question": "$R(t) = 1,830 - 790(2.71)^{-.18t}$ The function $R$ gives the predicted average rating, expressed as a number of points, in the German chess federation database for a player based on the number of years, $t$, the player has participated in professional chess tournaments. Which of the following represents the predicted average rating of a player who has just entered their first professional chess tournament?", "options": "A) $R(-0.18)$ B) $R(0)$ C) $R(790)$ D) $R(1,830)$", "Answer": "B"} 9 | {"id": "8", "question": "Alice took 60 minutes to complete a task on her first trial. The time it took Alice to complete the task decreased by 10% of the previous time for each additional trial. 
Approximately how many minutes will it take Alice to complete the task on her fifth trial?", "options": "A) 50 B) 42 C) 39 D) 35", "Answer": "C"} 10 | {"id": "9", "question": "$$ \\begin{aligned} & y<\\frac{2}{5} x+3 \\\\& y>\\frac{1}{2} x-6\\end{aligned}$$ In which of the following tables are all the values of $x$ and their corresponding values of $y$ solutions to the system of inequalities shown?", "options": "A) \\begin{tabular}{|r|r|} \\hline$x$ & $y$ \\\\\\hline-2 & -8 \\\\\\hline 0 & -4 \\\\\\hline 4 & 4 \\\\\\hline\\end{tabular} B) \\begin{tabular}{|c|c|}\\hline$x$ & $y$ \\\\\\hline-2 & -8 \\\\\\hline 0 & 4 \\\\\\hline 4 & 4 \\\\\\hline\\end{tabular} C) \\begin{tabular}{|r|r|}\\hline$x$ & $y$ \\\\\\hline-2 & 3 \\\\\\hline 0 & 2 \\\\\\hline 4 & -3 \\\\\\hline\\end{tabular} D) \\begin{tabular}{|r|r|}\\hline$x$ & $y$ \\\\\\hline-2 & 2 \\\\\\hline 0 & -3 \\\\\\hline 4 & 3 \\\\\\hline\\end{tabular}", "Answer": "D"} 11 | {"id": "10", "question": "Which of the following is equivalent to $(\\sqrt{32})(\\sqrt[5]{64})$?", "options": "A) $6\\left(\\sqrt[7]{2^5}\\right)$ B) $6\\left(\\sqrt[10]{2^7}\\right)$ C) $8\\left(\\sqrt[7]{2^5}\\right)$ D) $8\\left(\\sqrt[10]{2^7}\\right)$", "Answer": "D"} 12 | {"id": "11", "question": "An object has a mass of 3,300 milligrams. What is the mass of the object in grams? (1 gram = 1,000 milligrams)", "options": "A) 0.33 B) 3.30 C) 33.00 D) 330.00", "Answer": "B"} 13 | {"id": "12", "question": "On average, one square inch of human skin contains 650 sweat glands. A certain area of skin contains 1,170 sweat glands. Based on this information, which of the following is closest to the size of this area, in square inches?", "options": "A) 0.44 B) 0.56 C) 0.80 D) 1.80", "Answer": "D"} 14 | {"id": "13", "question": "The table give the heights, in feet, of 5 peaks in the Rocky Mountains and 5 peaks in the Appalachian Mountains. 
\\begin{tabular}{|l|l|l|l|l|} \\hline $\\begin{array}{l}\\text { Rocky } \\\\\\text { Mountain } \\\\\\text { Peak }\\end{array}$ & $\\begin{array}{l}\\text { Height } \\\\\\text { (in feet) }\\end{array}$ & $\\begin{array}{l}\\text { Appalachian } \\\\\\text { Mountain } \\\\\\text { Peak }\\end{array}$ & $\\begin{array}{l}\\text { Height } \\\\\\text { (in feet) }\\end{array}$ \\\\\\hline $\\begin{array}{l}\\text { Mount } \\\\\\text { Elbert }\\end{array}$ & 14,439 & $\\begin{array}{l}\\text { Mount } \\\\\\text { Mitchell }\\end{array}$ & 6,684 \\\\\\hline $\\begin{array}{l}\\text { Mount } \\\\\\text { Massive }\\end{array}$ & 14,429 & Mount Craig & 6,647 \\\\\\hline $\\begin{array}{l}\\text { Mount } \\\\\\text { Harvard }\\end{array}$ & 14,419 & $\\begin{array}{l}\\text { Clingman's } \\\\\\text { Dome }\\end{array}$ & 6,643 \\\\\\hline $\\begin{array}{l}\\text { Blanca } \\\\\\text { Peak }\\end{array}$ & 14,350 & $\\begin{array}{l}\\text { Mount } \\\\\\text { Guyot }\\end{array}$ & 6,621 \\\\\\hline $\\begin{array}{l}\\text { La Plata } \\\\\\text { Peak }\\end{array}$ & 14,343 & $\\begin{array}{l}\\text { Balsam } \\\\\\text { Cone }\\end{array}$ & 6,611 \\\\\\hline\\end{tabular} What is the height, in meters, of Blanca Peak? (Use 1 meter $=3.28$ feet)", "options": "A) 437.5 B) 4,375 C) 47,045 D) 47,068", "Answer": "B"} 15 | {"id": "14", "question": "The table give the heights, in feet, of 5 peaks in the Rocky Mountains and 5 peaks in the Appalachian Mountains. 
\\begin{tabular}{|l|l|l|l|l|} \\hline $\\begin{array}{l}\\text { Rocky } \\\\\\text { Mountain } \\\\\\text { Peak }\\end{array}$ & $\\begin{array}{l}\\text { Height } \\\\\\text { (in feet) }\\end{array}$ & $\\begin{array}{l}\\text { Appalachian } \\\\\\text { Mountain } \\\\\\text { Peak }\\end{array}$ & $\\begin{array}{l}\\text { Height } \\\\\\text { (in feet) }\\end{array}$ \\\\\\hline $\\begin{array}{l}\\text { Mount } \\\\\\text { Elbert }\\end{array}$ & 14,439 & $\\begin{array}{l}\\text { Mount } \\\\\\text { Mitchell }\\end{array}$ & 6,684 \\\\\\hline $\\begin{array}{l}\\text { Mount } \\\\\\text { Massive }\\end{array}$ & 14,429 & Mount Craig & 6,647 \\\\\\hline $\\begin{array}{l}\\text { Mount } \\\\\\text { Harvard }\\end{array}$ & 14,419 & $\\begin{array}{l}\\text { Clingman's } \\\\\\text { Dome }\\end{array}$ & 6,643 \\\\\\hline $\\begin{array}{l}\\text { Blanca } \\\\\\text { Peak }\\end{array}$ & 14,350 & $\\begin{array}{l}\\text { Mount } \\\\\\text { Guyot }\\end{array}$ & 6,621 \\\\\\hline $\\begin{array}{l}\\text { La Plata } \\\\\\text { Peak }\\end{array}$ & 14,343 & $\\begin{array}{l}\\text { Balsam } \\\\\\text { Cone }\\end{array}$ & 6,611 \\\\\\hline\\end{tabular} For the given Appalachian Mountain peaks, the height of the highest peak is approximately what percent greater than the height of the lowest peak?", "options": "A) $1.1 \\%$ B) $9.9 \\%$ C) $73.0 \\%$ D) $101.1 \\%$", "Answer": "A"} 16 | {"id": "15", "question": "Data set $A: 2,4,6,6,8,12$ Data set B: $2,4,6,6,8,12,26$ Two data sets are shown. 
Which statement best compares the medians of the data sets?", "options": "A) The median of data set A is greater than the median of data set $B$ B) The median of data set A is less than the median of data set B C) The medians of data sets A and B are equal D) There is not enough information to compare the medians", "Answer": "C"} 17 | {"id": "16", "question": "$$0.79 x+1.0 y=100$$ The mass of a solution of isopropanol and water is 100 grams. The given equation represents this situation, where $x$ is the volume of isopropanol, in cubic centimeters, and $y$ is the volume of water, in cubic centimeters. If the volume of isopropanol is 70 cubic centimeters, what is the approximate volume of water, in cubic centimeters?", "options": "A) 45 B) 55 C) 70 D) 79", "Answer": "A"} 18 | {"id": "17", "question": "There are 435 voting members of the US House of Representatives. If $b$ voting members are in favor of a certain bill, which expression represents the percentage of the voting members in favor of the bill?", "options": "A. $100\\left(\\frac{b}{435}\\right)$ B. $100\\left(\\frac{435}{b}\\right)$ C. $435\\left(\\frac{b}{100}\\right)$ D. $435(100 b)$", "Answer": "A"} 19 | {"id": "18", "question": "$$10(x+120)=120$$ Which of the following equations has the same solution as the given equation?", "options": "A) $x+120=12$ B) $x+120=130$ C) $x+12=12$ D) $x+12=120$", "Answer": "A"} 20 | {"id": "19", "question": "The given function $C$ models the annual soybean use in China, in millions of metric tons, between 1995 and 2014, where $x$ is the number of years after 1995. 
$$C(x)=4.3 x+19$$ According to the model, what is the best interpretation of 4.3 in this context?", "options": "A) Each year between 1995 and 2014, China used 4.3 million metric tons of soybeans B) Each year between 1995 and 2014, China's annual use of soybeans increased by 4.3 million metric tons C) China used 4.3 million metric tons of soybeans in 1995 D) China used a total of 4.3 million metric tons of soybeans between 1995 and 2014", "Answer": "B"} 21 | {"id": "20", "question": "$$ \\begin{gathered} C(x)=50,000+0.75 x \\\\ R(x)=4.75 x \\end{gathered}$$ The given function $C$ models the total cost (sum of fixed cost and variable cost), in dollars, of growing and harvesting $x$ bales of hay on a certain farm. The given function $R$ models the revenue, in dollars, earned from selling $x$ bales of hay. According to the function $R$, how many bales of hay would have to be sold to earn a revenue of $\\$1,425$?", "options": "A) 100 B) 300 C) 500 D) 1,000", "Answer": "B"} 22 | {"id": "21", "question": "$$ \\begin{gathered} C(x)=50,000+0.75 x \\\\ R(x)=4.75 x \\end{gathered}$$ The given function $C$ models the total cost (sum of fixed cost and variable cost), in dollars, of growing and harvesting $x$ bales of hay on a certain farm. The given function $R$ models the revenue, in dollars, earned from selling $x$ bales of hay. Which of the following inequalities models the number of bales of hay that must be sold to earn a profit of $\\$ 10,000$ or more? 
(profit $=$ revenue - cost)", "options": "A) $10,000 \\leq 4 x-50,000$ B) $10,000 \\geq 4 x-50,000$ C) $10,000 \\leq 4 x+50,000$ D) $10,000 \\geq 4 x+50,000$", "Answer": "A"} 23 | {"id": "22", "question": "Which expression is equivalent to $\\left(x^2+4\\right)^2+(x-2)(x+2) ?$", "options": "A) $x^4+x^2+20$ B) $x^4+5 x^2+16$ C) $x^4+9 x^2$ D) $x^4+9 x^2+12$", "Answer": "D"} 24 | {"id": "23", "question": "$$ \\begin{aligned} & y=4 x+1 \\\\ & y=4 x+3 \\end{aligned}$$ How many solutions does the given system of equations have?", "options": "A) Zero B) Exactly one C) Exactly two D) Infinitely many", "Answer": "A"} 25 | {"id": "24", "question": "$$ h(x)=3 x+3 $$ Which inequality represents all values of $x$ for which the graph of $y=h(x)$ in the $x y$-plane is above the $x$-axis?", "options": "A) $x<3$ B) $x<-1$ C) $x>-1$ D) $x>3$", "Answer": "C"} 26 | {"id": "25", "question": "Which quadratic equation has no real solutions?", "options": "A) $3 x^2-3=0$ B) $3 x^2+3 x=0$ C) $3 x^2+3 x+3=0$ D) $3 x^2-6 x+3=0$", "Answer": "C"} 27 | {"id": "26", "question": "In 1976, there were approximately 1,000 gray wolves in northern Minnesota. The number of gray wolves in northern Minnesota in 2008 was 190% greater than in 1976. Approximately how many gray wolves were in northern Minnesota in 2008?", "options": "A. 1,190 B. 1,900 C. 2,900 D. 19,000", "Answer": "C"} 28 | {"id": "27", "question": "When the quadratic function $f$ is graphed in the $x y$-plane, where $y=f(x)$, its vertex is $(-2,5)$. One of the $x$-intercepts of this graph is $\\left(-\\frac{7}{3}, 0\\right)$. What is the other $x$-intercept of the graph?", "options": "A. $\\left(-\\frac{13}{3}, 0\\right)$ B. $\\left(-\\frac{5}{3}, 0\\right)$ C. $\\left(\\frac{1}{3}, 0\\right)$ D. $\\left(\\frac{7}{3}, 0\\right)$", "Answer": "B"} 29 | {"id": "28", "question": "For an exponential function $g$, the value of $g(x)$ decreases by $20 \\%$ for each 1-unit increase in the value of $x$. 
If $g(2)=16$, which equation could define $g$ ?", "options": "A) $g(x)=16(0.8)^{x-2}$ B) $g(x)=16(0.8)^{x+2}$ C) $g(x)=16(0.2)^{x-2}$ D) $g(x)=16(0.2)^{x+2}$", "Answer": "A"} 30 | {"id": "29", "question": "Micha and Rana each selected a random sample of students at their school and asked how many soft drink servings each student had consumed the previous week. Micha estimated that the mean number of soft drink servings was 7.1, with an associated margin of error of 1.2. Rana estimated that the mean number of soft drink servings was 8.3, with an associated margin of error of 0.8. Assuming the margins of error were calculated in the same way, which of the following best explains why Rana obtained a smaller margin of error than Micha?", "options": "A. Rana's sample contained more students than Micha's sample contained. B. Rana's sample contained more students who drank soft drinks than Micha's sample contained. C. Rana's sample contained more students who drank exactly seven soft drink servings than Micha's sample contained. D. Rana's sample contained more students who drank exactly eight soft drink servings than Micha's sample contained.", "Answer": "A"} 31 | {"id": "30", "question": "A circle in the $x y$-plane has its center at $(-3,4)$ and the point $(-2,1)$ lies on the circle. Which equation represents this circle?", "options": "A) $(x-3)^2+(y+4)^2=\\sqrt{10}$ B) $(x+3)^2+(y-4)^2=\\sqrt{10}$ C) $(x-3)^2+(y+4)^2=10$ D) $(x+3)^2+(y-4)^2=10$", "Answer": "D"} 32 | {"id": "31", "question": "\\begin{tabular}{|c|c|} \\hline$x$ & $h(x)$ \\\\\\hline 2 & 0 \\\\\\hline 4 & 0 \\\\\\hline 6 & 8 \\\\\\hline \\end{tabular} For the quadratic function $h$, the table gives three values of $x$ and their corresponding values of $h(x)$. 
"""
Used for n-gram decontamination.
First build an index using the tasks we want to use to decontaminate our training dataset.
Then read your training data and apply the filter with the index loaded.
"""

import os
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass, field, replace
from typing import Tuple

import numpy as np

from datatrove.data import Document, DocumentsPipeline
from datatrove.io import DataFolderLike, file_exists, get_datafolder, open_file
from datatrove.pipeline.base import PipelineStep
from datatrove.pipeline.filters.base_filter import BaseFilter
from datatrove.pipeline.writers.disk_base import DiskWriter
from datatrove.utils.binaryio import read_np_from_file
from datatrove.utils.hashing import HashConfig, create_hash_func
from datatrove.utils.logging import logger
from datatrove.utils.text import TextNormConfig, ngrams, simplify_text
from datatrove.utils.typeshelper import Languages
from datatrove.utils.word_tokenizers import load_word_tokenizer


@dataclass
class NGramsDecontConfig:
    """
    Example for n_grams=4
    query = ['A', 'B', 'C', 'D', 'E'] (the prompt/instruction)
    label = ['F', 'G', 'H', 'I', 'J'] (the answer/gold)
    Will find the following N-GRAMS in the training data:
        'F G H I'
        'G H I J'
        + IF find_query_ngrams:
            'A B C D'
            'B C D E'
        + IF find_overlap_ngrams:
            'C D E F'
            'D E F G'
            'E F G H'
    """

    n_grams: int = 12
    find_query_ngrams: bool = False  # enable to also check for matches in n-grams containing only the input/prompt
    find_overlap_ngrams: bool = True  # will also find matches for n-grams containing BOTH input and query

    # for math we do not do number normalization
    norm_config: TextNormConfig = field(default_factory=TextNormConfig)
    hash_config: HashConfig = field(default_factory=HashConfig)


class NGramsDecontIndexer(PipelineStep):
    """
    Creates a decontamination index (basically a list of uint64 hashes from ngrams) for each reference task.
    Ways to provide task data:
        - as input documents from the previous pipeline step with "text=label/correct answer"
          and metadata={"query": query/prompt/input, "task": task name}
        - as `task_dict`: a mapping of task name -> iterable of {"query": ..., "label": ...} dicts

    NOTE(@fan): the lighteval "suite|task" loading supported by upstream datatrove is deprecated
    in megamath; only documents / `task_dict` are used here.
    """

    type = "🦠 - DECONT"
    name = "💥 N-grams build index"

    def __init__(
        self,
        output_folder: DataFolderLike,
        config: NGramsDecontConfig | None = None,
        task_dict: dict | None = None,
        language: str = Languages.english,
    ):
        super().__init__()
        self.output_folder = get_datafolder(output_folder)
        self.config = config or NGramsDecontConfig()
        self.tokenizer = load_word_tokenizer(language)
        self.hash_func = create_hash_func(self.config.hash_config)
        self.task_dict = task_dict

    def compute_hashes(self, label: str, query: str | None = None) -> list[int]:
        """Return hashes of every n-gram to index for one (query, label) pair.

        Tokenization is a plain whitespace split of the simplified text — the
        word_tokenize variant was deliberately dropped (see git history).
        """
        label_tokens = simplify_text(label, self.config.norm_config).lower().split()

        ngrams_to_compute = list(ngrams(label_tokens, self.config.n_grams))
        if query is not None:
            query_tokens = simplify_text(query, self.config.norm_config).lower().split()
            if self.config.find_query_ngrams:
                ngrams_to_compute.extend(ngrams(query_tokens, self.config.n_grams))
            if self.config.find_overlap_ngrams:
                # add n-grams straddling the query/label boundary:
                # A, B, C, D, E | F, G, H, I, J   with 5-grams gives
                # B, C, D, E, F   (-N + 1 + i:) + (:i + 1)
                # ...
                # E, F, G, H, I
                ngrams_to_compute.extend(
                    [
                        query_tokens[-self.config.n_grams + 1 + i :] + label_tokens[: i + 1]
                        for i in range(self.config.n_grams - 1)
                        # make sure we actually get a list of size N
                        if len(query_tokens) >= self.config.n_grams - 1 - i and len(label_tokens) >= i + 1
                    ]
                )
        return list(map(self.hash_func, map(" ".join, ngrams_to_compute)))

    def run(self, data: DocumentsPipeline = None, rank: int = 0, world_size: int = 1):
        """Collect per-task n-gram hashes and save one `<task>.index.hashes` file each.

        Raises:
            ValueError: if run with more than one worker, or if a document lacks a
                "query" metadata field while find_query_ngrams is False.
        """
        if world_size != 1:
            raise ValueError("Decontamination index building requires a single worker.")
        hashes = defaultdict(set)
        # documents from a previous step are parsed as:
        #   doc.text -> label;  doc.metadata["query"] -> query;  doc.metadata["task"] -> task name
        if data:
            for doc in data:
                if not self.config.find_query_ngrams and "query" not in doc.metadata:
                    raise ValueError(
                        "only_label_ngrams is False but could not find 'query' field in documents metadata"
                    )
                hashes[doc.metadata.get("task", "input")].update(
                    self.compute_hashes(doc.text, doc.metadata.get("query", None))
                )

        # BUGFIX: task_dict defaults to None; guard instead of crashing on None.items()
        if self.task_dict:
            for task_name, task in self.task_dict.items():
                for eval_doc in task:
                    try:
                        golds = eval_doc["label"] if isinstance(eval_doc["label"], list) else [eval_doc["label"]]
                        query = eval_doc["query"]
                    except Exception as e:
                        logger.warning(f"Error while fetching doc data: {e}")
                        continue
                    for gold in golds:
                        hashes[task_name].update(self.compute_hashes(gold, query))

        for task_name, task_hashes in hashes.items():
            hashes_array = np.array(list(task_hashes), dtype=self.config.hash_config.np_descr)
            logger.info(f"Saving {len(task_hashes)} hashes for {task_name}")
            with self.output_folder.open(f"{task_name.replace(' ', '_')}.index.hashes", mode="wb") as f:
                if self.output_folder.is_local():
                    hashes_array.tofile(f)
                else:
                    f.write(hashes_array.tobytes())


class NGramsDecontFilter(BaseFilter):
    """
    Loads the list of hashes created by the Indexer step.
    For each document in the block's input, we check if any of its n-grams are part of the
    reference eval tasks. If so, the document is removed; the contaminated n-gram and the
    task where it was found are saved in the removed document's metadata.
    """

    type = "🦠 - DECONT"
    name = "💥 N-grams decontaminate"

    def __init__(
        self,
        index_folder: DataFolderLike,
        config: NGramsDecontConfig | None = None,
        exclusion_writer: DiskWriter = None,
        language: str = Languages.english,
    ):
        super().__init__()
        self.index_folder = get_datafolder(index_folder)
        self.config = config or NGramsDecontConfig()
        self.exclusion_writer = exclusion_writer
        self.language = language
        self._index_hashes = None  # lazily loaded on first filter() call
        self.hash_func = create_hash_func(self.config.hash_config)
        self.tokenizer = load_word_tokenizer(language)

    def load_index_hashes(self):
        """Load every `*.index.hashes` file into one {hash: task_name} lookup dict."""

        def load_index_from_file(file):
            with self.index_folder.open(file, mode="rb") as f:
                return file, read_np_from_file(
                    f, np.dtype(self.config.hash_config.np_descr), self.index_folder.is_local()
                ).tolist()

        with ThreadPoolExecutor() as pool:
            hashes = pool.map(load_index_from_file, self.index_folder.list_files(glob_pattern="**/*.index.hashes"))

        self._index_hashes = {}
        for filename, hashlist in hashes:
            taskname = filename.removesuffix(".index.hashes")
            logger.info(f"Loading {len(hashlist)} hashes for {taskname}")
            for hash in hashlist:
                self._index_hashes[hash] = taskname

    def filter(self, doc: Document) -> bool | Tuple[bool, str]:
        """Return True to keep `doc`, or (False, "contaminated") to drop it."""
        if self._index_hashes is None:
            self.load_index_hashes()

        text_tokens = simplify_text(doc.text, self.config.norm_config).lower().split()
        ngrams_to_compute = list(ngrams(text_tokens, self.config.n_grams))

        # @fan: n-grams that are empty once punctuation is stripped are meaningless matches.
        # BUGFIX: build a *copy* of the norm config — the previous code aliased
        # self.config.norm_config and permanently flipped remove_punctuation=True on it,
        # silently changing how every subsequent document was normalized above.
        punc_check_config = replace(self.config.norm_config, remove_punctuation=True)

        for n_gram in map(" ".join, ngrams_to_compute):
            # BUGFIX: skip only this punctuation-empty n-gram; the previous `return True`
            # kept the whole document without checking any of its remaining n-grams.
            if simplify_text(n_gram, punc_check_config) == "":
                continue
            task = self._index_hashes.get(self.hash_func(n_gram), None)
            if task is not None:
                doc.metadata["contaminated_ngram"] = n_gram
                doc.metadata["contaminated_task"] = task
                self.stat_update(f"contaminated_{task}")
                if ":" in task:
                    self.stat_update(f"contaminated_tg_{task[:task.index(':')]}")
                return False, "contaminated"
        return True
import json

# Root folder holding the per-benchmark JSONL dumps.
_DATA_DIR = "./utils/decont_utils/data"


def _load_jsonl(name):
    """Read ``<_DATA_DIR>/<name>.jsonl`` into a list of dicts (one per line)."""
    with open(f"{_DATA_DIR}/{name}.jsonl", "r") as f:
        return [json.loads(line) for line in f]


# Each benchmark is loaded into <name>_raw (the parsed JSONL records) and
# <name>_tasks (a list of {"query", "label"} dicts used for decontamination).
# The key names differ per benchmark, so the mappings stay explicit below.

asdiv_raw = _load_jsonl("asdiv")
asdiv_tasks = [
    {"query": f"{item['body']}", "label": f"{item['answer']}"}
    for item in asdiv_raw
]

gsm8k_raw = _load_jsonl("gsm8k")
gsm8k_tasks = [
    {"query": f"{item['question']}", "label": f"{item['cot']} {item['answer']}"}
    for item in gsm8k_raw
]

math_raw = _load_jsonl("math")
math_tasks = [
    {"query": f"{item['problem']}", "label": f"{item['solution']}"}
    for item in math_raw
]

mathqa_raw = _load_jsonl("mathqa")
mathqa_tasks = [
    {
        "query": f"{item['problem']} {item['options']}",
        # rationale strings are stored with surrounding quote characters
        "label": f"{item['rationale']}"[1:-1],
    }
    for item in mathqa_raw
]

mawps_raw = _load_jsonl("mawps")
mawps_tasks = [
    {"query": f"{item['input']}", "label": f"{item['target']}"}
    for item in mawps_raw
]

mmlu_stem_raw = _load_jsonl("mmlu_stem")
mmlu_stem_tasks = [
    {
        "query": f"{item['question']}",
        "label": (
            f"A: {item['options'][0]}"
            f" B: {item['options'][1]}"
            f" C: {item['options'][2]}"
            f" D: {item['options'][3]}"
            f" Answer: {item['answer']}"
        ),
    }
    for item in mmlu_stem_raw
]

ocw_raw = _load_jsonl("ocw")
ocw_tasks = [
    {"query": f"{item['problem']}", "label": f"{item['solution']} {item['answer']}"}
    for item in ocw_raw
]

sat_raw = _load_jsonl("sat")
sat_tasks = [
    # NOTE: 'Answer' is capitalized in the sat dump
    {"query": f"{item['question']}", "label": f"{item['options']} {item['Answer']}"}
    for item in sat_raw
]

svamp_raw = _load_jsonl("svamp")
svamp_tasks = [
    # NOTE: svamp uses capitalized keys
    {"query": f"{item['Body']} {item['Question']}", "label": f"{item['Answer']}"}
    for item in svamp_raw
]

aime24_raw = _load_jsonl("aime24")
aime24_tasks = [
    {"query": f"{item['problem']}", "label": f"{item['solution']}"}
    for item in aime24_raw
]

aime25_raw = _load_jsonl("aime25")
aime25_tasks = [
    # aime25 stores only final answers, not full solutions
    {"query": f"{item['problem']}", "label": f"{item['answer']}"}
    for item in aime25_raw
]

amc_raw = _load_jsonl("amc")
amc_tasks = [
    {"query": f"{item['problem']}", "label": f"{item['answer']}"}
    for item in amc_raw
]

# Benchmark name -> list of {"query", "label"} task dicts.
TASK_DATASETS = {
    "asdiv": asdiv_tasks,
    "gsm8k": gsm8k_tasks,
    "math": math_tasks,
    "mathqa": mathqa_tasks,
    "mawps": mawps_tasks,
    "mmlu_stem": mmlu_stem_tasks,
    "ocw": ocw_tasks,
    "sat": sat_tasks,
    "svamp": svamp_tasks,
    "aime24": aime24_tasks,
    "aime25": aime25_tasks,
    "amc": amc_tasks,
}


if __name__ == "__main__":
    # Quick sanity check: print sizes and the first example of each dataset.
    for key, value in TASK_DATASETS.items():
        print(key, len(value))
        print(f">>>[Query] {value[0]['query']}")
        print(f">>>[Label] {value[0]['label']}")
        print("-" * 10 + "\n")
import csv
import gzip
import json
import os
from dataclasses import asdict


def download_from_cc(
    object_key, bucket_name="commoncrawl", local_root_path="./crawl-data/"
):
    """Download one Common Crawl object to ``local_root_path`` and return the
    local file path, or ``None`` on any failure.

    ``bucket_name`` is kept for interface compatibility; the download goes
    through the public HTTPS endpoint rather than S3.
    """
    # Lazy import: requests is only needed for downloads, so the rest of the
    # module stays importable without the dependency installed.
    import requests

    local_file = local_root_path + object_key
    local_file_path = os.path.dirname(local_file)
    print(f"Downloading {object_key} to {local_file}")
    if not os.path.exists(local_file_path):
        os.makedirs(local_file_path, exist_ok=True)
    try:
        PREFIX = "https://data.commoncrawl.org/"
        url = PREFIX + object_key
        response = requests.get(url, stream=True)
        response.raise_for_status()
        with open(local_file, "wb") as file:
            # Stream in 1 MiB chunks: reading response.content would defeat
            # stream=True and hold the whole (often multi-GB) WARC in memory.
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                file.write(chunk)
        print(f"Successfully downloaded {object_key} to {local_file}")
        return local_file
    except Exception as e:
        # best-effort: report and let the caller retry/skip
        print(f"An error occurred: {str(e)}")
        return None


def write_to_jsonlgz(data, output_file):
    """Append ``data`` (a list of JSON-serializable items) to ``output_file``
    as gzip-compressed JSON Lines, one item per line."""
    print(f"Writing {len(data)} documents into {output_file} ...")
    with gzip.open(output_file, "at", encoding="utf-8") as gz_file:
        gz_file.write("\n".join(json.dumps(item) for item in data) + "\n")


def delete_local_files(to_delete_files):
    """Remove each path in ``to_delete_files``; missing files are reported,
    never raised."""
    for file_to_remove in to_delete_files:
        try:
            os.remove(file_to_remove)
            print(f"File '{file_to_remove}' has been successfully removed.")
        except FileNotFoundError:
            print(f"File '{file_to_remove}' not found.")
        except Exception as e:
            print(f"An error occurred: {str(e)}")


def make_dir(file_name):
    """Ensure the parent directory of ``file_name`` exists."""
    file_path = os.path.dirname(file_name)
    print(f"Making directory: {file_path}")
    if not os.path.exists(file_path):
        os.makedirs(file_path, exist_ok=True)


def remove_file(file_name):
    """Delete ``file_name`` if it is an existing regular file (used to clear
    partially-processed output before a retry)."""
    if os.path.isfile(file_name):
        os.remove(file_name)
        print(f"Remove halfly-processed file: {file_name}")


def write_stat(stat_file, statistics, input_file, FIELD_NAMES):
    """Append ``statistics`` (a dataclass instance) as one CSV row to
    ``stat_file``, writing the header first when the file does not yet exist.

    The header decision is taken once before opening, replacing the previous
    duplicated open/write branches.
    """
    make_dir(stat_file)
    print(f"Writing {str(input_file)} into {stat_file}")
    write_header = not os.path.exists(stat_file)
    with open(stat_file, mode="a", newline="") as file:
        writer = csv.DictWriter(file, fieldnames=FIELD_NAMES)
        if write_header:
            writer.writeheader()
        writer.writerow(asdict(statistics))
parent.replace_child(new_span, math_elem) 56 | 57 | def clean_mathml(mathml_block): 58 | if "oldsymbol{" in mathml_block and "boldsymbol{" not in mathml_block: 59 | mathml_block = mathml_block.replace("oldsymbol", "\\boldsymbol") 60 | mathml_block = re.sub(r"<\?xml[^>]+\?>\s*", "", mathml_block) 61 | if 'xmlns="http://www.w3.org/1998/Math/MathML"' not in mathml_block: 62 | mathml_block = mathml_block.replace( 63 | ">>>>>>processing by katex mathml") 155 | parent.replace_child(mathml_elem, katex_elem) 156 | return 157 | 158 | for html_katex in katex_elem.get_elements_by_class_name("katex-html"): 159 | parent = katex_elem.parent 160 | if not parent: 161 | continue 162 | 163 | math_text = parse_katex_html(html_katex) 164 | 165 | math_text = math_text.replace("\u200b", " ") 166 | 167 | if math_text.strip() == "": 168 | continue 169 | print(">>>>>>>processing by katex html") 170 | 171 | new_span = tree.create_element("span") 172 | new_span["class"] = "math-text" 173 | new_span.text = f"${math_text}$" 174 | parent.replace_child(new_span, katex_elem) 175 | 176 | def process_math_html_entities(tree): 177 | # replace the math symbols 178 | # replacements = { 179 | # "−": "-", 180 | # "√": "\\sqrt", 181 | # "γ": "\\gamma", 182 | # } 183 | def start_callback(context): 184 | node = context.node 185 | 186 | def replace_sub_sup_tag(node): 187 | if node.tag == "sup" and not node.text.startswith("^{"): 188 | node.text = f"^{{{node.text}}}" 189 | elif node.tag == "sub" and not node.text.startswith("_{"): 190 | node.text = f"_{{{node.text}}}" 191 | 192 | for child in node.child_nodes: 193 | replace_sub_sup_tag(child) # directly process the child nodes 194 | 195 | return node 196 | 197 | node = replace_sub_sup_tag(node) 198 | 199 | # replace the tag 200 | if node.tag == "span" and "intbl" in node.getattr("class", ""): 201 | numerator = node.get_elements_by_tag_name("em") 202 | 203 | denominator = node.get_elements_by_tag_name("strong") 204 | if numerator and denominator: 205 | # 
print("=="*10) 206 | node.text = ( 207 | f"\\frac{{{numerator[0].text}}}{{{denominator[0].text}}}" 208 | ) 209 | # print(node.text) 210 | 211 | # traverse the DOM tree from the body 212 | traverse_dom( 213 | tree.body, 214 | start_callback=start_callback, 215 | elements_only=False, # traverse all nodes, including text nodes 216 | ) 217 | 218 | # return the modified DOM tree 219 | return tree 220 | 221 | if tree.body is not None: 222 | # process the KaTeX container 223 | for katex_elem in tree.body.get_elements_by_class_name("katex"): 224 | process_katex_container(katex_elem) 225 | 226 | for math_elem in tree.body.get_elements_by_tag_name("math"): 227 | process_math_element(math_elem) 228 | 229 | # for key, value in MATH_HTML_ENTITIES_REPLACEMENTS.items(): 230 | # if key in tree.body.html: 231 | # print("=="*10) 232 | # print(f"{key} -> {value}") 233 | # tree.body.html = tree.body.html.replace(key, value) 234 | tree = process_math_html_entities(tree) 235 | else: 236 | print("Warning: The HTML document has no body.") 237 | 238 | return str(tree) 239 | 240 | 241 | def improve_latex_content_parsing_with_timeout(html_doc): 242 | with mem_guard( 243 | max_memory=1024 * 1024 * 4, interrupt_type=InterruptType.exception 244 | ): # 4GB limit 245 | with time_guard( 246 | timeout=0.1, interrupt_type=InterruptType.exception 247 | ) as guard: # 1 second timeout 248 | try: 249 | return improve_latex_content_parsing(html_doc) 250 | except ExecutionTimeout: 251 | sys.stderr.write("Timeout! Returning original HTML.\n") 252 | sys.stderr.flush() 253 | return html_doc 254 | except MemoryLimitExceeded: 255 | sys.stderr.write("Memory limit exceeded! 
import os
import re
import unicodedata

import fasttext
from fasttext.FastText import _FastText
from nltk.tokenize import wordpunct_tokenize

LABEL_PREFIX = "__label__"
LABEL_MATH = f"{LABEL_PREFIX}MATH"
LABEL_NON_MATH = f"{LABEL_PREFIX}NON_MATH"


def normalization(text):
    """Lowercase/tokenize ``text`` and strip characters the fastText model
    should not see (punctuation, digits, overlong tokens)."""
    tokens = wordpunct_tokenize(text)

    processed_tokens = []
    for token in tokens:
        token = token.lower()

        if token.isdigit():
            # NOTE(review): digit tokens are reduced to an empty placeholder;
            # this looks like it may originally have been a special token such
            # as "<NUM>" — confirm against the classifier's training
            # preprocessing before changing it.
            processed_tokens.append("")
        elif len(token) <= 100:
            # drop overly long tokens (e.g. base64/URL blobs)
            processed_tokens.append(token)

    preprocessed_text = " ".join(processed_tokens)

    preprocessed_text = re.sub(r"[\n\r]+", " ", preprocessed_text)
    preprocessed_text = re.sub(r"[-_]+", " ", preprocessed_text)
    preprocessed_text = re.sub(r"[^a-zA-Z0-9\s]", "", preprocessed_text)
    preprocessed_text = re.sub(r"\s+", " ", preprocessed_text).strip()

    return preprocessed_text


def preprocess_for_fasttext(text):
    """Prepare raw ``text`` for ``model.predict``: decode bytes, NFKC-normalize,
    flatten all whitespace, apply :func:`normalization`, and re-chunk overlong
    lines so fastText's per-line token limit is never exceeded."""
    if isinstance(text, bytes):
        text = text.decode("utf-8")

    text = unicodedata.normalize("NFKC", text)

    text = re.sub(r"\s", " ", text)

    text = text.replace("\n", " ")

    text = re.sub(r"\s+", " ", text)

    text = normalization(text)

    MAX_LINE_SIZE = 1024
    # Bug fix: the previous code called text.split("") which unconditionally
    # raises ValueError ("empty separator"), so this function could never
    # complete. All newlines were already collapsed to spaces above, so
    # splitting on "\n" yields a single line; the split is kept for safety
    # should the normalization above change.
    lines = text.split("\n")
    processed_lines = []
    for line in lines:
        tokens = line.split()
        if len(tokens) > MAX_LINE_SIZE:
            # break an overlong line into MAX_LINE_SIZE-token chunks
            processed_lines.extend(
                " ".join(tokens[i : i + MAX_LINE_SIZE])
                for i in range(0, len(tokens), MAX_LINE_SIZE)
            )
        else:
            processed_lines.append(line)

    text = " ".join(processed_lines)

    return text.strip()


class MathFastTextClassifier:
    """Recall-oriented fastText classifier that flags math-related documents."""

    name = "+ - * ÷ Math FastText Recall"
    _requires_dependencies = [("fasttext", "fasttext-wheel"), "fasteners"]

    def __init__(
        self,
        model_path: str | None = None,
        math_threshold: float = 0.95,
        math_class_name: str | None = None,
    ):
        assert model_path is not None, "please specify the model path"
        self.model_path = model_path
        self.model = fasttext.load_model(model_path)
        self.math_threshold = math_threshold
        # fastText label (e.g. "__label__MATH") whose probability we read
        self.math_class_name = math_class_name

    def __getstate__(self):
        """Custom pickling method to avoid pickling the FastText model directly."""
        state = self.__dict__.copy()
        # Remove the model from the state to avoid pickling issues
        state["model"] = None
        return state

    def __setstate__(self, state):
        """Custom unpickling method to reload the FastText model."""
        self.__dict__.update(state)
        # Reload the model after unpickling
        self.model = fasttext.load_model(self.model_path)

    def predict(self, text: str) -> tuple[bool, float]:
        """Return ``(is_math, math_score)`` for ``text``.

        ``is_math`` is True when the probability of ``math_class_name``
        reaches ``math_threshold``. (Annotation fixed: the previous ``-> bool``
        did not match the two-element tuple actually returned.)
        """
        class_tuples, prob_tuples = self.model.predict(
            preprocess_for_fasttext(
                text,
            ),
            k=-1,
        )

        assert len(class_tuples) == len(prob_tuples)
        math_score = 0
        for class_name, prob in zip(class_tuples, prob_tuples):
            if class_name == self.math_class_name:
                math_score = prob
                break

        return math_score >= self.math_threshold, math_score