├── .gitattributes
├── .gitignore
├── LICENSE
├── README.md
├── assets
│   ├── logo.png
│   └── teasor.png
├── code_pipeline
│   └── .keep
└── web_pipeline
    ├── README.md
    ├── download
    │   ├── download.md
    │   └── process_listings
    │       ├── download_cc_list.py
    │       ├── split_listing.py
    │       ├── split_listing_for_hash_generation.sh
    │       ├── split_listing_for_local_deduplication.sh
    │       ├── split_listing_for_quality_filtering.sh
    │       ├── split_listing_for_re-organizing_data_merge.sh
    │       ├── split_listing_for_re-organizing_data_split.sh
    │       └── split_listing_for_text_extraction.sh
    ├── mathml2latex
    │   ├── LICENSE
    │   ├── README.md
    │   ├── input.md
    │   ├── mathml2latex.py
    │   ├── mmltex
    │   │   ├── README
    │   │   ├── README2
    │   │   ├── cmarkup.xsl
    │   │   ├── entities.xsl
    │   │   ├── glayout.xsl
    │   │   ├── mmltex.xsl
    │   │   ├── scripts.xsl
    │   │   ├── tables.xsl
    │   │   └── tokens.xsl
    │   ├── output.md
    │   ├── output.png
    │   └── unicode_map.py
    ├── requirements.txt
    ├── stage1_download_and_extract.py
    ├── url_filtering
    │   ├── url_filter.py
    │   └── urls
    │       ├── adult
    │       │   ├── domains
    │       │   ├── expressions
    │       │   ├── urls
    │       │   └── usage
    │       ├── blocklist
    │       │   ├── adult.tar.gz
    │       │   ├── adult
    │       │   │   ├── domains
    │       │   │   ├── expressions
    │       │   │   ├── urls
    │       │   │   └── usage
    │       │   ├── agressif.tar.gz
    │       │   ├── agressif
    │       │   │   ├── domains
    │       │   │   ├── expressions
    │       │   │   ├── urls
    │       │   │   └── usage
    │       │   ├── arjel.tar.gz
    │       │   ├── arjel
    │       │   │   ├── domains
    │       │   │   ├── urls
    │       │   │   └── usage
    │       │   ├── chat.tar.gz
    │       │   ├── chat
    │       │   │   ├── domains
    │       │   │   ├── urls
    │       │   │   └── usage
    │       │   ├── dating.tar.gz
    │       │   ├── dating
    │       │   │   ├── domains
    │       │   │   ├── urls
    │       │   │   └── usage
    │       │   ├── ddos.tar.gz
    │       │   ├── ddos
    │       │   │   ├── domains
    │       │   │   ├── urls
    │       │   │   └── usage
    │       │   ├── download.sh
    │       │   ├── filehosting.tar.gz
    │       │   ├── filehosting
    │       │   │   ├── domains
    │       │   │   ├── urls
    │       │   │   └── usage
    │       │   ├── gambling.tar.gz
    │       │   ├── gambling
    │       │   │   ├── domains
    │       │   │   ├── urls
    │       │   │   └── usage
    │       │   ├── mixed_adult.tar.gz
    │       │   ├── mixed_adult
    │       │   │   ├── domains
    │       │   │   ├── urls
    │       │   │   └── usage
    │       │   ├── phishing.tar.gz
    │       │   └── phishing
    │       │       ├── domains
    │       │       ├── urls
    │       │       └── usage
    │       ├── curated
    │       │   └── domains
    │       ├── url_blocklist_refinedweb_manual_inspection.csv
    │       └── whitelist
    │           ├── domains
    │           └── urls
    └── utils
        ├── bad_url_words.py
        ├── datatrove_utils.py
        ├── decont_utils
        │   ├── data
        │   │   ├── aime24.jsonl
        │   │   ├── aime25.jsonl
        │   │   ├── amc.jsonl
        │   │   ├── asdiv.jsonl
        │   │   ├── gsm8k.jsonl
        │   │   ├── math.jsonl
        │   │   ├── mathqa.jsonl
        │   │   ├── mawps.jsonl
        │   │   ├── mmlu_stem.jsonl
        │   │   ├── ocw.jsonl
        │   │   ├── sat.jsonl
        │   │   └── svamp.jsonl
        │   ├── datatrove_helper.py
        │   └── downstream_datasets.py
        ├── file_utils.py
        ├── latex_parsing.py
        └── math_fasttext.py
/.gitattributes:
--------------------------------------------------------------------------------
1 | web_pipeline/url_filtering/urls/blocklist/adult/domains filter=lfs diff=lfs merge=lfs -text
2 | web_pipeline/url_filtering/urls/url_blocklist_refinedweb_manual_inspection.csv filter=lfs diff=lfs merge=lfs -text
3 | web_pipeline/url_filtering/urls/adult/domains filter=lfs diff=lfs merge=lfs -text
4 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # UV
98 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | #uv.lock
102 |
103 | # poetry
104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105 | # This is especially recommended for binary packages to ensure reproducibility, and is more
106 | # commonly ignored for libraries.
107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108 | #poetry.lock
109 |
110 | # pdm
111 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
112 | #pdm.lock
113 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
114 | # in version control.
115 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
116 | .pdm.toml
117 | .pdm-python
118 | .pdm-build/
119 |
120 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
121 | __pypackages__/
122 |
123 | # Celery stuff
124 | celerybeat-schedule
125 | celerybeat.pid
126 |
127 | # SageMath parsed files
128 | *.sage.py
129 |
130 | # Environments
131 | .env
132 | .venv
133 | env/
134 | venv/
135 | ENV/
136 | env.bak/
137 | venv.bak/
138 |
139 | # Spyder project settings
140 | .spyderproject
141 | .spyproject
142 |
143 | # Rope project settings
144 | .ropeproject
145 |
146 | # mkdocs documentation
147 | /site
148 |
149 | # mypy
150 | .mypy_cache/
151 | .dmypy.json
152 | dmypy.json
153 |
154 | # Pyre type checker
155 | .pyre/
156 |
157 | # pytype static type analyzer
158 | .pytype/
159 |
160 | # Cython debug symbols
161 | cython_debug/
162 |
163 | # PyCharm
164 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
165 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
166 | # and can be added to the global gitignore or merged into this file. For a more nuclear
167 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
168 | #.idea/
169 |
170 | # Ruff stuff:
171 | .ruff_cache/
172 |
173 | # PyPI configuration file
174 | .pypirc
175 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
# MegaMath: An Open Math Pre-training Dataset with 370B Tokens
3 |
4 | [Dataset on Hugging Face](https://huggingface.co/datasets/LLM360/MegaMath)
5 | [Paper on arXiv](https://arxiv.org/pdf/2504.02807)
6 |
7 |
8 | ## About MegaMath
9 |
10 |
11 |
![MegaMath teaser](assets/teasor.png)
12 |
13 |
14 | MegaMath is a large-scale pre-training dataset for math.
15 | It is curated via the following three efforts:
16 |
17 | - **Revisiting web data**: We re-extracted mathematical documents from Common Crawl using math-oriented HTML optimizations, fasttext-based filtering, and deduplication, all aimed at acquiring higher-quality data from the Internet.
18 | - **Recalling Math-related code data**: We identified high-quality math-related code from Stack-V2, a large code training corpus, further enhancing data diversity.
19 | - **Exploring Synthetic data**: We synthesized QA-style text, math-related code, and interleaved text-code blocks from web data or code data.
20 |
21 | ## How to Use
22 |
23 | MegaMath includes many different data variants, each tailored to different training demands.
24 |
25 | If you are training your LLM from scratch, we recommend using the full set of our web data.
26 | ```python
27 | from huggingface_hub import snapshot_download
28 | snapshot_download(
29 | repo_id="LLM360/MegaMath",
30 | local_dir="./",
31 | repo_type="dataset",
32 | allow_patterns=["megamath-web/*"]
33 | )
34 | ```
35 |
36 | If you are performing continual pre-training from strong base models, **MegaMath-Web-Pro** may be your best choice.
37 | ```python
38 | from huggingface_hub import snapshot_download
39 | snapshot_download(
40 | repo_id="LLM360/MegaMath",
41 | local_dir="./",
42 | repo_type="dataset",
43 | allow_patterns=["megamath-web-pro/*"]
44 | )
45 | ```
46 |
47 | We also provide **MegaMath-Code**, which can enhance your LLM's ability to solve math-related tasks via Python code. Moreover, MegaMath contains over 80B tokens of synthetic data, which can further boost performance on math-related tasks.
48 |
49 | ```python
50 | from huggingface_hub import snapshot_download
51 | snapshot_download(
52 | repo_id="LLM360/MegaMath",
53 | local_dir="./",
54 | repo_type="dataset",
55 | allow_patterns=[
56 | "megamath-qa/*",
57 | "megamath-translated-code/*",
58 | "megamath-text-code-block/*",
59 | "megamath-code/*"
60 | ]
61 | )
62 | ```
63 |
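Alternatively, a minimal sketch of streaming a single subset with the `datasets` library (the `data_dir` value below is illustrative; point it at whichever subset folder you need):

```python
from datasets import load_dataset

# Stream one subset without downloading the whole dataset first.
ds = load_dataset("LLM360/MegaMath", data_dir="megamath-web-pro", split="train", streaming=True)
print(next(iter(ds)))
```
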
64 | ## Data Pipeline
65 |
66 | Please refer to the [web_pipeline](./web_pipeline) folder for more details. We are actively working on the code pipeline and will update the README soon.
67 |
68 |
69 | ## Citation
70 | If you use our dataset or find our work useful, please cite
71 | ```bibtex
72 | @article{zhou2025megamath,
73 | title = {MegaMath: Pushing the Limits of Open Math Corpora},
74 | author = {Zhou, Fan and Wang, Zengzhi and Ranjan, Nikhil and Cheng, Zhoujun and Tang, Liping and He, Guowei and Liu, Zhengzhong and Xing, Eric P.},
75 | journal = {arXiv preprint arXiv:2504.02807},
76 | year = {2025},
77 | note = {Preprint}
78 | }
79 | ```
--------------------------------------------------------------------------------
/assets/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LLM360/MegaMath/52471d65673c784868629e2165470cd05f6c8975/assets/logo.png
--------------------------------------------------------------------------------
/assets/teasor.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LLM360/MegaMath/52471d65673c784868629e2165470cd05f6c8975/assets/teasor.png
--------------------------------------------------------------------------------
/code_pipeline/.keep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LLM360/MegaMath/52471d65673c784868629e2165470cd05f6c8975/code_pipeline/.keep
--------------------------------------------------------------------------------
/web_pipeline/README.md:
--------------------------------------------------------------------------------
1 | # Web Pipeline
2 |
3 | This folder contains the code for the web pipeline.
4 | First, please follow the instructions in [download](./download/download.md) to get all the available WARC file paths.
5 |
6 | ## Stage 1: Download and Extract
7 | This stage downloads the WARC files from Common Crawl and extracts the text and HTML content. Meanwhile, it performs language identification and math text filtering using fasttext models; a sketch of the filtering step follows the command below.
8 |
9 | ```bash
10 | python stage1_download_and_extract.py
11 | ```
12 |
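For reference, a minimal sketch of that filtering, assuming trained fasttext models: `lid.176.bin` is fastText's public language-ID model, while `math_classifier.bin`, its label, and the threshold are hypothetical stand-ins (see `utils/math_fasttext.py` for the actual helpers):

```python
import fasttext

lang_model = fasttext.load_model("lid.176.bin")           # language identification
math_model = fasttext.load_model("math_classifier.bin")   # hypothetical math scorer

def keep_document(text: str) -> bool:
    line = text.replace("\n", " ")  # fasttext predicts on a single line
    (lang,), _ = lang_model.predict(line)
    if lang != "__label__en":
        return False
    (label,), (score,) = math_model.predict(line)
    return label == "__label__math" and score > 0.5  # hypothetical label/threshold
```
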
13 | ## Stage 2: Deduplication
14 |
15 | We mainly follow datatrove's example to perform deduplication.
16 | Please refer to the example code in [datatrove](https://github.com/huggingface/datatrove/blob/main/examples/minhash_deduplication.py) for more details. The majority of the code is the same, but we use a different number of buckets and hashes per bucket (11, 10); see the configuration sketch below.
17 |
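A minimal sketch of that configuration, assuming datatrove's `MinhashConfig` and that (11, 10) means 11 buckets with 10 hash functions each:

```python
# Sketch only: all other stages follow datatrove's linked minhash example;
# this config replaces the example's default signature settings.
from datatrove.pipeline.dedup.minhash import MinhashConfig

# Assumption: (11, 10) = 11 buckets x 10 hashes per bucket.
minhash_config = MinhashConfig(num_buckets=11, hashes_per_bucket=10)
```

Pass this `minhash_config` to the signature, bucket, and cluster stages of the linked example.
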
18 | ## Stage 3: Re-extraction
19 |
20 | TODO
--------------------------------------------------------------------------------
/web_pipeline/download/process_listings/download_cc_list.py:
--------------------------------------------------------------------------------
1 |
2 | import argparse
3 | import os
4 | import wget
5 | import requests
6 | from bs4 import BeautifulSoup
7 |
8 |
9 | def parse_args():
10 | """Parse and return command line arguments."""
11 | parser = argparse.ArgumentParser(description="Download Common Crawl index dumps.")
12 | parser.add_argument('--save_path', type=str, default="./commoncrawlList/",
13 | help="Path to save the downloaded Common Crawl list.")
14 |     parser.add_argument('--skip_existing_dumps', type=lambda s: s.lower() in ('true', '1', 'yes'),
15 |                         default=True, help="Whether to skip dumps already downloaded to 'save_path'.")  # type=bool would treat any non-empty string, even "False", as True
16 | return parser.parse_args()
17 |
18 |
19 | def get_available_dumps(url):
20 | """Fetch and parse the web page to list available Common Crawl dumps."""
21 | response = requests.get(url)
22 | html_content = response.text
23 | soup = BeautifulSoup(html_content, 'html.parser')
24 | return soup.find_all('a', attrs={'class': "crawl-link w-inline-block"})
25 |
26 |
27 | def main():
28 | args = parse_args()
29 |
30 | # Ensure the directory exists where the dumps will be stored.
31 | if not os.path.exists(args.save_path):
32 | os.makedirs(args.save_path)
33 |
34 | # Get all the available dumps from Common Crawl's start page.
35 | url = 'https://commoncrawl.org/get-started'
36 | dump_links = get_available_dumps(url)
37 |
38 | # Prepare to track already downloaded dumps if skipping is enabled.
39 | existing_dumps = set(os.listdir(args.save_path)) if args.skip_existing_dumps else set()
40 |
41 | # Dumps to skip due to different file formats which are not supported for now.
42 | skip_list = {'CC-MAIN-2012', 'CC-MAIN-2009-2010', 'CC-MAIN-2008-2009'}
43 |
44 | # File to record names of newly downloaded dumps.
45 | with open(os.path.join(args.save_path, 'dumplist.txt'), 'w') as dump_file:
46 | for link in dump_links:
47 | dump_url = link.get('href')
48 | dump_name = dump_url.split('/')[-2] # Format: 'CC-MAIN-2024-30'
49 |
50 | # Skip dumps either in skip list or already downloaded.
51 | if dump_name in skip_list or dump_name in existing_dumps:
52 | continue
53 |
54 | # Construct download URL and local save path.
55 | dump_list_url = dump_url.split('index.html')[0] + 'warc.paths.gz'
56 | dump_save_path = os.path.join(args.save_path, dump_name)
57 |
58 | # Ensure dump directory exists and download the dump.
59 | if not os.path.exists(dump_save_path):
60 | os.makedirs(dump_save_path)
61 | wget.download(dump_list_url, out=dump_save_path)
62 | print(f"\n Successfully downloaded {dump_name}")
63 | dump_file.write(dump_name + '\n')
64 |
65 |
66 | if __name__ == '__main__':
67 | main()
68 |
--------------------------------------------------------------------------------
/web_pipeline/download/process_listings/split_listing.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import gzip
3 | import io
4 | import os
5 | import random
6 |
7 | def start_split(dump_file_name_paths, files_num, store_dir, shuffle, consecutive):
8 | """
9 | Splits the dump file listings into multiple files, either randomly or consecutively,
10 | depending on the flags provided.
11 |
12 | Args:
13 | dump_file_name_paths (list of str): Paths to the input dump file listings.
14 | files_num (int): Number of output files to generate.
15 | store_dir (str): Directory to store the output split files.
16 | shuffle (bool): Whether to shuffle the records before splitting.
17 | consecutive (bool): If True, store records consecutively across output files.
18 | """
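    # Worked example (illustrative, not from the repo): 6 records, files_num=2
    #   consecutive=True  -> file 0 gets records 0,1,2; file 1 gets records 3,4,5
    #   consecutive=False -> file 0 gets records 0,2,4; file 1 gets records 1,3,5
    #                        (strided slicing: file_name_list[i::files_num])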
19 | file_name_list = []
20 |
21 | # Load files and extend file name list accordingly
22 | for dump_file_name_path in dump_file_name_paths:
23 | if dump_file_name_path.endswith(".txt"):
24 | with open(dump_file_name_path) as records:
25 | file_name_list.extend(records)
26 | elif dump_file_name_path.endswith(".gz"):
27 | with gzip.open(dump_file_name_path, "rb") as stream:
28 | records = io.TextIOWrapper(stream, encoding="utf-8")
29 | file_name_list.extend(records)
30 |
31 | # Optionally shuffle the file name list
32 | if shuffle:
33 | random.shuffle(file_name_list)
34 |
35 | print(f"Total records: {len(file_name_list)}")
36 |
37 | # Ensure the storage directory exists
38 | if not os.path.exists(store_dir) and store_dir.endswith("/"):
39 | os.makedirs(store_dir, exist_ok=True)
40 | elif not os.path.exists(os.path.dirname(store_dir)):
41 | os.makedirs(os.path.dirname(store_dir), exist_ok=True)
42 |
43 | if consecutive:
44 | start_index = 0 # Initial index for consecutive file writing
45 | file_lines = len(file_name_list)
46 | base_lines_per_file = file_lines // files_num
47 | extra_lines = file_lines % files_num
48 |
49 | # Write records to output files
50 | for i in range(files_num):
51 | output_file = os.path.join(store_dir, f"Split_{i:03d}.txt") if store_dir.endswith("/") \
52 | else f"{store_dir}.Split_{i:03d}.txt"
53 | with open(output_file, "w") as f:
54 | if consecutive:
55 | # Determine the number of lines this file should get
56 | lines_for_this_file = base_lines_per_file + (1 if i < extra_lines else 0)
57 | end_index = start_index + lines_for_this_file
58 |
59 | # Write the designated slice of records to the file
60 | f.writelines(file_name_list[start_index:end_index])
61 |
62 | # Update the start index for the next file
63 | start_index = end_index
64 | else:
65 | # Distribute records skipping files_num indices for each record
66 | f.writelines(file_name_list[i::files_num])
67 |
68 | if __name__ == '__main__':
69 | parser = argparse.ArgumentParser(description="Split dump file listings into multiple files.")
70 | parser.add_argument('--file_path', nargs='+', help='Paths to the dump file listings')
71 | parser.add_argument('--files_num', type=int, default=99, help='Number of output files to generate')
72 | parser.add_argument('--store_dir', type=str, default='./split_files', help='Output directory for split files')
73 |     parser.add_argument('--shuffle', type=lambda s: s.lower() in ('true', '1', 'yes'), default=False, help='Shuffle the records before splitting')  # type=bool would treat any non-empty string, even "False", as True
74 |     parser.add_argument('--consecutive', type=lambda s: s.lower() in ('true', '1', 'yes'), default=False, help='Store consecutive listings in one file')
75 |
76 | args = parser.parse_args()
77 | start_split(args.file_path, args.files_num, args.store_dir, args.shuffle, args.consecutive)
78 |
--------------------------------------------------------------------------------
/web_pipeline/download/process_listings/split_listing_for_hash_generation.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Initialize an array to store the names of new dumps
4 | declare -a ALL_DUMPS
5 |
6 | # Read each line from the dumplist.txt file and append to the ALL_DUMPS array
7 | # This file contains a list of new dump directories
8 | while IFS= read -r line; do
9 | ALL_DUMPS+=("$line")
10 | done < "commoncrawlList/dumplist.txt"
11 |
12 | for dump_date in "${ALL_DUMPS[@]}"; do
13 | all_file_paths=()
14 | file_path="commoncrawlList/$dump_date/warc.paths.gz"
15 | all_file_paths+=("$file_path")
16 |
17 | # Execute a Python script to process the listed WARC file paths
18 | # --files_num: Specifies the number of output files to generate
19 | # --store_dir: Defines the directory where the split listings will be stored
20 | # --file_path: Passes the array of WARC file paths to the Python script
21 | # --shuffle: Enables shuffling of the file paths before processing
22 | echo $file_path
23 | python3 split_listing.py \
24 | --files_num 20 \
25 | --store_dir ../../listings/hash_generation/run-1/$dump_date \
26 | --file_path "${all_file_paths[@]}" \
27 | --shuffle True
28 | done
--------------------------------------------------------------------------------
/web_pipeline/download/process_listings/split_listing_for_local_deduplication.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Initialize an array to store the names of new dumps
4 | declare -a ALL_DUMPS
5 |
6 | # Read each line from the dumplist.txt file and append to the ALL_DUMPS array
7 | # This file contains a list of new dump directories
8 | while IFS= read -r line; do
9 | ALL_DUMPS+=("$line")
10 | done < "commoncrawlList/dumplist.txt"
11 |
12 | for dump_date in "${ALL_DUMPS[@]}"; do
13 | all_file_paths=()
14 | file_path="commoncrawlList/$dump_date/warc.paths.gz"
15 | all_file_paths+=("$file_path")
16 |
17 | # Execute a Python script to process the listed WARC file paths
18 | # --files_num: Specifies the number of output files to generate
19 | # --store_dir: Defines the directory where the split listings will be stored
20 | # --file_path: Passes the array of WARC file paths to the Python script
21 | # --consecutive: Enables consecutive listings of the file paths
22 | echo $file_path
23 | python3 split_listing.py \
24 | --files_num 70 \
25 | --store_dir ../../listings/local_deduplication/run-1/$dump_date \
26 | --file_path "${all_file_paths[@]}" \
27 | --consecutive True
28 | done
--------------------------------------------------------------------------------
/web_pipeline/download/process_listings/split_listing_for_quality_filtering.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Initialize an array to store the names of new dumps
4 | declare -a ALL_DUMPS
5 |
6 | # Read each line from the dumplist.txt file and append to the ALL_DUMPS array
7 | # This file contains a list of new dump directories
8 | while IFS= read -r line; do
9 | ALL_DUMPS+=("$line")
10 | done < "commoncrawlList/dumplist.txt"
11 |
12 | for dump_date in "${ALL_DUMPS[@]}"; do
13 | all_file_paths=()
14 | file_path="commoncrawlList/$dump_date/warc.paths.gz"
15 | all_file_paths+=("$file_path")
16 |
17 | # Execute a Python script to process the listed WARC file paths
18 | # --files_num: Specifies the number of output files to generate
19 | # --store_dir: Defines the directory where the split listings will be stored
20 | # --file_path: Passes the array of WARC file paths to the Python script
21 | # --shuffle: Enables shuffling file paths in the listings
22 | echo $file_path
23 | python3 split_listing.py \
24 | --files_num 20 \
25 | --store_dir ../../listings/quality_filtering/run-1/${dump_date#CC-MAIN-} \
26 | --file_path "${all_file_paths[@]}" \
27 | --shuffle True
28 | done
--------------------------------------------------------------------------------
/web_pipeline/download/process_listings/split_listing_for_re-organizing_data_merge.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Initialize an array to store the names of new dumps
4 | declare -a ALL_DUMPS
5 |
6 | # Read each line from the dumplist.txt file and append to the ALL_DUMPS array
7 | # This file contains a list of new dump directories
8 | while IFS= read -r line; do
9 | ALL_DUMPS+=("$line")
10 | done < "commoncrawlList/dumplist.txt"
11 |
12 | for dump_date in "${ALL_DUMPS[@]}"; do
13 | all_file_paths=()
14 | file_path="commoncrawlList/$dump_date/warc.paths.gz"
15 | all_file_paths+=("$file_path")
16 |
17 | # Execute a Python script to process the listed WARC file paths
18 | # --files_num: Specifies the number of output files to generate
19 | # --store_dir: Defines the directory where the split listings will be stored
20 | # --file_path: Passes the array of WARC file paths to the Python script
21 | # --shuffle: Enables shuffling file paths in the listings
22 | echo $file_path
23 | python3 split_listing.py \
24 | --files_num 10 \
25 | --store_dir ../../listings/re-organizing_data/merge/run-1/1-1/${dump_date#CC-MAIN-} \
26 | --file_path "${all_file_paths[@]}" \
27 | --shuffle True
28 | done
29 |
30 |
31 | for dump_date in "${ALL_DUMPS[@]}"; do
32 | all_file_paths=()
33 | file_path="commoncrawlList/$dump_date/warc.paths.gz"
34 | all_file_paths+=("$file_path")
35 |
36 | # Execute a Python script to process the listed WARC file paths
37 | # --files_num: Specifies the number of output files to generate
38 | # --store_dir: Defines the directory where the split listings will be stored
39 | # --file_path: Passes the array of WARC file paths to the Python script
40 | # --shuffle: Enables shuffling file paths in the listings
41 | echo $file_path
42 | python3 split_listing.py \
43 | --files_num 10 \
44 | --store_dir ../../listings/re-organizing_data/merge/run-1/2-5/${dump_date#CC-MAIN-} \
45 | --file_path "${all_file_paths[@]}" \
46 | --shuffle True
47 | done
48 |
49 |
50 | for dump_date in "${ALL_DUMPS[@]}"; do
51 | all_file_paths=()
52 | file_path="commoncrawlList/$dump_date/warc.paths.gz"
53 | all_file_paths+=("$file_path")
54 |
55 | # Execute a Python script to process the listed WARC file paths
56 | # --files_num: Specifies the number of output files to generate
57 | # --store_dir: Defines the directory where the split listings will be stored
58 | # --file_path: Passes the array of WARC file paths to the Python script
59 | # --shuffle: Enables shuffling file paths in the listings
60 | echo $file_path
61 | python3 split_listing.py \
62 | --files_num 7 \
63 | --store_dir ../../listings/re-organizing_data/merge/run-1/6-10/${dump_date#CC-MAIN-} \
64 | --file_path "${all_file_paths[@]}" \
65 | --shuffle True
66 | done
67 |
68 |
69 | for dump_date in "${ALL_DUMPS[@]}"; do
70 | all_file_paths=()
71 | file_path="commoncrawlList/$dump_date/warc.paths.gz"
72 | all_file_paths+=("$file_path")
73 |
74 | # Execute a Python script to process the listed WARC file paths
75 | # --files_num: Specifies the number of output files to generate
76 | # --store_dir: Defines the directory where the split listings will be stored
77 | # --file_path: Passes the array of WARC file paths to the Python script
78 | # --shuffle: Enables shuffling file paths in the listings
79 | echo $file_path
80 | python3 split_listing.py \
81 | --files_num 5 \
82 | --store_dir ../../listings/re-organizing_data/merge/run-1/11-100/${dump_date#CC-MAIN-} \
83 | --file_path "${all_file_paths[@]}" \
84 | --shuffle True
85 | done
86 |
87 |
88 | for dump_date in "${ALL_DUMPS[@]}"; do
89 | all_file_paths=()
90 | file_path="commoncrawlList/$dump_date/warc.paths.gz"
91 | all_file_paths+=("$file_path")
92 |
93 | # Execute a Python script to process the listed WARC file paths
94 | # --files_num: Specifies the number of output files to generate
95 | # --store_dir: Defines the directory where the split listings will be stored
96 | # --file_path: Passes the array of WARC file paths to the Python script
97 | # --shuffle: Enables shuffling file paths in the listings
98 | echo $file_path
99 | python3 split_listing.py \
100 | --files_num 3 \
101 | --store_dir ../../listings/re-organizing_data/merge/run-1/101-1000/${dump_date#CC-MAIN-} \
102 | --file_path "${all_file_paths[@]}" \
103 | --shuffle True
104 | done
105 |
106 |
107 | for dump_date in "${ALL_DUMPS[@]}"; do
108 | all_file_paths=()
109 | file_path="commoncrawlList/$dump_date/warc.paths.gz"
110 | all_file_paths+=("$file_path")
111 |
112 | # Execute a Python script to process the listed WARC file paths
113 | # --files_num: Specifies the number of output files to generate
114 | # --store_dir: Defines the directory where the split listings will be stored
115 | # --file_path: Passes the array of WARC file paths to the Python script
116 | # --shuffle: Enables shuffling file paths in the listings
117 | echo $file_path
118 | python3 split_listing.py \
119 | --files_num 2 \
120 | --store_dir ../../listings/re-organizing_data/merge/run-1/1001-inf/${dump_date#CC-MAIN-} \
121 | --file_path "${all_file_paths[@]}" \
122 | --shuffle True
123 | done
--------------------------------------------------------------------------------
/web_pipeline/download/process_listings/split_listing_for_re-organizing_data_split.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Initialize an array to store the names of new dumps
4 | declare -a ALL_DUMPS
5 |
6 | # Read each line from the dumplist.txt file and append to the ALL_DUMPS array
7 | # This file contains a list of new dump directories
8 | while IFS= read -r line; do
9 | ALL_DUMPS+=("$line")
10 | done < "commoncrawlList/dumplist.txt"
11 |
12 | all_file_paths=()
13 | for dump_date in "${ALL_DUMPS[@]}"; do
14 | file_path="commoncrawlList/$dump_date/warc.paths.gz"
15 | all_file_paths+=("$file_path")
16 | done
17 |
18 | # Execute a Python script to process the listed WARC file paths
19 | # --files_num: Specifies the number of output files to generate
20 | # --store_dir: Defines the directory where the split listings will be stored
21 | # --file_path: Passes the array of WARC file paths to the Python script
22 | # --shuffle: Enables shuffling file paths in the listings
23 | python3 split_listing.py \
24 | --files_num 20 \
25 | --store_dir ../../listings/re-organizing_data/split/run-1/ \
26 | --file_path "${all_file_paths[@]}" \
27 | --shuffle True
--------------------------------------------------------------------------------
/web_pipeline/download/process_listings/split_listing_for_text_extraction.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Initialize an array to store the names of new dumps
4 | declare -a ALL_DUMPS
5 |
6 | # Read each line from the dumplist.txt file and append to the ALL_DUMPS array
7 | # This file contains a list of new dump directories
8 | while IFS= read -r line; do
9 | ALL_DUMPS+=("$line")
10 | done < "commoncrawlList/dumplist.txt"
11 |
12 | for dump_date in "${ALL_DUMPS[@]}"; do
13 | all_file_paths=()
14 | file_path="commoncrawlList/$dump_date/warc.paths.gz"
15 | all_file_paths+=("$file_path")
16 |
17 | # Execute a Python script to process the listed WARC file paths
18 | # --files_num: Specifies the number of output files to generate
19 | # --store_dir: Defines the directory where the split listings will be stored
20 | # --file_path: Passes the array of WARC file paths to the Python script
21 | # --shuffle: Enables shuffling file paths in the listings
22 | echo $file_path
23 | python3 download/process_listings/split_listing.py \
24 | --files_num 200 \
25 | --store_dir listings/text_extraction/run-1/${dump_date#CC-MAIN-} \
26 | --file_path "${all_file_paths[@]}" \
27 | --shuffle True
28 | done
--------------------------------------------------------------------------------
/web_pipeline/mathml2latex/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Bo Wang
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/web_pipeline/mathml2latex/README.md:
--------------------------------------------------------------------------------
1 | # Convert MathML to Latex
2 |
3 | ## Introduction
4 |
5 | This project provides a Python script to convert MathML into Latex.
6 |
7 | ## Usage
8 |
9 | ```bash
10 | ./mathml2latex.py input.md output.md
11 | ```
12 |
13 | Example `input.md`:
14 |
15 | ```
16 | Gradient
17 |
18 | Let be a scalar field. The gradient is
19 |
20 |
21 | ```
22 |
23 | Example `output.md`:
24 |
25 | ```
26 | Gradient
27 | Let $f:{\mathbb{R}}^{n}\to \mathbb{R}$ be a scalar field. The gradient is
28 | $$\nabla f\left(x\right)=\left[\begin{array}{c}\frac{\mathit{\partial}f}{\mathit{\partial}{x}_{1}}\\ \vdots \\ \frac{\mathit{\partial}f}{\mathit{\partial}{x}_{n}}\end{array}\right]$$
29 | ```
30 |
31 | `output.md` rendered as:
32 |
33 | ![rendered output](output.png)
34 |
35 | ## Background
36 |
37 | I started this little project when attempting to migrate from OneNote to Markdown. I have a large number of math notes with heavy equations, which made my journey much bumpier.
38 |
39 | In OneNote, equations are stored in MathML format, while in Markdown, equations are written in Latex.
40 |
41 | As this may help others in similar situations, I decided to jot down the approaches to convert OneNote to Markdown below.
42 |
43 | There exist at least three ways to do the conversion.
44 |
45 | 1. **OneNote --> Word --> Markdown**
46 | This [method](https://github.com/SjoerdV/ConvertOneNote2MarkDown) appears to be the most popular; I found several similar repos on GitHub.
47 | There are two steps in this approach:
48 | * **Step 1**: Export OneNote documents in Word format, i.e. `.docx`
49 | * This export function is supported in the [standalone version](https://www.onenote.com/download) of OneNote on Windows. I have not found it available on Mac or on the version installed from the Microsoft Store.
50 | * **Step 2**: Convert Word documents to Markdown with [Pandoc](https://pandoc.org/)
51 | * This approach has the advantage of being able to export all OneNote documents with a single PowerShell script.
52 | * However, it is a disaster for my equations. When exporting to `.docx`, all equations are converted into images, which not only breaks the line alignment but also loses the capability of editing equations in the future.
53 |
54 |
55 | 2. **OneNote --> HTML --> Markdown**
56 | This approach is based on the [one2html](https://github.com/msiemens/one2html) project which utilizes the [onenote.rs](https://github.com/msiemens/onenote.rs) parser.
57 |
58 | The converted equations are in HTML format rather than image format. However, the converter does not support MathML, which leaves the converted equations garbled.
59 |
60 |
61 | 3. **OneMark --> Markdown (with MathML) --> Markdown (with Latex)**
62 | [OneMark](http://neux.studio/) is a great plugin that enables writing OneNote with Markdown syntax. It also comes with a handy function to export OneNote into Markdown.
63 | This approach consists of two steps:
64 | * **Step 1**: Export OneNote to Markdown with OneMark
65 | * Since OneMark only has a Windows version, you need to do this on a Windows machine with the standalone version of OneNote.
66 | * One inefficiency here is that OneMark currently only supports exporting one page at a time. Thus it may be laborious if, like me, you have a large number of notes.
67 | * **Step 2**: Convert MathML to Latex in Markdown
68 | * Equations in the Markdown generated by OneMark are in the form of MathML, which is not edit-friendly and cannot be displayed in many Markdown editors.
69 | * To convert MathML to Latex, I wrote a Python script, which resulted in this repo.
70 |
71 | ## Mechanism
72 |
73 | `mathml2latex.py` detects the MathML wrapper blocks generated by OneMark.
74 |
75 | The conversion of a MathML block is conducted in two phases:
76 |
77 | 1. Invoke the [XSLT MathML](http://xsltml.sourceforge.net/) library to transform MathML structures into Latex markup (a programmatic sketch follows this list)
78 |
79 |
80 | 2. Convert UTF characters to Latex markups
81 | * XSLT MathML only converts the math structures to Latex markups while leaving UTF symbols like `π` in the literal form.
82 | * Many Markdown editors fail to recognize these UTF symbols which results in a failure of rendering.
83 | * Thus, `unicode2latex()` utilizes a lookup table to convert these UTF symbols to Latex markups, e.g. `\pi`
84 |
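As a rough illustration, both phases can be invoked programmatically; this minimal sketch assumes `web_pipeline` is on `PYTHONPATH` so the `mathml2latex` package resolves (matching the module's own import style):

```python
from mathml2latex.mathml2latex import convert

with open("input.md", encoding="utf-8") as f:
    text = f.read()
print(convert(text))  # phase 1 (XSLT transform) + phase 2 (unicode lookup)
```
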
85 | ## Related
86 |
87 | * [mathconverter](https://github.com/oerpub/mathconverter/): a nice math converter that inspired this project. Unfortunately, it is not Markdown friendly.
88 |
--------------------------------------------------------------------------------
/web_pipeline/mathml2latex/input.md:
--------------------------------------------------------------------------------
1 | Gradient
2 |
3 | Let be a scalar field. The gradient is
4 |
5 |
6 |
--------------------------------------------------------------------------------
/web_pipeline/mathml2latex/mathml2latex.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | import re
5 | import os
6 | import sys
7 | from lxml import etree
8 | from mathml2latex.unicode_map import unicode_map
9 |
10 | # MathML to LaTeX conversion with XSLT from Vasil Yaroshevich
11 | base_path = os.path.dirname(os.path.realpath(__file__))
12 | xslt_file = os.path.join(base_path, 'mmltex', 'mmltex.xsl')
13 | xslt = etree.parse(xslt_file)
14 | transform = etree.XSLT(xslt)
15 |
16 |
17 | # added by zzwang
18 |
19 | def preprocess_and_parse_xml(xml_content):
20 |     # Replace common HTML entities
21 |     # entity_replacements = {
22 |     #     '&nbsp;': ' ',   # non-breaking space
23 |     #     '&lt;': '<',     # less-than sign
24 |     #     '&gt;': '>',     # greater-than sign
25 |     #     '&amp;': '&',    # ampersand
26 |     #     '&quot;': '"',   # double quote
27 |     #     '&apos;': "'",   # single quote
28 |     # }
29 |
30 |     # for entity, replacement in entity_replacements.items():
31 |     #     xml_content = xml_content.replace(entity, replacement)
32 |
33 |     # # Remove or replace other characters that may cause problems
34 |     # xml_content = re.sub(r'&#x([0-9a-fA-F]+);', lambda m: chr(int(m.group(1), 16)), xml_content)
35 |     # xml_content = re.sub(r'&#([0-9]+);', lambda m: chr(int(m.group(1))), xml_content)
36 |
37 |     # Try to parse the preprocessed content
38 |     try:
39 |         return etree.fromstring(xml_content)
40 |     except etree.XMLSyntaxError as e:
41 |         print(f"Parse error: {e}")
42 |         # If it still fails, fall back to a more lenient parser
43 |         parser = etree.XMLParser(recover=True)
44 |         return etree.fromstring(xml_content, parser)
45 |
46 | def mathml2latex(mathml_block):
47 | # Preprocess to remove aliases
48 | mathml_block = mathml_block.replace('<<', '<<').replace('>>', '>>')
49 | # dom = etree.fromstring(mathml_block)
50 | dom = preprocess_and_parse_xml(mathml_block)
51 | return transform(dom)
52 |
53 | def unicode2latex(latex_block):
54 | latex_text = str(latex_block, 'utf-8').encode('ascii', 'backslashreplace')
55 | for utf_code, latex_code in unicode_map.items():
56 | latex_text = str(latex_text).replace(utf_code, latex_code)
57 | latex_text = latex_text.replace('\\\\', '\\') # "\\" --> "\"
58 | latex_text = re.sub(r'\\textcolor\[rgb\]\{[0-9.,]+\}', '', latex_text) # "\textcolor[rgb]{...}" --> ""
59 | latex_text = latex_text.replace('\\ ~\\ ', '{\\sim}') # " ~ " --> "{\sim}"
60 | latex_text = latex_text[len('b\''):][:-len('\'')] # b'...' --> ...
61 | latex_text = re.sub(r'^\$ ', '$', latex_text) # "$ " --> "$"
62 | latex_text = latex_text.replace('{\\ }', '\\ ') # "{ }" --> " "
63 | latex_text = re.sub(r' \}', '}', latex_text) # " }" --> "}"
64 | latex_text = latex_text.replace('\\n\\[\\n\\t', '$$').replace('\\n\\]', '$$')
65 | return latex_text
66 |
67 | def convert(text):
68 |     mathml_blocks = re.findall(r"<math.*?</math>", text, flags=re.DOTALL)  # assumed <math>...</math> wrapper
69 |     for mathml_block in mathml_blocks:
70 |         latex_block = mathml2latex(mathml_block)
71 |         latex_text = unicode2latex(latex_block)
72 |         text = text.replace(mathml_block, latex_text)
73 |     # Remove multiple consecutive blank lines
74 |     for _ in range(2):
75 |         text = re.sub(r'\n\n', '\n', text)
76 |     return text
77 |
78 | def main():
79 |     with open(sys.argv[1], "r", encoding="utf-8") as input_file:
80 |         text = input_file.read()
81 |     output = convert(text)
82 |     with open(sys.argv[2], "w", encoding="utf-8") as output_file:
83 |         output_file.write(output)
86 |
87 | # if __name__ == "__main__":
88 | # main()
89 |
--------------------------------------------------------------------------------
/web_pipeline/mathml2latex/mmltex/README:
--------------------------------------------------------------------------------
1 | README for the XSLT MathML Library 2.1.2
2 |
3 | XSLT MathML Library is a set of XSLT stylesheets to transform
4 | MathML 2.0 to LaTeX.
5 |
6 | For more information, see
7 | http://www.raleigh.ru/MathML/mmltex/index.php?lang=en
8 |
9 | Manifest
10 | --------
11 |
12 | README this file
13 | mmltex.xsl
14 | tokens.xsl
15 | glayout.xsl
16 | scripts.xsl
17 | tables.xsl
18 | entities.xsl
19 | cmarkup.xsl
20 |
21 | Use
22 | ---
23 |
24 | There are two ways of using the library:
25 |
26 | * Use a local copy of the library.
27 |
28 | 1. Download the distribution (see below).
29 |
30 | 2. Unpack the distribution, using unzip.
31 |
32 | 3. In your stylesheet import or include either the main
33 | stylesheet, mmltex.xsl, or the stylesheet module you
34 | wish to use, such as tokens.xsl. This example assumes
35 | that the distribution has been extracted into the same
36 | directory as your own stylesheet:
37 |
38 |     <xsl:import href="mmltex.xsl"/>
39 |
40 | * Import or include either the main stylesheet, or the
41 | stylesheet module you wish to use, directly from the library
42 | website; http://www.raleigh.ru/MathML/mmltex/. For example:
43 |
44 |     <xsl:import href="http://www.raleigh.ru/MathML/mmltex/mmltex.xsl"/>
45 |
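For reference, a minimal Python sketch of applying the main
stylesheet with lxml (file names are illustrative), mirroring how
this repository's mathml2latex.py drives the transform:

    from lxml import etree

    transform = etree.XSLT(etree.parse("mmltex.xsl"))
    mathml_doc = etree.parse("equation.mml")
    print(str(transform(mathml_doc)))
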
46 | Obtaining The Library
47 | ---------------------
48 |
49 | The XSLT MathML Library is available for download as:
50 |
51 | * Zip file: http://www.raleigh.ru/MathML/mmltex/xsltml_2.1.2.zip
52 |
53 | Copyright
54 | ---------
55 |
56 | Copyright (C) 2001-2003 Vasil Yaroshevich
57 |
58 | Permission is hereby granted, free of charge, to any person
59 | obtaining a copy of this software and associated documentation
60 | files (the ``Software''), to deal in the Software without
61 | restriction, including without limitation the rights to use,
62 | copy, modify, merge, publish, distribute, sublicense, and/or
63 | sell copies of the Software, and to permit persons to whom the
64 | Software is furnished to do so, subject to the following
65 | conditions:
66 |
67 | The above copyright notice and this permission notice shall be
68 | included in all copies or substantial portions of the Software.
69 |
70 | Except as contained in this notice, the names of individuals
71 | credited with contribution to this software shall not be used in
72 | advertising or otherwise to promote the sale, use or other
73 | dealings in this Software without prior written authorization
74 | from the individuals in question.
75 |
76 | Any stylesheet derived from this Software that is publically
77 | distributed will be identified with a different name and the
78 | version strings in any derived Software will be changed so that
79 | no possibility of confusion between the derived package and this
80 | Software will exist.
81 |
82 | Warranty
83 | --------
84 |
85 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
86 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
87 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
88 | NONINFRINGEMENT. IN NO EVENT SHALL NORMAN WALSH OR ANY OTHER
89 | CONTRIBUTOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
90 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
91 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
92 | OTHER DEALINGS IN THE SOFTWARE.
93 |
94 | Contacting the Author
95 | ---------------------
96 |
97 | These stylesheets are maintained by Vasil Yaroshevich.
98 |
--------------------------------------------------------------------------------
/web_pipeline/mathml2latex/mmltex/README2:
--------------------------------------------------------------------------------
1 | This file is not part of the original source code.
2 |
3 | Researched links to archived web page and Sourceforge project:
4 |
5 | https://sourceforge.net/projects/xsltml/files/xsltml/
6 |
7 | https://web.archive.org/web/20160109063934/http://www.raleigh.ru/MathML/mmltex/index.php
8 |
9 | Google Translated to English:
10 | https://translate.google.com/translate?sl=ru&tl=en&u=https%3A%2F%2Fweb.archive.org%2Fweb%2F20160114170851%2Fhttp%3A%2F%2Fwww.raleigh.ru%2FMathML%2Fmmltex%2Findex.php
11 |
--------------------------------------------------------------------------------
/web_pipeline/mathml2latex/mmltex/glayout.xsl:
--------------------------------------------------------------------------------
[Source garbled during extraction: the XSLT markup was stripped, leaving only the literal LaTeX text nodes. The recoverable content is the set of LaTeX constructs this stylesheet emits for MathML general-layout elements: \genfrac{}{}{..ex}{}{..} and \frac{..}{..} (with \hfill for cell alignment) for fractions, \raisebox{1ex}{$..$}\!\left/ \!\raisebox{-1ex}{$..$}\right. for bevelled fractions, \sqrt[..]{..} for roots (with a \text{exception 25:} fallback), \sqrt{..} for square roots, \left( .. , .. \right) with \left./\right. and backslash-escaped delimiters for fenced groups, \phantom{..} for phantoms, \overline{..\hspace{.2em}|}, \sqrt{..} and \overline{)..} for enclosures, and {\displaystyle ..}, \textstyle, \scriptstyle, \scriptscriptstyle, \colorbox[rgb]{..}{$..$} and \textcolor[rgb]{..}{..} for style settings.]
--------------------------------------------------------------------------------
/web_pipeline/mathml2latex/mmltex/mmltex.xsl:
--------------------------------------------------------------------------------
[Source garbled during extraction: the XSLT markup was stripped, leaving only the literal LaTeX text nodes. The recoverable content is the top-level output convention of the library's entry stylesheet: inline math is wrapped as $..$ and display math as \[ .. \].]
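
A minimal sketch of that convention (grounded only in the $, \[ and \] literals that survive above), assuming an input MathML expression that renders as x squared:

    inline:  $x^{2}$
    display: \[
        x^{2}
    \]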
--------------------------------------------------------------------------------
/web_pipeline/mathml2latex/mmltex/scripts.xsl:
--------------------------------------------------------------------------------
[Source garbled during extraction: the XSLT markup was stripped, leaving only the literal LaTeX text nodes. The recoverable content is the set of LaTeX constructs emitted for MathML script and limit elements: over-constructs \overline{..}, \overbrace{..}, \overleftarrow{..}, \overrightarrow{..}, \overleftrightarrow{..}; under-constructs \underline{..}, \underbrace{..}, \underleftarrow{..}, \underrightarrow{..}, \underleftrightarrow{..}; combined under/over limits via \underset{..}{\overset{..}{..}} and _{..}^{..}; accents \tilde{..}, \check{..}, \dot{..}, \ddot{..} and \widehat{..}/\hat{..}; \stackrel{..}{..} and \underset{..}{..}; and script forms {..}_{..}^{..}, {..}^{..}, {..}_{..}, plus prescript forms {}_{..} and {}^{..}.]
--------------------------------------------------------------------------------
/web_pipeline/mathml2latex/mmltex/tables.xsl:
--------------------------------------------------------------------------------
[Source garbled during extraction: the XSLT markup was stripped, leaving only the literal LaTeX text nodes. The recoverable content is the LaTeX emitted for MathML tables: \begin{array}{..} .. \end{array} with optional | column rules, \hline and \\ \hline for row rules, & between cells, \\ between rows, \multicolumn{..}{c}{..} for spanning cells, and \hfill for per-cell alignment.]
--------------------------------------------------------------------------------
/web_pipeline/mathml2latex/mmltex/tokens.xsl:
--------------------------------------------------------------------------------
[Source garbled during extraction: the XSLT markup was stripped, leaving only the literal LaTeX text nodes. The recoverable content is the LaTeX emitted for MathML token elements: \mathrm{..} for identifiers and numbers, \text{..} for text runs, ''..'' for string literals, \phantom{\rule[-..]{0ex..}{0ex..}} for spacing, \left/\right for stretchy delimiters, \textcolor{red}{..} for error highlighting, and \colorbox[rgb]{..}{$..$}/\textcolor[rgb]{..}{..} for background and foreground color. mathvariant values map to \mathrm, \mathbf, \mathit, \mathbb, \mathfrak, \mathcal, \mathsf and \mathtt, with warnings such as "The value bold-italic for mathvariant is not supported" for the unsupported bold/italic/sans-serif combinations and "Error at mathvariant attribute" otherwise. A color template maps the 16 HTML named colors to RGB triples (0,1,1; 0,0,0; 0,0,1; 1,0,1; .5,.5,.5; 0,.5,0; 0,1,0; .5,0,0; 0,0,.5; .5,.5,0; .5,0,.5; 1,0,0; .75,.75,.75; 0,.5,.5; 1,1,1; 1,1,0), reporting "Exception at color template" for unknown names, and a Hex2Decimal template converts hexadecimal color components, reporting "Exception at Hex2Decimal template" on invalid input.]
--------------------------------------------------------------------------------
/web_pipeline/mathml2latex/output.md:
--------------------------------------------------------------------------------
1 | Gradient
2 | Let $f:{\mathbb{R}}^{n}\to \mathbb{R}$ be a scalar field. The gradient is
3 | $$\nabla f\left(x\right)=\left[\begin{array}{c}\frac{\mathit{\partial}f}{\mathit{\partial}{x}_{1}}\\ \vdots \\ \frac{\mathit{\partial}f}{\mathit{\partial}{x}_{n}}\end{array}\right]$$
4 |
--------------------------------------------------------------------------------
/web_pipeline/mathml2latex/output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LLM360/MegaMath/52471d65673c784868629e2165470cd05f6c8975/web_pipeline/mathml2latex/output.png
--------------------------------------------------------------------------------
/web_pipeline/requirements.txt:
--------------------------------------------------------------------------------
1 | resiliparse
2 | datatrove
3 | fasttext
4 | nltk
5 | tqdm
6 | bs4
7 | wget
8 | pyahocorasick
9 | fasteners
10 | tldextract
--------------------------------------------------------------------------------
/web_pipeline/url_filtering/url_filter.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | from typing import Iterable
4 | 
5 | from datatrove.data import Document
6 | from datatrove.pipeline.filters.url_filter import URLFilter
7 | from datatrove.pipeline.writers.disk_base import DiskWriter
8 | 
9 | 
10 | # Root of the URL-filtering assets (blocklist/whitelist/curated lists)
11 | ASSETS_PATH = "url_filtering"
12 |
13 | normalizer = re.compile(r"[^a-zA-Z0-9]+")
14 |
15 | def normalize(text, replace=""):
16 | return normalizer.sub(replace, text).lower()
17 |
18 | def parse_list(lines, do_normalize=True):
19 |     return {normalize(x) if do_normalize else x.strip() for x in lines if x.strip() and not x.startswith("#")}
20 |
21 | def get_list(abs_path: str, file_name: str, extra: set, do_normalize: bool = True):
22 | with open(os.path.join(abs_path, file_name)) as f:
23 | return parse_list(f, do_normalize).union(extra)
24 |
25 | class CustomURLFilterWithWhitelist(URLFilter):
26 | """
27 | Extends URLFilter to include a whitelist functionality.
28 | URLs from whitelisted domains or exact whitelisted URLs will bypass all other filters.
29 | """
30 | name = "😈Custom Url-filter With Whitelist"
31 | _requires_dependencies = ["tldextract", "fasteners", ("ahocorasick", "pyahocorasick")]
32 |
33 | def __init__(
34 | self,
35 | use_whitelist: bool = True,
36 | whitelist_domains: Iterable = None,
37 | whitelist_urls: Iterable = None,
38 | do_remove_curated_sources: bool = False,
39 | curated_domains: Iterable = None,
40 | do_load_from_cache: bool = True,
41 | do_add_extra_domain_and_urls: bool = False,
42 | exclusion_writer: DiskWriter = None,
43 | *args,
44 | **kwargs
45 | ):
46 | if do_add_extra_domain_and_urls:
47 | extra_domains, extra_urls = set(), set()
48 | blocklist_dir = os.path.join(ASSETS_PATH, "urls", "blocklist")
49 | for dirname in os.listdir(blocklist_dir):
50 | if not os.path.isdir(os.path.join(blocklist_dir, dirname)):
51 | continue
52 |                 extra_domains = get_list(os.path.join(blocklist_dir, dirname), "domains", extra_domains, do_normalize=False)
53 |                 print(f"domain size: {len(extra_domains)}")
54 |                 extra_urls = get_list(os.path.join(blocklist_dir, dirname), "urls", extra_urls, do_normalize=False)
55 |                 print(f"url size: {len(extra_urls)}")
56 |
57 | print(f"Extra domains ({len(extra_domains)}) and urls ({len(extra_urls)})")
58 | super().__init__(
59 | extra_domains = extra_domains,
60 | extra_urls = extra_urls,
61 | exclusion_writer = exclusion_writer
62 | )
63 | print("use extra domains and urls")
64 | else:
65 | super().__init__(
66 | exclusion_writer = exclusion_writer
67 | )
68 | self.whitelist_domains = set(whitelist_domains or [])
69 | self.whitelist_urls = set(whitelist_urls or [])
70 | self.use_whitelist = use_whitelist
71 | self.do_remove_curated_sources = do_remove_curated_sources
72 | self.curated_domains = set(curated_domains or [])
73 |
74 | if do_load_from_cache:
75 | whitelist_dir = os.path.join(ASSETS_PATH, "urls", "whitelist")
76 | self.whitelist_domains = get_list(whitelist_dir, "domains", self.whitelist_domains, do_normalize=False)
77 | self.whitelist_urls = get_list(whitelist_dir, "urls", self.whitelist_urls, do_normalize=False)
78 |
79 | curated_dir = os.path.join(ASSETS_PATH, "urls", "curated")
80 | self.curated_domains = get_list(curated_dir, "domains", self.curated_domains, do_normalize=False)
81 |
82 | if not self.use_whitelist:
83 | self.whitelist_domains = set()
84 | self.whitelist_urls = set()
85 | if not self.do_remove_curated_sources:
86 | self.curated_domains = set()
87 |
88 | def filter(self, document: Document) -> bool | tuple[bool, str]:
89 | self.download_data()
90 | url = document.metadata.get("url")
91 |
92 | assert url, "Document does not have url in its metadata"
93 | url_info = self.tldextractor(url)
94 |
95 | # Check if the URL or its domain is in the whitelist
96 | if url in self.whitelist_urls or url_info.registered_domain in self.whitelist_domains or url_info.fqdn in self.whitelist_domains:
97 | return True
98 |
99 |         if url_info.registered_domain in self.curated_domains or url_info.fqdn in self.curated_domains:
100 |             # Only reachable when do_remove_curated_sources is True, since
101 |             # __init__ empties curated_domains otherwise.
102 |             return False, "curated"
103 |
104 | # If not whitelisted, proceed with the original filtering logic
105 | return super().filter(document)
106 |
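107 | 
108 | # ---------------------------------------------------------------------------
109 | # Usage sketch: one way this filter could sit inside a datatrove pipeline.
110 | # The input folder and task count below are illustrative assumptions, not
111 | # values taken from this repository.
112 | # ---------------------------------------------------------------------------
113 | if __name__ == "__main__":
114 |     from datatrove.executor import LocalPipelineExecutor
115 |     from datatrove.pipeline.readers import JsonlReader
116 | 
117 |     LocalPipelineExecutor(
118 |         pipeline=[
119 |             JsonlReader("input_data/"),  # hypothetical jsonl documents carrying "url" metadata
120 |             CustomURLFilterWithWhitelist(
121 |                 use_whitelist=True,                 # whitelisted domains/URLs bypass all other filters
122 |                 do_remove_curated_sources=False,    # keep documents from curated domains
123 |                 do_load_from_cache=True,            # read lists under url_filtering/urls/
124 |                 do_add_extra_domain_and_urls=True,  # extend datatrove's built-in blocklists
125 |             ),
126 |         ],
127 |         tasks=1,
128 |     ).run()
129 | 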
--------------------------------------------------------------------------------
/web_pipeline/url_filtering/urls/adult/domains:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:d6c71c68acd2f7d28103f4a61614cfe73569060ca776b5bfa1bec5bf2843db62
3 | size 122607801
4 |
--------------------------------------------------------------------------------
/web_pipeline/url_filtering/urls/adult/expressions:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LLM360/MegaMath/52471d65673c784868629e2165470cd05f6c8975/web_pipeline/url_filtering/urls/adult/expressions
--------------------------------------------------------------------------------
/web_pipeline/url_filtering/urls/adult/usage:
--------------------------------------------------------------------------------
1 | black
2 | adult
3 | porn
4 |
--------------------------------------------------------------------------------
/web_pipeline/url_filtering/urls/blocklist/adult.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LLM360/MegaMath/52471d65673c784868629e2165470cd05f6c8975/web_pipeline/url_filtering/urls/blocklist/adult.tar.gz
--------------------------------------------------------------------------------
/web_pipeline/url_filtering/urls/blocklist/adult/domains:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:bbc3b59a265a9bda95b601d28f0a1a5524eff6276f70cdc23dd054c5a0bc1a9d
3 | size 122806347
4 |
--------------------------------------------------------------------------------
/web_pipeline/url_filtering/urls/blocklist/adult/expressions:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LLM360/MegaMath/52471d65673c784868629e2165470cd05f6c8975/web_pipeline/url_filtering/urls/blocklist/adult/expressions
--------------------------------------------------------------------------------
/web_pipeline/url_filtering/urls/blocklist/adult/usage:
--------------------------------------------------------------------------------
1 | black
2 | adult
3 | porn
4 |
--------------------------------------------------------------------------------
/web_pipeline/url_filtering/urls/blocklist/agressif.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LLM360/MegaMath/52471d65673c784868629e2165470cd05f6c8975/web_pipeline/url_filtering/urls/blocklist/agressif.tar.gz
--------------------------------------------------------------------------------
/web_pipeline/url_filtering/urls/blocklist/agressif/domains:
--------------------------------------------------------------------------------
1 | 118.123.4.224
2 | 128.121.249.189
3 | 14words.com
4 | 163.177.220.59
5 | 183.61.166.187
6 | 192.67.198.4
7 | 192.67.198.49
8 | 193.96.188.143
9 | 195.4.52.48
10 | 195.63.211.202
11 | 199.93.70.2
12 | 203.2.124.18
13 | 204.181.176.53
14 | 204.50.24.185
15 | 205.160.14.21
16 | 205.160.14.22
17 | 205.167.142.107
18 | 205.167.142.6
19 | 205.241.44.90
20 | 206.113.230.2
21 | 206.160.0.11
22 | 206.160.0.248
23 | 206.160.0.252
24 | 206.168.114.50
25 | 206.168.114.52
26 | 206.244.69.51
27 | 206.31.204.150
28 | 207.201.162.40
29 | 207.231.72.88
30 | 207.36.45.220
31 | 207.70.7.168
32 | 207.71.8.68
33 | 208.185.127.162
34 | 208.185.127.163
35 | 208.48.246.80
36 | 208.55.206.181
37 | 209.103.172.199
38 | 209.123.16.9
39 | 209.126.159.27
40 | 209.15.74.4
41 | 209.15.84.106
42 | 209.161.0.32
43 | 209.189.198.102
44 | 209.195.130.178
45 | 209.196.188.172
46 | 209.197.123.166
47 | 209.204.200.140
48 | 209.204.217.45
49 | 209.240.128.4
50 | 209.250.128.7
51 | 209.35.194.183
52 | 209.61.200.137
53 | 212.114.150.187
54 | 212.227.118.69
55 | 212.227.174.218
56 | 212.38.173.23
57 | 213.130.63.232
58 | 216.100.98.13
59 | 216.100.98.17
60 | 216.100.98.24
61 | 216.100.99.13
62 | 216.110.132.232
63 | 216.110.143.153
64 | 216.126.73.71
65 | 216.127.68.84
66 | 216.131.71.161
67 | 216.150.67.66
68 | 216.169.106.11
69 | 216.218.248.244
70 | 216.219.253.193
71 | 216.40.195.47
72 | 216.40.213.201
73 | 216.43.175.114
74 | 62.116.137.145
75 | 62.116.137.155
76 | 62.116.138.132
77 | 62.116.138.135
78 | 62.116.138.140
79 | 62.116.138.142
80 | 62.116.140.196
81 | 63.218.152.42
82 | 63.236.214.203
83 | 63.249.227.174
84 | 64.156.139.229
85 | 64.239.80.146
86 | 64.239.87.165
87 | 64.70.225.193
88 | 64.82.99.102
89 | 66.175.2.27
90 | 66.255.14.41
91 | 66.28.60.94
92 | 66.78.36.83
93 | 6killshit.tumblr.com
94 | 81.88.35.41
95 | 81.88.35.42
96 | _video-bagarre.com
97 | aaargh-international.org
98 | aaargh.com.mx
99 | abandonfear.tumblr.com
100 | abbc.com
101 | adelaideinstitute.org
102 | aevasitcomno.tk
103 | aime-et-sers.com
104 | air-photo.com
105 | al-jinan.org
106 | algerie-francaise.org
107 | americandefenseleague.com
108 | americannaziparty.com
109 | americanskinheads.com
110 | amren.com
111 | anp14.com
112 | anu.org
113 | anus.com
114 | aryan-nation.org
115 | aryan-nations.org
116 | aryan88.com
117 | aryannations88.com
118 | aryannationsknightskkk.org
119 | aryanwear.com
120 | aufmarsch.de
121 | auslaenderstopp.net
122 | azelin.files.wordpress.com
123 | bagarres.be
124 | bagarres.com
125 | barnesreview.org
126 | bayouknights.org
127 | bestgore.com
128 | bhsweden.tsx.org
129 | blacksandjews.com
130 | blancheurope.com
131 | blogdemariepauledarchicourt.hautetfort.com
132 | bloodandhonour.com
133 | bloodandhonour.de
134 | bloodshows.com
135 | bnp.net
136 | buchanan.org
137 | buendnis-rechts.de
138 | bulldog88.tsx.org
139 | burks.de
140 | cadaver.org
141 | campaign.davidduke.com
142 | cenobite.com
143 | christiangallery.com
144 | christianseparatist.org
145 | churchfliers.com
146 | civil-liberties.com
147 | codoh.com
148 | codoh.org
149 | cofcc.org
150 | compuserb.com
151 | contrelislam.org
152 | creator.org
153 | crusader.net
154 | daaargh.narod.ru
155 | dailyrotten.com
156 | dakingnv.cn
157 | dakingnv.com
158 | deadhouse.org
159 | deadhouse.xyz
160 | dealer-lejeu.com
161 | deathnet.com
162 | democratie-participative.com
163 | democratie-participative.fr
164 | democratie-participative.net
165 | democratie.participative.com
166 | democratie.participative.fr
167 | democratie.participative.net
168 | democratieparticipative.biz
169 | democratieparticipative.com
170 | democratieparticipative.fun
171 | democratieparticipative.host
172 | democratieparticipative.lol
173 | democratieparticipative.net
174 | democratieparticipative.online
175 | democratieparticipative.org
176 | democratieparticipative.site
177 | democratieparticipative.space
178 | democratieparticipative.website
179 | democrativeparticipative.link
180 | der-fuehrer.org
181 | der-stuermer.org
182 | deutsches-rechtsbuero.de
183 | deutsches-reich.de
184 | deviantsockpuppet.com
185 | dsz-verlag.de
186 | duke.org
187 | ety.com
188 | fa.federation-anarchiste.org
189 | faem.com
190 | fkun.de
191 | flawlesslogic.com
192 | forumpatriote.org
193 | fpp.co.uk
194 | france-avenir.com
195 | franceavenir.free.fr
196 | frank-rennicke.de
197 | freedomsite.org
198 | freikorps.com
199 | freimaurer.org
200 | front-comtois.com
201 | gaelle.hautetfort.com
202 | globalfire.tv
203 | godhatesfags.com
204 | gorecenter.com
205 | goresee.com
206 | gudeian.50megs.com
207 | guderian.ds4a.com
208 | hammerskins.com
209 | hangemhighrecords.com
210 | hanse-records.de
211 | harold-covington.org
212 | heathenfront.org
213 | heimatkunde.tsx.org
214 | heimattreue-jugend.de
215 | hitler.org
216 | hitlerisgod.com
217 | hoffman-info.com
218 | holywar.org
219 | iahushua.com
220 | ihr.org
221 | innerdepravity.com
222 | internationalknights.de
223 | intransigeants.com
224 | jdo.org
225 | jeffsarchive.com
226 | jesus-is-lord.com
227 | jetueunami.com
228 | jewwatch.com
229 | jihadology.net
230 | johnsack.com
231 | jungefreiheit.de
232 | k-k-k.com
233 | kamellia.com
234 | kekma.net
235 | killerkomics.com
236 | kingidentity.com
237 | kkk.bz
238 | kkk.com
239 | kkkk.net
240 | kriegsfront.tsx.org
241 | kukluxklan.net
242 | kukluxklan.org
243 | kulmbacher.freeservers.com
244 | le-projet-juif.com
245 | libreopinion.com
246 | louisbeam.com
247 | mankind.org
248 | melvig.org
249 | metapedia.org
250 | micetrap.net
251 | midgaard.org
252 | milgear.fi
253 | missiontoisrael.org
254 | mnsf.info
255 | modelguns-worldwide.com
256 | musicalterrorists.com
257 | mysticknights.org
258 | n-a-f.com
259 | naawp.com
260 | natall.com
261 | natvan.com
262 | nazi-lauck-nsdapao.com
263 | nazi.org
264 | newgrounds.com
265 | neworderknights.com
266 | nit.de
267 | nizkor.org
268 | nocturnevideoculte.com
269 | noontidepress.com
270 | nordfront.de.cx
271 | nordland.net
272 | nordzeit.de
273 | normanfinkelstein.com
274 | npd.net
275 | nsbm.org
276 | nseuropa.org
277 | nsm88.com
278 | nswpp.org
279 | nukeisrael.com
280 | oeuvre-francaise.com
281 | oeuvrefrancaise.com
282 | oikrach.com
283 | ostara.org
284 | ourhero.com
285 | paaargh.blogspot.com
286 | panzerfaust.com
287 | pathcom.com
288 | patriot.dk
289 | pornhulknews.com
290 | posse-comitatus.org
291 | propatria.org
292 | queinsania.com
293 | racnet.tsx.org
294 | radioislam.net
295 | radioislam.org
296 | rahowa.com
297 | rahowa.us
298 | resist.com
299 | resistance.com
300 | revilo-oliver.com
301 | revisionism.com
302 | revisionists.com
303 | rotten.com
304 | rudolf-hess.org
305 | sanctioned-suicide.net
306 | sanctioned-suicide.org
307 | school-fights.com
308 | scripturesforamerica.org
309 | seegore.com
310 | seek-info.com
311 | siegener-baerensturm.de
312 | signal-online.de
313 | sigrdrifa.com
314 | site88.8m.com
315 | skinheadxx.tsx.org
316 | sogore.com
317 | sos-racaille.org
318 | sosfrance.com
319 | splcenter.org
320 | spotlight.org
321 | ssenterprises.com
322 | ssman.com
323 | stormfront.org
324 | thiazi.net
325 | thinkmasa.org
326 | thulenet.com
327 | thulepublications.com
328 | tightrope.cc
329 | trashercorpse.free.fr
330 | tt-v.de
331 | ukar.org
332 | ungraindesable.the-savoisien.com
333 | unitedskins.com
334 | unitedstrike.com
335 | vanguardnewsnetwork.com
336 | vho.org
337 | volkermord.com
338 | volksgemeinschaft.org
339 | wakeupordie.com
340 | wckkkk.com
341 | wcotc.com
342 | webresistant.over-blog.com
343 | whemporium.com
344 | whitehonor.com
345 | whitepower.com
346 | whitepride.com
347 | whitepride.net
348 | whiterace.com
349 | whiteracist.com
350 | whitesingles.com
351 | whiteunitypress.com
352 | widerstand.com
353 | williscarto.com
354 | wno.org
355 | wotansdungeon.tsx.org
356 | wpww.com
357 | x-guns.com
358 | yoderanium.com
359 |
--------------------------------------------------------------------------------
/web_pipeline/url_filtering/urls/blocklist/agressif/expressions:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LLM360/MegaMath/52471d65673c784868629e2165470cd05f6c8975/web_pipeline/url_filtering/urls/blocklist/agressif/expressions
--------------------------------------------------------------------------------
/web_pipeline/url_filtering/urls/blocklist/agressif/urls:
--------------------------------------------------------------------------------
6 | 129.105.212.34/~abutz
7 | 193.195.1.1/natofeur
8 | 204.71.88.20/ygg
9 | 204.71.88.21/ygg
10 | 204.71.88.22/ygg
11 | 204.71.88.23/ygg
12 | 64.15.239.150/~wikinger-versand
13 | adl.org/poisoning_web
14 | anjora.de/nwo
15 | archive.org/details/qatal3
16 | bigfoot.com/~wikinger-versand
17 | come.to/bah
18 | come.to/heilkroiter
19 | come.to/ndj
20 | concentric.net/~nwk
21 | corax.org/revisionism
22 | cri.univ-tlse1.fr/tools/test_filtrage/agressif/
23 | cycad.com/cgi-bin/upstream
24 | ddc.net/ygg
25 | demon.co.uk/natofeur
26 | encyclopediadramatica.se/aborigines
27 | fortunecity.com/boozers/whitehart
28 | geocities.com/allo03714
29 | geocities.com/blaaargh8864
30 | go.to/bloodandhonour
31 | imbris.net/~fourteenwords
32 | members.theglobe.com/klanman1
33 | nidlink.com/~aryanvic
34 | nidlink.com/~fourteenwords
35 | ozemail.com.au/~drumbeat
36 | pubweb.acns.nwu.edu/~abutz
37 | relaypoint.net/~lsf
38 | ruspatriot.com/skinhead
39 | twitter.com/anp14
40 | website.yabz.net/chaaargh
41 | ziplink.net/~bright
43 |
--------------------------------------------------------------------------------
/web_pipeline/url_filtering/urls/blocklist/agressif/usage:
--------------------------------------------------------------------------------
1 | black
2 | aggressive
3 |
--------------------------------------------------------------------------------
/web_pipeline/url_filtering/urls/blocklist/arjel.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LLM360/MegaMath/52471d65673c784868629e2165470cd05f6c8975/web_pipeline/url_filtering/urls/blocklist/arjel.tar.gz
--------------------------------------------------------------------------------
/web_pipeline/url_filtering/urls/blocklist/arjel/domains:
--------------------------------------------------------------------------------
1 | 200poker.fr
2 | 200pour100.fr
3 | 200pour100poker.fr
4 | 200pourcent.fr
5 | 200pourcentpoker.fr
6 | 888.fr
7 | 888poker.fr
8 | acfpoker.fr
9 | barrierepoker.fr
10 | betclic-mobile.fr
11 | betclic.fr
12 | betclick-mobile.fr
13 | betclickmobile.fr
14 | betclicmobile.fr
15 | betnet.fr
16 | bwin.fr
17 | chilipari.fr
18 | chilipoker.fr
19 | coupedumonde-pari.fr
20 | eurosportbet.fr
21 | everestpoker.fr
22 | football-pari.fr
23 | football365.fr
24 | france-pari.fr
25 | friendbet.fr
26 | fulltiltpoker.fr
27 | gamebookers.fr
28 | genybet.fr
29 | intralot.fr
30 | intralotpari.fr
31 | jechope.com
32 | jeux365.fr
33 | joa-club.fr
34 | joa-online.fr
35 | joaclub.fr
36 | joaonline.fr
37 | leturf.fr
38 | luckyjeux.fr
39 | mypok.fr
40 | pacificpoker.fr
41 | parions974.fr
42 | parionsweb.fdj.fr
43 | parionsweb.fr
44 | paris365.fr
45 | partouche.fr
46 | partybets.fr
47 | partypoker.fr
48 | peoplesbet.fr
49 | peoplesnetwork.fr
50 | pkr.fr
51 | placedesparis.fr
52 | pmu.fr
53 | poker365.fr
54 | poker83.fr
55 | pokerstars.fr
56 | pokersubito.fr
57 | pokerxtrem.fr
58 | sajoo.fr
59 | sportnco.fr
60 | titan.fr
61 | titanpartners.fr
62 | tranchant-poker.fr
63 | tranchantpoker.fr
64 | unibet.fr
65 | winamax.fr
66 | winga.fr
67 | wpt.fr
68 | wsop.fr
69 | zeturf.fr
70 |
--------------------------------------------------------------------------------
/web_pipeline/url_filtering/urls/blocklist/arjel/urls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LLM360/MegaMath/52471d65673c784868629e2165470cd05f6c8975/web_pipeline/url_filtering/urls/blocklist/arjel/urls
--------------------------------------------------------------------------------
/web_pipeline/url_filtering/urls/blocklist/arjel/usage:
--------------------------------------------------------------------------------
1 | white
2 | black
3 |
--------------------------------------------------------------------------------
/web_pipeline/url_filtering/urls/blocklist/chat.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LLM360/MegaMath/52471d65673c784868629e2165470cd05f6c8975/web_pipeline/url_filtering/urls/blocklist/chat.tar.gz
--------------------------------------------------------------------------------
/web_pipeline/url_filtering/urls/blocklist/chat/domains:
--------------------------------------------------------------------------------
1 | 12buzz.com
2 | 193.238.160.62
3 | 193.238.162.21
4 | 194.130.106.132
5 | 195.33.103.52
6 | 207.46.110.254
7 | 207.46.110.48
8 | 207.68.178.239
9 | 208.81.191.110
10 | 209.67.215.236
11 | 213.199.154.11
12 | 213.199.154.54
13 | 213.91.8.214
14 | 216.129.112.65
15 | 216.129.112.66
16 | 216.129.112.67
17 | 216.129.112.68
18 | 216.129.112.69
19 | 216.129.112.88
20 | 216.129.126.66
21 | 216.178.160.34
22 | 216.32.66.235
23 | 216.32.67.212
24 | 216.32.68.171
25 | 216.32.84.236
26 | 321chat.com
27 | 47.91.114.71
28 | 47.91.122.46
29 | 64.13.152.67
30 | 64.92.173.122
31 | 69.36.226.107
32 | 69.36.226.108
33 | 69.36.226.109
34 | 69.36.226.134
35 | 69.36.226.135
36 | 69.36.226.141
37 | 69.36.226.142
38 | 69.36.226.143
39 | 69.36.226.144
40 | 69.36.226.145
41 | 69.36.226.146
42 | 69.36.226.147
43 | 69.36.226.148
44 | 69.36.226.149
45 | 69.36.250.11
46 | 69.36.250.12
47 | 69.36.250.13
48 | 69.36.250.14
49 | 69.36.250.15
50 | 69.36.250.16
51 | 69.36.250.17
52 | 69.36.250.18
53 | 69.36.250.19
54 | 69.36.250.20
55 | 69.36.250.21
56 | 69.36.250.22
57 | 69.36.250.23
58 | 69.36.250.24
59 | 69.36.250.25
60 | 69.36.250.26
61 | 69.36.250.27
62 | 69.36.250.28
63 | 69.36.250.29
64 | 69.36.250.30
65 | 69.36.250.31
66 | 69.36.250.32
67 | 69.36.250.33
68 | 69.36.250.35
69 | 69.36.250.36
70 | 69.36.250.37
71 | 69.36.250.68
72 | 69.36.250.9
73 | 72.21.57.84
74 | 72.232.63.35
75 | 85.184.4.4
76 | 8ch.net
77 | aimexpress.oscar.aol.com
78 | airaim.com
79 | ajaxim.org
80 | api.msn.com
81 | assets.msn.com
82 | azarlive.com
83 | aznstar.free.fr
84 | babel.com
85 | bantu.com
86 | batepapo.uol.com.br
87 | bazoocam.org
88 | big-kiss.com
89 | blockedsuks.co.nr
90 | bloochat.com
91 | bonplanchat.com
92 | chaat.fr
93 | chapatiz.com
94 | chat-paradise.com
95 | chat.nrj.fr
96 | chat.org
97 | chat.ru
98 | chat.voila.fr
99 | chateagratis.net
100 | chateandogratis.org
101 | chateaya.org
102 | chatenabled.mail.google.com
103 | chatiw.me
104 | chatroom.conexionplacer.com
105 | chatroulette.com
106 | chatteurs.com
107 | clientless.net
108 | coco.fr
109 | communicationtube.com
110 | communicationtube.net
111 | crisp.chat
112 | discord.com
113 | discordapp.com
114 | e-messenger.net
115 | e-messenget.net
116 | easymessage.net
117 | easymessenger.net
118 | ebuddy.com
119 | emessenger.cl
120 | express.instan-t.com
121 | filter.msn.com
122 | gateway.messenger.live.com
123 | gazzag.com
124 | hopster.com
125 | i3connect.com
126 | ibypass.com
127 | icq.com
128 | iloveim.co.uk
129 | iloveim.com
130 | imaginarlo.com
131 | imhaha.com
132 | imo.im
133 | imtiger.com
134 | imunitive.com
135 | imvu.com
136 | interactiveni.com
137 | inversas.jazztel.es
138 | izuz.net
139 | jeempo.com
140 | jivochat.com
141 | jpager.yahoo.com
142 | jwchat.org
143 | kiwibox.com
144 | kmess.sourceforge.ne
145 | kolikoli.tk
146 | koolim.com
147 | laffer.sourceforge.net
148 | livechat.com
149 | livechatinc.com
150 | livechatinc.net
151 | liveperson.com
152 | loovchat.com
153 | mabber.com
154 | mangeloo.com
155 | mastaline.com
156 | mbm3550nl1n3.siteburg.com
157 | meebo.com
158 | meebo.com.br
159 | meebo.cust.layer42.net
160 | meebome.com
161 | mercury.to
162 | mess.be
163 | messbrasil.cidadeinternet.com.br
164 | messbrasil.com.br
165 | messenger.com
166 | messenger.hotmail.com
167 | messenger.msn.com
168 | messenger.sapo.pt
169 | messenger.services.live.com
170 | messenger.uol.com.br
171 | messenger.yahoo.com
172 | messengerfx.com
173 | messengerfx.com.br
174 | messfreak.be
175 | messplaza.nl
176 | mijnmessenger.nl
177 | mingle.pt
178 | miranda-im.org
179 | msecnd.net
180 | msedge.net
181 | msgweb.nl
182 | msn.audiowatcher.com
183 | msn2go.com
184 | msn2go.com.br
185 | msnanywhere.com
186 | msnfanatic.com
187 | msnger.com
188 | msnskins.be
189 | myoms.net
190 | ninemsn.com
191 | nootmobile.com
192 | ntp.msn.com
193 | omegle.com
194 | onlinemessenger.nl
195 | orkut.com
196 | orkut.com.br
197 | phonefox.com
198 | picachat.com
199 | pidgin.im
200 | piglet-im.com
201 | piglet.0900provider.nl
202 | plugoo.com
203 | polysolve.com
204 | pucinhell.mine.nu
205 | racewarkingdoms.com
206 | radiusim.com
207 | reverse.layeredtech.com
208 | screenname.aol.com
209 | skype.com
210 | skype.net
211 | skypeassets.com
212 | skypeassets.net
213 | smail.fr
214 | snapchat.com
215 | snapchatweb.com
216 | snimmer.com
217 | spidermessenger.com
218 | stopennui.com
219 | sweetim.com
220 | t-messenger.com
221 | talk.google.com
222 | talkgadget.google.com
223 | tchat-enligne.fr
224 | tchatgratuit.eu
225 | tchatteur.com
226 | thevirtualbrowser.com
227 | threema.ch
228 | toc.oscar.aol.com
229 | toperkut.com
230 | trillian.cc
231 | trouter.io
232 | userapi.com
233 | vk.com
234 | vypress.com
235 | wablet.com
236 | wbmsn.net
237 | web2messenger.com
238 | webgama.789mb.com
239 | webmessenger.com
240 | webmessenger.com.br
241 | webmessenger.msn.com
242 | webmessenger.msn.es
243 | webmessenger.yahoo.com
244 | webuzztogether.com
245 | whatsapp.net
246 | x-chat.fr
247 | your-freedom.net
248 |
--------------------------------------------------------------------------------
/web_pipeline/url_filtering/urls/blocklist/chat/urls:
--------------------------------------------------------------------------------
6 | 207.46.5.10/gateway/gateway.dll
7 | 212.19.193.108/servlet/login
8 | 72.36.146.44/servlets/login
9 | chrishemple.co.uk/proxy
10 | cri.univ-tlse1.fr/tools/test_filtrage/chat/
11 | douradina.pr.gov.br/jacare
12 | ec.rdn.it/sms.asp
13 | facebook.com/ajax/chat
14 | fleo.com.ar/chatbox/
15 | freepgs.com/defilter
16 | google.com/talk
17 | jabber.meta.net.nz/webmsg/register.php
18 | leamonde.net/im/index.php
19 | mail.google.com/mail/channel/bind
20 | messenger-online.com/emessenger.php
21 | researchhaven.com/chat.htm
22 | webtal.com.br/imagens/msn.html
24 |
--------------------------------------------------------------------------------
/web_pipeline/url_filtering/urls/blocklist/chat/usage:
--------------------------------------------------------------------------------
1 | black
2 | white
3 |
--------------------------------------------------------------------------------
/web_pipeline/url_filtering/urls/blocklist/dating.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LLM360/MegaMath/52471d65673c784868629e2165470cd05f6c8975/web_pipeline/url_filtering/urls/blocklist/dating.tar.gz
--------------------------------------------------------------------------------
/web_pipeline/url_filtering/urls/blocklist/dating/urls:
--------------------------------------------------------------------------------
6 | a-deux.net/rencontre
7 | askmen.com/dating/
8 | betolerant.fr/rencontre-ados-jeunes-lesbiennes/1.html
9 | cri.univ-tlse1.fr/tools/test_filtrage/dating/
10 | divorceoumonop.a-deux.net/rencontre
11 | forum.ados.fr/love/amour/sites-rencontres-sujet_50847_1.htm
12 | gran-angular.net/categoria/citas/
13 | habitamos.com/list/418/
14 | malianteo.com/foros/f25/
15 | skyrock.com/rencontres
17 |
--------------------------------------------------------------------------------
/web_pipeline/url_filtering/urls/blocklist/dating/usage:
--------------------------------------------------------------------------------
1 | black
2 | dating
3 |
--------------------------------------------------------------------------------
/web_pipeline/url_filtering/urls/blocklist/ddos.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LLM360/MegaMath/52471d65673c784868629e2165470cd05f6c8975/web_pipeline/url_filtering/urls/blocklist/ddos.tar.gz
--------------------------------------------------------------------------------
/web_pipeline/url_filtering/urls/blocklist/ddos/domains:
--------------------------------------------------------------------------------
1 | 4bidden.info
2 | agebooter.com
3 | alphastress.com
4 | anonboot.com
5 | anonsecurityteam.com
6 | anonymous-stresser.com
7 | anonymous-stresser.net
8 | api-stresser.me
9 | apocalypse-solutions.com
10 | arkbooter.fr
11 | assasinsbooter.altervista.org
12 | astrostress.com
13 | atom-stresser.com
14 | atomstress.org
15 | aurastresser.com
16 | avengestresser.com
17 | b-h.us
18 | battle.pw
19 | begayage-stresser.com
20 | bemybooter.eu
21 | best-ddos-tool.ilovecassola.it
22 | beststresser.com
23 | blink-stresser.000webhostapp.com
24 | blunter.xyz
25 | boot-stresser.avpop37.com
26 | boot-stresser.duri46.com
27 | boot.lu
28 | boot.ml
29 | boot4free.com
30 | booter-panel.cnn-02.com
31 | booter-panel.sunlyfc.com
32 | booter-sales.hourb.com
33 | booter-vip.infonhadat.org
34 | booter.club
35 | booter.eu
36 | booter.im
37 | booter.in
38 | booter.is
39 | booter.ninja
40 | booter.org
41 | booter.pw
42 | booter.sx
43 | booter.vip
44 | booter.xyz
45 | bootme.pro
46 | bootr.org
47 | bootyou.net
48 | botstress.com
49 | bullstresser.com
50 | bullstresser.net
51 | bullstresser.to
52 | buybooters.com
53 | buyddos.com
54 | buzzbooter.info
55 | celerystresser.com
56 | celeste-stresser.xyz
57 | chargen.cf
58 | city-stresser.alwaysdata.net
59 | city-stressing.alwaysdata.net
60 | cloud-hosts.tk
61 | cnstresser.com
62 | connectionstresser.com
63 | crazyamp.me
64 | critical-boot.com
65 | cstress.net
66 | cyber-hub.pw
67 | cyber-sst.com
68 | cyberstresser.org
69 | cybervm.io
70 | darkbooter.com
71 | darkstresser.info
72 | darkstresser.net
73 | darkstresser.nl
74 | darlingstress.com
75 | databooter.com
76 | ddgu.ddos-guard.net
77 | ddos-fighter.com
78 | ddos-him.com
79 | ddos-ip.com
80 | ddos-ovh-v6.000webhostapp.com
81 | ddos-stress.eu
82 | ddos-stress.strayrounds.com
83 | ddos.city
84 | ddos.kr
85 | ddos.tools
86 | ddosbooter.link
87 | ddosbreak.com
88 | ddosclub.com
89 | ddoser-online.studyfund.com
90 | ddoser.xyz
91 | ddosforhire.net
92 | ddosit.net
93 | ddosit.us
94 | ddossite.com
95 | ddostheworld.com
96 | deadlyboot.net
97 | defcon.pro
98 | defconpro.net
99 | defianceprotocol.com
100 | dejabooter.com
101 | destressbooter.com
102 | destressnetworks.com
103 | deucalion.us
104 | diamond-stresser.net
105 | diamond-stresser.pw
106 | diebooter.com
107 | diebooter.net
108 | divinestresser.com
109 | dosarrest.com
110 | doshackers.tk
111 | down-stresser.alwaysdata.net
112 | down-stresser.com
113 | down-stresser.us
114 | downed.io
115 | downed.sx
116 | downthem.org
117 | dreamstresser.com
118 | ebolastresser.com
119 | emaizstresser.net
120 | emo-stresser.com
121 | energy-stresser.000webhostapp.com
122 | energy-stresser.alwaysdata.net
123 | equinoxstresser.net
124 | equivalentstresser.net
125 | etwork-stressing.net
126 | every-stresser.000webhostapp.com
127 | every-stresser.com
128 | evil-stress.xyz
129 | evilbooter.net
130 | exercy-stresser.alwaysdata.net
131 | exile-stresser.net
132 | exitus.to
133 | exostress.in
134 | expressdown.com
135 | fagstresser.net
136 | fiberstresser.com
137 | flood.to
138 | foreverinfamous.com
139 | formalitystresser.com
140 | free-boot.to
141 | free-boot.xyz
142 | free-ip-grabber.ilovecassola.it
143 | free-ip-puller.ilovecassola.it
144 | free-ip-stresser.sushinarii.com
145 | free-stresser.authenticbrownsstore.com
146 | free-stresser.ilovecassola.it
147 | free-stresser.sweetaires.com
148 | freeboot.pw
149 | freebooter4.me
150 | freeipstress.com
151 | freeipstresser.net
152 | freestresser.to
153 | freestresser.xyz
154 | freezystresser.nl
155 | getsmack.de
156 | grimbooter.com
157 | hardstresser.com
158 | havoc-security.pw
159 | hazebooter.com
160 | heavystresser.com
161 | hestresser.com
162 | heydos.cc
163 | hornystress.me
164 | howtoddosattack.com
165 | hydrostress.com
166 | hydrostress.net
167 | hyperstresser.com
168 | i-b.co
169 | iddos.net
170 | igbangbooter.com
171 | imsocool.info
172 | inboot.me
173 | infectedstresser.com
174 | infectedstresser.net
175 | instabooter.com
176 | instant-stresser.com
177 | instant-stresser.surverybot.com
178 | instantdown-stresser.alwaysdata.net
179 | instinctproducts.com
180 | invalid.pw
181 | ionbooter.com
182 | ip-booter-me.ilovecassola.it
183 | ip-booter-net.play3nvvip.com
184 | ip-stresser-tor.yonkersbridal.com
185 | ip-stresser-xbox.hdxba.com
186 | ip-stresser.icee-pdrp.com
187 | ip-stresser.pst-2020.com
188 | ipboot.xyz
189 | ipstress.in
190 | ipstresser.co
191 | ipstresser.com
192 | ipstresser.lidamorgenstein.net
193 | ipstresser.pw
194 | ipstresser.wtf
195 | ipstresstest.com
196 | iridiumstresser.net
197 | isitdownyet.com
198 | jitterstresser.com
199 | k-stress.pw
200 | kryptonic.pw
201 | kth-stress.tk
202 | last-day.xyz
203 | layer-4.com
204 | layer-stresser.alwaysdata.net
205 | layer7-security.net
206 | layer7-stresser.com
207 | layer7-stresser.xyz
208 | layer7.pw
209 | legion.cm
210 | legionboot.com
211 | lifetimeboot.com
212 | lightstress.in
213 | lizardstresser.su
214 | logicstresser.net
215 | loic-sourceforge-net.ilovecassola.it
216 | masterboot.net
217 | maxidown.com
218 | mega-stresser.us
219 | mercilesstresser.com
220 | meteor-stresser.com
221 | meteor-stresser.to
222 | minecraftstresser.com
223 | mini-booter.com
224 | moscow-stress.xyz
225 | mystresser.com
226 | mythicalstress.xyz
227 | narcos-stresser.000webhostapp.com
228 | national-stresser.com
229 | national-stresser.net
230 | netbreak.ec
231 | netspoof.com
232 | netspoof.net
233 | netstress.net
234 | netstress.org
235 | network-stresser.alwaysdata.net
236 | network-stressing.net
237 | network.rip
238 | networkstress.com
239 | networkstress.xyz
240 | networkstresser.com
241 | networkstresser.net
242 | neverddos.com
243 | nice-stresser.alwaysdata.net
244 | nightlystresser.ml
245 | nightmarestresser.com
246 | ninjastresser.com
247 | nismitstresser.net
248 | nodestress.tw
249 | nonymousbooter.com
250 | nstress.com
251 | nstresser.net
252 | nuke.pe.hu
253 | obeystresser.com
254 | obliterateproducts.com
255 | olympusstresser.org
256 | omegastresser.com
257 | onestress.com
258 | onestresser.net
259 | onionstresser.com
260 | ooter.io
261 | optimusstresser.com
262 | orcahub.com
263 | orphicsecurityteam.com
264 | ovh-booter.com
265 | ovh-ip-test.ilovecassola.it
266 | ozzy-stresser.000webhostapp.com
267 | paid-booter.operainlove.it
268 | parabooter.com
269 | penis-stresser.000webhostapp.com
270 | phoenixstresser.com
271 | pineapple-stresser.com
272 | pokent.com
273 | power-ddoser.ilovecassola.it
274 | power-ddoser.resoluteshoppingsite.com
275 | power-stress.pw
276 | powerapi.info
277 | powerapiv2.com
278 | powerdos.co.uk
279 | powerstress.com
280 | powerstresser.com
281 | privateroot.fr
282 | psn-ddos.alwaysdata.net
283 | psn-stress.alwaysdata.net
284 | pstresser.com
285 | purestress.net
286 | quantumbooter.net
287 | quantumstress.net
288 | quez.in
289 | quezstresser.com
290 | quezstresser.in
291 | rackstress.pw
292 | ragebooter.com
293 | ragebooter.net
294 | rapidstresser.com
295 | rawlayer.com
296 | rcahub.com
297 | reafstresser.ga
298 | realstresser.com
299 | rebellionstresser.com
300 | relevantstress.com
301 | renegade-products.net
302 | request.rip
303 | respawn.ca
304 | restricted-stresser.info
305 | riotstresser.com
306 | ripstresser.net
307 | routerslap.com
308 | royalbooter.de
309 | ryptonic.pw
310 | securestress.pw
311 | sharkstresser.com
312 | shawty.club
313 | signalstresser.com
314 | silence-stresser.com
315 | silentstress.wtf
316 | skidbooter.info
317 | sleek.to
318 | smack.rip
319 | snowstresser.net
320 | spacejump.xyz
321 | spboot.net
322 | specialistsservers.tk
323 | speed-stresser.com
324 | sst.wtf
325 | stagestresser.com
326 | stormstresser.net
327 | str3ssed.co
328 | str3ssed.me
329 | stress-analysis.ilovecassola.it
330 | stress-me.io
331 | stress-me.net
332 | stress.alwaysdata.net
333 | stressboss.net
334 | stressed.pw
335 | stresser-ip-booter.icon188.asia
336 | stresser-ip-booter.taj999exch.com
337 | stresser-ip-booter.xero.news
338 | stresser.app
339 | stresser.cc
340 | stresser.club
341 | stresser.in
342 | stresser.net
343 | stresser.network
344 | stresser.nstats.pw
345 | stresser.org
346 | stresser.ovh
347 | stresser.ru
348 | stresser.world
349 | stresserit.com
350 | stressit.club
351 | stressthem.to
352 | stressy.org
353 | strong-stresser.000webhostapp.com
354 | strong-stresser.com
355 | strongest-booter.hq7899.com
356 | stuxstresser.com
357 | superstresser.com
358 | supremesecurityteam.com
359 | sylumstresser.com
360 | synstress.net
361 | syrix-stresser.xyz
362 | thebestbooters.com
363 | thunderstresser.me
364 | time-stresser.pw
365 | titaniumbooter.net
366 | titaniumstresser.net
367 | top-10-booters.ilovecassola.it
368 | top-booter.com
369 | topstresser.io
370 | topstressers.com
371 | torsecurityteam.org
372 | tressed.pw
373 | tresser.info
374 | ts3booter.net
375 | ufa-booters-tools.com
376 | umbstresser.net
377 | unknownbooter.co
378 | unknownbooter.com
379 | unseenbooter.com
380 | vbooter.com
381 | vbooter.org
382 | vdos-s.co
383 | vdos-s.com
384 | vdoss.net
385 | vex-stresser.net
386 | vtoxicity.net
387 | wavestresser.wtf
388 | webbooter.com
389 | webstress.cc
390 | webstress.net
391 | webstresser-free.hostthegame.com
392 | webstresser-free.sunnymoring.com
393 | webstresser.biz
394 | webstresser.co
395 | webstresser.com
396 | weeabooter.com
397 | wifilefgrosdp-stresser.000webhostapp.com
398 | wifistruggles.com
399 | wifistruggles.net
400 | wriz-v2-booter.sfbaywhalewatching.com
401 | xblunter.co
402 | xblunter.net
403 | xboot.net
404 | xbox-xuid-booter.ilovecassola.it
405 | xenon-stresser.com
406 | xkovxboot.co
407 | xr8edstresser.com
408 | xrshellbooter.com
409 | xrstresser.net
410 | xstress.xyz
411 | xtreme.cc
412 | xtremebooter.com
413 | xyzbooter.net
414 | yakuzastresser.com
415 | ydrostress.com
416 | youboot.net
417 | z-shadow.co
418 | z7inc.com
419 | zdstresser.net
420 | zeus-net.pw
421 | zodiac-stresser.com
422 |
--------------------------------------------------------------------------------
/web_pipeline/url_filtering/urls/blocklist/ddos/urls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LLM360/MegaMath/52471d65673c784868629e2165470cd05f6c8975/web_pipeline/url_filtering/urls/blocklist/ddos/urls
--------------------------------------------------------------------------------
/web_pipeline/url_filtering/urls/blocklist/ddos/usage:
--------------------------------------------------------------------------------
1 | black
2 | ddos
3 |
--------------------------------------------------------------------------------
/web_pipeline/url_filtering/urls/blocklist/download.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # wget https://dsi.ut-capitole.fr/blacklists/download/adult.tar.gz
4 | # wget https://dsi.ut-capitole.fr/blacklists/download/phishing.tar.gz
5 | # wget https://dsi.ut-capitole.fr/blacklists/download/dating.tar.gz
6 | # wget https://dsi.ut-capitole.fr/blacklists/download/gambling.tar.gz
7 | # wget https://dsi.ut-capitole.fr/blacklists/download/filehosting.tar.gz
8 | # wget https://dsi.ut-capitole.fr/blacklists/download/ddos.tar.gz
9 | # wget https://dsi.ut-capitole.fr/blacklists/download/agressif.tar.gz
10 | # wget https://dsi.ut-capitole.fr/blacklists/download/chat.tar.gz
11 | # wget https://dsi.ut-capitole.fr/blacklists/download/mixed_adult.tar.gz
12 | # wget https://dsi.ut-capitole.fr/blacklists/download/arjel.tar.gz
13 | 
14 | # Iterate over all .tar.gz files in the current directory
15 | for file in *.tar.gz
16 | do
17 |     # Check that the file exists (guards against the case where nothing matches)
18 |     if [ -f "$file" ]; then
19 |         echo "Extracting $file..."
20 | 
21 |         # Get the file name without the .tar.gz extension
22 |         filename="${file%.tar.gz}"
23 | 
24 |         # Create a directory with the same name as the file
25 |         # mkdir -p "$filename"
26 | 
27 |         # Extract the archive (optionally into that new directory with -C)
28 |         tar -xzf "$file"
29 |         # -C "$filename"
30 | 
31 |         echo "Finished extracting $file"
32 |     fi
33 | done
34 | 
35 | echo "All .tar.gz files have been extracted."
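36 | 
37 | # Usage sketch: uncomment the wget lines above to fetch the UT Capitole
38 | # blacklist archives into this directory first, then run `bash download.sh`
39 | # to unpack them in place.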
--------------------------------------------------------------------------------
/web_pipeline/url_filtering/urls/blocklist/filehosting.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LLM360/MegaMath/52471d65673c784868629e2165470cd05f6c8975/web_pipeline/url_filtering/urls/blocklist/filehosting.tar.gz
--------------------------------------------------------------------------------
/web_pipeline/url_filtering/urls/blocklist/filehosting/domains:
--------------------------------------------------------------------------------
1 | 1000go.fr
2 | 11mbit.de
3 | 123-reg.co.uk
4 | 1fichier.co
5 | 1fichier.com
6 | 1und1.de
7 | 2and2.net
8 | 2big.hu
9 | 2img.net
10 | 2shared.com
11 | 30mb.com
12 | 35mb.com
13 | 450mb.com
14 | 4file.net
15 | 4filehosting.com
16 | 4freeimagehost.com
17 | 4megaupload.com
18 | 4shared.com
19 | 4sync.com
20 | 64k.it
21 | 88.191.122.25
22 | 9divx.com
23 | 9down.com
24 | 9giga.sfr.fr
25 | abload.de
26 | abrutis-videos.com
27 | adrive.com
28 | ahstatic.com
29 | airset.com
30 | alamy.com
31 | alamyimages.fr
32 | alfafile.net
33 | alfon.org
34 | alkk.net
35 | alldrives.ge
36 | allomegavideo.com
37 | alloseven.com
38 | allourls.com
39 | alojarfotos.com
40 | anonfile.com
41 | anyhub.net
42 | anzwers.org
43 | arcadeupload.com
44 | arcadya.net
45 | archivosgrandes.com
46 | arribalafoto.com
47 | axifile.com
48 | b4uphotos.com
49 | backupmyfiles.net
50 | badongo.com
51 | bajapics.tk
52 | battleofdragon.com
53 | baycdn.com
54 | bayfiles.com
55 | bayimg.com
56 | bbhistory.info
57 | behance.net
58 | bestsharing.com
59 | bestupload.com
60 | bfgfile.com
61 | bigandfree.com
62 | bigfileupload.com
63 | bigfilez.com
64 | bigpichost.com
65 | bigupload.com
66 | bigvault.com
67 | birnchen.ath.cx
68 | bitshare.com
69 | bitshare.de
70 | blackbeard.ws
71 | bladimix.com
72 | blinkyou.com
73 | blogsimages.skynet.be
74 | bluehost.to
75 | boardplus.org
76 | bonpoo.com
77 | boomp3.com
78 | boomycloud.com
79 | box.com
80 | box.net
81 | briefcase.yahoo.com
82 | browsl.com
83 | bubbleshare.com
84 | busyupload.com
85 | casimages.com
86 | chomikuj.pl
87 | chungo.net
88 | cld.pt
89 | clicknupload.org
90 | clipser.com
91 | clipupload.com
92 | cloudshare.cz
93 | cocoimage.com
94 | cocoshare.cc
95 | coedproxy.info
96 | come2store.com
97 | comparte.pe
98 | content-type.com
99 | coolstreaming.us
100 | cpbild.co
101 | cramit.in
102 | cramitin.net
103 | crazefiles.com
104 | crazysharing.com
105 | cutedrive.com
106 | cyberdan.fr
107 | cybernetic1995.acc.de
108 | dada.net
109 | dailyuploads.net
110 | dardarkom.com
111 | datafilehost.com
112 | datapickup.com
113 | ddl-share.org
114 | deaddrops.com
115 | debrid-link.fr
116 | debridarea.com
117 | debrideurstreaming.com
118 | deeload.com
119 | demo.ovh.com
120 | depositfiles.com
121 | descargar.traducegratis.com
122 | descargasfull.com
123 | desearch.net
124 | digitalimagehosting.com
125 | diinoweb.com
126 | disiami.net
127 | divshare.com
128 | dl-b.free.fr
129 | dl-more.eu
130 | dl.free.fr
131 | dominiord.com
132 | douploads.net
133 | download.anim-area11.net
134 | downloaddelivery.com
135 | downloads.nl
136 | downloads.phpnuke.org
137 | dpstream.pw
138 | dpstream.tv
139 | drivehq.com
140 | driveway.com
141 | drop.io
142 | dropapk.to
143 | dropbox.com
144 | dropboxusercontent.com
145 | dropfiles.net
146 | dropload.com
147 | dumpanimage.com
148 | dumparump.com
149 | dweb.link
150 | easy-share.com
151 | easy-sharee.com
152 | easy-sharing.com
153 | easyupload.io
154 | eatlime.com
155 | eazyshare.net
156 | eazyupload.net
157 | egoshare.com
158 | egydown.com
159 | ei-pictures.com
160 | electronicfiles.net
161 | elephantdrive.com
162 | emstorage.fr
163 | enablebrowser.info
164 | enterupload.com
165 | epload.co
166 | esgens.com
167 | esnips.com
168 | evilshare.com
169 | ex-load.com
170 | exbyte.net
171 | extremeshare.net
172 | ez-files.net
173 | ezyfile.net
174 | facebook-proxy.info
175 | falcon.o-wh.com
176 | fast-debrid.com
177 | fast-load.net
178 | fasterupload.com
179 | fastpic.ru
180 | fastshare.org
181 | favshare.com
182 | faylyukle.com
183 | fboom.me
184 | fdrop.com
185 | fhqhosting.com
186 | fifostream.com
187 | fifostream.net
188 | fifostream.org
189 | fifostream.tv
190 | fifostreaming.com
191 | fifostreaming.net
192 | fifostreaming.org
193 | fifostreaming.tv
194 | file-rack.com
195 | file-up.org
196 | file.thfree.com
197 | file123.com
198 | file2share.biz
199 | file2upload.net
200 | filebase.to
201 | filebin.net
202 | filebuffer.net
203 | fileburst.com
204 | filecabin.com
205 | filecache.de
206 | fileclick.eu
207 | fileclick.net
208 | filecloud.com
209 | fileden.com
210 | filedip.com
211 | filedn.com
212 | filedropper.com
213 | filedwon.info
214 | filefactory.com
215 | filefishing.com
216 | filefront.com
217 | filegone.com
218 | filehd.com
219 | filehigh.com
220 | fileho.com
221 | filehost.ro
222 | filehosting.cc
223 | filehosting.org
224 | filejungle.com
225 | filekicker.com
226 | filelodge.bolt.com
227 | filemashine.com
228 | filenext.com
229 | fileparadox.in
230 | filepost.com
231 | filepost.us
232 | filepub.com
233 | fileqube.com
234 | files-express.com
235 | files-upload.com
236 | files.bz
237 | files.catbox.moe
238 | files.fm
239 | files.mail.ru
240 | files.to
241 | files.ww.com
242 | files6.com
243 | filesanywhere.com
244 | filesavr.com
245 | fileseasy.com
246 | filesend.net
247 | fileserve.com
248 | fileservices.me.com
249 | fileshit.net
250 | fileskip.com
251 | filesmap.com
252 | filesmore.com
253 | filesonic.com
254 | filespoint.com
255 | filespump.com
256 | filestage.net
257 | filestube.com
258 | filesupload.com
259 | filesusr.com
260 | filetransfer.io
261 | fileup.org
262 | fileupyours.com
263 | fileurls.com
264 | filezzz.com
265 | film-2-streaming.com
266 | film-exclue.net
267 | filthimagehost.com
268 | filthpics.com
269 | filthspace.com
270 | flixya.com
271 | flypicture.com
272 | flyupload.com
273 | foreverhide.com
274 | foroxd.com
275 | fotazas.com
276 | fotolog.pl
277 | fotonons.ru
278 | fotop.net
279 | fotosik.pl
280 | fotosupload.com
281 | freakshare.com
282 | freakshare.net
283 | free-hoster.cc
284 | free-transfer.de
285 | free-webhosts.com
286 | freedebrid.fr
287 | freedrive.com
288 | freefileupload.net
289 | freeimagehost.eu
290 | freeimagehosting.net
291 | freeimgshost.com
292 | freepik.com
293 | freeuploader.com
294 | freshlap.com
295 | friendlyfiles.net
296 | friendlyshare.de
297 | fromsmash.co
298 | fromsmash.com
299 | ftpz.us
300 | fullfotos.com.ar
301 | fupload.com
302 | gayimagehost.com
303 | geekimages.com
304 | geralink.in
305 | get-mu.com
306 | getfile.biz
307 | getkeepsafe.com
308 | gigabyteupload.com
309 | gigafilehost.net
310 | gigallery.com
311 | gigapeta.com
312 | gigashare.com
313 | gigasize.com
314 | gigaup.fr
315 | glintfiles.net
316 | glowfoto.com
317 | go.zummm.com
318 | gofilego.com
319 | goldfile.eu
320 | gopro4vn.com
321 | grablinksby.us
322 | grabme.net
323 | greek-fun.com
324 | grosfichiers.ch
325 | grosfichiers.com
326 | guba.com
327 | gximages.com
328 | gypsi.info
329 | harepix.com
330 | hemenpaylas.com
331 | hexupload.net
332 | hiboox.com
333 | hiboox.es
334 | hiboox.fr
335 | hitfile.net
336 | hjfile.cn
337 | hlusoe.info
338 | host-image.com.ar
339 | hostfiles.org
340 | hosting-test.net
341 | hotchyx.com
342 | hotfile.com
343 | hotlinkfiles.com
344 | hotlinkimage.com
345 | hotshare.net
346 | htpicturetrail.com
347 | hugedrive.com
348 | hulkload.com
349 | hulkshare.com
350 | humyo.com
351 | hyperfileshare.com
352 | hyperupload.com
353 | ibb.co
354 | icefile.com
355 | icefile.net
356 | icerbox.com
357 | idivimage.com
358 | idrive.com
359 | ifdnrg.com
360 | ifile.it
361 | ifolder.ru
362 | ifunpix.com
363 | igest.org
364 | iimmgg.com
365 | illhostit.com
366 | image.ohozaa.com
367 | image2host.com
368 | imagearn.com
369 | imagebam.com
370 | imageban.ru
371 | imagebanana.com
372 | imagebmp.com
373 | imagebor.com
374 | imagecabin.com
375 | imagecave.com
376 | imagechicken.com
377 | imagechile.net
378 | imagecloset.com
379 | imagefiasco.com
380 | imagehigh.com
381 | imagehost.es
382 | imagehost.org
383 | imagehosting.com
384 | imagehosting.us
385 | imagehostxp.com
386 | imagemule.com
387 | imagenchile.com
388 | imagengratis.org
389 | imagepremium.com
390 | imageshack.gr
391 | imageshack.us
392 | imageshadow.com
393 | imagesocket.com
394 | imageunload.com
395 | imageupload.se
396 | imageupper.com
397 | imageurlhost.com
398 | imagevenue.com
399 | imageviper.com
400 | imagewaste.com
401 | imagexa.com
402 | imaxenes.com
403 | img-vidiklub.com
404 | img.adoosimg.com
405 | img.godlike.cl
406 | img.nattawat.org
407 | img.tomatone.net
408 | img.xxfx.org
409 | imgarchive.info
410 | imghost.sk
411 | imgkk.com
412 | imgupload.adoosimg.com
413 | imgur.com
414 | imm.io
415 | immagini.p2pforum.it
416 | incredidl.com
417 | infierno-files.com
418 | internetfiles.org
419 | intoupload.net
420 | ipicture.ru
421 | ipswitch.com
422 | isarapix.com
423 | istockphoto.com
424 | iwastemytime.com
425 | jawcloud.co
426 | jeux.com
427 | jigsawshare.com
428 | jpghosting.com
429 | jumbofiles.com
430 | justupit.com
431 | k2s.cc
432 | katfile.com
433 | keepeek.com
434 | keepmyfile.com
435 | keepmyfiles.com
436 | keepmyimages.com
437 | kepfeltoltes.hu
438 | kewlshare.com
439 | krakenfiles.com
440 | ksaupload.com
441 | kytec.com
442 | largeimagehost.com
443 | leechking.com
444 | leetleech.org
445 | letitbit.net
446 | letsupload.com
447 | likeimg.com
448 | limelinx.com
449 | linkcyb.org
450 | linkspoof.com
451 | littlebyte.net
452 | livefilestore.com
453 | llnwd.net
454 | load.to
455 | loaderx.com
456 | looler.com
457 | lulzimg.com
458 | macle.voila.fr
459 | macleusb.net
460 | magicvortex.com
461 | mailbigfile.com
462 | maloxy.info
463 | mannequinat2000.chez-alice.fr
464 | maxishare.net
465 | mediafire.com
466 | mediapix.ru
467 | mega-debrid.eu
468 | mega-debrideur.tk
469 | mega-films.net
470 | mega.co.nz
471 | megadescarga.net
472 | megadl.info
473 | megadownload.net
474 | megafast.info
475 | megaftp.com
476 | megahotserved.com
477 | megaleech.eu
478 | megapid.com
479 | megashare.co.uk
480 | megashare.com
481 | megashares.com
482 | megaup.net
483 | megaupload-premium.com
484 | megaupload.com
485 | megaupload.de
486 | megavideo.com
487 | mesh.com
488 | mexa.sh
489 | mh2img.net
490 | migaload.com
491 | mihd.net
492 | mipony.net
493 | mirorii.com
494 | miroriii.com
495 | mixturecloud.com
496 | mj.am
497 | modovideo.com
498 | mofile.com
499 | momoshare.com
500 | momupload.com
501 | mon-nuage.com
502 | moncloshare.com
503 | monova.org
504 | mooload.com
505 | motionbox.com
506 | movshare.net
507 | mp3y.download
508 | muack.net
509 | mugrab.com
510 | multidl.com
511 | multipics.net
512 | multiply.com
513 | multiup.org
514 | mundo-descargas.com
515 | mundoimg.com
516 | mybloop.com
517 | myfilehut.com
518 | myfileshack.com
519 | myfilestash.com
520 | myfreefilehosting.com
521 | mynox.fr
522 | myotherdrive.com
523 | mypeopledoc.com
524 | mysave.in
525 | mysharebox.com
526 | mysharefile.com
527 | myspacegod.info
528 | myspacepro.info
529 | mytempdir.com
530 | myvideosharing.info
531 | myvirtualdisk.permissionresearch.com
532 | nakido.com
533 | navigator.ed.mu
534 | nbe-media.com
535 | ndfreehost.com
536 | netload.in
537 | netstorage.xosn.com
538 | netu.cam
539 | netu.io
540 | netu.tv
541 | neufgiga.com
542 | newgrounds.com
543 | nexmicrosystems.com
544 | nitroflare.com
545 | notblocked.hu.tl
546 | novamov.com
547 | nowvideo.eu
548 | nukeuploads.com
549 | onfinite.com
550 | oniva.com
551 | onlinedisk.ru
552 | onlinehome.fr
553 | onlinestuffs.com
554 | onwardhost.com
555 | opendrive.com
556 | openfile.ru
557 | openupload.com
558 | orbitfiles.com
559 | orgfree.com
560 | oron.com
561 | oxyshare.com
562 | ozerki.net
563 | paid4share.com
564 | paid4share.net
565 | paintedover.com
566 | partage-fichiers.com
567 | pbase.com
568 | peejeshare.com
569 | peerfactor.fr
570 | perushare.com
571 | photo-host.org
572 | photobucket.com
573 | photofile.es
574 | photofile.ru
575 | photoimagenes.com
576 | photojerk.com
577 | photos.cx
578 | photosamigos.com
579 | photoserver.ws
580 | phyrefile.com
581 | pic4you.ru
582 | picapic.net
583 | picbase.net
584 | picfoco.com
585 | picfor.me
586 | picfront.com
587 | picfront.de
588 | picfront.org
589 | picfu.net
590 | picoload.com
591 | picoodle.com
592 | picscrazy.com
593 | picsec.com
594 | pict.com
595 | picture-hosting.net
596 | picturedumper.com
597 | picturetrail.com
598 | picupl.com
599 | picvalley.net
600 | pimpandhost.com
601 | pixagogo.com
602 | pixali.com
603 | pixdaus.com
604 | pixelup.net
605 | pixhost.com
606 | pixhost.me
607 | pixhost.org
608 | pixhost.ws
609 | pixpond.com
610 | pixshock.net
611 | pixslam.com
612 | pixsy.com
613 | pixxtra.com
614 | plunder.com
615 | pornpicer.com
616 | postimage.org
617 | premify.com
618 | premiumbyleo.co.cc
619 | premiumlinkgens.blogspot.com
620 | profile.myspace.com
621 | przeklej.pl
622 | psychohost.com
623 | pushfile.net
624 | putfile.com
625 | putlocker.com
626 | qfile.de
627 | qshare.com
628 | quickdump.com
629 | quickshareit.com
630 | r25725.ovh.net
631 | r26538.ovh.net
632 | r27369.ovh.net
633 | r28052.ovh.net
634 | radikal.ru
635 | rapid-photo.com
636 | rapid4free.com
637 | rapid4me.com
638 | rapid8.com
639 | rapidechange.com
640 | rapideo.pl
641 | rapidfile.fr
642 | rapidforum.com
643 | rapidgator.net
644 | rapidl.com
645 | rapidmoviez.com
646 | rapidrar.com
647 | rapidshare.com
648 | rapidshare.de
649 | rapidshare.se
650 | rapidsharewarezmegaupload.com
651 | rapidsharing.com
652 | rapidsharings.com
653 | rapidupload.com
654 | rarhost.com
655 | redlist.be
656 | refrozen.com
657 | reliableimage.com
658 | revver.com
659 | rhost.cz
660 | ringo.com
661 | riprapid.net
662 | ripway.com
663 | rockdizfile.com
664 | rockfile.co
665 | sadew.com
666 | safe-access.com
667 | saleno.privateme.info
668 | sandisk.com
669 | savefile.com
670 | scambia.com
671 | school4.uyou.info
672 | send-file.co.uk
673 | sendbox.fr
674 | sendmefile.com
675 | sendover.com
676 | sendspace.com
677 | sendthisfile.com
678 | series-megaupload.com
679 | servimg.com
680 | sex.beohost.com
681 | seyvet.com
682 | share-online.biz
683 | share.am
684 | share.live.com
685 | shareapic.net
686 | sharebase.to
687 | sharebig.com
688 | sharebigfile.com
689 | sharedzilla.com
690 | sharefiles.ru
691 | shareiffic.com
692 | shareimages.com
693 | sharelor.com
694 | sharemods.com
695 | sharenxs.com
696 | sharingmatrix.com
697 | sharingzone.net
698 | sharovar.com
699 | shinyhosting.net
700 | shitore.com
701 | shop2all.biz
702 | sinpremium.net
703 | skodengz.com
704 | skydrive.live.com
705 | slack-files.com
706 | slibe.com
707 | slide.com
708 | slil.ru
709 | slwatch.co
710 | snaggys.com
711 | snap.com
712 | snapdrive.net
713 | sockshare.com
714 | solidfiles.com
715 | speed-downloading.com
716 | speed4up.net
717 | speedshare.org
718 | spideroak.com
719 | spread-it.com
720 | spymac.com
721 | ssl0d.com
722 | stage6.com
723 | steekr.com
724 | stickypix.net
725 | storage.live.com
726 | storage.yandexcloud.net
727 | storagefun.com
728 | storeandserve.com
729 | streamiz-filmze.fr
730 | streamiz.com
731 | streamlare.com
732 | streamload.com
733 | streamupload.com
734 | sube.la
735 | subefotos.com
736 | subeimagenes.com.ar
737 | subelas.com
738 | subetela.com
739 | subir-archivos.com.ar
740 | subirimagen.es
741 | subirimagen.net
742 | subirimagenes.com
743 | subiteya.com
744 | sugarsync.com
745 | superphotospace.com
746 | supload.com
747 | surfban.info
748 | surfblocked.co.cc
749 | swfcabin.com
750 | swiftdesk.com
751 | swoopshare.com
752 | sxc.hu
753 | syncplicity.com
754 | tabulas.com
755 | tagstat.com
756 | takefile.link
757 | tempfile.ru
758 | terabox.telefonica.com.ar
759 | tezfiles.com
760 | thefilebucket.com
761 | thefilehut.com
762 | thefreesite.com
763 | theimagehosting.com
764 | theonlinedatastorage.com
765 | thepictures.us
766 | theupload.com
767 | thumbhoster.com
768 | thumblogger.com
769 | tinydot.co.cc
770 | tinypic.com
771 | tmpfiles.org
772 | topdebrid.com
773 | torrentreactor.net
774 | toutbox.fr
775 | trackerx.com.ar
776 | tradownload.uk
777 | transfer.sh
778 | transferbigfiles.com
779 | turbobit.net
780 | turbobit.ru
781 | turboupload.com
782 | twilight.ws
783 | ucantblockme.info
784 | ugotfile.com
785 | uloz.to
786 | ultimbox.com
787 | ultrashare.de
788 | ultrashare.net
789 | unblock.nevercatch.com
790 | unblockya.com
791 | unibytes.com
792 | universalhoster.net
793 | universitriat.co.cc
794 | up-4ever.org
795 | up-file.com
796 | up.li.ru
797 | up4net.com
798 | updownloadserver.com
799 | updownloadserver.de
800 | upken.jp
801 | uplo4d.com
802 | upload-file.net
803 | upload.ac
804 | upload.digiex.net
805 | upload.dj
806 | upload.sc
807 | upload.seeitworks.com
808 | upload2.net
809 | uploadarmy.com
810 | uploadbox.com
811 | uploadbuzz.cc
812 | uploadchan.org
813 | uploaded.net
814 | uploaded.to
815 | uploadev.org
816 | uploadfile.info
817 | uploadfiles.io
818 | uploadgalaxy.com
819 | uploadgeek.com
820 | uploadhouse.com
821 | uploadhut.com
822 | uploading.com
823 | uploadingit.com
824 | uploadit.biz
825 | uploadjar.com
826 | uploadking.net
827 | uploadmachine.com
828 | uploadocean.com
829 | uploadrack.com
830 | uploadspy.com
831 | uploadstation.com
832 | uploadstore.com
833 | uploadtemple.com
834 | uploadtown.com
835 | uploadwave.com
836 | uploadx.net
837 | uploadyourfiles.de
838 | uploadyourimages.com
839 | uppit.com
840 | upsara.com
841 | upshare.eu
842 | uptobox.com
843 | upvid.co
844 | uqload.com
845 | usaupload.net
846 | useeimage.com
847 | usersdrive.com
848 | usershare.net
849 | userupload.net
850 | ushareit.com
851 | vagoimagen.com
852 | verysong.co.cc
853 | verzend.be
854 | vgroupnetwork.com.ar
855 | vidcloud.co
856 | videobb.com
857 | videotribe.com
858 | videoucrania.com
859 | videoweed.com
860 | videozer.com
861 | vidoza.net
862 | villagephotos.com
863 | vleech.net
864 | vobbo.com
865 | vodpod.com
866 | volafile.io
867 | vosfichiers.com
868 | vpx.pl
869 | vuestrasfotos.com
870 | wantpremium.net
871 | wantrapid.com
872 | warezlinkers.com
873 | way2tube.com
874 | wdfiles.ru
875 | wdupload.com
876 | we.tl
877 | webfile.ru
878 | webfilehost.com
879 | webshots.com
880 | weshare.me
881 | wetransfer.com
882 | wetransfer.net
883 | wi.to
884 | wikiupload.com
885 | wirefiles.com
886 | with2ch.net
887 | wonderfile.net
888 | woo55.com
889 | workupload.com
890 | wtfhost.com
891 | wuala.com
892 | wudeo.com
893 | wupfile.com
894 | wupload.com
895 | wupload.fr
896 | wyslijto.pl
897 | x7.to
898 | xbinary.com
899 | xdrive.com
900 | xinony.com
901 | xmages.net
902 | xooimage.com
903 | xs.to
904 | xshare.us
905 | xtraupload.de
906 | xzshare.com
907 | yabadaba.ru
908 | yatahonga.com
909 | yourfile.net
910 | yourfile.org
911 | yourfilehost.com
912 | yourfilelink.com
913 | yousendit.com
914 | youshareit.com
915 | zalil.ru
916 | zettapremium.com
917 | ziddu.com
918 | zikuka.com
919 | zikuka.ru
920 | zippyshare.com
921 | zizfile.com
922 | znail.com
923 | zonadd.net
924 | zonaimagen.es
925 | zone-videos.net
926 | zoto.com
927 | zotzoo.com
928 | zshare.net
929 | zumodrive.com
930 | zupload.com
931 |
--------------------------------------------------------------------------------
/web_pipeline/url_filtering/urls/blocklist/filehosting/urls:
--------------------------------------------------------------------------------
1 | cri.univ-tlse1.fr/tools/test_filtrage/filehosting/
2 | howcast.com/upload/
3 | me.com/idisk
4 | media.filecabi.net/upload.html
5 | nhjm.net/~pcdthebum/Upload/
6 | stupidvideos.com/upload/
7 | sumo.tv/upload/
8 | voila.fr/macle/
9 |
--------------------------------------------------------------------------------
/web_pipeline/url_filtering/urls/blocklist/filehosting/usage:
--------------------------------------------------------------------------------
1 | black
2 | filehosting
3 | stockage
4 |
--------------------------------------------------------------------------------
/web_pipeline/url_filtering/urls/blocklist/gambling.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LLM360/MegaMath/52471d65673c784868629e2165470cd05f6c8975/web_pipeline/url_filtering/urls/blocklist/gambling.tar.gz
--------------------------------------------------------------------------------
/web_pipeline/url_filtering/urls/blocklist/gambling/urls:
--------------------------------------------------------------------------------
1 | astrolabio.net/casino/
2 | cri.univ-tlse1.fr/tools/test_filtrage/gambling/
3 | sd579.sivit.org/lesgrandscasinos/
4 | thestreetwearmagazine.com/www.thestreetwearmagazine.com/
5 | top-lasvegas.com/en
6 |
--------------------------------------------------------------------------------
/web_pipeline/url_filtering/urls/blocklist/gambling/usage:
--------------------------------------------------------------------------------
1 | black
2 |
--------------------------------------------------------------------------------
/web_pipeline/url_filtering/urls/blocklist/mixed_adult.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LLM360/MegaMath/52471d65673c784868629e2165470cd05f6c8975/web_pipeline/url_filtering/urls/blocklist/mixed_adult.tar.gz
--------------------------------------------------------------------------------
/web_pipeline/url_filtering/urls/blocklist/mixed_adult/domains:
--------------------------------------------------------------------------------
1 | 10putes.com
2 | 2and2.net
3 | 321chat.com
4 | 4chan.org
5 | 4freeimagehost.com
6 | abrutis-videos.com
7 | agq.qc.ca
8 | albums-photo.net
9 | allyoucanupload.webshots.com
10 | archiveofourown.org
11 | artdeseduire.com
12 | artistic-nude-images.com
13 | bayimg.com
14 | bazoocam.org
15 | bdamateur.com
16 | beautyandboost.com
17 | blablagues.net
18 | blaguesenstock.com
19 | blinkyou.com
20 | blogsimages.skynet.be
21 | bookfoto.com
22 | bookspace.fr
23 | buzzmaniac.fr
24 | camroulette.biz
25 | camroulette.co.uk
26 | captice.net
27 | carsandgirls.hu
28 | caught.com
29 | cazzateonline.it
30 | cduniverse.com
31 | chatroulette.com
32 | chatroulettefr.com
33 | chatteroulette.fr
34 | dada.net
35 | daniel-bauer.com
36 | debono.club.fr
37 | deskbeauty.net
38 | deviantart.com
39 | dianeetlesexedesanges.ch
40 | digitalimagehosting.com
41 | dumpanimage.com
42 | dvdrama.com
43 | ecranlarge.com
44 | entrevue.fr
45 | filecloud.com
46 | filehigh.com
47 | filelodge.bolt.com
48 | free-webhosts.com
49 | freegamesforgirls.org
50 | freeimagehosting.net
51 | ftw.generation.no
52 | gael-l.com
53 | galerias.ojodigital.com
54 | geekimages.com
55 | girlicious.free.fr
56 | glennbwellmanphoto.com
57 | glowfoto.com
58 | gougoule.com
59 | guba.com
60 | gwendoline.book.fr
61 | hotlinkimage.com
62 | htpicturetrail.com
63 | humour-blague.com
64 | image2host.com
65 | imagecabin.com
66 | imagecave.com
67 | imagecloset.com
68 | imagefiasco.com
69 | imagehigh.com
70 | imagehosting.com
71 | imagehosting.us
72 | imagemule.com
73 | imageshack.us
74 | imagevenue.com
75 | immagini.p2pforum.it
76 | imvu.com
77 | istockphoto.com
78 | izismile.com
79 | jellyfields.com
80 | jeux.com
81 | kamroulette.com
82 | keepmyfile.com
83 | keepmyfiles.com
84 | keepmyimages.com
85 | le-trombi.com
86 | lebest.fr
87 | lecture-en-ligne.com
88 | leslivresdesarah.canalblog.com
89 | libertatea.ro
90 | loofok.com
91 | mademoiselleagency.fr
92 | mandatory.com
93 | mediafire.com
94 | megaupload.com
95 | metacafe.com
96 | newgrounds.com
97 | ohmybuzz.net
98 | onfinite.com
99 | oniva.com
100 | pbh2.com
101 | photofile.ru
102 | photojerk.com
103 | picphotos.net
104 | picturedumper.com
105 | picturetrail.com
106 | pitchu.fr
107 | pixpond.com
108 | profile.myspace.com
109 | quedesconneries.com
110 | rapidforum.com
111 | rapidshare.com
112 | rapidshare.de
113 | rutube.ru
114 | sauna-lotus.fr
115 | sex.beohost.com
116 | slibe.com
117 | slide.com
118 | spymac.com
119 | t45ol.com
120 | tabulas.com
121 | thechive.com
122 | thefreesite.com
123 | theimagehosting.com
124 | thoughtcatalog.com
125 | thumblogger.com
126 | tinypic.com
127 | tmz.com
128 | torrentmv.com
129 | unitedhumor.com
130 | up-file.com
131 | uploadfile.info
132 | uploadyourimages.com
133 | vice.com
134 | videofilia.com
135 | villagephotos.com
136 | wallpapers-paradise.com
137 | weedoroulette.com
138 | weheartit.com
139 | widouf.com
140 | wizzcam.fr
141 | y-top.com
142 | yatahonga.com
143 | youmadeo.com
144 | zinio.com
145 | zone-videos.net
146 | zoto.com
147 | zshare.net
148 | zvraceny.cz
149 |
--------------------------------------------------------------------------------
/web_pipeline/url_filtering/urls/blocklist/mixed_adult/urls:
--------------------------------------------------------------------------------
1 | chatrandom.com/fr/
2 | cri.univ-tlse1.fr/tools/test_filtrage/mixed_adult/
3 | images.live.com/videos/thumbnail.aspx
4 | montreal.craigslist.ca/search/tlg
5 | montreal.craigslist.ca/tlg
6 | olhares.aeiou.pt/galeriasprivadas/
7 | weedochat.com/chatroulette/
8 |
--------------------------------------------------------------------------------
/web_pipeline/url_filtering/urls/blocklist/mixed_adult/usage:
--------------------------------------------------------------------------------
1 | black
2 |
--------------------------------------------------------------------------------
/web_pipeline/url_filtering/urls/blocklist/phishing.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LLM360/MegaMath/52471d65673c784868629e2165470cd05f6c8975/web_pipeline/url_filtering/urls/blocklist/phishing.tar.gz
--------------------------------------------------------------------------------
/web_pipeline/url_filtering/urls/blocklist/phishing/usage:
--------------------------------------------------------------------------------
1 | # This list is no longer maintained.
2 | # It's only a copy of malware category
3 | black
4 | phishing
5 |
--------------------------------------------------------------------------------
/web_pipeline/url_filtering/urls/curated/domains:
--------------------------------------------------------------------------------
1 | stackexchange.com
2 | ncbi.nlm.nih.gov/pmc
3 | arxiv.org
4 | github.com
5 | storage.courtlistener.com
6 | bulkdata.uspto.gov
7 | pubmed.ncbi.nlm.nih.gov
8 | gutenberg.org
9 | opensubtitles.org
10 | wikipedia.org
11 | irclogs.ubuntu.com
12 | statmt.org
13 | news.ycombinator.com
14 | youtube.com
15 | philpapers.org
--------------------------------------------------------------------------------
/web_pipeline/url_filtering/urls/url_blocklist_refinedweb_manual_inspection.csv:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:4db972247738dd99bedc51488debb705ff954e230c39fa8434ecc1398bcd349b
3 | size 123585639
4 |
--------------------------------------------------------------------------------
/web_pipeline/url_filtering/urls/whitelist/domains:
--------------------------------------------------------------------------------
1 | bust.com
2 | chicagoreader.com
3 | discord.com
4 | jungefreiheit.de
5 | marktplaza.nl
6 | telegra.ph
--------------------------------------------------------------------------------
/web_pipeline/url_filtering/urls/whitelist/urls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LLM360/MegaMath/52471d65673c784868629e2165470cd05f6c8975/web_pipeline/url_filtering/urls/whitelist/urls
--------------------------------------------------------------------------------
/web_pipeline/utils/datatrove_utils.py:
--------------------------------------------------------------------------------
1 | from typing import Callable, Literal
2 |
3 | from datatrove.io import DataFileLike, DataFolderLike
4 | from datatrove.pipeline.readers.base import BaseDiskReader
5 | from datatrove.utils.logging import logger
6 |
7 |
8 | class TxtReader(BaseDiskReader):
9 | """Read data from TXT files.
10 | Will read each line as a separate document.
11 |
12 | Args:
13 | data_folder: a str, tuple or DataFolder object representing a path/filesystem
14 | paths_file: optionally provide a file with one path per line (without the `data_folder` prefix) to read.
15 | compression: the compression to use (default: "infer")
16 | limit: limit the number of documents to read. Useful for debugging
17 | skip: skip the first n rows
18 | file_progress: show progress bar for files
19 | doc_progress: show progress bar for documents
20 | adapter: function to adapt the data dict from the source to a Document.
21 | Takes as input: (self, data: dict, path: str, id_in_file: int | str)
22 | self allows access to self.text_key and self.id_key
23 | Returns: a dict with at least a "text" and "id" keys
24 | text_key: the key containing the text data (default: "text").
25 | id_key: the key containing the id for each sample (default: "id").
26 | default_metadata: a dictionary with any data that should be added to all samples' metadata
27 | recursive: whether to search files recursively. Ignored if paths_file is provided
28 | glob_pattern: pattern that all files must match exactly to be included (relative to data_folder). Ignored if paths_file is provided
29 | shuffle_files: shuffle the files within the returned shard. Mostly used for data viz. purposes, do not use with dedup blocks
30 | """
31 |
32 | name = "🐿 Txt"
33 |
34 | def __init__(
35 | self,
36 | data_folder: DataFolderLike,
37 | paths_file: DataFileLike | None = None,
38 | compression: Literal["infer", "gzip", "zstd"] | None = "infer",
39 | limit: int = -1,
40 | skip: int = 0,
41 | file_progress: bool = False,
42 | doc_progress: bool = False,
43 | adapter: Callable = None,
44 | text_key: str = "text",
45 | id_key: str = "id",
46 | default_metadata: dict = None,
47 | recursive: bool = True,
48 | glob_pattern: str | None = None,
49 | shuffle_files: bool = False,
50 | ):
51 | super().__init__(
52 | data_folder,
53 | paths_file,
54 | limit,
55 | skip,
56 | file_progress,
57 | doc_progress,
58 | adapter,
59 | text_key,
60 | id_key,
61 | default_metadata,
62 | recursive,
63 | glob_pattern,
64 | shuffle_files,
65 | )
66 | self.compression = compression
67 |
68 | def read_file(self, filepath: str):
69 | with self.data_folder.open(filepath, "r", compression=self.compression) as f:
70 | try:
71 | # for txt file, each line is a document
72 | for li, line in enumerate(f):
73 | document = self.get_document_from_dict({"text": line}, filepath, li)
74 | yield document
75 | except UnicodeDecodeError as e:
76 | logger.warning(
77 | f"File `{filepath}` may be corrupted: raised UnicodeDecodeError ({e})"
78 | )
79 |
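80 | if __name__ == "__main__":
81 |     # Minimal usage sketch (not part of the original pipeline): "./data" and the
82 |     # glob pattern below are placeholder values. Reads newline-delimited text
83 |     # shards and prints the first few documents.
84 |     reader = TxtReader(data_folder="./data", glob_pattern="**/*.txt.gz", limit=3)
85 |     for doc in reader.run():
86 |         print(doc.id, doc.text[:80])
87 |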
--------------------------------------------------------------------------------
/web_pipeline/utils/decont_utils/data/aime25.jsonl:
--------------------------------------------------------------------------------
1 | {"id":"I-1","problem":"Find the sum of all integer bases $b > 9$ for which $17_b$ is a divisor of $97_b$.","solution":"","answer":"70","url":""}
2 | {"id":"I-2","problem":"On $\\triangle ABC$, points $A$, $D$, $E$, and $B$ lie in that order on side $\\overline{AB}$ with $AD = 4$, $DE = 16$, and $EB = 8$. Points $A$, $F$, $G$, and $C$ lie in that order on side $\\overline{AC}$ with $AF = 13$, $FG = 52$, and $GC = 26$. Let $M$ be the reflection of $D$ through $F$, and let $N$ be the reflection of $G$ through $E$. Quadrilateral $DEGF$ has area 288. Find the area of heptagon $AFNBCEM$.","solution":"","answer":"588","url":""}
3 | {"id":"I-3","problem":"The 9 members of a baseball team went to an ice-cream parlor after their game. Each player had a singlescoop cone of chocolate, vanilla, or strawberry ice cream. At least one player chose each flavor, and the number of players who chose chocolate was greater than the number of players who chose vanilla, which was greater than the number of players who chose strawberry. Let $N$ be the number of different assignments of flavors to players that meet these conditions. Find the remainder when $N$ is divided by 1000.","solution":"","answer":"16","url":""}
4 | {"id":"I-4","problem":"Find the number of ordered pairs $(x,y)$, where both $x$ and $y$ are integers between -100 and 100, inclusive, such that $12x^2 - xy - 6y^2 = 0$.","solution":"","answer":"117","url":""}
5 | {"id":"I-5","problem":"There are $8! = 40320$ eight-digit positive integers that use each of the digits $1,2,3,4,5,6,7,8$ exactly once. Let $N$ be the number of these integers that are divisible by 22. Find the difference between $N$ and 2025.","solution":"","answer":"279","url":""}
6 | {"id":"I-6","problem":"An isosceles trapezoid has an inscribed circle tangent to each of its four sides. The radius of the circle is 3, and the area of the trapezoid is 72. Let the parallel sides of the trapezoid have lengths $r$ and $s$, with $r \\neq s$. Find $r^2 + s^2$.","solution":"","answer":"504","url":""}
7 | {"id":"I-7","problem":"The twelve letters $A$, $B$, $C$, $D$, $E$, $F$, $G$, $H$, $I$, $J$, $K$, and $L$ are randomly grouped into six pairs of letters. The two letters in each pair are placed next to each other in alphabetical order to form six two-letter words, and then those six words are listed alphabetically. For example, a possible result is $AB,CJ,DG,EK,FL,HI$. The probability that the last word listed contains $G$ is $\\frac{m}{n}$, where $m$ and $n$ are relatively prime positive integers. Find $m + n$.","solution":"","answer":"821","url":""}
8 | {"id":"I-8","problem":"Let $k$ be a real number such that the system\n$|25 + 20i - z| = 5$\n$|z - 4 - k| = |z - 3i - k|$\nhas exactly one complex solution $z$. The sum of all possible values of $k$ can be written as $\\frac{m}{n}$, where $m$ and $n$ are relatively prime positive integers. Find $m + n$. Here $i = \\sqrt{-1}$.","solution":"","answer":"77","url":""}
9 | {"id":"I-9","problem":"The parabola with equation $y = x^2 - 4$ is rotated $60°$ counterclockwise around the origin. The unique point in the fourth quadrant where the original parabola and its image intersect has $y$-coordinate $\\frac{a-\\sqrt{b}}{c}$, where $a$, $b$, and $c$ are positive integers, and $a$ and $c$ are relatively prime. Find $a + b + c$.","solution":"","answer":"62","url":""}
10 | {"id":"I-10","problem":"The 27 cells of a $3 \\times 9$ grid are filled in using the numbers 1 through 9 so that each row contains 9 different numbers, and each of the three $3 \\times 3$ blocks heavily outlined in the example contains 9 different numbers, as in the first three rows of a Sudoku puzzle. The number of different ways to fill such a grid can be written as $p^a \\cdot q^b \\cdot r^c \\cdot s^d$, where $p,q,r,$ and $s$ are distinct prime numbers and $a$, $b$, $c$, and $d$ are positive integers. Find $p \\cdot a + q \\cdot b + r \\cdot c + s \\cdot d$.","solution":"","answer":"81","url":""}
11 | {"id":"I-11","problem":"A piecewise linear periodic function is defined by\n$f(x)=\\begin{cases} x & \\text{if } x \\in [-1,1) \\\\ 2-x & \\text{if } x \\in [1,3) \\end{cases}$\nand $f(x+4)=f(x)$ for all real numbers $x$. The parabola $x=34y^2$ intersects the graph of $f(x)$ at finitely many points. The sum of the $y$-coordinates of these intersection points can be expressed in the form $\\frac{a+b\\sqrt{c}}{d}$, where $a$, $b$, $c$, and $d$ are positive integers, $a$, $b$, and $d$ have greatest common divisor equal to 1, and $c$ is not divisible by the square of any prime. Find $a + b + c + d$.","solution":"","answer":"259","url":""}
12 | {"id":"I-12","problem":"The set of points in 3-dimensional coordinate space that lie in the plane $x + y + z = 75$ whose coordinates satisfy the inequalities\n$x - yz < y - zx < z - xy$\nforms three disjoint convex regions. Exactly one of those regions has finite area. The area of this finite region can be expressed in the form $a\\sqrt{b}$, where $a$ and $b$ are positive integers and $b$ is not divisible by the square of any prime. Find $a + b$.","solution":"","answer":"510","url":""}
13 | {"id":"I-13","problem":"Alex divides a disk into four quadrants with two perpendicular diameters intersecting at the center of the disk. He draws 25 more line segments through the disk, drawing each segment by selecting two points at random on the perimeter of the disk in different quadrants and connecting those two points. Find the expected number of regions into which these 27 line segments divide the disk.","solution":"","answer":"204","url":""}
14 | {"id":"I-14","problem":"Let $ABCDE$ be a convex pentagon with $AB = 14$, $BC = 7$, $CD = 24$, $DE = 13$, $EA = 26$, and $\\angle ZBA = 60°$. For each point $X$ in the plane, define $f(X) = AX + BX + CX + DX + EX$. The least possible value of $f(X)$ can be expressed as $m + n\\sqrt{p}$, where $m$ and $n$ are positive integers and $p$ is not divisible by the square of any prime. Find $m + n + p$.","solution":"","answer":"60","url":""}
15 | {"id":"I-15","problem":"Let $N$ denote the number of ordered triples of positive integers $(a,b,c)$ such that $a,b,c \\leq 3^6$ and $a^3 + b^3 + c^3$ is a multiple of $3^7$. Find the remainder when $N$ is divided by 1000.","solution":"","answer":"735","url":""}
--------------------------------------------------------------------------------
/web_pipeline/utils/decont_utils/data/sat.jsonl:
--------------------------------------------------------------------------------
1 | {"id": "0", "question": "The graph of the polynomial function $f$, where $y=f(x)$, has $x$-intercepts of $(-6,0)$ and $(6,0)$. Which of the following must be true?", "options": "A) $f(-6)=0$ B) $f(6)=-6$ C) $f(-6)=6$ D) $f(0)=-6$", "Answer": "A"}
2 | {"id": "1", "question": "$$\\begin{gathered} y=4 x+6 \\\\-5 x-y=21\\end{gathered}$$ What is the solution $(x, y)$ to the given system of equations?", "options": "A) $(-3,-6)$ B) $\\left(-\\frac{5}{3},-\\frac{2}{3}\\right)$ C) $(3,18)$ D) $(15,66)$", "Answer": "A"}
3 | {"id": "2", "question": "$\\lvert x-10 \\rvert = 0$ What are all the possible solutions to the given equation?", "options": "A) -10 B) 0 C) 10 D) -10 and 10", "Answer": "C"}
4 | {"id": "3", "question": "$$q=s(r-1)^2$$ The given equation relates the positive numbers $q, r$, and $s$. Which equation gives $r$ in terms of $q$ and $s$, when $r>1$?", "options": "A) $r=1+\\sqrt{\\frac{q}{s}}$ B) $r=1+\\frac{\\sqrt{q}}{s}$ C) $r=-1-\\sqrt{\\frac{q}{s}}$ D) $r=-1-\\frac{\\sqrt{q}}{s}$", "Answer": "A"}
5 | {"id": "4", "question": "In the relationship between variables $x$ and $y$, each increase of $1$ in the value of $x$ decreases the value of $y$ by 2. When $x=0$, $y=5$. Which equation represents this relationship?", "options": "A) $y=-\\frac{1}{2}x+5$ B) $y=-\\frac{1}{2}x-5$ C) $y=-2x-5$ D) $y=-2x+5$", "Answer": "D"}
6 | {"id": "5", "question": "An isosceles right triangle has a hypotenuse of length 4 inches. What is the perimeter, in inches, of this triangle?", "options": "A) $2\\sqrt{2}$ B) $4\\sqrt{2}$ C) $4+4\\sqrt{2}$ D) $4+8\\sqrt{2}$", "Answer": "C"}
7 | {"id": "6", "question": "How many solutions does the equation $4(x-2) = -2(x+4)$ have?", "options": "A) Zero B) Exactly one C) Exactly two D) Infinitely many", "Answer": "B"}
8 | {"id": "7", "question": "$R(t) = 1,830 - 790(2.71)^{-.18t}$ The function $R$ gives the predicted average rating, expressed as a number of points, in the German chess federation database for a player based on the number of years, $t$, the player has participated in professional chess tournaments. Which of the following represents the predicted average rating of a player who has just entered their first professional chess tournament?", "options": "A) $R(-0.18)$ B) $R(0)$ C) $R(790)$ D) $R(1,830)$", "Answer": "B"}
9 | {"id": "8", "question": "Alice took 60 minutes to complete a task on her first trial. The time it took Alice to complete the task decreased by 10% of the previous time for each additional trial. Approximately how many minutes will it take Alice to complete the task on her fifth trial?", "options": "A) 50 B) 42 C) 39 D) 35", "Answer": "C"}
10 | {"id": "9", "question": "$$ \\begin{aligned} & y<\\frac{2}{5} x+3 \\\\& y>\\frac{1}{2} x-6\\end{aligned}$$ In which of the following tables are all the values of $x$ and their corresponding values of $y$ solutions to the system of inequalities shown?", "options": "A) \\begin{tabular}{|r|r|} \\hline$x$ & $y$ \\\\\\hline-2 & -8 \\\\\\hline 0 & -4 \\\\\\hline 4 & 4 \\\\\\hline\\end{tabular} B) \\begin{tabular}{|c|c|}\\hline$x$ & $y$ \\\\\\hline-2 & -8 \\\\\\hline 0 & 4 \\\\\\hline 4 & 4 \\\\\\hline\\end{tabular} C) \\begin{tabular}{|r|r|}\\hline$x$ & $y$ \\\\\\hline-2 & 3 \\\\\\hline 0 & 2 \\\\\\hline 4 & -3 \\\\\\hline\\end{tabular} D) \\begin{tabular}{|r|r|}\\hline$x$ & $y$ \\\\\\hline-2 & 2 \\\\\\hline 0 & -3 \\\\\\hline 4 & 3 \\\\\\hline\\end{tabular}", "Answer": "D"}
11 | {"id": "10", "question": "Which of the following is equivalent to $(\\sqrt{32})(\\sqrt[5]{64})$?", "options": "A) $6\\left(\\sqrt[7]{2^5}\\right)$ B) $6\\left(\\sqrt[10]{2^7}\\right)$ C) $8\\left(\\sqrt[7]{2^5}\\right)$ D) $8\\left(\\sqrt[10]{2^7}\\right)$", "Answer": "D"}
12 | {"id": "11", "question": "An object has a mass of 3,300 milligrams. What is the mass of the object in grams? (1 gram = 1,000 milligrams)", "options": "A) 0.33 B) 3.30 C) 33.00 D) 330.00", "Answer": "B"}
13 | {"id": "12", "question": "On average, one square inch of human skin contains 650 sweat glands. A certain area of skin contains 1,170 sweat glands. Based on this information, which of the following is closest to the size of this area, in square inches?", "options": "A) 0.44 B) 0.56 C) 0.80 D) 1.80", "Answer": "D"}
14 | {"id": "13", "question": "The table give the heights, in feet, of 5 peaks in the Rocky Mountains and 5 peaks in the Appalachian Mountains. \\begin{tabular}{|l|l|l|l|l|} \\hline $\\begin{array}{l}\\text { Rocky } \\\\\\text { Mountain } \\\\\\text { Peak }\\end{array}$ & $\\begin{array}{l}\\text { Height } \\\\\\text { (in feet) }\\end{array}$ & $\\begin{array}{l}\\text { Appalachian } \\\\\\text { Mountain } \\\\\\text { Peak }\\end{array}$ & $\\begin{array}{l}\\text { Height } \\\\\\text { (in feet) }\\end{array}$ \\\\\\hline $\\begin{array}{l}\\text { Mount } \\\\\\text { Elbert }\\end{array}$ & 14,439 & $\\begin{array}{l}\\text { Mount } \\\\\\text { Mitchell }\\end{array}$ & 6,684 \\\\\\hline $\\begin{array}{l}\\text { Mount } \\\\\\text { Massive }\\end{array}$ & 14,429 & Mount Craig & 6,647 \\\\\\hline $\\begin{array}{l}\\text { Mount } \\\\\\text { Harvard }\\end{array}$ & 14,419 & $\\begin{array}{l}\\text { Clingman's } \\\\\\text { Dome }\\end{array}$ & 6,643 \\\\\\hline $\\begin{array}{l}\\text { Blanca } \\\\\\text { Peak }\\end{array}$ & 14,350 & $\\begin{array}{l}\\text { Mount } \\\\\\text { Guyot }\\end{array}$ & 6,621 \\\\\\hline $\\begin{array}{l}\\text { La Plata } \\\\\\text { Peak }\\end{array}$ & 14,343 & $\\begin{array}{l}\\text { Balsam } \\\\\\text { Cone }\\end{array}$ & 6,611 \\\\\\hline\\end{tabular} What is the height, in meters, of Blanca Peak? (Use 1 meter $=3.28$ feet)", "options": "A) 437.5 B) 4,375 C) 47,045 D) 47,068", "Answer": "B"}
15 | {"id": "14", "question": "The table give the heights, in feet, of 5 peaks in the Rocky Mountains and 5 peaks in the Appalachian Mountains. \\begin{tabular}{|l|l|l|l|l|} \\hline $\\begin{array}{l}\\text { Rocky } \\\\\\text { Mountain } \\\\\\text { Peak }\\end{array}$ & $\\begin{array}{l}\\text { Height } \\\\\\text { (in feet) }\\end{array}$ & $\\begin{array}{l}\\text { Appalachian } \\\\\\text { Mountain } \\\\\\text { Peak }\\end{array}$ & $\\begin{array}{l}\\text { Height } \\\\\\text { (in feet) }\\end{array}$ \\\\\\hline $\\begin{array}{l}\\text { Mount } \\\\\\text { Elbert }\\end{array}$ & 14,439 & $\\begin{array}{l}\\text { Mount } \\\\\\text { Mitchell }\\end{array}$ & 6,684 \\\\\\hline $\\begin{array}{l}\\text { Mount } \\\\\\text { Massive }\\end{array}$ & 14,429 & Mount Craig & 6,647 \\\\\\hline $\\begin{array}{l}\\text { Mount } \\\\\\text { Harvard }\\end{array}$ & 14,419 & $\\begin{array}{l}\\text { Clingman's } \\\\\\text { Dome }\\end{array}$ & 6,643 \\\\\\hline $\\begin{array}{l}\\text { Blanca } \\\\\\text { Peak }\\end{array}$ & 14,350 & $\\begin{array}{l}\\text { Mount } \\\\\\text { Guyot }\\end{array}$ & 6,621 \\\\\\hline $\\begin{array}{l}\\text { La Plata } \\\\\\text { Peak }\\end{array}$ & 14,343 & $\\begin{array}{l}\\text { Balsam } \\\\\\text { Cone }\\end{array}$ & 6,611 \\\\\\hline\\end{tabular} For the given Appalachian Mountain peaks, the height of the highest peak is approximately what percent greater than the height of the lowest peak?", "options": "A) $1.1 \\%$ B) $9.9 \\%$ C) $73.0 \\%$ D) $101.1 \\%$", "Answer": "A"}
16 | {"id": "15", "question": "Data set $A: 2,4,6,6,8,12$ Data set B: $2,4,6,6,8,12,26$ Two data sets are shown. Which statement best compares the medians of the data sets?", "options": "A) The median of data set A is greater than the median of data set $B$ B) The median of data set A is less than the median of data set B C) The medians of data sets A and B are equal D) There is not enough information to compare the medians", "Answer": "C"}
17 | {"id": "16", "question": "$$0.79 x+1.0 y=100$$ The mass of a solution of isopropanol and water is 100 grams. The given equation represents this situation, where $x$ is the volume of isopropanol, in cubic centimeters, and $y$ is the volume of water, in cubic centimeters. If the volume of isopropanol is 70 cubic centimeters, what is the approximate volume of water, in cubic centimeters?", "options": "A) 45 B) 55 C) 70 D) 79", "Answer": "A"}
18 | {"id": "17", "question": "There are 435 voting members of the US House of Representatives. If $b$ voting members are in favor of a certain bill, which expression represents the percentage of the voting members in favor of the bill?", "options": "A. $100\\left(\\frac{b}{435}\\right)$ B. $100\\left(\\frac{435}{b}\\right)$ C. $435\\left(\\frac{b}{100}\\right)$ D. $435(100 b)$", "Answer": "A"}
19 | {"id": "18", "question": "$$10(x+120)=120$$ Which of the following equations has the same solution as the given equation?", "options": "A) $x+120=12$ B) $x+120=130$ C) $x+12=12$ D) $x+12=120$", "Answer": "A"}
20 | {"id": "19", "question": "The given function $C$ models the annual soybean use in China, in millions of metric tons, between 1995 and 2014, where $x$ is the number of years after 1995. $$C(x)=4.3 x+19$$ According to the model, what is the best interpretation of 4.3 in this context?", "options": "A) Each year between 1995 and 2014, China used 4.3 million metric tons of soybeans B) Each year between 1995 and 2014, China's annual use of soybeans increased by 4.3 million metric tons C) China used 4.3 million metric tons of soybeans in 1995 D) China used a total of 4.3 million metric tons of soybeans between 1995 and 2014", "Answer": "B"}
21 | {"id": "20", "question": "$$ \\begin{gathered} C(x)=50,000+0.75 x \\\\ R(x)=4.75 x \\end{gathered}$$ The given function $C$ models the total cost (sum of fixed cost and variable cost), in dollars, of growing and harvesting $x$ bales of hay on a certain farm. The given function $R$ models the revenue, in dollars, earned from selling $x$ bales of hay. According to the function $R$, how many bales of hay would have to be sold to earn a revenue of $\\$1,425$?", "options": "A) 100 B) 300 C) 500 D) 1,000", "Answer": "B"}
22 | {"id": "21", "question": "$$ \\begin{gathered} C(x)=50,000+0.75 x \\\\ R(x)=4.75 x \\end{gathered}$$ The given function $C$ models the total cost (sum of fixed cost and variable cost), in dollars, of growing and harvesting $x$ bales of hay on a certain farm. The given function $R$ models the revenue, in dollars, earned from selling $x$ bales of hay. Which of the following inequalities models the number of bales of hay that must be sold to earn a profit of $\\$ 10,000$ or more? (profit $=$ revenue - cost)", "options": "A) $10,000 \\leq 4 x-50,000$ B) $10,000 \\geq 4 x-50,000$ C) $10,000 \\leq 4 x+50,000$ D) $10,000 \\geq 4 x+50,000$", "Answer": "A"}
23 | {"id": "22", "question": "Which expression is equivalent to $\\left(x^2+4\\right)^2+(x-2)(x+2) ?$", "options": "A) $x^4+x^2+20$ B) $x^4+5 x^2+16$ C) $x^4+9 x^2$ D) $x^4+9 x^2+12$", "Answer": "D"}
24 | {"id": "23", "question": "$$ \\begin{aligned} & y=4 x+1 \\\\ & y=4 x+3 \\end{aligned}$$ How many solutions does the given system of equations have?", "options": "A) Zero B) Exactly one C) Exactly two D) Infinitely many", "Answer": "A"}
25 | {"id": "24", "question": "$$ h(x)=3 x+3 $$ Which inequality represents all values of $x$ for which the graph of $y=h(x)$ in the $x y$-plane is above the $x$-axis?", "options": "A) $x<3$ B) $x<-1$ C) $x>-1$ D) $x>3$", "Answer": "C"}
26 | {"id": "25", "question": "Which quadratic equation has no real solutions?", "options": "A) $3 x^2-3=0$ B) $3 x^2+3 x=0$ C) $3 x^2+3 x+3=0$ D) $3 x^2-6 x+3=0$", "Answer": "C"}
27 | {"id": "26", "question": "In 1976, there were approximately 1,000 gray wolves in northern Minnesota. The number of gray wolves in northern Minnesota in 2008 was 190% greater than in 1976. Approximately how many gray wolves were in northern Minnesota in 2008?", "options": "A. 1,190 B. 1,900 C. 2,900 D. 19,000", "Answer": "C"}
28 | {"id": "27", "question": "When the quadratic function $f$ is graphed in the $x y$-plane, where $y=f(x)$, its vertex is $(-2,5)$. One of the $x$-intercepts of this graph is $\\left(-\\frac{7}{3}, 0\\right)$. What is the other $x$-intercept of the graph?", "options": "A. $\\left(-\\frac{13}{3}, 0\\right)$ B. $\\left(-\\frac{5}{3}, 0\\right)$ C. $\\left(\\frac{1}{3}, 0\\right)$ D. $\\left(\\frac{7}{3}, 0\\right)$", "Answer": "B"}
29 | {"id": "28", "question": "For an exponential function $g$, the value of $g(x)$ decreases by $20 \\%$ for each 1-unit increase in the value of $x$. If $g(2)=16$, which equation could define $g$ ?", "options": "A) $g(x)=16(0.8)^{x-2}$ B) $g(x)=16(0.8)^{x+2}$ C) $g(x)=16(0.2)^{x-2}$ D) $g(x)=16(0.2)^{x+2}$", "Answer": "A"}
30 | {"id": "29", "question": "Micha and Rana each selected a random sample of students at their school and asked how many soft drink servings each student had consumed the previous week. Micha estimated that the mean number of soft drink servings was 7.1, with an associated margin of error of 1.2. Rana estimated that the mean number of soft drink servings was 8.3, with an associated margin of error of 0.8. Assuming the margins of error were calculated in the same way, which of the following best explains why Rana obtained a smaller margin of error than Micha?", "options": "A. Rana's sample contained more students than Micha's sample contained. B. Rana's sample contained more students who drank soft drinks than Micha's sample contained. C. Rana's sample contained more students who drank exactly seven soft drink servings than Micha's sample contained. D. Rana's sample contained more students who drank exactly eight soft drink servings than Micha's sample contained.", "Answer": "A"}
31 | {"id": "30", "question": "A circle in the $x y$-plane has its center at $(-3,4)$ and the point $(-2,1)$ lies on the circle. Which equation represents this circle?", "options": "A) $(x-3)^2+(y+4)^2=\\sqrt{10}$ B) $(x+3)^2+(y-4)^2=\\sqrt{10}$ C) $(x-3)^2+(y+4)^2=10$ D) $(x+3)^2+(y-4)^2=10$", "Answer": "D"}
32 | {"id": "31", "question": "\\begin{tabular}{|c|c|} \\hline$x$ & $h(x)$ \\\\\\hline 2 & 0 \\\\\\hline 4 & 0 \\\\\\hline 6 & 8 \\\\\\hline \\end{tabular} For the quadratic function $h$, the table gives three values of $x$ and their corresponding values of $h(x)$. At what value of $x$ does $h$ reach its minimum?", "options": "A) -1 B) 0 C) 3 D) 4", "Answer": "C"}
--------------------------------------------------------------------------------
/web_pipeline/utils/decont_utils/datatrove_helper.py:
--------------------------------------------------------------------------------
1 | """
2 | Used for n-gram decontamination.
3 | First build an index using the tasks we want to use to decontaminate our training dataset.
4 | Then read your training data and apply the filter with the index loaded.
5 | """
6 |
7 | import os
8 | from collections import defaultdict
9 | from concurrent.futures import ThreadPoolExecutor
10 | from dataclasses import dataclass, field, replace
11 | from typing import Tuple
12 |
13 | import numpy as np
14 |
15 | from datatrove.data import Document, DocumentsPipeline
16 | from datatrove.io import DataFolderLike, file_exists, get_datafolder, open_file
17 | from datatrove.pipeline.base import PipelineStep
18 | from datatrove.pipeline.filters.base_filter import BaseFilter
19 | from datatrove.pipeline.writers.disk_base import DiskWriter
20 | from datatrove.utils.binaryio import read_np_from_file
21 | from datatrove.utils.hashing import HashConfig, create_hash_func
22 | from datatrove.utils.logging import logger
23 | from datatrove.utils.text import TextNormConfig, ngrams, simplify_text
24 | from datatrove.utils.typeshelper import Languages
25 | from datatrove.utils.word_tokenizers import load_word_tokenizer
26 |
27 | @dataclass
28 | class NGramsDecontConfig:
29 | """
30 | Example for n_grams=4
31 | query = ['A', 'B', 'C', 'D', 'E'] (the prompt/instruction)
32 | label = ['F', 'G', 'H', 'I', 'J'] (the answer/gold)
33 | Will find the following N-GRAMS in the training data:
34 | 'F G H I'
35 | 'G H I J'
36 | + IF find_query_ngrams:
37 | 'A B C D'
38 | 'B C D E'
39 | + IF find_overlap_ngrams:
40 | 'C D E F'
41 | 'D E F G'
42 | 'E F G H'
43 | """
44 |
45 | n_grams: int = 12
46 | find_query_ngrams: bool = False # enable to also check for matches in n-grams containing only the input/prompt
47 |     find_overlap_ngrams: bool = True  # will also find matches for n-grams spanning BOTH the query and the label
48 |
49 |     # for math we do not normalize numbers
50 | norm_config: TextNormConfig = field(default_factory=TextNormConfig)
51 | hash_config: HashConfig = field(default_factory=HashConfig)
52 |
53 |
54 | class NGramsDecontIndexer(PipelineStep):
55 | """
56 | Creates a decontamination index (basically a list of uint64 hashes from ngrams) for each reference task.
57 | Ways to provide task data:
58 | - as input documents from the previous pipeline step with "text=label/correct answer"
59 | and metadata={"query": query/prompt/input, "task": task name}
60 |
61 | #FIXME
62 | @fan, deprecated, we do not use lighteval in megamath
63 | - as a list of strings in the format "suite|task" from the lighteval metadata table:
64 | https://github.com/huggingface/lighteval/blob/main/src/lighteval/tasks/tasks_table.jsonl as `lighteval_tasks`
65 | - a path to a text file containing one such list, with one "suite|task" per line as `lighteval_tasks`
66 | you can also define your custom tasks with `custom_lighteval_tasks`. See explanation for `custom_tasks` here:
67 | https://github.com/huggingface/lighteval/tree/main?tab=readme-ov-file#evaluate-a-model-on-extended-community-or-custom-tasks
68 |
69 | """
70 |
71 | type = "🦠 - DECONT"
72 | name = "💥 N-grams build index"
73 |
74 | def __init__(
75 | self,
76 | output_folder: DataFolderLike,
77 | config: NGramsDecontConfig = None,
78 |         task_dict: dict = None,
79 | language: str = Languages.english,
80 | ):
81 | super().__init__()
82 | self.output_folder = get_datafolder(output_folder)
83 | self.config = config or NGramsDecontConfig()
84 | self.tokenizer = load_word_tokenizer(language)
85 | self.hash_func = create_hash_func(self.config.hash_config)
86 | self.task_dict = task_dict
87 |
88 | def compute_hashes(self, label: str, query: str | None = None) -> list[int]:
89 | # label_tokens = self.tokenizer.word_tokenize(simplify_text(label, self.config.norm_config))
90 | label_tokens = simplify_text(label, self.config.norm_config).lower().split()
91 | # label_tokens = label.lower().split()
92 |
93 | ngrams_to_compute = list(ngrams(label_tokens, self.config.n_grams))
94 | if query is not None:
95 | # query_tokens = self.tokenizer.word_tokenize(simplify_text(query, self.config.norm_config))
96 | query_tokens = simplify_text(query, self.config.norm_config).lower().split()
97 | # query_tokens = query.lower().split()
98 | if self.config.find_query_ngrams:
99 | ngrams_to_compute.extend(ngrams(query_tokens, self.config.n_grams))
100 | if self.config.find_overlap_ngrams:
101 | # add tokens overlapping query and label
102 | """
103 | A, B, C, D, E | F, G, H, I, J
104 | 5 grams
105 | B, C, D, E, F (-N + 1 + i:) + (:i + 1)
106 | ...
107 | E, F, G, H, I
108 | """
109 | ngrams_to_compute.extend(
110 | [
111 | query_tokens[-self.config.n_grams + 1 + i :] + label_tokens[: i + 1]
112 | for i in range(self.config.n_grams - 1)
113 | # make sure we actually get a list of size N
114 | if len(query_tokens) >= self.config.n_grams - 1 - i and len(label_tokens) >= i + 1
115 | ]
116 | )
117 | return list(map(self.hash_func, map(" ".join, ngrams_to_compute)))
118 |
119 | def run(self, data: DocumentsPipeline = None, rank: int = 0, world_size: int = 1):
120 | if world_size != 1:
121 | raise ValueError("Decontamination index building requires a single worker.")
122 | hashes = defaultdict(set)
123 |         # use whatever data is passed in, with the following format:
124 |         # doc.text -> label
125 |         # doc.metadata["query"] -> query/prompt
126 | if data:
127 | for doc in data:
128 | if not self.config.find_query_ngrams and "query" not in doc.metadata:
129 | raise ValueError(
130 |                         "find_query_ngrams is False but could not find 'query' field in documents metadata"
131 | )
132 | hashes[doc.metadata.get("task", "input")].update(
133 | self.compute_hashes(doc.text, doc.metadata.get("query", None))
134 | )
135 |
136 | for task_name, task in self.task_dict.items():
137 | for eval_doc in task:
138 | try:
139 | golds = eval_doc["label"] if isinstance(eval_doc["label"], list) else [eval_doc["label"]]
140 | query = eval_doc["query"]
141 | except Exception as e:
142 | logger.warning(f"Error while fetching doc data: {e}")
143 | continue
144 | for gold in golds:
145 | hashes[task_name].update(self.compute_hashes(gold, query))
146 |
147 | for task_name, task_hashes in hashes.items():
148 | hashes_array = np.array(list(task_hashes), dtype=self.config.hash_config.np_descr)
149 | logger.info(f"Saving {len(task_hashes)} hashes for {task_name}")
150 | with self.output_folder.open(f"{task_name.replace(' ', '_')}.index.hashes", mode="wb") as f:
151 | if self.output_folder.is_local():
152 | hashes_array.tofile(f)
153 | else:
154 | f.write(hashes_array.tobytes())
155 |
156 |
157 | class NGramsDecontFilter(BaseFilter):
158 | """
159 | Loads list of hashes created by the Indexer step.
160 | For each document in the block's input, we will check if any of its ngrams are part of the reference eval tasks.
161 | If so, they will be removed. The contaminated ngram and task where it was found will be saved in the removed
162 | document's metadata.
163 | """
164 |
165 | type = "🦠 - DECONT"
166 | name = "💥 N-grams decontaminate"
167 |
168 | def __init__(
169 | self,
170 | index_folder: DataFolderLike,
171 | config: NGramsDecontConfig = None,
172 | exclusion_writer: DiskWriter = None,
173 | language: str = Languages.english,
174 | ):
175 | super().__init__()
176 | self.index_folder = get_datafolder(index_folder)
177 | self.config = config or NGramsDecontConfig()
178 | self.exclusion_writer = exclusion_writer
179 | self.language = language
180 | self._index_hashes = None
181 | self.hash_func = create_hash_func(self.config.hash_config)
182 | self.tokenizer = load_word_tokenizer(language)
183 |
184 | def load_index_hashes(self):
185 | def load_index_from_file(file):
186 | with self.index_folder.open(file, mode="rb") as f:
187 | return file, read_np_from_file(
188 | f, np.dtype(self.config.hash_config.np_descr), self.index_folder.is_local()
189 | ).tolist()
190 |
191 | with ThreadPoolExecutor() as pool:
192 | hashes = pool.map(load_index_from_file, self.index_folder.list_files(glob_pattern="**/*.index.hashes"))
193 |
194 | self._index_hashes = {}
195 | for filename, hashlist in hashes:
196 | taskname = filename.removesuffix(".index.hashes")
197 | logger.info(f"Loading {len(hashlist)} hashes for {taskname}")
198 | for hash in hashlist:
199 | self._index_hashes[hash] = taskname
200 |
201 | def filter(self, doc: Document) -> bool | Tuple[bool, str]:
202 | if self._index_hashes is None:
203 | self.load_index_hashes()
204 |
205 |         text_tokens = simplify_text(doc.text, self.config.norm_config).lower().split()
206 |         ngrams_to_compute = list(ngrams(text_tokens, self.config.n_grams))
207 |         # copy the norm config so the shared self.config.norm_config is not mutated;
208 |         # punctuation removal is only wanted for the sanity check inside the loop
209 |         punc_check_config = replace(self.config.norm_config, remove_punctuation=True)
210 |         for n_gram in map(" ".join, ngrams_to_compute):
211 |             task = self._index_hashes.get(self.hash_func(n_gram), None)
212 |
213 |             # skip n-grams that are empty once punctuation is removed: a
214 |             # punctuation-only match is spurious, but the remaining n-grams
215 |             # still need to be checked instead of keeping the document early
216 |             if simplify_text(n_gram, punc_check_config) == "":
217 |                 continue
218 |
219 |             if task is not None:
220 |                 doc.metadata["contaminated_ngram"] = n_gram
221 |                 doc.metadata["contaminated_task"] = task
222 |                 self.stat_update(f"contaminated_{task}")
223 |                 if ":" in task:
224 |                     self.stat_update(f"contaminated_tg_{task[:task.index(':')]}")
225 |                 return False, "contaminated"
226 |         return True
227 |
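228 | if __name__ == "__main__":
229 |     # Minimal usage sketch (not part of the original pipeline): build the hash
230 |     # index from the MegaMath eval tasks, then instantiate the filter.
231 |     # "./decont_index" is a placeholder path; importing downstream_datasets this
232 |     # way assumes the web_pipeline working directory (its data paths are relative).
233 |     from utils.decont_utils.downstream_datasets import TASK_DATASETS
234 |
235 |     indexer = NGramsDecontIndexer(output_folder="./decont_index", task_dict=TASK_DATASETS)
236 |     indexer.run()  # index building must run with world_size == 1
237 |     decont_filter = NGramsDecontFilter(index_folder="./decont_index")
238 |     # in a real run, the filter is placed after a reader step in a datatrove pipeline
239 |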
--------------------------------------------------------------------------------
/web_pipeline/utils/decont_utils/downstream_datasets.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | with open("./utils/decont_utils/data/asdiv.jsonl", "r") as f:
4 | asdiv_raw = [json.loads(line) for line in f]
5 | asdiv_tasks = [
6 | {
7 | "query": f"{item['body']}",
8 | "label": f"{item['answer']}",
9 | }
10 | for item in asdiv_raw
11 | ]
12 |
13 | with open("./utils/decont_utils/data/gsm8k.jsonl", "r") as f:
14 | gsm8k_raw = [json.loads(line) for line in f]
15 | gsm8k_tasks = [
16 | {
17 | "query": f"{item['question']}",
18 | "label": f"{item['cot']} {item['answer']}",
19 | }
20 | for item in gsm8k_raw
21 | ]
22 |
23 | with open("./utils/decont_utils/data/math.jsonl", "r") as f:
24 | math_raw = [json.loads(line) for line in f]
25 | math_tasks = [
26 | {
27 | "query": f"{item['problem']}",
28 | "label": f"{item['solution']}",
29 | }
30 | for item in math_raw
31 | ]
32 |
33 |
34 | with open("./utils/decont_utils/data/mathqa.jsonl", "r") as f:
35 | mathqa_raw = [json.loads(line) for line in f]
36 | mathqa_tasks = [
37 | {
38 | "query": f"{item['problem']} {item['options']}",
39 | "label": f"{item['rationale']}"[1:-1], # remove quotes
40 | }
41 | for item in mathqa_raw
42 | ]
43 |
44 | with open("./utils/decont_utils/data/mawps.jsonl", "r") as f:
45 | mawps_raw = [json.loads(line) for line in f]
46 | mawps_tasks = [
47 | {
48 | "query": f"{item['input']}",
49 | "label": f"{item['target']}",
50 | }
51 | for item in mawps_raw
52 | ]
53 |
54 | with open("./utils/decont_utils/data/mmlu_stem.jsonl", "r") as f:
55 | mmlu_stem_raw = [json.loads(line) for line in f]
56 | mmlu_stem_tasks = [
57 | {
58 | "query": f"{item['question']}",
59 | "label": "A: "
60 | + str(item["options"][0])
61 | + " B: "
62 | + str(item["options"][1])
63 | + " C: "
64 | + str(item["options"][2])
65 | + " D: "
66 | + str(item["options"][3])
67 | + " Answer: "
68 | + str(item["answer"]),
69 | }
70 | for item in mmlu_stem_raw
71 | ]
72 |
73 | with open("./utils/decont_utils/data/ocw.jsonl", "r") as f:
74 | ocw_raw = [json.loads(line) for line in f]
75 | ocw_tasks = [
76 | {
77 | "query": f"{item['problem']}",
78 | "label": f"{item['solution']} {item['answer']}",
79 | }
80 | for item in ocw_raw
81 | ]
82 |
83 | with open("./utils/decont_utils/data/sat.jsonl", "r") as f:
84 | sat_raw = [json.loads(line) for line in f]
85 | sat_tasks = [
86 | {
87 | "query": f"{item['question']}",
88 | "label": f"{item['options']} {item['Answer']}",
89 | }
90 | for item in sat_raw
91 | ]
92 |
93 | with open("./utils/decont_utils/data/svamp.jsonl", "r") as f:
94 | svamp_raw = [json.loads(line) for line in f]
95 | svamp_tasks = [
96 | {
97 | "query": f"{item['Body']} {item['Question']}",
98 | "label": f"{item['Answer']}",
99 | }
100 | for item in svamp_raw
101 | ]
102 |
103 | with open("./utils/decont_utils/data/aime24.jsonl", "r") as f:
104 | aime24_raw = [json.loads(line) for line in f]
105 | aime24_tasks = [
106 | {
107 | "query": f"{item['problem']}",
108 | "label": f"{item['solution']}",
109 | }
110 | for item in aime24_raw
111 | ]
112 |
113 | with open("./utils/decont_utils/data/aime25.jsonl", "r") as f:
114 | aime25_raw = [json.loads(line) for line in f]
115 | aime25_tasks = [
116 | {
117 | "query": f"{item['problem']}",
118 | "label": f"{item['answer']}",
119 | }
120 | for item in aime25_raw
121 | ]
122 |
123 | with open("./utils/decont_utils/data/amc.jsonl", "r") as f:
124 | amc_raw = [json.loads(line) for line in f]
125 | amc_tasks = [
126 | {
127 | "query": f"{item['problem']}",
128 | "label": f"{item['answer']}",
129 | }
130 | for item in amc_raw
131 | ]
132 |
133 | TASK_DATASETS = {
134 | "asdiv": asdiv_tasks,
135 | "gsm8k": gsm8k_tasks,
136 | "math": math_tasks,
137 | "mathqa": mathqa_tasks,
138 | "mawps": mawps_tasks,
139 | "mmlu_stem": mmlu_stem_tasks,
140 | "ocw": ocw_tasks,
141 | "sat": sat_tasks,
142 | "svamp": svamp_tasks,
143 | "aime24": aime24_tasks,
144 | "aime25": aime25_tasks,
145 | "amc": amc_tasks,
146 | }
147 |
148 |
149 | if __name__ == "__main__":
150 | for key, value in TASK_DATASETS.items():
151 | print(key, len(value))
152 | print(f">>>[Query] {value[0]['query']}")
153 | print(f">>>[Label] {value[0]['label']}")
154 | print("-" * 10 + "\n")
155 |
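156 | # Note: the loaders above run at import time with paths relative to the
157 | # working directory. Field names are dataset-specific; a hypothetical
158 | # gsm8k.jsonl record, with fields inferred from the loader above, would be:
159 | #   {"question": "...", "cot": "...", "answer": "..."}
160 | # Every benchmark is normalized into the shared {"query", "label"} schema
161 | # consumed when building the decontamination n-gram index.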
--------------------------------------------------------------------------------
/web_pipeline/utils/file_utils.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import gzip
3 | import json
4 | import os
5 | from dataclasses import asdict
6 |
7 | import requests
8 |
9 |
10 | def download_from_cc(
11 | object_key, bucket_name="commoncrawl", local_root_path="./crawl-data/"
12 | ):
13 | local_file = local_root_path + object_key
14 | local_file_path = os.path.dirname(local_file)
15 | print(f"Downloading {object_key} to {local_file}")
16 | if not os.path.exists(local_file_path):
17 | os.makedirs(local_file_path, exist_ok=True)
18 | try:
19 |         # download the file from the public Common Crawl HTTP endpoint
20 |         PREFIX = "https://data.commoncrawl.org/"
21 |         url = PREFIX + object_key
22 |         # stream the response to disk in chunks so large crawl files are
23 |         # not buffered entirely in memory
24 |         response = requests.get(url, stream=True)
25 |         response.raise_for_status()
26 |         with open(local_file, "wb") as file:
27 |             for chunk in response.iter_content(chunk_size=1 << 20):
28 |                 file.write(chunk)
29 |         print(f"Successfully downloaded {object_key} to {local_file}")
30 |         return local_file
31 | except Exception as e:
32 | print(f"An error occurred: {str(e)}")
33 | return None
34 |
35 |
36 | def write_to_jsonlgz(data, output_file):
37 | print(f"Writing {len(data)} documents into {output_file} ...")
38 | with gzip.open(output_file, "at", encoding="utf-8") as gz_file:
39 | gz_file.write("\n".join(json.dumps(item) for item in data) + "\n")
40 |
41 |
42 | def delete_local_files(to_delete_files):
43 | for file_to_remove in to_delete_files:
44 | try:
45 | # Attempt to remove the file
46 | os.remove(file_to_remove)
47 | print(f"File '{file_to_remove}' has been successfully removed.")
48 | except FileNotFoundError:
49 | print(f"File '{file_to_remove}' not found.")
50 | except Exception as e:
51 | print(f"An error occurred: {str(e)}")
52 |
53 |
54 | def make_dir(file_name):
55 | file_path = os.path.dirname(file_name)
56 | print(f"Making directory: {file_path}")
57 | if not os.path.exists(file_path):
58 | os.makedirs(file_path, exist_ok=True)
59 |
60 |
61 | def remove_file(file_name):
62 | if os.path.isfile(file_name):
63 | os.remove(file_name)
64 |         print(f"Removing half-processed file: {file_name}")
65 |
66 |
67 | def write_stat(stat_file, statistics, input_file, FIELD_NAMES):
68 |     make_dir(stat_file)
69 |     print(f"Writing {str(input_file)} into {stat_file}")
70 |     # write the header row only when the stats file is first created
71 |     write_header = not os.path.exists(stat_file)
72 |     with open(stat_file, mode="a", newline="") as file:
73 |         writer = csv.DictWriter(file, fieldnames=FIELD_NAMES)
74 |         if write_header:
75 |             writer.writeheader()
76 | 
77 |         # Write the statistics dataclass as a single CSV row
78 |         writer.writerow(asdict(statistics))
79 | 
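80 | # Minimal usage sketch (the object key here is hypothetical; real WARC keys
81 | # come from the Common Crawl path listings handled elsewhere in the pipeline):
82 | #   local = download_from_cc("crawl-data/CC-MAIN-2024-10/segments/example.warc.gz")
83 | #   if local is not None:
84 | #       write_to_jsonlgz([{"text": "..."}], "./output/example.jsonl.gz")
85 | #       delete_local_files([local])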
--------------------------------------------------------------------------------
/web_pipeline/utils/latex_parsing.py:
--------------------------------------------------------------------------------
1 | import re
2 | import sys
3 | import time
4 |
5 | from resiliparse.extract.html2text import extract_plain_text
6 | from resiliparse.parse.html import HTMLTree, traverse_dom
7 | from resiliparse.process_guard import (ExecutionTimeout, InterruptType,
8 | MemoryLimitExceeded, mem_guard,
9 | time_guard)
10 |
11 | from mathml2latex.mathml2latex import mathml2latex, unicode2latex
12 |
13 |
14 | def improve_latex_content_parsing(html_doc):
15 | tree = HTMLTree.parse(html_doc)
16 |
17 | def remove_math_styles(latex_text):
18 | if "display" not in latex_text and "textstyle" not in latex_text:
19 | return latex_text
20 |
21 | pattern = r"\$\{?(\\(?:display|text)style)\s*(.+?)\}?\$"
22 |
23 | def replace_func(match):
24 | content = match.group(2)
25 | content = re.sub(r"^\{(.+)\}$", r"\1", content)
26 | return f"${content}$"
27 |
28 | cleaned_text = re.sub(pattern, replace_func, latex_text)
29 | return cleaned_text
30 |
31 | def clean_latex(latex_text):
32 | latex_text = latex_text.strip()
33 | if latex_text.startswith("{\\displaystyle"):
34 | latex_text = latex_text.replace("{\\displaystyle", "")
35 | if latex_text.endswith("}"):
36 | latex_text = latex_text[:-1]
37 | if latex_text.strip() == "":
38 | return ""
39 | return f"${latex_text.strip()}$"
40 |
41 | def process_math_element(math_elem):
42 | if math_elem.getattr("class") == "katex-mathml":
43 | print("skip katex mathml")
44 |             return  # skip the KaTeX HTML/CSS rendering part
45 |
46 | latex = extract_latex_with_timeout(math_elem)
47 |         if latex is None:
48 | return
49 |
50 | new_span = tree.create_element("span")
51 | new_span["class"] = "math-text"
52 | new_span.text = latex.strip()
53 | parent = math_elem.parent
54 | if parent:
55 | parent.replace_child(new_span, math_elem)
56 |
57 | def clean_mathml(mathml_block):
58 | if "oldsymbol{" in mathml_block and "boldsymbol{" not in mathml_block:
59 | mathml_block = mathml_block.replace("oldsymbol", "\\boldsymbol")
60 | mathml_block = re.sub(r"<\?xml[^>]+\?>\s*", "", mathml_block)
61 | if 'xmlns="http://www.w3.org/1998/Math/MathML"' not in mathml_block:
62 | mathml_block = mathml_block.replace(
63 | "