├── .gitignore ├── LICENSE ├── README.md ├── extract_from_cc ├── configs │ └── randomized_all.yaml ├── extract_from_warc.py ├── spark_extract_dataset.py ├── spark_session_builder.py ├── text_normalizer.py └── warcs │ └── shard_0.txt ├── filtering └── filter_dataset.py ├── imgs ├── OpenWebMath-left.png ├── openwebmath_logo.png └── pipeline.png └── text_extraction ├── setup.py └── text_extract ├── __init__.py ├── banned_selectors.txt ├── boilerplate_words.txt ├── extract.py ├── latex_processing.py ├── line_processing.py ├── mmltex ├── README ├── cmarkup.xsl ├── entities.xsl ├── glayout.xsl ├── mmltex.xsl ├── scripts.xsl ├── tables.xsl └── tokens.xsl ├── tree_processing.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | 162 | # Mac 163 | .DS_Store -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 
11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # OpenWebMath: An Open Dataset of High-Quality Mathematical Web Text 2 | 3 | [Keiran Paster](https://keirp.com)\*, [Marco Dos Santos](https://marco-dossantos.github.io/)\*, [Zhangir Azerbayev](https://zhangir-azerbayev.github.io/), [Jimmy Ba](https://jimmylba.github.io/) 4 | 5 | [🤗 Download OpenWebMath ](https://huggingface.co/datasets/open-web-math/open-web-math) | [ArXiv](https://arxiv.org/abs/2310.06786) 6 | | [PDF](https://arxiv.org/pdf/2310.06786.pdf) 7 | 8 | # About OpenWebMath 9 | 10 | **OpenWebMath** is a dataset containing the majority of the high-quality, mathematical text from the internet. It is filtered and extracted from over 200B HTML files on Common Crawl down to a set of **6.3 million documents** containing a total of **14.7B tokens**. OpenWebMath is intended for use in _pretraining_ and _finetuning_ large language models. 
11 | 12 | You can download the dataset using Hugging Face: 13 | 14 | ```python 15 | from datasets import load_dataset 16 | ds = load_dataset("open-web-math/open-web-math") 17 | ``` 18 | 19 | # Code Structure 20 | 21 | We provide code in this repository to reproduce our processing pipeline. The code is organized into three separate folders: 22 | 23 | 1. `text_extraction` contains the code for extracting text and LaTeX from HTML documents. 24 | 2. `extract_from_cc` contains the code for extracting the dataset from Common Crawl, including prefiltering, language identification, MathScore filtering, and perplexity filtering. 25 | 3. `filtering` includes many of the manual filtering steps, including blacklisted domains. 26 | 27 | In order to run the `extract_from_cc` code, you either need to run it in Apache Spark or manually run `extract_from_warc.py` by passing in a WARC file as an argument. 28 | 29 | For deduplication, please use the [text-dedup](https://github.com/ChenghaoMou/text-dedup) library. 30 | 31 | Finally, for filtering, `filter_dataset.py` contains the code to load a Hugging Face dataset and filter it based on our heuristics. 32 | 33 | The _MathScore_ model and KenLM model will be released in the near future. 34 | 35 | # OpenWebMath Pipeline 36 | 37 | Overview of the OpenWebMath Pipeline 38 | 39 | OpenWebMath builds on the massive [Common Crawl](https://commoncrawl.org/) dataset, which contains over 200B HTML documents. We filtered the data to only include documents that are: (1) in English, (2) contain mathematical content, and (3) are of high quality. We also put a strong emphasis on extracting LaTeX content from the HTML documents as well as reducing boilerplate in comparison to other web datasets. 40 | 41 | The OpenWebMath pipeline consists of five steps: 42 | 43 | 1. **Prefiltering HTML Documents**: 44 | - We apply a simple prefilter to all HTML documents in Common Crawl in order to skip documents without mathematical content, avoiding unnecessary processing time. 
45 | 2. **Text Extraction**: 46 | - Extract text, including LaTeX content, from the HTML documents while removing boilerplate. 47 | 3. **Content Classification and Filtering**: 48 | - Apply a [FastText language identification model](https://fasttext.cc/docs/en/language-identification.html) to keep only English documents. 49 | - Filter high perplexity documents using a [KenLM](https://github.com/kpu/kenlm) model trained on [Proof-Pile](https://huggingface.co/datasets/hoskinson-center/proof-pile). 50 | - Filter non-mathematical documents using our own _MathScore_ model. 51 | 4. **Deduplication**: 52 | - Deduplicate the dataset using SimHash in [text-dedup](https://github.com/ChenghaoMou/text-dedup). 53 | 5. **Manual Inspection**: 54 | - Inspect the documents gathered from previous steps and remove low quality pages. 55 | 56 | For a detailed discussion on the processing pipeline, please refer to our paper. 57 | 58 | # OpenWebMath Contents 59 | 60 | The dataset is structured as follows: 61 | 62 | ```python 63 | { 64 | "text": ..., # document text. 65 | "url": ..., # document url. 66 | "date": ..., # date the page was crawled. 67 | "metadata": ..., # JSON containing information from the extraction process. 68 | } 69 | ``` 70 | 71 | OpenWebMath contains documents from over 130k different domains, including data from forums, educational pages, and blogs. The dataset contains documents covering mathematics, physics, statistics, computer science, and more. The following table shows the most common domains in OpenWebMath by character count. 
72 | 73 | | Domain | # Characters | % Characters | 74 | | ----------------- | ------------- | ------------ | 75 | | stackexchange.com | 4,655,132,784 | 9.55% | 76 | | nature.com | 1,529,935,838 | 3.14% | 77 | | wordpress.com | 1,294,166,938 | 2.66% | 78 | | physicsforums.com | 1,160,137,919 | 2.38% | 79 | | github.io | 725,689,722 | 1.49% | 80 | | zbmath.org | 620,019,503 | 1.27% | 81 | | wikipedia.org | 618,024,754 | 1.27% | 82 | | groundai.com | 545,214,990 | 1.12% | 83 | | blogspot.com | 520,392,333 | 1.07% | 84 | | mathoverflow.net | 499,102,560 | 1.02% | 85 | 86 | # License 87 | 88 | OpenWebMath is made available under an ODC-By 1.0 license; users should also abide by the CommonCrawl ToU: [https://commoncrawl.org/terms-of-use/](https://commoncrawl.org/terms-of-use/). We do not alter the license of any of the underlying data. 89 | 90 | # Citation Information 91 | 92 | ``` 93 | @misc{paster2023openwebmath, 94 | title={OpenWebMath: An Open Dataset of High-Quality Mathematical Web Text}, 95 | author={Keiran Paster and Marco Dos Santos and Zhangir Azerbayev and Jimmy Ba}, 96 | year={2023}, 97 | eprint={2310.06786}, 98 | archivePrefix={arXiv}, 99 | primaryClass={cs.AI} 100 | } 101 | ``` 102 | -------------------------------------------------------------------------------- /extract_from_cc/configs/randomized_all.yaml: -------------------------------------------------------------------------------- 1 | markdown_headings: [ 2 | [0.9, True], 3 | [0.1, False], 4 | ] 5 | markdown_code: [ 6 | [0.95, True], 7 | [0.05, False], 8 | ] 9 | boilerplate_config: 10 | ratio_threshold: [ 11 | [0.9, 0.18], 12 | [0.1, 0.30], 13 | ] 14 | absolute_threshold: [ 15 | [0.9, 10], 16 | [0.1, 20], 17 | ] 18 | end_threshold: [ 19 | [0.95, 15], 20 | [0.05, 5], 21 | ] 22 | enable: [ 23 | [0.95, True], 24 | [0.05, False], 25 | ] 26 | remove_buttons: True 27 | remove_image_figures: True 28 | remove_link_clusters: True 29 | table_config: 30 | min_rows: 2 31 | min_cols: 3 32 | format: 'plain' 33 | 
remove_chinese: True 34 | remove_edit_buttons: True 35 | extract_latex: True -------------------------------------------------------------------------------- /extract_from_cc/extract_from_warc.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import traceback 3 | from fastwarc.warc import ArchiveIterator 4 | import argparse 5 | from tqdm import tqdm 6 | from resiliparse.parse.html import HTMLTree 7 | from resiliparse.extract.html2text import extract_plain_text 8 | from resiliparse.parse.encoding import detect_encoding, bytes_to_str 9 | import fsspec 10 | from time import sleep 11 | import random 12 | from io import BytesIO 13 | from resiliparse.process_guard import time_guard, ExecutionTimeout 14 | import json 15 | import os 16 | import re 17 | import fasttext 18 | import kenlm 19 | 20 | # Temporary debugging imports 21 | import uuid 22 | from collections import defaultdict 23 | 24 | from text_extract.extract import extract_text 25 | from text_extract.utils import Config 26 | from text_extract.latex_processing import get_math_config 27 | from text_normalizer import normalize 28 | 29 | # Load the fasttext model 30 | if os.path.isdir('../models'): 31 | MODEL_PATH = '../models/math_score.bin' 32 | else: 33 | MODEL_PATH = 'math_score.bin' 34 | 35 | def score_text(text): 36 | normalized_text = normalize(text).replace('\n', ' ') 37 | # Remove any [EQUATION] tokens 38 | normalized_text = normalized_text.replace('[EQUATION]', '') 39 | pred = score_model.predict(normalized_text, k=2) 40 | if pred[0][0] == '__label__positive': 41 | prob = pred[1][0] 42 | else: 43 | prob = pred[1][1] 44 | 45 | return prob 46 | 47 | # Load the kenlm model 48 | LM_PATH = '../models/lm-v2.binary' 49 | 50 | lm = kenlm.Model(LM_PATH) 51 | 52 | def document_perplexity(text): 53 | text = normalize(text) 54 | score = lm.score(text) 55 | return 10 ** (-score / len(text.split())) 56 | 57 | # Load the model 58 | score_model = 
fasttext.load_model(MODEL_PATH) 59 | 60 | def is_english(text): 61 | normalized_text = normalize(text).replace('\n', ' ') 62 | pred = lid_model.predict(normalized_text, k=1) 63 | if pred[0][0] == "__label__en" and pred[1][0] >= 0.5: 64 | return True 65 | return False 66 | 67 | MODEL_BIN = '../models/lid.176.bin' 68 | lid_model = fasttext.load_model(MODEL_BIN) 69 | 70 | randomized_config = Config('configs/randomized_all.yaml') 71 | 72 | MATH_KEYWORDS = [ 73 | 'MathJax', 74 | 'mathjax', 75 | ' 0.8 and len(text) > 500: 136 | return True 137 | 138 | return False 139 | 140 | def is_html(record): 141 | """Check that the record is an HTML record.""" 142 | if record.headers is None: 143 | return False 144 | if record.http_headers is None: 145 | return False 146 | content_type = str(record.http_content_type) 147 | if content_type.startswith('text/html') or content_type.startswith('application/xhtml+xml'): 148 | return True 149 | return False 150 | 151 | def extract(html, config): 152 | res = extract_text(html, config, fast=True) 153 | if res is None: 154 | return None 155 | text, info = res 156 | metadata = { 157 | 'extraction_info': info, 158 | 'config': config, 159 | } 160 | return text, metadata 161 | 162 | def load_warc(warc_file): 163 | """Loads a WARC file with fsspec. 
Retries if it fails.""" 164 | for i in range(10): 165 | try: 166 | with fsspec.open(warc_file, 'rb') as f: 167 | return f.read() 168 | except: 169 | if i == 9: 170 | raise Exception('Failed to read {}'.format(warc_file)) 171 | print('Retrying to read {}'.format(warc_file)) 172 | # Sleep a random amount of time 173 | sleep(random.random()) 174 | 175 | def process_warc(warc_file): 176 | """Yields extracted text from a WARC file.""" 177 | 178 | doc_counter = defaultdict(int) 179 | 180 | # Error if it takes more than 20 minutes 181 | with time_guard(timeout=60*20): 182 | try: 183 | f = load_warc(warc_file) 184 | for i in range(10): 185 | try: 186 | stream = BytesIO(f) 187 | break 188 | except: 189 | if i == 9: 190 | print('Failed to read {}'.format(warc_file)) 191 | return 192 | print('Retrying to read {}'.format(warc_file)) 193 | 194 | # We only want to extract text from the response records 195 | total_parsed = 0 196 | total_has_math = 0 197 | for record in tqdm(ArchiveIterator(stream)): 198 | try: 199 | doc_counter['records'] += 1 200 | if record.headers is None: continue 201 | if record.http_headers is None: continue 202 | if record.headers['WARC-Type'] != 'response': continue 203 | if not is_html(record): continue 204 | doc_counter['html'] += 1 205 | # Extract text from the payload 206 | html = record.reader.read() 207 | html = decode_html(html) 208 | url = record.headers.get('WARC-Target-URI') 209 | record_date = record.record_date 210 | if html is None: continue 211 | if not contains_math(html): continue 212 | doc_counter['passes prefilter'] += 1 213 | randomized_config_sample = randomized_config.sample() 214 | res = extract(html, randomized_config_sample) 215 | total_parsed += 1 216 | print(f'Running percentage: {total_has_math / total_parsed:.2f}, total parsed: {total_parsed}, total has math: {total_has_math}') 217 | if res is None: continue 218 | randomized_text, metadata = res 219 | 220 | found_math = metadata['extraction_info']['found_math'] 221 | if not 
is_english(randomized_text): continue 222 | doc_counter['is english'] += 1 223 | score = score_text(randomized_text) 224 | if found_math: 225 | if score < 0.15: continue 226 | else: 227 | if score < 0.8: continue 228 | doc_counter['passes score'] += 1 229 | 230 | metadata['extraction_info']['math_score'] = score 231 | perplexity = document_perplexity(randomized_text) 232 | metadata['extraction_info']['perplexity'] = perplexity 233 | 234 | if perplexity > 30_000: continue 235 | doc_counter['passes perplexity'] += 1 236 | total_has_math += 1 237 | 238 | yield (url, (randomized_text, html, warc_file, str(metadata), str(record_date))) 239 | except Exception as e: 240 | print(f'Execution timeout for {warc_file}') 241 | print(e) 242 | # Print a trace 243 | traceback.print_exc() 244 | except: 245 | print(f'Execution (probably) timed out for {warc_file}') 246 | return 247 | 248 | print(f'Finished processing {warc_file}' 249 | f' with {total_has_math} math-containing pages out of {total_parsed} parsed pages') 250 | 251 | # Save the doc counter as a text file with a uuid name 252 | with open(f'{uuid.uuid4()}.txt', 'w') as f: 253 | f.write(json.dumps(doc_counter)) 254 | 255 | 256 | def main(warc_file, output_dir): 257 | data = [] 258 | print('Extracting text from {}'.format(warc_file)) 259 | for url, (text, html, _, metadata, _) in process_warc(warc_file): 260 | data.append({'url': url, 'value': {'text': text, 'no_latex': '', 'no_randomization': '', 'html': html, 'warc_path': warc_file, 'metadata': metadata}}) 261 | # Remove directories if they exist 262 | if os.path.exists(output_dir): 263 | shutil.rmtree(output_dir) 264 | os.makedirs(output_dir) 265 | # Save the data as md files 266 | for i, d in enumerate(data): 267 | text = d['value']['text'] 268 | html = d['value']['html'] 269 | url = d['url'] 270 | with open(os.path.join(output_dir, '{}.md'.format(i)), 'w') as f: 271 | f.write('# {}\n\n'.format(url)) 272 | f.write(text) 273 | with open(os.path.join(output_dir, 
'{}.html'.format(i)), 'w') as f: 274 | f.write(html) 275 | 276 | 277 | if __name__ == '__main__': 278 | parser = argparse.ArgumentParser() 279 | parser.add_argument('--warc_file', help='WARC file to extract text from') 280 | parser.add_argument('--output_dir', help='Output dir') 281 | args = parser.parse_args() 282 | main(args.warc_file, args.output_dir) -------------------------------------------------------------------------------- /extract_from_cc/spark_extract_dataset.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from spark_session_builder import build_spark_session 3 | from pyspark.sql import SparkSession 4 | from pyspark import SparkContext 5 | from extract_from_warc import process_warc 6 | import jsonlines 7 | import os 8 | from pyspark.sql.types import StructType, StructField, StringType 9 | import uuid 10 | 11 | output_schema = StructType([ 12 | StructField('url', StringType(), True), 13 | StructField('value', StructType([ 14 | StructField('text', StringType(), True), 15 | StructField('html', StringType(), True), 16 | StructField('warc_path', StringType(), True), 17 | StructField('metadata', StringType(), True), 18 | StructField('date', StringType(), True), 19 | ]), True), 20 | ]) 21 | 22 | def process_filename(filename): 23 | if filename.startswith('file:'): 24 | filename = filename[5:] 25 | return filename 26 | 27 | def test_process_warc(warc_file): 28 | yield ('test_url', ('test_html', 'test_text')) 29 | 30 | def main(warc_file_list, 31 | output_dir, 32 | master='local', 33 | driver_memory=33, 34 | driver_cores=1, 35 | executor_memory=33, 36 | num_executors=29, 37 | executor_cores=5, 38 | num_cpus_per_task=5, 39 | num_output_partitions=1, 40 | output_format='parquet', 41 | output_compression='gzip'): 42 | spark = SparkSession.getActiveSession() 43 | if spark is not None: 44 | spark.stop() 45 | spark = build_spark_session(master=master, 46 | driver_memory=driver_memory, 47 | 
executor_memory=executor_memory, 48 | num_cpus_per_task=num_cpus_per_task, 49 | executor_cores=executor_cores, 50 | driver_cores=driver_cores) 51 | sc = SparkContext.getOrCreate() 52 | # warc_file_list is a text file with one warc file per line 53 | # Get the filename of warc_file_list 54 | warc_file_list_name = os.path.basename(warc_file_list).split('.')[0] 55 | warc_files = spark.read.text(warc_file_list).rdd.map(lambda r: r[0]).collect() 56 | warc_files = [process_filename(w) for w in warc_files] 57 | warc_count = len(warc_files) 58 | print('Found {} warc files'.format(warc_count)) 59 | warc_rdd = sc.parallelize(warc_files, warc_count) 60 | 61 | df = warc_rdd.flatMap(process_warc).toDF(output_schema) 62 | 63 | # Split into partitions 64 | df = df.repartition(num_output_partitions) 65 | # Write the output 66 | unique_id = str(uuid.uuid4()) 67 | output_file = os.path.join(output_dir, f'math_{warc_file_list_name}_{unique_id}') 68 | df.write.format(output_format).option('compression', output_compression).save(output_file) 69 | 70 | if __name__ == '__main__': 71 | parser = argparse.ArgumentParser() 72 | parser.add_argument('--warc_file_list', type=str) 73 | parser.add_argument('--output_dir', type=str) 74 | parser.add_argument('--master', type=str, default='local') 75 | parser.add_argument('--driver_memory', type=int, default=33) 76 | parser.add_argument('--driver_cores', type=int, default=1) 77 | parser.add_argument('--executor_memory', type=int, default=33) 78 | parser.add_argument('--num_executors', type=int, default=29) 79 | parser.add_argument('--executor_cores', type=int, default=5) 80 | parser.add_argument('--num_cpus_per_task', type=int, default=2) 81 | parser.add_argument('--mem_gb', type=int, default=16) 82 | parser.add_argument('--num_output_partitions', type=int, default=1) 83 | parser.add_argument('--output_format', type=str, default='json') 84 | parser.add_argument('--output_compression', type=str, default='gzip') 85 | args = parser.parse_args() 86 | 
main(warc_file_list=args.warc_file_list, 87 | output_dir=args.output_dir, 88 | master=args.master, 89 | driver_memory=args.driver_memory, 90 | driver_cores=args.driver_cores, 91 | executor_memory=args.executor_memory, 92 | num_executors=args.num_executors, 93 | executor_cores=args.executor_cores, 94 | num_cpus_per_task=args.num_cpus_per_task, 95 | num_output_partitions=args.num_output_partitions, 96 | output_format=args.output_format, 97 | output_compression=args.output_compression) -------------------------------------------------------------------------------- /extract_from_cc/spark_session_builder.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | import os 3 | import sys 4 | 5 | 6 | def build_spark_session(master, driver_memory, executor_memory, num_cpus_per_task, executor_cores, driver_cores): 7 | """Build a spark session based on the master url and the number of cores and memory to use""" 8 | if master == "local": 9 | spark = local_session(executor_cores, driver_memory) 10 | else: 11 | spark = aws_ec2_s3_spark_session(driver_memory, executor_memory, num_cpus_per_task, executor_cores, driver_cores) 12 | 13 | return spark 14 | 15 | 16 | def local_session(num_cores=4, mem_gb=16): 17 | """Build a local spark session""" 18 | spark = ( 19 | SparkSession.builder.config("spark.driver.memory", str(mem_gb) + "G") 20 | .master("local[" + str(num_cores) + "]") 21 | .appName("extract_math") 22 | .getOrCreate() 23 | ) 24 | return spark 25 | 26 | def aws_ec2_s3_spark_session(driver_memory, executor_memory, num_cpus_per_task, executor_cores, driver_cores): 27 | """Build a spark session on AWS EC2""" 28 | driver_memory = str(int(driver_memory)) + 'g' 29 | # executor_memory = str(int(executor_memory)) + 'g' 30 | main_memory = str(int(executor_memory * 0.9)) + 'g' 31 | memory_overhead = str(executor_memory - int(executor_memory * 0.9)) + 'g' 32 | spark = ( 33 | 
# NOTE(review): an extractor-spliced fragment of
# spark_session_builder.aws_ec2_s3_spark_session was removed here; see
# spark_session_builder.py for the full definition.

# --- extract_from_cc/text_normalizer.py ---
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# From https://github.com/facebookresearch/cc_net/blob/main/cc_net/text_normalizer.py

import re
import unicodedata

# Mapping of full-width / typographic punctuation to ASCII equivalents.
# NOTE(review): the "1" key looks like a mis-encoded quotation character from
# upstream -- as written it would rewrite a literal digit 1 into a double
# quote (normalize() masks digits first, so it rarely fires). Kept verbatim;
# confirm against upstream cc_net before changing.
UNICODE_PUNCT = {
    ",": ",",
    "。": ".",
    "、": ",",
    "„": '"',
    "”": '"',
    "“": '"',
    "«": '"',
    "»": '"',
    "1": '"',
    "」": '"',
    "「": '"',
    "《": '"',
    "》": '"',
    "´": "'",
    "∶": ":",
    ":": ":",
    "?": "?",
    "!": "!",
    "(": "(",
    ")": ")",
    ";": ";",
    "–": "-",
    "—": " - ",
    ".": ". ",
    "~": "~",
    "’": "'",
    "…": "...",
    "━": "-",
    "〈": "<",
    "〉": ">",
    "【": "[",
    "】": "]",
    "%": "%",
    "►": "-",
}

UNICODE_PUNCT_RE = re.compile(f"[{''.join(UNICODE_PUNCT.keys())}]")

# NOTE(review): the original MATH_RE / CODE_RE pattern literals were destroyed
# in this extraction chunk (only 'MATH_RE = r"(?' survived). The placeholders
# below match nothing so normalize() stays runnable -- restore the real
# patterns from the upstream repository.
MATH_RE = r"(?!x)x"
CODE_RE = r"(?!x)x"


def replace_unicode_punct(text: str) -> str:
    """Map typographic/full-width punctuation to its ASCII equivalent.

    (Signature reconstructed: the def line was split across extraction chunks.)
    """
    return "".join((UNICODE_PUNCT.get(c, c) for c in text))


def remove_unicode_punct(text: str) -> str:
    """More aggressive version of replace_unicode_punct but also faster."""
    return UNICODE_PUNCT_RE.sub("", text)


def strip_accents(line: str) -> str:
    """Strips accents (combining marks) from a piece of text.

    Bug fix: the original guard `if len(output) == line: return line` compared
    an int to a str, so it was always False and the join always ran; comparing
    to len(line) instead would wrongly skip stripping on composed input. The
    dead branch is removed -- the observable result is unchanged.
    """
    nfd = unicodedata.normalize("NFD", line)
    return "".join(c for c in nfd if unicodedata.category(c) != "Mn")


# Build a regex matching all control characters.
NON_PRINTING_CHARS_RE = re.compile(
    f"[{''.join(map(chr, list(range(0,32)) + list(range(127,160))))}]"
)
DIGIT_RE = re.compile(r"\d")
PUNCT_OR_NON_PRINTING_CHARS_RE = re.compile(
    (UNICODE_PUNCT_RE.pattern + NON_PRINTING_CHARS_RE.pattern).replace("][", "")
)


def remove_non_printing_char(text: str) -> str:
    """Drop ASCII control characters (C0 and C1 ranges)."""
    return NON_PRINTING_CHARS_RE.sub("", text)


def normalize_spacing_for_tok(text: str, language: str = "en") -> str:
    """Moses-style spacing normalization ahead of tokenization.

    NOTE(review): several `.replace(" +", " ")` calls are literal string
    replaces, not regexes -- kept verbatim from upstream.
    """
    res = (
        text.replace("\r", "")
        # remove extra spaces
        .replace("(", " (")
        .replace(")", ") ")
        .replace(" +", " ")
    )
    res = re.sub(r"\) ([\.\!\:\?\;\,])", r"\)\1", res)
    res = res.replace("( ", "(").replace(" )", ")")
    res = re.sub(r"(\d) \%", r"\1\%", res)
    res = res.replace(" :", ":").replace(" ;", ";")
    res = res.replace("`", "'").replace("''", ' " ')

    res = (
        res.replace("„", '"')
        .replace("“", '"')
        .replace("”", '"')
        .replace("–", "-")
        .replace("—", " - ")
        .replace(" +", " ")
        .replace("´", "'")
        .replace("([a-z])‘([a-z])", r"\1'\2/")
        .replace("([a-z])’([a-z])", r"\1'\2/")
        .replace("‘", '"')
        .replace("‚", '"')
        .replace("’", '"')
        .replace("''", '"')
        .replace("´´", '"')
        .replace("…", "...")
        # French quotes
        .replace(" « ", ' "')
        .replace("« ", '"')
        .replace("«", '"')
        .replace(" » ", '" ')
        .replace(" »", '"')
        .replace("»", '"')
        # handle pseudo-spaces
        .replace(" %", "%")
        .replace("nº ", "nº ")
        .replace(" :", ":")
        .replace(" ºC", " ºC")
        .replace(" cm", " cm")
        .replace(" ?", "?")
        .replace(" !", "!")
        .replace(" ;", ";")
        .replace(", ", ", ")
        .replace(" +", " ")
        .replace(".", ". ")
    )
    # English "quotation," followed by comma, style
    if language == "en":
        res = re.sub(r"\"([,\.]+)", r"\1\"", res)
    # Czech is confused
    elif language == "cs" or language == "cz":
        pass
    # German/Spanish/French "quotation", followed by comma, style
    else:
        res = res.replace(',"', '",')
        res = re.sub(
            r"(\.+)\"(\s*[^<])", r"\"\1\2", res
        )  # don't fix period at end of sentence

    if (
        language == "de"
        or language == "es"
        or language == "cz"
        or language == "cs"
        or language == "fr"
    ):
        res = re.sub(r"(\d) (\d)", r"\1,\2", res)
    else:
        res = re.sub(r"(\d) (\d)", r"\1.\2", res)
    return res


def normalize(line: str, accent=True, case=True, numbers=True, math=True, code=True, punct=1) -> str:
    """Normalize a line: lowercase, strip accents, mask digits with 0,
    canonicalize punctuation (punct=1 replace, punct=2 remove), and mask
    math/code spans with [EQUATION]/[CODE]."""
    line = line.strip()
    if not line:
        return line
    if case:
        line = line.lower()
    if accent:
        line = strip_accents(line)
    if numbers:
        line = DIGIT_RE.sub("0", line)
    if punct == 1:
        line = replace_unicode_punct(line)
    elif punct == 2:
        line = remove_unicode_punct(line)
    if math:
        line = re.sub(MATH_RE, "[EQUATION]", line, flags=re.DOTALL)
    if code:
        line = re.sub(CODE_RE, "[CODE]", line, flags=re.DOTALL)
    # NOTE(review): the two literal tokens originally replaced here were eaten
    # by the extractor; str.replace("", "") is a no-op kept as a placeholder.
    line = line.replace("", "").replace("", "")
    line = remove_non_printing_char(line)
    return line
def slow_normalize_for_dedup(line: str) -> str:
    """Full normalization (lowercase, digit masking, punct removal) for dedup keys."""
    return normalize(line, accent=False, case=True, numbers=True, punct=2)


def normalize_for_dedup(line: str) -> str:
    """Fast dedup normalization: lowercase, mask digits, strip punct/control chars."""
    line = line.strip()
    if not line:
        return line
    # case
    line = line.lower()
    # numbers
    line = DIGIT_RE.sub("0", line)
    line = PUNCT_OR_NON_PRINTING_CHARS_RE.sub("", line)
    return line


# --- filtering/filter_dataset.py ---
# coding=utf-8
import argparse
import json
import re

# NOTE(review): unused file-level imports of tqdm and matplotlib.pyplot were
# removed; the heavy `datasets` import is deferred into main() so the pure
# filtering helpers stay importable without it.

# URL substrings that disqualify a document.
# Fix: '/authors/', '/serials/' and 'read.dukeupress.edu' were previously
# fused into one dead string by missing commas (implicit concatenation), so
# none of the three patterns ever matched.
BAD_URLS = [
    'worldwidescience',
    'science.gov',
    'archive.org',
    'scribd.com',
    'unz.com',
    '/profile/',
    '/researcher',
    'noobstarter.com',
    'philpapers.org',
    'thesa.com',
    'beyondhighbrow.com',
    'careyoukeep.com',
    'eevblog.com',
    'happyslide.net',
    'issuu.com',
    'zh-cn.unz.com',
    'vixra.org',
    'medcraveonline.com',
    'sciendo.com',
    'open.library.ubc.ca',
    'eurotrib.com',
    'postthreads.org',
    'jim.bmj.com',
    'wanweibaike.com',
    'hzdr.de',
    '/joursearch/',
    'docplayer.net',
    'bookofmormonblog.org',
    'bradford-delong.com',
    'profiles.stanford.edu',
    'vo.astronet.ru',
    'homainstallationen.at',
    '/author/',
    '/authors/',
    '/serials/',
    'read.dukeupress.edu',
    'thewikipost.org',
    'is.tuebingen.mpg.de',
    'discourse.darkjedibrotherhood.com',
    'springermedizin.de',
    'materials-chain.com',
    'www.unzmag.net',
    'is.mpg.de',
    'hobby8.5ch.net',
    'forums.penny-arcade.com',
    'wowwiki.com',
    '8chan.moe',
    'plosone.org',
    'www.is.mpg.de',
    'feeds.churchturing.org',
    'learn.gcs.edu',
    'mobinuke.com',
    'judithcurry.com',
    'tek-tips.com',
    'skepticforum.com',
    'all_publications',
    '.de/publications',
    'nih.gov',
    'lastfm.it',
    '/commit',
    'vitaminstore',
    'studylib.net',
    'dokumen.pub',
    'manualzz.com',
    'fraser.stlouisfed.org',
]

# libretexts.org subdomain keywords that are whitelisted (math-adjacent).
libretext_good = [
    'math',
    'phys',
    'stats',
]

accented_chars = set(['ü', 'ï', 'ö', 'ê', 'ä', 'â', 'ê', 'î', 'û', 'ô', 'è', 'é', 'à'])


def has_accented_char(text):
    """True when more than 1.5% of characters are accented latin letters
    (cheap heuristic for non-English text)."""
    if len(text) == 0:
        return False
    num_accent = sum(c in accented_chars for c in text.lower())
    return num_accent / len(text) > 0.015


def count_latex_formulas(text):
    """Count inline ($...$) and display ($$...$$) formulas, ignoring the
    LibreTexts \\PageIndex boilerplate macros."""
    cleaned_text = re.sub(r'\$\$\\PageIndex\{[^\}]*\}\$\$', '', text)
    cleaned_text = re.sub(r'\$\\PageIndex\{[^\}]*\}\$', '', cleaned_text)
    # Pattern for inline and display math
    pattern = r'\$\$[^\$]*\$\$|\$[^\$]*\$'
    matches = re.findall(pattern, cleaned_text)
    return len(matches)


def filter_data(data):
    """Per-row quality gate: perplexity/math-score thresholds plus a set of
    URL patterns for citation/search/profile pages. Returns True to keep."""
    metadata = json.loads(data['metadata'])
    perplexity = metadata['extraction_info']['perplexity']
    math_score = metadata['extraction_info']['math_score']
    if perplexity > 15_000:
        return False
    if math_score < 0.17:
        return False
    if 'arxiv-vanity' in data['url']:
        return False
    # Check if /search is in the path (but not a //search authority)
    if '/search' in data['url'] and '//search' not in data['url']:
        return False
    if 'proceedings' in data['url']:
        return False
    if 'bibbase' in data['url']:
        return False
    if 'nrsworld.com' in data['url']:
        return False
    if 'bibtex' in data['url']:
        return False
    if 'issn' in data['url']:
        return False
    if 'arxiv-export' in data['url']:
        return False
    if 'bmjopen' in data['url']:
        return False
    if 'stackexchange.com/users' in data['url']:
        return False
    if 'mathoverflow.net/users' in data['url']:
        return False
    return True


def process_data(datas):
    """Batched datasets.map callback: drop rows failing the quality filters and
    strip lines with repeated \\newcommand definitions.

    Takes and returns a dict of columns (lists); the output batch may be
    smaller than the input batch.
    """
    # Convert the dict of columns into a list of row dicts.
    rows = [dict(zip(datas, t)) for t in zip(*datas.values())]
    kept = []
    for data in rows:
        url = data['url']
        text = data['text']

        should_filter = not filter_data(data)

        # Filter out bad urls
        for bad_url in BAD_URLS:
            if bad_url in url:
                should_filter = True
                break

        # Remove any line that has more than one "newcommand"
        lines = text.split('\n')
        new_lines = []
        for line in lines:
            if line.count('newcommand') > 1:
                continue
            new_lines.append(line)
        text = '\n'.join(new_lines)
        data['text'] = text

        # Filter less than 100 characters
        if len(text) < 100:
            should_filter = True

        if 'libretexts' in url:
            # Keep only whitelisted subdomains, unless the page has math.
            is_whitelist = False
            for good in libretext_good:
                if good in url:
                    is_whitelist = True
                    break
            if not is_whitelist:
                # Throw out if 0 math formulas
                if count_latex_formulas(text) == 0:
                    should_filter = True

        # Filter out accents
        if has_accented_char(text):
            should_filter = True

        if not should_filter:
            kept.append(data)

    # Transform back to a dict of lists. Fix: an all-filtered batch previously
    # raised IndexError on kept[0]; return empty columns instead.
    if not kept:
        return {k: [] for k in datas}
    return {k: [d[k] for d in kept] for k in kept[0]}


def main(args):
    """Load the dataset, apply the batched filter, and save it to disk."""
    # Deferred heavy third-party import (see module-level note).
    from datasets import load_dataset

    dataset = load_dataset(args.input, split='train')
    print(dataset)

    # Filter the dataset (batched map lets rows be dropped).
    filtered_dataset = dataset.map(process_data, num_proc=args.n_processes, batched=True)
    print(filtered_dataset)

    # Save the dataset
    filtered_dataset.save_to_disk(args.output_dataset)
if __name__ == '__main__':
    # Command-line entry point for the dataset filtering step.
    cli = argparse.ArgumentParser()
    cli.add_argument("--input", type=str, help="Huggingface dataset name")
    cli.add_argument("--output_dataset", type=str, required=True, help="path to save dataset")
    cli.add_argument("--n_processes", type=int, default=32, help="Number of processes to use")
    main(cli.parse_args())


# --- text_extraction/setup.py ---
from setuptools import setup

# Minimal packaging metadata for the text_extract package.
setup(
    name='text_extract',
    version='0.1',
    description='Text extraction tools',
    author='',
    packages=['text_extract'],
)
https://raw.githubusercontent.com/keirp/OpenWebMath/98f81d0ae14ff8dd877b84226b69c825b64da498/text_extraction/text_extract/__init__.py -------------------------------------------------------------------------------- /text_extraction/text_extract/banned_selectors.txt: -------------------------------------------------------------------------------- 1 | .breadcrumb 2 | #popup 3 | #flyout 4 | #site-slogan 5 | #site-name 6 | #menu 7 | .nav 8 | .login 9 | .dropdown 10 | .dropdown-menu 11 | #login 12 | .vote 13 | .form-item 14 | .user_pic_popup 15 | #post-editor 16 | .post-form 17 | .bottom-notice 18 | #sidebar 19 | #copyright 20 | #footer 21 | .footer 22 | .site-navigation 23 | .popupgroup 24 | .posthead 25 | .signaturecontainer 26 | .after_content 27 | .userinfo 28 | #similar_threads 29 | .toplinks 30 | .user-info 31 | .post-header 32 | .widget_archive 33 | .widget_categories 34 | .widget_meta 35 | .widget_recent_entries 36 | .widget_rss 37 | .wp_widget_tag_cloud 38 | .widget_calendar 39 | .navbox 40 | #mw-hidden-catlinks 41 | .above_postlist 42 | #navigation 43 | .threadtools 44 | .socialbuttons 45 | #respond 46 | .menu 47 | .WikiaHeader 48 | .buttons 49 | #WikiaRecentActivity 50 | #WikiaRandomWiki 51 | .loggedout-follow-normal 52 | #blurb 53 | #banner-top 54 | .topbar 55 | .topbar-dialog 56 | .related-links 57 | .votecell 58 | .comment-actions 59 | .d-none 60 | .Tooltip 61 | .Notices 62 | .likes-other-gravatars 63 | #logo_and_banner 64 | #pmmcrumb2 65 | .qa-notice 66 | .qa-nav-user 67 | .trackbacks 68 | #further_reading 69 | .topbar-links 70 | #your-communities-section 71 | .links-container 72 | #herobox 73 | .qa-voting-container 74 | .qa-post-when-container 75 | .qa-q-view-who 76 | .qa-q-item-meta 77 | .post-menu 78 | #vbseo-likes 79 | #side_one 80 | #side_two 81 | #feed_bar 82 | .author 83 | #likes-other-gravatars 84 | .pageInfo 85 | .ka-video-player 86 | .mw-editsection 87 | .mw-ui-icon 88 | #mw-revision-info 89 | #siteSub 90 | .heading--main 91 | #loginBarHandle 92 
| .medalsrest 93 | .diff-otitle 94 | .diff-ntitle 95 | .diff-currentversion-title 96 | .diff-contentalign-left 97 | [class*="promo"] 98 | [class*="button"] 99 | [class*="upsell"] 100 | .expert-reply-overlay 101 | .PreviewContents 102 | .solutionHeader__isbn 103 | .cta 104 | .update-header 105 | .best-answer-selected 106 | .medal-info 107 | #profile-tooltip 108 | .update-info 109 | .google-search-openstudy 110 | .attachments 111 | button 112 | .delete 113 | .editor-actions 114 | .editor 115 | .files-attached 116 | .call-to-action 117 | .group-info 118 | .top-online-users 119 | .message-userExtras 120 | .message-attribution-opposite 121 | .u-srOnly 122 | .block--similarContents 123 | .u-concealed 124 | .similarThreads 125 | .breadcrumbs 126 | .courseHeader 127 | .impactSection 128 | .creativeCommons 129 | #clear -------------------------------------------------------------------------------- /text_extraction/text_extract/boilerplate_words.txt: -------------------------------------------------------------------------------- 1 | © 2 | updates 3 | join our 4 | buy 5 | sign up 6 | no results 7 | search images 8 | all rights reserved 9 | was this 10 | please 11 | visit our 12 | download for free 13 | retrieved from 14 | home page 15 | jump to 16 | notification switch 17 | your email address 18 | view answer 19 | no label found 20 | is licensed under 21 | regular updates 22 | copyright 23 | have access to this article 24 | youtube 25 | advertisment 26 | password 27 | login 28 | learn more 29 | cookie 30 | jump to navigation 31 | download 32 | table of contents 33 | leave a reply 34 | leave a message 35 | skip to 36 | stay updated 37 | contact us 38 | twitter 39 | from wikibooks, open books for an open world 40 | last modified 41 | from wikipedia, the free encyclopedia 42 | more info 43 | terms of use 44 | terms of service 45 | privacy policy 46 | navigation 47 | sign in 48 | report error 49 | newest 50 | under license 51 | follow 52 | newer 53 | notification 54 | post a 
comment 55 | click here 56 | leave a comment 57 | google 58 | free account 59 | for free 60 | alert 61 | receive update 62 | share this 63 | report ad 64 | more posts 65 | date of creation 66 | link 67 | powered by 68 | receive 69 | newsletter 70 | pdf version 71 | ask 72 | your 73 | facebook 74 | jump to search 75 | required fields 76 | back to top 77 | published by 78 | pdf article 79 | accessload 80 | start with 81 | loading 82 | username 83 | helpful 84 | log in 85 | license 86 | get the best 87 | join us 88 | full article 89 | attribution 90 | main content 91 | printed from 92 | distributed under 93 | rss 94 | 24/7 95 | your service 96 | please contact 97 | captcha 98 | might be incomplete 99 | about this 100 | lifetime 101 | access to 102 | this article is 103 | not found 104 | show more 105 | about 106 | business 107 | interested in joining 108 | wikipedia page 109 | gift 110 | premium 111 | purchase this 112 | purchasing 113 | access denied 114 | wims 115 | latest version 116 | this page 117 | your web browser 118 | recent version 119 | this article 120 | please help 121 | help you 122 | discard 123 | view tag cloud 124 | reply 125 | sponsor 126 | return to 127 | physicsoverflow is an open platform for community peer review 128 | comments 129 | trackback 130 | show menu 131 | add comment 132 | printable view 133 | advertisement 134 | join now 135 | from proofwiki 136 | energy points 137 | at the top of this image page 138 | all products 139 | maplesim 140 | online help 141 | see also 142 | all lesson plans 143 | menu 144 | check out more articles 145 | ad 146 | votes 147 | answer 148 | question you clicked on 149 | this question is closed -------------------------------------------------------------------------------- /text_extraction/text_extract/extract.py: -------------------------------------------------------------------------------- 1 | from resiliparse.parse.html import HTMLTree 2 | from resiliparse.extract.html2text import extract_plain_text 3 | 
import os
import re

from text_extract.latex_processing import extract_math, extract_delimited_math, get_math_config, replace_math_tags_with_dollar_signs
from text_extract.tree_processing import remove_buttons, remove_image_figures, extract_code, extract_tables, extract_headings, remove_dense_links, add_se_separators, wikipedia_preprocess, remove_display_none, main_content_preprocess, post_process_headings
from text_extract.line_processing import remove_empty_headers, remove_edit_buttons, remove_chinese_characters, remove_boilerplate
from text_extract.utils import ReplacementManager

# Dump a traceback if the process is killed while parsing pathological HTML.
import faulthandler
faulthandler.enable()


# CSS selectors of boilerplate elements to skip, one per line.
selectors_path = os.path.join(os.path.dirname(__file__), "banned_selectors.txt")
with open(selectors_path, "r") as f:
    selectors = [line.replace('\n', '').strip() for line in f]
# Remove empty lines
selectors = [line for line in selectors if line]

def filter_tree(tree, replacement_manager, config):
    """Filters the HTML tree to remove unwanted elements.

    Mutates `tree` in place via the tree_processing helpers; the order of the
    passes matters (e.g. headings are recorded before dense links are dropped).
    """

    # Remove display none elements
    remove_display_none(tree)

    # Remove the wikipedia footer
    wikipedia_preprocess(tree)

    if config['remove_buttons']:
        # Remove any bootstrap buttons
        remove_buttons(tree)

    if config['remove_image_figures']:
        # Remove any figures that only contain images
        remove_image_figures(tree)

    if config['markdown_code']:
        # Wrap the code in markdown code blocks
        extract_code(tree, replacement_manager)

    # Record the location of headings and format them
    extract_headings(tree, replacement_manager, config['markdown_headings'])

    # Remove link lists
    remove_dense_links(tree)

    # Format tables
    extract_tables(tree.document, replacement_manager, config['table_config'])

    # Process stack exchange separators
    add_se_separators(tree)

    # Preprocess main content
    main_content_preprocess(tree)

    return tree

def html_preprocessing(html):
    """Rewrite MediaWiki-style math tags to [itex] markers before parsing so
    the math survives plain-text extraction."""
    html = html.replace("<math>", "[itex]")
    html = html.replace("</math>", "[/itex]")
    return html

def replace_tags(html, old, new):
    """Case-insensitive regex replacement of `old` with `new` in raw HTML."""
    pattern = re.compile(old, re.IGNORECASE)
    return pattern.sub(new, html)

def extract_text(html, config, fast=False):
    """Extracts plain text from an HTML string.

    Returns (text, info) where info carries math-extraction statistics.
    NOTE(review): the `fast` early-exit returns bare None (not a tuple) --
    callers presumably check for None before unpacking; confirm.
    """
    # NOTE(review): the tag arguments of this call (original source lines
    # 70-73, apparently several replace_tags calls) were destroyed by the
    # extractor -- restore them from the upstream repository.
    html = replace_tags(html, '', '')
    html = html_preprocessing(html)
    tree = HTMLTree.parse(html)
    replacement_manager = ReplacementManager()

    if fast:
        # Skip link-farm pages outright; they are slow to process and are
        # almost never real content.
        links = tree.document.query_selector_all('a')
        span_links = tree.document.query_selector_all('span a')
        if len(links) > 3000 or len(span_links) > 3000:
            print("Too many links, skipping")
            return None

    if config['extract_latex']:
        math_config = get_math_config(tree.document.html)
        tree, info = extract_math(tree, replacement_manager)
    else:
        info = {}
    tree = filter_tree(tree, replacement_manager, config)

    # Disable their filters because we use our own.
    text = extract_plain_text(tree,
                              main_content=True,
                              alt_texts=False,
                              skip_elements=selectors)

    if config['extract_latex']:
        text = extract_delimited_math(text, math_config, info, replacement_manager)

    text = post_process_headings(text)

    lines = text.split("\n")

    if config['remove_chinese']:
        # Remove Chinese characters
        lines = remove_chinese_characters(lines)

    if config['boilerplate_config']['enable']:
        # Remove boilerplate
        lines = remove_boilerplate(lines, config['boilerplate_config'], replacement_manager)

    # Remove headings with nothing (or only other headings) after
    lines = remove_empty_headers(lines, replacement_manager)

    # Strip lines
    lines = [line.strip() for line in lines]

    # Create the final string
    text = "\n".join(lines)

    # Escape any dollar signs in the text (so real math added next is unambiguous)
    text = text.replace("$", "\\$")

    # Now, add the dollar signs for math
    text = replace_math_tags_with_dollar_signs(text)

    if config['remove_edit_buttons']:
        # Remove edit buttons
        lines = text.split("\n")
        lines = remove_edit_buttons(lines)
        text = "\n".join(lines)

    # If there are over two newlines in a row, replace with two
    text = re.sub(r'\n{3,}', '\n\n', text)

    text = replacement_manager.remove_tags(text)

    text = text.strip()

    return text, info


# --- text_extraction/text_extract/latex_processing.py ---
from py_asciimath.translator.translator import ASCIIMath2Tex
import logging
from lxml import etree as ET
import re
import html
import os
import json
from resiliparse.parse.html import traverse_dom
import urllib3
from urllib.parse import unquote
# Silence noisy library loggers (py_asciimath logs at INFO).
logging.getLogger().setLevel(logging.ERROR)

# Matches \textcolor[...]{...} commands so they can be stripped from formulas.
color_regex = re.compile(r'\\textcolor\[.*?\]\{.*?\}')

# Shared AsciiMath -> LaTeX translator (construction is relatively expensive).
asciimath2tex = ASCIIMath2Tex(log=False)

# HTML tags treated as paragraph-level boundaries.
PARAGRAPH_TAGS = frozenset({
    'body', 'blockquote', 'caption', 'center', 'col', 'colgroup', 'dd',
    'div', 'dl', 'dt', 'fieldset', 'form', 'legend', 'optgroup', 'option',
    'p', 'pre', 'table', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr',
    'ul', 'li', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
})

# Common LaTeX commands: a page containing any of these (outside MathJax/KaTeX
# markers) is assumed to contain raw LaTeX.
latex_math_commands = [
    "\\end", "\\begin", "\\ref", "\\frac", "\\label", "\\bf", "\\right", "\\left",
    "\\rm", "\\alpha", "\\mu", "\\def", "\\it", "\\pi", "\\sigma", "\\sum", "\\lambda",
    "\\beta", "\\nu", "\\partial", "\\int", "\\delta", "\\rho", "\\phi", "\\gamma",
    "\\omega", "\\over", "\\nonumber", "\\bar", "\\sqrt", "\\theta", "\\tau", "\\em",
    "\\rangle", "\\hat", "\\tilde", "\\cal", "\\hline", "\\item", "\\psi", "\\vec",
    "\\langle", "\\epsilon", "\\eta", "\\cdot", "\\in", "\\xi", "\\infty", "\\quad",
    "\\mathcal", "\\times", "\\emph", "\\mathbf", "\\prime", "\\be", "\\mathrm", "\\ee",
    "\\vspace", "\\pm", "\\chi", "\\ell", "\\text", "\\qquad", "\\noindent", "\\to",
    "\\varphi", "\\hspace", "\\leq", "\\cos", "\\eqref", "\\overline", "\\sin", "\\kappa",
    "\\hbox", "\\rightarrow", "\\varepsilon", "\\textit", "\\dagger", "\\big", "\\otimes",
    "\\equiv", "\\zeta", "\\dot", "\\ln"
]

# CSS class names used by sites that render LaTeX as <img> elements.
latex_image_class_names = [
    'latexcenter',
    'latex',
    'tex',
    'latexdisplay',
    'latexblock',
    'latexblockcenter',
]


# Escape the commands and require a non-letter after each (so "\in" does not
# match "\infty"), then build one alternation regex.
latex_math_commands = [re.escape(term) for term in latex_math_commands]
latex_math_commands = [x + '(?![a-zA-Z])' for x in latex_math_commands]
latex_regex = re.compile('|'.join(latex_math_commands))

def extract_asciimath(s):
    """Translate an AsciiMath string to LaTeX (raises on invalid input)."""
    parsed = asciimath2tex.translate(s)
    return parsed

# XSLT stylesheet (shipped next to this module) that converts MathML to LaTeX.
cur_file = os.path.abspath(__file__)
xsl_path = os.path.join(os.path.dirname(cur_file), 'mmltex/mmltex.xsl')

xslt = ET.parse(xsl_path)
transform = ET.XSLT(xslt)

def mml_to_latex(mml_code):
    """Convert a MathML snippet to LaTeX via the mmltex XSLT stylesheet.

    NOTE(review): the regex and replace literals below were garbled by the
    extractor (angle-bracket content eaten) -- as written the re.sub and the
    str.replace are no-ops; restore the real tag-rewriting arguments from the
    upstream repository.
    """
    # Remove any attributes from the math tag
    mml_code = re.sub(r'()', r'\1', mml_code)
    mml_ns = mml_code.replace('', '') #Required.
    mml_dom = ET.fromstring(mml_ns)
    mmldom = transform(mml_dom)
    latex_code = str(mmldom)
    return latex_code

def wrap_math(s, display=False):
    """Normalize whitespace/colors in a formula and wrap it in the internal
    [extract_itex]/[extract_tex] markers (display selects the tex form)."""
    s = re.sub(r'\s+', ' ', s)
    s = color_regex.sub('', s)
    s = s.replace('$', '')
    s = s.replace('\n', ' ')
    s = s.strip()
    if len(s) == 0:
        return s
    # Don't wrap if it's already in \align
    if 'align' in s:
        return s
    if display:
        return '[extract_tex]' + s + '[/extract_tex]'
    return '[extract_itex]' + s + '[/extract_itex]'

def get_math_config(html):
    """Scan raw HTML for MathJax/KaTeX/raw-LaTeX markers and return the
    delimiter configuration to use, or None when the page has no math.

    Site-specific tex2jax / KaTeX / asciimath2jax configs found in the page
    are parsed (best effort, JS object coerced to JSON) and merged into the
    defaults. NOTE(review): in the defaults below \\[..\\] sits in inlineMath
    and \\(..\\) in displayMath, the opposite of the MathJax convention --
    possibly intentional upstream; confirm before changing.
    """
    has_mathjax = re.search(r"mathjax", html.lower())
    has_katex = re.search(r"katex", html.lower())
    has_latex_math_command = latex_regex.search(html)
    if not has_mathjax and not has_katex and not has_latex_math_command:
        return None
    # Get LaTeX config for MathJax
    regex = r"tex2jax: {[^}]*}"
    latex_config = {
        "inlineMath": [["$", "$"], ["\[", "\]"],
                       ["[itex]", "[/itex]"], ["[math]", "[/math]"],
                       ["[latex]", "[/latex]"], ["[texx]", "[/texx]"]],
        "displayMath": [["\(", "\)"], ["$$", "$$"], ["[tex]", "[/tex]"]],
        "skipTags": ["script", "noscript", "style", "textarea", "pre", "code"],
        "ignoreClass": "tex2jax_ignore"
    }
    try:
        match = re.search(regex, html)
        if match:
            config = match.group(0)
            # Make it a valid json object by adding quotes around the keys
            config = re.sub(r"(\w+):", r'"\1":', config)
            config = "{" + config + "}"
            # config = re.sub(r"\\", r"\\\\", config)
            config = re.sub(r"'", r'"', config)
            config = re.sub(r",\s*}", "}", config)
            extracted_latex_config = json.loads(config)['tex2jax']
            # latex_config.update(extracted_latex_config)
            # Update this in a smart way: if the key is already there, append the values
            # if the key is not there, add it
            for key in extracted_latex_config:
                if key in latex_config and key != 'ignoreClass':
                    latex_config[key] += extracted_latex_config[key]
                else:
                    latex_config[key] = extracted_latex_config[key]
    except Exception as e:
        # Best effort: malformed site config falls back to the defaults.
        pass

    # Get LaTeX config for KaTeX
    """ delimiters: [
        {left: '$$', right: '$$', display: true}
    ],
    """
    regex = r"delimiters: \[[^\]]*\]"
    try:
        match = re.search(regex, html)
        if match:
            config = match.group(0)
            # Make it a valid json object by adding quotes around the keys
            config = re.sub(r"(\w+):", r'"\1":', config)
            # The match is a list without the [] around it. Wrap with {"delimiters": ...}
            config = '{' + config + '}'
            config = re.sub(r"'", r'"', config)
            config = re.sub(r",\s*}", "}", config)
            extracted_latex_config = json.loads(config)['delimiters']
            for delimiter in extracted_latex_config:
                if delimiter['display']:
                    latex_config['displayMath'].append([delimiter['left'], delimiter['right']])
                else:
                    latex_config['inlineMath'].append([delimiter['left'], delimiter['right']])
    except Exception as e:
        # Best effort: malformed site config falls back to the defaults.
        pass

    # Get AsciiMath config
    regex = r"asciimath2jax: {[^}]*}"
    asciimath_config = {
        "delimiters": [["`" , "`"]],
        "skipTags": ["script", "noscript", "style", "textarea", "pre", "code"],
        "ignoreClass": "asciimath2jax_ignore"
    }
    try:
        match = re.search(regex, html)
        if match:
            config = match.group(0)
            # Make it a valid json object by adding quotes around the keys
            config = re.sub(r"(\w+):", r'"\1":', config)
            config = "{" + config + "}"
            # config = re.sub(r"\\", r"\\\\", config)
            config = re.sub(r"'", r'"', config)
            config = re.sub(r",\s*}", "}", config)
            extracted_asciimath_config = json.loads(config)['asciimath2jax']
            asciimath_config.update(extracted_asciimath_config)
    except Exception as e:
        # Best effort: malformed site config falls back to the defaults.
        pass
    return {
        "latex": latex_config,
        "asciimath": asciimath_config
    }

def html_unescape(s):
    """Thin wrapper over html.unescape (kept for import symmetry)."""
    return html.unescape(s)

def replace_math_tags_with_dollar_signs(text):
    """Rewrite the internal extract_itex/extract_tex markers to $ / $$."""
    # Replace each of these in the proper way
    # itex -> $...$
    # tex -> $$...$$
    # asciimath -> ...

    # Instead of this, simply replace extract_itex with $ and extract_tex with $$.
    text = re.sub(r'\[extract_itex\]', '$', text)
    text = re.sub(r'\[/extract_itex\]', '$', text)
    text = re.sub(r'\[extract_tex\]', '$$', text)
    text = re.sub(r'\[/extract_tex\]', '$$', text)

    return text


def update_text_with_delimiters(text, delimiters, replacement_manager, info):
    # Replaces math spans found between the configured delimiters with tagged
    # internal markers and records per-type counts in `info`.
    # NOTE(review): this function continues beyond this extraction chunk; the
    # final `text =` assignment below is completed there.

    def replace_itex(match):
        wrapped = wrap_math(match.group(1))
        tagged = replacement_manager.add_replacement(wrapped, tag='math')
        return tagged

    def replace_tex(match):
        wrapped = wrap_math(match.group(1), display=True)
        tagged = replacement_manager.add_replacement(wrapped, tag='math')
        return tagged

    def replace_asciimath(match):
        wrapped = match.group(1)
        tagged = replacement_manager.add_replacement(wrapped, tag='math')
        return tagged

    for delimiter, type in delimiters:
        start_delimiter = re.escape(delimiter[0])
        end_delimiter = re.escape(delimiter[1])
        regex = f"{start_delimiter}(.*?){end_delimiter}"
        if type == 'INLINE_LATEX':
            # Simply replace the delimiters with [itex] and [/itex]
            updated_text = re.sub(regex, replace_itex, text, flags=re.DOTALL)
            if updated_text != text:
                info['found_math'] = True
                info['mathjax_inline_tex'] += 1
                text =
updated_text 222 | elif type == 'DISPLAY_LATEX': 223 | updated_text = re.sub(regex, replace_tex, text, flags=re.DOTALL) 224 | if updated_text != text: 225 | info['found_math'] = True 226 | info['mathjax_display_tex'] += 1 227 | text = updated_text 228 | elif type == 'ASCIIMATH': 229 | updated_text = re.sub(regex, replace_asciimath, text, flags=re.DOTALL) 230 | if updated_text != text: 231 | info['found_math'] = True 232 | info['mathjax_asciimath'] += 1 233 | text = updated_text 234 | 235 | return text 236 | 237 | def extract_delimited_math(text, mathjax_config, info, replacement_manager): 238 | """This operates on plain text and extracts LaTeX and AsciiMath""" 239 | # import pdb; pdb.set_trace() 240 | if mathjax_config is None: 241 | return text 242 | delimiters = [] 243 | for delimiter in mathjax_config['latex']['inlineMath']: 244 | delimiters.append((delimiter, 'INLINE_LATEX')) 245 | for delimiter in mathjax_config['latex']['displayMath']: 246 | delimiters.append((delimiter, 'DISPLAY_LATEX')) 247 | for delimiter in mathjax_config['asciimath']['delimiters']: 248 | delimiters.append((delimiter, 'ASCIIMATH')) 249 | 250 | delimiters = sorted(delimiters, key=lambda x: len(x[0][0]), reverse=True) 251 | text = update_text_with_delimiters(text, delimiters, replacement_manager, info) 252 | return text 253 | 254 | def extract_math(tree, replacement_manager): 255 | """Webpages often contain LaTeX or AsciiMath equations that are 256 | hidden within the HTML. This function extracts the LaTeX and 257 | AsciiMath equations from the HTML. 
258 | """ 259 | 260 | info = { 261 | 'found_math': False, 262 | 'script_math_tex': 0, 263 | 'script_math_asciimath': 0, 264 | 'math_annotations': 0, 265 | 'math_alttext': 0, 266 | 'mathml': 0, 267 | 'mathjax_tag': 0, 268 | 'mathjax_inline_tex': 0, 269 | 'mathjax_display_tex': 0, 270 | 'mathjax_asciimath': 0, 271 | 'img_math': 0, 272 | 'codecogs_latex': 0, 273 | 'wp_latex': 0, 274 | 'mimetex.cgi': 0, 275 | '/images/math/codecogs': 0, 276 | 'mathtex.cgi': 0, 277 | 'katex': 0, 278 | 'math-container': 0, 279 | 'wp-katex-eq': 0, 280 | 'align': 0, 281 | 'equation': 0, 282 | 'x-ck12': 0, 283 | 'texerror': 0, 284 | } 285 | 286 | # Find and tag any \align environments 287 | def start_callback(element): 288 | regex = r'\\begin{align}(.*?)\\end{align}' 289 | if element.node.type == 3: 290 | text = element.node.text 291 | matches = re.findall(regex, text, re.DOTALL) 292 | for match in matches: 293 | info['align'] += 1 294 | info['found_math'] = True 295 | match = replacement_manager.add_replacement(match, tag='math') 296 | text.replace(match, match) 297 | element.node.text = text 298 | 299 | def end_callback(element): 300 | pass 301 | 302 | body = tree.document.query_selector("body") 303 | traverse_dom(body, start_callback, end_callback) 304 | 305 | # Find any \equation environments 306 | def start_callback(element): 307 | regex = r'\\begin{equation}(.*?)\\end{equation}' 308 | if element.node.type == 3: 309 | text = element.node.text 310 | matches = re.findall(regex, text, re.DOTALL) 311 | for match in matches: 312 | info['equation'] += 1 313 | info['found_math'] = True 314 | match = match.replace('\\begin{equation}', '') 315 | match = match.replace('\\end{equation}', '') 316 | wrapped_text = wrap_math(match, display=True) 317 | wrapped_text = replacement_manager.add_replacement(wrapped_text, tag='math') 318 | text = text.replace(match, wrapped_text) 319 | # Remove the \begin{equation} and \end{equation} tags 320 | text = text.replace('\\begin{equation}', '') 321 | text = 
text.replace('\\end{equation}', '') 322 | element.node.text = text 323 | 324 | def end_callback(element): 325 | pass 326 | 327 | body = tree.document.query_selector("body") 328 | traverse_dom(body, start_callback, end_callback) 329 | 330 | # Find all .texerror 331 | texerrors = tree.document.query_selector_all('.texerror') 332 | for texerror in texerrors: 333 | 334 | # Find the text between {} (maximum length) and replace the texerror with that text 335 | match = re.search(r'\{(.{1,})\}', texerror.text) 336 | if match: 337 | info['found_math'] = True 338 | info['texerror'] += 1 339 | wrapped_match = wrap_math(match.group(1)) 340 | texerror.html = replacement_manager.add_replacement(wrapped_match, tag='math') 341 | 342 | # This has a ton of repeated code, but it's nice to have fine control over 343 | # how each source is handled. 344 | imgs = tree.document.query_selector_all('img') 345 | for img in imgs: 346 | 347 | class_attr = img.getattr('class') 348 | if class_attr is not None: 349 | class_list = class_attr.split(' ') 350 | if any([img_class in class_list for img_class in latex_image_class_names]): 351 | alt = img.getattr('alt') 352 | if alt is None: 353 | continue 354 | new_span = tree.create_element('span') 355 | wrapped_alt = wrap_math(alt) 356 | new_span.html = replacement_manager.add_replacement(wrapped_alt, tag='math') 357 | parent = img.parent 358 | parent.replace_child(new_span, img) 359 | if len(wrapped_alt.strip()) > 0: 360 | info['found_math'] = True 361 | info['img_math'] += 1 362 | 363 | src = img.getattr('src') 364 | if src is None: 365 | continue 366 | if 'codecogs.com' in src: 367 | try: 368 | latex = src.split('?')[1:] 369 | latex = '?'.join(latex) # In case there are multiple ? 
in the latex 370 | latex = unquote(latex) 371 | new_span = tree.create_element('span') 372 | wrapped_latex = wrap_math(latex) 373 | new_span.html = replacement_manager.add_replacement(wrapped_latex, tag='math') 374 | parent = img.parent 375 | parent.replace_child(new_span, img) 376 | if len(wrapped_latex.strip()) > 0: 377 | info['found_math'] = True 378 | info['codecogs_latex'] += 1 379 | except: 380 | pass 381 | if 'latex.php' in src: 382 | try: 383 | # they usually have "alt='-i u_t + \Delta u = |u|^2 u'" 384 | alt = img.getattr('alt') 385 | if alt is None: 386 | continue 387 | # Unescape the latex 388 | alt = unquote(alt) 389 | # Get the latex 390 | wrapped_alt = wrap_math(alt) 391 | new_span = tree.create_element('span') 392 | new_span.html = replacement_manager.add_replacement(wrapped_alt, tag='math') 393 | parent = img.parent 394 | parent.replace_child(new_span, img) 395 | if len(wrapped_alt.strip()) > 0: 396 | info['found_math'] = True 397 | info['wp_latex'] += 1 398 | except: 399 | pass 400 | if '/images/math/codecogs' in src: 401 | try: 402 | # they usually have "alt='-i u_t + \Delta u = |u|^2 u'" 403 | alt = img.getattr('alt') 404 | if alt is None: 405 | continue 406 | # Unescape the latex 407 | alt = unquote(alt) 408 | # Get the latex 409 | wrapped_alt = wrap_math(alt) 410 | new_span = tree.create_element('span') 411 | new_span.html = replacement_manager.add_replacement(wrapped_alt, tag='math') 412 | parent = img.parent 413 | parent.replace_child(new_span, img) 414 | if len(wrapped_alt.strip()) > 0: 415 | info['found_math'] = True 416 | info['/images/math/codecogs'] += 1 417 | except: 418 | pass 419 | if 'mimetex.cgi' in src: 420 | try: 421 | latex = src.split('?')[1:] 422 | latex = '?'.join(latex) # In case there are multiple ? 
in the latex 423 | latex = unquote(latex) 424 | new_span = tree.create_element('span') 425 | wrapped_latex = wrap_math(latex) 426 | new_span.html = replacement_manager.add_replacement(wrapped_latex, tag='math') 427 | parent = img.parent 428 | parent.replace_child(new_span, img) 429 | if len(wrapped_latex.strip()) > 0: 430 | info['found_math'] = True 431 | info['mimetex.cgi'] += 1 432 | except: 433 | pass 434 | if 'mathtex.cgi' in src: 435 | try: 436 | latex = src.split('?')[1:] 437 | latex = '?'.join(latex) # In case there are multiple ? in the latex 438 | latex = unquote(latex) 439 | new_span = tree.create_element('span') 440 | wrapped_latex = wrap_math(latex) 441 | new_span.html = replacement_manager.add_replacement(wrapped_latex, tag='math') 442 | parent = img.parent 443 | parent.replace_child(new_span, img) 444 | if len(wrapped_latex.strip()) > 0: 445 | info['found_math'] = True 446 | info['mathtex.cgi'] += 1 447 | except: 448 | pass 449 | class_attr = img.getattr('class') 450 | if class_attr is not None: 451 | if 'x-ck12' in class_attr: 452 | try: 453 | latex = img.getattr('alt') 454 | latex = unquote(latex) 455 | new_span = tree.create_element('span') 456 | wrapped_latex = wrap_math(latex) 457 | new_span.html = replacement_manager.add_replacement(wrapped_latex, tag='math') 458 | parent = img.parent 459 | parent.replace_child(new_span, img) 460 | if len(wrapped_latex.strip()) > 0: 461 | info['found_math'] = True 462 | info['x-ck12'] += 1 463 | except: 464 | pass 465 | 466 | 467 | # Find any blocks with class math-container and replace them with spans 468 | math_containers = tree.document.query_selector_all('.math-container') 469 | for math_container in math_containers: 470 | text = math_container.text 471 | new_span = tree.create_element('span') 472 | wrapped_math = wrap_math(text, display=True) 473 | new_span.html = replacement_manager.add_replacement(wrapped_math, tag='math') 474 | parent = math_container.parent 475 | parent.replace_child(new_span, 
math_container) 476 | if len(wrapped_math.strip()) > 0: 477 | info['found_math'] = True 478 | info['math-container'] += 1 479 | 480 | katex_inline_wp = tree.document.query_selector_all('.wp-katex-eq') 481 | for katex in katex_inline_wp: 482 | text = katex.text 483 | new_span = tree.create_element('span') 484 | display_attr = katex.getattr('data-display') 485 | if display_attr is not None: 486 | display = display_attr == 'true' 487 | else: 488 | display = False 489 | wrapped_math = wrap_math(text, display=display) 490 | new_span.html = replacement_manager.add_replacement(wrapped_math, tag='math') 491 | parent = katex.parent 492 | parent.replace_child(new_span, katex) 493 | if len(wrapped_math.strip()) > 0: 494 | info['found_math'] = True 495 | info['wp-katex-eq'] += 1 496 | 497 | # Find all script[type="math/tex"] tags and replace them with spans 498 | latex_script_tags = tree.document.query_selector_all( 499 | 'script[type="math/tex"]') 500 | for script_tag in latex_script_tags: 501 | text = script_tag.text 502 | new_span = tree.create_element('span') 503 | wrapped_text = wrap_math(text) 504 | new_span.html = replacement_manager.add_replacement(wrapped_text, tag='math') 505 | parent = script_tag.parent 506 | parent.replace_child(new_span, script_tag) 507 | if len(wrapped_text.strip()) > 0: 508 | info['found_math'] = True 509 | info['script_math_tex'] += 1 510 | 511 | asciimath_script_tags = tree.document.query_selector_all( 512 | 'script[type="math/asciimath"]') 513 | for script_tag in asciimath_script_tags: 514 | try: 515 | text = script_tag.text 516 | new_span = tree.create_element('span') 517 | wrapped_asciimath = wrap_math(extract_asciimath(text)) 518 | new_span.html = replacement_manager.add_replacement(wrapped_asciimath, tag='math') 519 | parent = script_tag.parent 520 | parent.replace_child(new_span, script_tag) 521 | if len(wrapped_asciimath.strip()) > 0: 522 | info['found_math'] = True 523 | info['script_math_asciimath'] += 1 524 | except: 525 | # Delete 
this script tag 526 | parent = script_tag.parent 527 | parent.remove_child(script_tag) 528 | 529 | # For katex, find all elements with class = tex 530 | katex_spans = tree.document.query_selector_all('.tex') 531 | for katex_span in katex_spans: 532 | try: 533 | # Check if they have data-expr attr 534 | expr = katex_span.getattr('data-expr') 535 | if expr is None: 536 | continue 537 | # Replace with a span 538 | new_span = tree.create_element('span') 539 | wrapped_expr = wrap_math(expr) 540 | new_span.html = replacement_manager.add_replacement(wrapped_expr, tag='math') 541 | parent = katex_span.parent 542 | parent.replace_child(new_span, katex_span) 543 | if len(wrapped_expr.strip()) > 0: 544 | info['found_math'] = True 545 | info['katex'] += 1 546 | except: 547 | pass 548 | 549 | # Find any spans with class "katex" 550 | katex_spans = tree.document.query_selector_all('span.katex') 551 | for katex_span in katex_spans: 552 | # Find any spans with class "katex-html" and remove them 553 | katex_html_spans = katex_span.query_selector_all('span.katex-html') 554 | for katex_html_span in katex_html_spans: 555 | parent = katex_html_span.parent 556 | parent.remove_child(katex_html_span) 557 | 558 | # Remove any .MathJax_Preview spans 559 | mathjax_preview_spans = tree.document.query_selector_all( 560 | 'span.MathJax_Preview') 561 | for mathjax_preview_span in mathjax_preview_spans: 562 | parent = mathjax_preview_span.parent 563 | parent.remove_child(mathjax_preview_span) 564 | 565 | # Find any math tags 566 | math_tags = tree.document.query_selector_all('math') 567 | # For each math tag, see if there is an annotation tag with 568 | # encoding="application/x-tex" inside it 569 | for math_tag in math_tags: 570 | annotation_tag = math_tag.query_selector( 571 | 'annotation[encoding="application/x-tex"]') 572 | if annotation_tag is not None: 573 | # Get the text content of the annotation tag 574 | text = annotation_tag.text 575 | # Set the content of the math tag to the text 576 
| # replace this math tag with a span tag with the text 577 | # To do this, we need to get the parent of the math tag 578 | parent = math_tag.parent 579 | # Then, we need to create a new span tag 580 | new_span = tree.create_element('span') 581 | # Set the html of the new span tag to the text 582 | wrapped_text = wrap_math(text) 583 | new_span.html = replacement_manager.add_replacement(wrapped_text, tag='math') 584 | # Then, we need to replace the math tag with the new span tag 585 | parent.replace_child(new_span, math_tag) 586 | # If the parent has style="display:none", then we need to 587 | # remove the style attribute 588 | style_value = parent.getattr('style') 589 | if style_value is not None: 590 | normalized_style_value = style_value.lower( 591 | ).strip().replace(' ', '').replace(';', '') 592 | if 'display:none' in normalized_style_value: 593 | parent.delattr('style') 594 | if len(wrapped_text.strip()) > 0: 595 | info['found_math'] = True 596 | info['math_annotations'] += 1 597 | # Check if the math tag has an alttext attribute 598 | elif math_tag.getattr('alttext') is not None: 599 | # Get the alttext attribute 600 | alttext = math_tag.getattr('alttext') 601 | new_span = tree.create_element('span') 602 | # Set the html of the new span tag to the text 603 | wrapped_alttext = wrap_math(alttext) 604 | new_span.html = replacement_manager.add_replacement(wrapped_alttext, tag='math') 605 | # Then, we need to replace the math tag with the new span tag 606 | parent = math_tag.parent 607 | parent.replace_child(new_span, math_tag) 608 | if len(wrapped_alttext.strip()) > 0: 609 | info['found_math'] = True 610 | info['math_alttext'] += 1 611 | # Otherwise, translate the math tag to LaTeX 612 | else: 613 | try: 614 | # Try translating to LaTeX 615 | mathml = math_tag.html 616 | # If this includes xmlns:mml, then we need to replace all 617 | # instances of mml: with nothing 618 | if 'xmlns:mml' in mathml: 619 | mathml = mathml.replace('mml:', '') 620 | # replace 
xmlns:mml="..." with nothing 621 | mathml = re.sub(r'xmlns:mml=".*?"', '', mathml) 622 | latex = mml_to_latex(mathml) 623 | # Make a new span tag 624 | new_span = tree.create_element('span') 625 | # Set the html of the new span tag to the text 626 | wrapped_latex = wrap_math(latex) 627 | new_span.html = replacement_manager.add_replacement(wrapped_latex, tag='math') 628 | # Then, we need to replace the math tag with the new span tag 629 | parent = math_tag.parent 630 | parent.replace_child(new_span, math_tag) 631 | if len(wrapped_latex.strip()) > 0: 632 | info['found_math'] = True 633 | info['mathml'] += 1 634 | except Exception as e: 635 | parent = math_tag.parent 636 | parent.remove_child(math_tag) 637 | 638 | mathjax_tags = tree.document.query_selector_all('mathjax') 639 | for mathjax_tag in mathjax_tags: 640 | # Get the inner text of the mathjax tag 641 | text = mathjax_tag.text 642 | text = html.unescape(text) 643 | # Use regex to find text wrapped in hashes 644 | matches = re.findall(r'#(.+?)#', text) 645 | # For each match, replace the match with the LaTeX 646 | for match in matches: 647 | try: 648 | latex = extract_asciimath(match) 649 | # Replace the match with the LaTeX 650 | text = text.replace(f'#{match}#', latex) 651 | except Exception as e: 652 | pass 653 | 654 | # Create a new span tag 655 | new_span = tree.create_element('span') 656 | # Set the html of the new span tag to the text 657 | new_span.html = replacement_manager.add_replacement(text, tag='math') 658 | # Then, we need to replace the mathjax tag with the new span tag 659 | parent = mathjax_tag.parent 660 | parent.replace_child(new_span, mathjax_tag) 661 | if len(text.strip()) > 0: 662 | info['found_math'] = True 663 | info['mathjax_tag'] += 1 664 | 665 | return tree, info 666 | 667 | def remove_color(text): 668 | return re.sub(color_regex, '', text) 669 | -------------------------------------------------------------------------------- /text_extraction/text_extract/line_processing.py: 
-------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | 4 | edit_regex = r"\[(e|E)dit\]" 5 | 6 | boilerplate_words_path = os.path.join(os.path.dirname(__file__), "boilerplate_words.txt") 7 | with open(boilerplate_words_path, "r") as f: 8 | boilerplate_words = {} 9 | for line in f: 10 | words = line.replace('\n', '') 11 | n_words = len(words.split()) 12 | boilerplate_words[words] = n_words 13 | 14 | def remove_empty_headers(lines, replacement_manager): 15 | output_lines = [] 16 | is_heading = [0] * len(lines) 17 | for k in range(1,7): 18 | for i in range(len(lines)): 19 | if replacement_manager.has_tag(lines[i], tag='h' + str(k)): 20 | is_heading[i] = k 21 | for i in range(len(lines)): 22 | # Check if this line is a heading 23 | if is_heading[i] != 0: 24 | remove = False 25 | # Go through the next lines until we find a line that is not a heading 26 | j = i + 1 27 | while j < len(lines): 28 | if is_heading[j] == 0 and len(lines[j]) > 16: 29 | break 30 | elif is_heading[j] != 0 and is_heading[j] <= is_heading[i]: 31 | remove = True 32 | break 33 | j += 1 34 | # If we found a line that is not a heading, then we have a section 35 | if j < len(lines) and not remove: 36 | output_lines.append(lines[i]) 37 | else: 38 | output_lines.append(lines[i]) 39 | # If there is at least one non-heading line, then we have a section 40 | 41 | return output_lines 42 | 43 | def remove_edit_buttons(lines): 44 | output_lines = [] 45 | for line in lines: 46 | if re.search(edit_regex, line): 47 | output_lines.append(re.sub(edit_regex, "", line)) 48 | else: 49 | output_lines.append(line) 50 | return output_lines 51 | 52 | def remove_chinese_characters(lines): 53 | output_lines = [] 54 | for line in lines: 55 | if re.match(u'[\u4e00-\u9fff]', line): 56 | output_lines.append("") 57 | else: 58 | output_lines.append(line) 59 | return output_lines 60 | 61 | def remove_boilerplate(lines, boilerplate_config, replacement_manager): 62 | 
output_lines = [] 63 | maths = [replacement_manager.has_tag(line, tag='math') for line in lines] 64 | codes = [replacement_manager.has_tag(line, tag='code') for line in lines] 65 | for i in range(len(lines)): 66 | lowered = lines[i].lower() 67 | without_tags = replacement_manager.remove_tags(lowered) 68 | s = sum([without_tags.count(word) * boilerplate_words[word] for word in boilerplate_words]) 69 | # Compute the ratio of boilerplate words over the length of the line, and remove the line if this ratio is larger than the threshold 70 | ratio = s / (len(without_tags.split()) + 0.001) 71 | if (ratio > boilerplate_config['ratio_threshold'] or \ 72 | s > boilerplate_config['absolute_threshold']) and \ 73 | not maths[i] and not codes[i]: 74 | if len(lines) - i < boilerplate_config['end_threshold']: 75 | for j in range(i, len(lines)): 76 | if maths[j] or codes[j]: 77 | output_lines.append(lines[j]) 78 | break 79 | else: 80 | output_lines.append(lines[i]) 81 | return output_lines -------------------------------------------------------------------------------- /text_extraction/text_extract/mmltex/README: -------------------------------------------------------------------------------- 1 | README for the XSLT MathML Library 2 | 3 | XSLT MathML Library is a set of XSLT stylesheets to transform 4 | MathML 2.0 to LaTeX. 5 | 6 | For more information, see 7 | http://www.raleigh.ru/MathML/mmltex/index.php?lang=en 8 | 9 | Manifest 10 | -------- 11 | 12 | README this file 13 | mmltex.xsl 14 | tokens.xsl 15 | glayout.xsl 16 | scripts.xsl 17 | tables.xsl 18 | entities.xsl 19 | cmarkup.xsl 20 | 21 | Use 22 | --- 23 | 24 | There are two ways of using the library: 25 | 26 | * Use a local copy of the library. 27 | 28 | 1. Download the distribution (see below). 29 | 30 | 2. Unpack the distribution, using unzip. 31 | 32 | 3. In your stylesheet import or include either the main 33 | stylesheet, mmltex.xsl, or the stylesheet module you 34 | wish to use, such as tokens.xsl. 
This example assumes 35 | that the distribution has been extracted into the same 36 | directory as your own stylesheet: 37 | 38 | 39 | 40 | * Import or include either the main stylesheet, or the 41 | stylesheet module you wish to use, directly from the library 42 | website; http://www.raleigh.ru/MathML/mmltex/. For example: 43 | 44 | 45 | 46 | Obtaining The Library 47 | --------------------- 48 | 49 | The XSLT MathML Library is available for download as: 50 | 51 | * Zip file: http://www.raleigh.ru/MathML/mmltex/mmltex.zip 52 | 53 | Copyright 54 | --------- 55 | 56 | Copyright (C) 2001, 2002 Vasil Yaroshevich 57 | 58 | Permission is hereby granted, free of charge, to any person 59 | obtaining a copy of this software and associated documentation 60 | files (the ``Software''), to deal in the Software without 61 | restriction, including without limitation the rights to use, 62 | copy, modify, merge, publish, distribute, sublicense, and/or 63 | sell copies of the Software, and to permit persons to whom the 64 | Software is furnished to do so, subject to the following 65 | conditions: 66 | 67 | The above copyright notice and this permission notice shall be 68 | included in all copies or substantial portions of the Software. 69 | 70 | Except as contained in this notice, the names of individuals 71 | credited with contribution to this software shall not be used in 72 | advertising or otherwise to promote the sale, use or other 73 | dealings in this Software without prior written authorization 74 | from the individuals in question. 75 | 76 | Any stylesheet derived from this Software that is publically 77 | distributed will be identified with a different name and the 78 | version strings in any derived Software will be changed so that 79 | no possibility of confusion between the derived package and this 80 | Software will exist. 
81 | 82 | Warranty 83 | -------- 84 | 85 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 86 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 87 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 88 | NONINFRINGEMENT. IN NO EVENT SHALL NORMAN WALSH OR ANY OTHER 89 | CONTRIBUTOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 90 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 91 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 92 | OTHER DEALINGS IN THE SOFTWARE. 93 | 94 | Contacting the Author 95 | --------------------- 96 | 97 | These stylesheets are maintained by Vasil Yaroshevich, . 98 | -------------------------------------------------------------------------------- /text_extraction/text_extract/mmltex/cmarkup.xsl: -------------------------------------------------------------------------------- 1 | 2 | 6 | 7 | 8 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | + 20 | 21 | i 22 | 23 | 24 | 25 | 26 | / 27 | 28 | 29 | 30 | 31 | 32 | _{} 33 | 34 | 35 | 36 | 37 | e^{i 38 | 39 | } 40 | 41 | 42 | 43 | 44 | E 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | \mathrm{} 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | ( 66 | 67 | 68 | , 69 | 70 | ) 71 | 72 | 73 | 74 | 75 | () 76 | 77 | 78 | 79 | 80 | 81 | 82 | \left( 83 | 84 | \left[ 85 | 86 | 87 | , 88 | 89 | 90 | 91 | \right) 92 | 93 | \right] 94 | 95 | 96 | 97 | 98 | \left\{\right\} 99 | 100 | 101 | 102 | 103 | ^{(-1)} 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | \mathrm{lambda}\: 112 | 113 | .\: 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | \circ 124 | 125 | 126 | 127 | 128 | \mathrm{id} 129 | 130 | 132 | 134 | \mathop{\mathrm{ 135 | 136 | }} 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | \begin{cases} 145 | 146 | 147 | \end{cases} 148 | 149 | 150 | 151 | 152 | & \text{if $ 153 | 154 | $} 155 | \\ 156 | 157 | 158 | 159 | 160 | & \text{otherwise} 161 | 162 | 163 | 164 | 165 | \left\lfloor\frac{ 166 | 167 | }{ 168 | 169 | 
}\right\rfloor 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | ! 178 | 179 | 180 | 181 | 182 | 183 | 184 | \left( 185 | \frac{ 186 | 187 | 189 | }{ 190 | 191 | 193 | } 194 | \right) 195 | 196 | 197 | 198 | 199 | \ 200 | 201 | \{ 202 | 203 | 204 | 205 | , 206 | 207 | 208 | 209 | 210 | 211 | , 212 | 213 | 214 | 215 | \} 216 | 217 | 218 | 219 | 220 | - 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | - 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | ( 240 | 241 | 242 | 243 | 244 | - 247 | + 248 | 249 | 250 | 251 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | ) 276 | 277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | ^{ 285 | 286 | 287 | 288 | } 289 | 290 | 291 | 292 | 293 | 294 | 295 | \mod 296 | 297 | 298 | 299 | 300 | 301 | 302 | 303 | 304 | 305 | ( 306 | 307 | 308 | 309 | \times 310 | 311 | 312 | 313 | 314 | 315 | 316 | 317 | 318 | 319 | ) 320 | 321 | 322 | 323 | 324 | \sqrt 325 | 326 | [ 327 | 328 | ] 329 | 330 | { 331 | 332 | } 333 | 334 | 335 | 336 | \gcd 337 | 338 | 339 | 340 | 341 | 342 | 343 | 344 | \land 345 | 346 | 347 | 348 | 349 | 350 | 351 | 352 | 353 | 354 | \lor 355 | 356 | 357 | 358 | 359 | 360 | 361 | 362 | 363 | 364 | \mathop{\mathrm{xor}} 365 | 366 | 367 | 368 | 369 | 370 | \neg 371 | 372 | 373 | 374 | 375 | 376 | 377 | 378 | 379 | 380 | \implies 381 | 382 | 383 | 384 | 385 | 386 | 387 | 388 | \ 389 | 390 | 391 | 392 | 393 | , 394 | 395 | 396 | \colon 397 | 398 | 399 | 400 | 401 | 402 | 403 | \left| 404 | 405 | \right| 406 | 407 | 408 | 409 | 410 | \overline{} 411 | 412 | 413 | 414 | \Re 415 | 416 | 417 | \Im 418 | 419 | 420 | 421 | \lfloor 422 | 423 | \rfloor 424 | 425 | 426 | 427 | 428 | \lceil 429 | 430 | \rceil 431 | 432 | 433 | 434 | 435 | 436 | 437 | 438 | 439 | = 440 | 441 | 442 | 443 | 444 | 445 | 446 | 447 | 448 | 449 | \neq 450 | 451 | 452 | 453 | 454 | 455 | 456 | 457 | 458 | 459 | > 460 | 461 | 462 | 463 | 464 | 465 | 466 | 467 | 468 | 
469 | < 470 | 471 | 472 | 473 | 474 | 475 | 476 | 477 | 478 | 479 | \ge 480 | 481 | 482 | 483 | 484 | 485 | 486 | 487 | 488 | 489 | \le 490 | 491 | 492 | 493 | 494 | 495 | 496 | 497 | 498 | 499 | \equiv 500 | 501 | 502 | 503 | 504 | 505 | 506 | 507 | 508 | 509 | \approx 510 | 511 | 512 | 513 | 514 | 515 | 516 | 517 | | 518 | 519 | 520 | 521 | 522 | 523 | 524 | 525 | \int 526 | 527 | _{ 528 | 529 | } 530 | 531 | 532 | ^{ 533 | 534 | } 535 | 536 | 537 | 538 | \,d 539 | 540 | 541 | 542 | 543 | 544 | 545 | ^\prime 546 | 547 | 548 | 549 | \frac{ 550 | 551 | 552 | d^{ 553 | 554 | } 555 | 556 | }{d 557 | 558 | ^{ 559 | 560 | } 561 | 562 | 563 | d 564 | 565 | }{d 566 | 567 | } 568 | 569 | 570 | } 571 | 572 | 573 | 574 | 575 | D_{ 576 | 577 | 578 | , 579 | 580 | } 581 | 582 | 583 | 584 | 585 | \frac{\partial^{ 586 | 587 | 588 | 589 | 590 | 591 | 592 | 593 | + 594 | 595 | 596 | + 597 | 598 | 599 | 600 | 601 | 602 | 603 | 604 | } 605 | 606 | }{ 607 | 608 | \partial 609 | 610 | 611 | ^{ 612 | 613 | } 614 | 615 | 616 | } 617 | 618 | 619 | 620 | 621 | 622 | 623 | 624 | 625 | , 626 | 627 | 628 | 629 | \mathop{\mathrm{div}} 630 | 631 | 632 | \nabla^2 633 | 634 | 635 | 636 | \{\} 637 | 638 | 639 | 640 | 641 | \left[\right] 642 | 643 | 644 | 645 | 646 | 647 | 648 | \colon 649 | 650 | 651 | 652 | 653 | 654 | , 655 | 656 | 657 | 658 | 659 | 660 | 661 | 662 | 663 | 664 | 665 | 666 | \cup 667 | 668 | 669 | 670 | 671 | 672 | 673 | 674 | 675 | 676 | \cap 677 | 678 | 679 | 680 | 681 | 682 | 683 | 684 | \in 685 | 686 | 687 | 688 | 689 | 690 | 691 | 692 | 693 | 694 | \notin 695 | 696 | 697 | 698 | 699 | 700 | 701 | 702 | 703 | 704 | 705 | 706 | \subseteq 707 | 708 | 709 | 710 | 711 | 712 | 713 | 714 | 715 | 716 | \subset 717 | 718 | 719 | 720 | 721 | 722 | 723 | 724 | 725 | 726 | \nsubseteq 727 | 728 | 729 | 730 | 731 | 732 | 733 | 734 | 735 | 736 | \not\subset 737 | 738 | 739 | 740 | 741 | 742 | 743 | 744 | 745 | 746 | \setminus 747 | 748 | 749 | 750 | 751 | 752 | | 753 | 754 | | 755 | 756 
| 757 | 758 | 759 | 760 | 761 | 762 | 763 | \times 764 | 765 | 766 | 767 | 770 | 771 | 772 | 773 | ^{ 774 | 775 | } 776 | 777 | 778 | 779 | 780 | \sum 781 | 782 | 783 | 784 | 785 | \prod 786 | 787 | 788 | 789 | 790 | _{ 791 | 792 | 793 | = 794 | 795 | 796 | } 797 | 798 | 799 | ^{ 800 | 801 | } 802 | 803 | 804 | 805 | 806 | 807 | 808 | 809 | \lim_{ 810 | 811 | } 812 | 813 | 814 | 815 | 816 | 817 | \to 818 | 819 | 820 | 821 | 822 | 823 | 824 | 825 | 826 | 827 | 828 | 829 | \searrow 830 | \nearrow 831 | \rightarrow 832 | \to 833 | 834 | 835 | 836 | 837 | 838 | 839 | 844 | \ 845 | 846 | 847 | 848 | 849 | 850 | 851 | 852 | 855 | \ 856 | 857 | 858 | 859 | 860 | 865 | \mathrm{ 866 | 867 | \,} 868 | 869 | 870 | 871 | 872 | 873 | 876 | \mathrm{ 877 | 878 | } 879 | 880 | 881 | 882 | 883 | e^{} 884 | 885 | 886 | 887 | 888 | \lg 889 | 890 | 891 | 892 | 893 | 894 | 895 | \log_{ 896 | 897 | } 898 | 899 | 900 | 901 | 902 | 903 | 904 | 905 | \langle 906 | 907 | 908 | , 909 | 910 | \rangle 911 | 912 | 913 | 914 | \sigma 915 | 916 | 917 | 918 | \sigma( 919 | 920 | )^2 921 | 922 | 923 | 924 | 925 | \langle 926 | 927 | ^{ 928 | 929 | }\rangle 930 | 931 | _{ 932 | 933 | } 934 | 935 | 936 | 937 | 938 | 939 | 940 | \left(\begin{array}{c} 941 | 942 | 943 | \\ 944 | 945 | \end{array}\right) 946 | 947 | 948 | 949 | 950 | \begin{pmatrix} 951 | 952 | \end{pmatrix} 953 | 954 | 955 | 956 | 957 | 958 | 959 | & 960 | 961 | \\ 962 | 963 | 964 | 965 | 966 | \det 967 | 968 | 969 | 970 | 971 | 972 | 973 | \begin{vmatrix} 974 | 975 | \end{vmatrix} 976 | 977 | 978 | 979 | 980 | 981 | 982 | 983 | ^T 984 | 985 | 986 | 987 | 988 | 989 | 990 | 991 | _{ 992 | 993 | 994 | , 995 | 996 | } 997 | 998 | 999 | 1000 | 1001 | 1002 | 1003 | 1004 | 1005 | \dot 1006 | 1007 | 1008 | 1009 | 1010 | 1011 | 1012 | 1013 | 1014 | 1015 | 1016 | 1017 | \mathbb{Z} 1018 | 1019 | 1020 | \mathbb{R} 1021 | 1022 | 1023 | \mathbb{Q} 1024 | 1025 | 1026 | \mathbb{N} 1027 | 1028 | 1029 | \mathbb{C} 1030 | 1031 | 1032 | \mathbb{P} 1033 | 
1034 | 1035 | e 1036 | 1037 | 1038 | i 1039 | 1040 | 1041 | NaN 1042 | 1043 | 1044 | \mbox{true} 1045 | 1046 | 1047 | \mbox{false} 1048 | 1049 | 1050 | \emptyset 1051 | 1052 | 1053 | \pi 1054 | 1055 | 1056 | \gamma 1057 | 1058 | 1059 | \infty 1060 | 1061 | 1062 | 1063 | 1064 | 1065 | 1066 | ( 1067 | 1068 | 1069 | 1070 | 1071 | 1072 | 1073 | 1074 | 1075 | ) 1076 | 1077 | 1078 | 1079 | 1080 | 1081 | 1082 | ( 1083 | 1084 | 1085 | 1086 | 1087 | 1088 | 1089 | 1090 | ) 1091 | 1092 | 1093 | -------------------------------------------------------------------------------- /text_extraction/text_extract/mmltex/glayout.xsl: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | 7 | 11 | 12 | 13 | 14 | 15 | 16 | 21 | 22 | 23 | \genfrac{}{}{ 24 | 25 | 26 | 27 | ex 28 | 29 | 30 | .05ex 31 | 32 | 33 | 34 | .2ex 35 | 36 | 37 | 38 | 39 | 40 | }{}{ 41 | 42 | 43 | \frac{ 44 | 45 | 46 | 47 | \hfill 48 | 49 | 50 | 51 | \hfill 52 | 53 | }{ 54 | 55 | \hfill 56 | 57 | 58 | 59 | \hfill 60 | 61 | } 62 | 63 | 64 | 65 | 66 | 67 | \sqrt[ 68 | 69 | ]{ 70 | 71 | } 72 | 73 | 74 | 75 | exception 25: 76 | \text{exception 25:} 77 | 78 | 79 | 80 | 81 | 82 | \sqrt{ 83 | 84 | } 85 | 86 | 87 | 88 | 89 | 90 | 91 | \left 92 | 93 | 94 | \ 95 | 96 | 97 | 98 | \left( 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | , 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | \right 134 | 135 | 136 | \ 137 | 138 | 139 | 140 | \right) 141 | 142 | 143 | 144 | 145 | \phantom{ 146 | 147 | } 148 | 149 | 150 | 151 | 152 | 153 | \overline{ 154 | 155 | \hspace{.2em}|} 156 | 157 | 158 | \sqrt{ 159 | 160 | } 161 | 162 | 163 | \overline{) 164 | 165 | } 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | \colorbox[rgb]{ 177 | 178 | 179 | 180 | }{$ 181 | 182 | 183 | \textcolor[rgb]{ 184 | 185 | 186 | 187 | }{ 188 | 189 | 190 | 191 | } 192 | 193 | 194 | $} 195 | 196 | 
197 | 215 | 216 | 217 | 218 | 219 | 220 | -------------------------------------------------------------------------------- /text_extraction/text_extract/mmltex/mmltex.xsl: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | 7 | 8 | 9 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | $ 41 | 42 | $ 43 | 44 | 45 | -------------------------------------------------------------------------------- /text_extraction/text_extract/mmltex/scripts.xsl: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | 7 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | \overline{ 33 | 34 | 35 | 36 | 37 | } 38 | 39 | 40 | \overbrace{ 41 | 42 | 43 | 44 | 45 | } 46 | 47 | 48 | \underline{ 49 | 50 | 51 | 52 | 53 | 54 | } 55 | 56 | 57 | \underbrace{ 58 | 59 | 60 | 61 | 62 | 63 | } 64 | 65 | 67 | 75 | 76 | _{ 77 | 78 | }^{ 79 | 80 | } 81 | 82 | 83 | \underset{ 84 | 85 | }{\overset{ 86 | 87 | }{ 88 | 89 | }} 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | \overline{ 131 | 132 | } 133 | 134 | 135 | \overbrace{ 136 | 137 | } 138 | 139 | 141 | 149 | 150 | ^{ 151 | 152 | } 153 | 154 | 155 | \stackrel{ 156 | 157 | }{ 158 | 159 | } 160 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | \underline{ 176 | 177 | } 178 | 179 | 180 | \underbrace{ 181 | 182 | } 183 | 184 | 186 | 194 | 195 | _{ 196 | 197 | } 198 | 199 | 200 | \underset{ 201 | 202 | }{ 203 | 204 | } 205 | 206 | 207 | 208 | 209 | 210 | { 211 | 212 | }_{ 213 | 214 | }^{ 215 | 216 | } 217 | 218 | 219 | 220 | { 221 | 222 | }^{ 223 | 224 | } 225 | 226 | 227 | 228 | { 229 | 230 | }_{ 231 | 232 | } 233 | 234 | 235 | 
236 | 237 | 238 | {}_{ 239 | 240 | } 241 | 242 | 243 | {}^{ 244 | 245 | } 246 | 247 | 248 | 249 | 250 | 251 | {} 252 | 253 | 254 | _{ 255 | 256 | } 257 | 258 | 259 | ^{ 260 | 261 | } 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | {} 276 | 277 | 278 | _{ 279 | 280 | } 281 | 282 | 283 | ^{ 284 | 285 | } 286 | 287 | 288 | 289 | 290 | 291 | 292 | -------------------------------------------------------------------------------- /text_extraction/text_extract/mmltex/tables.xsl: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | 7 | 11 | 12 | 13 | 14 | \multicolumn{ 15 | 16 | }{c}{ 17 | 18 | } 19 | 20 | & 21 | 22 | 23 | 24 | 25 | 26 | 27 | \hfill 28 | 29 | 30 | 31 | \hfill 32 | 33 | 34 | 36 | & 37 | 38 | 39 | 40 | 41 | 42 | 43 | \\ 44 | 45 | 46 | 47 | 48 | \begin{array}{ 49 | 50 | | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | | 85 | 86 | } 87 | 88 | \hline 89 | 90 | 91 | 92 | \\ \hline 93 | 94 | \end{array} 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | -------------------------------------------------------------------------------- /text_extraction/text_extract/mmltex/tokens.xsl: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | 7 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | \mathrm{ 21 | 22 | } 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | \text{ 45 | 46 | } 47 | 48 | 49 | 50 | \phantom{\rule 51 | 52 | [- 53 | 54 | ] 55 | 56 | { 57 | 58 | 0ex 59 | 60 | 61 | }{ 62 | 63 | 0ex 64 | 65 | 66 | }} 67 | 68 | 69 | 70 | 71 | 72 | " 73 | 74 | 75 | " 76 | 77 | 78 | 79 | 80 | 81 | \colorbox[rgb]{ 82 | 
83 | 84 | 85 | }{$ 86 | 87 | 88 | \textcolor[rgb]{ 89 | 90 | 91 | 92 | }{ 93 | 94 | 95 | 96 | 97 | \mathrm{ 98 | 99 | 100 | \mathbf{ 101 | 102 | 103 | \mathit{ 104 | 105 | 106 | \mathbit{ 107 | 108 | 109 | \mathbb{ 110 | 111 | 112 | { 113 | 114 | 115 | \mathcal{ 116 | 117 | 118 | \mathsc{ 119 | 120 | 121 | \mathfrak{ 122 | 123 | 124 | \mathsf{ 125 | 126 | 127 | \mathbsf{ 128 | 129 | 130 | \mathsfit{ 131 | 132 | 133 | \mathbsfit{ 134 | 135 | 136 | \mathtt{ 137 | 138 | 139 | { 140 | 141 | 142 | 143 | 144 | 145 | } 146 | 147 | 148 | } 149 | 150 | 151 | $} 152 | 153 | 154 | 155 | 156 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | , 189 | 190 | 191 | 192 | 193 | 194 | , 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | , 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | , 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 0,1,1 239 | 0,0,0 240 | 0,0,1 241 | 1,0,1 242 | .5,.5,.5 243 | 0,.5,0 244 | 0,1,0 245 | .5,0,0 246 | 0,0,.5 247 | .5,.5,0 248 | .5,0,.5 249 | 1,0,0 250 | .75,.75,.75 251 | 0,.5,.5 252 | 1,1,1 253 | 1,1,0 254 | 255 | Exception at color template 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | 285 | Exception at Hex2Decimal template 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | -------------------------------------------------------------------------------- /text_extraction/text_extract/tree_processing.py: -------------------------------------------------------------------------------- 1 | from text_extract.utils import has_style 2 | from tabulate import tabulate 3 | from resiliparse.parse.html import traverse_dom 4 | from resiliparse.parse.html import 
DOMCollection
import re

# Maps heading tag names to placeholder markers ('h1' -> '[heading_1]', ...).
# post_process_headings() later rewrites these markers as markdown '#' runs.
header_to_format = {
    f'h{i}': f'[heading_{i}]' for i in range(1, 7)
}


def remove_buttons(tree):
    """Remove elements with class 'btn' and all <button> tags from the tree."""
    for btn in tree.document.query_selector_all('.btn'):
        parent = btn.parent
        if parent:  # guard against detached nodes (the original crashed here)
            parent.remove_child(btn)
    # Remove any button tags
    for btn in tree.document.query_selector_all('button'):
        parent = btn.parent
        if parent:
            parent.remove_child(btn)


def remove_links(tree):
    """Replace links with spans so that resiliparse doesn't try to remove them."""
    for link in tree.document.query_selector_all('a'):
        parent = link.parent
        if parent is None:
            continue
        new_span = tree.create_element('span')
        new_span.text = link.text
        parent.replace_child(new_span, link)


def flatten(node):
    """Remove any divs or spans that only have one child and replace them with their child."""
    for wrapper in [*node.query_selector_all('div'), *node.query_selector_all('span')]:
        if len(wrapper.child_nodes) == 1:
            parent = wrapper.parent
            if parent is None:
                continue
            parent.replace_child(wrapper.child_nodes[0], wrapper)
    return node


def remove_dense_links(tree):
    """Remove lists/containers whose text is almost entirely link text."""
    # First, remove any nav elements to be safe.
    for nav in tree.document.query_selector_all('nav'):
        parent = nav.parent
        if parent is None:
            continue
        parent.remove_child(nav)

    to_remove = []
    for container in tree.document.query_selector_all('ul, ol, div, span, nav, table, p'):
        # Empty and single-child containers are handled via their parents.
        if len(container.child_nodes) <= 1:
            continue
        children = container.child_nodes
        links = container.query_selector_all('a')
        total_children_text = ''.join(
            x.text.strip() for x in children if type(x) != DOMCollection)
        total_links_text = ''.join(x.text.strip() for x in links)
        if len(total_children_text) == 0 or len(total_links_text) == 0:
            continue
        # If more than 80% of the text lives inside links, treat the whole
        # container as navigation boilerplate.
        ratio = len(total_links_text) / len(total_children_text)
        if ratio > 0.8:
            if container.parent is None:
                continue
            to_remove.append(container)

    for container in to_remove:
        parent = container.parent
        if parent is None:
            continue
        parent.remove_child(container)


def remove_image_figures(tree):
    """Remove any ancestor element with class 'figure' that wraps an <img>."""
    to_remove = []
    for img in tree.document.query_selector_all('img'):
        cur_node = img
        # Walk up from each <img> looking for an enclosing .figure wrapper.
        while cur_node is not None:
            if cur_node.class_name == 'figure':
                if cur_node.parent:
                    to_remove.append(cur_node)
                break
            cur_node = cur_node.parent

    for node in to_remove:
        parent = node.parent
        if parent is None:
            continue
        parent.remove_child(node)


def remove_link_clusters(tree):
    """Remove link-only boilerplate.

    Two patterns are targeted:
    1. a <span> whose only element child is a link: the span is removed;
    2. a parent all of whose children are visible, non-empty links (and
       there is more than one of them): the parent is removed.
    """
    to_remove = []

    # First, find all links that are in span blocks. If they have no siblings,
    # delete the span.
    for link in tree.document.query_selector_all('span a'):
        parent = link.parent
        if parent is None:
            continue
        # Bug fix: the original loop broke after the first element child, so
        # n_siblings was capped at 1 and any span with at least one element
        # child was treated as "single child" and removed. Count them all.
        n_siblings = sum(1 for sibling in parent.child_nodes if sibling.type == 1)
        if n_siblings == 1:
            if parent.parent is None:
                continue
            to_remove.append(parent)

    for link in tree.document.query_selector_all('a'):
        parent = link.parent
        if parent is None or parent.parent is None:
            continue
        n_children = len(parent.child_nodes)
        child_links = parent.query_selector_all('a')
        if len(child_links) != n_children:
            continue
        n_links = 0
        for child_link in child_links:
            # Check if it's visible and not empty.
            empty = child_link.text is None or child_link.text.strip() == ''
            styles = child_link.getattr('style')
            # Bug fix: has_style(haystack, needles) — the original swapped the
            # arguments, which made has_style() iterate the characters of the
            # style attribute and match almost any styled link as hidden.
            visible = styles is None or not has_style(
                styles, ['display: none', 'visibility: hidden'])
            if visible and not empty:
                n_links += 1
        if n_links > 1 and n_children == n_links:
            to_remove.append(parent)

    for node in to_remove:
        parent = node.parent
        if parent is None:  # already removed via an earlier entry
            continue
        parent.remove_child(node)


def extract_code(tree, replacement_manager):
    """Wrap code-like elements in markdown backticks (``` when multiline)."""
    wp_syntax = tree.document.query_selector_all('.wp_syntax')
    codes = tree.document.query_selector_all('code')
    code_responsive = tree.document.query_selector_all('.code_responsive')
    pre_tags = tree.document.query_selector_all('pre')
    for code in [*wp_syntax, *codes, *code_responsive, *pre_tags]:
        if len(code.text) == 0:
            continue
        multiline = code.text.count('\n') > 0
        if multiline:
            code.text = replacement_manager.add_replacement(f'```{code.text}```', tag='code')
        else:
            code.text = replacement_manager.add_replacement(f'`{code.text}`', tag='code')


def extract_tables(node, replacement_manager, table_config):
    """Render simple <table> elements as text via tabulate.

    table_config keys: 'format' (tabulate table format, or 'none' to skip
    table extraction entirely), 'min_rows' and 'min_cols' (minimum size for
    the configured format; smaller non-empty tables fall back to 'plain').
    Tables that are empty after cleanup are removed from the tree.
    """
    if table_config['format'] == 'none':
        return
    # Don't worry about tables that have tables in them or have headers.
    tables = node.query_selector_all('table:not(:has(table, h1, h2, h3, h4, h5, h6))')
    for table in tables:
        headers = [th.text for th in table.query_selector_all('th')]
        table_data = []
        for tr in table.query_selector_all('tr'):
            row_data = []
            for td in tr.query_selector_all('td'):
                # Remove any scripts before reading the cell text.
                for script in td.query_selector_all('script'):
                    script.parent.remove_child(script)
                row_data.append(td.text)
                col_span = td.getattr('colspan')
                if col_span:
                    try:
                        col_span = int(col_span)
                    except ValueError:
                        continue
                    if col_span > 100:  # ignore absurd colspans
                        continue
                    # Add empty cells for colspans.
                    row_data.extend([''] * (col_span - 1))
            table_data.append(row_data)
        # NOTE: a table whose first row holds no <td> (e.g. header-only first
        # row) is skipped here, matching the original behavior.
        if len(table_data) == 0 or len(table_data[0]) == 0:
            continue
        # Post processing: make sure all rows have the same number of columns.
        max_cols = max(len(row) for row in table_data)
        for row in table_data:
            if len(row) < max_cols:
                row.extend([''] * (max_cols - len(row)))
        # Strip cells and flatten embedded newlines.
        for row in table_data:
            for j in range(len(row)):
                row[j] = row[j].strip().replace('\n', ' ')
        # Drop columns that are empty in every row, then drop empty rows.
        empty_columns = [i for i in range(len(table_data[0]))
                         if all(len(row[i]) == 0 for row in table_data)]
        for i in reversed(empty_columns):
            for row in table_data:
                del row[i]
        table_data = [row for row in table_data if len(row) > 0]

        if len(table_data) == 0 or len(table_data[0]) == 0:
            # Everything was empty after cleanup: remove the table.
            # (Guard fix: the original indexed table_data[0] before checking
            # emptiness when min_rows was satisfied.)
            if table.parent:
                table.parent.remove_child(table)
            continue
        if len(headers) == 0:
            headers = [''] * len(table_data[0])
        # Big enough tables use the configured format; small ones use plain.
        if (len(table_data) >= table_config['min_rows']
                and len(table_data[0]) >= table_config['min_cols']):
            fmt = table_config['format']
        else:
            fmt = 'plain'
        if table.parent:
            rendered_table = tabulate(table_data, tablefmt=fmt, headers=headers)
            table.html = replacement_manager.add_replacement(rendered_table, tag='table')

    return node


def extract_headings(tree, replacement_manager, markdown_formatting):
    """Collapse h1-h6 elements into tagged text placeholders.

    With markdown_formatting=True the text is prefixed with the
    '[heading_i]' marker (turned into '#' by post_process_headings).
    Headings with no visible text are removed from the tree.
    """
    to_remove = []
    for heading_tag in header_to_format:
        for heading in tree.document.query_selector_all(heading_tag):
            text = ""
            for child in heading.child_nodes:
                # Skip whitespace-only children and comment nodes (type 8).
                if child.text.strip() != "" and child.type != 8:
                    text += child.text
                    child.text = ""
            text = text.strip()
            if len(text) == 0:
                # Remove the empty heading.
                if heading.parent:
                    to_remove.append(heading)
                continue
            if markdown_formatting:
                heading.text = replacement_manager.add_replacement(
                    header_to_format[heading_tag] + " " + text + '\n\n',
                    tag=heading_tag)
            else:
                heading.text = replacement_manager.add_replacement(
                    text + '\n\n', tag=heading_tag)

    for heading in to_remove:
        parent = heading.parent
        if parent:
            parent.remove_child(heading)


def post_process_headings(text):
    """Replace [heading_i] with '#' * i"""
    for i in range(6, 0, -1):
        text = text.replace('[heading_%d]' % i, '#' * i)
    return text


def add_se_separators(tree):
    """Replace StackExchange user-info tables (table.fw) with a '-' span."""
    for user_info in tree.document.query_selector_all('table.fw'):
        new_span = tree.create_element('span')
        new_span.text = '-'
        parent = user_info.parent
        if parent is None:  # robustness: the original crashed on detached tables
            continue
        # Remove the table, then append the separator span.
        parent.remove_child(user_info)
        parent.append_child(new_span)


def wikipedia_preprocess(tree):
    """Strip Wikipedia boilerplate: everything from '#External_links' onward
    and the '[edit]' section buttons."""
    external_links = tree.document.query_selector('#External_links')
    if external_links:
        # Remove every sibling after the external-links heading's parent.
        node = external_links.parent.next
        while node:
            next_node = node.next  # renamed: the original shadowed builtin next()
            node.parent.remove_child(node)
            node = next_node
        external_links.parent.remove_child(external_links)

    for edit_button in tree.document.query_selector_all('.mw-editsection'):
        if edit_button.parent:
            edit_button.parent.remove_child(edit_button)


def remove_display_none(tree):
    """Remove all elements whose inline style contains display:none."""
    for element in tree.document.query_selector_all('[style*="display:none"]'):
        if element.parent:  # robustness: skip already-detached elements
            element.parent.remove_child(element)


def preserve_question_headers(tree):
    """Replace a '#question-header' wrapper with a bare <h1> of its title."""
    for element in tree.document.query_selector_all('#question-header'):
        inner_h1 = element.query_selector('h1')
        if inner_h1:
            new_h1 = tree.create_element('h1')
            new_h1.text = inner_h1.text
            element.parent.replace_child(new_h1, element)


def main_content_preprocess(tree):
    """Make any changes that are necessary to maximize the performance
    of the resiliparse main_content=True option."""

    # Look for qa-main class and relabel it as an article body.
    qa_main = tree.document.query_selector('.qa-main')
    if qa_main:
        qa_main.setattr('class', 'article-body')

    # If there is a role=main and a question-header class, add the
    # question-header to the top of the role=main element.
    role_main = tree.document.query_selector('[role="main"]')
    if role_main:
        question_header = tree.document.query_selector('#question-header')
        if question_header:
            first_child = role_main.first_child
            if first_child:
                role_main.insert_before(question_header, first_child)

    post_content = tree.document.query_selector('.postcontent')
    if post_content:
        post_body = tree.document.query_selector('.postbody')
        if post_body:
            # Set the class of postbody to postcontent and remove the
            # postcontent class.
            post_body.setattr('class', 'postcontent')
            post_content.setattr('class', '')

    # Neutralise the .postbit class.
    postbit = tree.document.query_selector('.postbit')
    if postbit:
        postbit.setattr('class', '')

    # Find all ul and add a few wrapping divs to move them farther from the
    # root node.
    for ul in tree.document.query_selector_all('ul'):
        # Create 4 nested divs, copy the ul's html into the innermost one,
        # then replace the ul with the outermost div.
        div1 = tree.create_element('div')
        div2 = tree.create_element('div')
        div3 = tree.create_element('div')
        div4 = tree.create_element('div')
        div4.html = ul.html
        div3.append_child(div4)
        div2.append_child(div3)
        div1.append_child(div2)
        if ul.parent:
            ul.parent.replace_child(div1, ul)

# -------------------------------------------------------------------------
# /text_extraction/text_extract/utils.py
# -------------------------------------------------------------------------

import re
import yaml
import numpy as np


def has_style(style, styles):
    """Does the style string contain any of the styles?

    This function is robust to variations in the spaces between the styles.
    `styles` may be a single string or a list of strings; a bare string
    previously fell through to character-wise iteration, which matched
    almost anything.
    """
    if isinstance(styles, str):  # bug fix: don't iterate a string's characters
        styles = [styles]
    # Remove any spaces so 'display: none' and 'display:none' compare equal.
    style = style.replace(' ', '')
    return any(s.replace(' ', '') in style for s in styles)


def word_wrap(text, char_width=20):
    """Wrap text to a given width, not breaking words.

    Words longer than char_width are hard-split at the width boundary.
    Returns "" for empty/falsy input.
    """
    if not text:
        return ""

    lines = []
    current_line = []

    for word in text.split():
        if len(" ".join(current_line + [word])) <= char_width:
            current_line.append(word)
            continue
        # Flush whatever was accumulated before starting a new line.
        if current_line:
            lines.append(" ".join(current_line))
        current_line = [word]
        # Handle the case when the word is longer than the character width.
        while len(current_line[0]) > char_width:
            lines.append(current_line[0][:char_width])
            current_line[0] = current_line[0][char_width:]

    if current_line:
        lines.append(" ".join(current_line))

    return "\n".join(lines)


class ReplacementManager:
    """This replacement manager simply adds tags next to the instances of the
    text. It contains a method to remove these tags."""

    def __init__(self):
        # Every tag passed to add_replacement(), in call order (may repeat).
        self.tags = []

    def add_replacement(self, text, tag='default'):
        """Prefix `text` with a '§§tag§§' marker and remember the tag."""
        self.tags.append(tag)
        return f'§§{tag}§§' + text

    def remove_tags(self, text):
        """Strip every previously added '§§tag§§' marker from `text`."""
        if not self.tags:
            # Nothing was ever tagged; also avoids building an empty regex.
            return text
        # Deduplicate (add_replacement appends per call, so the alternation
        # could grow without bound) and escape each literal marker.
        tag_regex = "|".join(re.escape(f'§§{tag}§§') for tag in set(self.tags))
        return re.sub(tag_regex, '', text)

    def has_tag(self, text, tag):
        """Return True if `text` contains the marker for `tag`."""
        return f'§§{tag}§§' in text


class Config:
    """A simple config object that loads a config from a YAML file and
    presents as a dictionary"""

    def __init__(self, config_file):
        with open(config_file, 'r') as f:
            self.config = yaml.safe_load(f)

    def sample_from_list(self, options):
        """Sample from a list of (probability, value) tuples."""
        # dtype=float fixes a crash: an integer probability list produced an
        # int array, and in-place '/=' cannot cast int -> float.
        probabilities = np.array([p for p, _ in options], dtype=float)
        values = [v for _, v in options]
        probabilities /= probabilities.sum()  # normalise so they sum to 1
        return np.random.choice(values, p=probabilities)

    def _sample(self, config):
        """Recursively resolve a config mapping.

        Lists are assumed to be (probability, value) pairs and are sampled;
        nested dicts are resolved recursively; scalars pass through.
        """
        sampled_config = {}
        for key, value in config.items():
            if isinstance(value, list):
                sampled_config[key] = self.sample_from_list(value)
            elif isinstance(value, dict):
                sampled_config[key] = self._sample(value)
            else:
                sampled_config[key] = value
        return sampled_config

    def sample(self):
        """Return a fully sampled copy of the loaded config."""
        return self._sample(self.config)