├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md └── workflows │ ├── deploy.yaml │ └── test.yaml ├── .gitignore ├── CODE_OF_CONDUCT.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── index_files └── figure-commonmark │ └── mermaid-figure-1.png ├── nbs ├── 00_core.ipynb ├── 01_filter.ipynb ├── 02_clean.ipynb ├── 03_helpers.ipynb ├── 04_tutorials.ipynb ├── _quarto.yml ├── index.ipynb ├── nbdev.yml ├── sidebar.yml └── styles.css ├── settings.ini ├── setup.py └── squeakily ├── __init__.py ├── _modidx.py ├── clean.py ├── core.py ├── filter.py └── helpers.py /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Browser [e.g. chrome, safari] 29 | - Version [e.g. 22] 30 | 31 | **Smartphone (please complete the following information):** 32 | - Device: [e.g. iPhone6] 33 | - OS: [e.g. iOS8.1] 34 | - Browser [e.g. stock browser, safari] 35 | - Version [e.g. 22] 36 | 37 | **Additional context** 38 | Add any other context about the problem here. 39 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 
21 | -------------------------------------------------------------------------------- /.github/workflows/deploy.yaml: -------------------------------------------------------------------------------- 1 | name: Deploy to GitHub Pages 2 | on: 3 | push: 4 | branches: [ "main", "master" ] 5 | workflow_dispatch: 6 | jobs: 7 | deploy: 8 | runs-on: ubuntu-latest 9 | steps: [uses: fastai/workflows/quarto-ghp@master] 10 | -------------------------------------------------------------------------------- /.github/workflows/test.yaml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: [workflow_dispatch, pull_request, push] 3 | 4 | jobs: 5 | test: 6 | runs-on: ubuntu-latest 7 | steps: [uses: fastai/workflows/nbdev-ci@master] 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.bin 2 | pilev2/ 3 | _docs/ 4 | _proc/ 5 | 6 | *.bak 7 | .gitattributes 8 | .last_checked 9 | .gitconfig 10 | *.bak 11 | *.log 12 | *~ 13 | ~* 14 | _tmp* 15 | tmp* 16 | tags 17 | *.pkg 18 | 19 | # Byte-compiled / optimized / DLL files 20 | __pycache__/ 21 | *.py[cod] 22 | *$py.class 23 | 24 | # C extensions 25 | *.so 26 | 27 | # Distribution / packaging 28 | .Python 29 | env/ 30 | build/ 31 | develop-eggs/ 32 | dist/ 33 | downloads/ 34 | eggs/ 35 | .eggs/ 36 | lib/ 37 | lib64/ 38 | parts/ 39 | sdist/ 40 | var/ 41 | wheels/ 42 | *.egg-info/ 43 | .installed.cfg 44 | *.egg 45 | 46 | # PyInstaller 47 | # Usually these files are written by a python script from a template 48 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 49 | *.manifest 50 | *.spec 51 | 52 | # Installer logs 53 | pip-log.txt 54 | pip-delete-this-directory.txt 55 | 56 | # Unit test / coverage reports 57 | htmlcov/ 58 | .tox/ 59 | .coverage 60 | .coverage.* 61 | .cache 62 | nosetests.xml 63 | coverage.xml 64 | *.cover 65 | .hypothesis/ 66 | 67 | # Translations 68 | *.mo 69 | *.pot 70 | 71 | # Django stuff: 72 | *.log 73 | local_settings.py 74 | 75 | # Flask stuff: 76 | instance/ 77 | .webassets-cache 78 | 79 | # Scrapy stuff: 80 | .scrapy 81 | 82 | # Sphinx documentation 83 | docs/_build/ 84 | 85 | # PyBuilder 86 | target/ 87 | 88 | # Jupyter Notebook 89 | .ipynb_checkpoints 90 | 91 | # pyenv 92 | .python-version 93 | 94 | # celery beat schedule file 95 | celerybeat-schedule 96 | 97 | # SageMath parsed files 98 | *.sage.py 99 | 100 | # dotenv 101 | .env 102 | 103 | # virtualenv 104 | .venv 105 | venv/ 106 | ENV/ 107 | 108 | # Spyder project settings 109 | .spyderproject 110 | .spyproject 111 | 112 | # Rope project settings 113 | .ropeproject 114 | 115 | # mkdocs documentation 116 | /site 117 | 118 | # mypy 119 | .mypy_cache/ 120 | 121 | .vscode 122 | *.swp 123 | 124 | # osx generated files 125 | .DS_Store 126 | .DS_Store? 
127 | .Trashes 128 | ehthumbs.db 129 | Thumbs.db 130 | .idea 131 | 132 | # pytest 133 | .pytest_cache 134 | 135 | # tools/trust-doc-nbs 136 | docs_src/.last_checked 137 | 138 | # symlinks to fastai 139 | docs_src/fastai 140 | tools/fastai 141 | 142 | # link checker 143 | checklink/cookies.txt 144 | 145 | # .gitconfig is now autogenerated 146 | .gitconfig 147 | 148 | # Quarto installer 149 | .deb 150 | .pkg 151 | 152 | # Quarto 153 | .quarto 154 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, religion, or sexual identity 10 | and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the 26 | overall community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or 31 | advances of any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email 35 | address, without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at 63 | louis@stability.ai. 
64 | All complaints will be reviewed and investigated promptly and fairly. 65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. Warning 84 | 85 | **Community Impact**: A violation through a single incident or series 86 | of actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. Violating these terms may lead to a temporary or 93 | permanent ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within 113 | the community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.0, available at 119 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. 120 | 121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct 122 | enforcement ladder](https://github.com/mozilla/diversity). 123 | 124 | [homepage]: https://www.contributor-covenant.org 125 | 126 | For answers to common questions about this code of conduct, see the FAQ at 127 | https://www.contributor-covenant.org/faq. Translations are available at 128 | https://www.contributor-covenant.org/translations. 129 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 
11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2022, CarperAI, EleutherAI, Chenghao Mou, BigCode, BigScience, and Eduardo Gonzalez Ponferrada 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include settings.ini 2 | include LICENSE 3 | include CONTRIBUTING.md 4 | include README.md 5 | recursive-exclude * __pycache__ 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # squeakily 2 | 3 | 4 | 5 | This repository is heavily inspired by BigScience’s [ROOTS 6 | project](https://github.com/bigscience-workshop/data-preparation) and 7 | EleutherAI’s [The Pile](https://github.com/EleutherAI/the-pile). 8 | 9 | The overall pipeline is as follows: 10 | 11 | ``` mermaid 12 | flowchart LR 13 | A(Defining <br/>Datasources) --> B(Defining Filters <br/>per Datasource) 14 | B --> C(Defining Cleaners <br/>per Datasource) 15 | ``` 16 | 17 | In this library, we define filtering as data instances being removed 18 | from the dataset based on some criteria and cleaning as data instances 19 | being modified in some way. 20 | 21 | ## Install 22 | 23 | ``` sh 24 | pip install squeakily 25 | ``` 26 | 27 | ## How to use 28 | 29 | ### Using the API 30 | 31 | First, we need to define a datasource. `squeakily` accepts any `Dataset` 32 | object from the [HuggingFace 33 | Datasets](https://huggingface.co/docs/datasets/index) library. For 34 | example, we can use the 35 | [wikitext](https://huggingface.co/datasets/wikitext) dataset: 36 | 37 | ``` python 38 | from datasets import load_dataset 39 | 40 | ds = load_dataset("wikitext", "wikitext-103-v1", split="train[:1%]") 41 | ``` 42 | 43 | We simply need to wrap each `Dataset` object in a dictionary that also 44 | names the datasource and specifies which columns to process and which 45 | filters and cleaners to apply. For example: 46 | 47 | ``` python 48 | from squeakily.filter import check_char_repetition, check_flagged_words 49 | from squeakily.clean import remove_empty_lines, normalize_whitespace 50 | 51 | datasources = [ 52 | { 53 | "dataset": ds, 54 | "name": "wikitext", 55 | "columns": ["text"], 56 | "filters": [check_char_repetition, check_flagged_words], 57 | "cleaners": [remove_empty_lines, normalize_whitespace], 58 | }, 59 | # ... 60 | ] 61 | ``` 62 | 63 |
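Finally, the datasources are handed to a `Pipeline`, which runs each datasource's filters and then its cleaners. The snippet below is a minimal sketch of that step; it mirrors the `Pipeline(datasources)` / `pipeline.run()` usage shown in `nbs/index.ipynb` later in this repository:

``` python
from squeakily.core import Pipeline

# Build the pipeline from the datasources defined above and run all of
# their filters and cleaners on the first listed column ("text").
pipeline = Pipeline(datasources)
pipeline.run()
```

Running the pipeline logs each step, as shown in the output below: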
[11/16/22 04:32:57] INFO Running datasource: wikitext core.py:41 95 |96 |
INFO Running filter: check_char_repetition on text core.py:54 97 |98 |
INFO Running filter: check_flagged_words on text core.py:54 99 |100 |
INFO Running cleaner: remove_empty_lines on text core.py:57 101 |102 |
[11/16/22 04:32:59] INFO Running cleaner: normalize_whitespace on text core.py:57 103 |104 | 105 |
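Filters and cleaners are plain Python functions that operate on a single column value, so custom ones can be mixed in alongside the built-ins. The sketch below uses hypothetical names (`check_min_length` and `lowercase` are not part of the library): a filter returns `True` to keep an example (or the raw criterion when the pipeline is run with `dry_run=True`), while a cleaner returns the modified text.

``` python
# Hypothetical user-defined filter and cleaner -- not shipped with squeakily.
def check_min_length(text, min_length=10, dry_run=False):
    # Criterion: the number of characters in the document.
    criteria = len(text)
    # In dry-run mode the Pipeline stores the criterion in a new column
    # instead of dropping rows, so return the raw value.
    return criteria if dry_run else criteria >= min_length


def lowercase(text):
    # Cleaners simply return the modified text.
    return text.lower()


datasources[0]["filters"].append(check_min_length)
datasources[0]["cleaners"].append(lowercase)
```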
[11/16/22 04:50:08] INFO Running datasource: wikitext core.py:41\n", 145 | "\n" 146 | ], 147 | "text/plain": [ 148 | "\u001b[2;36m[11/16/22 04:50:08]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Running datasource: wikitext \u001b]8;id=538643;file:///fsx/home-nathan/work/squeakily/squeakily/core.py\u001b\\\u001b[2mcore.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=230902;file:///fsx/home-nathan/work/squeakily/squeakily/core.py#41\u001b\\\u001b[2m41\u001b[0m\u001b]8;;\u001b\\\n" 149 | ] 150 | }, 151 | "metadata": {}, 152 | "output_type": "display_data" 153 | }, 154 | { 155 | "data": { 156 | "text/html": [ 157 | "
INFO Running cleaner: clean on text core.py:57\n", 158 | "\n" 159 | ], 160 | "text/plain": [ 161 | "\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Running cleaner: clean on text \u001b]8;id=441718;file:///fsx/home-nathan/work/squeakily/squeakily/core.py\u001b\\\u001b[2mcore.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=808891;file:///fsx/home-nathan/work/squeakily/squeakily/core.py#57\u001b\\\u001b[2m57\u001b[0m\u001b]8;;\u001b\\\n" 162 | ] 163 | }, 164 | "metadata": {}, 165 | "output_type": "display_data" 166 | }, 167 | { 168 | "name": "stderr", 169 | "output_type": "stream", 170 | "text": [ 171 | "#0: 0%| | 0/251 [00:00, ?ex/s]\n", 172 | "\u001b[A\n", 173 | "\n", 174 | "\u001b[A\u001b[A\n", 175 | "\n", 176 | "\n", 177 | "\u001b[A\u001b[A\u001b[A\n", 178 | "\n", 179 | "\n", 180 | "\n", 181 | "#0: 100%|██████████| 251/251 [00:00<00:00, 3072.01ex/s]\n", 182 | "\n", 183 | "\n", 184 | "\n", 185 | "\n", 186 | "\n", 187 | "#6: 0%| | 0/251 [00:00, ?ex/s]\n", 188 | "\n", 189 | "\n", 190 | "\n", 191 | "\n", 192 | "\n", 193 | "#1: 100%|██████████| 251/251 [00:00<00:00, 2612.54ex/s]\n", 194 | "#2: 100%|██████████| 251/251 [00:00<00:00, 2855.57ex/s]\n", 195 | "\n", 196 | "\n", 197 | "\n", 198 | "\n", 199 | "\n", 200 | "\n", 201 | "\n", 202 | "#3: 100%|██████████| 251/251 [00:00<00:00, 2935.28ex/s]\n", 203 | "#4: 100%|██████████| 251/251 [00:00<00:00, 3264.68ex/s]\n", 204 | "\n", 205 | "\n", 206 | "\n", 207 | "\n", 208 | "\n", 209 | "\n", 210 | "\n", 211 | "\n", 212 | "\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n", 213 | "\n", 214 | "\n", 215 | "\n", 216 | "\n", 217 | "\n", 218 | "\n", 219 | "\n", 220 | "\n", 221 | "#10: 0%| | 0/251 [00:00, ?ex/s]\n", 222 | "\n", 223 | "\n", 224 | "\n", 225 | "\u001b[A\u001b[A\u001b[A\u001b[A\n", 226 | "\n", 227 | "\n", 228 | "\n", 229 | "\n", 230 | "\n", 231 | "\n", 232 | "\n", 233 | "\n", 234 | "\n", 235 | "#5: 100%|██████████| 251/251 [00:00<00:00, 2389.82ex/s]\n", 236 | "#6: 100%|██████████| 251/251 [00:00<00:00, 2589.32ex/s]\n", 237 | "\n", 238 | "\n", 239 | "\n", 240 | "\n", 241 | "\n", 242 | "\n", 243 | "\n", 244 | "\n", 245 | "\n", 246 | "\n", 247 | "\n", 248 | "\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n", 249 | "\n", 250 | "\n", 251 | "\n", 252 | "\n", 253 | "\n", 254 | "\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n", 255 | "\n", 256 | "\n", 257 | "\n", 258 | "\n", 259 | "\n", 260 | "\n", 261 | "\n", 262 | "\n", 263 | "\n", 264 | "\n", 265 | "\n", 266 | "\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n", 267 | "\n", 268 | "\n", 269 | "\n", 270 | "\n", 271 | "\n", 272 | "\n", 273 | "#7: 100%|██████████| 251/251 [00:00<00:00, 2034.34ex/s]\n", 274 | "#9: 100%|██████████| 251/251 [00:00<00:00, 2617.65ex/s]\n", 275 | "\n", 276 | "\n", 277 | "\n", 278 | "\n", 279 | "\n", 280 | "\n", 281 | "\n", 282 | "\n", 283 | "\n", 284 | "\n", 285 | "\n", 286 | "\n", 287 | "\n", 288 | "#11: 100%|██████████| 251/251 [00:00<00:00, 3306.24ex/s]\n", 289 | "#8: 100%|██████████| 251/251 [00:00<00:00, 1814.93ex/s]\n", 290 | "\n", 291 | "\n", 292 | "\n", 293 | "\n", 294 | "\n", 295 | "\n", 296 | "\n", 297 | "\n", 298 | "\n", 299 | "\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n", 300 | "\n", 301 | "\n", 302 | "\n", 303 | "\n", 304 | "\n", 305 | "\n", 306 | "\n", 307 | "\n", 308 | "\n", 309 | "\n", 310 | "\n", 311 | "\n", 312 | "\n", 313 | "#10: 100%|██████████| 251/251 [00:00<00:00, 2270.29ex/s]\n", 
314 | "\n", 315 | "\n", 316 | "\n", 317 | "\n", 318 | "\n", 319 | "\n", 320 | "\n", 321 | "\n", 322 | "\n", 323 | "\n", 324 | "\n", 325 | "\n", 326 | "\n", 327 | "\n", 328 | "\n", 329 | "\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n", 330 | "\n", 331 | "\n", 332 | "\n", 333 | "\n", 334 | "\n", 335 | "\n", 336 | "\n", 337 | "\n", 338 | "\n", 339 | "\n", 340 | "\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n", 341 | "\n", 342 | "\n", 343 | "\n", 344 | "\n", 345 | "\n", 346 | "\n", 347 | "\n", 348 | "\n", 349 | "\n", 350 | "\n", 351 | "\n", 352 | "\n", 353 | "\n", 354 | "\n", 355 | "\n", 356 | "#13: 100%|██████████| 251/251 [00:00<00:00, 3016.87ex/s]\n", 357 | "#12: 100%|██████████| 251/251 [00:00<00:00, 2143.69ex/s]\n", 358 | "\n", 359 | "\n", 360 | "\n", 361 | "\n", 362 | "\n", 363 | "\n", 364 | "\n", 365 | "\n", 366 | "\n", 367 | "\n", 368 | "\n", 369 | "\n", 370 | "\n", 371 | "\n", 372 | "\n", 373 | "\n", 374 | "\n", 375 | "\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n", 376 | "\n", 377 | "\n", 378 | "\n", 379 | "\n", 380 | "\n", 381 | "\n", 382 | "\n", 383 | "\n", 384 | "\n", 385 | "\n", 386 | "\n", 387 | "\n", 388 | "\n", 389 | "\n", 390 | "\n", 391 | "\n", 392 | "\n", 393 | "\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n", 394 | "\n", 395 | "\n", 396 | "\n", 397 | "\n", 398 | "\n", 399 | "\n", 400 | "\n", 401 | "\n", 402 | "\n", 403 | "\n", 404 | "\n", 405 | "\n", 406 | "#14: 100%|██████████| 250/250 [00:00<00:00, 2288.07ex/s]\n", 407 | "#16: 100%|██████████| 250/250 [00:00<00:00, 3103.58ex/s]\n", 408 | "#15: 100%|██████████| 250/250 [00:00<00:00, 2592.26ex/s]\n", 409 | "#17: 100%|██████████| 250/250 [00:00<00:00, 2701.81ex/s]\n", 410 | "#19: 100%|██████████| 250/250 [00:00<00:00, 3712.02ex/s]\n", 411 | "#18: 100%|██████████| 250/250 [00:00<00:00, 2754.21ex/s]\n", 412 | "#21: 100%|██████████| 250/250 [00:00<00:00, 3429.25ex/s]\n", 413 | "#20: 100%|██████████| 250/250 [00:00<00:00, 2555.64ex/s]\n", 414 | "#22: 100%|██████████| 250/250 [00:00<00:00, 2730.66ex/s]\n", 415 | "#23: 100%|██████████| 250/250 [00:00<00:00, 3116.50ex/s]\n", 416 | "#24: 100%|██████████| 250/250 [00:00<00:00, 2475.98ex/s]\n", 417 | "#27: 100%|██████████| 250/250 [00:00<00:00, 2275.80ex/s]\n", 418 | "#28: 100%|██████████| 250/250 [00:00<00:00, 2605.21ex/s]\n", 419 | "#26: 100%|██████████| 250/250 [00:00<00:00, 1912.19ex/s]\n", 420 | "#29: 100%|██████████| 250/250 [00:00<00:00, 2153.56ex/s]\n", 421 | "#25: 100%|██████████| 250/250 [00:00<00:00, 1340.31ex/s]\n", 422 | "#32: 100%|██████████| 250/250 [00:00<00:00, 2992.20ex/s]\n", 423 | "#30: 100%|██████████| 250/250 [00:00<00:00, 2616.85ex/s]\n", 424 | "#35: 100%|██████████| 250/250 [00:00<00:00, 2765.58ex/s]\n", 425 | "#33: 100%|██████████| 250/250 [00:00<00:00, 2415.08ex/s]\n", 426 | "#37: 100%|██████████| 250/250 [00:00<00:00, 2361.90ex/s]\n", 427 | "#34: 100%|██████████| 250/250 [00:00<00:00, 2138.85ex/s]\n", 428 | "\n", 429 | "#31: 100%|██████████| 250/250 [00:00<00:00, 1646.41ex/s]\n", 430 | "#39: 100%|██████████| 250/250 [00:00<00:00, 2733.18ex/s]\n", 431 | "#36: 100%|██████████| 250/250 [00:00<00:00, 1822.53ex/s]\n", 432 | "#40: 100%|██████████| 250/250 [00:00<00:00, 3259.21ex/s]\n", 433 | "#41: 100%|██████████| 250/250 [00:00<00:00, 2362.59ex/s]\n", 
434 | "#42: 100%|██████████| 250/250 [00:00<00:00, 2641.48ex/s]\n", 435 | "#43: 100%|██████████| 250/250 [00:00<00:00, 2797.40ex/s]\n", 436 | "#46: 100%|██████████| 250/250 [00:00<00:00, 3180.07ex/s]\n", 437 | "#45: 100%|██████████| 250/250 [00:00<00:00, 2793.68ex/s]\n", 438 | "#44: 100%|██████████| 250/250 [00:00<00:00, 2030.37ex/s]\n", 439 | "#47: 100%|██████████| 250/250 [00:00<00:00, 3685.85ex/s]\n", 440 | "#48: 100%|██████████| 250/250 [00:00<00:00, 2849.12ex/s]\n", 441 | "#50: 100%|██████████| 250/250 [00:00<00:00, 3088.79ex/s]\n", 442 | "#49: 100%|██████████| 250/250 [00:00<00:00, 2418.17ex/s]\n", 443 | "#51: 100%|██████████| 250/250 [00:00<00:00, 2865.90ex/s]\n", 444 | "#52: 100%|██████████| 250/250 [00:00<00:00, 3265.18ex/s]\n", 445 | "#53: 100%|██████████| 250/250 [00:00<00:00, 2847.02ex/s]\n", 446 | "#54: 100%|██████████| 250/250 [00:00<00:00, 2452.67ex/s]\n", 447 | "#56: 100%|██████████| 250/250 [00:00<00:00, 2912.06ex/s]\n", 448 | "#55: 100%|██████████| 250/250 [00:00<00:00, 1961.08ex/s]\n", 449 | "#58: 100%|██████████| 250/250 [00:00<00:00, 3242.75ex/s]\n", 450 | "#57: 100%|██████████| 250/250 [00:00<00:00, 2639.49ex/s]\n", 451 | "#59: 100%|██████████| 250/250 [00:00<00:00, 3095.53ex/s]\n", 452 | "#60: 100%|██████████| 250/250 [00:00<00:00, 3094.43ex/s]\n", 453 | "#61: 100%|██████████| 250/250 [00:00<00:00, 2429.84ex/s]\n", 454 | "#62: 100%|██████████| 250/250 [00:00<00:00, 2758.44ex/s]\n", 455 | "#63: 100%|██████████| 250/250 [00:00<00:00, 2411.69ex/s]\n", 456 | "#64: 100%|██████████| 250/250 [00:00<00:00, 2883.52ex/s]\n", 457 | "#65: 100%|██████████| 250/250 [00:00<00:00, 2773.47ex/s]\n", 458 | "#66: 100%|██████████| 250/250 [00:00<00:00, 2894.81ex/s]\n", 459 | "#67: 100%|██████████| 250/250 [00:00<00:00, 2518.36ex/s]\n", 460 | "#68: 100%|██████████| 250/250 [00:00<00:00, 2671.31ex/s]\n", 461 | "#69: 100%|██████████| 250/250 [00:00<00:00, 2463.28ex/s]\n", 462 | "#70: 100%|██████████| 250/250 [00:00<00:00, 2670.79ex/s]\n", 463 | "#71: 100%|██████████| 250/250 [00:00<00:00, 2941.40ex/s]\n" 464 | ] 465 | } 466 | ], 467 | "source": [ 468 | "# |output: false\n", 469 | "from squeakily.core import Pipeline\n", 470 | "\n", 471 | "pipeline = Pipeline(datasources)\n", 472 | "pipeline.run()" 473 | ] 474 | } 475 | ], 476 | "metadata": { 477 | "kernelspec": { 478 | "display_name": "python3", 479 | "language": "python", 480 | "name": "python3" 481 | } 482 | }, 483 | "nbformat": 4, 484 | "nbformat_minor": 4 485 | } 486 | -------------------------------------------------------------------------------- /nbs/_quarto.yml: -------------------------------------------------------------------------------- 1 | project: 2 | type: website 3 | 4 | format: 5 | html: 6 | theme: cosmo 7 | css: styles.css 8 | toc: true 9 | 10 | website: 11 | twitter-card: true 12 | open-graph: true 13 | repo-actions: [issue] 14 | navbar: 15 | background: primary 16 | search: true 17 | sidebar: 18 | style: floating 19 | 20 | metadata-files: [nbdev.yml, sidebar.yml] -------------------------------------------------------------------------------- /nbs/nbdev.yml: -------------------------------------------------------------------------------- 1 | project: 2 | output-dir: _docs 3 | 4 | website: 5 | title: "squeakily" 6 | site-url: "https://CarperAI.github.io/squeakily" 7 | description: "A library for squeakily cleaning and filtering language datasets." 
8 | repo-branch: main 9 | repo-url: "https://github.com/CarperAI/squeakily" 10 | -------------------------------------------------------------------------------- /nbs/sidebar.yml: -------------------------------------------------------------------------------- 1 | website: 2 | sidebar: 3 | contents: 4 | - index.ipynb 5 | - 00_core.ipynb 6 | - 01_filter.ipynb 7 | - 02_clean.ipynb 8 | - 03_helpers.ipynb 9 | - 04_tutorials.ipynb 10 | -------------------------------------------------------------------------------- /nbs/styles.css: -------------------------------------------------------------------------------- 1 | .cell { 2 | margin-bottom: 1rem; 3 | } 4 | 5 | .cell > .sourceCode { 6 | margin-bottom: 0; 7 | } 8 | 9 | .cell-output > pre { 10 | margin-bottom: 0; 11 | } 12 | 13 | .cell-output > pre, .cell-output > .sourceCode > pre, .cell-output-stdout > pre { 14 | margin-left: 0.8rem; 15 | margin-top: 0; 16 | background: none; 17 | border-left: 2px solid lightsalmon; 18 | border-top-left-radius: 0; 19 | border-top-right-radius: 0; 20 | } 21 | 22 | .cell-output > .sourceCode { 23 | border: none; 24 | } 25 | 26 | .cell-output > .sourceCode { 27 | background: none; 28 | margin-top: 0; 29 | } 30 | 31 | div.description { 32 | padding-left: 2px; 33 | padding-top: 5px; 34 | font-style: italic; 35 | font-size: 135%; 36 | opacity: 70%; 37 | } 38 | -------------------------------------------------------------------------------- /settings.ini: -------------------------------------------------------------------------------- 1 | [DEFAULT] 2 | repo = squeakily 3 | lib_name = squeakily 4 | version = 0.0.3 5 | min_python = 3.7 6 | license = apache2 7 | doc_path = _docs 8 | lib_path = squeakily 9 | nbs_path = nbs 10 | recursive = True 11 | tst_flags = notest 12 | put_version_in_init = True 13 | branch = main 14 | custom_sidebar = False 15 | doc_host = https://CarperAI.github.io 16 | doc_baseurl = /squeakily 17 | git_url = https://github.com/CarperAI/squeakily 18 | title = squeakily 19 | audience = Developers 20 | author = ncoop57 21 | author_email = nathan.cooper@stability.ai 22 | copyright = 2022 onwards, ncoop57 23 | description = A library for squeakily cleaning and filtering language datasets. 
24 | keywords = nbdev jupyter notebook python 25 | language = English 26 | status = 3 27 | user = CarperAI 28 | requirements = datasketch==1.5.8 datasets==2.7.1 Faker==15.3.3 fastcore huggingface-hub networkit pydantic rich ftfy scikit-learn 29 | dev_requirements = BeautifulSoup4 fasttext nbdev scrubadub twine sentencepiece code-tokenize langchain==0.0.212 openai code-ast 30 | black_formatting = False 31 | readme_nb = index.ipynb 32 | allowed_metadata_keys = 33 | allowed_cell_metadata_keys = 34 | jupyter_hooks = True 35 | clean_ids = True 36 | clear_all = False 37 | 38 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from pkg_resources import parse_version 2 | from configparser import ConfigParser 3 | import setuptools 4 | assert parse_version(setuptools.__version__)>=parse_version('36.2') 5 | 6 | # note: all settings are in settings.ini; edit there, not here 7 | config = ConfigParser(delimiters=['=']) 8 | config.read('settings.ini') 9 | cfg = config['DEFAULT'] 10 | 11 | cfg_keys = 'version description keywords author author_email'.split() 12 | expected = cfg_keys + "lib_name user branch license status min_python audience language".split() 13 | for o in expected: assert o in cfg, "missing expected setting: {}".format(o) 14 | setup_cfg = {o:cfg[o] for o in cfg_keys} 15 | 16 | licenses = { 17 | 'apache2': ('Apache Software License 2.0','OSI Approved :: Apache Software License'), 18 | 'mit': ('MIT License', 'OSI Approved :: MIT License'), 19 | 'gpl2': ('GNU General Public License v2', 'OSI Approved :: GNU General Public License v2 (GPLv2)'), 20 | 'gpl3': ('GNU General Public License v3', 'OSI Approved :: GNU General Public License v3 (GPLv3)'), 21 | 'bsd3': ('BSD License', 'OSI Approved :: BSD License'), 22 | } 23 | statuses = [ '1 - Planning', '2 - Pre-Alpha', '3 - Alpha', 24 | '4 - Beta', '5 - Production/Stable', '6 - Mature', '7 - Inactive' ] 25 | py_versions = '3.6 3.7 3.8 3.9 3.10'.split() 26 | 27 | requirements = cfg.get('requirements','').split() 28 | if cfg.get('pip_requirements'): requirements += cfg.get('pip_requirements','').split() 29 | min_python = cfg['min_python'] 30 | lic = licenses.get(cfg['license'].lower(), (cfg['license'], None)) 31 | dev_requirements = (cfg.get('dev_requirements') or '').split() 32 | 33 | setuptools.setup( 34 | name = cfg['lib_name'], 35 | license = lic[0], 36 | classifiers = [ 37 | 'Development Status :: ' + statuses[int(cfg['status'])], 38 | 'Intended Audience :: ' + cfg['audience'].title(), 39 | 'Natural Language :: ' + cfg['language'].title(), 40 | ] + ['Programming Language :: Python :: '+o for o in py_versions[py_versions.index(min_python):]] + (['License :: ' + lic[1] ] if lic[1] else []), 41 | url = cfg['git_url'], 42 | packages = setuptools.find_packages(), 43 | include_package_data = True, 44 | install_requires = requirements, 45 | extras_require={ 'dev': dev_requirements }, 46 | dependency_links = cfg.get('dep_links','').split(), 47 | python_requires = '>=' + cfg['min_python'], 48 | long_description = open('README.md').read(), 49 | long_description_content_type = 'text/markdown', 50 | zip_safe = False, 51 | entry_points = { 52 | 'console_scripts': cfg.get('console_scripts','').split(), 53 | 'nbdev': [f'{cfg.get("lib_path")}={cfg.get("lib_path")}._modidx:d'] 54 | }, 55 | **setup_cfg) 56 | 57 | 58 | -------------------------------------------------------------------------------- /squeakily/__init__.py: 
-------------------------------------------------------------------------------- 1 | __version__ = "0.0.3" 2 | -------------------------------------------------------------------------------- /squeakily/_modidx.py: -------------------------------------------------------------------------------- 1 | # Autogenerated by nbdev 2 | 3 | d = { 'settings': { 'branch': 'main', 4 | 'doc_baseurl': '/squeakily', 5 | 'doc_host': 'https://CarperAI.github.io', 6 | 'git_url': 'https://github.com/CarperAI/squeakily', 7 | 'lib_path': 'squeakily'}, 8 | 'syms': { 'squeakily.clean': { 'squeakily.clean.clean_code_license': ('clean.html#clean_code_license', 'squeakily/clean.py'), 9 | 'squeakily.clean.fix_utf8_encoding': ('clean.html#fix_utf8_encoding', 'squeakily/clean.py'), 10 | 'squeakily.clean.normalize_punctuation': ('clean.html#normalize_punctuation', 'squeakily/clean.py'), 11 | 'squeakily.clean.normalize_whitespace': ('clean.html#normalize_whitespace', 'squeakily/clean.py'), 12 | 'squeakily.clean.remove_empty_lines': ('clean.html#remove_empty_lines', 'squeakily/clean.py'), 13 | 'squeakily.clean.replace_credit_card': ('clean.html#replace_credit_card', 'squeakily/clean.py'), 14 | 'squeakily.clean.replace_dates': ('clean.html#replace_dates', 'squeakily/clean.py'), 15 | 'squeakily.clean.replace_email': ('clean.html#replace_email', 'squeakily/clean.py'), 16 | 'squeakily.clean.replace_ip': ('clean.html#replace_ip', 'squeakily/clean.py'), 17 | 'squeakily.clean.replace_phone': ('clean.html#replace_phone', 'squeakily/clean.py'), 18 | 'squeakily.clean.replace_ssn': ('clean.html#replace_ssn', 'squeakily/clean.py'), 19 | 'squeakily.clean.replace_urls': ('clean.html#replace_urls', 'squeakily/clean.py')}, 20 | 'squeakily.core': { 'squeakily.core.Pipeline': ('core.html#pipeline', 'squeakily/core.py'), 21 | 'squeakily.core.Pipeline.__init__': ('core.html#pipeline.__init__', 'squeakily/core.py'), 22 | 'squeakily.core.Pipeline.__run_filter': ('core.html#pipeline.__run_filter', 'squeakily/core.py'), 23 | 'squeakily.core.Pipeline.export_to_path': ('core.html#pipeline.export_to_path', 'squeakily/core.py'), 24 | 'squeakily.core.Pipeline.run': ('core.html#pipeline.run', 'squeakily/core.py')}, 25 | 'squeakily.filter': { 'squeakily.filter._calculate_average_false_positive_rate': ( 'filter.html#_calculate_average_false_positive_rate', 26 | 'squeakily/filter.py'), 27 | 'squeakily.filter._char_rep_ratio': ('filter.html#_char_rep_ratio', 'squeakily/filter.py'), 28 | 'squeakily.filter._compress_ratio': ('filter.html#_compress_ratio', 'squeakily/filter.py'), 29 | 'squeakily.filter._find_duplicate_communities': ( 'filter.html#_find_duplicate_communities', 30 | 'squeakily/filter.py'), 31 | 'squeakily.filter._flag_word_ratio': ('filter.html#_flag_word_ratio', 'squeakily/filter.py'), 32 | 'squeakily.filter._hash_func': ('filter.html#_hash_func', 'squeakily/filter.py'), 33 | 'squeakily.filter._jaccard_similarity': ('filter.html#_jaccard_similarity', 'squeakily/filter.py'), 34 | 'squeakily.filter._query_content': ('filter.html#_query_content', 'squeakily/filter.py'), 35 | 'squeakily.filter.check_char_repetition': ('filter.html#check_char_repetition', 'squeakily/filter.py'), 36 | 'squeakily.filter.check_code_parsability': ('filter.html#check_code_parsability', 'squeakily/filter.py'), 37 | 'squeakily.filter.check_compression_ratio': ( 'filter.html#check_compression_ratio', 38 | 'squeakily/filter.py'), 39 | 'squeakily.filter.check_flagged_words': ('filter.html#check_flagged_words', 'squeakily/filter.py'), 40 | 'squeakily.filter.check_labels': 
('filter.html#check_labels', 'squeakily/filter.py'), 41 | 'squeakily.filter.check_language': ('filter.html#check_language', 'squeakily/filter.py'), 42 | 'squeakily.filter.check_perplexity': ('filter.html#check_perplexity', 'squeakily/filter.py'), 43 | 'squeakily.filter.check_stop_word_ratio': ('filter.html#check_stop_word_ratio', 'squeakily/filter.py'), 44 | 'squeakily.filter.check_word_number': ('filter.html#check_word_number', 'squeakily/filter.py'), 45 | 'squeakily.filter.minhash_dedup': ('filter.html#minhash_dedup', 'squeakily/filter.py')}, 46 | 'squeakily.helpers': { 'squeakily.helpers.FastTextLanguageDetector': ( 'helpers.html#fasttextlanguagedetector', 47 | 'squeakily/helpers.py'), 48 | 'squeakily.helpers.FastTextLanguageDetector.__eq__': ( 'helpers.html#fasttextlanguagedetector.__eq__', 49 | 'squeakily/helpers.py'), 50 | 'squeakily.helpers.FastTextLanguageDetector.__init__': ( 'helpers.html#fasttextlanguagedetector.__init__', 51 | 'squeakily/helpers.py'), 52 | 'squeakily.helpers.FastTextLanguageDetector.__reduce__': ( 'helpers.html#fasttextlanguagedetector.__reduce__', 53 | 'squeakily/helpers.py'), 54 | 'squeakily.helpers.FastTextLanguageDetector.from_pretrained': ( 'helpers.html#fasttextlanguagedetector.from_pretrained', 55 | 'squeakily/helpers.py'), 56 | 'squeakily.helpers.FastTextLanguageDetector.get_language': ( 'helpers.html#fasttextlanguagedetector.get_language', 57 | 'squeakily/helpers.py'), 58 | 'squeakily.helpers.KenlmModel': ('helpers.html#kenlmmodel', 'squeakily/helpers.py'), 59 | 'squeakily.helpers.KenlmModel.__init__': ('helpers.html#kenlmmodel.__init__', 'squeakily/helpers.py'), 60 | 'squeakily.helpers.KenlmModel.download_kenlm_model': ( 'helpers.html#kenlmmodel.download_kenlm_model', 61 | 'squeakily/helpers.py'), 62 | 'squeakily.helpers.KenlmModel.from_pretrained': ( 'helpers.html#kenlmmodel.from_pretrained', 63 | 'squeakily/helpers.py'), 64 | 'squeakily.helpers.KenlmModel.get_perplexity': ( 'helpers.html#kenlmmodel.get_perplexity', 65 | 'squeakily/helpers.py'), 66 | 'squeakily.helpers.KenlmModel.normalize': ('helpers.html#kenlmmodel.normalize', 'squeakily/helpers.py'), 67 | 'squeakily.helpers.KenlmModel.pp': ('helpers.html#kenlmmodel.pp', 'squeakily/helpers.py'), 68 | 'squeakily.helpers.KenlmModel.remove_non_printing_char': ( 'helpers.html#kenlmmodel.remove_non_printing_char', 69 | 'squeakily/helpers.py'), 70 | 'squeakily.helpers.KenlmModel.remove_unicode_punct': ( 'helpers.html#kenlmmodel.remove_unicode_punct', 71 | 'squeakily/helpers.py'), 72 | 'squeakily.helpers.KenlmModel.replace_unicode_punct': ( 'helpers.html#kenlmmodel.replace_unicode_punct', 73 | 'squeakily/helpers.py'), 74 | 'squeakily.helpers.KenlmModel.strip_accents': ( 'helpers.html#kenlmmodel.strip_accents', 75 | 'squeakily/helpers.py'), 76 | 'squeakily.helpers.LLMLabeler': ('helpers.html#llmlabeler', 'squeakily/helpers.py'), 77 | 'squeakily.helpers.LLMLabeler.__call__': ('helpers.html#llmlabeler.__call__', 'squeakily/helpers.py'), 78 | 'squeakily.helpers.LLMLabeler.__init__': ('helpers.html#llmlabeler.__init__', 'squeakily/helpers.py'), 79 | 'squeakily.helpers.LLMLabelerParser': ('helpers.html#llmlabelerparser', 'squeakily/helpers.py'), 80 | 'squeakily.helpers.SentencePiece': ('helpers.html#sentencepiece', 'squeakily/helpers.py'), 81 | 'squeakily.helpers.SentencePiece.__init__': ( 'helpers.html#sentencepiece.__init__', 82 | 'squeakily/helpers.py'), 83 | 'squeakily.helpers.SentencePiece.do': ('helpers.html#sentencepiece.do', 'squeakily/helpers.py'), 84 | 'squeakily.helpers.get_words': 
('helpers.html#get_words', 'squeakily/helpers.py')}}} -------------------------------------------------------------------------------- /squeakily/clean.py: -------------------------------------------------------------------------------- 1 | # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/02_clean.ipynb. 2 | 3 | # %% auto 0 4 | __all__ = ['fake', 'whitespace', 'unicode_punctuation', 'normalize_whitespace', 'normalize_punctuation', 'remove_empty_lines', 5 | 'replace_urls', 'replace_dates', 'replace_email', 'replace_phone', 'replace_ip', 'replace_credit_card', 6 | 'replace_ssn', 'fix_utf8_encoding', 'clean_code_license'] 7 | 8 | # %% ../nbs/02_clean.ipynb 2 9 | import re 10 | from faker import Faker 11 | import ftfy 12 | 13 | fake = Faker() 14 | 15 | # %% ../nbs/02_clean.ipynb 4 16 | # From: https://github.com/bigscience-workshop/data-preparation/blob/main/preprocessing/training/01b_oscar_cleaning_and_filtering/filtering.py#L95 17 | whitespace = { 18 | " ", 19 | " ", 20 | " ", 21 | " ", 22 | " ", 23 | " ", 24 | " ", 25 | " ", 26 | " ", 27 | " ", 28 | "", 29 | "", 30 | } 31 | 32 | 33 | def normalize_whitespace( 34 | text: str, # The text to normalize 35 | ) -> str: # The normalized text 36 | """ 37 | Replace the various whitespace characters with the standard one. 38 | """ 39 | text = "".join([char if char not in whitespace else " " for char in text]) 40 | return text 41 | 42 | # %% ../nbs/02_clean.ipynb 6 43 | unicode_punctuation = { 44 | ",": ",", 45 | "。": ".", 46 | "、": ",", 47 | "„": '"', 48 | "”": '"', 49 | "“": '"', 50 | "«": '"', 51 | "»": '"', 52 | "1": '"', 53 | "」": '"', 54 | "「": '"', 55 | "《": '"', 56 | "》": '"', 57 | "´": "'", 58 | "∶": ":", 59 | ":": ":", 60 | "?": "?", 61 | "!": "!", 62 | "(": "(", 63 | ")": ")", 64 | ";": ";", 65 | "–": "-", 66 | "—": " - ", 67 | ".": ". ", 68 | "~": "~", 69 | "’": "'", 70 | "…": "...", 71 | "━": "-", 72 | "〈": "<", 73 | "〉": ">", 74 | "【": "[", 75 | "】": "]", 76 | "%": "%", 77 | "►": "-", 78 | } 79 | 80 | 81 | def normalize_punctuation( 82 | text: str, # The text to normalize 83 | ) -> str: # The normalized text 84 | """ 85 | Replace the various unicode punctuation characters with the standard ones. 86 | """ 87 | text = "".join([unicode_punctuation.get(char, char) for char in text]) 88 | return text 89 | 90 | # %% ../nbs/02_clean.ipynb 8 91 | def remove_empty_lines( 92 | text: str, # The text to remove empty lines from 93 | ) -> str: # The text with empty lines removed 94 | """ 95 | Remove empty lines from the text. 
96 | Solution from https://stackoverflow.com/a/3711884/5768407 97 | """ 98 | lines = text.splitlines() 99 | filtered = filter(lambda x: not re.match(r"^\s*$", x), lines) 100 | return "\n".join(filtered) 101 | 102 | # %% ../nbs/02_clean.ipynb 10 103 | def replace_urls( 104 | text: str, # The text to replace URLs in 105 | dummy: str = "https://example.com/", # The dummy text to replace URLs with 106 | ) -> str: # The text with URLs replaced 107 | """Replace urls from text with a dummy.""" 108 | return re.sub(r"http\S+", dummy, text) 109 | 110 | # %% ../nbs/02_clean.ipynb 12 111 | def replace_dates( 112 | text: str, # The text to remove dates from 113 | dummy: str = fake.date(), # The dummy text to replace dates with 114 | ) -> str: # The text with dates replaced 115 | """Replace dates from text with a dummy.""" 116 | return re.sub(r"\d{1,2}/\d{1,2}/\d{4}", dummy, text) 117 | 118 | # %% ../nbs/02_clean.ipynb 15 119 | def replace_email( 120 | text: str, # The text to replace email addresses in 121 | dummy: str = fake.email(), # The dummy text to replace email addresses with 122 | ) -> str: # The text with email addresses replaced 123 | """Replace email addresses from text with a dummy.""" 124 | return re.sub(r"[\w\.-]+@[\w\.-]+", dummy, text) 125 | 126 | # %% ../nbs/02_clean.ipynb 17 127 | def replace_phone( 128 | text: str, # The text to replace phone numbers in 129 | dummy: str = fake.phone_number(), # The dummy text to replace phone numbers with 130 | ) -> str: # The text with phone numbers replaced 131 | """Replace phone numbers from text with a dummy.""" 132 | return re.sub(r"\(?\d{3}\)?-? *\d{3}-? *-?\d{4}", dummy, text) 133 | 134 | # %% ../nbs/02_clean.ipynb 19 135 | def replace_ip( 136 | text, # The text to replace ip addresses in 137 | dummy1: str = fake.ipv4(), # The dummy text to replace ipv4 addresses with 138 | dummy2: str = fake.ipv6(), # The dummy text to replace ipv6 addresses with 139 | ) -> str: # The text with ip addresses replaced 140 | """ 141 | Replace ip addresses from text with a dummy. 
142 | Solution from https://github.com/bigcode-project/bigcode-analysis/blob/main/data_analysis/pii/utils/emails_ip_addresses_detection.py#L48 143 | """ 144 | ipv4_pattern = r"(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(?:\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}" 145 | text = re.sub(ipv4_pattern, dummy1, text) 146 | ipv6_pattern = r"(?:[0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(?:(?::[0-9a-fA-F]{1,4}){1,6})|:(?:(?::[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(?::[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(?:ffff(?::0{1,4}){0,1}:){0,1}(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])|(?:[0-9a-fA-F]{1,4}:){1,4}:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])" 147 | text = re.sub(ipv6_pattern, dummy2, text) 148 | return text 149 | 150 | # %% ../nbs/02_clean.ipynb 21 151 | def replace_credit_card( 152 | text: str, # The text to replace credit card numbers in 153 | dummy: str = fake.credit_card_number(), # The dummy text to replace credit card numbers with 154 | ) -> str: # The text with credit card numbers replaced 155 | """Replace credit card numbers from text with a dummy.""" 156 | return re.sub(r"\d{4}-\d{4}-\d{4}-\d{4}", dummy, text) 157 | 158 | # %% ../nbs/02_clean.ipynb 23 159 | def replace_ssn( 160 | text: str, # The text to replace social security numbers in 161 | dummy: str = fake.ssn(), # The dummy text to replace social security numbers with 162 | ) -> str: # The text with social security numbers replaced 163 | """Replace social security numbers from text with a dummy.""" 164 | return re.sub(r"\d{3}-\d{2}-\d{4}", dummy, text) 165 | 166 | # %% ../nbs/02_clean.ipynb 25 167 | def fix_utf8_encoding( 168 | text: str, # The text to fix 169 | ) -> str: # The fixed text 170 | """Fix utf8 text using ftfy.""" 171 | return ftfy.fix_text(text) 172 | 173 | # %% ../nbs/02_clean.ipynb 27 174 | def clean_code_license( 175 | code: str, # The code to clean 176 | language: str = "python", # The language of the code 177 | min_lines: int = 3, # The minimum number of lines that need to be removed 178 | ): 179 | import code_ast 180 | from code_ast import ASTVisitor 181 | from code_ast.ast import LEAVE_WHITELIST 182 | 183 | class FirstNonCommentVisitor(ASTVisitor): 184 | def __init__(self): 185 | self.passed_global_node = False 186 | self.first_node = None 187 | 188 | def visit(self, node): 189 | if not self.passed_global_node: 190 | self.passed_global_node = True 191 | return 192 | if self.first_node is None: 193 | if node.child_count > 0 or node.type in LEAVE_WHITELIST: 194 | self.first_node = node 195 | 196 | """Remove the license or other boilerplate comments from the code.""" 197 | ast = code_ast.ast(code, lang=language) 198 | visitor = FirstNonCommentVisitor() 199 | ast.visit(visitor) 200 | start_line = visitor.first_node.start_point[0] 201 | if start_line < min_lines: 202 | return code 203 | else: 204 | return "\n".join(code.splitlines()[start_line:]) 205 | -------------------------------------------------------------------------------- /squeakily/core.py: -------------------------------------------------------------------------------- 1 | # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/00_core.ipynb. 
2 | 3 | # %% auto 0 4 | __all__ = ['logger', 'Pipeline'] 5 | 6 | # %% ../nbs/00_core.ipynb 2 7 | import logging 8 | import os 9 | 10 | from datasets import concatenate_datasets, Dataset 11 | from rich.logging import RichHandler 12 | 13 | logger = logging.getLogger(__name__) 14 | logger.setLevel(logging.INFO) 15 | logger.addHandler(RichHandler(rich_tracebacks=True)) 16 | # Turn off logging for datasets 17 | logging.getLogger("datasets").setLevel(logging.ERROR) 18 | 19 | # %% ../nbs/00_core.ipynb 5 20 | class Pipeline: 21 | """ 22 | A pipeline is a collection of datasources and their associated transformations to be run. 23 | """ 24 | 25 | def __init__(self, datasources): # The datasources to be run 26 | self.datasources = datasources 27 | 28 | def __run_filter(self, dataset, column, filter_fn, dry_run, num_proc): 29 | """ 30 | Run a filter on a dataset. 31 | """ 32 | name = filter_fn.__name__ 33 | logger.info(f"Running filter: {name} on {column}") 34 | if dry_run: 35 | logger.info(f"Running in dry-run mode") 36 | return dataset.map( 37 | lambda x: {f"{name}_criteria": filter_fn(x[column], dry_run=True)}, 38 | num_proc=num_proc, 39 | ) 40 | else: 41 | return dataset.filter( 42 | lambda x: filter_fn(x[column]), 43 | num_proc=num_proc, 44 | ) 45 | 46 | def run( 47 | self, 48 | global_filters=[], # Filters to be run at the dataset level rather than the example level 49 | global_cleaners=[], # Cleaners to be run at the dataset level rather than the example level 50 | cleaning_first=False, # Whether to run the cleaning transformations first 51 | globals_first=False, # Whether to run the global transformations first 52 | dry_run=False, # Whether to run the pipeline or only calculate the various criteria and add as a column 53 | num_proc=os.cpu_count(), # Number of processes to use 54 | ): 55 | """ 56 | Run the pipeline. 
57 | """ 58 | for i in range(len(self.datasources)): 59 | column = self.datasources[i]["columns"][0] 60 | logger.info(f"Running datasource: {self.datasources[i]['name']}") 61 | if cleaning_first: 62 | for c in self.datasources[i]["cleaners"]: 63 | name = c.__name__ 64 | logger.info(f"Running cleaner: {name} on {column}") 65 | self.datasources[i]["dataset"] = self.datasources[i]["dataset"].map( 66 | lambda x: {column: c(x[column])}, 67 | num_proc=num_proc, 68 | ) 69 | for f in self.datasources[i]["filters"]: 70 | self.datasources[i]["dataset"] = self.__run_filter( 71 | self.datasources[i]["dataset"], column, f, dry_run, num_proc 72 | ) 73 | else: 74 | for f in self.datasources[i]["filters"]: 75 | self.datasources[i]["dataset"] = self.__run_filter( 76 | self.datasources[i]["dataset"], column, f, dry_run, num_proc 77 | ) 78 | for c in self.datasources[i]["cleaners"]: 79 | name = c.__name__ 80 | logger.info(f"Running cleaner: {name} on {column}") 81 | self.datasources[i]["dataset"] = self.datasources[i]["dataset"].map( 82 | lambda x: {column: c(x[column])}, 83 | num_proc=num_proc, 84 | ) 85 | 86 | if len(global_filters) > 0: 87 | # concatenate all datasets 88 | datasets = [ 89 | d["dataset"] 90 | for d in self.datasources 91 | if not d.get("skip_global", False) 92 | ] 93 | global_column = self.datasources[0]["columns"][0] 94 | global_dataset = concatenate_datasets(datasets) 95 | 96 | # Add a column representing the original dataset name 97 | md = [] 98 | for d in self.datasources: 99 | if not d.get("skip_global", False): 100 | md.extend([d["name"]] * len(d["dataset"])) 101 | meta_data = Dataset.from_dict({"meta_data": md}) 102 | global_dataset_with_meta = concatenate_datasets( 103 | [global_dataset, meta_data], axis=1 104 | ) 105 | 106 | # Run the global filters 107 | for f in global_filters: 108 | logger.info(f"Running global filter: {f.__name__}") 109 | global_dataset_with_meta = f( 110 | global_dataset_with_meta, global_column, dry_run=dry_run 111 | ) 112 | 113 | # Split the dataset back up 114 | for i, d in enumerate(self.datasources): 115 | if not d.get("skip_global", False): 116 | self.datasources[i]["dataset"] = global_dataset_with_meta.filter( 117 | lambda x: x["meta_data"] == d["name"], 118 | num_proc=num_proc, 119 | ) 120 | 121 | def export_to_path(self, export_path, output_type="csv"): 122 | """ 123 | Export the cleaned & filtered dataset to a desired export path 124 | 125 | Args: 126 | export_path(str): Path to directory 127 | output_type(str, optional param): Output type of the file to export as 128 | """ 129 | try: 130 | os.makedirs(export_path, exist_ok=True) 131 | except OSError as e: 132 | logger.error(f"Failed to create directory: {export_path}. Error: {str(e)}") 133 | return 134 | 135 | for i, datasource in enumerate(self.datasources): 136 | name = datasource["name"] 137 | filename = f"{name}.csv" 138 | filepath = os.path.join(export_path, filename) 139 | try: 140 | if output_type == "csv": 141 | datasource["dataset"].to_csv(filepath, index=False) 142 | elif output_type == "json": 143 | datasource["dataset"].to_json(filepath, index=False) 144 | else: 145 | logger.error( 146 | f"Invalid output_type: {output_type}. Skipping export for {name} dataset." 147 | ) 148 | logger.info(f"Exported {name} dataset to {filepath}") 149 | except Exception as e: 150 | logger.error( 151 | f"Failed to export {name} dataset to {filepath}. 
Error: {str(e)}" 152 | ) 153 | -------------------------------------------------------------------------------- /squeakily/filter.py: -------------------------------------------------------------------------------- 1 | # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/01_filter.ipynb. 2 | 3 | # %% auto 0 4 | __all__ = ['logger', 'zstd_cntxt', 'MINHASH_SEED', 'NON_ALPHA', 'lsh', 'dup_ids', 'check_compression_ratio', 5 | 'check_char_repetition', 'check_flagged_words', 'check_perplexity', 'check_language', 'check_word_number', 6 | 'check_stop_word_ratio', 'check_code_parsability', 'check_labels', 'minhash_dedup'] 7 | 8 | # %% ../nbs/01_filter.ipynb 2 9 | import datasets 10 | import gc 11 | import logging 12 | import multiprocessing 13 | import os 14 | import random 15 | import re 16 | 17 | import networkit as nk 18 | import numpy as np 19 | 20 | from collections import Counter 21 | from datasets import Dataset, Features, Value, Sequence 22 | from datasketch import LeanMinHash, MinHash, MinHashLSH 23 | from rich.logging import RichHandler 24 | from .helpers import flagged_words, get_words 25 | from .helpers import stopwords, stopword_ratios 26 | from tqdm.auto import tqdm 27 | from typing import Set 28 | 29 | # %% ../nbs/01_filter.ipynb 3 30 | logger = logging.getLogger(__name__) 31 | logger.setLevel(logging.INFO) 32 | logger.addHandler(RichHandler(rich_tracebacks=True)) 33 | logger.propagate = False 34 | datasets.logging.set_verbosity_error() 35 | # Turn off logging for datasets 36 | logging.getLogger("datasets").setLevel(logging.ERROR) 37 | 38 | # %% ../nbs/01_filter.ipynb 5 39 | multiprocessing.set_start_method("fork", force=True) 40 | 41 | zstd_cntxt = None 42 | 43 | # %% ../nbs/01_filter.ipynb 6 44 | def _compress_ratio( 45 | doc: str, # document to be analyzed 46 | compression_level: int = 3, # compression level to use 47 | ) -> float: 48 | """ 49 | Returns the ratio of the compressed document to the original document. 50 | """ 51 | global zstd_cntxt 52 | if zstd_cntxt is None: 53 | import zstandard as zstd 54 | 55 | zstd_cntxt = zstd.ZstdCompressor(level=compression_level) 56 | bts = doc.encode("utf-8") 57 | compressed_bts = zstd_cntxt.compress(bts) 58 | try: 59 | ratio = len(compressed_bts) / len(bts) 60 | except ZeroDivisionError: 61 | ratio = 0 62 | return ratio 63 | 64 | # %% ../nbs/01_filter.ipynb 7 65 | def check_compression_ratio( 66 | document, # document to be analyzed 67 | compression_threshold: float = 0.5, # threshold for compression ratio 68 | compression_level: int = 3, # compression level to use 69 | dry_run=False, # if True, returns the ratio of character repetition 70 | ) -> bool: # returns True if document is below threshold 71 | """ 72 | Checks if the document is below the character repetition threshold. 73 | """ 74 | compress_ratio = _compress_ratio(document, compression_level=compression_level) 75 | if dry_run: 76 | return compress_ratio 77 | else: 78 | return compress_ratio > compression_threshold 79 | 80 | # %% ../nbs/01_filter.ipynb 9 81 | def _char_rep_ratio( 82 | doc: str, # document to be analyzed 83 | char_rep_len: int, # length of character repetition 84 | ) -> float: 85 | """ 86 | Returns the ratio of character repetitions in a document. 
87 | """ 88 | 89 | def calc_ngrams(doc, n): 90 | char_ngrams = [doc[i : i + n] for i in range(len(doc) - n + 1)] 91 | freq_char_ngrams = Counter(char_ngrams) 92 | return freq_char_ngrams 93 | 94 | freq_char_ngrams = calc_ngrams(doc, char_rep_len) 95 | if len(freq_char_ngrams) == 0: 96 | return 0 97 | freq_char_ngrams = list(freq_char_ngrams.values()) 98 | freq_char_ngrams = sorted(freq_char_ngrams, reverse=True) 99 | val_one = len([el for el in freq_char_ngrams if el == 1]) 100 | num_rep_char_ngrams = min( 101 | int(np.sqrt(len(freq_char_ngrams))), 102 | len(freq_char_ngrams) - val_one, 103 | ) 104 | char_rep_ratio = sum(freq_char_ngrams[:num_rep_char_ngrams]) / sum(freq_char_ngrams) 105 | return char_rep_ratio 106 | 107 | # %% ../nbs/01_filter.ipynb 10 108 | def check_char_repetition( 109 | document, # document to be analyzed 110 | char_repetition_len=10, # length of character repetition 111 | char_repetition_threshold=0.2, # threshold for character repetition 112 | dry_run=False, # if True, returns the ratio of character repetition 113 | ) -> bool: # returns True if document is below threshold 114 | """ 115 | Checks if the document is below the character repetition threshold. 116 | """ 117 | char_rep_ratio = _char_rep_ratio(document, char_repetition_len) 118 | if dry_run: 119 | return char_rep_ratio 120 | else: 121 | return char_rep_ratio <= char_repetition_threshold 122 | 123 | # %% ../nbs/01_filter.ipynb 12 124 | def _flag_word_ratio( 125 | doc: str, # document to be analyzed 126 | flagged_words: list, # list of flagged words 127 | get_words_func: callable, # function to get words from document 128 | ) -> float: # returns ratio of flagged words in document 129 | """ 130 | Returns the ratio of flagged words in a document. 131 | """ 132 | words = get_words_func(doc) 133 | if not words: 134 | return 0.0 135 | flagged_words_ratio = len([word for word in words if word in flagged_words]) / len( 136 | words 137 | ) 138 | if flagged_words_ratio > 1.0: 139 | flagged_words_ratio = 1.0 140 | return flagged_words_ratio 141 | 142 | # %% ../nbs/01_filter.ipynb 13 143 | def check_flagged_words( 144 | document: str, # document to be analyzed 145 | flagged_words: list = flagged_words["en"], # list of flagged words 146 | flagged_words_threshold: float = 0.1, # threshold for flagged words 147 | get_words_func: callable = get_words, # function to get words from document 148 | dry_run: bool = False, # if True, returns the ratio of flagged words 149 | ) -> bool: # returns True if document is below threshold unless dry_run is True 150 | """ 151 | Checks if a document contains a high percentage of flagged words. 152 | """ 153 | cond = True 154 | if flagged_words: 155 | flagged_words_ratio = _flag_word_ratio( 156 | document, 157 | flagged_words, 158 | get_words_func, 159 | ) 160 | if dry_run: 161 | return flagged_words_ratio 162 | 163 | cond = flagged_words_ratio <= flagged_words_threshold 164 | return cond 165 | 166 | # %% ../nbs/01_filter.ipynb 16 167 | def check_perplexity( 168 | document, # document to be analyzed 169 | perplexity_threshold=10_000, # threshold for perplexity 170 | model=None, # model to calculate perplexity 171 | dry_run=False, # if True, returns the perplexity of the document 172 | ) -> bool: # returns True if document is below threshold 173 | """ 174 | Checks if the document is below the perplexity threshold. 
175 | """ 176 | perplexity = model.get_perplexity(document) 177 | if dry_run: 178 | return perplexity 179 | else: 180 | return perplexity <= perplexity_threshold 181 | 182 | # %% ../nbs/01_filter.ipynb 19 183 | def check_language( 184 | document, # document to be analyzed 185 | language="en", # language to check 186 | language_threshold=0.9, # threshold for language 187 | model=None, # model to check language 188 | dry_run=False, # if True, returns the language of the document 189 | ) -> bool: # returns True if document is below threshold 190 | """ 191 | Checks if the document is below the language threshold. 192 | """ 193 | lang, prob = model.get_language(document) 194 | if dry_run: 195 | if lang == language: 196 | return prob 197 | else: 198 | return -1.0 199 | else: 200 | return language == lang and prob > language_threshold 201 | 202 | # %% ../nbs/01_filter.ipynb 21 203 | def check_word_number( 204 | document, # document to be analyzed 205 | min_word_threshold=5, # minimum number of words 206 | max_word_threshold=100, # maximum number of words 207 | get_words_func=get_words, # function to get words from document 208 | dry_run=False, # if True, returns the number of words in the document 209 | ) -> bool: # returns True if document is between the minimum and maximum thresholds 210 | """ 211 | Checks if the document is between the minimum and maximum word thresholds. 212 | """ 213 | words = get_words_func(document) 214 | if dry_run: 215 | return len(words) 216 | else: 217 | return len(words) >= min_word_threshold and len(words) <= max_word_threshold 218 | 219 | # %% ../nbs/01_filter.ipynb 23 220 | def check_stop_word_ratio( 221 | document, # document to be analyzed 222 | stop_word_threshold=stopword_ratios["en"], # threshold for stop words 223 | stop_words=stopwords["en"], # list of stop words 224 | get_words_func=get_words, # function to get words from document 225 | dry_run=False, # if True, returns the ratio of stop words in the document 226 | ) -> bool: # returns True if document is below the threshold 227 | """ 228 | Checks if the document contains a high percentage of stop words. 229 | """ 230 | cond = True 231 | if stop_words: 232 | stop_word_ratio = _flag_word_ratio( 233 | document, 234 | stop_words, 235 | get_words_func, 236 | ) 237 | if dry_run: 238 | return stop_word_ratio 239 | else: 240 | cond = stop_word_ratio <= stop_word_threshold 241 | return cond 242 | 243 | # %% ../nbs/01_filter.ipynb 25 244 | def check_code_parsability( 245 | document, # document to be analyzed 246 | program_language="python", # programming language to check 247 | ) -> bool: # returns True if the code is parsable 248 | """ 249 | Checks if the document contains parsable code. 250 | """ 251 | import code_tokenize as ctok 252 | 253 | try: 254 | ctok.tokenize(document, lang=program_language, syntax_error="raise") 255 | return True 256 | except SyntaxError: 257 | return False 258 | 259 | # %% ../nbs/01_filter.ipynb 27 260 | def check_labels( 261 | document, # document to be analyzed 262 | labels: list, # list of labels to check the document against 263 | model=None, # model to check label 264 | dry_run=False, # if True, returns the tags of the document 265 | ) -> bool: # returns True if document relates to any of the labels 266 | """ 267 | Checks if the document relates to any of the labels. 
268 | """ 269 | pred_labels = model(document) 270 | if dry_run: 271 | return pred_labels 272 | else: 273 | return any([label in pred_labels for label in labels]) 274 | 275 | # %% ../nbs/01_filter.ipynb 31 276 | MINHASH_SEED = 115 277 | NON_ALPHA = re.compile("[^A-Za-z_0-9]") 278 | 279 | random.seed(MINHASH_SEED) 280 | 281 | lsh: MinHashLSH = None 282 | dup_ids: Set = None 283 | 284 | # %% ../nbs/01_filter.ipynb 32 285 | def _hash_func( 286 | idx: int, # The index of the record. 287 | content: str, # The content to be hashed. 288 | *, 289 | num_perm: int # The number of permutations to use in the MinHash object. 290 | ) -> dict[str, any]: # The MinHash signature and the index of the record. 291 | """ 292 | Embed the content of a record into a MinHash object. This function should be 293 | used with multiprocessing and it scales well with the number of cores. 294 | >>> result = _hash_func(0, "Hello world!", num_perm=128) 295 | >>> result["__id__"] 296 | 0 297 | >>> result["__signature__"].shape 298 | (128,) 299 | >>> result["__signature__"].dtype 300 | dtype('uint64') 301 | """ 302 | m = MinHash(num_perm=num_perm, seed=MINHASH_SEED) 303 | m.update_batch( 304 | [token.encode("utf-8") for token in {t for t in NON_ALPHA.split(content) if t}] 305 | ) 306 | return {"__signature__": m.hashvalues, "__id__": idx} 307 | 308 | # %% ../nbs/01_filter.ipynb 34 309 | def _query_content( 310 | idx: int, # The index of the record. 311 | signature: np.ndarray, # The MinHash signature of the record to be queried. 312 | *, 313 | index: MinHashLSH # The MinHashLSH index. It is shared across all processes when using multiprocessing with fork without copy. 314 | ) -> dict[str, any]: # The query result. 315 | """ 316 | Query the MinHashLSH index for the record. This function can be used with multiprocessing 317 | as long as the index is shared across processes. 318 | """ 319 | return { 320 | "__neighbors__": [ 321 | dup_idx 322 | for dup_idx in index.query( 323 | LeanMinHash(seed=MINHASH_SEED, hashvalues=signature), 324 | ) 325 | if dup_idx != idx # exclude itself 326 | ], 327 | "__id__": idx, 328 | } 329 | 330 | # %% ../nbs/01_filter.ipynb 36 331 | def _jaccard_similarity( 332 | s1: str, s2: str # The first string to compare. # The second string to compare. 333 | ) -> float: # The Jaccard similarity between the two strings. 334 | """ 335 | Calculate the jaccard similarity between two code snippets. 336 | """ 337 | tokens1 = set([t for t in NON_ALPHA.split(s1) if t.strip()]) 338 | tokens2 = set([t for t in NON_ALPHA.split(s2) if t.strip()]) 339 | return len(tokens1 & tokens2) / max(1, len(tokens1 | tokens2)) 340 | 341 | # %% ../nbs/01_filter.ipynb 38 342 | def _calculate_average_false_positive_rate( 343 | clusters: list[list[int]], # The clusters of duplicate records. 344 | reference_records: Dataset, # The reference records. 345 | threshold: float, # The threshold to use for calculating the false positive rate. 346 | column: str, # The column to use for calculating the false positive rate. 347 | ) -> None: 348 | """ 349 | Calculate the average false positive rate within each cluster. The false positives are defined as 350 | number of examples that have a maximum jaccard similarity with any example in the cluster that is 351 | less than the threshold. The false positive rate is defined as the number of false positives divided 352 | by the number of examples in the cluster. The average false positive rate is defined as the average 353 | of the false positive rate across all clusters given. 
354 | """ 355 | cluster_false_positive_rates: list[float] = [] 356 | deltas: list[float] = [] 357 | 358 | for cluster in tqdm(clusters, desc="Calculating sampling false positive rate..."): 359 | num_false_positives = 0 360 | ids = sorted(cluster) 361 | for i, x in enumerate(ids): 362 | is_false_positive = True 363 | max_similarity = -float("inf") 364 | for j, y in enumerate(ids): 365 | if i == j: 366 | continue 367 | # TODO This can be redundant but we only calculate this for a small sample 368 | similarity = _jaccard_similarity( 369 | reference_records[x][column], reference_records[y][column] 370 | ) 371 | max_similarity = max(max_similarity, similarity) 372 | if max_similarity >= threshold: 373 | is_false_positive = False 374 | break 375 | if is_false_positive: 376 | num_false_positives += 1 377 | deltas.append(threshold - max_similarity) 378 | cluster_false_positive_rates.append(num_false_positives / len(ids)) 379 | 380 | logger.info( 381 | f"Average false positive rate from {len(clusters)} clusters: {np.mean(cluster_false_positive_rates):.2f}" 382 | ) 383 | logger.info(f"Similarity delta stats from threshold:") 384 | logger.info(f"- Max : {np.max(deltas):0.2f}") 385 | logger.info(f"- Min : {np.min(deltas):0.2f}") 386 | logger.info(f"- Mean: {np.mean(deltas):0.2f}") 387 | logger.info(f"- Std : {np.std(deltas):0.2f}") 388 | 389 | # %% ../nbs/01_filter.ipynb 39 390 | def _find_duplicate_communities( 391 | records: Dataset, # The dataset that contains both `__id__` and `__neighbors__`. 392 | community_detection: bool, # Whether to use community detection to find the duplicate communities, or to use the connected components. 393 | report_false_positive_rate: bool = False, # Whether to report the false positive rate. 394 | reference_records: Dataset = None, # The reference records. It can be an iterable or a Dataset. It is only used when `report_false_positive_rate` is True. 395 | threshold: float = 0.85, # The threshold to use for calculating the false positive rate. 396 | column: str = "content", # The column to use for calculating the false positive rate. 397 | verbose: bool = False, 398 | ) -> ( 399 | Set 400 | ): # The set of duplicate ids that should be removed, leaving only one id in each community. 401 | """ 402 | Find the duplicate communities from the queried dataset. 
403 | """ 404 | SAMPLE_MIN_SIZE = 10 405 | SAMPLE_MAX_SIZE = 100 406 | SAMPLE_SIZE = 10 407 | g = nk.graph.Graph() 408 | for record in tqdm(records, desc="Constructing graph..."): 409 | for y in record["__neighbors__"]: 410 | g.addEdge(record["__id__"], y, addMissing=True) 411 | 412 | to_remove: Set = set() 413 | samples: list[list[int]] = [] 414 | if not community_detection: 415 | cc = nk.components.ConnectedComponents(g) 416 | cc.run() 417 | partition = cc.getPartition() 418 | components = list(cc.getComponents()) 419 | random.shuffle(components) 420 | for component in tqdm(components, desc="Iterating over components..."): 421 | component = sorted(component) 422 | to_remove.update(component[1:]) 423 | if ( 424 | len(samples) < SAMPLE_SIZE 425 | and SAMPLE_MAX_SIZE > len(component) >= SAMPLE_MIN_SIZE 426 | ): 427 | samples.append(component[:]) 428 | else: 429 | algo = nk.community.PLM(g, refine=False) 430 | algo.run() 431 | partition = algo.getPartition() 432 | communities = list(partition.getSubsetIds()) 433 | random.shuffle(communities) 434 | # This can be slow if there are many communities 435 | for i in tqdm(communities, desc="Iterating over communities..."): 436 | ids = partition.getMembers(i) 437 | to_remove.update(sorted(ids)[1:]) 438 | if ( 439 | len(samples) < SAMPLE_SIZE 440 | and SAMPLE_MAX_SIZE > len(ids) >= SAMPLE_MIN_SIZE 441 | ): 442 | samples.append(ids) 443 | 444 | if report_false_positive_rate and verbose: 445 | _calculate_average_false_positive_rate( 446 | samples, 447 | reference_records, 448 | threshold, 449 | column, 450 | ) 451 | 452 | return to_remove 453 | 454 | # %% ../nbs/01_filter.ipynb 40 455 | def minhash_dedup( 456 | ds, # The dataset to deduplicate. 457 | column, # The column to use for deduplication. 458 | community_detection: bool = False, # Whether to use community detection to find the duplicate communities, or to use the connected components. 459 | report_false_positive_rate: bool = False, # Whether to report the false positive rate. 460 | threshold: float = 0.85, # The threshold to use for deduplication. 461 | num_perm: int = 128, # The number of permutations to use for minhashing. 462 | dry_run: bool = False, # Whether to run the deduplication in dry run mode. 463 | ) -> Dataset: 464 | """ 465 | Deduplicate the dataset using minhashing as described in the paper "Deduplicating Training Data Makes Language Models Better". 
466 | """ 467 | global lsh 468 | global dup_ids 469 | 470 | lsh = MinHashLSH( 471 | threshold=threshold, 472 | num_perm=num_perm, 473 | ) 474 | column_names = ds.column_names 475 | ds = ds.map( 476 | lambda _, idx: {"__id__": idx}, 477 | with_indices=True, 478 | num_proc=os.cpu_count(), 479 | desc="Adding index...", 480 | ) 481 | hashed_ds = ds.map( 482 | function=_hash_func, 483 | fn_kwargs={"num_perm": num_perm}, 484 | input_columns=["__id__", column], 485 | remove_columns=column_names, 486 | num_proc=os.cpu_count(), 487 | desc=f"Fingerprinting...", 488 | ) 489 | with lsh.insertion_session() as session: 490 | for data in tqdm(hashed_ds, desc="Indexing signatures..."): 491 | if data["__id__"] in lsh: 492 | continue 493 | session.insert( 494 | data["__id__"], 495 | LeanMinHash(seed=MINHASH_SEED, hashvalues=data["__signature__"]), 496 | check_duplication=False, 497 | ) 498 | 499 | gc.disable() 500 | gc.freeze() 501 | 502 | conf = { 503 | "threshold": threshold, 504 | "community_detection": community_detection, 505 | "report_false_positive_rate": report_false_positive_rate, 506 | "num_perm": num_perm, 507 | "name": ds.builder_name, 508 | "column": column, 509 | } 510 | queried = hashed_ds.map( 511 | lambda x, y: _query_content(x, y, index=lsh), 512 | num_proc=os.cpu_count(), 513 | features=Features( 514 | { 515 | "__id__": Value(dtype="int64", id=None), 516 | "__neighbors__": Sequence( 517 | feature=Value(dtype="int64", id=None), length=-1, id=None 518 | ), 519 | } 520 | ), 521 | input_columns=["__id__", "__signature__"], 522 | remove_columns=["__signature__"], 523 | desc=f"Querying...", 524 | ) 525 | 526 | del lsh 527 | gc.collect() 528 | 529 | queried = queried.filter( 530 | lambda x: len(x["__neighbors__"]) > 0, 531 | num_proc=os.cpu_count(), 532 | desc="Finding duplicates...", 533 | ) 534 | dup_ids = _find_duplicate_communities( 535 | records=queried, 536 | community_detection=conf["community_detection"], 537 | report_false_positive_rate=conf["report_false_positive_rate"], 538 | reference_records=ds, 539 | threshold=conf["threshold"], 540 | column=conf["column"], 541 | ) 542 | 543 | del queried 544 | gc.collect() 545 | 546 | if dry_run: 547 | final_data = ds.map( 548 | lambda idx: {"duplicate": idx in dup_ids}, 549 | input_columns=["__id__"], 550 | num_proc=os.cpu_count(), 551 | desc="Labeling duplicates...", 552 | ) 553 | else: 554 | final_data = ds.filter( 555 | lambda idx: idx not in dup_ids, 556 | input_columns=["__id__"], 557 | num_proc=os.cpu_count(), 558 | desc="Filtering duplicates...", 559 | ) 560 | return final_data 561 | --------------------------------------------------------------------------------