├── squeakily ├── __init__.py ├── core.py ├── clean.py ├── _modidx.py └── filter.py ├── MANIFEST.in ├── index_files └── figure-commonmark │ └── mermaid-figure-1.png ├── .github ├── workflows │ ├── test.yaml │ └── deploy.yaml └── ISSUE_TEMPLATE │ ├── feature_request.md │ └── bug_report.md ├── nbs ├── sidebar.yml ├── nbdev.yml ├── _quarto.yml ├── styles.css ├── 04_tutorials.ipynb ├── 02_clean.ipynb └── 01_filter.ipynb ├── settings.ini ├── .gitignore ├── setup.py ├── CODE_OF_CONDUCT.md ├── README.md └── LICENSE /squeakily/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.0.3" 2 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include settings.ini 2 | include LICENSE 3 | include CONTRIBUTING.md 4 | include README.md 5 | recursive-exclude * __pycache__ 6 | -------------------------------------------------------------------------------- /index_files/figure-commonmark/mermaid-figure-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CarperAI/squeakily/HEAD/index_files/figure-commonmark/mermaid-figure-1.png -------------------------------------------------------------------------------- /.github/workflows/test.yaml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: [workflow_dispatch, pull_request, push] 3 | 4 | jobs: 5 | test: 6 | runs-on: ubuntu-latest 7 | steps: [uses: fastai/workflows/nbdev-ci@master] 8 | -------------------------------------------------------------------------------- /nbs/sidebar.yml: -------------------------------------------------------------------------------- 1 | website: 2 | sidebar: 3 | contents: 4 | - index.ipynb 5 | - 00_core.ipynb 6 | - 01_filter.ipynb 7 | - 02_clean.ipynb 8 | - 03_helpers.ipynb 9 | - 04_tutorials.ipynb 10 | -------------------------------------------------------------------------------- /.github/workflows/deploy.yaml: -------------------------------------------------------------------------------- 1 | name: Deploy to GitHub Pages 2 | on: 3 | push: 4 | branches: [ "main", "master" ] 5 | workflow_dispatch: 6 | jobs: 7 | deploy: 8 | runs-on: ubuntu-latest 9 | steps: [uses: fastai/workflows/quarto-ghp@master] 10 | -------------------------------------------------------------------------------- /nbs/nbdev.yml: -------------------------------------------------------------------------------- 1 | project: 2 | output-dir: _docs 3 | 4 | website: 5 | title: "squeakily" 6 | site-url: "https://CarperAI.github.io/squeakily" 7 | description: "A library for squeakily cleaning and filtering language datasets." 
8 | repo-branch: main 9 | repo-url: "https://github.com/CarperAI/squeakily" 10 | -------------------------------------------------------------------------------- /nbs/_quarto.yml: -------------------------------------------------------------------------------- 1 | project: 2 | type: website 3 | 4 | format: 5 | html: 6 | theme: cosmo 7 | css: styles.css 8 | toc: true 9 | 10 | website: 11 | twitter-card: true 12 | open-graph: true 13 | repo-actions: [issue] 14 | navbar: 15 | background: primary 16 | search: true 17 | sidebar: 18 | style: floating 19 | 20 | metadata-files: [nbdev.yml, sidebar.yml] -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /nbs/styles.css: -------------------------------------------------------------------------------- 1 | .cell { 2 | margin-bottom: 1rem; 3 | } 4 | 5 | .cell > .sourceCode { 6 | margin-bottom: 0; 7 | } 8 | 9 | .cell-output > pre { 10 | margin-bottom: 0; 11 | } 12 | 13 | .cell-output > pre, .cell-output > .sourceCode > pre, .cell-output-stdout > pre { 14 | margin-left: 0.8rem; 15 | margin-top: 0; 16 | background: none; 17 | border-left: 2px solid lightsalmon; 18 | border-top-left-radius: 0; 19 | border-top-right-radius: 0; 20 | } 21 | 22 | .cell-output > .sourceCode { 23 | border: none; 24 | } 25 | 26 | .cell-output > .sourceCode { 27 | background: none; 28 | margin-top: 0; 29 | } 30 | 31 | div.description { 32 | padding-left: 2px; 33 | padding-top: 5px; 34 | font-style: italic; 35 | font-size: 135%; 36 | opacity: 70%; 37 | } 38 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Browser [e.g. chrome, safari] 29 | - Version [e.g. 22] 30 | 31 | **Smartphone (please complete the following information):** 32 | - Device: [e.g. iPhone6] 33 | - OS: [e.g. iOS8.1] 34 | - Browser [e.g. stock browser, safari] 35 | - Version [e.g. 
22] 36 | 37 | **Additional context** 38 | Add any other context about the problem here. 39 | -------------------------------------------------------------------------------- /settings.ini: -------------------------------------------------------------------------------- 1 | [DEFAULT] 2 | repo = squeakily 3 | lib_name = squeakily 4 | version = 0.0.3 5 | min_python = 3.7 6 | license = apache2 7 | doc_path = _docs 8 | lib_path = squeakily 9 | nbs_path = nbs 10 | recursive = True 11 | tst_flags = notest 12 | put_version_in_init = True 13 | branch = main 14 | custom_sidebar = False 15 | doc_host = https://CarperAI.github.io 16 | doc_baseurl = /squeakily 17 | git_url = https://github.com/CarperAI/squeakily 18 | title = squeakily 19 | audience = Developers 20 | author = ncoop57 21 | author_email = nathan.cooper@stability.ai 22 | copyright = 2022 onwards, ncoop57 23 | description = A library for squeakily cleaning and filtering language datasets. 24 | keywords = nbdev jupyter notebook python 25 | language = English 26 | status = 3 27 | user = CarperAI 28 | requirements = datasketch==1.5.8 datasets==2.7.1 Faker==15.3.3 fastcore huggingface-hub networkit pydantic rich ftfy scikit-learn 29 | dev_requirements = BeautifulSoup4 fasttext nbdev scrubadub twine sentencepiece code-tokenize langchain==0.0.212 openai code-ast 30 | black_formatting = False 31 | readme_nb = index.ipynb 32 | allowed_metadata_keys = 33 | allowed_cell_metadata_keys = 34 | jupyter_hooks = True 35 | clean_ids = True 36 | clear_all = False 37 | 38 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.bin 2 | pilev2/ 3 | _docs/ 4 | _proc/ 5 | 6 | *.bak 7 | .gitattributes 8 | .last_checked 9 | .gitconfig 10 | *.bak 11 | *.log 12 | *~ 13 | ~* 14 | _tmp* 15 | tmp* 16 | tags 17 | *.pkg 18 | 19 | # Byte-compiled / optimized / DLL files 20 | __pycache__/ 21 | *.py[cod] 22 | *$py.class 23 | 24 | # C extensions 25 | *.so 26 | 27 | # Distribution / packaging 28 | .Python 29 | env/ 30 | build/ 31 | develop-eggs/ 32 | dist/ 33 | downloads/ 34 | eggs/ 35 | .eggs/ 36 | lib/ 37 | lib64/ 38 | parts/ 39 | sdist/ 40 | var/ 41 | wheels/ 42 | *.egg-info/ 43 | .installed.cfg 44 | *.egg 45 | 46 | # PyInstaller 47 | # Usually these files are written by a python script from a template 48 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
49 | *.manifest 50 | *.spec 51 | 52 | # Installer logs 53 | pip-log.txt 54 | pip-delete-this-directory.txt 55 | 56 | # Unit test / coverage reports 57 | htmlcov/ 58 | .tox/ 59 | .coverage 60 | .coverage.* 61 | .cache 62 | nosetests.xml 63 | coverage.xml 64 | *.cover 65 | .hypothesis/ 66 | 67 | # Translations 68 | *.mo 69 | *.pot 70 | 71 | # Django stuff: 72 | *.log 73 | local_settings.py 74 | 75 | # Flask stuff: 76 | instance/ 77 | .webassets-cache 78 | 79 | # Scrapy stuff: 80 | .scrapy 81 | 82 | # Sphinx documentation 83 | docs/_build/ 84 | 85 | # PyBuilder 86 | target/ 87 | 88 | # Jupyter Notebook 89 | .ipynb_checkpoints 90 | 91 | # pyenv 92 | .python-version 93 | 94 | # celery beat schedule file 95 | celerybeat-schedule 96 | 97 | # SageMath parsed files 98 | *.sage.py 99 | 100 | # dotenv 101 | .env 102 | 103 | # virtualenv 104 | .venv 105 | venv/ 106 | ENV/ 107 | 108 | # Spyder project settings 109 | .spyderproject 110 | .spyproject 111 | 112 | # Rope project settings 113 | .ropeproject 114 | 115 | # mkdocs documentation 116 | /site 117 | 118 | # mypy 119 | .mypy_cache/ 120 | 121 | .vscode 122 | *.swp 123 | 124 | # osx generated files 125 | .DS_Store 126 | .DS_Store? 127 | .Trashes 128 | ehthumbs.db 129 | Thumbs.db 130 | .idea 131 | 132 | # pytest 133 | .pytest_cache 134 | 135 | # tools/trust-doc-nbs 136 | docs_src/.last_checked 137 | 138 | # symlinks to fastai 139 | docs_src/fastai 140 | tools/fastai 141 | 142 | # link checker 143 | checklink/cookies.txt 144 | 145 | # .gitconfig is now autogenerated 146 | .gitconfig 147 | 148 | # Quarto installer 149 | .deb 150 | .pkg 151 | 152 | # Quarto 153 | .quarto 154 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from pkg_resources import parse_version 2 | from configparser import ConfigParser 3 | import setuptools 4 | assert parse_version(setuptools.__version__)>=parse_version('36.2') 5 | 6 | # note: all settings are in settings.ini; edit there, not here 7 | config = ConfigParser(delimiters=['=']) 8 | config.read('settings.ini') 9 | cfg = config['DEFAULT'] 10 | 11 | cfg_keys = 'version description keywords author author_email'.split() 12 | expected = cfg_keys + "lib_name user branch license status min_python audience language".split() 13 | for o in expected: assert o in cfg, "missing expected setting: {}".format(o) 14 | setup_cfg = {o:cfg[o] for o in cfg_keys} 15 | 16 | licenses = { 17 | 'apache2': ('Apache Software License 2.0','OSI Approved :: Apache Software License'), 18 | 'mit': ('MIT License', 'OSI Approved :: MIT License'), 19 | 'gpl2': ('GNU General Public License v2', 'OSI Approved :: GNU General Public License v2 (GPLv2)'), 20 | 'gpl3': ('GNU General Public License v3', 'OSI Approved :: GNU General Public License v3 (GPLv3)'), 21 | 'bsd3': ('BSD License', 'OSI Approved :: BSD License'), 22 | } 23 | statuses = [ '1 - Planning', '2 - Pre-Alpha', '3 - Alpha', 24 | '4 - Beta', '5 - Production/Stable', '6 - Mature', '7 - Inactive' ] 25 | py_versions = '3.6 3.7 3.8 3.9 3.10'.split() 26 | 27 | requirements = cfg.get('requirements','').split() 28 | if cfg.get('pip_requirements'): requirements += cfg.get('pip_requirements','').split() 29 | min_python = cfg['min_python'] 30 | lic = licenses.get(cfg['license'].lower(), (cfg['license'], None)) 31 | dev_requirements = (cfg.get('dev_requirements') or '').split() 32 | 33 | setuptools.setup( 34 | name = cfg['lib_name'], 35 | license = lic[0], 36 | classifiers = [ 37 | 
'Development Status :: ' + statuses[int(cfg['status'])], 38 | 'Intended Audience :: ' + cfg['audience'].title(), 39 | 'Natural Language :: ' + cfg['language'].title(), 40 | ] + ['Programming Language :: Python :: '+o for o in py_versions[py_versions.index(min_python):]] + (['License :: ' + lic[1] ] if lic[1] else []), 41 | url = cfg['git_url'], 42 | packages = setuptools.find_packages(), 43 | include_package_data = True, 44 | install_requires = requirements, 45 | extras_require={ 'dev': dev_requirements }, 46 | dependency_links = cfg.get('dep_links','').split(), 47 | python_requires = '>=' + cfg['min_python'], 48 | long_description = open('README.md').read(), 49 | long_description_content_type = 'text/markdown', 50 | zip_safe = False, 51 | entry_points = { 52 | 'console_scripts': cfg.get('console_scripts','').split(), 53 | 'nbdev': [f'{cfg.get("lib_path")}={cfg.get("lib_path")}._modidx:d'] 54 | }, 55 | **setup_cfg) 56 | 57 | 58 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, religion, or sexual identity 10 | and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the 26 | overall community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or 31 | advances of any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email 35 | address, without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 
50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at 63 | louis@stability.ai. 64 | All complaints will be reviewed and investigated promptly and fairly. 65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. Warning 84 | 85 | **Community Impact**: A violation through a single incident or series 86 | of actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. Violating these terms may lead to a temporary or 93 | permanent ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within 113 | the community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.0, available at 119 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. 120 | 121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct 122 | enforcement ladder](https://github.com/mozilla/diversity). 123 | 124 | [homepage]: https://www.contributor-covenant.org 125 | 126 | For answers to common questions about this code of conduct, see the FAQ at 127 | https://www.contributor-covenant.org/faq. Translations are available at 128 | https://www.contributor-covenant.org/translations. 
129 | -------------------------------------------------------------------------------- /squeakily/core.py: -------------------------------------------------------------------------------- 1 | # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/00_core.ipynb. 2 | 3 | # %% auto 0 4 | __all__ = ['logger', 'Pipeline'] 5 | 6 | # %% ../nbs/00_core.ipynb 2 7 | import logging 8 | import os 9 | 10 | from datasets import concatenate_datasets, Dataset 11 | from rich.logging import RichHandler 12 | 13 | logger = logging.getLogger(__name__) 14 | logger.setLevel(logging.INFO) 15 | logger.addHandler(RichHandler(rich_tracebacks=True)) 16 | # Turn off logging for datasets 17 | logging.getLogger("datasets").setLevel(logging.ERROR) 18 | 19 | # %% ../nbs/00_core.ipynb 5 20 | class Pipeline: 21 | """ 22 | A pipeline is a collection of datasources and their associated transformations to be run. 23 | """ 24 | 25 | def __init__(self, datasources): # The datasources to be run 26 | self.datasources = datasources 27 | 28 | def __run_filter(self, dataset, column, filter_fn, dry_run, num_proc): 29 | """ 30 | Run a filter on a dataset. 31 | """ 32 | name = filter_fn.__name__ 33 | logger.info(f"Running filter: {name} on {column}") 34 | if dry_run: 35 | logger.info(f"Running in dry-run mode") 36 | return dataset.map( 37 | lambda x: {f"{name}_criteria": filter_fn(x[column], dry_run=True)}, 38 | num_proc=num_proc, 39 | ) 40 | else: 41 | return dataset.filter( 42 | lambda x: filter_fn(x[column]), 43 | num_proc=num_proc, 44 | ) 45 | 46 | def run( 47 | self, 48 | global_filters=[], # Filters to be run at the dataset level rather than the example level 49 | global_cleaners=[], # Cleaners to be run at the dataset level rather than the example level 50 | cleaning_first=False, # Whether to run the cleaning transformations first 51 | globals_first=False, # Whether to run the global transformations first 52 | dry_run=False, # Whether to run the pipeline or only calculate the various criteria and add as a column 53 | num_proc=os.cpu_count(), # Number of processes to use 54 | ): 55 | """ 56 | Run the pipeline. 
57 | """ 58 | for i in range(len(self.datasources)): 59 | column = self.datasources[i]["columns"][0] 60 | logger.info(f"Running datasource: {self.datasources[i]['name']}") 61 | if cleaning_first: 62 | for c in self.datasources[i]["cleaners"]: 63 | name = c.__name__ 64 | logger.info(f"Running cleaner: {name} on {column}") 65 | self.datasources[i]["dataset"] = self.datasources[i]["dataset"].map( 66 | lambda x: {column: c(x[column])}, 67 | num_proc=num_proc, 68 | ) 69 | for f in self.datasources[i]["filters"]: 70 | self.datasources[i]["dataset"] = self.__run_filter( 71 | self.datasources[i]["dataset"], column, f, dry_run, num_proc 72 | ) 73 | else: 74 | for f in self.datasources[i]["filters"]: 75 | self.datasources[i]["dataset"] = self.__run_filter( 76 | self.datasources[i]["dataset"], column, f, dry_run, num_proc 77 | ) 78 | for c in self.datasources[i]["cleaners"]: 79 | name = c.__name__ 80 | logger.info(f"Running cleaner: {name} on {column}") 81 | self.datasources[i]["dataset"] = self.datasources[i]["dataset"].map( 82 | lambda x: {column: c(x[column])}, 83 | num_proc=num_proc, 84 | ) 85 | 86 | if len(global_filters) > 0: 87 | # concatenate all datasets 88 | datasets = [ 89 | d["dataset"] 90 | for d in self.datasources 91 | if not d.get("skip_global", False) 92 | ] 93 | global_column = self.datasources[0]["columns"][0] 94 | global_dataset = concatenate_datasets(datasets) 95 | 96 | # Add a column representing the original dataset name 97 | md = [] 98 | for d in self.datasources: 99 | if not d.get("skip_global", False): 100 | md.extend([d["name"]] * len(d["dataset"])) 101 | meta_data = Dataset.from_dict({"meta_data": md}) 102 | global_dataset_with_meta = concatenate_datasets( 103 | [global_dataset, meta_data], axis=1 104 | ) 105 | 106 | # Run the global filters 107 | for f in global_filters: 108 | logger.info(f"Running global filter: {f.__name__}") 109 | global_dataset_with_meta = f( 110 | global_dataset_with_meta, global_column, dry_run=dry_run 111 | ) 112 | 113 | # Split the dataset back up 114 | for i, d in enumerate(self.datasources): 115 | if not d.get("skip_global", False): 116 | self.datasources[i]["dataset"] = global_dataset_with_meta.filter( 117 | lambda x: x["meta_data"] == d["name"], 118 | num_proc=num_proc, 119 | ) 120 | 121 | def export_to_path(self, export_path, output_type="csv"): 122 | """ 123 | Export the cleaned & filtered dataset to a desired export path 124 | 125 | Args: 126 | export_path(str): Path to directory 127 | output_type(str, optional param): Output type of the file to export as 128 | """ 129 | try: 130 | os.makedirs(export_path, exist_ok=True) 131 | except OSError as e: 132 | logger.error(f"Failed to create directory: {export_path}. Error: {str(e)}") 133 | return 134 | 135 | for i, datasource in enumerate(self.datasources): 136 | name = datasource["name"] 137 | filename = f"{name}.csv" 138 | filepath = os.path.join(export_path, filename) 139 | try: 140 | if output_type == "csv": 141 | datasource["dataset"].to_csv(filepath, index=False) 142 | elif output_type == "json": 143 | datasource["dataset"].to_json(filepath, index=False) 144 | else: 145 | logger.error( 146 | f"Invalid output_type: {output_type}. Skipping export for {name} dataset." 147 | ) 148 | logger.info(f"Exported {name} dataset to {filepath}") 149 | except Exception as e: 150 | logger.error( 151 | f"Failed to export {name} dataset to {filepath}. 
Error: {str(e)}" 152 | ) 153 | -------------------------------------------------------------------------------- /squeakily/clean.py: -------------------------------------------------------------------------------- 1 | # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/02_clean.ipynb. 2 | 3 | # %% auto 0 4 | __all__ = ['fake', 'whitespace', 'unicode_punctuation', 'normalize_whitespace', 'normalize_punctuation', 'remove_empty_lines', 5 | 'replace_urls', 'replace_dates', 'replace_email', 'replace_phone', 'replace_ip', 'replace_credit_card', 6 | 'replace_ssn', 'fix_utf8_encoding', 'clean_code_license'] 7 | 8 | # %% ../nbs/02_clean.ipynb 2 9 | import re 10 | from faker import Faker 11 | import ftfy 12 | 13 | fake = Faker() 14 | 15 | # %% ../nbs/02_clean.ipynb 4 16 | # From: https://github.com/bigscience-workshop/data-preparation/blob/main/preprocessing/training/01b_oscar_cleaning_and_filtering/filtering.py#L95 17 | whitespace = { 18 | " ", 19 | " ", 20 | " ", 21 | " ", 22 | " ", 23 | " ", 24 | " ", 25 | " ", 26 | " ", 27 | " ", 28 | "", 29 | "", 30 | } 31 | 32 | 33 | def normalize_whitespace( 34 | text: str, # The text to normalize 35 | ) -> str: # The normalized text 36 | """ 37 | Replace the various whitespace characters with the standard one. 38 | """ 39 | text = "".join([char if char not in whitespace else " " for char in text]) 40 | return text 41 | 42 | # %% ../nbs/02_clean.ipynb 6 43 | unicode_punctuation = { 44 | ",": ",", 45 | "。": ".", 46 | "、": ",", 47 | "„": '"', 48 | "”": '"', 49 | "“": '"', 50 | "«": '"', 51 | "»": '"', 52 | "1": '"', 53 | "」": '"', 54 | "「": '"', 55 | "《": '"', 56 | "》": '"', 57 | "´": "'", 58 | "∶": ":", 59 | ":": ":", 60 | "?": "?", 61 | "!": "!", 62 | "(": "(", 63 | ")": ")", 64 | ";": ";", 65 | "–": "-", 66 | "—": " - ", 67 | ".": ". ", 68 | "~": "~", 69 | "’": "'", 70 | "…": "...", 71 | "━": "-", 72 | "〈": "<", 73 | "〉": ">", 74 | "【": "[", 75 | "】": "]", 76 | "%": "%", 77 | "►": "-", 78 | } 79 | 80 | 81 | def normalize_punctuation( 82 | text: str, # The text to normalize 83 | ) -> str: # The normalized text 84 | """ 85 | Replace the various unicode punctuation characters with the standard ones. 86 | """ 87 | text = "".join([unicode_punctuation.get(char, char) for char in text]) 88 | return text 89 | 90 | # %% ../nbs/02_clean.ipynb 8 91 | def remove_empty_lines( 92 | text: str, # The text to remove empty lines from 93 | ) -> str: # The text with empty lines removed 94 | """ 95 | Remove empty lines from the text. 
96 | Solution from https://stackoverflow.com/a/3711884/5768407 97 | """ 98 | lines = text.splitlines() 99 | filtered = filter(lambda x: not re.match(r"^\s*$", x), lines) 100 | return "\n".join(filtered) 101 | 102 | # %% ../nbs/02_clean.ipynb 10 103 | def replace_urls( 104 | text: str, # The text to replace URLs in 105 | dummy: str = "https://example.com/", # The dummy text to replace URLs with 106 | ) -> str: # The text with URLs replaced 107 | """Replace urls from text with a dummy.""" 108 | return re.sub(r"http\S+", dummy, text) 109 | 110 | # %% ../nbs/02_clean.ipynb 12 111 | def replace_dates( 112 | text: str, # The text to remove dates from 113 | dummy: str = fake.date(), # The dummy text to replace dates with 114 | ) -> str: # The text with dates replaced 115 | """Replace dates from text with a dummy.""" 116 | return re.sub(r"\d{1,2}/\d{1,2}/\d{4}", dummy, text) 117 | 118 | # %% ../nbs/02_clean.ipynb 15 119 | def replace_email( 120 | text: str, # The text to replace email addresses in 121 | dummy: str = fake.email(), # The dummy text to replace email addresses with 122 | ) -> str: # The text with email addresses replaced 123 | """Replace email addresses from text with a dummy.""" 124 | return re.sub(r"[\w\.-]+@[\w\.-]+", dummy, text) 125 | 126 | # %% ../nbs/02_clean.ipynb 17 127 | def replace_phone( 128 | text: str, # The text to replace phone numbers in 129 | dummy: str = fake.phone_number(), # The dummy text to replace phone numbers with 130 | ) -> str: # The text with phone numbers replaced 131 | """Replace phone numbers from text with a dummy.""" 132 | return re.sub(r"\(?\d{3}\)?-? *\d{3}-? *-?\d{4}", dummy, text) 133 | 134 | # %% ../nbs/02_clean.ipynb 19 135 | def replace_ip( 136 | text, # The text to replace ip addresses in 137 | dummy1: str = fake.ipv4(), # The dummy text to replace ipv4 addresses with 138 | dummy2: str = fake.ipv6(), # The dummy text to replace ipv6 addresses with 139 | ) -> str: # The text with ip addresses replaced 140 | """ 141 | Replace ip addresses from text with a dummy. 
142 | Solution from https://github.com/bigcode-project/bigcode-analysis/blob/main/data_analysis/pii/utils/emails_ip_addresses_detection.py#L48 143 | """ 144 | ipv4_pattern = r"(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(?:\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}" 145 | text = re.sub(ipv4_pattern, dummy1, text) 146 | ipv6_pattern = r"(?:[0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(?:(?::[0-9a-fA-F]{1,4}){1,6})|:(?:(?::[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(?::[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(?:ffff(?::0{1,4}){0,1}:){0,1}(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])|(?:[0-9a-fA-F]{1,4}:){1,4}:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])" 147 | text = re.sub(ipv6_pattern, dummy2, text) 148 | return text 149 | 150 | # %% ../nbs/02_clean.ipynb 21 151 | def replace_credit_card( 152 | text: str, # The text to replace credit card numbers in 153 | dummy: str = fake.credit_card_number(), # The dummy text to replace credit card numbers with 154 | ) -> str: # The text with credit card numbers replaced 155 | """Replace credit card numbers from text with a dummy.""" 156 | return re.sub(r"\d{4}-\d{4}-\d{4}-\d{4}", dummy, text) 157 | 158 | # %% ../nbs/02_clean.ipynb 23 159 | def replace_ssn( 160 | text: str, # The text to replace social security numbers in 161 | dummy: str = fake.ssn(), # The dummy text to replace social security numbers with 162 | ) -> str: # The text with social security numbers replaced 163 | """Replace social security numbers from text with a dummy.""" 164 | return re.sub(r"\d{3}-\d{2}-\d{4}", dummy, text) 165 | 166 | # %% ../nbs/02_clean.ipynb 25 167 | def fix_utf8_encoding( 168 | text: str, # The text to fix 169 | ) -> str: # The fixed text 170 | """Fix utf8 text using ftfy.""" 171 | return ftfy.fix_text(text) 172 | 173 | # %% ../nbs/02_clean.ipynb 27 174 | def clean_code_license( 175 | code: str, # The code to clean 176 | language: str = "python", # The language of the code 177 | min_lines: int = 3, # The minimum number of lines that need to be removed 178 | ): 179 | import code_ast 180 | from code_ast import ASTVisitor 181 | from code_ast.ast import LEAVE_WHITELIST 182 | 183 | class FirstNonCommentVisitor(ASTVisitor): 184 | def __init__(self): 185 | self.passed_global_node = False 186 | self.first_node = None 187 | 188 | def visit(self, node): 189 | if not self.passed_global_node: 190 | self.passed_global_node = True 191 | return 192 | if self.first_node is None: 193 | if node.child_count > 0 or node.type in LEAVE_WHITELIST: 194 | self.first_node = node 195 | 196 | """Remove the license or other boilerplate comments from the code.""" 197 | ast = code_ast.ast(code, lang=language) 198 | visitor = FirstNonCommentVisitor() 199 | ast.visit(visitor) 200 | start_line = visitor.first_node.start_point[0] 201 | if start_line < min_lines: 202 | return code 203 | else: 204 | return "\n".join(code.splitlines()[start_line:]) 205 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # squeakily 2 | 3 | 4 | 5 | This repository is heavily inspired by BigScience’s [ROOTs 6 
| project](https://github.com/bigscience-workshop/data-preparation) and 7 | EleutherAI’s [The Pile](https://github.com/EleutherAI/the-pile). 8 | 9 | The overall pipeline is as follows: 10 | 11 | ``` mermaid 12 | flowchart LR 13 | A(Defining <br/>Datasources) --> B(Defining Filters <br/>per Datasource) 14 | B --> C(Defining Cleaners <br/>per Datasource) 15 | ``` 16 | 17 | In this library, we define filtering as removing data instances from 18 | the dataset based on some criteria, and cleaning as modifying data 19 | instances in some way. 20 | 21 | ## Install 22 | 23 | ``` sh 24 | pip install squeakily 25 | ``` 26 | 27 | ## How to use 28 | 29 | ### Using the API 30 | 31 | First, we need to define a datasource. `squeakily` accepts any `Dataset` 32 | object from the [HuggingFace 33 | Datasets](https://huggingface.co/docs/datasets/index) library. For 34 | example, we can use the 35 | [wikitext](https://huggingface.co/datasets/wikitext) dataset: 36 | 37 | ``` python 38 | from datasets import load_dataset 39 | 40 | ds = load_dataset("wikitext", "wikitext-103-v1", split="train[:1%]") 41 | ``` 42 | 43 | We simply need to wrap each `Dataset` object in a dictionary that also 44 | gives the datasource a name and lists the columns to process and the 45 | filters and cleaners to apply. For example: 46 | 47 | ``` python 48 | from squeakily.filter import check_char_repetition, check_flagged_words 49 | from squeakily.clean import remove_empty_lines, normalize_whitespace 50 | 51 | datasources = [ 52 | { 53 | "dataset": ds, 54 | "name": "wikitext", 55 | "columns": ["text"], 56 | "filters": [check_char_repetition, check_flagged_words], 57 | "cleaners": [remove_empty_lines, normalize_whitespace], 58 | }, 59 | # ... 60 | ] 61 | ``` 62 | 63 |
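With the datasources defined, the only remaining step is to build a `Pipeline` and run it. The following is a minimal sketch of that step (it mirrors the tutorial notebook included in this repository); the log output below comes from such a run:

``` python
from squeakily.core import Pipeline

# Run every filter and cleaner on its datasource
pipeline = Pipeline(datasources)
pipeline.run()
```

`run()` also accepts `dry_run=True`, which records each filter's criterion as an extra column instead of dropping rows, and `num_proc` to control the number of worker processes (see `squeakily/core.py`).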
[11/16/22 04:32:57] INFO Running datasource: wikitext core.py:41 95 |96 |
INFO Running filter: check_char_repetition on text core.py:54 97 |98 |
INFO Running filter: check_flagged_words on text core.py:54 99 |100 |
INFO Running cleaner: remove_empty_lines on text core.py:57 101 |102 |
[11/16/22 04:32:59] INFO Running cleaner: normalize_whitespace on text core.py:57 103 |104 | 105 |
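Dataset-level transformations are supported as well: `Pipeline.run` takes a `global_filters` argument whose functions (for example `minhash_dedup` from `squeakily.filter`) operate on the concatenation of all datasources rather than on individual examples, and `Pipeline.export_to_path` writes each processed dataset back out. A minimal sketch, reusing the `pipeline` from above (the output directory name here is only illustrative):

``` python
from squeakily.filter import minhash_dedup

# Near-duplicate removal across all datasources via MinHash LSH,
# then export every cleaned dataset as a CSV file.
pipeline.run(global_filters=[minhash_dedup])
pipeline.export_to_path("processed_data", output_type="csv")
```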
[11/16/22 04:50:08] INFO Running datasource: wikitext core.py:41\n", 145 | "\n" 146 | ], 147 | "text/plain": [ 148 | "\u001b[2;36m[11/16/22 04:50:08]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Running datasource: wikitext \u001b]8;id=538643;file:///fsx/home-nathan/work/squeakily/squeakily/core.py\u001b\\\u001b[2mcore.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=230902;file:///fsx/home-nathan/work/squeakily/squeakily/core.py#41\u001b\\\u001b[2m41\u001b[0m\u001b]8;;\u001b\\\n" 149 | ] 150 | }, 151 | "metadata": {}, 152 | "output_type": "display_data" 153 | }, 154 | { 155 | "data": { 156 | "text/html": [ 157 | "
INFO Running cleaner: clean on text core.py:57\n", 158 | "\n" 159 | ], 160 | "text/plain": [ 161 | "\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Running cleaner: clean on text \u001b]8;id=441718;file:///fsx/home-nathan/work/squeakily/squeakily/core.py\u001b\\\u001b[2mcore.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=808891;file:///fsx/home-nathan/work/squeakily/squeakily/core.py#57\u001b\\\u001b[2m57\u001b[0m\u001b]8;;\u001b\\\n" 162 | ] 163 | }, 164 | "metadata": {}, 165 | "output_type": "display_data" 166 | }, 167 | { 168 | "name": "stderr", 169 | "output_type": "stream", 170 | "text": [ 171 | "#0: 0%| | 0/251 [00:00, ?ex/s]\n", 172 | "\u001b[A\n", 173 | "\n", 174 | "\u001b[A\u001b[A\n", 175 | "\n", 176 | "\n", 177 | "\u001b[A\u001b[A\u001b[A\n", 178 | "\n", 179 | "\n", 180 | "\n", 181 | "#0: 100%|██████████| 251/251 [00:00<00:00, 3072.01ex/s]\n", 182 | "\n", 183 | "\n", 184 | "\n", 185 | "\n", 186 | "\n", 187 | "#6: 0%| | 0/251 [00:00, ?ex/s]\n", 188 | "\n", 189 | "\n", 190 | "\n", 191 | "\n", 192 | "\n", 193 | "#1: 100%|██████████| 251/251 [00:00<00:00, 2612.54ex/s]\n", 194 | "#2: 100%|██████████| 251/251 [00:00<00:00, 2855.57ex/s]\n", 195 | "\n", 196 | "\n", 197 | "\n", 198 | "\n", 199 | "\n", 200 | "\n", 201 | "\n", 202 | "#3: 100%|██████████| 251/251 [00:00<00:00, 2935.28ex/s]\n", 203 | "#4: 100%|██████████| 251/251 [00:00<00:00, 3264.68ex/s]\n", 204 | "\n", 205 | "\n", 206 | "\n", 207 | "\n", 208 | "\n", 209 | "\n", 210 | "\n", 211 | "\n", 212 | "\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n", 213 | "\n", 214 | "\n", 215 | "\n", 216 | "\n", 217 | "\n", 218 | "\n", 219 | "\n", 220 | "\n", 221 | "#10: 0%| | 0/251 [00:00, ?ex/s]\n", 222 | "\n", 223 | "\n", 224 | "\n", 225 | "\u001b[A\u001b[A\u001b[A\u001b[A\n", 226 | "\n", 227 | "\n", 228 | "\n", 229 | "\n", 230 | "\n", 231 | "\n", 232 | "\n", 233 | "\n", 234 | "\n", 235 | "#5: 100%|██████████| 251/251 [00:00<00:00, 2389.82ex/s]\n", 236 | "#6: 100%|██████████| 251/251 [00:00<00:00, 2589.32ex/s]\n", 237 | "\n", 238 | "\n", 239 | "\n", 240 | "\n", 241 | "\n", 242 | "\n", 243 | "\n", 244 | "\n", 245 | "\n", 246 | "\n", 247 | "\n", 248 | "\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n", 249 | "\n", 250 | "\n", 251 | "\n", 252 | "\n", 253 | "\n", 254 | "\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n", 255 | "\n", 256 | "\n", 257 | "\n", 258 | "\n", 259 | "\n", 260 | "\n", 261 | "\n", 262 | "\n", 263 | "\n", 264 | "\n", 265 | "\n", 266 | "\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n", 267 | "\n", 268 | "\n", 269 | "\n", 270 | "\n", 271 | "\n", 272 | "\n", 273 | "#7: 100%|██████████| 251/251 [00:00<00:00, 2034.34ex/s]\n", 274 | "#9: 100%|██████████| 251/251 [00:00<00:00, 2617.65ex/s]\n", 275 | "\n", 276 | "\n", 277 | "\n", 278 | "\n", 279 | "\n", 280 | "\n", 281 | "\n", 282 | "\n", 283 | "\n", 284 | "\n", 285 | "\n", 286 | "\n", 287 | "\n", 288 | "#11: 100%|██████████| 251/251 [00:00<00:00, 3306.24ex/s]\n", 289 | "#8: 100%|██████████| 251/251 [00:00<00:00, 1814.93ex/s]\n", 290 | "\n", 291 | "\n", 292 | "\n", 293 | "\n", 294 | "\n", 295 | "\n", 296 | "\n", 297 | "\n", 298 | "\n", 299 | "\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n", 300 | "\n", 301 | "\n", 302 | "\n", 303 | "\n", 304 | "\n", 305 | "\n", 306 | "\n", 307 | "\n", 308 | "\n", 309 | "\n", 310 | "\n", 311 | "\n", 312 | "\n", 313 | "#10: 100%|██████████| 251/251 [00:00<00:00, 2270.29ex/s]\n", 
314 | "\n", 315 | "\n", 316 | "\n", 317 | "\n", 318 | "\n", 319 | "\n", 320 | "\n", 321 | "\n", 322 | "\n", 323 | "\n", 324 | "\n", 325 | "\n", 326 | "\n", 327 | "\n", 328 | "\n", 329 | "\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n", 330 | "\n", 331 | "\n", 332 | "\n", 333 | "\n", 334 | "\n", 335 | "\n", 336 | "\n", 337 | "\n", 338 | "\n", 339 | "\n", 340 | "\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n", 341 | "\n", 342 | "\n", 343 | "\n", 344 | "\n", 345 | "\n", 346 | "\n", 347 | "\n", 348 | "\n", 349 | "\n", 350 | "\n", 351 | "\n", 352 | "\n", 353 | "\n", 354 | "\n", 355 | "\n", 356 | "#13: 100%|██████████| 251/251 [00:00<00:00, 3016.87ex/s]\n", 357 | "#12: 100%|██████████| 251/251 [00:00<00:00, 2143.69ex/s]\n", 358 | "\n", 359 | "\n", 360 | "\n", 361 | "\n", 362 | "\n", 363 | "\n", 364 | "\n", 365 | "\n", 366 | "\n", 367 | "\n", 368 | "\n", 369 | "\n", 370 | "\n", 371 | "\n", 372 | "\n", 373 | "\n", 374 | "\n", 375 | "\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n", 376 | "\n", 377 | "\n", 378 | "\n", 379 | "\n", 380 | "\n", 381 | "\n", 382 | "\n", 383 | "\n", 384 | "\n", 385 | "\n", 386 | "\n", 387 | "\n", 388 | "\n", 389 | "\n", 390 | "\n", 391 | "\n", 392 | "\n", 393 | "\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n", 394 | "\n", 395 | "\n", 396 | "\n", 397 | "\n", 398 | "\n", 399 | "\n", 400 | "\n", 401 | "\n", 402 | "\n", 403 | "\n", 404 | "\n", 405 | "\n", 406 | "#14: 100%|██████████| 250/250 [00:00<00:00, 2288.07ex/s]\n", 407 | "#16: 100%|██████████| 250/250 [00:00<00:00, 3103.58ex/s]\n", 408 | "#15: 100%|██████████| 250/250 [00:00<00:00, 2592.26ex/s]\n", 409 | "#17: 100%|██████████| 250/250 [00:00<00:00, 2701.81ex/s]\n", 410 | "#19: 100%|██████████| 250/250 [00:00<00:00, 3712.02ex/s]\n", 411 | "#18: 100%|██████████| 250/250 [00:00<00:00, 2754.21ex/s]\n", 412 | "#21: 100%|██████████| 250/250 [00:00<00:00, 3429.25ex/s]\n", 413 | "#20: 100%|██████████| 250/250 [00:00<00:00, 2555.64ex/s]\n", 414 | "#22: 100%|██████████| 250/250 [00:00<00:00, 2730.66ex/s]\n", 415 | "#23: 100%|██████████| 250/250 [00:00<00:00, 3116.50ex/s]\n", 416 | "#24: 100%|██████████| 250/250 [00:00<00:00, 2475.98ex/s]\n", 417 | "#27: 100%|██████████| 250/250 [00:00<00:00, 2275.80ex/s]\n", 418 | "#28: 100%|██████████| 250/250 [00:00<00:00, 2605.21ex/s]\n", 419 | "#26: 100%|██████████| 250/250 [00:00<00:00, 1912.19ex/s]\n", 420 | "#29: 100%|██████████| 250/250 [00:00<00:00, 2153.56ex/s]\n", 421 | "#25: 100%|██████████| 250/250 [00:00<00:00, 1340.31ex/s]\n", 422 | "#32: 100%|██████████| 250/250 [00:00<00:00, 2992.20ex/s]\n", 423 | "#30: 100%|██████████| 250/250 [00:00<00:00, 2616.85ex/s]\n", 424 | "#35: 100%|██████████| 250/250 [00:00<00:00, 2765.58ex/s]\n", 425 | "#33: 100%|██████████| 250/250 [00:00<00:00, 2415.08ex/s]\n", 426 | "#37: 100%|██████████| 250/250 [00:00<00:00, 2361.90ex/s]\n", 427 | "#34: 100%|██████████| 250/250 [00:00<00:00, 2138.85ex/s]\n", 428 | "\n", 429 | "#31: 100%|██████████| 250/250 [00:00<00:00, 1646.41ex/s]\n", 430 | "#39: 100%|██████████| 250/250 [00:00<00:00, 2733.18ex/s]\n", 431 | "#36: 100%|██████████| 250/250 [00:00<00:00, 1822.53ex/s]\n", 432 | "#40: 100%|██████████| 250/250 [00:00<00:00, 3259.21ex/s]\n", 433 | "#41: 100%|██████████| 250/250 [00:00<00:00, 2362.59ex/s]\n", 
434 | "#42: 100%|██████████| 250/250 [00:00<00:00, 2641.48ex/s]\n", 435 | "#43: 100%|██████████| 250/250 [00:00<00:00, 2797.40ex/s]\n", 436 | "#46: 100%|██████████| 250/250 [00:00<00:00, 3180.07ex/s]\n", 437 | "#45: 100%|██████████| 250/250 [00:00<00:00, 2793.68ex/s]\n", 438 | "#44: 100%|██████████| 250/250 [00:00<00:00, 2030.37ex/s]\n", 439 | "#47: 100%|██████████| 250/250 [00:00<00:00, 3685.85ex/s]\n", 440 | "#48: 100%|██████████| 250/250 [00:00<00:00, 2849.12ex/s]\n", 441 | "#50: 100%|██████████| 250/250 [00:00<00:00, 3088.79ex/s]\n", 442 | "#49: 100%|██████████| 250/250 [00:00<00:00, 2418.17ex/s]\n", 443 | "#51: 100%|██████████| 250/250 [00:00<00:00, 2865.90ex/s]\n", 444 | "#52: 100%|██████████| 250/250 [00:00<00:00, 3265.18ex/s]\n", 445 | "#53: 100%|██████████| 250/250 [00:00<00:00, 2847.02ex/s]\n", 446 | "#54: 100%|██████████| 250/250 [00:00<00:00, 2452.67ex/s]\n", 447 | "#56: 100%|██████████| 250/250 [00:00<00:00, 2912.06ex/s]\n", 448 | "#55: 100%|██████████| 250/250 [00:00<00:00, 1961.08ex/s]\n", 449 | "#58: 100%|██████████| 250/250 [00:00<00:00, 3242.75ex/s]\n", 450 | "#57: 100%|██████████| 250/250 [00:00<00:00, 2639.49ex/s]\n", 451 | "#59: 100%|██████████| 250/250 [00:00<00:00, 3095.53ex/s]\n", 452 | "#60: 100%|██████████| 250/250 [00:00<00:00, 3094.43ex/s]\n", 453 | "#61: 100%|██████████| 250/250 [00:00<00:00, 2429.84ex/s]\n", 454 | "#62: 100%|██████████| 250/250 [00:00<00:00, 2758.44ex/s]\n", 455 | "#63: 100%|██████████| 250/250 [00:00<00:00, 2411.69ex/s]\n", 456 | "#64: 100%|██████████| 250/250 [00:00<00:00, 2883.52ex/s]\n", 457 | "#65: 100%|██████████| 250/250 [00:00<00:00, 2773.47ex/s]\n", 458 | "#66: 100%|██████████| 250/250 [00:00<00:00, 2894.81ex/s]\n", 459 | "#67: 100%|██████████| 250/250 [00:00<00:00, 2518.36ex/s]\n", 460 | "#68: 100%|██████████| 250/250 [00:00<00:00, 2671.31ex/s]\n", 461 | "#69: 100%|██████████| 250/250 [00:00<00:00, 2463.28ex/s]\n", 462 | "#70: 100%|██████████| 250/250 [00:00<00:00, 2670.79ex/s]\n", 463 | "#71: 100%|██████████| 250/250 [00:00<00:00, 2941.40ex/s]\n" 464 | ] 465 | } 466 | ], 467 | "source": [ 468 | "# |output: false\n", 469 | "from squeakily.core import Pipeline\n", 470 | "\n", 471 | "pipeline = Pipeline(datasources)\n", 472 | "pipeline.run()" 473 | ] 474 | } 475 | ], 476 | "metadata": { 477 | "kernelspec": { 478 | "display_name": "python3", 479 | "language": "python", 480 | "name": "python3" 481 | } 482 | }, 483 | "nbformat": 4, 484 | "nbformat_minor": 4 485 | } 486 | -------------------------------------------------------------------------------- /squeakily/filter.py: -------------------------------------------------------------------------------- 1 | # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/01_filter.ipynb. 
2 | 3 | # %% auto 0 4 | __all__ = ['logger', 'zstd_cntxt', 'MINHASH_SEED', 'NON_ALPHA', 'lsh', 'dup_ids', 'check_compression_ratio', 5 | 'check_char_repetition', 'check_flagged_words', 'check_perplexity', 'check_language', 'check_word_number', 6 | 'check_stop_word_ratio', 'check_code_parsability', 'check_labels', 'minhash_dedup'] 7 | 8 | # %% ../nbs/01_filter.ipynb 2 9 | import datasets 10 | import gc 11 | import logging 12 | import multiprocessing 13 | import os 14 | import random 15 | import re 16 | 17 | import networkit as nk 18 | import numpy as np 19 | 20 | from collections import Counter 21 | from datasets import Dataset, Features, Value, Sequence 22 | from datasketch import LeanMinHash, MinHash, MinHashLSH 23 | from rich.logging import RichHandler 24 | from .helpers import flagged_words, get_words 25 | from .helpers import stopwords, stopword_ratios 26 | from tqdm.auto import tqdm 27 | from typing import Set 28 | 29 | # %% ../nbs/01_filter.ipynb 3 30 | logger = logging.getLogger(__name__) 31 | logger.setLevel(logging.INFO) 32 | logger.addHandler(RichHandler(rich_tracebacks=True)) 33 | logger.propagate = False 34 | datasets.logging.set_verbosity_error() 35 | # Turn off logging for datasets 36 | logging.getLogger("datasets").setLevel(logging.ERROR) 37 | 38 | # %% ../nbs/01_filter.ipynb 5 39 | multiprocessing.set_start_method("fork", force=True) 40 | 41 | zstd_cntxt = None 42 | 43 | # %% ../nbs/01_filter.ipynb 6 44 | def _compress_ratio( 45 | doc: str, # document to be analyzed 46 | compression_level: int = 3, # compression level to use 47 | ) -> float: 48 | """ 49 | Returns the ratio of the compressed document to the original document. 50 | """ 51 | global zstd_cntxt 52 | if zstd_cntxt is None: 53 | import zstandard as zstd 54 | 55 | zstd_cntxt = zstd.ZstdCompressor(level=compression_level) 56 | bts = doc.encode("utf-8") 57 | compressed_bts = zstd_cntxt.compress(bts) 58 | try: 59 | ratio = len(compressed_bts) / len(bts) 60 | except ZeroDivisionError: 61 | ratio = 0 62 | return ratio 63 | 64 | # %% ../nbs/01_filter.ipynb 7 65 | def check_compression_ratio( 66 | document, # document to be analyzed 67 | compression_threshold: float = 0.5, # threshold for compression ratio 68 | compression_level: int = 3, # compression level to use 69 | dry_run=False, # if True, returns the compression ratio 70 | ) -> bool: # returns True if the compression ratio is above the threshold 71 | """ 72 | Checks if the document's compression ratio is above the threshold, i.e. that the text is not so repetitive that it compresses extremely well. 73 | """ 74 | compress_ratio = _compress_ratio(document, compression_level=compression_level) 75 | if dry_run: 76 | return compress_ratio 77 | else: 78 | return compress_ratio > compression_threshold 79 | 80 | # %% ../nbs/01_filter.ipynb 9 81 | def _char_rep_ratio( 82 | doc: str, # document to be analyzed 83 | char_rep_len: int, # length of character repetition 84 | ) -> float: 85 | """ 86 | Returns the ratio of character repetitions in a document. 
87 | """ 88 | 89 | def calc_ngrams(doc, n): 90 | char_ngrams = [doc[i : i + n] for i in range(len(doc) - n + 1)] 91 | freq_char_ngrams = Counter(char_ngrams) 92 | return freq_char_ngrams 93 | 94 | freq_char_ngrams = calc_ngrams(doc, char_rep_len) 95 | if len(freq_char_ngrams) == 0: 96 | return 0 97 | freq_char_ngrams = list(freq_char_ngrams.values()) 98 | freq_char_ngrams = sorted(freq_char_ngrams, reverse=True) 99 | val_one = len([el for el in freq_char_ngrams if el == 1]) 100 | num_rep_char_ngrams = min( 101 | int(np.sqrt(len(freq_char_ngrams))), 102 | len(freq_char_ngrams) - val_one, 103 | ) 104 | char_rep_ratio = sum(freq_char_ngrams[:num_rep_char_ngrams]) / sum(freq_char_ngrams) 105 | return char_rep_ratio 106 | 107 | # %% ../nbs/01_filter.ipynb 10 108 | def check_char_repetition( 109 | document, # document to be analyzed 110 | char_repetition_len=10, # length of character repetition 111 | char_repetition_threshold=0.2, # threshold for character repetition 112 | dry_run=False, # if True, returns the ratio of character repetition 113 | ) -> bool: # returns True if document is below threshold 114 | """ 115 | Checks if the document is below the character repetition threshold. 116 | """ 117 | char_rep_ratio = _char_rep_ratio(document, char_repetition_len) 118 | if dry_run: 119 | return char_rep_ratio 120 | else: 121 | return char_rep_ratio <= char_repetition_threshold 122 | 123 | # %% ../nbs/01_filter.ipynb 12 124 | def _flag_word_ratio( 125 | doc: str, # document to be analyzed 126 | flagged_words: list, # list of flagged words 127 | get_words_func: callable, # function to get words from document 128 | ) -> float: # returns ratio of flagged words in document 129 | """ 130 | Returns the ratio of flagged words in a document. 131 | """ 132 | words = get_words_func(doc) 133 | if not words: 134 | return 0.0 135 | flagged_words_ratio = len([word for word in words if word in flagged_words]) / len( 136 | words 137 | ) 138 | if flagged_words_ratio > 1.0: 139 | flagged_words_ratio = 1.0 140 | return flagged_words_ratio 141 | 142 | # %% ../nbs/01_filter.ipynb 13 143 | def check_flagged_words( 144 | document: str, # document to be analyzed 145 | flagged_words: list = flagged_words["en"], # list of flagged words 146 | flagged_words_threshold: float = 0.1, # threshold for flagged words 147 | get_words_func: callable = get_words, # function to get words from document 148 | dry_run: bool = False, # if True, returns the ratio of flagged words 149 | ) -> bool: # returns True if document is below threshold unless dry_run is True 150 | """ 151 | Checks if a document contains a high percentage of flagged words. 152 | """ 153 | cond = True 154 | if flagged_words: 155 | flagged_words_ratio = _flag_word_ratio( 156 | document, 157 | flagged_words, 158 | get_words_func, 159 | ) 160 | if dry_run: 161 | return flagged_words_ratio 162 | 163 | cond = flagged_words_ratio <= flagged_words_threshold 164 | return cond 165 | 166 | # %% ../nbs/01_filter.ipynb 16 167 | def check_perplexity( 168 | document, # document to be analyzed 169 | perplexity_threshold=10_000, # threshold for perplexity 170 | model=None, # model to calculate perplexity 171 | dry_run=False, # if True, returns the perplexity of the document 172 | ) -> bool: # returns True if document is below threshold 173 | """ 174 | Checks if the document is below the perplexity threshold. 
175 | """ 176 | perplexity = model.get_perplexity(document) 177 | if dry_run: 178 | return perplexity 179 | else: 180 | return perplexity <= perplexity_threshold 181 | 182 | # %% ../nbs/01_filter.ipynb 19 183 | def check_language( 184 | document, # document to be analyzed 185 | language="en", # language to check 186 | language_threshold=0.9, # threshold for language 187 | model=None, # model to check language 188 | dry_run=False, # if True, returns the language of the document 189 | ) -> bool: # returns True if document is below threshold 190 | """ 191 | Checks if the document is below the language threshold. 192 | """ 193 | lang, prob = model.get_language(document) 194 | if dry_run: 195 | if lang == language: 196 | return prob 197 | else: 198 | return -1.0 199 | else: 200 | return language == lang and prob > language_threshold 201 | 202 | # %% ../nbs/01_filter.ipynb 21 203 | def check_word_number( 204 | document, # document to be analyzed 205 | min_word_threshold=5, # minimum number of words 206 | max_word_threshold=100, # maximum number of words 207 | get_words_func=get_words, # function to get words from document 208 | dry_run=False, # if True, returns the number of words in the document 209 | ) -> bool: # returns True if document is between the minimum and maximum thresholds 210 | """ 211 | Checks if the document is between the minimum and maximum word thresholds. 212 | """ 213 | words = get_words_func(document) 214 | if dry_run: 215 | return len(words) 216 | else: 217 | return len(words) >= min_word_threshold and len(words) <= max_word_threshold 218 | 219 | # %% ../nbs/01_filter.ipynb 23 220 | def check_stop_word_ratio( 221 | document, # document to be analyzed 222 | stop_word_threshold=stopword_ratios["en"], # threshold for stop words 223 | stop_words=stopwords["en"], # list of stop words 224 | get_words_func=get_words, # function to get words from document 225 | dry_run=False, # if True, returns the ratio of stop words in the document 226 | ) -> bool: # returns True if document is below the threshold 227 | """ 228 | Checks if the document contains a high percentage of stop words. 229 | """ 230 | cond = True 231 | if stop_words: 232 | stop_word_ratio = _flag_word_ratio( 233 | document, 234 | stop_words, 235 | get_words_func, 236 | ) 237 | if dry_run: 238 | return stop_word_ratio 239 | else: 240 | cond = stop_word_ratio <= stop_word_threshold 241 | return cond 242 | 243 | # %% ../nbs/01_filter.ipynb 25 244 | def check_code_parsability( 245 | document, # document to be analyzed 246 | program_language="python", # programming language to check 247 | ) -> bool: # returns True if the code is parsable 248 | """ 249 | Checks if the document contains parsable code. 250 | """ 251 | import code_tokenize as ctok 252 | 253 | try: 254 | ctok.tokenize(document, lang=program_language, syntax_error="raise") 255 | return True 256 | except SyntaxError: 257 | return False 258 | 259 | # %% ../nbs/01_filter.ipynb 27 260 | def check_labels( 261 | document, # document to be analyzed 262 | labels: list, # list of labels to check the document against 263 | model=None, # model to check label 264 | dry_run=False, # if True, returns the tags of the document 265 | ) -> bool: # returns True if document relates to any of the labels 266 | """ 267 | Checks if the document relates to any of the labels. 
268 | """ 269 | pred_labels = model(document) 270 | if dry_run: 271 | return pred_labels 272 | else: 273 | return any([label in pred_labels for label in labels]) 274 | 275 | # %% ../nbs/01_filter.ipynb 31 276 | MINHASH_SEED = 115 277 | NON_ALPHA = re.compile("[^A-Za-z_0-9]") 278 | 279 | random.seed(MINHASH_SEED) 280 | 281 | lsh: MinHashLSH = None 282 | dup_ids: Set = None 283 | 284 | # %% ../nbs/01_filter.ipynb 32 285 | def _hash_func( 286 | idx: int, # The index of the record. 287 | content: str, # The content to be hashed. 288 | *, 289 | num_perm: int # The number of permutations to use in the MinHash object. 290 | ) -> dict[str, any]: # The MinHash signature and the index of the record. 291 | """ 292 | Embed the content of a record into a MinHash object. This function should be 293 | used with multiprocessing and it scales well with the number of cores. 294 | >>> result = _hash_func(0, "Hello world!", num_perm=128) 295 | >>> result["__id__"] 296 | 0 297 | >>> result["__signature__"].shape 298 | (128,) 299 | >>> result["__signature__"].dtype 300 | dtype('uint64') 301 | """ 302 | m = MinHash(num_perm=num_perm, seed=MINHASH_SEED) 303 | m.update_batch( 304 | [token.encode("utf-8") for token in {t for t in NON_ALPHA.split(content) if t}] 305 | ) 306 | return {"__signature__": m.hashvalues, "__id__": idx} 307 | 308 | # %% ../nbs/01_filter.ipynb 34 309 | def _query_content( 310 | idx: int, # The index of the record. 311 | signature: np.ndarray, # The MinHash signature of the record to be queried. 312 | *, 313 | index: MinHashLSH # The MinHashLSH index. It is shared across all processes when using multiprocessing with fork without copy. 314 | ) -> dict[str, any]: # The query result. 315 | """ 316 | Query the MinHashLSH index for the record. This function can be used with multiprocessing 317 | as long as the index is shared across processes. 318 | """ 319 | return { 320 | "__neighbors__": [ 321 | dup_idx 322 | for dup_idx in index.query( 323 | LeanMinHash(seed=MINHASH_SEED, hashvalues=signature), 324 | ) 325 | if dup_idx != idx # exclude itself 326 | ], 327 | "__id__": idx, 328 | } 329 | 330 | # %% ../nbs/01_filter.ipynb 36 331 | def _jaccard_similarity( 332 | s1: str, s2: str # The first string to compare. # The second string to compare. 333 | ) -> float: # The Jaccard similarity between the two strings. 334 | """ 335 | Calculate the jaccard similarity between two code snippets. 336 | """ 337 | tokens1 = set([t for t in NON_ALPHA.split(s1) if t.strip()]) 338 | tokens2 = set([t for t in NON_ALPHA.split(s2) if t.strip()]) 339 | return len(tokens1 & tokens2) / max(1, len(tokens1 | tokens2)) 340 | 341 | # %% ../nbs/01_filter.ipynb 38 342 | def _calculate_average_false_positive_rate( 343 | clusters: list[list[int]], # The clusters of duplicate records. 344 | reference_records: Dataset, # The reference records. 345 | threshold: float, # The threshold to use for calculating the false positive rate. 346 | column: str, # The column to use for calculating the false positive rate. 347 | ) -> None: 348 | """ 349 | Calculate the average false positive rate within each cluster. The false positives are defined as 350 | number of examples that have a maximum jaccard similarity with any example in the cluster that is 351 | less than the threshold. The false positive rate is defined as the number of false positives divided 352 | by the number of examples in the cluster. The average false positive rate is defined as the average 353 | of the false positive rate across all clusters given. 
354 | """ 355 | cluster_false_positive_rates: list[float] = [] 356 | deltas: list[float] = [] 357 | 358 | for cluster in tqdm(clusters, desc="Calculating sampling false positive rate..."): 359 | num_false_positives = 0 360 | ids = sorted(cluster) 361 | for i, x in enumerate(ids): 362 | is_false_positive = True 363 | max_similarity = -float("inf") 364 | for j, y in enumerate(ids): 365 | if i == j: 366 | continue 367 | # TODO This can be redundant but we only calculate this for a small sample 368 | similarity = _jaccard_similarity( 369 | reference_records[x][column], reference_records[y][column] 370 | ) 371 | max_similarity = max(max_similarity, similarity) 372 | if max_similarity >= threshold: 373 | is_false_positive = False 374 | break 375 | if is_false_positive: 376 | num_false_positives += 1 377 | deltas.append(threshold - max_similarity) 378 | cluster_false_positive_rates.append(num_false_positives / len(ids)) 379 | 380 | logger.info( 381 | f"Average false positive rate from {len(clusters)} clusters: {np.mean(cluster_false_positive_rates):.2f}" 382 | ) 383 | logger.info(f"Similarity delta stats from threshold:") 384 | logger.info(f"- Max : {np.max(deltas):0.2f}") 385 | logger.info(f"- Min : {np.min(deltas):0.2f}") 386 | logger.info(f"- Mean: {np.mean(deltas):0.2f}") 387 | logger.info(f"- Std : {np.std(deltas):0.2f}") 388 | 389 | # %% ../nbs/01_filter.ipynb 39 390 | def _find_duplicate_communities( 391 | records: Dataset, # The dataset that contains both `__id__` and `__neighbors__`. 392 | community_detection: bool, # Whether to use community detection to find the duplicate communities, or to use the connected components. 393 | report_false_positive_rate: bool = False, # Whether to report the false positive rate. 394 | reference_records: Dataset = None, # The reference records. It can be an iterable or a Dataset. It is only used when `report_false_positive_rate` is True. 395 | threshold: float = 0.85, # The threshold to use for calculating the false positive rate. 396 | column: str = "content", # The column to use for calculating the false positive rate. 397 | verbose: bool = False, 398 | ) -> ( 399 | Set 400 | ): # The set of duplicate ids that should be removed, leaving only one id in each community. 401 | """ 402 | Find the duplicate communities from the queried dataset. 
403 | """ 404 | SAMPLE_MIN_SIZE = 10 405 | SAMPLE_MAX_SIZE = 100 406 | SAMPLE_SIZE = 10 407 | g = nk.graph.Graph() 408 | for record in tqdm(records, desc="Constructing graph..."): 409 | for y in record["__neighbors__"]: 410 | g.addEdge(record["__id__"], y, addMissing=True) 411 | 412 | to_remove: Set = set() 413 | samples: list[list[int]] = [] 414 | if not community_detection: 415 | cc = nk.components.ConnectedComponents(g) 416 | cc.run() 417 | partition = cc.getPartition() 418 | components = list(cc.getComponents()) 419 | random.shuffle(components) 420 | for component in tqdm(components, desc="Iterating over components..."): 421 | component = sorted(component) 422 | to_remove.update(component[1:]) 423 | if ( 424 | len(samples) < SAMPLE_SIZE 425 | and SAMPLE_MAX_SIZE > len(component) >= SAMPLE_MIN_SIZE 426 | ): 427 | samples.append(component[:]) 428 | else: 429 | algo = nk.community.PLM(g, refine=False) 430 | algo.run() 431 | partition = algo.getPartition() 432 | communities = list(partition.getSubsetIds()) 433 | random.shuffle(communities) 434 | # This can be slow if there are many communities 435 | for i in tqdm(communities, desc="Iterating over communities..."): 436 | ids = partition.getMembers(i) 437 | to_remove.update(sorted(ids)[1:]) 438 | if ( 439 | len(samples) < SAMPLE_SIZE 440 | and SAMPLE_MAX_SIZE > len(ids) >= SAMPLE_MIN_SIZE 441 | ): 442 | samples.append(ids) 443 | 444 | if report_false_positive_rate and verbose: 445 | _calculate_average_false_positive_rate( 446 | samples, 447 | reference_records, 448 | threshold, 449 | column, 450 | ) 451 | 452 | return to_remove 453 | 454 | # %% ../nbs/01_filter.ipynb 40 455 | def minhash_dedup( 456 | ds, # The dataset to deduplicate. 457 | column, # The column to use for deduplication. 458 | community_detection: bool = False, # Whether to use community detection to find the duplicate communities, or to use the connected components. 459 | report_false_positive_rate: bool = False, # Whether to report the false positive rate. 460 | threshold: float = 0.85, # The threshold to use for deduplication. 461 | num_perm: int = 128, # The number of permutations to use for minhashing. 462 | dry_run: bool = False, # Whether to run the deduplication in dry run mode. 463 | ) -> Dataset: 464 | """ 465 | Deduplicate the dataset using minhashing as described in the paper "Deduplicating Training Data Makes Language Models Better". 
466 | """ 467 | global lsh 468 | global dup_ids 469 | 470 | lsh = MinHashLSH( 471 | threshold=threshold, 472 | num_perm=num_perm, 473 | ) 474 | column_names = ds.column_names 475 | ds = ds.map( 476 | lambda _, idx: {"__id__": idx}, 477 | with_indices=True, 478 | num_proc=os.cpu_count(), 479 | desc="Adding index...", 480 | ) 481 | hashed_ds = ds.map( 482 | function=_hash_func, 483 | fn_kwargs={"num_perm": num_perm}, 484 | input_columns=["__id__", column], 485 | remove_columns=column_names, 486 | num_proc=os.cpu_count(), 487 | desc=f"Fingerprinting...", 488 | ) 489 | with lsh.insertion_session() as session: 490 | for data in tqdm(hashed_ds, desc="Indexing signatures..."): 491 | if data["__id__"] in lsh: 492 | continue 493 | session.insert( 494 | data["__id__"], 495 | LeanMinHash(seed=MINHASH_SEED, hashvalues=data["__signature__"]), 496 | check_duplication=False, 497 | ) 498 | 499 | gc.disable() 500 | gc.freeze() 501 | 502 | conf = { 503 | "threshold": threshold, 504 | "community_detection": community_detection, 505 | "report_false_positive_rate": report_false_positive_rate, 506 | "num_perm": num_perm, 507 | "name": ds.builder_name, 508 | "column": column, 509 | } 510 | queried = hashed_ds.map( 511 | lambda x, y: _query_content(x, y, index=lsh), 512 | num_proc=os.cpu_count(), 513 | features=Features( 514 | { 515 | "__id__": Value(dtype="int64", id=None), 516 | "__neighbors__": Sequence( 517 | feature=Value(dtype="int64", id=None), length=-1, id=None 518 | ), 519 | } 520 | ), 521 | input_columns=["__id__", "__signature__"], 522 | remove_columns=["__signature__"], 523 | desc=f"Querying...", 524 | ) 525 | 526 | del lsh 527 | gc.collect() 528 | 529 | queried = queried.filter( 530 | lambda x: len(x["__neighbors__"]) > 0, 531 | num_proc=os.cpu_count(), 532 | desc="Finding duplicates...", 533 | ) 534 | dup_ids = _find_duplicate_communities( 535 | records=queried, 536 | community_detection=conf["community_detection"], 537 | report_false_positive_rate=conf["report_false_positive_rate"], 538 | reference_records=ds, 539 | threshold=conf["threshold"], 540 | column=conf["column"], 541 | ) 542 | 543 | del queried 544 | gc.collect() 545 | 546 | if dry_run: 547 | final_data = ds.map( 548 | lambda idx: {"duplicate": idx in dup_ids}, 549 | input_columns=["__id__"], 550 | num_proc=os.cpu_count(), 551 | desc="Labeling duplicates...", 552 | ) 553 | else: 554 | final_data = ds.filter( 555 | lambda idx: idx not in dup_ids, 556 | input_columns=["__id__"], 557 | num_proc=os.cpu_count(), 558 | desc="Filtering duplicates...", 559 | ) 560 | return final_data 561 | -------------------------------------------------------------------------------- /nbs/02_clean.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# clean\n", 8 | "\n", 9 | "> This module contains all the various cleaning options supported." 
10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "# | default_exp clean" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "# | export\n", 28 | "import re\n", 29 | "from faker import Faker\n", 30 | "import ftfy\n", 31 | "\n", 32 | "fake = Faker()" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "# | hide\n", 42 | "from nbdev.showdoc import *" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "# | export\n", 52 | "# From: https://github.com/bigscience-workshop/data-preparation/blob/main/preprocessing/training/01b_oscar_cleaning_and_filtering/filtering.py#L95\n", 53 | "whitespace = {\n", 54 | " \" \",\n", 55 | " \" \",\n", 56 | " \" \",\n", 57 | " \" \",\n", 58 | " \" \",\n", 59 | " \" \",\n", 60 | " \" \",\n", 61 | " \" \",\n", 62 | " \" \",\n", 63 | " \" \",\n", 64 | " \"\",\n", 65 | " \"\",\n", 66 | "}\n", 67 | "\n", 68 | "\n", 69 | "def normalize_whitespace(\n", 70 | " text: str, # The text to normalize\n", 71 | ") -> str: # The normalized text\n", 72 | " \"\"\"\n", 73 | " Replace the various whitespace characters with the standard one.\n", 74 | " \"\"\"\n", 75 | " text = \"\".join([char if char not in whitespace else \" \" for char in text])\n", 76 | " return text" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "# test the normalize_whitespace function\n", 86 | "assert normalize_whitespace(\"a b c d e f g h ijk\") == \"a b c d e f g h i j k\"" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "# | export\n", 96 | "unicode_punctuation = {\n", 97 | " \",\": \",\",\n", 98 | " \"。\": \".\",\n", 99 | " \"、\": \",\",\n", 100 | " \"„\": '\"',\n", 101 | " \"”\": '\"',\n", 102 | " \"“\": '\"',\n", 103 | " \"«\": '\"',\n", 104 | " \"»\": '\"',\n", 105 | " \"1\": '\"',\n", 106 | " \"」\": '\"',\n", 107 | " \"「\": '\"',\n", 108 | " \"《\": '\"',\n", 109 | " \"》\": '\"',\n", 110 | " \"´\": \"'\",\n", 111 | " \"∶\": \":\",\n", 112 | " \":\": \":\",\n", 113 | " \"?\": \"?\",\n", 114 | " \"!\": \"!\",\n", 115 | " \"(\": \"(\",\n", 116 | " \")\": \")\",\n", 117 | " \";\": \";\",\n", 118 | " \"–\": \"-\",\n", 119 | " \"—\": \" - \",\n", 120 | " \".\": \". 
\",\n", 121 | " \"~\": \"~\",\n", 122 | " \"’\": \"'\",\n", 123 | " \"…\": \"...\",\n", 124 | " \"━\": \"-\",\n", 125 | " \"〈\": \"<\",\n", 126 | " \"〉\": \">\",\n", 127 | " \"【\": \"[\",\n", 128 | " \"】\": \"]\",\n", 129 | " \"%\": \"%\",\n", 130 | " \"►\": \"-\",\n", 131 | "}\n", 132 | "\n", 133 | "\n", 134 | "def normalize_punctuation(\n", 135 | " text: str, # The text to normalize\n", 136 | ") -> str: # The normalized text\n", 137 | " \"\"\"\n", 138 | " Replace the various unicode punctuation characters with the standard ones.\n", 139 | " \"\"\"\n", 140 | " text = \"\".join([unicode_punctuation.get(char, char) for char in text])\n", 141 | " return text" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "# test the normalize_punctuation function\n", 151 | "text = \",。、„”“«»1」「《》´∶:?!();–—.~’…━〈〉【】%►\"\n", 152 | "\n", 153 | "assert normalize_punctuation(text) == ',.,\"\"\"\"\"\"\"\"\"\"\\'::?!();- - . ~\\'...-<>[]%-'" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "# | export\n", 163 | "def remove_empty_lines(\n", 164 | " text: str, # The text to remove empty lines from\n", 165 | ") -> str: # The text with empty lines removed\n", 166 | " \"\"\"\n", 167 | " Remove empty lines from the text.\n", 168 | " Solution from https://stackoverflow.com/a/3711884/5768407\n", 169 | " \"\"\"\n", 170 | " lines = text.splitlines()\n", 171 | " filtered = filter(lambda x: not re.match(r\"^\\s*$\", x), lines)\n", 172 | " return \"\\n\".join(filtered)" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "# test the remove_empty_lines function\n", 182 | "starts_with_newline = \"\\nfoo\\nbar\"\n", 183 | "multiple_newlines = \"foo\\n\\nbar\"\n", 184 | "ends_with_newline = \"foo\\nbar\\n\"\n", 185 | "\n", 186 | "assert remove_empty_lines(starts_with_newline) == \"foo\\nbar\"\n", 187 | "assert remove_empty_lines(multiple_newlines) == \"foo\\nbar\"\n", 188 | "assert remove_empty_lines(ends_with_newline) == \"foo\\nbar\"" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "# | export\n", 198 | "def replace_urls(\n", 199 | " text: str, # The text to replace URLs in\n", 200 | " dummy: str = \"https://example.com/\", # The dummy text to replace URLs with\n", 201 | ") -> str: # The text with URLs replaced\n", 202 | " \"\"\"Replace urls from text with a dummy.\"\"\"\n", 203 | " return re.sub(r\"http\\S+\", dummy, text)" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "metadata": {}, 210 | "outputs": [], 211 | "source": [ 212 | "# test the replace_urls function\n", 213 | "url_after_space = \"foo http://bar.com\"\n", 214 | "url_before_space = \"http://foo.com bar\"\n", 215 | "assert replace_urls(url_after_space) == \"foo https://example.com/\"\n", 216 | "assert replace_urls(url_before_space) == \"https://example.com/ bar\"" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [ 225 | "# | export\n", 226 | "def replace_dates(\n", 227 | " text: str, # The text to remove dates from\n", 228 | " dummy: str = fake.date(), # The dummy text to replace dates with\n", 229 | ") -> str: # The text with dates 
replaced\n", 230 | " \"\"\"Replace dates from text with a dummy.\"\"\"\n", 231 | " return re.sub(r\"\\d{1,2}/\\d{1,2}/\\d{4}\", dummy, text)" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": null, 237 | "metadata": {}, 238 | "outputs": [], 239 | "source": [ 240 | "# test the replace_dates function\n", 241 | "date_after_space = \"foo 1/1/2020\"\n", 242 | "date_before_space = \"1/1/2020 bar\"\n", 243 | "assert replace_dates(date_after_space, \"1/1/1970\") == \"foo 1/1/1970\"\n", 244 | "assert replace_dates(date_before_space, \"1/1/1970\") == \"1/1/1970 bar\"" 245 | ] 246 | }, 247 | { 248 | "cell_type": "markdown", 249 | "metadata": {}, 250 | "source": [ 251 | "## PII Removal\n", 252 | "\n", 253 | "Currently, we support the following PII removal options:\n", 254 | "\n", 255 | " * `replace_email`\n", 256 | " * `replace_phone`\n", 257 | " * `replace_ip`\n", 258 | " * `replace_credit_card`\n", 259 | " * `replace_ssn`\n", 260 | "\n", 261 | "However, for emails, phone numbers, credit cards, and SSNs, we recommend you to use the [scrubadub](https://scrubadub.readthedocs.io/en/stable/index.html) library." 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": null, 267 | "metadata": {}, 268 | "outputs": [], 269 | "source": [ 270 | "# | export\n", 271 | "def replace_email(\n", 272 | " text: str, # The text to replace email addresses in\n", 273 | " dummy: str = fake.email(), # The dummy text to replace email addresses with\n", 274 | ") -> str: # The text with email addresses replaced\n", 275 | " \"\"\"Replace email addresses from text with a dummy.\"\"\"\n", 276 | " return re.sub(r\"[\\w\\.-]+@[\\w\\.-]+\", dummy, text)" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": null, 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [ 285 | "# test the replace_email function\n", 286 | "email_after_space = \"foo fake@email.com\"\n", 287 | "email_before_space = \"fake@email.com bar\"\n", 288 | "email_with_forward_periods = \"foo.bar@email.com\"\n", 289 | "email_with_backward_periods = \"foo@bar.email.com\"\n", 290 | "\n", 291 | "assert replace_email(email_after_space, \"example@email.com\") == \"foo example@email.com\"\n", 292 | "assert replace_email(email_before_space, \"example@email.com\") == \"example@email.com bar\"\n", 293 | "assert (\n", 294 | " replace_email(email_with_forward_periods, \"example@email.com\")\n", 295 | " == \"example@email.com\"\n", 296 | ")\n", 297 | "assert (\n", 298 | " replace_email(email_with_backward_periods, \"example@email.com\")\n", 299 | " == \"example@email.com\"\n", 300 | ")" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": null, 306 | "metadata": {}, 307 | "outputs": [], 308 | "source": [ 309 | "# | export\n", 310 | "def replace_phone(\n", 311 | " text: str, # The text to replace phone numbers in\n", 312 | " dummy: str = fake.phone_number(), # The dummy text to replace phone numbers with\n", 313 | ") -> str: # The text with phone numbers replaced\n", 314 | " \"\"\"Replace phone numbers from text with a dummy.\"\"\"\n", 315 | " return re.sub(r\"\\(?\\d{3}\\)?-? *\\d{3}-? 
*-?\\d{4}\", dummy, text)" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": null, 321 | "metadata": {}, 322 | "outputs": [], 323 | "source": [ 324 | "# test the replace_phone function\n", 325 | "phone_after_space = \"foo 111-222-3333\"\n", 326 | "phone_before_space = \"111-222-3333 bar\"\n", 327 | "phone_with_parens = \"(111) 222-3333\"\n", 328 | "phone_with_spaces = \"111 222 3333\"\n", 329 | "phone_with_dashes = \"111-222-3333\"\n", 330 | "\n", 331 | "assert replace_phone(phone_after_space, \"123-456-7890\") == \"foo 123-456-7890\"\n", 332 | "assert replace_phone(phone_before_space, \"123-456-7890\") == \"123-456-7890 bar\"\n", 333 | "assert replace_phone(phone_with_parens, \"123-456-7890\") == \"123-456-7890\"\n", 334 | "assert replace_phone(phone_with_spaces, \"123-456-7890\") == \"123-456-7890\"\n", 335 | "assert replace_phone(phone_with_dashes, \"123-456-7890\") == \"123-456-7890\"" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": null, 341 | "metadata": {}, 342 | "outputs": [], 343 | "source": [ 344 | "# | export\n", 345 | "def replace_ip(\n", 346 | " text, # The text to replace ip addresses in\n", 347 | " dummy1: str = fake.ipv4(), # The dummy text to replace ipv4 addresses with\n", 348 | " dummy2: str = fake.ipv6(), # The dummy text to replace ipv6 addresses with\n", 349 | ") -> str: # The text with ip addresses replaced\n", 350 | " \"\"\"\n", 351 | " Replace ip addresses from text with a dummy.\n", 352 | " Solution from https://github.com/bigcode-project/bigcode-analysis/blob/main/data_analysis/pii/utils/emails_ip_addresses_detection.py#L48\n", 353 | " \"\"\"\n", 354 | " ipv4_pattern = r\"(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(?:\\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}\"\n", 355 | " text = re.sub(ipv4_pattern, dummy1, text)\n", 356 | " ipv6_pattern = r\"(?:[0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(?:(?::[0-9a-fA-F]{1,4}){1,6})|:(?:(?::[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(?::[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(?:ffff(?::0{1,4}){0,1}:){0,1}(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])|(?:[0-9a-fA-F]{1,4}:){1,4}:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\\.){3,3}(25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\"\n", 357 | " text = re.sub(ipv6_pattern, dummy2, text)\n", 358 | " return text" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": null, 364 | "metadata": {}, 365 | "outputs": [], 366 | "source": [ 367 | "# test the replace_ip function\n", 368 | "ip4_after_space = \"foo 111.222.3.4\"\n", 369 | "ip4_before_space = \"111.222.3.4 bar\"\n", 370 | "ip6_with_colons = \"2001:0db8:0000:0000:0000:8a2e:0370:7334\"\n", 371 | "\n", 372 | "assert replace_ip(ip4_after_space, \"127.0.0.1\") == \"foo 127.0.0.1\"\n", 373 | "assert replace_ip(ip4_before_space, \"127.0.0.1\") == \"127.0.0.1 bar\"\n", 374 | "assert replace_ip(ip6_with_colons, \"127.0.0.1\", \"0:0:0:0:0:0:0:1\") == \"0:0:0:0:0:0:0:1\"" 375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": null, 380 | "metadata": {}, 381 | "outputs": [], 382 | "source": [ 383 | "# | export\n", 384 | "def replace_credit_card(\n", 385 | " text: str, # The text to replace credit card 
numbers in\n", 386 | " dummy: str = fake.credit_card_number(), # The dummy text to replace credit card numbers with\n", 387 | ") -> str: # The text with credit card numbers replaced\n", 388 | " \"\"\"Replace credit card numbers from text with a dummy.\"\"\"\n", 389 | " return re.sub(r\"\\d{4}-\\d{4}-\\d{4}-\\d{4}\", dummy, text)" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": null, 395 | "metadata": {}, 396 | "outputs": [], 397 | "source": [ 398 | "# test the replace_credit_card function\n", 399 | "credit_card_after_space = \"foo 1111-2222-3333-4444\"\n", 400 | "credit_card_before_space = \"1111-2222-3333-4444 bar\"\n", 401 | "\n", 402 | "assert (\n", 403 | " replace_credit_card(credit_card_after_space, \"1234-5678-9012-3456\")\n", 404 | " == \"foo 1234-5678-9012-3456\"\n", 405 | ")\n", 406 | "assert (\n", 407 | " replace_credit_card(credit_card_before_space, \"1234-5678-9012-3456\")\n", 408 | " == \"1234-5678-9012-3456 bar\"\n", 409 | ")" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": null, 415 | "metadata": {}, 416 | "outputs": [], 417 | "source": [ 418 | "# | export\n", 419 | "def replace_ssn(\n", 420 | " text: str, # The text to replace social security numbers in\n", 421 | " dummy: str = fake.ssn(), # The dummy text to replace social security numbers with\n", 422 | ") -> str: # The text with social security numbers replaced\n", 423 | " \"\"\"Replace social security numbers from text with a dummy.\"\"\"\n", 424 | " return re.sub(r\"\\d{3}-\\d{2}-\\d{4}\", dummy, text)" 425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "execution_count": null, 430 | "metadata": {}, 431 | "outputs": [], 432 | "source": [ 433 | "# test the replace_ssn function\n", 434 | "ssn_after_space = \"foo 111-22-3333\"\n", 435 | "ssn_before_space = \"111-22-3333 bar\"\n", 436 | "\n", 437 | "assert replace_ssn(ssn_after_space, \"123-45-6789\") == \"foo 123-45-6789\"\n", 438 | "assert replace_ssn(ssn_before_space, \"123-45-6789\") == \"123-45-6789 bar\"" 439 | ] 440 | }, 441 | { 442 | "cell_type": "code", 443 | "execution_count": null, 444 | "metadata": {}, 445 | "outputs": [], 446 | "source": [ 447 | "# | export\n", 448 | "def fix_utf8_encoding(\n", 449 | " text: str, # The text to fix\n", 450 | ") -> str: # The fixed text\n", 451 | " \"\"\"Fix utf8 text using ftfy.\"\"\"\n", 452 | " return ftfy.fix_text(text)" 453 | ] 454 | }, 455 | { 456 | "cell_type": "code", 457 | "execution_count": null, 458 | "metadata": {}, 459 | "outputs": [], 460 | "source": [ 461 | "# test the fix_utf8_encoding function\n", 462 | "bad_text = \"✔ No problems\"\n", 463 | "assert fix_utf8_encoding(bad_text) == \"✔ No problems\"\n", 464 | "bad_text = \"déjà vu\"\n", 465 | "assert fix_utf8_encoding(bad_text) == \"déjà vu\"\n", 466 | "bad_text = \"é\"\n", 467 | "assert fix_utf8_encoding(bad_text) == \"é\"\n", 468 | "bad_text = \"P&EACUTE;REZ\"\n", 469 | "assert fix_utf8_encoding(bad_text) == \"PÉREZ\"" 470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "execution_count": null, 475 | "metadata": {}, 476 | "outputs": [], 477 | "source": [ 478 | "# | export\n", 479 | "def clean_code_license(\n", 480 | " code: str, # The code to clean\n", 481 | " language: str = \"python\", # The language of the code\n", 482 | " min_lines: int = 3, # The minimum number of lines that need to be removed\n", 483 | "):\n", 484 | " import code_ast\n", 485 | " from code_ast import ASTVisitor\n", 486 | " from code_ast.ast import LEAVE_WHITELIST\n", 487 | "\n", 488 | " class 
FirstNonCommentVisitor(ASTVisitor):\n", 489 | " def __init__(self):\n", 490 | " self.passed_global_node = False\n", 491 | " self.first_node = None\n", 492 | "\n", 493 | " def visit(self, node):\n", 494 | " if not self.passed_global_node:\n", 495 | " self.passed_global_node = True\n", 496 | " return\n", 497 | " if self.first_node is None:\n", 498 | " if node.child_count > 0 or node.type in LEAVE_WHITELIST:\n", 499 | " self.first_node = node\n", 500 | "\n", 501 | " \"\"\"Remove the license or other boilerplate comments from the code.\"\"\"\n", 502 | " ast = code_ast.ast(code, lang=language)\n", 503 | " visitor = FirstNonCommentVisitor()\n", 504 | " ast.visit(visitor)\n", 505 | " start_line = visitor.first_node.start_point[0]\n", 506 | " if start_line < min_lines:\n", 507 | " return code\n", 508 | " else:\n", 509 | " return \"\\n\".join(code.splitlines()[start_line:])" 510 | ] 511 | }, 512 | { 513 | "cell_type": "code", 514 | "execution_count": null, 515 | "metadata": {}, 516 | "outputs": [], 517 | "source": [ 518 | "# |eval: false\n", 519 | "# Test the cleaning of code licenses or similar boilerplate comments from code\n", 520 | "code_python = \"\"\"# -*- coding: utf-8 -*-\n", 521 | "\n", 522 | "# Copyright 2018 Spanish National Research Council (CSIC)\n", 523 | "#\n", 524 | "# Licensed under the Apache License, Version 2.0 (the \"License\"); you may\n", 525 | "# not use this file except in compliance with the License. You may obtain\n", 526 | "# a copy of the License at\n", 527 | "#\n", 528 | "# http://www.apache.org/licenses/LICENSE-2.0\n", 529 | "#\n", 530 | "# Unless required by applicable law or agreed to in writing, software\n", 531 | "# distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT\n", 532 | "# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the\n", 533 | "# License for the specific language governing permissions and limitations\n", 534 | "# under the License.\n", 535 | "\n", 536 | "\\\"\\\"\\\"\n", 537 | "Given two dates and region, download N Sentinel Collections scenes from ESA\n", 538 | "Sentinel dataHUB.\n", 539 | "The downloaded Sentinel collection scenes are compatible with:\n", 540 | "S2MSI1C: Top-of-atmosphere reflectances in cartographic geometry\n", 541 | "or S2MSI2A: Bottom-of-atmosphere reflectance in cartographic geometry\n", 542 | "Parameters\n", 543 | "----------\n", 544 | "inidate: datetime.strptime(\"YYYY-MM-dd\", \"%Y-%m-%d\")\n", 545 | "enddate: datetime.strptime(\"YYYY-MM-dd\", \"%Y-%m-%d\")\n", 546 | "region: name of one reservoir saved in the \"coord_reservoirs.json\" file\n", 547 | "coordinates : dict. Coordinates of the region to search.\n", 548 | "Example: {\"W\": -2.830, \"S\": 41.820, \"E\": -2.690, \"N\": 41.910}}\n", 549 | "platform : str. Satellite to use from the Sentinel family\n", 550 | "producttype : str. 
Dataset type.\n", 551 | "cloud: int\n", 552 | "path : path\n", 553 | "Author: Daniel García Díaz\n", 554 | "Email: garciad@ifca.unican.es\n", 555 | "Institute of Physics of Cantabria (IFCA)\n", 556 | "Advanced Computing and e-Science\n", 557 | "Date: Sep 2018\n", 558 | "\\\"\\\"\\\"\n", 559 | "#imports apis\n", 560 | "import requests\n", 561 | "import os\n", 562 | "\n", 563 | "# Subfunctions\n", 564 | "from wq_sat.utils import config\n", 565 | "\"\"\"\n", 566 | "\n", 567 | "code_go = \"\"\"// +build go1.9\n", 568 | "\n", 569 | "// Copyright 2019 Microsoft Corporation\n", 570 | "//\n", 571 | "// Licensed under the Apache License, Version 2.0 (the \"License\");\n", 572 | "// you may not use this file except in compliance with the License.\n", 573 | "// You may obtain a copy of the License at\n", 574 | "//\n", 575 | "// http://www.apache.org/licenses/LICENSE-2.0\n", 576 | "//\n", 577 | "// Unless required by applicable law or agreed to in writing, software\n", 578 | "// distributed under the License is distributed on an \"AS IS\" BASIS,\n", 579 | "// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", 580 | "// See the License for the specific language governing permissions and\n", 581 | "// limitations under the License.\n", 582 | "\n", 583 | "// This code was auto-generated by:\n", 584 | "// github.com/Azure/azure-sdk-for-go/tools/profileBuilder\n", 585 | "\n", 586 | "package policyinsights\n", 587 | "\n", 588 | "import (\n", 589 | "\t\"context\"\n", 590 | "\n", 591 | "\toriginal \"github.com/Azure/azure-sdk-for-go/services/policyinsights/mgmt/2019-10-01/policyinsights\"\n", 592 | ")\n", 593 | "\"\"\"\n", 594 | "\n", 595 | "code_c = \"\"\"/*\n", 596 | " * copyright (c) 2008 - 2011 Espressif System\n", 597 | " *\n", 598 | " * Define user specified Event signals and Task priorities here\n", 599 | " *\n", 600 | " */\n", 601 | "\n", 602 | "#ifndef _ETS_SYS_H\n", 603 | "#define _ETS_SYS_H\n", 604 | "\n", 605 | "#include \"c_types.h\"\n", 606 | "#include \"eagle_soc.h\"\n", 607 | "\n", 608 | "typedef uint32_t ETSSignal;\n", 609 | "\"\"\"\n", 610 | "\n", 611 | "code_cpp = \"\"\"/* Pokemon Automation Bot Base - Client Example\n", 612 | "\n", 613 | " * \n", 614 | "\n", 615 | " * From: https://github.com/PokemonAutomation/Arduino-Source\n", 616 | "\n", 617 | " * \n", 618 | "\n", 619 | " */\n", 620 | "\n", 621 | "\n", 622 | "\n", 623 | "#include \"Common/CRC32.h\"\n", 624 | "\n", 625 | "#include \"Common/Microcontroller/MessageProtocol.h\"\n", 626 | "\n", 627 | "#include \"ClientSource/Libraries/Logging.h\"\n", 628 | "\n", 629 | "#include \"ClientSource/Libraries/MessageConverter.h\"\n", 630 | "\n", 631 | "#include \"BotBaseMessage.h\"\n", 632 | "\n", 633 | "#include \"PABotBaseConnection.h\"\n", 634 | "\n", 635 | "\n", 636 | "\n", 637 | "#include