├── .github ├── FUNDING.yml └── workflows │ ├── publish_to_pypi.yml │ └── run_unit_test.yml ├── .gitignore ├── DESCRIPTION.rst ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── __init__.py ├── requirements.txt ├── setup.cfg ├── setup.py ├── test_results └── .gitignore ├── tests ├── __init__.py ├── run_tests.py └── text_preprocessing_test.py └── text_preprocessing ├── __init__.py ├── data ├── custom_substitutions.csv └── ignore_spellcheck_words.txt └── text_preprocessing.py /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: berknology 2 | -------------------------------------------------------------------------------- /.github/workflows/publish_to_pypi.yml: -------------------------------------------------------------------------------- 1 | # This workflows will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | name: Release 5 | 6 | on: 7 | release: 8 | types: [created] 9 | 10 | jobs: 11 | deploy: 12 | 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v2 17 | - name: Set up Python 18 | uses: actions/setup-python@v1 19 | with: 20 | python-version: '3.x' 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install setuptools wheel twine 25 | - name: Build and publish 26 | env: 27 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 28 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 29 | run: | 30 | python setup.py sdist bdist_wheel 31 | twine upload dist/* 32 | -------------------------------------------------------------------------------- /.github/workflows/run_unit_test.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of 
Python 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Build 5 | 6 | on: 7 | push: 8 | branches: [ master ] 9 | pull_request: 10 | branches: [ master ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | - uses: actions/checkout@v2 19 | - name: Set up Python 3.8 20 | uses: actions/setup-python@v1 21 | with: 22 | python-version: 3.8 23 | - name: Install dependencies 24 | run: | 25 | python -m pip install --upgrade pip 26 | pip install flake8 27 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 28 | - name: Lint with flake8 29 | run: | 30 | # stop the build if there are Python syntax errors or undefined names 31 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 32 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 33 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 34 | - name: Test with unittest 35 | run: | 36 | make test 37 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | cover/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | 59 | # PyCharm 60 | .idea 61 | *.iml 62 | pycharm 63 | 64 | # Gradle 65 | .gradle 66 | 67 | # docker builds 68 | docker.properties 69 | 70 | # tests 71 | test_results/* 72 | !test_results/.gitignore 73 | 74 | # ipython 75 | .ipynb_checkpoints/ 76 | 77 | # VSCode 78 | .classpath 79 | .project 80 | .settings/ 81 | .vscode/ 82 | 83 | # virtual environment 84 | venv/ -------------------------------------------------------------------------------- /DESCRIPTION.rst: -------------------------------------------------------------------------------- 1 | ================================================== 2 | Text preprocessing for Natural Language Processing 3 | ================================================== 4 | 5 | A python package for text preprocessing task in natural language processing. 6 | 7 | Usage 8 | ----- 9 | To use this text preprocessing package, first install it using pip: 10 | 11 | .. code-block:: python 12 | 13 | pip install text-preprocessing 14 | 15 | 16 | Then, import the package in your python script and call appropriate functions: 17 | 18 | .. code-block:: python 19 | 20 | from text_preprocessing import preprocess_text 21 | from text_preprocessing import to_lower, remove_email, remove_url, remove_punctuation, lemmatize_word 22 | 23 | # Preprocess text using default preprocess functions in the pipeline 24 | text_to_process = 'Helllo, I am John Doe!!! My email is john.doe@email.com. 
Visit our website www.johndoe.com' 25 | preprocessed_text = preprocess_text(text_to_process) 26 | print(preprocessed_text) 27 | # output: hello email visit website 28 | 29 | # Preprocess text using custom preprocess functions in the pipeline 30 | preprocess_functions = [to_lower, remove_email, remove_url, remove_punctuation, lemmatize_word] 31 | preprocessed_text = preprocess_text(text_to_process, preprocess_functions) 32 | print(preprocessed_text) 33 | # output: helllo i am john doe my email is visit our website 34 | 35 | 36 | Features 37 | -------- 38 | 39 | .. csv-table:: 40 | :header: "Feature", "Function" 41 | :widths: 50, 35 42 | 43 | "convert to lower case", "to_lower" 44 | "convert to upper case", "to_upper" 45 | "keep only alphabetic and numerical characters", "keep_alpha_numeric" 46 | "check and correct spellings", "check_spelling" 47 | "expand contractions", "expand_contraction" 48 | "remove URLs", "remove_url" 49 | "remove names", "remove_name" 50 | "remove emails", "remove_email" 51 | "remove phone numbers", "remove_phone_number" 52 | "remove SSNs", "remove_ssn" 53 | "remove credit card numbers", "remove_credit_card_number" 54 | "remove numbers", "remove_number" 55 | "remove bullets and numbering", "remove_itemized_bullet_and_numbering" 56 | "remove special characters", "remove_special_character" 57 | "remove punctuations", "remove_punctuation" 58 | "remove extra whitespace", "remove_whitespace" 59 | "normalize unicode (e.g., café -> cafe)", "normalize_unicode" 60 | "remove stop words", "remove_stopword" 61 | "tokenize words", "tokenize_word" 62 | "tokenize sentences", "tokenize_sentence" 63 | "substitute custom words (e.g., vs -> versus)", "substitute_token" 64 | "stem words", "stem_word" 65 | "lemmatize words", "lemmatize_word" 66 | "preprocess text through a sequence of preprocessing functions", "preprocess_text" -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2020, He Hao 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include text_preprocessing/data/custom_substitutions.csv 2 | include text_preprocessing/data/ignore_spellcheck_words.txt 3 | 4 | recursive-include tests * -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: clean-pyc clean-build docs clean 2 | define BROWSER_PYSCRIPT 3 | import os, webbrowser, sys 4 | try: 5 | from urllib import pathname2url 6 | except: 7 | from urllib.request import pathname2url 8 | 9 | try: 10 | import bump2version 11 | except: 12 | print("Please install library bump2version by 'pip install bump2version'") 13 | 14 | webbrowser.open("file://" + pathname2url(os.path.abspath(sys.argv[1]))) 15 | endef 16 | export BROWSER_PYSCRIPT 17 | BROWSER := python -c "$$BROWSER_PYSCRIPT" 18 | 19 | help: 20 | @echo "clean - remove all build, test, coverage and Python artifacts" 21 | @echo "clean-build - remove build artifacts" 22 | @echo "clean-pyc - remove Python file artifacts" 23 | @echo "clean-test - remove test and coverage artifacts" 24 | @echo "lint - check style with flake8" 25 | @echo "coverage - check code coverage quickly with the default Python" 26 | @echo "docs - generate Sphinx HTML documentation, including API docs" 27 | @echo "release - package and upload a release" 28 | @echo "dist - build python package" 29 | @echo "install - install the package to the active Python's site-packages" 30 | @echo "bump-patch - use bump2version to bump patch version" 31 | @echo "bump-minor - use bump2version to bump minor version" 32 | @echo "bump-major - use bump2version to bump major version" 33 | @echo "test - run unit test" 34 | 35 | clean: clean-build clean-pyc clean-test 36 | 37 | clean-build: 38 | rm -fr build/ 39 | rm -fr dist/ 40 | rm -fr .eggs/ 
41 | find . -name '*.egg-info' -exec rm -fr {} + 42 | find . -name '*.egg' -exec rm -f {} + 43 | 44 | clean-pyc: 45 | find . -name '*.pyc' -exec rm -f {} + 46 | find . -name '*.pyo' -exec rm -f {} + 47 | find . -name '*~' -exec rm -f {} + 48 | find . -name '__pycache__' -exec rm -fr {} + 49 | 50 | clean-test: 51 | rm -fr .tox/ 52 | rm -f .coverage 53 | rm -fr htmlcov/ 54 | 55 | lint: 56 | flake8 pii_detector tests 57 | 58 | coverage: 59 | coverage run --source pii_detector setup.py test 60 | coverage report -m 61 | coverage html 62 | $(BROWSER) htmlcov/index.html 63 | 64 | docs: 65 | rm -f docs/pii_detector.rst 66 | rm -f docs/modules.rst 67 | sphinx-apidoc -o docs/ pii_detector 68 | $(MAKE) -C docs clean 69 | $(MAKE) -C docs html 70 | $(BROWSER) docs/_build/html/index.html 71 | 72 | servedocs: docs 73 | watchmedo shell-command -p '*.rst' -c '$(MAKE) -C docs html' -R -D . 74 | 75 | dist: 76 | python setup.py bdist_wheel 77 | ls -l dist 78 | 79 | release: dist 80 | twine upload dist/* 81 | 82 | install: clean 83 | python setup.py install 84 | 85 | bump-patch: 86 | bump2version patch 87 | 88 | bump-minor: 89 | bump2version minor 90 | 91 | bump-major: 92 | bump2version major 93 | 94 | test: 95 | python -m tests.run_tests -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Text preprocessing for Natural Language Processing 2 | ============= 3 | 4 | ![Build](https://github.com/berknology/text-preprocessing/workflows/Build/badge.svg) 5 | ![Release](https://github.com/berknology/text-preprocessing/workflows/Release/badge.svg) 6 | ![PyPi](https://img.shields.io/pypi/v/text-preprocessing.svg) 7 | 8 | 9 | A python package for text preprocessing task in natural language processing. 
10 | 11 | Usage 12 | -------- 13 | To use this text preprocessing package, first install it using pip: 14 | ```bash 15 | pip install text-preprocessing 16 | ``` 17 | 18 | Then, import the package in your python script and call appropriate functions: 19 | 20 | ```python 21 | from text_preprocessing import preprocess_text 22 | from text_preprocessing import to_lower, remove_email, remove_url, remove_punctuation, lemmatize_word 23 | 24 | # Preprocess text using default preprocess functions in the pipeline 25 | text_to_process = 'Helllo, I am John Doe!!! My email is john.doe@email.com. Visit our website www.johndoe.com' 26 | preprocessed_text = preprocess_text(text_to_process) 27 | print(preprocessed_text) 28 | # output: hello email visit website 29 | 30 | # Preprocess text using custom preprocess functions in the pipeline 31 | preprocess_functions = [to_lower, remove_email, remove_url, remove_punctuation, lemmatize_word] 32 | preprocessed_text = preprocess_text(text_to_process, preprocess_functions) 33 | print(preprocessed_text) 34 | # output: helllo i am john doe my email is visit our website 35 | ``` 36 | 37 | If you have a lot of data to preprocess, and would like to run text preprocessing in a parallel manner in PySpark on 38 | Databricks, please use the following udf function: 39 | ```python 40 | from text_preprocessing import preprocess_text 41 | from pyspark.sql.functions import udf 42 | from pyspark.sql.types import StringType 43 | from pyspark.sql import DataFrame as SparkDataFrame 44 | 45 | 46 | def preprocess_text_spark(df: SparkDataFrame, 47 | target_column: str, 48 | preprocessed_column_name: str = 'preprocessed_text' 49 | ) -> SparkDataFrame: 50 | """ Preprocess text in a column of a PySpark DataFrame by leveraging PySpark UDF to preprocess text in parallel """ 51 | _preprocess_text = udf(preprocess_text, StringType()) 52 | new_df = df.withColumn(preprocessed_column_name, _preprocess_text(df[target_column])) 53 | return new_df 54 | ``` 55 | 56 | Features 
57 | -------- 58 | 59 | | Feature | Function | 60 | | :------------------------------------------------------------ |:------------------------------------- | 61 | | convert to lower case | to_lower | 62 | | convert to upper case | to_upper | 63 | | keep only alphabetic and numerical characters | keep_alpha_numeric | 64 | | check and correct spellings | check_spelling | 65 | | expand contractions | expand_contraction | 66 | | remove URLs | remove_url | 67 | | remove names | remove_name | 68 | | remove emails | remove_email | 69 | | remove phone numbers | remove_phone_number | 70 | | remove SSNs | remove_ssn | 71 | | remove credit card numbers | remove_credit_card_number | 72 | | remove numbers | remove_number | 73 | | remove bullets and numbering | remove_itemized_bullet_and_numbering | 74 | | remove special characters | remove_special_character | 75 | | remove punctuations | remove_punctuation | 76 | | remove extra whitespace | remove_whitespace | 77 | | normalize unicode (e.g., café -> cafe) | normalize_unicode | 78 | | remove stop words | remove_stopword | 79 | | tokenize words | tokenize_word | 80 | | tokenize sentences | tokenize_sentence | 81 | | substitute custom words (e.g., vs -> versus) | substitute_token | 82 | | stem words | stem_word | 83 | | lemmatize words | lemmatize_word | 84 | | preprocess text through a sequence of preprocessing functions | preprocess_text | 85 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.1.1' -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | nltk 2 | pyspellchecker 3 | contractions 4 | names-dataset==2.1 5 | # For unit test 6 | unittest-xml-reporting -------------------------------------------------------------------------------- /setup.cfg: 
-------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 0.1.1 3 | commit = True 4 | tag = True 5 | 6 | [bumpversion:file:setup.py] 7 | search = version='{current_version}' 8 | replace = version='{new_version}' 9 | 10 | [bumpversion:file:__init__.py] 11 | search = __version__ = '{current_version}' 12 | replace = __version__ = '{new_version}' 13 | 14 | [bumpversion:file:text_preprocessing/__init__.py] 15 | search = __version__ = '{current_version}' 16 | replace = __version__ = '{new_version}' 17 | 18 | [bdist_wheel] 19 | universal = 1 20 | 21 | [flake8] 22 | exclude = docs 23 | 24 | [aliases] 25 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | 4 | def parse_requirements(fn): 5 | with open(fn) as f: 6 | return [req for req in f.read().strip().split('\n') if "#" not in req] 7 | 8 | 9 | parsed_requirements = parse_requirements( 10 | 'requirements.txt', 11 | ) 12 | 13 | parsed_test_requirements = parse_requirements( 14 | 'requirements.txt', 15 | ) 16 | 17 | requirements = [str(ir) for ir in parsed_requirements] 18 | test_requirements = [str(tr) for tr in parsed_test_requirements] 19 | 20 | 21 | with open('DESCRIPTION.rst') as description_file: 22 | description = description_file.read() 23 | 24 | 25 | setup( 26 | name='text_preprocessing', 27 | version='0.1.1', 28 | description="A python package for text preprocessing task in natural language processing", 29 | long_description=description, 30 | url='https://github.com/berknology/text-preprocessing', 31 | license="BSD license", 32 | author="He Hao", 33 | author_email='berknology@gmail.com', 34 | packages=find_packages(include=['text_preprocessing', 'text_preprocessing.*']), 35 | include_package_data=True, 36 | install_requires=requirements, 37 | zip_safe=False, 38 | 
keywords='NLP', 39 | classifiers=[ 40 | 'Development Status :: 2 - Pre-Alpha', 41 | 'Intended Audience :: Developers', 42 | 'License :: OSI Approved :: BSD License', 43 | 'Natural Language :: English', 44 | 'Programming Language :: Python :: 3.7', 45 | 'Programming Language :: Python :: 3.8', 46 | ], 47 | test_suite='tests', 48 | tests_require=test_requirements 49 | ) 50 | 51 | -------------------------------------------------------------------------------- /test_results/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berknology/text-preprocessing/59351e5f4adc510b7063faee0376bc194790a82c/test_results/.gitignore -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tests/run_tests.py: -------------------------------------------------------------------------------- 1 | # Standard libraries 2 | import unittest 3 | 4 | # Third party libraries 5 | import xmlrunner 6 | 7 | 8 | if __name__ == '__main__': 9 | # Only run tests in packages contain an __init__.py file 10 | unittest.main(module=None, 11 | testRunner=xmlrunner.XMLTestRunner(output="test_results"), 12 | failfast=False, 13 | buffer=False, 14 | catchbreak=False, 15 | argv=["", "discover", "-p", "*test*.py"] 16 | ) 17 | -------------------------------------------------------------------------------- /tests/text_preprocessing_test.py: -------------------------------------------------------------------------------- 1 | # Standard libraries 2 | from unittest import TestCase 3 | from unittest.mock import patch, MagicMock 4 | 5 | # Project code 6 | from text_preprocessing import (to_lower, to_upper, remove_number, remove_url, remove_punctuation, 7 | remove_special_character, keep_alpha_numeric, remove_whitespace, 
expand_contraction, 8 | normalize_unicode, remove_stopword, remove_email, remove_phone_number, remove_ssn, 9 | remove_credit_card_number, remove_name, check_spelling, substitute_token, 10 | remove_itemized_bullet_and_numbering) 11 | from text_preprocessing import preprocess_text 12 | 13 | 14 | class TestTextPreprocessing(TestCase): 15 | 16 | def test_to_lower(self): 17 | # Setup 18 | input_text = 'HellO' 19 | expected_output = 'hello' 20 | # Actual call 21 | output_text = to_lower(input_text) 22 | # Asserts 23 | self.assertEqual(output_text, expected_output) 24 | 25 | def test_to_lower_lower_input(self): 26 | # Setup 27 | input_text = 'hello' 28 | expected_output = 'hello' 29 | # Actual call 30 | output_text = to_lower(input_text) 31 | # Asserts 32 | self.assertEqual(output_text, expected_output) 33 | 34 | def test_to_lower_upper_input(self): 35 | # Setup 36 | input_text = 'HELLO' 37 | expected_output = 'hello' 38 | # Actual call 39 | output_text = to_lower(input_text) 40 | # Asserts 41 | self.assertEqual(output_text, expected_output) 42 | 43 | def test_to_lower_none(self): 44 | # Setup 45 | input_text = None 46 | expected_output = '' 47 | # Actual call 48 | output_text = to_lower(input_text) 49 | # Asserts 50 | self.assertEqual(output_text, expected_output) 51 | 52 | def test_to_lower_empty_input(self): 53 | # Setup 54 | input_text = '' 55 | expected_output = '' 56 | # Actual call 57 | output_text = to_lower(input_text) 58 | # Asserts 59 | self.assertEqual(output_text, expected_output) 60 | 61 | def test_to_upper(self): 62 | # Setup 63 | input_text = 'HellO' 64 | expected_output = 'HELLO' 65 | # Actual call 66 | output_text = to_upper(input_text) 67 | # Asserts 68 | self.assertEqual(output_text, expected_output) 69 | 70 | def test_to_upper_lower_input(self): 71 | # Setup 72 | input_text = 'hello' 73 | expected_output = 'HELLO' 74 | # Actual call 75 | output_text = to_upper(input_text) 76 | # Asserts 77 | self.assertEqual(output_text, expected_output) 78 | 79 | def 
test_to_upper_upper_input(self): 80 | # Setup 81 | input_text = 'HELLO' 82 | expected_output = 'HELLO' 83 | # Actual call 84 | output_text = to_upper(input_text) 85 | # Asserts 86 | self.assertEqual(output_text, expected_output) 87 | 88 | def test_to_upper_none(self): 89 | # Setup 90 | input_text = None 91 | expected_output = '' 92 | # Actual call 93 | output_text = to_upper(input_text) 94 | # Asserts 95 | self.assertEqual(output_text, expected_output) 96 | 97 | def test_to_upper_empty_input(self): 98 | # Setup 99 | input_text = '' 100 | expected_output = '' 101 | # Actual call 102 | output_text = to_upper(input_text) 103 | # Asserts 104 | self.assertEqual(output_text, expected_output) 105 | 106 | def test_remove_number(self): 107 | # Setup 108 | input_text = 'HellO123' 109 | expected_output = 'HellO' 110 | # Actual call 111 | output_text = remove_number(input_text) 112 | # Asserts 113 | self.assertEqual(output_text, expected_output) 114 | 115 | def test_remove_number_no_number(self): 116 | # Setup 117 | input_text = 'HellO!.' 118 | expected_output = 'HellO!.' 
119 | # Actual call 120 | output_text = remove_number(input_text) 121 | # Asserts 122 | self.assertEqual(output_text, expected_output) 123 | 124 | def test_remove_number_all_number(self): 125 | # Setup 126 | input_text = '987123' 127 | expected_output = '' 128 | # Actual call 129 | output_text = remove_number(input_text) 130 | # Asserts 131 | self.assertEqual(output_text, expected_output) 132 | 133 | def test_remove_number_none(self): 134 | # Setup 135 | input_text = None 136 | expected_output = '' 137 | # Actual call 138 | output_text = remove_number(input_text) 139 | # Asserts 140 | self.assertEqual(output_text, expected_output) 141 | 142 | def test_remove_number_empty_input(self): 143 | # Setup 144 | input_text = '' 145 | expected_output = '' 146 | # Actual call 147 | output_text = remove_number(input_text) 148 | # Asserts 149 | self.assertEqual(output_text, expected_output) 150 | 151 | def test_remove_itemized_bullet_and_numbering(self): 152 | # Setup 153 | input_text = 'My comments: 1) blah blah, 2. blah blah. III) blah blah; iv) blah blah, (d) blah blah' 154 | expected_output = 'My comments: blah blah, blah blah. blah blah; blah blah, blah blah' 155 | # Actual call 156 | output_text = remove_itemized_bullet_and_numbering(input_text) 157 | # Asserts 158 | self.assertEqual(output_text, expected_output) 159 | 160 | def test_remove_itemized_bullet_and_numbering_no_bullet_or_numbering(self): 161 | # Setup 162 | input_text = 'hello, this is a test. ' 163 | expected_output = 'hello, this is a test. ' 164 | # Actual call 165 | output_text = remove_itemized_bullet_and_numbering(input_text) 166 | # Asserts 167 | self.assertEqual(output_text, expected_output) 168 | 169 | def test_remove_itemized_bullet_and_numbering_all_bullets_and_numberings(self): 170 | # Setup 171 | input_text = ' 1) test 2. test. (3) test a) test (b) test E) test. (F) test. (i) a vx) b IV. c' 172 | expected_output = ' test test. test test test test. test. 
a b c' 173 | # Actual call 174 | output_text = remove_itemized_bullet_and_numbering(input_text) 175 | # Asserts 176 | self.assertEqual(output_text, expected_output) 177 | 178 | def test_remove_itemized_bullet_and_numbering_none(self): 179 | # Setup 180 | input_text = None 181 | expected_output = '' 182 | # Actual call 183 | output_text = remove_itemized_bullet_and_numbering(input_text) 184 | # Asserts 185 | self.assertEqual(output_text, expected_output) 186 | 187 | def test_remove_itemized_bullet_and_numbering_empty_input(self): 188 | # Setup 189 | input_text = '' 190 | expected_output = '' 191 | # Actual call 192 | output_text = remove_itemized_bullet_and_numbering(input_text) 193 | # Asserts 194 | self.assertEqual(output_text, expected_output) 195 | 196 | def test_remove_url(self): 197 | # Setup 198 | input_text = 'my address is www.microsoft.com https://www.microsoft.com' 199 | expected_output = 'my address is ' 200 | # Actual call 201 | output_text = remove_url(input_text) 202 | # Asserts 203 | self.assertEqual(output_text, expected_output) 204 | 205 | def test_remove_url_no_url(self): 206 | # Setup 207 | input_text = 'my address is www.microsoft.com https://www.microsoft.com' 208 | expected_output = 'my address is ' 209 | # Actual call 210 | output_text = remove_url(input_text) 211 | # Asserts 212 | self.assertEqual(output_text, expected_output) 213 | 214 | def test_remove_url_all_url(self): 215 | # Setup 216 | input_text = 'www.microsoft.com https://www.microsoft.com' 217 | expected_output = ' ' 218 | # Actual call 219 | output_text = remove_url(input_text) 220 | # Asserts 221 | self.assertEqual(output_text, expected_output) 222 | 223 | def test_remove_url_none(self): 224 | # Setup 225 | input_text = None 226 | expected_output = '' 227 | # Actual call 228 | output_text = remove_url(input_text) 229 | # Asserts 230 | self.assertEqual(output_text, expected_output) 231 | 232 | def test_remove_url_empty_input(self): 233 | # Setup 234 | input_text = '' 235 | 
expected_output = '' 236 | # Actual call 237 | output_text = remove_url(input_text) 238 | # Asserts 239 | self.assertEqual(output_text, expected_output) 240 | 241 | def test_remove_punctuation(self): 242 | # Setup 243 | input_text = 'Hello!!! Welcome.' 244 | expected_output = 'Hello Welcome' 245 | # Actual call 246 | output_text = remove_punctuation(input_text) 247 | # Asserts 248 | self.assertEqual(output_text, expected_output) 249 | 250 | def test_remove_punctuation_no_punctuations(self): 251 | # Setup 252 | input_text = 'Hello world' 253 | expected_output = 'Hello world' 254 | # Actual call 255 | output_text = remove_punctuation(input_text) 256 | # Asserts 257 | self.assertEqual(output_text, expected_output) 258 | 259 | def test_remove_punctuation_all_punctuations(self): 260 | # Setup 261 | input_text = '!"#$%&\'()*+,-./:;<=>?@[\\]^_{|}~' 262 | expected_output = '' 263 | # Actual call 264 | output_text = remove_punctuation(input_text) 265 | # Asserts 266 | self.assertEqual(output_text, expected_output) 267 | 268 | def test_remove_punctuation_none(self): 269 | # Setup 270 | input_text = None 271 | expected_output = '' 272 | # Actual call 273 | output_text = remove_punctuation(input_text) 274 | # Asserts 275 | self.assertEqual(output_text, expected_output) 276 | 277 | def test_remove_punctuation_empty_input(self): 278 | # Setup 279 | input_text = '' 280 | expected_output = '' 281 | # Actual call 282 | output_text = remove_punctuation(input_text) 283 | # Asserts 284 | self.assertEqual(output_text, expected_output) 285 | 286 | def test_remove_special_character(self): 287 | # Setup 288 | input_text = 'Hello 弫 Welcome.' 289 | expected_output = 'Hello Welcome.' 
290 | # Actual call 291 | output_text = remove_special_character(input_text) 292 | # Asserts 293 | self.assertEqual(output_text, expected_output) 294 | 295 | def test_remove_special_character_no_special_characters(self): 296 | # Setup 297 | input_text = 'Hello world' 298 | expected_output = 'Hello world' 299 | # Actual call 300 | output_text = remove_special_character(input_text) 301 | # Asserts 302 | self.assertEqual(output_text, expected_output) 303 | 304 | def test_remove_special_character_all_special_characters(self): 305 | # Setup 306 | input_text = '弫¥ª°©ð±§µæ¹¢³¿®ä£' 307 | expected_output = '' 308 | # Actual call 309 | output_text = remove_special_character(input_text) 310 | # Asserts 311 | self.assertEqual(output_text, expected_output) 312 | 313 | def test_remove_special_character_none(self): 314 | # Setup 315 | input_text = None 316 | expected_output = '' 317 | # Actual call 318 | output_text = remove_special_character(input_text) 319 | # Asserts 320 | self.assertEqual(output_text, expected_output) 321 | 322 | def test_remove_special_character_empty_input(self): 323 | # Setup 324 | input_text = '' 325 | expected_output = '' 326 | # Actual call 327 | output_text = remove_special_character(input_text) 328 | # Asserts 329 | self.assertEqual(output_text, expected_output) 330 | 331 | def test_keep_alpha_numeric(self): 332 | # Setup 333 | input_text = 'Hello1 弫µæ Welcome2.' 
334 | expected_output = 'Hello1å¼µæWelcome2' 335 | # Actual call 336 | output_text = keep_alpha_numeric(input_text) 337 | # Asserts 338 | self.assertEqual(output_text, expected_output) 339 | 340 | def test_keep_alpha_numeric_no_alphanumeric(self): 341 | # Setup 342 | input_text = '!.,*&^' 343 | expected_output = '' 344 | # Actual call 345 | output_text = keep_alpha_numeric(input_text) 346 | # Asserts 347 | self.assertEqual(output_text, expected_output) 348 | 349 | def test_keep_alpha_numeric_none(self): 350 | # Setup 351 | input_text = None 352 | expected_output = '' 353 | # Actual call 354 | output_text = keep_alpha_numeric(input_text) 355 | # Asserts 356 | self.assertEqual(output_text, expected_output) 357 | 358 | def test_keep_alpha_numeric_empty_input(self): 359 | # Setup 360 | input_text = '' 361 | expected_output = '' 362 | # Actual call 363 | output_text = keep_alpha_numeric(input_text) 364 | # Asserts 365 | self.assertEqual(output_text, expected_output) 366 | 367 | def test_remove_whitespace(self): 368 | # Setup 369 | input_text = ' Hello Welcome. ' 370 | expected_output = 'Hello Welcome.' 371 | # Actual call 372 | output_text = remove_whitespace(input_text) 373 | # Asserts 374 | self.assertEqual(output_text, expected_output) 375 | 376 | def test_remove_whitespace_strip(self): 377 | # Setup 378 | input_text = ' Hello Welcome. ' 379 | expected_output = 'Hello Welcome.' 380 | # Actual call 381 | output_text = remove_whitespace(input_text, remove_duplicate_whitespace=False) 382 | # Asserts 383 | self.assertEqual(output_text, expected_output) 384 | 385 | def test_remove_whitespace_no_whitespace(self): 386 | # Setup 387 | input_text = 'Helloworld...' 388 | expected_output = 'Helloworld...' 
def test_remove_whitespace_all_whitespace(self):
    # Whitespace-only input is expected to collapse to the empty string.
    self.assertEqual(remove_whitespace(' '), '')

def test_remove_whitespace_none(self):
    # None is expected to map to the empty string.
    self.assertEqual(remove_whitespace(None), '')

def test_remove_whitespace_empty_input(self):
    # The empty string is expected to stay empty.
    self.assertEqual(remove_whitespace(''), '')

def test_expand_contraction(self):
    # A single contraction is expected to be expanded.
    contracted = "This isn't a test"
    self.assertEqual(expand_contraction(contracted), 'This is not a test')

def test_expand_contraction_no_contraction(self):
    # Text without contractions must come back unchanged.
    self.assertEqual(expand_contraction('Hello world'), 'Hello world')

def test_expand_contraction_all_contractions(self):
    # Several contractions in one string.
    contracted = "cannot isn't ain't couldn't"
    expanded = expand_contraction(contracted)
    self.assertEqual(expanded, 'cannot is not are not could not')

def test_expand_contraction_none(self):
    # None is expected to map to the empty string.
    self.assertEqual(expand_contraction(None), '')
def test_expand_contraction_empty_input(self):
    # The empty string is expected to stay empty.
    self.assertEqual(expand_contraction(''), '')

def test_normalize_unicode(self):
    # The accented letter is expected to be replaced by its plain form.
    accented = "I love this Café"
    self.assertEqual(normalize_unicode(accented), 'I love this Cafe')

def test_normalize_unicode_no_special_unicode(self):
    # Plain ASCII text must come back unchanged.
    self.assertEqual(normalize_unicode('This is a test'), 'This is a test')

def test_normalize_unicode_all_special_unicode(self):
    # Every character carries a diacritic.
    self.assertEqual(normalize_unicode('áñó'), 'ano')

def test_normalize_unicode_none(self):
    # None is expected to map to the empty string.
    self.assertEqual(normalize_unicode(None), '')

def test_normalize_unicode_empty_input(self):
    # The empty string is expected to stay empty.
    self.assertEqual(normalize_unicode(''), '')

def test_remove_stopword(self):
    # String input: the result is expected to be a token list without stop words.
    sentence = "This is a test!"
    self.assertListEqual(remove_stopword(sentence), ['This', 'test', '!'])
def test_remove_stopword_no_stopword(self):
    # No stop words present: all tokens are expected to survive.
    tokens = remove_stopword('Hello World.')
    self.assertListEqual(tokens, ['Hello', 'World', '.'])

def test_remove_stopword_all_stopwords(self):
    # Only stop words present: the expected result is an empty list.
    self.assertListEqual(remove_stopword('the a your my his her'), [])

def test_remove_stopword_none(self):
    # None is expected to map to an empty list.
    self.assertListEqual(remove_stopword(None), [])

def test_remove_stopword_empty_input(self):
    # The empty string is expected to map to an empty list.
    self.assertListEqual(remove_stopword(''), [])

def test_remove_email(self):
    # Only the address itself is expected to be removed; the final '.' stays.
    message = "Please email me at john.doe@email.com."
    self.assertEqual(remove_email(message), "Please email me at .")

def test_remove_email_no_email(self):
    # A phone number must not be mistaken for an e-mail address.
    message = "Please call me (425) 425-1234."
    self.assertEqual(remove_email(message), "Please call me (425) 425-1234.")
def test_remove_email_all_emails(self):
    # Three addresses: only the separators are expected to remain.
    addresses = 'john.doe@email.com, john.doe@microsoft.com, janedoe@gmail.com'
    self.assertEqual(remove_email(addresses), ', , ')

def test_remove_email_none(self):
    # None is expected to map to the empty string.
    self.assertEqual(remove_email(None), '')

def test_remove_email_empty_input(self):
    # The empty string is expected to stay empty.
    self.assertEqual(remove_email(''), '')

def test_remove_phone_number(self):
    # A single phone number embedded in a sentence.
    message = "Please call me at (425) 538-0116."
    self.assertEqual(remove_phone_number(message), "Please call me at.")
def test_remove_phone_number_no_phone(self):
    # Text without a phone number must come back unchanged.
    self.assertEqual(remove_phone_number("Please email me"), "Please email me")

def test_remove_phone_number_all_phones(self):
    # Several phone-number formats in one string.
    numbers = '(425) 538-1234, (425)5381234, 4255381234 425-538-1234, 425.538.1234, +1 425-538-1234'
    self.assertEqual(remove_phone_number(numbers), ',,,, ')

def test_remove_phone_number_none(self):
    # None is expected to map to the empty string.
    self.assertEqual(remove_phone_number(None), '')

def test_remove_phone_number_empty_input(self):
    # The empty string is expected to stay empty.
    self.assertEqual(remove_phone_number(''), '')

def test_remove_ssn(self):
    # A single SSN at the end of a sentence.
    message = "My social security is 770-12-3456"
    self.assertEqual(remove_ssn(message), "My social security is ")

def test_remove_ssn_no_ssn(self):
    # Text without an SSN must come back unchanged.
    self.assertEqual(remove_ssn("Hello world!"), "Hello world!")
def test_remove_ssn_all_ssns(self):
    # Dashed and undashed SSNs: only the separators are expected to remain.
    ssns = '574-76-3766, 664-20-8576, 481-94-4099, 585-60-3079, 541714785'
    self.assertEqual(remove_ssn(ssns), ', , , , ')

def test_remove_ssn_none(self):
    # None is expected to map to the empty string.
    self.assertEqual(remove_ssn(None), '')

def test_remove_ssn_empty_input(self):
    # The empty string is expected to stay empty.
    self.assertEqual(remove_ssn(''), '')

def test_remove_credit_card_number(self):
    # A single card number at the end of a sentence.
    message = "Please refund me 5116937367451492"
    self.assertEqual(remove_credit_card_number(message), "Please refund me ")

def test_remove_credit_card_number_no_credit_card_number(self):
    # Text without a card number must come back unchanged.
    self.assertEqual(remove_credit_card_number("Hello world!"), "Hello world!")
def test_remove_credit_card_number_all_credit_card_numbers(self):
    # Several card numbers: only the separators are expected to remain.
    cards = '379524231139785, 5592621143924294, 6011167500016424, 4500339642915036, 4979770613611'
    self.assertEqual(remove_credit_card_number(cards), ', , , , ')

def test_remove_credit_card_number_none(self):
    # None is expected to map to the empty string.
    self.assertEqual(remove_credit_card_number(None), '')

def test_remove_credit_card_number_empty_input(self):
    # The empty string is expected to stay empty.
    self.assertEqual(remove_credit_card_number(''), '')

def test_remove_name(self):
    # The result is expected to be the token list without the personal name.
    sentence = "My name is Lionel Messi"
    self.assertListEqual(remove_name(sentence), ['My', 'name', 'is'])

def test_remove_name_no_name(self):
    # No names present: all tokens are expected to survive.
    self.assertListEqual(remove_name('Hello World.'), ['Hello', 'World', '.'])
def test_remove_name_all_names(self):
    # Only names present: the expected result is an empty list.
    names = 'Paul Allen John Doe Jane Doe Lebron James'
    self.assertListEqual(remove_name(names), [])

def test_remove_name_none(self):
    # None is expected to map to an empty list.
    self.assertListEqual(remove_name(None), [])

def test_remove_name_empty_input(self):
    # The empty string is expected to map to an empty list.
    self.assertListEqual(remove_name(''), [])

def test_check_spelling(self):
    # String input with one misspelled word.
    self.assertEqual(check_spelling("Helloo world"), "hello world")

def test_check_spelling_list(self):
    # Token-list input with one misspelled word; the result is a string.
    self.assertEqual(check_spelling(["Helloo", "world"]), "hello world")

def test_check_spelling_no_spelling_error(self):
    # Correctly spelled string input.
    self.assertEqual(check_spelling("Hello world!"), "hello world !")
def test_check_spelling_no_spelling_error_list(self):
    # Correctly spelled token-list input.
    self.assertEqual(check_spelling(["hello", "world"]), "hello world")

def test_check_spelling_all_errors(self):
    # Every word except 'to' is misspelled (string input).
    misspelled = 'Helllo worlld nicee to meeet'
    self.assertEqual(check_spelling(misspelled), 'hello world nice to meet')

def test_check_spelling_all_errors_list(self):
    # Every word except 'to' is misspelled (token-list input).
    misspelled = ['Helllo', 'worlld', 'nicee', 'to', 'meeet']
    self.assertEqual(check_spelling(misspelled), 'hello world nice to meet')

def test_check_spelling_none(self):
    # None is expected to map to the empty string.
    self.assertEqual(check_spelling(None), '')

def test_check_spelling_empty_input(self):
    # The empty string is expected to stay empty.
    self.assertEqual(check_spelling(''), '')

def test_check_spelling_empty_list_input(self):
    # An empty token list is expected to map to the empty string.
    self.assertEqual(check_spelling([]), '')

def test_substitute_token(self):
    # One token has a custom substitution.
    tokens = ['hello', 'world', 'msft']
    self.assertListEqual(substitute_token(tokens), ['hello', 'world', 'Microsoft'])
def test_substitute_token_no_custom_token(self):
    # No token has a substitution: the list must come back unchanged.
    self.assertListEqual(substitute_token(['hello', 'world']), ['hello', 'world'])

def test_substitute_token_all_custom_tokens(self):
    # Every token has a custom substitution.
    abbreviations = ['fyi', 'btw', 'apr', 'mon']
    substituted = substitute_token(abbreviations)
    self.assertListEqual(substituted, ['for your information', 'by the way', 'April', 'Monday'])

def test_substitute_token_none_input(self):
    # None is expected to map to an empty list.
    self.assertListEqual(substitute_token(None), [])

def test_substitute_token_empty_list_input(self):
    # An empty list is expected to stay empty.
    self.assertListEqual(substitute_token([]), [])

@patch("text_preprocessing.text_preprocessing.to_lower", autospec=True)
@patch("text_preprocessing.text_preprocessing.remove_url", autospec=True)
@patch("text_preprocessing.text_preprocessing.remove_email", autospec=True)
@patch("text_preprocessing.text_preprocessing.remove_phone_number", autospec=True)
@patch("text_preprocessing.text_preprocessing.remove_itemized_bullet_and_numbering", autospec=True)
@patch("text_preprocessing.text_preprocessing.expand_contraction", autospec=True)
@patch("text_preprocessing.text_preprocessing.check_spelling", autospec=True)
@patch("text_preprocessing.text_preprocessing.remove_special_character", autospec=True)
@patch("text_preprocessing.text_preprocessing.remove_punctuation", autospec=True)
@patch("text_preprocessing.text_preprocessing.remove_whitespace", autospec=True)
@patch("text_preprocessing.text_preprocessing.normalize_unicode", autospec=True)
@patch("text_preprocessing.text_preprocessing.remove_stopword", autospec=True)
@patch("text_preprocessing.text_preprocessing.remove_name", autospec=True)
@patch("text_preprocessing.text_preprocessing.substitute_token", autospec=True)
@patch("text_preprocessing.text_preprocessing.lemmatize_word", autospec=True)
def test_preprocess_text(self,
                         mock_lemmatize_word: MagicMock,
                         mock_substitute_token: MagicMock,
                         mock_remove_name: MagicMock,
                         mock_remove_stopword: MagicMock,
                         mock_normalize_unicode: MagicMock,
                         mock_remove_whitespace: MagicMock,
                         mock_remove_punctuation: MagicMock,
                         mock_remove_special_character: MagicMock,
                         mock_check_spelling: MagicMock,
                         mock_expand_contraction: MagicMock,
                         mock_remove_itemized_bullet_and_numbering: MagicMock,
                         mock_remove_phone_number: MagicMock,
                         mock_remove_email: MagicMock,
                         mock_remove_url: MagicMock,
                         mock_to_lower: MagicMock):
    # patch decorators are applied bottom-up, so the mock for the decorator
    # closest to the function (lemmatize_word) arrives first.
    preprocess_text('a test')
    # With no pipeline argument, every patched step must run exactly once.
    # The checks run in the same order as the decorators are listed.
    patched_steps = (
        mock_to_lower,
        mock_remove_url,
        mock_remove_email,
        mock_remove_phone_number,
        mock_remove_itemized_bullet_and_numbering,
        mock_expand_contraction,
        mock_check_spelling,
        mock_remove_special_character,
        mock_remove_punctuation,
        mock_remove_whitespace,
        mock_normalize_unicode,
        mock_remove_stopword,
        mock_remove_name,
        mock_substitute_token,
        mock_lemmatize_word,
    )
    for step in patched_steps:
        step.assert_called_once()
@patch("text_preprocessing.text_preprocessing.to_lower", autospec=True)
@patch("text_preprocessing.text_preprocessing.remove_url", autospec=True)
@patch("text_preprocessing.text_preprocessing.remove_email", autospec=True)
@patch("text_preprocessing.text_preprocessing.remove_phone_number", autospec=True)
def test_preprocess_text_custom(self,
                                mock_remove_phone_number: MagicMock, mock_remove_email: MagicMock,
                                mock_remove_url: MagicMock, mock_to_lower: MagicMock):
    # A caller-supplied pipeline made of the four mocks; each must run once.
    custom_pipeline = [mock_to_lower, mock_remove_url, mock_remove_email, mock_remove_phone_number]
    preprocess_text('a test', custom_pipeline)
    for step in custom_pipeline:
        step.assert_called_once()

def test_preprocess_text_integration_a(self):
    # End-to-end run through the default pipeline (no mocks).
    raw_text = ('Helllo, I am John Doe!!! My email is john.doe@email.com. Please visit my website '
                'www.johndoe.com ')
    self.assertEqual(preprocess_text(raw_text), 'hello email please visit website')

def test_preprocess_text_integration_custom(self):
    # End-to-end run through a four-step custom pipeline (no mocks).
    raw_text = 'Helllo, I am John Doe!!! My email is john.doe@email.com. Visit my website www.johndoe.com '
    steps = [to_lower, remove_url, remove_email, remove_punctuation]
    cleaned = preprocess_text(raw_text, steps)
    self.assertEqual(cleaned, 'helllo i am john doe my email is visit my website ')
officer 32 | cfo,chief financial officer 33 | cio,chief information officer 34 | cmo,chief marketing officer 35 | cms,content management system 36 | cob,close of business 37 | combo,combination 38 | comm,communication 39 | comms,communications 40 | compat,compatible 41 | config,configuration 42 | configs,configurations 43 | coo,chief operating officer 44 | crm,customer relationship management 45 | css,cascading style sheet 46 | cta,call to action 47 | cto,chief technology officer 48 | ctr,click through rate 49 | da,data analyst 50 | db,database 51 | de,data engineer 52 | dec,December 53 | demos,demonstrations 54 | dept,department 55 | devs,developers 56 | dissat,dissatisfaction 57 | distro,distribution 58 | doc,document 59 | docs,documents 60 | droid,Android 61 | ds,data science 62 | dunno,do not know 63 | edu,education 64 | eg,for example 65 | eod,end of day 66 | eow,end of week 67 | eps,earnings per share 68 | esp,especially 69 | eta,estimated time of arrival 70 | exec,executive 71 | execs,executives 72 | exp,experience 73 | f2f,face to face 74 | face2face,face to face 75 | fb,FaceBook 76 | feb,February 77 | fifo,first in first out 78 | fri,Friday 79 | fte,full time employee 80 | ftes,full time employees 81 | fyi,for your information 82 | gen,generation 83 | goog,Google 84 | gov,government 85 | govt,government 86 | gui,graphical user interface 87 | hq,headquarter 88 | hr,human resources 89 | hrs,hours 90 | iaas,infrastructure as a service 91 | im,instant messaging 92 | imho,in my opinion 93 | imo,in my opinion 94 | incl,including 95 | info,information 96 | infra,infrastructure 97 | intro,introduction 98 | ipo,initial public offering 99 | isp,internet service provider 100 | jan,January 101 | jul,July 102 | jun,June 103 | kinda,kind of 104 | kpi,key performance indicator 105 | kpis,key performance indicators 106 | legit,legitimate 107 | lifo,last in first out 108 | mar,March 109 | mfa,multi factor authentication 110 | mgt,management 111 | mgmt,management 112 | 
mgr,manager 113 | mgrs,managers 114 | mic,microphone 115 | mics,microphones 116 | mins,minutes 117 | mkt,market 118 | ml,machine learning 119 | mon,Monday 120 | msft,Microsoft 121 | msg,message 122 | msgs,messages 123 | mtg,meeting 124 | mtng,meeting 125 | mtgs,meetings 126 | mtngs,meetings 127 | neo,new employee orientation 128 | nflx,Netflix 129 | nov,November 130 | o365,Office365 131 | oct,October 132 | oof,out of office 133 | ooo,out of office 134 | org,organization 135 | orgs,organizations 136 | paas,platform as a service 137 | pbi,PowerBI 138 | pbix,PowerBI 139 | pc,personal computer 140 | pcard,purchase card 141 | pcs,personal computers 142 | pic,picture 143 | pics,pictures 144 | pkg,package 145 | pls,please 146 | plz,please 147 | pm,project manager 148 | poc,proof of concept 149 | ppl,people 150 | ppt,Powerpoint 151 | pptx,Powerpoint 152 | pr,public relations 153 | prev,previous 154 | prod,production 155 | promo,promotion 156 | pte,part time employee 157 | pto,paid time off 158 | pwd,password 159 | qa,quality assurance 160 | qtr,quarter 161 | qtrly,quarterly 162 | r&d,research and development 163 | re,referring to 164 | repo,repository 165 | repos,repositories 166 | req,requirement 167 | reqs,requirements 168 | roa,return on assets 169 | roe,return on equity 170 | roi,return on investment 171 | saas,software as a service 172 | sat,Saturday 173 | sde,software development engineer 174 | sep,September 175 | sept,September 176 | sorta,sort of 177 | swe,software engineer 178 | thu,Thursday 179 | thur,Thursday 180 | thurs,Thursday 181 | thx,thanks 182 | tmw,tomorrow 183 | tue,Tuesday 184 | tues,Tuesday 185 | txt,text 186 | ui,user interface 187 | ux,user experience 188 | vs,versus 189 | wed,Wednesday 190 | wfh,work from home 191 | wrt,with respect to 192 | ww,worldwide 193 | xls,Excel 194 | xlsx,Excel 195 | yr,year 196 | yrs,years 197 | ytd,year to date 198 | -------------------------------------------------------------------------------- 
/text_preprocessing/data/ignore_spellcheck_words.txt: -------------------------------------------------------------------------------- 1 | 2factor 2 | 2way 3 | aapl 4 | acct 5 | accts 6 | admins 7 | admns 8 | ai 9 | amzn 10 | anniv 11 | api 12 | approx 13 | appt 14 | apr 15 | asap 16 | async 17 | aug 18 | auth 19 | authen 20 | auths 21 | avg 22 | aws 23 | b2b 24 | b2c 25 | batt 26 | bc 27 | biz 28 | bldg 29 | btw 30 | cdo 31 | ceo 32 | cfo 33 | cio 34 | cmo 35 | cms 36 | cob 37 | combo 38 | comm 39 | comms 40 | compat 41 | config 42 | configs 43 | coo 44 | crm 45 | css 46 | cta 47 | cto 48 | ctr 49 | da 50 | db 51 | de 52 | dec 53 | demos 54 | dept 55 | devs 56 | dissat 57 | distro 58 | doc 59 | docs 60 | droid 61 | ds 62 | dunno 63 | edu 64 | eg 65 | eod 66 | eow 67 | eps 68 | esp 69 | eta 70 | exec 71 | execs 72 | exp 73 | f2f 74 | face2face 75 | fb 76 | feb 77 | fifo 78 | fri 79 | fte 80 | ftes 81 | fyi 82 | gen 83 | goog 84 | gov 85 | govt 86 | gui 87 | hq 88 | hr 89 | hrs 90 | iaas 91 | im 92 | imho 93 | imo 94 | incl 95 | info 96 | infra 97 | intro 98 | ipo 99 | isp 100 | jan 101 | jul 102 | jun 103 | kinda 104 | kpi 105 | kpis 106 | legit 107 | lifo 108 | mar 109 | mfa 110 | mgt 111 | mgmt 112 | mgr 113 | mgrs 114 | mic 115 | mics 116 | mins 117 | mkt 118 | ml 119 | mon 120 | msft 121 | msg 122 | msgs 123 | mtg 124 | mtng 125 | mtgs 126 | mtngs 127 | neo 128 | nflx 129 | nov 130 | o365 131 | oct 132 | oof 133 | ooo 134 | org 135 | orgs 136 | paas 137 | pbi 138 | pbix 139 | pc 140 | pcard 141 | pcs 142 | pic 143 | pics 144 | pkg 145 | pls 146 | plz 147 | pm 148 | poc 149 | ppl 150 | ppt 151 | pptx 152 | pr 153 | prev 154 | prod 155 | promo 156 | pte 157 | pto 158 | pwd 159 | qa 160 | qtr 161 | qtrly 162 | r&d 163 | re 164 | repo 165 | repos 166 | req 167 | reqs 168 | roa 169 | roe 170 | roi 171 | saas 172 | sat 173 | sde 174 | sep 175 | sept 176 | sorta 177 | swe 178 | thu 179 | thur 180 | thurs 181 | thx 182 | tmw 183 | tue 184 | tues 185 | txt 186 | ui 187 
def _short_circuit_on_empty_input(func, arg_name, make_default):
    """Wrap *func* so that a ``None`` or zero-length input returns a default.

    The input is taken from the keyword argument *arg_name* when present,
    otherwise from the first positional argument.

    Args:
        func: The function to wrap.
        arg_name: Keyword name under which the primary input may be passed.
        make_default: Zero-argument callable producing the value returned for
            ``None``/empty input. It is called on every short-circuit so a
            mutable default (e.g. a list) is never shared between calls.

    Returns:
        The wrapper, carrying *func*'s metadata via ``functools.wraps``.

    Raises:
        IndexError: Raised by the wrapper when it is called with neither the
            keyword argument nor any positional argument.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        if arg_name in kwargs:
            value = kwargs[arg_name]
        else:
            try:
                value = args[0]
            except IndexError:
                LOGGER.exception('No appropriate positional argument is provided.')
                raise  # bare raise keeps the original traceback
        if value is None or len(value) == 0:
            return make_default()
        return func(*args, **kwargs)
    return wrapper


def _return_empty_string_for_invalid_input(func):
    """ Return empty string if the input (``input_text``) is None or empty """
    return _short_circuit_on_empty_input(func, 'input_text', str)


def _return_empty_list_for_invalid_input(func):
    """ Return empty list if the input (``input_text_or_list``) is None or empty """
    return _short_circuit_on_empty_input(func, 'input_text_or_list', list)


@_return_empty_string_for_invalid_input
def to_lower(input_text: str) -> str:
    """ Convert input text to lower case """
    return input_text.lower()


@_return_empty_string_for_invalid_input
def to_upper(input_text: str) -> str:
    """ Convert input text to upper case """
    return input_text.upper()


@_return_empty_string_for_invalid_input
def remove_number(input_text: str) -> str:
    """ Remove every run of digits from the input text """
    # Raw string: '\d' in a normal literal is an invalid escape sequence.
    return re.sub(r'\d+', '', input_text)


@_return_empty_string_for_invalid_input
def remove_itemized_bullet_and_numbering(input_text: str) -> str:
    """
    Remove bullets or numbering in itemized input.

    A marker is a single alphanumeric character or a run of roman-numeral
    letters (``ivxIVX``), preceded by ``(`` or whitespace and followed by
    ``.`` or ``)`` plus whitespace. Each marker is replaced by one space.
    """
    marker_pattern = r'[(\s][0-9a-zA-Z][.)]\s+|[(\s][ivxIVX]+[.)]\s+'
    return re.sub(marker_pattern, ' ', input_text)


@_return_empty_string_for_invalid_input
def remove_url(input_text: str) -> str:
    """ Remove every non-whitespace run that starts with ``www`` or ``http`` """
    return re.sub(r'(www|http)\S+', '', input_text)


@_return_empty_string_for_invalid_input
def remove_punctuation(input_text: str, punctuations: Optional[str] = None) -> str:
    """
    Remove all punctuation characters from a string.

    :param input_text: text to clean
    :param punctuations: characters to delete; defaults to ``string.punctuation``
    :return: the text with every listed character removed
    """
    if punctuations is None:
        punctuations = string.punctuation
    return input_text.translate(str.maketrans('', '', punctuations))


@_return_empty_string_for_invalid_input
def remove_special_character(input_text: str, special_characters: Optional[str] = None) -> str:
    """
    Remove special characters.

    :param input_text: text to clean
    :param special_characters: characters to delete; defaults to a built-in set
    :return: the text with every listed character removed
    """
    if special_characters is None:
        # TODO: add more special characters
        special_characters = '弫¥ª°©ð±§µæ¹¢³¿®ä£'
    return input_text.translate(str.maketrans('', '', special_characters))


@_return_empty_string_for_invalid_input
def keep_alpha_numeric(input_text: str) -> str:
    """ Remove any character for which ``str.isalnum()`` is false """
    return ''.join(c for c in input_text if c.isalnum())


@_return_empty_string_for_invalid_input
def remove_whitespace(input_text: str, remove_duplicate_whitespace: bool = True) -> str:
    """
    Remove leading, trailing, and (optionally) duplicated whitespace.

    :param input_text: text to clean
    :param remove_duplicate_whitespace: when True, every internal run of
        whitespace is collapsed to a single space; when False the text is
        only stripped
    """
    stripped = input_text.strip()
    if remove_duplicate_whitespace:
        return ' '.join(re.split(r'\s+', stripped, flags=re.UNICODE))
    return stripped


@_return_empty_string_for_invalid_input
def expand_contraction(input_text: str) -> str:
    """ Expand contractions in input text (delegates to ``contractions.fix``) """
    return contractions.fix(input_text)


@_return_empty_string_for_invalid_input
def normalize_unicode(input_text: str) -> str:
    """
    Normalize unicode data to remove umlauts, accents, etc.

    The text is NFKD-decomposed, then encoded to ASCII with errors ignored,
    which drops combining marks and any other non-ASCII character.
    """
    ascii_bytes = normalize('NFKD', input_text).encode('ASCII', 'ignore')
    # The bytes are pure ASCII by construction, so decode them as ASCII.
    return ascii_bytes.decode('ascii')


@_return_empty_list_for_invalid_input
def remove_stopword(input_text_or_list: Union[str, List[str]], stop_words: Optional[set] = None) -> List[str]:
    """
    Remove stop words.

    :param input_text_or_list: a string (tokenized with ``word_tokenize``) or
        an already tokenized list
    :param stop_words: words to drop; defaults to NLTK's English stop words.
        A list or tuple is also accepted and converted to a set
    :return: the remaining tokens; for list input, ``None`` and empty tokens
        are dropped as well
    """
    if stop_words is None:
        stop_words = set(stopwords.words('english'))
    elif isinstance(stop_words, (list, tuple)):
        # Convert once so the membership tests below are O(1).
        stop_words = set(stop_words)
    if isinstance(input_text_or_list, str):
        return [token for token in word_tokenize(input_text_or_list) if token not in stop_words]
    return [token for token in input_text_or_list
            if token is not None and len(token) > 0 and token not in stop_words]


@_return_empty_string_for_invalid_input
def remove_email(input_text: str) -> str:
    """
    Remove email addresses in the input text.

    Matching is case-insensitive, so ``John.Doe@Example.COM`` is removed just
    like ``john.doe@example.com``.
    """
    email_pattern = r'[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}'
    return re.sub(email_pattern, '', input_text, flags=re.IGNORECASE)
180 | return re.sub(regex_pattern, '', input_text) 181 | 182 | 183 | @_return_empty_string_for_invalid_input 184 | def remove_ssn(input_text: str) -> str: 185 | """ Remove social security number in the input text """ 186 | regex_pattern = '(?!219-09-9999|078-05-1120)(?!666|000|9\d{2})\d{3}-(?!00)\d{2}-(?!0{4})\d{4}|(' \ 187 | '?!219099999|078051120)(?!666|000|9\d{2})\d{3}(?!00)\d{2}(?!0{4})\d{4}' 188 | return re.sub(regex_pattern, '', input_text) 189 | 190 | 191 | @_return_empty_string_for_invalid_input 192 | def remove_credit_card_number(input_text: str) -> str: 193 | """ Remove credit card number in the input text """ 194 | regex_pattern = '(4[0-9]{12}(?:[0-9]{3})?|(?:5[1-5][0-9]{2}|222[1-9]|22[3-9][0-9]|2[3-6][0-9]{2}|27[01][' \ 195 | '0-9]|2720)[0-9]{12}|3[47][0-9]{13}|3(?:0[0-5]|[68][0-9])[0-9]{11}|6(?:011|5[0-9]{2})[0-9]{12}|(' \ 196 | '?:2131|1800|35\d{3})\d{11})' 197 | return re.sub(regex_pattern, '', input_text) 198 | 199 | 200 | @_return_empty_list_for_invalid_input 201 | def remove_name(input_text_or_list: Union[str, List[str]]) -> List[str]: 202 | """ Remove name in the input text """ 203 | name_searcher = NameDataset() 204 | if isinstance(input_text_or_list, str): 205 | tokens = word_tokenize(input_text_or_list) 206 | processed_tokens = [token for token in tokens 207 | if (not name_searcher.search_first_name(token)) and 208 | (not name_searcher.search_last_name(token))] 209 | else: 210 | processed_tokens = [token for token in input_text_or_list 211 | if (not name_searcher.search_first_name(token)) and 212 | (not name_searcher.search_last_name(token)) and token is not None and len(token) > 0] 213 | return processed_tokens 214 | 215 | 216 | def check_spelling(input_text_or_list: Union[str, List[str]], lang='en', 217 | ignore_word_file_path: Union[str, Path] = _IGNORE_SPELLCHECK_WORD_FILE_PATH) -> str: 218 | """ Check and correct spellings of the text list """ 219 | if input_text_or_list is None or len(input_text_or_list) == 0: 220 | return '' 221 | 
spelling_checker = SpellChecker(language=lang, distance=1) 222 | # TODO: add acronyms into spell checker to ignore auto correction specified by _IGNORE_SPELLCHECK_WORD_FILE_PATH 223 | spelling_checker.word_frequency.load_text_file(ignore_word_file_path) 224 | if isinstance(input_text_or_list, str): 225 | if not input_text_or_list.islower(): 226 | input_text_or_list = input_text_or_list.lower() 227 | tokens = word_tokenize(input_text_or_list) 228 | else: 229 | tokens = [token.lower() for token in input_text_or_list if token is not None and len(token) > 0] 230 | misspelled = spelling_checker.unknown(tokens) 231 | for word in misspelled: 232 | tokens[tokens.index(word)] = spelling_checker.correction(word) 233 | return ' '.join(tokens).strip() 234 | 235 | 236 | def tokenize_word(input_text: str) -> List[str]: 237 | """ Converts a text into a list of word tokens """ 238 | if input_text is None or len(input_text) == 0: 239 | return [] 240 | return word_tokenize(input_text) 241 | 242 | 243 | def tokenize_sentence(input_text: str) -> List[str]: 244 | """ Converts a text into a list of sentence tokens """ 245 | if input_text is None or len(input_text) == 0: 246 | return [] 247 | tokenizer = PunktSentenceTokenizer() 248 | return tokenizer.tokenize(input_text) 249 | 250 | 251 | @_return_empty_list_for_invalid_input 252 | def stem_word(input_text_or_list: Union[str, List[str]], 253 | stemmer: Optional[Union[PorterStemmer, SnowballStemmer, LancasterStemmer]] = None 254 | ) -> List[str]: 255 | """ Stem each token in a text """ 256 | if stemmer is None: 257 | stemmer = PorterStemmer() 258 | if isinstance(input_text_or_list, str): 259 | tokens = word_tokenize(input_text_or_list) 260 | processed_tokens = [stemmer.stem(token) for token in tokens] 261 | else: 262 | processed_tokens = [stemmer.stem(token) for token in input_text_or_list if token is not None and len(token) > 0] 263 | return processed_tokens 264 | 265 | 266 | @_return_empty_list_for_invalid_input 267 | def 
lemmatize_word(input_text_or_list: Union[str, List[str]], 268 | lemmatizer: Optional[WordNetLemmatizer] = None 269 | ) -> List[str]: 270 | """ Lemmatize each token in a text by finding its base form """ 271 | if lemmatizer is None: 272 | lemmatizer = WordNetLemmatizer() 273 | if isinstance(input_text_or_list, str): 274 | tokens = word_tokenize(input_text_or_list) 275 | processed_tokens = [lemmatizer.lemmatize(token) for token in tokens] 276 | else: 277 | processed_tokens = [lemmatizer.lemmatize(token) 278 | for token in input_text_or_list if token is not None and len(token) > 0] 279 | return processed_tokens 280 | 281 | 282 | def substitute_token(token_list: List[str], sub_dict: Optional[dict] = None) -> List[str]: 283 | """ Substitute each token by another token, e.g., 'vs' -> 'versus' """ 284 | # TODO: add more custom substitutions in the csv file specified by _CUSTOM_SUB_CSV_FILE_PATH 285 | if token_list is None or len(token_list) == 0: 286 | return [] 287 | if sub_dict is None: 288 | with open(_CUSTOM_SUB_CSV_FILE_PATH, 'r') as f: 289 | csv_file = csv.reader(f) 290 | sub_dict = dict(csv_file) 291 | processed_tokens = list() 292 | for token in token_list: 293 | if token in sub_dict: 294 | processed_tokens.append(sub_dict[token]) 295 | else: 296 | processed_tokens.append(token) 297 | return processed_tokens 298 | 299 | 300 | def preprocess_text(input_text: str, processing_function_list: Optional[List[Callable]] = None) -> str: 301 | """ Preprocess an input text by executing a series of preprocessing functions specified in functions list """ 302 | if processing_function_list is None: 303 | processing_function_list = [to_lower, 304 | remove_url, 305 | remove_email, 306 | remove_phone_number, 307 | remove_itemized_bullet_and_numbering, 308 | expand_contraction, 309 | check_spelling, 310 | remove_special_character, 311 | remove_punctuation, 312 | remove_whitespace, 313 | normalize_unicode, 314 | remove_stopword, 315 | remove_name, 316 | substitute_token, 317 | 
lemmatize_word] 318 | for func in processing_function_list: 319 | input_text = func(input_text) 320 | if isinstance(input_text, str): 321 | processed_text = input_text 322 | else: 323 | processed_text = ' '.join(input_text) 324 | return processed_text 325 | 326 | 327 | if __name__ == '__main__': 328 | text_to_process = 'Helllo, I am John Doe!!! My email is john.doe@email.com. Visit our website www.johndoe.com' 329 | preprocessed_text = preprocess_text(text_to_process) 330 | print(preprocessed_text) 331 | 332 | preprocess_functions = [to_lower, remove_email, remove_url, remove_punctuation, lemmatize_word] 333 | preprocessed_text = preprocess_text(text_to_process, preprocess_functions) 334 | print(preprocessed_text) 335 | --------------------------------------------------------------------------------