├── .github
    ├── FUNDING.yml
    └── workflows
    │   ├── publish_to_pypi.yml
    │   └── run_unit_test.yml
├── .gitignore
├── DESCRIPTION.rst
├── LICENSE
├── MANIFEST.in
├── Makefile
├── README.md
├── __init__.py
├── requirements.txt
├── setup.cfg
├── setup.py
├── test_results
    └── .gitignore
├── tests
    ├── __init__.py
    ├── run_tests.py
    └── text_preprocessing_test.py
└── text_preprocessing
    ├── __init__.py
    ├── data
        ├── custom_substitutions.csv
        └── ignore_spellcheck_words.txt
    └── text_preprocessing.py


/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | github: berknology
2 | 


--------------------------------------------------------------------------------
/.github/workflows/publish_to_pypi.yml:
--------------------------------------------------------------------------------
 1 | # This workflow will upload a Python Package using Twine when a release is created
 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
 3 | 
 4 | name: Release
 5 | 
 6 | on:
 7 |   release:
 8 |     types: [created]
 9 | 
10 | jobs:
11 |   deploy:
12 | 
13 |     runs-on: ubuntu-latest
14 | 
15 |     steps:
16 |     - uses: actions/checkout@v2
17 |     - name: Set up Python
18 |       uses: actions/setup-python@v1
19 |       with:
20 |         python-version: '3.x'
21 |     - name: Install dependencies
22 |       run: |
23 |         python -m pip install --upgrade pip
24 |         pip install setuptools wheel twine
25 |     - name: Build and publish
26 |       env:
27 |         TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
28 |         TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
29 |       run: |
30 |         python setup.py sdist bdist_wheel
31 |         twine upload dist/*
32 | 


--------------------------------------------------------------------------------
/.github/workflows/run_unit_test.yml:
--------------------------------------------------------------------------------
 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python
 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
 3 | 
 4 | name: Build
 5 | 
 6 | on:
 7 |   push:
 8 |     branches: [ master ]
 9 |   pull_request:
10 |     branches: [ master ]
11 | 
12 | jobs:
13 |   build:
14 | 
15 |     runs-on: ubuntu-latest
16 | 
17 |     steps:
18 |     - uses: actions/checkout@v2
19 |     - name: Set up Python 3.8
20 |       uses: actions/setup-python@v1
21 |       with:
22 |         python-version: '3.8'  # quoted so YAML keeps the version a string, not a float
23 |     - name: Install dependencies
24 |       run: |
25 |         python -m pip install --upgrade pip
26 |         pip install flake8
27 |         if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
28 |     - name: Lint with flake8
29 |       run: |
30 |         # stop the build if there are Python syntax errors or undefined names
31 |         flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
32 |         # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
33 |         flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
34 |     - name: Test with unittest
35 |       run: |
36 |         make test
37 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | 
 6 | # C extensions
 7 | *.so
 8 | 
 9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | 
27 | # PyInstaller
28 | #  Usually these files are written by a python script from a template
29 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 | 
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 | 
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 | cover/
48 | 
49 | # Translations
50 | *.mo
51 | *.pot
52 | 
53 | # Sphinx documentation
54 | docs/_build/
55 | 
56 | # PyBuilder
57 | target/
58 | 
59 | # PyCharm
60 | .idea
61 | *.iml
62 | pycharm
63 | 
64 | # Gradle
65 | .gradle
66 | 
67 | # docker builds
68 | docker.properties
69 | 
70 | # tests
71 | test_results/*
72 | !test_results/.gitignore
73 | 
74 | # ipython
75 | .ipynb_checkpoints/
76 | 
77 | # VSCode
78 | .classpath
79 | .project
80 | .settings/
81 | .vscode/
82 | 
83 | # virtual environment
84 | venv/


--------------------------------------------------------------------------------
/DESCRIPTION.rst:
--------------------------------------------------------------------------------
 1 | ==================================================
 2 | Text preprocessing for Natural Language Processing
 3 | ==================================================
 4 | 
 5 | A Python package for text preprocessing tasks in natural language processing.
 6 | 
 7 | Usage
 8 | -----
 9 | To use this text preprocessing package, first install it using pip:
10 | 
11 | .. code-block:: bash
12 | 
13 |     pip install text-preprocessing
14 | 
15 | 
16 | Then, import the package in your python script and call appropriate functions:
17 | 
18 | .. code-block:: python
19 | 
20 |     from text_preprocessing import preprocess_text
21 |     from text_preprocessing import to_lower, remove_email, remove_url, remove_punctuation, lemmatize_word
22 | 
23 |     # Preprocess text using default preprocess functions in the pipeline
24 |     text_to_process = 'Helllo, I am John Doe!!! My email is john.doe@email.com. Visit our website www.johndoe.com'
25 |     preprocessed_text = preprocess_text(text_to_process)
26 |     print(preprocessed_text)
27 |     # output: hello email visit website
28 | 
29 |     # Preprocess text using custom preprocess functions in the pipeline
30 |     preprocess_functions = [to_lower, remove_email, remove_url, remove_punctuation, lemmatize_word]
31 |     preprocessed_text = preprocess_text(text_to_process, preprocess_functions)
32 |     print(preprocessed_text)
33 |     # output: helllo i am john doe my email is visit our website
34 | 
35 | 
36 | Features
37 | --------
38 | 
39 | .. csv-table::
40 |    :header: "Feature", "Function"
41 |    :widths: 50, 35
42 | 
43 |     "convert to lower case", "to_lower"
44 |     "convert to upper case", "to_upper"
45 |     "keep only alphabetic and numerical characters", "keep_alpha_numeric"
46 |     "check and correct spellings", "check_spelling"
47 |     "expand contractions", "expand_contraction"
48 |     "remove URLs", "remove_url"
49 |     "remove names", "remove_name"
50 |     "remove emails", "remove_email"
51 |     "remove phone numbers", "remove_phone_number"
52 |     "remove SSNs", "remove_ssn"
53 |     "remove credit card numbers", "remove_credit_card_number"
54 |     "remove numbers", "remove_number"
55 |     "remove bullets and numbering", "remove_itemized_bullet_and_numbering"
56 |     "remove special characters", "remove_special_character"
57 |     "remove punctuations", "remove_punctuation"
58 |     "remove extra whitespace", "remove_whitespace"
59 |     "normalize unicode (e.g., café -> cafe)", "normalize_unicode"
60 |     "remove stop words", "remove_stopword"
61 |     "tokenize words", "tokenize_word"
62 |     "tokenize sentences", "tokenize_sentence"
63 |     "substitute custom words (e.g., vs -> versus)", "substitute_token"
64 |     "stem words", "stem_word"
65 |     "lemmatize words", "lemmatize_word"
66 |     "preprocess text through a sequence of preprocessing functions", "preprocess_text"


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | BSD 3-Clause License
 2 | 
 3 | Copyright (c) 2020, He Hao
 4 | All rights reserved.
 5 | 
 6 | Redistribution and use in source and binary forms, with or without
 7 | modification, are permitted provided that the following conditions are met:
 8 | 
 9 | 1. Redistributions of source code must retain the above copyright notice, this
10 |    list of conditions and the following disclaimer.
11 | 
12 | 2. Redistributions in binary form must reproduce the above copyright notice,
13 |    this list of conditions and the following disclaimer in the documentation
14 |    and/or other materials provided with the distribution.
15 | 
16 | 3. Neither the name of the copyright holder nor the names of its
17 |    contributors may be used to endorse or promote products derived from
18 |    this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include text_preprocessing/data/custom_substitutions.csv
2 | include text_preprocessing/data/ignore_spellcheck_words.txt
3 | 
4 | recursive-include tests *


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | .PHONY: help clean clean-build clean-pyc clean-test lint coverage docs servedocs dist release install bump-patch bump-minor bump-major test
 2 | define BROWSER_PYSCRIPT
 3 | import os, webbrowser, sys
 4 | try:
 5 | 	from urllib import pathname2url
 6 | except ImportError:  # Python 3 keeps pathname2url in urllib.request
 7 | 	from urllib.request import pathname2url
 8 | 
 9 | try:
10 | 	import bump2version
11 | except ImportError:  # only warn; the browser helper still runs
12 | 	print("Please install library bump2version by 'pip install bump2version'")
13 | 
14 | webbrowser.open("file://" + pathname2url(os.path.abspath(sys.argv[1])))
15 | endef
16 | export BROWSER_PYSCRIPT
17 | BROWSER := python -c "$$BROWSER_PYSCRIPT"
18 | 
19 | help:
20 | 	@echo "clean - remove all build, test, coverage and Python artifacts"
21 | 	@echo "clean-build - remove build artifacts"
22 | 	@echo "clean-pyc - remove Python file artifacts"
23 | 	@echo "clean-test - remove test and coverage artifacts"
24 | 	@echo "lint - check style with flake8"
25 | 	@echo "coverage - check code coverage quickly with the default Python"
26 | 	@echo "docs - generate Sphinx HTML documentation, including API docs"
27 | 	@echo "release - package and upload a release"
28 | 	@echo "dist - build python package"
29 | 	@echo "install - install the package to the active Python's site-packages"
30 | 	@echo "bump-patch - use bump2version to bump patch version"
31 | 	@echo "bump-minor - use bump2version to bump minor version"
32 | 	@echo "bump-major - use bump2version to bump major version"
33 | 	@echo "test - run unit test"
34 | 
35 | clean: clean-build clean-pyc clean-test
36 | 
37 | clean-build:
38 | 	rm -fr build/
39 | 	rm -fr dist/
40 | 	rm -fr .eggs/
41 | 	find . -name '*.egg-info' -exec rm -fr {} +
42 | 	find . -name '*.egg' -exec rm -f {} +
43 | 
44 | clean-pyc:
45 | 	find . -name '*.pyc' -exec rm -f {} +
46 | 	find . -name '*.pyo' -exec rm -f {} +
47 | 	find . -name '*~' -exec rm -f {} +
48 | 	find . -name '__pycache__' -exec rm -fr {} +
49 | 
50 | clean-test:
51 | 	rm -fr .tox/
52 | 	rm -f .coverage
53 | 	rm -fr htmlcov/
54 | 
55 | lint:
56 | 	flake8 text_preprocessing tests
57 | 
58 | coverage:
59 | 	coverage run --source text_preprocessing setup.py test
60 | 	coverage report -m
61 | 	coverage html
62 | 	$(BROWSER) htmlcov/index.html
63 | 
64 | docs:
65 | 	rm -f docs/text_preprocessing.rst
66 | 	rm -f docs/modules.rst
67 | 	sphinx-apidoc -o docs/ text_preprocessing
68 | 	$(MAKE) -C docs clean
69 | 	$(MAKE) -C docs html
70 | 	$(BROWSER) docs/_build/html/index.html
71 | 
72 | servedocs: docs
73 | 	watchmedo shell-command -p '*.rst' -c '$(MAKE) -C docs html' -R -D .
74 | 
75 | dist:
76 | 	python setup.py bdist_wheel
77 | 	ls -l dist
78 | 
79 | release: dist
80 | 	twine upload dist/*
81 | 
82 | install: clean
83 | 	python setup.py install
84 | 
85 | bump-patch:
86 | 	bump2version patch
87 | 
88 | bump-minor:
89 | 	bump2version minor
90 | 
91 | bump-major:
92 | 	bump2version major
93 | 
94 | test:
95 | 	python -m tests.run_tests


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | Text preprocessing for Natural Language Processing
 2 | =============
 3 | 
 4 | ![Build](https://github.com/berknology/text-preprocessing/workflows/Build/badge.svg)
 5 | ![Release](https://github.com/berknology/text-preprocessing/workflows/Release/badge.svg)
 6 | ![PyPi](https://img.shields.io/pypi/v/text-preprocessing.svg)
 7 | 
 8 | 
 9 | A Python package for text preprocessing tasks in natural language processing.
10 | 
11 | Usage
12 | --------
13 | To use this text preprocessing package, first install it using pip:
14 | ```bash
15 | pip install text-preprocessing
16 | ```
17 | 
18 | Then, import the package in your python script and call appropriate functions:
19 | 
20 | ```python
21 | from text_preprocessing import preprocess_text
22 | from text_preprocessing import to_lower, remove_email, remove_url, remove_punctuation, lemmatize_word
23 | 
24 | # Preprocess text using default preprocess functions in the pipeline 
25 | text_to_process = 'Helllo, I am John Doe!!! My email is john.doe@email.com. Visit our website www.johndoe.com'
26 | preprocessed_text = preprocess_text(text_to_process)
27 | print(preprocessed_text)
28 | # output: hello email visit website
29 | 
30 | # Preprocess text using custom preprocess functions in the pipeline 
31 | preprocess_functions = [to_lower, remove_email, remove_url, remove_punctuation, lemmatize_word]
32 | preprocessed_text = preprocess_text(text_to_process, preprocess_functions)
33 | print(preprocessed_text)
34 | # output: helllo i am john doe my email is visit our website
35 | ```
36 | 
37 | If you have a lot of data to preprocess and would like to run text preprocessing in parallel in PySpark on
38 | Databricks, please use the following UDF:
39 | ```python
40 | from text_preprocessing import preprocess_text
41 | from pyspark.sql.functions import udf
42 | from pyspark.sql.types import StringType
43 | from pyspark.sql import DataFrame as SparkDataFrame
44 | 
45 | 
46 | def preprocess_text_spark(df: SparkDataFrame, 
47 |                           target_column: str, 
48 |                           preprocessed_column_name: str = 'preprocessed_text'
49 |                          ) -> SparkDataFrame:
50 |     """ Preprocess text in a column of a PySpark DataFrame by leveraging PySpark UDF to preprocess text in parallel """
51 |     _preprocess_text = udf(preprocess_text, StringType())
52 |     new_df = df.withColumn(preprocessed_column_name, _preprocess_text(df[target_column]))
53 |     return new_df
54 | ```
55 | 
56 | Features
57 | --------
58 | 
59 | | Feature                                                       | Function                              |
60 | | :------------------------------------------------------------ |:------------------------------------- |
61 | | convert to lower case                                         | to_lower                              |
62 | | convert to upper case                                         | to_upper                              |
63 | | keep only alphabetic and numerical characters                 | keep_alpha_numeric                    |
64 | | check and correct spellings                                   | check_spelling                        |
65 | | expand contractions                                           | expand_contraction                    |
66 | | remove URLs                                                   | remove_url                            |
67 | | remove names                                                  | remove_name                           |
68 | | remove emails                                                 | remove_email                          |
69 | | remove phone numbers                                          | remove_phone_number                   |
70 | | remove SSNs                                                   | remove_ssn                            |
71 | | remove credit card numbers                                    | remove_credit_card_number             |
72 | | remove numbers                                                | remove_number                         |
73 | | remove bullets and numbering                                  | remove_itemized_bullet_and_numbering  |
74 | | remove special characters                                     | remove_special_character              |
75 | | remove punctuations                                           | remove_punctuation                    |
76 | | remove extra whitespace                                       | remove_whitespace                     |
77 | | normalize unicode (e.g., café -> cafe)                        | normalize_unicode                     |
78 | | remove stop words                                             | remove_stopword                       |
79 | | tokenize words                                                | tokenize_word                         |
80 | | tokenize sentences                                            | tokenize_sentence                     |
81 | | substitute custom words (e.g., vs -> versus)                  | substitute_token                      |
82 | | stem words                                                    | stem_word                             |
83 | | lemmatize words                                               | lemmatize_word                        |
84 | | preprocess text through a sequence of preprocessing functions | preprocess_text                       |
85 | 


--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = '0.1.1'


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | nltk
2 | pyspellchecker
3 | contractions
4 | names-dataset==2.1
5 | # For unit test
6 | unittest-xml-reporting


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
 1 | [bumpversion]
 2 | current_version = 0.1.1
 3 | commit = True
 4 | tag = True
 5 | 
 6 | [bumpversion:file:setup.py]
 7 | search = version='{current_version}'
 8 | replace = version='{new_version}'
 9 | 
10 | [bumpversion:file:__init__.py]
11 | search = __version__ = '{current_version}'
12 | replace = __version__ = '{new_version}'
13 | 
14 | [bumpversion:file:text_preprocessing/__init__.py]
15 | search = __version__ = '{current_version}'
16 | replace = __version__ = '{new_version}'
17 | 
18 | [bdist_wheel]
19 | universal = 1
20 | 
21 | [flake8]
22 | exclude = docs
23 | 
24 | [aliases]
25 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, find_packages
 2 | 
 3 | 
 4 | def parse_requirements(fn):
 5 |     with open(fn) as f:
 6 |         return [req for req in f.read().strip().split('\n') if "#" not in req]
 7 | 
 8 | 
 9 | parsed_requirements = parse_requirements(
10 |     'requirements.txt',
11 | )
12 | 
13 | parsed_test_requirements = parse_requirements(
14 |     'requirements.txt',
15 | )
16 | 
17 | requirements = [str(ir) for ir in parsed_requirements]
18 | test_requirements = [str(tr) for tr in parsed_test_requirements]
19 | 
20 | 
21 | with open('DESCRIPTION.rst') as description_file:
22 |     description = description_file.read()
23 | 
24 | 
25 | setup(
26 |     name='text_preprocessing',
27 |     version='0.1.1',
28 |     description="A python package for text preprocessing task in natural language processing",
29 |     long_description=description,
30 |     url='https://github.com/berknology/text-preprocessing',
31 |     license="BSD license",
32 |     author="He Hao",
33 |     author_email='berknology@gmail.com',
34 |     packages=find_packages(include=['text_preprocessing', 'text_preprocessing.*']),
35 |     include_package_data=True,
36 |     install_requires=requirements,
37 |     zip_safe=False,
38 |     keywords='NLP',
39 |     classifiers=[
40 |         'Development Status :: 2 - Pre-Alpha',
41 |         'Intended Audience :: Developers',
42 |         'License :: OSI Approved :: BSD License',
43 |         'Natural Language :: English',
44 |         'Programming Language :: Python :: 3.7',
45 |         'Programming Language :: Python :: 3.8',
46 |     ],
47 |     test_suite='tests',
48 |     tests_require=test_requirements
49 | )
50 | 
51 | 


--------------------------------------------------------------------------------
/test_results/.gitignore:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/berknology/text-preprocessing/59351e5f4adc510b7063faee0376bc194790a82c/test_results/.gitignore


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/tests/run_tests.py:
--------------------------------------------------------------------------------
 1 | # Standard libraries
 2 | import unittest
 3 | 
 4 | # Third party libraries
 5 | import xmlrunner
 6 | 
 7 | 
 8 | if __name__ == '__main__':
 9 |     # Run unittest discovery for files matching "*test*.py", using XMLTestRunner with output="test_results"
10 |     unittest.main(module=None,
11 |                   testRunner=xmlrunner.XMLTestRunner(output="test_results"),
12 |                   failfast=False,
13 |                   buffer=False,
14 |                   catchbreak=False,
15 |                   argv=["", "discover", "-p", "*test*.py"]
16 |                   )
17 | 


--------------------------------------------------------------------------------
/tests/text_preprocessing_test.py:
--------------------------------------------------------------------------------
  1 | # Standard libraries
  2 | from unittest import TestCase
  3 | from unittest.mock import patch, MagicMock
  4 | 
  5 | # Project code
  6 | from text_preprocessing import (to_lower, to_upper, remove_number, remove_url, remove_punctuation,
  7 |                                 remove_special_character, keep_alpha_numeric, remove_whitespace, expand_contraction,
  8 |                                 normalize_unicode, remove_stopword, remove_email, remove_phone_number, remove_ssn,
  9 |                                 remove_credit_card_number, remove_name, check_spelling, substitute_token,
 10 |                                 remove_itemized_bullet_and_numbering)
 11 | from text_preprocessing import preprocess_text
 12 | 
 13 | 
 14 | class TestTextPreprocessing(TestCase):
 15 | 
 16 |     def test_to_lower(self):
 17 |         # Setup
 18 |         input_text = 'HellO'
 19 |         expected_output = 'hello'
 20 |         # Actual call
 21 |         output_text = to_lower(input_text)
 22 |         # Asserts
 23 |         self.assertEqual(output_text, expected_output)
 24 | 
 25 |     def test_to_lower_lower_input(self):
 26 |         # Setup
 27 |         input_text = 'hello'
 28 |         expected_output = 'hello'
 29 |         # Actual call
 30 |         output_text = to_lower(input_text)
 31 |         # Asserts
 32 |         self.assertEqual(output_text, expected_output)
 33 | 
 34 |     def test_to_lower_upper_input(self):
 35 |         # Setup
 36 |         input_text = 'HELLO'
 37 |         expected_output = 'hello'
 38 |         # Actual call
 39 |         output_text = to_lower(input_text)
 40 |         # Asserts
 41 |         self.assertEqual(output_text, expected_output)
 42 | 
 43 |     def test_to_lower_none(self):
 44 |         # Setup
 45 |         input_text = None
 46 |         expected_output = ''
 47 |         # Actual call
 48 |         output_text = to_lower(input_text)
 49 |         # Asserts
 50 |         self.assertEqual(output_text, expected_output)
 51 | 
 52 |     def test_to_lower_empty_input(self):
 53 |         # Setup
 54 |         input_text = ''
 55 |         expected_output = ''
 56 |         # Actual call
 57 |         output_text = to_lower(input_text)
 58 |         # Asserts
 59 |         self.assertEqual(output_text, expected_output)
 60 | 
 61 |     def test_to_upper(self):
 62 |         # Setup
 63 |         input_text = 'HellO'
 64 |         expected_output = 'HELLO'
 65 |         # Actual call
 66 |         output_text = to_upper(input_text)
 67 |         # Asserts
 68 |         self.assertEqual(output_text, expected_output)
 69 | 
 70 |     def test_to_upper_lower_input(self):
 71 |         # Setup
 72 |         input_text = 'hello'
 73 |         expected_output = 'HELLO'
 74 |         # Actual call
 75 |         output_text = to_upper(input_text)
 76 |         # Asserts
 77 |         self.assertEqual(output_text, expected_output)
 78 | 
 79 |     def test_to_upper_upper_input(self):
 80 |         # Setup
 81 |         input_text = 'HELLO'
 82 |         expected_output = 'HELLO'
 83 |         # Actual call
 84 |         output_text = to_upper(input_text)
 85 |         # Asserts
 86 |         self.assertEqual(output_text, expected_output)
 87 | 
 88 |     def test_to_upper_none(self):
 89 |         # Setup
 90 |         input_text = None
 91 |         expected_output = ''
 92 |         # Actual call
 93 |         output_text = to_upper(input_text)
 94 |         # Asserts
 95 |         self.assertEqual(output_text, expected_output)
 96 | 
 97 |     def test_to_upper_empty_input(self):
 98 |         # Setup
 99 |         input_text = ''
100 |         expected_output = ''
101 |         # Actual call
102 |         output_text = to_upper(input_text)
103 |         # Asserts
104 |         self.assertEqual(output_text, expected_output)
105 | 
106 |     def test_remove_number(self):
107 |         # Setup
108 |         input_text = 'HellO123'
109 |         expected_output = 'HellO'
110 |         # Actual call
111 |         output_text = remove_number(input_text)
112 |         # Asserts
113 |         self.assertEqual(output_text, expected_output)
114 | 
115 |     def test_remove_number_no_number(self):
116 |         # Setup
117 |         input_text = 'HellO!.'
118 |         expected_output = 'HellO!.'
119 |         # Actual call
120 |         output_text = remove_number(input_text)
121 |         # Asserts
122 |         self.assertEqual(output_text, expected_output)
123 | 
124 |     def test_remove_number_all_number(self):
125 |         # Setup
126 |         input_text = '987123'
127 |         expected_output = ''
128 |         # Actual call
129 |         output_text = remove_number(input_text)
130 |         # Asserts
131 |         self.assertEqual(output_text, expected_output)
132 | 
133 |     def test_remove_number_none(self):
134 |         # Setup
135 |         input_text = None
136 |         expected_output = ''
137 |         # Actual call
138 |         output_text = remove_number(input_text)
139 |         # Asserts
140 |         self.assertEqual(output_text, expected_output)
141 | 
142 |     def test_remove_number_empty_input(self):
143 |         # Setup
144 |         input_text = ''
145 |         expected_output = ''
146 |         # Actual call
147 |         output_text = remove_number(input_text)
148 |         # Asserts
149 |         self.assertEqual(output_text, expected_output)
150 | 
151 |     def test_remove_itemized_bullet_and_numbering(self):
152 |         # Setup
153 |         input_text = 'My comments: 1) blah blah, 2. blah blah. III) blah blah; iv) blah blah, (d) blah blah'
154 |         expected_output = 'My comments: blah blah, blah blah. blah blah; blah blah,  blah blah'
155 |         # Actual call
156 |         output_text = remove_itemized_bullet_and_numbering(input_text)
157 |         # Asserts
158 |         self.assertEqual(output_text, expected_output)
159 | 
160 |     def test_remove_itemized_bullet_and_numbering_no_bullet_or_numbering(self):
161 |         # Setup
162 |         input_text = 'hello, this is a test. '
163 |         expected_output = 'hello, this is a test. '
164 |         # Actual call
165 |         output_text = remove_itemized_bullet_and_numbering(input_text)
166 |         # Asserts
167 |         self.assertEqual(output_text, expected_output)
168 | 
169 |     def test_remove_itemized_bullet_and_numbering_all_bullets_and_numberings(self):
170 |         # Setup
171 |         input_text = ' 1) test 2. test. (3) test  a) test (b) test E) test. (F) test. (i) a vx) b IV. c'
172 |         expected_output = ' test test.  test  test  test test.  test.  a b c'
173 |         # Actual call
174 |         output_text = remove_itemized_bullet_and_numbering(input_text)
175 |         # Asserts
176 |         self.assertEqual(output_text, expected_output)
177 | 
178 |     def test_remove_itemized_bullet_and_numbering_none(self):
179 |         # Setup
180 |         input_text = None
181 |         expected_output = ''
182 |         # Actual call
183 |         output_text = remove_itemized_bullet_and_numbering(input_text)
184 |         # Asserts
185 |         self.assertEqual(output_text, expected_output)
186 | 
187 |     def test_remove_itemized_bullet_and_numbering_empty_input(self):
188 |         # Setup
189 |         input_text = ''
190 |         expected_output = ''
191 |         # Actual call
192 |         output_text = remove_itemized_bullet_and_numbering(input_text)
193 |         # Asserts
194 |         self.assertEqual(output_text, expected_output)
195 | 
196 |     def test_remove_url(self):
197 |         # Setup
198 |         input_text = 'my address is www.microsoft.com https://www.microsoft.com'
199 |         expected_output = 'my address is  '
200 |         # Actual call
201 |         output_text = remove_url(input_text)
202 |         # Asserts
203 |         self.assertEqual(output_text, expected_output)
204 | 
205 |     def test_remove_url_no_url(self):
206 |         # Setup
207 |         input_text = 'my address is www.microsoft.com https://www.microsoft.com'
208 |         expected_output = 'my address is  '
209 |         # Actual call
210 |         output_text = remove_url(input_text)
211 |         # Asserts
212 |         self.assertEqual(output_text, expected_output)
213 | 
214 |     def test_remove_url_all_url(self):
215 |         # Setup
216 |         input_text = 'www.microsoft.com https://www.microsoft.com'
217 |         expected_output = ' '
218 |         # Actual call
219 |         output_text = remove_url(input_text)
220 |         # Asserts
221 |         self.assertEqual(output_text, expected_output)
222 | 
223 |     def test_remove_url_none(self):
224 |         # Setup
225 |         input_text = None
226 |         expected_output = ''
227 |         # Actual call
228 |         output_text = remove_url(input_text)
229 |         # Asserts
230 |         self.assertEqual(output_text, expected_output)
231 | 
232 |     def test_remove_url_empty_input(self):
233 |         # Setup
234 |         input_text = ''
235 |         expected_output = ''
236 |         # Actual call
237 |         output_text = remove_url(input_text)
238 |         # Asserts
239 |         self.assertEqual(output_text, expected_output)
240 | 
241 |     def test_remove_punctuation(self):
242 |         # Setup
243 |         input_text = 'Hello!!! Welcome.'
244 |         expected_output = 'Hello Welcome'
245 |         # Actual call
246 |         output_text = remove_punctuation(input_text)
247 |         # Asserts
248 |         self.assertEqual(output_text, expected_output)
249 | 
250 |     def test_remove_punctuation_no_punctuations(self):
251 |         # Setup
252 |         input_text = 'Hello world'
253 |         expected_output = 'Hello world'
254 |         # Actual call
255 |         output_text = remove_punctuation(input_text)
256 |         # Asserts
257 |         self.assertEqual(output_text, expected_output)
258 | 
259 |     def test_remove_punctuation_all_punctuations(self):
260 |         # Setup
261 |         input_text = '!"#$%&\'()*+,-./:;<=>?@[\\]^_{|}~'
262 |         expected_output = ''
263 |         # Actual call
264 |         output_text = remove_punctuation(input_text)
265 |         # Asserts
266 |         self.assertEqual(output_text, expected_output)
267 | 
268 |     def test_remove_punctuation_none(self):
269 |         # Setup
270 |         input_text = None
271 |         expected_output = ''
272 |         # Actual call
273 |         output_text = remove_punctuation(input_text)
274 |         # Asserts
275 |         self.assertEqual(output_text, expected_output)
276 | 
277 |     def test_remove_punctuation_empty_input(self):
278 |         # Setup
279 |         input_text = ''
280 |         expected_output = ''
281 |         # Actual call
282 |         output_text = remove_punctuation(input_text)
283 |         # Asserts
284 |         self.assertEqual(output_text, expected_output)
285 | 
286 |     def test_remove_special_character(self):
287 |         # Setup
288 |         input_text = 'Hello å¼« Welcome.'
289 |         expected_output = 'Hello  Welcome.'
290 |         # Actual call
291 |         output_text = remove_special_character(input_text)
292 |         # Asserts
293 |         self.assertEqual(output_text, expected_output)
294 | 
295 |     def test_remove_special_character_no_special_characters(self):
296 |         # Setup
297 |         input_text = 'Hello world'
298 |         expected_output = 'Hello world'
299 |         # Actual call
300 |         output_text = remove_special_character(input_text)
301 |         # Asserts
302 |         self.assertEqual(output_text, expected_output)
303 | 
304 |     def test_remove_special_character_all_special_characters(self):
305 |         # Setup
306 |         input_text = 'å¼«¥ª°©ð±§µæ¹¢³¿®ä£'
307 |         expected_output = ''
308 |         # Actual call
309 |         output_text = remove_special_character(input_text)
310 |         # Asserts
311 |         self.assertEqual(output_text, expected_output)
312 | 
313 |     def test_remove_special_character_none(self):
314 |         # Setup
315 |         input_text = None
316 |         expected_output = ''
317 |         # Actual call
318 |         output_text = remove_special_character(input_text)
319 |         # Asserts
320 |         self.assertEqual(output_text, expected_output)
321 | 
322 |     def test_remove_special_character_empty_input(self):
323 |         # Setup
324 |         input_text = ''
325 |         expected_output = ''
326 |         # Actual call
327 |         output_text = remove_special_character(input_text)
328 |         # Asserts
329 |         self.assertEqual(output_text, expected_output)
330 | 
331 |     def test_keep_alpha_numeric(self):
332 |         # Setup
333 |         input_text = 'Hello1 å¼«µæ Welcome2.'
334 |         expected_output = 'Hello1å¼µæWelcome2'
335 |         # Actual call
336 |         output_text = keep_alpha_numeric(input_text)
337 |         # Asserts
338 |         self.assertEqual(output_text, expected_output)
339 | 
340 |     def test_keep_alpha_numeric_no_alphanumeric(self):
341 |         # Setup
342 |         input_text = '!.,*&^'
343 |         expected_output = ''
344 |         # Actual call
345 |         output_text = keep_alpha_numeric(input_text)
346 |         # Asserts
347 |         self.assertEqual(output_text, expected_output)
348 | 
349 |     def test_keep_alpha_numeric_none(self):
350 |         # Setup
351 |         input_text = None
352 |         expected_output = ''
353 |         # Actual call
354 |         output_text = keep_alpha_numeric(input_text)
355 |         # Asserts
356 |         self.assertEqual(output_text, expected_output)
357 | 
358 |     def test_keep_alpha_numeric_empty_input(self):
359 |         # Setup
360 |         input_text = ''
361 |         expected_output = ''
362 |         # Actual call
363 |         output_text = keep_alpha_numeric(input_text)
364 |         # Asserts
365 |         self.assertEqual(output_text, expected_output)
366 | 
367 |     def test_remove_whitespace(self):
368 |         # Setup
369 |         input_text = ' Hello  Welcome. '
370 |         expected_output = 'Hello Welcome.'
371 |         # Actual call
372 |         output_text = remove_whitespace(input_text)
373 |         # Asserts
374 |         self.assertEqual(output_text, expected_output)
375 | 
376 |     def test_remove_whitespace_strip(self):
377 |         # Setup
378 |         input_text = ' Hello  Welcome. '
379 |         expected_output = 'Hello  Welcome.'
380 |         # Actual call
381 |         output_text = remove_whitespace(input_text, remove_duplicate_whitespace=False)
382 |         # Asserts
383 |         self.assertEqual(output_text, expected_output)
384 | 
385 |     def test_remove_whitespace_no_whitespace(self):
386 |         # Setup
387 |         input_text = 'Helloworld...'
388 |         expected_output = 'Helloworld...'
389 |         # Actual call
390 |         output_text = remove_whitespace(input_text)
391 |         # Asserts
392 |         self.assertEqual(output_text, expected_output)
393 | 
394 |     def test_remove_whitespace_all_whitespace(self):
395 |         # Setup
396 |         input_text = '   '
397 |         expected_output = ''
398 |         # Actual call
399 |         output_text = remove_whitespace(input_text)
400 |         # Asserts
401 |         self.assertEqual(output_text, expected_output)
402 | 
403 |     def test_remove_whitespace_none(self):
404 |         # Setup
405 |         input_text = None
406 |         expected_output = ''
407 |         # Actual call
408 |         output_text = remove_whitespace(input_text)
409 |         # Asserts
410 |         self.assertEqual(output_text, expected_output)
411 | 
412 |     def test_remove_whitespace_empty_input(self):
413 |         # Setup
414 |         input_text = ''
415 |         expected_output = ''
416 |         # Actual call
417 |         output_text = remove_whitespace(input_text)
418 |         # Asserts
419 |         self.assertEqual(output_text, expected_output)
420 | 
421 |     def test_expand_contraction(self):
422 |         # Setup
423 |         input_text = "This isn't a test"
424 |         expected_output = 'This is not a test'
425 |         # Actual call
426 |         output_text = expand_contraction(input_text)
427 |         # Asserts
428 |         self.assertEqual(output_text, expected_output)
429 | 
430 |     def test_expand_contraction_no_contraction(self):
431 |         # Setup
432 |         input_text = 'Hello world'
433 |         expected_output = 'Hello world'
434 |         # Actual call
435 |         output_text = expand_contraction(input_text)
436 |         # Asserts
437 |         self.assertEqual(output_text, expected_output)
438 | 
439 |     def test_expand_contraction_all_contractions(self):
440 |         # Setup
441 |         input_text = "cannot isn't ain't couldn't"
442 |         expected_output = 'cannot is not are not could not'
443 |         # Actual call
444 |         output_text = expand_contraction(input_text)
445 |         # Asserts
446 |         self.assertEqual(output_text, expected_output)
447 | 
448 |     def test_expand_contraction_none(self):
449 |         # Setup
450 |         input_text = None
451 |         expected_output = ''
452 |         # Actual call
453 |         output_text = expand_contraction(input_text)
454 |         # Asserts
455 |         self.assertEqual(output_text, expected_output)
456 | 
457 |     def test_expand_contraction_empty_input(self):
458 |         # Setup
459 |         input_text = ''
460 |         expected_output = ''
461 |         # Actual call
462 |         output_text = expand_contraction(input_text)
463 |         # Asserts
464 |         self.assertEqual(output_text, expected_output)
465 | 
466 |     def test_normalize_unicode(self):
467 |         # Setup
468 |         input_text = "I love this Café"
469 |         expected_output = 'I love this Cafe'
470 |         # Actual call
471 |         output_text = normalize_unicode(input_text)
472 |         # Asserts
473 |         self.assertEqual(output_text, expected_output)
474 | 
475 |     def test_normalize_unicode_no_special_unicode(self):
476 |         # Setup
477 |         input_text = 'This is a test'
478 |         expected_output = 'This is a test'
479 |         # Actual call
480 |         output_text = normalize_unicode(input_text)
481 |         # Asserts
482 |         self.assertEqual(output_text, expected_output)
483 | 
484 |     def test_normalize_unicode_all_special_unicode(self):
485 |         # Setup
486 |         input_text = 'áñó'
487 |         expected_output = 'ano'
488 |         # Actual call
489 |         output_text = normalize_unicode(input_text)
490 |         # Asserts
491 |         self.assertEqual(output_text, expected_output)
492 | 
493 |     def test_normalize_unicode_none(self):
494 |         # Setup
495 |         input_text = None
496 |         expected_output = ''
497 |         # Actual call
498 |         output_text = normalize_unicode(input_text)
499 |         # Asserts
500 |         self.assertEqual(output_text, expected_output)
501 | 
502 |     def test_normalize_unicode_empty_input(self):
503 |         # Setup
504 |         input_text = ''
505 |         expected_output = ''
506 |         # Actual call
507 |         output_text = normalize_unicode(input_text)
508 |         # Asserts
509 |         self.assertEqual(output_text, expected_output)
510 | 
511 |     def test_remove_stopword(self):
512 |         # Setup
513 |         input_text = "This is a test!"
514 |         expected_output = ['This', 'test', '!']
515 |         # Actual call
516 |         output_text = remove_stopword(input_text)
517 |         # Asserts
518 |         self.assertListEqual(output_text, expected_output)
519 | 
520 |     def test_remove_stopword_no_stopword(self):
521 |         # Setup
522 |         input_text = 'Hello World.'
523 |         expected_output = ['Hello', 'World', '.']
524 |         # Actual call
525 |         output_text = remove_stopword(input_text)
526 |         # Asserts
527 |         self.assertListEqual(output_text, expected_output)
528 | 
529 |     def test_remove_stopword_all_stopwords(self):
530 |         # Setup
531 |         input_text = 'the a your my his her'
532 |         expected_output = []
533 |         # Actual call
534 |         output_text = remove_stopword(input_text)
535 |         # Asserts
536 |         self.assertListEqual(output_text, expected_output)
537 | 
538 |     def test_remove_stopword_none(self):
539 |         # Setup
540 |         input_text = None
541 |         expected_output = []
542 |         # Actual call
543 |         output_text = remove_stopword(input_text)
544 |         # Asserts
545 |         self.assertListEqual(output_text, expected_output)
546 | 
547 |     def test_remove_stopword_empty_input(self):
548 |         # Setup
549 |         input_text = ''
550 |         expected_output = []
551 |         # Actual call
552 |         output_text = remove_stopword(input_text)
553 |         # Asserts
554 |         self.assertListEqual(output_text, expected_output)
555 | 
556 |     def test_remove_email(self):
557 |         # Setup
558 |         input_text = "Please email me at john.doe@email.com."
559 |         expected_output = "Please email me at ."
560 |         # Actual call
561 |         output_text = remove_email(input_text)
562 |         # Asserts
563 |         self.assertEqual(output_text, expected_output)
564 | 
565 |     def test_remove_email_no_email(self):
566 |         # Setup
567 |         input_text = "Please call me (425) 425-1234."
568 |         expected_output = "Please call me (425) 425-1234."
569 |         # Actual call
570 |         output_text = remove_email(input_text)
571 |         # Asserts
572 |         self.assertEqual(output_text, expected_output)
573 | 
574 |     def test_remove_email_all_emails(self):
575 |         # Setup
576 |         input_text = 'john.doe@email.com, john.doe@microsoft.com, janedoe@gmail.com'
577 |         expected_output = ', , '
578 |         # Actual call
579 |         output_text = remove_email(input_text)
580 |         # Asserts
581 |         self.assertEqual(output_text, expected_output)
582 | 
583 |     def test_remove_email_none(self):
584 |         # Setup
585 |         input_text = None
586 |         expected_output = ''
587 |         # Actual call
588 |         output_text = remove_email(input_text)
589 |         # Asserts
590 |         self.assertEqual(output_text, expected_output)
591 | 
592 |     def test_remove_email_empty_input(self):
593 |         # Setup
594 |         input_text = ''
595 |         expected_output = ''
596 |         # Actual call
597 |         output_text = remove_email(input_text)
598 |         # Asserts
599 |         self.assertEqual(output_text, expected_output)
600 | 
601 |     def test_remove_phone_number(self):
602 |         # Setup
603 |         input_text = "Please call me at (425) 538-0116."
604 |         expected_output = "Please call me at."
605 |         # Actual call
606 |         output_text = remove_phone_number(input_text)
607 |         # Asserts
608 |         self.assertEqual(output_text, expected_output)
609 | 
610 |     def test_remove_phone_number_no_phone(self):
611 |         # Setup
612 |         input_text = "Please email me"
613 |         expected_output = "Please email me"
614 |         # Actual call
615 |         output_text = remove_phone_number(input_text)
616 |         # Asserts
617 |         self.assertEqual(output_text, expected_output)
618 | 
619 |     def test_remove_phone_number_all_phones(self):
620 |         # Setup
621 |         input_text = '(425) 538-1234, (425)5381234, 4255381234 425-538-1234, 425.538.1234, +1 425-538-1234'
622 |         expected_output = ',,,, '
623 |         # Actual call
624 |         output_text = remove_phone_number(input_text)
625 |         # Asserts
626 |         self.assertEqual(output_text, expected_output)
627 | 
628 |     def test_remove_phone_number_none(self):
629 |         # Setup
630 |         input_text = None
631 |         expected_output = ''
632 |         # Actual call
633 |         output_text = remove_phone_number(input_text)
634 |         # Asserts
635 |         self.assertEqual(output_text, expected_output)
636 | 
637 |     def test_remove_phone_number_empty_input(self):
638 |         # Setup
639 |         input_text = ''
640 |         expected_output = ''
641 |         # Actual call
642 |         output_text = remove_phone_number(input_text)
643 |         # Asserts
644 |         self.assertEqual(output_text, expected_output)
645 | 
646 |     def test_remove_ssn(self):
647 |         # Setup
648 |         input_text = "My social security is 770-12-3456"
649 |         expected_output = "My social security is "
650 |         # Actual call
651 |         output_text = remove_ssn(input_text)
652 |         # Asserts
653 |         self.assertEqual(output_text, expected_output)
654 | 
655 |     def test_remove_ssn_no_ssn(self):
656 |         # Setup
657 |         input_text = "Hello world!"
658 |         expected_output = "Hello world!"
659 |         # Actual call
660 |         output_text = remove_ssn(input_text)
661 |         # Asserts
662 |         self.assertEqual(output_text, expected_output)
663 | 
664 |     def test_remove_ssn_all_ssns(self):
665 |         # Setup
666 |         input_text = '574-76-3766, 664-20-8576, 481-94-4099, 585-60-3079, 541714785'
667 |         expected_output = ', , , , '
668 |         # Actual call
669 |         output_text = remove_ssn(input_text)
670 |         # Asserts
671 |         self.assertEqual(output_text, expected_output)
672 | 
673 |     def test_remove_ssn_none(self):
674 |         # Setup
675 |         input_text = None
676 |         expected_output = ''
677 |         # Actual call
678 |         output_text = remove_ssn(input_text)
679 |         # Asserts
680 |         self.assertEqual(output_text, expected_output)
681 | 
682 |     def test_remove_ssn_empty_input(self):
683 |         # Setup
684 |         input_text = ''
685 |         expected_output = ''
686 |         # Actual call
687 |         output_text = remove_ssn(input_text)
688 |         # Asserts
689 |         self.assertEqual(output_text, expected_output)
690 | 
691 |     def test_remove_credit_card_number(self):
692 |         # Setup
693 |         input_text = "Please refund me 5116937367451492"
694 |         expected_output = "Please refund me "
695 |         # Actual call
696 |         output_text = remove_credit_card_number(input_text)
697 |         # Asserts
698 |         self.assertEqual(output_text, expected_output)
699 | 
700 |     def test_remove_credit_card_number_no_credit_card_number(self):
701 |         # Setup
702 |         input_text = "Hello world!"
703 |         expected_output = "Hello world!"
704 |         # Actual call
705 |         output_text = remove_credit_card_number(input_text)
706 |         # Asserts
707 |         self.assertEqual(output_text, expected_output)
708 | 
709 |     def test_remove_credit_card_number_all_credit_card_numbers(self):
710 |         # Setup
711 |         input_text = '379524231139785, 5592621143924294, 6011167500016424, 4500339642915036, 4979770613611'
712 |         expected_output = ', , , , '
713 |         # Actual call
714 |         output_text = remove_credit_card_number(input_text)
715 |         # Asserts
716 |         self.assertEqual(output_text, expected_output)
717 | 
718 |     def test_remove_credit_card_number_none(self):
719 |         # Setup
720 |         input_text = None
721 |         expected_output = ''
722 |         # Actual call
723 |         output_text = remove_credit_card_number(input_text)
724 |         # Asserts
725 |         self.assertEqual(output_text, expected_output)
726 | 
727 |     def test_remove_credit_card_number_empty_input(self):
728 |         # Setup
729 |         input_text = ''
730 |         expected_output = ''
731 |         # Actual call
732 |         output_text = remove_credit_card_number(input_text)
733 |         # Asserts
734 |         self.assertEqual(output_text, expected_output)
735 | 
736 |     def test_remove_name(self):
737 |         # Setup
738 |         input_text = "My name is Lionel Messi"
739 |         expected_output = ['My', 'name', 'is']
740 |         # Actual call
741 |         output_text = remove_name(input_text)
742 |         # Asserts
743 |         self.assertListEqual(output_text, expected_output)
744 | 
745 |     def test_remove_name_no_name(self):
746 |         # Setup
747 |         input_text = 'Hello World.'
748 |         expected_output = ['Hello', 'World', '.']
749 |         # Actual call
750 |         output_text = remove_name(input_text)
751 |         # Asserts
752 |         self.assertListEqual(output_text, expected_output)
753 | 
754 |     def test_remove_name_all_names(self):
755 |         # Setup
756 |         input_text = 'Paul Allen John Doe Jane Doe Lebron James'
757 |         expected_output = []
758 |         # Actual call
759 |         output_text = remove_name(input_text)
760 |         # Asserts
761 |         self.assertListEqual(output_text, expected_output)
762 | 
763 |     def test_remove_name_none(self):
764 |         # Setup
765 |         input_text = None
766 |         expected_output = []
767 |         # Actual call
768 |         output_text = remove_name(input_text)
769 |         # Asserts
770 |         self.assertListEqual(output_text, expected_output)
771 | 
772 |     def test_remove_name_empty_input(self):
773 |         # Setup
774 |         input_text = ''
775 |         expected_output = []
776 |         # Actual call
777 |         output_text = remove_name(input_text)
778 |         # Asserts
779 |         self.assertListEqual(output_text, expected_output)
780 | 
781 |     def test_check_spelling(self):
782 |         # Setup
783 |         input_text = "Helloo world"
784 |         expected_output = "hello world"
785 |         # Actual call
786 |         output_text = check_spelling(input_text)
787 |         # Asserts
788 |         self.assertEqual(output_text, expected_output)
789 | 
790 |     def test_check_spelling_list(self):
791 |         # Setup
792 |         input_text = ["Helloo", "world"]
793 |         expected_output = "hello world"
794 |         # Actual call
795 |         output_text = check_spelling(input_text)
796 |         # Asserts
797 |         self.assertEqual(output_text, expected_output)
798 | 
799 |     def test_check_spelling_no_spelling_error(self):
800 |         # Setup
801 |         input_text = "Hello world!"
802 |         expected_output = "hello world !"
803 |         # Actual call
804 |         output_text = check_spelling(input_text)
805 |         # Asserts
806 |         self.assertEqual(output_text, expected_output)
807 | 
808 |     def test_check_spelling_no_spelling_error_list(self):
809 |         # Setup
810 |         input_text = ["hello", "world"]
811 |         expected_output = "hello world"
812 |         # Actual call
813 |         output_text = check_spelling(input_text)
814 |         # Asserts
815 |         self.assertEqual(output_text, expected_output)
816 | 
817 |     def test_check_spelling_all_errors(self):
818 |         # Setup
819 |         input_text = 'Helllo worlld nicee to meeet'
820 |         expected_output = 'hello world nice to meet'
821 |         # Actual call
822 |         output_text = check_spelling(input_text)
823 |         # Asserts
824 |         self.assertEqual(output_text, expected_output)
825 | 
826 |     def test_check_spelling_all_errors_list(self):
827 |         # Setup
828 |         input_text = ['Helllo', 'worlld', 'nicee', 'to', 'meeet']
829 |         expected_output = 'hello world nice to meet'
830 |         # Actual call
831 |         output_text = check_spelling(input_text)
832 |         # Asserts
833 |         self.assertEqual(output_text, expected_output)
834 | 
835 |     def test_check_spelling_none(self):
836 |         # Setup
837 |         input_text = None
838 |         expected_output = ''
839 |         # Actual call
840 |         output_text = check_spelling(input_text)
841 |         # Asserts
842 |         self.assertEqual(output_text, expected_output)
843 | 
844 |     def test_check_spelling_empty_input(self):
845 |         # Setup
846 |         input_text = ''
847 |         expected_output = ''
848 |         # Actual call
849 |         output_text = check_spelling(input_text)
850 |         # Asserts
851 |         self.assertEqual(output_text, expected_output)
852 | 
853 |     def test_check_spelling_empty_list_input(self):
854 |         # Setup
855 |         input_text = []
856 |         expected_output = ''
857 |         # Actual call
858 |         output_text = check_spelling(input_text)
859 |         # Asserts
860 |         self.assertEqual(output_text, expected_output)
861 | 
862 |     def test_substitute_token(self):
863 |         # Setup
864 |         input_list = ['hello', 'world', 'msft']
865 |         expected_output = ['hello', 'world', 'Microsoft']
866 |         # Actual call
867 |         output_text = substitute_token(input_list)
868 |         # Asserts
869 |         self.assertListEqual(output_text, expected_output)
870 | 
871 |     def test_substitute_token_no_custom_token(self):
872 |         # Setup
873 |         input_list = ['hello', 'world']
874 |         expected_output = ['hello', 'world']
875 |         # Actual call
876 |         output_text = substitute_token(input_list)
877 |         # Asserts
878 |         self.assertListEqual(output_text, expected_output)
879 | 
880 |     def test_substitute_token_all_custom_tokens(self):
881 |         # Setup
882 |         input_list = ['fyi', 'btw', 'apr', 'mon']
883 |         expected_output = ['for your information', 'by the way', 'April', 'Monday']
884 |         # Actual call
885 |         output_text = substitute_token(input_list)
886 |         # Asserts
887 |         self.assertListEqual(output_text, expected_output)
888 | 
889 |     def test_substitute_token_none_input(self):
890 |         # Setup
891 |         input_text = None
892 |         expected_output = []
893 |         # Actual call
894 |         output_text = substitute_token(input_text)
895 |         # Asserts
896 |         self.assertListEqual(output_text, expected_output)
897 | 
898 |     def test_substitute_token_empty_list_input(self):
899 |         # Setup
900 |         input_text = []
901 |         expected_output = []
902 |         # Actual call
903 |         output_text = substitute_token(input_text)
904 |         # Asserts
905 |         self.assertListEqual(output_text, expected_output)
906 | 
907 |     @patch("text_preprocessing.text_preprocessing.to_lower", autospec=True)
908 |     @patch("text_preprocessing.text_preprocessing.remove_url", autospec=True)
909 |     @patch("text_preprocessing.text_preprocessing.remove_email", autospec=True)
910 |     @patch("text_preprocessing.text_preprocessing.remove_phone_number", autospec=True)
911 |     @patch("text_preprocessing.text_preprocessing.remove_itemized_bullet_and_numbering", autospec=True)
912 |     @patch("text_preprocessing.text_preprocessing.expand_contraction", autospec=True)
913 |     @patch("text_preprocessing.text_preprocessing.check_spelling", autospec=True)
914 |     @patch("text_preprocessing.text_preprocessing.remove_special_character", autospec=True)
915 |     @patch("text_preprocessing.text_preprocessing.remove_punctuation", autospec=True)
916 |     @patch("text_preprocessing.text_preprocessing.remove_whitespace", autospec=True)
917 |     @patch("text_preprocessing.text_preprocessing.normalize_unicode", autospec=True)
918 |     @patch("text_preprocessing.text_preprocessing.remove_stopword", autospec=True)
919 |     @patch("text_preprocessing.text_preprocessing.remove_name", autospec=True)
920 |     @patch("text_preprocessing.text_preprocessing.substitute_token", autospec=True)
921 |     @patch("text_preprocessing.text_preprocessing.lemmatize_word", autospec=True)
922 |     def test_preprocess_text(self,
923 |                              mock_lemmatize_word: MagicMock,
924 |                              mock_substitute_token: MagicMock,
925 |                              mock_remove_name: MagicMock,
926 |                              mock_remove_stopword: MagicMock,
927 |                              mock_normalize_unicode: MagicMock,
928 |                              mock_remove_whitespace: MagicMock,
929 |                              mock_remove_punctuation: MagicMock,
930 |                              mock_remove_special_character: MagicMock,
931 |                              mock_check_spelling: MagicMock,
932 |                              mock_expand_contraction: MagicMock,
933 |                              mock_remove_itemized_bullet_and_numbering: MagicMock,
934 |                              mock_remove_phone_number: MagicMock,
935 |                              mock_remove_email: MagicMock,
936 |                              mock_remove_url: MagicMock,
937 |                              mock_to_lower: MagicMock):
938 |         # Setup
939 |         input_text = 'a test'
940 |         # Actual call
941 |         _ = preprocess_text(input_text)
942 |         # Asserts
943 |         mock_to_lower.assert_called_once()
944 |         mock_remove_url.assert_called_once()
945 |         mock_remove_email.assert_called_once()
946 |         mock_remove_phone_number.assert_called_once()
947 |         mock_remove_itemized_bullet_and_numbering.assert_called_once()
948 |         mock_expand_contraction.assert_called_once()
949 |         mock_check_spelling.assert_called_once()
950 |         mock_remove_special_character.assert_called_once()
951 |         mock_remove_punctuation.assert_called_once()
952 |         mock_remove_whitespace.assert_called_once()
953 |         mock_normalize_unicode.assert_called_once()
954 |         mock_remove_stopword.assert_called_once()
955 |         mock_remove_name.assert_called_once()
956 |         mock_substitute_token.assert_called_once()
957 |         mock_lemmatize_word.assert_called_once()
958 | 
959 |     @patch("text_preprocessing.text_preprocessing.to_lower", autospec=True)
960 |     @patch("text_preprocessing.text_preprocessing.remove_url", autospec=True)
961 |     @patch("text_preprocessing.text_preprocessing.remove_email", autospec=True)
962 |     @patch("text_preprocessing.text_preprocessing.remove_phone_number", autospec=True)
963 |     def test_preprocess_text_custom(self,
964 |                                     mock_remove_phone_number: MagicMock, mock_remove_email: MagicMock,
965 |                                     mock_remove_url: MagicMock, mock_to_lower: MagicMock):
966 |         # Setup
967 |         input_text = 'a test'
968 |         # Actual call
969 |         pipeline_functions = [mock_to_lower, mock_remove_url, mock_remove_email, mock_remove_phone_number]
970 |         _ = preprocess_text(input_text, pipeline_functions)
971 |         # Asserts
972 |         mock_to_lower.assert_called_once()
973 |         mock_remove_url.assert_called_once()
974 |         mock_remove_email.assert_called_once()
975 |         mock_remove_phone_number.assert_called_once()
976 | 
977 |     def test_preprocess_text_integration_a(self):
978 |         # Setup
979 |         input_text = 'Helllo, I am John Doe!!!   My email is john.doe@email.com. Please visit my website ' \
980 |                      'www.johndoe.com '
981 |         expected_output = 'hello email please visit website'
982 |         # Actual call
983 |         output_text = preprocess_text(input_text)
984 |         # Asserts
985 |         self.assertEqual(output_text, expected_output)
986 | 
987 |     def test_preprocess_text_integration_custom(self):
988 |         # Setup
989 |         input_text = 'Helllo, I am John Doe!!! My email is john.doe@email.com. Visit my website www.johndoe.com '
990 |         expected_output = 'helllo i am john doe my email is  visit my website  '
991 |         # Actual call
992 |         pipeline_functions = [to_lower, remove_url, remove_email, remove_punctuation]
993 |         output_text = preprocess_text(input_text, pipeline_functions)
994 |         # Asserts
995 |         self.assertEqual(output_text, expected_output)
996 | 


--------------------------------------------------------------------------------
/text_preprocessing/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = '0.1.1'
2 | 
3 | from .text_preprocessing import (to_lower, to_upper, remove_number, remove_itemized_bullet_and_numbering, remove_url,
4 |                                  remove_punctuation, remove_special_character, keep_alpha_numeric, remove_whitespace,
5 |                                  expand_contraction, normalize_unicode, remove_stopword, remove_email,
6 |                                  remove_phone_number, remove_ssn, remove_credit_card_number, remove_name,
7 |                                  check_spelling, tokenize_word, tokenize_sentence, stem_word, lemmatize_word,
8 |                                  substitute_token, preprocess_text)
9 | 


--------------------------------------------------------------------------------
/text_preprocessing/data/custom_substitutions.csv:
--------------------------------------------------------------------------------
  1 | 2factor,two factor
  2 | 2way,two way
  3 | aapl,Apple
  4 | acct,account
  5 | accts,accounts
  6 | admins,administrators
  7 | admns,administrators
  8 | ai,artificial intelligence
  9 | amzn,Amazon
 10 | anniv,anniversary
 11 | api,application program interface
 12 | approx,approximately
 13 | appt,appointment
 14 | apr,April
 15 | asap,as soon as possible
 16 | async,asynchronous
 17 | aug,August
 18 | auth,authentication
 19 | authen,authentication
 20 | auths,authentications
 21 | avg,average
 22 | aws,Amazon Web Services
 23 | b2b,business to business
 24 | b2c,business to consumer
 25 | batt,battery
 26 | bc,because
 27 | biz,business
 28 | bldg,building
 29 | btw,by the way
 30 | cdo,chief data officer
 31 | ceo,chief executive officer
 32 | cfo,chief financial officer
 33 | cio,chief information officer
 34 | cmo,chief marketing officer
 35 | cms,content management system
 36 | cob,close of business
 37 | combo,combination
 38 | comm,communication
 39 | comms,communications
 40 | compat,compatible
 41 | config,configuration
 42 | configs,configurations
 43 | coo,chief operating officer
 44 | crm,customer relationship management
 45 | css,cascading style sheet
 46 | cta,call to action
 47 | cto,chief technology officer
 48 | ctr,click through rate
 49 | da,data analyst
 50 | db,database
 51 | de,data engineer
 52 | dec,December
 53 | demos,demonstrations
 54 | dept,department
 55 | devs,developers
 56 | dissat,dissatisfaction
 57 | distro,distribution
 58 | doc,document
 59 | docs,documents
 60 | droid,Android
 61 | ds,data science
 62 | dunno,do not know
 63 | edu,education
 64 | eg,for example
 65 | eod,end of day
 66 | eow,end of week
 67 | eps,earnings per share
 68 | esp,especially
 69 | eta,estimated time of arrival
 70 | exec,executive
 71 | execs,executives
 72 | exp,experience
 73 | f2f,face to face
 74 | face2face,face to face
 75 | fb,Facebook
 76 | feb,February
 77 | fifo,first in first out
 78 | fri,Friday
 79 | fte,full time employee
 80 | ftes,full time employees
 81 | fyi,for your information
 82 | gen,generation
 83 | goog,Google
 84 | gov,government
 85 | govt,government
 86 | gui,graphical user interface
 87 | hq,headquarters
 88 | hr,human resources
 89 | hrs,hours
 90 | iaas,infrastructure as a service
 91 | im,instant messaging
 92 | imho,in my opinion
 93 | imo,in my opinion
 94 | incl,including
 95 | info,information
 96 | infra,infrastructure
 97 | intro,introduction
 98 | ipo,initial public offering
 99 | isp,internet service provider
100 | jan,January
101 | jul,July
102 | jun,June
103 | kinda,kind of
104 | kpi,key performance indicator
105 | kpis,key performance indicators
106 | legit,legitimate
107 | lifo,last in first out
108 | mar,March
109 | mfa,multi factor authentication
110 | mgt,management
111 | mgmt,management
112 | mgr,manager
113 | mgrs,managers
114 | mic,microphone
115 | mics,microphones
116 | mins,minutes
117 | mkt,market
118 | ml,machine learning
119 | mon,Monday
120 | msft,Microsoft
121 | msg,message
122 | msgs,messages
123 | mtg,meeting
124 | mtng,meeting
125 | mtgs,meetings
126 | mtngs,meetings
127 | neo,new employee orientation
128 | nflx,Netflix
129 | nov,November
130 | o365,Office365
131 | oct,October
132 | oof,out of office
133 | ooo,out of office
134 | org,organization
135 | orgs,organizations
136 | paas,platform as a service
137 | pbi,PowerBI
138 | pbix,PowerBI
139 | pc,personal computer
140 | pcard,purchase card
141 | pcs,personal computers
142 | pic,picture
143 | pics,pictures
144 | pkg,package
145 | pls,please
146 | plz,please
147 | pm,project manager
148 | poc,proof of concept
149 | ppl,people
150 | ppt,Powerpoint
151 | pptx,Powerpoint
152 | pr,public relations
153 | prev,previous
154 | prod,production
155 | promo,promotion
156 | pte,part time employee
157 | pto,paid time off
158 | pwd,password
159 | qa,quality assurance
160 | qtr,quarter
161 | qtrly,quarterly
162 | r&d,research and development
163 | re,referring to
164 | repo,repository
165 | repos,repositories
166 | req,requirement
167 | reqs,requirements
168 | roa,return on assets
169 | roe,return on equity
170 | roi,return on investment
171 | saas,software as a service
172 | sat,Saturday
173 | sde,software development engineer
174 | sep,September
175 | sept,September
176 | sorta,sort of
177 | swe,software engineer
178 | thu,Thursday
179 | thur,Thursday
180 | thurs,Thursday
181 | thx,thanks
182 | tmw,tomorrow
183 | tue,Tuesday
184 | tues,Tuesday
185 | txt,text
186 | ui,user interface
187 | ux,user experience
188 | vs,versus
189 | wed,Wednesday
190 | wfh,work from home
191 | wrt,with respect to
192 | ww,worldwide
193 | xls,Excel
194 | xlsx,Excel
195 | yr,year
196 | yrs,years
197 | ytd,year to date
198 | 


--------------------------------------------------------------------------------
/text_preprocessing/data/ignore_spellcheck_words.txt:
--------------------------------------------------------------------------------
  1 | 2factor
  2 | 2way
  3 | aapl
  4 | acct
  5 | accts
  6 | admins
  7 | admns
  8 | ai
  9 | amzn
 10 | anniv
 11 | api
 12 | approx
 13 | appt
 14 | apr
 15 | asap
 16 | async
 17 | aug
 18 | auth
 19 | authen
 20 | auths
 21 | avg
 22 | aws
 23 | b2b
 24 | b2c
 25 | batt
 26 | bc
 27 | biz
 28 | bldg
 29 | btw
 30 | cdo
 31 | ceo
 32 | cfo
 33 | cio
 34 | cmo
 35 | cms
 36 | cob
 37 | combo
 38 | comm
 39 | comms
 40 | compat
 41 | config
 42 | configs
 43 | coo
 44 | crm
 45 | css
 46 | cta
 47 | cto
 48 | ctr
 49 | da
 50 | db
 51 | de
 52 | dec
 53 | demos
 54 | dept
 55 | devs
 56 | dissat
 57 | distro
 58 | doc
 59 | docs
 60 | droid
 61 | ds
 62 | dunno
 63 | edu
 64 | eg
 65 | eod
 66 | eow
 67 | eps
 68 | esp
 69 | eta
 70 | exec
 71 | execs
 72 | exp
 73 | f2f
 74 | face2face
 75 | fb
 76 | feb
 77 | fifo
 78 | fri
 79 | fte
 80 | ftes
 81 | fyi
 82 | gen
 83 | goog
 84 | gov
 85 | govt
 86 | gui
 87 | hq
 88 | hr
 89 | hrs
 90 | iaas
 91 | im
 92 | imho
 93 | imo
 94 | incl
 95 | info
 96 | infra
 97 | intro
 98 | ipo
 99 | isp
100 | jan
101 | jul
102 | jun
103 | kinda
104 | kpi
105 | kpis
106 | legit
107 | lifo
108 | mar
109 | mfa
110 | mgt
111 | mgmt
112 | mgr
113 | mgrs
114 | mic
115 | mics
116 | mins
117 | mkt
118 | ml
119 | mon
120 | msft
121 | msg
122 | msgs
123 | mtg
124 | mtng
125 | mtgs
126 | mtngs
127 | neo
128 | nflx
129 | nov
130 | o365
131 | oct
132 | oof
133 | ooo
134 | org
135 | orgs
136 | paas
137 | pbi
138 | pbix
139 | pc
140 | pcard
141 | pcs
142 | pic
143 | pics
144 | pkg
145 | pls
146 | plz
147 | pm
148 | poc
149 | ppl
150 | ppt
151 | pptx
152 | pr
153 | prev
154 | prod
155 | promo
156 | pte
157 | pto
158 | pwd
159 | qa
160 | qtr
161 | qtrly
162 | r&d
163 | re
164 | repo
165 | repos
166 | req
167 | reqs
168 | roa
169 | roe
170 | roi
171 | saas
172 | sat
173 | sde
174 | sep
175 | sept
176 | sorta
177 | swe
178 | thu
179 | thur
180 | thurs
181 | thx
182 | tmw
183 | tue
184 | tues
185 | txt
186 | ui
187 | ux
188 | vs
189 | wed
190 | wfh
191 | wrt
192 | ww
193 | xls
194 | xlsx
195 | yr
196 | yrs
197 | ytd


--------------------------------------------------------------------------------
/text_preprocessing/text_preprocessing.py:
--------------------------------------------------------------------------------
  1 | # Standard libraries
  2 | import os
  3 | import re
  4 | import string
  5 | import logging
  6 | import csv
  7 | from pathlib import Path
  8 | from functools import wraps
  9 | from unicodedata import normalize
 10 | from typing import List, Optional, Union, Callable
 11 | 
 12 | # Third party libraries
 13 | import contractions
 14 | import nltk
 15 | from nltk.corpus import stopwords
 16 | from nltk.tokenize import word_tokenize, PunktSentenceTokenizer
 17 | from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer, WordNetLemmatizer
 18 | from spellchecker import SpellChecker
 19 | from names_dataset import NameDataset
 20 | 
 21 | nltk.download('stopwords', quiet=True)
 22 | nltk.download('wordnet', quiet=True)
 23 | nltk.download('punkt', quiet=True)
 24 | nltk.download('omw-1.4')
 25 | 
 26 | _CUSTOM_SUB_CSV_FILE_PATH = os.path.join(os.path.dirname(__file__), 'data/custom_substitutions.csv')
 27 | _IGNORE_SPELLCHECK_WORD_FILE_PATH = os.path.join(os.path.dirname(__file__), 'data/ignore_spellcheck_words.txt')
 28 | 
 29 | LOGGER = logging.getLogger(__name__)
 30 | LOGGER.setLevel(logging.INFO)
 31 | 
 32 | 
 33 | def _return_empty_string_for_invalid_input(func):
 34 |     """ Return empty string if the input is None or empty """
 35 |     @wraps(func)
 36 |     def wrapper(*args, **kwargs):
 37 |         if 'input_text' in kwargs:
 38 |             input_text = kwargs['input_text']
 39 |         else:
 40 |             try:
 41 |                 input_text = args[0]
 42 |             except IndexError as e:
 43 |                 LOGGER.exception('No appropriate positional argument is provide.')
 44 |                 raise e
 45 |         if input_text is None or len(input_text) == 0:
 46 |             return ''
 47 |         else:
 48 |             return func(*args, **kwargs)
 49 |     return wrapper
 50 | 
 51 | 
 52 | def _return_empty_list_for_invalid_input(func):
 53 |     """ Return empty list if the input is None or empty """
 54 |     @wraps(func)
 55 |     def wrapper(*args, **kwargs):
 56 |         if 'input_text_or_list' in kwargs:
 57 |             input_text_or_list = kwargs['input_text_or_list']
 58 |         else:
 59 |             try:
 60 |                 input_text_or_list = args[0]
 61 |             except IndexError as e:
 62 |                 LOGGER.exception('No appropriate positional argument is provide.')
 63 |                 raise e
 64 |         if input_text_or_list is None or len(input_text_or_list) == 0:
 65 |             return []
 66 |         else:
 67 |             return func(*args, **kwargs)
 68 |     return wrapper
 69 | 
 70 | 
 71 | @_return_empty_string_for_invalid_input
 72 | def to_lower(input_text: str) -> str:
 73 |     """ Convert input text to lower case """
 74 |     return input_text.lower()
 75 | 
 76 | 
 77 | @_return_empty_string_for_invalid_input
 78 | def to_upper(input_text: str) -> str:
 79 |     """ Convert input text to upper case """
 80 |     return input_text.upper()
 81 | 
 82 | 
 83 | @_return_empty_string_for_invalid_input
 84 | def remove_number(input_text: str) -> str:
 85 |     """ Remove number in the input text """
 86 |     processed_text = re.sub('\d+', '', input_text)
 87 |     return processed_text
 88 | 
 89 | 
 90 | @_return_empty_string_for_invalid_input
 91 | def remove_itemized_bullet_and_numbering(input_text: str) -> str:
 92 |     """ Remove bullets or numbering in itemized input """
 93 |     processed_text = re.sub('[(\s][0-9a-zA-Z][.)]\s+|[(\s][ivxIVX]+[.)]\s+', ' ', input_text)
 94 |     return processed_text
 95 | 
 96 | 
 97 | @_return_empty_string_for_invalid_input
 98 | def remove_url(input_text: str) -> str:
 99 |     """ Remove url in the input text """
100 |     return re.sub('(www|http)\S+', '', input_text)
101 | 
102 | 
@_return_empty_string_for_invalid_input
def remove_punctuation(input_text: str, punctuations: Optional[str] = None) -> str:
    """
    Delete punctuation characters from the input text.

    :param input_text: text to clean
    :param punctuations: characters to delete; when None, Python's string.punctuation is used
    :return: the text without those characters ('' when the input is None or empty)
    """
    chars_to_drop = string.punctuation if punctuations is None else punctuations
    deletion_table = str.maketrans('', '', chars_to_drop)
    return input_text.translate(deletion_table)
113 | 
114 | 
@_return_empty_string_for_invalid_input
def remove_special_character(input_text: str, special_characters: Optional[str] = None) -> str:
    """
    Delete special characters from the input text.

    :param input_text: text to clean
    :param special_characters: characters to delete; when None, a built-in default set is used
    :return: the text without those characters ('' when the input is None or empty)
    """
    # TODO: add more special characters
    default_special_characters = 'å¼«¥ª°©ð±§µæ¹¢³¿®ä£'
    chars_to_drop = default_special_characters if special_characters is None else special_characters
    deletion_table = str.maketrans('', '', chars_to_drop)
    return input_text.translate(deletion_table)
123 | 
124 | 
@_return_empty_string_for_invalid_input
def keep_alpha_numeric(input_text: str) -> str:
    """ Keep only alphanumeric characters; everything else, whitespace included, is dropped """
    return ''.join(filter(str.isalnum, input_text))
129 | 
130 | 
@_return_empty_string_for_invalid_input
def remove_whitespace(input_text: str, remove_duplicate_whitespace: bool = True) -> str:
    """
    Strip leading and trailing whitespace and, optionally, collapse inner whitespace runs.

    :param input_text: text to clean
    :param remove_duplicate_whitespace: when True, every inner run of whitespace becomes a single space
    :return: the cleaned text ('' when the input is None or empty)
    """
    stripped = input_text.strip()
    if not remove_duplicate_whitespace:
        return stripped
    # Raw string: '\s' inside a normal literal is an invalid escape sequence and triggers a warning
    return ' '.join(re.split(r'\s+', stripped, flags=re.UNICODE))
137 | 
138 | 
@_return_empty_string_for_invalid_input
def expand_contraction(input_text: str) -> str:
    """ Expand contractions in the input text by delegating to contractions.fix """
    expanded_text = contractions.fix(input_text)
    return expanded_text
143 | 
144 | 
@_return_empty_string_for_invalid_input
def normalize_unicode(input_text: str) -> str:
    """ Reduce the text to ASCII: NFKD-decompose it, then drop every character that cannot be encoded as ASCII """
    decomposed = normalize('NFKD', input_text)
    # 'ignore' silently discards the non-ASCII code points left after decomposition (e.g. combining accents)
    ascii_bytes = decomposed.encode('ASCII', 'ignore')
    return ascii_bytes.decode('utf8')
150 | 
151 | 
@_return_empty_list_for_invalid_input
def remove_stopword(input_text_or_list: Union[str, List[str]], stop_words: Optional[set] = None) -> List[str]:
    """
    Drop stop words from a text or from a list of tokens.

    :param input_text_or_list: a string (word-tokenized first) or an already tokenized list
    :param stop_words: words to drop; when None, NLTK's English stop-word list is used. A list is converted to a set.
    :return: the remaining tokens ([] when the input is None or empty)
    """
    if stop_words is None:
        stop_words = set(stopwords.words('english'))
    elif isinstance(stop_words, list):
        stop_words = set(stop_words)

    if isinstance(input_text_or_list, str):
        return [tok for tok in word_tokenize(input_text_or_list) if tok not in stop_words]

    kept_tokens = []
    for tok in input_text_or_list:
        # Skip stop words as well as None / empty entries of the token list
        if tok in stop_words or tok is None or len(tok) == 0:
            continue
        kept_tokens.append(tok)
    return kept_tokens
167 | 
168 | 
@_return_empty_string_for_invalid_input
def remove_email(input_text: str) -> str:
    """ Delete email addresses from the input text ('' when the input is None or empty) """
    # Raw string: '\.' inside a normal literal is an invalid escape sequence and triggers a warning
    regex_pattern = r'[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}'
    # The character classes only list lower-case letters; IGNORECASE makes addresses containing capitals
    # (e.g. 'John.Doe@Email.com') match too when this function is used without a prior to_lower step.
    return re.sub(regex_pattern, '', input_text, flags=re.IGNORECASE)
174 | 
175 | 
@_return_empty_string_for_invalid_input
def remove_phone_number(input_text: str) -> str:
    """
    Delete phone numbers from the input text ('' when the input is None or empty).

    The pattern matches an optional 1-3 digit prefix (optionally led by '+'), then digit groups of 3, 3 and 4
    separated by any mix of '-', '.', spaces or parentheses, and an optional ' x<digits>' extension.
    """
    # Raw string: '\+' and '\d' inside a normal literal are invalid escape sequences and trigger warnings
    regex_pattern = r'(?:\+?(\d{1,3}))?[-. (]*(\d{3})[-. )]*(\d{3})[-. ]*(\d{4})(?: *x(\d+))?'
    return re.sub(regex_pattern, '', input_text)
181 | 
182 | 
@_return_empty_string_for_invalid_input
def remove_ssn(input_text: str) -> str:
    """
    Delete social security numbers from the input text ('' when the input is None or empty).

    Both the dashed form (###-##-####) and the plain nine-digit form are matched. The look-aheads reject numbers
    whose first group is 000, 666 or 9xx, whose middle group is 00, whose last group is 0000, and the two
    specific numbers listed at the start of each alternative.
    """
    # Raw strings: '\d' inside a normal literal is an invalid escape sequence and triggers a warning
    dashed_form = r'(?!219-09-9999|078-05-1120)(?!666|000|9\d{2})\d{3}-(?!00)\d{2}-(?!0{4})\d{4}'
    plain_form = r'(?!219099999|078051120)(?!666|000|9\d{2})\d{3}(?!00)\d{2}(?!0{4})\d{4}'
    regex_pattern = dashed_form + '|' + plain_form
    return re.sub(regex_pattern, '', input_text)
189 | 
190 | 
@_return_empty_string_for_invalid_input
def remove_credit_card_number(input_text: str) -> str:
    """
    Delete credit card numbers from the input text ('' when the input is None or empty).

    The pattern is one group of alternatives, each fixing the leading digits and the total number of digits;
    only unbroken digit runs are matched (no spaces or dashes inside the number).
    """
    # Raw strings: '\d' inside a normal literal is an invalid escape sequence and triggers a warning.
    # One alternative per line; the concatenation is the same pattern as before.
    regex_pattern = (r'(4[0-9]{12}(?:[0-9]{3})?'
                     r'|(?:5[1-5][0-9]{2}|222[1-9]|22[3-9][0-9]|2[3-6][0-9]{2}|27[01][0-9]|2720)[0-9]{12}'
                     r'|3[47][0-9]{13}'
                     r'|3(?:0[0-5]|[68][0-9])[0-9]{11}'
                     r'|6(?:011|5[0-9]{2})[0-9]{12}'
                     r'|(?:2131|1800|35\d{3})\d{11})')
    return re.sub(regex_pattern, '', input_text)
198 | 
199 | 
@_return_empty_list_for_invalid_input
def remove_name(input_text_or_list: Union[str, List[str]]) -> List[str]:
    """
    Drop tokens that the names dataset recognises as a first name or a last name.

    :param input_text_or_list: a string (word-tokenized first) or an already tokenized list
    :return: the remaining tokens ([] when the input is None or empty)
    """
    name_searcher = NameDataset()
    if isinstance(input_text_or_list, str):
        tokens = word_tokenize(input_text_or_list)
    else:
        # Filter out None / empty entries BEFORE the dataset lookup so that such entries are never handed to
        # the name searcher (previously the lookup ran first and the None check came too late).
        tokens = [token for token in input_text_or_list if token is not None and len(token) > 0]
    return [token for token in tokens
            if (not name_searcher.search_first_name(token)) and (not name_searcher.search_last_name(token))]
214 | 
215 | 
def check_spelling(input_text_or_list: Union[str, List[str]], lang='en',
                   ignore_word_file_path: Union[str, Path] = _IGNORE_SPELLCHECK_WORD_FILE_PATH) -> str:
    """
    Spell-check a text or a list of tokens and return the corrected, lower-cased text.

    :param input_text_or_list: a string (lower-cased, then word-tokenized) or a list of tokens (lower-cased;
                               None / empty entries are skipped)
    :param lang: language passed to the spell checker
    :param ignore_word_file_path: text file whose words are loaded into the checker's word frequency list so
                                  that they are treated as known words
    :return: the corrected tokens joined by single spaces ('' when the input is None or empty). Note that a
             string is returned even when a list was passed in.
    """
    if input_text_or_list is None or len(input_text_or_list) == 0:
        return ''
    spelling_checker = SpellChecker(language=lang, distance=1)
    # TODO: add acronyms into spell checker to ignore auto correction specified by _IGNORE_SPELLCHECK_WORD_FILE_PATH
    spelling_checker.word_frequency.load_text_file(ignore_word_file_path)
    if isinstance(input_text_or_list, str):
        tokens = word_tokenize(input_text_or_list.lower())
    else:
        tokens = [token.lower() for token in input_text_or_list if token is not None and len(token) > 0]

    # Look up one correction per distinct unknown word, then apply it to EVERY occurrence. The previous
    # tokens[tokens.index(word)] approach only fixed the first occurrence and raised ValueError when the
    # checker reported a word that is not literally present in the token list.
    corrections = {}
    for word in spelling_checker.unknown(tokens):
        suggestion = spelling_checker.correction(word)
        # Keep the original word when the checker has nothing to suggest, so str.join never receives None
        corrections[word] = suggestion if suggestion else word
    corrected_tokens = [corrections.get(token, token) for token in tokens]
    return ' '.join(corrected_tokens).strip()
234 | 
235 | 
def tokenize_word(input_text: str) -> List[str]:
    """ Split a text into word tokens; None or empty input gives an empty list """
    has_content = input_text is not None and len(input_text) > 0
    return word_tokenize(input_text) if has_content else []
241 | 
242 | 
def tokenize_sentence(input_text: str) -> List[str]:
    """ Split a text into sentences with a PunktSentenceTokenizer; None or empty input gives an empty list """
    has_content = input_text is not None and len(input_text) > 0
    if not has_content:
        return []
    return PunktSentenceTokenizer().tokenize(input_text)
249 | 
250 | 
@_return_empty_list_for_invalid_input
def stem_word(input_text_or_list: Union[str, List[str]],
              stemmer: Optional[Union[PorterStemmer, SnowballStemmer, LancasterStemmer]] = None
              ) -> List[str]:
    """
    Stem every token of a text or token list.

    :param input_text_or_list: a string (word-tokenized first) or a list of tokens (None / empty entries skipped)
    :param stemmer: stemmer instance to use; a PorterStemmer is created when None
    :return: the stemmed tokens ([] when the input is None or empty)
    """
    active_stemmer = PorterStemmer() if stemmer is None else stemmer
    if isinstance(input_text_or_list, str):
        candidates = word_tokenize(input_text_or_list)
    else:
        candidates = (token for token in input_text_or_list if token is not None and len(token) > 0)
    return [active_stemmer.stem(token) for token in candidates]
264 | 
265 | 
@_return_empty_list_for_invalid_input
def lemmatize_word(input_text_or_list: Union[str, List[str]],
                   lemmatizer: Optional[WordNetLemmatizer] = None
                   ) -> List[str]:
    """
    Lemmatize every token of a text or token list.

    :param input_text_or_list: a string (word-tokenized first) or a list of tokens (None / empty entries skipped)
    :param lemmatizer: lemmatizer instance to use; a WordNetLemmatizer is created when None
    :return: the lemmatized tokens ([] when the input is None or empty)
    """
    active_lemmatizer = WordNetLemmatizer() if lemmatizer is None else lemmatizer
    if isinstance(input_text_or_list, str):
        candidates = word_tokenize(input_text_or_list)
    else:
        candidates = (token for token in input_text_or_list if token is not None and len(token) > 0)
    return [active_lemmatizer.lemmatize(token) for token in candidates]
280 | 
281 | 
def substitute_token(token_list: List[str], sub_dict: Optional[dict] = None) -> List[str]:
    """
    Substitute each token by another token, e.g., 'vs' -> 'versus'.

    :param token_list: tokens to process
    :param sub_dict: mapping token -> replacement; when None, the mapping is read from the two-column CSV file
                     at _CUSTOM_SUB_CSV_FILE_PATH
    :return: a new list in which every token found in the mapping is replaced and all others are kept as they
             are ([] when token_list is None or empty)
    """
    # TODO: add more custom substitutions in the csv file specified by _CUSTOM_SUB_CSV_FILE_PATH
    if token_list is None or len(token_list) == 0:
        return []
    if sub_dict is None:
        # newline='' is how the csv module expects files to be opened; the explicit encoding keeps the result
        # independent of the platform default.
        with open(_CUSTOM_SUB_CSV_FILE_PATH, 'r', newline='', encoding='utf-8') as f:
            # Blank lines come back from csv.reader as empty rows and would make dict() raise; skip them.
            sub_dict = dict(row for row in csv.reader(f) if row)
    return [sub_dict.get(token, token) for token in token_list]
298 | 
299 | 
def preprocess_text(input_text: str, processing_function_list: Optional[List[Callable]] = None) -> str:
    """
    Run a text through a chain of preprocessing functions and return the result as a string.

    :param input_text: text to preprocess
    :param processing_function_list: functions applied in order, each receiving the previous one's output;
                                     when None, the default pipeline listed below is used
    :return: the final output; if the last function returned a list of tokens, they are joined by single spaces
    """
    pipeline = processing_function_list
    if pipeline is None:
        pipeline = [to_lower,
                    remove_url,
                    remove_email,
                    remove_phone_number,
                    remove_itemized_bullet_and_numbering,
                    expand_contraction,
                    check_spelling,
                    remove_special_character,
                    remove_punctuation,
                    remove_whitespace,
                    normalize_unicode,
                    remove_stopword,
                    remove_name,
                    substitute_token,
                    lemmatize_word]
    current = input_text
    for step in pipeline:
        current = step(current)
    return current if isinstance(current, str) else ' '.join(current)
325 | 
326 | 
if __name__ == '__main__':
    # Small demo, executed only when the module is run as a script
    text_to_process = 'Helllo, I am John Doe!!! My email is john.doe@email.com. Visit our website www.johndoe.com'

    # 1) default pipeline
    print(preprocess_text(text_to_process))

    # 2) custom pipeline
    preprocess_functions = [to_lower, remove_email, remove_url, remove_punctuation, lemmatize_word]
    print(preprocess_text(text_to_process, preprocess_functions))
335 | 


--------------------------------------------------------------------------------