├── .editorconfig ├── .flake8 ├── .gitignore ├── .travis.yml ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── requirements.txt ├── setup.py ├── text_normalizer ├── __init__.py ├── collection │ ├── __init__.py │ ├── base_collection.py │ ├── basic.py │ ├── charactor.py │ ├── eng_basic.py │ ├── punctuation_keeping.py │ ├── test │ │ ├── __init__.py │ │ ├── test_base_collection.py │ │ ├── test_basic.py │ │ ├── test_charactor.py │ │ ├── test_eng_basic.py │ │ ├── test_punctuation_keeping.py │ │ └── test_unicode_mapping.py │ └── unicode_mapping.py ├── data │ ├── punctuation │ │ ├── punctuation_mapping_0221.csv │ │ └── punctuation_mapping_0221_simplified.csv │ └── unicode │ │ ├── chinese_characters_and_digits.txt │ │ ├── chinese_characters_only.txt │ │ ├── chinese_english_digits.txt │ │ ├── chinese_english_digits_and_full_punctuations.txt │ │ ├── chinese_english_digits_and_simplified_punctuations_1.txt │ │ ├── english_characters_and_digits.txt │ │ └── english_digits_and_full_punctuations.txt ├── factory │ ├── __init__.py │ ├── base_factory.py │ ├── eng_lowercase.py │ ├── identity.py │ ├── number_token.py │ ├── punctuation_mapping.py │ ├── replace_pattern_with_token.py │ ├── strip.py │ ├── test │ │ ├── __init__.py │ │ ├── example_punctuation_mapping.csv │ │ ├── example_unicode_mapping.txt │ │ ├── test_base_factory.py │ │ ├── test_eng_lowercase.py │ │ ├── test_identity.py │ │ ├── test_number_token_test_tokenizer.py │ │ ├── test_punctuation_mapping.py │ │ ├── test_strip.py │ │ └── test_unicode_mapping.py │ ├── toolkit │ │ ├── __init__.py │ │ ├── findall_position.c │ │ └── findall_position.pyx │ └── unicode_mapping.py └── library │ ├── __init__.py │ ├── basic.py │ ├── date.py │ ├── eng_lowercase.py │ ├── identity.py │ ├── number.py │ ├── punctuation.py │ ├── punctuation_mapping.py │ ├── strip.py │ ├── test │ ├── __init__.py │ ├── test_basic.py │ ├── test_date.py │ ├── test_eng_lowercase.py │ ├── test_identity.py │ ├── test_number.py │ ├── test_punctuation.py │ ├── test_punctuation_mapping.py │ ├── test_strip.py │ ├── test_time.py │ └── test_unicode_text_normalizers.py │ ├── time.py │ └── unicode.py └── utils ├── __init__.py ├── label_propagation.c ├── label_propagation.pyx ├── setup_utils ├── __init__.py ├── get_ext.py └── remove_so_files.py └── test ├── __init__.py └── test_label_propagation.py /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | indent_style = space 5 | end_of_line = lf 6 | charset = utf-8 7 | trim_trailing_whitespace = true 8 | insert_final_newline = true 9 | 10 | [*.{js,yml,css,html}] 11 | indent_size = 2 12 | 13 | [*.{py,pyx,pxd}] 14 | indent_size = 4 15 | max_line_length = 100 16 | 17 | [*.json] 18 | indent_size = 2 19 | insert_final_newline = ignore 20 | 21 | [Makefile] 22 | indent_style = tab 23 | 24 | [*.md] 25 | indent_size = 4 26 | trim_trailing_whitespace = false 27 | 28 | [*.{c,cpp}] 29 | indent_size = 4 30 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | filename = *.py,*.pyx,*.pxd 3 | max-line-length = 100 4 | ignore = 5 | E125,E121,E266, 6 | # print is allowed 7 | T001, T003, 8 | # invalid escape sequence 9 | W605 10 | exclude = 11 | .git 12 | # __pycache__ 13 | __pycache__ 14 | # virtual environment 15 | .venv/ 16 | venv/ 17 | env/ 18 | build/ 19 | # sphinx docs 20 | docs/ 21 | max_complexity = 10 22 | statistics = true 23 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | *.fuse_hidden* 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *,cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # IPython Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # dotenv 80 | .env 81 | 82 | # virtualenv 83 | venv/ 84 | ENV/ 85 | 86 | # Spyder project settings 87 | .spyderproject 88 | 89 | # Rope project settings 90 | .ropeproject 91 | 92 | example.py 93 | .vscode/ 94 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - '3.5' 4 | - '3.6' 5 | install: 6 | - make install 7 | script: 8 | - make lint 9 | - make test 10 | deploy: 11 | provider: pypi 12 | user: solumilken 13 | password: 14 | secure: 0Opwlmv6qK1uyW2u3sE5rbdNiopeFTvG8kKAFZS3b5joRpiCKRdRGPYPIfCf1l8S5SeXq6INt6HIHcxyNDsIB1ejHYYJu33wL9tK1mUekivlEXibdEDaN3/qNfT9dZDWm/4tUFrFvHGhB6krJjIToUYsJvM3tYBSX6uCgfxFrpr6GsLNrxs/nIzy2aCD9MMReQ89iC3IQoWkTuTIbGnuj7eWEQpbhjLmBIrwJwnh5zcjdrR9PAzWakOX4bMeVa89nQiaL16icTaHthCQrLuyCP7lQ2tlh7rO4yT+UF4qLynWFAEYEQL3mvx+I/bNpKaRvHy26ZgkTLsd5mJsntbohDYN0Ydyx6nXTzuAMsElumMdVYizJghh8+/x9CfbF+CqK6qQ/UqL10OjFUinTNcYUi9jzt2hsGnno9eDjzVtlQmo4i+N3MQRciTWbQawWM7VXmjT7rGI18Zc4zp4/Y9qEZG18QZzaDPexXFOpJU7pWt07658jMHwGqmQJiyIWXTKjBq4IWxIw/s7VmE5R0ElqgCL6spwC3ErHzJvvX1XhrU98lDqyk1VWQxtRl/jyA3OLKnInou92jLPH3M0iAriKlttHxEacFEj0rsaaYDYtLwvyIiWFNdaATraIBaH8cQeoMJH4HmNTCAQFFRUW1B1/Ss2XCgEuUpLaPQaGrpNyNo= 15 | skip_upload_docs: true 16 | skip_cleanup: true 17 | on: 18 | tags: true 19 | python: 3.5 20 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 YOCTOL INFO INC. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include text_normalizer/data/*/* 2 | include README.md 3 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .DEFAULT_GOAL := all 2 | 3 | .PHONY: installself 4 | installself: 5 | python setup.py build_ext 6 | pip install -e . 7 | 8 | .PHONY: install 9 | install: 10 | pip install -U pip wheel setuptools cython 11 | pip install -r requirements.txt 12 | make installself 13 | 14 | .PHONY: lint 15 | lint: 16 | flake8 17 | 18 | .PHONY: test 19 | test: 20 | python -m unittest -v 21 | 22 | .PHONY: all 23 | all: test lint 24 | 25 | .PHONY: clean 26 | clean: 27 | rm -rf `find . -name __pycache__` 28 | rm -f `find . -type f -name '*.py[co]' ` 29 | rm -f `find . -type f -name '*~' ` 30 | rm -f `find . -type f -name '.*~' ` 31 | rm -rf .cache 32 | rm -rf htmlcov 33 | rm -rf *.egg-info 34 | rm -f .coverage 35 | rm -f .coverage.* 36 | rm -rf build 37 | python setup_utils/remove_so_files.py 38 | make -C docs clean 39 | python setup.py clean 40 | 41 | .PHONY: dev-test 42 | dev-test: 43 | rm -rf build 44 | python utils/setup_utils/remove_so_files.py 45 | python setup.py build_ext 46 | pip install -e . 47 | make lint 48 | make test 49 | 50 | .PHONY: docs 51 | docs: 52 | make installself 53 | make -C docs 54 | 55 | .PHONY: distribute 56 | distribute: 57 | make clean 58 | python setup.py sdist 59 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # text-normalizer 2 | 3 | [![travis][travis-image]][travis-url] 4 | [![pypi][pypi-image]][pypi-url] 5 | 6 | [travis-image]: https://img.shields.io/travis/Yoctol/text-normalizer.svg?style=flat 7 | [travis-url]: https://travis-ci.org/Yoctol/text-normalizer 8 | [pypi-image]: https://img.shields.io/pypi/v/text-normalizer.svg?style=flat 9 | [pypi-url]: https://pypi.python.org/pypi/text-normalizer 10 | 11 | Normalize your Text String. 12 | It is a python package that help you normalize your text data and recover it. 13 | 14 | ## Install 15 | Use Python3 16 | ``` 17 | > pip install text-normalizer 18 | ``` 19 | ## Usage 20 | ```python 21 | from text_normalizer.text_normalizer_collection_library import chinese_charactor_text_normalizer_collection_2 22 | 23 | 24 | input_sentence = " 我在85.33度C買了一杯900──1000元的咖啡 《ohoh》?? m_m" 25 | nor_sentence, meta = chinese_charactor_text_normalizer_collection_2.normalize(input_sentence) 26 | print(nor_sentence) 27 | > "我在_float_度c買了一杯_int_-_int_元的咖啡 ?? m_m" 28 | 29 | de_sentence = chinese_charactor_text_normalizer_collection_2.denormalize(nor_sentence, meta) 30 | print(de_sentence) 31 | > "我在85.33度C買了一杯900──1000元的咖啡 《ohoh》?? m_m", 32 | 33 | ``` 34 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | -e . 2 | flake8-config-yoctol>=0.0.11 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from setuptools import setup, find_packages 4 | from utils.setup_utils.get_ext import get_ext_modules_n_cmdclass 5 | 6 | 7 | ROOT_DIR = Path(__file__).parent 8 | 9 | 10 | # make description 11 | readme = ROOT_DIR.joinpath('README.md') 12 | if readme.exists(): 13 | with readme.open() as f: 14 | long_description = f.read() 15 | try: 16 | from pypandoc import convert_text 17 | long_description = convert_text( 18 | long_description, 'rst', format='md') 19 | except ImportError: 20 | print("warning: pypandoc module not found, could not convert Markdown to RST") 21 | else: 22 | long_description = '-' 23 | 24 | 25 | # get cython extension 26 | ext_modules, cmdclass = get_ext_modules_n_cmdclass() 27 | 28 | 29 | setup( 30 | name="text-normalizer", 31 | version="0.1.3", 32 | description="Yoctol Natural Language Text Normalizer", 33 | license="MIT", 34 | author="Solumilken", 35 | author_email="yien.tsai@yoctol.com", 36 | url="https://github.com/Yoctol/text-normalizer", 37 | packages=find_packages(), 38 | install_requires=[ 39 | 'pandas;python_version>="3.5"', 40 | 'pandas<0.21;python_version<"3.5"', 41 | ], 42 | python_requires=">=3.5", 43 | long_description=long_description, 44 | classifiers=[ 45 | "Programming Language :: Python", 46 | "Programming Language :: Python :: 3.5", 47 | "Programming Language :: Python :: 3.6", 48 | ], 49 | include_package_data=True, 50 | cmdclass=cmdclass, 51 | ext_modules=ext_modules, 52 | ) 53 | -------------------------------------------------------------------------------- /text_normalizer/__init__.py: -------------------------------------------------------------------------------- 1 | from os.path import abspath, dirname 2 | 3 | ROOT_DIR = dirname(abspath(__file__)) 4 | -------------------------------------------------------------------------------- /text_normalizer/collection/__init__.py: -------------------------------------------------------------------------------- 1 | from .basic import ( # noqa 2 | basic_text_normalizer_collection, 3 | number_with_digits_text_normalizer_collection, 4 | ) 5 | from .eng_basic import eng_basic_text_normalizer_collection # noqa 6 | from .punctuation_keeping import ( # noqa 7 | full_punctuation_keeping_text_normalizer_collection, 8 | simplified_punctuation_keeping_text_normalizer_collection, 9 | number_with_digits_n_simplified_punctuation_text_normalizer_collection, 10 | ) 11 | from .charactor import ( # noqa 12 | chinese_charactor_text_normalizer_collection_1, 13 | chinese_charactor_text_normalizer_collection_2, 14 | chinese_charactor_text_normalizer_collection_3, 15 | chinese_charactor_text_normalizer_collection_4, 16 | ) 17 | from .unicode_mapping import ( # noqa 18 | u_zh_text_normalizer_collection_1, 19 | u_zh_text_normalizer_collection_2, 20 | u_zh_text_normalizer_collection_3, 21 | u_zh_text_normalizer_collection_4, 22 | u_en_text_normalizer_collection_1, 23 | u_en_text_normalizer_collection_2, 24 | u_en_text_normalizer_collection_3, 25 | u_zh_en_text_normalizer_collection_1, 26 | u_zh_en_text_normalizer_collection_2, 27 | u_zh_en_text_normalizer_collection_3, 28 | u_zh_en_text_normalizer_collection_4, 29 | ) 30 | -------------------------------------------------------------------------------- /text_normalizer/collection/base_collection.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | 4 | class BaseCollection(object): 5 | 6 | def __init__(self): 7 | self.text_normalizers = [] 8 | 9 | def add_text_normalizers( 10 | self, 11 | text_normalizers: List[object], 12 | ) -> None: 13 | ''' 14 | TODO: Ensure text normalizer is a subclass of BaseTextNormalizer 15 | ''' 16 | for text_normalizer in text_normalizers: 17 | self.text_normalizers.append(text_normalizer) 18 | 19 | def clear_text_normalizers(self): 20 | self.text_normalizers = [] 21 | 22 | def normalize( 23 | self, 24 | sentence: str, 25 | )-> (str, List[dict]): 26 | meta = [] 27 | for text_normalizer in self.text_normalizers: 28 | sentence, meta_data = text_normalizer.normalize(sentence=sentence) 29 | meta.append({ 30 | 'name': text_normalizer.name, 31 | 'revised_sentence': sentence, 32 | 'meta_data': meta_data, 33 | }) 34 | return sentence, meta 35 | 36 | def denormalize( 37 | self, 38 | sentence: str, 39 | meta: List[dict], 40 | ) -> str: 41 | for text_normalizer, record in zip( 42 | self.text_normalizers[::-1], 43 | meta[::-1], 44 | ): 45 | if record['name'] == text_normalizer.name: 46 | sentence = text_normalizer.denormalize( 47 | sentence=sentence, 48 | meta=record['meta_data'], 49 | ) 50 | return sentence.strip() 51 | 52 | # def ldenormalize( 53 | # self, 54 | # sentence: List[str], 55 | # meta: List[dict], 56 | # ): 57 | # for text_normalizer, record in zip(self.text_normalizers[::-1], meta[::-1]): 58 | # if record['name'] == text_normalizer.name: 59 | # sentence = text_normalizer.lretrieve( 60 | # sentence=sentence, 61 | # meta=record['meta_data'], 62 | # ) 63 | # return sentence 64 | -------------------------------------------------------------------------------- /text_normalizer/collection/basic.py: -------------------------------------------------------------------------------- 1 | from .base_collection import BaseCollection 2 | from ..library import ( 3 | whitespace_char_text_normalizer, 4 | float_with_space_text_normalizer, 5 | int_with_space_text_normalizer, 6 | float_with_digit_n_space_text_normalizer, 7 | int_with_digit_n_space_text_normalizer, 8 | all_punctuation_without_endpoint_text_normalizer, 9 | all_punctuation_without_underscore_text_normalizer, 10 | pure_strip_text_normalizer, 11 | eng_lowercase_text_normalizer, 12 | ) 13 | 14 | 15 | basic_text_normalizer_collection = BaseCollection() 16 | basic_text_normalizer_collection.add_text_normalizers( 17 | text_normalizers=[ 18 | eng_lowercase_text_normalizer, 19 | all_punctuation_without_endpoint_text_normalizer, 20 | float_with_space_text_normalizer, 21 | int_with_space_text_normalizer, 22 | all_punctuation_without_underscore_text_normalizer, 23 | whitespace_char_text_normalizer, 24 | pure_strip_text_normalizer, 25 | ], 26 | ) 27 | 28 | number_with_digits_text_normalizer_collection = BaseCollection() 29 | number_with_digits_text_normalizer_collection.add_text_normalizers( 30 | text_normalizers=[ 31 | eng_lowercase_text_normalizer, 32 | all_punctuation_without_endpoint_text_normalizer, 33 | float_with_digit_n_space_text_normalizer, 34 | int_with_digit_n_space_text_normalizer, 35 | all_punctuation_without_underscore_text_normalizer, 36 | whitespace_char_text_normalizer, 37 | pure_strip_text_normalizer, 38 | ], 39 | ) 40 | -------------------------------------------------------------------------------- /text_normalizer/collection/charactor.py: -------------------------------------------------------------------------------- 1 | from .base_collection import BaseCollection 2 | from ..library import ( 3 | whitespace_char_text_normalizer, 4 | float_text_normalizer, 5 | int_text_normalizer, 6 | int_with_digit_text_normalizer, 7 | float_with_digit_text_normalizer, 8 | full_punctuation_mapping_text_normalizer, 9 | simplified_punctuation_mapping_text_normalizer, 10 | pure_strip_text_normalizer, 11 | eng_lowercase_text_normalizer, 12 | ) 13 | 14 | 15 | chinese_charactor_text_normalizer_collection_1 = BaseCollection() 16 | chinese_charactor_text_normalizer_collection_1.add_text_normalizers( 17 | text_normalizers=[ 18 | eng_lowercase_text_normalizer, 19 | simplified_punctuation_mapping_text_normalizer, 20 | float_text_normalizer, 21 | int_text_normalizer, 22 | whitespace_char_text_normalizer, 23 | pure_strip_text_normalizer, 24 | ], 25 | ) 26 | 27 | 28 | chinese_charactor_text_normalizer_collection_2 = BaseCollection() 29 | chinese_charactor_text_normalizer_collection_2.add_text_normalizers( 30 | text_normalizers=[ 31 | eng_lowercase_text_normalizer, 32 | full_punctuation_mapping_text_normalizer, 33 | float_text_normalizer, 34 | int_text_normalizer, 35 | whitespace_char_text_normalizer, 36 | pure_strip_text_normalizer, 37 | ], 38 | ) 39 | 40 | 41 | chinese_charactor_text_normalizer_collection_3 = BaseCollection() 42 | chinese_charactor_text_normalizer_collection_3.add_text_normalizers( 43 | text_normalizers=[ 44 | eng_lowercase_text_normalizer, 45 | simplified_punctuation_mapping_text_normalizer, 46 | float_with_digit_text_normalizer, 47 | int_with_digit_text_normalizer, 48 | whitespace_char_text_normalizer, 49 | pure_strip_text_normalizer, 50 | ], 51 | ) 52 | 53 | 54 | chinese_charactor_text_normalizer_collection_4 = BaseCollection() 55 | chinese_charactor_text_normalizer_collection_4.add_text_normalizers( 56 | text_normalizers=[ 57 | eng_lowercase_text_normalizer, 58 | full_punctuation_mapping_text_normalizer, 59 | float_with_digit_text_normalizer, 60 | int_with_digit_text_normalizer, 61 | whitespace_char_text_normalizer, 62 | pure_strip_text_normalizer, 63 | ], 64 | ) 65 | -------------------------------------------------------------------------------- /text_normalizer/collection/eng_basic.py: -------------------------------------------------------------------------------- 1 | from .base_collection import BaseCollection 2 | from ..library import ( 3 | whitespace_char_text_normalizer, 4 | pure_strip_text_normalizer, 5 | eng_lowercase_text_normalizer, 6 | ) 7 | 8 | 9 | eng_basic_text_normalizer_collection = BaseCollection() 10 | eng_basic_text_normalizer_collection.add_text_normalizers( 11 | text_normalizers=[ 12 | eng_lowercase_text_normalizer, 13 | whitespace_char_text_normalizer, 14 | pure_strip_text_normalizer, 15 | ], 16 | ) 17 | -------------------------------------------------------------------------------- /text_normalizer/collection/punctuation_keeping.py: -------------------------------------------------------------------------------- 1 | from .base_collection import BaseCollection 2 | from ..library import ( 3 | whitespace_char_text_normalizer, 4 | float_with_space_text_normalizer, 5 | int_with_space_text_normalizer, 6 | float_with_digit_n_space_text_normalizer, 7 | int_with_digit_n_space_text_normalizer, 8 | full_punctuation_mapping_text_normalizer, 9 | simplified_punctuation_mapping_text_normalizer, 10 | pure_strip_text_normalizer, 11 | eng_lowercase_text_normalizer, 12 | ) 13 | 14 | 15 | full_punctuation_keeping_text_normalizer_collection = BaseCollection() 16 | full_punctuation_keeping_text_normalizer_collection.add_text_normalizers( 17 | text_normalizers=[ 18 | eng_lowercase_text_normalizer, 19 | full_punctuation_mapping_text_normalizer, 20 | float_with_space_text_normalizer, 21 | int_with_space_text_normalizer, 22 | whitespace_char_text_normalizer, 23 | pure_strip_text_normalizer, 24 | ], 25 | ) 26 | 27 | 28 | simplified_punctuation_keeping_text_normalizer_collection = BaseCollection() 29 | simplified_punctuation_keeping_text_normalizer_collection.add_text_normalizers( 30 | text_normalizers=[ 31 | eng_lowercase_text_normalizer, 32 | simplified_punctuation_mapping_text_normalizer, 33 | float_with_space_text_normalizer, 34 | int_with_space_text_normalizer, 35 | whitespace_char_text_normalizer, 36 | pure_strip_text_normalizer, 37 | ], 38 | ) 39 | 40 | 41 | number_with_digits_n_simplified_punctuation_text_normalizer_collection = \ 42 | BaseCollection() 43 | number_with_digits_n_simplified_punctuation_text_normalizer_collection.add_text_normalizers( 44 | text_normalizers=[ 45 | eng_lowercase_text_normalizer, 46 | simplified_punctuation_mapping_text_normalizer, 47 | float_with_digit_n_space_text_normalizer, 48 | int_with_digit_n_space_text_normalizer, 49 | whitespace_char_text_normalizer, 50 | pure_strip_text_normalizer, 51 | ], 52 | ) 53 | -------------------------------------------------------------------------------- /text_normalizer/collection/test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yoctol/text-normalizer/3609c10cd229c08b4623531e82d2292fc370734c/text_normalizer/collection/test/__init__.py -------------------------------------------------------------------------------- /text_normalizer/collection/test/test_base_collection.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from unittest.mock import Mock, call 3 | 4 | from unittest import TestCase 5 | from ..base_collection import BaseCollection 6 | 7 | 8 | class TestBaseCollection(TestCase): 9 | 10 | def setUp(self): 11 | self.base_text_normalizer_collection = BaseCollection() 12 | self.example_sentence = "0123456789" 13 | self.text_normalizers = Mock() 14 | self.text_normalizer_0 = Mock() 15 | self.text_normalizer_0.normalize = Mock(return_value=("我123456789", {"我": ["0"]})) 16 | self.text_normalizer_0.denormalize = Mock(return_value="023456789") 17 | self.text_normalizer_0.name = "text_normalizer_0" 18 | self.text_normalizer_1 = Mock() 19 | self.text_normalizer_1.normalize = Mock(return_value=("我23456789", None)) 20 | self.text_normalizer_1.denormalize = Mock(return_value="我23456789") 21 | self.text_normalizer_1.name = "text_normalizer_1" 22 | self.text_normalizer_2 = Mock() 23 | self.text_normalizer_2.normalize = Mock(return_value=("我要3456789", {"要": ["2"]})) 24 | self.text_normalizer_2.denormalize = Mock(return_value="我23456789") 25 | self.text_normalizer_2.name = "text_normalizer_2" 26 | self.text_normalizers.f0, self.text_normalizers.f1, self.text_normalizers.f2 = \ 27 | self.text_normalizer_0, self.text_normalizer_1, self.text_normalizer_2 28 | 29 | def test_attributes(self): 30 | self.assertEqual( 31 | { 32 | 'text_normalizers': [], 33 | }, 34 | self.base_text_normalizer_collection.__dict__, 35 | ) 36 | 37 | def test_add_text_normalizers(self): 38 | self.base_text_normalizer_collection.add_text_normalizers([self.text_normalizer_0]) 39 | self.assertEqual( 40 | [self.text_normalizer_0], 41 | self.base_text_normalizer_collection.text_normalizers, 42 | ) 43 | self.base_text_normalizer_collection.add_text_normalizers([self.text_normalizer_1]) 44 | self.assertEqual( 45 | [self.text_normalizer_0, self.text_normalizer_1], 46 | self.base_text_normalizer_collection.text_normalizers, 47 | ) 48 | self.base_text_normalizer_collection.add_text_normalizers([self.text_normalizer_2]) 49 | self.assertEqual( 50 | [self.text_normalizer_0, self.text_normalizer_1, self.text_normalizer_2], 51 | self.base_text_normalizer_collection.text_normalizers, 52 | ) 53 | self.base_text_normalizer_collection.clear_text_normalizers() 54 | self.base_text_normalizer_collection.add_text_normalizers( 55 | [self.text_normalizer_0, self.text_normalizer_1, self.text_normalizer_2], 56 | ) 57 | self.assertEqual( 58 | [self.text_normalizer_0, self.text_normalizer_1, self.text_normalizer_2], 59 | self.base_text_normalizer_collection.text_normalizers, 60 | ) 61 | 62 | def test_call(self): 63 | self.base_text_normalizer_collection.add_text_normalizers([self.text_normalizer_0]) 64 | self.base_text_normalizer_collection.normalize( 65 | sentence=self.example_sentence, 66 | ) 67 | self.text_normalizers.assert_has_calls( 68 | [call.f0.normalize(sentence=self.example_sentence)], 69 | ) 70 | 71 | self.base_text_normalizer_collection.add_text_normalizers([self.text_normalizer_1]) 72 | self.base_text_normalizer_collection.normalize( 73 | sentence=self.example_sentence, 74 | ) 75 | self.text_normalizers.assert_has_calls( 76 | [ 77 | call.f0.normalize(sentence=self.example_sentence), 78 | call.f1.normalize(sentence="我123456789"), 79 | ], 80 | ) 81 | self.base_text_normalizer_collection.add_text_normalizers([self.text_normalizer_2]) 82 | self.base_text_normalizer_collection.normalize( 83 | sentence=self.example_sentence, 84 | ) 85 | self.text_normalizers.assert_has_calls( 86 | [ 87 | call.f0.normalize(sentence=self.example_sentence), 88 | call.f1.normalize(sentence="我123456789"), 89 | call.f2.normalize(sentence="我23456789"), 90 | ], 91 | ) 92 | 93 | def test_denormalize(self): 94 | self.base_text_normalizer_collection.add_text_normalizers([self.text_normalizer_0]) 95 | self.base_text_normalizer_collection.denormalize( 96 | sentence="我123456789", 97 | meta=[ 98 | { 99 | 'name': "text_normalizer_0", 100 | 'revised_sentence': "XDDD", 101 | 'meta_data': {"我": ["0"]}, 102 | }, 103 | ], 104 | ) 105 | self.text_normalizers.assert_has_calls( 106 | [ 107 | call.f0.denormalize( 108 | sentence="我123456789", 109 | meta={"我": ["0"]}, 110 | ), 111 | ], 112 | ) 113 | 114 | self.base_text_normalizer_collection.add_text_normalizers([self.text_normalizer_1]) 115 | self.base_text_normalizer_collection.denormalize( 116 | sentence="我123456789", 117 | meta=[ 118 | { 119 | 'name': "text_normalizer_0", 120 | 'revised_sentence': "XDDD", 121 | 'meta_data': {"我": ["0"]}, 122 | }, 123 | { 124 | 'name': "text_normalizer_1", 125 | "revise_sentence": ">O<", 126 | "meta_data": None, 127 | }, 128 | ], 129 | ) 130 | self.text_normalizers.assert_has_calls( 131 | [ 132 | call.f1.denormalize( 133 | sentence="我123456789", 134 | meta=None, 135 | ), 136 | call.f0.denormalize( 137 | sentence="我23456789", 138 | meta={"我": ["0"]}, 139 | ), 140 | ], 141 | ) 142 | 143 | self.base_text_normalizer_collection.add_text_normalizers([self.text_normalizer_2]) 144 | self.base_text_normalizer_collection.denormalize( 145 | sentence="我要3456789", 146 | meta=[ 147 | { 148 | 'name': "text_normalizer_0", 149 | 'revised_sentence': "XDDD", 150 | 'meta_data': {"我": ["0"]}, 151 | }, 152 | { 153 | 'name': "text_normalizer_1", 154 | "revise_sentence": ">O<", 155 | "meta_data": None, 156 | }, 157 | { 158 | 'name': "text_normalizer_2", 159 | "revised_sentence": "M_M", 160 | "meta_data": {"要": ["2"]}, 161 | }, 162 | ], 163 | ) 164 | 165 | self.text_normalizers.assert_has_calls( 166 | [ 167 | call.f2.denormalize( 168 | sentence="我要3456789", 169 | meta={"要": ["2"]}, 170 | ), 171 | call.f1.denormalize( 172 | sentence="我23456789", 173 | meta=None, 174 | ), 175 | call.f0.denormalize( 176 | sentence="我23456789", 177 | meta={"我": ["0"]}, 178 | ), 179 | ], 180 | ) 181 | -------------------------------------------------------------------------------- /text_normalizer/collection/test/test_basic.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from unittest import TestCase 3 | 4 | from ..basic import ( 5 | basic_text_normalizer_collection, 6 | number_with_digits_text_normalizer_collection, 7 | ) 8 | 9 | 10 | class BasicNormalizerCollectionTestCase(TestCase): 11 | 12 | def test_basic_text_normalizer_collection(self): 13 | normalizer = basic_text_normalizer_collection 14 | test_cases = [ 15 | ( 16 | '我在85.33度C買了一杯(*999*)的咖啡--', 17 | '我在 _float_ 度c買了一杯 _int_ 的咖啡', 18 | '我在85.33度C買了一杯999的咖啡', 19 | ), 20 | ( 21 | '++', 22 | '', 23 | '', 24 | ), 25 | ] 26 | for test_case in test_cases: 27 | with self.subTest(test_case=test_case): 28 | revised_sentence, meta = normalizer.normalize( 29 | sentence=test_case[0], 30 | ) 31 | self.assertEqual( 32 | test_case[1], 33 | revised_sentence, 34 | ) 35 | recovered_sentence = normalizer.denormalize( 36 | sentence=test_case[1], 37 | meta=meta, 38 | ) 39 | self.assertEqual( 40 | test_case[2], 41 | recovered_sentence, 42 | ) 43 | 44 | def test_number_with_digits_text_normalizer_collection(self): 45 | normalizer = number_with_digits_text_normalizer_collection 46 | test_cases = [ 47 | ( 48 | '我在85.33度C買了一杯(*999*)的咖啡--', 49 | '我在 _2float2_ 度c買了一杯 _3int_ 的咖啡', 50 | '我在85.33度C買了一杯999的咖啡', 51 | ), 52 | ( 53 | '++??', 54 | '', 55 | '', 56 | ), 57 | ] 58 | for test_case in test_cases: 59 | with self.subTest(test_case=test_case): 60 | revised_sentence, meta = normalizer.normalize( 61 | sentence=test_case[0], 62 | ) 63 | self.assertEqual( 64 | test_case[1], 65 | revised_sentence, 66 | ) 67 | recovered_sentence = normalizer.denormalize( 68 | sentence=test_case[1], 69 | meta=meta, 70 | ) 71 | self.assertEqual( 72 | test_case[2], 73 | recovered_sentence, 74 | ) 75 | -------------------------------------------------------------------------------- /text_normalizer/collection/test/test_charactor.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from unittest import TestCase 3 | 4 | from ..charactor import ( 5 | chinese_charactor_text_normalizer_collection_1, 6 | chinese_charactor_text_normalizer_collection_2, 7 | chinese_charactor_text_normalizer_collection_3, 8 | chinese_charactor_text_normalizer_collection_4, 9 | ) 10 | 11 | 12 | class CharactorTextNormalizerCollectionTestCase(TestCase): 13 | 14 | def _run_test(self, test_cases, normalizer, normalizer_name): 15 | for test_case in test_cases: 16 | with self.subTest(test_case=(normalizer_name, test_case[0])): 17 | revised_sentence, meta = normalizer.normalize( 18 | sentence=test_case[0], 19 | ) 20 | self.assertEqual( 21 | test_case[1], 22 | revised_sentence, 23 | ) 24 | recovered_sentence = normalizer.denormalize( 25 | sentence=test_case[1], 26 | meta=meta, 27 | ) 28 | self.assertEqual( 29 | test_case[2], 30 | recovered_sentence, 31 | ) 32 | 33 | def test_chinese_charactor_text_normalizer_collection_1(self): 34 | test_cases = [ 35 | ( 36 | " 我在85.33度C買了一杯900──1000元的咖啡《ohoh》?? m_m", 37 | "我在_float_度c買了一杯_int_-_int_元的咖啡(ohoh)?? m_m", 38 | "我在85.33度C買了一杯900──1000元的咖啡《ohoh》?? m_m", 39 | ), 40 | ( 41 | "買5──8年五門車 ~~", 42 | "買_int_-_int_年五門車 --", 43 | "買5──8年五門車 ~~", 44 | ), 45 | ( 46 | "2001 ~ 2007年紅色\藍色的Benz OHOHOH", 47 | "_int_ - _int_年紅色,藍色的benz ohohoh", 48 | "2001 ~ 2007年紅色\藍色的Benz OHOHOH", 49 | ), 50 | ] 51 | self._run_test( 52 | test_cases=test_cases, 53 | normalizer=chinese_charactor_text_normalizer_collection_1, 54 | normalizer_name="chinese_charactor_text_normalizer_collection_1", 55 | ) 56 | 57 | def test_chinese_charactor_text_normalizer_collection_2(self): 58 | test_cases = [ 59 | ( 60 | " 我在85.33度C買了一杯900──1000元的咖啡 《ohoh》?? m_m", 61 | "我在_float_度c買了一杯_int_-_int_元的咖啡 ?? m_m", 62 | "我在85.33度C買了一杯900──1000元的咖啡 《ohoh》?? m_m", 63 | ), 64 | ( 65 | "買5-8年五門車 ~~ ", 66 | "買_int_-_int_年五門車 ~~", 67 | "買5-8年五門車 ~~", 68 | ), 69 | ( 70 | "2001 ~ 2007年紅色\藍色的Benz OHOHOH ", 71 | "_int_ ~ _int_年紅色\藍色的benz ohohoh", 72 | "2001 ~ 2007年紅色\藍色的Benz OHOHOH", 73 | ), 74 | ] 75 | self._run_test( 76 | test_cases=test_cases, 77 | normalizer=chinese_charactor_text_normalizer_collection_2, 78 | normalizer_name="chinese_charactor_text_normalizer_collection_2", 79 | ) 80 | 81 | def test_chinese_charactor_text_normalizer_collection_3(self): 82 | test_cases = [ 83 | ( 84 | " 我在85.33度C買了一杯900-1000元的咖啡{ohoh}", 85 | "我在_2float2_度c買了一杯_3int_-_4int_元的咖啡(ohoh)", 86 | "我在85.33度C買了一杯900-1000元的咖啡{ohoh}", 87 | ), 88 | ( 89 | " 買5 - 80年 五門車~ ", 90 | "買_1int_ - _2int_年 五門車-", 91 | "買5 - 80年 五門車~", 92 | ), 93 | ] 94 | self._run_test( 95 | test_cases=test_cases, 96 | normalizer=chinese_charactor_text_normalizer_collection_3, 97 | normalizer_name="chinese_charactor_text_normalizer_collection_3", 98 | ) 99 | 100 | def test_chinese_charactor_text_normalizer_collection_4(self): 101 | test_cases = [ 102 | ( 103 | " 我在85.333度C買了一杯900──1000元的咖啡《ohoh》?? m_m ", 104 | "我在_2float3_度c買了一杯_3int_-_4int_元的咖啡?? m_m", 105 | "我在85.333度C買了一杯900──1000元的咖啡《ohoh》?? m_m", 106 | ), 107 | ( 108 | "買5-800年 五門車 ~~ ", 109 | "買_1int_-_3int_年 五門車 ~~", 110 | "買5-800年 五門車 ~~", 111 | ), 112 | ( 113 | " 2001 ~ 2007年紅色\藍色的Benz OHOHOH", 114 | "_4int_ ~ _4int_年紅色\藍色的benz ohohoh", 115 | "2001 ~ 2007年紅色\藍色的Benz OHOHOH", 116 | ), 117 | ] 118 | self._run_test( 119 | test_cases=test_cases, 120 | normalizer=chinese_charactor_text_normalizer_collection_4, 121 | normalizer_name="chinese_charactor_text_normalizer_collection_4", 122 | ) 123 | -------------------------------------------------------------------------------- /text_normalizer/collection/test/test_eng_basic.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from unittest import TestCase 3 | 4 | from ..eng_basic import eng_basic_text_normalizer_collection 5 | 6 | 7 | class EngBasicNormalizerCollectionTestCase(TestCase): 8 | 9 | def test_eng_basic_text_normalizer_collection(self): 10 | test_cases = [ 11 | ( 12 | 'Hoa DADA loves to eat chicken pie.', 13 | 'hoa dada loves to eat chicken pie.', 14 | 'Hoa DADA loves to eat chicken pie.', 15 | ), 16 | ( 17 | 'CPH DA DA want to hang out with Hoa \t\t DADA! \n\n', 18 | 'cph da da want to hang out with hoa dada!', 19 | 'CPH DA DA want to hang out with Hoa DADA!', 20 | ), 21 | ] 22 | for test_case in test_cases: 23 | with self.subTest(test_case=test_case): 24 | revised_sentence, meta = eng_basic_text_normalizer_collection.normalize( 25 | sentence=test_case[0], 26 | ) 27 | self.assertEqual(test_case[1], revised_sentence) 28 | recovered_sentence = eng_basic_text_normalizer_collection.denormalize( 29 | sentence=revised_sentence, 30 | meta=meta, 31 | ) 32 | self.assertEqual(test_case[2], recovered_sentence) 33 | -------------------------------------------------------------------------------- /text_normalizer/collection/test/test_punctuation_keeping.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from unittest import TestCase 3 | 4 | from ..punctuation_keeping import ( 5 | full_punctuation_keeping_text_normalizer_collection, 6 | simplified_punctuation_keeping_text_normalizer_collection, 7 | number_with_digits_n_simplified_punctuation_text_normalizer_collection, 8 | ) 9 | 10 | 11 | class PunctuationKeepingTextNormalizerCollectionTestCase(TestCase): 12 | 13 | def test_full_punctuation_keeping_text_normalizer_collection(self): 14 | normalizer = full_punctuation_keeping_text_normalizer_collection 15 | test_cases = [ 16 | ( 17 | "我在85.33度C買了一杯900──1000元的咖啡《ohoh》??", 18 | "我在 _float_ 度c買了一杯 _int_ - _int_ 元的咖啡??", 19 | ), 20 | ( 21 | "買5──8年五門車~~", 22 | "買 _int_ - _int_ 年五門車~~", 23 | ), 24 | ] 25 | for test_case in test_cases: 26 | with self.subTest(test_case=test_case): 27 | revised_sentence, meta = normalizer.normalize( 28 | sentence=test_case[0], 29 | ) 30 | self.assertEqual( 31 | test_case[1], 32 | revised_sentence, 33 | ) 34 | recovered_sentence = normalizer.denormalize( 35 | sentence=test_case[1], 36 | meta=meta, 37 | ) 38 | self.assertEqual( 39 | test_case[0], 40 | recovered_sentence, 41 | ) 42 | 43 | def test_simplified_punctuation_keeping_text_normalizer_collection(self): 44 | normalizer = simplified_punctuation_keeping_text_normalizer_collection 45 | test_cases = [ 46 | ( 47 | "我在85.33度C買了一杯900-1000元的咖啡{ohoh}", 48 | "我在 _float_ 度c買了一杯 _int_ - _int_ 元的咖啡(ohoh)", 49 | ), 50 | ( 51 | "買5-8年五門車~", 52 | "買 _int_ - _int_ 年五門車-", 53 | ), 54 | ] 55 | for test_case in test_cases: 56 | with self.subTest(test_case=test_case): 57 | revised_sentence, meta = normalizer.normalize( 58 | sentence=test_case[0], 59 | ) 60 | self.assertEqual( 61 | test_case[1], 62 | revised_sentence, 63 | ) 64 | recovered_sentence = normalizer.denormalize( 65 | sentence=test_case[1], 66 | meta=meta, 67 | ) 68 | self.assertEqual( 69 | test_case[0], 70 | recovered_sentence, 71 | ) 72 | 73 | def test_number_with_digits_n_simplified_punctuation_text_normalizer_collection(self): 74 | normalizer = number_with_digits_n_simplified_punctuation_text_normalizer_collection 75 | test_cases = [ 76 | ( 77 | "我在85.33度C買了一杯900-1000元的咖啡{ohoh}", 78 | "我在 _2float2_ 度c買了一杯 _3int_ - _4int_ 元的咖啡(ohoh)", 79 | ), 80 | ( 81 | "買5-8年五門車~", 82 | "買 _1int_ - _1int_ 年五門車-", 83 | ), 84 | ] 85 | for test_case in test_cases: 86 | with self.subTest(test_case=test_case): 87 | revised_sentence, meta = normalizer.normalize( 88 | sentence=test_case[0], 89 | ) 90 | self.assertEqual( 91 | test_case[1], 92 | revised_sentence, 93 | ) 94 | recovered_sentence = normalizer.denormalize( 95 | sentence=test_case[1], 96 | meta=meta, 97 | ) 98 | self.assertEqual( 99 | test_case[0], 100 | recovered_sentence, 101 | ) 102 | -------------------------------------------------------------------------------- /text_normalizer/collection/test/test_unicode_mapping.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from unittest import TestCase 3 | 4 | from ..unicode_mapping import ( 5 | u_zh_text_normalizer_collection_1, 6 | u_zh_text_normalizer_collection_2, 7 | u_zh_text_normalizer_collection_3, 8 | u_zh_text_normalizer_collection_4, 9 | u_en_text_normalizer_collection_1, 10 | u_en_text_normalizer_collection_2, 11 | u_en_text_normalizer_collection_3, 12 | u_zh_en_text_normalizer_collection_1, 13 | u_zh_en_text_normalizer_collection_2, 14 | u_zh_en_text_normalizer_collection_3, 15 | u_zh_en_text_normalizer_collection_4, 16 | ) 17 | 18 | 19 | class UnicodeTextNormalizerCollectionTestCase(TestCase): 20 | 21 | def unit_test(self, normalizer, test_cases): 22 | for test_case in test_cases: 23 | with self.subTest(test_case=test_case): 24 | revised_sentence, meta = normalizer.normalize( 25 | sentence=test_case[0], 26 | ) 27 | self.assertEqual( 28 | test_case[1], 29 | revised_sentence, 30 | ) 31 | recovered_sentence = normalizer.denormalize( 32 | sentence=test_case[1], 33 | meta=meta, 34 | ) 35 | self.assertEqual( 36 | test_case[0], 37 | recovered_sentence, 38 | ) 39 | 40 | def test_u_zh_text_normalizer_collection_1(self): 41 | normalizer = u_zh_text_normalizer_collection_1 42 | test_cases = [ 43 | ( 44 | '我在85.33度C買了一杯900──1000元的咖啡《ohoh》??', 45 | '我在 度 買了一杯 元的咖啡 ', 46 | ), 47 | ( 48 | '買5──8年五門車~~', 49 | '買 年五門車 ', 50 | ), 51 | ] 52 | self.unit_test( 53 | normalizer=normalizer, 54 | test_cases=test_cases, 55 | ) 56 | 57 | def test_u_zh_text_normalizer_collection_2(self): 58 | normalizer = u_zh_text_normalizer_collection_2 59 | test_cases = [ 60 | ( 61 | '我在85.33度C買了一杯900 1000元的咖啡《ohoh》??', 62 | '我在85.33度 買了一杯900 1000元的咖啡 ', 63 | ), 64 | ( 65 | '買5──8年五門車~~', 66 | '買5 8年五門車 ', 67 | ), 68 | ] 69 | self.unit_test( 70 | normalizer=normalizer, 71 | test_cases=test_cases, 72 | ) 73 | 74 | def test_u_zh_text_normalizer_collection_3(self): 75 | normalizer = u_zh_text_normalizer_collection_3 76 | test_cases = [ 77 | ( 78 | '我在85.33度C買了一杯900 1000元的咖啡《ohoh》??', 79 | '我在_float_度 買了一杯_int_ _int_元的咖啡 ', 80 | ), 81 | ( 82 | '買5──8年五門車~~', 83 | '買_int_ _int_年五門車 ', 84 | ), 85 | ] 86 | self.unit_test( 87 | normalizer=normalizer, 88 | test_cases=test_cases, 89 | ) 90 | 91 | def test_u_zh_text_normalizer_collection_4(self): 92 | normalizer = u_zh_text_normalizer_collection_4 93 | test_cases = [ 94 | ( 95 | '我在85.333度C買了一杯900 1000元的咖啡《ohoh》??', 96 | '我在_2float3_度 買了一杯_3int_ _4int_元的咖啡 ', 97 | ), 98 | ( 99 | '買5──8年五門車~~', 100 | '買_1int_ _1int_年五門車 ', 101 | ), 102 | ] 103 | self.unit_test( 104 | normalizer=normalizer, 105 | test_cases=test_cases, 106 | ) 107 | 108 | def test_u_en_text_normalizer_collection_1(self): 109 | normalizer = u_en_text_normalizer_collection_1 110 | test_cases = [ 111 | ( 112 | 'I want to buy 300 cups of $10.7 coffee. OHOH@@', 113 | 'i want to buy 300 cups of $10.7 coffee. ohoh@@', 114 | ), 115 | ] 116 | self.unit_test( 117 | normalizer=normalizer, 118 | test_cases=test_cases, 119 | ) 120 | 121 | def test_u_en_text_normalizer_collection_2(self): 122 | normalizer = u_en_text_normalizer_collection_2 123 | test_cases = [ 124 | ( 125 | 'I want to buy 300 cups of $10.7 coffee. OHOH', 126 | 'i want to buy _int_ cups of $_float_ coffee. ohoh', 127 | ), 128 | ] 129 | self.unit_test( 130 | normalizer=normalizer, 131 | test_cases=test_cases, 132 | ) 133 | 134 | def test_u_en_text_normalizer_collection_3(self): 135 | normalizer = u_en_text_normalizer_collection_3 136 | test_cases = [ 137 | ( 138 | 'I want to buy 300 cups of $10.7 coffee. OHOH', 139 | 'i want to buy _3int_ cups of $_2float1_ coffee. ohoh', 140 | ), 141 | ] 142 | self.unit_test( 143 | normalizer=normalizer, 144 | test_cases=test_cases, 145 | ) 146 | 147 | def test_u_zh_en_text_normalizer_collection_1(self): 148 | normalizer = u_zh_en_text_normalizer_collection_1 149 | test_cases = [ 150 | ( 151 | '我在85.333度C買了a cup of900-1000元的咖啡《ohoh》??', 152 | '我在_float_度c買了a cup of_int_-_int_元的咖啡??', 153 | ), 154 | ( 155 | '+1~~', 156 | '+_int_~~', 157 | ), 158 | ( 159 | ',買5~80年五門車~~', 160 | ',買_int_~_int_年五門車~~', 161 | ), 162 | ( 163 | '<><>@@##', 164 | '<><>@@##', 165 | ), 166 | ] 167 | self.unit_test( 168 | normalizer=normalizer, 169 | test_cases=test_cases, 170 | ) 171 | 172 | def test_u_zh_en_text_normalizer_collection_2(self): 173 | normalizer = u_zh_en_text_normalizer_collection_2 174 | test_cases = [ 175 | ( 176 | '我在85.333度C買了a cup of900-1000元的咖啡《ohoh》??', 177 | '我在_2float3_度c買了a cup of_3int_-_4int_元的咖啡??', 178 | ), 179 | ( 180 | '+1~~', 181 | '+_1int_~~', 182 | ), 183 | ( 184 | ',買5~80年五門車~~', 185 | ',買_1int_~_2int_年五門車~~', 186 | ), 187 | ( 188 | '<><>@@##', 189 | '<><>@@##', 190 | ), 191 | ] 192 | self.unit_test( 193 | normalizer=normalizer, 194 | test_cases=test_cases, 195 | ) 196 | 197 | def test_u_zh_en_text_normalizer_collection_3(self): 198 | normalizer = u_zh_en_text_normalizer_collection_3 199 | test_cases = [ 200 | ( 201 | '我在85.333度C買了a cup of900~1000元的咖啡《ohoh》??', 202 | '我在_float_度c買了a cup of_int_-_int_元的咖啡 ohoh ', 203 | ), 204 | ( 205 | '+1~~', 206 | '+_int_--', 207 | ), 208 | ( 209 | '<><>@@##', 210 | ' ', 211 | ), 212 | ] 213 | self.unit_test( 214 | normalizer=normalizer, 215 | test_cases=test_cases, 216 | ) 217 | 218 | def test_u_zh_en_text_normalizer_collection_4(self): 219 | normalizer = u_zh_en_text_normalizer_collection_4 220 | test_cases = [ 221 | ( 222 | '我在85.333度C買了a cup of900~1000元的咖啡《OhoH》??', 223 | '我在_2float3_度c買了a cup of_3int_-_4int_元的咖啡 ohoh ', 224 | ), 225 | ( 226 | '+1~~', 227 | '+_1int_--', 228 | ), 229 | ( 230 | ',買5~80年五門車~~', 231 | ',買_1int_-_2int_年五門車--', 232 | ), 233 | ( 234 | '<><>@@##', 235 | ' ', 236 | ), 237 | ] 238 | self.unit_test( 239 | normalizer=normalizer, 240 | test_cases=test_cases, 241 | ) 242 | -------------------------------------------------------------------------------- /text_normalizer/collection/unicode_mapping.py: -------------------------------------------------------------------------------- 1 | from .base_collection import BaseCollection 2 | from ..library import ( 3 | whitespace_reduction_text_normalizer, 4 | eng_lowercase_text_normalizer, 5 | float_text_normalizer, 6 | int_text_normalizer, 7 | int_with_digit_text_normalizer, 8 | float_with_digit_text_normalizer, 9 | unicode__chinese_characters_text_normalizer, 10 | unicode__chinese_characters_and_digits_text_normalizer, 11 | unicode__english_digits_and_full_punctuations_text_normalizer, 12 | unicode__chinese_english_digits_and_full_punctuations_text_normalizer, 13 | unicode__chinese_english_digits_and_simplified_punctuations_1_text_normalizer, 14 | ) 15 | 16 | 17 | u_zh_text_normalizer_collection_1 = BaseCollection() 18 | u_zh_text_normalizer_collection_1.add_text_normalizers( 19 | text_normalizers=[ 20 | unicode__chinese_characters_text_normalizer, 21 | unicode__chinese_characters_and_digits_text_normalizer, 22 | whitespace_reduction_text_normalizer, 23 | ], 24 | ) 25 | 26 | u_zh_text_normalizer_collection_2 = BaseCollection() 27 | u_zh_text_normalizer_collection_2.add_text_normalizers( 28 | text_normalizers=[ 29 | unicode__chinese_characters_and_digits_text_normalizer, 30 | whitespace_reduction_text_normalizer, 31 | ], 32 | ) 33 | 34 | u_zh_text_normalizer_collection_3 = BaseCollection() 35 | u_zh_text_normalizer_collection_3.add_text_normalizers( 36 | text_normalizers=[ 37 | unicode__chinese_characters_and_digits_text_normalizer, 38 | float_text_normalizer, 39 | int_text_normalizer, 40 | whitespace_reduction_text_normalizer, 41 | ], 42 | ) 43 | 44 | u_zh_text_normalizer_collection_4 = BaseCollection() 45 | u_zh_text_normalizer_collection_4.add_text_normalizers( 46 | text_normalizers=[ 47 | unicode__chinese_characters_and_digits_text_normalizer, 48 | float_with_digit_text_normalizer, 49 | int_with_digit_text_normalizer, 50 | whitespace_reduction_text_normalizer, 51 | ], 52 | ) 53 | 54 | u_en_text_normalizer_collection_1 = BaseCollection() 55 | u_en_text_normalizer_collection_1.add_text_normalizers( 56 | text_normalizers=[ 57 | unicode__english_digits_and_full_punctuations_text_normalizer, 58 | eng_lowercase_text_normalizer, 59 | whitespace_reduction_text_normalizer, 60 | ], 61 | ) 62 | 63 | u_en_text_normalizer_collection_2 = BaseCollection() 64 | u_en_text_normalizer_collection_2.add_text_normalizers( 65 | text_normalizers=[ 66 | unicode__english_digits_and_full_punctuations_text_normalizer, 67 | eng_lowercase_text_normalizer, 68 | float_text_normalizer, 69 | int_text_normalizer, 70 | whitespace_reduction_text_normalizer, 71 | ], 72 | ) 73 | 74 | u_en_text_normalizer_collection_3 = BaseCollection() 75 | u_en_text_normalizer_collection_3.add_text_normalizers( 76 | text_normalizers=[ 77 | unicode__english_digits_and_full_punctuations_text_normalizer, 78 | eng_lowercase_text_normalizer, 79 | float_with_digit_text_normalizer, 80 | int_with_digit_text_normalizer, 81 | whitespace_reduction_text_normalizer, 82 | ], 83 | ) 84 | 85 | u_zh_en_text_normalizer_collection_1 = BaseCollection() 86 | u_zh_en_text_normalizer_collection_1.add_text_normalizers( 87 | text_normalizers=[ 88 | unicode__chinese_english_digits_and_full_punctuations_text_normalizer, 89 | eng_lowercase_text_normalizer, 90 | float_text_normalizer, 91 | int_text_normalizer, 92 | whitespace_reduction_text_normalizer, 93 | ], 94 | ) 95 | 96 | u_zh_en_text_normalizer_collection_2 = BaseCollection() 97 | u_zh_en_text_normalizer_collection_2.add_text_normalizers( 98 | text_normalizers=[ 99 | unicode__chinese_english_digits_and_full_punctuations_text_normalizer, 100 | eng_lowercase_text_normalizer, 101 | float_with_digit_text_normalizer, 102 | int_with_digit_text_normalizer, 103 | whitespace_reduction_text_normalizer, 104 | ], 105 | ) 106 | 107 | u_zh_en_text_normalizer_collection_3 = BaseCollection() 108 | u_zh_en_text_normalizer_collection_3.add_text_normalizers( 109 | text_normalizers=[ 110 | unicode__chinese_english_digits_and_simplified_punctuations_1_text_normalizer, 111 | eng_lowercase_text_normalizer, 112 | float_text_normalizer, 113 | int_text_normalizer, 114 | whitespace_reduction_text_normalizer, 115 | ], 116 | ) 117 | 118 | u_zh_en_text_normalizer_collection_4 = BaseCollection() 119 | u_zh_en_text_normalizer_collection_4.add_text_normalizers( 120 | text_normalizers=[ 121 | unicode__chinese_english_digits_and_simplified_punctuations_1_text_normalizer, 122 | eng_lowercase_text_normalizer, 123 | float_with_digit_text_normalizer, 124 | int_with_digit_text_normalizer, 125 | whitespace_reduction_text_normalizer, 126 | ], 127 | ) 128 | -------------------------------------------------------------------------------- /text_normalizer/data/punctuation/punctuation_mapping_0221.csv: -------------------------------------------------------------------------------- 1 | "before","after" 2 | "( ( ❨ ﹙ ( ︵","(" 3 | ") ) ❩ ﹚ ) ︶",")" 4 | """ 『 「 “ ‘ ' ` ﹁ ﹃ 〝 〃 ’ "" 』 」 ” ’' ﹂ ﹄ 〞","""" 5 | "[ 〔 〘 【 ﹝ 【 〔 ︹ ︻","[" 6 | "] 〕 〙 】 ﹞ 】 〕 ︺ ︼","]" 7 | "{ ❴ ﹛ { ︷","{" 8 | "} ❵ ﹜ } ︸","}" 9 | "< ⟨ 《 〈 〈 < ﹤ ≦ ︽ ︿","<" 10 | "> ⟩ 》 〉 > ﹥ ≧ ︾ ﹀",">" 11 | ", , , 、 , ﹐","," 12 | "- — ― ── ﹣ – — ╴ ¯  ̄ ﹉ ﹊ ﹋ ﹌","-" 13 | "~ ~ ~","~" 14 | "! !","!" 15 | ". 。 ‧ . ﹒ ˙ ·","." 16 | "... …","..." 17 | ": : ﹕ ︰",":" 18 | " ; ﹔",";" 19 | "? ?","?" 20 | "+ + + ﹢","+" 21 | "% %","%" 22 | "* * × ╳","*" 23 | "| | ︱ ︳ ∣ ︴","|" 24 | "/ ∕ ╱ / ∥","/" 25 | "\ ﹨ ╲ \ ﹨","\" 26 | "# #","#" 27 | "$ $","$" 28 | "@ @","@" 29 | "& &","&" 30 | "= ﹦ = ≡ = ≒","=" 31 | "﹍ ﹎ ﹏ _","_" 32 | -------------------------------------------------------------------------------- /text_normalizer/data/punctuation/punctuation_mapping_0221_simplified.csv: -------------------------------------------------------------------------------- 1 | "before","after" 2 | "( ( ❨ ﹙ ( ︵ [ 〔 〘 【 ﹝ 【 〔 ︹ ︻ < ⟨ 《 〈 〈 < ﹤ ≦ ︽ ︿ { ❴ ﹛ { ︷","(" 3 | ") ) ❩ ﹚ ) ︶ ] 〕 〙 】 ﹞ 】 〕 ︺ ︼ > ⟩ 》 〉 > ﹥ ≧ ︾ ﹀ } ❵ ﹜ } ︸",")" 4 | """ 『 「 “ ‘ ' ` ﹁ ﹃ 〝 〃 ’ "" 』 」 ” ’' ﹂ ﹄ 〞","""" 5 | ", , , 、 , ﹐ / ∕ ╱ / ∥ \ ﹨ ╲ \ ﹨ | | ︱ ︳ ∣ ︴","," 6 | "- — ― ── ﹣ – — ╴ ¯  ̄ ﹉ ﹊ ﹋ ﹌ ~ ~","-" 7 | "! !","!" 8 | ". 。 ‧ . ﹒ ˙ ·","." 9 | "... …","..." 10 | ": : ﹕ ︰",":" 11 | " ; ﹔",";" 12 | "? ?","?" 13 | "+ + + ﹢","+" 14 | "% %","%" 15 | "* * × ╳","*" 16 | "# #","#" 17 | "$ $","$" 18 | "@ @","@" 19 | "& &","&" 20 | "= ﹦ = ≡ = ≒","=" 21 | "﹍ ﹎ ﹏ _","_" 22 | -------------------------------------------------------------------------------- /text_normalizer/data/unicode/chinese_characters_and_digits.txt: -------------------------------------------------------------------------------- 1 | 0030-0039:one2one(HalfWidth Numbers) 2 | FF10-FF19:one2one(FullWidth Numbers) 3 | 4E00-9FFF:one2one(CJK Unified Ideographs) 4 | F900-FAFF:one2one(CJK Compatibility Ideographs) 5 | 002E:002E(.) -------------------------------------------------------------------------------- /text_normalizer/data/unicode/chinese_characters_only.txt: -------------------------------------------------------------------------------- 1 | 4E00-9FFF:one2one(CJK Unified Ideographs) 2 | F900-FAFF:one2one(CJK Compatibility Ideographs) -------------------------------------------------------------------------------- /text_normalizer/data/unicode/chinese_english_digits.txt: -------------------------------------------------------------------------------- 1 | 0030-0039:one2one(HalfWidth Numbers) 2 | 0041-005A:one2one(HalfWidth Uppercase English Characters) 3 | 0061-007A:one2one(HalfWidth Lower English Characters) 4 | FF10-FF19:one2one(FullWidth Numbers) 5 | FF21-FF2A:one2one(FullWidth Uppercase English Characters) 6 | FF41-FF5A:one2one(FullWidth Lower English Characters) 7 | 4E00-9FFF:one2one(CJK Unified Ideographs) 8 | F900-FAFF:one2one(CJK Compatibility Ideographs) -------------------------------------------------------------------------------- /text_normalizer/data/unicode/chinese_english_digits_and_full_punctuations.txt: -------------------------------------------------------------------------------- 1 | 0030-0039:one2one(HalfWidth Numbers) 2 | 0041-005A:one2one(HalfWidth Uppercase English Characters) 3 | 0061-007A:one2one(HalfWidth Lower English Characters) 4 | FF10-FF19:one2one(FullWidth Numbers) 5 | FF21-FF2A:one2one(FullWidth Uppercase English Characters) 6 | FF41-FF5A:one2one(FullWidth Lower English Characters) 7 | 4E00-9FFF:one2one(CJK Unified Ideographs) 8 | F900-FAFF:one2one(CJK Compatibility Ideographs) 9 | FF01 01C3 0021:0021(!) 10 | 3003 300C 300D 300E 300F 201C 201D 201F FF62 FF63 FF02 0022:0022(") 11 | FF03 0023:0023(#) 12 | 1F4B2 FF04 0024:0024($) 13 | FF05 0025:0025(%) 14 | 1F674 FF06 0026:0026(&) 15 | 2018 2019 FF07 0027:0027(') 16 | FF5F FF08 0028:0028(() 17 | FF60 FF09 0029:0029()) 18 | 2217 FF0A 002A:002A(*) 19 | FF0B 002B:002B(+) 20 | 3001 201A FF64 FF0C 002C:002C(,) 21 | 2010 23BA 23BB 23BC 23BD FF0D 002D:002D(-) 22 | 3002 302A 302B 302C 302D 302E 2218 2219 FF65 FF61 FF0E 002E:002E(.) 23 | 27CB 2215 2044 0338 2215 FF0F 002F:002F(/) 24 | 302F 0589 05C3 A789 2236 FF1A 003A:003A(:) 25 | 037E FF1B 003B:003B(;) 26 | 3008 300A 2039 227A 2329 FF1C 003C:003C(<) 27 | 2261 10190 A78A FF1D 003D:003D(=) 28 | 3009 300B 203A 227B 232A FF1E 003E:003E(>) 29 | 203D FF1F 003F:003F(?) 30 | FF20 0040:0040(@) 31 | 3010 3016 FF3B 005B:005B([) 32 | 20E5 2216 FF3C 005C:005C(\) 33 | 3011 3017 FF3D 005D:005D(]) 34 | 2038 FF3E 005E:005E(^) 35 | 02CD 0331 0332 FF3F 005F:005F(_) 36 | 2035 FF40 0060:0060(`) 37 | FF5B 007B:007B({) 38 | 2223 23B8 23B9 23D0 FF5C 007C:007C(|) 39 | FF5D 007D:007D(}) 40 | 301C 2053 02DC 0303 223C 223D 223E 223F FF5E 007E:007E(~) -------------------------------------------------------------------------------- /text_normalizer/data/unicode/chinese_english_digits_and_simplified_punctuations_1.txt: -------------------------------------------------------------------------------- 1 | 0030-0039:one2one(HalfWidth Numbers) 2 | 0041-005A:one2one(HalfWidth Uppercase English Characters) 3 | 0061-007A:one2one(HalfWidth Lower English Characters) 4 | FF10-FF19:one2one(FullWidth Numbers) 5 | FF21-FF2A:one2one(FullWidth Uppercase English Characters) 6 | FF41-FF5A:one2one(FullWidth Lower English Characters) 7 | 4E00-9FFF:one2one(CJK Unified Ideographs) 8 | F900-FAFF:one2one(CJK Compatibility Ideographs) 9 | 1F4B2 FF04 0024:0024($) 10 | FF0B 002B:002B(+) 11 | FF05 0025:0025(%) 12 | 3001 201A FF64 FF0C 002C 037E FF1B 003B 27CB 2215 2044 0338 2215 FF0F 002F 2223 23B8 23B9 23D0 FF5C 007C 1F674 FF06 0026:002C(separation symbols) 13 | 2010 23BA 23BB 23BC 23BD FF0D 002D 301C 2053 02DC 0303 223C 223D 223E 223F FF5E 007E:002D(to) 14 | 3002 302A 302B 302C 302D 302E 2218 2219 FF65 FF61 FF0E 002E:002E(.) -------------------------------------------------------------------------------- /text_normalizer/data/unicode/english_characters_and_digits.txt: -------------------------------------------------------------------------------- 1 | 0030-0039:one2one(HalfWidth Numbers) 2 | 0041-005A:one2one(HalfWidth Uppercase English Characters) 3 | 0061-007A:one2one(HalfWidth Lower English Characters) 4 | FF10-FF19:one2one(FullWidth Numbers) 5 | FF21-FF2A:one2one(FullWidth Uppercase English Characters) 6 | FF41-FF5A:one2one(FullWidth Lower English Characters) -------------------------------------------------------------------------------- /text_normalizer/data/unicode/english_digits_and_full_punctuations.txt: -------------------------------------------------------------------------------- 1 | 0030-0039:one2one(HalfWidth Numbers) 2 | 0041-005A:one2one(HalfWidth Uppercase English Characters) 3 | 0061-007A:one2one(HalfWidth Lower English Characters) 4 | FF10-FF19:one2one(FullWidth Numbers) 5 | FF21-FF2A:one2one(FullWidth Uppercase English Characters) 6 | FF41-FF5A:one2one(FullWidth Lower English Characters) 7 | FF01 01C3 0021:0021(!) 8 | 3003 300C 300D 300E 300F 201C 201D 201F FF62 FF63 FF02 0022:0022(") 9 | FF03 0023:0023(#) 10 | 1F4B2 FF04 0024:0024($) 11 | FF05 0025:0025(%) 12 | 1F674 FF06 0026:0026(&) 13 | 2018 2019 FF07 0027:0027(') 14 | FF5F FF08 0028:0028(() 15 | FF60 FF09 0029:0029()) 16 | 2217 FF0A 002A:002A(*) 17 | FF0B 002B:002B(+) 18 | 3001 201A FF64 FF0C 002C:002C(,) 19 | 2010 23BA 23BB 23BC 23BD FF0D 002D:002D(-) 20 | 3002 302A 302B 302C 302D 302E 2218 2219 FF65 FF61 FF0E 002E:002E(.) 21 | 27CB 2215 2044 0338 2215 FF0F 002F:002F(/) 22 | 302F 0589 05C3 A789 2236 FF1A 003A:003A(:) 23 | 037E FF1B 003B:003B(;) 24 | 3008 300A 2039 227A 2329 FF1C 003C:003C(<) 25 | 2261 10190 A78A FF1D 003D:003D(=) 26 | 3009 300B 203A 227B 232A FF1E 003E:003E(>) 27 | 203D FF1F 003F:003F(?) 28 | FF20 0040:0040(@) 29 | 3010 3016 FF3B 005B:005B([) 30 | 20E5 2216 FF3C 005C:005C(\) 31 | 3011 3017 FF3D 005D:005D(]) 32 | 2038 FF3E 005E:005E(^) 33 | 02CD 0331 0332 FF3F 005F:005F(_) 34 | 2035 FF40 0060:0060(`) 35 | FF5B 007B:007B({) 36 | 2223 23B8 23B9 23D0 FF5C 007C:007C(|) 37 | FF5D 007D:007D(}) 38 | 301C 2053 02DC 0303 223C 223D 223E 223F FF5E 007E:007E(~) -------------------------------------------------------------------------------- /text_normalizer/factory/__init__.py: -------------------------------------------------------------------------------- 1 | from .eng_lowercase import EngLowercase # noqa 2 | from .identity import Identity # noqa 3 | from .number_token import NumberToken # noqa 4 | from .punctuation_mapping import PunctuationMapping # noqa 5 | from .replace_pattern_with_token import ReplacePatternWithToken # noqa 6 | from .strip import Strip # noqa 7 | from .unicode_mapping import UnicodeMapping # noqa 8 | -------------------------------------------------------------------------------- /text_normalizer/factory/base_factory.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | 4 | class BaseFactory(object): 5 | 6 | def __init__( 7 | self, 8 | denormalizable: bool = False, 9 | name: str = None, 10 | ) -> None: 11 | self.denormalizable = denormalizable 12 | if name is None: 13 | self.name = self.__class__.__name__ 14 | else: 15 | self.name = name 16 | 17 | def normalize( 18 | self, 19 | sentence: str, 20 | ) -> (str, List[dict]): 21 | raise NotImplementedError 22 | 23 | def denormalize( 24 | self, 25 | sentence: str, 26 | meta: dict = None, 27 | ) -> str: 28 | ''' 29 | If the text normalizer is denormalizable, then this method should be implemented. 30 | ''' 31 | if not self.denormalizable: 32 | return sentence 33 | 34 | # def ldenormalize( 35 | # self, 36 | # sentence: List[str], 37 | # meta: dict = None, 38 | # ) -> str: 39 | # if not self.denormalizable: 40 | # return sentence 41 | -------------------------------------------------------------------------------- /text_normalizer/factory/eng_lowercase.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict, Tuple 2 | import re 3 | 4 | from .base_factory import BaseFactory 5 | 6 | 7 | class EngLowercase(BaseFactory): 8 | 9 | def __init__(self, name='eng_lowercase'): 10 | super().__init__(name=name, denormalizable=True) 11 | self.fullwidth_uppercase = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" 12 | self.fullwidth_lowercase = "abcdefghijklmnopqrstuvwxyz" 13 | self.halfwidth_uppercase = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" 14 | self.halfwidth_lowercase = "abcdefghijklmnopqrstuvwxyz" 15 | 16 | self.pattern = "[a-zA-Z{}{}]+".format( 17 | self.fullwidth_uppercase, 18 | self.fullwidth_lowercase, 19 | ) 20 | self.findall_prog = re.compile(self.pattern) 21 | self.mapping_table = self.gen_table() 22 | 23 | def gen_table(self) -> Dict[str, str]: 24 | table = {} 25 | for index in range(26): 26 | table[self.fullwidth_uppercase[index]] = \ 27 | self.halfwidth_lowercase[index] 28 | table[self.fullwidth_lowercase[index]] = \ 29 | self.halfwidth_lowercase[index] 30 | table[self.halfwidth_uppercase[index]] = \ 31 | self.halfwidth_lowercase[index] 32 | return table 33 | 34 | def lowercase(self, sentence: str) -> str: 35 | output = [] 36 | for char in sentence: 37 | if char in self.mapping_table: 38 | output.append(self.mapping_table[char]) 39 | else: 40 | output.append(char) 41 | return ''.join(output) 42 | 43 | def normalize( 44 | self, 45 | sentence: str, 46 | ) -> Tuple[str, List[dict]]: 47 | eng_words = self.findall_prog.findall(sentence) 48 | if len(eng_words) == 0: 49 | return sentence, None 50 | else: 51 | meta = [] 52 | for eng_word in eng_words: 53 | meta.append( 54 | { 55 | 'before': eng_word, 56 | 'after': self.lowercase(eng_word), 57 | }, 58 | ) 59 | return self.lowercase(sentence), meta 60 | 61 | def denormalize( 62 | self, 63 | sentence: str, 64 | meta: List[dict] = None, 65 | ) -> str: 66 | if (not self.denormalizable) or (meta is None): 67 | # Case1: self.denormalizable = False 68 | return sentence 69 | else: 70 | begin_index = 0 71 | output = [] 72 | for single_meta in meta: 73 | start = sentence.find(single_meta['after'], begin_index) 74 | if start != -1: 75 | if begin_index != start: 76 | output.append(sentence[begin_index: start]) 77 | begin_index = start 78 | output.append(single_meta['before']) 79 | begin_index += len(single_meta['before']) 80 | if begin_index != len(sentence): 81 | output.append(sentence[begin_index:]) 82 | return ''.join(output) 83 | -------------------------------------------------------------------------------- /text_normalizer/factory/identity.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from .base_factory import BaseFactory 4 | 5 | 6 | class Identity(BaseFactory): 7 | 8 | def __init__(self): 9 | super().__init__(name='identity', denormalizable=False) 10 | 11 | def normalize( 12 | self, 13 | sentence: str, 14 | ) -> (str, List[dict]): 15 | return sentence, None 16 | -------------------------------------------------------------------------------- /text_normalizer/factory/number_token.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | import re 3 | 4 | from .base_factory import BaseFactory 5 | 6 | 7 | INT_PATTERN = re.compile(r"[0-90123456789]+(?!float|\_|\d)") 8 | FLOAT_PATTERN = re.compile( 9 | r"(? str: 35 | split_prog = re.compile( 36 | '{}|{}'.format( 37 | token, 38 | token.strip(), 39 | ), 40 | ) 41 | splited_sentence = split_prog.split(sentence) 42 | if len(splited_sentence) != len(value_list) + 1: 43 | raise ValueError( 44 | "Number of tokens in sentence should be equal to that of values", 45 | "original sentence = {}".format(sentence), 46 | "token = {}".format(token), 47 | "value_list = {}".format(value_list), 48 | ) 49 | 50 | output_sent = [] 51 | for i, segment in enumerate(splited_sentence): 52 | output_sent.append(segment) 53 | if i != len(splited_sentence) - 1: 54 | output_sent.append(value_list[i]) 55 | return ''.join(output_sent) 56 | 57 | 58 | CASES = { 59 | "_int_": { 60 | "pattern": INT_PATTERN, 61 | }, 62 | "_float_": { 63 | "pattern": FLOAT_PATTERN, 64 | }, 65 | "_{}int_": { 66 | "pattern": INT_PATTERN, 67 | "gen_token_with_digit": gen_int_token_with_digit, 68 | }, 69 | "_{}float{}_": { 70 | "pattern": FLOAT_PATTERN, 71 | "gen_token_with_digit": gen_float_token_with_digit, 72 | }, 73 | " _int_ ": { 74 | "pattern": INT_PATTERN, 75 | }, 76 | " _float_ ": { 77 | "pattern": FLOAT_PATTERN, 78 | }, 79 | " _{}int_ ": { 80 | "pattern": INT_PATTERN, 81 | "gen_token_with_digit": gen_int_token_with_digit, 82 | }, 83 | " _{}float{}_ ": { 84 | "pattern": FLOAT_PATTERN, 85 | "gen_token_with_digit": gen_float_token_with_digit, 86 | }, 87 | } 88 | 89 | 90 | class NumberToken(BaseFactory): 91 | 92 | def __init__( 93 | self, 94 | token: str, 95 | denormalizable: bool = True, 96 | name: str = None, 97 | ) -> None: 98 | super().__init__(name=name, denormalizable=denormalizable) 99 | if token not in CASES: 100 | raise KeyError( 101 | "This case [{}] is not handled".format(token), 102 | "Handle cases {} only".format(CASES.keys()), 103 | ) 104 | self.token = token 105 | 106 | def normalize( 107 | self, 108 | sentence: str, 109 | ) -> (str, dict): 110 | revised_sentence = CASES[self.token]["pattern"].sub( 111 | repl=self.token, 112 | string=sentence, 113 | ) 114 | value_list = CASES[self.token]["pattern"].findall(string=sentence) 115 | if "gen_token_with_digit" not in CASES[self.token]: 116 | if not self.denormalizable: 117 | return revised_sentence, None 118 | return revised_sentence, {self.token: value_list} 119 | 120 | #### token with digits #### 121 | tokens_with_digit = CASES[self.token]["gen_token_with_digit"]( 122 | value_list, 123 | token=self.token, 124 | ) 125 | revised_sentence = sub_token_with_value_sequentially( 126 | sentence=revised_sentence, 127 | token=self.token, 128 | value_list=tokens_with_digit, 129 | ) 130 | if not self.denormalizable: 131 | return revised_sentence, None 132 | 133 | meta = {} 134 | for token, value in zip(tokens_with_digit, value_list): 135 | if token in meta: 136 | meta[token].append(value) 137 | else: 138 | meta[token] = [value] 139 | return revised_sentence, meta 140 | 141 | def denormalize( 142 | self, 143 | sentence: str, 144 | meta: dict = None, 145 | ) -> str: 146 | if meta is None: 147 | meta = {} 148 | if (not self.denormalizable) or (len(meta) == 0): 149 | # Case1: self.denormalizable = False 150 | return sentence 151 | 152 | for token, values in meta.items(): 153 | sentence = sub_token_with_value_sequentially( 154 | sentence=sentence, 155 | token=token, 156 | value_list=values, 157 | ) 158 | return sentence 159 | -------------------------------------------------------------------------------- /text_normalizer/factory/punctuation_mapping.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict 2 | import re 3 | 4 | import pandas as pd 5 | from .base_factory import BaseFactory 6 | 7 | SpecialCases = { 8 | '\\': '\\\\', 9 | } 10 | 11 | RevSpecialCases = {v: k for k, v in SpecialCases.items()} 12 | 13 | 14 | class PunctuationMapping(BaseFactory): 15 | 16 | def __init__( 17 | self, 18 | normalization_table_path: str, 19 | denormalizable: bool = True, 20 | name: str = 'punctuation_normalizer', 21 | ) -> None: 22 | super().__init__(name=name, denormalizable=True) 23 | remove_space = re.compile(r"\s+") 24 | table_df = pd.read_csv(normalization_table_path).astype(str) 25 | for column in table_df.columns.tolist(): 26 | table_df[column] = table_df[column].str.strip() 27 | table_dict = table_df.to_dict(orient='index') 28 | 29 | self.patterns = [] 30 | for _, mapping in table_dict.items(): 31 | cleaned_before_pattern = remove_space.sub(" ", mapping["before"]) 32 | before_pattern_list = cleaned_before_pattern.split(" ") 33 | escaped_before_pattern_list = [ 34 | re.escape(pat) for pat in list(set(before_pattern_list))] 35 | escaped_after_pattern = re.escape(mapping["after"]) 36 | self.patterns.append( 37 | { 38 | "normalization_pattern": re.compile( 39 | r"{}".format("|".join(escaped_before_pattern_list)), 40 | ), 41 | "denormalization_pattern": re.compile( 42 | r"{}".format(escaped_after_pattern), 43 | ), 44 | "replacement": mapping["after"], 45 | }, 46 | ) 47 | 48 | def normalize( 49 | self, 50 | sentence: str, 51 | ) -> (str, List[Dict[str, List[str]]]): 52 | revised_sentence = sentence 53 | meta = [] 54 | for pattern in self.patterns: 55 | if pattern["replacement"] in SpecialCases: 56 | pattern["replacement"] = SpecialCases[pattern["replacement"]] 57 | 58 | revised_sentence = pattern["normalization_pattern"].sub( 59 | repl=pattern["replacement"], 60 | string=revised_sentence, 61 | ) 62 | meta.append( 63 | { 64 | "before": pattern["normalization_pattern"].findall( 65 | string=sentence, 66 | ), 67 | "after": pattern["replacement"], 68 | }, 69 | ) 70 | if not self.denormalizable: 71 | return revised_sentence, None 72 | return revised_sentence, meta 73 | 74 | def denormalize( 75 | self, 76 | sentence: str, 77 | meta: List[Dict[str, List[str]]] = None, 78 | ) -> str: 79 | if (not self.denormalizable) or (meta is None): 80 | # Case1: self.denormalizable = False 81 | return sentence 82 | 83 | for single_meta, pattern in zip(meta[::-1], self.patterns[::-1]): 84 | if single_meta["after"] != pattern["replacement"]: 85 | KeyError( 86 | "WRONG META !!!", 87 | "The AFTER token should be the same as REPLACEMENT in patterns", 88 | "Now, AFTER token is {} and REPLACEMENT is {}".format( 89 | single_meta["after"], 90 | pattern["replacement"], 91 | ), 92 | ) 93 | 94 | if pattern["replacement"] in RevSpecialCases: 95 | pattern["replacement"] = RevSpecialCases[pattern["replacement"]] 96 | 97 | punct_to_be_denormalized = pattern["denormalization_pattern"].findall( 98 | string=sentence, 99 | ) 100 | if len(punct_to_be_denormalized) != len(single_meta["before"]): 101 | raise KeyError( 102 | "The number of punctuation to be denormalized is not equal to", 103 | "the number of that in meta data", 104 | "# of punctuations to be denormalized = {}".format( 105 | len(punct_to_be_denormalized), 106 | ), 107 | "punctuations to be denormalized = {}".format( 108 | punct_to_be_denormalized, 109 | ), 110 | "punctuations in meta = {}".format(single_meta["before"]), 111 | ) 112 | 113 | splited_sentence = pattern["denormalization_pattern"].split(sentence) 114 | output_sentence = [] 115 | for idx, segment in enumerate(splited_sentence): 116 | output_sentence.append(segment) 117 | if idx != len(splited_sentence) - 1: 118 | output_sentence.append(single_meta["before"][idx]) 119 | sentence = ''.join(output_sentence) 120 | return sentence 121 | -------------------------------------------------------------------------------- /text_normalizer/factory/replace_pattern_with_token.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | import re 3 | 4 | from .base_factory import BaseFactory 5 | 6 | 7 | class ReplacePatternWithToken(BaseFactory): 8 | 9 | def __init__( 10 | self, 11 | target_pattern: str, 12 | token: str, 13 | prefix_pattern: str = None, 14 | suffix_pattern: str = None, 15 | denormalizable: bool = False, 16 | name: str = None, 17 | ): 18 | super().__init__(name=name, denormalizable=denormalizable) 19 | self.token = token 20 | if prefix_pattern and suffix_pattern: 21 | self.findall_pattern = "(?:{})({})(?={})".format( 22 | prefix_pattern, 23 | target_pattern, 24 | suffix_pattern, 25 | ) 26 | self.sub_pattern = "({}){}(?={})".format( 27 | prefix_pattern, 28 | target_pattern, 29 | suffix_pattern, 30 | ) 31 | self.sub_replacement = "\g<1>{}".format(token) 32 | else: 33 | self.findall_pattern = target_pattern 34 | self.sub_pattern = target_pattern 35 | self.sub_replacement = token 36 | 37 | self.findall_prog = re.compile(self.findall_pattern) 38 | self.sub_prog = re.compile(self.sub_pattern) 39 | self.split_prog = re.compile( 40 | '{}|{}|{}|{}'.format( 41 | self.token, 42 | self.token.rstrip(), 43 | self.token.lstrip(), 44 | self.token.strip(), 45 | ), 46 | ) 47 | 48 | def normalize( 49 | self, 50 | sentence: str, 51 | ) -> (str, List[dict]): 52 | revised_sentence = self.sub_prog.sub( 53 | repl=self.sub_replacement, 54 | string=sentence, 55 | ) 56 | if self.denormalizable: 57 | meta = self.findall_prog.findall( 58 | string=sentence, 59 | ) 60 | return revised_sentence, {self.token: meta} 61 | else: 62 | return revised_sentence, None 63 | 64 | def denormalize( 65 | self, 66 | sentence: str, 67 | meta: dict = None, 68 | ) -> str: 69 | if not self.denormalizable: 70 | # Case1: self.denormalizable = False 71 | return sentence 72 | 73 | if self.token not in meta: 74 | # Case2: meta = {'a': ['XX', 'cc']}, 'a' != self.token 75 | raise KeyError( 76 | 'Wrong meta :{} !!!'.format(meta), 77 | 'Meta should be { %s: [...]}.' % self.token, 78 | ) 79 | 80 | splited_sentence = self.split_prog.split(sentence) 81 | if (len(splited_sentence) == 1) and (len(meta[self.token]) == 0): 82 | # Case3: no token in sentence and meta is empty 83 | return sentence 84 | elif len(splited_sentence) - 1 != len(meta[self.token]): 85 | # Case4: # of token in sentence != # of token in meta 86 | raise ValueError( 87 | '# of tokens in sentence is not equal to that in meta' 88 | 'sentence = {}'.format(sentence), 89 | 'meta = {}'.format(meta), 90 | ) 91 | else: 92 | output_sentence = '' 93 | idx = 0 94 | for s_idx, segment in enumerate(splited_sentence, start=1): 95 | output_sentence += segment 96 | if s_idx != len(splited_sentence): 97 | output_sentence += meta[self.token][idx] 98 | idx += 1 99 | return output_sentence 100 | 101 | # def ldenormalize( 102 | # self, 103 | # sentence: List[str], 104 | # meta: dict = None, 105 | # ) -> List[str]: 106 | 107 | # super().ldenormalize(sentence=sentence) 108 | # if self.token not in meta: 109 | # raise KeyError('Wrong meta :{} !!!'.format(meta)) 110 | 111 | # ''' 112 | # Each segment should not contain more than one token. 113 | # ''' 114 | # idx = 0 115 | # output_sentence = [] 116 | # for segment in sentence: 117 | # if self.token in segment: 118 | # denormalized_segment = re.sub(self.token, meta[self.token][idx], segment) 119 | # output_sentence.append(denormalized_segment) 120 | # idx += 1 121 | # else: 122 | # output_sentence.append(segment) 123 | # return output_sentence 124 | -------------------------------------------------------------------------------- /text_normalizer/factory/strip.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from .base_factory import BaseFactory 4 | 5 | 6 | class Strip(BaseFactory): 7 | 8 | def __init__( 9 | self, 10 | chars: List[str] = None, 11 | direction: str = 'both', 12 | name: str = 'strip', 13 | ): 14 | self.chars = chars 15 | if self.chars is None: 16 | self.chars_str = None 17 | else: 18 | self.chars_str = ''.join(chars) 19 | if direction not in ['both', 'left', 'right']: 20 | raise ValueError( 21 | 'WRONG direction input! ' 22 | 'Direction has three options [both, left, right]', 23 | 'Your input is {}'.format(direction), 24 | ) 25 | else: 26 | self.direction = direction 27 | super().__init__( 28 | name=name + '_' + self.direction + '_' + str(self.chars_str), 29 | denormalizable=False, 30 | ) 31 | 32 | def normalize( 33 | self, 34 | sentence: str, 35 | ) -> (str, List[dict]): 36 | if self.direction == 'both': 37 | return sentence.strip(self.chars_str), None 38 | elif self.direction == 'left': 39 | return sentence.lstrip(self.chars_str), None 40 | elif self.direction == 'right': 41 | return sentence.rstrip(self.chars_str), None 42 | -------------------------------------------------------------------------------- /text_normalizer/factory/test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yoctol/text-normalizer/3609c10cd229c08b4623531e82d2292fc370734c/text_normalizer/factory/test/__init__.py -------------------------------------------------------------------------------- /text_normalizer/factory/test/example_punctuation_mapping.csv: -------------------------------------------------------------------------------- 1 | before,after 2 | ( ( ❨ ﹙ (,( 3 | ) ) ❩ ﹚ ),) 4 | "," ,,"," 5 | -------------------------------------------------------------------------------- /text_normalizer/factory/test/example_unicode_mapping.txt: -------------------------------------------------------------------------------- 1 | FF10-FF10:one2one(0) 2 | FF11 0031:0031(1) 3 | FF0C 002C:002C(,) -------------------------------------------------------------------------------- /text_normalizer/factory/test/test_base_factory.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | from ..base_factory import BaseFactory 3 | 4 | 5 | class TestBaseFactory(TestCase): 6 | 7 | def setUp(self): 8 | self.base_text_normalizer_class = BaseFactory() 9 | self.base_text_normalizer_class_with_name = BaseFactory(name='123') 10 | 11 | def test_attributes(self): 12 | self.assertEqual( 13 | { 14 | 'denormalizable': False, 15 | 'name': 'BaseFactory', 16 | }, 17 | self.base_text_normalizer_class.__dict__, 18 | ) 19 | 20 | self.assertEqual( 21 | { 22 | 'denormalizable': False, 23 | 'name': '123', 24 | }, 25 | self.base_text_normalizer_class_with_name.__dict__, 26 | ) 27 | 28 | def test_denormalize(self): 29 | self.assertEqual( 30 | 'HAHA', 31 | self.base_text_normalizer_class.denormalize(sentence='HAHA'), 32 | ) 33 | self.assertEqual( 34 | 'HAHA', 35 | self.base_text_normalizer_class_with_name.denormalize(sentence='HAHA'), 36 | ) 37 | -------------------------------------------------------------------------------- /text_normalizer/factory/test/test_eng_lowercase.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from unittest import TestCase 3 | from ..eng_lowercase import EngLowercase 4 | 5 | 6 | class EngLowercaseTestCase(TestCase): 7 | 8 | def setUp(self): 9 | self.eng_lowercase_text_normalizer = EngLowercase() 10 | 11 | def test_lowercase(self): 12 | test_cases = [ 13 | ( 14 | "ABCDEFGHIJKLMNOPQRSTUVWXYZ", 15 | "abcdefghijklmnopqrstuvwxyz", 16 | ), 17 | ( 18 | "abcdefghijklmnopqrstuvwxyz", 19 | "abcdefghijklmnopqrstuvwxyz", 20 | ), 21 | ( 22 | "ABCDEFGHIJKLMNOPQRSTUVWXYZ", 23 | "abcdefghijklmnopqrstuvwxyz", 24 | ), 25 | ] 26 | for test_case in test_cases: 27 | with self.subTest(test_case=test_case): 28 | self.assertEqual( 29 | test_case[1], 30 | self.eng_lowercase_text_normalizer.lowercase( 31 | sentence=test_case[0], 32 | ), 33 | ) 34 | 35 | def test_normalize_n_denormalize_0(self): 36 | test_cases = [ 37 | ( 38 | "哈囉 AAB 123 Cddef 哈囉 >< ???", 39 | ( 40 | "哈囉 aab 123 cddef 哈囉 >< ???", 41 | [ 42 | { 43 | "before": "AAB", 44 | "after": "aab", 45 | }, 46 | { 47 | "before": "Cddef", 48 | "after": "cddef", 49 | }, 50 | ], 51 | ), 52 | "哈囉 AAB 123 Cddef 哈囉 >< ???", 53 | ), 54 | ( 55 | "AAB 123 哈囉 Cddef 456 ffecI", 56 | ( 57 | "aab 123 哈囉 cddef 456 ffeci", 58 | [ 59 | { 60 | "before": "AAB", 61 | "after": "aab", 62 | }, 63 | { 64 | "before": "Cddef", 65 | "after": "cddef", 66 | }, 67 | { 68 | "before": "ffecI", 69 | "after": "ffeci", 70 | }, 71 | ], 72 | ), 73 | "AAB 123 哈囉 Cddef 456 ffecI", 74 | ), 75 | ( 76 | "家豪大大亂入吃雞排", 77 | ("家豪大大亂入吃雞排", None), 78 | "家豪大大亂入吃雞排", 79 | ), 80 | ( 81 | "abc", 82 | ( 83 | "abc", 84 | [ 85 | { 86 | "before": "abc", 87 | "after": "abc", 88 | }, 89 | ], 90 | ), 91 | "abc", 92 | ), 93 | ] 94 | 95 | for test_case in test_cases: 96 | with self.subTest( 97 | test_case="normalize {}".format(test_case[0]), 98 | ): 99 | self.assertEqual( 100 | test_case[1], 101 | self.eng_lowercase_text_normalizer.normalize( 102 | sentence=test_case[0], 103 | ), 104 | ) 105 | with self.subTest( 106 | test_case="denormalize {}".format(test_case[0]), 107 | ): 108 | self.assertEqual( 109 | test_case[2], 110 | self.eng_lowercase_text_normalizer.denormalize( 111 | sentence=test_case[1][0], 112 | meta=test_case[1][1], 113 | ), 114 | ) 115 | -------------------------------------------------------------------------------- /text_normalizer/factory/test/test_identity.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from unittest import TestCase 3 | from ..identity import Identity 4 | 5 | 6 | class IdentityTestCase(TestCase): 7 | 8 | def setUp(self): 9 | self.identity_text_normalizer = Identity() 10 | 11 | def test_attributes(self): 12 | self.assertEqual( 13 | { 14 | 'denormalizable': False, 15 | 'name': 'identity', 16 | }, 17 | self.identity_text_normalizer.__dict__, 18 | ) 19 | 20 | def test_normalize(self): 21 | result = self.identity_text_normalizer.normalize( 22 | '不管你測什麼 我都會回傳原本的句子給你 呵呵', 23 | ) 24 | self.assertEqual( 25 | ('不管你測什麼 我都會回傳原本的句子給你 呵呵', None), 26 | result, 27 | ) 28 | 29 | def test_denormalize(self): 30 | result = self.identity_text_normalizer.denormalize( 31 | '不管你測什麼 我都會回傳原本的句子給你 呵呵', 32 | ) 33 | self.assertEqual( 34 | '不管你測什麼 我都會回傳原本的句子給你 呵呵', 35 | result, 36 | ) 37 | -------------------------------------------------------------------------------- /text_normalizer/factory/test/test_number_token_test_tokenizer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from unittest import TestCase 3 | from ..number_token import ( 4 | gen_float_token_with_digit, 5 | gen_int_token_with_digit, 6 | sub_token_with_value_sequentially, 7 | NumberToken, 8 | ) 9 | 10 | 11 | class NumberTokenTestCase(TestCase): 12 | 13 | def run_test_denormalizable(self, test_cases, normalizer): 14 | for test_case in test_cases: 15 | with self.subTest(test_case=test_case): 16 | self.assertEqual( 17 | test_case[1], 18 | normalizer.normalize(test_case[0]), 19 | ) 20 | self.assertEqual( 21 | test_case[0], 22 | normalizer.denormalize( 23 | sentence=test_case[1][0], 24 | meta=test_case[1][1], 25 | ), 26 | ) 27 | 28 | def run_test_not_denormalizable(self, test_cases, normalizer): 29 | for test_case in test_cases: 30 | with self.subTest(test_case=test_case): 31 | self.assertEqual( 32 | test_case[1], 33 | normalizer.normalize(test_case[0]), 34 | ) 35 | self.assertEqual( 36 | test_case[1][0], 37 | normalizer.denormalize( 38 | sentence=test_case[1][0], 39 | meta=test_case[1][1], 40 | ), 41 | ) 42 | 43 | def test_gen_float_token_with_digit(self): 44 | self.assertEqual( 45 | ["_1float1_", "_1float5_", "_3float4_", "_4float2_"], 46 | gen_float_token_with_digit( 47 | ["2.0", "0.00003", "300.1113", "5000.05"]), 48 | ) 49 | 50 | def test_gen_int_token_with_digit(self): 51 | self.assertEqual( 52 | ["_1int_", "_2int_", "_3int_", "_7int_"], 53 | gen_int_token_with_digit(["1", "20", "300", "5000.05"]), 54 | ) 55 | 56 | def test_sub_token_with_value_sequentially(self): 57 | test_cases = [ 58 | ( 59 | { 60 | "sentence": "A@A@@A@@@AA", 61 | "token": "A", 62 | "value_list": ["1", "2", "3", "4", "5"], 63 | }, 64 | "1@2@@3@@@45", 65 | ), 66 | ( 67 | { 68 | "sentence": "來亂的", 69 | "token": "bla", 70 | "value_list": [], 71 | }, 72 | "來亂的", 73 | ), 74 | ] 75 | for test_case in test_cases: 76 | with self.subTest(test_case=test_case): 77 | self.assertEqual( 78 | test_case[1], 79 | sub_token_with_value_sequentially(**test_case[0]), 80 | ) 81 | 82 | def test_unhandle_case(self): 83 | with self.assertRaises(KeyError): 84 | NumberToken(token="_ohoh_") 85 | 86 | def test_pure_int(self): 87 | int_text_normalizer = NumberToken(token="_int_") 88 | test_cases = [ 89 | ("123", ("_int_", {"_int_": ["123"]})), 90 | ("23.35", ("_int_._int_", {"_int_": ["23", "35"]})), 91 | ("23 0000", ("_int_ _int_", {"_int_": ["23", "0000"]})), 92 | ("OHOH 23", ("OHOH _int_", {"_int_": ["23"]})), 93 | ("122223333 OHOH", ("_int_ OHOH", {"_int_": ["122223333"]})), 94 | ("100", ("_int_", {"_int_": ["100"]})), 95 | ("340分", ("_int_分", {"_int_": ["340"]})), 96 | ("薄餡大大1個打10個", ("薄餡大大_int_個打_int_個", {"_int_": ["1", "10"]})), 97 | ("0800-22-44-66", 98 | ("_int_-_int_-_int_-_int_", {"_int_": ["0800", "22", "44", "66"]})), 99 | ("來亂的", ("來亂的", {"_int_": []})), 100 | ] 101 | self.run_test_denormalizable( 102 | normalizer=int_text_normalizer, 103 | test_cases=test_cases, 104 | ) 105 | 106 | def test_pure_int_not_denormalizable(self): 107 | int_text_normalizer_not_denormalizable = NumberToken( 108 | token="_int_", 109 | denormalizable=False, 110 | ) 111 | test_cases = [ 112 | ("123", ("_int_", None)), 113 | ("23.35", ("_int_._int_", None)), 114 | ("23 0000", ("_int_ _int_", None)), 115 | ("OHOH 23", ("OHOH _int_", None)), 116 | ("122223333 OHOH", ("_int_ OHOH", None)), 117 | ("100", ("_int_", None)), 118 | ("340分", ("_int_分", None)), 119 | ("薄餡大大1個打10個", ("薄餡大大_int_個打_int_個", None)), 120 | ("0800-22-44-66", ("_int_-_int_-_int_-_int_", None)), 121 | ("來亂的", ("來亂的", None)), 122 | ] 123 | self.run_test_not_denormalizable( 124 | normalizer=int_text_normalizer_not_denormalizable, 125 | test_cases=test_cases, 126 | ) 127 | 128 | def test_pure_float(self): 129 | float_text_normalizer = NumberToken(token="_float_") 130 | test_cases = [ 131 | ("49.3", ("_float_", {"_float_": ["49.3"]})), 132 | ("12.33 456.0", ("_float_ _float_", {"_float_": ["12.33", "456.0"]})), 133 | ("123", ("123", {"_float_": []})), 134 | ("94.87分", ("_float_分", {"_float_": ["94.87"]})), 135 | ("薄餡大大1.5個打10.7個", 136 | ("薄餡大大_float_個打_float_個", {"_float_": ["1.5", "10.7"]})), 137 | ("123.456.789", ("123.456.789", {"_float_": []})), 138 | ("100.000", ("_float_", {"_float_": ["100.000"]})), 139 | ("94.87分", ("_float_分", {"_float_": ["94.87"]})), 140 | ("薄餡大大1.5個打10.7個", 141 | ("薄餡大大_float_個打_float_個", {"_float_": ["1.5", "10.7"]})), 142 | ("123.456.789", ("123.456.789", {"_float_": []})), 143 | ("來亂的", ("來亂的", {"_float_": []})), 144 | ] 145 | self.run_test_denormalizable( 146 | normalizer=float_text_normalizer, 147 | test_cases=test_cases, 148 | ) 149 | 150 | def test_pure_float_not_denormalizable(self): 151 | float_text_normalizer = NumberToken( 152 | token="_float_", 153 | denormalizable=False, 154 | ) 155 | test_cases = [ 156 | ("49.3", ("_float_", None)), 157 | ("12.33 456.0", ("_float_ _float_", None)), 158 | ("123", ("123", None)), 159 | ("94.87分", ("_float_分", None)), 160 | ("薄餡大大1.5個打10.7個", ("薄餡大大_float_個打_float_個", None)), 161 | ("123.456.789", ("123.456.789", None)), 162 | ("100.000", ("_float_", None)), 163 | ("94.87分", ("_float_分", None)), 164 | ("薄餡大大1.5個打10.7個", ("薄餡大大_float_個打_float_個", None)), 165 | ("123.456.789", ("123.456.789", None)), 166 | ("來亂的", ("來亂的", None)), 167 | ] 168 | self.run_test_not_denormalizable( 169 | normalizer=float_text_normalizer, 170 | test_cases=test_cases, 171 | ) 172 | 173 | def test_int_with_digit(self): 174 | intd_text_normalizer = NumberToken(token="_{}int_") 175 | test_cases = [ 176 | ("123", ("_3int_", {"_3int_": ["123"]})), 177 | ("098765431389", ("_12int_", {"_12int_": ["098765431389"]})), 178 | ("1 4567890103", 179 | ("_1int_ _10int_", {"_1int_": ["1"], "_10int_": ["4567890103"]})), 180 | ("_12float733_", ("_12float733_", {})), 181 | ("ohoh 000 _33float0_ 1", 182 | ("ohoh _3int_ _33float0_ _1int_", {"_3int_": ["000"], "_1int_": ["1"]})), 183 | ("123 345 678 901", 184 | ("_3int_ _3int_ _3int_ _3int_", {"_3int_": ["123", "345", "678", "901"]})), 185 | ("100", ("_3int_", {"_3int_": ["100"]})), 186 | ("340分", ("_3int_分", {"_3int_": ["340"]})), 187 | ("薄餡大大1個打10個", ("薄餡大大_1int_個打_2int_個", {"_1int_": ["1"], "_2int_": ["10"]})), 188 | ("0800-22-44-66", 189 | ("_4int_-_2int_-_2int_-_2int_", 190 | {"_4int_": ["0800"], "_2int_": ["22", "44", "66"]})), 191 | ("來亂的", ("來亂的", {})), 192 | ] 193 | self.run_test_denormalizable( 194 | test_cases=test_cases, 195 | normalizer=intd_text_normalizer, 196 | ) 197 | 198 | def test_int_with_digit_not_denormalizable(self): 199 | intd_text_normalizer = NumberToken( 200 | token="_{}int_", 201 | denormalizable=False, 202 | ) 203 | test_cases = [ 204 | ("123", ("_3int_", None)), 205 | ("098765431389", ("_12int_", None)), 206 | ("1 4567890103", ("_1int_ _10int_", None)), 207 | ("_12float733_", ("_12float733_", None)), 208 | ("ohoh 000 _33float0_ 1", ("ohoh _3int_ _33float0_ _1int_", None)), 209 | ("100", ("_3int_", None)), 210 | ("340分", ("_3int_分", None)), 211 | ("薄餡大大1個打10個", ("薄餡大大_1int_個打_2int_個", None)), 212 | ("0800-22-44-66", ("_4int_-_2int_-_2int_-_2int_", None)), 213 | ("來亂的", ("來亂的", None)), 214 | ] 215 | self.run_test_not_denormalizable( 216 | test_cases=test_cases, 217 | normalizer=intd_text_normalizer, 218 | ) 219 | 220 | def test_float_with_digit(self): 221 | floatd_text_normalizer = NumberToken( 222 | token="_{}float{}_", 223 | ) 224 | test_cases = [ 225 | ("123.33", ("_3float2_", {"_3float2_": ["123.33"]})), 226 | ("123", ("123", {})), 227 | ("1234567890.123456789011", 228 | ("_10float12_", {"_10float12_": ["1234567890.123456789011"]})), 229 | ("1.3 224.00", ("_1float1_ _3float2_", 230 | {"_1float1_": ["1.3"], "_3float2_": ["224.00"]})), 231 | ("12.3 34.5 67.8 90.1", 232 | ("_2float1_ _2float1_ _2float1_ _2float1_", 233 | {"_2float1_": ["12.3", "34.5", "67.8", "90.1"]})), 234 | ("_3int_", ("_3int_", {})), 235 | ("94.87分", ("_2float2_分", {"_2float2_": ["94.87"]})), 236 | ("薄餡大大1.5個打10.7個", 237 | ("薄餡大大_1float1_個打_2float1_個", 238 | {"_1float1_": ["1.5"], "_2float1_": ["10.7"]})), 239 | ("123.456.789", ("123.456.789", {})), 240 | ("100.000", ("_3float3_", {"_3float3_": ["100.000"]})), 241 | ("94.87分", ("_2float2_分", {"_2float2_": ["94.87"]})), 242 | ("薄餡大大1.5個打10.7個", 243 | ("薄餡大大_1float1_個打_2float1_個", 244 | {"_1float1_": ["1.5"], "_2float1_": ["10.7"]})), 245 | ("123.456.789", ("123.456.789", {})), 246 | ("來亂的", ("來亂的", {})), 247 | ] 248 | self.run_test_denormalizable( 249 | test_cases=test_cases, 250 | normalizer=floatd_text_normalizer, 251 | ) 252 | 253 | def test_float_with_digit_not_denrmalizable(self): 254 | floatd_text_normalizer = NumberToken( 255 | token="_{}float{}_", 256 | denormalizable=False, 257 | ) 258 | test_cases = [ 259 | ("123.33", ("_3float2_", None)), 260 | ("123", ("123", None)), 261 | ("1234567890.123456789011", ("_10float12_", None)), 262 | ("1.3 224.00", ("_1float1_ _3float2_", None)), 263 | ("_3int_", ("_3int_", None)), 264 | ("94.87分", ("_2float2_分", None)), 265 | ("薄餡大大1.5個打10.7個", ("薄餡大大_1float1_個打_2float1_個", None)), 266 | ("123.456.789", ("123.456.789", None)), 267 | ("100.000", ("_3float3_", None)), 268 | ("94.87分", ("_2float2_分", None)), 269 | ("薄餡大大1.5個打10.7個", ("薄餡大大_1float1_個打_2float1_個", None)), 270 | ("123.456.789", ("123.456.789", None)), 271 | ("來亂的", ("來亂的", None)), 272 | ] 273 | self.run_test_not_denormalizable( 274 | test_cases=test_cases, 275 | normalizer=floatd_text_normalizer, 276 | ) 277 | 278 | def test_int_text_normalizer_with_space(self): 279 | int_text_normalizer_with_space = NumberToken(token=" _int_ ") 280 | test_cases = [ 281 | ("12345678900", (" _int_ ", {" _int_ ": ["12345678900"]})), 282 | ("340分", (" _int_ 分", {" _int_ ": ["340"]})), 283 | ("薄餡大大1個打10個", ("薄餡大大 _int_ 個打 _int_ 個", {" _int_ ": ["1", "10"]})), 284 | ("0800-22-44-66", (" _int_ - _int_ - _int_ - _int_ ", 285 | {" _int_ ": ["0800", "22", "44", "66"]})), 286 | ("100", (" _int_ ", {" _int_ ": ["100"]})), 287 | ("340分", (" _int_ 分", {" _int_ ": ["340"]})), 288 | ("薄餡大大1個打10個", ("薄餡大大 _int_ 個打 _int_ 個", {" _int_ ": ["1", "10"]})), 289 | ("0800-22-44-66", (" _int_ - _int_ - _int_ - _int_ ", 290 | {" _int_ ": ["0800", "22", "44", "66"]})), 291 | ("家豪大大亂入", ("家豪大大亂入", {" _int_ ": []})), 292 | ] 293 | self.run_test_denormalizable( 294 | test_cases=test_cases, 295 | normalizer=int_text_normalizer_with_space, 296 | ) 297 | with self.assertRaises(ValueError): 298 | int_text_normalizer_with_space.denormalize( 299 | sentence=" _int_ 和 _int_ 這兩個日期都沒有雞排", 300 | meta={" _int_ ": ["12"]}, 301 | ) 302 | 303 | def test_float_text_normalizer_with_space(self): 304 | float_text_normalizer_with_space = NumberToken(token=" _float_ ") 305 | test_cases = [ 306 | ("100.000", (" _float_ ", {" _float_ ": ["100.000"]})), 307 | ("94.87分", (" _float_ 分", {" _float_ ": ["94.87"]})), 308 | ("薄餡大大1.5個打10.7個", ("薄餡大大 _float_ 個打 _float_ 個", {" _float_ ": ["1.5", "10.7"]})), 309 | ("123.456.789", ("123.456.789", {" _float_ ": []})), 310 | ("100.000", (" _float_ ", {" _float_ ": ["100.000"]})), 311 | ("94.87分", (" _float_ 分", {" _float_ ": ["94.87"]})), 312 | ("薄餡大大1.5個打10.7個", ("薄餡大大 _float_ 個打 _float_ 個", {" _float_ ": ["1.5", "10.7"]})), 313 | ("123.456.789", ("123.456.789", {" _float_ ": []})), 314 | ("家豪大大亂入", ("家豪大大亂入", {" _float_ ": []})), 315 | ] 316 | self.run_test_denormalizable( 317 | test_cases=test_cases, 318 | normalizer=float_text_normalizer_with_space, 319 | ) 320 | 321 | def test_int_with_digit_n_space(self): 322 | intd_text_normalizer_with_space = NumberToken(token=" _{}int_ ") 323 | test_cases = [ 324 | ("123", (" _3int_ ", {" _3int_ ": ["123"]})), 325 | ("098765431389", (" _12int_ ", {" _12int_ ": ["098765431389"]})), 326 | ("1 4567890103", 327 | (" _1int_ _10int_ ", 328 | {" _1int_ ": ["1"], " _10int_ ": ["4567890103"]})), 329 | ("_12float733_", ("_12float733_", {})), 330 | ("ohoh 000 _33float0_ 1", 331 | ("ohoh _3int_ _33float0_ _1int_ ", 332 | {" _3int_ ": ["000"], " _1int_ ": ["1"]})), 333 | ("123 345 678 901", 334 | (" _3int_ _3int_ _3int_ _3int_ ", 335 | {" _3int_ ": ["123", "345", "678", "901"]})), 336 | ("100", (" _3int_ ", {" _3int_ ": ["100"]})), 337 | ("340分", (" _3int_ 分", {" _3int_ ": ["340"]})), 338 | ("薄餡大大1個打10個", 339 | ("薄餡大大 _1int_ 個打 _2int_ 個", 340 | {" _1int_ ": ["1"], " _2int_ ": ["10"]})), 341 | ("0800-22-44-66", 342 | (" _4int_ - _2int_ - _2int_ - _2int_ ", 343 | {" _4int_ ": ["0800"], " _2int_ ": ["22", "44", "66"]})), 344 | ("來亂的", ("來亂的", {})), 345 | ] 346 | self.run_test_denormalizable( 347 | test_cases=test_cases, 348 | normalizer=intd_text_normalizer_with_space, 349 | ) 350 | 351 | def test_float_with_digit_n_space(self): 352 | floatd_text_normalizer_with_space = NumberToken( 353 | token=" _{}float{}_ ", 354 | ) 355 | test_cases = [ 356 | ("123.33", (" _3float2_ ", {" _3float2_ ": ["123.33"]})), 357 | ("123", ("123", {})), 358 | ("1234567890.123456789011", 359 | (" _10float12_ ", {" _10float12_ ": ["1234567890.123456789011"]})), 360 | ("1.3 224.00", (" _1float1_ _3float2_ ", 361 | {" _1float1_ ": ["1.3"], " _3float2_ ": ["224.00"]})), 362 | ("12.3 34.5 67.8 90.1", 363 | (" _2float1_ _2float1_ _2float1_ _2float1_ ", 364 | {" _2float1_ ": ["12.3", "34.5", "67.8", "90.1"]})), 365 | ("_3int_", ("_3int_", {})), 366 | ("94.87分", (" _2float2_ 分", {" _2float2_ ": ["94.87"]})), 367 | ("薄餡大大1.5個打10.7個", 368 | ("薄餡大大 _1float1_ 個打 _2float1_ 個", 369 | {" _1float1_ ": ["1.5"], " _2float1_ ": ["10.7"]})), 370 | ("123.456.789", ("123.456.789", {})), 371 | ("100.000", (" _3float3_ ", {" _3float3_ ": ["100.000"]})), 372 | ("94.87分", (" _2float2_ 分", {" _2float2_ ": ["94.87"]})), 373 | ("薄餡大大1.5個打10.7個", 374 | ("薄餡大大 _1float1_ 個打 _2float1_ 個", 375 | {" _1float1_ ": ["1.5"], " _2float1_ ": ["10.7"]})), 376 | ("123.456.789", ("123.456.789", {})), 377 | ("來亂的", ("來亂的", {})), 378 | ] 379 | self.run_test_denormalizable( 380 | test_cases=test_cases, 381 | normalizer=floatd_text_normalizer_with_space, 382 | ) 383 | -------------------------------------------------------------------------------- /text_normalizer/factory/test/test_punctuation_mapping.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from os.path import abspath, dirname, join 3 | from unittest import TestCase 4 | 5 | from ..punctuation_mapping import PunctuationMapping 6 | 7 | ROOT_DIR = dirname(abspath(__file__)) 8 | 9 | 10 | class PunctMappingTestCase(TestCase): 11 | 12 | def setUp(self): 13 | self.punct_normalizer = PunctuationMapping( 14 | normalization_table_path=join( 15 | ROOT_DIR, "example_punctuation_mapping.csv"), 16 | ) 17 | 18 | def test_normalize_n_denormalize(self): 19 | result = self.punct_normalizer.normalize( 20 | "❨哈囉❩,((❩) ) ,,,﹙﹚() ❨", 21 | ) 22 | self.assertEqual( 23 | ("(哈囉),(()) ) ,,,()() (", 24 | [ 25 | { 26 | "before": ["❨", "(", "(", "﹙", "(", "❨"], 27 | "after": "(", 28 | }, 29 | { 30 | "before": ["❩", "❩", ")", ")", "﹚", ")"], 31 | "after": ")", 32 | }, 33 | { 34 | "before": [",", ",", ",", ","], 35 | "after": ",", 36 | }, 37 | ], 38 | ), 39 | result, 40 | ) 41 | result = self.punct_normalizer.denormalize( 42 | sentence=result[0], 43 | meta=result[1], 44 | ) 45 | self.assertEqual( 46 | "❨哈囉❩,((❩) ) ,,,﹙﹚() ❨", 47 | result, 48 | ) 49 | -------------------------------------------------------------------------------- /text_normalizer/factory/test/test_strip.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from unittest import TestCase 3 | from ..strip import Strip 4 | 5 | 6 | class StripTestCase(TestCase): 7 | 8 | def setUp(self): 9 | self.strip_text_normalizer_default = Strip() 10 | self.strip_text_normalizer_left = Strip( 11 | direction='left', 12 | chars=['#', ' '], 13 | ) 14 | self.strip_text_normalizer_right = Strip( 15 | direction='right', 16 | chars=['/', ' '], 17 | ) 18 | 19 | def test_attributes(self): 20 | self.assertEqual( 21 | { 22 | 'chars': None, 23 | 'chars_str': None, 24 | 'direction': 'both', 25 | 'denormalizable': False, 26 | 'name': 'strip_both_None', 27 | }, 28 | self.strip_text_normalizer_default.__dict__, 29 | ) 30 | self.assertEqual( 31 | { 32 | 'chars': ['#', ' '], 33 | 'chars_str': '# ', 34 | 'direction': 'left', 35 | 'denormalizable': False, 36 | 'name': 'strip_left_# ', 37 | }, 38 | self.strip_text_normalizer_left.__dict__, 39 | ) 40 | self.assertEqual( 41 | { 42 | 'chars': ['/', ' '], 43 | 'chars_str': '/ ', 44 | 'direction': 'right', 45 | 'denormalizable': False, 46 | 'name': 'strip_right_/ ', 47 | }, 48 | self.strip_text_normalizer_right.__dict__, 49 | ) 50 | 51 | def test_normalize(self): 52 | result = self.strip_text_normalizer_default.normalize( 53 | sentence=' HAHA ', 54 | ) 55 | self.assertEqual( 56 | ('HAHA', None), 57 | result, 58 | ) 59 | result = self.strip_text_normalizer_left.normalize( 60 | sentence='## \t\tHAHA', 61 | ) 62 | self.assertEqual( 63 | ('\t\tHAHA', None), 64 | result, 65 | ) 66 | result = self.strip_text_normalizer_right.normalize( 67 | sentence='HAHA\t\t/// ', 68 | ) 69 | self.assertEqual( 70 | ('HAHA\t\t', None), 71 | result, 72 | ) 73 | -------------------------------------------------------------------------------- /text_normalizer/factory/test/test_unicode_mapping.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | from os.path import abspath, dirname, join 3 | 4 | from ..unicode_mapping import UnicodeMapping 5 | 6 | 7 | ROOT_DIR = dirname(abspath(__file__)) 8 | 9 | 10 | class UnicodeMappingTestCase(TestCase): 11 | 12 | def setUp(self): 13 | self.normalizer = UnicodeMapping( 14 | unicode_mapping_path=join( 15 | ROOT_DIR, 16 | "example_unicode_mapping.txt", 17 | ), 18 | ) 19 | 20 | def test_attributes(self): 21 | table = self.normalizer.mapping_table 22 | self.assertEqual( 23 | { 24 | '0xff11': '1', 25 | '0x31': '1', 26 | '0xff0c': ',', 27 | '0xff10': '0', 28 | '0x2c': ',', 29 | }, 30 | table, 31 | ) 32 | unicode_other = self.normalizer.u_other 33 | self.assertEqual("0x20", unicode_other) 34 | 35 | other = self.normalizer.other 36 | self.assertEqual(" ", other) 37 | 38 | def test_normalize(self): 39 | result = self.normalizer.normalize( 40 | sentence=',,HAHA0101 11', 41 | ) 42 | self.assertEqual( 43 | ( 44 | ',, 0101 11', 45 | { 46 | '0': ['0', '0'], 47 | ',': [',', ','], 48 | ' ': ['H', 'A', 'H', 'A', ' '], 49 | '1': ['1', '1', '1', '1'], 50 | }, 51 | ), 52 | result, 53 | ) 54 | 55 | def test_denormalize(self): 56 | nor_result = self.normalizer.normalize( 57 | sentence=',,HAHA0101 11', 58 | ) 59 | de_result = self.normalizer.denormalize( 60 | sentence=nor_result[0], 61 | meta=nor_result[1], 62 | ) 63 | self.assertEqual( 64 | ',,HAHA0101 11', 65 | de_result, 66 | ) 67 | -------------------------------------------------------------------------------- /text_normalizer/factory/toolkit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yoctol/text-normalizer/3609c10cd229c08b4623531e82d2292fc370734c/text_normalizer/factory/toolkit/__init__.py -------------------------------------------------------------------------------- /text_normalizer/factory/toolkit/findall_position.pyx: -------------------------------------------------------------------------------- 1 | 2 | 3 | def findall_position(input_str, reg_pattern): 4 | return findall_position_in_c( 5 | input_str, 6 | reg_pattern, 7 | ) 8 | 9 | 10 | cdef list findall_position_in_c( # noqa: E999 11 | str input_str, 12 | reg_pattern, 13 | ): 14 | cdef unsigned int i, str_len 15 | cdef list output_list 16 | 17 | i = 0 18 | str_len = len(input_str) 19 | output_list = [] 20 | while (i < str_len): 21 | output = reg_pattern.search(input_str[i:]) 22 | if output is None: 23 | break 24 | start, end = output.span() 25 | output_list.append((start + i, end + i)) 26 | i += end 27 | return output_list 28 | -------------------------------------------------------------------------------- /text_normalizer/factory/unicode_mapping.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict, Tuple 2 | import re 3 | 4 | from .base_factory import BaseFactory 5 | 6 | 7 | PROG = re.compile(r"([0-9A-Z\s\-]+)\:([0-9A-Za-z]+)") 8 | PROG_DASH = re.compile(r"([0-9A-Z]+)\-([0-9A-Z]+)") 9 | 10 | 11 | class UnicodeMapping(BaseFactory): 12 | 13 | def __init__( 14 | self, 15 | unicode_mapping_path: str, 16 | other: hex = "0x20", 17 | name: str = 'unicode_normalizer', 18 | denormalizable: bool = True, 19 | ) -> None: 20 | 21 | self.denormalizable = denormalizable 22 | self.mapping_table = self._gen_unicode_mapping_table( 23 | unicode_mapping_path=unicode_mapping_path, 24 | ) 25 | if len(other) > 0: 26 | self.u_other = other 27 | self.other = chr(int(other, 16)) 28 | else: 29 | self.u_other = None 30 | self.other = other 31 | self.denormalizable = False 32 | 33 | super().__init__( 34 | name=name, 35 | denormalizable=self.denormalizable, 36 | ) 37 | 38 | @staticmethod 39 | def _gen_unicode_mapping_table( 40 | unicode_mapping_path: str, 41 | ) -> Dict[hex, str]: 42 | 43 | with open(unicode_mapping_path, "r") as filep: 44 | mapping_list = filep.read().split("\n") 45 | 46 | mapping_table = {} 47 | for map_ in mapping_list: 48 | 49 | if len(map_) == 0: 50 | continue 51 | 52 | input_, output = PROG.findall(map_)[0] 53 | 54 | range_or_not = PROG_DASH.findall(input_) 55 | 56 | if len(range_or_not) > 0: 57 | for uninum in range( 58 | int(range_or_not[0][0], 16), 59 | int(range_or_not[0][1], 16) + 1, 60 | ): 61 | if output == "one2one": 62 | output_token = chr(uninum) 63 | else: 64 | output_token = chr(int(output, 16)) 65 | mapping_table[hex(uninum)] = output_token 66 | else: 67 | for uninum in input_.split(" "): 68 | mapping_table[hex(int(uninum, 16))] = chr(int(output, 16)) 69 | 70 | return mapping_table 71 | 72 | @staticmethod 73 | def _check_utf8_encoding(sentence: str): 74 | 75 | try: 76 | output_sentence = sentence.encode('utf-8').decode('utf-8') 77 | except UnicodeEncodeError as e: 78 | print("sentence: {}, error: {}".format(sentence, e)) 79 | return False 80 | if output_sentence != sentence: 81 | return False 82 | 83 | return True 84 | 85 | def normalize( 86 | self, 87 | sentence: str, 88 | ) -> Tuple[str, Dict[str, List[str]]]: 89 | 90 | if not self._check_utf8_encoding(sentence): 91 | raise ValueError( 92 | "sentence: {} can not be encoded by UTF-8".format(sentence), 93 | ) 94 | 95 | output_sentence = [] 96 | meta = {} 97 | for char in sentence: 98 | uchar = hex(ord(char)) 99 | if uchar in self.mapping_table: 100 | output_char = self.mapping_table[uchar] 101 | else: 102 | output_char = self.other 103 | if output_char not in meta: 104 | meta[output_char] = [char] 105 | else: 106 | meta[output_char].extend(char) 107 | output_sentence.append(output_char) 108 | 109 | return "".join(output_sentence), meta 110 | 111 | def denormalize( 112 | self, 113 | sentence: str, 114 | meta: Dict[str, List[str]], 115 | ) -> str: 116 | 117 | if not self.denormalizable: 118 | return sentence 119 | 120 | for org_o, org_i in meta.items(): 121 | splited_sent = sentence.split(org_o) 122 | output_sentence = [] 123 | for i, token in enumerate(splited_sent): 124 | output_sentence.append(token) 125 | if i != len(org_i): 126 | output_sentence.append(org_i[i]) 127 | sentence = "".join(output_sentence) 128 | 129 | return sentence 130 | -------------------------------------------------------------------------------- /text_normalizer/library/__init__.py: -------------------------------------------------------------------------------- 1 | from .basic import ( # noqa 2 | whitespace_char_text_normalizer, 3 | whitespace_reduction_text_normalizer, 4 | ) 5 | from .punctuation import ( # noqa 6 | chinese_punctuation_text_normalizer, 7 | english_punctuation_text_normalizer, 8 | all_punctuation_text_normalizer, 9 | all_punctuation_without_endpoint_text_normalizer, 10 | all_punctuation_without_underscore_text_normalizer, 11 | ) 12 | from .date import ( # noqa 13 | date_text_normalizer_yymmdd, 14 | ) 15 | from .time import ( # noqa 16 | time_text_normalizer_hhmm, 17 | ) 18 | from .identity import identity_text_normalizer # noqa 19 | from .eng_lowercase import eng_lowercase_text_normalizer # noqa 20 | from .punctuation_mapping import ( # noqa 21 | full_punctuation_mapping_text_normalizer, 22 | simplified_punctuation_mapping_text_normalizer, 23 | ) 24 | from .number import ( # noqa 25 | int_text_normalizer, 26 | float_text_normalizer, 27 | int_with_digit_text_normalizer, 28 | float_with_digit_text_normalizer, 29 | int_with_space_text_normalizer, 30 | float_with_space_text_normalizer, 31 | int_with_digit_n_space_text_normalizer, 32 | float_with_digit_n_space_text_normalizer, 33 | ) 34 | from .strip import ( # noqa 35 | pure_strip_text_normalizer, 36 | ) 37 | from .unicode import ( # noqa 38 | unicode__chinese_characters_text_normalizer, 39 | unicode__chinese_characters_and_digits_text_normalizer, 40 | unicode__english_characters_and_digits_text_normalizer, 41 | unicode__english_digits_and_full_punctuations_text_normalizer, 42 | unicode__chinese_english_digits_and_full_punctuations_text_normalizer, 43 | unicode__chinese_english_digits_and_simplified_punctuations_1_text_normalizer, 44 | ) 45 | -------------------------------------------------------------------------------- /text_normalizer/library/basic.py: -------------------------------------------------------------------------------- 1 | from ..factory import ReplacePatternWithToken 2 | 3 | 4 | whitespace_char_text_normalizer = ReplacePatternWithToken( 5 | name='whitespace_char', 6 | denormalizable=False, 7 | target_pattern=r'\s+', 8 | prefix_pattern=None, 9 | suffix_pattern=None, 10 | token=' ', 11 | ) 12 | 13 | whitespace_reduction_text_normalizer = ReplacePatternWithToken( 14 | name='whitespaces2one', 15 | denormalizable=True, 16 | target_pattern=r'\s+', 17 | prefix_pattern=None, 18 | suffix_pattern=None, 19 | token=' ', 20 | ) 21 | -------------------------------------------------------------------------------- /text_normalizer/library/date.py: -------------------------------------------------------------------------------- 1 | from ..factory import ReplacePatternWithToken 2 | 3 | 4 | date_text_normalizer_yymmdd = ReplacePatternWithToken( 5 | name='date_yymmdd', 6 | denormalizable=True, 7 | target_pattern=r'[0-2]*\d\d\d-[0-1]*\d-[0-3]*\d', 8 | prefix_pattern=r'[^\d-]{1}|\A', 9 | suffix_pattern=r'[^\d-]{1}|\Z', 10 | token=' _date_ ', 11 | ) 12 | -------------------------------------------------------------------------------- /text_normalizer/library/eng_lowercase.py: -------------------------------------------------------------------------------- 1 | from ..factory import EngLowercase 2 | 3 | 4 | eng_lowercase_text_normalizer = EngLowercase() 5 | -------------------------------------------------------------------------------- /text_normalizer/library/identity.py: -------------------------------------------------------------------------------- 1 | from ..factory import Identity 2 | 3 | 4 | identity_text_normalizer = Identity() 5 | -------------------------------------------------------------------------------- /text_normalizer/library/number.py: -------------------------------------------------------------------------------- 1 | from ..factory import NumberToken 2 | 3 | 4 | int_text_normalizer = NumberToken(token="_int_") 5 | float_text_normalizer = NumberToken(token="_float_") 6 | int_with_digit_text_normalizer = NumberToken(token="_{}int_") 7 | float_with_digit_text_normalizer = NumberToken(token="_{}float{}_") 8 | 9 | int_with_space_text_normalizer = NumberToken(token=" _int_ ") 10 | float_with_space_text_normalizer = NumberToken(token=" _float_ ") 11 | int_with_digit_n_space_text_normalizer = NumberToken(token=" _{}int_ ") 12 | float_with_digit_n_space_text_normalizer = NumberToken(token=" _{}float{}_ ") 13 | -------------------------------------------------------------------------------- /text_normalizer/library/punctuation.py: -------------------------------------------------------------------------------- 1 | from ..factory import ReplacePatternWithToken 2 | 3 | CHINESE_PUNCTUATIONS = r"。「」﹁﹂『 』、‧( )《》〈〉 ﹏﹏﹏……—~,?;:[]【 】!" 4 | ENGLISH_PUNCTUATIONS = \ 5 | r"\!\#\$\%\&\(\)\*\+\,\-\.\/\:\;\?\@\[\]\{\}\|\~\`\_\^\<\>\=\'\"\\" 6 | ENGLISH_PUNCTUATIONS_WITHOUT_ENDPOINT = ENGLISH_PUNCTUATIONS.replace("\.", "") 7 | ENGLISH_PUNCTUATIONS_WITHOUT_UNDERSCORE = ENGLISH_PUNCTUATIONS.replace("\_", "") 8 | 9 | 10 | chinese_punctuation_text_normalizer = ReplacePatternWithToken( 11 | name='chinese_punctuation', 12 | denormalizable=False, 13 | target_pattern=r'[{}]+'.format(CHINESE_PUNCTUATIONS), 14 | prefix_pattern=None, 15 | suffix_pattern=None, 16 | token=' ', 17 | ) 18 | 19 | english_punctuation_text_normalizer = ReplacePatternWithToken( 20 | name='english_punctuation', 21 | denormalizable=False, 22 | target_pattern=r'[{}]+'.format(ENGLISH_PUNCTUATIONS), 23 | prefix_pattern=None, 24 | suffix_pattern=None, 25 | token=' ', 26 | ) 27 | 28 | all_punctuation_text_normalizer = ReplacePatternWithToken( 29 | name='all_punctuation', 30 | denormalizable=False, 31 | target_pattern=r'[{}]+'.format(ENGLISH_PUNCTUATIONS + CHINESE_PUNCTUATIONS), 32 | prefix_pattern=None, 33 | suffix_pattern=None, 34 | token=' ', 35 | ) 36 | 37 | all_punctuation_without_endpoint_text_normalizer = ReplacePatternWithToken( 38 | name='all_punctuation_without_endpoint', 39 | denormalizable=False, 40 | target_pattern=r'[{}]+'.format( 41 | CHINESE_PUNCTUATIONS + ENGLISH_PUNCTUATIONS_WITHOUT_ENDPOINT, 42 | ), 43 | prefix_pattern=None, 44 | suffix_pattern=None, 45 | token=' ', 46 | ) 47 | all_punctuation_without_underscore_text_normalizer = ReplacePatternWithToken( 48 | name='all_punctuation_without_underscore', 49 | denormalizable=False, 50 | target_pattern=r'[{}]+'.format( 51 | CHINESE_PUNCTUATIONS + ENGLISH_PUNCTUATIONS_WITHOUT_UNDERSCORE, 52 | ), 53 | prefix_pattern=None, 54 | suffix_pattern=None, 55 | token=' ', 56 | ) 57 | -------------------------------------------------------------------------------- /text_normalizer/library/punctuation_mapping.py: -------------------------------------------------------------------------------- 1 | from os.path import join 2 | 3 | from ..factory import PunctuationMapping 4 | from text_normalizer import ROOT_DIR 5 | 6 | 7 | full_punctuation_mapping_text_normalizer = PunctuationMapping( 8 | normalization_table_path=join( 9 | ROOT_DIR, 10 | 'data/punctuation/punctuation_mapping_0221.csv', 11 | ), 12 | ) 13 | 14 | simplified_punctuation_mapping_text_normalizer = PunctuationMapping( 15 | normalization_table_path=join( 16 | ROOT_DIR, 17 | 'data/punctuation/punctuation_mapping_0221_simplified.csv', 18 | ), 19 | ) 20 | -------------------------------------------------------------------------------- /text_normalizer/library/strip.py: -------------------------------------------------------------------------------- 1 | from ..factory import Strip 2 | 3 | 4 | pure_strip_text_normalizer = Strip() 5 | -------------------------------------------------------------------------------- /text_normalizer/library/test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yoctol/text-normalizer/3609c10cd229c08b4623531e82d2292fc370734c/text_normalizer/library/test/__init__.py -------------------------------------------------------------------------------- /text_normalizer/library/test/test_basic.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from unittest import TestCase 4 | from ..basic import ( 5 | whitespace_char_text_normalizer, 6 | whitespace_reduction_text_normalizer, 7 | ) 8 | 9 | 10 | class BasicTextNormalizersTestCase(TestCase): 11 | 12 | def test_whitespace_char_text_normalizer_normalize(self): 13 | test_cases = [ 14 | (' ', (' ', None)), 15 | (' ', (' ', None)), 16 | ('\n\n\n\n\n', (' ', None)), 17 | ('我有很多 空白', ('我有很多 空白', None)), 18 | ('我有很多 tab', ('我有很多 tab', None)), 19 | ('我有很多\n\n\n\n\n分行', ('我有很多 分行', None)), 20 | ('家豪大大亂入', ('家豪大大亂入', None)), 21 | ] 22 | for test_case in test_cases: 23 | with self.subTest(test_case=test_case): 24 | self.assertEqual( 25 | test_case[1], 26 | whitespace_char_text_normalizer.normalize( 27 | sentence=test_case[0], 28 | ), 29 | ) 30 | 31 | def test_whitespace_reduction_text_normalizer(self): 32 | normalizer = whitespace_reduction_text_normalizer 33 | test_cases = [ 34 | ( 35 | ' ', 36 | ( 37 | ' ', 38 | {' ': [' ']}, 39 | ), 40 | ), 41 | ( 42 | ' ', 43 | ( 44 | ' ', 45 | {' ': [' ']}, 46 | ), 47 | ), 48 | ( 49 | '\n\n\n\n\n', 50 | ( 51 | ' ', 52 | {' ': ['\n\n\n\n\n']}, 53 | ), 54 | ), 55 | ( 56 | '我有很多 空白\n', 57 | ( 58 | '我有很多 空白 ', 59 | {' ': [' ', '\n']}, 60 | ), 61 | ), 62 | ( 63 | '我有很多 tab\n\n\t', 64 | ( 65 | '我有很多 tab ', 66 | {' ': [' ', '\n\n\t']}, 67 | ), 68 | ), 69 | ( 70 | '我有很多\n\n\n\n\n分行', 71 | ( 72 | '我有很多 分行', 73 | {' ': ['\n\n\n\n\n']}, 74 | ), 75 | ), 76 | ('家豪大大亂入', ('家豪大大亂入', {' ': []})), 77 | ] 78 | for test_case in test_cases: 79 | with self.subTest(test_case=test_case): 80 | self.assertEqual( 81 | test_case[1], 82 | normalizer.normalize( 83 | sentence=test_case[0], 84 | ), 85 | ) 86 | self.assertEqual( 87 | test_case[0], 88 | normalizer.denormalize( 89 | sentence=test_case[1][0], 90 | meta=test_case[1][1], 91 | ), 92 | ) 93 | -------------------------------------------------------------------------------- /text_normalizer/library/test/test_date.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from unittest import TestCase 3 | 4 | from ..date import ( 5 | date_text_normalizer_yymmdd, 6 | ) 7 | 8 | 9 | class DateTextNormalizersTestCase(TestCase): 10 | 11 | def test_date_yymmdd_normalize(self): 12 | test_cases = [ 13 | ('2017-12-07', (' _date_ ', {' _date_ ': ['2017-12-07']})), 14 | ('2017-1-3', (' _date_ ', {' _date_ ': ['2017-1-3']})), 15 | ('2017-1-30', (' _date_ ', {' _date_ ': ['2017-1-30']})), 16 | ('2017-12-3', (' _date_ ', {' _date_ ': ['2017-12-3']})), 17 | ('3017-12-07', ('3017-12-07', {' _date_ ': []})), 18 | ('2017-22-07', ('2017-22-07', {' _date_ ': []})), 19 | ('2017-12-47', ('2017-12-47', {' _date_ ': []})), 20 | ('今天日期是2017-12-07', ('今天日期是 _date_ ', {' _date_ ': ['2017-12-07']})), 21 | ('2017-12-07XD', (' _date_ XD', {' _date_ ': ['2017-12-07']})), 22 | ('現在日期2017-12-07XD', ('現在日期 _date_ XD', {' _date_ ': ['2017-12-07']})), 23 | ('2017-12-07-00', ('2017-12-07-00', {' _date_ ': []})), 24 | ('2017-12-0708', ('2017-12-0708', {' _date_ ': []})), 25 | ('2017-1208-07', ('2017-1208-07', {' _date_ ': []})), 26 | ('2017-12-07和2018-01-10', (' _date_ 和 _date_ ', 27 | {' _date_ ': ['2017-12-07', '2018-01-10']})), 28 | ('2017-12-072018-01-10', ('2017-12-072018-01-10', {' _date_ ': []})), 29 | ('家豪大大亂入', ('家豪大大亂入', {' _date_ ': []})), 30 | ] 31 | for test_case in test_cases: 32 | with self.subTest(test_case=test_case): 33 | self.assertEqual( 34 | test_case[1], 35 | date_text_normalizer_yymmdd.normalize(sentence=test_case[0]), 36 | ) 37 | 38 | def test_date_yymmdd_denormalize(self): 39 | normal_test_cases = [ 40 | (' _date_ ', {' _date_ ': ['2017-12-18']}, '2017-12-18'), 41 | ('現在日期 _date_ ', {' _date_ ': ['2017-12-18']}, '現在日期2017-12-18'), 42 | (' _date_ XD', {' _date_ ': ['2017-12-18']}, '2017-12-18XD'), 43 | ('現在日期 _date_ XD', {' _date_ ': ['2017-12-18']}, '現在日期2017-12-18XD'), 44 | (' _date_ 和 _date_ ', 45 | {' _date_ ': ['2017-12-18', '2018-01-02']}, '2017-12-18和2018-01-02'), 46 | (' _date_ _date_ ', 47 | {' _date_ ': ['2017-12-18', '2018-01-02']}, '2017-12-182018-01-02'), 48 | ('家豪大大亂入', {' _date_ ': []}, '家豪大大亂入'), 49 | ] 50 | for test_case in normal_test_cases: 51 | with self.subTest(test_case=test_case): 52 | self.assertEqual( 53 | test_case[2], 54 | date_text_normalizer_yymmdd.denormalize( 55 | sentence=test_case[0], 56 | meta=test_case[1], 57 | ), 58 | ) 59 | with self.assertRaises(KeyError): 60 | date_text_normalizer_yymmdd.denormalize( 61 | sentence='家豪大大亂入', 62 | meta={'_雞排_': ['大雞排']}, 63 | ), 64 | with self.assertRaises(ValueError): 65 | date_text_normalizer_yymmdd.denormalize( 66 | sentence=' _date_ 和 _date_ 這兩個日期都沒有雞排', 67 | meta={' _date_ ': ['2017-12-18']}, 68 | ) 69 | -------------------------------------------------------------------------------- /text_normalizer/library/test/test_eng_lowercase.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from unittest import TestCase 4 | from ..eng_lowercase import ( 5 | eng_lowercase_text_normalizer, 6 | ) 7 | 8 | 9 | class EngLowercaseTextNormalizerTestCase(TestCase): 10 | 11 | def test_normalize_n_denormalize_0(self): 12 | result = eng_lowercase_text_normalizer.normalize( 13 | '哈囉 AAB 123 Cddef 哈囉 >< ???', 14 | ) 15 | self.assertEqual( 16 | ('哈囉 aab 123 cddef 哈囉 >< ???', 17 | [ 18 | { 19 | 'before': 'AAB', 20 | 'after': 'aab', 21 | }, 22 | { 23 | 'before': 'Cddef', 24 | 'after': 'cddef', 25 | }, 26 | ], 27 | ), 28 | result, 29 | ) 30 | result = eng_lowercase_text_normalizer.denormalize( 31 | sentence=result[0], 32 | meta=result[1], 33 | ) 34 | self.assertEqual( 35 | '哈囉 AAB 123 Cddef 哈囉 >< ???', 36 | result, 37 | ) 38 | 39 | def test_normalize_n_denormalize_1(self): 40 | result = eng_lowercase_text_normalizer.normalize( 41 | 'AAB 123 哈囉 Cddef 456 ffecI', 42 | ) 43 | self.assertEqual( 44 | ('aab 123 哈囉 cddef 456 ffeci', 45 | [ 46 | { 47 | 'before': 'AAB', 48 | 'after': 'aab', 49 | }, 50 | { 51 | 'before': 'Cddef', 52 | 'after': 'cddef', 53 | }, 54 | { 55 | 'before': 'ffecI', 56 | 'after': 'ffeci', 57 | }, 58 | ], 59 | ), 60 | result, 61 | ) 62 | result = eng_lowercase_text_normalizer.denormalize( 63 | sentence=result[0], 64 | meta=result[1], 65 | ) 66 | self.assertEqual( 67 | 'AAB 123 哈囉 Cddef 456 ffecI', 68 | result, 69 | ) 70 | -------------------------------------------------------------------------------- /text_normalizer/library/test/test_identity.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from unittest import TestCase 4 | from ..identity import ( 5 | identity_text_normalizer, 6 | ) 7 | 8 | 9 | class IdentityTextNormalizersTestCase(TestCase): 10 | 11 | def test_identity_text_normalizer_normalize(self): 12 | result = identity_text_normalizer.normalize( 13 | sentence='我超懶惰 我就是想耍廢 KerKer ><', 14 | ) 15 | self.assertEqual( 16 | ('我超懶惰 我就是想耍廢 KerKer ><', None), 17 | result, 18 | ) 19 | result = identity_text_normalizer.denormalize( 20 | sentence=result[0], 21 | meta=result[1], 22 | ) 23 | self.assertEqual( 24 | '我超懶惰 我就是想耍廢 KerKer ><', 25 | result, 26 | ) 27 | -------------------------------------------------------------------------------- /text_normalizer/library/test/test_number.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | from ..number import ( 3 | int_text_normalizer, 4 | float_text_normalizer, 5 | int_with_digit_text_normalizer, 6 | float_with_digit_text_normalizer, 7 | int_with_space_text_normalizer, 8 | float_with_space_text_normalizer, 9 | int_with_digit_n_space_text_normalizer, 10 | float_with_digit_n_space_text_normalizer, 11 | ) 12 | 13 | 14 | class NumberTextNormalizersTestCase(TestCase): 15 | 16 | def test_int_text_normalizer(self): 17 | revised_sentence, meta = int_text_normalizer.normalize(sentence="123") 18 | recovered_sentence = int_text_normalizer.denormalize( 19 | sentence=revised_sentence, 20 | meta=meta, 21 | ) 22 | self.assertEqual("_int_", revised_sentence) 23 | self.assertEqual({"_int_": ["123"]}, meta) 24 | self.assertEqual("123", recovered_sentence) 25 | 26 | def test_float_text_normalizer(self): 27 | revised_sentence, meta = float_text_normalizer.normalize(sentence="123.33") 28 | recovered_sentence = float_text_normalizer.denormalize( 29 | sentence=revised_sentence, 30 | meta=meta, 31 | ) 32 | self.assertEqual("_float_", revised_sentence) 33 | self.assertEqual({"_float_": ["123.33"]}, meta) 34 | self.assertEqual("123.33", recovered_sentence) 35 | 36 | def test_int_with_digit_text_normalizer(self): 37 | revised_sentence, meta = int_with_digit_text_normalizer.normalize(sentence="123") 38 | recovered_sentence = int_with_digit_text_normalizer.denormalize( 39 | sentence=revised_sentence, 40 | meta=meta, 41 | ) 42 | self.assertEqual("_3int_", revised_sentence) 43 | self.assertEqual({"_3int_": ["123"]}, meta) 44 | self.assertEqual("123", recovered_sentence) 45 | 46 | def test_float_with_digit_text_normalizer(self): 47 | revised_sentence, meta = float_with_digit_text_normalizer.normalize( 48 | sentence="123.33", 49 | ) 50 | recovered_sentence = float_with_digit_text_normalizer.denormalize( 51 | sentence=revised_sentence, 52 | meta=meta, 53 | ) 54 | self.assertEqual("_3float2_", revised_sentence) 55 | self.assertEqual({"_3float2_": ["123.33"]}, meta) 56 | self.assertEqual("123.33", recovered_sentence) 57 | 58 | def test_int_with_space_text_normalizer(self): 59 | revised_sentence, meta = int_with_space_text_normalizer.normalize(sentence="123") 60 | recovered_sentence = int_with_space_text_normalizer.denormalize( 61 | sentence=revised_sentence, 62 | meta=meta, 63 | ) 64 | self.assertEqual(" _int_ ", revised_sentence) 65 | self.assertEqual({" _int_ ": ["123"]}, meta) 66 | self.assertEqual("123", recovered_sentence) 67 | 68 | def test_float_with_space_text_normalizer(self): 69 | revised_sentence, meta = float_with_space_text_normalizer.normalize(sentence="123.33") 70 | recovered_sentence = float_with_space_text_normalizer.denormalize( 71 | sentence=revised_sentence, 72 | meta=meta, 73 | ) 74 | self.assertEqual(" _float_ ", revised_sentence) 75 | self.assertEqual({" _float_ ": ["123.33"]}, meta) 76 | self.assertEqual("123.33", recovered_sentence) 77 | 78 | def test_int_with_digit_n_space_text_normalizer(self): 79 | revised_sentence, meta = int_with_digit_n_space_text_normalizer.normalize( 80 | sentence="123", 81 | ) 82 | recovered_sentence = int_with_digit_n_space_text_normalizer.denormalize( 83 | sentence=revised_sentence, 84 | meta=meta, 85 | ) 86 | self.assertEqual(" _3int_ ", revised_sentence) 87 | self.assertEqual({" _3int_ ": ["123"]}, meta) 88 | self.assertEqual("123", recovered_sentence) 89 | 90 | def test_float_with_digit_n_space_text_normalizer(self): 91 | revised_sentence, meta = float_with_digit_n_space_text_normalizer.normalize( 92 | sentence="123.33", 93 | ) 94 | recovered_sentence = float_with_digit_n_space_text_normalizer.denormalize( 95 | sentence=revised_sentence, 96 | meta=meta, 97 | ) 98 | self.assertEqual(" _3float2_ ", revised_sentence) 99 | self.assertEqual({" _3float2_ ": ["123.33"]}, meta) 100 | self.assertEqual("123.33", recovered_sentence) 101 | -------------------------------------------------------------------------------- /text_normalizer/library/test/test_punctuation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from unittest import TestCase 4 | from ..punctuation import ( 5 | chinese_punctuation_text_normalizer, 6 | english_punctuation_text_normalizer, 7 | all_punctuation_text_normalizer, 8 | all_punctuation_without_endpoint_text_normalizer, 9 | all_punctuation_without_underscore_text_normalizer, 10 | ) 11 | 12 | 13 | class PunctuationTextNormalizersTestCase(TestCase): 14 | 15 | def test_chinese_punctuation_text_normalizer_normalize(self): 16 | test_cases = [ 17 | ('勤彥大大喜歡吃《變態》糖果!!!', ('勤彥大大喜歡吃 變態 糖果 ', None)), 18 | ('。「」﹁﹂『 』、‧( )《》〈〉 ﹏﹏﹏……—~,?;:[]【 】!', (' ', None)), 19 | ('家豪大大亂入', ('家豪大大亂入', None)), 20 | ] 21 | for test_case in test_cases: 22 | with self.subTest(test_case=test_case): 23 | self.assertEqual( 24 | test_case[1], 25 | chinese_punctuation_text_normalizer.normalize(sentence=test_case[0]), 26 | ) 27 | 28 | def test_english_punctuation_text_normalizer_normalize(self): 29 | test_cases = [ 30 | ('勤彥大大喜歡吃<變態>糖果!!!', ('勤彥大大喜歡吃 變態 糖果 ', None)), 31 | ('.,<>(){}[]*^!?=+-~', (' ', None)), 32 | ('家豪大大亂入', ('家豪大大亂入', None)), 33 | ] 34 | for test_case in test_cases: 35 | with self.subTest(test_case=test_case): 36 | self.assertEqual( 37 | test_case[1], 38 | english_punctuation_text_normalizer.normalize(sentence=test_case[0]), 39 | ) 40 | 41 | def test_all_punctuation_text_normalizer_normalize(self): 42 | test_cases = [ 43 | ('勤彥大大:喜歡吃《》<變態>《》糖果!!!', ('勤彥大大 喜歡吃 變態 糖果 ', None)), 44 | ('.,<>(){}[]*^!?=+-~。「」﹁﹂『 』、‧( )《》〈〉 ﹏﹏﹏……—~,?;:[]【 】!', (' ', None)), 45 | ('家豪大大亂入', ('家豪大大亂入', None)), 46 | ] 47 | for test_case in test_cases: 48 | with self.subTest(test_case=test_case): 49 | self.assertEqual( 50 | test_case[1], 51 | all_punctuation_text_normalizer.normalize(sentence=test_case[0]), 52 | ) 53 | 54 | def test_all_punctuation_without_endpoint_text_normalizer_normalize(self): 55 | test_cases = [ 56 | ('勤彥大大:喜歡吃87.9《》<變態>《》糖果!!!', 57 | ('勤彥大大 喜歡吃87.9 變態 糖果 ', None)), 58 | ('.,<>(){}[]*^!?=+-~。「」﹁﹂『 』、‧( )《》〈〉 ﹏﹏﹏……—~,?;:[]【 】!', 59 | ('. ', None)), 60 | ('家豪大大亂入', ('家豪大大亂入', None)), 61 | ] 62 | for test_case in test_cases: 63 | with self.subTest(test_case=test_case): 64 | self.assertEqual( 65 | test_case[1], 66 | all_punctuation_without_endpoint_text_normalizer.normalize( 67 | sentence=test_case[0], 68 | ), 69 | ) 70 | 71 | def test_all_punctuation_without_underscore_text_normalizer_normalize(self): 72 | test_cases = [ 73 | ('勤彥大大:喜歡吃87.9《》_<變態>_《》糖果!!!', ('勤彥大大 喜歡吃87 9 _ 變態 _ 糖果 ', None)), 74 | ('_.,<>(){}[]*^!?=+-~。「」﹁﹂『 』、‧( )《》〈〉 ﹏﹏﹏……—~,?;:[]【 】!', ('_ ', None)), 75 | ('家豪大大亂入', ('家豪大大亂入', None)), 76 | ] 77 | for test_case in test_cases: 78 | with self.subTest(test_case=test_case): 79 | self.assertEqual( 80 | test_case[1], 81 | all_punctuation_without_underscore_text_normalizer.normalize( 82 | sentence=test_case[0], 83 | ), 84 | ) 85 | -------------------------------------------------------------------------------- /text_normalizer/library/test/test_punctuation_mapping.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from unittest import TestCase 3 | 4 | from ..punctuation_mapping import ( 5 | full_punctuation_mapping_text_normalizer, 6 | simplified_punctuation_mapping_text_normalizer, 7 | ) 8 | 9 | 10 | class PunctuationMappingTextNormalizerTestCase(TestCase): 11 | 12 | def run_test(self, test_cases, normalizer): 13 | for test_case in test_cases: 14 | with self.subTest(test_case=test_case): 15 | revised_sentence, meta = normalizer.normalize( 16 | sentence=test_case[0], 17 | ) 18 | self.assertEqual( 19 | test_case[1], 20 | revised_sentence, 21 | ) 22 | recovered_sentence = normalizer.denormalize( 23 | sentence=test_case[1], 24 | meta=meta, 25 | ) 26 | self.assertEqual( 27 | test_case[0], 28 | recovered_sentence, 29 | ) 30 | 31 | def test_full_punctuation_mapping_text_normalizer(self): 32 | test_cases = [ 33 | ( 34 | "符號, 、 。 . ? ! ~ $ % @ & # * ‧ , 、 。 . ? ! ~ $ % @ & # * ‧", 35 | "符號, , . . ? ! ~ $ % @ & # * . , , . . ? ! ~ $ % @ & # * .", 36 | ), 37 | ( 38 | "; ︰ … ﹐ ﹒ ˙ · ﹔ ﹕ ‘ ’ “ ” 〝 〞 ; ︰ … ﹐ ﹒ ˙ · ﹔ ﹕ ‘ ’ “ ” 〝 〞", 39 | "; : ... , . . . ; : \" \" \" \" \" \" ; : ... , . . . ; : \" \" \" \" \" \"", 40 | ), 41 | ( 42 | "括號符號; 〔 〕 【 】 ﹝ ﹞ 〈 〉 ﹙ ﹚ 《 》 ( ) ; 〔 〕 【 】 ﹝ ﹞ 〈 〉 ﹙ ﹚ 《 》 ( )", 43 | "括號符號; [ ] [ ] [ ] < > ( ) < > ( ) ; [ ] [ ] [ ] < > ( ) < > ( )", 44 | ), 45 | ( 46 | "{ } ﹛ ﹜ 『 』 「 」 < > ≦ ≧ ﹤ ﹥ { } ﹛ ﹜ 『 』 「 」 < > ≦ ≧ ﹤ ﹥", 47 | "{ } { } \" \" \" \" < > < > < > { } { } \" \" \" \" < > < > < >", 48 | ), 49 | ( 50 | "括號符號; ︵ ︶ ︷ ︸ ︹ ︺ ︻ ︼ ︽ ︾ ︿ ﹀ ﹁ ﹂ ﹃ ﹄ ︵ ︶ ︷ ︸ ︹ ︺ ︻ ︼ ︽ ︾ ︿ ﹀ ﹁ ﹂ ﹃ ﹄", 51 | "括號符號; ( ) { } [ ] [ ] < > < > \" \" \" \" ( ) { } [ ] [ ] < > < > \" \" \" \"", 52 | ), 53 | ( 54 | "線段符號; ﹣ ﹦ ≡ | ∣ ∥ – ︱ — ︳ ╴ ¯  ̄ ﹉ ﹣ ﹦ ≡ | ∣ ∥ – ︱ — ︳ ╴ ¯  ̄ ﹉", 55 | "線段符號; - = = | | / - | - | - - - - - = = | | / - | - | - - - -", 56 | ), 57 | ( 58 | "﹊ ﹍ ﹎ ﹋ ﹌ ﹏ ︴ ﹨ ∕ ╲ ╱ \ / ﹊ ﹍ ﹎ ﹋ ﹌ ﹏ ︴ ﹨ ∕ ╲ ╱ \ /", 59 | "- _ _ - - _ | \ / \ / \ / - _ _ - - _ | \ / \ / \ /", 60 | ), 61 | ( 62 | "+ + + ﹢ * * × ╳", 63 | "+ + + + * * * *", 64 | ), 65 | ] 66 | self.run_test(test_cases, normalizer=full_punctuation_mapping_text_normalizer) 67 | 68 | def test_simplified_punctuation_mapping_text_normalizer(self): 69 | test_cases = [ 70 | ( 71 | "符號, 、 。 . ? ! ~ $ % @ & # * ‧ , 、 。 . ? ! ~ $ % @ & # * ‧", 72 | "符號, , . . ? ! - $ % @ & # * . , , . . ? ! - $ % @ & # * .", 73 | ), 74 | ( 75 | "; ︰ … ﹐ ﹒ ˙ · ﹔ ﹕ ‘ ’ “ ” 〝 〞 ︰ … ﹐ ﹒ ˙ · ﹔ ﹕ ‘ ’ “ ” 〝 〞", 76 | "; : ... , . . . ; : \" \" \" \" \" \" : ... , . . . ; : \" \" \" \" \" \"", 77 | ), 78 | ( 79 | "括號符號; 〔 〕 【 】 ﹝ ﹞ 〈 〉 ﹙ ﹚ 《 》 ( ) 〔 〕 【 】 ﹝ ﹞ 〈 〉 ﹙ ﹚ 《 》 ( )", 80 | "括號符號; ( ) ( ) ( ) ( ) ( ) ( ) ( ) ( ) ( ) ( ) ( ) ( ) ( ) ( )", 81 | ), 82 | ( 83 | "{ } ﹛ ﹜ 『 』 「 」 < > ≦ ≧ ﹤ ﹥ { } ﹛ ﹜ 『 』 「 」 < > ≦ ≧ ﹤ ﹥", 84 | "( ) ( ) \" \" \" \" ( ) ( ) ( ) ( ) ( ) \" \" \" \" ( ) ( ) ( )", 85 | ), 86 | ( 87 | "括號符號; ︵ ︶ ︷ ︸ ︹ ︺ ︻ ︼ ︽ ︾ ︿ ﹀ ﹁ ﹂ ﹃ ﹄ ︵ ︶ ︷ ︸ ︹ ︺ ︻ ︼ ︽ ︾ ︿ ﹀ ﹁ ﹂ ﹃ ﹄", 88 | "括號符號; ( ) ( ) ( ) ( ) ( ) ( ) \" \" \" \" ( ) ( ) ( ) ( ) ( ) ( ) \" \" \" \"", 89 | ), 90 | ( 91 | "線段符號; ﹣ ﹦ ≡ | ∣ ∥ – ︱ — ︳ ╴ ¯  ̄ ﹉ ﹣ ﹦ ≡ | ∣ ∥ – ︱ — ︳ ╴ ¯  ̄ ﹉", 92 | "線段符號; - = = , , , - , - , - - - - - = = , , , - , - , - - - -", 93 | ), 94 | ( 95 | "﹊ ﹍ ﹎ ﹋ ﹌ ﹏ ︴ ﹨ ∕ ╲ ╱ \ / ﹊ ﹍ ﹎ ﹋ ﹌ ﹏ ︴ ﹨ ∕ ╲ ╱ \ /", 96 | "- _ _ - - _ , , , , , , , - _ _ - - _ , , , , , , ,", 97 | ), 98 | ] 99 | self.run_test(test_cases, normalizer=simplified_punctuation_mapping_text_normalizer) 100 | -------------------------------------------------------------------------------- /text_normalizer/library/test/test_strip.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from unittest import TestCase 3 | from ..strip import ( 4 | pure_strip_text_normalizer, 5 | ) 6 | 7 | 8 | class StripTextNormalizerTestCase(TestCase): 9 | 10 | def normalize(self): 11 | result = pure_strip_text_normalizer.normalize( 12 | sentence=' \n\n\t\t LALALA 拉拉 xddd \n\n\t\t\t ') 13 | self.assertEqual( 14 | ('LALALA 拉拉 xddd', None), 15 | result, 16 | ) 17 | -------------------------------------------------------------------------------- /text_normalizer/library/test/test_time.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from unittest import TestCase 3 | 4 | from ..time import ( 5 | time_text_normalizer_hhmm, 6 | ) 7 | 8 | 9 | class TimeTextNormalizersTestCase(TestCase): 10 | 11 | def test_time_hhmm_normalize(self): 12 | test_cases = [ 13 | ('12:18', (' _time_ ', {' _time_ ': ['12:18']})), 14 | ('現在時間12:18', ('現在時間 _time_ ', {' _time_ ': ['12:18']})), 15 | ('12:18XD', (' _time_ XD', {' _time_ ': ['12:18']})), 16 | ('現在時間12:18XD', ('現在時間 _time_ XD', {' _time_ ': ['12:18']})), 17 | ('12:18:00', ('12:18:00', {' _time_ ': []})), 18 | ('12:1828', ('12:1828', {' _time_ ': []})), 19 | ('1233:18', ('1233:18', {' _time_ ': []})), 20 | ('12:18和19:37', (' _time_ 和 _time_ ', {' _time_ ': ['12:18', '19:37']})), 21 | ('12:1819:37', ('12:1819:37', {' _time_ ': []})), 22 | ('家豪大大亂入', ('家豪大大亂入', {' _time_ ': []})), 23 | ] 24 | for test_case in test_cases: 25 | with self.subTest(test_case=test_case): 26 | self.assertEqual( 27 | test_case[1], 28 | time_text_normalizer_hhmm.normalize(sentence=test_case[0]), 29 | ) 30 | 31 | def test_time_hhmm_denormalize(self): 32 | normal_test_cases = [ 33 | (' _time_ ', {' _time_ ': ['12:18']}, '12:18'), 34 | ('現在時間 _time_ ', {' _time_ ': ['12:18']}, '現在時間12:18'), 35 | (' _time_ XD', {' _time_ ': ['12:18']}, '12:18XD'), 36 | ('現在時間 _time_ XD', {' _time_ ': ['12:18']}, '現在時間12:18XD'), 37 | (' _time_ 和 _time_ ', {' _time_ ': ['12:18', '19:37']}, '12:18和19:37'), 38 | (' _time_ _time_ ', {' _time_ ': ['12:18', '19:37']}, '12:1819:37'), 39 | ('家豪大大亂入', {' _time_ ': []}, '家豪大大亂入'), 40 | ] 41 | for test_case in normal_test_cases: 42 | with self.subTest(test_case=test_case): 43 | self.assertEqual( 44 | test_case[2], 45 | time_text_normalizer_hhmm.denormalize( 46 | sentence=test_case[0], 47 | meta=test_case[1], 48 | ), 49 | ) 50 | with self.assertRaises(KeyError): 51 | time_text_normalizer_hhmm.denormalize( 52 | sentence='家豪大大亂入', 53 | meta={'_雞排_': ['大雞排']}, 54 | ), 55 | with self.assertRaises(ValueError): 56 | time_text_normalizer_hhmm.denormalize( 57 | sentence=' _time_ 和 _time_ 這兩個時間都沒有雞排', 58 | meta={' _time_ ': ['12:18']}, 59 | ) 60 | -------------------------------------------------------------------------------- /text_normalizer/library/test/test_unicode_text_normalizers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from unittest import TestCase 4 | from ..unicode import ( 5 | unicode__chinese_characters_text_normalizer, 6 | unicode__chinese_characters_and_digits_text_normalizer, 7 | unicode__english_characters_and_digits_text_normalizer, 8 | unicode__english_digits_and_full_punctuations_text_normalizer, 9 | unicode__chinese_english_digits_and_full_punctuations_text_normalizer, 10 | unicode__chinese_english_digits_and_simplified_punctuations_1_text_normalizer, 11 | ) 12 | 13 | 14 | class PunctuationTextNormalizersTestCase(TestCase): 15 | 16 | def unit_test(self, normalizer, test_cases): 17 | for test_case in test_cases: 18 | with self.subTest(test_case=test_case): 19 | self.assertEqual( 20 | test_case[1], 21 | normalizer.normalize( 22 | sentence=test_case[0], 23 | ), 24 | ) 25 | self.assertEqual( 26 | test_case[0], 27 | normalizer.denormalize( 28 | sentence=test_case[1][0], 29 | meta=test_case[1][1], 30 | ), 31 | ) 32 | 33 | def test_unicode__chinese_characters_text_normalizer(self): 34 | normalizer = unicode__chinese_characters_text_normalizer 35 | test_cases = [ 36 | ( 37 | '><我想喝100.3元可樂xd~~', 38 | ( 39 | ' 我想喝 元可樂 ', 40 | { 41 | '想': ['想'], 42 | ' ': ['>', '<', '1', '0', '0', '.', '3', 43 | 'x', 'd', '~', '~'], 44 | '我': ['我'], 45 | '喝': ['喝'], 46 | '樂': ['樂'], 47 | '可': ['可'], 48 | '元': ['元'], 49 | }, 50 | ), 51 | ), 52 | ] 53 | self.unit_test( 54 | normalizer=normalizer, 55 | test_cases=test_cases, 56 | ) 57 | 58 | def test_unicode__chinese_characters_and_digits_text_normalizer(self): 59 | normalizer = unicode__chinese_characters_and_digits_text_normalizer 60 | test_cases = [ 61 | ( 62 | '><我想喝100.3元可樂xd~~', 63 | ( 64 | ' 我想喝100.3元可樂 ', 65 | { 66 | '0': ['0', '0'], 67 | '想': ['想'], 68 | ' ': ['>', '<', 'x', 'd', '~', '~'], 69 | '我': ['我'], 70 | '喝': ['喝'], 71 | '樂': ['樂'], 72 | '可': ['可'], 73 | '元': ['元'], 74 | '1': ['1'], 75 | '.': ['.'], 76 | '3': ['3'], 77 | }, 78 | ), 79 | ), 80 | ] 81 | self.unit_test( 82 | normalizer=normalizer, 83 | test_cases=test_cases, 84 | ) 85 | 86 | def test_unicode__english_characters_and_digits_text_normalizer(self): 87 | normalizer = unicode__english_characters_and_digits_text_normalizer 88 | test_cases = [ 89 | ( 90 | 'hate cola 123!', 91 | ( 92 | 'hate cola 123 ', 93 | { 94 | 'h': ['h'], 95 | 'a': ['a', 'a'], 96 | 't': ['t'], 97 | 'e': ['e'], 98 | 'c': ['c'], 99 | 'o': ['o'], 100 | 'l': ['l'], 101 | '1': ['1'], 102 | '2': ['2'], 103 | '3': ['3'], 104 | ' ': [' ', ' ', '!'], 105 | }, 106 | ), 107 | ), 108 | ] 109 | self.unit_test( 110 | normalizer=normalizer, 111 | test_cases=test_cases, 112 | ) 113 | 114 | def test_unicode__english_digits_and_full_punctuations_text_normalizer(self): 115 | normalizer = unicode__english_digits_and_full_punctuations_text_normalizer 116 | test_cases = [ 117 | ( 118 | 'hate cola 123!', 119 | ( 120 | 'hate cola 123!', 121 | { 122 | 'h': ['h'], 123 | 'a': ['a', 'a'], 124 | 't': ['t'], 125 | 'e': ['e'], 126 | 'c': ['c'], 127 | 'o': ['o'], 128 | 'l': ['l'], 129 | '1': ['1'], 130 | '2': ['2'], 131 | '3': ['3'], 132 | '!': ['!'], 133 | ' ': [' ', ' '], 134 | }, 135 | ), 136 | ), 137 | ] 138 | self.unit_test( 139 | normalizer=normalizer, 140 | test_cases=test_cases, 141 | ) 142 | 143 | def test_unicode__chinese_english_digits_and_full_punctuations_text_normalizer(self): 144 | normalizer = unicode__chinese_english_digits_and_full_punctuations_text_normalizer 145 | test_cases = [ 146 | ( 147 | '~我想喝100元『可樂』,cola xd~。', 148 | ( 149 | '~我想喝100元"可樂",cola xd~.', 150 | { 151 | '0': ['0', '0'], 152 | '想': ['想'], 153 | ' ': [' '], 154 | '~': ['~', '~'], 155 | ',': [','], 156 | '.': ['。'], 157 | '"': ['『', '』'], 158 | '我': ['我'], 159 | '喝': ['喝'], 160 | '樂': ['樂'], 161 | '可': ['可'], 162 | '元': ['元'], 163 | '1': ['1'], 164 | 'c': ['c'], 165 | 'o': ['o'], 166 | 'l': ['l'], 167 | 'a': ['a'], 168 | 'x': ['x'], 169 | 'd': ['d'], 170 | }, 171 | ), 172 | ), 173 | ( 174 | '。「」﹁﹂『 』、‧( )《》〈〉 ﹏﹏﹏……—~,?;:[]【 】!', 175 | ( 176 | '."" " ", ( )<><> ~,?;:[][ ]!', 177 | { 178 | ' ': ['﹁', '﹂', ' ', '‧', ' ', ' ', '﹏', 179 | '﹏', '﹏', '…', '…', '—', ' '], 180 | '!': ['!'], 181 | '"': ['「', '」', '『', '』'], 182 | '(': ['('], 183 | ')': [')'], 184 | ',': ['、', ','], 185 | '.': ['。'], 186 | ':': [':'], 187 | ';': [';'], 188 | '<': ['《', '〈'], 189 | '>': ['》', '〉'], 190 | '?': ['?'], 191 | '[': ['[', '【'], 192 | ']': [']', '】'], 193 | '~': ['~'], 194 | }, 195 | ), 196 | ), 197 | ] 198 | self.unit_test( 199 | normalizer=normalizer, 200 | test_cases=test_cases, 201 | ) 202 | 203 | def test_unicode__chinese_english_digits_and_simplified_punctuations_1_text_normalizer(self): 204 | normalizer = unicode__chinese_english_digits_and_simplified_punctuations_1_text_normalizer 205 | test_cases = [ 206 | ( 207 | '><我想喝100元可樂/cola xd~~', 208 | ( 209 | ' 我想喝100元可樂,cola xd--', 210 | { 211 | '0': ['0', '0'], 212 | '想': ['想'], 213 | ' ': ['>', '<', ' '], 214 | '-': ['~', '~'], 215 | ',': ['/'], 216 | '我': ['我'], 217 | '喝': ['喝'], 218 | '樂': ['樂'], 219 | '可': ['可'], 220 | '元': ['元'], 221 | '1': ['1'], 222 | 'c': ['c'], 223 | 'o': ['o'], 224 | 'l': ['l'], 225 | 'a': ['a'], 226 | 'x': ['x'], 227 | 'd': ['d'], 228 | }, 229 | ), 230 | ), 231 | ( 232 | '。「」﹁﹂『 』、‧( )《》〈〉 ﹏﹏﹏……—~,?;:[]【 】!', 233 | ( 234 | '. , -, , ', 235 | { 236 | ' ': ['「', '」', '﹁', '﹂', '『', ' ', '』', 237 | '‧', '(', ' ', ')', '《', '》', '〈', '〉', 238 | ' ', '﹏', '﹏', '﹏', '…', '…', '—', '?', ':', 239 | '[', ']', '【', ' ', '】', '!'], 240 | ',': ['、', ',', ';'], 241 | '-': ['~'], 242 | '.': ['。'], 243 | }, 244 | ), 245 | ), 246 | ] 247 | self.unit_test( 248 | normalizer=normalizer, 249 | test_cases=test_cases, 250 | ) 251 | -------------------------------------------------------------------------------- /text_normalizer/library/time.py: -------------------------------------------------------------------------------- 1 | from ..factory import ReplacePatternWithToken 2 | 3 | 4 | time_text_normalizer_hhmm = ReplacePatternWithToken( 5 | name='time_hhmm', 6 | denormalizable=True, 7 | target_pattern=r'[0-2]*\d:[0-5]*\d', 8 | prefix_pattern=r'[^\d:]{1}|\A', 9 | suffix_pattern=r'[^\d:]{1}|\Z', 10 | token=' _time_ ', 11 | ) 12 | -------------------------------------------------------------------------------- /text_normalizer/library/unicode.py: -------------------------------------------------------------------------------- 1 | from os.path import join 2 | 3 | from ..factory import UnicodeMapping 4 | from text_normalizer import ROOT_DIR 5 | 6 | 7 | unicode__chinese_characters_text_normalizer = UnicodeMapping( 8 | unicode_mapping_path=join( 9 | ROOT_DIR, 10 | 'data/unicode/chinese_characters_only.txt', 11 | ), 12 | ) 13 | 14 | unicode__chinese_characters_and_digits_text_normalizer = UnicodeMapping( 15 | unicode_mapping_path=join( 16 | ROOT_DIR, 17 | 'data/unicode/chinese_characters_and_digits.txt', 18 | ), 19 | ) 20 | 21 | unicode__english_characters_and_digits_text_normalizer = UnicodeMapping( 22 | unicode_mapping_path=join( 23 | ROOT_DIR, 24 | 'data/unicode/english_characters_and_digits.txt', 25 | ), 26 | ) 27 | 28 | unicode__english_digits_and_full_punctuations_text_normalizer = UnicodeMapping( 29 | unicode_mapping_path=join( 30 | ROOT_DIR, 31 | 'data/unicode/english_digits_and_full_punctuations.txt', 32 | ), 33 | ) 34 | 35 | unicode__chinese_english_digits_and_full_punctuations_text_normalizer = UnicodeMapping( 36 | unicode_mapping_path=join( 37 | ROOT_DIR, 38 | 'data/unicode/chinese_english_digits_and_full_punctuations.txt', 39 | ), 40 | ) 41 | 42 | unicode__chinese_english_digits_and_simplified_punctuations_1_text_normalizer = \ 43 | UnicodeMapping( 44 | unicode_mapping_path=join( 45 | ROOT_DIR, 46 | 'data/unicode/chinese_english_digits_and_simplified_punctuations_1.txt', 47 | ), 48 | ) 49 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yoctol/text-normalizer/3609c10cd229c08b4623531e82d2292fc370734c/utils/__init__.py -------------------------------------------------------------------------------- /utils/label_propagation.pyx: -------------------------------------------------------------------------------- 1 | 2 | 3 | def propagate_label( 4 | label: list[int], 5 | annotations: list[dict], 6 | ) -> list[int]: 7 | return propagate_label_in_c( 8 | label=label, 9 | annotations=annotations, 10 | ) 11 | 12 | 13 | def backpropagate_label( 14 | label: list[int], 15 | annotations: list[dict], 16 | ) -> list[int]: 17 | return backpropagate_label_in_c( 18 | label=label, 19 | annotations=annotations, 20 | ) 21 | 22 | 23 | cdef list propagate_label_in_c( # noqa: E999 24 | list label, 25 | list annotations, # list of dict 26 | ): 27 | cdef unsigned int i, n_anno 28 | 29 | n_anno = len(annotations) 30 | for i in range(n_anno): 31 | label = propagate_label_for_a_pair_of_annotations_in_c( 32 | label=label, 33 | forward_annotations=annotations[i]['forward'], 34 | backward_annotations=annotations[i]['backward'], 35 | ) 36 | return label 37 | 38 | 39 | cdef list backpropagate_label_in_c( # noqa: E999 40 | list label, 41 | list annotations, 42 | ): 43 | cdef unsigned int i, j, n_anno 44 | 45 | n_anno = len(annotations) 46 | for i in range(n_anno - 1, -1, -1): 47 | label = propagate_label_for_a_pair_of_annotations_in_c( 48 | label=label, 49 | forward_annotations=annotations[i]['backward'], 50 | backward_annotations=annotations[i]['forward'], 51 | ) 52 | return label 53 | 54 | 55 | cdef list propagate_label_for_a_pair_of_annotations_in_c( # noqa: E999 56 | list label, 57 | list forward_annotations, # list of tuples 58 | list backward_annotations, # list of tuples 59 | ): 60 | 61 | cdef unsigned int i, n_fmodif, n_bmodif, current_pt 62 | cdef list output_label 63 | 64 | n_fmodif = len(forward_annotations) 65 | n_bmodif = len(backward_annotations) 66 | 67 | if n_fmodif != n_bmodif: 68 | raise ValueError( 69 | f'number of forward and backward modifications is not the same') 70 | 71 | if n_bmodif == 0: 72 | # no modification return label 73 | return label 74 | 75 | output_label = [0] * (2 * n_fmodif + 1) 76 | current_pt = 0 77 | for i in range(n_fmodif): 78 | # before annotations 79 | output_label[2 * i] = label[current_pt: forward_annotations[i][0]] 80 | 81 | # annotate 82 | merged_label = get_high_freq_label( 83 | label[forward_annotations[i][0]: forward_annotations[i][1]]) 84 | n_labels = backward_annotations[i][1] - backward_annotations[i][0] 85 | output_label[2 * i + 1] = [merged_label] * n_labels 86 | 87 | current_pt = forward_annotations[i][1] 88 | 89 | output_label[-1] = label[forward_annotations[-1][1]:] 90 | 91 | output_label = sum(output_label, []) 92 | return output_label 93 | 94 | 95 | cdef unsigned int get_high_freq_label( # noqa: E999 96 | list label): 97 | 98 | cdef unsigned int max_f, label_f 99 | cdef dict record = {} 100 | 101 | max_f = 0 102 | label_f = 0 103 | for l in label: 104 | if l not in record: 105 | record[l] = 1 106 | else: 107 | record[l] += 1 108 | 109 | if record[l] > max_f: 110 | max_f = record[l] 111 | label_f = l 112 | return label_f 113 | -------------------------------------------------------------------------------- /utils/setup_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yoctol/text-normalizer/3609c10cd229c08b4623531e82d2292fc370734c/utils/setup_utils/__init__.py -------------------------------------------------------------------------------- /utils/setup_utils/get_ext.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from setuptools import Extension 4 | 5 | 6 | def get_ext_modules_n_cmdclass(): 7 | 8 | root_path = Path('.') 9 | 10 | try: 11 | from Cython.Distutils import build_ext 12 | except ImportError: 13 | use_cython = False 14 | else: 15 | use_cython = True 16 | 17 | cmdclass = {} 18 | ext_modules = [] 19 | if use_cython: 20 | # get all .pyx files 21 | pyx_paths = sorted(root_path.rglob("*.pyx")) 22 | for pyx_path in pyx_paths: 23 | path_str = str(pyx_path) 24 | header = pyx_path.read_text().split('\n')[0] 25 | if ('cpp' in header) or ('c++' in header): 26 | language = 'c++' 27 | else: 28 | language = 'c' 29 | 30 | extension = Extension( 31 | path_str[:-4].replace('/', '.'), 32 | [path_str], 33 | language=language, 34 | ) 35 | 36 | # Have Cython embed function call signature information in docstrings, 37 | # so that Sphinx can extract and use those signatures. 38 | extension.cython_directives = {"embedsignature": True} 39 | ext_modules.append(extension) 40 | cmdclass.update({'build_ext': build_ext}) 41 | 42 | else: 43 | # .c files 44 | c_paths = sorted(root_path.rglob("*.c")) 45 | for c_path in c_paths: 46 | path_str = str(c_path) 47 | ext_modules.append( 48 | Extension( 49 | path_str[:-2].replace('/', '.'), 50 | [path_str], 51 | ), 52 | ) 53 | 54 | # .cpp files 55 | cpp_paths = sorted(root_path.rglob("*.cpp")) 56 | for cpp_path in cpp_paths: 57 | path_str = str(cpp_path) 58 | ext_modules.append( 59 | Extension( 60 | path_str[:-4].replace('/', '.'), 61 | [path_str], 62 | ), 63 | ) 64 | 65 | return ext_modules, cmdclass 66 | -------------------------------------------------------------------------------- /utils/setup_utils/remove_so_files.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import os 3 | 4 | 5 | def remove_so_files(): 6 | so_paths = sorted( 7 | Path('./strpipe').rglob( 8 | "*.cpython-36m-x86_64-linux-gnu.so", 9 | ), 10 | ) 11 | for path in so_paths: 12 | os.remove(str(path.resolve())) 13 | 14 | 15 | if __name__ == '__main__': 16 | remove_so_files() 17 | -------------------------------------------------------------------------------- /utils/test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yoctol/text-normalizer/3609c10cd229c08b4623531e82d2292fc370734c/utils/test/__init__.py -------------------------------------------------------------------------------- /utils/test/test_label_propagation.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from ..label_propagation import ( 4 | # propagate_label, 5 | backpropagate_label, 6 | ) 7 | 8 | 9 | class LabelPropagationTestCase(TestCase): 10 | 11 | @classmethod 12 | def setUpClass(cls): 13 | """ 14 | input str: 我想買10元的100c.c.飲料 15 | result of normalization: 16 | 我想買_int_元的_int_c.c.飲料 17 | meta = { 18 | 'forward': [(3,5, '10'), (7, 10, '100')], 19 | 'backward': [(3,8, '_int_'), (10, 15, '_int_')], 20 | } 21 | 22 | """ 23 | cls.meta = { 24 | 'forward': [(3, 5, '10'), (7, 10, '100')], 25 | 'backward': [(3, 8, '_int_'), (10, 15, '_int_')], 26 | } 27 | cls.label = [0, 0, 0, 1, 1, 1, 1, 1, 1, 28 | 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0] 29 | cls.expected_label = [0, 0, 0, 1, 1, 1, 0, 2, 2, 2, 2, 2, 2, 2, 0, 0] 30 | 31 | def test_backpropagate_label(self): 32 | output = backpropagate_label( 33 | label=self.label, 34 | annotations=[self.meta], 35 | ) 36 | self.assertEqual(self.expected_label, output) 37 | --------------------------------------------------------------------------------