├── .editorconfig
├── .flake8
├── .gitignore
├── .travis.yml
├── LICENSE
├── MANIFEST.in
├── Makefile
├── README.md
├── requirements.txt
├── setup.py
├── text_normalizer
    ├── __init__.py
    ├── collection
    │   ├── __init__.py
    │   ├── base_collection.py
    │   ├── basic.py
    │   ├── charactor.py
    │   ├── eng_basic.py
    │   ├── punctuation_keeping.py
    │   ├── test
    │   │   ├── __init__.py
    │   │   ├── test_base_collection.py
    │   │   ├── test_basic.py
    │   │   ├── test_charactor.py
    │   │   ├── test_eng_basic.py
    │   │   ├── test_punctuation_keeping.py
    │   │   └── test_unicode_mapping.py
    │   └── unicode_mapping.py
    ├── data
    │   ├── punctuation
    │   │   ├── punctuation_mapping_0221.csv
    │   │   └── punctuation_mapping_0221_simplified.csv
    │   └── unicode
    │   │   ├── chinese_characters_and_digits.txt
    │   │   ├── chinese_characters_only.txt
    │   │   ├── chinese_english_digits.txt
    │   │   ├── chinese_english_digits_and_full_punctuations.txt
    │   │   ├── chinese_english_digits_and_simplified_punctuations_1.txt
    │   │   ├── english_characters_and_digits.txt
    │   │   └── english_digits_and_full_punctuations.txt
    ├── factory
    │   ├── __init__.py
    │   ├── base_factory.py
    │   ├── eng_lowercase.py
    │   ├── identity.py
    │   ├── number_token.py
    │   ├── punctuation_mapping.py
    │   ├── replace_pattern_with_token.py
    │   ├── strip.py
    │   ├── test
    │   │   ├── __init__.py
    │   │   ├── example_punctuation_mapping.csv
    │   │   ├── example_unicode_mapping.txt
    │   │   ├── test_base_factory.py
    │   │   ├── test_eng_lowercase.py
    │   │   ├── test_identity.py
    │   │   ├── test_number_token_test_tokenizer.py
    │   │   ├── test_punctuation_mapping.py
    │   │   ├── test_strip.py
    │   │   └── test_unicode_mapping.py
    │   ├── toolkit
    │   │   ├── __init__.py
    │   │   ├── findall_position.c
    │   │   └── findall_position.pyx
    │   └── unicode_mapping.py
    └── library
    │   ├── __init__.py
    │   ├── basic.py
    │   ├── date.py
    │   ├── eng_lowercase.py
    │   ├── identity.py
    │   ├── number.py
    │   ├── punctuation.py
    │   ├── punctuation_mapping.py
    │   ├── strip.py
    │   ├── test
    │       ├── __init__.py
    │       ├── test_basic.py
    │       ├── test_date.py
    │       ├── test_eng_lowercase.py
    │       ├── test_identity.py
    │       ├── test_number.py
    │       ├── test_punctuation.py
    │       ├── test_punctuation_mapping.py
    │       ├── test_strip.py
    │       ├── test_time.py
    │       └── test_unicode_text_normalizers.py
    │   ├── time.py
    │   └── unicode.py
└── utils
    ├── __init__.py
    ├── label_propagation.c
    ├── label_propagation.pyx
    ├── setup_utils
        ├── __init__.py
        ├── get_ext.py
        └── remove_so_files.py
    └── test
        ├── __init__.py
        └── test_label_propagation.py


/.editorconfig:
--------------------------------------------------------------------------------
 1 | root = true
 2 | 
 3 | [*]
 4 | indent_style = space
 5 | end_of_line = lf
 6 | charset = utf-8
 7 | trim_trailing_whitespace = true
 8 | insert_final_newline = true
 9 | 
10 | [*.{js,yml,css,html}]
11 | indent_size = 2
12 | 
13 | [*.{py,pyx,pxd}]
14 | indent_size = 4
15 | max_line_length = 100
16 | 
17 | [*.json]
18 | indent_size = 2
19 | insert_final_newline = ignore
20 | 
21 | [Makefile]
22 | indent_style = tab
23 | 
24 | [*.md]
25 | indent_size = 4
26 | trim_trailing_whitespace = false
27 | 
28 | [*.{c,cpp}]
29 | indent_size = 4
30 | 


--------------------------------------------------------------------------------
/.flake8:
--------------------------------------------------------------------------------
 1 | [flake8]
 2 | filename = *.py,*.pyx,*.pxd
 3 | max-line-length = 100
 4 | ignore =
 5 |     E125,E121,E266,
 6 |     # print is allowed
 7 |     T001, T003,
 8 |     # invalid escape sequence
 9 |     W605
10 | exclude =
11 |     .git
12 |     # __pycache__
13 |     __pycache__
14 |     # virtual environment
15 |     .venv/
16 |     venv/
17 |     env/
18 |     build/
19 |     # sphinx docs
20 |     docs/
21 | max_complexity = 10
22 | statistics = true
23 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | 
 6 | # C extensions
 7 | *.so
 8 | 
 9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | *.fuse_hidden*
27 | 
28 | # PyInstaller
29 | #  Usually these files are written by a python script from a template
30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 | 
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 | 
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *,cover
47 | .hypothesis/
48 | 
49 | # Translations
50 | *.mo
51 | *.pot
52 | 
53 | # Django stuff:
54 | *.log
55 | local_settings.py
56 | 
57 | # Flask stuff:
58 | instance/
59 | .webassets-cache
60 | 
61 | # Scrapy stuff:
62 | .scrapy
63 | 
64 | # Sphinx documentation
65 | docs/_build/
66 | 
67 | # PyBuilder
68 | target/
69 | 
70 | # IPython Notebook
71 | .ipynb_checkpoints
72 | 
73 | # pyenv
74 | .python-version
75 | 
76 | # celery beat schedule file
77 | celerybeat-schedule
78 | 
79 | # dotenv
80 | .env
81 | 
82 | # virtualenv
83 | venv/
84 | ENV/
85 | 
86 | # Spyder project settings
87 | .spyderproject
88 | 
89 | # Rope project settings
90 | .ropeproject
91 | 
92 | example.py
93 | .vscode/
94 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: python
 2 | python:
 3 | - '3.5'
 4 | - '3.6'
 5 | install:
 6 | - make install
 7 | script:
 8 | - make lint
 9 | - make test
10 | deploy:
11 |   provider: pypi
12 |   user: solumilken
13 |   password:
14 |     secure: 0Opwlmv6qK1uyW2u3sE5rbdNiopeFTvG8kKAFZS3b5joRpiCKRdRGPYPIfCf1l8S5SeXq6INt6HIHcxyNDsIB1ejHYYJu33wL9tK1mUekivlEXibdEDaN3/qNfT9dZDWm/4tUFrFvHGhB6krJjIToUYsJvM3tYBSX6uCgfxFrpr6GsLNrxs/nIzy2aCD9MMReQ89iC3IQoWkTuTIbGnuj7eWEQpbhjLmBIrwJwnh5zcjdrR9PAzWakOX4bMeVa89nQiaL16icTaHthCQrLuyCP7lQ2tlh7rO4yT+UF4qLynWFAEYEQL3mvx+I/bNpKaRvHy26ZgkTLsd5mJsntbohDYN0Ydyx6nXTzuAMsElumMdVYizJghh8+/x9CfbF+CqK6qQ/UqL10OjFUinTNcYUi9jzt2hsGnno9eDjzVtlQmo4i+N3MQRciTWbQawWM7VXmjT7rGI18Zc4zp4/Y9qEZG18QZzaDPexXFOpJU7pWt07658jMHwGqmQJiyIWXTKjBq4IWxIw/s7VmE5R0ElqgCL6spwC3ErHzJvvX1XhrU98lDqyk1VWQxtRl/jyA3OLKnInou92jLPH3M0iAriKlttHxEacFEj0rsaaYDYtLwvyIiWFNdaATraIBaH8cQeoMJH4HmNTCAQFFRUW1B1/Ss2XCgEuUpLaPQaGrpNyNo=
15 |   skip_upload_docs: true
16 |   skip_cleanup: true
17 |   on:
18 |     tags: true
19 |   python: 3.5
20 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2017  YOCTOL INFO INC.
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include text_normalizer/data/*/*
2 | include README.md
3 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | .DEFAULT_GOAL := all
 2 | 
 3 | .PHONY: installself
 4 | installself:
 5 | 	python setup.py build_ext
 6 | 	pip install -e .
 7 | 
 8 | .PHONY: install
 9 | install:
10 | 	pip install -U pip wheel setuptools cython
11 | 	pip install -r requirements.txt
12 | 	make installself
13 | 
14 | .PHONY: lint
15 | lint:
16 | 	flake8
17 | 
18 | .PHONY: test
19 | test:
20 | 	python -m unittest -v
21 | 
22 | .PHONY: all
23 | all: test lint
24 | 
25 | .PHONY: clean
26 | clean:
27 | 	rm -rf `find . -name __pycache__`
28 | 	rm -f `find . -type f -name '*.py[co]' `
29 | 	rm -f `find . -type f -name '*~' `
30 | 	rm -f `find . -type f -name '.*~' `
31 | 	rm -rf .cache
32 | 	rm -rf htmlcov
33 | 	rm -rf *.egg-info
34 | 	rm -f .coverage
35 | 	rm -f .coverage.*
36 | 	rm -rf build
37 | 	python setup_utils/remove_so_files.py
38 | 	make -C docs clean
39 | 	python setup.py clean
40 | 
41 | .PHONY: dev-test
42 | dev-test:
43 | 	rm -rf build
44 | 	python utils/setup_utils/remove_so_files.py
45 | 	python setup.py build_ext
46 | 	pip install -e .
47 | 	make lint
48 | 	make test
49 | 
50 | .PHONY: docs
51 | docs:
52 | 	make installself
53 | 	make -C docs
54 | 
55 | .PHONY: distribute
56 | distribute:
57 | 	make clean
58 | 	python setup.py sdist
59 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # text-normalizer
 2 | 
 3 | [![travis][travis-image]][travis-url]
 4 | [![pypi][pypi-image]][pypi-url]
 5 | 
 6 | [travis-image]: https://img.shields.io/travis/Yoctol/text-normalizer.svg?style=flat
 7 | [travis-url]: https://travis-ci.org/Yoctol/text-normalizer
 8 | [pypi-image]: https://img.shields.io/pypi/v/text-normalizer.svg?style=flat
 9 | [pypi-url]: https://pypi.python.org/pypi/text-normalizer
10 | 
11 | Normalize your Text String. 
12 | It is a python package that help you normalize your text data and recover it.
13 | 
14 | ## Install
15 | Use Python3
16 | ```
17 | > pip install text-normalizer
18 | ```
19 | ## Usage
20 | ```python
21 | from text_normalizer.text_normalizer_collection_library import chinese_charactor_text_normalizer_collection_2
22 | 
23 | 
24 | input_sentence = "   我在85.33度C買了一杯900──1000元的咖啡    《ohoh》？？ m_m"
25 | nor_sentence, meta = chinese_charactor_text_normalizer_collection_2.normalize(input_sentence)
26 | print(nor_sentence)
27 | > "我在_float_度c買了一杯_int_-_int_元的咖啡 <ohoh>?? m_m"
28 | 
29 | de_sentence = chinese_charactor_text_normalizer_collection_2.denormalize(nor_sentence, meta)
30 | print(de_sentence)
31 | > "我在85.33度C買了一杯900──1000元的咖啡 《ohoh》？？ m_m",
32 | 
33 | ```
34 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | -e .
2 | flake8-config-yoctol>=0.0.11
3 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from pathlib import Path
 2 | 
 3 | from setuptools import setup, find_packages
 4 | from utils.setup_utils.get_ext import get_ext_modules_n_cmdclass
 5 | 
 6 | 
 7 | ROOT_DIR = Path(__file__).parent
 8 | 
 9 | 
10 | # make description
11 | readme = ROOT_DIR.joinpath('README.md')
12 | if readme.exists():
13 |     with readme.open() as f:
14 |         long_description = f.read()
15 |         try:
16 |             from pypandoc import convert_text
17 |             long_description = convert_text(
18 |                 long_description, 'rst', format='md')
19 |         except ImportError:
20 |             print("warning: pypandoc module not found, could not convert Markdown to RST")
21 | else:
22 |     long_description = '-'
23 | 
24 | 
25 | # get cython extension
26 | ext_modules, cmdclass = get_ext_modules_n_cmdclass()
27 | 
28 | 
29 | setup(
30 |     name="text-normalizer",
31 |     version="0.1.3",
32 |     description="Yoctol Natural Language Text Normalizer",
33 |     license="MIT",
34 |     author="Solumilken",
35 |     author_email="yien.tsai@yoctol.com",
36 |     url="https://github.com/Yoctol/text-normalizer",
37 |     packages=find_packages(),
38 |     install_requires=[
39 |         'pandas;python_version>="3.5"',
40 |         'pandas<0.21;python_version<"3.5"',
41 |     ],
42 |     python_requires=">=3.5",
43 |     long_description=long_description,
44 |     classifiers=[
45 |         "Programming Language :: Python",
46 |         "Programming Language :: Python :: 3.5",
47 |         "Programming Language :: Python :: 3.6",
48 |     ],
49 |     include_package_data=True,
50 |     cmdclass=cmdclass,
51 |     ext_modules=ext_modules,
52 | )
53 | 


--------------------------------------------------------------------------------
/text_normalizer/__init__.py:
--------------------------------------------------------------------------------
1 | from os.path import abspath, dirname
2 | 
3 | ROOT_DIR = dirname(abspath(__file__))
4 | 


--------------------------------------------------------------------------------
/text_normalizer/collection/__init__.py:
--------------------------------------------------------------------------------
 1 | from .basic import (  # noqa
 2 |     basic_text_normalizer_collection,
 3 |     number_with_digits_text_normalizer_collection,
 4 | )
 5 | from .eng_basic import eng_basic_text_normalizer_collection  # noqa
 6 | from .punctuation_keeping import (  # noqa
 7 |     full_punctuation_keeping_text_normalizer_collection,
 8 |     simplified_punctuation_keeping_text_normalizer_collection,
 9 |     number_with_digits_n_simplified_punctuation_text_normalizer_collection,
10 | )
11 | from .charactor import (  # noqa
12 |     chinese_charactor_text_normalizer_collection_1,
13 |     chinese_charactor_text_normalizer_collection_2,
14 |     chinese_charactor_text_normalizer_collection_3,
15 |     chinese_charactor_text_normalizer_collection_4,
16 | )
17 | from .unicode_mapping import (  # noqa
18 |     u_zh_text_normalizer_collection_1,
19 |     u_zh_text_normalizer_collection_2,
20 |     u_zh_text_normalizer_collection_3,
21 |     u_zh_text_normalizer_collection_4,
22 |     u_en_text_normalizer_collection_1,
23 |     u_en_text_normalizer_collection_2,
24 |     u_en_text_normalizer_collection_3,
25 |     u_zh_en_text_normalizer_collection_1,
26 |     u_zh_en_text_normalizer_collection_2,
27 |     u_zh_en_text_normalizer_collection_3,
28 |     u_zh_en_text_normalizer_collection_4,
29 | )
30 | 


--------------------------------------------------------------------------------
/text_normalizer/collection/base_collection.py:
--------------------------------------------------------------------------------
 1 | from typing import List
 2 | 
 3 | 
 4 | class BaseCollection(object):
 5 | 
 6 |     def __init__(self):
 7 |         self.text_normalizers = []
 8 | 
 9 |     def add_text_normalizers(
10 |             self,
11 |             text_normalizers: List[object],
12 |         ) -> None:
13 |         '''
14 |         TODO: Ensure text normalizer is a subclass of BaseTextNormalizer
15 |         '''
16 |         for text_normalizer in text_normalizers:
17 |             self.text_normalizers.append(text_normalizer)
18 | 
19 |     def clear_text_normalizers(self):
20 |         self.text_normalizers = []
21 | 
22 |     def normalize(
23 |             self,
24 |             sentence: str,
25 |         )-> (str, List[dict]):
26 |         meta = []
27 |         for text_normalizer in self.text_normalizers:
28 |             sentence, meta_data = text_normalizer.normalize(sentence=sentence)
29 |             meta.append({
30 |                 'name': text_normalizer.name,
31 |                 'revised_sentence': sentence,
32 |                 'meta_data': meta_data,
33 |             })
34 |         return sentence, meta
35 | 
36 |     def denormalize(
37 |             self,
38 |             sentence: str,
39 |             meta: List[dict],
40 |         ) -> str:
41 |         for text_normalizer, record in zip(
42 |                 self.text_normalizers[::-1],
43 |                 meta[::-1],
44 |             ):
45 |             if record['name'] == text_normalizer.name:
46 |                 sentence = text_normalizer.denormalize(
47 |                     sentence=sentence,
48 |                     meta=record['meta_data'],
49 |                 )
50 |         return sentence.strip()
51 | 
52 |     # def ldenormalize(
53 |     #         self,
54 |     #         sentence: List[str],
55 |     #         meta: List[dict],
56 |     #     ):
57 |     #     for text_normalizer, record in zip(self.text_normalizers[::-1], meta[::-1]):
58 |     #         if record['name'] == text_normalizer.name:
59 |     #             sentence = text_normalizer.lretrieve(
60 |     #                 sentence=sentence,
61 |     #                 meta=record['meta_data'],
62 |     #             )
63 |     #     return sentence
64 | 


--------------------------------------------------------------------------------
/text_normalizer/collection/basic.py:
--------------------------------------------------------------------------------
 1 | from .base_collection import BaseCollection
 2 | from ..library import (
 3 |     whitespace_char_text_normalizer,
 4 |     float_with_space_text_normalizer,
 5 |     int_with_space_text_normalizer,
 6 |     float_with_digit_n_space_text_normalizer,
 7 |     int_with_digit_n_space_text_normalizer,
 8 |     all_punctuation_without_endpoint_text_normalizer,
 9 |     all_punctuation_without_underscore_text_normalizer,
10 |     pure_strip_text_normalizer,
11 |     eng_lowercase_text_normalizer,
12 | )
13 | 
14 | 
15 | basic_text_normalizer_collection = BaseCollection()
16 | basic_text_normalizer_collection.add_text_normalizers(
17 |     text_normalizers=[
18 |         eng_lowercase_text_normalizer,
19 |         all_punctuation_without_endpoint_text_normalizer,
20 |         float_with_space_text_normalizer,
21 |         int_with_space_text_normalizer,
22 |         all_punctuation_without_underscore_text_normalizer,
23 |         whitespace_char_text_normalizer,
24 |         pure_strip_text_normalizer,
25 |     ],
26 | )
27 | 
28 | number_with_digits_text_normalizer_collection = BaseCollection()
29 | number_with_digits_text_normalizer_collection.add_text_normalizers(
30 |     text_normalizers=[
31 |         eng_lowercase_text_normalizer,
32 |         all_punctuation_without_endpoint_text_normalizer,
33 |         float_with_digit_n_space_text_normalizer,
34 |         int_with_digit_n_space_text_normalizer,
35 |         all_punctuation_without_underscore_text_normalizer,
36 |         whitespace_char_text_normalizer,
37 |         pure_strip_text_normalizer,
38 |     ],
39 | )
40 | 


--------------------------------------------------------------------------------
/text_normalizer/collection/charactor.py:
--------------------------------------------------------------------------------
 1 | from .base_collection import BaseCollection
 2 | from ..library import (
 3 |     whitespace_char_text_normalizer,
 4 |     float_text_normalizer,
 5 |     int_text_normalizer,
 6 |     int_with_digit_text_normalizer,
 7 |     float_with_digit_text_normalizer,
 8 |     full_punctuation_mapping_text_normalizer,
 9 |     simplified_punctuation_mapping_text_normalizer,
10 |     pure_strip_text_normalizer,
11 |     eng_lowercase_text_normalizer,
12 | )
13 | 
14 | 
15 | chinese_charactor_text_normalizer_collection_1 = BaseCollection()
16 | chinese_charactor_text_normalizer_collection_1.add_text_normalizers(
17 |     text_normalizers=[
18 |         eng_lowercase_text_normalizer,
19 |         simplified_punctuation_mapping_text_normalizer,
20 |         float_text_normalizer,
21 |         int_text_normalizer,
22 |         whitespace_char_text_normalizer,
23 |         pure_strip_text_normalizer,
24 |     ],
25 | )
26 | 
27 | 
28 | chinese_charactor_text_normalizer_collection_2 = BaseCollection()
29 | chinese_charactor_text_normalizer_collection_2.add_text_normalizers(
30 |     text_normalizers=[
31 |         eng_lowercase_text_normalizer,
32 |         full_punctuation_mapping_text_normalizer,
33 |         float_text_normalizer,
34 |         int_text_normalizer,
35 |         whitespace_char_text_normalizer,
36 |         pure_strip_text_normalizer,
37 |     ],
38 | )
39 | 
40 | 
41 | chinese_charactor_text_normalizer_collection_3 = BaseCollection()
42 | chinese_charactor_text_normalizer_collection_3.add_text_normalizers(
43 |     text_normalizers=[
44 |         eng_lowercase_text_normalizer,
45 |         simplified_punctuation_mapping_text_normalizer,
46 |         float_with_digit_text_normalizer,
47 |         int_with_digit_text_normalizer,
48 |         whitespace_char_text_normalizer,
49 |         pure_strip_text_normalizer,
50 |     ],
51 | )
52 | 
53 | 
54 | chinese_charactor_text_normalizer_collection_4 = BaseCollection()
55 | chinese_charactor_text_normalizer_collection_4.add_text_normalizers(
56 |     text_normalizers=[
57 |         eng_lowercase_text_normalizer,
58 |         full_punctuation_mapping_text_normalizer,
59 |         float_with_digit_text_normalizer,
60 |         int_with_digit_text_normalizer,
61 |         whitespace_char_text_normalizer,
62 |         pure_strip_text_normalizer,
63 |     ],
64 | )
65 | 


--------------------------------------------------------------------------------
/text_normalizer/collection/eng_basic.py:
--------------------------------------------------------------------------------
 1 | from .base_collection import BaseCollection
 2 | from ..library import (
 3 |     whitespace_char_text_normalizer,
 4 |     pure_strip_text_normalizer,
 5 |     eng_lowercase_text_normalizer,
 6 | )
 7 | 
 8 | 
 9 | eng_basic_text_normalizer_collection = BaseCollection()
10 | eng_basic_text_normalizer_collection.add_text_normalizers(
11 |     text_normalizers=[
12 |         eng_lowercase_text_normalizer,
13 |         whitespace_char_text_normalizer,
14 |         pure_strip_text_normalizer,
15 |     ],
16 | )
17 | 


--------------------------------------------------------------------------------
/text_normalizer/collection/punctuation_keeping.py:
--------------------------------------------------------------------------------
 1 | from .base_collection import BaseCollection
 2 | from ..library import (
 3 |     whitespace_char_text_normalizer,
 4 |     float_with_space_text_normalizer,
 5 |     int_with_space_text_normalizer,
 6 |     float_with_digit_n_space_text_normalizer,
 7 |     int_with_digit_n_space_text_normalizer,
 8 |     full_punctuation_mapping_text_normalizer,
 9 |     simplified_punctuation_mapping_text_normalizer,
10 |     pure_strip_text_normalizer,
11 |     eng_lowercase_text_normalizer,
12 | )
13 | 
14 | 
15 | full_punctuation_keeping_text_normalizer_collection = BaseCollection()
16 | full_punctuation_keeping_text_normalizer_collection.add_text_normalizers(
17 |     text_normalizers=[
18 |         eng_lowercase_text_normalizer,
19 |         full_punctuation_mapping_text_normalizer,
20 |         float_with_space_text_normalizer,
21 |         int_with_space_text_normalizer,
22 |         whitespace_char_text_normalizer,
23 |         pure_strip_text_normalizer,
24 |     ],
25 | )
26 | 
27 | 
28 | simplified_punctuation_keeping_text_normalizer_collection = BaseCollection()
29 | simplified_punctuation_keeping_text_normalizer_collection.add_text_normalizers(
30 |     text_normalizers=[
31 |         eng_lowercase_text_normalizer,
32 |         simplified_punctuation_mapping_text_normalizer,
33 |         float_with_space_text_normalizer,
34 |         int_with_space_text_normalizer,
35 |         whitespace_char_text_normalizer,
36 |         pure_strip_text_normalizer,
37 |     ],
38 | )
39 | 
40 | 
41 | number_with_digits_n_simplified_punctuation_text_normalizer_collection = \
42 |     BaseCollection()
43 | number_with_digits_n_simplified_punctuation_text_normalizer_collection.add_text_normalizers(
44 |     text_normalizers=[
45 |         eng_lowercase_text_normalizer,
46 |         simplified_punctuation_mapping_text_normalizer,
47 |         float_with_digit_n_space_text_normalizer,
48 |         int_with_digit_n_space_text_normalizer,
49 |         whitespace_char_text_normalizer,
50 |         pure_strip_text_normalizer,
51 |     ],
52 | )
53 | 


--------------------------------------------------------------------------------
/text_normalizer/collection/test/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Yoctol/text-normalizer/3609c10cd229c08b4623531e82d2292fc370734c/text_normalizer/collection/test/__init__.py


--------------------------------------------------------------------------------
/text_normalizer/collection/test/test_base_collection.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | from unittest.mock import Mock, call
  3 | 
  4 | from unittest import TestCase
  5 | from ..base_collection import BaseCollection
  6 | 
  7 | 
  8 | class TestBaseCollection(TestCase):
  9 | 
 10 |     def setUp(self):
 11 |         self.base_text_normalizer_collection = BaseCollection()
 12 |         self.example_sentence = "0123456789"
 13 |         self.text_normalizers = Mock()
 14 |         self.text_normalizer_0 = Mock()
 15 |         self.text_normalizer_0.normalize = Mock(return_value=("我123456789", {"我": ["0"]}))
 16 |         self.text_normalizer_0.denormalize = Mock(return_value="023456789")
 17 |         self.text_normalizer_0.name = "text_normalizer_0"
 18 |         self.text_normalizer_1 = Mock()
 19 |         self.text_normalizer_1.normalize = Mock(return_value=("我23456789", None))
 20 |         self.text_normalizer_1.denormalize = Mock(return_value="我23456789")
 21 |         self.text_normalizer_1.name = "text_normalizer_1"
 22 |         self.text_normalizer_2 = Mock()
 23 |         self.text_normalizer_2.normalize = Mock(return_value=("我要3456789", {"要": ["2"]}))
 24 |         self.text_normalizer_2.denormalize = Mock(return_value="我23456789")
 25 |         self.text_normalizer_2.name = "text_normalizer_2"
 26 |         self.text_normalizers.f0, self.text_normalizers.f1, self.text_normalizers.f2 = \
 27 |             self.text_normalizer_0, self.text_normalizer_1, self.text_normalizer_2
 28 | 
 29 |     def test_attributes(self):
 30 |         self.assertEqual(
 31 |             {
 32 |                 'text_normalizers': [],
 33 |             },
 34 |             self.base_text_normalizer_collection.__dict__,
 35 |         )
 36 | 
 37 |     def test_add_text_normalizers(self):
 38 |         self.base_text_normalizer_collection.add_text_normalizers([self.text_normalizer_0])
 39 |         self.assertEqual(
 40 |             [self.text_normalizer_0],
 41 |             self.base_text_normalizer_collection.text_normalizers,
 42 |         )
 43 |         self.base_text_normalizer_collection.add_text_normalizers([self.text_normalizer_1])
 44 |         self.assertEqual(
 45 |             [self.text_normalizer_0, self.text_normalizer_1],
 46 |             self.base_text_normalizer_collection.text_normalizers,
 47 |         )
 48 |         self.base_text_normalizer_collection.add_text_normalizers([self.text_normalizer_2])
 49 |         self.assertEqual(
 50 |             [self.text_normalizer_0, self.text_normalizer_1, self.text_normalizer_2],
 51 |             self.base_text_normalizer_collection.text_normalizers,
 52 |         )
 53 |         self.base_text_normalizer_collection.clear_text_normalizers()
 54 |         self.base_text_normalizer_collection.add_text_normalizers(
 55 |             [self.text_normalizer_0, self.text_normalizer_1, self.text_normalizer_2],
 56 |         )
 57 |         self.assertEqual(
 58 |             [self.text_normalizer_0, self.text_normalizer_1, self.text_normalizer_2],
 59 |             self.base_text_normalizer_collection.text_normalizers,
 60 |         )
 61 | 
 62 |     def test_call(self):
 63 |         self.base_text_normalizer_collection.add_text_normalizers([self.text_normalizer_0])
 64 |         self.base_text_normalizer_collection.normalize(
 65 |             sentence=self.example_sentence,
 66 |         )
 67 |         self.text_normalizers.assert_has_calls(
 68 |             [call.f0.normalize(sentence=self.example_sentence)],
 69 |         )
 70 | 
 71 |         self.base_text_normalizer_collection.add_text_normalizers([self.text_normalizer_1])
 72 |         self.base_text_normalizer_collection.normalize(
 73 |             sentence=self.example_sentence,
 74 |         )
 75 |         self.text_normalizers.assert_has_calls(
 76 |             [
 77 |                 call.f0.normalize(sentence=self.example_sentence),
 78 |                 call.f1.normalize(sentence="我123456789"),
 79 |             ],
 80 |         )
 81 |         self.base_text_normalizer_collection.add_text_normalizers([self.text_normalizer_2])
 82 |         self.base_text_normalizer_collection.normalize(
 83 |             sentence=self.example_sentence,
 84 |         )
 85 |         self.text_normalizers.assert_has_calls(
 86 |             [
 87 |                 call.f0.normalize(sentence=self.example_sentence),
 88 |                 call.f1.normalize(sentence="我123456789"),
 89 |                 call.f2.normalize(sentence="我23456789"),
 90 |             ],
 91 |         )
 92 | 
 93 |     def test_denormalize(self):
 94 |         self.base_text_normalizer_collection.add_text_normalizers([self.text_normalizer_0])
 95 |         self.base_text_normalizer_collection.denormalize(
 96 |             sentence="我123456789",
 97 |             meta=[
 98 |                 {
 99 |                     'name': "text_normalizer_0",
100 |                     'revised_sentence': "XDDD",
101 |                     'meta_data': {"我": ["0"]},
102 |                 },
103 |             ],
104 |         )
105 |         self.text_normalizers.assert_has_calls(
106 |             [
107 |                 call.f0.denormalize(
108 |                     sentence="我123456789",
109 |                     meta={"我": ["0"]},
110 |                 ),
111 |             ],
112 |         )
113 | 
114 |         self.base_text_normalizer_collection.add_text_normalizers([self.text_normalizer_1])
115 |         self.base_text_normalizer_collection.denormalize(
116 |             sentence="我123456789",
117 |             meta=[
118 |                 {
119 |                     'name': "text_normalizer_0",
120 |                     'revised_sentence': "XDDD",
121 |                     'meta_data': {"我": ["0"]},
122 |                 },
123 |                 {
124 |                     'name': "text_normalizer_1",
125 |                     "revise_sentence": ">O<",
126 |                     "meta_data": None,
127 |                 },
128 |             ],
129 |         )
130 |         self.text_normalizers.assert_has_calls(
131 |             [
132 |                 call.f1.denormalize(
133 |                     sentence="我123456789",
134 |                     meta=None,
135 |                 ),
136 |                 call.f0.denormalize(
137 |                     sentence="我23456789",
138 |                     meta={"我": ["0"]},
139 |                 ),
140 |             ],
141 |         )
142 | 
143 |         self.base_text_normalizer_collection.add_text_normalizers([self.text_normalizer_2])
144 |         self.base_text_normalizer_collection.denormalize(
145 |             sentence="我要3456789",
146 |             meta=[
147 |                 {
148 |                     'name': "text_normalizer_0",
149 |                     'revised_sentence': "XDDD",
150 |                     'meta_data': {"我": ["0"]},
151 |                 },
152 |                 {
153 |                     'name': "text_normalizer_1",
154 |                     "revise_sentence": ">O<",
155 |                     "meta_data": None,
156 |                 },
157 |                 {
158 |                     'name': "text_normalizer_2",
159 |                     "revised_sentence": "M_M",
160 |                     "meta_data": {"要": ["2"]},
161 |                 },
162 |             ],
163 |         )
164 | 
165 |         self.text_normalizers.assert_has_calls(
166 |             [
167 |                 call.f2.denormalize(
168 |                     sentence="我要3456789",
169 |                     meta={"要": ["2"]},
170 |                 ),
171 |                 call.f1.denormalize(
172 |                     sentence="我23456789",
173 |                     meta=None,
174 |                 ),
175 |                 call.f0.denormalize(
176 |                     sentence="我23456789",
177 |                     meta={"我": ["0"]},
178 |                 ),
179 |             ],
180 |         )
181 | 


--------------------------------------------------------------------------------
/text_normalizer/collection/test/test_basic.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from unittest import TestCase
 3 | 
 4 | from ..basic import (
 5 |     basic_text_normalizer_collection,
 6 |     number_with_digits_text_normalizer_collection,
 7 | )
 8 | 
 9 | 
10 | class BasicNormalizerCollectionTestCase(TestCase):
11 | 
12 |     def test_basic_text_normalizer_collection(self):
13 |         normalizer = basic_text_normalizer_collection
14 |         test_cases = [
15 |             (
16 |                 '我在85.33度C買了一杯(*999*)的咖啡--',
17 |                 '我在 _float_ 度c買了一杯 _int_ 的咖啡',
18 |                 '我在85.33度C買了一杯999的咖啡',
19 |             ),
20 |             (
21 |                 '++',
22 |                 '',
23 |                 '',
24 |             ),
25 |         ]
26 |         for test_case in test_cases:
27 |             with self.subTest(test_case=test_case):
28 |                 revised_sentence, meta = normalizer.normalize(
29 |                     sentence=test_case[0],
30 |                 )
31 |                 self.assertEqual(
32 |                     test_case[1],
33 |                     revised_sentence,
34 |                 )
35 |                 recovered_sentence = normalizer.denormalize(
36 |                     sentence=test_case[1],
37 |                     meta=meta,
38 |                 )
39 |                 self.assertEqual(
40 |                     test_case[2],
41 |                     recovered_sentence,
42 |                 )
43 | 
44 |     def test_number_with_digits_text_normalizer_collection(self):
45 |         normalizer = number_with_digits_text_normalizer_collection
46 |         test_cases = [
47 |             (
48 |                 '我在85.33度C買了一杯(*999*)的咖啡--',
49 |                 '我在 _2float2_ 度c買了一杯 _3int_ 的咖啡',
50 |                 '我在85.33度C買了一杯999的咖啡',
51 |             ),
52 |             (
53 |                 '++??',
54 |                 '',
55 |                 '',
56 |             ),
57 |         ]
58 |         for test_case in test_cases:
59 |             with self.subTest(test_case=test_case):
60 |                 revised_sentence, meta = normalizer.normalize(
61 |                     sentence=test_case[0],
62 |                 )
63 |                 self.assertEqual(
64 |                     test_case[1],
65 |                     revised_sentence,
66 |                 )
67 |                 recovered_sentence = normalizer.denormalize(
68 |                     sentence=test_case[1],
69 |                     meta=meta,
70 |                 )
71 |                 self.assertEqual(
72 |                     test_case[2],
73 |                     recovered_sentence,
74 |                 )
75 | 


--------------------------------------------------------------------------------
/text_normalizer/collection/test/test_charactor.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | from unittest import TestCase
  3 | 
  4 | from ..charactor import (
  5 |     chinese_charactor_text_normalizer_collection_1,
  6 |     chinese_charactor_text_normalizer_collection_2,
  7 |     chinese_charactor_text_normalizer_collection_3,
  8 |     chinese_charactor_text_normalizer_collection_4,
  9 | )
 10 | 
 11 | 
 12 | class CharactorTextNormalizerCollectionTestCase(TestCase):
 13 | 
 14 |     def _run_test(self, test_cases, normalizer, normalizer_name):
 15 |         for test_case in test_cases:
 16 |             with self.subTest(test_case=(normalizer_name, test_case[0])):
 17 |                 revised_sentence, meta = normalizer.normalize(
 18 |                     sentence=test_case[0],
 19 |                 )
 20 |                 self.assertEqual(
 21 |                     test_case[1],
 22 |                     revised_sentence,
 23 |                 )
 24 |                 recovered_sentence = normalizer.denormalize(
 25 |                     sentence=test_case[1],
 26 |                     meta=meta,
 27 |                 )
 28 |                 self.assertEqual(
 29 |                     test_case[2],
 30 |                     recovered_sentence,
 31 |                 )
 32 | 
 33 |     def test_chinese_charactor_text_normalizer_collection_1(self):
 34 |         test_cases = [
 35 |             (
 36 |                 "   我在85.33度C買了一杯900──1000元的咖啡《ohoh》？？ m_m",
 37 |                 "我在_float_度c買了一杯_int_-_int_元的咖啡(ohoh)?? m_m",
 38 |                 "我在85.33度C買了一杯900──1000元的咖啡《ohoh》？？ m_m",
 39 |             ),
 40 |             (
 41 |                 "買5──8年五門車    ～～",
 42 |                 "買_int_-_int_年五門車 --",
 43 |                 "買5──8年五門車 ～～",
 44 |             ),
 45 |             (
 46 |                 "2001 ~ 2007年紅色\藍色的Benz    OHOHOH",
 47 |                 "_int_ - _int_年紅色,藍色的benz ohohoh",
 48 |                 "2001 ~ 2007年紅色\藍色的Benz OHOHOH",
 49 |             ),
 50 |         ]
 51 |         self._run_test(
 52 |             test_cases=test_cases,
 53 |             normalizer=chinese_charactor_text_normalizer_collection_1,
 54 |             normalizer_name="chinese_charactor_text_normalizer_collection_1",
 55 |         )
 56 | 
 57 |     def test_chinese_charactor_text_normalizer_collection_2(self):
 58 |         test_cases = [
 59 |             (
 60 |                 "   我在85.33度C買了一杯900──1000元的咖啡    《ohoh》？？ m_m",
 61 |                 "我在_float_度c買了一杯_int_-_int_元的咖啡 <ohoh>?? m_m",
 62 |                 "我在85.33度C買了一杯900──1000元的咖啡 《ohoh》？？ m_m",
 63 |             ),
 64 |             (
 65 |                 "買5-8年五門車  ～～    ",
 66 |                 "買_int_-_int_年五門車 ~~",
 67 |                 "買5-8年五門車 ～～",
 68 |             ),
 69 |             (
 70 |                 "2001 ~ 2007年紅色\藍色的Benz    OHOHOH    ",
 71 |                 "_int_ ~ _int_年紅色\藍色的benz ohohoh",
 72 |                 "2001 ~ 2007年紅色\藍色的Benz OHOHOH",
 73 |             ),
 74 |         ]
 75 |         self._run_test(
 76 |             test_cases=test_cases,
 77 |             normalizer=chinese_charactor_text_normalizer_collection_2,
 78 |             normalizer_name="chinese_charactor_text_normalizer_collection_2",
 79 |         )
 80 | 
 81 |     def test_chinese_charactor_text_normalizer_collection_3(self):
 82 |         test_cases = [
 83 |             (
 84 |                 "   我在85.33度C買了一杯900-1000元的咖啡{ohoh}",
 85 |                 "我在_2float2_度c買了一杯_3int_-_4int_元的咖啡(ohoh)",
 86 |                 "我在85.33度C買了一杯900-1000元的咖啡{ohoh}",
 87 |             ),
 88 |             (
 89 |                 "   買5 - 80年     五門車~    ",
 90 |                 "買_1int_ - _2int_年 五門車-",
 91 |                 "買5 - 80年 五門車~",
 92 |             ),
 93 |         ]
 94 |         self._run_test(
 95 |             test_cases=test_cases,
 96 |             normalizer=chinese_charactor_text_normalizer_collection_3,
 97 |             normalizer_name="chinese_charactor_text_normalizer_collection_3",
 98 |         )
 99 | 
100 |     def test_chinese_charactor_text_normalizer_collection_4(self):
101 |         test_cases = [
102 |             (
103 |                 "    我在85.333度C買了一杯900──1000元的咖啡《ohoh》？？ m_m  ",
104 |                 "我在_2float3_度c買了一杯_3int_-_4int_元的咖啡<ohoh>?? m_m",
105 |                 "我在85.333度C買了一杯900──1000元的咖啡《ohoh》？？ m_m",
106 |             ),
107 |             (
108 |                 "買5-800年   五門車  ～～    ",
109 |                 "買_1int_-_3int_年 五門車 ~~",
110 |                 "買5-800年 五門車 ～～",
111 |             ),
112 |             (
113 |                 "    2001 ~ 2007年紅色\藍色的Benz    OHOHOH",
114 |                 "_4int_ ~ _4int_年紅色\藍色的benz ohohoh",
115 |                 "2001 ~ 2007年紅色\藍色的Benz OHOHOH",
116 |             ),
117 |         ]
118 |         self._run_test(
119 |             test_cases=test_cases,
120 |             normalizer=chinese_charactor_text_normalizer_collection_4,
121 |             normalizer_name="chinese_charactor_text_normalizer_collection_4",
122 |         )
123 | 


--------------------------------------------------------------------------------
/text_normalizer/collection/test/test_eng_basic.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from unittest import TestCase
 3 | 
 4 | from ..eng_basic import eng_basic_text_normalizer_collection
 5 | 
 6 | 
 7 | class EngBasicNormalizerCollectionTestCase(TestCase):
 8 | 
 9 |     def test_eng_basic_text_normalizer_collection(self):
10 |         test_cases = [
11 |             (
12 |                 'Hoa DADA loves to eat chicken pie.',
13 |                 'hoa dada loves to eat chicken pie.',
14 |                 'Hoa DADA loves to eat chicken pie.',
15 |             ),
16 |             (
17 |                 'CPH DA    DA want to hang    out  with Hoa \t\t DADA! \n\n',
18 |                 'cph da da want to hang out with hoa dada!',
19 |                 'CPH DA DA want to hang out with Hoa DADA!',
20 |             ),
21 |         ]
22 |         for test_case in test_cases:
23 |             with self.subTest(test_case=test_case):
24 |                 revised_sentence, meta = eng_basic_text_normalizer_collection.normalize(
25 |                     sentence=test_case[0],
26 |                 )
27 |                 self.assertEqual(test_case[1], revised_sentence)
28 |                 recovered_sentence = eng_basic_text_normalizer_collection.denormalize(
29 |                     sentence=revised_sentence,
30 |                     meta=meta,
31 |                 )
32 |                 self.assertEqual(test_case[2], recovered_sentence)
33 | 


--------------------------------------------------------------------------------
/text_normalizer/collection/test/test_punctuation_keeping.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | from unittest import TestCase
  3 | 
  4 | from ..punctuation_keeping import (
  5 |     full_punctuation_keeping_text_normalizer_collection,
  6 |     simplified_punctuation_keeping_text_normalizer_collection,
  7 |     number_with_digits_n_simplified_punctuation_text_normalizer_collection,
  8 | )
  9 | 
 10 | 
 11 | class PunctuationKeepingTextNormalizerCollectionTestCase(TestCase):
 12 | 
 13 |     def test_full_punctuation_keeping_text_normalizer_collection(self):
 14 |         normalizer = full_punctuation_keeping_text_normalizer_collection
 15 |         test_cases = [
 16 |             (
 17 |                 "我在85.33度C買了一杯900──1000元的咖啡《ohoh》？？",
 18 |                 "我在 _float_ 度c買了一杯 _int_ - _int_ 元的咖啡<ohoh>??",
 19 |             ),
 20 |             (
 21 |                 "買5──8年五門車～～",
 22 |                 "買 _int_ - _int_ 年五門車~~",
 23 |             ),
 24 |         ]
 25 |         for test_case in test_cases:
 26 |             with self.subTest(test_case=test_case):
 27 |                 revised_sentence, meta = normalizer.normalize(
 28 |                     sentence=test_case[0],
 29 |                 )
 30 |                 self.assertEqual(
 31 |                     test_case[1],
 32 |                     revised_sentence,
 33 |                 )
 34 |                 recovered_sentence = normalizer.denormalize(
 35 |                     sentence=test_case[1],
 36 |                     meta=meta,
 37 |                 )
 38 |                 self.assertEqual(
 39 |                     test_case[0],
 40 |                     recovered_sentence,
 41 |                 )
 42 | 
 43 |     def test_simplified_punctuation_keeping_text_normalizer_collection(self):
 44 |         normalizer = simplified_punctuation_keeping_text_normalizer_collection
 45 |         test_cases = [
 46 |             (
 47 |                 "我在85.33度C買了一杯900-1000元的咖啡{ohoh}",
 48 |                 "我在 _float_ 度c買了一杯 _int_ - _int_ 元的咖啡(ohoh)",
 49 |             ),
 50 |             (
 51 |                 "買5-8年五門車~",
 52 |                 "買 _int_ - _int_ 年五門車-",
 53 |             ),
 54 |         ]
 55 |         for test_case in test_cases:
 56 |             with self.subTest(test_case=test_case):
 57 |                 revised_sentence, meta = normalizer.normalize(
 58 |                     sentence=test_case[0],
 59 |                 )
 60 |                 self.assertEqual(
 61 |                     test_case[1],
 62 |                     revised_sentence,
 63 |                 )
 64 |                 recovered_sentence = normalizer.denormalize(
 65 |                     sentence=test_case[1],
 66 |                     meta=meta,
 67 |                 )
 68 |                 self.assertEqual(
 69 |                     test_case[0],
 70 |                     recovered_sentence,
 71 |                 )
 72 | 
 73 |     def test_number_with_digits_n_simplified_punctuation_text_normalizer_collection(self):
 74 |         normalizer = number_with_digits_n_simplified_punctuation_text_normalizer_collection
 75 |         test_cases = [
 76 |             (
 77 |                 "我在85.33度C買了一杯900-1000元的咖啡{ohoh}",
 78 |                 "我在 _2float2_ 度c買了一杯 _3int_ - _4int_ 元的咖啡(ohoh)",
 79 |             ),
 80 |             (
 81 |                 "買5-8年五門車~",
 82 |                 "買 _1int_ - _1int_ 年五門車-",
 83 |             ),
 84 |         ]
 85 |         for test_case in test_cases:
 86 |             with self.subTest(test_case=test_case):
 87 |                 revised_sentence, meta = normalizer.normalize(
 88 |                     sentence=test_case[0],
 89 |                 )
 90 |                 self.assertEqual(
 91 |                     test_case[1],
 92 |                     revised_sentence,
 93 |                 )
 94 |                 recovered_sentence = normalizer.denormalize(
 95 |                     sentence=test_case[1],
 96 |                     meta=meta,
 97 |                 )
 98 |                 self.assertEqual(
 99 |                     test_case[0],
100 |                     recovered_sentence,
101 |                 )
102 | 


--------------------------------------------------------------------------------
/text_normalizer/collection/test/test_unicode_mapping.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | from unittest import TestCase
  3 | 
  4 | from ..unicode_mapping import (
  5 |     u_zh_text_normalizer_collection_1,
  6 |     u_zh_text_normalizer_collection_2,
  7 |     u_zh_text_normalizer_collection_3,
  8 |     u_zh_text_normalizer_collection_4,
  9 |     u_en_text_normalizer_collection_1,
 10 |     u_en_text_normalizer_collection_2,
 11 |     u_en_text_normalizer_collection_3,
 12 |     u_zh_en_text_normalizer_collection_1,
 13 |     u_zh_en_text_normalizer_collection_2,
 14 |     u_zh_en_text_normalizer_collection_3,
 15 |     u_zh_en_text_normalizer_collection_4,
 16 | )
 17 | 
 18 | 
 19 | class UnicodeTextNormalizerCollectionTestCase(TestCase):
 20 | 
 21 |     def unit_test(self, normalizer, test_cases):
 22 |         for test_case in test_cases:
 23 |             with self.subTest(test_case=test_case):
 24 |                 revised_sentence, meta = normalizer.normalize(
 25 |                     sentence=test_case[0],
 26 |                 )
 27 |                 self.assertEqual(
 28 |                     test_case[1],
 29 |                     revised_sentence,
 30 |                 )
 31 |                 recovered_sentence = normalizer.denormalize(
 32 |                     sentence=test_case[1],
 33 |                     meta=meta,
 34 |                 )
 35 |                 self.assertEqual(
 36 |                     test_case[0],
 37 |                     recovered_sentence,
 38 |                 )
 39 | 
 40 |     def test_u_zh_text_normalizer_collection_1(self):
 41 |         normalizer = u_zh_text_normalizer_collection_1
 42 |         test_cases = [
 43 |             (
 44 |                 '我在85.33度C買了一杯900──1000元的咖啡《ohoh》？？',
 45 |                 '我在 度 買了一杯 元的咖啡 ',
 46 |             ),
 47 |             (
 48 |                 '買5──8年五門車～～',
 49 |                 '買 年五門車 ',
 50 |             ),
 51 |         ]
 52 |         self.unit_test(
 53 |             normalizer=normalizer,
 54 |             test_cases=test_cases,
 55 |         )
 56 | 
 57 |     def test_u_zh_text_normalizer_collection_2(self):
 58 |         normalizer = u_zh_text_normalizer_collection_2
 59 |         test_cases = [
 60 |             (
 61 |                 '我在85.33度C買了一杯900 1000元的咖啡《ohoh》？？',
 62 |                 '我在85.33度 買了一杯900 1000元的咖啡 ',
 63 |             ),
 64 |             (
 65 |                 '買5──8年五門車～～',
 66 |                 '買5 8年五門車 ',
 67 |             ),
 68 |         ]
 69 |         self.unit_test(
 70 |             normalizer=normalizer,
 71 |             test_cases=test_cases,
 72 |         )
 73 | 
 74 |     def test_u_zh_text_normalizer_collection_3(self):
 75 |         normalizer = u_zh_text_normalizer_collection_3
 76 |         test_cases = [
 77 |             (
 78 |                 '我在85.33度C買了一杯900 1000元的咖啡《ohoh》？？',
 79 |                 '我在_float_度 買了一杯_int_ _int_元的咖啡 ',
 80 |             ),
 81 |             (
 82 |                 '買5──8年五門車～～',
 83 |                 '買_int_ _int_年五門車 ',
 84 |             ),
 85 |         ]
 86 |         self.unit_test(
 87 |             normalizer=normalizer,
 88 |             test_cases=test_cases,
 89 |         )
 90 | 
 91 |     def test_u_zh_text_normalizer_collection_4(self):
 92 |         normalizer = u_zh_text_normalizer_collection_4
 93 |         test_cases = [
 94 |             (
 95 |                 '我在85.333度C買了一杯900 1000元的咖啡《ohoh》？？',
 96 |                 '我在_2float3_度 買了一杯_3int_ _4int_元的咖啡 ',
 97 |             ),
 98 |             (
 99 |                 '買5──8年五門車～～',
100 |                 '買_1int_ _1int_年五門車 ',
101 |             ),
102 |         ]
103 |         self.unit_test(
104 |             normalizer=normalizer,
105 |             test_cases=test_cases,
106 |         )
107 | 
108 |     def test_u_en_text_normalizer_collection_1(self):
109 |         normalizer = u_en_text_normalizer_collection_1
110 |         test_cases = [
111 |             (
112 |                 'I want to buy 300 cups of $10.7 coffee. OHOH@@',
113 |                 'i want to buy 300 cups of $10.7 coffee. ohoh@@',
114 |             ),
115 |         ]
116 |         self.unit_test(
117 |             normalizer=normalizer,
118 |             test_cases=test_cases,
119 |         )
120 | 
121 |     def test_u_en_text_normalizer_collection_2(self):
122 |         normalizer = u_en_text_normalizer_collection_2
123 |         test_cases = [
124 |             (
125 |                 'I want    to buy 300 cups of $10.7 coffee. OHOH',
126 |                 'i want to buy _int_ cups of $_float_ coffee. ohoh',
127 |             ),
128 |         ]
129 |         self.unit_test(
130 |             normalizer=normalizer,
131 |             test_cases=test_cases,
132 |         )
133 | 
134 |     def test_u_en_text_normalizer_collection_3(self):
135 |         normalizer = u_en_text_normalizer_collection_3
136 |         test_cases = [
137 |             (
138 |                 'I want    to buy 300 cups of $10.7 coffee. OHOH',
139 |                 'i want to buy _3int_ cups of $_2float1_ coffee. ohoh',
140 |             ),
141 |         ]
142 |         self.unit_test(
143 |             normalizer=normalizer,
144 |             test_cases=test_cases,
145 |         )
146 | 
147 |     def test_u_zh_en_text_normalizer_collection_1(self):
148 |         normalizer = u_zh_en_text_normalizer_collection_1
149 |         test_cases = [
150 |             (
151 |                 '我在85.333度C買了a cup of900-1000元的咖啡《ohoh》？？',
152 |                 '我在_float_度c買了a cup of_int_-_int_元的咖啡<ohoh>??',
153 |             ),
154 |             (
155 |                 '+1~~',
156 |                 '+_int_~~',
157 |             ),
158 |             (
159 |                 '，買5～80年五門車～～',
160 |                 ',買_int_~_int_年五門車~~',
161 |             ),
162 |             (
163 |                 '<><>@@##',
164 |                 '<><>@@##',
165 |             ),
166 |         ]
167 |         self.unit_test(
168 |             normalizer=normalizer,
169 |             test_cases=test_cases,
170 |         )
171 | 
172 |     def test_u_zh_en_text_normalizer_collection_2(self):
173 |         normalizer = u_zh_en_text_normalizer_collection_2
174 |         test_cases = [
175 |             (
176 |                 '我在85.333度C買了a cup of900-1000元的咖啡《ohoh》？？',
177 |                 '我在_2float3_度c買了a cup of_3int_-_4int_元的咖啡<ohoh>??',
178 |             ),
179 |             (
180 |                 '+1~~',
181 |                 '+_1int_~~',
182 |             ),
183 |             (
184 |                 '，買5～80年五門車～～',
185 |                 ',買_1int_~_2int_年五門車~~',
186 |             ),
187 |             (
188 |                 '<><>@@##',
189 |                 '<><>@@##',
190 |             ),
191 |         ]
192 |         self.unit_test(
193 |             normalizer=normalizer,
194 |             test_cases=test_cases,
195 |         )
196 | 
197 |     def test_u_zh_en_text_normalizer_collection_3(self):
198 |         normalizer = u_zh_en_text_normalizer_collection_3
199 |         test_cases = [
200 |             (
201 |                 '我在85.333度C買了a cup of900～1000元的咖啡《ohoh》？？',
202 |                 '我在_float_度c買了a cup of_int_-_int_元的咖啡 ohoh ',
203 |             ),
204 |             (
205 |                 '+1~~',
206 |                 '+_int_--',
207 |             ),
208 |             (
209 |                 '<><>@@##',
210 |                 ' ',
211 |             ),
212 |         ]
213 |         self.unit_test(
214 |             normalizer=normalizer,
215 |             test_cases=test_cases,
216 |         )
217 | 
218 |     def test_u_zh_en_text_normalizer_collection_4(self):
219 |         normalizer = u_zh_en_text_normalizer_collection_4
220 |         test_cases = [
221 |             (
222 |                 '我在85.333度C買了a cup of900～1000元的咖啡《OhoH》？？',
223 |                 '我在_2float3_度c買了a cup of_3int_-_4int_元的咖啡 ohoh ',
224 |             ),
225 |             (
226 |                 '+1~~',
227 |                 '+_1int_--',
228 |             ),
229 |             (
230 |                 '，買5～80年五門車～～',
231 |                 ',買_1int_-_2int_年五門車--',
232 |             ),
233 |             (
234 |                 '<><>@@##',
235 |                 ' ',
236 |             ),
237 |         ]
238 |         self.unit_test(
239 |             normalizer=normalizer,
240 |             test_cases=test_cases,
241 |         )
242 | 


--------------------------------------------------------------------------------
/text_normalizer/collection/unicode_mapping.py:
--------------------------------------------------------------------------------
  1 | from .base_collection import BaseCollection
  2 | from ..library import (
  3 |     whitespace_reduction_text_normalizer,
  4 |     eng_lowercase_text_normalizer,
  5 |     float_text_normalizer,
  6 |     int_text_normalizer,
  7 |     int_with_digit_text_normalizer,
  8 |     float_with_digit_text_normalizer,
  9 |     unicode__chinese_characters_text_normalizer,
 10 |     unicode__chinese_characters_and_digits_text_normalizer,
 11 |     unicode__english_digits_and_full_punctuations_text_normalizer,
 12 |     unicode__chinese_english_digits_and_full_punctuations_text_normalizer,
 13 |     unicode__chinese_english_digits_and_simplified_punctuations_1_text_normalizer,
 14 | )
 15 | 
 16 | 
 17 | u_zh_text_normalizer_collection_1 = BaseCollection()
 18 | u_zh_text_normalizer_collection_1.add_text_normalizers(
 19 |     text_normalizers=[
 20 |         unicode__chinese_characters_text_normalizer,
 21 |         unicode__chinese_characters_and_digits_text_normalizer,
 22 |         whitespace_reduction_text_normalizer,
 23 |     ],
 24 | )
 25 | 
 26 | u_zh_text_normalizer_collection_2 = BaseCollection()
 27 | u_zh_text_normalizer_collection_2.add_text_normalizers(
 28 |     text_normalizers=[
 29 |         unicode__chinese_characters_and_digits_text_normalizer,
 30 |         whitespace_reduction_text_normalizer,
 31 |     ],
 32 | )
 33 | 
 34 | u_zh_text_normalizer_collection_3 = BaseCollection()
 35 | u_zh_text_normalizer_collection_3.add_text_normalizers(
 36 |     text_normalizers=[
 37 |         unicode__chinese_characters_and_digits_text_normalizer,
 38 |         float_text_normalizer,
 39 |         int_text_normalizer,
 40 |         whitespace_reduction_text_normalizer,
 41 |     ],
 42 | )
 43 | 
 44 | u_zh_text_normalizer_collection_4 = BaseCollection()
 45 | u_zh_text_normalizer_collection_4.add_text_normalizers(
 46 |     text_normalizers=[
 47 |         unicode__chinese_characters_and_digits_text_normalizer,
 48 |         float_with_digit_text_normalizer,
 49 |         int_with_digit_text_normalizer,
 50 |         whitespace_reduction_text_normalizer,
 51 |     ],
 52 | )
 53 | 
 54 | u_en_text_normalizer_collection_1 = BaseCollection()
 55 | u_en_text_normalizer_collection_1.add_text_normalizers(
 56 |     text_normalizers=[
 57 |         unicode__english_digits_and_full_punctuations_text_normalizer,
 58 |         eng_lowercase_text_normalizer,
 59 |         whitespace_reduction_text_normalizer,
 60 |     ],
 61 | )
 62 | 
 63 | u_en_text_normalizer_collection_2 = BaseCollection()
 64 | u_en_text_normalizer_collection_2.add_text_normalizers(
 65 |     text_normalizers=[
 66 |         unicode__english_digits_and_full_punctuations_text_normalizer,
 67 |         eng_lowercase_text_normalizer,
 68 |         float_text_normalizer,
 69 |         int_text_normalizer,
 70 |         whitespace_reduction_text_normalizer,
 71 |     ],
 72 | )
 73 | 
 74 | u_en_text_normalizer_collection_3 = BaseCollection()
 75 | u_en_text_normalizer_collection_3.add_text_normalizers(
 76 |     text_normalizers=[
 77 |         unicode__english_digits_and_full_punctuations_text_normalizer,
 78 |         eng_lowercase_text_normalizer,
 79 |         float_with_digit_text_normalizer,
 80 |         int_with_digit_text_normalizer,
 81 |         whitespace_reduction_text_normalizer,
 82 |     ],
 83 | )
 84 | 
 85 | u_zh_en_text_normalizer_collection_1 = BaseCollection()
 86 | u_zh_en_text_normalizer_collection_1.add_text_normalizers(
 87 |     text_normalizers=[
 88 |         unicode__chinese_english_digits_and_full_punctuations_text_normalizer,
 89 |         eng_lowercase_text_normalizer,
 90 |         float_text_normalizer,
 91 |         int_text_normalizer,
 92 |         whitespace_reduction_text_normalizer,
 93 |     ],
 94 | )
 95 | 
 96 | u_zh_en_text_normalizer_collection_2 = BaseCollection()
 97 | u_zh_en_text_normalizer_collection_2.add_text_normalizers(
 98 |     text_normalizers=[
 99 |         unicode__chinese_english_digits_and_full_punctuations_text_normalizer,
100 |         eng_lowercase_text_normalizer,
101 |         float_with_digit_text_normalizer,
102 |         int_with_digit_text_normalizer,
103 |         whitespace_reduction_text_normalizer,
104 |     ],
105 | )
106 | 
107 | u_zh_en_text_normalizer_collection_3 = BaseCollection()
108 | u_zh_en_text_normalizer_collection_3.add_text_normalizers(
109 |     text_normalizers=[
110 |         unicode__chinese_english_digits_and_simplified_punctuations_1_text_normalizer,
111 |         eng_lowercase_text_normalizer,
112 |         float_text_normalizer,
113 |         int_text_normalizer,
114 |         whitespace_reduction_text_normalizer,
115 |     ],
116 | )
117 | 
118 | u_zh_en_text_normalizer_collection_4 = BaseCollection()
119 | u_zh_en_text_normalizer_collection_4.add_text_normalizers(
120 |     text_normalizers=[
121 |         unicode__chinese_english_digits_and_simplified_punctuations_1_text_normalizer,
122 |         eng_lowercase_text_normalizer,
123 |         float_with_digit_text_normalizer,
124 |         int_with_digit_text_normalizer,
125 |         whitespace_reduction_text_normalizer,
126 |     ],
127 | )
128 | 


--------------------------------------------------------------------------------
/text_normalizer/data/punctuation/punctuation_mapping_0221.csv:
--------------------------------------------------------------------------------
 1 | "before","after"
 2 | "（ ( ❨ ﹙ （ ︵","("
 3 | "） ) ❩ ﹚ ） ︶",")"
 4 | """ 『 「 “ ‘ ' ` ﹁ ﹃ 〝 〃 ’ "" 』 」 ” ’' ﹂ ﹄ 〞",""""
 5 | "[ 〔 〘 【 ﹝ 【 〔  ︹  ︻","["
 6 | "] 〕 〙 】 ﹞ 】 〕 ︺ ︼","]"
 7 | "{ ❴ ﹛ ｛ ︷","{"
 8 | "} ❵ ﹜ ｝ ︸","}"
 9 | "< ⟨ 《 〈 〈 ＜ ﹤ ≦ ︽ ︿","<"
10 | "> ⟩ 》 〉 ＞ ﹥ ≧ ︾ ﹀",">"
11 | ", ， , 、 , ﹐",","
12 | "- — ― ── ﹣ – — ╴ ¯ ￣ ﹉ ﹊ ﹋ ﹌","-"
13 | "~ ～ ~","~"
14 | "! ！","!"
15 | ". 。 ‧ ． ﹒ ˙ ·","."
16 | "... …","..."
17 | ": ： ﹕ ︰",":"
18 | " ； ﹔",";"
19 | "? ？","?"
20 | "+ ＋ ＋ ﹢","+"
21 | "% ％","%"
22 | "* ＊ ×  ╳","*"
23 | "| ｜ ︱ ︳ ∣ ︴","|"
24 | "/ ∕ ╱ ／ ∥","/"
25 | "\ ﹨ ╲ ＼ ﹨","\"
26 | "＃ #","#"
27 | "＄ $","$"
28 | "＠ @","@"
29 | "＆ &","&"
30 | "= ﹦ ＝ ≡ ＝ ≒","="
31 | "﹍ ﹎ ﹏ _","_"
32 | 


--------------------------------------------------------------------------------
/text_normalizer/data/punctuation/punctuation_mapping_0221_simplified.csv:
--------------------------------------------------------------------------------
 1 | "before","after"
 2 | "（ ( ❨ ﹙ （ ︵ [ 〔 〘 【 ﹝ 【 〔  ︹  ︻ < ⟨ 《 〈 〈 ＜ ﹤ ≦ ︽ ︿ { ❴ ﹛ ｛ ︷","("
 3 | "） ) ❩ ﹚ ） ︶ ] 〕 〙 】 ﹞ 】 〕 ︺ ︼ > ⟩ 》 〉 ＞ ﹥ ≧ ︾ ﹀ } ❵ ﹜ ｝ ︸",")"
 4 | """ 『 「 “ ‘ ' ` ﹁ ﹃ 〝 〃 ’ "" 』 」 ” ’' ﹂ ﹄ 〞",""""
 5 | ", ， , 、 , ﹐ / ∕ ╱ ／ ∥ \ ﹨ ╲ ＼ ﹨ | ｜ ︱ ︳ ∣ ︴",","
 6 | "-  — ― ── ﹣ – — ╴ ¯ ￣ ﹉ ﹊ ﹋  ﹌ ~ ～","-"
 7 | "! ！","!"
 8 | ". 。 ‧ ． ﹒ ˙ ·","."
 9 | "... …","..."
10 | ": ： ﹕ ︰",":"
11 | " ； ﹔",";"
12 | "? ？","?"
13 | "+ ＋ ＋ ﹢","+"
14 | "% ％","%"
15 | "* ＊ ×  ╳","*"
16 | "＃ #","#"
17 | "＄ $","$"
18 | "＠ @","@"
19 | "＆ &","&"
20 | "= ﹦ ＝ ≡ ＝ ≒","="
21 | "﹍ ﹎ ﹏ _","_"
22 | 


--------------------------------------------------------------------------------
/text_normalizer/data/unicode/chinese_characters_and_digits.txt:
--------------------------------------------------------------------------------
1 | 0030-0039:one2one(HalfWidth Numbers)
2 | FF10-FF19:one2one(FullWidth Numbers)
3 | 4E00-9FFF:one2one(CJK Unified Ideographs)
4 | F900-FAFF:one2one(CJK Compatibility Ideographs)
5 | 002E:002E(.)


--------------------------------------------------------------------------------
/text_normalizer/data/unicode/chinese_characters_only.txt:
--------------------------------------------------------------------------------
1 | 4E00-9FFF:one2one(CJK Unified Ideographs)
2 | F900-FAFF:one2one(CJK Compatibility Ideographs)


--------------------------------------------------------------------------------
/text_normalizer/data/unicode/chinese_english_digits.txt:
--------------------------------------------------------------------------------
1 | 0030-0039:one2one(HalfWidth Numbers)
2 | 0041-005A:one2one(HalfWidth Uppercase English Characters)
3 | 0061-007A:one2one(HalfWidth Lower English Characters)
4 | FF10-FF19:one2one(FullWidth Numbers)
5 | FF21-FF2A:one2one(FullWidth Uppercase English Characters)
6 | FF41-FF5A:one2one(FullWidth Lower English Characters)
7 | 4E00-9FFF:one2one(CJK Unified Ideographs)
8 | F900-FAFF:one2one(CJK Compatibility Ideographs)


--------------------------------------------------------------------------------
/text_normalizer/data/unicode/chinese_english_digits_and_full_punctuations.txt:
--------------------------------------------------------------------------------
 1 | 0030-0039:one2one(HalfWidth Numbers)
 2 | 0041-005A:one2one(HalfWidth Uppercase English Characters)
 3 | 0061-007A:one2one(HalfWidth Lower English Characters)
 4 | FF10-FF19:one2one(FullWidth Numbers)
 5 | FF21-FF2A:one2one(FullWidth Uppercase English Characters)
 6 | FF41-FF5A:one2one(FullWidth Lower English Characters)
 7 | 4E00-9FFF:one2one(CJK Unified Ideographs)
 8 | F900-FAFF:one2one(CJK Compatibility Ideographs)
 9 | FF01 01C3 0021:0021(!)
10 | 3003 300C 300D 300E 300F 201C 201D 201F FF62 FF63 FF02 0022:0022(")
11 | FF03 0023:0023(#)
12 | 1F4B2 FF04 0024:0024($)
13 | FF05 0025:0025(%)
14 | 1F674 FF06 0026:0026(&)
15 | 2018 2019 FF07 0027:0027(')
16 | FF5F FF08 0028:0028(()
17 | FF60 FF09 0029:0029())
18 | 2217 FF0A 002A:002A(*)
19 | FF0B 002B:002B(+)
20 | 3001 201A FF64 FF0C 002C:002C(,)
21 | 2010 23BA 23BB 23BC 23BD FF0D 002D:002D(-)
22 | 3002 302A 302B 302C 302D 302E 2218 2219 FF65 FF61 FF0E 002E:002E(.)
23 | 27CB 2215 2044 0338 2215 FF0F 002F:002F(/)
24 | 302F 0589 05C3 A789 2236 FF1A 003A:003A(:)
25 | 037E FF1B 003B:003B(;)
26 | 3008 300A 2039 227A 2329 FF1C 003C:003C(<)
27 | 2261 10190 A78A FF1D 003D:003D(=)
28 | 3009 300B 203A 227B 232A FF1E 003E:003E(>)
29 | 203D FF1F 003F:003F(?)
30 | FF20 0040:0040(@)
31 | 3010 3016 FF3B 005B:005B([)
32 | 20E5 2216 FF3C 005C:005C(\)
33 | 3011 3017 FF3D 005D:005D(])
34 | 2038 FF3E 005E:005E(^)
35 | 02CD 0331 0332 FF3F 005F:005F(_)
36 | 2035 FF40 0060:0060(`)
37 | FF5B 007B:007B({)
38 | 2223 23B8 23B9 23D0 FF5C 007C:007C(|)
39 | FF5D 007D:007D(})
40 | 301C 2053 02DC 0303 223C 223D 223E 223F FF5E 007E:007E(~)


--------------------------------------------------------------------------------
/text_normalizer/data/unicode/chinese_english_digits_and_simplified_punctuations_1.txt:
--------------------------------------------------------------------------------
 1 | 0030-0039:one2one(HalfWidth Numbers)
 2 | 0041-005A:one2one(HalfWidth Uppercase English Characters)
 3 | 0061-007A:one2one(HalfWidth Lower English Characters)
 4 | FF10-FF19:one2one(FullWidth Numbers)
 5 | FF21-FF2A:one2one(FullWidth Uppercase English Characters)
 6 | FF41-FF5A:one2one(FullWidth Lower English Characters)
 7 | 4E00-9FFF:one2one(CJK Unified Ideographs)
 8 | F900-FAFF:one2one(CJK Compatibility Ideographs)
 9 | 1F4B2 FF04 0024:0024($)
10 | FF0B 002B:002B(+)
11 | FF05 0025:0025(%)
12 | 3001 201A FF64 FF0C 002C 037E FF1B 003B 27CB 2215 2044 0338 2215 FF0F 002F 2223 23B8 23B9 23D0 FF5C 007C 1F674 FF06 0026:002C(separation symbols)
13 | 2010 23BA 23BB 23BC 23BD FF0D 002D 301C 2053 02DC 0303 223C 223D 223E 223F FF5E 007E:002D(to)
14 | 3002 302A 302B 302C 302D 302E 2218 2219 FF65 FF61 FF0E 002E:002E(.)


--------------------------------------------------------------------------------
/text_normalizer/data/unicode/english_characters_and_digits.txt:
--------------------------------------------------------------------------------
1 | 0030-0039:one2one(HalfWidth Numbers)
2 | 0041-005A:one2one(HalfWidth Uppercase English Characters)
3 | 0061-007A:one2one(HalfWidth Lower English Characters)
4 | FF10-FF19:one2one(FullWidth Numbers)
5 | FF21-FF2A:one2one(FullWidth Uppercase English Characters)
6 | FF41-FF5A:one2one(FullWidth Lower English Characters)


--------------------------------------------------------------------------------
/text_normalizer/data/unicode/english_digits_and_full_punctuations.txt:
--------------------------------------------------------------------------------
 1 | 0030-0039:one2one(HalfWidth Numbers)
 2 | 0041-005A:one2one(HalfWidth Uppercase English Characters)
 3 | 0061-007A:one2one(HalfWidth Lower English Characters)
 4 | FF10-FF19:one2one(FullWidth Numbers)
 5 | FF21-FF2A:one2one(FullWidth Uppercase English Characters)
 6 | FF41-FF5A:one2one(FullWidth Lower English Characters)
 7 | FF01 01C3 0021:0021(!)
 8 | 3003 300C 300D 300E 300F 201C 201D 201F FF62 FF63 FF02 0022:0022(")
 9 | FF03 0023:0023(#)
10 | 1F4B2 FF04 0024:0024($)
11 | FF05 0025:0025(%)
12 | 1F674 FF06 0026:0026(&)
13 | 2018 2019 FF07 0027:0027(')
14 | FF5F FF08 0028:0028(()
15 | FF60 FF09 0029:0029())
16 | 2217 FF0A 002A:002A(*)
17 | FF0B 002B:002B(+)
18 | 3001 201A FF64 FF0C 002C:002C(,)
19 | 2010 23BA 23BB 23BC 23BD FF0D 002D:002D(-)
20 | 3002 302A 302B 302C 302D 302E 2218 2219 FF65 FF61 FF0E 002E:002E(.)
21 | 27CB 2215 2044 0338 2215 FF0F 002F:002F(/)
22 | 302F 0589 05C3 A789 2236 FF1A 003A:003A(:)
23 | 037E FF1B 003B:003B(;)
24 | 3008 300A 2039 227A 2329 FF1C 003C:003C(<)
25 | 2261 10190 A78A FF1D 003D:003D(=)
26 | 3009 300B 203A 227B 232A FF1E 003E:003E(>)
27 | 203D FF1F 003F:003F(?)
28 | FF20 0040:0040(@)
29 | 3010 3016 FF3B 005B:005B([)
30 | 20E5 2216 FF3C 005C:005C(\)
31 | 3011 3017 FF3D 005D:005D(])
32 | 2038 FF3E 005E:005E(^)
33 | 02CD 0331 0332 FF3F 005F:005F(_)
34 | 2035 FF40 0060:0060(`)
35 | FF5B 007B:007B({)
36 | 2223 23B8 23B9 23D0 FF5C 007C:007C(|)
37 | FF5D 007D:007D(})
38 | 301C 2053 02DC 0303 223C 223D 223E 223F FF5E 007E:007E(~)


--------------------------------------------------------------------------------
/text_normalizer/factory/__init__.py:
--------------------------------------------------------------------------------
1 | from .eng_lowercase import EngLowercase  # noqa
2 | from .identity import Identity  # noqa
3 | from .number_token import NumberToken  # noqa
4 | from .punctuation_mapping import PunctuationMapping  # noqa
5 | from .replace_pattern_with_token import ReplacePatternWithToken  # noqa
6 | from .strip import Strip  # noqa
7 | from .unicode_mapping import UnicodeMapping  # noqa
8 | 


--------------------------------------------------------------------------------
/text_normalizer/factory/base_factory.py:
--------------------------------------------------------------------------------
 1 | from typing import List
 2 | 
 3 | 
 4 | class BaseFactory(object):
 5 | 
 6 |     def __init__(
 7 |             self,
 8 |             denormalizable: bool = False,
 9 |             name: str = None,
10 |         ) -> None:
11 |         self.denormalizable = denormalizable
12 |         if name is None:
13 |             self.name = self.__class__.__name__
14 |         else:
15 |             self.name = name
16 | 
17 |     def normalize(
18 |             self,
19 |             sentence: str,
20 |         ) -> (str, List[dict]):
21 |         raise NotImplementedError
22 | 
23 |     def denormalize(
24 |             self,
25 |             sentence: str,
26 |             meta: dict = None,
27 |         ) -> str:
28 |         '''
29 |         If the text normalizer is denormalizable, then this method should be implemented.
30 |         '''
31 |         if not self.denormalizable:
32 |             return sentence
33 | 
34 |     # def ldenormalize(
35 |     #         self,
36 |     #         sentence: List[str],
37 |     #         meta: dict = None,
38 |     #     ) -> str:
39 |     #     if not self.denormalizable:
40 |     #         return sentence
41 | 


--------------------------------------------------------------------------------
/text_normalizer/factory/eng_lowercase.py:
--------------------------------------------------------------------------------
 1 | from typing import List, Dict, Tuple
 2 | import re
 3 | 
 4 | from .base_factory import BaseFactory
 5 | 
 6 | 
 7 | class EngLowercase(BaseFactory):
 8 | 
 9 |     def __init__(self, name='eng_lowercase'):
10 |         super().__init__(name=name, denormalizable=True)
11 |         self.fullwidth_uppercase = "ＡＢＣＤＥＦＧＨＩＪＫＬＭＮＯＰＱＲＳＴＵＶＷＸＹＺ"
12 |         self.fullwidth_lowercase = "ａｂｃｄｅｆｇｈｉｊｋｌｍｎｏｐｑｒｓｔｕｖｗｘｙｚ"
13 |         self.halfwidth_uppercase = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
14 |         self.halfwidth_lowercase = "abcdefghijklmnopqrstuvwxyz"
15 | 
16 |         self.pattern = "[a-zA-Z{}{}]+".format(
17 |             self.fullwidth_uppercase,
18 |             self.fullwidth_lowercase,
19 |         )
20 |         self.findall_prog = re.compile(self.pattern)
21 |         self.mapping_table = self.gen_table()
22 | 
23 |     def gen_table(self) -> Dict[str, str]:
24 |         table = {}
25 |         for index in range(26):
26 |             table[self.fullwidth_uppercase[index]] = \
27 |                 self.halfwidth_lowercase[index]
28 |             table[self.fullwidth_lowercase[index]] = \
29 |                 self.halfwidth_lowercase[index]
30 |             table[self.halfwidth_uppercase[index]] = \
31 |                 self.halfwidth_lowercase[index]
32 |         return table
33 | 
34 |     def lowercase(self, sentence: str) -> str:
35 |         output = []
36 |         for char in sentence:
37 |             if char in self.mapping_table:
38 |                 output.append(self.mapping_table[char])
39 |             else:
40 |                 output.append(char)
41 |         return ''.join(output)
42 | 
43 |     def normalize(
44 |             self,
45 |             sentence: str,
46 |         ) -> Tuple[str, List[dict]]:
47 |         eng_words = self.findall_prog.findall(sentence)
48 |         if len(eng_words) == 0:
49 |             return sentence, None
50 |         else:
51 |             meta = []
52 |             for eng_word in eng_words:
53 |                 meta.append(
54 |                     {
55 |                         'before': eng_word,
56 |                         'after': self.lowercase(eng_word),
57 |                     },
58 |                 )
59 |             return self.lowercase(sentence), meta
60 | 
61 |     def denormalize(
62 |             self,
63 |             sentence: str,
64 |             meta: List[dict] = None,
65 |         ) -> str:
66 |         if (not self.denormalizable) or (meta is None):
67 |             # Case1: self.denormalizable = False
68 |             return sentence
69 |         else:
70 |             begin_index = 0
71 |             output = []
72 |             for single_meta in meta:
73 |                 start = sentence.find(single_meta['after'], begin_index)
74 |                 if start != -1:
75 |                     if begin_index != start:
76 |                         output.append(sentence[begin_index: start])
77 |                         begin_index = start
78 |                     output.append(single_meta['before'])
79 |                     begin_index += len(single_meta['before'])
80 |             if begin_index != len(sentence):
81 |                 output.append(sentence[begin_index:])
82 |             return ''.join(output)
83 | 


--------------------------------------------------------------------------------
/text_normalizer/factory/identity.py:
--------------------------------------------------------------------------------
 1 | from typing import List
 2 | 
 3 | from .base_factory import BaseFactory
 4 | 
 5 | 
 6 | class Identity(BaseFactory):
 7 | 
 8 |     def __init__(self):
 9 |         super().__init__(name='identity', denormalizable=False)
10 | 
11 |     def normalize(
12 |             self,
13 |             sentence: str,
14 |         ) -> (str, List[dict]):
15 |         return sentence, None
16 | 


--------------------------------------------------------------------------------
/text_normalizer/factory/number_token.py:
--------------------------------------------------------------------------------
  1 | from typing import List
  2 | import re
  3 | 
  4 | from .base_factory import BaseFactory
  5 | 
  6 | 
  7 | INT_PATTERN = re.compile(r"[0-9０１２３４５６７８９]+(?!float|\_|\d)")
  8 | FLOAT_PATTERN = re.compile(
  9 |     r"(?<!\.|\d)[0-9０１２３４５６７８９]+\.[0-9０１２３４５６７８９]+(?!\.|\d)",
 10 | )
 11 | 
 12 | 
 13 | def gen_float_token_with_digit(floats: List[str], token: str = "_{}float{}_"):
 14 |     output = []
 15 |     for float_str in floats:
 16 |         digit_lst = INT_PATTERN.findall(float_str)
 17 |         float_token = token.format(len(digit_lst[0]), len(digit_lst[1]))
 18 |         output.append(float_token)
 19 |     return output
 20 | 
 21 | 
 22 | def gen_int_token_with_digit(ints: List[str], token: str = "_{}int_"):
 23 |     output = []
 24 |     for int_str in ints:
 25 |         int_token = token.format(len(int_str))
 26 |         output.append(int_token)
 27 |     return output
 28 | 
 29 | 
 30 | def sub_token_with_value_sequentially(
 31 |         sentence: str,
 32 |         token: str,
 33 |         value_list: List[str],
 34 |     ) -> str:
 35 |     split_prog = re.compile(
 36 |         '{}|{}'.format(
 37 |             token,
 38 |             token.strip(),
 39 |         ),
 40 |     )
 41 |     splited_sentence = split_prog.split(sentence)
 42 |     if len(splited_sentence) != len(value_list) + 1:
 43 |         raise ValueError(
 44 |             "Number of tokens in sentence should be equal to that of values",
 45 |             "original sentence = {}".format(sentence),
 46 |             "token = {}".format(token),
 47 |             "value_list = {}".format(value_list),
 48 |         )
 49 | 
 50 |     output_sent = []
 51 |     for i, segment in enumerate(splited_sentence):
 52 |         output_sent.append(segment)
 53 |         if i != len(splited_sentence) - 1:
 54 |             output_sent.append(value_list[i])
 55 |     return ''.join(output_sent)
 56 | 
 57 | 
 58 | CASES = {
 59 |     "_int_": {
 60 |         "pattern": INT_PATTERN,
 61 |     },
 62 |     "_float_": {
 63 |         "pattern": FLOAT_PATTERN,
 64 |     },
 65 |     "_{}int_": {
 66 |         "pattern": INT_PATTERN,
 67 |         "gen_token_with_digit": gen_int_token_with_digit,
 68 |     },
 69 |     "_{}float{}_": {
 70 |         "pattern": FLOAT_PATTERN,
 71 |         "gen_token_with_digit": gen_float_token_with_digit,
 72 |     },
 73 |     " _int_ ": {
 74 |         "pattern": INT_PATTERN,
 75 |     },
 76 |     " _float_ ": {
 77 |         "pattern": FLOAT_PATTERN,
 78 |     },
 79 |     " _{}int_ ": {
 80 |         "pattern": INT_PATTERN,
 81 |         "gen_token_with_digit": gen_int_token_with_digit,
 82 |     },
 83 |     " _{}float{}_ ": {
 84 |         "pattern": FLOAT_PATTERN,
 85 |         "gen_token_with_digit": gen_float_token_with_digit,
 86 |     },
 87 | }
 88 | 
 89 | 
 90 | class NumberToken(BaseFactory):
 91 | 
 92 |     def __init__(
 93 |             self,
 94 |             token: str,
 95 |             denormalizable: bool = True,
 96 |             name: str = None,
 97 |         ) -> None:
 98 |         super().__init__(name=name, denormalizable=denormalizable)
 99 |         if token not in CASES:
100 |             raise KeyError(
101 |                 "This case [{}] is not handled".format(token),
102 |                 "Handle cases {} only".format(CASES.keys()),
103 |             )
104 |         self.token = token
105 | 
106 |     def normalize(
107 |             self,
108 |             sentence: str,
109 |         ) -> (str, dict):
110 |         revised_sentence = CASES[self.token]["pattern"].sub(
111 |             repl=self.token,
112 |             string=sentence,
113 |         )
114 |         value_list = CASES[self.token]["pattern"].findall(string=sentence)
115 |         if "gen_token_with_digit" not in CASES[self.token]:
116 |             if not self.denormalizable:
117 |                 return revised_sentence, None
118 |             return revised_sentence, {self.token: value_list}
119 | 
120 |         #### token with digits ####
121 |         tokens_with_digit = CASES[self.token]["gen_token_with_digit"](
122 |             value_list,
123 |             token=self.token,
124 |         )
125 |         revised_sentence = sub_token_with_value_sequentially(
126 |             sentence=revised_sentence,
127 |             token=self.token,
128 |             value_list=tokens_with_digit,
129 |         )
130 |         if not self.denormalizable:
131 |             return revised_sentence, None
132 | 
133 |         meta = {}
134 |         for token, value in zip(tokens_with_digit, value_list):
135 |             if token in meta:
136 |                 meta[token].append(value)
137 |             else:
138 |                 meta[token] = [value]
139 |         return revised_sentence, meta
140 | 
141 |     def denormalize(
142 |             self,
143 |             sentence: str,
144 |             meta: dict = None,
145 |         ) -> str:
146 |         if meta is None:
147 |             meta = {}
148 |         if (not self.denormalizable) or (len(meta) == 0):
149 |             # Case1: self.denormalizable = False
150 |             return sentence
151 | 
152 |         for token, values in meta.items():
153 |             sentence = sub_token_with_value_sequentially(
154 |                 sentence=sentence,
155 |                 token=token,
156 |                 value_list=values,
157 |             )
158 |         return sentence
159 | 


--------------------------------------------------------------------------------
/text_normalizer/factory/punctuation_mapping.py:
--------------------------------------------------------------------------------
  1 | from typing import List, Dict
  2 | import re
  3 | 
  4 | import pandas as pd
  5 | from .base_factory import BaseFactory
  6 | 
  7 | SpecialCases = {
  8 |     '\\': '\\\\',
  9 | }
 10 | 
 11 | RevSpecialCases = {v: k for k, v in SpecialCases.items()}
 12 | 
 13 | 
 14 | class PunctuationMapping(BaseFactory):
 15 | 
 16 |     def __init__(
 17 |             self,
 18 |             normalization_table_path: str,
 19 |             denormalizable: bool = True,
 20 |             name: str = 'punctuation_normalizer',
 21 |         ) -> None:
 22 |         super().__init__(name=name, denormalizable=True)
 23 |         remove_space = re.compile(r"\s+")
 24 |         table_df = pd.read_csv(normalization_table_path).astype(str)
 25 |         for column in table_df.columns.tolist():
 26 |             table_df[column] = table_df[column].str.strip()
 27 |         table_dict = table_df.to_dict(orient='index')
 28 | 
 29 |         self.patterns = []
 30 |         for _, mapping in table_dict.items():
 31 |             cleaned_before_pattern = remove_space.sub(" ", mapping["before"])
 32 |             before_pattern_list = cleaned_before_pattern.split(" ")
 33 |             escaped_before_pattern_list = [
 34 |                 re.escape(pat) for pat in list(set(before_pattern_list))]
 35 |             escaped_after_pattern = re.escape(mapping["after"])
 36 |             self.patterns.append(
 37 |                 {
 38 |                     "normalization_pattern": re.compile(
 39 |                         r"{}".format("|".join(escaped_before_pattern_list)),
 40 |                     ),
 41 |                     "denormalization_pattern": re.compile(
 42 |                         r"{}".format(escaped_after_pattern),
 43 |                     ),
 44 |                     "replacement": mapping["after"],
 45 |                 },
 46 |             )
 47 | 
 48 |     def normalize(
 49 |             self,
 50 |             sentence: str,
 51 |         ) -> (str, List[Dict[str, List[str]]]):
 52 |         revised_sentence = sentence
 53 |         meta = []
 54 |         for pattern in self.patterns:
 55 |             if pattern["replacement"] in SpecialCases:
 56 |                 pattern["replacement"] = SpecialCases[pattern["replacement"]]
 57 | 
 58 |             revised_sentence = pattern["normalization_pattern"].sub(
 59 |                 repl=pattern["replacement"],
 60 |                 string=revised_sentence,
 61 |             )
 62 |             meta.append(
 63 |                 {
 64 |                     "before": pattern["normalization_pattern"].findall(
 65 |                         string=sentence,
 66 |                     ),
 67 |                     "after": pattern["replacement"],
 68 |                 },
 69 |             )
 70 |         if not self.denormalizable:
 71 |             return revised_sentence, None
 72 |         return revised_sentence, meta
 73 | 
 74 |     def denormalize(
 75 |             self,
 76 |             sentence: str,
 77 |             meta: List[Dict[str, List[str]]] = None,
 78 |         ) -> str:
 79 |         if (not self.denormalizable) or (meta is None):
 80 |             # Case1: self.denormalizable = False
 81 |             return sentence
 82 | 
 83 |         for single_meta, pattern in zip(meta[::-1], self.patterns[::-1]):
 84 |             if single_meta["after"] != pattern["replacement"]:
 85 |                 KeyError(
 86 |                     "WRONG META !!!",
 87 |                     "The AFTER token should be the same as REPLACEMENT in patterns",
 88 |                     "Now, AFTER token is {} and REPLACEMENT is {}".format(
 89 |                         single_meta["after"],
 90 |                         pattern["replacement"],
 91 |                     ),
 92 |                 )
 93 | 
 94 |             if pattern["replacement"] in RevSpecialCases:
 95 |                 pattern["replacement"] = RevSpecialCases[pattern["replacement"]]
 96 | 
 97 |             punct_to_be_denormalized = pattern["denormalization_pattern"].findall(
 98 |                 string=sentence,
 99 |             )
100 |             if len(punct_to_be_denormalized) != len(single_meta["before"]):
101 |                 raise KeyError(
102 |                     "The number of punctuation to be denormalized is not equal to",
103 |                     "the number of that in meta data",
104 |                     "# of punctuations to be denormalized = {}".format(
105 |                         len(punct_to_be_denormalized),
106 |                     ),
107 |                     "punctuations to be denormalized = {}".format(
108 |                         punct_to_be_denormalized,
109 |                     ),
110 |                     "punctuations in meta = {}".format(single_meta["before"]),
111 |                 )
112 | 
113 |             splited_sentence = pattern["denormalization_pattern"].split(sentence)
114 |             output_sentence = []
115 |             for idx, segment in enumerate(splited_sentence):
116 |                 output_sentence.append(segment)
117 |                 if idx != len(splited_sentence) - 1:
118 |                     output_sentence.append(single_meta["before"][idx])
119 |             sentence = ''.join(output_sentence)
120 |         return sentence
121 | 


--------------------------------------------------------------------------------
/text_normalizer/factory/replace_pattern_with_token.py:
--------------------------------------------------------------------------------
  1 | from typing import List
  2 | import re
  3 | 
  4 | from .base_factory import BaseFactory
  5 | 
  6 | 
  7 | class ReplacePatternWithToken(BaseFactory):
  8 | 
  9 |     def __init__(
 10 |             self,
 11 |             target_pattern: str,
 12 |             token: str,
 13 |             prefix_pattern: str = None,
 14 |             suffix_pattern: str = None,
 15 |             denormalizable: bool = False,
 16 |             name: str = None,
 17 |         ):
 18 |         super().__init__(name=name, denormalizable=denormalizable)
 19 |         self.token = token
 20 |         if prefix_pattern and suffix_pattern:
 21 |             self.findall_pattern = "(?:{})({})(?={})".format(
 22 |                 prefix_pattern,
 23 |                 target_pattern,
 24 |                 suffix_pattern,
 25 |             )
 26 |             self.sub_pattern = "({}){}(?={})".format(
 27 |                 prefix_pattern,
 28 |                 target_pattern,
 29 |                 suffix_pattern,
 30 |             )
 31 |             self.sub_replacement = "\g<1>{}".format(token)
 32 |         else:
 33 |             self.findall_pattern = target_pattern
 34 |             self.sub_pattern = target_pattern
 35 |             self.sub_replacement = token
 36 | 
 37 |         self.findall_prog = re.compile(self.findall_pattern)
 38 |         self.sub_prog = re.compile(self.sub_pattern)
 39 |         self.split_prog = re.compile(
 40 |             '{}|{}|{}|{}'.format(
 41 |                 self.token,
 42 |                 self.token.rstrip(),
 43 |                 self.token.lstrip(),
 44 |                 self.token.strip(),
 45 |             ),
 46 |         )
 47 | 
 48 |     def normalize(
 49 |             self,
 50 |             sentence: str,
 51 |         ) -> (str, List[dict]):
 52 |         revised_sentence = self.sub_prog.sub(
 53 |             repl=self.sub_replacement,
 54 |             string=sentence,
 55 |         )
 56 |         if self.denormalizable:
 57 |             meta = self.findall_prog.findall(
 58 |                 string=sentence,
 59 |             )
 60 |             return revised_sentence, {self.token: meta}
 61 |         else:
 62 |             return revised_sentence, None
 63 | 
 64 |     def denormalize(
 65 |             self,
 66 |             sentence: str,
 67 |             meta: dict = None,
 68 |         ) -> str:
 69 |         if not self.denormalizable:
 70 |             # Case1: self.denormalizable = False
 71 |             return sentence
 72 | 
 73 |         if self.token not in meta:
 74 |             # Case2: meta = {'a': ['XX', 'cc']}, 'a' != self.token
 75 |             raise KeyError(
 76 |                 'Wrong meta :{} !!!'.format(meta),
 77 |                 'Meta should be { %s: [...]}.' % self.token,
 78 |             )
 79 | 
 80 |         splited_sentence = self.split_prog.split(sentence)
 81 |         if (len(splited_sentence) == 1) and (len(meta[self.token]) == 0):
 82 |             # Case3: no token in sentence and meta is empty
 83 |             return sentence
 84 |         elif len(splited_sentence) - 1 != len(meta[self.token]):
 85 |             # Case4: # of token in sentence != # of token in meta
 86 |             raise ValueError(
 87 |                 '# of tokens in sentence is not equal to that in meta'
 88 |                 'sentence = {}'.format(sentence),
 89 |                 'meta = {}'.format(meta),
 90 |             )
 91 |         else:
 92 |             output_sentence = ''
 93 |             idx = 0
 94 |             for s_idx, segment in enumerate(splited_sentence, start=1):
 95 |                 output_sentence += segment
 96 |                 if s_idx != len(splited_sentence):
 97 |                     output_sentence += meta[self.token][idx]
 98 |                     idx += 1
 99 |             return output_sentence
100 | 
101 |     # def ldenormalize(
102 |     #         self,
103 |     #         sentence: List[str],
104 |     #         meta: dict = None,
105 |     #     ) -> List[str]:
106 | 
107 |     #     super().ldenormalize(sentence=sentence)
108 |     #     if self.token not in meta:
109 |     #         raise KeyError('Wrong meta :{} !!!'.format(meta))
110 | 
111 |     #     '''
112 |     #     Each segment should not contain more than one token.
113 |     #     '''
114 |     #     idx = 0
115 |     #     output_sentence = []
116 |     #     for segment in sentence:
117 |     #         if self.token in segment:
118 |     #             denormalized_segment = re.sub(self.token, meta[self.token][idx], segment)
119 |     #             output_sentence.append(denormalized_segment)
120 |     #             idx += 1
121 |     #         else:
122 |     #             output_sentence.append(segment)
123 |     #     return output_sentence
124 | 


--------------------------------------------------------------------------------
/text_normalizer/factory/strip.py:
--------------------------------------------------------------------------------
 1 | from typing import List
 2 | 
 3 | from .base_factory import BaseFactory
 4 | 
 5 | 
 6 | class Strip(BaseFactory):
 7 | 
 8 |     def __init__(
 9 |             self,
10 |             chars: List[str] = None,
11 |             direction: str = 'both',
12 |             name: str = 'strip',
13 |         ):
14 |         self.chars = chars
15 |         if self.chars is None:
16 |             self.chars_str = None
17 |         else:
18 |             self.chars_str = ''.join(chars)
19 |         if direction not in ['both', 'left', 'right']:
20 |             raise ValueError(
21 |                 'WRONG direction input! '
22 |                 'Direction has three options [both, left, right]',
23 |                 'Your input is {}'.format(direction),
24 |             )
25 |         else:
26 |             self.direction = direction
27 |         super().__init__(
28 |             name=name + '_' + self.direction + '_' + str(self.chars_str),
29 |             denormalizable=False,
30 |         )
31 | 
32 |     def normalize(
33 |             self,
34 |             sentence: str,
35 |         ) -> (str, List[dict]):
36 |         if self.direction == 'both':
37 |             return sentence.strip(self.chars_str), None
38 |         elif self.direction == 'left':
39 |             return sentence.lstrip(self.chars_str), None
40 |         elif self.direction == 'right':
41 |             return sentence.rstrip(self.chars_str), None
42 | 


--------------------------------------------------------------------------------
/text_normalizer/factory/test/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Yoctol/text-normalizer/3609c10cd229c08b4623531e82d2292fc370734c/text_normalizer/factory/test/__init__.py


--------------------------------------------------------------------------------
/text_normalizer/factory/test/example_punctuation_mapping.csv:
--------------------------------------------------------------------------------
1 | before,after
2 | （ ( ❨ ﹙ （,(
3 | ） ) ❩ ﹚ ）,)
4 | "," ，,","
5 | 


--------------------------------------------------------------------------------
/text_normalizer/factory/test/example_unicode_mapping.txt:
--------------------------------------------------------------------------------
1 | FF10-FF10:one2one(0)
2 | FF11 0031:0031(1)
3 | FF0C 002C:002C(,)


--------------------------------------------------------------------------------
/text_normalizer/factory/test/test_base_factory.py:
--------------------------------------------------------------------------------
 1 | from unittest import TestCase
 2 | from ..base_factory import BaseFactory
 3 | 
 4 | 
 5 | class TestBaseFactory(TestCase):
 6 | 
 7 |     def setUp(self):
 8 |         self.base_text_normalizer_class = BaseFactory()
 9 |         self.base_text_normalizer_class_with_name = BaseFactory(name='123')
10 | 
11 |     def test_attributes(self):
12 |         self.assertEqual(
13 |             {
14 |                 'denormalizable': False,
15 |                 'name': 'BaseFactory',
16 |             },
17 |             self.base_text_normalizer_class.__dict__,
18 |         )
19 | 
20 |         self.assertEqual(
21 |             {
22 |                 'denormalizable': False,
23 |                 'name': '123',
24 |             },
25 |             self.base_text_normalizer_class_with_name.__dict__,
26 |         )
27 | 
28 |     def test_denormalize(self):
29 |         self.assertEqual(
30 |             'HAHA',
31 |             self.base_text_normalizer_class.denormalize(sentence='HAHA'),
32 |         )
33 |         self.assertEqual(
34 |             'HAHA',
35 |             self.base_text_normalizer_class_with_name.denormalize(sentence='HAHA'),
36 |         )
37 | 


--------------------------------------------------------------------------------
/text_normalizer/factory/test/test_eng_lowercase.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | from unittest import TestCase
  3 | from ..eng_lowercase import EngLowercase
  4 | 
  5 | 
  6 | class EngLowercaseTestCase(TestCase):
  7 | 
  8 |     def setUp(self):
  9 |         self.eng_lowercase_text_normalizer = EngLowercase()
 10 | 
 11 |     def test_lowercase(self):
 12 |         test_cases = [
 13 |             (
 14 |                 "ＡＢＣＤＥＦＧＨＩＪＫＬＭＮＯＰＱＲＳＴＵＶＷＸＹＺ",
 15 |                 "abcdefghijklmnopqrstuvwxyz",
 16 |             ),
 17 |             (
 18 |                 "ａｂｃｄｅｆｇｈｉｊｋｌｍｎｏｐｑｒｓｔｕｖｗｘｙｚ",
 19 |                 "abcdefghijklmnopqrstuvwxyz",
 20 |             ),
 21 |             (
 22 |                 "ABCDEFGHIJKLMNOPQRSTUVWXYZ",
 23 |                 "abcdefghijklmnopqrstuvwxyz",
 24 |             ),
 25 |         ]
 26 |         for test_case in test_cases:
 27 |             with self.subTest(test_case=test_case):
 28 |                 self.assertEqual(
 29 |                     test_case[1],
 30 |                     self.eng_lowercase_text_normalizer.lowercase(
 31 |                         sentence=test_case[0],
 32 |                     ),
 33 |                 )
 34 | 
 35 |     def test_normalize_n_denormalize_0(self):
 36 |         test_cases = [
 37 |             (
 38 |                 "哈囉 AAB 123 Cddef 哈囉 >< ???",
 39 |                 (
 40 |                     "哈囉 aab 123 cddef 哈囉 >< ???",
 41 |                     [
 42 |                         {
 43 |                             "before": "AAB",
 44 |                             "after": "aab",
 45 |                         },
 46 |                         {
 47 |                             "before": "Cddef",
 48 |                             "after": "cddef",
 49 |                         },
 50 |                     ],
 51 |                 ),
 52 |                 "哈囉 AAB 123 Cddef 哈囉 >< ???",
 53 |             ),
 54 |             (
 55 |                 "AAB 123 哈囉 Cddef 456 ffecI",
 56 |                 (
 57 |                     "aab 123 哈囉 cddef 456 ffeci",
 58 |                     [
 59 |                         {
 60 |                             "before": "AAB",
 61 |                             "after": "aab",
 62 |                         },
 63 |                         {
 64 |                             "before": "Cddef",
 65 |                             "after": "cddef",
 66 |                         },
 67 |                         {
 68 |                             "before": "ffecI",
 69 |                             "after": "ffeci",
 70 |                         },
 71 |                     ],
 72 |                 ),
 73 |                 "AAB 123 哈囉 Cddef 456 ffecI",
 74 |             ),
 75 |             (
 76 |                 "家豪大大亂入吃雞排",
 77 |                 ("家豪大大亂入吃雞排", None),
 78 |                 "家豪大大亂入吃雞排",
 79 |             ),
 80 |             (
 81 |                 "ａbc",
 82 |                 (
 83 |                     "abc",
 84 |                     [
 85 |                         {
 86 |                             "before": "ａbc",
 87 |                             "after": "abc",
 88 |                         },
 89 |                     ],
 90 |                 ),
 91 |                 "ａbc",
 92 |             ),
 93 |         ]
 94 | 
 95 |         for test_case in test_cases:
 96 |             with self.subTest(
 97 |                 test_case="normalize {}".format(test_case[0]),
 98 |             ):
 99 |                 self.assertEqual(
100 |                     test_case[1],
101 |                     self.eng_lowercase_text_normalizer.normalize(
102 |                         sentence=test_case[0],
103 |                     ),
104 |                 )
105 |             with self.subTest(
106 |                 test_case="denormalize {}".format(test_case[0]),
107 |             ):
108 |                 self.assertEqual(
109 |                     test_case[2],
110 |                     self.eng_lowercase_text_normalizer.denormalize(
111 |                         sentence=test_case[1][0],
112 |                         meta=test_case[1][1],
113 |                     ),
114 |                 )
115 | 


--------------------------------------------------------------------------------
/text_normalizer/factory/test/test_identity.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from unittest import TestCase
 3 | from ..identity import Identity
 4 | 
 5 | 
 6 | class IdentityTestCase(TestCase):
 7 | 
 8 |     def setUp(self):
 9 |         self.identity_text_normalizer = Identity()
10 | 
11 |     def test_attributes(self):
12 |         self.assertEqual(
13 |             {
14 |                 'denormalizable': False,
15 |                 'name': 'identity',
16 |             },
17 |             self.identity_text_normalizer.__dict__,
18 |         )
19 | 
20 |     def test_normalize(self):
21 |         result = self.identity_text_normalizer.normalize(
22 |             '不管你測什麼 我都會回傳原本的句子給你 呵呵',
23 |         )
24 |         self.assertEqual(
25 |             ('不管你測什麼 我都會回傳原本的句子給你 呵呵', None),
26 |             result,
27 |         )
28 | 
29 |     def test_denormalize(self):
30 |         result = self.identity_text_normalizer.denormalize(
31 |             '不管你測什麼 我都會回傳原本的句子給你 呵呵',
32 |         )
33 |         self.assertEqual(
34 |             '不管你測什麼 我都會回傳原本的句子給你 呵呵',
35 |             result,
36 |         )
37 | 


--------------------------------------------------------------------------------
/text_normalizer/factory/test/test_number_token_test_tokenizer.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | from unittest import TestCase
  3 | from ..number_token import (
  4 |     gen_float_token_with_digit,
  5 |     gen_int_token_with_digit,
  6 |     sub_token_with_value_sequentially,
  7 |     NumberToken,
  8 | )
  9 | 
 10 | 
 11 | class NumberTokenTestCase(TestCase):
 12 | 
 13 |     def run_test_denormalizable(self, test_cases, normalizer):
 14 |         for test_case in test_cases:
 15 |             with self.subTest(test_case=test_case):
 16 |                 self.assertEqual(
 17 |                     test_case[1],
 18 |                     normalizer.normalize(test_case[0]),
 19 |                 )
 20 |                 self.assertEqual(
 21 |                     test_case[0],
 22 |                     normalizer.denormalize(
 23 |                         sentence=test_case[1][0],
 24 |                         meta=test_case[1][1],
 25 |                     ),
 26 |                 )
 27 | 
 28 |     def run_test_not_denormalizable(self, test_cases, normalizer):
 29 |         for test_case in test_cases:
 30 |             with self.subTest(test_case=test_case):
 31 |                 self.assertEqual(
 32 |                     test_case[1],
 33 |                     normalizer.normalize(test_case[0]),
 34 |                 )
 35 |                 self.assertEqual(
 36 |                     test_case[1][0],
 37 |                     normalizer.denormalize(
 38 |                         sentence=test_case[1][0],
 39 |                         meta=test_case[1][1],
 40 |                     ),
 41 |                 )
 42 | 
 43 |     def test_gen_float_token_with_digit(self):
 44 |         self.assertEqual(
 45 |             ["_1float1_", "_1float5_", "_3float4_", "_4float2_"],
 46 |             gen_float_token_with_digit(
 47 |                 ["2.0", "0.00003", "300.1113", "5000.05"]),
 48 |         )
 49 | 
 50 |     def test_gen_int_token_with_digit(self):
 51 |         self.assertEqual(
 52 |             ["_1int_", "_2int_", "_3int_", "_7int_"],
 53 |             gen_int_token_with_digit(["1", "20", "300", "5000.05"]),
 54 |         )
 55 | 
 56 |     def test_sub_token_with_value_sequentially(self):
 57 |         test_cases = [
 58 |             (
 59 |                 {
 60 |                     "sentence": "A@A@@A@@@AA",
 61 |                     "token": "A",
 62 |                     "value_list": ["1", "2", "3", "4", "5"],
 63 |                 },
 64 |                 "1@2@@3@@@45",
 65 |             ),
 66 |             (
 67 |                 {
 68 |                     "sentence": "來亂的",
 69 |                     "token": "bla",
 70 |                     "value_list": [],
 71 |                 },
 72 |                 "來亂的",
 73 |             ),
 74 |         ]
 75 |         for test_case in test_cases:
 76 |             with self.subTest(test_case=test_case):
 77 |                 self.assertEqual(
 78 |                     test_case[1],
 79 |                     sub_token_with_value_sequentially(**test_case[0]),
 80 |                 )
 81 | 
 82 |     def test_unhandle_case(self):
 83 |         with self.assertRaises(KeyError):
 84 |             NumberToken(token="_ohoh_")
 85 | 
 86 |     def test_pure_int(self):
 87 |         int_text_normalizer = NumberToken(token="_int_")
 88 |         test_cases = [
 89 |             ("123", ("_int_", {"_int_": ["123"]})),
 90 |             ("23.35", ("_int_._int_", {"_int_": ["23", "35"]})),
 91 |             ("23 0000", ("_int_ _int_", {"_int_": ["23", "0000"]})),
 92 |             ("OHOH 23", ("OHOH _int_", {"_int_": ["23"]})),
 93 |             ("122223333 OHOH", ("_int_ OHOH", {"_int_": ["122223333"]})),
 94 |             ("１００", ("_int_", {"_int_": ["１００"]})),
 95 |             ("3４0分", ("_int_分", {"_int_": ["3４0"]})),
 96 |             ("薄餡大大１個打10個", ("薄餡大大_int_個打_int_個", {"_int_": ["１", "10"]})),
 97 |             ("0８00-２２-44-６６",
 98 |                 ("_int_-_int_-_int_-_int_", {"_int_": ["0８00", "２２", "44", "６６"]})),
 99 |             ("來亂的", ("來亂的", {"_int_": []})),
100 |         ]
101 |         self.run_test_denormalizable(
102 |             normalizer=int_text_normalizer,
103 |             test_cases=test_cases,
104 |         )
105 | 
106 |     def test_pure_int_not_denormalizable(self):
107 |         int_text_normalizer_not_denormalizable = NumberToken(
108 |             token="_int_",
109 |             denormalizable=False,
110 |         )
111 |         test_cases = [
112 |             ("123", ("_int_", None)),
113 |             ("23.35", ("_int_._int_", None)),
114 |             ("23 0000", ("_int_ _int_", None)),
115 |             ("OHOH 23", ("OHOH _int_", None)),
116 |             ("122223333 OHOH", ("_int_ OHOH", None)),
117 |             ("１００", ("_int_", None)),
118 |             ("3４0分", ("_int_分", None)),
119 |             ("薄餡大大１個打10個", ("薄餡大大_int_個打_int_個", None)),
120 |             ("0８00-２２-44-６６", ("_int_-_int_-_int_-_int_", None)),
121 |             ("來亂的", ("來亂的", None)),
122 |         ]
123 |         self.run_test_not_denormalizable(
124 |             normalizer=int_text_normalizer_not_denormalizable,
125 |             test_cases=test_cases,
126 |         )
127 | 
128 |     def test_pure_float(self):
129 |         float_text_normalizer = NumberToken(token="_float_")
130 |         test_cases = [
131 |             ("49.3", ("_float_", {"_float_": ["49.3"]})),
132 |             ("12.33 456.0", ("_float_ _float_", {"_float_": ["12.33", "456.0"]})),
133 |             ("123", ("123", {"_float_": []})),
134 |             ("94.87分", ("_float_分", {"_float_": ["94.87"]})),
135 |             ("薄餡大大1.5個打10.7個",
136 |                 ("薄餡大大_float_個打_float_個", {"_float_": ["1.5", "10.7"]})),
137 |             ("123.456.789", ("123.456.789", {"_float_": []})),
138 |             ("1０0.0００", ("_float_", {"_float_": ["1０0.0００"]})),
139 |             ("９4.87分", ("_float_分", {"_float_": ["９4.87"]})),
140 |             ("薄餡大大1.5個打１０.７個",
141 |                 ("薄餡大大_float_個打_float_個", {"_float_": ["1.5", "１０.７"]})),
142 |             ("１２３.４５６.７８９", ("１２３.４５６.７８９", {"_float_": []})),
143 |             ("來亂的", ("來亂的", {"_float_": []})),
144 |         ]
145 |         self.run_test_denormalizable(
146 |             normalizer=float_text_normalizer,
147 |             test_cases=test_cases,
148 |         )
149 | 
150 |     def test_pure_float_not_denormalizable(self):
151 |         float_text_normalizer = NumberToken(
152 |             token="_float_",
153 |             denormalizable=False,
154 |         )
155 |         test_cases = [
156 |             ("49.3", ("_float_", None)),
157 |             ("12.33 456.0", ("_float_ _float_", None)),
158 |             ("123", ("123", None)),
159 |             ("94.87分", ("_float_分", None)),
160 |             ("薄餡大大1.5個打10.7個", ("薄餡大大_float_個打_float_個", None)),
161 |             ("123.456.789", ("123.456.789", None)),
162 |             ("1０0.0００", ("_float_", None)),
163 |             ("９4.87分", ("_float_分", None)),
164 |             ("薄餡大大1.5個打１０.７個", ("薄餡大大_float_個打_float_個", None)),
165 |             ("１２３.４５６.７８９", ("１２３.４５６.７８９", None)),
166 |             ("來亂的", ("來亂的", None)),
167 |         ]
168 |         self.run_test_not_denormalizable(
169 |             normalizer=float_text_normalizer,
170 |             test_cases=test_cases,
171 |         )
172 | 
173 |     def test_int_with_digit(self):
174 |         intd_text_normalizer = NumberToken(token="_{}int_")
175 |         test_cases = [
176 |             ("123", ("_3int_", {"_3int_": ["123"]})),
177 |             ("098765431389", ("_12int_", {"_12int_": ["098765431389"]})),
178 |             ("1 4567890103",
179 |                 ("_1int_ _10int_", {"_1int_": ["1"], "_10int_": ["4567890103"]})),
180 |             ("_12float733_", ("_12float733_", {})),
181 |             ("ohoh 000 _33float0_ 1",
182 |              ("ohoh _3int_ _33float0_ _1int_", {"_3int_": ["000"], "_1int_": ["1"]})),
183 |             ("123 345 678 901",
184 |                 ("_3int_ _3int_ _3int_ _3int_", {"_3int_": ["123", "345", "678", "901"]})),
185 |             ("１００", ("_3int_", {"_3int_": ["１００"]})),
186 |             ("3４0分", ("_3int_分", {"_3int_": ["3４0"]})),
187 |             ("薄餡大大１個打10個", ("薄餡大大_1int_個打_2int_個", {"_1int_": ["１"], "_2int_": ["10"]})),
188 |             ("0８00-２２-44-６６",
189 |                 ("_4int_-_2int_-_2int_-_2int_",
190 |                     {"_4int_": ["0８00"], "_2int_": ["２２", "44", "６６"]})),
191 |             ("來亂的", ("來亂的", {})),
192 |         ]
193 |         self.run_test_denormalizable(
194 |             test_cases=test_cases,
195 |             normalizer=intd_text_normalizer,
196 |         )
197 | 
198 |     def test_int_with_digit_not_denormalizable(self):
199 |         intd_text_normalizer = NumberToken(
200 |             token="_{}int_",
201 |             denormalizable=False,
202 |         )
203 |         test_cases = [
204 |             ("123", ("_3int_", None)),
205 |             ("098765431389", ("_12int_", None)),
206 |             ("1 4567890103", ("_1int_ _10int_", None)),
207 |             ("_12float733_", ("_12float733_", None)),
208 |             ("ohoh 000 _33float0_ 1", ("ohoh _3int_ _33float0_ _1int_", None)),
209 |             ("１００", ("_3int_", None)),
210 |             ("3４0分", ("_3int_分", None)),
211 |             ("薄餡大大１個打10個", ("薄餡大大_1int_個打_2int_個", None)),
212 |             ("0８00-２２-44-６６", ("_4int_-_2int_-_2int_-_2int_", None)),
213 |             ("來亂的", ("來亂的", None)),
214 |         ]
215 |         self.run_test_not_denormalizable(
216 |             test_cases=test_cases,
217 |             normalizer=intd_text_normalizer,
218 |         )
219 | 
220 |     def test_float_with_digit(self):
221 |         floatd_text_normalizer = NumberToken(
222 |             token="_{}float{}_",
223 |         )
224 |         test_cases = [
225 |             ("123.33", ("_3float2_", {"_3float2_": ["123.33"]})),
226 |             ("123", ("123", {})),
227 |             ("1234567890.123456789011",
228 |              ("_10float12_", {"_10float12_": ["1234567890.123456789011"]})),
229 |             ("1.3 224.00", ("_1float1_ _3float2_",
230 |                             {"_1float1_": ["1.3"], "_3float2_": ["224.00"]})),
231 |             ("12.3 34.5 67.8 90.1",
232 |                 ("_2float1_ _2float1_ _2float1_ _2float1_",
233 |                     {"_2float1_": ["12.3", "34.5", "67.8", "90.1"]})),
234 |             ("_3int_", ("_3int_", {})),
235 |             ("94.87分", ("_2float2_分", {"_2float2_": ["94.87"]})),
236 |             ("薄餡大大1.5個打10.7個",
237 |                 ("薄餡大大_1float1_個打_2float1_個",
238 |                     {"_1float1_": ["1.5"], "_2float1_": ["10.7"]})),
239 |             ("123.456.789", ("123.456.789", {})),
240 |             ("1０0.0００", ("_3float3_", {"_3float3_": ["1０0.0００"]})),
241 |             ("９4.87分", ("_2float2_分", {"_2float2_": ["９4.87"]})),
242 |             ("薄餡大大1.5個打１０.７個",
243 |                 ("薄餡大大_1float1_個打_2float1_個",
244 |                     {"_1float1_": ["1.5"], "_2float1_": ["１０.７"]})),
245 |             ("１２３.４５６.７８９", ("１２３.４５６.７８９", {})),
246 |             ("來亂的", ("來亂的", {})),
247 |         ]
248 |         self.run_test_denormalizable(
249 |             test_cases=test_cases,
250 |             normalizer=floatd_text_normalizer,
251 |         )
252 | 
253 |     def test_float_with_digit_not_denrmalizable(self):
254 |         floatd_text_normalizer = NumberToken(
255 |             token="_{}float{}_",
256 |             denormalizable=False,
257 |         )
258 |         test_cases = [
259 |             ("123.33", ("_3float2_", None)),
260 |             ("123", ("123", None)),
261 |             ("1234567890.123456789011", ("_10float12_", None)),
262 |             ("1.3 224.00", ("_1float1_ _3float2_", None)),
263 |             ("_3int_", ("_3int_", None)),
264 |             ("94.87分", ("_2float2_分", None)),
265 |             ("薄餡大大1.5個打10.7個", ("薄餡大大_1float1_個打_2float1_個", None)),
266 |             ("123.456.789", ("123.456.789", None)),
267 |             ("1０0.0００", ("_3float3_", None)),
268 |             ("９4.87分", ("_2float2_分", None)),
269 |             ("薄餡大大1.5個打１０.７個", ("薄餡大大_1float1_個打_2float1_個", None)),
270 |             ("１２３.４５６.７８９", ("１２３.４５６.７８９", None)),
271 |             ("來亂的", ("來亂的", None)),
272 |         ]
273 |         self.run_test_not_denormalizable(
274 |             test_cases=test_cases,
275 |             normalizer=floatd_text_normalizer,
276 |         )
277 | 
278 |     def test_int_text_normalizer_with_space(self):
279 |         int_text_normalizer_with_space = NumberToken(token=" _int_ ")
280 |         test_cases = [
281 |             ("12345678900", (" _int_ ", {" _int_ ": ["12345678900"]})),
282 |             ("340分", (" _int_ 分", {" _int_ ": ["340"]})),
283 |             ("薄餡大大1個打10個", ("薄餡大大 _int_ 個打 _int_ 個", {" _int_ ": ["1", "10"]})),
284 |             ("0800-22-44-66", (" _int_ - _int_ - _int_ - _int_ ",
285 |                                {" _int_ ": ["0800", "22", "44", "66"]})),
286 |             ("１００", (" _int_ ", {" _int_ ": ["１００"]})),
287 |             ("3４0分", (" _int_ 分", {" _int_ ": ["3４0"]})),
288 |             ("薄餡大大１個打10個", ("薄餡大大 _int_ 個打 _int_ 個", {" _int_ ": ["１", "10"]})),
289 |             ("0８00-２２-44-６６", (" _int_ - _int_ - _int_ - _int_ ",
290 |                                {" _int_ ": ["0８00", "２２", "44", "６６"]})),
291 |             ("家豪大大亂入", ("家豪大大亂入", {" _int_ ": []})),
292 |         ]
293 |         self.run_test_denormalizable(
294 |             test_cases=test_cases,
295 |             normalizer=int_text_normalizer_with_space,
296 |         )
297 |         with self.assertRaises(ValueError):
298 |             int_text_normalizer_with_space.denormalize(
299 |                 sentence=" _int_ 和 _int_ 這兩個日期都沒有雞排",
300 |                 meta={" _int_ ": ["12"]},
301 |             )
302 | 
303 |     def test_float_text_normalizer_with_space(self):
304 |         float_text_normalizer_with_space = NumberToken(token=" _float_ ")
305 |         test_cases = [
306 |             ("100.000", (" _float_ ", {" _float_ ": ["100.000"]})),
307 |             ("94.87分", (" _float_ 分", {" _float_ ": ["94.87"]})),
308 |             ("薄餡大大1.5個打10.7個", ("薄餡大大 _float_ 個打 _float_ 個", {" _float_ ": ["1.5", "10.7"]})),
309 |             ("123.456.789", ("123.456.789", {" _float_ ": []})),
310 |             ("1０0.0００", (" _float_ ", {" _float_ ": ["1０0.0００"]})),
311 |             ("９4.87分", (" _float_ 分", {" _float_ ": ["９4.87"]})),
312 |             ("薄餡大大1.5個打１０.７個", ("薄餡大大 _float_ 個打 _float_ 個", {" _float_ ": ["1.5", "１０.７"]})),
313 |             ("１２３.４５６.７８９", ("１２３.４５６.７８９", {" _float_ ": []})),
314 |             ("家豪大大亂入", ("家豪大大亂入", {" _float_ ": []})),
315 |         ]
316 |         self.run_test_denormalizable(
317 |             test_cases=test_cases,
318 |             normalizer=float_text_normalizer_with_space,
319 |         )
320 | 
321 |     def test_int_with_digit_n_space(self):
322 |         intd_text_normalizer_with_space = NumberToken(token=" _{}int_ ")
323 |         test_cases = [
324 |             ("123", (" _3int_ ", {" _3int_ ": ["123"]})),
325 |             ("098765431389", (" _12int_ ", {" _12int_ ": ["098765431389"]})),
326 |             ("1 4567890103",
327 |                 (" _1int_   _10int_ ",
328 |                     {" _1int_ ": ["1"], " _10int_ ": ["4567890103"]})),
329 |             ("_12float733_", ("_12float733_", {})),
330 |             ("ohoh 000 _33float0_ 1",
331 |                 ("ohoh  _3int_  _33float0_  _1int_ ",
332 |                     {" _3int_ ": ["000"], " _1int_ ": ["1"]})),
333 |             ("123 345 678 901",
334 |                 (" _3int_   _3int_   _3int_   _3int_ ",
335 |                     {" _3int_ ": ["123", "345", "678", "901"]})),
336 |             ("１００", (" _3int_ ", {" _3int_ ": ["１００"]})),
337 |             ("3４0分", (" _3int_ 分", {" _3int_ ": ["3４0"]})),
338 |             ("薄餡大大１個打10個",
339 |                 ("薄餡大大 _1int_ 個打 _2int_ 個",
340 |                     {" _1int_ ": ["１"], " _2int_ ": ["10"]})),
341 |             ("0８00-２２-44-６６",
342 |                 (" _4int_ - _2int_ - _2int_ - _2int_ ",
343 |                     {" _4int_ ": ["0８00"], " _2int_ ": ["２２", "44", "６６"]})),
344 |             ("來亂的", ("來亂的", {})),
345 |         ]
346 |         self.run_test_denormalizable(
347 |             test_cases=test_cases,
348 |             normalizer=intd_text_normalizer_with_space,
349 |         )
350 | 
351 |     def test_float_with_digit_n_space(self):
352 |         floatd_text_normalizer_with_space = NumberToken(
353 |             token=" _{}float{}_ ",
354 |         )
355 |         test_cases = [
356 |             ("123.33", (" _3float2_ ", {" _3float2_ ": ["123.33"]})),
357 |             ("123", ("123", {})),
358 |             ("1234567890.123456789011",
359 |              (" _10float12_ ", {" _10float12_ ": ["1234567890.123456789011"]})),
360 |             ("1.3 224.00", (" _1float1_   _3float2_ ",
361 |                             {" _1float1_ ": ["1.3"], " _3float2_ ": ["224.00"]})),
362 |             ("12.3 34.5 67.8 90.1",
363 |                 (" _2float1_   _2float1_   _2float1_   _2float1_ ",
364 |                     {" _2float1_ ": ["12.3", "34.5", "67.8", "90.1"]})),
365 |             ("_3int_", ("_3int_", {})),
366 |             ("94.87分", (" _2float2_ 分", {" _2float2_ ": ["94.87"]})),
367 |             ("薄餡大大1.5個打10.7個",
368 |                 ("薄餡大大 _1float1_ 個打 _2float1_ 個",
369 |                     {" _1float1_ ": ["1.5"], " _2float1_ ": ["10.7"]})),
370 |             ("123.456.789", ("123.456.789", {})),
371 |             ("1０0.0００", (" _3float3_ ", {" _3float3_ ": ["1０0.0００"]})),
372 |             ("９4.87分", (" _2float2_ 分", {" _2float2_ ": ["９4.87"]})),
373 |             ("薄餡大大1.5個打１０.７個",
374 |                 ("薄餡大大 _1float1_ 個打 _2float1_ 個",
375 |                     {" _1float1_ ": ["1.5"], " _2float1_ ": ["１０.７"]})),
376 |             ("１２３.４５６.７８９", ("１２３.４５６.７８９", {})),
377 |             ("來亂的", ("來亂的", {})),
378 |         ]
379 |         self.run_test_denormalizable(
380 |             test_cases=test_cases,
381 |             normalizer=floatd_text_normalizer_with_space,
382 |         )
383 | 


--------------------------------------------------------------------------------
/text_normalizer/factory/test/test_punctuation_mapping.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from os.path import abspath, dirname, join
 3 | from unittest import TestCase
 4 | 
 5 | from ..punctuation_mapping import PunctuationMapping
 6 | 
 7 | ROOT_DIR = dirname(abspath(__file__))
 8 | 
 9 | 
10 | class PunctMappingTestCase(TestCase):
11 | 
12 |     def setUp(self):
13 |         self.punct_normalizer = PunctuationMapping(
14 |             normalization_table_path=join(
15 |                 ROOT_DIR, "example_punctuation_mapping.csv"),
16 |         )
17 | 
18 |     def test_normalize_n_denormalize(self):
19 |         result = self.punct_normalizer.normalize(
20 |             "❨哈囉❩，（（❩） ） ,,,﹙﹚() ❨",
21 |         )
22 |         self.assertEqual(
23 |             ("(哈囉),(()) ) ,,,()() (",
24 |              [
25 |                  {
26 |                      "before": ["❨", "（", "（", "﹙", "(", "❨"],
27 |                      "after": "(",
28 |                  },
29 |                  {
30 |                      "before": ["❩", "❩", "）", "）", "﹚", ")"],
31 |                      "after": ")",
32 |                  },
33 |                  {
34 |                      "before": ["，", ",", ",", ","],
35 |                      "after": ",",
36 |                  },
37 |              ],
38 |              ),
39 |             result,
40 |         )
41 |         result = self.punct_normalizer.denormalize(
42 |             sentence=result[0],
43 |             meta=result[1],
44 |         )
45 |         self.assertEqual(
46 |             "❨哈囉❩，（（❩） ） ,,,﹙﹚() ❨",
47 |             result,
48 |         )
49 | 


--------------------------------------------------------------------------------
/text_normalizer/factory/test/test_strip.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from unittest import TestCase
 3 | from ..strip import Strip
 4 | 
 5 | 
 6 | class StripTestCase(TestCase):
 7 | 
 8 |     def setUp(self):
 9 |         self.strip_text_normalizer_default = Strip()
10 |         self.strip_text_normalizer_left = Strip(
11 |             direction='left',
12 |             chars=['#', ' '],
13 |         )
14 |         self.strip_text_normalizer_right = Strip(
15 |             direction='right',
16 |             chars=['/', ' '],
17 |         )
18 | 
19 |     def test_attributes(self):
20 |         self.assertEqual(
21 |             {
22 |                 'chars': None,
23 |                 'chars_str': None,
24 |                 'direction': 'both',
25 |                 'denormalizable': False,
26 |                 'name': 'strip_both_None',
27 |             },
28 |             self.strip_text_normalizer_default.__dict__,
29 |         )
30 |         self.assertEqual(
31 |             {
32 |                 'chars': ['#', ' '],
33 |                 'chars_str': '# ',
34 |                 'direction': 'left',
35 |                 'denormalizable': False,
36 |                 'name': 'strip_left_# ',
37 |             },
38 |             self.strip_text_normalizer_left.__dict__,
39 |         )
40 |         self.assertEqual(
41 |             {
42 |                 'chars': ['/', ' '],
43 |                 'chars_str': '/ ',
44 |                 'direction': 'right',
45 |                 'denormalizable': False,
46 |                 'name': 'strip_right_/ ',
47 |             },
48 |             self.strip_text_normalizer_right.__dict__,
49 |         )
50 | 
51 |     def test_normalize(self):
52 |         result = self.strip_text_normalizer_default.normalize(
53 |             sentence='         HAHA               ',
54 |         )
55 |         self.assertEqual(
56 |             ('HAHA', None),
57 |             result,
58 |         )
59 |         result = self.strip_text_normalizer_left.normalize(
60 |             sentence='##  \t\tHAHA',
61 |         )
62 |         self.assertEqual(
63 |             ('\t\tHAHA', None),
64 |             result,
65 |         )
66 |         result = self.strip_text_normalizer_right.normalize(
67 |             sentence='HAHA\t\t///  ',
68 |         )
69 |         self.assertEqual(
70 |             ('HAHA\t\t', None),
71 |             result,
72 |         )
73 | 


--------------------------------------------------------------------------------
/text_normalizer/factory/test/test_unicode_mapping.py:
--------------------------------------------------------------------------------
 1 | from unittest import TestCase
 2 | from os.path import abspath, dirname, join
 3 | 
 4 | from ..unicode_mapping import UnicodeMapping
 5 | 
 6 | 
 7 | ROOT_DIR = dirname(abspath(__file__))
 8 | 
 9 | 
10 | class UnicodeMappingTestCase(TestCase):
11 | 
12 |     def setUp(self):
13 |         self.normalizer = UnicodeMapping(
14 |             unicode_mapping_path=join(
15 |                 ROOT_DIR,
16 |                 "example_unicode_mapping.txt",
17 |             ),
18 |         )
19 | 
20 |     def test_attributes(self):
21 |         table = self.normalizer.mapping_table
22 |         self.assertEqual(
23 |             {
24 |                 '0xff11': '1',
25 |                 '0x31': '1',
26 |                 '0xff0c': ',',
27 |                 '0xff10': '０',
28 |                 '0x2c': ',',
29 |             },
30 |             table,
31 |         )
32 |         unicode_other = self.normalizer.u_other
33 |         self.assertEqual("0x20", unicode_other)
34 | 
35 |         other = self.normalizer.other
36 |         self.assertEqual(" ", other)
37 | 
38 |     def test_normalize(self):
39 |         result = self.normalizer.normalize(
40 |             sentence='，，HAHA０1０1 1１',
41 |         )
42 |         self.assertEqual(
43 |             (
44 |                 ',,    ０1０1 11',
45 |                 {
46 |                     '０': ['０', '０'],
47 |                     ',': ['，', '，'],
48 |                     ' ': ['H', 'A', 'H', 'A', ' '],
49 |                     '1': ['1', '1', '1', '１'],
50 |                 },
51 |             ),
52 |             result,
53 |         )
54 | 
55 |     def test_denormalize(self):
56 |         nor_result = self.normalizer.normalize(
57 |             sentence='，，HAHA０1０1 1１',
58 |         )
59 |         de_result = self.normalizer.denormalize(
60 |             sentence=nor_result[0],
61 |             meta=nor_result[1],
62 |         )
63 |         self.assertEqual(
64 |             '，，HAHA０1０1 1１',
65 |             de_result,
66 |         )
67 | 


--------------------------------------------------------------------------------
/text_normalizer/factory/toolkit/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Yoctol/text-normalizer/3609c10cd229c08b4623531e82d2292fc370734c/text_normalizer/factory/toolkit/__init__.py


--------------------------------------------------------------------------------
/text_normalizer/factory/toolkit/findall_position.pyx:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | def findall_position(input_str, reg_pattern):
 4 |     return findall_position_in_c(
 5 |         input_str,
 6 |         reg_pattern,
 7 |     )
 8 | 
 9 | 
10 | cdef list findall_position_in_c(  # noqa: E999
11 |         str input_str,
12 |         reg_pattern,
13 |     ):
14 |     cdef unsigned int i, str_len
15 |     cdef list output_list
16 | 
17 |     i = 0
18 |     str_len = len(input_str)
19 |     output_list = []
20 |     while (i < str_len):
21 |         output = reg_pattern.search(input_str[i:])
22 |         if output is None:
23 |             break
24 |         start, end = output.span()
25 |         output_list.append((start + i, end + i))
26 |         i += end
27 |     return output_list
28 | 


--------------------------------------------------------------------------------
/text_normalizer/factory/unicode_mapping.py:
--------------------------------------------------------------------------------
  1 | from typing import List, Dict, Tuple
  2 | import re
  3 | 
  4 | from .base_factory import BaseFactory
  5 | 
  6 | 
  7 | PROG = re.compile(r"([0-9A-Z\s\-]+)\:([0-9A-Za-z]+)")
  8 | PROG_DASH = re.compile(r"([0-9A-Z]+)\-([0-9A-Z]+)")
  9 | 
 10 | 
 11 | class UnicodeMapping(BaseFactory):
 12 | 
 13 |     def __init__(
 14 |             self,
 15 |             unicode_mapping_path: str,
 16 |             other: hex = "0x20",
 17 |             name: str = 'unicode_normalizer',
 18 |             denormalizable: bool = True,
 19 |         ) -> None:
 20 | 
 21 |         self.denormalizable = denormalizable
 22 |         self.mapping_table = self._gen_unicode_mapping_table(
 23 |             unicode_mapping_path=unicode_mapping_path,
 24 |         )
 25 |         if len(other) > 0:
 26 |             self.u_other = other
 27 |             self.other = chr(int(other, 16))
 28 |         else:
 29 |             self.u_other = None
 30 |             self.other = other
 31 |             self.denormalizable = False
 32 | 
 33 |         super().__init__(
 34 |             name=name,
 35 |             denormalizable=self.denormalizable,
 36 |         )
 37 | 
 38 |     @staticmethod
 39 |     def _gen_unicode_mapping_table(
 40 |             unicode_mapping_path: str,
 41 |         ) -> Dict[hex, str]:
 42 | 
 43 |         with open(unicode_mapping_path, "r") as filep:
 44 |             mapping_list = filep.read().split("\n")
 45 | 
 46 |         mapping_table = {}
 47 |         for map_ in mapping_list:
 48 | 
 49 |             if len(map_) == 0:
 50 |                 continue
 51 | 
 52 |             input_, output = PROG.findall(map_)[0]
 53 | 
 54 |             range_or_not = PROG_DASH.findall(input_)
 55 | 
 56 |             if len(range_or_not) > 0:
 57 |                 for uninum in range(
 58 |                     int(range_or_not[0][0], 16),
 59 |                     int(range_or_not[0][1], 16) + 1,
 60 |                 ):
 61 |                     if output == "one2one":
 62 |                         output_token = chr(uninum)
 63 |                     else:
 64 |                         output_token = chr(int(output, 16))
 65 |                     mapping_table[hex(uninum)] = output_token
 66 |             else:
 67 |                 for uninum in input_.split(" "):
 68 |                     mapping_table[hex(int(uninum, 16))] = chr(int(output, 16))
 69 | 
 70 |         return mapping_table
 71 | 
 72 |     @staticmethod
 73 |     def _check_utf8_encoding(sentence: str):
 74 | 
 75 |         try:
 76 |             output_sentence = sentence.encode('utf-8').decode('utf-8')
 77 |         except UnicodeEncodeError as e:
 78 |             print("sentence: {}, error: {}".format(sentence, e))
 79 |             return False
 80 |         if output_sentence != sentence:
 81 |             return False
 82 | 
 83 |         return True
 84 | 
 85 |     def normalize(
 86 |             self,
 87 |             sentence: str,
 88 |         ) -> Tuple[str, Dict[str, List[str]]]:
 89 | 
 90 |         if not self._check_utf8_encoding(sentence):
 91 |             raise ValueError(
 92 |                 "sentence: {} can not be encoded by UTF-8".format(sentence),
 93 |             )
 94 | 
 95 |         output_sentence = []
 96 |         meta = {}
 97 |         for char in sentence:
 98 |             uchar = hex(ord(char))
 99 |             if uchar in self.mapping_table:
100 |                 output_char = self.mapping_table[uchar]
101 |             else:
102 |                 output_char = self.other
103 |             if output_char not in meta:
104 |                 meta[output_char] = [char]
105 |             else:
106 |                 meta[output_char].extend(char)
107 |             output_sentence.append(output_char)
108 | 
109 |         return "".join(output_sentence), meta
110 | 
111 |     def denormalize(
112 |             self,
113 |             sentence: str,
114 |             meta: Dict[str, List[str]],
115 |         ) -> str:
116 | 
117 |         if not self.denormalizable:
118 |             return sentence
119 | 
120 |         for org_o, org_i in meta.items():
121 |             splited_sent = sentence.split(org_o)
122 |             output_sentence = []
123 |             for i, token in enumerate(splited_sent):
124 |                 output_sentence.append(token)
125 |                 if i != len(org_i):
126 |                     output_sentence.append(org_i[i])
127 |             sentence = "".join(output_sentence)
128 | 
129 |         return sentence
130 | 


--------------------------------------------------------------------------------
/text_normalizer/library/__init__.py:
--------------------------------------------------------------------------------
 1 | from .basic import (  # noqa
 2 |     whitespace_char_text_normalizer,
 3 |     whitespace_reduction_text_normalizer,
 4 | )
 5 | from .punctuation import (  # noqa
 6 |     chinese_punctuation_text_normalizer,
 7 |     english_punctuation_text_normalizer,
 8 |     all_punctuation_text_normalizer,
 9 |     all_punctuation_without_endpoint_text_normalizer,
10 |     all_punctuation_without_underscore_text_normalizer,
11 | )
12 | from .date import (  # noqa
13 |     date_text_normalizer_yymmdd,
14 | )
15 | from .time import (  # noqa
16 |     time_text_normalizer_hhmm,
17 | )
18 | from .identity import identity_text_normalizer  # noqa
19 | from .eng_lowercase import eng_lowercase_text_normalizer  # noqa
20 | from .punctuation_mapping import (  # noqa
21 |     full_punctuation_mapping_text_normalizer,
22 |     simplified_punctuation_mapping_text_normalizer,
23 | )
24 | from .number import (  # noqa
25 |     int_text_normalizer,
26 |     float_text_normalizer,
27 |     int_with_digit_text_normalizer,
28 |     float_with_digit_text_normalizer,
29 |     int_with_space_text_normalizer,
30 |     float_with_space_text_normalizer,
31 |     int_with_digit_n_space_text_normalizer,
32 |     float_with_digit_n_space_text_normalizer,
33 | )
34 | from .strip import (  # noqa
35 |     pure_strip_text_normalizer,
36 | )
37 | from .unicode import (  # noqa
38 |     unicode__chinese_characters_text_normalizer,
39 |     unicode__chinese_characters_and_digits_text_normalizer,
40 |     unicode__english_characters_and_digits_text_normalizer,
41 |     unicode__english_digits_and_full_punctuations_text_normalizer,
42 |     unicode__chinese_english_digits_and_full_punctuations_text_normalizer,
43 |     unicode__chinese_english_digits_and_simplified_punctuations_1_text_normalizer,
44 | )
45 | 


--------------------------------------------------------------------------------
/text_normalizer/library/basic.py:
--------------------------------------------------------------------------------
 1 | from ..factory import ReplacePatternWithToken
 2 | 
 3 | 
 4 | whitespace_char_text_normalizer = ReplacePatternWithToken(
 5 |     name='whitespace_char',
 6 |     denormalizable=False,
 7 |     target_pattern=r'\s+',
 8 |     prefix_pattern=None,
 9 |     suffix_pattern=None,
10 |     token=' ',
11 | )
12 | 
13 | whitespace_reduction_text_normalizer = ReplacePatternWithToken(
14 |     name='whitespaces2one',
15 |     denormalizable=True,
16 |     target_pattern=r'\s+',
17 |     prefix_pattern=None,
18 |     suffix_pattern=None,
19 |     token=' ',
20 | )
21 | 


--------------------------------------------------------------------------------
/text_normalizer/library/date.py:
--------------------------------------------------------------------------------
 1 | from ..factory import ReplacePatternWithToken
 2 | 
 3 | 
 4 | date_text_normalizer_yymmdd = ReplacePatternWithToken(
 5 |     name='date_yymmdd',
 6 |     denormalizable=True,
 7 |     target_pattern=r'[0-2]*\d\d\d-[0-1]*\d-[0-3]*\d',
 8 |     prefix_pattern=r'[^\d-]{1}|\A',
 9 |     suffix_pattern=r'[^\d-]{1}|\Z',
10 |     token=' _date_ ',
11 | )
12 | 


--------------------------------------------------------------------------------
/text_normalizer/library/eng_lowercase.py:
--------------------------------------------------------------------------------
1 | from ..factory import EngLowercase
2 | 
3 | 
4 | eng_lowercase_text_normalizer = EngLowercase()
5 | 


--------------------------------------------------------------------------------
/text_normalizer/library/identity.py:
--------------------------------------------------------------------------------
1 | from ..factory import Identity
2 | 
3 | 
4 | identity_text_normalizer = Identity()
5 | 


--------------------------------------------------------------------------------
/text_normalizer/library/number.py:
--------------------------------------------------------------------------------
 1 | from ..factory import NumberToken
 2 | 
 3 | 
 4 | int_text_normalizer = NumberToken(token="_int_")
 5 | float_text_normalizer = NumberToken(token="_float_")
 6 | int_with_digit_text_normalizer = NumberToken(token="_{}int_")
 7 | float_with_digit_text_normalizer = NumberToken(token="_{}float{}_")
 8 | 
 9 | int_with_space_text_normalizer = NumberToken(token=" _int_ ")
10 | float_with_space_text_normalizer = NumberToken(token=" _float_ ")
11 | int_with_digit_n_space_text_normalizer = NumberToken(token=" _{}int_ ")
12 | float_with_digit_n_space_text_normalizer = NumberToken(token=" _{}float{}_ ")
13 | 


--------------------------------------------------------------------------------
/text_normalizer/library/punctuation.py:
--------------------------------------------------------------------------------
 1 | from ..factory import ReplacePatternWithToken
 2 | 
 3 | CHINESE_PUNCTUATIONS = r"。「」﹁﹂『 』、‧（ ）《》〈〉 ﹏﹏﹏……—～，？；：［］【 】！"
 4 | ENGLISH_PUNCTUATIONS = \
 5 |     r"\!\#\$\%\&\(\)\*\+\,\-\.\/\:\;\?\@\[\]\{\}\|\~\`\_\^\<\>\=\'\"\\"
 6 | ENGLISH_PUNCTUATIONS_WITHOUT_ENDPOINT = ENGLISH_PUNCTUATIONS.replace("\.", "")
 7 | ENGLISH_PUNCTUATIONS_WITHOUT_UNDERSCORE = ENGLISH_PUNCTUATIONS.replace("\_", "")
 8 | 
 9 | 
10 | chinese_punctuation_text_normalizer = ReplacePatternWithToken(
11 |     name='chinese_punctuation',
12 |     denormalizable=False,
13 |     target_pattern=r'[{}]+'.format(CHINESE_PUNCTUATIONS),
14 |     prefix_pattern=None,
15 |     suffix_pattern=None,
16 |     token=' ',
17 | )
18 | 
19 | english_punctuation_text_normalizer = ReplacePatternWithToken(
20 |     name='english_punctuation',
21 |     denormalizable=False,
22 |     target_pattern=r'[{}]+'.format(ENGLISH_PUNCTUATIONS),
23 |     prefix_pattern=None,
24 |     suffix_pattern=None,
25 |     token=' ',
26 | )
27 | 
28 | all_punctuation_text_normalizer = ReplacePatternWithToken(
29 |     name='all_punctuation',
30 |     denormalizable=False,
31 |     target_pattern=r'[{}]+'.format(ENGLISH_PUNCTUATIONS + CHINESE_PUNCTUATIONS),
32 |     prefix_pattern=None,
33 |     suffix_pattern=None,
34 |     token=' ',
35 | )
36 | 
37 | all_punctuation_without_endpoint_text_normalizer = ReplacePatternWithToken(
38 |     name='all_punctuation_without_endpoint',
39 |     denormalizable=False,
40 |     target_pattern=r'[{}]+'.format(
41 |         CHINESE_PUNCTUATIONS + ENGLISH_PUNCTUATIONS_WITHOUT_ENDPOINT,
42 |     ),
43 |     prefix_pattern=None,
44 |     suffix_pattern=None,
45 |     token=' ',
46 | )
47 | all_punctuation_without_underscore_text_normalizer = ReplacePatternWithToken(
48 |     name='all_punctuation_without_underscore',
49 |     denormalizable=False,
50 |     target_pattern=r'[{}]+'.format(
51 |         CHINESE_PUNCTUATIONS + ENGLISH_PUNCTUATIONS_WITHOUT_UNDERSCORE,
52 |     ),
53 |     prefix_pattern=None,
54 |     suffix_pattern=None,
55 |     token=' ',
56 | )
57 | 


--------------------------------------------------------------------------------
/text_normalizer/library/punctuation_mapping.py:
--------------------------------------------------------------------------------
 1 | from os.path import join
 2 | 
 3 | from ..factory import PunctuationMapping
 4 | from text_normalizer import ROOT_DIR
 5 | 
 6 | 
 7 | full_punctuation_mapping_text_normalizer = PunctuationMapping(
 8 |     normalization_table_path=join(
 9 |         ROOT_DIR,
10 |         'data/punctuation/punctuation_mapping_0221.csv',
11 |     ),
12 | )
13 | 
14 | simplified_punctuation_mapping_text_normalizer = PunctuationMapping(
15 |     normalization_table_path=join(
16 |         ROOT_DIR,
17 |         'data/punctuation/punctuation_mapping_0221_simplified.csv',
18 |     ),
19 | )
20 | 


--------------------------------------------------------------------------------
/text_normalizer/library/strip.py:
--------------------------------------------------------------------------------
1 | from ..factory import Strip
2 | 
3 | 
4 | pure_strip_text_normalizer = Strip()
5 | 


--------------------------------------------------------------------------------
/text_normalizer/library/test/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Yoctol/text-normalizer/3609c10cd229c08b4623531e82d2292fc370734c/text_normalizer/library/test/__init__.py


--------------------------------------------------------------------------------
/text_normalizer/library/test/test_basic.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from unittest import TestCase
 4 | from ..basic import (
 5 |     whitespace_char_text_normalizer,
 6 |     whitespace_reduction_text_normalizer,
 7 | )
 8 | 
 9 | 
10 | class BasicTextNormalizersTestCase(TestCase):
11 | 
12 |     def test_whitespace_char_text_normalizer_normalize(self):
13 |         test_cases = [
14 |             ('      ', (' ', None)),
15 |             ('              ', (' ', None)),
16 |             ('\n\n\n\n\n', (' ', None)),
17 |             ('我有很多      空白', ('我有很多 空白', None)),
18 |             ('我有很多          tab', ('我有很多 tab', None)),
19 |             ('我有很多\n\n\n\n\n分行', ('我有很多 分行', None)),
20 |             ('家豪大大亂入', ('家豪大大亂入', None)),
21 |         ]
22 |         for test_case in test_cases:
23 |             with self.subTest(test_case=test_case):
24 |                 self.assertEqual(
25 |                     test_case[1],
26 |                     whitespace_char_text_normalizer.normalize(
27 |                         sentence=test_case[0],
28 |                     ),
29 |                 )
30 | 
31 |     def test_whitespace_reduction_text_normalizer(self):
32 |         normalizer = whitespace_reduction_text_normalizer
33 |         test_cases = [
34 |             (
35 |                 '      ',
36 |                 (
37 |                     ' ',
38 |                     {' ': ['      ']},
39 |                 ),
40 |             ),
41 |             (
42 |                 '              ',
43 |                 (
44 |                     ' ',
45 |                     {' ': ['              ']},
46 |                 ),
47 |             ),
48 |             (
49 |                 '\n\n\n\n\n',
50 |                 (
51 |                     ' ',
52 |                     {' ': ['\n\n\n\n\n']},
53 |                 ),
54 |             ),
55 |             (
56 |                 '我有很多      空白\n',
57 |                 (
58 |                     '我有很多 空白 ',
59 |                     {' ': ['      ', '\n']},
60 |                 ),
61 |             ),
62 |             (
63 |                 '我有很多          tab\n\n\t',
64 |                 (
65 |                     '我有很多 tab ',
66 |                     {' ': ['          ', '\n\n\t']},
67 |                 ),
68 |             ),
69 |             (
70 |                 '我有很多\n\n\n\n\n分行',
71 |                 (
72 |                     '我有很多 分行',
73 |                     {' ': ['\n\n\n\n\n']},
74 |                 ),
75 |             ),
76 |             ('家豪大大亂入', ('家豪大大亂入', {' ': []})),
77 |         ]
78 |         for test_case in test_cases:
79 |             with self.subTest(test_case=test_case):
80 |                 self.assertEqual(
81 |                     test_case[1],
82 |                     normalizer.normalize(
83 |                         sentence=test_case[0],
84 |                     ),
85 |                 )
86 |                 self.assertEqual(
87 |                     test_case[0],
88 |                     normalizer.denormalize(
89 |                         sentence=test_case[1][0],
90 |                         meta=test_case[1][1],
91 |                     ),
92 |                 )
93 | 


--------------------------------------------------------------------------------
/text_normalizer/library/test/test_date.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from unittest import TestCase
 3 | 
 4 | from ..date import (
 5 |     date_text_normalizer_yymmdd,
 6 | )
 7 | 
 8 | 
 9 | class DateTextNormalizersTestCase(TestCase):
10 | 
11 |     def test_date_yymmdd_normalize(self):
12 |         test_cases = [
13 |             ('2017-12-07', (' _date_ ', {' _date_ ': ['2017-12-07']})),
14 |             ('2017-1-3', (' _date_ ', {' _date_ ': ['2017-1-3']})),
15 |             ('2017-1-30', (' _date_ ', {' _date_ ': ['2017-1-30']})),
16 |             ('2017-12-3', (' _date_ ', {' _date_ ': ['2017-12-3']})),
17 |             ('3017-12-07', ('3017-12-07', {' _date_ ': []})),
18 |             ('2017-22-07', ('2017-22-07', {' _date_ ': []})),
19 |             ('2017-12-47', ('2017-12-47', {' _date_ ': []})),
20 |             ('今天日期是2017-12-07', ('今天日期是 _date_ ', {' _date_ ': ['2017-12-07']})),
21 |             ('2017-12-07XD', (' _date_ XD', {' _date_ ': ['2017-12-07']})),
22 |             ('現在日期2017-12-07XD', ('現在日期 _date_ XD', {' _date_ ': ['2017-12-07']})),
23 |             ('2017-12-07-00', ('2017-12-07-00', {' _date_ ': []})),
24 |             ('2017-12-0708', ('2017-12-0708', {' _date_ ': []})),
25 |             ('2017-1208-07', ('2017-1208-07', {' _date_ ': []})),
26 |             ('2017-12-07和2018-01-10', (' _date_ 和 _date_ ',
27 |                                        {' _date_ ': ['2017-12-07', '2018-01-10']})),
28 |             ('2017-12-072018-01-10', ('2017-12-072018-01-10', {' _date_ ': []})),
29 |             ('家豪大大亂入', ('家豪大大亂入', {' _date_ ': []})),
30 |         ]
31 |         for test_case in test_cases:
32 |             with self.subTest(test_case=test_case):
33 |                 self.assertEqual(
34 |                     test_case[1],
35 |                     date_text_normalizer_yymmdd.normalize(sentence=test_case[0]),
36 |                 )
37 | 
38 |     def test_date_yymmdd_denormalize(self):
39 |         normal_test_cases = [
40 |             (' _date_ ', {' _date_ ': ['2017-12-18']}, '2017-12-18'),
41 |             ('現在日期 _date_ ', {' _date_ ': ['2017-12-18']}, '現在日期2017-12-18'),
42 |             (' _date_ XD', {' _date_ ': ['2017-12-18']}, '2017-12-18XD'),
43 |             ('現在日期 _date_ XD', {' _date_ ': ['2017-12-18']}, '現在日期2017-12-18XD'),
44 |             (' _date_ 和 _date_ ',
45 |              {' _date_ ': ['2017-12-18', '2018-01-02']}, '2017-12-18和2018-01-02'),
46 |             (' _date_  _date_ ',
47 |              {' _date_ ': ['2017-12-18', '2018-01-02']}, '2017-12-182018-01-02'),
48 |             ('家豪大大亂入', {' _date_ ': []}, '家豪大大亂入'),
49 |         ]
50 |         for test_case in normal_test_cases:
51 |             with self.subTest(test_case=test_case):
52 |                 self.assertEqual(
53 |                     test_case[2],
54 |                     date_text_normalizer_yymmdd.denormalize(
55 |                         sentence=test_case[0],
56 |                         meta=test_case[1],
57 |                     ),
58 |                 )
59 |         with self.assertRaises(KeyError):
60 |             date_text_normalizer_yymmdd.denormalize(
61 |                 sentence='家豪大大亂入',
62 |                 meta={'_雞排_': ['大雞排']},
63 |             ),
64 |         with self.assertRaises(ValueError):
65 |             date_text_normalizer_yymmdd.denormalize(
66 |                 sentence=' _date_ 和 _date_ 這兩個日期都沒有雞排',
67 |                 meta={' _date_ ': ['2017-12-18']},
68 |             )
69 | 


--------------------------------------------------------------------------------
/text_normalizer/library/test/test_eng_lowercase.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from unittest import TestCase
 4 | from ..eng_lowercase import (
 5 |     eng_lowercase_text_normalizer,
 6 | )
 7 | 
 8 | 
 9 | class EngLowercaseTextNormalizerTestCase(TestCase):
10 | 
11 |     def test_normalize_n_denormalize_0(self):
12 |         result = eng_lowercase_text_normalizer.normalize(
13 |             '哈囉 AAB 123 Cddef 哈囉 >< ???',
14 |         )
15 |         self.assertEqual(
16 |             ('哈囉 aab 123 cddef 哈囉 >< ???',
17 |              [
18 |                  {
19 |                      'before': 'AAB',
20 |                      'after': 'aab',
21 |                  },
22 |                  {
23 |                      'before': 'Cddef',
24 |                      'after': 'cddef',
25 |                  },
26 |              ],
27 |              ),
28 |             result,
29 |         )
30 |         result = eng_lowercase_text_normalizer.denormalize(
31 |             sentence=result[0],
32 |             meta=result[1],
33 |         )
34 |         self.assertEqual(
35 |             '哈囉 AAB 123 Cddef 哈囉 >< ???',
36 |             result,
37 |         )
38 | 
39 |     def test_normalize_n_denormalize_1(self):
40 |         result = eng_lowercase_text_normalizer.normalize(
41 |             'AAB 123 哈囉 Cddef 456 ffecI',
42 |         )
43 |         self.assertEqual(
44 |             ('aab 123 哈囉 cddef 456 ffeci',
45 |              [
46 |                  {
47 |                      'before': 'AAB',
48 |                      'after': 'aab',
49 |                  },
50 |                  {
51 |                      'before': 'Cddef',
52 |                      'after': 'cddef',
53 |                  },
54 |                  {
55 |                      'before': 'ffecI',
56 |                      'after': 'ffeci',
57 |                  },
58 |              ],
59 |              ),
60 |             result,
61 |         )
62 |         result = eng_lowercase_text_normalizer.denormalize(
63 |             sentence=result[0],
64 |             meta=result[1],
65 |         )
66 |         self.assertEqual(
67 |             'AAB 123 哈囉 Cddef 456 ffecI',
68 |             result,
69 |         )
70 | 


--------------------------------------------------------------------------------
/text_normalizer/library/test/test_identity.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from unittest import TestCase
 4 | from ..identity import (
 5 |     identity_text_normalizer,
 6 | )
 7 | 
 8 | 
 9 | class IdentityTextNormalizersTestCase(TestCase):
10 | 
11 |     def test_identity_text_normalizer_normalize(self):
12 |         result = identity_text_normalizer.normalize(
13 |             sentence='我超懶惰 我就是想耍廢 KerKer ><',
14 |         )
15 |         self.assertEqual(
16 |             ('我超懶惰 我就是想耍廢 KerKer ><', None),
17 |             result,
18 |         )
19 |         result = identity_text_normalizer.denormalize(
20 |             sentence=result[0],
21 |             meta=result[1],
22 |         )
23 |         self.assertEqual(
24 |             '我超懶惰 我就是想耍廢 KerKer ><',
25 |             result,
26 |         )
27 | 


--------------------------------------------------------------------------------
/text_normalizer/library/test/test_number.py:
--------------------------------------------------------------------------------
  1 | from unittest import TestCase
  2 | from ..number import (
  3 |     int_text_normalizer,
  4 |     float_text_normalizer,
  5 |     int_with_digit_text_normalizer,
  6 |     float_with_digit_text_normalizer,
  7 |     int_with_space_text_normalizer,
  8 |     float_with_space_text_normalizer,
  9 |     int_with_digit_n_space_text_normalizer,
 10 |     float_with_digit_n_space_text_normalizer,
 11 | )
 12 | 
 13 | 
 14 | class NumberTextNormalizersTestCase(TestCase):
 15 | 
 16 |     def test_int_text_normalizer(self):
 17 |         revised_sentence, meta = int_text_normalizer.normalize(sentence="123")
 18 |         recovered_sentence = int_text_normalizer.denormalize(
 19 |             sentence=revised_sentence,
 20 |             meta=meta,
 21 |         )
 22 |         self.assertEqual("_int_", revised_sentence)
 23 |         self.assertEqual({"_int_": ["123"]}, meta)
 24 |         self.assertEqual("123", recovered_sentence)
 25 | 
 26 |     def test_float_text_normalizer(self):
 27 |         revised_sentence, meta = float_text_normalizer.normalize(sentence="123.33")
 28 |         recovered_sentence = float_text_normalizer.denormalize(
 29 |             sentence=revised_sentence,
 30 |             meta=meta,
 31 |         )
 32 |         self.assertEqual("_float_", revised_sentence)
 33 |         self.assertEqual({"_float_": ["123.33"]}, meta)
 34 |         self.assertEqual("123.33", recovered_sentence)
 35 | 
 36 |     def test_int_with_digit_text_normalizer(self):
 37 |         revised_sentence, meta = int_with_digit_text_normalizer.normalize(sentence="123")
 38 |         recovered_sentence = int_with_digit_text_normalizer.denormalize(
 39 |             sentence=revised_sentence,
 40 |             meta=meta,
 41 |         )
 42 |         self.assertEqual("_3int_", revised_sentence)
 43 |         self.assertEqual({"_3int_": ["123"]}, meta)
 44 |         self.assertEqual("123", recovered_sentence)
 45 | 
 46 |     def test_float_with_digit_text_normalizer(self):
 47 |         revised_sentence, meta = float_with_digit_text_normalizer.normalize(
 48 |             sentence="123.33",
 49 |         )
 50 |         recovered_sentence = float_with_digit_text_normalizer.denormalize(
 51 |             sentence=revised_sentence,
 52 |             meta=meta,
 53 |         )
 54 |         self.assertEqual("_3float2_", revised_sentence)
 55 |         self.assertEqual({"_3float2_": ["123.33"]}, meta)
 56 |         self.assertEqual("123.33", recovered_sentence)
 57 | 
 58 |     def test_int_with_space_text_normalizer(self):
 59 |         revised_sentence, meta = int_with_space_text_normalizer.normalize(sentence="123")
 60 |         recovered_sentence = int_with_space_text_normalizer.denormalize(
 61 |             sentence=revised_sentence,
 62 |             meta=meta,
 63 |         )
 64 |         self.assertEqual(" _int_ ", revised_sentence)
 65 |         self.assertEqual({" _int_ ": ["123"]}, meta)
 66 |         self.assertEqual("123", recovered_sentence)
 67 | 
 68 |     def test_float_with_space_text_normalizer(self):
 69 |         revised_sentence, meta = float_with_space_text_normalizer.normalize(sentence="123.33")
 70 |         recovered_sentence = float_with_space_text_normalizer.denormalize(
 71 |             sentence=revised_sentence,
 72 |             meta=meta,
 73 |         )
 74 |         self.assertEqual(" _float_ ", revised_sentence)
 75 |         self.assertEqual({" _float_ ": ["123.33"]}, meta)
 76 |         self.assertEqual("123.33", recovered_sentence)
 77 | 
 78 |     def test_int_with_digit_n_space_text_normalizer(self):
 79 |         revised_sentence, meta = int_with_digit_n_space_text_normalizer.normalize(
 80 |             sentence="123",
 81 |         )
 82 |         recovered_sentence = int_with_digit_n_space_text_normalizer.denormalize(
 83 |             sentence=revised_sentence,
 84 |             meta=meta,
 85 |         )
 86 |         self.assertEqual(" _3int_ ", revised_sentence)
 87 |         self.assertEqual({" _3int_ ": ["123"]}, meta)
 88 |         self.assertEqual("123", recovered_sentence)
 89 | 
 90 |     def test_float_with_digit_n_space_text_normalizer(self):
 91 |         revised_sentence, meta = float_with_digit_n_space_text_normalizer.normalize(
 92 |             sentence="123.33",
 93 |         )
 94 |         recovered_sentence = float_with_digit_n_space_text_normalizer.denormalize(
 95 |             sentence=revised_sentence,
 96 |             meta=meta,
 97 |         )
 98 |         self.assertEqual(" _3float2_ ", revised_sentence)
 99 |         self.assertEqual({" _3float2_ ": ["123.33"]}, meta)
100 |         self.assertEqual("123.33", recovered_sentence)
101 | 


--------------------------------------------------------------------------------
/text_normalizer/library/test/test_punctuation.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from unittest import TestCase
 4 | from ..punctuation import (
 5 |     chinese_punctuation_text_normalizer,
 6 |     english_punctuation_text_normalizer,
 7 |     all_punctuation_text_normalizer,
 8 |     all_punctuation_without_endpoint_text_normalizer,
 9 |     all_punctuation_without_underscore_text_normalizer,
10 | )
11 | 
12 | 
13 | class PunctuationTextNormalizersTestCase(TestCase):
14 | 
15 |     def test_chinese_punctuation_text_normalizer_normalize(self):
16 |         test_cases = [
17 |             ('勤彥大大喜歡吃《變態》糖果！！！', ('勤彥大大喜歡吃 變態 糖果 ', None)),
18 |             ('。「」﹁﹂『 』、‧（ ）《》〈〉 ﹏﹏﹏……—～，？；：［］【 】！', (' ', None)),
19 |             ('家豪大大亂入', ('家豪大大亂入', None)),
20 |         ]
21 |         for test_case in test_cases:
22 |             with self.subTest(test_case=test_case):
23 |                 self.assertEqual(
24 |                     test_case[1],
25 |                     chinese_punctuation_text_normalizer.normalize(sentence=test_case[0]),
26 |                 )
27 | 
28 |     def test_english_punctuation_text_normalizer_normalize(self):
29 |         test_cases = [
30 |             ('勤彥大大喜歡吃<變態>糖果!!!', ('勤彥大大喜歡吃 變態 糖果 ', None)),
31 |             ('.,<>(){}[]*^!?=+-~', (' ', None)),
32 |             ('家豪大大亂入', ('家豪大大亂入', None)),
33 |         ]
34 |         for test_case in test_cases:
35 |             with self.subTest(test_case=test_case):
36 |                 self.assertEqual(
37 |                     test_case[1],
38 |                     english_punctuation_text_normalizer.normalize(sentence=test_case[0]),
39 |                 )
40 | 
41 |     def test_all_punctuation_text_normalizer_normalize(self):
42 |         test_cases = [
43 |             ('勤彥大大：喜歡吃《》<變態>《》糖果!!!', ('勤彥大大 喜歡吃 變態 糖果 ', None)),
44 |             ('.,<>(){}[]*^!?=+-~。「」﹁﹂『 』、‧（ ）《》〈〉 ﹏﹏﹏……—～，？；：［］【 】！', (' ', None)),
45 |             ('家豪大大亂入', ('家豪大大亂入', None)),
46 |         ]
47 |         for test_case in test_cases:
48 |             with self.subTest(test_case=test_case):
49 |                 self.assertEqual(
50 |                     test_case[1],
51 |                     all_punctuation_text_normalizer.normalize(sentence=test_case[0]),
52 |                 )
53 | 
54 |     def test_all_punctuation_without_endpoint_text_normalizer_normalize(self):
55 |         test_cases = [
56 |             ('勤彥大大：喜歡吃87.9《》<變態>《》糖果!!!',
57 |              ('勤彥大大 喜歡吃87.9 變態 糖果 ', None)),
58 |             ('.,<>(){}[]*^!?=+-~。「」﹁﹂『 』、‧（ ）《》〈〉 ﹏﹏﹏……—～，？；：［］【 】！',
59 |              ('. ', None)),
60 |             ('家豪大大亂入', ('家豪大大亂入', None)),
61 |         ]
62 |         for test_case in test_cases:
63 |             with self.subTest(test_case=test_case):
64 |                 self.assertEqual(
65 |                     test_case[1],
66 |                     all_punctuation_without_endpoint_text_normalizer.normalize(
67 |                         sentence=test_case[0],
68 |                     ),
69 |                 )
70 | 
71 |     def test_all_punctuation_without_underscore_text_normalizer_normalize(self):
72 |         test_cases = [
73 |             ('勤彥大大：喜歡吃87.9《》_<變態>_《》糖果!!!', ('勤彥大大 喜歡吃87 9 _ 變態 _ 糖果 ', None)),
74 |             ('_.,<>(){}[]*^!?=+-~。「」﹁﹂『 』、‧（ ）《》〈〉 ﹏﹏﹏……—～，？；：［］【 】！', ('_ ', None)),
75 |             ('家豪大大亂入', ('家豪大大亂入', None)),
76 |         ]
77 |         for test_case in test_cases:
78 |             with self.subTest(test_case=test_case):
79 |                 self.assertEqual(
80 |                     test_case[1],
81 |                     all_punctuation_without_underscore_text_normalizer.normalize(
82 |                         sentence=test_case[0],
83 |                     ),
84 |                 )
85 | 


--------------------------------------------------------------------------------
/text_normalizer/library/test/test_punctuation_mapping.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | from unittest import TestCase
  3 | 
  4 | from ..punctuation_mapping import (
  5 |     full_punctuation_mapping_text_normalizer,
  6 |     simplified_punctuation_mapping_text_normalizer,
  7 | )
  8 | 
  9 | 
 10 | class PunctuationMappingTextNormalizerTestCase(TestCase):
 11 | 
 12 |     def run_test(self, test_cases, normalizer):
 13 |         for test_case in test_cases:
 14 |             with self.subTest(test_case=test_case):
 15 |                 revised_sentence, meta = normalizer.normalize(
 16 |                     sentence=test_case[0],
 17 |                 )
 18 |                 self.assertEqual(
 19 |                     test_case[1],
 20 |                     revised_sentence,
 21 |                 )
 22 |                 recovered_sentence = normalizer.denormalize(
 23 |                     sentence=test_case[1],
 24 |                     meta=meta,
 25 |                 )
 26 |                 self.assertEqual(
 27 |                     test_case[0],
 28 |                     recovered_sentence,
 29 |                 )
 30 | 
 31 |     def test_full_punctuation_mapping_text_normalizer(self):
 32 |         test_cases = [
 33 |             (
 34 |                 "符號， 、 。 ． ？ ！ ～ ＄ ％ ＠ ＆ ＃ ＊ ‧ ， 、 。 ． ？ ！ ～ ＄ ％ ＠ ＆ ＃ ＊ ‧",
 35 |                 "符號, , . . ? ! ~ $ % @ & # * . , , . . ? ! ~ $ % @ & # * .",
 36 |             ),
 37 |             (
 38 |                 "； ︰ … ﹐ ﹒ ˙ · ﹔ ﹕ ‘ ’ “ ” 〝 〞 ； ︰ … ﹐ ﹒ ˙ · ﹔ ﹕ ‘ ’ “ ” 〝 〞",
 39 |                 "; : ... , . . . ; : \" \" \" \" \" \" ; : ... , . . . ; : \" \" \" \" \" \"",
 40 |             ),
 41 |             (
 42 |                 "括號符號； 〔 〕 【 】 ﹝ ﹞ 〈 〉 ﹙ ﹚ 《 》 （ ） ； 〔 〕 【 】 ﹝ ﹞ 〈 〉 ﹙ ﹚ 《 》 （ ）",
 43 |                 "括號符號; [ ] [ ] [ ] < > ( ) < > ( ) ; [ ] [ ] [ ] < > ( ) < > ( )",
 44 |             ),
 45 |             (
 46 |                 "｛ ｝ ﹛ ﹜ 『 』 「 」 ＜ ＞ ≦ ≧ ﹤ ﹥ ｛ ｝ ﹛ ﹜ 『 』 「 」 ＜ ＞ ≦ ≧ ﹤ ﹥",
 47 |                 "{ } { } \" \" \" \" < > < > < > { } { } \" \" \" \" < > < > < >",
 48 |             ),
 49 |             (
 50 |                 "括號符號； ︵ ︶ ︷ ︸ ︹ ︺ ︻ ︼ ︽ ︾ ︿ ﹀ ﹁ ﹂ ﹃ ﹄  ︵ ︶ ︷ ︸ ︹ ︺ ︻ ︼ ︽ ︾ ︿ ﹀ ﹁ ﹂ ﹃ ﹄",
 51 |                 "括號符號; ( ) { } [ ] [ ] < > < > \" \" \" \"  ( ) { } [ ] [ ] < > < > \" \" \" \"",
 52 |             ),
 53 |             (
 54 |                 "線段符號； ﹣ ﹦ ≡ ｜ ∣ ∥ – ︱ — ︳ ╴ ¯ ￣ ﹉ ﹣ ﹦ ≡ ｜ ∣ ∥ – ︱ — ︳ ╴ ¯ ￣ ﹉",
 55 |                 "線段符號; - = = | | / - | - | - - - - - = = | | / - | - | - - - -",
 56 |             ),
 57 |             (
 58 |                 "﹊ ﹍ ﹎ ﹋ ﹌ ﹏ ︴ ﹨ ∕ ╲ ╱ ＼ ／ ﹊ ﹍ ﹎ ﹋ ﹌ ﹏ ︴ ﹨ ∕ ╲ ╱ ＼ ／",
 59 |                 "- _ _ - - _ | \ / \ / \ / - _ _ - - _ | \ / \ / \ /",
 60 |             ),
 61 |             (
 62 |                 "+ ＋ ＋ ﹢ * ＊ × ╳",
 63 |                 "+ + + + * * * *",
 64 |             ),
 65 |         ]
 66 |         self.run_test(test_cases, normalizer=full_punctuation_mapping_text_normalizer)
 67 | 
 68 |     def test_simplified_punctuation_mapping_text_normalizer(self):
 69 |         test_cases = [
 70 |             (
 71 |                 "符號， 、 。 ． ？ ！ ～ ＄ ％ ＠ ＆ ＃ ＊ ‧ ， 、 。 ． ？ ！ ～ ＄ ％ ＠ ＆ ＃ ＊ ‧",
 72 |                 "符號, , . . ? ! - $ % @ & # * . , , . . ? ! - $ % @ & # * .",
 73 |             ),
 74 |             (
 75 |                 "； ︰ … ﹐ ﹒ ˙ · ﹔ ﹕ ‘ ’ “ ” 〝 〞  ︰ … ﹐ ﹒ ˙ · ﹔ ﹕ ‘ ’ “ ” 〝 〞",
 76 |                 "; : ... , . . . ; : \" \" \" \" \" \"  : ... , . . . ; : \" \" \" \" \" \"",
 77 |             ),
 78 |             (
 79 |                 "括號符號； 〔 〕 【 】 ﹝ ﹞ 〈 〉 ﹙ ﹚ 《 》 （ ） 〔 〕 【 】 ﹝ ﹞ 〈 〉 ﹙ ﹚ 《 》 （ ）",
 80 |                 "括號符號; ( ) ( ) ( ) ( ) ( ) ( ) ( ) ( ) ( ) ( ) ( ) ( ) ( ) ( )",
 81 |             ),
 82 |             (
 83 |                 "｛ ｝ ﹛ ﹜ 『 』 「 」 ＜ ＞ ≦ ≧ ﹤ ﹥ ｛ ｝ ﹛ ﹜ 『 』 「 」 ＜ ＞ ≦ ≧ ﹤ ﹥",
 84 |                 "( ) ( ) \" \" \" \" ( ) ( ) ( ) ( ) ( ) \" \" \" \" ( ) ( ) ( )",
 85 |             ),
 86 |             (
 87 |                 "括號符號； ︵ ︶ ︷ ︸ ︹ ︺ ︻ ︼ ︽ ︾ ︿ ﹀ ﹁ ﹂ ﹃ ﹄ ︵ ︶ ︷ ︸ ︹ ︺ ︻ ︼ ︽ ︾ ︿ ﹀ ﹁ ﹂ ﹃ ﹄",
 88 |                 "括號符號; ( ) ( ) ( ) ( ) ( ) ( ) \" \" \" \" ( ) ( ) ( ) ( ) ( ) ( ) \" \" \" \"",
 89 |             ),
 90 |             (
 91 |                 "線段符號； ﹣ ﹦ ≡ ｜ ∣ ∥ – ︱ — ︳ ╴ ¯ ￣ ﹉ ﹣ ﹦ ≡ ｜ ∣ ∥ – ︱ — ︳ ╴ ¯ ￣ ﹉",
 92 |                 "線段符號; - = = , , , - , - , - - - - - = = , , , - , - , - - - -",
 93 |             ),
 94 |             (
 95 |                 "﹊ ﹍ ﹎ ﹋ ﹌ ﹏ ︴ ﹨ ∕ ╲ ╱ ＼ ／ ﹊ ﹍ ﹎ ﹋ ﹌ ﹏ ︴ ﹨ ∕ ╲ ╱ ＼ ／",
 96 |                 "- _ _ - - _ , , , , , , , - _ _ - - _ , , , , , , ,",
 97 |             ),
 98 |         ]
 99 |         self.run_test(test_cases, normalizer=simplified_punctuation_mapping_text_normalizer)
100 | 


--------------------------------------------------------------------------------
/text_normalizer/library/test/test_strip.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from unittest import TestCase
 3 | from ..strip import (
 4 |     pure_strip_text_normalizer,
 5 | )
 6 | 
 7 | 
 8 | class StripTextNormalizerTestCase(TestCase):
 9 | 
10 |     def normalize(self):
11 |         result = pure_strip_text_normalizer.normalize(
12 |             sentence='   \n\n\t\t    LALALA 拉拉 xddd \n\n\t\t\t   ')
13 |         self.assertEqual(
14 |             ('LALALA 拉拉 xddd', None),
15 |             result,
16 |         )
17 | 


--------------------------------------------------------------------------------
/text_normalizer/library/test/test_time.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from unittest import TestCase
 3 | 
 4 | from ..time import (
 5 |     time_text_normalizer_hhmm,
 6 | )
 7 | 
 8 | 
 9 | class TimeTextNormalizersTestCase(TestCase):
10 | 
11 |     def test_time_hhmm_normalize(self):
12 |         test_cases = [
13 |             ('12:18', (' _time_ ', {' _time_ ': ['12:18']})),
14 |             ('現在時間12:18', ('現在時間 _time_ ', {' _time_ ': ['12:18']})),
15 |             ('12:18XD', (' _time_ XD', {' _time_ ': ['12:18']})),
16 |             ('現在時間12:18XD', ('現在時間 _time_ XD', {' _time_ ': ['12:18']})),
17 |             ('12:18:00', ('12:18:00', {' _time_ ': []})),
18 |             ('12:1828', ('12:1828', {' _time_ ': []})),
19 |             ('1233:18', ('1233:18', {' _time_ ': []})),
20 |             ('12:18和19:37', (' _time_ 和 _time_ ', {' _time_ ': ['12:18', '19:37']})),
21 |             ('12:1819:37', ('12:1819:37', {' _time_ ': []})),
22 |             ('家豪大大亂入', ('家豪大大亂入', {' _time_ ': []})),
23 |         ]
24 |         for test_case in test_cases:
25 |             with self.subTest(test_case=test_case):
26 |                 self.assertEqual(
27 |                     test_case[1],
28 |                     time_text_normalizer_hhmm.normalize(sentence=test_case[0]),
29 |                 )
30 | 
31 |     def test_time_hhmm_denormalize(self):
32 |         normal_test_cases = [
33 |             (' _time_ ', {' _time_ ': ['12:18']}, '12:18'),
34 |             ('現在時間 _time_ ', {' _time_ ': ['12:18']}, '現在時間12:18'),
35 |             (' _time_ XD', {' _time_ ': ['12:18']}, '12:18XD'),
36 |             ('現在時間 _time_ XD', {' _time_ ': ['12:18']}, '現在時間12:18XD'),
37 |             (' _time_ 和 _time_ ', {' _time_ ': ['12:18', '19:37']}, '12:18和19:37'),
38 |             (' _time_  _time_ ', {' _time_ ': ['12:18', '19:37']}, '12:1819:37'),
39 |             ('家豪大大亂入', {' _time_ ': []}, '家豪大大亂入'),
40 |         ]
41 |         for test_case in normal_test_cases:
42 |             with self.subTest(test_case=test_case):
43 |                 self.assertEqual(
44 |                     test_case[2],
45 |                     time_text_normalizer_hhmm.denormalize(
46 |                         sentence=test_case[0],
47 |                         meta=test_case[1],
48 |                     ),
49 |                 )
50 |         with self.assertRaises(KeyError):
51 |             time_text_normalizer_hhmm.denormalize(
52 |                 sentence='家豪大大亂入',
53 |                 meta={'_雞排_': ['大雞排']},
54 |             ),
55 |         with self.assertRaises(ValueError):
56 |             time_text_normalizer_hhmm.denormalize(
57 |                 sentence=' _time_ 和 _time_ 這兩個時間都沒有雞排',
58 |                 meta={' _time_ ': ['12:18']},
59 |             )
60 | 


--------------------------------------------------------------------------------
/text_normalizer/library/test/test_unicode_text_normalizers.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | from unittest import TestCase
  4 | from ..unicode import (
  5 |     unicode__chinese_characters_text_normalizer,
  6 |     unicode__chinese_characters_and_digits_text_normalizer,
  7 |     unicode__english_characters_and_digits_text_normalizer,
  8 |     unicode__english_digits_and_full_punctuations_text_normalizer,
  9 |     unicode__chinese_english_digits_and_full_punctuations_text_normalizer,
 10 |     unicode__chinese_english_digits_and_simplified_punctuations_1_text_normalizer,
 11 | )
 12 | 
 13 | 
 14 | class PunctuationTextNormalizersTestCase(TestCase):
 15 | 
 16 |     def unit_test(self, normalizer, test_cases):
 17 |         for test_case in test_cases:
 18 |             with self.subTest(test_case=test_case):
 19 |                 self.assertEqual(
 20 |                     test_case[1],
 21 |                     normalizer.normalize(
 22 |                         sentence=test_case[0],
 23 |                     ),
 24 |                 )
 25 |                 self.assertEqual(
 26 |                     test_case[0],
 27 |                     normalizer.denormalize(
 28 |                         sentence=test_case[1][0],
 29 |                         meta=test_case[1][1],
 30 |                     ),
 31 |                 )
 32 | 
 33 |     def test_unicode__chinese_characters_text_normalizer(self):
 34 |         normalizer = unicode__chinese_characters_text_normalizer
 35 |         test_cases = [
 36 |             (
 37 |                 '><我想喝100.3元可樂xd～～',
 38 |                 (
 39 |                     '  我想喝     元可樂    ',
 40 |                     {
 41 |                         '想': ['想'],
 42 |                         ' ': ['>', '<', '1', '0', '0', '.', '3',
 43 |                               'x', 'd', '～', '～'],
 44 |                         '我': ['我'],
 45 |                         '喝': ['喝'],
 46 |                         '樂': ['樂'],
 47 |                         '可': ['可'],
 48 |                         '元': ['元'],
 49 |                     },
 50 |                 ),
 51 |             ),
 52 |         ]
 53 |         self.unit_test(
 54 |             normalizer=normalizer,
 55 |             test_cases=test_cases,
 56 |         )
 57 | 
 58 |     def test_unicode__chinese_characters_and_digits_text_normalizer(self):
 59 |         normalizer = unicode__chinese_characters_and_digits_text_normalizer
 60 |         test_cases = [
 61 |             (
 62 |                 '><我想喝100.3元可樂xd～～',
 63 |                 (
 64 |                     '  我想喝100.3元可樂    ',
 65 |                     {
 66 |                         '0': ['0', '0'],
 67 |                         '想': ['想'],
 68 |                         ' ': ['>', '<', 'x', 'd', '～', '～'],
 69 |                         '我': ['我'],
 70 |                         '喝': ['喝'],
 71 |                         '樂': ['樂'],
 72 |                         '可': ['可'],
 73 |                         '元': ['元'],
 74 |                         '1': ['1'],
 75 |                         '.': ['.'],
 76 |                         '3': ['3'],
 77 |                     },
 78 |                 ),
 79 |             ),
 80 |         ]
 81 |         self.unit_test(
 82 |             normalizer=normalizer,
 83 |             test_cases=test_cases,
 84 |         )
 85 | 
 86 |     def test_unicode__english_characters_and_digits_text_normalizer(self):
 87 |         normalizer = unicode__english_characters_and_digits_text_normalizer
 88 |         test_cases = [
 89 |             (
 90 |                 'hate cola 123!',
 91 |                 (
 92 |                     'hate cola 123 ',
 93 |                     {
 94 |                         'h': ['h'],
 95 |                         'a': ['a', 'a'],
 96 |                         't': ['t'],
 97 |                         'e': ['e'],
 98 |                         'c': ['c'],
 99 |                         'o': ['o'],
100 |                         'l': ['l'],
101 |                         '1': ['1'],
102 |                         '2': ['2'],
103 |                         '3': ['3'],
104 |                         ' ': [' ', ' ', '!'],
105 |                     },
106 |                 ),
107 |             ),
108 |         ]
109 |         self.unit_test(
110 |             normalizer=normalizer,
111 |             test_cases=test_cases,
112 |         )
113 | 
114 |     def test_unicode__english_digits_and_full_punctuations_text_normalizer(self):
115 |         normalizer = unicode__english_digits_and_full_punctuations_text_normalizer
116 |         test_cases = [
117 |             (
118 |                 'hate cola 123!',
119 |                 (
120 |                     'hate cola 123!',
121 |                     {
122 |                         'h': ['h'],
123 |                         'a': ['a', 'a'],
124 |                         't': ['t'],
125 |                         'e': ['e'],
126 |                         'c': ['c'],
127 |                         'o': ['o'],
128 |                         'l': ['l'],
129 |                         '1': ['1'],
130 |                         '2': ['2'],
131 |                         '3': ['3'],
132 |                         '!': ['!'],
133 |                         ' ': [' ', ' '],
134 |                     },
135 |                 ),
136 |             ),
137 |         ]
138 |         self.unit_test(
139 |             normalizer=normalizer,
140 |             test_cases=test_cases,
141 |         )
142 | 
143 |     def test_unicode__chinese_english_digits_and_full_punctuations_text_normalizer(self):
144 |         normalizer = unicode__chinese_english_digits_and_full_punctuations_text_normalizer
145 |         test_cases = [
146 |             (
147 |                 '～我想喝100元『可樂』，cola xd~。',
148 |                 (
149 |                     '~我想喝100元"可樂",cola xd~.',
150 |                     {
151 |                         '0': ['0', '0'],
152 |                         '想': ['想'],
153 |                         ' ': [' '],
154 |                         '~': ['～', '~'],
155 |                         ',': ['，'],
156 |                         '.': ['。'],
157 |                         '"': ['『', '』'],
158 |                         '我': ['我'],
159 |                         '喝': ['喝'],
160 |                         '樂': ['樂'],
161 |                         '可': ['可'],
162 |                         '元': ['元'],
163 |                         '1': ['1'],
164 |                         'c': ['c'],
165 |                         'o': ['o'],
166 |                         'l': ['l'],
167 |                         'a': ['a'],
168 |                         'x': ['x'],
169 |                         'd': ['d'],
170 |                     },
171 |                 ),
172 |             ),
173 |             (
174 |                 '。「」﹁﹂『 』、‧（ ）《》〈〉 ﹏﹏﹏……—～，？；：［］【 】！',
175 |                 (
176 |                     '.""  " ", ( )<><>       ~,?;:[][ ]!',
177 |                     {
178 |                         ' ': ['﹁', '﹂', ' ', '‧', ' ', ' ', '﹏',
179 |                               '﹏', '﹏', '…', '…', '—', ' '],
180 |                         '!': ['！'],
181 |                         '"': ['「', '」', '『', '』'],
182 |                         '(': ['（'],
183 |                         ')': ['）'],
184 |                         ',': ['、', '，'],
185 |                         '.': ['。'],
186 |                         ':': ['：'],
187 |                         ';': ['；'],
188 |                         '<': ['《', '〈'],
189 |                         '>': ['》', '〉'],
190 |                         '?': ['？'],
191 |                         '[': ['［', '【'],
192 |                         ']': ['］', '】'],
193 |                         '~': ['～'],
194 |                     },
195 |                 ),
196 |             ),
197 |         ]
198 |         self.unit_test(
199 |             normalizer=normalizer,
200 |             test_cases=test_cases,
201 |         )
202 | 
203 |     def test_unicode__chinese_english_digits_and_simplified_punctuations_1_text_normalizer(self):
204 |         normalizer = unicode__chinese_english_digits_and_simplified_punctuations_1_text_normalizer
205 |         test_cases = [
206 |             (
207 |                 '><我想喝100元可樂/cola xd～～',
208 |                 (
209 |                     '  我想喝100元可樂,cola xd--',
210 |                     {
211 |                         '0': ['0', '0'],
212 |                         '想': ['想'],
213 |                         ' ': ['>', '<', ' '],
214 |                         '-': ['～', '～'],
215 |                         ',': ['/'],
216 |                         '我': ['我'],
217 |                         '喝': ['喝'],
218 |                         '樂': ['樂'],
219 |                         '可': ['可'],
220 |                         '元': ['元'],
221 |                         '1': ['1'],
222 |                         'c': ['c'],
223 |                         'o': ['o'],
224 |                         'l': ['l'],
225 |                         'a': ['a'],
226 |                         'x': ['x'],
227 |                         'd': ['d'],
228 |                     },
229 |                 ),
230 |             ),
231 |             (
232 |                 '。「」﹁﹂『 』、‧（ ）《》〈〉 ﹏﹏﹏……—～，？；：［］【 】！',
233 |                 (
234 |                     '.       ,               -, ,       ',
235 |                     {
236 |                         ' ': ['「', '」', '﹁', '﹂', '『', ' ', '』',
237 |                               '‧', '（', ' ', '）', '《', '》', '〈', '〉',
238 |                               ' ', '﹏', '﹏', '﹏', '…', '…', '—', '？', '：',
239 |                               '［', '］', '【', ' ', '】', '！'],
240 |                         ',': ['、', '，', '；'],
241 |                         '-': ['～'],
242 |                         '.': ['。'],
243 |                     },
244 |                 ),
245 |             ),
246 |         ]
247 |         self.unit_test(
248 |             normalizer=normalizer,
249 |             test_cases=test_cases,
250 |         )
251 | 


--------------------------------------------------------------------------------
/text_normalizer/library/time.py:
--------------------------------------------------------------------------------
 1 | from ..factory import ReplacePatternWithToken
 2 | 
 3 | 
 4 | time_text_normalizer_hhmm = ReplacePatternWithToken(
 5 |     name='time_hhmm',
 6 |     denormalizable=True,
 7 |     target_pattern=r'[0-2]*\d:[0-5]*\d',
 8 |     prefix_pattern=r'[^\d:]{1}|\A',
 9 |     suffix_pattern=r'[^\d:]{1}|\Z',
10 |     token=' _time_ ',
11 | )
12 | 


--------------------------------------------------------------------------------
/text_normalizer/library/unicode.py:
--------------------------------------------------------------------------------
 1 | from os.path import join
 2 | 
 3 | from ..factory import UnicodeMapping
 4 | from text_normalizer import ROOT_DIR
 5 | 
 6 | 
 7 | unicode__chinese_characters_text_normalizer = UnicodeMapping(
 8 |     unicode_mapping_path=join(
 9 |         ROOT_DIR,
10 |         'data/unicode/chinese_characters_only.txt',
11 |     ),
12 | )
13 | 
14 | unicode__chinese_characters_and_digits_text_normalizer = UnicodeMapping(
15 |     unicode_mapping_path=join(
16 |         ROOT_DIR,
17 |         'data/unicode/chinese_characters_and_digits.txt',
18 |     ),
19 | )
20 | 
21 | unicode__english_characters_and_digits_text_normalizer = UnicodeMapping(
22 |     unicode_mapping_path=join(
23 |         ROOT_DIR,
24 |         'data/unicode/english_characters_and_digits.txt',
25 |     ),
26 | )
27 | 
28 | unicode__english_digits_and_full_punctuations_text_normalizer = UnicodeMapping(
29 |     unicode_mapping_path=join(
30 |         ROOT_DIR,
31 |         'data/unicode/english_digits_and_full_punctuations.txt',
32 |     ),
33 | )
34 | 
35 | unicode__chinese_english_digits_and_full_punctuations_text_normalizer = UnicodeMapping(
36 |     unicode_mapping_path=join(
37 |         ROOT_DIR,
38 |         'data/unicode/chinese_english_digits_and_full_punctuations.txt',
39 |     ),
40 | )
41 | 
42 | unicode__chinese_english_digits_and_simplified_punctuations_1_text_normalizer = \
43 |     UnicodeMapping(
44 |         unicode_mapping_path=join(
45 |             ROOT_DIR,
46 |             'data/unicode/chinese_english_digits_and_simplified_punctuations_1.txt',
47 |         ),
48 |     )
49 | 


--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Yoctol/text-normalizer/3609c10cd229c08b4623531e82d2292fc370734c/utils/__init__.py


--------------------------------------------------------------------------------
/utils/label_propagation.pyx:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | def propagate_label(
  4 |         label: list[int],
  5 |         annotations: list[dict],
  6 |     ) -> list[int]:
  7 |     return propagate_label_in_c(
  8 |         label=label,
  9 |         annotations=annotations,
 10 |     )
 11 | 
 12 | 
 13 | def backpropagate_label(
 14 |         label: list[int],
 15 |         annotations: list[dict],
 16 |     ) -> list[int]:
 17 |     return backpropagate_label_in_c(
 18 |         label=label,
 19 |         annotations=annotations,
 20 |     )
 21 | 
 22 | 
 23 | cdef list propagate_label_in_c(  # noqa: E999
 24 |         list label,
 25 |         list annotations,  # list of dict
 26 |     ):
 27 |     cdef unsigned int i, n_anno
 28 | 
 29 |     n_anno = len(annotations)
 30 |     for i in range(n_anno):
 31 |         label = propagate_label_for_a_pair_of_annotations_in_c(
 32 |             label=label,
 33 |             forward_annotations=annotations[i]['forward'],
 34 |             backward_annotations=annotations[i]['backward'],
 35 |         )
 36 |     return label
 37 | 
 38 | 
 39 | cdef list backpropagate_label_in_c(  # noqa: E999
 40 |         list label,
 41 |         list annotations,
 42 |     ):
 43 |     cdef unsigned int i, j, n_anno
 44 | 
 45 |     n_anno = len(annotations)
 46 |     for i in range(n_anno - 1, -1, -1):
 47 |         label = propagate_label_for_a_pair_of_annotations_in_c(
 48 |             label=label,
 49 |             forward_annotations=annotations[i]['backward'],
 50 |             backward_annotations=annotations[i]['forward'],
 51 |         )
 52 |     return label
 53 | 
 54 | 
 55 | cdef list propagate_label_for_a_pair_of_annotations_in_c(  # noqa: E999
 56 |         list label,
 57 |         list forward_annotations,  # list of tuples
 58 |         list backward_annotations,  # list of tuples
 59 |     ):
 60 | 
 61 |     cdef unsigned int i, n_fmodif, n_bmodif, current_pt
 62 |     cdef list output_label
 63 | 
 64 |     n_fmodif = len(forward_annotations)
 65 |     n_bmodif = len(backward_annotations)
 66 | 
 67 |     if n_fmodif != n_bmodif:
 68 |         raise ValueError(
 69 |             f'number of forward and backward modifications is not the same')
 70 | 
 71 |     if n_bmodif == 0:
 72 |         # no modification return label
 73 |         return label
 74 | 
 75 |     output_label = [0] * (2 * n_fmodif + 1)
 76 |     current_pt = 0
 77 |     for i in range(n_fmodif):
 78 |         # before annotations
 79 |         output_label[2 * i] = label[current_pt: forward_annotations[i][0]]
 80 | 
 81 |         # annotate
 82 |         merged_label = get_high_freq_label(
 83 |             label[forward_annotations[i][0]: forward_annotations[i][1]])
 84 |         n_labels = backward_annotations[i][1] - backward_annotations[i][0]
 85 |         output_label[2 * i + 1] = [merged_label] * n_labels
 86 | 
 87 |         current_pt = forward_annotations[i][1]
 88 | 
 89 |     output_label[-1] = label[forward_annotations[-1][1]:]
 90 | 
 91 |     output_label = sum(output_label, [])
 92 |     return output_label
 93 | 
 94 | 
 95 | cdef unsigned int get_high_freq_label(  # noqa: E999
 96 |     list label):
 97 | 
 98 |     cdef unsigned int max_f, label_f
 99 |     cdef dict record = {}
100 | 
101 |     max_f = 0
102 |     label_f = 0
103 |     for l in label:
104 |         if l not in record:
105 |             record[l] = 1
106 |         else:
107 |             record[l] += 1
108 | 
109 |         if record[l] > max_f:
110 |             max_f = record[l]
111 |             label_f = l
112 |     return label_f
113 | 


--------------------------------------------------------------------------------
/utils/setup_utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Yoctol/text-normalizer/3609c10cd229c08b4623531e82d2292fc370734c/utils/setup_utils/__init__.py


--------------------------------------------------------------------------------
/utils/setup_utils/get_ext.py:
--------------------------------------------------------------------------------
 1 | from pathlib import Path
 2 | 
 3 | from setuptools import Extension
 4 | 
 5 | 
 6 | def get_ext_modules_n_cmdclass():
 7 | 
 8 |     root_path = Path('.')
 9 | 
10 |     try:
11 |         from Cython.Distutils import build_ext
12 |     except ImportError:
13 |         use_cython = False
14 |     else:
15 |         use_cython = True
16 | 
17 |     cmdclass = {}
18 |     ext_modules = []
19 |     if use_cython:
20 |         # get all .pyx files
21 |         pyx_paths = sorted(root_path.rglob("*.pyx"))
22 |         for pyx_path in pyx_paths:
23 |             path_str = str(pyx_path)
24 |             header = pyx_path.read_text().split('\n')[0]
25 |             if ('cpp' in header) or ('c++' in header):
26 |                 language = 'c++'
27 |             else:
28 |                 language = 'c'
29 | 
30 |             extension = Extension(
31 |                 path_str[:-4].replace('/', '.'),
32 |                 [path_str],
33 |                 language=language,
34 |             )
35 | 
36 |             # Have Cython embed function call signature information in docstrings,
37 |             # so that Sphinx can extract and use those signatures.
38 |             extension.cython_directives = {"embedsignature": True}
39 |             ext_modules.append(extension)
40 |         cmdclass.update({'build_ext': build_ext})
41 | 
42 |     else:
43 |         # .c files
44 |         c_paths = sorted(root_path.rglob("*.c"))
45 |         for c_path in c_paths:
46 |             path_str = str(c_path)
47 |             ext_modules.append(
48 |                 Extension(
49 |                     path_str[:-2].replace('/', '.'),
50 |                     [path_str],
51 |                 ),
52 |             )
53 | 
54 |         # .cpp files
55 |         cpp_paths = sorted(root_path.rglob("*.cpp"))
56 |         for cpp_path in cpp_paths:
57 |             path_str = str(cpp_path)
58 |             ext_modules.append(
59 |                 Extension(
60 |                     path_str[:-4].replace('/', '.'),
61 |                     [path_str],
62 |                 ),
63 |             )
64 | 
65 |     return ext_modules, cmdclass
66 | 


--------------------------------------------------------------------------------
/utils/setup_utils/remove_so_files.py:
--------------------------------------------------------------------------------
 1 | from pathlib import Path
 2 | import os
 3 | 
 4 | 
 5 | def remove_so_files():
 6 |     so_paths = sorted(
 7 |         Path('./strpipe').rglob(
 8 |             "*.cpython-36m-x86_64-linux-gnu.so",
 9 |         ),
10 |     )
11 |     for path in so_paths:
12 |         os.remove(str(path.resolve()))
13 | 
14 | 
15 | if __name__ == '__main__':
16 |     remove_so_files()
17 | 


--------------------------------------------------------------------------------
/utils/test/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Yoctol/text-normalizer/3609c10cd229c08b4623531e82d2292fc370734c/utils/test/__init__.py


--------------------------------------------------------------------------------
/utils/test/test_label_propagation.py:
--------------------------------------------------------------------------------
 1 | from unittest import TestCase
 2 | 
 3 | from ..label_propagation import (
 4 |     # propagate_label,
 5 |     backpropagate_label,
 6 | )
 7 | 
 8 | 
 9 | class LabelPropagationTestCase(TestCase):
10 | 
11 |     @classmethod
12 |     def setUpClass(cls):
13 |         """
14 |         input str: 我想買10元的100c.c.飲料
15 |         result of normalization:
16 |         我想買_int_元的_int_c.c.飲料
17 |         meta = {
18 |             'forward': [(3,5, '10'), (7, 10, '100')],
19 |             'backward': [(3,8, '_int_'), (10, 15, '_int_')],
20 |         }
21 | 
22 |         """
23 |         cls.meta = {
24 |             'forward': [(3, 5, '10'), (7, 10, '100')],
25 |             'backward': [(3, 8, '_int_'), (10, 15, '_int_')],
26 |         }
27 |         cls.label = [0, 0, 0, 1, 1, 1, 1, 1, 1,
28 |                      0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0]
29 |         cls.expected_label = [0, 0, 0, 1, 1, 1, 0, 2, 2, 2, 2, 2, 2, 2, 0, 0]
30 | 
31 |     def test_backpropagate_label(self):
32 |         output = backpropagate_label(
33 |             label=self.label,
34 |             annotations=[self.meta],
35 |         )
36 |         self.assertEqual(self.expected_label, output)
37 | 


--------------------------------------------------------------------------------