├── icu_tokenizer ├── bin │ ├── __init__.py │ ├── split.py │ ├── tokenize.py │ └── normalize.py ├── __init__.py ├── sent_splitter.py ├── __main__.py ├── utils.py ├── tokenizer.py ├── url_utils.py └── normalizer.py ├── MANIFEST.in ├── requirements.txt ├── docs ├── docutils.conf ├── requirements.txt ├── _static │ └── theme_overrides.css ├── api.rst ├── index.rst ├── Makefile ├── tools.rst └── conf.py ├── setup.cfg ├── LICENSE ├── INSTALL.md ├── setup.py ├── .github └── workflows │ └── build-docs.yml ├── README.md └── .gitignore /icu_tokenizer/bin/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tqdm 2 | PyICU 3 | regex 4 | -------------------------------------------------------------------------------- /docs/docutils.conf: -------------------------------------------------------------------------------- 1 | [writers] 2 | option_limit=0 3 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = D100,D104,D401 3 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx 2 | sphinx-argparse 3 | sphinx_rtd_theme 4 | -------------------------------------------------------------------------------- /icu_tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | from icu_tokenizer.normalizer import Normalizer 2 | from icu_tokenizer.sent_splitter import SentSplitter 3 | from icu_tokenizer.tokenizer import Tokenizer 4 | 5 | __all__ = ['Normalizer', 'SentSplitter', 'Tokenizer'] 6 | -------------------------------------------------------------------------------- /docs/_static/theme_overrides.css: -------------------------------------------------------------------------------- 1 | .wy-table-responsive table td kbd { 2 | white-space: nowrap; 3 | } 4 | .wy-table-responsive table td { 5 | white-space: normal !important; 6 | } 7 | .wy-table-responsive { 8 | overflow: visible !important; 9 | } 10 | -------------------------------------------------------------------------------- /docs/api.rst: -------------------------------------------------------------------------------- 1 | Python API Reference 2 | ==================== 3 | 4 | 5 | Sentence Splitter 6 | ----------------- 7 | 8 | .. autoclass:: icu_tokenizer.sent_splitter.SentSplitter 9 | :members: 10 | 11 | .. automethod:: __init__ 12 | 13 | 14 | Normalizer 15 | ---------- 16 | 17 | .. autoclass:: icu_tokenizer.normalizer.Normalizer 18 | :members: 19 | 20 | .. automethod:: __init__ 21 | 22 | 23 | Tokenizer 24 | --------- 25 | 26 | .. autoclass:: icu_tokenizer.tokenizer.Tokenizer 27 | :members: 28 | 29 | .. automethod:: __init__ 30 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. 
ICU-Tokenizer documentation master file, created by 2 | sphinx-quickstart on Thu Jun 18 23:55:49 2020. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to ICU-Tokenizer's documentation! 7 | ========================================= 8 | 9 | **ICU-tokenizer** is a python package used to perform universal language 10 | normalization and tokenization using the International Components for 11 | Unicode. 12 | 13 | .. toctree:: 14 | :maxdepth: 2 15 | :caption: Library Reference 16 | 17 | tools 18 | api 19 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = ../build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /icu_tokenizer/sent_splitter.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from icu import BreakIterator, Locale 4 | 5 | from icu_tokenizer.utils import apply_break_iterator 6 | 7 | 8 | class SentSplitter(object): 9 | """ICU sentence splitter. 10 | 11 | Usage: 12 | 13 | >>> splitter = SentSplitter(lang) 14 | >>> sents: List[str] = splitter.split(paragraph) 15 | """ 16 | 17 | def __init__(self, lang: str = 'en'): 18 | """SentSplitter.""" 19 | self.lang = lang 20 | self.locale = Locale(lang) 21 | self.break_iterator = \ 22 | BreakIterator.createSentenceInstance(self.locale) 23 | 24 | def split(self, text: str) -> List[str]: 25 | """Split text into sentences with the ICU sentence splitter.""" 26 | return apply_break_iterator(self.break_iterator, text) 27 | -------------------------------------------------------------------------------- /docs/tools.rst: -------------------------------------------------------------------------------- 1 | Commandline Tools 2 | ================= 3 | 4 | **ICU-Tokenizer** provides its full functionality through the 5 | commandline. The commandline tools can be accessed by calling the module 6 | as a script. 7 | 8 | :: 9 | 10 | python -m icu_tokenizer 11 | 12 | 13 | Sentence Splitting 14 | ------------------ 15 | 16 | .. automodule:: icu_tokenizer.bin.split 17 | .. argparse:: 18 | :module: icu_tokenizer.__main__ 19 | :func: make_parser 20 | :path: split 21 | 22 | 23 | Normalize 24 | --------- 25 | 26 | .. automodule:: icu_tokenizer.bin.normalize 27 | .. argparse:: 28 | :module: icu_tokenizer.__main__ 29 | :func: make_parser 30 | :path: normalize 31 | 32 | 33 | Tokenize 34 | -------- 35 | 36 | .. automodule:: icu_tokenizer.bin.tokenize 37 | ..
argparse:: 38 | :module: icu_tokenizer.__main__ 39 | :func: make_parser 40 | :path: tokenize 41 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 mingruimingrui 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /icu_tokenizer/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from types import ModuleType 3 | from typing import Dict 4 | 5 | from icu_tokenizer.bin import normalize as normalize_module 6 | from icu_tokenizer.bin import split as split_module 7 | from icu_tokenizer.bin import tokenize as tokenize_module 8 | 9 | SUBCOMMANDS: Dict[str, ModuleType] = { 10 | 'normalize': normalize_module, 11 | 'split': split_module, 12 | 'tokenize': tokenize_module, 13 | } 14 | 15 | 16 | def make_parser() -> argparse.ArgumentParser: 17 | """Make the parser for the main program.""" 18 | parser = argparse.ArgumentParser( 19 | prog='python3 -m icu_tokenizer', 20 | description='ICU based universal language tokenizer' 21 | ) 22 | subparsers = parser.add_subparsers(dest='subcommand') 23 | 24 | for subcommand, module in SUBCOMMANDS.items(): 25 | module.add_options(subparsers.add_parser( 26 | subcommand, help=module.__doc__, 27 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 28 | )) 29 | 30 | return parser 31 | 32 | 33 | if __name__ == "__main__": 34 | parser = make_parser() 35 | args = parser.parse_args() 36 | 37 | if args.subcommand in SUBCOMMANDS: 38 | module = SUBCOMMANDS[args.subcommand] 39 | module.main(args) 40 | else: 41 | parser.print_help() 42 | -------------------------------------------------------------------------------- /INSTALL.md: -------------------------------------------------------------------------------- 1 | ## Requirements 2 | 3 | **ICU-Tokenizer** requires the following tools and libraries for a 4 | successful installation. 5 | 6 | - ICU library 7 | - GNU C++11 compiler 8 | 9 | ### Installing the ICU Library 10 | 11 | #### Conda 12 | 13 | ```sh 14 | conda install icu pkg-config 15 | 16 | # Or if you wish to use the latest version of the ICU library, 17 | # the conda-forge channel typically contains a more up-to-date version. 
18 | conda install -c conda-forge icu 19 | ``` 20 | 21 | #### MacOS 22 | 23 | ```sh 24 | # With homebrew 25 | brew install icu4c 26 | 27 | 28 | # With macports 29 | port install icu 30 | ``` 31 | 32 | #### Windows 33 | 34 | ```sh 35 | # You can download their binaries from their site or just build from source. 36 | # It's probably easier to use conda, docker, wsl or dual boot linux. 37 | # Good luck! 38 | ``` 39 | 40 | #### Debian/Ubuntu 41 | 42 | ```sh 43 | apt update 44 | apt install libicu-dev pkg-config 45 | ``` 46 | 47 | #### Archlinux 48 | 49 | ```sh 50 | pacman -Sy icu 51 | ``` 52 | 53 | #### Fedora/RHEL/Centos 54 | 55 | ```sh 56 | yum install libicu-devel 57 | ``` 58 | 59 | ## Installation 60 | 61 | This package is released on PyPI, install with `pip`. 62 | 63 | ```sh 64 | pip install ICU-Tokenizer 65 | ``` 66 | 67 | On MacOS, it is likely necessary to specify the compiler and path to the 68 | `icu-config` tool. 69 | 70 | ```sh 71 | CFLAGS="-std=c++11" PATH="/usr/local/opt/icu4c/bin:$PATH" \ 72 | pip install ICU-Tokenizer 73 | ``` 74 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | 4 | def get_long_description(): 5 | """Reads and return README as a string.""" 6 | with open('README.md', 'rb') as f: 7 | return f.read().decode('utf-8', errors='ignore') 8 | 9 | 10 | with open('requirements.txt', 'r') as f: 11 | install_requires = f.read() 12 | 13 | 14 | setuptools.setup( 15 | name='icu_tokenizer', 16 | version='0.0.1', 17 | author='Wang Ming Rui', 18 | author_email='mingruimingrui@hotmail.com', 19 | description="ICU based universal language tokenizer", 20 | long_description=get_long_description(), 21 | long_description_content_type="text/markdown", 22 | url="https://github.com/mingruimingrui/ICU-tokenizer", 23 | 24 | install_requires=install_requires, 25 | packages=['icu_tokenizer', 'icu_tokenizer.bin'], 26 | 27 | classifiers=[ 28 | 'Intended Audience :: Developers', 29 | 'Intended Audience :: Science/Research', 30 | 'Programming Language :: Python', 31 | 'Programming Language :: Python :: 3.6', 32 | 'Programming Language :: Python :: 3.7', 33 | 'Programming Language :: Python :: 3.8', 34 | 'Programming Language :: Python :: 3.9', 35 | 'License :: OSI Approved :: MIT License', 36 | 'Topic :: Scientific/Engineering', 37 | 'Topic :: Software Development', 38 | 'Topic :: Software Development :: Libraries', 39 | 'Topic :: Software Development :: Libraries :: Python Modules', 40 | 'Topic :: Software Development :: Localization', 41 | ], 42 | license='MIT License' 43 | ) 44 | -------------------------------------------------------------------------------- /.github/workflows/build-docs.yml: -------------------------------------------------------------------------------- 1 | name: Build documentations 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | create: 8 | tags: 9 | - v* 10 | 11 | jobs: 12 | build: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v2 16 | 17 | - name: Set up Python 3.7 18 | uses: actions/setup-python@v2 19 | with: 20 | python-version: 3.7 21 | 22 | - name: Install dependencies 23 | run: sudo apt install -y g++ libicu-dev pkg-config 24 | env: 25 | DEBIAN_FRONTEND: noninteractive 26 | 27 | - name: Install python packages 28 | run: | 29 | python -m pip install --no-cache-dir . 
30 | python -m pip install --no-cache-dir -r docs/requirements.txt 31 | 32 | - name: Build docs 33 | run: sphinx-build -b html docs public 34 | 35 | # Publish built docs to gh-pages branch. 36 | # =============================== 37 | - name: Commit documentation changes 38 | run: | 39 | git clone https://github.com/mingruimingrui/ICU-tokenizer.git --branch gh-pages --single-branch gh-pages 40 | cp -r public/* gh-pages/ 41 | cd gh-pages 42 | git config --local user.email "action@github.com" 43 | git config --local user.name "GitHub Action" 44 | git add . 45 | git commit -m "Update documentation" -a || true 46 | - name: Push changes 47 | uses: ad-m/github-push-action@master 48 | with: 49 | branch: gh-pages 50 | directory: gh-pages 51 | github_token: ${{ secrets.GITHUB_TOKEN }} 52 | # =============================== 53 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | **ICU-tokenizer** is a python package used to perform universal language 2 | normalization and tokenization using the International Components for 3 | Unicode. 4 | 5 | - [Install](#install) 6 | - [Usage (Python)](#usage-python) 7 | - [Sentence splitter](#sentence-splitter) 8 | - [Normalizer](#normalizer) 9 | - [Tokenizer](#tokenizer) 10 | 11 | ## Install 12 | 13 | See [./INSTALL.md](./INSTALL.md) 14 | 15 | ## Usage (Python) 16 | 17 | ### Sentence splitter 18 | 19 | ```py 20 | # To split a paragraph into multiple sentences 21 | >>> from icu_tokenizer import SentSplitter 22 | >>> splitter = SentSplitter('zh') 23 | 24 | >>> paragraph = """ 25 | 美国最高法院(英语:Supreme Court of the United States),一般是指美国联邦最高法院,是美国最高级别的联邦法院,为美国三权继总统、国会后最为重要的一环。根据1789年《美国宪法第三条》的规定,最高法院对所有联邦法院、州法院和涉及联邦法律问题的诉讼案件具有最终(并且在很大程度上是有斟酌决定权的)上诉管辖权,以及对小范围案件的具有初审管辖权。在美国的法律制度中,最高法院通常是包括《美国宪法》在内的联邦法律的最终解释者,但仅在具有管辖权的案件范围内。法院不享有判定政治问题的权力;政治问题的执法机关是行政机关,而不是政府的司法部门。 26 | """ 27 | >>> splitter.split(paragraph) 28 | [ 29 | '美国最高法院(英语:Supreme Court of the United States),一般是指美国联邦最高法院,是美国最高级别的联邦法院,为美国三权继总统、国会后最为重要的一环。', 30 | '根据1789年《美国宪法第三条》的规定,最高法院对所有联邦法院、州法院和涉及联邦法律问题的诉讼案件具有最终(并且在很大程度上是有斟酌决定权的)上诉管辖权,以及对小范围案件的具有初审管辖权。', 31 | '在美国的法律制度中,最高法院通常是包括《美国宪法》在内的联邦法律的最终解释者,但仅在具有管辖权的案件范围内。', 32 | '法院不享有判定政治问题的权力;政治问题的执法机关是行政机关,而不是政府的司法部门。' 33 | ] 34 | ``` 35 | 36 | ### Normalizer 37 | 38 | ```py 39 | # To normalize text 40 | >>> from icu_tokenizer import Normalizer 41 | >>> normalizer = Normalizer(lang='en', norm_puncts=True) 42 | 43 | >>> text = "𝑻𝒉𝒆 𝒑𝒓𝒐𝒅𝒖𝒄𝒕𝒔 𝒚𝒐𝒖 𝒐𝒓𝒅𝒆𝒓𝒆𝒅 𝒘𝒊𝒍𝒍 𝒃𝒆 𝒔𝒉𝒊𝒑𝒑𝒆𝒅 𝒅𝒊𝒓𝒆𝒄𝒕𝒍𝒚 𝒇𝒓𝒐𝒎 𝑲𝒐𝒓𝒆𝒂." 44 | >>> normalizer.normalize(text) 45 | "The products you ordered will be shipped directly from Korea." 46 | 47 | >>> text = "【】()" 48 | >>> normalizer.normalize(text) 49 | "[]()" 50 | ``` 51 | 52 | ### Tokenizer 53 | 54 | ```py 55 | >>> from icu_tokenizer import Tokenizer 56 | >>> tokenizer = Tokenizer(lang='th') 57 | 58 | >>> text = "ภาษาไทยเป็นภาษาที่มีระดับเสียงของคำแน่นอนหรือวรรณยุกต์เช่นเดียวกับภาษาจีน และออกเสียงแยกคำต่อคำ" 59 | >>> tokenizer.tokenize(text) 60 | ['ภาษา', 'ไทย', 'เป็น', 'ภาษา', 'ที่', 'มี', 'ระดับ', 'เสียง', 'ของ', 'คำ', 'แน่นอน', 'หรือ', 'วรรณยุกต์', 'เช่น', 'เดียว', 'กับ', 'ภาษา', 'จีน', 'และ', 'ออก', 'เสียง', 'แยก', 'คำ', 'ต่อ', 'คำ'] 61 | ``` 62 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 
2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | sys.path.insert(0, os.path.abspath('..')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'ICU-Tokenizer' 21 | copyright = '2020, Ming Rui' 22 | author = 'Ming Rui' 23 | 24 | # The full version, including alpha/beta/rc tags 25 | release = '0.0.1' 26 | 27 | 28 | # -- General configuration --------------------------------------------------- 29 | 30 | # Add any Sphinx extension module names here, as strings. They can be 31 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 32 | # ones. 33 | extensions = [ 34 | 'sphinx.ext.autodoc', 35 | 'sphinx.ext.intersphinx', 36 | 'sphinx.ext.viewcode', 37 | 'sphinx.ext.napoleon', 38 | 'sphinxarg.ext' 39 | ] 40 | 41 | # Add any paths that contain templates here, relative to this directory. 42 | templates_path = ['_templates'] 43 | 44 | # List of patterns, relative to source directory, that match files and 45 | # directories to ignore when looking for source files. 46 | # This pattern also affects html_static_path and html_extra_path. 47 | exclude_patterns = [] 48 | 49 | 50 | # -- Options for HTML output ------------------------------------------------- 51 | 52 | # The theme to use for HTML and HTML Help pages. See the documentation for 53 | # a list of builtin themes. 54 | # 55 | html_theme = 'sphinx_rtd_theme' 56 | html4_writer = True 57 | 58 | # Add any paths that contain custom static files (such as style sheets) here, 59 | # relative to this directory. They are copied after the builtin static files, 60 | # so a file named "default.css" will overwrite the builtin "default.css". 
61 | html_static_path = ['_static'] 62 | 63 | # Additional theme customization 64 | html_context = { 65 | 'css_files': [ 66 | '_static/theme_overrides.css', # override wide tables in RTD theme 67 | ], 68 | } 69 | -------------------------------------------------------------------------------- /icu_tokenizer/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import argparse 4 | import sys 5 | from typing import List 6 | 7 | import icu 8 | import regex 9 | from icu import BreakIterator 10 | 11 | 12 | def apply_break_iterator( 13 | break_iterator: BreakIterator, 14 | text: str 15 | ) -> List[str]: 16 | """Apply ICU break iterator on a text.""" 17 | break_iterator.setText(text) 18 | parts = [] 19 | p0 = 0 20 | for p1 in break_iterator: 21 | part = text[p0:p1].strip() 22 | if len(part) > 0: 23 | parts.append(part) 24 | p0 = p1 25 | return parts 26 | 27 | 28 | def get_all_unicode_chars(): 29 | """Get all unicode characters.""" 30 | all_unicode_chars = [] 31 | i = 0 32 | while True: 33 | try: 34 | all_unicode_chars.append(chr(i)) 35 | except ValueError: 36 | break 37 | i += 1 38 | return all_unicode_chars 39 | 40 | 41 | def get_versions() -> dict: 42 | """Get versions of the various dependecies related to icu_tokenizer.""" 43 | versions = { 44 | 'icu': icu.ICU_VERSION, 45 | 'PyICU': icu.VERSION, 46 | 'regex': regex.__version__ 47 | } 48 | 49 | try: 50 | import opencc 51 | versions['opencc'] = opencc.__version__ 52 | except ImportError: 53 | pass 54 | 55 | return versions 56 | 57 | 58 | class TextFileType(argparse.FileType): 59 | """argparse.FileType modified for utf-8 text files.""" 60 | 61 | def __init__(self, mode: str = 'r', bufsize: int = -1): 62 | """TextFileType.""" 63 | self._mode = mode 64 | self._bufsize = bufsize 65 | self._encoding = 'utf-8' 66 | self._errors = 'ignore' 67 | 68 | def __call__(self, string): # noqa 69 | # the special argument "-" means sys.std{in,out} 70 | if string == '-': 71 | if 'r' in self._mode: 72 | return sys.stdin 73 | elif 'w' in self._mode: 74 | return sys.stdout 75 | else: 76 | msg = 'argument "-" with mode {}'.format(self._mode) 77 | raise ValueError(msg) 78 | 79 | # all other arguments are used as file names 80 | try: 81 | return open(string, self._mode, self._bufsize, self._encoding, 82 | self._errors) 83 | except OSError as e: 84 | msg = "can't open '{}': {}".format(string, e) 85 | raise argparse.ArgumentTypeError(msg) 86 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # Development 132 | .vscode 133 | sync.sh 134 | -------------------------------------------------------------------------------- /icu_tokenizer/bin/split.py: -------------------------------------------------------------------------------- 1 | """Split lines containing multiple sentences.""" 2 | 3 | import sys 4 | import argparse 5 | 6 | from tqdm import tqdm 7 | 8 | from icu_tokenizer.sent_splitter import SentSplitter 9 | from icu_tokenizer.utils import TextFileType 10 | 11 | CACHE = {} 12 | 13 | 14 | def add_options(parser: argparse.ArgumentParser): 15 | """Add options to a parser.""" 16 | parser.add_argument( 17 | '-l', '--lang', type=str, default='en', 18 | help='Language identifier') 19 | 20 | parser.add_argument( 21 | '-i', '--inputs', type=TextFileType('r'), 22 | nargs='+', default=[sys.stdin], 23 | help='Input files. Defaults to stdin.') 24 | parser.add_argument( 25 | '-o', '--output', type=TextFileType('w'), default=sys.stdout, 26 | help='Output file. 
Defaults to stdout.') 27 | 28 | parser.add_argument( 29 | '-j', '--num-workers', type=int, default=0, 30 | help='Number of processes to use') 31 | parser.add_argument( 32 | '--show-pbar', action='store_true', 33 | help='Show progressbar') 34 | parser.add_argument( 35 | '--verbose', action='store_true', 36 | help='Print splits to stderr') 37 | 38 | 39 | def main(args: argparse.Namespace): # noqa 40 | if args.num_workers == 0: 41 | import multiprocessing.dummy as multiprocessing 42 | args.num_workers = 1 43 | else: 44 | import multiprocessing 45 | 46 | if args.num_workers < 0: # Use all cores 47 | args.num_workers = multiprocessing.cpu_count() 48 | 49 | def create_chunk_input_stream(): 50 | chunk = [] 51 | for f in args.inputs: 52 | for line in f: 53 | chunk.append(line) 54 | if len(chunk) >= 256: 55 | yield chunk 56 | chunk = [] 57 | if len(chunk) > 0: 58 | yield chunk 59 | 60 | pbar = None 61 | if args.show_pbar: 62 | pbar = tqdm() 63 | 64 | with multiprocessing.Pool( 65 | args.num_workers, 66 | initializer=worker_init_fn, 67 | initargs=[args.lang] 68 | ) as pool: 69 | for chunk in pool.imap(worker_fn, create_chunk_input_stream()): 70 | if pbar is not None: 71 | pbar.update(len(chunk)) 72 | for sents in chunk: 73 | for sent in sents: 74 | args.output.write(sent + '\n') 75 | if args.verbose and len(sents) > 1: 76 | sys.stderr.write('\rSplitting done: {}\n'.format(sents)) 77 | sys.stderr.flush() 78 | args.output.flush() 79 | 80 | if pbar is not None: 81 | pbar.close() 82 | 83 | 84 | def worker_init_fn(lang: str): # noqa 85 | CACHE['sent_splitter'] = SentSplitter(lang) 86 | 87 | 88 | def worker_fn(texts): # noqa 89 | split_fn = CACHE['sent_splitter'].split 90 | return [split_fn(t) for t in texts] 91 | -------------------------------------------------------------------------------- /icu_tokenizer/bin/tokenize.py: -------------------------------------------------------------------------------- 1 | """Tokenize text using unicode properties.""" 2 | 3 | import sys 4 | import argparse 5 | 6 | from tqdm import tqdm 7 | 8 | from icu_tokenizer.tokenizer import Tokenizer 9 | from icu_tokenizer.utils import TextFileType 10 | 11 | CACHE = {} 12 | 13 | 14 | def add_options(parser: argparse.ArgumentParser): 15 | """Add options to a parser.""" 16 | parser.add_argument( 17 | '-i', '--inputs', type=TextFileType('r'), 18 | nargs='+', default=[sys.stdin], 19 | help='Input files. Defaults to stdin.') 20 | parser.add_argument( 21 | '-o', '--output', type=TextFileType('w'), default=sys.stdout, 22 | help='Output file. 
Defaults to stdout.') 23 | 24 | parser.add_argument( 25 | '-l', '--lang', type=str, default='en', 26 | help='Language identifier') 27 | parser.add_argument( 28 | '-a', '--annotate-hyphens', action='store_true', 29 | help='Annotate hyphens similar to moses') 30 | parser.add_argument( 31 | '-url', '--protect-urls', action='store_true', 32 | help='Protect url patterns') 33 | 34 | parser.add_argument( 35 | '-j', '--num-workers', type=int, default=0, 36 | help='Number of processes to use') 37 | parser.add_argument( 38 | '--show-pbar', action='store_true', 39 | help='Show progressbar') 40 | 41 | 42 | def main(args: argparse.Namespace): # noqa 43 | if args.num_workers == 0: 44 | import multiprocessing.dummy as multiprocessing 45 | args.num_workers = 1 46 | else: 47 | import multiprocessing 48 | 49 | if args.num_workers < 0: # Use all cores 50 | args.num_workers = multiprocessing.cpu_count() 51 | 52 | def create_chunk_input_stream(): 53 | chunk = [] 54 | for f in args.inputs: 55 | for line in f: 56 | chunk.append(line) 57 | if len(chunk) >= 256: 58 | yield chunk 59 | chunk = [] 60 | if len(chunk) > 0: 61 | yield chunk 62 | 63 | pbar = None 64 | if args.show_pbar: 65 | pbar = tqdm() 66 | 67 | with multiprocessing.Pool( 68 | args.num_workers, 69 | initializer=worker_init_fn, 70 | initargs=[args.lang, args.annotate_hyphens, args.protect_urls] 71 | ) as pool: 72 | for chunk in pool.imap(worker_fn, create_chunk_input_stream()): 73 | if pbar is not None: 74 | pbar.update(len(chunk)) 75 | for line in chunk: 76 | args.output.write(line + '\n') 77 | args.output.flush() 78 | 79 | if pbar is not None: 80 | pbar.close() 81 | 82 | 83 | def worker_init_fn(lang: str, annotate_hyphens: bool, protect_urls: bool): # noqa 84 | CACHE['tokenizer'] = Tokenizer( 85 | lang, 86 | annotate_hyphens=annotate_hyphens, 87 | protect_urls=protect_urls 88 | ) 89 | 90 | 91 | def worker_fn(texts): # noqa 92 | tokenize_fn = CACHE['tokenizer'].tokenize 93 | return [' '.join(tokenize_fn(t)) for t in texts] 94 | -------------------------------------------------------------------------------- /icu_tokenizer/bin/normalize.py: -------------------------------------------------------------------------------- 1 | """Normalize text using unicode properties.""" 2 | 3 | import sys 4 | import argparse 5 | 6 | from tqdm import tqdm 7 | 8 | from icu_tokenizer.normalizer import Normalizer 9 | from icu_tokenizer.utils import TextFileType 10 | 11 | CACHE = {} 12 | 13 | 14 | def add_options(parser: argparse.ArgumentParser): 15 | """Add options to a parser.""" 16 | parser.add_argument( 17 | '-l', '--lang', type=str, default='en', 18 | help='Language identifier') 19 | parser.add_argument( 20 | '-p', '--norm-puncts', action='store_true', 21 | help='Normalize punctuations') 22 | parser.add_argument( 23 | '-lc', '--lowercase', action='store_true', 24 | help='Cast all characters to lowercase') 25 | 26 | parser.add_argument( 27 | '-i', '--inputs', type=TextFileType('r'), 28 | nargs='+', default=[sys.stdin], 29 | help='Input files. Defaults to stdin.') 30 | parser.add_argument( 31 | '-o', '--output', type=TextFileType('w'), default=sys.stdout, 32 | help='Output file. 
Defaults to stdout.') 33 | 34 | parser.add_argument( 35 | '-j', '--num-workers', type=int, default=0, 36 | help='Number of processes to use') 37 | parser.add_argument( 38 | '--show-pbar', action='store_true', 39 | help='Show progressbar') 40 | 41 | 42 | def main(args: argparse.Namespace): # noqa 43 | if args.num_workers == 0: 44 | import multiprocessing.dummy as multiprocessing 45 | args.num_workers = 1 46 | else: 47 | import multiprocessing 48 | 49 | if args.num_workers < 0: # Use all cores 50 | args.num_workers = multiprocessing.cpu_count() 51 | 52 | def create_chunk_input_stream(): 53 | chunk = [] 54 | for f in args.inputs: 55 | for line in f: 56 | chunk.append(line) 57 | if len(chunk) >= 256: 58 | yield chunk 59 | chunk = [] 60 | if len(chunk) > 0: 61 | yield chunk 62 | 63 | pbar = None 64 | if args.show_pbar: 65 | pbar = tqdm() 66 | 67 | with multiprocessing.Pool( 68 | args.num_workers, 69 | initializer=worker_init_fn, 70 | initargs=[args.lang, args.norm_puncts, args.lowercase] 71 | ) as pool: 72 | for chunk in pool.imap(worker_fn, create_chunk_input_stream()): 73 | if pbar is not None: 74 | pbar.update(len(chunk)) 75 | for line in chunk: 76 | args.output.write(line + '\n') 77 | args.output.flush() 78 | 79 | if pbar is not None: 80 | pbar.close() 81 | 82 | 83 | def worker_init_fn(lang: str, norm_puncts: bool, lowercase: bool): # noqa 84 | CACHE['normalizer'] = Normalizer(lang, norm_puncts) 85 | CACHE['lowercase'] = lowercase 86 | 87 | 88 | def worker_fn(texts): # noqa 89 | normalize_fn = CACHE['normalizer'].normalize 90 | texts = [normalize_fn(t) for t in texts] 91 | if CACHE['lowercase']: 92 | texts = [t.lower() for t in texts] 93 | return texts 94 | -------------------------------------------------------------------------------- /icu_tokenizer/tokenizer.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import List, Union 3 | 4 | from icu import BreakIterator, Locale 5 | 6 | from icu_tokenizer.url_utils import email_pattern, grubber_url_matcher 7 | from icu_tokenizer.utils import apply_break_iterator 8 | 9 | PROTECTED_TEMPLATE = '__PROTECTED_SEQUENCE_{}__' 10 | 11 | 12 | class Tokenizer(object): 13 | """ICU based tokenizer with additional functionality to protect sequences. 14 | 15 | Usage: 16 | 17 | >>> tokenizer = Tokenizer( 18 | lang, 19 | annotate_hyphens: bool, 20 | protect_emails_urls: bool, 21 | extra_protected_patterns: List[Union[str, re.Pattern]] = [], 22 | ) 23 | >>> tokens: List[str] = tokenizer.tokenize(text) 24 | """ 25 | 26 | HYPHEN_PATTERN = re.compile(r'(\w)\-(?=\w)') 27 | HYPHEN_PATTERN_REPL = r'\1 @-@ ' 28 | PROTECTED_HYPHEN_PATTERN = re.compile(r'@\-@') 29 | 30 | def __init__( 31 | self, 32 | lang: str = 'en', 33 | annotate_hyphens: bool = False, 34 | protect_emails_urls: bool = False, 35 | extra_protected_patterns: List[Union[str, re.Pattern]] = [], 36 | ): 37 | """Tokenizer. 
38 | 39 | Keyword Arguments: 40 | lang {str} -- language identifier (default: {'en'}) 41 | annotate_hyphens {bool} -- Protect dashes (default: {False}) 42 | protect_emails_urls {bool} -- Protect urls (default: {False}) 43 | extra_protected_patterns {List[Union[str, re.Pattern]]} -- 44 | A list of regex patterns (default: {[]}) 45 | """ 46 | self.lang = lang 47 | self.locale = Locale(lang) 48 | self.break_iterator = \ 49 | BreakIterator.createWordInstance(self.locale) 50 | self.protected_patterns = [] 51 | 52 | self.annotate_hyphens = annotate_hyphens 53 | if self.annotate_hyphens: 54 | self.protected_patterns.append(self.PROTECTED_HYPHEN_PATTERN) 55 | 56 | if protect_emails_urls: 57 | self.protected_patterns.append(email_pattern) 58 | self.protected_patterns.append(grubber_url_matcher) 59 | 60 | for pattern in extra_protected_patterns: 61 | if isinstance(pattern, str): 62 | pattern = re.compile(pattern) 63 | self.protected_patterns.append(pattern) 64 | 65 | def tokenize(self, text: str) -> List[str]: 66 | """Tokenize text into list of tokens. 67 | 68 | Args: 69 | text (str): Raw input text. 70 | 71 | Returns: 72 | List[str]: List of tokens. 73 | """ 74 | if self.annotate_hyphens: 75 | text = self.HYPHEN_PATTERN.sub(self.HYPHEN_PATTERN_REPL, text) 76 | 77 | protected_map = {} 78 | 79 | def protect_replace(match): 80 | protected_str = PROTECTED_TEMPLATE.format(len(protected_map)) 81 | protected_map[protected_str] = match.group(0) 82 | return ' {} '.format(protected_str) 83 | 84 | for i, pattern in enumerate(self.protected_patterns): 85 | text = pattern.sub(protect_replace, text) 86 | 87 | tokens = apply_break_iterator(self.break_iterator, text) 88 | return [protected_map.get(t, t) for t in tokens] 89 | -------------------------------------------------------------------------------- /icu_tokenizer/url_utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | import regex 3 | 4 | __all__ = ['email_pattern', 'grubber_url_matcher'] 5 | 6 | 7 | sub_domain_pstr = r'[0-9A-Za-z\-\_\~]+' 8 | top_domain_pstr = r'(?:[.](?:{}))'.format(r'|'.join(re.escape(s) for s in [ 9 | 'com', 'org', 'net', 'int', 'edu', 'gov', 'mil', 'ac', 'ad', 10 | 'ae', 'af', 'ag', 'ai', 'al', 'am', 'ao', 'aq', 'ar', 'as', 11 | 'at', 'au', 'aw', 'ax', 'az', 12 | 'ba', 'bb', 'bd', 'be', 'bf', 'bg', 'bh', 'bi', 'bj', 'bm', 13 | 'bn', 'bo', 'br', 'bs', 'bt', 'bw', 'by', 'bz', 14 | 'ca', 'cc', 'cd', 'cf', 'cg', 'ch', 'ci', 'ck', 'cl', 'cm', 15 | 'cn', 'co', 'cr', 'cu', 'cv', 'cw', 'cx', 'cy', 'cz', 16 | 'de', 'dj', 'dk', 'dm', 'do', 'dz', 17 | 'ec', 'ee', 'eg', 'er', 'es', 'et', 'eu', 18 | 'fi', 'fj', 'fk', 'fm', 'fo', 'fr', 19 | 'ga', 'gd', 'ge', 'gf', 'gg', 'gh', 'gi', 'gl', 'gm', 'gn', 20 | 'gp', 'gq', 'gr', 'gs', 'gt', 'gu', 'gw', 'gy', 21 | 'hk', 'hm', 'hn', 'hr', 'ht', 'hu', 22 | 'id', 'ie', 'il', 'im', 'in', 'io', 'iq', 'ir', 'is', 'it', 23 | 'je', 'jm', 'jo', 'jp', 24 | 'ke', 'kg', 'kh', 'ki', 'km', 'kn', 'kp', 'kr', 'kw', 'ky', 25 | 'kz', 26 | 'la', 'lb', 'lc', 'li', 'lk', 'lr', 'ls', 'lt', 'lu', 'lv', 27 | 'ly', 28 | 'ma', 'mc', 'md', 'me', 'mg', 'mh', 'mk', 'ml', 'mm', 'mn', 29 | 'mo', 'mp', 'mq', 'mr', 'ms', 'mt', 'mu', 'mv', 'mw', 'mx', 30 | 'my', 'mz', 31 | 'na', 'nc', 'ne', 'nf', 'ng', 'ni', 'nl', 'no', 'np', 'nr', 32 | 'nu', 'nz', 33 | 'om', 34 | 'pa', 'pe', 'pf', 'pg', 'ph', 'pk', 'pl', 'pm', 'pn', 'pr', 35 | 'ps', 'pt', 'pw', 'py', 36 | 'qa', 37 | 're', 'ro', 'rs', 'ru', 'rw', 38 | 'sa', 'sb', 'sc', 'sd', 'se', 'sg', 'sh', 'si', 'sk', 'sl', 39 | 'sm', 'sn', 'so', 
'sr', 'ss', 'st', 'su', 'sv', 'sx', 'sy', 40 | 'sz', 41 | 'tc', 'td', 'tf', 'tg', 'th', 'tj', 'tk', 'tl', 'tm', 'tn', 42 | 'to', 'tr', 'tt', 'tv', 'tw', 'tz', 43 | 'ua', 'ug', 'uk', 'us', 'uy', 'uz', 44 | 'va', 'vc', 've', 'vg', 'vi', 'vn', 'vu', 45 | 'wf', 'ws', 46 | 'ye', 'yt', 47 | 'za', 'zm', 'zw' 48 | ])) # https://en.wikipedia.org/wiki/List_of_Internet_top-level_domains 49 | 50 | domain_pstr = r'(?:{sub_domain}\.)*{sub_domain}{top_domain}'.format( 51 | sub_domain=sub_domain_pstr, 52 | top_domain=top_domain_pstr, 53 | ) 54 | 55 | # https://stackoverflow.com/questions/2049502/what-characters-are-allowed-in-an-email-address 56 | local_part_valid_word = r'[0-9A-Za-z\!\#\$\%\&\'\*\+\-\/\=\?\^\_\`\{\|\}\~]+' 57 | local_part_pstr = r'{word}(?:\.{word})*'.format(word=local_part_valid_word) 58 | 59 | email_pstr = r'({})'.format( 60 | r'{local_part}\@{domain}'.format( 61 | local_part=local_part_pstr, 62 | domain=domain_pstr, 63 | ) 64 | ) 65 | email_pattern = regex.compile(email_pstr, re.IGNORECASE) 66 | """Custom email matcher based on 67 | https://en.wikipedia.org/wiki/International_email 68 | """ 69 | 70 | 71 | # A customized grubber v1 URL matcher 72 | # Designed to work with urls starting with https, http, ftp, or www 73 | grubber_url_pstr = r'(?i)\b((?:(?:https|http|ftp):/{1,3}|www[.])[^\s()<>\(\)\【\】]+(?:\([\w\d]+\)|(?:[^!"#$%&\'()*+,\-./:;<=>?@\[\]\s\(\)\【\】。,?!]|/)))' # noqa 74 | grubber_url_matcher: re.Pattern = re.compile(grubber_url_pstr, re.ASCII) 75 | """Grubber v1 URL matcher with additional rules to account for chinese 76 | punctuations. 77 | 78 | Designed to work with urls starting with https, http, ftp, or www. 79 | """ 80 | -------------------------------------------------------------------------------- /icu_tokenizer/normalizer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import re 4 | import unicodedata 5 | from typing import Dict, List 6 | 7 | import regex 8 | 9 | from icu_tokenizer.utils import get_all_unicode_chars 10 | 11 | 12 | class Normalizer(object): 13 | """Unicode information based normalizer. 14 | 15 | Does the following 16 | 17 | - Ensure NFKC format 18 | - Handle pseudo-spaces (for numbers) 19 | - Normalize by unicode categories \ 20 | https://www.fileformat.info/info/unicode/category/index.htm 21 | 22 | - ``[C*|So|Z*]`` → ' ' 23 | - ``[Pc]`` → ``_`` 24 | - ``[Pd]`` → ``-`` 25 | - ``[Pf|Pi]`` → ``"`` (except for ``'``) 26 | - ``[Ps]`` → ``(`` (except for ``{``, ``[``) 27 | - ``[Pe]`` → ``)`` (except for ``}``, ``]``) 28 | - Normalize Nd (Numbers) 29 | - Account for some outliers 30 | - Remove non printable characters 31 | - Normalize whitespace characters 32 | - Perform language specific normalization 33 | 34 | Usage: 35 | 36 | >>> normalizer = Normalizer(lang, norm_puncts=True) 37 | >>> norm_text: str = normalizer.normalize(text) 38 | """ 39 | 40 | def __init__(self, lang: str = 'en', norm_puncts: bool = False): 41 | """Normalizer. 42 | 43 | Args: 44 | lang (str, optional): Language identifier. Defaults to 'en'. 45 | norm_puncts (bool, optional): Normalize punctuations?. 46 | Defaults to False. 
47 | """ 48 | # Handle control tokens 49 | self.ignore_pattern = regex.compile(r'\p{C}|\p{So}|\p{Z}') 50 | 51 | # Handle pseudo-spaces 52 | # Random note: it appears pseudo-spaces primarily makes a difference 53 | # when numbers are involved 54 | self.pseudo_num_pattern = re.compile(r'(\d) (\d)') 55 | 56 | # Punctuation and number replace maps 57 | self.num_pattern = regex.compile(r'\p{Nd}+') 58 | self.punct_replace_map = self.punct_pattern = None 59 | if norm_puncts: 60 | self.punct_replace_map = make_punct_replace_map() 61 | self.punct_pattern = \ 62 | make_pattern_from_keys(self.punct_replace_map.keys()) 63 | 64 | # Other language specific normalizers 65 | lang_replace_map = make_lang_specific_replace_map(lang) 66 | self.lang_replace_map = self.lang_replace_pattern = None 67 | if len(lang_replace_map) > 0: 68 | self.lang_replace_map = lang_replace_map 69 | self.lang_replace_pattern = \ 70 | make_pattern_from_keys(lang_replace_map.keys()) 71 | 72 | def _num_replace_fn(self, match: re.Match) -> str: 73 | return str(int(match.group(0))) 74 | 75 | def _punct_replace_fn(self, match: re.Match) -> str: 76 | return self.punct_replace_map[match.group(0)] 77 | 78 | def _lang_replace_fn(self, match: re.Match) -> str: 79 | return self.lang_replace_map[match.group(0)] 80 | 81 | def normalize(self, text: str) -> str: 82 | """Perform normalization. 83 | 84 | Args: 85 | text (str): Input text 86 | 87 | Returns: 88 | str: Normalized text 89 | """ 90 | text = unicodedata.normalize('NFKC', text) 91 | 92 | text = self.pseudo_num_pattern.sub(r'\1.\2', text) 93 | text = self.num_pattern.sub(self._num_replace_fn, text) 94 | if self.punct_pattern is not None: 95 | text = self.punct_pattern.sub(self._punct_replace_fn, text) 96 | 97 | text = self.ignore_pattern.sub(' ', text) 98 | text = ' '.join(text.split()) # Normalize whitespace 99 | 100 | if self.lang_replace_pattern is not None: 101 | text = self.lang_replace_pattern(self._lang_replace_fn, text) 102 | 103 | return text 104 | 105 | 106 | def make_pattern_from_keys(keys: List[str]) -> re.Pattern: 107 | """Make a re.Pattern that matches a list of strings.""" 108 | keys = sorted(keys, key=lambda x: len(x), reverse=True) 109 | pattern_str = r'|'.join(re.escape(k) for k in keys) 110 | return re.compile(pattern_str) 111 | 112 | 113 | def make_punct_replace_map() -> Dict[str, str]: 114 | """Make the punctuation replace map.""" 115 | # Generate punctuation and number replace maps 116 | punct_replace_map = {} 117 | 118 | # Normalization rules based on unicode category 119 | punct_exceptions = {"'", '[', ']', '{', '}'} 120 | for c in get_all_unicode_chars(): 121 | if c in punct_exceptions: 122 | continue 123 | 124 | cat = unicodedata.category(c) 125 | if cat == 'Pc': 126 | punct_replace_map[c] = '_' 127 | elif cat == 'Pd': 128 | punct_replace_map[c] = '-' 129 | elif cat == 'Pe': 130 | punct_replace_map[c] = ')' 131 | elif cat == 'Pf': 132 | punct_replace_map[c] = '"' 133 | elif cat == 'Pi': 134 | punct_replace_map[c] = '"' 135 | elif cat == 'Ps': 136 | punct_replace_map[c] = '(' 137 | 138 | # User provided rules 139 | 140 | # Soft hyphen 141 | punct_replace_map['\xad'] = '' 142 | 143 | # Double quotes 144 | punct_replace_map["''"] = '"' 145 | punct_replace_map["´´"] = '"' 146 | punct_replace_map['„'] = '"' 147 | 148 | # Apostrophes 149 | punct_replace_map["`"] = "'" 150 | punct_replace_map['´'] = "'" 151 | punct_replace_map['‘'] = "'" 152 | punct_replace_map['’'] = "'" 153 | punct_replace_map['‚'] = "'" # Not a comma 154 | 155 | # Brackets 156 | 
punct_replace_map['【'] = '[' 157 | punct_replace_map['】'] = ']' 158 | punct_replace_map['['] = '[' 159 | punct_replace_map[']'] = ']' 160 | 161 | # Common unicode variations 162 | punct_replace_map['∶'] = ':' 163 | punct_replace_map['?'] = '?' 164 | punct_replace_map['.'] = '.' 165 | punct_replace_map['━'] = '-' 166 | punct_replace_map['%'] = '%' 167 | 168 | # Chinese punctuations 169 | punct_replace_map['!'] = '!' 170 | punct_replace_map['、'] = ',' 171 | punct_replace_map['|'] = '|' 172 | punct_replace_map[':'] = ':' 173 | punct_replace_map[';'] = ';' 174 | punct_replace_map[','] = ',' 175 | punct_replace_map['。'] = '.' 176 | punct_replace_map['~'] = '~' 177 | 178 | # Others 179 | punct_replace_map['…'] = '...' 180 | 181 | return punct_replace_map 182 | 183 | 184 | def make_lang_specific_replace_map(lang: str = 'en') -> Dict[str, str]: 185 | """Create a language specific replace map.""" 186 | replace_map = {} 187 | 188 | if lang == 'ro': 189 | # Remove diacritics for romanian 190 | replace_map['Ş'] = 'S' 191 | replace_map['ş'] = 's' 192 | 193 | replace_map['Ș'] = 'S' 194 | replace_map['ș'] = 's' 195 | 196 | replace_map['Ţ'] = 'T' 197 | replace_map['ţ'] = 't' 198 | 199 | replace_map['Ț'] = 'T' 200 | replace_map['ț'] = 't' 201 | 202 | replace_map['Ă'] = 'A' 203 | replace_map['ă'] = 'a' 204 | 205 | replace_map['Â'] = 'A' 206 | replace_map['â'] = 'a' 207 | 208 | replace_map['Î'] = 'I' 209 | replace_map['î'] = 'i' 210 | 211 | return replace_map 212 | --------------------------------------------------------------------------------
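The `SentSplitter`, `Normalizer`, and `Tokenizer` classes shown above compose into a simple preprocessing pipeline. Below is a minimal sketch using only the public API exported in `icu_tokenizer/__init__.py`; the language code, sample paragraph, and printed formatting are illustrative, not part of the package.

```py
# Split a paragraph into sentences, then normalize and tokenize each one.
# 'en' and the sample paragraph are illustrative placeholders.
from icu_tokenizer import Normalizer, SentSplitter, Tokenizer

lang = 'en'
splitter = SentSplitter(lang)
normalizer = Normalizer(lang=lang, norm_puncts=True)
tokenizer = Tokenizer(lang=lang)

paragraph = "The products you ordered will be shipped directly from Korea. Delivery takes 3 weeks."
for sentence in splitter.split(paragraph):
    tokens = tokenizer.tokenize(normalizer.normalize(sentence))
    print(' '.join(tokens))
```

The same steps are available from the command line via the subcommands documented in `docs/tools.rst`, e.g. `python -m icu_tokenizer tokenize -l en -i input.txt -o output.txt` (file names illustrative).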