├── icu_tokenizer ├── bin │ ├── __init__.py │ ├── split.py │ ├── tokenize.py │ └── normalize.py ├── __init__.py ├── sent_splitter.py ├── __main__.py ├── utils.py ├── tokenizer.py ├── url_utils.py └── normalizer.py ├── MANIFEST.in ├── requirements.txt ├── docs ├── docutils.conf ├── requirements.txt ├── _static │ └── theme_overrides.css ├── api.rst ├── index.rst ├── Makefile ├── tools.rst └── conf.py ├── setup.cfg ├── LICENSE ├── INSTALL.md ├── setup.py ├── .github └── workflows │ └── build-docs.yml ├── README.md └── .gitignore /icu_tokenizer/bin/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tqdm 2 | PyICU 3 | regex 4 | -------------------------------------------------------------------------------- /docs/docutils.conf: -------------------------------------------------------------------------------- 1 | [writers] 2 | option_limit=0 3 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = D100,D104,D401 3 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx 2 | sphinx-argparse 3 | sphinx_rtd_theme 4 | -------------------------------------------------------------------------------- /icu_tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | from icu_tokenizer.normalizer import Normalizer 2 | from icu_tokenizer.sent_splitter import SentSplitter 3 | from icu_tokenizer.tokenizer import Tokenizer 4 | 5 | __all__ = ['Normalizer', 'SentSplitter', 'Tokenizer'] 6 | -------------------------------------------------------------------------------- /docs/_static/theme_overrides.css: -------------------------------------------------------------------------------- 1 | .wy-table-responsive table td kbd { 2 | white-space: nowrap; 3 | } 4 | .wy-table-responsive table td { 5 | white-space: normal !important; 6 | } 7 | .wy-table-responsive { 8 | overflow: visible !important; 9 | } 10 | -------------------------------------------------------------------------------- /docs/api.rst: -------------------------------------------------------------------------------- 1 | Python API Reference 2 | ==================== 3 | 4 | 5 | Sentence Splitter 6 | ----------------- 7 | 8 | .. autoclass:: icu_tokenizer.sent_splitter.SentSplitter 9 | :members: 10 | 11 | .. automethod:: __init__ 12 | 13 | 14 | Normalizer 15 | ---------- 16 | 17 | .. autoclass:: icu_tokenizer.normalizer.Normalizer 18 | :members: 19 | 20 | .. automethod:: __init__ 21 | 22 | 23 | Tokenizer 24 | --------- 25 | 26 | .. autoclass:: icu_tokenizer.tokenizer.Tokenizer 27 | :members: 28 | 29 | .. automethod:: __init__ 30 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. 
ICU-Tokenizer documentation master file, created by 2 | sphinx-quickstart on Thu Jun 18 23:55:49 2020. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to ICU-Tokenizer's documentation! 7 | ========================================= 8 | 9 | **ICU-tokenizer** is a python package used to perform universal language 10 | normalization and tokenization using the International Components for 11 | Unicode. 12 | 13 | .. toctree:: 14 | :maxdepth: 2 15 | :caption: Library Reference 16 | 17 | tools 18 | api 19 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = ../build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /icu_tokenizer/sent_splitter.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from icu import BreakIterator, Locale 4 | 5 | from icu_tokenizer.utils import apply_break_iterator 6 | 7 | 8 | class SentSplitter(object): 9 | """ICU sentence splitter. 10 | 11 | Usage: 12 | 13 | >>> splitter = SentSplitter(lang) 14 | >>> sents: List[str] = splitter.split(paragraph) 15 | """ 16 | 17 | def __init__(self, lang: str = 'en'): 18 | """SentSplitter.""" 19 | self.lang = lang 20 | self.locale = Locale(lang) 21 | self.break_iterator = \ 22 | BreakIterator.createSentenceInstance(self.locale) 23 | 24 | def split(self, text: str) -> List[str]: 25 | """Split text into sentences with the ICU sentence splitter.""" 26 | return apply_break_iterator(self.break_iterator, text) 27 | -------------------------------------------------------------------------------- /docs/tools.rst: -------------------------------------------------------------------------------- 1 | Commandline Tools 2 | ================= 3 | 4 | **ICU-Tokenizer** provides its full functionality through the 5 | commandline. The commandline tools can be accessed by calling the module 6 | as a script. 7 | 8 | :: 9 | 10 | python -m icu_tokenizer 11 | 12 | 13 | Sentence Splitting 14 | ------------------ 15 | 16 | .. automodule:: icu_tokenizer.bin.split 17 | .. argparse:: 18 | :module: icu_tokenizer.__main__ 19 | :func: make_parser 20 | :path: split 21 | 22 | 23 | Normalize 24 | --------- 25 | 26 | .. automodule:: icu_tokenizer.bin.normalize 27 | .. argparse:: 28 | :module: icu_tokenizer.__main__ 29 | :func: make_parser 30 | :path: normalize 31 | 32 | 33 | Tokenize 34 | -------- 35 | 36 | .. automodule:: icu_tokenizer.bin.tokenize 37 | ..
argparse:: 38 | :module: icu_tokenizer.__main__ 39 | :func: make_parser 40 | :path: tokenize 41 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 mingruimingrui 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /icu_tokenizer/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from types import ModuleType 3 | from typing import Dict 4 | 5 | from icu_tokenizer.bin import normalize as normalize_module 6 | from icu_tokenizer.bin import split as split_module 7 | from icu_tokenizer.bin import tokenize as tokenize_module 8 | 9 | SUBCOMMANDS: Dict[str, ModuleType] = { 10 | 'normalize': normalize_module, 11 | 'split': split_module, 12 | 'tokenize': tokenize_module, 13 | } 14 | 15 | 16 | def make_parser() -> argparse.ArgumentParser: 17 | """Make the parser for the main program.""" 18 | parser = argparse.ArgumentParser( 19 | prog='python3 -m icu_tokenizer', 20 | description='ICU based universal language tokenizer' 21 | ) 22 | subparsers = parser.add_subparsers(dest='subcommand') 23 | 24 | for subcommand, module in SUBCOMMANDS.items(): 25 | module.add_options(subparsers.add_parser( 26 | subcommand, help=module.__doc__, 27 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 28 | )) 29 | 30 | return parser 31 | 32 | 33 | if __name__ == "__main__": 34 | parser = make_parser() 35 | args = parser.parse_args() 36 | 37 | if args.subcommand in SUBCOMMANDS: 38 | module = SUBCOMMANDS[args.subcommand] 39 | module.main(args) 40 | else: 41 | parser.print_help() 42 | -------------------------------------------------------------------------------- /INSTALL.md: -------------------------------------------------------------------------------- 1 | ## Requirements 2 | 3 | **ICU-Tokenizer** requires the following tools and libraries for a 4 | successful installation. 5 | 6 | - ICU library 7 | - GNU C++11 compiler 8 | 9 | ### Installing the ICU Library 10 | 11 | #### Conda 12 | 13 | ```sh 14 | conda install icu pkg-config 15 | 16 | # Or if you wish to use the latest version of the ICU library, 17 | # the conda-forge channel typically contains a more up-to-date version. 
18 | conda install -c conda-forge icu 19 | ``` 20 | 21 | #### MacOS 22 | 23 | ```sh 24 | # With homebrew 25 | brew install icu4c 26 | 27 | 28 | # With macports 29 | port install icu 30 | ``` 31 | 32 | #### Windows 33 | 34 | ```sh 35 | # You can download their binaries from their site or just build from source. 36 | # It's probably easier to use conda, docker, wsl or dual boot linux. 37 | # Good luck! 38 | ``` 39 | 40 | #### Debian/Ubuntu 41 | 42 | ```sh 43 | apt update 44 | apt install libicu-dev pkg-config 45 | ``` 46 | 47 | #### Archlinux 48 | 49 | ```sh 50 | pacman -Sy icu 51 | ``` 52 | 53 | #### Fedora/RHEL/Centos 54 | 55 | ```sh 56 | yum install libicu-devel 57 | ``` 58 | 59 | ## Installation 60 | 61 | This package is released on PyPI, install with `pip`. 62 | 63 | ```sh 64 | pip install ICU-Tokenizer 65 | ``` 66 | 67 | On MacOS, it is likely necessary to specify the compiler and path to the 68 | `icu-config` tool. 69 | 70 | ```sh 71 | CFLAGS="-std=c++11" PATH="/usr/local/opt/icu4c/bin:$PATH" \ 72 | pip install ICU-Tokenizer 73 | ``` 74 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | 4 | def get_long_description(): 5 | """Reads and return README as a string.""" 6 | with open('README.md', 'rb') as f: 7 | return f.read().decode('utf-8', errors='ignore') 8 | 9 | 10 | with open('requirements.txt', 'r') as f: 11 | install_requires = f.read() 12 | 13 | 14 | setuptools.setup( 15 | name='icu_tokenizer', 16 | version='0.0.1', 17 | author='Wang Ming Rui', 18 | author_email='mingruimingrui@hotmail.com', 19 | description="ICU based universal language tokenizer", 20 | long_description=get_long_description(), 21 | long_description_content_type="text/markdown", 22 | url="https://github.com/mingruimingrui/ICU-tokenizer", 23 | 24 | install_requires=install_requires, 25 | packages=['icu_tokenizer', 'icu_tokenizer.bin'], 26 | 27 | classifiers=[ 28 | 'Intended Audience :: Developers', 29 | 'Intended Audience :: Science/Research', 30 | 'Programming Language :: Python', 31 | 'Programming Language :: Python :: 3.6', 32 | 'Programming Language :: Python :: 3.7', 33 | 'Programming Language :: Python :: 3.8', 34 | 'Programming Language :: Python :: 3.9', 35 | 'License :: OSI Approved :: MIT License', 36 | 'Topic :: Scientific/Engineering', 37 | 'Topic :: Software Development', 38 | 'Topic :: Software Development :: Libraries', 39 | 'Topic :: Software Development :: Libraries :: Python Modules', 40 | 'Topic :: Software Development :: Localization', 41 | ], 42 | license='MIT License' 43 | ) 44 | -------------------------------------------------------------------------------- /.github/workflows/build-docs.yml: -------------------------------------------------------------------------------- 1 | name: Build documentations 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | create: 8 | tags: 9 | - v* 10 | 11 | jobs: 12 | build: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v2 16 | 17 | - name: Set up Python 3.7 18 | uses: actions/setup-python@v2 19 | with: 20 | python-version: 3.7 21 | 22 | - name: Install dependencies 23 | run: sudo apt install -y g++ libicu-dev pkg-config 24 | env: 25 | DEBIAN_FRONTEND: noninteractive 26 | 27 | - name: Install python packages 28 | run: | 29 | python -m pip install --no-cache-dir . 
30 | python -m pip install --no-cache-dir -r docs/requirements.txt 31 | 32 | - name: Build docs 33 | run: sphinx-build -b html docs public 34 | 35 | # Publish built docs to gh-pages branch. 36 | # =============================== 37 | - name: Commit documentation changes 38 | run: | 39 | git clone https://github.com/mingruimingrui/ICU-tokenizer.git --branch gh-pages --single-branch gh-pages 40 | cp -r public/* gh-pages/ 41 | cd gh-pages 42 | git config --local user.email "action@github.com" 43 | git config --local user.name "GitHub Action" 44 | git add . 45 | git commit -m "Update documentation" -a || true 46 | - name: Push changes 47 | uses: ad-m/github-push-action@master 48 | with: 49 | branch: gh-pages 50 | directory: gh-pages 51 | github_token: ${{ secrets.GITHUB_TOKEN }} 52 | # =============================== 53 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | **ICU-tokenizer** is a python package used to perform universal language 2 | normalization and tokenization using the International Components for 3 | Unicode. 4 | 5 | - [Install](#install) 6 | - [Usage (Python)](#usage-python) 7 | - [Sentence splitter](#sentence-splitter) 8 | - [Normalizer](#normalizer) 9 | - [Tokenizer](#tokenizer) 10 | 11 | ## Install 12 | 13 | See [./INSTALL.md](./INSTALL.md) 14 | 15 | ## Usage (Python) 16 | 17 | ### Sentence splitter 18 | 19 | ```py 20 | # To split a paragraph into multiple sentences 21 | >>> from icu_tokenizer import SentSplitter 22 | >>> splitter = SentSplitter('zh') 23 | 24 | >>> paragraph = """ 25 | 美国最高法院(英语:Supreme Court of the United States),一般是指美国联邦最高法院,是美国最高级别的联邦法院,为美国三权继总统、国会后最为重要的一环。根据1789年《美国宪法第三条》的规定,最高法院对所有联邦法院、州法院和涉及联邦法律问题的诉讼案件具有最终(并且在很大程度上是有斟酌决定权的)上诉管辖权,以及对小范围案件的具有初审管辖权。在美国的法律制度中,最高法院通常是包括《美国宪法》在内的联邦法律的最终解释者,但仅在具有管辖权的案件范围内。法院不享有判定政治问题的权力;政治问题的执法机关是行政机关,而不是政府的司法部门。 26 | """ 27 | >>> splitter.split(paragraph) 28 | [ 29 | '美国最高法院(英语:Supreme Court of the United States),一般是指美国联邦最高法院,是美国最高级别的联邦法院,为美国三权继总统、国会后最为重要的一环。', 30 | '根据1789年《美国宪法第三条》的规定,最高法院对所有联邦法院、州法院和涉及联邦法律问题的诉讼案件具有最终(并且在很大程度上是有斟酌决定权的)上诉管辖权,以及对小范围案件的具有初审管辖权。', 31 | '在美国的法律制度中,最高法院通常是包括《美国宪法》在内的联邦法律的最终解释者,但仅在具有管辖权的案件范围内。', 32 | '法院不享有判定政治问题的权力;政治问题的执法机关是行政机关,而不是政府的司法部门。' 33 | ] 34 | ``` 35 | 36 | ### Normalizer 37 | 38 | ```py 39 | # To normalize text 40 | >>> from icu_tokenizer import Normalizer 41 | >>> normalizer = Normalizer(lang='en', norm_puncts=True) 42 | 43 | >>> text = "𝑻𝒉𝒆 𝒑𝒓𝒐𝒅𝒖𝒄𝒕𝒔 𝒚𝒐𝒖 𝒐𝒓𝒅𝒆𝒓𝒆𝒅 𝒘𝒊𝒍𝒍 𝒃𝒆 𝒔𝒉𝒊𝒑𝒑𝒆𝒅 𝒅𝒊𝒓𝒆𝒄𝒕𝒍𝒚 𝒇𝒓𝒐𝒎 𝑲𝒐𝒓𝒆𝒂." 44 | >>> normalizer.normalize(text) 45 | "The products you ordered will be shipped directly from Korea." 46 | 47 | >>> text = "【】()" 48 | >>> normalizer.normalize(text) 49 | "[]()" 50 | ``` 51 | 52 | ### Tokenizer 53 | 54 | ```py 55 | >>> from icu_tokenizer import Tokenizer 56 | >>> tokenizer = Tokenizer(lang='th') 57 | 58 | >>> text = "ภาษาไทยเป็นภาษาที่มีระดับเสียงของคำแน่นอนหรือวรรณยุกต์เช่นเดียวกับภาษาจีน และออกเสียงแยกคำต่อคำ" 59 | >>> tokenizer.tokenize(text) 60 | ['ภาษา', 'ไทย', 'เป็น', 'ภาษา', 'ที่', 'มี', 'ระดับ', 'เสียง', 'ของ', 'คำ', 'แน่นอน', 'หรือ', 'วรรณยุกต์', 'เช่น', 'เดียว', 'กับ', 'ภาษา', 'จีน', 'และ', 'ออก', 'เสียง', 'แยก', 'คำ', 'ต่อ', 'คำ'] 61 | ``` 62 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 
2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | sys.path.insert(0, os.path.abspath('..')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'ICU-Tokenizer' 21 | copyright = '2020, Ming Rui' 22 | author = 'Ming Rui' 23 | 24 | # The full version, including alpha/beta/rc tags 25 | release = '0.0.1' 26 | 27 | 28 | # -- General configuration --------------------------------------------------- 29 | 30 | # Add any Sphinx extension module names here, as strings. They can be 31 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 32 | # ones. 33 | extensions = [ 34 | 'sphinx.ext.autodoc', 35 | 'sphinx.ext.intersphinx', 36 | 'sphinx.ext.viewcode', 37 | 'sphinx.ext.napoleon', 38 | 'sphinxarg.ext' 39 | ] 40 | 41 | # Add any paths that contain templates here, relative to this directory. 42 | templates_path = ['_templates'] 43 | 44 | # List of patterns, relative to source directory, that match files and 45 | # directories to ignore when looking for source files. 46 | # This pattern also affects html_static_path and html_extra_path. 47 | exclude_patterns = [] 48 | 49 | 50 | # -- Options for HTML output ------------------------------------------------- 51 | 52 | # The theme to use for HTML and HTML Help pages. See the documentation for 53 | # a list of builtin themes. 54 | # 55 | html_theme = 'sphinx_rtd_theme' 56 | html4_writer = True 57 | 58 | # Add any paths that contain custom static files (such as style sheets) here, 59 | # relative to this directory. They are copied after the builtin static files, 60 | # so a file named "default.css" will overwrite the builtin "default.css". 
61 | html_static_path = ['_static'] 62 | 63 | # Additional theme customization 64 | html_context = { 65 | 'css_files': [ 66 | '_static/theme_overrides.css', # override wide tables in RTD theme 67 | ], 68 | } 69 | -------------------------------------------------------------------------------- /icu_tokenizer/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import argparse 4 | import sys 5 | from typing import List 6 | 7 | import icu 8 | import regex 9 | from icu import BreakIterator 10 | 11 | 12 | def apply_break_iterator( 13 | break_iterator: BreakIterator, 14 | text: str 15 | ) -> List[str]: 16 | """Apply ICU break iterator on a text.""" 17 | break_iterator.setText(text) 18 | parts = [] 19 | p0 = 0 20 | for p1 in break_iterator: 21 | part = text[p0:p1].strip() 22 | if len(part) > 0: 23 | parts.append(part) 24 | p0 = p1 25 | return parts 26 | 27 | 28 | def get_all_unicode_chars(): 29 | """Get all unicode characters.""" 30 | all_unicode_chars = [] 31 | i = 0 32 | while True: 33 | try: 34 | all_unicode_chars.append(chr(i)) 35 | except ValueError: 36 | break 37 | i += 1 38 | return all_unicode_chars 39 | 40 | 41 | def get_versions() -> dict: 42 | """Get versions of the various dependecies related to icu_tokenizer.""" 43 | versions = { 44 | 'icu': icu.ICU_VERSION, 45 | 'PyICU': icu.VERSION, 46 | 'regex': regex.__version__ 47 | } 48 | 49 | try: 50 | import opencc 51 | versions['opencc'] = opencc.__version__ 52 | except ImportError: 53 | pass 54 | 55 | return versions 56 | 57 | 58 | class TextFileType(argparse.FileType): 59 | """argparse.FileType modified for utf-8 text files.""" 60 | 61 | def __init__(self, mode: str = 'r', bufsize: int = -1): 62 | """TextFileType.""" 63 | self._mode = mode 64 | self._bufsize = bufsize 65 | self._encoding = 'utf-8' 66 | self._errors = 'ignore' 67 | 68 | def __call__(self, string): # noqa 69 | # the special argument "-" means sys.std{in,out} 70 | if string == '-': 71 | if 'r' in self._mode: 72 | return sys.stdin 73 | elif 'w' in self._mode: 74 | return sys.stdout 75 | else: 76 | msg = 'argument "-" with mode {}'.format(self._mode) 77 | raise ValueError(msg) 78 | 79 | # all other arguments are used as file names 80 | try: 81 | return open(string, self._mode, self._bufsize, self._encoding, 82 | self._errors) 83 | except OSError as e: 84 | msg = "can't open '{}': {}".format(string, e) 85 | raise argparse.ArgumentTypeError(msg) 86 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # Development 132 | .vscode 133 | sync.sh 134 | -------------------------------------------------------------------------------- /icu_tokenizer/bin/split.py: -------------------------------------------------------------------------------- 1 | """Split lines containing multiple sentences.""" 2 | 3 | import sys 4 | import argparse 5 | 6 | from tqdm import tqdm 7 | 8 | from icu_tokenizer.sent_splitter import SentSplitter 9 | from icu_tokenizer.utils import TextFileType 10 | 11 | CACHE = {} 12 | 13 | 14 | def add_options(parser: argparse.ArgumentParser): 15 | """Add options to a parser.""" 16 | parser.add_argument( 17 | '-l', '--lang', type=str, default='en', 18 | help='Language identifier') 19 | 20 | parser.add_argument( 21 | '-i', '--inputs', type=TextFileType('r'), 22 | nargs='+', default=[sys.stdin], 23 | help='Input files. Defaults to stdin.') 24 | parser.add_argument( 25 | '-o', '--output', type=TextFileType('w'), default=sys.stdout, 26 | help='Output file. 
Defaults to stdout.') 27 | 28 | parser.add_argument( 29 | '-j', '--num-workers', type=int, default=0, 30 | help='Number of processes to use') 31 | parser.add_argument( 32 | '--show-pbar', action='store_true', 33 | help='Show progressbar') 34 | parser.add_argument( 35 | '--verbose', action='store_true', 36 | help='Print splits to stderr') 37 | 38 | 39 | def main(args: argparse.Namespace): # noqa 40 | if args.num_workers == 0: 41 | import multiprocessing.dummy as multiprocessing 42 | args.num_workers = 1 43 | else: 44 | import multiprocessing 45 | 46 | if args.num_workers < 0: # Use all cores 47 | args.num_workers = multiprocessing.cpu_count() 48 | 49 | def create_chunk_input_stream(): 50 | chunk = [] 51 | for f in args.inputs: 52 | for line in f: 53 | chunk.append(line) 54 | if len(chunk) >= 256: 55 | yield chunk 56 | chunk = [] 57 | if len(chunk) > 0: 58 | yield chunk 59 | 60 | pbar = None 61 | if args.show_pbar: 62 | pbar = tqdm() 63 | 64 | with multiprocessing.Pool( 65 | args.num_workers, 66 | initializer=worker_init_fn, 67 | initargs=[args.lang] 68 | ) as pool: 69 | for chunk in pool.imap(worker_fn, create_chunk_input_stream()): 70 | if pbar is not None: 71 | pbar.update(len(chunk)) 72 | for sents in chunk: 73 | for sent in sents: 74 | args.output.write(sent + '\n') 75 | if args.verbose and len(sents) > 1: 76 | sys.stderr.write('\rSplitting done: {}\n'.format(sents)) 77 | sys.stderr.flush() 78 | args.output.flush() 79 | 80 | if pbar is not None: 81 | pbar.close() 82 | 83 | 84 | def worker_init_fn(lang: str): # noqa 85 | CACHE['sent_splitter'] = SentSplitter(lang) 86 | 87 | 88 | def worker_fn(texts): # noqa 89 | split_fn = CACHE['sent_splitter'].split 90 | return [split_fn(t) for t in texts] 91 | -------------------------------------------------------------------------------- /icu_tokenizer/bin/tokenize.py: -------------------------------------------------------------------------------- 1 | """Tokenize text using unicode properties.""" 2 | 3 | import sys 4 | import argparse 5 | 6 | from tqdm import tqdm 7 | 8 | from icu_tokenizer.tokenizer import Tokenizer 9 | from icu_tokenizer.utils import TextFileType 10 | 11 | CACHE = {} 12 | 13 | 14 | def add_options(parser: argparse.ArgumentParser): 15 | """Add options to a parser.""" 16 | parser.add_argument( 17 | '-i', '--inputs', type=TextFileType('r'), 18 | nargs='+', default=[sys.stdin], 19 | help='Input files. Defaults to stdin.') 20 | parser.add_argument( 21 | '-o', '--output', type=TextFileType('w'), default=sys.stdout, 22 | help='Output file. 
Defaults to stdout.') 23 | 24 | parser.add_argument( 25 | '-l', '--lang', type=str, default='en', 26 | help='Language identifier') 27 | parser.add_argument( 28 | '-a', '--annotate-hyphens', action='store_true', 29 | help='Annotate hyphens similar to moses') 30 | parser.add_argument( 31 | '-url', '--protect-urls', action='store_true', 32 | help='Protect url patterns') 33 | 34 | parser.add_argument( 35 | '-j', '--num-workers', type=int, default=0, 36 | help='Number of processes to use') 37 | parser.add_argument( 38 | '--show-pbar', action='store_true', 39 | help='Show progressbar') 40 | 41 | 42 | def main(args: argparse.Namespace): # noqa 43 | if args.num_workers == 0: 44 | import multiprocessing.dummy as multiprocessing 45 | args.num_workers = 1 46 | else: 47 | import multiprocessing 48 | 49 | if args.num_workers < 0: # Use all cores 50 | args.num_workers = multiprocessing.cpu_count() 51 | 52 | def create_chunk_input_stream(): 53 | chunk = [] 54 | for f in args.inputs: 55 | for line in f: 56 | chunk.append(line) 57 | if len(chunk) >= 256: 58 | yield chunk 59 | chunk = [] 60 | if len(chunk) > 0: 61 | yield chunk 62 | 63 | pbar = None 64 | if args.show_pbar: 65 | pbar = tqdm() 66 | 67 | with multiprocessing.Pool( 68 | args.num_workers, 69 | initializer=worker_init_fn, 70 | initargs=[args.lang, args.annotate_hyphens, args.protect_urls] 71 | ) as pool: 72 | for chunk in pool.imap(worker_fn, create_chunk_input_stream()): 73 | if pbar is not None: 74 | pbar.update(len(chunk)) 75 | for line in chunk: 76 | args.output.write(line + '\n') 77 | args.output.flush() 78 | 79 | if pbar is not None: 80 | pbar.close() 81 | 82 | 83 | def worker_init_fn(lang: str, annotate_hyphens: bool, protect_urls: bool): # noqa 84 | CACHE['tokenizer'] = Tokenizer( 85 | lang, 86 | annotate_hyphens=annotate_hyphens, 87 | protect_urls=protect_urls 88 | ) 89 | 90 | 91 | def worker_fn(texts): # noqa 92 | tokenize_fn = CACHE['tokenizer'].tokenize 93 | return [' '.join(tokenize_fn(t)) for t in texts] 94 | -------------------------------------------------------------------------------- /icu_tokenizer/bin/normalize.py: -------------------------------------------------------------------------------- 1 | """Normalize text using unicode properties.""" 2 | 3 | import sys 4 | import argparse 5 | 6 | from tqdm import tqdm 7 | 8 | from icu_tokenizer.normalizer import Normalizer 9 | from icu_tokenizer.utils import TextFileType 10 | 11 | CACHE = {} 12 | 13 | 14 | def add_options(parser: argparse.ArgumentParser): 15 | """Add options to a parser.""" 16 | parser.add_argument( 17 | '-l', '--lang', type=str, default='en', 18 | help='Language identifier') 19 | parser.add_argument( 20 | '-p', '--norm-puncts', action='store_true', 21 | help='Normalize punctuations') 22 | parser.add_argument( 23 | '-lc', '--lowercase', action='store_true', 24 | help='Cast all characters to lowercase') 25 | 26 | parser.add_argument( 27 | '-i', '--inputs', type=TextFileType('r'), 28 | nargs='+', default=[sys.stdin], 29 | help='Input files. Defaults to stdin.') 30 | parser.add_argument( 31 | '-o', '--output', type=TextFileType('w'), default=sys.stdout, 32 | help='Output file. 
Defaults to stdout.') 33 | 34 | parser.add_argument( 35 | '-j', '--num-workers', type=int, default=0, 36 | help='Number of processes to use') 37 | parser.add_argument( 38 | '--show-pbar', action='store_true', 39 | help='Show progressbar') 40 | 41 | 42 | def main(args: argparse.Namespace): # noqa 43 | if args.num_workers == 0: 44 | import multiprocessing.dummy as multiprocessing 45 | args.num_workers = 1 46 | else: 47 | import multiprocessing 48 | 49 | if args.num_workers < 0: # Use all cores 50 | args.num_workers = multiprocessing.cpu_count() 51 | 52 | def create_chunk_input_stream(): 53 | chunk = [] 54 | for f in args.inputs: 55 | for line in f: 56 | chunk.append(line) 57 | if len(chunk) >= 256: 58 | yield chunk 59 | chunk = [] 60 | if len(chunk) > 0: 61 | yield chunk 62 | 63 | pbar = None 64 | if args.show_pbar: 65 | pbar = tqdm() 66 | 67 | with multiprocessing.Pool( 68 | args.num_workers, 69 | initializer=worker_init_fn, 70 | initargs=[args.lang, args.norm_puncts, args.lowercase] 71 | ) as pool: 72 | for chunk in pool.imap(worker_fn, create_chunk_input_stream()): 73 | if pbar is not None: 74 | pbar.update(len(chunk)) 75 | for line in chunk: 76 | args.output.write(line + '\n') 77 | args.output.flush() 78 | 79 | if pbar is not None: 80 | pbar.close() 81 | 82 | 83 | def worker_init_fn(lang: str, norm_puncts: bool, lowercase: bool): # noqa 84 | CACHE['normalizer'] = Normalizer(lang, norm_puncts) 85 | CACHE['lowercase'] = lowercase 86 | 87 | 88 | def worker_fn(texts): # noqa 89 | normalize_fn = CACHE['normalizer'].normalize 90 | texts = [normalize_fn(t) for t in texts] 91 | if CACHE['lowercase']: 92 | texts = [t.lower() for t in texts] 93 | return texts 94 | -------------------------------------------------------------------------------- /icu_tokenizer/tokenizer.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import List, Union 3 | 4 | from icu import BreakIterator, Locale 5 | 6 | from icu_tokenizer.url_utils import email_pattern, grubber_url_matcher 7 | from icu_tokenizer.utils import apply_break_iterator 8 | 9 | PROTECTED_TEMPLATE = '__PROTECTED_SEQUENCE_{}__' 10 | 11 | 12 | class Tokenizer(object): 13 | """ICU based tokenizer with additional functionality to protect sequences. 14 | 15 | Usage: 16 | 17 | >>> tokenizer = Tokenizer( 18 | lang, 19 | annotate_hyphens: bool, 20 | protect_emails_urls: bool, 21 | extra_protected_patterns: List[Union[str, re.Pattern]] = [], 22 | ) 23 | >>> tokens: List[str] = tokenizer.tokenize(text) 24 | """ 25 | 26 | HYPHEN_PATTERN = re.compile(r'(\w)\-(?=\w)') 27 | HYPHEN_PATTERN_REPL = r'\1 @-@ ' 28 | PROTECTED_HYPHEN_PATTERN = re.compile(r'@\-@') 29 | 30 | def __init__( 31 | self, 32 | lang: str = 'en', 33 | annotate_hyphens: bool = False, 34 | protect_emails_urls: bool = False, 35 | extra_protected_patterns: List[Union[str, re.Pattern]] = [], 36 | ): 37 | """Tokenizer. 
38 | 39 | Keyword Arguments: 40 | lang {str} -- language identifier (default: {'en'}) 41 | annotate_hyphens {bool} -- Protect dashes (default: {False}) 42 | protect_emails_urls {bool} -- Protect urls (default: {False}) 43 | extra_protected_patterns {List[Union[str, re.Pattern]]} -- 44 | A list of regex patterns (default: {[]}) 45 | """ 46 | self.lang = lang 47 | self.locale = Locale(lang) 48 | self.break_iterator = \ 49 | BreakIterator.createWordInstance(self.locale) 50 | self.protected_patterns = [] 51 | 52 | self.annotate_hyphens = annotate_hyphens 53 | if self.annotate_hyphens: 54 | self.protected_patterns.append(self.PROTECTED_HYPHEN_PATTERN) 55 | 56 | if protect_emails_urls: 57 | self.protected_patterns.append(email_pattern) 58 | self.protected_patterns.append(grubber_url_matcher) 59 | 60 | for pattern in extra_protected_patterns: 61 | if isinstance(pattern, str): 62 | pattern = re.compile(pattern) 63 | self.protected_patterns.append(pattern) 64 | 65 | def tokenize(self, text: str) -> List[str]: 66 | """Tokenize text into list of tokens. 67 | 68 | Args: 69 | text (str): Raw input text. 70 | 71 | Returns: 72 | List[str]: List of tokens. 73 | """ 74 | if self.annotate_hyphens: 75 | text = self.HYPHEN_PATTERN.sub(self.HYPHEN_PATTERN_REPL, text) 76 | 77 | protected_map = {} 78 | 79 | def protect_replace(match): 80 | protected_str = PROTECTED_TEMPLATE.format(len(protected_map)) 81 | protected_map[protected_str] = match.group(0) 82 | return ' {} '.format(protected_str) 83 | 84 | for i, pattern in enumerate(self.protected_patterns): 85 | text = pattern.sub(protect_replace, text) 86 | 87 | tokens = apply_break_iterator(self.break_iterator, text) 88 | return [protected_map.get(t, t) for t in tokens] 89 | -------------------------------------------------------------------------------- /icu_tokenizer/url_utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | import regex 3 | 4 | __all__ = ['email_pattern', 'grubber_url_matcher'] 5 | 6 | 7 | sub_domain_pstr = r'[0-9A-Za-z\-\_\~]+' 8 | top_domain_pstr = r'(?:[.](?:{}))'.format(r'|'.join(re.escape(s) for s in [ 9 | 'com', 'org', 'net', 'int', 'edu', 'gov', 'mil', 'ac', 'ad', 10 | 'ae', 'af', 'ag', 'ai', 'al', 'am', 'ao', 'aq', 'ar', 'as', 11 | 'at', 'au', 'aw', 'ax', 'az', 12 | 'ba', 'bb', 'bd', 'be', 'bf', 'bg', 'bh', 'bi', 'bj', 'bm', 13 | 'bn', 'bo', 'br', 'bs', 'bt', 'bw', 'by', 'bz', 14 | 'ca', 'cc', 'cd', 'cf', 'cg', 'ch', 'ci', 'ck', 'cl', 'cm', 15 | 'cn', 'co', 'cr', 'cu', 'cv', 'cw', 'cx', 'cy', 'cz', 16 | 'de', 'dj', 'dk', 'dm', 'do', 'dz', 17 | 'ec', 'ee', 'eg', 'er', 'es', 'et', 'eu', 18 | 'fi', 'fj', 'fk', 'fm', 'fo', 'fr', 19 | 'ga', 'gd', 'ge', 'gf', 'gg', 'gh', 'gi', 'gl', 'gm', 'gn', 20 | 'gp', 'gq', 'gr', 'gs', 'gt', 'gu', 'gw', 'gy', 21 | 'hk', 'hm', 'hn', 'hr', 'ht', 'hu', 22 | 'id', 'ie', 'il', 'im', 'in', 'io', 'iq', 'ir', 'is', 'it', 23 | 'je', 'jm', 'jo', 'jp', 24 | 'ke', 'kg', 'kh', 'ki', 'km', 'kn', 'kp', 'kr', 'kw', 'ky', 25 | 'kz', 26 | 'la', 'lb', 'lc', 'li', 'lk', 'lr', 'ls', 'lt', 'lu', 'lv', 27 | 'ly', 28 | 'ma', 'mc', 'md', 'me', 'mg', 'mh', 'mk', 'ml', 'mm', 'mn', 29 | 'mo', 'mp', 'mq', 'mr', 'ms', 'mt', 'mu', 'mv', 'mw', 'mx', 30 | 'my', 'mz', 31 | 'na', 'nc', 'ne', 'nf', 'ng', 'ni', 'nl', 'no', 'np', 'nr', 32 | 'nu', 'nz', 33 | 'om', 34 | 'pa', 'pe', 'pf', 'pg', 'ph', 'pk', 'pl', 'pm', 'pn', 'pr', 35 | 'ps', 'pt', 'pw', 'py', 36 | 'qa', 37 | 're', 'ro', 'rs', 'ru', 'rw', 38 | 'sa', 'sb', 'sc', 'sd', 'se', 'sg', 'sh', 'si', 'sk', 'sl', 39 | 'sm', 'sn', 'so', 
'sr', 'ss', 'st', 'su', 'sv', 'sx', 'sy', 40 | 'sz', 41 | 'tc', 'td', 'tf', 'tg', 'th', 'tj', 'tk', 'tl', 'tm', 'tn', 42 | 'to', 'tr', 'tt', 'tv', 'tw', 'tz', 43 | 'ua', 'ug', 'uk', 'us', 'uy', 'uz', 44 | 'va', 'vc', 've', 'vg', 'vi', 'vn', 'vu', 45 | 'wf', 'ws', 46 | 'ye', 'yt', 47 | 'za', 'zm', 'zw' 48 | ])) # https://en.wikipedia.org/wiki/List_of_Internet_top-level_domains 49 | 50 | domain_pstr = r'(?:{sub_domain}\.)*{sub_domain}{top_domain}'.format( 51 | sub_domain=sub_domain_pstr, 52 | top_domain=top_domain_pstr, 53 | ) 54 | 55 | # https://stackoverflow.com/questions/2049502/what-characters-are-allowed-in-an-email-address 56 | local_part_valid_word = r'[0-9A-Za-z\!\#\$\%\&\'\*\+\-\/\=\?\^\_\`\{\|\}\~]+' 57 | local_part_pstr = r'{word}(?:\.{word})*'.format(word=local_part_valid_word) 58 | 59 | email_pstr = r'({})'.format( 60 | r'{local_part}\@{domain}'.format( 61 | local_part=local_part_pstr, 62 | domain=domain_pstr, 63 | ) 64 | ) 65 | email_pattern = regex.compile(email_pstr, re.IGNORECASE) 66 | """Custom email matcher based on 67 | https://en.wikipedia.org/wiki/International_email 68 | """ 69 | 70 | 71 | # A customized grubber v1 URL matcher 72 | # Designed to work with urls starting with https, http, ftp, or www 73 | grubber_url_pstr = r'(?i)\b((?:(?:https|http|ftp):/{1,3}|www[.])[^\s()<>\(\)\【\】]+(?:\([\w\d]+\)|(?:[^!"#$%&\'()*+,\-./:;<=>?@\[\]\s\(\)\【\】。,?!]|/)))' # noqa 74 | grubber_url_matcher: re.Pattern = re.compile(grubber_url_pstr, re.ASCII) 75 | """Grubber v1 URL matcher with additional rules to account for chinese 76 | punctuations. 77 | 78 | Designed to work with urls starting with https, http, ftp, or www. 79 | """ 80 | -------------------------------------------------------------------------------- /icu_tokenizer/normalizer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import re 4 | import unicodedata 5 | from typing import Dict, List 6 | 7 | import regex 8 | 9 | from icu_tokenizer.utils import get_all_unicode_chars 10 | 11 | 12 | class Normalizer(object): 13 | """Unicode information based normalizer. 14 | 15 | Does the following 16 | 17 | - Ensure NFKC format 18 | - Handle pseudo-spaces (for numbers) 19 | - Normalize by unicode categories \ 20 | https://www.fileformat.info/info/unicode/category/index.htm 21 | 22 | - ``[C*|So|Z*]`` → ' ' 23 | - ``[Pc]`` → ``_`` 24 | - ``[Pd]`` → ``-`` 25 | - ``[Pf|Pi]`` → ``"`` (except for ``'``) 26 | - ``[Ps]`` → ``(`` (except for ``{``, ``[``) 27 | - ``[Pe]`` → ``)`` (except for ``}``, ``]``) 28 | - Normalize Nd (Numbers) 29 | - Account for some outliers 30 | - Remove non printable characters 31 | - Normalize whitespace characters 32 | - Perform language specific normalization 33 | 34 | Usage: 35 | 36 | >>> normalizer = Normalizer(lang, norm_puncts=True) 37 | >>> norm_text: str = normalizer.normalize(text) 38 | """ 39 | 40 | def __init__(self, lang: str = 'en', norm_puncts: bool = False): 41 | """Normalizer. 42 | 43 | Args: 44 | lang (str, optional): Language identifier. Defaults to 'en'. 45 | norm_puncts (bool, optional): Normalize punctuations?. 46 | Defaults to False. 
47 | """ 48 | # Handle control tokens 49 | self.ignore_pattern = regex.compile(r'\p{C}|\p{So}|\p{Z}') 50 | 51 | # Handle pseudo-spaces 52 | # Random note: it appears pseudo-spaces primarily makes a difference 53 | # when numbers are involved 54 | self.pseudo_num_pattern = re.compile(r'(\d) (\d)') 55 | 56 | # Punctuation and number replace maps 57 | self.num_pattern = regex.compile(r'\p{Nd}+') 58 | self.punct_replace_map = self.punct_pattern = None 59 | if norm_puncts: 60 | self.punct_replace_map = make_punct_replace_map() 61 | self.punct_pattern = \ 62 | make_pattern_from_keys(self.punct_replace_map.keys()) 63 | 64 | # Other language specific normalizers 65 | lang_replace_map = make_lang_specific_replace_map(lang) 66 | self.lang_replace_map = self.lang_replace_pattern = None 67 | if len(lang_replace_map) > 0: 68 | self.lang_replace_map = lang_replace_map 69 | self.lang_replace_pattern = \ 70 | make_pattern_from_keys(lang_replace_map.keys()) 71 | 72 | def _num_replace_fn(self, match: re.Match) -> str: 73 | return str(int(match.group(0))) 74 | 75 | def _punct_replace_fn(self, match: re.Match) -> str: 76 | return self.punct_replace_map[match.group(0)] 77 | 78 | def _lang_replace_fn(self, match: re.Match) -> str: 79 | return self.lang_replace_map[match.group(0)] 80 | 81 | def normalize(self, text: str) -> str: 82 | """Perform normalization. 83 | 84 | Args: 85 | text (str): Input text 86 | 87 | Returns: 88 | str: Normalized text 89 | """ 90 | text = unicodedata.normalize('NFKC', text) 91 | 92 | text = self.pseudo_num_pattern.sub(r'\1.\2', text) 93 | text = self.num_pattern.sub(self._num_replace_fn, text) 94 | if self.punct_pattern is not None: 95 | text = self.punct_pattern.sub(self._punct_replace_fn, text) 96 | 97 | text = self.ignore_pattern.sub(' ', text) 98 | text = ' '.join(text.split()) # Normalize whitespace 99 | 100 | if self.lang_replace_pattern is not None: 101 | text = self.lang_replace_pattern(self._lang_replace_fn, text) 102 | 103 | return text 104 | 105 | 106 | def make_pattern_from_keys(keys: List[str]) -> re.Pattern: 107 | """Make a re.Pattern that matches a list of strings.""" 108 | keys = sorted(keys, key=lambda x: len(x), reverse=True) 109 | pattern_str = r'|'.join(re.escape(k) for k in keys) 110 | return re.compile(pattern_str) 111 | 112 | 113 | def make_punct_replace_map() -> Dict[str, str]: 114 | """Make the punctuation replace map.""" 115 | # Generate punctuation and number replace maps 116 | punct_replace_map = {} 117 | 118 | # Normalization rules based on unicode category 119 | punct_exceptions = {"'", '[', ']', '{', '}'} 120 | for c in get_all_unicode_chars(): 121 | if c in punct_exceptions: 122 | continue 123 | 124 | cat = unicodedata.category(c) 125 | if cat == 'Pc': 126 | punct_replace_map[c] = '_' 127 | elif cat == 'Pd': 128 | punct_replace_map[c] = '-' 129 | elif cat == 'Pe': 130 | punct_replace_map[c] = ')' 131 | elif cat == 'Pf': 132 | punct_replace_map[c] = '"' 133 | elif cat == 'Pi': 134 | punct_replace_map[c] = '"' 135 | elif cat == 'Ps': 136 | punct_replace_map[c] = '(' 137 | 138 | # User provided rules 139 | 140 | # Soft hyphen 141 | punct_replace_map['\xad'] = '' 142 | 143 | # Double quotes 144 | punct_replace_map["''"] = '"' 145 | punct_replace_map["´´"] = '"' 146 | punct_replace_map['„'] = '"' 147 | 148 | # Apostrophes 149 | punct_replace_map["`"] = "'" 150 | punct_replace_map['´'] = "'" 151 | punct_replace_map['‘'] = "'" 152 | punct_replace_map['’'] = "'" 153 | punct_replace_map['‚'] = "'" # Not a comma 154 | 155 | # Brackets 156 | 
punct_replace_map['【'] = '[' 157 | punct_replace_map['】'] = ']' 158 | punct_replace_map['['] = '[' 159 | punct_replace_map[']'] = ']' 160 | 161 | # Common unicode variations 162 | punct_replace_map['∶'] = ':' 163 | punct_replace_map['?'] = '?' 164 | punct_replace_map['.'] = '.' 165 | punct_replace_map['━'] = '-' 166 | punct_replace_map['%'] = '%' 167 | 168 | # Chinese punctuations 169 | punct_replace_map['!'] = '!' 170 | punct_replace_map['、'] = ',' 171 | punct_replace_map['|'] = '|' 172 | punct_replace_map[':'] = ':' 173 | punct_replace_map[';'] = ';' 174 | punct_replace_map[','] = ',' 175 | punct_replace_map['。'] = '.' 176 | punct_replace_map['~'] = '~' 177 | 178 | # Others 179 | punct_replace_map['…'] = '...' 180 | 181 | return punct_replace_map 182 | 183 | 184 | def make_lang_specific_replace_map(lang: str = 'en') -> Dict[str, str]: 185 | """Create a language specific replace map.""" 186 | replace_map = {} 187 | 188 | if lang == 'ro': 189 | # Remove diacritics for romanian 190 | replace_map['Ş'] = 'S' 191 | replace_map['ş'] = 's' 192 | 193 | replace_map['Ș'] = 'S' 194 | replace_map['ș'] = 's' 195 | 196 | replace_map['Ţ'] = 'T' 197 | replace_map['ţ'] = 't' 198 | 199 | replace_map['Ț'] = 'T' 200 | replace_map['ț'] = 't' 201 | 202 | replace_map['Ă'] = 'A' 203 | replace_map['ă'] = 'a' 204 | 205 | replace_map['Â'] = 'A' 206 | replace_map['â'] = 'a' 207 | 208 | replace_map['Î'] = 'I' 209 | replace_map['î'] = 'i' 210 | 211 | return replace_map 212 | --------------------------------------------------------------------------------
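The `SentSplitter`, `Normalizer`, and `Tokenizer` classes shown above compose into a simple preprocessing pipeline. Below is a minimal sketch using only the public API exported in `icu_tokenizer/__init__.py`; the language code, sample paragraph, and printed formatting are illustrative, not part of the package.

```py
# Split a paragraph into sentences, then normalize and tokenize each one.
# 'en' and the sample paragraph are illustrative placeholders.
from icu_tokenizer import Normalizer, SentSplitter, Tokenizer

lang = 'en'
splitter = SentSplitter(lang)
normalizer = Normalizer(lang=lang, norm_puncts=True)
tokenizer = Tokenizer(lang=lang)

paragraph = "The products you ordered will be shipped directly from Korea. Delivery takes 3 weeks."
for sentence in splitter.split(paragraph):
    tokens = tokenizer.tokenize(normalizer.normalize(sentence))
    print(' '.join(tokens))
```

The same steps are available from the command line via the subcommands documented in `docs/tools.rst`, e.g. `python -m icu_tokenizer tokenize -l en -i input.txt -o output.txt` (file names illustrative).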