├── .github └── workflows │ ├── ci.yml │ └── cron.yml ├── .gitignore ├── CHANGELOG.md ├── LICENSE ├── Makefile ├── README.md ├── pyproject.toml ├── requirements-dev.txt ├── setup.cfg ├── setup.py ├── src └── pypinyin_g2pw │ ├── __init__.py │ └── g2pw.py └── tests └── test_pinyin.py /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: CI 5 | 6 | on: 7 | push: 8 | pull_request: 9 | workflow_dispatch: 10 | 11 | jobs: 12 | test: 13 | 14 | runs-on: ubuntu-latest 15 | strategy: 16 | matrix: 17 | python-version: [3.7, 3.8, 3.9, '3.10', 3.11] 18 | 19 | steps: 20 | - uses: actions/checkout@v3 21 | - name: Set up Python 22 | uses: actions/setup-python@v4 23 | with: 24 | python-version: ${{ matrix.python-version }} 25 | 26 | - name: Install dependencies 27 | run: | 28 | python -m pip install --upgrade pip 29 | pip install flake8 pytest 30 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 31 | if [ -f requirements-dev.txt ]; then pip install -r requirements-dev.txt; fi 32 | pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu 33 | pip3 install . 34 | 35 | - name: Lint with flake8 36 | run: | 37 | echo skip 38 | # stop the build if there are Python syntax errors or undefined names 39 | # flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 40 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 41 | # flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 42 | # 43 | - name: Download models 44 | run: | 45 | wget https://storage.googleapis.com/esun-ai/g2pW/G2PWModel-v2-onnx.zip 46 | unzip G2PWModel-v2-onnx.zip 47 | 48 | git lfs install 49 | git clone https://huggingface.co/bert-base-chinese 50 | 51 | - name: Test with pytest 52 | run: | 53 | make test 54 | 55 | - name: run demo file 56 | run: | 57 | python tests/test_pinyin.py 58 | -------------------------------------------------------------------------------- /.github/workflows/cron.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: cron 5 | 6 | on: 7 | workflow_dispatch: 8 | schedule: 9 | - cron: "05 06 */1 * *" 10 | 11 | jobs: 12 | test: 13 | 14 | runs-on: ubuntu-latest 15 | 16 | steps: 17 | - uses: actions/checkout@v2 18 | - name: Set up Python 3.9 19 | uses: actions/setup-python@v2 20 | with: 21 | python-version: 3.9 22 | 23 | - name: Install dependencies 24 | run: | 25 | python -m pip install --upgrade pip 26 | pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu 27 | 28 | - name: Download models 29 | run: | 30 | wget https://storage.googleapis.com/esun-ai/g2pW/G2PWModel-v2-onnx.zip 31 | unzip G2PWModel-v2-onnx.zip 32 | 33 | git lfs install 34 | git clone https://huggingface.co/bert-base-chinese 35 | 36 | - name: run demo file with stable 37 | run: | 38 | pip install pypinyin-g2pw 39 | python tests/test_pinyin.py 40 | 41 | - name: run demo file with master code 42 | run: | 43 | pip3 install . 
44 | python tests/test_pinyin.py 45 | 46 | - name: Test with pytest 47 | run: | 48 | pip install flake8 pytest 49 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 50 | if [ -f requirements-dev.txt ]; then pip install -r requirements-dev.txt; fi 51 | make test 52 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # ChangeLog 2 | 3 | ## [0.4.0] (2023-06-24) 4 | 5 | * 新增参数 ``tone_sandhi=False`` 。 6 | 7 | 8 | ## [0.3.0] (2022-09-03) 9 | 10 | * 依赖最新的 0.1.1 版本的 g2pW,实现完全离线使用的需求(详见 README)。 11 | 12 | 13 | ## [0.2.0] (2022-08-28) 14 | 15 | * 支持最新 0.1.0 版本的 g2pW (使用时需要下载新的模型文件,详见 README)。 16 | 17 | 18 | ## 0.1.0 (2022-08-21) 19 | 20 | * 基于 g2pW 0.0.6 实现的第一个版本。 21 | 22 | 23 | [0.2.0]: https://github.com/mozillazg/pypinyin-g2pW/compare/v0.1.0...v0.2.0 24 | [0.3.0]: https://github.com/mozillazg/pypinyin-g2pW/compare/v0.2.0...v0.3.0 25 | [0.4.0]: https://github.com/mozillazg/pypinyin-g2pW/compare/v0.3.0...v0.4.0 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 mozillazg 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software 
is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | help: 3 | @echo "test: test" 4 | @echo "publish_test: publish to test registry" 5 | @echo "publish: publish to stable registry" 6 | 7 | .PHONY: build 8 | build: clean 9 | python3 -m build -s -w -n 10 | 11 | .PHONY: test-with-cov 12 | test-with-cov: 13 | PYTHONPATH=$(shell pwd)/src py.test --cov pypinyin_g2pw tests src/pypinyin_g2pw -v 14 | 15 | .PHONY: test 16 | test: 17 | PYTHONPATH=$(shell pwd)/src py.test tests src/pypinyin_g2pw -v 18 | 19 | .PHONY: publish 20 | publish: build 21 | @echo "publish to pypi" 22 | twine upload dist/* 23 | 24 | .PHONY: publish_test 25 | publish_test: build 26 | @echo "publish to test pypi" 27 | twine upload --repository test dist/* 28 | 29 | clean: clean-build clean-pyc clean-test 30 | 31 | clean-build: 32 | rm -fr build/ 33 | rm -fr dist/ 34 | rm -fr .eggs/ 35 | find . -name '*.egg-info' -exec rm -fr {} + 36 | find . -name '*.egg' -exec rm -f {} + 37 | 38 | clean-pyc: 39 | find . -name '*.pyc' -exec rm -f {} + 40 | find . -name '*.pyo' -exec rm -f {} + 41 | find . -name '*~' -exec rm -f {} + 42 | find . 
-name '__pycache__' -exec rm -fr {} + 43 | 44 | clean-test: 45 | rm -fr .tox/ 46 | rm -f .coverage 47 | rm -rf .cache/ 48 | rm -rf .pytest_cache/ 49 | rm -fr htmlcov/ 50 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pypinyin-g2pW 2 | 3 | 基于 [g2pW](https://github.com/GitYCC/g2pW/) 提升 [pypinyin](https://github.com/mozillazg/python-pinyin) 的准确性。 4 | 5 | 特点: 6 | 7 | * 可以通过训练模型的方式提升拼音准确性。 8 | * 功能和使用习惯与 pypinyin 基本保持一致,支持多种拼音风格。 9 | 10 | 11 | ## 使用 12 | 13 | ### 安装依赖 14 | 15 | 1. 安装 [PyTorch](https://pytorch.org/get-started/locally/)。 16 | 2. 下载并解压 G2PWModel: 17 | 18 | ``` 19 | wget https://storage.googleapis.com/esun-ai/g2pW/G2PWModel-v2-onnx.zip 20 | unzip G2PWModel-v2-onnx.zip 21 | ``` 22 | 3. 安装 [git-lfs](https://git-lfs.github.com/)。 23 | 4. 下载 [bert-base-chinese](https://huggingface.co/bert-base-chinese): 24 | 25 | ``` 26 | git lfs install 27 | git clone https://huggingface.co/bert-base-chinese 28 | ``` 29 | 5. 
安装本项目: 30 | 31 | ``` 32 | pip install pypinyin-g2pw 33 | ``` 34 | 35 | ### 使用示例 36 | 37 | ```python 38 | >>> from pypinyin import Style 39 | >>> from pypinyin_g2pw import G2PWPinyin 40 | 41 | # 需要将 model_dir 和 model_source 的值指向下载的模型数据目录 42 | >>> g2pw = G2PWPinyin(model_dir='G2PWModel/', 43 | model_source='bert-base-chinese/', 44 | v_to_u=False, neutral_tone_with_five=True) 45 | >>> han = '然而,他红了20年以后,他竟退出了大家的视线。' 46 | 47 | # def lazy_pinyin(self, hans, style=Style.NORMAL, errors='default', strict=True, **kwargs) 48 | # 通过 lazy_pinyin 方法获取拼音数据,各个参数的含义和作用跟 pypinyin 中是一样的, 49 | # v_to_u 和 neutral_tone_with_five 参数只能在初始化 G2PWPinyin 时指定。 50 | 51 | >>> g2pw.lazy_pinyin(han) 52 | ['ran', 'er', ',', 'ta', 'hong', 'le', '20', 'nian', 'yi', 'hou', ',', 'ta', 'jing', 'tui', 'chu', 'le', 'da', 'jia', 'de', 'shi', 'xian', '。'] 53 | 54 | >>> g2pw.lazy_pinyin(han, style=Style.TONE) 55 | ['rán', 'ér', ',', 'tā', 'hóng', 'le', '20', 'nián', 'yǐ', 'hòu', ',', 'tā', 'jìng', 'tuì', 'chū', 'le', 'dà', 'jiā', 'de', 'shì', 'xiàn', '。'] 56 | 57 | >>> g2pw.lazy_pinyin(han, style=Style.TONE3) 58 | ['ran2', 'er2', ',', 'ta1', 'hong2', 'le5', '20', 'nian2', 'yi3', 'hou4', ',', 'ta1', 'jing4', 'tui4', 'chu1', 'le5', 'da4', 'jia1', 'de5', 'shi4', 'xian4', '。'] 59 | ``` 60 | 61 | ## 离线使用 62 | 63 | 默认情况下,即便使用了离线的模型数据,程序使用的 transformers 模块仍旧会从 huggingface.co 下载部分模型元数据。 64 | 可以通过设置环境变量 `TRANSFORMERS_OFFLINE=1` 以及环境变量 `HF_DATASETS_OFFLINE=1` 禁用获取元数据的操作,实现完全离线使用的需求。 65 | 详见 [transformers 官方文档](https://huggingface.co/docs/transformers/v4.21.2/en/installation#offline-mode)。 66 | 67 | 68 | ## 模型训练 69 | 70 | 详见 [g2pW](https://github.com/GitYCC/g2pW/#train-model) 官方文档中的说明。 71 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools>=42", 4 | # "wheel" 5 | ] 6 | build-backend = "setuptools.build_meta" 
-------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | pytest>=6.2.5 2 | pytest-cov>=2.12.1 3 | twine>=3.4.2 4 | wheel>=0.37.0 5 | build>=0.7.0 6 | setuptools>=42 7 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | universal = 1 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from codecs import open 3 | import setuptools 4 | 5 | with open("README.md", "r", encoding="utf-8") as fh: 6 | long_description = fh.read() 7 | 8 | 9 | setuptools.setup( 10 | name="pypinyin-g2pw", 11 | version="0.4.0", 12 | author="mozillazg", 13 | author_email="mozillazg101@gmail.com", 14 | description="基于 g2pW 提升 pypinyin 的准确性。", 15 | long_description=long_description, 16 | long_description_content_type="text/markdown", 17 | url="https://github.com/mozillazg/pypinyin-g2pW", 18 | project_urls={ 19 | "Bug Tracker": "https://github.com/mozillazg/pypinyin-g2pW/issues", 20 | }, 21 | classifiers=[ 22 | "Programming Language :: Python :: 3", 23 | "License :: OSI Approved :: MIT License", 24 | "Operating System :: OS Independent", 25 | ], 26 | package_dir={"": "src"}, 27 | install_requires=['g2pw>=0.1.1', 'pypinyin>=0.47.1'], 28 | packages=setuptools.find_packages(where="src"), 29 | python_requires='>=3.6, <4', 30 | ) 31 | -------------------------------------------------------------------------------- /src/pypinyin_g2pw/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from .g2pw import G2PWPinyin 4 | 5 | __all__ = ['G2PWPinyin'] 6 | 
# --------------------------------------------------------------------------------
# /src/pypinyin_g2pw/g2pw.py:
# --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""pypinyin front-end that asks g2pW for the reading of Chinese characters."""

from pypinyin.constants import RE_HANS
from pypinyin.core import Pinyin, Style
from pypinyin.seg.simpleseg import simple_seg
from pypinyin.converter import UltimateConverter
from pypinyin.contrib.tone_convert import to_tone
from g2pw.api import G2PWConverter


class G2PWPinyin(Pinyin):
    """``Pinyin`` subclass whose converter looks readings up through g2pW.

    The constructor builds a ``G2PWConverter`` (always with
    ``style='pinyin'``) and wraps it in :class:`Converter`, which becomes
    the converter used by the inherited ``pinyin`` / ``lazy_pinyin``
    methods.

    :param model_dir: forwarded to ``G2PWConverter`` as ``model_dir``.
    :param model_source: forwarded to ``G2PWConverter`` as ``model_source``.
    :param num_workers: forwarded to ``G2PWConverter``.
    :param batch_size: forwarded to ``G2PWConverter``.
    :param turnoff_tqdm: forwarded to ``G2PWConverter``.
    :param enable_non_tradional_chinese: forwarded to ``G2PWConverter``
        (the spelling is the keyword this code passes to g2pW, so it must
        stay as it is).
    :param v_to_u: forwarded to :class:`Converter`.
    :param neutral_tone_with_five: forwarded to :class:`Converter`.
    :param tone_sandhi: forwarded to :class:`Converter`.
    :param kwargs: accepted for call compatibility; not used here.
    """

    def __init__(self, model_dir='G2PWModel/', model_source=None,
                 num_workers=None, batch_size=None,
                 turnoff_tqdm=True, enable_non_tradional_chinese=True,
                 v_to_u=False, neutral_tone_with_five=False,
                 tone_sandhi=False, **kwargs):
        self._g2pw = G2PWConverter(
            model_dir=model_dir,
            style='pinyin',
            model_source=model_source,
            num_workers=num_workers,
            batch_size=batch_size,
            turnoff_tqdm=turnoff_tqdm,
            enable_non_tradional_chinese=enable_non_tradional_chinese,
        )
        converter = Converter(
            self._g2pw, v_to_u=v_to_u,
            neutral_tone_with_five=neutral_tone_with_five,
            tone_sandhi=tone_sandhi,
        )
        # Run the base initializer so any state ``Pinyin`` sets up exists
        # on this instance as well.
        super(G2PWPinyin, self).__init__(converter)
        # Assigned explicitly too, so the attribute the rest of this module
        # relies on does not depend on how the base class stores it.
        self._converter = converter

    def get_seg(self, **kwargs):
        """Return ``simple_seg`` as the segmentation function."""
        return simple_seg


class Converter(UltimateConverter):
    """``UltimateConverter`` that takes Han readings from a g2pW instance.

    NOTE(review): ``tone_sandhi`` is simply handed to
    ``UltimateConverter.__init__``; confirm that the minimum pypinyin
    version declared in setup.py actually implements that option.

    :param g2pw_instance: callable used as ``g2pw_instance(han)``; the code
        below reads element ``[0]`` of its result and treats each entry as
        either ``None`` or a pinyin string accepted by ``to_tone``.
    :param v_to_u: forwarded to ``UltimateConverter``.
    :param neutral_tone_with_five: forwarded to ``UltimateConverter``.
    :param tone_sandhi: forwarded to ``UltimateConverter``.
    :param kwargs: forwarded to ``UltimateConverter``.
    """

    def __init__(self, g2pw_instance, v_to_u=False,
                 neutral_tone_with_five=False,
                 tone_sandhi=False, **kwargs):
        super(Converter, self).__init__(
            v_to_u=v_to_u,
            neutral_tone_with_five=neutral_tone_with_five,
            tone_sandhi=tone_sandhi, **kwargs)

        self._g2pw = g2pw_instance

    def convert(self, words, style, heteronym, errors, strict, **kwargs):
        """Convert one segment to a list of per-character pinyin lists.

        Segments matching ``RE_HANS`` go through :meth:`_to_pinyin`, then
        ``post_pinyin`` and ``convert_styles``; everything else is passed
        to ``handle_nopinyin``.  Duplicates and empty entries are removed
        from the result before it is returned.
        """
        pys = []
        if RE_HANS.match(words):
            pys = self._to_pinyin(words, style=style, heteronym=heteronym,
                                  errors=errors, strict=strict)
            post_data = self.post_pinyin(words, heteronym, pys)
            # ``post_pinyin`` returning None means "keep what we have".
            if post_data is not None:
                pys = post_data

            pys = self.convert_styles(
                pys, words, style, heteronym, errors, strict)

        else:
            py = self.handle_nopinyin(words, style=style, errors=errors,
                                      heteronym=heteronym, strict=strict)
            if py:
                pys.extend(py)

        return _remove_dup_and_empty(pys)

    def _to_pinyin(self, han, style, heteronym, errors, strict, **kwargs):
        """Return ``Style.TONE`` pinyin for ``han``, preferring g2pW.

        The result is a list with one inner list per character.  ``style``
        is not applied here; :meth:`convert` applies it afterwards through
        ``convert_styles``.
        """
        g2pw_pinyin = self._g2pw(han)

        # g2pw returned nothing usable: fall back to pypinyin's own logic
        # for the whole segment.
        if not g2pw_pinyin:
            return super(Converter, self).convert(
                han, Style.TONE, heteronym, errors, strict, **kwargs)

        pinyins = []

        for i, item in enumerate(g2pw_pinyin[0]):
            if item is None:
                # Character not supported by g2pw: fall back to pypinyin's
                # own logic for this single character.
                py = super(Converter, self).convert(
                    han[i], Style.TONE, heteronym, errors, strict, **kwargs)
                pinyins.extend(py)
            else:
                pinyins.append([to_tone(item)])

        return pinyins


def _remove_dup_items(lst, remove_empty=False):
    """Return ``lst`` without repeated items, keeping first-seen order.

    When ``remove_empty`` is true, falsy items are dropped as well.
    Membership is tested against the output list, so items do not need to
    be hashable.
    """
    new_lst = []
    for item in lst:
        if remove_empty and not item:
            continue
        if item not in new_lst:
            new_lst.append(item)
    return new_lst


def _remove_dup_and_empty(lst_list):
    """De-duplicate every inner list; an inner list left empty becomes ``['']``."""
    return [
        _remove_dup_items(lst, remove_empty=True) or ['']
        for lst in lst_list
    ]

# --------------------------------------------------------------------------------
# /tests/test_pinyin.py:
# --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import os
import time

from pypinyin import lazy_pinyin, Style

from pypinyin_g2pw import G2PWPinyin

# Model locations can be overridden through the environment.
model_dir = os.getenv('MODEL_DIR', 'G2PWModel/')
model_source = os.getenv('MODEL_SOURCE', 'bert-base-chinese/')


def test_lazy_pinyin():
    """Run plain pypinyin and the g2pW-backed version on the same text.

    Prints both results with their elapsed time and checks that each call
    produced some output.
    """
    han = ('它没有婆娑的姿态,没有屈曲盘旋的虬枝,也许你要说它不美丽,'
           '——如果美是专指“婆娑”或“横斜逸出”之类而言,那么,'
           '白杨树算不得树中的好女子;但是它却是伟岸,正直,朴质,严肃,'
           '也不缺乏温和,更不用提它的坚强不屈与挺拔,它是树中的伟丈夫!')
    g2pw = G2PWPinyin(model_dir=model_dir, model_source=model_source)

    # perf_counter is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments.
    start = time.perf_counter()
    p1 = lazy_pinyin(han, style=Style.TONE3)
    t1 = time.perf_counter() - start

    start = time.perf_counter()
    p2 = g2pw.lazy_pinyin(han, style=Style.TONE3)
    t2 = time.perf_counter() - start

    print('han: \n{}'.format(han))

    print('pypinyin {}s: \n{}'.format(t1, ' '.join(p1)))

    print('pypinyin_g2pw {}s: \n{}'.format(t2, ' '.join(p2)))

    assert p1, 'pypinyin returned no pinyin'
    assert p2, 'pypinyin_g2pw returned no pinyin'


if __name__ == '__main__':
    test_lazy_pinyin()

# --------------------------------------------------------------------------------