├── .gitignore ├── readme.md ├── requirements.txt ├── setup.py ├── test.py └── textfrontend ├── __init__.py ├── arpabet.py ├── g2pw ├── __init__.py ├── dataset.py ├── onnx_api.py └── utils.py ├── generate_lexicon.py ├── mix_frontend.py ├── normalizer ├── __init__.py ├── abbrrviation.py ├── acronyms.py ├── normalizer.py ├── numbers.py └── width.py ├── phonectic.py ├── polyphonic.yaml ├── punctuation.py ├── ssml ├── __init__.py └── xml_processor.py ├── tone_sandhi.py ├── version.py ├── vocab.py ├── zh_frontend.py └── zh_normalization ├── README.md ├── __init__.py ├── char_convert.py ├── chronology.py ├── constants.py ├── num.py ├── phonecode.py ├── quantifier.py └── text_normlization.py /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/ 2 | __pycache__/ 3 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # Text Frontend 2 | 3 | Chinese TTS resources are scattered: there is no large-scale open dataset, and tools that can do accurate Chinese TTS frontend processing are rare (this area is arguably behind even Vietnamese). This repository packages the TTS frontend as a standalone library, so a TTS engine no longer needs to worry about frontend processing: once you have the pinyin and prosody, you just convert them to tokens and synthesize. 4 | 5 | > WIP: work in progress. 6 | 7 | The current version focuses on three problems: 8 | 9 | - Prosody prediction; 10 | - Polyphone prediction; 11 | - Pinyin prediction. 12 | 13 | With these solved, the TTS frontend problem is basically covered. 14 | 15 | The current implementation is mainly based on PaddleSpeech, combining its pipeline with some practical experience into a simple, unified interface. The models it uses can run on ONNXRuntime, which makes deployment simpler. 16 | 17 | # Installation 18 | 19 | note: we do not need paddle 20 | 21 | ``` 22 | pip install -r requirements.txt 23 | python setup.py build develop 24 | ``` 25 | 26 | # Usage 27 | 28 | ```python 29 | from textfrontend.zh_frontend import Frontend 30 | 31 | fe = Frontend() 32 | phonemes = fe.get_phonemes('从前有座山,山里有座庙,庙里有个老和尚') 33 | print(phonemes) 34 | ``` 35 | 36 | 37 | # QA 38 | 39 | 1. `from pypinyin_dict.phrase_pinyin_data import large_pinyin` raises `OverflowError: line number table is too long` 40 | 41 | This is most likely a Python 3.10 issue. -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pypinyin_dict 2 | g2p_en 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import time 4 | from setuptools import find_packages, setup 5 | import io 6 | from os import path 7 | 8 | 9 | this_directory = path.abspath(path.dirname(__file__)) 10 | with io.open(path.join(this_directory, "readme.md"), encoding="utf-8") as f: 11 | long_description = f.read() 12 | 13 | 14 | version_file = "textfrontend/version.py" 15 | 16 | 17 | def get_version(): 18 | with open(version_file, "r") as f: 19 | exec(compile(f.read(), version_file, "exec")) 20 | return locals()["__version__"] 21 | 22 | 23 | if __name__ == "__main__": 24 | setup( 25 | name="textfrontend", 26 | version=get_version(), 27 | description="textfrontend: A Chinese TTS Training and Deploy Framework", 28 | long_description=long_description, 29 | author="LucasJin", 30 | author_email="jinfagang19@163.com", 31 | keywords="text to speech, TTS frontend, Chinese G2P", 32 | url="https://github.com/jinfagang/textfrontend", 33 | packages=find_packages(exclude=("configs", "tools", "demo", "images")), 34 | classifiers=[ 35 | "Development Status :: 4 - Beta", 36 | "License :: OSI Approved :: Apache Software License", 37 | "Operating System :: OS Independent", 38 | "Programming Language :: Python :: 3", 39 | "Programming Language :: Python :: 3.5", 40 | "Programming 
Language :: Python :: 3.6", 41 | "Programming Language :: Python :: 3.7", 42 | "Programming Language :: Python :: 3.8", 43 | "Programming Language :: Python :: 3.9", 44 | ], 45 | license="Apache License 2.0", 46 | zip_safe=False, 47 | ) 48 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | from textfrontend.zh_frontend import Frontend 2 | 3 | fe = Frontend() 4 | res = fe.get_phonemes('你之前可不是这么说的,一只兔子能做什么,黑兔子呢') 5 | print(res) -------------------------------------------------------------------------------- /textfrontend/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from .generate_lexicon import * 15 | from .normalizer import * 16 | from .phonectic import * 17 | from .punctuation import * 18 | from .tone_sandhi import * 19 | from .vocab import * 20 | from .zh_normalization import * 21 | -------------------------------------------------------------------------------- /textfrontend/arpabet.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from textfrontend.phonectic import Phonetics 15 | """ 16 | A phonology system with ARPABET symbols and limited punctuations. The G2P 17 | conversion is done by g2p_en. 18 | 19 | Note that g2p_en does not handle words with hypen well. So make sure the input 20 | sentence is first normalized. 21 | """ 22 | from textfrontend.vocab import Vocab 23 | from g2p_en import G2p 24 | 25 | 26 | class ARPABET(Phonetics): 27 | """A phonology for English that uses ARPABET as the phoneme vocabulary. 28 | See http://www.speech.cs.cmu.edu/cgi-bin/cmudict for more details. 
29 | Phoneme Example Translation 30 | ------- ------- ----------- 31 | AA odd AA D 32 | AE at AE T 33 | AH hut HH AH T 34 | AO ought AO T 35 | AW cow K AW 36 | AY hide HH AY D 37 | B be B IY 38 | CH cheese CH IY Z 39 | D dee D IY 40 | DH thee DH IY 41 | EH Ed EH D 42 | ER hurt HH ER T 43 | EY ate EY T 44 | F fee F IY 45 | G green G R IY N 46 | HH he HH IY 47 | IH it IH T 48 | IY eat IY T 49 | JH gee JH IY 50 | K key K IY 51 | L lee L IY 52 | M me M IY 53 | N knee N IY 54 | NG ping P IH NG 55 | OW oat OW T 56 | OY toy T OY 57 | P pee P IY 58 | R read R IY D 59 | S sea S IY 60 | SH she SH IY 61 | T tea T IY 62 | TH theta TH EY T AH 63 | UH hood HH UH D 64 | UW two T UW 65 | V vee V IY 66 | W we W IY 67 | Y yield Y IY L D 68 | Z zee Z IY 69 | ZH seizure S IY ZH ER 70 | """ 71 | phonemes = [ 72 | 'AA', 'AE', 'AH', 'AO', 'AW', 'AY', 'B', 'CH', 'D', 'DH', 'EH', 'ER', 73 | 'EY', 'F', 'G', 'HH', 'IH', 'IY', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 74 | 'OY', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UW', 'UH', 'V', 'W', 'Y', 'Z', 75 | 'ZH' 76 | ] 77 | punctuations = [',', '.', '?', '!'] 78 | symbols = phonemes + punctuations 79 | _stress_to_no_stress_ = { 80 | 'AA0': 'AA', 81 | 'AA1': 'AA', 82 | 'AA2': 'AA', 83 | 'AE0': 'AE', 84 | 'AE1': 'AE', 85 | 'AE2': 'AE', 86 | 'AH0': 'AH', 87 | 'AH1': 'AH', 88 | 'AH2': 'AH', 89 | 'AO0': 'AO', 90 | 'AO1': 'AO', 91 | 'AO2': 'AO', 92 | 'AW0': 'AW', 93 | 'AW1': 'AW', 94 | 'AW2': 'AW', 95 | 'AY0': 'AY', 96 | 'AY1': 'AY', 97 | 'AY2': 'AY', 98 | 'EH0': 'EH', 99 | 'EH1': 'EH', 100 | 'EH2': 'EH', 101 | 'ER0': 'ER', 102 | 'ER1': 'ER', 103 | 'ER2': 'ER', 104 | 'EY0': 'EY', 105 | 'EY1': 'EY', 106 | 'EY2': 'EY', 107 | 'IH0': 'IH', 108 | 'IH1': 'IH', 109 | 'IH2': 'IH', 110 | 'IY0': 'IY', 111 | 'IY1': 'IY', 112 | 'IY2': 'IY', 113 | 'OW0': 'OW', 114 | 'OW1': 'OW', 115 | 'OW2': 'OW', 116 | 'OY0': 'OY', 117 | 'OY1': 'OY', 118 | 'OY2': 'OY', 119 | 'UH0': 'UH', 120 | 'UH1': 'UH', 121 | 'UH2': 'UH', 122 | 'UW0': 'UW', 123 | 'UW1': 'UW', 124 | 'UW2': 'UW' 125 | } 126 | 127 | def __init__(self): 128 | self.backend = G2p() 129 | self.vocab = Vocab(self.phonemes + self.punctuations) 130 | 131 | def _remove_vowels(self, phone): 132 | return self._stress_to_no_stress_.get(phone, phone) 133 | 134 | def phoneticize(self, sentence, add_start_end=False): 135 | """ Normalize the input text sequence and convert it into pronunciation sequence. 136 | Args: 137 | sentence (str): The input text sequence. 138 | 139 | Returns: 140 | List[str]: The list of pronunciation sequence. 141 | """ 142 | phonemes = [ 143 | self._remove_vowels(item) for item in self.backend(sentence) 144 | ] 145 | if add_start_end: 146 | start = self.vocab.start_symbol 147 | end = self.vocab.end_symbol 148 | phonemes = [start] + phonemes + [end] 149 | phonemes = [item for item in phonemes if item in self.vocab.stoi] 150 | return phonemes 151 | 152 | def numericalize(self, phonemes): 153 | """ Convert pronunciation sequence into pronunciation id sequence. 154 | 155 | Args: 156 | phonemes (List[str]): The list of pronunciation sequence. 157 | 158 | Returns: 159 | List[int]: The list of pronunciation id sequence. 160 | """ 161 | ids = [self.vocab.lookup(item) for item in phonemes] 162 | return ids 163 | 164 | def reverse(self, ids): 165 | """ Reverse the list of pronunciation id sequence to a list of pronunciation sequence. 166 | 167 | Args: 168 | ids( List[int]): The list of pronunciation id sequence. 169 | 170 | Returns: 171 | List[str]: 172 | The list of pronunciation sequence. 
173 | """ 174 | return [self.vocab.reverse(i) for i in ids] 175 | 176 | def __call__(self, sentence, add_start_end=False): 177 | """ Convert the input text sequence into pronunciation id sequence. 178 | 179 | Args: 180 | sentence (str): The input text sequence. 181 | 182 | Returns: 183 | List[str]: The list of pronunciation id sequence. 184 | """ 185 | return self.numericalize( 186 | self.phoneticize(sentence, add_start_end=add_start_end)) 187 | 188 | @property 189 | def vocab_size(self): 190 | """ Vocab size. 191 | """ 192 | # 47 = 39 phones + 4 punctuations + 4 special tokens 193 | return len(self.vocab) 194 | 195 | 196 | class ARPABETWithStress(Phonetics): 197 | phonemes = [ 198 | 'AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', 'AH2', 'AO0', 199 | 'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH', 'D', 200 | 'DH', 'EH0', 'EH1', 'EH2', 'ER0', 'ER1', 'ER2', 'EY0', 'EY1', 'EY2', 201 | 'F', 'G', 'HH', 'IH0', 'IH1', 'IH2', 'IY0', 'IY1', 'IY2', 'JH', 'K', 202 | 'L', 'M', 'N', 'NG', 'OW0', 'OW1', 'OW2', 'OY0', 'OY1', 'OY2', 'P', 'R', 203 | 'S', 'SH', 'T', 'TH', 'UH0', 'UH1', 'UH2', 'UW0', 'UW1', 'UW2', 'V', 204 | 'W', 'Y', 'Z', 'ZH' 205 | ] 206 | punctuations = [',', '.', '?', '!'] 207 | symbols = phonemes + punctuations 208 | 209 | def __init__(self): 210 | self.backend = G2p() 211 | self.vocab = Vocab(self.phonemes + self.punctuations) 212 | 213 | def phoneticize(self, sentence, add_start_end=False): 214 | """ Normalize the input text sequence and convert it into pronunciation sequence. 215 | 216 | Args: 217 | sentence (str): The input text sequence. 218 | 219 | Returns: 220 | List[str]: The list of pronunciation sequence. 221 | """ 222 | phonemes = self.backend(sentence) 223 | if add_start_end: 224 | start = self.vocab.start_symbol 225 | end = self.vocab.end_symbol 226 | phonemes = [start] + phonemes + [end] 227 | phonemes = [item for item in phonemes if item in self.vocab.stoi] 228 | return phonemes 229 | 230 | def numericalize(self, phonemes): 231 | """ Convert pronunciation sequence into pronunciation id sequence. 232 | 233 | Args: 234 | phonemes (List[str]): The list of pronunciation sequence. 235 | 236 | Returns: 237 | List[int]: The list of pronunciation id sequence. 238 | """ 239 | ids = [self.vocab.lookup(item) for item in phonemes] 240 | return ids 241 | 242 | def reverse(self, ids): 243 | """ Reverse the list of pronunciation id sequence to a list of pronunciation sequence. 244 | Args: 245 | ids (List[int]): The list of pronunciation id sequence. 246 | 247 | Returns: 248 | List[str]: The list of pronunciation sequence. 249 | """ 250 | return [self.vocab.reverse(i) for i in ids] 251 | 252 | def __call__(self, sentence, add_start_end=False): 253 | """ Convert the input text sequence into pronunciation id sequence. 254 | Args: 255 | sentence (str): The input text sequence. 256 | 257 | Returns: 258 | List[str]: The list of pronunciation id sequence. 259 | """ 260 | return self.numericalize( 261 | self.phoneticize(sentence, add_start_end=add_start_end)) 262 | 263 | @property 264 | def vocab_size(self): 265 | """ Vocab size. 
266 | """ 267 | # 77 = 69 phones + 4 punctuations + 4 special tokens 268 | return len(self.vocab) 269 | -------------------------------------------------------------------------------- /textfrontend/g2pw/__init__.py: -------------------------------------------------------------------------------- 1 | from .onnx_api import G2PWOnnxConverter 2 | -------------------------------------------------------------------------------- /textfrontend/g2pw/dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ 15 | Credits 16 | This code is modified from https://github.com/GitYCC/g2pW 17 | """ 18 | from typing import Dict 19 | from typing import List 20 | from typing import Tuple 21 | 22 | import numpy as np 23 | from .utils import tokenize_and_map 24 | 25 | ANCHOR_CHAR = '▁' 26 | 27 | 28 | def prepare_onnx_input(tokenizer, 29 | labels: List[str], 30 | char2phonemes: Dict[str, List[int]], 31 | chars: List[str], 32 | texts: List[str], 33 | query_ids: List[int], 34 | use_mask: bool=False, 35 | window_size: int=None, 36 | max_len: int=512) -> Dict[str, np.array]: 37 | if window_size is not None: 38 | truncated_texts, truncated_query_ids = _truncate_texts( 39 | window_size=window_size, texts=texts, query_ids=query_ids) 40 | input_ids = [] 41 | token_type_ids = [] 42 | attention_masks = [] 43 | phoneme_masks = [] 44 | char_ids = [] 45 | position_ids = [] 46 | 47 | for idx in range(len(texts)): 48 | text = (truncated_texts if window_size else texts)[idx].lower() 49 | query_id = (truncated_query_ids if window_size else query_ids)[idx] 50 | 51 | try: 52 | tokens, text2token, token2text = tokenize_and_map( 53 | tokenizer=tokenizer, text=text) 54 | except Exception: 55 | print(f'warning: text "{text}" is invalid') 56 | return {} 57 | 58 | text, query_id, tokens, text2token, token2text = _truncate( 59 | max_len=max_len, 60 | text=text, 61 | query_id=query_id, 62 | tokens=tokens, 63 | text2token=text2token, 64 | token2text=token2text) 65 | 66 | processed_tokens = ['[CLS]'] + tokens + ['[SEP]'] 67 | 68 | input_id = list( 69 | np.array(tokenizer.convert_tokens_to_ids(processed_tokens))) 70 | token_type_id = list(np.zeros((len(processed_tokens), ), dtype=int)) 71 | attention_mask = list(np.ones((len(processed_tokens), ), dtype=int)) 72 | 73 | query_char = text[query_id] 74 | phoneme_mask = [1 if i in char2phonemes[query_char] else 0 for i in range(len(labels))] \ 75 | if use_mask else [1] * len(labels) 76 | char_id = chars.index(query_char) 77 | position_id = text2token[ 78 | query_id] + 1 # [CLS] token locate at first place 79 | 80 | input_ids.append(input_id) 81 | token_type_ids.append(token_type_id) 82 | attention_masks.append(attention_mask) 83 | phoneme_masks.append(phoneme_mask) 84 | char_ids.append(char_id) 85 | position_ids.append(position_id) 86 | 87 | outputs = { 88 | 'input_ids': 
np.array(input_ids).astype(np.int64), 89 | 'token_type_ids': np.array(token_type_ids).astype(np.int64), 90 | 'attention_masks': np.array(attention_masks).astype(np.int64), 91 | 'phoneme_masks': np.array(phoneme_masks).astype(np.float32), 92 | 'char_ids': np.array(char_ids).astype(np.int64), 93 | 'position_ids': np.array(position_ids).astype(np.int64), 94 | } 95 | return outputs 96 | 97 | 98 | def _truncate_texts(window_size: int, texts: List[str], 99 | query_ids: List[int]) -> Tuple[List[str], List[int]]: 100 | truncated_texts = [] 101 | truncated_query_ids = [] 102 | for text, query_id in zip(texts, query_ids): 103 | start = max(0, query_id - window_size // 2) 104 | end = min(len(text), query_id + window_size // 2) 105 | truncated_text = text[start:end] 106 | truncated_texts.append(truncated_text) 107 | 108 | truncated_query_id = query_id - start 109 | truncated_query_ids.append(truncated_query_id) 110 | return truncated_texts, truncated_query_ids 111 | 112 | 113 | def _truncate(max_len: int, 114 | text: str, 115 | query_id: int, 116 | tokens: List[str], 117 | text2token: List[int], 118 | token2text: List[Tuple[int]]): 119 | truncate_len = max_len - 2 120 | if len(tokens) <= truncate_len: 121 | return (text, query_id, tokens, text2token, token2text) 122 | 123 | token_position = text2token[query_id] 124 | 125 | token_start = token_position - truncate_len // 2 126 | token_end = token_start + truncate_len 127 | font_exceed_dist = -token_start 128 | back_exceed_dist = token_end - len(tokens) 129 | if font_exceed_dist > 0: 130 | token_start += font_exceed_dist 131 | token_end += font_exceed_dist 132 | elif back_exceed_dist > 0: 133 | token_start -= back_exceed_dist 134 | token_end -= back_exceed_dist 135 | 136 | start = token2text[token_start][0] 137 | end = token2text[token_end - 1][1] 138 | 139 | return (text[start:end], query_id - start, tokens[token_start:token_end], [ 140 | i - token_start if i is not None else None 141 | for i in text2token[start:end] 142 | ], [(s - start, e - start) for s, e in token2text[token_start:token_end]]) 143 | 144 | 145 | def get_phoneme_labels(polyphonic_chars: List[List[str]] 146 | ) -> Tuple[List[str], Dict[str, List[int]]]: 147 | labels = sorted(list(set([phoneme for char, phoneme in polyphonic_chars]))) 148 | char2phonemes = {} 149 | for char, phoneme in polyphonic_chars: 150 | if char not in char2phonemes: 151 | char2phonemes[char] = [] 152 | char2phonemes[char].append(labels.index(phoneme)) 153 | return labels, char2phonemes 154 | 155 | 156 | def get_char_phoneme_labels(polyphonic_chars: List[List[str]] 157 | ) -> Tuple[List[str], Dict[str, List[int]]]: 158 | labels = sorted( 159 | list(set([f'{char} {phoneme}' for char, phoneme in polyphonic_chars]))) 160 | char2phonemes = {} 161 | for char, phoneme in polyphonic_chars: 162 | if char not in char2phonemes: 163 | char2phonemes[char] = [] 164 | char2phonemes[char].append(labels.index(f'{char} {phoneme}')) 165 | return labels, char2phonemes 166 | -------------------------------------------------------------------------------- /textfrontend/g2pw/onnx_api.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ 15 | Credits 16 | This code is modified from https://github.com/GitYCC/g2pW 17 | """ 18 | import json 19 | import os 20 | from typing import Any 21 | from typing import Dict 22 | from typing import List 23 | from typing import Tuple 24 | 25 | import numpy as np 26 | import onnxruntime 27 | from opencc import OpenCC 28 | from paddlenlp.transformers import BertTokenizer 29 | from pypinyin import pinyin 30 | from pypinyin import Style 31 | 32 | from paddlespeech.cli.utils import download_and_decompress 33 | from paddlespeech.resource.pretrained_models import g2pw_onnx_models 34 | from paddlespeech.t2s.frontend.g2pw.dataset import get_char_phoneme_labels 35 | from paddlespeech.t2s.frontend.g2pw.dataset import get_phoneme_labels 36 | from paddlespeech.t2s.frontend.g2pw.dataset import prepare_onnx_input 37 | from paddlespeech.t2s.frontend.g2pw.utils import load_config 38 | from paddlespeech.t2s.frontend.zh_normalization.char_convert import tranditional_to_simplified 39 | from paddlespeech.utils.env import MODEL_HOME 40 | 41 | model_version = '1.1' 42 | 43 | 44 | def predict(session, onnx_input: Dict[str, Any], 45 | labels: List[str]) -> Tuple[List[str], List[float]]: 46 | all_preds = [] 47 | all_confidences = [] 48 | probs = session.run([], { 49 | "input_ids": onnx_input['input_ids'], 50 | "token_type_ids": onnx_input['token_type_ids'], 51 | "attention_mask": onnx_input['attention_masks'], 52 | "phoneme_mask": onnx_input['phoneme_masks'], 53 | "char_ids": onnx_input['char_ids'], 54 | "position_ids": onnx_input['position_ids'] 55 | })[0] 56 | 57 | preds = np.argmax(probs, axis=1).tolist() 58 | max_probs = [] 59 | for index, arr in zip(preds, probs.tolist()): 60 | max_probs.append(arr[index]) 61 | all_preds += [labels[pred] for pred in preds] 62 | all_confidences += max_probs 63 | 64 | return all_preds, all_confidences 65 | 66 | 67 | class G2PWOnnxConverter: 68 | def __init__(self, 69 | model_dir: os.PathLike=MODEL_HOME, 70 | style: str='bopomofo', 71 | model_source: str=None, 72 | enable_non_tradional_chinese: bool=False): 73 | uncompress_path = download_and_decompress( 74 | g2pw_onnx_models['G2PWModel'][model_version], model_dir) 75 | 76 | sess_options = onnxruntime.SessionOptions() 77 | sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL 78 | sess_options.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL 79 | sess_options.intra_op_num_threads = 2 80 | self.session_g2pW = onnxruntime.InferenceSession( 81 | os.path.join(uncompress_path, 'g2pW.onnx'), 82 | sess_options=sess_options) 83 | self.config = load_config( 84 | config_path=os.path.join(uncompress_path, 'config.py'), 85 | use_default=True) 86 | 87 | self.model_source = model_source if model_source else self.config.model_source 88 | self.enable_opencc = enable_non_tradional_chinese 89 | 90 | self.tokenizer = BertTokenizer.from_pretrained(self.config.model_source) 91 | 92 | polyphonic_chars_path = os.path.join(uncompress_path, 93 | 'POLYPHONIC_CHARS.txt') 94 | monophonic_chars_path = os.path.join(uncompress_path, 95 | 'MONOPHONIC_CHARS.txt') 96 | 
self.polyphonic_chars = [ 97 | line.split('\t') 98 | for line in open(polyphonic_chars_path, encoding='utf-8').read() 99 | .strip().split('\n') 100 | ] 101 | self.non_polyphonic = { 102 | '一', '不', '和', '咋', '嗲', '剖', '差', '攢', '倒', '難', '奔', '勁', '拗', 103 | '肖', '瘙', '誒', '泊' 104 | } 105 | self.non_monophonic = {'似', '攢'} 106 | self.monophonic_chars = [ 107 | line.split('\t') 108 | for line in open(monophonic_chars_path, encoding='utf-8').read() 109 | .strip().split('\n') 110 | ] 111 | self.labels, self.char2phonemes = get_char_phoneme_labels( 112 | polyphonic_chars=self.polyphonic_chars 113 | ) if self.config.use_char_phoneme else get_phoneme_labels( 114 | polyphonic_chars=self.polyphonic_chars) 115 | 116 | self.chars = sorted(list(self.char2phonemes.keys())) 117 | 118 | self.polyphonic_chars_new = set(self.chars) 119 | for char in self.non_polyphonic: 120 | if char in self.polyphonic_chars_new: 121 | self.polyphonic_chars_new.remove(char) 122 | 123 | self.monophonic_chars_dict = { 124 | char: phoneme 125 | for char, phoneme in self.monophonic_chars 126 | } 127 | for char in self.non_monophonic: 128 | if char in self.monophonic_chars_dict: 129 | self.monophonic_chars_dict.pop(char) 130 | 131 | self.pos_tags = [ 132 | 'UNK', 'A', 'C', 'D', 'I', 'N', 'P', 'T', 'V', 'DE', 'SHI' 133 | ] 134 | 135 | with open( 136 | os.path.join(uncompress_path, 137 | 'bopomofo_to_pinyin_wo_tune_dict.json'), 138 | 'r', 139 | encoding='utf-8') as fr: 140 | self.bopomofo_convert_dict = json.load(fr) 141 | self.style_convert_func = { 142 | 'bopomofo': lambda x: x, 143 | 'pinyin': self._convert_bopomofo_to_pinyin, 144 | }[style] 145 | 146 | with open( 147 | os.path.join(uncompress_path, 'char_bopomofo_dict.json'), 148 | 'r', 149 | encoding='utf-8') as fr: 150 | self.char_bopomofo_dict = json.load(fr) 151 | 152 | if self.enable_opencc: 153 | self.cc = OpenCC('s2tw') 154 | 155 | def _convert_bopomofo_to_pinyin(self, bopomofo: str) -> str: 156 | tone = bopomofo[-1] 157 | assert tone in '12345' 158 | component = self.bopomofo_convert_dict.get(bopomofo[:-1]) 159 | if component: 160 | return component + tone 161 | else: 162 | print(f'Warning: "{bopomofo}" cannot convert to pinyin') 163 | return None 164 | 165 | def __call__(self, sentences: List[str]) -> List[List[str]]: 166 | if isinstance(sentences, str): 167 | sentences = [sentences] 168 | 169 | if self.enable_opencc: 170 | translated_sentences = [] 171 | for sent in sentences: 172 | translated_sent = self.cc.convert(sent) 173 | assert len(translated_sent) == len(sent) 174 | translated_sentences.append(translated_sent) 175 | sentences = translated_sentences 176 | 177 | texts, query_ids, sent_ids, partial_results = self._prepare_data( 178 | sentences=sentences) 179 | if len(texts) == 0: 180 | # sentences no polyphonic words 181 | return partial_results 182 | 183 | onnx_input = prepare_onnx_input( 184 | tokenizer=self.tokenizer, 185 | labels=self.labels, 186 | char2phonemes=self.char2phonemes, 187 | chars=self.chars, 188 | texts=texts, 189 | query_ids=query_ids, 190 | use_mask=self.config.use_mask, 191 | window_size=None) 192 | 193 | preds, confidences = predict( 194 | session=self.session_g2pW, 195 | onnx_input=onnx_input, 196 | labels=self.labels) 197 | if self.config.use_char_phoneme: 198 | preds = [pred.split(' ')[1] for pred in preds] 199 | 200 | results = partial_results 201 | for sent_id, query_id, pred in zip(sent_ids, query_ids, preds): 202 | results[sent_id][query_id] = self.style_convert_func(pred) 203 | 204 | return results 205 | 206 | def _prepare_data( 
207 | self, sentences: List[str] 208 | ) -> Tuple[List[str], List[int], List[int], List[List[str]]]: 209 | texts, query_ids, sent_ids, partial_results = [], [], [], [] 210 | for sent_id, sent in enumerate(sentences): 211 | # pypinyin works better on Simplified Chinese than on Traditional Chinese 212 | sent_s = tranditional_to_simplified(sent) 213 | pypinyin_result = pinyin(sent_s, style=Style.TONE3) 214 | partial_result = [None] * len(sent) 215 | for i, char in enumerate(sent): 216 | if char in self.polyphonic_chars_new: 217 | texts.append(sent) 218 | query_ids.append(i) 219 | sent_ids.append(sent_id) 220 | elif char in self.monophonic_chars_dict: 221 | partial_result[i] = self.style_convert_func( 222 | self.monophonic_chars_dict[char]) 223 | elif char in self.char_bopomofo_dict: 224 | partial_result[i] = pypinyin_result[i][0] 225 | # partial_result[i] = self.style_convert_func(self.char_bopomofo_dict[char][0]) 226 | else: 227 | partial_result[i] = pypinyin_result[i][0] 228 | 229 | partial_results.append(partial_result) 230 | return texts, query_ids, sent_ids, partial_results 231 | -------------------------------------------------------------------------------- /textfrontend/g2pw/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """ 15 | Credits 16 | This code is modified from https://github.com/GitYCC/g2pW 17 | """ 18 | import os 19 | import re 20 | 21 | 22 | def wordize_and_map(text: str): 23 | words = [] 24 | index_map_from_text_to_word = [] 25 | index_map_from_word_to_text = [] 26 | while len(text) > 0: 27 | match_space = re.match(r'^ +', text) 28 | if match_space: 29 | space_str = match_space.group(0) 30 | index_map_from_text_to_word += [None] * len(space_str) 31 | text = text[len(space_str):] 32 | continue 33 | 34 | match_en = re.match(r'^[a-zA-Z0-9]+', text) 35 | if match_en: 36 | en_word = match_en.group(0) 37 | 38 | word_start_pos = len(index_map_from_text_to_word) 39 | word_end_pos = word_start_pos + len(en_word) 40 | index_map_from_word_to_text.append((word_start_pos, word_end_pos)) 41 | 42 | index_map_from_text_to_word += [len(words)] * len(en_word) 43 | 44 | words.append(en_word) 45 | text = text[len(en_word):] 46 | else: 47 | word_start_pos = len(index_map_from_text_to_word) 48 | word_end_pos = word_start_pos + 1 49 | index_map_from_word_to_text.append((word_start_pos, word_end_pos)) 50 | 51 | index_map_from_text_to_word += [len(words)] 52 | 53 | words.append(text[0]) 54 | text = text[1:] 55 | return words, index_map_from_text_to_word, index_map_from_word_to_text 56 | 57 | 58 | def tokenize_and_map(tokenizer, text: str): 59 | words, text2word, word2text = wordize_and_map(text=text) 60 | 61 | tokens = [] 62 | index_map_from_token_to_text = [] 63 | for word, (word_start, word_end) in zip(words, word2text): 64 | word_tokens = tokenizer.tokenize(word) 65 | 66 | if len(word_tokens) == 0 or word_tokens == ['[UNK]']: 67 | index_map_from_token_to_text.append((word_start, word_end)) 68 | tokens.append('[UNK]') 69 | else: 70 | current_word_start = word_start 71 | for word_token in word_tokens: 72 | word_token_len = len(re.sub(r'^##', '', word_token)) 73 | index_map_from_token_to_text.append( 74 | (current_word_start, current_word_start + word_token_len)) 75 | current_word_start = current_word_start + word_token_len 76 | tokens.append(word_token) 77 | 78 | index_map_from_text_to_token = text2word 79 | for i, (token_start, token_end) in enumerate(index_map_from_token_to_text): 80 | for token_pos in range(token_start, token_end): 81 | index_map_from_text_to_token[token_pos] = i 82 | 83 | return tokens, index_map_from_text_to_token, index_map_from_token_to_text 84 | 85 | 86 | def _load_config(config_path: os.PathLike): 87 | import importlib.util 88 | spec = importlib.util.spec_from_file_location('__init__', config_path) 89 | config = importlib.util.module_from_spec(spec) 90 | spec.loader.exec_module(config) 91 | return config 92 | 93 | 94 | default_config_dict = { 95 | 'manual_seed': 1313, 96 | 'model_source': 'bert-base-chinese', 97 | 'window_size': 32, 98 | 'num_workers': 2, 99 | 'use_mask': True, 100 | 'use_char_phoneme': False, 101 | 'use_conditional': True, 102 | 'param_conditional': { 103 | 'affect_location': 'softmax', 104 | 'bias': True, 105 | 'char-linear': True, 106 | 'pos-linear': False, 107 | 'char+pos-second': True, 108 | 'char+pos-second_lowrank': False, 109 | 'lowrank_size': 0, 110 | 'char+pos-second_fm': False, 111 | 'fm_size': 0, 112 | 'fix_mode': None, 113 | 'count_json': 'train.count.json' 114 | }, 115 | 'lr': 5e-5, 116 | 'val_interval': 200, 117 | 'num_iter': 10000, 118 | 'use_focal': False, 119 | 'param_focal': { 120 | 'alpha': 0.0, 121 | 'gamma': 0.7 122 | }, 123 | 'use_pos': True, 124 | 'param_pos ': { 125 | 'weight': 0.1, 126 | 'pos_joint_training': True, 127 | 'train_pos_path': 
'train.pos', 128 | 'valid_pos_path': 'dev.pos', 129 | 'test_pos_path': 'test.pos' 130 | } 131 | } 132 | 133 | 134 | def load_config(config_path: os.PathLike, use_default: bool=False): 135 | config = _load_config(config_path) 136 | if use_default: 137 | for attr, val in default_config_dict.items(): 138 | if not hasattr(config, attr): 139 | setattr(config, attr, val) 140 | elif isinstance(val, dict): 141 | d = getattr(config, attr) 142 | for dict_k, dict_v in val.items(): 143 | if dict_k not in d: 144 | d[dict_k] = dict_v 145 | return config 146 | -------------------------------------------------------------------------------- /textfrontend/generate_lexicon.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # Design principles: https://zhuanlan.zhihu.com/p/349600439 15 | """Generate lexicon and symbols for Mandarin Chinese phonology. 16 | The lexicon is used for the Montreal Forced Aligner. 17 | Note that syllables are used as words in this lexicon, since syllables rather 18 | than words are used in the transcriptions produced by `reorganize_baker.py`. 19 | We make this choice to better leverage existing Chinese text-to-pinyin 20 | tools such as pypinyin. This is the convention for G2P in Chinese. 21 | """ 22 | import re 23 | from collections import OrderedDict 24 | 25 | INITIALS = [ 26 | 'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h', 'zh', 'ch', 'sh', 27 | 'r', 'z', 'c', 's', 'j', 'q', 'x' 28 | ] 29 | 30 | FINALS = [ 31 | 'a', 'ai', 'ao', 'an', 'ang', 'e', 'er', 'ei', 'en', 'eng', 'o', 'ou', 32 | 'ong', 'ii', 'iii', 'i', 'ia', 'iao', 'ian', 'iang', 'ie', 'io', 'iou', 33 | 'iong', 'in', 'ing', 'u', 'ua', 'uai', 'uan', 'uang', 'uei', 'uo', 'uen', 34 | 'ueng', 'v', 've', 'van', 'vn' 35 | ] 36 | 37 | SPECIALS = ['sil', 'sp'] 38 | 39 | 40 | def rule(C, V, R, T): 41 | """Generate a syllable given the initial, the final, the erhua indicator, and the tone. 42 | Orthographical rules for pinyin are applied (special cases for y, w, ui, un, iu). 43 | 44 | Note that in this system, 'ü' is always written as 'v' when it appears in a phoneme, but converted to 45 | 'u' in syllables when certain conditions are satisfied. 46 | 47 | 'i' is distinguished when it appears in phonemes, and is separated into 3 categories: 'i', 'ii' and 'iii'. 48 | Erhua can be applied to every final, except finals that already end with 'r'. 49 | When a syllable is impossible or does not have any characters with this pronunciation, return None 50 | to filter it out. 
51 | """ 52 | 53 | # impossible syllables: 'ii' can only be combined with z, c, s 54 | if V in ["ii"] and (C not in ['z', 'c', 's']): 55 | return None 56 | # 'iii' can only be combined with zh, ch, sh, r 57 | if V in ['iii'] and (C not in ['zh', 'ch', 'sh', 'r']): 58 | return None 59 | 60 | # finals starting with 'i' or 'v' (ü) cannot be combined with f, g, k, h, zh, ch, sh, r, z, c, s 61 | if (V not in ['ii', 'iii']) and V[0] in ['i', 'v'] and ( 62 | C in ['f', 'g', 'k', 'h', 'zh', 'ch', 'sh', 'r', 'z', 'c', 's']): 63 | return None 64 | 65 | # finals starting with 'v' (ü) can only be combined with j, q, x, l, n 66 | if V.startswith("v"): 67 | # v, ve can only be combined with j, q, x, n, l 68 | if V in ['v', 've']: 69 | if C not in ['j', 'q', 'x', 'n', 'l', '']: 70 | return None 71 | # the other v-finals can only be combined with j, q, x 72 | else: 73 | if C not in ['j', 'q', 'x', '']: 74 | return None 75 | 76 | # j, q, x can only be combined with finals starting with 'i' or 'v' (ü) 77 | if (C in ['j', 'q', 'x']) and not ( 78 | (V not in ['ii', 'iii']) and V[0] in ['i', 'v']): 79 | return None 80 | 81 | # b, p, m, f cannot be combined with finals starting with 'u' (except 'u' itself) 82 | # b, p, m, f cannot be combined with finals starting with 'v' (ü) 83 | if (C in ['b', 'p', 'm', 'f']) and ((V[0] in ['u', 'v'] and V != "u") or 84 | V == 'ong'): 85 | return None 86 | 87 | # ua, uai, uang cannot be combined with d, t, n, l, r, z, c, s 88 | if V in ['ua', 'uai', 89 | 'uang'] and C in ['d', 't', 'n', 'l', 'r', 'z', 'c', 's']: 90 | return None 91 | 92 | # sh cannot be combined with ong 93 | if V == 'ong' and C in ['sh']: 94 | return None 95 | 96 | # o cannot be combined with d, t, n, g, k, h, zh, ch, sh, r, z, c, s 97 | if V == "o" and C in [ 98 | 'd', 't', 'n', 'g', 'k', 'h', 'zh', 'ch', 'sh', 'r', 'z', 'c', 's' 99 | ]: 100 | return None 101 | 102 | # 'ueng' only exists in the standalone syllable 'weng'; after an initial it is 'ong' 103 | if V == 'ueng' and C != '': 104 | return 105 | 106 | # non-erhua 'er' can only stand alone 107 | if V == 'er' and C != '': 108 | return None 109 | 110 | if C == '': 111 | if V in ["i", "in", "ing"]: 112 | C = 'y' 113 | elif V == 'u': 114 | C = 'w' 115 | elif V.startswith('i') and V not in ["ii", "iii"]: 116 | C = 'y' 117 | V = V[1:] 118 | elif V.startswith('u'): 119 | C = 'w' 120 | V = V[1:] 121 | elif V.startswith('v'): 122 | C = 'yu' 123 | V = V[1:] 124 | else: 125 | if C in ['j', 'q', 'x']: 126 | if V.startswith('v'): 127 | V = re.sub('v', 'u', V) 128 | if V == 'iou': 129 | V = 'iu' 130 | elif V == 'uei': 131 | V = 'ui' 132 | elif V == 'uen': 133 | V = 'un' 134 | result = C + V 135 | 136 | # a syllable that already ends with 'r' cannot take erhua again 137 | if result.endswith('r') and R == 'r': 138 | return None 139 | 140 | # ii and iii, change back to i 141 | result = re.sub(r'i+', 'i', result) 142 | 143 | result = result + R + T 144 | return result 145 | 146 | 147 | def generate_lexicon(with_tone=False, with_erhua=False): 148 | """Generate lexicon for Mandarin Chinese.""" 149 | syllables = OrderedDict() 150 | 151 | for C in [''] + INITIALS: 152 | for V in FINALS: 153 | for R in [''] if not with_erhua else ['', 'r']: 154 | for T in [''] if not with_tone else ['1', '2', '3', '4', '5']: 155 | result = rule(C, V, R, T) 156 | if result: 157 | syllables[result] = f'{C} {V}{R}{T}' 158 | return syllables 159 | -------------------------------------------------------------------------------- /textfrontend/mix_frontend.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from typing import Dict 15 | from typing import List 16 | 17 | import paddle 18 | 19 | from paddlespeech.t2s.frontend import English 20 | from paddlespeech.t2s.frontend.zh_frontend import Frontend 21 | 22 | 23 | class MixFrontend(): 24 | def __init__(self, 25 | g2p_model="pypinyin", 26 | phone_vocab_path=None, 27 | tone_vocab_path=None): 28 | 29 | self.zh_frontend = Frontend( 30 | phone_vocab_path=phone_vocab_path, tone_vocab_path=tone_vocab_path) 31 | self.en_frontend = English(phone_vocab_path=phone_vocab_path) 32 | self.sp_id = self.zh_frontend.vocab_phones["sp"] 33 | self.sp_id_tensor = paddle.to_tensor([self.sp_id]) 34 | 35 | def is_chinese(self, char): 36 | if char >= '\u4e00' and char <= '\u9fa5': 37 | return True 38 | else: 39 | return False 40 | 41 | def is_alphabet(self, char): 42 | if (char >= '\u0041' and char <= '\u005a') or (char >= '\u0061' and 43 | char <= '\u007a'): 44 | return True 45 | else: 46 | return False 47 | 48 | def is_other(self, char): 49 | if not (self.is_chinese(char) or self.is_alphabet(char)): 50 | return True 51 | else: 52 | return False 53 | 54 | def get_segment(self, text: str) -> List[str]: 55 | # sentence --> [ch_part, en_part, ch_part, ...] 56 | segments = [] 57 | types = [] 58 | flag = 0 59 | temp_seg = "" 60 | temp_lang = "" 61 | 62 | # Determine the type of each character. type: blank, chinese, alphabet, number, unk and point. 
63 | for ch in text: 64 | if self.is_chinese(ch): 65 | types.append("zh") 66 | elif self.is_alphabet(ch): 67 | types.append("en") 68 | else: 69 | types.append("other") 70 | 71 | assert len(types) == len(text) 72 | 73 | for i in range(len(types)): 74 | # find the first char of the seg 75 | if flag == 0: 76 | temp_seg += text[i] 77 | temp_lang = types[i] 78 | flag = 1 79 | 80 | else: 81 | if temp_lang == "other": 82 | if types[i] == temp_lang: 83 | temp_seg += text[i] 84 | else: 85 | temp_seg += text[i] 86 | temp_lang = types[i] 87 | 88 | else: 89 | if types[i] == temp_lang: 90 | temp_seg += text[i] 91 | elif types[i] == "other": 92 | temp_seg += text[i] 93 | else: 94 | segments.append((temp_seg, temp_lang)) 95 | temp_seg = text[i] 96 | temp_lang = types[i] 97 | flag = 1 98 | 99 | segments.append((temp_seg, temp_lang)) 100 | 101 | return segments 102 | 103 | def get_input_ids(self, 104 | sentence: str, 105 | merge_sentences: bool=False, 106 | get_tone_ids: bool=False, 107 | add_sp: bool=True, 108 | to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]: 109 | 110 | segments = self.get_segment(sentence) 111 | 112 | phones_list = [] 113 | result = {} 114 | 115 | for seg in segments: 116 | content = seg[0] 117 | lang = seg[1] 118 | if content != '': 119 | if lang == "en": 120 | input_ids = self.en_frontend.get_input_ids( 121 | content, merge_sentences=False, to_tensor=to_tensor) 122 | else: 123 | input_ids = self.zh_frontend.get_input_ids( 124 | content, 125 | merge_sentences=False, 126 | get_tone_ids=get_tone_ids, 127 | to_tensor=to_tensor) 128 | if add_sp: 129 | input_ids["phone_ids"][-1] = paddle.concat( 130 | [input_ids["phone_ids"][-1], self.sp_id_tensor]) 131 | 132 | for phones in input_ids["phone_ids"]: 133 | phones_list.append(phones) 134 | 135 | if merge_sentences: 136 | merge_list = paddle.concat(phones_list) 137 | # rm the last 'sp' to avoid the noise at the end 138 | # cause in the training data, no 'sp' in the end 139 | if merge_list[-1] == self.sp_id_tensor: 140 | merge_list = merge_list[:-1] 141 | phones_list = [] 142 | phones_list.append(merge_list) 143 | 144 | result["phone_ids"] = phones_list 145 | 146 | return result 147 | -------------------------------------------------------------------------------- /textfrontend/normalizer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from .normalizer import * 15 | from .numbers import * 16 | -------------------------------------------------------------------------------- /textfrontend/normalizer/abbrrviation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /textfrontend/normalizer/acronyms.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /textfrontend/normalizer/normalizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import re 15 | import unicodedata 16 | from builtins import str as unicode 17 | 18 | from .numbers import normalize_numbers 19 | 20 | 21 | def normalize(sentence): 22 | """ Normalize English text. 23 | """ 24 | # preprocessing 25 | sentence = unicode(sentence) 26 | sentence = normalize_numbers(sentence) 27 | sentence = ''.join( 28 | char for char in unicodedata.normalize('NFD', sentence) 29 | if unicodedata.category(char) != 'Mn') # Strip accents 30 | sentence = sentence.lower() 31 | sentence = re.sub(r"[^ a-z'.,?!\-]", "", sentence) 32 | sentence = sentence.replace("i.e.", "that is") 33 | sentence = sentence.replace("e.g.", "for example") 34 | return sentence 35 | -------------------------------------------------------------------------------- /textfrontend/normalizer/numbers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # number expansion is not that easy 15 | import re 16 | 17 | import inflect 18 | 19 | _inflect = inflect.engine() 20 | _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') 21 | _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') 22 | _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') 23 | _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') 24 | _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') 25 | _number_re = re.compile(r'[0-9]+') 26 | 27 | 28 | def _remove_commas(m): 29 | return m.group(1).replace(',', '') 30 | 31 | 32 | def _expand_decimal_point(m): 33 | return m.group(1).replace('.', ' point ') 34 | 35 | 36 | def _expand_dollars(m): 37 | match = m.group(1) 38 | parts = match.split('.') 39 | if len(parts) > 2: 40 | return match + ' dollars' # Unexpected format 41 | dollars = int(parts[0]) if parts[0] else 0 42 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 43 | if dollars and cents: 44 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 45 | cent_unit = 'cent' if cents == 1 else 'cents' 46 | return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) 47 | elif dollars: 48 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 49 | return '%s %s' % (dollars, dollar_unit) 50 | elif cents: 51 | cent_unit = 'cent' if cents == 1 else 'cents' 52 | return '%s %s' % (cents, cent_unit) 53 | else: 54 | return 'zero dollars' 55 | 56 | 57 | def _expand_ordinal(m): 58 | return _inflect.number_to_words(m.group(0)) 59 | 60 | 61 | def _expand_number(m): 62 | num = int(m.group(0)) 63 | if num > 1000 and num < 3000: 64 | if num == 2000: 65 | return 'two thousand' 66 | elif num > 2000 and num < 2010: 67 | return 'two thousand ' + _inflect.number_to_words(num % 100) 68 | elif num % 100 == 0: 69 | return _inflect.number_to_words(num // 100) + ' hundred' 70 | else: 71 | return _inflect.number_to_words( 72 | num, andword='', zero='oh', group=2).replace(', ', ' ') 73 | else: 74 | return _inflect.number_to_words(num, andword='') 75 | 76 | 77 | def normalize_numbers(text): 78 | """ Normalize numbers in English text. 79 | """ 80 | text = re.sub(_comma_number_re, _remove_commas, text) 81 | text = re.sub(_pounds_re, r'\1 pounds', text) 82 | text = re.sub(_dollars_re, _expand_dollars, text) 83 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 84 | text = re.sub(_ordinal_re, _expand_ordinal, text) 85 | text = re.sub(_number_re, _expand_number, text) 86 | return text 87 | -------------------------------------------------------------------------------- /textfrontend/normalizer/width.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | def full2half_width(ustr): 17 | half = [] 18 | for u in ustr: 19 | num = ord(u) 20 | if num == 0x3000: # full-width space to half-width 21 | num = 32 22 | elif 0xFF01 <= num <= 0xFF5E: 23 | num -= 0xfee0 24 | u = chr(num) 25 | half.append(u) 26 | return ''.join(half) 27 | 28 | 29 | def half2full_width(ustr): 30 | full = [] 31 | for u in ustr: 32 | num = ord(u) 33 | if num == 32: # half-width space to full-width 34 | num = 0x3000 35 | elif 0x21 <= num <= 0x7E: 36 | num += 0xfee0 37 | u = chr(num) # to unicode 38 | full.append(u) 39 | 40 | return ''.join(full) 41 | -------------------------------------------------------------------------------- /textfrontend/phonectic.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from abc import ABC 15 | from abc import abstractmethod 16 | from typing import List 17 | 18 | import numpy as np 19 | from torch import Tensor 20 | import torch 21 | from g2p_en import G2p 22 | from g2pM import G2pM 23 | 24 | from textfrontend.normalizer.normalizer import normalize 25 | from textfrontend.punctuation import get_punctuations 26 | from textfrontend.vocab import Vocab 27 | from textfrontend.zh_normalization.text_normlization import TextNormalizer 28 | 29 | # discard opencc until we find an easy way to install it on Windows 30 | # from opencc import OpenCC 31 | 32 | __all__ = ["Phonetics", "English", "EnglishCharacter", "Chinese"] 33 | 34 | 35 | class Phonetics(ABC): 36 | @abstractmethod 37 | def __call__(self, sentence): 38 | pass 39 | 40 | @abstractmethod 41 | def phoneticize(self, sentence): 42 | pass 43 | 44 | @abstractmethod 45 | def numericalize(self, phonemes): 46 | pass 47 | 48 | 49 | class English(Phonetics): 50 | """ Normalize the input text sequence and convert into pronunciation id sequence. 51 | """ 52 | 53 | def __init__(self, phone_vocab_path=None): 54 | self.backend = G2p() 55 | self.phonemes = list(self.backend.phonemes) 56 | self.punctuations = get_punctuations("en") 57 | self.vocab = Vocab(self.phonemes + self.punctuations) 58 | self.vocab_phones = {} 59 | self.punc = ":,;。?!“”‘’':,;.?!" 
60 | self.text_normalizer = TextNormalizer() 61 | if phone_vocab_path: 62 | with open(phone_vocab_path, 'rt') as f: 63 | phn_id = [line.strip().split() for line in f.readlines()] 64 | for phn, id in phn_id: 65 | self.vocab_phones[phn] = int(id) 66 | 67 | def phoneticize(self, sentence): 68 | """ Normalize the input text sequence and convert it into pronunciation sequence. 69 | Args: 70 | sentence (str): The input text sequence. 71 | Returns: 72 | List[str]: The list of pronunciation sequence. 73 | """ 74 | start = self.vocab.start_symbol 75 | end = self.vocab.end_symbol 76 | phonemes = ([] if start is None else [start]) \ 77 | + self.backend(sentence) \ 78 | + ([] if end is None else [end]) 79 | phonemes = [item for item in phonemes if item in self.vocab.stoi] 80 | return phonemes 81 | 82 | def _p2id(self, phonemes: List[str]) -> np.array: 83 | phone_ids = [self.vocab_phones[item] for item in phonemes] 84 | return np.array(phone_ids, np.int64) 85 | 86 | def get_input_ids(self, 87 | sentence: str, 88 | merge_sentences: bool=False, 89 | to_tensor: bool=True) -> Tensor: 90 | result = {} 91 | sentences = self.text_normalizer._split(sentence, lang="en") 92 | phones_list = [] 93 | temp_phone_ids = [] 94 | for sentence in sentences: 95 | phones = self.phoneticize(sentence) 96 | # remove start_symbol and end_symbol 97 | phones = phones[1:-1] 98 | phones = [phn for phn in phones if not phn.isspace()] 99 | # replace unk phone with sp 100 | phones = [ 101 | phn 102 | if (phn in self.vocab_phones and phn not in self.punc) else "sp" 103 | for phn in phones 104 | ] 105 | if len(phones) != 0: 106 | phones_list.append(phones) 107 | 108 | if merge_sentences: 109 | merge_list = sum(phones_list, []) 110 | # rm the last 'sp' to avoid the noise at the end 111 | # cause in the training data, no 'sp' in the end 112 | if merge_list[-1] == 'sp': 113 | merge_list = merge_list[:-1] 114 | phones_list = [] 115 | phones_list.append(merge_list) 116 | 117 | for part_phones_list in phones_list: 118 | phone_ids = self._p2id(part_phones_list) 119 | if to_tensor: 120 | phone_ids = torch.as_tensor(phone_ids) 121 | temp_phone_ids.append(phone_ids) 122 | result["phone_ids"] = temp_phone_ids 123 | return result 124 | 125 | def numericalize(self, phonemes): 126 | """ Convert pronunciation sequence into pronunciation id sequence. 127 | Args: 128 | phonemes (List[str]): The list of pronunciation sequence. 129 | Returns: 130 | List[int]: The list of pronunciation id sequence. 131 | """ 132 | ids = [ 133 | self.vocab.lookup(item) for item in phonemes 134 | if item in self.vocab.stoi 135 | ] 136 | return ids 137 | 138 | def reverse(self, ids): 139 | """ Reverse the list of pronunciation id sequence to a list of pronunciation sequence. 140 | Args: 141 | ids (List[int]): The list of pronunciation id sequence. 142 | Returns: 143 | List[str]: The list of pronunciation sequence. 144 | """ 145 | return [self.vocab.reverse(i) for i in ids] 146 | 147 | def __call__(self, sentence): 148 | """ Convert the input text sequence into pronunciation id sequence. 149 | Args: 150 | sentence(str): The input text sequence. 151 | Returns: 152 | List[str]: The list of pronunciation id sequence. 153 | """ 154 | return self.numericalize(self.phoneticize(sentence)) 155 | 156 | @property 157 | def vocab_size(self): 158 | """ Vocab size. 159 | """ 160 | return len(self.vocab) 161 | 162 | 163 | class EnglishCharacter(Phonetics): 164 | """ Normalize the input text sequence and convert it into character id sequence. 
165 | """ 166 | 167 | def __init__(self): 168 | self.backend = G2p() 169 | self.graphemes = list(self.backend.graphemes) 170 | self.punctuations = get_punctuations("en") 171 | self.vocab = Vocab(self.graphemes + self.punctuations) 172 | 173 | def phoneticize(self, sentence): 174 | """ Normalize the input text sequence. 175 | Args: 176 | sentence(str): The input text sequence. 177 | Returns: 178 | str: A text sequence after normalize. 179 | """ 180 | words = normalize(sentence) 181 | return words 182 | 183 | def numericalize(self, sentence): 184 | """ Convert a text sequence into ids. 185 | Args: 186 | sentence (str): The input text sequence. 187 | Returns: 188 | List[int]: 189 | List of a character id sequence. 190 | """ 191 | ids = [ 192 | self.vocab.lookup(item) for item in sentence 193 | if item in self.vocab.stoi 194 | ] 195 | return ids 196 | 197 | def reverse(self, ids): 198 | """ Convert a character id sequence into text. 199 | Args: 200 | ids (List[int]): List of a character id sequence. 201 | Returns: 202 | str: The input text sequence. 203 | """ 204 | return [self.vocab.reverse(i) for i in ids] 205 | 206 | def __call__(self, sentence): 207 | """ Normalize the input text sequence and convert it into character id sequence. 208 | Args: 209 | sentence (str): The input text sequence. 210 | Returns: 211 | List[int]: List of a character id sequence. 212 | """ 213 | return self.numericalize(self.phoneticize(sentence)) 214 | 215 | @property 216 | def vocab_size(self): 217 | """ Vocab size. 218 | """ 219 | return len(self.vocab) 220 | 221 | 222 | class Chinese(Phonetics): 223 | """Normalize Chinese text sequence and convert it into ids. 224 | """ 225 | 226 | def __init__(self): 227 | # self.opencc_backend = OpenCC('t2s.json') 228 | self.backend = G2pM() 229 | self.phonemes = self._get_all_syllables() 230 | self.punctuations = get_punctuations("cn") 231 | self.vocab = Vocab(self.phonemes + self.punctuations) 232 | 233 | def _get_all_syllables(self): 234 | all_syllables = set([ 235 | syllable for k, v in self.backend.cedict.items() for syllable in v 236 | ]) 237 | return list(all_syllables) 238 | 239 | def phoneticize(self, sentence): 240 | """ Normalize the input text sequence and convert it into pronunciation sequence. 241 | Args: 242 | sentence(str): The input text sequence. 243 | Returns: 244 | List[str]: The list of pronunciation sequence. 245 | """ 246 | # simplified = self.opencc_backend.convert(sentence) 247 | simplified = sentence 248 | phonemes = self.backend(simplified) 249 | start = self.vocab.start_symbol 250 | end = self.vocab.end_symbol 251 | phonemes = ([] if start is None else [start]) \ 252 | + phonemes \ 253 | + ([] if end is None else [end]) 254 | return self._filter_symbols(phonemes) 255 | 256 | def _filter_symbols(self, phonemes): 257 | cleaned_phonemes = [] 258 | for item in phonemes: 259 | if item in self.vocab.stoi: 260 | cleaned_phonemes.append(item) 261 | else: 262 | for char in item: 263 | if char in self.vocab.stoi: 264 | cleaned_phonemes.append(char) 265 | return cleaned_phonemes 266 | 267 | def numericalize(self, phonemes): 268 | """ Convert pronunciation sequence into pronunciation id sequence. 269 | Args: 270 | phonemes(List[str]): The list of pronunciation sequence. 271 | Returns: 272 | List[int]: The list of pronunciation id sequence. 273 | """ 274 | ids = [self.vocab.lookup(item) for item in phonemes] 275 | return ids 276 | 277 | def __call__(self, sentence): 278 | """ Convert the input text sequence into pronunciation id sequence. 
279 | Args: 280 | sentence (str): The input text sequence. 281 | Returns: 282 | List[str]: The list of pronunciation id sequence. 283 | """ 284 | return self.numericalize(self.phoneticize(sentence)) 285 | 286 | @property 287 | def vocab_size(self): 288 | """ Vocab size. 289 | """ 290 | return len(self.vocab) 291 | 292 | def reverse(self, ids): 293 | """ Reverse the list of pronunciation id sequence to a list of pronunciation sequence. 294 | Args: 295 | ids (List[int]): The list of pronunciation id sequence. 296 | Returns: 297 | List[str]: The list of pronunciation sequence. 298 | """ 299 | return [self.vocab.reverse(i) for i in ids] 300 | -------------------------------------------------------------------------------- /textfrontend/polyphonic.yaml: -------------------------------------------------------------------------------- 1 | polyphonic: 2 | 湖泊: ['hu2','po1'] 3 | 地壳: ['di4','qiao4'] 4 | 柏树: ['bai3','shu4'] 5 | 曝光: ['bao4','guang1'] 6 | 弹力: ['tan2','li4'] 7 | 字帖: ['zi4','tie4'] 8 | 口吃: ['kou3','chi1'] 9 | 包扎: ['bao1','za1'] 10 | 哪吒: ['ne2','zha1'] 11 | 说服: ['shuo1','fu2'] 12 | 识字: ['shi2','zi4'] 13 | 骨头: ['gu3','tou5'] 14 | 对称: ['dui4','chen4'] 15 | 口供: ['kou3','gong4'] 16 | 抹布: ['ma1','bu4'] 17 | 露背: ['lu4','bei4'] 18 | 圈养: ['juan4', 'yang3'] 19 | 眼眶: ['yan3', 'kuang4'] 20 | 品行: ['pin3','xing2'] 21 | 颤抖: ['chan4','dou3'] 22 | 差不多: ['cha4','bu5','duo1'] 23 | 鸭绿江: ['ya1','lu4','jiang1'] 24 | 撒切尔: ['sa4','qie4','er3'] 25 | 比比皆是: ['bi3','bi3','jie1','shi4'] 26 | 身无长物: ['shen1','wu2','chang2','wu4'] 27 | 手里: ['shou2','li3'] 28 | 关卡: ['guan1','qia3'] 29 | 怀揣: ['huai2','chuai1'] 30 | 挑剔: ['tiao1','ti4'] 31 | 供称: ['gong4','cheng1'] 32 | 作坊: ['zuo1', 'fang5'] 33 | 中医: ['zhong1','yi1'] 34 | 嚷嚷: ['rang1','rang5'] 35 | 商厦: ['shang1','sha4'] 36 | 大厦: ['da4','sha4'] 37 | 刹车: ['sha1','che1'] 38 | 嘚瑟: ['de4','se5'] 39 | 朝鲜: ['chao2','xian3'] 40 | 阿房宫: ['e1','pang2','gong1'] 41 | 阿胶: ['e1','jiao1'] 42 | 咖喱: ['ga1','li5'] 43 | 时分: ['shi2','fen1'] 44 | 蚌埠: ['beng4','bu4'] 45 | 驯服: ['xun4','fu2'] 46 | 幸免于难: ['xing4','mian3','yu2','nan4'] 47 | 恶行: ['e4','xing2'] 48 | 唉: ['ai4'] 49 | 扎实: ['zha1','shi2'] 50 | 干将: ['gan4','jiang4'] -------------------------------------------------------------------------------- /textfrontend/punctuation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | __all__ = ["get_punctuations"] 16 | 17 | EN_PUNCT = [ 18 | " ", 19 | "-", 20 | "...", 21 | ",", 22 | ".", 23 | "?", 24 | "!", 25 | ] 26 | 27 | CN_PUNCT = ["、", ",", ";", ":", "。", "?", "!"] 28 | 29 | 30 | def get_punctuations(lang): 31 | if lang == "en": 32 | return EN_PUNCT 33 | elif lang == "cn": 34 | return CN_PUNCT 35 | else: 36 | raise ValueError(f"language {lang} Not supported") 37 | -------------------------------------------------------------------------------- /textfrontend/ssml/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from .xml_processor import * 15 | -------------------------------------------------------------------------------- /textfrontend/ssml/xml_processor.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | import xml.dom.minidom 4 | import xml.parsers.expat 5 | from xml.dom.minidom import Node 6 | from xml.dom.minidom import parseString 7 | ''' 8 | Note: xml 有5种特殊字符, &<>"' 9 | 其一,采用特殊标签,将包含特殊字符的字符串封装起来。 10 | 例如: 11 | 12 | 其二,使用XML转义序列表示这些特殊的字符,这5个特殊字符所对应XML转义序列为: 13 | & & 14 | < < 15 | > > 16 | " " 17 | ' ' 18 | 例如: 19 | "姓名" 20 | 21 | ''' 22 | 23 | 24 | class MixTextProcessor(): 25 | def __repr__(self): 26 | print("@an MixTextProcessor class") 27 | 28 | def get_xml_content(self, mixstr): 29 | '''返回字符串的 xml 内容''' 30 | xmlptn = re.compile(r".*?", re.M | re.S) 31 | ctn = re.search(xmlptn, mixstr) 32 | if ctn: 33 | return ctn.group(0) 34 | else: 35 | return None 36 | 37 | def get_content_split(self, mixstr): 38 | ''' 文本分解,顺序加了列表中,按非 xml 和 xml 分开,对应的字符串,带标点符号 39 | 不能去除空格,因为 xml 中tag 属性带空格 40 | ''' 41 | ctlist = [] 42 | # print("Testing:",mixstr[:20]) 43 | patn = re.compile(r'(.*\s*?)(.*?)(.*\s*)$', re.M | re.S) 44 | mat = re.match(patn, mixstr) 45 | if mat: 46 | pre_xml = mat.group(1) 47 | in_xml = mat.group(2) 48 | after_xml = mat.group(3) 49 | 50 | ctlist.append(pre_xml) 51 | ctlist.append(in_xml) 52 | ctlist.append(after_xml) 53 | return ctlist 54 | else: 55 | ctlist.append(mixstr) 56 | return ctlist 57 | 58 | @classmethod 59 | def get_pinyin_split(self, mixstr): 60 | ctlist = [] 61 | patn = re.compile(r'(.*\s*?)(.*?)(.*\s*)$', re.M | re.S) 62 | mat = re.match(patn, mixstr) 63 | if mat: 64 | pre_xml = mat.group(1) 65 | in_xml = mat.group(2) 66 | after_xml = mat.group(3) 67 | 68 | ctlist.append([pre_xml, []]) 69 | dom = DomXml(in_xml) 70 | pinyinlist = dom.get_pinyins_for_xml() 71 | ctlist = ctlist + pinyinlist 72 | ctlist.append([after_xml, []]) 73 | else: 74 | ctlist.append([mixstr, []]) 75 | return ctlist 76 | 77 | 78 | class DomXml(): 79 | def __init__(self, xmlstr): 80 | self.tdom = parseString(xmlstr) #Document 81 | self.root = self.tdom.documentElement #Element 82 | self.rnode = self.tdom.childNodes #NodeList 83 | 84 | def get_text(self): 85 | '''返回 xml 内容的所有文本内容的列表''' 86 | 
res = [] 87 | 88 | for x1 in self.rnode: 89 | if x1.nodeType == Node.TEXT_NODE: 90 | res.append(x1.value) 91 | else: 92 | for x2 in x1.childNodes: 93 | if isinstance(x2, xml.dom.minidom.Text): 94 | res.append(x2.data) 95 | else: 96 | for x3 in x2.childNodes: 97 | if isinstance(x3, xml.dom.minidom.Text): 98 | res.append(x3.data) 99 | else: 100 | print("len(nodes of x3):", len(x3.childNodes)) 101 | 102 | return res 103 | 104 | def get_xmlchild_list(self): 105 | '''返回 xml 内容的列表,包括所有文本内容(不带 tag)''' 106 | res = [] 107 | 108 | for x1 in self.rnode: 109 | if x1.nodeType == Node.TEXT_NODE: 110 | res.append(x1.value) 111 | else: 112 | for x2 in x1.childNodes: 113 | if isinstance(x2, xml.dom.minidom.Text): 114 | res.append(x2.data) 115 | else: 116 | for x3 in x2.childNodes: 117 | if isinstance(x3, xml.dom.minidom.Text): 118 | res.append(x3.data) 119 | else: 120 | print("len(nodes of x3):", len(x3.childNodes)) 121 | print(res) 122 | return res 123 | 124 | def get_pinyins_for_xml(self): 125 | '''返回 xml 内容,字符串和拼音的 list ''' 126 | res = [] 127 | 128 | for x1 in self.rnode: 129 | if x1.nodeType == Node.TEXT_NODE: 130 | t = re.sub(r"\s+", "", x1.value) 131 | res.append([t, []]) 132 | else: 133 | for x2 in x1.childNodes: 134 | if isinstance(x2, xml.dom.minidom.Text): 135 | t = re.sub(r"\s+", "", x2.data) 136 | res.append([t, []]) 137 | else: 138 | # print("x2",x2,x2.tagName) 139 | if x2.hasAttribute('pinyin'): 140 | pinyin_value = x2.getAttribute("pinyin") 141 | pinyins = pinyin_value.split(" ") 142 | for x3 in x2.childNodes: 143 | # print('x3',x3) 144 | if isinstance(x3, xml.dom.minidom.Text): 145 | t = re.sub(r"\s+", "", x3.data) 146 | res.append([t, pinyins]) 147 | else: 148 | print("len(nodes of x3):", len(x3.childNodes)) 149 | 150 | return res 151 | 152 | def get_all_tags(self, tag_name): 153 | '''获取所有的 tag 及属性值''' 154 | alltags = self.root.getElementsByTagName(tag_name) 155 | for x in alltags: 156 | if x.hasAttribute('pinyin'): # pinyin 157 | print(x.tagName, 'pinyin', 158 | x.getAttribute('pinyin'), x.firstChild.data) 159 | -------------------------------------------------------------------------------- /textfrontend/tone_sandhi.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
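The `MixTextProcessor` / `DomXml` pair in xml_processor.py above splits mixed input into plain fragments plus one XML fragment, then pulls per-word pinyin out of the tag attributes. A minimal usage sketch; the `<speak>` / `<say-as pinyin="...">` tag names follow the PaddleSpeech SSML convention and are an assumption here:

```python
# Hedged sketch: the tag names are assumed; the [fragment, [pinyin, ...]]
# output shape follows DomXml.get_pinyins_for_xml above.
from textfrontend.ssml.xml_processor import MixTextProcessor

text = '<speak>我们的团队<say-as pinyin="dui4 wu3">队伍</say-as>很强。</speak>'
for fragment, pinyins in MixTextProcessor.get_pinyin_split(text):
    # fragments without a pinyin attribute carry an empty pinyin list
    print(fragment, pinyins)
```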
14 | from typing import List 15 | from typing import Tuple 16 | 17 | import jieba 18 | from pypinyin import lazy_pinyin 19 | from pypinyin import Style 20 | 21 | 22 | class ToneSandhi(): 23 | def __init__(self): 24 | self.must_neural_tone_words = { 25 | '麻烦', '麻利', '鸳鸯', '高粱', '骨头', '骆驼', '马虎', '首饰', '馒头', '馄饨', '风筝', 26 | '难为', '队伍', '阔气', '闺女', '门道', '锄头', '铺盖', '铃铛', '铁匠', '钥匙', '里脊', 27 | '里头', '部分', '那么', '道士', '造化', '迷糊', '连累', '这么', '这个', '运气', '过去', 28 | '软和', '转悠', '踏实', '跳蚤', '跟头', '趔趄', '财主', '豆腐', '讲究', '记性', '记号', 29 | '认识', '规矩', '见识', '裁缝', '补丁', '衣裳', '衣服', '衙门', '街坊', '行李', '行当', 30 | '蛤蟆', '蘑菇', '薄荷', '葫芦', '葡萄', '萝卜', '荸荠', '苗条', '苗头', '苍蝇', '芝麻', 31 | '舒服', '舒坦', '舌头', '自在', '膏药', '脾气', '脑袋', '脊梁', '能耐', '胳膊', '胭脂', 32 | '胡萝', '胡琴', '胡同', '聪明', '耽误', '耽搁', '耷拉', '耳朵', '老爷', '老实', '老婆', 33 | '戏弄', '将军', '翻腾', '罗嗦', '罐头', '编辑', '结实', '红火', '累赘', '糨糊', '糊涂', 34 | '精神', '粮食', '簸箕', '篱笆', '算计', '算盘', '答应', '笤帚', '笑语', '笑话', '窟窿', 35 | '窝囊', '窗户', '稳当', '稀罕', '称呼', '秧歌', '秀气', '秀才', '福气', '祖宗', '砚台', 36 | '码头', '石榴', '石头', '石匠', '知识', '眼睛', '眯缝', '眨巴', '眉毛', '相声', '盘算', 37 | '白净', '痢疾', '痛快', '疟疾', '疙瘩', '疏忽', '畜生', '生意', '甘蔗', '琵琶', '琢磨', 38 | '琉璃', '玻璃', '玫瑰', '玄乎', '狐狸', '状元', '特务', '牲口', '牙碜', '牌楼', '爽快', 39 | '爱人', '热闹', '烧饼', '烟筒', '烂糊', '点心', '炊帚', '灯笼', '火候', '漂亮', '滑溜', 40 | '溜达', '温和', '清楚', '消息', '浪头', '活泼', '比方', '正经', '欺负', '模糊', '槟榔', 41 | '棺材', '棒槌', '棉花', '核桃', '栅栏', '柴火', '架势', '枕头', '枇杷', '机灵', '本事', 42 | '木头', '木匠', '朋友', '月饼', '月亮', '暖和', '明白', '时候', '新鲜', '故事', '收拾', 43 | '收成', '提防', '挖苦', '挑剔', '指甲', '指头', '拾掇', '拳头', '拨弄', '招牌', '招呼', 44 | '抬举', '护士', '折腾', '扫帚', '打量', '打算', '打扮', '打听', '打发', '扎实', '扁担', 45 | '戒指', '懒得', '意识', '意思', '悟性', '怪物', '思量', '怎么', '念头', '念叨', '别人', 46 | '快活', '忙活', '志气', '心思', '得罪', '张罗', '弟兄', '开通', '应酬', '庄稼', '干事', 47 | '帮手', '帐篷', '希罕', '师父', '师傅', '巴结', '巴掌', '差事', '工夫', '岁数', '屁股', 48 | '尾巴', '少爷', '小气', '小伙', '将就', '对头', '对付', '寡妇', '家伙', '客气', '实在', 49 | '官司', '学问', '字号', '嫁妆', '媳妇', '媒人', '婆家', '娘家', '委屈', '姑娘', '姐夫', 50 | '妯娌', '妥当', '妖精', '奴才', '女婿', '头发', '太阳', '大爷', '大方', '大意', '大夫', 51 | '多少', '多么', '外甥', '壮实', '地道', '地方', '在乎', '困难', '嘴巴', '嘱咐', '嘟囔', 52 | '嘀咕', '喜欢', '喇嘛', '喇叭', '商量', '唾沫', '哑巴', '哈欠', '哆嗦', '咳嗽', '和尚', 53 | '告诉', '告示', '含糊', '吓唬', '后头', '名字', '名堂', '合同', '吆喝', '叫唤', '口袋', 54 | '厚道', '厉害', '千斤', '包袱', '包涵', '匀称', '勤快', '动静', '动弹', '功夫', '力气', 55 | '前头', '刺猬', '刺激', '别扭', '利落', '利索', '利害', '分析', '出息', '凑合', '凉快', 56 | '冷战', '冤枉', '冒失', '养活', '关系', '先生', '兄弟', '便宜', '使唤', '佩服', '作坊', 57 | '体面', '位置', '似的', '伙计', '休息', '什么', '人家', '亲戚', '亲家', '交情', '云彩', 58 | '事情', '买卖', '主意', '丫头', '丧气', '两口', '东西', '东家', '世故', '不由', '下水', 59 | '下巴', '上头', '上司', '丈夫', '丈人', '一辈', '那个', '菩萨', '父亲', '母亲', '咕噜', 60 | '邋遢', '费用', '冤家', '甜头', '介绍', '荒唐', '大人', '泥鳅', '幸福', '熟悉', '计划', 61 | '扑腾', '蜡烛', '姥爷', '照顾', '喉咙', '吉他', '弄堂', '蚂蚱', '凤凰', '拖沓', '寒碜', 62 | '糟蹋', '倒腾', '报复', '逻辑', '盘缠', '喽啰', '牢骚', '咖喱', '扫把', '惦记' 63 | } 64 | self.must_not_neural_tone_words = { 65 | '男子', '女子', '分子', '原子', '量子', '莲子', '石子', '瓜子', '电子', '人人', '虎虎', 66 | '幺幺', '干嘛', '学子', '哈哈', '数数', '袅袅', '局地', '以下', '娃哈哈', '花花草草', '留得', 67 | '耕地', '想想', '熙熙', '攘攘', '卵子', '死死', '冉冉', '恳恳', '佼佼', '吵吵', '打打', 68 | '考考', '整整', '莘莘', '落地', '算子', '家家户户' 69 | } 70 | self.punc = ":,;。?!“”‘’':,;.?!" 71 | 72 | # the meaning of jieba pos tag: https://blog.csdn.net/weixin_44174352/article/details/113731041 73 | # e.g. 
74 | # word: "家里" 75 | # pos: "s" 76 | # finals: ['ia1', 'i3'] 77 | def _neural_sandhi(self, word: str, pos: str, 78 | finals: List[str]) -> List[str]: 79 | if word in self.must_not_neural_tone_words: 80 | return finals 81 | # reduplication words for n. and v. e.g. 奶奶, 试试, 旺旺 82 | for j, item in enumerate(word): 83 | if j - 1 >= 0 and item == word[j - 1] and pos[0] in {"n", "v", "a"}: 84 | finals[j] = finals[j][:-1] + "5" 85 | ge_idx = word.find("个") 86 | if len(word) >= 1 and word[-1] in "吧呢啊呐噻嘛吖嗨呐哦哒滴哩哟喽啰耶喔诶": 87 | finals[-1] = finals[-1][:-1] + "5" 88 | elif len(word) >= 1 and word[-1] in "的地得": 89 | finals[-1] = finals[-1][:-1] + "5" 90 | # e.g. 走了, 看着, 去过 91 | elif len(word) == 1 and word in "了着过" and pos in {"ul", "uz", "ug"}: 92 | finals[-1] = finals[-1][:-1] + "5" 93 | elif len(word) > 1 and word[-1] in "们子" and pos in {"r", "n"}: 94 | finals[-1] = finals[-1][:-1] + "5" 95 | # e.g. 桌上, 地下 96 | elif len(word) > 1 and word[-1] in "上下" and pos in {"s", "l", "f"}: 97 | finals[-1] = finals[-1][:-1] + "5" 98 | # e.g. 上来, 下去 99 | elif len(word) > 1 and word[-1] in "来去" and word[-2] in "上下进出回过起开": 100 | finals[-1] = finals[-1][:-1] + "5" 101 | # 个做量词 102 | elif (ge_idx >= 1 and 103 | (word[ge_idx - 1].isnumeric() or 104 | word[ge_idx - 1] in "几有两半多各整每做是")) or word == '个': 105 | finals[ge_idx] = finals[ge_idx][:-1] + "5" 106 | else: 107 | if word in self.must_neural_tone_words or word[ 108 | -2:] in self.must_neural_tone_words: 109 | finals[-1] = finals[-1][:-1] + "5" 110 | 111 | word_list = self._split_word(word) 112 | finals_list = [finals[:len(word_list[0])], finals[len(word_list[0]):]] 113 | for i, word in enumerate(word_list): 114 | # conventional neural in Chinese 115 | if word in self.must_neural_tone_words or word[ 116 | -2:] in self.must_neural_tone_words: 117 | finals_list[i][-1] = finals_list[i][-1][:-1] + "5" 118 | finals = sum(finals_list, []) 119 | return finals 120 | 121 | def _bu_sandhi(self, word: str, finals: List[str]) -> List[str]: 122 | # e.g. 看不懂 123 | if len(word) == 3 and word[1] == "不": 124 | finals[1] = finals[1][:-1] + "5" 125 | else: 126 | for i, char in enumerate(word): 127 | # "不" before tone4 should be bu2, e.g. 不怕 128 | if char == "不" and i + 1 < len(word) and finals[i + 129 | 1][-1] == "4": 130 | finals[i] = finals[i][:-1] + "2" 131 | return finals 132 | 133 | def _yi_sandhi(self, word: str, finals: List[str]) -> List[str]: 134 | # "一" in number sequences, e.g. 一零零, 二一零 135 | if word.find("一") != -1 and all( 136 | [item.isnumeric() for item in word if item != "一"]): 137 | return finals 138 | # "一" between reduplication words shold be yi5, e.g. 看一看 139 | elif len(word) == 3 and word[1] == "一" and word[0] == word[-1]: 140 | finals[1] = finals[1][:-1] + "5" 141 | # when "一" is ordinal word, it should be yi1 142 | elif word.startswith("第一"): 143 | finals[1] = finals[1][:-1] + "1" 144 | else: 145 | for i, char in enumerate(word): 146 | if char == "一" and i + 1 < len(word): 147 | # "一" before tone4 should be yi2, e.g. 一段 148 | if finals[i + 1][-1] in {'4', '5'}: 149 | finals[i] = finals[i][:-1] + "2" 150 | # "一" before non-tone4 should be yi4, e.g. 
一天 151 | else: 152 | # "一" 后面如果是标点,还读一声 153 | if word[i + 1] not in self.punc: 154 | finals[i] = finals[i][:-1] + "4" 155 | return finals 156 | 157 | def _split_word(self, word: str) -> List[str]: 158 | word_list = jieba.cut_for_search(word) 159 | word_list = sorted(word_list, key=lambda i: len(i), reverse=False) 160 | first_subword = word_list[0] 161 | first_begin_idx = word.find(first_subword) 162 | if first_begin_idx == 0: 163 | second_subword = word[len(first_subword):] 164 | new_word_list = [first_subword, second_subword] 165 | else: 166 | second_subword = word[:-len(first_subword)] 167 | new_word_list = [second_subword, first_subword] 168 | return new_word_list 169 | 170 | def _three_sandhi(self, word: str, finals: List[str]) -> List[str]: 171 | 172 | if len(word) == 2 and self._all_tone_three(finals): 173 | finals[0] = finals[0][:-1] + "2" 174 | elif len(word) == 3: 175 | word_list = self._split_word(word) 176 | if self._all_tone_three(finals): 177 | # disyllabic + monosyllabic, e.g. 蒙古/包 178 | if len(word_list[0]) == 2: 179 | finals[0] = finals[0][:-1] + "2" 180 | finals[1] = finals[1][:-1] + "2" 181 | # monosyllabic + disyllabic, e.g. 纸/老虎 182 | elif len(word_list[0]) == 1: 183 | finals[1] = finals[1][:-1] + "2" 184 | else: 185 | finals_list = [ 186 | finals[:len(word_list[0])], finals[len(word_list[0]):] 187 | ] 188 | if len(finals_list) == 2: 189 | for i, sub in enumerate(finals_list): 190 | # e.g. 所有/人 191 | if self._all_tone_three(sub) and len(sub) == 2: 192 | finals_list[i][0] = finals_list[i][0][:-1] + "2" 193 | # e.g. 好/喜欢 194 | elif i == 1 and not self._all_tone_three(sub) and finals_list[i][0][-1] == "3" and \ 195 | finals_list[0][-1][-1] == "3": 196 | 197 | finals_list[0][-1] = finals_list[0][-1][:-1] + "2" 198 | finals = sum(finals_list, []) 199 | # split idiom into two words who's length is 2 200 | elif len(word) == 4: 201 | finals_list = [finals[:2], finals[2:]] 202 | finals = [] 203 | for sub in finals_list: 204 | if self._all_tone_three(sub): 205 | sub[0] = sub[0][:-1] + "2" 206 | finals += sub 207 | 208 | return finals 209 | 210 | def _all_tone_three(self, finals: List[str]) -> bool: 211 | return all(x[-1] == "3" for x in finals) 212 | 213 | # merge "不" and the word behind it 214 | # if don't merge, "不" sometimes appears alone according to jieba, which may occur sandhi error 215 | def _merge_bu(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: 216 | new_seg = [] 217 | last_word = "" 218 | for word, pos in seg: 219 | if last_word == "不": 220 | word = last_word + word 221 | if word != "不": 222 | new_seg.append((word, pos)) 223 | last_word = word[:] 224 | if last_word == "不": 225 | new_seg.append((last_word, 'd')) 226 | last_word = "" 227 | return new_seg 228 | 229 | # function 1: merge "一" and reduplication words in it's left and right, e.g. "听","一","听" ->"听一听" 230 | # function 2: merge single "一" and the word behind it 231 | # if don't merge, "一" sometimes appears alone according to jieba, which may occur sandhi error 232 | # e.g. 
233 | # input seg: [('听', 'v'), ('一', 'm'), ('听', 'v')] 234 | # output seg: [['听一听', 'v']] 235 | def _merge_yi(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: 236 | new_seg = [] 237 | # function 1 238 | for i, (word, pos) in enumerate(seg): 239 | if i - 1 >= 0 and word == "一" and i + 1 < len(seg) and seg[i - 1][ 240 | 0] == seg[i + 1][0] and seg[i - 1][1] == "v": 241 | if i - 1 < len(new_seg): 242 | new_seg[i - 243 | 1][0] = new_seg[i - 1][0] + "一" + new_seg[i - 1][0] 244 | else: 245 | new_seg.append([word, pos]) 246 | new_seg.append([seg[i + 1][0], pos]) 247 | else: 248 | if i - 2 >= 0 and seg[i - 1][0] == "一" and seg[i - 2][ 249 | 0] == word and pos == "v": 250 | continue 251 | else: 252 | new_seg.append([word, pos]) 253 | seg = new_seg 254 | new_seg = [] 255 | # function 2 256 | for i, (word, pos) in enumerate(seg): 257 | if new_seg and new_seg[-1][0] == "一": 258 | new_seg[-1][0] = new_seg[-1][0] + word 259 | else: 260 | new_seg.append([word, pos]) 261 | return new_seg 262 | 263 | # the first and the second words are all_tone_three 264 | def _merge_continuous_three_tones( 265 | self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: 266 | new_seg = [] 267 | sub_finals_list = [ 268 | lazy_pinyin( 269 | word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) 270 | for (word, pos) in seg 271 | ] 272 | assert len(sub_finals_list) == len(seg) 273 | merge_last = [False] * len(seg) 274 | for i, (word, pos) in enumerate(seg): 275 | if i - 1 >= 0 and self._all_tone_three( 276 | sub_finals_list[i - 1]) and self._all_tone_three( 277 | sub_finals_list[i]) and not merge_last[i - 1]: 278 | # if the last word is reduplication, not merge, because reduplication need to be _neural_sandhi 279 | if not self._is_reduplication(seg[i - 1][0]) and len( 280 | seg[i - 1][0]) + len(seg[i][0]) <= 3: 281 | new_seg[-1][0] = new_seg[-1][0] + seg[i][0] 282 | merge_last[i] = True 283 | else: 284 | new_seg.append([word, pos]) 285 | else: 286 | new_seg.append([word, pos]) 287 | 288 | return new_seg 289 | 290 | def _is_reduplication(self, word: str) -> bool: 291 | return len(word) == 2 and word[0] == word[1] 292 | 293 | # the last char of first word and the first char of second word is tone_three 294 | def _merge_continuous_three_tones_2( 295 | self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: 296 | new_seg = [] 297 | sub_finals_list = [ 298 | lazy_pinyin( 299 | word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) 300 | for (word, pos) in seg 301 | ] 302 | assert len(sub_finals_list) == len(seg) 303 | merge_last = [False] * len(seg) 304 | for i, (word, pos) in enumerate(seg): 305 | if i - 1 >= 0 and sub_finals_list[i - 1][-1][-1] == "3" and sub_finals_list[i][0][-1] == "3" and not \ 306 | merge_last[i - 1]: 307 | # if the last word is reduplication, not merge, because reduplication need to be _neural_sandhi 308 | if not self._is_reduplication(seg[i - 1][0]) and len( 309 | seg[i - 1][0]) + len(seg[i][0]) <= 3: 310 | new_seg[-1][0] = new_seg[-1][0] + seg[i][0] 311 | merge_last[i] = True 312 | else: 313 | new_seg.append([word, pos]) 314 | else: 315 | new_seg.append([word, pos]) 316 | return new_seg 317 | 318 | def _merge_er(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: 319 | new_seg = [] 320 | for i, (word, pos) in enumerate(seg): 321 | if i - 1 >= 0 and word == "儿": 322 | new_seg[-1][0] = new_seg[-1][0] + seg[i][0] 323 | else: 324 | new_seg.append([word, pos]) 325 | return new_seg 326 | 327 | def _merge_reduplication( 328 | self, seg: List[Tuple[str, str]]) -> 
List[Tuple[str, str]]: 329 | new_seg = [] 330 | for i, (word, pos) in enumerate(seg): 331 | if new_seg and word == new_seg[-1][0]: 332 | new_seg[-1][0] = new_seg[-1][0] + seg[i][0] 333 | else: 334 | new_seg.append([word, pos]) 335 | return new_seg 336 | 337 | def pre_merge_for_modify( 338 | self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: 339 | seg = self._merge_bu(seg) 340 | seg = self._merge_yi(seg) 341 | seg = self._merge_reduplication(seg) 342 | seg = self._merge_continuous_three_tones(seg) 343 | seg = self._merge_continuous_three_tones_2(seg) 344 | seg = self._merge_er(seg) 345 | return seg 346 | 347 | def modified_tone(self, word: str, pos: str, 348 | finals: List[str]) -> List[str]: 349 | 350 | finals = self._bu_sandhi(word, finals) 351 | finals = self._yi_sandhi(word, finals) 352 | finals = self._neural_sandhi(word, pos, finals) 353 | finals = self._three_sandhi(word, finals) 354 | return finals 355 | -------------------------------------------------------------------------------- /textfrontend/version.py: -------------------------------------------------------------------------------- 1 | 2 | __version__ = '0.0.1' -------------------------------------------------------------------------------- /textfrontend/vocab.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from collections import OrderedDict 15 | from typing import Iterable 16 | 17 | __all__ = ["Vocab"] 18 | 19 | 20 | class Vocab(object): 21 | """ Vocabulary. 22 | 23 | Args: 24 | symbols (Iterable[str]): Common symbols. 25 | padding_symbol (str, optional): Symbol for padding. Defaults to "<pad>". 26 | unk_symbol (str, optional): Symbol for unknown. Defaults to "<unk>". 27 | start_symbol (str, optional): Symbol for start. Defaults to "<s>". 28 | end_symbol (str, optional): Symbol for end. Defaults to "</s>". 29 | """ 30 | 31 | def __init__(self, 32 | symbols: Iterable[str], 33 | padding_symbol="<pad>", 34 | unk_symbol="<unk>", 35 | start_symbol="<s>", 36 | end_symbol="</s>"): 37 | self.special_symbols = OrderedDict() 38 | for i, item in enumerate( 39 | [padding_symbol, unk_symbol, start_symbol, end_symbol]): 40 | if item: 41 | self.special_symbols[item] = len(self.special_symbols) 42 | 43 | self.padding_symbol = padding_symbol 44 | self.unk_symbol = unk_symbol 45 | self.start_symbol = start_symbol 46 | self.end_symbol = end_symbol 47 | 48 | self.stoi = OrderedDict() 49 | self.stoi.update(self.special_symbols) 50 | 51 | for i, s in enumerate(symbols): 52 | if s not in self.stoi: 53 | self.stoi[s] = len(self.stoi) 54 | self.itos = {v: k for k, v in self.stoi.items()} 55 | 56 | def __len__(self): 57 | return len(self.stoi) 58 | 59 | @property 60 | def num_specials(self): 61 | """ The number of special symbols.
62 | """ 63 | return len(self.special_symbols) 64 | 65 | # special tokens 66 | @property 67 | def padding_index(self): 68 | """ The index of padding symbol 69 | """ 70 | return self.stoi.get(self.padding_symbol, -1) 71 | 72 | @property 73 | def unk_index(self): 74 | """The index of unknow symbol. 75 | """ 76 | return self.stoi.get(self.unk_symbol, -1) 77 | 78 | @property 79 | def start_index(self): 80 | """The index of start symbol. 81 | """ 82 | return self.stoi.get(self.start_symbol, -1) 83 | 84 | @property 85 | def end_index(self): 86 | """ The index of end symbol. 87 | """ 88 | return self.stoi.get(self.end_symbol, -1) 89 | 90 | def __repr__(self): 91 | fmt = "Vocab(size: {},\nstoi:\n{})" 92 | return fmt.format(len(self), self.stoi) 93 | 94 | def __str__(self): 95 | return self.__repr__() 96 | 97 | def lookup(self, symbol): 98 | """ The index that symbol correspond. 99 | """ 100 | return self.stoi[symbol] 101 | 102 | def reverse(self, index): 103 | """ The symbol thar index cottespond. 104 | """ 105 | return self.itos[index] 106 | 107 | def add_symbol(self, symbol): 108 | """ Add a new symbol in vocab. 109 | """ 110 | if symbol in self.stoi: 111 | return 112 | N = len(self.stoi) 113 | self.stoi[symbol] = N 114 | self.itos[N] = symbol 115 | 116 | def add_symbols(self, symbols): 117 | """ Add multiple symbols in vocab. 118 | """ 119 | for symbol in symbols: 120 | self.add_symbol(symbol) 121 | -------------------------------------------------------------------------------- /textfrontend/zh_frontend.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | import os 15 | import re 16 | from operator import itemgetter 17 | from typing import Dict 18 | from typing import List 19 | 20 | import jieba.posseg as psg 21 | import numpy as np 22 | from torch import Tensor 23 | import torch 24 | import yaml 25 | from g2pM import G2pM 26 | from pypinyin import lazy_pinyin 27 | from pypinyin import load_phrases_dict 28 | from pypinyin import load_single_dict 29 | from pypinyin import Style 30 | from pypinyin_dict.phrase_pinyin_data import large_pinyin 31 | 32 | from textfrontend.g2pw import G2PWOnnxConverter 33 | from textfrontend.generate_lexicon import generate_lexicon 34 | from textfrontend.tone_sandhi import ToneSandhi 35 | from textfrontend.zh_normalization.text_normlization import TextNormalizer 36 | from textfrontend.ssml.xml_processor import MixTextProcessor 37 | 38 | INITIALS = [ 39 | 'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h', 'zh', 'ch', 'sh', 40 | 'r', 'z', 'c', 's', 'j', 'q', 'x' 41 | ] 42 | INITIALS += ['y', 'w', 'sp', 'spl', 'spn', 'sil'] 43 | 44 | 45 | def intersperse(lst, item): 46 | result = [item] * (len(lst) * 2 + 1) 47 | result[1::2] = lst 48 | return result 49 | 50 | 51 | def insert_after_character(lst, item): 52 | result = [item] 53 | for phone in lst: 54 | result.append(phone) 55 | if phone not in INITIALS: 56 | # finals has tones 57 | # assert phone[-1] in "12345" 58 | result.append(item) 59 | return result 60 | 61 | 62 | class Polyphonic(): 63 | def __init__(self): 64 | with open( 65 | os.path.join( 66 | os.path.dirname(os.path.abspath(__file__)), 67 | 'polyphonic.yaml'), 68 | 'r', 69 | encoding='utf-8') as polyphonic_file: 70 | # 解析yaml 71 | polyphonic_dict = yaml.load(polyphonic_file, Loader=yaml.FullLoader) 72 | self.polyphonic_words = polyphonic_dict["polyphonic"] 73 | 74 | def correct_pronunciation(self, word, pinyin): 75 | # 词汇被词典收录则返回纠正后的读音 76 | if word in self.polyphonic_words.keys(): 77 | pinyin = self.polyphonic_words[word] 78 | # 否则返回原读音 79 | return pinyin 80 | 81 | 82 | class Frontend(): 83 | def __init__(self, 84 | g2p_model="g2pW", 85 | phone_vocab_path=None, 86 | tone_vocab_path=None): 87 | self.mix_ssml_processor = MixTextProcessor() 88 | self.tone_modifier = ToneSandhi() 89 | self.text_normalizer = TextNormalizer() 90 | self.punc = ":,;。?!“”‘’':,;.?!" 
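For orientation, the two small helpers above differ in where they put the blank token, and `Polyphonic` simply looks whole words up in `polyphonic.yaml`. A hand-written sketch; the blank-token string and the sample phones are illustrative, not values taken from a vocab file:

```python
from textfrontend.zh_frontend import Polyphonic, insert_after_character, intersperse

phones = ['n', 'i3', 'h', 'ao3']
print(intersperse(phones, '<pad>'))
# ['<pad>', 'n', '<pad>', 'i3', '<pad>', 'h', '<pad>', 'ao3', '<pad>']
print(insert_after_character(phones, '<pad>'))
# ['<pad>', 'n', 'i3', '<pad>', 'h', 'ao3', '<pad>']  (a blank only after each final)

corrector = Polyphonic()
# words listed in polyphonic.yaml override the predicted pinyin
print(corrector.correct_pronunciation('朝鲜', ['zhao1', 'xian1']))  # ['chao2', 'xian3']
```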
91 | self.phrases_dict = { 92 | '开户行': [['ka1i'], ['hu4'], ['hang2']], 93 | '发卡行': [['fa4'], ['ka3'], ['hang2']], 94 | '放款行': [['fa4ng'], ['kua3n'], ['hang2']], 95 | '茧行': [['jia3n'], ['hang2']], 96 | '行号': [['hang2'], ['ha4o']], 97 | '各地': [['ge4'], ['di4']], 98 | '借还款': [['jie4'], ['hua2n'], ['kua3n']], 99 | '时间为': [['shi2'], ['jia1n'], ['we2i']], 100 | '为准': [['we2i'], ['zhu3n']], 101 | '色差': [['se4'], ['cha1']], 102 | '嗲': [['dia3']], 103 | '呗': [['bei5']], 104 | '不': [['bu4']], 105 | '咗': [['zuo5']], 106 | '嘞': [['lei5']], 107 | '掺和': [['chan1'], ['huo5']] 108 | } 109 | # g2p_model can be pypinyin and g2pM and g2pW 110 | self.g2p_model = g2p_model 111 | if self.g2p_model == "g2pM": 112 | self.g2pM_model = G2pM() 113 | self.pinyin2phone = generate_lexicon( 114 | with_tone=True, with_erhua=False) 115 | elif self.g2p_model == "g2pW": 116 | # use pypinyin as backup for non polyphonic characters in g2pW 117 | self._init_pypinyin() 118 | self.corrector = Polyphonic() 119 | self.g2pM_model = G2pM() 120 | self.g2pW_model = G2PWOnnxConverter( 121 | style='pinyin', enable_non_tradional_chinese=True) 122 | self.pinyin2phone = generate_lexicon( 123 | with_tone=True, with_erhua=False) 124 | 125 | else: 126 | self._init_pypinyin() 127 | self.must_erhua = { 128 | "小院儿", "胡同儿", "范儿", "老汉儿", "撒欢儿", "寻老礼儿", "妥妥儿", "媳妇儿" 129 | } 130 | self.not_erhua = { 131 | "虐儿", "为儿", "护儿", "瞒儿", "救儿", "替儿", "有儿", "一儿", "我儿", "俺儿", "妻儿", 132 | "拐儿", "聋儿", "乞儿", "患儿", "幼儿", "孤儿", "婴儿", "婴幼儿", "连体儿", "脑瘫儿", 133 | "流浪儿", "体弱儿", "混血儿", "蜜雪儿", "舫儿", "祖儿", "美儿", "应采儿", "可儿", "侄儿", 134 | "孙儿", "侄孙儿", "女儿", "男儿", "红孩儿", "花儿", "虫儿", "马儿", "鸟儿", "猪儿", "猫儿", 135 | "狗儿" 136 | } 137 | 138 | self.vocab_phones = {} 139 | self.vocab_tones = {} 140 | if phone_vocab_path: 141 | with open(phone_vocab_path, 'rt') as f: 142 | phn_id = [line.strip().split() for line in f.readlines()] 143 | for phn, id in phn_id: 144 | self.vocab_phones[phn] = int(id) 145 | if tone_vocab_path: 146 | with open(tone_vocab_path, 'rt') as f: 147 | tone_id = [line.strip().split() for line in f.readlines()] 148 | for tone, id in tone_id: 149 | self.vocab_tones[tone] = int(id) 150 | 151 | def _init_pypinyin(self): 152 | large_pinyin.load() 153 | load_phrases_dict(self.phrases_dict) 154 | # 调整字的拼音顺序 155 | load_single_dict({ord(u'地'): u'de,di4'}) 156 | 157 | def _get_initials_finals(self, word: str) -> List[List[str]]: 158 | initials = [] 159 | finals = [] 160 | if self.g2p_model == "pypinyin": 161 | orig_initials = lazy_pinyin( 162 | word, neutral_tone_with_five=True, style=Style.INITIALS) 163 | orig_finals = lazy_pinyin( 164 | word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) 165 | for c, v in zip(orig_initials, orig_finals): 166 | if re.match(r'i\d', v): 167 | if c in ['z', 'c', 's']: 168 | v = re.sub('i', 'ii', v) 169 | elif c in ['zh', 'ch', 'sh', 'r']: 170 | v = re.sub('i', 'iii', v) 171 | initials.append(c) 172 | finals.append(v) 173 | elif self.g2p_model == "g2pM": 174 | pinyins = self.g2pM_model(word, tone=True, char_split=False) 175 | for pinyin in pinyins: 176 | pinyin = pinyin.replace("u:", "v") 177 | if pinyin in self.pinyin2phone: 178 | initial_final_list = self.pinyin2phone[pinyin].split(" ") 179 | if len(initial_final_list) == 2: 180 | initials.append(initial_final_list[0]) 181 | finals.append(initial_final_list[1]) 182 | elif len(initial_final_list) == 1: 183 | initials.append('') 184 | finals.append(initial_final_list[1]) 185 | else: 186 | # If it's not pinyin (possibly punctuation) or no conversion is required 187 | 
initials.append(pinyin) 188 | finals.append(pinyin) 189 | return initials, finals 190 | 191 | # if merge_sentences, merge all sentences into one phone sequence 192 | def _g2p(self, 193 | sentences: List[str], 194 | merge_sentences: bool=True, 195 | with_erhua: bool=True) -> List[List[str]]: 196 | segments = sentences 197 | phones_list = [] 198 | for seg in segments: 199 | phones = [] 200 | # Replace all English words in the sentence 201 | seg = re.sub('[a-zA-Z]+', '', seg) 202 | seg_cut = psg.lcut(seg) 203 | initials = [] 204 | finals = [] 205 | seg_cut = self.tone_modifier.pre_merge_for_modify(seg_cut) 206 | # 为了多音词获得更好的效果,这里采用整句预测 207 | if self.g2p_model == "g2pW": 208 | try: 209 | pinyins = self.g2pW_model(seg)[0] 210 | except Exception: 211 | # g2pW采用模型采用繁体输入,如果有cover不了的简体词,采用g2pM预测 212 | print("[%s] not in g2pW dict,use g2pM" % seg) 213 | pinyins = self.g2pM_model(seg, tone=True, char_split=False) 214 | pre_word_length = 0 215 | for word, pos in seg_cut: 216 | sub_initials = [] 217 | sub_finals = [] 218 | now_word_length = pre_word_length + len(word) 219 | if pos == 'eng': 220 | pre_word_length = now_word_length 221 | continue 222 | word_pinyins = pinyins[pre_word_length:now_word_length] 223 | # 矫正发音 224 | word_pinyins = self.corrector.correct_pronunciation( 225 | word, word_pinyins) 226 | for pinyin, char in zip(word_pinyins, word): 227 | if pinyin is None: 228 | pinyin = char 229 | pinyin = pinyin.replace("u:", "v") 230 | if pinyin in self.pinyin2phone: 231 | initial_final_list = self.pinyin2phone[ 232 | pinyin].split(" ") 233 | if len(initial_final_list) == 2: 234 | sub_initials.append(initial_final_list[0]) 235 | sub_finals.append(initial_final_list[1]) 236 | elif len(initial_final_list) == 1: 237 | sub_initials.append('') 238 | sub_finals.append(initial_final_list[1]) 239 | else: 240 | # If it's not pinyin (possibly punctuation) or no conversion is required 241 | sub_initials.append(pinyin) 242 | sub_finals.append(pinyin) 243 | pre_word_length = now_word_length 244 | sub_finals = self.tone_modifier.modified_tone(word, pos, 245 | sub_finals) 246 | if with_erhua: 247 | sub_initials, sub_finals = self._merge_erhua( 248 | sub_initials, sub_finals, word, pos) 249 | initials.append(sub_initials) 250 | finals.append(sub_finals) 251 | # assert len(sub_initials) == len(sub_finals) == len(word) 252 | else: 253 | for word, pos in seg_cut: 254 | if pos == 'eng': 255 | continue 256 | sub_initials, sub_finals = self._get_initials_finals(word) 257 | sub_finals = self.tone_modifier.modified_tone(word, pos, 258 | sub_finals) 259 | if with_erhua: 260 | sub_initials, sub_finals = self._merge_erhua( 261 | sub_initials, sub_finals, word, pos) 262 | initials.append(sub_initials) 263 | finals.append(sub_finals) 264 | # assert len(sub_initials) == len(sub_finals) == len(word) 265 | initials = sum(initials, []) 266 | finals = sum(finals, []) 267 | 268 | for c, v in zip(initials, finals): 269 | # NOTE: post process for pypinyin outputs 270 | # we discriminate i, ii and iii 271 | if c and c not in self.punc: 272 | phones.append(c) 273 | if c and c in self.punc: 274 | phones.append('sp') 275 | if v and v not in self.punc: 276 | phones.append(v) 277 | phones_list.append(phones) 278 | if merge_sentences: 279 | merge_list = sum(phones_list, []) 280 | # rm the last 'sp' to avoid the noise at the end 281 | # cause in the training data, no 'sp' in the end 282 | if merge_list[-1] == 'sp': 283 | merge_list = merge_list[:-1] 284 | phones_list = [] 285 | phones_list.append(merge_list) 286 | return phones_list 287 | 
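To make the data flow above concrete: after normalization, each segment is cut with jieba, converted to pinyin (g2pW, with g2pM/pypinyin as fallback), mapped through the lexicon into initials plus tone-numbered finals, and punctuation collapses to 'sp'. The trace below is hand-written for illustration, not a captured run, so the exact phone labels are an assumption based on the lexicon conventions used here:

```python
# Hypothetical trace for one sentence (illustrative only):
#   text:    '天气,真好。'
#   pinyin:  ['tian1', 'qi4', None, 'zhen1', 'hao3']   # None for the comma
#   phones:  initials + finals, 'sp' for punctuation, and the trailing 'sp'
#            dropped when sentences are merged
phones_list = [['t', 'ian1', 'q', 'i4', 'sp', 'zh', 'en1', 'h', 'ao3']]
```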
288 | def _split_word_to_char(self, words): 289 | res = [] 290 | for x in words: 291 | res.append(x) 292 | return res 293 | 294 | # if using ssml, have pingyin specified, assign pinyin to words 295 | def _g2p_assign(self, 296 | words: List[str], 297 | pinyin_spec: List[str], 298 | merge_sentences: bool=True) -> List[List[str]]: 299 | phones_list = [] 300 | initials = [] 301 | finals = [] 302 | 303 | words = self._split_word_to_char(words[0]) 304 | for pinyin, char in zip(pinyin_spec, words): 305 | sub_initials = [] 306 | sub_finals = [] 307 | pinyin = pinyin.replace("u:", "v") 308 | #self.pinyin2phone: is a dict with all pinyin mapped with sheng_mu yun_mu 309 | if pinyin in self.pinyin2phone: 310 | initial_final_list = self.pinyin2phone[pinyin].split(" ") 311 | if len(initial_final_list) == 2: 312 | sub_initials.append(initial_final_list[0]) 313 | sub_finals.append(initial_final_list[1]) 314 | elif len(initial_final_list) == 1: 315 | sub_initials.append('') 316 | sub_finals.append(initial_final_list[1]) 317 | else: 318 | # If it's not pinyin (possibly punctuation) or no conversion is required 319 | sub_initials.append(pinyin) 320 | sub_finals.append(pinyin) 321 | initials.append(sub_initials) 322 | finals.append(sub_finals) 323 | 324 | initials = sum(initials, []) 325 | finals = sum(finals, []) 326 | phones = [] 327 | for c, v in zip(initials, finals): 328 | # NOTE: post process for pypinyin outputs 329 | # we discriminate i, ii and iii 330 | if c and c not in self.punc: 331 | phones.append(c) 332 | if c and c in self.punc: 333 | phones.append('sp') 334 | if v and v not in self.punc: 335 | phones.append(v) 336 | phones_list.append(phones) 337 | if merge_sentences: 338 | merge_list = sum(phones_list, []) 339 | # rm the last 'sp' to avoid the noise at the end 340 | # cause in the training data, no 'sp' in the end 341 | if merge_list[-1] == 'sp': 342 | merge_list = merge_list[:-1] 343 | phones_list = [] 344 | phones_list.append(merge_list) 345 | return phones_list 346 | 347 | def _merge_erhua(self, 348 | initials: List[str], 349 | finals: List[str], 350 | word: str, 351 | pos: str) -> List[List[str]]: 352 | # fix er1 353 | for i, phn in enumerate(finals): 354 | if i == len(finals) - 1 and word[i] == "儿" and phn == 'er1': 355 | finals[i] = 'er2' 356 | if word not in self.must_erhua and (word in self.not_erhua or 357 | pos in {"a", "j", "nr"}): 358 | return initials, finals 359 | # "……" 等情况直接返回 360 | if len(finals) != len(word): 361 | return initials, finals 362 | 363 | assert len(finals) == len(word) 364 | 365 | new_initials = [] 366 | new_finals = [] 367 | for i, phn in enumerate(finals): 368 | if i == len(finals) - 1 and word[i] == "儿" and phn in { 369 | "er2", "er5" 370 | } and word[-2:] not in self.not_erhua and new_finals: 371 | new_finals[-1] = new_finals[-1][:-1] + "r" + new_finals[-1][-1] 372 | else: 373 | new_finals.append(phn) 374 | new_initials.append(initials[i]) 375 | return new_initials, new_finals 376 | 377 | def _p2id(self, phonemes: List[str]) -> np.ndarray: 378 | # replace unk phone with sp 379 | phonemes = [ 380 | phn if phn in self.vocab_phones else "sp" for phn in phonemes 381 | ] 382 | phone_ids = [self.vocab_phones[item] for item in phonemes] 383 | return np.array(phone_ids, np.int64) 384 | 385 | def _t2id(self, tones: List[str]) -> np.ndarray: 386 | # replace unk phone with sp 387 | tones = [tone if tone in self.vocab_tones else "0" for tone in tones] 388 | tone_ids = [self.vocab_tones[item] for item in tones] 389 | return np.array(tone_ids, np.int64) 390 | 391 | def 
_get_phone_tone(self, phonemes: List[str], 392 | get_tone_ids: bool=False) -> List[List[str]]: 393 | phones = [] 394 | tones = [] 395 | if get_tone_ids and self.vocab_tones: 396 | for full_phone in phonemes: 397 | # split tone from finals 398 | match = re.match(r'^(\w+)([012345])$', full_phone) 399 | if match: 400 | phone = match.group(1) 401 | tone = match.group(2) 402 | # if the merged erhua not in the vocab 403 | # assume that the input is ['iaor3'] and 'iaor' not in self.vocab_phones, we split 'iaor' into ['iao','er'] 404 | # and the tones accordingly change from ['3'] to ['3','2'], while '2' is the tone of 'er2' 405 | if len(phone) >= 2 and phone != "er" and phone[ 406 | -1] == 'r' and phone not in self.vocab_phones and phone[: 407 | -1] in self.vocab_phones: 408 | phones.append(phone[:-1]) 409 | phones.append("er") 410 | tones.append(tone) 411 | tones.append("2") 412 | else: 413 | phones.append(phone) 414 | tones.append(tone) 415 | else: 416 | phones.append(full_phone) 417 | tones.append('0') 418 | else: 419 | for phone in phonemes: 420 | # if the merged erhua not in the vocab 421 | # assume that the input is ['iaor3'] and 'iaor' not in self.vocab_phones, change ['iaor3'] to ['iao3','er2'] 422 | if len(phone) >= 3 and phone[:-1] != "er" and phone[ 423 | -2] == 'r' and phone not in self.vocab_phones and ( 424 | phone[:-2] + phone[-1]) in self.vocab_phones: 425 | phones.append((phone[:-2] + phone[-1])) 426 | phones.append("er2") 427 | else: 428 | phones.append(phone) 429 | return phones, tones 430 | 431 | def get_phonemes(self, 432 | sentence: str, 433 | merge_sentences: bool=True, 434 | with_erhua: bool=True, 435 | robot: bool=False, 436 | print_info: bool=False) -> List[List[str]]: 437 | sentences = self.text_normalizer.normalize(sentence) 438 | phonemes = self._g2p( 439 | sentences, merge_sentences=merge_sentences, with_erhua=with_erhua) 440 | # change all tones to `1` 441 | if robot: 442 | new_phonemes = [] 443 | for sentence in phonemes: 444 | new_sentence = [] 445 | for item in sentence: 446 | # `er` only have tone `2` 447 | if item[-1] in "12345" and item != "er2": 448 | item = item[:-1] + "1" 449 | new_sentence.append(item) 450 | new_phonemes.append(new_sentence) 451 | phonemes = new_phonemes 452 | if print_info: 453 | print("----------------------------") 454 | print("text norm results:") 455 | print(sentences) 456 | print("----------------------------") 457 | print("g2p results:") 458 | print(phonemes) 459 | print("----------------------------") 460 | return phonemes 461 | 462 | #@an added for ssml pinyin 463 | def get_phonemes_ssml(self, 464 | ssml_inputs: list, 465 | merge_sentences: bool=True, 466 | with_erhua: bool=True, 467 | robot: bool=False, 468 | print_info: bool=False) -> List[List[str]]: 469 | all_phonemes = [] 470 | for word_pinyin_item in ssml_inputs: 471 | phonemes = [] 472 | sentence, pinyin_spec = itemgetter(0, 1)(word_pinyin_item) 473 | sentences = self.text_normalizer.normalize(sentence) 474 | if len(pinyin_spec) == 0: 475 | phonemes = self._g2p( 476 | sentences, 477 | merge_sentences=merge_sentences, 478 | with_erhua=with_erhua) 479 | else: 480 | # phonemes should be pinyin_spec 481 | phonemes = self._g2p_assign( 482 | sentences, pinyin_spec, merge_sentences=merge_sentences) 483 | 484 | all_phonemes = all_phonemes + phonemes 485 | 486 | if robot: 487 | new_phonemes = [] 488 | for sentence in all_phonemes: 489 | new_sentence = [] 490 | for item in sentence: 491 | # `er` only have tone `2` 492 | if item[-1] in "12345" and item != "er2": 493 | item = item[:-1] 
+ "1" 494 | new_sentence.append(item) 495 | new_phonemes.append(new_sentence) 496 | all_phonemes = new_phonemes 497 | 498 | if print_info: 499 | print("----------------------------") 500 | print("text norm results:") 501 | print(sentences) 502 | print("----------------------------") 503 | print("g2p results:") 504 | print(all_phonemes[0]) 505 | print("----------------------------") 506 | return [sum(all_phonemes, [])] 507 | 508 | def get_input_ids(self, 509 | sentence: str, 510 | merge_sentences: bool=True, 511 | get_tone_ids: bool=False, 512 | robot: bool=False, 513 | print_info: bool=False, 514 | add_blank: bool=False, 515 | blank_token: str="", 516 | to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]: 517 | 518 | phonemes = self.get_phonemes( 519 | sentence, 520 | merge_sentences=merge_sentences, 521 | print_info=print_info, 522 | robot=robot) 523 | result = {} 524 | phones = [] 525 | tones = [] 526 | temp_phone_ids = [] 527 | temp_tone_ids = [] 528 | 529 | for part_phonemes in phonemes: 530 | phones, tones = self._get_phone_tone( 531 | part_phonemes, get_tone_ids=get_tone_ids) 532 | if add_blank: 533 | phones = insert_after_character(phones, blank_token) 534 | if tones: 535 | tone_ids = self._t2id(tones) 536 | if to_tensor: 537 | tone_ids = torch.as_tensor(tone_ids) 538 | temp_tone_ids.append(tone_ids) 539 | if phones: 540 | phone_ids = self._p2id(phones) 541 | # if use torch.as_tensor() in onnxruntime, the first time will be too low 542 | if to_tensor: 543 | phone_ids = torch.as_tensor(phone_ids) 544 | temp_phone_ids.append(phone_ids) 545 | if temp_tone_ids: 546 | result["tone_ids"] = temp_tone_ids 547 | if temp_phone_ids: 548 | result["phone_ids"] = temp_phone_ids 549 | return result 550 | 551 | # @an added for ssml 552 | def get_input_ids_ssml( 553 | self, 554 | sentence: str, 555 | merge_sentences: bool=True, 556 | get_tone_ids: bool=False, 557 | robot: bool=False, 558 | print_info: bool=False, 559 | add_blank: bool=False, 560 | blank_token: str="", 561 | to_tensor: bool=True) -> Dict[str, List[Tensor]]: 562 | 563 | l_inputs = MixTextProcessor.get_pinyin_split(sentence) 564 | phonemes = self.get_phonemes_ssml( 565 | l_inputs, 566 | merge_sentences=merge_sentences, 567 | print_info=print_info, 568 | robot=robot) 569 | result = {} 570 | phones = [] 571 | tones = [] 572 | temp_phone_ids = [] 573 | temp_tone_ids = [] 574 | 575 | for part_phonemes in phonemes: 576 | phones, tones = self._get_phone_tone( 577 | part_phonemes, get_tone_ids=get_tone_ids) 578 | if add_blank: 579 | phones = insert_after_character(phones, blank_token) 580 | if tones: 581 | tone_ids = self._t2id(tones) 582 | if to_tensor: 583 | tone_ids = torch.as_tensor(tone_ids) 584 | temp_tone_ids.append(tone_ids) 585 | if phones: 586 | phone_ids = self._p2id(phones) 587 | # if use torch.as_tensor() in onnxruntime, the first time will be too low 588 | if to_tensor: 589 | phone_ids = torch.as_tensor(phone_ids) 590 | temp_phone_ids.append(phone_ids) 591 | if temp_tone_ids: 592 | result["tone_ids"] = temp_tone_ids 593 | if temp_phone_ids: 594 | result["phone_ids"] = temp_phone_ids 595 | return result 596 | -------------------------------------------------------------------------------- /textfrontend/zh_normalization/README.md: -------------------------------------------------------------------------------- 1 | ## Supported NSW (Non-Standard-Word) Normalization 2 | 3 | |NSW type|raw|normalized| 4 | |:--|:-|:-| 5 | |serial number|电影中梁朝伟扮演的陈永仁的编号27149|电影中梁朝伟扮演的陈永仁的编号二七一四九| 6 | |cardinal|这块黄金重达324.75克
我们班的最高总分为583分|这块黄金重达三百二十四点七五克<br>我们班的最高总分为五百八十三分| 7 | |numeric range |12\~23<br>-1.5\~2|十二到二十三<br>负一点五到二| 8 | |date|她出生于86年8月18日,她弟弟出生于1995年3月1日|她出生于八六年八月十八日, 她弟弟出生于一九九五年三月一日| 9 | |time|等会请在12:05请通知我|等会请在十二点零五分请通知我| 10 | |temperature|今天的最低气温达到-10°C|今天的最低气温达到零下十度| 11 | |fraction|现场有7/12的观众投出了赞成票|现场有十二分之七的观众投出了赞成票| 12 | |percentage|明天有62%的概率降雨|明天有百分之六十二的概率降雨| 13 | |money|随便来几个价格12块5,34.5元,20.1万|随便来几个价格十二块五,三十四点五元,二十点一万| 14 | |telephone|这是固话0421-33441122<br>这是手机+86 18544139121|这是固话零四二一三三四四一一二二
这是手机八六一八五四四一三九一二一| 15 | ## References 16 | [Pull requests #658 of DeepSpeech](https://github.com/PaddlePaddle/DeepSpeech/pull/658/files) 17 | -------------------------------------------------------------------------------- /textfrontend/zh_normalization/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from .text_normlization import * 15 | -------------------------------------------------------------------------------- /textfrontend/zh_normalization/char_convert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Traditional and simplified Chinese conversion, a simplified character may correspond to multiple traditional characters. 
16 | """ 17 | simplified_charcters = '制咖片型超声盘鉴定仔点他命书歌粉巾字帐恤手指记忆棒形转弯沟光○〇㐄㐅㐆㐌㐖毒㐜㐡㐤㐰㐺㑇㑳㒳㒸㔾㗂㗎㝵㞎㞙㞞以㢲㢴㤅㥁㥯㨗㫺㬎㮎㮚㮸㲋㲱㲾㳮涧㵪㶸㷖㷭㹢㹴犬㺢狓㺵碗㽮㿝䍃䔢䖟䖸䗈䗥䗪䝓射䥯䦉䯝鲃鱼䲔䳗鹅䵹鼄䶑一对应映射丁不识下儿子做二休世丘之貉并中台原则串为甚谓干净了百事无成八变五十些人得道鸡升天代如并来去个国政策劲幽灵在欧洲游荡接样萝卜坑侧化传价元论醇共再准刀两断切分耕耘收获钱货物向看旧就绪险刻千金动劳永逸匙零夜半卡通回复返影踪反常态口咬气句话同吐快吹周味呼诺呜品红锅哄而散起唱和问三知生熟团漆黑火糟堆场空块面塌糊涂尘染壁厢夔已足多情露水大早到晚夫妻当关万莫开失古恨套所料既往孔见提师要家主审寸阴难买斗牛小撮部阵局展身层巴掌帆风顺席地带过年计于春头载四季期被蛇怕井绳度愿式份弹顷深前律径心意念差愁孤行俱全房厅交遮打技长把抓死拿眼泪鼻涕钥锁折段抿拍即合扫排掬挥拨拥上入击洞掷揽改故辙败文值名斑方面旁族日秋餐隔雅里终父旦时晌会霎间晃暴寒曝更月望垠际朝夕本正经利杯羹东西板枝独秀根筋杆进条龙服务概模次函数又性程总付步脚印趋登毛拔呵氧氮碳决雌雄波未平派谎言流清楚白准溜烟潭有获闻是处降琴鹤甲病发可拾沙目然了直以相眨穿睹瞥瞬矢的解石鸟神教秉虔诚秘种窝蜂穷窍笑置笔苟勾销抹杀煞等奖箍节吃箭仇双雕诗筹箩筐系列纸级士官统丝毫挂维网尽线微吭响股脑胎脉承腔臂力致效资源址器举功投般说讲规贸易叶障着慎满皆输号木电池衣倾钟高低视仁觉醒览遗角银币触溃九鼎蔽抄出驷马追重语破贫洗贯走路安蹴至几蹶振跃役胆汗较辈轮辞赞退六连遍递边针血锤音错门思闪真倒项栽雾类保护川先惊乍体哄鳞爪鸣滴泡邻域党专鼓作齐炒丑烯亥克内酯冬加奴卯肝炎基尺梁街裤镐客宠庭巳汝昌烷玲磊糖肇酉醛啷青县韪良香骨鲷丂七集河市弦喜嘴张舌堵区工业姊妹星架构巧彩扭歪拼凑余热曜武州爷浮屠美乡老阶树荤素碎落能魄鳃鳗珠丄丅丆万俟丈尚摸母娘量管群亚虎必我堂令申件装伏位博侠义界表女墟台戏臭皮匠胜诸葛亮赛顶倍催请运算包立叉戟离疫苗土史志演围揭瓦晒夷姑婆帝村宝烂尖杉碱屉桌山岔岛由纪峡坝库镇废从德后拗汤治旬食明昧曹朋友框栏极权幂曲归依猫民氟硼氯磷铁江侗自旅法司洋浦梅园温暖湾焦班幸用田略番叠皇炮捶硝苯酸腺苷棱草镜穗跳远索锦纲聚氰胺联店胚膲爱色堇紫罗兰芝茶饭菱云虫藏藩乱叛苏亲债凳学座恐恋柱测肌腹衩锥系貂企乌跪叩军车农题迭都甘油屯奏键短阿姨陪姐只顾茅庐槽驾魂鲜鹿页其菜单乘任供势午齿汉组织吊调泻唇坡城报坟外夸将尉建筑岸岗公床扬新剑升杭林栗校楼标款汽社浣海商馆剧院钢华港机械广媒环球融第医科证券综财乐育游涨犹岭疏瘾睑确兵领导缴肢膛船艾瑟尔苍蔡虞效衫覆访诉课谕议轨述野钩限敌鞋颌颔颚饶首龈站例修凡划垂届属崽颏厨拜挫摆放旋削棋榻槛礼沉注滑营狱画确仪聘花葬诏员跌辖周达酒锚闸陷陆雨雪飞威丌于丹久乏予理评产亢卑亦乎舞己悲矩圆词害志但住佞佳便俗信票案幅翁倦伦假偏倚斜亏鬼敲停备伤脾胃仅此像俭匮免宜穴焉戴兼容许冻伯仲负彼昼皂轩轾实刊划颠卫战哥比省非好黄饰别拘束掩奶睬选择摇扰烦苦枚写协厌及格受欢迎约只估侵犯割状告或缺抗拒挽撤救药喻磨灭端倪少逆逾越避靠适吉誉吝玉含延咎歹听啻渊善谋均匀堪忍够太惹妙妥妨孕症孝术室完纳推冠积宣疑辩栗碴称屈挠屑干涉衡待很忙恶忿怎么怠急耻恭息悦惑惜惟想愉愧怍慌愤启懂懈怀材才紧招认扣抵拉舍也罢插揣冒搭撞南墙扩核支攻敢雷攀敬里吗需景智暇曾罪遇朽枉止况竞争辱求愈渝溶济左右袒困补爽特寂寞示弱找谢畏强疾徐痛痒冤符眠睦瞅董何厚云措活疲羞者轻玻璃祥兆禁移稂莠稳佛换答简结果盟绝缕途给谈否羁翼耐肖胫毋宁兴舒若菲莱痕迹窠臼虚衰脸兔撒鹰棺范该详讳抬泰让须眉象众赀账费灰赖奇虑训辍辨菽麦辛近送透逞徒速续逮捕遂遑违逊斧钺艰醉锈随观弃显饱脂肪使丏丐帮丒且慢末丕替桃宗王尊凉爵各图屋脊粮署录坛吾禄职胄袭君厦丗北壑桐疹损逢陵鹬丙寅戌氨腈唑纶辰酮脱氢酶醚丞丢现掉纱帽弄扯炮碗丠両丣坐存激肩臻蒂莲悖序驱丨丩丫挺杈髻鬟细介俄伊犁京尼布订普渡央委监察检查剂圈设警队斯督剩震境航舶革防托播促质版蝾螈锋研艺历残消频谱精密制造陲邮候埔坚压坜凹汇执府究邦俘摄寮彬狼岳肺肿庸英讯诊埋粒胞括控码韩暑枪枢砥澳哇牟寿甸钻探篇签缀缝继耳肯照妇埃悬璧轴柜台辣搁浅邪跑纤阮阳私囊魔丮丰姿采丱烧丳丵丶丷丸参寨朗桂瑞砂衷霞貌凤仆舰因嫌宰峰干络牌持旨祭祷簿编罚宾办丼丿乀乂乃乄仰慕盛旷留考验阔乆乇么丑麽乊湖燃乑乒乓乕乖僻忤戾离谬迕乗危肥劫除隙浪婿乙炔肠酰吡咯盐乚乛乜嘢卿玄宫尾狐龟塔嶷兄弟泉章霄钉耙乞扎哀怜恕讨乢乣乤乥乧乨乩童乪乫乭乳晕汁液瑶浆牙癌突窦罩腐胶猪酪蛋糕菌瘤乴乵乶乷乸乹乺乼乾俸冰嘉哕嚎坤妈尸垒旱枯涸俐渴潮涩煸豆燥爹瘦瘪癣瞪袋脆姜贝隆馏乿亀亁叫咕攘扔搞男砸窜蓬麻亃亄亅却亇迟典今临繁累卵奉婚聪躬巨与迁添裂副宿岁怪恶尕仑愣杆硅硫钛铀锰芑杂异钠砷胂磺琥珀舱棍簧胡茬盗浩盆贩郎腿亍洪亐互欠助勉惠操斥诿系户译亓墓碑刑铃卅渠缤纷斗米旗宪钒灯徽瘟祖拳福谷丰脏腑绑肉腌苓蕴桥铺霸颜闹判喷冈底蛙陉矿亖亘亜罕们娜桑那努哈喀弗烈曼松森杜氏杯奥琛敦戊穆圣裔汇薛孙亟亡佚虏羊牢奋释卷卸契媾感额睫缠谊趾塞挤纽阻还配驰庄亨洛祚亪享津沪畿郊慈菴枇杷膏亭阁锃丽亳亶亹诛初责翻疯偶杰丛稠妖拖寰居吸授慧蜗吞壮魅狗矛盾益渣患忧稀描猿梦暂涯畜祸缘沸搜引擎臣横纭谁混援蒸兽狮税剖亻亼亽亡什献刹邡么仂仃仄仆富怨仈仉毕昔晨壳绍仍仏仒仕宦仗欺恃腰叹叹炬梓讫施仙后琼逝仚仝仞仟悔仡佬偿填泊拓扑簇羔购顿钦佩发棻阃驭养亿儆尤借帧赈凌叙帖李柔刚沃眦睚戒讹取飨读仨仫仮著泳卧躺韶夏裁仳仵唯贤凭钓诞仿似宋佛讽伀硕盼鹅伄儅伈伉俪柯始娃迈戈坦堡帕茨萨庙玛莉莎藤霍姆伋伍奢胥廷芳豪伎俩侍汛勒希羲雏伐憩整谟闲闲伕伙伴颐伜伝伢叔恒兹恩翰伱伲侣伶俜悧鼬伸懒缩喇叭伹伺伻伽倻辐伾似佃伫布乔妮墨佉卢佌贷劣廉昂档浓矮伞洼缓耗胸谷迷挡率龋宅沫舍疗佐贰佑占优据铧尝呢须鲁晓佗佘余坪寺瓜铳僧蒙芒陀龛哼呕坊奸孽弊揖祟茧缚誓贼佝偻瞀佟你夺赶佡佢佣佤佧贾佪佫佯佰佱洁绩酿肴佴卷佶佷佸佹佺佻佼佽佾具唤窘坏娱怒慨硬习惯聋膨胀蔓骇贵痹侀侁侂侃侄侅鸿燕侇侈糜靡侉侌妾侏儒仓鼠侐侑侔仑侘侚链侜偎傍钴循柳葫芦附価侮骂蔑侯岩截蚀局贴壶嬛宴捷携桶笺酌俣狭膝狄俅俉俊俏俎俑俓俔谚俚俛黎健呈固墒增守康箱湿祐镖镳杠盒靖膜龄俞豹猎噪孚封札筒托衍鸽剪撰稿炼厂禊练缮葺俯瞰撑冲效俳俴俵俶俷俺备俾伥倂倅储卒惶敷猝逃颉蓄崇隐倌倏忽刺蜡烛噍嚼坍扁抽毙葱楣灌灶粪背薮卖赔闭霉腾倓倔幸倘倜傥倝借箸挹浇阅倡狂倢倣値倥偬倨傲倩匡嗣冲柝珍倬倭寇猩倮倶倷倹勤赞偁偃充伪吏嗓寐惺扮拱芫茜藉虢钞偈伟晶偌宕距析滤殿疼瘫注颇偓偕鸭歇滞偝偟偢忘怡旺偨偩逼偫偭偯偰偱偲侦缉蹄偷减惰漏窥窃偸偺迹傀儡傅傈僳骂篱傎奎琳迪叟芭傒傔傕伧悉荒傜傞傢傣芽逼佣婢傮睨寄檄诵谣颂伛担辜弓惨蒿悼疤傺傻屄臆巢泄箧羡盖轧颓傿㑩僄僇佥僊働僎侨僔僖僚僝伪僣僤侥僦猴偾僩僬僭僮僯僰雇僵殖签静僾僿征陇儁侬儃儇侩朴薄儊儋儌儍傧儓俦侪拟尽儜儞儤儦儩汰哉寡渥裕酷儭儱罐儳儵儹傩俨儽兀臬臲鹫允勋勋宙宵帅憝彝谐嫂阋畅沛溢盈饥赫凶悍狠猛顽愚妣斩秦遣鞭耀敏荣槃泽爆碟磁秃缆辉霁卤朵娄孜烽酱勃汀箕裘钳耶蒙蕾彻兑软遭黜兎児韵媳爸兕觥兖兙兛兜售鍪肚兝兞兟兡兢兣樽殓涅睡禀籍赘泌啡肽奸幕涵涝熵疚眷稃衬讧赴焕椒歼植跏没试误猜栖窗肋袖颊兪卦撇胡岐廓轿疸枫茴珑厕秩募勺吨寓斤历亩迫筷厘最淫螺韬兮宽匪筛襄赢轭复兲诈刃堰戎痞蚁饷它冀铸冂冃円冇冉册嫁厉砺竭醮冏牧冑冓冔冕冖冗冘冞冢窄抑诬冥冫烘菇蛰冷凝坨橇淇淋炭饼砖碛窖醋雕雹霜冱冶炉艳嘲峻滩淡漠煖飕饮冼冽凃凄怆梗凅凇净凊凋敝蒙凔凛遵汞脢凞几凢処凰凯凵凶焰凸折刷纹预丧喽奔巡榜殡芙蓉租笼辑鞘萃凼锯镬刁蛮刂娩崩批拆摊掰蘖骤歧颗秒袂赃勿嘱忌磋琢肤刈羽刎讼戮舂桨艇刓刖霹雳刜创犊刡恙墅帜筵致劫劫刨昏默攸尿欲熏润薰圭删刮痧铲刱刲刳刴刵踏磅戳柏槐绣芹苋猬舟铭鹄鹜劫剁剃辫刭锉履铅克剌姻咽哨廊掠桅沿召瞻翅赵卜渺茫郭剒剔剕沥剚愎毅讷才剜剥啄采剞剟剡剣剤䌽剐肾驶黏剰袍剀紊铲剸剺剽剿劁劂札劈啪柴扳啦刘奭姥夼昫涓熙禅禹锡翔雁鹗刽刿弩柄蜻蛉劒劓劖劘劙澜篑赏矶釜晋甜薪逐劦熔纣虐赤囚劬劭労劵效劻劼劾峭艮勅勇励勍勐腊脖庞漫饲荡粥辄勖勗勘骄馁碌泮雇捐竹骑殊阱绩朴恳谨剿勧勩勯勰劢勋勷劝惩慰诫谏勹芡践阑匁庇拯粟扎袱裹饺匆遽匈匉匊匋匍匐茎匏匕妆痰脓蛹斋苑烤蹈塘羌熊阀螳螂疆碚竿纬荷茵邙魏匚匜匝匟扶稷匣匦拢匸匹耦匽匾匿卂叮疮禧轸堤棚迢钧炼卄卆遐卉瓷盲瓶当胱腱裸卋卌卍卐怯污贱鄙龌龊陋卓溪唐梯渔陈枣泥漳浔涧梨芬谯赡辕迦郑単驴弈洽鳌卛占筮卝卞卟吩啉屎翠厄卣卨卪卬卮榫袄玺绶钮蚤惧殆笃耸卲帘帙绕恤卼卽厂厎厓厔厖厗奚厘厍厜厝谅厕厤厥厪腻孢厮厰厳厣厹厺粕垢芜菁厼
厾叁悟茸薯叄吵笄悌哺讥坫垄弧芯杠潜婴刍袁诘贪谍煽馈驳収岳缔灾贿骗叚叡吻拦蘑蜜诀燧玩砚筝椎蔺铜逗骊另觅叨唠谒杵姓喊嚷嚣咚咛塑寻恼憎擦只泣渗蝠叱吒咄咤喝籀黛舵舷叵叶铎懿昭穰苴辽叻叼吁堑嫖赌瞧爬众抒吅吆夥卺橡涤抱纵摩郡唁坠扇篮膀袜颈吋忾谘酬哭妓媛暗表缰迩妃羿絮蕃浑拐葵暮隅吔吖啶嗪戚吜啬噬咽吟哦咏吠吧唧嗒咐吪隽咀征燐苞茹钙哧吮吰吱嘎吲哚吴栋娇窟孟箫忠晗淞阖闾趼宇呐睛嘘拂捧疵熄竽笛糠吼吽呀吕韦蒙呃呆笨呇贡呉罄呋喃呎呏呔呠呡痴呣呤呦呧瑛眩扒晬淑姬瑜璇鹃呪呫哔嚅嗫呬呯呰呱呲咧噌钝呴呶呷呸呺呻哱咻啸噜吁坎坷逻呿咁咂咆哮咇咈咋蟹煦珅蔼咍咑咒诅咔哒嚓咾哝哩喱咗咠咡咢咣咥咦咨嗟询咩咪咫啮啮咭咮咱咲咳呛嗽咴啕咸咹咺呙喉咿婉恸悯赋矜绿茗蓝哂抢瞒哆嗦啰噻啾滨彗哋哌哎唷哟哏哐哞哢哤哪里哫啼喘哰哲萎蚌哳咩哽哿呗唅唆唈唉唎唏哗尧棣殇璜睿肃唔睇唕吣唞唣喳唪唬唰喏唲唳唵嘛唶唸唹唻唼唾唿啁啃鹦鹉啅埠栈榷祺铺鞅飙啊啍啎啐啓啕啖啗啜哑祈啢衔啤啥啫啱啲啵啺饥啽噶昆沁喁喂喆裙喈咙喋喌喎喑喒喓喔粗喙幛庆滋鹊喟喣喤喥喦喧骚喨喩梆吃葡萄喭驼挑吓碰枞瓣纯疱藻趟铬喵営喹喺喼喿嗀嗃嗄嗅嗈嗉嗊嗍嗐嗑嗔诟嗕嗖嗙嗛嗜痂癖嗝嗡嗤嗥嗨唢嗬嗯嗰嗲嗵叽嗷嗹嗾嗿嘀嘁嘂嘅惋嘈峪禾荫啀嘌嘏嘐嘒啯啧嘚唛嘞嘟囔嘣嘥嘦嘧嘬嘭这谑严敞馋松哓嘶嗥呒虾嘹嘻啴嘿噀噂噅噇噉噎噏噔噗噘噙噚咝噞噢噤蝉皿噩噫噭嗳噱哙噳嚏涌洒欲巫霏噷噼嚃嚄嚆抖哜尝嚔苏嚚嚜嚞嚟呖嚬嚭嚮嚯亸喾饬按竣苛嚵嘤啭冁呓膪谦囍囒囓囗囘萧酚飘溅谛囝溯眸纥銮鹘囟殉囡団囤囥囧囨囱囫囵囬囮囯囲図囶囷囸囹圄圉拟囻囿圀圂圃圊粹蠹赦圌垦圏滚鲱凿枘圕圛圜圞坯埂壤骸炕祠窑豚绅魠鲮鳖圧握圩圪垯圬圮圯炸岬幔毯祇窨菩溉圳圴圻圾坂坆沾坋坌舛壈昆垫墩椅坒坓坩埚坭坰坱坳坴坵坻坼杨挣涎帘垃垈垌垍垓垔垕垗垚垛垝垣垞垟垤垧垮垵垺垾垿埀畔埄埆埇埈埌殃隍埏埒埕埗埜垭埤埦埧埭埯埰埲埳埴埵埶绋埸培怖桩础辅埼埽堀诃侄庑堃堄摧磐贞韧砌堈堉垩堋堌堍堎垴堙堞堠礁堧堨舆堭堮蜓摘堲堳堽堿塁塄塈煤茔棵塍垲埘塓绸塕鸦沽虱塙冢塝缪塡坞埙塥塩塬塱场螨塼塽塾塿墀墁墈墉墐夯増毁墝墠墦渍钵墫墬堕墰墺墙橱壅壆壊壌壎壒榨蒜壔壕壖圹垆壜壝垅壡壬壭壱売壴壹壻壸寝壿夂夅夆変夊夌漱邑夓腕泄甥御骼夗夘夙衮瑙妊娠醣枭珊莺鹭戗幻魇夤蹀秘擂鸫姚宛闺屿庾挞拇賛蛤裨菠氅漓捞湄蚊霆鲨箐篆篷荆肆舅荔鲆巷惭骰辟邱镕镰阪漂烩鲵鲽鳄鸨胪鹏妒峨谭枰晏玑癸祝秤竺牡籁恢罡蝼蝎赐绒御梭夬夭砣榆怙枕夶夹馅奄崛葩谲奈贺祀赠奌奂奓奕䜣詝奘奜奠奡奣陶奨奁魁奫奬奰娲孩贬隶酥宄狡猾她姹嫣妁毡荼皋膻蝇嫔妄妍嫉媚娆妗趣妚妞妤碍妬娅妯娌妲妳妵妺姁姅姉姗姒姘姙姜姝姞姣姤姧姫姮娥姱姸姺姽婀娀诱慑胁娉婷娑娓娟娣娭娯娵娶娸娼婊婐婕婞婤婥溪孺婧婪婬婹婺婼婽媁媄媊媕媞媟媠媢媬媮妫媲媵媸媺媻媪眯媿嫄嫈袅嫏嫕妪嫘嫚嫜嫠嫡嫦嫩嫪毐嫫嫬嫰妩嫺娴嫽嫿妫嬃嬅嬉耍婵痴艳嬔嬖嬗嫱袅嫒嬢嬷嬦嬬嬭幼嬲嬴婶嬹嬾嬿孀娘孅娈孏曰癫屏孑孓雀孖斟篓谜摺孛矻鸠崮轲祜鸾孥邈毓棠膑孬孭孰孱孳孵泛罔衔孻孪宀宁冗拙株薇掣抚琪瓿榴谧弥宊濂祁瑕宍宏碁宓邸谳実潢町宥宧宨宬徵崎骏掖阙臊煮禽蚕宸豫寀寁寥寃檐庶寎暄碜寔寖寘寙寛寠苫寤肘洱滥蒗陕核寪弘绰螽宝擅疙瘩晷対檐専尃尅赎绌缭畴衅尌峙醌襟痲碧屁昊槌淘恵瀑牝畑莓缸羚觑蔻脏躁尔尓锐尗尙尜尟尢尥尨尪尬尭尰擒尲尶尴尸尹潽蠖蛾尻扣梢蚴鳍脬蹲屇屌蚵屐屃挪屖屘屙屛屝屡屣峦嶂岩舄屧屦屩屪屃屮戍驻钾崖嵛巅旮旯楂榄榉芋茱萸靛麓屴屹屺屼岀岊岌岍阜岑彭巩岒岝岢岚岣岧岨岫岱岵岷峁峇峋峒峓峞峠嵋峨峰峱岘峹峿崀崁崆祯崋崌崃岖昆崒崔嵬巍萤颢崚崞崟崠峥巆崤崦崧殂岽崱崳崴崶崿嵂嵇嵊泗嵌嵎嵒嵓岁嵙嵞嵡嵩嵫嵯嵴嵼嵾嵝崭崭晴嶋嶌嶒嶓嵚崂嶙嶝嶞峤嶡嶢峄嶨嶭嶮嶰嶲岙嵘巂巃巇巉岿巌巓巘巛滇芎巟巠弋回巣巤炊擘蜥蟒蛊觋巰蜀彦淖杏茂甫楞巻巽帼巿帛斐鲫蕊帑帔帗帚琉汶帟帡帣帨裙帯帰帷帹暆帏幄帮幋幌幏帻幙帮幞幠幡幢幦幨幩幪帱幭幯幰遥蹉跎馀庚鉴幵幷稚邃庀庁広庄庈庉笠庋跋庖牺庠庤庥鲸庬庱庳庴庵馨衢庹庿廃厩廆廋廌廎廏廐廑廒荫廖廛厮搏锣廞弛袤廥廧廨廪廱绵踵髓廸迫瓯邺廻廼廾廿躔弁皱弇弌弍弎弐弑吊诡憾荐弝弢弣弤弨弭弮弰弪霖繇焘斌旭溥骞弶弸弼弾彀彄别累纠强彔彖彘彟彟陌彤贻彧绘虹彪炳雕蔚鸥彰瘅彲彳彴仿彷徉徨彸彽踩敛旆徂徇徊渭畲铉裼従筌徘徙徜徕膳苏萌渐徬徭醺徯徳徴潘徻徼忀瘁胖燎怦悸颤扉犀澎湃砰恍惚绞隘忉惮挨饿忐忑忒忖応忝忞耿忡忪忭忮忱忸怩忻悠懑怏遏怔怗怚怛怞怼黍讶怫怭懦怱怲恍怵惕怸怹恁恂恇恉恌恏恒恓恔恘恚恛恝恞恟恠恣恧眄恪恫恬澹恰恿悀悁悃悄悆悊悐悒晦悚悛悜悝悤您悩悪悮悰悱凄恻德悴怅惘闷悻悾惄愫钟蒐惆惇惌惎惏惓惔惙惛耄惝疟浊恿惦德恽惴蠢惸拈愀愃愆愈愊愍愐愑愒愓愔愕恪氓蠢騃昵惬赧悫愬愮愯恺愼慁恿慅慆慇霭慉慊愠慝慥怄怂慬慱悭慴慵慷戚焚憀灼郁憃惫憋憍眺捏轼愦憔憖憙憧憬憨憪憭怃憯憷憸憹憺懃懅懆邀懊懋怿懔懐懞懠懤懥恹懫懮懰懱毖懵遁梁雍忏懽戁戄戆戉戋戕戛戝戛戠戡戢戣戤戥戦戬戭戯轰戱披菊牖戸戹戺戻卯戽锹扂楔扃扆扈扊杖牵绢铐镯赉扐搂搅烊盹瞌跟趸镲靶鼾払扗玫腮扛扞扠扡扢盔押扤扦扱罾揄绥鞍郤窾扻扼扽抃抆抈抉抌抏瞎抔缳缢擞抜拗択抨摔歉蹿牾抶抻搐泵菸拃拄拊髀抛拌脯拎拏拑擢秧沓曳挛迂拚拝拠拡拫拭拮踢拴拶拷攒拽掇芥橐簪摹疔挈瓢骥捺蹻挌挍挎挐拣挓挖掘浚挙揍聩挲挶挟挿捂捃捄捅捆捉捋胳膊揎捌捍捎躯蛛捗捘捙捜捥捩扪捭据捱捻捼捽掀掂抡臀膘掊掎掏掐笙掔掗掞棉芍掤搪阐掫掮掯揉掱掲掽掾揃揅揆搓揌诨揕揗揘揜揝揞揠揥揩揪揫橥遒麈揰揲揵揶揸背揺搆搉搊搋搌搎搔搕撼橹捣搘搠搡搢搣搤搥搦搧搨搬楦裢讪赸掏搰搲搳搴揾搷搽搾搿摀摁摂摃摎掴摒摓跤摙摛掼摞摠摦喉羯摭摮挚摰摲抠摴抟摷掺摽撂撃撅稻撊撋挦锏泼撕撙撚㧑挢撢掸撦撅撩撬撱朔揿蚍蜉挝捡擀掳闯擉缶觚擐擕擖擗擡擣擤澡腚擧擨擩擫擭摈拧撷擸撸擽擿攃摅撵攉攥攐攓撄搀撺每攩攫辔澄攮攰攲攴轶攷砭讦攽碘敁敃敇敉叙敎筏敔敕敖闰诲敜煌敧敪敳敹敺敻敿斁衽斄牒绉诌斉斎斓鹑谰驳鳢斒筲斛斝斞斠斡斢斨斫斮晾沂潟颖绛邵斲斸釳於琅斾斿旀旗旃旄涡旌旎旐旒旓旖旛旝旟旡旣浴旰獭魃旴时旻旼旽昀昃昄昇昉晰躲澈熹皎皓矾昑昕昜昝昞昡昤晖笋昦昨是昱昳昴昶昺昻晁蹇隧蔬髦晄晅晒晛晜晞晟晡晢晤晥曦晩萘莹顗晿暁暋暌暍暐暔暕煅旸暝暠暡曚暦暨暪朦胧昵暲殄冯暵暸暹暻暾曀晔昙曈曌曏曐暧曘曙曛叠昽曩骆曱甴肱曷牍禺锟曽沧耽朁朅朆杪栓夸竟粘绦朊膺朏朐朓朕朘朙瞄觐溘饔飧朠朢朣栅椆淀虱朩朮朰朱炆璋钰炽鹮朳槿朵朾朿杅杇杌陧欣钊湛漼楷瀍煜玟缨翱肇舜贽适逵杓杕杗杙荀蘅杝杞脩珓筊杰榔狍閦颦缅莞杲杳眇杴杶杸杻杼枋枌枒枓衾葄翘纾逋枙狸桠枟槁枲枳枴枵枷枸橼枹枻柁柂柃柅柈柊柎某柑橘柒柘柙柚柜柞栎柟柢柣柤柩柬柮柰柲橙柶柷柸柺査柿栃栄栒栔栘栝栟柏栩栫栭栱栲栳栴檀栵栻桀骜桁镁桄桉桋桎梏椹葚桓桔桕桜桟桫椤桭杯桯桲桴桷桹湘溟梃梊梍梐潼栀枧梜梠梡梣梧梩梱梲梳梴梵梹棁棃樱棐棑棕榈簑绷蓑枨棘棜棨棩棪棫棬棯棰棱棳棸棹椁棼碗椄苕椈椊椋椌椐椑椓椗検椤椪椰椳椴椵椷椸椽椿楀匾楅篪楋楍楎楗楘楙楛楝楟楠楢楥桢楩楪楫楬楮楯楰梅楸楹楻楽榀榃榊榎槺榕榖榘榛狉莽搒笞榠榡榤榥榦榧杩榭榰榱梿霰榼榾桤槊闩槎槑槔槖様槜槢槥椠槪槭椮槱槲槻槼槾樆樊樏樑樕樗樘樛樟樠樧樨権樲樴樵猢狲桦樻罍樾樿橁橄橆桡笥龠橕橚橛辆椭橤橧竖膈跨橾橿檩檃檇柽檍檎檑檖檗桧槚檠樯檨檫檬梼槟檴檵柠棹櫆櫌栉櫜椟櫡槠栌枥榇栊櫹棂茄櫽欀欂欃欐欑栾欙棂溴欨欬欱欵欶欷歔欸欹欻欼欿歁歃歆艎歈歊莳蝶歓歕歘歙歛歜欤歠蹦诠镶蹒跚升陟歩歮歯歰歳歴璞歺瞑歾殁夭殈殍殑殗殜殙殛殒殢殣殥殪殚僵殰殳荃殷殸殹蛟殻肴谤殴毈毉喂毎毑蕈毗毘毚茛邓毧毬毳毷毹毽毾毵牦氄氆靴氉氊氇氍氐聊氕氖気氘氙氚氛氜氝氡汹焊痉氤氲氥氦铝锌氪烃氩铵痤汪浒漉痘盂碾菖蒲蕹蛭螅氵冰氹氺氽烫氾氿渚汆汊汋汍汎汏汐汔汕褟汙汚汜蓠沼秽蔑汧汨汩汭汲汳汴堤汾沄沅沆瀣沇沈葆浸沦湎溺痼疴沌沍沏沐沔沕沘浜畹砾沚沢沬沭沮沰沱灢沴沷籽沺烹濡洄泂肛泅泆涌肓泐泑泒泓泔泖泙泚泜泝泠漩馍涛粼泞藓鳅泩泫泭泯铢泱泲洇洊泾琵琶荽蓟箔洌洎洏洑潄濯洙洚洟洢洣洧洨洩痢滔洫洮洳洴洵洸洹洺洼洿淌蜚浄浉浙赣渫浠浡浤浥淼瀚浬浭翩萍浯浰蜃淀苔蛞蝓蜇螵蛸煲鲤浃浼浽溦涂涊涐涑涒涔滂莅涘涙涪涫涬涮涴涶涷涿淄淅淆淊凄黯淓淙涟淜淝淟淠淢淤渌淦淩猥藿亵淬淮淯淰淳诣涞纺淸淹炖癯绮渇済渉渋渓渕涣渟渢滓渤澥渧渨渮渰渲渶渼湅湉湋湍湑湓湔黔湜湝浈湟湢湣湩湫湮麟湱湲湴涅満沩溍溎溏溛舐漭溠溤溧驯溮溱溲溳溵溷溻溼溽溾滁滃滉滊荥滏稽滕滘汇滝滫滮羼耷卤滹浐煎漈漊漎绎漕漖漘漙沤漜漪漾漥漦漯漰溆漶漷濞潀颍潎潏潕潗潚潝潞潠潦祉疡潲潵滗潸潺潾涠澁澂澃澉澌澍澐澒澔澙渑澣澦澧澨澫澬浍澰澴澶澼熏郁濆濇濈濉濊貊濔疣濜濠濩觞浚濮盥潍濲泺瀁滢渎渖瀌浏瀒瀔濒泸瀛潇潆瀡潴泷濑瀬弥潋瀳瀵瀹瀺瀼沣滠灉灋灒漓灖灏灞灠滦灥灨滟灪蜴灮烬獴灴灸灺炁炅鱿炗炘炙炤炫疽烙钎炯炰炱炲炴炷毁炻烀烋瘴
鲳烓烔焙烜烝烳饪烺焃焄耆焌焐焓焗焜焞焠焢焮焯焱焼煁煃煆煇煊熠煍熬煐炜煕暖熏硷霾煚煝煟煠茕矸煨琐炀萁煳煺煻熀熅熇熉罴荧穹炝熘熛熜稔谙烁熤熨熯熰眶蚂颎熳熸熿燀烨燂燄盏燊燋燏燔隼燖焖燠燡灿燨燮燹燻燽燿爇爊爓爚爝爟爨蟾爯爰为爻丬爿牀牁牂牄牋窗牏牓窗釉牚腩蒡虻牠虽蛎牣牤牮牯牲牳牴牷牸牼绊牿靬犂犄犆犇犉犍犎犒荦犗犛犟犠犨犩犪犮犰狳犴犵犺狁甩狃狆狎狒獾狘狙黠狨狩狫狴狷狺狻豕狈蜘猁猇猈猊猋猓猖獗猗猘狰狞犸猞猟獕猭猱猲猳猷猸猹猺玃獀獃獉獍獏獐獒毙獙獚獜獝獞獠獢獣獧鼇蹊狯猃獬豸狝獯鬻獳犷猕猡玁菟玅玆玈珉糁禛郅玍玎玓瓅玔玕玖玗玘玞玠玡玢玤玥玦珏瑰玭玳瑁玶玷玹玼珂珇珈瑚珌馐馔珔珖珙珛珞珡珣珥珧珩珪佩珶珷珺珽琀琁陨玡琇琖琚琠琤琦琨琫琬琭琮琯琰琱琲琅琴珐珲瑀瑂瑄瑉玮瑑瑔瑗瑢瑭瑱瑲瑳瑽瑾瑿璀璨璁璅璆璈琏璊璐璘璚璝璟璠璡璥瑷璩璪璫璯璲玙璸璺璿瓀璎瓖瓘瓒瓛脐瓞瓠瓤瓧瓩瓮瓰瓱瓴瓸瓻瓼甀甁甃甄甇甋甍甎甏甑甒甓甔瓮甖甗饴蔗甙诧钜粱盎锈团甡褥産甪甬甭甮宁铠甹甽甾甿畀畁畇畈畊畋畎畓畚畛畟鄂畤畦畧荻畯畳畵畷畸畽畾疃叠疋疍疎箪疐疒疕疘疝疢疥疧疳疶疿痁痄痊痌痍痏痐痒痔痗瘢痚痠痡痣痦痩痭痯痱痳痵痻痿瘀痖瘃瘈瘉瘊瘌瘏瘐痪瘕瘖瘙瘚瘛疭瘜瘝瘗瘠瘥瘨瘭瘆瘯瘰疬瘳疠瘵瘸瘺瘘瘼癃痨痫癈癎癐癔癙癜癠疖症癞蟆癪瘿痈発踔绀蔫酵皙砬砒翎翳蔹钨镴皑鹎驹暨粤褶皀皁荚皃镈皈皌皋皒朱皕皖皘皜皝皞皤皦皨皪皫皭糙绽皴皲皻皽盅盋碗盍盚盝踞盦盩秋千盬盭眦睁瞤盯盱眙裰盵盻睐眂眅眈眊県眑眕眚眛眞眢眣眭眳眴眵眹瞓眽郛睃睅睆睊睍睎困睒睖睙睟睠睢睥睪睾睯睽睾眯瞈瞋瞍逛瞏瞕瞖眍䁖瞟瞠瞢瞫瞭瞳瞵瞷瞹瞽阇瞿眬矉矍铄矔矗矙瞩矞矟矠矣矧矬矫矰矱硪碇磙罅舫阡、矼矽礓砃砅砆砉砍砑砕砝砟砠砢砦砧砩砫砮砳艏砵砹砼硇硌硍硎硏硐硒硜硖砗磲茚钡硭硻硾碃碉碏碣碓碔碞碡碪碫碬砀碯碲砜碻礴磈磉磎硙磔磕磖磛磟磠磡磤磥蹭磪磬磴磵磹磻硗礀硚礅礌礐礚礜礞礤礧礮砻礲礵礽礿祂祄祅祆禳祊祍祏祓祔祕祗祘祛祧祫祲祻祼饵脔锢禂禇禋祦禔祎隋禖禘禚禜禝禠祃禢禤禥禨禫祢禴禸秆秈秊闱飒秋秏秕笈蘵赁秠秣秪秫秬秭秷秸稊稌稍稑稗稙稛稞稬秸稲稹稼颡稿穂穄穇穈穉穋稣贮穏穜穟秾穑穣穤穧穨穭穮穵穸窿阒窀窂窅窆窈窕窊窋窌窒窗窔窞窣窬黩蹙窑窳窴窵窭窸窗竁竃竈竑竜并竦竖篦篾笆鲛竾笉笊笎笏笐靥笓笤箓笪笫笭笮笰笱笲笳笵笸笻筀筅筇筈筎筑筘筠筤筥筦笕筒筭箸筰筱筳筴宴筸箂个箊箎箑箒箘箙箛箜篌箝箠箬镞箯箴箾篁筼筜篘篙篚篛篜篝篟篠篡篢篥篧篨篭篰篲筚篴篶篹篼箦簁簃簆簉簋簌簏簜簟簠簥簦簨簬簰簸簻籊藤籒籓籔签籚篯箨籣籥籧笾簖籫籯芾麴籵籸籹籼粁秕粋粑粔粝粛粞粢粧粨粲粳稗粻粽辟粿糅糆糈糌糍糒糔萼糗蛆蹋糢糨糬粽糯糱籴粜糸糺紃蹼鲣霉纡纨绔纫闽襻紑纰纮锭鸢鹞纴紞紟扎紩紬绂绁纻紽紾绐絁絃絅経絍绗絏缡褵絓絖絘絜绚絣螯絪絫聒絰絵绝絺絻絿綀绡綅绠绨绣綌綍綎捆綖綘継続缎绻綦綪线綮綯绾罟蝽綷縩绺绫緁绲緅緆缁绯緌緎総緑绱緖缃缄缂绵缗緤褓缌纂緪緰缑缈缏缇縁縃縄萦缙缒縏缣縕缞縚缜缟缛縠縡縢縦绦縯縰骋缧縳纤缦絷缥縻衙縿繄缫繈繊繋繐缯繖繘繙繠缋繣繨缰缲繸繻缱纁纆纇缬缵纩纑纕缵纙纚纛缾罃罆坛罋罂罎罏罖罘罛罝罠罣罥罦罨罫罭锾罳罶罹罻罽罿羂羃羇芈蕉51鸵羑羖羌羜羝羢羣羟羧羭羮羰羱羵羶羸藜鲐翀翃翅翊翌翏翕翛翟翡翣翥翦跹翪翫翚翮翯翱翽翾翿板饕鸹锨耋耇耎耏专耒耜耔耞耡耤耨耩耪耧耰鬓耵聍聃聆聎聝聡聦聱聴聂聼阈聿肄肏肐肕腋肙肜肟肧胛肫肬肭肰肴肵肸肼胊胍胏胑胔胗胙胝胠铨胤胦胩胬胭胯胰胲胴胹胻胼胾脇脘脝脞脡脣脤脥脧脰脲脳腆腊腌臜腍腒腓胨腜腠脶腥腧腬腯踝蹬镣腴腶蠕诽膂腽嗉膇膋膔腘膗膙膟黐膣膦膫膰膴膵膷脍臃臄臇臈臌臐臑臓膘臖臙臛臝臞臧蓐诩臽臾臿舀舁鳑鲏舋舎舔舗馆舝舠舡舢舨舭舲舳舴舸舺艁艄艅艉艋艑艕艖艗艘艚艜艟艣舣艨艩舻艬艭荏艴艳艸艹艻艿芃芄芊萰陂藭芏芔芘芚蕙芟芣芤茉芧芨芩芪芮芰鲢芴芷芸荛豢芼芿苄苒苘苙苜蓿苠苡苣荬苤苎苪镑苶苹苺苻苾茀茁范蠡萣茆茇茈茌茍茖茞茠茢茥茦菰茭茯茳藨茷藘茼荁荄荅荇荈菅蜢鸮荍荑荘豆荵荸荠莆莒莔莕莘莙莚莛莜莝莦莨菪莩莪莭莰莿菀菆菉菎菏菐菑菓菔芲菘菝菡菢菣菥蓂菧菫毂蓥菶菷菹醢菺菻菼菾萅萆苌萋萏萐萑萜萩萱萴莴扁萻葇葍葎葑荭葖葙葠葥苇葧葭药葳葴葶葸葹葽蒄蒎莼茏薹莅蒟蒻蒢蒦蒨蒭藁蒯蒱鉾蒴蒹蒺蒽荪蓁蓆蓇蓊蓌蓍蓏蓓蓖蓧蓪蓫荜跣藕苁蓰蓱莼蓷蓺蓼蔀蔂蔃蔆蔇蔉蔊蔋蔌蔎蔕蔘蔙蒌蔟锷蒋雯茑蔯蔳麻蔵蔸蔾荨蒇蕋蕍荞蕐蕑芸莸蕖蕗蕝蕞蕠蕡蒉蕣蕤蕨蕳蓣蕸蕺蕻薀薁薃薅薆荟薉芗薏薐蔷薖薘剃谔钗薜薠薢薤薧薨薫薬薳薶薷薸薽薾薿藄藇藋荩藐藙藚藟藦藳藴苈藷藾蘀蘁蕲苹蘗蘘蘝蘤蘧蘩蘸蘼虀虆虍蟠虒虓虖虡虣虥虩虬虰蛵蛇虷鳟虺虼蚆蚈蚋蚓蚔蚖蚘蚜蚡蚣蚧蚨蚩蚪蚯蚰蜒蚱蚳蚶蚹蚺蚻蚿蛀蛁蛄蛅蝮蛌蛍蛐蟮蛑蛓蛔蛘蛚蛜蛡蛣蜊蛩蛱蜕螫蜅蚬蜈蝣蜋蜍蜎蜑蠊蜛饯蜞蜣蜨蜩蜮蜱蜷蜺蜾蜿蝀蝃蝋蝌蝍蝎蝏蝗蝘蝙蝝鲼蝡蝤蝥猿蝰虻蝲蝴蝻螃蠏蛳螉螋螒螓螗螘螙螚蟥螟螣螥螬螭䗖螾螀蟀蟅蝈蟊蟋蟑蟓蟛蟜蟟蟢虮蟨蟪蟭蛲蟳蛏蟷蟺蟿蠁蠂蠃虿蠋蛴蠓蚝蠗蠙蠚蠛蠜蠧蟏蠩蜂蠮蠰蠲蠵蠸蠼蠽衁衄衄衇衈衉衋衎衒同衖胡衞裳钩衭衲衵衹衺衿袈裟袗袚袟袢袪袮袲袴袷袺袼褙袽裀裉袅裋夹裍裎裒裛裯裱裲裴裾褀褂褉褊裈褎褐褒褓褔褕袆褚褡褢褦褧褪褫袅褯褰褱裆褛褽褾襁褒襆裥襉襋襌襏襚襛襜裣襞襡襢褴襦襫襬襭襮襕襶襼襽襾覂覃覅霸覉覊覌覗觇覚覜觍觎覧覩觊觏覰観觌觔觕觖觜觽觝觡酲觩觫觭觱觳觯觷觼觾觿言赅讣訇訏訑訒诂讬訧訬訳訹证訾詀詅诋毁詈詊讵詑诒诐詗诎察詨诜詶詸詹詻诙诖誂誃诔锄诓誋诳诶悖誙诮诰誧説読誯谇訚谄谆諆諌诤诹诼諕谂谀諝谝諟喧谥諴諵谌谖誊謆謇歌謍謏謑谡谥謡謦謪谪讴謷謼谩哗譅譆譈譊讹譒撰谮鑫譞噪譩谵譬譱譲谴譸譹谫讅讆詟䜩雠讐谗谶讙谠讟谽豁豉豇岂豊豋豌豏豔豞豖豗豜豝豣豦豨豭豱豳豵豶豷豺豻貅貆狸猊貔貘䝙貜貤餍贳餸贶贲赂賏赊赇赒賝赓赕賨赍斗賮賵賸赚赙赜赟贉赆赑贕赝赬赭赱赳迄趁趂趄趐趑趒趔趡趦趫趮趯趱趴趵趷趹趺趿跁跂跅跆踬跄跐跕跖跗跙跛跦跧跩跫跬跮跱跲跴跺跼跽踅踆踈踉踊踒踖踘踜踟躇蹰踠踡踣踤踥踦踧跷踫踮逾踱踊踶踹踺踼踽躞蹁蹂躏蹎蹐蹓蹔跸蹚蹜蹝迹蹠蹡蹢跶蹧蹩蹪蹯鞠蹽躃躄躅踌跻躐踯跞躘躙躗躝躠蹑躜躧躩躭躰躬躶軃軆辊軏轫軘軜軝腭転軥軨軭軱轱辘軷轵轺軽軿輀輂辇辂辁輈挽輗辄辎辋輠輤輬輭輮辏輴輵輶輹輼辗辒轇轏轑轒辚轕轖轗轘轙轝轞轹轳罪辣辞辵辶辺込辿迅迋迍麿迓迣迤逦迥迨迮迸迺迻迿逄逅逌逍逑逓迳逖逡逭逯逴逶逹遄遅侦遘遛遝遢遨遫遯遰遴绕遹遻邂邅邉邋邎邕邗邘邛邠邢邧邨邯郸邰邲邳邴邶邷邽邾邿郃郄郇郈郔郕郗郙郚郜郝郞郏郠郢郪郫郯郰郲郳郴郷郹郾郿鄀鄄郓鄇鄈鄋鄍鄎鄏鄐鄑邹邬鄕郧鄗鄘鄚鄜鄞鄠鄢鄣鄤鄦鄩鄫鄬鄮鄯鄱郐鄷鄹邝鄻鄾鄿酃酅酆酇郦酊酋酎酏酐酣酔酕醄酖酗酞酡酢酤酩酴酹酺醁醅醆醊醍醐醑醓醖醝酝醡醤醨醪醭醯醰酦醲醴醵醸醹醼醽醾釂酾酽釆釈鲈镏阊钆钇钌钯钋鼢鼹钐钏釪釬釭釱钍釸钕钫鈃钭鈆鈇钚鈊鈌钤钣鈒鈤钬钪鈬铌铈钶铛钹铍钸钿鉄鉆铊铇鉌铋鉏铂钷铆钵鉥钲鉨钼钽鉱鉲鉶铰铒鉼铪銍銎铣銕镂铫铦铑铷銤铱铟銧铥铕铯銭銰焊銶锑锉汞鋂锒鋆鋈鋊铤鋍铗鋐鋑鋕鋘鋙锊锓锔锇铓鋭铖锆锂铽鋳鋹鋺鉴镚钎錀锞锖锫锩錍铔锕錔锱铮锛錞锬锜錤錩錬録铼錼锝钔锴鍉镀鍏鍐铡鍚锻锽锸锲锘鍫鍭鍱鍴锶鍹锗针锺锿镅鎉鎋鎌鎍鎏鎒鎓鎗镉鎚鎞镃鎤铩锼鎭鎯镒镍鎴镓鎸鎹镎镟鏊镆镠镝鏖铿锵鏚镗镘镛鏠鏦錾镤鏸镪鏻鏽鏾铙鐄鐇鐏铹镦镡鐗馗镫镢镨鐡锎镄鐩镌鐬鐱镭鐶鐻鐽镱鑀鑅镔鑐鑕鑚鑛鑢鑤镥鑪镧鑯鑱鑴鑵镊镢钃镻闫闬闶闳閒闵閗閟阂関合閤哄阆閲阉閺阎阏阍阌暗闉阕阗闑闒闿闘闚阚闟闠闤闼阞阢阤阨阬阯阹阼阽陁陑陔陛陜陡陥陬骘陴険陼陾阴隃隈隒隗隞隠隣隤隩隮隰颧隳隷隹雂雈雉雊雎雑雒雗雘雚雝雟雩雰雱驿霂霅霈霊沾霒霓霙霝霢霣霤霨霩霪霫霮靁叇叆靑靓靣腼靪靮靰靳靷靸靺靼靿鞀鞃鞄鞍鞗鞙鞚鞝鞞鞡鞣鞨鞫鞬鞮鞶鞹鞾鞑韅鞯驮韍韎韔韖韘韝韫韡韣韭韭韱韹韺頀刮頄顸顼頍颀颃颁頖頞頠頫頬颅頯頲颕頼悴顋顑颙颛颜顕顚顜颟顣颥颞飐飑台飓颸飏飖颽颾颿飀飂飚飌翻飡飣饲飥饨饫飮飧飶餀餂饸饹餇餈饽哺馂餖餗餚馄馃餟餠餤餧餩餪餫糊餮糇餲饧馎糕饩馈馊馌馒饇馑馓膳饎饐饘饟馕馘馥馝馡馣骝骡馵馹駃駄駅駆駉駋驽駓驵駗骀驸駜骂骈駪駬骃駴骎駹駽駾騂騄骓騆騉騋骒骐麟騑騒験騕骛騠騢騣騤騧骧騵驺骟騺蓦骖骠骢驆驈骅驌骁驎骣驒驔驖驙驦驩驫骺鲠骫骭肮骱骴骶骷髅骾髁髂髄髆膀髇髑髌髋髙髝髞髟髡髣髧髪髫髭髯髲髳髹髺髽髾鬁鬃鬅鬈鬋鬎鬏鬐鬑鬒鬖鬗鬘鬙鬠鬣斗鬫鬬阄鬯鬰鬲鬵鬷魆魈魊魋魍魉魑魖鳔魛魟魣魦魨魬鲂魵魸鮀鲅鮆鲧鲇鲍鲋鮓鲒鲕鮟鱇鮠鮦鮨鲔鲑鮶鮸鮿鲧鯄鯆鲩鯈鲻鯕鲭鲞鯙鯠鲲鯥鲰鲶鳀鯸鳊鲗䲠鹣鳇鰋鳄鳆鰕鰛鰜鲥鰤鳏鰦鳎鳐鳁鳓鰶鲦鲡鰼鰽鱀鱄鳙鱆鳕鱎鱐鳝鳝鳜鲟鲎鱠鳣鱨鲚鱮鱲鱵鱻鲅鳦凫鳯鳲鳷鳻鴂鴃鴄鸩鴈鴎鸰鴔鴗鸳鸯鸲鹆鸱鴠鴢鸪鴥鸸鹋鴳鸻鴷鴽鵀鵁鸺鹁鵖鵙鹈鹕鹅鵟鵩鹌鵫鵵鵷鵻鹍鶂鶊鶏鶒鹙鶗鶡鶤鶦鶬鶱鹟鶵鶸鶹鹡鶿鹚鷁鷃鷄鷇䴘䴘鷊鷏鹧鷕鹥鸷鷞鷟鸶鹪鹩鷩鷫鷭鹇鹇鸴鷾䴙鸂鸇䴙鸏鸑鸒鸓鸬
鹳鸜鹂鹸咸鹾麀麂麃麄麇麋麌麐麑麒麚麛麝麤麸面麫麮麯麰麺麾黁黈黉黢黒黓黕黙黝黟黥黦黧黮黰黱黪黶黹黻黼黾鼋鼂鼃鼅鼈鼍鼏鼐鼒冬鼖鼙鼚鼛鼡鼩鼱鼪鼫鼯鼷鼽齁齆齇齈齉齌赍齑龀齕齗龅齚龇齞龃龉龆齢出齧齩齮齯齰齱齵齾厐龑龒龚龖龘龝龡龢龤' 18 | 19 | traditional_characters = '制咖片型超聲盤鑒定仔點他命書歌粉巾字帳恤手指記憶棒形轉彎溝光○〇㐄㐅㐆㐌㐖毒㐜㐡㐤㐰㐺㑇㑳㒳㒸㔾㗂㗎㝵㞎㞙㞞㠯㢲㢴㤅㥁㥯㨗㫺㬎㮎㮚㮸㲋㲱㲾㳮㵎㵪㶸㷖㷭㹢㹴犬㺢狓㺵㼝㽮㿝䍃䔢䖟䖸䗈䗥䗪䝓䠶䥯䦉䯝䰾魚䲔䳗䳘䵹鼄䶑一對應映射丁不識下兒子做二休世丘之貉並中台原則串為甚謂乾淨了百事無成八變五十些人得道雞升天代如併來去個國政策勁幽靈在歐洲遊蕩接樣蘿蔔坑側化傳價元論醇共再准刀兩斷切分耕耘收穫錢貨物向看舊就緒險刻千金動勞永逸匙零夜半卡通回復返影蹤反常態口咬氣句話同吐快吹周味呼諾嗚品紅鍋哄而散起唱和問三知生熟團漆黑火糟堆場空塊麵塌糊塗塵染壁廂夔已足多情露水大早到晚夫妻當關萬莫開失古恨套所料既往孔見提師要家主審寸陰難買鬥牛小撮部陣局展身層巴掌帆風順席地帶過年計於春頭載四季期被蛇怕井繩度願式份彈頃深前律徑心意念差愁孤行俱全房廳交遮打技長把抓死拿眼淚鼻涕鑰鎖折段抿拍即合掃排掬揮撥擁上入擊洞擲攬改故轍敗文值名斑方面旁族日秋餐隔雅里終父旦時晌會霎間晃暴寒曝更月望垠際朝夕本正經利杯羹東西板枝獨秀根筋桿進條龍服務概模次函數又性程總付步腳印趨登毛拔呵氧氮碳決雌雄波未平派謊言流清楚白準溜煙潭有獲聞是處降琴鶴甲病發可拾沙目然瞭直以相眨穿睹瞥瞬矢的解石鳥神教秉虔誠秘種窩蜂窮竅笑置筆苟勾銷抹殺煞等獎箍節吃箭仇雙鵰詩籌籮筐系列紙級士官統絲毫掛維網盡線微吭響股腦胎脈承腔臂力致效資源址器舉功投般說講規貿易葉障著慎滿皆輸號木電池衣傾鐘高低視仁覺醒覽遺角銀幣觸潰九鼎蔽抄出駟馬追重語破貧洗貫走路安蹴至幾蹶振躍役膽汗較輩輪辭贊退六連遍遞邊針血錘音錯門思閃真倒項栽霧類保護川先驚乍體鬨鱗爪鳴滴泡鄰域黨專鼓作齊炒丑烯亥克內酯冬加奴卯肝炎基尺梁街褲鎬客寵庭巳汝昌烷玲磊糖肇酉醛啷青縣韙良香骨鯛丂七集河市弦喜嘴張舌堵區工業姊妹星架構巧彩扭歪拼湊餘熱曜武州爺浮屠美鄉老階樹葷素碎落能魄鰓鰻珠丄丅丆万俟丈尚摸母娘量管群亞虎必我堂令申件裝伏位博俠義界表女墟臺戲臭皮匠勝諸葛亮賽頂倍催請運算包立叉戟離疫苗土史志演圍揭瓦曬夷姑婆帝村寶爛尖杉鹼屜桌山岔島由紀峽壩庫鎮廢從德後拗湯治旬食明昧曹朋友框欄極權冪曲歸依貓民氟硼氯磷鐵江侗自旅法司洋浦梅園溫暖灣焦班幸用田略番疊皇炮捶硝苯酸腺苷稜草鏡穗跳遠索錦綱聚氰胺聯店胚膲愛色堇紫羅蘭芝茶飯菱雲蟲藏藩亂叛蘇親債凳學座恐戀柱測肌腹衩錐係貂企烏跪叩軍車農題迭都甘油屯奏鍵短阿姨陪姐隻顧茅廬槽駕魂鮮鹿頁其菜單乘任供勢午齒漢組織吊調瀉唇坡城報墳外夸將尉建築岸崗公床揚新劍昇杭林栗校樓標款汽社浣海商館劇院鋼華港機械廣媒環球融第醫科證券綜財樂育游漲猶嶺疏癮瞼確兵領導繳肢膛船艾瑟爾蒼蔡虞傚衫覆訪訴課諭議軌述野鉤限敵鞋頜頷顎饒首齦站例修凡劃垂屆屬崽頦廚拜挫擺放旋削棋榻檻禮沉注滑營獄畫确儀聘花葬詔員跌轄週達酒錨閘陷陸雨雪飛威丌于丹久乏予理評產亢卑亦乎舞己悲矩圓詞害誌但住佞佳便俗信票案幅翁倦倫假偏倚斜虧鬼敲停備傷脾胃僅此像儉匱免宜穴焉戴兼容許凍伯仲負彼晝皂軒輊實刊划顛衛戰哥比省非好黃飾別拘束掩奶睬選擇搖擾煩苦枚寫協厭及格受歡迎約只估侵犯割狀告或缺抗拒挽撤救藥喻磨滅端倪少逆逾越避靠適吉譽吝玉含延咎歹聽啻淵善謀均勻堪忍夠太惹妙妥妨孕症孝術室完納推冠積宣疑辯慄碴稱屈撓屑干涉衡待很忙惡忿怎麼怠急恥恭息悅惑惜惟想愉愧怍慌憤啟懂懈懷材才緊招認扣抵拉捨也罷插揣冒搭撞南牆擴核支攻敢雷攀敬裡嗎需景智暇曾罪遇朽枉止況競爭辱求癒渝溶濟左右袒困補爽特寂寞示弱找謝畏強疾徐痛癢冤符眠睦瞅董何厚云措活疲羞者輕玻璃祥兆禁移稂莠穩佛換答簡結果盟絕縷途給談否羈翼耐肖脛毋寧興舒若菲萊痕跡窠臼虛衰臉兔撒鷹棺範該詳諱抬泰讓鬚眉象眾貲賬費灰賴奇慮訓輟辨菽麥辛近送透逞徒速續逮捕遂遑違遜斧鉞艱醉鏽隨觀棄顯飽脂肪使丏丐幫丒且慢末丕替桃宗王尊涼爵各圖屋脊糧署錄壇吾祿職胄襲君廈丗北壑桐疹損逢陵鷸丙寅戌氨腈唑綸辰酮脫氫酶醚丞丟現掉紗帽弄扯砲碗丠両丣坐存激肩臻蒂蓮悖序驅丨丩丫挺杈髻鬟細介俄伊犁京尼布訂普渡央委監察檢查劑圈設警隊斯督剩震境航舶革防托播促質版蠑螈鋒研藝歷殘消頻譜精密製造陲郵候埔堅壓壢凹匯執府究邦俘攝寮彬狼嶽肺腫庸英訊診埋粒胞括控碼韓暑槍樞砥澳哇牟壽甸鑽探篇簽綴縫繼耳肯照婦埃懸璧軸櫃檯辣擱淺邪跑纖阮陽私囊魔丮丰姿采丱燒丳丵丶丷丸參寨朗桂瑞砂衷霞貌鳳僕艦因嫌宰峰幹絡牌持旨祭禱簿編罰賓辦丼丿乀乂乃乄仰慕盛曠留考驗闊乆乇么醜麼乊湖燃乑乒乓乕乖僻忤戾离謬迕乗危肥劫除隙浪婿乙炔腸酰吡咯鹽乚乛乜嘢卿玄宮尾狐龜塔嶷兄弟泉章霄釘耙乞扎哀憐恕討乢乣乤乥乧乨乩童乪乫乭乳暈汁液瑤漿牙癌突竇罩腐膠豬酪蛋糕菌瘤乴乵乶乷乸乹乺乼乾俸冰嘉噦嚎坤媽屍壘旱枯涸俐渴潮澀煸豆燥爹瘦癟癬瞪袋脆薑貝隆餾乿亀亁叫咕攘扔搞男砸竄蓬麻亃亄亅卻亇遲典今臨繁累卵奉婚聰躬巨與遷添裂副宿歲怪噁尕崙愣杆硅硫鈦鈾錳芑雜異鈉砷胂磺琥珀艙棍簧胡茬盜浩盆販郎腿亍洪亐互欠助勉惠操斥諉繫戶譯亓墓碑刑鈴卅渠繽紛斗米旗憲釩燈徽瘟祖拳福穀豐臟腑綁肉醃苓蘊橋鋪霸顏鬧判噴岡底蛙陘礦亖亙亜罕們娜桑那努哈喀弗烈曼松森杜氏盃奧琛敦戊穆聖裔彙薛孫亟亡佚虜羊牢奮釋卷卸契媾感額睫纏誼趾塞擠紐阻還配馳莊亨洛祚亪享津滬畿郊慈菴枇杷膏亭閣鋥麗亳亶亹誅初責翻瘋偶傑叢稠妖拖寰居吸授慧蝸吞壯魅狗矛盾益渣患憂稀描猿夢暫涯畜禍緣沸搜引擎臣橫紜誰混援蒸獸獅稅剖亻亼亽亾什獻剎邡麽仂仃仄仆富怨仈仉畢昔晨殼紹仍仏仒仕宦仗欺恃腰嘆歎炬梓訖施仙后瓊逝仚仝仞仟悔仡佬償填泊拓撲簇羔購頓欽佩髮棻閫馭養億儆尤藉幀賑凌敘帖李柔剛沃眥睚戒訛取饗讀仨仫仮著泳臥躺韶夏裁仳仵唯賢憑釣誕仿似宋彿諷伀碩盼鵝伄儅伈伉儷柯始娃邁戈坦堡帕茨薩廟瑪莉莎藤霍姆伋伍奢胥廷芳豪伎倆侍汛勒希羲雛伐憩整謨閑閒伕伙伴頤伜伝伢叔恆茲恩翰伱伲侶伶俜悧鼬伸懶縮喇叭伹伺伻伽倻輻伾佀佃佇佈喬妮墨佉盧佌貸劣廉昂檔濃矮傘窪緩耗胸谷迷擋率齲宅沫舍療佐貳佑佔優據鏵嘗呢須魯曉佗佘余坪寺瓜銃僧蒙芒陀龕哼嘔坊姦孽弊揖祟繭縛誓賊佝僂瞀佟你奪趕佡佢佣佤佧賈佪佫佯佰佱潔績釀餚佴捲佶佷佸佹佺佻佼佽佾具喚窘壞娛怒慨硬習慣聾膨脹蔓駭貴痺侀侁侂侃侄侅鴻燕侇侈糜靡侉侌妾侏儒倉鼠侐侑侔侖侘侚鏈侜偎傍鈷循柳葫蘆附価侮罵蔑侯岩截蝕侷貼壺嬛宴捷攜桶箋酌俁狹膝狄俅俉俊俏俎俑俓俔諺俚俛黎健呈固墒增守康箱濕祐鏢鑣槓盒靖膜齡俞豹獵噪孚封札筒託衍鴿剪撰稿煉廠禊練繕葺俯瞰撐衝俲俳俴俵俶俷俺俻俾倀倂倅儲卒惶敷猝逃頡蓄崇隱倌倏忽刺蠟燭噍嚼坍扁抽斃蔥楣灌灶糞背藪賣賠閉霉騰倓倔倖倘倜儻倝借箸挹澆閱倡狂倢倣値倥傯倨傲倩匡嗣沖柝珍倬倭寇猩倮倶倷倹勤讚偁偃充偽吏嗓寐惺扮拱芫茜藉虢鈔偈偉晶偌宕距析濾殿疼癱註頗偓偕鴨歇滯偝偟偢忘怡旺偨偩偪偫偭偯偰偱偲偵緝蹄偷減惰漏窺竊偸偺迹傀儡傅傈僳傌籬傎奎琳迪叟芭傒傔傕傖悉荒傜傞傢傣芽逼傭婢傮睨寄檄誦謠頌傴擔辜弓慘蒿悼疤傺傻屄臆巢洩篋羨蓋軋頹傿儸僄僇僉僊働僎僑僔僖僚僝僞僣僤僥僦猴僨僩僬僭僮僯僰僱僵殖籤靜僾僿征隴儁儂儃儇儈朴薄儊儋儌儍儐儓儔儕儗儘儜儞儤儦儩汰哉寡渥裕酷儭儱罐儳儵儹儺儼儽兀臬臲鷲允勛勳宙宵帥憝彞諧嫂鬩暢沛溢盈飢赫兇悍狠猛頑愚妣斬秦遣鞭耀敏榮槃澤爆碟磁禿纜輝霽鹵朵婁孜烽醬勃汀箕裘鉗耶懞蕾徹兌軟遭黜兎児韻媳爸兕觥兗兙兛兜售鍪肚兝兞兟兡兢兣樽殮涅睡稟籍贅泌啡肽奸幕涵澇熵疚眷稃襯訌赴煥椒殲植跏沒試誤猜棲窗肋袖頰兪卦撇鬍岐廓轎疸楓茴瓏廁秩募勺噸寓斤曆畝迫筷釐最淫螺韜兮寬匪篩襄贏軛複兲詐刃堰戎痞蟻餉它冀鑄冂冃円冇冉冊嫁厲礪竭醮冏牧冑冓冔冕冖冗冘冞冢窄抑誣冥冫烘菇蟄冷凝坨橇淇淋炭餅磚磧窖醋雕雹霜冱冶爐艷嘲峻灘淡漠煖颼飲冼冽凃凄愴梗凅凇凈凊凋敝濛凔凜遵汞脢凞几凢処凰凱凵凶焰凸摺刷紋預喪嘍奔巡榜殯芙蓉租籠輯鞘萃凼鋸鑊刁蠻刂娩崩批拆攤掰櫱驟歧顆秒袂贓勿囑忌磋琢膚刈羽刎訟戮舂槳艇刓刖霹靂刜創犢刡恙墅幟筵緻刦刧刨昏默攸尿慾薰潤薰圭刪刮痧鏟刱刲刳刴刵踏磅戳柏槐繡芹莧蝟舟銘鵠鶩刼剁剃辮剄剉履鉛剋剌姻咽哨廊掠桅沿召瞻翅趙卜渺茫郭剒剔剕瀝剚愎毅訥纔剜剝啄採剞剟剡剣剤綵剮腎駛黏剰袍剴紊剷剸剺剽剿劁劂劄劈啪柴扳啦劉奭姥夼昫涓熙禪禹錫翔雁鶚劊劌弩柄蜻蛉劒劓劖劘劙瀾簣賞磯釜晉甜薪逐劦熔紂虐赤囚劬劭労劵効劻劼劾峭艮勅勇勵勍勐臘脖龐漫飼盪粥輒勖勗勘驕餒碌泮雇捐竹騎殊阱勣樸懇謹勦勧勩勯勰勱勲勷勸懲慰誡諫勹芡踐闌匁庇拯粟紮袱裹餃匆遽匈匉匊匋匍匐莖匏匕妝痰膿蛹齋苑烤蹈塘羌熊閥螳螂疆碚竿緯荷茵邙魏匚匜匝匟扶稷匣匭攏匸匹耦匽匾匿卂叮瘡禧軫堤棚
迢鈞鍊卄卆遐卉瓷盲瓶噹胱腱裸卋卌卍卐怯污賤鄙齷齪陋卓溪唐梯漁陳棗泥漳潯澗梨芬譙贍轅迦鄭単驢弈洽鰲卛占筮卝卞卟吩啉屎翠厄卣卨卪卬卮榫襖璽綬鈕蚤懼殆篤聳卲帘帙繞卹卼卽厂厎厓厔厖厗奚厘厙厜厝諒厠厤厥厪膩孢厮厰厳厴厹厺粕垢蕪菁厼厾叁悟茸薯叄吵笄悌哺譏坫壟弧芯杠潛嬰芻袁詰貪諜煽饋駁収岳締災賄騙叚叡吻攔蘑蜜訣燧玩硯箏椎藺銅逗驪另覓叨嘮謁杵姓喊嚷囂咚嚀塑尋惱憎擦祇泣滲蝠叱吒咄咤喝籀黛舵舷叵叶鐸懿昭穰苴遼叻叼吁塹嫖賭瞧爬衆抒吅吆夥巹橡滌抱縱摩郡唁墜扇籃膀襪頸吋愾諮酬哭妓媛暗錶韁邇妃羿絮蕃渾拐葵暮隅吔吖啶嗪戚吜嗇噬嚥吟哦詠吠吧唧嗒咐吪雋咀徵燐苞茹鈣哧吮吰吱嘎吲哚吳棟嬌窟孟簫忠晗淞闔閭趼宇吶睛噓拂捧疵熄竽笛糠吼吽呀呂韋矇呃呆笨呇貢呉罄呋喃呎呏呔呠呡癡呣呤呦呧瑛眩扒晬淑姬瑜璇鵑呪呫嗶嚅囁呬呯呰呱呲咧噌鈍呴呶呷呸呺呻哱咻嘯嚕籲坎坷邏呿咁咂咆哮咇咈咋蟹煦珅藹咍咑咒詛咔噠嚓咾噥哩喱咗咠咡咢咣咥咦咨嗟詢咩咪咫嚙齧咭咮咱咲咳嗆嗽咴咷咸咹咺咼喉咿婉慟憫賦矜綠茗藍哂搶瞞哆嗦囉噻啾濱彗哋哌哎唷喲哏哐哞哢哤哪裏哫啼喘哰哲萎蚌哳哶哽哿唄唅唆唈唉唎唏嘩堯棣殤璜睿肅唔睇唕唚唞唣喳唪唬唰喏唲唳唵嘛唶唸唹唻唼唾唿啁啃鸚鵡啅埠棧榷祺舖鞅飆啊啍啎啐啓啕啖啗啜啞祈啢啣啤啥啫啱啲啵啺饑啽噶崑沁喁喂喆裙喈嚨喋喌喎喑喒喓喔粗喙幛慶滋鵲喟喣喤喥喦喧騷喨喩梆喫葡萄喭駝挑嚇碰樅瓣純皰藻趟鉻喵営喹喺喼喿嗀嗃嗄嗅嗈嗉嗊嗍嗐嗑嗔詬嗕嗖嗙嗛嗜痂癖嗝嗡嗤嗥嗨嗩嗬嗯嗰嗲嗵嘰嗷嗹嗾嗿嘀嘁嘂嘅惋嘈峪禾蔭嘊嘌嘏嘐嘒嘓嘖嘚嘜嘞嘟囔嘣嘥嘦嘧嘬嘭這謔嚴敞饞鬆嘵嘶嘷嘸蝦嘹嘻嘽嘿噀噂噅噇噉噎噏噔噗噘噙噚噝噞噢噤蟬皿噩噫噭噯噱噲噳嚏涌灑欲巫霏噷噼嚃嚄嚆抖嚌嚐嚔囌嚚嚜嚞嚟嚦嚬嚭嚮嚯嚲嚳飭按竣苛嚵嚶囀囅囈膪謙囍囒囓囗囘蕭酚飄濺諦囝溯眸紇鑾鶻囟殉囡団囤囥囧囨囪囫圇囬囮囯囲図囶囷囸囹圄圉擬囻囿圀圂圃圊粹蠹赦圌墾圏滾鯡鑿枘圕圛圜圞坯埂壤骸炕祠窯豚紳魠鯪鱉圧握圩圪垯圬圮圯炸岬幔毯祇窨菩溉圳圴圻圾坂坆沾坋坌舛壈昆墊墩椅坒坓坩堝坭坰坱坳坴坵坻坼楊掙涎簾垃垈垌垍垓垔垕垗垚垛垝垣垞垟垤垧垮垵垺垾垿埀畔埄埆埇埈埌殃隍埏埒埕埗埜埡埤埦埧埭埯埰埲埳埴埵埶紼埸培怖樁礎輔埼埽堀訶姪廡堃堄摧磐貞韌砌堈堉堊堋堌堍堎堖堙堞堠礁堧堨輿堭堮蜓摘堲堳堽堿塁塄塈煤塋棵塍塏塒塓綢塕鴉沽虱塙塚塝繆塡塢塤塥塩塬塱塲蟎塼塽塾塿墀墁墈墉墐夯増毀墝墠墦漬缽墫墬墮墰墺墻櫥壅壆壊壌壎壒榨蒜壔壕壖壙壚壜壝壠壡壬壭壱売壴壹壻壼寢壿夂夅夆変夊夌漱邑夓腕泄甥禦骼夗夘夙袞瑙妊娠醣梟珊鶯鷺戧幻魘夤蹀祕擂鶇姚宛閨嶼庾撻拇賛蛤裨菠氅漓撈湄蚊霆鯊箐篆篷荊肆舅荔鮃巷慚骰辟邱鎔鐮阪漂燴鯢鰈鱷鴇臚鵬妒峨譚枰晏璣癸祝秤竺牡籟恢罡螻蠍賜絨御梭夬夭砣榆怙枕夶夾餡奄崛葩譎奈賀祀贈奌奐奓奕訢詝奘奜奠奡奣陶奨奩魁奫奬奰媧孩貶隸酥宄狡猾她奼嫣妁氈荼皋膻蠅嬪妄妍嫉媚嬈妗趣妚妞妤礙妬婭妯娌妲妳妵妺姁姅姉姍姒姘姙姜姝姞姣姤姧姫姮娥姱姸姺姽婀娀誘懾脅娉婷娑娓娟娣娭娯娵娶娸娼婊婐婕婞婤婥谿孺婧婪婬婹婺婼婽媁媄媊媕媞媟媠媢媬媮媯媲媵媸媺媻媼眯媿嫄嫈嫋嫏嫕嫗嫘嫚嫜嫠嫡嫦嫩嫪毐嫫嫬嫰嫵嫺嫻嫽嫿嬀嬃嬅嬉耍嬋痴豔嬔嬖嬗嬙嬝嬡嬢嬤嬦嬬嬭幼嬲嬴嬸嬹嬾嬿孀孃孅孌孏曰癲屏孑孓雀孖斟簍謎摺孛矻鳩崮軻祜鸞孥邈毓棠臏孬孭孰孱孳孵泛罔銜孻孿宀宁宂拙株薇掣撫琪瓿榴謐彌宊濂祁瑕宍宏碁宓邸讞実潢町宥宧宨宬徵崎駿掖闕臊煮禽蠶宸豫寀寁寥寃簷庶寎暄磣寔寖寘寙寛寠苫寤肘洱濫蒗陝覈寪弘綽螽寳擅疙瘩晷対檐専尃尅贖絀繚疇釁尌峙醌襟痲碧屁昊槌淘恵瀑牝畑莓缸羚覷蔻髒躁尒尓銳尗尙尜尟尢尥尨尪尬尭尰擒尲尶尷尸尹潽蠖蛾尻釦梢蚴鰭脬蹲屇屌蚵屐屓挪屖屘屙屛屝屢屣巒嶂巖舄屧屨屩屪屭屮戍駐鉀崖嵛巔旮旯楂欖櫸芋茱萸靛麓屴屹屺屼岀岊岌岍阜岑彭鞏岒岝岢嵐岣岧岨岫岱岵岷峁峇峋峒峓峞峠嵋峩峯峱峴峹峿崀崁崆禎崋崌崍嶇崐崒崔嵬巍螢顥崚崞崟崠崢巆崤崦崧殂崬崱崳崴崶崿嵂嵇嵊泗嵌嵎嵒嵓嵗嵙嵞嵡嵩嵫嵯嵴嵼嵾嶁嶃嶄晴嶋嶌嶒嶓嶔嶗嶙嶝嶞嶠嶡嶢嶧嶨嶭嶮嶰嶲嶴嶸巂巃巇巉巋巌巓巘巛滇芎巟巠弋迴巣巤炊擘蜥蟒蠱覡巰蜀彥淖杏茂甫楞巻巽幗巿帛斐鯽蕊帑帔帗帚琉汶帟帡帣帨帬帯帰帷帹暆幃幄幇幋幌幏幘幙幚幞幠幡幢幦幨幩幪幬幭幯幰遙蹉跎餘庚鑑幵幷稚邃庀庁広庄庈庉笠庋跋庖犧庠庤庥鯨庬庱庳庴庵馨衢庹庿廃廄廆廋廌廎廏廐廑廒廕廖廛廝搏鑼廞弛袤廥廧廨廩廱綿踵髓廸廹甌鄴廻廼廾廿躔弁皺弇弌弍弎弐弒弔詭憾薦弝弢弣弤弨弭弮弰弳霖繇燾斌旭溥騫弶弸弼弾彀彄彆纍糾彊彔彖彘彟彠陌彤貽彧繪虹彪炳彫蔚鷗彰癉彲彳彴彷彷徉徨彸彽踩斂旆徂徇徊渭畬鉉裼従筌徘徙徜徠膳甦萌漸徬徭醺徯徳徴潘徻徼忀瘁胖燎怦悸顫扉犀澎湃砰恍惚絞隘忉憚挨餓忐忑忒忖応忝忞耿忡忪忭忮忱忸怩忻悠懣怏遏怔怗怚怛怞懟黍訝怫怭懦怱怲怳怵惕怸怹恁恂恇恉恌恏恒恓恔恘恚恛恝恞恟恠恣恧眄恪恫恬澹恰恿悀悁悃悄悆悊悐悒晦悚悛悜悝悤您悩悪悮悰悱悽惻悳悴悵惘悶悻悾惄愫鍾蒐惆惇惌惎惏惓惔惙惛耄惝瘧濁惥惦惪惲惴惷惸拈愀愃愆愈愊愍愐愑愒愓愔愕愙氓蠢騃昵愜赧愨愬愮愯愷愼慁慂慅慆慇靄慉慊慍慝慥慪慫慬慱慳慴慵慷慼焚憀灼鬱憃憊憋憍眺捏軾憒憔憖憙憧憬憨憪憭憮憯憷憸憹憺懃懅懆邀懊懋懌懍懐懞懠懤懥懨懫懮懰懱毖懵遁樑雍懺懽戁戄戇戉戔戕戛戝戞戠戡戢戣戤戥戦戩戭戯轟戱披菊牖戸戹戺戻戼戽鍬扂楔扃扆扈扊杖牽絹銬鐲賚扐摟攪烊盹瞌跟躉鑔靶鼾払扗玫腮扛扞扠扡扢盔押扤扦扱罾揄綏鞍郤窾扻扼扽抃抆抈抉抌抏瞎抔繯縊擻抜抝択抨摔歉躥牾抶抻搐泵菸拃拄拊髀拋拌脯拎拏拑擢秧沓曳攣迂拚拝拠拡拫拭拮踢拴拶拷攢拽掇芥橐簪摹疔挈瓢驥捺蹻挌挍挎挐揀挓挖掘浚挙揍聵挲挶挾挿捂捃捄捅捆捉捋胳膊揎捌捍捎軀蛛捗捘捙捜捥捩捫捭据捱捻捼捽掀掂掄臀膘掊掎掏掐笙掔掗掞棉芍掤搪闡掫掮掯揉掱掲掽掾揃揅揆搓揌諢揕揗揘揜揝揞揠揥揩揪揫櫫遒麈揰揲揵揶揸揹揺搆搉搊搋搌搎搔搕撼櫓搗搘搠搡搢搣搤搥搦搧搨搬楦褳訕赸搯搰搲搳搴搵搷搽搾搿摀摁摂摃摎摑摒摓跤摙摛摜摞摠摦睺羯摭摮摯摰摲摳摴摶摷摻摽撂撃撅稻撊撋撏鐧潑撕撙撚撝撟撢撣撦撧撩撬撱朔撳蚍蜉撾撿擀擄闖擉缶觚擐擕擖擗擡擣擤澡腚擧擨擩擫擭擯擰擷擸擼擽擿攃攄攆攉攥攐攓攖攙攛每攩攫轡澄攮攰攲攴軼攷砭訐攽碘敁敃敇敉敍敎筏敔敕敖閏誨敜煌敧敪敱敹敺敻敿斁衽斄牒縐謅斉斎斕鶉讕駮鱧斒筲斛斝斞斠斡斢斨斫斮晾沂潟穎絳邵斲斸釳於琅斾斿旀旂旃旄渦旌旎旐旒旓旖旛旝旟旡旣浴旰獺魃旴旹旻旼旽昀昃昄昇昉晰躲澈熹皎皓礬昑昕昜昝昞昡昤暉筍昦昨昰昱昳昴昶昺昻晁蹇隧蔬髦晄晅晒晛晜晞晟晡晢晤晥曦晩萘瑩顗晿暁暋暌暍暐暔暕煅暘暝暠暡曚暦暨暪朦朧暱暲殄馮暵暸暹暻暾曀曄曇曈曌曏曐曖曘曙曛曡曨曩駱曱甴肱曷牘禺錕曽滄耽朁朅朆杪栓誇竟粘絛朊膺朏朐朓朕朘朙瞄覲溘饔飧朠朢朣柵椆澱蝨朩朮朰朱炆璋鈺熾鹮朳槿朶朾朿杅杇杌隉欣釗湛漼楷瀍煜玟纓翱肈舜贄适逵杓杕杗杙荀蘅杝杞脩珓筊杰榔狍閦顰緬莞杲杳眇杴杶杸杻杼枋枌枒枓衾葄翹紓逋枙狸椏枟槁枲枳枴枵枷枸櫞枹枻柁柂柃柅柈柊柎某柑橘柒柘柙柚柜柞櫟柟柢柣柤柩柬柮柰柲橙柶柷柸柺査柿栃栄栒栔栘栝栟栢栩栫栭栱栲栳栴檀栵栻桀驁桁鎂桄桉桋桎梏椹葚桓桔桕桜桟桫欏桭桮桯桲桴桷桹湘溟梃梊梍梐潼梔梘梜梠梡梣梧梩梱梲梳梴梵梹棁棃櫻棐棑棕櫚簑繃蓑棖棘棜棨棩棪棫棬棯棰棱棳棸棹槨棼椀椄苕椈椊椋椌椐椑椓椗検椤椪椰椳椴椵椷椸椽椿楀楄楅篪楋楍楎楗楘楙楛楝楟楠楢楥楨楩楪楫楬楮楯楰楳楸楹楻楽榀榃榊榎槺榕榖榘榛狉莽榜笞榠榡榤榥榦榧榪榭榰榱槤霰榼榾榿槊閂槎槑槔槖様槜槢槥槧槪槭槮槱槲槻槼槾樆樊樏樑樕樗樘樛樟樠樧樨権樲樴樵猢猻樺樻罍樾樿橁橄橆橈笥龠橕橚橛輛橢橤橧豎膈跨橾橿檁檃檇檉檍檎檑檖檗檜檟檠檣檨檫檬檮檳檴檵檸櫂櫆櫌櫛櫜櫝櫡櫧櫨櫪櫬櫳櫹櫺茄櫽欀欂欃欐欑欒欙欞溴欨欬欱欵欶欷歔欸欹欻欼欿歁歃歆艎歈歊蒔蝶歓歕歘歙歛歜歟歠蹦詮鑲蹣跚陞陟歩歮歯歰歳歴璞歺瞑歾歿殀殈殍殑殗殜殙殛殞殢殣殥殪殫殭殰殳荃殷殸殹蛟殻殽謗毆毈毉餵毎毑蕈毗毘毚茛鄧毧毬毳毷毹毽毾毿氂氄氆靴氉氊氌氍氐聊氕氖気氘氙氚氛氜氝氡洶焊痙氤氳氥氦鋁鋅氪烴氬銨痤汪滸漉痘盂碾菖蒲蕹蛭螅氵氷氹氺氽燙氾氿渚汆汊汋汍汎汏汐汔汕褟汙汚汜蘺沼穢衊汧汨汩汭汲汳汴隄汾沄沅沆瀣沇沈葆浸淪湎溺痼痾沌沍沏沐沔沕沘浜畹礫沚沢沬沭沮沰沱灢沴沷籽沺烹濡洄泂肛泅泆湧肓泐泑泒泓泔泖泙泚泜泝泠漩饃濤粼濘蘚鰍泩泫泭泯銖泱泲洇洊涇琵琶荽薊箔洌洎洏洑潄濯洙洚洟洢洣洧洨洩痢滔洫洮洳洴洵洸洹洺洼洿淌蜚浄浉浙贛渫浠浡浤浥淼瀚浬浭翩萍浯浰蜃淀苔蛞蝓蜇螵蛸煲鯉浹浼浽溦涂涊涐涑涒涔滂涖涘涙涪涫涬涮涴涶涷涿淄淅淆淊淒黯淓淙漣淜淝淟淠淢淤淥淦淩猥藿褻淬淮淯淰淳詣淶紡淸淹燉癯綺渇済渉渋渓渕渙渟渢滓渤澥渧渨渮渰渲渶渼湅湉湋湍湑湓湔黔湜湝湞湟湢湣湩湫湮麟湱湲湴湼満溈溍溎溏溛舐漭溠溤溧馴溮溱溲溳溵溷溻溼溽溾滁滃滉滊滎滏稽滕滘滙滝滫滮羼耷滷滹滻煎漈漊漎繹漕漖漘漙漚漜漪漾漥漦漯漰漵漶漷濞潀潁潎潏潕潗潚潝潞潠潦祉瘍潲潵潷潸潺潾潿澁
澂澃澉澌澍澐澒澔澙澠澣澦澧澨澫澬澮澰澴澶澼熏郁濆濇濈濉濊貊濔疣濜濠濩觴濬濮盥濰濲濼瀁瀅瀆瀋瀌瀏瀒瀔瀕瀘瀛瀟瀠瀡瀦瀧瀨瀬瀰瀲瀳瀵瀹瀺瀼灃灄灉灋灒灕灖灝灞灠灤灥灨灩灪蜴灮燼獴灴灸灺炁炅魷炗炘炙炤炫疽烙釺炯炰炱炲炴炷燬炻烀烋瘴鯧烓烔焙烜烝烳飪烺焃焄耆焌焐焓焗焜焞焠焢焮焯焱焼煁煃煆煇煊熠煍熬煐煒煕煗燻礆霾煚煝煟煠煢矸煨瑣煬萁煳煺煻熀熅熇熉羆熒穹熗熘熛熜稔諳爍熤熨熯熰眶螞熲熳熸熿燀燁燂燄盞燊燋燏燔隼燖燜燠燡燦燨燮燹燻燽燿爇爊爓爚爝爟爨蟾爯爰爲爻爿爿牀牁牂牄牋牎牏牓牕釉牚腩蒡虻牠雖蠣牣牤牮牯牲牳牴牷牸牼絆牿靬犂犄犆犇犉犍犎犒犖犗犛犟犠犨犩犪犮犰狳犴犵犺狁甩狃狆狎狒獾狘狙黠狨狩狫狴狷狺狻豕狽蜘猁猇猈猊猋猓猖獗猗猘猙獰獁猞猟獕猭猱猲猳猷猸猹猺玃獀獃獉獍獏獐獒獘獙獚獜獝獞獠獢獣獧鼇蹊獪獫獬豸獮獯鬻獳獷獼玀玁菟玅玆玈珉糝禛郅玍玎玓瓅玔玕玖玗玘玞玠玡玢玤玥玦玨瑰玭玳瑁玶玷玹玼珂珇珈瑚珌饈饌珔珖珙珛珞珡珣珥珧珩珪珮珶珷珺珽琀琁隕琊琇琖琚琠琤琦琨琫琬琭琮琯琰琱琲瑯琹琺琿瑀瑂瑄瑉瑋瑑瑔瑗瑢瑭瑱瑲瑳瑽瑾瑿璀璨璁璅璆璈璉璊璐璘璚璝璟璠璡璥璦璩璪璫璯璲璵璸璺璿瓀瓔瓖瓘瓚瓛臍瓞瓠瓤瓧瓩瓮瓰瓱瓴瓸瓻瓼甀甁甃甄甇甋甍甎甏甑甒甓甔甕甖甗飴蔗甙詫鉅粱盎銹糰甡褥産甪甬甭甮甯鎧甹甽甾甿畀畁畇畈畊畋畎畓畚畛畟鄂畤畦畧荻畯畳畵畷畸畽畾疃疉疋疍疎簞疐疒疕疘疝疢疥疧疳疶疿痁痄痊痌痍痏痐痒痔痗瘢痚痠痡痣痦痩痭痯痱痳痵痻痿瘀瘂瘃瘈瘉瘊瘌瘏瘐瘓瘕瘖瘙瘚瘛瘲瘜瘝瘞瘠瘥瘨瘭瘮瘯瘰癧瘳癘瘵瘸瘺瘻瘼癃癆癇癈癎癐癔癙癜癠癤癥癩蟆癪癭癰発踔紺蔫酵皙砬砒翎翳蘞鎢鑞皚鵯駒鱀粵褶皀皁莢皃鎛皈皌皐皒硃皕皖皘皜皝皞皤皦皨皪皫皭糙綻皴皸皻皽盅盋盌盍盚盝踞盦盩鞦韆盬盭眦睜瞤盯盱眙裰盵盻睞眂眅眈眊県眑眕眚眛眞眢眣眭眳眴眵眹瞓眽郛睃睅睆睊睍睎睏睒睖睙睟睠睢睥睪睪睯睽睾瞇瞈瞋瞍逛瞏瞕瞖瞘瞜瞟瞠瞢瞫瞭瞳瞵瞷瞹瞽闍瞿矓矉矍鑠矔矗矙矚矞矟矠矣矧矬矯矰矱硪碇磙罅舫阡、矼矽礓砃砅砆砉砍砑砕砝砟砠砢砦砧砩砫砮砳艏砵砹砼硇硌硍硎硏硐硒硜硤硨磲茚鋇硭硻硾碃碉碏碣碓碔碞碡碪碫碬碭碯碲碸碻礡磈磉磎磑磔磕磖磛磟磠磡磤磥蹭磪磬磴磵磹磻磽礀礄礅礌礐礚礜礞礤礧礮礱礲礵礽礿祂祄祅祆禳祊祍祏祓祔祕祗祘祛祧祫祲祻祼餌臠錮禂禇禋禑禔禕隋禖禘禚禜禝禠禡禢禤禥禨禫禰禴禸稈秈秊闈颯秌秏秕笈蘵賃秠秣秪秫秬秭秷秸稊稌稍稑稗稙稛稞稬稭稲稹稼顙稾穂穄穇穈穉穋穌貯穏穜穟穠穡穣穤穧穨穭穮穵穸窿闃窀窂窅窆窈窕窊窋窌窒窓窔窞窣窬黷蹙窰窳窴窵窶窸窻竁竃竈竑竜竝竦竪篦篾笆鮫竾笉笊笎笏笐靨笓笤籙笪笫笭笮笰笱笲笳笵笸笻筀筅筇筈筎筑筘筠筤筥筦筧筩筭筯筰筱筳筴讌筸箂箇箊箎箑箒箘箙箛箜篌箝箠箬鏃箯箴箾篁篔簹篘篙篚篛篜篝篟篠篡篢篥篧篨篭篰篲篳篴篶篹篼簀簁簃簆簉簋簌簏簜簟簠簥簦簨簬簰簸簻籊籐籒籓籔籖籚籛籜籣籥籧籩籪籫籯芾麴籵籸籹籼粁粃粋粑粔糲粛粞粢粧粨粲粳粺粻粽闢粿糅糆糈糌糍糒糔萼糗蛆蹋糢糨糬糭糯糱糴糶糸糺紃蹼鰹黴紆紈絝紉閩襻紑紕紘錠鳶鷂紝紞紟紥紩紬紱紲紵紽紾紿絁絃絅経絍絎絏縭褵絓絖絘絜絢絣螯絪絫聒絰絵絶絺絻絿綀綃綅綆綈綉綌綍綎綑綖綘継続緞綣綦綪綫綮綯綰罟蝽綷縩綹綾緁緄緅緆緇緋緌緎総緑緔緖緗緘緙緜緡緤緥緦纂緪緰緱緲緶緹縁縃縄縈縉縋縏縑縕縗縚縝縞縟縠縡縢縦縧縯縰騁縲縳縴縵縶縹縻衙縿繄繅繈繊繋繐繒繖繘繙繠繢繣繨繮繰繸繻繾纁纆纇纈纉纊纑纕纘纙纚纛缾罃罆罈罋罌罎罏罖罘罛罝罠罣罥罦罨罫罭鍰罳罶罹罻罽罿羂羃羇羋蕉51鴕羑羖羗羜羝羢羣羥羧羭羮羰羱羵羶羸藜鮐翀翃翄翊翌翏翕翛翟翡翣翥翦躚翪翫翬翮翯翺翽翾翿闆饕鴰鍁耋耇耎耏耑耒耜耔耞耡耤耨耩耪耬耰鬢耵聹聃聆聎聝聡聦聱聴聶聼閾聿肄肏肐肕腋肙肜肟肧胛肫肬肭肰肴肵肸肼胊胍胏胑胔胗胙胝胠銓胤胦胩胬胭胯胰胲胴胹胻胼胾脇脘脝脞脡脣脤脥脧脰脲脳腆腊腌臢腍腒腓腖腜腠腡腥腧腬腯踝蹬鐐腴腶蠕誹膂膃膆膇膋膔膕膗膙膟黐膣膦膫膰膴膵膷膾臃臄臇臈臌臐臑臓臕臖臙臛臝臞臧蓐詡臽臾臿舀舁鰟鮍舋舎舔舗舘舝舠舡舢舨舭舲舳舴舸舺艁艄艅艉艋艑艕艖艗艘艚艜艟艣艤艨艩艫艬艭荏艴艶艸艹艻艿芃芄芊萰陂藭芏芔芘芚蕙芟芣芤茉芧芨芩芪芮芰鰱芴芷芸蕘豢芼芿苄苒苘苙苜蓿苠苡苣蕒苤苧苪鎊苶苹苺苻苾茀茁范蠡萣茆茇茈茌茍茖茞茠茢茥茦菰茭茯茳藨茷藘茼荁荄荅荇荈菅蜢鴞荍荑荘荳荵荸薺莆莒莔莕莘莙莚莛莜莝莦莨菪莩莪莭莰莿菀菆菉菎菏菐菑菓菔菕菘菝菡菢菣菥蓂菧菫轂鎣菶菷菹醢菺菻菼菾萅萆萇萋萏萐萑萜萩萱萴萵萹萻葇葍葎葑葒葖葙葠葥葦葧葭葯葳葴葶葸葹葽蒄蒎蒓蘢薹蒞蒟蒻蒢蒦蒨蒭藁蒯蒱鉾蒴蒹蒺蒽蓀蓁蓆蓇蓊蓌蓍蓏蓓蓖蓧蓪蓫蓽跣藕蓯蓰蓱蓴蓷蓺蓼蔀蔂蔃蔆蔇蔉蔊蔋蔌蔎蔕蔘蔙蔞蔟鍔蔣雯蔦蔯蔳蔴蔵蔸蔾蕁蕆蕋蕍蕎蕐蕑蕓蕕蕖蕗蕝蕞蕠蕡蕢蕣蕤蕨蕳蕷蕸蕺蕻薀薁薃薅薆薈薉薌薏薐薔薖薘薙諤釵薜薠薢薤薧薨薫薬薳薶薷薸薽薾薿藄藇藋藎藐藙藚藟藦藳藴藶藷藾蘀蘁蘄蘋蘗蘘蘝蘤蘧蘩蘸蘼虀虆虍蟠虒虓虖虡虣虥虩虯虰蛵虵虷鱒虺虼蚆蚈蚋蚓蚔蚖蚘蚜蚡蚣蚧蚨蚩蚪蚯蚰蜒蚱蚳蚶蚹蚺蚻蚿蛀蛁蛄蛅蝮蛌蛍蛐蟮蛑蛓蛔蛘蛚蛜蛡蛣蜊蛩蛺蛻螫蜅蜆蜈蝣蜋蜍蜎蜑蠊蜛餞蜞蜣蜨蜩蜮蜱蜷蜺蜾蜿蝀蝃蝋蝌蝍蝎蝏蝗蝘蝙蝝鱝蝡蝤蝥蝯蝰蝱蝲蝴蝻螃蠏螄螉螋螒螓螗螘螙螚蟥螟螣螥螬螭螮螾螿蟀蟅蟈蟊蟋蟑蟓蟛蟜蟟蟢蟣蟨蟪蟭蟯蟳蟶蟷蟺蟿蠁蠂蠃蠆蠋蠐蠓蠔蠗蠙蠚蠛蠜蠧蠨蠩蠭蠮蠰蠲蠵蠸蠼蠽衁衂衄衇衈衉衋衎衒衕衖衚衞裳鈎衭衲衵衹衺衿袈裟袗袚袟袢袪袮袲袴袷袺袼褙袽裀裉裊裋裌裍裎裒裛裯裱裲裴裾褀褂褉褊褌褎褐褒褓褔褕褘褚褡褢褦褧褪褫褭褯褰褱襠褸褽褾襁襃襆襇襉襋襌襏襚襛襜襝襞襡襢襤襦襫襬襭襮襴襶襼襽襾覂覃覅覇覉覊覌覗覘覚覜覥覦覧覩覬覯覰観覿觔觕觖觜觽觝觡酲觩觫觭觱觳觶觷觼觾觿言賅訃訇訏訑訒詁託訧訬訳訹証訾詀詅詆譭詈詊詎詑詒詖詗詘詧詨詵詶詸詹詻詼詿誂誃誄鋤誆誋誑誒誖誙誚誥誧説読誯誶誾諂諄諆諌諍諏諑諕諗諛諝諞諟諠諡諴諵諶諼謄謆謇謌謍謏謑謖謚謡謦謪謫謳謷謼謾譁譅譆譈譊譌譒譔譖鑫譞譟譩譫譬譱譲譴譸譹譾讅讆讋讌讎讐讒讖讙讜讟谽豁豉豇豈豊豋豌豏豔豞豖豗豜豝豣豦豨豭豱豳豵豶豷豺豻貅貆貍貎貔貘貙貜貤饜貰餸貺賁賂賏賒賕賙賝賡賧賨賫鬭賮賵賸賺賻賾贇贉贐贔贕贗赬赭赱赳迄趁趂趄趐趑趒趔趡趦趫趮趯趲趴趵趷趹趺趿跁跂跅跆躓蹌跐跕跖跗跙跛跦跧跩跫跬跮跱跲跴跺跼跽踅踆踈踉踊踒踖踘踜踟躇躕踠踡踣踤踥踦踧蹺踫踮踰踱踴踶踹踺踼踽躞蹁蹂躪蹎蹐蹓蹔蹕蹚蹜蹝蹟蹠蹡蹢躂蹧蹩蹪蹯鞠蹽躃躄躅躊躋躐躑躒躘躙躛躝躠躡躦躧躩躭躰躳躶軃軆輥軏軔軘軜軝齶転軥軨軭軱軲轆軷軹軺軽軿輀輂輦輅輇輈輓輗輙輜輞輠輤輬輭輮輳輴輵輶輹輼輾轀轇轏轑轒轔轕轖轗轘轙轝轞轢轤辠辢辤辵辶辺込辿迅迋迍麿迓迣迤邐迥迨迮迸迺迻迿逄逅逌逍逑逓逕逖逡逭逯逴逶逹遄遅遉遘遛遝遢遨遫遯遰遴遶遹遻邂邅邉邋邎邕邗邘邛邠邢邧邨邯鄲邰邲邳邴邶邷邽邾邿郃郄郇郈郔郕郗郙郚郜郝郞郟郠郢郪郫郯郰郲郳郴郷郹郾郿鄀鄄鄆鄇鄈鄋鄍鄎鄏鄐鄑鄒鄔鄕鄖鄗鄘鄚鄜鄞鄠鄢鄣鄤鄦鄩鄫鄬鄮鄯鄱鄶鄷鄹鄺鄻鄾鄿酃酅酆酇酈酊酋酎酏酐酣酔酕醄酖酗酞酡酢酤酩酴酹酺醁醅醆醊醍醐醑醓醖醝醞醡醤醨醪醭醯醰醱醲醴醵醸醹醼醽醾釂釃釅釆釈鱸鎦閶釓釔釕鈀釙鼢鼴釤釧釪釬釭釱釷釸釹鈁鈃鈄鈆鈇鈈鈊鈌鈐鈑鈒鈤鈥鈧鈬鈮鈰鈳鐺鈸鈹鈽鈿鉄鉆鉈鉋鉌鉍鉏鉑鉕鉚鉢鉥鉦鉨鉬鉭鉱鉲鉶鉸鉺鉼鉿銍銎銑銕鏤銚銛銠銣銤銥銦銧銩銪銫銭銰銲銶銻銼銾鋂鋃鋆鋈鋊鋌鋍鋏鋐鋑鋕鋘鋙鋝鋟鋦鋨鋩鋭鋮鋯鋰鋱鋳鋹鋺鋻鏰鐱錀錁錆錇錈錍錏錒錔錙錚錛錞錟錡錤錩錬録錸錼鍀鍆鍇鍉鍍鍏鍐鍘鍚鍛鍠鍤鍥鍩鍫鍭鍱鍴鍶鍹鍺鍼鍾鎄鎇鎉鎋鎌鎍鎏鎒鎓鎗鎘鎚鎞鎡鎤鎩鎪鎭鎯鎰鎳鎴鎵鎸鎹鎿鏇鏊鏌鏐鏑鏖鏗鏘鏚鏜鏝鏞鏠鏦鏨鏷鏸鏹鏻鏽鏾鐃鐄鐇鐏鐒鐓鐔鐗馗鐙鐝鐠鐡鐦鐨鐩鐫鐬鐱鐳鐶鐻鐽鐿鑀鑅鑌鑐鑕鑚鑛鑢鑤鑥鑪鑭鑯鑱鑴鑵鑷钁钃镻閆閈閌閎閒閔閗閟閡関閤閤閧閬閲閹閺閻閼閽閿闇闉闋闐闑闒闓闘闚闞闟闠闤闥阞阢阤阨阬阯阹阼阽陁陑陔陛陜陡陥陬騭陴険陼陾隂隃隈隒隗隞隠隣隤隩隮隰顴隳隷隹雂雈雉雊雎雑雒雗雘雚雝雟雩雰雱驛霂霅霈霊霑霒霓霙霝霢霣霤霨霩霪霫霮靁靆靉靑靚靣靦靪靮靰靳靷靸靺靼靿鞀鞃鞄鞌鞗鞙鞚鞝鞞鞡鞣鞨鞫鞬鞮鞶鞹鞾韃韅韉馱韍韎韔韖韘韝韞韡韣韭韮韱韹韺頀颳頄頇頊頍頎頏頒頖頞頠頫頬顱頯頲頴頼顇顋顑顒顓顔顕顚顜顢顣顬顳颭颮颱颶颸颺颻颽颾颿飀飂飈飌飜飡飣飤飥飩飫飮飱飶餀餂餄餎餇餈餑餔餕餖餗餚餛餜餟餠餤餧餩餪餫餬餮餱餲餳餺餻餼餽餿饁饅饇饉饊饍饎饐饘饟饢馘馥馝馡馣騮騾馵馹駃駄駅駆駉駋駑駓駔駗駘駙駜駡駢駪駬駰駴駸駹駽駾騂騄騅騆騉騋騍騏驎騑騒験騕騖騠騢騣騤騧驤騵騶騸騺驀驂驃驄驆驈驊驌驍驎驏驒驔驖驙驦驩驫骺鯁骫骭骯骱骴骶骷髏骾髁髂髄髆髈髐髑髕髖髙髝髞髟髡髣髧髪髫髭髯髲髳髹髺髽髾鬁鬃鬅鬈鬋鬎鬏鬐鬑鬒鬖鬗鬘鬙鬠鬣鬪鬫鬬鬮鬯鬰鬲鬵鬷魆魈魊魋魍魎魑魖鰾魛魟魣魦魨魬魴魵魸鮀鮁鮆鮌鮎鮑鮒鮓鮚鮞鮟鱇鮠鮦鮨鮪鮭鮶鮸鮿鯀鯄鯆鯇鯈鯔鯕鯖鯗鯙鯠鯤鯥鯫鯰鯷鯸鯿鰂鰆鶼鰉鰋鰐鰒鰕鰛鰜鰣鰤鰥鰦鰨鰩鰮鰳鰶鰷鱺鰼鰽鱀鱄鱅鱆鱈鱎鱐鱓鱔
鱖鱘鱟鱠鱣鱨鱭鱮鱲鱵鱻鲅鳦鳧鳯鳲鳷鳻鴂鴃鴄鴆鴈鴎鴒鴔鴗鴛鴦鴝鵒鴟鴠鴢鴣鴥鴯鶓鴳鴴鴷鴽鵀鵁鵂鵓鵖鵙鵜鶘鵞鵟鵩鵪鵫鵵鵷鵻鵾鶂鶊鶏鶒鶖鶗鶡鶤鶦鶬鶱鶲鶵鶸鶹鶺鶿鷀鷁鷃鷄鷇鷈鷉鷊鷏鷓鷕鷖鷙鷞鷟鷥鷦鷯鷩鷫鷭鷳鷴鷽鷾鷿鸂鸇鸊鸏鸑鸒鸓鸕鸛鸜鸝鹸鹹鹺麀麂麃麄麇麋麌麐麑麒麚麛麝麤麩麪麫麮麯麰麺麾黁黈黌黢黒黓黕黙黝黟黥黦黧黮黰黱黲黶黹黻黼黽黿鼂鼃鼅鼈鼉鼏鼐鼒鼕鼖鼙鼚鼛鼡鼩鼱鼪鼫鼯鼷鼽齁齆齇齈齉齌齎齏齔齕齗齙齚齜齞齟齬齠齢齣齧齩齮齯齰齱齵齾龎龑龒龔龖龘龝龡龢龤' 20 | 21 | assert len(simplified_charcters) == len(simplified_charcters) 22 | 23 | s2t_dict = {} 24 | t2s_dict = {} 25 | for i, item in enumerate(simplified_charcters): 26 | s2t_dict[item] = traditional_characters[i] 27 | t2s_dict[traditional_characters[i]] = item 28 | 29 | 30 | def tranditional_to_simplified(text: str) -> str: 31 | return "".join( 32 | [t2s_dict[item] if item in t2s_dict else item for item in text]) 33 | 34 | 35 | def simplified_to_traditional(text: str) -> str: 36 | return "".join( 37 | [s2t_dict[item] if item in s2t_dict else item for item in text]) 38 | 39 | 40 | if __name__ == "__main__": 41 | text = "一般是指存取一個應用程式啟動時始終顯示在網站或網頁瀏覽器中的一個或多個初始網頁等畫面存在的站點" 42 | print(text) 43 | text_simple = tranditional_to_simplified(text) 44 | print(text_simple) 45 | text_traditional = simplified_to_traditional(text_simple) 46 | print(text_traditional) 47 | -------------------------------------------------------------------------------- /textfrontend/zh_normalization/chronology.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import re 15 | 16 | from .num import DIGITS 17 | from .num import num2str 18 | from .num import verbalize_cardinal 19 | from .num import verbalize_digit 20 | 21 | 22 | def _time_num2str(num_string: str) -> str: 23 | """A special case for verbalizing number in time.""" 24 | result = num2str(num_string.lstrip('0')) 25 | if num_string.startswith('0'): 26 | result = DIGITS['0'] + result 27 | return result 28 | 29 | 30 | # 时刻表达式 31 | RE_TIME = re.compile(r'([0-1]?[0-9]|2[0-3])' 32 | r':([0-5][0-9])' 33 | r'(:([0-5][0-9]))?') 34 | 35 | # 时间范围,如8:30-12:30 36 | RE_TIME_RANGE = re.compile(r'([0-1]?[0-9]|2[0-3])' 37 | r':([0-5][0-9])' 38 | r'(:([0-5][0-9]))?' 
39 | r'(~|-)' 40 | r'([0-1]?[0-9]|2[0-3])' 41 | r':([0-5][0-9])' 42 | r'(:([0-5][0-9]))?') 43 | 44 | 45 | def replace_time(match) -> str: 46 | """ 47 | Args: 48 | match (re.Match) 49 | Returns: 50 | str 51 | """ 52 | 53 | is_range = len(match.groups()) > 5 54 | 55 | hour = match.group(1) 56 | minute = match.group(2) 57 | second = match.group(4) 58 | 59 | if is_range: 60 | hour_2 = match.group(6) 61 | minute_2 = match.group(7) 62 | second_2 = match.group(9) 63 | 64 | result = f"{num2str(hour)}点" 65 | if minute.lstrip('0'): 66 | if int(minute) == 30: 67 | result += "半" 68 | else: 69 | result += f"{_time_num2str(minute)}分" 70 | if second and second.lstrip('0'): 71 | result += f"{_time_num2str(second)}秒" 72 | 73 | if is_range: 74 | result += "至" 75 | result += f"{num2str(hour_2)}点" 76 | if minute_2.lstrip('0'): 77 | if int(minute_2) == 30: 78 | result += "半" 79 | else: 80 | result += f"{_time_num2str(minute_2)}分" 81 | if second_2 and second_2.lstrip('0'): 82 | result += f"{_time_num2str(second_2)}秒" 83 | 84 | return result 85 | 86 | 87 | RE_DATE = re.compile(r'(\d{4}|\d{2})年' 88 | r'((0?[1-9]|1[0-2])月)?' 89 | r'(((0?[1-9])|((1|2)[0-9])|30|31)([日号]))?') 90 | 91 | 92 | def replace_date(match) -> str: 93 | """ 94 | Args: 95 | match (re.Match) 96 | Returns: 97 | str 98 | """ 99 | year = match.group(1) 100 | month = match.group(3) 101 | day = match.group(5) 102 | result = "" 103 | if year: 104 | result += f"{verbalize_digit(year)}年" 105 | if month: 106 | result += f"{verbalize_cardinal(month)}月" 107 | if day: 108 | result += f"{verbalize_cardinal(day)}{match.group(9)}" 109 | return result 110 | 111 | 112 | # 用 / 或者 - 分隔的 YY/MM/DD 或者 YY-MM-DD 日期 113 | RE_DATE2 = re.compile( 114 | r'(\d{4})([- /.])(0[1-9]|1[012])\2(0[1-9]|[12][0-9]|3[01])') 115 | 116 | 117 | def replace_date2(match) -> str: 118 | """ 119 | Args: 120 | match (re.Match) 121 | Returns: 122 | str 123 | """ 124 | year = match.group(1) 125 | month = match.group(3) 126 | day = match.group(4) 127 | result = "" 128 | if year: 129 | result += f"{verbalize_digit(year)}年" 130 | if month: 131 | result += f"{verbalize_cardinal(month)}月" 132 | if day: 133 | result += f"{verbalize_cardinal(day)}日" 134 | return result 135 | -------------------------------------------------------------------------------- /textfrontend/zh_normalization/constants.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
14 | import re 15 | import string 16 | 17 | from pypinyin.constants import SUPPORT_UCS4 18 | 19 | # 全角半角转换 20 | # 英文字符全角 -> 半角映射表 (num: 52) 21 | F2H_ASCII_LETTERS = { 22 | chr(ord(char) + 65248): char 23 | for char in string.ascii_letters 24 | } 25 | 26 | # 英文字符半角 -> 全角映射表 27 | H2F_ASCII_LETTERS = {value: key for key, value in F2H_ASCII_LETTERS.items()} 28 | 29 | # 数字字符全角 -> 半角映射表 (num: 10) 30 | F2H_DIGITS = {chr(ord(char) + 65248): char for char in string.digits} 31 | # 数字字符半角 -> 全角映射表 32 | H2F_DIGITS = {value: key for key, value in F2H_DIGITS.items()} 33 | 34 | # 标点符号全角 -> 半角映射表 (num: 32) 35 | F2H_PUNCTUATIONS = {chr(ord(char) + 65248): char for char in string.punctuation} 36 | # 标点符号半角 -> 全角映射表 37 | H2F_PUNCTUATIONS = {value: key for key, value in F2H_PUNCTUATIONS.items()} 38 | 39 | # 空格 (num: 1) 40 | F2H_SPACE = {'\u3000': ' '} 41 | H2F_SPACE = {' ': '\u3000'} 42 | 43 | # 非"有拼音的汉字"的字符串,可用于NSW提取 44 | if SUPPORT_UCS4: 45 | RE_NSW = re.compile(r'(?:[^' 46 | r'\u3007' # 〇 47 | r'\u3400-\u4dbf' # CJK扩展A:[3400-4DBF] 48 | r'\u4e00-\u9fff' # CJK基本:[4E00-9FFF] 49 | r'\uf900-\ufaff' # CJK兼容:[F900-FAFF] 50 | r'\U00020000-\U0002A6DF' # CJK扩展B:[20000-2A6DF] 51 | r'\U0002A703-\U0002B73F' # CJK扩展C:[2A700-2B73F] 52 | r'\U0002B740-\U0002B81D' # CJK扩展D:[2B740-2B81D] 53 | r'\U0002F80A-\U0002FA1F' # CJK兼容扩展:[2F800-2FA1F] 54 | r'])+') 55 | else: 56 | RE_NSW = re.compile( # pragma: no cover 57 | r'(?:[^' 58 | r'\u3007' # 〇 59 | r'\u3400-\u4dbf' # CJK扩展A:[3400-4DBF] 60 | r'\u4e00-\u9fff' # CJK基本:[4E00-9FFF] 61 | r'\uf900-\ufaff' # CJK兼容:[F900-FAFF] 62 | r'])+') 63 | -------------------------------------------------------------------------------- /textfrontend/zh_normalization/num.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ 15 | Rules to verbalize numbers into Chinese characters. 
16 | https://zh.wikipedia.org/wiki/中文数字#現代中文 17 | """ 18 | import re 19 | from collections import OrderedDict 20 | from typing import List 21 | 22 | DIGITS = {str(i): tran for i, tran in enumerate('零一二三四五六七八九')} 23 | UNITS = OrderedDict({ 24 | 1: '十', 25 | 2: '百', 26 | 3: '千', 27 | 4: '万', 28 | 8: '亿', 29 | }) 30 | 31 | COM_QUANTIFIERS = '(封|艘|把|目|套|段|人|所|朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|小时|旬|纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|百万|万|千|百|十|)吨|(亿|千万|百万|万|千|百|)块|角|毛|分)' 32 | 33 | # 分数表达式 34 | RE_FRAC = re.compile(r'(-?)(\d+)/(\d+)') 35 | 36 | 37 | def replace_frac(match) -> str: 38 | """ 39 | Args: 40 | match (re.Match) 41 | Returns: 42 | str 43 | """ 44 | sign = match.group(1) 45 | nominator = match.group(2) 46 | denominator = match.group(3) 47 | sign: str = "负" if sign else "" 48 | nominator: str = num2str(nominator) 49 | denominator: str = num2str(denominator) 50 | result = f"{sign}{denominator}分之{nominator}" 51 | return result 52 | 53 | 54 | # 百分数表达式 55 | RE_PERCENTAGE = re.compile(r'(-?)(\d+(\.\d+)?)%') 56 | 57 | 58 | def replace_percentage(match) -> str: 59 | """ 60 | Args: 61 | match (re.Match) 62 | Returns: 63 | str 64 | """ 65 | sign = match.group(1) 66 | percent = match.group(2) 67 | sign: str = "负" if sign else "" 68 | percent: str = num2str(percent) 69 | result = f"{sign}百分之{percent}" 70 | return result 71 | 72 | 73 | # 整数表达式 74 | # 带负号的整数 -10 75 | RE_INTEGER = re.compile(r'(-)' r'(\d+)') 76 | 77 | 78 | def replace_negative_num(match) -> str: 79 | """ 80 | Args: 81 | match (re.Match) 82 | Returns: 83 | str 84 | """ 85 | sign = match.group(1) 86 | number = match.group(2) 87 | sign: str = "负" if sign else "" 88 | number: str = num2str(number) 89 | result = f"{sign}{number}" 90 | return result 91 | 92 | 93 | # 编号-无符号整形 94 | # 00078 95 | RE_DEFAULT_NUM = re.compile(r'\d{3}\d*') 96 | 97 | 98 | def replace_default_num(match): 99 | """ 100 | Args: 101 | match (re.Match) 102 | Returns: 103 | str 104 | """ 105 | number = match.group(0) 106 | return verbalize_digit(number, alt_one=True) 107 | 108 | 109 | # 数字表达式 110 | # 纯小数 111 | RE_DECIMAL_NUM = re.compile(r'(-?)((\d+)(\.\d+))' r'|(\.(\d+))') 112 | # 正整数 + 量词 113 | RE_POSITIVE_QUANTIFIERS = re.compile(r"(\d+)([多余几\+])?" 
+ COM_QUANTIFIERS) 114 | RE_NUMBER = re.compile(r'(-?)((\d+)(\.\d+)?)' r'|(\.(\d+))') 115 | 116 | 117 | def replace_positive_quantifier(match) -> str: 118 | """ 119 | Args: 120 | match (re.Match) 121 | Returns: 122 | str 123 | """ 124 | number = match.group(1) 125 | match_2 = match.group(2) 126 | if match_2 == "+": 127 | match_2 = "多" 128 | match_2: str = match_2 if match_2 else "" 129 | quantifiers: str = match.group(3) 130 | number: str = num2str(number) 131 | result = f"{number}{match_2}{quantifiers}" 132 | return result 133 | 134 | 135 | def replace_number(match) -> str: 136 | """ 137 | Args: 138 | match (re.Match) 139 | Returns: 140 | str 141 | """ 142 | sign = match.group(1) 143 | number = match.group(2) 144 | pure_decimal = match.group(5) 145 | if pure_decimal: 146 | result = num2str(pure_decimal) 147 | else: 148 | sign: str = "负" if sign else "" 149 | number: str = num2str(number) 150 | result = f"{sign}{number}" 151 | return result 152 | 153 | 154 | # 范围表达式 155 | # match.group(1) and match.group(8) are copy from RE_NUMBER 156 | 157 | RE_RANGE = re.compile( 158 | r'((-?)((\d+)(\.\d+)?)|(\.(\d+)))[-~]((-?)((\d+)(\.\d+)?)|(\.(\d+)))') 159 | 160 | 161 | def replace_range(match) -> str: 162 | """ 163 | Args: 164 | match (re.Match) 165 | Returns: 166 | str 167 | """ 168 | first, second = match.group(1), match.group(8) 169 | first = RE_NUMBER.sub(replace_number, first) 170 | second = RE_NUMBER.sub(replace_number, second) 171 | result = f"{first}到{second}" 172 | return result 173 | 174 | 175 | def _get_value(value_string: str, use_zero: bool=True) -> List[str]: 176 | stripped = value_string.lstrip('0') 177 | if len(stripped) == 0: 178 | return [] 179 | elif len(stripped) == 1: 180 | if use_zero and len(stripped) < len(value_string): 181 | return [DIGITS['0'], DIGITS[stripped]] 182 | else: 183 | return [DIGITS[stripped]] 184 | else: 185 | largest_unit = next( 186 | power for power in reversed(UNITS.keys()) if power < len(stripped)) 187 | first_part = value_string[:-largest_unit] 188 | second_part = value_string[-largest_unit:] 189 | return _get_value(first_part) + [UNITS[largest_unit]] + _get_value( 190 | second_part) 191 | 192 | 193 | def verbalize_cardinal(value_string: str) -> str: 194 | if not value_string: 195 | return '' 196 | 197 | # 000 -> '零' , 0 -> '零' 198 | value_string = value_string.lstrip('0') 199 | if len(value_string) == 0: 200 | return DIGITS['0'] 201 | 202 | result_symbols = _get_value(value_string) 203 | # verbalized number starting with '一十*' is abbreviated as `十*` 204 | if len(result_symbols) >= 2 and result_symbols[0] == DIGITS[ 205 | '1'] and result_symbols[1] == UNITS[1]: 206 | result_symbols = result_symbols[1:] 207 | return ''.join(result_symbols) 208 | 209 | 210 | def verbalize_digit(value_string: str, alt_one=False) -> str: 211 | result_symbols = [DIGITS[digit] for digit in value_string] 212 | result = ''.join(result_symbols) 213 | if alt_one: 214 | result = result.replace("一", "幺") 215 | return result 216 | 217 | 218 | def num2str(value_string: str) -> str: 219 | integer_decimal = value_string.split('.') 220 | if len(integer_decimal) == 1: 221 | integer = integer_decimal[0] 222 | decimal = '' 223 | elif len(integer_decimal) == 2: 224 | integer, decimal = integer_decimal 225 | else: 226 | raise ValueError( 227 | f"The value string: '${value_string}' has more than one point in it." 
228 | ) 229 | 230 | result = verbalize_cardinal(integer) 231 | 232 | decimal = decimal.rstrip('0') 233 | if decimal: 234 | # '.22' is verbalized as '零点二二' 235 | # '3.20' is verbalized as '三点二' 236 | result = result if result else "零" 237 | result += '点' + verbalize_digit(decimal) 238 | return result 239 | -------------------------------------------------------------------------------- /textfrontend/zh_normalization/phonecode.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import re 15 | 16 | from .num import verbalize_digit 17 | 18 | # 规范化固话/手机号码 19 | # 手机 20 | # http://www.jihaoba.com/news/show/13680 21 | # 移动:139、138、137、136、135、134、159、158、157、150、151、152、188、187、182、183、184、178、198 22 | # 联通:130、131、132、156、155、186、185、176 23 | # 电信:133、153、189、180、181、177 24 | RE_MOBILE_PHONE = re.compile( 25 | r"(?<!\d)((\+?86 ?)?1([38]\d|5[0-35-9]|7[678]|9[89])\d{8})(?!\d)") 26 | RE_TELEPHONE = re.compile( 27 | r"(?<!\d)((0(10|2[1-3]|[3-9]\d{2})-?)?[1-9]\d{6,7})(?!\d)") 28 | 29 | # 全国统一的号码400开头 30 | RE_NATIONAL_UNIFORM_NUMBER = re.compile(r"(400)(-)?\d{3}(-)?\d{4}") 31 | 32 | 33 | def phone2str(phone_string: str, mobile=True) -> str: 34 | if mobile: 35 | sp_parts = phone_string.strip('+').split() 36 | result = ','.join( 37 | [verbalize_digit(part, alt_one=True) for part in sp_parts]) 38 | return result 39 | else: 40 | sil_parts = phone_string.split('-') 41 | result = ','.join( 42 | [verbalize_digit(part, alt_one=True) for part in sil_parts]) 43 | return result 44 | 45 | 46 | def replace_phone(match) -> str: 47 | """ 48 | Args: 49 | match (re.Match) 50 | Returns: 51 | str 52 | """ 53 | return phone2str(match.group(0), mobile=False) 54 | 55 | 56 | def replace_mobile(match) -> str: 57 | """ 58 | Args: 59 | match (re.Match) 60 | Returns: 61 | str 62 | """ 63 | return phone2str(match.group(0)) 64 | -------------------------------------------------------------------------------- /textfrontend/zh_normalization/quantifier.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
14 | import re 15 | 16 | from .num import num2str 17 | 18 | # 温度表达式,温度会影响负号的读法 19 | # -3°C 零下三度 20 | RE_TEMPERATURE = re.compile(r'(-?)(\d+(\.\d+)?)(°C|℃|度|摄氏度)') 21 | 22 | 23 | def replace_temperature(match) -> str: 24 | """ 25 | Args: 26 | match (re.Match) 27 | Returns: 28 | str 29 | """ 30 | sign = match.group(1) 31 | temperature = match.group(2) 32 | unit = match.group(4) 33 | sign: str = "零下" if sign else "" 34 | temperature: str = num2str(temperature) 35 | unit: str = "摄氏度" if unit == "摄氏度" else "度" 36 | result = f"{sign}{temperature}{unit}" 37 | return result 38 | -------------------------------------------------------------------------------- /textfrontend/zh_normalization/text_normlization.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import re 15 | from typing import List 16 | 17 | from .char_convert import tranditional_to_simplified 18 | from .chronology import RE_DATE 19 | from .chronology import RE_DATE2 20 | from .chronology import RE_TIME 21 | from .chronology import RE_TIME_RANGE 22 | from .chronology import replace_date 23 | from .chronology import replace_date2 24 | from .chronology import replace_time 25 | from .constants import F2H_ASCII_LETTERS 26 | from .constants import F2H_DIGITS 27 | from .constants import F2H_SPACE 28 | from .num import RE_DECIMAL_NUM 29 | from .num import RE_DEFAULT_NUM 30 | from .num import RE_FRAC 31 | from .num import RE_INTEGER 32 | from .num import RE_NUMBER 33 | from .num import RE_PERCENTAGE 34 | from .num import RE_POSITIVE_QUANTIFIERS 35 | from .num import RE_RANGE 36 | from .num import replace_default_num 37 | from .num import replace_frac 38 | from .num import replace_negative_num 39 | from .num import replace_number 40 | from .num import replace_percentage 41 | from .num import replace_positive_quantifier 42 | from .num import replace_range 43 | from .phonecode import RE_MOBILE_PHONE 44 | from .phonecode import RE_NATIONAL_UNIFORM_NUMBER 45 | from .phonecode import RE_TELEPHONE 46 | from .phonecode import replace_mobile 47 | from .phonecode import replace_phone 48 | from .quantifier import RE_TEMPERATURE 49 | from .quantifier import replace_temperature 50 | 51 | 52 | class TextNormalizer(): 53 | def __init__(self): 54 | self.SENTENCE_SPLITOR = re.compile(r'([:、,;。?!,;?!][”’]?)') 55 | 56 | def _split(self, text: str, lang="zh") -> List[str]: 57 | """Split long text into sentences with sentence-splitting punctuations. 58 | Args: 59 | text (str): The input text. 60 | Returns: 61 | List[str]: Sentences.
62 | """ 63 | # Only for pure Chinese here 64 | if lang == "zh": 65 | text = text.replace(" ", "") 66 | # 过滤掉特殊字符 67 | text = re.sub(r'[《》【】<=>{}()()#&@“”^_|…\\]', '', text) 68 | text = self.SENTENCE_SPLITOR.sub(r'\1\n', text) 69 | text = text.strip() 70 | sentences = [sentence.strip() for sentence in re.split(r'\n+', text)] 71 | return sentences 72 | 73 | def _post_replace(self, sentence: str) -> str: 74 | sentence = sentence.replace('/', '每') 75 | sentence = sentence.replace('~', '至') 76 | 77 | return sentence 78 | 79 | def normalize_sentence(self, sentence: str) -> str: 80 | # basic character conversions 81 | sentence = tranditional_to_simplified(sentence) 82 | sentence = sentence.translate(F2H_ASCII_LETTERS).translate( 83 | F2H_DIGITS).translate(F2H_SPACE) 84 | 85 | # number related NSW verbalization 86 | sentence = RE_DATE.sub(replace_date, sentence) 87 | sentence = RE_DATE2.sub(replace_date2, sentence) 88 | 89 | # range first 90 | sentence = RE_TIME_RANGE.sub(replace_time, sentence) 91 | sentence = RE_TIME.sub(replace_time, sentence) 92 | 93 | sentence = RE_TEMPERATURE.sub(replace_temperature, sentence) 94 | sentence = RE_FRAC.sub(replace_frac, sentence) 95 | sentence = RE_PERCENTAGE.sub(replace_percentage, sentence) 96 | sentence = RE_MOBILE_PHONE.sub(replace_mobile, sentence) 97 | 98 | sentence = RE_TELEPHONE.sub(replace_phone, sentence) 99 | sentence = RE_NATIONAL_UNIFORM_NUMBER.sub(replace_phone, sentence) 100 | 101 | sentence = RE_RANGE.sub(replace_range, sentence) 102 | sentence = RE_INTEGER.sub(replace_negative_num, sentence) 103 | sentence = RE_DECIMAL_NUM.sub(replace_number, sentence) 104 | sentence = RE_POSITIVE_QUANTIFIERS.sub(replace_positive_quantifier, 105 | sentence) 106 | sentence = RE_DEFAULT_NUM.sub(replace_default_num, sentence) 107 | sentence = RE_NUMBER.sub(replace_number, sentence) 108 | sentence = self._post_replace(sentence) 109 | 110 | return sentence 111 | 112 | def normalize(self, text: str) -> List[str]: 113 | sentences = self._split(text) 114 | 115 | sentences = [self.normalize_sentence(sent) for sent in sentences] 116 | return sentences 117 | --------------------------------------------------------------------------------