├── .gitignore ├── README.md ├── __init__.py ├── basic_class.py ├── basic_constant.py ├── basic_util.py ├── cardinal.py ├── date.py ├── digit.py ├── fraction.py ├── money.py ├── percentage.py ├── telephone.py └── text.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # JetBrains PyCharm 107 | .idea 108 | 109 | # Customize 110 | references 111 | url.txt 112 | 113 | # Git 114 | .git 115 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Chn Text Norm 2 | 3 | this is a repository for chinese text normalization (no longer maintained). 4 | 5 | ## Quick Start ## 6 | 7 | ### Git Clone Repo ### 8 | 9 | git clone this repo to the root directory of your project which need to use it. 10 | 11 | cd /path/to/proj 12 | git clone https://github.com/Joee1995/chn-text-norm.git 13 | 14 | after that, your doc tree should be: 15 | ``` 16 | proj # root of your project 17 | |--- chn_text_norm # this chn-text-norm tool 18 | |--- text.py 19 | |--- ... 20 | |--- text_normalize.py # your text normalization code 21 | |--- ... 22 | ``` 23 | 24 | ### How to Use ? ### 25 | 26 | # text_normalize.py 27 | from chn_text_norm.text import * 28 | 29 | raw_text = 'your raw text' 30 | text = Text(raw_text=raw_text).normalize() 31 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atomicoo/chn_text_norm/8210575e9a4fddb409e5a76b922bf6c30f5833c5/__init__.py -------------------------------------------------------------------------------- /basic_class.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """基本类 3 | 中文字符类 4 | 中文数字/数位类 5 | 中文数字类 6 | 中文数位类 7 | 中文数字系统类 8 | 中文数学符号类 9 | *中文其他符号类 10 | """ 11 | 12 | __author__ = 'Zhiyang Zhou ' 13 | __data__ = '2019-05-02' 14 | 15 | from chn_text_norm.basic_constant import NUMBERING_TYPES 16 | 17 | 18 | class ChineseChar(object): 19 | """ 20 | 中文字符 21 | 每个字符对应简体和繁体, 22 | e.g. 简体 = '负', 繁体 = '負' 23 | 转换时可转换为简体或繁体 24 | """ 25 | 26 | def __init__(self, simplified, traditional): 27 | self.simplified = simplified 28 | self.traditional = traditional 29 | self.__repr__ = self.__str__ 30 | 31 | def __str__(self): 32 | return self.simplified or self.traditional or None 33 | 34 | def __repr__(self): 35 | return self.__str__() 36 | 37 | 38 | class ChineseNumberUnit(ChineseChar): 39 | """ 40 | 中文数字/数位字符 41 | 每个字符除繁简体外还有一个额外的大写字符 42 | e.g. '陆' 和 '陸' 43 | """ 44 | 45 | def __init__(self, power, simplified, traditional, big_s, big_t): 46 | super(ChineseNumberUnit, self).__init__(simplified, traditional) 47 | self.power = power 48 | self.big_s = big_s 49 | self.big_t = big_t 50 | 51 | def __str__(self): 52 | return '10^{}'.format(self.power) 53 | 54 | @classmethod 55 | def create(cls, index, value, numbering_type=NUMBERING_TYPES[1], small_unit=False): 56 | 57 | if small_unit: 58 | return ChineseNumberUnit(power=index + 1, 59 | simplified=value[0], traditional=value[1], big_s=value[1], big_t=value[1]) 60 | elif numbering_type == NUMBERING_TYPES[0]: 61 | return ChineseNumberUnit(power=index + 8, 62 | simplified=value[0], traditional=value[1], big_s=value[0], big_t=value[1]) 63 | elif numbering_type == NUMBERING_TYPES[1]: 64 | return ChineseNumberUnit(power=(index + 2) * 4, 65 | simplified=value[0], traditional=value[1], big_s=value[0], big_t=value[1]) 66 | elif numbering_type == NUMBERING_TYPES[2]: 67 | return ChineseNumberUnit(power=pow(2, index + 3), 68 | simplified=value[0], traditional=value[1], big_s=value[0], big_t=value[1]) 69 | else: 70 | raise ValueError( 71 | 'Counting type should be in {0} ({1} provided).'.format(NUMBERING_TYPES, numbering_type)) 72 | 73 | 74 | class ChineseNumberDigit(ChineseChar): 75 | """ 76 | 中文数字字符 77 | """ 78 | 79 | def __init__(self, value, simplified, traditional, big_s, big_t, alt_s=None, alt_t=None): 80 | super(ChineseNumberDigit, self).__init__(simplified, traditional) 81 | self.value = value 82 | self.big_s = big_s 83 | self.big_t = big_t 84 | self.alt_s = alt_s 85 | self.alt_t = alt_t 86 | 87 | def __str__(self): 88 | return str(self.value) 89 | 90 | @classmethod 91 | def create(cls, i, v): 92 | return ChineseNumberDigit(i, v[0], v[1], v[2], v[3]) 93 | 94 | 95 | class ChineseMath(ChineseChar): 96 | """ 97 | 中文数位字符 98 | """ 99 | 100 | def __init__(self, simplified, traditional, symbol, expression=None): 101 | super(ChineseMath, self).__init__(simplified, traditional) 102 | self.symbol = symbol 103 | self.expression = expression 104 | self.big_s = simplified 105 | self.big_t = traditional 106 | 107 | 108 | CC, CNU, CND, CM = ChineseChar, ChineseNumberUnit, ChineseNumberDigit, ChineseMath 109 | 110 | 111 | class NumberSystem(object): 112 | """ 113 | 中文数字系统 114 | """ 115 | pass 116 | 117 | 118 | class MathSymbol(object): 119 | """ 120 | 用于中文数字系统的数学符号 (繁/简体), e.g. 121 | positive = ['正', '正'] 122 | negative = ['负', '負'] 123 | point = ['点', '點'] 124 | """ 125 | 126 | def __init__(self, positive, negative, point): 127 | self.positive = positive 128 | self.negative = negative 129 | self.point = point 130 | 131 | def __iter__(self): 132 | for v in self.__dict__.values(): 133 | yield v 134 | 135 | 136 | # class OtherSymbol(object): 137 | # """ 138 | # 其他符号 139 | # """ 140 | # 141 | # def __init__(self, sil): 142 | # self.sil = sil 143 | # 144 | # def __iter__(self): 145 | # for v in self.__dict__.values(): 146 | # yield v 147 | 148 | -------------------------------------------------------------------------------- /basic_constant.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """基本常量 3 | 中文数字/数位/符号字符常量 4 | """ 5 | 6 | __author__ = 'Zhiyang Zhou ' 7 | __data__ = '2019-05-02' 8 | 9 | CHINESE_DIGIS = u'零一二三四五六七八九' 10 | BIG_CHINESE_DIGIS_SIMPLIFIED = u'零壹贰叁肆伍陆柒捌玖' 11 | BIG_CHINESE_DIGIS_TRADITIONAL = u'零壹貳參肆伍陸柒捌玖' 12 | SMALLER_BIG_CHINESE_UNITS_SIMPLIFIED = u'十百千万' 13 | SMALLER_BIG_CHINESE_UNITS_TRADITIONAL = u'拾佰仟萬' 14 | LARGER_CHINESE_NUMERING_UNITS_SIMPLIFIED = u'亿兆京垓秭穰沟涧正载' 15 | LARGER_CHINESE_NUMERING_UNITS_TRADITIONAL = u'億兆京垓秭穰溝澗正載' 16 | SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED = u'十百千万' 17 | SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL = u'拾佰仟萬' 18 | 19 | ZERO_ALT = u'〇' 20 | ONE_ALT = u'幺' 21 | TWO_ALTS = [u'两', u'兩'] 22 | 23 | POSITIVE = [u'正', u'正'] 24 | NEGATIVE = [u'负', u'負'] 25 | POINT = [u'点', u'點'] 26 | # PLUS = [u'加', u'加'] 27 | # SIL = [u'杠', u'槓'] 28 | 29 | # 中文数字系统类型 30 | NUMBERING_TYPES = ['low', 'mid', 'high'] 31 | -------------------------------------------------------------------------------- /basic_util.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """基本方法 3 | 创建中文数字系统 方法 4 | 中文字符串 <=> 数字串 方法 5 | 数字串 <=> 中文字符串 方法 6 | """ 7 | 8 | __author__ = 'Zhiyang Zhou ' 9 | __data__ = '2019-05-02' 10 | 11 | from chn_text_norm.basic_constant import * 12 | from chn_text_norm.basic_class import * 13 | 14 | 15 | def create_system(numbering_type=NUMBERING_TYPES[1]): 16 | """ 17 | 根据数字系统类型返回创建相应的数字系统,默认为 mid 18 | NUMBERING_TYPES = ['low', 'mid', 'high']: 中文数字系统类型 19 | low: '兆' = '亿' * '十' = $10^{9}$, '京' = '兆' * '十', etc. 20 | mid: '兆' = '亿' * '万' = $10^{12}$, '京' = '兆' * '万', etc. 21 | high: '兆' = '亿' * '亿' = $10^{16}$, '京' = '兆' * '兆', etc. 22 | 返回对应的数字系统 23 | """ 24 | 25 | # chinese number units of '亿' and larger 26 | all_larger_units = zip( 27 | LARGER_CHINESE_NUMERING_UNITS_SIMPLIFIED, LARGER_CHINESE_NUMERING_UNITS_TRADITIONAL) 28 | larger_units = [CNU.create(i, v, numbering_type, False) 29 | for i, v in enumerate(all_larger_units)] 30 | # chinese number units of '十, 百, 千, 万' 31 | all_smaller_units = zip( 32 | SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED, SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL) 33 | smaller_units = [CNU.create(i, v, small_unit=True) 34 | for i, v in enumerate(all_smaller_units)] 35 | # digis 36 | chinese_digis = zip(CHINESE_DIGIS, CHINESE_DIGIS, 37 | BIG_CHINESE_DIGIS_SIMPLIFIED, BIG_CHINESE_DIGIS_TRADITIONAL) 38 | digits = [CND.create(i, v) for i, v in enumerate(chinese_digis)] 39 | digits[0].alt_s, digits[0].alt_t = ZERO_ALT, ZERO_ALT 40 | digits[1].alt_s, digits[1].alt_t = ONE_ALT, ONE_ALT 41 | digits[2].alt_s, digits[2].alt_t = TWO_ALTS[0], TWO_ALTS[1] 42 | 43 | # symbols 44 | positive_cn = CM(POSITIVE[0], POSITIVE[1], '+', lambda x: x) 45 | negative_cn = CM(NEGATIVE[0], NEGATIVE[1], '-', lambda x: -x) 46 | point_cn = CM(POINT[0], POINT[1], '.', lambda x, 47 | y: float(str(x) + '.' + str(y))) 48 | # sil_cn = CM(SIL[0], SIL[1], '-', lambda x, y: float(str(x) + '-' + str(y))) 49 | system = NumberSystem() 50 | system.units = smaller_units + larger_units 51 | system.digits = digits 52 | system.math = MathSymbol(positive_cn, negative_cn, point_cn) 53 | # system.symbols = OtherSymbol(sil_cn) 54 | return system 55 | 56 | 57 | def chn2num(chinese_string, numbering_type=NUMBERING_TYPES[1]): 58 | 59 | def get_symbol(char, system): 60 | for u in system.units: 61 | if char in [u.traditional, u.simplified, u.big_s, u.big_t]: 62 | return u 63 | for d in system.digits: 64 | if char in [d.traditional, d.simplified, d.big_s, d.big_t, d.alt_s, d.alt_t]: 65 | return d 66 | for m in system.math: 67 | if char in [m.traditional, m.simplified]: 68 | return m 69 | 70 | def string2symbols(chinese_string, system): 71 | int_string, dec_string = chinese_string, '' 72 | for p in [system.math.point.simplified, system.math.point.traditional]: 73 | if p in chinese_string: 74 | int_string, dec_string = chinese_string.split(p) 75 | break 76 | return [get_symbol(c, system) for c in int_string], \ 77 | [get_symbol(c, system) for c in dec_string] 78 | 79 | def correct_symbols(integer_symbols, system): 80 | """ 81 | 一百八 to 一百八十 82 | 一亿一千三百万 to 一亿 一千万 三百万 83 | """ 84 | 85 | if integer_symbols and isinstance(integer_symbols[0], CNU): 86 | if integer_symbols[0].power == 1: 87 | integer_symbols = [system.digits[1]] + integer_symbols 88 | 89 | if len(integer_symbols) > 1: 90 | if isinstance(integer_symbols[-1], CND) and isinstance(integer_symbols[-2], CNU): 91 | integer_symbols.append( 92 | CNU(integer_symbols[-2].power - 1, None, None, None, None)) 93 | 94 | result = [] 95 | unit_count = 0 96 | for s in integer_symbols: 97 | if isinstance(s, CND): 98 | result.append(s) 99 | unit_count = 0 100 | elif isinstance(s, CNU): 101 | current_unit = CNU(s.power, None, None, None, None) 102 | unit_count += 1 103 | 104 | if unit_count == 1: 105 | result.append(current_unit) 106 | elif unit_count > 1: 107 | for i in range(len(result)): 108 | if isinstance(result[-i - 1], CNU) and result[-i - 1].power < current_unit.power: 109 | result[-i - 1] = CNU(result[-i - 1].power + 110 | current_unit.power, None, None, None, None) 111 | return result 112 | 113 | def compute_value(integer_symbols): 114 | """ 115 | Compute the value. 116 | When current unit is larger than previous unit, current unit * all previous units will be used as all previous units. 117 | e.g. '两千万' = 2000 * 10000 not 2000 + 10000 118 | """ 119 | value = [0] 120 | last_power = 0 121 | for s in integer_symbols: 122 | if isinstance(s, CND): 123 | value[-1] = s.value 124 | elif isinstance(s, CNU): 125 | value[-1] *= pow(10, s.power) 126 | if s.power > last_power: 127 | value[:-1] = list(map(lambda v: v * 128 | pow(10, s.power), value[:-1])) 129 | last_power = s.power 130 | value.append(0) 131 | return sum(value) 132 | 133 | system = create_system(numbering_type) 134 | int_part, dec_part = string2symbols(chinese_string, system) 135 | int_part = correct_symbols(int_part, system) 136 | int_str = str(compute_value(int_part)) 137 | dec_str = ''.join([str(d.value) for d in dec_part]) 138 | if dec_part: 139 | return '{0}.{1}'.format(int_str, dec_str) 140 | else: 141 | return int_str 142 | 143 | 144 | def num2chn(number_string, numbering_type=NUMBERING_TYPES[1], big=False, 145 | traditional=False, alt_zero=False, alt_one=False, alt_two=True, 146 | use_zeros=True, use_units=True): 147 | 148 | def get_value(value_string, use_zeros=True): 149 | 150 | striped_string = value_string.lstrip('0') 151 | 152 | # record nothing if all zeros 153 | if not striped_string: 154 | return [] 155 | 156 | # record one digits 157 | elif len(striped_string) == 1: 158 | if use_zeros and len(value_string) != len(striped_string): 159 | return [system.digits[0], system.digits[int(striped_string)]] 160 | else: 161 | return [system.digits[int(striped_string)]] 162 | 163 | # recursively record multiple digits 164 | else: 165 | result_unit = next(u for u in reversed( 166 | system.units) if u.power < len(striped_string)) 167 | result_string = value_string[:-result_unit.power] 168 | return get_value(result_string) + [result_unit] + get_value(striped_string[-result_unit.power:]) 169 | 170 | system = create_system(numbering_type) 171 | 172 | int_dec = number_string.split('.') 173 | if len(int_dec) == 1: 174 | int_string = int_dec[0] 175 | dec_string = "" 176 | elif len(int_dec) == 2: 177 | int_string = int_dec[0] 178 | dec_string = int_dec[1] 179 | else: 180 | raise ValueError( 181 | "invalid input num string with more than one dot: {}".format(number_string)) 182 | 183 | if use_units and len(int_string) > 1: 184 | result_symbols = get_value(int_string) 185 | else: 186 | result_symbols = [system.digits[int(c)] for c in int_string] 187 | dec_symbols = [system.digits[int(c)] for c in dec_string] 188 | if dec_string: 189 | result_symbols += [system.math.point] + dec_symbols 190 | 191 | if alt_two: 192 | liang = CND(2, system.digits[2].alt_s, system.digits[2].alt_t, 193 | system.digits[2].big_s, system.digits[2].big_t) 194 | for i, v in enumerate(result_symbols): 195 | if isinstance(v, CND) and v.value == 2: 196 | next_symbol = result_symbols[i + 197 | 1] if i < len(result_symbols) - 1 else None 198 | previous_symbol = result_symbols[i - 1] if i > 0 else None 199 | if isinstance(next_symbol, CNU) and isinstance(previous_symbol, (CNU, type(None))): 200 | if next_symbol.power != 1 and ((previous_symbol is None) or (previous_symbol.power != 1)): 201 | result_symbols[i] = liang 202 | 203 | # if big is True, '两' will not be used and `alt_two` has no impact on output 204 | if big: 205 | attr_name = 'big_' 206 | if traditional: 207 | attr_name += 't' 208 | else: 209 | attr_name += 's' 210 | else: 211 | if traditional: 212 | attr_name = 'traditional' 213 | else: 214 | attr_name = 'simplified' 215 | 216 | result = ''.join([getattr(s, attr_name) for s in result_symbols]) 217 | 218 | # if not use_zeros: 219 | # result = result.strip(getattr(system.digits[0], attr_name)) 220 | 221 | if alt_zero: 222 | result = result.replace( 223 | getattr(system.digits[0], attr_name), system.digits[0].alt_s) 224 | 225 | if alt_one: 226 | result = result.replace( 227 | getattr(system.digits[1], attr_name), system.digits[1].alt_s) 228 | 229 | for i, p in enumerate(POINT): 230 | if result.startswith(p): 231 | return CHINESE_DIGIS[0] + result 232 | 233 | # ^10, 11, .., 19 234 | if len(result) >= 2 and result[1] in [SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED[0], 235 | SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL[0]] and \ 236 | result[0] in [CHINESE_DIGIS[1], BIG_CHINESE_DIGIS_SIMPLIFIED[1], BIG_CHINESE_DIGIS_TRADITIONAL[1]]: 237 | result = result[1:] 238 | 239 | return result 240 | 241 | 242 | if __name__ == '__main__': 243 | 244 | # 测试程序 245 | all_chinese_number_string = ( 246 | CHINESE_DIGIS + BIG_CHINESE_DIGIS_SIMPLIFIED + BIG_CHINESE_DIGIS_TRADITIONAL + 247 | LARGER_CHINESE_NUMERING_UNITS_SIMPLIFIED + LARGER_CHINESE_NUMERING_UNITS_TRADITIONAL + 248 | SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED + SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL + ZERO_ALT + 249 | ONE_ALT + ''.join(TWO_ALTS + POSITIVE + NEGATIVE + POINT)) 250 | 251 | print('num:', chn2num('一万零四百零三点八零五')) 252 | print('num:', chn2num('一亿六点三')) 253 | print('num:', chn2num('一亿零六点三')) 254 | print('num:', chn2num('两千零一亿六点三')) 255 | # print('num:', chn2num('一零零八六')) 256 | print('txt:', num2chn('10260.03', alt_zero=True)) 257 | print('txt:', num2chn('20037.090', numbering_type='low', traditional=True)) 258 | print('txt:', num2chn('100860001.77', numbering_type='high', big=True)) 259 | print('txt:', num2chn('059523810880', alt_one=True, alt_two=False, use_lzeros=True, use_rzeros=True, use_units=False)) 260 | 261 | print(all_chinese_number_string) 262 | -------------------------------------------------------------------------------- /cardinal.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """CARDINAL类 (包含小数DECIMAL类) 3 | 纯数 <=> 中文字符串 方法 4 | 中文字符串 <=> 纯数 方法 5 | """ 6 | 7 | __author__ = 'Zhiyang Zhou ' 8 | __data__ = '2019-05-03' 9 | 10 | from chn_text_norm.basic_util import * 11 | 12 | 13 | class Cardinal: 14 | """ 15 | CARDINAL类 16 | """ 17 | 18 | def __init__(self, cardinal=None, chntext=None): 19 | self.cardinal = cardinal 20 | self.chntext = chntext 21 | 22 | def chntext2cardinal(self): 23 | return chn2num(self.chntext) 24 | 25 | def cardinal2chntext(self): 26 | return num2chn(self.cardinal) 27 | 28 | 29 | if __name__ == '__main__': 30 | 31 | # 测试程序 32 | print(Cardinal(cardinal='21357.230').cardinal2chntext()) 33 | 34 | -------------------------------------------------------------------------------- /date.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """DATE类 3 | 日期 <=> 中文字符串 方法 4 | 中文字符串 <=> 日期 方法 5 | """ 6 | 7 | __author__ = 'Zhiyang Zhou ' 8 | __data__ = '2019-05-07' 9 | 10 | from chn_text_norm.cardinal import Cardinal 11 | from chn_text_norm.digit import Digit 12 | 13 | 14 | class Date: 15 | """ 16 | DATE类 17 | """ 18 | 19 | def __init__(self, date=None, chntext=None): 20 | self.date = date 21 | self.chntext = chntext 22 | 23 | # def chntext2date(self): 24 | # chntext = self.chntext 25 | # try: 26 | # year, other = chntext.strip().split('年', maxsplit=1) 27 | # year = Digit(chntext=year).digit2chntext() + '年' 28 | # except ValueError: 29 | # other = chntext 30 | # year = '' 31 | # if other: 32 | # try: 33 | # month, day = other.strip().split('月', maxsplit=1) 34 | # month = Cardinal(chntext=month).chntext2cardinal() + '月' 35 | # except ValueError: 36 | # day = chntext 37 | # month = '' 38 | # if day: 39 | # day = Cardinal(chntext=day[:-1]).chntext2cardinal() + day[-1] 40 | # else: 41 | # month = '' 42 | # day = '' 43 | # date = year + month + day 44 | # self.date = date 45 | # return self.date 46 | 47 | def date2chntext(self): 48 | date = self.date 49 | try: 50 | year, other = date.strip().split('年', maxsplit=1) 51 | year = Digit(digit=year).digit2chntext() + '年' 52 | except ValueError: 53 | other = date 54 | year = '' 55 | if other: 56 | try: 57 | month, day = other.strip().split('月', maxsplit=1) 58 | month = Cardinal(cardinal=month).cardinal2chntext() + '月' 59 | except ValueError: 60 | day = date 61 | month = '' 62 | if day: 63 | day = Cardinal(cardinal=day[:-1]).cardinal2chntext() + day[-1] 64 | else: 65 | month = '' 66 | day = '' 67 | chntext = year + month + day 68 | self.chntext = chntext 69 | return self.chntext 70 | 71 | 72 | if __name__ == '__main__': 73 | 74 | # 测试 75 | print(Date(date='09年3月16日').date2chntext()) 76 | 77 | -------------------------------------------------------------------------------- /digit.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """DIGIT类 3 | 数字串 <=> 中文字符串 方法 4 | 中文字符串 <=> 数字串 方法 5 | """ 6 | 7 | __author__ = 'Zhiyang Zhou ' 8 | __data__ = '2019-05-03' 9 | 10 | from chn_text_norm.basic_util import * 11 | 12 | 13 | class Digit: 14 | """ 15 | DIGIT类 16 | """ 17 | 18 | def __init__(self, digit=None, chntext=None): 19 | self.digit = digit 20 | self.chntext = chntext 21 | 22 | # def chntext2digit(self): 23 | # return chn2num(self.chntext) 24 | 25 | def digit2chntext(self): 26 | return num2chn(self.digit, alt_two=False, use_units=False) 27 | 28 | 29 | if __name__ == '__main__': 30 | 31 | # 测试程序 32 | print(Digit(digit='2016').digit2chntext()) 33 | 34 | -------------------------------------------------------------------------------- /fraction.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """FRACTION类 3 | 分数 <=> 中文字符串 方法 4 | 中文字符串 <=> 分数 方法 5 | """ 6 | 7 | __author__ = 'Zhiyang Zhou ' 8 | __data__ = '2019-05-03' 9 | 10 | from chn_text_norm.basic_util import * 11 | 12 | 13 | class Fraction: 14 | """ 15 | FRACTION类 16 | """ 17 | 18 | def __init__(self, fraction=None, chntext=None): 19 | self.fraction = fraction 20 | self.chntext = chntext 21 | 22 | def chntext2fraction(self): 23 | denominator, numerator = self.chntext.split('分之') 24 | return chn2num(numerator) + '/' + chn2num(denominator) 25 | 26 | def fraction2chntext(self): 27 | numerator, denominator = self.fraction.split('/') 28 | return num2chn(denominator) + '分之' + num2chn(numerator) 29 | 30 | 31 | if __name__ == '__main__': 32 | 33 | # 测试程序 34 | print(Fraction(fraction='2135/7230').fraction2chntext()) 35 | print(Fraction(chntext='五百八十一分之三百六十九').chntext2fraction()) 36 | 37 | -------------------------------------------------------------------------------- /money.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """MONEY类 3 | 金钱 <=> 中文字符串 方法 4 | 中文字符串 <=> 金钱 方法 5 | """ 6 | import re 7 | 8 | __author__ = 'Zhiyang Zhou ' 9 | __data__ = '2019-05-08' 10 | 11 | from chn_text_norm.cardinal import Cardinal 12 | 13 | 14 | class Money: 15 | """ 16 | MONEY类 17 | """ 18 | 19 | def __init__(self, money=None, chntext=None): 20 | self.money = money 21 | self.chntext = chntext 22 | 23 | # def chntext2money(self): 24 | # return self.money 25 | 26 | def money2chntext(self): 27 | money = self.money 28 | pattern = re.compile(r'(\d+(\.\d+)?)') 29 | matchers = pattern.findall(money) 30 | if matchers: 31 | for matcher in matchers: 32 | money = money.replace(matcher[0], Cardinal(cardinal=matcher[0]).cardinal2chntext()) 33 | self.chntext = money 34 | return self.chntext 35 | 36 | 37 | if __name__ == '__main__': 38 | 39 | # 测试 40 | print(Money(money='21.5万元').money2chntext()) 41 | print(Money(money='230块5毛').money2chntext()) 42 | 43 | -------------------------------------------------------------------------------- /percentage.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """PERCENTAGE类 3 | 百分数 <=> 中文字符串 方法 4 | 中文字符串 <=> 百分数 方法 5 | """ 6 | 7 | __author__ = 'Zhiyang Zhou ' 8 | __data__ = '2019-05-06' 9 | 10 | from chn_text_norm.basic_util import * 11 | 12 | 13 | class Percentage: 14 | """ 15 | PERCENTAGE类 16 | """ 17 | 18 | def __init__(self, percentage=None, chntext=None): 19 | self.percentage = percentage 20 | self.chntext = chntext 21 | 22 | def chntext2percentage(self): 23 | return chn2num(self.chntext.strip().strip('百分之')) + '%' 24 | 25 | def percentage2chntext(self): 26 | return '百分之' + num2chn(self.percentage.strip().strip('%')) 27 | 28 | 29 | if __name__ == '__main__': 30 | 31 | # 测试程序 32 | print(Percentage(chntext='百分之五十六点零三').chntext2percentage()) 33 | print(Percentage(percentage='65.3%').percentage2chntext()) 34 | 35 | -------------------------------------------------------------------------------- /telephone.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """TELEPHONE类 3 | 电话号码 <=> 中文字符串 方法 4 | 中文字符串 <=> 电话号码 方法 5 | """ 6 | 7 | __author__ = 'Zhiyang Zhou ' 8 | __data__ = '2019-05-03' 9 | 10 | from chn_text_norm.basic_util import * 11 | 12 | 13 | class TelePhone: 14 | """ 15 | TELEPHONE类 16 | """ 17 | 18 | def __init__(self, telephone=None, raw_chntext=None, chntext=None): 19 | self.telephone = telephone 20 | self.raw_chntext = raw_chntext 21 | self.chntext = chntext 22 | 23 | # def chntext2telephone(self): 24 | # sil_parts = self.raw_chntext.split('') 25 | # self.telephone = '-'.join([ 26 | # str(chn2num(p)) for p in sil_parts 27 | # ]) 28 | # return self.telephone 29 | 30 | def telephone2chntext(self, fixed=False): 31 | 32 | if fixed: 33 | sil_parts = self.telephone.split('-') 34 | self.raw_chntext = ''.join([ 35 | num2chn(part, alt_two=False, use_units=False) for part in sil_parts 36 | ]) 37 | self.chntext = self.raw_chntext.replace('', '') 38 | else: 39 | sp_parts = self.telephone.strip('+').split() 40 | self.raw_chntext = ''.join([ 41 | num2chn(part, alt_two=False, use_units=False) for part in sp_parts 42 | ]) 43 | self.chntext = self.raw_chntext.replace('', '') 44 | return self.chntext 45 | 46 | 47 | if __name__ == '__main__': 48 | 49 | # 测试程序 50 | print(TelePhone(telephone='0595-23980880').telephone2chntext()) 51 | # print(TelePhone(raw_chntext='零五九五杠二三八六五零九八').chntext2telephone()) 52 | -------------------------------------------------------------------------------- /text.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | TEXT类 4 | """ 5 | 6 | __author__ = 'Zhiyang Zhou ' 7 | __data__ = '2019-05-03' 8 | 9 | import re 10 | 11 | from chn_text_norm.cardinal import Cardinal 12 | from chn_text_norm.digit import Digit 13 | from chn_text_norm.telephone import TelePhone 14 | from chn_text_norm.fraction import Fraction 15 | from chn_text_norm.date import Date 16 | from chn_text_norm.money import Money 17 | from chn_text_norm.percentage import Percentage 18 | 19 | CURRENCY_NAMES = '(人民币|美元|日元|英镑|欧元|马克|法郎|加拿大元|澳元|港币|先令|芬兰马克|爱尔兰镑|' \ 20 | '里拉|荷兰盾|埃斯库多|比塞塔|印尼盾|林吉特|新西兰元|比索|卢布|新加坡元|韩元|泰铢)' 21 | CURRENCY_UNITS = '((亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|)元|(亿|千万|百万|万|千|百|)块|角|毛|分)' 22 | COM_QUANTIFIERS = '(匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|' \ 23 | '砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|' \ 24 | '针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|' \ 25 | '毫|厘|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|' \ 26 | '盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|旬|' \ 27 | '纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块)' 28 | 29 | 30 | class Text: 31 | """ 32 | Text类 33 | """ 34 | 35 | def __init__(self, raw_text, norm_text=None): 36 | self.raw_text = '^' + raw_text + '$' 37 | self.norm_text = norm_text 38 | 39 | def _particular(self): 40 | text = self.norm_text 41 | pattern = re.compile(r"(([a-zA-Z]+)二([a-zA-Z]+))") 42 | matchers = pattern.findall(text) 43 | if matchers: 44 | # print('particular') 45 | for matcher in matchers: 46 | text = text.replace(matcher[0], matcher[1]+'2'+matcher[2], 1) 47 | self.norm_text = text 48 | return self.norm_text 49 | 50 | def normalize(self): 51 | text = self.raw_text 52 | 53 | # 规范化日期 54 | pattern = re.compile(r"\D+((([089]\d|(19|20)\d{2})年)?(\d{1,2}月(\d{1,2}[日号])?)?)") 55 | matchers = pattern.findall(text) 56 | if matchers: 57 | # print('date') 58 | for matcher in matchers: 59 | text = text.replace(matcher[0], Date(date=matcher[0]).date2chntext(), 1) 60 | 61 | # 规范化金钱 62 | pattern = re.compile(r"\D+((\d+(\.\d+)?)[多余几]?" + CURRENCY_UNITS + "(\d" + CURRENCY_UNITS + "?)?)") 63 | matchers = pattern.findall(text) 64 | if matchers: 65 | # print('money') 66 | for matcher in matchers: 67 | text = text.replace(matcher[0], Money(money=matcher[0]).money2chntext(), 1) 68 | 69 | # 规范化固话/手机号码 70 | # 手机 71 | # http://www.jihaoba.com/news/show/13680 72 | # 移动:139、138、137、136、135、134、159、158、157、150、151、152、188、187、182、183、184、178、198 73 | # 联通:130、131、132、156、155、186、185、176 74 | # 电信:133、153、189、180、181、177 75 | pattern = re.compile(r"\D((\+?86 ?)?1([38]\d|5[0-35-9]|7[678]|9[89])\d{8})\D") 76 | matchers = pattern.findall(text) 77 | if matchers: 78 | # print('telephone') 79 | for matcher in matchers: 80 | text = text.replace(matcher[0], TelePhone(telephone=matcher[0]).telephone2chntext(), 1) 81 | # 固话 82 | pattern = re.compile(r"\D((0(10|2[1-3]|[3-9]\d{2})-?)?[1-9]\d{6,7})\D") 83 | matchers = pattern.findall(text) 84 | if matchers: 85 | # print('fixed telephone') 86 | for matcher in matchers: 87 | text = text.replace(matcher[0], TelePhone(telephone=matcher[0]).telephone2chntext(fixed=True), 1) 88 | 89 | # 规范化分数 90 | pattern = re.compile(r"(\d+/\d+)") 91 | matchers = pattern.findall(text) 92 | if matchers: 93 | # print('fraction') 94 | for matcher in matchers: 95 | text = text.replace(matcher, Fraction(fraction=matcher).fraction2chntext(), 1) 96 | 97 | # 规范化百分数 98 | text = text.replace('%', '%') 99 | pattern = re.compile(r"(\d+(\.\d+)?%)") 100 | matchers = pattern.findall(text) 101 | if matchers: 102 | # print('percentage') 103 | for matcher in matchers: 104 | text = text.replace(matcher[0], Percentage(percentage=matcher[0]).percentage2chntext(), 1) 105 | 106 | # 规范化纯数+量词 107 | pattern = re.compile(r"(\d+(\.\d+)?)[多余几]?" + COM_QUANTIFIERS) 108 | matchers = pattern.findall(text) 109 | if matchers: 110 | # print('cardinal+quantifier') 111 | for matcher in matchers: 112 | text = text.replace(matcher[0], Cardinal(cardinal=matcher[0]).cardinal2chntext(), 1) 113 | 114 | # 规范化数字编号 115 | pattern = re.compile(r"(\d{4,32})") 116 | matchers = pattern.findall(text) 117 | if matchers: 118 | # print('digit') 119 | for matcher in matchers: 120 | text = text.replace(matcher, Digit(digit=matcher).digit2chntext(), 1) 121 | 122 | # 规范化纯数 123 | pattern = re.compile(r"(\d+(\.\d+)?)") 124 | matchers = pattern.findall(text) 125 | if matchers: 126 | # print('cardinal') 127 | for matcher in matchers: 128 | text = text.replace(matcher[0], Cardinal(cardinal=matcher[0]).cardinal2chntext(), 1) 129 | 130 | self.norm_text = text 131 | self._particular() 132 | 133 | return self.norm_text.lstrip('^').rstrip('$') 134 | 135 | 136 | if __name__ == '__main__': 137 | 138 | # 测试程序 139 | print(Text(raw_text='固话:0595-23865596或23880880。').normalize()) 140 | print(Text(raw_text='手机:+86 19859213959或15659451527。').normalize()) 141 | print(Text(raw_text='分数:32477/76391。').normalize()) 142 | print(Text(raw_text='百分数:80.03%。').normalize()) 143 | print(Text(raw_text='编号:31520181154418。').normalize()) 144 | print(Text(raw_text='纯数:2983.07克或12345.60米。').normalize()) 145 | print(Text(raw_text='日期:1999年2月20日或09年3月15号。').normalize()) 146 | print(Text(raw_text='金钱:12块5,34.5元,20.1万').normalize()) 147 | print(Text(raw_text='特殊:O2O或B2C。').normalize()) 148 | 149 | --------------------------------------------------------------------------------