├── .gitignore
├── README.md
├── __init__.py
├── basic_class.py
├── basic_constant.py
├── basic_util.py
├── cardinal.py
├── date.py
├── digit.py
├── fraction.py
├── money.py
├── percentage.py
├── telephone.py
└── text.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | *.egg-info/
 24 | .installed.cfg
 25 | *.egg
 26 | MANIFEST
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | .pytest_cache/
 49 | 
 50 | # Translations
 51 | *.mo
 52 | *.pot
 53 | 
 54 | # Django stuff:
 55 | *.log
 56 | local_settings.py
 57 | db.sqlite3
 58 | 
 59 | # Flask stuff:
 60 | instance/
 61 | .webassets-cache
 62 | 
 63 | # Scrapy stuff:
 64 | .scrapy
 65 | 
 66 | # Sphinx documentation
 67 | docs/_build/
 68 | 
 69 | # PyBuilder
 70 | target/
 71 | 
 72 | # Jupyter Notebook
 73 | .ipynb_checkpoints
 74 | 
 75 | # pyenv
 76 | .python-version
 77 | 
 78 | # celery beat schedule file
 79 | celerybeat-schedule
 80 | 
 81 | # SageMath parsed files
 82 | *.sage.py
 83 | 
 84 | # Environments
 85 | .env
 86 | .venv
 87 | env/
 88 | venv/
 89 | ENV/
 90 | env.bak/
 91 | venv.bak/
 92 | 
 93 | # Spyder project settings
 94 | .spyderproject
 95 | .spyproject
 96 | 
 97 | # Rope project settings
 98 | .ropeproject
 99 | 
100 | # mkdocs documentation
101 | /site
102 | 
103 | # mypy
104 | .mypy_cache/
105 | 
106 | # JetBrains PyCharm
107 | .idea
108 | 
109 | # Customize
110 | references
111 | url.txt
112 | 
113 | # Git
114 | .git
115 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Chn Text Norm
 2 | 
  3 | This is a repository for Chinese text normalization (no longer maintained).
 4 | 
 5 | ## Quick Start ##
 6 | 
 7 | ### Git Clone Repo ###
 8 | 
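  9 | Clone this repo into the root directory of the project that needs it. Name the checkout `chn_text_norm` (underscores, not hyphens) so that it can be imported as a Python package.
 10 | 
 11 |     cd /path/to/proj
 12 |     git clone https://github.com/Joee1995/chn-text-norm.git chn_text_norm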
13 | 
 14 | After that, your directory tree should look like this:
15 | ```
16 | proj                     # root of your project
17 | |--- chn_text_norm       # this chn-text-norm tool
18 |      |--- text.py
19 |      |--- ...
20 | |--- text_normalize.py   # your text normalization code
21 | |--- ...
22 | ```
23 | 
24 | ### How to Use ? ###
25 | 
26 |     # text_normalize.py
27 |     from chn_text_norm.text import *
28 |     
29 |     raw_text = 'your raw text'
30 |     text = Text(raw_text=raw_text).normalize()
31 | 


--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/atomicoo/chn_text_norm/8210575e9a4fddb409e5a76b922bf6c30f5833c5/__init__.py


--------------------------------------------------------------------------------
/basic_class.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """基本类
  3 | 中文字符类
  4 | 中文数字/数位类
  5 | 中文数字类
  6 | 中文数位类
  7 | 中文数字系统类
  8 | 中文数学符号类
  9 | *中文其他符号类
 10 | """
 11 | 
 12 | __author__ = 'Zhiyang Zhou <zyzhou@stu.xmu.edu.cn>'
 13 | __data__ = '2019-05-02'
 14 | 
 15 | from chn_text_norm.basic_constant import NUMBERING_TYPES
 16 | 
 17 | 
 18 | class ChineseChar(object):
 19 |     """
 20 |     中文字符
 21 |     每个字符对应简体和繁体,
 22 |     e.g. 简体 = '负', 繁体 = '負'
 23 |     转换时可转换为简体或繁体
 24 |     """
 25 | 
 26 |     def __init__(self, simplified, traditional):
 27 |         self.simplified = simplified
 28 |         self.traditional = traditional
 29 |         self.__repr__ = self.__str__
 30 | 
 31 |     def __str__(self):
 32 |         return self.simplified or self.traditional or None
 33 | 
 34 |     def __repr__(self):
 35 |         return self.__str__()
 36 | 
 37 | 
 38 | class ChineseNumberUnit(ChineseChar):
 39 |     """
 40 |     中文数字/数位字符
 41 |     每个字符除繁简体外还有一个额外的大写字符
 42 |     e.g. '陆' 和 '陸'
 43 |     """
 44 | 
 45 |     def __init__(self, power, simplified, traditional, big_s, big_t):
 46 |         super(ChineseNumberUnit, self).__init__(simplified, traditional)
 47 |         self.power = power
 48 |         self.big_s = big_s
 49 |         self.big_t = big_t
 50 | 
 51 |     def __str__(self):
 52 |         return '10^{}'.format(self.power)
 53 | 
 54 |     @classmethod
 55 |     def create(cls, index, value, numbering_type=NUMBERING_TYPES[1], small_unit=False):
 56 | 
 57 |         if small_unit:
 58 |             return ChineseNumberUnit(power=index + 1,
 59 |                                      simplified=value[0], traditional=value[1], big_s=value[1], big_t=value[1])
 60 |         elif numbering_type == NUMBERING_TYPES[0]:
 61 |             return ChineseNumberUnit(power=index + 8,
 62 |                                      simplified=value[0], traditional=value[1], big_s=value[0], big_t=value[1])
 63 |         elif numbering_type == NUMBERING_TYPES[1]:
 64 |             return ChineseNumberUnit(power=(index + 2) * 4,
 65 |                                      simplified=value[0], traditional=value[1], big_s=value[0], big_t=value[1])
 66 |         elif numbering_type == NUMBERING_TYPES[2]:
 67 |             return ChineseNumberUnit(power=pow(2, index + 3),
 68 |                                      simplified=value[0], traditional=value[1], big_s=value[0], big_t=value[1])
 69 |         else:
 70 |             raise ValueError(
 71 |                 'Counting type should be in {0} ({1} provided).'.format(NUMBERING_TYPES, numbering_type))
 72 | 
 73 | 
 74 | class ChineseNumberDigit(ChineseChar):
 75 |     """
 76 |     中文数字字符
 77 |     """
 78 | 
 79 |     def __init__(self, value, simplified, traditional, big_s, big_t, alt_s=None, alt_t=None):
 80 |         super(ChineseNumberDigit, self).__init__(simplified, traditional)
 81 |         self.value = value
 82 |         self.big_s = big_s
 83 |         self.big_t = big_t
 84 |         self.alt_s = alt_s
 85 |         self.alt_t = alt_t
 86 | 
 87 |     def __str__(self):
 88 |         return str(self.value)
 89 | 
 90 |     @classmethod
 91 |     def create(cls, i, v):
 92 |         return ChineseNumberDigit(i, v[0], v[1], v[2], v[3])
 93 | 
 94 | 
 95 | class ChineseMath(ChineseChar):
 96 |     """
 97 |     中文数位字符
 98 |     """
 99 | 
100 |     def __init__(self, simplified, traditional, symbol, expression=None):
101 |         super(ChineseMath, self).__init__(simplified, traditional)
102 |         self.symbol = symbol
103 |         self.expression = expression
104 |         self.big_s = simplified
105 |         self.big_t = traditional
106 | 
107 | 
# Short aliases for the character classes defined above.
CC, CNU, CND, CM = ChineseChar, ChineseNumberUnit, ChineseNumberDigit, ChineseMath
109 | 
110 | 
class NumberSystem(object):
    """
    A Chinese number system.

    Plain attribute container: the class defines no members of its own,
    attributes are attached to instances by the code that builds them.
    """
116 | 
117 | 
class MathSymbol(object):
    """
    Math symbols (simplified/traditional) used by the Chinese number system, e.g.
    positive = ['正', '正']
    negative = ['负', '負']
    point = ['点', '點']
    """

    def __init__(self, positive, negative, point):
        self.positive, self.negative, self.point = positive, negative, point

    def __iter__(self):
        # iterate over the stored symbols in attribute-insertion order
        yield from self.__dict__.values()
134 | 
135 | 
136 | # class OtherSymbol(object):
137 | #     """
138 | #     其他符号
139 | #     """
140 | #
141 | #     def __init__(self, sil):
142 | #         self.sil = sil
143 | #
144 | #     def __iter__(self):
145 | #         for v in self.__dict__.values():
146 | #             yield v
147 | 
148 | 


--------------------------------------------------------------------------------
/basic_constant.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """基本常量
 3 | 中文数字/数位/符号字符常量
 4 | """
 5 | 
 6 | __author__ = 'Zhiyang Zhou <zyzhou@stu.xmu.edu.cn>'
 7 | __data__ = '2019-05-02'
 8 | 
# Plain digits, indexed by value 0-9.
CHINESE_DIGIS = u'零一二三四五六七八九'
# "Big" (capital) digits, indexed by value 0-9, simplified and traditional.
BIG_CHINESE_DIGIS_SIMPLIFIED = u'零壹贰叁肆伍陆柒捌玖'
BIG_CHINESE_DIGIS_TRADITIONAL = u'零壹貳參肆伍陸柒捌玖'
# NOTE(review): same values as SMALLER_CHINESE_NUMERING_UNITS_* below; not
# referenced by the modules visible alongside this file - confirm before removing.
SMALLER_BIG_CHINESE_UNITS_SIMPLIFIED = u'十百千万'
SMALLER_BIG_CHINESE_UNITS_TRADITIONAL = u'拾佰仟萬'
# Units of '亿' and above, in increasing order.
LARGER_CHINESE_NUMERING_UNITS_SIMPLIFIED = u'亿兆京垓秭穰沟涧正载'
LARGER_CHINESE_NUMERING_UNITS_TRADITIONAL = u'億兆京垓秭穰溝澗正載'
# Units '十, 百, 千, 万' (ten to ten thousand).
SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED = u'十百千万'
SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL = u'拾佰仟萬'

# Alternative glyphs for zero, one and two ([simplified, traditional] for two).
ZERO_ALT = u'〇'
ONE_ALT = u'幺'
TWO_ALTS = [u'两', u'兩']

# Math symbols as [simplified, traditional] pairs.
POSITIVE = [u'正', u'正']
NEGATIVE = [u'负', u'負']
POINT = [u'点', u'點']
# PLUS = [u'加', u'加']
# SIL = [u'杠', u'槓']

# Chinese number system types
NUMBERING_TYPES = ['low', 'mid', 'high']
31 | 


--------------------------------------------------------------------------------
/basic_util.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """基本方法
  3 | 创建中文数字系统 方法
  4 | 中文字符串 <=> 数字串 方法
  5 | 数字串 <=> 中文字符串 方法
  6 | """
  7 | 
  8 | __author__ = 'Zhiyang Zhou <zyzhou@stu.xmu.edu.cn>'
  9 | __data__ = '2019-05-02'
 10 | 
 11 | from chn_text_norm.basic_constant import *
 12 | from chn_text_norm.basic_class import *
 13 | 
 14 | 
 15 | def create_system(numbering_type=NUMBERING_TYPES[1]):
 16 |     """
 17 |     根据数字系统类型返回创建相应的数字系统，默认为 mid
 18 |     NUMBERING_TYPES = ['low', 'mid', 'high']: 中文数字系统类型
 19 |         low:  '兆' = '亿' * '十' = $10^{9}$,  '京' = '兆' * '十', etc.
 20 |         mid:  '兆' = '亿' * '万' = $10^{12}$, '京' = '兆' * '万', etc.
 21 |         high: '兆' = '亿' * '亿' = $10^{16}$, '京' = '兆' * '兆', etc.
 22 |     返回对应的数字系统
 23 |     """
 24 | 
 25 |     # chinese number units of '亿' and larger
 26 |     all_larger_units = zip(
 27 |         LARGER_CHINESE_NUMERING_UNITS_SIMPLIFIED, LARGER_CHINESE_NUMERING_UNITS_TRADITIONAL)
 28 |     larger_units = [CNU.create(i, v, numbering_type, False)
 29 |                     for i, v in enumerate(all_larger_units)]
 30 |     # chinese number units of '十, 百, 千, 万'
 31 |     all_smaller_units = zip(
 32 |         SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED, SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL)
 33 |     smaller_units = [CNU.create(i, v, small_unit=True)
 34 |                      for i, v in enumerate(all_smaller_units)]
 35 |     # digis
 36 |     chinese_digis = zip(CHINESE_DIGIS, CHINESE_DIGIS,
 37 |                         BIG_CHINESE_DIGIS_SIMPLIFIED, BIG_CHINESE_DIGIS_TRADITIONAL)
 38 |     digits = [CND.create(i, v) for i, v in enumerate(chinese_digis)]
 39 |     digits[0].alt_s, digits[0].alt_t = ZERO_ALT, ZERO_ALT
 40 |     digits[1].alt_s, digits[1].alt_t = ONE_ALT, ONE_ALT
 41 |     digits[2].alt_s, digits[2].alt_t = TWO_ALTS[0], TWO_ALTS[1]
 42 | 
 43 |     # symbols
 44 |     positive_cn = CM(POSITIVE[0], POSITIVE[1], '+', lambda x: x)
 45 |     negative_cn = CM(NEGATIVE[0], NEGATIVE[1], '-', lambda x: -x)
 46 |     point_cn = CM(POINT[0], POINT[1], '.', lambda x,
 47 |                   y: float(str(x) + '.' + str(y)))
 48 |     # sil_cn = CM(SIL[0], SIL[1], '-', lambda x, y: float(str(x) + '-' + str(y)))
 49 |     system = NumberSystem()
 50 |     system.units = smaller_units + larger_units
 51 |     system.digits = digits
 52 |     system.math = MathSymbol(positive_cn, negative_cn, point_cn)
 53 |     # system.symbols = OtherSymbol(sil_cn)
 54 |     return system
 55 | 
 56 | 
def chn2num(chinese_string, numbering_type=NUMBERING_TYPES[1]):
    """
    Convert a Chinese numeral string into a decimal digit string.

    The string is split at the first decimal-point character found; the
    integer part is evaluated with units, the fractional part digit by digit.
    Returns a str: the integer value, followed by '.' and the fractional
    digits when a fractional part is present.
    """

    def get_symbol(char, system):
        # Lookup order is units, then digits, then math symbols; the first
        # match wins. Falls through (returns None) for an unknown character.
        # NOTE(review): '正' is both a large unit and the positive sign, so
        # it always resolves to the unit here.
        for u in system.units:
            if char in [u.traditional, u.simplified, u.big_s, u.big_t]:
                return u
        for d in system.digits:
            if char in [d.traditional, d.simplified, d.big_s, d.big_t, d.alt_s, d.alt_t]:
                return d
        for m in system.math:
            if char in [m.traditional, m.simplified]:
                return m

    def string2symbols(chinese_string, system):
        # Split into integer/fractional parts at the decimal point (simplified
        # form tried first). The two-name unpacking of split() raises
        # ValueError if the point occurs more than once.
        int_string, dec_string = chinese_string, ''
        for p in [system.math.point.simplified, system.math.point.traditional]:
            if p in chinese_string:
                int_string, dec_string = chinese_string.split(p)
                break
        return [get_symbol(c, system) for c in int_string], \
               [get_symbol(c, system) for c in dec_string]

    def correct_symbols(integer_symbols, system):
        """
        Normalize the integer symbols so every unit is explicit, e.g.
        一百八 to 一百八十
        一亿一千三百万 to 一亿 一千万 三百万
        """

        # a leading '十' (power 1) gets an implicit leading digit one
        if integer_symbols and isinstance(integer_symbols[0], CNU):
            if integer_symbols[0].power == 1:
                integer_symbols = [system.digits[1]] + integer_symbols

        # a trailing digit right after a unit gets the next lower unit appended
        if len(integer_symbols) > 1:
            if isinstance(integer_symbols[-1], CND) and isinstance(integer_symbols[-2], CNU):
                integer_symbols.append(
                    CNU(integer_symbols[-2].power - 1, None, None, None, None))

        result = []
        # number of consecutive units seen since the last digit
        unit_count = 0
        for s in integer_symbols:
            if isinstance(s, CND):
                result.append(s)
                unit_count = 0
            elif isinstance(s, CNU):
                current_unit = CNU(s.power, None, None, None, None)
                unit_count += 1
            # NOTE(review): a symbol that is neither digit nor unit (math
            # symbol, or None for an unknown character) leaves unit_count
            # unchanged, so the branch below re-runs with the previous unit.

            if unit_count == 1:
                result.append(current_unit)
            elif unit_count > 1:
                # a unit directly following another unit is not appended;
                # its power is added to every smaller unit already collected
                for i in range(len(result)):
                    if isinstance(result[-i - 1], CNU) and result[-i - 1].power < current_unit.power:
                        result[-i - 1] = CNU(result[-i - 1].power +
                                             current_unit.power, None, None, None, None)
        return result

    def compute_value(integer_symbols):
        """
        Compute the value.
        When current unit is larger than previous unit, current unit * all previous units will be used as all previous units.
        e.g. '两千万' = 2000 * 10000 not 2000 + 10000
        """
        # one slot per digit/unit group; the last slot is the one being built
        value = [0]
        last_power = 0
        for s in integer_symbols:
            if isinstance(s, CND):
                value[-1] = s.value
            elif isinstance(s, CNU):
                value[-1] *= pow(10, s.power)
                if s.power > last_power:
                    value[:-1] = list(map(lambda v: v *
                                                    pow(10, s.power), value[:-1]))
                    last_power = s.power
                value.append(0)
        # NOTE(review): only digits and units contribute; sign symbols in the
        # input do not affect the result.
        return sum(value)

    system = create_system(numbering_type)
    int_part, dec_part = string2symbols(chinese_string, system)
    int_part = correct_symbols(int_part, system)
    int_str = str(compute_value(int_part))
    # fractional digits are emitted one by one; each symbol must expose .value
    dec_str = ''.join([str(d.value) for d in dec_part])
    if dec_part:
        return '{0}.{1}'.format(int_str, dec_str)
    else:
        return int_str
142 | 
143 | 
def num2chn(number_string, numbering_type=NUMBERING_TYPES[1], big=False,
            traditional=False, alt_zero=False, alt_one=False, alt_two=True,
            use_zeros=True, use_units=True):
    """
    Convert a decimal digit string into a Chinese numeral string.

    number_string:  digits with at most one '.', e.g. '10260.03'
    numbering_type: one of NUMBERING_TYPES, selects the powers of large units
    big:            use the "big" (capital) glyphs
    traditional:    use traditional instead of simplified glyphs
    alt_zero:       replace the zero glyph by its alternative ('〇')
    alt_one:        replace the one glyph by its alternative ('幺')
    alt_two:        use '两' for a 2 that directly precedes a unit other than
                    '十' (ignored in effect when `big` is True)
    use_zeros:      emit a '零' placeholder for skipped positions
    use_units:      read with units; when False, read digit by digit

    Raises ValueError when `number_string` has more than one dot or
    contains a non-digit character.
    """

    def get_value(value_string, use_zeros=True):
        """Recursively turn an integer digit string into digit/unit symbols."""
        stripped_string = value_string.lstrip('0')

        # all zeros: nothing to read out
        if not stripped_string:
            return []

        # one significant digit, optionally preceded by a '零' placeholder
        if len(stripped_string) == 1:
            digit = system.digits[int(stripped_string)]
            if use_zeros and len(value_string) != len(stripped_string):
                return [system.digits[0], digit]
            return [digit]

        # the largest unit strictly below the number's magnitude splits it
        result_unit = next(u for u in reversed(system.units)
                           if u.power < len(stripped_string))
        # the head is cut from value_string (not the stripped one) so leading
        # zeros survive and still yield their '零' placeholder
        head = value_string[:-result_unit.power]
        tail = stripped_string[-result_unit.power:]
        # propagate use_zeros: it was previously dropped on recursion, which
        # made the outer `use_zeros` argument a no-op
        return (get_value(head, use_zeros) + [result_unit] +
                get_value(tail, use_zeros))

    system = create_system(numbering_type)

    int_dec = number_string.split('.')
    if len(int_dec) == 1:
        int_string = int_dec[0]
        dec_string = ""
    elif len(int_dec) == 2:
        int_string = int_dec[0]
        dec_string = int_dec[1]
    else:
        raise ValueError(
            "invalid input num string with more than one dot: {}".format(number_string))

    if use_units and len(int_string) > 1:
        result_symbols = get_value(int_string, use_zeros)
        if not result_symbols:
            # an all-zero integer part such as '00' is still zero, not ''
            result_symbols = [system.digits[0]]
    else:
        result_symbols = [system.digits[int(c)] for c in int_string]
    dec_symbols = [system.digits[int(c)] for c in dec_string]
    if dec_string:
        result_symbols += [system.math.point] + dec_symbols

    if alt_two:
        two = system.digits[2]
        # '两': the alternative glyphs take the place of the plain forms
        liang = CND(2, two.alt_s, two.alt_t, two.big_s, two.big_t)
        last_index = len(result_symbols) - 1
        for i, v in enumerate(result_symbols):
            if not (isinstance(v, CND) and v.value == 2):
                continue
            next_symbol = result_symbols[i + 1] if i < last_index else None
            previous_symbol = result_symbols[i - 1] if i > 0 else None
            # only a 2 sitting between units (or at the very start) qualifies,
            # and neither neighbouring unit may be '十' (power 1)
            if isinstance(next_symbol, CNU) and isinstance(previous_symbol, (CNU, type(None))):
                if next_symbol.power != 1 and ((previous_symbol is None) or (previous_symbol.power != 1)):
                    result_symbols[i] = liang

    # if big is True, '两' will not be used and `alt_two` has no impact on output
    if big:
        attr_name = 'big_t' if traditional else 'big_s'
    else:
        attr_name = 'traditional' if traditional else 'simplified'

    result = ''.join([getattr(s, attr_name) for s in result_symbols])

    if alt_zero:
        result = result.replace(
            getattr(system.digits[0], attr_name), system.digits[0].alt_s)

    if alt_one:
        result = result.replace(
            getattr(system.digits[1], attr_name), system.digits[1].alt_s)

    # an empty integer part ('.5') still reads as zero point ...
    if result.startswith(tuple(POINT)):
        return CHINESE_DIGIS[0] + result

    # ^10, 11, .., 19: drop the leading 'one' in front of 'ten'
    if len(result) >= 2 and result[1] in [SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED[0],
                                          SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL[0]] and \
            result[0] in [CHINESE_DIGIS[1], BIG_CHINESE_DIGIS_SIMPLIFIED[1], BIG_CHINESE_DIGIS_TRADITIONAL[1]]:
        result = result[1:]

    return result
240 | 
241 | 
if __name__ == '__main__':

    # Smoke test
    all_chinese_number_string = (
            CHINESE_DIGIS + BIG_CHINESE_DIGIS_SIMPLIFIED + BIG_CHINESE_DIGIS_TRADITIONAL +
            LARGER_CHINESE_NUMERING_UNITS_SIMPLIFIED + LARGER_CHINESE_NUMERING_UNITS_TRADITIONAL +
            SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED + SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL + ZERO_ALT +
            ONE_ALT + ''.join(TWO_ALTS + POSITIVE + NEGATIVE + POINT))

    print('num:', chn2num('一万零四百零三点八零五'))
    print('num:', chn2num('一亿六点三'))
    print('num:', chn2num('一亿零六点三'))
    print('num:', chn2num('两千零一亿六点三'))
    # print('num:', chn2num('一零零八六'))
    print('txt:', num2chn('10260.03', alt_zero=True))
    print('txt:', num2chn('20037.090', numbering_type='low', traditional=True))
    print('txt:', num2chn('100860001.77', numbering_type='high', big=True))
    # num2chn has no use_lzeros/use_rzeros parameters; passing them raised
    # TypeError, so only the supported keywords are used here
    print('txt:', num2chn('059523810880', alt_one=True, alt_two=False, use_units=False))

    print(all_chinese_number_string)
262 | 


--------------------------------------------------------------------------------
/cardinal.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """CARDINAL类 (包含小数DECIMAL类)
 3 | 纯数 <=> 中文字符串 方法
 4 | 中文字符串 <=> 纯数 方法
 5 | """
 6 | 
 7 | __author__ = 'Zhiyang Zhou <zyzhou@stu.xmu.edu.cn>'
 8 | __data__ = '2019-05-03'
 9 | 
10 | from chn_text_norm.basic_util import *
11 | 
12 | 
class Cardinal:
    """
    CARDINAL class: a plain number and its Chinese reading.
    """

    def __init__(self, cardinal=None, chntext=None):
        self.cardinal, self.chntext = cardinal, chntext

    def chntext2cardinal(self):
        """Return the digit string for `self.chntext`."""
        chinese = self.chntext
        return chn2num(chinese)

    def cardinal2chntext(self):
        """Return the Chinese reading of `self.cardinal`."""
        number = self.cardinal
        return num2chn(number)
27 | 
28 | 
if __name__ == '__main__':

    # Smoke test
    print(Cardinal(cardinal='21357.230').cardinal2chntext())
33 | 
34 | 


--------------------------------------------------------------------------------
/date.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """DATE类
 3 | 日期 <=> 中文字符串 方法
 4 | 中文字符串 <=> 日期 方法
 5 | """
 6 | 
 7 | __author__ = 'Zhiyang Zhou <zyzhou@stu.xmu.edu.cn>'
 8 | __data__ = '2019-05-07'
 9 | 
10 | from chn_text_norm.cardinal import Cardinal
11 | from chn_text_norm.digit import Digit
12 | 
13 | 
class Date:
    """
    DATE class: a date written with digits and its Chinese reading.
    """

    def __init__(self, date=None, chntext=None):
        self.date = date
        self.chntext = chntext

    # def chntext2date(self):
    #     chntext = self.chntext
    #     try:
    #         year, other = chntext.strip().split('年', maxsplit=1)
    #         year = Digit(chntext=year).digit2chntext() + '年'
    #     except ValueError:
    #         other = chntext
    #         year = ''
    #     if other:
    #         try:
    #             month, day = other.strip().split('月', maxsplit=1)
    #             month = Cardinal(chntext=month).chntext2cardinal() + '月'
    #         except ValueError:
    #             day = chntext
    #             month = ''
    #         if day:
    #             day = Cardinal(chntext=day[:-1]).chntext2cardinal() + day[-1]
    #     else:
    #         month = ''
    #         day = ''
    #     date = year + month + day
    #     self.date = date
    #     return self.date

    def date2chntext(self):
        """
        Convert `self.date` (e.g. '09年3月16日') to its Chinese reading.

        The year is read digit by digit, month and day as cardinals; each
        part is optional. The result is stored in `self.chntext` and returned.
        """
        date = self.date
        try:
            # unpacking raises ValueError when there is no '年'
            year, other = date.strip().split('年', maxsplit=1)
            year = Digit(digit=year).digit2chntext() + '年'
        except ValueError:
            other = date
            year = ''
        if other:
            try:
                # unpacking raises ValueError when there is no '月'
                month, day = other.strip().split('月', maxsplit=1)
                month = Cardinal(cardinal=month).cardinal2chntext() + '月'
            except ValueError:
                # no month: the remainder (not the whole date, which may still
                # contain the year) is the day part
                day = other.strip()
                month = ''
            if day:
                # the last character is the day suffix ('日' / '号'), kept as is
                day = Cardinal(cardinal=day[:-1]).cardinal2chntext() + day[-1]
        else:
            month = ''
            day = ''
        chntext = year + month + day
        self.chntext = chntext
        return self.chntext
70 | 
71 | 
if __name__ == '__main__':

    # Smoke test
    print(Date(date='09年3月16日').date2chntext())
76 | 
77 | 


--------------------------------------------------------------------------------
/digit.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """DIGIT类
 3 | 数字串 <=> 中文字符串 方法
 4 | 中文字符串 <=> 数字串 方法
 5 | """
 6 | 
 7 | __author__ = 'Zhiyang Zhou <zyzhou@stu.xmu.edu.cn>'
 8 | __data__ = '2019-05-03'
 9 | 
10 | from chn_text_norm.basic_util import *
11 | 
12 | 
class Digit:
    """
    DIGIT class: a digit string read out one digit at a time.
    """

    def __init__(self, digit=None, chntext=None):
        self.digit, self.chntext = digit, chntext

    # def chntext2digit(self):
    #     return chn2num(self.chntext)

    def digit2chntext(self):
        """Return `self.digit` read digit by digit (no units, no '两')."""
        digits = self.digit
        return num2chn(digits, alt_two=False, use_units=False)
27 | 
28 | 
if __name__ == '__main__':

    # Smoke test
    print(Digit(digit='2016').digit2chntext())
33 | 
34 | 


--------------------------------------------------------------------------------
/fraction.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """FRACTION类
 3 | 分数 <=> 中文字符串 方法
 4 | 中文字符串 <=> 分数 方法
 5 | """
 6 | 
 7 | __author__ = 'Zhiyang Zhou <zyzhou@stu.xmu.edu.cn>'
 8 | __data__ = '2019-05-03'
 9 | 
10 | from chn_text_norm.basic_util import *
11 | 
12 | 
class Fraction:
    """
    FRACTION class: a 'numerator/denominator' string and its Chinese reading.
    """

    def __init__(self, fraction=None, chntext=None):
        self.fraction, self.chntext = fraction, chntext

    def chntext2fraction(self):
        """Return 'numerator/denominator' for a '<denominator>分之<numerator>' text."""
        # the Chinese form names the denominator first
        pieces = self.chntext.split('分之')
        denominator, numerator = pieces
        top = chn2num(numerator)
        bottom = chn2num(denominator)
        return top + '/' + bottom

    def fraction2chntext(self):
        """Return the Chinese reading of a 'numerator/denominator' string."""
        pieces = self.fraction.split('/')
        numerator, denominator = pieces
        bottom = num2chn(denominator)
        top = num2chn(numerator)
        return bottom + '分之' + top
29 | 
30 | 
if __name__ == '__main__':

    # Smoke test
    print(Fraction(fraction='2135/7230').fraction2chntext())
    print(Fraction(chntext='五百八十一分之三百六十九').chntext2fraction())
36 | 
37 | 


--------------------------------------------------------------------------------
/money.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """MONEY类
 3 | 金钱 <=> 中文字符串 方法
 4 | 中文字符串 <=> 金钱 方法
 5 | """
 6 | import re
 7 | 
 8 | __author__ = 'Zhiyang Zhou <zyzhou@stu.xmu.edu.cn>'
 9 | __data__ = '2019-05-08'
10 | 
11 | from chn_text_norm.cardinal import Cardinal
12 | 
13 | 
class Money:
    """
    MONEY class: an amount of money written with digits and its Chinese reading.
    """

    # an integer or decimal number embedded in the money string
    _NUMBER_PATTERN = re.compile(r'(\d+(\.\d+)?)')

    def __init__(self, money=None, chntext=None):
        self.money = money
        self.chntext = chntext

    # def chntext2money(self):
    #     return self.money

    def money2chntext(self):
        """
        Replace every number in `self.money` by its Chinese reading.

        Each match is converted in place in a single pass, so a number that
        is a substring of another one (e.g. '5' and '15' in '5元15角') cannot
        corrupt it. The result is stored in `self.chntext` and returned.
        """
        def read_number(match):
            return Cardinal(cardinal=match.group(1)).cardinal2chntext()

        self.chntext = self._NUMBER_PATTERN.sub(read_number, self.money)
        return self.chntext
35 | 
36 | 
if __name__ == '__main__':

    # Smoke test
    print(Money(money='21.5万元').money2chntext())
    print(Money(money='230块5毛').money2chntext())
42 | 
43 | 


--------------------------------------------------------------------------------
/percentage.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """PERCENTAGE类
 3 | 百分数 <=> 中文字符串 方法
 4 | 中文字符串 <=> 百分数 方法
 5 | """
 6 | 
 7 | __author__ = 'Zhiyang Zhou <zyzhou@stu.xmu.edu.cn>'
 8 | __data__ = '2019-05-06'
 9 | 
10 | from chn_text_norm.basic_util import *
11 | 
12 | 
class Percentage:
    """
    PERCENTAGE class: a percentage such as '65.3%' and its Chinese reading.
    """

    def __init__(self, percentage=None, chntext=None):
        self.percentage = percentage
        self.chntext = chntext

    def chntext2percentage(self):
        """Return the '<number>%' form of a '百分之<number>' text."""
        text = self.chntext.strip()
        prefix = '百分之'
        # remove the prefix as a whole: str.strip('百分之') treats its argument
        # as a character set and would also eat a trailing '百' / '分' / '之'
        # (turning '百分之二百' into '二')
        if text.startswith(prefix):
            text = text[len(prefix):]
        return chn2num(text) + '%'

    def percentage2chntext(self):
        """Return the '百分之<number>' reading of a '<number>%' string."""
        return '百分之' + num2chn(self.percentage.strip().strip('%'))
27 | 
28 | 
if __name__ == '__main__':

    # Smoke test
    print(Percentage(chntext='百分之五十六点零三').chntext2percentage())
    print(Percentage(percentage='65.3%').percentage2chntext())
34 | 
35 | 


--------------------------------------------------------------------------------
/telephone.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """TELEPHONE类
 3 | 电话号码 <=> 中文字符串 方法
 4 | 中文字符串 <=> 电话号码 方法
 5 | """
 6 | 
 7 | __author__ = 'Zhiyang Zhou <zyzhou@stu.xmu.edu.cn>'
 8 | __data__ = '2019-05-03'
 9 | 
10 | from chn_text_norm.basic_util import *
11 | 
12 | 
class TelePhone:
    """
    TELEPHONE class: a telephone number and its Chinese reading.
    """

    def __init__(self, telephone=None, raw_chntext=None, chntext=None):
        self.telephone, self.raw_chntext, self.chntext = telephone, raw_chntext, chntext

    # def chntext2telephone(self):
    #     sil_parts = self.raw_chntext.split('<SIL>')
    #     self.telephone = '-'.join([
    #         str(chn2num(p)) for p in sil_parts
    #     ])
    #     return self.telephone

    def telephone2chntext(self, fixed=False):
        """
        Read `self.telephone` digit by digit.

        With `fixed` the number is split on '-' and the parts are joined by
        '<SIL>' in `self.raw_chntext`; otherwise a leading/trailing '+' is
        stripped, the number is split on whitespace and joined by '<SP>'.
        `self.chntext` is the raw text without the marker; it is returned.
        """
        if fixed:
            marker = '<SIL>'
            groups = self.telephone.split('-')
        else:
            marker = '<SP>'
            groups = self.telephone.strip('+').split()

        spoken = []
        for group in groups:
            spoken.append(num2chn(group, alt_two=False, use_units=False))
        self.raw_chntext = marker.join(spoken)
        self.chntext = self.raw_chntext.replace(marker, '')
        return self.chntext
45 | 
46 | 
if __name__ == '__main__':

    # Smoke test: a '-' separated number must go through the fixed branch,
    # otherwise the '-' reaches the digit conversion and raises ValueError
    print(TelePhone(telephone='0595-23980880').telephone2chntext(fixed=True))
    # print(TelePhone(raw_chntext='零五九五杠二三八六五零九八').chntext2telephone())
52 | 


--------------------------------------------------------------------------------
/text.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | TEXT类
  4 | """
  5 | 
  6 | __author__ = 'Zhiyang Zhou <zyzhou@stu.xmu.edu.cn>'
  7 | __data__ = '2019-05-03'
  8 | 
  9 | import re
 10 | 
 11 | from chn_text_norm.cardinal import Cardinal
 12 | from chn_text_norm.digit import Digit
 13 | from chn_text_norm.telephone import TelePhone
 14 | from chn_text_norm.fraction import Fraction
 15 | from chn_text_norm.date import Date
 16 | from chn_text_norm.money import Money
 17 | from chn_text_norm.percentage import Percentage
 18 | 
# Regex alternation of currency names (RMB, US dollar, yen, ...).
# Not referenced elsewhere in the visible part of this file.
CURRENCY_NAMES = '(人民币|美元|日元|英镑|欧元|马克|法郎|加拿大元|澳元|港币|先令|芬兰马克|爱尔兰镑|' \
                 '里拉|荷兰盾|埃斯库多|比塞塔|印尼盾|林吉特|新西兰元|比索|卢布|新加坡元|韩元|泰铢)'
# Regex alternation of money units: a bare magnitude word (亿 ... 百), an
# optional magnitude word followed by 元 or 块, or one of 角 / 毛 / 分.
# It is wrapped in a capture group, so embedding it in a larger pattern adds
# groups after the ones the enclosing pattern defines before it.
# Alternatives are tried left to right, so the bare magnitude word wins over
# "magnitude + 元/块" when both could match at the same position.
CURRENCY_UNITS = '((亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|)元|(亿|千万|百万|万|千|百|)块|角|毛|分)'
# Regex alternation of common measure words (quantifiers) that may follow a
# number. Some entries are repeated (e.g. 座, 颗, 张); repeats are redundant in
# an alternation and do not change what is matched.
COM_QUANTIFIERS = '(匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|' \
                  '砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|' \
                  '针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|' \
                  '毫|厘|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|' \
                  '盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|旬|' \
                  '纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块)'
 28 | 
 29 | 
 30 | class Text:
 31 |     """
 32 |     Text类
 33 |     """
 34 | 
 35 |     def __init__(self, raw_text, norm_text=None):
 36 |         self.raw_text = '^' + raw_text + '$'
 37 |         self.norm_text = norm_text
 38 | 
 39 |     def _particular(self):
 40 |         text = self.norm_text
 41 |         pattern = re.compile(r"(([a-zA-Z]+)二([a-zA-Z]+))")
 42 |         matchers = pattern.findall(text)
 43 |         if matchers:
 44 |             # print('particular')
 45 |             for matcher in matchers:
 46 |                 text = text.replace(matcher[0], matcher[1]+'2'+matcher[2], 1)
 47 |         self.norm_text = text
 48 |         return self.norm_text
 49 | 
 50 |     def normalize(self):
 51 |         text = self.raw_text
 52 | 
 53 |         # 规范化日期
 54 |         pattern = re.compile(r"\D+((([089]\d|(19|20)\d{2})年)?(\d{1,2}月(\d{1,2}[日号])?)?)")
 55 |         matchers = pattern.findall(text)
 56 |         if matchers:
 57 |             # print('date')
 58 |             for matcher in matchers:
 59 |                 text = text.replace(matcher[0], Date(date=matcher[0]).date2chntext(), 1)
 60 | 
 61 |         # 规范化金钱
 62 |         pattern = re.compile(r"\D+((\d+(\.\d+)?)[多余几]?" + CURRENCY_UNITS + "(\d" + CURRENCY_UNITS + "?)?)")
 63 |         matchers = pattern.findall(text)
 64 |         if matchers:
 65 |             # print('money')
 66 |             for matcher in matchers:
 67 |                 text = text.replace(matcher[0], Money(money=matcher[0]).money2chntext(), 1)
 68 | 
 69 |         # 规范化固话/手机号码
 70 |         # 手机
 71 |         # http://www.jihaoba.com/news/show/13680
 72 |         # 移动：139、138、137、136、135、134、159、158、157、150、151、152、188、187、182、183、184、178、198
 73 |         # 联通：130、131、132、156、155、186、185、176
 74 |         # 电信：133、153、189、180、181、177
 75 |         pattern = re.compile(r"\D((\+?86 ?)?1([38]\d|5[0-35-9]|7[678]|9[89])\d{8})\D")
 76 |         matchers = pattern.findall(text)
 77 |         if matchers:
 78 |             # print('telephone')
 79 |             for matcher in matchers:
 80 |                 text = text.replace(matcher[0], TelePhone(telephone=matcher[0]).telephone2chntext(), 1)
 81 |         # 固话
 82 |         pattern = re.compile(r"\D((0(10|2[1-3]|[3-9]\d{2})-?)?[1-9]\d{6,7})\D")
 83 |         matchers = pattern.findall(text)
 84 |         if matchers:
 85 |             # print('fixed telephone')
 86 |             for matcher in matchers:
 87 |                 text = text.replace(matcher[0], TelePhone(telephone=matcher[0]).telephone2chntext(fixed=True), 1)
 88 | 
 89 |         # 规范化分数
 90 |         pattern = re.compile(r"(\d+/\d+)")
 91 |         matchers = pattern.findall(text)
 92 |         if matchers:
 93 |             # print('fraction')
 94 |             for matcher in matchers:
 95 |                 text = text.replace(matcher, Fraction(fraction=matcher).fraction2chntext(), 1)
 96 | 
 97 |         # 规范化百分数
 98 |         text = text.replace('％', '%')
 99 |         pattern = re.compile(r"(\d+(\.\d+)?%)")
100 |         matchers = pattern.findall(text)
101 |         if matchers:
102 |             # print('percentage')
103 |             for matcher in matchers:
104 |                 text = text.replace(matcher[0], Percentage(percentage=matcher[0]).percentage2chntext(), 1)
105 | 
106 |         # 规范化纯数+量词
107 |         pattern = re.compile(r"(\d+(\.\d+)?)[多余几]?" + COM_QUANTIFIERS)
108 |         matchers = pattern.findall(text)
109 |         if matchers:
110 |             # print('cardinal+quantifier')
111 |             for matcher in matchers:
112 |                 text = text.replace(matcher[0], Cardinal(cardinal=matcher[0]).cardinal2chntext(), 1)
113 | 
114 |         # 规范化数字编号
115 |         pattern = re.compile(r"(\d{4,32})")
116 |         matchers = pattern.findall(text)
117 |         if matchers:
118 |             # print('digit')
119 |             for matcher in matchers:
120 |                 text = text.replace(matcher, Digit(digit=matcher).digit2chntext(), 1)
121 | 
122 |         # 规范化纯数
123 |         pattern = re.compile(r"(\d+(\.\d+)?)")
124 |         matchers = pattern.findall(text)
125 |         if matchers:
126 |             # print('cardinal')
127 |             for matcher in matchers:
128 |                 text = text.replace(matcher[0], Cardinal(cardinal=matcher[0]).cardinal2chntext(), 1)
129 | 
130 |         self.norm_text = text
131 |         self._particular()
132 | 
133 |         return self.norm_text.lstrip('^').rstrip('$')
134 | 
135 | 
if __name__ == '__main__':
    # Smoke test: normalize one sample sentence per supported category and
    # print each result.
    samples = (
        '固话：0595-23865596或23880880。',
        '手机：+86 19859213959或15659451527。',
        '分数：32477/76391。',
        '百分数：80.03%。',
        '编号：31520181154418。',
        '纯数：2983.07克或12345.60米。',
        '日期：1999年2月20日或09年3月15号。',
        '金钱：12块5，34.5元，20.1万',
        '特殊：O2O或B2C。',
    )
    for sample in samples:
        print(Text(raw_text=sample).normalize())
148 | 
149 | 


--------------------------------------------------------------------------------