├── README.md ├── setup.py └── timeparser ├── __init__.py ├── chinese_parser.py ├── extractor.py ├── helper.py ├── lunar_solar_date.py ├── rule_pattern.py ├── time_extractor.py └── time_parser.py /README.md: -------------------------------------------------------------------------------- 1 | # timeparser 2 | **文本时间抽取、解析、标准化工具** 3 | 4 | ### 安装方式 5 | 6 | ```shell 7 | cd timeparser 8 | python setup.py install 9 | ``` 10 | 11 | ### 使用实例 12 | 13 | ```python 14 | import timeparser as tp 15 | import json 16 | 17 | text = '''今年5月和6月腾讯和阿里的股票走势''' 18 | res = tp.extract_time(text) 19 | ``` 20 | 21 | ### 输出 22 | 23 | ```json 24 | [ 25 | { 26 | "text":"今年5月", 27 | "offset":[ 28 | 0, 29 | 4 30 | ], 31 | "type":"time_span", 32 | "detail":{ 33 | "type":"time_span", 34 | "definition":"accurate", 35 | "time":[ 36 | "2022-05-01 00:00:00", 37 | "2022-05-31 23:59:59" 38 | ] 39 | } 40 | }, 41 | { 42 | "text":"6月", 43 | "offset":[ 44 | 5, 45 | 7 46 | ], 47 | "type":"time_point", 48 | "detail":{ 49 | "type":"time_point", 50 | "definition":"accurate", 51 | "time":[ 52 | "2022-06-01 00:00:00", 53 | "2022-06-30 23:59:59" 54 | ] 55 | } 56 | } 57 | ] 58 | 59 | ``` 60 | 61 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author : Junhui Yu 3 | # @File : setup.py 4 | # @Time : 2022/10/25 15:44 5 | 6 | from setuptools import setup, find_packages 7 | from setuptools.command.install import install 8 | import shutil 9 | from pathlib import Path 10 | 11 | PACKAGE_NAME = "timeparser" 12 | ALL_PROGRAM_ENTRIES = ['task_dispatcher = backend.task_master:main'] 13 | version = '1.0' 14 | 15 | install_requires = [] 16 | try: 17 | import json 18 | except ImportError: 19 | try: 20 | import simplejson 21 | except ImportError: 22 | install_requires.append('simplejson') 23 | 24 | 25 | class CustomInstallCommand(install): 26 | """Customized setuptools 
def main():
    """Invoke ``setuptools.setup`` with the timeparser package metadata.

    All values come from module-level names defined earlier in this script
    (``PACKAGE_NAME``, ``version``, ``install_requires``,
    ``ALL_PROGRAM_ENTRIES``, ``CustomInstallCommand``).
    """
    setup(
        name=PACKAGE_NAME,
        version=version,
        author="Junhui Yu",
        author_email="Junhuy@163.com",
        description=("Timing Parsering"),
        # NOTE(review): ``license`` receives the repository URL rather than a
        # license name -- confirm this is intended.
        license="https://github.com/yujunhuics/timeparser",
        packages=find_packages(),
        include_package_data=True,
        install_requires=install_requires,
        cmdclass={
            # Route the ``install`` command through the customised class.
            'install': CustomInstallCommand,
        },
        entry_points={
            # NOTE(review): ALL_PROGRAM_ENTRIES targets
            # ``backend.task_master:main``; no ``backend`` package appears in
            # the repository tree -- verify the console script can resolve.
            'console_scripts': ALL_PROGRAM_ENTRIES}
    )
def __init__(self):
    """Create the parser without compiling any regular expression yet."""
    # Sentinel for lazy initialisation: ``__call__`` checks whether this
    # attribute is still ``None`` and, if so, runs ``_prepare()``, which
    # replaces it with a compiled pattern.
    self.money_pattern_1 = None
def turn_money_std_fmt_util2(self, money_string):
    """Convert a numeral string that may contain one ten-thousand marker.

    The text is split on ``self.wan_pattern``; the part before the marker is
    converted with ``turn_money_std_fmt_util1`` and multiplied by 10000, the
    part after it is converted the same way and added.

    Args:
        money_string: numeral text, e.g. the part of an amount below 亿.

    Returns:
        The numeric value computed from ``turn_money_std_fmt_util1`` results.

    Raises:
        ValueError: when splitting does not yield exactly two segments.
    """
    # NOTE(review): only 万/萬 trigger the split path here, although
    # ``wan_pattern`` is compiled to match 'w' as well.
    if '万' not in money_string and '萬' not in money_string:
        return self.turn_money_std_fmt_util1(money_string)

    # A bare leading marker ("万") stands for "one ten-thousand".
    if money_string[0] in '万萬':
        money_string = '一' + money_string

    pieces = self.wan_pattern.split(money_string)
    if len(pieces) != 2:
        raise ValueError(self.type_error.format(money_string))

    head, tail = pieces
    head_value = self.turn_money_std_fmt_util1(head) * 10000
    tail_value = self.turn_money_std_fmt_util1(tail)
    return head_value + tail_value


def turn_money_std_fmt_util3(self, money_string):
    """Convert a numeral string that may contain one 亿 (hundred-million) marker.

    The text is split on ``self.yi_pattern``; both halves are converted with
    ``turn_money_std_fmt_util2`` and combined as ``head * 100000000 + tail``.

    Args:
        money_string: numeral text of a whole amount.

    Returns:
        The numeric value computed from ``turn_money_std_fmt_util2`` results.

    Raises:
        ValueError: when splitting does not yield exactly two segments.
    """
    if '亿' not in money_string:
        return self.turn_money_std_fmt_util2(money_string)

    # A bare leading marker ("亿") stands for "one hundred-million".
    if money_string.startswith('亿'):
        money_string = '一' + money_string

    pieces = self.yi_pattern.split(money_string)
    if len(pieces) != 2:
        raise ValueError(self.type_error.format(money_string))

    head, tail = pieces
    head_value = self.turn_money_std_fmt_util2(head)
    tail_value = self.turn_money_std_fmt_util2(tail)
    return head_value * 100000000 + tail_value
def _cleansing(self, money_string):
    """Strip separator punctuation and "inclusive" parenthesised notes.

    Args:
        money_string: raw amount text.

    Returns:
        The text with every ``self.punc_pattern`` match removed. If any
        parenthesised segment contains the character 含 (as in an amount
        annotated "(含)"), the parenthesised segments are removed as well.
    """
    stripped = self.punc_pattern.sub('', money_string)

    # Parenthesised content is only dropped when it carries the 含 marker.
    bracketed = ''.join(extract_parentheses(stripped, parentheses='()()'))
    if '含' not in bracketed:
        return stripped

    return remove_parentheses(stripped, parentheses='()()')
def _definition(self, money_string):
    """Classify how precise an amount is and strip its modifier words.

    Modifier words are located with ``self.money_modifier_pattern``. Each one
    is tested against the minus, plus and blur patterns.

    Args:
        money_string: amount text, possibly with a prefix and/or suffix
            modifier.

    Returns:
        ``(money_string, definition)`` where the modifiers have been removed
        from the text and ``definition`` is one of ``'blur-'``, ``'blur+'``,
        ``'blur'`` or ``'accurate'`` (checked in that order of precedence).

    Raises:
        ValueError: when more than two modifier words are present.
    """
    found = [hit.group() for hit in self.money_modifier_pattern.finditer(money_string)]

    # At most one prefix and one suffix are allowed; anything more is malformed.
    if len(found) > 2:
        raise ValueError(self.type_error.format(money_string))

    def _any_modifier_matches(pattern):
        return any(pattern.search(word) for word in found)

    if _any_modifier_matches(self.money_minus_pattern):
        definition = 'blur-'
    elif _any_modifier_matches(self.money_plus_pattern):
        definition = 'blur+'
    elif _any_modifier_matches(self.money_blur_pattern):
        definition = 'blur'
    else:
        definition = 'accurate'

    return self.money_modifier_pattern.sub('', money_string), definition
def _expand_sequential_string(self, money_string):
    """Expand adjacent numerals into an explicit range.

    Every match of ``self.sequential_char_num_pattern`` is replaced by its
    first character, the connector 到, and its second character, so that
    "五六百美元" becomes "五到六百美元".

    The replacement is done with a callable passed to ``sub`` instead of
    ``str.format``. The previous template approach raised on input that
    contained ``{`` / ``}`` and raised IndexError when the pattern matched
    more than once.

    Args:
        money_string: amount text.

    Returns:
        The expanded text, or the input unchanged when nothing matches.
    """
    def _as_range(hit):
        pair = hit.group()
        return pair[0] + '到' + pair[1]

    return self.sequential_char_num_pattern.sub(_as_range, money_string)


def _split_money_span(self, money_string):
    """Split a range expression into its lower and upper amount strings.

    The first-amount patterns (``first_1`` .. ``first_3``) and the
    second-amount patterns (``second_0``, ``second_1``) are tried in order;
    the first one that matches wins. Each pattern is searched once.

    Args:
        money_string: amount text, possibly a range.

    Returns:
        ``(first_string, second_string)``; either element is ``None`` when no
        corresponding pattern matched.
    """
    def _first_hit(patterns):
        for pattern in patterns:
            hit = pattern.search(money_string)
            if hit:
                return hit.group()
        return None

    first_string = _first_hit((
        self.first_1_span_pattern,
        self.first_2_span_pattern,
        self.first_3_span_pattern,
    ))
    second_string = _first_hit((
        self.second_0_span_pattern,
        self.second_1_span_pattern,
    ))
    return first_string, second_string
根据情况,对金额范围的第一个金额进行单位补全 367 | 例如: 3到5万港币,被拆分为 3,5万港币,须将 3 补全为 3万港币 368 | 思路:第二个字符串一般为完全字符串,不须补全, 369 | 且默认第二个字符串是 数字、汉字单位混合字符串, 370 | 此时考察第一个字符串,若其数值低于 第二个字符串的数字值, 371 | 则为其添加第二个字符串的汉字单位。 372 | 373 | TODO:该函数有较多错误和纰漏。 374 | 375 | 十八到三十万元 376 | 一百二十到一百五十万元 377 | 一千到两千万元 378 | 一千到两千亿元 379 | 三到五百 380 | 八到九千 381 | """ 382 | # 先分析第一个字符串的金额,确定其信息,是否需要补全 383 | if self.money_pattern_1.search(first_money_string): 384 | first_computed_money_num = float(first_money_string) 385 | 386 | elif self.money_pattern_2.search(first_money_string): 387 | # 前为数字,后为汉字的金额,如 “6000万” 388 | # 若第一个字符串属于该种类型,且其 char_part 非空,说明可以直接返回 389 | char_part = self.float_num_pattern.sub('', first_money_string) 390 | if char_part in self.suffix_nums: 391 | return first_money_string 392 | else: 393 | raise ValueError(self.type_error.format(first_money_string)) 394 | 395 | else: 396 | # 若第一个字符串有单位,则直接返回结果 397 | res_list = [item for item in self.currency_case_pattern.finditer(first_money_string)] 398 | 399 | if len(res_list) != 0: 400 | # 有货币单位 401 | if res_list[-1].span()[1] == len(first_money_string): 402 | # 即第一个字符串末尾为单位,则直接跳过 403 | return first_money_string 404 | 405 | first_computed_money_num = self.turn_money_std_fmt_util3(first_money_string) 406 | 407 | # 前置操作,需要重复执行一次,因此较为耗时 408 | second_money_string = self._cleansing(second_money_string) 409 | second_money_string, definition = self._definition(second_money_string) 410 | unit, second_money_string = self._get_currency_case(second_money_string) 411 | second_money_string, definition = self._accuracy(second_money_string, definition) 412 | 413 | # 分析第二个字符串的类型,并按类型对其进行判断,是否对第一个字符串添加信息 414 | if self.money_pattern_2.search(second_money_string): 415 | char_part = self.float_num_pattern.sub('', second_money_string) 416 | if char_part not in self.suffix_nums: 417 | raise ValueError(self.type_error.format(second_money_string)) 418 | 419 | num_part = second_money_string.replace(char_part, '') 420 | if self.money_pattern_1.search(num_part): 421 | 
second_computed_money_num = float(num_part) 422 | else: 423 | raise ValueError(self.type_error.format(second_money_string)) 424 | 425 | if first_computed_money_num < second_computed_money_num: 426 | # 此时需要添加单位 427 | return first_money_string + char_part 428 | else: 429 | return first_money_string 430 | 431 | else: 432 | if self.yi_pattern.search(second_money_string): 433 | seg_billion = self.yi_pattern.split(second_money_string) 434 | if len(seg_billion) == 2: 435 | second_computed_money_num = self.turn_money_std_fmt_util2(seg_billion[0]) 436 | else: 437 | raise ValueError(self.type_error.format(second_money_string)) 438 | 439 | if first_computed_money_num < second_computed_money_num: 440 | return first_money_string + '亿' 441 | else: 442 | return first_money_string 443 | 444 | elif self.wan_pattern.search(second_money_string): 445 | seg_wan = self.wan_pattern.split(second_money_string) 446 | if len(seg_wan) == 2: 447 | second_computed_money_num = self.turn_money_std_fmt_util1(seg_wan[0]) 448 | else: 449 | raise ValueError(self.type_error.format(second_money_string)) 450 | 451 | if first_computed_money_num < second_computed_money_num: 452 | return first_money_string + '万' 453 | else: 454 | return first_money_string 455 | 456 | elif self.qian_pattern.search(second_money_string): 457 | seg_qian = self.qian_pattern.split(second_money_string) 458 | if len(seg_qian) == 2: 459 | second_computed_money_num = self.turn_money_std_fmt_util1(seg_qian[0]) 460 | else: 461 | raise ValueError(self.type_error.format(second_money_string)) 462 | 463 | if first_computed_money_num < second_computed_money_num: 464 | return first_money_string + '千' 465 | else: 466 | return first_money_string 467 | elif self.bai_pattern.search(second_money_string): 468 | seg_bai = self.bai_pattern.split(second_money_string) 469 | if len(seg_bai) == 2: 470 | second_computed_money_num = self.turn_money_std_fmt_util1(seg_bai[0]) 471 | else: 472 | raise ValueError(self.type_error.format(second_money_string)) 
def __call__(self, money_string, default_unit='元', ret_format='detail'):
    """Parse an amount, or a range of two amounts, into a standard form.

    Args:
        money_string: the amount text to parse.
        default_unit: currency unit assumed when the text names none.
        ret_format: ``'detail'`` for a dict result or ``'str'`` for string
            results.

    Returns:
        For a single amount, whatever ``parse_single_money`` returns.
        For a range, with ``ret_format='str'`` a two-item list
        ``[lower, upper]``; with ``ret_format='detail'`` a dict
        ``{'num': [lower, upper], 'case': unit, 'definition': 'blur'}``.

    Raises:
        ValueError: when ``money_string`` is empty, when ``ret_format`` is not
            supported, or when a helper rejects the text.
    """
    # Patterns are compiled lazily on first use.
    if self.money_pattern_1 is None:
        self._prepare()

    # Reject unknown formats up front; previously this surfaced later as an
    # UnboundLocalError on ``ret_money``.
    if ret_format not in ('str', 'detail'):
        raise ValueError(
            "unsupported ret_format: {!r}; expected 'str' or 'detail'".format(ret_format))

    if not money_string:
        raise ValueError(self.type_error.format(money_string))

    # Turn adjacent numerals such as "五六百美元" into "五到六百美元".
    money_string = self._expand_sequential_string(money_string)

    first_money_string, second_money_string = self._split_money_span(money_string)

    if first_money_string is None or second_money_string is None:
        # Not a range: parse as a single amount.
        return self.parse_single_money(
            money_string, default_unit=default_unit, ret_format=ret_format)

    # The lower bound may lack the magnitude suffix carried by the upper bound.
    first_money_string = self._compensate_first_money_string(
        first_money_string, second_money_string)

    first_money_res = self.parse_single_money(
        first_money_string, default_unit=default_unit, ret_format=ret_format)
    second_money_res = self.parse_single_money(
        second_money_string, default_unit=default_unit, ret_format=ret_format)

    # Either side may itself be a [low, high] pair (a blurred amount); the
    # merged range keeps the low end of the first and the high end of the second.
    def _low_end(value):
        return value[0] if isinstance(value, list) else value

    def _high_end(value):
        return value[1] if isinstance(value, list) else value

    if ret_format == 'str':
        return [_low_end(first_money_res), _high_end(second_money_res)]

    first_unit = first_money_res['case']
    second_unit = second_money_res['case']

    # Prefer a unit that differs from the fallback, checking the second amount
    # first. Comparing with ``default_unit`` (instead of a hard-coded '元')
    # keeps this correct when the caller supplies another default.
    unit = second_unit if second_unit != default_unit else first_unit

    return {
        'num': [_low_end(first_money_res['num']), _high_end(second_money_res['num'])],
        'case': unit,
        'definition': 'blur',
    }
def _get_second_num(self, num, flag='+'):
    """Derive the upper bound for a blurred amount.

    Args:
        num: the standardised lower-bound amount as a string.
        flag: ``'+'`` bumps the digits in front of the first
            ``self.zero_seg_pattern`` match by one (for example "6000.00"
            gives "7000.00"); ``'++'`` multiplies the amount by ten and
            re-standardises it with ``turn_num_standard_format``.

    Returns:
        The upper-bound string, or ``None`` when ``flag`` is ``'+'`` and no
        zero segment is found, or when ``flag`` is neither value.
    """
    if flag == '++':
        return self.turn_num_standard_format(float(num) * 10)

    if flag != '+':
        return None

    zero_tail = self.zero_seg_pattern.search(num)
    if zero_tail is None:
        return None

    suffix = zero_tail.group()
    # Whatever remains after dropping the zero segment is the part to bump.
    leading = num.replace(suffix, '')
    return str(int(leading) + 1) + suffix
def remove_redundant_char(self, text, redundant_chars=None):
    """Collapse runs of a repeated character down to a single occurrence.

    For every character in ``redundant_chars`` a pattern of the form
    ``(?<=c)c+`` is built, i.e. "one or more ``c`` directly after a ``c``";
    removing those matches keeps only the first character of each run.

    The compiled pattern is cached on the instance together with the
    character set it was built from. It is rebuilt whenever a call supplies a
    different set. Previously the first compiled pattern was reused forever,
    so later calls with other ``redundant_chars`` were silently ignored.

    Args:
        text: the text to clean.
        redundant_chars: iterable of single characters to de-duplicate;
            ``None`` selects the module default ``REDUNDANT_PATTERN``.

    Returns:
        The text with repeated runs collapsed.
    """
    if redundant_chars is None:
        redundant_chars = REDUNDANT_PATTERN

    # The cache key is read with getattr because __init__ does not define it.
    cached_for = getattr(self, '_redundant_chars_source', None)
    if self.redundant_pattern is None or cached_for != redundant_chars:
        alternatives = [
            '(?<={char}){char}+'.format(char=re.escape(char))
            for char in redundant_chars
        ]
        self.redundant_pattern = re.compile('|'.join(alternatives))
        self._redundant_chars_source = redundant_chars

    return self.redundant_pattern.sub('', text)
def extract_parentheses(self, text, parentheses=PARENTHESES_PATTERN, detail=False):
    """Extract every properly closed parenthesised segment from ``text``.

    ``parentheses`` is read as consecutive (opening, closing) character
    pairs. The text is scanned for any of those characters: openers are
    pushed on a stack, and a closer that matches the opener on top of the
    stack pops it and yields the enclosed segment (delimiters included).
    Closers that do not match the top of the stack are ignored.

    The compiled pattern and pair table are cached together with the
    ``parentheses`` value they were built from. The cache key is private to
    this method. It used to be ``self.parentheses_pattern``, which
    ``remove_parentheses`` also writes, so alternating calls with different
    ``parentheses`` could leave one of the two methods on a stale pattern.

    Args:
        text: the text to scan.
        parentheses: string of opening/closing character pairs.
        detail: when true, return dicts with ``content`` and ``offset``
            instead of plain strings.

    Returns:
        A list of segments in the order their closing character appears:
        strings, or ``{'content': str, 'offset': (start, end)}`` dicts when
        ``detail`` is true.
    """
    cached_for = getattr(self, '_extract_parentheses_source', None)
    if self.extract_parentheses_pattern is None or cached_for != parentheses:
        self.extract_parentheses_pattern = re.compile(
            '[' + re.escape(parentheses) + ']')
        # Map each closing character to its opening partner.
        self.parentheses_dict = {
            parentheses[i + 1]: parentheses[i]
            for i in range(0, len(parentheses), 2)
        }
        self._extract_parentheses_source = parentheses

    closing_to_opening = self.parentheses_dict
    content_list = []
    open_stack = []  # (opening character, index in text)

    for hit in self.extract_parentheses_pattern.finditer(text):
        idx = hit.start()
        char = text[idx]

        if char not in closing_to_opening:
            open_stack.append((char, idx))
            continue

        # A closer only counts when it matches the most recent opener.
        if open_stack and open_stack[-1][0] == closing_to_opening[char]:
            _, start_idx = open_stack.pop()
            end_idx = idx + 1
            if detail:
                content_list.append(
                    {'content': text[start_idx: end_idx],
                     'offset': (start_idx, end_idx)})
            else:
                content_list.append(text[start_idx: end_idx])

    return content_list
def _logging_level_from_str(level):
    """Map a level name (case-insensitive) to its ``logging`` constant.

    Names missing from ``_LEVELS`` fall back to ``logging.INFO``.
    """
    return _LEVELS.get(level.upper(), logging.INFO)


def _refresh_logger(logger):
    """Detach and close every handler currently attached to ``logger``.

    Closing matters for file handlers: without it, each repeated
    ``set_logger`` call would leave the previous log file open.

    Returns:
        The same logger, now without handlers.
    """
    # Iterate over a copy because removeHandler mutates logger.handlers.
    for handler in list(logger.handlers):
        logger.removeHandler(handler)
        handler.close()

    return logger


def bracket(regular_expression):
    """Wrap a regex fragment in a capturing group: ``(...)``."""
    return ''.join([r'(', regular_expression, r')'])


def bracket_absence(regular_expression):
    """Wrap a regex fragment in an optional capturing group: ``(...)?``."""
    return ''.join([r'(', regular_expression, r')?'])


def absence(regular_expression):
    """Make a regex fragment optional by appending ``?``."""
    return ''.join([regular_expression, r'?'])


def start_end(regular_expression):
    """Anchor a regex fragment to the whole string: ``^...$``."""
    return ''.join([r'^', regular_expression, r'$'])


def set_logger(level='INFO', log_dir_name='.time_parrser_logs'):
    """Configure and return this module's logger.

    Existing handlers are always removed first. With ``level=None`` the bare
    logger is returned without any handler.

    Args:
        level: level name such as ``'INFO'``; ``None`` disables handler setup.
        log_dir_name: directory for ``log.txt``. A value starting with ``/``
            is used as is; anything else is placed under the user's home
            directory. ``None`` disables file logging.
            NOTE(review): the default value is spelled ``.time_parrser_logs``;
            it is kept unchanged so existing log locations do not move.

    Returns:
        The configured ``logging.Logger``.
    """
    logger = logging.getLogger(__name__)
    # The logger is process-global, so clear whatever an earlier call attached.
    _refresh_logger(logger)
    if level is None:
        return logger

    level = _logging_level_from_str(level)
    logger.setLevel(level)

    formatter = logging.Formatter(
        fmt="%(asctime)s %(levelname)s %(funcName)s: %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S")

    stream_handler = logging.StreamHandler(sys.stdout)
    stream_handler.setLevel(level)
    stream_handler.setFormatter(formatter)

    if log_dir_name is not None:
        if log_dir_name.startswith("/"):
            filename_directory = log_dir_name
        else:
            filename_directory = os.path.join(os.path.expanduser('~'), log_dir_name)

        # exist_ok avoids the check-then-create race when several processes
        # import the package at the same time.
        os.makedirs(filename_directory, exist_ok=True)

        # Rotate at midnight and keep 30 backups.
        file_handler = TimedRotatingFileHandler(
            os.path.join(filename_directory, "log.txt"),
            when="midnight", backupCount=30)

        file_handler.setLevel(level)
        file_handler.suffix = "%Y%m%d"
        file_handler.setFormatter(formatter)

        logger.addHandler(file_handler)

    # The start-up banner is emitted before the stream handler is attached,
    # so it reaches the log file only.
    length = 20
    logger.log(level, "-" * length + " logging start " + "-" * length)
    logger.log(level, "LEVEL: {}".format(logging.getLevelName(level)))
    if log_dir_name is not None:
        logger.log(level, "PATH: {}".format(filename_directory))
    logger.log(level, "-" * (length * 2 + 15))

    logger.addHandler(stream_handler)

    return logger
24 | 37584, 51893, 43344, 46240, 47780, 44368, 21977, 19360, 42416, 86390, 21168, 43312, 25 | 31060, 27296, 44368, 23378, 19296, 42726, 42208, 53856, 60005, 54576, 23200, 30371, 26 | 38608, 19195, 19152, 42192, 118966, 53840, 54560, 56645, 46496, 22224, 21938, 18864, 27 | 42359, 42160, 43600, 111189, 27936, 44448, 84835, 37744, 18936, 18800, 25776, 92326, 28 | 59984, 27296, 108228, 43744, 37600, 53987, 51552, 54615, 54432, 55888, 23893, 22176, 29 | 42704, 21972, 21200, 43448, 43344, 46240, 46758, 44368, 21920, 43940, 42416, 21168, 30 | 45683, 26928, 29495, 27296, 44368, 84821, 19296, 42352, 21732, 53600, 59752, 54560, 31 | 55968, 92838, 22224, 19168, 43476, 41680, 53584, 62034, 54560] 32 | 33 | # 从 1900 年,至 2100 年每年的农历春节的公历日期 34 | CHINESE_NEW_YEAR = [ 35 | '19000131', 36 | '19010219', '19020208', '19030129', '19040216', '19050204', '19060125', '19070213', 37 | '19080202', '19090122', '19100210', '19110130', '19120218', '19130206', '19140126', 38 | '19150214', '19160203', '19170123', '19180211', '19190201', '19200220', '19210208', 39 | '19220128', '19230216', '19240205', '19250124', '19260213', '19270202', '19280123', 40 | '19290210', '19300130', '19310217', '19320206', '19330126', '19340214', '19350204', 41 | '19360124', '19370211', '19380131', '19390219', '19400208', '19410127', '19420215', 42 | '19430205', '19440125', '19450213', '19460202', '19470122', '19480210', '19490129', 43 | '19500217', '19510206', '19520127', '19530214', '19540203', '19550124', '19560212', 44 | '19570131', '19580218', '19590208', '19600128', '19610215', '19620205', '19630125', 45 | '19640213', '19650202', '19660121', '19670209', '19680130', '19690217', '19700206', 46 | '19710127', '19720215', '19730203', '19740123', '19750211', '19760131', '19770218', 47 | '19780207', '19790128', '19800216', '19810205', '19820125', '19830213', '19840202', 48 | '19850220', '19860209', '19870129', '19880217', '19890206', '19900127', '19910215', 49 | '19920204', '19930123', '19940210', '19950131', 
'19960219', '19970207', '19980128', 50 | '19990216', '20000205', '20010124', '20020212', '20030201', '20040122', '20050209', 51 | '20060129', '20070218', '20080207', '20090126', '20100214', '20110203', '20120123', 52 | '20130210', '20140131', '20150219', '20160208', '20170128', '20180216', '20190205', 53 | '20200125', '20210212', '20220201', '20230122', '20240210', '20250129', '20260217', 54 | '20270206', '20280126', '20290213', '20300203', '20310123', '20320211', '20330131', 55 | '20340219', '20350208', '20360128', '20370215', '20380204', '20390124', '20400212', 56 | '20410201', '20420122', '20430210', '20440130', '20450217', '20460206', '20470126', 57 | '20480214', '20490202', '20500123', '20510211', '20520201', '20530219', '20540208', 58 | '20550128', '20560215', '20570204', '20580124', '20590212', '20600202', '20610121', 59 | '20620209', '20630129', '20640217', '20650205', '20660126', '20670214', '20680203', 60 | '20690123', '20700211', '20710131', '20720219', '20730207', '20740127', '20750215', 61 | '20760205', '20770124', '20780212', '20790202', '20800122', '20810209', '20820129', 62 | '20830217', '20840206', '20850126', '20860214', '20870203', '20880124', '20890210', 63 | '20900130', '20910218', '20920207', '20930127', '20940215', '20950205', '20960125', 64 | '20970212', '20980201', '20990121', '21000209' 65 | ] 66 | 67 | def __init__(self): 68 | self.tian = '甲乙丙丁戊己庚辛壬癸' 69 | self.di = '子丑寅卯辰巳午未申酉戌亥' 70 | self.sheng_xiao = '鼠牛虎兔龙蛇马羊猴鸡狗猪' 71 | self.chinese_nums = '零一二三四五六七八九十' 72 | 73 | def to_solar_date(self, lunar_year, lunar_month, lunar_day, leap_month=False): 74 | if not self._validate(lunar_year, lunar_month, lunar_day, leap_month): 75 | raise ValueError('农历日期不支持 => 1.超出农历1900年1月1日~2100年12月29日,2.日期不存在') 76 | 77 | new_year = datetime.strptime(self.CHINESE_NEW_YEAR[lunar_year - 1900], '%Y%m%d') 78 | time_delta = timedelta(days=self._lunar_days_passed( 79 | lunar_year, lunar_month, lunar_day, leap_month)) 80 | return new_year + time_delta 81 | 82 | def 
to_lunar_date(self, solar_date): 83 | lunar_year = solar_date.year 84 | if (datetime.strptime(self.CHINESE_NEW_YEAR[lunar_year - 1900], '%Y%m%d') - solar_date).days > 0: 85 | lunar_year -= 1 86 | 87 | new_year_date = datetime.strptime(self.CHINESE_NEW_YEAR[lunar_year - 1900], '%Y%m%d') 88 | days_passed = (solar_date - new_year_date).days 89 | year_code = self.CHINESE_YEAR_CODE[lunar_year - 1900] 90 | month_days = LunarSolarDate._decode(year_code) 91 | 92 | for pos, days in enumerate(accumulate(month_days)): 93 | if days_passed + 1 <= days: 94 | month = pos + 1 95 | lunar_day = month_days[pos] - (days - days_passed) + 1 96 | break 97 | 98 | leap_month = False 99 | if (year_code & 0xf) == 0 or month <= (year_code & 0xf): 100 | lunar_month = month 101 | else: 102 | lunar_month = month - 1 103 | 104 | if (year_code & 0xf) != 0 and month == (year_code & 0xf) + 1: 105 | leap_month = True 106 | 107 | return lunar_year, lunar_month, lunar_day, leap_month 108 | 109 | def _lunar_days_passed(self, lunar_year, lunar_month, lunar_day, leap_month): 110 | """计算当前农历日期和当年农历新年之间的天数差值 111 | 112 | Returns: 113 | int: 差值天数 114 | """ 115 | year_code = self.CHINESE_YEAR_CODE[lunar_year - 1900] 116 | 117 | month_days = LunarSolarDate._decode(year_code) 118 | month_leap = year_code & 0xf # 当前农历年的闰月,为0表示无润叶 119 | 120 | if (month_leap == 0) or (lunar_month < month_leap): # 当年无闰月,或者有闰月但是当前月小于闰月 121 | days_passed_month = sum(month_days[:lunar_month - 1]) 122 | elif (not leap_month) and (lunar_month == month_leap): # 当前不是闰月,并且当前月份和闰月相同 123 | days_passed_month = sum(month_days[:lunar_month - 1]) 124 | else: 125 | days_passed_month = sum(month_days[:lunar_month]) 126 | 127 | return days_passed_month + lunar_day - 1 128 | 129 | def chinese_lunar_date(self, lunar_year, lunar_month, lunar_day, leap_month): 130 | zh_year = '' 131 | for i in range(0, 4): 132 | zh_year += self.chinese_nums[int(str(lunar_year)[i])] 133 | zh_year += '年' 134 | 135 | if leap_month: 136 | zh_month = '闰' 137 | else: 138 | 
zh_month = '' 139 | 140 | if lunar_month == 1: 141 | zh_month += '正' 142 | elif lunar_month == 12: 143 | zh_month += '腊' 144 | elif lunar_month <= 10: 145 | zh_month += self.chinese_nums[lunar_month] 146 | else: 147 | zh_month += '十{}'.format(self.chinese_nums[lunar_month - 10]) 148 | zh_month += '月' 149 | 150 | if lunar_day <= 10: 151 | zh_day = '初{}'.format(self.chinese_nums[lunar_day]) 152 | elif lunar_day < 20: 153 | zh_day = '十{}'.format(self.chinese_nums[lunar_day - 10]) 154 | elif lunar_day == 20: 155 | zh_day = '二十' 156 | elif lunar_day < 30: 157 | zh_day = '二十{}'.format(self.chinese_nums[lunar_day - 20]) 158 | else: 159 | zh_day = '三十' 160 | 161 | year_tian_di = '{}{}'.format( 162 | self.tian[(lunar_year - 1900 + 36) % 10], 163 | self.di[(lunar_year - 1900 + 36) % 12]) + '年' 164 | 165 | return '{}{}{} {} ({}年)'.format( 166 | zh_year, zh_month, zh_day, year_tian_di, 167 | self.sheng_xiao[(lunar_year - 1900) % 12]) 168 | 169 | def _validate(self, year, month, day, leap): 170 | """农历日期校验 171 | 172 | Args: 173 | year(int): 农历年份 174 | month(int): 农历月份 175 | day(int): 农历日期 176 | leap(bool): 农历是否为闰月日期 177 | 178 | Returns: 179 | bool: 校验是否通过 180 | """ 181 | # 年份低于1900,大于2100,或者月份不属于 1-12,或者日期不属于 1-30,返回校验失败 182 | if not (1900 <= year <= 2100 and 1 <= month <= 12 and 1 <= day <= 30): 183 | return False 184 | 185 | year_code = self.CHINESE_YEAR_CODE[year - 1900] 186 | 187 | # 有闰月标志 188 | if leap: 189 | if (year_code & 0xf) != month: # 年度闰月和校验闰月不一致的话,返回校验失败 190 | return False 191 | elif day == 30: # 如果日期是30的话,直接返回年度代码首位是否为1,即闰月是否为大月 192 | return (year_code >> 16) == 1 193 | else: # 年度闰月和当前月份相同,日期不为30的情况,返回通过 194 | return True 195 | elif day <= 29: # 非闰月,并且日期小于等于29,返回通过 196 | return True 197 | else: # 非闰月日期为30,返回年度代码中的月份位是否为1,即是否为大月 198 | return ((year_code >> (12 - month) + 4) & 1) == 1 199 | 200 | @staticmethod 201 | def _decode(year_code): 202 | """解析年度农历代码函数 203 | 204 | Arguments: 205 | year_code(int): 从年度代码数组中获取的代码整数 206 | 207 | Returns: 208 | int: 
当前年度代码解析以后形成的每月天数数组,已将闰月嵌入对应位置,即有闰月的年份返回长度为 13,否则为 12 209 | 210 | """ 211 | month_days = list() 212 | for i in range(5, 17): 213 | if (year_code >> (i - 1)) & 1: 214 | month_days.insert(0, 30) 215 | else: 216 | month_days.insert(0, 29) 217 | if year_code & 0xf: 218 | if year_code >> 16: 219 | month_days.insert((year_code & 0xf), 30) 220 | else: 221 | month_days.insert((year_code & 0xf), 29) 222 | return month_days 223 | 224 | 225 | if __name__ == '__main__': 226 | ls = LunarSolarDate() 227 | res = ls.to_lunar_date(datetime(1989, 10, 22)) 228 | print(res) 229 | res = ls.to_solar_date(1989, 9, 23, False) 230 | print(type(res)) 231 | -------------------------------------------------------------------------------- /timeparser/rule_pattern.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author : Junhui Yu 3 | # @File : rule_pattern.py 4 | # @Time : 2022/10/25 11:18 5 | 6 | 7 | from .helper import absence 8 | 9 | # --------------------------------------------------------------------- 10 | # # 中文字符正则 11 | ANCIENT_CHINESE_CHAR_PATTERN = '[一-龥㐀-䶵]' # 在 gb13000.1 基础上扩展 6582 个古汉字,共 27484 个汉字 12 | # gb13000.1 收录的汉字 20902 个,但其中有很多不常用字,在 chinese_char_dictionary_loader 有说明 13 | CHINESE_CHAR_PATTERN = '[一-龥]' 14 | 15 | # r'(?=[^:: \t\u3000])' 16 | 17 | # --------------------------------------------------------------------- 18 | # 转义符号 19 | ESCAPE_CHAR_PATTERN = '\t\n\a\b\f\r\v' 20 | 21 | # --------------------------------------------------------------------- 22 | # 异常字符 23 | # - 字节编码,包括单字节与多字节 unicode 编码,均有大量的字符无法正常显示,且 24 | # 无对应的打印文本,这部分字符需要被剔除。 25 | # - 此外,仍有大量的可打印字符,由于出现概率极低,且对中文处理作用极小,可以删除。 26 | 27 | # 一、单字节字符: 28 | # 即一个字节表示的字符,\x00~\xff,其中有一部分转义字符不打印,且非 # \t\n\r。 29 | # 因此需要作为异常字符剔除掉,主要包括:\x00~\x08,\x0e~\x1f,\x7f~\x9f,\xa1~\xff 30 | # 剩余的单字节字符参考 ascii 码表 31 | ASCII_EXCEPTION_PATTERN = '[^\x09-\x0d\x20-\x7e\xa0£¥©®°±×÷]' 32 | 33 | # 二、UNICODE 字符 34 | # 即多个字节组成的字符,其中,囊括了各种语言的各种符号,这里我们只关心常用中文字符 35 | # 
以及相应的常用符号,单字节符号、标点符号等。而日文、俄文、拉丁、希腊、数学公式、 36 | # 物理单位等符号 绝大多数不常用的都被丢弃。其中 㐀-䶵 指的是另一个汉字字符集 37 | # 仅保留了常用符号,数字标识,如 ① 等 38 | UNICODE_EXCEPTION_PATTERN = '[^‐-”•·・…‰※℃℉Ⅰ-ⅹ①-⒛\u3000-】〔-〞㈠-㈩一-龥﹐-﹫!-~¢£¥]' 39 | EXCEPTION_PATTERN = ASCII_EXCEPTION_PATTERN[:-1] + UNICODE_EXCEPTION_PATTERN[2:] 40 | 41 | # --------------------------------------------------------------------- 42 | # 全角字母、数字、空格正则 43 | FULL_ANGLE_ALPHABET = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 ' 44 | HALF_ANGLE_ALPHABET = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 ' 45 | 46 | # --------------------------------------------------------------------- 47 | # HTML 标签 48 | HTML_TAG_PATTERN = '<[^<\u4E00-\u9FA5,。;!?、“”‘’()—《》…●]+?>' 49 | 50 | # 存在一定的错误金额字符串依然能够解析并通过的情况 51 | CHINESE_NUM = '[一二三四五六七八九壹贰叁弎仨肆伍陆柒捌玖俩两零]' # 金额数字 52 | CHINESE_UNIT = '[〇O零十百千万亿兆拾佰仟萬億]' # 金额数字单位 53 | CURRENCY_CASE = r'(块(钱)?(人民币)?|元((人民|港|日|澳|韩|(新)?台)币)?|(人民|港|日|澳|韩|(新)?台)币|圆(整)?|' \ 54 | r'(美|港|澳门|日|韩|缅|马|新加坡|欧|加|新西兰|澳|澳大利亚)元|美(金|刀)|英镑|马克|法郎|卢布|泰铢)' 55 | 56 | CHI_N = CHINESE_NUM 57 | CHI_U = CHINESE_UNIT 58 | 59 | # 标准金额数字格式 7,129,012.02元 60 | MONEY_PATTERN_1 = r'((\d{1,3}([,,]\d{1,3})*(\.\d{0,2})?)' + CURRENCY_CASE + ')' 61 | # 纯数字格式 340000.0元 62 | MONEY_PATTERN_2 = r'((\d{1,12}(\.\d{0,2})?)' + CURRENCY_CASE + ')' 63 | # 中文金额格式 一万二千三百四十五 64 | CHINESE_MONEY_PATTERN = ''.join(['(((', CHI_N, '?', CHI_U, '{1,2})*', CHI_N, '?)']) 65 | # 正式文本中文金额格式 一万二千三百四十五元 66 | MONEY_PATTERN_3 = CHINESE_MONEY_PATTERN + CURRENCY_CASE + '(' + CHI_N + '[角|毛])?(' + CHI_N + '分)?)' 67 | # 口语文本中文金额格式 “三十五块八毛”,但不允许 “三十五块” 或 “三十五块八” 出现:有歧义 68 | MONEY_PATTERN_4 = CHINESE_MONEY_PATTERN + '(块)' + '(' + CHI_N + '[角|毛])(' + CHI_N + '分)?)' 69 | # 数字+汉字单位格式 9300万元 1.2万元 9佰元 70 | MONEY_PATTERN_5 = r'(\d{1,4}(\.\d{0,4})?' 
+ CHI_U + CURRENCY_CASE + ')' 71 | 72 | MONEY_PATTERN = '(' + '|'.join( 73 | [MONEY_PATTERN_1, MONEY_PATTERN_2, 74 | MONEY_PATTERN_3, MONEY_PATTERN_4, MONEY_PATTERN_5]) + ')' 75 | 76 | # --------------------------------------------------------------------- 77 | # 中文括号,用于提取括号内容,或删除 78 | PARENTHESES_PATTERN = '{}「」[]【】()()<>《》〈〉『』〔〕{}<>〖〗' 79 | 80 | # --------------------------------------------------------------------- 81 | # 标点符号 82 | PUNCTUATION_PATTERN = '' 83 | 84 | # --------------------------------------------------------------------- 85 | # 冗余字符处理 86 | # 文本中有连续的 “哈哈哈哈哈” 等字符串,需要删除冗余字符串,返回为 “哈” 87 | REDUNDANT_PATTERN = ' -\t\n啊哈呀~\u3000\xa0•·・' 88 | 89 | # --------------------------------------------------------------------- 90 | # 纯数字格式,用于过滤停用词时,过滤掉纯数字(包括汉字数字) 91 | # 融合了百分比格式、序数词,形容词(数*、*余等),负数,数字范围等,还差分数表 92 | # 示未添加,如 “三十分之一” 93 | BASE_NUMBER_PATTERN = '[' + CHINESE_NUM[1:-1] + CHINESE_UNIT[1:-1] + r'点\d\%%\.\,.多余几]+' 94 | NUMBER_PATTERN = r'^((十|百|千|万)分之|第|数|好|\-)?' + BASE_NUMBER_PATTERN + r'([\~\-~-至]?' + BASE_NUMBER_PATTERN + ')?(多|余)?$' 95 | 96 | # 纯数字格式(不包括汉字数字) 97 | # 融合了整数、小数、百分比等 98 | PURE_NUMBER_PATTERN = r'\-?\d+(\.\d+)?(%|%)?' 99 | 100 | # --------------------------------------------------------------------- 101 | # 时间词汇,用于停用词过滤时,将时间词汇过滤掉 102 | # 1. 时间格式仅用于滤除具体确切的时间点和时间段,如“2019年6月30日”,“第一季度”, 103 | # “18:30:51”,“3~4月份”,“清晨”,“年前” 等等,此类词汇描述了具体的时间,在语言中 104 | # 一般作为时间状语存在,因此在停用词滤除中,需要将该部分词汇滤除。 105 | # 2. 但不滤除模糊的时间范围,如“三十年”,“六七个月”,“十周”,“四日” 等等,这些时间 106 | # 描述了一个模糊的时间段,并没有确切的指代,在语言中一般做宾语,补语,主语等,因此 107 | # 在停用词滤除中,一般不将此类词汇滤除。 108 | # 3. 有些词汇含义指代不明,如“三十一日”,具体指某月 31日,还是31天的时间,并不确切, 109 | # 此时不予滤除。 110 | # 4. 
节日名称不予滤除,如“圣诞节”、“除夕夜”,尽管其指示具体的时间点,但是一般做名词性 111 | # 成分,因此不予滤除。 112 | 113 | # 时分秒格式 114 | HO_N = r'([01]?\d|2[01234])' # 时 数字格式 115 | MI_N = r'[012345]?\d' # 分 数字格式 116 | SE_N = r'[012345]?\d' # 秒 数字格式 117 | HMS_GAP = '[::]' 118 | HMS_PATTERN_1 = '^(' + HO_N + HMS_GAP + MI_N + '(' + HMS_GAP + SE_N + ')?)$' # 纯数字格式时分秒,或时分 119 | HMS_PATTERN_2 = '^(' + HO_N + '(点|时|小时)(' + MI_N + '分(钟)?(' + SE_N + '秒(钟)?)?)?)$' # 带汉字时分秒 120 | HMS_PATTERN_3 = '^(' + HMS_PATTERN_1 + r'[\-\~~—]{1,2}' + HMS_PATTERN_1 + ')$' # 时间段 121 | # HMS_PATTERN_4 = '^([012]?\d点)$' # 有一定前提条件,即前后必须也有时间词汇 122 | 123 | # 年月日格式 124 | YE_N = r'[12]?\d{2,3}' # 年份数字格式 125 | MO_N = r'([0]?\d|1[012])' # 月份数字格式 126 | MO_C = r'(元|正|腊|一|二|三|四|五|六|七|八|九|十(一|二)?)' # 月份汉字格式 127 | DA_N = r'([012]?\d|3[01])' # 日数字格式 128 | YMD_GAP = r'[\-\~— ~\.]{1,2}' 129 | SPAN_GAP = r'[\~\-~-至]' 130 | 131 | YMD_PATTERN_1 = '^((公元(前)?)?' + YE_N + '年(初|底|中)?)?((' + MO_N + '|' + MO_C + ')月(份|底|初)?)?(' + DA_N + '[日号])?$' # 带汉字年月日 132 | YMD_PATTERN_2 = '^(' + YE_N + YMD_GAP + MO_N + '(' + YMD_GAP + DA_N + ')?)$' # 纯数字年月日,或年月 133 | YMD_PATTERN_3 = '^(' + MO_N + YMD_GAP + DA_N + '(' + YMD_GAP + YE_N + ')?)$' # 纯数字月日年,或月日 134 | YMD_PATTERN_4 = '^((公元(前)?)?' + r'(([12]?\d|(二)?十(一|二|三|四|五|六|七|八|九)?)世纪)?((\d0|(一|二|三|四|五|六|七|八|九)十)年代)?(初|末)?' + ')$' # 世纪,年代 135 | YMD_PATTERN_5 = '^(一|二|三|四|五|六|七|八|九|零|〇|○|0){4}年$' 136 | 137 | # 年月日-时分秒合并格式 138 | YMD_HMS_PATTERN = '^(' + YMD_PATTERN_2[1:-1] + r'([\-\~\—~]{1,2})?' 
+ HMS_PATTERN_1[1:-1] + ')$' 139 | 140 | # 农历日期 141 | LUNAR_PATTERN = '((闰)?(元|正|腊|一|二|三|四|五|六|七|八|九|十(一|二)?)月|大年)(初(一|二|三|四|五|六|七|八|九|十)|(一|二|三|四|五|六|七|八|九|十){2,3})' 142 | 143 | # 时间段 144 | # 年时间段 145 | YEAR_SPAN_PATTERN = '^' + YE_N + SPAN_GAP + YE_N + '年(代)?$' 146 | # 月时间段 147 | MONTH_SPAN_PATTERN = '^' + MO_N + SPAN_GAP + MO_N + '月(份)?$' 148 | # 日时间段 149 | DAY_SPAN_PATTERN = '^' + DA_N + SPAN_GAP + DA_N + '日$' 150 | 151 | # 季节格式 152 | SEASON_PATTERN = '((春|夏|秋|冬){1,2}(季|天|日)|(第)?(一|二|三|四)(季度)(末)?)' 153 | 154 | # 星期格式 155 | WEEK_PATTERN = '((上(半)?|下(半)?|这|本|前|今|当|上上|下下)?(星期|周)(六日|一|二|三|四|五|六|日|七|天|末|初)?(时)?)' 156 | 157 | # 常见时间短语 158 | COMMON_TIME_PATTERN_1 = '^(年|月|日|时)$' 159 | COMMON_YEAR_PATTERN = r'(昔|翌|头(一|两|几|些)?|(大)?前(一|半|两|几|些)?|近(一|两|几|些)?|这(一|两|几|些)?|那(一|两|几|些)?|上(半)?|下(半)?|(大)?后(一|半|两|几|些)?|同|当|早(一|两|几|些)?|每|去|今|往|本|次|明|明后)?年(中|度|初|前|末|底|终|内)?' 160 | COMMON_SEASON_PATTERN = r'(本|上|下|这)' # 季度 161 | COMMON_MONTH_PATTERN = r'(下(个)?|首(个)?|前(两|几)?|上(个)?|这(个)?|次|这(些|个)?|那(些|个)?|上半(个)?|下半(个)?|同|本|当|每)?月(份|中|度|初|末|底)?' 162 | COMMON_DAY_PATTERN = r'(昔|首|前(一|两|几|些)?|翌|昨|次|今|往|明|平|即|往|半|旧|近(一|两|几|些)?|后(一|两|几|些)?|这(一|两|几|些)?|那(一|两|几|些)?|上半|下半|同|当|每(一)?)?(天|日)(前|后)?' 163 | COMMON_TIME_PATTERN_2 = '(下|中|上)(午|旬)|近(期|日)|此前' 164 | COMMON_TIME_PATTERN_3 = '(晚|早)(上|间)' 165 | COMMON_TIME_PATTERN_4 = '(深|每|昨|前|今|午|后|半|上半|下半|春|当|夏|秋|冬)?夜(里|晚|间)?' 
166 | COMMON_TIME_PATTERN_5 = '(今|傍|昨|当)晚' 167 | COMMON_TIME_PATTERN_6 = '(早|凌|今|清)晨|黎明' 168 | COMMON_TIME_PATTERN_7 = '午(后|时)' 169 | 170 | # 各类型的时间正则汇总,可根据需要进行增删 171 | TIME_PATTERN = '(' + '|'.join( 172 | [COMMON_TIME_PATTERN_1, 173 | COMMON_YEAR_PATTERN, COMMON_SEASON_PATTERN, 174 | COMMON_MONTH_PATTERN, COMMON_DAY_PATTERN, 175 | COMMON_TIME_PATTERN_2, COMMON_TIME_PATTERN_3, COMMON_TIME_PATTERN_4, 176 | COMMON_TIME_PATTERN_5, COMMON_TIME_PATTERN_6, COMMON_TIME_PATTERN_7, 177 | WEEK_PATTERN, LUNAR_PATTERN, YMD_HMS_PATTERN, SEASON_PATTERN, 178 | YMD_PATTERN_1, YMD_PATTERN_2, YMD_PATTERN_3, YMD_PATTERN_4, 179 | YMD_PATTERN_5, 180 | HMS_PATTERN_1, HMS_PATTERN_2, HMS_PATTERN_3, 181 | YEAR_SPAN_PATTERN, MONTH_SPAN_PATTERN, DAY_SPAN_PATTERN, ]) + ')' 182 | 183 | # --------------------------------------------------------------------- 184 | # URL 185 | URL_PATTERN = r'(?<=[^.])((?:(?:https?|ftp|file)://|(?()—《》…● \t\n])' 188 | 189 | ####################################################################### 190 | # 针对 time_parser 的正则字符串 191 | # 字符串操作 192 | # 年 193 | LIMIT_YEAR_STRING = r'(前(一)?|今|明|去|同|当|后|大前|本|次|上(一)?|这(一)?)年' 194 | LUNAR_YEAR_STRING = r'([一二三四五六七八九零〇]{2}|[一二三四五六七八九零〇]{4}|[12]\d{3}|\d{2})年' 195 | YEAR_STRING = r'([12]?\d{2,3}|[一二三四五六七八九零〇]{2,4})年' 196 | 197 | # 月 198 | MONTH_NUM_STRING = r'(1[012]|[0]?[1-9]|十[一二]|[一二三四五六七八九十])' # 1~12 std month num 199 | MONTH_STRING = MONTH_NUM_STRING + r'月(份)?' 
200 | MONTH_NUM_STRING = MONTH_NUM_STRING[:-2] + r'两])' # 1~12 order month num 201 | BLUR_MONTH_STRING = r'(初|[一]开年|伊始|末|尾|终|底|[上下]半年|[暑寒][假期]|[前中后]期)' 202 | LUNAR_MONTH_STRING = r'(闰)?([正一二三四五六七八九十冬腊]|十[一二]|[1-9]|1[012])月' 203 | LIMIT_MONTH_STRING = r'([下上]((一)?个)?|同|本|当|次|这)月' 204 | SELF_EVI_LUNAR_MONTH_STRING = r'((闰)?[正冬腊]|闰([一二三四五六七八九十]|十[一二]|[1-9]|1[012]))月' 205 | 206 | # 周 207 | WEEK_NUM_STRING = r'[一二两三四五六七八九十0-9]{1,3}' # 1~52 208 | WEEK_STRING = r'(周|星期|礼拜)' 209 | 210 | # 日 211 | DAY_NUM_STRING = r'(([12]\d|3[01]|[0]?[1-9])|([一二]?十)?[一二三四五六七八九]|(三十)?[一]|[二三]?十)' # 1~31 212 | DAY_STRING = DAY_NUM_STRING + r'[日号]' 213 | BLUR_DAY_STRING = r'([上中下]旬|初|中|底|末)' 214 | # 允许 `初8` 阿拉伯数字出现,但不允许 `廿2`、`23` 等作为农历`日` 215 | LUNAR_SOLAR_DAY_STRING = r'((初|(二)?十|廿)[一二三四五六七八九]|[初二三]十|初([1-9]|10)|[12]\d|3[01]|[0]?[1-9])' 216 | LUNAR_DAY_STRING = r'((初|(二)?十|廿)[一二三四五六七八九]|[初二三]十|初([1-9]|10))' 217 | SELF_EVI_LUNAR_DAY_STRING = r'([初廿]([一二三四五六七八九十1-9]|10))' 218 | 219 | # 时 220 | HOUR_STRING = r'((十)?[一两二三四五六七八九]|[零〇十]|二十[一二三四]?|[01]?\d|2[01234])[时点]' 221 | BLUR_HOUR_STRING = r'(凌晨|白天|清[晨|早]|黎明|一(大)?早|早[晨上]?|[上中下]午|午后|(傍)?晚[间上]?|[深半午]?夜[里间]?|[上下前后]半夜)' 222 | 223 | # 分、秒 224 | MIN_SEC_STRING = r'((零|〇|[一二三四五]?十)[一二三四五六七八九]|[二三四五]?十|[012345]?\d)' 225 | 226 | # seg 227 | I = '|' 228 | LU = r'([农阴]历)' 229 | LU_A = absence(LU) 230 | 231 | # appendix 232 | TIME_POINT_SUFFIX = r'(左右|许|前后)' 233 | TIME_SPAN_SUFFIX = r'((之)?间)' 234 | # TIME_DELTA_SUFFIX = r'' 235 | 236 | # 节气 237 | SOLAR_TERM_STRING = r'(立春|雨水|惊蛰|春分|清明|谷雨|立夏|小满|芒种|夏至|小暑|大暑|' \ 238 | r'立秋|处暑|白露|秋分|寒露|霜降|立冬|小雪|大雪|冬至|小寒|大寒)' 239 | 240 | # 固定公历节日 241 | FIXED_SOLAR_FESTIVAL = r'((元旦|十一)|(三八|五一|六一|七一|八一|国庆|圣诞)(节)?|' \ 242 | r'((三八)?妇女|女神|植树|(五一)?劳动|(五四)?青年|(六一)?儿童|(七一)?建党|(八一)?建军|教师|情人|愚人|万圣|护士)节|' \ 243 | r'地球日|三[\.•·・]?一五|双(十一|11)|(.{1,4})?消费者权益日)' 244 | # 固定农历节日 245 | FIXED_LUNAR_FESTIVAL = r'((春|填仓|上巳|寒食|浴佛|姑姑|财神|下元|寒衣)节|' \ 246 | r'(龙抬头|除夕)|' \ 247 | r'(大年初[一二三四五六七八九十])|' \ 248 | r'(端午|端阳|七夕|元宵|中秋|重阳|腊八|中元)(节)?)' 249 | # 
规律公历节日 250 | REGULAR_FOREIGN_FESTIVAL = r'(感恩|母亲|父亲)节' 251 | 252 | # time_delta 数字正则 253 | DELTA_NUM_STRING = r'(([一两二三四五六七八九十百千万零]+点)?[一两二三四五六七八九十百千万零]+|([\d十百千万,]+\.)?[\d十百千万,]+)' 254 | QUARTER_NUM_STRING = r'[一两二三四1-4]' 255 | 256 | # 单个数字正则 257 | SINGLE_NUM_STRING = r'[一两二三四五六七八九十\d]' 258 | 259 | # time_delta 正则 260 | YEAR_DELTA_STRING = ''.join([DELTA_NUM_STRING, r'[多余]?(周)?年(多)?', I, '半年', I, SINGLE_NUM_STRING, '年半']) 261 | SOLAR_SEASON_DELTA_STRING = ''.join([DELTA_NUM_STRING, r'个(多)?季度']) 262 | MONTH_DELTA_STRING = ''.join([DELTA_NUM_STRING, r'(多)?个(多)?月', I, '(俩|三|仨)月', I, '半(个(多)?)?月', I, 263 | SINGLE_NUM_STRING, '个半月']) 264 | WORKDAY_DELTA_STRING = ''.join([DELTA_NUM_STRING, r'[多余]?(个)?(工作|交易)日']) 265 | DAY_DELTA_STRING = ''.join([DELTA_NUM_STRING, r'[多余]?[天日]', I, '半天', I, SINGLE_NUM_STRING, '天半']) 266 | WEEK_DELTA_STRING = ''.join([DELTA_NUM_STRING, r'[多余]?((个(多)?)?(星期|礼拜)|周(?!年))', I, r'俩(星期|礼拜)']) 267 | HOUR_DELTA_STRING = ''.join([DELTA_NUM_STRING, r'[多余]?(个(多)?)?(小时|钟头)', I, 268 | '半(个(多)?)?(小时|钟头)', I, '俩(小时|钟头)', I, SINGLE_NUM_STRING, '个半(小时|钟头)']) 269 | QUARTER_DELTA_STRING = ''.join([QUARTER_NUM_STRING, '刻钟']) 270 | MINUTE_DELTA_STRING = ''.join([DELTA_NUM_STRING, r'[多余]?分(钟)?(半)?', I, '半分钟', I, 271 | SINGLE_NUM_STRING, '+分半(钟)?']) 272 | SECOND_DELTA_STRING = ''.join([DELTA_NUM_STRING, r'[多余]?秒(钟)?']) 273 | 274 | # 将时间进行转换 275 | # DELTA_SUB = r'([之以]?[内前后上下来])' 276 | DELTA_SUB = r'([之以]?[内前后来])' 277 | 278 | ######################################################################## 279 | # 时间 NER 字符规则 280 | TIME_CHAR_STRING = ''.join( 281 | [r'(现在|开始|黎明|过去|愚人|感恩|圣诞|儿童|劳动|父亲|母亲|礼拜|霜降|立春|立冬|小寒|大寒|', 282 | r'立夏|立秋|冬至|', 283 | r'[102年月日3589647时午至天上个分今下\:\-点晚前一小后周起内以底三晨钟来半两凌当十份季去早多第五中初\.度二从六期旬到间四节号:', 284 | r'代\~\—~春明昨星末秋之同·世纪本七九秒每次八夏/夜零正冬腊余工作元国清傍交易首 ()()、万宵全暑头端庆旦-际消费者权益大里农阴历双财', 285 | r'近运深, ”夕〇几汛假壹无数白百刻许左右的这本])+']) 286 | FAKE_POSITIVE_START_STRING = r'[起到至以开-\—\-\~~]' # 此字符串不可作为时间串的开始, `以来|开始` 为取首字 287 | FAKE_POSITIVE_END_STRING = r'[到至-\—\-\~~]' # 
此字符串不可作为时间串的结束 288 | 289 | ######################################################################## 290 | # 货币金额 NER 字符规则 291 | MONEY_PREFIX_STRING = r'((将)?近|只有|仅|(大)?约(莫|合)?|大概|至少(要)?|不(到|足|超过)?|逾|高于|(高)?达(到)?|^上|(超)?过|超)' 292 | MONEY_SUFFIX_STRING = r'(以上|以下|左右|上下)' 293 | 294 | MONEY_BLUR_STRING = r'((大)?约(莫|合)?|大概|左右|上下)' 295 | MONEY_MINUS_STRING = r'((将)?近|不(到|足|超过)?|以下)' 296 | MONEY_PLUS_STRING = r'(至少(要)?|逾|高于|上|(超)?过|超|以上)' 297 | 298 | MONEY_NUM_MIDDLE_STRING = r'[,, ]' 299 | # 用于检测字符串是否有误,直接报错 300 | MONEY_NUM_STRING = r'^[ \.多个数几百佰k千仟w万萬亿十拾兆〇O0-9零0-9一二两三四五六七八九壹贰俩叁弎仨肆伍陆柒捌玖]+$' 301 | 302 | MONEY_KUAI_MAO_JIAO_FEN_STRING = r'[分角毛块]' 303 | MONEY_PREFIX_CASE_STRING = r'(港币|人民币|(新)?台币)' 304 | 305 | # 货币单位,该货币单位也包含了上述 MONEY_PREFIX_CASE_STRING 306 | # 雷亚尔:巴西货币 307 | MONEY_SUFFIX_CASE_STRING = r'((分|角|毛|块|元)钱?|(人民|港|日|澳|(新)?台)币|圆(整)?|英镑|美(金|分|刀)|马克|法郎|卢布|泰铢|' \ 308 | r'元((人民|港|日|澳|韩|(新)?台)币)?|(美|港|澳门|日|韩|缅|马|新加坡|欧|加|加拿大|新西兰|澳|澳大利亚)元|' \ 309 | r'(越(南)?)盾|雷亚尔)' 310 | 311 | # MONEY_SPAN_GAP_START = r'(从)' 312 | # MONEY_SPAN_GAP_MIDDLE = r'(\~+|\-+|~+|-+|至(?!少)|(?= 0: 157 | if self.num_pattern.search(time_candidate[j - 1]): 158 | continue 159 | if self.num_pattern.search(sub_string_for_parse[-1]): 160 | if offset[1] < length: 161 | if self.num_pattern.search(time_candidate[offset[1]]): 162 | continue 163 | 164 | result = self.parse_time( 165 | sub_string_for_parse, time_base=time_base, strict=True, 166 | ret_type=ret_type, ret_future=ret_future, 167 | period_results_num=period_results_num) 168 | 169 | return sub_string, result, offset 170 | except (ValueError, Exception): 171 | continue 172 | 173 | return None, None, None 174 | 175 | def _grid_search_2(self, time_candidate): 176 | """ 全面搜索候选时间字符串,从前至后,从长至短 """ 177 | print(time_candidate) 178 | length = len(time_candidate) 179 | for i in range(length - 1): # 控制起始点 180 | for j in range(length, i, -1): # 控制终止点 181 | try: 182 | offset = [i, j] 183 | sub_string = time_candidate[i: j] 184 | print(sub_string) 185 | # 
处理假阳性。检查子串,对某些产生歧义的内容进行过滤。 186 | # 原因在于,parse_time 会对某些不符合要求的字符串做正确解析. 187 | if not TimeExtractor._filter(sub_string): 188 | continue 189 | 190 | result = self.parse_time(sub_string, strict=True) 191 | 192 | return sub_string, result, offset 193 | except (ValueError, Exception): 194 | continue 195 | 196 | return None, None, None 197 | 198 | def extract_time_candidates(self, text): 199 | """ 获取所有的候选时间字符串,其中包含了时间实体 """ 200 | text_length = len(text) 201 | idx_count = 0 202 | time_candidates_list = list() 203 | while idx_count < text_length: 204 | matched_res = self.time_string_pattern.search(text[idx_count:]) 205 | # print(matched_res) 206 | if matched_res is not None: 207 | if len(matched_res.group()) > 1: 208 | time_candidates_list.append( 209 | {'time_candidate': matched_res.group(), 210 | 'offset': [idx_count + matched_res.span()[0], 211 | idx_count + matched_res.span()[1]]}) 212 | elif matched_res.group() in self.single_char_time: 213 | # 可能误打 “春”、“夏” 等单字符时间表达,故在此加入 214 | time_candidates_list.append( 215 | {'time_candidate': matched_res.group(), 216 | 'offset': [idx_count + matched_res.span()[0], 217 | idx_count + matched_res.span()[1]]}) 218 | idx_count += matched_res.span()[1] 219 | else: 220 | break 221 | 222 | return time_candidates_list 223 | 224 | 225 | if __name__ == '__main__': 226 | text = ''' 227 | 过去5天涨幅小于-15%''' 228 | 229 | extract_time = TimeExtractor() 230 | res = extract_time(text, with_parsing=False) 231 | print(res) 232 | --------------------------------------------------------------------------------