├── README.md ├── __pycache__ └── time_extractor.cpython-36.pyc ├── demo.py ├── result.png └── time_extractor.py /README.md: -------------------------------------------------------------------------------- 1 | # TimeExtractor 2 | 针对口语进行时间抽取并标准化 3 | 4 | ### 特性 5 | 6 | - 具有一定的时间段推理能力,如十天后xxx 7 | - 支持中文数字,如三月八日早上九点xxx 8 | - 针对口语化语句设计,如明天xxx,后天xxx,三月……嗯……八日……,九点半等口语描述 9 | 10 | ### 缺点 11 | 12 | 因为这个库只是为了满足我自己的需求,用于解析蕴含预约/提醒意图的句子,并不是通用的时间抽取器,所以具有如下的缺点: 13 | 14 | - 只能解析未来的时间,无法解析过去的时间,比如昨天xxx,前天xxx等 15 | - 对于时间跨度过大,解析能力不足,比如一万天后提醒我xxxx 16 | - 目前只是基于规则进行解析,可能有一定的不足 17 | 18 | ### How To Use 19 | 20 | see [demo.py](demo.py) 21 | 22 | ```python 23 | from time_extractor import Extractor 24 | extractor = Extractor() 25 | text = "三月……嗯……八日早上八点提醒我出门" 26 | time_list, timestamp = extractor.extract(text) 27 | print(time_list, timestamp) 28 | # Output: [年,月,日,时,分,秒], 时间戳 29 | # [2020,3,8,8,0,0] ,1583625600.0 30 | ``` 31 | 32 | 33 | 34 | ### Demo 35 | 36 | 测试日期为2020.3.2 37 | 38 | ![](result.png) 39 | 40 | 41 | 42 | 项目更新与否取决于我有没有新需求,开源是为了方便ctrl+c,如果对你也有帮助,希望你能点一个star。 -------------------------------------------------------------------------------- /__pycache__/time_extractor.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MashiMaroLjc/TimeExtractor/639bddb2ff2792c4822132e7ed2924234e518f1b/__pycache__/time_extractor.cpython-36.pyc -------------------------------------------------------------------------------- /demo.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | from time_extractor import Extractor 3 | example = [ 4 | "八点提醒我出门", 5 | "大后天八点提醒我出门", 6 | "三月八日早上八点提醒我出门", 7 | "三月……嗯……八日早上八点提醒我出门", 8 | "三月八日中午一点提醒我出门", 9 | "三月八日晚上九点提醒我出门", 10 | "3月八日晚上9点提醒我出门", 11 | "3月八日晚上九点半提醒我出门", 12 | "3月八日晚上9点37分零六秒提醒我出门", 13 | "3月八日晚上09:37:06提醒我出门", 14 | "明年3月八日晚上09:37:06提醒我出门", 15 | "五十年后3月八日晚上09:37:06提醒我出门", 16 | "下一个月后3月八日晚上09:37:06提醒我出门", 17 | 
# coding:utf-8
"""Rule-based extraction of a date/time from spoken-style Chinese text."""
import re
import time
from datetime import datetime, timedelta


def get_zh2num_dict():
    """Build the lookup table from Chinese numerals (0-99) to integers.

    The numerals are generated instead of typed out: the ten single digits
    come from a base table, 10-19 are spelled "十X" and 20-99 are spelled
    "X十Y" (the trailing digit is omitted for round tens).

    :return: dict mapping each numeral string (e.g. "二十一") to its value.
    """
    digits = "零一二三四五六七八九"
    numerals = list(digits)
    for value in range(10, 100):
        tens, ones = divmod(value, 10)
        # 10-19 carry no leading digit ("十三", not "一十三").
        head = digits[tens] if tens > 1 else ""
        tail = digits[ones] if ones else ""
        numerals.append(head + "十" + tail)
    return {numeral: value for value, numeral in enumerate(numerals)}


def is_include(query, keywords: list, return_word=False):
    """Report whether any of *keywords* occurs in *query*.

    Keywords are tried in list order and the first hit wins, so a caller
    that mixes overlapping keywords (e.g. "大后天" and "后天") must list
    the longer one first.

    :param query: text to search.
    :param keywords: candidate substrings.
    :param return_word: when true, return ``(found, keyword)`` instead of a
        bare bool; ``keyword`` is ``None`` when nothing matched.
    :return: ``bool`` or ``(bool, str | None)`` depending on *return_word*.
    """
    hit = next((keyword for keyword in keywords if keyword in query), None)
    found = hit is not None
    if return_word:
        return found, hit
    return found


class Extractor:
    """Extract a reminder time from a Chinese sentence.

    The sentence is first rewritten so that Chinese numerals become Arabic
    digits and spoken clock times ("9点半") become ``h:m:s``; the date and
    time parts are then resolved relative to the current local time.
    """

    # Day keyword -> offset in days.  Longer keywords come first because
    # is_include() returns the first hit and "后天" is a substring of
    # "大后天".
    _DAY_OFFSETS = (
        ("大后天", 3),
        ("后天", 2),
        ("后日", 2),
        ("明天", 1),
        ("明日", 1),
        ("今天", 0),
        ("今日", 0),
    )

    def __init__(self):
        # Every keyword listed here means "+1 year".
        self.year_extract_keyword = ["明年"]
        # "N年后": N years from now.
        self.year_extract_pattern = re.compile("(?P<value>[0-9]{1,2})年后")

        # Every keyword listed here means "+1 month".  "下1个月" is what
        # "下一个月" looks like after zh2num() has rewritten "一" to "1".
        self.month_extract_keyword = ["下一个月", "下1个月", "下个月"]
        # "N个月后" / "N月后": N months from now.
        self.month_extract_pattern = re.compile("(?P<value>[0-9]{1,2})个?月后")
        # "N月" not followed by "后": an absolute month.
        self.month_extract_pattern2 = re.compile("(?P<value>[0-9]{1,2})月(?!后)")

        self.day_extract_keyword = [word for word, _ in self._DAY_OFFSETS]
        # "N天后" / "N日后": N days from now.
        self.day_extract_pattern = re.compile("(?P<value>[0-9]{1,2})[天日]后")
        # "N日" / "N号" not followed by "后": an absolute day of month.  A
        # lookahead is used so the pattern also matches at end of text.
        self.day_extract_pattern2 = re.compile("(?P<value>[0-9]{1,2})[日号](?!后)")
        # Spoken clock time: "8点", "8点钟", "8时", "9点半", "9点37分",
        # "9点37分06秒".  The hour digits are required so that "时" inside
        # words such as "到时候" or "小时" is left alone.
        self.time_format_pattern = re.compile(
            "(?P<hour>[0-9]{1,2})(?:点钟|点|时)"
            "(?:(?P<half>半)"
            "|(?P<minute>[0-9]{1,2})分钟?(?:(?P<second>[0-9]{1,2})秒钟?)?)?"
        )
        self.time_extract_pattern = re.compile("[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}")
        self.zh2num_dict = get_zh2num_dict()

    def zh2num(self, text):
        """Replace every Chinese numeral (0-99) in *text* with digits.

        :param text: input sentence.
        :return: the sentence with numerals rewritten, e.g. "三月八日" ->
            "3月8日".
        """
        result = text
        # Walk from 99 down to 0 so that a compound numeral ("二十一") is
        # replaced before the shorter numerals it contains ("二十", "十").
        ordered = sorted(self.zh2num_dict, key=self.zh2num_dict.get, reverse=True)
        for numeral in ordered:
            result = result.replace(numeral, str(self.zh2num_dict[numeral]))
        return result

    @staticmethod
    def _clock_repl(match):
        """Render one time_format_pattern match as ``h:m:s``."""
        if match.group("half"):
            minute, second = "30", "00"
        else:
            minute = match.group("minute") or "00"
            second = match.group("second") or "00"
        return "{}:{}:{}".format(match.group("hour"), minute, second)

    def time_format(self, text):
        """Rewrite spoken clock times in *text* as ``h:m:s``.

        The hour must already be written with Arabic digits (run zh2num()
        first).  Missing minutes/seconds are filled with "00" and "半"
        becomes 30 minutes.

        :param text: sentence whose numerals are Arabic digits.
        :return: the sentence with e.g. "9点半" replaced by "9:30:00";
            unchanged when no clock time is found.
        """
        return self.time_format_pattern.sub(self._clock_repl, text)

    @staticmethod
    def _to_24_hour(hour, string):
        """Apply the day-period words found in *string* to a clock hour.

        "晚上" and "下午" shift 1-11 into the afternoon/evening, "中午"
        shifts 1-10 (so "中午1点" is 13:00 while "中午11点"/"中午12点" keep
        their value).  "晚上12点" and "晚上0点" mean the midnight that ends
        the day and are returned as 24 so the caller rolls over to the
        next day.
        """
        if "晚上" in string:
            if hour in (0, 12):
                return 24
            return hour + 12 if hour < 12 else hour
        if "下午" in string:
            return hour + 12 if 1 <= hour < 12 else hour
        if "中午" in string:
            return hour + 12 if 1 <= hour < 11 else hour
        return hour

    def _extract(self, string, now=None):
        """Resolve the date/time described by an already normalised sentence.

        :param string: sentence after zh2num() and time_format().
        :param now: reference POSIX timestamp in seconds; defaults to the
            current time.  Mainly useful for reproducible results.
        :return: ``(time_list, timestamp)`` where ``time_list`` is a 9-item
            ``time.struct_time``-style tuple (year, month, day, hour,
            minute, second, 0, 0, -1) in local time and ``timestamp`` is
            ``time.mktime(time_list)``.
        """
        if now is None:
            now = time.time()
        current = time.localtime(int(now))
        year, month, day = current.tm_year, current.tm_mon, current.tm_mday
        hour, minute, second = current.tm_hour, current.tm_min, current.tm_sec

        # Year: keyword ("明年") or "N年后".
        if is_include(string, self.year_extract_keyword):
            year += 1
        else:
            found = self.year_extract_pattern.search(string)
            if found:
                year += int(found.group("value"))

        # Month: keyword ("下个月") or "N个月后", then an optional absolute
        # month ("3月") which overrides the month number.
        month_shift = 0
        if is_include(string, self.month_extract_keyword):
            month_shift = 1
        else:
            found = self.month_extract_pattern.search(string)
            if found:
                month_shift = int(found.group("value"))
        # Carry month overflow into the year (e.g. +12 months = +1 year).
        year, month_index = divmod(year * 12 + (month - 1) + month_shift, 12)
        month = month_index + 1
        found = self.month_extract_pattern2.search(string)
        if found and 1 <= int(found.group("value")) <= 12:
            month = int(found.group("value"))

        # Day: keyword ("明天"), otherwise "N天后" and/or an absolute "N日".
        # The day may run past the end of the month here; it is normalised
        # below.
        has_keyword, word = is_include(
            string, self.day_extract_keyword, return_word=True
        )
        if has_keyword:
            day += dict(self._DAY_OFFSETS).get(word, 0)
        else:
            found = self.day_extract_pattern.search(string)
            if found:
                day += int(found.group("value"))
            found = self.day_extract_pattern2.search(string)
            if found:
                day = int(found.group("value"))

        # Clock time; when absent the current time of day is kept.
        found = self.time_extract_pattern.search(string)
        if found:
            hour, minute, second = (int(part) for part in found.group(0).split(":"))
            hour = self._to_24_hour(hour, string)

        # Let calendar arithmetic absorb any overflow (day 42, hour 24,
        # February 30th, leap years ...) so the returned fields are valid.
        moment = datetime(year, month, 1) + timedelta(
            days=day - 1, hours=hour, minutes=minute, seconds=second
        )
        # tm_isdst=-1 lets mktime() decide whether DST applies.
        time_list = (
            moment.year, moment.month, moment.day,
            moment.hour, moment.minute, moment.second,
            0, 0, -1,
        )
        timestamp = time.mktime(time_list)
        return time_list, timestamp

    def extract(self, string, now=None):
        """Extract the time described by a spoken Chinese sentence.

        :param string: raw sentence, e.g. "三月八日早上八点提醒我出门".
        :param now: optional reference POSIX timestamp in seconds (defaults
            to the current time).
        :return: ``(time_list, timestamp)``; the first six items of
            ``time_list`` are year, month, day, hour, minute, second.
        """
        # Numerals first, then spoken clock times, then the actual parsing.
        string = self.zh2num(string)
        string = self.time_format(string)
        return self._extract(string, now=now)