#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2017/12/11 11:08
# @Author  : zhm
# @File    : LunarSolarConverter.py
# @Software: PyCharm
"""Convert between Gregorian (solar) dates and Chinese lunar dates.

The conversion is table driven and covers the lunar years whose data is
stored in ``LunarSolarConverter.lunar_month_days`` / ``solar_1_1``
(lunar years 1888 through 2111).  No range checking is performed; callers
must stay inside that range.
"""
from pprint import pprint


class Lunar:
    """Plain record for a Chinese lunar date."""

    def __init__(self, lunarYear, lunarMonth, lunarDay, isleap):
        self.isleap = isleap  # True when the month is the leap (intercalary) month
        self.lunarDay = lunarDay
        self.lunarMonth = lunarMonth
        self.lunarYear = lunarYear


class Solar:
    """Plain record for a Gregorian (solar) date."""

    def __init__(self, solarYear, solarMonth, solarDay):
        self.solarDay = solarDay
        self.solarMonth = solarMonth
        self.solarYear = solarYear


def GetBitInt(data, length, shift):
    """Return the ``length``-bit wide field of ``data`` starting at bit ``shift``."""
    return (data & (((1 << length) - 1) << shift)) >> shift


def SolarToInt(y, m, d):
    """Map a Gregorian date to a consecutive integer day number.

    Months are renumbered so that March is month 0, which puts the leap day
    at the end of the (shifted) year.  Floor division is required: with true
    division (``/`` on Python 3) the result is a float and date differences
    come out wrong.
    """
    m = (m + 9) % 12
    y -= m // 10  # January/February belong to the previous shifted year
    return 365 * y + y // 4 - y // 100 + y // 400 + (m * 306 + 5) // 10 + (d - 1)


def SolarFromInt(g):
    """Inverse of :func:`SolarToInt`: turn a day number back into a ``Solar``."""
    y = (10000 * g + 14780) // 3652425
    ddd = g - (365 * y + y // 4 - y // 100 + y // 400)
    if ddd < 0:
        # The year estimate overshot by one; step back and recompute.
        y -= 1
        ddd = g - (365 * y + y // 4 - y // 100 + y // 400)

    mi = (100 * ddd + 52) // 3060
    mm = (mi + 2) % 12 + 1
    y += (mi + 2) // 12
    dd = ddd - (mi * 306 + 5) // 10 + 1
    return Solar(y, mm, dd)


def _lunar_month_length(days, slot):
    """Return 30 or 29 for month slot ``slot`` of a ``lunar_month_days`` entry.

    Slot 0 is the first month of the lunar year and is stored in bit 12;
    slot ``i`` is stored in bit ``12 - i``.  A leap month occupies its own
    slot directly after the month it repeats.
    """
    return 30 if GetBitInt(days, 1, 12 - slot) == 1 else 29


class LunarSolarConverter:
    """Table-driven converter between ``Solar`` and ``Lunar`` dates."""

    #####################################################################################
    # Lunar calendar data for lunar years 1888-2111.
    # Element 0 is the base year (1887); element i describes lunar year 1887 + i.
    # Layout of every other element, as decoded by the code in this class:
    #   bits 16~13  number of the leap month (0 means the year has no leap month)
    #   bits 12~0   month lengths, one bit per month slot: 1 = 30 days, 0 = 29 days.
    #               Bit 12 is the first month slot, bit 11 the second, and so on;
    #               the leap month takes the slot right after the month it repeats.
    # Data source reference:
    #   http://data.weather.gov.hk/gts/time/conversion1_text_c.htm
    #####################################################################################
    lunar_month_days = [1887, 0x1694, 0x16aa, 0x4ad5, 0xab6, 0xc4b7, 0x4ae, 0xa56, 0xb52a,
                        0x1d2a, 0xd54, 0x75aa, 0x156a, 0x1096d, 0x95c, 0x14ae, 0xaa4d, 0x1a4c, 0x1b2a, 0x8d55,
                        0xad4, 0x135a, 0x495d,
                        0x95c, 0xd49b, 0x149a, 0x1a4a, 0xbaa5, 0x16a8, 0x1ad4, 0x52da, 0x12b6, 0xe937, 0x92e,
                        0x1496, 0xb64b, 0xd4a,
                        0xda8, 0x95b5, 0x56c, 0x12ae, 0x492f, 0x92e, 0xcc96, 0x1a94, 0x1d4a, 0xada9, 0xb5a, 0x56c,
                        0x726e, 0x125c,
                        0xf92d, 0x192a, 0x1a94, 0xdb4a, 0x16aa, 0xad4, 0x955b, 0x4ba, 0x125a, 0x592b, 0x152a,
                        0xf695, 0xd94, 0x16aa,
                        0xaab5, 0x9b4, 0x14b6, 0x6a57, 0xa56, 0x1152a, 0x1d2a, 0xd54, 0xd5aa, 0x156a, 0x96c,
                        0x94ae, 0x14ae, 0xa4c,
                        0x7d26, 0x1b2a, 0xeb55, 0xad4, 0x12da, 0xa95d, 0x95a, 0x149a, 0x9a4d, 0x1a4a, 0x11aa5,
                        0x16a8, 0x16d4,
                        0xd2da, 0x12b6, 0x936, 0x9497, 0x1496, 0x1564b, 0xd4a, 0xda8, 0xd5b4, 0x156c, 0x12ae,
                        0xa92f, 0x92e, 0xc96,
                        0x6d4a, 0x1d4a, 0x10d65, 0xb58, 0x156c, 0xb26d, 0x125c, 0x192c, 0x9a95, 0x1a94, 0x1b4a,
                        0x4b55, 0xad4,
                        0xf55b, 0x4ba, 0x125a, 0xb92b, 0x152a, 0x1694, 0x96aa, 0x15aa, 0x12ab5, 0x974, 0x14b6,
                        0xca57, 0xa56, 0x1526,
                        0x8e95, 0xd54, 0x15aa, 0x49b5, 0x96c, 0xd4ae, 0x149c, 0x1a4c, 0xbd26, 0x1aa6, 0xb54,
                        0x6d6a, 0x12da, 0x1695d,
                        0x95a, 0x149a, 0xda4b, 0x1a4a, 0x1aa4, 0xbb54, 0x16b4, 0xada, 0x495b, 0x936, 0xf497,
                        0x1496, 0x154a, 0xb6a5,
                        0xda4, 0x15b4, 0x6ab6, 0x126e, 0x1092f, 0x92e, 0xc96, 0xcd4a, 0x1d4a, 0xd64, 0x956c,
                        0x155c, 0x125c, 0x792e,
                        0x192c, 0xfa95, 0x1a94, 0x1b4a, 0xab55, 0xad4, 0x14da, 0x8a5d, 0xa5a, 0x1152b, 0x152a,
                        0x1694, 0xd6aa,
                        0x15aa, 0xab4, 0x94ba, 0x14b6, 0xa56, 0x7527, 0xd26, 0xee53, 0xd54, 0x15aa, 0xa9b5, 0x96c,
                        0x14ae, 0x8a4e,
                        0x1a4c, 0x11d26, 0x1aa4, 0x1b54, 0xcd6a, 0xada, 0x95c, 0x949d, 0x149a, 0x1a2a, 0x5b25,
                        0x1aa4, 0xfb52,
                        0x16b4, 0xaba, 0xa95b, 0x936, 0x1496, 0x9a4b, 0x154a, 0x136a5, 0xda4, 0x15ac]
    #####################################################################################
    # Extra table that speeds up lunar -> solar conversion.
    # Element 0 is the base year (1887); element i holds the Gregorian date on
    # which lunar year 1887 + i begins, packed the way the code decodes it:
    #   bits 20~9  Gregorian year
    #   bits 8~5   Gregorian month
    #   bits 4~0   Gregorian day
    #####################################################################################
    solar_1_1 = [1887, 0xec04c, 0xec23f, 0xec435, 0xec649, 0xec83e, 0xeca51, 0xecc46, 0xece3a,
                 0xed04d, 0xed242, 0xed436, 0xed64a, 0xed83f, 0xeda53, 0xedc48, 0xede3d, 0xee050, 0xee244, 0xee439,
                 0xee64d,
                 0xee842, 0xeea36, 0xeec4a, 0xeee3e, 0xef052, 0xef246, 0xef43a, 0xef64e, 0xef843, 0xefa37, 0xefc4b,
                 0xefe41,
                 0xf0054, 0xf0248, 0xf043c, 0xf0650, 0xf0845, 0xf0a38, 0xf0c4d, 0xf0e42, 0xf1037, 0xf124a, 0xf143e,
                 0xf1651,
                 0xf1846, 0xf1a3a, 0xf1c4e, 0xf1e44, 0xf2038, 0xf224b, 0xf243f, 0xf2653, 0xf2848, 0xf2a3b, 0xf2c4f,
                 0xf2e45,
                 0xf3039, 0xf324d, 0xf3442, 0xf3636, 0xf384a, 0xf3a3d, 0xf3c51, 0xf3e46, 0xf403b, 0xf424e, 0xf4443,
                 0xf4638,
                 0xf484c, 0xf4a3f, 0xf4c52, 0xf4e48, 0xf503c, 0xf524f, 0xf5445, 0xf5639, 0xf584d, 0xf5a42, 0xf5c35,
                 0xf5e49,
                 0xf603e, 0xf6251, 0xf6446, 0xf663b, 0xf684f, 0xf6a43, 0xf6c37, 0xf6e4b, 0xf703f, 0xf7252, 0xf7447,
                 0xf763c,
                 0xf7850, 0xf7a45, 0xf7c39, 0xf7e4d, 0xf8042, 0xf8254, 0xf8449, 0xf863d, 0xf8851, 0xf8a46, 0xf8c3b,
                 0xf8e4f,
                 0xf9044, 0xf9237, 0xf944a, 0xf963f, 0xf9853, 0xf9a47, 0xf9c3c, 0xf9e50, 0xfa045, 0xfa238, 0xfa44c,
                 0xfa641,
                 0xfa836, 0xfaa49, 0xfac3d, 0xfae52, 0xfb047, 0xfb23a, 0xfb44e, 0xfb643, 0xfb837, 0xfba4a, 0xfbc3f,
                 0xfbe53,
                 0xfc048, 0xfc23c, 0xfc450, 0xfc645, 0xfc839, 0xfca4c, 0xfcc41, 0xfce36, 0xfd04a, 0xfd23d, 0xfd451,
                 0xfd646,
                 0xfd83a, 0xfda4d, 0xfdc43, 0xfde37, 0xfe04b, 0xfe23f, 0xfe453, 0xfe648, 0xfe83c, 0xfea4f, 0xfec44,
                 0xfee38,
                 0xff04c, 0xff241, 0xff436, 0xff64a, 0xff83e, 0xffa51, 0xffc46, 0xffe3a, 0x10004e, 0x100242,
                 0x100437,
                 0x10064b, 0x100841, 0x100a53, 0x100c48, 0x100e3c, 0x10104f, 0x101244, 0x101438, 0x10164c,
                 0x101842, 0x101a35,
                 0x101c49, 0x101e3d, 0x102051, 0x102245, 0x10243a, 0x10264e, 0x102843, 0x102a37, 0x102c4b,
                 0x102e3f, 0x103053,
                 0x103247, 0x10343b, 0x10364f, 0x103845, 0x103a38, 0x103c4c, 0x103e42, 0x104036, 0x104249,
                 0x10443d, 0x104651,
                 0x104846, 0x104a3a, 0x104c4e, 0x104e43, 0x105038, 0x10524a, 0x10543e, 0x105652, 0x105847,
                 0x105a3b, 0x105c4f,
                 0x105e45, 0x106039, 0x10624c, 0x106441, 0x106635, 0x106849, 0x106a3d, 0x106c51, 0x106e47,
                 0x10703c, 0x10724f,
                 0x107444, 0x107638, 0x10784c, 0x107a3f, 0x107c53, 0x107e48]

    def LunarToSolar(self, lunar):
        """Convert a ``Lunar`` date to the corresponding ``Solar`` date.

        :param lunar: ``Lunar`` instance; ``lunar.lunarYear`` must be covered
            by the tables (1888-2111).
        :return: ``Solar`` instance with integer year, month and day.
        """
        days = LunarSolarConverter.lunar_month_days[lunar.lunarYear - LunarSolarConverter.lunar_month_days[0]]
        leap = GetBitInt(days, 4, 13)

        # Number of whole month slots that precede the requested month.
        # For the leap month itself that is ``leap`` slots (months 1..leap).
        loopend = leap
        if not lunar.isleap:
            if lunar.lunarMonth <= leap or leap == 0:
                loopend = lunar.lunarMonth - 1
            else:
                # The leap month sits before this month and takes one extra slot.
                loopend = lunar.lunarMonth

        offset = 0
        for i in range(0, loopend):
            offset += _lunar_month_length(days, i)
        offset += lunar.lunarDay

        # Gregorian date of the first day of this lunar year.
        solar11 = LunarSolarConverter.solar_1_1[lunar.lunarYear - LunarSolarConverter.solar_1_1[0]]
        y = GetBitInt(solar11, 12, 9)
        m = GetBitInt(solar11, 4, 5)
        d = GetBitInt(solar11, 5, 0)

        return SolarFromInt(SolarToInt(y, m, d) + offset - 1)

    def SolarToLunar(self, solar):
        """Convert a ``Solar`` date to the corresponding ``Lunar`` date.

        :param solar: ``Solar`` instance whose date falls inside the lunar
            years covered by the tables (1888-2111).
        :return: ``Lunar`` instance; ``isleap`` is True for a leap month.
        """
        lunar = Lunar(0, 0, 0, False)
        index = solar.solarYear - LunarSolarConverter.solar_1_1[0]
        data = (solar.solarYear << 9) | (solar.solarMonth << 5) | solar.solarDay
        if LunarSolarConverter.solar_1_1[index] > data:
            # The date is before this year's lunar new year, so it still
            # belongs to the previous lunar year.
            index -= 1

        solar11 = LunarSolarConverter.solar_1_1[index]
        y = GetBitInt(solar11, 12, 9)
        m = GetBitInt(solar11, 4, 5)
        d = GetBitInt(solar11, 5, 0)
        offset = SolarToInt(solar.solarYear, solar.solarMonth, solar.solarDay) - SolarToInt(y, m, d)

        days = LunarSolarConverter.lunar_month_days[index]
        leap = GetBitInt(days, 4, 13)

        lunarY = index + LunarSolarConverter.solar_1_1[0]
        lunarM = 1
        offset += 1  # switch from a 0-based day offset to a 1-based day of month

        # Walk the month slots, consuming whole months until the day fits.
        for i in range(0, 13):
            dm = _lunar_month_length(days, i)
            if offset > dm:
                lunarM += 1
                offset -= dm
            else:
                break

        lunarD = int(offset)
        lunar.lunarYear = lunarY
        lunar.lunarMonth = lunarM
        lunar.isleap = False
        if leap != 0 and lunarM > leap:
            # Slots after the leap month are shifted by one; the slot right
            # after month ``leap`` is the leap month itself.
            lunar.lunarMonth = lunarM - 1
            if lunarM == leap + 1:
                lunar.isleap = True

        lunar.lunarDay = lunarD
        return lunar

    def __init__(self):
        pass


if __name__ == '__main__':
    converter = LunarSolarConverter()
    solar = Solar(2111, 1, 25)
    pprint(vars(solar))
    lunar = converter.SolarToLunar(solar)
    pprint(vars(lunar))
    solar = converter.LunarToSolar(lunar)
    pprint(vars(solar))
    print(len(converter.solar_1_1))
    print("Done")
说明 17 | Time-NLP的python3版本,由于原作者sunfiyes的是python2版本,无法在python3上使用,故ryanInf修改部分代码,使其可在Python3上使用,但运行发现存在一些bug,本人进行了修复,可能仍存在一些bug。 18 | 19 | - Python2版本:https://github.com/sunfiyes/Time-NLPY (原始) 20 | 21 | - Python3版本:https://github.com/ryanInf/Time-NLPY 22 | - Java版本:https://github.com/shinyke/Time-NLP 23 | - PHP版本:https://github.com/crazywhalecc/Time-NLP-PHP 24 | 25 | ## 3. 安装方式 26 | > - cd到当前目录 27 | 28 | > - python setup.py install 29 | 30 | 说明:Windows下可能出现安装regex错误,可到 https://www.lfd.uci.edu/~gohlke/pythonlibs/#regex 下载对应版本的regex手动安装 31 | 32 | ## 4. 使用方法 33 | 将中文时间描述转换为三种标准的时间格式的时间字符串: 34 | 35 | - 时间点(timestamp,表示某一具体时间的时间描述); 36 | 37 | - 时间量(timedelta,表示时间的增量的时间描述); 38 | 39 | - 时间区间(timespan,有具体起始和结束时间点的时间区间)。 40 | 41 | 42 | 43 | 运行代码: 44 | 45 | > python TestTime.py 46 | 47 | 48 | 49 | TestTime.py代码与示例: 50 | 51 | ``` python 52 | import time 53 | import warnings 54 | from TimeNormalizer import TimeNormalizer 55 | from arrow.factory import ArrowParseWarning 56 | warnings.simplefilter("ignore", ArrowParseWarning) 57 | 58 | 59 | if __name__ == "__main__": 60 | tn = TimeNormalizer() 61 | while True: 62 | query = input("\nINPUT : ") 63 | ss = time.time() 64 | # target为待分析语句, timeBase为基准时间, 默认是当前时间 65 | # print("OUTPUT: ", tn.parse(target=query, timeBase="2019-02-03")) 66 | print("OUTPUT: ", tn.parse(target=query)) 67 | print("TIME : {0}ms!".format(round(1000 * (time.time() - ss), 3))) 68 | ``` 69 | 输出: 70 | 71 | - 默认当前时间 72 | 73 | ```shell 74 | INPUT : 昨天上午和2019年8月 75 | OUTPUT: [['昨天上午', (0, 4), '2020-03-23 10:00:00'], ['2019年8月', (5, 12), '2019-08-01 00:00:00']] 76 | TIME : 3.734ms! 77 | 78 | INPUT : 看今天的新闻以及后天的疫情 79 | OUTPUT: [['今天', (1, 3), '2020-03-24 00:00:00'], ['后天', (7, 9), '2020-03-26 00:00:00']] 80 | TIME : 3.657ms! 81 | 82 | INPUT : 二零20年9月二十4号我不在家 83 | OUTPUT: [['2020年9月24号', (0, 10), '2020-09-24 00:00:00']] 84 | TIME : 2.102ms! 85 | 86 | INPUT : 周末在家 87 | OUTPUT: [['周7', (0, 2), '2020-03-29 00:00:00']] 88 | TIME : 1.847ms! 
89 | 90 | INPUT : 今天上午9点23分我到成都 91 | OUTPUT: [['今天上午9点23分', (0, 9), '2020-03-24 09:23:00']] 92 | TIME : 2.189ms! 93 | 94 | INPUT : 昨晚7点你们在干嘛 95 | OUTPUT: [['昨晚7点', (0, 4), '2020-03-23 07:00:00']] 96 | TIME : 2.032ms! 97 | 98 | INPUT : 2025年八月20日下午4点半 99 | OUTPUT: [['2025年8月20日下午4点半', (0, 15), '2025-08-20 16:30:00']] 100 | TIME : 3.341ms! 101 | ``` 102 | 103 | 104 | - 自定义基准时间,如2019-02-03 105 | 106 | ```shell 107 | INPUT : 昨天上午和2019年8月 108 | OUTPUT: [['昨天上午', (0, 4), '2019-02-02 10:00:00'], ['2019年8月', (5, 12), '2019-08-01 00:00:00']] 109 | TIME : 4.585ms! 110 | 111 | INPUT : 看今天的新闻以及后天的疫情 112 | OUTPUT: [['今天', (1, 3), '2019-02-03 00:00:00'], ['后天', (7, 9), '2019-02-05 00:00:00']] 113 | TIME : 3.73ms! 114 | ``` 115 | 116 | 117 | 118 | 关于节假日的增加方法: 119 | 1) 在resource目录下的holi_lunar(阴历)或holi_solar(阳历)文件内按照格式加入新增的节日名称和日期 120 | 2) 在resource目录下的regex.txt文件内加入相应节日的正则匹配,并删除regex.pkl缓存文件 121 | 3) 在TimeUnit类中的norm_setHoliday方法同样加入节日的正则匹配 122 | 123 | 该功能完善中... -------------------------------------------------------------------------------- /RangeTimeEnum.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/11/20 16:27 4 | # @Author : zhm 5 | # @File : RangeTimeEnum.py 6 | # @Software: PyCharm 7 | 8 | 9 | # 范围时间的默认时间点 10 | class RangeTimeEnum: 11 | day_break = 3 # 黎明 12 | early_morning = 8 # 早 13 | morning = 10 # 上午 14 | noon = 12 # 中午、午间 15 | afternoon = 15 # 下午、午后 16 | night = 18 # 晚上、傍晚 17 | lateNight = 20 # 晚、晚间 18 | midNight = 23 # 深夜 19 | 20 | 21 | if __name__ == "__main__": 22 | print(RangeTimeEnum.afternoon) 23 | -------------------------------------------------------------------------------- /StringPreHandler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/11/20 15:42 4 | # @Author : zhm 5 | # @File : StringPreHandler.py 6 | # @Software: PyCharm 7 | import 
regex as re 8 | 9 | 10 | # * 字符串预处理模块,为分析器TimeNormalizer提供相应的字符串预处理服务 11 | class StringPreHandler: 12 | @classmethod 13 | def delKeyword(cls, target, rules): 14 | """ 15 | 该方法删除一字符串中所有匹配某一规则字串 16 | 可用于清理一个字符串中的空白符和语气助词 17 | :param target: 待处理字符串 18 | :param rules: 删除规则 19 | :return: 清理工作完成后的字符串 20 | """ 21 | pattern = re.compile(rules) 22 | res = pattern.sub('', target) 23 | # print res 24 | return res 25 | 26 | @classmethod 27 | def numberTranslator(cls, target): 28 | """ 29 | 该方法可以将字符串中所有的用汉字表示的数字转化为用阿拉伯数字表示的数字 30 | 如"这里有一千两百个人,六百零五个来自中国"可以转化为 31 | "这里有1200个人,605个来自中国" 32 | 此外添加支持了部分不规则表达方法 33 | 如两万零六百五可转化为20650 34 | 两百一十四和两百十四都可以转化为214 35 | 一六零加一五八可以转化为160+158 36 | 该方法目前支持的正确转化范围是0-99999999 37 | 该功能模块具有良好的复用性 38 | :param target: 待转化的字符串 39 | :return: 转化完毕后的字符串 40 | """ 41 | pattern = re.compile("[一二两三四五六七八九123456789]万[一二两三四五六七八九123456789](?!(千|百|十))") 42 | match = pattern.finditer(target) 43 | for m in match: 44 | group = m.group() 45 | s = group.split("万") 46 | s = [_f for _f in s if _f] 47 | num = 0 48 | if len(s) == 2: 49 | num += cls.wordToNumber(s[0]) * 10000 + cls.wordToNumber(s[1]) * 1000 50 | target = pattern.sub(str(num), target, 1) 51 | 52 | pattern = re.compile("[一二两三四五六七八九123456789]千[一二两三四五六七八九123456789](?!(百|十))") 53 | match = pattern.finditer(target) 54 | for m in match: 55 | group = m.group() 56 | s = group.split("千") 57 | s = [_f for _f in s if _f] 58 | num = 0 59 | if len(s) == 2: 60 | num += cls.wordToNumber(s[0]) * 1000 + cls.wordToNumber(s[1]) * 100 61 | target = pattern.sub(str(num), target, 1) 62 | 63 | pattern = re.compile("[一二两三四五六七八九123456789]百[一二两三四五六七八九123456789](?!十)") 64 | match = pattern.finditer(target) 65 | for m in match: 66 | group = m.group() 67 | s = group.split("百") 68 | s = [_f for _f in s if _f] 69 | num = 0 70 | if len(s) == 2: 71 | num += cls.wordToNumber(s[0]) * 100 + cls.wordToNumber(s[1]) * 10 72 | target = pattern.sub(str(num), target, 1) 73 | 74 | pattern = re.compile("[零一二两三四五六七八九]") 75 | match = 
pattern.finditer(target) 76 | for m in match: 77 | target = pattern.sub(str(cls.wordToNumber(m.group())), target, 1) 78 | 79 | pattern = re.compile("(?<=(周|星期))[末天日]") 80 | match = pattern.finditer(target) 81 | for m in match: 82 | target = pattern.sub(str(cls.wordToNumber(m.group())), target, 1) 83 | 84 | pattern = re.compile("(? 0: 67 | days += 365 * self.tp.tunit[0] 68 | if self.tp.tunit[1] > 0: 69 | days += 30 * self.tp.tunit[1] 70 | if self.tp.tunit[2] > 0: 71 | days += self.tp.tunit[2] 72 | tunit = self.tp.tunit 73 | for i in range(3, 6): 74 | if self.tp.tunit[i] < 0: 75 | tunit[i] = 0 76 | seconds = tunit[3] * 3600 + tunit[4] * 60 + tunit[5] 77 | if seconds == 0 and days == 0: 78 | self.normalizer.invalidSpan = True 79 | self.normalizer.timeSpan = self.genSpan(days, seconds) 80 | return 81 | 82 | time_grid = self.normalizer.timeBase.split('-') 83 | tunitpointer = 5 84 | while tunitpointer >= 0 and self.tp.tunit[tunitpointer] < 0: 85 | tunitpointer -= 1 86 | for i in range(0, tunitpointer): 87 | if self.tp.tunit[i] < 0: 88 | self.tp.tunit[i] = int(time_grid[i]) 89 | 90 | self.time = self.genTime(self.tp.tunit) 91 | 92 | def genSpan(self, days, seconds): 93 | day = seconds // (3600 * 24) 94 | h = (seconds % (3600 * 24)) // 3600 95 | m = ((seconds % (3600 * 24)) % 3600) // 60 96 | s = ((seconds % (3600 * 24)) % 3600) % 60 97 | return str(days + day) + ' days, ' + "%d:%02d:%02d" % (h, m, s) 98 | 99 | def genTime(self, tunit): 100 | time = arrow.get('1970-01-01 00:00:00') 101 | if tunit[0] > 0: 102 | time = time.replace(year=tunit[0]) 103 | if tunit[1] > 0: 104 | time = time.replace(month=tunit[1]) 105 | if tunit[2] > 0: 106 | time = time.replace(day=tunit[2]) 107 | if tunit[3] > 0: 108 | time = time.replace(hour=tunit[3]) 109 | if tunit[4] > 0: 110 | time = time.replace(minute=tunit[4]) 111 | if tunit[5] > 0: 112 | time = time.replace(second=tunit[5]) 113 | return time 114 | 115 | def norm_setyear(self): 116 | """ 117 | 年-规范化方法--该方法识别时间表达式单元的年字段 118 | :return: 
119 | """ 120 | # 一位数表示的年份 121 | rule = "(? weekday: 770 | cur = cur.shift(days=7) 771 | return cur 772 | 773 | def preferFuture(self, checkTimeIndex): 774 | """ 775 | 如果用户选项是倾向于未来时间,检查checkTimeIndex所指的时间是否是过去的时间,如果是的话,将大一级的时间设为当前时间的+1。 776 | 如在晚上说“早上8点看书”,则识别为明天早上; 777 | 12月31日说“3号买菜”,则识别为明年1月的3号。 778 | :param checkTimeIndex: _tp.tunit时间数组的下标 779 | :return: 780 | """ 781 | # 1. 检查被检查的时间级别之前,是否没有更高级的已经确定的时间,如果有,则不进行处理. 782 | for i in range(0, checkTimeIndex): 783 | if self.tp.tunit[i] != -1: 784 | return 785 | # 2. 根据上下文补充时间 786 | self.checkContextTime(checkTimeIndex) 787 | # 3. 根据上下文补充时间后再次检查被检查的时间级别之前,是否没有更高级的已经确定的时间,如果有,则不进行倾向处理. 788 | for i in range(0, checkTimeIndex): 789 | if self.tp.tunit[i] != -1: 790 | return 791 | # 4. 确认用户选项 792 | if not self.normalizer.isPreferFuture: 793 | return 794 | # 5. 获取当前时间,如果识别到的时间小于当前时间,则将其上的所有级别时间设置为当前时间,并且其上一级的时间步长+1 795 | time_arr = self.normalizer.timeBase.split('-') 796 | cur = arrow.get(self.normalizer.timeBase, "YYYY-M-D-H-m-s") 797 | cur_unit = int(time_arr[checkTimeIndex]) 798 | if cur_unit < self.tp.tunit[checkTimeIndex]: 799 | return 800 | # 准备增加的时间单位是被检查的时间的上一级,将上一级时间+1 801 | cur = self.addTime(cur, checkTimeIndex - 1) 802 | time_arr = cur.format("YYYY-M-D-H-m-s").split('-') 803 | for i in range(0, checkTimeIndex): 804 | self.tp.tunit[i] = int(time_arr[i]) 805 | # if i == 1: 806 | # self.tp.tunit[i] += 1 807 | 808 | def checkContextTime(self, checkTimeIndex): 809 | """ 810 | 根据上下文时间补充时间信息 811 | :param checkTimeIndex: 812 | :return: 813 | """ 814 | for i in range(0, checkTimeIndex): 815 | if self.tp.tunit[i] == -1 and self.tp_origin.tunit[i] != -1: 816 | self.tp.tunit[i] = self.tp_origin.tunit[i] 817 | # 在处理小时这个级别时,如果上文时间是下午的且下文没有主动声明小时级别以上的时间,则也把下文时间设为下午 818 | if self.isFirstTimeSolveContext and checkTimeIndex == 3 and self.tp_origin.tunit[checkTimeIndex] >= 12 and \ 819 | self.tp.tunit[checkTimeIndex] < 12: 820 | self.tp.tunit[checkTimeIndex] += 12 821 | self.isFirstTimeSolveContext = False 822 | 823 | def 
addTime(self, cur, fore_unit): 824 | if fore_unit == 0: 825 | cur = cur.shift(years=1) 826 | elif fore_unit == 1: 827 | cur = cur.shift(months=1) 828 | elif fore_unit == 2: 829 | cur = cur.shift(days=1) 830 | elif fore_unit == 3: 831 | cur = cur.shift(hours=1) 832 | elif fore_unit == 4: 833 | cur = cur.shift(minutes=1) 834 | elif fore_unit == 5: 835 | cur = cur.shift(seconds=1) 836 | return cur 837 | -------------------------------------------------------------------------------- /resource/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2017/12/5 17:29 4 | # @Author : zhm 5 | # @File : __init__.py 6 | # @Software: PyCharm -------------------------------------------------------------------------------- /resource/holi_lunar.json: -------------------------------------------------------------------------------- 1 | { 2 | "中和节": "02-02", 3 | "中秋节": "08-15", 4 | "中元节": "07-15", 5 | "端午节": "05-05", 6 | "春节": "01-01", 7 | "元宵节": "01-15", 8 | "重阳节": "09-09", 9 | "七夕节": "07-07" 10 | } 11 | -------------------------------------------------------------------------------- /resource/holi_solar.json: -------------------------------------------------------------------------------- 1 | { 2 | "植树节": "03-12", 3 | "圣诞节": "12-25", 4 | "青年节": "05-04", 5 | "教师节": "09-10", 6 | "儿童节": "06-01", 7 | "元旦节": "01-01", 8 | "国庆节": "10-01", 9 | "劳动节": "05-01", 10 | "妇女节": "03-08", 11 | "建军节": "08-01", 12 | "航海日节": "07-11", 13 | "建党节": "07-01", 14 | "记者节": "11-08" 15 | } 16 | -------------------------------------------------------------------------------- /resource/reg.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Geekzhangwei/TimeNLP/9a823231bb41428e4010347c586cc563c379885a/resource/reg.pkl -------------------------------------------------------------------------------- /resource/regex.txt: 
-------------------------------------------------------------------------------- 1 | ((前|昨|今|明|后)(天|日)?(早|晚)(晨|上|间)?)|(\d+个?[年月日天][以之]?[前后])|(\d+个?半?(小时|钟头|h|H))|(半个?(小时|钟头))|(\d+(分钟|min))|([13]刻钟)|((上|这|本|下)+(周|星期)([一二三四五六七天日]|[1-7])?)|((周|星期)([一二三四五六七天日]|[1-7]))|((早|晚)?([0-2]?[0-9](点|时)半)(am|AM|pm|PM)?)|((早|晚)?(\d+[::]\d+([::]\d+)*)\s*(am|AM|pm|PM)?)|((早|晚)?([0-2]?[0-9](点|时)[13一三]刻)(am|AM|pm|PM)?)|((早|晚)?(\d+[时点](\d+)?分?(\d+秒?)?)\s*(am|AM|pm|PM)?)|(大+(前|后)天)|(([零一二三四五六七八九十百千万]+|\d+)世)|([0-9]?[0-9]?[0-9]{2}\.((10)|(11)|(12)|([1-9]))\.((?=2017', 25 | 'arrow>=0.10'], 26 | zip_safe=False, 27 | classifiers=[ 28 | 'Programming Language :: Python :: 2.6', 29 | 'Programming Language :: Python :: 2.7' 30 | ] 31 | ) --------------------------------------------------------------------------------