├── README.md ├── RuleTest.py └── RuleParser.py /README.md: -------------------------------------------------------------------------------- 1 | # BasedRuleQA_Parser 2 | 基于规则匹配的问答系统中的解析器 3 | 4 | # 带限定词库的规则匹配 5 | 在用规则做QA时,有时需要对相同句子结构的问法做不同的意图分类。例如:“李白是谁”需要分类到意图“问诗人简介”、“李鹏是谁”需要分类到意图“问政治名人简介”。 6 | 7 | 如果通过正则或机器学习(深度学习)文本分类,只能首先定义成一种意图(问人物简介),然后再对提取到的关键词再进一步通过数据区分为“问诗人简介”、“问政治名人简介”。 8 | 9 | 本DEMO仿照思必驰平台规则,在带限定词库的规则匹配方式下,定义规则“#诗人#是谁”、“#政治名人#是谁”两条规则,分别对应成“问诗人简介”、“问政治名人简介”两种意图即可,这样通过词库(#诗人#、#政治名人#词库)限制可以直接给出意图类别。 10 | 11 | 同时可以根据规则自动生成问句,用来给关键词提取模型训练制作语料。 12 | 13 | # 效率实测 14 | 通过实测1W多条规则(扩展20W条规则),在个人电脑上全部未命中大概300ms左右,耗时跟规则数以及复杂度成正比(扩展词库查询通过词典全部加载进内存基本上不耗时)。 15 | 16 | (注:单条规则“[请问|请告诉我]#诗人#的简介”可扩展出三条规则“#诗人#的简介”、“请问#诗人#的简介”、“请告诉我#诗人#的简介”,这代表了规则的复杂度。) 17 | 18 | 19 | # DEMO输出 20 | ``` 21 | 解析规则:#sys.任意文本##诗人#[的]#诗名#的(介绍|说明|#歌曲#)[啊|哦|#呵呵#|额] 22 | 23 | =========================匹配句子================================= 24 | 句子:李白的将进酒的介绍哦 25 | 关键词:['', '李白', '将进酒'] 26 | 关联库:['sys.任意文本', '诗人', '诗名'] 27 | 位置:[(0, 0), (0, 2), (3, 3)] 28 | 节点路径:(FULL)-->sys.任意文本(LIB)-->诗人(LIB)-->的(FULL)-->诗名(LIB)-->的(FULL)-->介绍(FULL)-->哦(FULL) 29 | ================================================================= 30 | 31 | 32 | =========================匹配句子================================= 33 | 句子:李白将进酒的介绍 34 | 关键词:['', '李白', '将进酒'] 35 | 关联库:['sys.任意文本', '诗人', '诗名'] 36 | 位置:[(0, 0), (0, 2), (2, 3)] 37 | 节点路径:(FULL)-->sys.任意文本(LIB)-->诗人(LIB)-->(FULL)-->诗名(LIB)-->的(FULL)-->介绍(FULL)-->(FULL) 38 | ================================================================= 39 | 40 | 41 | =========================匹配句子================================= 42 | 句子:我请问李白冰将进酒的介绍 43 | 关键词:['我请问', '李白冰', '将进酒'] 44 | 关联库:['sys.任意文本', '诗人', '诗名'] 45 | 位置:[(0, 3), (3, 3), (6, 3)] 46 | 节点路径:(FULL)-->sys.任意文本(LIB)-->诗人(LIB)-->(FULL)-->诗名(LIB)-->的(FULL)-->介绍(FULL)-->(FULL) 47 | ================================================================= 48 | 49 | 50 | ================================================================= 51 | 
匹配失败:李白是谁 52 | ================================================================= 53 | 54 | 解析规则:#诗人#是谁 55 | 56 | =========================匹配句子================================= 57 | 句子:李白是谁 58 | 关键词:['李白'] 59 | 关联库:['诗人'] 60 | 位置:[(0, 2)] 61 | 节点路径:(FULL)-->诗人(LIB)-->是谁(FULL) 62 | ================================================================= 63 | 64 | 解析规则:[请问]#sys.数字#个人是什么字 65 | 66 | =========================匹配句子================================= 67 | 句子:三个人是什么字 68 | 关键词:['三'] 69 | 关联库:['sys.数字'] 70 | 位置:[(0, 1)] 71 | 节点路径:(FULL)-->(FULL)-->sys.数字(LIB)-->个人是什么字(FULL) 72 | ================================================================= 73 | 74 | 75 | =========================匹配句子================================= 76 | 句子:请问三个人是什么字 77 | 关键词:['三'] 78 | 关联库:['sys.数字'] 79 | 位置:[(2, 1)] 80 | 节点路径:(FULL)-->请问(FULL)-->sys.数字(LIB)-->个人是什么字(FULL) 81 | ================================================================= 82 | 83 | 84 | =========================匹配句子================================= 85 | 句子:请问十二个人是什么字 86 | 关键词:['十二'] 87 | 关联库:['sys.数字'] 88 | 位置:[(2, 2)] 89 | 节点路径:(FULL)-->请问(FULL)-->sys.数字(LIB)-->个人是什么字(FULL) 90 | ================================================================= 91 | 92 | 93 | ======================生成句子===================================== 94 | 句子:脴梌李白沁园春的海阔天空额 95 | 关键词:['脴梌', '李白', '沁园春', '海阔天空'] 96 | 关联库:['sys.任意文本', '诗人', '诗名', '歌曲'] 97 | 位置:[(0, 2), (2, 2), (4, 3), (8, 4)] 98 | 节点路径:(FULL)-->sys.任意文本(LIB)-->诗人(LIB)-->(FULL)-->诗名(LIB)-->的(FULL)-->歌曲(LIB)-->额(FULL) 99 | ================================================================= 100 | 101 | 102 | =========================匹配句子================================= 103 | 句子:脴梌李白沁园春的海阔天空额 104 | 关键词:['脴梌', '李白', '沁园春', '海阔天空'] 105 | 关联库:['sys.任意文本', '诗人', '诗名', '歌曲'] 106 | 位置:[(0, 2), (2, 2), (4, 3), (8, 4)] 107 | 节点路径:(FULL)-->sys.任意文本(LIB)-->诗人(LIB)-->(FULL)-->诗名(LIB)-->的(FULL)-->歌曲(LIB)-->额(FULL) 108 | ================================================================= 109 | 110 | ``` 111 | 112 
"""Demo driver for RuleParser: dictionary-backed library hooks and sample rules.

The hooks in this module are plain functions over in-memory dictionaries, so
they can be imported and exercised without the RuleParser module; only
``main()`` needs it.
"""
import random

# ``test_sentence`` is deliberately left out: a star-import must not leak a
# ``test_``-prefixed helper into a module that pytest collects.
__all__ = [
    'poet_names',
    'poetry_names',
    'poetry_sentences',
    'music_names',
    'hook_lib_method_impl',
    'hook_generate_lib_method_impl',
    'format_nodes_path',
    'main',
]

# Dictionaries give constant-time membership tests for candidate prefixes.
poet_names = {'李白':1,'李白冰':2,'杜甫':3}
poetry_names = {'将进酒':1,'沁园春':2}
poetry_sentences = {'黄河之水天上来':1,'海上升明月':2}
music_names = {'海阔天空':1,'匆匆':2}


def _lookup_library(lib_name):
    """Return the dictionary registered under *lib_name*, or None if unknown.

    The mapping is rebuilt on every call so that rebinding one of the
    module-level dictionaries is honoured, as it was with the original
    if/elif chains.
    """
    return {
        '诗人': poet_names,
        '诗名': poetry_names,
        '诗句': poetry_sentences,
        '歌曲': music_names,
    }.get(lib_name)


def format_nodes_path(nodes_path):
    """Render a node path as ``str(node)`` values joined by ``-->``."""
    return "-->".join(str(node) for node in nodes_path)


def hook_lib_method_impl(match_string,lib_name,params):
    """External library lookup hook used for matching.

    Args:
        match_string: remaining sentence text; only its prefixes are tested.
        lib_name: name of the library referenced by the rule (e.g. '诗人').
        params: opaque value registered together with the hook; this demo
            checks that it arrives unchanged as 'HELLO'.

    Returns:
        Every prefix of *match_string* that is an entry of the library,
        shortest first; an empty list for an unknown library.
    """
    # Parameter pass-through check kept from the demo (stripped under -O).
    assert params == 'HELLO'

    current_database = _lookup_library(lib_name)
    if not current_database:
        return []

    # No prefix longer than the longest entry can be a hit, so stop there
    # instead of scanning the whole remaining sentence.
    longest_entry = max(len(entry) for entry in current_database)
    scan_limit = min(len(match_string), longest_entry)
    return [
        match_string[:end]
        for end in range(1, scan_limit + 1)
        if match_string[:end] in current_database
    ]


def hook_generate_lib_method_impl(lib_name,params):
    """External library generation hook: pick a random entry of a library.

    Args:
        lib_name: name of the library referenced by the rule.
        params: opaque value registered together with the hook (unused).

    Returns:
        A randomly chosen entry, or '' when the library is unknown or empty.
    """
    current_database = _lookup_library(lib_name)
    if not current_database:
        return ''

    entries = list(current_database)
    return entries[random.randint(0,len(entries) - 1)]


def test_sentence(rule_parser,sentence):
    """Match *sentence* with *rule_parser* and print the outcome.

    On success the extracted keywords, their libraries, their
    (start, length) positions and the visited node path are printed;
    otherwise a failure banner is printed.
    """
    success,keywords,keywords_pos,lib_names,nodes_path = rule_parser.match(sentence)
    if success:
        print("\n=========================匹配句子=================================")
        print("句子:"+sentence)
        print("关键词:"+str(keywords)+" \n关联库:"+str(lib_names)+" \n位置:"+str(keywords_pos))
        print("节点路径:"+format_nodes_path(nodes_path))
        print("=================================================================\n")
    else:
        print("\n=================================================================")
        print("匹配失败:"+sentence)
        print("=================================================================\n")


def main():
    """Run the demo: parse several rules, match sentences, generate one."""
    # Project-local import kept inside the entry point so that the hooks
    # above stay importable when RuleParser is not on the path.
    from RuleParser import RuleParser

    rule_parser = RuleParser()
    # Register the custom library lookup together with its pass-through value.
    rule_parser.set_match_lib_hook(hook_lib_method_impl,"HELLO")

    rule = "#sys.任意文本##诗人#[的]#诗名#的(介绍|说明|#歌曲#)[啊|哦|#呵呵#|额]"
    print("解析规则:"+rule)
    rule_parser.parse(rule)

    # Debug tracing is off by default; pass True to enable it.
    rule_parser.set_debug(False)

    test_sentence(rule_parser,'李白的将进酒的介绍哦')
    test_sentence(rule_parser,'李白将进酒的介绍')
    test_sentence(rule_parser,'我请问李白冰将进酒的介绍')

    # This sentence must NOT match the rule above ...
    test_sentence(rule_parser,'李白是谁')
    # ... but it matches once the parser is loaded with a new rule.
    rule = "#诗人#是谁"
    print("解析规则:"+rule)
    rule_parser.parse(rule)
    test_sentence(rule_parser,'李白是谁')

    # Built-in numeral library.
    rule = "[请问]#sys.数字#个人是什么字"
    print("解析规则:"+rule)
    rule_parser.parse(rule)
    test_sentence(rule_parser,'三个人是什么字')
    test_sentence(rule_parser,'请问三个人是什么字')
    test_sentence(rule_parser,'请问十二个人是什么字')

    # Generate a random sentence from a rule, then match it back.
    rule = "#sys.任意文本##诗人#[的]#诗名#的(介绍|说明|#歌曲#)[啊|哦|额]"
    rule_parser.parse(rule)
    rule_parser.set_generate_lib_hook(hook_generate_lib_method_impl,'HI')
    sentence,keywords,keywords_pos,lib_names,nodes_path = rule_parser.generate()
    print("\n======================生成句子=====================================")
    print("句子:"+sentence)
    print("关键词:"+str(keywords)+" \n关联库:"+str(lib_names)+" \n位置:"+str(keywords_pos))
    print("节点路径:"+format_nodes_path(nodes_path))
    print("=================================================================\n")
    test_sentence(rule_parser,sentence)


if __name__ == '__main__':
    main()
"""Rule parser, matcher and sentence generator for library-constrained rules.

Rule syntax handled by ``RuleParser.parse``:
    text        literal text that must appear verbatim
    #name#      library reference (``{name}`` closes a reference the same way)
    (a|b|#c#)   exactly one of the alternatives
    [a|b|#c#]   at most one of the alternatives (optional group)
Library names starting with ``sys.`` are served by the built-in libraries;
all other names are delegated to the hooks registered by the caller.
"""
import random

TOKEN_TYPE_SELECT_LIB = 'LIB'
TOKEN_TYPE_SELECT_FULL = 'FULL'


class RuleNode:
    """One node of the rule graph: a literal text or a library reference."""

    def __init__(self,value,token_type=TOKEN_TYPE_SELECT_FULL):
        self.parent = []  # kept for compatibility; nothing in this module fills it
        self.children = []
        self.value = value
        self.rule_type = token_type

    def add_children(self,node):
        """Append *node* to this node's children."""
        self.children.append(node)

    def set_childrens(self,nodes):
        """Replace this node's children with the list *nodes* (not copied)."""
        self.children = nodes

    def get_childrens(self):
        """Return the list of child nodes."""
        return self.children

    def get_value(self):
        """Return the literal text or the library name held by this node."""
        return self.value

    def get_type(self):
        """Return TOKEN_TYPE_SELECT_FULL or TOKEN_TYPE_SELECT_LIB."""
        return self.rule_type

    def __str__(self):
        return str(self.value)+"("+self.rule_type+")"


class RuleGraph:
    """Graph of RuleNode objects hanging below an empty literal root node."""

    def __init__(self):
        self.root_node = RuleNode('')

    def add_children_node(self,parent_node,node):
        """Append *node* below *parent_node*, or below the root when it is None."""
        # The original printed unconditionally and dereferenced parent_node
        # even when it was None; a graph mutator should do neither.
        target = self.root_node if parent_node is None else parent_node
        target.add_children(node)

    def get_childrens(self,parent_node):
        """Return the children of *parent_node*."""
        return parent_node.get_childrens()

    def set_childrens(self,current_node,children_nodes):
        """Replace the children of *current_node* with *children_nodes*."""
        current_node.set_childrens(children_nodes)

    def get_root_node(self):
        """Return the root node (an empty literal)."""
        return self.root_node

    def travel(self,parent_node):
        """Print the nodes reachable from *parent_node*, depth first.

        With ``None`` the walk covers every child of the root; an empty graph
        prints nothing. Nodes shared by several parents are printed once per
        path. Always returns False, as the original did.
        """
        if parent_node is None:
            for child in self.root_node.get_childrens():
                self.travel(child)
            return False

        print(parent_node)
        for child in parent_node.get_childrens():
            self.travel(child)
        return False


class RuleParser:
    """Parse a rule into a graph, match sentences against it, generate sentences."""

    SYSTEM_LIB_DIGIT = ['零','一','二','三','四','五','六','七','八','九','十','百','千','万','亿','兆','京']

    def __init__(self):
        self.rule_graph = RuleGraph()
        self.match_lib_hook = None
        self.match_lib_hook_parms = None
        self.generate_lib_hook = None
        self.generate_lib_hook_parms = None
        self.DEBUG_FLAG = False
        # parse() resets these; defined here so they always exist.
        self.keywords = []
        self.keywords_postion = []
        self.lib_names = []
        self.nodes_path = []

    def set_debug(self,flag):
        """Turn the verbose trace printed while matching/generating on or off."""
        self.DEBUG_FLAG = flag

    def set_match_lib_hook(self,hook_method,hook_params):
        """Register ``hook_method(match_string, lib_name, hook_params)`` for lookups.

        The hook returns the list of prefixes of ``match_string`` that belong
        to the named library.
        """
        self.match_lib_hook = hook_method
        self.match_lib_hook_parms = hook_params

    def set_generate_lib_hook(self,hook_method,hook_params):
        """Register ``hook_method(lib_name, hook_params)`` used by generate()."""
        self.generate_lib_hook = hook_method
        self.generate_lib_hook_parms = hook_params

    def hook_match_lib_default(self,match_string,lib_name):
        """Built-in libraries: return the prefixes of *match_string* that qualify.

        ``sys.任意文本`` accepts every prefix including the empty one;
        ``sys.数字`` / ``sys.整数`` accept each non-empty leading run of
        characters from SYSTEM_LIB_DIGIT. Results are ordered shortest first;
        any other name yields an empty list.
        """
        if self.DEBUG_FLAG:
            print("hook_lib_default 库中查找,库名:"+lib_name+" 查找实体:"+match_string)

        matched_strings = []

        if lib_name == 'sys.任意文本':
            matched_strings = [match_string[:end] for end in range(len(match_string) + 1)]
        elif lib_name == 'sys.数字' or lib_name == 'sys.整数':
            for end, current_word in enumerate(match_string, 1):
                if current_word not in self.SYSTEM_LIB_DIGIT:
                    break
                matched_strings.append(match_string[:end])

        if self.DEBUG_FLAG:
            print("匹配到的词典:"+str(matched_strings))
        return matched_strings

    def match_lib(self,match_string,lib_name):
        """Return the library entries that prefix *match_string*.

        ``sys.`` libraries are built in; everything else goes to the
        registered match hook. Returns [] without a hook, or when the hook
        returns None.
        """
        if lib_name.startswith('sys.'):
            return self.hook_match_lib_default(match_string,lib_name)

        if self.match_lib_hook is not None:
            return self.match_lib_hook(match_string,lib_name,self.match_lib_hook_parms) or []

        return []

    def get_random_chinese_char(self):
        """Return one random character from U+4E00..U+9FBF."""
        val = random.randint(0x4e00, 0x9fbf)
        return chr(val)

    def hook_generate_lib_default(self,lib_name):
        """Built-in libraries: produce a random entry for *lib_name*.

        ``sys.任意文本`` yields 0-10 random characters, ``sys.数字`` /
        ``sys.整数`` yield 1-4 characters from SYSTEM_LIB_DIGIT; any other
        name yields ''.
        """
        if self.DEBUG_FLAG:
            print("hook_generate_lib_default 库中生成,库名:"+lib_name)

        generate_string = ''

        if lib_name == 'sys.任意文本':
            gen_len = random.randint(0,10)
            generate_string = ''.join(self.get_random_chinese_char() for _ in range(gen_len))
        elif lib_name == 'sys.数字' or lib_name == 'sys.整数':
            gen_len = random.randint(1,4)
            digits = self.SYSTEM_LIB_DIGIT
            generate_string = ''.join(digits[random.randint(0,len(digits)-1)] for _ in range(gen_len))

        if self.DEBUG_FLAG:
            print("生成字符串:"+str(generate_string))
        return generate_string

    def generate_lib(self,lib_name):
        """Return one generated entry of *lib_name* ('' when none is available)."""
        if lib_name.startswith('sys.'):
            return self.hook_generate_lib_default(lib_name)

        if self.generate_lib_hook is not None:
            return self.generate_lib_hook(lib_name,self.generate_lib_hook_parms) or ''

        return ''

    def travel(self):
        """Print the current rule graph (debug aid)."""
        self.rule_graph.travel(None)

    @staticmethod
    def _build_alternatives(group_text):
        """Turn the ``a|b|#c#`` body of a group into one node per alternative."""
        nodes = []
        for sub_token in group_text.split('|'):
            if sub_token.startswith('#'):
                # Strip the surrounding '#' pair of a library reference.
                nodes.append(RuleNode(sub_token[1:-1],TOKEN_TYPE_SELECT_LIB))
            else:
                nodes.append(RuleNode(sub_token,TOKEN_TYPE_SELECT_FULL))
        return nodes

    def _attach(self,parents,nodes):
        """Make *nodes* the children of every node in *parents*; return *nodes*.

        All parents share the same list object, which is what joins the
        branches of a group back together.
        """
        for parent in parents:
            self.rule_graph.set_childrens(parent,nodes)
        return nodes

    def parse(self,question_rule):
        """Build the rule graph for *question_rule*, replacing any previous rule.

        Unterminated groups or references are not rejected; whatever text is
        pending at the last character becomes a trailing literal node.
        """
        self.keywords = []
        self.keywords_postion = []
        self.lib_names = []
        self.nodes_path = []

        # Drop the previous rule explicitly: otherwise an empty rule would
        # leave the old graph in place and keep matching against it.
        root = self.rule_graph.get_root_node()
        root.set_childrens([])

        frontier = [root]          # nodes the next parsed element hangs below
        pending = ''               # text collected since the last element
        sharp_open = False         # inside a '#name#' reference
        in_parentheses = False     # inside '( ... )'
        in_bracket = False         # inside '[ ... ]'
        last_index = len(question_rule) - 1

        for index, token in enumerate(question_rule):
            if in_parentheses:
                # Inside a group everything up to the closer is raw text.
                if token == ')':
                    in_parentheses = False
                    frontier = self._attach(frontier,self._build_alternatives(pending))
                    pending = ''
                    continue
            elif in_bracket:
                if token == ']':
                    in_bracket = False
                    # The leading empty literal is the "group absent" branch.
                    optional_nodes = [RuleNode('',TOKEN_TYPE_SELECT_FULL)]
                    optional_nodes.extend(self._build_alternatives(pending))
                    frontier = self._attach(frontier,optional_nodes)
                    pending = ''
                    continue
            elif token == '#':
                if sharp_open:
                    # Closing '#': the collected text is a library name.
                    sharp_open = False
                    frontier = self._attach(frontier,[RuleNode(pending,TOKEN_TYPE_SELECT_LIB)])
                    pending = ''
                else:
                    # Opening '#': flush the literal collected so far.
                    sharp_open = True
                    if pending:
                        frontier = self._attach(frontier,[RuleNode(pending,TOKEN_TYPE_SELECT_FULL)])
                        pending = ''
                continue
            elif token == '}':
                sharp_open = False
                frontier = self._attach(frontier,[RuleNode(pending,TOKEN_TYPE_SELECT_LIB)])
                pending = ''
                continue
            elif token in ('{', '(', '['):
                if token == '(':
                    in_parentheses = True
                elif token == '[':
                    in_bracket = True
                if pending:
                    frontier = self._attach(frontier,[RuleNode(pending,TOKEN_TYPE_SELECT_FULL)])
                    pending = ''
                continue

            pending += token

            if index == last_index:
                frontier = self._attach(frontier,[RuleNode(pending,TOKEN_TYPE_SELECT_FULL)])
                pending = ''

    def real_match_process_fulltext(self,current_node,match_string,match_string_start_pos,keywords,keywords_postion,lib_names,nodes_path):
        """Match a literal node, then recurse into its children.

        *current_node* is already the last entry of *nodes_path*; it is popped
        again when this branch fails. Returns True when the node and one of
        its continuations consume *match_string* completely.
        """
        current_node_value = current_node.get_value()
        current_node_type = current_node.get_type()
        if self.DEBUG_FLAG:
            print("real_match_process_fulltext 当前节点,值:"+current_node_value+" 类型:"+current_node_type)

        # The literal is not a prefix: this branch fails, undo the path entry.
        if not match_string.startswith(current_node_value):
            node_pop = nodes_path.pop()
            if self.DEBUG_FLAG:
                print("real_match_process_fulltext 文本未匹配,节点路径,弹出:"+str(node_pop))
            return False

        current_node_value_len = len(current_node_value)
        next_match_string_start_pos = match_string_start_pos + current_node_value_len
        next_match_string = match_string[current_node_value_len:]
        childrens = current_node.get_childrens()

        # Text exhausted exactly at a leaf: full match.
        if next_match_string == '' and len(childrens) == 0:
            if self.DEBUG_FLAG:
                print("real_match_process_fulltext 文本与节点完全匹配了")
            return True

        if self.DEBUG_FLAG:
            print("real_match_process_fulltext 需要匹配子句:"+next_match_string + "子节点数:"+str(len(childrens)))

        # Depth-first over the children; the first success wins.
        for children in childrens:
            if self.real_match(children,next_match_string,next_match_string_start_pos,keywords,keywords_postion,lib_names,nodes_path):
                if self.DEBUG_FLAG:
                    print("real_match_process_fulltext 子句匹配成功:"+next_match_string)
                return True

        if self.DEBUG_FLAG:
            print("real_match_process_fulltext 子句匹配失败:"+next_match_string)

        # No child could consume the rest of the text.
        node_pop = nodes_path.pop()
        if self.DEBUG_FLAG:
            print("real_match_process_fulltext 节点路径,弹出:"+str(node_pop))

        return False

    def real_match_process_lib(self,current_node,match_string,match_string_start_pos,keywords,keywords_postion,lib_names,nodes_path):
        """Match a library node, trying every library hit, then recurse.

        Each candidate entry is pushed onto *keywords*, *keywords_postion*
        (as ``(start, length)``) and *lib_names*, and popped again when the
        rest of the sentence cannot be matched behind it.
        """
        current_node_value = current_node.get_value()
        current_node_type = current_node.get_type()
        if self.DEBUG_FLAG:
            print("real_match_process_lib 当前节点,值:"+current_node_value+" 类型:"+current_node_type + "匹配文本:"+match_string)

        matched_strings = self.match_lib(match_string,current_node_value)
        if self.DEBUG_FLAG:
            print("real_match_process_lib 匹配到的词库数组:"+str(matched_strings))

        # Nothing in the library prefixes the text: this branch fails.
        if len(matched_strings) == 0:
            if self.DEBUG_FLAG:
                print("real_match_process_lib 句子没有匹配到词典:"+match_string)
            node_pop = nodes_path.pop()
            if self.DEBUG_FLAG:
                print("real_match_process_lib 节点路径,弹出:"+str(node_pop))
            return False

        childrens = current_node.get_childrens()

        for matched_string in matched_strings:
            if self.DEBUG_FLAG:
                print("real_match_process_lib 处理知识库词条:"+matched_string+ " 匹配语句:"+match_string)
            # Tentatively record the entry as an extracted keyword.
            matched_string_len = len(matched_string)
            keywords.append(matched_string)
            keywords_postion.append((match_string_start_pos,matched_string_len))
            lib_names.append(current_node_value)

            next_match_string_start_pos = match_string_start_pos + matched_string_len
            next_match_string = match_string[matched_string_len:]

            # Text exhausted exactly at a leaf: full match.
            if next_match_string == '' and len(childrens) == 0:
                if self.DEBUG_FLAG:
                    print("real_match_process_lib 文本与节点完全匹配了")
                return True

            for children in childrens:
                if self.real_match(children,next_match_string,next_match_string_start_pos,keywords,keywords_postion,lib_names,nodes_path):
                    if self.DEBUG_FLAG:
                        print("real_match_process_lib 子句匹配成功:"+next_match_string)
                    return True

            # This entry leads nowhere: retract it before trying the next one.
            keyword = keywords.pop()
            lib_name = lib_names.pop()
            keywords_postion.pop()
            if self.DEBUG_FLAG:
                # Report the live lists of this match, not the parse-time attributes.
                print("pop出关键词:"+keyword+" keywords:"+str(keywords) + " pop出库名:"+lib_name+" lib:"+str(lib_names))

        # None of the library hits could be continued.
        node_pop = nodes_path.pop()
        if self.DEBUG_FLAG:
            print("real_match_process_lib 节点路径,弹出:"+str(node_pop))

        return False

    def real_match(self,current_node,match_string,match_string_start_pos,keywords,keywords_postion,lib_names,nodes_path):
        """Match *match_string* against the sub-graph rooted at *current_node*.

        ``None`` means the graph root. The node is pushed onto *nodes_path*
        and the type-specific handler pops it again on failure.
        """
        if current_node is None:
            current_node = self.rule_graph.get_root_node()

        current_node_value = current_node.get_value()
        current_node_type = current_node.get_type()
        if self.DEBUG_FLAG:
            print("real_match 当前节点,值:"+current_node_value+" 类型:"+current_node_type)

        nodes_path.append(current_node)
        if self.DEBUG_FLAG:
            print("real_match 节点路径,添加:"+str(current_node))

        if current_node_type == TOKEN_TYPE_SELECT_FULL:
            return self.real_match_process_fulltext(current_node,match_string,match_string_start_pos,keywords,keywords_postion,lib_names,nodes_path)
        elif current_node_type == TOKEN_TYPE_SELECT_LIB:
            return self.real_match_process_lib(current_node,match_string,match_string_start_pos,keywords,keywords_postion,lib_names,nodes_path)

        return False

    def match(self,match_string):
        """Match a whole sentence against the parsed rule.

        Returns:
            ``(success, keywords, keywords_postion, lib_names, nodes_path)``;
            the three middle lists are parallel, positions being
            ``(start, length)`` pairs into *match_string*.
        """
        keywords = []
        keywords_postion = []
        lib_names = []
        nodes_path = []
        ret = self.real_match(None,match_string,0,keywords,keywords_postion,lib_names,nodes_path)
        return ret,keywords,keywords_postion,lib_names,nodes_path

    def generate(self):
        """Generate a random sentence by walking the rule graph from the root.

        Returns:
            ``(sentence, keywords, keywords_postion, lib_names, nodes_path)``
            with the same layout as the result of ``match`` minus the flag.
        """
        keywords = []
        keywords_postion = []
        lib_names = []
        nodes_path = []
        generate_string = ''
        current_node = self.rule_graph.get_root_node()

        while True:
            current_node_value = current_node.get_value()
            current_node_type = current_node.get_type()

            if current_node_type == TOKEN_TYPE_SELECT_FULL:
                generate_string += current_node_value
                nodes_path.append(current_node)
            elif current_node_type == TOKEN_TYPE_SELECT_LIB:
                keyword = self.generate_lib(current_node_value)
                lib_names.append(current_node_value)
                keywords.append(keyword)
                keywords_postion.append((len(generate_string),len(keyword)))
                nodes_path.append(current_node)
                generate_string += keyword

            children_nodes = current_node.get_childrens()
            if len(children_nodes) == 0:
                break

            # Pick one outgoing branch at random.
            current_node = children_nodes[random.randint(0,len(children_nodes) - 1)]

        return generate_string,keywords,keywords_postion,lib_names,nodes_path