├── README.md ├── data_process.py ├── datachecker.py ├── fileutils.py ├── nlputils.py ├── project.ipynb ├── requirements.txt ├── run_role1_gru.sh ├── run_role1_gru_eval.sh ├── run_role1_gru_mix.sh ├── run_role1_gru_predict.sh ├── run_trigger_gru.sh ├── run_trigger_gru_mix.sh ├── run_trigger_gru_predict.sh ├── run_trigger_gru_predict_withmodel.sh └── sequence_label.py /README.md: -------------------------------------------------------------------------------- 1 | # 事件抽取模型(基于paddlehub) 2 | 本模型在官方PaddleHub版本上进行修改得到 3 | 官方原版地址:https://github.com/PaddlePaddle/Research/tree/master/KG/DuEE_baseline/DuEE-PaddleHub 4 | 5 | 本方案github地址:https://github.com/onewaymyway/DuEE_2020 6 | 7 | 本方案在官方baseline的基础上的改动 8 | 9 | 1.在网络结构上在CRF层前面增加了双向GRU层(代码见sequence_label.py中SequenceLabelTaskSP类) 10 | 11 | 2.将trigger预测结果拼接到text前面进行第二阶段的role预测(代码见data_process.py的data_process函数中model=role1的情况),这个改动可以解决同一个句子不同event之间role重叠的问题 12 | 13 | 3.在训练上,本方案先只用train进行训练,然后再将dev放入train进行最后的训练 14 | 15 | 4.增加了简单的最终结果剔除机制(代码见datachecker.py) 16 | 17 | 建议使用AIStudio环境跑这个项目,最好是直接Fork本人分享的项目, 18 | 19 | 项目地址:https://aistudio.baidu.com/aistudio/projectdetail/545914 20 | 21 | 22 | 23 | ### 环境准备 24 | 25 | - python适用版本 2.7.x(本代码测试时使用依赖见 ./requirements.txt ) 26 | - paddlepaddle-gpu >= 1.7.0、paddlehub >= 1.6.1 27 | - 请转至paddlepaddle官网按需求安装对应版本的paddlepaddle 28 | 29 | #### 依赖安装 30 | > pip install -r ./requirements.txt 31 | 32 | 33 | ### 模型训练 34 | 35 | 各个步骤在notebook文件里(project.ipynb)都有详细说明 36 | 按照notebook的顺序执行就可以了,这里就不详细说明了 37 | 38 | -------------------------------------------------------------------------------- /data_process.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
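# Overview of this module: read_by_lines()/write_by_lines() handle the utf-8 line files,
# get_adptText() splits text into characters (lowercased, whitespace replaced by a
# full-width comma), data_process() builds the char-level BIO rows for the "trigger",
# "role" and "role1" modes (in "role1" mode the event type is prepended to the text as
# "<event_type>:<text>", which keeps arguments of different events in the same sentence
# from overlapping, as described in the README), schema_process() turns
# event_schema.json into the B-/I-/O label set, and extract_result()/extract_resultEX()
# decode predicted label sequences back into {"start", "text", "type"} spans.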
16 | """hello world""" 17 | import os 18 | import sys 19 | import json 20 | import argparse 21 | from fileutils import wwprint 22 | 23 | 24 | def read_by_lines(path, encoding="utf-8"): 25 | """read the data by line""" 26 | result = list() 27 | with open(path, "r") as infile: 28 | for line in infile: 29 | result.append(line.strip().decode(encoding)) 30 | return result 31 | 32 | 33 | def write_by_lines(path, data, t_code="utf-8"): 34 | """write the data""" 35 | with open(path, "w") as outfile: 36 | [outfile.write(d.encode(t_code) + "\n") for d in data] 37 | 38 | 39 | def get_adptText(text): 40 | text_a = [ 41 | u"," if t == u" " or t == u"\n" or t == u"\t" else t 42 | for t in list(text.lower()) 43 | ] 44 | return text_a 45 | def data_process(path, model="trigger", is_predict=False,schema_labels=None): 46 | """data_process""" 47 | 48 | def label_data(data, start, l, _type): 49 | """label_data""" 50 | for i in range(start, start + l): 51 | suffix = u"B-" if i == start else u"I-" 52 | data[i] = u"{}{}".format(suffix, _type) 53 | return data 54 | 55 | sentences = [] 56 | output = [u"text_a"] if is_predict else [u"text_a\tlabel"] 57 | with open(path) as f: 58 | for line in f: 59 | d_json = json.loads(line.strip().decode("utf-8")) 60 | _id = d_json["id"] 61 | tDTxt=get_adptText(d_json["text"]) 62 | text_a=tDTxt 63 | 64 | if is_predict: 65 | if "event_lists" in d_json: 66 | for ccevent in d_json["event_lists"]: 67 | #将event_type和text拼接在一起 68 | tAdpt=ccevent+":"+d_json["text"] 69 | tDTxt=get_adptText(tAdpt) 70 | sentences.append({"text": d_json["text"], "id": _id,"e":ccevent}) 71 | output.append(u'\002'.join(tDTxt)) 72 | 73 | 74 | else: 75 | sentences.append({"text": d_json["text"], "id": _id}) 76 | output.append(u'\002'.join(tDTxt)) 77 | 78 | else: 79 | if model == u"trigger": 80 | labels = [u"O"] * len(text_a) 81 | for event in d_json["event_list"]: 82 | event_type = event["event_type"] 83 | start = event["trigger_start_index"] 84 | trigger = event["trigger"] 85 | labels = label_data(labels, start, 86 | len(trigger), event_type) 87 | output.append(u"{}\t{}".format(u'\002'.join(text_a), 88 | u'\002'.join(labels))) 89 | elif model == u"role": 90 | for event in d_json["event_list"]: 91 | labels = [u"O"] * len(text_a) 92 | for arg in event["arguments"]: 93 | role_type = arg["role"] 94 | argument = arg["argument"] 95 | start = arg["argument_start_index"] 96 | labels = label_data(labels, start, 97 | len(argument), role_type) 98 | output.append(u"{}\t{}".format(u'\002'.join(text_a), 99 | u'\002'.join(labels))) 100 | elif model == u"role1": 101 | events={} 102 | for event in d_json["event_list"]: 103 | tEventType=event["event_type"] 104 | if not tEventType in events: 105 | events[tEventType]=[] 106 | for arg in event["arguments"]: 107 | events[tEventType].append(arg) 108 | for event,arguments in events.items(): 109 | labels = [u"O"] * len(text_a) 110 | arguments.sort(key=lambda x:(x["argument_start_index"],-len(x["argument"]))) 111 | for arg in arguments: 112 | role_type = arg["role"] 113 | argument = arg["argument"] 114 | start = arg["argument_start_index"] 115 | if schema_labels and not u"B-"+role_type in schema_labels: 116 | print("Wrong:",role_type,d_json["text"]) 117 | labels = label_data(labels, start, 118 | len(argument), role_type) 119 | appendStr=event+":" 120 | #将eventtype拼接到句子前面 121 | labels=[u"O"]*len(appendStr)+labels 122 | tTxt=list(appendStr)+text_a 123 | output.append(u"{}\t{}".format(u'\002'.join(tTxt), 124 | u'\002'.join(labels))) 125 | if is_predict: 126 | return sentences, output 127 
| else: 128 | return output 129 | 130 | 131 | def schema_process(path, model="trigger"): 132 | """schema_process""" 133 | 134 | def label_add(labels, _type): 135 | """label_add""" 136 | if u"B-{}".format(_type) not in labels: 137 | labels.extend([u"B-{}".format(_type), u"I-{}".format(_type)]) 138 | return labels 139 | 140 | labels = [] 141 | with open(path) as f: 142 | for line in f: 143 | d_json = json.loads(line.strip().decode("utf-8")) 144 | if model == u"trigger": 145 | labels = label_add(labels, d_json["event_type"]) 146 | elif model == u"role": 147 | for role in d_json["role_list"]: 148 | labels = label_add(labels, role["role"]) 149 | elif model == u"role1": 150 | for role in d_json["role_list"]: 151 | labels = label_add(labels, role["role"]) 152 | labels.append(u"O") 153 | return labels 154 | 155 | 156 | def extract_result(text, labels): 157 | """extract_result""" 158 | ret, is_start, cur_type = [], False, None 159 | if len(labels)>len(text): 160 | wwprint("warning",text,labels,len(labels),len(text)) 161 | for i, label in enumerate(labels): 162 | if i>=len(text): 163 | continue 164 | if label != u"O": 165 | _type = label[2:] 166 | if label.startswith(u"B-"): 167 | is_start = True 168 | cur_type = _type 169 | ret.append({"start": i, "text": [text[i]], "type": _type}) 170 | elif _type != cur_type: 171 | """ 172 | # 如果是没有B-开头的,则不要这部分数据 173 | cur_type = None 174 | is_start = False 175 | """ 176 | cur_type = _type 177 | is_start = True 178 | ret.append({"start": i, "text": [text[i]], "type": _type}) 179 | elif is_start: 180 | ret[-1]["text"].append(text[i]) 181 | else: 182 | cur_type = None 183 | is_start = False 184 | else: 185 | cur_type = None 186 | is_start = False 187 | return ret 188 | 189 | roleCoDic={ 190 | u"召回方,召回内容": 9, 191 | u"原所属组织,离职者": 166, 192 | u"死者年龄,死者": 70, 193 | u"降价方,降价物": 9, 194 | u"解雇方,被解雇人员": 5, 195 | u"原所属组织,退出方": 7, 196 | u"时间,活动名称": 6, 197 | u"地点,袭击对象": 6, 198 | u"时间,发布产品": 8, 199 | u"罢工人数,罢工人员": 4, 200 | u"时间,夺冠赛事": 11, 201 | u"发布方,发布产品": 78, 202 | u"被下架方,下架产品": 14, 203 | u"所属组织,停职人员": 14, 204 | u"地点,活动名称": 13, 205 | u"出售方,交易物": 13, 206 | u"地点,死者": 12, 207 | u"时间,赛事名称": 38, 208 | u"所属组织,罢工人员": 21} 209 | def extract_resultEX(text, labels,cDic): 210 | """extract_result""" 211 | ret, is_start, cur_type = [], False, None 212 | for i, label in enumerate(labels): 213 | if i>=len(text): 214 | continue 215 | if label != u"O": 216 | _type = label[2:] 217 | if label.startswith(u"B-"): 218 | is_start = True 219 | newTxt=[] 220 | if cur_type!=None: 221 | adkey=cur_type+","+_type 222 | #print("adkey",adkey,u"召回方,召回内容") 223 | #print(cDic[u"召回方,召回内容"]) 224 | if cur_type+","+_type in cDic: 225 | newTxt+=ret[-1]["text"] 226 | wwprint("concat by B","".join(newTxt)) 227 | if cur_type==_type: 228 | wwprint("sametype:",cur_type,ret[-1],text,labels) 229 | 230 | cur_type = _type 231 | newTxt.append(text[i]) 232 | ret.append({"start": i-len(newTxt)+1, "text": newTxt, "type": _type}) 233 | elif _type != cur_type: 234 | """ 235 | # 如果是没有B-开头的,则不要这部分数据 236 | cur_type = None 237 | is_start = False 238 | """ 239 | 240 | is_start = True 241 | newTxt=[] 242 | if cur_type!=None: 243 | adkey=cur_type+","+_type 244 | #print("adkey",adkey,u"召回方,召回内容") 245 | #print(cDic[u"召回方,召回内容"]) 246 | if cur_type+","+_type in cDic: 247 | newTxt+=ret[-1]["text"] 248 | wwprint("concat by I","".join(newTxt)) 249 | 250 | cur_type = _type 251 | newTxt.append(text[i]) 252 | ret.append({"start": i-len(newTxt)+1, "text": newTxt, "type": _type}) 253 | 254 | #cur_type = _type 255 | 256 | #ret.append({"start": i, 
"text": [text[i]], "type": _type}) 257 | elif is_start: 258 | ret[-1]["text"].append(text[i]) 259 | else: 260 | cur_type = None 261 | is_start = False 262 | else: 263 | cur_type = None 264 | is_start = False 265 | 266 | return ret 267 | 268 | def adptRet(ret): 269 | ret.sort(key=lambda x:x["start"]) 270 | lenRet=len(ret) 271 | rst=[] 272 | for i in range(lenRet): 273 | isMerged=False 274 | tRet=ret[i] 275 | if i=0: 108 | wwprint("Over:",text) 109 | wwprint("Overlap:",oRole,role,rolesNew) 110 | hasOverlap=True 111 | break 112 | if not hasOverlap: 113 | rolesNew2.append(oRole) 114 | 115 | 116 | 117 | 118 | event["arguments"]=rolesNew2 119 | 120 | if hasSameRole: 121 | wwprint("HasSame:",text) 122 | wwprint( "roles:",roles) 123 | 124 | #通过role出现的情况为事件成立的概率打个分 125 | sumRate=0 126 | for role,rc in seen.items(): 127 | if rc>1: 128 | wwprint( "SameRole",role) 129 | if self.roleSeenRateDic and self.roleSeenRateDic[role]: 130 | sumRate+=rc*self.roleSeenRateDic[role] 131 | 132 | if sumRate>0 and sumRate<50: 133 | wwprint( "UnderRate",text,sumRate) 134 | wwprint( "UnderRate",event) 135 | if len(minRole)>0: 136 | wwprint( "minRole",text) 137 | wwprint( "roles:",roles) 138 | wwprint( "minRole",minRole) 139 | lost=[] 140 | 141 | #统计某事件中高概率出现的role却没出现的情况 142 | for role,rate in self.roleSeenRateDic.items(): 143 | if rate>rateLimit: 144 | if not role in seen: 145 | lost.append([role,rate]) 146 | return lost 147 | 148 | def setBySchema(self,schemaData): 149 | self.reset() 150 | roles=schemaData["role_list"] 151 | for roleData in roles: 152 | role=roleData["role"] 153 | self.roleDic[role]=0 154 | self.roleAugDic[role]=[] 155 | self.roleAugDD[role]=0 156 | self.roleSeenDic[role]=0 157 | self.roles.append(role) 158 | 159 | def doSum(self): 160 | self.roleRateDic={} 161 | self.roleSeenRateDic={} 162 | emCount=self.count-self.empty 163 | wwprint( self.trigger,self.count,self.empty,emCount) 164 | 165 | #print(self.roleDic) 166 | for key,count in self.roleDic.items(): 167 | seenCount=self.roleSeenDic[key] 168 | self.roleRateDic[key]=round(count*100/emCount,2) 169 | self.roleSeenRateDic[key]=round(seenCount*100/emCount,2) 170 | wwprint( self.roleRateDic) 171 | wwprint( self.roleSeenRateDic) 172 | 173 | self.roleAugDD={} 174 | for role,ags in self.roleAugDic.items(): 175 | self.roleAugDD[role]=min(ags) 176 | wwprint( self.roleAugDD) 177 | 178 | def showSum(self): 179 | 180 | print("++++++++++++++++++++++++++++++++++++") 181 | self.doSum() 182 | 183 | class SchemaInfo(): 184 | ''' 185 | 事件统计 186 | ''' 187 | def __init__(self): 188 | self.eventDic={} 189 | #原来打算用分词信息对role进行补全或者剔除,但是实际没有做 190 | self.lac=LACTager() 191 | def setSchemaFile(self,schemafile): 192 | schemeData=readJsonLines(schemafile) 193 | for data in schemeData: 194 | tf=TriggerInfo(data["event_type"]) 195 | tf.setBySchema(data) 196 | self.eventDic[tf.trigger]=tf 197 | 198 | def addDataFile(self,dataFile): 199 | datas=readJsonLines(dataFile) 200 | for data in datas: 201 | text=data["text"] 202 | for event in data["event_list"]: 203 | etype=event["event_type"] 204 | triggerInfo=self.getTriggerInfo(etype) 205 | triggerInfo.addByEvent(event) 206 | 207 | 208 | def getTriggerInfo(self,trigger): 209 | return self.eventDic[trigger] 210 | 211 | def setSchemaData(self,schemaDataFile): 212 | pass 213 | 214 | def showSum(self): 215 | for key,tirggerInfo in self.eventDic.items(): 216 | tirggerInfo.showSum() 217 | def doSum(self): 218 | for key,tirggerInfo in self.eventDic.items(): 219 | tirggerInfo.doSum() 220 | 221 | def 
checkFile(self,dataFile,rateLimit=90,showWarning=True,saveFile=False): 222 | datas=readJsonLines(dataFile) 223 | warns=[] 224 | roleCount=0 225 | roleCharCount=0 226 | emptyCount=0 227 | emptyRoleCount=0 228 | roleCountDic={} 229 | roleAugDic={} 230 | for data in datas: 231 | text=data["text"] 232 | if len(data["event_list"])==0: 233 | emptyCount+=1 234 | preRoleCount=roleCount 235 | for event in data["event_list"]: 236 | etype=event["event_type"] 237 | roles=event["arguments"] 238 | for role in roles: 239 | roleCount+=1 240 | role_type=role["role"] 241 | role_aug=role["argument"] 242 | if not role_type in roleAugDic: 243 | roleAugDic[role_type]={} 244 | roleAugDic[role_type][role_aug]=True 245 | roleCharCount+=len(role["argument"]) 246 | triggerInfo=self.getTriggerInfo(etype) 247 | lost=triggerInfo.checkEvent(event,rateLimit,text) 248 | if len(lost)>0: 249 | warns.append(event) 250 | if showWarning: 251 | wwprint( "warning:",text,len(data["event_list"])) 252 | wwprint( "tags:",self.lac.getTag(text)) 253 | wwprint( "warning event:",event) 254 | wwprint( "warning lost",lost) 255 | dRole=roleCount-preRoleCount 256 | if dRole not in roleCountDic: 257 | roleCountDic[dRole]=0 258 | roleCountDic[dRole]+=1 259 | warnCount=len(warns) 260 | wwprint( "warnCount:",warnCount) 261 | wwprint( "role",roleCount,roleCharCount) 262 | wwprint( "warnrate:",round(warnCount*100/roleCount,2)) 263 | dataCount=len(datas) 264 | wwprint( "empty:",emptyCount,roleCountDic) 265 | items=list(roleCountDic.items()) 266 | items.sort() 267 | for key,count in items: 268 | tt=key*count 269 | wwprint( "c",key,count,round(count*100/dataCount,2),round(tt*100/roleCount,2)) 270 | if saveFile: 271 | saveJsonLines(dataFile.replace(".json","_md.json"),datas) 272 | #print(roleAugDic) 273 | for role,roleDic in roleAugDic.items(): 274 | wwprint( role,list(roleDic.keys())) 275 | 276 | parser = argparse.ArgumentParser(__doc__) 277 | parser.add_argument("--targetFile", type=str, default=None, help="./data1/test1fn_pred.json") 278 | args = parser.parse_args() 279 | 280 | def main(): 281 | 282 | 283 | #删选不合适的结果 比如明显低于限制的 284 | #类型不匹配的 285 | #去重 286 | #重组 287 | #高效触发词 288 | schemaInfo=SchemaInfo() 289 | schemaInfo.setSchemaFile("./data1/event_schema.json") 290 | schemaInfo.addDataFile("./data1/train.json") 291 | #schemaInfo.showSum() 292 | 293 | schemaInfo.checkFile(args.targetFile,90,False,True)# 294 | 295 | if __name__ == "__main__": 296 | main() -------------------------------------------------------------------------------- /fileutils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #coding:utf-8 3 | 4 | import json 5 | import numpy as np 6 | import os 7 | 8 | def wwprint(*args): 9 | ''' 10 | 封装的print函数 11 | 用于解决python2.7环境下中文输出乱码的问题 12 | ''' 13 | try: 14 | print(json.dumps(args,ensure_ascii=False)) 15 | except: 16 | print(json.dumps(args)) 17 | 18 | def saveFile(filepath,content): 19 | f=open(filepath,"w") 20 | fc=f.write(content.encode('utf-8')) 21 | f.close() 22 | 23 | 24 | def saveJsonLines(path, data): 25 | lines=[] 26 | for line in data: 27 | lines.append(json.dumps(line, ensure_ascii=False)) 28 | 29 | content="\n".join(lines) 30 | saveFile(path,content) 31 | 32 | def readFile(filepath): 33 | f=open(filepath,"r") 34 | fc=f.read().decode('utf-8') 35 | f.close() 36 | #print(fc) 37 | return fc 38 | def readJsonLines(filepath): 39 | print("readJsonLines",filepath) 40 | lines=readFile(filepath).split("\n") 41 | linefiles=[] 42 | for line in lines: 43 | 44 | 
line=line.strip() 45 | if not line: 46 | continue 47 | dd=json.loads(line) 48 | linefiles.append(dd) 49 | return linefiles -------------------------------------------------------------------------------- /nlputils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #coding:utf-8 3 | 4 | import json 5 | import numpy as np 6 | import os 7 | 8 | from visualdl import LogWriter 9 | 10 | class MyLog(): 11 | ''' 12 | 本类用于适配PaddleHub在AIStudio中VisualDL的使用 13 | 使用方式: 14 | # 创建 LogWriter 对象 15 | log_writer = MyLog(mode="role2") 16 | seq_label_task._tb_writer=log_writer 17 | ''' 18 | 19 | def __init__(self,mode="train",logDir="../log"): 20 | self.mode=mode 21 | self.varDic={} 22 | self.log_writer = LogWriter(logDir, sync_cycle=10) 23 | 24 | 25 | 26 | def add_scalar(self,tag,scalar_value,global_step): 27 | if not tag in self.varDic: 28 | with self.log_writer.mode(self.mode) as writer: 29 | self.varDic[tag]=writer.scalar(tag) 30 | self.varDic[tag].add_record(global_step,scalar_value) 31 | 32 | def saveFile(filepath,content): 33 | ''' 34 | 保存文件 35 | ''' 36 | f=open(filepath,"w") 37 | fc=f.write(content.encode('utf-8')) 38 | f.close() 39 | 40 | 41 | def saveJsonLines(path, data): 42 | ''' 43 | 保存文件 44 | ''' 45 | lines=[] 46 | for line in data: 47 | lines.append(json.dumps(line, ensure_ascii=False)) 48 | 49 | content="\n".join(lines) 50 | saveFile(path,content) 51 | 52 | def readFile(filepath): 53 | ''' 54 | 读取文件 55 | ''' 56 | f=open(filepath,"r") 57 | fc=f.read().decode('utf-8') 58 | f.close() 59 | #print(fc) 60 | return fc 61 | def readJsonLines(filepath): 62 | ''' 63 | 读取文件 64 | ''' 65 | print("readJsonLines",filepath) 66 | lines=readFile(filepath).split("\n") 67 | linefiles=[] 68 | for line in lines: 69 | 70 | line=line.strip() 71 | if not line: 72 | continue 73 | dd=json.loads(line) 74 | linefiles.append(dd) 75 | return linefiles 76 | 77 | 78 | import paddlehub as hub 79 | 80 | class LACTager(object): 81 | ''' 82 | 封装的分词工具 83 | ''' 84 | def __init__(self): 85 | self.module = hub.Module(name="lac") 86 | 87 | def getTagResult(self,text): 88 | inputs = {"text": [text]} 89 | results = self.module.lexical_analysis(data=inputs) 90 | result=results[0] 91 | return result 92 | 93 | def getTag(self,text): 94 | result=self.getTagResult(text) 95 | start=0 96 | rst=[] 97 | for word,ner in zip(result["word"],result["tag"]): 98 | rst.append([word,ner]) 99 | return rst 100 | 101 | def getLabels(self,text): 102 | result=self.getTagResult(text) 103 | labels=[""]*len(text) 104 | start=0 105 | for word,ner in zip(result["word"],result["tag"]): 106 | #print(word,ner) 107 | label_dataOT(labels,start,len(word),ner) 108 | start+=len(word) 109 | 110 | return labels -------------------------------------------------------------------------------- /project.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": false 7 | }, 8 | "source": [ 9 | "# 2020语言与智能技术竞赛:事件抽取任务--方案分享(Test1:Rank17 Test2:Rank18)\n", 10 | "\n", 11 | "# 本模型在官方PaddleHub版本Baseline上进行修改得到\n", 12 | "\n", 13 | "官方原版地址:https://github.com/PaddlePaddle/Research/tree/master/KG/DuEE_baseline/DuEE-PaddleHub\n", 14 | "\n", 15 | "本方案github地址:https://github.com/onewaymyway/DuEE_2020\n", 16 | "\n", 17 | "# 本方案在官方baseline的基础上的改动\n", 18 | "\n", 19 | "1.在网络结构上在CRF层前面增加了双向GRU层(代码见sequence_label.py中SequenceLabelTaskSP类)\n", 20 | "\n", 21 | 
"2.将trigger预测结果拼接到text前面进行第二阶段的role预测(代码见data_process.py的data_process函数中model=role1的情况),这个改动可以解决同一个句子不同event之间role重叠的问题\n", 22 | "\n", 23 | "3.在训练上,本方案先只用train进行训练,然后再将dev放入train进行最后的训练\n", 24 | "\n", 25 | "4.增加了简单的最终结果剔除机制(代码见datachecker.py)\n", 26 | "\n", 27 | "\n", 28 | "## 注意\n", 29 | "\n", 30 | "本项目代码需要使用GPU环境来运行:\n", 31 | "\n", 32 | "\n", 33 | "
\n", 34 | "
\n", 35 | "并且检查相关参数设置, 例如use_gpu, fluid.CUDAPlace(0)等处是否设置正确. " 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": { 41 | "collapsed": false 42 | }, 43 | "source": [ 44 | "# 事件抽取任务\n", 45 | "\n", 46 | "事件抽取任务的目标是通过给定目标事件类型和角色类型集合及句子,识别句子中所有目标事件类型的事件,并根据论元角色集合抽取事件所对应的论元。其中目标事件类型(event_type)和论元角色(role)限定了抽取的范围,例如:(event_type:胜负,role:时间,胜者,败者,赛事名称)、(event_type:夺冠,role:夺冠事件,夺冠赛事,冠军)。最终将抽取的所有事件论元按如下形式进行输出: {“id”: “id1”, “event_list”: [{“event_type”:“T1”, “arguments”: [{“role”:“R1”, “argument”:“A1”},…]}, {“event_type”:“T2”, “arguments”: [{“role”:“R2”, “argument”:“A2”},…]}]},比赛将对参赛者最终输出的论元列表进行评估。\n", 47 | "\n", 48 | "# 方案\n", 49 | "本方案分两个阶段,第一个阶段为事件类型抽取,第二阶段为事件论元抽取,两个阶段的模型都是 预训练模型+GRU+CRF进行序列标注,接下来用一个例子来大概讲讲流程。\n", 50 | "\n", 51 | "假设当前要预测的句子为:\n", 52 | "\n", 53 | "历经4小时51分钟的体力、意志力鏖战,北京时间9月9日上午纳达尔在亚瑟·阿什球场,以7比5、6比3、5比7、4比6和6比4击败赛会5号种子俄罗斯球员梅德韦杰夫,夺得了2019年美国网球公开赛男单冠军。\n", 54 | "\n", 55 | "## 事件类型抽取\n", 56 | "当输入例子句子之后,事件类型阶段的输出结果类似于是\n", 57 | "\n", 58 | "历经4小时51分钟的体力、意志力鏖战,北京时间9月9日上午纳达尔在亚瑟·阿什球场,以7比5、6比3、5比7、4比6和6比4击(B-竞赛行为-胜负)败(I-竞赛行为-胜负)赛会5号种子俄罗斯球员梅德韦杰夫,夺得了2019年美国网球公开赛男单冠(B-竞赛行为-夺冠)军(I-竞赛行为-夺冠)。\n", 59 | "\n", 60 | "(击,B-竞赛行为-胜负)(败,I-竞赛行为-胜负)\n", 61 | "\n", 62 | "(冠,B-竞赛行为-夺冠)(军,I-竞赛行为-夺冠)\n", 63 | "\n", 64 | "其它字符的标注为O\n", 65 | "\n", 66 | "然后就得到了这个句子的两个事件类型[竞赛行为-胜负,竞赛行为-夺冠]\n", 67 | "\n", 68 | "结果文件存在data1/test1.json.trigger.pred文件中\n", 69 | "\n", 70 | "## 论元抽取\n", 71 | "\n", 72 | "与官方baseline不同,本模型方案将事件拼接到句子前面进行论元抽取,比如刚才的例子,第二阶段将分别进行两次不同的预测\n", 73 | "\n", 74 | "1.竞赛行为-胜负:历经4小时51分钟的体力、意志力鏖战,北京时间9月9日上午纳达尔在亚瑟·阿什球场,以7比5、6比3、5比7、4比6和6比4击败赛会5号种子俄罗斯球员梅德韦杰夫,夺得了2019年美国网球公开赛男单冠军。\n", 75 | "\n", 76 | "输出结果:\n", 77 | "\n", 78 | "竞赛行为-胜负:历经4小时51分钟的体力、意志力鏖战,北(B-时间)京(I-时间)时(I-时间)间(I-时间)9(I-时间)月(I-时间)9(I-时间)日(I-时间)上(I-时间)午(I-时间)纳(B-胜者)达(I-胜者)尔(I-胜者)在亚瑟·阿什球场,以7比5、6比3、5比7、4比6和6比4击败赛会5(B-败者)号(I-败者)种(I-败者)子(I-败者)俄(I-败者)罗(I-败者)斯(I-败者)球(I-败者)员(I-败者)梅(I-败者)德(I-败者)韦(I-败者)杰(I-败者)夫(I-败者),夺得了2(B-赛事名称)0(I-赛事名称)1(I-赛事名称)9(I-赛事名称)年(I-赛事名称)美(I-赛事名称)国(I-赛事名称)网(I-赛事名称)球(I-赛事名称)公(I-赛事名称)开(I-赛事名称)赛(I-赛事名称)男单冠军。\n", 79 | "\n", 80 | "2.竞赛行为-夺冠:历经4小时51分钟的体力、意志力鏖战,北京时间9月9日上午纳达尔在亚瑟·阿什球场,以7比5、6比3、5比7、4比6和6比4击败赛会5号种子俄罗斯球员梅德韦杰夫,夺得了2019年美国网球公开赛男单冠军。\n", 81 | "\n", 82 | "输出结果:\n", 83 | "\n", 84 | "竞赛行为-夺冠:历经4小时51分钟的体力、意志力鏖战,北(B-时间)京(I-时间)时(I-时间)间(I-时间)9(I-时间)月(I-时间)9(I-时间)日(I-时间)上(I-时间)午(I-时间)纳(B-冠军)达(I-冠军)尔(I-冠军)在亚瑟·阿什球场,以7比5、6比3、5比7、4比6和6比4击败赛会5号种子俄罗斯球员梅德韦杰夫,夺得了(B-夺冠赛事)2(I-夺冠赛事)0(I-夺冠赛事)1(I-夺冠赛事)9(I-夺冠赛事)年(I-夺冠赛事)美(I-夺冠赛事)国(I-夺冠赛事)网(I-夺冠赛事)球(I-夺冠赛事)公(I-夺冠赛事)开(I-夺冠赛事)赛(I-夺冠赛事)男单冠军。\n", 85 | "\n", 86 | "训练的时候也是将同一个句子的不同事件拆开转换成不同的句子进行训练,训练的时候一个事件里只有这个事件的论元,忽略这个句子里其它事件的论元。\n", 87 | "\n", 88 | "## 生成最终结果\n", 89 | "\n", 90 | "最终结果根据论元抽取结果生成,将相同句子的不同事件合到同一个结果里。\n", 91 | "\n", 92 | "```\n", 93 | "{\n", 94 | " \"id\":\"6a10824fe9c7b2aa776aa7e3de35d45d\",\n", 95 | " \"event_list\":[\n", 96 | " {\n", 97 | " \"event_type\":\"竞赛行为-胜负\",\n", 98 | " \"arguments\":[\n", 99 | " {\n", 100 | " \"role\":\"时间\",\n", 101 | " \"argument\":\"北京时间9月9日上午\"\n", 102 | " },\n", 103 | " {\n", 104 | " \"role\":\"胜者\",\n", 105 | " \"argument\":\"纳达尔\"\n", 106 | " },\n", 107 | " {\n", 108 | " \"role\":\"败者\",\n", 109 | " \"argument\":\"5号种子俄罗斯球员梅德韦杰夫\"\n", 110 | " },\n", 111 | " {\n", 112 | " \"role\":\"赛事名称\",\n", 113 | " \"argument\":\"2019年美国网球公开赛\"\n", 114 | " }\n", 115 | " ]\n", 116 | " },\n", 117 | " {\n", 118 | " \"event_type\":\"竞赛行为-夺冠\",\n", 119 | " \"arguments\":[\n", 120 | " {\n", 121 | " \"role\":\"时间\",\n", 122 | " \"argument\":\"北京时间9月9日上午\"\n", 123 | " },\n", 124 | " {\n", 125 | " \"role\":\"夺冠赛事\",\n", 126 | " 
\"argument\":\"2019年美国网球公开赛\"\n", 127 | " },\n", 128 | " {\n", 129 | " \"role\":\"冠军\",\n", 130 | " \"argument\":\"纳达尔\"\n", 131 | " }\n", 132 | " ]\n", 133 | " }\n", 134 | " ]\n", 135 | "}\n", 136 | "```" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": { 142 | "collapsed": false 143 | }, 144 | "source": [ 145 | "# 题外话\n", 146 | "本方案是最终参赛的方案,在此之外还有一个魔改了PaddleHub的方案,但是效果并没有这个好,但是那个方案可以作为一个魔改PaddleHub的好案例,有时间把那个方案也分享一下。基本思路和这个有点相似,也是将事件信息加到第二阶段的论元抽取,不同的是,这个方案是把文字直接加到了句子前面,另一个方案是把事件做了onehot编码加到了每个词向量上,还将分词信息也加到了词向量上。和这个方案相比,另一个方案实现了一套往PaddleHub模型中的词向量追加Feature的机制(其实我没想明白为啥那个方案效果没这个方案好:()。\n", 147 | "\n", 148 | "本方案排名并不靠前,分享的目的还是希望可以抛砖引玉,希望也能看到前排的大佬们分享一下方案(虽然赛后群里有大佬大致分享了,但是还是想看更详细的分享:))。\n", 149 | "\n", 150 | "感觉本次参加比赛的收获挺大的,大概有以下几点:\n", 151 | "\n", 152 | "1.大致熟悉了PaddleHub这个框架\n", 153 | "\n", 154 | "2.认识了好多大佬,学习到了好多新的知识点\n", 155 | "\n", 156 | "3.发现了竟然还有AIStudio这么一个可以白嫖GPU的好地方(因为没GPU其实这是我第一次玩深度学习打比赛)" 157 | ] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "metadata": { 162 | "collapsed": false 163 | }, 164 | "source": [ 165 | "# 如何在AIStudio把这个方案跑起来\n", 166 | "\n", 167 | "fork这个项目,然后一直往后执行就可以了 : )\n", 168 | "\n", 169 | "PS.\n", 170 | "\n", 171 | "为了训练方便,本项目接了VisualDL,所以在训练的时候打开VisualDL的页面是可以看到统计信息的\n", 172 | "\n", 173 | "比如:\n", 174 | "![](https://ai-studio-static-online.cdn.bcebos.com/a03481088c6040bb9adb0036d08e41c9cbeb04648dde49f4aa7a74b15692bbd8)\n", 175 | "\n", 176 | "## 如何打开VisualDL\n", 177 | "\n", 178 | "# Notebooks项目访问URL\n", 179 | "\n", 180 | "比如你的notebook网页地址为\n", 181 | "url_notebook = 'http://aistudio.baidu.com/user/30799/33852/notebooks/33852.ipynb?redirects=1'\n", 182 | "\n", 183 | "# 替换后visualdl访问URL\n", 184 | "url_visualdl = 'http://aistudio.baidu.com/user/30799/33852/visualdl'\n", 185 | "\n", 186 | "访问后面那个网址就能打开VisualDL了" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": { 193 | "collapsed": false 194 | }, 195 | "outputs": [], 196 | "source": [ 197 | "#从github拉代码\r\n", 198 | "#如果是fork的这个项目,这一步就不用执行了,如果是自己新开的项目可以执行这一步\r\n", 199 | "#!svn checkout https://github.com/PaddlePaddle/Research.git/trunk/KG/DuEE_baseline/DuEE-PaddleHub/ ./baseline\r\n", 200 | "!svn checkout https://github.com/onewaymyway/DuEE_2020.git/trunk ./baseline" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": { 207 | "collapsed": false 208 | }, 209 | "outputs": [ 210 | { 211 | "name": "stdout", 212 | "output_type": "stream", 213 | "text": [ 214 | "/home/aistudio/baseline\n" 215 | ] 216 | } 217 | ], 218 | "source": [ 219 | "# 切换到代码目录\r\n", 220 | "%cd baseline/" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "metadata": { 227 | "collapsed": false 228 | }, 229 | "outputs": [], 230 | "source": [ 231 | "#安装依赖\r\n", 232 | "!pip install -r ./requirements.txt" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": 18, 238 | "metadata": { 239 | "collapsed": false 240 | }, 241 | "outputs": [], 242 | "source": [ 243 | "#这个代码不用执行!!!\r\n", 244 | "#打包文件夹下的文件 用于本地备份 排除数据目录和模型目录\r\n", 245 | "\r\n", 246 | "!zip -r mb.zip * -x \"./models/*\" -x \"./model/*\" -x \"./data/*\" -x \"./data1/*\" -x \"./orzdata/*\"" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "metadata": { 253 | "collapsed": false 254 | }, 255 | "outputs": [], 256 | "source": [ 257 | "#拉取最新的比赛数据\r\n", 258 | "\r\n", 259 | "!rm -rf ./orzdata\r\n", 260 | "!mkdir ./orzdata\r\n", 261 | "!rm -rf ./data\r\n", 262 | "!mkdir ./data\r\n", 
263 | "%cd ./orzdata\r\n", 264 | "\r\n", 265 | "#训练数据 dev数据 test1\r\n", 266 | "!wget -O train.zip https://dataset-bj.cdn.bcebos.com/event_extraction/train_data.json.zip\r\n", 267 | "!unzip train.zip\r\n", 268 | "!wget -O dev.zip https://dataset-bj.cdn.bcebos.com/event_extraction/dev_data.json.zip\r\n", 269 | "!unzip dev.zip\r\n", 270 | "!wget -O test.zip https://dataset-bj.cdn.bcebos.com/event_extraction/test1_data.json.zip\r\n", 271 | "!unzip test.zip\r\n", 272 | "\r\n", 273 | "!cp -r ./dev_data/dev.json ../data/dev.json\r\n", 274 | "!cp -r ./train_data/train.json ../data/train.json\r\n", 275 | "!cp -r ./test1_data/test1.json ../data/test1.json\r\n", 276 | "\r\n", 277 | "#schema数据\r\n", 278 | "!wget -O schema.zip https://ai.baidu.com/file/9C92719AF96D4DDB96477BFBE1435262\r\n", 279 | "!unzip schema.zip\r\n", 280 | "\r\n", 281 | "!cp -r ./event_schema/event_schema.json ../data/event_schema.json\r\n", 282 | "\r\n", 283 | "#test2\r\n", 284 | "!wget -O test2.zip https://dataset-bj.cdn.bcebos.com/lic2020/test2_data.json.zip\r\n", 285 | "!unzip test2.zip\r\n", 286 | "!cp -r ./test2.json ../data/test2.json\r\n", 287 | "\r\n", 288 | "%cd ../" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": null, 294 | "metadata": { 295 | "collapsed": false 296 | }, 297 | "outputs": [], 298 | "source": [ 299 | "#将数据拷到data1防止做实验把数据覆盖了\r\n", 300 | "!cp -r ./data ./data1" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": null, 306 | "metadata": { 307 | "collapsed": false 308 | }, 309 | "outputs": [], 310 | "source": [ 311 | "#!/usr/bin/env python\r\n", 312 | "# -*- coding: utf-8 -*-\r\n", 313 | "\r\n", 314 | "#为了做实验方便封装的脚本调用\r\n", 315 | "\r\n", 316 | "import os\r\n", 317 | "import subprocess\r\n", 318 | "\r\n", 319 | "\r\n", 320 | "def executeSh(sh):\r\n", 321 | " '''\r\n", 322 | " 执行sh命令的封装\r\n", 323 | " '''\r\n", 324 | " print(\"sh:\",sh)\r\n", 325 | " v=os.popen(sh)\r\n", 326 | " lines=v.readlines()\r\n", 327 | " for line in lines:\r\n", 328 | " print(line)\r\n", 329 | " #print(line.decode(\"utf-8\"))\r\n", 330 | " #print(line.decode(\"gb2312\"))\r\n", 331 | " v.close()\r\n", 332 | "def executeShs(shlist):\r\n", 333 | " '''\r\n", 334 | " 执行多个sh命令的封装\r\n", 335 | " '''\r\n", 336 | " for sh in shlist:\r\n", 337 | " executeSh(sh)\r\n", 338 | " \r\n", 339 | "def savebestToTemp(modelpath):\r\n", 340 | " '''\r\n", 341 | " 将modelpath/best_model复制到modelpath/curtest\r\n", 342 | " 主要用于保存和测试当前的best_model,防止best_model被覆盖无法重现测试的结果\r\n", 343 | " '''\r\n", 344 | " tar=modelpath+\"/curtest\"\r\n", 345 | " src=modelpath+\"/best_model/.\"\r\n", 346 | " sh=\"rm -rf \"+tar\r\n", 347 | " executeSh(sh)\r\n", 348 | " sh=\"mkdir \"+tar\r\n", 349 | " executeSh(sh)\r\n", 350 | " sh=\"cp -r \"+src+\" \"+tar\r\n", 351 | " executeSh(sh)\r\n", 352 | "\r\n", 353 | "def checkPredFile(filePath):\r\n", 354 | " '''\r\n", 355 | " 删除结果中不合理的元素\r\n", 356 | " '''\r\n", 357 | " executeSh(\"python datachecker.py --targetFile \"+filePath)\r\n", 358 | "\r\n", 359 | "def run_gru_model(triggermodel,testFile,saveFile):\r\n", 360 | " '''\r\n", 361 | " 一次性跑完从trigger预测到role预测再到输出最终结果\r\n", 362 | " triggermodel:使用的trigger模型目录\r\n", 363 | " testFile:要预测的文件\r\n", 364 | " saveFile:最终输出文件\r\n", 365 | " '''\r\n", 366 | " shs=[\r\n", 367 | " \"sh run_trigger_gru_predict_withmodel.sh 0 ./data1/ models/trigger_gru {0} {1}\".format(testFile,triggermodel),\r\n", 368 | " \"python data_process.py --action predict_data_process2 --trigger_file data1/{0}.trigger.pred --schema_file data/event_schema.json --save_path 
data1/{0}_triggerpred.json\".format(testFile,triggermodel),\r\n", 369 | " \"sh run_role1_gru_predict.sh 0 ./data1/ models/rolegru {0}_triggerpred.json\".format(testFile,triggermodel),\r\n", 370 | " \"python data_process.py --action predict_data_process3 --role_file data1/{0}_triggerpred.json.role1.pred --schema_file data1/event_schema.json --save_path data1/{2}\".format(testFile,triggermodel,saveFile)\r\n", 371 | " ]\r\n", 372 | " #print(shs)\r\n", 373 | " executeShs(shs)\r\n", 374 | "\r\n", 375 | "def run_gru_modelWithOutmark(triggermodel,testFile,outmark=\"\"):\r\n", 376 | " '''\r\n", 377 | " 生成带mark的预测文件\r\n", 378 | " 主要用于模型之间的融合\r\n", 379 | " 仅用于实验,比赛最终结果没用模型融合\r\n", 380 | " '''\r\n", 381 | " shs=[\r\n", 382 | " \"sh run_trigger_gru_predict_withmodel.sh 0 ./data1/ models/trigger_gru {0} {1}\".format(testFile,triggermodel),\r\n", 383 | " \"python data_process.py --action predict_data_process2 --trigger_file data1/{0}.trigger.pred --schema_file data/event_schema.json --save_path data1/{0}{2}_triggerpred.json\".format(testFile,triggermodel,outmark)\r\n", 384 | " ]\r\n", 385 | " #print(shs)\r\n", 386 | " executeShs(shs) \r\n", 387 | "\r\n", 388 | "def runRolePredict(triggerfile=\"test1_triggerpred.json\",rolemodel=\"models/rolegru\",gru=False,predictmodel=\"best_model\",outmark=\"\",finalFile=None,ifCheck=False,data_dir=\"./data1\"):\r\n", 389 | " '''\r\n", 390 | " 根据trigger预测结果进行role预测并生成最终的结果\r\n", 391 | " '''\r\n", 392 | " sh=\"python sequence_label.py --num_epoch 30 \\\r\n", 393 | " --learning_rate 3e-5 \\\r\n", 394 | " --data_dir {data_dir} \\\r\n", 395 | " --schema_path {data_dir}/event_schema.json \\\r\n", 396 | " --train_data {data_dir}/train.json \\\r\n", 397 | " --dev_data {data_dir}/dev.json \\\r\n", 398 | " --test_data {data_dir}/dev.json \\\r\n", 399 | " --predict_data {data_dir}/{predictfile} \\\r\n", 400 | " --do_train False \\\r\n", 401 | " --do_predict False \\\r\n", 402 | " --do_predict2 True \\\r\n", 403 | " --do_model role1 \\\r\n", 404 | " --add_gru {gru} \\\r\n", 405 | " --predictmodel {predictmodel} \\\r\n", 406 | " --max_seq_len 256 \\\r\n", 407 | " --batch_size 8 \\\r\n", 408 | " --model_save_step 3000 \\\r\n", 409 | " --eval_step 200 \\\r\n", 410 | " --checkpoint_dir {ckpt_dir}\".format(data_dir=data_dir,predictfile=triggerfile,gru=gru,ckpt_dir=rolemodel,predictmodel=predictmodel)\r\n", 411 | " if outmark:\r\n", 412 | " sh+=\" --outmark \"+outmark\r\n", 413 | " executeSh(sh)\r\n", 414 | " if finalFile:\r\n", 415 | " makefinalFile((\"{data_dir}/\"+triggerfile+\".role1{outmark}.pred\").format(data_dir=data_dir,outmark=outmark),finalFile)\r\n", 416 | " if ifCheck:\r\n", 417 | " checkPredFile(finalFile)\r\n", 418 | "\r\n", 419 | "def makefinalFile(rolePredFile=\"data1/test1_triggerpred_md.json.role1.pred\",savefile=\"data1/test1fn_pred.json\"):\r\n", 420 | " '''\r\n", 421 | " 根据role预测结果生成最终提交数据\r\n", 422 | " '''\r\n", 423 | " sh=\"python data_process.py --action predict_data_process3 --role_file {0} --schema_file data1/event_schema.json --save_path {1}\".format(rolePredFile,savefile)\r\n", 424 | " #sh=\"sh makefinalpred.sh {0} {1}\".format(rolePredFile,savefile)\r\n", 425 | " executeSh(sh)" 426 | ] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "execution_count": null, 431 | "metadata": { 432 | "collapsed": false 433 | }, 434 | "outputs": [], 435 | "source": [ 436 | "#为了快速跑流程 代码中的预训练模型改成了ernie_tiny\r\n", 437 | "#实际比赛用的是chinese-roberta-wwm-ext-large\r\n", 438 | "#要达到比赛的分数效果可以在sequence_label.py中将预训练模型改为chinese-roberta-wwm-ext-large\r\n", 439 | 
"#为了简洁,删掉了很多做实验的代码和脚本\r\n", 440 | "#训练建议到终端里进行训练,因为在终端训练关了浏览器再进还能看到输出 : )" 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": null, 446 | "metadata": { 447 | "collapsed": false 448 | }, 449 | "outputs": [], 450 | "source": [ 451 | "#训练trigger_gru 加了gru层版本trigger识别模型\r\n", 452 | "!sh run_trigger_gru.sh 0 ./data1/ models/trigger_gru" 453 | ] 454 | }, 455 | { 456 | "cell_type": "code", 457 | "execution_count": null, 458 | "metadata": { 459 | "collapsed": false 460 | }, 461 | "outputs": [], 462 | "source": [ 463 | "#训练trigger_gru_mix 将dev数据也放入训练\r\n", 464 | "!sh run_trigger_gru_mix.sh 0 ./data1/ models/trigger_gru" 465 | ] 466 | }, 467 | { 468 | "cell_type": "code", 469 | "execution_count": null, 470 | "metadata": { 471 | "collapsed": false 472 | }, 473 | "outputs": [], 474 | "source": [ 475 | "#trigger_gru 预测\r\n", 476 | "#预测结果存在data1/test1.json.trigger.pred\r\n", 477 | "!sh run_trigger_gru_predict.sh 0 ./data1/ models/trigger_gru test1.json" 478 | ] 479 | }, 480 | { 481 | "cell_type": "code", 482 | "execution_count": null, 483 | "metadata": { 484 | "collapsed": false 485 | }, 486 | "outputs": [], 487 | "source": [ 488 | "#根据trigger_gru预测结果数据生成role预测需要的数据\r\n", 489 | "#结果文件为data1/test1_triggerpred.json\r\n", 490 | "#具体代码见data_process.py predict_data_process2函数\r\n", 491 | "!python data_process.py --action predict_data_process2 --trigger_file data1/test1.json.trigger.pred --schema_file data/event_schema.json --save_path data1/test1_triggerpred.json" 492 | ] 493 | }, 494 | { 495 | "cell_type": "code", 496 | "execution_count": null, 497 | "metadata": { 498 | "collapsed": false 499 | }, 500 | "outputs": [], 501 | "source": [ 502 | "#训练role1 gru 训练加了gru层的role识别模型\r\n", 503 | "!sh run_role1_gru.sh 0 ./data1/ models/rolegru" 504 | ] 505 | }, 506 | { 507 | "cell_type": "code", 508 | "execution_count": 15, 509 | "metadata": { 510 | "collapsed": false 511 | }, 512 | "outputs": [], 513 | "source": [ 514 | "#训练role1_gru mix 将dev也加入训练\r\n", 515 | "!sh run_role1_gru_mix.sh 0 ./data1/ models/rolegru" 516 | ] 517 | }, 518 | { 519 | "cell_type": "code", 520 | "execution_count": 16, 521 | "metadata": { 522 | "collapsed": false 523 | }, 524 | "outputs": [], 525 | "source": [ 526 | "#role1_gru预测\r\n", 527 | "#结果文件为data1/test1_triggerpred.json.role1.pred\r\n", 528 | "!sh run_role1_gru_predict.sh 0 ./data1/ models/rolegru test1_triggerpred.json\r\n", 529 | "#role1 生成最终提交结果\r\n", 530 | "#结果文件为data1/test1fn_pred.json\r\n", 531 | "!python data_process.py --action predict_data_process3 --role_file data1/test1_triggerpred.json.role1.pred --schema_file data1/event_schema.json --save_path data1/test1fn_pred.json" 532 | ] 533 | }, 534 | { 535 | "cell_type": "code", 536 | "execution_count": 17, 537 | "metadata": { 538 | "collapsed": false 539 | }, 540 | "outputs": [], 541 | "source": [ 542 | "#从结果中删除明显不合理的结果,这一步有一点提升,不做也没大影响\r\n", 543 | "#结果文件为data1/test1fn_pred_md.json\r\n", 544 | "checkPredFile(\"./data1/test1fn_pred.json\")" 545 | ] 546 | }, 547 | { 548 | "cell_type": "code", 549 | "execution_count": null, 550 | "metadata": { 551 | "collapsed": false 552 | }, 553 | "outputs": [], 554 | "source": [ 555 | "#AIStudio更新代码编辑器之后出现编辑sh文件后无法执行sh文件的问题\r\n", 556 | "#需要将每行的\\r手动去掉才能正常执行\r\n", 557 | "\r\n", 558 | "from nlputils import readFile,saveFile\r\n", 559 | "def adptSHFile(filePath):\r\n", 560 | " txt=readFile(filePath)\r\n", 561 | " txt=txt.replace(\"\\r\",\"\")\r\n", 562 | " saveFile(filePath,txt)\r\n", 563 | "\r\n", 564 | "adptSHFile(\"run_role1_gru.sh\")\r\n", 565 | 
"adptSHFile(\"run_role1_gru_mix.sh\")\r\n", 566 | "adptSHFile(\"run_role1_gru_eval.sh\")\r\n", 567 | "adptSHFile(\"run_role1_gru_predict.sh\")\r\n", 568 | "adptSHFile(\"run_trigger_gru.sh\")\r\n", 569 | "adptSHFile(\"run_trigger_gru_predict.sh\")\r\n", 570 | "adptSHFile(\"run_trigger_gru_mix.sh\")\r\n", 571 | "adptSHFile(\"run_trigger_gru_predict_withmodel.sh\")\r\n", 572 | "print(\"ok\")" 573 | ] 574 | }, 575 | { 576 | "cell_type": "markdown", 577 | "metadata": { 578 | "collapsed": false 579 | }, 580 | "source": [ 581 | "请点击[此处](https://ai.baidu.com/docs#/AIStudio_Project_Notebook/a38e5576)查看本环境基本用法.
\n", 582 | "Please click [here ](https://ai.baidu.com/docs#/AIStudio_Project_Notebook/a38e5576) for more detailed instructions. " 583 | ] 584 | } 585 | ], 586 | "metadata": { 587 | "kernelspec": { 588 | "display_name": "PaddlePaddle 1.7.1 (Python 2.7)", 589 | "language": "python", 590 | "name": "py27-paddle1.2.0" 591 | }, 592 | "language_info": { 593 | "codemirror_mode": { 594 | "name": "ipython", 595 | "version": 2 596 | }, 597 | "file_extension": ".py", 598 | "mimetype": "text/x-python", 599 | "name": "python", 600 | "nbconvert_exporter": "python", 601 | "pygments_lexer": "ipython2", 602 | "version": "2.7.15" 603 | } 604 | }, 605 | "nbformat": 4, 606 | "nbformat_minor": 1 607 | } 608 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.14.5 2 | six==1.11.0 3 | paddlepaddle-gpu==1.7.1.post97 4 | paddlehub==1.6.1 5 | Werkzeug==1.0.1 6 | 7 | -------------------------------------------------------------------------------- /run_role1_gru.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | export CUDA_VISIBLE_DEVICES=$1 5 | export FLAGS_eager_delete_tensor_gb=0 6 | export FLAGS_fraction_of_gpu_memory_to_use=0.3 7 | 8 | data_dir=$2 9 | ckpt_dir=$3 10 | 11 | python sequence_label.py --num_epoch 50 \ 12 | --learning_rate 3e-5 \ 13 | --data_dir ${data_dir} \ 14 | --schema_path ${data_dir}/event_schema.json \ 15 | --train_data ${data_dir}/train.json \ 16 | --dev_data ${data_dir}/dev.json \ 17 | --test_data ${data_dir}/dev.json \ 18 | --do_train True \ 19 | --add_gru True \ 20 | --do_predict False \ 21 | --do_model role1 \ 22 | --max_seq_len 256 \ 23 | --batch_size 32 \ 24 | --model_save_step 3000 \ 25 | --eval_step 100 \ 26 | --checkpoint_dir ${ckpt_dir} 27 | 28 | -------------------------------------------------------------------------------- /run_role1_gru_eval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | export CUDA_VISIBLE_DEVICES=$1 5 | export FLAGS_eager_delete_tensor_gb=0 6 | export FLAGS_fraction_of_gpu_memory_to_use=0.3 7 | 8 | data_dir=$2 9 | ckpt_dir=$3 10 | 11 | python sequence_label.py --num_epoch 80 \ 12 | --learning_rate 3e-5 \ 13 | --data_dir ${data_dir} \ 14 | --schema_path ${data_dir}/event_schema.json \ 15 | --train_data ${data_dir}/train.json \ 16 | --dev_data ${data_dir}/dev.json \ 17 | --test_data ${data_dir}/dev.json \ 18 | --do_train False \ 19 | --do_eval True \ 20 | --add_gru True \ 21 | --do_predict False \ 22 | --do_model role1 \ 23 | --max_seq_len 256 \ 24 | --batch_size 32 \ 25 | --model_save_step 3000 \ 26 | --eval_step 200 \ 27 | --checkpoint_dir ${ckpt_dir} 28 | 29 | -------------------------------------------------------------------------------- /run_role1_gru_mix.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | export CUDA_VISIBLE_DEVICES=$1 5 | export FLAGS_eager_delete_tensor_gb=0 6 | export FLAGS_fraction_of_gpu_memory_to_use=0.3 7 | 8 | data_dir=$2 9 | ckpt_dir=$3 10 | 11 | python sequence_label.py --num_epoch 75 \ 12 | --learning_rate 3e-5 \ 13 | --data_dir ${data_dir} \ 14 | --schema_path ${data_dir}/event_schema.json \ 15 | --train_data ${data_dir}/train.json \ 16 | --dev_data ${data_dir}/dev.json \ 17 | --test_data ${data_dir}/dev.json \ 18 | --do_train True \ 19 | --mixtrain True \ 20 | --add_gru True \ 21 | --do_predict 
False \ 22 | --do_model role1 \ 23 | --max_seq_len 256 \ 24 | --batch_size 32 \ 25 | --model_save_step 3000 \ 26 | --eval_step 100 \ 27 | --checkpoint_dir ${ckpt_dir} -------------------------------------------------------------------------------- /run_role1_gru_predict.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | export CUDA_VISIBLE_DEVICES=$1 5 | export FLAGS_eager_delete_tensor_gb=0 6 | export FLAGS_fraction_of_gpu_memory_to_use=0.3 7 | 8 | data_dir=$2 9 | ckpt_dir=$3 10 | predictfile=$4 11 | 12 | python sequence_label.py --num_epoch 30 \ 13 | --learning_rate 3e-5 \ 14 | --data_dir ${data_dir} \ 15 | --schema_path ${data_dir}/event_schema.json \ 16 | --train_data ${data_dir}/train.json \ 17 | --dev_data ${data_dir}/dev.json \ 18 | --test_data ${data_dir}/dev.json \ 19 | --predict_data ${data_dir}/${predictfile} \ 20 | --do_train False \ 21 | --do_predict False \ 22 | --do_predict2 True \ 23 | --add_gru True \ 24 | --do_model role1 \ 25 | --max_seq_len 256 \ 26 | --batch_size 8 \ 27 | --model_save_step 3000 \ 28 | --eval_step 200 \ 29 | --checkpoint_dir ${ckpt_dir} 30 | 31 | -------------------------------------------------------------------------------- /run_trigger_gru.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | export CUDA_VISIBLE_DEVICES=$1 5 | export FLAGS_eager_delete_tensor_gb=0 6 | export FLAGS_fraction_of_gpu_memory_to_use=0.3 7 | 8 | data_dir=$2 9 | ckpt_dir=$3 10 | 11 | python sequence_label.py --num_epoch 50 \ 12 | --learning_rate 3e-5 \ 13 | --data_dir ${data_dir} \ 14 | --schema_path ${data_dir}/event_schema.json \ 15 | --train_data ${data_dir}/train.json \ 16 | --dev_data ${data_dir}/dev.json \ 17 | --test_data ${data_dir}/dev.json \ 18 | --predict_data ${data_dir}/test1.json \ 19 | --do_train True \ 20 | --do_predict False \ 21 | --add_gru True \ 22 | --do_model trigger \ 23 | --max_seq_len 300 \ 24 | --batch_size 32 \ 25 | --model_save_step 3000 \ 26 | --eval_step 200 \ 27 | --checkpoint_dir ${ckpt_dir} 28 | 29 | -------------------------------------------------------------------------------- /run_trigger_gru_mix.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | export CUDA_VISIBLE_DEVICES=$1 5 | export FLAGS_eager_delete_tensor_gb=0 6 | export FLAGS_fraction_of_gpu_memory_to_use=0.3 7 | 8 | data_dir=$2 9 | ckpt_dir=$3 10 | 11 | python sequence_label.py --num_epoch 75 \ 12 | --learning_rate 3e-5 \ 13 | --data_dir ${data_dir} \ 14 | --schema_path ${data_dir}/event_schema.json \ 15 | --train_data ${data_dir}/train.json \ 16 | --dev_data ${data_dir}/dev.json \ 17 | --test_data ${data_dir}/dev.json \ 18 | --predict_data ${data_dir}/test1.json \ 19 | --do_train True \ 20 | --do_predict False \ 21 | --add_gru True \ 22 | --mixtrain True \ 23 | --do_model trigger \ 24 | --max_seq_len 300 \ 25 | --batch_size 20 \ 26 | --model_save_step 3000 \ 27 | --eval_step 200 \ 28 | --checkpoint_dir ${ckpt_dir} 29 | 30 | -------------------------------------------------------------------------------- /run_trigger_gru_predict.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | export CUDA_VISIBLE_DEVICES=$1 5 | export FLAGS_eager_delete_tensor_gb=0 6 | export FLAGS_fraction_of_gpu_memory_to_use=0.3 7 | 8 | data_dir=$2 9 | ckpt_dir=$3 10 | predictfile=$4 11 | 12 | python sequence_label.py --num_epoch 30 \ 13 | --learning_rate 3e-5 \ 14 | 
--data_dir ${data_dir} \ 15 | --schema_path ${data_dir}/event_schema.json \ 16 | --train_data ${data_dir}/train.json \ 17 | --dev_data ${data_dir}/dev.json \ 18 | --test_data ${data_dir}/dev.json \ 19 | --predict_data ${data_dir}/${predictfile} \ 20 | --do_train False \ 21 | --do_predict True \ 22 | --add_gru True \ 23 | --do_model trigger \ 24 | --max_seq_len 300 \ 25 | --batch_size 8 \ 26 | --model_save_step 3000 \ 27 | --eval_step 200 \ 28 | --checkpoint_dir ${ckpt_dir} 29 | 30 | -------------------------------------------------------------------------------- /run_trigger_gru_predict_withmodel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | export CUDA_VISIBLE_DEVICES=$1 5 | export FLAGS_eager_delete_tensor_gb=0 6 | export FLAGS_fraction_of_gpu_memory_to_use=0.3 7 | 8 | data_dir=$2 9 | ckpt_dir=$3 10 | predictfile=$4 11 | 12 | python sequence_label.py --num_epoch 30 \ 13 | --learning_rate 3e-5 \ 14 | --data_dir ${data_dir} \ 15 | --schema_path ${data_dir}/event_schema.json \ 16 | --train_data ${data_dir}/train.json \ 17 | --dev_data ${data_dir}/dev.json \ 18 | --test_data ${data_dir}/dev.json \ 19 | --predict_data ${data_dir}/${predictfile} \ 20 | --do_train False \ 21 | --do_predict True \ 22 | --add_gru True \ 23 | --do_model trigger \ 24 | --predictmodel $5 \ 25 | --max_seq_len 300 \ 26 | --batch_size 8 \ 27 | --model_save_step 3000 \ 28 | --eval_step 200 \ 29 | --checkpoint_dir ${ckpt_dir} 30 | 31 | -------------------------------------------------------------------------------- /sequence_label.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #coding:utf-8 3 | # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
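# Quick reference for the switches used by the run_*.sh scripts (all defined in the
# argparse section below): --do_model accepts "role1" in addition to "trigger"/"role";
# --add_gru builds the SequenceLabelTaskSP task, which puts bidirectional GRU layers in
# front of the CRF; --mixtrain appends the dev rows (minus the header line) to the
# training rows; --do_predict2 is the prediction switch used by the role1 prediction
# scripts; --predictmodel picks which checkpoint sub-directory to load instead of
# best_model (see SequenceLabelTaskSP.init_if_load_best_model); --outmark is a suffix
# added to the name of the prediction output file.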
16 | """Finetuning on sequence labeling task.""" 17 | from paddlehub.common.logger import logger 18 | import argparse 19 | import ast 20 | import json 21 | import numpy as np 22 | import paddle.fluid as fluid 23 | import paddlehub as hub 24 | from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset 25 | 26 | from data_process import data_process 27 | from data_process import schema_process 28 | from data_process import write_by_lines 29 | 30 | from nlputils import MyLog 31 | import sys 32 | print(sys.argv) 33 | # yapf: disable 34 | parser = argparse.ArgumentParser(__doc__) 35 | parser.add_argument("--num_epoch", type=int, default=3, help="Number of epoches for fine-tuning.") 36 | parser.add_argument("--use_gpu", type=ast.literal_eval, default=True, help="Whether use GPU for finetuning, input should be True or False") 37 | parser.add_argument("--learning_rate", type=float, default=5e-5, help="Learning rate used to train with warmup.") 38 | parser.add_argument("--data_dir", type=str, default=None, help="data save dir") 39 | parser.add_argument("--schema_path", type=str, default=None, help="schema path") 40 | parser.add_argument("--train_data", type=str, default=None, help="train data") 41 | parser.add_argument("--dev_data", type=str, default=None, help="dev data") 42 | parser.add_argument("--test_data", type=str, default=None, help="test data") 43 | parser.add_argument("--predict_data", type=str, default=None, help="predict data") 44 | parser.add_argument("--do_train", type=ast.literal_eval, default=False, help="do train") 45 | parser.add_argument("--do_eval", type=ast.literal_eval, default=False, help="do eval") 46 | parser.add_argument("--mixtrain", type=ast.literal_eval, default=False, help="mixtrain") 47 | parser.add_argument("--do_predict", type=ast.literal_eval, default=True, help="do predict") 48 | parser.add_argument("--do_predict2", type=ast.literal_eval, default=False, help="do predict") 49 | parser.add_argument("--do_model", type=str, default="trigger", choices=["trigger", "role","role1"], help="trigger or role") 50 | parser.add_argument("--predictmodel", type=str, default="best_model", help="bestmodel") 51 | parser.add_argument("--weight_decay", type=float, default=0.01, help="Weight decay rate for L2 regularizer.") 52 | parser.add_argument("--warmup_proportion", type=float, default=0.1, help="Warmup proportion params for warmup strategy") 53 | parser.add_argument("--max_seq_len", type=int, default=512, help="Number of words of the longest seqence.") 54 | parser.add_argument("--eval_step", type=int, default=200, help="eval step") 55 | parser.add_argument("--model_save_step", type=int, default=3000, help="model save step") 56 | parser.add_argument("--batch_size", type=int, default=32, help="Total examples' number in batch for training.") 57 | parser.add_argument("--add_crf", type=ast.literal_eval, default=True, help="add crf") 58 | parser.add_argument("--add_gru", type=ast.literal_eval, default=False, help="add gru") 59 | parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint") 60 | parser.add_argument("--outmark", type=str, default="", help="outmark") 61 | 62 | parser.add_argument("--use_data_parallel", type=ast.literal_eval, default=False, help="Whether use data parallel.") 63 | args = parser.parse_args() 64 | # yapf: enable. 
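# The lines below turn the raw json files into char-level sequence-labelling tsv files:
# schema_process() builds the B-/I-/O label set, data_process() builds the labelled rows
# (characters and labels joined by \002, one row per event type in "role1" mode) and
# write_by_lines() saves them as {do_model}_train.tsv / _dev.tsv / _test.tsv; with
# --mixtrain the dev rows, minus their "text_a\tlabel" header line, are appended to the
# training rows. Purely as an illustration (a hypothetical sentence, not something the
# script calls), a "role1" row for the second stage looks like the one built here:
def _example_role1_row():
    """Sketch of a role1 training row: event type prepended, labels left-padded with O."""
    prefix = list(u"竞赛行为-夺冠" + u":")    # stage-one event type plus ":"
    text = list(u"纳达尔夺得美网冠军")        # original sentence, split into characters
    labels = ([u"O"] * len(prefix)
              + [u"B-冠军", u"I-冠军", u"I-冠军"]   # "纳达尔" labelled as role 冠军
              + [u"O"] * (len(text) - 3))
    return u"{}\t{}".format(u"\002".join(prefix + text), u"\002".join(labels))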
65 | 66 | schema_labels = schema_process(args.schema_path, args.do_model) 67 | 68 | # 先把数据处理好保存下来 69 | train_data = data_process(args.train_data, args.do_model) # 处理训练数据 70 | dev_data = data_process(args.dev_data, args.do_model,False,schema_labels) # 处理dev数据 71 | test_data = data_process(args.test_data, args.do_model) 72 | print("train",len(train_data)) 73 | print("dev",len(dev_data)) 74 | if args.mixtrain: 75 | #将dev数据加到train里面进行训练 76 | train_data=train_data+dev_data[1:] 77 | #dev_data=train_data[:] 78 | print("mix",len(train_data)) 79 | 80 | 81 | 82 | 83 | 84 | write_by_lines("{}/{}_train.tsv".format(args.data_dir, args.do_model), train_data) 85 | write_by_lines("{}/{}_dev.tsv".format(args.data_dir, args.do_model), dev_data) 86 | write_by_lines("{}/{}_test.tsv".format(args.data_dir, args.do_model), test_data) 87 | if args.predict_data: 88 | predict_sents, predict_data = data_process(args.predict_data, args.do_model, is_predict=True) 89 | write_by_lines("{}/{}_predict.tsv".format(args.data_dir, args.do_model), predict_data) 90 | 91 | 92 | schema_labels = schema_process(args.schema_path, args.do_model) 93 | 94 | from paddlehub.finetune.evaluate import chunk_eval, calculate_f1 95 | from paddlehub.common.utils import version_compare 96 | import paddle 97 | import os 98 | 99 | class SequenceLabelTaskSP(hub.SequenceLabelTask): 100 | ''' 101 | 扩展序列标注任务 102 | 增加从非best_model目录加载模型的功能 103 | 添加gru层 104 | ''' 105 | def __init__(self, 106 | feature, 107 | max_seq_len, 108 | num_classes, 109 | feed_list, 110 | data_reader, 111 | startup_program=None, 112 | config=None, 113 | metrics_choices="default", 114 | add_crf=False): 115 | 116 | print("SequenceLabelTaskSP") 117 | 118 | super(SequenceLabelTaskSP, self).__init__( 119 | feature=feature, 120 | max_seq_len=max_seq_len, 121 | num_classes=num_classes, 122 | feed_list=feed_list, 123 | data_reader=data_reader, 124 | startup_program=startup_program, 125 | config=config, 126 | metrics_choices=metrics_choices, 127 | add_crf=add_crf) 128 | 129 | def init_if_load_best_model(self): 130 | ''' 131 | 支持从自定义的目录加载bestmodel 132 | ''' 133 | if not self.is_best_model_loaded: 134 | best_model_path = os.path.join(self.config.checkpoint_dir, 135 | args.predictmodel) 136 | logger.info("Load the best model from %s" % best_model_path) 137 | if os.path.exists(best_model_path): 138 | self.load_parameters(best_model_path) 139 | self.is_checkpoint_loaded = False 140 | self.is_best_model_loaded = True 141 | else: 142 | self.init_if_necessary() 143 | else: 144 | logger.info("The best model has been loaded") 145 | def _build_net(self): 146 | self.seq_len = fluid.layers.data( 147 | name="seq_len", shape=[1], dtype='int64', lod_level=0) 148 | 149 | if version_compare(paddle.__version__, "1.6"): 150 | self.seq_len_used = fluid.layers.squeeze(self.seq_len, axes=[1]) 151 | else: 152 | self.seq_len_used = self.seq_len 153 | 154 | #增加gru层相关的代码 155 | grnn_hidden_dim = 256 # 768 156 | crf_lr = 0.2 157 | bigru_num = 2 158 | init_bound = 0.1 159 | 160 | def _bigru_layer(input_feature): 161 | """define the bidirectional gru layer 162 | """ 163 | pre_gru = fluid.layers.fc( 164 | input=input_feature, 165 | size=grnn_hidden_dim * 3, 166 | param_attr=fluid.ParamAttr( 167 | initializer=fluid.initializer.Uniform( 168 | low=-init_bound, high=init_bound), 169 | regularizer=fluid.regularizer.L2DecayRegularizer( 170 | regularization_coeff=1e-4))) 171 | gru = fluid.layers.dynamic_gru( 172 | input=pre_gru, 173 | size=grnn_hidden_dim, 174 | param_attr=fluid.ParamAttr( 175 | 
initializer=fluid.initializer.Uniform( 176 | low=-init_bound, high=init_bound), 177 | regularizer=fluid.regularizer.L2DecayRegularizer( 178 | regularization_coeff=1e-4))) 179 | pre_gru_r = fluid.layers.fc( 180 | input=input_feature, 181 | size=grnn_hidden_dim * 3, 182 | param_attr=fluid.ParamAttr( 183 | initializer=fluid.initializer.Uniform( 184 | low=-init_bound, high=init_bound), 185 | regularizer=fluid.regularizer.L2DecayRegularizer( 186 | regularization_coeff=1e-4))) 187 | gru_r = fluid.layers.dynamic_gru( 188 | input=pre_gru_r, 189 | size=grnn_hidden_dim, 190 | is_reverse=True, 191 | param_attr=fluid.ParamAttr( 192 | initializer=fluid.initializer.Uniform( 193 | low=-init_bound, high=init_bound), 194 | regularizer=fluid.regularizer.L2DecayRegularizer( 195 | regularization_coeff=1e-4))) 196 | bi_merge = fluid.layers.concat(input=[gru, gru_r], axis=1) 197 | return bi_merge 198 | 199 | if self.add_crf: 200 | unpad_feature = fluid.layers.sequence_unpad( 201 | self.feature, length=self.seq_len_used) 202 | 203 | #增加gru层相关的代码 204 | input_feature = unpad_feature 205 | for i in range(bigru_num): 206 | bigru_output = _bigru_layer(input_feature) 207 | input_feature = bigru_output 208 | 209 | unpad_feature=input_feature 210 | self.emission = fluid.layers.fc( 211 | size=self.num_classes, 212 | input=unpad_feature, 213 | param_attr=fluid.ParamAttr( 214 | initializer=fluid.initializer.Uniform(low=-0.1, high=0.1), 215 | regularizer=fluid.regularizer.L2DecayRegularizer( 216 | regularization_coeff=1e-4))) 217 | size = self.emission.shape[1] 218 | fluid.layers.create_parameter( 219 | shape=[size + 2, size], dtype=self.emission.dtype, name='crfw') 220 | self.ret_infers = fluid.layers.crf_decoding( 221 | input=self.emission, param_attr=fluid.ParamAttr(name='crfw')) 222 | ret_infers = fluid.layers.assign(self.ret_infers) 223 | return [ret_infers] 224 | else: 225 | self.logits = fluid.layers.fc( 226 | input=self.feature, 227 | size=self.num_classes, 228 | num_flatten_dims=2, 229 | param_attr=fluid.ParamAttr( 230 | name="cls_seq_label_out_w", 231 | initializer=fluid.initializer.TruncatedNormal(scale=0.02)), 232 | bias_attr=fluid.ParamAttr( 233 | name="cls_seq_label_out_b", 234 | initializer=fluid.initializer.Constant(0.))) 235 | 236 | self.ret_infers = fluid.layers.reshape( 237 | x=fluid.layers.argmax(self.logits, axis=2), shape=[-1, 1]) 238 | 239 | logits = self.logits 240 | logits = fluid.layers.flatten(logits, axis=2) 241 | logits = fluid.layers.softmax(logits) 242 | self.num_labels = logits.shape[1] 243 | return [logits] 244 | 245 | class EEDataset(BaseNLPDataset): 246 | """EEDataset""" 247 | def __init__(self, data_dir, labels, model="trigger"): 248 | pdf="{}_predict.tsv".format(model) 249 | if not args.predict_data: 250 | pdf="{}_test.tsv".format(model) 251 | print("labels:",labels) 252 | # 数据集存放位置 253 | super(EEDataset, self).__init__( 254 | base_path=data_dir, 255 | train_file="{}_train.tsv".format(model), 256 | dev_file="{}_dev.tsv".format(model), 257 | test_file="{}_test.tsv".format(model), 258 | # 如果还有预测数据(不需要文本类别label),可以放在predict.tsv 259 | predict_file=pdf, 260 | train_file_with_header=True, 261 | dev_file_with_header=True, 262 | test_file_with_header=True, 263 | predict_file_with_header=True, 264 | # 数据集类别集合 265 | label_list=labels) 266 | 267 | 268 | def main(): 269 | # Load Paddlehub pretrained model 270 | # 更多预训练模型 https://www.paddlepaddle.org.cn/hublist?filter=en_category&value=SemanticModel 271 | 272 | #设置使用的预训练模型 273 | model_name = "ernie_tiny" 274 | #model_name = 
"chinese-roberta-wwm-ext-large" 275 | module = hub.Module(name=model_name) 276 | 277 | 278 | inputs, outputs, program = module.context( 279 | trainable=True, max_seq_len=args.max_seq_len) 280 | 281 | # Download dataset and use SequenceLabelReader to read dataset 282 | dataset = EEDataset(args.data_dir, schema_labels, model=args.do_model) 283 | reader = hub.reader.SequenceLabelReader( 284 | dataset=dataset, 285 | vocab_path=module.get_vocab_path(), 286 | max_seq_len=args.max_seq_len, 287 | sp_model_path=module.get_spm_path(), 288 | word_dict_path=module.get_word_dict_path()) 289 | 290 | # Construct transfer learning network 291 | # Use "sequence_output" for token-level output. 292 | sequence_output = outputs["sequence_output"] 293 | 294 | # Setup feed list for data feeder 295 | # Must feed all the tensor of module need 296 | feed_list = [ 297 | inputs["input_ids"].name, inputs["position_ids"].name, 298 | inputs["segment_ids"].name, inputs["input_mask"].name 299 | ] 300 | 301 | # Select a finetune strategy 302 | strategy = hub.AdamWeightDecayStrategy( 303 | warmup_proportion=args.warmup_proportion, 304 | weight_decay=args.weight_decay, 305 | learning_rate=args.learning_rate) 306 | 307 | print("use_cuda:",args.use_gpu) 308 | 309 | # Setup runing config for PaddleHub Finetune API 310 | config = hub.RunConfig( 311 | eval_interval=args.eval_step, 312 | save_ckpt_interval=args.model_save_step, 313 | use_data_parallel=args.use_data_parallel, 314 | use_cuda=args.use_gpu, 315 | num_epoch=args.num_epoch, 316 | batch_size=args.batch_size, 317 | checkpoint_dir=args.checkpoint_dir, 318 | strategy=strategy) 319 | 320 | # Define a sequence labeling finetune task by PaddleHub's API 321 | # If add crf, the network use crf as decoder 322 | 323 | print("add_gru",args.add_gru) 324 | print("add_crf",args.add_crf) 325 | 326 | if args.add_gru: 327 | 328 | seq_label_task = SequenceLabelTaskSP( 329 | data_reader=reader, 330 | feature=sequence_output, 331 | feed_list=feed_list, 332 | max_seq_len=args.max_seq_len, 333 | num_classes=dataset.num_labels, 334 | config=config, 335 | add_crf=args.add_crf) 336 | else: 337 | seq_label_task = hub.SequenceLabelTask( 338 | data_reader=reader, 339 | feature=sequence_output, 340 | feed_list=feed_list, 341 | max_seq_len=args.max_seq_len, 342 | num_classes=dataset.num_labels, 343 | config=config, 344 | add_crf=args.add_crf) 345 | 346 | 347 | 348 | # 创建 LogWriter 对象 349 | log_writer = MyLog(mode="role2") 350 | seq_label_task._tb_writer=log_writer 351 | 352 | # Finetune and evaluate model by PaddleHub's API 353 | # will finish training, evaluation, testing, save model automatically 354 | if args.do_train: 355 | print("start finetune and eval process") 356 | seq_label_task.finetune_and_eval() 357 | seq_label_task.best_score=-999 358 | 359 | if args.do_eval: 360 | print("start eval process") 361 | seq_label_task.eval() 362 | 363 | if args.do_predict: 364 | print("start predict process") 365 | ret = [] 366 | id2label = {val: key for key, val in reader.label_map.items()} 367 | input_data = [[d] for d in predict_data] 368 | run_states = seq_label_task.predict(data=input_data[1:]) 369 | results = [] 370 | for batch_states in run_states: 371 | batch_results = batch_states.run_results 372 | batch_infers = batch_results[0].reshape([-1]).astype(np.int32).tolist() 373 | seq_lens = batch_results[1].reshape([-1]).astype(np.int32).tolist() 374 | current_id = 0 375 | for length in seq_lens: 376 | seq_infers = batch_infers[current_id:current_id + length] 377 | seq_result = list(map(id2label.get, 
seq_infers[1: -1])) 378 | current_id += length if args.add_crf else args.max_seq_len 379 | results.append(seq_result) 380 | 381 | ret = [] 382 | for sent, r_label in zip(predict_sents, results): 383 | sent["labels"] = r_label 384 | ret.append(json.dumps(sent, ensure_ascii=False)) 385 | write_by_lines("{}.{}{}.pred".format(args.predict_data, args.do_model, args.outmark), ret) 386 | 387 | if args.do_predict2: 388 | print("start predict2 process") 389 | ret = [] 390 | id2label = {val: key for key, val in reader.label_map.items()} 391 | input_data = [[d] for d in predict_data] 392 | run_states = seq_label_task.predict(data=input_data[1:]) 393 | results = [] 394 | for batch_states in run_states: 395 | batch_results = batch_states.run_results 396 | batch_infers = batch_results[0].reshape([-1]).astype(np.int32).tolist() 397 | seq_lens = batch_results[1].reshape([-1]).astype(np.int32).tolist() 398 | current_id = 0 399 | for length in seq_lens: 400 | seq_infers = batch_infers[current_id:current_id + length] 401 | seq_result = list(map(id2label.get, seq_infers[1: -1])) 402 | current_id += length if args.add_crf else args.max_seq_len 403 | results.append(seq_result) 404 | 405 | ret = [] 406 | for sent, r_label in zip(predict_sents, results): 407 | sent["labels"] = r_label 408 | ret.append(json.dumps(sent, ensure_ascii=False)) 409 | write_by_lines("{}.{}{}.pred".format(args.predict_data, args.do_model, args.outmark), ret) 410 | 411 | 412 | if __name__ == "__main__": 413 | main() 414 | --------------------------------------------------------------------------------
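# -*- coding: utf-8 -*-
# Appendix (an illustrative sketch, not a file from the original repo): decoding the
# BIO label sequences that sequence_label.py predicts back into argument spans with
# data_process.extract_result(). The sentence and labels below are made up; to run it,
# save it as a small script in the repo root so that data_process and fileutils are
# importable.
from data_process import extract_result
from fileutils import wwprint


def demo_decode():
    text = list(u"纳达尔夺得美网冠军")
    labels = [u"B-冠军", u"I-冠军", u"I-冠军", u"O", u"O",
              u"B-夺冠赛事", u"I-夺冠赛事", u"O", u"O"]
    for span in extract_result(text, labels):
        # every span is a dict: {"start": index, "text": [chars], "type": role}
        wwprint(u"".join(span["text"]), span["type"])
    # expected output: ["纳达尔", "冠军"] and ["美网", "夺冠赛事"]


if __name__ == "__main__":
    demo_decode()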