├── __init__.py ├── README.md ├── html_entities ├── ace2ner.py ├── ltf2sent.py ├── ltf2bio.py ├── source2rsd.py ├── tokenizer.py ├── rsd2ltf.py ├── ace2event.py ├── bio2ere.py └── bio2ace.py /__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ACE and ERE Preprocessing 2 | 3 | This repository contains preprocessing scripts for the ACE and ERE datasets, covering name tagging, entity coreference, relation extraction, event extraction, and event coreference tasks (verified on the ACE_2005 and Rich_ERE corpora). 4 | 5 | ## Requirements 6 | 7 | Python 3.6, jieba, NLTK (ace2event.py additionally requires spaCy, plus the en_core_web_sm model when --dep is set) 8 | 9 | ## Usage 10 | 11 | ### Step 1 12 | Preprocess the data: remove XML tags from the ACE/ERE articles and merge sentences. 13 | 14 | ``` 15 | python source2rsd.py --source [source_path] --rsd [rsd_path] --data [ace or ere] --extension [ending_of_source_files] 16 | ``` 17 | 18 | > [source_path]: the path for input files (all .sgm files from the ACE source corpus) 19 | 20 | > [rsd_path]: output path 21 | 22 | ### Step 2 23 | Sentence segmentation and tokenization with offset retrieval. 24 | 25 | ``` 26 | python rsd2ltf.py --rsd [rsd_path] --ltf [ltf_path] --extension [ending_of_rsd_files] 27 | ``` 28 | 29 | > [rsd_path]: the path for rsd files from Step 1 30 | 31 | > [ltf_path]: output path 32 | 33 | ### Step 3 34 | Convert ltf files into tokenized sentences in the BIO format used for name tagging. 35 | 36 | ``` 37 | python ltf2bio.py --ltf [ltf_path] --bio [bio_path] 38 | ``` 39 | 40 | > [ltf_path]: the path for input files 41 | 42 | > [bio_path]: output path 43 | 44 | ### Step 4 45 | Add annotations to the bio files. 46 | 47 | ``` 48 | python bio2ace.py --bio [bio_path] --ann [ann_path] --ace [ace_path] 49 | ``` 50 | 51 | > [bio_path]: the path for input files 52 | 53 | > [ann_path]: the path for all annotation files from ACE 54 | 55 | > [ace_path]: output path 56 | 57 | Similarly, for the ERE corpus: 58 | 59 | ``` 60 | python bio2ere.py --bio [bio_path] --ann [ann_path] --ere [ere_path] 61 | ``` 62 | 63 | ## Citation 64 | [1] Lifu Huang, Taylor Cassidy, Xiaocheng Feng, Heng Ji, Clare R Voss, Jiawei Han, Avirup Sil. Liberal Event Extraction and Event Schema Induction. Proc. ACL'2016 65 | 66 | [2] Lifu Huang, Avirup Sil, Heng Ji, Radu Florian. Improving Slot Filling Performance with Attentive Neural Networks on Dependency Structures. Proc. EMNLP'2017 67 | 68 | [3] Lifu Huang, Heng Ji, Kyunghyun Cho, Clare R Voss. Zero-shot Transfer Learning for Event Extraction. Proc.
ACL, 2018 69 | -------------------------------------------------------------------------------- /html_entities: -------------------------------------------------------------------------------- 1 | " " quotation mark 2 | ' ' apostrophe 3 | & & ampersand 4 | < < less-than 5 | > > greater-than 6 |   non-breaking space 7 | ¡ ¡ inverted exclamation mark 8 | ¢ ¢ cent 9 | £ £ pound 10 | ¤ ¤ currency 11 | ¥ ¥ yen 12 | ¦ ¦ broken vertical bar 13 | § § section 14 | ¨ ¨ spacing diaeresis 15 | © © copyright 16 | ª ª feminine ordinal indicator 17 | « « angle quotation mark (left) 18 | ¬ ¬ negation 19 | ­ soft hyphen 20 | ® ® registered trademark 21 | ¯ ¯ spacing macron 22 | ° ° degree 23 | ± ± plus-or-minus 24 | ² ² superscript 2 25 | ³ ³ superscript 3 26 | ´ ´ spacing acute 27 | µ µ micro 28 | ¶ ¶ paragraph 29 | · · middle dot 30 | ¸ ¸ spacing cedilla 31 | ¹ ¹ superscript 1 32 | º º masculine ordinal indicator 33 | » » angle quotation mark (right) 34 | ¼ ¼ fraction 1/4 35 | ½ ½ fraction 1/2 36 | ¾ ¾ fraction 3/4 37 | ¿ ¿ inverted question mark 38 | × × multiplication 39 | ÷ ÷ division 40 | À À capital a, grave accent 41 | Á Á capital a, acute accent 42 | Â Â capital a, circumflex accent 43 | Ã Ã capital a, tilde 44 | Ä Ä capital a, umlaut mark 45 | Å Å capital a, ring 46 | Æ Æ capital ae 47 | Ç Ç capital c, cedilla 48 | È È capital e, grave accent 49 | É É capital e, acute accent 50 | Ê Ê capital e, circumflex accent 51 | Ë Ë capital e, umlaut mark 52 | Ì Ì capital i, grave accent 53 | Í Í capital i, acute accent 54 | Î Î capital i, circumflex accent 55 | Ï Ï capital i, umlaut mark 56 | Ð Ð capital eth, Icelandic 57 | Ñ Ñ capital n, tilde 58 | Ò Ò capital o, grave accent 59 | Ó Ó capital o, acute accent 60 | Ô Ô capital o, circumflex accent 61 | Õ Õ capital o, tilde 62 | Ö Ö capital o, umlaut mark 63 | Ø Ø capital o, slash 64 | Ù Ù capital u, grave accent 65 | Ú Ú capital u, acute accent 66 | Û Û capital u, circumflex accent 67 | Ü Ü capital u, umlaut mark 68 | Ý Ý capital y, acute accent 69 | Þ Þ capital THORN, Icelandic 70 | ß ß small sharp s, German 71 | à à small a, grave accent 72 | á á small a, acute accent 73 | â â small a, circumflex accent 74 | ã ã small a, tilde 75 | ä ä small a, umlaut mark 76 | å å small a, ring 77 | æ æ small ae 78 | ç ç small c, cedilla 79 | è è small e, grave accent 80 | é é small e, acute accent 81 | ê ê small e, circumflex accent 82 | ë ë small e, umlaut mark 83 | ì ì small i, grave accent 84 | í í small i, acute accent 85 | î î small i, circumflex accent 86 | ï ï small i, umlaut mark 87 | ð ð small eth, Icelandic 88 | ñ ñ small n, tilde 89 | ò ò small o, grave accent 90 | ó ó small o, acute accent 91 | ô ô small o, circumflex accent 92 | õ õ small o, tilde 93 | ö ö small o, umlaut mark 94 | ø ø small o, slash 95 | ù ù small u, grave accent 96 | ú ú small u, acute accent 97 | û û small u, circumflex accent 98 | ü ü small u, umlaut mark 99 | ý ý small y, acute accent 100 | þ þ small thorn, Icelandic 101 | ÿ ÿ small y, umlaut mark -------------------------------------------------------------------------------- /ace2ner.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import csv 4 | 5 | 6 | def write_ner(ace_file, ner_file): 7 | all_lines = [] 8 | with open(ace_file, 'r') as csv_file: 9 | reader = csv.DictReader(csv_file) 10 | for row in reader: 11 | token = row["token"] 12 | if token == "----sentence_delimiter----": 13 | all_lines.append("\n") 14 | else: 15 | token_offset_parts = 
row["offset"].split(':') 16 | offset_parts = token_offset_parts[1].split('-') 17 | token_start = int(offset_parts[0]) 18 | token_end = int(offset_parts[1]) 19 | if row["ner_type"] == "O": 20 | all_lines.append(row["token"] + " " + row["offset"] + " " + row["ner_type"] + "\n") 21 | else: 22 | ner_nam_nom = row["ner_nam_nom"] 23 | if ner_nam_nom == "NAM": 24 | ner_offset_parts = row["ner_offset"].split(':') 25 | ner_start = int(ner_offset_parts[0]) 26 | ner_end = int(ner_offset_parts[1]) 27 | ner_type_parts = row["ner_type"].split(":") 28 | tag = ner_type_parts[0] + "-" + determine_tag(token_start, token_end, ner_start, ner_end) 29 | all_lines.append(row["token"] + " " + row["offset"] + " " + tag + "\n") 30 | else: 31 | all_lines.append(row["token"] + " " + row["offset"] + " " + "O" + "\n") 32 | new_all_lines = validate_lines(all_lines) 33 | out = open(ner_file, 'w') 34 | for l in new_all_lines: 35 | out.write(l) 36 | out.close() 37 | 38 | 39 | def validate_lines(all_lines): 40 | new_all_lines = [] 41 | pre_tag = "" 42 | for i in range(len(all_lines)): 43 | current_line = all_lines[i].strip() 44 | if len(current_line) == 0: 45 | new_all_lines.append(current_line + "\n") 46 | else: 47 | parts = current_line.split(' ') 48 | tag = parts[2] 49 | if tag.endswith("I") and not (pre_tag.endswith("B") or pre_tag.endswith("I")): 50 | print("Error " + current_line) 51 | new_line = all_lines[i].strip()[:-1] + "B" 52 | new_all_lines.append(new_line + "\n") 53 | else: 54 | new_all_lines.append(all_lines[i].strip() + "\n") 55 | pre_tag = tag 56 | return new_all_lines 57 | 58 | 59 | def determine_tag(token_start, token_end, ner_start, ner_end): 60 | tag = "B" 61 | if token_start <= ner_start <= token_end: 62 | tag = "B" 63 | elif ner_start < token_start < ner_end: 64 | tag = "I" 65 | return tag 66 | 67 | 68 | if __name__ == "__main__": 69 | parser = argparse.ArgumentParser() 70 | parser.add_argument('--ace', type=str, 71 | help='ace input path') 72 | parser.add_argument('--ner', type=str, 73 | help='ner bio path') 74 | 75 | args = parser.parse_args() 76 | 77 | ace_path = args.ace 78 | ner_path = args.ner 79 | 80 | if not os.path.exists(ner_path): 81 | os.makedirs(ner_path) 82 | 83 | file_names = [] 84 | if os.path.isdir(ace_path): 85 | file_names = [item[:-4] 86 | for item in os.listdir(ace_path) 87 | if item.endswith(".csv")] 88 | else: 89 | file_names = [ace_path] 90 | 91 | for f in file_names: 92 | print(f) 93 | ace_file= os.path.join(ace_path, f+".csv") 94 | ner_file = os.path.join(ner_path, f+".ner") 95 | 96 | if os.path.exists(ace_file): 97 | write_ner(ace_file, ner_file) -------------------------------------------------------------------------------- /ltf2sent.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | import os 3 | import argparse 4 | import sys 5 | import xml.etree.ElementTree as ET 6 | 7 | 8 | def ltf2sent(ltf_str): 9 | doc_tokens = load_ltf(ltf_str.encode('utf-8')) 10 | 11 | all_sents = [] 12 | for sent in doc_tokens: 13 | sent_res = [] 14 | for token in sent: 15 | t_text = token[0] 16 | if not t_text.strip(): 17 | continue 18 | if t_text is None: 19 | t_text = '' 20 | # get token bio tag 21 | sent_res.append(t_text) 22 | all_sents.append(' '.join(sent_res)) 23 | 24 | return '\n'.join(all_sents) 25 | 26 | 27 | def load_ltf(ltf_str): 28 | doc_tokens = [] 29 | root = ET.fromstring(ltf_str) 30 | doc_id = root.find('DOC').get('id') 31 | for seg in root.find('DOC').find('TEXT').findall('SEG'): 32 | sent_tokens = [] 33 | seg_text = 
seg.find('ORIGINAL_TEXT').text 34 | seg_start = int(seg.get('start_char')) 35 | seg_end = int(seg.get('end_char')) 36 | for token in seg.findall('TOKEN'): 37 | token_text = token.text 38 | start_char = int(token.get('start_char')) 39 | end_char = int(token.get('end_char')) 40 | 41 | assert seg_text[start_char-seg_start:end_char-seg_start+1] == token_text, \ 42 | 'ltf2bio load_ltf token offset error.' 43 | 44 | sent_tokens.append((token_text, doc_id, start_char, end_char)) 45 | doc_tokens.append(sent_tokens) 46 | 47 | return doc_tokens 48 | 49 | 50 | def write2file(bio_str, out_file): 51 | with codecs.open(out_file, 'w', 'utf-8') as f: 52 | f.write(bio_str) 53 | 54 | 55 | if __name__ == "__main__": 56 | parser = argparse.ArgumentParser() 57 | parser.add_argument('--ltf', type=str, 58 | help='ltf input path') 59 | parser.add_argument('--sent', type=str, 60 | help='output path') 61 | parser.add_argument('--ltf_filelist', type=str, 62 | help='ltf filelist path') 63 | parser.add_argument('-s', '--separate_output', action='store_true', default=True, 64 | help='separate output') 65 | 66 | args = parser.parse_args() 67 | 68 | ltf_input = args.ltf 69 | output = args.sent 70 | ltf_filelist = args.ltf_filelist 71 | separate_output = args.separate_output 72 | 73 | ltf_fp = [] 74 | if os.path.isdir(ltf_input): 75 | if not os.path.exists(output): 76 | os.makedirs(output) 77 | if args.ltf_filelist: 78 | ltf_filelist = open(args.ltf_filelist).read().splitlines() 79 | ltf_fp = [os.path.join(ltf_input, item) 80 | for item in ltf_filelist] 81 | else: 82 | ltf_fp = [os.path.join(ltf_input, item) 83 | for item in os.listdir(args.ltf) 84 | if '.ltf.xml' in item] 85 | else: 86 | ltf_fp = [ltf_input] 87 | 88 | res = [] 89 | for i, filepath in enumerate(ltf_fp): 90 | 91 | assert os.path.exists(filepath) 92 | 93 | print(filepath) 94 | 95 | ltf_str = codecs.open(filepath, 'r', 'utf-8').read() 96 | bio_str = ltf2sent(ltf_str) 97 | if separate_output: 98 | out_file = os.path.join( 99 | output, os.path.basename(filepath).replace('.ltf.xml', '') 100 | ) 101 | write2file(bio_str, out_file) 102 | res.append(bio_str) 103 | 104 | sys.stdout.write('%d docs processed.\r' % i) 105 | sys.stdout.flush() 106 | 107 | if not separate_output: 108 | write2file('\n\n'.join(res), args.output) 109 | 110 | print('%d docs processed in total.' 
% len(ltf_fp)) 111 | -------------------------------------------------------------------------------- /ltf2bio.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | import os 3 | import argparse 4 | import sys 5 | import xml.etree.ElementTree as ET 6 | 7 | 8 | def ltf2bio(ltf_str): 9 | doc_tokens = load_ltf(ltf_str.encode('utf-8')) 10 | 11 | bio = [] 12 | for sent in doc_tokens: 13 | sent_res = [] 14 | for token in sent: 15 | t_text = token[0] 16 | if not t_text.strip(): 17 | continue 18 | if t_text is None: 19 | t_text = '' 20 | t_doc_id = token[1] 21 | t_start_char = token[2] 22 | t_end_char = token[3] 23 | 24 | # get token bio tag 25 | sent_res.append(' '.join([t_text, 26 | '%s:%s-%s' % (t_doc_id, 27 | t_start_char, 28 | t_end_char)])) 29 | bio.append('\n'.join(sent_res)) 30 | 31 | return '\n\n'.join(bio) 32 | 33 | 34 | def load_ltf(ltf_str): 35 | doc_tokens = [] 36 | root = ET.fromstring(ltf_str) 37 | doc_id = root.find('DOC').get('id') 38 | for seg in root.find('DOC').find('TEXT').findall('SEG'): 39 | sent_tokens = [] 40 | seg_text = seg.find('ORIGINAL_TEXT').text 41 | seg_start = int(seg.get('start_char')) 42 | seg_end = int(seg.get('end_char')) 43 | for token in seg.findall('TOKEN'): 44 | token_text = token.text 45 | start_char = int(token.get('start_char')) 46 | end_char = int(token.get('end_char')) 47 | 48 | assert seg_text[start_char-seg_start:end_char-seg_start+1] == token_text, \ 49 | 'ltf2bio load_ltf token offset error.' 50 | 51 | sent_tokens.append((token_text, doc_id, start_char, end_char)) 52 | doc_tokens.append(sent_tokens) 53 | 54 | return doc_tokens 55 | 56 | 57 | def write2file(bio_str, out_file): 58 | with codecs.open(out_file, 'w', 'utf-8') as f: 59 | f.write(bio_str) 60 | 61 | 62 | if __name__ == "__main__": 63 | parser = argparse.ArgumentParser() 64 | parser.add_argument('--ltf', type=str, 65 | help='ltf input path') 66 | parser.add_argument('--bio', type=str, 67 | help='output path') 68 | parser.add_argument('--ltf_filelist', type=str, 69 | help='ltf filelist path') 70 | parser.add_argument('-s', '--separate_output', action='store_true', default=True, 71 | help='separate output') 72 | 73 | args = parser.parse_args() 74 | 75 | ltf_input = args.ltf 76 | output = args.bio 77 | ltf_filelist = args.ltf_filelist 78 | separate_output = args.separate_output 79 | 80 | ltf_fp = [] 81 | if os.path.isdir(ltf_input): 82 | if not os.path.exists(output): 83 | os.makedirs(output) 84 | if args.ltf_filelist: 85 | ltf_filelist = open(args.ltf_filelist).read().splitlines() 86 | ltf_fp = [os.path.join(ltf_input, item) 87 | for item in ltf_filelist] 88 | else: 89 | ltf_fp = [os.path.join(ltf_input, item) 90 | for item in os.listdir(args.ltf) 91 | if '.ltf.xml' in item] 92 | else: 93 | ltf_fp = [ltf_input] 94 | 95 | res = [] 96 | for i, filepath in enumerate(ltf_fp): 97 | 98 | assert os.path.exists(filepath) 99 | 100 | print(filepath) 101 | 102 | ltf_str = codecs.open(filepath, 'r', 'utf-8').read() 103 | bio_str = ltf2bio(ltf_str) 104 | if separate_output: 105 | out_file = os.path.join( 106 | output, os.path.basename(filepath).replace('.ltf.xml', '.bio') 107 | ) 108 | write2file(bio_str, out_file) 109 | res.append(bio_str) 110 | 111 | sys.stdout.write('%d docs processed.\r' % i) 112 | sys.stdout.flush() 113 | 114 | if not separate_output: 115 | write2file('\n\n'.join(res), args.output) 116 | 117 | print('%d docs processed in total.' 
% len(ltf_fp)) 118 | -------------------------------------------------------------------------------- /source2rsd.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | html_entities = [] 5 | 6 | python_path = os.path.abspath(__file__).replace("source2rsd.py", "") 7 | with open(os.path.join(python_path, "html_entities"), 'r') as f: 8 | for line in f: 9 | parts = line.strip().split('\t') 10 | html_entities.append(parts[1]) 11 | 12 | 13 | def remove_xml_tag(source_file, rsd_file, data): 14 | out = open(rsd_file, 'w') 15 | signal = 0 # 0 before read , 1 after 16 | lines = [] 17 | with open(source_file, 'r') as f: 18 | for line in f: 19 | line = line.strip('\n') 20 | if line == "": 21 | signal = 1 22 | if signal == 0: 23 | new_line = remove_tag(line, data, signal) 24 | out.write(new_line + " ") 25 | elif signal == 1: 26 | lines.append(line) 27 | con_line = ' '.join(lines) 28 | new_line = remove_tag(con_line, data, signal) 29 | out.write(new_line + " ") 30 | out.close() 31 | 32 | 33 | def remove_tag(sent, data, signal): 34 | newsent = sent 35 | if data == 'ace' or data.lower() == 'ace': 36 | # keep text only after 37 | 38 | if (newsent.startswith("") or newsent.startswith("") 39 | or newsent.startswith("") or newsent.startswith("")): 40 | while "<" in newsent and ">" in newsent and newsent.index("<") < newsent.index(">"): 41 | index1 = newsent.index("<") 42 | index2 = newsent.index(">") 43 | str1 = newsent[0:index1] 44 | str2 = newsent[index2+1:] 45 | newsent = str1+str2 46 | else: 47 | while "<" in newsent and ">" in newsent and newsent.index("<") < newsent.index(">"): 48 | index1 = newsent.index("<") 49 | index2 = newsent.index(">") 50 | str1 = newsent[0:index1] 51 | str2 = newsent[index2+1:] 52 | newsent = str1+str2 53 | 54 | if signal == 0: 55 | newsent = ''.join(len(newsent) * [' ']) 56 | 57 | elif data == 'ere' or data.lower() == 'ere': 58 | # replace html entities 59 | for ent in html_entities: 60 | space_str = ''.join(len(ent)*[' ']) 61 | newsent = newsent.replace(ent, space_str) 62 | tags = ["", " ") 67 | newsent = newsent.replace("=", " ") 68 | newsent = newsent.replace("\"", " ") 69 | 70 | tags1 = ["" in newsent and newsent.index(tag)"): 73 | idx1 = newsent.index(tag) 74 | idx2 = newsent.index(">") 75 | subsent1 = newsent[0:idx1] 76 | subsent2 = newsent[idx2+1:] 77 | subsent3 = newsent[idx1:idx2+1] 78 | spaces_str = ''.join(len(subsent3) * [' ']) 79 | newsent = subsent1 + spaces_str + subsent2 80 | return newsent 81 | 82 | 83 | if __name__ == "__main__": 84 | parser = argparse.ArgumentParser() 85 | parser.add_argument('--source', type=str, 86 | help='input path') 87 | parser.add_argument('--rsd', type=str, 88 | help='rsd path') 89 | parser.add_argument('--data', type=str, 90 | help='ace or ere') 91 | parser.add_argument('--extension', type=str, default=".sgm", 92 | help='') 93 | 94 | args = parser.parse_args() 95 | 96 | source_path = args.source 97 | rsd_path = args.rsd 98 | data = args.data 99 | suffix = args.extension 100 | 101 | if not os.path.exists(rsd_path): 102 | os.makedirs(rsd_path) 103 | 104 | file_names = [] 105 | if os.path.isdir(source_path): 106 | file_names = [item for item in os.listdir(source_path) if item.endswith(suffix)] 107 | else: 108 | file_names = [source_path] 109 | 110 | for f in file_names: 111 | source_file= os.path.join(source_path, f) 112 | rsd_file = os.path.join(rsd_path, f) 113 | 114 | if os.path.exists(source_file): 115 | remove_xml_tag(source_file, rsd_file, data) 
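A minimal sketch of driving Step 1 programmatically through remove_xml_tag from source2rsd.py above, rather than via the command line. The .sgm filename and output directory are hypothetical placeholders, and the sketch assumes it is run from the repository root so that source2rsd.py (and its html_entities file) is importable.

```
# Hedged example: call Step 1 directly from Python instead of the CLI.
# remove_xml_tag(source_file, rsd_file, data) is defined in source2rsd.py above;
# the input file and output directory below are illustrative placeholders.
import os

from source2rsd import remove_xml_tag

source_file = "ace_source/CNN_CF_20030303.1900.00.sgm"  # hypothetical ACE .sgm input
rsd_dir = "rsd_out"                                     # hypothetical output directory
os.makedirs(rsd_dir, exist_ok=True)
rsd_file = os.path.join(rsd_dir, os.path.basename(source_file))

# Writes the de-tagged text (the rsd file) for this ACE document.
remove_xml_tag(source_file, rsd_file, "ace")
```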
-------------------------------------------------------------------------------- /tokenizer.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | import os 3 | import jieba 4 | import nltk 5 | import re 6 | import itertools 7 | import unicodedata as ud 8 | 9 | 10 | class Tokenizer(object): 11 | def __init__(self, seg_option="linebreak", tok_option="unitok"): 12 | self.segmenters = {'linebreak': self.seg_linebreak, 13 | 'nltk': self.seg_nltk, 14 | 'cmn': self.seg_cmn, 15 | 'edl_spanish': self.seg_edl_spanish, 16 | 'edl_cmn': self.seg_edl_cmn, 17 | 'nltk+linebreak': self.seg_nltk_linebreak, 18 | 'tigrinya': self.seg_tigrinya 19 | } 20 | self.tokenizers = {'unitok': self.tok_unitok, 21 | 'unitok_cut': self.tok_unitok_cut, 22 | 'regexp': self.tok_regexp, 23 | 'nltk_wordpunct': self.tok_nltk_wordpunct, 24 | 'space': self.tok_space, 25 | 'char': self.tok_char, 26 | 'jieba': self.tok_jieba, 27 | } 28 | 29 | self.root_dir = os.path.dirname(os.path.abspath(__file__)) 30 | 31 | self.seg_option = seg_option 32 | self.tok_option = tok_option 33 | 34 | # initialize jieba cn tok 35 | if tok_option == 'jieba': 36 | jieba.initialize() 37 | 38 | def run_segmenter(self, plain_text): 39 | # right strip plain text 40 | plain_text = plain_text.rstrip() 41 | 42 | # run segmenter 43 | sents = self.segmenters[self.seg_option](plain_text) 44 | 45 | sents = [s for s in sents if s.strip()] 46 | 47 | return sents 48 | 49 | def run_tokenizer(self, sents): 50 | # right strip each sent 51 | for i in range(len(sents)): 52 | sents[i] = sents[i].rstrip() 53 | 54 | # run tokenizer 55 | tokenized_sents = self.tokenizers[self.tok_option](sents) 56 | 57 | for i, s in enumerate(tokenized_sents): 58 | s = [t for t in s if t.strip()] 59 | tokenized_sents[i] = s 60 | 61 | return tokenized_sents 62 | 63 | # 64 | # segmenters 65 | # 66 | def seg_linebreak(self, plain_text): 67 | """ 68 | use "\n" as delimiter 69 | :param plain_text: 70 | :return: 71 | """ 72 | result = [item.strip() for item in plain_text.split('\n') if item.strip()] 73 | 74 | return result 75 | 76 | def seg_nltk(self, plain_text): 77 | """ 78 | use nltk default segmenter 79 | :param plain_text: 80 | :return: 81 | """ 82 | result = [item.strip() for item in nltk.sent_tokenize(plain_text)] 83 | 84 | return result 85 | 86 | def seg_nltk_linebreak(self, plain_text): 87 | """ 88 | use nltk segmenter and then use "\n" as delimiter to re-segment. 
89 | :param plain_text: 90 | :return: 91 | """ 92 | nltk_result = '\n'.join(self.seg_nltk(plain_text)) 93 | linebreak_result = self.seg_linebreak(nltk_result) 94 | 95 | return linebreak_result 96 | 97 | def seg_cmn(self, plain_text): 98 | """ 99 | use Chinese punctuation as delimiter 100 | :param plain_text: 101 | :return: 102 | """ 103 | res = [] 104 | sent_end_char = [u'。', u'!', u'?'] 105 | current_sent = '' 106 | for i, char in enumerate(list(plain_text)): 107 | if char in sent_end_char or i == len(list(plain_text)) - 1: 108 | res.append(current_sent + char) 109 | current_sent = '' 110 | else: 111 | current_sent += char 112 | 113 | return [item.strip() for item in res] 114 | 115 | def seg_edl(self, plain_text, seg_option): 116 | # replace \n with ' ' because of the fix line length of edl data 117 | # plain_text = plain_text.replace('\n', ' ') 118 | 119 | # do sentence segmentation 120 | if seg_option == 'edl_spanish': 121 | # use nltk sent tokenization for spanish 122 | tmp_seg = nltk.sent_tokenize(plain_text) 123 | if seg_option == 'edl_cmn': 124 | # use naive sent tokenization for chinese 125 | tmp_seg = self.seg_cmn(plain_text) 126 | 127 | # recover \n after xml tag 128 | recovered_tmp_seg = [] 129 | for sent in tmp_seg: 130 | sent = sent.replace('> ', '>\n').replace(' <', '\n<') 131 | sent = sent.split('\n') 132 | recovered_tmp_seg += [item.strip() for item in sent] 133 | 134 | return recovered_tmp_seg 135 | 136 | def seg_edl_spanish(self, plain_text): 137 | return self.seg_edl(plain_text, 'edl_spanish') 138 | 139 | def seg_edl_cmn(self, plain_text): 140 | return self.seg_edl(plain_text, 'edl_cmn') 141 | 142 | def seg_tigrinya(self, plain_text): 143 | result = [item.strip() for item in plain_text.split('\n') if 144 | item.strip()] 145 | 146 | updated_result = [] 147 | for r in result: 148 | if '።' in r: 149 | sents = [] 150 | start = 0 151 | for i, char in enumerate(r): 152 | if char == '።': 153 | sents.append(r[start:i+1]) 154 | start = i + 1 155 | updated_result += sents 156 | else: 157 | updated_result.append(r) 158 | 159 | return updated_result 160 | 161 | # 162 | # tokenizers 163 | # 164 | def tok_unitok(self, sents): 165 | res = [] 166 | for s in sents: 167 | s = unitok_tokenize(s).split() 168 | res.append(s) 169 | 170 | return res 171 | 172 | def tok_unitok_cut(self, sents): 173 | res = [] 174 | num_sent_cut = 0 175 | for s in sents: 176 | s = unitok_tokenize(s).split() 177 | if len(s) > 80: 178 | sub_sents = [item.split() for item in nltk.sent_tokenize(' '.join(s))] 179 | assert sum([len(item) for item in sub_sents]) == len(s) 180 | 181 | # sub_sent = [list(group) for k, group in 182 | # itertools.groupby(s, lambda x: x == ".") if not k] 183 | res += sub_sents 184 | if len(sub_sents) > 1: 185 | num_sent_cut += 1 186 | else: 187 | res.append(s) 188 | print('%d sentences longer than 80 and cut by delimiter ".".') 189 | return res 190 | 191 | def tok_regexp(self, sents): 192 | result = [] 193 | for s in sents: 194 | tokenizer = nltk.tokenize.RegexpTokenizer('\w+|\$[\d\.]+|\S+') 195 | tokenization_out = tokenizer.tokenize(s) 196 | result.append(tokenization_out) 197 | 198 | return result 199 | 200 | def tok_nltk_wordpunct(self, sents): 201 | result = [] 202 | for s in sents: 203 | tokenizer = nltk.tokenize.WordPunctTokenizer() 204 | tokenization_out = tokenizer.tokenize(s) 205 | result.append(tokenization_out) 206 | return result 207 | 208 | def tok_space(self, sents): 209 | result = [] 210 | for s in sents: 211 | tokenization_out = s.split(' ') 212 | 
result.append(tokenization_out) 213 | return result 214 | 215 | def tok_char(self, sents): 216 | result = [] 217 | for s in sents: 218 | tokenization_out = list(s) 219 | result.append(tokenization_out) 220 | return result 221 | 222 | def tok_jieba(self, sents): 223 | result = [] 224 | for s in sents: 225 | raw_tokenization_out = list(jieba.cut(s)) 226 | result.append(raw_tokenization_out) 227 | return result 228 | 229 | 230 | # by Jon May 231 | def unitok_tokenize(data): 232 | toks = [] 233 | for offset, char in enumerate(data): 234 | cc = ud.category(char) 235 | # separate text by punctuation or symbol 236 | if char in ['ʼ', '’', '‘', '´', '′', "'"]: # do not tokenize oromo apostrophe 237 | toks.append(char) 238 | elif cc.startswith("P") or cc.startswith("S") \ 239 | or char in ['።', '፡']: # Tigrinya period and comma 240 | toks.append(' ') 241 | toks.append(char) 242 | toks.append(' ') 243 | else: 244 | toks.append(char) 245 | 246 | toks = [item for item in ''.join(toks).split() if item] 247 | 248 | return ' '.join(toks) -------------------------------------------------------------------------------- /rsd2ltf.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | import os 3 | import argparse 4 | import sys 5 | import itertools 6 | import xml.dom.minidom 7 | import xml.etree.ElementTree as ET 8 | import codecs 9 | 10 | # dirty import from current dir 11 | script_dirname = os.path.dirname(os.path.abspath(__file__)) 12 | sys.path.append(script_dirname) 13 | from tokenizer import Tokenizer 14 | 15 | 16 | def rsd2ltf(rsd_str, doc_id, 17 | seg_option='linebreak', 18 | tok_option='unitok', 19 | re_segment=False): 20 | tokenizer = Tokenizer(seg_option, tok_option) 21 | 22 | if re_segment: 23 | # running segmentation and tokenization, then re-segment the tokenized 24 | # sentences (use space to concatenate tokens. this solves segmentation 25 | # problem, e.g. How are you?I'm fine.). 26 | sents = tokenizer.run_segmenter(rsd_str) 27 | raw_tokens = tokenizer.run_tokenizer(sents) 28 | 29 | # re-segment tokenized sentence 30 | num_sent_reseg = 0 31 | tokens = [] 32 | for i, t in enumerate(raw_tokens): 33 | reseg = [item.split() for item in tokenizer.run_segmenter(' '.join(t))] 34 | if len(reseg) > 1: 35 | num_sent_reseg += 1 36 | 37 | tokens += reseg 38 | 39 | # compute offset for each token 40 | indexer = 0 41 | token_offset = [] 42 | for i, t in enumerate(itertools.chain(*tokens)): 43 | while not rsd_str[indexer:].startswith(t) and \ 44 | indexer < len(rsd_str): 45 | indexer += 1 46 | if indexer < len(rsd_str): 47 | t_start = indexer 48 | t_end = t_start + len(t) - 1 49 | assert rsd_str[t_start:t_end + 1] == t, \ 50 | "re_segment token offset not match %s-%d" % (doc_id, i) 51 | token_offset.append((t_start, t_end)) 52 | indexer = t_end + 1 53 | 54 | assert len(token_offset) == len(list(itertools.chain(*tokens))), \ 55 | "re_segment tokenization offset error in: %s" % doc_id 56 | 57 | # recover sent using tokens 58 | sents = [] 59 | prev_token_end = token_offset[0][0]-1 60 | token_index = 0 61 | for i, t in enumerate(tokens): 62 | sent = '' 63 | for j, item in enumerate(t): 64 | if j == 0: 65 | prev_token_end = token_offset[token_index][0] - 1 66 | 67 | sent += ' ' * (token_offset[token_index][0] - prev_token_end - 1) + item 68 | 69 | prev_token_end = token_offset[token_index][1] 70 | 71 | token_index += 1 72 | 73 | assert sent in rsd_str, \ 74 | 're_segment sentence offset error.' 
75 | 76 | sents.append(sent) 77 | 78 | else: 79 | # running segmentation and tokenization 80 | sents = tokenizer.run_segmenter(rsd_str) 81 | tokens = tokenizer.run_tokenizer(sents) 82 | 83 | # generate offset for sentences and tokens 84 | indexer = 0 85 | sent_offset = [] 86 | for i, s in enumerate(sents): 87 | while not rsd_str[indexer:].startswith(s) and indexer < len(rsd_str): 88 | indexer += 1 89 | if indexer < len(rsd_str): 90 | sent_start = indexer 91 | sent_end = sent_start + len(s) - 1 92 | assert rsd_str[sent_start:sent_end+1] == s, \ 93 | "sentence offset not match %s-%d" % (doc_id, i) 94 | sent_offset.append((sent_start, sent_end)) 95 | indexer = sent_end + 1 96 | 97 | assert len(sent_offset) == len(sents), \ 98 | "sentence segmentation offset error in: %s" % doc_id 99 | 100 | token_offsets = [] 101 | for i, tok in enumerate(tokens): 102 | sent_text = sents[i] 103 | indexer = 0 104 | t_offset = [] 105 | for j, t in enumerate(tok): 106 | while not sent_text[indexer:].startswith(t) and \ 107 | indexer < len(sent_text): 108 | indexer += 1 109 | if indexer < len(sent_text): 110 | t_start = indexer 111 | t_end = t_start + len(t) - 1 112 | assert sent_text[t_start:t_end+1] == t, \ 113 | "token offset not match %s-%d-%d" % (doc_id, i, j) 114 | t_offset.append((t_start, t_end)) 115 | indexer = t_end + 1 116 | token_offsets.append(t_offset) 117 | 118 | assert len(t_offset) == len(tok), \ 119 | "tokenization offset error in: %s-%d" % (doc_id, i) 120 | 121 | # convert seg/tok result to ltf 122 | root = ET.Element('LCTL_TEXT') 123 | doc_element = ET.Element('DOC', {'id': doc_id}) 124 | text_element = ET.Element('TEXT') 125 | root.append(doc_element) 126 | doc_element.append(text_element) 127 | 128 | for i in range(len(sents)): 129 | seg_text = sents[i] 130 | seg_start_char = sent_offset[i][0] 131 | seg_end_char = sent_offset[i][1] 132 | 133 | seg_id = '%s-%s' % (doc_id, str(i)) 134 | 135 | seg_element = ET.Element('SEG', {'id': seg_id, 136 | 'start_char': str(seg_start_char), 137 | 'end_char': str(seg_end_char)}) 138 | original_text_element = ET.Element('ORIGINAL_TEXT') 139 | original_text_element.text = seg_text 140 | seg_element.append(original_text_element) 141 | 142 | for j in range(len(tokens[i])): 143 | token_id = 'token-%d-%d' % (i, j) 144 | tok_text = tokens[i][j] 145 | if not tok_text: 146 | continue 147 | tok_start_char = int(token_offsets[i][j][0]) + seg_start_char 148 | tok_end_char = int(token_offsets[i][j][1]) + seg_start_char 149 | 150 | assert rsd_str[tok_start_char:tok_end_char+1] == tok_text 151 | 152 | token_element = ET.Element('TOKEN', 153 | {'id': token_id, 154 | 'start_char': str(tok_start_char), 155 | 'end_char': str(tok_end_char)}) 156 | token_element.text = tok_text 157 | seg_element.append(token_element) 158 | 159 | text_element.append(seg_element) 160 | 161 | return root 162 | 163 | 164 | def write2file(ltf_root, out_file): 165 | # pretty print xml 166 | root_str = ET.tostring(ltf_root, 'utf-8') 167 | f_xml = xml.dom.minidom.parseString(root_str) 168 | pretty_xml_as_string = f_xml.toprettyxml(encoding="utf-8") 169 | f = open(out_file, 'wb') 170 | f.write(pretty_xml_as_string) 171 | f.close() 172 | 173 | 174 | if __name__ == "__main__": 175 | parser = argparse.ArgumentParser() 176 | parser.add_argument('--rsd', type=str, 177 | help='input rsd file path or directory.') 178 | parser.add_argument('--ltf', type=str, 179 | help='output ltf file path or directory.') 180 | t = Tokenizer() 181 | parser.add_argument('--seg_option', default='nltk+linebreak', 182 | 
help="segmentation options: %s (default is linebreak)" % 183 | ', '.join(t.segmenters.keys())) 184 | parser.add_argument('--tok_option', default='unitok', 185 | help="tokenization options: %s (default is unitok)" % 186 | ', '.join(t.tokenizers.keys())) 187 | parser.add_argument('--extension', default=".sgm", 188 | help="extension of rsd file") 189 | parser.add_argument('--re_segment', action='store_true', default=False, 190 | help='first run tokenizaiton, and then segmentation.') 191 | 192 | args = parser.parse_args() 193 | 194 | input_rsd = args.rsd 195 | output_ltf = args.ltf 196 | seg_option = args.seg_option 197 | tok_option = args.tok_option 198 | extension = args.extension 199 | re_segment = args.re_segment 200 | 201 | rsd_files = [] 202 | output_files = [] 203 | if os.path.isdir(input_rsd): 204 | if not os.path.exists(output_ltf): 205 | os.makedirs(output_ltf) 206 | 207 | for fn in os.listdir(input_rsd): 208 | if extension not in fn: 209 | continue 210 | rsd_files.append(os.path.join(input_rsd, fn)) 211 | output_files.append(os.path.join(output_ltf, 212 | fn.replace(extension, '.ltf.xml'))) 213 | else: 214 | rsd_files = [input_rsd] 215 | output_files = [output_ltf] 216 | 217 | for k, rsd_f in enumerate(rsd_files): 218 | try: 219 | rsd_str = codecs.open(rsd_f, 'r', 'utf-8').read() 220 | 221 | doc_id = os.path.basename(rsd_f).replace(extension, '') 222 | 223 | ltf_root = rsd2ltf(rsd_str, doc_id, seg_option, tok_option, 224 | re_segment) 225 | 226 | write2file(ltf_root, output_files[k]) 227 | 228 | except AssertionError as e: 229 | print(e) 230 | 231 | sys.stdout.write('%d files processed.\r' % k) 232 | sys.stdout.flush() 233 | 234 | sys.stdout.write('%d files processed.' % len(rsd_files)) 235 | -------------------------------------------------------------------------------- /ace2event.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import csv 4 | 5 | import spacy 6 | from spacy.tokens import Doc 7 | 8 | 9 | class WhitespaceTokenizer(object): 10 | def __init__(self, vocab): 11 | self.vocab = vocab 12 | 13 | def __call__(self, text): 14 | words = text.split(' ') 15 | # All tokens 'own' a subsequent space character in this tokenizer 16 | spaces = [True] * len(words) 17 | return Doc(self.vocab, words=words, spaces=spaces) 18 | 19 | 20 | def write_event(ace_file, trigger_file, arg_file, dep, nlp): 21 | all_sents = [] 22 | sent = [] 23 | with open(ace_file, 'r') as csv_file: 24 | reader = csv.DictReader(csv_file) 25 | for row in reader: 26 | token = row["token"] 27 | if token == "----sentence_delimiter----": 28 | all_sents.append(sent) 29 | sent = [] 30 | else: 31 | token_offset_parts = row["offset"].split(':') 32 | offset_parts = token_offset_parts[1].split('-') 33 | token_start = int(offset_parts[0]) 34 | token_end = int(offset_parts[1]) 35 | 36 | ner_tag = "O" 37 | if row["ner_type"] != "O": 38 | ner_offset = row["ner_offset"].split(":") 39 | ner_start = int(ner_offset[0]) 40 | if "#@#" in ner_offset[1]: 41 | ner_offset_parts = ner_offset[1].split("#@#") 42 | ner_end = int(ner_offset_parts[0]) 43 | else: 44 | ner_end = int(ner_offset[1]) 45 | ner_type_parts = row["ner_type"].split(":") 46 | ner_tag = ner_type_parts[0] + "-" + determine_tag(token_start, token_end, ner_start, ner_end) 47 | if row["trigger_type"] == "O": 48 | sent.append(row["token"] + "\t" + row["offset"] + "\t" + row["trigger_type"] + "\t" + 49 | row["trigger_arguments"] + "\t" + ner_tag) 50 | else: 51 | event_offset_parts = 
row["trigger_offset"].split(':') 52 | event_start = int(event_offset_parts[0]) 53 | event_end = int(event_offset_parts[1]) 54 | event_type_parts = row["trigger_type"].split(":") 55 | tag = event_type_parts[1] + "-" + determine_tag(token_start, token_end, event_start, event_end) 56 | sent.append(row["token"] + "\t" + row["offset"] + "\t" + tag + "\t" + row["trigger_arguments"] 57 | + "\t" + ner_tag) 58 | if len(sent) > 0: 59 | all_sents.append(sent) 60 | sent = [] 61 | 62 | vtag_all_sents = validate_tags(all_sents) # check if a mention starts with "I" without "B" 63 | vseg_all_sents = validate_sent_seg(vtag_all_sents) # check if an event mention occurs in separate sents 64 | 65 | # write trigger and argument file 66 | out_trigger = open(trigger_file, 'w') 67 | out_arg = open(arg_file, 'w') 68 | 69 | for i in range(len(vseg_all_sents)): 70 | sent_id = i 71 | current_sent = vseg_all_sents[i] 72 | 73 | tok_idx2token = {} 74 | tok_idx2offset = {} 75 | tok_idx2label = {} 76 | tok_idx2ner = {} 77 | trigger_b2i = {} 78 | # write triggers 79 | pre_b_idx = -1 80 | for t in range(len(current_sent)): 81 | parts = current_sent[t].strip().split('\t') 82 | out_trigger.write(str(sent_id) + '\t' + str(t) + '\t' + parts[0] + '\t' + parts[1] + '\t' + parts[2] + "\n") 83 | tok_idx2offset[t] = parts[1] 84 | tok_idx2token[t] = parts[0] 85 | tok_idx2label[t] = parts[2] 86 | tok_idx2ner[t] = parts[-1] 87 | if parts[2].endswith('B'): 88 | pre_b_idx = t 89 | trigger_b2i[t] = [t] 90 | elif parts[2].endswith('O'): 91 | pre_b_idx = -1 92 | elif parts[2].endswith('I'): 93 | tmp = trigger_b2i[pre_b_idx] 94 | tmp.append(t) 95 | trigger_b2i[pre_b_idx] = tmp 96 | out_trigger.write('\n') 97 | 98 | # write arguments 99 | trigger2arg2role_idx = {} 100 | for t in range(len(current_sent)): 101 | parts = current_sent[t].strip().split('\t') 102 | if parts[2].endswith("B"): 103 | e1_idx = t 104 | arg_str = parts[3] 105 | if arg_str != 'O': 106 | args = arg_str.split(' ') 107 | for arg in args: 108 | arg_parts = arg.split(':') 109 | start = int(arg_parts[2]) 110 | end = int(arg_parts[3]) 111 | role = arg_parts[1] 112 | e2_idx_set = search_e2(tok_idx2offset, start, end) 113 | e1_idx_set = trigger_b2i[e1_idx] 114 | e2_idx = e2_idx_set[0] 115 | if e1_idx in trigger2arg2role_idx: 116 | arg2role = trigger2arg2role_idx[e1_idx] 117 | arg2role[e2_idx] = role + "-B" 118 | trigger2arg2role_idx[e1_idx] = arg2role 119 | else: 120 | arg2role = {e2_idx: role + "-B"} 121 | trigger2arg2role_idx[e1_idx] = arg2role 122 | 123 | for e2_idx_tmp in e2_idx_set[1:]: 124 | if e1_idx in trigger2arg2role_idx: 125 | arg2role = trigger2arg2role_idx[e1_idx] 126 | arg2role[e2_idx_tmp] = role + "-I" 127 | trigger2arg2role_idx[e1_idx] = arg2role 128 | else: 129 | arg2role = {e2_idx_tmp: role + "-I"} 130 | trigger2arg2role_idx[e1_idx] = arg2role 131 | 132 | for e1_idx_tmp in e1_idx_set[1:]: 133 | for e2_idx_tmp in e2_idx_set: 134 | if e1_idx_tmp in trigger2arg2role_idx: 135 | arg2role = trigger2arg2role_idx[e1_idx_tmp] 136 | arg2role[e2_idx_tmp] = role + "-I" 137 | trigger2arg2role_idx[e1_idx_tmp] = arg2role 138 | else: 139 | arg2role = {e2_idx_tmp: role + "-I"} 140 | trigger2arg2role_idx[e1_idx_tmp] = arg2role 141 | 142 | mod2head2dep = {} 143 | if dep: 144 | sent = ' '.join([t.split('\t')[0] for t in current_sent]) 145 | doc_sent = nlp(sent) 146 | 147 | for i in range(len(doc_sent)): 148 | mod2head2dep[i] = {doc_sent[i].head.i:doc_sent[i].dep_} 149 | assert len(doc_sent) == len(current_sent) 150 | 151 | for t1 in range(len(current_sent)): 152 | e1_idx = t1 153 | 
e1_token = tok_idx2token[t1] 154 | e1_offset = tok_idx2offset[t1] 155 | e1_label = tok_idx2label[t1] 156 | for t2 in range(len(current_sent)): 157 | e2_idx = t2 158 | e2_token = tok_idx2token[t2] 159 | e2_offset = tok_idx2offset[t2] 160 | e2_label = tok_idx2label[t2] 161 | e2_ner = tok_idx2ner[t2] 162 | 163 | role = "O" 164 | if t1 in trigger2arg2role_idx and t2 in trigger2arg2role_idx[t1]: 165 | role = trigger2arg2role_idx[t1][t2] 166 | 167 | if dep == "bi": 168 | if e1_idx in mod2head2dep and e2_idx in mod2head2dep[e1_idx]: 169 | out_arg.write( 170 | str(sent_id) + '\t' + str(e1_idx) + '\t' + e1_token + '\t' + e1_offset + '\t' + 171 | e1_label + '\t' + str(e2_idx) + '\t' + e2_token + '\t' + e2_offset + '\t' + 172 | e2_label + '\t' + role + '\t' + mod2head2dep[e1_idx][e2_idx] + "\t" + e2_ner + '\n') 173 | elif e2_idx in mod2head2dep and e1_idx in mod2head2dep[e2_idx]: 174 | out_arg.write( 175 | str(sent_id) + '\t' + str(e1_idx) + '\t' + e1_token + '\t' + e1_offset + '\t' + 176 | e1_label + '\t' + str(e2_idx) + '\t' + e2_token + '\t' + e2_offset + '\t' + 177 | e2_label + '\t' + role + '\t' + mod2head2dep[e2_idx][e1_idx] + "\t" + e2_ner + '\n') 178 | else: 179 | out_arg.write( 180 | str(sent_id) + '\t' + str(e1_idx) + '\t' + e1_token + '\t' + e1_offset + '\t' + 181 | e1_label + '\t' + str(e2_idx) + '\t' + e2_token + '\t' + e2_offset + '\t' + 182 | e2_label + '\t' + role + '\t' + "NA" + "\t" + e2_ner + '\n') 183 | elif dep == "un": 184 | if e1_idx in mod2head2dep and e2_idx in mod2head2dep[e1_idx]: 185 | out_arg.write( 186 | str(sent_id) + '\t' + str(e1_idx) + '\t' + e1_token + '\t' + e1_offset + '\t' + 187 | e1_label + '\t' + str(e2_idx) + '\t' + e2_token + '\t' + e2_offset + '\t' + 188 | e2_label + '\t' + role + '\t' + mod2head2dep[e1_idx][e2_idx] + "\t" + e2_ner + '\n') 189 | else: 190 | out_arg.write( 191 | str(sent_id) + '\t' + str(e1_idx) + '\t' + e1_token + '\t' + e1_offset + '\t' + 192 | e1_label + '\t' + str(e2_idx) + '\t' + e2_token + '\t' + e2_offset + '\t' + 193 | e2_label + '\t' + role + '\t' + "NA" + "\t" + e2_ner + '\n') 194 | else: 195 | out_arg.write(str(sent_id) + '\t' + str(e1_idx) + '\t' + e1_token + '\t' + e1_offset + '\t' + 196 | e1_label + '\t' + str(e2_idx) + '\t' + e2_token + '\t' + e2_offset + '\t' + 197 | e2_label + '\t' + role + '\t' + 'NA' + "\t" + e2_ner + '\n') 198 | out_arg.write("\n") 199 | 200 | out_trigger.close() 201 | out_arg.close() 202 | 203 | 204 | def search_e2(tok_idx2offset, start, end): 205 | e2_idx = [] 206 | for i in range(len(tok_idx2offset)): 207 | offset_parts = tok_idx2offset[i].split(':')[1].split('-') 208 | c_start = int(offset_parts[0]) 209 | c_end = int(offset_parts[1]) 210 | if start <= c_end <= end or start <= c_start <= end: 211 | e2_idx.append(i) 212 | return e2_idx 213 | 214 | 215 | def validate_sent_seg(all_sents): 216 | cluster_idx = 0 217 | sent2cluster = {} 218 | merge_pre = False 219 | current_merge_next = False 220 | pre_merge_next = False 221 | current_single = False 222 | for i in range(len(all_sents)): 223 | current_sent = all_sents[i] 224 | sent_min, sent_max, ann_min, ann_max = get_offset_limit(current_sent) 225 | 226 | if sent_min <= ann_min and sent_max >= ann_max: 227 | current_single = True 228 | if sent_min > ann_min: 229 | merge_pre = True 230 | if sent_max < ann_max: 231 | current_merge_next = True 232 | 233 | if merge_pre: 234 | sent2cluster[i] = cluster_idx 235 | if not merge_pre and not current_merge_next and not pre_merge_next and current_single: 236 | sent2cluster[i] = cluster_idx+1 237 | cluster_idx += 1 238 | 
if pre_merge_next: 239 | sent2cluster[i] = cluster_idx 240 | if current_merge_next and not pre_merge_next: 241 | sent2cluster[i] = cluster_idx+1 242 | cluster_idx += 1 243 | 244 | merge_pre = False 245 | current_single = False 246 | pre_merge_next = current_merge_next 247 | current_merge_next = False 248 | 249 | cluster2sent = {} 250 | cluster_list = [] 251 | for i in range(len(all_sents)): 252 | c = sent2cluster[i] 253 | if c not in cluster2sent: 254 | tmp = [i] 255 | cluster2sent[c] = tmp 256 | cluster_list.append(c) 257 | else: 258 | tmp = cluster2sent[c] 259 | tmp.append(i) 260 | 261 | new_all_sents = [] 262 | for c in cluster_list: 263 | sids = cluster2sent[c] 264 | if len(sids) > 1: 265 | print(cluster2sent) 266 | newsents = [] 267 | for s in sids: 268 | newsents += all_sents[s] 269 | new_all_sents.append(newsents) 270 | return new_all_sents 271 | 272 | 273 | def get_offset_limit(current_sent): 274 | first_tok_offset = current_sent[0].split('\t')[1].split(':')[1].split('-') 275 | sent_min = int(first_tok_offset[0]) 276 | last_tok_offset = current_sent[-1].split('\t')[1].split(':')[1].split('-') 277 | sent_max = int(last_tok_offset[1]) 278 | 279 | ann_min = 100000 280 | ann_max = 0 281 | for line in current_sent: 282 | arg_str = line.strip().split('\t')[3] 283 | if arg_str != "O": 284 | arg_parts = arg_str.split(' ') 285 | for arg in arg_parts: 286 | parts = arg.split(':') 287 | s = int(parts[2]) 288 | e = int(parts[3]) 289 | if s < ann_min: 290 | ann_min = s 291 | if e > ann_max: 292 | ann_max = e 293 | if ann_min == 100000 and ann_max == 0: 294 | ann_min = sent_min 295 | ann_max = sent_max 296 | return sent_min, sent_max, ann_min, ann_max 297 | 298 | 299 | def validate_tags(all_sents): 300 | new_all_sents = [] 301 | pre_tag = "" 302 | for sents in all_sents: 303 | new_sents = [] 304 | for i in range(len(sents)): 305 | current_line = sents[i].strip('\n') 306 | if len(current_line) == 0: 307 | new_sents.append(current_line + "\n") 308 | else: 309 | parts = current_line.split('\t') 310 | tag = parts[2] 311 | if tag.endswith("I") and not (pre_tag.endswith("B") or pre_tag.endswith("I")): 312 | print("Error " + current_line) 313 | new_line = sents[i].strip()[:-1] + "B" 314 | new_sents.append(new_line + "\n") 315 | else: 316 | new_sents.append(sents[i].strip() + "\n") 317 | pre_tag = tag 318 | new_all_sents.append(new_sents) 319 | return new_all_sents 320 | 321 | 322 | def determine_tag(token_start, token_end, ner_start, ner_end): 323 | tag = "B" 324 | if token_start <= ner_start <= token_end: 325 | tag = "B" 326 | elif ner_start < token_start <= ner_end: 327 | tag = "I" 328 | return tag 329 | 330 | 331 | if __name__ == "__main__": 332 | parser = argparse.ArgumentParser() 333 | parser.add_argument('--ace', type=str, 334 | help='ace input path') 335 | parser.add_argument('--event', type=str, 336 | help='event path') 337 | parser.add_argument('--dep', type=str, default=None, 338 | help='apply dependency parser or not') 339 | 340 | args = parser.parse_args() 341 | 342 | ace_path = args.ace 343 | event_path = args.event 344 | dep = args.dep 345 | 346 | nlp = None 347 | if dep: 348 | # import en_core_web_sm 349 | # nlp = en_core_web_sm.load() 350 | nlp = spacy.load("en_core_web_sm")# , disable=["tagger", "ner", "textcat"] 351 | nlp.tokenizer = WhitespaceTokenizer(nlp.vocab) 352 | 353 | if not os.path.exists(event_path): 354 | os.makedirs(event_path) 355 | 356 | file_names = [] 357 | if os.path.isdir(ace_path): 358 | file_names = [item[:-4] 359 | for item in os.listdir(ace_path) 360 | if 
item.endswith(".csv")] 361 | else: 362 | file_names = [ace_path] 363 | 364 | for f in file_names: 365 | print(f) 366 | ace_file= os.path.join(ace_path, f+".csv") 367 | trigger_file = os.path.join(event_path, f+".trigger") 368 | arg_file = os.path.join(event_path, f + ".arg") 369 | 370 | if os.path.exists(ace_file): 371 | write_event(ace_file, trigger_file, arg_file, dep, nlp) 372 | -------------------------------------------------------------------------------- /bio2ere.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import csv 4 | import xml.etree.ElementTree as ET 5 | 6 | 7 | # word, offset, nertag, relation, trigger, argument 8 | def write_ann(bio_file, ann_file, ace_file): 9 | csv_file = open(ace_file, 'w') 10 | fields = ['token', 'offset', 'ner_offset', 'ner_type', 'ner_nam_nom', 'ner_cluster', 11 | 'filler_offset', 'filler_type', 12 | 'relations_belong_to', 13 | 'trigger_offset', 'trigger_type', 'trigger_cluster', 'trigger_arguments'] 14 | writer = csv.DictWriter(csv_file, fieldnames=fields) 15 | writer.writeheader() 16 | 17 | entity_mentions_mentionid2dict, filler_mentions_mentionid2dict, relation_mentions_id2dict, \ 18 | event_mentions_id2dict = parse_ann(ann_file) 19 | 20 | with open(bio_file, 'r') as f: 21 | for line in f: 22 | line = line.strip() 23 | if len(line) > 0: 24 | parts = line.strip().split(' ') 25 | token = parts[0] 26 | offset = parts[1] 27 | 28 | token_dict = {'token': token, 'offset': offset} 29 | 30 | d_id, o = offset.split(':') 31 | start, end = o.split('-') 32 | start = int(start) 33 | end = int(end) 34 | 35 | entity_mention_ids = search_offset_id(start, end, entity_mentions_mentionid2dict, 'offset') 36 | filler_mention_ids = search_offset_id(start, end, filler_mentions_mentionid2dict, 'offset') 37 | relation_mention_ids = search_relation_id(start, end, relation_mentions_id2dict) 38 | event_mention_ids = search_offset_id(start, end, event_mentions_id2dict, 'trigger_offset') 39 | 40 | if len(entity_mention_ids) == 0: 41 | token_dict['ner_offset'] = 'O' 42 | token_dict['ner_type'] = 'O' 43 | token_dict['ner_nam_nom'] = 'O' 44 | token_dict['ner_cluster'] = 'O' 45 | else: 46 | ner_offsets = [] 47 | ner_types = [] 48 | ner_nam_noms = [] 49 | ner_clusters = [] 50 | 51 | for id in entity_mention_ids: 52 | ner_offsets.append(entity_mentions_mentionid2dict[id]['offset']) 53 | if str(start)+":"+str(end) == entity_mentions_mentionid2dict[id]['offset']: 54 | assert token == entity_mentions_mentionid2dict[id]['text'] 55 | ner_types.append(entity_mentions_mentionid2dict[id]['type'] + ':' + \ 56 | entity_mentions_mentionid2dict[id]['subtype']) 57 | ner_nam_noms.append(entity_mentions_mentionid2dict[id]['mention_type']) 58 | ner_clusters.append(entity_mentions_mentionid2dict[id]['entity_id']) 59 | token_dict['ner_offset'] = '#@#'.join(ner_offsets) 60 | token_dict['ner_type'] = '#@#'.join(ner_types) 61 | token_dict['ner_nam_nom'] = '#@#'.join(ner_nam_noms) 62 | token_dict['ner_cluster'] = '#@#'.join(ner_clusters) 63 | 64 | if len(filler_mention_ids) == 0: 65 | token_dict['filler_offset'] = 'O' 66 | token_dict['filler_type'] = 'O' 67 | else: 68 | filler_offsets = [] 69 | filler_types = [] 70 | for id in filler_mention_ids: 71 | filler_offsets.append(filler_mentions_mentionid2dict[id]['offset']) 72 | filler_types.append(filler_mentions_mentionid2dict[id]['type']) 73 | token_dict['filler_offset'] = '#@#'.join(filler_offsets) 74 | token_dict['filler_type'] = '#@#'.join(filler_types) 75 | 76 | if 
len(relation_mention_ids) == 0: 77 | token_dict['relations_belong_to'] = 'O' 78 | else: 79 | relation_mentions = [] 80 | for id in relation_mention_ids: 81 | relation_mention_dict = relation_mentions_id2dict[id] 82 | relation_id = relation_mention_dict['relation_id'] 83 | relation_type = relation_mention_dict['relation_type'] + ':' + \ 84 | relation_mention_dict['relation_subtype'] 85 | arg0 = relation_mention_dict['mention_argument0_offset'] 86 | arg1 = relation_mention_dict['mention_argument1_offset'] 87 | 88 | mention = relation_id + ':' + arg0 + ':' + relation_type + ':' + arg1 89 | relation_mentions.append(mention) 90 | mention_str = ' '.join(relation_mentions) 91 | token_dict['relations_belong_to'] = mention_str 92 | 93 | if len(event_mention_ids) == 0: 94 | token_dict['trigger_offset'] = 'O' 95 | token_dict['trigger_type'] = 'O' 96 | token_dict['trigger_cluster'] = 'O' 97 | token_dict['trigger_arguments'] = 'O' 98 | else: 99 | trigger_offsets = [] 100 | trigger_types = [] 101 | trigger_clusters = [] 102 | trigger_arguments_set = [] 103 | for id in event_mention_ids: 104 | trigger_offsets.append(event_mentions_id2dict[id]['trigger_offset']) 105 | if str(start)+":"+str(end) == event_mentions_id2dict[id]['trigger_offset']: 106 | assert token == event_mentions_id2dict[id]['trigger_text'] 107 | trigger_types.append(event_mentions_id2dict[id]['type'] + ':' + 108 | event_mentions_id2dict[id]['subtype']) 109 | trigger_clusters.append(event_mentions_id2dict[id]['event_id']) 110 | all_event_mention_arguments = event_mentions_id2dict[id]['argument'] 111 | arguments = [] 112 | for arg in all_event_mention_arguments: 113 | arg_str = arg['mention_argument_refid'] + ':' + arg['mention_argument_role'] + ':' + \ 114 | arg['mention_argument_offset'] 115 | arguments.append(arg_str) 116 | arguments_str = ' '.join(arguments) 117 | trigger_arguments_set.append(arguments_str) 118 | 119 | token_dict['trigger_offset'] = '#@#'.join(trigger_offsets) 120 | token_dict['trigger_type'] = '#@#'.join(trigger_types) 121 | token_dict['trigger_cluster'] = '#@#'.join(trigger_clusters) 122 | token_dict['trigger_arguments'] = '#@#'.join(trigger_arguments_set) 123 | 124 | writer.writerow(token_dict) 125 | else: 126 | token_dict = {'token':'----sentence_delimiter----'} 127 | writer.writerow(token_dict) 128 | 129 | csv_file.close() 130 | 131 | 132 | # applicable to entity, timex2, event mentions 133 | def search_offset_id(token_start, token_end, entity_mentions_mentionid2dict, offset_key): 134 | searched_ids = [] 135 | for id in entity_mentions_mentionid2dict: 136 | can_dict = entity_mentions_mentionid2dict[id] 137 | mention_offset_parts = can_dict[offset_key].split(':') 138 | can_start = int(mention_offset_parts[0]) 139 | can_end = int(mention_offset_parts[1]) 140 | if (can_start <= token_start <= can_end) or (can_start <= token_end <= can_end): 141 | searched_ids.append(id) 142 | return searched_ids 143 | 144 | 145 | def search_relation_id(token_start, token_end, relation_mentions_id2dict): 146 | searched_ids = [] 147 | for id in relation_mentions_id2dict: 148 | can_dict = relation_mentions_id2dict[id] 149 | argument0_offset_parts = can_dict['mention_argument0_offset'].split(':') 150 | argument1_offset_parts = can_dict['mention_argument1_offset'].split(':') 151 | arg0_start = int(argument0_offset_parts[0]) 152 | arg0_end = int(argument0_offset_parts[1]) 153 | arg1_start = int(argument1_offset_parts[0]) 154 | arg1_end = int(argument1_offset_parts[1]) 155 | if (arg0_start <= token_start <= arg0_end) or (arg0_start <= 
token_end <= arg0_end) or \ 156 | (arg1_start <= token_start <= arg1_end) or (arg1_start <= token_end <= arg1_end): 157 | searched_ids.append(id) 158 | return searched_ids 159 | 160 | 161 | def parse_ann(ann_file): 162 | tree = ET.parse(ann_file) 163 | root = tree.getroot() 164 | doc_elem = root[0] # entities, fillers, relations, hoppers 165 | 166 | all_entity_elems = [] 167 | all_filler_elems = [] 168 | all_relation_elems = [] 169 | all_hopper_elems = [] 170 | if len(doc_elem.findall('entities')) > 0: 171 | entities_elem = doc_elem.findall('entities')[0] 172 | all_entity_elems = entities_elem.findall('entity') 173 | if len(doc_elem.findall('fillers')) > 0: 174 | fillers_elem = doc_elem.findall('fillers')[0] 175 | all_filler_elems = fillers_elem.findall('filler') 176 | if len(doc_elem.findall('relations')) > 0: 177 | relations_elem = doc_elem.findall('relations')[0] 178 | all_relation_elems = relations_elem.findall('relation') 179 | if len(doc_elem.findall('hoppers')) > 0: 180 | hoppers_elem = doc_elem.findall('hoppers')[0] 181 | all_hopper_elems = hoppers_elem.findall('hopper') 182 | 183 | # parse all entities and mentions 184 | entity_mentions_offset2dict = {} 185 | entity_mentions_mentionid2dict = {} 186 | for entity_elem in all_entity_elems: 187 | entity_attribs = entity_elem.attrib 188 | entity_id = entity_attribs["id"] # CNN_CF_20030303.1900.00-E1 189 | entity_type = entity_attribs["type"] # PER 190 | entity_specificity = entity_attribs["specificity"] # 191 | 192 | all_entity_mention_elems = entity_elem.findall("entity_mention") 193 | for entity_mention_elem in all_entity_mention_elems: 194 | entity_mention_attribs = entity_mention_elem.attrib 195 | entity_mention_id = entity_mention_attribs["id"] # CNN_CF_20030303.1900.00-E1-2 196 | entity_mention_noun_type = entity_mention_attribs["noun_type"] # NOM 197 | 198 | entity_mention_start = entity_mention_attribs["offset"] 199 | entity_mention_end = int(entity_mention_start) + int(entity_mention_attribs["length"]) - 1 200 | entity_mention_text = entity_mention_elem.findall('mention_text')[0].text 201 | mention_offset = entity_mention_start + ":" + str(entity_mention_end) 202 | 203 | nom_head_elems = entity_mention_elem.findall("nom_head") 204 | if len(nom_head_elems) > 0: 205 | if len(nom_head_elems) > 1: 206 | print("Error: multiple nom heads~") 207 | nom_head_elem = nom_head_elems[0] 208 | entity_mention_head_start = nom_head_elem.attrib["offset"] 209 | entity_mention_head_end = int(entity_mention_head_start) + int(nom_head_elem.attrib["length"]) - 1 210 | mention_offset = entity_mention_head_start + ":" + str(entity_mention_head_end) 211 | entity_mention_text = nom_head_elem.text 212 | 213 | mention_dict = {"type": entity_type, "specificity": entity_specificity, "entity_id": entity_id, 214 | "mention_id": entity_mention_id, "mention_type": entity_mention_noun_type, 215 | "text": entity_mention_text, "offset": mention_offset} 216 | entity_mentions_offset2dict[mention_offset] = mention_dict 217 | entity_mentions_mentionid2dict[entity_mention_id] = mention_dict 218 | 219 | # parse all filler 220 | filler_mentions_offset2dict = {} 221 | filler_mentions_mentionid2dict = {} 222 | for filler_elem in all_filler_elems: 223 | filler_id = filler_elem.attrib["id"] 224 | filler_start = filler_elem.attrib["offset"] 225 | filler_end = int(filler_start) + int(filler_elem.attrib["length"]) - 1 226 | filler_type = filler_elem.attrib["type"] 227 | filler_text = filler_elem.text 228 | mention_offset = filler_start + ":" + str(filler_end) 229 | 
mention_dict = {"filler_id": filler_id, "type": filler_type, "text": filler_text, 230 | "offset": mention_offset} 231 | filler_mentions_offset2dict[mention_offset] = mention_dict 232 | filler_mentions_mentionid2dict[filler_id] = mention_dict 233 | 234 | # parse all relations 235 | relation_mentions_id2dict = {} 236 | relation_mentions_men2men2dict = {} 237 | for relation_elem in all_relation_elems: 238 | relation_elem_attribs = relation_elem.attrib 239 | relation_id = relation_elem_attribs["id"] # CNN_CF_20030303.1900.00-R2 240 | relation_type = relation_elem_attribs["type"] # PART-WHOLE 241 | relation_subtype = relation_elem_attribs["subtype"] # PART-WHOLE 242 | 243 | all_relation_mention_elems = relation_elem.findall("relation_mention") 244 | for relation_mention_elem in all_relation_mention_elems: 245 | relation_mention_id = relation_mention_elem.attrib["id"] 246 | relation_mention_realis = relation_mention_elem.attrib["realis"] 247 | 248 | relation_mention_argument0_elem = relation_mention_elem.findall("rel_arg1")[0] 249 | relation_mention_argument1_elem = relation_mention_elem.findall("rel_arg2")[0] 250 | if "entity_id" in relation_mention_argument0_elem.attrib: 251 | relation_mention_argument0_refid = relation_mention_argument0_elem.attrib["entity_mention_id"] 252 | relation_mention_argument0_role = relation_mention_argument0_elem.attrib["role"] 253 | relation_mention_argument0_extend_offset = \ 254 | entity_mentions_mentionid2dict[relation_mention_argument0_refid]["offset"] 255 | relation_mention_argument0_extend_text = \ 256 | entity_mentions_mentionid2dict[relation_mention_argument0_refid]["text"] 257 | elif "filler_id" in relation_mention_argument0_elem.attrib: 258 | relation_mention_argument0_refid = relation_mention_argument0_elem.attrib["filler_id"] 259 | relation_mention_argument0_role = relation_mention_argument0_elem.attrib["role"] 260 | relation_mention_argument0_extend_offset = \ 261 | filler_mentions_mentionid2dict[relation_mention_argument0_refid]["offset"] 262 | relation_mention_argument0_extend_text = \ 263 | filler_mentions_mentionid2dict[relation_mention_argument0_refid]["text"] 264 | if "entity_id" in relation_mention_argument1_elem.attrib: 265 | relation_mention_argument1_refid = relation_mention_argument1_elem.attrib["entity_mention_id"] 266 | relation_mention_argument1_role = relation_mention_argument1_elem.attrib["role"] 267 | relation_mention_argument1_extend_offset = \ 268 | entity_mentions_mentionid2dict[relation_mention_argument1_refid]["offset"] 269 | relation_mention_argument1_extend_text = \ 270 | entity_mentions_mentionid2dict[relation_mention_argument1_refid]["text"] 271 | elif "filler_id" in relation_mention_argument1_elem.attrib: 272 | relation_mention_argument1_refid = relation_mention_argument1_elem.attrib["filler_id"] 273 | relation_mention_argument1_role = relation_mention_argument1_elem.attrib["role"] 274 | relation_mention_argument1_extend_offset = \ 275 | filler_mentions_mentionid2dict[relation_mention_argument1_refid]["offset"] 276 | relation_mention_argument1_extend_text = \ 277 | filler_mentions_mentionid2dict[relation_mention_argument1_refid]["text"] 278 | 279 | relation_mention_trigger_elems = relation_mention_elem.findall("trigger") 280 | relation_mention_trigger_offset = "O" 281 | relation_mention_trigger_text = "O" 282 | if len(relation_mention_trigger_elems) > 0: 283 | relation_mention_trigger_start = relation_mention_trigger_elems[0].attrib["offset"] 284 | relation_mention_trigger_end = int(relation_mention_trigger_start) + \ 285 | 
int(relation_mention_trigger_elems[0].attrib["length"]) - 1 286 | relation_mention_trigger_offset = relation_mention_trigger_start + ":" + \ 287 | str(relation_mention_trigger_end) 288 | relation_mention_trigger_text = relation_mention_trigger_elems[0].text 289 | 290 | mention_dict = {"relation_id": relation_id, "relation_type": relation_type, 291 | "relation_subtype": relation_subtype, 292 | "mention_id": relation_mention_id, "mention_realis": relation_mention_realis, 293 | "mention_argument0_refid": relation_mention_argument0_refid, 294 | "mention_argument0_role": relation_mention_argument0_role, 295 | "mention_argument1_refid": relation_mention_argument1_refid, 296 | "mention_argument1_role": relation_mention_argument1_role, 297 | "mention_argument0_offset": relation_mention_argument0_extend_offset, 298 | "mention_argument0_text": relation_mention_argument0_extend_text, 299 | "mention_argument1_offset": relation_mention_argument1_extend_offset, 300 | "mention_argument1_text": relation_mention_argument1_extend_text, 301 | "mention_trigger_offset": relation_mention_trigger_offset, 302 | "mention_trigger_text": relation_mention_trigger_text 303 | } 304 | relation_mentions_id2dict[relation_mention_id] = mention_dict 305 | if relation_mention_argument0_refid in relation_mentions_men2men2dict: 306 | relation_mentions_men2dict = relation_mentions_men2men2dict[relation_mention_argument0_refid] 307 | relation_mentions_men2dict[relation_mention_argument1_refid] = mention_dict 308 | relation_mentions_men2men2dict[relation_mention_argument0_refid] = relation_mentions_men2dict 309 | else: 310 | relation_mentions_men2dict = {relation_mention_argument1_refid: mention_dict} 311 | relation_mentions_men2men2dict[relation_mention_argument0_refid] = relation_mentions_men2dict 312 | 313 | # parse all events 314 | event_mentions_id2dict = {} 315 | for event_elem in all_hopper_elems: 316 | event_id = event_elem.attrib["id"] 317 | 318 | all_event_mention_elems = event_elem.findall("event_mention") 319 | for event_mention_elem in all_event_mention_elems: 320 | event_mention_id = event_mention_elem.attrib["id"] 321 | event_mention_type = event_mention_elem.attrib["type"] 322 | event_mention_subtype = event_mention_elem.attrib["subtype"] 323 | event_mention_realis = event_mention_elem.attrib["realis"] 324 | 325 | event_mention_trigger_elem = event_mention_elem.findall("trigger")[0] 326 | event_mention_trigger_start = event_mention_trigger_elem.attrib["offset"] 327 | event_mention_trigger_end = int(event_mention_trigger_start) + \ 328 | int(event_mention_trigger_elem.attrib["length"]) - 1 329 | event_mention_trigger_text = event_mention_trigger_elem.text 330 | event_mention_trigger_offset = event_mention_trigger_start + ":" + \ 331 | str(event_mention_trigger_end) 332 | 333 | all_event_mention_argument_elems = event_mention_elem.findall("em_arg") 334 | all_event_mention_arguments = [] 335 | for event_mention_argument_elem in all_event_mention_argument_elems: 336 | if "entity_id" in event_mention_argument_elem.attrib: 337 | event_mention_argument_refid = event_mention_argument_elem.attrib["entity_mention_id"] 338 | event_mention_argument_offset = entity_mentions_mentionid2dict[event_mention_argument_refid][ 339 | "offset"] 340 | event_mention_argument_text = entity_mentions_mentionid2dict[event_mention_argument_refid][ 341 | "text"] 342 | elif "filler_id" in event_mention_argument_elem.attrib: 343 | event_mention_argument_refid = event_mention_argument_elem.attrib["filler_id"] 344 | event_mention_argument_offset = 
filler_mentions_mentionid2dict[event_mention_argument_refid][ 345 | "offset"] 346 | event_mention_argument_text = filler_mentions_mentionid2dict[event_mention_argument_refid][ 347 | "text"] 348 | event_mention_argument_role = event_mention_argument_elem.attrib["role"] 349 | event_mention_argument_realis = event_mention_argument_elem.attrib["realis"] 350 | 351 | event_mention_argument_dict = {"mention_argument_refid": event_mention_argument_refid, 352 | "mention_argument_role": event_mention_argument_role, 353 | "mention_argument_realis": event_mention_argument_realis, 354 | "mention_argument_offset": event_mention_argument_offset, 355 | "mention_argument_text": event_mention_argument_text} 356 | all_event_mention_arguments.append(event_mention_argument_dict) 357 | 358 | mention_dict = {"event_id": event_id, "type": event_mention_type, "subtype": event_mention_subtype, 359 | "realis": event_mention_realis, "mention_id": event_mention_id, 360 | "trigger_offset": event_mention_trigger_offset, "trigger_text": event_mention_trigger_text, 361 | "argument": all_event_mention_arguments} 362 | 363 | event_mentions_id2dict[event_mention_id] = mention_dict 364 | 365 | return entity_mentions_mentionid2dict, filler_mentions_mentionid2dict, \ 366 | relation_mentions_id2dict, event_mentions_id2dict 367 | 368 | 369 | if __name__ == "__main__": 370 | parser = argparse.ArgumentParser() 371 | parser.add_argument('--bio', type=str, 372 | help='bio input path') 373 | parser.add_argument('--ann', type=str, 374 | help='ere annotation input path') 375 | parser.add_argument('--ere', type=str, 376 | help='output ere annotation path') 377 | parser.add_argument('--filelist', type=str, 378 | help='filelist path') 379 | 380 | args = parser.parse_args() 381 | 382 | bio_path = args.bio 383 | ann_path = args.ann 384 | ere_path = args.ere 385 | 386 | if not os.path.exists(ere_path): 387 | os.makedirs(ere_path) 388 | 389 | file_names = [] 390 | if os.path.isdir(bio_path): 391 | file_names = [item[:-4] 392 | for item in os.listdir(bio_path) 393 | if item.endswith(".bio")] 394 | else: 395 | file_names = [bio_path] 396 | 397 | for f in file_names: 398 | print(f) 399 | bio_file = os.path.join(bio_path, f+".bio") 400 | ann_file = os.path.join(ann_path, f+".rich_ere.xml") 401 | ace_file = os.path.join(ere_path, f+".csv") 402 | 403 | if os.path.exists(bio_file) and os.path.exists(ann_file): 404 | write_ann(bio_file, ann_file, ace_file) 405 | 406 | -------------------------------------------------------------------------------- /bio2ace.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import csv 4 | import xml.etree.ElementTree as ET 5 | 6 | 7 | # word, offset, nertag, relation, trigger, argument 8 | def write_ann(bio_file, ann_file, ace_file): 9 | csv_file = open(ace_file, 'w') 10 | fields = ['token', 'offset', 'ner_offset', 'ner_type', 'ner_nam_nom', 'ner_mention', 'ner_cluster', 11 | 'timex2_offset', 'timex2_cluster', 12 | 'value_offset', 'value_type', 'value_cluster', 13 | 'relations_belong_to', 14 | 'trigger_offset', 'trigger_type', 'trigger_cluster', 'trigger_arguments'] 15 | writer = csv.DictWriter(csv_file, fieldnames=fields) 16 | writer.writeheader() 17 | 18 | entity_mentions_mentionid2dict, timex2_mentions_mentionid2dict, value_mentions_mentionid2dict, \ 19 | relation_mentions_id2dict, event_mentions_id2dict = parse_ann(ann_file) 20 | 21 | with open(bio_file, 'r') as f: 22 | for line in f: 23 | line = line.strip() 24 | if len(line) > 0: 25 | parts = 
line.strip().split(' ') 26 | token = parts[0] 27 | offset = parts[1] 28 | 29 | token_dict = {'token': token, 'offset': offset} 30 | 31 | d_id, o = offset.split(':') 32 | start, end = o.split('-') 33 | start = int(start) 34 | end = int(end) 35 | 36 | entity_mention_ids = search_offset_id(start, end, entity_mentions_mentionid2dict, 'offset') 37 | timex2_mention_ids = search_offset_id(start, end, timex2_mentions_mentionid2dict, 'offset') 38 | value_mention_ids = search_offset_id(start, end, value_mentions_mentionid2dict, 'offset') 39 | relation_mention_ids = search_relation_id(start, end, relation_mentions_id2dict) 40 | event_mention_ids = search_offset_id(start, end, event_mentions_id2dict, 'anchor_offset') 41 | 42 | if len(entity_mention_ids) == 0: 43 | token_dict['ner_offset'] = 'O' 44 | token_dict['ner_type'] = 'O' 45 | token_dict['ner_nam_nom'] = 'O' 46 | token_dict['ner_mention'] = 'O' 47 | token_dict['ner_cluster'] = 'O' 48 | else: 49 | ner_offsets = [] 50 | ner_types = [] 51 | ner_nam_noms = [] 52 | ner_mentions = [] 53 | ner_clusters = [] 54 | 55 | for id in entity_mention_ids: 56 | ner_offsets.append(entity_mentions_mentionid2dict[id]['offset']) 57 | ner_types.append(entity_mentions_mentionid2dict[id]['type'] + ':' + \ 58 | entity_mentions_mentionid2dict[id]['subtype']) 59 | ner_nam_noms.append(entity_mentions_mentionid2dict[id]['mention_type']) 60 | ner_mentions.append(entity_mentions_mentionid2dict[id]['text']) 61 | ner_clusters.append(entity_mentions_mentionid2dict[id]['entity_id']) 62 | token_dict['ner_offset'] = '#@#'.join(ner_offsets) 63 | token_dict['ner_type'] = '#@#'.join(ner_types) 64 | token_dict['ner_nam_nom'] = '#@#'.join(ner_nam_noms) 65 | token_dict['ner_mention'] = '#@#'.join(ner_mentions) 66 | token_dict['ner_cluster'] = '#@#'.join(ner_clusters) 67 | 68 | if len(timex2_mention_ids) == 0: 69 | token_dict['timex2_offset'] = 'O' 70 | token_dict['timex2_cluster'] = 'O' 71 | else: 72 | timex2_offsets = [] 73 | timex2_clusters = [] 74 | for id in timex2_mention_ids: 75 | timex2_offsets.append(timex2_mentions_mentionid2dict[id]['offset']) 76 | timex2_clusters.append(timex2_mentions_mentionid2dict[id]['timex2_id']) 77 | token_dict['timex2_offset'] = '#@#'.join(timex2_offsets) 78 | token_dict['timex2_cluster'] = '#@#'.join(timex2_clusters) 79 | 80 | if len(value_mention_ids) == 0: 81 | token_dict['value_offset'] = 'O' 82 | token_dict['value_type'] = 'O' 83 | token_dict['value_cluster'] = 'O' 84 | else: 85 | value_offsets = [] 86 | value_types = [] 87 | value_clusters = [] 88 | 89 | for id in value_mention_ids: 90 | value_offsets.append(value_mentions_mentionid2dict[id]['offset']) 91 | value_types.append(value_mentions_mentionid2dict[id]['type'] + ':' + 92 | value_mentions_mentionid2dict[id]['subtype']) 93 | value_clusters.append(value_mentions_mentionid2dict[id]['value_id']) 94 | token_dict['value_offset'] = '#@#'.join(value_offsets) 95 | token_dict['value_type'] = '#@#'.join(value_types) 96 | token_dict['value_cluster'] = '#@#'.join(value_clusters) 97 | 98 | if len(relation_mention_ids) == 0: 99 | token_dict['relations_belong_to'] = 'O' 100 | else: 101 | relation_mentions = [] 102 | for id in relation_mention_ids: 103 | relation_mention_dict = relation_mentions_id2dict[id] 104 | relation_id = relation_mention_dict['relation_id'] 105 | relation_type = relation_mention_dict['relation_type'] + ':' + \ 106 | relation_mention_dict['relation_subtype'] 107 | arg0 = relation_mention_dict['mention_argument0_offset'] 108 | arg1 = relation_mention_dict['mention_argument1_offset'] 109 
| 110 | mention = relation_id + ':' + arg0 + ':' + relation_type + ':' + arg1 111 | relation_mentions.append(mention) 112 | mention_str = ' '.join(relation_mentions) 113 | token_dict['relations_belong_to'] = mention_str 114 | 115 | if len(event_mention_ids) == 0: 116 | token_dict['trigger_offset'] = 'O' 117 | token_dict['trigger_type'] = 'O' 118 | token_dict['trigger_cluster'] = 'O' 119 | token_dict['trigger_arguments'] = 'O' 120 | else: 121 | trigger_offsets = [] 122 | trigger_types = [] 123 | trigger_clusters = [] 124 | trigger_arguments_set = [] 125 | for id in event_mention_ids: 126 | trigger_offsets.append(event_mentions_id2dict[id]['anchor_offset']) 127 | trigger_types.append(event_mentions_id2dict[id]['type'] + ':' + 128 | event_mentions_id2dict[id]['subtype']) 129 | trigger_clusters.append(event_mentions_id2dict[id]['event_id']) 130 | all_event_mention_arguments = event_mentions_id2dict[id]['argument'] 131 | arguments = [] 132 | for arg in all_event_mention_arguments: 133 | arg_str = arg['mention_argument_refid'] + ':' + arg['mention_argument_role'] + ':' + \ 134 | arg['mention_argument_offset'] 135 | arguments.append(arg_str) 136 | if len(arguments) > 0: 137 | arguments_str = ' '.join(arguments) 138 | trigger_arguments_set.append(arguments_str) 139 | 140 | token_dict['trigger_offset'] = '#@#'.join(trigger_offsets) 141 | token_dict['trigger_type'] = '#@#'.join(trigger_types) 142 | token_dict['trigger_cluster'] = '#@#'.join(trigger_clusters) 143 | if len(trigger_arguments_set) > 0: 144 | token_dict['trigger_arguments'] = '#@#'.join(trigger_arguments_set) 145 | else: 146 | token_dict['trigger_arguments'] = 'O' 147 | 148 | writer.writerow(token_dict) 149 | else: 150 | token_dict = {'token':'----sentence_delimiter----'} 151 | writer.writerow(token_dict) 152 | 153 | csv_file.close() 154 | 155 | 156 | # applicable to entity, timex2, event mentions 157 | def search_offset_id(token_start, token_end, entity_mentions_mentionid2dict, offset_key): 158 | searched_ids = [] 159 | for id in entity_mentions_mentionid2dict: 160 | can_dict = entity_mentions_mentionid2dict[id] 161 | mention_offset_parts = can_dict[offset_key].split(':') 162 | can_start = int(mention_offset_parts[0]) 163 | can_end = int(mention_offset_parts[1]) 164 | if (can_start <= token_start <= can_end) or (can_start <= token_end <= can_end): 165 | searched_ids.append(id) 166 | return searched_ids 167 | 168 | 169 | def search_relation_id(token_start, token_end, relation_mentions_id2dict): 170 | searched_ids = [] 171 | for id in relation_mentions_id2dict: 172 | can_dict = relation_mentions_id2dict[id] 173 | argument0_offset_parts = can_dict['mention_argument0_offset'].split(':') 174 | argument1_offset_parts = can_dict['mention_argument1_offset'].split(':') 175 | arg0_start = int(argument0_offset_parts[0]) 176 | arg0_end = int(argument0_offset_parts[1]) 177 | arg1_start = int(argument1_offset_parts[0]) 178 | arg1_end = int(argument1_offset_parts[1]) 179 | if (arg0_start <= token_start <= arg0_end) or (arg0_start <= token_end <= arg0_end) or \ 180 | (arg1_start <= token_start <= arg1_end) or (arg1_start <= token_end <= arg1_end): 181 | searched_ids.append(id) 182 | return searched_ids 183 | 184 | 185 | def parse_ann(ann_file): 186 | tree = ET.parse(ann_file) 187 | root = tree.getroot() 188 | doc_elem = root[0] # entity, timex2, relation, event 189 | 190 | all_entity_elems = doc_elem.findall('entity') 191 | all_timex2_elems = doc_elem.findall('timex2') 192 | all_value_elems = doc_elem.findall('value') 193 | all_relaton_elems = 
doc_elem.findall('relation') 194 | all_event_elems = doc_elem.findall('event') 195 | 196 | # parse all entities and mentions 197 | entity_mentions_offset2dict = {} 198 | entity_mentions_mentionid2dict = {} 199 | for entity_elem in all_entity_elems: 200 | entity_attribs = entity_elem.attrib 201 | entity_id = entity_attribs["ID"] # CNN_CF_20030303.1900.00-E1 202 | entity_type = entity_attribs["TYPE"] # PER 203 | entity_subtype = entity_attribs["SUBTYPE"] # Individual 204 | entity_class = entity_attribs["CLASS"] # SPC 205 | 206 | all_entity_mention_elems = entity_elem.findall("entity_mention") 207 | for entity_mention_elem in all_entity_mention_elems: 208 | entity_mention_attribs = entity_mention_elem.attrib 209 | entity_mention_id = entity_mention_attribs["ID"] # CNN_CF_20030303.1900.00-E1-2 210 | entity_mention_type = entity_mention_attribs["TYPE"] # NOM 211 | entity_mention_ldctype = entity_mention_attribs["LDCTYPE"] # NOMPRE 212 | 213 | entity_mention_extent_elem = entity_mention_elem.findall("extent")[0].findall("charseq")[0] 214 | entity_mention_head_elem = entity_mention_elem.findall("head")[0].findall("charseq")[0] 215 | 216 | entity_mention_head_start = entity_mention_head_elem.attrib["START"] # 490 217 | entity_mention_head_end = entity_mention_head_elem.attrib["END"] # 498 218 | entity_mention_head_text = entity_mention_head_elem.text # Secretary 219 | 220 | mention_offset = entity_mention_head_start + ":" + entity_mention_head_end 221 | mention_dict = {"type": entity_type, "subtype": entity_subtype, "entity_id": entity_id, 222 | "entity_class": entity_class, "mention_id": entity_mention_id, 223 | "mention_type": entity_mention_type, "mention_ldctype": entity_mention_ldctype, 224 | "text": entity_mention_head_text, "offset": mention_offset} 225 | entity_mentions_offset2dict[mention_offset] = mention_dict 226 | entity_mentions_mentionid2dict[entity_mention_id] = mention_dict 227 | 228 | # parse all timex2 229 | timex2_mentions_offset2dict = {} 230 | timex2_mentions_mentionid2dict = {} 231 | for timex2_elem in all_timex2_elems: 232 | timex2_id = timex2_elem.attrib["ID"] 233 | all_timex2_mention_elems = timex2_elem.findall("timex2_mention") 234 | for timex2_mention_elem in all_timex2_mention_elems: 235 | timex2_mention_id = timex2_mention_elem.attrib["ID"] 236 | timex2_mention_elem_extend = timex2_mention_elem.findall("extent")[0].findall("charseq")[0] 237 | timex2_mention_start = timex2_mention_elem_extend.attrib["START"] 238 | timex2_mention_end = timex2_mention_elem_extend.attrib["END"] 239 | timex2_mention_text = timex2_mention_elem_extend.text 240 | 241 | mention_offset = timex2_mention_start + ":" + timex2_mention_end 242 | mention_dict = {"timex2_id": timex2_id, "mention_id": timex2_mention_id, "text": timex2_mention_text, 243 | "offset": mention_offset} 244 | timex2_mentions_offset2dict[mention_offset] = mention_dict 245 | timex2_mentions_mentionid2dict[timex2_mention_id] = mention_dict 246 | 247 | # parse all values 248 | value_mentions_offset2dict = {} 249 | value_mentions_mentionid2dict = {} 250 | for value_elem in all_value_elems: 251 | value_id = value_elem.attrib["ID"] 252 | value_type = value_elem.attrib['TYPE'] 253 | value_subtype = "O" 254 | if "SUBTYPE" in value_elem.attrib: 255 | value_subtype = value_elem.attrib['SUBTYPE'] 256 | 257 | all_value_mention_elems = value_elem.findall("value_mention") 258 | for value_mention_elem in all_value_mention_elems: 259 | value_mention_id = value_mention_elem.attrib["ID"] 260 | value_mention_elem_extend = 
value_mention_elem.findall("extent")[0].findall("charseq")[0] 261 | value_mention_start = value_mention_elem_extend.attrib["START"] 262 | value_mention_end = value_mention_elem_extend.attrib["END"] 263 | value_mention_text = value_mention_elem_extend.text 264 | 265 | mention_offset = value_mention_start + ":" + value_mention_end 266 | mention_dict = {"value_id": value_id, "type":value_type, 'subtype':value_subtype, 267 | "mention_id": value_mention_id, "text": value_mention_text, 268 | "offset": mention_offset} 269 | value_mentions_offset2dict[mention_offset] = mention_dict 270 | value_mentions_mentionid2dict[value_mention_id] = mention_dict 271 | 272 | # parse all relations 273 | relation_mentions_id2dict = {} 274 | relation_mentions_men2men2dict = {} 275 | for relation_elem in all_relaton_elems: 276 | relation_elem_attribs = relation_elem.attrib 277 | relation_id = relation_elem_attribs["ID"] # CNN_CF_20030303.1900.00-R2 278 | relation_type = relation_elem_attribs["TYPE"] # PART-WHOLE 279 | relation_subtype = "O" 280 | if "SUBTYPE" in relation_elem_attribs: 281 | relation_subtype = relation_elem_attribs["SUBTYPE"] # Geographical 282 | relation_tense = "O" 283 | if "TENSE" in relation_elem_attribs: 284 | relation_tense = relation_elem_attribs["TENSE"] # Unspecified 285 | relation_modality = "O" 286 | if "MODALITY" in relation_elem_attribs: 287 | relation_modality = relation_elem_attribs["MODALITY"] # Unspecified 288 | relation_argument_elems = relation_elem.findall("relation_argument") 289 | relation_argument0 = relation_argument_elems[0] 290 | relation_argument1 = relation_argument_elems[1] 291 | relation_argument0_refid = relation_argument0.attrib["REFID"] 292 | relation_argument0_role = relation_argument0.attrib["ROLE"] 293 | relation_argument1_refid = relation_argument1.attrib["REFID"] 294 | relation_argument1_role = relation_argument1.attrib["ROLE"] 295 | 296 | all_relation_mention_elems = relation_elem.findall("relation_mention") 297 | for relation_mention_elem in all_relation_mention_elems: 298 | relation_mention_id = relation_mention_elem.attrib["ID"] 299 | relation_mention_lexical_condition = relation_mention_elem.attrib["LEXICALCONDITION"] 300 | relation_mention_extent = relation_mention_elem.findall("extent")[0].findall("charseq")[0] 301 | relation_mention_extent_start = relation_mention_extent.attrib["START"] 302 | relation_mention_extent_end = relation_mention_extent.attrib["END"] 303 | relation_mention_extent_text = relation_mention_extent.text 304 | relation_mention_extend_offset = relation_mention_extent_start + ":" + relation_mention_extent_end 305 | 306 | relation_mention_argument_elems = relation_mention_elem.findall("relation_mention_argument") 307 | relation_mention_argument0 = relation_mention_argument_elems[0] 308 | relation_mention_argument1 = relation_mention_argument_elems[1] 309 | relation_mention_argument0_refid = relation_mention_argument0.attrib["REFID"] 310 | relation_mention_argument0_role = relation_mention_argument0.attrib["ROLE"] 311 | relation_mention_argument1_refid = relation_mention_argument1.attrib["REFID"] 312 | relation_mention_argument1_role = relation_mention_argument1.attrib["ROLE"] 313 | 314 | # replace extend to the corresponding head 315 | # arg0 316 | if relation_mention_argument0_refid in entity_mentions_mentionid2dict: 317 | relation_mention_argument0_extend_offset = \ 318 | entity_mentions_mentionid2dict[relation_mention_argument0_refid]["offset"] 319 | relation_mention_argument0_extend_text = \ 320 | 
entity_mentions_mentionid2dict[relation_mention_argument0_refid]["text"] 321 | elif relation_mention_argument0_refid in timex2_mentions_mentionid2dict: 322 | relation_mention_argument0_extend_offset = \ 323 | timex2_mentions_mentionid2dict[relation_mention_argument0_refid]["offset"] 324 | relation_mention_argument0_extend_text = \ 325 | timex2_mentions_mentionid2dict[relation_mention_argument0_refid]["text"] 326 | elif relation_mention_argument0_refid in value_mentions_mentionid2dict: 327 | relation_mention_argument0_extend_offset = \ 328 | value_mentions_mentionid2dict[relation_mention_argument0_refid]["offset"] 329 | relation_mention_argument0_extend_text = \ 330 | value_mentions_mentionid2dict[relation_mention_argument0_refid]["text"] 331 | 332 | # time mention 333 | if relation_mention_argument1_refid in entity_mentions_mentionid2dict: 334 | relation_mention_argument1_extend_offset = \ 335 | entity_mentions_mentionid2dict[relation_mention_argument1_refid]["offset"] 336 | relation_mention_argument1_extend_text = \ 337 | entity_mentions_mentionid2dict[relation_mention_argument1_refid]["text"] 338 | elif relation_mention_argument1_refid in timex2_mentions_mentionid2dict: 339 | relation_mention_argument1_extend_offset = \ 340 | timex2_mentions_mentionid2dict[relation_mention_argument1_refid]["offset"] 341 | relation_mention_argument1_extend_text = \ 342 | timex2_mentions_mentionid2dict[relation_mention_argument1_refid]["text"] 343 | elif relation_mention_argument1_refid in value_mentions_mentionid2dict: 344 | relation_mention_argument1_extend_offset = \ 345 | value_mentions_mentionid2dict[relation_mention_argument1_refid]["offset"] 346 | relation_mention_argument1_extend_text = \ 347 | value_mentions_mentionid2dict[relation_mention_argument1_refid]["text"] 348 | 349 | mention_dict = {"relation_id": relation_id, "relation_type": relation_type, 350 | "relation_subtype": relation_subtype, "relation_tense": relation_tense, 351 | "relation_modality": relation_modality, "relation_argument0_refid": relation_argument0_refid, 352 | "relation_argument0_role": relation_argument0_role, 353 | "relation_argument1_refid": relation_argument1_refid, 354 | "relation_argument1_role": relation_argument1_role, "mention_id": relation_mention_id, 355 | "mention_offset": relation_mention_extend_offset, 356 | "mention_text": relation_mention_extent_text, 357 | "mention_argument0_refid": relation_mention_argument0_refid, 358 | "mention_argument0_role": relation_mention_argument0_role, 359 | "mention_argument1_refid": relation_mention_argument1_refid, 360 | "mention_argument1_role": relation_mention_argument1_role, 361 | "mention_argument0_offset": relation_mention_argument0_extend_offset, 362 | "mention_argument0_text": relation_mention_argument0_extend_text, 363 | "mention_argument1_offset": relation_mention_argument1_extend_offset, 364 | "mention_argument1_text": relation_mention_argument1_extend_text 365 | } 366 | relation_mentions_id2dict[relation_mention_id] = mention_dict 367 | if relation_mention_argument0_refid in relation_mentions_men2men2dict: 368 | relation_mentions_men2dict = relation_mentions_men2men2dict[relation_mention_argument0_refid] 369 | relation_mentions_men2dict[relation_mention_argument1_refid] = mention_dict 370 | relation_mentions_men2men2dict[relation_mention_argument0_refid] = relation_mentions_men2dict 371 | else: 372 | relation_mentions_men2dict = {relation_mention_argument1_refid: mention_dict} 373 | relation_mentions_men2men2dict[relation_mention_argument0_refid] = 
relation_mentions_men2dict 374 | 375 | # parse all events 376 | event_mentions_id2dict = {} 377 | for event_elem in all_event_elems: 378 | event_id = event_elem.attrib["ID"] 379 | event_type = event_elem.attrib["TYPE"] 380 | event_subtype = event_elem.attrib["SUBTYPE"] 381 | event_modality = event_elem.attrib["MODALITY"] 382 | event_polarity = event_elem.attrib["POLARITY"] 383 | event_genericity = event_elem.attrib["GENERICITY"] 384 | event_tense = event_elem.attrib["TENSE"] 385 | 386 | all_event_argument_elems = event_elem.findall("event_argument") 387 | event_argument_list = [] 388 | for event_argument_elem in all_event_argument_elems: 389 | event_argument_refid = event_argument_elem.attrib["REFID"] 390 | event_argument_role = event_argument_elem.attrib["ROLE"] 391 | event_argument_dict = {"argument_refid": event_argument_refid, "argument_role": event_argument_role} 392 | event_argument_list.append(event_argument_dict) 393 | 394 | all_event_mention_elems = event_elem.findall("event_mention") 395 | for event_mention_elem in all_event_mention_elems: 396 | event_mention_id = event_mention_elem.attrib["ID"] 397 | event_mention_extent = event_mention_elem.findall("extent")[0].findall("charseq")[0] 398 | event_mention_extent_start = event_mention_extent.attrib["START"] 399 | event_mention_extent_end = event_mention_extent.attrib["END"] 400 | event_mention_extent_text = event_mention_extent.text 401 | 402 | event_mention_anchor = event_mention_elem.findall("anchor")[0].findall("charseq")[0] # trigger 403 | event_mention_anchor_start = event_mention_anchor.attrib["START"] 404 | event_mention_anchor_end = event_mention_anchor.attrib["END"] 405 | event_mention_anchor_offset = event_mention_anchor_start + ":" + event_mention_anchor_end 406 | event_mention_anchor_text = event_mention_anchor.text 407 | 408 | all_event_mention_argument_elems = event_mention_elem.findall("event_mention_argument") 409 | all_event_mention_arguments = [] 410 | for event_mention_argument_elem in all_event_mention_argument_elems: 411 | event_mention_argument_refid = event_mention_argument_elem.attrib["REFID"] 412 | event_mention_argument_role = event_mention_argument_elem.attrib["ROLE"] 413 | 414 | # replace extend to head 415 | # entity mentions 416 | if event_mention_argument_refid in entity_mentions_mentionid2dict: 417 | event_mention_argument_offset = \ 418 | entity_mentions_mentionid2dict[event_mention_argument_refid]["offset"] 419 | event_mention_argument_text = entity_mentions_mentionid2dict[event_mention_argument_refid]["text"] 420 | elif event_mention_argument_refid in timex2_mentions_mentionid2dict: 421 | event_mention_argument_offset = \ 422 | timex2_mentions_mentionid2dict[event_mention_argument_refid]["offset"] 423 | event_mention_argument_text = timex2_mentions_mentionid2dict[event_mention_argument_refid]["text"] 424 | elif event_mention_argument_refid in value_mentions_mentionid2dict: 425 | event_mention_argument_offset = \ 426 | value_mentions_mentionid2dict[event_mention_argument_refid]["offset"] 427 | event_mention_argument_text = value_mentions_mentionid2dict[event_mention_argument_refid]["text"] 428 | 429 | event_mention_argument_dict = {"mention_argument_refid": event_mention_argument_refid, 430 | "mention_argument_role": event_mention_argument_role, 431 | "mention_argument_offset": event_mention_argument_offset, 432 | "mention_argument_text": event_mention_argument_text} 433 | all_event_mention_arguments.append(event_mention_argument_dict) 434 | 435 | mention_dict = {"event_id": event_id, "type": 
event_type, "subtype": event_subtype, 436 | "modality": event_modality, "polarity": event_polarity, 437 | "genericity": event_genericity, "tense": event_tense, 438 | "mention_id": event_mention_id, "anchor_offset": event_mention_anchor_offset, 439 | "anchor_text": event_mention_anchor_text, "argument": all_event_mention_arguments} 440 | 441 | event_mentions_id2dict[event_mention_id] = mention_dict 442 | 443 | return entity_mentions_mentionid2dict, timex2_mentions_mentionid2dict, value_mentions_mentionid2dict, \ 444 | relation_mentions_id2dict, event_mentions_id2dict 445 | 446 | 447 | if __name__ == "__main__": 448 | parser = argparse.ArgumentParser() 449 | parser.add_argument('--bio', type=str, 450 | help='bio input path') 451 | parser.add_argument('--ann', type=str, 452 | help='ace annotation input path') 453 | parser.add_argument('--ace', type=str, 454 | help='output ace annotation path') 455 | parser.add_argument('--filelist', type=str, 456 | help='filelist path') 457 | 458 | args = parser.parse_args() 459 | 460 | bio_path = args.bio 461 | ann_path = args.ann 462 | ace_path = args.ace 463 | 464 | if not os.path.exists(ace_path): 465 | os.makedirs(ace_path) 466 | 467 | file_names = [] 468 | if os.path.isdir(bio_path): 469 | file_names = [item[:-4] 470 | for item in os.listdir(bio_path) 471 | if item.endswith(".bio")] 472 | else: 473 | file_names = [bio_path] 474 | 475 | for f in file_names: 476 | # print(f) 477 | bio_file= os.path.join(bio_path, f+".bio") 478 | ann_file = os.path.join(ann_path, f+".apf.xml") 479 | ace_file = os.path.join(ace_path, f+".csv") 480 | 481 | if os.path.exists(bio_file) and os.path.exists(ann_file): 482 | write_ann(bio_file, ann_file, ace_file) 483 | 484 | --------------------------------------------------------------------------------