├── __init__.py
├── README.md
├── html_entities
├── ace2ner.py
├── ltf2sent.py
├── ltf2bio.py
├── source2rsd.py
├── tokenizer.py
├── rsd2ltf.py
├── ace2event.py
├── bio2ere.py
└── bio2ace.py
/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ACE and ERE Preprocessing
2 |
3 | This repository contains the preprocessing scripts for the ACE and ERE datasets, covering name tagging, entity coreference, relation extraction, event extraction, and event coreference tasks (verified on the ACE_2005 and Rich_ERE corpora).
4 |
5 | ## Requirements
6 |
7 | Python 3.6, jieba, NLTK, spaCy (the latter only needed for `ace2event.py --dep`)
8 |
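A minimal environment setup might look like the following (the NLTK `punkt` models are needed for sentence segmentation, and the spaCy English model only for `ace2event.py --dep`):

```
pip install jieba nltk spacy
python -c "import nltk; nltk.download('punkt')"
python -m spacy download en_core_web_sm
```
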
9 | ## Usage
10 |
11 | ### Step 1
12 | Preprocess the data: remove the XML tags from the ACE/ERE articles and merge the sentences of each article body.
13 |
14 | ```
15 | python source2rsd.py --source [source_path] --rsd [rsd_path] --data [ace or ere] --extension [ending_of_source_files]
16 | ```
17 |
18 | > [source_path]: the path for input files (all .sgm files from ACE source corpus)
19 |
20 | > [rsd_path]: output path
21 |
22 | ### Step 2
23 | Sentence segmentation and tokenization, with character offset retrieval.
24 |
25 | ```
26 | python rsd2ltf.py --rsd [rsd_path] --ltf [ltf_path] --extension [ending_of_rsd_files]
27 | ```
28 |
29 | > [rsd_path]: the path for rsd files from step 1
30 |
31 | > [ltf_path]: output path
32 |
33 | ### Step 3
34 | Convert the ltf files into sentences of tokens in the BIO format used for name tagging.
35 |
36 | ```
37 | python ltf2bio.py --ltf [ltf_path] --bio [bio_path]
38 | ```
39 |
40 | > [ltf_path]: the path for input files
41 |
42 | > [bio_path]: output path
43 |
44 | ### Step 4
45 | Add the ACE annotations to the bio files.
46 |
47 | ```
48 | python bio2ace.py --bio [bio_path] --ann [ann_path] --ace [ace_path]
49 | ```
50 |
51 | > [bio_path]: the path for input files
52 |
53 | > [ann_path]: the path for all annotation files from ACE
54 |
55 | > [ace_path]: output path
56 |
57 | Similarly, for the ERE corpus:
58 |
59 | ```
60 | python bio2ere.py --bio [bio_path] --ann [ann_path] --ere [ere_path]
61 | ```
62 |
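Putting the four steps together, an end-to-end run over the ACE corpus might look like this (all paths below are placeholders):

```
python source2rsd.py --source data/ace_source --rsd data/rsd --data ace --extension .sgm
python rsd2ltf.py --rsd data/rsd --ltf data/ltf --extension .sgm
python ltf2bio.py --ltf data/ltf --bio data/bio
python bio2ace.py --bio data/bio --ann data/ace_ann --ace data/ace_csv
```
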
63 | ## Citation
64 | [1] Lifu Huang, Taylor Cassidy, Xiaocheng Feng, Heng Ji, Clare R Voss, Jiawei Han, Avirup Sil. Liberal Event Extraction and Event Schema Induction. Proc. ACL'2016
65 |
66 | [2] Lifu Huang, Avirup Sil, Heng Ji, Radu Florian. Improving Slot Filling Performance with Attentive Neural Networks on Dependency Structures. Proc. EMNLP'2017
67 |
68 | [3] Lifu Huang, Heng Ji, Kyunghyun Cho, Clare R Voss. Zero-Shot Transfer Learning for Event Extraction. Proc. ACL'2018
69 |
--------------------------------------------------------------------------------
/html_entities:
--------------------------------------------------------------------------------
1 | " " quotation mark
2 | ' ' apostrophe
3 | & & ampersand
4 | < < less-than
5 | > > greater-than
6 | non-breaking space
7 | ¡ ¡ inverted exclamation mark
8 | ¢ ¢ cent
9 | £ £ pound
10 | ¤ ¤ currency
11 | ¥ ¥ yen
12 | ¦ ¦ broken vertical bar
13 | § § section
14 | ¨ ¨ spacing diaeresis
15 | © © copyright
16 | ª ª feminine ordinal indicator
17 | « « angle quotation mark (left)
18 | ¬ ¬ negation
19 | soft hyphen
20 | ® ® registered trademark
21 | ¯ ¯ spacing macron
22 | ° ° degree
23 | ± ± plus-or-minus
24 | ² ² superscript 2
25 | ³ ³ superscript 3
26 | ´ ´ spacing acute
27 | µ µ micro
28 | ¶ ¶ paragraph
29 | · · middle dot
30 | ¸ ¸ spacing cedilla
31 | ¹ ¹ superscript 1
32 | º º masculine ordinal indicator
33 | » » angle quotation mark (right)
34 | ¼ ¼ fraction 1/4
35 | ½ ½ fraction 1/2
36 | ¾ ¾ fraction 3/4
37 | ¿ ¿ inverted question mark
38 | × × multiplication
39 | ÷ ÷ division
40 | À À capital a, grave accent
41 | Á Á capital a, acute accent
42 | Â Â capital a, circumflex accent
43 | Ã Ã capital a, tilde
44 | Ä Ä capital a, umlaut mark
45 | Å Å capital a, ring
46 | Æ Æ capital ae
47 | Ç Ç capital c, cedilla
48 | È È capital e, grave accent
49 | É É capital e, acute accent
50 | Ê Ê capital e, circumflex accent
51 | Ë Ë capital e, umlaut mark
52 | Ì Ì capital i, grave accent
53 | Í Í capital i, acute accent
54 | Î Î capital i, circumflex accent
55 | Ï Ï capital i, umlaut mark
56 | Ð Ð capital eth, Icelandic
57 | Ñ Ñ capital n, tilde
58 | Ò Ò capital o, grave accent
59 | Ó Ó capital o, acute accent
60 | Ô Ô capital o, circumflex accent
61 | Õ Õ capital o, tilde
62 | Ö Ö capital o, umlaut mark
63 | Ø Ø capital o, slash
64 | Ù Ù capital u, grave accent
65 | Ú Ú capital u, acute accent
66 | Û Û capital u, circumflex accent
67 | Ü Ü capital u, umlaut mark
68 | Ý Ý capital y, acute accent
69 | Þ Þ capital THORN, Icelandic
70 | ß ß small sharp s, German
71 | à à small a, grave accent
72 | á á small a, acute accent
73 | â â small a, circumflex accent
74 | ã ã small a, tilde
75 | ä ä small a, umlaut mark
76 | å å small a, ring
77 | æ æ small ae
78 | ç ç small c, cedilla
79 | è è small e, grave accent
80 | é é small e, acute accent
81 | ê ê small e, circumflex accent
82 | ë ë small e, umlaut mark
83 | ì ì small i, grave accent
84 | í í small i, acute accent
85 | î î small i, circumflex accent
86 | ï ï small i, umlaut mark
87 | ð ð small eth, Icelandic
88 | ñ ñ small n, tilde
89 | ò ò small o, grave accent
90 | ó ó small o, acute accent
91 | ô ô small o, circumflex accent
92 | õ õ small o, tilde
93 | ö ö small o, umlaut mark
94 | ø ø small o, slash
95 | ù ù small u, grave accent
96 | ú ú small u, acute accent
97 | û û small u, circumflex accent
98 | ü ü small u, umlaut mark
99 | ý ý small y, acute accent
100 | þ þ small thorn, Icelandic
101 | ÿ ÿ small y, umlaut mark
--------------------------------------------------------------------------------
/ace2ner.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import csv
4 |
5 |
6 | def write_ner(ace_file, ner_file):
7 | all_lines = []
8 | with open(ace_file, 'r') as csv_file:
9 | reader = csv.DictReader(csv_file)
10 | for row in reader:
11 | token = row["token"]
12 | if token == "----sentence_delimiter----":
13 | all_lines.append("\n")
14 | else:
15 | token_offset_parts = row["offset"].split(':')
16 | offset_parts = token_offset_parts[1].split('-')
17 | token_start = int(offset_parts[0])
18 | token_end = int(offset_parts[1])
19 | if row["ner_type"] == "O":
20 | all_lines.append(row["token"] + " " + row["offset"] + " " + row["ner_type"] + "\n")
21 | else:
22 | ner_nam_nom = row["ner_nam_nom"]
23 | if ner_nam_nom == "NAM":
24 | ner_offset_parts = row["ner_offset"].split(':')
25 | ner_start = int(ner_offset_parts[0])
26 | ner_end = int(ner_offset_parts[1])
27 | ner_type_parts = row["ner_type"].split(":")
28 | tag = ner_type_parts[0] + "-" + determine_tag(token_start, token_end, ner_start, ner_end)
29 | all_lines.append(row["token"] + " " + row["offset"] + " " + tag + "\n")
30 | else:
31 | all_lines.append(row["token"] + " " + row["offset"] + " " + "O" + "\n")
32 | new_all_lines = validate_lines(all_lines)
33 | out = open(ner_file, 'w')
34 | for l in new_all_lines:
35 | out.write(l)
36 | out.close()
37 |
38 |
39 | def validate_lines(all_lines):
40 | new_all_lines = []
41 | pre_tag = ""
42 | for i in range(len(all_lines)):
43 | current_line = all_lines[i].strip()
44 | if len(current_line) == 0:
45 | new_all_lines.append(current_line + "\n")
46 | else:
47 | parts = current_line.split(' ')
48 | tag = parts[2]
49 | if tag.endswith("I") and not (pre_tag.endswith("B") or pre_tag.endswith("I")):
50 | print("Error " + current_line)
51 | new_line = all_lines[i].strip()[:-1] + "B"
52 | new_all_lines.append(new_line + "\n")
53 | else:
54 | new_all_lines.append(all_lines[i].strip() + "\n")
55 | pre_tag = tag
56 | return new_all_lines
57 |
58 |
59 | def determine_tag(token_start, token_end, ner_start, ner_end):
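    # B if the ner mention starts inside this token's span,
    # I if the token falls inside a mention that started in an earlier token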
60 | tag = "B"
61 | if token_start <= ner_start <= token_end:
62 | tag = "B"
63 | elif ner_start < token_start < ner_end:
64 | tag = "I"
65 | return tag
66 |
67 |
68 | if __name__ == "__main__":
69 | parser = argparse.ArgumentParser()
70 | parser.add_argument('--ace', type=str,
71 | help='ace input path')
72 | parser.add_argument('--ner', type=str,
73 | help='ner bio path')
74 |
75 | args = parser.parse_args()
76 |
77 | ace_path = args.ace
78 | ner_path = args.ner
79 |
80 | if not os.path.exists(ner_path):
81 | os.makedirs(ner_path)
82 |
83 | file_names = []
84 | if os.path.isdir(ace_path):
85 | file_names = [item[:-4]
86 | for item in os.listdir(ace_path)
87 | if item.endswith(".csv")]
88 |     else:
89 |         file_names = [os.path.basename(ace_path)[:-4]]
90 |         ace_path = os.path.dirname(ace_path)
91 | for f in file_names:
92 | print(f)
93 | ace_file= os.path.join(ace_path, f+".csv")
94 | ner_file = os.path.join(ner_path, f+".ner")
95 |
96 | if os.path.exists(ace_file):
97 | write_ner(ace_file, ner_file)
--------------------------------------------------------------------------------
/ltf2sent.py:
--------------------------------------------------------------------------------
1 | import codecs
2 | import os
3 | import argparse
4 | import sys
5 | import xml.etree.ElementTree as ET
6 |
7 |
8 | def ltf2sent(ltf_str):
9 | doc_tokens = load_ltf(ltf_str.encode('utf-8'))
10 |
11 | all_sents = []
12 | for sent in doc_tokens:
13 | sent_res = []
14 | for token in sent:
15 | t_text = token[0]
16 |             if t_text is None:
17 |                 t_text = ''
18 |             if not t_text.strip():
19 |                 continue
20 | # get token bio tag
21 | sent_res.append(t_text)
22 | all_sents.append(' '.join(sent_res))
23 |
24 | return '\n'.join(all_sents)
25 |
26 |
27 | def load_ltf(ltf_str):
28 | doc_tokens = []
29 | root = ET.fromstring(ltf_str)
30 | doc_id = root.find('DOC').get('id')
31 | for seg in root.find('DOC').find('TEXT').findall('SEG'):
32 | sent_tokens = []
33 | seg_text = seg.find('ORIGINAL_TEXT').text
34 | seg_start = int(seg.get('start_char'))
35 | seg_end = int(seg.get('end_char'))
36 | for token in seg.findall('TOKEN'):
37 | token_text = token.text
38 | start_char = int(token.get('start_char'))
39 | end_char = int(token.get('end_char'))
40 |
41 | assert seg_text[start_char-seg_start:end_char-seg_start+1] == token_text, \
42 | 'ltf2bio load_ltf token offset error.'
43 |
44 | sent_tokens.append((token_text, doc_id, start_char, end_char))
45 | doc_tokens.append(sent_tokens)
46 |
47 | return doc_tokens
48 |
49 |
50 | def write2file(bio_str, out_file):
51 | with codecs.open(out_file, 'w', 'utf-8') as f:
52 | f.write(bio_str)
53 |
54 |
55 | if __name__ == "__main__":
56 | parser = argparse.ArgumentParser()
57 | parser.add_argument('--ltf', type=str,
58 | help='ltf input path')
59 | parser.add_argument('--sent', type=str,
60 | help='output path')
61 | parser.add_argument('--ltf_filelist', type=str,
62 | help='ltf filelist path')
63 | parser.add_argument('-s', '--separate_output', action='store_true', default=True,
64 | help='separate output')
65 |
66 | args = parser.parse_args()
67 |
68 | ltf_input = args.ltf
69 | output = args.sent
70 | ltf_filelist = args.ltf_filelist
71 | separate_output = args.separate_output
72 |
73 | ltf_fp = []
74 | if os.path.isdir(ltf_input):
75 | if not os.path.exists(output):
76 | os.makedirs(output)
77 | if args.ltf_filelist:
78 | ltf_filelist = open(args.ltf_filelist).read().splitlines()
79 | ltf_fp = [os.path.join(ltf_input, item)
80 | for item in ltf_filelist]
81 | else:
82 | ltf_fp = [os.path.join(ltf_input, item)
83 | for item in os.listdir(args.ltf)
84 | if '.ltf.xml' in item]
85 | else:
86 | ltf_fp = [ltf_input]
87 |
88 | res = []
89 | for i, filepath in enumerate(ltf_fp):
90 |
91 | assert os.path.exists(filepath)
92 |
93 | print(filepath)
94 |
95 | ltf_str = codecs.open(filepath, 'r', 'utf-8').read()
96 | bio_str = ltf2sent(ltf_str)
97 | if separate_output:
98 | out_file = os.path.join(
99 | output, os.path.basename(filepath).replace('.ltf.xml', '')
100 | )
101 | write2file(bio_str, out_file)
102 | res.append(bio_str)
103 |
104 | sys.stdout.write('%d docs processed.\r' % i)
105 | sys.stdout.flush()
106 |
107 | if not separate_output:
108 |         write2file('\n\n'.join(res), output)
109 |
110 | print('%d docs processed in total.' % len(ltf_fp))
111 |
--------------------------------------------------------------------------------
/ltf2bio.py:
--------------------------------------------------------------------------------
1 | import codecs
2 | import os
3 | import argparse
4 | import sys
5 | import xml.etree.ElementTree as ET
6 |
7 |
8 | def ltf2bio(ltf_str):
9 | doc_tokens = load_ltf(ltf_str.encode('utf-8'))
10 |
11 | bio = []
12 | for sent in doc_tokens:
13 | sent_res = []
14 | for token in sent:
15 | t_text = token[0]
16 |             if t_text is None:
17 |                 t_text = ''
18 |             if not t_text.strip():
19 |                 continue
20 | t_doc_id = token[1]
21 | t_start_char = token[2]
22 | t_end_char = token[3]
23 |
24 | # get token bio tag
25 | sent_res.append(' '.join([t_text,
26 | '%s:%s-%s' % (t_doc_id,
27 | t_start_char,
28 | t_end_char)]))
29 | bio.append('\n'.join(sent_res))
30 |
31 | return '\n\n'.join(bio)
32 |
33 |
34 | def load_ltf(ltf_str):
35 | doc_tokens = []
36 | root = ET.fromstring(ltf_str)
37 | doc_id = root.find('DOC').get('id')
38 | for seg in root.find('DOC').find('TEXT').findall('SEG'):
39 | sent_tokens = []
40 | seg_text = seg.find('ORIGINAL_TEXT').text
41 | seg_start = int(seg.get('start_char'))
42 | seg_end = int(seg.get('end_char'))
43 | for token in seg.findall('TOKEN'):
44 | token_text = token.text
45 | start_char = int(token.get('start_char'))
46 | end_char = int(token.get('end_char'))
47 |
48 | assert seg_text[start_char-seg_start:end_char-seg_start+1] == token_text, \
49 | 'ltf2bio load_ltf token offset error.'
50 |
51 | sent_tokens.append((token_text, doc_id, start_char, end_char))
52 | doc_tokens.append(sent_tokens)
53 |
54 | return doc_tokens
55 |
56 |
57 | def write2file(bio_str, out_file):
58 | with codecs.open(out_file, 'w', 'utf-8') as f:
59 | f.write(bio_str)
60 |
61 |
62 | if __name__ == "__main__":
63 | parser = argparse.ArgumentParser()
64 | parser.add_argument('--ltf', type=str,
65 | help='ltf input path')
66 | parser.add_argument('--bio', type=str,
67 | help='output path')
68 | parser.add_argument('--ltf_filelist', type=str,
69 | help='ltf filelist path')
70 | parser.add_argument('-s', '--separate_output', action='store_true', default=True,
71 | help='separate output')
72 |
73 | args = parser.parse_args()
74 |
75 | ltf_input = args.ltf
76 | output = args.bio
77 | ltf_filelist = args.ltf_filelist
78 | separate_output = args.separate_output
79 |
80 | ltf_fp = []
81 | if os.path.isdir(ltf_input):
82 | if not os.path.exists(output):
83 | os.makedirs(output)
84 | if args.ltf_filelist:
85 | ltf_filelist = open(args.ltf_filelist).read().splitlines()
86 | ltf_fp = [os.path.join(ltf_input, item)
87 | for item in ltf_filelist]
88 | else:
89 | ltf_fp = [os.path.join(ltf_input, item)
90 | for item in os.listdir(args.ltf)
91 | if '.ltf.xml' in item]
92 | else:
93 | ltf_fp = [ltf_input]
94 |
95 | res = []
96 | for i, filepath in enumerate(ltf_fp):
97 |
98 | assert os.path.exists(filepath)
99 |
100 | print(filepath)
101 |
102 | ltf_str = codecs.open(filepath, 'r', 'utf-8').read()
103 | bio_str = ltf2bio(ltf_str)
104 | if separate_output:
105 | out_file = os.path.join(
106 | output, os.path.basename(filepath).replace('.ltf.xml', '.bio')
107 | )
108 | write2file(bio_str, out_file)
109 | res.append(bio_str)
110 |
111 | sys.stdout.write('%d docs processed.\r' % i)
112 | sys.stdout.flush()
113 |
114 | if not separate_output:
115 |         write2file('\n\n'.join(res), output)
116 |
117 | print('%d docs processed in total.' % len(ltf_fp))
118 |
--------------------------------------------------------------------------------
/source2rsd.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 |
4 | html_entities = []
5 |
6 | python_path = os.path.abspath(__file__).replace("source2rsd.py", "")
7 | with open(os.path.join(python_path, "html_entities"), 'r') as f:
8 | for line in f:
9 | parts = line.strip().split('\t')
10 | html_entities.append(parts[1])
11 |
12 |
13 | def remove_xml_tag(source_file, rsd_file, data):
14 | out = open(rsd_file, 'w')
15 | signal = 0 # 0 before read , 1 after
16 | lines = []
17 | with open(source_file, 'r') as f:
18 | for line in f:
19 | line = line.strip('\n')
20 | if line == "":
21 | signal = 1
22 | if signal == 0:
23 | new_line = remove_tag(line, data, signal)
24 | out.write(new_line + " ")
25 |             elif signal == 1:
26 |                 lines.append(line)
27 |         con_line = ' '.join(lines)
28 |         new_line = remove_tag(con_line, data, signal)
29 |         out.write(new_line + " ")
30 |     out.close()
31 |
32 |
33 | def remove_tag(sent, data, signal):
34 |     newsent = sent
35 |     if data == 'ace' or data.lower() == 'ace':
36 |         # strip the SGML tags; the header region (signal == 0, i.e. everything
37 |         # before the first blank line) is afterwards blanked out with spaces
38 |         while "<" in newsent and ">" in newsent and newsent.index("<") < newsent.index(">"):
39 |             index1 = newsent.index("<")
40 |             index2 = newsent.index(">")
41 |             str1 = newsent[0:index1]
42 |             str2 = newsent[index2+1:]
43 |             newsent = str1+str2
44 | 
45 |         if signal == 0:
46 |             newsent = ''.join(len(newsent) * [' '])
47 | 
48 |     elif data == 'ere' or data.lower() == 'ere':
49 |         # replace html entities with spaces of the same length to keep offsets
50 |         for ent in html_entities:
51 |             space_str = ''.join(len(ent)*[' '])
52 |             newsent = newsent.replace(ent, space_str)
53 | 
54 |         # blank out simple (closing) tags with spaces of equal length;
55 |         # the tag lists below are reconstructed from typical ERE markup
56 |         tags = ["</doc>", "</headline>", "</post>", "</quote>", "</a>", "</img>"]
57 |         for tag in tags:
58 |             newsent = newsent.replace(tag, ''.join(len(tag) * [' ']))
59 |         newsent = newsent.replace("=", " ")
60 |         newsent = newsent.replace("\"", " ")
61 | 
62 |         # blank out opening tags together with their attributes
63 |         tags1 = ["<doc", "<headline", "<post", "<quote", "<a ", "<img"]
64 |         for tag in tags1:
65 |             while tag in newsent and ">" in newsent and newsent.index(tag) < newsent.index(">"):
66 |                 idx1 = newsent.index(tag)
67 |                 idx2 = newsent.index(">")
68 |                 subsent1 = newsent[0:idx1]
69 |                 subsent2 = newsent[idx2+1:]
70 |                 subsent3 = newsent[idx1:idx2+1]
71 |                 spaces_str = ''.join(len(subsent3) * [' '])
72 |                 newsent = subsent1 + spaces_str + subsent2
73 |     return newsent
81 |
82 |
83 | if __name__ == "__main__":
84 | parser = argparse.ArgumentParser()
85 | parser.add_argument('--source', type=str,
86 | help='input path')
87 | parser.add_argument('--rsd', type=str,
88 | help='rsd path')
89 | parser.add_argument('--data', type=str,
90 | help='ace or ere')
91 | parser.add_argument('--extension', type=str, default=".sgm",
92 | help='')
93 |
94 | args = parser.parse_args()
95 |
96 | source_path = args.source
97 | rsd_path = args.rsd
98 | data = args.data
99 | suffix = args.extension
100 |
101 | if not os.path.exists(rsd_path):
102 | os.makedirs(rsd_path)
103 |
104 | file_names = []
105 | if os.path.isdir(source_path):
106 | file_names = [item for item in os.listdir(source_path) if item.endswith(suffix)]
107 | else:
108 | file_names = [source_path]
109 |
110 | for f in file_names:
111 | source_file= os.path.join(source_path, f)
112 | rsd_file = os.path.join(rsd_path, f)
113 |
114 | if os.path.exists(source_file):
115 | remove_xml_tag(source_file, rsd_file, data)
--------------------------------------------------------------------------------
/tokenizer.py:
--------------------------------------------------------------------------------
1 | #encoding=utf-8
2 | import os
3 | import jieba
4 | import nltk
5 | import re
6 | import itertools
7 | import unicodedata as ud
8 |
9 |
10 | class Tokenizer(object):
11 | def __init__(self, seg_option="linebreak", tok_option="unitok"):
12 | self.segmenters = {'linebreak': self.seg_linebreak,
13 | 'nltk': self.seg_nltk,
14 | 'cmn': self.seg_cmn,
15 | 'edl_spanish': self.seg_edl_spanish,
16 | 'edl_cmn': self.seg_edl_cmn,
17 | 'nltk+linebreak': self.seg_nltk_linebreak,
18 | 'tigrinya': self.seg_tigrinya
19 | }
20 | self.tokenizers = {'unitok': self.tok_unitok,
21 | 'unitok_cut': self.tok_unitok_cut,
22 | 'regexp': self.tok_regexp,
23 | 'nltk_wordpunct': self.tok_nltk_wordpunct,
24 | 'space': self.tok_space,
25 | 'char': self.tok_char,
26 | 'jieba': self.tok_jieba,
27 | }
28 |
29 | self.root_dir = os.path.dirname(os.path.abspath(__file__))
30 |
31 | self.seg_option = seg_option
32 | self.tok_option = tok_option
33 |
34 | # initialize jieba cn tok
35 | if tok_option == 'jieba':
36 | jieba.initialize()
37 |
38 | def run_segmenter(self, plain_text):
39 | # right strip plain text
40 | plain_text = plain_text.rstrip()
41 |
42 | # run segmenter
43 | sents = self.segmenters[self.seg_option](plain_text)
44 |
45 | sents = [s for s in sents if s.strip()]
46 |
47 | return sents
48 |
49 | def run_tokenizer(self, sents):
50 | # right strip each sent
51 | for i in range(len(sents)):
52 | sents[i] = sents[i].rstrip()
53 |
54 | # run tokenizer
55 | tokenized_sents = self.tokenizers[self.tok_option](sents)
56 |
57 | for i, s in enumerate(tokenized_sents):
58 | s = [t for t in s if t.strip()]
59 | tokenized_sents[i] = s
60 |
61 | return tokenized_sents
62 |
63 | #
64 | # segmenters
65 | #
66 | def seg_linebreak(self, plain_text):
67 | """
68 | use "\n" as delimiter
69 | :param plain_text:
70 | :return:
71 | """
72 | result = [item.strip() for item in plain_text.split('\n') if item.strip()]
73 |
74 | return result
75 |
76 | def seg_nltk(self, plain_text):
77 | """
78 | use nltk default segmenter
79 | :param plain_text:
80 | :return:
81 | """
82 | result = [item.strip() for item in nltk.sent_tokenize(plain_text)]
83 |
84 | return result
85 |
86 | def seg_nltk_linebreak(self, plain_text):
87 | """
88 | use nltk segmenter and then use "\n" as delimiter to re-segment.
89 | :param plain_text:
90 | :return:
91 | """
92 | nltk_result = '\n'.join(self.seg_nltk(plain_text))
93 | linebreak_result = self.seg_linebreak(nltk_result)
94 |
95 | return linebreak_result
96 |
97 | def seg_cmn(self, plain_text):
98 | """
99 | use Chinese punctuation as delimiter
100 | :param plain_text:
101 | :return:
102 | """
103 | res = []
104 | sent_end_char = [u'。', u'!', u'?']
105 | current_sent = ''
106 | for i, char in enumerate(list(plain_text)):
107 | if char in sent_end_char or i == len(list(plain_text)) - 1:
108 | res.append(current_sent + char)
109 | current_sent = ''
110 | else:
111 | current_sent += char
112 |
113 | return [item.strip() for item in res]
114 |
115 | def seg_edl(self, plain_text, seg_option):
116 | # replace \n with ' ' because of the fix line length of edl data
117 | # plain_text = plain_text.replace('\n', ' ')
118 |
119 | # do sentence segmentation
120 | if seg_option == 'edl_spanish':
121 | # use nltk sent tokenization for spanish
122 | tmp_seg = nltk.sent_tokenize(plain_text)
123 | if seg_option == 'edl_cmn':
124 | # use naive sent tokenization for chinese
125 | tmp_seg = self.seg_cmn(plain_text)
126 |
127 | # recover \n after xml tag
128 | recovered_tmp_seg = []
129 | for sent in tmp_seg:
130 | sent = sent.replace('> ', '>\n').replace(' <', '\n<')
131 | sent = sent.split('\n')
132 | recovered_tmp_seg += [item.strip() for item in sent]
133 |
134 | return recovered_tmp_seg
135 |
136 | def seg_edl_spanish(self, plain_text):
137 | return self.seg_edl(plain_text, 'edl_spanish')
138 |
139 | def seg_edl_cmn(self, plain_text):
140 | return self.seg_edl(plain_text, 'edl_cmn')
141 |
142 | def seg_tigrinya(self, plain_text):
143 | result = [item.strip() for item in plain_text.split('\n') if
144 | item.strip()]
145 |
146 | updated_result = []
147 | for r in result:
148 | if '።' in r:
149 | sents = []
150 | start = 0
151 | for i, char in enumerate(r):
152 | if char == '።':
153 | sents.append(r[start:i+1])
154 | start = i + 1
155 | updated_result += sents
156 | else:
157 | updated_result.append(r)
158 |
159 | return updated_result
160 |
161 | #
162 | # tokenizers
163 | #
164 | def tok_unitok(self, sents):
165 | res = []
166 | for s in sents:
167 | s = unitok_tokenize(s).split()
168 | res.append(s)
169 |
170 | return res
171 |
172 | def tok_unitok_cut(self, sents):
173 | res = []
174 | num_sent_cut = 0
175 | for s in sents:
176 | s = unitok_tokenize(s).split()
177 | if len(s) > 80:
178 | sub_sents = [item.split() for item in nltk.sent_tokenize(' '.join(s))]
179 | assert sum([len(item) for item in sub_sents]) == len(s)
180 |
181 | # sub_sent = [list(group) for k, group in
182 | # itertools.groupby(s, lambda x: x == ".") if not k]
183 | res += sub_sents
184 | if len(sub_sents) > 1:
185 | num_sent_cut += 1
186 | else:
187 | res.append(s)
188 |         print('%d sentences longer than 80 tokens were cut into sub-sentences.' % num_sent_cut)
189 | return res
190 |
191 | def tok_regexp(self, sents):
192 | result = []
193 | for s in sents:
194 |             tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
195 | tokenization_out = tokenizer.tokenize(s)
196 | result.append(tokenization_out)
197 |
198 | return result
199 |
200 | def tok_nltk_wordpunct(self, sents):
201 | result = []
202 | for s in sents:
203 | tokenizer = nltk.tokenize.WordPunctTokenizer()
204 | tokenization_out = tokenizer.tokenize(s)
205 | result.append(tokenization_out)
206 | return result
207 |
208 | def tok_space(self, sents):
209 | result = []
210 | for s in sents:
211 | tokenization_out = s.split(' ')
212 | result.append(tokenization_out)
213 | return result
214 |
215 | def tok_char(self, sents):
216 | result = []
217 | for s in sents:
218 | tokenization_out = list(s)
219 | result.append(tokenization_out)
220 | return result
221 |
222 | def tok_jieba(self, sents):
223 | result = []
224 | for s in sents:
225 | raw_tokenization_out = list(jieba.cut(s))
226 | result.append(raw_tokenization_out)
227 | return result
228 |
229 |
230 | # by Jon May
231 | def unitok_tokenize(data):
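    # character-level tokenizer: surrounds Unicode punctuation (P*) and symbol (S*)
    # characters with spaces, keeps apostrophe-like characters attached to the
    # surrounding token, then collapses the result to single-space-separated tokens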
232 | toks = []
233 | for offset, char in enumerate(data):
234 | cc = ud.category(char)
235 | # separate text by punctuation or symbol
236 | if char in ['ʼ', '’', '‘', '´', '′', "'"]: # do not tokenize oromo apostrophe
237 | toks.append(char)
238 | elif cc.startswith("P") or cc.startswith("S") \
239 | or char in ['።', '፡']: # Tigrinya period and comma
240 | toks.append(' ')
241 | toks.append(char)
242 | toks.append(' ')
243 | else:
244 | toks.append(char)
245 |
246 | toks = [item for item in ''.join(toks).split() if item]
247 |
248 | return ' '.join(toks)
--------------------------------------------------------------------------------
/rsd2ltf.py:
--------------------------------------------------------------------------------
1 | #encoding=utf-8
2 | import os
3 | import argparse
4 | import sys
5 | import itertools
6 | import xml.dom.minidom
7 | import xml.etree.ElementTree as ET
8 | import codecs
9 |
10 | # dirty import from current dir
11 | script_dirname = os.path.dirname(os.path.abspath(__file__))
12 | sys.path.append(script_dirname)
13 | from tokenizer import Tokenizer
14 |
15 |
16 | def rsd2ltf(rsd_str, doc_id,
17 | seg_option='linebreak',
18 | tok_option='unitok',
19 | re_segment=False):
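    # returns an LCTL_TEXT/DOC/TEXT ElementTree root whose SEG and TOKEN elements
    # carry start_char/end_char offsets into rsd_str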
20 | tokenizer = Tokenizer(seg_option, tok_option)
21 |
22 | if re_segment:
23 | # running segmentation and tokenization, then re-segment the tokenized
24 | # sentences (use space to concatenate tokens. this solves segmentation
25 | # problem, e.g. How are you?I'm fine.).
26 | sents = tokenizer.run_segmenter(rsd_str)
27 | raw_tokens = tokenizer.run_tokenizer(sents)
28 |
29 | # re-segment tokenized sentence
30 | num_sent_reseg = 0
31 | tokens = []
32 | for i, t in enumerate(raw_tokens):
33 | reseg = [item.split() for item in tokenizer.run_segmenter(' '.join(t))]
34 | if len(reseg) > 1:
35 | num_sent_reseg += 1
36 |
37 | tokens += reseg
38 |
39 | # compute offset for each token
40 | indexer = 0
41 | token_offset = []
42 | for i, t in enumerate(itertools.chain(*tokens)):
43 | while not rsd_str[indexer:].startswith(t) and \
44 | indexer < len(rsd_str):
45 | indexer += 1
46 | if indexer < len(rsd_str):
47 | t_start = indexer
48 | t_end = t_start + len(t) - 1
49 | assert rsd_str[t_start:t_end + 1] == t, \
50 | "re_segment token offset not match %s-%d" % (doc_id, i)
51 | token_offset.append((t_start, t_end))
52 | indexer = t_end + 1
53 |
54 | assert len(token_offset) == len(list(itertools.chain(*tokens))), \
55 | "re_segment tokenization offset error in: %s" % doc_id
56 |
57 | # recover sent using tokens
58 | sents = []
59 | prev_token_end = token_offset[0][0]-1
60 | token_index = 0
61 | for i, t in enumerate(tokens):
62 | sent = ''
63 | for j, item in enumerate(t):
64 | if j == 0:
65 | prev_token_end = token_offset[token_index][0] - 1
66 |
67 | sent += ' ' * (token_offset[token_index][0] - prev_token_end - 1) + item
68 |
69 | prev_token_end = token_offset[token_index][1]
70 |
71 | token_index += 1
72 |
73 | assert sent in rsd_str, \
74 | 're_segment sentence offset error.'
75 |
76 | sents.append(sent)
77 |
78 | else:
79 | # running segmentation and tokenization
80 | sents = tokenizer.run_segmenter(rsd_str)
81 | tokens = tokenizer.run_tokenizer(sents)
82 |
83 | # generate offset for sentences and tokens
84 | indexer = 0
85 | sent_offset = []
86 | for i, s in enumerate(sents):
87 | while not rsd_str[indexer:].startswith(s) and indexer < len(rsd_str):
88 | indexer += 1
89 | if indexer < len(rsd_str):
90 | sent_start = indexer
91 | sent_end = sent_start + len(s) - 1
92 | assert rsd_str[sent_start:sent_end+1] == s, \
93 | "sentence offset not match %s-%d" % (doc_id, i)
94 | sent_offset.append((sent_start, sent_end))
95 | indexer = sent_end + 1
96 |
97 | assert len(sent_offset) == len(sents), \
98 | "sentence segmentation offset error in: %s" % doc_id
99 |
100 | token_offsets = []
101 | for i, tok in enumerate(tokens):
102 | sent_text = sents[i]
103 | indexer = 0
104 | t_offset = []
105 | for j, t in enumerate(tok):
106 | while not sent_text[indexer:].startswith(t) and \
107 | indexer < len(sent_text):
108 | indexer += 1
109 | if indexer < len(sent_text):
110 | t_start = indexer
111 | t_end = t_start + len(t) - 1
112 | assert sent_text[t_start:t_end+1] == t, \
113 | "token offset not match %s-%d-%d" % (doc_id, i, j)
114 | t_offset.append((t_start, t_end))
115 | indexer = t_end + 1
116 | token_offsets.append(t_offset)
117 |
118 | assert len(t_offset) == len(tok), \
119 | "tokenization offset error in: %s-%d" % (doc_id, i)
120 |
121 | # convert seg/tok result to ltf
122 | root = ET.Element('LCTL_TEXT')
123 | doc_element = ET.Element('DOC', {'id': doc_id})
124 | text_element = ET.Element('TEXT')
125 | root.append(doc_element)
126 | doc_element.append(text_element)
127 |
128 | for i in range(len(sents)):
129 | seg_text = sents[i]
130 | seg_start_char = sent_offset[i][0]
131 | seg_end_char = sent_offset[i][1]
132 |
133 | seg_id = '%s-%s' % (doc_id, str(i))
134 |
135 | seg_element = ET.Element('SEG', {'id': seg_id,
136 | 'start_char': str(seg_start_char),
137 | 'end_char': str(seg_end_char)})
138 | original_text_element = ET.Element('ORIGINAL_TEXT')
139 | original_text_element.text = seg_text
140 | seg_element.append(original_text_element)
141 |
142 | for j in range(len(tokens[i])):
143 | token_id = 'token-%d-%d' % (i, j)
144 | tok_text = tokens[i][j]
145 | if not tok_text:
146 | continue
147 | tok_start_char = int(token_offsets[i][j][0]) + seg_start_char
148 | tok_end_char = int(token_offsets[i][j][1]) + seg_start_char
149 |
150 | assert rsd_str[tok_start_char:tok_end_char+1] == tok_text
151 |
152 | token_element = ET.Element('TOKEN',
153 | {'id': token_id,
154 | 'start_char': str(tok_start_char),
155 | 'end_char': str(tok_end_char)})
156 | token_element.text = tok_text
157 | seg_element.append(token_element)
158 |
159 | text_element.append(seg_element)
160 |
161 | return root
162 |
163 |
164 | def write2file(ltf_root, out_file):
165 | # pretty print xml
166 | root_str = ET.tostring(ltf_root, 'utf-8')
167 | f_xml = xml.dom.minidom.parseString(root_str)
168 | pretty_xml_as_string = f_xml.toprettyxml(encoding="utf-8")
169 | f = open(out_file, 'wb')
170 | f.write(pretty_xml_as_string)
171 | f.close()
172 |
173 |
174 | if __name__ == "__main__":
175 | parser = argparse.ArgumentParser()
176 | parser.add_argument('--rsd', type=str,
177 | help='input rsd file path or directory.')
178 | parser.add_argument('--ltf', type=str,
179 | help='output ltf file path or directory.')
180 | t = Tokenizer()
181 | parser.add_argument('--seg_option', default='nltk+linebreak',
182 |                         help="segmentation options: %s (default is nltk+linebreak)" %
183 | ', '.join(t.segmenters.keys()))
184 | parser.add_argument('--tok_option', default='unitok',
185 | help="tokenization options: %s (default is unitok)" %
186 | ', '.join(t.tokenizers.keys()))
187 | parser.add_argument('--extension', default=".sgm",
188 | help="extension of rsd file")
189 | parser.add_argument('--re_segment', action='store_true', default=False,
190 |                         help='first run segmentation and tokenization, then re-segment the tokenized sentences.')
191 |
192 | args = parser.parse_args()
193 |
194 | input_rsd = args.rsd
195 | output_ltf = args.ltf
196 | seg_option = args.seg_option
197 | tok_option = args.tok_option
198 | extension = args.extension
199 | re_segment = args.re_segment
200 |
201 | rsd_files = []
202 | output_files = []
203 | if os.path.isdir(input_rsd):
204 | if not os.path.exists(output_ltf):
205 | os.makedirs(output_ltf)
206 |
207 | for fn in os.listdir(input_rsd):
208 | if extension not in fn:
209 | continue
210 | rsd_files.append(os.path.join(input_rsd, fn))
211 | output_files.append(os.path.join(output_ltf,
212 | fn.replace(extension, '.ltf.xml')))
213 | else:
214 | rsd_files = [input_rsd]
215 | output_files = [output_ltf]
216 |
217 | for k, rsd_f in enumerate(rsd_files):
218 | try:
219 | rsd_str = codecs.open(rsd_f, 'r', 'utf-8').read()
220 |
221 | doc_id = os.path.basename(rsd_f).replace(extension, '')
222 |
223 | ltf_root = rsd2ltf(rsd_str, doc_id, seg_option, tok_option,
224 | re_segment)
225 |
226 | write2file(ltf_root, output_files[k])
227 |
228 | except AssertionError as e:
229 | print(e)
230 |
231 | sys.stdout.write('%d files processed.\r' % k)
232 | sys.stdout.flush()
233 |
234 | sys.stdout.write('%d files processed.' % len(rsd_files))
235 |
--------------------------------------------------------------------------------
/ace2event.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import csv
4 |
5 | import spacy
6 | from spacy.tokens import Doc
7 |
8 |
9 | class WhitespaceTokenizer(object):
10 | def __init__(self, vocab):
11 | self.vocab = vocab
12 |
13 | def __call__(self, text):
14 | words = text.split(' ')
15 | # All tokens 'own' a subsequent space character in this tokenizer
16 | spaces = [True] * len(words)
17 | return Doc(self.vocab, words=words, spaces=spaces)
18 |
19 |
20 | def write_event(ace_file, trigger_file, arg_file, dep, nlp):
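    # trigger_file: one token per line with
    #   sent_id, token_idx, token, doc_id:start-end offset, trigger BIO tag
    # arg_file: one (trigger token, candidate argument token) pair per line with
    #   sent_id, e1_idx, e1_token, e1_offset, e1_label, e2_idx, e2_token,
    #   e2_offset, e2_label, role, dependency label (or NA), e2 ner tag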
21 | all_sents = []
22 | sent = []
23 | with open(ace_file, 'r') as csv_file:
24 | reader = csv.DictReader(csv_file)
25 | for row in reader:
26 | token = row["token"]
27 | if token == "----sentence_delimiter----":
28 | all_sents.append(sent)
29 | sent = []
30 | else:
31 | token_offset_parts = row["offset"].split(':')
32 | offset_parts = token_offset_parts[1].split('-')
33 | token_start = int(offset_parts[0])
34 | token_end = int(offset_parts[1])
35 |
36 | ner_tag = "O"
37 | if row["ner_type"] != "O":
38 | ner_offset = row["ner_offset"].split(":")
39 | ner_start = int(ner_offset[0])
40 | if "#@#" in ner_offset[1]:
41 | ner_offset_parts = ner_offset[1].split("#@#")
42 | ner_end = int(ner_offset_parts[0])
43 | else:
44 | ner_end = int(ner_offset[1])
45 | ner_type_parts = row["ner_type"].split(":")
46 | ner_tag = ner_type_parts[0] + "-" + determine_tag(token_start, token_end, ner_start, ner_end)
47 | if row["trigger_type"] == "O":
48 | sent.append(row["token"] + "\t" + row["offset"] + "\t" + row["trigger_type"] + "\t" +
49 | row["trigger_arguments"] + "\t" + ner_tag)
50 | else:
51 | event_offset_parts = row["trigger_offset"].split(':')
52 | event_start = int(event_offset_parts[0])
53 | event_end = int(event_offset_parts[1])
54 | event_type_parts = row["trigger_type"].split(":")
55 | tag = event_type_parts[1] + "-" + determine_tag(token_start, token_end, event_start, event_end)
56 | sent.append(row["token"] + "\t" + row["offset"] + "\t" + tag + "\t" + row["trigger_arguments"]
57 | + "\t" + ner_tag)
58 | if len(sent) > 0:
59 | all_sents.append(sent)
60 | sent = []
61 |
62 | vtag_all_sents = validate_tags(all_sents) # check if a mention starts with "I" without "B"
63 | vseg_all_sents = validate_sent_seg(vtag_all_sents) # check if an event mention occurs in separate sents
64 |
65 | # write trigger and argument file
66 | out_trigger = open(trigger_file, 'w')
67 | out_arg = open(arg_file, 'w')
68 |
69 | for i in range(len(vseg_all_sents)):
70 | sent_id = i
71 | current_sent = vseg_all_sents[i]
72 |
73 | tok_idx2token = {}
74 | tok_idx2offset = {}
75 | tok_idx2label = {}
76 | tok_idx2ner = {}
77 | trigger_b2i = {}
78 | # write triggers
79 | pre_b_idx = -1
80 | for t in range(len(current_sent)):
81 | parts = current_sent[t].strip().split('\t')
82 | out_trigger.write(str(sent_id) + '\t' + str(t) + '\t' + parts[0] + '\t' + parts[1] + '\t' + parts[2] + "\n")
83 | tok_idx2offset[t] = parts[1]
84 | tok_idx2token[t] = parts[0]
85 | tok_idx2label[t] = parts[2]
86 | tok_idx2ner[t] = parts[-1]
87 | if parts[2].endswith('B'):
88 | pre_b_idx = t
89 | trigger_b2i[t] = [t]
90 | elif parts[2].endswith('O'):
91 | pre_b_idx = -1
92 | elif parts[2].endswith('I'):
93 | tmp = trigger_b2i[pre_b_idx]
94 | tmp.append(t)
95 | trigger_b2i[pre_b_idx] = tmp
96 | out_trigger.write('\n')
97 |
98 | # write arguments
99 | trigger2arg2role_idx = {}
100 | for t in range(len(current_sent)):
101 | parts = current_sent[t].strip().split('\t')
102 | if parts[2].endswith("B"):
103 | e1_idx = t
104 | arg_str = parts[3]
105 | if arg_str != 'O':
106 | args = arg_str.split(' ')
107 | for arg in args:
108 | arg_parts = arg.split(':')
109 | start = int(arg_parts[2])
110 | end = int(arg_parts[3])
111 | role = arg_parts[1]
112 | e2_idx_set = search_e2(tok_idx2offset, start, end)
113 | e1_idx_set = trigger_b2i[e1_idx]
114 | e2_idx = e2_idx_set[0]
115 | if e1_idx in trigger2arg2role_idx:
116 | arg2role = trigger2arg2role_idx[e1_idx]
117 | arg2role[e2_idx] = role + "-B"
118 | trigger2arg2role_idx[e1_idx] = arg2role
119 | else:
120 | arg2role = {e2_idx: role + "-B"}
121 | trigger2arg2role_idx[e1_idx] = arg2role
122 |
123 | for e2_idx_tmp in e2_idx_set[1:]:
124 | if e1_idx in trigger2arg2role_idx:
125 | arg2role = trigger2arg2role_idx[e1_idx]
126 | arg2role[e2_idx_tmp] = role + "-I"
127 | trigger2arg2role_idx[e1_idx] = arg2role
128 | else:
129 | arg2role = {e2_idx_tmp: role + "-I"}
130 | trigger2arg2role_idx[e1_idx] = arg2role
131 |
132 | for e1_idx_tmp in e1_idx_set[1:]:
133 | for e2_idx_tmp in e2_idx_set:
134 | if e1_idx_tmp in trigger2arg2role_idx:
135 | arg2role = trigger2arg2role_idx[e1_idx_tmp]
136 | arg2role[e2_idx_tmp] = role + "-I"
137 | trigger2arg2role_idx[e1_idx_tmp] = arg2role
138 | else:
139 | arg2role = {e2_idx_tmp: role + "-I"}
140 | trigger2arg2role_idx[e1_idx_tmp] = arg2role
141 |
142 | mod2head2dep = {}
143 | if dep:
144 | sent = ' '.join([t.split('\t')[0] for t in current_sent])
145 | doc_sent = nlp(sent)
146 |
147 |             for w in range(len(doc_sent)):
148 |                 mod2head2dep[w] = {doc_sent[w].head.i: doc_sent[w].dep_}
149 | assert len(doc_sent) == len(current_sent)
150 |
151 | for t1 in range(len(current_sent)):
152 | e1_idx = t1
153 | e1_token = tok_idx2token[t1]
154 | e1_offset = tok_idx2offset[t1]
155 | e1_label = tok_idx2label[t1]
156 | for t2 in range(len(current_sent)):
157 | e2_idx = t2
158 | e2_token = tok_idx2token[t2]
159 | e2_offset = tok_idx2offset[t2]
160 | e2_label = tok_idx2label[t2]
161 | e2_ner = tok_idx2ner[t2]
162 |
163 | role = "O"
164 | if t1 in trigger2arg2role_idx and t2 in trigger2arg2role_idx[t1]:
165 | role = trigger2arg2role_idx[t1][t2]
166 |
167 | if dep == "bi":
168 | if e1_idx in mod2head2dep and e2_idx in mod2head2dep[e1_idx]:
169 | out_arg.write(
170 | str(sent_id) + '\t' + str(e1_idx) + '\t' + e1_token + '\t' + e1_offset + '\t' +
171 | e1_label + '\t' + str(e2_idx) + '\t' + e2_token + '\t' + e2_offset + '\t' +
172 | e2_label + '\t' + role + '\t' + mod2head2dep[e1_idx][e2_idx] + "\t" + e2_ner + '\n')
173 | elif e2_idx in mod2head2dep and e1_idx in mod2head2dep[e2_idx]:
174 | out_arg.write(
175 | str(sent_id) + '\t' + str(e1_idx) + '\t' + e1_token + '\t' + e1_offset + '\t' +
176 | e1_label + '\t' + str(e2_idx) + '\t' + e2_token + '\t' + e2_offset + '\t' +
177 | e2_label + '\t' + role + '\t' + mod2head2dep[e2_idx][e1_idx] + "\t" + e2_ner + '\n')
178 | else:
179 | out_arg.write(
180 | str(sent_id) + '\t' + str(e1_idx) + '\t' + e1_token + '\t' + e1_offset + '\t' +
181 | e1_label + '\t' + str(e2_idx) + '\t' + e2_token + '\t' + e2_offset + '\t' +
182 | e2_label + '\t' + role + '\t' + "NA" + "\t" + e2_ner + '\n')
183 | elif dep == "un":
184 | if e1_idx in mod2head2dep and e2_idx in mod2head2dep[e1_idx]:
185 | out_arg.write(
186 | str(sent_id) + '\t' + str(e1_idx) + '\t' + e1_token + '\t' + e1_offset + '\t' +
187 | e1_label + '\t' + str(e2_idx) + '\t' + e2_token + '\t' + e2_offset + '\t' +
188 | e2_label + '\t' + role + '\t' + mod2head2dep[e1_idx][e2_idx] + "\t" + e2_ner + '\n')
189 | else:
190 | out_arg.write(
191 | str(sent_id) + '\t' + str(e1_idx) + '\t' + e1_token + '\t' + e1_offset + '\t' +
192 | e1_label + '\t' + str(e2_idx) + '\t' + e2_token + '\t' + e2_offset + '\t' +
193 | e2_label + '\t' + role + '\t' + "NA" + "\t" + e2_ner + '\n')
194 | else:
195 | out_arg.write(str(sent_id) + '\t' + str(e1_idx) + '\t' + e1_token + '\t' + e1_offset + '\t' +
196 | e1_label + '\t' + str(e2_idx) + '\t' + e2_token + '\t' + e2_offset + '\t' +
197 | e2_label + '\t' + role + '\t' + 'NA' + "\t" + e2_ner + '\n')
198 | out_arg.write("\n")
199 |
200 | out_trigger.close()
201 | out_arg.close()
202 |
203 |
204 | def search_e2(tok_idx2offset, start, end):
205 | e2_idx = []
206 | for i in range(len(tok_idx2offset)):
207 | offset_parts = tok_idx2offset[i].split(':')[1].split('-')
208 | c_start = int(offset_parts[0])
209 | c_end = int(offset_parts[1])
210 | if start <= c_end <= end or start <= c_start <= end:
211 | e2_idx.append(i)
212 | return e2_idx
213 |
214 |
215 | def validate_sent_seg(all_sents):
216 | cluster_idx = 0
217 | sent2cluster = {}
218 | merge_pre = False
219 | current_merge_next = False
220 | pre_merge_next = False
221 | current_single = False
222 | for i in range(len(all_sents)):
223 | current_sent = all_sents[i]
224 | sent_min, sent_max, ann_min, ann_max = get_offset_limit(current_sent)
225 |
226 | if sent_min <= ann_min and sent_max >= ann_max:
227 | current_single = True
228 | if sent_min > ann_min:
229 | merge_pre = True
230 | if sent_max < ann_max:
231 | current_merge_next = True
232 |
233 | if merge_pre:
234 | sent2cluster[i] = cluster_idx
235 | if not merge_pre and not current_merge_next and not pre_merge_next and current_single:
236 | sent2cluster[i] = cluster_idx+1
237 | cluster_idx += 1
238 | if pre_merge_next:
239 | sent2cluster[i] = cluster_idx
240 | if current_merge_next and not pre_merge_next:
241 | sent2cluster[i] = cluster_idx+1
242 | cluster_idx += 1
243 |
244 | merge_pre = False
245 | current_single = False
246 | pre_merge_next = current_merge_next
247 | current_merge_next = False
248 |
249 | cluster2sent = {}
250 | cluster_list = []
251 | for i in range(len(all_sents)):
252 | c = sent2cluster[i]
253 | if c not in cluster2sent:
254 | tmp = [i]
255 | cluster2sent[c] = tmp
256 | cluster_list.append(c)
257 | else:
258 | tmp = cluster2sent[c]
259 | tmp.append(i)
260 |
261 | new_all_sents = []
262 | for c in cluster_list:
263 | sids = cluster2sent[c]
264 | if len(sids) > 1:
265 | print(cluster2sent)
266 | newsents = []
267 | for s in sids:
268 | newsents += all_sents[s]
269 | new_all_sents.append(newsents)
270 | return new_all_sents
271 |
272 |
273 | def get_offset_limit(current_sent):
274 | first_tok_offset = current_sent[0].split('\t')[1].split(':')[1].split('-')
275 | sent_min = int(first_tok_offset[0])
276 | last_tok_offset = current_sent[-1].split('\t')[1].split(':')[1].split('-')
277 | sent_max = int(last_tok_offset[1])
278 |
279 | ann_min = 100000
280 | ann_max = 0
281 | for line in current_sent:
282 | arg_str = line.strip().split('\t')[3]
283 | if arg_str != "O":
284 | arg_parts = arg_str.split(' ')
285 | for arg in arg_parts:
286 | parts = arg.split(':')
287 | s = int(parts[2])
288 | e = int(parts[3])
289 | if s < ann_min:
290 | ann_min = s
291 | if e > ann_max:
292 | ann_max = e
293 | if ann_min == 100000 and ann_max == 0:
294 | ann_min = sent_min
295 | ann_max = sent_max
296 | return sent_min, sent_max, ann_min, ann_max
297 |
298 |
299 | def validate_tags(all_sents):
300 | new_all_sents = []
301 | pre_tag = ""
302 | for sents in all_sents:
303 | new_sents = []
304 | for i in range(len(sents)):
305 | current_line = sents[i].strip('\n')
306 | if len(current_line) == 0:
307 | new_sents.append(current_line + "\n")
308 | else:
309 | parts = current_line.split('\t')
310 | tag = parts[2]
311 | if tag.endswith("I") and not (pre_tag.endswith("B") or pre_tag.endswith("I")):
312 | print("Error " + current_line)
313 | new_line = sents[i].strip()[:-1] + "B"
314 | new_sents.append(new_line + "\n")
315 | else:
316 | new_sents.append(sents[i].strip() + "\n")
317 | pre_tag = tag
318 | new_all_sents.append(new_sents)
319 | return new_all_sents
320 |
321 |
322 | def determine_tag(token_start, token_end, ner_start, ner_end):
323 | tag = "B"
324 | if token_start <= ner_start <= token_end:
325 | tag = "B"
326 | elif ner_start < token_start <= ner_end:
327 | tag = "I"
328 | return tag
329 |
330 |
331 | if __name__ == "__main__":
332 | parser = argparse.ArgumentParser()
333 | parser.add_argument('--ace', type=str,
334 | help='ace input path')
335 | parser.add_argument('--event', type=str,
336 | help='event path')
337 | parser.add_argument('--dep', type=str, default=None,
338 | help='apply dependency parser or not')
339 |
340 | args = parser.parse_args()
341 |
342 | ace_path = args.ace
343 | event_path = args.event
344 | dep = args.dep
345 |
346 | nlp = None
347 | if dep:
348 | # import en_core_web_sm
349 | # nlp = en_core_web_sm.load()
350 | nlp = spacy.load("en_core_web_sm")# , disable=["tagger", "ner", "textcat"]
351 | nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
352 |
353 | if not os.path.exists(event_path):
354 | os.makedirs(event_path)
355 |
356 | file_names = []
357 | if os.path.isdir(ace_path):
358 | file_names = [item[:-4]
359 | for item in os.listdir(ace_path)
360 | if item.endswith(".csv")]
361 |     else:
362 |         file_names = [os.path.basename(ace_path)[:-4]]
363 |         ace_path = os.path.dirname(ace_path)
364 | for f in file_names:
365 | print(f)
366 | ace_file= os.path.join(ace_path, f+".csv")
367 | trigger_file = os.path.join(event_path, f+".trigger")
368 | arg_file = os.path.join(event_path, f + ".arg")
369 |
370 | if os.path.exists(ace_file):
371 | write_event(ace_file, trigger_file, arg_file, dep, nlp)
372 |
--------------------------------------------------------------------------------
/bio2ere.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import csv
4 | import xml.etree.ElementTree as ET
5 |
6 |
7 | # word, offset, nertag, relation, trigger, argument
8 | def write_ann(bio_file, ann_file, ace_file):
9 | csv_file = open(ace_file, 'w')
10 | fields = ['token', 'offset', 'ner_offset', 'ner_type', 'ner_nam_nom', 'ner_cluster',
11 | 'filler_offset', 'filler_type',
12 | 'relations_belong_to',
13 | 'trigger_offset', 'trigger_type', 'trigger_cluster', 'trigger_arguments']
14 | writer = csv.DictWriter(csv_file, fieldnames=fields)
15 | writer.writeheader()
16 |
17 | entity_mentions_mentionid2dict, filler_mentions_mentionid2dict, relation_mentions_id2dict, \
18 | event_mentions_id2dict = parse_ann(ann_file)
19 |
20 | with open(bio_file, 'r') as f:
21 | for line in f:
22 | line = line.strip()
23 | if len(line) > 0:
24 | parts = line.strip().split(' ')
25 | token = parts[0]
26 | offset = parts[1]
27 |
28 | token_dict = {'token': token, 'offset': offset}
29 |
30 | d_id, o = offset.split(':')
31 | start, end = o.split('-')
32 | start = int(start)
33 | end = int(end)
34 |
35 | entity_mention_ids = search_offset_id(start, end, entity_mentions_mentionid2dict, 'offset')
36 | filler_mention_ids = search_offset_id(start, end, filler_mentions_mentionid2dict, 'offset')
37 | relation_mention_ids = search_relation_id(start, end, relation_mentions_id2dict)
38 | event_mention_ids = search_offset_id(start, end, event_mentions_id2dict, 'trigger_offset')
39 |
40 | if len(entity_mention_ids) == 0:
41 | token_dict['ner_offset'] = 'O'
42 | token_dict['ner_type'] = 'O'
43 | token_dict['ner_nam_nom'] = 'O'
44 | token_dict['ner_cluster'] = 'O'
45 | else:
46 | ner_offsets = []
47 | ner_types = []
48 | ner_nam_noms = []
49 | ner_clusters = []
50 |
51 | for id in entity_mention_ids:
52 | ner_offsets.append(entity_mentions_mentionid2dict[id]['offset'])
53 | if str(start)+":"+str(end) == entity_mentions_mentionid2dict[id]['offset']:
54 | assert token == entity_mentions_mentionid2dict[id]['text']
55 | ner_types.append(entity_mentions_mentionid2dict[id]['type'] + ':' + \
56 | entity_mentions_mentionid2dict[id]['subtype'])
57 | ner_nam_noms.append(entity_mentions_mentionid2dict[id]['mention_type'])
58 | ner_clusters.append(entity_mentions_mentionid2dict[id]['entity_id'])
59 | token_dict['ner_offset'] = '#@#'.join(ner_offsets)
60 | token_dict['ner_type'] = '#@#'.join(ner_types)
61 | token_dict['ner_nam_nom'] = '#@#'.join(ner_nam_noms)
62 | token_dict['ner_cluster'] = '#@#'.join(ner_clusters)
63 |
64 | if len(filler_mention_ids) == 0:
65 | token_dict['filler_offset'] = 'O'
66 | token_dict['filler_type'] = 'O'
67 | else:
68 | filler_offsets = []
69 | filler_types = []
70 | for id in filler_mention_ids:
71 | filler_offsets.append(filler_mentions_mentionid2dict[id]['offset'])
72 | filler_types.append(filler_mentions_mentionid2dict[id]['type'])
73 | token_dict['filler_offset'] = '#@#'.join(filler_offsets)
74 | token_dict['filler_type'] = '#@#'.join(filler_types)
75 |
76 | if len(relation_mention_ids) == 0:
77 | token_dict['relations_belong_to'] = 'O'
78 | else:
79 | relation_mentions = []
80 | for id in relation_mention_ids:
81 | relation_mention_dict = relation_mentions_id2dict[id]
82 | relation_id = relation_mention_dict['relation_id']
83 | relation_type = relation_mention_dict['relation_type'] + ':' + \
84 | relation_mention_dict['relation_subtype']
85 | arg0 = relation_mention_dict['mention_argument0_offset']
86 | arg1 = relation_mention_dict['mention_argument1_offset']
87 |
88 | mention = relation_id + ':' + arg0 + ':' + relation_type + ':' + arg1
89 | relation_mentions.append(mention)
90 | mention_str = ' '.join(relation_mentions)
91 | token_dict['relations_belong_to'] = mention_str
92 |
93 | if len(event_mention_ids) == 0:
94 | token_dict['trigger_offset'] = 'O'
95 | token_dict['trigger_type'] = 'O'
96 | token_dict['trigger_cluster'] = 'O'
97 | token_dict['trigger_arguments'] = 'O'
98 | else:
99 | trigger_offsets = []
100 | trigger_types = []
101 | trigger_clusters = []
102 | trigger_arguments_set = []
103 | for id in event_mention_ids:
104 | trigger_offsets.append(event_mentions_id2dict[id]['trigger_offset'])
105 | if str(start)+":"+str(end) == event_mentions_id2dict[id]['trigger_offset']:
106 | assert token == event_mentions_id2dict[id]['trigger_text']
107 | trigger_types.append(event_mentions_id2dict[id]['type'] + ':' +
108 | event_mentions_id2dict[id]['subtype'])
109 | trigger_clusters.append(event_mentions_id2dict[id]['event_id'])
110 | all_event_mention_arguments = event_mentions_id2dict[id]['argument']
111 | arguments = []
112 | for arg in all_event_mention_arguments:
113 | arg_str = arg['mention_argument_refid'] + ':' + arg['mention_argument_role'] + ':' + \
114 | arg['mention_argument_offset']
115 | arguments.append(arg_str)
116 | arguments_str = ' '.join(arguments)
117 | trigger_arguments_set.append(arguments_str)
118 |
119 | token_dict['trigger_offset'] = '#@#'.join(trigger_offsets)
120 | token_dict['trigger_type'] = '#@#'.join(trigger_types)
121 | token_dict['trigger_cluster'] = '#@#'.join(trigger_clusters)
122 | token_dict['trigger_arguments'] = '#@#'.join(trigger_arguments_set)
123 |
124 | writer.writerow(token_dict)
125 | else:
126 | token_dict = {'token':'----sentence_delimiter----'}
127 | writer.writerow(token_dict)
128 |
129 | csv_file.close()
130 |
131 |
132 | # applicable to entity, timex2, event mentions
133 | def search_offset_id(token_start, token_end, entity_mentions_mentionid2dict, offset_key):
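    # return the ids of all mentions whose character span contains the token's
    # start or end character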
134 | searched_ids = []
135 | for id in entity_mentions_mentionid2dict:
136 | can_dict = entity_mentions_mentionid2dict[id]
137 | mention_offset_parts = can_dict[offset_key].split(':')
138 | can_start = int(mention_offset_parts[0])
139 | can_end = int(mention_offset_parts[1])
140 | if (can_start <= token_start <= can_end) or (can_start <= token_end <= can_end):
141 | searched_ids.append(id)
142 | return searched_ids
143 |
144 |
145 | def search_relation_id(token_start, token_end, relation_mentions_id2dict):
146 | searched_ids = []
147 | for id in relation_mentions_id2dict:
148 | can_dict = relation_mentions_id2dict[id]
149 | argument0_offset_parts = can_dict['mention_argument0_offset'].split(':')
150 | argument1_offset_parts = can_dict['mention_argument1_offset'].split(':')
151 | arg0_start = int(argument0_offset_parts[0])
152 | arg0_end = int(argument0_offset_parts[1])
153 | arg1_start = int(argument1_offset_parts[0])
154 | arg1_end = int(argument1_offset_parts[1])
155 | if (arg0_start <= token_start <= arg0_end) or (arg0_start <= token_end <= arg0_end) or \
156 | (arg1_start <= token_start <= arg1_end) or (arg1_start <= token_end <= arg1_end):
157 | searched_ids.append(id)
158 | return searched_ids
159 |
160 |
161 | def parse_ann(ann_file):
162 | tree = ET.parse(ann_file)
163 | root = tree.getroot()
164 | doc_elem = root[0] # entities, fillers, relations, hoppers
165 |
166 | all_entity_elems = []
167 | all_filler_elems = []
168 | all_relation_elems = []
169 | all_hopper_elems = []
170 | if len(doc_elem.findall('entities')) > 0:
171 | entities_elem = doc_elem.findall('entities')[0]
172 | all_entity_elems = entities_elem.findall('entity')
173 | if len(doc_elem.findall('fillers')) > 0:
174 | fillers_elem = doc_elem.findall('fillers')[0]
175 | all_filler_elems = fillers_elem.findall('filler')
176 | if len(doc_elem.findall('relations')) > 0:
177 | relations_elem = doc_elem.findall('relations')[0]
178 | all_relation_elems = relations_elem.findall('relation')
179 | if len(doc_elem.findall('hoppers')) > 0:
180 | hoppers_elem = doc_elem.findall('hoppers')[0]
181 | all_hopper_elems = hoppers_elem.findall('hopper')
182 |
183 | # parse all entities and mentions
184 | entity_mentions_offset2dict = {}
185 | entity_mentions_mentionid2dict = {}
186 | for entity_elem in all_entity_elems:
187 | entity_attribs = entity_elem.attrib
188 | entity_id = entity_attribs["id"] # CNN_CF_20030303.1900.00-E1
189 | entity_type = entity_attribs["type"] # PER
190 | entity_specificity = entity_attribs["specificity"] #
191 |
192 | all_entity_mention_elems = entity_elem.findall("entity_mention")
193 | for entity_mention_elem in all_entity_mention_elems:
194 | entity_mention_attribs = entity_mention_elem.attrib
195 | entity_mention_id = entity_mention_attribs["id"] # CNN_CF_20030303.1900.00-E1-2
196 | entity_mention_noun_type = entity_mention_attribs["noun_type"] # NOM
197 |
198 | entity_mention_start = entity_mention_attribs["offset"]
199 | entity_mention_end = int(entity_mention_start) + int(entity_mention_attribs["length"]) - 1
200 | entity_mention_text = entity_mention_elem.findall('mention_text')[0].text
201 | mention_offset = entity_mention_start + ":" + str(entity_mention_end)
202 |
203 | nom_head_elems = entity_mention_elem.findall("nom_head")
204 | if len(nom_head_elems) > 0:
205 | if len(nom_head_elems) > 1:
206 | print("Error: multiple nom heads~")
207 | nom_head_elem = nom_head_elems[0]
208 | entity_mention_head_start = nom_head_elem.attrib["offset"]
209 | entity_mention_head_end = int(entity_mention_head_start) + int(nom_head_elem.attrib["length"]) - 1
210 | mention_offset = entity_mention_head_start + ":" + str(entity_mention_head_end)
211 | entity_mention_text = nom_head_elem.text
212 |
213 | mention_dict = {"type": entity_type, "specificity": entity_specificity, "entity_id": entity_id,
214 | "mention_id": entity_mention_id, "mention_type": entity_mention_noun_type,
215 | "text": entity_mention_text, "offset": mention_offset}
216 | entity_mentions_offset2dict[mention_offset] = mention_dict
217 | entity_mentions_mentionid2dict[entity_mention_id] = mention_dict
218 |
219 | # parse all filler
220 | filler_mentions_offset2dict = {}
221 | filler_mentions_mentionid2dict = {}
222 | for filler_elem in all_filler_elems:
223 | filler_id = filler_elem.attrib["id"]
224 | filler_start = filler_elem.attrib["offset"]
225 | filler_end = int(filler_start) + int(filler_elem.attrib["length"]) - 1
226 | filler_type = filler_elem.attrib["type"]
227 | filler_text = filler_elem.text
228 | mention_offset = filler_start + ":" + str(filler_end)
229 | mention_dict = {"filler_id": filler_id, "type": filler_type, "text": filler_text,
230 | "offset": mention_offset}
231 | filler_mentions_offset2dict[mention_offset] = mention_dict
232 | filler_mentions_mentionid2dict[filler_id] = mention_dict
233 |
234 | # parse all relations
235 | relation_mentions_id2dict = {}
236 | relation_mentions_men2men2dict = {}
237 | for relation_elem in all_relation_elems:
238 | relation_elem_attribs = relation_elem.attrib
239 | relation_id = relation_elem_attribs["id"] # CNN_CF_20030303.1900.00-R2
240 | relation_type = relation_elem_attribs["type"] # PART-WHOLE
241 | relation_subtype = relation_elem_attribs["subtype"] # PART-WHOLE
242 |
243 | all_relation_mention_elems = relation_elem.findall("relation_mention")
244 | for relation_mention_elem in all_relation_mention_elems:
245 | relation_mention_id = relation_mention_elem.attrib["id"]
246 | relation_mention_realis = relation_mention_elem.attrib["realis"]
247 |
248 | relation_mention_argument0_elem = relation_mention_elem.findall("rel_arg1")[0]
249 | relation_mention_argument1_elem = relation_mention_elem.findall("rel_arg2")[0]
250 | if "entity_id" in relation_mention_argument0_elem.attrib:
251 | relation_mention_argument0_refid = relation_mention_argument0_elem.attrib["entity_mention_id"]
252 | relation_mention_argument0_role = relation_mention_argument0_elem.attrib["role"]
253 | relation_mention_argument0_extend_offset = \
254 | entity_mentions_mentionid2dict[relation_mention_argument0_refid]["offset"]
255 | relation_mention_argument0_extend_text = \
256 | entity_mentions_mentionid2dict[relation_mention_argument0_refid]["text"]
257 | elif "filler_id" in relation_mention_argument0_elem.attrib:
258 | relation_mention_argument0_refid = relation_mention_argument0_elem.attrib["filler_id"]
259 | relation_mention_argument0_role = relation_mention_argument0_elem.attrib["role"]
260 | relation_mention_argument0_extend_offset = \
261 | filler_mentions_mentionid2dict[relation_mention_argument0_refid]["offset"]
262 | relation_mention_argument0_extend_text = \
263 | filler_mentions_mentionid2dict[relation_mention_argument0_refid]["text"]
264 | if "entity_id" in relation_mention_argument1_elem.attrib:
265 | relation_mention_argument1_refid = relation_mention_argument1_elem.attrib["entity_mention_id"]
266 | relation_mention_argument1_role = relation_mention_argument1_elem.attrib["role"]
267 | relation_mention_argument1_extend_offset = \
268 | entity_mentions_mentionid2dict[relation_mention_argument1_refid]["offset"]
269 | relation_mention_argument1_extend_text = \
270 | entity_mentions_mentionid2dict[relation_mention_argument1_refid]["text"]
271 | elif "filler_id" in relation_mention_argument1_elem.attrib:
272 | relation_mention_argument1_refid = relation_mention_argument1_elem.attrib["filler_id"]
273 | relation_mention_argument1_role = relation_mention_argument1_elem.attrib["role"]
274 | relation_mention_argument1_extend_offset = \
275 | filler_mentions_mentionid2dict[relation_mention_argument1_refid]["offset"]
276 | relation_mention_argument1_extend_text = \
277 | filler_mentions_mentionid2dict[relation_mention_argument1_refid]["text"]
278 |
279 | relation_mention_trigger_elems = relation_mention_elem.findall("trigger")
280 | relation_mention_trigger_offset = "O"
281 | relation_mention_trigger_text = "O"
282 | if len(relation_mention_trigger_elems) > 0:
283 | relation_mention_trigger_start = relation_mention_trigger_elems[0].attrib["offset"]
284 | relation_mention_trigger_end = int(relation_mention_trigger_start) + \
285 | int(relation_mention_trigger_elems[0].attrib["length"]) - 1
286 | relation_mention_trigger_offset = relation_mention_trigger_start + ":" + \
287 | str(relation_mention_trigger_end)
288 | relation_mention_trigger_text = relation_mention_trigger_elems[0].text
289 |
290 | mention_dict = {"relation_id": relation_id, "relation_type": relation_type,
291 | "relation_subtype": relation_subtype,
292 | "mention_id": relation_mention_id, "mention_realis": relation_mention_realis,
293 | "mention_argument0_refid": relation_mention_argument0_refid,
294 | "mention_argument0_role": relation_mention_argument0_role,
295 | "mention_argument1_refid": relation_mention_argument1_refid,
296 | "mention_argument1_role": relation_mention_argument1_role,
297 | "mention_argument0_offset": relation_mention_argument0_extend_offset,
298 | "mention_argument0_text": relation_mention_argument0_extend_text,
299 | "mention_argument1_offset": relation_mention_argument1_extend_offset,
300 | "mention_argument1_text": relation_mention_argument1_extend_text,
301 | "mention_trigger_offset": relation_mention_trigger_offset,
302 | "mention_trigger_text": relation_mention_trigger_text
303 | }
304 | relation_mentions_id2dict[relation_mention_id] = mention_dict
305 | if relation_mention_argument0_refid in relation_mentions_men2men2dict:
306 | relation_mentions_men2dict = relation_mentions_men2men2dict[relation_mention_argument0_refid]
307 | relation_mentions_men2dict[relation_mention_argument1_refid] = mention_dict
308 | relation_mentions_men2men2dict[relation_mention_argument0_refid] = relation_mentions_men2dict
309 | else:
310 | relation_mentions_men2dict = {relation_mention_argument1_refid: mention_dict}
311 | relation_mentions_men2men2dict[relation_mention_argument0_refid] = relation_mentions_men2dict
312 |
313 | # parse all events
314 | event_mentions_id2dict = {}
315 | for event_elem in all_hopper_elems:
316 | event_id = event_elem.attrib["id"]
317 |
318 | all_event_mention_elems = event_elem.findall("event_mention")
319 | for event_mention_elem in all_event_mention_elems:
320 | event_mention_id = event_mention_elem.attrib["id"]
321 | event_mention_type = event_mention_elem.attrib["type"]
322 | event_mention_subtype = event_mention_elem.attrib["subtype"]
323 | event_mention_realis = event_mention_elem.attrib["realis"]
324 |
325 | event_mention_trigger_elem = event_mention_elem.findall("trigger")[0]
326 | event_mention_trigger_start = event_mention_trigger_elem.attrib["offset"]
327 | event_mention_trigger_end = int(event_mention_trigger_start) + \
328 | int(event_mention_trigger_elem.attrib["length"]) - 1
329 | event_mention_trigger_text = event_mention_trigger_elem.text
330 | event_mention_trigger_offset = event_mention_trigger_start + ":" + \
331 | str(event_mention_trigger_end)
332 |
333 | all_event_mention_argument_elems = event_mention_elem.findall("em_arg")
334 | all_event_mention_arguments = []
335 | for event_mention_argument_elem in all_event_mention_argument_elems:
336 | if "entity_id" in event_mention_argument_elem.attrib:
337 | event_mention_argument_refid = event_mention_argument_elem.attrib["entity_mention_id"]
338 | event_mention_argument_offset = entity_mentions_mentionid2dict[event_mention_argument_refid][
339 | "offset"]
340 | event_mention_argument_text = entity_mentions_mentionid2dict[event_mention_argument_refid][
341 | "text"]
342 | elif "filler_id" in event_mention_argument_elem.attrib:
343 | event_mention_argument_refid = event_mention_argument_elem.attrib["filler_id"]
344 | event_mention_argument_offset = filler_mentions_mentionid2dict[event_mention_argument_refid][
345 | "offset"]
346 | event_mention_argument_text = filler_mentions_mentionid2dict[event_mention_argument_refid][
347 | "text"]
348 | event_mention_argument_role = event_mention_argument_elem.attrib["role"]
349 | event_mention_argument_realis = event_mention_argument_elem.attrib["realis"]
350 |
351 | event_mention_argument_dict = {"mention_argument_refid": event_mention_argument_refid,
352 | "mention_argument_role": event_mention_argument_role,
353 | "mention_argument_realis": event_mention_argument_realis,
354 | "mention_argument_offset": event_mention_argument_offset,
355 | "mention_argument_text": event_mention_argument_text}
356 | all_event_mention_arguments.append(event_mention_argument_dict)
357 |
358 | mention_dict = {"event_id": event_id, "type": event_mention_type, "subtype": event_mention_subtype,
359 | "realis": event_mention_realis, "mention_id": event_mention_id,
360 | "trigger_offset": event_mention_trigger_offset, "trigger_text": event_mention_trigger_text,
361 | "argument": all_event_mention_arguments}
362 |
363 | event_mentions_id2dict[event_mention_id] = mention_dict
364 |
365 | return entity_mentions_mentionid2dict, filler_mentions_mentionid2dict, \
366 | relation_mentions_id2dict, event_mentions_id2dict
367 |
368 |
369 | if __name__ == "__main__":
370 | parser = argparse.ArgumentParser()
371 | parser.add_argument('--bio', type=str,
372 | help='bio input path')
373 |     parser.add_argument('--ann', type=str,
374 |                         help='ere annotation input path')
375 |     parser.add_argument('--ere', type=str,
376 |                         help='output ere annotation path')
377 | parser.add_argument('--filelist', type=str,
378 | help='filelist path')
379 |
380 | args = parser.parse_args()
381 |
382 | bio_path = args.bio
383 | ann_path = args.ann
384 | ere_path = args.ere
385 |
386 | if not os.path.exists(ere_path):
387 | os.makedirs(ere_path)
388 |
389 |     if os.path.isdir(bio_path):
390 |         file_names = [item[:-4]
391 |                       for item in os.listdir(bio_path)
392 |                       if item.endswith(".bio")]
393 |     else:  # a single .bio file: split off its directory so the path joins below still work
394 |         bio_path, bio_name = os.path.split(bio_path)
395 |         file_names = [bio_name[:-4]]
396 |
397 | for f in file_names:
398 | print(f)
399 |         bio_file = os.path.join(bio_path, f + ".bio")
400 |         ann_file = os.path.join(ann_path, f + ".rich_ere.xml")
401 |         ere_file = os.path.join(ere_path, f + ".csv")
402 | 
403 |         if os.path.exists(bio_file) and os.path.exists(ann_file):
404 |             write_ann(bio_file, ann_file, ere_file)
405 |
406 |
--------------------------------------------------------------------------------
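For reference, a minimal usage sketch of the `parse_ann` helper defined in `bio2ere.py` above; the annotation path is hypothetical, and importing assumes the script is reachable as module `bio2ere` (its command-line code is under the `__main__` guard, so importing is safe):

```
# Hypothetical usage sketch for parse_ann from bio2ere.py; the .rich_ere.xml path is illustrative only.
from bio2ere import parse_ann

entities, fillers, relations, events = parse_ann("ann/example_doc.rich_ere.xml")

# Each value is a flat dict; entity mentions carry keys such as
# "type", "specificity", "entity_id", "mention_id", "mention_type", "text", and "offset" ("start:end").
for mention_id, mention in list(entities.items())[:3]:
    print(mention_id, mention["type"], mention["offset"], mention["text"])
```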
/bio2ace.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import csv
4 | import xml.etree.ElementTree as ET
5 |
6 |
7 | # write one CSV row per token: token, offset, NER, timex2, value, relation, and event trigger/argument columns
8 | def write_ann(bio_file, ann_file, ace_file):
9 |     csv_file = open(ace_file, 'w', newline='')  # newline='' avoids blank rows from the csv writer on Windows
10 | fields = ['token', 'offset', 'ner_offset', 'ner_type', 'ner_nam_nom', 'ner_mention', 'ner_cluster',
11 | 'timex2_offset', 'timex2_cluster',
12 | 'value_offset', 'value_type', 'value_cluster',
13 | 'relations_belong_to',
14 | 'trigger_offset', 'trigger_type', 'trigger_cluster', 'trigger_arguments']
15 | writer = csv.DictWriter(csv_file, fieldnames=fields)
16 | writer.writeheader()
17 |
18 | entity_mentions_mentionid2dict, timex2_mentions_mentionid2dict, value_mentions_mentionid2dict, \
19 | relation_mentions_id2dict, event_mentions_id2dict = parse_ann(ann_file)
20 |
21 | with open(bio_file, 'r') as f:
22 | for line in f:
23 | line = line.strip()
24 | if len(line) > 0:
25 | parts = line.strip().split(' ')
26 | token = parts[0]
27 | offset = parts[1]
28 |
29 | token_dict = {'token': token, 'offset': offset}
30 |
31 | d_id, o = offset.split(':')
32 | start, end = o.split('-')
33 | start = int(start)
34 | end = int(end)
35 |
36 | entity_mention_ids = search_offset_id(start, end, entity_mentions_mentionid2dict, 'offset')
37 | timex2_mention_ids = search_offset_id(start, end, timex2_mentions_mentionid2dict, 'offset')
38 | value_mention_ids = search_offset_id(start, end, value_mentions_mentionid2dict, 'offset')
39 | relation_mention_ids = search_relation_id(start, end, relation_mentions_id2dict)
40 | event_mention_ids = search_offset_id(start, end, event_mentions_id2dict, 'anchor_offset')
41 |
42 | if len(entity_mention_ids) == 0:
43 | token_dict['ner_offset'] = 'O'
44 | token_dict['ner_type'] = 'O'
45 | token_dict['ner_nam_nom'] = 'O'
46 | token_dict['ner_mention'] = 'O'
47 | token_dict['ner_cluster'] = 'O'
48 | else:
49 | ner_offsets = []
50 | ner_types = []
51 | ner_nam_noms = []
52 | ner_mentions = []
53 | ner_clusters = []
54 |
55 | for id in entity_mention_ids:
56 | ner_offsets.append(entity_mentions_mentionid2dict[id]['offset'])
57 | ner_types.append(entity_mentions_mentionid2dict[id]['type'] + ':' + \
58 | entity_mentions_mentionid2dict[id]['subtype'])
59 | ner_nam_noms.append(entity_mentions_mentionid2dict[id]['mention_type'])
60 | ner_mentions.append(entity_mentions_mentionid2dict[id]['text'])
61 | ner_clusters.append(entity_mentions_mentionid2dict[id]['entity_id'])
62 | token_dict['ner_offset'] = '#@#'.join(ner_offsets)
63 | token_dict['ner_type'] = '#@#'.join(ner_types)
64 | token_dict['ner_nam_nom'] = '#@#'.join(ner_nam_noms)
65 | token_dict['ner_mention'] = '#@#'.join(ner_mentions)
66 | token_dict['ner_cluster'] = '#@#'.join(ner_clusters)
67 |
68 | if len(timex2_mention_ids) == 0:
69 | token_dict['timex2_offset'] = 'O'
70 | token_dict['timex2_cluster'] = 'O'
71 | else:
72 | timex2_offsets = []
73 | timex2_clusters = []
74 | for id in timex2_mention_ids:
75 | timex2_offsets.append(timex2_mentions_mentionid2dict[id]['offset'])
76 | timex2_clusters.append(timex2_mentions_mentionid2dict[id]['timex2_id'])
77 | token_dict['timex2_offset'] = '#@#'.join(timex2_offsets)
78 | token_dict['timex2_cluster'] = '#@#'.join(timex2_clusters)
79 |
80 | if len(value_mention_ids) == 0:
81 | token_dict['value_offset'] = 'O'
82 | token_dict['value_type'] = 'O'
83 | token_dict['value_cluster'] = 'O'
84 | else:
85 | value_offsets = []
86 | value_types = []
87 | value_clusters = []
88 |
89 | for id in value_mention_ids:
90 | value_offsets.append(value_mentions_mentionid2dict[id]['offset'])
91 | value_types.append(value_mentions_mentionid2dict[id]['type'] + ':' +
92 | value_mentions_mentionid2dict[id]['subtype'])
93 | value_clusters.append(value_mentions_mentionid2dict[id]['value_id'])
94 | token_dict['value_offset'] = '#@#'.join(value_offsets)
95 | token_dict['value_type'] = '#@#'.join(value_types)
96 | token_dict['value_cluster'] = '#@#'.join(value_clusters)
97 |
98 | if len(relation_mention_ids) == 0:
99 | token_dict['relations_belong_to'] = 'O'
100 | else:
101 | relation_mentions = []
102 | for id in relation_mention_ids:
103 | relation_mention_dict = relation_mentions_id2dict[id]
104 | relation_id = relation_mention_dict['relation_id']
105 | relation_type = relation_mention_dict['relation_type'] + ':' + \
106 | relation_mention_dict['relation_subtype']
107 | arg0 = relation_mention_dict['mention_argument0_offset']
108 | arg1 = relation_mention_dict['mention_argument1_offset']
109 |
110 | mention = relation_id + ':' + arg0 + ':' + relation_type + ':' + arg1
111 | relation_mentions.append(mention)
112 | mention_str = ' '.join(relation_mentions)
113 | token_dict['relations_belong_to'] = mention_str
114 |
115 | if len(event_mention_ids) == 0:
116 | token_dict['trigger_offset'] = 'O'
117 | token_dict['trigger_type'] = 'O'
118 | token_dict['trigger_cluster'] = 'O'
119 | token_dict['trigger_arguments'] = 'O'
120 | else:
121 | trigger_offsets = []
122 | trigger_types = []
123 | trigger_clusters = []
124 | trigger_arguments_set = []
125 | for id in event_mention_ids:
126 | trigger_offsets.append(event_mentions_id2dict[id]['anchor_offset'])
127 | trigger_types.append(event_mentions_id2dict[id]['type'] + ':' +
128 | event_mentions_id2dict[id]['subtype'])
129 | trigger_clusters.append(event_mentions_id2dict[id]['event_id'])
130 | all_event_mention_arguments = event_mentions_id2dict[id]['argument']
131 | arguments = []
132 | for arg in all_event_mention_arguments:
133 | arg_str = arg['mention_argument_refid'] + ':' + arg['mention_argument_role'] + ':' + \
134 | arg['mention_argument_offset']
135 | arguments.append(arg_str)
136 | if len(arguments) > 0:
137 | arguments_str = ' '.join(arguments)
138 | trigger_arguments_set.append(arguments_str)
139 |
140 | token_dict['trigger_offset'] = '#@#'.join(trigger_offsets)
141 | token_dict['trigger_type'] = '#@#'.join(trigger_types)
142 | token_dict['trigger_cluster'] = '#@#'.join(trigger_clusters)
143 | if len(trigger_arguments_set) > 0:
144 | token_dict['trigger_arguments'] = '#@#'.join(trigger_arguments_set)
145 | else:
146 | token_dict['trigger_arguments'] = 'O'
147 |
148 | writer.writerow(token_dict)
149 | else:
150 | token_dict = {'token':'----sentence_delimiter----'}
151 | writer.writerow(token_dict)
152 |
153 | csv_file.close()
154 |
155 |
156 | # applicable to entity, timex2, value, and event mentions
157 | def search_offset_id(token_start, token_end, entity_mentions_mentionid2dict, offset_key):
158 | searched_ids = []
159 | for id in entity_mentions_mentionid2dict:
160 | can_dict = entity_mentions_mentionid2dict[id]
161 | mention_offset_parts = can_dict[offset_key].split(':')
162 | can_start = int(mention_offset_parts[0])
163 | can_end = int(mention_offset_parts[1])
164 | if (can_start <= token_start <= can_end) or (can_start <= token_end <= can_end):
165 | searched_ids.append(id)
166 | return searched_ids
167 |
168 |
169 | def search_relation_id(token_start, token_end, relation_mentions_id2dict):
170 | searched_ids = []
171 | for id in relation_mentions_id2dict:
172 | can_dict = relation_mentions_id2dict[id]
173 | argument0_offset_parts = can_dict['mention_argument0_offset'].split(':')
174 | argument1_offset_parts = can_dict['mention_argument1_offset'].split(':')
175 | arg0_start = int(argument0_offset_parts[0])
176 | arg0_end = int(argument0_offset_parts[1])
177 | arg1_start = int(argument1_offset_parts[0])
178 | arg1_end = int(argument1_offset_parts[1])
179 | if (arg0_start <= token_start <= arg0_end) or (arg0_start <= token_end <= arg0_end) or \
180 | (arg1_start <= token_start <= arg1_end) or (arg1_start <= token_end <= arg1_end):
181 | searched_ids.append(id)
182 | return searched_ids
183 |
184 |
185 | def parse_ann(ann_file):
186 | tree = ET.parse(ann_file)
187 | root = tree.getroot()
188 |     doc_elem = root[0]  # entity, timex2, value, relation, event
189 |
190 | all_entity_elems = doc_elem.findall('entity')
191 | all_timex2_elems = doc_elem.findall('timex2')
192 | all_value_elems = doc_elem.findall('value')
193 |     all_relation_elems = doc_elem.findall('relation')
194 | all_event_elems = doc_elem.findall('event')
195 |
196 | # parse all entities and mentions
197 | entity_mentions_offset2dict = {}
198 | entity_mentions_mentionid2dict = {}
199 | for entity_elem in all_entity_elems:
200 | entity_attribs = entity_elem.attrib
201 | entity_id = entity_attribs["ID"] # CNN_CF_20030303.1900.00-E1
202 | entity_type = entity_attribs["TYPE"] # PER
203 | entity_subtype = entity_attribs["SUBTYPE"] # Individual
204 | entity_class = entity_attribs["CLASS"] # SPC
205 |
206 | all_entity_mention_elems = entity_elem.findall("entity_mention")
207 | for entity_mention_elem in all_entity_mention_elems:
208 | entity_mention_attribs = entity_mention_elem.attrib
209 | entity_mention_id = entity_mention_attribs["ID"] # CNN_CF_20030303.1900.00-E1-2
210 | entity_mention_type = entity_mention_attribs["TYPE"] # NOM
211 | entity_mention_ldctype = entity_mention_attribs["LDCTYPE"] # NOMPRE
212 |
213 | entity_mention_extent_elem = entity_mention_elem.findall("extent")[0].findall("charseq")[0]
214 | entity_mention_head_elem = entity_mention_elem.findall("head")[0].findall("charseq")[0]
215 |
216 | entity_mention_head_start = entity_mention_head_elem.attrib["START"] # 490
217 | entity_mention_head_end = entity_mention_head_elem.attrib["END"] # 498
218 | entity_mention_head_text = entity_mention_head_elem.text # Secretary
219 |
220 | mention_offset = entity_mention_head_start + ":" + entity_mention_head_end
221 | mention_dict = {"type": entity_type, "subtype": entity_subtype, "entity_id": entity_id,
222 | "entity_class": entity_class, "mention_id": entity_mention_id,
223 | "mention_type": entity_mention_type, "mention_ldctype": entity_mention_ldctype,
224 | "text": entity_mention_head_text, "offset": mention_offset}
225 | entity_mentions_offset2dict[mention_offset] = mention_dict
226 | entity_mentions_mentionid2dict[entity_mention_id] = mention_dict
227 |
228 | # parse all timex2
229 | timex2_mentions_offset2dict = {}
230 | timex2_mentions_mentionid2dict = {}
231 | for timex2_elem in all_timex2_elems:
232 | timex2_id = timex2_elem.attrib["ID"]
233 | all_timex2_mention_elems = timex2_elem.findall("timex2_mention")
234 | for timex2_mention_elem in all_timex2_mention_elems:
235 | timex2_mention_id = timex2_mention_elem.attrib["ID"]
236 | timex2_mention_elem_extend = timex2_mention_elem.findall("extent")[0].findall("charseq")[0]
237 | timex2_mention_start = timex2_mention_elem_extend.attrib["START"]
238 | timex2_mention_end = timex2_mention_elem_extend.attrib["END"]
239 | timex2_mention_text = timex2_mention_elem_extend.text
240 |
241 | mention_offset = timex2_mention_start + ":" + timex2_mention_end
242 | mention_dict = {"timex2_id": timex2_id, "mention_id": timex2_mention_id, "text": timex2_mention_text,
243 | "offset": mention_offset}
244 | timex2_mentions_offset2dict[mention_offset] = mention_dict
245 | timex2_mentions_mentionid2dict[timex2_mention_id] = mention_dict
246 |
247 | # parse all values
248 | value_mentions_offset2dict = {}
249 | value_mentions_mentionid2dict = {}
250 | for value_elem in all_value_elems:
251 | value_id = value_elem.attrib["ID"]
252 | value_type = value_elem.attrib['TYPE']
253 | value_subtype = "O"
254 | if "SUBTYPE" in value_elem.attrib:
255 | value_subtype = value_elem.attrib['SUBTYPE']
256 |
257 | all_value_mention_elems = value_elem.findall("value_mention")
258 | for value_mention_elem in all_value_mention_elems:
259 | value_mention_id = value_mention_elem.attrib["ID"]
260 | value_mention_elem_extend = value_mention_elem.findall("extent")[0].findall("charseq")[0]
261 | value_mention_start = value_mention_elem_extend.attrib["START"]
262 | value_mention_end = value_mention_elem_extend.attrib["END"]
263 | value_mention_text = value_mention_elem_extend.text
264 |
265 | mention_offset = value_mention_start + ":" + value_mention_end
266 | mention_dict = {"value_id": value_id, "type":value_type, 'subtype':value_subtype,
267 | "mention_id": value_mention_id, "text": value_mention_text,
268 | "offset": mention_offset}
269 | value_mentions_offset2dict[mention_offset] = mention_dict
270 | value_mentions_mentionid2dict[value_mention_id] = mention_dict
271 |
272 | # parse all relations
273 | relation_mentions_id2dict = {}
274 | relation_mentions_men2men2dict = {}
275 |     for relation_elem in all_relation_elems:
276 | relation_elem_attribs = relation_elem.attrib
277 | relation_id = relation_elem_attribs["ID"] # CNN_CF_20030303.1900.00-R2
278 | relation_type = relation_elem_attribs["TYPE"] # PART-WHOLE
279 | relation_subtype = "O"
280 | if "SUBTYPE" in relation_elem_attribs:
281 | relation_subtype = relation_elem_attribs["SUBTYPE"] # Geographical
282 | relation_tense = "O"
283 | if "TENSE" in relation_elem_attribs:
284 | relation_tense = relation_elem_attribs["TENSE"] # Unspecified
285 | relation_modality = "O"
286 | if "MODALITY" in relation_elem_attribs:
287 | relation_modality = relation_elem_attribs["MODALITY"] # Unspecified
288 | relation_argument_elems = relation_elem.findall("relation_argument")
289 | relation_argument0 = relation_argument_elems[0]
290 | relation_argument1 = relation_argument_elems[1]
291 | relation_argument0_refid = relation_argument0.attrib["REFID"]
292 | relation_argument0_role = relation_argument0.attrib["ROLE"]
293 | relation_argument1_refid = relation_argument1.attrib["REFID"]
294 | relation_argument1_role = relation_argument1.attrib["ROLE"]
295 |
296 | all_relation_mention_elems = relation_elem.findall("relation_mention")
297 | for relation_mention_elem in all_relation_mention_elems:
298 | relation_mention_id = relation_mention_elem.attrib["ID"]
299 | relation_mention_lexical_condition = relation_mention_elem.attrib["LEXICALCONDITION"]
300 | relation_mention_extent = relation_mention_elem.findall("extent")[0].findall("charseq")[0]
301 | relation_mention_extent_start = relation_mention_extent.attrib["START"]
302 | relation_mention_extent_end = relation_mention_extent.attrib["END"]
303 | relation_mention_extent_text = relation_mention_extent.text
304 | relation_mention_extend_offset = relation_mention_extent_start + ":" + relation_mention_extent_end
305 |
306 | relation_mention_argument_elems = relation_mention_elem.findall("relation_mention_argument")
307 | relation_mention_argument0 = relation_mention_argument_elems[0]
308 | relation_mention_argument1 = relation_mention_argument_elems[1]
309 | relation_mention_argument0_refid = relation_mention_argument0.attrib["REFID"]
310 | relation_mention_argument0_role = relation_mention_argument0.attrib["ROLE"]
311 | relation_mention_argument1_refid = relation_mention_argument1.attrib["REFID"]
312 | relation_mention_argument1_role = relation_mention_argument1.attrib["ROLE"]
313 |
314 |             # replace each argument's extent with the corresponding mention head offset/text
315 |             # arg0
316 | if relation_mention_argument0_refid in entity_mentions_mentionid2dict:
317 | relation_mention_argument0_extend_offset = \
318 | entity_mentions_mentionid2dict[relation_mention_argument0_refid]["offset"]
319 | relation_mention_argument0_extend_text = \
320 | entity_mentions_mentionid2dict[relation_mention_argument0_refid]["text"]
321 | elif relation_mention_argument0_refid in timex2_mentions_mentionid2dict:
322 | relation_mention_argument0_extend_offset = \
323 | timex2_mentions_mentionid2dict[relation_mention_argument0_refid]["offset"]
324 | relation_mention_argument0_extend_text = \
325 | timex2_mentions_mentionid2dict[relation_mention_argument0_refid]["text"]
326 | elif relation_mention_argument0_refid in value_mentions_mentionid2dict:
327 | relation_mention_argument0_extend_offset = \
328 | value_mentions_mentionid2dict[relation_mention_argument0_refid]["offset"]
329 | relation_mention_argument0_extend_text = \
330 | value_mentions_mentionid2dict[relation_mention_argument0_refid]["text"]
331 |
332 |             # arg1
333 | if relation_mention_argument1_refid in entity_mentions_mentionid2dict:
334 | relation_mention_argument1_extend_offset = \
335 | entity_mentions_mentionid2dict[relation_mention_argument1_refid]["offset"]
336 | relation_mention_argument1_extend_text = \
337 | entity_mentions_mentionid2dict[relation_mention_argument1_refid]["text"]
338 | elif relation_mention_argument1_refid in timex2_mentions_mentionid2dict:
339 | relation_mention_argument1_extend_offset = \
340 | timex2_mentions_mentionid2dict[relation_mention_argument1_refid]["offset"]
341 | relation_mention_argument1_extend_text = \
342 | timex2_mentions_mentionid2dict[relation_mention_argument1_refid]["text"]
343 | elif relation_mention_argument1_refid in value_mentions_mentionid2dict:
344 | relation_mention_argument1_extend_offset = \
345 | value_mentions_mentionid2dict[relation_mention_argument1_refid]["offset"]
346 | relation_mention_argument1_extend_text = \
347 | value_mentions_mentionid2dict[relation_mention_argument1_refid]["text"]
348 |
349 | mention_dict = {"relation_id": relation_id, "relation_type": relation_type,
350 | "relation_subtype": relation_subtype, "relation_tense": relation_tense,
351 | "relation_modality": relation_modality, "relation_argument0_refid": relation_argument0_refid,
352 | "relation_argument0_role": relation_argument0_role,
353 | "relation_argument1_refid": relation_argument1_refid,
354 | "relation_argument1_role": relation_argument1_role, "mention_id": relation_mention_id,
355 | "mention_offset": relation_mention_extend_offset,
356 | "mention_text": relation_mention_extent_text,
357 | "mention_argument0_refid": relation_mention_argument0_refid,
358 | "mention_argument0_role": relation_mention_argument0_role,
359 | "mention_argument1_refid": relation_mention_argument1_refid,
360 | "mention_argument1_role": relation_mention_argument1_role,
361 | "mention_argument0_offset": relation_mention_argument0_extend_offset,
362 | "mention_argument0_text": relation_mention_argument0_extend_text,
363 | "mention_argument1_offset": relation_mention_argument1_extend_offset,
364 | "mention_argument1_text": relation_mention_argument1_extend_text
365 | }
366 | relation_mentions_id2dict[relation_mention_id] = mention_dict
367 | if relation_mention_argument0_refid in relation_mentions_men2men2dict:
368 | relation_mentions_men2dict = relation_mentions_men2men2dict[relation_mention_argument0_refid]
369 | relation_mentions_men2dict[relation_mention_argument1_refid] = mention_dict
370 | relation_mentions_men2men2dict[relation_mention_argument0_refid] = relation_mentions_men2dict
371 | else:
372 | relation_mentions_men2dict = {relation_mention_argument1_refid: mention_dict}
373 | relation_mentions_men2men2dict[relation_mention_argument0_refid] = relation_mentions_men2dict
374 |
375 | # parse all events
376 | event_mentions_id2dict = {}
377 | for event_elem in all_event_elems:
378 | event_id = event_elem.attrib["ID"]
379 | event_type = event_elem.attrib["TYPE"]
380 | event_subtype = event_elem.attrib["SUBTYPE"]
381 | event_modality = event_elem.attrib["MODALITY"]
382 | event_polarity = event_elem.attrib["POLARITY"]
383 | event_genericity = event_elem.attrib["GENERICITY"]
384 | event_tense = event_elem.attrib["TENSE"]
385 |
386 | all_event_argument_elems = event_elem.findall("event_argument")
387 | event_argument_list = []
388 | for event_argument_elem in all_event_argument_elems:
389 | event_argument_refid = event_argument_elem.attrib["REFID"]
390 | event_argument_role = event_argument_elem.attrib["ROLE"]
391 | event_argument_dict = {"argument_refid": event_argument_refid, "argument_role": event_argument_role}
392 | event_argument_list.append(event_argument_dict)
393 |
394 | all_event_mention_elems = event_elem.findall("event_mention")
395 | for event_mention_elem in all_event_mention_elems:
396 | event_mention_id = event_mention_elem.attrib["ID"]
397 | event_mention_extent = event_mention_elem.findall("extent")[0].findall("charseq")[0]
398 | event_mention_extent_start = event_mention_extent.attrib["START"]
399 | event_mention_extent_end = event_mention_extent.attrib["END"]
400 | event_mention_extent_text = event_mention_extent.text
401 |
402 | event_mention_anchor = event_mention_elem.findall("anchor")[0].findall("charseq")[0] # trigger
403 | event_mention_anchor_start = event_mention_anchor.attrib["START"]
404 | event_mention_anchor_end = event_mention_anchor.attrib["END"]
405 | event_mention_anchor_offset = event_mention_anchor_start + ":" + event_mention_anchor_end
406 | event_mention_anchor_text = event_mention_anchor.text
407 |
408 | all_event_mention_argument_elems = event_mention_elem.findall("event_mention_argument")
409 | all_event_mention_arguments = []
410 | for event_mention_argument_elem in all_event_mention_argument_elems:
411 | event_mention_argument_refid = event_mention_argument_elem.attrib["REFID"]
412 | event_mention_argument_role = event_mention_argument_elem.attrib["ROLE"]
413 |
414 |                 # replace the argument extent with the corresponding mention head offset/text
415 |                 # entity, timex2, or value mentions
416 | if event_mention_argument_refid in entity_mentions_mentionid2dict:
417 | event_mention_argument_offset = \
418 | entity_mentions_mentionid2dict[event_mention_argument_refid]["offset"]
419 | event_mention_argument_text = entity_mentions_mentionid2dict[event_mention_argument_refid]["text"]
420 | elif event_mention_argument_refid in timex2_mentions_mentionid2dict:
421 | event_mention_argument_offset = \
422 | timex2_mentions_mentionid2dict[event_mention_argument_refid]["offset"]
423 | event_mention_argument_text = timex2_mentions_mentionid2dict[event_mention_argument_refid]["text"]
424 | elif event_mention_argument_refid in value_mentions_mentionid2dict:
425 | event_mention_argument_offset = \
426 | value_mentions_mentionid2dict[event_mention_argument_refid]["offset"]
427 | event_mention_argument_text = value_mentions_mentionid2dict[event_mention_argument_refid]["text"]
428 |
429 | event_mention_argument_dict = {"mention_argument_refid": event_mention_argument_refid,
430 | "mention_argument_role": event_mention_argument_role,
431 | "mention_argument_offset": event_mention_argument_offset,
432 | "mention_argument_text": event_mention_argument_text}
433 | all_event_mention_arguments.append(event_mention_argument_dict)
434 |
435 | mention_dict = {"event_id": event_id, "type": event_type, "subtype": event_subtype,
436 | "modality": event_modality, "polarity": event_polarity,
437 | "genericity": event_genericity, "tense": event_tense,
438 | "mention_id": event_mention_id, "anchor_offset": event_mention_anchor_offset,
439 | "anchor_text": event_mention_anchor_text, "argument": all_event_mention_arguments}
440 |
441 | event_mentions_id2dict[event_mention_id] = mention_dict
442 |
443 | return entity_mentions_mentionid2dict, timex2_mentions_mentionid2dict, value_mentions_mentionid2dict, \
444 | relation_mentions_id2dict, event_mentions_id2dict
445 |
446 |
447 | if __name__ == "__main__":
448 | parser = argparse.ArgumentParser()
449 | parser.add_argument('--bio', type=str,
450 | help='bio input path')
451 | parser.add_argument('--ann', type=str,
452 | help='ace annotation input path')
453 | parser.add_argument('--ace', type=str,
454 | help='output ace annotation path')
455 | parser.add_argument('--filelist', type=str,
456 | help='filelist path')
457 |
458 | args = parser.parse_args()
459 |
460 | bio_path = args.bio
461 | ann_path = args.ann
462 | ace_path = args.ace
463 |
464 | if not os.path.exists(ace_path):
465 | os.makedirs(ace_path)
466 |
467 |     if os.path.isdir(bio_path):
468 |         file_names = [item[:-4]
469 |                       for item in os.listdir(bio_path)
470 |                       if item.endswith(".bio")]
471 |     else:  # a single .bio file: split off its directory so the path joins below still work
472 |         bio_path, bio_name = os.path.split(bio_path)
473 |         file_names = [bio_name[:-4]]
474 |
475 | for f in file_names:
476 | # print(f)
477 |         bio_file = os.path.join(bio_path, f + ".bio")
478 |         ann_file = os.path.join(ann_path, f + ".apf.xml")
479 |         ace_file = os.path.join(ace_path, f + ".csv")
480 |
481 | if os.path.exists(bio_file) and os.path.exists(ann_file):
482 | write_ann(bio_file, ann_file, ace_file)
483 |
484 |
--------------------------------------------------------------------------------
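The CSV written by `write_ann` above contains one row per token plus a `----sentence_delimiter----` row between sentences. A minimal sketch (the output path is hypothetical) of how such a file might be read back into sentences of (token, offset, ner_type) tuples:

```
# Hypothetical sketch of consuming the token-level CSV produced by bio2ace.py; the path is illustrative only.
import csv

sentences, current = [], []
with open("ace_out/example_doc.csv", newline='') as f:
    for row in csv.DictReader(f):
        if row["token"] == "----sentence_delimiter----":
            # delimiter rows mark sentence boundaries
            if current:
                sentences.append(current)
                current = []
        else:
            current.append((row["token"], row["offset"], row["ner_type"]))
if current:
    sentences.append(current)

print(len(sentences), "sentences read")
```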