├── .DS_Store
├── .gitignore
├── README.md
├── __init__.py
├── cached_objects
│   └── vocabularies_dict.pickle
├── dataset.py
├── logger.py
├── main.py
├── model.py
├── parameters.py
├── parameters.txt
├── test.py
└── trainer.py

/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mkacan/entity-relation-extraction/6cf352535b15f9f2d099c04fb54729662347f761/.DS_Store
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.pyc
.idea
logs/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Joint Entity and Relation Extraction with Sequential and Tree-structured LSTMs.

A TensorFlow implementation of the paper _End-to-End Relation Extraction using LSTMs on Sequences and Tree Structures_ (https://www.aclweb.org/anthology/P16-1105).

I used the _Annotated Corpus for Named Entity Recognition_ (https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus) to (pre)train the Sequence (entity detection) part of the model.

Still a (very early) work-in-progress.
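
### Data format

`dataset.py` reads each split as a plain CSV file with one token per row: `sentence_id,word,POS_tag,NE_tag`. Only the
first token of a sentence carries a value in the `sentence_id` column (it starts with `Sentence: `, which is how
sentence boundaries are detected). The rows below are only an illustration of that layout, not part of the repository;
check your copy of the Kaggle export and adjust `parse_word_line` if the columns differ:

```
Sentence: 1,Thousands,NNS,O
,of,IN,O
,demonstrators,NNS,O
,have,VBP,O
,marched,VBN,O
,through,IN,O
,London,NNP,B-geo
```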
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mkacan/entity-relation-extraction/6cf352535b15f9f2d099c04fb54729662347f761/__init__.py
--------------------------------------------------------------------------------
/cached_objects/vocabularies_dict.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mkacan/entity-relation-extraction/6cf352535b15f9f2d099c04fb54729662347f761/cached_objects/vocabularies_dict.pickle
--------------------------------------------------------------------------------
/dataset.py:
--------------------------------------------------------------------------------
import numpy as np
import _pickle
import os


class Dataset(object):
    def __init__(self, parameters):
        #self.word_to_index, self.index_to_word = self.create_vocabulary(parameters.train_file)

        self.phase_file_dict = {
            "train": parameters.train_file,
            "validate": parameters.validate_file,
            "test": parameters.test_file
        }

        self.current_file = None
        self.phase = None

        self.parameters = parameters

        self._create_vocabularies()

    def next_batch(self):
        word_lists = list()
        pos_lists = list()
        label_lists = list()

        batch_sents = list()
        for _ in range(self.parameters.batch_size):
            if self.phase is None:
                break

            sentence = self.next_sentence()
            if sentence is None:
                # the file ended exactly on a sentence boundary
                break

            batch_sents.append(sentence)

        # discard the incomplete batch at the end of an epoch
        if len(batch_sents) < self.parameters.batch_size:
            return None

        for word_list, pos_list, label_list in batch_sents:
            word_lists.append(word_list)
            pos_lists.append(pos_list)
            label_lists.append(label_list)

        return word_lists, pos_lists, label_lists

    def next_batch_np(self):
        batch = self.next_batch()
        if batch is None:
            return None

        word_lists, pos_lists, label_lists = batch

        max_len = len(max(word_lists, key=lambda x: len(x)))

        batch_size = self.parameters.batch_size

        words_np = np.zeros(shape=(batch_size, max_len))
        POSs_np = np.zeros(shape=(batch_size, max_len))
        labels_np = np.zeros(shape=(batch_size, max_len))
        mask_np = np.zeros(shape=(batch_size, max_len))

        word_UNK_index = self.word_to_index[self.parameters.SPECIAL_CHAR_UNK]
        POS_tag_UNK_index = self.POS_tag_to_index[self.parameters.SPECIAL_CHAR_UNK]
        NE_tag_UNK_index = self.NE_tag_to_index[self.parameters.SPECIAL_CHAR_UNK]

        for i, (word_list, POS_tag_list, NE_tag_list) in enumerate(zip(word_lists, pos_lists, label_lists)):
            word_index_list = list(map(lambda x: self.word_to_index.get(x, word_UNK_index), word_list))
            POS_tag_index_list = list(map(lambda x: self.POS_tag_to_index.get(x, POS_tag_UNK_index), POS_tag_list))
            NE_tag_index_list = list(map(lambda x: self.NE_tag_to_index.get(x, NE_tag_UNK_index), NE_tag_list))

            length = len(word_index_list)

            words_np[i, :length] = word_index_list
            POSs_np[i, :length] = POS_tag_index_list
            labels_np[i, :length] = NE_tag_index_list
            mask_np[i, :length] = 1

        return words_np, POSs_np, labels_np, mask_np

    def next_sentence(self):
        word_list = list()
        pos_list = list()
        label_list = list()

        line = self.current_file.readline()
        if line == "":
            # end of file reached before a new sentence started
            self.end_epoch()
            return None

        word, pos, label = self.parse_word_line(line)

        word_list.append(word)
        pos_list.append(pos)
        label_list.append(label)

        last_pos = self.current_file.tell()
        line = self.current_file.readline()

        while True:
            if line.startswith("Sentence: "):
                # the next sentence begins here; rewind so it is read on the next call
                self.current_file.seek(last_pos)
                break
            elif line == "":
                self.end_epoch()
                break

            word, pos, label = self.parse_word_line(line)

            word_list.append(word)
            pos_list.append(pos)
            label_list.append(label)

            last_pos = self.current_file.tell()
            line = self.current_file.readline()

        return word_list, pos_list, label_list

    def start_epoch(self, phase):
        if self.current_file is not None:
            self.current_file.close()

        self.current_file = open(self.phase_file_dict[phase])
        self.phase = phase

    def end_epoch(self):
        self.current_file.close()
        self.current_file = None
        self.phase = None

    def _create_vocabularies(self):
        if self.parameters.reuse_vocabularies and os.path.isfile(self.parameters.vocabularies_dir + "vocabularies_dict.pickle"):
            self._load_cached_vocabularies()
            return

        self.start_epoch("train")

        word_set = set()
        POS_tag_set = set()
        NE_tag_set = set()

        while self.phase == "train":
            sentence = self.next_sentence()
            if sentence is None:
                break

            words, POS_tags, NE_tags = sentence

            word_set.update(words)
            POS_tag_set.update(POS_tags)
            NE_tag_set.update(NE_tags)

        # add the special token UNK for all words/POS_tags/NE_tags not seen in the training set
        word_set.add(self.parameters.SPECIAL_CHAR_UNK)
        POS_tag_set.add(self.parameters.SPECIAL_CHAR_UNK)
        NE_tag_set.add(self.parameters.SPECIAL_CHAR_UNK)

        index_to_word, word_to_index = self.create_bidirectional_dicts(word_set)
        index_to_POS_tag, POS_tag_to_index = self.create_bidirectional_dicts(POS_tag_set)
        index_to_NE_tag, NE_tag_to_index = self.create_bidirectional_dicts(NE_tag_set)

        self.word_to_index = word_to_index
        self.index_to_word = index_to_word
        self.POS_tag_to_index = POS_tag_to_index
        self.index_to_POS_tag = index_to_POS_tag
        self.NE_tag_to_index = NE_tag_to_index
        self.index_to_NE_tag = index_to_NE_tag

        vocabularies_dict = dict()
        vocabularies_dict["word_to_index"] = word_to_index
        vocabularies_dict["index_to_word"] = index_to_word
        vocabularies_dict["POS_tag_to_index"] = POS_tag_to_index
        vocabularies_dict["index_to_POS_tag"] = index_to_POS_tag
        vocabularies_dict["NE_tag_to_index"] = NE_tag_to_index
        vocabularies_dict["index_to_NE_tag"] = index_to_NE_tag

        with open(self.parameters.vocabularies_dir + "vocabularies_dict.pickle", "wb") as f:
            _pickle.dump(vocabularies_dict, f, 2)

    def create_bidirectional_dicts(self, element_set):
        enumerated_list = list(enumerate(element_set))

        element_to_index = {element: index for index, element in enumerated_list}
        index_to_element = {index: element for index, element in enumerated_list}

        return index_to_element, element_to_index

    def parse_word_line(self, line):
        # Each line is a CSV row: sentence_id,word,POS_tag,NE_tag. Rows whose word or tag
        # contains a comma or a quote are quoted by the CSV writer and need special handling.
        if '"' in line:
            # a field consisting of a single literal quote is written as "" inside quotes, i.e. four quote characters
            line = line.replace('""""', '<$quot$>')
            parts = line.split('"')

            new_parts = list()

            # split('"') alternates between text outside quotes (split further on commas)
            # and the contents of quoted fields (kept whole)
            odd_n = True
            for part in parts:
                if odd_n:
                    for p in part.split(","):
                        new_parts.append(p)
                else:
                    new_parts.append(part)
                odd_n = not odd_n

            parsed = list(filter(lambda x: len(x) > 0, new_parts))
            parsed = list(map(lambda x: '"' if x == '<$quot$>' else x, parsed))

            [word, pos, label] = parsed
        else:
            splitted = line.split(",")
            [sent, word, pos, label] = splitted

        return word.strip(), pos.strip(), label.strip()

    def _load_cached_vocabularies(self):
        dir_path = self.parameters.vocabularies_dir

        with open(dir_path + "vocabularies_dict.pickle", "rb") as f:
            vocabularies_dict = _pickle.load(f)

        self.word_to_index = vocabularies_dict["word_to_index"]
        self.index_to_word = vocabularies_dict["index_to_word"]
        self.POS_tag_to_index = vocabularies_dict["POS_tag_to_index"]
        self.index_to_POS_tag = vocabularies_dict["index_to_POS_tag"]
        self.NE_tag_to_index = vocabularies_dict["NE_tag_to_index"]
        self.index_to_NE_tag = vocabularies_dict["index_to_NE_tag"]
--------------------------------------------------------------------------------
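The snippet below is not part of the repository; it is a minimal sketch of how the epoch API above is meant to be
driven (the pattern `main.py` and the eventual `Trainer` would follow): `start_epoch` opens the file for a phase,
`next_batch_np` returns padded index matrices plus a mask, and it returns `None` once the epoch is exhausted.

```python
from parameters import Parameters
from dataset import Dataset

parameters = Parameters()
dataset = Dataset(parameters)

dataset.start_epoch("train")
while True:
    batch = dataset.next_batch_np()
    if batch is None:
        break  # epoch finished (or the last, incomplete batch was discarded)

    words_np, POSs_np, labels_np, mask_np = batch
    # each array has shape (batch_size, longest_sentence_in_batch)
    print(words_np.shape, mask_np.sum(axis=1))  # per-sentence lengths can be read off the mask
```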
/logger.py:
--------------------------------------------------------------------------------
class Logger(object):
    def __init__(self, parameters):
        pass
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
from parameters import Parameters
from dataset import Dataset
from model import Model
from logger import Logger
from trainer import Trainer
import tensorflow as tf

if __name__ == '__main__':

    parameters = Parameters()

    dataset = Dataset(parameters)

    dataset.start_epoch("train")

    model = Model(parameters, dataset)

    # the initializer op must be created inside the model's graph, otherwise the session
    # (which runs on model.graph) cannot execute it
    with model.graph.as_default():
        init_op = tf.global_variables_initializer()

    with tf.Session(graph=model.graph) as sess:
        sess.run(init_op)

        word_indices, POS_tag_indices, NE_tag_indices, mask_indices = dataset.next_batch_np()

        print(word_indices)

        feed_dict = {
            model.word_sequence: word_indices,
            model.POS_tag_sequence: POS_tag_indices,
            model.label_sequence: NE_tag_indices
        }

        # smoke test: feed one batch and read the word sequence back
        # (mask_indices is not fed because the model has no mask placeholder yet)
        [w_seq] = sess.run(fetches=[model.word_sequence], feed_dict=feed_dict)

        print(w_seq)
        exit()

    logger = Logger(parameters)

    trainer = Trainer(parameters)

    trainer.train(dataset, model, logger)
--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
import tensorflow as tf


class Model(object):
    def __init__(self, parameters, dataset):
        self.parameters = parameters
        self.dataset = dataset

        self._build_graph()

    def _build_graph(self):
        graph = tf.Graph()

        with graph.as_default():
            self._construct_model()

        self.graph = graph

    def _construct_model(self):
        BATCH_SIZE = self.parameters.batch_size

        """
        ------------------------------- EMBEDDING LAYER START -------------------------------
        """

        # 1. Create placeholders for the word list, POS tag list, label list.
        self.word_sequence = tf.placeholder(dtype=tf.int32, shape=[BATCH_SIZE, None], name="word_sequence")
        self.POS_tag_sequence = tf.placeholder(dtype=tf.int32, shape=[BATCH_SIZE, None], name="POS_tag_sequence")

        self.label_sequence = tf.placeholder(dtype=tf.int32, shape=[BATCH_SIZE, None], name="label_sequence")

        # 2. Embed the words and the POS tags, each with its own learnable embedding matrix.

        # 3. Concatenate the word and POS embeddings into a list of vectors.

        """
        ------------------------------- EMBEDDING LAYER END -------------------------------
        """

        """
        ------------------------------- SEQUENCE LAYER START -------------------------------
        """

        # 4. Run a BiLSTM on the concatenated embeddings.

        # 5. Run a custom RNN that predicts the entity labels:
        #    - input: the output vector of the previous BiLSTM layer
        #    - hidden state:
        #        - incoming: the predicted entity label for the previous input
        #        - outgoing: the predicted entity label for the current input
        #    - output: the logits over the vocabulary of labels (NE tags); the unnormalized
        #      log-probability of each label for the current input

        """
        ------------------------------- SEQUENCE LAYER END -------------------------------
        """
--------------------------------------------------------------------------------
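Steps 2-4 above are still TODOs in the repository. The sketch below is not the author's implementation, just one way
they could look in TensorFlow 1.x; the vocabulary sizes, embedding dimensions, hidden size, and the `seq_lengths`
placeholder are all illustrative assumptions (the lengths could be obtained as `mask_np.sum(axis=1)` from
`dataset.next_batch_np`). Step 5, which feeds the previously predicted label back into the tagger, would need a custom
RNN cell or `tf.nn.raw_rnn` and is not shown.

```python
import tensorflow as tf

# illustrative sizes -- not taken from the repository
WORD_VOCAB_SIZE = 35000
POS_VOCAB_SIZE = 45
WORD_EMB_DIM = 100
POS_EMB_DIM = 25
LSTM_UNITS = 128

word_sequence = tf.placeholder(tf.int32, shape=[None, None], name="word_sequence")
POS_tag_sequence = tf.placeholder(tf.int32, shape=[None, None], name="POS_tag_sequence")
seq_lengths = tf.placeholder(tf.int32, shape=[None], name="sequence_lengths")  # assumed, e.g. mask_np.sum(axis=1)

# 2. one learnable embedding matrix per input type
word_emb_matrix = tf.get_variable("word_embeddings", shape=[WORD_VOCAB_SIZE, WORD_EMB_DIM])
POS_emb_matrix = tf.get_variable("POS_embeddings", shape=[POS_VOCAB_SIZE, POS_EMB_DIM])

word_emb = tf.nn.embedding_lookup(word_emb_matrix, word_sequence)  # (batch, time, WORD_EMB_DIM)
POS_emb = tf.nn.embedding_lookup(POS_emb_matrix, POS_tag_sequence)  # (batch, time, POS_EMB_DIM)

# 3. concatenate the word and POS embeddings per token
inputs = tf.concat([word_emb, POS_emb], axis=-1)  # (batch, time, WORD_EMB_DIM + POS_EMB_DIM)

# 4. BiLSTM over the concatenated embeddings; padded positions are skipped via sequence_length
cell_fw = tf.nn.rnn_cell.LSTMCell(LSTM_UNITS)
cell_bw = tf.nn.rnn_cell.LSTMCell(LSTM_UNITS)
(out_fw, out_bw), _ = tf.nn.bidirectional_dynamic_rnn(
    cell_fw, cell_bw, inputs, sequence_length=seq_lengths, dtype=tf.float32)
bilstm_out = tf.concat([out_fw, out_bw], axis=-1)  # (batch, time, 2 * LSTM_UNITS)
```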
/parameters.py:
--------------------------------------------------------------------------------
class Parameters(object):
    def __init__(self):
        with open("parameters.txt", "r") as f:
            lines = filter(lambda x: len(x) > 0, map(lambda l: l.strip(), f.readlines()))

        # each non-empty line is a KEY=VALUE pair; split only on the first "=" so values may contain "="
        parameter_dict = {pair[0]: pair[1] for pair in map(lambda x: x.split("=", 1), lines)}

        data_root = parameter_dict["DATA_ROOT"]
        problem_name = parameter_dict["PROBLEM_NAME"]
        batch_size = int(parameter_dict["BATCH_SIZE"])

        self.data_root = data_root
        self.problem_name = problem_name
        self.batch_size = batch_size

        self.data_folder = data_root + problem_name + "/"

        self.SPECIAL_CHAR_UNK = parameter_dict["SPECIAL_CHAR_UNK"]
        self.SPECIAL_CHAR_START = parameter_dict["SPECIAL_CHAR_START"]
        self.SPECIAL_CHAR_END = parameter_dict["SPECIAL_CHAR_END"]

        # the raw value is the string "True"/"False", which is always truthy; convert it to a real bool
        self.reuse_vocabularies = parameter_dict["REUSE_VOCABULARIES"].lower() == "true"
        self.vocabularies_dir = parameter_dict["VOCABULARIES_DIR"]

        self.train_file = self.data_folder + parameter_dict["TRAIN_FILE"]
        self.validate_file = self.data_folder + parameter_dict["VALIDATE_FILE"]
        self.test_file = self.data_folder + parameter_dict["TEST_FILE"]
--------------------------------------------------------------------------------
/parameters.txt:
--------------------------------------------------------------------------------
DATA_ROOT=/Users/marinkacan/datasets/entity_relation/

PROBLEM_NAME=ER_SMALL

BATCH_SIZE=2

TRAIN_FILE=train.csv
VALIDATE_FILE=validate.csv
TEST_FILE=test.csv

SPECIAL_CHAR_START=#START
SPECIAL_CHAR_END=#END
SPECIAL_CHAR_UNK=#UNK

REUSE_VOCABULARIES=True
VOCABULARIES_DIR=cached_objects/
--------------------------------------------------------------------------------
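For orientation only (not part of the repository): with the `parameters.txt` above, the `Parameters` object resolves
to values like the following. The machine-specific `DATA_ROOT` will of course differ on your setup, and paths are
built by plain string concatenation, so the directory values need their trailing slash.

```python
from parameters import Parameters

p = Parameters()
assert p.batch_size == 2
assert p.data_folder == "/Users/marinkacan/datasets/entity_relation/ER_SMALL/"
assert p.train_file == "/Users/marinkacan/datasets/entity_relation/ER_SMALL/train.csv"
assert p.reuse_vocabularies is True             # parsed from the string "True"
assert p.vocabularies_dir == "cached_objects/"  # trailing slash required
```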
/test.py:
--------------------------------------------------------------------------------
import tensorflow as tf

# scratch script: builds two parallel variable scopes and writes the graph to logs/ for inspection in TensorBoard

with tf.variable_scope("TRAIN"):
    v = tf.get_variable("v", shape=(), initializer=tf.zeros_initializer())
    # float constants so they can be multiplied with the float32 variable
    a = tf.constant(2.0, name="a")
    b = tf.constant(3.0, name="b")

    c = tf.add(a, b, name="c") * v

with tf.variable_scope("TEST"):
    v_t = tf.get_variable("v", shape=(), initializer=tf.ones_initializer())
    a_t = tf.constant(2.0, name="a")
    b_t = tf.constant(3.0, name="b")

    # use the TEST-scope constants here (the original used a and b from the TRAIN scope)
    c_t = tf.add(a_t, b_t, name="c") * v_t


writer = tf.summary.FileWriter(logdir="logs/")

with tf.Session() as sess:
    writer.add_graph(sess.graph)
    writer.flush()
    writer.close()
--------------------------------------------------------------------------------
/trainer.py:
--------------------------------------------------------------------------------
class Trainer(object):
    def __init__(self, parameters):
        pass

    def train(self, dataset, model, logger):
        pass
--------------------------------------------------------------------------------
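`Trainer` and `Logger` are still stubs. Purely as a sketch of how `train()` might eventually drive the other pieces
(not the author's plan), it could look roughly like the following; `model.loss`, `model.train_op`, and the epoch count
are assumptions here and do not exist in the repository yet, and logging is omitted.

```python
import tensorflow as tf

class Trainer(object):
    def __init__(self, parameters):
        self.parameters = parameters

    def train(self, dataset, model, logger):
        # the initializer op has to live in the model's graph
        with model.graph.as_default():
            init_op = tf.global_variables_initializer()

        with tf.Session(graph=model.graph) as sess:
            sess.run(init_op)

            for epoch in range(10):  # illustrative epoch count
                dataset.start_epoch("train")
                while True:
                    batch = dataset.next_batch_np()
                    if batch is None:
                        break
                    words_np, POSs_np, labels_np, mask_np = batch

                    feed_dict = {
                        model.word_sequence: words_np,
                        model.POS_tag_sequence: POSs_np,
                        model.label_sequence: labels_np,
                    }
                    # hypothetical ops: the model does not define a loss or training op yet
                    _, loss = sess.run([model.train_op, model.loss], feed_dict=feed_dict)
```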