├── .DS_Store
├── .gitignore
├── README.md
├── __init__.py
├── cached_objects
│   └── vocabularies_dict.pickle
├── dataset.py
├── logger.py
├── main.py
├── model.py
├── parameters.py
├── parameters.txt
├── test.py
└── trainer.py

/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mkacan/entity-relation-extraction/6cf352535b15f9f2d099c04fb54729662347f761/.DS_Store
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.pyc
.idea
logs/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Joint Entity and Relation Extraction with Sequential and Tree-structured LSTMs.

A TensorFlow implementation of the paper _End-to-End Relation Extraction using LSTMs on Sequences and Tree Structures_ (https://www.aclweb.org/anthology/P16-1105).

I used the _Annotated Corpus for Named Entity Recognition_ (https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus) to (pre)train the Sequence (entity detection) part of the model.

Still a (very early) work-in-progress.
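
### Data format

`dataset.py` reads each split as a plain CSV file with one token per row: `sentence_id,word,POS_tag,NE_tag`. Only the
first token of a sentence carries a value in the `sentence_id` column (it starts with `Sentence: `, which is how
sentence boundaries are detected). The rows below are only an illustration of that layout, not part of the repository;
check your copy of the Kaggle export and adjust `parse_word_line` if the columns differ:

```
Sentence: 1,Thousands,NNS,O
,of,IN,O
,demonstrators,NNS,O
,have,VBP,O
,marched,VBN,O
,through,IN,O
,London,NNP,B-geo
```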
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mkacan/entity-relation-extraction/6cf352535b15f9f2d099c04fb54729662347f761/__init__.py
--------------------------------------------------------------------------------
/cached_objects/vocabularies_dict.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mkacan/entity-relation-extraction/6cf352535b15f9f2d099c04fb54729662347f761/cached_objects/vocabularies_dict.pickle
--------------------------------------------------------------------------------
/dataset.py:
--------------------------------------------------------------------------------
import numpy as np
import _pickle
import os


class Dataset(object):
    def __init__(self, parameters):
        #self.word_to_index, self.index_to_word = self.create_vocabulary(parameters.train_file)

        self.phase_file_dict = {
            "train": parameters.train_file,
            "validate": parameters.validate_file,
            "test": parameters.test_file
        }

        self.current_file = None
        self.phase = None

        self.parameters = parameters

        self._create_vocabularies()

    def next_batch(self):
        word_lists = list()
        pos_lists = list()
        label_lists = list()

        batch_sents = list()
        for _ in range(self.parameters.batch_size):
            if self.phase is None:
                break

            sentence = self.next_sentence()
            if sentence is None:
                # the file ended exactly on a sentence boundary
                break

            batch_sents.append(sentence)

        # discard the incomplete batch at the end of an epoch
        if len(batch_sents) < self.parameters.batch_size:
            return None

        for word_list, pos_list, label_list in batch_sents:
            word_lists.append(word_list)
            pos_lists.append(pos_list)
            label_lists.append(label_list)

        return word_lists, pos_lists, label_lists

    def next_batch_np(self):
        batch = self.next_batch()
        if batch is None:
            return None

        word_lists, pos_lists, label_lists = batch

        max_len = len(max(word_lists, key=lambda x: len(x)))

        batch_size = self.parameters.batch_size

        words_np = np.zeros(shape=(batch_size, max_len))
        POSs_np = np.zeros(shape=(batch_size, max_len))
        labels_np = np.zeros(shape=(batch_size, max_len))
        mask_np = np.zeros(shape=(batch_size, max_len))

        word_UNK_index = self.word_to_index[self.parameters.SPECIAL_CHAR_UNK]
        POS_tag_UNK_index = self.POS_tag_to_index[self.parameters.SPECIAL_CHAR_UNK]
        NE_tag_UNK_index = self.NE_tag_to_index[self.parameters.SPECIAL_CHAR_UNK]

        for i, (word_list, POS_tag_list, NE_tag_list) in enumerate(zip(word_lists, pos_lists, label_lists)):
            word_index_list = list(map(lambda x: self.word_to_index.get(x, word_UNK_index), word_list))
            POS_tag_index_list = list(map(lambda x: self.POS_tag_to_index.get(x, POS_tag_UNK_index), POS_tag_list))
            NE_tag_index_list = list(map(lambda x: self.NE_tag_to_index.get(x, NE_tag_UNK_index), NE_tag_list))

            length = len(word_index_list)

            words_np[i, :length] = word_index_list
            POSs_np[i, :length] = POS_tag_index_list
            labels_np[i, :length] = NE_tag_index_list
            mask_np[i, :length] = 1

        return words_np, POSs_np, labels_np, mask_np

    def next_sentence(self):
        word_list = list()
        pos_list = list()
        label_list = list()

        line = self.current_file.readline()
        if line == "":
            # end of file reached before a new sentence started
            self.end_epoch()
            return None

        word, pos, label = self.parse_word_line(line)

        word_list.append(word)
        pos_list.append(pos)
        label_list.append(label)

        last_pos = self.current_file.tell()
        line = self.current_file.readline()

        while True:
            if line.startswith("Sentence: "):
                # the next sentence begins here; rewind so it is read on the next call
                self.current_file.seek(last_pos)
                break
            elif line == "":
                self.end_epoch()
                break

            word, pos, label = self.parse_word_line(line)

            word_list.append(word)
            pos_list.append(pos)
            label_list.append(label)

            last_pos = self.current_file.tell()
            line = self.current_file.readline()

        return word_list, pos_list, label_list

    def start_epoch(self, phase):
        if self.current_file is not None:
            self.current_file.close()

        self.current_file = open(self.phase_file_dict[phase])
        self.phase = phase

    def end_epoch(self):
        self.current_file.close()
        self.current_file = None
        self.phase = None

    def _create_vocabularies(self):
        if self.parameters.reuse_vocabularies and os.path.isfile(self.parameters.vocabularies_dir + "vocabularies_dict.pickle"):
            self._load_cached_vocabularies()
            return

        self.start_epoch("train")

        word_set = set()
        POS_tag_set = set()
        NE_tag_set = set()

        while self.phase == "train":
            sentence = self.next_sentence()
            if sentence is None:
                break

            words, POS_tags, NE_tags = sentence

            word_set.update(words)
            POS_tag_set.update(POS_tags)
            NE_tag_set.update(NE_tags)

        # add the special token UNK for all words/POS_tags/NE_tags not seen in the training set
        word_set.add(self.parameters.SPECIAL_CHAR_UNK)
        POS_tag_set.add(self.parameters.SPECIAL_CHAR_UNK)
        NE_tag_set.add(self.parameters.SPECIAL_CHAR_UNK)

        index_to_word, word_to_index = self.create_bidirectional_dicts(word_set)
        index_to_POS_tag, POS_tag_to_index = self.create_bidirectional_dicts(POS_tag_set)
        index_to_NE_tag, NE_tag_to_index = self.create_bidirectional_dicts(NE_tag_set)

        self.word_to_index = word_to_index
        self.index_to_word = index_to_word
        self.POS_tag_to_index = POS_tag_to_index
        self.index_to_POS_tag = index_to_POS_tag
        self.NE_tag_to_index = NE_tag_to_index
        self.index_to_NE_tag = index_to_NE_tag

        vocabularies_dict = dict()
        vocabularies_dict["word_to_index"] = word_to_index
        vocabularies_dict["index_to_word"] = index_to_word
        vocabularies_dict["POS_tag_to_index"] = POS_tag_to_index
        vocabularies_dict["index_to_POS_tag"] = index_to_POS_tag
        vocabularies_dict["NE_tag_to_index"] = NE_tag_to_index
        vocabularies_dict["index_to_NE_tag"] = index_to_NE_tag

        with open(self.parameters.vocabularies_dir + "vocabularies_dict.pickle", "wb") as f:
            _pickle.dump(vocabularies_dict, f, 2)

    def create_bidirectional_dicts(self, element_set):
        enumerated_list = list(enumerate(element_set))

        element_to_index = {element: index for index, element in enumerated_list}
        index_to_element = {index: element for index, element in enumerated_list}

        return index_to_element, element_to_index

    def parse_word_line(self, line):
        # Each line is a CSV row: sentence_id,word,POS_tag,NE_tag. Rows whose word or tag
        # contains a comma or a quote are quoted by the CSV writer and need special handling.
        if '"' in line:
            # a field consisting of a single literal quote is written as "" inside quotes, i.e. four quote characters
            line = line.replace('""""', '<$quot$>')
            parts = line.split('"')

            new_parts = list()

            # split('"') alternates between text outside quotes (split further on commas)
            # and the contents of quoted fields (kept whole)
            odd_n = True
            for part in parts:
                if odd_n:
                    for p in part.split(","):
                        new_parts.append(p)
                else:
                    new_parts.append(part)
                odd_n = not odd_n

            parsed = list(filter(lambda x: len(x) > 0, new_parts))
            parsed = list(map(lambda x: '"' if x == '<$quot$>' else x, parsed))

            [word, pos, label] = parsed
        else:
            splitted = line.split(",")
            [sent, word, pos, label] = splitted

        return word.strip(), pos.strip(), label.strip()

    def _load_cached_vocabularies(self):
        dir_path = self.parameters.vocabularies_dir

        with open(dir_path + "vocabularies_dict.pickle", "rb") as f:
            vocabularies_dict = _pickle.load(f)

        self.word_to_index = vocabularies_dict["word_to_index"]
        self.index_to_word = vocabularies_dict["index_to_word"]
        self.POS_tag_to_index = vocabularies_dict["POS_tag_to_index"]
        self.index_to_POS_tag = vocabularies_dict["index_to_POS_tag"]
        self.NE_tag_to_index = vocabularies_dict["NE_tag_to_index"]
        self.index_to_NE_tag = vocabularies_dict["index_to_NE_tag"]
--------------------------------------------------------------------------------
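The snippet below is not part of the repository; it is a minimal sketch of how the epoch API above is meant to be
driven (the pattern `main.py` and the eventual `Trainer` would follow): `start_epoch` opens the file for a phase,
`next_batch_np` returns padded index matrices plus a mask, and it returns `None` once the epoch is exhausted.

```python
from parameters import Parameters
from dataset import Dataset

parameters = Parameters()
dataset = Dataset(parameters)

dataset.start_epoch("train")
while True:
    batch = dataset.next_batch_np()
    if batch is None:
        break  # epoch finished (or the last, incomplete batch was discarded)

    words_np, POSs_np, labels_np, mask_np = batch
    # each array has shape (batch_size, longest_sentence_in_batch)
    print(words_np.shape, mask_np.sum(axis=1))  # per-sentence lengths can be read off the mask
```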
/logger.py:
--------------------------------------------------------------------------------
class Logger(object):
    def __init__(self, parameters):
        pass
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
from parameters import Parameters
from dataset import Dataset
from model import Model
from logger import Logger
from trainer import Trainer
import tensorflow as tf

if __name__ == '__main__':

    parameters = Parameters()

    dataset = Dataset(parameters)

    dataset.start_epoch("train")

    model = Model(parameters, dataset)

    # the initializer op must be created inside the model's graph, otherwise the session
    # (which runs on model.graph) cannot execute it
    with model.graph.as_default():
        init_op = tf.global_variables_initializer()

    with tf.Session(graph=model.graph) as sess:
        sess.run(init_op)

        word_indices, POS_tag_indices, NE_tag_indices, mask_indices = dataset.next_batch_np()

        print(word_indices)

        feed_dict = {
            model.word_sequence: word_indices,
            model.POS_tag_sequence: POS_tag_indices,
            model.label_sequence: NE_tag_indices
        }

        # smoke test: feed one batch and read the word sequence back
        # (mask_indices is not fed because the model has no mask placeholder yet)
        [w_seq] = sess.run(fetches=[model.word_sequence], feed_dict=feed_dict)

        print(w_seq)
        exit()

    logger = Logger(parameters)

    trainer = Trainer(parameters)

    trainer.train(dataset, model, logger)
--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
import tensorflow as tf


class Model(object):
    def __init__(self, parameters, dataset):
        self.parameters = parameters
        self.dataset = dataset

        self._build_graph()

    def _build_graph(self):
        graph = tf.Graph()

        with graph.as_default():
            self._construct_model()

        self.graph = graph

    def _construct_model(self):
        BATCH_SIZE = self.parameters.batch_size

        """
        ------------------------------- EMBEDDING LAYER START -------------------------------
        """

        # 1. Create placeholders for the word list, POS tag list, label list.
        self.word_sequence = tf.placeholder(dtype=tf.int32, shape=[BATCH_SIZE, None], name="word_sequence")
        self.POS_tag_sequence = tf.placeholder(dtype=tf.int32, shape=[BATCH_SIZE, None], name="POS_tag_sequence")

        self.label_sequence = tf.placeholder(dtype=tf.int32, shape=[BATCH_SIZE, None], name="label_sequence")

        # 2. Embed the words and the POS tags, each with its own learnable embedding matrix.

        # 3. Concatenate the word and POS embeddings into a list of vectors.

        """
        ------------------------------- EMBEDDING LAYER END -------------------------------
        """

        """
        ------------------------------- SEQUENCE LAYER START -------------------------------
        """

        # 4. Run a BiLSTM on the concatenated embeddings.

        # 5. Run a custom RNN that predicts the entity labels:
        #    - input: the output vector of the previous BiLSTM layer
        #    - hidden state:
        #        - incoming: the predicted entity label for the previous input
        #        - outgoing: the predicted entity label for the current input
        #    - output: the logits over the vocabulary of labels (NE tags); the unnormalized
        #      log-probability of each label for the current input

        """
        ------------------------------- SEQUENCE LAYER END -------------------------------
        """
--------------------------------------------------------------------------------
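Steps 2-4 above are still TODOs in the repository. The sketch below is not the author's implementation, just one way
they could look in TensorFlow 1.x; the vocabulary sizes, embedding dimensions, hidden size, and the `seq_lengths`
placeholder are all illustrative assumptions (the lengths could be obtained as `mask_np.sum(axis=1)` from
`dataset.next_batch_np`). Step 5, which feeds the previously predicted label back into the tagger, would need a custom
RNN cell or `tf.nn.raw_rnn` and is not shown.

```python
import tensorflow as tf

# illustrative sizes -- not taken from the repository
WORD_VOCAB_SIZE = 35000
POS_VOCAB_SIZE = 45
WORD_EMB_DIM = 100
POS_EMB_DIM = 25
LSTM_UNITS = 128

word_sequence = tf.placeholder(tf.int32, shape=[None, None], name="word_sequence")
POS_tag_sequence = tf.placeholder(tf.int32, shape=[None, None], name="POS_tag_sequence")
seq_lengths = tf.placeholder(tf.int32, shape=[None], name="sequence_lengths")  # assumed, e.g. mask_np.sum(axis=1)

# 2. one learnable embedding matrix per input type
word_emb_matrix = tf.get_variable("word_embeddings", shape=[WORD_VOCAB_SIZE, WORD_EMB_DIM])
POS_emb_matrix = tf.get_variable("POS_embeddings", shape=[POS_VOCAB_SIZE, POS_EMB_DIM])

word_emb = tf.nn.embedding_lookup(word_emb_matrix, word_sequence)  # (batch, time, WORD_EMB_DIM)
POS_emb = tf.nn.embedding_lookup(POS_emb_matrix, POS_tag_sequence)  # (batch, time, POS_EMB_DIM)

# 3. concatenate the word and POS embeddings per token
inputs = tf.concat([word_emb, POS_emb], axis=-1)  # (batch, time, WORD_EMB_DIM + POS_EMB_DIM)

# 4. BiLSTM over the concatenated embeddings; padded positions are skipped via sequence_length
cell_fw = tf.nn.rnn_cell.LSTMCell(LSTM_UNITS)
cell_bw = tf.nn.rnn_cell.LSTMCell(LSTM_UNITS)
(out_fw, out_bw), _ = tf.nn.bidirectional_dynamic_rnn(
    cell_fw, cell_bw, inputs, sequence_length=seq_lengths, dtype=tf.float32)
bilstm_out = tf.concat([out_fw, out_bw], axis=-1)  # (batch, time, 2 * LSTM_UNITS)
```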
/parameters.py:
--------------------------------------------------------------------------------
class Parameters(object):
    def __init__(self):
        with open("parameters.txt", "r") as f:
            lines = filter(lambda x: len(x) > 0, map(lambda l: l.strip(), f.readlines()))

        # each non-empty line is a KEY=VALUE pair; split only on the first "=" so values may contain "="
        parameter_dict = {pair[0]: pair[1] for pair in map(lambda x: x.split("=", 1), lines)}

        data_root = parameter_dict["DATA_ROOT"]
        problem_name = parameter_dict["PROBLEM_NAME"]
        batch_size = int(parameter_dict["BATCH_SIZE"])

        self.data_root = data_root
        self.problem_name = problem_name
        self.batch_size = batch_size

        self.data_folder = data_root + problem_name + "/"

        self.SPECIAL_CHAR_UNK = parameter_dict["SPECIAL_CHAR_UNK"]
        self.SPECIAL_CHAR_START = parameter_dict["SPECIAL_CHAR_START"]
        self.SPECIAL_CHAR_END = parameter_dict["SPECIAL_CHAR_END"]

        # the raw value is the string "True"/"False", which is always truthy; convert it to a real bool
        self.reuse_vocabularies = parameter_dict["REUSE_VOCABULARIES"].lower() == "true"
        self.vocabularies_dir = parameter_dict["VOCABULARIES_DIR"]

        self.train_file = self.data_folder + parameter_dict["TRAIN_FILE"]
        self.validate_file = self.data_folder + parameter_dict["VALIDATE_FILE"]
        self.test_file = self.data_folder + parameter_dict["TEST_FILE"]
--------------------------------------------------------------------------------
/parameters.txt:
--------------------------------------------------------------------------------
DATA_ROOT=/Users/marinkacan/datasets/entity_relation/

PROBLEM_NAME=ER_SMALL

BATCH_SIZE=2

TRAIN_FILE=train.csv
VALIDATE_FILE=validate.csv
TEST_FILE=test.csv

SPECIAL_CHAR_START=#START
SPECIAL_CHAR_END=#END
SPECIAL_CHAR_UNK=#UNK

REUSE_VOCABULARIES=True
VOCABULARIES_DIR=cached_objects/
--------------------------------------------------------------------------------
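For orientation only (not part of the repository): with the `parameters.txt` above, the `Parameters` object resolves
to values like the following. The machine-specific `DATA_ROOT` will of course differ on your setup, and paths are
built by plain string concatenation, so the directory values need their trailing slash.

```python
from parameters import Parameters

p = Parameters()
assert p.batch_size == 2
assert p.data_folder == "/Users/marinkacan/datasets/entity_relation/ER_SMALL/"
assert p.train_file == "/Users/marinkacan/datasets/entity_relation/ER_SMALL/train.csv"
assert p.reuse_vocabularies is True             # parsed from the string "True"
assert p.vocabularies_dir == "cached_objects/"  # trailing slash required
```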
/test.py:
--------------------------------------------------------------------------------
import tensorflow as tf

# scratch script: builds two parallel variable scopes and writes the graph to logs/ for inspection in TensorBoard

with tf.variable_scope("TRAIN"):
    v = tf.get_variable("v", shape=(), initializer=tf.zeros_initializer())
    # float constants so they can be multiplied with the float32 variable
    a = tf.constant(2.0, name="a")
    b = tf.constant(3.0, name="b")

    c = tf.add(a, b, name="c") * v

with tf.variable_scope("TEST"):
    v_t = tf.get_variable("v", shape=(), initializer=tf.ones_initializer())
    a_t = tf.constant(2.0, name="a")
    b_t = tf.constant(3.0, name="b")

    # use the TEST-scope constants here (the original used a and b from the TRAIN scope)
    c_t = tf.add(a_t, b_t, name="c") * v_t


writer = tf.summary.FileWriter(logdir="logs/")

with tf.Session() as sess:
    writer.add_graph(sess.graph)
    writer.flush()
    writer.close()
--------------------------------------------------------------------------------
/trainer.py:
--------------------------------------------------------------------------------
class Trainer(object):
    def __init__(self, parameters):
        pass

    def train(self, dataset, model, logger):
        pass
--------------------------------------------------------------------------------
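`Trainer` and `Logger` are still stubs. Purely as a sketch of how `train()` might eventually drive the other pieces
(not the author's plan), it could look roughly like the following; `model.loss`, `model.train_op`, and the epoch count
are assumptions here and do not exist in the repository yet, and logging is omitted.

```python
import tensorflow as tf

class Trainer(object):
    def __init__(self, parameters):
        self.parameters = parameters

    def train(self, dataset, model, logger):
        # the initializer op has to live in the model's graph
        with model.graph.as_default():
            init_op = tf.global_variables_initializer()

        with tf.Session(graph=model.graph) as sess:
            sess.run(init_op)

            for epoch in range(10):  # illustrative epoch count
                dataset.start_epoch("train")
                while True:
                    batch = dataset.next_batch_np()
                    if batch is None:
                        break
                    words_np, POSs_np, labels_np, mask_np = batch

                    feed_dict = {
                        model.word_sequence: words_np,
                        model.POS_tag_sequence: POSs_np,
                        model.label_sequence: labels_np,
                    }
                    # hypothetical ops: the model does not define a loss or training op yet
                    _, loss = sess.run([model.train_op, model.loss], feed_dict=feed_dict)
```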