├── .gitignore ├── model_config.yml ├── README.md ├── test_crcnn.py ├── dataio.py ├── model.py └── train_crcnn.py /.gitignore: -------------------------------------------------------------------------------- 1 | .project 2 | .pydevproject 3 | *models/* 4 | *data/* 5 | *__pycache__/* 6 | -------------------------------------------------------------------------------- /model_config.yml: -------------------------------------------------------------------------------- 1 | data_dir: ./data/ 2 | model_dir: ./models/ 3 | train_file: TRAIN_FILE.TXT 4 | test_file: TEST_FILE_FULL.TXT 5 | dtype: float32 6 | embeddings.dim: 50 7 | embeddings.tune: false 8 | embeddings.file: ./data/glove.6B.50d.txt 9 | embeddings.mat.file: null 10 | embeddings.init_scale: 0.25 11 | embeddings.dist.dim: 25 12 | learning_rate: 0.01 13 | training_iters: 40 14 | batch_size: 20 15 | eval_interval: 1 16 | train_step_eval: 1 17 | nprint: -1 18 | window: 19 | - 2 20 | - 3 21 | - 4 22 | - 5 23 | nfeature_map: 25 24 | sent_length: 90 25 | dropout: 0.5 26 | l2: 0.01 27 | device: '/cpu:0' 28 | lm: 1.0 29 | margin_plus: 2.5 30 | margin_minus: 0.5 31 | experiment_name: 'test' 32 | seed: 123 33 | devset_size: 0.2 -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Classifying Relations by Ranking with Convolutional Neural Networks 2 | 3 | Implementation of ACL 2015 Paper: 4 | [Classifying Relations by Ranking with Convolutional Neural Networks](https://arxiv.org/abs/1504.06580) 5 | 6 | ## Download SemEval 2010 Task 8 Dataset for Relation Classification 7 | Here is the link to download this dataset: 8 | [link](https://drive.google.com/file/d/0B_jQiLugGTAkMDQ5ZjZiMTUtMzQ1Yy00YWNmLWJlZDYtOWY1ZDMwY2U4YjFk/view?layout=list&ddrp=1&sort=name&num=50) 9 | 10 | You will also need to download some pre-trained embeddings like 11 | [GloVe](https://nlp.stanford.edu/projects/glove/). 
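With the default `model_config.yml`, the dataset and embeddings are read from `./data/` and
checkpoints, the vocabulary, label encoder and embedding matrix are written to `./models/`
(both directories are git-ignored, and all of these paths and file names are configurable), e.g.:

```
.
├── data/
│   ├── TRAIN_FILE.TXT
│   ├── TEST_FILE_FULL.TXT
│   └── glove.6B.50d.txt
└── models/
```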
12 | 
13 | ## Dependencies
14 | ```
15 | tensorflow (1.3.0)
16 | spacy
17 | pandas
18 | numpy
19 | scikit-learn
20 | ```
21 | 
22 | ## Training
23 | Update the paths in `model_config.yml`, then start training with:
24 | 
25 | ```
26 | python3 -m train_crcnn
27 | ```
28 | 
29 | 
30 | ## Evaluation
31 | Once training has finished and there are trained checkpoints in your model directory,
32 | evaluate a model with:
33 | 
34 | ```
35 | python3 -m test_crcnn --model_name <checkpoint_prefix> --config_file <model_dir>/<experiment_name>_params.yml
36 | ```
37 | 
38 | `--config_file` must point to the `<experiment_name>_params.yml` file that training writes to the
39 | model directory (e.g. `./models/test_params.yml` with the default config), since that file records
40 | the vocabulary, label-encoder and embedding files needed at test time.
--------------------------------------------------------------------------------
/test_crcnn.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on 1 March 2018
3 | 
4 | @author: Bhanu
5 | 
6 | '''
7 | import tensorflow as tf
8 | import os
9 | import numpy as np
10 | import yaml
11 | from train_crcnn import load_sents_data_semeval2010, build_model,\
12 |     build_data_streams, Vocab
13 | import pickle
14 | from sklearn.metrics.classification import f1_score, classification_report
15 | import argparse
16 | import sys
17 | 
18 | FLAGS = None
19 | 
20 | def main(_):
21 |     test_model = FLAGS.model_name
22 |     config_file = FLAGS.config_file
23 | 
24 |     with open(config_file, 'r') as rf:
25 |         params = yaml.load(rf)
26 | 
27 |     seed = params.get('seed')
28 |     tf.set_random_seed(seed)
29 | 
30 |     test_data_filename = params.get('test_file')
31 | 
32 |     data_dir = params.get('data_dir')
33 |     model_dir = params.get('model_dir')
34 | 
35 |     print("loading data...", flush=True)
36 |     test_data_file = os.path.join(data_dir, test_data_filename)
37 |     dftest = load_sents_data_semeval2010(test_data_file, testset=True)
38 |     with open(os.path.join(model_dir, params.get('label_encoder_file')), 'rb') as rf:
39 |         le = pickle.load(rf)
40 |     print(le.classes_)
41 | 
42 |     #load the word vocab saved during training
43 |     print("loading vocab...", flush=True)
44 |     with open(os.path.join(model_dir, params.get('vocab_file')), 'rb') as rf:
45 |         vocab = pickle.load(rf)
46 | 
47 |     is_test_labels = dftest.class_.any()  #truthy when gold labels are available
48 | 
49 |     # build input data streams
50 |     teststream = build_data_streams(dftest, vocab.dict,
51 |                                     params.get('sent_length'), le)
52 | 
53 |     labels = teststream.label
54 |     if labels is None:
55 |         labels = np.zeros(teststream.sent.shape[0])
56 | 
57 |     #build model
58 |     mdl = build_model(params)
59 |     test_feed_dict = {
60 |         mdl.sent: teststream.sent,
61 |         mdl.label: labels,
62 |         mdl.ent1_dist: teststream.ent1_dist,
63 |         mdl.ent2_dist: teststream.ent2_dist,
64 |         mdl.dropout_keep_proba: 1.0,
65 |         mdl.batch_size: teststream.sent.shape[0]
66 |     }
67 | 
68 |     #run the graph
69 |     init_op = tf.global_variables_initializer()
70 |     saver = tf.train.Saver(tf.global_variables(), max_to_keep=10)
71 |     with tf.Session() as sess:
72 |         sess.run(init_op)
73 |         saver.restore(sess, os.path.join(model_dir, test_model))
74 |         print("Restored session from %s"%test_model)
75 |         pred_probas, preds = sess.run([mdl.pred_probas, mdl.preds], test_feed_dict)
76 | 
77 |     #print scores, if test labels are known
78 |     if is_test_labels:
79 |         l = teststream.label
80 |         p = preds
81 | 
82 |         class_int_labels = list(range(len(le.classes_)))
83 |         target_names=le.classes_
84 | 
85 |         eval_score = (f1_score(l, p, average='micro'),
86 |                       f1_score(l, p, average='macro')
87 |                       )
88 |         print("EVAL f1_micro {:g} f1_macro {:g}"
89 |               .format(eval_score[0], eval_score[1]), flush=True)
90 | 
91 |         print("Classification Report: \n%s"%
92 |               classification_report(l, p,
93 |                                     labels=class_int_labels,
94 |                                     target_names=target_names,
95 |                                     ), flush=True)
96 | 
97 | 
98 | if __name__ == '__main__':
99 |     parser = 
argparse.ArgumentParser() 100 | parser.add_argument('--model_name', type=str, default=None, 101 | help='Checkpoint Prefix of the model to be tested') 102 | parser.add_argument('--config_file', type=str, default=None, 103 | help='Full path of the config file') 104 | 105 | FLAGS, unparsed = parser.parse_known_args() 106 | tf.app.run(main=main, argv=[sys.argv[0]] + unparsed) 107 | 108 | -------------------------------------------------------------------------------- /dataio.py: -------------------------------------------------------------------------------- 1 | import collections 2 | from _collections import defaultdict 3 | import pandas as pd 4 | import numpy as np 5 | import spacy 6 | import csv 7 | 8 | nlp = spacy.load('en') 9 | 10 | SpecialVocab = collections.namedtuple('SpecialVocab', ['sos', 'eos', 'unknown', 11 | 'padding']) 12 | special_vocab = SpecialVocab(sos='SEQUENCE_START', eos='SEQUENCE_END', 13 | unknown="UNK", padding='-PAD-') 14 | 15 | 16 | def words_to_indices(seq, vocab_dict): 17 | ''' 18 | @param seq: list of words/tokens 19 | ''' 20 | word_indices = [] 21 | for w in seq: 22 | if w in vocab_dict: 23 | v = vocab_dict.get(w) 24 | if v is None: 25 | print("Got None for %s"%w) 26 | word_indices.append(v) 27 | else: 28 | word_indices.append(vocab_dict.get(special_vocab.unknown)) 29 | print("Couldn't find %s"%w) 30 | return word_indices 31 | 32 | def build_vocab(sentsdf, min_freq=1): 33 | vocab_dict = defaultdict(int) 34 | for _, row in sentsdf.iterrows(): 35 | for w in row.words: 36 | vocab_dict[w] += 1 37 | 38 | #drop all words with less than min_freq 39 | vocabdf = pd.DataFrame({'word': list(vocab_dict.keys()), 40 | 'freq': list(vocab_dict.values())}) 41 | vocabdf = vocabdf[vocabdf.freq >= min_freq] 42 | 43 | #add special vocab, with padding at index 0 44 | vocab_ = [special_vocab.padding, special_vocab.unknown, 45 | special_vocab.eos, special_vocab.sos] + vocabdf.word.values.tolist() 46 | return vocab_ 47 | 48 | def pad_seq(seq, max_len): 49 | seq_len = len(seq) 50 | if len(seq) > max_len: 51 | seq = seq[:max_len] 52 | seq_len = max_len 53 | else: 54 | seq = seq+[special_vocab.padding]*(max_len-seq_len) 55 | return seq 56 | 57 | def process_sequence(df, vocab_dict, max_len): 58 | ''' 59 | 1. Tokenize 60 | 2. Pad 61 | 3. Convert to wordindices 62 | 4. Convert to relative distances from entity1 and entity2 for each word. 
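       For step 4, each token position i is encoded as i - i1 and i - i2, where i1 and i2
       are the positions of the last word of entity 1 and entity 2 in the padded sequence;
       the distances are shifted by max_len before being returned, so that they are
       non-negative and can be used directly as indices into the position-embedding table.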
63 |     '''
64 |     word_indices = []
65 |     ent1_dists = []
66 |     ent2_dists = []
67 |     for _, row in df.iterrows():
68 |         seq = row.sent
69 | 
70 |         words = row.words
71 | 
72 |         padded_words = [special_vocab.sos] + words + [special_vocab.eos]
73 |         padded_words = pad_seq(padded_words, max_len)
74 |         wi = words_to_indices(padded_words, vocab_dict)
75 | 
76 |         e1_end = int(row.ent_1_end)  #seq.index(row.ent_1)+len(row.ent_1)
77 |         e2_end = int(row.ent_2_end)  #seq.index(row.ent_2)+len(row.ent_2)
78 |         newseq = seq[:e1_end].strip()+" entity_1_end "+ seq[e1_end:e2_end].strip() + \
79 |                  " entity_2_end "+ seq[e2_end:].strip()
80 | 
81 |         newseq_words = [tok.text for tok in nlp.tokenizer(newseq)]
82 |         newseq_words = [special_vocab.sos] + newseq_words + [special_vocab.eos]
83 |         i1 = newseq_words.index('entity_1_end') - 1  #TODO: Use head of entity-phrase instead of rightmost word
84 |         i2 = newseq_words.index('entity_2_end') - 2
85 | 
86 |         ent1_dist = [i-i1 for i in range(len(padded_words))]
87 |         ent2_dist = [i-i2 for i in range(len(padded_words))]
88 | 
89 |         word_indices.append(wi)
90 |         ent1_dists.append(ent1_dist)
91 |         ent2_dists.append(ent2_dist)
92 | 
93 |     word_indices, ent1_dists, ent2_dists = \
94 |         np.asarray(word_indices), np.asarray(ent1_dists), np.asarray(ent2_dists)
95 | 
96 |     ent1_dists += max_len
97 |     ent2_dists += max_len
98 | 
99 |     return word_indices, ent1_dists, ent2_dists
100 | 
101 | 
102 | def read_semeval2010_data(filename):
103 |     data = {'rel':[], 'sent': [], 'ent_1':[], 'ent_2':[], 'words':[],
104 |             'ent_1_start':[], 'ent_2_start':[], 'ent_1_end':[], 'ent_2_end':[]}
105 |     etags = ['<e1>', '</e1>', '<e2>', '</e2>']
106 |     with open(filename, 'r') as rf:
107 |         for line in rf:
108 |             _, sent = line.split('\t')
109 | 
110 |             rel = next(rf).strip().upper()
111 |             next(rf) #comment
112 |             next(rf) #blank line
113 |             e1 = sent[sent.index('<e1>')+4:sent.index('</e1>')]
114 |             e2 = sent[sent.index('<e2>')+4:sent.index('</e2>')]
115 |             e1_start = sent.index('<e1>') - 1
116 |             e2_start = sent.index('<e2>') - 1*4 - 1*5 - 1  #compensating for the <e1></e1> tags, and the leading "
117 |             e1_end = sent.index('</e1>') - 1*4 - 1
118 |             e2_end = sent.index('</e2>') - 2*4 - 1*5 - 1
119 | 
120 |             for tag_ in etags:
121 |                 sent = sent.replace(tag_, "")
122 |             sent = sent.strip().lower()[1:-1]
123 |             words = [tok.text for tok in nlp.tokenizer(sent)]
124 |             data['sent'].append(sent)
125 |             data['ent_1'].append(e1)
126 |             data['ent_2'].append(e2)
127 |             data['rel'].append(rel)
128 |             data['words'].append(words)
129 |             data['ent_1_start'].append(e1_start)
130 |             data['ent_1_end'].append(e1_end)
131 |             data['ent_2_start'].append(e2_start)
132 |             data['ent_2_end'].append(e2_end)
133 |     df = pd.DataFrame.from_dict(data)
134 |     return df
135 | 
136 | def read_embeddings(embeddings_path, vocab_, init_scale=0.25,
137 |                     dtype='float32', random_state=None):
138 | 
139 |     if random_state is None:
140 |         random_state = np.random.RandomState(10)
141 | 
142 |     vocab_vec = pd.read_csv(embeddings_path, header=None, skiprows=[0],
143 |                             sep=' ', index_col=0, quoting=csv.QUOTE_NONE)
144 |     cols = ['col%d'%x for x in range(vocab_vec.shape[1])]
145 |     vocab_vec.columns = cols
146 | 
147 |     # known_words = [w for w in vocab_ if w in vocab_vec.index]
148 |     # known_mat = vocab_vec.ix[known_words,:]
149 |     # known_mat.to_csv(embeddings_path+".aclaug", sep=' ', index_label='word')
150 | 
151 |     print("Vocab Size: %d"%len(vocab_), flush=True)
152 |     unknown_words = [w for w in vocab_ if w not in vocab_vec.index]
153 | 
154 |     print("adding %d unknown words..."%len(unknown_words), flush=True)
155 |     emb_dim = vocab_vec.shape[1]
156 |     rnd_mat = random_state.uniform(-init_scale, 
init_scale, 157 | size=(len(unknown_words), emb_dim)) 158 | rnd_df = pd.DataFrame(rnd_mat, index=unknown_words, columns=cols) 159 | vocab_vec = pd.concat([vocab_vec, rnd_df], axis=0) 160 | embeddings_mat = vocab_vec.ix[vocab_,:] 161 | return embeddings_mat 162 | 163 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 1 Mar 2018 3 | 4 | @author: Bhanu 5 | 6 | ''' 7 | 8 | import tensorflow as tf 9 | import os 10 | import pickle 11 | 12 | def variable_on_device(name, shape, initializer, device): 13 | with tf.device(device): 14 | v = tf.get_variable(name=name, shape=shape, initializer=initializer) 15 | return v 16 | 17 | class CRCNN: 18 | ''' 19 | Implementation of Classification by Ranking CNN. 20 | Refer: Classifying Relations by Ranking with Convolutional Neural Networks. 21 | C´ıcero Nogueira dos Santos, Bing Xiang, Bowen Zhou 22 | ''' 23 | 24 | def __init__(self, params): 25 | self.is_training = False if params.get('mode') == 'INFER' else True 26 | 27 | #graph inputs 28 | self.sent = tf.placeholder(dtype=tf.int32, 29 | shape=[None, params.get('sent_length')], name='sent') 30 | self.label = tf.placeholder(dtype=tf.int32, 31 | shape=[None], name='label') 32 | self.ent1_dist = tf.placeholder(dtype=tf.int32, shape=[None, None], 33 | name='ent1_dist') 34 | self.ent2_dist = tf.placeholder(dtype=tf.int32, shape=[None, None], 35 | name='ent2_dist') 36 | self.dropout_keep_proba = tf.placeholder(dtype=params.get('dtype'), 37 | name='dropout') 38 | self.batch_size = tf.placeholder(dtype=tf.int32, name='batch_size') 39 | 40 | self.scope = tf.get_variable_scope() 41 | 42 | #graph variables for each of the layers in cnn architecture 43 | ## Embeddings layer 44 | with tf.device(params.get('device')): 45 | with open(os.path.join(params.get('model_dir'), 46 | params.get('embeddings.mat.file')), 'rb') as rf: 47 | embeddings_mat = pickle.load(rf) 48 | self.sent_embedding = tf.get_variable(name="W_s", 49 | trainable=params['embeddings.tune'], 50 | initializer=tf.constant(embeddings_mat)) 51 | self.dist_embedding = tf.get_variable(name='W_d', 52 | shape=[2*params.get('sent_length')-1, params['embeddings.dist.dim']], 53 | initializer=tf.random_uniform_initializer( 54 | -params["embeddings.init_scale"], 55 | params["embeddings.init_scale"])) 56 | 57 | ##embeddings look-up operation 58 | sent_input = tf.nn.embedding_lookup(params=self.sent_embedding, ids=self.sent) 59 | ent1_dist_input = tf.nn.embedding_lookup(params=self.dist_embedding, 60 | ids=self.ent1_dist) 61 | ent2_dist_input = tf.nn.embedding_lookup(params=self.dist_embedding, 62 | ids=self.ent2_dist) 63 | conv_input = tf.concat([sent_input, ent1_dist_input, ent2_dist_input], 64 | axis=-1) 65 | conv_input = tf.expand_dims(conv_input, -1, name='input') 66 | input_dim = params.get('embeddings.dim') + 2*params.get('embeddings.dist.dim') 67 | 68 | ##Convolutional & pooling Layers 69 | with tf.variable_scope('conv') as scope: 70 | pool_tensors = [] 71 | for w_size in params.get('window'): 72 | fw = variable_on_device(name='fw_'+str(w_size), 73 | shape=[w_size, input_dim, 1, params.get('nfeature_map')], 74 | initializer=tf.random_uniform_initializer( 75 | -params["embeddings.init_scale"], 76 | params["embeddings.init_scale"]), 77 | device=params.get('device')) 78 | conv = tf.nn.conv2d(input=conv_input, filter=fw, 79 | strides=[1,1,1,1], padding='VALID') 80 | biases = variable_on_device(name='biases_'+str(w_size), 81 
| shape=[params.get('nfeature_map')], 82 | initializer=tf.constant_initializer(0.0), 83 | device=params.get('device')) 84 | bias = tf.nn.bias_add(conv, biases) 85 | relu = tf.nn.relu(bias, name=scope.name) 86 | conv_len = relu.get_shape()[1] 87 | pool = tf.nn.max_pool(relu, ksize=[1,conv_len,1,1], 88 | strides=[1,1,1,1], padding='VALID') 89 | pool = tf.squeeze(pool,squeeze_dims=[1,2]) 90 | pool_tensors.append(pool) 91 | 92 | ##pooling & concatenation operation 93 | num_filters = len(params.get('window')) 94 | pool_size = num_filters * params.get('nfeature_map') 95 | pool_layer = tf.concat(pool_tensors, -1, name='pool') 96 | pool_flat = tf.reshape(pool_layer, [-1, pool_size]) 97 | 98 | ##Dropout Layer 99 | pool_dropout = tf.nn.dropout(pool_flat, keep_prob=self.dropout_keep_proba) 100 | 101 | ##Dense Projection Layer 102 | input_ = pool_dropout 103 | input_size = pool_size 104 | with tf.variable_scope('fc') as scope: 105 | W = variable_on_device(name='W', shape=[input_size, params.get('nclass')], 106 | initializer=tf.random_uniform_initializer( 107 | -params["embeddings.init_scale"], 108 | params["embeddings.init_scale"]), 109 | device=params.get('device')) 110 | 111 | biases = variable_on_device(name='biases', shape=[params.get('nclass')], 112 | initializer=tf.constant_initializer(0.01), 113 | device=params.get('device')) 114 | ##dense layer operation 115 | self.logits = tf.nn.bias_add(tf.matmul(input_, W), biases) 116 | 117 | ##softmax 118 | self.pred_probas = tf.nn.softmax(self.logits, name='class_proba') 119 | self.preds = tf.argmax(self.pred_probas, axis=-1, name='class_prediction') 120 | 121 | #loss using graph's output(s) 122 | self._loss = self._loss(params) 123 | self.l2loss = self._l2loss(params) 124 | self.loss = self._loss + self.l2loss 125 | 126 | #evaluation metric using graph's output(s) 127 | ##precision & recall evaluation metric 128 | with tf.variable_scope('eval_metric') as scope: 129 | self.accuracy, self.accuracy_op = tf.metrics.accuracy(self.label, 130 | self.preds, name='accuracy') 131 | # Isolate the variables stored behind the scenes by the metric operation 132 | running_vars = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES) 133 | 134 | # Define initializer to initialize/reset running eval_metric variables 135 | self.running_vars_initializer = tf.variables_initializer(var_list=running_vars) 136 | 137 | def _loss(self, params): 138 | return CRCNN.ranking_loss(params, self.label, self.logits, 139 | self.batch_size) 140 | 141 | 142 | def _l2loss(self, params): 143 | vars_ = [v for v in tf.trainable_variables() if 'biases' not in v.name 144 | and 'W_d' not in v.name and 'W_s' not in v.name] 145 | l2loss = tf.multiply(tf.add_n([ tf.nn.l2_loss(v) for v in vars_ ]), 146 | params.get('l2'), name='l2loss') 147 | return l2loss 148 | 149 | @staticmethod 150 | def ranking_loss(params, labels, logits, batch_size): 151 | lm = tf.constant(params.get('lm')) #lambda 152 | m_plus = tf.constant(params.get('margin_plus')) 153 | m_minus = tf.constant(params.get('margin_minus')) 154 | 155 | L = tf.constant(0.0) 156 | i = tf.constant(0) 157 | cond = lambda i, L: tf.less(i, batch_size) 158 | 159 | def loop_body(i, L): 160 | cplus = labels[i] #positive class label index 161 | #taking most informative negative class, use 2nd argmax 162 | _, cminus_indices = tf.nn.top_k(logits[i,:], k=2) 163 | cminus = tf.cond(tf.equal(cplus, cminus_indices[0]), 164 | lambda: cminus_indices[1], lambda: cminus_indices[0]) 165 | 166 | splus = logits[i,cplus] #score for gold class 167 | sminus = logits[i,cminus] 
#score for negative class 168 | 169 | l = tf.log((1.0+tf.exp((lm*(m_plus-splus))))) + \ 170 | tf.log((1.0+tf.exp((lm*(m_minus+sminus))))) 171 | 172 | return [tf.add(i, 1), tf.add(L,l)] 173 | 174 | _, L = tf.while_loop(cond, loop_body, loop_vars=[i,L]) 175 | nbatch = tf.to_float(batch_size) 176 | L = L/nbatch 177 | return L 178 | 179 | 180 | 181 | -------------------------------------------------------------------------------- /train_crcnn.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 1 March 2018 3 | 4 | @author: Bhanu 5 | 6 | ''' 7 | import tensorflow as tf 8 | 9 | from dataio import process_sequence, build_vocab, read_semeval2010_data,\ 10 | read_embeddings 11 | from model import CRCNN 12 | 13 | import collections 14 | import pandas as pd 15 | import numpy as np 16 | import pickle 17 | import os 18 | import yaml 19 | from sklearn.preprocessing.label import LabelEncoder 20 | from sklearn.model_selection._split import StratifiedShuffleSplit 21 | from sklearn.metrics.classification import f1_score, classification_report 22 | import argparse 23 | import sys 24 | 25 | 26 | 27 | FLAGS = None 28 | 29 | DataStream = collections.namedtuple('DataStream', 30 | field_names=['sent', 'label', 'ent1_dist', 'ent2_dist']) 31 | Vocab = collections.namedtuple('Vocab', 32 | field_names=['words', 'size', 'dict', 'inv_dict']) 33 | 34 | def build_data_streams(df, vocab_dict, max_len, label_encoder): 35 | sents, ent1_dist, ent2_dist = process_sequence(df, vocab_dict, max_len) 36 | if df.class_.any(): 37 | labels = label_encoder.transform(df.class_.values) 38 | else: #test dataframe 39 | labels = None 40 | 41 | datastream = DataStream(sent=sents, label=labels, 42 | ent1_dist=ent1_dist, ent2_dist=ent2_dist) 43 | 44 | return datastream 45 | 46 | def build_model(params): 47 | mdl = CRCNN(params) 48 | return mdl 49 | 50 | def load_vocab(vocab_file): 51 | with open(vocab_file, 'rb') as rf: 52 | vocab_list = pickle.load(rf) 53 | vocab_size= len(vocab_list) 54 | vocab_dict = dict(zip(vocab_list, range(vocab_size))) 55 | vocab_inv_dict = dict(zip(range(vocab_size), vocab_list)) 56 | vocab = Vocab(vocab_list, vocab_size, vocab_dict, vocab_inv_dict) 57 | return vocab 58 | 59 | def load_sents_data_semeval2010(data_file, testset=False): 60 | 61 | df = read_semeval2010_data(data_file) 62 | 63 | non_other = ~(df.rel == 'OTHER') 64 | df['class_'] = 'OTHER' 65 | df.loc[non_other, 'class_'] = df.loc[non_other,:].rel 66 | 67 | return df 68 | 69 | 70 | def main(_): 71 | if(FLAGS.config is None): 72 | config_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), 73 | 'model_config.yml') 74 | else: 75 | config_file = FLAGS.config 76 | with open(config_file, 'r') as rf: 77 | params = yaml.load(rf) 78 | 79 | seed = params.get('seed') 80 | random_state = np.random.RandomState(seed) 81 | tf.set_random_seed(seed) 82 | 83 | data_dir = params.get('data_dir') 84 | model_dir = params.get('model_dir') 85 | experiment_name = params.get('experiment_name') 86 | train_data_filename = params.get('train_file') 87 | test_data_filename = params.get('test_file') 88 | 89 | #load sentences data 90 | print("loading data...", flush=True) 91 | train_data_file = os.path.join(data_dir, train_data_filename) 92 | test_data_file = os.path.join(data_dir, test_data_filename) 93 | dftrain = load_sents_data_semeval2010(train_data_file) 94 | dftest = load_sents_data_semeval2010(test_data_file, testset=True) 95 | dftraintest = pd.concat([dftrain, dftest], ignore_index=True).reset_index(drop=True) 
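    # dftraintest is used only to build the vocabulary (see build_vocab below), so that
    # test-set tokens also get rows in the embedding matrix instead of all mapping to UNK;
    # the label encoder and the training/dev split are fit on dftrain alone.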
96 | le = LabelEncoder().fit(dftrain.class_.values) 97 | params['nclass'] = len(le.classes_) 98 | params['label_encoder_file'] = experiment_name+'_label_encoder.pkl' 99 | #oversample class w/ only one example, hack for stratified cv 100 | dftrain = pd.concat([dftrain, dftrain[dftrain.rel=='ENTITY-DESTINATION(E2,E1)']], 101 | ignore_index=True).reset_index(drop=True) 102 | 103 | #build vocab 104 | print("building vocab...", flush=True) 105 | vocab_list = build_vocab(dftraintest) 106 | vocab_size= len(vocab_list) 107 | vocab_dict = dict(zip(vocab_list, range(vocab_size))) 108 | vocab_inv_dict = dict(zip(range(vocab_size), vocab_list)) 109 | vocab = Vocab(vocab_list, vocab_size, vocab_dict, vocab_inv_dict) 110 | params['vocab_file'] = experiment_name+'_vocab.pkl' 111 | 112 | #read embeddings 113 | print("reading embeddings...", flush=True) 114 | vocab_vec = read_embeddings(params['embeddings.file'], 115 | vocab.words, 116 | params['embeddings.init_scale'], 117 | params['dtype'], random_state) 118 | embeddings_mat = np.asarray(vocab_vec.values, dtype=params['dtype']) 119 | embeddings_mat[0,:] = 0 #make embeddings of PADDING all zeros 120 | params['embeddings.mat.file'] = experiment_name+'_embeddings.pkl' 121 | 122 | 123 | #save params, vocab and embeddings in model directory for testing 124 | print("saving params, vocab, le and embeddings...", flush=True) 125 | with open(os.path.join(model_dir, experiment_name+'_params.yml'), 'w') as wf: 126 | yaml.dump(params, wf, default_flow_style=False) 127 | with open(os.path.join(model_dir, params.get('vocab_file')), 'wb') as wf: 128 | pickle.dump(vocab, wf) 129 | with open(os.path.join(model_dir, params.get('embeddings.mat.file')), 'wb') as wf: 130 | pickle.dump(embeddings_mat, wf) 131 | with open(os.path.join(model_dir, params.get('label_encoder_file')), 'wb') as wf: 132 | pickle.dump(le, wf) 133 | 134 | 135 | ##cross-validation 136 | sss = StratifiedShuffleSplit(n_splits=1, random_state=random_state, 137 | test_size=params.get('devset_size')) 138 | for trainidx, devidx in sss.split(dftrain.values, dftrain.rel.values): 139 | cvtraindf = dftrain.iloc[trainidx,:] 140 | cvdevdf = dftrain.iloc[devidx,:] 141 | experiment_name = params.get('experiment_name') 142 | 143 | tstream = build_data_streams(cvtraindf, vocab.dict, 144 | params.get('sent_length'), le 145 | 146 | ) 147 | dstream = build_data_streams(cvdevdf, vocab.dict, 148 | params.get('sent_length'), le 149 | ) 150 | 151 | print("Training Data Shape: ", cvtraindf.shape) 152 | print("Dev Data Shape: ", cvdevdf.shape) 153 | print("Classes: ", le.classes_) 154 | 155 | def graph_ops(): 156 | #2. 
build model and define its loss minimization approach(training operation) 157 | mdl = build_model(params) 158 | 159 | ##defining an optimizer to minimize model's loss 160 | global_step = tf.Variable(0, name="global_step", trainable=False) 161 | learning_rate = params.get('learning_rate') 162 | optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, 163 | momentum=0.8) 164 | train_op = optimizer.minimize(mdl.loss, global_step=global_step) 165 | 166 | # Summaries for loss & metrics 167 | loss_summary = tf.summary.scalar("loss", mdl.loss) 168 | acc_summary = tf.summary.scalar("accuracy", mdl.accuracy) 169 | 170 | init_op = tf.global_variables_initializer() 171 | saver = tf.train.Saver(tf.global_variables(), max_to_keep=10) 172 | 173 | return mdl, global_step, train_op, loss_summary, acc_summary, init_op, \ 174 | saver 175 | 176 | with tf.Session() as sess: 177 | mdl, global_step, train_op, loss_summary, acc_summary, init_op, \ 178 | saver = graph_ops() 179 | sess.run(init_op) 180 | 181 | #summaries 182 | ##train summaries 183 | train_summary_dir = os.path.join(model_dir, "summaries", experiment_name, "train") 184 | train_summary_op = tf.summary.merge([loss_summary, acc_summary]) 185 | train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph, flush_secs=3) 186 | ##dev summaries 187 | dev_summary_op = tf.summary.merge([loss_summary, acc_summary]) 188 | dev_summary_dir = os.path.join(model_dir, "summaries", experiment_name, "dev") 189 | dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph, flush_secs=3) 190 | 191 | # train step 192 | def train_epoch(): 193 | ntrain = tstream.sent.shape[0] 194 | bsize = params.get('batch_size') 195 | start = 0 196 | end = 0 197 | for start in range(0, ntrain, bsize): 198 | end = start + bsize 199 | if end > ntrain: 200 | end = ntrain 201 | 202 | train_feed_dict = { 203 | mdl.sent: tstream.sent[start:end,:], 204 | mdl.label: tstream.label[start:end], 205 | mdl.ent1_dist: tstream.ent1_dist[start:end,:], 206 | mdl.ent2_dist: tstream.ent2_dist[start:end,:], 207 | mdl.dropout_keep_proba: params.get('dropout'), 208 | mdl.batch_size: end-start 209 | } 210 | sess.run([train_op, global_step, mdl.loss], train_feed_dict) 211 | 212 | def train_eval_step(): 213 | sess.run(mdl.running_vars_initializer) 214 | train_feed_dict = { 215 | mdl.sent: tstream.sent, 216 | mdl.label: tstream.label, 217 | mdl.ent1_dist: tstream.ent1_dist, 218 | mdl.ent2_dist: tstream.ent2_dist, 219 | mdl.dropout_keep_proba: 1.0, 220 | mdl.batch_size: tstream.sent.shape[0] 221 | } 222 | tstep, tloss = sess.run([global_step, mdl.loss], train_feed_dict) 223 | sess.run(mdl.accuracy_op, train_feed_dict) 224 | tsummary = sess.run(train_summary_op, train_feed_dict) 225 | train_summary_writer.add_summary(tsummary, tstep) 226 | train_eval_score = sess.run(mdl.accuracy) 227 | return tstep, tloss, train_eval_score 228 | 229 | def eval_step(): 230 | sess.run(mdl.running_vars_initializer) 231 | dev_feed_dict = { 232 | mdl.sent: dstream.sent, 233 | mdl.label: dstream.label, 234 | mdl.ent1_dist: dstream.ent1_dist, 235 | mdl.ent2_dist: dstream.ent2_dist, 236 | mdl.dropout_keep_proba: 1.0, 237 | mdl.batch_size: dstream.label.shape[0] 238 | } 239 | 240 | dstep, dloss, preds = sess.run([global_step, mdl.loss, 241 | mdl.preds], dev_feed_dict) 242 | sess.run(mdl.accuracy_op, dev_feed_dict) 243 | dacc_ = sess.run(mdl.accuracy) 244 | l = dstream.label 245 | p = preds 246 | 247 | class_int_labels = list(range(len(le.classes_))) 248 | target_names=le.classes_ 249 | 250 | 
sess.run(mdl.accuracy_op, dev_feed_dict) 251 | dsummary = sess.run(dev_summary_op, dev_feed_dict) 252 | dev_summary_writer.add_summary(dsummary, dstep) 253 | eval_score = (f1_score(l, p, average='micro'), 254 | f1_score(l, p, average='macro'), 255 | dacc_ 256 | ) 257 | print("EVAL step {}, loss {:g}, f1_micro {:g} f1_macro {:g} accuracy {:g}" 258 | .format(tstep, dloss, eval_score[0], eval_score[1], eval_score[2]), 259 | flush=True) 260 | official_score = eval_score[1] 261 | 262 | print("Classification Report: \n%s"% 263 | classification_report(l, p, 264 | labels=class_int_labels, 265 | target_names=target_names, 266 | ), flush=True) 267 | 268 | return official_score 269 | 270 | #training loop 271 | best_score = 0.0; best_step = 0; best_itr = 0; 272 | for ite in range(params.get('training_iters')): 273 | train_epoch() 274 | if ite%params.get('train_step_eval') == 0: 275 | tstep, tloss, tacc_ = train_eval_step() 276 | 277 | if ite%params.get('train_step_eval') == 0: 278 | print("TRAIN step {}, iteration {} loss {:g} accuracy {:g}" 279 | .format(tstep, ite, tloss, tacc_), 280 | flush=True) 281 | 282 | current_step = tf.train.global_step(sess, global_step) 283 | if current_step % params.get('eval_interval') == 0: 284 | official_score = eval_step() 285 | if best_score < official_score: 286 | checkpoint_prefix = os.path.join(params.get('model_dir'), 287 | "%s-score-%s"%(experiment_name, str(official_score))) 288 | saver.save(sess, checkpoint_prefix, global_step=current_step) 289 | 290 | best_score = official_score 291 | best_step = current_step 292 | best_itr = ite 293 | print("Best Score: %2.3f, Best Step: %d (iteration: %d)" 294 | %(best_score, best_step, best_itr)) 295 | 296 | 297 | if __name__ == '__main__': 298 | parser = argparse.ArgumentParser() 299 | parser.add_argument('--config', type=str, default=None, 300 | help='Path to the config file.') 301 | 302 | FLAGS, unparsed = parser.parse_known_args() 303 | tf.app.run(main=main, argv=[sys.argv[0]] + unparsed) --------------------------------------------------------------------------------
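Note: `CRCNN.ranking_loss` in `model.py` implements the pairwise ranking loss of dos Santos et al. (2015)
with a `tf.while_loop` over the batch. The NumPy sketch below is only an illustration of the same
per-batch computation and is not part of the repository; the default margins and scaling factor are the
values from `model_config.yml` (`margin_plus: 2.5`, `margin_minus: 0.5`, `lm: 1.0`).

```
import numpy as np

def ranking_loss_np(logits, labels, lm=1.0, m_plus=2.5, m_minus=0.5):
    """Illustrative NumPy version of CRCNN.ranking_loss (not used by the code)."""
    total = 0.0
    for scores, cplus in zip(logits, labels):
        order = np.argsort(scores)[::-1]                      # classes sorted by descending score
        cminus = order[1] if order[0] == cplus else order[0]  # most informative negative class
        splus, sminus = scores[cplus], scores[cminus]         # gold-class and negative-class scores
        total += np.log1p(np.exp(lm * (m_plus - splus))) \
               + np.log1p(np.exp(lm * (m_minus + sminus)))
    return total / len(labels)
```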