├── .gitignore ├── model_config.yml ├── README.md ├── test_crcnn.py ├── dataio.py ├── model.py └── train_crcnn.py /.gitignore: -------------------------------------------------------------------------------- 1 | .project 2 | .pydevproject 3 | *models/* 4 | *data/* 5 | *__pycache__/* 6 | -------------------------------------------------------------------------------- /model_config.yml: -------------------------------------------------------------------------------- 1 | data_dir: ./data/ 2 | model_dir: ./models/ 3 | train_file: TRAIN_FILE.TXT 4 | test_file: TEST_FILE_FULL.TXT 5 | dtype: float32 6 | embeddings.dim: 50 7 | embeddings.tune: false 8 | embeddings.file: ./data/glove.6B.50d.txt 9 | embeddings.mat.file: null 10 | embeddings.init_scale: 0.25 11 | embeddings.dist.dim: 25 12 | learning_rate: 0.01 13 | training_iters: 40 14 | batch_size: 20 15 | eval_interval: 1 16 | train_step_eval: 1 17 | nprint: -1 18 | window: 19 | - 2 20 | - 3 21 | - 4 22 | - 5 23 | nfeature_map: 25 24 | sent_length: 90 25 | dropout: 0.5 26 | l2: 0.01 27 | device: '/cpu:0' 28 | lm: 1.0 29 | margin_plus: 2.5 30 | margin_minus: 0.5 31 | experiment_name: 'test' 32 | seed: 123 33 | devset_size: 0.2 -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Classifying Relations by Ranking with Convolutional Neural Networks 2 | 3 | Implementation of ACL 2015 Paper: 4 | [Classifying Relations by Ranking with Convolutional Neural Networks](https://arxiv.org/abs/1504.06580) 5 | 6 | ## Download SemEval 2010 Task 8 Dataset for Relation Classification 7 | Here is the link to download this dataset: 8 | [link](https://drive.google.com/file/d/0B_jQiLugGTAkMDQ5ZjZiMTUtMzQ1Yy00YWNmLWJlZDYtOWY1ZDMwY2U4YjFk/view?layout=list&ddrp=1&sort=name&num=50) 9 | 10 | You will also need to download some pre-trained embeddings like 11 | [GloVe](https://nlp.stanford.edu/projects/glove/). 
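With the default `model_config.yml`, the dataset and embeddings are read from `./data/` and
checkpoints, the vocabulary, label encoder and embedding matrix are written to `./models/`
(both directories are git-ignored, and all of these paths and file names are configurable), e.g.:

```
.
├── data/
│   ├── TRAIN_FILE.TXT
│   ├── TEST_FILE_FULL.TXT
│   └── glove.6B.50d.txt
└── models/
```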
12 | 
13 | ## Dependencies
14 | ```
15 | tensorflow (1.3.0)
16 | spacy
17 | pandas
18 | numpy
19 | scikit-learn
20 | ```
21 | 
22 | ## Training
23 | Update the paths in `model_config.yml`, then start training with:
24 | 
25 | ```
26 | python3 -m train_crcnn
27 | ```
28 | 
29 | 
30 | ## Evaluation
31 | Once training has finished and there are trained checkpoints in your model directory,
32 | evaluate a model with:
33 | 
34 | ```
35 | python3 -m test_crcnn --model_name <checkpoint_prefix> --config_file <model_dir>/<experiment_name>_params.yml
36 | ```
37 | 
38 | `--config_file` must point to the `<experiment_name>_params.yml` file that training writes to the
39 | model directory (e.g. `./models/test_params.yml` with the default config), since that file records
40 | the vocabulary, label-encoder and embedding files needed at test time.
--------------------------------------------------------------------------------
/test_crcnn.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on 1 March 2018
3 | 
4 | @author: Bhanu
5 | 
6 | '''
7 | import tensorflow as tf
8 | import os
9 | import numpy as np
10 | import yaml
11 | from train_crcnn import load_sents_data_semeval2010, build_model,\
12 |     build_data_streams, Vocab
13 | import pickle
14 | from sklearn.metrics.classification import f1_score, classification_report
15 | import argparse
16 | import sys
17 | 
18 | FLAGS = None
19 | 
20 | def main(_):
21 |     test_model = FLAGS.model_name
22 |     config_file = FLAGS.config_file
23 | 
24 |     with open(config_file, 'r') as rf:
25 |         params = yaml.load(rf)
26 | 
27 |     seed = params.get('seed')
28 |     tf.set_random_seed(seed)
29 | 
30 |     test_data_filename = params.get('test_file')
31 | 
32 |     data_dir = params.get('data_dir')
33 |     model_dir = params.get('model_dir')
34 | 
35 |     print("loading data...", flush=True)
36 |     test_data_file = os.path.join(data_dir, test_data_filename)
37 |     dftest = load_sents_data_semeval2010(test_data_file, testset=True)
38 |     with open(os.path.join(model_dir, params.get('label_encoder_file')), 'rb') as rf:
39 |         le = pickle.load(rf)
40 |     print(le.classes_)
41 | 
42 |     #load the word vocab saved during training
43 |     print("loading vocab...", flush=True)
44 |     with open(os.path.join(model_dir, params.get('vocab_file')), 'rb') as rf:
45 |         vocab = pickle.load(rf)
46 | 
47 |     is_test_labels = dftest.class_.any()  #truthy when gold labels are available
48 | 
49 |     # build input data streams
50 |     teststream = build_data_streams(dftest, vocab.dict,
51 |                                     params.get('sent_length'), le)
52 | 
53 |     labels = teststream.label
54 |     if labels is None:
55 |         labels = np.zeros(teststream.sent.shape[0])
56 | 
57 |     #build model
58 |     mdl = build_model(params)
59 |     test_feed_dict = {
60 |         mdl.sent: teststream.sent,
61 |         mdl.label: labels,
62 |         mdl.ent1_dist: teststream.ent1_dist,
63 |         mdl.ent2_dist: teststream.ent2_dist,
64 |         mdl.dropout_keep_proba: 1.0,
65 |         mdl.batch_size: teststream.sent.shape[0]
66 |     }
67 | 
68 |     #run the graph
69 |     init_op = tf.global_variables_initializer()
70 |     saver = tf.train.Saver(tf.global_variables(), max_to_keep=10)
71 |     with tf.Session() as sess:
72 |         sess.run(init_op)
73 |         saver.restore(sess, os.path.join(model_dir, test_model))
74 |         print("Restored session from %s"%test_model)
75 |         pred_probas, preds = sess.run([mdl.pred_probas, mdl.preds], test_feed_dict)
76 | 
77 |     #print scores, if test labels are known
78 |     if is_test_labels:
79 |         l = teststream.label
80 |         p = preds
81 | 
82 |         class_int_labels = list(range(len(le.classes_)))
83 |         target_names=le.classes_
84 | 
85 |         eval_score = (f1_score(l, p, average='micro'),
86 |                       f1_score(l, p, average='macro')
87 |                       )
88 |         print("EVAL f1_micro {:g} f1_macro {:g}"
89 |               .format(eval_score[0], eval_score[1]), flush=True)
90 | 
91 |         print("Classification Report: \n%s"%
92 |               classification_report(l, p,
93 |                                     labels=class_int_labels,
94 |                                     target_names=target_names,
95 |                                     ), flush=True)
96 | 
97 | 
98 | if __name__ == '__main__':
99 |     parser = 
argparse.ArgumentParser() 100 | parser.add_argument('--model_name', type=str, default=None, 101 | help='Checkpoint Prefix of the model to be tested') 102 | parser.add_argument('--config_file', type=str, default=None, 103 | help='Full path of the config file') 104 | 105 | FLAGS, unparsed = parser.parse_known_args() 106 | tf.app.run(main=main, argv=[sys.argv[0]] + unparsed) 107 | 108 | -------------------------------------------------------------------------------- /dataio.py: -------------------------------------------------------------------------------- 1 | import collections 2 | from _collections import defaultdict 3 | import pandas as pd 4 | import numpy as np 5 | import spacy 6 | import csv 7 | 8 | nlp = spacy.load('en') 9 | 10 | SpecialVocab = collections.namedtuple('SpecialVocab', ['sos', 'eos', 'unknown', 11 | 'padding']) 12 | special_vocab = SpecialVocab(sos='SEQUENCE_START', eos='SEQUENCE_END', 13 | unknown="UNK", padding='-PAD-') 14 | 15 | 16 | def words_to_indices(seq, vocab_dict): 17 | ''' 18 | @param seq: list of words/tokens 19 | ''' 20 | word_indices = [] 21 | for w in seq: 22 | if w in vocab_dict: 23 | v = vocab_dict.get(w) 24 | if v is None: 25 | print("Got None for %s"%w) 26 | word_indices.append(v) 27 | else: 28 | word_indices.append(vocab_dict.get(special_vocab.unknown)) 29 | print("Couldn't find %s"%w) 30 | return word_indices 31 | 32 | def build_vocab(sentsdf, min_freq=1): 33 | vocab_dict = defaultdict(int) 34 | for _, row in sentsdf.iterrows(): 35 | for w in row.words: 36 | vocab_dict[w] += 1 37 | 38 | #drop all words with less than min_freq 39 | vocabdf = pd.DataFrame({'word': list(vocab_dict.keys()), 40 | 'freq': list(vocab_dict.values())}) 41 | vocabdf = vocabdf[vocabdf.freq >= min_freq] 42 | 43 | #add special vocab, with padding at index 0 44 | vocab_ = [special_vocab.padding, special_vocab.unknown, 45 | special_vocab.eos, special_vocab.sos] + vocabdf.word.values.tolist() 46 | return vocab_ 47 | 48 | def pad_seq(seq, max_len): 49 | seq_len = len(seq) 50 | if len(seq) > max_len: 51 | seq = seq[:max_len] 52 | seq_len = max_len 53 | else: 54 | seq = seq+[special_vocab.padding]*(max_len-seq_len) 55 | return seq 56 | 57 | def process_sequence(df, vocab_dict, max_len): 58 | ''' 59 | 1. Tokenize 60 | 2. Pad 61 | 3. Convert to wordindices 62 | 4. Convert to relative distances from entity1 and entity2 for each word. 
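       For step 4, each token position i is encoded as i - i1 and i - i2, where i1 and i2
       are the positions of the last word of entity 1 and entity 2 in the padded sequence;
       the distances are shifted by max_len before being returned, so that they are
       non-negative and can be used directly as indices into the position-embedding table.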
63 |     '''
64 |     word_indices = []
65 |     ent1_dists = []
66 |     ent2_dists = []
67 |     for _, row in df.iterrows():
68 |         seq = row.sent
69 | 
70 |         words = row.words
71 | 
72 |         padded_words = [special_vocab.sos] + words + [special_vocab.eos]
73 |         padded_words = pad_seq(padded_words, max_len)
74 |         wi = words_to_indices(padded_words, vocab_dict)
75 | 
76 |         e1_end = int(row.ent_1_end)  #seq.index(row.ent_1)+len(row.ent_1)
77 |         e2_end = int(row.ent_2_end)  #seq.index(row.ent_2)+len(row.ent_2)
78 |         newseq = seq[:e1_end].strip()+" entity_1_end "+ seq[e1_end:e2_end].strip() + \
79 |                  " entity_2_end "+ seq[e2_end:].strip()
80 | 
81 |         newseq_words = [tok.text for tok in nlp.tokenizer(newseq)]
82 |         newseq_words = [special_vocab.sos] + newseq_words + [special_vocab.eos]
83 |         i1 = newseq_words.index('entity_1_end') - 1  #TODO: Use head of entity-phrase instead of rightmost word
84 |         i2 = newseq_words.index('entity_2_end') - 2
85 | 
86 |         ent1_dist = [i-i1 for i in range(len(padded_words))]
87 |         ent2_dist = [i-i2 for i in range(len(padded_words))]
88 | 
89 |         word_indices.append(wi)
90 |         ent1_dists.append(ent1_dist)
91 |         ent2_dists.append(ent2_dist)
92 | 
93 |     word_indices, ent1_dists, ent2_dists = \
94 |         np.asarray(word_indices), np.asarray(ent1_dists), np.asarray(ent2_dists)
95 | 
96 |     ent1_dists += max_len
97 |     ent2_dists += max_len
98 | 
99 |     return word_indices, ent1_dists, ent2_dists
100 | 
101 | 
102 | def read_semeval2010_data(filename):
103 |     data = {'rel':[], 'sent': [], 'ent_1':[], 'ent_2':[], 'words':[],
104 |             'ent_1_start':[], 'ent_2_start':[], 'ent_1_end':[], 'ent_2_end':[]}
105 |     etags = ['<e1>', '</e1>', '<e2>', '</e2>']
106 |     with open(filename, 'r') as rf:
107 |         for line in rf:
108 |             _, sent = line.split('\t')
109 | 
110 |             rel = next(rf).strip().upper()
111 |             next(rf) #comment
112 |             next(rf) #blank line
113 |             e1 = sent[sent.index('<e1>')+4:sent.index('</e1>')]
114 |             e2 = sent[sent.index('<e2>')+4:sent.index('</e2>')]
115 |             e1_start = sent.index('<e1>') - 1
116 |             e2_start = sent.index('<e2>') - 1*4 - 1*5 - 1  #compensating for the <e1></e1> tags, and the leading "
117 |             e1_end = sent.index('</e1>') - 1*4 - 1
118 |             e2_end = sent.index('</e2>') - 2*4 - 1*5 - 1
119 | 
120 |             for tag_ in etags:
121 |                 sent = sent.replace(tag_, "")
122 |             sent = sent.strip().lower()[1:-1]
123 |             words = [tok.text for tok in nlp.tokenizer(sent)]
124 |             data['sent'].append(sent)
125 |             data['ent_1'].append(e1)
126 |             data['ent_2'].append(e2)
127 |             data['rel'].append(rel)
128 |             data['words'].append(words)
129 |             data['ent_1_start'].append(e1_start)
130 |             data['ent_1_end'].append(e1_end)
131 |             data['ent_2_start'].append(e2_start)
132 |             data['ent_2_end'].append(e2_end)
133 |     df = pd.DataFrame.from_dict(data)
134 |     return df
135 | 
136 | def read_embeddings(embeddings_path, vocab_, init_scale=0.25,
137 |                     dtype='float32', random_state=None):
138 | 
139 |     if random_state is None:
140 |         random_state = np.random.RandomState(10)
141 | 
142 |     vocab_vec = pd.read_csv(embeddings_path, header=None, skiprows=[0],
143 |                             sep=' ', index_col=0, quoting=csv.QUOTE_NONE)
144 |     cols = ['col%d'%x for x in range(vocab_vec.shape[1])]
145 |     vocab_vec.columns = cols
146 | 
147 |     # known_words = [w for w in vocab_ if w in vocab_vec.index]
148 |     # known_mat = vocab_vec.ix[known_words,:]
149 |     # known_mat.to_csv(embeddings_path+".aclaug", sep=' ', index_label='word')
150 | 
151 |     print("Vocab Size: %d"%len(vocab_), flush=True)
152 |     unknown_words = [w for w in vocab_ if w not in vocab_vec.index]
153 | 
154 |     print("adding %d unknown words..."%len(unknown_words), flush=True)
155 |     emb_dim = vocab_vec.shape[1]
156 |     rnd_mat = random_state.uniform(-init_scale, 
init_scale, 157 | size=(len(unknown_words), emb_dim)) 158 | rnd_df = pd.DataFrame(rnd_mat, index=unknown_words, columns=cols) 159 | vocab_vec = pd.concat([vocab_vec, rnd_df], axis=0) 160 | embeddings_mat = vocab_vec.ix[vocab_,:] 161 | return embeddings_mat 162 | 163 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 1 Mar 2018 3 | 4 | @author: Bhanu 5 | 6 | ''' 7 | 8 | import tensorflow as tf 9 | import os 10 | import pickle 11 | 12 | def variable_on_device(name, shape, initializer, device): 13 | with tf.device(device): 14 | v = tf.get_variable(name=name, shape=shape, initializer=initializer) 15 | return v 16 | 17 | class CRCNN: 18 | ''' 19 | Implementation of Classification by Ranking CNN. 20 | Refer: Classifying Relations by Ranking with Convolutional Neural Networks. 21 | C´ıcero Nogueira dos Santos, Bing Xiang, Bowen Zhou 22 | ''' 23 | 24 | def __init__(self, params): 25 | self.is_training = False if params.get('mode') == 'INFER' else True 26 | 27 | #graph inputs 28 | self.sent = tf.placeholder(dtype=tf.int32, 29 | shape=[None, params.get('sent_length')], name='sent') 30 | self.label = tf.placeholder(dtype=tf.int32, 31 | shape=[None], name='label') 32 | self.ent1_dist = tf.placeholder(dtype=tf.int32, shape=[None, None], 33 | name='ent1_dist') 34 | self.ent2_dist = tf.placeholder(dtype=tf.int32, shape=[None, None], 35 | name='ent2_dist') 36 | self.dropout_keep_proba = tf.placeholder(dtype=params.get('dtype'), 37 | name='dropout') 38 | self.batch_size = tf.placeholder(dtype=tf.int32, name='batch_size') 39 | 40 | self.scope = tf.get_variable_scope() 41 | 42 | #graph variables for each of the layers in cnn architecture 43 | ## Embeddings layer 44 | with tf.device(params.get('device')): 45 | with open(os.path.join(params.get('model_dir'), 46 | params.get('embeddings.mat.file')), 'rb') as rf: 47 | embeddings_mat = pickle.load(rf) 48 | self.sent_embedding = tf.get_variable(name="W_s", 49 | trainable=params['embeddings.tune'], 50 | initializer=tf.constant(embeddings_mat)) 51 | self.dist_embedding = tf.get_variable(name='W_d', 52 | shape=[2*params.get('sent_length')-1, params['embeddings.dist.dim']], 53 | initializer=tf.random_uniform_initializer( 54 | -params["embeddings.init_scale"], 55 | params["embeddings.init_scale"])) 56 | 57 | ##embeddings look-up operation 58 | sent_input = tf.nn.embedding_lookup(params=self.sent_embedding, ids=self.sent) 59 | ent1_dist_input = tf.nn.embedding_lookup(params=self.dist_embedding, 60 | ids=self.ent1_dist) 61 | ent2_dist_input = tf.nn.embedding_lookup(params=self.dist_embedding, 62 | ids=self.ent2_dist) 63 | conv_input = tf.concat([sent_input, ent1_dist_input, ent2_dist_input], 64 | axis=-1) 65 | conv_input = tf.expand_dims(conv_input, -1, name='input') 66 | input_dim = params.get('embeddings.dim') + 2*params.get('embeddings.dist.dim') 67 | 68 | ##Convolutional & pooling Layers 69 | with tf.variable_scope('conv') as scope: 70 | pool_tensors = [] 71 | for w_size in params.get('window'): 72 | fw = variable_on_device(name='fw_'+str(w_size), 73 | shape=[w_size, input_dim, 1, params.get('nfeature_map')], 74 | initializer=tf.random_uniform_initializer( 75 | -params["embeddings.init_scale"], 76 | params["embeddings.init_scale"]), 77 | device=params.get('device')) 78 | conv = tf.nn.conv2d(input=conv_input, filter=fw, 79 | strides=[1,1,1,1], padding='VALID') 80 | biases = variable_on_device(name='biases_'+str(w_size), 81 
| shape=[params.get('nfeature_map')], 82 | initializer=tf.constant_initializer(0.0), 83 | device=params.get('device')) 84 | bias = tf.nn.bias_add(conv, biases) 85 | relu = tf.nn.relu(bias, name=scope.name) 86 | conv_len = relu.get_shape()[1] 87 | pool = tf.nn.max_pool(relu, ksize=[1,conv_len,1,1], 88 | strides=[1,1,1,1], padding='VALID') 89 | pool = tf.squeeze(pool,squeeze_dims=[1,2]) 90 | pool_tensors.append(pool) 91 | 92 | ##pooling & concatenation operation 93 | num_filters = len(params.get('window')) 94 | pool_size = num_filters * params.get('nfeature_map') 95 | pool_layer = tf.concat(pool_tensors, -1, name='pool') 96 | pool_flat = tf.reshape(pool_layer, [-1, pool_size]) 97 | 98 | ##Dropout Layer 99 | pool_dropout = tf.nn.dropout(pool_flat, keep_prob=self.dropout_keep_proba) 100 | 101 | ##Dense Projection Layer 102 | input_ = pool_dropout 103 | input_size = pool_size 104 | with tf.variable_scope('fc') as scope: 105 | W = variable_on_device(name='W', shape=[input_size, params.get('nclass')], 106 | initializer=tf.random_uniform_initializer( 107 | -params["embeddings.init_scale"], 108 | params["embeddings.init_scale"]), 109 | device=params.get('device')) 110 | 111 | biases = variable_on_device(name='biases', shape=[params.get('nclass')], 112 | initializer=tf.constant_initializer(0.01), 113 | device=params.get('device')) 114 | ##dense layer operation 115 | self.logits = tf.nn.bias_add(tf.matmul(input_, W), biases) 116 | 117 | ##softmax 118 | self.pred_probas = tf.nn.softmax(self.logits, name='class_proba') 119 | self.preds = tf.argmax(self.pred_probas, axis=-1, name='class_prediction') 120 | 121 | #loss using graph's output(s) 122 | self._loss = self._loss(params) 123 | self.l2loss = self._l2loss(params) 124 | self.loss = self._loss + self.l2loss 125 | 126 | #evaluation metric using graph's output(s) 127 | ##precision & recall evaluation metric 128 | with tf.variable_scope('eval_metric') as scope: 129 | self.accuracy, self.accuracy_op = tf.metrics.accuracy(self.label, 130 | self.preds, name='accuracy') 131 | # Isolate the variables stored behind the scenes by the metric operation 132 | running_vars = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES) 133 | 134 | # Define initializer to initialize/reset running eval_metric variables 135 | self.running_vars_initializer = tf.variables_initializer(var_list=running_vars) 136 | 137 | def _loss(self, params): 138 | return CRCNN.ranking_loss(params, self.label, self.logits, 139 | self.batch_size) 140 | 141 | 142 | def _l2loss(self, params): 143 | vars_ = [v for v in tf.trainable_variables() if 'biases' not in v.name 144 | and 'W_d' not in v.name and 'W_s' not in v.name] 145 | l2loss = tf.multiply(tf.add_n([ tf.nn.l2_loss(v) for v in vars_ ]), 146 | params.get('l2'), name='l2loss') 147 | return l2loss 148 | 149 | @staticmethod 150 | def ranking_loss(params, labels, logits, batch_size): 151 | lm = tf.constant(params.get('lm')) #lambda 152 | m_plus = tf.constant(params.get('margin_plus')) 153 | m_minus = tf.constant(params.get('margin_minus')) 154 | 155 | L = tf.constant(0.0) 156 | i = tf.constant(0) 157 | cond = lambda i, L: tf.less(i, batch_size) 158 | 159 | def loop_body(i, L): 160 | cplus = labels[i] #positive class label index 161 | #taking most informative negative class, use 2nd argmax 162 | _, cminus_indices = tf.nn.top_k(logits[i,:], k=2) 163 | cminus = tf.cond(tf.equal(cplus, cminus_indices[0]), 164 | lambda: cminus_indices[1], lambda: cminus_indices[0]) 165 | 166 | splus = logits[i,cplus] #score for gold class 167 | sminus = logits[i,cminus] 
#score for negative class 168 | 169 | l = tf.log((1.0+tf.exp((lm*(m_plus-splus))))) + \ 170 | tf.log((1.0+tf.exp((lm*(m_minus+sminus))))) 171 | 172 | return [tf.add(i, 1), tf.add(L,l)] 173 | 174 | _, L = tf.while_loop(cond, loop_body, loop_vars=[i,L]) 175 | nbatch = tf.to_float(batch_size) 176 | L = L/nbatch 177 | return L 178 | 179 | 180 | 181 | -------------------------------------------------------------------------------- /train_crcnn.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 1 March 2018 3 | 4 | @author: Bhanu 5 | 6 | ''' 7 | import tensorflow as tf 8 | 9 | from dataio import process_sequence, build_vocab, read_semeval2010_data,\ 10 | read_embeddings 11 | from model import CRCNN 12 | 13 | import collections 14 | import pandas as pd 15 | import numpy as np 16 | import pickle 17 | import os 18 | import yaml 19 | from sklearn.preprocessing.label import LabelEncoder 20 | from sklearn.model_selection._split import StratifiedShuffleSplit 21 | from sklearn.metrics.classification import f1_score, classification_report 22 | import argparse 23 | import sys 24 | 25 | 26 | 27 | FLAGS = None 28 | 29 | DataStream = collections.namedtuple('DataStream', 30 | field_names=['sent', 'label', 'ent1_dist', 'ent2_dist']) 31 | Vocab = collections.namedtuple('Vocab', 32 | field_names=['words', 'size', 'dict', 'inv_dict']) 33 | 34 | def build_data_streams(df, vocab_dict, max_len, label_encoder): 35 | sents, ent1_dist, ent2_dist = process_sequence(df, vocab_dict, max_len) 36 | if df.class_.any(): 37 | labels = label_encoder.transform(df.class_.values) 38 | else: #test dataframe 39 | labels = None 40 | 41 | datastream = DataStream(sent=sents, label=labels, 42 | ent1_dist=ent1_dist, ent2_dist=ent2_dist) 43 | 44 | return datastream 45 | 46 | def build_model(params): 47 | mdl = CRCNN(params) 48 | return mdl 49 | 50 | def load_vocab(vocab_file): 51 | with open(vocab_file, 'rb') as rf: 52 | vocab_list = pickle.load(rf) 53 | vocab_size= len(vocab_list) 54 | vocab_dict = dict(zip(vocab_list, range(vocab_size))) 55 | vocab_inv_dict = dict(zip(range(vocab_size), vocab_list)) 56 | vocab = Vocab(vocab_list, vocab_size, vocab_dict, vocab_inv_dict) 57 | return vocab 58 | 59 | def load_sents_data_semeval2010(data_file, testset=False): 60 | 61 | df = read_semeval2010_data(data_file) 62 | 63 | non_other = ~(df.rel == 'OTHER') 64 | df['class_'] = 'OTHER' 65 | df.loc[non_other, 'class_'] = df.loc[non_other,:].rel 66 | 67 | return df 68 | 69 | 70 | def main(_): 71 | if(FLAGS.config is None): 72 | config_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), 73 | 'model_config.yml') 74 | else: 75 | config_file = FLAGS.config 76 | with open(config_file, 'r') as rf: 77 | params = yaml.load(rf) 78 | 79 | seed = params.get('seed') 80 | random_state = np.random.RandomState(seed) 81 | tf.set_random_seed(seed) 82 | 83 | data_dir = params.get('data_dir') 84 | model_dir = params.get('model_dir') 85 | experiment_name = params.get('experiment_name') 86 | train_data_filename = params.get('train_file') 87 | test_data_filename = params.get('test_file') 88 | 89 | #load sentences data 90 | print("loading data...", flush=True) 91 | train_data_file = os.path.join(data_dir, train_data_filename) 92 | test_data_file = os.path.join(data_dir, test_data_filename) 93 | dftrain = load_sents_data_semeval2010(train_data_file) 94 | dftest = load_sents_data_semeval2010(test_data_file, testset=True) 95 | dftraintest = pd.concat([dftrain, dftest], ignore_index=True).reset_index(drop=True) 
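    # dftraintest is used only to build the vocabulary (see build_vocab below), so that
    # test-set tokens also get rows in the embedding matrix instead of all mapping to UNK;
    # the label encoder and the training/dev split are fit on dftrain alone.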
96 | le = LabelEncoder().fit(dftrain.class_.values) 97 | params['nclass'] = len(le.classes_) 98 | params['label_encoder_file'] = experiment_name+'_label_encoder.pkl' 99 | #oversample class w/ only one example, hack for stratified cv 100 | dftrain = pd.concat([dftrain, dftrain[dftrain.rel=='ENTITY-DESTINATION(E2,E1)']], 101 | ignore_index=True).reset_index(drop=True) 102 | 103 | #build vocab 104 | print("building vocab...", flush=True) 105 | vocab_list = build_vocab(dftraintest) 106 | vocab_size= len(vocab_list) 107 | vocab_dict = dict(zip(vocab_list, range(vocab_size))) 108 | vocab_inv_dict = dict(zip(range(vocab_size), vocab_list)) 109 | vocab = Vocab(vocab_list, vocab_size, vocab_dict, vocab_inv_dict) 110 | params['vocab_file'] = experiment_name+'_vocab.pkl' 111 | 112 | #read embeddings 113 | print("reading embeddings...", flush=True) 114 | vocab_vec = read_embeddings(params['embeddings.file'], 115 | vocab.words, 116 | params['embeddings.init_scale'], 117 | params['dtype'], random_state) 118 | embeddings_mat = np.asarray(vocab_vec.values, dtype=params['dtype']) 119 | embeddings_mat[0,:] = 0 #make embeddings of PADDING all zeros 120 | params['embeddings.mat.file'] = experiment_name+'_embeddings.pkl' 121 | 122 | 123 | #save params, vocab and embeddings in model directory for testing 124 | print("saving params, vocab, le and embeddings...", flush=True) 125 | with open(os.path.join(model_dir, experiment_name+'_params.yml'), 'w') as wf: 126 | yaml.dump(params, wf, default_flow_style=False) 127 | with open(os.path.join(model_dir, params.get('vocab_file')), 'wb') as wf: 128 | pickle.dump(vocab, wf) 129 | with open(os.path.join(model_dir, params.get('embeddings.mat.file')), 'wb') as wf: 130 | pickle.dump(embeddings_mat, wf) 131 | with open(os.path.join(model_dir, params.get('label_encoder_file')), 'wb') as wf: 132 | pickle.dump(le, wf) 133 | 134 | 135 | ##cross-validation 136 | sss = StratifiedShuffleSplit(n_splits=1, random_state=random_state, 137 | test_size=params.get('devset_size')) 138 | for trainidx, devidx in sss.split(dftrain.values, dftrain.rel.values): 139 | cvtraindf = dftrain.iloc[trainidx,:] 140 | cvdevdf = dftrain.iloc[devidx,:] 141 | experiment_name = params.get('experiment_name') 142 | 143 | tstream = build_data_streams(cvtraindf, vocab.dict, 144 | params.get('sent_length'), le 145 | 146 | ) 147 | dstream = build_data_streams(cvdevdf, vocab.dict, 148 | params.get('sent_length'), le 149 | ) 150 | 151 | print("Training Data Shape: ", cvtraindf.shape) 152 | print("Dev Data Shape: ", cvdevdf.shape) 153 | print("Classes: ", le.classes_) 154 | 155 | def graph_ops(): 156 | #2. 
build model and define its loss minimization approach(training operation) 157 | mdl = build_model(params) 158 | 159 | ##defining an optimizer to minimize model's loss 160 | global_step = tf.Variable(0, name="global_step", trainable=False) 161 | learning_rate = params.get('learning_rate') 162 | optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, 163 | momentum=0.8) 164 | train_op = optimizer.minimize(mdl.loss, global_step=global_step) 165 | 166 | # Summaries for loss & metrics 167 | loss_summary = tf.summary.scalar("loss", mdl.loss) 168 | acc_summary = tf.summary.scalar("accuracy", mdl.accuracy) 169 | 170 | init_op = tf.global_variables_initializer() 171 | saver = tf.train.Saver(tf.global_variables(), max_to_keep=10) 172 | 173 | return mdl, global_step, train_op, loss_summary, acc_summary, init_op, \ 174 | saver 175 | 176 | with tf.Session() as sess: 177 | mdl, global_step, train_op, loss_summary, acc_summary, init_op, \ 178 | saver = graph_ops() 179 | sess.run(init_op) 180 | 181 | #summaries 182 | ##train summaries 183 | train_summary_dir = os.path.join(model_dir, "summaries", experiment_name, "train") 184 | train_summary_op = tf.summary.merge([loss_summary, acc_summary]) 185 | train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph, flush_secs=3) 186 | ##dev summaries 187 | dev_summary_op = tf.summary.merge([loss_summary, acc_summary]) 188 | dev_summary_dir = os.path.join(model_dir, "summaries", experiment_name, "dev") 189 | dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph, flush_secs=3) 190 | 191 | # train step 192 | def train_epoch(): 193 | ntrain = tstream.sent.shape[0] 194 | bsize = params.get('batch_size') 195 | start = 0 196 | end = 0 197 | for start in range(0, ntrain, bsize): 198 | end = start + bsize 199 | if end > ntrain: 200 | end = ntrain 201 | 202 | train_feed_dict = { 203 | mdl.sent: tstream.sent[start:end,:], 204 | mdl.label: tstream.label[start:end], 205 | mdl.ent1_dist: tstream.ent1_dist[start:end,:], 206 | mdl.ent2_dist: tstream.ent2_dist[start:end,:], 207 | mdl.dropout_keep_proba: params.get('dropout'), 208 | mdl.batch_size: end-start 209 | } 210 | sess.run([train_op, global_step, mdl.loss], train_feed_dict) 211 | 212 | def train_eval_step(): 213 | sess.run(mdl.running_vars_initializer) 214 | train_feed_dict = { 215 | mdl.sent: tstream.sent, 216 | mdl.label: tstream.label, 217 | mdl.ent1_dist: tstream.ent1_dist, 218 | mdl.ent2_dist: tstream.ent2_dist, 219 | mdl.dropout_keep_proba: 1.0, 220 | mdl.batch_size: tstream.sent.shape[0] 221 | } 222 | tstep, tloss = sess.run([global_step, mdl.loss], train_feed_dict) 223 | sess.run(mdl.accuracy_op, train_feed_dict) 224 | tsummary = sess.run(train_summary_op, train_feed_dict) 225 | train_summary_writer.add_summary(tsummary, tstep) 226 | train_eval_score = sess.run(mdl.accuracy) 227 | return tstep, tloss, train_eval_score 228 | 229 | def eval_step(): 230 | sess.run(mdl.running_vars_initializer) 231 | dev_feed_dict = { 232 | mdl.sent: dstream.sent, 233 | mdl.label: dstream.label, 234 | mdl.ent1_dist: dstream.ent1_dist, 235 | mdl.ent2_dist: dstream.ent2_dist, 236 | mdl.dropout_keep_proba: 1.0, 237 | mdl.batch_size: dstream.label.shape[0] 238 | } 239 | 240 | dstep, dloss, preds = sess.run([global_step, mdl.loss, 241 | mdl.preds], dev_feed_dict) 242 | sess.run(mdl.accuracy_op, dev_feed_dict) 243 | dacc_ = sess.run(mdl.accuracy) 244 | l = dstream.label 245 | p = preds 246 | 247 | class_int_labels = list(range(len(le.classes_))) 248 | target_names=le.classes_ 249 | 250 | 
sess.run(mdl.accuracy_op, dev_feed_dict) 251 | dsummary = sess.run(dev_summary_op, dev_feed_dict) 252 | dev_summary_writer.add_summary(dsummary, dstep) 253 | eval_score = (f1_score(l, p, average='micro'), 254 | f1_score(l, p, average='macro'), 255 | dacc_ 256 | ) 257 | print("EVAL step {}, loss {:g}, f1_micro {:g} f1_macro {:g} accuracy {:g}" 258 | .format(tstep, dloss, eval_score[0], eval_score[1], eval_score[2]), 259 | flush=True) 260 | official_score = eval_score[1] 261 | 262 | print("Classification Report: \n%s"% 263 | classification_report(l, p, 264 | labels=class_int_labels, 265 | target_names=target_names, 266 | ), flush=True) 267 | 268 | return official_score 269 | 270 | #training loop 271 | best_score = 0.0; best_step = 0; best_itr = 0; 272 | for ite in range(params.get('training_iters')): 273 | train_epoch() 274 | if ite%params.get('train_step_eval') == 0: 275 | tstep, tloss, tacc_ = train_eval_step() 276 | 277 | if ite%params.get('train_step_eval') == 0: 278 | print("TRAIN step {}, iteration {} loss {:g} accuracy {:g}" 279 | .format(tstep, ite, tloss, tacc_), 280 | flush=True) 281 | 282 | current_step = tf.train.global_step(sess, global_step) 283 | if current_step % params.get('eval_interval') == 0: 284 | official_score = eval_step() 285 | if best_score < official_score: 286 | checkpoint_prefix = os.path.join(params.get('model_dir'), 287 | "%s-score-%s"%(experiment_name, str(official_score))) 288 | saver.save(sess, checkpoint_prefix, global_step=current_step) 289 | 290 | best_score = official_score 291 | best_step = current_step 292 | best_itr = ite 293 | print("Best Score: %2.3f, Best Step: %d (iteration: %d)" 294 | %(best_score, best_step, best_itr)) 295 | 296 | 297 | if __name__ == '__main__': 298 | parser = argparse.ArgumentParser() 299 | parser.add_argument('--config', type=str, default=None, 300 | help='Path to the config file.') 301 | 302 | FLAGS, unparsed = parser.parse_known_args() 303 | tf.app.run(main=main, argv=[sys.argv[0]] + unparsed) --------------------------------------------------------------------------------
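Note: `CRCNN.ranking_loss` in `model.py` implements the pairwise ranking loss of dos Santos et al. (2015)
with a `tf.while_loop` over the batch. The NumPy sketch below is only an illustration of the same
per-batch computation and is not part of the repository; the default margins and scaling factor are the
values from `model_config.yml` (`margin_plus: 2.5`, `margin_minus: 0.5`, `lm: 1.0`).

```
import numpy as np

def ranking_loss_np(logits, labels, lm=1.0, m_plus=2.5, m_minus=0.5):
    """Illustrative NumPy version of CRCNN.ranking_loss (not used by the code)."""
    total = 0.0
    for scores, cplus in zip(logits, labels):
        order = np.argsort(scores)[::-1]                      # classes sorted by descending score
        cminus = order[1] if order[0] == cplus else order[0]  # most informative negative class
        splus, sminus = scores[cplus], scores[cminus]         # gold-class and negative-class scores
        total += np.log1p(np.exp(lm * (m_plus - splus))) \
               + np.log1p(np.exp(lm * (m_minus + sminus)))
    return total / len(labels)
```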