├── BasicTextPreprocessing_CNN_CRF.py ├── Eval.py ├── Notes.pdf ├── README.md ├── Sample_Data ├── eng.testa.iobes.act_part ├── eng.testb.iobes.act_part ├── eng.train.iobes.act_part └── test_Predictions_41000.txt ├── alphabet.py ├── aux_network_func.py ├── data_processor.py ├── network.py ├── test_NER.py └── utils.py /BasicTextPreprocessing_CNN_CRF.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | import tensorflow as tf 7 | import utils as utils 8 | import aux_network_func as af 9 | import data_processor as dp 10 | #Alphabet maps objects to integer ids 11 | from alphabet import Alphabet 12 | import network as network 13 | 14 | import dill 15 | 16 | import numpy as np 17 | import os 18 | import time 19 | import datetime 20 | from tensorflow.python import debug as tf_debug 21 | # In[2]: 22 | 23 | tf.__version__ 24 | 25 | #usage : python BasicTextPreprocessing_CNN_CRF.py 26 | #here 'word' is the name of the alphabet class instance 27 | print("Loading data...") 28 | word_alphabet = Alphabet('word') 29 | #'label_name' is 'pos' or 'ner' 30 | label_name ="ner" 31 | label_alphabet = Alphabet(label_name) 32 | logger = utils.get_logger("MainCode") 33 | embedding = "glove" 34 | embedding_path = "glove.6B.100d.gz" 35 | 36 | 37 | 38 | oov = 'embedding' 39 | fine_tune = True 40 | # Model Hyperparameters 41 | #tf.flags.DEFINE_integer("embedding_dim", 128, "Dimensionality of character embedding (default: 128)") #not used 42 | tf.flags.DEFINE_string("train_path", "eng.train.iobes.act", "Train Path") 43 | tf.flags.DEFINE_string("test_path", "eng.testa.iobes.act", "Test Path") 44 | tf.flags.DEFINE_string("dev_path", "eng.testb.iobes.act", "dev Path") 45 | tf.flags.DEFINE_float("dropout_keep_prob", 0.5, "Dropout keep probability (default: 0.5)") 46 | tf.flags.DEFINE_float("grad_clip", 5, "value for gradient clipping to avoid exploding/vanishing gradient(default: 5.0) in LSTM") 47 | tf.flags.DEFINE_float("max_global_clip", 5.0, "value for gradient clipping to avoid exploding/vanishing gradient overall(default: 1.0)") 48 | 49 | 50 | # Training parameters 51 | tf.flags.DEFINE_integer("batch_size", 10, "Batch Size (default: 64)") 52 | tf.flags.DEFINE_integer("word_col", 0, "position of the word in input file (default: 0)") 53 | tf.flags.DEFINE_integer("label_col", 3, "position of the label in input file (default: 3)") 54 | tf.flags.DEFINE_integer("n_hidden_LSTM", 200, "Number of hidden units in LSTM (default: 200)") 55 | tf.flags.DEFINE_integer("num_epochs", 50, "Number of training epochs (default: 200)") 56 | tf.flags.DEFINE_integer("num_filters", 30, "Number of filters to apply for char CNN (default: 30)") 57 | tf.flags.DEFINE_integer("filter_size", 3, "filter_size (default: 3 )") 58 | tf.flags.DEFINE_integer("evaluate_every", 1000, "Evaluate model on dev set after this many steps (default: 100)") 59 | tf.flags.DEFINE_integer("char_embedd_dim", 30, "char_embedd_dim(default: 30)") 60 | tf.flags.DEFINE_integer("Optimizer", 1, "Adam : 1 , SGD:2") 61 | tf.flags.DEFINE_integer("num_checkpoints", 5, "Number of checkpoints to store (default: 5)") 62 | tf.flags.DEFINE_float("starter_learning_rate", 0.015, "Initial learning rate for the optimizer. (default: 1e-3)") 63 | tf.flags.DEFINE_float("decay_rate", 0.05, "How much to decay the learning rate. 
(default: 0.015)") 64 | 65 | # Misc Parameters 66 | tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement") 67 | tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") 68 | tf.flags.DEFINE_boolean("PadZeroBegin", False, "where to pad zero in the input") 69 | FLAGS = tf.flags.FLAGS 70 | FLAGS._parse_flags() 71 | Flags_Dict= utils.print_FLAGS(FLAGS,logger) 72 | 73 | train_path = FLAGS.train_path 74 | test_path = FLAGS.test_path 75 | dev_path = FLAGS.dev_path 76 | 77 | word_column = FLAGS.word_col 78 | label_column = FLAGS.label_col 79 | # Output directory for models and summaries 80 | timestamp = str(int(time.time())) 81 | out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp)) 82 | print("Writing to {}\n".format(out_dir)) 83 | # read training data 84 | logger.info("Reading data from training set...") 85 | word_sentences_train, _, word_index_sentences_train, label_index_sentences_train = dp.read_conll_sequence_labeling( 86 | train_path, word_alphabet, label_alphabet, word_column, label_column,out_dir=out_dir) 87 | 88 | # if oov is "random" and do not fine tune, close word_alphabet 89 | if oov == "random" and not fine_tune: 90 | logger.info("Close word alphabet.") 91 | word_alphabet.close() 92 | 93 | 94 | # read dev data 95 | logger.info("Reading data from dev set...") 96 | word_sentences_dev, _, word_index_sentences_dev, label_index_sentences_dev = dp.read_conll_sequence_labeling( 97 | dev_path, word_alphabet, label_alphabet, word_column, label_column) 98 | 99 | # close alphabets : by close we mean we cannot add any more words to the word vocabulary. 100 | #To DO :change to close this after train set alone 101 | word_alphabet.close() 102 | label_alphabet.close() 103 | 104 | 105 | # we are doing a -1 because we did not use the zer index. I believe this is to account for unknown word 106 | logger.info("word alphabet size: %d" % (word_alphabet.size() - 1)) 107 | logger.info("label alphabet size: %d" % (label_alphabet.size() - 1)) 108 | # get maximum length : this is mainly for padding. 
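# (illustrative note with made-up index values) utils.padSequence, called a few lines
# below, right-pads each index sequence with the unused 0 index up to max_length when
# PadZeroBegin is False (left-pads when it is True), truncates longer sentences, and
# also returns the true lengths that are later fed to the CRF as sequence_lengths, e.g.
#   padded, lengths = utils.padSequence([[4, 9, 2], [7]], 5, beginZero=False)
#   # padded  -> [[4, 9, 2, 0, 0], [7, 0, 0, 0, 0]], lengths -> [3, 1]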
109 | max_length_train = utils.get_max_length(word_sentences_train) 110 | max_length_dev = utils.get_max_length(word_sentences_dev) 111 | #max_length_test = utils.get_max_length(word_sentences_test) 112 | max_length = min(dp.MAX_LENGTH, max(max_length_train, max_length_dev)) 113 | logger.info("Maximum length (i.e max words ) of training set is %d" % max_length_train) 114 | logger.info("Maximum length (i.e max words ) of dev set is %d" % max_length_dev) 115 | #logger.info("Maximum length (i.e max words ) of test set is %d" % max_length_test) 116 | logger.info("Maximum length (i.e max words ) used for training is %d" % max_length) 117 | 118 | logger.info("Padding training text and lables ...") 119 | word_index_sentences_train_pad,train_seq_length = utils.padSequence(word_index_sentences_train,max_length, beginZero=FLAGS.PadZeroBegin) 120 | label_index_sentences_train_pad,_= utils.padSequence(label_index_sentences_train,max_length, beginZero=FLAGS.PadZeroBegin) 121 | 122 | logger.info("Padding dev text and lables ...") 123 | word_index_sentences_dev_pad,dev_seq_length = utils.padSequence(word_index_sentences_dev,max_length, beginZero=FLAGS.PadZeroBegin) 124 | label_index_sentences_dev_pad,_= utils.padSequence(label_index_sentences_dev,max_length, beginZero=FLAGS.PadZeroBegin) 125 | 126 | logger.info("Creating character set FROM training set ...") 127 | char_alphabet = Alphabet('character') 128 | char_index_train,max_char_per_word_train= dp.generate_character_data(word_sentences_train, 129 | char_alphabet=char_alphabet,setType="Train") 130 | # close character alphabet. WE close it because the embed table is goign to be random 131 | char_alphabet.close() 132 | 133 | logger.info("Creating character set FROM dev set ...") 134 | char_index_dev,max_char_per_word_dev= dp.generate_character_data(word_sentences_dev, 135 | char_alphabet=char_alphabet, setType="Dev") 136 | 137 | 138 | logger.info("character alphabet size: %d" % (char_alphabet.size() - 1)) 139 | max_char_per_word = min(dp.MAX_CHAR_PER_WORD, max_char_per_word_train,max_char_per_word_dev) 140 | logger.info("Maximum character length is %d" %max_char_per_word) 141 | logger.info("Constructing embedding table ...") 142 | #TODO : modify network to use this 143 | char_embedd_table = dp.build_char_embedd_table(char_alphabet,char_embedd_dim=FLAGS.char_embedd_dim) 144 | 145 | logger.info("Padding Training set ...") 146 | char_index_train_pad = dp.construct_padded_char(char_index_train, char_alphabet, max_sent_length=max_length,max_char_per_word=max_char_per_word) 147 | logger.info("Padding Dev set ...") 148 | char_index_dev_pad = dp.construct_padded_char(char_index_dev, char_alphabet, max_sent_length=max_length,max_char_per_word=max_char_per_word) 149 | 150 | #logger.info("Generating data with fine tuning...") 151 | embedd_dict, embedd_dim, caseless = utils.load_word_embedding_dict(embedding, embedding_path,logger) 152 | logger.info("Dimension of embedding is %d, Caseless: %d" % (embedd_dim, caseless)) 153 | #Create an embedding table where if the word from training/train/dev set is in glove , then assign glove values else assign random values 154 | embedd_table = dp.build_embedd_table(word_alphabet, embedd_dict, embedd_dim, caseless) 155 | word_vocab = word_alphabet.instances 156 | word_vocab_size = len(word_vocab) 157 | char_vocab = char_alphabet.instances 158 | char_vocab_size = len(char_vocab) 159 | num_classes = len(label_alphabet.instances) + 1 #to account for zero index we dont use 160 | #logger.info("length of the embedding table is %d" , 
embedd_table.shape[0]) 161 | 162 | #Store the parameters for loading in test set 163 | Flags_Dict['sequence_length']=max_length 164 | Flags_Dict['num_classes']=num_classes 165 | Flags_Dict['word_vocab_size']=word_vocab_size 166 | Flags_Dict['char_vocab_size']=char_vocab_size 167 | Flags_Dict['max_char_per_word']=max_char_per_word 168 | Flags_Dict['embedd_dim']=embedd_dim 169 | Flags_Dict['out_dir']=out_dir 170 | Flags_Dict['model_path']=out_dir 171 | # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it 172 | checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints")) 173 | Flags_Dict['checkpoint_dir']=checkpoint_dir 174 | dill.dump(Flags_Dict,open(os.path.join(out_dir, "config.pkl"),'wb')) 175 | dill.dump(char_alphabet,open(os.path.join(out_dir, "char_alphabet.pkl"),'wb')) 176 | dill.dump(word_alphabet,open(os.path.join(out_dir, "word_alphabet.pkl"),'wb')) 177 | dill.dump(label_alphabet,open(os.path.join(out_dir, "label_alphabet.pkl"),'wb')) 178 | tf.reset_default_graph() 179 | 180 | session_conf = tf.ConfigProto( 181 | allow_soft_placement=FLAGS.allow_soft_placement, 182 | log_device_placement=FLAGS.log_device_placement) 183 | with tf.Session(config=session_conf) as sess: 184 | best_accuracy = 0 185 | best_overall_accuracy = 0 186 | best_accuracy_test = 0 187 | best_overall_accuracy_test = 0 188 | best_step = 0 189 | BiLSTM = network.textBiLSTM(sequence_length=max_length, num_classes=num_classes, word_vocab_size=word_vocab_size, 190 | word_embedd_dim=embedd_dim,n_hidden_LSTM =FLAGS.n_hidden_LSTM,max_char_per_word=max_char_per_word, 191 | char_vocab_size=char_vocab_size,char_embedd_dim = FLAGS.char_embedd_dim,grad_clip=FLAGS.grad_clip,num_filters=FLAGS.num_filters,filter_size= FLAGS.filter_size) 192 | 193 | 194 | # Define Training procedure 195 | global_step = tf.Variable(0, name="global_step", trainable=False) 196 | decay_step = int(len(word_index_sentences_train_pad)/FLAGS.batch_size) #we want to decay per epoch. 
Comes to around 1444 for batch of 100 197 | #print("decay_step :",decay_step) 198 | learning_rate = tf.train.exponential_decay(FLAGS.starter_learning_rate, global_step,decay_step, FLAGS.decay_rate, staircase=True) 199 | if(FLAGS.Optimizer==2): 200 | optimizer = tf.train.GradientDescentOptimizer(learning_rate) #also try GradientDescentOptimizer , AdamOptimizer 201 | elif(FLAGS.Optimizer==1): 202 | optimizer = tf.train.AdamOptimizer(learning_rate) 203 | 204 | #This is the first part of minimize() 205 | grads_and_vars = optimizer.compute_gradients(BiLSTM.loss) 206 | #clipped_grads_and_vars = [(tf.clip_by_norm(grad, FLAGS.max_global_clip), var) for grad, var in grads_and_vars] 207 | 208 | 209 | #we will do grad_clipping for LSTM only 210 | #capped_gvs = [(tf.clip_by_value(grad, -FLAGS.max_global_clip, FLAGS.max_global_clip), var) for grad, var in grads_and_vars] 211 | 212 | # the following bloack is a hack for clip by norm 213 | #grad_list = [grad for grad, var in grads_and_vars] 214 | #var_list = [var for grad, var in grads_and_vars] 215 | #capped_gvs = tf.clip_by_global_norm(grad_list, clip_norm=FLAGS.max_global_norm) 216 | #grads_and_vars_pair = zip(capped_gvs,var) 217 | 218 | 219 | #This is the second part of minimize() 220 | train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) 221 | # Keep track of gradient values and sparsity (optional) 222 | grad_summaries = [] 223 | for g, v in grads_and_vars: 224 | if g is not None: 225 | grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g) 226 | sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g)) 227 | grad_summaries.append(grad_hist_summary) 228 | grad_summaries.append(sparsity_summary) 229 | grad_summaries_merged = tf.summary.merge(grad_summaries) 230 | 231 | 232 | 233 | # Summaries for loss and accuracy 234 | loss_summary = tf.summary.scalar("loss", BiLSTM.loss) 235 | #acc_summary = tf.summary.scalar("accuracy", BiLSTM.accuracy) 236 | 237 | # Train Summaries 238 | train_summary_op = tf.summary.merge([loss_summary, grad_summaries_merged]) 239 | train_summary_dir = os.path.join(out_dir, "summaries", "train") 240 | train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph) 241 | 242 | # Dev summaries 243 | dev_summary_op = tf.summary.merge([loss_summary]) 244 | dev_summary_dir = os.path.join(out_dir, "summaries", "dev") 245 | dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph) 246 | 247 | 248 | 249 | checkpoint_prefix = os.path.join(checkpoint_dir, "model") 250 | if not os.path.exists(checkpoint_dir): 251 | os.makedirs(checkpoint_dir) 252 | saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints) 253 | 254 | # variables need to be initialized before we can use them 255 | sess.run(tf.global_variables_initializer()) 256 | 257 | #debug block 258 | #sess = tf_debug.LocalCLIDebugWrapperSession(sess) 259 | #sess.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan) 260 | 261 | 262 | 263 | def dev_step (session,BiLSTM,PadZeroBegin,max_length,x_batch,y_batch,act_seq_lengths, 264 | dropout_keep_prob,embedd_table,step,char_batch,char_embedd_table,writer= None): 265 | feed_dict=af.create_feed_Dict(BiLSTM,PadZeroBegin,max_length,x_batch,y_batch,act_seq_lengths,dropout_keep_prob,embedd_table,char_batch,char_embedd_table) 266 | logits, transition_params,summaries = session.run([BiLSTM.logits, BiLSTM.transition_params,dev_summary_op],feed_dict=feed_dict) 267 | accuracy,accuracy_low_classes = 
af.predictAccuracyAndWrite(logits,transition_params,act_seq_lengths,y_batch,step,x_batch,word_alphabet,label_alphabet,beginZero=FLAGS.PadZeroBegin) 268 | 269 | time_str = datetime.datetime.now().isoformat() 270 | print("{}: step {}, accuracy on set {:g}, accuracy for classes except Others: {:g}".format(time_str, step,accuracy,accuracy_low_classes)) 271 | if writer: 272 | writer.add_summary(summaries, step) 273 | return accuracy,accuracy_low_classes 274 | 275 | def train_step(session,BiLSTM,PadZeroBegin,max_length,x_batch, y_batch,act_seq_lengths,dropout_keep_prob,embedd_table,char_batch,char_embedd_table): 276 | """ 277 | A single training step 278 | """ 279 | feed_dict=af.create_feed_Dict(BiLSTM,PadZeroBegin,max_length,x_batch,y_batch,act_seq_lengths,dropout_keep_prob,embedd_table,char_batch,char_embedd_table) 280 | 281 | _, step, summaries, loss,logits,transition_params = session.run( 282 | [train_op, global_step, train_summary_op, BiLSTM.loss,BiLSTM.logits,BiLSTM.transition_params], 283 | feed_dict) 284 | 285 | time_str = datetime.datetime.now().isoformat() 286 | print("{}: step {}, loss {:g}".format(time_str, step, loss)) 287 | train_summary_writer.add_summary(summaries, step) 288 | 289 | # Generate batches 290 | batches = utils.batch_iter( 291 | list(zip(word_index_sentences_train_pad, label_index_sentences_train_pad ,train_seq_length,char_index_train_pad)), FLAGS.batch_size, FLAGS.num_epochs) 292 | 293 | # Training loop. For each batch... 294 | for batch in batches: 295 | x_batch, y_batch,act_seq_lengths,char_batch = zip(*batch) 296 | train_step(sess,BiLSTM,FLAGS.PadZeroBegin,max_length,x_batch, y_batch,act_seq_lengths,FLAGS.dropout_keep_prob, 297 | embedd_table,char_batch,char_embedd_table) 298 | current_step = tf.train.global_step(sess, global_step) 299 | if current_step % FLAGS.evaluate_every == 0: 300 | print("\nEvaluation:") 301 | new_accuracy,accuracy_low_classes=dev_step(sess,BiLSTM,FLAGS.PadZeroBegin,max_length,word_index_sentences_dev_pad, 302 | label_index_sentences_dev_pad ,dev_seq_length,FLAGS.dropout_keep_prob, 303 | embedd_table,current_step,char_index_dev_pad,char_embedd_table, writer=dev_summary_writer) 304 | print("") 305 | if (accuracy_low_classes > best_accuracy): 306 | 307 | path = saver.save(sess, checkpoint_prefix, global_step=current_step) 308 | best_accuracy = accuracy_low_classes 309 | best_step = current_step 310 | best_overall_accuracy = new_accuracy 311 | print("Saved model checkpoint to {}\n".format(path)) 312 | #run test data 313 | new_accuracy_test,accuracy_low_classes_test = af.test_step(logger= logger,session=sess,BiLSTM=BiLSTM,PadZeroBegin=FLAGS.PadZeroBegin,max_length=max_length, 314 | test_path=test_path,dropout_keep_prob=FLAGS.dropout_keep_prob,step=current_step,out_dir=out_dir,char_alphabet=char_alphabet, 315 | label_alphabet=label_alphabet,word_alphabet=word_alphabet,word_column=word_column,label_column=label_column, 316 | char_embedd_dim=FLAGS.char_embedd_dim,max_char_per_word=max_char_per_word) 317 | if (accuracy_low_classes_test > best_accuracy_test): 318 | best_accuracy_test = accuracy_low_classes_test 319 | best_step_test = current_step 320 | best_overall_accuracy_test = new_accuracy_test 321 | 322 | print("DEV: best_accuracy on NER : %f best_step: %d best_overall_accuracy: %d" %(best_accuracy,best_step,best_overall_accuracy)) 323 | print("TEST : best_accuracy on NER : %f best_step: %d best_overall_accuracy: %d" %(best_accuracy_test,best_step_test,best_overall_accuracy_test)) 324 | 325 | 326 | 327 | 
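# A standalone sketch (illustrative only, never called above; the helper name is
# hypothetical) of how the staircase tf.train.exponential_decay schedule configured
# with starter_learning_rate and decay_rate behaves.
def _decayed_learning_rate_sketch(global_step, decay_step, starter_lr=0.015, decay_rate=0.05):
    # With staircase=True the exponent only increases once per full decay_step,
    # i.e. roughly once per epoch since decay_step = num_train_sentences / batch_size.
    return starter_lr * decay_rate ** (global_step // decay_step)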
-------------------------------------------------------------------------------- /Eval.py: -------------------------------------------------------------------------------- 1 | 2 | ''' 3 | # this is the evaluation code. Change the prediction file name to the one from your test step. the precision and recall were calculated as described 4 | Tjong Kim Sang, Erik. F. 2002. Introduction to the CoNLL-2003 Shared Task: Language Independent Named Entity Recognition. In Proc. Conference on Natural Language Learning 5 | ''' 6 | 7 | predictedFileName = "test_Predictions_41000.txt" 8 | 9 | words = [] 10 | y_label = [] 11 | pred_label =[] 12 | 13 | header = False 14 | with open(predictedFileName) as file: 15 | for line in file: 16 | if(not header): 17 | header = True 18 | else: 19 | line = line.strip().split('\t') #or someother preprocessing 20 | if(len(line) == 3): 21 | words.append(line[0].strip()) 22 | y_label.append(line[1].strip()) 23 | pred_label.append(line[2].strip()) 24 | 25 | def PrecisionRecall (y_label,pred_label): 26 | # for precision we need to count hte actual NER 27 | #for recall we need to find out the count of NER we predicted 28 | # so same function can be used for P and R if order of parameters are changed 29 | # this order is for precision 30 | count = len(y_label) 31 | i=0 32 | correctEntityCount = {} 33 | act_count = {} # stores actual count of per,loc for precision calculation 34 | metricValue = {} 35 | #This is for precision only 36 | while(i= self.size(): 61 | raise IndexError("Enumerate is allowed between [1 : size of the alphabet)") 62 | return zip(range(start, len(self.instances) + 1), self.instances[start - 1:]) 63 | 64 | def close(self): 65 | self.keep_growing = False 66 | 67 | def open(self): 68 | self.keep_growing = True 69 | 70 | def get_content(self): 71 | return {'instance2index': self.instance2index, 'instances': self.instances} 72 | 73 | def from_json(self, data): 74 | self.instances = data["instances"] 75 | self.instance2index = data["instance2index"] 76 | 77 | def save(self, output_directory, name=None): 78 | """ 79 | Save both alhpabet records to the given directory. 80 | :param output_directory: Directory to save model and weights. 81 | :param name: The alphabet saving name, optional. 82 | :return: 83 | """ 84 | saving_name = name if name else self.__name 85 | try: 86 | json.dump(self.get_content(), open(os.path.join(output_directory, saving_name + ".json"), 'w')) 87 | except Exception as e: 88 | self.logger.warn("Alphabet is not saved: " % repr(e)) 89 | 90 | def load(self, input_directory, name=None): 91 | """ 92 | Load model architecture and weights from the give directory. This allow we use old models even the structure 93 | changes. 
94 | :param input_directory: Directory to save model and weights 95 | :return: 96 | """ 97 | loading_name = name if name else self.__name 98 | self.from_json(json.load(open(os.path.join(input_directory, loading_name + ".json")))) 99 | 100 | -------------------------------------------------------------------------------- /aux_network_func.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import utils as utils 4 | import os 5 | import time 6 | import datetime 7 | import data_processor as dp 8 | import dill 9 | 10 | def create_feed_Dict(BiLSTM,PadZeroBegin,max_length,x_batch,y_batch,act_seq_lengths,dropout_keep_prob,embedd_table,char_batch,char_embedd_table): 11 | if PadZeroBegin: 12 | cur_batch_size = len(x_batch) 13 | sequence_length_batch= np.full((cur_batch_size), max_length, dtype=int) 14 | feed_dict = { 15 | BiLSTM.input_x: x_batch, 16 | BiLSTM.input_y: y_batch, 17 | BiLSTM.dropout_keep_prob: dropout_keep_prob, 18 | BiLSTM.word_embedding_placeholder: embedd_table, 19 | BiLSTM.sequence_lengths : sequence_length_batch, #NOTES:sadly giving hte actual seq length gives None all the time when sequence is padded in begining 20 | #BiLSTM.sequence_lengths : seq_length 21 | BiLSTM.input_x_char : char_batch, 22 | BiLSTM.char_embedding_placeholder : char_embedd_table 23 | 24 | } 25 | else: 26 | feed_dict = { 27 | BiLSTM.input_x: x_batch, 28 | BiLSTM.input_y: y_batch, 29 | BiLSTM.dropout_keep_prob: dropout_keep_prob, 30 | BiLSTM.word_embedding_placeholder: embedd_table, 31 | #BiLSTM.sequence_lengths : sequence_length_batch, #NOTES:sadly giving hte actual seq length gives None all the time when sequence is padded in begining 32 | BiLSTM.sequence_lengths : act_seq_lengths, 33 | BiLSTM.input_x_char : char_batch, 34 | BiLSTM.char_embedding_placeholder : char_embedd_table 35 | 36 | 37 | } 38 | return feed_dict 39 | 40 | def create_feed_Dict_Test(BiLSTM,PadZeroBegin,max_length,x_batch, y_batch,act_seq_lengths, dropout_keep_prob,char_batch): 41 | if PadZeroBegin: 42 | cur_batch_size = len(x_batch) 43 | sequence_length_batch= np.full((cur_batch_size), max_length, dtype=int) 44 | feed_dict = { 45 | BiLSTM.input_x: x_batch, 46 | BiLSTM.input_y: y_batch, 47 | BiLSTM.dropout_keep_prob: dropout_keep_prob, 48 | BiLSTM.sequence_lengths : sequence_length_batch, #NOTES:sadly giving hte actual seq length gives None all the time when sequence is padded in begining 49 | #BiLSTM.sequence_lengths : seq_length 50 | BiLSTM.input_x_char : char_batch, 51 | 52 | } 53 | else: 54 | feed_dict = { 55 | BiLSTM.input_x: x_batch, 56 | BiLSTM.input_y: y_batch, 57 | BiLSTM.dropout_keep_prob: dropout_keep_prob, 58 | #BiLSTM.sequence_lengths : sequence_length_batch, #NOTES:sadly giving hte actual seq length gives None all the time when sequence is padded in begining 59 | BiLSTM.sequence_lengths : act_seq_lengths, 60 | BiLSTM.input_x_char : char_batch, 61 | 62 | 63 | } 64 | return feed_dict 65 | 66 | 67 | def predictAccuracyAndWrite(logits,transition_params,seq_length,y_batch,step,x_batch,word_alphabet,label_alphabet,prefix_filename="Dev",beginZero=True): 68 | correct_labels = 0 69 | total_labels = 0 70 | correct_labels_low_classes = 0 71 | total_labels_low_classes = 0 72 | fname = prefix_filename + "_Predictions_" +str(step)+".txt" 73 | 74 | with open(fname, 'w') as outfile: 75 | outfile.write("word\ty_label\tpred_label\n") 76 | for tf_unary_scores_, y_, sequence_length_, x_ in zip(logits, y_batch,seq_length,x_batch): 77 | # Remove padding from the 
scores and tag sequence. 78 | tf_unary_scores_ = tf_unary_scores_[-sequence_length_:] if beginZero else tf_unary_scores_[:sequence_length_] 79 | #for writing to file 80 | y_ = y_[-sequence_length_:] if beginZero else y_[:sequence_length_] 81 | x_ = x_[-sequence_length_:] if beginZero else x_[:sequence_length_] 82 | # Compute the highest scoring sequence. 83 | viterbi_sequence, viterbi_score = tf.contrib.crf.viterbi_decode( 84 | tf_unary_scores_, transition_params) 85 | for xi,yi,vi in zip(x_,y_,viterbi_sequence): 86 | x_word = word_alphabet.get_instance(xi) 87 | y_label = label_alphabet.get_instance(yi) 88 | pred_label = label_alphabet.get_instance(vi) 89 | outfile.write(str(x_word) + "\t"+str(y_label)+"\t"+str(pred_label)+"\n") 90 | if(y_label != "O"): 91 | total_labels_low_classes = total_labels_low_classes + 1 92 | if (y_label == pred_label): 93 | correct_labels_low_classes = correct_labels_low_classes +1 94 | outfile.write("\n") 95 | # Evaluate word-level accuracy. 96 | correct_labels += np.sum(np.equal(viterbi_sequence, y_)) 97 | total_labels += sequence_length_ 98 | accuracy = 100.0 * correct_labels / float(total_labels) 99 | accuracy_low_classes = 100.0 * correct_labels_low_classes / float(total_labels_low_classes) 100 | outfile.write("accuracy: " + str(accuracy)) 101 | outfile.write("\naccuracy for classes except other : " + str(accuracy_low_classes)) 102 | outfile.write("\ntotal other classes : {}, correctly predicted : {} ".format(total_labels_low_classes,correct_labels_low_classes )) 103 | outfile.write("\ntotal : {}, correctly predicted : {} ".format(total_labels,correct_labels )) 104 | return accuracy,accuracy_low_classes 105 | 106 | def test_step(logger,session,BiLSTM,PadZeroBegin,max_length,test_path, 107 | dropout_keep_prob,step,out_dir,char_alphabet,label_alphabet,word_alphabet, 108 | word_column, label_column,char_embedd_dim,max_char_per_word): 109 | # read test data 110 | logger.info("Reading data from test set...") 111 | word_sentences_test, _, word_index_sentences_test, label_index_sentences_test = dp.read_conll_sequence_labeling( 112 | test_path, word_alphabet, label_alphabet, word_column, label_column) 113 | logger.info("Padding test text and lables ...") 114 | word_index_sentences_test_pad,test_seq_length = utils.padSequence(word_index_sentences_test,max_length, beginZero=PadZeroBegin) 115 | label_index_sentences_test_pad,_= utils.padSequence(label_index_sentences_test,max_length, beginZero=PadZeroBegin) 116 | logger.info("Creating character set FROM test set ...") 117 | char_index_test,_= dp.generate_character_data(word_sentences_test, 118 | char_alphabet=char_alphabet, setType="Test") 119 | 120 | logger.info("Padding Test set ...") 121 | char_index_test_pad = dp.construct_padded_char(char_index_test, char_alphabet, max_sent_length=max_length,max_char_per_word=max_char_per_word) 122 | 123 | # test summaries 124 | #test_summary_op = tf.summary.merge([loss_summary]) 125 | #test_summary_dir = os.path.join(out_dir, "summaries", "test") 126 | #test_summary_writer = tf.summary.FileWriter(test_summary_dir, sess.graph) 127 | 128 | feed_dict=create_feed_Dict_Test(BiLSTM,PadZeroBegin=PadZeroBegin,max_length=max_length, 129 | x_batch=word_index_sentences_test_pad, y_batch=label_index_sentences_test_pad, 130 | act_seq_lengths= test_seq_length, dropout_keep_prob=dropout_keep_prob, 131 | char_batch=char_index_test_pad) 132 | '''#tf.Print(feed_dict,feed_dict) 133 | logits, transition_params = session.run([BiLSTM.logits, BiLSTM.transition_params],feed_dict) 134 | #logits is a list 
of numpy.ndarray 135 | #transition_params : ndarray''' 136 | 137 | logits, transition_params,embedded_char,embedded_words,char_pool_flat,input_x_test = session.run([BiLSTM.logits, BiLSTM.transition_params, 138 | BiLSTM.W_char,BiLSTM.W_word,BiLSTM.char_pool_flat,BiLSTM.input_x],feed_dict) 139 | 140 | accuracy,accuracy_low_classes = predictAccuracyAndWrite(logits,transition_params,test_seq_length, 141 | label_index_sentences_test_pad,step,word_index_sentences_test_pad,word_alphabet,label_alphabet,prefix_filename="test",beginZero=PadZeroBegin) 142 | 143 | #test_summary_writer.add_summary(summaries, step) 144 | print("step {}, accuracy on test set {:g}, accuracy for classes except Others: {:g}".format(step,accuracy,accuracy_low_classes)) 145 | 146 | checkpoint_dir_test = os.path.abspath(os.path.join(out_dir, "checkpoints_test")) 147 | 148 | if not os.path.exists(checkpoint_dir_test): 149 | os.makedirs(checkpoint_dir_test) 150 | fname_data = "input_x_test_"+str(step)+".pkl" 151 | fname_conv_out = "char_pool_flat_"+str(step)+".pkl" 152 | fname_seqLength = "act_seq_len_"+str(step)+".pkl" 153 | fname_embedded_char = "embedded_char_"+str(step)+".pkl" 154 | fname_embedded_words = "embedded_words_"+str(step)+".pkl" 155 | dill.dump(input_x_test,open(os.path.join(checkpoint_dir_test, fname_data),'wb')) 156 | dill.dump(char_pool_flat,open(os.path.join(checkpoint_dir_test, fname_conv_out),'wb')) 157 | dill.dump(test_seq_length,open(os.path.join(checkpoint_dir_test, fname_seqLength),'wb')) 158 | dill.dump(embedded_char,open(os.path.join(checkpoint_dir_test, fname_embedded_char),'wb')) 159 | dill.dump(embedded_words,open(os.path.join(checkpoint_dir_test, fname_embedded_words),'wb')) 160 | print("Saved test data checkpoint to {}\n".format(checkpoint_dir_test)) 161 | return accuracy,accuracy_low_classes 162 | 163 | 164 | 165 | def create_feed_Dict_Eval(graph,PadZeroBegin,max_length,x_batch,act_seq_lengths,dropout_keep_prob, 166 | char_batch): 167 | 168 | if PadZeroBegin: 169 | cur_batch_size = len(x_batch) 170 | sequence_length_batch= np.full((cur_batch_size), max_length, dtype=int) 171 | feed_dict = { 172 | graph.get_tensor_by_name('input_x:0'): x_batch, 173 | graph.get_tensor_by_name('dropout_keep_prob:0'): dropout_keep_prob, 174 | graph.get_tensor_by_name('sequence_lengths:0') : sequence_length_batch, #NOTES:sadly giving hte actual seq length gives None all the time when sequence is padded in begining 175 | graph.get_tensor_by_name('input_x_char:0') : char_batch, 176 | } 177 | else: 178 | feed_dict = { 179 | graph.get_tensor_by_name('input_x:0'): x_batch, 180 | graph.get_tensor_by_name('dropout_keep_prob:0'): dropout_keep_prob, 181 | graph.get_tensor_by_name('sequence_lengths:0') : act_seq_lengths, 182 | graph.get_tensor_by_name('input_x_char:0') : char_batch, 183 | 184 | } 185 | return feed_dict 186 | 187 | # This function is just to understand the network for debugging purposes 188 | def viterbi_decode(score, transition_params, targetWordIndex): 189 | """Decode the highest scoring sequence of tags outside of TensorFlow. 190 | This should only be used at test time. 191 | Args: 192 | score: A [seq_len, num_tags] matrix of unary potentials. 193 | transition_params: A [num_tags, num_tags] matrix of binary potentials. 194 | Returns: 195 | viterbi: A [seq_len] list of integers containing the highest scoring tag 196 | indicies. 197 | viterbi_score: A float containing the score for the Viterbi sequence. 
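    The recursion below is the standard Viterbi update,
        trellis[t, j] = score[t, j] + max_i(trellis[t-1, i] + transition_params[i, j]),
    with backpointers[t, j] recording the argmax over i, so the best tag sequence is
    recovered by walking the backpointers back from the highest-scoring final tag.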
198 | """ 199 | trellis = np.zeros_like(score) 200 | backpointers = np.zeros_like(score, dtype=np.int32) 201 | trellis[0] = score[0] 202 | 203 | v_target = np.zeros_like(transition_params) 204 | for t in range(1, score.shape[0]): 205 | v = np.expand_dims(trellis[t - 1], 1) + transition_params 206 | if(t==targetWordIndex): 207 | v_target = v 208 | trellis[t] = score[t] + np.max(v, 0) 209 | backpointers[t] = np.argmax(v, 0) 210 | 211 | 212 | viterbi = [np.argmax(trellis[-1])] 213 | for bp in reversed(backpointers[1:]): 214 | viterbi.append(bp[viterbi[-1]]) 215 | viterbi.reverse() 216 | if(targetWordIndex == 0): 217 | total = float(np.sum([i if i > 0 else 0 for i in score[0]])) 218 | prob = [i/total if i > 0 else 0 for i in score[0]] 219 | else: 220 | total = float(np.sum([i if i > 0 else 0 for i in v_target[viterbi[targetWordIndex]]])) 221 | prob = [i/total if i > 0 else 0 for i in v_target[viterbi[targetWordIndex]]] 222 | dill.dump(prob,open("prob.dill",'wb')) 223 | '''dill.dump(trellis,open("trellis.dill",'wb')) 224 | dill.dump(score,open("score.dill",'wb')) 225 | dill.dump(transition_params,open("transition_params.dill",'wb'))''' 226 | viterbi_score = np.max(trellis[-1]) 227 | return viterbi, viterbi_score,prob 228 | 229 | def debug(logits,transition_params,seq_length,x_batch,word_alphabet,label_alphabet, targetWordIndexArray,prefix_filename="Dev", beginZero=True): 230 | 231 | for tf_unary_scores_,sequence_length_, x_,targetWordIndex in zip(logits, seq_length,x_batch,targetWordIndexArray): 232 | # Remove padding from the scores and tag sequence. 233 | tf_unary_scores_ = tf_unary_scores_[-sequence_length_:] if beginZero else tf_unary_scores_[:sequence_length_] 234 | x_ = x_[-sequence_length_:] if beginZero else x_[:sequence_length_] 235 | 236 | # Compute the highest scoring sequence. 
237 | viterbi_sequence, viterbi_score,prob = viterbi_decode( 238 | tf_unary_scores_, transition_params, targetWordIndex) 239 | 240 | return 241 | def test_step_eval(logger,session,PadZeroBegin,max_length,test_path, 242 | dropout_keep_prob,step,out_dir,char_alphabet,label_alphabet,word_alphabet, 243 | word_column, label_column,char_embedd_dim,max_char_per_word): 244 | # read test data 245 | graph = tf.get_default_graph() 246 | logger.info("Reading data from test set...") 247 | word_sentences_test, _, word_index_sentences_test, label_index_sentences_test = dp.read_conll_sequence_labeling( 248 | test_path, word_alphabet, label_alphabet, word_column, label_column) 249 | logger.info("Padding test text and lables ...") 250 | word_index_sentences_test_pad,test_seq_length = utils.padSequence(word_index_sentences_test,max_length, beginZero=PadZeroBegin) 251 | label_index_sentences_test_pad,_= utils.padSequence(label_index_sentences_test,max_length, beginZero=PadZeroBegin) 252 | logger.info("Creating character set FROM test set ...") 253 | char_index_test,_= dp.generate_character_data(word_sentences_test, 254 | char_alphabet=char_alphabet, setType="Test") 255 | 256 | logger.info("Padding Test set ...") 257 | char_index_test_pad = dp.construct_padded_char(char_index_test, char_alphabet, max_sent_length=max_length,max_char_per_word=max_char_per_word) 258 | print(type(char_index_test_pad)) 259 | print(type(word_index_sentences_test_pad)) 260 | 261 | feed_dict=create_feed_Dict_Eval(graph,PadZeroBegin=PadZeroBegin,max_length=max_length, 262 | x_batch=word_index_sentences_test_pad, 263 | act_seq_lengths= test_seq_length, dropout_keep_prob=dropout_keep_prob, 264 | char_batch=char_index_test_pad) 265 | #tf.Print(feed_dict,feed_dict) 266 | logit_op = graph.get_tensor_by_name('output/logits:0') 267 | transition_params_op = graph.get_tensor_by_name('transitions:0') 268 | logits,transition_params = session.run([logit_op, transition_params_op],feed_dict) 269 | print(logits.shape) 270 | targetWordIndexArray = np.asarray([0]) 271 | debug(logits=logits,transition_params=transition_params, 272 | seq_length=test_seq_length,x_batch=word_index_sentences_test_pad,word_alphabet=word_alphabet,label_alphabet=label_alphabet, 273 | targetWordIndexArray=targetWordIndexArray,prefix_filename="test",beginZero=PadZeroBegin) 274 | return 0,0 275 | 276 | '''accuracy,accuracy_low_classes = predictAccuracyAndWrite(logits,transition_params,test_seq_length,step,word_index_sentences_test_pad,word_alphabet,label_alphabet,prefix_filename="test",beginZero=PadZeroBegin) 277 | 278 | #test_summary_writer.add_summary(summaries, step) 279 | print("step {}, accuracy on test set {:g}, accuracy for classes except Others: {:g}".format(step,accuracy,accuracy_low_classes)) 280 | 281 | return accuracy,accuracy_low_classes''' 282 | def viterbi_decode(logits,transition_params,seq_length,x_batch,word_alphabet,label_alphabet, prefix_filename="Test", beginZero=True): 283 | fname = prefix_filename + "_Predictions.txt" 284 | with open(fname, 'w') as outfile: 285 | outfile.write("word\ty_label\tpred_label\n") 286 | for tf_unary_scores_,sequence_length_, x_,targetWordIndex in zip(logits, seq_length,x_batch,targetWordIndexArray): 287 | # Remove padding from the scores and tag sequence. 288 | tf_unary_scores_ = tf_unary_scores_[-sequence_length_:] if beginZero else tf_unary_scores_[:sequence_length_] 289 | x_ = x_[-sequence_length_:] if beginZero else x_[:sequence_length_] 290 | # Compute the highest scoring sequence. 
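        # tf.contrib.crf.viterbi_decode runs the same dynamic program in numpy and
        # returns the best-scoring tag index sequence together with its score.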
291 | viterbi_sequence, viterbi_score = tf.contrib.crf.viterbi_decode( 292 | tf_unary_scores_, transition_params) 293 | for xi,yi,vi in zip(x_,viterbi_sequence): 294 | x_word = word_alphabet.get_instance(xi) 295 | pred_label = label_alphabet.get_instance(vi) 296 | outfile.write(str(x_word) + "\t"+str(pred_label)+"\n") 297 | outfile.write("\n") 298 | 299 | return 300 | 301 | def test_step_report(logger,session,PadZeroBegin,max_length,test_path, 302 | dropout_keep_prob,step,out_dir,char_alphabet,label_alphabet,word_alphabet, 303 | word_column, label_column,char_embedd_dim,max_char_per_word): 304 | # read test data 305 | graph = tf.get_default_graph() 306 | logger.info("Reading data from test set...") 307 | word_sentences_test, _, word_index_sentences_test, label_index_sentences_test = dp.read_conll_sequence_labeling( 308 | test_path, word_alphabet, label_alphabet, word_column, label_column) 309 | logger.info("Padding test text and lables ...") 310 | word_index_sentences_test_pad,test_seq_length = utils.padSequence(word_index_sentences_test,max_length, beginZero=PadZeroBegin) 311 | label_index_sentences_test_pad,_= utils.padSequence(label_index_sentences_test,max_length, beginZero=PadZeroBegin) 312 | logger.info("Creating character set FROM test set ...") 313 | char_index_test,_= dp.generate_character_data(word_sentences_test, 314 | char_alphabet=char_alphabet, setType="Test") 315 | 316 | logger.info("Padding Test set ...") 317 | char_index_test_pad = dp.construct_padded_char(char_index_test, char_alphabet, max_sent_length=max_length,max_char_per_word=max_char_per_word) 318 | print(type(char_index_test_pad)) 319 | print(type(word_index_sentences_test_pad)) 320 | 321 | feed_dict=create_feed_Dict_Eval(graph,PadZeroBegin=PadZeroBegin,max_length=max_length, 322 | x_batch=word_index_sentences_test_pad, 323 | act_seq_lengths= test_seq_length, dropout_keep_prob=dropout_keep_prob, 324 | char_batch=char_index_test_pad) 325 | #tf.Print(feed_dict,feed_dict) 326 | logit_op = graph.get_tensor_by_name('output/logits:0') 327 | transition_params_op = graph.get_tensor_by_name('transitions:0') 328 | logits,transition_params = session.run([logit_op, transition_params_op],feed_dict) 329 | viterbi_decode(logits=logits,transition_params=transition_params, 330 | seq_length=test_seq_length,x_batch=word_index_sentences_test_pad,word_alphabet=word_alphabet,label_alphabet=label_alphabet, 331 | prefix_filename="test",beginZero=PadZeroBegin) 332 | return -------------------------------------------------------------------------------- /data_processor.py: -------------------------------------------------------------------------------- 1 | import utils as utils 2 | from alphabet import Alphabet 3 | import numpy as np 4 | import pickle 5 | import os 6 | MAX_LENGTH = 120 7 | MAX_CHAR_PER_WORD = 45 8 | root_symbol = "##ROOT##" 9 | root_label = "" 10 | word_end = "##WE##" 11 | logger = utils.get_logger("LoadData") 12 | 13 | def read_conll_sequence_labeling(path, word_alphabet, label_alphabet, word_column=1, label_column=3,out_dir=None): 14 | """ 15 | read data from file in conll format 16 | :param path: file path 17 | :param word_column: the column index of word (start from 0) 18 | :param label_column: the column of label (start from 0) 19 | :param word_alphabet: alphabet of words 20 | :param label_alphabet: alphabet -f labels 21 | :return: sentences of words and labels, sentences of indexes of words and labels. 
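    Example (CoNLL-2003 style row; the exact column layout depends on the input file):
        EU NNP I-NP S-ORG
    With word_column=0 and label_column=3 this token contributes the word "EU" and the
    label "S-ORG"; a blank line marks the end of a sentence.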
22 | """ 23 | word_sentences = [] 24 | label_sentences = [] 25 | 26 | word_index_sentences = [] 27 | label_index_sentences = [] 28 | if(out_dir !=None): 29 | vocab = set() 30 | #print(out_dir = os.path.abspath(os.path.join(os.path.curdir, "vocab", timestamp))) 31 | vocab_save_path = os.path.join(out_dir, "vocab.pkl") 32 | words = [] 33 | labels = [] 34 | 35 | word_ids = [] 36 | label_ids = [] 37 | 38 | num_tokens = 0 39 | with open(path) as file: 40 | for line in file: 41 | #line.decode('utf-8') 42 | if line.strip() == "":#this means we have the entire sentence 43 | if 0 < len(words) <= MAX_LENGTH: 44 | word_sentences.append(words[:]) 45 | label_sentences.append(labels[:]) 46 | 47 | word_index_sentences.append(word_ids[:]) 48 | label_index_sentences.append(label_ids[:]) 49 | 50 | num_tokens += len(words) 51 | else: 52 | if len(words) != 0: 53 | logger.info("ignore sentence with length %d" % (len(words))) 54 | 55 | words = [] 56 | labels = [] 57 | 58 | word_ids = [] 59 | label_ids = [] 60 | else: 61 | tokens = line.strip().split() 62 | word = tokens[word_column] 63 | label = tokens[label_column] 64 | 65 | words.append(word) 66 | if(out_dir !=None): 67 | vocab.add(word) 68 | labels.append(label) 69 | word_id = word_alphabet.get_index(word) 70 | label_id = label_alphabet.get_index(label) 71 | 72 | word_ids.append(word_id) 73 | label_ids.append(label_id) 74 | #this is for the last sentence 75 | if 0 < len(words) <= MAX_LENGTH: 76 | word_sentences.append(words[:]) 77 | label_sentences.append(labels[:]) 78 | 79 | word_index_sentences.append(word_ids[:]) 80 | label_index_sentences.append(label_ids[:]) 81 | 82 | num_tokens += len(words) 83 | else: 84 | if len(words) != 0: 85 | logger.info("ignore sentence with length %d" % (len(words))) 86 | 87 | if(out_dir !=None): 88 | if not os.path.exists(out_dir): 89 | os.makedirs(out_dir) 90 | with open(vocab_save_path, 'wb') as handle: 91 | pickle.dump(vocab, handle) 92 | logger.info("vocab written to %s" % (vocab_save_path)) 93 | logger.info("#sentences: %d, #tokens: %d" % (len(word_sentences), num_tokens)) 94 | 95 | return word_sentences, label_sentences, word_index_sentences, label_index_sentences 96 | 97 | def build_embedd_table(word_alphabet, embedd_dict, embedd_dim, caseless): 98 | scale = np.sqrt(3.0 / embedd_dim) 99 | #TODO:should we build an embedding table with words in our training/dev/test plus glove . 
100 | # the extra words in glove will not be trained but can help with UNK 101 | embedd_table = np.empty([word_alphabet.size(), embedd_dim], dtype=np.float64) 102 | embedd_table[word_alphabet.default_index, :] = np.random.uniform(-scale, scale, [1, embedd_dim]) 103 | for word, index in word_alphabet.items(): 104 | ww = word.lower() if caseless else word 105 | embedd = embedd_dict[ww] if ww in embedd_dict else np.random.uniform(-scale, scale, [1, embedd_dim]) 106 | embedd_table[index, :] = embedd 107 | return embedd_table 108 | 109 | def construct_padded_char(index_sentences,char_alphabet,max_sent_length,max_char_per_word): 110 | C = np.empty([len(index_sentences), max_sent_length, max_char_per_word], dtype=np.int32) 111 | # this is to mark space at the end of the words 112 | word_end_id = char_alphabet.get_index(word_end) 113 | 114 | for i in range(len(index_sentences)): 115 | words = index_sentences[i] 116 | sent_length = len(words) 117 | for j in range(min(sent_length,max_sent_length)): 118 | chars = words[j] 119 | char_length = len(chars) 120 | for k in range(min (char_length,max_char_per_word)): 121 | cid = chars[k] 122 | C[i, j, k] = cid 123 | # fill index of word end after the end of word 124 | C[i, j, char_length:] = word_end_id 125 | # Zero out C after the end of the sentence 126 | C[i, sent_length:, :] = 0 127 | return C 128 | 129 | 130 | def build_char_embedd_table(char_alphabet,char_embedd_dim=30): 131 | scale = np.sqrt(3.0 / char_embedd_dim) 132 | char_embedd_table = np.random.uniform(-scale, scale, [char_alphabet.size(), char_embedd_dim]).astype( 133 | np.float64) 134 | return char_embedd_table 135 | 136 | 137 | def generate_character_data(sentences_list,char_alphabet, setType="Train", char_embedd_dim=30): 138 | """ 139 | generate data for charaters 140 | :param sentences_train: 141 | :param sentences_train: 142 | :param max_sent_length: zero for trainset: 143 | :return: char_index_set_pad,max_char_per_word, char_embedd_table,char_alphabet 144 | """ 145 | 146 | def get_character_indexes(sentences): 147 | index_sentences = [] 148 | max_length = 0 149 | for words in sentences: 150 | index_words = [] 151 | for word in words: 152 | index_chars = [] 153 | if len(word) > max_length: 154 | max_length = len(word) 155 | 156 | for char in word[:MAX_CHAR_PER_WORD ]: 157 | char_id = char_alphabet.get_index(char) 158 | index_chars.append(char_id) 159 | 160 | index_words.append(index_chars) 161 | index_sentences.append(index_words) 162 | return index_sentences, max_length 163 | 164 | char_alphabet.get_index(word_end) 165 | 166 | index_sentences, max_char_per_word = get_character_indexes(sentences_list) 167 | max_char_per_word = min(MAX_CHAR_PER_WORD, max_char_per_word) 168 | logger.info("Maximum character length after %s set is %d" %(setType ,max_char_per_word)) 169 | return index_sentences,max_char_per_word -------------------------------------------------------------------------------- /network.py: -------------------------------------------------------------------------------- 1 | from tensorflow.contrib import rnn 2 | import tensorflow as tf 3 | 4 | 5 | class textBiLSTM(object): 6 | def __init__( 7 | self, sequence_length, num_classes, word_vocab_size, 8 | word_embedd_dim,char_vocab_size,grad_clip,num_filters=20, 9 | filter_size =3, 10 | char_embedd_dim = 30, n_hidden_LSTM =200,max_char_per_word=45): 11 | 12 | # Placeholders for input, output and dropout 13 | self.input_x = tf.placeholder(tf.int32, [None, sequence_length], name="input_x") 14 | self.input_x_char = 
tf.placeholder(tf.int32, [None, sequence_length,max_char_per_word], name="input_x_char") 15 | #in this step we basically concatentate all the characters of the words. We need to have a separate layer. 16 | self.input_x_char_flat = tf.reshape(self.input_x_char,[-1,max_char_per_word*sequence_length],name="input_x_char_flat") 17 | 18 | 19 | #input_y is not one hot encoded. 20 | self.input_y = tf.placeholder(tf.int32, [None, sequence_length], name="input_y") 21 | self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob") 22 | self.sequence_lengths = tf.placeholder(tf.int32, shape=[None], 23 | name="sequence_lengths") 24 | # Embedding layer (is always built on CPU. There is bug that makes embedding fail on GPU) 25 | with tf.device('/cpu:0'), tf.name_scope("word_embedding"): 26 | #plus 1 becuase 0 is for random word 27 | self.W_word = tf.Variable(tf.random_uniform([word_vocab_size+1, word_embedd_dim],-1,1),trainable=True, name="W_word") 28 | self.word_embedding_placeholder = tf.placeholder(tf.float32, [word_vocab_size+1, word_embedd_dim]) 29 | word_embedding_init = self.W_word.assign(self.word_embedding_placeholder) 30 | ##output is #[batch_size, sequence_length, word_embedd_dim] 31 | self.embedded_words = tf.nn.embedding_lookup(self.W_word, self.input_x,name="embedded_words") 32 | 33 | #Embedding layer (is always built on CPU. There is bug that makes embedding fail on GPU) 34 | with tf.device('/cpu:0'), tf.name_scope("char_embedding"): 35 | #plus 1 becuase 0 is for unknown char 36 | self.W_char = tf.Variable(tf.random_uniform([char_vocab_size+1, char_embedd_dim],-1,1),trainable=True, name="W_char") 37 | self.char_embedding_placeholder = tf.placeholder(tf.float32, [char_vocab_size+1, char_embedd_dim]) 38 | char_embedding_init = self.W_char.assign(self.char_embedding_placeholder) 39 | self.embedded_char = tf.nn.embedding_lookup(self.W_char, self.input_x_char_flat,name="embedded_char") #shape [batch_size,max_char_per_word*sequence_length,char_embedd_dim] 40 | self.embedded_char_dropout =tf.nn.dropout(self.embedded_char, self.dropout_keep_prob,name="embedded_char_dropout") 41 | #Add CNN get filters and combine with word 42 | with tf.name_scope("char_conv_maxPool"): 43 | filter_shape = [filter_size, char_embedd_dim,num_filters] 44 | W_conv = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W_conv") 45 | b_conv = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b_conv") 46 | 47 | conv = tf.nn.conv1d(self.embedded_char_dropout, 48 | W_conv, 49 | stride=1, 50 | padding="SAME", 51 | name="conv") #will have dimensions [batch_size,out_width,num_filters] out_width is a function of max_words,filter_size and stride_size #(?, 3051, 20) 52 | #out_width for same padding iwth stride 1 given by (max_char_per_word*sequence_length) 53 | print("conv.get_Shape(): ",conv.get_shape()) 54 | # Apply nonlinearity TODO: Test without relu 55 | #h = tf.nn.bias_add(conv, b_conv,name="add bias")#does not change dimensions 56 | h_expand = tf.expand_dims(conv, -1) 57 | print("h_expand.get_Shape(): ",h_expand.get_shape()) 58 | pooled = tf.nn.max_pool( 59 | h_expand, 60 | #[batch, height, width, channels] 61 | ksize=[1,sequence_length * max_char_per_word,1, 1], #On the batch size dimension and the channels dimension, ksize is 1 because we don't want to take the maximum over multiple examples, or over multiples channels. 
62 | strides=[1, max_char_per_word, 1, 1], 63 | padding='SAME', 64 | name="pooled") 65 | #print("pooled.get_Shape(): ",pooled.get_shape()) 66 | #[batch_size,(max_char_per_word*sequence_length), num_filters, 1] --> [batch, sequence_length, num_filters] , same as word_embedding layer (?, 113, 20, 1) --> (?, 113, 20) 67 | self.char_pool_flat = tf.reshape(pooled, [-1,sequence_length,num_filters],name="char_pool_flat") 68 | #print("self.char_pool_flat.get_shape(): ",self.char_pool_flat.get_shape()) 69 | #[batch, sequence_length, word_embedd_dim+num_filters] 70 | self.word_char_features = tf.concat([self.embedded_words, self.char_pool_flat], axis=2) #we mean that the feature with index 2 i/e num_filters is variable 71 | #print("self.word_char_features.get_shape(): ",self.word_char_features.get_shape()) 72 | self.word_char_features_dropout =tf.nn.dropout(self.word_char_features, self.dropout_keep_prob,name="word_char_features_dropout") 73 | 74 | 75 | with tf.name_scope("biLSTM"): 76 | # forward LSTM cell 77 | lstm_fw_cell = rnn.BasicLSTMCell(n_hidden_LSTM, state_is_tuple=True) 78 | # Backward direction cell 79 | lstm_bw_cell = rnn.BasicLSTMCell(n_hidden_LSTM, state_is_tuple=True) 80 | (output_fw, output_bw), _ = tf.nn.bidirectional_dynamic_rnn(lstm_fw_cell, 81 | lstm_bw_cell, self.word_char_features_dropout, sequence_length=self.sequence_lengths, 82 | dtype=tf.float32)# output : [batch_size, timesteps, cell_fw.output_size] 83 | self.biLstm = tf.concat([output_fw, output_bw], axis=-1,name="biLstm") 84 | self.biLstm_clip = tf.clip_by_value(self.biLstm,-grad_clip,grad_clip) 85 | self.biLstm_dropout =tf.nn.dropout(self.biLstm_clip, self.dropout_keep_prob) 86 | 87 | with tf.name_scope("output"): 88 | W_out = tf.get_variable("W_out",shape = [2*n_hidden_LSTM, num_classes],initializer=tf.contrib.layers.xavier_initializer()) 89 | b_out = tf.Variable(tf.constant(0.0, shape=[num_classes]), name="b_out") 90 | 91 | self.biLstm_reshaped = tf.reshape(self.biLstm_dropout, [-1, 2*n_hidden_LSTM]) # [batch_size * timesteps , 2*n_hidden_LSTM] obtained by statement print(self.biLstm.get_shape()) 92 | 93 | # Final (unnormalized) scores and predictions 94 | self.predictions = tf.nn.xw_plus_b(self.biLstm_reshaped, W_out, b_out, name="predictions") # input : [batch_size * timesteps , 2*n_hidden_LSTM] * [2*n_hidden_LSTM, num_classes] = [batch_size * timesteps , num_classes] 95 | self.logits = tf.reshape(self.predictions, [-1, sequence_length, num_classes],name="logits") # output [batch_size, max_seq_len] 96 | 97 | # CalculateMean cross-entropy loss 98 | with tf.name_scope("loss"): 99 | #needs input as [batch_size, max_seq_len, num_tags] 100 | # input_y : [batch_size, max_seq_len] 101 | #self.logits_clipped = tf.clip_by_value(self.logits,1e-10,10000) 102 | log_likelihood, self.transition_params = tf.contrib.crf.crf_log_likelihood( 103 | self.logits, self.input_y, self.sequence_lengths) 104 | self.loss = tf.reduce_mean(-log_likelihood,name="loss") 105 | 106 | '''with tf.name_scope("loss"): 107 | losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=self.input_y) 108 | mask = tf.sequence_mask(self.sequence_lengths) 109 | losses = tf.boolean_mask(losses, mask) 110 | self.loss = tf.reduce_mean(losses)''' 111 | 112 | 113 | 114 | 115 | -------------------------------------------------------------------------------- /test_NER.py: -------------------------------------------------------------------------------- 1 | 2 | #usage python test_NER.py --PathToConfig /data/gilopez/Tf_LSTM_CRF/runs/1494895894/ 
--modelName model-41000 3 | import tensorflow as tf 4 | import utils as utils 5 | import data_processor as dp 6 | 7 | #Alphabet maps objects to integer ids 8 | from alphabet import Alphabet 9 | import network as network 10 | import aux_network_func as af 11 | #import pickle 12 | import dill 13 | 14 | import numpy as np 15 | import os 16 | import time 17 | import datetime 18 | 19 | cwd = os.getcwd() 20 | 21 | tf.flags.DEFINE_string("modelName", 'model', "Name of model (default: model)") 22 | tf.flags.DEFINE_string("PathToConfig", cwd, "Path to the directory where the config file is stored (default: current working directory)") 23 | tf.flags.DEFINE_string("TestFilePath", "eng.testa.iobes.act.part", "Path to the test file (default: eng.testa.iobes.act.part)") 24 | FLAGS = tf.flags.FLAGS 25 | FLAGS._parse_flags() 26 | 27 | configFile = os.path.abspath(os.path.join(FLAGS.PathToConfig ,"config.pkl")) 28 | print(configFile) 29 | FlagsDict = dill.load(open(configFile,'rb')) 30 | 31 | modelName = FLAGS.modelName 32 | 33 | path_to_models = FlagsDict['checkpoint_dir'] 34 | #path_to_models = "/data/gilopez/Tf_LSTM_CRF/runs/1494820638/checkpoints" 35 | logger = utils.get_logger("EvalCode") 36 | 37 | word_alphabet = dill.load(open(os.path.abspath(os.path.join(FLAGS.PathToConfig ,'word_alphabet.pkl')),'rb')) 38 | char_alphabet = dill.load(open(os.path.abspath(os.path.join(FLAGS.PathToConfig ,'char_alphabet.pkl')),'rb')) 39 | label_alphabet = dill.load(open(os.path.abspath(os.path.join(FLAGS.PathToConfig ,'label_alphabet.pkl')),'rb')) 40 | 41 | test_path = FLAGS.TestFilePath 42 | timestamp = str(int(time.time())) 43 | out_dir = os.path.abspath(os.path.join(os.path.curdir, "test", timestamp)) 44 | if not os.path.exists(out_dir): 45 | os.makedirs(out_dir) 46 | 47 | 48 | with tf.Session() as sess: 49 | sess.run(tf.global_variables_initializer()) #always run this before you load the checkpoint; running it afterwards would reset the restored weights. 50 | new_saver = tf.train.import_meta_graph(os.path.abspath(os.path.join(path_to_models, modelName+".meta"))) 51 | new_saver.restore(sess, os.path.abspath(os.path.join(path_to_models, modelName))) 52 | 53 | # Access saved Variables directly.
Get a list of variables by using #to get all keys 54 | #All_varaibles = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) 55 | #Below code verifies that the model was loaded correctly 56 | #print(sess.run('output/b_out:0')) 57 | # to save any variable do : 58 | #Embedding = sess.run(graph.get_tensor_by_name('word_embedding/W_word:0')) 59 | logger.info("Running Test ....") 60 | af.test_step_report(logger,sess, 61 | PadZeroBegin=FlagsDict['PadZeroBegin'],max_length = FlagsDict['sequence_length'], 62 | test_path=test_path, dropout_keep_prob=FlagsDict['dropout_keep_prob'],step=1, 63 | out_dir=out_dir,char_alphabet=char_alphabet,label_alphabet=label_alphabet, 64 | word_alphabet=word_alphabet,word_column=FlagsDict['word_col'], label_column=FlagsDict['label_col'], char_embedd_dim=FlagsDict['char_embedd_dim'],max_char_per_word=FlagsDict['max_char_per_word']) 65 | 66 | ''' This is for interpretability 67 | new_accuracy_test,accuracy_low_classes_test= af.test_step_eval(logger,sess, 68 | PadZeroBegin=FlagsDict['PadZeroBegin'],max_length = FlagsDict['sequence_length'], 69 | test_path=test_path, dropout_keep_prob=FlagsDict['dropout_keep_prob'],step=1, 70 | out_dir=out_dir,char_alphabet=char_alphabet,label_alphabet=label_alphabet, 71 | word_alphabet=word_alphabet,word_column=FlagsDict['word_col'], label_column=FlagsDict['label_col'], char_embedd_dim=FlagsDict['char_embedd_dim'],max_char_per_word=FlagsDict['max_char_per_word']) 72 | 73 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | from gensim.models.word2vec import Word2Vec 2 | import logging 3 | import sys 4 | import gzip 5 | import numpy as np 6 | 7 | def get_logger(name, level=logging.INFO, handler=sys.stdout, 8 | formatter='%(asctime)s - %(name)s - %(levelname)s - %(message)s'): 9 | logger = logging.getLogger(name) 10 | logger.setLevel(logging.INFO) 11 | formatter = logging.Formatter(formatter) 12 | stream_handler = logging.StreamHandler(handler) 13 | stream_handler.setLevel(level) 14 | stream_handler.setFormatter(formatter) 15 | logger.addHandler(stream_handler) 16 | 17 | return logger 18 | 19 | def print_FLAGS(FLAGS,logger): 20 | Flags_Dict = {} 21 | logger.info("\nParameters:") 22 | for attr, value in sorted(FLAGS.__flags.items()): 23 | logger.info("{} = {}".format(attr, value)) 24 | Flags_Dict[attr] = value 25 | logger.info("\n") 26 | return Flags_Dict 27 | 28 | 29 | def load_word_embedding_dict(embedding,embedding_path,logger): 30 | """ 31 | load word embeddings from file 32 | :param embedding: 33 | :param embedding_path: 34 | :param logger: 35 | :return: embedding dict, embedding dimention, caseless 36 | """ 37 | if embedding == 'word2vec': 38 | # loading word2vec 39 | logger.info("Loading word2vec ...") 40 | word2vec = Word2Vec.load_word2vec_format(embedding_path, binary=True) 41 | embedd_dim = word2vec.vector_size 42 | return word2vec, embedd_dim, False 43 | elif embedding == 'glove': 44 | # loading GloVe 45 | logger.info("Loading GloVe ...") 46 | embedd_dim = -1 47 | embedd_dict = dict() 48 | with gzip.open(embedding_path, 'r') as file: 49 | for line in file: 50 | line = line.strip() 51 | if len(line) == 0: 52 | continue 53 | 54 | tokens = line.split() 55 | if embedd_dim < 0: 56 | embedd_dim = len(tokens) - 1 #BECAUSE THE ZEROTH INDEX IS OCCUPIED BY THE WORD 57 | else: 58 | assert (embedd_dim + 1 == len(tokens)) 59 | embedd = np.empty([1, embedd_dim], dtype=np.float64) 60 | embedd[:] = tokens[1:] 61 | 
embedd_dict[tokens[0]] = embedd 62 | return embedd_dict, embedd_dim, True 63 | else: 64 | raise ValueError("embedding should choose from [word2vec, glove]") 65 | 66 | def get_max_length(word_sentences): 67 | max_len = 0 68 | for sentence in word_sentences: 69 | length = len(sentence) 70 | if length > max_len: 71 | max_len = length 72 | return max_len 73 | 74 | def batch_iter(data, batch_size, num_epochs, shuffle=True): 75 | """ 76 | Generates a batch iterator for a dataset. 77 | """ 78 | data = np.array(data) 79 | data_size = len(data) 80 | num_batches_per_epoch = int((len(data)-1)/batch_size) + 1 81 | for epoch in range(num_epochs): 82 | # Shuffle the data at each epoch 83 | if shuffle: 84 | shuffle_indices = np.random.permutation(np.arange(data_size)) 85 | shuffled_data = data[shuffle_indices] 86 | else: 87 | shuffled_data = data 88 | for batch_num in range(num_batches_per_epoch): 89 | start_index = batch_num * batch_size 90 | end_index = min((batch_num + 1) * batch_size, data_size) 91 | yield shuffled_data[start_index:end_index] 92 | 93 | #this function will pad 0 at the beginning of the sentence. if you add beg = false it will add to the end 94 | def padSequence(dataset,max_length,beginZero=True): 95 | dataset_p = [] 96 | actual_sequence_length =[] 97 | #added np.atleast_2d here 98 | for x in dataset: 99 | row_length = len(x) 100 | actual_sequence_length.append(row_length) 101 | if(row_length <=max_length): 102 | if(beginZero): 103 | dataset_p.append(np.pad(x,pad_width=(max_length-len(x),0),mode='constant',constant_values=0)) 104 | else: 105 | dataset_p.append(np.pad(x,pad_width=(0,max_length-len(x)),mode='constant',constant_values=0)) 106 | else: 107 | dataset_p.append(x[0:max_length]) 108 | return np.array(dataset_p),actual_sequence_length 109 | 110 | 111 | --------------------------------------------------------------------------------
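A minimal usage sketch for the GloVe loader in utils.py above, assuming the gzipped embedding file referenced by the training script (glove.6B.100d.gz) is available locally; the logger name is arbitrary:

from utils import get_logger, load_word_embedding_dict

logger = get_logger("demo")
embedd_dict, embedd_dim, caseless = load_word_embedding_dict("glove", "glove.6B.100d.gz", logger)
# embedd_dict maps each word to a (1, embedd_dim) numpy row; caseless is True for GloVe,
# so build_embedd_table looks words up in lower-cased form.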