├── __init__.py
├── util
    ├── __init__.py
    ├── preprocess.py
    ├── util.py
    └── input_helpers.py
├── Sentence_Modeling
    ├── __init__.py
    ├── test.py
    ├── test_mask.py
    ├── Siamese_network.py
    └── Sentence_Model.py
├── word2vec_pretrain
    ├── __init__.py
    ├── visualization.py
    └── preparation.py
├── README.md
├── plot_figures.py
├── visualization.py
├── coorrence.py
├── test_Model.py
├── helper.py
├── DeepModel.py
├── reload_model.py
├── tensor_construction.py
├── MultiGran_Model.py
├── tensor.py
├── Dynamic
    ├── MT_Dynamic_MultiGranModel.py
    └── MT_Dynamic_Arch.py
├── train_test_idf.py
├── train.py
├── MultiTask_MultiGranModel.py
└── Multi_task_Arch.py


/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/util/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/Sentence_Modeling/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/word2vec_pretrain/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # DLDisambiguation
2 | ## A multi-task normalization model with multi-view CNN 
3 | 
4 | ## The goal is do medical concept normalization and link medical short-texts into entities in ICD-10.
5 | 


--------------------------------------------------------------------------------
/plot_figures.py:
--------------------------------------------------------------------------------
 1 | import pylab as pl
 2 | 
 3 | x1 = [1, 2, 3, 4, 5, 6]# Make x, y arrays for each graph
 4 | x = [0.001, 0.01, 0.1, 1, 10, 100]# Make x, y arrays for each graph
 5 | y1 = [0.8487, 0.8562, 0.8578, 0.8581, 0.8255, 0.7031]
 6 | x2 = x1
 7 | y2 = [0.9116, 0.9148, 0.9101, 0.9081, 0.8992, 0.7140]
 8 | 
 9 | import matplotlib.pyplot as plt
10 | 
11 | ax = plt.subplot(111, xlabel='lambda', ylabel='F1 score')
12 | for item in ([ax.title, ax.xaxis.label, ax.yaxis.label] + ax.get_xticklabels() + ax.get_yticklabels()):
13 |  item.set_fontsize(15)
14 | 
15 | plot1, = ax.plot(x1, y1, 'rs-', label='Disease')# use pylab to plot x and y : Give your plots names
16 | plot2, = ax.plot(x2, y2, 'b^-', label='Procedure')
17 | 
18 | # pl.title('MTL performance with different lambda of constraints')# give plot a title
19 | pl.xticks(x1, x, rotation=0)
20 | 
21 | pl.legend(handles=[plot1, plot2],  numpoints=1, fontsize=15)# make legend
22 | pl.show()# show the plot on the screen
23 | 


--------------------------------------------------------------------------------
/util/preprocess.py:
--------------------------------------------------------------------------------
 1 | from __future__ import absolute_import
 2 | from __future__ import division
 3 | from __future__ import print_function
 4 | 
 5 | import re
 6 | import numpy as np
 7 | from tensorflow.contrib import learn  # pylint: disable=g-bad-import-order
 8 | 
 9 | TOKENIZER_RE = re.compile(r"[A-Z]{2,}(?![a-z])|[A-Z][a-z]+(?=[A-Z])|[\'\w\-]+",
10 |                           re.UNICODE)
11 | 
12 | def tokenizer(iterator):
13 |     for value in iterator:
14 |         yield list(value)
15 | 
16 | 
17 | class MyVocabularyProcessor(learn.preprocessing.VocabularyProcessor):
18 |     def __init__(self,
19 |                  max_document_length,
20 |                  min_frequency=0,
21 |                  vocabulary=None,
22 |                  tokenizer_fn=tokenizer):
23 |         self.sup = super(MyVocabularyProcessor, self)
24 |         self.sup.__init__(max_document_length, min_frequency, vocabulary, tokenizer_fn)
25 | 
26 |     def transform(self, raw_documents):
27 |         """Transform documents to word-id matrix.
28 |         Convert words to ids with vocabulary fitted with fit or the one
29 |         provided in the constructor.
30 |         Args:
31 |           raw_documents: An iterable which yield either str or unicode.
32 |         Yields:
33 |           x: iterable, [n_samples, max_document_length]. Word-id matrix.
34 |         """
35 |         for tokens in self._tokenizer(raw_documents):
36 |             word_ids = np.zeros(self.max_document_length, np.int64)
37 |             for idx, token in enumerate(tokens):
38 |                 if idx >= self.max_document_length:
39 |                     break
40 |                 word_ids[idx] = self.vocabulary_.get(token)
41 |             yield word_ids
42 | 


--------------------------------------------------------------------------------
/word2vec_pretrain/visualization.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # coding=utf-8
 3 | 
 4 | import sys
 5 | reload(sys)
 6 | sys.setdefaultencoding('utf-8')
 7 | 
 8 | from matplotlib import rc
 9 | rc('font',**{'family':'sans-serif','sans-serif':['AR PL KaitiM GB']})
10 | 
11 | import numpy as np
12 | import matplotlib.pyplot as plt
13 | from sklearn.manifold import TSNE
14 | from gensim.models import Word2Vec
15 | 
16 | def checkSimilarity(model_file, word):
17 | 
18 |     model = Word2Vec.load(model_file)
19 |     arr = model.most_similar([word])
20 |     for x in arr:
21 |         print(str(x[0] + ":" + str(x[1])))
22 | 
23 | def main():
24 |     # model_file = "../data/word2vec/character.model"
25 |     model_file = "../data/word2vec_new/word.model"
26 |     checkSimilarity(model_file, "左")
27 | 
28 |     # character_wv_file = '../data/word2vec/character_model.txt'
29 |     # word_wv_file = '../data/word2vec/word_model.txt'
30 |     #
31 |     # embeddings_file = word_wv_file
32 |     # wv, vocabulary = load_embeddings(embeddings_file)
33 |     #
34 |     # tsne = TSNE(n_components=2, random_state=0)
35 |     # np.set_printoptions(suppress=True)
36 |     # Y = tsne.fit_transform(wv[:1000, :])
37 |     #
38 |     # plt.scatter(Y[:, 0], Y[:, 1])
39 |     # for label, x, y in zip(vocabulary, Y[:, 0], Y[:, 1]):
40 |     #     plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')
41 |     # plt.show()
42 | 
43 | 
44 | def load_embeddings(file_name):
45 | 
46 |     # with codecs.open(file_name, 'r', 'utf-8') as f_in:
47 |     lines = open(file_name).readlines()[1:]
48 |     vocabulary, wv = zip(*[line.strip().split(' ', 1) for line in lines])
49 | 
50 |     wv = np.loadtxt(wv)
51 |     return wv, vocabulary
52 | 
53 | if __name__ == '__main__':
54 |     main()


--------------------------------------------------------------------------------
/Sentence_Modeling/test.py:
--------------------------------------------------------------------------------
 1 | from DLDisambiguation.util.input_helpers import InputHelper
 2 | from DLDisambiguation.util.preprocess import MyVocabularyProcessor
 3 | import tensorflow as tf
 4 | import os
 5 | 
 6 | # tf.flags.DEFINE_string("test_file", "../data/validation_data_0724_opr.txt", "training file (default: None)")
 7 | tf.flags.DEFINE_string("test_file", "../data/test_data_0816_des.txt", "training file (default: None)")
 8 | 
 9 | FLAGS = tf.flags.FLAGS
10 | inpH = InputHelper()
11 | max_document_length = 10
12 | y_is_value = True
13 | 
14 | model_dir = "./Exp/runs/Description1502955472"
15 | # model_dir = "./runs/Description1500991322"  # 0.760
16 | # model_dir = "./runs/1500428748" # 0.69
17 | # model_dir = "./runs/Description1500983617"  # 0.767
18 | # model_dir = "./runs/Description1501058401" # 0.754
19 | # model_dir = "./runs/Operation1501000120" # 0.809
20 | 
21 | checkpoint_dir = os.path.join(model_dir, "checkpoints")
22 | print(checkpoint_dir)
23 | ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
24 | checkpoint_file = ckpt.model_checkpoint_path
25 | 
26 | vocab_file = os.path.join(checkpoint_dir, "vocab")
27 | # load vocabulary model
28 | vocab_processor = MyVocabularyProcessor(max_document_length, min_frequency=0)
29 | vocab_processor = vocab_processor.restore(vocab_file)
30 | 
31 | test_x1, test_x2, test_y = inpH.getTestIndexedDataSet(FLAGS.test_file, "\t", vocab_processor, max_document_length,
32 |                                                       y_is_value)
33 | 
34 | graph = tf.Graph()
35 | 
36 | with graph.as_default():
37 |     sess = tf.Session()
38 |     with sess.as_default():
39 |         # Load the saved meta graph and restore variables
40 |         saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
41 |         sess.run(tf.initialize_all_variables())
42 |         saver.restore(sess, checkpoint_file)
43 | 
44 |         # Get the placeholders from the graph by name
45 |         # the output is a list with only one element
46 |         input_x1 = graph.get_operation_by_name("input_x1").outputs[0]
47 |         input_x2 = graph.get_operation_by_name("input_x2").outputs[0]
48 |         input_y = graph.get_operation_by_name("input_y").outputs[0]
49 | 
50 |         accuracy_o = graph.get_operation_by_name("accuracy/accuracy").outputs[0]
51 | 
52 |         accu = sess.run([accuracy_o], {input_x1: test_x1, input_x2: test_x2, input_y: test_y})
53 |     print(accu)
54 | 


--------------------------------------------------------------------------------
/visualization.py:
--------------------------------------------------------------------------------
 1 | # coding:utf-8
 2 | import numpy as np
 3 | from matplotlib.font_manager import FontManager, FontProperties
 4 | import matplotlib.pyplot as plt
 5 | 
 6 | plt.rcParams['font.sans-serif'] = ['SimHei']
 7 | from util.input_helpers import InputHelper
 8 | 
 9 | max_document_length = 20
10 | task_num = 2
11 | name = "des" if task_num == 1 else "opr"
12 | 
13 | arr = np.load(
14 |     "/Users/luoyi/Documents/Python/DLDisambiguation/Tensor_files/0823/Length" + str(
15 |         max_document_length) + "/test_" + name + ".npy")
16 | 
17 | inpH = InputHelper()
18 | 
19 | data_file_test = "/Users/luoyi/Documents/Python/DLDisambiguation/data/test_data_0823_" + name + ".txt"
20 | x_test_mention, x_test_entity, y_test = inpH.getTsvTestData(data_file_test, "\t", max_document_length, y_value=False)
21 | 
22 | sample_n = 1
23 | dir_ = "./fig/"
24 | 
25 | 
26 | # sample_n = len(arr)
27 | 
28 | def getChineseFont():
29 |     return FontProperties(fname='/Users/luoyi/Downloads/msyh.ttf')
30 | 
31 | 
32 | sample_index = 352
33 | row_n, col_n = 1, 4
34 | plt.subplots(row_n, col_n, figsize=(20, 10))
35 | 
36 | for j in range(col_n):
37 |     t = arr[sample_index, :, :, j]
38 |     # x = t
39 |     # t = (x - np.min(x)) / (np.max(x) - np.min(x))
40 |     # t = np.abs(t)
41 |     plt.subplot(row_n, col_n, j + 1)
42 |     plt.imshow(t, vmin=-1, vmax=1, interpolation='nearest', cmap=plt.cm.magma)
43 |     # plt.imshow(t, vmin=-1, vmax=1, interpolation='nearest', cmap=plt.cm.bone)
44 |     # plt.imshow(t, vmin=-1, vmax=1, interpolation='nearest', cmap=plt.cm.hsv)
45 |     # plt.imshow(t, vmin=-1, vmax=1, interpolation='nearest', cmap=plt.cm.prism)
46 |     # plt.imshow(t, vmin=-1, vmax=1, interpolation='nearest', cmap=plt.cm.flag)
47 |     # plt.imshow(t, vmin=-1, vmax=1, interpolation='nearest', cmap=plt.cm.autumn)
48 |     # plt.imshow(t, vmin=-1, vmax=1, interpolation='nearest', cmap=plt.cm.cool)
49 |     # plt.imshow(t, vmin=-1, vmax=1, interpolation='nearest', cmap=plt.cm.jet)
50 |     # plt.imshow(t, vmin=-1, vmax=1, interpolation='nearest', cmap=plt.get_cmap('gray'))
51 | 
52 |     plt.title(str(j))
53 | 
54 | # plt.ylabel(unicode(x_test_mention[sample_index]), fontproperties=getChineseFont())
55 | # plt.xlabel(unicode(x_test_entity[sample_index]), fontproperties=getChineseFont())
56 | 
57 | # plt.show()
58 | plt.colorbar(fraction=0.046, pad=0.04)
59 | plt.savefig(dir_ + "outfile" + str(sample_index) + name + "_magma.jpg")
60 | 


--------------------------------------------------------------------------------
/coorrence.py:
--------------------------------------------------------------------------------
 1 | # !/usr/bin/env python
 2 | # coding=utf-8
 3 | 
 4 | # # discover relationships between disease and procedure from labeled_data and save in coorrence_file
 5 | 
 6 | file_des_opr = open("./data/exp0823/filter_pos_data_review_final.txt")
 7 | res = open("coorrence_file.txt", "w+")
 8 | 
 9 | entity_names = []
10 | map_dict = []
11 | 
12 | line = file_des_opr.readline()
13 | while line != "":
14 |     tmp = line.split("\t")
15 |     des_e_name = tmp[1].strip()
16 |     opr_e_name = tmp[3].strip()
17 | 
18 |     if entity_names.__contains__(des_e_name):
19 |         i = entity_names.index(des_e_name)
20 |         if opr_e_name in map_dict[i].keys():
21 |             map_dict[i][opr_e_name] += 1
22 |         else:
23 |             map_dict[i][opr_e_name] = 1
24 |     else:
25 |         entity_names.append(des_e_name)
26 |         map_dict.append({opr_e_name: 1})
27 |     line = file_des_opr.readline()
28 | 
29 | length = len(entity_names)
30 | for i in range(length):
31 |     res.write(entity_names[i] + "\t")
32 |     map_dict_des = map_dict[i]
33 |     for k, v in map_dict_des.items():
34 |         res.write(k + ":" + str(v) + "_")
35 |     res.write("\n")
36 | 
37 | # # discover relationships between disease and procedure from Database and save in new_co_file.file
38 | 
39 | # # !/usr/bin/env python
40 | # # coding=utf-8
41 | # import MySQLdb
42 | # import codecs
43 | 
44 | # conn = MySQLdb.connect("localhost", "root", "10081008", "medical", charset='utf8')
45 | # cursor = conn.cursor()
46 | # cursor.execute('select S050100, S050501 from d2014_2015 where S050100 != "" and S050501 != "" limit 10000000;')
47 | # values = cursor.fetchall()
48 | # print("Finished data loading...")
49 | #
50 | # cursor.execute('select 疾病名称 from Norm6;')
51 | # disease_tuple = cursor.fetchall()
52 | # disease_list = [i[0] for i in disease_tuple]
53 | #
54 | # cursor.execute('select 手术名称 from Treatment;')
55 | # operation_tuple = cursor.fetchall()
56 | # operation_list = [i[0] for i in operation_tuple]
57 | # print("Finished Disease and Operation Names loading...")
58 | #
59 | # co_file = codecs.open("./data/new_co_file.txt", "w+", encoding="utf-8")
60 | # map_dict = {}
61 | # for i in values:
62 | #     d_name = i[0]
63 | #     o_name = i[1]
64 | #     if d_name in disease_list and o_name in operation_list:
65 | #         if d_name in map_dict.keys():
66 | #             o_dict = map_dict[d_name]
67 | #             if o_name in o_dict.keys():
68 | #                 map_dict[d_name][o_name] += 1
69 | #             else:
70 | #                 map_dict[d_name][o_name] = 1
71 | #         else:
72 | #             map_dict[d_name] = {o_name: 1}
73 | #
74 | # for k, v in map_dict.iteritems():
75 | #     co_file.write(k + "\t")
76 | #     for o_name, num in v.iteritems():
77 | #         co_file.write(o_name + ":"+str(num) + "_")
78 | #     co_file.write("\n")
79 | # co_file.close()
80 | 


--------------------------------------------------------------------------------
/test_Model.py:
--------------------------------------------------------------------------------
 1 | import tensorflow as tf
 2 | import os
 3 | from DLDisambiguation.util.input_helpers import InputHelper
 4 | from tensor import Tensor
 5 | import numpy as np
 6 | from util.util import write_evaluation_file
 7 | 
 8 | tf.flags.DEFINE_string("train_dir", "./", "Training dir root")
 9 | 
10 | FLAGS = tf.flags.FLAGS
11 | FLAGS._parse_flags()
12 | 
13 | task_num = 2
14 | inpH = InputHelper()
15 | max_document_length = 20
16 | 
17 | name = "des" if task_num == 1 else "opr"
18 | 
19 | # load in model
20 | model_dir = "./runs/Single_task21501595265"
21 | checkpoint_dir = os.path.join(model_dir, "checkpoints")
22 | print(checkpoint_dir)
23 | ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
24 | checkpoint_file = ckpt.model_checkpoint_path
25 | 
26 | lstm_dir = "Description1500991322" if task_num == 1 else "Operation1501000120"
27 | lstm_dir = os.path.join("./Sentence_Modeling/runs", lstm_dir)
28 | 
29 | # load data
30 | load_Tensor = True
31 | 
32 | data_file = os.path.join(FLAGS.train_dir, "data/training_data_0724_" + name + ".txt")
33 | data_file_test = os.path.join(FLAGS.train_dir, "data/test_data_0724_" + name + ".txt")
34 | data_file_val = os.path.join(FLAGS.train_dir, "data/validation_data_0724_" + name + ".txt")
35 | 
36 | x_test_mention, x_test_entity, y_test = inpH.getTsvTestData(data_file_test, "\t", max_document_length, y_value=False)
37 | 
38 | if load_Tensor:
39 |     mydir = "./Length" + str(max_document_length) + "/"
40 |     x_test_tensor = np.load(mydir + "test_" + name + ".npy")
41 | 
42 | else:
43 |     x_test_tensor = Tensor(x_test_mention, x_test_entity, len(x_test_entity), max_document_length, task_num,
44 |                            lstm_dir).get_tensor()
45 |     x_test_tensor = x_test_tensor.transpose((0, 2, 3, 1))
46 | 
47 | graph = tf.Graph()
48 | 
49 | eval_file = open(os.path.join(model_dir, "test_analysis.txt"), "w+")
50 | rightfile = open(os.path.join(model_dir, "right_analysis.txt"), "w+")
51 | wrongfile = open(os.path.join(model_dir, "wrong_analysis.txt"), "w+")
52 | 
53 | with graph.as_default():
54 |     sess = tf.Session()
55 | 
56 |     with sess.as_default():
57 |         # Load the saved meta graph and restore variables
58 |         saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
59 |         sess.run(tf.initialize_all_variables())
60 |         saver.restore(sess, checkpoint_file)
61 | 
62 |         for var in tf.trainable_variables():
63 |             print(var.name)
64 |             print(var)
65 |             print(sess.run(var))
66 | 
67 |         # Get the placeholders from the graph by name
68 |         # the output is a list with only one element
69 |         input_tensor = graph.get_operation_by_name("input_tensor").outputs[0]
70 |         input_y = graph.get_operation_by_name("input_y").outputs[0]
71 |         droppout = graph.get_operation_by_name("dropout_keep_prob").outputs[0]
72 | 
73 |         predictions = graph.get_operation_by_name("output/predictions").outputs[0]
74 |         accuracy = graph.get_operation_by_name("accuracy/accuracy").outputs[0]
75 |         scores = graph.get_operation_by_name("output/scores").outputs[0]
76 | 
77 |         pre, accu, scores = sess.run(
78 |             [predictions, accuracy, scores], {input_tensor: x_test_tensor, input_y: y_test, droppout: 1})
79 | 
80 |         eval_file.write("Accu: " + str(accu) + "\n")
81 |         length = len(x_test_mention)
82 |         labels = np.argmax(y_test, 1)
83 | 
84 |         write_evaluation_file(eval_file, rightfile, wrongfile, labels, pre, x_test_mention, x_test_entity)
85 | 


--------------------------------------------------------------------------------
/helper.py:
--------------------------------------------------------------------------------
  1 | # combine segmented segments to complete ones
  2 | # generate new_training_data_.txt
  3 | 
  4 | # file = open("./data/" + name + "_data_0810.txt")
  5 | # line = file.readline().strip()
  6 | # new_f = open("./data/new_training_data_.txt", "w+")
  7 | #
  8 | # while line != "":
  9 | #     tmp = line.split("\t")
 10 | #     line = file.readline().strip()
 11 | #     unormalized = "".join(tmp[1].split(" "))
 12 | #     normalized = "".join(tmp[2].split(" "))
 13 | #     new_f.write(unormalized + "\t" + normalized + "\n")
 14 | #
 15 | # new_f.close()
 16 | # file.close()
 17 | #
 18 | # file = open("./data/new_training_data_.txt", "r")
 19 | # data1 = open("./data/training_data.txt", "w+")
 20 | # data2 = open("./data/testing_data.txt", "w+")
 21 | #
 22 | # line = file.readline()
 23 | # cnt = 0
 24 | # while line != "":
 25 | #     if cnt < 15000:
 26 | #         data1.write(line)
 27 | #     elif cnt < 20000:
 28 | #         data2.write(line)
 29 | #     else:
 30 | #         break
 31 | #     line = file.readline()
 32 | #     cnt += 1
 33 | # file.close()
 34 | # data1.close()
 35 | # data2.close()
 36 | 
 37 | key_set = ["train", "test"]
 38 | # key_set = ["training", "validation", "test"]
 39 | 
 40 | for key in key_set:
 41 |     prex = "_data_0823"
 42 |     # prex = "_dynamic_data"
 43 |     file = open("./data/exp0823/data_augment_" + key + ".txt")
 44 |     line = file.readline().strip()
 45 |     new_f = open("./data/" + key + prex + "_des.txt", "w+")
 46 |     new_f_o = open("./data/" + key + prex + "_opr.txt", "w+")
 47 | 
 48 |     while line != "":
 49 |         tmp = line.split("\t")
 50 |         line = file.readline().strip()
 51 |         label = tmp[0]
 52 |         unormalized_d = tmp[1]
 53 |         normalized_d = tmp[2]
 54 |         new_f.write(label + "\t" + unormalized_d + "\t" + normalized_d + "\n")
 55 | 
 56 |         label2 = tmp[3]
 57 |         unormalized_o = tmp[4]
 58 |         normalized_o = tmp[5]
 59 |         new_f_o.write(label2 + "\t" + unormalized_o + "\t" + normalized_o + "\n")
 60 | 
 61 |     new_f.close()
 62 |     new_f_o.close()
 63 |     file.close()
 64 | 
 65 | # # generate dynamic dataset
 66 | # import random
 67 | 
 68 | # name = "validation"
 69 | # file_name = "./data/exp0803/" + name + "_data_0803.lpy.csv"
 70 | # file_t = open(file_name)
 71 | # line = file_t.readline()
 72 | #
 73 | # res_file = open("./data/exp0803/" + name + "_dynamic_data.txt", "w+")
 74 | # cnt = 0
 75 | # while line != "":
 76 | #     cnt += 1
 77 | #     if cnt > 80000:
 78 | #         break
 79 | #     res = line.split("\t")
 80 | #     random_n = random.random()
 81 | #     if random_n < 0.6:
 82 | #         res_file.write(line)
 83 | #     else:
 84 | #         res_file.write("\t".join(res[:3]) + "\n")
 85 | #     line = file_t.readline()
 86 | # res_file.close()
 87 | 
 88 | ########### Analyze results of models
 89 | 
 90 | # file2 = open("./runs/Exp/Single_task11502361344/right_cases.txt")
 91 | # file1 = open("./runs/Exp/Single_task11502361227/right_cases.txt")
 92 | #
 93 | # line = file1.readline()
 94 | # arr1 = []
 95 | # while line != "":
 96 | #     arr1.append(line)
 97 | #     line = file1.readline()
 98 | # line = file2.readline()
 99 | #
100 | # arr2 = []
101 | # while line != "":
102 | #     if line in arr1:
103 | #         arr1.remove(line)
104 | #         arr2.append(line)
105 | #     line = file2.readline()
106 | #
107 | # ans = open("ans.txt", "w+")
108 | # ans2 = open("ans_overlap.txt", "w+")
109 | # for i in arr1:
110 | #     ans.write(i)
111 | #
112 | # for i in arr2:
113 | #     ans2.write(i)
114 | # ans.close()
115 | # ans2.close()
116 | 


--------------------------------------------------------------------------------
/DeepModel.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # encoding=utf-8
 3 | 
 4 | import tensorflow as tf
 5 | 
 6 | # no use any more...*********************************** Change to MultiGran_Model.py
 7 | class CNNModel(object):
 8 | 
 9 |     def __init__(self, max_len, filter_sizes, num_filters, l2_reg_lambda=0.0):
10 |         channel_num = 4
11 | 
12 |         # Placeholders for input, output and dropout
13 |         self.input_tensor = tf.placeholder(tf.float32, [None, max_len, max_len, 4], name="input_tensor")
14 |         self.input_y = tf.placeholder(tf.float32, [None, 2], name="input_y")
15 |         self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
16 | 
17 |         # Keeping track of l2 regularization loss (optional)
18 |         l2_loss = tf.constant(0.0)
19 | 
20 |         # Create a convolution + maxpool layer for each filter size
21 |         pooled_outputs = []
22 |         for i, filter_size in enumerate(filter_sizes):
23 |             filter_shape = [filter_size, filter_size, channel_num, num_filters]
24 | 
25 |             with tf.name_scope("conv-maxpool-%s" % filter_size):
26 |                 # Convolution Layer
27 |                 W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
28 |                 b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
29 |                 conv = tf.nn.conv2d(
30 |                     self.input_tensor,
31 |                     W,
32 |                     strides=[1, 1, 1, 1],
33 |                     padding="VALID",
34 |                     name="conv")
35 |                 # Apply nonlinearity
36 |                 h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
37 | 
38 |                 # Maxpooling over the outputs
39 |                 pooled = tf.nn.max_pool(
40 |                     h,
41 |                     # ksize=[1, filter_size, filter_size, 1],
42 |                     ksize=[1, max_len - filter_size + 1, max_len - filter_size + 1, 1],
43 |                     # ksize=[1, max_len - filter_size + 1, 1, 1],
44 |                     strides=[1, 1, 1, 1],
45 |                     padding='VALID',
46 |                     name="pool")
47 |                 pooled_outputs.append(pooled)
48 | 
49 |         # Combine all the pooled features
50 |         num_filters_total = num_filters * len(filter_sizes)
51 |         self.h_pool = tf.concat(pooled_outputs, 3) # 128
52 |         self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])  # 128
53 | 
54 |         # Add dropout
55 |         with tf.name_scope("dropout"):
56 |             self.h_drop = tf.nn.dropout(self.h_pool_flat, self.dropout_keep_prob, name="hidden_output_drop")
57 | 
58 |         # Final (unnormalized) scores and predictions
59 |         with tf.name_scope("output"):
60 |             W = tf.get_variable(
61 |                 "W_output",
62 |                 shape=[num_filters_total, 2],
63 |                 initializer=tf.contrib.layers.xavier_initializer())
64 |             b = tf.Variable(tf.constant(0.1, shape=[2]), name="b")
65 |             l2_loss += tf.nn.l2_loss(W)
66 |             l2_loss += tf.nn.l2_loss(b)
67 |             self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores")
68 |             self.predictions = tf.argmax(self.scores, 1, name="predictions")
69 | 
70 |         # Calculate Mean cross-entropy loss
71 |         with tf.name_scope("loss"):
72 |             losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.scores, labels=self.input_y)
73 |             self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss
74 | 
75 |         # Accuracy
76 |         with tf.name_scope("accuracy"):
77 |             correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
78 |             self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")
79 | 


--------------------------------------------------------------------------------
/Sentence_Modeling/test_mask.py:
--------------------------------------------------------------------------------
  1 | from DLDisambiguation.util.input_helpers import InputHelper
  2 | from DLDisambiguation.util.preprocess import MyVocabularyProcessor
  3 | import tensorflow as tf
  4 | import os
  5 | import numpy as np
  6 | 
  7 | 
  8 | def writeToFile(m, file):
  9 |     l = len(m)
 10 |     for i in range(l):
 11 |         l_col = len(m[i])
 12 |         file.write("[")
 13 |         for j in range(l_col):
 14 |             file.write(str(m[i][j]) + " ")
 15 |         file.write("]\n")
 16 |     file.write("\n")
 17 | 
 18 | inpH = InputHelper()
 19 | max_document_length = 10
 20 | y_is_value = True
 21 | 
 22 | 
 23 | def get_data(vocab_processor, inpH, train_x1, train_x2, train_y, max_document_length):
 24 |     train_x1_i = np.asarray(list(vocab_processor.transform(train_x1)))
 25 |     train_x2_i = np.asarray(list(vocab_processor.transform(train_x2)))
 26 | 
 27 |     mask_train_x1 = np.zeros([len(train_x1_i), max_document_length])
 28 |     mask_train_x2 = np.zeros([len(train_x2_i), max_document_length])
 29 | 
 30 |     new_mask_x1, new_mask_x2 = inpH.padding_and_generate_mask(train_x1, train_x2, mask_train_x1, mask_train_x2)
 31 |     return (train_x1_i, train_x2_i, new_mask_x1, new_mask_x2, train_y)
 32 | 
 33 | 
 34 | model_dir = "./Exp/runs/Description1502868912"
 35 | 
 36 | checkpoint_dir = os.path.join(model_dir, "checkpoints")
 37 | print(checkpoint_dir)
 38 | ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
 39 | checkpoint_file = ckpt.model_checkpoint_path
 40 | 
 41 | vocab_file = os.path.join(checkpoint_dir, "vocab")
 42 | # load vocabulary model
 43 | vocab_processor = MyVocabularyProcessor(max_document_length, min_frequency=0)
 44 | vocab_processor = vocab_processor.restore(vocab_file)
 45 | 
 46 | task_num = 1
 47 | name = "des" if task_num == 1 else "opr"
 48 | test_file = "../data/test_data_0816_" + name + ".txt"
 49 | test_x1, test_x2, test_y = inpH.getTsvTestData(test_file, "\t", max_document_length, y_is_value)
 50 | 
 51 | test_set = get_data(vocab_processor, inpH, test_x1, test_x2, test_y, max_document_length)
 52 | 
 53 | filename = model_dir + "/test_look.txt"
 54 | file = open(filename, "w+")
 55 | 
 56 | # param_f = open(model_dir + "/params.txt", "w+")
 57 | graph = tf.Graph()
 58 | 
 59 | with graph.as_default():
 60 |     sess = tf.Session()
 61 |     with sess.as_default():
 62 |         # Load the saved meta graph and restore variables
 63 |         saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
 64 |         sess.run(tf.initialize_all_variables())
 65 |         saver.restore(sess, checkpoint_file)
 66 | 
 67 |         # for var in tf.trainable_variables():
 68 |         #     print(var.name)
 69 |         #     print(var)
 70 |         #     print(sess.run(var))
 71 | 
 72 |         # Get the placeholders from the graph by name
 73 |         # the output is a list with only one element
 74 |         input_x1 = graph.get_operation_by_name("input_x1").outputs[0]
 75 |         mask_x1 = graph.get_operation_by_name("mask_x1").outputs[0]
 76 |         input_x2 = graph.get_operation_by_name("input_x2").outputs[0]
 77 |         mask_x2 = graph.get_operation_by_name("mask_x2").outputs[0]
 78 |         input_y = graph.get_operation_by_name("input_y").outputs[0]
 79 | 
 80 |         accuracy = graph.get_operation_by_name("accuracy/accuracy").outputs[0]
 81 |         r1 = graph.get_operation_by_name("sentence_embedding/Representation1").outputs[0]
 82 |         r2 = graph.get_operation_by_name("sentence_embedding/Representation2").outputs[0]
 83 | 
 84 |         accu, vr1, vr2 = sess.run([accuracy, r1, r2], {input_x1: test_set[0], mask_x1: test_set[2],
 85 |                                                        input_x2: test_set[1], mask_x2: test_set[3],
 86 |                                                        input_y: test_set[4]})
 87 | 
 88 |     # r(input_size, None, hidden_n * 2) => (None, input_size, hidden_n * 2)
 89 |     representation1 = np.transpose(vr1, (1, 0, 2))
 90 |     representation2 = np.transpose(vr2, (1, 0, 2))
 91 | 
 92 |     test_x1 = list(test_x1)
 93 |     test_x2 = list(test_x2)
 94 |     test_y = list(test_y)
 95 |     sample_n = len(representation1)
 96 |     for i in range(sample_n):
 97 |         matrix1 = representation1[i]
 98 |         matrix2 = representation2[i]
 99 |         file.write("sample " + str(i) + "\n")
100 |         file.write(str(test_x1[i]) + " " + str(test_x2[i]) + " " + str(test_y[i]) + "\n")
101 |         writeToFile(matrix1, file)
102 |         writeToFile(matrix2, file)
103 | 
104 |         file.write("\n")
105 | 
106 |     print(accu)
107 | 


--------------------------------------------------------------------------------
/reload_model.py:
--------------------------------------------------------------------------------
  1 | import tensorflow as tf
  2 | import os
  3 | from DLDisambiguation.util.input_helpers import InputHelper
  4 | import matplotlib.pyplot as plt
  5 | import numpy as np
  6 | 
  7 | 
  8 | def plot_conv(sess, t_vars, name):
  9 |     var_conv = [v for v in t_vars if name in v.name]
 10 |     W = var_conv[0]  # [2, 2, 1, 8]
 11 |     W = sess.run(W)
 12 | 
 13 |     length = W.shape[-1]
 14 | 
 15 |     row_n = 2 if length == 8 else 4
 16 |     col_n = length / row_n
 17 |     plt.subplots(row_n, col_n)
 18 | 
 19 |     for i in range(length):
 20 |         axes = plt.subplot(row_n, col_n, i + 1)
 21 |         map = W[:, :, 0, i]
 22 |         plt.imshow(map,cmap=plt.cm.magma)
 23 |         # plt.imshow(map, cmap=plt.get_cmap('gray'))
 24 |         # plt.xlabel(i)
 25 |         axes.set_xticks([])
 26 |         axes.set_yticks([])
 27 |     # plt.colorbar(fraction=0.046, pad=0.04)
 28 |     plt.savefig(dir_ + "map" + name + ".jpg")
 29 | 
 30 | 
 31 | def plot_activation(sample_index, k, row_n, col_n, conv, name):
 32 |     for i in range(col_n):
 33 |         k += 1
 34 |         m = plt.subplot(row_n, col_n, k)
 35 |         if i == col_n / 2:
 36 |             m.set_title(name)
 37 |         # plt.imshow(conv[sample_index, :, :, i], vmin=-1, vmax=1, interpolation='nearest', cmap=plt.cm.jet)
 38 |         plt.imshow(conv[sample_index, :, :, i], interpolation='nearest', cmap=plt.cm.magma)
 39 |         # plt.imshow(conv[sample_index, :, :, i], cmap=plt.cm.jet, aspect='auto')
 40 |         # plt.imshow(conv[sample_index, :, :, i], cmap=plt.get_cmap('gray'), aspect='auto')
 41 |         m.set_xticks([])
 42 |         m.set_yticks([])
 43 |     # plt.title(name)
 44 |     # plt.colorbar(fraction=0.046, pad=0.04)
 45 |     return k
 46 | 
 47 | 
 48 | def plot_activations(conv1, conv2, conv3, conv4):
 49 |     length = conv1.shape[-1]
 50 |     row_n = 4
 51 |     col_n = length
 52 |     plt.subplots(row_n, col_n, figsize=(20, 10))
 53 |     sample_idnex = 352
 54 | 
 55 |     k = 0
 56 |     k = plot_activation(sample_idnex, k, row_n, col_n, conv1, "Str")
 57 |     k = plot_activation(sample_idnex, k, row_n, col_n, conv2, "Character Embedding")
 58 |     k = plot_activation(sample_idnex, k, row_n, col_n, conv3, "Word Embedding")
 59 |     plot_activation(sample_idnex, k, row_n, col_n, conv4, "Sentence Embedding")
 60 | 
 61 |     # plt.colorbar()
 62 |     plt.savefig(dir_ + str(sample_idnex) + "activations_color" + ".jpg")
 63 | 
 64 | 
 65 | tf.flags.DEFINE_string("train_dir", "./", "Training dir root")
 66 | FLAGS = tf.flags.FLAGS
 67 | FLAGS._parse_flags()
 68 | 
 69 | task_num = 1
 70 | inpH = InputHelper()
 71 | max_document_length = 10
 72 | dir_ = "fig/"
 73 | 
 74 | model_dir = "./runs/NewExp/Single_task11503543419"
 75 | checkpoint_dir = os.path.join(model_dir, "checkpoints")
 76 | print(checkpoint_dir)
 77 | ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
 78 | checkpoint_file = ckpt.model_checkpoint_path
 79 | #
 80 | # mydir = "./Length" + str(max_document_length) + "/"
 81 | # x_test_tensor = np.load(mydir + "test_des" + ".npy")
 82 | x_test_tensor = np.load("./Tensor_files/0823/Length10/test_des.npy")
 83 | graph = tf.Graph()
 84 | 
 85 | with graph.as_default():
 86 |     sess = tf.Session()
 87 | 
 88 |     with sess.as_default():
 89 |         # Load the saved meta graph and restore variables
 90 |         saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
 91 |         sess.run(tf.initialize_all_variables())
 92 |         saver.restore(sess, checkpoint_file)
 93 | 
 94 |         # PLOT Conv Filters
 95 |         t_vars = tf.trainable_variables()
 96 |         for var in t_vars:
 97 |             print(var.name)
 98 |             print(var)
 99 |             print(sess.run(var))
100 |         plot_conv(sess, t_vars, "conv1")
101 |         plot_conv(sess, t_vars, "conv1_1")
102 |         plot_conv(sess, t_vars, "conv1_2")
103 |         plot_conv(sess, t_vars, "conv1_3")
104 | 
105 |         input_tensor = graph.get_operation_by_name("input_tensor").outputs[0]
106 |         droppout = graph.get_operation_by_name("dropout_keep_prob").outputs[0]
107 | 
108 |         conv1 = graph.get_operation_by_name("conv1/conv1").outputs[0]
109 |         conv2 = graph.get_operation_by_name("conv1_1/conv1").outputs[0]
110 |         conv3 = graph.get_operation_by_name("conv1_2/conv1").outputs[0]
111 |         conv4 = graph.get_operation_by_name("conv1_3/conv1").outputs[0]
112 | 
113 |         conv_layer1, conv_layer2, conv_layer3, conv_layer4 = sess.run([conv1, conv2, conv3, conv4],
114 |                                                                       feed_dict={input_tensor: x_test_tensor})
115 |         plot_activations(conv_layer1, conv_layer2, conv_layer3, conv_layer4)
116 | 


--------------------------------------------------------------------------------
/word2vec_pretrain/preparation.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # coding=utf-8
  3 | 
  4 | from sklearn.feature_extraction.text import TfidfVectorizer
  5 | import jieba
  6 | import re
  7 | import gensim
  8 | import codecs
  9 | import os
 10 | from DLDisambiguation.util.util import preprocess_unit
 11 | 
 12 | def generateProcessedCorpusFile(src_file_path, character_file_path, word_file_path):
 13 |     """
 14 |     载入语料，去掉NULL的无效数据，
 15 |     分词用空格隔开每个词并生成word_src.txt
 16 |     将每个字用空格隔开并生成character_src.txt
 17 |     :param src_file_path:
 18 |     :param character_file_path:
 19 |     :param word_file_path:
 20 |     :return:
 21 |     """
 22 |     data = []
 23 |     file = codecs.open(src_file_path, "r")
 24 |     line = file.readline()
 25 | 
 26 |     character_file = codecs.open(character_file_path, "w+", "utf-8")
 27 |     word_file = codecs.open(word_file_path, "w+", "utf-8")
 28 | 
 29 |     while line != "":
 30 |         d = line.strip().decode("utf-8")
 31 |         # 处理"无"，"NA"."NULL".""的情况
 32 |         if len(d) != 1 and d != "NA" and d != "NULL" and d != "":
 33 |             data.append(d)
 34 |         line = file.readline()
 35 | 
 36 |     for i in data:
 37 |         res = preprocess_unit(i)
 38 |         characters = list("".join(res))
 39 |         if len(characters) == 0:
 40 |             continue
 41 |         character_file.write(" ".join(characters) + "\n")
 42 |     character_file.close()
 43 |     print("Finished character model!")
 44 | 
 45 |     for i in data:
 46 |         res = preprocess_unit(i)
 47 |         x = jieba.cut(res)
 48 |         words = list(x)
 49 |         if len(words) == 0:
 50 |             continue
 51 |         word_file.write(" ".join(words) + "\n")
 52 |     word_file.close()
 53 |     print("Finished Word model!")
 54 | 
 55 | 
 56 | def get_IDF(corpus_file, idf_file):
 57 |     file = open(corpus_file)
 58 |     line = file.readline()
 59 |     counter = 0
 60 |     corpus = []
 61 | 
 62 |     while line != "":
 63 |         counter += 1
 64 | 
 65 |         line = preprocess_unit(line)
 66 |         cut_words = list(jieba.cut(line))
 67 |         cut_words_valid = [i for i in cut_words if re.sub("\w+", "", i) != ""]
 68 |         content = " ".join(cut_words_valid)
 69 |         corpus.append(content)
 70 |         line = file.readline()
 71 |     file.close()
 72 | 
 73 |     # compute idf model
 74 |     vectorizer = TfidfVectorizer(
 75 |         use_idf=True,
 76 |         norm=None,
 77 |         smooth_idf=False,  # idf = ln(N+1 / ni+1)
 78 |         sublinear_tf=False,  # tf = 1+ln(tf)
 79 |         binary=False,
 80 |         min_df=1, max_df=1.0, max_features=None,
 81 |         strip_accents='unicode',
 82 |         ngram_range=(1, 1), preprocessor=None, stop_words=None, tokenizer=None, vocabulary=None
 83 |     )
 84 |     X = vectorizer.fit_transform(corpus)
 85 |     idf = vectorizer.idf_
 86 |     name_idf_dict = dict(zip(vectorizer.get_feature_names(), idf))
 87 | 
 88 |     # write IDF_Model
 89 |     idf_file = open(idf_file, "w+")
 90 |     for k, v in name_idf_dict.items():
 91 |         idf_file.write(k.encode("utf-8") + ":" + str(v) + "\n")
 92 |     idf_file.close()
 93 |     print("IDF_Model Finished!")
 94 | 
 95 | 
 96 | class MySentences(object):
 97 |     def __init__(self, fname):
 98 |         self.fname = fname
 99 | 
100 |     def __iter__(self):
101 |         for line in open(self.fname):
102 |             yield line.split()
103 | 
104 | 
105 | def checkPath(x):
106 |     if not os.path.exists(x):
107 |         open(x, 'a').close()
108 | 
109 | 
110 | def compute_word2vec(embedding_dir, type):
111 |     load_model_flag = False
112 |     src_file = embedding_dir + type + "_src.txt"
113 |     model_file = embedding_dir + type + '.model'
114 |     wv_file = embedding_dir + type + 'character_model.txt'
115 | 
116 |     if not load_model_flag:
117 |         sentences = MySentences(src_file)  # a memory-friendly iterator
118 |         model = gensim.models.Word2Vec(sentences)
119 | 
120 |         checkPath(model_file)
121 |         checkPath(wv_file)
122 | 
123 |         model.save(model_file)  # save model
124 |         model.wv.save_word2vec_format(wv_file, binary=False)  # save word2vec txt
125 |         print("Word2vec model finished!\n")
126 |     else:
127 |         model = gensim.models.Word2Vec.load(model_file)  # load in model
128 | 
129 | 
130 | if __name__ == "__main__":
131 | 
132 |     corpus_file = "../data/d2013_operation.txt"
133 |     idf_model = "../data/idfModel_operation.txt"
134 | 
135 |     # corpus_file = "../data/db_description_d2013.txt"
136 |     # idf_model = "../data/idfModel.txt"
137 | 
138 |     task_num = 1
139 |     if task_num == 1:
140 |         embedding_dir = "../data/word2vec_new/"
141 |     else:
142 |         embedding_dir = "../data/operation/"
143 | 
144 |     # generate IDF Model
145 |     get_IDF(corpus_file, idf_model)
146 | 
147 |     # produce character_src_file and word_src_file from corpus file
148 |     # generateProcessedCorpusFile(corpus_file, character_src_file, word_src_file)
149 | 
150 |     # generate word2vec for characters and words
151 |     # compute_word2vec(embedding_dir, "character")
152 |     # compute_word2vec(embedding_dir, "word")


--------------------------------------------------------------------------------
/tensor_construction.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # encoding=utf-8
  3 | 
  4 | import os
  5 | import numpy as np
  6 | import tensorflow as tf
  7 | 
  8 | from DLDisambiguation.util.input_helpers import InputHelper
  9 | from tensor import Tensor
 10 | 
 11 | tf.flags.DEFINE_string("train_dir", "./", "Training dir root")
 12 | tf.flags.DEFINE_string("max_sequence_len", 10, "length")
 13 | tf.flags.DEFINE_string("max_sequence_len2", 20, "length")
 14 | FLAGS = tf.flags.FLAGS
 15 | FLAGS._parse_flags()
 16 | 
 17 | 
 18 | def generate_Tensor(mention, entity, mention2, entity2, mention3, entity3, max_len, task_n):
 19 |     lstm_dir = "Description1502868912" if task_n == 1 else "Operation1502954903"  # 0810数据集 10 + 20
 20 |     # lstm_dir = "Description1501554142" if task_n == 1 else "Operation1501588184"  # 旧数据集 8万 LEN = 10
 21 |     bilstm_dir = os.path.join("./Sentence_Modeling/Exp/runs", lstm_dir)
 22 | 
 23 |     men_arr = np.concatenate((mention, mention2, mention3))
 24 |     entity_arr = np.concatenate((entity, entity2, entity3))
 25 |     tensor = Tensor(men_arr, entity_arr, len(men_arr), max_len, task_n, bilstm_dir).get_tensor()
 26 |     tensor = tensor.transpose((0, 2, 3, 1))
 27 | 
 28 |     g1 = len(mention)
 29 |     g2 = len(np.concatenate((mention, mention2)))
 30 |     return tensor[:g1], tensor[g1:g2], tensor[g2:]
 31 | 
 32 | 
 33 | def generate_Tensor_no_dev(mention, entity, mention3, entity3, max_len, task_n):
 34 |     lstm_dir = "Description1503482587" if task_n == 1 else "Operation1503500979"  # 0823
 35 |     # lstm_dir = "Description1503276064" if task_n == 1 else "Operation1503277387"  # 0816
 36 |     # lstm_dir = "Description1503227493" if task_n == 1 else "Operation1502964494"  #
 37 |     # lstm_dir = "Description1502964352" if task_n == 1 else "Operation1502964494"  # 0810数据集 10 + 20
 38 |     print lstm_dir
 39 |     bilstm_dir = os.path.join("./Sentence_Modeling/Exp0823/runs", lstm_dir)
 40 | 
 41 |     men_arr = np.concatenate((mention, mention3))
 42 |     entity_arr = np.concatenate((entity, entity3))
 43 |     tensor = Tensor(men_arr, entity_arr, len(men_arr), max_len, task_n, bilstm_dir).get_tensor()
 44 |     tensor = tensor.transpose((0, 2, 3, 1))
 45 | 
 46 |     g1 = len(mention)
 47 |     return tensor[:g1], tensor[g1:]
 48 | 
 49 | 
 50 | def prepara_dynamic_tensor(inputH, training_path, dev_path, test_path, max_len, max_len2):
 51 |     indi1, x1_train, x2_train, x3_train, x4_train, y_train, y2_train = inputH.getTsvTestData_Mul_Labels_Dyna(
 52 |         training_path, "\t", max_len)
 53 |     indi2, x1_dev, x2_dev, x3_dev, x4_dev, y_dev, y2_dev = inputH.getTsvTestData_Mul_Labels_Dyna(dev_path, "\t",
 54 |                                                                                                  max_len)
 55 |     indi3, x1_test, x2_test, x3_test, x4_test, y_test, y2_test = inputH.getTsvTestData_Mul_Labels_Dyna(test_path, "\t",
 56 |                                                                                                        max_len)
 57 | 
 58 |     print("Finished Loading")
 59 |     x_train_tensor, x_dev_tensor, x_test_tensor = generate_Tensor(x1_train, x2_train, x1_dev, x2_dev, x1_test, x2_test,
 60 |                                                                   max_len, 1)
 61 |     print("Finished constructing tensors!")
 62 | 
 63 |     dir_t = "./0816/"
 64 |     np.save(dir_t + "train_des", x_train_tensor)
 65 |     np.save(dir_t + "dev_des", x_dev_tensor)
 66 |     np.save(dir_t + "test_des", x_test_tensor)
 67 |     print("Save description tensors!")
 68 | 
 69 |     x_train_tensor_o, x_dev_tensor_o, x_test_tensor_o = generate_Tensor(x3_train, x4_train, x3_dev, x4_dev, x3_test,
 70 |                                                                         x4_test, max_len2, 2)
 71 | 
 72 |     np.save(dir_t + "train_opr", x_train_tensor_o)
 73 |     np.save(dir_t + "dev_opr", x_dev_tensor_o)
 74 |     np.save(dir_t + "test_opr", x_test_tensor_o)
 75 | 
 76 |     np.save(dir_t + "train_indi_opr", indi1)
 77 |     np.save(dir_t + "dev_indi_opr", indi2)
 78 |     np.save(dir_t + "test_indi_opr", indi3)
 79 | 
 80 | def prepara_tensor_y_seperate(inputH, data_file, data_file_val, data_file_test, sep, max_len, name, task_num):
 81 |     x_train_mention, x_train_entity, y_train = inputH.getTsvTestData(data_file, sep, max_len, y_value=False)
 82 |     # x_dev_mention, x_dev_entity, y_dev = inputH.getTsvTestData(data_file_val, sep, max_len, y_value=False)
 83 |     x_test_mention, x_test_entity, y_test = inputH.getTsvTestData(data_file_test, sep, max_len, y_value=False)
 84 | 
 85 |     print("Finished Loading")
 86 |     # x_train_tensor, x_dev_tensor, x_test_tensor = generate_Tensor(x_train_mention, x_train_entity, x_dev_mention,
 87 |     #                                                               x_dev_entity, x_test_mention, x_test_entity,
 88 |     #                                                               max_len,  task_num)
 89 |     x_train_tensor, x_test_tensor = generate_Tensor_no_dev(x_train_mention, x_train_entity,
 90 |                                                                          x_test_mention, x_test_entity,
 91 |                                                                          max_len, task_num)
 92 |     print("Finished constructing tensors!")
 93 | 
 94 |     print("Length")
 95 |     print(len(x_train_mention))
 96 |     mydir = "Tensor_files/0823/No_IDF/Length" + str(max_len) + "/"
 97 |     np.save(mydir + "train_" + name, x_train_tensor)
 98 |     # np.save(mydir + "dev_" + name, x_dev_tensor)
 99 |     np.save(mydir + "test_" + name, x_test_tensor)
100 |     print("Save tensors!")
101 | 
102 | 
103 | def main():
104 |     # Load data
105 |     print("Loading data...")
106 |     inputH = InputHelper()
107 | 
108 |     task_num = 1
109 |     name = "des" if task_num == 1 else "opr"
110 | 
111 |     # train_f = "./data/exp0803/training_dynamic_data.txt"
112 |     # dev_f = "./data/exp0803/validation_dynamic_data.txt"
113 |     # test_f = "./data/exp0803/test_dynamic_data.txt"
114 |     # prepara_dynamic_tensor(inputH, train_f, dev_f, test_f, FLAGS.max_sequence_len, FLAGS.max_sequence_len2)
115 | 
116 |     time_gen = "0823"
117 |     data_file = os.path.join(FLAGS.train_dir, "data/train_data_" + time_gen + "_" + name + ".txt")
118 |     data_file_test = os.path.join(FLAGS.train_dir, "data/test_data_" + time_gen + "_" + name + ".txt")
119 |     data_file_val = data_file_test
120 |     # data_file_val = os.path.join(FLAGS.train_dir, "data/validation_data_" + time_gen + "_" + name + ".txt")
121 |     prepara_tensor_y_seperate(inputH, data_file, data_file_val, data_file_test, "\t", FLAGS.max_sequence_len, name,
122 |                               task_num)
123 | 
124 | 
125 | if __name__ == '__main__':
126 |     main()
127 | 


--------------------------------------------------------------------------------
/util/util.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # coding=utf-8
  3 | import re
  4 | import numpy as np
  5 | 
  6 | 
  7 | def write_evaluation_file(eval_file, right_output_file, wrong_output_file, labels, predictions, mentions, entities,
  8 |                           indi=None):
  9 |     tp, fp, tn, fn = 0, 0, 0, 0
 10 |     length = len(predictions)
 11 |     right_output_file.write("mention\tentity\tprediction\tlabel\n")
 12 |     wrong_output_file.write("mention\tentity\tprediction\tlabel\n")
 13 | 
 14 |     for i in range(length):
 15 |         if indi != None and indi[i] == 0:
 16 |             continue
 17 |         str_m = mentions[i] + "\t" + entities[i] + "\t" + str(predictions[i]) + "\t" + str(labels[i]) + "\n"
 18 |         if predictions[i] == 1 and labels[i] == 1:
 19 |             tp += 1.0
 20 |             right_output_file.write(str_m)
 21 |         elif predictions[i] == 1 and labels[i] == 0:
 22 |             fn += 1.0
 23 |             wrong_output_file.write(str_m)
 24 |         elif predictions[i] == 0 and labels[i] == 1:
 25 |             fp += 1.0
 26 |             wrong_output_file.write(str_m)
 27 |         else:
 28 |             tn += 1.0
 29 |             right_output_file.write(str_m)
 30 | 
 31 |     eval_file.write("True positive: " + str(tp) + "\n")
 32 |     eval_file.write("False positive: " + str(fp) + "\n")
 33 |     eval_file.write("True negative: " + str(tn) + "\n")
 34 |     eval_file.write("False negative: " + str(fn) + "\n")
 35 | 
 36 |     precision = tp / (tp + fp)
 37 |     recall = tp / (tp + fn) if tp + fn != 0.0 else 0.0001
 38 |     f1 = 2 * precision * recall / (recall + precision)
 39 | 
 40 |     eval_file.write("Precision:" + str(precision) + "\n")
 41 |     eval_file.write("Recall:" + str(recall) + "\n")
 42 |     eval_file.write("F1:" + str(f1) + "\n\n")
 43 | 
 44 | 
 45 | def write_evaluation_file_multi(eval_file, right_output_file, wrong_output_file, labels, predictions, labels2,
 46 |                                 predictions2, mentions1, entities1, mentions2, entities2):
 47 |     tp, fp, tn, fn = 0, 0, 0, 0
 48 |     length = len(predictions)
 49 |     right_output_file.write("mention_description\tentity_description\tprediction_description\tlabel"
 50 |                             "\tmention_operation\tentity_operation\tprediction_operation\tlabel \n")
 51 |     wrong_output_file.write("mention_description\tentity_description\tprediction_description\tlabel"
 52 |                             "\tmention_operation\tentity_operation\tprediction_operation\tlabel \n")
 53 | 
 54 |     for i in range(length):
 55 |         str_m = mentions1[i] + "\t" + entities1[i] + "\t" + str(predictions[i]) + "\t" + str(labels[i]) + "\t" + \
 56 |                 mentions2[i] + "\t" + entities2[i] + "\t" + str(predictions2[i]) + "\t" + str(labels2[i]) + "\n"
 57 |         if predictions[i] == 1 and labels[i] == 1:
 58 |             tp += 1.0
 59 |             right_output_file.write(str_m)
 60 |         elif predictions[i] == 1 and labels[i] == 0:
 61 |             fn += 1.0
 62 |             wrong_output_file.write(str_m)
 63 |         elif predictions[i] == 0 and labels[i] == 1:
 64 |             fp += 1.0
 65 |             wrong_output_file.write(str_m)
 66 |         else:
 67 |             tn += 1.0
 68 |             right_output_file.write(str_m)
 69 | 
 70 |         eval_file.write("True positive: " + str(tp) + "\n")
 71 |         eval_file.write("False positive: " + str(fp) + "\n")
 72 |         eval_file.write("True negative: " + str(tn) + "\n")
 73 |         eval_file.write("False negative: " + str(fn) + "\n")
 74 | 
 75 |         precision = tp / (tp + fp)
 76 |         recall = tp / (tp + fn)
 77 |         f1 = 2 * precision * recall / (recall + precision)
 78 | 
 79 |         eval_file.write("Precision:" + str(precision) + "\n")
 80 |         eval_file.write("Recall:" + str(recall) + "\n")
 81 |         eval_file.write("F1:" + str(f1) + "\n")
 82 | 
 83 | 
 84 | def preprocess_unit(str):
 85 |     res_0 = re.sub(ur"\w+", '', str)
 86 |     res_0 = re.sub(ur"[-（ ）\( \)， \.；;、：° \s+ \*\[ \] \+ ？? \,]", '', res_0)
 87 | 
 88 |     # res_0 = str.replace('&nbsp;', '')
 89 |     # res_0 = re.sub(ur"\u3000", '', res_0)  # 将中文的空格用英文空格代替，后面可以处理
 90 | 
 91 |     # res_0 = re.sub(ur"\[?[（]?\w+[.]?\w+\]?[）]?$", '', res_0)  # 去除掉ICD编码 eg:I20.222
 92 |     #
 93 |     # res_0 = re.sub(r"\w\d+.\d+", '', res_0)  # 去除掉ICD编码 eg:I20.222
 94 |     # res_0 = re.sub(r"\w\d+.?x+\d+$", '', res_0)  # 去除掉尾部的编码 eg:I20.x222
 95 |     #
 96 |     # res_0 = re.sub(r"\s\w+", "", res_0)  # 去掉空格后的字母，eg: 心肌梗塞急性 NOS
 97 |     # res_0 = re.sub(ur"\[\w+\]", "", res_0).strip()  # 去掉括号中的字母解释，eg: [NSSMD]
 98 |     # res_0 = re.sub(ur"（\w+）", "", res_0).strip()  # 去掉括号中的字母解释，eg: （NSSMD）
 99 |     # res_0 = re.sub(ur"\(\w+\)", "", res_0).strip()  # 去掉括号中的字母解释，eg: (NSSMD)
100 |     #
101 |     # res = re.split(ur"[（ ）\( \)， \.；;、：° \s+ \*\[ \] \+ ？? \,]", res_0)
102 |     # res = filter(lambda x: len(x) != 1 and len(x) != 0, res)
103 |     #
104 |     # return "".join(res)
105 |     return res_0
106 | 
107 | 
108 | def preprocess_arr(arr):
109 |     res = []
110 |     for i in arr:
111 |         res.append(preprocess_unit(i.decode("utf-8")))
112 |     return res
113 | 
114 | 
115 | def loadIDFModel(file_path):
116 |     file = open(file_path)
117 |     idfModel = dict()
118 |     for line in file.readlines():
119 |         segs = line.strip().split(":")
120 |         name, idf = segs[0].decode("utf-8"), float(segs[1])
121 |         idfModel[name] = idf
122 |     return idfModel
123 | 
124 | 
125 | def load_data(path):
126 |     file = open(path)
127 |     line = file.readline()
128 |     res = []
129 |     y = []
130 | 
131 |     while line != "":
132 |         tmp = line.split(",")
133 |         res.append([tmp[0], tmp[1]])
134 |         y.append(tmp[2])
135 |         line = file.readline()
136 |     file.close()
137 |     return res, y
138 | 
139 | 
140 | def loadWord2Vec(filename):
141 |     vocab = []
142 |     embd = []
143 |     file = open(filename, 'r')
144 |     line = file.readline()
145 | 
146 |     while line != "":
147 |         line = file.readline()  # jump the first line
148 |         if line == "":
149 |             break
150 |         row = line.strip().split(' ')
151 |         vocab.append(row[0])
152 |         embedding_float = [float(i) for i in row[1:]]
153 |         embd.append(embedding_float)
154 |     print('Loaded Word2vec model!')
155 |     file.close()
156 |     return vocab, embd
157 | 
158 | 
159 | def getEmbedding(filename):
160 |     vocab, embd = loadWord2Vec(filename)
161 | 
162 |     # add unknown symbol
163 |     vocab.append("<UNK>")
164 |     embd.append([0.0] * 100)
165 | 
166 |     vocab_size = len(vocab)
167 |     embedding_dim = len(embd[0])
168 |     embedding = np.asarray(embd)
169 |     return vocab, vocab_size, embedding_dim, embedding
170 | 


--------------------------------------------------------------------------------
/MultiGran_Model.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # encoding=utf-8
  3 | import numpy as np
  4 | import tensorflow as tf
  5 | 
  6 | 
  7 | class MultiGranModel(object):
  8 |     def _conv(self, name, in_, ksize, reuse=False):
  9 |         num_filters = ksize[3]
 10 | 
 11 |         with tf.variable_scope(name, reuse=reuse) as scope:
 12 |             # different CNN for different views
 13 |             # W = tf.get_variable("weights", ksize, initializer=tf.contrib.layers.xavier_initializer())
 14 |             W = tf.Variable(tf.truncated_normal(ksize, stddev=0.1), name="W")
 15 |             biases = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
 16 | 
 17 |             # same CNN for different views
 18 |             # W = tf.get_variable("weights", ksize, initializer=tf.contrib.layers.xavier_initializer())
 19 |             # W = tf.get_variable("weights", ksize, initializer=tf.truncated_normal_initializer(stddev=0.1))
 20 |             # biases = tf.get_variable("biases", [num_filters], initializer=tf.constant_initializer(0.1))
 21 | 
 22 |             conv = tf.nn.conv2d(in_, W, strides=[1, 1, 1, 1], padding="VALID")
 23 |             h = tf.nn.relu(tf.nn.bias_add(conv, biases), name=scope.name)
 24 | 
 25 |         return h
 26 | 
 27 |     def _maxpool(self, name, in_, ksize, strides):
 28 |         pool = tf.nn.max_pool(in_, ksize=ksize, strides=strides, padding='VALID', name=name)
 29 |         print name, pool.get_shape().as_list()
 30 |         return pool
 31 | 
 32 |     def __init__(self, max_len, filter_sizes, pool_sizes, num_filters, l2_reg_lambda=0.0, type_CNN=2):
 33 |         channel_num = 4
 34 | 
 35 |         # Placeholders for input, output and dropout
 36 |         self.input_tensor = tf.placeholder(tf.float32, [None, max_len, max_len, channel_num], name="input_tensor")
 37 |         self.input_y = tf.placeholder(tf.float32, [None, 2], name="input_y")
 38 |         self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
 39 | 
 40 |         # Keeping track of l2 regularization loss (optional)
 41 |         l2_loss = tf.constant(0.0)
 42 | 
 43 |         # Create a convolution + maxpool layer for each filter size
 44 |         pooled_outputs = []
 45 | 
 46 |         input_tensor = tf.expand_dims(self.input_tensor, 4)  # N x W x H x V  => N x W x H x V x C
 47 |         input_tensor = tf.transpose(input_tensor, perm=[3, 0, 1, 2, 4])  # N x W x H x V x C =>  V x N x W x H x C
 48 | 
 49 |         if type_CNN == 1:
 50 |             filter_shape1 = [filter_sizes[0], filter_sizes[1], 4, num_filters / 2]
 51 |             p_size1 = [1, 2, 2, 1]
 52 |             filter_shape2 = [filter_sizes[2], filter_sizes[3], num_filters / 2, num_filters]
 53 |             p_size2 = [1, 2, 2, 1]
 54 | 
 55 |             conv1 = self._conv("conv1", self.input_tensor, filter_shape1)
 56 |             pool1 = self._maxpool('pool1', conv1, ksize=p_size1, strides=[1, 1, 1, 1])
 57 |             conv2 = self._conv('conv2', pool1, filter_shape2)
 58 |             pool2 = self._maxpool('pool2', conv2, ksize=p_size2, strides=[1, 1, 1, 1])
 59 | 
 60 |             dim = np.prod(pool2.get_shape().as_list()[1:])
 61 |             y = tf.reshape(pool2, [-1, dim])
 62 |         else:
 63 |             for i in range(channel_num):
 64 |                 # set reuse True for i > 0, for weight-sharing
 65 |                 reuse_f = (i != 0)
 66 |                 view = tf.gather(input_tensor, i)  # N x W x H x C
 67 | 
 68 |                 filter_shape1 = [filter_sizes[0], filter_sizes[1], 1, num_filters / 2]
 69 |                 p_size1 = [1, pool_sizes[0], pool_sizes[1], 1]
 70 | 
 71 |                 conv1 = self._conv('conv1', view, filter_shape1, reuse=reuse_f)
 72 |                 pool1 = self._maxpool('pool1', conv1, ksize=p_size1, strides=[1, 1, 1, 1])
 73 | 
 74 |                 if len(filter_sizes) == 4:
 75 |                     filter_shape2 = [filter_sizes[2], filter_sizes[3], num_filters / 2, num_filters]
 76 |                     p_size2 = [1, pool_sizes[2], pool_sizes[3], 1]
 77 | 
 78 |                     conv2 = self._conv('conv2', pool1, filter_shape2, reuse=reuse_f)
 79 |                     pool2 = self._maxpool('pool2', conv2, ksize=p_size2, strides=[1, 1, 1, 1])
 80 | 
 81 |                     dim = np.prod(pool2.get_shape().as_list()[1:])
 82 |                     reshape = tf.reshape(pool2, [-1, dim])
 83 |                 else:
 84 |                     dim = np.prod(pool1.get_shape().as_list()[1:])
 85 |                     reshape = tf.reshape(pool1, [-1, dim])
 86 | 
 87 |                 pooled_outputs.append(reshape)
 88 | 
 89 |             with tf.name_scope("view_pooling"):
 90 |                 x = tf.stack(pooled_outputs)  # 4 * N * dim
 91 |                 x = tf.transpose(x, perm=[1, 2, 0])  # N * dim * 4
 92 |                 reshape = tf.reshape(x, [-1, 4]) # (N * dim) * 4
 93 |                 print reshape.get_shape().as_list()
 94 | 
 95 |                 Weights = tf.Variable(tf.truncated_normal([4, 1], 0, 0.1), name="W")
 96 | 
 97 |                 y = tf.matmul(reshape, Weights, name="view_pooling")
 98 |                 y = tf.reshape(y, [-1, dim])
 99 |                 print y.get_shape().as_list()
100 |                 print("DIM:!" + str(dim))
101 | 
102 |         # Add dropout
103 |         with tf.name_scope("dropout"):
104 |             self.h_drop = tf.nn.dropout(y, self.dropout_keep_prob, name="hidden_output_drop")
105 |             print self.h_drop.get_shape().as_list()
106 | 
107 |         with tf.name_scope("fc1"):
108 |             dim_ = dim / 2
109 |             # dim_ = 100
110 |             # W = tf.get_variable("W", [dim, dim_], initializer=tf.contrib.layers.xavier_initializer())
111 |             W = tf.Variable(name="W", initial_value=tf.truncated_normal(shape=[dim, dim_], stddev=0.1))
112 |             b = tf.Variable(tf.constant(0.1, shape=[dim_]), name="b")
113 | 
114 |             l2_loss += tf.nn.l2_loss(W)
115 |             l2_loss += tf.nn.l2_loss(b)
116 |             self.fc1 = tf.nn.relu(tf.matmul(self.h_drop, W) + b)
117 |             self.fc_drop1 = tf.nn.dropout(self.fc1, self.dropout_keep_prob)
118 | 
119 |         # with tf.name_scope("fc2"):
120 |         #     dim__ = dim_ / 2
121 |         #     # dim_ = 100
122 |         #     W = tf.Variable(name="W", initial_value=tf.truncated_normal(shape=[dim_, dim__], stddev=0.1))
123 |         #     b = tf.Variable(tf.constant(0.1, shape=[dim__]), name="b")
124 |         #
125 |         #     l2_loss += tf.nn.l2_loss(W)
126 |         #     l2_loss += tf.nn.l2_loss(b)
127 |         #     self.fc2 = tf.nn.relu(tf.matmul(self.fc_drop1, W) + b)
128 |         #     self.fc_drop2 = tf.nn.dropout(self.fc2, self.dropout_keep_prob)
129 | 
130 |         # Final (unnormalized) scores and predictions
131 |         with tf.name_scope("output"):
132 |             # W = tf.get_variable("W_output", [dim_, 2], initializer=tf.contrib.layers.xavier_initializer())
133 |             W = tf.Variable(name="W_output", initial_value=tf.truncated_normal(shape=[dim_, 2], stddev=0.1))
134 |             b = tf.Variable(tf.constant(0.1, shape=[2]), name="b")
135 | 
136 |             l2_loss += tf.nn.l2_loss(W)
137 |             l2_loss += tf.nn.l2_loss(b)
138 |             self.scores = tf.nn.xw_plus_b(self.fc_drop1, W, b, name="scores")
139 |             self.predictions = tf.argmax(self.scores, 1, name="predictions")
140 | 
141 |         # Calculate Mean cross-entropy loss
142 |         with tf.name_scope("loss"):
143 |             losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.scores, labels=self.input_y)
144 |             self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss
145 | 
146 |         # Accuracy
147 |         with tf.name_scope("accuracy"):
148 |             correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
149 |             self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")
150 | 


--------------------------------------------------------------------------------
/Sentence_Modeling/Siamese_network.py:
--------------------------------------------------------------------------------
  1 | import tensorflow as tf
  2 | 
  3 | class SiameseLSTM(object):
  4 |     """
  5 |     A LSTM based deep Siamese network
  6 |     Uses an character embedding layer, followed by a biLSTM and Energy Loss layer.
  7 |     """
  8 | 
  9 |     def __init__(self, sequence_length, vocab_processer, embedding_size, hidden_unit_size, l2_reg_lambda, batch_size,
 10 |                  embedding_arr):
 11 |         # Placeholders for input, output
 12 |         self.input_x1 = tf.placeholder(tf.int32, [None, sequence_length], name="input_x1")
 13 |         self.mask_x1 = tf.placeholder(tf.float32, [None, sequence_length], name="mask_x1")
 14 | 
 15 |         self.input_x2 = tf.placeholder(tf.int32, [None, sequence_length], name="input_x2")
 16 |         self.mask_x2 = tf.placeholder(tf.float32, [None, sequence_length], name="mask_x2")
 17 |         self.input_y = tf.placeholder(tf.float32, [None], name="input_y")
 18 | 
 19 |         self.hidden_n = hidden_unit_size
 20 |         self.vocab_processer = vocab_processer
 21 | 
 22 |         # Keeping track of l2 regularization loss (optional)
 23 |         l2_loss = tf.constant(0.0, name="l2_loss")
 24 | 
 25 |         # Embedding layer
 26 |         with tf.name_scope("embedding"):
 27 |             self.W = tf.Variable(tf.constant(embedding_arr, dtype=tf.float32), trainable=True, name="W")
 28 | 
 29 |             self.embedded_chars1 = tf.nn.embedding_lookup(self.W, self.input_x1)
 30 |             self.embedded_chars2 = tf.nn.embedding_lookup(self.W, self.input_x2)
 31 | 
 32 |         # Create a convolution + maxpool layer for each filter size
 33 |         with tf.name_scope("output"):
 34 |             # Siamese Network with the same Bi-LSTM(side, side)
 35 |             # self.out1 = self.BiRNN(self.embedded_chars1, self.mask_x1, "side", embedding_size, sequence_length,
 36 |             #                        reuse_f=None)
 37 |             # self.out2 = self.BiRNN(self.embedded_chars2, self.mask_x2, "side", embedding_size, sequence_length,
 38 |             #                        reuse_f=True)
 39 | 
 40 |             # Siamese Network with different Bi-LSTMs(side1, side2)
 41 |             # out1 shape(None, 20)
 42 |             self.out1 = self.BiRNN(self.embedded_chars1, self.mask_x1, "side1", embedding_size, sequence_length)
 43 |             self.out2 = self.BiRNN(self.embedded_chars2, self.mask_x2, "side2", embedding_size, sequence_length)
 44 | 
 45 |             # cosine distance
 46 |             # normalize_a = tf.nn.l2_normalize(self.out1, 1)
 47 |             # normalize_b = tf.nn.l2_normalize(self.out2, 1)
 48 |             # self.distance = tf.subtract(1.0, abs(
 49 |             #     tf.reduce_sum(tf.multiply(normalize_a, normalize_b), axis=1, name="distance")))
 50 | 
 51 |             # Euclidean distance : distance shape:(None)
 52 | 
 53 |             self.distance = tf.sqrt(tf.reduce_sum(tf.square(tf.subtract(self.out1, self.out2)), 1))
 54 | 
 55 |         with tf.name_scope("sentence_embedding"):
 56 |             self.representation1 = self.get_Representation(self.embedded_chars1, "side1", embedding_size,
 57 |                                                            sequence_length)
 58 |             self.representation2 = self.get_Representation(self.embedded_chars2, "side2", embedding_size,
 59 |                                                            sequence_length)
 60 | 
 61 |             self.representation1 = tf.identity(self.representation1, name="Representation1")
 62 |             self.representation2 = tf.identity(self.representation2, name="Representation2")
 63 | 
 64 |         with tf.name_scope("loss"):
 65 | 
 66 |             self.loss = self.contrastive_loss(self.input_y, self.distance, batch_size)
 67 | 
 68 |         with tf.name_scope("accuracy"):
 69 |             # predict_label = tf.subtract(1.0, tf.round(self.distance))
 70 | 
 71 |             margin = 0.5
 72 |             self.predict_label = tf.cast(tf.less(self.distance, margin), "float32", name="prediction")
 73 |             # predict_label = tf.subtract(tf.Variable(1.0), tf.round(self.distance))
 74 | 
 75 |             correct_predictions = tf.equal(self.predict_label, self.input_y)
 76 |             self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float32"), name="accuracy")
 77 | 
 78 |     def BiRNN(self, x, mask, scope, embedding_size, sequence_length, reuse_f=None):
 79 |         n_input = embedding_size
 80 |         n_steps = sequence_length
 81 | 
 82 |         x = tf.transpose(x, [1, 0, 2])  # (batch_size, n_steps, n_input) => (n_steps, batch_size, n_input)
 83 |         # Reshape to (n_steps*batch_size, n_input)
 84 |         x = tf.reshape(x, [-1, n_input])
 85 |         # Split to get a list of 'n_steps' tensors of shape (batch_size, n_input)
 86 |         x = tf.split(x, n_steps, 0)
 87 | 
 88 |         with tf.name_scope("fw" + scope), tf.variable_scope("fw" + scope, reuse=reuse_f):
 89 |             print(tf.get_variable_scope().name)
 90 |             fw_cell = tf.contrib.rnn.BasicLSTMCell(self.hidden_n, forget_bias=1.0, state_is_tuple=True)
 91 | 
 92 |         with tf.name_scope("bw" + scope), tf.variable_scope("bw" + scope, reuse=reuse_f):
 93 |             print(tf.get_variable_scope().name)
 94 |             bw_cell = tf.contrib.rnn.BasicLSTMCell(self.hidden_n, forget_bias=1.0, state_is_tuple=True)
 95 | 
 96 |         with tf.name_scope("fwbw" + scope), tf.variable_scope("fwbw" + scope, reuse=reuse_f):
 97 |             outputs, _, output_state_bw = tf.contrib.rnn.static_bidirectional_rnn(fw_cell, bw_cell, x, dtype=tf.float32)
 98 | 
 99 |         outputs = tf.stack(outputs)  # list of Tensor(None*(2*hidden)) => steps * None * (2*hidden)
100 | 
101 |         outputs = tf.transpose(outputs, [1, 0, 2])  # steps * None * (2*hidden) => None * steps * (2 * hidden)
102 |         print(outputs.get_shape().as_list())
103 | 
104 |         # Use mask
105 |         outputs = outputs * mask[:, :, None]
106 |         # mean pooling to get the vector
107 |         x = tf.reduce_sum(mask, 1)[:, None]
108 |         outputs = tf.reduce_sum(outputs, 1) / x
109 |         print(outputs.get_shape().as_list())
110 |         return outputs
111 | 
112 |     def get_Representation(self, x, scope, embedding_size, sequence_length):
113 |         n_input = embedding_size
114 |         n_steps = sequence_length
115 | 
116 |         x = tf.transpose(x, [1, 0, 2])
117 |         # Reshape to (n_steps*batch_size, n_input)
118 |         x = tf.reshape(x, [-1, n_input])
119 |         # Split to get a list of 'n_steps' tensors of shape (batch_size, n_input)
120 |         x = tf.split(x, n_steps, 0)
121 | 
122 |         with tf.name_scope("fw" + scope), tf.variable_scope("fw" + scope, reuse=True):
123 |             print(tf.get_variable_scope().name)
124 |             fw_cell = tf.contrib.rnn.BasicLSTMCell(self.hidden_n, forget_bias=1.0, state_is_tuple=True)
125 |         with tf.name_scope("bw" + scope), tf.variable_scope("bw" + scope, reuse=True):
126 |             print(tf.get_variable_scope().name)
127 |             bw_cell = tf.contrib.rnn.BasicLSTMCell(self.hidden_n, forget_bias=1.0, state_is_tuple=True)
128 |         with tf.name_scope("fwbw" + scope), tf.variable_scope("fwbw" + scope, reuse=True):
129 |             # Outputs list contains the depth-concatenated fw and bw vectors for each input.
130 |             # output shape -- [time][batch][cell_fw.output_size + cell_bw.output_size]
131 |             outputs, _, output_state_bw = tf.contrib.rnn.static_bidirectional_rnn(fw_cell, bw_cell, x, dtype=tf.float32)
132 |         return outputs
133 | 
134 |     def contrastive_loss(self, y, d, batch_size):
135 |         tmp = y * tf.square(d)
136 |         margin = 1
137 |         tmp2 = (1 - y) * tf.square(tf.maximum((margin - d), 0))
138 |         return tf.reduce_sum(tmp + tmp2) / batch_size / 2
139 | 
140 |         # dis = tf.subtract(tf.Variable(1.0), d)
141 |         # tmp = tf.square(dis) / 4
142 |         # margin = 0.4
143 |         # tmp2 = tf.multiply(tf.cast(tf.less(margin, d), "float"), tf.square(d))
144 |         # return tf.reduce_sum(y * tmp + (1 - y) * tmp2)
145 | 


--------------------------------------------------------------------------------
/tensor.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # coding=utf-8
  3 | import gensim, jieba, os
  4 | from scipy import spatial
  5 | from util.util import preprocess_arr
  6 | import numpy as np
  7 | import tensorflow as tf
  8 | 
  9 | from util.input_helpers import InputHelper
 10 | from util.preprocess import MyVocabularyProcessor
 11 | from util.util import loadIDFModel
 12 | 
 13 | 
 14 | class Tensor(object):
 15 |     def __init__(self, m, e, batch_size, sequence_length, task_num, lstm_dir):
 16 |         """
 17 |         initialize tensors
 18 |         :param m: mention list
 19 |         :param e: entity list
 20 |         :param batch_size: batch_size
 21 |         :param sequence_length: default sentence length
 22 |         :param task_num: the type of task(1--description, 2--operation)
 23 |         """
 24 |         channel_num = 4
 25 |         if task_num == 1:  # task1-description disambiguation
 26 |             word2vec_dir = "./data/word2vec"
 27 |             self.idfModel_file = "./data/description_idf.txt"
 28 |             # self.idfModel_file = "./data/idfModel.txt"
 29 | 
 30 |         else:  # task2-operation disambiguation
 31 |             word2vec_dir = "./data/operation"
 32 |             self.idfModel_file = "./data/operation_idf.txt"
 33 |             # self.idfModel_file = "./data/idfModel_operation.txt"
 34 | 
 35 |         character_model_file = os.path.join(word2vec_dir, 'character.model')
 36 |         word_mode_file = os.path.join(word2vec_dir, 'word.model')
 37 |         self.bilstm_dir = lstm_dir
 38 | 
 39 |         self.mentions = preprocess_arr(m)
 40 |         self.entities = preprocess_arr(e)
 41 |         self.sequence_length = sequence_length
 42 |         self.batch_size = batch_size
 43 | 
 44 |         self.character_embedding_model = gensim.models.Word2Vec.load(character_model_file)
 45 |         self.word_embedding_model = gensim.models.Word2Vec.load(word_mode_file)
 46 | 
 47 |         self.tensor = np.zeros(shape=(batch_size, channel_num, sequence_length, sequence_length))
 48 |         self.init_matrices()
 49 | 
 50 |     def init_matrices(self):
 51 |         """
 52 |         initialize four matrices in the tensor
 53 |         :return:
 54 |         """
 55 |         sentence_embedding_m, sentence_embedding_e = self.getSentence_Embedding(self.mentions, self.entities,
 56 |                                                                                 self.sequence_length)
 57 |         np.save("no_att_sentece_m_opr.npy", sentence_embedding_m)
 58 |         np.save("no_att_sentece_e_opr.npy", sentence_embedding_e)
 59 |         # sentence_embedding_m = np.load("0823_sentence_m.npy")
 60 |         # sentence_embedding_e = np.load("0823_sentence_e.npy")
 61 | 
 62 |         print("Sentence Embedding Finished!")
 63 | 
 64 |         for sample_index in range(self.batch_size):
 65 |             len_mention = len(self.mentions[sample_index].decode("utf-8"))
 66 |             len_entity = len(self.entities[sample_index].decode("utf-8"))
 67 | 
 68 |             # for word matching
 69 |             words_m = list(jieba.cut(self.mentions[sample_index]))
 70 |             words_e = list(jieba.cut(self.entities[sample_index]))
 71 |             len_w_m = len(words_m)
 72 |             len_w_e = len(words_e)
 73 | 
 74 |             for i in range(len_w_m):
 75 |                 for j in range(len_w_e):
 76 | 
 77 |                     words_sim = 1 - spatial.distance.cosine(
 78 |                         self.get_embedding(words_m[i], self.word_embedding_model),
 79 |                         self.get_embedding(words_e[j], self.word_embedding_model))
 80 | 
 81 |                     # assign the word_pair_sim to the character_pairs which construct the words
 82 |                     for character_i in words_m[i]:
 83 |                         for character_j in words_e[j]:
 84 |                             self.tensor[sample_index][2][
 85 |                                 self.mentions[sample_index].index(character_i), self.entities[sample_index].index(
 86 |                                     character_j)] = words_sim
 87 | 
 88 |             for i in range(len_mention):
 89 |                 for j in range(len_entity):
 90 |                     # for sentence matching
 91 |                     self.tensor[sample_index][3][i][j] = 1 - spatial.distance.cosine(
 92 |                         sentence_embedding_m[sample_index][i], sentence_embedding_e[sample_index][j])
 93 | 
 94 |                     # for string matching
 95 |                     if self.mentions[sample_index][i] == self.entities[sample_index][j]:
 96 |                         self.tensor[sample_index][0][i][j] = 1
 97 | 
 98 |                     # for character matching
 99 |                     character_embedding_mi = self.get_embedding(self.mentions[sample_index][i],
100 |                                                                 self.character_embedding_model)
101 |                     character_embedding_ei = self.get_embedding(self.entities[sample_index][j],
102 |                                                                 self.character_embedding_model)
103 |                     self.tensor[sample_index][1][i][j] = 1 - spatial.distance.cosine(character_embedding_ei,
104 |                                                                                      character_embedding_mi)
105 | 
106 |         print("Tensor Completed!")
107 | 
108 |     def get_tensor(self):
109 |         return self.tensor
110 | 
111 | 
112 |     def get_embedding(self, word, model):
113 |         if word in model.wv.vocab.keys():
114 |             index = model.wv.index2word.index(word)
115 |             return model.wv.syn0[index]
116 |             # return model.wv.vocab[word]
117 |         else:
118 |             vector_length = 100
119 |             return np.ones([vector_length])
120 | 
121 | 
122 |     def getIDFWeights(self, x_names, x_index, vocab_id_w, idfModel):
123 |         res = []  # idf_weights
124 |         length = len(x_names)
125 | 
126 |         for i in range(length):
127 |             name = x_names[i]
128 |             index = x_index[i]
129 |             character_idfs = dict()
130 |             default_idf = 1  # for unknown word, assign idf=1
131 | 
132 |             name_segs = list(jieba.cut(name))
133 | 
134 |             for name in name_segs:
135 |                 for character in name:
136 |                     if name in idfModel.keys():
137 |                         character_idfs[character] = idfModel[name]
138 |                     else:
139 |                         character_idfs[character] = default_idf
140 | 
141 |             weight_per_name = []
142 |             for character_index in index:
143 |                 character = vocab_id_w[character_index]
144 |                 if character in character_idfs.keys():
145 |                     weight_per_name.append(character_idfs[character])
146 |                 else:
147 |                     weight_per_name.append(default_idf)
148 |             res.append(weight_per_name)
149 |             res_arr = np.asarray(res)
150 | 
151 |             # *** normalize the idf weights
152 |             row_sums = res_arr.sum(axis=1)
153 |             res_arr = res_arr / row_sums[:, np.newaxis]
154 |         return res_arr
155 | 
156 | 
157 |     def getAttention(self, r, x, index, vocab_id_w, idfModel):
158 |         # r(input_size, None, hidden_n * 2) => (None, input_size, hidden_n * 2)
159 |         representation = np.transpose(r, (1, 0, 2))
160 | 
161 |         weights = self.getIDFWeights(x, index, vocab_id_w, idfModel)  # shape: batch_size * sequence_length
162 | 
163 |         # weights transform from 2D to 3 D and then 3D*3D broadcasting
164 |         representation = representation * weights[:, :, np.newaxis]
165 |         return representation
166 | 
167 | 
168 |     def getAttention_M(self, r, m, x, index, vocab_id_w, idfModel):
169 |         # r(input_size, None, hidden_n * 2) => (None, input_size, hidden_n * 2)
170 |         representation = np.transpose(r, (1, 0, 2))
171 |         representation = representation * m[:, :, np.newaxis]
172 | 
173 |         weights = self.getIDFWeights(x, index, vocab_id_w, idfModel)  # shape: batch_size * sequence_length
174 | 
175 |         # weights transform from 2D to 3 D and then 3D*3D broadcasting
176 |         representation = representation * weights[:, :, np.newaxis]
177 |         return representation
178 |     def no_attention(self, r, m, x, index, vocab_id_w, idfModel):
179 |         # r(input_size, None, hidden_n * 2) => (None, input_size, hidden_n * 2)
180 |         representation = np.transpose(r, (1, 0, 2))
181 |         representation = representation * m[:, :, np.newaxis]
182 | 
183 |         # weights = self.getIDFWeights(x, index, vocab_id_w, idfModel)  # shape: batch_size * sequence_length
184 | 
185 |         # weights transform from 2D to 3 D and then 3D*3D broadcasting
186 |         # representation = representation * weights[:, :, np.newaxis]
187 |         return representation
188 | 
189 |     def getSentence_Embedding(self, x1, x2, max_document_length):
190 |         checkpoint_dir = os.path.abspath(os.path.join(self.bilstm_dir, "checkpoints"))
191 |         ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
192 |         checkpoint_file = ckpt.model_checkpoint_path
193 | 
194 |         vocab_file = os.path.join(checkpoint_dir, "vocab")
195 | 
196 |         inpH = InputHelper()
197 |         vocab_processor = MyVocabularyProcessor(max_document_length, min_frequency=0)
198 |         vocab_processor = vocab_processor.restore(vocab_file)
199 | 
200 |         tmp = []
201 |         (x1_index, x2_index, mask_x1, mask_x2, tmp) = inpH.get_data(vocab_processor, x1, x2, tmp,
202 |                                                                     max_document_length)
203 |         # x1_index, x2_index = inpH.toVocabularyIndexVector(x1, x2, vocab_file, max_document_length)
204 | 
205 |         idfModel = loadIDFModel(self.idfModel_file)
206 | 
207 |         # # load vocabulary model
208 |         # vocab_processor = MyVocabularyProcessor(max_document_length, min_frequency=0)
209 |         # vocab_processor = vocab_processor.restore(vocab_file)
210 | 
211 |         # Extract word:id mapping from the object.
212 |         vocab_dict = vocab_processor.vocabulary_._mapping
213 |         vocab_id_w = dict((y, x) for x, y in vocab_dict.iteritems())
214 | 
215 |         print("\nGenerating Sentence Embedding Result...\n")
216 |         graph = tf.Graph()
217 | 
218 |         with graph.as_default():
219 |             sess = tf.Session()
220 | 
221 |             with sess.as_default():
222 |                 # Load the saved meta graph and restore variables
223 |                 saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
224 |                 sess.run(tf.initialize_all_variables())
225 |                 saver.restore(sess, checkpoint_file)
226 | 
227 |                 # Get the placeholders from the graph by name
228 |                 # the output is a list with only one element
229 |                 input_x1 = graph.get_operation_by_name("input_x1").outputs[0]
230 |                 input_x2 = graph.get_operation_by_name("input_x2").outputs[0]
231 | 
232 |                 sentence_representation1 = graph.get_operation_by_name("sentence_embedding/Representation1").outputs[0]
233 |                 sentence_representation2 = graph.get_operation_by_name("sentence_embedding/Representation2").outputs[0]
234 |                 print "Sentence vector shape after sentence modeling"
235 |                 print sentence_representation2.get_shape().as_list()
236 | 
237 |                 r1, r2 = sess.run([sentence_representation1, sentence_representation2],
238 |                                   {input_x1: x1_index, input_x2: x2_index})
239 | 
240 |                 # Applied Attention_mechanism
241 |                 representation1 = self.getAttention_M(r1, mask_x1, x1, x1_index, vocab_id_w, idfModel)
242 |                 representation2 = self.getAttention_M(r2, mask_x2, x2, x2_index, vocab_id_w, idfModel)
243 |                 # representation1 = self.no_attention(r1, mask_x1, x1, x1_index, vocab_id_w, idfModel)
244 |                 # representation2 = self.no_attention(r2, mask_x2, x2, x2_index, vocab_id_w, idfModel)
245 | 
246 |         return representation1, representation2
247 | 


--------------------------------------------------------------------------------
/Dynamic/MT_Dynamic_MultiGranModel.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # encoding=utf-8
  3 | import numpy as np
  4 | import tensorflow as tf
  5 | 
  6 | 
  7 | class MT_Dynamic_MultiGranModel(object):
  8 |     def _conv(self, name, in_, ksize, reuse=False):
  9 |         num_filters = ksize[3]
 10 | 
 11 |         with tf.variable_scope(name, reuse=reuse) as scope:
 12 |             # same CNN for different views
 13 |             W = tf.get_variable("weights", ksize, initializer=tf.truncated_normal_initializer(stddev=0.1))
 14 |             biases = tf.get_variable("biases", [num_filters], initializer=tf.constant_initializer(0.1))
 15 | 
 16 |             conv = tf.nn.conv2d(in_, W, strides=[1, 1, 1, 1], padding="VALID")
 17 |             h = tf.nn.relu(tf.nn.bias_add(conv, biases), name=scope.name)
 18 | 
 19 |         return h
 20 | 
 21 |     def _maxpool(self, name, in_, ksize, strides):
 22 |         pool = tf.nn.max_pool(in_, ksize=ksize, strides=strides, padding='VALID', name=name)
 23 |         print name, pool.get_shape().as_list()
 24 |         return pool
 25 | 
 26 |     def __init__(self, max_len1, max_len2, filter_sizes, num_filters, l2_reg_lambda=0.0):
 27 |         channel_num = 4
 28 | 
 29 |         # Placeholders for input, output and dropout
 30 |         self.input_tensor = tf.placeholder(tf.float32, [None, max_len1, max_len1, 4], name="input_tensor_description")
 31 |         self.input_tensor_o = tf.placeholder(tf.float32, [None, max_len2, max_len2, 4], name="input_tensor_operation")
 32 | 
 33 |         self.input_y_description = tf.placeholder(tf.float32, [None, 2], name="input_y_description")
 34 |         self.input_y_operation = tf.placeholder(tf.float32, [None, 2], name="input_y_operation")
 35 | 
 36 |         self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
 37 | 
 38 |         self.gamma = tf.placeholder(tf.float32, [None], name="des_mtl_param")
 39 |         self.mask_opr = tf.placeholder(tf.float32, [None], name="mask_opr")
 40 | 
 41 |         self.gamma_ = tf.expand_dims(self.gamma, axis=1)
 42 | 
 43 |         self.matrix = tf.placeholder(tf.float32, [None, 1], name="cooccurence")
 44 | 
 45 |         # Keeping track of l2 regularization loss (optional)
 46 |         l2_loss_d = tf.constant(0.0)
 47 |         l2_loss_operation = tf.constant(0.0)
 48 | 
 49 |         # Create a convolution + maxpool layer for each filter size
 50 |         pooled_outputs = []
 51 |         pooled_outputs_operation = []
 52 | 
 53 |         input_tensor = tf.expand_dims(self.input_tensor, 4)  # N x W x H x V  => N x W x H x V x C
 54 |         input_tensor = tf.transpose(input_tensor,
 55 |                                     perm=[3, 0, 1, 2, 4])  # N x W x H x V x C =>  V x N x W x H x C
 56 | 
 57 |         input_tensor_operation = tf.expand_dims(self.input_tensor_o, 4)  # N x W x H x V  => N x W x H x V x C
 58 |         input_tensor_operation = tf.transpose(input_tensor_operation,
 59 |                                               perm=[3, 0, 1, 2, 4])  # N x W x H x V x C =>  V x N x W x H x C
 60 | 
 61 |         with tf.name_scope("CNN_Description"):
 62 |             for i in range(channel_num):
 63 |                 # set reuse True for i > 0, for weight-sharing
 64 |                 reuse_f = (i != 0)
 65 |                 with tf.variable_scope("CNN_Description", reuse=reuse_f):
 66 |                     view = tf.gather(input_tensor, i)  # N x W x H x C
 67 | 
 68 |                     filter_shape1 = [filter_sizes[0], filter_sizes[0], 1, num_filters / 2]
 69 |                     filter_shape2 = [filter_sizes[1], filter_sizes[1], num_filters / 2, num_filters]
 70 |                     p_size1 = [1, 3, 3, 1]
 71 |                     p_size2 = [1, 5, 5, 1]
 72 | 
 73 |                     conv1 = self._conv('conv1', view, filter_shape1, reuse=reuse_f)
 74 |                     pool1 = self._maxpool('pool1', conv1, ksize=p_size1, strides=[1, 1, 1, 1])
 75 | 
 76 |                     # conv2 = self._conv('conv2', pool1, filter_shape2, reuse=reuse_f)
 77 |                     # pool2 = self._maxpool('pool2', conv2, ksize=p_size2, strides=[1, 1, 1, 1])
 78 | 
 79 |                     dim1 = np.prod(pool1.get_shape().as_list()[1:])
 80 |                     reshape = tf.reshape(pool1, [-1, dim1])
 81 | 
 82 |                     pooled_outputs.append(reshape)
 83 | 
 84 |         with tf.name_scope("CNN_Operation"):
 85 |             for i in range(channel_num):
 86 |                 # set reuse True for i > 0, for weight-sharing
 87 |                 reuse_f = (i != 0)
 88 | 
 89 |                 with tf.variable_scope("CNN_Operation", reuse=reuse_f):
 90 |                     view = tf.gather(input_tensor_operation, i)  # N x W x H x C
 91 | 
 92 |                     filter_shape1 = [filter_sizes[0], filter_sizes[0], 1, num_filters / 2]
 93 |                     filter_shape2 = [filter_sizes[1], filter_sizes[1], num_filters / 2, num_filters]
 94 |                     p_size1 = [1, 2, 2, 1]
 95 |                     p_size2 = [1, 5, 5, 1]
 96 | 
 97 |                     conv1 = self._conv('conv1', view, filter_shape1, reuse=reuse_f)
 98 |                     pool1 = self._maxpool('pool1', conv1, ksize=p_size1, strides=[1, 1, 1, 1])
 99 | 
100 |                     # conv2 = self._conv('conv2', pool1, filter_shape2, reuse=reuse_f)
101 |                     # pool2 = self._maxpool('pool2', conv2, ksize=p_size2, strides=[1, 1, 1, 1])
102 | 
103 |                     dim2 = np.prod(pool1.get_shape().as_list()[1:])
104 |                     reshape = tf.reshape(pool1, [-1, dim2])
105 | 
106 |                     pooled_outputs_operation.append(reshape)
107 | 
108 |         with tf.name_scope("Descriptipn_view_pooling"):
109 |             x = tf.stack(pooled_outputs)  # 4 * N * 7744
110 |             x = tf.transpose(x, perm=[1, 2, 0])  # N * 7744 * 4
111 |             reshape = tf.reshape(x, [-1, 4])
112 |             print reshape.get_shape().as_list()
113 | 
114 |             Weights = tf.Variable(tf.random_uniform([4, 1], 0.0, 1.0), name="W")
115 | 
116 |             y_d = tf.matmul(reshape, Weights, name="view_pooling")
117 |             y_d = tf.reshape(y_d, [-1, dim1])
118 |             print y_d.get_shape().as_list()
119 | 
120 |         with tf.name_scope("Operation_view_pooling"):
121 |             x = tf.stack(pooled_outputs_operation)  # 4 * N * 7744
122 |             x = tf.transpose(x, perm=[1, 2, 0])  # N * 7744 * 4
123 |             reshape = tf.reshape(x, [-1, 4])
124 |             print reshape.get_shape().as_list()
125 | 
126 |             Weights = tf.Variable(tf.random_uniform([4, 1], 0.0, 1.0), name="W")
127 | 
128 |             y_o = tf.matmul(reshape, Weights, name="view_pooling")
129 |             y_o = tf.reshape(y_o, [-1, dim2])
130 |             y_o = y_o * tf.expand_dims(self.mask_opr, axis=1)
131 |             print y_o.get_shape().as_list()
132 | 
133 |         # Add dropout
134 |         with tf.name_scope("dropout"):
135 |             self.h_drop_d = tf.nn.dropout(y_d, self.dropout_keep_prob, name="hidden_output_description_drop")
136 |             self.h_drop_o = tf.nn.dropout(y_o, self.dropout_keep_prob, name="hidden_output_operation_drop")
137 |             print self.h_drop_d.get_shape().as_list()
138 |             print self.h_drop_o.get_shape().as_list()
139 | 
140 |         with tf.name_scope("FC"):
141 |             dim = 100
142 |             W1 = tf.Variable(name="W1", initial_value=tf.truncated_normal(shape=[dim1, dim], stddev=0.1))
143 |             b1 = tf.Variable(tf.constant(0.1, shape=[dim]), name="b1")
144 | 
145 |             self.fc_d = tf.nn.relu(tf.matmul(self.h_drop_d, W1) + b1)
146 |             self.fc_drop_d = tf.nn.dropout(self.fc_d, self.dropout_keep_prob)
147 | 
148 |             W2 = tf.Variable(name="W2", initial_value=tf.truncated_normal(shape=[dim2, dim], stddev=0.1))
149 |             b2 = tf.Variable(tf.constant(0.1, shape=[dim]), name="b2")
150 | 
151 |             self.fc_o = tf.nn.relu(tf.matmul(self.h_drop_o, W2) + b2)
152 |             self.fc_drop_o = tf.nn.dropout(self.fc_o, self.dropout_keep_prob)
153 | 
154 |         # Share Layer Construction
155 |         with tf.name_scope("Multitask"):
156 | 
157 |             layer1 = self.fc_drop_d * self.gamma_
158 |             layer2 = self.fc_drop_o * 0.5
159 |             self.shared_layer = tf.add(layer1, layer2, name="shared_Layer")
160 | 
161 |             print self.shared_layer.get_shape().as_list()
162 | 
163 |             W1 = tf.get_variable(name="tt1_W", shape=[dim],
164 |                                  initializer=tf.truncated_normal_initializer(stddev=0.1))
165 |             W2 = tf.get_variable(name="st1_W", shape=[dim],
166 |                                  initializer=tf.truncated_normal_initializer(stddev=0.1))
167 |             W3 = tf.get_variable(name="st2_W", shape=[dim],
168 |                                  initializer=tf.truncated_normal_initializer(stddev=0.1))
169 |             W4 = tf.get_variable(name="tt2_W", shape=[dim],
170 |                                  initializer=tf.truncated_normal_initializer(stddev=0.1))
171 | 
172 |             self.task1_r = tf.add(tf.multiply(self.shared_layer, W2), tf.multiply(self.fc_drop_d, W1),
173 |                                   name="description_r")
174 |             self.task2_r = tf.add(tf.multiply(self.shared_layer, W3), tf.multiply(self.fc_drop_o, W4),
175 |                                   name="operation_r")
176 |             print self.task1_r.get_shape().as_list()
177 | 
178 |         with tf.name_scope("FC2"):
179 |             W1 = tf.Variable(name="W1", initial_value=tf.truncated_normal(shape=[dim, dim / 2], stddev=0.1))
180 |             b1 = tf.Variable(tf.constant(0.1, shape=[dim / 2]), name="b1")
181 | 
182 |             self.task1_representation = tf.nn.relu(tf.matmul(self.task1_r, W1) + b1)
183 |             self.task1_representation = tf.nn.dropout(self.task1_representation, self.dropout_keep_prob)
184 | 
185 |             W2 = tf.Variable(name="W2", initial_value=tf.truncated_normal(shape=[dim, dim / 2], stddev=0.1))
186 |             b2 = tf.Variable(tf.constant(0.1, shape=[dim / 2]), name="b2")
187 | 
188 |             self.task2_representation = tf.nn.relu(tf.matmul(self.task2_r, W2) + b2)
189 |             self.task2_representation = tf.nn.dropout(self.task2_representation, self.dropout_keep_prob)
190 | 
191 |         # Final (unnormalized) scores and predictions
192 |         with tf.name_scope("output"):
193 |             W_d = tf.get_variable(name="W_d", shape=[dim / 2, 2],
194 |                                   initializer=tf.truncated_normal_initializer(stddev=0.1))
195 |             b_d = tf.Variable(tf.constant(0.1, shape=[2]), name="b_d")
196 | 
197 |             l2_loss_d += tf.nn.l2_loss(W_d)
198 |             l2_loss_d += tf.nn.l2_loss(b_d)
199 | 
200 |             # W_o = tf.Variable(name="W_output_o", initial_value=tf.random_normal([dim, 2], stddev=0.1))
201 |             W_o = tf.get_variable(name="W_o", shape=[dim / 2, 2],
202 |                                   initializer=tf.truncated_normal_initializer(stddev=0.1))
203 |             b_o = tf.Variable(tf.constant(0.1, shape=[2]), name="b_o")
204 | 
205 |             l2_loss_operation += tf.nn.l2_loss(W_o)
206 |             l2_loss_operation += tf.nn.l2_loss(b_o)
207 | 
208 |             self.scores_d = tf.nn.xw_plus_b(self.task1_representation, W_d, b_d, name="scores1")
209 |             self.scores_o = tf.nn.xw_plus_b(self.task2_representation, W_o, b_o, name="scores2")
210 | 
211 |             self.relation_d = tf.nn.softmax(self.scores_d, name="relation1")
212 |             self.relation_o = tf.nn.softmax(self.scores_o, name="relation2")
213 | 
214 |             self.predictions_d = tf.argmax(self.scores_d, 1, name="predictions1")
215 |             self.predictions_o = tf.argmax(self.scores_o, 1, name="predictions2")
216 | 
217 |         # Calculate Mean cross-entropy loss
218 |         with tf.name_scope("loss"):
219 |             losses1 = tf.nn.softmax_cross_entropy_with_logits(logits=self.scores_d, labels=self.input_y_description)
220 |             losses2 = tf.nn.softmax_cross_entropy_with_logits(logits=self.scores_o,
221 |                                                               labels=self.input_y_operation) * self.mask_opr
222 | 
223 |             constraints = self.matrix * tf.reduce_sum(tf.square(self.relation_d - self.relation_o), axis=1)
224 |             self.constraints = tf.identity(constraints, name="constraints")
225 | 
226 |             self.loss = tf.reduce_mean(losses1) + tf.reduce_mean(losses2) + l2_reg_lambda * (
227 |                 l2_loss_d + l2_loss_operation) + tf.reduce_mean(constraints * self.mask_opr)
228 | 
229 |         # Accuracy
230 |         with tf.name_scope("accuracy"):
231 |             correct_predictions_d = tf.cast(tf.equal(self.predictions_d, tf.argmax(self.input_y_description, 1)),
232 |                                             "float")
233 | 
234 |             label_opr = tf.argmax(self.input_y_operation, 1)
235 |             correct_predictions_o = tf.multiply(tf.cast(tf.equal(self.predictions_o, label_opr), "float"),
236 |                                                 self.mask_opr)
237 | 
238 |             self.accuracy_d = tf.reduce_mean(correct_predictions_d, name="accuracy_d")
239 |             self.accuracy_o = tf.div(tf.reduce_sum(correct_predictions_o),
240 |                                      tf.reduce_sum(self.mask_opr), name="accuracy_o")
241 | 


--------------------------------------------------------------------------------
/util/input_helpers.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # coding=utf-8
  3 | 
  4 | import codecs
  5 | import gc
  6 | import sys
  7 | 
  8 | import numpy as np
  9 | 
 10 | from preprocess import MyVocabularyProcessor
 11 | from util import preprocess_unit, preprocess_arr
 12 | 
 13 | reload(sys)
 14 | sys.setdefaultencoding("utf-8")
 15 | 
 16 | 
 17 | class InputHelper(object):
 18 |     def get_data(self, vocab_processor, train_x1, train_x2, train_y, max_document_length):
 19 |         """
 20 |         Use vocab_processor to index mention and entity pairs and then pad them and return mask arrs
 21 |         :param vocab_processor:
 22 |         :param train_x1:
 23 |         :param train_x2:
 24 |         :param train_y:
 25 |         :param max_document_length:
 26 |         :return:
 27 |         """
 28 |         train_x1_i = np.asarray(list(vocab_processor.transform(train_x1)))
 29 |         train_x2_i = np.asarray(list(vocab_processor.transform(train_x2)))
 30 | 
 31 |         mask_train_x1 = np.zeros([len(train_x1_i), max_document_length])
 32 |         mask_train_x2 = np.zeros([len(train_x2_i), max_document_length])
 33 | 
 34 |         new_mask_x1, new_mask_x2 = self.padding_and_generate_mask(train_x1, train_x2, mask_train_x1, mask_train_x2)
 35 |         return (train_x1_i, train_x2_i, new_mask_x1, new_mask_x2, train_y)
 36 | 
 37 |     def padding_and_generate_mask(self, x1, x2, new_mask_x1, new_mask_x2):
 38 |         """
 39 |         Pad the sentence and return mask array for mention and entity pair
 40 |         :param x1:
 41 |         :param x2:
 42 |         :param new_mask_x1:
 43 |         :param new_mask_x2:
 44 |         :return:
 45 |         """
 46 | 
 47 |         for i, (x1, x2) in enumerate(zip(x1, x2)):
 48 |             # whether to remove sentences with length larger than maxlen
 49 |             if len(x1) == 0 or len(x2) == 0:
 50 |                 print("")
 51 |             new_mask_x1[i, 0:len(x1)] = 1.0
 52 |             new_mask_x2[i, 0:len(x2)] = 1.0
 53 |         return new_mask_x1, new_mask_x2
 54 | 
 55 |     def add_y_helper(self, y_value, y_arr, is_positive_label):
 56 |         """
 57 |         add 1/0 or [0,1]/[1, 0] in y_arr which depends on y_value flag
 58 |         :param y_value:
 59 |         :param y_arr:
 60 |         :param is_positive_label:
 61 |         :return:
 62 |         """
 63 |         if y_value == True:
 64 |             if is_positive_label:
 65 |                 y_arr.append(1)
 66 |             else:
 67 |                 y_arr.append(0)
 68 |         else:
 69 |             if is_positive_label:
 70 |                 y_arr.append(np.array([0, 1]))
 71 |             else:
 72 |                 y_arr.append(np.array([1, 0]))
 73 |         return y_arr
 74 | 
 75 |     def batch_iter(self, data, batch_size, num_epochs, shuffle=True):
 76 |         """
 77 |         Generates a batch iterator for a data set.
 78 |         :param data:
 79 |         :param batch_size:
 80 |         :param num_epochs:
 81 |         :param shuffle:
 82 |         :return:
 83 |         """
 84 |         data = np.asarray(data)
 85 |         print(data)
 86 |         print(data.shape)
 87 |         data_size = len(data)
 88 |         num_batches_per_epoch = int(len(data) / batch_size)
 89 | 
 90 |         if shuffle:
 91 |             print "Shuffle!!!!"
 92 |         for epoch in range(num_epochs):
 93 |             # Shuffle the data at each epoch
 94 |             if shuffle:
 95 | 
 96 |                 shuffle_indices = np.random.permutation(np.arange(data_size))
 97 |                 shuffled_data = data[shuffle_indices]
 98 |             else:
 99 |                 shuffled_data = data
100 |             for batch_num in range(num_batches_per_epoch):
101 |                 start_index = batch_num * batch_size
102 |                 end_index = min((batch_num + 1) * batch_size, data_size)
103 |                 yield shuffled_data[start_index:end_index]
104 | 
105 |     def getTestIndexedDataSet(self, data_path, sep, vocab_processor, max_document_length, y_value):
106 |         """
107 |         Read in labeled test data and use previous vocabulary processor to index them
108 |         :param data_path:
109 |         :param sep:
110 |         :param vocab_processor:
111 |         :param max_document_length:
112 |         :param y_value:
113 |         :return:
114 |         """
115 |         x1_temp, x2_temp, y = self.getTsvTestData(data_path, sep, max_document_length, y_value)
116 | 
117 |         x1 = np.asarray(list(vocab_processor.transform(x1_temp)))
118 |         x2 = np.asarray(list(vocab_processor.transform(x2_temp)))
119 |         return x1, x2, y
120 | 
121 |     def toVocabularyIndexVector(self, datax1, datax2, vocab_path, max_document_length):
122 |         """
123 |         Transform the word list to vocabulary_index vectors
124 |         :param datax1:
125 |         :param datax2:
126 |         :param vocab_path:
127 |         :param max_document_length:
128 |         :return:
129 |         """
130 |         # Build vocabulary
131 |         vocab_processor = MyVocabularyProcessor(max_document_length, min_frequency=0)
132 |         vocab_processor = vocab_processor.restore(vocab_path)
133 |         print(len(vocab_processor.vocabulary_))
134 | 
135 |         datax1 = preprocess_arr(datax1)
136 |         datax2 = preprocess_arr(datax2)
137 |         x1 = np.asarray(list(vocab_processor.transform(datax1)))
138 |         x2 = np.asarray(list(vocab_processor.transform(datax2)))
139 |         # Randomly shuffle data
140 |         del vocab_processor
141 |         gc.collect()
142 |         return x1, x2
143 | 
144 |     def getTsvTestData(self, filepath, sep, sequence_length, y_value=False):
145 |         """
146 |         load the data(label, mention, entity) from labeled files
147 |         :param filepath:
148 |         :return:  three lists(label_list, mention_list, entity_list)
149 |         """
150 |         print("Loading testing/labelled data from " + filepath)
151 |         x1, x2 = [], []
152 |         y = []
153 |         line_num = 0
154 |         for line in codecs.open(filepath, "r", "utf-8"):
155 |             line_num += 1
156 |             l = line.strip().split(sep)
157 |             if len(l) < 3:
158 |                 continue
159 | 
160 |             l[1] = preprocess_unit(l[1])
161 |             l[2] = preprocess_unit(l[2])
162 | 
163 |             if len(l[1]) == 0 or len(l[2]) == 0:
164 |                 continue
165 | 
166 |             # truncate when length is bigger than the max_length
167 |             if len(l[1]) > sequence_length or len(l[2]) > sequence_length:
168 |                 l[1] = l[1][:sequence_length]
169 |                 l[2] = l[2][:sequence_length]
170 | 
171 |             x1.append(l[1])
172 |             x2.append(l[2])
173 |             y = self.add_y_helper(y_value, y, int(l[0]) == 1)
174 |             if line_num != len(y):
175 |                 print("ei")
176 |         print(line_num)
177 |         return np.asarray(x1), np.asarray(x2), np.asarray(y)
178 | 
179 |     def getTsvTestData_Mul(self, filepath, sep, sequence_length, y_value=False):
180 |         """
181 |         load the data(label, mention, entity) from labeled mutlti-task files
182 |         :param filepath:
183 |         :return:  three lists(label_list, mention_list, entity_list)
184 |         """
185 |         print("Loading testing/labelled data from " + filepath)
186 |         x1, x2, x3, x4 = [], [], [], []
187 |         y = []
188 |         y2 = []
189 |         for line in codecs.open(filepath, "r", "utf-8"):
190 |             l = line.strip().split(sep)
191 |             if len(l) < 5:
192 |                 continue
193 | 
194 |             l[1] = preprocess_unit(l[1])
195 |             l[2] = preprocess_unit(l[2])
196 |             l[3] = preprocess_unit(l[3])
197 |             l[4] = preprocess_unit(l[4])
198 | 
199 |             # truncate when length is bigger than the max_length
200 |             if len(l[1]) > sequence_length or len(l[2]) > sequence_length or len(l[3]) > sequence_length or len(
201 |                     l[4]) > sequence_length:
202 |                 l[1] = l[1][:sequence_length]
203 |                 l[2] = l[2][:sequence_length]
204 |                 l[3] = l[3][:sequence_length]
205 |                 l[4] = l[4][:sequence_length]
206 | 
207 |             x1.append(l[1])
208 |             x2.append(l[2])
209 |             x3.append(l[3])
210 |             x4.append(l[4])
211 |             y = self.add_y_helper(y_value, y, int(l[0]) == 1)
212 |             y2 = self.add_y_helper(y_value, y2, int(l[0]) == 1)
213 |         return np.asarray(x1), np.asarray(x2), np.asarray(x3), np.asarray(x4), np.asarray(y), np.asarray(y2)
214 | 
215 |     def getTsvTestData_Mul(self, filepath, sep, sequence_length, y_value=False):
216 |         """
217 |         load the data(label, mention, entity) from labeled mutlti-task files
218 |         :param filepath:
219 |         :return:  three lists(label_list, mention_list, entity_list)
220 |         """
221 |         print("Loading testing/labelled data from " + filepath)
222 |         x1, x2, x3, x4 = [], [], [], []
223 |         y = []
224 |         y2 = []
225 |         for line in codecs.open(filepath, "r", "utf-8"):
226 |             l = line.strip().split(sep)
227 |             if len(l) < 5:
228 |                 continue
229 | 
230 |             l[1] = preprocess_unit(l[1])
231 |             l[2] = preprocess_unit(l[2])
232 |             l[3] = preprocess_unit(l[3])
233 |             l[4] = preprocess_unit(l[4])
234 | 
235 |             # truncate when length is bigger than the max_length
236 |             if len(l[1]) > sequence_length or len(l[2]) > sequence_length or len(l[3]) > sequence_length or len(
237 |                     l[4]) > sequence_length:
238 |                 l[1] = l[1][:sequence_length]
239 |                 l[2] = l[2][:sequence_length]
240 |                 l[3] = l[3][:sequence_length]
241 |                 l[4] = l[4][:sequence_length]
242 | 
243 |             x1.append(l[1])
244 |             x2.append(l[2])
245 |             x3.append(l[3])
246 |             x4.append(l[4])
247 |             y = self.add_y_helper(y_value, y, int(l[0]) == 1)
248 |             y2 = self.add_y_helper(y_value, y2, int(l[0]) == 1)
249 |         return np.asarray(x1), np.asarray(x2), np.asarray(x3), np.asarray(x4), np.asarray(y), np.asarray(y2)
250 | 
251 |     def getTsvTestData_Mul_Labels(self, filepath, sep, sequence_length, y_value=False):
252 |         """
253 |         load the data(label, mention, entity) from labeled mutlti-task files
254 |         :param filepath:
255 |         :return:  three lists(label_list, mention_list, entity_list)
256 |         """
257 |         print("Loading testing/labelled data from " + filepath)
258 |         x1, x2, x3, x4 = [], [], [], []
259 |         y = []
260 |         y2 = []
261 |         for line in codecs.open(filepath, "r", "utf-8"):
262 |             l = line.strip().split(sep)
263 |             if len(l) < 6:
264 |                 continue
265 | 
266 |             l[1] = preprocess_unit(l[1])
267 |             l[2] = preprocess_unit(l[2])
268 |             l[4] = preprocess_unit(l[4])
269 |             l[5] = preprocess_unit(l[5])
270 | 
271 |             # truncate when length is bigger than the max_length
272 |             if len(l[1]) > sequence_length or len(l[2]) > sequence_length or len(l[4]) > sequence_length or len(
273 |                     l[5]) > sequence_length:
274 |                 l[1] = l[1][:sequence_length]
275 |                 l[2] = l[2][:sequence_length]
276 |                 l[5] = l[5][:sequence_length]
277 |                 l[4] = l[4][:sequence_length]
278 | 
279 |             x1.append(l[1])
280 |             x2.append(l[2])
281 |             x3.append(l[4])
282 |             x4.append(l[5])
283 |             y = self.add_y_helper(y_value, y, int(l[0]) == 1)
284 |             y2 = self.add_y_helper(y_value, y2, int(l[3]) == 1)
285 | 
286 |         return np.asarray(x1), np.asarray(x2), np.asarray(x3), np.asarray(x4), np.asarray(y), np.asarray(y2)
287 | 
288 |     def getTsvTestData_Mul_Labels_Dyna(self, filepath, sep, sequence_length, y_value=False):
289 |         """
290 |         load the data(label, mention, entity) from labeled mutlti-task files
291 |         :param filepath:
292 |         :return:  three lists(label_list, mention_list, entity_list)
293 |         """
294 |         print("Loading testing/labelled data from " + filepath)
295 |         x1, x2, x3, x4 = [], [], [], []
296 |         y = []
297 |         y2 = []
298 |         indicate = []
299 |         for line in codecs.open(filepath, "r", "utf-8"):
300 |             l = line.strip().split(sep)
301 |             l[1] = preprocess_unit(l[1])
302 |             l[2] = preprocess_unit(l[2])
303 |             if len(l[1]) > sequence_length or len(l[2]) > sequence_length:
304 |                 l[1] = l[1][:sequence_length]
305 |                 l[2] = l[2][:sequence_length]
306 |             x1.append(l[1])
307 |             x2.append(l[2])
308 |             y = self.add_y_helper(y_value, y, int(l[0]) == 1)
309 | 
310 |             if len(l) == 3:  # dynamic single task1
311 |                 x3.append("")
312 |                 x4.append("")
313 |                 y2 = self.add_y_helper(y_value, y2, False)
314 |                 indicate.append(1)
315 |             else:
316 |                 l[4] = preprocess_unit(l[4])
317 |                 l[5] = preprocess_unit(l[5])
318 |                 # truncate when length is bigger than the max_length
319 |                 if len(l[4]) > sequence_length or len(l[5]) > sequence_length:
320 |                     l[5] = l[5][:sequence_length]
321 |                     l[4] = l[4][:sequence_length]
322 |                 x3.append(l[4])
323 |                 x4.append(l[5])
324 |                 indicate.append(0)
325 |                 y2 = self.add_y_helper(y_value, y2, int(l[3]) == 1)
326 | 
327 |         return indicate, np.asarray(x1), np.asarray(x2), np.asarray(x3), np.asarray(x4), np.asarray(y), np.asarray(y2)
328 | 


--------------------------------------------------------------------------------
/train_test_idf.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # encoding=utf-8
  3 | 
  4 | import datetime
  5 | import os
  6 | import time
  7 | import codecs
  8 | 
  9 | import numpy as np
 10 | import tensorflow as tf
 11 | 
 12 | from util.util import write_evaluation_file
 13 | from util.input_helpers import InputHelper
 14 | from MultiGran_Model import MultiGranModel
 15 | from tensor import Tensor
 16 | 
 17 | # Parameters
 18 | # ==================================================
 19 | 
 20 | # Model Hyperparameters
 21 | tf.flags.DEFINE_integer("embedding_dim", 100, "Dimensionality of character embedding (default: 100)")
 22 | tf.flags.DEFINE_string("filter_sizes", "2, 3", "Comma-separated filter sizes (default: '2,3')")
 23 | tf.flags.DEFINE_integer("num_filters", 16, "Number of filters per filter size (default: 64)")
 24 | tf.flags.DEFINE_float("dropout_keep_prob", 0.5, "Dropout keep probability (default: 0.5)")
 25 | tf.flags.DEFINE_float("l2_reg_lambda", 0.0, "L2 regularizaion lambda (default: 0.0)")
 26 | 
 27 | # Data Parameter
 28 | tf.flags.DEFINE_integer("max_sequence_len", 10, "max document length of input")
 29 | tf.flags.DEFINE_integer("most_words", 300000, "Most number of words in vocab (default: 300000)")
 30 | 
 31 | # Training parameters
 32 | tf.flags.DEFINE_integer("seed", 123, "Random seed (default: 123)")
 33 | tf.flags.DEFINE_string("train_dir", "./", "Training dir root")
 34 | tf.flags.DEFINE_integer("batch_size", 128, "Batch Size (default: 64)")
 35 | tf.flags.DEFINE_integer("num_epochs", 50, "Number of training epochs")
 36 | tf.flags.DEFINE_float("eval_split", 0.1, "Use how much data for evaluating (default: 0.1)")
 37 | tf.flags.DEFINE_integer("evaluate_every", 100, "Evaluate model on dev set after this many steps (default: 100)")
 38 | tf.flags.DEFINE_integer("checkpoint_every", 100, "Save model after this many steps (default: 100)")
 39 | 
 40 | FLAGS = tf.flags.FLAGS
 41 | FLAGS._parse_flags()
 42 | print("\nParameters:")
 43 | for attr, value in sorted(FLAGS.__flags.items()):
 44 |     print("{}={}".format(attr.upper(), value))
 45 | print("")
 46 | 
 47 | 
 48 | def main():
 49 |     print("Loading data...")
 50 | 
 51 |     task_num = 1
 52 |     name = "des" if task_num == 1 else "opr"
 53 |     d_type = "Single_task1" if task_num == 1 else "Single_task2"
 54 | 
 55 |     inputH = InputHelper()
 56 |     max_document_length = FLAGS.max_sequence_len
 57 |     load_from_npy = False
 58 | 
 59 |     sep = "\t"
 60 |     data_file = "./runs/Exp/Single_task11501902502/lalaa.txt"
 61 | 
 62 |     def add_y_helper(y_value, y_arr, is_positive_label):
 63 |         """
 64 |         add 1/0 or [0,1]/[1, 0] in y_arr which depends on y_value flag
 65 |         :param y_value:
 66 |         :param y_arr:
 67 |         :param is_positive_label:
 68 |         :return:
 69 |         """
 70 |         if y_value == True:
 71 |             if is_positive_label:
 72 |                 y_arr.append(1)
 73 |             else:
 74 |                 y_arr.append(0)
 75 |         else:
 76 |             if is_positive_label:
 77 |                 y_arr.append(np.array([0, 1]))
 78 |             else:
 79 |                 y_arr.append(np.array([1, 0]))
 80 |         return y_arr
 81 | 
 82 |     def getTsvTestData(filepath, sep, sequence_length, y_value):
 83 |         """
 84 |         load the data(label, mention, entity) from labeled file
 85 |         :param filepath:
 86 |         :return:  three lists(label_list, mention_list, entity_list)
 87 |         """
 88 |         print("Loading testing/labelled data from " + filepath)
 89 |         x1 = []
 90 |         x2 = []
 91 |         y = []
 92 |         cnt = 0
 93 |         # positive samples from file
 94 |         for line in codecs.open(filepath, "r", "utf-8"):
 95 |             if cnt == 0:
 96 |                 cnt = 1
 97 |                 continue
 98 |             l = line.strip().split(sep)
 99 |             if len(l) < 4:
100 |                 continue
101 | 
102 |             # truncate when length is bigger than the max_length
103 |             if len(l[1]) > sequence_length or len(l[0]) > sequence_length:
104 |                 l[1] = l[1][:sequence_length]
105 |                 l[0] = l[0][:sequence_length]
106 | 
107 |             x1.append(l[0])
108 |             x2.append(l[1])
109 |             y = add_y_helper(y_value, y, int(l[3]) == 1)
110 |         return np.asarray(x1), np.asarray(x2), np.asarray(y)
111 | 
112 |     x_mention, x_entity, y = getTsvTestData(data_file, sep, max_document_length, y_value=False)
113 |     # x_dev_mention, x_dev_entity, y_dev = inputH.getTsvTestData(data_file, sep, max_document_length, y_value=False)
114 |     # x_test_mention, x_test_entity, y_test = inputH.getTsvTestData(data_file, sep, max_document_length,
115 |     #                                                               y_value=False)
116 |     r = len(x_mention)
117 |     r1 = (int)(r * 0.8)
118 |     r2 = (int)(r * 0.9)
119 |     shuffle_indices = np.random.permutation(np.arange(r))
120 |     x_mention = x_mention[shuffle_indices]
121 |     x_entity = x_entity[shuffle_indices]
122 |     y = y[shuffle_indices]
123 | 
124 |     x_train_mention, x_train_entity, y_train = x_mention[:r1], x_entity[:r1], y[:r1]
125 |     x_dev_mention, x_dev_entity, y_dev = x_mention[r1:r1], x_entity[r1:r2], y[r1:r2]
126 |     x_test_mention, x_test_entity, y_test = x_mention[r2:], x_entity[r2:], y[r2:]
127 | 
128 |     if load_from_npy == False:
129 |         # Constructing Tensor for train, dev, and test
130 |         men_arr = np.concatenate((x_train_mention, x_dev_mention, x_test_mention))
131 |         entity_arr = np.concatenate((x_train_entity, x_dev_entity, x_test_entity))
132 | 
133 |         lstm_dir = "Description1501554142" if task_num == 1 else "Operation1501209225"
134 |         bilstm_dir = os.path.join("./Sentence_Modeling/runs", lstm_dir)
135 | 
136 |         tensor = Tensor(men_arr, entity_arr, len(men_arr), FLAGS.max_sequence_len, task_num, bilstm_dir).get_tensor()
137 |         tensor = tensor.transpose((0, 2, 3, 1))
138 |         g1 = len(x_train_mention)
139 |         g2 = len(np.concatenate((x_train_mention, x_dev_mention)))
140 |         x_train_tensor, x_dev_tensor, x_test_tensor = tensor[:g1], tensor[g1:g2], tensor[g2:]
141 | 
142 |     else:
143 |         tensor_dir = "IDF"
144 |         mydir = "./" + tensor_dir + "/Length" + str(FLAGS.max_sequence_len) + "/"
145 |         x_train_tensor = np.load(mydir + "train_" + name + ".npy")
146 |         x_dev_tensor = np.load(mydir + "dev_" + name + ".npy")
147 |         x_test_tensor = np.load(mydir + "test_" + name + ".npy")
148 | 
149 |     with tf.Graph().as_default():
150 | 
151 |         sess = tf.Session()
152 |         with sess.as_default():
153 |             cnn = MultiGranModel(
154 |                 max_len=FLAGS.max_sequence_len,
155 |                 filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
156 |                 num_filters=FLAGS.num_filters,
157 |                 l2_reg_lambda=FLAGS.l2_reg_lambda)
158 | 
159 |             # Define Training procedure
160 |             global_step = tf.Variable(0, name="global_step", trainable=False)
161 |             optimizer = tf.train.AdamOptimizer(1e-3)
162 |             grads_and_vars = optimizer.compute_gradients(cnn.loss)
163 |             train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)
164 |             saver = tf.train.Saver(tf.all_variables(), max_to_keep=20)
165 | 
166 |             # Keep track of gradient values and sparsity (optional)
167 |             for g, v in grads_and_vars:
168 |                 if g is not None:
169 |                     tf.summary.histogram("grad_hist/{}".format(v.name), g)
170 |                     tf.summary.scalar("grad_sparsity/{}".format(v.name), tf.nn.zero_fraction(g))
171 |                     tf.summary.histogram(v.name, v)
172 | 
173 |             # Output directory for models and summaries
174 |             timestamp = str(int(time.time()))
175 |             out_dir = os.path.abspath(os.path.join(FLAGS.train_dir, "runs", "Exp", d_type + timestamp))
176 |             if not os.path.exists(out_dir):
177 |                 os.makedirs(out_dir)
178 |             print("Writing to {}\n".format(out_dir))
179 | 
180 |             checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
181 |             checkpoint_prefix = os.path.join(checkpoint_dir, "model")
182 |             if not os.path.exists(checkpoint_dir):
183 |                 os.makedirs(checkpoint_dir)
184 | 
185 |             # Summaries for loss and accuracy
186 |             loss_summary = tf.summary.scalar("loss", cnn.loss)
187 |             acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)
188 | 
189 |             # Train Summaries
190 |             train_summary_merged = tf.summary.merge_all()
191 |             train_summary_dir = os.path.join(out_dir, "summaries", "train")
192 |             train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)
193 | 
194 |             # Dev summaries
195 |             dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
196 |             dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
197 |             dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)
198 | 
199 |             # Initialize all variables
200 |             sess.run(tf.initialize_all_variables())
201 | 
202 |             def train_step(x_batch, y_batch):
203 |                 feed_dict = {
204 |                     cnn.input_tensor: x_batch,
205 |                     cnn.input_y: y_batch,
206 |                     cnn.dropout_keep_prob: FLAGS.dropout_keep_prob
207 |                 }
208 |                 _, step, summaries, loss, accuracy = sess.run(
209 |                     [train_op, global_step, train_summary_merged, cnn.loss, cnn.accuracy],
210 |                     feed_dict)
211 |                 time_str = datetime.datetime.now().isoformat()
212 |                 if step % 10 == 0:
213 |                     print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
214 |                 train_summary_writer.add_summary(summaries, step)
215 | 
216 |             def dev_step(x_dev, y_batch_dev, writer=None):
217 |                 feed_dict = {
218 |                     cnn.input_tensor: x_dev,
219 |                     cnn.input_y: y_batch_dev,
220 |                     cnn.dropout_keep_prob: 1.0
221 |                 }
222 |                 step, summaries, loss, accuracy, pres = sess.run(
223 |                     [global_step, dev_summary_op, cnn.loss, cnn.accuracy, cnn.scores],
224 |                     feed_dict)
225 |                 if writer:
226 |                     writer.add_summary(summaries, step)
227 | 
228 |                 return loss, accuracy
229 | 
230 |             def evaluate(x_dev, y_batch_dev):
231 |                 feed_dict = {
232 |                     cnn.input_tensor: x_dev,
233 |                     cnn.input_y: y_batch_dev,
234 |                     cnn.dropout_keep_prob: 1.0
235 |                 }
236 |                 step, loss, accuracy, pres = sess.run([global_step, cnn.loss, cnn.accuracy, cnn.scores], feed_dict)
237 | 
238 |                 eval_file = open(out_dir + "/evaluation.txt", "w+")
239 |                 right_file = open(out_dir + "/right_cases.txt", "w+")
240 |                 wrong_file = open(out_dir + "/wrong_cases.txt", "w+")
241 | 
242 |                 eval_file.write("Accu: " + str(accuracy) + "\n")
243 |                 predictions = np.argmax(pres, 1)
244 |                 labels = np.argmax(y_batch_dev, 1)
245 | 
246 |                 write_evaluation_file(eval_file, right_file, wrong_file, labels, predictions, x_test_mention,
247 |                                       x_test_entity)
248 | 
249 |                 eval_file.write("Parameters:")
250 |                 for attr, value in sorted(FLAGS.__flags.items()):
251 |                     eval_file.write("{}={}".format(attr.upper(), value) + "\n")
252 | 
253 |                 print(loss)
254 |                 print(accuracy)
255 |                 print(pres)
256 |                 print(y_batch_dev)
257 |                 return loss, accuracy
258 | 
259 |             def dev_whole(x_dev, y_dev, writer=None):
260 |                 batches_dev = inputH.batch_iter(list(zip(x_dev, y_dev)), FLAGS.batch_size, 1, shuffle=False)
261 |                 losses = []
262 |                 accuracies = []
263 | 
264 |                 for idx, batch_dev in enumerate(batches_dev):
265 |                     x_batch, y_batch = zip(*batch_dev)
266 |                     loss, accurary = dev_step(x_batch, y_batch, writer)
267 |                     losses.append(loss)
268 |                     accuracies.append(accurary)
269 |                 return np.mean(np.array(losses)), np.mean(np.array(accuracies))
270 | 
271 |             # def overfit(dev_loss):
272 |             #     n = len(dev_loss)
273 |             #     if n < 5:
274 |             #         return False
275 |             #     for i in xrange(n - 4, n):
276 |             #         if dev_loss[i] > dev_loss[i - 1]:
277 |             #             return False
278 |             #     return True
279 | 
280 |             # Generate batches
281 |             batches = inputH.batch_iter(list(zip(x_train_tensor, y_train)), FLAGS.batch_size, FLAGS.num_epochs)
282 | 
283 |             # Training loop. For each batch...
284 |             dev_loss = []
285 |             for batch in batches:
286 |                 x_batch, y_batch = zip(*batch)
287 |                 train_step(x_batch, y_batch)
288 |                 current_step = tf.train.global_step(sess, global_step)
289 | 
290 |                 if current_step % FLAGS.evaluate_every == 0:
291 |                     print("\nEvaluation:")
292 |                     loss, accuracy = dev_whole(x_dev_tensor, y_dev, writer=dev_summary_writer)
293 |                     time_str = datetime.datetime.now().isoformat()
294 |                     print("{}: dev-aver, loss {:g}, acc {:g}".format(time_str, loss, accuracy))
295 |                     dev_loss.append(accuracy)
296 |                     print("\nRecently accuracy:")
297 |                     print dev_loss[-10:]
298 |                     # if overfit(dev_loss):
299 |                     #     print 'Overfit!!'
300 |                     #     break
301 |                     print("")
302 | 
303 |                 if current_step % FLAGS.checkpoint_every == 0:
304 |                     path = saver.save(sess, checkpoint_prefix, global_step=current_step)
305 |                     print("Saved model checkpoint to {}\n".format(path))
306 | 
307 |             print("")
308 |             evaluate(x_test_tensor, y_test)
309 | 
310 | 
311 | if __name__ == '__main__':
312 |     main()
313 | 


--------------------------------------------------------------------------------
/Sentence_Modeling/Sentence_Model.py:
--------------------------------------------------------------------------------
  1 | import tensorflow as tf
  2 | from Siamese_network import SiameseLSTM
  3 | from DLDisambiguation.util.input_helpers import InputHelper
  4 | from DLDisambiguation.util.util import getEmbedding
  5 | from DLDisambiguation.util.preprocess import MyVocabularyProcessor
  6 | import time
  7 | import numpy as np
  8 | import datetime
  9 | import os
 10 | 
 11 | from DLDisambiguation.util.util import write_evaluation_file
 12 | 
 13 | # Parameters
 14 | # ==================================================
 15 | tf.flags.DEFINE_integer("embedding_dim", 100, "Dimensionality of character embedding")
 16 | tf.flags.DEFINE_integer("hidden_units", 15, "unit numbers of hidden vectors in Bi-LSTM")
 17 | tf.flags.DEFINE_float("l2_reg_lambda", 0.01, "L2 regularization lambda (default: 0.0)")
 18 | 
 19 | tf.flags.DEFINE_string("train_dir", "../", "training dir")
 20 | 
 21 | # Training parameters
 22 | tf.flags.DEFINE_integer("batch_size", 128, "Batch Size")
 23 | tf.flags.DEFINE_integer("num_epochs", 50, "Number of training epochs (default: 50)")
 24 | tf.flags.DEFINE_integer("evaluate_every", 100, "Evaluate model on dev set after this many steps")
 25 | tf.flags.DEFINE_integer("checkpoint_every", 100, "Save model after this many steps (default: 100)")
 26 | 
 27 | FLAGS = tf.flags.FLAGS
 28 | FLAGS._parse_flags()
 29 | print("\nParameters:")
 30 | for attr, value in sorted(FLAGS.__flags.items()):
 31 |     print("{}={}".format(attr.upper(), value))
 32 | print("")
 33 | 
 34 | 
 35 | class SentenceModel:
 36 |     def getEmbeddingMatrix(self, embedding_dir, processer):
 37 |         vocab, vocab_size, embedding_dim, embedding = getEmbedding(embedding_dir)
 38 | 
 39 |         # Extract word:id mapping from the object.
 40 |         vocab_dict = processer.vocabulary_._mapping
 41 |         words_s = set(vocab_dict.keys())
 42 | 
 43 |         # words_index = [i for i,e in enumerate(vocab) if e.decode("utf-8") in words_s]
 44 |         words_index = []
 45 |         vocab_set = set(vocab)
 46 |         last_index = len(vocab) - 1
 47 | 
 48 |         for i in words_s:
 49 |             character_u = i.encode("utf-8")
 50 |             if character_u in vocab_set:
 51 |                 words_index.append(vocab_dict[i])
 52 |             else:
 53 |                 # for unkown word, give default value of key <UNK>
 54 |                 print(character_u)
 55 |                 words_index.append(last_index)
 56 |                 # print(embedding[last_index])
 57 | 
 58 |         # words_index = [vocab_dict[i] for i in words_s if i.encode("utf-8") in set(vocab)]
 59 |         res = embedding[words_index]
 60 |         res = np.asarray(res, dtype="float")
 61 |         return res
 62 | 
 63 |     def __init__(self):
 64 |         # the max length of description/operation segment, padding if shorter, and ignore the pair if longer
 65 |         max_document_length = 20
 66 |         inpH = InputHelper()
 67 |         y_is_value = True  # flag to indicate that y is value(0 / 1) or array[0,1] / [1, 0]
 68 |         # train_set, dev_set, vocab_processor, sum_no_of_batches = inpH.getDataSets_File(FLAGS.training_files, "\t",
 69 |         #                                                                           max_document_length,
 70 |         #                                                                           10,  # 10---percent_dev
 71 |         #                                                                           FLAGS.batch_size, y_value=y_is_value)
 72 | 
 73 |         # test_x1, test_x2, test_y = inpH.getTestDataSet(FLAGS.test_file, "\t\t", vocab_processor, max_document_length, y_is_value)
 74 | 
 75 |         task_num = 2
 76 |         d_type = "Description" if task_num == 1 else "Operation"
 77 |         embedding_dir = "../data/word2vec/character_model.txt" if task_num == 1 \
 78 |             else "../data/operation/character_model.txt"
 79 |         name = "des" if task_num == 1 else "opr"
 80 | 
 81 |         time_gen = "0823"
 82 |         data_file = os.path.join(FLAGS.train_dir, "data/train_data_" + time_gen + "_" + name + ".txt")
 83 |         data_file_test = os.path.join(FLAGS.train_dir, "data/test_data_" + time_gen + "_" + name + ".txt")
 84 |         # data_file_val = os.path.join(FLAGS.train_dir, "data/validation_data_" + time_gen + "_" + name + ".txt")
 85 | 
 86 |         sep = "\t"
 87 |         train_x1, train_x2, train_y = inpH.getTsvTestData(data_file, sep, max_document_length, y_is_value)
 88 |         test_x1, test_x2, test_y = inpH.getTsvTestData(data_file_test, sep, max_document_length, y_is_value)
 89 |         # dev_x1, dev_x2, dev_y = inpH.getTsvTestData(data_file_val, sep, max_document_length, y_is_value)
 90 |         dev_x1, dev_x2, dev_y = test_x1, test_x2, test_y
 91 |         sum_no_of_batches = len(train_y) // FLAGS.batch_size
 92 | 
 93 |         vocab_processor = MyVocabularyProcessor(max_document_length, min_frequency=0)
 94 |         vocab_processor.fit_transform(np.concatenate((train_x1, train_x2, dev_x1, dev_x2)))
 95 |         # vocab_processor.fit_transform(np.concatenate((train_x1, train_x2, test_x1, test_x2, dev_x1, dev_x2)))
 96 |         print("Length of loaded vocabulary ={}".format(len(vocab_processor.vocabulary_)))
 97 | 
 98 |         train_set = inpH.get_data(vocab_processor, train_x1, train_x2, train_y, max_document_length)
 99 |         dev_set = inpH.get_data(vocab_processor, dev_x1, dev_x2, dev_y, max_document_length)
100 |         test_set = inpH.get_data(vocab_processor, test_x1, test_x2, test_y, max_document_length)
101 | 
102 |         # load in word2vec model
103 |         embedding_matrix = self.getEmbeddingMatrix(embedding_dir, vocab_processor)
104 | 
105 |         sess = tf.Session()
106 |         with sess.as_default():
107 |             siameseModel = SiameseLSTM(sequence_length=max_document_length,
108 |                                        vocab_processer=vocab_processor,
109 |                                        embedding_size=FLAGS.embedding_dim,
110 |                                        hidden_unit_size=FLAGS.hidden_units,
111 |                                        l2_reg_lambda=FLAGS.l2_reg_lambda,
112 |                                        batch_size=FLAGS.batch_size,
113 |                                        embedding_arr=embedding_matrix)
114 | 
115 |             # Define Training procedure
116 |             global_step = tf.Variable(0, name="global_step", trainable=False)
117 |             optimizer = tf.train.AdamOptimizer(1e-3)
118 |             print("initialized siameseModel object")
119 | 
120 |             grads_and_vars = optimizer.compute_gradients(siameseModel.loss)
121 |             tr_op_set = optimizer.apply_gradients(grads_and_vars, global_step=global_step)
122 |             print("defined training_ops")
123 | 
124 |             # Keep track of variables, gradient values and sparsity
125 |             for g, v in grads_and_vars:
126 |                 if g is not None:
127 |                     tf.summary.histogram("grad_hist/{}".format(v.name), g)
128 |                     tf.summary.histogram("grad_sparsity/{}".format(v.name), tf.nn.zero_fraction(g))
129 |                     tf.summary.histogram(v.name, v)
130 |             print("defined gradient summaries")
131 | 
132 |             # Output directory for models and summaries
133 |             timestamp = str(int(time.time()))
134 |             out_dir = os.path.abspath(os.path.join(os.path.curdir, "Exp" + time_gen, "runs", d_type + timestamp))
135 |             print("Writing to {}\n".format(out_dir))
136 | 
137 |             # Summaries for loss and accuracy
138 |             loss_summary = tf.summary.scalar("loss", siameseModel.loss)
139 |             acc_summary = tf.summary.scalar("accuracy", siameseModel.accuracy)
140 | 
141 |             # Train Summaries
142 |             train_summary_merged = tf.summary.merge_all()
143 |             train_summary_dir = os.path.join(out_dir, "summaries", "train")
144 |             train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)
145 | 
146 |             # Dev summaries
147 |             dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
148 |             dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
149 |             dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)
150 | 
151 |             # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
152 |             checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
153 |             checkpoint_prefix = os.path.join(checkpoint_dir, "model")
154 |             if not os.path.exists(checkpoint_dir):
155 |                 os.makedirs(checkpoint_dir)
156 |             saver = tf.train.Saver(tf.all_variables(), max_to_keep=100)
157 | 
158 |             # Write vocabulary
159 |             vocab_processor.save(os.path.join(checkpoint_dir, "vocab"))
160 | 
161 |             # Initialize all variables
162 |             sess.run(tf.initialize_all_variables())
163 |             print("init all variables")
164 | 
165 |             graph_def = tf.get_default_graph().as_graph_def()
166 |             graphpb_txt = str(graph_def)
167 |             with open(os.path.join(checkpoint_dir, "graphpb.txt"), 'w') as f:
168 |                 f.write(graphpb_txt)
169 | 
170 |             def train_step(x1_batch, x2_batch, x1_batch_m, x2_batch_m, y_batch):
171 | 
172 |                 feed_dict = {
173 |                     siameseModel.input_x1: x1_batch,
174 |                     siameseModel.input_x2: x2_batch,
175 |                     siameseModel.mask_x1: x1_batch_m,
176 |                     siameseModel.mask_x2: x2_batch_m,
177 |                     siameseModel.input_y: y_batch,
178 |                 }
179 | 
180 |                 _, step, summaries, loss, accuracy, dist = sess.run(
181 |                     [tr_op_set, global_step, train_summary_merged, siameseModel.loss, siameseModel.accuracy,
182 |                      siameseModel.distance], feed_dict)
183 | 
184 |                 time_str = datetime.datetime.now().isoformat()
185 |                 d = np.copy(dist)
186 |                 d[d >= 0.5] = 999.0
187 |                 d[d < 0.5] = 1
188 |                 d[d > 1.0] = 0
189 |                 accuracy_t = np.mean(y_batch == d)
190 |                 print(
191 |                     "TRAIN {}: step {}, loss {:g}, acc {:g}, acc_t {:g}".format(time_str, step, loss, accuracy, accuracy_t))
192 |                 print(y_batch)
193 |                 print(dist)
194 |                 print(d)
195 |                 train_summary_writer.add_summary(summaries, step)
196 | 
197 |             def dev_step(x1_batch, x2_batch, x1_batch_m, x2_batch_m, y_batch):
198 | 
199 |                 feed_dict = {
200 |                     siameseModel.input_x1: x1_batch,
201 |                     siameseModel.input_x2: x2_batch,
202 |                     siameseModel.mask_x1: x1_batch_m,
203 |                     siameseModel.mask_x2: x2_batch_m,
204 |                     siameseModel.input_y: y_batch,
205 |                 }
206 | 
207 |                 step, summaries, loss, accuracy, dist = sess.run(
208 |                     [global_step, dev_summary_op, siameseModel.loss, siameseModel.accuracy, siameseModel.distance],
209 |                     feed_dict)
210 |                 time_str = datetime.datetime.now().isoformat()
211 |                 d = np.copy(dist)
212 |                 d[d >= 0.5] = 999.0
213 |                 d[d < 0.5] = 1
214 |                 d[d > 1.0] = 0
215 |                 accuracy_t = np.mean(y_batch == d)
216 |                 print("DEV {}: step {}, loss {:g}, acc {:g}, acc_t {:g}".format(time_str, step, loss, accuracy, accuracy_t))
217 |                 print(y_batch)
218 |                 print(dist)
219 |                 print(d)
220 |                 dev_summary_writer.add_summary(summaries, step)
221 |                 return accuracy
222 | 
223 |             def overfit(dev_loss, accu):
224 |                 num = 6
225 |                 n = len(dev_loss)
226 |                 if n < num:
227 |                     return False
228 |                 for i in xrange(n - num, n):
229 |                     if dev_loss[i] < accu:
230 |                         return False
231 |                 print(dev_loss)
232 |                 print(accu)
233 |                 return True
234 | 
235 |             def evaluate(x1_batch, x2_batch, x1_batch_m, x2_batch_m, y_batch, mention, entity):
236 | 
237 |                 feed_dict = {
238 |                     siameseModel.input_x1: x1_batch,
239 |                     siameseModel.input_x2: x2_batch,
240 |                     siameseModel.mask_x1: x1_batch_m,
241 |                     siameseModel.mask_x2: x2_batch_m,
242 |                     siameseModel.input_y: y_batch,
243 |                 }
244 | 
245 |                 loss, accuracy, dist = sess.run([siameseModel.loss, siameseModel.accuracy, siameseModel.distance],
246 |                                                 feed_dict)
247 |                 time_str = datetime.datetime.now().isoformat()
248 |                 print("Test {}: loss {:g}, acc {:g}".format(time_str, loss, accuracy))
249 |                 print(dist)
250 | 
251 |                 eval_file = open(out_dir + "/evaluation.txt", "w+")
252 |                 right_file = open(out_dir + "/right_cases.txt", "w+")
253 |                 wrong_file = open(out_dir + "/wrong_cases.txt", "w+")
254 | 
255 |                 eval_file.write("Accu: " + str(accuracy) + "\n")
256 |                 eval_file.write("Dataset: " + data_file + "\n")
257 |                 eval_file.write("Early Stopped at: " + str(stop_p) + "\n")
258 | 
259 |                 d = np.copy(dist)
260 |                 d[d >= 0.5] = 999.0
261 |                 d[d < 0.5] = 1
262 |                 d[d > 1.0] = 0
263 | 
264 |                 predictions = d
265 |                 write_evaluation_file(eval_file, right_file, wrong_file, y_batch, predictions, mention, entity)
266 |                 return accuracy
267 | 
268 |             # Generate batches
269 |             batches = inpH.batch_iter(list(zip(train_set[0], train_set[1], train_set[2], train_set[3], train_set[4])),
270 |                                       FLAGS.batch_size, FLAGS.num_epochs)
271 | 
272 |             max_validation_acc = 0.0
273 |             num_batches_per_epoch = int(len(train_set[0]) / FLAGS.batch_size)
274 |             print num_batches_per_epoch
275 |             max_accu = 0
276 |             dev_accu = []
277 | 
278 |             for nn in xrange(sum_no_of_batches * FLAGS.num_epochs):
279 |                 batch = batches.next()
280 |                 if len(batch) < 1:
281 |                     continue
282 |                 x1_batch, x2_batch, x1_batch_m, x2_match_m, y_batch = zip(*batch)
283 |                 if len(y_batch) < 1:
284 |                     continue
285 | 
286 |                 train_step(x1_batch, x2_batch, x1_batch_m, x2_match_m, y_batch)
287 | 
288 |                 current_step = tf.train.global_step(sess, global_step)  # get the global step.
289 |                 sum_acc = 0.0
290 |                 tmp = []
291 | 
292 |                 if current_step % num_batches_per_epoch == 0:
293 |                     print("\nEvaluation:")
294 |                     # dev_batches = inpH.batch_iter(list(zip(dev_set[0], dev_set[1], dev_set[2], dev_set[3], dev_set[4])),
295 |                     #                               FLAGS.batch_size, 1)
296 |                     # for db in dev_batches:
297 |                     #     if len(db) < 1:
298 |                     #         continue
299 |                     #     x1_dev_b, x2_dev_b, x1_dev_m, x2_dev_m, y_dev_b = zip(*db)
300 |                     #     if len(y_dev_b) < 1:
301 |                     #         continue
302 |                     #     acc = dev_step(x1_dev_b, x2_dev_b, x1_dev_m, x2_dev_m, y_dev_b)
303 |                     #     sum_acc = sum_acc + acc
304 |                     #     tmp.append(acc)
305 |                     #
306 |                     # acc_mean = np.mean(tmp)
307 |                     acc_mean = dev_step(dev_set[0], dev_set[1], dev_set[2], dev_set[3], dev_set[4])
308 |                     dev_accu.append(acc_mean)
309 | 
310 |                     if overfit(dev_accu, acc_mean):
311 |                         print 'Overfit!!'
312 |                         print("Optimum" + str(max_accu))
313 |                         print(current_step)
314 |                         stop_p = current_step / num_batches_per_epoch
315 |                         print(stop_p)
316 |                         break
317 | 
318 |                     if acc_mean >= max_accu:
319 |                         max_accu = acc_mean
320 |                         saver.save(sess, checkpoint_prefix, global_step=current_step)  # save checkpoints
321 |                         tf.train.write_graph(sess.graph.as_graph_def(), checkpoint_prefix, "graph" + str(nn) + ".pb",
322 |                                              as_text=False)  # save graph_def
323 |                         print("Saved model {} with sum_accuracy={} checkpoint to {}\n".format(nn, max_validation_acc,
324 |                                                                                               checkpoint_prefix))
325 | 
326 |             evaluate(test_set[0], test_set[1], test_set[2], test_set[3], test_set[4], test_x1, test_x2)
327 | 
328 | 
329 | s = SentenceModel()
330 | 


--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # encoding=utf-8
  3 | 
  4 | import datetime
  5 | import os
  6 | import time
  7 | 
  8 | import numpy as np
  9 | import tensorflow as tf
 10 | 
 11 | from util.util import write_evaluation_file
 12 | from util.input_helpers import InputHelper
 13 | from MultiGran_Model import MultiGranModel
 14 | from tensor import Tensor
 15 | 
 16 | # Parameters
 17 | # ==================================================
 18 | 
 19 | # Model Hyperparameters
 20 | tf.flags.DEFINE_integer("embedding_dim", 100, "Dimensionality of character embedding (default: 100)")
 21 | tf.flags.DEFINE_string("filter_sizes", "3, 3, 3, 3", "Comma-separated filter sizes (default: '2,3')")
 22 | tf.flags.DEFINE_string("pool_sizes", "2, 2, 2, 2", "Comma-separated filter sizes (default: '2,3')")
 23 | tf.flags.DEFINE_integer("num_filters", 16, "Number of filters per filter size (default: 64)")
 24 | tf.flags.DEFINE_float("dropout_keep_prob", 0.5, "Dropout keep probability (default: 0.5)")
 25 | tf.flags.DEFINE_float("l2_reg_lambda", 0.001, "L2 regularizaion lambda (default: 0.0)")
 26 | 
 27 | # Data Parameter
 28 | tf.flags.DEFINE_integer("max_sequence_len", 20, "max document length of input")
 29 | tf.flags.DEFINE_integer("most_words", 300000, "Most number of words in vocab (default: 300000)")
 30 | 
 31 | # Training parameters
 32 | tf.flags.DEFINE_integer("seed", 123, "Random seed (default: 123)")
 33 | tf.flags.DEFINE_string("train_dir", "./", "Training dir root")
 34 | tf.flags.DEFINE_integer("batch_size", 128, "Batch Size (default: 64)")
 35 | tf.flags.DEFINE_integer("num_epochs", 50, "Number of training epochs")
 36 | tf.flags.DEFINE_float("eval_split", 0.1, "Use how much data for evaluating (default: 0.1)")
 37 | tf.flags.DEFINE_integer("evaluate_every", 100, "Evaluate model on dev set after this many steps (default: 100)")
 38 | tf.flags.DEFINE_integer("checkpoint_every", 100, "Save model after this many steps (default: 100)")
 39 | 
 40 | FLAGS = tf.flags.FLAGS
 41 | FLAGS._parse_flags()
 42 | print("\nParameters:")
 43 | for attr, value in sorted(FLAGS.__flags.items()):
 44 |     print("{}={}".format(attr.upper(), value))
 45 | print("")
 46 | 
 47 | 
 48 | def main():
 49 |     print("Loading data...")
 50 | 
 51 |     type_CNN = 2
 52 |     run_type = "Single CNN" if type_CNN == 1 else "Multiview 4CNN"
 53 | 
 54 |     early_stop_num = 11
 55 |     task_num = 2
 56 |     name = "des" if task_num == 1 else "opr"
 57 |     d_type = "Single_task1" if task_num == 1 else "Single_task2"
 58 | 
 59 |     inputH = InputHelper()
 60 |     max_document_length = FLAGS.max_sequence_len
 61 |     load_from_npy = True
 62 | 
 63 |     sep = "\t"
 64 |     time_gen = "0823"
 65 |     data_file = os.path.join(FLAGS.train_dir, "data/train_data_" + time_gen + "_" + name + ".txt")
 66 |     data_file_test = os.path.join(FLAGS.train_dir, "data/test_data_" + time_gen + "_" + name + ".txt")
 67 |     # data_file_val = os.path.join(FLAGS.train_dir, "data/validation_data_" + time_gen + "_" + name + ".txt")
 68 |     data_file_val = data_file_test
 69 | 
 70 |     # data_file = os.path.join(FLAGS.train_dir, "data/training_dynamic_data_" + name + ".txt")
 71 |     # data_file_test = os.path.join(FLAGS.train_dir, "data/test_dynamic_data_" + name + ".txt")
 72 |     # data_file_val = os.path.join(FLAGS.train_dir, "data/validation_dynamic_data_" + name + ".txt")
 73 | 
 74 |     x_train_mention, x_train_entity, y_train = inputH.getTsvTestData(data_file, sep, max_document_length, y_value=False)
 75 |     x_dev_mention, x_dev_entity, y_dev = inputH.getTsvTestData(data_file_val, sep, max_document_length, y_value=False)
 76 |     x_test_mention, x_test_entity, y_test = inputH.getTsvTestData(data_file_test, sep, max_document_length,
 77 |                                                                   y_value=False)
 78 | 
 79 |     if load_from_npy == False:
 80 |         # Constructing Tensor for train, dev, and test
 81 |         men_arr = np.concatenate((x_train_mention, x_dev_mention, x_test_mention))
 82 |         entity_arr = np.concatenate((x_train_entity, x_dev_entity, x_test_entity))
 83 | 
 84 |         lstm_dir = "Description1501174251" if task_num == 1 else "Operation1501209225"
 85 |         bilstm_dir = os.path.join("./Sentence_Modeling/runs", lstm_dir)
 86 | 
 87 |         tensor = Tensor(men_arr, entity_arr, len(men_arr), FLAGS.max_sequence_len, task_num, bilstm_dir).get_tensor()
 88 |         tensor = tensor.transpose((0, 2, 3, 1))
 89 |         g1 = len(x_train_mention)
 90 |         g2 = len(np.concatenate((x_train_mention, x_dev_mention)))
 91 |         x_train_tensor, x_dev_tensor, x_test_tensor = tensor[:g1], tensor[g1:g2], tensor[g2:]
 92 | 
 93 |     else:
 94 |         mydir = "./Tensor_files/0823/Length" + str(FLAGS.max_sequence_len) + "/"
 95 | 
 96 |         x_train_tensor = np.load(mydir + "train_" + name + ".npy")
 97 |         # x_dev_tensor = np.load(mydir + "dev_" + name + ".npy")
 98 |         x_test_tensor = np.load(mydir + "test_" + name + ".npy")
 99 |         x_dev_tensor = x_test_tensor
100 | 
101 |         # indi_train = np.load(mydir + "train_indi_" + name + ".npy")
102 |         # indi_val = np.load(mydir + "dev_indi_" + name + ".npy")
103 |         # indi_test = np.load(mydir + "test_indi_" + name + ".npy")
104 | 
105 |         # def process(indi, tensor):
106 |         #     tmp = []
107 |         #     ll = len(indi)
108 |         #     for i in range(ll):
109 |         #         if indi[i] == 0:
110 |         #             tmp.append(tensor[i])
111 |         #     return np.asarray(tmp)
112 |         #
113 |         # x_train_tensor = process(indi_train, x_train_tensor)
114 |         # x_dev_tensor = process(indi_val, x_dev_tensor)
115 |         # x_test_tensor = process(indi_test, x_test_tensor)
116 | 
117 |     with tf.Graph().as_default():
118 | 
119 |         sess = tf.Session()
120 |         with sess.as_default():
121 |             cnn = MultiGranModel(
122 |                 max_len=FLAGS.max_sequence_len,
123 |                 pool_sizes=list(map(int, FLAGS.pool_sizes.split(","))),
124 |                 filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
125 |                 num_filters=FLAGS.num_filters,
126 |                 l2_reg_lambda=FLAGS.l2_reg_lambda,
127 |                 type_CNN=type_CNN)  # Define Training procedure
128 |             global_step = tf.Variable(0, name="global_step", trainable=False)
129 |             optimizer = tf.train.AdamOptimizer(1e-3)
130 |             grads_and_vars = optimizer.compute_gradients(cnn.loss)
131 |             train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)
132 |             saver = tf.train.Saver(tf.all_variables(), max_to_keep=20)
133 | 
134 |             # Keep track of gradient values and sparsity (optional)
135 |             for g, v in grads_and_vars:
136 |                 if g is not None:
137 |                     tf.summary.histogram("grad_hist/{}".format(v.name), g)
138 |                     tf.summary.scalar("grad_sparsity/{}".format(v.name), tf.nn.zero_fraction(g))
139 |                     tf.summary.histogram(v.name, v)
140 | 
141 |             # Output directory for models and summaries
142 |             timestamp = str(int(time.time()))
143 |             out_dir = os.path.abspath(os.path.join(FLAGS.train_dir, "runs", "NewExp", d_type + timestamp))
144 |             if not os.path.exists(out_dir):
145 |                 os.makedirs(out_dir)
146 |             print("Writing to {}\n".format(out_dir))
147 | 
148 |             checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
149 |             checkpoint_prefix = os.path.join(checkpoint_dir, "model")
150 |             if not os.path.exists(checkpoint_dir):
151 |                 os.makedirs(checkpoint_dir)
152 | 
153 |             # Summaries for loss and accuracy
154 |             loss_summary = tf.summary.scalar("loss", cnn.loss)
155 |             acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)
156 | 
157 |             # Train Summaries
158 |             train_summary_merged = tf.summary.merge_all()
159 |             train_summary_dir = os.path.join(out_dir, "summaries", "train")
160 |             train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)
161 | 
162 |             # Dev summaries
163 |             dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
164 |             dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
165 |             dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)
166 | 
167 |             # Initialize all variables
168 |             sess.run(tf.initialize_all_variables())
169 | 
170 |             def train_step(x_batch, y_batch):
171 |                 feed_dict = {
172 |                     cnn.input_tensor: x_batch,
173 |                     cnn.input_y: y_batch,
174 |                     cnn.dropout_keep_prob: FLAGS.dropout_keep_prob
175 |                 }
176 |                 _, step, summaries, loss, accuracy = sess.run(
177 |                     [train_op, global_step, train_summary_merged, cnn.loss, cnn.accuracy],
178 |                     feed_dict)
179 |                 time_str = datetime.datetime.now().isoformat()
180 |                 if step % 10 == 0:
181 |                     print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
182 |                 train_summary_writer.add_summary(summaries, step)
183 | 
184 |             def dev_step(x_dev, y_batch_dev, writer=None):
185 |                 feed_dict = {
186 |                     cnn.input_tensor: x_dev,
187 |                     cnn.input_y: y_batch_dev,
188 |                     cnn.dropout_keep_prob: 1.0
189 |                 }
190 |                 step, summaries, loss, accuracy, pres = sess.run(
191 |                     [global_step, dev_summary_op, cnn.loss, cnn.accuracy, cnn.scores],
192 |                     feed_dict)
193 |                 if writer:
194 |                     writer.add_summary(summaries, step)
195 | 
196 |                 return loss, accuracy
197 | 
198 |             def evaluate(x_dev, y_batch_dev, m):
199 |                 feed_dict = {
200 |                     m.input_tensor: x_dev,
201 |                     m.input_y: y_batch_dev,
202 |                     m.dropout_keep_prob: 1.0
203 |                 }
204 |                 step, loss, accuracy, pres = sess.run([global_step, m.loss, m.accuracy, m.scores],
205 |                                                       feed_dict)
206 | 
207 |                 eval_file = open(out_dir + "/evaluation.txt", "w+")
208 |                 right_file = open(out_dir + "/right_cases.txt", "w+")
209 |                 wrong_file = open(out_dir + "/wrong_cases.txt", "w+")
210 | 
211 |                 eval_file.write("Accu: " + str(accuracy) + "\n")
212 |                 predictions = np.argmax(pres, 1)
213 |                 labels = np.argmax(y_batch_dev, 1)
214 | 
215 |                 write_evaluation_file(eval_file, right_file, wrong_file, labels, predictions,
216 |                                       x_test_mention,
217 |                                       x_test_entity)
218 | 
219 |                 eval_file.write("Parameters:")
220 |                 for attr, value in sorted(FLAGS.__flags.items()):
221 |                     eval_file.write("{}={}".format(attr.upper(), value) + "\n")
222 |                 fs1 = [str(i) for i in fs]
223 |                 ps1 = [str(i) for i in ps]
224 |                 eval_file.write("Conv" + " ".join(fs1) + "\n")
225 |                 eval_file.write("Pool_sizes" + " ".join(ps1) + "\n")
226 | 
227 |                 print(loss)
228 |                 print(accuracy)
229 |                 print(pres)
230 |                 print(y_batch_dev)
231 |                 return loss, accuracy
232 | 
233 |             def dev_whole(x_dev, y_dev, writer=None):
234 |                 # batches_dev = inputH.batch_iter(list(zip(x_dev, y_dev)), FLAGS.batch_size, 1,
235 |                 #                                 shuffle=False)
236 |                 # losses = []
237 |                 # accuracies = []
238 |                 #
239 |                 # for idx, batch_dev in enumerate(batches_dev):
240 |                 #     x_batch, y_batch = zip(*batch_dev)
241 |                 #     loss, accurary = dev_step(x_batch, y_batch, writer)
242 |                 #     losses.append(loss)
243 |                 #     accuracies.append(accurary)
244 |                 # return np.mean(np.array(losses)), np.mean(np.array(accuracies))
245 | 
246 |                 l, accu = dev_step(x_dev, y_dev, writer)
247 |                 return l, accu
248 | 
249 |             def overfit(dev_loss, accu):
250 | 
251 |                 n = len(dev_loss)
252 |                 if n < early_stop_num:
253 |                     return False
254 | 
255 |                 # mean_acc = np.mean(dev_loss[-15:])
256 |                 # if mean_acc < accu:
257 |                 #     return False
258 |                 for i in xrange(n - early_stop_num, n):
259 |                     if dev_loss[i] < accu:
260 |                         return False
261 |                 print(dev_loss)
262 |                 print(accu)
263 |                 return True
264 | 
265 |             # Generate batches
266 |             batches = inputH.batch_iter(list(zip(x_train_tensor, y_train)), FLAGS.batch_size, FLAGS.num_epochs)
267 | 
268 |             data_num = len(y_train)
269 |             num_batches_per_epoch = int(data_num / FLAGS.batch_size)
270 |             # num_batches_per_epoch_2 = int(num_batches_per_epoch / 2)
271 |             print num_batches_per_epoch
272 | 
273 |             # Training loop. For each batch...
274 |             dev_loss = []
275 |             optimum_accu = 0
276 | 
277 |             for batch in batches:
278 |                 x_batch, y_batch = zip(*batch)
279 |                 train_step(x_batch, y_batch)
280 |                 current_step = tf.train.global_step(sess, global_step)
281 | 
282 |                 if current_step % num_batches_per_epoch == 0:
283 |                     print("\nEvaluation:")
284 |                     loss, accuracy = dev_whole(x_dev_tensor, y_dev, writer=dev_summary_writer)
285 |                     time_str = datetime.datetime.now().isoformat()
286 |                     print("{}: dev-aver, loss {:g}, acc {:g}".format(time_str, loss, accuracy))
287 |                     dev_loss.append(accuracy)
288 | 
289 |                     print("\nRecently accuracy:")
290 |                     print dev_loss[-10:]
291 |                     if overfit(dev_loss, accuracy):
292 |                         print 'Overfit!!'
293 |                         print(current_step)
294 |                         print(current_step / num_batches_per_epoch)
295 |                         break
296 |                     print("")
297 | 
298 |                     if accuracy > optimum_accu:
299 |                         optimum_accu = accuracy
300 |                         path = saver.save(sess, checkpoint_prefix, global_step=current_step)
301 |                         print("Saved model checkpoint to {}\n".format(path))
302 |                     print("Optimum_accu: " + str(optimum_accu))
303 | 
304 |             print("")
305 |             print("Optimum_accu: " + str(optimum_accu))
306 | 
307 |         # evaluate the result with the best model
308 |         ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
309 |         checkpoint_file = ckpt.model_checkpoint_path
310 |         graph = tf.Graph()
311 | 
312 |         with graph.as_default():
313 |             sess = tf.Session()
314 |             with sess.as_default():
315 |                 # Load the saved meta graph and restore variables
316 |                 saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
317 |                 sess.run(tf.initialize_all_variables())
318 |                 saver.restore(sess, checkpoint_file)
319 | 
320 |                 input_t = graph.get_operation_by_name("input_tensor").outputs[0]
321 |                 input_y = graph.get_operation_by_name("input_y").outputs[0]
322 |                 prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]
323 | 
324 |                 prediction = graph.get_operation_by_name("output/predictions").outputs[0]
325 |                 accu = graph.get_operation_by_name("accuracy/accuracy").outputs[0]
326 |                 pres, accuracy = sess.run([prediction, accu], {input_t: x_test_tensor, input_y: y_test, prob: 1})
327 | 
328 |                 labels = np.argmax(y_test, 1)
329 |                 eval_file = open(out_dir + "/evaluation.txt", "w+")
330 |                 right_file = open(out_dir + "/right_cases.txt", "w+")
331 |                 wrong_file = open(out_dir + "/wrong_cases.txt", "w+")
332 | 
333 |                 eval_file.write("Dataset: " + data_file + "\n")
334 |                 eval_file.write(run_type + "\n")
335 |                 eval_file.write("Stopped at " + str(current_step / num_batches_per_epoch) + "\n")
336 |                 eval_file.write("Accu: " + str(accuracy) + "\n")
337 |                 write_evaluation_file(eval_file, right_file, wrong_file, labels, pres, x_test_mention,
338 |                                       x_test_entity)
339 | 
340 |                 eval_file.write("Parameters:")
341 |                 for attr, value in sorted(FLAGS.__flags.items()):
342 |                     eval_file.write("{}={}".format(attr.upper(), value) + "\n")
343 | 
344 |                 print("loss:" + str(loss))
345 |                 print("accuracy:" + str(accuracy))
346 | 
347 | 
348 | if __name__ == '__main__':
349 |     main()
350 | 


--------------------------------------------------------------------------------
/Dynamic/MT_Dynamic_Arch.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # encoding=utf-8
  3 | 
  4 | import datetime
  5 | import os
  6 | import time
  7 | import copy
  8 | 
  9 | import numpy as np
 10 | import tensorflow as tf
 11 | 
 12 | from util.util import write_evaluation_file
 13 | from DLDisambiguation.util.input_helpers import InputHelper
 14 | from MT_Dynamic_MultiGranModel import MT_Dynamic_MultiGranModel
 15 | from tensor import Tensor
 16 | 
 17 | # Parameters
 18 | # ==================================================
 19 | 
 20 | # Model Hyperparameters
 21 | tf.flags.DEFINE_integer("embedding_dim", 100, "Dimensionality of character embedding (default: 100)")
 22 | tf.flags.DEFINE_string("filter_sizes", "2,3", "Comma-separated filter sizes (default: '2,3')")
 23 | tf.flags.DEFINE_integer("num_filters", 16, "Number of filters per filter size (default: 64)")
 24 | tf.flags.DEFINE_float("dropout_keep_prob", 0.5, "Dropout keep probability (default: 0.5)")
 25 | tf.flags.DEFINE_float("l2_reg_lambda", 0.0, "L2 regularizaion lambda (default: 0.0)")
 26 | 
 27 | # Data Parameter
 28 | tf.flags.DEFINE_integer("max_sequence_len", 10, "max document length of input")
 29 | tf.flags.DEFINE_integer("max_sequence_len2", 10, "max document length of input")
 30 | tf.flags.DEFINE_integer("most_words", 300000, "Most number of words in vocab (default: 300000)")
 31 | 
 32 | # Training parameters
 33 | tf.flags.DEFINE_integer("seed", 123, "Random seed (default: 123)")
 34 | tf.flags.DEFINE_string("train_dir", "./", "Training dir root")
 35 | tf.flags.DEFINE_integer("batch_size", 128, "Batch Size (default: 64)")
 36 | tf.flags.DEFINE_integer("num_epochs", 5, "Number of training epochs")
 37 | tf.flags.DEFINE_float("eval_split", 0.1, "Use how much data for evaluating (default: 0.1)")
 38 | tf.flags.DEFINE_integer("evaluate_every", 100, "Evaluate model on dev set after this many steps (default: 100)")
 39 | tf.flags.DEFINE_integer("checkpoint_every", 100, "Save model after this many steps (default: 100)")
 40 | 
 41 | FLAGS = tf.flags.FLAGS
 42 | FLAGS._parse_flags()
 43 | print("\nParameters:")
 44 | for attr, value in sorted(FLAGS.__flags.items()):
 45 |     print("{}={}".format(attr.upper(), value))
 46 | print("")
 47 | 
 48 | 
 49 | def get_coocurrence(des_e_names, des_opr_map, x2_test, x4_test):
 50 |     res = []
 51 |     length = len(x2_test)
 52 | 
 53 |     for i in range(length):
 54 |         des = x2_test[i]
 55 |         opr = x4_test[i]
 56 |         N = 0.1
 57 | 
 58 |         if des_e_names.__contains__(des):
 59 |             index = des_e_names.index(des)
 60 |             des_opr = des_opr_map[index]
 61 |             if opr in des_opr.keys():
 62 |                 N = des_opr[str(opr)]
 63 |         res.append(N)
 64 |     return np.expand_dims(np.asarray(res), axis=1)
 65 | 
 66 | 
 67 | def load_coocurrence_matrix(filename):
 68 |     t = open(filename)
 69 |     line = t.readline()
 70 |     des_e_names = []
 71 |     des_opr_map = []
 72 | 
 73 |     while line != "":
 74 |         des, oprs = line.split("\t")[0], line.split("\t")[1]
 75 |         des_e_names.append(des)
 76 | 
 77 |         oprs_num = oprs.split("_")[:-1]
 78 |         tmp = {}
 79 |         for opr in oprs_num:
 80 |             opr_name, num = opr.split(":")[0], int(opr.split(":")[1])
 81 |             tmp[opr_name] = num
 82 |         des_opr_map.append(copy.deepcopy(tmp))
 83 |         line = t.readline()
 84 |     return des_e_names, des_opr_map
 85 | 
 86 | 
 87 | def main():
 88 |     # Load data
 89 |     print("Loading data...")
 90 |     inputH = InputHelper()
 91 | 
 92 |     train_f = os.path.join(FLAGS.train_dir, 'data/exp0803/training_dynamic_data.txt')
 93 |     dev_f = os.path.join(FLAGS.train_dir, 'data/exp0803/validation_dynamic_data.txt')
 94 |     test_f = os.path.join(FLAGS.train_dir, 'data/exp0803/test_dynamic_data.txt')
 95 | 
 96 |     our_dir = "./Tensor_files/0803_dynamic/"
 97 |     # our_dir = "./Length" + str(FLAGS.max_sequence_len) + "/"
 98 |     x_train_tensor = np.load(our_dir + "train_des.npy")
 99 |     x_dev_tensor = np.load(our_dir + "dev_des.npy")
100 |     x_test_tensor = np.load(our_dir + "test_des.npy")
101 | 
102 |     # our_dir = "./Length" + str(FLAGS.max_sequence_len2) + "/"
103 |     x_train_tensor_o = np.load(our_dir + "train_opr.npy")
104 |     x_dev_tensor_o = np.load(our_dir + "dev_opr.npy")
105 |     x_test_tensor_o = np.load(our_dir + "test_opr.npy")
106 | 
107 |     x_train_indi_o = 1 - np.load(our_dir + "train_indi_opr.npy")
108 |     x_dev_indi_o = 1 - np.load(our_dir + "dev_indi_opr.npy")
109 |     x_test_indi_o = 1 - np.load(our_dir + "test_indi_opr.npy")
110 | 
111 |     sep = "\t"
112 |     i1, x1_train, x2_train, x3_train, x4_train, y_train, y2_train = inputH.getTsvTestData_Mul_Labels_Dyna(train_f, sep,
113 |                                                                                                           FLAGS.max_sequence_len)
114 |     i2, x1_dev, x2_dev, x3_dev, x4_dev, y_dev, y2_dev = inputH.getTsvTestData_Mul_Labels_Dyna(dev_f, sep,
115 |                                                                                               FLAGS.max_sequence_len)
116 |     i3, x1_test, x2_test, x3_test, x4_test, y_test, y2_test = inputH.getTsvTestData_Mul_Labels_Dyna(test_f, sep,
117 |                                                                                                     FLAGS.max_sequence_len)
118 | 
119 |     des_e_names, des_opr_map = load_coocurrence_matrix("result.txt")
120 |     co_arr_test = get_coocurrence(des_e_names, des_opr_map, x2_test, x4_test)
121 |     co_arr_train = get_coocurrence(des_e_names, des_opr_map, x2_train, x4_train)
122 |     co_arr_val = get_coocurrence(des_e_names, des_opr_map, x2_dev, x4_dev)
123 | 
124 |     with tf.Graph().as_default():
125 | 
126 |         sess = tf.Session()
127 |         with sess.as_default():
128 |             cnn = MT_Dynamic_MultiGranModel(
129 |                 max_len1=FLAGS.max_sequence_len,
130 |                 max_len2=FLAGS.max_sequence_len2,
131 |                 filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
132 |                 num_filters=FLAGS.num_filters,
133 |                 l2_reg_lambda=FLAGS.l2_reg_lambda,
134 |             )
135 | 
136 |             # Define Training procedure
137 |             global_step = tf.Variable(0, name="global_step", trainable=False)
138 |             optimizer = tf.train.AdamOptimizer(1e-3)
139 | 
140 |             grads_and_vars = optimizer.compute_gradients(cnn.loss)
141 |             train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)
142 | 
143 |             saver = tf.train.Saver(tf.all_variables(), max_to_keep=20)
144 | 
145 |             # Keep track of gradient values and sparsity (optional)
146 |             for g, v in grads_and_vars:
147 |                 if g is not None:
148 |                     tf.summary.histogram("grad_hist/{}".format(v.name), g)
149 |                     tf.summary.scalar("grad_sparsity/{}".format(v.name), tf.nn.zero_fraction(g))
150 |                     tf.summary.histogram(v.name, v)
151 | 
152 |             # Output directory for models and summaries
153 |             timestamp = str(int(time.time()))
154 |             out_dir = os.path.abspath(os.path.join(FLAGS.train_dir, "runs", "multitask" + timestamp))
155 |             if not os.path.exists(out_dir):
156 |                 os.makedirs(out_dir)
157 |             print("Writing to {}\n".format(out_dir))
158 | 
159 |             checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
160 |             checkpoint_prefix = os.path.join(checkpoint_dir, "model")
161 |             if not os.path.exists(checkpoint_dir):
162 |                 os.makedirs(checkpoint_dir)
163 | 
164 |             # Summaries for loss and accuracy
165 |             loss_summary = tf.summary.scalar("loss", cnn.loss)
166 |             acc_summary1 = tf.summary.scalar("accuracy1", cnn.accuracy_d)
167 |             acc_summary2 = tf.summary.scalar("accuracy2", cnn.accuracy_o)
168 | 
169 |             # Train Summaries
170 |             train_summary_op = tf.summary.merge_all()
171 |             train_summary_dir = os.path.join(out_dir, "summaries", "train")
172 |             train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)
173 | 
174 |             # Dev summaries
175 |             dev_summary_op = tf.summary.merge([loss_summary, acc_summary1, acc_summary2])
176 |             dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
177 |             dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)
178 | 
179 |             # Initialize all variables
180 |             sess.run(tf.initialize_all_variables())
181 | 
182 |             def train_step(x_batch, y_batch, x_batch2, y_batch2, indi, co_arr):
183 |                 gamma = [0.5 if i == 1 else 1.0 for i in indi]
184 |                 gamma = np.asarray(gamma)
185 | 
186 |                 feed_dict = {
187 |                     cnn.input_tensor: x_batch,
188 |                     cnn.input_y_description: y_batch,
189 |                     cnn.dropout_keep_prob: FLAGS.dropout_keep_prob,
190 |                     cnn.input_tensor_o: x_batch2,
191 |                     cnn.input_y_operation: y_batch2,
192 |                     cnn.mask_opr: np.asarray(indi, dtype=float),
193 |                     cnn.gamma: gamma,
194 |                     cnn.matrix: co_arr
195 |                 }
196 |                 _, step, summaries, loss, accuracy1, accuracy2 = sess.run(
197 |                     [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy_d, cnn.accuracy_o],
198 |                     feed_dict)
199 |                 time_str = datetime.datetime.now().isoformat()
200 |                 if step % 10 == 0:
201 |                     print(
202 |                         "{}: step {}, loss {:g}, acc1 {:g}, acc2 {:g}".format(time_str, step, loss, accuracy1,
203 |                                                                               accuracy2))
204 |                 train_summary_writer.add_summary(summaries, step)
205 | 
206 |             def dev_step(x_dev, y_batch_dev, x_dev2, y_batch_dev2, indi, co_arr, writer=None):
207 |                 gamma = [0.5 if i == 1 else 1.0 for i in indi]
208 |                 gamma = np.asarray(gamma)
209 |                 feed_dict = {
210 |                     cnn.input_tensor: x_dev,
211 |                     cnn.input_y_description: y_batch_dev,
212 |                     cnn.dropout_keep_prob: 1.0,
213 |                     cnn.input_y_operation: y_batch_dev2,
214 |                     cnn.input_tensor_o: x_dev2,
215 |                     cnn.mask_opr: np.asarray(indi, dtype=float),
216 |                     cnn.gamma: gamma,
217 |                     cnn.matrix: co_arr
218 |                 }
219 |                 step, summaries, loss, accuracy1, accuracy2, pres1, pres2 = sess.run(
220 |                     [global_step, dev_summary_op, cnn.loss, cnn.accuracy_d, cnn.accuracy_o, cnn.scores_d, cnn.scores_o],
221 |                     feed_dict)
222 |                 if writer:
223 |                     writer.add_summary(summaries, step)
224 |                 return loss, accuracy1, accuracy2
225 | 
226 |             def evaluate(x_dev, y_batch_dev, x_dev2, y_batch_dev2, indi, co_arr):
227 |                 gamma = [0.5 if i == 1 else 1.0 for i in indi]
228 |                 gamma = np.asarray(gamma)
229 |                 feed_dict = {
230 |                     cnn.input_tensor: x_dev,
231 |                     cnn.input_y_description: y_batch_dev,
232 |                     cnn.dropout_keep_prob: 1.0,
233 |                     cnn.input_y_operation: y_batch_dev2,
234 |                     cnn.input_tensor_o: x_dev2,
235 |                     cnn.mask_opr: np.asarray(indi, dtype=float),
236 |                     cnn.gamma: gamma,
237 |                     cnn.matrix: co_arr
238 |                 }
239 | 
240 |                 step, summaries, loss, accuracy1, accuracy2, pres1, pres2 = sess.run(
241 |                     [global_step, dev_summary_op, cnn.loss, cnn.accuracy_d, cnn.accuracy_o, cnn.scores_d, cnn.scores_o],
242 |                     feed_dict)
243 | 
244 |                 eval_file = open(out_dir + "/evaluation.txt", "w+")
245 |                 right_file = open(out_dir + "/right_cases.txt", "w+")
246 |                 right_file2 = open(out_dir + "/right_cases_operation.txt", "w+")
247 |                 wrong_file = open(out_dir + "/wrong_cases.txt", "w+")
248 |                 wrong_file2 = open(out_dir + "/wrong_cases_operation.txt", "w+")
249 | 
250 |                 eval_file.write("Accu1: " + str(accuracy1) + "\n")
251 |                 eval_file.write("Accu2: " + str(accuracy2) + "\n")
252 | 
253 |                 predictions1 = np.argmax(pres1, 1)
254 |                 predictions2 = np.argmax(pres2, 1)
255 |                 labels1 = np.argmax(y_batch_dev, 1)
256 |                 labels2 = np.argmax(y_batch_dev2, 1)
257 | 
258 |                 def process(indi, tensor):
259 |                     tmp = []
260 |                     ll = len(indi)
261 |                     for i in range(ll):
262 |                         if indi[i] == 0:
263 |                             tmp.append(tensor[i])
264 |                     return np.asarray(tmp)
265 | 
266 |                 write_evaluation_file(eval_file, right_file, wrong_file, labels1, predictions1, x1_test, x2_test)
267 |                 write_evaluation_file(eval_file, right_file2, wrong_file2, labels2, predictions2, x3_test, x4_test,
268 |                                       indi)
269 | 
270 |                 eval_file.write("Parameters:")
271 |                 for attr, value in sorted(FLAGS.__flags.items()):
272 |                     eval_file.write("{}={}".format(attr.upper(), value) + "\n")
273 | 
274 |                 return loss, accuracy1, accuracy2
275 | 
276 |             def dev_whole(x_dev, y_dev, x_dev2, y_dev2, indi, co_dev_arr, writer=None):
277 |                 batches_dev = inputH.batch_iter(list(zip(x_dev, y_dev, co_dev_arr)), FLAGS.batch_size, 1, shuffle=False)
278 |                 batches_dev2 = inputH.batch_iter(list(zip(x_dev2, y_dev2, indi)), FLAGS.batch_size, 1, shuffle=False)
279 |                 losses = []
280 |                 accuracies1 = []
281 |                 accuracies2 = []
282 | 
283 |                 batches = zip(batches_dev, batches_dev2)
284 | 
285 |                 for batches_dev, batches_dev2 in batches:
286 |                     x_batch, y_batch, co_arr = zip(*batches_dev)
287 |                     x_batch2, y_batch2, indi = zip(*batches_dev2)
288 |                     loss, accuracy1, accuracy2 = dev_step(x_batch, y_batch, x_batch2, y_batch2, indi, co_arr, writer)
289 |                     losses.append(loss)
290 |                     accuracies1.append(accuracy1)
291 |                     accuracies2.append(accuracy2)
292 |                 return np.mean(np.array(losses)), np.mean(np.array(accuracies1)), np.mean(np.array(accuracies2))
293 | 
294 |             def overfit(dev_loss):
295 |                 n = len(dev_loss)
296 |                 if n < 5:
297 |                     return False
298 |                 for i in xrange(n - 4, n):
299 |                     if dev_loss[i] > dev_loss[i - 1]:
300 |                         return False
301 |                 return True
302 | 
303 |             # Generate batches
304 |             batches = inputH.batch_iter(
305 |                 list(zip(x_train_tensor, y_train, x_train_tensor_o, y2_train, x_train_indi_o, co_arr_train)),
306 |                 FLAGS.batch_size, FLAGS.num_epochs)
307 | 
308 |             # Training loop. For each batch...
309 |             dev_loss = []
310 |             dev_loss2 = []
311 |             # batch_d_o = zip(batches, batches2)
312 |             for batch in batches:
313 |                 x_batch, y_batch, x_batch2, y_batch2, indi, co_arr = zip(*batch)
314 | 
315 |                 train_step(x_batch, y_batch, x_batch2, y_batch2, indi, co_arr)
316 |                 current_step = tf.train.global_step(sess, global_step)
317 | 
318 |                 if current_step % FLAGS.evaluate_every == 0:
319 | 
320 |                     print("\nEvaluation:")
321 |                     loss, accuracy1, accuracy2 = dev_whole(x_dev_tensor, y_dev, x_dev_tensor_o, y2_dev, x_dev_indi_o,
322 |                                                            co_arr_val, writer=dev_summary_writer)
323 | 
324 |                     time_str = datetime.datetime.now().isoformat()
325 |                     print("{}: dev-aver, loss {:g}, acc {:g}, acc2 {:g}".format(time_str, loss, accuracy1, accuracy2))
326 |                     dev_loss.append(accuracy1)
327 |                     dev_loss2.append(accuracy2)
328 | 
329 |                     print("\nRecently accuracy:")
330 |                     print dev_loss[-10:]
331 |                     print dev_loss2[-10:]
332 | 
333 |                     if overfit(dev_loss):
334 |                         print 'Overfit!! in task1'
335 |                         break
336 |                     if overfit(dev_loss2):
337 |                         print 'Overfit!! in task2'
338 |                         break
339 |                     print("")
340 | 
341 |                 if current_step % FLAGS.checkpoint_every == 0:
342 |                     path = saver.save(sess, checkpoint_prefix, global_step=current_step)
343 |                     print("Saved model checkpoint to {}\n".format(path))
344 | 
345 |             loss, accuracy1, accuracy2 = evaluate(x_test_tensor, y_test, x_test_tensor_o, y2_test, x_test_indi_o,
346 |                                                   co_arr_test)
347 |             print(loss)
348 |             print(accuracy1)
349 |             print(accuracy2)
350 | 
351 | 
352 | if __name__ == '__main__':
353 |     main()
354 | 


--------------------------------------------------------------------------------
/MultiTask_MultiGranModel.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # encoding=utf-8
  3 | import numpy as np
  4 | import tensorflow as tf
  5 | 
  6 | 
  7 | class MultiTask_MultiGranModel(object):
  8 |     def _conv(self, name, in_, ksize, reuse=False):
  9 |         num_filters = ksize[3]
 10 | 
 11 |         with tf.variable_scope(name, reuse=reuse) as scope:
 12 |             # different CNN for different views
 13 |             W = tf.Variable(tf.truncated_normal(ksize, stddev=0.1), name="W")
 14 |             biases = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
 15 | 
 16 |             # same CNN for different views
 17 |             # W = tf.get_variable("weights", ksize, initializer=tf.contrib.layers.xavier_initializer())
 18 |             # W = tf.get_variable("weights", ksize, initializer=tf.truncated_normal_initializer(stddev=0.1))
 19 |             # biases = tf.get_variable("biases", [num_filters], initializer=tf.constant_initializer(0.1))
 20 | 
 21 |             conv = tf.nn.conv2d(in_, W, strides=[1, 1, 1, 1], padding="VALID")
 22 |             h = tf.nn.relu(tf.nn.bias_add(conv, biases), name=scope.name)
 23 | 
 24 |         return h
 25 | 
 26 |     def _maxpool(self, name, in_, ksize, strides):
 27 |         pool = tf.nn.max_pool(in_, ksize=ksize, strides=strides, padding='VALID', name=name)
 28 |         print name, pool.get_shape().as_list()
 29 |         return pool
 30 | 
 31 |     def __init__(self, max_len1, max_len2, filter_sizes, pool_sizes, filter_sizes2, pool_sizes2, num_filters,
 32 |                  l2_reg_lambda=0.0, constraint_lambda=0.0, alpha=0.5, type_CNN=2, view_num=0, view_nums=[]):
 33 |         channel_num = 4
 34 | 
 35 |         # Placeholders for input, output and dropout
 36 |         self.input_tensor = tf.placeholder(tf.float32, [None, max_len1, max_len1, channel_num], name="input_tensor_description")
 37 |         self.input_tensor_o = tf.placeholder(tf.float32, [None, max_len2, max_len2, channel_num], name="input_tensor_operation")
 38 | 
 39 |         self.input_y_description = tf.placeholder(tf.float32, [None, 2], name="input_y_description")
 40 |         self.input_y_operation = tf.placeholder(tf.float32, [None, 2], name="input_y_operation")
 41 | 
 42 |         self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
 43 |         self.matrix = tf.placeholder(tf.float32, [None, 1], name="cooccurence")
 44 |         self.constraint_lambda = constraint_lambda
 45 | 
 46 |         # Keeping track of l2 regularization loss (optional)
 47 |         l2_loss_d = tf.constant(0.0)
 48 |         l2_loss_operation = tf.constant(0.0)
 49 | 
 50 |         # Create a convolution + maxpool layer for each filter size
 51 |         pooled_outputs = []
 52 |         pooled_outputs_operation = []
 53 | 
 54 |         input_tensor = tf.expand_dims(self.input_tensor, 4)  # N x W x H x V  => N x W x H x V x C
 55 |         input_tensor = tf.transpose(input_tensor,
 56 |                                     perm=[3, 0, 1, 2, 4])  # N x W x H x V x C =>  V x N x W x H x C
 57 | 
 58 |         input_tensor_operation = tf.expand_dims(self.input_tensor_o, 4)  # N x W x H x V  => N x W x H x V x C
 59 |         input_tensor_operation = tf.transpose(input_tensor_operation,
 60 |                                               perm=[3, 0, 1, 2, 4])  # N x W x H x V x C =>  V x N x W x H x C
 61 | 
 62 |         if type_CNN == 2: # multi-view
 63 |             with tf.name_scope("CNN_Description"):
 64 |                 view_c_num = 0
 65 |                 for i in range(channel_num):
 66 |                     # set reuse True for i > 0, for weight-sharing
 67 |                     reuse_f = (i != 0)
 68 |                     with tf.variable_scope("CNN_Description", reuse=reuse_f):
 69 |                         if len(view_nums) != 0:
 70 |                             if len(view_nums) <= view_c_num or view_nums[view_c_num] != i:
 71 |                                 continue
 72 |                             else:
 73 |                                 view_c_num += 1
 74 |                         print("AHAA" + str(i) + "\n")
 75 |                         view = tf.gather(input_tensor, i)  # N x W x H x C
 76 | 
 77 |                         filter_shape1 = [filter_sizes[0], filter_sizes[0], 1, num_filters]
 78 |                         filter_shape2 = [filter_sizes[1], filter_sizes[1], num_filters, num_filters * 2]
 79 |                         p_size1 = [1, pool_sizes[0], pool_sizes[0], 1]
 80 |                         p_size2 = [1, pool_sizes[1], pool_sizes[1], 1]
 81 | 
 82 |                         conv1 = self._conv('conv1', view, filter_shape1, reuse=reuse_f)
 83 |                         pool1 = self._maxpool('pool1', conv1, ksize=p_size1, strides=[1, 1, 1, 1])
 84 | 
 85 |                         conv2 = self._conv('conv2', pool1, filter_shape2, reuse=reuse_f)
 86 |                         pool2 = self._maxpool('pool2', conv2, ksize=p_size2, strides=[1, 1, 1, 1])
 87 | 
 88 |                         dim1 = np.prod(pool2.get_shape().as_list()[1:])
 89 |                         reshape = tf.reshape(pool2, [-1, dim1])
 90 | 
 91 |                         pooled_outputs.append(reshape)
 92 | 
 93 |             with tf.name_scope("CNN_Operation"):
 94 |                 view_c_num = 0
 95 |                 for i in range(channel_num):
 96 |                     # set reuse True for i > 0, for weight-sharing
 97 |                     reuse_f = (i != 0)
 98 | 
 99 |                     with tf.variable_scope("CNN_Operation", reuse=reuse_f):
100 |                         if len(view_nums) != 0:
101 |                             if len(view_nums) <= view_c_num or view_nums[view_c_num] != i:
102 |                                 continue
103 |                             else:
104 |                                 view_c_num += 1
105 |                         print("AHAA" + str(i) + "\n")
106 |                         view = tf.gather(input_tensor_operation, i)  # N x W x H x C
107 | 
108 |                         filter_shape1 = [filter_sizes2[0], filter_sizes2[0], 1, num_filters / 2]
109 |                         filter_shape2 = [filter_sizes2[1], filter_sizes2[1], num_filters / 2, num_filters]
110 |                         p_size1 = [1, pool_sizes2[0], pool_sizes2[0], 1]
111 |                         p_size2 = [1, pool_sizes2[1], pool_sizes2[1], 1]
112 | 
113 |                         conv1 = self._conv('conv1', view, filter_shape1, reuse=reuse_f)
114 |                         pool1 = self._maxpool('pool1', conv1, ksize=p_size1, strides=[1, 1, 1, 1])
115 | 
116 |                         conv2 = self._conv('conv2', pool1, filter_shape2, reuse=reuse_f)
117 |                         pool2 = self._maxpool('pool2', conv2, ksize=p_size2, strides=[1, 1, 1, 1])
118 | 
119 |                         dim2 = np.prod(pool2.get_shape().as_list()[1:])
120 |                         reshape = tf.reshape(pool2, [-1, dim2])
121 | 
122 |                         pooled_outputs_operation.append(reshape)
123 | 
124 | 
125 |             view_num_len = len(pooled_outputs)
126 |             print("LEN:" + str(view_num_len))
127 |             with tf.name_scope("Descriptipn_view_pooling"):
128 |                 x = tf.stack(pooled_outputs)  # 4 * N * 7744
129 |                 x = tf.transpose(x, perm=[1, 2, 0])  # N * 7744 * 4
130 | 
131 |                 reshape = tf.reshape(x, [-1, view_num_len])
132 |                 print reshape.get_shape().as_list()
133 | 
134 |                 Weights = tf.Variable(tf.random_uniform([view_num_len, 1], 0.0, 1.0), name="W")
135 | 
136 |                 y_d = tf.matmul(reshape, Weights, name="view_pooling")
137 |                 y_d = tf.reshape(y_d, [-1, dim1])
138 |                 print y_d.get_shape().as_list()
139 | 
140 |             with tf.name_scope("Operation_view_pooling"):
141 |                 x = tf.stack(pooled_outputs_operation)  # 4 * N * 7744
142 |                 x = tf.transpose(x, perm=[1, 2, 0])  # N * 7744 * 4
143 |                 reshape = tf.reshape(x, [-1, view_num_len])
144 |                 print reshape.get_shape().as_list()
145 | 
146 |                 Weights = tf.Variable(tf.random_uniform([view_num_len, 1], 0.0, 1.0), name="W")
147 | 
148 |                 y_o = tf.matmul(reshape, Weights, name="view_pooling")
149 |                 y_o = tf.reshape(y_o, [-1, dim2])
150 |                 print y_o.get_shape().as_list()
151 | 
152 |         elif type_CNN == 3: # single view
153 |             with tf.name_scope("CNN_Description"):
154 |                 view = tf.gather(input_tensor, view_num)  # N x W x H x C
155 |                 filter_shape1 = [filter_sizes[0], filter_sizes[0], 1, num_filters]
156 |                 filter_shape2 = [filter_sizes[1], filter_sizes[1], num_filters, num_filters * 2]
157 |                 p_size1 = [1, pool_sizes[0], pool_sizes[0], 1]
158 |                 p_size2 = [1, pool_sizes[1], pool_sizes[1], 1]
159 | 
160 |                 conv1 = self._conv("conv1", view, filter_shape1)
161 |                 pool1 = self._maxpool('pool1', conv1, ksize=p_size1, strides=[1, 1, 1, 1])
162 |                 conv2 = self._conv('conv2', pool1, filter_shape2)
163 |                 pool2 = self._maxpool('pool2', conv2, ksize=p_size2, strides=[1, 1, 1, 1])
164 | 
165 |                 dim1 = np.prod(pool2.get_shape().as_list()[1:])
166 |                 y_d = tf.reshape(pool2, [-1, dim1])
167 | 
168 |             with tf.name_scope("CNN_Operation"):
169 |                 view = tf.gather(input_tensor_operation, view_num)  # N x W x H x C
170 |                 filter_shape1 = [filter_sizes2[0], filter_sizes2[0], 1, num_filters / 2]
171 |                 filter_shape2 = [filter_sizes2[1], filter_sizes2[1], num_filters / 2, num_filters]
172 |                 p_size1 = [1, pool_sizes2[0], pool_sizes2[0], 1]
173 |                 p_size2 = [1, pool_sizes2[1], pool_sizes2[1], 1]
174 | 
175 |                 conv1 = self._conv('conv1', view, filter_shape1)
176 |                 pool1 = self._maxpool('pool1', conv1, ksize=p_size1, strides=[1, 1, 1, 1])
177 | 
178 |                 conv2 = self._conv('conv2', pool1, filter_shape2)
179 |                 pool2 = self._maxpool('pool2', conv2, ksize=p_size2, strides=[1, 1, 1, 1])
180 | 
181 |                 dim2 = np.prod(pool2.get_shape().as_list()[1:])
182 |                 y_o = tf.reshape(pool2, [-1, dim2])
183 | 
184 |         else: # single CNN
185 |             with tf.name_scope("CNN_Description"):
186 |                 filter_shape1 = [filter_sizes[0], filter_sizes[0], 4, num_filters]
187 |                 filter_shape2 = [filter_sizes[1], filter_sizes[1], num_filters, num_filters * 2]
188 |                 p_size1 = [1, pool_sizes[0], pool_sizes[0], 1]
189 |                 p_size2 = [1, pool_sizes[1], pool_sizes[1], 1]
190 | 
191 |                 conv1 = self._conv("conv1", self.input_tensor, filter_shape1)
192 |                 pool1 = self._maxpool('pool1', conv1, ksize=p_size1, strides=[1, 1, 1, 1])
193 |                 conv2 = self._conv('conv2', pool1, filter_shape2)
194 |                 pool2 = self._maxpool('pool2', conv2, ksize=p_size2, strides=[1, 1, 1, 1])
195 | 
196 |                 dim1 = np.prod(pool2.get_shape().as_list()[1:])
197 |                 y_d = tf.reshape(pool2, [-1, dim1])
198 | 
199 |             with tf.name_scope("CNN_Operation"):
200 |                 filter_shape1 = [filter_sizes2[0], filter_sizes2[0], 4, num_filters / 2]
201 |                 filter_shape2 = [filter_sizes2[1], filter_sizes2[1], num_filters / 2, num_filters]
202 |                 p_size1 = [1, pool_sizes2[0], pool_sizes2[0], 1]
203 |                 p_size2 = [1, pool_sizes2[1], pool_sizes2[1], 1]
204 | 
205 |                 conv1 = self._conv('conv1', self.input_tensor_o, filter_shape1)
206 |                 pool1 = self._maxpool('pool1', conv1, ksize=p_size1, strides=[1, 1, 1, 1])
207 | 
208 |                 conv2 = self._conv('conv2', pool1, filter_shape2)
209 |                 pool2 = self._maxpool('pool2', conv2, ksize=p_size2, strides=[1, 1, 1, 1])
210 | 
211 |                 dim2 = np.prod(pool2.get_shape().as_list()[1:])
212 |                 y_o = tf.reshape(pool2, [-1, dim2])
213 | 
214 |         # Add dropout
215 |         with tf.name_scope("dropout"):
216 |             self.h_drop_d = tf.nn.dropout(y_d, self.dropout_keep_prob, name="hidden_output_description_drop")
217 |             self.h_drop_o = tf.nn.dropout(y_o, self.dropout_keep_prob, name="hidden_output_operation_drop")
218 |             print self.h_drop_d.get_shape().as_list()
219 |             print self.h_drop_o.get_shape().as_list()
220 | 
221 |         with tf.name_scope("FC"):
222 |             dim = min(int(dim1 / 2), int(dim2 / 2))
223 |             print("FC DIM:" + str(dim) + "\n")
224 |             W1 = tf.Variable(name="W1", initial_value=tf.truncated_normal(shape=[dim1, dim], stddev=0.1))
225 |             b1 = tf.Variable(tf.constant(0.1, shape=[dim]), name="b1")
226 | 
227 |             self.fc_d = tf.nn.relu(tf.matmul(self.h_drop_d, W1) + b1)
228 |             self.fc_drop_d = tf.nn.dropout(self.fc_d, self.dropout_keep_prob)
229 | 
230 |             W2 = tf.Variable(name="W2", initial_value=tf.truncated_normal(shape=[dim2, dim], stddev=0.1))
231 |             b2 = tf.Variable(tf.constant(0.1, shape=[dim]), name="b2")
232 | 
233 |             self.fc_o = tf.nn.relu(tf.matmul(self.h_drop_o, W2) + b2)
234 |             self.fc_drop_o = tf.nn.dropout(self.fc_o, self.dropout_keep_prob)
235 | 
236 |         # Share Layer Construction
237 |         with tf.name_scope("Multitask"):
238 | 
239 |             self.shared_layer = tf.add(alpha * self.fc_drop_d, (1 - alpha) * self.fc_drop_o, name="Shared_layer")
240 |             # self.shared_layer = tf.div(tf.add(self.h_drop_d, self.h_drop_o), 2, name="Shared_layer")
241 |             print self.shared_layer.get_shape().as_list()
242 | 
243 |             W1 = tf.get_variable(name="tt1_W", shape=[dim],
244 |                                  initializer=tf.truncated_normal_initializer(stddev=0.1))
245 |             W2 = tf.get_variable(name="st1_W", shape=[dim],
246 |                                  initializer=tf.truncated_normal_initializer(stddev=0.1))
247 |             W3 = tf.get_variable(name="st2_W", shape=[dim],
248 |                                  initializer=tf.truncated_normal_initializer(stddev=0.1))
249 |             W4 = tf.get_variable(name="tt2_W", shape=[dim],
250 |                                  initializer=tf.truncated_normal_initializer(stddev=0.1))
251 | 
252 |             self.task1_r = tf.add(tf.multiply(self.shared_layer, W2), tf.multiply(self.fc_drop_d, W1),
253 |                                   name="description_r")
254 |             self.task2_r = tf.add(tf.multiply(self.shared_layer, W3), tf.multiply(self.fc_drop_o, W4),
255 |                                   name="operation_r")
256 |             print self.task1_r.get_shape().as_list()
257 | 
258 |         with tf.name_scope("FC2"):
259 |             W1 = tf.Variable(name="W1", initial_value=tf.truncated_normal(shape=[dim, dim / 2], stddev=0.1))
260 |             b1 = tf.Variable(tf.constant(0.1, shape=[dim / 2]), name="b1")
261 | 
262 |             self.task1_representation = tf.nn.relu(tf.matmul(self.task1_r, W1) + b1)
263 |             self.task1_representation = tf.nn.dropout(self.task1_representation, self.dropout_keep_prob)
264 | 
265 |             W2 = tf.Variable(name="W2", initial_value=tf.truncated_normal(shape=[dim, dim / 2], stddev=0.1))
266 |             b2 = tf.Variable(tf.constant(0.1, shape=[dim / 2]), name="b2")
267 | 
268 |             self.task2_representation = tf.nn.relu(tf.matmul(self.task2_r, W2) + b2)
269 |             self.task2_representation = tf.nn.dropout(self.task2_representation, self.dropout_keep_prob)
270 | 
271 |         # Final (unnormalized) scores and predictions
272 |         with tf.name_scope("output"):
273 |             W_d = tf.get_variable(name="W_d", shape=[dim / 2, 2],
274 |                                   initializer=tf.truncated_normal_initializer(stddev=0.1))
275 |             b_d = tf.Variable(tf.constant(0.1, shape=[2]), name="b_d")
276 | 
277 |             l2_loss_d += tf.nn.l2_loss(W_d)
278 |             l2_loss_d += tf.nn.l2_loss(b_d)
279 | 
280 |             W_o = tf.get_variable(name="W_o", shape=[dim / 2, 2],
281 |                                   initializer=tf.truncated_normal_initializer(stddev=0.1))
282 |             b_o = tf.Variable(tf.constant(0.1, shape=[2]), name="b_o")
283 | 
284 |             l2_loss_operation += tf.nn.l2_loss(W_o)
285 |             l2_loss_operation += tf.nn.l2_loss(b_o)
286 | 
287 |             self.scores_d = tf.nn.xw_plus_b(self.task1_representation, W_d, b_d, name="scores1")
288 |             self.scores_o = tf.nn.xw_plus_b(self.task2_representation, W_o, b_o, name="scores2")
289 | 
290 |             self.relation_d = tf.nn.softmax(self.scores_d, name="relation1")
291 |             self.relation_o = tf.nn.softmax(self.scores_o, name="relation2")
292 | 
293 |             self.predictions_d = tf.argmax(self.scores_d, 1, name="predictions1")
294 |             self.predictions_o = tf.argmax(self.scores_o, 1, name="predictions2")
295 | 
296 |         # Calculate Mean cross-entropy loss
297 |         with tf.name_scope("loss"):
298 |             losses1 = tf.nn.softmax_cross_entropy_with_logits(logits=self.scores_d, labels=self.input_y_description)
299 |             losses2 = tf.nn.softmax_cross_entropy_with_logits(logits=self.scores_o, labels=self.input_y_operation)
300 | 
301 |             gap = tf.reduce_sum(tf.square(self.relation_d - self.relation_o), axis=1, keep_dims=True)
302 |             constraints = tf.multiply(self.matrix, gap)
303 |             self.constraints = tf.identity(tf.reduce_mean(constraints), name="constraints")
304 | 
305 |             self.loss = tf.reduce_mean(losses1) + tf.reduce_mean(losses2) + l2_reg_lambda * (
306 |                 l2_loss_d + l2_loss_operation) + self.constraint_lambda * tf.reduce_mean(constraints)
307 |             self.loss = tf.identity(self.loss, name="loss")
308 | 
309 |         # Accuracy
310 |         with tf.name_scope("accuracy"):
311 |             correct_predictions_d = tf.equal(self.predictions_d, tf.argmax(self.input_y_description, 1))
312 |             correct_predictions_o = tf.equal(self.predictions_o, tf.argmax(self.input_y_operation, 1))
313 |             self.accuracy_d = tf.reduce_mean(tf.cast(correct_predictions_d, "float"), name="accuracy_d")
314 |             self.accuracy_o = tf.reduce_mean(tf.cast(correct_predictions_o, "float"), name="accuracy_o")
315 | 


--------------------------------------------------------------------------------
/Multi_task_Arch.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # encoding=utf-8
  3 | 
  4 | import datetime
  5 | import os
  6 | import time
  7 | import copy
  8 | 
  9 | import numpy as np
 10 | import tensorflow as tf
 11 | 
 12 | from util.util import write_evaluation_file
 13 | from DLDisambiguation.util.input_helpers import InputHelper
 14 | from MultiTask_MultiGranModel import MultiTask_MultiGranModel
 15 | from tensor import Tensor
 16 | 
 17 | # Parameters
 18 | # ==================================================
 19 | 
 20 | # Model Hyperparameters
 21 | tf.flags.DEFINE_integer("embedding_dim", 100, "Dimensionality of character embedding (default: 100)")
 22 | tf.flags.DEFINE_string("filter_sizes", "4,3", "Comma-separated filter sizes (default: '2,3')")
 23 | tf.flags.DEFINE_string("filter_sizes2", "3,3", "Comma-separated filter sizes (default: '2,3')")
 24 | tf.flags.DEFINE_string("pool_sizes", "2,2", "Comma-separated filter sizes (default: '2,3')")
 25 | tf.flags.DEFINE_string("pool_sizes2", "3,3", "Comma-separated filter sizes (default: '2,3')")
 26 | tf.flags.DEFINE_string("view_nums", "0,1,2,3", "view combination")
 27 | 
 28 | tf.flags.DEFINE_integer("num_filters", 16, "Number of filters per filter size (default: 64)")
 29 | tf.flags.DEFINE_float("dropout_keep_prob", 0.5, "Dropout keep probability (default: 0.5)")
 30 | tf.flags.DEFINE_float("l2_reg_lambda", 100, "L2 regularizaion lambda (default: 0.0)")
 31 | tf.flags.DEFINE_float("con_lambda", 0.1, "constraint regularizaion lambda (default: 0.0)")
 32 | tf.flags.DEFINE_float("alpha", 0.5, "parameter for shared layer")
 33 | tf.flags.DEFINE_integer("type_CNN", 2, "type of CNN")
 34 | tf.flags.DEFINE_integer("view_num", 3, "type of CNN")
 35 | tf.flags.DEFINE_boolean("early_stop", True, "whether early stopping is used")
 36 | tf.flags.DEFINE_integer("early_stop_num", 11, "number of epoch in early stopping")
 37 | 
 38 | # Data Parameter
 39 | tf.flags.DEFINE_integer("max_sequence_len", 10, "max document length of input")
 40 | tf.flags.DEFINE_integer("max_sequence_len2", 20, "max document length of input")
 41 | tf.flags.DEFINE_integer("most_words", 300000, "Most number of words in vocab (default: 300000)")
 42 | 
 43 | # Training parameters
 44 | tf.flags.DEFINE_integer("seed", 123, "Random seed (default: 123)")
 45 | tf.flags.DEFINE_string("train_dir", "./", "Training dir root")
 46 | tf.flags.DEFINE_integer("batch_size", 128, "Batch Size (default: 64)")
 47 | tf.flags.DEFINE_integer("num_epochs", 50, "Number of training epochs")
 48 | tf.flags.DEFINE_float("eval_split", 0.1, "Use how much data for evaluating (default: 0.1)")
 49 | tf.flags.DEFINE_integer("evaluate_every", 100, "Evaluate model on dev set after this many steps (default: 100)")
 50 | tf.flags.DEFINE_integer("checkpoint_every", 100, "Save model after this many steps (default: 100)")
 51 | 
 52 | FLAGS = tf.flags.FLAGS
 53 | FLAGS._parse_flags()
 54 | print("\nParameters:")
 55 | for attr, value in sorted(FLAGS.__flags.items()):
 56 |     print("{}={}".format(attr.upper(), value))
 57 | print("")
 58 | 
 59 | 
 60 | def generate_Tensor(mention, entity, mention2, entity2, mention3, entity3, max_len, task_n):
 61 |     lstm_dir = "Description1501058401" if task_n == 1 else "Operation1501000120"
 62 |     bilstm_dir = os.path.join("./Sentence_Modeling/runs", lstm_dir)
 63 | 
 64 |     tensor = Tensor(mention + mention2 + mention3, entity + entity2 + entity3, len(mention + mention2 + mention3),
 65 |                     max_len, task_n, bilstm_dir).get_tensor()
 66 |     tensor = tensor.transpose((0, 2, 3, 1))
 67 | 
 68 |     g1 = len(mention)
 69 |     g2 = len(mention + mention2)
 70 |     return tensor[:g1], tensor[g1:g2], tensor[g2:]
 71 | 
 72 | 
 73 | def prepara_tensor_y(inputH, training_path, dev_path, test_path, max_len):
 74 |     sep = "\t"
 75 |     x1_train, x2_train, x3_train, x4_train, y_train, y2_train = inputH.getTsvTestData_Mul(training_path, sep, max_len)
 76 |     x1_dev, x2_dev, x3_dev, x4_dev, y_dev, y2_dev = inputH.getTsvTestData_Mul(dev_path, sep, max_len)
 77 |     x1_test, x2_test, x3_test, x4_test, y_test, y2_test = inputH.getTsvTestData_Mul(test_path, sep, max_len)
 78 | 
 79 |     x_train_tensor, x_dev_tensor, x_test_tensor = generate_Tensor(x1_train, x2_train, x1_dev, x2_dev, x1_test, x2_test,
 80 |                                                                   max_len, 1)
 81 | 
 82 |     x_train_tensor_o, x_dev_tensor_o, x_test_tensor_o = generate_Tensor(x3_train, x4_train, x3_dev, x4_dev, x3_test,
 83 |                                                                         x4_test, max_len, 2)
 84 | 
 85 |     np.save("train_des", x_train_tensor)
 86 |     np.save("dev_des", x_dev_tensor)
 87 |     np.save("test_des", x_test_tensor)
 88 | 
 89 |     np.save("train_opr", x_train_tensor_o)
 90 |     np.save("dev_opr", x_dev_tensor_o)
 91 |     np.save("test_opr", x_test_tensor_o)
 92 | 
 93 |     return x_train_tensor, y_train, x_dev_tensor, y_dev, x_test_tensor, y_test, \
 94 |            x_train_tensor_o, y2_train, x_dev_tensor_o, y2_dev, x_test_tensor_o, y2_test
 95 | 
 96 | 
 97 | def get_coocurrence(des_e_names, des_opr_map, x2_test, x4_test, N):
 98 |     res = []
 99 |     length = len(x2_test)
100 | 
101 |     for i in range(length):
102 |         des = x2_test[i]
103 |         opr = x4_test[i]
104 | 
105 |         if des_e_names.__contains__(des):
106 |             index = des_e_names.index(des)
107 |             des_opr = des_opr_map[index]
108 |             if opr in des_opr.keys():
109 |                 N = des_opr[str(opr)]
110 |         res.append(N)
111 |     return np.expand_dims(np.asarray(res), axis=1)
112 | 
113 | 
114 | def load_coocurrence_matrix(filename):
115 |     t = open(filename)
116 |     line = t.readline()
117 |     des_e_names = []
118 |     des_opr_map = []
119 | 
120 |     while line != "":
121 |         des, oprs = line.split("\t")[0], line.split("\t")[1]
122 |         des_e_names.append(des)
123 | 
124 |         oprs_num = oprs.split("_")[:-1]
125 |         tmp = {}
126 |         for opr in oprs_num:
127 |             opr_name, num = opr.split(":")[0], int(opr.split(":")[1])
128 |             tmp[opr_name] = num
129 |         des_opr_map.append(copy.deepcopy(tmp))
130 |         line = t.readline()
131 |     return des_e_names, des_opr_map
132 | 
133 | 
134 | def main():
135 |     # Load data
136 |     print("Loading data...")
137 |     inputH = InputHelper()
138 | 
139 |     date_f = "0823"
140 |     train_f = "./data/exp" + date_f + "/data_augment_train.txt"
141 |     test_f = "./data/exp" + date_f + "/data_augment_test.txt"
142 |     dev_f = test_f
143 | 
144 |     our_dir = "./Tensor_files/" + date_f + "/Length" + str(FLAGS.max_sequence_len) + "/"
145 |     x_train_tensor = np.load(our_dir + "train_des.npy")
146 |     # x_dev_tensor = np.load(our_dir + "dev_des.npy")
147 |     x_test_tensor = np.load(our_dir + "test_des.npy")
148 |     x_dev_tensor = x_test_tensor
149 | 
150 |     our_dir = "./Tensor_files/" + date_f + "/Length" + str(FLAGS.max_sequence_len2) + "/"
151 |     x_train_tensor_o = np.load(our_dir + "train_opr.npy")
152 |     # x_dev_tensor_o = np.load(our_dir + "dev_opr.npy")
153 |     x_test_tensor_o = np.load(our_dir + "test_opr.npy")
154 |     x_dev_tensor_o = x_test_tensor_o
155 | 
156 |     def normalize(a):
157 |         amin, amax = a.min(), a.max()  # 求最大最小值
158 |         a = (a - amin) / (amax - amin)  # (矩阵元素-最小值)/(最大值-最小值)
159 |         return a
160 | 
161 |     def normalize_tensor(t):
162 |         t[:, :, :, 0] = normalize(t[:, :, :, 0])
163 |         t[:, :, :, 1] = normalize(t[:, :, :, 1])
164 |         t[:, :, :, 2] = normalize(t[:, :, :, 2])
165 |         t[:, :, :, 3] = normalize(t[:, :, :, 3])
166 |         return t
167 | 
168 |     x_test_tensor[:, :, :, 3] = normalize(x_test_tensor[:, :, :, 3])
169 |     x_train_tensor[:, :, :, 3] = normalize(x_train_tensor[:, :, :, 3])
170 |     x_test_tensor_o[:, :, :, 3] = normalize(x_test_tensor_o[:, :, :, 3])
171 |     x_train_tensor_o[:, :, :, 3] = normalize(x_train_tensor_o[:, :, :, 3])
172 |     # x_test_tensor = normalize_tensor(x_test_tensor)
173 |     # x_test_tensor_o = normalize_tensor(x_test_tensor_o)
174 |     # x_train_tensor = normalize_tensor(x_train_tensor)
175 |     # x_train_tensor_o = normalize_tensor(x_train_tensor_o)
176 | 
177 |     sep = "\t"
178 |     x1_train, x2_train, x3_train, x4_train, y_train, y2_train = inputH.getTsvTestData_Mul_Labels(train_f, sep,
179 |                                                                                                  FLAGS.max_sequence_len)
180 |     # x1_dev, x2_dev, x3_dev, x4_dev, y_dev, y2_dev = inputH.getTsvTestData_Mul_Labels(dev_f, sep, FLAGS.max_sequence_len)
181 |     x1_test, x2_test, x3_test, x4_test, y_test, y2_test = inputH.getTsvTestData_Mul_Labels(test_f, sep,
182 |                                                                                            FLAGS.max_sequence_len)
183 |     x1_dev, x2_dev, x3_dev, x4_dev, y_dev, y2_dev = x1_test, x2_test, x3_test, x4_test, y_test, y2_test
184 | 
185 |     des_e_names, des_opr_map = load_coocurrence_matrix("coorrence_file.txt")
186 |     N_default = 0.01
187 |     co_arr_test = get_coocurrence(des_e_names, des_opr_map, x2_test, x4_test, N_default)
188 |     co_arr_train = get_coocurrence(des_e_names, des_opr_map, x2_train, x4_train, N_default)
189 |     # co_arr_val = get_coocurrence(des_e_names, des_opr_map, x2_dev, x4_dev)
190 |     co_arr_val = co_arr_test
191 | 
192 |     with tf.Graph().as_default():
193 | 
194 |         sess = tf.Session()
195 |         with sess.as_default():
196 |             cnn = MultiTask_MultiGranModel(
197 |                 max_len1=FLAGS.max_sequence_len,
198 |                 max_len2=FLAGS.max_sequence_len2,
199 |                 filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
200 |                 filter_sizes2=list(map(int, FLAGS.filter_sizes2.split(","))),
201 |                 pool_sizes=list(map(int, FLAGS.pool_sizes.split(","))),
202 |                 pool_sizes2=list(map(int, FLAGS.pool_sizes2.split(","))),
203 |                 num_filters=FLAGS.num_filters,
204 |                 l2_reg_lambda=FLAGS.l2_reg_lambda,
205 |                 constraint_lambda=FLAGS.con_lambda,
206 |                 alpha=FLAGS.alpha,
207 |                 type_CNN=FLAGS.type_CNN,
208 |                 view_num=FLAGS.view_num,
209 |                 view_nums=list(map(int, FLAGS.view_nums.split(",")))
210 |             )
211 | 
212 |             # Define Training procedure
213 |             global_step = tf.Variable(0, name="global_step", trainable=False)
214 |             optimizer = tf.train.AdamOptimizer(1e-3)
215 | 
216 |             grads_and_vars = optimizer.compute_gradients(cnn.loss)
217 |             train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)
218 | 
219 |             saver = tf.train.Saver(tf.all_variables(), max_to_keep=20)
220 | 
221 |             # Keep track of gradient values and sparsity (optional)
222 |             for g, v in grads_and_vars:
223 |                 if g is not None:
224 |                     tf.summary.histogram("grad_hist/{}".format(v.name), g)
225 |                     tf.summary.scalar("grad_sparsity/{}".format(v.name), tf.nn.zero_fraction(g))
226 |                     tf.summary.histogram(v.name, v)
227 | 
228 |             # Output directory for models and summaries
229 |             timestamp = str(int(time.time()))
230 |             out_dir = os.path.abspath(os.path.join(FLAGS.train_dir, "runs", "alpha_E", "multitask" + timestamp))
231 |             if not os.path.exists(out_dir):
232 |                 os.makedirs(out_dir)
233 |             print("Writing to {}\n".format(out_dir))
234 | 
235 |             checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
236 |             checkpoint_prefix = os.path.join(checkpoint_dir, "model")
237 |             if not os.path.exists(checkpoint_dir):
238 |                 os.makedirs(checkpoint_dir)
239 | 
240 |             # Summaries for loss and accuracy
241 |             loss_summary = tf.summary.scalar("loss", cnn.loss)
242 |             constraint_summary = tf.summary.scalar("constraints", cnn.constraints)
243 |             acc_summary1 = tf.summary.scalar("accuracy1", cnn.accuracy_d)
244 |             acc_summary2 = tf.summary.scalar("accuracy2", cnn.accuracy_o)
245 | 
246 |             # Train Summaries
247 |             train_summary_op = tf.summary.merge_all()
248 |             train_summary_dir = os.path.join(out_dir, "summaries", "train")
249 |             train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)
250 | 
251 |             # Dev summaries
252 |             dev_summary_op = tf.summary.merge([loss_summary, acc_summary1, acc_summary2, constraint_summary])
253 |             dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
254 |             dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)
255 | 
256 |             # Initialize all variables
257 |             sess.run(tf.initialize_all_variables())
258 | 
259 |             def train_step(x_batch, y_batch, x_batch2, y_batch2, co_arr):
260 |                 feed_dict = {
261 |                     cnn.input_tensor: x_batch,
262 |                     cnn.input_y_description: y_batch,
263 |                     cnn.dropout_keep_prob: FLAGS.dropout_keep_prob,
264 |                     cnn.input_tensor_o: x_batch2,
265 |                     cnn.input_y_operation: y_batch2,
266 |                     cnn.matrix: co_arr,
267 |                 }
268 |                 _, step, summaries, loss, accuracy1, accuracy2 = sess.run(
269 |                     [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy_d, cnn.accuracy_o],
270 |                     feed_dict)
271 |                 time_str = datetime.datetime.now().isoformat()
272 |                 if step % 10 == 0:
273 |                     print(
274 |                         "{}: step {}, loss {:g}, acc1 {:g}, acc2 {:g}".format(time_str, step, loss, accuracy1,
275 |                                                                               accuracy2))
276 |                 train_summary_writer.add_summary(summaries, step)
277 |                 return accuracy1, accuracy2, loss
278 | 
279 |             def dev_step(x_dev, y_batch_dev, x_dev2, y_batch_dev2, co_arr, writer=None):
280 |                 feed_dict = {
281 |                     cnn.input_tensor: x_dev,
282 |                     cnn.input_y_description: y_batch_dev,
283 |                     cnn.dropout_keep_prob: 1.0,
284 |                     cnn.input_y_operation: y_batch_dev2,
285 |                     cnn.input_tensor_o: x_dev2,
286 |                     cnn.matrix: co_arr
287 |                 }
288 |                 step, summaries, loss, accuracy1, accuracy2, pres1, pres2 = sess.run(
289 |                     [global_step, dev_summary_op, cnn.loss, cnn.accuracy_d, cnn.accuracy_o, cnn.scores_d, cnn.scores_o],
290 |                     feed_dict)
291 |                 if writer:
292 |                     writer.add_summary(summaries, step)
293 |                 return loss, accuracy1, accuracy2
294 | 
295 |             def evaluate(x_dev, y_batch_dev, x_dev2, y_batch_dev2, co_arr):
296 |                 feed_dict = {
297 |                     cnn.input_tensor: x_dev,
298 |                     cnn.input_y_description: y_batch_dev,
299 |                     cnn.dropout_keep_prob: 1.0,
300 |                     cnn.input_y_operation: y_batch_dev2,
301 |                     cnn.input_tensor_o: x_dev2,
302 |                     cnn.matrix: co_arr
303 |                 }
304 | 
305 |                 step, summaries, loss, accuracy1, accuracy2, pres1, pres2 = sess.run(
306 |                     [global_step, dev_summary_op, cnn.loss, cnn.accuracy_d, cnn.accuracy_o, cnn.scores_d, cnn.scores_o],
307 |                     feed_dict)
308 | 
309 |                 eval_file = open(out_dir + "/evaluation.txt", "w+")
310 |                 right_file = open(out_dir + "/right_cases.txt", "w+")
311 |                 right_file2 = open(out_dir + "/right_cases_operation.txt", "w+")
312 |                 wrong_file = open(out_dir + "/wrong_cases.txt", "w+")
313 |                 wrong_file2 = open(out_dir + "/wrong_cases_operation.txt", "w+")
314 | 
315 |                 eval_file.write("Accu1: " + str(accuracy1) + "\n")
316 |                 eval_file.write("Accu2: " + str(accuracy2) + "\n")
317 | 
318 |                 predictions1 = np.argmax(pres1, 1)
319 |                 predictions2 = np.argmax(pres2, 1)
320 |                 labels1 = np.argmax(y_batch_dev, 1)
321 |                 labels2 = np.argmax(y_batch_dev2, 1)
322 |                 write_evaluation_file(eval_file, right_file, wrong_file, labels1, predictions1, x1_test, x2_test)
323 |                 write_evaluation_file(eval_file, right_file2, wrong_file2, labels2, predictions2, x3_test, x4_test)
324 | 
325 |                 eval_file.write("Parameters:")
326 |                 for attr, value in sorted(FLAGS.__flags.items()):
327 |                     eval_file.write("{}={}".format(attr.upper(), value) + "\n")
328 | 
329 |                 return loss, accuracy1, accuracy2
330 | 
331 |             def dev_whole(x_dev, y_dev, x_dev2, y_dev2, co_arr, writer=None):
332 |                 batches_dev = inputH.batch_iter(list(zip(x_dev, y_dev, co_arr)), FLAGS.batch_size, 1, shuffle=False)
333 |                 batches_dev2 = inputH.batch_iter(list(zip(x_dev2, y_dev2)), FLAGS.batch_size, 1, shuffle=False)
334 |                 losses = []
335 |                 accuracies1 = []
336 |                 accuracies2 = []
337 | 
338 |                 batches = zip(batches_dev, batches_dev2)
339 | 
340 |                 for batches_dev, batches_dev2 in batches:
341 |                     x_batch, y_batch, co_arr_ = zip(*batches_dev)
342 |                     x_batch2, y_batch2 = zip(*batches_dev2)
343 |                     loss, accuracy1, accuracy2 = dev_step(x_batch, y_batch, x_batch2, y_batch2, co_arr_)
344 |                     losses.append(loss)
345 |                     accuracies1.append(accuracy1)
346 |                     accuracies2.append(accuracy2)
347 | 
348 |                 return np.mean(np.array(losses)), np.mean(np.array(accuracies1)), np.mean(np.array(accuracies2))
349 | 
350 |             def overfit(dev_loss, accu):
351 |                 num = FLAGS.early_stop_num
352 |                 n = len(dev_loss)
353 |                 if n < num:
354 |                     return False
355 | 
356 |                 for i in xrange(n - num, n - 1):
357 |                     if dev_loss[i] < accu:
358 |                         return False
359 |                 print(dev_loss)
360 |                 print(accu)
361 |                 return True
362 | 
363 |             # Generate batches
364 |             batches = inputH.batch_iter(list(zip(x_train_tensor, y_train, x_train_tensor_o, y2_train, co_arr_train)),
365 |                                         FLAGS.batch_size, FLAGS.num_epochs)
366 | 
367 |             # Training loop. For each batch...
368 |             dev_loss = []
369 |             train_loss = []
370 | 
371 |             train_accu = []
372 |             train_accu2 = []
373 |             dev_accu = []
374 |             dev_accu2 = []
375 |             # batch_d_o = zip(batches, batches2)
376 |             optimum_accu1 = 0
377 |             optimum_accu2 = 0
378 |             data_num = len(y_train)
379 |             num_batches_per_epoch = int(data_num / FLAGS.batch_size)
380 |             # t = num_batches_per_epoch / 2
381 |             optimum_loss = 1000
382 | 
383 |             for batch in batches:
384 |                 x_batch, y_batch, x_batch2, y_batch2, co_arr_batch = zip(*batch)
385 | 
386 |                 acc1, acc2, loss_train = train_step(x_batch, y_batch, x_batch2, y_batch2, co_arr_batch)
387 |                 train_accu.append(acc1)
388 |                 train_accu2.append(acc2)
389 |                 train_loss.append(loss_train)
390 |                 current_step = tf.train.global_step(sess, global_step)
391 |                 if current_step % num_batches_per_epoch == 0:
392 | 
393 |                     print("\nEvaluation:")
394 |                     loss, accuracy1, accuracy2 = dev_whole(x_dev_tensor, y_dev, x_dev_tensor_o, y2_dev, co_arr_val,
395 |                                                            writer=dev_summary_writer)
396 | 
397 |                     summary = tf.Summary()
398 | 
399 |                     summary.value.add(tag="Accuracy_Dev", simple_value=accuracy1)
400 |                     summary.value.add(tag="Accuracy2_Dev", simple_value=accuracy2)
401 |                     summary.value.add(tag="Loss_Dev", simple_value=loss)
402 |                     dev_summary_writer.add_summary(summary, current_step)
403 | 
404 |                     time_str = datetime.datetime.now().isoformat()
405 |                     print("{}: dev-aver, loss {:g}, acc {:g}, acc2 {:g}".format(time_str, loss, accuracy1, accuracy2))
406 |                     dev_accu.append(accuracy1)
407 |                     dev_accu2.append(accuracy2)
408 |                     dev_loss.append(loss)
409 |                     print("\nRecently accuracy:")
410 |                     print dev_accu[-10:]
411 |                     print dev_accu2[-10:]
412 | 
413 |                     # if loss < optimum_loss:
414 |                     #     optimum_loss = loss
415 |                     #     stop_early = 0
416 |                     #     optimum_accu1 = accuracy1
417 |                     #     optimum_accu2 = accuracy2
418 |                     #     path = saver.save(sess, checkpoint_prefix, global_step=current_step)
419 |                     #     print("Saved model checkpoint to {}\n".format(path))
420 |                     # else:
421 |                     #     stop_early += 1
422 |                     #     if stop_early == 10:
423 |                     #         break
424 |                     if FLAGS.early_stop:
425 |                         if overfit(dev_accu, accuracy1) or overfit(dev_accu2, accuracy2):
426 |                             print 'Overfit!!'
427 |                             print(current_step)
428 |                             print(current_step / num_batches_per_epoch)
429 |                             break
430 |                         print("")
431 | 
432 |                     if accuracy1 > optimum_accu1 and accuracy2 > optimum_accu2:
433 |                         optimum_accu1 = accuracy1
434 |                         optimum_accu2 = accuracy2
435 |                         path = saver.save(sess, checkpoint_prefix, global_step=current_step)
436 |                         print("Saved model checkpoint to {}\n".format(path))
437 | 
438 |                     print("Optimum_accu1: " + str(optimum_accu1))
439 |                     print("Optimum_accu2: " + str(optimum_accu2))
440 | 
441 |     print("Optimum_accu1: " + str(optimum_accu1))
442 |     print("Optimum_accu2: " + str(optimum_accu2))
443 | 
444 |     import matplotlib.pyplot as plt
445 |     # def plot_plots(y1, y2, name_task, type_eval):
446 |     #     x1 = np.arange(len(y1))
447 |     #     x2 = np.arange(len(y2))
448 |     #     p1, = plt.plot(x1, y1, 'b', label="Validation")
449 |     #     p2, = plt.plot(x2, y2, 'r', label="Train")
450 |     #     plt.legend(handles=[p1, p2], numpoints=1)  # make legend
451 |     #     plt.title(name_task + "_" + type_eval)
452 |     #     plt.savefig(os.path.join(out_dir, name_task + "_" + type_eval + ".png"))
453 |     #
454 |     # plot_plots(dev_accu, train_accu, "Disease", "Accu")
455 |     # plot_plots(dev_accu2, train_accu2, "Operation", "Accu")
456 |     # plot_plots(dev_loss, train_loss, "MTL", "Loss")
457 | 
458 |     # evaluate the result with the best model
459 |     ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
460 |     checkpoint_file = ckpt.model_checkpoint_path
461 |     graph = tf.Graph()
462 | 
463 |     with graph.as_default():
464 |         sess = tf.Session()
465 |         with sess.as_default():
466 |             # Load the saved meta graph and restore variables
467 |             saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
468 |             sess.run(tf.initialize_all_variables())
469 |             saver.restore(sess, checkpoint_file)
470 | 
471 |             cooccur = graph.get_operation_by_name("cooccurence").outputs[0]
472 |             input_t1 = graph.get_operation_by_name("input_tensor_description").outputs[0]
473 |             input_t2 = graph.get_operation_by_name("input_tensor_operation").outputs[0]
474 |             input_y1 = graph.get_operation_by_name("input_y_description").outputs[0]
475 |             input_y2 = graph.get_operation_by_name("input_y_operation").outputs[0]
476 |             prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]
477 | 
478 |             loss_opr = graph.get_operation_by_name("loss/loss").outputs[0]
479 |             prediction = graph.get_operation_by_name("output/predictions1").outputs[0]
480 |             prediction2 = graph.get_operation_by_name("output/predictions2").outputs[0]
481 |             accu = graph.get_operation_by_name("accuracy/accuracy_d").outputs[0]
482 |             accu2 = graph.get_operation_by_name("accuracy/accuracy_o").outputs[0]
483 | 
484 |             loss, pres1, pres2, accuracy1, accuracy2 = sess.run(
485 |                 [loss_opr, prediction, prediction2, accu, accu2],
486 |                 {input_t1: x_test_tensor, input_y1: y_test, cooccur: co_arr_test,
487 |                  input_t2: x_test_tensor_o, input_y2: y2_test, prob: 1})
488 | 
489 |             eval_file = open(out_dir + "/evaluation.txt", "w+")
490 |             right_file = open(out_dir + "/right_cases.txt", "w+")
491 |             wrong_file = open(out_dir + "/wrong_cases.txt", "w+")
492 |             right_file2 = open(out_dir + "/right_cases_opr.txt", "w+")
493 |             wrong_file2 = open(out_dir + "/wrong_cases_opr.txt", "w+")
494 | 
495 |             eval_file.write("Accu1: " + str(accuracy1) + "\n")
496 |             eval_file.write("Accu2: " + str(accuracy2) + "\n")
497 |             # eval_file.write("Stopped at: " + str(int(current_step / num_batches_per_epoch)) + "\n")
498 |             eval_file.write("Default: " + str(N_default) + "\n")
499 | 
500 |             labels1 = np.argmax(y_test, 1)
501 |             labels2 = np.argmax(y2_test, 1)
502 |             write_evaluation_file(eval_file, right_file, wrong_file, labels1, pres1, x1_test, x2_test)
503 |             write_evaluation_file(eval_file, right_file2, wrong_file2, labels2, pres2, x3_test, x4_test)
504 | 
505 |             eval_file.write("Parameters:")
506 |             for attr, value in sorted(FLAGS.__flags.items()):
507 |                 eval_file.write("{}={}".format(attr.upper(), value) + "\n")
508 | 
509 |             print("loss:" + str(loss))
510 |             print("accuracy1:" + str(accuracy1))
511 |             print("accuracy2:" + str(accuracy2))
512 | 
513 | 
514 | if __name__ == '__main__':
515 |     main()
516 | 


--------------------------------------------------------------------------------