├── ComNet.py
├── README.md
├── loss.py
├── mlpcnn.py
├── tools
│   ├── com_reader.py
│   └── com_writer.py
└── utils
    ├── controler.py
    ├── converter.py
    ├── datafeeds.py
    └── utility.py
--------------------------------------------------------------------------------
/ComNet.py:
--------------------------------------------------------------------------------
import argparse
import logging
import json
import sys
import os

import numpy as np
import tensorflow as tf
from tensorflow.python.framework import graph_util

from utils import datafeeds
from utils import controler
from utils import utility
from utils import converter

_WORK_DIR = os.path.split(os.path.realpath(__file__))[0]
sys.path.append(os.path.join(_WORK_DIR, '../../../common'))
#import log


def load_config(config_file):
    """
    load config; every section of the JSON file is merged into one flat dict
    """
    with open(config_file, "r") as f:
        try:
            conf = json.load(f)
        except Exception:
            logging.error("load json file %s error" % config_file)
            raise
    conf_dict = {}
    for k in conf:
        conf_dict.update(conf[k])
    logging.debug("\n".join(
        ["%s=%s" % (u, conf_dict[u]) for u in conf_dict]))
    return conf_dict


def train(conf_dict):
    """
    train
    """
    training_mode = conf_dict["training_mode"]
    net = utility.import_object(
        conf_dict["net_py"], conf_dict["net_class"])(conf_dict)
    if training_mode == "pointwise":
        datafeed = datafeeds.TFPointwisePaddingData(conf_dict)
        input_l, input_r, label_y = datafeed.ops()
        pred = net.predict(input_l, input_r)
        output_prob = tf.nn.softmax(pred, -1, name="output_prob")
        loss_layer = utility.import_object(
            conf_dict["loss_py"], conf_dict["loss_class"])()
        loss = loss_layer.ops(pred, label_y)
    elif training_mode == "pairwise":
        datafeed = datafeeds.TFPairwisePaddingData(conf_dict)
        input_l, input_r, neg_input = datafeed.ops()
        pos_score = net.predict(input_l, input_r)
        output_prob = tf.identity(pos_score, name="output_prob")
        neg_score = net.predict(input_l, neg_input)
        loss_layer = utility.import_object(
            conf_dict["loss_py"], conf_dict["loss_class"])(conf_dict)
        loss = loss_layer.ops(pos_score, neg_score)
    else:
        print >> sys.stderr, "training mode not supported"
        sys.exit(1)
    # define optimizer
    lr = float(conf_dict["learning_rate"])
    optimizer = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss)

    # run_trainer
    controler.run_trainer(loss, optimizer, conf_dict)


def predict(conf_dict):
    """
    predict
    """
    net = utility.import_object(
        conf_dict["net_py"], conf_dict["net_class"])(conf_dict)
    conf_dict.update({"num_epochs": "1", "batch_size": "1",
                      "shuffle": "0", "train_file": conf_dict["test_file"]})
    test_datafeed = datafeeds.TFPointwisePaddingData(conf_dict)
    test_l, test_r, test_y = test_datafeed.ops()
    # test network
    pred = net.predict(test_l, test_r)
    controler.run_predict(pred, test_y, conf_dict)


def freeze(conf_dict):
    """
    freeze net for c api predict
    """
    model_path = conf_dict["save_path"]
    freeze_path = conf_dict["freeze_path"]
    saver = tf.train.import_meta_graph(model_path + '.meta')
    with tf.Session() as sess:
        saver.restore(sess, model_path)
        var_graph_def = tf.get_default_graph().as_graph_def()
        const_graph_def = graph_util.convert_variables_to_constants(
            sess, var_graph_def, ["output_prob"])
        with tf.gfile.GFile(freeze_path, "wb") as f:
            f.write(const_graph_def.SerializeToString())


def sim_func(query_pair):
    """
    Input:
        query_pair: a text pair, tab separated
    Returns:
        similarity: semantic similarity of the text pair

    NOTE: this helper depends on module-level objects (simnet_process,
    batch_data, executor, program, infer_feeder, fetch_targets, args) that
    are expected to be provided by the caller's environment; it is not used
    by the command-line tasks below.
    """
    simnet_process.input_pair = query_pair
    preds_list = []
    for idx, data in enumerate(batch_data()):
        output = executor.run(program, feed=infer_feeder.feed(data),
                              fetch_list=fetch_targets)
        if args.task_mode == "pairwise":
            preds_list += list(map(lambda item: str(item[0]), output[1]))
        else:
            preds_list += map(lambda item: str(np.argmax(item)), output[1])

    return float(preds_list[0])


def convert(conf_dict):
    """
    convert
    """
    converter.run_convert(conf_dict)


if __name__ == "__main__":
    # log.init_log("./log/tensorflow")
    parser = argparse.ArgumentParser()
    parser.add_argument('--task', default='train',
                        help='task: train/predict/freeze/convert, the default value is train.')
    parser.add_argument('--task_conf', default='./examples/cnn-pointwise.json',
                        help='task_conf: config file for this task')
    args = parser.parse_args()
    config = load_config(args.task_conf)
    if args.task == 'train':
        train(config)
    elif args.task == 'predict':
        predict(config)
    elif args.task == 'freeze':
        freeze(config)
    elif args.task == 'convert':
        convert(config)
    else:
        print >> sys.stderr, 'task type error.'
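
# For reference, a minimal sketch of the kind of task config this script
# expects (e.g. ./examples/cnn-pointwise.json). load_config() flattens all
# top-level sections into one dict, so the section names are arbitrary; the
# keys below are the ones read by this repo, while the concrete values are
# illustrative assumptions, not the shipped file:
#
# {
#     "net": {
#         "net_py": "./mlpcnn", "net_class": "MLPCnn",
#         "vocabulary_size": 18000, "embedding_dim": 128,
#         "num_filters": 256, "window_size": 3, "hidden_size": 128,
#         "n_class": 2,
#         "left_slots": [["left", 64]], "right_slots": [["right", 64]]
#     },
#     "loss": {"loss_py": "./loss", "loss_class": "SoftmaxWithLoss"},
#     "data": {
#         "training_mode": "pointwise",
#         "train_file": "./data/train.tfrecord",
#         "test_file": "./data/test.tfrecord",
#         "batch_size": 64, "num_epochs": 10, "shuffle": 1
#     },
#     "train": {
#         "learning_rate": 0.001, "thread_num": 4,
#         "print_iter": 100, "data_size": 100000,
#         "model_path": "./model", "model_prefix": "cnn_pointwise"
#     }
# }
#
# (run_predict additionally reads test_model_file / test_result, and freeze
# reads save_path / freeze_path.)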
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# comparator-network
A short-text matching network model based on ComNet.


1) Data preprocessing
Because of machine performance limits, and to preserve the semantic signal intact, only raw Chinese character-level features are used here.
That is, a prepared vocabulary is used to turn the pointwise text-pair/label data into index-pair/label data.
Before the data is fed to the model, sequences are padded to the configured length; over-long sequences are truncated at the head or tail, and the length is fixed at 64 (a code sketch follows at the end of this README).

2) Network structure
The com-net project provides several network structures; this experiment trains with the CNN-pointwise configuration.
The model starts with an embedding layer whose variables are randomly initialized.
The embedding layer is followed by a 1-D convolution with 256 filters. The two texts of a pair each pass through the convolution, giving query and doc tensors of shape [batch_size, 256].
A concatenation then joins the two 256-dimensional vectors into a 512-dimensional one.
The resulting [batch_size, 512] tensor passes through two fully connected layers: the first has a hidden_size of 128 and a ReLU activation; the second has as many units as there are classes, i.e. 2, and produces the logits.
The loss is the cross-entropy loss.

3) Training and testing
Training uses a batch size of 64 and runs for 10 epochs, with int((sample_count - 1) / 64) + 1 iterations per epoch.
Test data is organized exactly like the training data: each line consists of a query/doc text pair.

4) Results
With the pairwise network, the final test accuracy on the WeBank test set is 87.41%, slightly above the best result in the competition at the time, 86.89%.
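
A minimal sketch of the padding/truncation described in 1). The pad id of 0 and the head/tail switch are illustrative assumptions (tools/com_writer.py takes the pad id from the command line and keeps the first 64 ids of an over-long sequence):

```python
def pad_or_truncate(ids, seq_len=64, pad_id=0, truncate="tail"):
    """Pad ids up to seq_len with pad_id, or cut them down to seq_len."""
    if len(ids) >= seq_len:
        # drop characters from the head or the tail of an over-long sequence
        return ids[-seq_len:] if truncate == "head" else ids[:seq_len]
    return ids + [pad_id] * (seq_len - len(ids))

assert pad_or_truncate([5, 8, 2], seq_len=5) == [5, 8, 2, 0, 0]
assert pad_or_truncate(list(range(10)), seq_len=5, truncate="head") == [5, 6, 7, 8, 9]
```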
--------------------------------------------------------------------------------
/loss.py:
--------------------------------------------------------------------------------
import tensorflow as tf


class PairwiseHingeLoss(object):
    """
    a layer class: pairwise hinge loss
    """
    def __init__(self, config):
        """
        init function
        """
        self.margin = float(config["margin"])

    def ops(self, score_pos, score_neg):
        """
        operation
        """
        return tf.reduce_mean(tf.maximum(0., score_neg +
                              self.margin - score_pos))


class PairwiseLogLoss(object):
    """
    a layer class: pairwise log loss
    """
    def __init__(self, config=None):
        """
        init function
        """
        pass

    def ops(self, score_pos, score_neg):
        """
        operation: a sigmoid surrogate that penalizes neg scoring above pos
        """
        return tf.reduce_mean(tf.nn.sigmoid(score_neg - score_pos))


class SoftmaxWithLoss(object):
    """
    a layer class: softmax loss
    """
    def __init__(self):
        """
        init function
        """
        pass

    def ops(self, pred, label):
        """
        operation
        """
        return tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
            logits=pred, labels=label))
--------------------------------------------------------------------------------
/mlpcnn.py:
--------------------------------------------------------------------------------
import logging

import layers.tf_layers as layers


class MLPCnn(object):
    """
    MLP + CNN text matching network
    """
    def __init__(self, config):
        """
        init function
        """
        self.vocab_size = int(config['vocabulary_size'])
        self.emb_size = int(config['embedding_dim'])
        self.kernel_size = int(config['num_filters'])
        self.win_size = int(config['window_size'])
        self.hidden_size = int(config['hidden_size'])
        self.left_name, self.seq_len = config['left_slots'][0]
        self.right_name, self.seq_len = config['right_slots'][0]
        self.task_mode = config['training_mode']
        self.emb_layer = layers.EmbeddingLayer(self.vocab_size, self.emb_size)
        self.cnn_layer = layers.CNNLayer(self.seq_len, self.emb_size,
                                         self.win_size, self.kernel_size)
        self.relu_layer = layers.ReluLayer()
        self.concat_layer = layers.ConcatLayer()
        if self.task_mode == "pointwise":
            self.n_class = int(config['n_class'])
            self.fc1_layer = layers.FCLayer(2 * self.kernel_size, self.hidden_size)
            self.fc2_layer = layers.FCLayer(self.hidden_size, self.n_class)
        elif self.task_mode == "pairwise":
            self.fc1_layer = layers.FCLayer(self.kernel_size, self.hidden_size)
            self.cos_layer = layers.CosineLayer()
        else:
            logging.error("training mode not supported")

    def predict(self, left_slots, right_slots):
        """
        predict graph of this net
        """
        left = left_slots[self.left_name]
        right = right_slots[self.right_name]
        left_emb = self.emb_layer.ops(left)
        right_emb = self.emb_layer.ops(right)
        left_cnn = self.cnn_layer.ops(left_emb)
        right_cnn = self.cnn_layer.ops(right_emb)
        left_relu = self.relu_layer.ops(left_cnn)
        right_relu = self.relu_layer.ops(right_cnn)
        if self.task_mode == "pointwise":
            concat = self.concat_layer.ops([left_relu, right_relu],
                                           self.kernel_size * 2)
            concat_fc = self.fc1_layer.ops(concat)
            concat_relu = self.relu_layer.ops(concat_fc)
            pred = self.fc2_layer.ops(concat_relu)
        else:
            hid1_left = self.fc1_layer.ops(left_relu)
            hid1_right = self.fc1_layer.ops(right_relu)
            left_relu2 = self.relu_layer.ops(hid1_left)
            right_relu2 = self.relu_layer.ops(hid1_right)
            pred = self.cos_layer.ops(left_relu2, right_relu2)
        return pred
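
# Shape walkthrough for predict() (batch size B, sequence length L = seq_len,
# D = emb_size, K = kernel_size filters), matching the layer stack above:
#
#   ids   [B, L]    --emb_layer-->  [B, L, D]
#   emb   [B, L, D] --cnn_layer-->  [B, K]      (one vector per text)
#
# pointwise: concat -> [B, 2K] --fc1 + relu--> [B, hidden_size] --fc2--> [B, n_class]
# pairwise:  each [B, K] vector goes through the shared fc1 + relu, and the
#            cosine layer scores the two results, one similarity per pair.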
--------------------------------------------------------------------------------
/tools/com_reader.py:
--------------------------------------------------------------------------------
import logging
import json
import time
import sys
import os

import tensorflow as tf

_WORK_DIR = os.path.split(os.path.realpath(__file__))[0]
_UPPER_DIR = os.path.split(_WORK_DIR)[0]
sys.path.append(_UPPER_DIR)
from utils import datafeeds


def load_config(config_file):
    """
    load config; every section of the JSON file is merged into one flat dict
    """
    with open(config_file, "r") as f:
        try:
            conf = json.load(f)
        except Exception:
            logging.error("load json file %s error" % config_file)
            raise
    conf_dict = {}
    for k in conf:
        conf_dict.update(conf[k])
    logging.debug("\n".join(["%s=%s" % (u, conf_dict[u]) for u in conf_dict]))
    return conf_dict


def read_tfrecords_pointwise(config):
    """
    read pointwise tf records
    """
    datafeed = datafeeds.TFPointwisePaddingData(config)
    input_l, input_r, label_y = datafeed.ops()
    init_op = tf.group(tf.global_variables_initializer(),
                       tf.local_variables_initializer())
    start_time = time.time()
    sess = tf.InteractiveSession()
    sess.run(init_op)
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    step = 0
    while not coord.should_stop():
        step += 1
        try:
            left_, right_, label_ = sess.run([input_l, input_r, label_y])
            print "pointwise data read is good"
        except tf.errors.OutOfRangeError:
            print("read %d steps" % step)
            coord.request_stop()
    coord.join(threads)
    duration = time.time() - start_time
    print("duration: %ds, step: %d" % (duration, step))
    sess.close()


def read_tfrecords_pairwise(config):
    """
    read pairwise tf records
    """
    datafeed = datafeeds.TFPairwisePaddingData(config)
    query, pos, neg = datafeed.ops()
    init_op = tf.group(tf.global_variables_initializer(),
                       tf.local_variables_initializer())
    start_time = time.time()
    sess = tf.InteractiveSession()
    sess.run(init_op)
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    step = 0
    while not coord.should_stop():
        step += 1
        try:
            query_, pos_, neg_ = sess.run([query, pos, neg])
            print "pairwise data read is good"
        except tf.errors.OutOfRangeError:
            print("read %d steps" % step)
            coord.request_stop()
    coord.join(threads)
    duration = time.time() - start_time
    print("duration: %ds, step: %d" % (duration, step))
    sess.close()


def usage():
    """
    usage
    """
    print sys.argv[0], "options"
    print "options"
    print "\tconfig_path: configure file path"


if __name__ == "__main__":
    if len(sys.argv) != 2:
        usage()
        sys.exit(1)
    config_path = sys.argv[1]
    config = load_config(config_path)
    data_format_func = {"pointwise": read_tfrecords_pointwise,
                        "pairwise": read_tfrecords_pairwise}
    if config["training_mode"] in data_format_func:
        using_func = data_format_func[config["training_mode"]]
    else:
        logging.error("data_format not supported")
        sys.exit(1)
    using_func(config)
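
# Example sanity check (the config path is the default one assumed by
# ComNet.py): feed the tool the same JSON config used for training and it
# reads every batch once, then reports the step count and duration:
#
#   python tools/com_reader.py ./examples/cnn-pointwise.json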
--------------------------------------------------------------------------------
/tools/com_writer.py:
--------------------------------------------------------------------------------
import logging
import sys

import tensorflow as tf


def int_feature(v):
    """
    build an int64 feature from a list of ints
    """
    return tf.train.Feature(int64_list=tf.train.Int64List(value=v))


def write_data_to_tf(filename, func, func_args, writer):
    """
    parse every line of filename with func and write the resulting
    tf.train.Example records through writer
    """
    with open(filename) as fin_data:
        for line in fin_data:
            example = func(line, func_args)
            if not example:
                continue
            writer.write(example.SerializeToString())


def parse_text_match_pointwise_pad_data(line, func_args):
    """
    pointwise parse
    """
    seq_len = func_args[0]
    pad_id = func_args[1]
    # left_ids \t right_ids \t label
    group = line.strip().split("\t")
    if len(group) != 3:
        logging.warning(
            "the line does not conform to format(left_ids, right_ids, label)")
        return
    label = [0, 0]
    all_ids = []
    for i in [0, 1]:
        tmp_ids = [int(t) for t in group[i].strip().split(" ")]
        if len(tmp_ids) < seq_len:
            pad_len = seq_len - len(tmp_ids)
            tmp_ids = tmp_ids + [pad_id] * pad_len
        all_ids.append(tmp_ids[:seq_len])
    label[int(group[2])] = 1
    example = tf.train.Example(features=tf.train.Features(
        feature={"label": int_feature(label),
                 "left": int_feature(all_ids[0]),
                 "right": int_feature(all_ids[1])}))
    return example


def parse_text_match_pairwise_pad_data(line, func_args):
    """
    pairwise parse
    """
    seq_len = func_args[0]
    pad_id = func_args[1]
    # query_terms \t postitle_terms \t negtitle_terms
    group = line.strip().split("\t")
    if len(group) != 3:
        logging.warning(
            "the line does not conform to format(query_terms, postitle_terms, negtitle_terms)")
        return
    all_ids = []
    for i in [0, 1, 2]:
        tmp_ids = [int(t) for t in group[i].strip().split(" ")]
        if len(tmp_ids) < seq_len:
            pad_len = seq_len - len(tmp_ids)
            tmp_ids = tmp_ids + [pad_id] * pad_len
        all_ids.append(tmp_ids[:seq_len])
    example = tf.train.Example(features=tf.train.Features(
        feature={"left": int_feature(all_ids[0]),
                 "pos_right": int_feature(all_ids[1]),
                 "neg_right": int_feature(all_ids[2])}))
    return example


def usage():
    """
    usage
    """
    print sys.argv[0], "options"
    print "options"
    print "\ttype: data type, pointwise or pairwise"
    print "\tinputfile: input file path"
    print "\trecordfile: output record file"
    print "\tpad_id: pad id"
    print "\tmax_len: sequence max length"


if __name__ == "__main__":
    if len(sys.argv) != 6:
        usage()
        sys.exit(-1)
    input_data_format = sys.argv[1]
    filename = sys.argv[2]
    tfrecord_name = sys.argv[3]
    pad_id = int(sys.argv[4])
    max_len = int(sys.argv[5])
    data_format_func = {"pointwise": parse_text_match_pointwise_pad_data,
                        "pairwise": parse_text_match_pairwise_pad_data}
    if input_data_format in data_format_func:
        using_func = data_format_func[input_data_format]
    else:
        logging.error("data_format not supported")
        sys.exit(1)
    local_writer = tf.python_io.TFRecordWriter(tfrecord_name)
    write_data_to_tf(filename, using_func, [max_len, pad_id], local_writer)
    local_writer.close()
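
# Example (hypothetical paths). A pointwise input line is
# "left_ids<TAB>right_ids<TAB>label", e.g.
#
#   12 7 403 9<TAB>12 88 3<TAB>1
#
# and is converted with pad_id 0 and max_len 64 by:
#
#   python tools/com_writer.py pointwise ./data/train.ids ./data/train.tfrecord 0 64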
--------------------------------------------------------------------------------
/utils/controler.py:
--------------------------------------------------------------------------------
import sys
import time

import tensorflow as tf


def run_predict(pred, label, config):
    """
    run classification predict function handle
    """
    mean_acc = 0.0
    saver = tf.train.Saver()
    mode = config["training_mode"]
    label_index = tf.argmax(label, 1)
    if mode == "pointwise":
        pred_prob = tf.nn.softmax(pred, -1)
        score = tf.reduce_max(pred_prob, -1)
        pred_index = tf.argmax(pred_prob, 1)
        correct_pred = tf.equal(pred_index, label_index)
        acc = tf.reduce_mean(tf.cast(correct_pred, "float"))
    elif mode == "pairwise":
        score = pred
        pred_index = tf.argmax(pred, 1)
        acc = tf.constant([0.0])
    modelfile = config["test_model_file"]

    result_file = open(config["test_result"], "w")

    step = 0
    init = tf.group(tf.global_variables_initializer(),
                    tf.local_variables_initializer())
    with tf.Session(config=tf.ConfigProto(intra_op_parallelism_threads=1)) \
            as sess:
        sess.run(init)
        saver.restore(sess, modelfile)
        coord = tf.train.Coordinator()
        read_thread = tf.train.start_queue_runners(sess=sess, coord=coord)
        while not coord.should_stop():
            step += 1
            try:
                ground, pi, a, prob = sess.run([label_index, pred_index, acc, score])
                mean_acc += a
                for i in range(len(prob)):
                    result_file.write("%d\t%d\t%f\n" % (ground[i], pi[i], prob[i]))
            except tf.errors.OutOfRangeError:
                coord.request_stop()
        coord.join(read_thread)
        sess.close()
    result_file.close()
    if mode == "pointwise":
        # step is incremented once more on the iteration that raises
        # OutOfRangeError, so the number of successful batches is step - 1
        mean_acc = mean_acc / max(step - 1, 1)
        print >> sys.stderr, "accuracy: %4.2f" % (mean_acc * 100)


def run_trainer(loss, optimizer, config):
    """
    run classification training function handle
    """
    thread_num = int(config["thread_num"])
    model_path = config["model_path"]
    model_file = config["model_prefix"]
    print_iter = int(config["print_iter"])
    data_size = int(config["data_size"])
    batch_size = int(config["batch_size"])
    epoch_iter = int(data_size / batch_size)
    avg_cost = 0.0
    saver = tf.train.Saver(max_to_keep=None)
    init = tf.group(tf.global_variables_initializer(),
                    tf.local_variables_initializer())
    with tf.Session(config=tf.ConfigProto(intra_op_parallelism_threads=thread_num,
                                          inter_op_parallelism_threads=thread_num)) \
            as sess:
        sess.run(init)
        coord = tf.train.Coordinator()
        read_thread = tf.train.start_queue_runners(sess=sess, coord=coord)
        step = 0
        epoch_num = 1
        start_time = time.time()
        while not coord.should_stop():
            try:
                step += 1
                c, _ = sess.run([loss, optimizer])
                avg_cost += c

                if step % print_iter == 0:
                    print("loss: %f" % (avg_cost / print_iter))
                    avg_cost = 0.0
                if step % epoch_iter == 0:
                    end_time = time.time()
                    print("save model epoch%d, used time: %d" % (epoch_num,
                          end_time - start_time))
                    save_path = saver.save(sess,
                            "%s/%s.epoch%d" % (model_path, model_file, epoch_num))
                    epoch_num += 1
                    start_time = time.time()

            except tf.errors.OutOfRangeError:
                save_path = saver.save(sess, "%s/%s.final" % (model_path, model_file))
                coord.request_stop()
        coord.join(read_thread)
        sess.close()


def graph_save(pred, config):
    """
    save the inference graph definition as text
    """
    graph_path = config["graph_path"]
    graph_name = config["graph_name"]
    mode = config["training_mode"]
    if mode == "pointwise":
        pred_prob = tf.nn.softmax(pred, -1, name="output_prob")
    elif mode == "pairwise":
        pred_prob = tf.identity(pred, name="output_prob")
    saver = tf.train.Saver()
    init = tf.group(tf.global_variables_initializer(),
                    tf.local_variables_initializer())
    with tf.Session(config=tf.ConfigProto(intra_op_parallelism_threads=1)) \
            as sess:
        sess.run(init)
        tf.train.write_graph(sess.graph_def, graph_path, graph_name, as_text=True)
        sess.close()
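
# A minimal sketch (not called anywhere in this repo) of how the graph frozen
# by ComNet.py's freeze() task could be loaded back; only the "output_prob"
# node name is taken from the code above, the rest is an assumption about the
# caller's environment:
#
#   with tf.gfile.GFile(freeze_path, "rb") as f:
#       graph_def = tf.GraphDef()
#       graph_def.ParseFromString(f.read())
#   with tf.Graph().as_default() as g:
#       tf.import_graph_def(graph_def, name="")
#       prob = g.get_tensor_by_name("output_prob:0")
#       # sess.run(prob, feed_dict={...})  # feed the net's input tensors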
--------------------------------------------------------------------------------
/utils/converter.py:
--------------------------------------------------------------------------------
import errno
import sys
import os

import tensorflow as tf

my_int_feature = lambda v: tf.train.Feature(int64_list=tf.train.Int64List(value=v))


class TFConverter(object):
    """
    convert id files into TFRecord files, driven by config
    """
    def __init__(self, config):
        data_id_file = config['data_id_file']
        data_tfrecord_file = config['data_tfrecord_file']
        self.source_file = []
        self.target_file = []
        if os.path.isdir(data_id_file):
            try:
                os.makedirs(data_tfrecord_file)
            except OSError as exc:
                if exc.errno == errno.EEXIST and os.path.isdir(data_tfrecord_file):
                    pass
                else:
                    raise
            data_parts = os.listdir(data_id_file)
            for part in data_parts:
                self.source_file.append(os.path.join(data_id_file, part))
                self.target_file.append(os.path.join(data_tfrecord_file, part))
        else:
            self.source_file.append(data_id_file)
            self.target_file.append(data_tfrecord_file)
        data_mode = config['training_mode']
        self.left_slots = config["left_slots"]
        self.right_slots = config["right_slots"]
        self.pad_id = 0

        if data_mode == "pointwise":
            self.n_class = int(config["n_class"])
            self.func = self.convert_pointwise
            self.all_slots = self.left_slots + self.right_slots
        elif data_mode == "pairwise":
            self.func = self.convert_pairwise
            pos_slots = [["pos_" + name, length] for (name, length) in self.right_slots]
            neg_slots = [["neg_" + name, length] for (name, length) in self.right_slots]
            self.all_slots = self.left_slots + pos_slots + neg_slots
        else:
            print >> sys.stderr, "not supported data mode"
            sys.exit(-1)

    def convert_pointwise(self, line):
        """
        convert pointwise data (pointwise parse)
        """
        # left_ids \t right_ids \t label
        group = line.strip().split("\t")
        if len(group) != 1 + len(self.all_slots):
            print >> sys.stderr, "convert error, slots don't match"
            sys.exit(-1)
        label = [0 for i in range(self.n_class)]
        label[int(group[-1])] = 1
        feature = {"label": my_int_feature(label)}
        for i in range(len(self.all_slots)):
            slot_name, seq_len = self.all_slots[i]
            tmp_ids = [int(t) for t in group[i].strip().split(" ")]
            if len(tmp_ids) < seq_len:
                pad_len = seq_len - len(tmp_ids)
                tmp_ids = tmp_ids + [self.pad_id] * pad_len
            feature[slot_name] = my_int_feature(tmp_ids[:seq_len])
        example = tf.train.Example(features=tf.train.Features(feature=feature))
        return example

    def convert_pairwise(self, line):
        """
        convert pairwise data (pairwise parse)
        """
        # query_terms \t postitle_terms \t negtitle_terms
        group = line.strip().split("\t")
        if len(group) != len(self.all_slots):
            print >> sys.stderr, "convert error, slots don't match"
            sys.exit(-1)
        feature = {}
        for i in range(len(self.all_slots)):
            slot_name, seq_len = self.all_slots[i]
            tmp_ids = [int(t) for t in group[i].strip().split(" ")]
            if len(tmp_ids) < seq_len:
                pad_len = seq_len - len(tmp_ids)
                tmp_ids = tmp_ids + [self.pad_id] * pad_len
            feature[slot_name] = my_int_feature(tmp_ids[:seq_len])
        example = tf.train.Example(features=tf.train.Features(feature=feature))
        return example

    def write_data_to_tf(self, filename, tfrecord_name):
        """
        write all parsed examples of one file to a TFRecord file
        """
        writer = tf.python_io.TFRecordWriter(tfrecord_name)
        with open(filename) as fin_data:
            for line in fin_data:
                example = self.func(line)
                writer.write(example.SerializeToString())
        writer.close()

    def convert(self):
        """
        convert all files
        """
        print >> sys.stderr, "writing tf record"
        for i in range(len(self.source_file)):
            self.write_data_to_tf(self.source_file[i], self.target_file[i])
            print >> sys.stderr, self.source_file[i], "-->", self.target_file[i]
        print >> sys.stderr, "all done"


def run_convert(config):
    """
    run convert
    """
    tf_conv = TFConverter(config)
    tf_conv.convert()
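
# Example (hypothetical paths): with
#   "data_id_file": "./data/train_ids", "data_tfrecord_file": "./data/train_tf"
# a directory of parts ./data/train_ids/part-0000* is mirrored one-to-one to
# ./data/train_tf/part-0000*; with plain file paths, a single TFRecord file
# is written instead.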
--------------------------------------------------------------------------------
/utils/datafeeds.py:
--------------------------------------------------------------------------------
import tensorflow as tf

from utils.utility import get_all_files


def load_batch_ops(example, batch_size, shuffle):
    """
    load batch ops
    """
    if not shuffle:
        return tf.train.batch([example],
                              batch_size=batch_size,
                              num_threads=1,
                              capacity=10000 + 2 * batch_size)
    else:
        return tf.train.shuffle_batch([example],
                                      batch_size=batch_size,
                                      num_threads=1,
                                      capacity=10000 + 2 * batch_size,
                                      min_after_dequeue=10000)


class TFPairwisePaddingData(object):
    """
    for pairwise padding data
    """
    def __init__(self, config):
        self.filelist = get_all_files(config["train_file"])
        self.batch_size = int(config["batch_size"])
        self.epochs = int(config["num_epochs"])
        self.shuffle = int(config["shuffle"]) != 0
        self.reader = None
        self.file_queue = None
        self.left_slots = dict(config["left_slots"])
        self.right_slots = dict(config["right_slots"])

    def ops(self):
        """
        produce data
        """
        self.file_queue = tf.train.string_input_producer(self.filelist,
                                                         num_epochs=self.epochs)
        self.reader = tf.TFRecordReader()
        _, example = self.reader.read(self.file_queue)
        batch_examples = load_batch_ops(example, self.batch_size, self.shuffle)
        features_types = {}
        for (u, v) in self.left_slots.items():
            features_types[u] = tf.FixedLenFeature([v], tf.int64)
        for (u, v) in self.right_slots.items():
            features_types["pos_" + u] = tf.FixedLenFeature([v], tf.int64)
            features_types["neg_" + u] = tf.FixedLenFeature([v], tf.int64)
        features = tf.parse_example(batch_examples, features=features_types)
        return dict([(k, features[k]) for k in self.left_slots.keys()]), \
               dict([(k, features["pos_" + k]) for k in self.right_slots.keys()]), \
               dict([(k, features["neg_" + k]) for k in self.right_slots.keys()])


class TFPointwisePaddingData(object):
    """
    for pointwise padding data
    """
    def __init__(self, config):
        self.filelist = get_all_files(config["train_file"])
        self.batch_size = int(config["batch_size"])
        self.epochs = int(config["num_epochs"])
        self.shuffle = int(config["shuffle"]) != 0
        self.reader = None
        self.file_queue = None
        self.left_slots = dict(config["left_slots"])
        self.right_slots = dict(config["right_slots"])

    def ops(self):
        """
        gen data
        """
        self.file_queue = tf.train.string_input_producer(self.filelist,
                                                         num_epochs=self.epochs)
        self.reader = tf.TFRecordReader()
        _, example = self.reader.read(self.file_queue)
        batch_examples = load_batch_ops(example, self.batch_size, self.shuffle)
        features_types = {"label": tf.FixedLenFeature([2], tf.int64)}
        for (u, v) in self.left_slots.items():
            features_types[u] = tf.FixedLenFeature([v], tf.int64)
        for (u, v) in self.right_slots.items():
            features_types[u] = tf.FixedLenFeature([v], tf.int64)
        features = tf.parse_example(batch_examples, features=features_types)
        return dict([(k, features[k]) for k in self.left_slots.keys()]), \
               dict([(k, features[k]) for k in self.right_slots.keys()]), \
               features["label"]
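
# Example: with "left_slots": [["left", 64]] and "right_slots": [["right", 64]],
# TFPointwisePaddingData.ops() returns
#   ({"left": int64 [batch_size, 64]}, {"right": int64 [batch_size, 64]},
#    label [batch_size, 2])
# and TFPairwisePaddingData.ops() returns the left dict plus "pos_"/"neg_"
# right-slot dicts, matching the feature names written by utils/converter.py.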
--------------------------------------------------------------------------------
/utils/utility.py:
--------------------------------------------------------------------------------
import traceback
import sys
import os

import tensorflow as tf


def get_all_files(train_data_file):
    """
    get all files under a path (or the path itself if it is a file)
    """
    train_file = []
    train_path = train_data_file
    if os.path.isdir(train_path):
        data_parts = os.listdir(train_path)
        for part in data_parts:
            train_file.append(os.path.join(train_path, part))
    else:
        train_file.append(train_path)
    return train_file


def merge_config(config, *argv):
    """
    merge multiple configs
    """
    cf = {}
    cf.update(config)
    for d in argv:
        cf.update(d)
    return cf


def import_object(module_py, class_str):
    """
    resolve a class object from a module path string
    """
    mpath, mfile = os.path.split(module_py)
    sys.path.append(mpath)
    module = __import__(mfile)
    try:
        return getattr(module, class_str)
    except AttributeError:
        raise ImportError('Class %s cannot be found (%s)' %
                          (class_str, traceback.format_exception(*sys.exc_info())))


def seq_length(sequence):
    """
    get sequence length
    for an id-sequence (N, S)
    or a vector-sequence (N, S, D)
    """
    if len(sequence.get_shape().as_list()) == 2:
        used = tf.sign(tf.abs(sequence))
    else:
        used = tf.sign(tf.reduce_max(tf.abs(sequence), 2))
    length = tf.reduce_sum(used, 1)
    length = tf.cast(length, tf.int32)
    return length


def get_cross_mask(seq1, seq2):
    """
    get the matching-matrix mask for two sequences (id- or vector-sequences)
    """
    length1 = seq_length(seq1)
    length2 = seq_length(seq2)
    max_len1 = tf.shape(seq1)[1]
    max_len2 = tf.shape(seq2)[1]
    # for padding left
    mask1 = tf.sequence_mask(length1, max_len1, dtype=tf.int32)
    mask2 = tf.sequence_mask(length2, max_len2, dtype=tf.int32)
    cross_mask = tf.einsum('ij,ik->ijk', mask1, mask2)
    return cross_mask
--------------------------------------------------------------------------------