├── ComNet.py
├── README.md
├── loss.py
├── mlpcnn.py
├── tools
│   ├── com_reader.py
│   └── com_writer.py
└── utils
    ├── controler.py
    ├── converter.py
    ├── datafeeds.py
    └── utility.py
--------------------------------------------------------------------------------
/ComNet.py:
--------------------------------------------------------------------------------
import argparse
import logging
import json
import sys
import os

import numpy as np
import tensorflow as tf
from tensorflow.python.framework import graph_util

from utils import datafeeds
from utils import controler
from utils import utility
from utils import converter

_WORK_DIR = os.path.split(os.path.realpath(__file__))[0]
sys.path.append(os.path.join(_WORK_DIR, '../../../common'))
#import log


def load_config(config_file):
    """
    load config; every section of the JSON file is merged into one flat dict
    """
    with open(config_file, "r") as f:
        try:
            conf = json.load(f)
        except Exception:
            logging.error("load json file %s error" % config_file)
            raise
    conf_dict = {}
    for k in conf:
        conf_dict.update(conf[k])
    logging.debug("\n".join(
        ["%s=%s" % (u, conf_dict[u]) for u in conf_dict]))
    return conf_dict


def train(conf_dict):
    """
    train
    """
    training_mode = conf_dict["training_mode"]
    net = utility.import_object(
        conf_dict["net_py"], conf_dict["net_class"])(conf_dict)
    if training_mode == "pointwise":
        datafeed = datafeeds.TFPointwisePaddingData(conf_dict)
        input_l, input_r, label_y = datafeed.ops()
        pred = net.predict(input_l, input_r)
        output_prob = tf.nn.softmax(pred, -1, name="output_prob")
        loss_layer = utility.import_object(
            conf_dict["loss_py"], conf_dict["loss_class"])()
        loss = loss_layer.ops(pred, label_y)
    elif training_mode == "pairwise":
        datafeed = datafeeds.TFPairwisePaddingData(conf_dict)
        input_l, input_r, neg_input = datafeed.ops()
        pos_score = net.predict(input_l, input_r)
        output_prob = tf.identity(pos_score, name="output_prob")
        neg_score = net.predict(input_l, neg_input)
        loss_layer = utility.import_object(
            conf_dict["loss_py"], conf_dict["loss_class"])(conf_dict)
        loss = loss_layer.ops(pos_score, neg_score)
    else:
        print >> sys.stderr, "training mode not supported"
        sys.exit(1)
    # define optimizer
    lr = float(conf_dict["learning_rate"])
    optimizer = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss)

    # run_trainer
    controler.run_trainer(loss, optimizer, conf_dict)


def predict(conf_dict):
    """
    predict
    """
    net = utility.import_object(
        conf_dict["net_py"], conf_dict["net_class"])(conf_dict)
    conf_dict.update({"num_epochs": "1", "batch_size": "1",
                      "shuffle": "0", "train_file": conf_dict["test_file"]})
    test_datafeed = datafeeds.TFPointwisePaddingData(conf_dict)
    test_l, test_r, test_y = test_datafeed.ops()
    # test network
    pred = net.predict(test_l, test_r)
    controler.run_predict(pred, test_y, conf_dict)


def freeze(conf_dict):
    """
    freeze net for c api predict
    """
    model_path = conf_dict["save_path"]
    freeze_path = conf_dict["freeze_path"]
    saver = tf.train.import_meta_graph(model_path + '.meta')
    with tf.Session() as sess:
        saver.restore(sess, model_path)
        var_graph_def = tf.get_default_graph().as_graph_def()
        const_graph_def = graph_util.convert_variables_to_constants(
            sess, var_graph_def, ["output_prob"])
        with tf.gfile.GFile(freeze_path, "wb") as f:
            f.write(const_graph_def.SerializeToString())


def sim_func(query_pair):
    """
    Input:
        query_pair: a text pair, tab separated
    Returns:
        similarity: semantic similarity of the text pair

    NOTE: this helper depends on module-level objects (simnet_process,
    batch_data, executor, program, infer_feeder, fetch_targets, args) that
    are expected to be provided by the caller's environment; it is not used
    by the command-line tasks below.
    """
    simnet_process.input_pair = query_pair
    preds_list = []
    for idx, data in enumerate(batch_data()):
        output = executor.run(program, feed=infer_feeder.feed(data),
                              fetch_list=fetch_targets)
        if args.task_mode == "pairwise":
            preds_list += list(map(lambda item: str(item[0]), output[1]))
        else:
            preds_list += map(lambda item: str(np.argmax(item)), output[1])

    return float(preds_list[0])


def convert(conf_dict):
    """
    convert
    """
    converter.run_convert(conf_dict)


if __name__ == "__main__":
    # log.init_log("./log/tensorflow")
    parser = argparse.ArgumentParser()
    parser.add_argument('--task', default='train',
                        help='task: train/predict/freeze/convert, the default value is train.')
    parser.add_argument('--task_conf', default='./examples/cnn-pointwise.json',
                        help='task_conf: config file for this task')
    args = parser.parse_args()
    config = load_config(args.task_conf)
    if args.task == 'train':
        train(config)
    elif args.task == 'predict':
        predict(config)
    elif args.task == 'freeze':
        freeze(config)
    elif args.task == 'convert':
        convert(config)
    else:
        print >> sys.stderr, 'task type error.'
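
# For reference, a minimal sketch of the kind of task config this script
# expects (e.g. ./examples/cnn-pointwise.json). load_config() flattens all
# top-level sections into one dict, so the section names are arbitrary; the
# keys below are the ones read by this repo, while the concrete values are
# illustrative assumptions, not the shipped file:
#
# {
#     "net": {
#         "net_py": "./mlpcnn", "net_class": "MLPCnn",
#         "vocabulary_size": 18000, "embedding_dim": 128,
#         "num_filters": 256, "window_size": 3, "hidden_size": 128,
#         "n_class": 2,
#         "left_slots": [["left", 64]], "right_slots": [["right", 64]]
#     },
#     "loss": {"loss_py": "./loss", "loss_class": "SoftmaxWithLoss"},
#     "data": {
#         "training_mode": "pointwise",
#         "train_file": "./data/train.tfrecord",
#         "test_file": "./data/test.tfrecord",
#         "batch_size": 64, "num_epochs": 10, "shuffle": 1
#     },
#     "train": {
#         "learning_rate": 0.001, "thread_num": 4,
#         "print_iter": 100, "data_size": 100000,
#         "model_path": "./model", "model_prefix": "cnn_pointwise"
#     }
# }
#
# (run_predict additionally reads test_model_file / test_result, and freeze
# reads save_path / freeze_path.)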
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# comparator-network
A short-text matching network model based on ComNet.


1) Data preprocessing
Because of machine performance limits, and to preserve the semantic signal intact, only raw Chinese character-level features are used here.
That is, a prepared vocabulary is used to turn the pointwise text-pair/label data into index-pair/label data.
Before the data is fed to the model, sequences are padded to the configured length; over-long sequences are truncated at the head or tail, and the length is fixed at 64 (a code sketch follows at the end of this README).

2) Network structure
The com-net project provides several network structures; this experiment trains with the CNN-pointwise configuration.
The model starts with an embedding layer whose variables are randomly initialized.
The embedding layer is followed by a 1-D convolution with 256 filters. The two texts of a pair each pass through the convolution, giving query and doc tensors of shape [batch_size, 256].
A concatenation then joins the two 256-dimensional vectors into a 512-dimensional one.
The resulting [batch_size, 512] tensor passes through two fully connected layers: the first has a hidden_size of 128 and a ReLU activation; the second has as many units as there are classes, i.e. 2, and produces the logits.
The loss is the cross-entropy loss.

3) Training and testing
Training uses a batch size of 64 and runs for 10 epochs, with int((sample_count - 1) / 64) + 1 iterations per epoch.
Test data is organized exactly like the training data: each line consists of a query/doc text pair.

4) Results
With the pairwise network, the final test accuracy on the WeBank test set is 87.41%, slightly above the best result in the competition at the time, 86.89%.
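
A minimal sketch of the padding/truncation described in 1). The pad id of 0 and the head/tail switch are illustrative assumptions (tools/com_writer.py takes the pad id from the command line and keeps the first 64 ids of an over-long sequence):

```python
def pad_or_truncate(ids, seq_len=64, pad_id=0, truncate="tail"):
    """Pad ids up to seq_len with pad_id, or cut them down to seq_len."""
    if len(ids) >= seq_len:
        # drop characters from the head or the tail of an over-long sequence
        return ids[-seq_len:] if truncate == "head" else ids[:seq_len]
    return ids + [pad_id] * (seq_len - len(ids))

assert pad_or_truncate([5, 8, 2], seq_len=5) == [5, 8, 2, 0, 0]
assert pad_or_truncate(list(range(10)), seq_len=5, truncate="head") == [5, 6, 7, 8, 9]
```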
--------------------------------------------------------------------------------
/loss.py:
--------------------------------------------------------------------------------
import tensorflow as tf


class PairwiseHingeLoss(object):
    """
    a layer class: pairwise hinge loss
    """
    def __init__(self, config):
        """
        init function
        """
        self.margin = float(config["margin"])

    def ops(self, score_pos, score_neg):
        """
        operation
        """
        return tf.reduce_mean(tf.maximum(0., score_neg +
                              self.margin - score_pos))


class PairwiseLogLoss(object):
    """
    a layer class: pairwise log loss
    """
    def __init__(self, config=None):
        """
        init function
        """
        pass

    def ops(self, score_pos, score_neg):
        """
        operation: a sigmoid surrogate that penalizes neg scoring above pos
        """
        return tf.reduce_mean(tf.nn.sigmoid(score_neg - score_pos))


class SoftmaxWithLoss(object):
    """
    a layer class: softmax loss
    """
    def __init__(self):
        """
        init function
        """
        pass

    def ops(self, pred, label):
        """
        operation
        """
        return tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
            logits=pred, labels=label))
--------------------------------------------------------------------------------
/mlpcnn.py:
--------------------------------------------------------------------------------
import logging

import layers.tf_layers as layers


class MLPCnn(object):
    """
    MLP + CNN text matching network
    """
    def __init__(self, config):
        """
        init function
        """
        self.vocab_size = int(config['vocabulary_size'])
        self.emb_size = int(config['embedding_dim'])
        self.kernel_size = int(config['num_filters'])
        self.win_size = int(config['window_size'])
        self.hidden_size = int(config['hidden_size'])
        self.left_name, self.seq_len = config['left_slots'][0]
        self.right_name, self.seq_len = config['right_slots'][0]
        self.task_mode = config['training_mode']
        self.emb_layer = layers.EmbeddingLayer(self.vocab_size, self.emb_size)
        self.cnn_layer = layers.CNNLayer(self.seq_len, self.emb_size,
                                         self.win_size, self.kernel_size)
        self.relu_layer = layers.ReluLayer()
        self.concat_layer = layers.ConcatLayer()
        if self.task_mode == "pointwise":
            self.n_class = int(config['n_class'])
            self.fc1_layer = layers.FCLayer(2 * self.kernel_size, self.hidden_size)
            self.fc2_layer = layers.FCLayer(self.hidden_size, self.n_class)
        elif self.task_mode == "pairwise":
            self.fc1_layer = layers.FCLayer(self.kernel_size, self.hidden_size)
            self.cos_layer = layers.CosineLayer()
        else:
            logging.error("training mode not supported")

    def predict(self, left_slots, right_slots):
        """
        predict graph of this net
        """
        left = left_slots[self.left_name]
        right = right_slots[self.right_name]
        left_emb = self.emb_layer.ops(left)
        right_emb = self.emb_layer.ops(right)
        left_cnn = self.cnn_layer.ops(left_emb)
        right_cnn = self.cnn_layer.ops(right_emb)
        left_relu = self.relu_layer.ops(left_cnn)
        right_relu = self.relu_layer.ops(right_cnn)
        if self.task_mode == "pointwise":
            concat = self.concat_layer.ops([left_relu, right_relu],
                                           self.kernel_size * 2)
            concat_fc = self.fc1_layer.ops(concat)
            concat_relu = self.relu_layer.ops(concat_fc)
            pred = self.fc2_layer.ops(concat_relu)
        else:
            hid1_left = self.fc1_layer.ops(left_relu)
            hid1_right = self.fc1_layer.ops(right_relu)
            left_relu2 = self.relu_layer.ops(hid1_left)
            right_relu2 = self.relu_layer.ops(hid1_right)
            pred = self.cos_layer.ops(left_relu2, right_relu2)
        return pred
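
# Shape walkthrough for predict() (batch size B, sequence length L = seq_len,
# D = emb_size, K = kernel_size filters), matching the layer stack above:
#
#   ids   [B, L]    --emb_layer-->  [B, L, D]
#   emb   [B, L, D] --cnn_layer-->  [B, K]      (one vector per text)
#
# pointwise: concat -> [B, 2K] --fc1 + relu--> [B, hidden_size] --fc2--> [B, n_class]
# pairwise:  each [B, K] vector goes through the shared fc1 + relu, and the
#            cosine layer scores the two results, one similarity per pair.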
--------------------------------------------------------------------------------
/tools/com_reader.py:
--------------------------------------------------------------------------------
import logging
import json
import time
import sys
import os

import tensorflow as tf

_WORK_DIR = os.path.split(os.path.realpath(__file__))[0]
_UPPER_DIR = os.path.split(_WORK_DIR)[0]
sys.path.append(_UPPER_DIR)
from utils import datafeeds


def load_config(config_file):
    """
    load config; every section of the JSON file is merged into one flat dict
    """
    with open(config_file, "r") as f:
        try:
            conf = json.load(f)
        except Exception:
            logging.error("load json file %s error" % config_file)
            raise
    conf_dict = {}
    for k in conf:
        conf_dict.update(conf[k])
    logging.debug("\n".join(["%s=%s" % (u, conf_dict[u]) for u in conf_dict]))
    return conf_dict


def read_tfrecords_pointwise(config):
    """
    read pointwise tf records
    """
    datafeed = datafeeds.TFPointwisePaddingData(config)
    input_l, input_r, label_y = datafeed.ops()
    init_op = tf.group(tf.global_variables_initializer(),
                       tf.local_variables_initializer())
    start_time = time.time()
    sess = tf.InteractiveSession()
    sess.run(init_op)
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    step = 0
    while not coord.should_stop():
        step += 1
        try:
            left_, right_, label_ = sess.run([input_l, input_r, label_y])
            print "pointwise data read is good"
        except tf.errors.OutOfRangeError:
            print("read %d steps" % step)
            coord.request_stop()
    coord.join(threads)
    duration = time.time() - start_time
    print("duration: %ds, step: %d" % (duration, step))
    sess.close()


def read_tfrecords_pairwise(config):
    """
    read pairwise tf records
    """
    datafeed = datafeeds.TFPairwisePaddingData(config)
    query, pos, neg = datafeed.ops()
    init_op = tf.group(tf.global_variables_initializer(),
                       tf.local_variables_initializer())
    start_time = time.time()
    sess = tf.InteractiveSession()
    sess.run(init_op)
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    step = 0
    while not coord.should_stop():
        step += 1
        try:
            query_, pos_, neg_ = sess.run([query, pos, neg])
            print "pairwise data read is good"
        except tf.errors.OutOfRangeError:
            print("read %d steps" % step)
            coord.request_stop()
    coord.join(threads)
    duration = time.time() - start_time
    print("duration: %ds, step: %d" % (duration, step))
    sess.close()


def usage():
    """
    usage
    """
    print sys.argv[0], "options"
    print "options"
    print "\tconfig_path: configure file path"


if __name__ == "__main__":
    if len(sys.argv) != 2:
        usage()
        sys.exit(1)
    config_path = sys.argv[1]
    config = load_config(config_path)
    data_format_func = {"pointwise": read_tfrecords_pointwise,
                        "pairwise": read_tfrecords_pairwise}
    if config["training_mode"] in data_format_func:
        using_func = data_format_func[config["training_mode"]]
    else:
        logging.error("data_format not supported")
        sys.exit(1)
    using_func(config)
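
# Example sanity check (the config path is the default one assumed by
# ComNet.py): feed the tool the same JSON config used for training and it
# reads every batch once, then reports the step count and duration:
#
#   python tools/com_reader.py ./examples/cnn-pointwise.json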
--------------------------------------------------------------------------------
/tools/com_writer.py:
--------------------------------------------------------------------------------
import logging
import sys

import tensorflow as tf


def int_feature(v):
    """
    build an int64 feature from a list of ints
    """
    return tf.train.Feature(int64_list=tf.train.Int64List(value=v))


def write_data_to_tf(filename, func, func_args, writer):
    """
    parse every line of filename with func and write the resulting
    tf.train.Example records through writer
    """
    with open(filename) as fin_data:
        for line in fin_data:
            example = func(line, func_args)
            if not example:
                continue
            writer.write(example.SerializeToString())


def parse_text_match_pointwise_pad_data(line, func_args):
    """
    pointwise parse
    """
    seq_len = func_args[0]
    pad_id = func_args[1]
    # left_ids \t right_ids \t label
    group = line.strip().split("\t")
    if len(group) != 3:
        logging.warning(
            "the line does not conform to format(left_ids, right_ids, label)")
        return
    label = [0, 0]
    all_ids = []
    for i in [0, 1]:
        tmp_ids = [int(t) for t in group[i].strip().split(" ")]
        if len(tmp_ids) < seq_len:
            pad_len = seq_len - len(tmp_ids)
            tmp_ids = tmp_ids + [pad_id] * pad_len
        all_ids.append(tmp_ids[:seq_len])
    label[int(group[2])] = 1
    example = tf.train.Example(features=tf.train.Features(
        feature={"label": int_feature(label),
                 "left": int_feature(all_ids[0]),
                 "right": int_feature(all_ids[1])}))
    return example


def parse_text_match_pairwise_pad_data(line, func_args):
    """
    pairwise parse
    """
    seq_len = func_args[0]
    pad_id = func_args[1]
    # query_terms \t postitle_terms \t negtitle_terms
    group = line.strip().split("\t")
    if len(group) != 3:
        logging.warning(
            "the line does not conform to format(query_terms, postitle_terms, negtitle_terms)")
        return
    all_ids = []
    for i in [0, 1, 2]:
        tmp_ids = [int(t) for t in group[i].strip().split(" ")]
        if len(tmp_ids) < seq_len:
            pad_len = seq_len - len(tmp_ids)
            tmp_ids = tmp_ids + [pad_id] * pad_len
        all_ids.append(tmp_ids[:seq_len])
    example = tf.train.Example(features=tf.train.Features(
        feature={"left": int_feature(all_ids[0]),
                 "pos_right": int_feature(all_ids[1]),
                 "neg_right": int_feature(all_ids[2])}))
    return example


def usage():
    """
    usage
    """
    print sys.argv[0], "options"
    print "options"
    print "\ttype: data type, pointwise or pairwise"
    print "\tinputfile: input file path"
    print "\trecordfile: output record file"
    print "\tpad_id: pad id"
    print "\tmax_len: sequence max length"


if __name__ == "__main__":
    if len(sys.argv) != 6:
        usage()
        sys.exit(-1)
    input_data_format = sys.argv[1]
    filename = sys.argv[2]
    tfrecord_name = sys.argv[3]
    pad_id = int(sys.argv[4])
    max_len = int(sys.argv[5])
    data_format_func = {"pointwise": parse_text_match_pointwise_pad_data,
                        "pairwise": parse_text_match_pairwise_pad_data}
    if input_data_format in data_format_func:
        using_func = data_format_func[input_data_format]
    else:
        logging.error("data_format not supported")
        sys.exit(1)
    local_writer = tf.python_io.TFRecordWriter(tfrecord_name)
    write_data_to_tf(filename, using_func, [max_len, pad_id], local_writer)
    local_writer.close()
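
# Example (hypothetical paths). A pointwise input line is
# "left_ids<TAB>right_ids<TAB>label", e.g.
#
#   12 7 403 9<TAB>12 88 3<TAB>1
#
# and is converted with pad_id 0 and max_len 64 by:
#
#   python tools/com_writer.py pointwise ./data/train.ids ./data/train.tfrecord 0 64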
--------------------------------------------------------------------------------
/utils/controler.py:
--------------------------------------------------------------------------------
import sys
import time

import tensorflow as tf


def run_predict(pred, label, config):
    """
    run classification predict function handle
    """
    mean_acc = 0.0
    saver = tf.train.Saver()
    mode = config["training_mode"]
    label_index = tf.argmax(label, 1)
    if mode == "pointwise":
        pred_prob = tf.nn.softmax(pred, -1)
        score = tf.reduce_max(pred_prob, -1)
        pred_index = tf.argmax(pred_prob, 1)
        correct_pred = tf.equal(pred_index, label_index)
        acc = tf.reduce_mean(tf.cast(correct_pred, "float"))
    elif mode == "pairwise":
        score = pred
        pred_index = tf.argmax(pred, 1)
        acc = tf.constant([0.0])
    modelfile = config["test_model_file"]

    result_file = open(config["test_result"], "w")

    step = 0
    init = tf.group(tf.global_variables_initializer(),
                    tf.local_variables_initializer())
    with tf.Session(config=tf.ConfigProto(intra_op_parallelism_threads=1)) \
            as sess:
        sess.run(init)
        saver.restore(sess, modelfile)
        coord = tf.train.Coordinator()
        read_thread = tf.train.start_queue_runners(sess=sess, coord=coord)
        while not coord.should_stop():
            step += 1
            try:
                ground, pi, a, prob = sess.run([label_index, pred_index, acc, score])
                mean_acc += a
                for i in range(len(prob)):
                    result_file.write("%d\t%d\t%f\n" % (ground[i], pi[i], prob[i]))
            except tf.errors.OutOfRangeError:
                coord.request_stop()
        coord.join(read_thread)
        sess.close()
    result_file.close()
    if mode == "pointwise":
        # step is incremented once more on the iteration that raises
        # OutOfRangeError, so the number of successful batches is step - 1
        mean_acc = mean_acc / max(step - 1, 1)
        print >> sys.stderr, "accuracy: %4.2f" % (mean_acc * 100)


def run_trainer(loss, optimizer, config):
    """
    run classification training function handle
    """
    thread_num = int(config["thread_num"])
    model_path = config["model_path"]
    model_file = config["model_prefix"]
    print_iter = int(config["print_iter"])
    data_size = int(config["data_size"])
    batch_size = int(config["batch_size"])
    epoch_iter = int(data_size / batch_size)
    avg_cost = 0.0
    saver = tf.train.Saver(max_to_keep=None)
    init = tf.group(tf.global_variables_initializer(),
                    tf.local_variables_initializer())
    with tf.Session(config=tf.ConfigProto(intra_op_parallelism_threads=thread_num,
                                          inter_op_parallelism_threads=thread_num)) \
            as sess:
        sess.run(init)
        coord = tf.train.Coordinator()
        read_thread = tf.train.start_queue_runners(sess=sess, coord=coord)
        step = 0
        epoch_num = 1
        start_time = time.time()
        while not coord.should_stop():
            try:
                step += 1
                c, _ = sess.run([loss, optimizer])
                avg_cost += c

                if step % print_iter == 0:
                    print("loss: %f" % (avg_cost / print_iter))
                    avg_cost = 0.0
                if step % epoch_iter == 0:
                    end_time = time.time()
                    print("save model epoch%d, used time: %d" % (epoch_num,
                          end_time - start_time))
                    save_path = saver.save(sess,
                            "%s/%s.epoch%d" % (model_path, model_file, epoch_num))
                    epoch_num += 1
                    start_time = time.time()

            except tf.errors.OutOfRangeError:
                save_path = saver.save(sess, "%s/%s.final" % (model_path, model_file))
                coord.request_stop()
        coord.join(read_thread)
        sess.close()


def graph_save(pred, config):
    """
    save the inference graph definition as text
    """
    graph_path = config["graph_path"]
    graph_name = config["graph_name"]
    mode = config["training_mode"]
    if mode == "pointwise":
        pred_prob = tf.nn.softmax(pred, -1, name="output_prob")
    elif mode == "pairwise":
        pred_prob = tf.identity(pred, name="output_prob")
    saver = tf.train.Saver()
    init = tf.group(tf.global_variables_initializer(),
                    tf.local_variables_initializer())
    with tf.Session(config=tf.ConfigProto(intra_op_parallelism_threads=1)) \
            as sess:
        sess.run(init)
        tf.train.write_graph(sess.graph_def, graph_path, graph_name, as_text=True)
        sess.close()
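
# A minimal sketch (not called anywhere in this repo) of how the graph frozen
# by ComNet.py's freeze() task could be loaded back; only the "output_prob"
# node name is taken from the code above, the rest is an assumption about the
# caller's environment:
#
#   with tf.gfile.GFile(freeze_path, "rb") as f:
#       graph_def = tf.GraphDef()
#       graph_def.ParseFromString(f.read())
#   with tf.Graph().as_default() as g:
#       tf.import_graph_def(graph_def, name="")
#       prob = g.get_tensor_by_name("output_prob:0")
#       # sess.run(prob, feed_dict={...})  # feed the net's input tensors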
--------------------------------------------------------------------------------
/utils/converter.py:
--------------------------------------------------------------------------------
import errno
import sys
import os

import tensorflow as tf

my_int_feature = lambda v: tf.train.Feature(int64_list=tf.train.Int64List(value=v))


class TFConverter(object):
    """
    convert id files into TFRecord files, driven by config
    """
    def __init__(self, config):
        data_id_file = config['data_id_file']
        data_tfrecord_file = config['data_tfrecord_file']
        self.source_file = []
        self.target_file = []
        if os.path.isdir(data_id_file):
            try:
                os.makedirs(data_tfrecord_file)
            except OSError as exc:
                if exc.errno == errno.EEXIST and os.path.isdir(data_tfrecord_file):
                    pass
                else:
                    raise
            data_parts = os.listdir(data_id_file)
            for part in data_parts:
                self.source_file.append(os.path.join(data_id_file, part))
                self.target_file.append(os.path.join(data_tfrecord_file, part))
        else:
            self.source_file.append(data_id_file)
            self.target_file.append(data_tfrecord_file)
        data_mode = config['training_mode']
        self.left_slots = config["left_slots"]
        self.right_slots = config["right_slots"]
        self.pad_id = 0

        if data_mode == "pointwise":
            self.n_class = int(config["n_class"])
            self.func = self.convert_pointwise
            self.all_slots = self.left_slots + self.right_slots
        elif data_mode == "pairwise":
            self.func = self.convert_pairwise
            pos_slots = [["pos_" + name, length] for (name, length) in self.right_slots]
            neg_slots = [["neg_" + name, length] for (name, length) in self.right_slots]
            self.all_slots = self.left_slots + pos_slots + neg_slots
        else:
            print >> sys.stderr, "not supported data mode"
            sys.exit(-1)

    def convert_pointwise(self, line):
        """
        convert pointwise data (pointwise parse)
        """
        # left_ids \t right_ids \t label
        group = line.strip().split("\t")
        if len(group) != 1 + len(self.all_slots):
            print >> sys.stderr, "convert error, slots don't match"
            sys.exit(-1)
        label = [0 for i in range(self.n_class)]
        label[int(group[-1])] = 1
        feature = {"label": my_int_feature(label)}
        for i in range(len(self.all_slots)):
            slot_name, seq_len = self.all_slots[i]
            tmp_ids = [int(t) for t in group[i].strip().split(" ")]
            if len(tmp_ids) < seq_len:
                pad_len = seq_len - len(tmp_ids)
                tmp_ids = tmp_ids + [self.pad_id] * pad_len
            feature[slot_name] = my_int_feature(tmp_ids[:seq_len])
        example = tf.train.Example(features=tf.train.Features(feature=feature))
        return example

    def convert_pairwise(self, line):
        """
        convert pairwise data (pairwise parse)
        """
        # query_terms \t postitle_terms \t negtitle_terms
        group = line.strip().split("\t")
        if len(group) != len(self.all_slots):
            print >> sys.stderr, "convert error, slots don't match"
            sys.exit(-1)
        feature = {}
        for i in range(len(self.all_slots)):
            slot_name, seq_len = self.all_slots[i]
            tmp_ids = [int(t) for t in group[i].strip().split(" ")]
            if len(tmp_ids) < seq_len:
                pad_len = seq_len - len(tmp_ids)
                tmp_ids = tmp_ids + [self.pad_id] * pad_len
            feature[slot_name] = my_int_feature(tmp_ids[:seq_len])
        example = tf.train.Example(features=tf.train.Features(feature=feature))
        return example

    def write_data_to_tf(self, filename, tfrecord_name):
        """
        write all parsed examples of one file to a TFRecord file
        """
        writer = tf.python_io.TFRecordWriter(tfrecord_name)
        with open(filename) as fin_data:
            for line in fin_data:
                example = self.func(line)
                writer.write(example.SerializeToString())
        writer.close()

    def convert(self):
        """
        convert all files
        """
        print >> sys.stderr, "writing tf record"
        for i in range(len(self.source_file)):
            self.write_data_to_tf(self.source_file[i], self.target_file[i])
            print >> sys.stderr, self.source_file[i], "-->", self.target_file[i]
        print >> sys.stderr, "all done"


def run_convert(config):
    """
    run convert
    """
    tf_conv = TFConverter(config)
    tf_conv.convert()
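
# Example (hypothetical paths): with
#   "data_id_file": "./data/train_ids", "data_tfrecord_file": "./data/train_tf"
# a directory of parts ./data/train_ids/part-0000* is mirrored one-to-one to
# ./data/train_tf/part-0000*; with plain file paths, a single TFRecord file
# is written instead.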
--------------------------------------------------------------------------------
/utils/datafeeds.py:
--------------------------------------------------------------------------------
import tensorflow as tf

from utils.utility import get_all_files


def load_batch_ops(example, batch_size, shuffle):
    """
    load batch ops
    """
    if not shuffle:
        return tf.train.batch([example],
                              batch_size=batch_size,
                              num_threads=1,
                              capacity=10000 + 2 * batch_size)
    else:
        return tf.train.shuffle_batch([example],
                                      batch_size=batch_size,
                                      num_threads=1,
                                      capacity=10000 + 2 * batch_size,
                                      min_after_dequeue=10000)


class TFPairwisePaddingData(object):
    """
    for pairwise padding data
    """
    def __init__(self, config):
        self.filelist = get_all_files(config["train_file"])
        self.batch_size = int(config["batch_size"])
        self.epochs = int(config["num_epochs"])
        self.shuffle = int(config["shuffle"]) != 0
        self.reader = None
        self.file_queue = None
        self.left_slots = dict(config["left_slots"])
        self.right_slots = dict(config["right_slots"])

    def ops(self):
        """
        produce data
        """
        self.file_queue = tf.train.string_input_producer(self.filelist,
                                                         num_epochs=self.epochs)
        self.reader = tf.TFRecordReader()
        _, example = self.reader.read(self.file_queue)
        batch_examples = load_batch_ops(example, self.batch_size, self.shuffle)
        features_types = {}
        for (u, v) in self.left_slots.items():
            features_types[u] = tf.FixedLenFeature([v], tf.int64)
        for (u, v) in self.right_slots.items():
            features_types["pos_" + u] = tf.FixedLenFeature([v], tf.int64)
            features_types["neg_" + u] = tf.FixedLenFeature([v], tf.int64)
        features = tf.parse_example(batch_examples, features=features_types)
        return dict([(k, features[k]) for k in self.left_slots.keys()]), \
               dict([(k, features["pos_" + k]) for k in self.right_slots.keys()]), \
               dict([(k, features["neg_" + k]) for k in self.right_slots.keys()])


class TFPointwisePaddingData(object):
    """
    for pointwise padding data
    """
    def __init__(self, config):
        self.filelist = get_all_files(config["train_file"])
        self.batch_size = int(config["batch_size"])
        self.epochs = int(config["num_epochs"])
        self.shuffle = int(config["shuffle"]) != 0
        self.reader = None
        self.file_queue = None
        self.left_slots = dict(config["left_slots"])
        self.right_slots = dict(config["right_slots"])

    def ops(self):
        """
        gen data
        """
        self.file_queue = tf.train.string_input_producer(self.filelist,
                                                         num_epochs=self.epochs)
        self.reader = tf.TFRecordReader()
        _, example = self.reader.read(self.file_queue)
        batch_examples = load_batch_ops(example, self.batch_size, self.shuffle)
        features_types = {"label": tf.FixedLenFeature([2], tf.int64)}
        for (u, v) in self.left_slots.items():
            features_types[u] = tf.FixedLenFeature([v], tf.int64)
        for (u, v) in self.right_slots.items():
            features_types[u] = tf.FixedLenFeature([v], tf.int64)
        features = tf.parse_example(batch_examples, features=features_types)
        return dict([(k, features[k]) for k in self.left_slots.keys()]), \
               dict([(k, features[k]) for k in self.right_slots.keys()]), \
               features["label"]
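
# Example: with "left_slots": [["left", 64]] and "right_slots": [["right", 64]],
# TFPointwisePaddingData.ops() returns
#   ({"left": int64 [batch_size, 64]}, {"right": int64 [batch_size, 64]},
#    label [batch_size, 2])
# and TFPairwisePaddingData.ops() returns the left dict plus "pos_"/"neg_"
# right-slot dicts, matching the feature names written by utils/converter.py.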
--------------------------------------------------------------------------------
/utils/utility.py:
--------------------------------------------------------------------------------
import traceback
import sys
import os

import tensorflow as tf


def get_all_files(train_data_file):
    """
    get all files under a path (or the path itself if it is a file)
    """
    train_file = []
    train_path = train_data_file
    if os.path.isdir(train_path):
        data_parts = os.listdir(train_path)
        for part in data_parts:
            train_file.append(os.path.join(train_path, part))
    else:
        train_file.append(train_path)
    return train_file


def merge_config(config, *argv):
    """
    merge multiple configs
    """
    cf = {}
    cf.update(config)
    for d in argv:
        cf.update(d)
    return cf


def import_object(module_py, class_str):
    """
    resolve a class object from a module path string
    """
    mpath, mfile = os.path.split(module_py)
    sys.path.append(mpath)
    module = __import__(mfile)
    try:
        return getattr(module, class_str)
    except AttributeError:
        raise ImportError('Class %s cannot be found (%s)' %
                          (class_str, traceback.format_exception(*sys.exc_info())))


def seq_length(sequence):
    """
    get sequence length
    for an id-sequence (N, S)
    or a vector-sequence (N, S, D)
    """
    if len(sequence.get_shape().as_list()) == 2:
        used = tf.sign(tf.abs(sequence))
    else:
        used = tf.sign(tf.reduce_max(tf.abs(sequence), 2))
    length = tf.reduce_sum(used, 1)
    length = tf.cast(length, tf.int32)
    return length


def get_cross_mask(seq1, seq2):
    """
    get the matching-matrix mask for two sequences (id- or vector-sequences)
    """
    length1 = seq_length(seq1)
    length2 = seq_length(seq2)
    max_len1 = tf.shape(seq1)[1]
    max_len2 = tf.shape(seq2)[1]
    # for padding left
    mask1 = tf.sequence_mask(length1, max_len1, dtype=tf.int32)
    mask2 = tf.sequence_mask(length2, max_len2, dtype=tf.int32)
    cross_mask = tf.einsum('ij,ik->ijk', mask1, mask2)
    return cross_mask
--------------------------------------------------------------------------------