├── cnn
│   ├── tensorflow
│   │   ├── test.py
│   │   ├── README.md
│   │   ├── insqa_cnn.py
│   │   ├── insqa_cnn.py.old
│   │   ├── insqa_train.py
│   │   ├── insurance_qa_data_helpers.py
│   │   └── insqa_train.py.old
│   └── theano
│       ├── README.md
│       └── insqa_cnn.py
├── lstm_cnn
│   └── theano
│       ├── README.md
│       └── insqa_lstm.py
├── config.py
├── README.md
├── gen.py
├── swem
│   ├── swem_hier.py
│   ├── swem_max_margin.py
│   └── swem_hier_margin.py
├── rnn_attention
│   └── tensorflow
│       ├── insurance_qa_data_helpers.py
│       └── tf_rnn_char.py
└── utils.py
/cnn/tensorflow/test.py:
--------------------------------------------------------------------------------
1 | import random
2 | 
3 | _list = [i for i in range(0, 10)]
4 | _l1 = random.sample(_list, 2)
5 | _l2 = random.sample(_list, 2)
6 | print(_l1)
7 | print(_l2)
8 | for i in range(2, 2):
9 |     print(i)
10 | 
11 | 
--------------------------------------------------------------------------------
/lstm_cnn/theano/README.md:
--------------------------------------------------------------------------------
1 | 
2 | theano lstm+cnn code for insuranceQA
3 | 
4 | ================result==================
5 | 
6 | theano code, test1 top-1 precision: 68.3%
7 | 
8 | lstm+cnn is better than cnn (61.5%).
9 | 
10 | ================dataset================
11 | 
12 | the dataset is large, so only a test1 sample is given (see ./insuranceQA/test1.sample)
13 | 
14 | I converted the original idx_xx format to real-word format (see ./insuranceQA/train ./insuranceQA/test1.sample)
15 | 
16 | you can get the original dataset from https://github.com/shuzi/insuranceQA
17 | 
18 | word embeddings are trained with the word2vec toolkit
19 | 
20 | =================run=====================
21 | 
22 | reformat the original dataset (see my train and test1.sample)
23 | 
24 | change the file paths to your dataset (see TODO in insqa_cnn.py)
25 | 
26 | python insqa_lstm.py
27 | 
--------------------------------------------------------------------------------
/cnn/tensorflow/README.md:
--------------------------------------------------------------------------------
1 | 
2 | ================result==================
3 | 
4 | The results are about the same as the theano version; I forget the exact numbers.
5 | 
6 | Dropout is written in the code but not actually used. It does not change the results much, and training is a bit faster without it.
7 | 
8 | ================dataset================
9 | 
10 | The data format is the same as the theano version.
11 | 
12 | Only sample data is provided on github; if you need the full dataset, you can also contact me directly.
13 | the dataset is large, so only a test1 sample is given (see ./insuranceQA/test1.sample)
14 | 
15 | I converted the original idx_xx format to real-word format (see ./insuranceQA/train ./insuranceQA/test1.sample)
16 | 
17 | you can get the original dataset from https://github.com/shuzi/insuranceQA
18 | 
19 | word embeddings are trained with the word2vec toolkit
20 | 
21 | =================run=====================
22 | 
23 | ./insqa_train.py
24 | 
25 | I use python3.4; some of the code may be incompatible with python2. If you run python2 you will need to make a few small changes yourself, but the core CNN code
26 | should not need to be modified.
27 | The data paths in the code (such as '/export/...') must be adjusted to your own environment; just point them at your own data. The core CNN code needs no changes.
--------------------------------------------------------------------------------
/cnn/theano/README.md:
--------------------------------------------------------------------------------
1 | 
2 | ================result==================
3 | theano and tensorflow cnn code for insuranceQA
4 | 
5 | theano code, test1 top-1 precision: 61.5% (see ./insuranceQA/acc)
6 | tensorflow code, test1 top-1 precision: 62.6%
7 | 
8 | the best precision in the paper is 62.8% (see Applying Deep Learning To Answer Selection: A Study and an Open Task)
9 | 
10 | ================dataset================
11 | the dataset is large, so only a test1 sample is given (see ./insuranceQA/test1.sample)
12 | 
13 | I converted the original
idx_xx format to real-word format (see ./insuranceQA/train ./insuranceQA/test1.sample)
14 | 
15 | you can get the original dataset from https://github.com/shuzi/insuranceQA
16 | 
17 | word embeddings are trained with the word2vec toolkit
18 | 
19 | =================run=====================
20 | reformat the original dataset (see my train and test1.sample)
21 | change the file paths to your dataset (see TODO in insqa_cnn.py)
22 | python insqa_cnn.py
23 | 
--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
3 | dataset_ins = 'insurance-qa'
4 | dataset_qur = 'quora-qa'
5 | 
6 | ##################################################################
7 | # adjust to your running environment
8 | # which data do you want
9 | dataset = dataset_qur
10 | # word2vec command path
11 | w2v_command = '/export/jw/word2vec/word2vec'
12 | ##################################################################
13 | 
14 | home = ''
15 | if dataset == dataset_ins:
16 |     home = os.path.expanduser('/export/jw/insuranceQA')
17 | elif dataset == dataset_qur:
18 |     home = os.path.expanduser('/export/jw/quora')
19 | 
20 | # Insurance-QA original data directory
21 | qa_version = 'V1'
22 | vocab_file = os.path.join(home, qa_version, 'vocabulary')
23 | answers_file = os.path.join(home, qa_version, 'answers.label.token_idx')
24 | question_train_file = os.path.join(home, qa_version, 'question.train.token_idx.label')
25 | question_test1_file = os.path.join(home, qa_version, 'question.test1.label.token_idx.pool')
26 | question_test2_file = os.path.join(home, qa_version, 'question.test2.label.token_idx.pool')
27 | question_dev_file = os.path.join(home, qa_version, 'question.dev.label.token_idx.pool')
28 | # quora original data directory
29 | qr_file = os.path.join(home, 'quora_duplicate_questions.tsv')
30 | qr_train_ratio = 0.8
31 | # processed files
32 | train_file = os.path.join(home, 'data', 'train.prepro')
33 | test1_file = os.path.join(home, 'data', 'test1.prepro')
34 | test2_file = os.path.join(home, 'data', 'test2.prepro')
35 | w2v_train_file = os.path.join(home, 'data', 'w2v.train')
36 | w2v_bin_file = os.path.join(home, 'data', 'w2v.bin')
37 | predict1_file = os.path.join(home, 'data', 'predict1')
38 | 
39 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Insurance-QA deep learning models
2 | ======
3 | This is a repo for Q&A matching and includes several deep learning models, such as CNN and RNN.
4 | 1. CNN. Basic CNN model from 《Applying Deep Learning To Answer Selection: A Study And An Open Task》
5 | 2. RNN. RNN seems to be the best model on the Insurance-QA dataset.
6 | 3. SWEM. SWEM is the fastest and works well on other datasets such as WikiQA, but it seems less effective on the Insurance-QA dataset. I think SWEM is better suited to Q&Q matching than to Q&A matching (see the pooling sketch below).
7 | 
8 | 
9 | It's hard to say which model is best on other datasets; you have to choose the one that best fits your task.
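For readers unfamiliar with SWEM, the variant used here (./swem/swem_hier.py) builds a sentence vector by average-pooling a small window of word embeddings and then max-pooling over window positions, and compares two sentences by cosine similarity. The NumPy snippet below is only a minimal sketch of that pooling for a single sentence; the function and variable names are illustrative, not part of this repo.

```python
import numpy as np

def swem_hier(embeddings, window=2):
    """Hierarchical SWEM pooling: average each window of consecutive word
    embeddings, then take the element-wise max over all window positions.
    embeddings: (seq_len, embedding_dim) array for one sentence."""
    seq_len = embeddings.shape[0]
    windows = np.stack([embeddings[i:i + window].mean(axis=0)
                        for i in range(seq_len - window + 1)])
    return windows.max(axis=0)          # (embedding_dim,) sentence vector

def cosine(u, v):
    return float(u @ v / (np.linalg.norm(u) * np.linalg.norm(v)))

# toy usage: compare two random 5-word "sentences" with 100-dim embeddings
q, a = np.random.rand(5, 100), np.random.rand(5, 100)
print(cosine(swem_hier(q), swem_hier(a)))
```

Because there are no learned filters or recurrent cells, this pooling is why SWEM trains and evaluates much faster than the CNN and LSTM+CNN models.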

10 | More models are on the way; watch for updates.
11 | 
12 | ## Requirements
13 | 1. tensorflow 1.4.0
14 | 2. python3.5
15 | 
16 | ## Performance
17 | margin loss version
18 | 
19 | Model/Score | Ins_qa_top1_precision | quora_best_prec
20 | ------------ | ------------- | -------------
21 | CNN | 62% | None
22 | LSTM+CNN | 68% | None
23 | SWEM | <55% | None
24 | 
25 | logloss version
26 | 
27 | Model/Score | Insqa_top1_precision | quora_best_prec
28 | ------------ | ------------- | -------------
29 | CNN | None | 79.60%
30 | LSTM+CNN | None | None
31 | SWEM | <40% | 82.69%
32 | 
33 | ## Running
34 | Change the configuration to match your own environment, in particular the data paths
35 | 
36 |     vim config.py
37 | 
38 | Data processing
39 | 
40 |     python3 gen.py
41 | 
42 | Run CNN model
43 | 
44 |     cd ./cnn/tensorflow && python3 insqa_train.py
45 | 
46 | It will take a few hours (thousands of epochs) to train this model on a single GPU; the margin loss it minimizes is sketched below.
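The training script optimizes the pairwise margin objective (the "margin loss version" in the table above): each batch pairs a question with a correct answer and a randomly sampled wrong answer, and the model is penalized whenever cos(q, a+) does not beat cos(q, a-) by at least the margin (0.05 by default). A minimal NumPy sketch of that objective, with illustrative names:

```python
import numpy as np

def cosine(u, v):
    return u @ v / (np.linalg.norm(u) * np.linalg.norm(v))

def margin_loss(q, a_pos, a_neg, margin=0.05):
    """Hinge loss on the cosine gap, as in insqa_cnn.py:
    max(0, margin - (cos(q, a+) - cos(q, a-)))."""
    return max(0.0, margin - (cosine(q, a_pos) - cosine(q, a_neg)))

# toy triple of pooled 256-dim representations
q, a_pos, a_neg = np.random.rand(3, 256)
print(margin_loss(q, a_pos, a_neg))
```

In insqa_cnn.py the same formula is applied batch-wise with tf.maximum, and a triple only counts as correct for the reported training accuracy when its loss is exactly zero, i.e. when the positive answer beats the negative one by at least the margin.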
47 | 
48 | ## Downloads
49 | 1. You can get the Insurance-QA data from https://github.com/shuzi/insuranceQA (the idx-to-word conversion it needs is sketched after this list)
50 | 2. You can get the Quora data from http://qim.ec.quoracdn.net/quora_duplicate_questions.tsv
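The downloaded Insurance-QA files store sentences as idx_xx tokens plus a vocabulary file; gen.py converts them into the word-level format the models read, with the words of each sentence joined by '_'. A rough sketch of that conversion, assuming the V1 vocabulary stores one tab-separated idx/word pair per line (the helper names below are illustrative, not the repo's API):

```python
# illustrative sketch of the idx_xx -> word conversion performed by gen.py
def load_vocab(vocab_path):
    """One tab-separated pair per line; maps each idx_xx token to its word."""
    vocab = {}
    for line in open(vocab_path):
        idx, word = line.strip().split('\t')
        vocab[idx] = word
    return vocab

def to_words(idx_sentence, vocab):
    """'idx_7 idx_42' -> '<word7>_<word42>': words joined by '_',
    which is how the processed train/test1.prepro files store sentences."""
    return '_'.join(vocab[tok] for tok in idx_sentence.split(' '))
```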
51 | 
52 | ## Links
53 | 1. CNN and RNN text classification repo: https://github.com/white127/TextClassification_CNN_RNN
54 | 2. 《Applying Deep Learning To Answer Selection: A Study And An Open Task》
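For the "logloss version" rows in the Performance table above, the cosine of the two pooled sentence vectors is treated as a probability and scored with binary cross-entropy; the cosine is clipped to [1e-5, 0.99999], as in the SWEM scripts, so the logarithms stay finite. A minimal NumPy sketch:

```python
import numpy as np

def logloss(y, cos_sim, eps=1e-5):
    """Binary cross-entropy on the clipped cosine similarity:
    -(y*log(p) + (1-y)*log(1-p)), with p = clip(cos, 1e-5, 1 - 1e-5)."""
    p = np.clip(cos_sim, eps, 1.0 - eps)
    return -(y * np.log(p) + (1.0 - y) * np.log(1.0 - p))

# y = 1 for a matching pair, 0 for a non-matching one
print(logloss(1.0, 0.9), logloss(0.0, 0.9))
```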
55 | 56 | -------------------------------------------------------------------------------- /cnn/tensorflow/insqa_cnn.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | ########################################################################## 5 | # embedding_lookup + cnn + cosine margine , batch 6 | ########################################################################## 7 | class InsQACNN(object): 8 | def __init__(self, _margin, sequence_length, batch_size, 9 | vocab_size, embedding_size, 10 | filter_sizes, num_filters, l2_reg_lambda=0.0): 11 | self.L, self.B, self.V, self.E, self.FS, self.NF = sequence_length, batch_size, \ 12 | vocab_size, embedding_size, filter_sizes, num_filters 13 | 14 | #用户问题,字向量使用embedding_lookup 15 | self.q = tf.placeholder(tf.int32, [self.B, self.L], name="q") 16 | #待匹配正向问题 17 | self.qp = tf.placeholder(tf.int32, [self.B, self.L], name="qp") 18 | #负向问题 19 | self.qn = tf.placeholder(tf.int32, [self.B, self.L], name="qn") 20 | self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob") 21 | l2_loss = tf.constant(0.0) 22 | 23 | # Embedding layer 24 | with tf.device('/cpu:0'), tf.name_scope("embedding"): 25 | W = tf.get_variable( 26 | initializer=tf.random_uniform([self.V, self.E], -1.0, 1.0), 27 | name='We') 28 | self.qe = tf.nn.embedding_lookup(W, self.q) 29 | self.qpe = tf.nn.embedding_lookup(W, self.qp) 30 | self.qne = tf.nn.embedding_lookup(W, self.qn) 31 | self.qe = tf.expand_dims(self.qe, -1) 32 | self.qpe = tf.expand_dims(self.qpe, -1) 33 | self.qne = tf.expand_dims(self.qne, -1) 34 | 35 | with tf.variable_scope('shared-conv') as scope: 36 | self.qe = self.conv(self.qe) 37 | scope.reuse_variables() 38 | #tf.get_variable_scope().reuse_variables() 39 | self.qpe = self.conv(self.qpe) 40 | scope.reuse_variables() 41 | #tf.get_variable_scope().reuse_variables() 42 | self.qne = self.conv(self.qne) 43 | self.cos_q_qp = self.cosine(self.qe, self.qpe) 44 | self.cos_q_qn = self.cosine(self.qe, self.qne) 45 | zero = tf.constant(0, shape=[self.B], dtype=tf.float32) 46 | margin = tf.constant(_margin, shape=[self.B], dtype=tf.float32) 47 | with tf.name_scope("loss"): 48 | self.losses = tf.maximum(zero, tf.subtract(margin, tf.subtract(self.cos_q_qp, self.cos_q_qn))) 49 | self.loss = tf.reduce_sum(self.losses) + l2_reg_lambda * l2_loss 50 | print('loss ', self.loss) 51 | 52 | # Accuracy 53 | with tf.name_scope("accuracy"): 54 | self.correct = tf.equal(zero, self.losses) 55 | self.accuracy = tf.reduce_mean(tf.cast(self.correct, "float"), name="accuracy") 56 | 57 | for v in tf.trainable_variables(): 58 | print(v) 59 | 60 | def conv(self, tensor): 61 | pooled = [] 62 | #with tf.variable_scope(name_or_scope='my-conv', reuse=tf.AUTO_REUSE): 63 | with tf.variable_scope("my-conv-shared"): 64 | for i, fs in enumerate(self.FS): 65 | filter_shape = [fs, self.E, 1, self.NF] 66 | W = tf.get_variable(initializer=tf.truncated_normal(filter_shape, stddev=0.1), 67 | name="W-%s" % str(fs)) 68 | b = tf.get_variable(initializer=tf.constant(0.1, shape=[self.NF]), 69 | name="b-%s" % str(fs)) 70 | conv = tf.nn.conv2d( 71 | tensor, W, strides=[1, 1, 1, 1], padding='VALID', 72 | name="conv") 73 | h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu") 74 | output = tf.nn.max_pool( 75 | h, ksize=[1, self.L - fs + 1, 1, 1], strides=[1, 1, 1, 1], padding='VALID', 76 | name="pool") 77 | pooled.append(output) 78 | num_filters_total = self.NF * len(self.FS) 79 | pooled = 
tf.reshape(tf.concat(pooled, 3), [-1, num_filters_total]) 80 | pooled = tf.nn.dropout(pooled, self.dropout_keep_prob) 81 | return pooled 82 | 83 | def cosine(self, v1, v2): 84 | l1 = tf.sqrt(tf.reduce_sum(tf.multiply(v1, v1), 1)) 85 | l2 = tf.sqrt(tf.reduce_sum(tf.multiply(v2, v2), 1)) 86 | a = tf.reduce_sum(tf.multiply(v1, v2), 1) 87 | cos = tf.div(a, tf.multiply(l1, l2), name='score') 88 | return tf.clip_by_value(cos, 1e-5, 0.99999) 89 | 90 | -------------------------------------------------------------------------------- /gen.py: -------------------------------------------------------------------------------- 1 | import config, os, random 2 | 3 | ##################################################################### 4 | # function: load vocab 5 | # return: dict[word] = [word_id] 6 | ##################################################################### 7 | def load_vocab(): 8 | voc = {} 9 | for line in open(config.vocab_file): 10 | word, _id = line.strip().split('\t') 11 | voc[word] = _id 12 | return voc 13 | 14 | ##################################################################### 15 | # function: load answers, restore idx to real word 16 | # return : [answer_1, answer_2, ..., answer_n] 17 | ##################################################################### 18 | def ins_load_answers(): 19 | _list, voc = [''], load_vocab() 20 | for line in open(config.answers_file): 21 | _, sent = line.strip().split('\t') 22 | _list.append('_'.join([voc[wid] for wid in sent.split(' ')])) 23 | return _list 24 | 25 | ##################################################################### 26 | # function: preprea word2vec binary file 27 | # return : 28 | ##################################################################### 29 | def ins_w2v(): 30 | print('preparing word2vec ......') 31 | _data, voc = [], load_vocab() 32 | for line in open(config.question_train_file): 33 | items = line.strip().split('\t') 34 | _data.append(' '.join([voc[_id] for _id in items[0].split(' ')])) 35 | for _file in [config.answers_file, config.question_dev_file, \ 36 | config.question_test1_file, config.question_test2_file]: 37 | for line in open(_file): 38 | items = line.strip().split('\t') 39 | _data.append(' '.join([voc[_id] for _id in items[1].split(' ')])) 40 | of = open(config.w2v_train_file, 'w') 41 | for s in _data: of.write(s + '\n') 42 | of.close() 43 | os.system('time ' + config.w2c_command + ' -train ' + config.w2v_train_file + ' -output ' + config.w2v_bin_file + ' -cbow 0 -size 100 -window 5 -negative 20 -sample 1e-3 -threads 12 -binary 0 -min-count 1') 44 | 45 | ##################################################################### 46 | # function: preprea train file 47 | # file format: flag question answer 48 | ##################################################################### 49 | def ins_train(): 50 | print('preparing train ......') 51 | answers, voc, _data = ins_load_answers(), load_vocab(), [] 52 | for line in open(config.question_train_file): 53 | qsent, ids = line.strip().split('\t') 54 | qsent = '_'.join([voc[wid] for wid in qsent.split(' ')]) 55 | for _id in ids.split(' '): 56 | _data.append(' '.join(['1', qsent, answers[int(_id)]])) 57 | of = open(config.train_file, 'w') 58 | for _s in _data: of.write(_s + '\n') 59 | of.close() 60 | 61 | ##################################################################### 62 | # function: preprea test file 63 | # file format: flag group_id question answer 64 | ##################################################################### 65 | def ins_test(): 66 | print('preparing 
test ......') 67 | answers, voc = ins_load_answers(), load_vocab() 68 | for _in, _out in ([(config.question_test2_file, config.test2_file), \ 69 | (config.question_test1_file, config.test1_file)]): 70 | _data, group = [], int(0) 71 | for line in open(_in): 72 | pids, qsent, pnids = line.strip().split('\t') 73 | positive = {_id:'#' for _id in pids.split(' ')} 74 | qsent = '_'.join([voc[wid] for wid in qsent.split(' ')]) 75 | for _id in pnids.split(' '): 76 | flag = '1' if _id in positive else '0' 77 | _data.append(' '.join([flag, str(group), qsent, answers[int(_id)]])) 78 | group += 1 79 | of = open(_out, 'w') 80 | for s in _data: of.write(s + '\n') 81 | of.close() 82 | 83 | def ins_qa(): 84 | ins_w2v() 85 | ins_train() 86 | ins_test() 87 | 88 | def qur_prepare(): 89 | #pretrain word2vec 90 | _list = [] 91 | for line in open(config.qr_file): 92 | items = line.strip().split('\t') 93 | if len(items) != 6: 94 | continue 95 | _list.append(items) 96 | _list = _list[1:] 97 | random.shuffle(_list) 98 | _list = [(f, q1, q2) for _,_,_,q1,q2,f in _list] 99 | of = open(config.w2v_train_file, 'w') 100 | for f, q1, q2 in _list: 101 | of.write(q1 + '\n') 102 | of.write(q2 + '\n') 103 | of.close() 104 | os.system('time ' + config.w2v_command + ' -train ' + config.w2v_train_file + ' -output ' + config.w2v_bin_file + ' -cbow 0 -size 100 -window 5 -negative 20 -sample 1e-3 -threads 12 -binary 0 -min-count 1') 105 | #train file 106 | _newlist = [] 107 | for f, q1, q2 in _list: 108 | if len(q1) <= 1 or len(q2) <= 1: continue 109 | q1 = '_'.join(q1.split(' ')) 110 | q2 = '_'.join(q2.split(' ')) 111 | _newlist.append((f, q1, q2)) 112 | _list = _newlist 113 | of = open(config.train_file, 'w') 114 | for f, q1, q2 in _list[:int(len(_list) * 0.8)]: 115 | of.write(' '.join([f, q1, q2]) + '\n') 116 | of.close() 117 | 118 | #test file 119 | of = open(config.test1_file, 'w') 120 | for f, q1, q2 in _list[int(len(_list) * 0.8):]: 121 | of.write(' '.join([f, q1, q2]) + '\n') 122 | of.close() 123 | 124 | def qur_qa(): 125 | qur_prepare() 126 | 127 | if __name__ == '__main__': 128 | if config.dataset == config.dataset_ins: 129 | ins_qa() 130 | elif config.dataset == config.dataset_qur: 131 | qur_qa() 132 | -------------------------------------------------------------------------------- /swem/swem_hier.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import time, os, random, datetime, sys 4 | from sklearn import metrics 5 | sys.path.append('../') 6 | import config, utils 7 | 8 | ################################################################################ 9 | # Insurance-QA 10 | # AUC 0.96, top 1 precision:31% 11 | # 12 | # quora-data 13 | # best precision: 0.8369, best threshold:0.62 14 | ################################################################################ 15 | class SWEM_HIER(object): 16 | def __init__(self, 17 | sequence_length, 18 | vocab_size, 19 | embedding_size, 20 | embeddings): 21 | self.x1 = tf.placeholder(tf.int32, [None, sequence_length]) 22 | self.x2 = tf.placeholder(tf.int32, [None, sequence_length]) 23 | self.y = tf.placeholder(tf.float32, [None]) 24 | self.one = tf.placeholder(tf.float32, [None]) 25 | #self.dropout_keep_prob = tf.placeholder(tf.float32) 26 | 27 | with tf.device('/cpu:0'), tf.name_scope('embedding'): 28 | self.word_mat = tf.Variable(embeddings, trainable=True, dtype=tf.float32) 29 | x1_mat = tf.nn.embedding_lookup(self.word_mat, self.x1) 30 | x2_mat = tf.nn.embedding_lookup(self.word_mat, 
self.x2) 31 | self.x1_mat_exp = tf.expand_dims(x1_mat, -1) 32 | self.x2_mat_exp = tf.expand_dims(x2_mat, -1) 33 | p1 = tf.nn.avg_pool(self.x1_mat_exp, ksize=[1, 2, 1, 1], 34 | strides=[1, 1, 1, 1], padding='VALID') 35 | p2 = tf.nn.avg_pool(self.x2_mat_exp, ksize=[1, 2, 1, 1], 36 | strides=[1, 1, 1, 1], padding='VALID') 37 | p1 = tf.reshape(tf.reduce_max(p1, 1), [-1, embedding_size]) 38 | p2 = tf.reshape(tf.reduce_max(p2, 1), [-1, embedding_size]) 39 | """ 40 | p11 = tf.nn.avg_pool(self.x1_mat_exp, ksize=[1, 3, 1, 1], 41 | strides=[1, 1, 1, 1], padding='VALID') 42 | p21 = tf.nn.avg_pool(self.x2_mat_exp, ksize=[1, 3, 1, 1], 43 | strides=[1, 1, 1, 1], padding='VALID') 44 | p11 = tf.reshape(tf.reduce_max(p11, 1), [-1, embedding_size]) 45 | p21 = tf.reshape(tf.reduce_max(p21, 1), [-1, embedding_size]) 46 | p1 = tf.concat([p1, p11], 1) 47 | p2 = tf.concat([p2, p21], 1) 48 | """ 49 | 50 | self.cos = self.cosine(p1, p2) 51 | self.losses = self.logloss(self.y, self.one, self.cos) 52 | 53 | def logloss(self, y, v_one, sim): 54 | a = tf.multiply(y, tf.log(sim)) #y*log(p) 55 | b = tf.subtract(v_one, y)#1-y 56 | c = tf.log(tf.subtract(v_one, sim))#log(1-p) 57 | losses = -tf.add(a, tf.multiply(b, c))#y*log(p)+(1-y)*log(1-p) 58 | losses = tf.reduce_sum(losses, -1) 59 | return losses 60 | 61 | def cosine(self, t1, t2): 62 | len1 = tf.sqrt(tf.reduce_sum(tf.multiply(t1, t1), 1)) 63 | len2 = tf.sqrt(tf.reduce_sum(tf.multiply(t2, t2), 1)) 64 | multiply = tf.reduce_sum(tf.multiply(t1, t2), 1) 65 | cos = tf.div(multiply, tf.multiply(len1, len2)) 66 | return tf.clip_by_value(cos, 1e-5, 0.99999) 67 | 68 | def get_constant(batch_size): 69 | one, zero = [1.0] * batch_size, [0.0] * batch_size 70 | return np.array(one), np.array(zero) 71 | 72 | max_len = 100 73 | num_epoch = 200000 74 | batch_size = 256 75 | checkpoint_every = 10000 76 | vocab, embeddings = utils.load_embeddings() 77 | embedding_size = len(embeddings[0]) 78 | train_data, test_data = utils.load_train_data(vocab, max_len), utils.load_test_data(vocab, max_len) 79 | print('load data done ......') 80 | print(embeddings.shape) 81 | 82 | prev_auc = 0.0 83 | with tf.Graph().as_default(): 84 | session_conf = tf.ConfigProto( 85 | allow_soft_placement=True, log_device_placement=False) 86 | sess = tf.Session(config=session_conf) 87 | with sess.as_default(): 88 | swem = SWEM_HIER(max_len, len(vocab), embedding_size, embeddings) 89 | global_step = tf.Variable(0, name="global_step", trainable=False) 90 | optimizer = tf.train.AdamOptimizer(1e-1) 91 | #optimizer = tf.train.GradientDescentOptimizer(1e-1) 92 | grads_and_vars = optimizer.compute_gradients(swem.losses) 93 | train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) 94 | 95 | timestamp = str(int(time.time())) 96 | out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp)) 97 | checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints")) 98 | checkpoint_prefix = os.path.join(checkpoint_dir, "model") 99 | if not os.path.exists(checkpoint_dir): 100 | os.makedirs(checkpoint_dir) 101 | saver = tf.train.Saver(tf.all_variables()) 102 | sess.run(tf.initialize_all_variables()) 103 | 104 | def train_step(): 105 | y, x1, x2 = utils.gen_train_batch_yxx(train_data, batch_size) 106 | one, zero = get_constant(batch_size) 107 | feed_dict = {swem.x1:x1, swem.x2:x2, swem.y:y, swem.one:one} 108 | _, step, loss, cos = sess.run( 109 | [train_op, global_step, swem.losses, swem.cos], feed_dict) 110 | time_str = datetime.datetime.now().isoformat() 111 | print("{}: step {}, loss 
{:g}".format(time_str, step, loss)) 112 | 113 | def test_step(): 114 | yp, y, group = [], [], [] 115 | for i in range(0, len(test_data), batch_size): 116 | f, g, x1, x2 = utils.gen_test_batch_yxx(test_data, i, i + batch_size) 117 | one, zero = get_constant(len(f)) 118 | feed_dict = {swem.x1:x1, swem.x2:x2, swem.y:f, swem.one:one} 119 | loss, cos = sess.run([swem.losses, swem.cos], feed_dict) 120 | yp.extend(cos) 121 | y.extend(f) 122 | group.extend(g) 123 | ppp = [(_y, _g, _yp) for _y, _g, _yp in zip(y, group, yp)] 124 | #for _y, _g, _yp in ppp: 125 | # print(str(_y) + ' ' + str(_g) + ' ' + str(_yp)) 126 | return y[:len(test_data)], group[:len(test_data)], yp[:len(test_data)] 127 | 128 | for i in range(num_epoch): 129 | train_step() 130 | current_step = tf.train.global_step(sess, global_step) 131 | if current_step % checkpoint_every == 0: 132 | y, g, yp = test_step() 133 | utils._eval(y, g, yp) 134 | 135 | #utils.save_features(features[0] + features[1] + features[2], './data/gen_sweg_hier_train.f') 136 | #utils.save_features(features[3], './data/gen_sweg_hier_test.f') 137 | -------------------------------------------------------------------------------- /rnn_attention/tensorflow/insurance_qa_data_helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | from operator import itemgetter 4 | 5 | precision = '/export/jw/cnn/insuranceQA/acc.lstm' 6 | 7 | empty_vector = [] 8 | for i in range(0, 100): 9 | empty_vector.append(float(0.0)) 10 | onevector = [] 11 | for i in range(0, 10): 12 | onevector.append(float(1)) 13 | zerovector = [] 14 | for i in range(0, 10): 15 | zerovector.append(float(0)) 16 | 17 | def build_vocab(): 18 | code, vocab = int(0), {} 19 | vocab['UNKNOWN'] = code 20 | code += 1 21 | for line in open('/export/jw/cnn/insuranceQA/train'): 22 | items = line.strip().split(' ') 23 | for i in range(2, 3): 24 | words = items[i].split('_') 25 | for word in words: 26 | if not word in vocab: 27 | vocab[word] = code 28 | code += 1 29 | for line in open('/export/jw/cnn/insuranceQA/test1'): 30 | items = line.strip().split(' ') 31 | for i in range(2, 3): 32 | words = items[i].split('_') 33 | for word in words: 34 | if not word in vocab: 35 | vocab[word] = code 36 | code += 1 37 | return vocab 38 | 39 | def read_alist(): 40 | alist = [] 41 | for line in open('/export/jw/cnn/insuranceQA/train'): 42 | items = line.strip().split(' ') 43 | alist.append(items[3]) 44 | print('read_alist done ......') 45 | return alist 46 | 47 | def load_vectors(): 48 | vectors = {} 49 | for line in open('/export/jw/cnn/insuranceQA/vectors.nobin'): 50 | items = line.strip().split(' ') 51 | if (len(items) < 101): 52 | continue 53 | vec = [] 54 | for i in range(1, 101): 55 | vec.append(float(items[i])) 56 | vectors[items[0]] = vec 57 | return vectors 58 | 59 | def read_vector(vectors, word): 60 | global empty_vector 61 | if word in vectors: 62 | return vectors[word] 63 | else: 64 | return empty_vector 65 | #return vectors[''] 66 | 67 | def load_train_list(): 68 | train_list = [] 69 | for line in open('/export/jw/cnn/insuranceQA/train'): 70 | items = line.strip().split(' ') 71 | if items[0] == '1': 72 | train_list.append(line.strip().split(' ')) 73 | return train_list 74 | 75 | def load_test_list(): 76 | test_list = [] 77 | for line in open('/export/jw/cnn/insuranceQA/test1'): 78 | test_list.append(line.strip().split(' ')) 79 | return test_list 80 | 81 | def load_train_and_vectors(): 82 | trainList = [] 83 | for line in 
open('/export/jw/cnn/insuranceQA/train'): 84 | trainList.append(line.strip()) 85 | vectors = load_vectors() 86 | return trainList, vectors 87 | 88 | def read_raw(): 89 | raw = [] 90 | for line in open('/export/jw/cnn/insuranceQA/train'): 91 | items = line.strip().split(' ') 92 | if items[0] == '1': 93 | raw.append(items) 94 | return raw 95 | 96 | def encode_sent(vocab, string, size): 97 | x, m = [], [] 98 | words = string.split('_') 99 | for i in range(0, size): 100 | if words[i] in vocab: x.append(vocab[words[i]]) 101 | else: x.append(vocab['UNKNOWN']) 102 | if words[i] == '': m.append(1) 103 | else: m.append(1) 104 | return x, m 105 | 106 | def load_val_data(test_list, vocab, index, batch_size, max_len): 107 | x1, x2, x3, m1, m2, m3 = [], [], [], [], [], [] 108 | for i in range(0, batch_size): 109 | t_i = index + i 110 | if t_i >= len(test_list): 111 | t_i = len(test_list) - 1 112 | items = test_list[t_i] 113 | x, m = encode_sent(vocab, items[2], max_len) 114 | x1.append(x) 115 | m1.append(m) 116 | x, m = encode_sent(vocab, items[3], max_len) 117 | x2.append(x) 118 | m2.append(m) 119 | x, m = encode_sent(vocab, items[3], max_len) 120 | x3.append(x) 121 | m3.append(m) 122 | return np.array(x1, dtype='float32'), np.array(x2, dtype='float32'), np.array(x3, dtype='float32'), np.transpose(np.array(m1, dtype='float32')) , np.transpose(np.array(m2, dtype='float32')), np.transpose(np.array(m3, dtype='float32')) 123 | 124 | def load_train_data(trainList, vocab, batch_size, max_len): 125 | train_1, train_2, train_3 = [], [], [] 126 | mask_1, mask_2, mask_3 = [], [], [] 127 | counter = 0 128 | while True: 129 | pos = trainList[random.randint(0, len(trainList)-1)] 130 | neg = trainList[random.randint(0, len(trainList)-1)] 131 | if pos[2].startswith('') or pos[3].startswith('') or neg[3].startswith(''): 132 | #print 'empty string ......' 
133 | continue 134 | x, m = encode_sent(vocab, pos[2], max_len) 135 | train_1.append(x) 136 | mask_1.append(m) 137 | x, m = encode_sent(vocab, pos[3], max_len) 138 | train_2.append(x) 139 | mask_2.append(m) 140 | x, m = encode_sent(vocab, neg[3], max_len) 141 | train_3.append(x) 142 | mask_3.append(m) 143 | counter += 1 144 | if counter >= batch_size: 145 | break 146 | return np.array(train_1, dtype='float32'), np.array(train_2, dtype='float32'), np.array(train_3, dtype='float32'), np.transpose(np.array(mask_1, dtype='float32')) , np.transpose(np.array(mask_2, dtype='float32')), np.transpose(np.array(mask_3, dtype='float32')) 147 | 148 | def evaluation(score_list, test_list): 149 | global precision 150 | sessdict, index = {}, int(0) 151 | for items in test_list: 152 | qid = items[1].split(':')[1] 153 | if not qid in sessdict: 154 | sessdict[qid] = [] 155 | sessdict[qid].append((score_list[index], items[0])) 156 | index += 1 157 | if index >= len(test_list): 158 | break 159 | lev1, lev0 = float(0), float(0) 160 | of = open(precision, 'a') 161 | for k, v in sessdict.items(): 162 | v.sort(key=itemgetter(0), reverse=True) 163 | score, flag = v[0] 164 | if flag == '1': lev1 += 1 165 | if flag == '0': lev0 += 1 166 | of.write('lev1:' + str(lev1) + '\n') 167 | of.write('lev0:' + str(lev0) + '\n') 168 | print('lev1 ' + str(lev1)) 169 | print('lev0 ' + str(lev0)) 170 | print('precision:' + str(lev1 / (lev0 + lev1))) 171 | of.close() 172 | -------------------------------------------------------------------------------- /swem/swem_max_margin.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import time, os, random, datetime, sys 4 | from sklearn import metrics 5 | sys.path.append('../') 6 | import config, utils 7 | 8 | class SWEM_HIER(object): 9 | def __init__(self, 10 | margin, 11 | sequence_length, 12 | vocab_size, 13 | embedding_size, 14 | embeddings): 15 | self.zero = tf.placeholder(tf.float32, [None]) 16 | self.q = tf.placeholder(tf.int32, [None, sequence_length]) 17 | self.qp = tf.placeholder(tf.int32, [None, sequence_length]) 18 | self.qn = tf.placeholder(tf.int32, [None, sequence_length]) 19 | 20 | with tf.device('/cpu:0'), tf.name_scope('embedding'): 21 | self.word_mat = tf.Variable(embeddings, trainable=True, dtype=tf.float32) 22 | q_mat = tf.nn.embedding_lookup(self.word_mat, self.q) 23 | qp_mat = tf.nn.embedding_lookup(self.word_mat, self.qp) 24 | qn_mat = tf.nn.embedding_lookup(self.word_mat, self.qn) 25 | self.q_mat_exp = tf.expand_dims(q_mat, -1) 26 | self.qp_mat_exp = tf.expand_dims(qp_mat, -1) 27 | self.qn_mat_exp = tf.expand_dims(qn_mat, -1) 28 | """ 29 | q = tf.nn.avg_pool(self.q_mat_exp, ksize=[1, 2, 1, 1], 30 | strides=[1, 1, 1, 1], padding='VALID') 31 | qp = tf.nn.avg_pool(self.qp_mat_exp, ksize=[1, 2, 1, 1], 32 | strides=[1, 1, 1, 1], padding='VALID') 33 | qn = tf.nn.avg_pool(self.qn_mat_exp, ksize=[1, 2, 1, 1], 34 | strides=[1, 1, 1, 1], padding='VALID') 35 | """ 36 | q = tf.reshape(tf.reduce_max(self.q_mat_exp, 1), [-1, embedding_size]) 37 | qp = tf.reshape(tf.reduce_max(self.qp_mat_exp, 1), [-1, embedding_size]) 38 | qn = tf.reshape(tf.reduce_max(self.qn_mat_exp, 1), [-1, embedding_size]) 39 | 40 | self.cos_q_qp = self.cosine(q, qp) 41 | self.cos_q_qn = self.cosine(q, qn) 42 | self.losses, loss_batch = self.margin_loss(self.zero, margin, self.cos_q_qp, self.cos_q_qn) 43 | 44 | correct = tf.equal(self.zero, loss_batch) 45 | self.accuracy = tf.reduce_mean(tf.cast(correct, "float")) 46 | 
47 | def margin_loss(self, zero, margin, cos_q_qp, cos_q_qn): 48 | loss_batch = tf.maximum(zero, tf.subtract(margin, tf.subtract(cos_q_qp, cos_q_qn))) 49 | losses = tf.reduce_sum(loss_batch) 50 | return losses, loss_batch 51 | 52 | def logloss(self, y, v_one, sim): 53 | a = tf.multiply(y, tf.log(sim)) #y*log(p) 54 | b = tf.subtract(v_one, y)#1-y 55 | c = tf.log(tf.subtract(v_one, sim))#log(1-p) 56 | losses = -tf.add(a, tf.multiply(b, c))#y*log(p)+(1-y)*log(1-p) 57 | losses = tf.reduce_sum(losses, -1) 58 | return losses 59 | 60 | def cosine(self, t1, t2): 61 | len1 = tf.sqrt(tf.reduce_sum(tf.multiply(t1, t1), 1)) 62 | len2 = tf.sqrt(tf.reduce_sum(tf.multiply(t2, t2), 1)) 63 | multiply = tf.reduce_sum(tf.multiply(t1, t2), 1) 64 | cos = tf.div(multiply, tf.multiply(len1, len2)) 65 | return tf.clip_by_value(cos, 1e-5, 0.99999) 66 | 67 | def get_constant(batch_size): 68 | one, zero = [1.0] * batch_size, [0.0] * batch_size 69 | return np.array(one), np.array(zero) 70 | 71 | margin = 0.05 72 | max_len = 200 73 | num_epoch = 200000 74 | batch_size = 256 75 | checkpoint_every = 50000 76 | vocab, embeddings = utils.load_embeddings() 77 | embedding_size = len(embeddings[0]) 78 | train_data, test_data = utils.load_train_data(vocab, max_len), utils.load_test_data(vocab, max_len) 79 | print('load data done ......') 80 | print(embeddings.shape) 81 | 82 | prev_auc = 0.0 83 | with tf.Graph().as_default(): 84 | session_conf = tf.ConfigProto( 85 | allow_soft_placement=True, log_device_placement=False) 86 | sess = tf.Session(config=session_conf) 87 | with sess.as_default(): 88 | swem = SWEM_HIER(margin, max_len, len(vocab), embedding_size, embeddings) 89 | global_step = tf.Variable(0, name="global_step", trainable=False) 90 | optimizer = tf.train.AdamOptimizer(1e-1) 91 | #optimizer = tf.train.GradientDescentOptimizer(1e-1) 92 | grads_and_vars = optimizer.compute_gradients(swem.losses) 93 | train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) 94 | 95 | timestamp = str(int(time.time())) 96 | out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp)) 97 | checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints")) 98 | checkpoint_prefix = os.path.join(checkpoint_dir, "model") 99 | if not os.path.exists(checkpoint_dir): 100 | os.makedirs(checkpoint_dir) 101 | saver = tf.train.Saver(tf.all_variables()) 102 | sess.run(tf.initialize_all_variables()) 103 | 104 | def train_step(): 105 | q, qp, qn = utils.gen_train_batch_qpn(train_data, batch_size) 106 | one, zero = get_constant(batch_size) 107 | feed_dict = {swem.q:q, swem.qp:qp, swem.qn:qn, swem.zero:zero} 108 | _, step, loss, cos, acc = sess.run( 109 | [train_op, global_step, swem.losses, swem.cos_q_qp, swem.accuracy], feed_dict) 110 | time_str = datetime.datetime.now().isoformat() 111 | print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, acc)) 112 | 113 | def test_step(): 114 | yp, y, group = [], [], [] 115 | for i in range(0, len(test_data), batch_size): 116 | f, g, q1, q2 = utils.gen_test_batch_qpn(test_data, i, i+batch_size) 117 | one, zero = get_constant(len(f)) 118 | feed_dict = {swem.q:q1, swem.qp:q2, swem.qn:q2, swem.zero:zero} 119 | loss, cos = sess.run([swem.losses, swem.cos_q_qp], feed_dict) 120 | yp.extend(cos) 121 | y.extend(f) 122 | group.extend(g) 123 | ppp = [(_y, _g, _yp) for _y, _g, _yp in zip(y, group, yp)] 124 | #for _y, _g, _yp in ppp: 125 | # print(str(_y) + ' ' + str(_g) + ' ' + str(_yp)) 126 | return y[:len(test_data)], group[:len(test_data)], yp[:len(test_data)] 127 | 128 
| for i in range(num_epoch): 129 | train_step() 130 | current_step = tf.train.global_step(sess, global_step) 131 | if current_step % checkpoint_every == 0: 132 | y, g, yp = test_step() 133 | auc = utils.eval_auc(y, g, yp) 134 | top1_prec = utils._eval_top1_prec(y, g, yp) 135 | #if auc < prev_auc: 136 | # _flist = [(_f, [s]) for s, _f in zip(score[:len(test_data)], flags)] 137 | # features.append(_flist) 138 | # break 139 | #prev_auc = auc 140 | 141 | #utils.save_features(features[0] + features[1] + features[2], './data/gen_sweg_hier_train.f') 142 | #utils.save_features(features[3], './data/gen_sweg_hier_test.f') 143 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random, sys, config 3 | from sklearn import metrics 4 | from operator import itemgetter 5 | from itertools import groupby 6 | 7 | def load_embeddings(): 8 | _data, embeddings, vocab, _id = [], [], {}, int(0) 9 | for line in open(config.w2v_bin_file): 10 | _data.append(line.strip().split(' ')) 11 | size, dim = int(_data[0][0]), int(_data[0][1]) 12 | for i in range(1, len(_data)): 13 | w, vec = _data[i][0], [float(_data[i][k]) for k in range(1, dim+1)] 14 | embeddings.append(vec) 15 | vocab[w] = _id 16 | _id += 1 17 | embeddings.append([0.01] * dim) 18 | vocab['UNKNOWN'] = _id 19 | _id += 1 20 | embeddings.append([0.01] * dim) 21 | vocab[''] = _id 22 | return vocab, np.array(embeddings) 23 | 24 | def encode_sent(s, vocab, max_len): 25 | ws = [w for w in s.split('_')] 26 | ws = ws[:max_len] if len(ws) >= max_len else ws + [''] * (max_len - len(ws)) 27 | nws = [] 28 | for w in ws: 29 | nw = w if w in vocab else 'UNKNOWN' 30 | nws.append(vocab[nw]) 31 | return nws 32 | 33 | def load_train_data(vocab, max_len): 34 | if config.dataset == config.dataset_ins: 35 | return ins_load_train_data(vocab, max_len) 36 | if config.dataset == config.dataset_qur: 37 | return qur_load_train_test_data(config.train_file, vocab, max_len) 38 | print('bad load_train_data') 39 | exit(1) 40 | 41 | def qur_load_train_test_data(_file, vocab, max_len): 42 | _data = [] 43 | for line in open(_file): 44 | f, q1, q2 = line.strip().split(' ') 45 | q1, q2 = encode_sent(q1, vocab, max_len), encode_sent(q2, vocab, max_len) 46 | _data.append((int(f), q1, q2)) 47 | return _data 48 | 49 | def ins_load_train_data(vocab, max_len): 50 | _data = [] 51 | for line in open(config.train_file): 52 | f, q1, q2 = line.strip().split(' ') 53 | q1, q2 = encode_sent(q1, vocab, max_len), encode_sent(q2, vocab, max_len) 54 | _data.append((q1, q2)) 55 | return _data 56 | 57 | def load_test_data(vocab, max_len): 58 | if config.dataset == config.dataset_ins: 59 | return ins_load_test_data(vocab, max_len) 60 | if config.dataset == config.dataset_qur: 61 | return qur_load_train_test_data(config.test1_file, vocab, max_len) 62 | print('bad load_test_data') 63 | exit(1) 64 | 65 | def ins_load_test_data(vocab, max_len): 66 | _data = [] 67 | for line in open(config.test1_file): 68 | f, g, q1, q2 = line.strip().split(' ') 69 | q1, q2 = encode_sent(q1, vocab, max_len), encode_sent(q2, vocab, max_len) 70 | _data.append((f, g, q1, q2)) 71 | return _data 72 | 73 | def gen_train_batch_qpn(_data, batch_size): 74 | psample = random.sample(_data, batch_size) 75 | nsample = random.sample(_data, batch_size) 76 | q = [s1 for s1, s2 in psample] 77 | qp = [s2 for s1, s2 in psample] 78 | qn = [s2 for s1, s2 in nsample] 79 | return np.array(q), 
np.array(qp), np.array(qn) 80 | 81 | def gen_train_batch_yxx(_data, batch_size): 82 | if config.dataset == config.dataset_ins: 83 | return ins_gen_train_batch_yxx(_data, batch_size) 84 | if config.dataset == config.dataset_qur: 85 | return qur_gen_train_batch_yxx(_data, batch_size) 86 | print('bad gen_train_batch_yxx') 87 | exit(1) 88 | 89 | def qur_gen_train_batch_yxx(_data, batch_size): 90 | sample = random.sample(_data, batch_size) 91 | y = [i for i,_,_ in sample] 92 | x1 = [i for _,i,_ in sample] 93 | x2 = [i for _,_,i in sample] 94 | return np.array(y), np.array(x1), np.array(x2) 95 | 96 | def ins_gen_train_batch_yxx(_data, batch_size): 97 | part_one, part_two = int(batch_size / 4 * 3), int(batch_size / 4) 98 | psample = random.sample(_data, part_one) 99 | nsample = random.sample(_data, part_two) 100 | y = [1.0] * part_one + [0.0] * part_two 101 | x1 = [s1 for s1, s2 in psample] + [s1 for s1, s2 in psample[:part_two]] 102 | x2 = [s2 for s1, s2 in psample] + [s2 for s1, s2 in nsample] 103 | return np.array(y), np.array(x1), np.array(x2) 104 | 105 | def gen_test_batch_qpn(_data, start, end): 106 | sample = _data[start:end] 107 | for i in range(len(sample), end - start): 108 | sample.append(sample[-1]) 109 | f = [int(i) for i,_,_,_ in sample] 110 | g = [int(i) for _,i,_,_ in sample] 111 | q1 = [i for _,_,i,_ in sample] 112 | q2 = [i for _,_,_,i in sample] 113 | return f, g, np.array(q1), np.array(q2) 114 | 115 | def gen_test_batch_yxx(_data, start, end): 116 | if config.dataset == config.dataset_ins: 117 | return ins_gen_test_batch_yxx(_data, start, end) 118 | if config.dataset == config.dataset_qur: 119 | return qur_gen_test_batch_yxx(_data, start, end) 120 | print('bad gen_test_batch_yxx') 121 | exit(1) 122 | 123 | def qur_gen_test_batch_yxx(_data, start, end): 124 | sample = _data[start:end] 125 | y = [i for i,_,_ in sample] 126 | x1 = [i for _,i,_ in sample] 127 | x2 = [i for _,_,i in sample] 128 | return y, y, np.array(x1), np.array(x2) 129 | 130 | def ins_gen_test_batch_yxx(_data, start, end): 131 | sample = _data[start:end] 132 | for i in range(len(sample), end - start): 133 | sample.append(sample[-1]) 134 | f = [int(i) for i,_,_,_ in sample] 135 | g = [int(i) for _,i,_,_ in sample] 136 | q1 = [i for _,_,i,_ in sample] 137 | q2 = [i for _,_,_,i in sample] 138 | return f, g, np.array(q1), np.array(q2) 139 | 140 | def _eval(y, g, yp): 141 | if config.dataset == config.dataset_ins: 142 | eval_auc(y, g, yp) 143 | eval_top1_prec(y, g, yp) 144 | if config.dataset == config.dataset_qur: 145 | eval_auc(y, g, yp) 146 | eval_best_prec(y, g, yp) 147 | 148 | def eval_best_prec(y, g, yp): 149 | best_p, best_s = 0.0, 0.0 150 | for i in range(50, 100, 1): 151 | i = float(i) / 100 152 | positive = 0 153 | for _y, _yp in zip(y, yp): 154 | p = 1 if _yp >= i else 0 155 | if p == _y: positive += 1 156 | prec = positive / len(yp) 157 | if prec > best_p: 158 | best_p = prec 159 | best_s = i 160 | print('best_prec: ' + str(best_p) + ' best_threshold:' + str(best_s)) 161 | return best_p, best_s 162 | 163 | def eval_auc(y, g, yp): 164 | auc = metrics.roc_auc_score(y, yp) 165 | print('auc: ' + str(auc)) 166 | return auc 167 | 168 | def eval_top1_prec(y, g, yp): 169 | _list = [(_y, _g, _yp) for _y, _g, _yp in zip(y, g, yp)] 170 | _dict = {} 171 | for _y, _g, _yp in _list: 172 | if not _g in _dict: _dict[_g] = [] 173 | _dict[_g].append((_y, _g, _yp)) 174 | positive, gc = 0 , 0 175 | for _, group in _dict.items(): 176 | group = sorted(group, key=itemgetter(2), reverse=True) 177 | gc += 1 178 | if group[0][0] 
== 1: 179 | positive += 1 180 | prec = positive / gc 181 | print('top1 precision ' + str(positive) + '/' + str(gc) + ': '+ str(positive / gc)) 182 | return prec 183 | 184 | -------------------------------------------------------------------------------- /cnn/tensorflow/insqa_cnn.py.old: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | ########################################################################## 5 | # embedding_lookup + cnn + cosine margine , batch 6 | ########################################################################## 7 | class InsQACNN1(object): 8 | def __init__( 9 | self, sequence_length, batch_size, 10 | vocab_size, embedding_size, 11 | filter_sizes, num_filters, l2_reg_lambda=0.0): 12 | 13 | #用户问题,字向量使用embedding_lookup 14 | self.input_x_1 = tf.placeholder(tf.int32, [batch_size, sequence_length], name="input_x_1") 15 | #待匹配正向问题 16 | self.input_x_2 = tf.placeholder(tf.int32, [batch_size, sequence_length], name="input_x_2") 17 | #负向问题 18 | self.input_x_3 = tf.placeholder(tf.int32, [batch_size, sequence_length], name="input_x_3") 19 | self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob") 20 | l2_loss = tf.constant(0.0) 21 | print("input_x_1 ", self.input_x_1) 22 | 23 | # Embedding layer 24 | with tf.device('/cpu:0'), tf.name_scope("embedding"): 25 | W = tf.Variable( 26 | tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0), 27 | name="W") 28 | chars_1 = tf.nn.embedding_lookup(W, self.input_x_1) 29 | chars_2 = tf.nn.embedding_lookup(W, self.input_x_2) 30 | chars_3 = tf.nn.embedding_lookup(W, self.input_x_3) 31 | #self.embedded_chars_1 = tf.nn.dropout(chars_1, self.dropout_keep_prob) 32 | #self.embedded_chars_2 = tf.nn.dropout(chars_2, self.dropout_keep_prob) 33 | #self.embedded_chars_3 = tf.nn.dropout(chars_3, self.dropout_keep_prob) 34 | self.embedded_chars_1 = chars_1 35 | self.embedded_chars_2 = chars_2 36 | self.embedded_chars_3 = chars_3 37 | self.embedded_chars_expanded_1 = tf.expand_dims(self.embedded_chars_1, -1) 38 | self.embedded_chars_expanded_2 = tf.expand_dims(self.embedded_chars_2, -1) 39 | self.embedded_chars_expanded_3 = tf.expand_dims(self.embedded_chars_3, -1) 40 | 41 | pooled_outputs_1 = [] 42 | pooled_outputs_2 = [] 43 | pooled_outputs_3 = [] 44 | for i, filter_size in enumerate(filter_sizes): 45 | with tf.name_scope("conv-maxpool-%s" % filter_size): 46 | filter_shape = [filter_size, embedding_size, 1, num_filters] 47 | W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W") 48 | b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b") 49 | conv = tf.nn.conv2d( 50 | self.embedded_chars_expanded_1, 51 | W, 52 | strides=[1, 1, 1, 1], 53 | padding='VALID', 54 | name="conv-1" 55 | ) 56 | h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu-1") 57 | pooled = tf.nn.max_pool( 58 | h, 59 | ksize=[1, sequence_length - filter_size + 1, 1, 1], 60 | strides=[1, 1, 1, 1], 61 | padding='VALID', 62 | name="poll-1" 63 | ) 64 | pooled_outputs_1.append(pooled) 65 | 66 | conv = tf.nn.conv2d( 67 | self.embedded_chars_expanded_2, 68 | W, 69 | strides=[1, 1, 1, 1], 70 | padding='VALID', 71 | name="conv-2" 72 | ) 73 | h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu-2") 74 | pooled = tf.nn.max_pool( 75 | h, 76 | ksize=[1, sequence_length - filter_size + 1, 1, 1], 77 | strides=[1, 1, 1, 1], 78 | padding='VALID', 79 | name="poll-2" 80 | ) 81 | pooled_outputs_2.append(pooled) 82 | 83 | conv = tf.nn.conv2d( 84 | 
self.embedded_chars_expanded_3, 85 | W, 86 | strides=[1, 1, 1, 1], 87 | padding='VALID', 88 | name="conv-3" 89 | ) 90 | h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu-3") 91 | pooled = tf.nn.max_pool( 92 | h, 93 | ksize=[1, sequence_length - filter_size + 1, 1, 1], 94 | strides=[1, 1, 1, 1], 95 | padding='VALID', 96 | name="poll-3" 97 | ) 98 | pooled_outputs_3.append(pooled) 99 | num_filters_total = num_filters * len(filter_sizes) 100 | pooled_reshape_1 = tf.reshape(tf.concat(pooled_outputs_1, 3), [-1, num_filters_total]) 101 | pooled_reshape_2 = tf.reshape(tf.concat(pooled_outputs_2, 3), [-1, num_filters_total]) 102 | pooled_reshape_3 = tf.reshape(tf.concat(pooled_outputs_3, 3), [-1, num_filters_total]) 103 | #dropout 104 | pooled_flat_1 = tf.nn.dropout(pooled_reshape_1, self.dropout_keep_prob) 105 | pooled_flat_2 = tf.nn.dropout(pooled_reshape_2, self.dropout_keep_prob) 106 | pooled_flat_3 = tf.nn.dropout(pooled_reshape_3, self.dropout_keep_prob) 107 | 108 | pooled_len_1 = tf.sqrt(tf.reduce_sum(tf.multiply(pooled_flat_1, pooled_flat_1), 1)) #计算向量长度Batch模式 109 | pooled_len_2 = tf.sqrt(tf.reduce_sum(tf.multiply(pooled_flat_2, pooled_flat_2), 1)) 110 | pooled_len_3 = tf.sqrt(tf.reduce_sum(tf.multiply(pooled_flat_3, pooled_flat_3), 1)) 111 | pooled_mul_12 = tf.reduce_sum(tf.multiply(pooled_flat_1, pooled_flat_2), 1) #计算向量的点乘Batch模式 112 | pooled_mul_13 = tf.reduce_sum(tf.multiply(pooled_flat_1, pooled_flat_3), 1) 113 | 114 | with tf.name_scope("output"): 115 | self.cos_12 = tf.div(pooled_mul_12, tf.multiply(pooled_len_1, pooled_len_2), name="scores") #计算向量夹角Batch模式 116 | self.cos_13 = tf.div(pooled_mul_13, tf.multiply(pooled_len_1, pooled_len_3)) 117 | 118 | zero = tf.constant(0, shape=[batch_size], dtype=tf.float32) 119 | margin = tf.constant(0.05, shape=[batch_size], dtype=tf.float32) 120 | with tf.name_scope("loss"): 121 | self.losses = tf.maximum(zero, tf.subtract(margin, tf.subtract(self.cos_12, self.cos_13))) 122 | self.loss = tf.reduce_sum(self.losses) + l2_reg_lambda * l2_loss 123 | print('loss ', self.loss) 124 | 125 | # Accuracy 126 | with tf.name_scope("accuracy"): 127 | self.correct = tf.equal(zero, self.losses) 128 | self.accuracy = tf.reduce_mean(tf.cast(self.correct, "float"), name="accuracy") 129 | for v in tf.trainable_variables(): 130 | print(v) 131 | exit(1) 132 | -------------------------------------------------------------------------------- /swem/swem_hier_margin.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import time, os, random, datetime, sys 4 | from sklearn import metrics 5 | sys.path.append('../') 6 | import config, utils 7 | 8 | #top 1 precision:54% 9 | class SWEM_HIER(object): 10 | def __init__(self, 11 | margin, 12 | sequence_length, 13 | vocab_size, 14 | embedding_size, 15 | embeddings): 16 | self.zero = tf.placeholder(tf.float32, [None]) 17 | self.q = tf.placeholder(tf.int32, [None, sequence_length]) 18 | self.qp = tf.placeholder(tf.int32, [None, sequence_length]) 19 | self.qn = tf.placeholder(tf.int32, [None, sequence_length]) 20 | 21 | with tf.device('/cpu:0'), tf.name_scope('embedding'): 22 | self.word_mat = tf.Variable(embeddings, trainable=True, dtype=tf.float32) 23 | q_mat = tf.nn.embedding_lookup(self.word_mat, self.q) 24 | qp_mat = tf.nn.embedding_lookup(self.word_mat, self.qp) 25 | qn_mat = tf.nn.embedding_lookup(self.word_mat, self.qn) 26 | self.q_mat_exp = tf.expand_dims(q_mat, -1) 27 | self.qp_mat_exp = tf.expand_dims(qp_mat, -1) 28 | self.qn_mat_exp 
= tf.expand_dims(qn_mat, -1) 29 | 30 | self.word_mat1 = tf.Variable(embeddings, trainable=True, dtype=tf.float32) 31 | q_mat1 = tf.nn.embedding_lookup(self.word_mat1, self.q) 32 | qp_mat1 = tf.nn.embedding_lookup(self.word_mat1, self.qp) 33 | qn_mat1 = tf.nn.embedding_lookup(self.word_mat1, self.qn) 34 | self.q_mat_exp1 = tf.expand_dims(q_mat1, -1) 35 | self.qp_mat_exp1 = tf.expand_dims(qp_mat1, -1) 36 | self.qn_mat_exp1 = tf.expand_dims(qn_mat1, -1) 37 | 38 | q = tf.nn.avg_pool(self.q_mat_exp, ksize=[1, 2, 1, 1], 39 | strides=[1, 1, 1, 1], padding='VALID') 40 | qp = tf.nn.avg_pool(self.qp_mat_exp, ksize=[1, 2, 1, 1], 41 | strides=[1, 1, 1, 1], padding='VALID') 42 | qn = tf.nn.avg_pool(self.qn_mat_exp, ksize=[1, 2, 1, 1], 43 | strides=[1, 1, 1, 1], padding='VALID') 44 | q = tf.reshape(tf.reduce_max(q, 1), [-1, embedding_size]) 45 | qp = tf.reshape(tf.reduce_max(qp, 1), [-1, embedding_size]) 46 | qn = tf.reshape(tf.reduce_max(qn, 1), [-1, embedding_size]) 47 | 48 | q1 = tf.nn.avg_pool(self.q_mat_exp1, ksize=[1, 1, 1, 1], 49 | strides=[1, 1, 1, 1], padding='VALID') 50 | qp1 = tf.nn.avg_pool(self.qp_mat_exp1, ksize=[1, 1, 1, 1], 51 | strides=[1, 1, 1, 1], padding='VALID') 52 | qn1 = tf.nn.avg_pool(self.qn_mat_exp1, ksize=[1, 1, 1, 1], 53 | strides=[1, 1, 1, 1], padding='VALID') 54 | q1 = tf.reshape(tf.reduce_max(q1, 1), [-1, embedding_size]) 55 | qp1 = tf.reshape(tf.reduce_max(qp1, 1), [-1, embedding_size]) 56 | qn1 = tf.reshape(tf.reduce_max(qn1, 1), [-1, embedding_size]) 57 | 58 | q = tf.concat([q, q1], 1) 59 | qp = tf.concat([qp, qp1], 1) 60 | qn = tf.concat([qn, qn1], 1) 61 | 62 | self.cos_q_qp = self.cosine(q, qp) 63 | self.cos_q_qn = self.cosine(q, qn) 64 | 65 | self.losses, loss_batch = self.margin_loss(self.zero, margin, self.cos_q_qp, self.cos_q_qn) 66 | 67 | correct = tf.equal(self.zero, loss_batch) 68 | self.accuracy = tf.reduce_mean(tf.cast(correct, "float")) 69 | 70 | def margin_loss(self, zero, margin, cos_q_qp, cos_q_qn): 71 | loss_batch = tf.maximum(zero, tf.subtract(margin, tf.subtract(cos_q_qp, cos_q_qn))) 72 | losses = tf.reduce_sum(loss_batch) 73 | return losses, loss_batch 74 | 75 | def logloss(self, y, v_one, sim): 76 | a = tf.multiply(y, tf.log(sim)) #y*log(p) 77 | b = tf.subtract(v_one, y)#1-y 78 | c = tf.log(tf.subtract(v_one, sim))#log(1-p) 79 | losses = -tf.add(a, tf.multiply(b, c))#y*log(p)+(1-y)*log(1-p) 80 | losses = tf.reduce_sum(losses, -1) 81 | return losses 82 | 83 | def cosine(self, t1, t2): 84 | len1 = tf.sqrt(tf.reduce_sum(tf.multiply(t1, t1), 1)) 85 | len2 = tf.sqrt(tf.reduce_sum(tf.multiply(t2, t2), 1)) 86 | multiply = tf.reduce_sum(tf.multiply(t1, t2), 1) 87 | cos = tf.div(multiply, tf.multiply(len1, len2)) 88 | return tf.clip_by_value(cos, 1e-5, 0.99999) 89 | 90 | def get_constant(batch_size): 91 | one, zero = [1.0] * batch_size, [0.0] * batch_size 92 | return np.array(one), np.array(zero) 93 | 94 | margin = 0.05 95 | max_len = 200 96 | num_epoch = 200000 97 | batch_size = 256 98 | checkpoint_every = 50000 99 | vocab, embeddings = utils.load_embeddings() 100 | embedding_size = len(embeddings[0]) 101 | train_data, test_data = utils.load_train_data(vocab, max_len), utils.load_test_data(vocab, max_len) 102 | print('load data done ......') 103 | print(embeddings.shape) 104 | 105 | prev_auc = 0.0 106 | with tf.Graph().as_default(): 107 | session_conf = tf.ConfigProto( 108 | allow_soft_placement=True, log_device_placement=False) 109 | sess = tf.Session(config=session_conf) 110 | with sess.as_default(): 111 | swem = SWEM_HIER(margin, max_len, len(vocab), 
embedding_size, embeddings) 112 | global_step = tf.Variable(0, name="global_step", trainable=False) 113 | optimizer = tf.train.AdamOptimizer(1e-1) 114 | #optimizer = tf.train.GradientDescentOptimizer(1e-1) 115 | grads_and_vars = optimizer.compute_gradients(swem.losses) 116 | train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) 117 | 118 | timestamp = str(int(time.time())) 119 | out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp)) 120 | checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints")) 121 | checkpoint_prefix = os.path.join(checkpoint_dir, "model") 122 | if not os.path.exists(checkpoint_dir): 123 | os.makedirs(checkpoint_dir) 124 | saver = tf.train.Saver(tf.all_variables()) 125 | sess.run(tf.initialize_all_variables()) 126 | 127 | def train_step(): 128 | q, qp, qn = utils.gen_train_batch_qpn(train_data, batch_size) 129 | one, zero = get_constant(batch_size) 130 | feed_dict = {swem.q:q, swem.qp:qp, swem.qn:qn, swem.zero:zero} 131 | _, step, loss, cos, acc = sess.run( 132 | [train_op, global_step, swem.losses, swem.cos_q_qp, swem.accuracy], feed_dict) 133 | time_str = datetime.datetime.now().isoformat() 134 | print("{}: step {}, loss {:g}, acc{:g}".format(time_str, step, loss, acc)) 135 | 136 | def test_step(): 137 | yp, y, group = [], [], [] 138 | for i in range(0, len(test_data), batch_size): 139 | f, g, q1, q2 = utils.gen_test_batch_qpn(test_data, i, i+batch_size) 140 | one, zero = get_constant(len(f)) 141 | feed_dict = {swem.q:q1, swem.qp:q2, swem.qn:q2, swem.zero:zero} 142 | loss, cos = sess.run([swem.losses, swem.cos_q_qp], feed_dict) 143 | yp.extend(cos) 144 | y.extend(f) 145 | group.extend(g) 146 | ppp = [(_y, _g, _yp) for _y, _g, _yp in zip(y, group, yp)] 147 | #for _y, _g, _yp in ppp: 148 | # print(str(_y) + ' ' + str(_g) + ' ' + str(_yp)) 149 | return y[:len(test_data)], group[:len(test_data)], yp[:len(test_data)] 150 | 151 | for i in range(num_epoch): 152 | train_step() 153 | current_step = tf.train.global_step(sess, global_step) 154 | if current_step % checkpoint_every == 0: 155 | y, g, yp = test_step() 156 | auc = utils.eval_auc(y, g, yp) 157 | top1_prec = utils._eval_top1_prec(y, g, yp) 158 | #if auc < prev_auc: 159 | # _flist = [(_f, [s]) for s, _f in zip(score[:len(test_data)], flags)] 160 | # features.append(_flist) 161 | # break 162 | #prev_auc = auc 163 | 164 | #utils.save_features(features[0] + features[1] + features[2], './data/gen_sweg_hier_train.f') 165 | #utils.save_features(features[3], './data/gen_sweg_hier_test.f') 166 | -------------------------------------------------------------------------------- /cnn/tensorflow/insqa_train.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python3.4 2 | 3 | import tensorflow as tf 4 | import numpy as np 5 | import os, time, datetime, operator, sys 6 | from insqa_cnn import InsQACNN 7 | sys.path.append('../../') 8 | import config, utils 9 | 10 | print(tf.__version__) 11 | 12 | # Parameters 13 | # ================================================== 14 | 15 | # Model Hyperparameters 16 | tf.flags.DEFINE_float("margin", 0.05, "CNN model margin") 17 | tf.flags.DEFINE_integer("sequence_length", 200, "Max sequence lehgth(default: 200)") 18 | tf.flags.DEFINE_integer("embedding_dim", 100, "Dimensionality of character embedding (default: 128)") 19 | tf.flags.DEFINE_string("filter_sizes", "1,2,3,5", "Comma-separated filter sizes (default: '3,4,5')") 20 | tf.flags.DEFINE_integer("num_filters", 256, "Number of filters per filter size (default: 128)") 21 | tf.flags.DEFINE_float("dropout_keep_prob", 1.0, "Dropout keep probability (default: 0.5)") 22 | tf.flags.DEFINE_float("l2_reg_lambda", 0, "L2 regularizaion lambda (default: 0.0)") 23 | 24 | # Training parameters 25 | tf.flags.DEFINE_integer("batch_size", 256, "Batch Size (default: 64)") 26 | tf.flags.DEFINE_integer("num_epochs", 5000000, "Number of training epochs (default: 200)") 27 | tf.flags.DEFINE_integer("evaluate_every", 3000, "Evaluate model on dev set after this many steps (default: 100)") 28 | tf.flags.DEFINE_integer("checkpoint_every", 3000, "Save model after this many steps (default: 100)") 29 | # Misc Parameters 30 | tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement") 31 | tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") 32 | FLAGS = tf.flags.FLAGS 33 | FLAGS._parse_flags() 34 | print("\nParameters:") 35 | for attr, value in sorted(FLAGS.__flags.items()): 36 | print("{}={}".format(attr.upper(), value)) 37 | print("") 38 | 39 | # Data Preparatopn 40 | # ================================================== 41 | 42 | # Load data 43 | print("Loading data...") 44 | vocab, embeddings = utils.load_embeddings() 45 | train_data = utils.load_train_data(vocab, FLAGS.sequence_length) 46 | test_data = utils.load_test_data(vocab, FLAGS.sequence_length) 47 | print("Load done...") 48 | 49 | # Training 50 | # ================================================== 51 | 52 | prev_auc = 0 53 | with tf.Graph().as_default(): 54 | with tf.device("/gpu:1"): 55 | session_conf = tf.ConfigProto( 56 | allow_soft_placement=FLAGS.allow_soft_placement, 57 | log_device_placement=FLAGS.log_device_placement) 58 | sess = tf.Session(config=session_conf) 59 | with sess.as_default(): 60 | cnn = InsQACNN( 61 | _margin=FLAGS.margin, 62 | sequence_length=FLAGS.sequence_length, 63 | batch_size=FLAGS.batch_size, 64 | vocab_size=len(vocab), 65 | embedding_size=FLAGS.embedding_dim, 66 | filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))), 67 | num_filters=FLAGS.num_filters, 68 | l2_reg_lambda=FLAGS.l2_reg_lambda) 69 | 70 | # Define Training procedure 71 | global_step = tf.Variable(0, name="global_step", trainable=False) 72 | optimizer = tf.train.AdamOptimizer(1e-1) 73 | #optimizer = tf.train.GradientDescentOptimizer(1e-2) 74 | grads_and_vars = optimizer.compute_gradients(cnn.loss) 75 | train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) 76 | 77 | # Keep track of gradient values and sparsity (optional) 78 | grad_summaries = [] 79 | for g, v in grads_and_vars: 80 | if g is not None: 81 | grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g) 82 | sparsity_summary = 
tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g)) 83 | grad_summaries.append(grad_hist_summary) 84 | grad_summaries.append(sparsity_summary) 85 | grad_summaries_merged = tf.summary.merge(grad_summaries) 86 | 87 | # Output directory for models and summaries 88 | timestamp = str(int(time.time())) 89 | out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp)) 90 | print("Writing to {}\n".format(out_dir)) 91 | 92 | # Summaries for loss and accuracy 93 | loss_summary = tf.summary.scalar("loss", cnn.loss) 94 | acc_summary = tf.summary.scalar("accuracy", cnn.accuracy) 95 | 96 | # Train Summaries 97 | train_summary_op = tf.summary.merge([loss_summary, acc_summary, grad_summaries_merged]) 98 | train_summary_dir = os.path.join(out_dir, "summaries", "train") 99 | train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph_def) 100 | 101 | # Dev summaries 102 | dev_summary_op = tf.summary.merge([loss_summary, acc_summary]) 103 | dev_summary_dir = os.path.join(out_dir, "summaries", "dev") 104 | dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph_def) 105 | 106 | # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it 107 | checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints")) 108 | checkpoint_prefix = os.path.join(checkpoint_dir, "model") 109 | if not os.path.exists(checkpoint_dir): 110 | os.makedirs(checkpoint_dir) 111 | saver = tf.train.Saver(tf.all_variables()) 112 | 113 | # Initialize all variables 114 | sess.run(tf.initialize_all_variables()) 115 | 116 | def train_step(q, qp, qn): 117 | feed_dict = { 118 | cnn.q: q, cnn.qp: qp, cnn.qn: qn, 119 | #cnn.input_x_1: q, cnn.input_x_2: qp, cnn.input_x_3: qn, 120 | cnn.dropout_keep_prob: FLAGS.dropout_keep_prob 121 | } 122 | _, step, summaries, loss, accuracy, cos1, cos2 = sess.run( 123 | [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy, cnn.cos_q_qp, cnn.cos_q_qn], 124 | feed_dict) 125 | #print(cos1) 126 | #print(cos2) 127 | time_str = datetime.datetime.now().isoformat() 128 | print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy)) 129 | train_summary_writer.add_summary(summaries, step) 130 | 131 | def test_step(): 132 | yp, y, group, of = [], [], [], open(config.predict1_file, 'w') 133 | for i in range(0, len(test_data), FLAGS.batch_size): 134 | f, g, q1, q2 = utils.gen_test_batch_qpn(test_data, i, i+FLAGS.batch_size) 135 | feed_dict = { 136 | cnn.q: q1, cnn.qp: q2, cnn.qn: q2, 137 | #cnn.input_x_1: q1, cnn.input_x_2: q2, cnn.input_x_3: q2, 138 | cnn.dropout_keep_prob: 1.0 139 | } 140 | cos = sess.run([cnn.cos_q_qp], feed_dict) 141 | yp.extend(cos[0]) 142 | y.extend(f) 143 | group.extend(g) 144 | y, g, yp = y[:len(test_data)], group[:len(test_data)], yp[:len(test_data)] 145 | auc = utils.eval_auc(y[:len(test_data)], g, yp[:len(test_data)]) 146 | top1_prec = utils._eval_top1_prec(y, g, yp) 147 | for p in yp[:len(test_data)]: of.write(str(p) + '\n') 148 | of.write(str(top1_prec) + '\n') 149 | of.close() 150 | return auc 151 | 152 | # Generate batches 153 | # Training loop. For each batch... 
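# The objective above is a pairwise max-margin (hinge) ranking loss over
# (question, positive answer, negative answer) triplets. The helper below is
# an illustrative numpy sketch only and is not called anywhere in this script;
# it assumes cnn.loss / cnn.accuracy follow the usual formulation
# sum(max(0, margin - cos(q, a+) + cos(q, a-))), with accuracy being the
# fraction of triplets already separated by the margin (FLAGS.margin, default 0.05).
def _hinge_loss_sketch(cos_q_qp, cos_q_qn, margin=0.05):
    """Illustrative only: numpy restatement of the assumed triplet ranking loss."""
    diff = np.maximum(0.0, margin - np.asarray(cos_q_qp) + np.asarray(cos_q_qn))
    return np.sum(diff), np.mean(diff == 0.0)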
154 | for i in range(FLAGS.num_epochs): 155 | try: 156 | q, qp, qn = utils.gen_train_batch_qpn(train_data, FLAGS.batch_size) 157 | train_step(q, qp, qn) 158 | current_step = tf.train.global_step(sess, global_step) 159 | if current_step % FLAGS.evaluate_every == 0: 160 | auc = test_step() 161 | #if auc < prev_auc: break 162 | prev_auc = auc 163 | if current_step % FLAGS.checkpoint_every == 0: 164 | path = saver.save(sess, checkpoint_prefix, global_step=current_step) 165 | print("Saved model checkpoint to {}\n".format(path)) 166 | except Exception as e: 167 | print(e) 168 | -------------------------------------------------------------------------------- /cnn/tensorflow/insurance_qa_data_helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | 4 | empty_vector = [] 5 | for i in range(0, 100): 6 | empty_vector.append(float(0.0)) 7 | onevector = [] 8 | for i in range(0, 10): 9 | onevector.append(float(1)) 10 | zerovector = [] 11 | for i in range(0, 10): 12 | zerovector.append(float(0)) 13 | 14 | def build_vocab(): 15 | code = int(0) 16 | vocab = {} 17 | vocab['UNKNOWN'] = code 18 | code += 1 19 | for line in open('/export/jw/cnn/insuranceQA/train'): 20 | items = line.strip().split(' ') 21 | for i in range(2, 4): 22 | words = items[i].split('_') 23 | for word in words: 24 | if not word in vocab: 25 | vocab[word] = code 26 | code += 1 27 | for line in open('/export/jw/cnn/insuranceQA/test1'): 28 | items = line.strip().split(' ') 29 | for i in range(2, 4): 30 | words = items[i].split('_') 31 | for word in words: 32 | if not word in vocab: 33 | vocab[word] = code 34 | code += 1 35 | return vocab 36 | 37 | def rand_qa(qalist): 38 | index = random.randint(0, len(qalist) - 1) 39 | return qalist[index] 40 | 41 | def read_alist(): 42 | alist = [] 43 | for line in open('/export/jw/cnn/insuranceQA/train'): 44 | items = line.strip().split(' ') 45 | alist.append(items[3]) 46 | print('read_alist done ......') 47 | return alist 48 | 49 | def vocab_plus_overlap(vectors, sent, over, size): 50 | global onevector 51 | global zerovector 52 | oldict = {} 53 | words = over.split('_') 54 | if len(words) < size: 55 | size = len(words) 56 | for i in range(0, size): 57 | if words[i] == '': 58 | continue 59 | oldict[words[i]] = '#' 60 | matrix = [] 61 | words = sent.split('_') 62 | if len(words) < size: 63 | size = len(words) 64 | for i in range(0, size): 65 | vec = read_vector(vectors, words[i]) 66 | newvec = vec.copy() 67 | #if words[i] in oldict: 68 | # newvec += onevector 69 | #else: 70 | # newvec += zerovector 71 | matrix.append(newvec) 72 | return matrix 73 | 74 | def load_vectors(): 75 | vectors = {} 76 | for line in open('/export/jw/cnn/insuranceQA/vectors.nobin'): 77 | items = line.strip().split(' ') 78 | if (len(items) < 101): 79 | continue 80 | vec = [] 81 | for i in range(1, 101): 82 | vec.append(float(items[i])) 83 | vectors[items[0]] = vec 84 | return vectors 85 | 86 | def read_vector(vectors, word): 87 | global empty_vector 88 | if word in vectors: 89 | return vectors[word] 90 | else: 91 | return empty_vector 92 | #return vectors[''] 93 | 94 | def load_test_and_vectors(): 95 | testList = [] 96 | for line in open('/export/jw/cnn/insuranceQA/test1'): 97 | testList.append(line.strip()) 98 | vectors = load_vectors() 99 | return testList, vectors 100 | 101 | def load_train_and_vectors(): 102 | trainList = [] 103 | for line in open('/export/jw/cnn/insuranceQA/train'): 104 | trainList.append(line.strip()) 105 | vectors = 
load_vectors() 106 | return trainList, vectors 107 | 108 | def load_data_val_10(testList, vectors, index): 109 | x_train_1 = [] 110 | x_train_2 = [] 111 | x_train_3 = [] 112 | items = testList[index].split(' ') 113 | x_train_1.append(vocab_plus_overlap(vectors, items[2], items[3], 200)) 114 | x_train_2.append(vocab_plus_overlap(vectors, items[3], items[2], 200)) 115 | x_train_3.append(vocab_plus_overlap(vectors, items[3], items[2], 200)) 116 | return np.array(x_train_1), np.array(x_train_2), np.array(x_train_3) 117 | 118 | def read_raw(): 119 | raw = [] 120 | for line in open('/export/jw/cnn/insuranceQA/train'): 121 | items = line.strip().split(' ') 122 | if items[0] == '1': 123 | raw.append(items) 124 | return raw 125 | 126 | def encode_sent(vocab, string, size): 127 | x = [] 128 | words = string.split('_') 129 | for i in range(0, 200): 130 | if words[i] in vocab: 131 | x.append(vocab[words[i]]) 132 | else: 133 | x.append(vocab['UNKNOWN']) 134 | return x 135 | 136 | def load_data_6(vocab, alist, raw, size): 137 | x_train_1 = [] 138 | x_train_2 = [] 139 | x_train_3 = [] 140 | for i in range(0, size): 141 | items = raw[random.randint(0, len(raw) - 1)] 142 | nega = rand_qa(alist) 143 | x_train_1.append(encode_sent(vocab, items[2], 100)) 144 | x_train_2.append(encode_sent(vocab, items[3], 100)) 145 | x_train_3.append(encode_sent(vocab, nega, 100)) 146 | return np.array(x_train_1), np.array(x_train_2), np.array(x_train_3) 147 | 148 | def load_data_val_6(testList, vocab, index, batch): 149 | x_train_1 = [] 150 | x_train_2 = [] 151 | x_train_3 = [] 152 | for i in range(0, batch): 153 | true_index = index + i 154 | if (true_index >= len(testList)): 155 | true_index = len(testList) - 1 156 | items = testList[true_index].split(' ') 157 | x_train_1.append(encode_sent(vocab, items[2], 100)) 158 | x_train_2.append(encode_sent(vocab, items[3], 100)) 159 | x_train_3.append(encode_sent(vocab, items[3], 100)) 160 | return np.array(x_train_1), np.array(x_train_2), np.array(x_train_3) 161 | 162 | def load_data_9(trainList, vectors, size): 163 | x_train_1 = [] 164 | x_train_2 = [] 165 | y_train = [] 166 | for i in range(0, size): 167 | pos = trainList[random.randint(0, len(trainList) - 1)] 168 | posItems = pos.strip().split(' ') 169 | x_train_1.append(vocab_plus_overlap(vectors, posItems[2], posItems[3], 200)) 170 | x_train_2.append(vocab_plus_overlap(vectors, posItems[3], posItems[2], 200)) 171 | y_train.append([1, 0]) 172 | neg = trainList[random.randint(0, len(trainList) - 1)] 173 | negItems = neg.strip().split(' ') 174 | x_train_1.append(vocab_plus_overlap(vectors, posItems[2], negItems[3], 200)) 175 | x_train_2.append(vocab_plus_overlap(vectors, negItems[3], posItems[2], 200)) 176 | y_train.append([0, 1]) 177 | return np.array(x_train_1), np.array(x_train_2), np.array(y_train) 178 | 179 | def load_data_val_9(testList, vectors, index): 180 | x_train_1 = [] 181 | x_train_2 = [] 182 | items = testList[index].split(' ') 183 | x_train_1.append(vocab_plus_overlap(vectors, items[2], items[3], 200)) 184 | x_train_2.append(vocab_plus_overlap(vectors, items[3], items[2], 200)) 185 | return np.array(x_train_1), np.array(x_train_2) 186 | 187 | def load_data_10(vectors, qalist, raw, size): 188 | x_train_1 = [] 189 | x_train_2 = [] 190 | x_train_3 = [] 191 | items = raw[random.randint(0, len(raw) - 1)] 192 | nega = rand_qa(qalist) 193 | x_train_1.append(vocab_plus_overlap(vectors, items[2], items[3], 200)) 194 | x_train_2.append(vocab_plus_overlap(vectors, items[3], items[2], 200)) 195 | 
x_train_3.append(vocab_plus_overlap(vectors, nega, items[2], 200)) 196 | return np.array(x_train_1), np.array(x_train_2), np.array(x_train_3) 197 | 198 | def load_data_11(vectors, qalist, raw, size): 199 | x_train_1 = [] 200 | x_train_2 = [] 201 | x_train_3 = [] 202 | items = raw[random.randint(0, len(raw) - 1)] 203 | nega = rand_qa(qalist) 204 | x_train_1.append(vocab_plus_overlap(vectors, items[2], items[3], 200)) 205 | x_train_2.append(vocab_plus_overlap(vectors, items[3], items[2], 200)) 206 | x_train_3.append(vocab_plus_overlap(vectors, nega, items[2], 200)) 207 | return np.array(x_train_1), np.array(x_train_2), np.array(x_train_3) 208 | 209 | def batch_iter(data, batch_size, num_epochs, shuffle=True): 210 | data = np.array(data) 211 | data_size = len(data) 212 | num_batches_per_epoch = int(len(data)/batch_size) + 1 213 | for epoch in range(num_epochs): 214 | # Shuffle the data at each epoch 215 | if shuffle: 216 | shuffle_indices = np.random.permutation(np.arange(data_size)) 217 | shuffled_data = data[shuffle_indices] 218 | else: 219 | shuffled_data = data 220 | for batch_num in range(num_batches_per_epoch): 221 | start_index = batch_num * batch_size 222 | end_index = min((batch_num + 1) * batch_size, data_size) 223 | yield shuffled_data[start_index:end_index] 224 | 225 | 226 | -------------------------------------------------------------------------------- /cnn/tensorflow/insqa_train.py.old: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python3.4 2 | 3 | import tensorflow as tf 4 | import numpy as np 5 | import os 6 | import time 7 | import datetime 8 | import insurance_qa_data_helpers 9 | from insqa_cnn import InsQACNN1 10 | import operator 11 | 12 | #print tf.__version__ 13 | 14 | # Parameters 15 | # ================================================== 16 | 17 | # Model Hyperparameters 18 | tf.flags.DEFINE_integer("embedding_dim", 100, "Dimensionality of character embedding (default: 128)") 19 | tf.flags.DEFINE_string("filter_sizes", "1,2,3,5", "Comma-separated filter sizes (default: '3,4,5')") 20 | tf.flags.DEFINE_integer("num_filters", 256, "Number of filters per filter size (default: 128)") 21 | tf.flags.DEFINE_float("dropout_keep_prob", 1.0, "Dropout keep probability (default: 0.5)") 22 | tf.flags.DEFINE_float("l2_reg_lambda", 0, "L2 regularizaion lambda (default: 0.0)") 23 | 24 | # Training parameters 25 | tf.flags.DEFINE_integer("batch_size", 100, "Batch Size (default: 64)") 26 | tf.flags.DEFINE_integer("num_epochs", 5000000, "Number of training epochs (default: 200)") 27 | tf.flags.DEFINE_integer("evaluate_every", 5000, "Evaluate model on dev set after this many steps (default: 100)") 28 | tf.flags.DEFINE_integer("checkpoint_every", 5000, "Save model after this many steps (default: 100)") 29 | # Misc Parameters 30 | tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement") 31 | tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") 32 | 33 | FLAGS = tf.flags.FLAGS 34 | FLAGS._parse_flags() 35 | print("\nParameters:") 36 | for attr, value in sorted(FLAGS.__flags.items()): 37 | print("{}={}".format(attr.upper(), value)) 38 | print("") 39 | 40 | # Data Preparatopn 41 | # ================================================== 42 | 43 | # Load data 44 | print("Loading data...") 45 | 46 | vocab = insurance_qa_data_helpers.build_vocab() 47 | alist = insurance_qa_data_helpers.read_alist() 48 | raw = insurance_qa_data_helpers.read_raw() 49 | x_train_1, x_train_2, 
x_train_3 = insurance_qa_data_helpers.load_data_6(vocab, alist, raw, FLAGS.batch_size) 50 | testList, vectors = insurance_qa_data_helpers.load_test_and_vectors() 51 | vectors = '' 52 | print('x_train_1', np.shape(x_train_1)) 53 | print("Load done...") 54 | 55 | val_file = '/export/jw/cnn/insuranceQA/test1' 56 | precision = '/export/jw/cnn/insuranceQA/test1.acc' 57 | #x_val, y_val = data_deepqa.load_data_val() 58 | 59 | # Training 60 | # ================================================== 61 | 62 | with tf.Graph().as_default(): 63 | with tf.device("/gpu:1"): 64 | session_conf = tf.ConfigProto( 65 | allow_soft_placement=FLAGS.allow_soft_placement, 66 | log_device_placement=FLAGS.log_device_placement) 67 | sess = tf.Session(config=session_conf) 68 | with sess.as_default(): 69 | cnn = InsQACNN1( 70 | sequence_length=x_train_1.shape[1], 71 | batch_size=FLAGS.batch_size, 72 | vocab_size=len(vocab), 73 | embedding_size=FLAGS.embedding_dim, 74 | filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))), 75 | num_filters=FLAGS.num_filters, 76 | l2_reg_lambda=FLAGS.l2_reg_lambda) 77 | 78 | # Define Training procedure 79 | global_step = tf.Variable(0, name="global_step", trainable=False) 80 | optimizer = tf.train.AdamOptimizer(1e-1) 81 | #optimizer = tf.train.GradientDescentOptimizer(1e-2) 82 | grads_and_vars = optimizer.compute_gradients(cnn.loss) 83 | train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) 84 | 85 | # Keep track of gradient values and sparsity (optional) 86 | grad_summaries = [] 87 | for g, v in grads_and_vars: 88 | if g is not None: 89 | grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g) 90 | sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g)) 91 | grad_summaries.append(grad_hist_summary) 92 | grad_summaries.append(sparsity_summary) 93 | grad_summaries_merged = tf.summary.merge(grad_summaries) 94 | 95 | # Output directory for models and summaries 96 | timestamp = str(int(time.time())) 97 | out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp)) 98 | print("Writing to {}\n".format(out_dir)) 99 | 100 | # Summaries for loss and accuracy 101 | loss_summary = tf.summary.scalar("loss", cnn.loss) 102 | acc_summary = tf.summary.scalar("accuracy", cnn.accuracy) 103 | 104 | # Train Summaries 105 | train_summary_op = tf.summary.merge([loss_summary, acc_summary, grad_summaries_merged]) 106 | train_summary_dir = os.path.join(out_dir, "summaries", "train") 107 | train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph_def) 108 | 109 | # Dev summaries 110 | dev_summary_op = tf.summary.merge([loss_summary, acc_summary]) 111 | dev_summary_dir = os.path.join(out_dir, "summaries", "dev") 112 | dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph_def) 113 | 114 | # Checkpoint directory. 
Tensorflow assumes this directory already exists so we need to create it 115 | checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints")) 116 | checkpoint_prefix = os.path.join(checkpoint_dir, "model") 117 | if not os.path.exists(checkpoint_dir): 118 | os.makedirs(checkpoint_dir) 119 | saver = tf.train.Saver(tf.all_variables()) 120 | 121 | # Initialize all variables 122 | sess.run(tf.initialize_all_variables()) 123 | 124 | def train_step(x_batch_1, x_batch_2, x_batch_3): 125 | """ 126 | A single training step 127 | """ 128 | feed_dict = { 129 | cnn.input_x_1: x_batch_1, 130 | cnn.input_x_2: x_batch_2, 131 | cnn.input_x_3: x_batch_3, 132 | cnn.dropout_keep_prob: FLAGS.dropout_keep_prob 133 | } 134 | _, step, summaries, loss, accuracy = sess.run( 135 | [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy], 136 | feed_dict) 137 | time_str = datetime.datetime.now().isoformat() 138 | print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy)) 139 | train_summary_writer.add_summary(summaries, step) 140 | 141 | def dev_step(): 142 | scoreList = [] 143 | i = int(0) 144 | while True: 145 | x_test_1, x_test_2, x_test_3 = insurance_qa_data_helpers.load_data_val_6(testList, vocab, i, FLAGS.batch_size) 146 | feed_dict = { 147 | cnn.input_x_1: x_test_1, 148 | cnn.input_x_2: x_test_2, 149 | cnn.input_x_3: x_test_3, 150 | cnn.dropout_keep_prob: 1.0 151 | } 152 | batch_scores = sess.run([cnn.cos_12], feed_dict) 153 | for score in batch_scores[0]: 154 | scoreList.append(score) 155 | i += FLAGS.batch_size 156 | if i >= len(testList): 157 | break 158 | sessdict = {} 159 | index = int(0) 160 | for line in open(val_file): 161 | items = line.strip().split(' ') 162 | qid = items[1].split(':')[1] 163 | if not qid in sessdict: 164 | sessdict[qid] = [] 165 | sessdict[qid].append((scoreList[index], items[0])) 166 | index += 1 167 | if index >= len(testList): 168 | break 169 | lev1 = float(0) 170 | lev0 = float(0) 171 | of = open(precision, 'a') 172 | for k, v in sessdict.items(): 173 | v.sort(key=operator.itemgetter(0), reverse=True) 174 | score, flag = v[0] 175 | if flag == '1': 176 | lev1 += 1 177 | if flag == '0': 178 | lev0 += 1 179 | of.write('lev1:' + str(lev1) + '\n') 180 | of.write('lev0:' + str(lev0) + '\n') 181 | print('lev1 ' + str(lev1)) 182 | print('lev0 ' + str(lev0)) 183 | of.close() 184 | 185 | # Generate batches 186 | # Training loop. For each batch... 
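# dev_step above reports top-1 precision on test1: candidates that share a
# question id (parsed from the second field of each test line) are ranked by
# cos_12, and lev1/lev0 count the questions whose best-scoring candidate is a
# true/false answer. The helper below is an illustrative, stand-alone
# restatement of that metric and is not called by this script.
def _top1_precision_sketch(scored):
    """scored: dict mapping question id -> list of (cosine score, flag) pairs."""
    correct = 0
    for qid, cases in scored.items():
        cases.sort(key=operator.itemgetter(0), reverse=True)  # best score first
        if cases[0][1] == '1':
            correct += 1
    return correct / float(len(scored)) if scored else 0.0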
187 | for i in range(FLAGS.num_epochs):
188 | try:
189 | x_batch_1, x_batch_2, x_batch_3 = insurance_qa_data_helpers.load_data_6(vocab, alist, raw, FLAGS.batch_size)
190 | train_step(x_batch_1, x_batch_2, x_batch_3)
191 | current_step = tf.train.global_step(sess, global_step)
192 | if current_step % FLAGS.evaluate_every == 0:
193 | print("\nEvaluation:")
194 | dev_step()
195 | print("")
196 | if current_step % FLAGS.checkpoint_every == 0:
197 | path = saver.save(sess, checkpoint_prefix, global_step=current_step)
198 | print("Saved model checkpoint to {}\n".format(path))
199 | except Exception as e:
200 | print(e)
201 | 
--------------------------------------------------------------------------------
/rnn_attention/tensorflow/tf_rnn_char.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | ####################################################################################
4 | #test1 top-1 precision: 59%
5 | ####################################################################################
6 | import tensorflow as tf
7 | import numpy as np
8 | from operator import itemgetter
9 | import random, datetime, json, insurance_qa_data_helpers
10 | 
11 | class RNN_Model(object):
12 | def _rnn_net(self, inputs, mask, embedding, keep_prob, batch_size, embed_dim, num_step, fw_cell, bw_cell):
13 | _initial_state = fw_cell.zero_state(batch_size,dtype=tf.float32)
14 | inputs=tf.nn.embedding_lookup(embedding, inputs)
15 | inputs = tf.nn.dropout(inputs, self.keep_prob)
16 | #transpose [batch_size, sequence_length, embedding_size] to [sequence_length, batch_size, embedding_size]
17 | inputs = tf.transpose(inputs, [1, 0, 2])
18 | #unstack [sequence_length, batch_size, embedding_size] into a list of sequence_length tensors of shape [batch_size, embedding_size]
19 | inputs = tf.unstack(inputs)
20 | #inputs = tf.reshape(inputs, [-1, embed_dim])
21 | #inputs = tf.split(inputs, num_step, 0)
22 | #the output is a list of sequence_length tensors of shape [batch_size, embedding_size * 2]
23 | outputs, _, _ = tf.contrib.rnn.static_bidirectional_rnn(fw_cell, bw_cell, inputs, initial_state_fw=_initial_state, initial_state_bw=_initial_state)
24 | outputs = tf.transpose(tf.stack(outputs), [1, 0, 2])
25 | self.outputs = outputs
26 | #pool the rnn output [batch_size, sequence_length, embedding_size]; max pooling gives the best result so far
27 | #mean pooling and taking the last step's vector both work worse
28 | outputs = self._max_pooling(outputs)
29 | print(outputs)
30 | 
31 | #outputs = outputs[-1]
32 | #outputs = outputs * mask[:, :, None]
33 | #mean pooling
34 | #outputs = tf.reduce_sum(outputs, 0) / (tf.reduce_sum(mask, 0)[:,None])
35 | return outputs
36 | 
37 | def _max_pooling(self, lstm):
38 | sequence_length, embedding_size = int(lstm.get_shape()[1]), int(lstm.get_shape()[2])
39 | lstm = tf.expand_dims(lstm, -1)
40 | output = tf.nn.max_pool(lstm, ksize=[1, sequence_length, 1, 1], strides=[1, 1, 1, 1], padding='VALID')
41 | output = tf.reshape(output, [-1, embedding_size])
42 | return output
43 | 
44 | def __init__(self, config, is_training=True):
45 | self.keep_prob=tf.placeholder(tf.float32, name='dropout_keep_prob')
46 | self.batch_size=config.batch_size
47 | self.num_step=config.num_step
48 | 
49 | self.qlist = tf.placeholder(tf.int32, [self.batch_size, self.num_step])
50 | #the mask placeholders are not used in this version
51 | self.mask_q = tf.placeholder(tf.float32, [self.num_step, self.batch_size])
52 | self.plist = tf.placeholder(tf.int32, [self.batch_size, self.num_step])
53 | self.mask_p = tf.placeholder(tf.float32, [self.num_step, self.batch_size])
54 | self.nlist = tf.placeholder(tf.int32, [self.batch_size, self.num_step])
55 | self.mask_n = tf.placeholder(tf.float32, [self.num_step, self.batch_size])
56 | 
57 | hidden_neural_size=config.hidden_neural_size
58 | vocabulary_size=config.vocabulary_size
59 | self.embed_dim=config.embed_dim
60 | hidden_layer_num=config.hidden_layer_num
61 | 
62 | #fw_cell = tf.contrib.rnn.BasicLSTMCell(hidden_neural_size,forget_bias=1.0,state_is_tuple=True)
63 | fw_cell = tf.contrib.rnn.GRUCell(num_units=hidden_neural_size, activation=tf.nn.relu)
64 | fw_cell = tf.contrib.rnn.DropoutWrapper(
65 | fw_cell,output_keep_prob=self.keep_prob
66 | )
67 | #bw_cell = tf.contrib.rnn.BasicLSTMCell(hidden_neural_size,forget_bias=1.0,state_is_tuple=True)
68 | bw_cell = tf.contrib.rnn.GRUCell(num_units=hidden_neural_size, activation=tf.nn.relu)
69 | bw_cell = tf.contrib.rnn.DropoutWrapper(
70 | bw_cell,output_keep_prob=self.keep_prob
71 | )
72 | 
73 | #embedding layer
74 | with tf.device("/cpu:1"),tf.name_scope("embedding_layer"):
75 | self.embedding = tf.Variable(tf.truncated_normal([vocabulary_size, self.embed_dim], stddev=0.1), name='W')
76 | #self.a_embedding = tf.Variable(tf.truncated_normal([vocabulary_size, self.embed_dim], stddev=0.1), name='W')
77 | 
78 | q = self._rnn_net(self.qlist, self.mask_q, self.embedding, self.keep_prob, self.batch_size, self.embed_dim, self.num_step, fw_cell, bw_cell)
79 | tf.get_variable_scope().reuse_variables()
80 | p = self._rnn_net(self.plist, self.mask_p, self.embedding, self.keep_prob, self.batch_size, self.embed_dim, self.num_step, fw_cell, bw_cell)
81 | tf.get_variable_scope().reuse_variables()
82 | n = self._rnn_net(self.nlist, self.mask_n, self.embedding, self.keep_prob, self.batch_size, self.embed_dim, self.num_step, fw_cell, bw_cell)
83 | #len_1 = tf.clip_by_value(tf.sqrt(tf.reduce_sum(tf.multiply(q, q), 1)), 0.01, 100000)
84 | #len_2 = tf.clip_by_value(tf.sqrt(tf.reduce_sum(tf.multiply(p, p), 1)), 0.01, 100000)
85 | #len_3 = tf.clip_by_value(tf.sqrt(tf.reduce_sum(tf.multiply(n, n), 1)), 0.01, 100000)
86 | len_1 = tf.sqrt(tf.reduce_sum(tf.multiply(q, q), 1))
87 | len_2 = tf.sqrt(tf.reduce_sum(tf.multiply(p, p), 1))
88 | len_3 = tf.sqrt(tf.reduce_sum(tf.multiply(n, n), 1))
89 | 
90 | self.cos12 = tf.reduce_sum(tf.multiply(q, p), axis=1) / (len_1 * len_2)
91 | self.cos13 = tf.reduce_sum(tf.multiply(q, n), axis=1) / (len_1 * len_3)
92 | self.q = q
93 | self.p = p
94 | 
95 | zero = tf.constant(np.zeros(self.batch_size, dtype='float32'))
96 | margin = tf.constant(np.full(self.batch_size, 0.1, dtype='float32'))
97 | diff = tf.cast(tf.maximum(zero, margin - self.cos12 + self.cos13), dtype='float32')
98 | self.cost = tf.reduce_sum(diff)
99 | self.accuracy = tf.reduce_sum(tf.cast(tf.equal(zero, diff), dtype='float32')) / float(self.batch_size)
100 | 
101 | def train_step(model, qlist, plist, nlist, mask_q, mask_p, mask_n):
102 | fetches = [model.cost, model.accuracy, global_step, train_op, model.cos12, model.q, model.p, model.outputs]
103 | feed_dict = {
104 | model.qlist: qlist,
105 | model.plist: plist,
106 | model.nlist: nlist,
107 | model.mask_q : mask_q,
108 | model.mask_p : mask_p,
109 | model.mask_n : mask_n,
110 | model.keep_prob: config.keep_prob
111 | }
112 | cost, accuracy, step, _, cos12, q, p, outputs = sess.run(fetches, feed_dict)
113 | time_str = datetime.datetime.now().isoformat()
114 | print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, cost, accuracy))
115 | 
116 | 
117 | def dev_step(model, vocab, batch_size, max_len):
118 | score_list, i = [], int(0)
119 | while True:
120 | qlist, plist, nlist, mask_q, mask_p, mask_n = 
insurance_qa_data_helpers.load_val_data(test_list, vocab, i, FLAGS.batch_size, max_len) 121 | feed_dict = { 122 | model.qlist: qlist, 123 | model.plist: plist, 124 | model.nlist: nlist, 125 | model.mask_q : mask_q, 126 | model.mask_p : mask_p, 127 | model.mask_n : mask_n, 128 | model.keep_prob: float(1.0) 129 | } 130 | batch_scores = sess.run([model.cos12], feed_dict) 131 | for score in batch_scores[0]: 132 | score_list.append(score) 133 | i += FLAGS.batch_size 134 | if i >= len(test_list): 135 | break 136 | insurance_qa_data_helpers.evaluation(score_list, test_list) 137 | 138 | tf.flags.DEFINE_integer('evaluate_every',10000,'evaluate every') 139 | tf.flags.DEFINE_integer('batch_size',64,'the batch_size of the training procedure') 140 | tf.flags.DEFINE_integer('emdedding_dim',100,'embedding dim') 141 | tf.flags.DEFINE_integer('hidden_neural_size',200,'LSTM hidden neural size') 142 | tf.flags.DEFINE_integer('hidden_layer_num',1,'LSTM hidden layer num') 143 | tf.flags.DEFINE_integer('max_len',100,'max_len of training sentence') 144 | tf.flags.DEFINE_float('init_scale',0.1,'init scale') 145 | tf.flags.DEFINE_float('keep_prob',0.5,'dropout rate') 146 | tf.flags.DEFINE_integer('num_epoch',1000000,'num epoch') 147 | tf.flags.DEFINE_integer('max_grad_norm',5,'max_grad_norm') 148 | # Misc Parameters 149 | tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement") 150 | tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") 151 | FLAGS = tf.flags.FLAGS 152 | FLAGS._parse_flags() 153 | 154 | vocab = insurance_qa_data_helpers.build_vocab() 155 | train_list = insurance_qa_data_helpers.load_train_list() 156 | qlist, plist, nlist, mask_q, mask_p, mask_n = insurance_qa_data_helpers.load_train_data(train_list, vocab, FLAGS.batch_size, FLAGS.max_len) 157 | test_list = insurance_qa_data_helpers.load_test_list() 158 | 159 | class Config(object): 160 | hidden_neural_size=FLAGS.hidden_neural_size 161 | vocabulary_size=len(vocab) 162 | embed_dim=FLAGS.emdedding_dim 163 | hidden_layer_num=FLAGS.hidden_layer_num 164 | keep_prob=FLAGS.keep_prob 165 | batch_size = FLAGS.batch_size 166 | num_step = FLAGS.max_len 167 | max_grad_norm=FLAGS.max_grad_norm 168 | num_epoch = FLAGS.num_epoch 169 | 170 | config = Config() 171 | eval_config=Config() 172 | eval_config.keep_prob=1.0 173 | 174 | with tf.Graph().as_default(): 175 | with tf.device('/gpu:1'): 176 | session_conf = tf.ConfigProto( 177 | allow_soft_placement=FLAGS.allow_soft_placement, 178 | log_device_placement=FLAGS.log_device_placement) 179 | sess = tf.Session(config=session_conf) 180 | with sess.as_default(): 181 | initializer = tf.random_uniform_initializer(-1*FLAGS.init_scale,1*FLAGS.init_scale) 182 | with tf.variable_scope("model",reuse=None,initializer=initializer): 183 | model = RNN_Model(config=config, is_training=True) 184 | 185 | # Define Training procedure 186 | global_step = tf.Variable(0, name="global_step", trainable=False) 187 | #optimizer = tf.train.RMSPropOptimizer(0.01) 188 | #optimizer = tf.train.AdamOptimizer(0.1) 189 | optimizer = tf.train.GradientDescentOptimizer(0.2) 190 | grads_and_vars = optimizer.compute_gradients(model.cost) 191 | train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) 192 | 193 | # Initialize all variables 194 | sess.run(tf.global_variables_initializer()) 195 | for i in range(config.num_epoch): 196 | qlist, plist, nlist, mask_q, mask_p, mask_n = insurance_qa_data_helpers.load_train_data(train_list, vocab, FLAGS.batch_size, 
FLAGS.max_len) 197 | train_step(model, qlist, plist, nlist, mask_q, mask_p, mask_n) 198 | current_step = tf.train.global_step(sess, global_step) 199 | if current_step % FLAGS.evaluate_every == 0: 200 | dev_step(model, vocab, FLAGS.batch_size, FLAGS.max_len) 201 | -------------------------------------------------------------------------------- /cnn/theano/insqa_cnn.py: -------------------------------------------------------------------------------- 1 | 2 | ########################################################### 3 | # test1 top-1 precision: 62% 4 | ########################################################### 5 | 6 | import os, sys, timeit, random, operator 7 | 8 | import numpy as np 9 | 10 | import theano 11 | import theano.tensor as T 12 | from theano.tensor.signal import pool 13 | from theano.tensor.nnet import conv2d 14 | 15 | #TODO change path to your dataset 16 | trainfile = '/export/jw/cnn/insuranceQA/train' 17 | test1file = '/export/jw/cnn/insuranceQA/test1' 18 | vectorsfile = '/export/jw/cnn/insuranceQA/vectors.nobin' 19 | 20 | ########################################################### 21 | # read qa data 22 | ########################################################### 23 | def build_vocab(): 24 | global trainfile 25 | code, vocab = int(0), {} 26 | vocab['UNKNOWN'] = code 27 | code += 1 28 | for line in open(trainfile): 29 | items = line.strip().split(' ') 30 | for i in range(2, 3): 31 | for word in items[i].split('_'): 32 | if len(word) <= 0: 33 | continue 34 | if not word in vocab: 35 | vocab[word] = code 36 | code += 1 37 | return vocab 38 | 39 | def load_vectors(): 40 | global vectorsfile 41 | vectors = {} 42 | for line in open(vectorsfile): 43 | items = line.strip().split(' ') 44 | if len(items[0]) <= 0: 45 | continue 46 | vec = [] 47 | for i in range(1, 101): 48 | vec.append(float(items[i])) 49 | vectors[items[0]] = vec 50 | return vectors 51 | 52 | def load_word_embeddings(vocab, dim): 53 | vectors = load_vectors() 54 | embeddings = [] #brute initialization 55 | for i in range(0, len(vocab)): 56 | vec = [] 57 | for j in range(0, dim): 58 | vec.append(0.01) 59 | embeddings.append(vec) 60 | for word, code in vocab.items(): 61 | if word in vectors: 62 | embeddings[code] = vectors[word] 63 | return np.array(embeddings, dtype='float32') 64 | 65 | #be attention initialization of UNKNNOW 66 | def encode_sent(vocab, string, size): 67 | x = [] 68 | words = string.split('_') 69 | for i in range(0, size): 70 | if words[i] in vocab: 71 | x.append(vocab[words[i]]) 72 | else: 73 | x.append(vocab['UNKNOWN']) 74 | return x 75 | 76 | def load_train_list(): 77 | global trainfile 78 | trainList = [] 79 | for line in open(trainfile): 80 | trainList.append(line.strip().split(' ')) 81 | return trainList 82 | 83 | def load_test_list(): 84 | global test1file 85 | testList = [] 86 | for line in open(test1file): 87 | testList.append(line.strip().split(' ')) 88 | return testList 89 | 90 | def load_data(trainList, vocab, batch_size): 91 | train_1, train_2, train_3 = [], [], [] 92 | for i in range(0, batch_size): 93 | pos = trainList[random.randint(0, len(trainList)-1)] 94 | neg = trainList[random.randint(0, len(trainList)-1)] 95 | train_1.append(encode_sent(vocab, pos[2], 100)) 96 | train_2.append(encode_sent(vocab, pos[3], 100)) 97 | train_3.append(encode_sent(vocab, neg[3], 100)) 98 | return np.array(train_1, dtype='float32'), np.array(train_2, dtype='float32'), np.array(train_3, dtype='float32') 99 | 100 | def load_data_val(testList, vocab, index, batch_size): 101 | x1, x2, x3 = [], [], [] 102 | 
for i in range(0, batch_size): 103 | true_index = index + i 104 | if true_index >= len(testList): 105 | true_index = len(testList) - 1 106 | items = testList[true_index] 107 | x1.append(encode_sent(vocab, items[2], 100)) 108 | x2.append(encode_sent(vocab, items[3], 100)) 109 | x3.append(encode_sent(vocab, items[3], 100)) 110 | return np.array(x1, dtype='float32'), np.array(x2, dtype='float32'), np.array(x3, dtype='float32') 111 | 112 | def validation(validate_model, testList, vocab, batch_size): 113 | index, score_list = int(0), [] 114 | while True: 115 | x1, x2, x3 = load_data_val(testList, vocab, index, batch_size) 116 | batch_scores, nouse = validate_model(x1, x2, x3, 1.0) 117 | for score in batch_scores: 118 | score_list.append(score) 119 | index += batch_size 120 | if index >= len(testList): 121 | break 122 | print 'Evaluation ' + str(index) 123 | sdict, index = {}, int(0) 124 | for items in testList: 125 | qid = items[1].split(':')[1] 126 | if not qid in sdict: 127 | sdict[qid] = [] 128 | sdict[qid].append((score_list[index], items[0])) 129 | index += 1 130 | lev0, lev1 = float(0), float(0) 131 | for qid, cases in sdict.items(): 132 | cases.sort(key=operator.itemgetter(0), reverse=True) 133 | score, flag = cases[0] 134 | if flag == '1': 135 | lev1 += 1 136 | if flag == '0': 137 | lev0 += 1 138 | print 'top-1 precition: ' + str(lev1 / (lev0 + lev1)) 139 | 140 | class QACnn(object): 141 | def __init__(self, input1, input2, input3, word_embeddings, batch_size, sequence_len, embedding_size, filter_sizes, num_filters, keep_prob): 142 | rng = np.random.RandomState(23455) 143 | self.params = [] 144 | 145 | lookup_table = theano.shared(word_embeddings) 146 | self.params += [lookup_table] 147 | #input1-问题, input2-正向答案, input3-负向答案 148 | #将每个字替换成字向量 149 | input_matrix1 = lookup_table[T.cast(input1.flatten(), dtype="int32")] 150 | input_matrix2 = lookup_table[T.cast(input2.flatten(), dtype="int32")] 151 | input_matrix3 = lookup_table[T.cast(input3.flatten(), dtype="int32")] 152 | 153 | #CNN的输入是4维矩阵,这里只是增加了一个维度而已 154 | input_x1 = input_matrix1.reshape((batch_size, 1, sequence_len, embedding_size)) 155 | input_x2 = input_matrix2.reshape((batch_size, 1, sequence_len, embedding_size)) 156 | input_x3 = input_matrix3.reshape((batch_size, 1, sequence_len, embedding_size)) 157 | #print(input_x1.shape.eval()) 158 | self.dbg_x1 = input_x1 159 | 160 | outputs_1, outputs_2, outputs_3 = [], [], [] 161 | #设置多种大小的filter 162 | for filter_size in filter_sizes: 163 | #每种大小的filter的数量是num_filters 164 | filter_shape = (num_filters, 1, filter_size, embedding_size) 165 | image_shape = (batch_size, 1, sequence_len, embedding_size) 166 | fan_in = np.prod(filter_shape[1:]) 167 | fan_out = filter_shape[0] * np.prod(filter_shape[2:]) 168 | W_bound = np.sqrt(6. 
/ (fan_in + fan_out)) 169 | W = theano.shared( 170 | np.asarray( 171 | rng.uniform(low=-W_bound, high=W_bound, size=filter_shape), 172 | dtype=theano.config.floatX 173 | ), 174 | borrow=True 175 | ) 176 | b_values = np.zeros((filter_shape[0],), dtype=theano.config.floatX) 177 | b = theano.shared(value=b_values, borrow=True) 178 | 179 | #卷积+max_pooling 180 | conv_out = conv2d(input=input_x1, filters=W, filter_shape=filter_shape, input_shape=image_shape) 181 | #卷积后的向量的长度为ds 182 | pooled_out = pool.pool_2d(input=conv_out, ds=(sequence_len - filter_size + 1, 1), ignore_border=True, mode='max') 183 | pooled_active = T.tanh(pooled_out + b.dimshuffle('x', 0, 'x', 'x')) 184 | outputs_1.append(pooled_active) 185 | 186 | conv_out = conv2d(input=input_x2, filters=W, filter_shape=filter_shape, input_shape=image_shape) 187 | pooled_out = pool.pool_2d(input=conv_out, ds=(sequence_len - filter_size + 1, 1), ignore_border=True, mode='max') 188 | pooled_active = T.tanh(pooled_out + b.dimshuffle('x', 0, 'x', 'x')) 189 | outputs_2.append(pooled_active) 190 | 191 | conv_out = conv2d(input=input_x3, filters=W, filter_shape=filter_shape, input_shape=image_shape) 192 | pooled_out = pool.pool_2d(input=conv_out, ds=(sequence_len - filter_size + 1, 1), ignore_border=True, mode='max') 193 | pooled_active = T.tanh(pooled_out + b.dimshuffle('x', 0, 'x', 'x')) 194 | outputs_3.append(pooled_active) 195 | 196 | self.params += [W, b] 197 | self.dbg_conv_out = conv_out.shape 198 | 199 | num_filters_total = num_filters * len(filter_sizes) 200 | self.dbg_outputs_1 = outputs_1[0].shape 201 | #每一个句子的语义表示向量的长度为num_filters_total 202 | output_flat1 = T.reshape(T.concatenate(outputs_1, axis=1), [batch_size, num_filters_total]) 203 | output_flat2 = T.reshape(T.concatenate(outputs_2, axis=1), [batch_size, num_filters_total]) 204 | output_flat3 = T.reshape(T.concatenate(outputs_3, axis=1), [batch_size, num_filters_total]) 205 | #dropout, keep_prob为1表示不进行dropout 206 | output_drop1 = self._dropout(rng, output_flat1, keep_prob) 207 | output_drop2 = self._dropout(rng, output_flat2, keep_prob) 208 | output_drop3 = self._dropout(rng, output_flat3, keep_prob) 209 | 210 | #计算问题和答案之前的向量夹角 211 | #计算向量的长度 212 | len1 = T.sqrt(T.sum(output_drop1 * output_drop1, axis=1)) 213 | len2 = T.sqrt(T.sum(output_drop2 * output_drop2, axis=1)) 214 | len3 = T.sqrt(T.sum(output_drop3 * output_drop3, axis=1)) 215 | #计算向量之间的夹角 216 | cos12 = T.sum(output_drop1 * output_drop2, axis=1) / (len1 * len2) 217 | self.cos12 = cos12 218 | cos13 = T.sum(output_drop1 * output_drop3, axis=1) / (len1 * len3) 219 | self.cos13 = cos13 220 | 221 | zero = theano.shared(np.zeros(batch_size, dtype=theano.config.floatX), borrow=True) 222 | margin = theano.shared(np.full(batch_size, 0.05, dtype=theano.config.floatX), borrow=True) 223 | #Loss损失函数 224 | diff = T.cast(T.maximum(zero, margin - cos12 + cos13), dtype=theano.config.floatX) 225 | self.cost = T.sum(diff, acc_dtype=theano.config.floatX) 226 | #mini-batch数据的准确率(如果正向答案和问题之间的cosine大于负向答案和问题的cosine,则认为正确, 227 | #否则是错误的) 228 | #Loss和Accuracy是用来评估训练中模型时候收敛的两个很重要的指标 229 | self.accuracy = T.sum(T.cast(T.eq(zero, diff), dtype='int32')) / float(batch_size) 230 | 231 | def _dropout(self, rng, layer, keep_prob): 232 | srng = T.shared_randomstreams.RandomStreams(rng.randint(123456)) 233 | mask = srng.binomial(n=1, p=keep_prob, size=layer.shape) 234 | output = layer * T.cast(mask, theano.config.floatX) 235 | output = output / keep_prob 236 | return output 237 | 238 | def train(): 239 | batch_size = int(256) 240 | filter_sizes = [2,3,5] 241 
| num_filters = 500 242 | embedding_size = 100 243 | learning_rate = 0.001 244 | n_epochs = 2000000 245 | validation_freq = 1000 246 | keep_prob_value = 0.25 247 | 248 | vocab = build_vocab() 249 | word_embeddings = load_word_embeddings(vocab, embedding_size) 250 | trainList = load_train_list() 251 | testList = load_test_list() 252 | train_x1, train_x2, train_x3 = load_data(trainList, vocab, batch_size) 253 | 254 | x1, x2, x3 = T.matrix('x1'), T.matrix('x2'), T.matrix('x3') 255 | keep_prob = T.fscalar('keep_prob') 256 | model = QACnn( 257 | input1=x1, input2=x2, input3=x3, keep_prob=keep_prob, 258 | word_embeddings=word_embeddings, 259 | batch_size=batch_size, 260 | sequence_len=train_x1.shape[1], 261 | embedding_size=embedding_size, 262 | filter_sizes=filter_sizes, 263 | num_filters=num_filters) 264 | dbg_x1 = model.dbg_x1 265 | dbg_outputs_1 = model.dbg_outputs_1 266 | 267 | cost, cos12, cos13 = model.cost, model.cos12, model.cos13 268 | print 'cost' 269 | print cost 270 | params, accuracy = model.params, model.accuracy 271 | grads = T.grad(cost, params) 272 | 273 | updates = [ 274 | (param_i, param_i - learning_rate * grad_i) 275 | for param_i, grad_i in zip(params, grads) 276 | ] 277 | 278 | p1, p2, p3 = T.matrix('p1'), T.matrix('p2'), T.matrix('p3') 279 | prob = T.fscalar('prob') 280 | train_model = theano.function( 281 | [p1, p2, p3, prob], 282 | [cost, accuracy, dbg_x1, dbg_outputs_1], 283 | updates=updates, 284 | givens={ 285 | x1: p1, x2: p2, x3: p3, keep_prob: prob 286 | } 287 | ) 288 | 289 | v1, v2, v3 = T.matrix('v1'), T.matrix('v2'), T.matrix('v3') 290 | validate_model = theano.function( 291 | inputs=[v1, v2, v3, prob], 292 | outputs=[cos12, cos13], 293 | #updates=updates, 294 | givens={ 295 | x1: v1, x2: v2, x3: v3, keep_prob: prob 296 | } 297 | ) 298 | 299 | epoch = 0 300 | done_looping = False 301 | while (epoch < n_epochs) and (not done_looping): 302 | epoch = epoch + 1 303 | train_x1, train_x2, train_x3 = load_data(trainList, vocab, batch_size) 304 | #print train_x3.shape 305 | cost_ij, acc, dbg_x1, dbg_outputs_1 = train_model(train_x1, train_x2, train_x3, keep_prob_value) 306 | print 'load data done ...... epoch:' + str(epoch) + ' cost:' + str(cost_ij) + ', acc:' + str(acc) 307 | if epoch % validation_freq == 0: 308 | print 'Evaluation ......' 
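# validation() ranks each test question's candidate answers by cos12 and prints the resulting top-1 precision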
309 | validation(validate_model, testList, vocab, batch_size) 310 | #print dbg_outputs_1 311 | 312 | if __name__ == '__main__': 313 | train() 314 | -------------------------------------------------------------------------------- /lstm_cnn/theano/insqa_lstm.py: -------------------------------------------------------------------------------- 1 | 2 | ############################################################ 3 | # if batch_size is 1, there must be a dtype error when doing 4 | # T.grad, this is something about scan func 5 | # see https://github.com/Theano/Theano/issues/1772 6 | # 7 | # LSTM + cnn 8 | # test1 top-1 precision: 68.3% 9 | ############################################################ 10 | 11 | from collections import OrderedDict 12 | import sys, time, random, operator 13 | 14 | import numpy as np 15 | import theano 16 | from theano import config 17 | import theano.tensor as T 18 | from theano.tensor.signal import pool 19 | from theano.tensor.nnet import conv2d 20 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 21 | 22 | #TODO change filepath to your local environment 23 | #include train test1 vectors.nobin 24 | 25 | def build_vocab(): 26 | code, vocab = int(0), {} 27 | vocab['UNKNOWN'] = code 28 | code += 1 29 | for line in open('/export/jw/cnn/insuranceQA/train'): 30 | items = line.strip().split(' ') 31 | for i in range(2, 3): 32 | for word in items[i].split('_'): 33 | if len(word) <= 0: 34 | continue 35 | if not word in vocab: 36 | vocab[word] = code 37 | code += 1 38 | return vocab 39 | 40 | def load_vectors(): 41 | vectors = {} 42 | for line in open('/export/jw/cnn/insuranceQA/vectors.nobin'): 43 | items = line.strip().split(' ') 44 | if len(items[0]) <= 0: 45 | continue 46 | vec = [] 47 | for i in range(1, 101): 48 | vec.append(float(items[i])) 49 | vectors[items[0]] = vec 50 | return vectors 51 | 52 | def load_word_embeddings(vocab, dim): 53 | vectors = load_vectors() 54 | embeddings = [] #brute initialization 55 | for i in range(0, len(vocab)): 56 | vec = [] 57 | for j in range(0, dim): 58 | vec.append(0.01) 59 | embeddings.append(vec) 60 | for word, code in vocab.items(): 61 | if word in vectors: 62 | embeddings[code] = vectors[word] 63 | return np.array(embeddings, dtype='float32') 64 | 65 | #be attention initialization of UNKNNOW 66 | def encode_sent(vocab, string, size): 67 | x, m = [], [] 68 | words = string.split('_') 69 | for i in range(0, size): 70 | if words[i] in vocab: 71 | x.append(vocab[words[i]]) 72 | else: 73 | x.append(vocab['UNKNOWN']) 74 | if words[i] == '': #TODO 75 | m.append(1) #fixed sequence length, else use 0 76 | else: 77 | m.append(1) 78 | return x, m 79 | 80 | def load_train_list(): 81 | trainList = [] 82 | for line in open('/export/jw/cnn/insuranceQA/train'): 83 | items = line.strip().split(' ') 84 | if items[0] == '1': 85 | trainList.append(line.strip().split(' ')) 86 | return trainList 87 | 88 | def load_test_list(): 89 | testList = [] 90 | for line in open('/export/jw/cnn/insuranceQA/test1'): 91 | testList.append(line.strip().split(' ')) 92 | return testList 93 | 94 | def load_data(trainList, vocab, batch_size): 95 | train_1, train_2, train_3 = [], [], [] 96 | mask_1, mask_2, mask_3 = [], [], [] 97 | counter = 0 98 | while True: 99 | pos = trainList[random.randint(0, len(trainList)-1)] 100 | neg = trainList[random.randint(0, len(trainList)-1)] 101 | if pos[2].startswith('') or pos[3].startswith('') or neg[3].startswith(''): 102 | #print 'empty string ......' 
103 | continue 104 | x, m = encode_sent(vocab, pos[2], 100) 105 | train_1.append(x) 106 | mask_1.append(m) 107 | x, m = encode_sent(vocab, pos[3], 100) 108 | train_2.append(x) 109 | mask_2.append(m) 110 | x, m = encode_sent(vocab, neg[3], 100) 111 | train_3.append(x) 112 | mask_3.append(m) 113 | counter += 1 114 | if counter >= batch_size: 115 | break 116 | return np.transpose(np.array(train_1, dtype=config.floatX)), np.transpose(np.array(train_2, dtype=config.floatX)), np.transpose(np.array(train_3, dtype=config.floatX)), np.transpose(np.array(mask_1, dtype=config.floatX)) , np.transpose(np.array(mask_2, dtype=config.floatX)), np.transpose(np.array(mask_3, dtype=config.floatX)) 117 | 118 | def load_data_val(testList, vocab, index, batch_size): 119 | x1, x2, x3, m1, m2, m3 = [], [], [], [], [], [] 120 | for i in range(0, batch_size): 121 | true_index = index + i 122 | if true_index >= len(testList): 123 | true_index = len(testList) - 1 124 | items = testList[true_index] 125 | x, m = encode_sent(vocab, items[2], 100) 126 | x1.append(x) 127 | m1.append(m) 128 | x, m = encode_sent(vocab, items[3], 100) 129 | x2.append(x) 130 | m2.append(m) 131 | x, m = encode_sent(vocab, items[3], 100) 132 | x3.append(x) 133 | m3.append(m) 134 | return np.transpose(np.array(x1, dtype=config.floatX)), np.transpose(np.array(x2, dtype=config.floatX)), np.transpose(np.array(x3, dtype=config.floatX)), np.transpose(np.array(m1, dtype=config.floatX)) , np.transpose(np.array(m2, dtype=config.floatX)), np.transpose(np.array(m3, dtype=config.floatX)) 135 | 136 | def validation(validate_model, testList, vocab, batch_size): 137 | index, score_list = int(0), [] 138 | while True: 139 | x1, x2, x3, m1, m2, m3 = load_data_val(testList, vocab, index, batch_size) 140 | batch_scores, nouse = validate_model(x1, x2, x3, m1, m2, m3) 141 | for score in batch_scores: 142 | score_list.append(score) 143 | index += batch_size 144 | if index >= len(testList): 145 | break 146 | print 'Evaluation ' + str(index) 147 | sdict, index = {}, int(0) 148 | for items in testList: 149 | qid = items[1].split(':')[1] 150 | if not qid in sdict: 151 | sdict[qid] = [] 152 | sdict[qid].append((score_list[index], items[0])) 153 | index += 1 154 | lev0, lev1 = float(0), float(0) 155 | of = open('/export/jw/cnn/insuranceQA/acc.lstm', 'a') 156 | for qid, cases in sdict.items(): 157 | cases.sort(key=operator.itemgetter(0), reverse=True) 158 | score, flag = cases[0] 159 | if flag == '1': 160 | lev1 += 1 161 | if flag == '0': 162 | lev0 += 1 163 | for s in score_list: 164 | of.write(str(s) + '\n') 165 | of.write('lev1:' + str(lev1) + '\n') 166 | of.write('lev0:' + str(lev0) + '\n') 167 | print 'lev1:' + str(lev1) 168 | print 'lev0:' + str(lev0) 169 | of.close() 170 | 171 | def ortho_weight(ndim): 172 | W = np.random.randn(ndim, ndim) 173 | u, s, v = np.linalg.svd(W) 174 | return u.astype(config.floatX) 175 | 176 | def numpy_floatX(data): 177 | return np.asarray(data, dtype=config.floatX) 178 | 179 | def param_init_cnn(filter_sizes, num_filters, proj_size, tparams, grad_params): 180 | rng = np.random.RandomState(23455) 181 | for filter_size in filter_sizes: 182 | filter_shape = (num_filters, 1, filter_size, proj_size) 183 | fan_in = np.prod(filter_shape[1:]) 184 | fan_out = filter_shape[0] * np.prod(filter_shape[2:]) 185 | W_bound = np.sqrt(6. 
/ (fan_in + fan_out)) 186 | W = theano.shared( 187 | np.asarray( 188 | rng.uniform(low=-W_bound, high=W_bound, size=filter_shape), 189 | dtype=theano.config.floatX 190 | ), 191 | borrow=True 192 | ) 193 | tparams['cnn_W_' + str(filter_size)] = W 194 | b_values = np.zeros((filter_shape[0],), dtype=theano.config.floatX) 195 | b = theano.shared(value=b_values, borrow=True) 196 | tparams['cnn_b_' + str(filter_size)] = b 197 | grad_params += [W, b] 198 | return tparams, grad_params 199 | 200 | def param_init_lstm(proj_size, tparams, grad_params): 201 | W = np.concatenate([ortho_weight(proj_size), 202 | ortho_weight(proj_size), 203 | ortho_weight(proj_size), 204 | ortho_weight(proj_size)], axis=1) 205 | W_t = theano.shared(W, borrow=True) 206 | tparams[_p('lstm', 'W')] = W_t 207 | U = np.concatenate([ortho_weight(proj_size), 208 | ortho_weight(proj_size), 209 | ortho_weight(proj_size), 210 | ortho_weight(proj_size)], axis=1) 211 | U_t = theano.shared(U, borrow=True) 212 | tparams[_p('lstm', 'U')] = U_t 213 | b = np.zeros((4 * proj_size,)) 214 | b_t = theano.shared(b.astype(config.floatX), borrow=True) 215 | tparams[_p('lstm', 'b')] = b_t 216 | grad_params += [W_t, U_t, b_t] 217 | 218 | return tparams, grad_params 219 | 220 | def dropout_layer(state_before, use_noise, trng): 221 | proj = T.switch(use_noise, 222 | (state_before * 223 | trng.binomial(state_before.shape, 224 | p=0.5, n=1, 225 | dtype=state_before.dtype)), 226 | state_before * 0.5) 227 | return proj 228 | 229 | class LSTM(object): 230 | def __init__(self, input1, input2, input3, mask1, mask2, mask3, word_embeddings, batch_size, sequence_len, embedding_size, filter_sizes, num_filters): 231 | #proj_size means embedding_size 232 | #'lstm_W' = [embedding_size, embedding_size] 233 | #'lstm_U' = [embedding_size, embedding_size] 234 | #'lstm_b' = [embedding_size] 235 | proj_size = 100 #TODO, what does proj mean 236 | self.params, tparams = [], {} 237 | tparams, self.params = param_init_lstm(proj_size, tparams, self.params) 238 | tparams, self.params = param_init_cnn(filter_sizes, num_filters, proj_size, tparams, self.params) 239 | lookup_table = theano.shared(word_embeddings, borrow=True) 240 | tparams['lookup_table'] = lookup_table 241 | self.params += [lookup_table] 242 | 243 | n_timesteps = input1.shape[0] 244 | n_samples = input1.shape[1] 245 | 246 | lstm1, lstm_whole1 = self._lstm_net(tparams, input1, sequence_len, batch_size, embedding_size, mask1, proj_size) 247 | lstm2, lstm_whole2 = self._lstm_net(tparams, input2, sequence_len, batch_size, embedding_size, mask2, proj_size) 248 | lstm3, lstm_whole3 = self._lstm_net(tparams, input3, sequence_len, batch_size, embedding_size, mask3, proj_size) 249 | 250 | #dimshuffle [sequence_len, batch_size, proj_size] to [batch_size, sequence_len, proj_size] 251 | cnn_input1 = T.reshape(lstm1.dimshuffle(1, 0, 2), [batch_size, 1, sequence_len, proj_size]) 252 | cnn_input2 = T.reshape(lstm2.dimshuffle(1, 0, 2), [batch_size, 1, sequence_len, proj_size]) 253 | cnn_input3 = T.reshape(lstm3.dimshuffle(1, 0, 2), [batch_size, 1, sequence_len, proj_size]) 254 | cnn1 = self._cnn_net(tparams, cnn_input1, batch_size, sequence_len, num_filters, filter_sizes, proj_size) 255 | cnn2 = self._cnn_net(tparams, cnn_input2, batch_size, sequence_len, num_filters, filter_sizes, proj_size) 256 | cnn3 = self._cnn_net(tparams, cnn_input3, batch_size, sequence_len, num_filters, filter_sizes, proj_size) 257 | 258 | len1 = T.sqrt(T.sum(cnn1 * cnn1, axis=1)) 259 | len2 = T.sqrt(T.sum(cnn2 * cnn2, axis=1)) 260 | len3 = 
T.sqrt(T.sum(cnn3 * cnn3, axis=1)) 261 | 262 | self.cos12 = T.sum(cnn1 * cnn2, axis=1) / (len1 * len2) 263 | self.cos13 = T.sum(cnn1 * cnn3, axis=1) / (len1 * len3) 264 | 265 | zero = theano.shared(np.zeros(batch_size, dtype=config.floatX), borrow=True) 266 | margin = theano.shared(np.full(batch_size, 0.05, dtype=config.floatX), borrow=True) 267 | diff = T.cast(T.maximum(zero, margin - self.cos12 + self.cos13), dtype=config.floatX) 268 | self.cost = T.sum(diff, acc_dtype=config.floatX) 269 | self.accuracy = T.sum(T.cast(T.eq(zero, diff), dtype='int32')) / float(batch_size) 270 | 271 | def _cnn_net(self, tparams, cnn_input, batch_size, sequence_len, num_filters, filter_sizes, proj_size): 272 | outputs = [] 273 | for filter_size in filter_sizes: 274 | filter_shape = (num_filters, 1, filter_size, proj_size) 275 | image_shape = (batch_size, 1, sequence_len, proj_size) 276 | W = tparams['cnn_W_' + str(filter_size)] 277 | b = tparams['cnn_b_' + str(filter_size)] 278 | conv_out = conv2d(input=cnn_input, filters=W, filter_shape=filter_shape, input_shape=image_shape) 279 | pooled_out = pool.pool_2d(input=conv_out, ds=(sequence_len - filter_size + 1, 1), ignore_border=True, mode='max') 280 | pooled_active = T.tanh(pooled_out + b.dimshuffle('x', 0, 'x', 'x')) 281 | outputs.append(pooled_active) 282 | num_filters_total = num_filters * len(filter_sizes) 283 | output_tensor = T.reshape(T.concatenate(outputs, axis=1), [batch_size, num_filters_total]) 284 | return output_tensor 285 | 286 | def _lstm_net(self, tparams, _input, sequence_len, batch_size, embedding_size, mask, proj_size): 287 | input_matrix = tparams['lookup_table'][T.cast(_input.flatten(), dtype="int32")] 288 | input_x = input_matrix.reshape((sequence_len, batch_size, embedding_size)) 289 | proj, proj_whole = lstm_layer(tparams, input_x, proj_size, prefix='lstm', mask=mask) 290 | #if useMask == True: 291 | #proj = (proj * mask[:, :, None]).sum(axis=0) 292 | #proj = proj / mask.sum(axis=0)[:, None] 293 | #if options['use_dropout']: 294 | #proj = dropout_layer(proj, use_noise, trng) 295 | return proj, proj_whole 296 | 297 | #state_below is word_embbeding tensor(3dim) 298 | def lstm_layer(tparams, state_below, proj_size, prefix='lstm', mask=None): 299 | #dim-0 steps, dim-1 samples(batch_size), dim-3 word_embedding 300 | nsteps = state_below.shape[0] 301 | if state_below.ndim == 3: 302 | n_samples = state_below.shape[1] 303 | else: 304 | n_samples = 1 305 | 306 | assert mask is not None 307 | 308 | def _slice(_x, n, dim): 309 | if _x.ndim == 3: 310 | return _x[:, :, n * dim:(n + 1) * dim] 311 | return _x[:, n * dim:(n + 1) * dim] 312 | 313 | #h means hidden output? c means context? so we'll use h? 314 | #rval[0] = [sequence_len, batch_size, proj_size], rval[1] the same 315 | 316 | #so preact size must equl to x_(lstm input slice) 317 | #if you want change lstm h(t) size, 'lstm_U' and 'lstm_b' 318 | #and precat must be changed to another function, like h*U+b 319 | #see http://colah.github.io/posts/2015-08-Understanding-LSTMs/ 320 | #f(t) = sigmoid(Wf * [h(t-1),x(t)] + bf) 321 | def _step(m_, x_, h_, c_): 322 | preact = T.dot(h_, tparams[_p(prefix, 'U')]) 323 | preact += x_ 324 | 325 | i = T.nnet.sigmoid(_slice(preact, 0, proj_size)) 326 | f = T.nnet.sigmoid(_slice(preact, 1, proj_size)) 327 | o = T.nnet.sigmoid(_slice(preact, 2, proj_size)) 328 | c = T.tanh(_slice(preact, 3, proj_size)) 329 | 330 | c = f * c_ + i * c 331 | c = m_[:, None] * c + (1. 
- m_)[:, None] * c_ 332 | 333 | h = o * T.tanh(c) 334 | #if mask(t-1)==0, than make h(t) = h(t-1) 335 | h = m_[:, None] * h + (1. - m_)[:, None] * h_ 336 | 337 | return h, c 338 | 339 | state_below = (T.dot(state_below, tparams[_p(prefix, 'W')]) + 340 | tparams[_p(prefix, 'b')]) 341 | 342 | dim_proj = proj_size 343 | rval, updates = theano.scan(_step, 344 | sequences=[mask, state_below], 345 | outputs_info=[T.alloc(numpy_floatX(0.), 346 | n_samples, 347 | dim_proj), 348 | T.alloc(numpy_floatX(0.), 349 | n_samples, 350 | dim_proj)], 351 | name=_p(prefix, '_layers'), 352 | n_steps=nsteps) 353 | return rval[0], rval[1] 354 | 355 | def _p(pp, name): 356 | return '%s_%s' % (pp, name) 357 | 358 | def train(): 359 | batch_size = int(256) 360 | embedding_size = 100 361 | learning_rate = 0.05 362 | n_epochs = 20000000 363 | validation_freq = 1000 364 | filter_sizes = [1, 2, 3, 5] 365 | num_filters = 500 366 | 367 | vocab = build_vocab() 368 | word_embeddings = load_word_embeddings(vocab, embedding_size) 369 | trainList = load_train_list() 370 | testList = load_test_list() 371 | train_x1, train_x2, train_x3, mask1, mask2, mask3 = load_data(trainList, vocab, batch_size) 372 | x1, x2, x3 = T.fmatrix('x1'), T.fmatrix('x2'), T.fmatrix('x3') 373 | m1, m2, m3 = T.fmatrix('m1'), T.fmatrix('m2'), T.fmatrix('m3') 374 | model = LSTM( 375 | input1=x1, input2=x2, input3=x3, 376 | mask1=m1, mask2=m2, mask3=m3, 377 | word_embeddings=word_embeddings, 378 | batch_size=batch_size, 379 | sequence_len=train_x1.shape[0], #row is sequence_len 380 | embedding_size=embedding_size, 381 | filter_sizes=filter_sizes, 382 | num_filters=num_filters) 383 | 384 | cost, cos12, cos13 = model.cost, model.cos12, model.cos13 385 | params, accuracy = model.params, model.accuracy 386 | grads = T.grad(cost, params) 387 | updates = [ 388 | (param_i, param_i - learning_rate * grad_i) 389 | for param_i, grad_i in zip(params, grads) 390 | ] 391 | 392 | p1, p2, p3 = T.fmatrix('p1'), T.fmatrix('p2'), T.fmatrix('p3') 393 | q1, q2, q3 = T.fmatrix('q1'), T.fmatrix('q2'), T.fmatrix('q3') 394 | train_model = theano.function( 395 | [p1, p2, p3, q1, q2, q3], 396 | [cost, accuracy], 397 | updates=updates, 398 | givens={ 399 | x1: p1, x2: p2, x3: p3, m1: q1, m2: q2, m3: q3 400 | } 401 | ) 402 | 403 | v1, v2, v3 = T.matrix('v1'), T.matrix('v2'), T.matrix('v3') 404 | u1, u2, u3 = T.matrix('u1'), T.matrix('u2'), T.matrix('u3') 405 | validate_model = theano.function( 406 | inputs=[v1, v2, v3, u1, u2, u3], 407 | outputs=[cos12, cos13], 408 | #updates=updates, 409 | givens={ 410 | x1: v1, x2: v2, x3: v3, m1: u1, m2: u2, m3: u3 411 | } 412 | ) 413 | 414 | epoch = 0 415 | done_looping = False 416 | while (epoch < n_epochs) and (not done_looping): 417 | epoch += 1 418 | train_x1, train_x2, train_x3, mask1, mask2, mask3 = load_data(trainList, vocab, batch_size) 419 | #print('train_x1, train_x2, train_x3') 420 | #print(train_x1.shape, train_x2.shape, train_x3.shape) 421 | cost_ij, acc = train_model(train_x1, train_x2, train_x3, mask1, mask2, mask3) 422 | print 'load data done ...... epoch:' + str(epoch) + ' cost:' + str(cost_ij) + ', acc:' + str(acc) 423 | if epoch % validation_freq == 0: 424 | print 'Evaluation ......' 425 | validation(validate_model, testList, vocab, batch_size) 426 | 427 | if __name__ == '__main__': 428 | train() 429 | --------------------------------------------------------------------------------