├── cnn
│   ├── tensorflow
│   │   ├── test.py
│   │   ├── README.md
│   │   ├── insqa_cnn.py
│   │   ├── insqa_cnn.py.old
│   │   ├── insqa_train.py
│   │   ├── insurance_qa_data_helpers.py
│   │   └── insqa_train.py.old
│   └── theano
│       ├── README.md
│       └── insqa_cnn.py
├── lstm_cnn
│   └── theano
│       ├── README.md
│       └── insqa_lstm.py
├── config.py
├── README.md
├── gen.py
├── swem
│   ├── swem_hier.py
│   ├── swem_max_margin.py
│   └── swem_hier_margin.py
├── rnn_attention
│   └── tensorflow
│       ├── insurance_qa_data_helpers.py
│       └── tf_rnn_char.py
└── utils.py
/cnn/tensorflow/test.py:
--------------------------------------------------------------------------------
1 | import random
2 |
3 | _list = [i for i in range(0, 10)]
4 | _l1 = random.sample(_list, 2)
5 | _l2 = random.sample(_list, 2)
6 | print(_l1)
7 | print(_l2)
8 | for i in range(2, 2):
9 | print(i)
10 |
11 |
--------------------------------------------------------------------------------
/lstm_cnn/theano/README.md:
--------------------------------------------------------------------------------
1 |
2 | theano lstm+cnn code for insuranceQA
3 |
4 | ================result==================
5 |
6 | theano code, test1 top-1 precision : 68.3%
7 |
8 | LSTM+CNN is better than the plain CNN (61.5%).
9 |
10 | ================dataset================
11 |
12 | The dataset is large, so only a sample of test1 is included (see ./insuranceQA/test1.sample)
13 |
14 | I converted the original idx_xx format to a real-word format (see ./insuranceQA/train and ./insuranceQA/test1.sample)
15 |
16 | You can get the original dataset from https://github.com/shuzi/insuranceQA
17 |
18 | Word embeddings are trained with the word2vec toolkit
19 |
20 | =================run=====================
21 |
22 | Reformat the original dataset (see my train and test1.sample files)
23 |
24 | Change the file paths to your dataset (see the TODO in insqa_lstm.py)
25 |
26 | python insqa_lstm.py
27 |
--------------------------------------------------------------------------------
/cnn/tensorflow/README.md:
--------------------------------------------------------------------------------
1 |
2 | ================result==================
3 |
4 | The result is about the same as the theano version; I forget the exact number.
5 |
6 | Dropout is written in the code but is not actually used; it does not affect the result much, and training is faster without it.
7 |
8 | ================dataset================
9 |
10 | The data format is the same as the theano version
11 |
12 | Only sample data is provided on GitHub; if you need the full dataset you can contact me directly
13 | The dataset is large, so only a sample of test1 is included (see ./insuranceQA/test1.sample)
14 |
15 | I converted the original idx_xx format to a real-word format (see ./insuranceQA/train and ./insuranceQA/test1.sample)
16 |
17 | You can get the original dataset from https://github.com/shuzi/insuranceQA
18 |
19 | Word embeddings are trained with the word2vec toolkit
20 |
21 | =================run=====================
22 |
23 | ./insqa_train.py
24 |
25 | I used python3.4; some of the code may be incompatible with python2, so a few small changes may be needed if you run it under python2. The core CNN code
26 | should not need any modification.
27 | The data paths in the code (paths like '/export/...') must be adjusted to your own environment; just point them at your own data. The core CNN code needs no changes.
28 |
--------------------------------------------------------------------------------
/cnn/theano/README.md:
--------------------------------------------------------------------------------
1 |
2 | ================result==================
3 | theano and tensorflow cnn code for insuranceQA
4 |
5 | theano code, test1 top-1 precision : 61.5% (see ./insuranceQA/acc)
6 | tensorflow code, test1 top-1 precision : 62.6%
7 |
8 | The best precision in the paper is 62.8% (see "Applying Deep Learning To Answer Selection: A Study And An Open Task")
9 |
10 | ================dataset================
11 | The dataset is large, so only a sample of test1 is included (see ./insuranceQA/test1.sample)
12 |
13 | I converted the original idx_xx format to a real-word format (see ./insuranceQA/train and ./insuranceQA/test1.sample)
14 |
15 | You can get the original dataset from https://github.com/shuzi/insuranceQA
16 |
17 | Word embeddings are trained with the word2vec toolkit
18 |
19 | =================run=====================
20 | Reformat the original dataset (see my train and test1.sample files)
21 | Change the file paths to your dataset (see the TODO in insqa_cnn.py)
22 | python insqa_cnn.py
23 |
--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | dataset_ins = 'insurance-qa'
4 | dataset_qur = 'quora-qa'
5 |
6 | ##################################################################
7 | # adjust to your running environment
8 | # which dataset to use
9 | dataset = dataset_qur
10 | # word2vec command path
11 | w2v_command = '/export/jw/word2vec/word2vec'
12 | ##################################################################
13 |
14 | home = ''
15 | if dataset == dataset_ins:
16 | home = os.path.expanduser('/export/jw/insuranceQA')
17 | elif dataset == dataset_qur:
18 | home = os.path.expanduser('/export/jw/quora')
19 |
20 | #Insurance-QA original data directory
21 | qa_version = 'V1'
22 | vocab_file = os.path.join(home, qa_version, 'vocabulary')
23 | answers_file = os.path.join(home, qa_version, 'answers.label.token_idx')
24 | question_train_file = os.path.join(home, qa_version, 'question.train.token_idx.label')
25 | question_test1_file = os.path.join(home, qa_version, 'question.test1.label.token_idx.pool')
26 | question_test2_file = os.path.join(home, qa_version, 'question.test2.label.token_idx.pool')
27 | question_dev_file = os.path.join(home, qa_version, 'question.dev.label.token_idx.pool')
28 | #quora original data directory
29 | qr_file = os.path.join(home, 'quora_duplicate_questions.tsv')
30 | qr_train_ratio = 0.8
31 | #processed files
32 | train_file = os.path.join(home, 'data', 'train.prepro')
33 | test1_file = os.path.join(home, 'data', 'test1.prepro')
34 | test2_file = os.path.join(home, 'data', 'test2.prepro')
35 | w2v_train_file = os.path.join(home, 'data', 'w2v.train')
36 | w2v_bin_file = os.path.join(home, 'data', 'w2v.bin')
37 | predict1_file = os.path.join(home, 'data', 'predict1')
38 |
39 |
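40 | # Expected directory layout under `home` (derived from the paths above):
41 | #   <home>/V1/   : original Insurance-QA files (vocabulary, answers.label.token_idx, question.*)
42 | #   <home>/data/ : files produced by gen.py (train.prepro, test*.prepro, w2v.train, w2v.bin, predict1)
43 | # For the quora dataset only <home>/quora_duplicate_questions.tsv and <home>/data/ are used.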
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Insurance-QA deeplearning model
2 | ======
3 | This is a repo for Q&A matching. It includes several deep learning models, such as CNN and RNN.
4 | 1. CNN. Basic CNN model from "Applying Deep Learning To Answer Selection: A Study And An Open Task".
5 | 2. RNN. RNN seems to be the best model on the Insurance-QA dataset.
6 | 3. SWEM. SWEM is the fastest and works well on other datasets such as WikiQA ..., but it seems not as good on the Insurance-QA dataset. I think SWEM is more suitable for Q&Q matching than for Q&A matching.
7 |
8 |
9 | It's hard to say which model is best on other datasets; you have to choose the one that suits your task.
10 | More models are on the way; watch for updates.
11 |
12 | ## Requirements
13 | 1. tensorflow 1.4.0
14 | 2. python3.5
15 |
16 | ## Performance
17 | margin loss version
18 |
19 | Model | Insurance-QA top-1 precision | Quora best precision
20 | ------------ | ------------- | -------------
21 | CNN | 62% | None
22 | LSTM+CNN | 68% | None
23 | SWEM | <55% | None
24 |
25 | logloss version
26 |
27 | Model | Insurance-QA top-1 precision | Quora best precision
28 | ------------ | ------------- | -------------
29 | CNN | None | 79.60%
30 | LSTM+CNN | None | None
31 | SWEM | <40% | 82.69%
32 |
33 | ## Running
34 | Change the configuration to your own environment, such as the data paths (see the configuration example at the end of this README)
35 |
36 | vim config.py
37 |
38 | Data processing
39 |
40 | python3 gen.py
41 |
42 | Run CNN model
43 |
44 | cd ./cnn/tensorflow && python3 insqa_train.py
45 |
46 | It will take a few hours (thousands of epochs) to train this model on a single GPU.
47 |
48 | ## Downloads
49 | 1. You can get the Insurance-QA data from https://github.com/shuzi/insuranceQA
50 | 2. You can get the Quora data from http://qim.ec.quoracdn.net/quora_duplicate_questions.tsv
51 |
52 | ## Links
53 | 1. CNN and RNN text classification repo: https://github.com/white127/TextClassification_CNN_RNN
54 | 2. "Applying Deep Learning To Answer Selection: A Study And An Open Task"
55 |
56 |
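57 | ## Configuration example
58 | A minimal sketch of the config.py settings to adjust before running (the paths below are placeholders, not shipped defaults):
59 |
60 |     dataset = dataset_ins                       # or dataset_qur
61 |     w2v_command = '/path/to/word2vec/word2vec'  # word2vec binary used by gen.py
62 |     home = os.path.expanduser('/path/to/insuranceQA')   # data root for the chosen dataset
63 |
64 | gen.py then writes the processed train/test files and the word2vec embeddings under `<home>/data/`.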
--------------------------------------------------------------------------------
/cnn/tensorflow/insqa_cnn.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import numpy as np
3 |
4 | ##########################################################################
5 | # embedding_lookup + cnn + cosine margin loss, batch version
6 | ##########################################################################
7 | class InsQACNN(object):
8 | def __init__(self, _margin, sequence_length, batch_size,
9 | vocab_size, embedding_size,
10 | filter_sizes, num_filters, l2_reg_lambda=0.0):
11 | self.L, self.B, self.V, self.E, self.FS, self.NF = sequence_length, batch_size, \
12 | vocab_size, embedding_size, filter_sizes, num_filters
13 |
14 |         # user question (word ids, embedded via embedding_lookup)
15 |         self.q = tf.placeholder(tf.int32, [self.B, self.L], name="q")
16 |         # positive candidate to match against the question
17 |         self.qp = tf.placeholder(tf.int32, [self.B, self.L], name="qp")
18 |         # negative candidate
19 |         self.qn = tf.placeholder(tf.int32, [self.B, self.L], name="qn")
20 | self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
21 | l2_loss = tf.constant(0.0)
22 |
23 | # Embedding layer
24 | with tf.device('/cpu:0'), tf.name_scope("embedding"):
25 | W = tf.get_variable(
26 | initializer=tf.random_uniform([self.V, self.E], -1.0, 1.0),
27 | name='We')
28 | self.qe = tf.nn.embedding_lookup(W, self.q)
29 | self.qpe = tf.nn.embedding_lookup(W, self.qp)
30 | self.qne = tf.nn.embedding_lookup(W, self.qn)
31 | self.qe = tf.expand_dims(self.qe, -1)
32 | self.qpe = tf.expand_dims(self.qpe, -1)
33 | self.qne = tf.expand_dims(self.qne, -1)
34 |
35 | with tf.variable_scope('shared-conv') as scope:
36 | self.qe = self.conv(self.qe)
37 | scope.reuse_variables()
38 | #tf.get_variable_scope().reuse_variables()
39 | self.qpe = self.conv(self.qpe)
40 | scope.reuse_variables()
41 | #tf.get_variable_scope().reuse_variables()
42 | self.qne = self.conv(self.qne)
43 | self.cos_q_qp = self.cosine(self.qe, self.qpe)
44 | self.cos_q_qn = self.cosine(self.qe, self.qne)
45 | zero = tf.constant(0, shape=[self.B], dtype=tf.float32)
46 | margin = tf.constant(_margin, shape=[self.B], dtype=tf.float32)
47 | with tf.name_scope("loss"):
48 | self.losses = tf.maximum(zero, tf.subtract(margin, tf.subtract(self.cos_q_qp, self.cos_q_qn)))
49 | self.loss = tf.reduce_sum(self.losses) + l2_reg_lambda * l2_loss
50 | print('loss ', self.loss)
51 |
52 | # Accuracy
53 | with tf.name_scope("accuracy"):
54 | self.correct = tf.equal(zero, self.losses)
55 | self.accuracy = tf.reduce_mean(tf.cast(self.correct, "float"), name="accuracy")
56 |
57 | for v in tf.trainable_variables():
58 | print(v)
59 |
60 | def conv(self, tensor):
61 | pooled = []
62 | #with tf.variable_scope(name_or_scope='my-conv', reuse=tf.AUTO_REUSE):
63 | with tf.variable_scope("my-conv-shared"):
64 | for i, fs in enumerate(self.FS):
65 | filter_shape = [fs, self.E, 1, self.NF]
66 | W = tf.get_variable(initializer=tf.truncated_normal(filter_shape, stddev=0.1),
67 | name="W-%s" % str(fs))
68 | b = tf.get_variable(initializer=tf.constant(0.1, shape=[self.NF]),
69 | name="b-%s" % str(fs))
70 | conv = tf.nn.conv2d(
71 | tensor, W, strides=[1, 1, 1, 1], padding='VALID',
72 | name="conv")
73 | h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
74 | output = tf.nn.max_pool(
75 | h, ksize=[1, self.L - fs + 1, 1, 1], strides=[1, 1, 1, 1], padding='VALID',
76 | name="pool")
77 | pooled.append(output)
78 | num_filters_total = self.NF * len(self.FS)
79 | pooled = tf.reshape(tf.concat(pooled, 3), [-1, num_filters_total])
80 | pooled = tf.nn.dropout(pooled, self.dropout_keep_prob)
81 | return pooled
82 |
83 | def cosine(self, v1, v2):
84 | l1 = tf.sqrt(tf.reduce_sum(tf.multiply(v1, v1), 1))
85 | l2 = tf.sqrt(tf.reduce_sum(tf.multiply(v2, v2), 1))
86 | a = tf.reduce_sum(tf.multiply(v1, v2), 1)
87 | cos = tf.div(a, tf.multiply(l1, l2), name='score')
88 | return tf.clip_by_value(cos, 1e-5, 0.99999)
89 |
90 |
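91 | # Typical construction, mirroring the flag defaults in insqa_train.py (this directory):
92 | #   cnn = InsQACNN(_margin=0.05, sequence_length=200, batch_size=256,
93 | #                  vocab_size=len(vocab), embedding_size=100,
94 | #                  filter_sizes=[1, 2, 3, 5], num_filters=256, l2_reg_lambda=0.0)
95 | # Feed cnn.q / cnn.qp / cnn.qn with int32 word-id batches of shape [batch_size, sequence_length],
96 | # set cnn.dropout_keep_prob, and minimize cnn.loss.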
--------------------------------------------------------------------------------
/gen.py:
--------------------------------------------------------------------------------
1 | import config, os, random
2 |
3 | #####################################################################
4 | # function: load vocab
5 | # return: dict[word] = [word_id]
6 | #####################################################################
7 | def load_vocab():
8 | voc = {}
9 | for line in open(config.vocab_file):
10 | word, _id = line.strip().split('\t')
11 | voc[word] = _id
12 | return voc
13 |
14 | #####################################################################
15 | # function: load answers, restore idx to real word
16 | # return : [answer_1, answer_2, ..., answer_n]
17 | #####################################################################
18 | def ins_load_answers():
19 | _list, voc = [''], load_vocab()
20 | for line in open(config.answers_file):
21 | _, sent = line.strip().split('\t')
22 | _list.append('_'.join([voc[wid] for wid in sent.split(' ')]))
23 | return _list
24 |
25 | #####################################################################
26 | # function: prepare the word2vec training file and train the word vectors
27 | # return :
28 | #####################################################################
29 | def ins_w2v():
30 | print('preparing word2vec ......')
31 | _data, voc = [], load_vocab()
32 | for line in open(config.question_train_file):
33 | items = line.strip().split('\t')
34 | _data.append(' '.join([voc[_id] for _id in items[0].split(' ')]))
35 | for _file in [config.answers_file, config.question_dev_file, \
36 | config.question_test1_file, config.question_test2_file]:
37 | for line in open(_file):
38 | items = line.strip().split('\t')
39 | _data.append(' '.join([voc[_id] for _id in items[1].split(' ')]))
40 | of = open(config.w2v_train_file, 'w')
41 | for s in _data: of.write(s + '\n')
42 | of.close()
43 |     os.system('time ' + config.w2v_command + ' -train ' + config.w2v_train_file + ' -output ' + config.w2v_bin_file + ' -cbow 0 -size 100 -window 5 -negative 20 -sample 1e-3 -threads 12 -binary 0 -min-count 1')
44 |
45 | #####################################################################
46 | # function: prepare the train file
47 | # file format: flag question answer
48 | #####################################################################
49 | def ins_train():
50 | print('preparing train ......')
51 | answers, voc, _data = ins_load_answers(), load_vocab(), []
52 | for line in open(config.question_train_file):
53 | qsent, ids = line.strip().split('\t')
54 | qsent = '_'.join([voc[wid] for wid in qsent.split(' ')])
55 | for _id in ids.split(' '):
56 | _data.append(' '.join(['1', qsent, answers[int(_id)]]))
57 | of = open(config.train_file, 'w')
58 | for _s in _data: of.write(_s + '\n')
59 | of.close()
60 |
61 | #####################################################################
62 | # function: prepare the test files
63 | # file format: flag group_id question answer
64 | #####################################################################
65 | def ins_test():
66 | print('preparing test ......')
67 | answers, voc = ins_load_answers(), load_vocab()
68 | for _in, _out in ([(config.question_test2_file, config.test2_file), \
69 | (config.question_test1_file, config.test1_file)]):
70 | _data, group = [], int(0)
71 | for line in open(_in):
72 | pids, qsent, pnids = line.strip().split('\t')
73 | positive = {_id:'#' for _id in pids.split(' ')}
74 | qsent = '_'.join([voc[wid] for wid in qsent.split(' ')])
75 | for _id in pnids.split(' '):
76 | flag = '1' if _id in positive else '0'
77 | _data.append(' '.join([flag, str(group), qsent, answers[int(_id)]]))
78 | group += 1
79 | of = open(_out, 'w')
80 | for s in _data: of.write(s + '\n')
81 | of.close()
82 |
83 | def ins_qa():
84 | ins_w2v()
85 | ins_train()
86 | ins_test()
87 |
88 | def qur_prepare():
89 | #pretrain word2vec
90 | _list = []
91 | for line in open(config.qr_file):
92 | items = line.strip().split('\t')
93 | if len(items) != 6:
94 | continue
95 | _list.append(items)
96 | _list = _list[1:]
97 | random.shuffle(_list)
98 | _list = [(f, q1, q2) for _,_,_,q1,q2,f in _list]
99 | of = open(config.w2v_train_file, 'w')
100 | for f, q1, q2 in _list:
101 | of.write(q1 + '\n')
102 | of.write(q2 + '\n')
103 | of.close()
104 | os.system('time ' + config.w2v_command + ' -train ' + config.w2v_train_file + ' -output ' + config.w2v_bin_file + ' -cbow 0 -size 100 -window 5 -negative 20 -sample 1e-3 -threads 12 -binary 0 -min-count 1')
105 | #train file
106 | _newlist = []
107 | for f, q1, q2 in _list:
108 | if len(q1) <= 1 or len(q2) <= 1: continue
109 | q1 = '_'.join(q1.split(' '))
110 | q2 = '_'.join(q2.split(' '))
111 | _newlist.append((f, q1, q2))
112 | _list = _newlist
113 | of = open(config.train_file, 'w')
114 | for f, q1, q2 in _list[:int(len(_list) * 0.8)]:
115 | of.write(' '.join([f, q1, q2]) + '\n')
116 | of.close()
117 |
118 | #test file
119 | of = open(config.test1_file, 'w')
120 | for f, q1, q2 in _list[int(len(_list) * 0.8):]:
121 | of.write(' '.join([f, q1, q2]) + '\n')
122 | of.close()
123 |
124 | def qur_qa():
125 | qur_prepare()
126 |
127 | if __name__ == '__main__':
128 | if config.dataset == config.dataset_ins:
129 | ins_qa()
130 | elif config.dataset == config.dataset_qur:
131 | qur_qa()
132 |
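133 | # Output line formats written by the functions above (tokens inside a sentence are
134 | # joined by '_'; the example words are hypothetical):
135 | #   insurance train.prepro : "1 how_do_i_buy_insurance you_can_buy_it_through_an_agent"   (flag question answer)
136 | #   insurance test*.prepro : "1 0 how_do_i_buy_insurance you_can_buy_it_through_an_agent" (flag group_id question answer)
137 | #   quora train/test1      : "flag question1 question2"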
--------------------------------------------------------------------------------
/swem/swem_hier.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 | import time, os, random, datetime, sys
4 | from sklearn import metrics
5 | sys.path.append('../')
6 | import config, utils
7 |
8 | ################################################################################
9 | # Insurance-QA
10 | # AUC 0.96, top 1 precision:31%
11 | #
12 | # quora-data
13 | # best precision: 0.8369, best threshold:0.62
14 | ################################################################################
15 | class SWEM_HIER(object):
16 | def __init__(self,
17 | sequence_length,
18 | vocab_size,
19 | embedding_size,
20 | embeddings):
21 | self.x1 = tf.placeholder(tf.int32, [None, sequence_length])
22 | self.x2 = tf.placeholder(tf.int32, [None, sequence_length])
23 | self.y = tf.placeholder(tf.float32, [None])
24 | self.one = tf.placeholder(tf.float32, [None])
25 | #self.dropout_keep_prob = tf.placeholder(tf.float32)
26 |
27 | with tf.device('/cpu:0'), tf.name_scope('embedding'):
28 | self.word_mat = tf.Variable(embeddings, trainable=True, dtype=tf.float32)
29 | x1_mat = tf.nn.embedding_lookup(self.word_mat, self.x1)
30 | x2_mat = tf.nn.embedding_lookup(self.word_mat, self.x2)
31 | self.x1_mat_exp = tf.expand_dims(x1_mat, -1)
32 | self.x2_mat_exp = tf.expand_dims(x2_mat, -1)
33 | p1 = tf.nn.avg_pool(self.x1_mat_exp, ksize=[1, 2, 1, 1],
34 | strides=[1, 1, 1, 1], padding='VALID')
35 | p2 = tf.nn.avg_pool(self.x2_mat_exp, ksize=[1, 2, 1, 1],
36 | strides=[1, 1, 1, 1], padding='VALID')
37 | p1 = tf.reshape(tf.reduce_max(p1, 1), [-1, embedding_size])
38 | p2 = tf.reshape(tf.reduce_max(p2, 1), [-1, embedding_size])
39 | """
40 | p11 = tf.nn.avg_pool(self.x1_mat_exp, ksize=[1, 3, 1, 1],
41 | strides=[1, 1, 1, 1], padding='VALID')
42 | p21 = tf.nn.avg_pool(self.x2_mat_exp, ksize=[1, 3, 1, 1],
43 | strides=[1, 1, 1, 1], padding='VALID')
44 | p11 = tf.reshape(tf.reduce_max(p11, 1), [-1, embedding_size])
45 | p21 = tf.reshape(tf.reduce_max(p21, 1), [-1, embedding_size])
46 | p1 = tf.concat([p1, p11], 1)
47 | p2 = tf.concat([p2, p21], 1)
48 | """
49 |
50 | self.cos = self.cosine(p1, p2)
51 | self.losses = self.logloss(self.y, self.one, self.cos)
52 |
53 | def logloss(self, y, v_one, sim):
54 | a = tf.multiply(y, tf.log(sim)) #y*log(p)
55 | b = tf.subtract(v_one, y)#1-y
56 | c = tf.log(tf.subtract(v_one, sim))#log(1-p)
57 | losses = -tf.add(a, tf.multiply(b, c))#y*log(p)+(1-y)*log(1-p)
58 | losses = tf.reduce_sum(losses, -1)
59 | return losses
60 |
61 | def cosine(self, t1, t2):
62 | len1 = tf.sqrt(tf.reduce_sum(tf.multiply(t1, t1), 1))
63 | len2 = tf.sqrt(tf.reduce_sum(tf.multiply(t2, t2), 1))
64 | multiply = tf.reduce_sum(tf.multiply(t1, t2), 1)
65 | cos = tf.div(multiply, tf.multiply(len1, len2))
66 | return tf.clip_by_value(cos, 1e-5, 0.99999)
67 |
68 | def get_constant(batch_size):
69 | one, zero = [1.0] * batch_size, [0.0] * batch_size
70 | return np.array(one), np.array(zero)
71 |
72 | max_len = 100
73 | num_epoch = 200000
74 | batch_size = 256
75 | checkpoint_every = 10000
76 | vocab, embeddings = utils.load_embeddings()
77 | embedding_size = len(embeddings[0])
78 | train_data, test_data = utils.load_train_data(vocab, max_len), utils.load_test_data(vocab, max_len)
79 | print('load data done ......')
80 | print(embeddings.shape)
81 |
82 | prev_auc = 0.0
83 | with tf.Graph().as_default():
84 | session_conf = tf.ConfigProto(
85 | allow_soft_placement=True, log_device_placement=False)
86 | sess = tf.Session(config=session_conf)
87 | with sess.as_default():
88 | swem = SWEM_HIER(max_len, len(vocab), embedding_size, embeddings)
89 | global_step = tf.Variable(0, name="global_step", trainable=False)
90 | optimizer = tf.train.AdamOptimizer(1e-1)
91 | #optimizer = tf.train.GradientDescentOptimizer(1e-1)
92 | grads_and_vars = optimizer.compute_gradients(swem.losses)
93 | train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)
94 |
95 | timestamp = str(int(time.time()))
96 | out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
97 | checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
98 | checkpoint_prefix = os.path.join(checkpoint_dir, "model")
99 | if not os.path.exists(checkpoint_dir):
100 | os.makedirs(checkpoint_dir)
101 | saver = tf.train.Saver(tf.all_variables())
102 | sess.run(tf.initialize_all_variables())
103 |
104 | def train_step():
105 | y, x1, x2 = utils.gen_train_batch_yxx(train_data, batch_size)
106 | one, zero = get_constant(batch_size)
107 | feed_dict = {swem.x1:x1, swem.x2:x2, swem.y:y, swem.one:one}
108 | _, step, loss, cos = sess.run(
109 | [train_op, global_step, swem.losses, swem.cos], feed_dict)
110 | time_str = datetime.datetime.now().isoformat()
111 | print("{}: step {}, loss {:g}".format(time_str, step, loss))
112 |
113 | def test_step():
114 | yp, y, group = [], [], []
115 | for i in range(0, len(test_data), batch_size):
116 | f, g, x1, x2 = utils.gen_test_batch_yxx(test_data, i, i + batch_size)
117 | one, zero = get_constant(len(f))
118 | feed_dict = {swem.x1:x1, swem.x2:x2, swem.y:f, swem.one:one}
119 | loss, cos = sess.run([swem.losses, swem.cos], feed_dict)
120 | yp.extend(cos)
121 | y.extend(f)
122 | group.extend(g)
123 | ppp = [(_y, _g, _yp) for _y, _g, _yp in zip(y, group, yp)]
124 | #for _y, _g, _yp in ppp:
125 | # print(str(_y) + ' ' + str(_g) + ' ' + str(_yp))
126 | return y[:len(test_data)], group[:len(test_data)], yp[:len(test_data)]
127 |
128 | for i in range(num_epoch):
129 | train_step()
130 | current_step = tf.train.global_step(sess, global_step)
131 | if current_step % checkpoint_every == 0:
132 | y, g, yp = test_step()
133 | utils._eval(y, g, yp)
134 |
135 | #utils.save_features(features[0] + features[1] + features[2], './data/gen_sweg_hier_train.f')
136 | #utils.save_features(features[3], './data/gen_sweg_hier_test.f')
137 |
--------------------------------------------------------------------------------
/rnn_attention/tensorflow/insurance_qa_data_helpers.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import random
3 | from operator import itemgetter
4 |
5 | precision = '/export/jw/cnn/insuranceQA/acc.lstm'
6 |
7 | empty_vector = []
8 | for i in range(0, 100):
9 | empty_vector.append(float(0.0))
10 | onevector = []
11 | for i in range(0, 10):
12 | onevector.append(float(1))
13 | zerovector = []
14 | for i in range(0, 10):
15 | zerovector.append(float(0))
16 |
17 | def build_vocab():
18 | code, vocab = int(0), {}
19 | vocab['UNKNOWN'] = code
20 | code += 1
21 | for line in open('/export/jw/cnn/insuranceQA/train'):
22 | items = line.strip().split(' ')
23 | for i in range(2, 3):
24 | words = items[i].split('_')
25 | for word in words:
26 | if not word in vocab:
27 | vocab[word] = code
28 | code += 1
29 | for line in open('/export/jw/cnn/insuranceQA/test1'):
30 | items = line.strip().split(' ')
31 | for i in range(2, 3):
32 | words = items[i].split('_')
33 | for word in words:
34 | if not word in vocab:
35 | vocab[word] = code
36 | code += 1
37 | return vocab
38 |
39 | def read_alist():
40 | alist = []
41 | for line in open('/export/jw/cnn/insuranceQA/train'):
42 | items = line.strip().split(' ')
43 | alist.append(items[3])
44 | print('read_alist done ......')
45 | return alist
46 |
47 | def load_vectors():
48 | vectors = {}
49 | for line in open('/export/jw/cnn/insuranceQA/vectors.nobin'):
50 | items = line.strip().split(' ')
51 | if (len(items) < 101):
52 | continue
53 | vec = []
54 | for i in range(1, 101):
55 | vec.append(float(items[i]))
56 | vectors[items[0]] = vec
57 | return vectors
58 |
59 | def read_vector(vectors, word):
60 | global empty_vector
61 | if word in vectors:
62 | return vectors[word]
63 | else:
64 | return empty_vector
65 | #return vectors['']
66 |
67 | def load_train_list():
68 | train_list = []
69 | for line in open('/export/jw/cnn/insuranceQA/train'):
70 | items = line.strip().split(' ')
71 | if items[0] == '1':
72 | train_list.append(line.strip().split(' '))
73 | return train_list
74 |
75 | def load_test_list():
76 | test_list = []
77 | for line in open('/export/jw/cnn/insuranceQA/test1'):
78 | test_list.append(line.strip().split(' '))
79 | return test_list
80 |
81 | def load_train_and_vectors():
82 | trainList = []
83 | for line in open('/export/jw/cnn/insuranceQA/train'):
84 | trainList.append(line.strip())
85 | vectors = load_vectors()
86 | return trainList, vectors
87 |
88 | def read_raw():
89 | raw = []
90 | for line in open('/export/jw/cnn/insuranceQA/train'):
91 | items = line.strip().split(' ')
92 | if items[0] == '1':
93 | raw.append(items)
94 | return raw
95 |
96 | def encode_sent(vocab, string, size):
97 |     x, m = [], []
98 |     words = string.split('_')
99 |     for i in range(0, size):
100 |         w = words[i] if i < len(words) else ''  # pad short sentences with the empty token
101 |         x.append(vocab[w] if w in vocab else vocab['UNKNOWN'])
102 |         if w == '': m.append(0)  # mask is 0 for padding positions
103 |         else: m.append(1)        # and 1 for real words
104 |     return x, m
105 |
106 | def load_val_data(test_list, vocab, index, batch_size, max_len):
107 | x1, x2, x3, m1, m2, m3 = [], [], [], [], [], []
108 | for i in range(0, batch_size):
109 | t_i = index + i
110 | if t_i >= len(test_list):
111 | t_i = len(test_list) - 1
112 | items = test_list[t_i]
113 | x, m = encode_sent(vocab, items[2], max_len)
114 | x1.append(x)
115 | m1.append(m)
116 | x, m = encode_sent(vocab, items[3], max_len)
117 | x2.append(x)
118 | m2.append(m)
119 | x, m = encode_sent(vocab, items[3], max_len)
120 | x3.append(x)
121 | m3.append(m)
122 | return np.array(x1, dtype='float32'), np.array(x2, dtype='float32'), np.array(x3, dtype='float32'), np.transpose(np.array(m1, dtype='float32')) , np.transpose(np.array(m2, dtype='float32')), np.transpose(np.array(m3, dtype='float32'))
123 |
124 | def load_train_data(trainList, vocab, batch_size, max_len):
125 | train_1, train_2, train_3 = [], [], []
126 | mask_1, mask_2, mask_3 = [], [], []
127 | counter = 0
128 | while True:
129 | pos = trainList[random.randint(0, len(trainList)-1)]
130 | neg = trainList[random.randint(0, len(trainList)-1)]
131 | if pos[2].startswith('') or pos[3].startswith('') or neg[3].startswith(''):
132 | #print 'empty string ......'
133 | continue
134 | x, m = encode_sent(vocab, pos[2], max_len)
135 | train_1.append(x)
136 | mask_1.append(m)
137 | x, m = encode_sent(vocab, pos[3], max_len)
138 | train_2.append(x)
139 | mask_2.append(m)
140 | x, m = encode_sent(vocab, neg[3], max_len)
141 | train_3.append(x)
142 | mask_3.append(m)
143 | counter += 1
144 | if counter >= batch_size:
145 | break
146 | return np.array(train_1, dtype='float32'), np.array(train_2, dtype='float32'), np.array(train_3, dtype='float32'), np.transpose(np.array(mask_1, dtype='float32')) , np.transpose(np.array(mask_2, dtype='float32')), np.transpose(np.array(mask_3, dtype='float32'))
147 |
148 | def evaluation(score_list, test_list):
149 | global precision
150 | sessdict, index = {}, int(0)
151 | for items in test_list:
152 | qid = items[1].split(':')[1]
153 | if not qid in sessdict:
154 | sessdict[qid] = []
155 | sessdict[qid].append((score_list[index], items[0]))
156 | index += 1
157 | if index >= len(test_list):
158 | break
159 | lev1, lev0 = float(0), float(0)
160 | of = open(precision, 'a')
161 | for k, v in sessdict.items():
162 | v.sort(key=itemgetter(0), reverse=True)
163 | score, flag = v[0]
164 | if flag == '1': lev1 += 1
165 | if flag == '0': lev0 += 1
166 | of.write('lev1:' + str(lev1) + '\n')
167 | of.write('lev0:' + str(lev0) + '\n')
168 | print('lev1 ' + str(lev1))
169 | print('lev0 ' + str(lev0))
170 | print('precision:' + str(lev1 / (lev0 + lev1)))
171 | of.close()
172 |
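173 | # Note on the data files read above (format inferred from the field indexing; the
174 | # example words are hypothetical): each line of train/test1 is space-separated as
175 | #   flag qid:<n> question answer
176 | # with the tokens inside question/answer joined by '_', e.g.
177 | #   1 qid:0 how_do_i_buy_insurance you_can_buy_it_through_an_agent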
--------------------------------------------------------------------------------
/swem/swem_max_margin.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 | import time, os, random, datetime, sys
4 | from sklearn import metrics
5 | sys.path.append('../')
6 | import config, utils
7 |
8 | class SWEM_HIER(object):
9 | def __init__(self,
10 | margin,
11 | sequence_length,
12 | vocab_size,
13 | embedding_size,
14 | embeddings):
15 | self.zero = tf.placeholder(tf.float32, [None])
16 | self.q = tf.placeholder(tf.int32, [None, sequence_length])
17 | self.qp = tf.placeholder(tf.int32, [None, sequence_length])
18 | self.qn = tf.placeholder(tf.int32, [None, sequence_length])
19 |
20 | with tf.device('/cpu:0'), tf.name_scope('embedding'):
21 | self.word_mat = tf.Variable(embeddings, trainable=True, dtype=tf.float32)
22 | q_mat = tf.nn.embedding_lookup(self.word_mat, self.q)
23 | qp_mat = tf.nn.embedding_lookup(self.word_mat, self.qp)
24 | qn_mat = tf.nn.embedding_lookup(self.word_mat, self.qn)
25 | self.q_mat_exp = tf.expand_dims(q_mat, -1)
26 | self.qp_mat_exp = tf.expand_dims(qp_mat, -1)
27 | self.qn_mat_exp = tf.expand_dims(qn_mat, -1)
28 | """
29 | q = tf.nn.avg_pool(self.q_mat_exp, ksize=[1, 2, 1, 1],
30 | strides=[1, 1, 1, 1], padding='VALID')
31 | qp = tf.nn.avg_pool(self.qp_mat_exp, ksize=[1, 2, 1, 1],
32 | strides=[1, 1, 1, 1], padding='VALID')
33 | qn = tf.nn.avg_pool(self.qn_mat_exp, ksize=[1, 2, 1, 1],
34 | strides=[1, 1, 1, 1], padding='VALID')
35 | """
36 | q = tf.reshape(tf.reduce_max(self.q_mat_exp, 1), [-1, embedding_size])
37 | qp = tf.reshape(tf.reduce_max(self.qp_mat_exp, 1), [-1, embedding_size])
38 | qn = tf.reshape(tf.reduce_max(self.qn_mat_exp, 1), [-1, embedding_size])
39 |
40 | self.cos_q_qp = self.cosine(q, qp)
41 | self.cos_q_qn = self.cosine(q, qn)
42 | self.losses, loss_batch = self.margin_loss(self.zero, margin, self.cos_q_qp, self.cos_q_qn)
43 |
44 | correct = tf.equal(self.zero, loss_batch)
45 | self.accuracy = tf.reduce_mean(tf.cast(correct, "float"))
46 |
47 | def margin_loss(self, zero, margin, cos_q_qp, cos_q_qn):
48 | loss_batch = tf.maximum(zero, tf.subtract(margin, tf.subtract(cos_q_qp, cos_q_qn)))
49 | losses = tf.reduce_sum(loss_batch)
50 | return losses, loss_batch
51 |
52 | def logloss(self, y, v_one, sim):
53 | a = tf.multiply(y, tf.log(sim)) #y*log(p)
54 | b = tf.subtract(v_one, y)#1-y
55 | c = tf.log(tf.subtract(v_one, sim))#log(1-p)
56 | losses = -tf.add(a, tf.multiply(b, c))#y*log(p)+(1-y)*log(1-p)
57 | losses = tf.reduce_sum(losses, -1)
58 | return losses
59 |
60 | def cosine(self, t1, t2):
61 | len1 = tf.sqrt(tf.reduce_sum(tf.multiply(t1, t1), 1))
62 | len2 = tf.sqrt(tf.reduce_sum(tf.multiply(t2, t2), 1))
63 | multiply = tf.reduce_sum(tf.multiply(t1, t2), 1)
64 | cos = tf.div(multiply, tf.multiply(len1, len2))
65 | return tf.clip_by_value(cos, 1e-5, 0.99999)
66 |
67 | def get_constant(batch_size):
68 | one, zero = [1.0] * batch_size, [0.0] * batch_size
69 | return np.array(one), np.array(zero)
70 |
71 | margin = 0.05
72 | max_len = 200
73 | num_epoch = 200000
74 | batch_size = 256
75 | checkpoint_every = 50000
76 | vocab, embeddings = utils.load_embeddings()
77 | embedding_size = len(embeddings[0])
78 | train_data, test_data = utils.load_train_data(vocab, max_len), utils.load_test_data(vocab, max_len)
79 | print('load data done ......')
80 | print(embeddings.shape)
81 |
82 | prev_auc = 0.0
83 | with tf.Graph().as_default():
84 | session_conf = tf.ConfigProto(
85 | allow_soft_placement=True, log_device_placement=False)
86 | sess = tf.Session(config=session_conf)
87 | with sess.as_default():
88 | swem = SWEM_HIER(margin, max_len, len(vocab), embedding_size, embeddings)
89 | global_step = tf.Variable(0, name="global_step", trainable=False)
90 | optimizer = tf.train.AdamOptimizer(1e-1)
91 | #optimizer = tf.train.GradientDescentOptimizer(1e-1)
92 | grads_and_vars = optimizer.compute_gradients(swem.losses)
93 | train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)
94 |
95 | timestamp = str(int(time.time()))
96 | out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
97 | checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
98 | checkpoint_prefix = os.path.join(checkpoint_dir, "model")
99 | if not os.path.exists(checkpoint_dir):
100 | os.makedirs(checkpoint_dir)
101 | saver = tf.train.Saver(tf.all_variables())
102 | sess.run(tf.initialize_all_variables())
103 |
104 | def train_step():
105 | q, qp, qn = utils.gen_train_batch_qpn(train_data, batch_size)
106 | one, zero = get_constant(batch_size)
107 | feed_dict = {swem.q:q, swem.qp:qp, swem.qn:qn, swem.zero:zero}
108 | _, step, loss, cos, acc = sess.run(
109 | [train_op, global_step, swem.losses, swem.cos_q_qp, swem.accuracy], feed_dict)
110 | time_str = datetime.datetime.now().isoformat()
111 | print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, acc))
112 |
113 | def test_step():
114 | yp, y, group = [], [], []
115 | for i in range(0, len(test_data), batch_size):
116 | f, g, q1, q2 = utils.gen_test_batch_qpn(test_data, i, i+batch_size)
117 | one, zero = get_constant(len(f))
118 | feed_dict = {swem.q:q1, swem.qp:q2, swem.qn:q2, swem.zero:zero}
119 | loss, cos = sess.run([swem.losses, swem.cos_q_qp], feed_dict)
120 | yp.extend(cos)
121 | y.extend(f)
122 | group.extend(g)
123 | ppp = [(_y, _g, _yp) for _y, _g, _yp in zip(y, group, yp)]
124 | #for _y, _g, _yp in ppp:
125 | # print(str(_y) + ' ' + str(_g) + ' ' + str(_yp))
126 | return y[:len(test_data)], group[:len(test_data)], yp[:len(test_data)]
127 |
128 | for i in range(num_epoch):
129 | train_step()
130 | current_step = tf.train.global_step(sess, global_step)
131 | if current_step % checkpoint_every == 0:
132 | y, g, yp = test_step()
133 | auc = utils.eval_auc(y, g, yp)
134 |                 top1_prec = utils.eval_top1_prec(y, g, yp)
135 | #if auc < prev_auc:
136 | # _flist = [(_f, [s]) for s, _f in zip(score[:len(test_data)], flags)]
137 | # features.append(_flist)
138 | # break
139 | #prev_auc = auc
140 |
141 | #utils.save_features(features[0] + features[1] + features[2], './data/gen_sweg_hier_train.f')
142 | #utils.save_features(features[3], './data/gen_sweg_hier_test.f')
143 |
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import random, sys, config
3 | from sklearn import metrics
4 | from operator import itemgetter
5 | from itertools import groupby
6 |
7 | def load_embeddings():
8 | _data, embeddings, vocab, _id = [], [], {}, int(0)
9 | for line in open(config.w2v_bin_file):
10 | _data.append(line.strip().split(' '))
11 | size, dim = int(_data[0][0]), int(_data[0][1])
12 | for i in range(1, len(_data)):
13 | w, vec = _data[i][0], [float(_data[i][k]) for k in range(1, dim+1)]
14 | embeddings.append(vec)
15 | vocab[w] = _id
16 | _id += 1
17 | embeddings.append([0.01] * dim)
18 | vocab['UNKNOWN'] = _id
19 | _id += 1
20 | embeddings.append([0.01] * dim)
21 | vocab[''] = _id
22 | return vocab, np.array(embeddings)
23 |
24 | def encode_sent(s, vocab, max_len):
25 | ws = [w for w in s.split('_')]
26 | ws = ws[:max_len] if len(ws) >= max_len else ws + [''] * (max_len - len(ws))
27 | nws = []
28 | for w in ws:
29 | nw = w if w in vocab else 'UNKNOWN'
30 | nws.append(vocab[nw])
31 | return nws
32 |
33 | def load_train_data(vocab, max_len):
34 | if config.dataset == config.dataset_ins:
35 | return ins_load_train_data(vocab, max_len)
36 | if config.dataset == config.dataset_qur:
37 | return qur_load_train_test_data(config.train_file, vocab, max_len)
38 | print('bad load_train_data')
39 | exit(1)
40 |
41 | def qur_load_train_test_data(_file, vocab, max_len):
42 | _data = []
43 | for line in open(_file):
44 | f, q1, q2 = line.strip().split(' ')
45 | q1, q2 = encode_sent(q1, vocab, max_len), encode_sent(q2, vocab, max_len)
46 | _data.append((int(f), q1, q2))
47 | return _data
48 |
49 | def ins_load_train_data(vocab, max_len):
50 | _data = []
51 | for line in open(config.train_file):
52 | f, q1, q2 = line.strip().split(' ')
53 | q1, q2 = encode_sent(q1, vocab, max_len), encode_sent(q2, vocab, max_len)
54 | _data.append((q1, q2))
55 | return _data
56 |
57 | def load_test_data(vocab, max_len):
58 | if config.dataset == config.dataset_ins:
59 | return ins_load_test_data(vocab, max_len)
60 | if config.dataset == config.dataset_qur:
61 | return qur_load_train_test_data(config.test1_file, vocab, max_len)
62 | print('bad load_test_data')
63 | exit(1)
64 |
65 | def ins_load_test_data(vocab, max_len):
66 | _data = []
67 | for line in open(config.test1_file):
68 | f, g, q1, q2 = line.strip().split(' ')
69 | q1, q2 = encode_sent(q1, vocab, max_len), encode_sent(q2, vocab, max_len)
70 | _data.append((f, g, q1, q2))
71 | return _data
72 |
73 | def gen_train_batch_qpn(_data, batch_size):
74 | psample = random.sample(_data, batch_size)
75 | nsample = random.sample(_data, batch_size)
76 | q = [s1 for s1, s2 in psample]
77 | qp = [s2 for s1, s2 in psample]
78 | qn = [s2 for s1, s2 in nsample]
79 | return np.array(q), np.array(qp), np.array(qn)
80 |
81 | def gen_train_batch_yxx(_data, batch_size):
82 | if config.dataset == config.dataset_ins:
83 | return ins_gen_train_batch_yxx(_data, batch_size)
84 | if config.dataset == config.dataset_qur:
85 | return qur_gen_train_batch_yxx(_data, batch_size)
86 | print('bad gen_train_batch_yxx')
87 | exit(1)
88 |
89 | def qur_gen_train_batch_yxx(_data, batch_size):
90 | sample = random.sample(_data, batch_size)
91 | y = [i for i,_,_ in sample]
92 | x1 = [i for _,i,_ in sample]
93 | x2 = [i for _,_,i in sample]
94 | return np.array(y), np.array(x1), np.array(x2)
95 |
96 | def ins_gen_train_batch_yxx(_data, batch_size):
97 | part_one, part_two = int(batch_size / 4 * 3), int(batch_size / 4)
98 | psample = random.sample(_data, part_one)
99 | nsample = random.sample(_data, part_two)
100 | y = [1.0] * part_one + [0.0] * part_two
101 | x1 = [s1 for s1, s2 in psample] + [s1 for s1, s2 in psample[:part_two]]
102 | x2 = [s2 for s1, s2 in psample] + [s2 for s1, s2 in nsample]
103 | return np.array(y), np.array(x1), np.array(x2)
104 |
105 | def gen_test_batch_qpn(_data, start, end):
106 | sample = _data[start:end]
107 | for i in range(len(sample), end - start):
108 | sample.append(sample[-1])
109 | f = [int(i) for i,_,_,_ in sample]
110 | g = [int(i) for _,i,_,_ in sample]
111 | q1 = [i for _,_,i,_ in sample]
112 | q2 = [i for _,_,_,i in sample]
113 | return f, g, np.array(q1), np.array(q2)
114 |
115 | def gen_test_batch_yxx(_data, start, end):
116 | if config.dataset == config.dataset_ins:
117 | return ins_gen_test_batch_yxx(_data, start, end)
118 | if config.dataset == config.dataset_qur:
119 | return qur_gen_test_batch_yxx(_data, start, end)
120 | print('bad gen_test_batch_yxx')
121 | exit(1)
122 |
123 | def qur_gen_test_batch_yxx(_data, start, end):
124 | sample = _data[start:end]
125 | y = [i for i,_,_ in sample]
126 | x1 = [i for _,i,_ in sample]
127 | x2 = [i for _,_,i in sample]
128 | return y, y, np.array(x1), np.array(x2)
129 |
130 | def ins_gen_test_batch_yxx(_data, start, end):
131 | sample = _data[start:end]
132 | for i in range(len(sample), end - start):
133 | sample.append(sample[-1])
134 | f = [int(i) for i,_,_,_ in sample]
135 | g = [int(i) for _,i,_,_ in sample]
136 | q1 = [i for _,_,i,_ in sample]
137 | q2 = [i for _,_,_,i in sample]
138 | return f, g, np.array(q1), np.array(q2)
139 |
140 | def _eval(y, g, yp):
141 | if config.dataset == config.dataset_ins:
142 | eval_auc(y, g, yp)
143 | eval_top1_prec(y, g, yp)
144 | if config.dataset == config.dataset_qur:
145 | eval_auc(y, g, yp)
146 | eval_best_prec(y, g, yp)
147 |
148 | def eval_best_prec(y, g, yp):
149 | best_p, best_s = 0.0, 0.0
150 | for i in range(50, 100, 1):
151 | i = float(i) / 100
152 | positive = 0
153 | for _y, _yp in zip(y, yp):
154 | p = 1 if _yp >= i else 0
155 | if p == _y: positive += 1
156 | prec = positive / len(yp)
157 | if prec > best_p:
158 | best_p = prec
159 | best_s = i
160 | print('best_prec: ' + str(best_p) + ' best_threshold:' + str(best_s))
161 | return best_p, best_s
162 |
163 | def eval_auc(y, g, yp):
164 | auc = metrics.roc_auc_score(y, yp)
165 | print('auc: ' + str(auc))
166 | return auc
167 |
168 | def eval_top1_prec(y, g, yp):
169 | _list = [(_y, _g, _yp) for _y, _g, _yp in zip(y, g, yp)]
170 | _dict = {}
171 | for _y, _g, _yp in _list:
172 | if not _g in _dict: _dict[_g] = []
173 | _dict[_g].append((_y, _g, _yp))
174 | positive, gc = 0 , 0
175 | for _, group in _dict.items():
176 | group = sorted(group, key=itemgetter(2), reverse=True)
177 | gc += 1
178 | if group[0][0] == 1:
179 | positive += 1
180 | prec = positive / gc
181 | print('top1 precision ' + str(positive) + '/' + str(gc) + ': '+ str(positive / gc))
182 | return prec
183 |
184 |
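185 | # Typical usage of this module (as wired up in swem/swem_hier.py and swem/swem_max_margin.py):
186 | #   vocab, embeddings = load_embeddings()
187 | #   train_data = load_train_data(vocab, max_len)
188 | #   test_data  = load_test_data(vocab, max_len)
189 | #   q, qp, qn  = gen_train_batch_qpn(train_data, batch_size)   # margin-loss models
190 | #   y, x1, x2  = gen_train_batch_yxx(train_data, batch_size)   # logloss models
191 | #   _eval(y, group, yp)   # AUC plus top-1 precision (insurance) or best-threshold precision (quora)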
--------------------------------------------------------------------------------
/cnn/tensorflow/insqa_cnn.py.old:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import numpy as np
3 |
4 | ##########################################################################
5 | # embedding_lookup + cnn + cosine margin loss, batch version
6 | ##########################################################################
7 | class InsQACNN1(object):
8 | def __init__(
9 | self, sequence_length, batch_size,
10 | vocab_size, embedding_size,
11 | filter_sizes, num_filters, l2_reg_lambda=0.0):
12 |
13 |         # user question (word ids, embedded via embedding_lookup)
14 |         self.input_x_1 = tf.placeholder(tf.int32, [batch_size, sequence_length], name="input_x_1")
15 |         # positive candidate to match against the question
16 |         self.input_x_2 = tf.placeholder(tf.int32, [batch_size, sequence_length], name="input_x_2")
17 |         # negative candidate
18 |         self.input_x_3 = tf.placeholder(tf.int32, [batch_size, sequence_length], name="input_x_3")
19 | self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
20 | l2_loss = tf.constant(0.0)
21 | print("input_x_1 ", self.input_x_1)
22 |
23 | # Embedding layer
24 | with tf.device('/cpu:0'), tf.name_scope("embedding"):
25 | W = tf.Variable(
26 | tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0),
27 | name="W")
28 | chars_1 = tf.nn.embedding_lookup(W, self.input_x_1)
29 | chars_2 = tf.nn.embedding_lookup(W, self.input_x_2)
30 | chars_3 = tf.nn.embedding_lookup(W, self.input_x_3)
31 | #self.embedded_chars_1 = tf.nn.dropout(chars_1, self.dropout_keep_prob)
32 | #self.embedded_chars_2 = tf.nn.dropout(chars_2, self.dropout_keep_prob)
33 | #self.embedded_chars_3 = tf.nn.dropout(chars_3, self.dropout_keep_prob)
34 | self.embedded_chars_1 = chars_1
35 | self.embedded_chars_2 = chars_2
36 | self.embedded_chars_3 = chars_3
37 | self.embedded_chars_expanded_1 = tf.expand_dims(self.embedded_chars_1, -1)
38 | self.embedded_chars_expanded_2 = tf.expand_dims(self.embedded_chars_2, -1)
39 | self.embedded_chars_expanded_3 = tf.expand_dims(self.embedded_chars_3, -1)
40 |
41 | pooled_outputs_1 = []
42 | pooled_outputs_2 = []
43 | pooled_outputs_3 = []
44 | for i, filter_size in enumerate(filter_sizes):
45 | with tf.name_scope("conv-maxpool-%s" % filter_size):
46 | filter_shape = [filter_size, embedding_size, 1, num_filters]
47 | W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
48 | b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
49 | conv = tf.nn.conv2d(
50 | self.embedded_chars_expanded_1,
51 | W,
52 | strides=[1, 1, 1, 1],
53 | padding='VALID',
54 | name="conv-1"
55 | )
56 | h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu-1")
57 | pooled = tf.nn.max_pool(
58 | h,
59 | ksize=[1, sequence_length - filter_size + 1, 1, 1],
60 | strides=[1, 1, 1, 1],
61 | padding='VALID',
62 | name="poll-1"
63 | )
64 | pooled_outputs_1.append(pooled)
65 |
66 | conv = tf.nn.conv2d(
67 | self.embedded_chars_expanded_2,
68 | W,
69 | strides=[1, 1, 1, 1],
70 | padding='VALID',
71 | name="conv-2"
72 | )
73 | h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu-2")
74 | pooled = tf.nn.max_pool(
75 | h,
76 | ksize=[1, sequence_length - filter_size + 1, 1, 1],
77 | strides=[1, 1, 1, 1],
78 | padding='VALID',
79 | name="poll-2"
80 | )
81 | pooled_outputs_2.append(pooled)
82 |
83 | conv = tf.nn.conv2d(
84 | self.embedded_chars_expanded_3,
85 | W,
86 | strides=[1, 1, 1, 1],
87 | padding='VALID',
88 | name="conv-3"
89 | )
90 | h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu-3")
91 | pooled = tf.nn.max_pool(
92 | h,
93 | ksize=[1, sequence_length - filter_size + 1, 1, 1],
94 | strides=[1, 1, 1, 1],
95 | padding='VALID',
96 | name="poll-3"
97 | )
98 | pooled_outputs_3.append(pooled)
99 | num_filters_total = num_filters * len(filter_sizes)
100 | pooled_reshape_1 = tf.reshape(tf.concat(pooled_outputs_1, 3), [-1, num_filters_total])
101 | pooled_reshape_2 = tf.reshape(tf.concat(pooled_outputs_2, 3), [-1, num_filters_total])
102 | pooled_reshape_3 = tf.reshape(tf.concat(pooled_outputs_3, 3), [-1, num_filters_total])
103 | #dropout
104 | pooled_flat_1 = tf.nn.dropout(pooled_reshape_1, self.dropout_keep_prob)
105 | pooled_flat_2 = tf.nn.dropout(pooled_reshape_2, self.dropout_keep_prob)
106 | pooled_flat_3 = tf.nn.dropout(pooled_reshape_3, self.dropout_keep_prob)
107 |
108 |         pooled_len_1 = tf.sqrt(tf.reduce_sum(tf.multiply(pooled_flat_1, pooled_flat_1), 1)) # vector lengths (batch mode)
109 | pooled_len_2 = tf.sqrt(tf.reduce_sum(tf.multiply(pooled_flat_2, pooled_flat_2), 1))
110 | pooled_len_3 = tf.sqrt(tf.reduce_sum(tf.multiply(pooled_flat_3, pooled_flat_3), 1))
111 |         pooled_mul_12 = tf.reduce_sum(tf.multiply(pooled_flat_1, pooled_flat_2), 1) # dot products (batch mode)
112 | pooled_mul_13 = tf.reduce_sum(tf.multiply(pooled_flat_1, pooled_flat_3), 1)
113 |
114 | with tf.name_scope("output"):
115 |             self.cos_12 = tf.div(pooled_mul_12, tf.multiply(pooled_len_1, pooled_len_2), name="scores") # cosine similarity (batch mode)
116 | self.cos_13 = tf.div(pooled_mul_13, tf.multiply(pooled_len_1, pooled_len_3))
117 |
118 | zero = tf.constant(0, shape=[batch_size], dtype=tf.float32)
119 | margin = tf.constant(0.05, shape=[batch_size], dtype=tf.float32)
120 | with tf.name_scope("loss"):
121 | self.losses = tf.maximum(zero, tf.subtract(margin, tf.subtract(self.cos_12, self.cos_13)))
122 | self.loss = tf.reduce_sum(self.losses) + l2_reg_lambda * l2_loss
123 | print('loss ', self.loss)
124 |
125 | # Accuracy
126 | with tf.name_scope("accuracy"):
127 | self.correct = tf.equal(zero, self.losses)
128 | self.accuracy = tf.reduce_mean(tf.cast(self.correct, "float"), name="accuracy")
129 | for v in tf.trainable_variables():
130 | print(v)
131 | exit(1)
132 |
--------------------------------------------------------------------------------
/swem/swem_hier_margin.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 | import time, os, random, datetime, sys
4 | from sklearn import metrics
5 | sys.path.append('../')
6 | import config, utils
7 |
8 | #top 1 precision:54%
9 | class SWEM_HIER(object):
10 | def __init__(self,
11 | margin,
12 | sequence_length,
13 | vocab_size,
14 | embedding_size,
15 | embeddings):
16 | self.zero = tf.placeholder(tf.float32, [None])
17 | self.q = tf.placeholder(tf.int32, [None, sequence_length])
18 | self.qp = tf.placeholder(tf.int32, [None, sequence_length])
19 | self.qn = tf.placeholder(tf.int32, [None, sequence_length])
20 |
21 | with tf.device('/cpu:0'), tf.name_scope('embedding'):
22 | self.word_mat = tf.Variable(embeddings, trainable=True, dtype=tf.float32)
23 | q_mat = tf.nn.embedding_lookup(self.word_mat, self.q)
24 | qp_mat = tf.nn.embedding_lookup(self.word_mat, self.qp)
25 | qn_mat = tf.nn.embedding_lookup(self.word_mat, self.qn)
26 | self.q_mat_exp = tf.expand_dims(q_mat, -1)
27 | self.qp_mat_exp = tf.expand_dims(qp_mat, -1)
28 | self.qn_mat_exp = tf.expand_dims(qn_mat, -1)
29 |
30 | self.word_mat1 = tf.Variable(embeddings, trainable=True, dtype=tf.float32)
31 | q_mat1 = tf.nn.embedding_lookup(self.word_mat1, self.q)
32 | qp_mat1 = tf.nn.embedding_lookup(self.word_mat1, self.qp)
33 | qn_mat1 = tf.nn.embedding_lookup(self.word_mat1, self.qn)
34 | self.q_mat_exp1 = tf.expand_dims(q_mat1, -1)
35 | self.qp_mat_exp1 = tf.expand_dims(qp_mat1, -1)
36 | self.qn_mat_exp1 = tf.expand_dims(qn_mat1, -1)
37 |
38 | q = tf.nn.avg_pool(self.q_mat_exp, ksize=[1, 2, 1, 1],
39 | strides=[1, 1, 1, 1], padding='VALID')
40 | qp = tf.nn.avg_pool(self.qp_mat_exp, ksize=[1, 2, 1, 1],
41 | strides=[1, 1, 1, 1], padding='VALID')
42 | qn = tf.nn.avg_pool(self.qn_mat_exp, ksize=[1, 2, 1, 1],
43 | strides=[1, 1, 1, 1], padding='VALID')
44 | q = tf.reshape(tf.reduce_max(q, 1), [-1, embedding_size])
45 | qp = tf.reshape(tf.reduce_max(qp, 1), [-1, embedding_size])
46 | qn = tf.reshape(tf.reduce_max(qn, 1), [-1, embedding_size])
47 |
48 | q1 = tf.nn.avg_pool(self.q_mat_exp1, ksize=[1, 1, 1, 1],
49 | strides=[1, 1, 1, 1], padding='VALID')
50 | qp1 = tf.nn.avg_pool(self.qp_mat_exp1, ksize=[1, 1, 1, 1],
51 | strides=[1, 1, 1, 1], padding='VALID')
52 | qn1 = tf.nn.avg_pool(self.qn_mat_exp1, ksize=[1, 1, 1, 1],
53 | strides=[1, 1, 1, 1], padding='VALID')
54 | q1 = tf.reshape(tf.reduce_max(q1, 1), [-1, embedding_size])
55 | qp1 = tf.reshape(tf.reduce_max(qp1, 1), [-1, embedding_size])
56 | qn1 = tf.reshape(tf.reduce_max(qn1, 1), [-1, embedding_size])
57 |
58 | q = tf.concat([q, q1], 1)
59 | qp = tf.concat([qp, qp1], 1)
60 | qn = tf.concat([qn, qn1], 1)
61 |
62 | self.cos_q_qp = self.cosine(q, qp)
63 | self.cos_q_qn = self.cosine(q, qn)
64 |
65 | self.losses, loss_batch = self.margin_loss(self.zero, margin, self.cos_q_qp, self.cos_q_qn)
66 |
67 | correct = tf.equal(self.zero, loss_batch)
68 | self.accuracy = tf.reduce_mean(tf.cast(correct, "float"))
69 |
70 | def margin_loss(self, zero, margin, cos_q_qp, cos_q_qn):
71 | loss_batch = tf.maximum(zero, tf.subtract(margin, tf.subtract(cos_q_qp, cos_q_qn)))
72 | losses = tf.reduce_sum(loss_batch)
73 | return losses, loss_batch
74 |
75 | def logloss(self, y, v_one, sim):
76 | a = tf.multiply(y, tf.log(sim)) #y*log(p)
77 | b = tf.subtract(v_one, y)#1-y
78 | c = tf.log(tf.subtract(v_one, sim))#log(1-p)
79 | losses = -tf.add(a, tf.multiply(b, c))#y*log(p)+(1-y)*log(1-p)
80 | losses = tf.reduce_sum(losses, -1)
81 | return losses
82 |
83 | def cosine(self, t1, t2):
84 | len1 = tf.sqrt(tf.reduce_sum(tf.multiply(t1, t1), 1))
85 | len2 = tf.sqrt(tf.reduce_sum(tf.multiply(t2, t2), 1))
86 | multiply = tf.reduce_sum(tf.multiply(t1, t2), 1)
87 | cos = tf.div(multiply, tf.multiply(len1, len2))
88 | return tf.clip_by_value(cos, 1e-5, 0.99999)
89 |
90 | def get_constant(batch_size):
91 | one, zero = [1.0] * batch_size, [0.0] * batch_size
92 | return np.array(one), np.array(zero)
93 |
94 | margin = 0.05
95 | max_len = 200
96 | num_epoch = 200000
97 | batch_size = 256
98 | checkpoint_every = 50000
99 | vocab, embeddings = utils.load_embeddings()
100 | embedding_size = len(embeddings[0])
101 | train_data, test_data = utils.load_train_data(vocab, max_len), utils.load_test_data(vocab, max_len)
102 | print('load data done ......')
103 | print(embeddings.shape)
104 |
105 | prev_auc = 0.0
106 | with tf.Graph().as_default():
107 | session_conf = tf.ConfigProto(
108 | allow_soft_placement=True, log_device_placement=False)
109 | sess = tf.Session(config=session_conf)
110 | with sess.as_default():
111 | swem = SWEM_HIER(margin, max_len, len(vocab), embedding_size, embeddings)
112 | global_step = tf.Variable(0, name="global_step", trainable=False)
113 | optimizer = tf.train.AdamOptimizer(1e-1)
114 | #optimizer = tf.train.GradientDescentOptimizer(1e-1)
115 | grads_and_vars = optimizer.compute_gradients(swem.losses)
116 | train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)
117 |
118 | timestamp = str(int(time.time()))
119 | out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
120 | checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
121 | checkpoint_prefix = os.path.join(checkpoint_dir, "model")
122 | if not os.path.exists(checkpoint_dir):
123 | os.makedirs(checkpoint_dir)
124 | saver = tf.train.Saver(tf.all_variables())
125 | sess.run(tf.initialize_all_variables())
126 |
127 | def train_step():
128 | q, qp, qn = utils.gen_train_batch_qpn(train_data, batch_size)
129 | one, zero = get_constant(batch_size)
130 | feed_dict = {swem.q:q, swem.qp:qp, swem.qn:qn, swem.zero:zero}
131 | _, step, loss, cos, acc = sess.run(
132 | [train_op, global_step, swem.losses, swem.cos_q_qp, swem.accuracy], feed_dict)
133 | time_str = datetime.datetime.now().isoformat()
134 |             print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, acc))
135 |
136 | def test_step():
137 | yp, y, group = [], [], []
138 | for i in range(0, len(test_data), batch_size):
139 | f, g, q1, q2 = utils.gen_test_batch_qpn(test_data, i, i+batch_size)
140 | one, zero = get_constant(len(f))
141 | feed_dict = {swem.q:q1, swem.qp:q2, swem.qn:q2, swem.zero:zero}
142 | loss, cos = sess.run([swem.losses, swem.cos_q_qp], feed_dict)
143 | yp.extend(cos)
144 | y.extend(f)
145 | group.extend(g)
146 | ppp = [(_y, _g, _yp) for _y, _g, _yp in zip(y, group, yp)]
147 | #for _y, _g, _yp in ppp:
148 | # print(str(_y) + ' ' + str(_g) + ' ' + str(_yp))
149 | return y[:len(test_data)], group[:len(test_data)], yp[:len(test_data)]
150 |
151 | for i in range(num_epoch):
152 | train_step()
153 | current_step = tf.train.global_step(sess, global_step)
154 | if current_step % checkpoint_every == 0:
155 | y, g, yp = test_step()
156 | auc = utils.eval_auc(y, g, yp)
157 |                 top1_prec = utils.eval_top1_prec(y, g, yp)
158 | #if auc < prev_auc:
159 | # _flist = [(_f, [s]) for s, _f in zip(score[:len(test_data)], flags)]
160 | # features.append(_flist)
161 | # break
162 | #prev_auc = auc
163 |
164 | #utils.save_features(features[0] + features[1] + features[2], './data/gen_sweg_hier_train.f')
165 | #utils.save_features(features[3], './data/gen_sweg_hier_test.f')
166 |
--------------------------------------------------------------------------------
/cnn/tensorflow/insqa_train.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python3.4
2 |
3 | import tensorflow as tf
4 | import numpy as np
5 | import os, time, datetime, operator, sys
6 | from insqa_cnn import InsQACNN
7 | sys.path.append('../../')
8 | import config, utils
9 |
10 | print(tf.__version__)
11 |
12 | # Parameters
13 | # ==================================================
14 |
15 | # Model Hyperparameters
16 | tf.flags.DEFINE_float("margin", 0.05, "CNN model margin")
17 | tf.flags.DEFINE_integer("sequence_length", 200, "Max sequence lehgth(default: 200)")
18 | tf.flags.DEFINE_integer("embedding_dim", 100, "Dimensionality of character embedding (default: 128)")
19 | tf.flags.DEFINE_string("filter_sizes", "1,2,3,5", "Comma-separated filter sizes (default: '3,4,5')")
20 | tf.flags.DEFINE_integer("num_filters", 256, "Number of filters per filter size (default: 128)")
21 | tf.flags.DEFINE_float("dropout_keep_prob", 1.0, "Dropout keep probability (default: 0.5)")
22 | tf.flags.DEFINE_float("l2_reg_lambda", 0, "L2 regularizaion lambda (default: 0.0)")
23 |
24 | # Training parameters
25 | tf.flags.DEFINE_integer("batch_size", 256, "Batch Size (default: 64)")
26 | tf.flags.DEFINE_integer("num_epochs", 5000000, "Number of training epochs (default: 200)")
27 | tf.flags.DEFINE_integer("evaluate_every", 3000, "Evaluate model on dev set after this many steps (default: 100)")
28 | tf.flags.DEFINE_integer("checkpoint_every", 3000, "Save model after this many steps (default: 100)")
29 | # Misc Parameters
30 | tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
31 | tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")
32 | FLAGS = tf.flags.FLAGS
33 | FLAGS._parse_flags()
34 | print("\nParameters:")
35 | for attr, value in sorted(FLAGS.__flags.items()):
36 | print("{}={}".format(attr.upper(), value))
37 | print("")
38 |
39 | # Data Preparation
40 | # ==================================================
41 |
42 | # Load data
43 | print("Loading data...")
44 | vocab, embeddings = utils.load_embeddings()
45 | train_data = utils.load_train_data(vocab, FLAGS.sequence_length)
46 | test_data = utils.load_test_data(vocab, FLAGS.sequence_length)
47 | print("Load done...")
48 |
49 | # Training
50 | # ==================================================
51 |
52 | prev_auc = 0
53 | with tf.Graph().as_default():
54 | with tf.device("/gpu:1"):
55 | session_conf = tf.ConfigProto(
56 | allow_soft_placement=FLAGS.allow_soft_placement,
57 | log_device_placement=FLAGS.log_device_placement)
58 | sess = tf.Session(config=session_conf)
59 | with sess.as_default():
60 | cnn = InsQACNN(
61 | _margin=FLAGS.margin,
62 | sequence_length=FLAGS.sequence_length,
63 | batch_size=FLAGS.batch_size,
64 | vocab_size=len(vocab),
65 | embedding_size=FLAGS.embedding_dim,
66 | filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
67 | num_filters=FLAGS.num_filters,
68 | l2_reg_lambda=FLAGS.l2_reg_lambda)
69 |
70 | # Define Training procedure
71 | global_step = tf.Variable(0, name="global_step", trainable=False)
72 | optimizer = tf.train.AdamOptimizer(1e-1)
73 | #optimizer = tf.train.GradientDescentOptimizer(1e-2)
74 | grads_and_vars = optimizer.compute_gradients(cnn.loss)
75 | train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)
76 |
77 | # Keep track of gradient values and sparsity (optional)
78 | grad_summaries = []
79 | for g, v in grads_and_vars:
80 | if g is not None:
81 | grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g)
82 | sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
83 | grad_summaries.append(grad_hist_summary)
84 | grad_summaries.append(sparsity_summary)
85 | grad_summaries_merged = tf.summary.merge(grad_summaries)
86 |
87 | # Output directory for models and summaries
88 | timestamp = str(int(time.time()))
89 | out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
90 | print("Writing to {}\n".format(out_dir))
91 |
92 | # Summaries for loss and accuracy
93 | loss_summary = tf.summary.scalar("loss", cnn.loss)
94 | acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)
95 |
96 | # Train Summaries
97 | train_summary_op = tf.summary.merge([loss_summary, acc_summary, grad_summaries_merged])
98 | train_summary_dir = os.path.join(out_dir, "summaries", "train")
99 | train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph_def)
100 |
101 | # Dev summaries
102 | dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
103 | dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
104 | dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph_def)
105 |
106 | # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
107 | checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
108 | checkpoint_prefix = os.path.join(checkpoint_dir, "model")
109 | if not os.path.exists(checkpoint_dir):
110 | os.makedirs(checkpoint_dir)
111 | saver = tf.train.Saver(tf.all_variables())
112 |
113 | # Initialize all variables
114 | sess.run(tf.initialize_all_variables())
115 |
116 | def train_step(q, qp, qn):
117 | feed_dict = {
118 | cnn.q: q, cnn.qp: qp, cnn.qn: qn,
119 | #cnn.input_x_1: q, cnn.input_x_2: qp, cnn.input_x_3: qn,
120 | cnn.dropout_keep_prob: FLAGS.dropout_keep_prob
121 | }
122 | _, step, summaries, loss, accuracy, cos1, cos2 = sess.run(
123 | [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy, cnn.cos_q_qp, cnn.cos_q_qn],
124 | feed_dict)
125 | #print(cos1)
126 | #print(cos2)
127 | time_str = datetime.datetime.now().isoformat()
128 | print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
129 | train_summary_writer.add_summary(summaries, step)
130 |
131 | def test_step():
132 | yp, y, group, of = [], [], [], open(config.predict1_file, 'w')
133 | for i in range(0, len(test_data), FLAGS.batch_size):
134 | f, g, q1, q2 = utils.gen_test_batch_qpn(test_data, i, i+FLAGS.batch_size)
135 | feed_dict = {
136 | cnn.q: q1, cnn.qp: q2, cnn.qn: q2,
137 | #cnn.input_x_1: q1, cnn.input_x_2: q2, cnn.input_x_3: q2,
138 | cnn.dropout_keep_prob: 1.0
139 | }
140 | cos = sess.run([cnn.cos_q_qp], feed_dict)
141 | yp.extend(cos[0])
142 | y.extend(f)
143 | group.extend(g)
144 | y, g, yp = y[:len(test_data)], group[:len(test_data)], yp[:len(test_data)]
145 |             auc = utils.eval_auc(y, g, yp)
146 |             top1_prec = utils._eval_top1_prec(y, g, yp)
147 |             for p in yp: of.write(str(p) + '\n')
148 | of.write(str(top1_prec) + '\n')
149 | of.close()
150 | return auc
151 |
152 | # Generate batches
153 | # Training loop. For each batch...
154 | for i in range(FLAGS.num_epochs):
155 | try:
156 | q, qp, qn = utils.gen_train_batch_qpn(train_data, FLAGS.batch_size)
157 | train_step(q, qp, qn)
158 | current_step = tf.train.global_step(sess, global_step)
159 | if current_step % FLAGS.evaluate_every == 0:
160 | auc = test_step()
161 | #if auc < prev_auc: break
162 | prev_auc = auc
163 | if current_step % FLAGS.checkpoint_every == 0:
164 | path = saver.save(sess, checkpoint_prefix, global_step=current_step)
165 | print("Saved model checkpoint to {}\n".format(path))
166 | except Exception as e:
167 | print(e)
168 |
--------------------------------------------------------------------------------
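
Note: both training scripts in this directory optimize the same pairwise objective: the cosine between a question and its positive answer should exceed the cosine to a sampled negative answer by at least the margin. A minimal NumPy sketch of that loss and of the accuracy reported during training (function and variable names are illustrative only):

    import numpy as np

    def cosine(a, b):
        # row-wise cosine similarity between two [batch, dim] matrices
        return np.sum(a * b, axis=1) / (np.linalg.norm(a, axis=1) * np.linalg.norm(b, axis=1))

    def max_margin_loss(q, a_pos, a_neg, margin=0.05):
        # a triple contributes zero loss once cos(q, a+) beats cos(q, a-) by the margin
        diff = np.maximum(0.0, margin - cosine(q, a_pos) + cosine(q, a_neg))
        accuracy = np.mean(diff == 0.0)
        return diff.sum(), accuracy

    q, a_pos, a_neg = (np.random.rand(4, 8) for _ in range(3))
    print(max_margin_loss(q, a_pos, a_neg))
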
/cnn/tensorflow/insurance_qa_data_helpers.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import random
3 |
4 | empty_vector = []
5 | for i in range(0, 100):
6 | empty_vector.append(float(0.0))
7 | onevector = []
8 | for i in range(0, 10):
9 | onevector.append(float(1))
10 | zerovector = []
11 | for i in range(0, 10):
12 | zerovector.append(float(0))
13 |
14 | def build_vocab():
15 | code = int(0)
16 | vocab = {}
17 | vocab['UNKNOWN'] = code
18 | code += 1
19 | for line in open('/export/jw/cnn/insuranceQA/train'):
20 | items = line.strip().split(' ')
21 | for i in range(2, 4):
22 | words = items[i].split('_')
23 | for word in words:
24 | if not word in vocab:
25 | vocab[word] = code
26 | code += 1
27 | for line in open('/export/jw/cnn/insuranceQA/test1'):
28 | items = line.strip().split(' ')
29 | for i in range(2, 4):
30 | words = items[i].split('_')
31 | for word in words:
32 | if not word in vocab:
33 | vocab[word] = code
34 | code += 1
35 | return vocab
36 |
37 | def rand_qa(qalist):
38 | index = random.randint(0, len(qalist) - 1)
39 | return qalist[index]
40 |
41 | def read_alist():
42 | alist = []
43 | for line in open('/export/jw/cnn/insuranceQA/train'):
44 | items = line.strip().split(' ')
45 | alist.append(items[3])
46 | print('read_alist done ......')
47 | return alist
48 |
49 | def vocab_plus_overlap(vectors, sent, over, size):
50 | global onevector
51 | global zerovector
52 | oldict = {}
53 | words = over.split('_')
54 | if len(words) < size:
55 | size = len(words)
56 | for i in range(0, size):
57 | if words[i] == '':
58 | continue
59 | oldict[words[i]] = '#'
60 | matrix = []
61 | words = sent.split('_')
62 | if len(words) < size:
63 | size = len(words)
64 | for i in range(0, size):
65 | vec = read_vector(vectors, words[i])
66 | newvec = vec.copy()
67 | #if words[i] in oldict:
68 | # newvec += onevector
69 | #else:
70 | # newvec += zerovector
71 | matrix.append(newvec)
72 | return matrix
73 |
74 | def load_vectors():
75 | vectors = {}
76 | for line in open('/export/jw/cnn/insuranceQA/vectors.nobin'):
77 | items = line.strip().split(' ')
78 | if (len(items) < 101):
79 | continue
80 | vec = []
81 | for i in range(1, 101):
82 | vec.append(float(items[i]))
83 | vectors[items[0]] = vec
84 | return vectors
85 |
86 | def read_vector(vectors, word):
87 | global empty_vector
88 | if word in vectors:
89 | return vectors[word]
90 | else:
91 | return empty_vector
92 | #return vectors['']
93 |
94 | def load_test_and_vectors():
95 | testList = []
96 | for line in open('/export/jw/cnn/insuranceQA/test1'):
97 | testList.append(line.strip())
98 | vectors = load_vectors()
99 | return testList, vectors
100 |
101 | def load_train_and_vectors():
102 | trainList = []
103 | for line in open('/export/jw/cnn/insuranceQA/train'):
104 | trainList.append(line.strip())
105 | vectors = load_vectors()
106 | return trainList, vectors
107 |
108 | def load_data_val_10(testList, vectors, index):
109 | x_train_1 = []
110 | x_train_2 = []
111 | x_train_3 = []
112 | items = testList[index].split(' ')
113 | x_train_1.append(vocab_plus_overlap(vectors, items[2], items[3], 200))
114 | x_train_2.append(vocab_plus_overlap(vectors, items[3], items[2], 200))
115 | x_train_3.append(vocab_plus_overlap(vectors, items[3], items[2], 200))
116 | return np.array(x_train_1), np.array(x_train_2), np.array(x_train_3)
117 |
118 | def read_raw():
119 | raw = []
120 | for line in open('/export/jw/cnn/insuranceQA/train'):
121 | items = line.strip().split(' ')
122 | if items[0] == '1':
123 | raw.append(items)
124 | return raw
125 |
126 | def encode_sent(vocab, string, size):
127 | x = []
128 | words = string.split('_')
129 |     for i in range(0, size):
130 |         if i < len(words) and words[i] in vocab:
131 | x.append(vocab[words[i]])
132 | else:
133 | x.append(vocab['UNKNOWN'])
134 | return x
135 |
136 | def load_data_6(vocab, alist, raw, size):
137 | x_train_1 = []
138 | x_train_2 = []
139 | x_train_3 = []
140 | for i in range(0, size):
141 | items = raw[random.randint(0, len(raw) - 1)]
142 | nega = rand_qa(alist)
143 | x_train_1.append(encode_sent(vocab, items[2], 100))
144 | x_train_2.append(encode_sent(vocab, items[3], 100))
145 | x_train_3.append(encode_sent(vocab, nega, 100))
146 | return np.array(x_train_1), np.array(x_train_2), np.array(x_train_3)
147 |
148 | def load_data_val_6(testList, vocab, index, batch):
149 | x_train_1 = []
150 | x_train_2 = []
151 | x_train_3 = []
152 | for i in range(0, batch):
153 | true_index = index + i
154 | if (true_index >= len(testList)):
155 | true_index = len(testList) - 1
156 | items = testList[true_index].split(' ')
157 | x_train_1.append(encode_sent(vocab, items[2], 100))
158 | x_train_2.append(encode_sent(vocab, items[3], 100))
159 | x_train_3.append(encode_sent(vocab, items[3], 100))
160 | return np.array(x_train_1), np.array(x_train_2), np.array(x_train_3)
161 |
162 | def load_data_9(trainList, vectors, size):
163 | x_train_1 = []
164 | x_train_2 = []
165 | y_train = []
166 | for i in range(0, size):
167 | pos = trainList[random.randint(0, len(trainList) - 1)]
168 | posItems = pos.strip().split(' ')
169 | x_train_1.append(vocab_plus_overlap(vectors, posItems[2], posItems[3], 200))
170 | x_train_2.append(vocab_plus_overlap(vectors, posItems[3], posItems[2], 200))
171 | y_train.append([1, 0])
172 | neg = trainList[random.randint(0, len(trainList) - 1)]
173 | negItems = neg.strip().split(' ')
174 | x_train_1.append(vocab_plus_overlap(vectors, posItems[2], negItems[3], 200))
175 | x_train_2.append(vocab_plus_overlap(vectors, negItems[3], posItems[2], 200))
176 | y_train.append([0, 1])
177 | return np.array(x_train_1), np.array(x_train_2), np.array(y_train)
178 |
179 | def load_data_val_9(testList, vectors, index):
180 | x_train_1 = []
181 | x_train_2 = []
182 | items = testList[index].split(' ')
183 | x_train_1.append(vocab_plus_overlap(vectors, items[2], items[3], 200))
184 | x_train_2.append(vocab_plus_overlap(vectors, items[3], items[2], 200))
185 | return np.array(x_train_1), np.array(x_train_2)
186 |
187 | def load_data_10(vectors, qalist, raw, size):
188 | x_train_1 = []
189 | x_train_2 = []
190 | x_train_3 = []
191 | items = raw[random.randint(0, len(raw) - 1)]
192 | nega = rand_qa(qalist)
193 | x_train_1.append(vocab_plus_overlap(vectors, items[2], items[3], 200))
194 | x_train_2.append(vocab_plus_overlap(vectors, items[3], items[2], 200))
195 | x_train_3.append(vocab_plus_overlap(vectors, nega, items[2], 200))
196 | return np.array(x_train_1), np.array(x_train_2), np.array(x_train_3)
197 |
198 | def load_data_11(vectors, qalist, raw, size):
199 | x_train_1 = []
200 | x_train_2 = []
201 | x_train_3 = []
202 | items = raw[random.randint(0, len(raw) - 1)]
203 | nega = rand_qa(qalist)
204 | x_train_1.append(vocab_plus_overlap(vectors, items[2], items[3], 200))
205 | x_train_2.append(vocab_plus_overlap(vectors, items[3], items[2], 200))
206 | x_train_3.append(vocab_plus_overlap(vectors, nega, items[2], 200))
207 | return np.array(x_train_1), np.array(x_train_2), np.array(x_train_3)
208 |
209 | def batch_iter(data, batch_size, num_epochs, shuffle=True):
210 | data = np.array(data)
211 | data_size = len(data)
212 | num_batches_per_epoch = int(len(data)/batch_size) + 1
213 | for epoch in range(num_epochs):
214 | # Shuffle the data at each epoch
215 | if shuffle:
216 | shuffle_indices = np.random.permutation(np.arange(data_size))
217 | shuffled_data = data[shuffle_indices]
218 | else:
219 | shuffled_data = data
220 | for batch_num in range(num_batches_per_epoch):
221 | start_index = batch_num * batch_size
222 | end_index = min((batch_num + 1) * batch_size, data_size)
223 | yield shuffled_data[start_index:end_index]
224 |
225 |
226 |
--------------------------------------------------------------------------------
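
Note: encode_sent above turns an underscore-joined sentence into a fixed-length list of word ids, falling back to vocab['UNKNOWN'] for out-of-vocabulary or missing positions. A small self-contained illustration (the toy vocabulary below is made up):

    vocab = {'UNKNOWN': 0, 'does': 1, 'insurance': 2, 'cover': 3, 'flood': 4}

    def encode(sent, size):
        words = sent.split('_')
        return [vocab.get(words[i], vocab['UNKNOWN']) if i < len(words) else vocab['UNKNOWN']
                for i in range(size)]

    print(encode('does_insurance_cover_flood', 6))   # [1, 2, 3, 4, 0, 0]
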
/cnn/tensorflow/insqa_train.py.old:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python3.4
2 |
3 | import tensorflow as tf
4 | import numpy as np
5 | import os
6 | import time
7 | import datetime
8 | import insurance_qa_data_helpers
9 | from insqa_cnn import InsQACNN1
10 | import operator
11 |
12 | #print tf.__version__
13 |
14 | # Parameters
15 | # ==================================================
16 |
17 | # Model Hyperparameters
18 | tf.flags.DEFINE_integer("embedding_dim", 100, "Dimensionality of character embedding (default: 128)")
19 | tf.flags.DEFINE_string("filter_sizes", "1,2,3,5", "Comma-separated filter sizes (default: '3,4,5')")
20 | tf.flags.DEFINE_integer("num_filters", 256, "Number of filters per filter size (default: 128)")
21 | tf.flags.DEFINE_float("dropout_keep_prob", 1.0, "Dropout keep probability (default: 0.5)")
22 | tf.flags.DEFINE_float("l2_reg_lambda", 0, "L2 regularization lambda (default: 0.0)")
23 |
24 | # Training parameters
25 | tf.flags.DEFINE_integer("batch_size", 100, "Batch Size (default: 64)")
26 | tf.flags.DEFINE_integer("num_epochs", 5000000, "Number of training epochs (default: 200)")
27 | tf.flags.DEFINE_integer("evaluate_every", 5000, "Evaluate model on dev set after this many steps (default: 100)")
28 | tf.flags.DEFINE_integer("checkpoint_every", 5000, "Save model after this many steps (default: 100)")
29 | # Misc Parameters
30 | tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
31 | tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")
32 |
33 | FLAGS = tf.flags.FLAGS
34 | FLAGS._parse_flags()
35 | print("\nParameters:")
36 | for attr, value in sorted(FLAGS.__flags.items()):
37 | print("{}={}".format(attr.upper(), value))
38 | print("")
39 |
40 | # Data Preparation
41 | # ==================================================
42 |
43 | # Load data
44 | print("Loading data...")
45 |
46 | vocab = insurance_qa_data_helpers.build_vocab()
47 | alist = insurance_qa_data_helpers.read_alist()
48 | raw = insurance_qa_data_helpers.read_raw()
49 | x_train_1, x_train_2, x_train_3 = insurance_qa_data_helpers.load_data_6(vocab, alist, raw, FLAGS.batch_size)
50 | testList, vectors = insurance_qa_data_helpers.load_test_and_vectors()
51 | vectors = ''
52 | print('x_train_1', np.shape(x_train_1))
53 | print("Load done...")
54 |
55 | val_file = '/export/jw/cnn/insuranceQA/test1'
56 | precision = '/export/jw/cnn/insuranceQA/test1.acc'
57 | #x_val, y_val = data_deepqa.load_data_val()
58 |
59 | # Training
60 | # ==================================================
61 |
62 | with tf.Graph().as_default():
63 | with tf.device("/gpu:1"):
64 | session_conf = tf.ConfigProto(
65 | allow_soft_placement=FLAGS.allow_soft_placement,
66 | log_device_placement=FLAGS.log_device_placement)
67 | sess = tf.Session(config=session_conf)
68 | with sess.as_default():
69 | cnn = InsQACNN1(
70 | sequence_length=x_train_1.shape[1],
71 | batch_size=FLAGS.batch_size,
72 | vocab_size=len(vocab),
73 | embedding_size=FLAGS.embedding_dim,
74 | filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
75 | num_filters=FLAGS.num_filters,
76 | l2_reg_lambda=FLAGS.l2_reg_lambda)
77 |
78 | # Define Training procedure
79 | global_step = tf.Variable(0, name="global_step", trainable=False)
80 | optimizer = tf.train.AdamOptimizer(1e-1)
81 | #optimizer = tf.train.GradientDescentOptimizer(1e-2)
82 | grads_and_vars = optimizer.compute_gradients(cnn.loss)
83 | train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)
84 |
85 | # Keep track of gradient values and sparsity (optional)
86 | grad_summaries = []
87 | for g, v in grads_and_vars:
88 | if g is not None:
89 | grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g)
90 | sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
91 | grad_summaries.append(grad_hist_summary)
92 | grad_summaries.append(sparsity_summary)
93 | grad_summaries_merged = tf.summary.merge(grad_summaries)
94 |
95 | # Output directory for models and summaries
96 | timestamp = str(int(time.time()))
97 | out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
98 | print("Writing to {}\n".format(out_dir))
99 |
100 | # Summaries for loss and accuracy
101 | loss_summary = tf.summary.scalar("loss", cnn.loss)
102 | acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)
103 |
104 | # Train Summaries
105 | train_summary_op = tf.summary.merge([loss_summary, acc_summary, grad_summaries_merged])
106 | train_summary_dir = os.path.join(out_dir, "summaries", "train")
107 | train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph_def)
108 |
109 | # Dev summaries
110 | dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
111 | dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
112 | dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph_def)
113 |
114 | # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
115 | checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
116 | checkpoint_prefix = os.path.join(checkpoint_dir, "model")
117 | if not os.path.exists(checkpoint_dir):
118 | os.makedirs(checkpoint_dir)
119 | saver = tf.train.Saver(tf.all_variables())
120 |
121 | # Initialize all variables
122 | sess.run(tf.initialize_all_variables())
123 |
124 | def train_step(x_batch_1, x_batch_2, x_batch_3):
125 | """
126 | A single training step
127 | """
128 | feed_dict = {
129 | cnn.input_x_1: x_batch_1,
130 | cnn.input_x_2: x_batch_2,
131 | cnn.input_x_3: x_batch_3,
132 | cnn.dropout_keep_prob: FLAGS.dropout_keep_prob
133 | }
134 | _, step, summaries, loss, accuracy = sess.run(
135 | [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy],
136 | feed_dict)
137 | time_str = datetime.datetime.now().isoformat()
138 | print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
139 | train_summary_writer.add_summary(summaries, step)
140 |
141 | def dev_step():
142 | scoreList = []
143 | i = int(0)
144 | while True:
145 | x_test_1, x_test_2, x_test_3 = insurance_qa_data_helpers.load_data_val_6(testList, vocab, i, FLAGS.batch_size)
146 | feed_dict = {
147 | cnn.input_x_1: x_test_1,
148 | cnn.input_x_2: x_test_2,
149 | cnn.input_x_3: x_test_3,
150 | cnn.dropout_keep_prob: 1.0
151 | }
152 | batch_scores = sess.run([cnn.cos_12], feed_dict)
153 | for score in batch_scores[0]:
154 | scoreList.append(score)
155 | i += FLAGS.batch_size
156 | if i >= len(testList):
157 | break
158 | sessdict = {}
159 | index = int(0)
160 | for line in open(val_file):
161 | items = line.strip().split(' ')
162 | qid = items[1].split(':')[1]
163 | if not qid in sessdict:
164 | sessdict[qid] = []
165 | sessdict[qid].append((scoreList[index], items[0]))
166 | index += 1
167 | if index >= len(testList):
168 | break
169 | lev1 = float(0)
170 | lev0 = float(0)
171 | of = open(precision, 'a')
172 | for k, v in sessdict.items():
173 | v.sort(key=operator.itemgetter(0), reverse=True)
174 | score, flag = v[0]
175 | if flag == '1':
176 | lev1 += 1
177 | if flag == '0':
178 | lev0 += 1
179 | of.write('lev1:' + str(lev1) + '\n')
180 | of.write('lev0:' + str(lev0) + '\n')
181 | print('lev1 ' + str(lev1))
182 | print('lev0 ' + str(lev0))
183 | of.close()
184 |
185 | # Generate batches
186 | # Training loop. For each batch...
187 | for i in range(FLAGS.num_epochs):
188 | try:
189 | x_batch_1, x_batch_2, x_batch_3 = insurance_qa_data_helpers.load_data_6(vocab, alist, raw, FLAGS.batch_size)
190 | train_step(x_batch_1, x_batch_2, x_batch_3)
191 | current_step = tf.train.global_step(sess, global_step)
192 | if current_step % FLAGS.evaluate_every == 0:
193 | print("\nEvaluation:")
194 | dev_step()
195 | print("")
196 | if current_step % FLAGS.checkpoint_every == 0:
197 | path = saver.save(sess, checkpoint_prefix, global_step=current_step)
198 | print("Saved model checkpoint to {}\n".format(path))
199 | except Exception as e:
200 | print(e)
201 |
--------------------------------------------------------------------------------
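
Note: dev_step above measures top-1 precision: candidate answers are grouped by question id, each group is sorted by model score, and a question counts as a hit when its highest-scored candidate carries the positive flag '1'. A compact sketch of that bookkeeping (the input format is illustrative):

    from collections import defaultdict
    from operator import itemgetter

    def top1_precision(rows):
        # rows: iterable of (qid, flag, score), flag == '1' marks a correct answer
        groups = defaultdict(list)
        for qid, flag, score in rows:
            groups[qid].append((score, flag))
        hits = sum(1 for v in groups.values() if max(v, key=itemgetter(0))[1] == '1')
        return hits / float(len(groups))

    rows = [('q1', '1', 0.9), ('q1', '0', 0.4), ('q2', '0', 0.8), ('q2', '1', 0.3)]
    print(top1_precision(rows))   # 0.5
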
/rnn_attention/tensorflow/tf_rnn_char.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | ####################################################################################
4 | # test1 top-1 precision: 59%
5 | ####################################################################################
6 | import tensorflow as tf
7 | import numpy as np
8 | from operator import itemgetter
9 | import random, datetime, json, insurance_qa_data_helpers
10 |
11 | class RNN_Model(object):
12 | def _rnn_net(self, inputs, mask, embedding, keep_prob, batch_size, embed_dim, num_step, fw_cell, bw_cell):
13 | _initial_state = fw_cell.zero_state(batch_size,dtype=tf.float32)
14 | inputs=tf.nn.embedding_lookup(embedding, inputs)
15 | inputs = tf.nn.dropout(inputs, self.keep_prob)
16 |         # transpose [batch_size, sequence_length, embedding_size] to [sequence_length, batch_size, embedding_size]
17 | inputs = tf.transpose(inputs, [1, 0, 2])
18 |         # unstack [sequence_length, batch_size, embedding_size] into a list of sequence_length tensors of shape [batch_size, embedding_size]
19 | inputs = tf.unstack(inputs)
20 | #inputs = tf.reshape(inputs, [-1, embed_dim])
21 | #inputs = tf.split(inputs, num_step, 0)
22 |         # the output is a list of sequence_length tensors of shape [batch_size, embedding_size * 2]
23 | outputs, _, _ = tf.contrib.rnn.static_bidirectional_rnn(fw_cell, bw_cell, inputs, initial_state_fw=_initial_state, initial_state_bw=_initial_state)
24 | outputs = tf.transpose(tf.stack(outputs), [1, 0, 2])
25 | self.outputs = outputs
26 |         # pool the RNN output [batch_size, sequence_length, embedding_size]; max pooling gives the best results so far
27 |         # mean pooling and taking the last step's vector both perform worse
28 |         outputs = self._max_pooling(outputs)
29 |         print(outputs)
30 |
31 | #outputs = outputs[-1]
32 | #outputs = outputs * mask[:, :, None]
33 | #mean pooling
34 | #outputs = tf.reduce_sum(outputs, 0) / (tf.reduce_sum(mask, 0)[:,None])
35 | return outputs
36 |
37 | def _max_pooling(self, lstm):
38 | sequence_length, embedding_size = int(lstm.get_shape()[1]), int(lstm.get_shape()[2])
39 | lstm = tf.expand_dims(lstm, -1)
40 | output = tf.nn.max_pool(lstm, ksize=[1, sequence_length, 1, 1], strides=[1, 1, 1, 1], padding='VALID')
41 | output = tf.reshape(output, [-1, embedding_size])
42 | return output
43 |
44 | def __init__(self, config, is_training=True):
45 | self.keep_prob=tf.placeholder(tf.float32, name='dropout_keep_prob')
46 | self.batch_size=config.batch_size
47 | self.num_step=config.num_step
48 |
49 | self.qlist = tf.placeholder(tf.int32, [self.batch_size, self.num_step])
50 |         # this version does not use the mask
51 | self.mask_q = tf.placeholder(tf.float32, [self.num_step, self.batch_size])
52 | self.plist = tf.placeholder(tf.int32, [self.batch_size, self.num_step])
53 | self.mask_p = tf.placeholder(tf.float32, [self.num_step, self.batch_size])
54 | self.nlist = tf.placeholder(tf.int32, [self.batch_size, self.num_step])
55 | self.mask_n = tf.placeholder(tf.float32, [self.num_step, self.batch_size])
56 |
57 | hidden_neural_size=config.hidden_neural_size
58 | vocabulary_size=config.vocabulary_size
59 | self.embed_dim=config.embed_dim
60 | hidden_layer_num=config.hidden_layer_num
61 |
62 | #fw_cell = tf.contrib.rnn.BasicLSTMCell(hidden_neural_size,forget_bias=1.0,state_is_tuple=True)
63 | fw_cell = tf.contrib.rnn.GRUCell(num_units=hidden_neural_size, activation=tf.nn.relu)
64 | fw_cell = tf.contrib.rnn.DropoutWrapper(
65 | fw_cell,output_keep_prob=self.keep_prob
66 | )
67 | #bw_cell = tf.contrib.rnn.BasicLSTMCell(hidden_neural_size,forget_bias=1.0,state_is_tuple=True)
68 | bw_cell = tf.contrib.rnn.GRUCell(num_units=hidden_neural_size, activation=tf.nn.relu)
69 | bw_cell = tf.contrib.rnn.DropoutWrapper(
70 | bw_cell,output_keep_prob=self.keep_prob
71 | )
72 |
73 | #embedding layer
74 | with tf.device("/cpu:1"),tf.name_scope("embedding_layer"):
75 | self.embedding = tf.Variable(tf.truncated_normal([vocabulary_size, self.embed_dim], stddev=0.1), name='W')
76 | #self.a_embedding = tf.Variable(tf.truncated_normal([vocabulary_size, self.embed_dim], stddev=0.1), name='W')
77 |
78 |         q = self._rnn_net(self.qlist, self.mask_q, self.embedding, self.keep_prob, self.batch_size, self.embed_dim, self.num_step, fw_cell, bw_cell)
79 |         tf.get_variable_scope().reuse_variables()
80 |         p = self._rnn_net(self.plist, self.mask_p, self.embedding, self.keep_prob, self.batch_size, self.embed_dim, self.num_step, fw_cell, bw_cell)
81 |         tf.get_variable_scope().reuse_variables()
82 |         n = self._rnn_net(self.nlist, self.mask_n, self.embedding, self.keep_prob, self.batch_size, self.embed_dim, self.num_step, fw_cell, bw_cell)
83 | #len_1 = tf.clip_by_value(tf.sqrt(tf.reduce_sum(tf.multiply(q, q), 1)), 0.01, 100000)
84 | #len_2 = tf.clip_by_value(tf.sqrt(tf.reduce_sum(tf.multiply(p, p), 1)), 0.01, 100000)
85 | #len_3 = tf.clip_by_value(tf.sqrt(tf.reduce_sum(tf.multiply(n, n), 1)), 0.01, 100000)
86 | len_1 = tf.sqrt(tf.reduce_sum(tf.multiply(q, q), 1))
87 | len_2 = tf.sqrt(tf.reduce_sum(tf.multiply(p, p), 1))
88 | len_3 = tf.sqrt(tf.reduce_sum(tf.multiply(n, n), 1))
89 |
90 | self.cos12 = tf.reduce_sum(tf.multiply(q, p), axis=1) / (len_1 * len_2)
91 | self.cos13 = tf.reduce_sum(tf.multiply(q, n), axis=1) / (len_1 * len_3)
92 | self.q = q
93 | self.p = p
94 |
95 | zero = tf.constant(np.zeros(self.batch_size, dtype='float32'))
96 | margin = tf.constant(np.full(self.batch_size, 0.1, dtype='float32'))
97 | diff = tf.cast(tf.maximum(zero, margin - self.cos12 + self.cos13), dtype='float32')
98 | self.cost = tf.reduce_sum(diff)
99 | self.accuracy = tf.reduce_sum(tf.cast(tf.equal(zero, diff), dtype='float32')) / float(self.batch_size)
100 |
101 | def train_step(model, qlist, plist, nlist, mask_q, mask_p, mask_n):
102 | fetches = [model.cost, model.accuracy, global_step, train_op, model.cos12, model.q, model.p, model.outputs]
103 | feed_dict = {
104 | model.qlist: qlist,
105 | model.plist: plist,
106 | model.nlist: nlist,
107 | model.mask_q : mask_q,
108 | model.mask_p : mask_p,
109 | model.mask_n : mask_n,
110 | model.keep_prob: config.keep_prob
111 | }
112 | cost, accuracy, step, _, cos12, q, p, outputs = sess.run(fetches, feed_dict)
113 | time_str = datetime.datetime.now().isoformat()
114 | print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, cost, accuracy))
115 |
116 |
117 | def dev_step(model, vocab, batch_size, max_len):
118 | score_list, i = [], int(0)
119 | while True:
120 | qlist, plist, nlist, mask_q, mask_p, mask_n = insurance_qa_data_helpers.load_val_data(test_list, vocab, i, FLAGS.batch_size, max_len)
121 | feed_dict = {
122 | model.qlist: qlist,
123 | model.plist: plist,
124 | model.nlist: nlist,
125 | model.mask_q : mask_q,
126 | model.mask_p : mask_p,
127 | model.mask_n : mask_n,
128 | model.keep_prob: float(1.0)
129 | }
130 | batch_scores = sess.run([model.cos12], feed_dict)
131 | for score in batch_scores[0]:
132 | score_list.append(score)
133 | i += FLAGS.batch_size
134 | if i >= len(test_list):
135 | break
136 | insurance_qa_data_helpers.evaluation(score_list, test_list)
137 |
138 | tf.flags.DEFINE_integer('evaluate_every',10000,'evaluate every')
139 | tf.flags.DEFINE_integer('batch_size',64,'the batch_size of the training procedure')
140 | tf.flags.DEFINE_integer('embedding_dim',100,'embedding dim')
141 | tf.flags.DEFINE_integer('hidden_neural_size',200,'LSTM hidden neural size')
142 | tf.flags.DEFINE_integer('hidden_layer_num',1,'LSTM hidden layer num')
143 | tf.flags.DEFINE_integer('max_len',100,'max_len of training sentence')
144 | tf.flags.DEFINE_float('init_scale',0.1,'init scale')
145 | tf.flags.DEFINE_float('keep_prob',0.5,'dropout rate')
146 | tf.flags.DEFINE_integer('num_epoch',1000000,'num epoch')
147 | tf.flags.DEFINE_integer('max_grad_norm',5,'max_grad_norm')
148 | # Misc Parameters
149 | tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
150 | tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")
151 | FLAGS = tf.flags.FLAGS
152 | FLAGS._parse_flags()
153 |
154 | vocab = insurance_qa_data_helpers.build_vocab()
155 | train_list = insurance_qa_data_helpers.load_train_list()
156 | qlist, plist, nlist, mask_q, mask_p, mask_n = insurance_qa_data_helpers.load_train_data(train_list, vocab, FLAGS.batch_size, FLAGS.max_len)
157 | test_list = insurance_qa_data_helpers.load_test_list()
158 |
159 | class Config(object):
160 | hidden_neural_size=FLAGS.hidden_neural_size
161 | vocabulary_size=len(vocab)
162 |     embed_dim=FLAGS.embedding_dim
163 | hidden_layer_num=FLAGS.hidden_layer_num
164 | keep_prob=FLAGS.keep_prob
165 | batch_size = FLAGS.batch_size
166 | num_step = FLAGS.max_len
167 | max_grad_norm=FLAGS.max_grad_norm
168 | num_epoch = FLAGS.num_epoch
169 |
170 | config = Config()
171 | eval_config=Config()
172 | eval_config.keep_prob=1.0
173 |
174 | with tf.Graph().as_default():
175 | with tf.device('/gpu:1'):
176 | session_conf = tf.ConfigProto(
177 | allow_soft_placement=FLAGS.allow_soft_placement,
178 | log_device_placement=FLAGS.log_device_placement)
179 | sess = tf.Session(config=session_conf)
180 | with sess.as_default():
181 | initializer = tf.random_uniform_initializer(-1*FLAGS.init_scale,1*FLAGS.init_scale)
182 | with tf.variable_scope("model",reuse=None,initializer=initializer):
183 | model = RNN_Model(config=config, is_training=True)
184 |
185 | # Define Training procedure
186 | global_step = tf.Variable(0, name="global_step", trainable=False)
187 | #optimizer = tf.train.RMSPropOptimizer(0.01)
188 | #optimizer = tf.train.AdamOptimizer(0.1)
189 | optimizer = tf.train.GradientDescentOptimizer(0.2)
190 | grads_and_vars = optimizer.compute_gradients(model.cost)
191 | train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)
192 |
193 | # Initialize all variables
194 | sess.run(tf.global_variables_initializer())
195 | for i in range(config.num_epoch):
196 | qlist, plist, nlist, mask_q, mask_p, mask_n = insurance_qa_data_helpers.load_train_data(train_list, vocab, FLAGS.batch_size, FLAGS.max_len)
197 | train_step(model, qlist, plist, nlist, mask_q, mask_p, mask_n)
198 | current_step = tf.train.global_step(sess, global_step)
199 | if current_step % FLAGS.evaluate_every == 0:
200 | dev_step(model, vocab, FLAGS.batch_size, FLAGS.max_len)
201 |
--------------------------------------------------------------------------------
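
Note: the comments in _rnn_net above record that max pooling over the time axis of the bidirectional RNN output worked better than mean pooling or taking the last step. The reduction itself is just an element-wise max over the sequence dimension; a NumPy sketch on a [batch_size, sequence_length, dim] tensor (shapes are illustrative):

    import numpy as np

    outputs = np.random.rand(2, 5, 4)    # [batch_size, sequence_length, dim]
    sentence_vec = outputs.max(axis=1)   # one fixed-size vector per sentence
    print(sentence_vec.shape)            # (2, 4)
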
/cnn/theano/insqa_cnn.py:
--------------------------------------------------------------------------------
1 |
2 | ###########################################################
3 | # test1 top-1 precision: 62%
4 | ###########################################################
5 |
6 | import os, sys, timeit, random, operator
7 |
8 | import numpy as np
9 |
10 | import theano
11 | import theano.tensor as T
12 | from theano.tensor.signal import pool
13 | from theano.tensor.nnet import conv2d
14 |
15 | #TODO change path to your dataset
16 | trainfile = '/export/jw/cnn/insuranceQA/train'
17 | test1file = '/export/jw/cnn/insuranceQA/test1'
18 | vectorsfile = '/export/jw/cnn/insuranceQA/vectors.nobin'
19 |
20 | ###########################################################
21 | # read qa data
22 | ###########################################################
23 | def build_vocab():
24 | global trainfile
25 | code, vocab = int(0), {}
26 | vocab['UNKNOWN'] = code
27 | code += 1
28 | for line in open(trainfile):
29 | items = line.strip().split(' ')
30 | for i in range(2, 3):
31 | for word in items[i].split('_'):
32 | if len(word) <= 0:
33 | continue
34 | if not word in vocab:
35 | vocab[word] = code
36 | code += 1
37 | return vocab
38 |
39 | def load_vectors():
40 | global vectorsfile
41 | vectors = {}
42 | for line in open(vectorsfile):
43 | items = line.strip().split(' ')
44 | if len(items[0]) <= 0:
45 | continue
46 | vec = []
47 | for i in range(1, 101):
48 | vec.append(float(items[i]))
49 | vectors[items[0]] = vec
50 | return vectors
51 |
52 | def load_word_embeddings(vocab, dim):
53 | vectors = load_vectors()
54 | embeddings = [] #brute initialization
55 | for i in range(0, len(vocab)):
56 | vec = []
57 | for j in range(0, dim):
58 | vec.append(0.01)
59 | embeddings.append(vec)
60 | for word, code in vocab.items():
61 | if word in vectors:
62 | embeddings[code] = vectors[word]
63 | return np.array(embeddings, dtype='float32')
64 |
65 | # note: pay attention to the initialization of UNKNOWN
66 | def encode_sent(vocab, string, size):
67 | x = []
68 | words = string.split('_')
69 | for i in range(0, size):
70 | if words[i] in vocab:
71 | x.append(vocab[words[i]])
72 | else:
73 | x.append(vocab['UNKNOWN'])
74 | return x
75 |
76 | def load_train_list():
77 | global trainfile
78 | trainList = []
79 | for line in open(trainfile):
80 | trainList.append(line.strip().split(' '))
81 | return trainList
82 |
83 | def load_test_list():
84 | global test1file
85 | testList = []
86 | for line in open(test1file):
87 | testList.append(line.strip().split(' '))
88 | return testList
89 |
90 | def load_data(trainList, vocab, batch_size):
91 | train_1, train_2, train_3 = [], [], []
92 | for i in range(0, batch_size):
93 | pos = trainList[random.randint(0, len(trainList)-1)]
94 | neg = trainList[random.randint(0, len(trainList)-1)]
95 | train_1.append(encode_sent(vocab, pos[2], 100))
96 | train_2.append(encode_sent(vocab, pos[3], 100))
97 | train_3.append(encode_sent(vocab, neg[3], 100))
98 | return np.array(train_1, dtype='float32'), np.array(train_2, dtype='float32'), np.array(train_3, dtype='float32')
99 |
100 | def load_data_val(testList, vocab, index, batch_size):
101 | x1, x2, x3 = [], [], []
102 | for i in range(0, batch_size):
103 | true_index = index + i
104 | if true_index >= len(testList):
105 | true_index = len(testList) - 1
106 | items = testList[true_index]
107 | x1.append(encode_sent(vocab, items[2], 100))
108 | x2.append(encode_sent(vocab, items[3], 100))
109 | x3.append(encode_sent(vocab, items[3], 100))
110 | return np.array(x1, dtype='float32'), np.array(x2, dtype='float32'), np.array(x3, dtype='float32')
111 |
112 | def validation(validate_model, testList, vocab, batch_size):
113 | index, score_list = int(0), []
114 | while True:
115 | x1, x2, x3 = load_data_val(testList, vocab, index, batch_size)
116 | batch_scores, nouse = validate_model(x1, x2, x3, 1.0)
117 | for score in batch_scores:
118 | score_list.append(score)
119 | index += batch_size
120 | if index >= len(testList):
121 | break
122 | print 'Evaluation ' + str(index)
123 | sdict, index = {}, int(0)
124 | for items in testList:
125 | qid = items[1].split(':')[1]
126 | if not qid in sdict:
127 | sdict[qid] = []
128 | sdict[qid].append((score_list[index], items[0]))
129 | index += 1
130 | lev0, lev1 = float(0), float(0)
131 | for qid, cases in sdict.items():
132 | cases.sort(key=operator.itemgetter(0), reverse=True)
133 | score, flag = cases[0]
134 | if flag == '1':
135 | lev1 += 1
136 | if flag == '0':
137 | lev0 += 1
138 |     print 'top-1 precision: ' + str(lev1 / (lev0 + lev1))
139 |
140 | class QACnn(object):
141 | def __init__(self, input1, input2, input3, word_embeddings, batch_size, sequence_len, embedding_size, filter_sizes, num_filters, keep_prob):
142 | rng = np.random.RandomState(23455)
143 | self.params = []
144 |
145 | lookup_table = theano.shared(word_embeddings)
146 | self.params += [lookup_table]
147 |         # input1 = question, input2 = positive answer, input3 = negative answer
148 |         # replace each word with its word embedding vector
149 | input_matrix1 = lookup_table[T.cast(input1.flatten(), dtype="int32")]
150 | input_matrix2 = lookup_table[T.cast(input2.flatten(), dtype="int32")]
151 | input_matrix3 = lookup_table[T.cast(input3.flatten(), dtype="int32")]
152 |
153 |         # the CNN input is a 4-D tensor; this just adds one extra dimension
154 | input_x1 = input_matrix1.reshape((batch_size, 1, sequence_len, embedding_size))
155 | input_x2 = input_matrix2.reshape((batch_size, 1, sequence_len, embedding_size))
156 | input_x3 = input_matrix3.reshape((batch_size, 1, sequence_len, embedding_size))
157 | #print(input_x1.shape.eval())
158 | self.dbg_x1 = input_x1
159 |
160 | outputs_1, outputs_2, outputs_3 = [], [], []
161 |         # use filters of several different sizes
162 | for filter_size in filter_sizes:
163 |             # each filter size has num_filters filters
164 | filter_shape = (num_filters, 1, filter_size, embedding_size)
165 | image_shape = (batch_size, 1, sequence_len, embedding_size)
166 | fan_in = np.prod(filter_shape[1:])
167 | fan_out = filter_shape[0] * np.prod(filter_shape[2:])
168 | W_bound = np.sqrt(6. / (fan_in + fan_out))
169 | W = theano.shared(
170 | np.asarray(
171 | rng.uniform(low=-W_bound, high=W_bound, size=filter_shape),
172 | dtype=theano.config.floatX
173 | ),
174 | borrow=True
175 | )
176 | b_values = np.zeros((filter_shape[0],), dtype=theano.config.floatX)
177 | b = theano.shared(value=b_values, borrow=True)
178 |
179 |             # convolution + max pooling
180 | conv_out = conv2d(input=input_x1, filters=W, filter_shape=filter_shape, input_shape=image_shape)
181 |             # the pooling window ds equals the length of the convolved output
182 | pooled_out = pool.pool_2d(input=conv_out, ds=(sequence_len - filter_size + 1, 1), ignore_border=True, mode='max')
183 | pooled_active = T.tanh(pooled_out + b.dimshuffle('x', 0, 'x', 'x'))
184 | outputs_1.append(pooled_active)
185 |
186 | conv_out = conv2d(input=input_x2, filters=W, filter_shape=filter_shape, input_shape=image_shape)
187 | pooled_out = pool.pool_2d(input=conv_out, ds=(sequence_len - filter_size + 1, 1), ignore_border=True, mode='max')
188 | pooled_active = T.tanh(pooled_out + b.dimshuffle('x', 0, 'x', 'x'))
189 | outputs_2.append(pooled_active)
190 |
191 | conv_out = conv2d(input=input_x3, filters=W, filter_shape=filter_shape, input_shape=image_shape)
192 | pooled_out = pool.pool_2d(input=conv_out, ds=(sequence_len - filter_size + 1, 1), ignore_border=True, mode='max')
193 | pooled_active = T.tanh(pooled_out + b.dimshuffle('x', 0, 'x', 'x'))
194 | outputs_3.append(pooled_active)
195 |
196 | self.params += [W, b]
197 | self.dbg_conv_out = conv_out.shape
198 |
199 | num_filters_total = num_filters * len(filter_sizes)
200 | self.dbg_outputs_1 = outputs_1[0].shape
201 |         # each sentence's semantic representation vector has length num_filters_total
202 | output_flat1 = T.reshape(T.concatenate(outputs_1, axis=1), [batch_size, num_filters_total])
203 | output_flat2 = T.reshape(T.concatenate(outputs_2, axis=1), [batch_size, num_filters_total])
204 | output_flat3 = T.reshape(T.concatenate(outputs_3, axis=1), [batch_size, num_filters_total])
205 |         # dropout; keep_prob of 1 means no dropout
206 | output_drop1 = self._dropout(rng, output_flat1, keep_prob)
207 | output_drop2 = self._dropout(rng, output_flat2, keep_prob)
208 | output_drop3 = self._dropout(rng, output_flat3, keep_prob)
209 |
210 |         # compute the cosine of the angle between the question and answer vectors
211 |         # compute the vector norms
212 | len1 = T.sqrt(T.sum(output_drop1 * output_drop1, axis=1))
213 | len2 = T.sqrt(T.sum(output_drop2 * output_drop2, axis=1))
214 | len3 = T.sqrt(T.sum(output_drop3 * output_drop3, axis=1))
215 |         # compute the cosine of the angle between the vectors
216 | cos12 = T.sum(output_drop1 * output_drop2, axis=1) / (len1 * len2)
217 | self.cos12 = cos12
218 | cos13 = T.sum(output_drop1 * output_drop3, axis=1) / (len1 * len3)
219 | self.cos13 = cos13
220 |
221 | zero = theano.shared(np.zeros(batch_size, dtype=theano.config.floatX), borrow=True)
222 | margin = theano.shared(np.full(batch_size, 0.05, dtype=theano.config.floatX), borrow=True)
223 |         # loss function
224 | diff = T.cast(T.maximum(zero, margin - cos12 + cos13), dtype=theano.config.floatX)
225 | self.cost = T.sum(diff, acc_dtype=theano.config.floatX)
226 |         # mini-batch accuracy (a sample counts as correct if the cosine between the question and the positive
227 |         # answer is larger than the cosine between the question and the negative answer, otherwise it is wrong)
228 |         # loss and accuracy are the two key indicators for judging whether the model is converging during training
229 | self.accuracy = T.sum(T.cast(T.eq(zero, diff), dtype='int32')) / float(batch_size)
230 |
231 | def _dropout(self, rng, layer, keep_prob):
232 | srng = T.shared_randomstreams.RandomStreams(rng.randint(123456))
233 | mask = srng.binomial(n=1, p=keep_prob, size=layer.shape)
234 | output = layer * T.cast(mask, theano.config.floatX)
235 | output = output / keep_prob
236 | return output
237 |
238 | def train():
239 | batch_size = int(256)
240 | filter_sizes = [2,3,5]
241 | num_filters = 500
242 | embedding_size = 100
243 | learning_rate = 0.001
244 | n_epochs = 2000000
245 | validation_freq = 1000
246 | keep_prob_value = 0.25
247 |
248 | vocab = build_vocab()
249 | word_embeddings = load_word_embeddings(vocab, embedding_size)
250 | trainList = load_train_list()
251 | testList = load_test_list()
252 | train_x1, train_x2, train_x3 = load_data(trainList, vocab, batch_size)
253 |
254 | x1, x2, x3 = T.matrix('x1'), T.matrix('x2'), T.matrix('x3')
255 | keep_prob = T.fscalar('keep_prob')
256 | model = QACnn(
257 | input1=x1, input2=x2, input3=x3, keep_prob=keep_prob,
258 | word_embeddings=word_embeddings,
259 | batch_size=batch_size,
260 | sequence_len=train_x1.shape[1],
261 | embedding_size=embedding_size,
262 | filter_sizes=filter_sizes,
263 | num_filters=num_filters)
264 | dbg_x1 = model.dbg_x1
265 | dbg_outputs_1 = model.dbg_outputs_1
266 |
267 | cost, cos12, cos13 = model.cost, model.cos12, model.cos13
268 | print 'cost'
269 | print cost
270 | params, accuracy = model.params, model.accuracy
271 | grads = T.grad(cost, params)
272 |
273 | updates = [
274 | (param_i, param_i - learning_rate * grad_i)
275 | for param_i, grad_i in zip(params, grads)
276 | ]
277 |
278 | p1, p2, p3 = T.matrix('p1'), T.matrix('p2'), T.matrix('p3')
279 | prob = T.fscalar('prob')
280 | train_model = theano.function(
281 | [p1, p2, p3, prob],
282 | [cost, accuracy, dbg_x1, dbg_outputs_1],
283 | updates=updates,
284 | givens={
285 | x1: p1, x2: p2, x3: p3, keep_prob: prob
286 | }
287 | )
288 |
289 | v1, v2, v3 = T.matrix('v1'), T.matrix('v2'), T.matrix('v3')
290 | validate_model = theano.function(
291 | inputs=[v1, v2, v3, prob],
292 | outputs=[cos12, cos13],
293 | #updates=updates,
294 | givens={
295 | x1: v1, x2: v2, x3: v3, keep_prob: prob
296 | }
297 | )
298 |
299 | epoch = 0
300 | done_looping = False
301 | while (epoch < n_epochs) and (not done_looping):
302 | epoch = epoch + 1
303 | train_x1, train_x2, train_x3 = load_data(trainList, vocab, batch_size)
304 | #print train_x3.shape
305 | cost_ij, acc, dbg_x1, dbg_outputs_1 = train_model(train_x1, train_x2, train_x3, keep_prob_value)
306 | print 'load data done ...... epoch:' + str(epoch) + ' cost:' + str(cost_ij) + ', acc:' + str(acc)
307 | if epoch % validation_freq == 0:
308 | print 'Evaluation ......'
309 | validation(validate_model, testList, vocab, batch_size)
310 | #print dbg_outputs_1
311 |
312 | if __name__ == '__main__':
313 | train()
314 |
--------------------------------------------------------------------------------
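
Note: the convolution filters above are drawn uniformly from [-W_bound, W_bound] with W_bound = sqrt(6 / (fan_in + fan_out)), i.e. a Glorot/Xavier-style uniform initialization. A small NumPy sketch of the same computation for one filter bank (the shapes below mirror the defaults in train()):

    import numpy as np

    num_filters, filter_size, embedding_size = 500, 3, 100
    filter_shape = (num_filters, 1, filter_size, embedding_size)
    fan_in = np.prod(filter_shape[1:])                     # 1 * 3 * 100
    fan_out = filter_shape[0] * np.prod(filter_shape[2:])  # 500 * 3 * 100
    w_bound = np.sqrt(6.0 / (fan_in + fan_out))
    W = np.random.uniform(-w_bound, w_bound, size=filter_shape).astype('float32')
    print(W.shape, round(float(w_bound), 4))
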
/lstm_cnn/theano/insqa_lstm.py:
--------------------------------------------------------------------------------
1 |
2 | ############################################################
3 | # if batch_size is 1, there must be a dtype error when doing
4 | # T.grad, this is something about scan func
5 | # see https://github.com/Theano/Theano/issues/1772
6 | #
7 | # LSTM + cnn
8 | # test1 top-1 precision: 68.3%
9 | ############################################################
10 |
11 | from collections import OrderedDict
12 | import sys, time, random, operator
13 |
14 | import numpy as np
15 | import theano
16 | from theano import config
17 | import theano.tensor as T
18 | from theano.tensor.signal import pool
19 | from theano.tensor.nnet import conv2d
20 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
21 |
22 | #TODO change filepath to your local environment
23 | #include train test1 vectors.nobin
24 |
25 | def build_vocab():
26 | code, vocab = int(0), {}
27 | vocab['UNKNOWN'] = code
28 | code += 1
29 | for line in open('/export/jw/cnn/insuranceQA/train'):
30 | items = line.strip().split(' ')
31 | for i in range(2, 3):
32 | for word in items[i].split('_'):
33 | if len(word) <= 0:
34 | continue
35 | if not word in vocab:
36 | vocab[word] = code
37 | code += 1
38 | return vocab
39 |
40 | def load_vectors():
41 | vectors = {}
42 | for line in open('/export/jw/cnn/insuranceQA/vectors.nobin'):
43 | items = line.strip().split(' ')
44 | if len(items[0]) <= 0:
45 | continue
46 | vec = []
47 | for i in range(1, 101):
48 | vec.append(float(items[i]))
49 | vectors[items[0]] = vec
50 | return vectors
51 |
52 | def load_word_embeddings(vocab, dim):
53 | vectors = load_vectors()
54 | embeddings = [] #brute initialization
55 | for i in range(0, len(vocab)):
56 | vec = []
57 | for j in range(0, dim):
58 | vec.append(0.01)
59 | embeddings.append(vec)
60 | for word, code in vocab.items():
61 | if word in vectors:
62 | embeddings[code] = vectors[word]
63 | return np.array(embeddings, dtype='float32')
64 |
65 | # note: pay attention to the initialization of UNKNOWN
66 | def encode_sent(vocab, string, size):
67 | x, m = [], []
68 | words = string.split('_')
69 | for i in range(0, size):
70 | if words[i] in vocab:
71 | x.append(vocab[words[i]])
72 | else:
73 | x.append(vocab['UNKNOWN'])
74 | if words[i] == '': #TODO
75 | m.append(1) #fixed sequence length, else use 0
76 | else:
77 | m.append(1)
78 | return x, m
79 |
80 | def load_train_list():
81 | trainList = []
82 | for line in open('/export/jw/cnn/insuranceQA/train'):
83 | items = line.strip().split(' ')
84 | if items[0] == '1':
85 | trainList.append(line.strip().split(' '))
86 | return trainList
87 |
88 | def load_test_list():
89 | testList = []
90 | for line in open('/export/jw/cnn/insuranceQA/test1'):
91 | testList.append(line.strip().split(' '))
92 | return testList
93 |
94 | def load_data(trainList, vocab, batch_size):
95 | train_1, train_2, train_3 = [], [], []
96 | mask_1, mask_2, mask_3 = [], [], []
97 | counter = 0
98 | while True:
99 | pos = trainList[random.randint(0, len(trainList)-1)]
100 | neg = trainList[random.randint(0, len(trainList)-1)]
101 |         if pos[2] == '' or pos[3] == '' or neg[3] == '':
102 | #print 'empty string ......'
103 | continue
104 | x, m = encode_sent(vocab, pos[2], 100)
105 | train_1.append(x)
106 | mask_1.append(m)
107 | x, m = encode_sent(vocab, pos[3], 100)
108 | train_2.append(x)
109 | mask_2.append(m)
110 | x, m = encode_sent(vocab, neg[3], 100)
111 | train_3.append(x)
112 | mask_3.append(m)
113 | counter += 1
114 | if counter >= batch_size:
115 | break
116 | return np.transpose(np.array(train_1, dtype=config.floatX)), np.transpose(np.array(train_2, dtype=config.floatX)), np.transpose(np.array(train_3, dtype=config.floatX)), np.transpose(np.array(mask_1, dtype=config.floatX)) , np.transpose(np.array(mask_2, dtype=config.floatX)), np.transpose(np.array(mask_3, dtype=config.floatX))
117 |
118 | def load_data_val(testList, vocab, index, batch_size):
119 | x1, x2, x3, m1, m2, m3 = [], [], [], [], [], []
120 | for i in range(0, batch_size):
121 | true_index = index + i
122 | if true_index >= len(testList):
123 | true_index = len(testList) - 1
124 | items = testList[true_index]
125 | x, m = encode_sent(vocab, items[2], 100)
126 | x1.append(x)
127 | m1.append(m)
128 | x, m = encode_sent(vocab, items[3], 100)
129 | x2.append(x)
130 | m2.append(m)
131 | x, m = encode_sent(vocab, items[3], 100)
132 | x3.append(x)
133 | m3.append(m)
134 | return np.transpose(np.array(x1, dtype=config.floatX)), np.transpose(np.array(x2, dtype=config.floatX)), np.transpose(np.array(x3, dtype=config.floatX)), np.transpose(np.array(m1, dtype=config.floatX)) , np.transpose(np.array(m2, dtype=config.floatX)), np.transpose(np.array(m3, dtype=config.floatX))
135 |
136 | def validation(validate_model, testList, vocab, batch_size):
137 | index, score_list = int(0), []
138 | while True:
139 | x1, x2, x3, m1, m2, m3 = load_data_val(testList, vocab, index, batch_size)
140 | batch_scores, nouse = validate_model(x1, x2, x3, m1, m2, m3)
141 | for score in batch_scores:
142 | score_list.append(score)
143 | index += batch_size
144 | if index >= len(testList):
145 | break
146 | print 'Evaluation ' + str(index)
147 | sdict, index = {}, int(0)
148 | for items in testList:
149 | qid = items[1].split(':')[1]
150 | if not qid in sdict:
151 | sdict[qid] = []
152 | sdict[qid].append((score_list[index], items[0]))
153 | index += 1
154 | lev0, lev1 = float(0), float(0)
155 | of = open('/export/jw/cnn/insuranceQA/acc.lstm', 'a')
156 | for qid, cases in sdict.items():
157 | cases.sort(key=operator.itemgetter(0), reverse=True)
158 | score, flag = cases[0]
159 | if flag == '1':
160 | lev1 += 1
161 | if flag == '0':
162 | lev0 += 1
163 | for s in score_list:
164 | of.write(str(s) + '\n')
165 | of.write('lev1:' + str(lev1) + '\n')
166 | of.write('lev0:' + str(lev0) + '\n')
167 | print 'lev1:' + str(lev1)
168 | print 'lev0:' + str(lev0)
169 | of.close()
170 |
171 | def ortho_weight(ndim):
172 | W = np.random.randn(ndim, ndim)
173 | u, s, v = np.linalg.svd(W)
174 | return u.astype(config.floatX)
175 |
176 | def numpy_floatX(data):
177 | return np.asarray(data, dtype=config.floatX)
178 |
179 | def param_init_cnn(filter_sizes, num_filters, proj_size, tparams, grad_params):
180 | rng = np.random.RandomState(23455)
181 | for filter_size in filter_sizes:
182 | filter_shape = (num_filters, 1, filter_size, proj_size)
183 | fan_in = np.prod(filter_shape[1:])
184 | fan_out = filter_shape[0] * np.prod(filter_shape[2:])
185 | W_bound = np.sqrt(6. / (fan_in + fan_out))
186 | W = theano.shared(
187 | np.asarray(
188 | rng.uniform(low=-W_bound, high=W_bound, size=filter_shape),
189 | dtype=theano.config.floatX
190 | ),
191 | borrow=True
192 | )
193 | tparams['cnn_W_' + str(filter_size)] = W
194 | b_values = np.zeros((filter_shape[0],), dtype=theano.config.floatX)
195 | b = theano.shared(value=b_values, borrow=True)
196 | tparams['cnn_b_' + str(filter_size)] = b
197 | grad_params += [W, b]
198 | return tparams, grad_params
199 |
200 | def param_init_lstm(proj_size, tparams, grad_params):
201 | W = np.concatenate([ortho_weight(proj_size),
202 | ortho_weight(proj_size),
203 | ortho_weight(proj_size),
204 | ortho_weight(proj_size)], axis=1)
205 | W_t = theano.shared(W, borrow=True)
206 | tparams[_p('lstm', 'W')] = W_t
207 | U = np.concatenate([ortho_weight(proj_size),
208 | ortho_weight(proj_size),
209 | ortho_weight(proj_size),
210 | ortho_weight(proj_size)], axis=1)
211 | U_t = theano.shared(U, borrow=True)
212 | tparams[_p('lstm', 'U')] = U_t
213 | b = np.zeros((4 * proj_size,))
214 | b_t = theano.shared(b.astype(config.floatX), borrow=True)
215 | tparams[_p('lstm', 'b')] = b_t
216 | grad_params += [W_t, U_t, b_t]
217 |
218 | return tparams, grad_params
219 |
220 | def dropout_layer(state_before, use_noise, trng):
221 | proj = T.switch(use_noise,
222 | (state_before *
223 | trng.binomial(state_before.shape,
224 | p=0.5, n=1,
225 | dtype=state_before.dtype)),
226 | state_before * 0.5)
227 | return proj
228 |
229 | class LSTM(object):
230 | def __init__(self, input1, input2, input3, mask1, mask2, mask3, word_embeddings, batch_size, sequence_len, embedding_size, filter_sizes, num_filters):
231 | #proj_size means embedding_size
232 | #'lstm_W' = [embedding_size, embedding_size]
233 | #'lstm_U' = [embedding_size, embedding_size]
234 | #'lstm_b' = [embedding_size]
235 | proj_size = 100 #TODO, what does proj mean
236 | self.params, tparams = [], {}
237 | tparams, self.params = param_init_lstm(proj_size, tparams, self.params)
238 | tparams, self.params = param_init_cnn(filter_sizes, num_filters, proj_size, tparams, self.params)
239 | lookup_table = theano.shared(word_embeddings, borrow=True)
240 | tparams['lookup_table'] = lookup_table
241 | self.params += [lookup_table]
242 |
243 | n_timesteps = input1.shape[0]
244 | n_samples = input1.shape[1]
245 |
246 | lstm1, lstm_whole1 = self._lstm_net(tparams, input1, sequence_len, batch_size, embedding_size, mask1, proj_size)
247 | lstm2, lstm_whole2 = self._lstm_net(tparams, input2, sequence_len, batch_size, embedding_size, mask2, proj_size)
248 | lstm3, lstm_whole3 = self._lstm_net(tparams, input3, sequence_len, batch_size, embedding_size, mask3, proj_size)
249 |
250 | #dimshuffle [sequence_len, batch_size, proj_size] to [batch_size, sequence_len, proj_size]
251 | cnn_input1 = T.reshape(lstm1.dimshuffle(1, 0, 2), [batch_size, 1, sequence_len, proj_size])
252 | cnn_input2 = T.reshape(lstm2.dimshuffle(1, 0, 2), [batch_size, 1, sequence_len, proj_size])
253 | cnn_input3 = T.reshape(lstm3.dimshuffle(1, 0, 2), [batch_size, 1, sequence_len, proj_size])
254 | cnn1 = self._cnn_net(tparams, cnn_input1, batch_size, sequence_len, num_filters, filter_sizes, proj_size)
255 | cnn2 = self._cnn_net(tparams, cnn_input2, batch_size, sequence_len, num_filters, filter_sizes, proj_size)
256 | cnn3 = self._cnn_net(tparams, cnn_input3, batch_size, sequence_len, num_filters, filter_sizes, proj_size)
257 |
258 | len1 = T.sqrt(T.sum(cnn1 * cnn1, axis=1))
259 | len2 = T.sqrt(T.sum(cnn2 * cnn2, axis=1))
260 | len3 = T.sqrt(T.sum(cnn3 * cnn3, axis=1))
261 |
262 | self.cos12 = T.sum(cnn1 * cnn2, axis=1) / (len1 * len2)
263 | self.cos13 = T.sum(cnn1 * cnn3, axis=1) / (len1 * len3)
264 |
265 | zero = theano.shared(np.zeros(batch_size, dtype=config.floatX), borrow=True)
266 | margin = theano.shared(np.full(batch_size, 0.05, dtype=config.floatX), borrow=True)
267 | diff = T.cast(T.maximum(zero, margin - self.cos12 + self.cos13), dtype=config.floatX)
268 | self.cost = T.sum(diff, acc_dtype=config.floatX)
269 | self.accuracy = T.sum(T.cast(T.eq(zero, diff), dtype='int32')) / float(batch_size)
270 |
271 | def _cnn_net(self, tparams, cnn_input, batch_size, sequence_len, num_filters, filter_sizes, proj_size):
272 | outputs = []
273 | for filter_size in filter_sizes:
274 | filter_shape = (num_filters, 1, filter_size, proj_size)
275 | image_shape = (batch_size, 1, sequence_len, proj_size)
276 | W = tparams['cnn_W_' + str(filter_size)]
277 | b = tparams['cnn_b_' + str(filter_size)]
278 | conv_out = conv2d(input=cnn_input, filters=W, filter_shape=filter_shape, input_shape=image_shape)
279 | pooled_out = pool.pool_2d(input=conv_out, ds=(sequence_len - filter_size + 1, 1), ignore_border=True, mode='max')
280 | pooled_active = T.tanh(pooled_out + b.dimshuffle('x', 0, 'x', 'x'))
281 | outputs.append(pooled_active)
282 | num_filters_total = num_filters * len(filter_sizes)
283 | output_tensor = T.reshape(T.concatenate(outputs, axis=1), [batch_size, num_filters_total])
284 | return output_tensor
285 |
286 | def _lstm_net(self, tparams, _input, sequence_len, batch_size, embedding_size, mask, proj_size):
287 | input_matrix = tparams['lookup_table'][T.cast(_input.flatten(), dtype="int32")]
288 | input_x = input_matrix.reshape((sequence_len, batch_size, embedding_size))
289 | proj, proj_whole = lstm_layer(tparams, input_x, proj_size, prefix='lstm', mask=mask)
290 | #if useMask == True:
291 | #proj = (proj * mask[:, :, None]).sum(axis=0)
292 | #proj = proj / mask.sum(axis=0)[:, None]
293 | #if options['use_dropout']:
294 | #proj = dropout_layer(proj, use_noise, trng)
295 | return proj, proj_whole
296 |
297 | #state_below is word_embbeding tensor(3dim)
298 | def lstm_layer(tparams, state_below, proj_size, prefix='lstm', mask=None):
299 |     #dim-0 steps, dim-1 samples (batch_size), dim-2 word_embedding
300 | nsteps = state_below.shape[0]
301 | if state_below.ndim == 3:
302 | n_samples = state_below.shape[1]
303 | else:
304 | n_samples = 1
305 |
306 | assert mask is not None
307 |
308 | def _slice(_x, n, dim):
309 | if _x.ndim == 3:
310 | return _x[:, :, n * dim:(n + 1) * dim]
311 | return _x[:, n * dim:(n + 1) * dim]
312 |
313 | #h means hidden output? c means context? so we'll use h?
314 | #rval[0] = [sequence_len, batch_size, proj_size], rval[1] the same
315 |
316 |     #so the preact size must equal x_ (the lstm input slice)
317 |     #if you want to change the lstm h(t) size, 'lstm_U' and 'lstm_b'
318 |     #and preact must be changed to another function, like h*U+b
319 | #see http://colah.github.io/posts/2015-08-Understanding-LSTMs/
320 | #f(t) = sigmoid(Wf * [h(t-1),x(t)] + bf)
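#the other gates and states follow the same pattern (cf. the link above and _step below):
#input gate:   i(t) = sigmoid(Wi * [h(t-1), x(t)] + bi)
#output gate:  o(t) = sigmoid(Wo * [h(t-1), x(t)] + bo)
#cell state:   c(t) = f(t) * c(t-1) + i(t) * tanh(Wc * [h(t-1), x(t)] + bc)
#hidden state: h(t) = o(t) * tanh(c(t))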
321 | def _step(m_, x_, h_, c_):
322 | preact = T.dot(h_, tparams[_p(prefix, 'U')])
323 | preact += x_
324 |
325 | i = T.nnet.sigmoid(_slice(preact, 0, proj_size))
326 | f = T.nnet.sigmoid(_slice(preact, 1, proj_size))
327 | o = T.nnet.sigmoid(_slice(preact, 2, proj_size))
328 | c = T.tanh(_slice(preact, 3, proj_size))
329 |
330 | c = f * c_ + i * c
331 | c = m_[:, None] * c + (1. - m_)[:, None] * c_
332 |
333 | h = o * T.tanh(c)
334 | #if mask(t) == 0 (padding position), then keep h(t) = h(t-1)
335 | h = m_[:, None] * h + (1. - m_)[:, None] * h_
336 |
337 | return h, c
338 |
339 | state_below = (T.dot(state_below, tparams[_p(prefix, 'W')]) +
340 | tparams[_p(prefix, 'b')])
341 |
342 | dim_proj = proj_size
343 | rval, updates = theano.scan(_step,
344 | sequences=[mask, state_below],
345 | outputs_info=[T.alloc(numpy_floatX(0.),
346 | n_samples,
347 | dim_proj),
348 | T.alloc(numpy_floatX(0.),
349 | n_samples,
350 | dim_proj)],
351 | name=_p(prefix, '_layers'),
352 | n_steps=nsteps)
353 | return rval[0], rval[1]
354 |
355 | def _p(pp, name):
356 | return '%s_%s' % (pp, name)
357 |
358 | def train():
359 | batch_size = 256
360 | embedding_size = 100
361 | learning_rate = 0.05
362 | n_epochs = 20000000 #each "epoch" below trains on one mini-batch returned by load_data, not a full pass over the training set
363 | validation_freq = 1000
364 | filter_sizes = [1, 2, 3, 5]
365 | num_filters = 500
366 |
367 | vocab = build_vocab()
368 | word_embeddings = load_word_embeddings(vocab, embedding_size)
369 | trainList = load_train_list()
370 | testList = load_test_list()
371 | train_x1, train_x2, train_x3, mask1, mask2, mask3 = load_data(trainList, vocab, batch_size)
372 | x1, x2, x3 = T.fmatrix('x1'), T.fmatrix('x2'), T.fmatrix('x3')
373 | m1, m2, m3 = T.fmatrix('m1'), T.fmatrix('m2'), T.fmatrix('m3')
374 | model = LSTM(
375 | input1=x1, input2=x2, input3=x3,
376 | mask1=m1, mask2=m2, mask3=m3,
377 | word_embeddings=word_embeddings,
378 | batch_size=batch_size,
379 | sequence_len=train_x1.shape[0], #the row dimension is sequence_len
380 | embedding_size=embedding_size,
381 | filter_sizes=filter_sizes,
382 | num_filters=num_filters)
383 |
384 | cost, cos12, cos13 = model.cost, model.cos12, model.cos13
385 | params, accuracy = model.params, model.accuracy
386 | grads = T.grad(cost, params)
387 | updates = [
388 | (param_i, param_i - learning_rate * grad_i)
389 | for param_i, grad_i in zip(params, grads)
390 | ]
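#plain SGD: every parameter is moved one step against its gradient with a fixed learning rate
#(no momentum or adaptive update rule).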
391 |
392 | p1, p2, p3 = T.fmatrix('p1'), T.fmatrix('p2'), T.fmatrix('p3')
393 | q1, q2, q3 = T.fmatrix('q1'), T.fmatrix('q2'), T.fmatrix('q3')
394 | train_model = theano.function(
395 | [p1, p2, p3, q1, q2, q3],
396 | [cost, accuracy],
397 | updates=updates,
398 | givens={
399 | x1: p1, x2: p2, x3: p3, m1: q1, m2: q2, m3: q3
400 | }
401 | )
402 |
403 | v1, v2, v3 = T.fmatrix('v1'), T.fmatrix('v2'), T.fmatrix('v3')
404 | u1, u2, u3 = T.fmatrix('u1'), T.fmatrix('u2'), T.fmatrix('u3')
405 | validate_model = theano.function(
406 | inputs=[v1, v2, v3, u1, u2, u3],
407 | outputs=[cos12, cos13],
408 | #updates=updates,
409 | givens={
410 | x1: v1, x2: v2, x3: v3, m1: u1, m2: u2, m3: u3
411 | }
412 | )
413 |
414 | epoch = 0
415 | done_looping = False
416 | while (epoch < n_epochs) and (not done_looping):
417 | epoch += 1
418 | train_x1, train_x2, train_x3, mask1, mask2, mask3 = load_data(trainList, vocab, batch_size)
419 | #print('train_x1, train_x2, train_x3')
420 | #print(train_x1.shape, train_x2.shape, train_x3.shape)
421 | cost_ij, acc = train_model(train_x1, train_x2, train_x3, mask1, mask2, mask3)
422 | print('epoch:' + str(epoch) + ', cost:' + str(cost_ij) + ', acc:' + str(acc))
423 | if epoch % validation_freq == 0:
424 | print('Evaluation ......')
425 | validation(validate_model, testList, vocab, batch_size)
426 |
427 | if __name__ == '__main__':
428 | train()
429 |
--------------------------------------------------------------------------------