├── README.md
├── data_process.py
├── data_utils.py
├── params.json
├── sequence_to_sequence.py
├── test_anti.py
├── threadedgenerator.py
├── train_anti.py
└── word_sequence.py

/README.md:
--------------------------------------------------------------------------------
1 | # seq2seq_chatbot
2 | 
3 | ## Requirements
4 | 
5 | | Software   | Version |
6 | | ---------- | ------- |
7 | | python     | 3.5.2   |
8 | | Tensorflow | 1.10.0  |
9 | | CUDA       | 9.0.103 |
10 | | cuDNN      | 7.0     |
11 | 
12 | 
13 | 
14 | ## Run
15 | 
16 | - **Get the corpus**
17 | 
18 |   1. `wget https://lvzhe.oss-cn-beijing.aliyuncs.com/dgk_shooter_min.conv.zip`
19 | 
20 |      Output: dgk_shooter_min.conv.zip
21 | 
22 |   2. `unzip dgk_shooter_min.conv.zip`
23 | 
24 |      Output: dgk_shooter_min.conv (place it under ./data/, where data_process.py expects it)
25 | 
26 | - **Preprocess the text**
27 | 
28 |   1. `python data_process.py`
29 | 
30 |      Output: ./data/chatbot.pkl and ./data/ws.pkl
31 | 
32 | - **Train**
33 | 
34 |   1. `python train_anti.py`
35 | 
36 |      The trained model is saved under ./model
37 | 
38 | - **Test the chatbot**
39 | 
40 |   1. `python test_anti.py`
41 | 
42 | - ![](https://ws3.sinaimg.cn/large/006tNbRwgy1fxo7n94nbfj30id0biacl.jpg)
43 | 
44 | [**Detailed walkthrough on my blog**](https://blog.csdn.net/hl791026701/article/details/84404901)
45 | 
46 | Questions are welcome in the Issues; if you find this project useful, please give it a star!
47 | 
--------------------------------------------------------------------------------
/data_process.py:
--------------------------------------------------------------------------------
1 | import re
2 | import sys
3 | import pickle
4 | from tqdm import tqdm
5 | from word_sequence import WordSequence
6 | 
7 | def make_split(line):
8 |     if re.match(r'.*([,。…?!\.,!? ])$', ''.join(line)):
9 |         return []
10 |     return [',']
11 | 
12 | 
13 | def good_line(line):
14 |     if len(re.findall(r'[a-zA-Z0-9]', ''.join(line))) > 2:  # drop lines with more than two ASCII letters/digits
15 |         return False
16 |     return True
17 | 
18 | def regular(sen):
19 |     sen = re.sub(r'\.{3,100}', '…', sen)  # collapse repeated punctuation
20 |     sen = re.sub(r'…{2,100}', '…', sen)
21 |     sen = re.sub(r'[,]{1,100}', ',', sen)
22 |     sen = re.sub(r'[\.]{1,100}', '。', sen)
23 |     sen = re.sub(r'[\?]{1,100}', '?', sen)
24 |     sen = re.sub(r'[!]{1,100}', '!', sen)
25 | 
26 |     return sen
27 | 
28 | def main(limit=20, x_limit=3, y_limit=6):
29 |     print("extract lines")
30 | 
31 |     fp = open("./data/dgk_shooter_min.conv", 'r', encoding='utf-8')
32 |     groups = []
33 |     group = []
34 | 
35 |     for line in tqdm(fp):
36 |         if line.startswith('M '):
37 |             line = line.replace('\n', '')
38 |             if '/' in line:
39 |                 line = line[2:].split('/')
40 |             else:
41 |                 line = list(line[2:])
42 |             line = line[:-1]
43 | 
44 |             group.append(list(regular(''.join(line))))
45 |         else:
46 |             last_line = None
47 |             if group:
48 |                 groups.append(group)
49 |                 group = []
50 |     print('extract group')
51 | 
52 | 
53 |     x_data = []
54 |     y_data = []
55 | 
56 |     for group in tqdm(groups):
57 |         for i, line in enumerate(group):
58 |             last_line = None
59 |             if i > 0:
60 |                 last_line = group[i - 1]
61 |                 if not good_line(last_line):
62 |                     last_line = None
63 | 
64 |             next_line = None
65 |             if i =y_limit \
101 |                 and len(x)>=x_limit
102 | 
103 |     ]
104 |     x_data, y_data = zip(*data)
105 | 
106 |     print('fit word_sequence')
107 | 
108 |     ws_input = WordSequence()
109 |     ws_input.fit(x_data + y_data)
110 | 
111 |     print('dump')
112 | 
113 |     pickle.dump(
114 |         (x_data, y_data),
115 |         open('./data/chatbot.pkl', 'wb')
116 |     )
117 |     pickle.dump(ws_input, open('./data/ws.pkl', 'wb'))
118 |     print('done')
119 | 
120 | 
121 | 
122 | 
123 | if __name__ == '__main__':
124 |     main()
125 | 
126 | 
--------------------------------------------------------------------------------
/data_utils.py:
--------------------------------------------------------------------------------
1 | import random
2 | import numpy as np
3 | from tensorflow.python.client import device_lib
4 | from word_sequence import WordSequence
5 | 
6 | 
7 | VOCAB_SIZE_THRESHOLD_CPU = 50000
8 | 
9 | # list the GPUs that are visible to TensorFlow
10 | def _get_available_gpus():
11 |     local_device_protos = device_lib.list_local_devices()
12 |     return [x.name for x in local_device_protos if x.device_type == "GPU"]
13 | 
14 | # choose the device for the embedding variables based on the vocabulary size
15 | def _get_embed_device(vocab_size):
16 |     gpus = _get_available_gpus()
17 |     if not gpus or vocab_size > VOCAB_SIZE_THRESHOLD_CPU:
18 |         return "/cpu:0"
19 |     return "/gpu:0"
20 | 
21 | # encode a single sentence
22 | def transorform_sentence(sentence, ws, max_len=None, add_end=False):
23 |     encoded = ws.transform(
24 |         sentence,
25 |         max_len=max_len if max_len is not None else len(sentence))
26 | 
27 |     encoded_len = len(sentence) + (1 if add_end else 0)
28 |     if encoded_len > len(encoded):
29 |         encoded_len = len(encoded)
30 | 
31 |     # e.g. encoded=[4,4,5,6], encoded_len=4
32 |     return encoded, encoded_len
33 | 
34 | 
35 | 
36 | def batch_flow(data, ws, batch_size, raw=False, add_end=True):
37 |     """
38 |     Randomly sample batch_size items from the data, encode them and yield the batch.
39 | 
40 |     raw: whether to also yield the raw (un-encoded) objects. If True, len(ret) == len(data) * 3;
41 |     if False, len(ret) == len(data) * 2.
42 | 
43 |     Q = (q1, q2, q3, ... qn)
44 |     A = (a1, a2, a3, ... an)
45 |     len(Q) == len(A)
46 |     batch_flow([Q, A], ws, batch_size=32)
47 | 
48 |     raw == False:
49 |     next(generator) == q_i_encoded, q_i_len, a_i_encoded, a_i_len
50 |     raw == True:
51 |     next(generator) == q_i_encoded, q_i_len, q_i, a_i_encoded, a_i_len, a_i
52 |     """
53 |     # when several WordSequence objects are passed, len(ws) must equal len(data)
54 |     all_data = list(zip(*data))
55 |     if isinstance(ws, (list, tuple)):
56 |         assert len(ws) == len(data), "len(ws) must equal len(data) when ws is a list or tuple"
57 | 
58 |     if isinstance(add_end, bool):
59 |         add_end = [add_end] * len(data)
60 |     else:
61 |         assert(isinstance(add_end, (list, tuple))), 'if add_end is not a bool it must be a list (tuple) of bools'
62 |         assert len(add_end) == len(data), 'if add_end is a list (tuple), it must have the same length as data'
63 | 
64 |     mul = 2
65 |     if raw:
66 |         mul = 3
67 | 
68 |     while True:
69 |         data_batch = random.sample(all_data, batch_size)  # sample batch_size items from all_data
70 |         batches = [[] for i in range(len(data) * mul)]
71 | 
72 |         max_lens = []
73 |         for j in range(len(data)):
74 |             max_len = max([
75 |                 len(x[j]) if hasattr(x[j], '__len__') else 0
76 |                 for x in data_batch
77 |             ]) + (1 if add_end[j] else 0)
78 |             max_lens.append(max_len)  # longest sequence of this field in the batch
79 | 
80 |         for d in data_batch:
81 |             for j in range(len(data)):
82 |                 if isinstance(ws, (list, tuple)):
83 |                     w = ws[j]
84 |                 else:
85 |                     w = ws
86 | 
87 |                 # append the end-of-sequence tag
88 |                 line = d[j]
89 |                 if add_end[j] and isinstance(line, (tuple, list)):
90 |                     line = list(line) + [WordSequence.END_TAG]
91 | 
92 |                 if w is not None:
93 |                     x, xl = transorform_sentence(line, w, max_lens[j], add_end[j])
94 |                     batches[j * mul].append(x)
95 |                     batches[j * mul + 1].append(xl)
96 |                 else:
97 |                     batches[j * mul].append(line)
98 |                     batches[j * mul + 1].append(line)
99 | 
100 |                 if raw:
101 |                     batches[j * mul + 2].append(line)
102 | 
103 |         batches = [np.asarray(x) for x in batches]
104 |         yield batches
105 | 
106 | 
107 | def batch_flow_bucket(data, ws, batch_size, raw=False, add_end=True, n_buckets=5, bucket_ind=1, debug=False):
108 |     # bucket_ind: which field's length is used to assign buckets
109 |     # n_buckets: how many buckets the data is split into
110 |     all_data = list(zip(*data))
111 |     lengths = sorted(list(set([len(x[bucket_ind]) for x in all_data])))
112 |     if n_buckets > len(lengths):
113 |         n_buckets = len(lengths)
114 |     splits = np.array(lengths)[
115 |         (np.linspace(0, 1, n_buckets, endpoint=False) * len(lengths)).astype(int)
116 |     ].tolist()
117 | 
118 |     splits += [np.inf]  # the last bucket has no upper bound
119 | 
120 |     if debug:
121 | 
print(splits) 122 | 123 | ind_data = {} 124 | for x in all_data: 125 | l = len(x[bucket_ind]) 126 | for ind, s in enumerate(splits[:-1]): 127 | if l >= s and l <= splits[ind + 1]: 128 | if ind not in ind_data: 129 | ind_data[ind] = [] 130 | ind_data[ind].append(x) 131 | break 132 | 133 | inds = sorted(list(ind_data.keys())) 134 | ind_p = [len(ind_data[x]) / len(all_data) for x in inds] 135 | if debug: 136 | print(np.sum(ind_p), ind_p) 137 | if isinstance(ws, (list,tuple)): 138 | assert len(ws) == len(data), "len(wa) 必须等于len(data), ws是list或者是tuple" 139 | if isinstance(add_end, bool): 140 | add_end = [add_end] * len(data) 141 | else: 142 | assert(isinstance(add_end, (list, tuple))), "add_end 不是 boolan, 就应该是一个list(tuple) of boolan" 143 | assert len(add_end) ==len(data), "如果add_end 是 list(tuple), 那么add_end的长度应该和输入数据长度是一样" 144 | mul = 2 145 | if raw: 146 | mul =3 147 | 148 | while True: 149 | choice_ind = np.random.choice(inds,p=ind_p) 150 | if debug: 151 | print("choice_ind", choice_ind) 152 | data_batch = random.sample(ind_data[choice_ind], batch_size) 153 | batches = [[] for i in range(len(data) * mul)] 154 | 155 | max_lens =[] 156 | for j in range(len(data)): 157 | max_len = max([ 158 | len(x[j]) if hasattr(x[j], '__len__') else 0 159 | for x in data_batch 160 | ]) + (1 if add_end[j] else 0) 161 | 162 | max_lens.append(max_len) 163 | 164 | for d in data_batch: 165 | for j in range(len(data)): 166 | if isinstance(ws,(list,tuple)): 167 | w = ws[j] 168 | else: 169 | w = ws 170 | 171 | #添加结尾 172 | line = d[j] 173 | if add_end[j] and isinstance(line,(tuple,list)): 174 | line = list(line) + [WordSequence.END_TAG] 175 | 176 | if w is not None: 177 | x, xl = transorform_sentence(line, w, max_lens[j], add_end[j]) 178 | batches[j * mul].append(x) 179 | batches[j * mul +1].append(xl) 180 | else: 181 | batches[j * mul].append(line) 182 | batches[j * mul +1].append(line) 183 | 184 | if raw: 185 | batches[j * mul +2].append(line) 186 | 187 | batches = [np.asarray(x) for x in batches] 188 | 189 | yield batches 190 | 191 | 192 | 193 | def test_batch_flow(): 194 | from fake_data import generate 195 | x_data, y_data, ws_input, ws_target = generate(size=10000) 196 | flow = batch_flow([x_data, y_data], [ws_input, ws_target], 4) 197 | x, xl, y, yl = next(flow) 198 | print(x.shape, y.shape, xl.shape, yl.shape) 199 | 200 | def test_batch_flow_bucket(): 201 | from fake_data import generate 202 | x_data, y_data, ws_input, ws_target = generate(size=10000) 203 | flow = batch_flow_bucket([x_data, y_data], [ws_input, ws_target], 4, debug=True) 204 | for _ in range(10): 205 | x, xl, y, yl = next(flow) 206 | print(x.shape, y.shape, xl.shape, yl.shape) 207 | 208 | 209 | if __name__=="__main__": 210 | test_batch_flow_bucket() 211 | 212 | 213 | -------------------------------------------------------------------------------- /params.json: -------------------------------------------------------------------------------- 1 | { 2 | "bidirectional" : true, 3 | "use_residual" : false, 4 | "use_dropout" : true, 5 | "time_major" : false, 6 | "cell_type" : "lstm", 7 | "depth" : 16, 8 | "attention_type" : "Bahdanau", 9 | "hidden_units": 128, 10 | "optimizer" : "adam", 11 | "learning_rate" : 0.001, 12 | "embedding_size" :300 13 | } -------------------------------------------------------------------------------- /sequence_to_sequence.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tensorflow import layers 4 | 5 | from tensorflow.python.ops 
import array_ops 6 | from tensorflow.contrib import seq2seq 7 | from tensorflow.contrib.seq2seq import BahdanauAttention 8 | from tensorflow.contrib.seq2seq import LuongAttention 9 | from tensorflow.contrib.seq2seq import AttentionWrapper 10 | from tensorflow.contrib.seq2seq import BeamSearchDecoder 11 | 12 | from tensorflow.contrib.rnn import LSTMCell 13 | from tensorflow.contrib.rnn import GRUCell 14 | from tensorflow.contrib.rnn import MultiRNNCell #实现多层RNN 15 | from tensorflow.contrib.rnn import DropoutWrapper #drop网络 16 | from tensorflow.contrib.rnn import ResidualWrapper #残差网络 就是把输入concat到输出上一起返回 17 | 18 | from word_sequence import WordSequence 19 | from data_utils import _get_embed_device 20 | 21 | class SequenceToSequence(object): 22 | """ 23 | 基本流程 24 | __init__ 基本参数保存,参数验证(验证参数的合法性) 25 | build_model 构建模型 26 | init_placeholders 初始化一些Tensorflow的变量占位符 27 | build_encoder 初始化编码器 28 | build_single_cell 29 | build_decoder_cell 30 | init_optimizer 如果是在训练模式下进行, 那么则需要初始化优化器 31 | train 训练一个batch 数据 32 | predict 预测一个batch数据 33 | """ 34 | def __init__(self, # 35 | input_vocab_size, #输入词表的大小 36 | target_vocab_size, #输出词表的大小 37 | batch_size=32, #数据batch的大小 38 | embedding_size=300, #输入词表与输出词表embedding的维度 39 | mode="train", #取值为train, 代表训练模式, 取值为decide,代表预训练模式 40 | hidden_units=256, #Rnn模型的中间层大小,encoder和decoder层相同 41 | depth=1, #encoder和decoder的rnn层数 42 | beam_width=0, #是beamsearch的超参数,用于解码 43 | cell_type="lstm", #rnn的神经元类型, lstm, gru 44 | dropout=0.2, #随机丢弃数据的比例,是要0到1之间 45 | use_dropout=False, #是否使用dropout 46 | use_residual=False, #是否使用residual 47 | optimizer='adam', #使用哪一个优化器 48 | learning_rate=1e-3, #学习率 49 | min_learning_rate=1e-5, #最小学习率 50 | decay_steps=50000, #衰减步数 51 | max_gradient_norm=5.0, #梯度正则裁剪的系数 52 | max_decode_step=None, #最大decode长度, 可以非常大 53 | attention_type='Bahdanau', #使用attention类型 54 | bidirectional=False, #是否使用双向encoder 55 | time_major=False, #是否在计算过程中使用时间作为主要的批量数据 56 | seed=0, #一些层间的操作的随机数 57 | parallel_iterations=None, #并行执行rnn循环的个数 58 | share_embedding=False, #是否让encoder和decoder共用一个embedding 59 | pretrained_embedding=False): #是不是要使用预训练的embedding 60 | self.input_vocab_size = input_vocab_size 61 | self.target_vocab_size = target_vocab_size 62 | self.batch_size = batch_size 63 | self.embedding_size = embedding_size 64 | self.hidden_units = hidden_units 65 | self.depth = depth 66 | self.cell_type = cell_type.lower() 67 | self.use_dropout = use_dropout 68 | self.use_residual = use_residual 69 | self.attention_type = attention_type 70 | self.mode = mode 71 | self.optimizer = optimizer 72 | self.learning_rate = learning_rate 73 | self.min_learning_rate = min_learning_rate 74 | self.decay_steps = decay_steps 75 | self.max_gradient_norm = max_gradient_norm 76 | self.keep_prob = 1.0 -dropout 77 | self.seed = seed 78 | self.pretrained_embedding = pretrained_embedding 79 | self.bidirectional = bidirectional 80 | 81 | if isinstance(parallel_iterations, int): 82 | self.parallel_iterations= parallel_iterations 83 | else: 84 | self.parallel_iterations = batch_size 85 | self.time_major = time_major 86 | self.share_embedding = share_embedding 87 | #生成均匀分布的随机数 用于变量初始化 88 | self.initializer = tf.random_uniform_initializer( 89 | -0.05, 0.05, dtype=tf.float32 90 | ) 91 | assert self.cell_type in ('gru', 'lstm'), 'cell_type 应该是GRU 或者是 LSTM' 92 | 93 | if share_embedding: 94 | assert input_vocab_size == target_vocab_size, '如果share_embedding 为True 那么两个vocab_size 必须一样' 95 | assert mode in ('train', 'decode'), 'mode 必须是train 或者是decode , 而不是{}'.format(mode) 96 | 97 | assert dropout >=0.0 and dropout< 
1.0, 'dropout 必须大于等于0 且小于等于1' 98 | 99 | assert attention_type.lower() in ('bahdanau', 'loung'), 'attention_type 必须是bahdanau 或者是 loung' 100 | 101 | assert beam_width < target_vocab_size, 'beam_width {} 应该小于target_vocab_size{}'.format(beam_width,target_vocab_size) 102 | 103 | self.keep_prob_placeholder = tf.placeholder( 104 | tf.float32, 105 | shape=[], 106 | name='keep_prob' 107 | ) 108 | self.global_step = tf.Variable( 109 | 0, trainable = False, name = 'global_step' 110 | ) 111 | 112 | self.use_beamsearch_decode = False 113 | self.beam_width = beam_width 114 | self.use_beamsearch_decode = True if self.beam_width > 0 else False 115 | self.max_decode_step = max_decode_step 116 | 117 | assert self.optimizer.lower() in ('adadelta', 'adam', 'rmsprop', 'momentum', 'sgd'), \ 118 | 'optimizer 必须是下列之一: adadelta, adam, rmsprop, momentum, sgd ' 119 | self.build_model() 120 | 121 | def build_model(self): 122 | """ 123 | 1. 初始化训练, 预测所需要的变量 124 | 2. 构建编码器(encoder) build_encoder -> encoder_cell -> build_signal_cell 125 | 3. 构建解码器(decoder) 126 | 4. 构建优化器(optimizer) 127 | 5. 保存 128 | """ 129 | self.init_placeholders() 130 | encoder_outputs, encoder_state = self.build_encoder() 131 | self.build_decoder(encoder_outputs, encoder_state) 132 | 133 | if self.mode == 'train': 134 | self.init_optimizer() 135 | 136 | self.saver = tf.train.Saver() 137 | 138 | def init_placeholders(self): 139 | """初始化训练,初始化所需要的变量 """ 140 | self.add_loss = tf.placeholder( 141 | dtype=tf.float32, 142 | name='add_loss' 143 | ) 144 | #编码器的输入 145 | # 编码器输入,shape=(batch_size, time_step) 146 | # 有 batch_size 句话,每句话是最大长度为 time_step 的 index 表示 147 | self.encoder_inputs = tf.placeholder( 148 | dtype=tf.int32, 149 | shape=(self.batch_size,None), 150 | name='encoder_inputs' 151 | ) 152 | #编码器的长度输入 153 | # 编码器长度输入,shape=(batch_size, 1) 154 | # 指的是 batch_size 句话每句话的长度 155 | self.encoder_inputs_length = tf.placeholder( 156 | dtype = tf.int32, 157 | shape=(self.batch_size, ), 158 | name = 'encoder_inputs_length' 159 | ) 160 | if self.mode =='train': 161 | 162 | #解码器的输入 163 | # 解码器输入,shape=(batch_size, time_step) 164 | # 注意,会默认里面已经在每句结尾包含 165 | self.decoder_inputs = tf.placeholder( 166 | dtype = tf.int32, 167 | shape=(self.batch_size, None), 168 | name = 'decoder_inputs' 169 | ) 170 | #解码器输入的rewards 用于强化学习训练,shape=(batch_size, time_step) 171 | self.rewards = tf.placeholder( 172 | dtype = tf.float32, 173 | shape=(self.batch_size, 1), 174 | name='rewards' 175 | ) 176 | 177 | #解码器的长度输入 178 | self.decoder_inputs_length = tf.placeholder( 179 | dtype = tf.int32, 180 | shape=(self.batch_size,), 181 | name ='decoder_inputs_length' 182 | ) 183 | 184 | self.decoder_start_token = tf.ones( 185 | shape=(self.batch_size, 1), 186 | dtype=tf.int32 187 | ) * WordSequence.START 188 | 189 | #实际训练时解码器的输入, start_token + decoder_inputs 190 | self.decoder_inputs_train = tf.concat([ 191 | self.decoder_start_token, 192 | self.decoder_inputs 193 | ],axis=1) 194 | 195 | 196 | def build_signle_cell(self, n_hidden, use_residual): 197 | """ 198 | 构建一个单独的 RNNCell 199 | n_hidden : 隐藏层的神经元数量 200 | use_residiual : 是否使用residual wrapper 201 | """ 202 | 203 | if self.cell_type == 'gru': 204 | cell_type = GRUCell 205 | else: 206 | cell_type = LSTMCell 207 | 208 | cell = cell_type(n_hidden) 209 | if self.use_dropout: 210 | cell = DropoutWrapper( 211 | cell, 212 | dtype = tf.float32, 213 | output_keep_prob = self.keep_prob_placeholder, 214 | seed = self.seed 215 | ) 216 | 217 | if use_residual: 218 | cell = ResidualWrapper(cell) 219 | 220 | return cell 221 | 222 | def 
build_encoder_cell(self): 223 | """构建单独的编码器 """ 224 | # 通过MultiRNNCells类来实现Deep RNN 225 | return MultiRNNCell([ 226 | self.build_signle_cell(self.hidden_units, use_residual=self.use_residual) for _ in range(self.depth) 227 | ]) 228 | 229 | def feed_embedding(self, sess, encoder=None, decoder=None): 230 | """ 231 | 加载预训练好embedding 232 | """ 233 | assert self.pretrained_embedding, '必须开启pretrained_embedding才能使用feed_embedding' 234 | assert encoder is not None or decoder is not None, 'encoder 和 decoder 至少得输入一个!' 235 | 236 | if encoder is not None: 237 | sess.run(self.encoder_embeddings_init, 238 | {self.encoder_embeddings_placeholder: encoder}) 239 | 240 | if decoder is not None: 241 | sess.run(self.decoder_embeddings_init, 242 | {self.decoder_embeddings_placeholder: decoder}) 243 | 244 | 245 | def build_encoder(self): 246 | """ 构建编码器""" 247 | 248 | with tf.variable_scope('encoder'): #变量命名空间 ,实现变量共享 249 | encoder_cell = self.build_encoder_cell() 250 | 251 | with tf.device(_get_embed_device(self.input_vocab_size)): #判断使用显存还是内存 252 | if self.pretrained_embedding: 253 | self.encoder_embeddings = tf.Variable( 254 | tf.constant(0.0,shape=(self.input_vocab_size, self.embedding_size)), trainable=True, name = 'embeddings' 255 | ) 256 | 257 | self.encoder_embeddings_placeholder = tf.placeholder( 258 | tf.float32, 259 | (self.input_vocab_size, self.embedding_size) 260 | ) 261 | self.encoder_embeddings_init = self.encoder_embeddings.assign( #赋值操作 262 | self.encoder_embeddings_placeholder 263 | ) 264 | else: 265 | self.encoder_embeddings = tf.get_variable( 266 | name='embedding', 267 | shape=(self.input_vocab_size, self.embedding_size), 268 | initializer = self.initializer, 269 | dtype = tf.float32 270 | ) 271 | 272 | self.encoder_inputs_embedded = tf.nn.embedding_lookup( #函数是在params中查找ids的表示 273 | #这里是在二维embeddings中找二维的ids, ids每一行中的一个数对应embeddings中的一行,所以最后是[batch_size, time_step, embedding_size] 274 | params=self.encoder_embeddings, 275 | ids=self.encoder_inputs 276 | ) 277 | if self.use_residual: 278 | #全连接层 279 | self.encoder_inputs_embedded = layers.dense(self.encoder_inputs_embedded, 280 | self.hidden_units, 281 | use_bias = False, 282 | name='encoder_residual_projection') 283 | inputs = self.encoder_inputs_embedded 284 | if self.time_major: 285 | inputs = tf.transpose(inputs,(1,0,2)) 286 | 287 | if not self.bidirectional: 288 | (encoder_outputs,encoder_state) = tf.nn.dynamic_rnn( 289 | cell=encoder_cell, 290 | inputs = inputs, 291 | sequence_length = self.encoder_inputs_length, 292 | dtype = tf.float32, 293 | time_major = self.time_major, 294 | parallel_iterations = self.parallel_iterations, 295 | swap_memory=False 296 | ) 297 | else: 298 | encoder_cell_bw = self.build_encoder_cell() 299 | ( 300 | (encoder_fw_outputs, encoder_bw_outputs), 301 | (encoder_fw_state, encoder_bw_state) 302 | ) = tf.nn.bidirectional_dynamic_rnn( #动态多层双向lstm_rnn 303 | cell_fw=encoder_cell, 304 | cell_bw = encoder_cell_bw, 305 | inputs = inputs, 306 | sequence_length = self.encoder_inputs_length, 307 | dtype=tf.float32, 308 | time_major=self.time_major, 309 | parallel_iterations=self.parallel_iterations, 310 | swap_memory = True 311 | ) 312 | encoder_outputs = tf.concat([encoder_fw_outputs, encoder_bw_outputs], 2) 313 | 314 | encoder_state = [] 315 | for i in range(self.depth): 316 | encoder_state.append(encoder_fw_state[i]) 317 | encoder_state.append(encoder_bw_state[i]) 318 | encoder_state = tuple(encoder_state) 319 | 320 | return encoder_outputs, encoder_state 321 | 322 | 323 | def build_decoder_cell(self,encoder_outputs, 
encoder_state): 324 | """ 构建解码器cell """ 325 | encoder_inputs_length = self.encoder_inputs_length 326 | batch_size = self.batch_size 327 | 328 | if self.bidirectional: 329 | encoder_state = encoder_state[-self.depth:] 330 | 331 | if self.time_major: 332 | encoder_outputs = tf.transpose(encoder_outputs, (1, 0, 2)) 333 | 334 | if self.use_beamsearch_decode: 335 | encoder_outputs = seq2seq.tile_batch( 336 | encoder_outputs, multiplier = self.beam_width 337 | ) 338 | encoder_state = seq2seq.tile_batch( 339 | encoder_state, multiplier=self.beam_width 340 | ) 341 | encoder_inputs_length = seq2seq.tile_batch( 342 | self.encoder_inputs_length, multiplier=self.beam_width 343 | ) 344 | #如果使用了beamsearch, 那么输入应该是beam_width的倍数等于batch_size的 345 | batch_size *= self.beam_width 346 | 347 | if self.attention_type.lower() == 'luong': 348 | self.attention_mechanism = LuongAttention( 349 | num_units = self.hidden_units, 350 | memory = encoder_outputs, 351 | memory_sequence_length = encoder_inputs_length 352 | ) 353 | else: 354 | #BahdanauAttention 就是初始化时传入 num_units 以及 Encoder Outputs,然后调时传入 query 用即可得到权重变量 alignments。 355 | self.attention_mechanism = BahdanauAttention( 356 | num_units = self.hidden_units, 357 | memory = encoder_outputs, 358 | memory_sequence_length = encoder_inputs_length 359 | ) 360 | 361 | cell = MultiRNNCell([ self.build_signle_cell(self.hidden_units, use_residual=self.use_residual) for _ in range(self.depth) ]) 362 | # 在非训练(预测)模式,并且没开启 beamsearch 的时候,打开 attention 历史信息 363 | alignment_history = ( 364 | self.mode != 'train' and not self.use_beamsearch_decode 365 | ) 366 | 367 | def cell_input_fn(inputs, attention): 368 | """ 根据attn_input_feeding属性来判断是否在attention计算前进行一次投影的计算""" 369 | if not self.use_residual: 370 | return array_ops.concat([inputs, attention], -1) 371 | 372 | attn_projection = layers.Dense(self.hidden_units, 373 | dtype = tf.float32, 374 | use_bias=False, 375 | name='attention_cell_input_fn') 376 | return attn_projection(array_ops.concat([inputs, attention], -1)) 377 | 378 | attention_cell = AttentionWrapper( 379 | cell = cell, 380 | attention_mechanism = self.attention_mechanism, 381 | attention_layer_size= self.hidden_units, 382 | alignment_history = alignment_history, 383 | cell_input_fn = cell_input_fn, 384 | name = 'AttentionWrapper' 385 | ) 386 | # 空状态 387 | decoder_initial_state = attention_cell.zero_state(batch_size, tf.float32) 388 | 389 | #传递encoder的状态 定义decoder阶段的初始化状态,直接使用encoder阶段的最后一个隐层状态进行赋值 390 | decoder_initial_state = decoder_initial_state.clone( 391 | cell_state = encoder_state 392 | ) 393 | return attention_cell, decoder_initial_state 394 | 395 | def build_decoder(self, encoder_outputs, encoder_state): 396 | """ 397 | 构建解码器 398 | """ 399 | with tf.variable_scope('decoder') as decoder_scope: 400 | ( 401 | self.decoder_cell, 402 | self.decoder_initial_state 403 | ) = self.build_decoder_cell(encoder_outputs, encoder_state) 404 | #构建解码器的embedding 405 | with tf.device(_get_embed_device(self.target_vocab_size)): 406 | if self.share_embedding: 407 | self.decoder_embeddings = self.encoder_embeddings 408 | elif self.pretrained_embedding: 409 | 410 | self.decoder_embeddings = tf.Variable( 411 | tf.constant(0.0, shape=(self.target_vocab_size, self.embedding_size) 412 | ), 413 | trainable = True, 414 | name = 'embeddings' 415 | ) 416 | self.decoder_embeddings_placeholder = tf.placeholder( 417 | tf.float32, 418 | (self.target_vocab_size, self.embedding_size) 419 | ) 420 | self.decoder_embeddings_init = self.decoder_embeddings.assign(self.decoder_embeddings_placeholder) 
421 | else: 422 | self.decoder_embeddings = tf.get_variable( 423 | name = 'embeddings', 424 | shape=(self.target_vocab_size, self.embedding_size), 425 | initializer = self.initializer, 426 | dtype = tf.float32 427 | ) 428 | self.decoder_output_projection = layers.Dense( 429 | self.target_vocab_size, 430 | dtype = tf.float32, 431 | use_bias=False, 432 | name= 'decoder_output_projection' 433 | ) 434 | 435 | if self.mode == 'train': 436 | self.decoder_inputs_embedded= tf.nn.embedding_lookup( 437 | params=self.decoder_embeddings, 438 | ids = self.decoder_inputs_train 439 | ) 440 | 441 | inputs = self.decoder_inputs_embedded 442 | if self.time_major: 443 | inputs = tf.transpose(inputs, (1, 0, 2)) 444 | 445 | training_helper = seq2seq.TrainingHelper( 446 | #根据预测值或者真实值得到下一刻的输入 447 | inputs = inputs, 448 | sequence_length = self.decoder_inputs_length, 449 | time_major = self.time_major, 450 | name='training_helper' 451 | ) 452 | # 训练的时候不在这里应用 output_layer 453 | # 因为这里会每个 time_step 的进行 output_layer 的投影计算,比较慢 454 | # 注意这个trick要成功必须设置 dynamic_decode 的 scope 参数 455 | training_decoder = seq2seq.BasicDecoder( 456 | cell= self.decoder_cell, 457 | helper=training_helper, 458 | initial_state=self.decoder_initial_state 459 | #output_layer = self.decoder_output_projection #输出映射层,将rnn_size转化为vocab_size维 460 | ) 461 | #decoder在当前的batch下的最大time_steps 462 | max_decoder_length = tf.reduce_max(self.decoder_inputs_length) 463 | 464 | outputs, self.final_state, _ = seq2seq.dynamic_decode( 465 | decoder=training_decoder, 466 | output_time_major=self.time_major, 467 | impute_finished=True, #Boolean,为真时会拷贝最后一个时刻的状态并将输出置零,程序运行更稳定,使最终状态和输出具有正确的值,在反向传播时忽略最后一个完成步。但是会降低程序运行速度。 468 | maximum_iterations=max_decoder_length, #最大解码步数,一般训练设置为decoder_inputs_length,预测时设置一个想要的最大序列长度即可。程序会在产生或者到达最大步数处停止 469 | parallel_iterations=self.parallel_iterations, #parallel_iterations是并行执行循环的个数 470 | swap_memory=True, 471 | scope=decoder_scope 472 | ) 473 | 474 | self.decoder_logits_train = self.decoder_output_projection( 475 | outputs.rnn_output 476 | 477 | ) 478 | self.masks = tf.sequence_mask( 479 | #构建序列长度的mask标志 480 | lengths = self.decoder_inputs_length, 481 | maxlen = max_decoder_length, 482 | dtype = tf.float32, 483 | name='masks' 484 | ) 485 | 486 | decoder_logits_train = self.decoder_logits_train 487 | if self.time_major: 488 | decoder_logits_train = tf.transpose(decoder_logits_train, (1, 0, 2)) 489 | 490 | self.decoder_pred_train = tf.argmax( 491 | decoder_logits_train, 492 | axis = -1, 493 | name= 'decoder_pred_train' 494 | ) 495 | 496 | self.train_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits( 497 | labels = self.decoder_inputs, #真实值y 498 | logits = decoder_logits_train #预测值y_ 499 | ) 500 | 501 | self.masks_rewards = self.masks * self.rewards 502 | 503 | self.loss_rewards = seq2seq.sequence_loss( 504 | logits = decoder_logits_train, #[batch_size, sequence_length, num_decoder_symbols] 505 | targets = self.decoder_inputs, #[batch_size, sequence_length] 不用做one_hot 506 | weights = self.masks_rewards, #[batch_size, sequence_length] 即mask,滤去padding的loss计算,使loss计算更准确。 507 | average_across_timesteps=True, 508 | average_across_batch=True 509 | ) 510 | 511 | self.loss = seq2seq.sequence_loss( 512 | #序列的损失函数 513 | logits=decoder_logits_train, #[batch_size, sequence_length, num_decoder_symbols] 514 | targets = self.decoder_inputs, #[batch_size, sequence_length] 不用做one_hot 515 | weights = self.masks, # 即mask,滤去padding的loss计算,使loss计算更准确。 516 | average_across_timesteps=True, 517 | average_across_batch = True 518 | ) 519 | 520 | 
self.loss_add = self.loss + self.add_loss 521 | 522 | elif self.mode == 'decode': 523 | start_tokens = tf.tile([WordSequence.START],[self.batch_size]) 524 | end_token = WordSequence.END 525 | 526 | def embed_and_input_proj(inputs): 527 | return tf.nn.embedding_lookup(self.decoder_embeddings, inputs) 528 | 529 | if not self.use_beamsearch_decode: 530 | decoding_helper = seq2seq.GreedyEmbeddingHelper( 531 | start_tokens= start_tokens, 532 | end_token=end_token, 533 | embedding = embed_and_input_proj 534 | ) 535 | 536 | inference_decoder = seq2seq.BasicDecoder( 537 | cell = self.decoder_cell, 538 | helper=decoding_helper, 539 | initial_state = self.decoder_initial_state, 540 | output_layer = self.decoder_output_projection 541 | ) 542 | else: 543 | inference_decoder = BeamSearchDecoder( 544 | cell=self.decoder_cell, 545 | embedding=embed_and_input_proj, 546 | start_tokens = start_tokens, 547 | end_token = end_token, 548 | initial_state=self.decoder_initial_state, 549 | beam_width=self.beam_width, 550 | output_layer=self.decoder_output_projection 551 | ) 552 | 553 | if self.max_decode_step is not None: 554 | max_decoder_step = self.max_decode_step 555 | else: 556 | max_decoder_step = tf.round(tf.reduce_max(self.encoder_inputs_length) * 4) 557 | 558 | self.decoder_outputs_decode, self.final_state, _= seq2seq.dynamic_decode( 559 | decoder = inference_decoder, 560 | output_time_major=self.time_major, 561 | maximum_iterations = max_decoder_step, 562 | parallel_iterations=self.parallel_iterations, 563 | swap_memory=True, 564 | scope=decoder_scope 565 | ) 566 | 567 | if not self.use_beamsearch_decode: 568 | dod = self.decoder_outputs_decode 569 | self.decoder_pred_decode = dod.sample_id 570 | if self.time_major: 571 | self.decoder_pred_decode = tf.transpose( 572 | self.decoder_pred_decode, 573 | (1, 0)) 574 | else: 575 | self.decoder_pred_decode = self.decoder_outputs_decode.predicted_ids 576 | 577 | if self.time_major: 578 | self.decoder_pred_decode = tf.transpose( 579 | self.decoder_pred_decode, 580 | (1, 0, 2) 581 | ) 582 | self.decoder_pred_decode = tf.transpose( 583 | self.decoder_pred_decode, 584 | perm=[0, 2, 1] 585 | ) 586 | dod = self.decoder_outputs_decode 587 | self.beam_prob = dod.beam_search_decoder_output.scores 588 | 589 | def save(self, sess, save_path='model.ckpt'): 590 | """ 591 | 在tensorflow游两种保存模型: 592 | ckpt: 训练模型后保存, 这里会保存所有的训练参数, 文件相对来讲较大, 可以用来进行模型的恢复和加载 593 | pd: 用于模型最后的线上部署, 这里面的线上部署指的是Tensorflow Serving 进行模型发布, 一般发布成grpc形式的接口 594 | """ 595 | self.saver.save(sess, save_path=save_path) 596 | 597 | def load(self, sess, save_path='model.ckpt'): 598 | print('try load model from', save_path) 599 | self.saver.restore(sess, save_path) 600 | 601 | 602 | def init_optimizer(self): 603 | """ 604 | sgd, adadelta, adam, rmsprop, momentum 605 | """ 606 | learning_rate = tf.train.polynomial_decay( 607 | #多项式衰减 608 | self.learning_rate, 609 | self.global_step, 610 | self.decay_steps, 611 | self.min_learning_rate, 612 | power=0.5 613 | ) 614 | 615 | self.current_learning_rate = learning_rate 616 | #返回需要训练的参数列表 trainalbe=True 617 | trainable_params = tf.trainable_variables() 618 | #设置优化器 619 | if self.optimizer.lower() == 'adadelta': 620 | self.opt = tf.train.AdadeltaOptimizer( 621 | learning_rate = learning_rate 622 | ) 623 | elif self.optimizer.lower() == 'adam': 624 | self.opt = tf.train.AdamOptimizer( 625 | learning_rate = learning_rate 626 | ) 627 | elif self.optimizer.lower() == 'rmsprop': 628 | self.opt = tf.train.RMSPropOptimizer( 629 | learning_rate= learning_rate 630 | ) 631 | elif 
self.optimizer.lower() == 'momentum': 632 | self.opt = tf.train.MomentumOptimizer( 633 | learning_rate = learning_rate, momentum=0.9 634 | ) 635 | elif self.optimizer.lower() == 'sgd': 636 | self.opt = tf.train.GradientDescentOptimizer( 637 | learning_rate=learning_rate 638 | ) 639 | 640 | gradients = tf.gradients(ys=self.loss, xs=trainable_params) #函数列表ys里的每一个函数对xs中的每一个变量求偏导,返回一个梯度张量的列表 641 | 642 | #梯度裁剪 放置梯度爆炸 643 | clip_gradients, _ = tf.clip_by_global_norm( 644 | gradients, self.max_gradient_norm 645 | ) 646 | #更新model 647 | self.updates = self.opt.apply_gradients( 648 | #进行BP算法 649 | #由于apply_gradients函数接收的是一个(梯度张量, 变量)tuple列表 650 | #所以要将梯度列表和变量列表进行捉对组合,用zip函数 651 | zip(clip_gradients, trainable_params), 652 | global_step = self.global_step 653 | ) 654 | 655 | gradients = tf.gradients(self.loss_rewards, trainable_params) 656 | clip_gradients, _ = tf.clip_by_global_norm( 657 | gradients, self.max_gradient_norm 658 | ) 659 | self.updates_rewards = self.opt.apply_gradients( 660 | zip(clip_gradients, trainable_params), 661 | global_step=self.global_step 662 | ) 663 | 664 | #添加self.loss_add 的update 665 | gradients = tf.gradients(self.loss_add, trainable_params) 666 | clip_gradients, _ = tf.clip_by_global_norm( 667 | gradients, self.max_gradient_norm 668 | ) 669 | self.updates_add = self.opt.apply_gradients( 670 | zip(clip_gradients, trainable_params), 671 | global_step = self.global_step 672 | ) 673 | 674 | def check_feeds(self, encoder_inputs, encoder_inputs_length, 675 | decoder_inputs, decoder_inputs_length, decode): 676 | """ 677 | encoder_inputs :一个整型的二维矩阵,[batch_size, max_source_time_steps] 678 | encoder_inputs_length: [batch_size], 每一个维度就是encoder句子的真实长度 679 | decoder_inputs: 一个整型的二维矩阵,[batch_size, max_target_time_steps] 680 | decoder_inputs_length: [batch_size],每一个维度就是decoder句子的真实长度 681 | decode: 是训练模式还是train(decode=false),还是预测模式decode(decoder=true) 682 | return: tensorflow所需要的input_feed,包括encoder_inputs, encoder_inputs_length, decoder_inputs, decoder_inputs_length 683 | """ 684 | input_batch_size = encoder_inputs.shape[0] 685 | if input_batch_size != encoder_inputs_length.shape[0]: 686 | raise ValueError( 687 | 'encoder_inputs 和 encoder_inputs_length的第一个维度必须一致' 688 | '这个维度是batch_size, %d != %d' % ( 689 | input_batch_size, encoder_inputs_length.shape[0] 690 | ) 691 | ) 692 | if not decode: 693 | target_batch_size = decoder_inputs.shape[0] 694 | if target_batch_size != input_batch_size: 695 | raise ValueError( 696 | 'encoder_inputs 和 decoder_inputs 的第一个维度必须一致' 697 | '这个维度是batch_size, %d != %d' % ( 698 | input_batch_sezi, target_batch_size 699 | ) 700 | ) 701 | 702 | if target_batch_size != decoder_inputs_length.shape[0]: 703 | raise ValueError( 704 | 'encoder_inputs 和 decoder_inputs_length的第一个维度必须一致' 705 | '这个维度是batch_size, %d != %d' %( 706 | input_batch_size, target_batch_size.shape[0] 707 | ) 708 | ) 709 | 710 | input_feed = {} 711 | input_feed[self.encoder_inputs.name] = encoder_inputs 712 | input_feed[self.encoder_inputs_length.name] = encoder_inputs_length 713 | 714 | if not decode: 715 | input_feed[self.decoder_inputs.name] = decoder_inputs 716 | input_feed[self.decoder_inputs_length.name] = decoder_inputs_length 717 | 718 | return input_feed 719 | 720 | def train(self, sess, encoder_inputs, encoder_inputs_length, 721 | decoder_inputs, decoder_inputs_length, 722 | rewards=None, return_lr=False, 723 | loss_only=False, add_loss=None): 724 | """训练模型""" 725 | 726 | input_feed = self.check_feeds( 727 | encoder_inputs, encoder_inputs_length, 728 | decoder_inputs, 
decoder_inputs_length, 729 | False 730 | ) 731 | #设置dropout 732 | input_feed[self.keep_prob_placeholder.name] = self.keep_prob 733 | 734 | if loss_only: 735 | #输出 736 | return sess.run(self.loss, input_feed) 737 | if add_loss is not None: 738 | input_feed[self.add_loss.name] = add_loss 739 | output_feed =[self.updates_add, self.loss_add, self.current_learning_rate] 740 | 741 | _, cost, lr = sess.run(output_feed, input_feed) 742 | if return_lr: 743 | return cost, lr 744 | return cost 745 | if rewards is not None: 746 | input_feed[self.rewards.name] = rewards 747 | output_feed =[self.updates_rewards, self.loss_rewards, self.current_learning_rate] 748 | 749 | _, cost, lr = sess.run(output_feed, input_feed) 750 | if return_lr: 751 | return cost, lr 752 | return cost 753 | 754 | output_feed = [self.updates, self.loss, self.current_learning_rate] 755 | 756 | _, cost, lr =sess.run(output_feed, input_feed) 757 | 758 | if return_lr: 759 | return cost, lr 760 | return cost 761 | 762 | def get_encoder_embedding(self, sess, encoder_inputs): 763 | input_feed ={ 764 | self.encoder_inputs.name : encoder_inputs 765 | } 766 | emb = sess.run(self.encoder_inputs_embedded, input_feed) 767 | return emb 768 | 769 | def entropy(self, sess, encoder_inputs, encoder_inputs_length, 770 | decoder_inputs, decoder_inputs_length): 771 | input_feed = self.check_feeds( 772 | encoder_inputs, encoder_inputs_length, 773 | decoder_inputs, deocder_inputs_length, 774 | False 775 | ) 776 | input_feed[self.keep_prob_placeholder.name] = 1.0 777 | output_feed = [self.train_entropy, self.decoder_pred_train] 778 | entropy, logits = sess.run(output_feed, input_feed) 779 | return entropy, logits 780 | 781 | def predict(self, sess, 782 | encoder_inputs, 783 | encoder_inputs_length, 784 | attention=False): 785 | 786 | input_feed = self.check_feeds(encoder_inputs, encoder_inputs_length, None, None, True) 787 | 788 | input_feed[self.keep_prob_placeholder.name] =1.0 789 | 790 | if attention: 791 | assert not self.use_beamsearch_decode, 'Attention 模式不能打开BeamSearch' 792 | 793 | pred, atten = sess.run( 794 | [self.decoder_pred_decode, self.final_state.aligment_history.stack()], 795 | input_feed) 796 | return pred, atten 797 | 798 | if self.use_beamsearch_decode: 799 | pred, beam_prob = sess.run( 800 | [self.decoder_pred_decode, self.beam_prob], 801 | input_feed) 802 | beam_prob = np.mean(beam_prob, axis=1) 803 | pred = pred[0] 804 | return pred 805 | 806 | pred, = sess.run([self.decoder_pred_decode], input_feed) 807 | return pred 808 | 809 | 810 | 811 | 812 | 813 | 814 | 815 | 816 | 817 | 818 | 819 | 820 | 821 | 822 | 823 | 824 | 825 | 826 | 827 | 828 | 829 | 830 | 831 | 832 | 833 | 834 | 835 | 836 | 837 | 838 | 839 | 840 | -------------------------------------------------------------------------------- /test_anti.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import pickle 3 | import numpy as np 4 | import tensorflow as tf 5 | from sequence_to_sequence import SequenceToSequence 6 | from data_utils import batch_flow 7 | import json 8 | import os 9 | 10 | def test(params): 11 | x_data, _ = pickle.load(open('./data/chatbot.pkl', 'rb')) 12 | ws = pickle.load(open('./data/ws.pkl', 'rb')) 13 | 14 | for x in x_data[:5]: 15 | print(' '.join(x)) 16 | 17 | config = tf.ConfigProto( 18 | device_count = {'CPU':1, 'GPU':0}, 19 | allow_soft_placement = True, 20 | log_device_placement=False 21 | ) 22 | 23 | save_path = 'model/s2s_chatbot_anti.ckpt' 24 | 25 | tf.reset_default_graph() 26 | model_pred = 
SequenceToSequence( 27 | input_vocab_size = len(ws), 28 | target_vocab_size = len(ws), 29 | batch_size=1, 30 | mode = 'decode', 31 | **params 32 | ) 33 | init = tf.global_variables_initializer() 34 | 35 | with tf.Session(config=config) as sess: 36 | sess.run(init) 37 | model_pred.load(sess, save_path) 38 | 39 | while True: 40 | user_text = input('请输入您的句子:') 41 | if user_text in ('exit', 'quit'): 42 | exit(0) 43 | x_test = [list(user_text.lower())] 44 | bar = batch_flow([x_test], ws, 1) 45 | x, xl = next(bar) 46 | x = np.flip(x, axis=1) 47 | 48 | print(x, xl) 49 | pred = model_pred.predict( 50 | sess, 51 | np.array(x), 52 | np.array(xl) 53 | ) 54 | print(pred) 55 | print(ws.inverse_transform(x[0])) 56 | for p in pred: 57 | ans = ws.inverse_transform(p) 58 | print(ans) 59 | 60 | def main(): 61 | os.environ['CUDA_VISIBLE_DEVICES'] = '2' 62 | test(json.load(open('params.json'))) 63 | 64 | if __name__ == '__main__': 65 | main() 66 | -------------------------------------------------------------------------------- /threadedgenerator.py: -------------------------------------------------------------------------------- 1 | from threading import Thread 2 | from queue import Queue 3 | 4 | class ThreadedGenerator(object): 5 | def __init__(self,iterator, 6 | sentinel=object(), 7 | queue_maxsize=0, 8 | daemon=False): 9 | 10 | self._iterator = iterator 11 | self._sentinel = sentinel 12 | self._queue = Queue(maxsize=queue_maxsize) 13 | self._thread = Thread( 14 | name=repr(iterator), 15 | target=self._run 16 | ) 17 | self._thread.daemon= daemon 18 | self._started=False 19 | 20 | def __repr__(self): 21 | return 'ThreadedGenerator({!r})'.format(self._iterator) 22 | 23 | def _run(self): 24 | try: 25 | for value in self._iterator: 26 | if not self._started: 27 | return 28 | self._queue.put(value) 29 | finally: 30 | self._queue.put(self._sentinel) 31 | 32 | def close(self): 33 | self._started = False 34 | try: 35 | while True: 36 | self._queue.get(timeout=30) 37 | except KeyboardInterrupt as e: 38 | raise e 39 | except: 40 | pass 41 | 42 | def __iter__(self): 43 | self._started = True 44 | self._thread.start() 45 | for value in iter(self._queue.get,self._sentinel): 46 | yield value 47 | self._thread.join() 48 | self._started=False 49 | 50 | 51 | def __next__(self): 52 | if not self._started: 53 | self._started = True 54 | self._thread.start() 55 | value = self._queue.get(timeout=30) 56 | if value ==self._sentinel: 57 | raise StopIteration() 58 | return value 59 | 60 | 61 | def test(): 62 | 63 | def gene(): 64 | i=0 65 | while True: 66 | yield i 67 | i+=1 68 | 69 | t = gene() 70 | test = ThreadedGenerator(t,queue_maxsize=10) 71 | for _ in range(10): 72 | print(next(test)) 73 | test.close() 74 | 75 | if __name__ == '__main__': 76 | test() -------------------------------------------------------------------------------- /train_anti.py: -------------------------------------------------------------------------------- 1 | import random 2 | import pickle 3 | import numpy as np 4 | import tensorflow as tf 5 | from tqdm import tqdm 6 | from sequence_to_sequence import SequenceToSequence 7 | from data_utils import batch_flow_bucket as batch_flow 8 | from word_sequence import WordSequence 9 | from threadedgenerator import ThreadedGenerator 10 | import os 11 | import json 12 | 13 | def test(params): 14 | x_data, y_data = pickle.load(open('./data/chatbot.pkl', 'rb')) 15 | ws = pickle.load(open('./data/ws.pkl', 'rb')) 16 | 17 | n_epoch = 2 18 | batch_size=128 19 | steps = int(len(x_data) / batch_size) +1 20 | config = 
tf.ConfigProto( 21 | allow_soft_placement=True, 22 | log_device_placement=False 23 | ) 24 | 25 | save_path = 'model/s2s_chatbot_anti.ckpt' 26 | 27 | tf.reset_default_graph() 28 | with tf.Graph().as_default(): 29 | random.seed(0) 30 | np.random.seed(0) 31 | tf.set_random_seed(0) 32 | 33 | with tf.Session(config=config) as sess: 34 | model = SequenceToSequence( 35 | input_vocab_size=len(ws), 36 | target_vocab_size = len(ws), 37 | batch_size = batch_size, 38 | **params 39 | ) 40 | init = tf.global_variables_initializer() 41 | sess.run(init) 42 | 43 | flow = ThreadedGenerator( 44 | batch_flow([x_data, y_data], ws, batch_size, add_end=[False, True]), 45 | queue_maxsize=30 46 | ) 47 | 48 | dummy_encoder_inputs = np.array([ 49 | np.array([WordSequence.PAD]) for _ in range(batch_size) 50 | ]) 51 | 52 | dummy_encoder_inputs_length = np.array([1] * batch_size) 53 | 54 | for epoch in range(1, n_epoch+1): 55 | costs = [] 56 | bar = tqdm(range(steps), 57 | total=steps, 58 | desc='epoch {}, loss=0.000000'.format(epoch) 59 | ) 60 | for _ in bar: 61 | x, xl, y, yl = next(flow) 62 | x = np.flip(x, axis=1) 63 | 64 | add_loss = model.train( 65 | sess, 66 | dummy_encoder_inputs, 67 | dummy_encoder_inputs_length, 68 | y, yl, loss_only=True 69 | ) 70 | 71 | add_loss *= -0.5 72 | 73 | cost, lr = model.train(sess, x, xl, y, yl, 74 | return_lr=True, 75 | add_loss=add_loss 76 | ) 77 | costs.append(cost) 78 | bar.set_description('epoch {} loss={:.6f} lr={:.6f}'.format(epoch, np.mean(costs), lr)) 79 | 80 | model.save(sess, save_path) 81 | flow.close() 82 | 83 | tf.reset_default_graph() 84 | model_pred = SequenceToSequence( 85 | input_vocab_size= len(ws), 86 | target_vocab_size = len(ws), 87 | batch_size=1, 88 | mode='decode', 89 | beam_width=12, 90 | **params 91 | ) 92 | init = tf.global_variables_initializer() 93 | 94 | with tf.Session(config=config) as sess: 95 | sess.run(init) 96 | model_pred.load(sess, save_path) 97 | 98 | bar = batch_flow([x_data, y_data], ws, 1, add_end=False) 99 | t=0 100 | for x, xl, y, yl in bar: 101 | x = np.flip(x, axis=1) 102 | pred = model_pred.predict( 103 | sess, 104 | np.array(x), 105 | np.array(xl) 106 | ) 107 | print(ws.inverse_transform(x[0])) 108 | print(ws.inverse_transform(y[0])) 109 | print(ws.inverse_transform(pred[0])) 110 | t+=1 111 | if t >= 3: 112 | break 113 | 114 | 115 | tf.reset_default_graph() 116 | model_pred = SequenceToSequence( 117 | input_vocab_size= len(ws), 118 | target_vocab_size = len(ws), 119 | batch_size=1, 120 | mode='decode', 121 | beam_width=1, 122 | **params 123 | ) 124 | init = tf.global_variables_initializer() 125 | 126 | with tf.Session(config=config) as sess: 127 | sess.run(init) 128 | model_pred.load(sess, save_path) 129 | 130 | bar = batch_flow([x_data, y_data], ws, 1, add_end=False) 131 | t=0 132 | for x, xl, y, yl in bar: 133 | x = np.flip(x, axis=1) 134 | pred = model_pred.predict( 135 | sess, 136 | np.array(x), 137 | np.array(xl) 138 | ) 139 | print(ws.inverse_transform(x[0])) 140 | print(ws.inverse_transform(y[0])) 141 | print(ws.inverse_transform(pred[0])) 142 | t+=1 143 | if t >= 3: 144 | break 145 | 146 | 147 | def main(): 148 | os.environ['CUDA_VISIBLE_DEVICES'] = '2' 149 | test(json.load(open('params.json'))) 150 | 151 | 152 | if __name__ == '__main__': 153 | main() 154 | 155 | 156 | 157 | -------------------------------------------------------------------------------- /word_sequence.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | """ 3 | 维护一个字典,把一个list(或者字符串)编码化,或者反向恢复 4 | 
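    (Maintains a vocabulary dict that maps a list of tokens, or a string, to integer ids and back.)

    A minimal usage sketch, added for illustration only (toy tokens; min_count=1 so the tiny
    vocabulary survives the frequency filter; the exact ids depend on the fitted dict):

        ws = WordSequence()
        ws.fit([['你', '好'], ['你', '们']], min_count=1)
        ids = ws.transform(['你', '好'], max_len=4)         # numpy array of 4 ids, PAD-padded
        back = ws.inverse_transform(ids, ignore_pad=True)   # ['你', '好']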
""" 5 | class WordSequence(object): 6 | PAD_TAG='' 7 | UNK_TAG='' 8 | START_TAG='' 9 | END_TAG='' 10 | 11 | PAD=0 12 | UNK=1 13 | START=2 14 | END=3 15 | word_dict={} 16 | def __init__(self, 17 | # word_vec_dic='sgns.target.word-word.dynwin5.thr10.neg5.dim300.iter5', #百度百科中文词向量 https://github.com/Embedding/Chinese-Word-Vectors 18 | # embedding_dim=300 19 | ): 20 | #初始化字典 21 | self.word_dict={ 22 | WordSequence.PAD_TAG:WordSequence.PAD, 23 | WordSequence.UNK_TAG:WordSequence.UNK, 24 | WordSequence.START_TAG:WordSequence.START, 25 | WordSequence.END_TAG:WordSequence.END 26 | } 27 | self.fited=False 28 | self.word_vec_dic=word_vec_dic 29 | # self.embedding_dim=embedding_dim 30 | 31 | 32 | def to_index(self,word): 33 | assert self.fited, 'WordSequence 尚未进行' 34 | if word in self.word_dict: 35 | return self.word_dict[word] 36 | return WordSequence.UNK 37 | 38 | def to_word(self,index): 39 | assert self.fited 40 | for k,v in self.word_dict.items(): 41 | if v==index: 42 | return k 43 | return WordSequence.UNK_TAG 44 | 45 | def size(self): 46 | assert self.fited 47 | return len(self.word_dict)+1 48 | 49 | def __len__(self): 50 | return self.size() 51 | 52 | def fit(self,sentences,min_count=5,max_count=None,max_features=None): 53 | """ 54 | Args: 55 | min_count 最小出现次数 56 | max_count 最大出现次数 57 | max_features 最大特征数 58 | """ 59 | assert not self.fited , 'WordSequence 只能 fit 一次' 60 | 61 | count={} 62 | for sentence in sentences: 63 | arr=list(sentence) 64 | for a in arr: 65 | if a not in count: 66 | count[a]=0 67 | count[a]+=1 68 | 69 | print(count) 70 | 71 | if min_count is not None: 72 | count={k : v for k,v in count.items() if v >= min_count} 73 | 74 | if max_count is not None: 75 | count={k : v for k,v in count.items() if v<=max_features} 76 | 77 | self.word_dict = { 78 | WordSequence.PAD_TAG:WordSequence.PAD, 79 | WordSequence.UNK_TAG:WordSequence.UNK, 80 | WordSequence.START_TAG:WordSequence.START, 81 | WordSequence.END_TAG:WordSequence.END 82 | 83 | } 84 | 85 | if isinstance(max_features,int): 86 | count = sorted(list(count.items()),key=lambda x:x[1]) #对value排序 升序 返回list元组 87 | if max_features is not None and len(count) > max_features: 88 | count = count[-int(max_features):] 89 | for w,_ in count: 90 | self.word_dict[w] = len(self.word_dict) #构建{word:index} 91 | else: 92 | for w in sorted(count.keys()): #按照key排序,返回keylist 93 | self.word_dict[w]=len(self.word_dict) 94 | 95 | self.fited=True 96 | 97 | #采用预训练好的部分词向量 98 | # embeddings_index={} 99 | # print("正在加载预训练词向量……") 100 | # with open(self.word_vec_dic, 'rb') as f: 101 | # for line in f: 102 | # values = line.decode('utf-8').split(' ') 103 | # word = values[0] 104 | # embedding=values[1:301] 105 | # embeddings_index[word]=embedding 106 | # print("预训练词向量加载完毕。") 107 | # nb_words = len(self.word_dict) 108 | 109 | # self.word_embedding_matrix=np.zeros((nb_words,self.embedding_dim),dtype=np.float32) 110 | # for word,i in self.word_dict.items(): 111 | # if word in embeddings_index: 112 | # self.word_embedding_matrix[i] = embeddings_index[word] 113 | # else: 114 | # new_embedding = np.array(np.random.uniform(-1,1,self.embedding_dim)) 115 | # embeddings_index[word] = new_embedding 116 | # self.word_embedding_matrix[i] = embeddings_index[word] 117 | # print('词向量映射完成') 118 | 119 | def showdict(self): 120 | assert self.fited 121 | 122 | for k,v in self.word_dict.items(): 123 | print(k,v) 124 | 125 | 126 | def transform(self,sentence,max_len=None): 127 | assert self.fited 128 | 129 | if max_len is not None: 130 | r = [self.PAD]*max_len 131 | else: 132 | 
r=[self.PAD]*len(sentence) 133 | 134 | for index,a in enumerate(sentence): 135 | if max_len is not None and index >=len(r): 136 | break 137 | r[index]=self.to_index(a) 138 | 139 | return np.array(r) #最后返回的是[3,4,6,5,end,pad,pad,pad] 140 | 141 | def inverse_transform(self,indices,ignore_pad=False,ignore_unk=False,ignore_start=False,ignore_end=False): 142 | ret=[] 143 | for i in indices: 144 | word = self.to_word(i) 145 | if word == WordSequence.PAD_TAG and ignore_pad: 146 | continue 147 | if word == WordSequence.UNK_TAG and ignore_unk: 148 | continue 149 | if word==WordSequence.START_TAG and ignore_start: 150 | continue 151 | if word==WordSequence.END_TAG and ignore_end: 152 | continue 153 | ret.append(word) 154 | 155 | return ret 156 | 157 | def test(): 158 | ws = WordSequence() 159 | ws.fit([ 160 | ['你','好','啊'], 161 | ['你','好','哦'], 162 | ['我','是','谁'] 163 | ]) 164 | print(ws.word_embedding_matrix[0]) 165 | print(ws.word_embedding_matrix[1]) 166 | 167 | # indice =ws.transform(['你','们']) 168 | # print(indice) 169 | 170 | # back = ws.inverse_transform(indice) 171 | # print(back) 172 | 173 | 174 | if __name__ == '__main__': 175 | test() 176 | 177 | --------------------------------------------------------------------------------
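
A note on `batch_flow_bucket` (data_utils.py): the bucket boundaries are derived from the sorted
list of distinct sequence lengths. The snippet below is a small, self-contained sketch of that
computation only; the lengths and `n_buckets = 4` are made-up toy values, not taken from the corpus.

```python
import numpy as np

# Distinct sentence lengths observed in the data (toy values for illustration).
lengths = sorted({3, 4, 5, 7, 8, 10, 12, 15})
n_buckets = 4

# Pick n_buckets evenly spaced split points out of the sorted length list.
splits = np.array(lengths)[
    (np.linspace(0, 1, n_buckets, endpoint=False) * len(lengths)).astype(int)
].tolist()
splits += [np.inf]  # the last bucket is open-ended

print(splits)  # [3, 5, 8, 12, inf]
# A sample whose bucket field has length l goes into bucket i when
# splits[i] <= l <= splits[i + 1] (as in the loop in batch_flow_bucket);
# each training step then draws the whole batch from a single bucket, so the
# sequences in a batch have similar lengths and little padding is wasted.
```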