├── preprocess
│   ├── convai2
│   │   ├── __init__.py
│   │   ├── api.py
│   │   └── candi_keyword.txt
│   ├── extraction.py
│   ├── prepare_data.py
│   ├── dataset.py
│   └── data_utils.py
├── train.py
├── config
│   ├── retrieval.py
│   ├── retrieval_stgy.py
│   ├── matrix.py
│   ├── neural.py
│   ├── kernel.py
│   └── data_config.py
├── chat.py
├── simulate.py
├── readme.md
└── model
    ├── retrieval.py
    ├── retrieval_stgy.py
    ├── matrix.py
    ├── neural.py
    └── kernel.py
/preprocess/convai2/__init__.py:
--------------------------------------------------------------------------------
1 | from .api import *
2 | 
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import importlib
3 | import os
4 | if __name__ == '__main__':
5 |     flags = tf.flags
6 |     flags.DEFINE_string('data', 'data_config', 'The data config')
7 |     flags.DEFINE_string('agent', 'kernel', 'The predictor type')
8 |     flags.DEFINE_string('mode', 'train', 'The mode')
9 | 
10 |     FLAGS = flags.FLAGS
11 |     config_data = importlib.import_module('config.' + FLAGS.data)
12 |     config_model = importlib.import_module('config.' + FLAGS.agent)
13 |     model = importlib.import_module('model.' + FLAGS.agent)
14 |     predictor = model.Predictor(config_model, config_data, FLAGS.mode)
15 |     if not os.path.exists('save/'+FLAGS.agent):
16 |         os.makedirs('save/'+FLAGS.agent)
17 | 
18 |     if FLAGS.mode == 'train_kw':
19 |         predictor.train_keywords()
20 |     if FLAGS.mode == 'test_kw':
21 |         predictor.test_keywords()
22 |     if FLAGS.mode == 'train':
23 |         predictor.train()
24 |         predictor.test()
25 |     if FLAGS.mode == 'test':
26 |         predictor.test()
27 | 
--------------------------------------------------------------------------------
/config/retrieval.py:
--------------------------------------------------------------------------------
1 | _hidden_size = 200
2 | _code_len = 200
3 | _save_path = 'save/retrieval/model_1'
4 | _max_epoch = 10
5 | 
6 | source_encoder_hparams = {
7 |     "encoder_minor_type": "UnidirectionalRNNEncoder",
8 |     "encoder_minor_hparams": {
9 |         "rnn_cell": {
10 |             "type": "GRUCell",
11 |             "kwargs": {
12 |                 "num_units": _hidden_size,
13 |             },
14 |         },
15 |     },
16 |     "encoder_major_type": "UnidirectionalRNNEncoder",
17 |     "encoder_major_hparams": {
18 |         "rnn_cell": {
19 |             "type": "GRUCell",
20 |             "kwargs": {
21 |                 "num_units": _hidden_size,
22 |             },
23 |         }
24 |     }
25 | }
26 | 
27 | target_encoder_hparams = {
28 |     "rnn_cell": {
29 |         "type": "GRUCell",
30 |         "kwargs": {
31 |             "num_units": _hidden_size,
32 |         },
33 |     }
34 | }
35 | 
36 | opt_hparams = {
37 |     "optimizer": {
38 |         "type": "AdamOptimizer",
39 |         "kwargs": {
40 |             "learning_rate": 0.001,
41 |         }
42 |     },
43 | }
--------------------------------------------------------------------------------
/config/retrieval_stgy.py:
--------------------------------------------------------------------------------
1 | _hidden_size = 200
2 | _code_len = 200
3 | _save_path = 'save/retrieval/model_1'
4 | _max_epoch = 10
5 | 
6 | source_encoder_hparams = {
7 |     "encoder_minor_type": "UnidirectionalRNNEncoder",
8 |     "encoder_minor_hparams": {
9 |         "rnn_cell": {
10 |             "type": "GRUCell",
11 |             "kwargs": {
12 |                 "num_units": _hidden_size,
13 |             },
14 |         },
15 |     },
16 |     "encoder_major_type": "UnidirectionalRNNEncoder",
17 |     "encoder_major_hparams": {
18 |         "rnn_cell": {
19 |             "type": "GRUCell",
20 |             "kwargs": {
21 |                 "num_units": _hidden_size,
22 |             },
23 |         }
24 |     }
25 | }
26 | 
27 | target_encoder_hparams = {
28 |     "rnn_cell": {
29 |         "type": "GRUCell",
30 |         "kwargs": {
31 |             "num_units":
_hidden_size, 32 | }, 33 | } 34 | } 35 | 36 | opt_hparams = { 37 | "optimizer": { 38 | "type": "AdamOptimizer", 39 | "kwargs": { 40 | "learning_rate": 0.001, 41 | } 42 | }, 43 | } -------------------------------------------------------------------------------- /config/matrix.py: -------------------------------------------------------------------------------- 1 | _hidden_size = 200 2 | _code_len = 800 3 | _save_path = 'save/matrix/model_1' 4 | _matrix_save_path = 'save/matrix/matrix_1.pk' 5 | _max_epoch = 10 6 | 7 | _vocab_path = 'tx_data/vocab.txt' 8 | _vocab = [x.strip() for x in open(_vocab_path, 'r').readlines()] 9 | _vocab_size = len(_vocab) 10 | 11 | source_encoder_hparams = { 12 | "encoder_minor_type": "BidirectionalRNNEncoder", 13 | "encoder_minor_hparams": { 14 | "rnn_cell_fw": { 15 | "type": "GRUCell", 16 | "kwargs": { 17 | "num_units": _hidden_size, 18 | }, 19 | }, 20 | "rnn_cell_share_config": True 21 | }, 22 | "encoder_major_type": "UnidirectionalRNNEncoder", 23 | "encoder_major_hparams": { 24 | "rnn_cell": { 25 | "type": "GRUCell", 26 | "kwargs": { 27 | "num_units": _hidden_size*2, 28 | }, 29 | } 30 | } 31 | } 32 | 33 | target_encoder_hparams = { 34 | "rnn_cell_fw": { 35 | "type": "GRUCell", 36 | "kwargs": { 37 | "num_units": _hidden_size, 38 | }, 39 | }, 40 | "rnn_cell_share_config": True 41 | } 42 | 43 | target_kwencoder_hparams = { 44 | "rnn_cell_fw": { 45 | "type": "GRUCell", 46 | "kwargs": { 47 | "num_units": _hidden_size, 48 | }, 49 | }, 50 | "rnn_cell_share_config": True 51 | } 52 | 53 | opt_hparams = { 54 | "optimizer": { 55 | "type": "AdamOptimizer", 56 | "kwargs": { 57 | "learning_rate": 0.001, 58 | } 59 | } 60 | } -------------------------------------------------------------------------------- /preprocess/extraction.py: -------------------------------------------------------------------------------- 1 | from data_utils import * 2 | 3 | class KeywordExtractor(): 4 | def __init__(self, idf_dict = None): 5 | self.idf_dict = idf_dict 6 | 7 | @staticmethod 8 | def is_keyword_tag(tag): 9 | return tag.startswith('VB') or tag.startswith('NN') or tag.startswith('JJ') 10 | 11 | @staticmethod 12 | def cal_tag_score(tag): 13 | if tag.startswith('VB'): 14 | return 1. 15 | if tag.startswith('NN'): 16 | return 2. 17 | if tag.startswith('JJ'): 18 | return 0.5 19 | return 0. 
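    # How idf_extract (below) scores a candidate keyword w in an utterance of
    # seq_len tokens:
    #
    #     score(w) = tag_score(POS of w) * count(w) * (1 / seq_len) * idf(w)
    #
    # and w is kept as a keyword only when score(w) > 0.15. For instance, a
    # noun (tag_score 2.0) occurring once in a 10-token utterance with an idf
    # of 1.2 scores 2.0 * 1 * 0.1 * 1.2 = 0.24 and is kept, while an adjective
    # (tag_score 0.5) with the same counts scores 0.06 and is dropped (the idf
    # value 1.2 here is a hypothetical number for illustration).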
20 | 21 | def idf_extract(self, string, con_kw = None): 22 | tokens = simp_tokenize(string) 23 | seq_len = len(tokens) 24 | tokens = pos_tag(tokens) 25 | source = kw_tokenize(string) 26 | candi = [] 27 | result = [] 28 | for i, (word, tag) in enumerate(tokens): 29 | score = self.cal_tag_score(tag) 30 | if not is_candiword(source[i]) or score == 0.: 31 | continue 32 | if con_kw is not None and source[i] in con_kw: 33 | continue 34 | score *= source.count(source[i]) 35 | score *= 1 / seq_len 36 | score *= self.idf_dict[source[i]] 37 | candi.append((source[i], score)) 38 | if score > 0.15: 39 | result.append(source[i]) 40 | return list(set(result)) 41 | 42 | 43 | def extract(self, string): 44 | tokens = simp_tokenize(string) 45 | tokens = pos_tag(tokens) 46 | source = kw_tokenize(string) 47 | kwpos_alters = [] 48 | for i, (word, tag) in enumerate(tokens): 49 | if source[i] and self.is_keyword_tag(tag): 50 | kwpos_alters.append(i) 51 | kwpos, keywords = [], [] 52 | for id in kwpos_alters: 53 | if is_candiword(source[id]): 54 | keywords.append(source[id]) 55 | return list(set(keywords)) -------------------------------------------------------------------------------- /config/neural.py: -------------------------------------------------------------------------------- 1 | _hidden_size = 200 2 | _code_len = 800 3 | _save_path = 'save/neural/model_1' 4 | _neural_save_path = 'save/neural/keyword_1' 5 | _max_epoch = 10 6 | 7 | neural_opt_hparams = { 8 | "optimizer": { 9 | "type": "AdamOptimizer", 10 | "kwargs": { 11 | "learning_rate": 0.005, 12 | } 13 | }, 14 | "learning_rate_decay": { 15 | "type": "inverse_time_decay", 16 | "kwargs": { 17 | "decay_steps": 1600, 18 | "decay_rate": 0.8 19 | }, 20 | "start_decay_step": 0, 21 | "end_decay_step": 16000, 22 | }, 23 | } 24 | 25 | source_encoder_hparams = { 26 | "encoder_minor_type": "BidirectionalRNNEncoder", 27 | "encoder_minor_hparams": { 28 | "rnn_cell_fw": { 29 | "type": "GRUCell", 30 | "kwargs": { 31 | "num_units": _hidden_size, 32 | }, 33 | }, 34 | "rnn_cell_share_config": True 35 | }, 36 | "encoder_major_type": "UnidirectionalRNNEncoder", 37 | "encoder_major_hparams": { 38 | "rnn_cell": { 39 | "type": "GRUCell", 40 | "kwargs": { 41 | "num_units": _hidden_size*2, 42 | }, 43 | } 44 | } 45 | } 46 | 47 | target_encoder_hparams = { 48 | "rnn_cell_fw": { 49 | "type": "GRUCell", 50 | "kwargs": { 51 | "num_units": _hidden_size, 52 | }, 53 | }, 54 | "rnn_cell_share_config": True 55 | } 56 | 57 | target_kwencoder_hparams = { 58 | "rnn_cell_fw": { 59 | "type": "GRUCell", 60 | "kwargs": { 61 | "num_units": _hidden_size, 62 | }, 63 | }, 64 | "rnn_cell_share_config": True 65 | } 66 | 67 | context_encoder_hparams = { 68 | "rnn_cell": { 69 | "type": "GRUCell", 70 | "kwargs": { 71 | "num_units": _hidden_size, 72 | }, 73 | } 74 | } 75 | 76 | opt_hparams = { 77 | "optimizer": { 78 | "type": "AdamOptimizer", 79 | "kwargs": { 80 | "learning_rate": 0.001, 81 | } 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /config/kernel.py: -------------------------------------------------------------------------------- 1 | _hidden_size = 200 2 | _code_len = 800 3 | _save_path = 'save/kernel/model_1' 4 | _kernel_save_path = 'save/kernel/keyword_1' 5 | _kernel_mu = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.] 
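# _kernel_mu and _kernel_sigma parameterize a bank of Gaussian (RBF) kernels:
# each mu is a kernel center and sigma its width, so a similarity score s is
# mapped to one soft-match feature per kernel. A minimal sketch of that
# mapping, assuming model/kernel.py (not shown in this section) consumes these
# values in the usual KNRM style:
#
#     import numpy as np
#     def kernel_features(s, mu=_kernel_mu, sigma=_kernel_sigma):
#         # one feature per kernel center; s is a scalar similarity in [0, 1]
#         return np.exp(-(np.asarray(mu) - s) ** 2 / (2 * sigma ** 2))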
6 | _kernel_sigma = 0.1
7 | _max_epoch = 10
8 | _early_stopping = 2
9 | 
10 | kernel_opt_hparams = {
11 |     "optimizer": {
12 |         "type": "AdamOptimizer",
13 |         "kwargs": {
14 |             "learning_rate": 0.001,
15 |         }
16 |     },
17 |     "learning_rate_decay": {
18 |         "type": "inverse_time_decay",
19 |         "kwargs": {
20 |             "decay_steps": 1600,
21 |             "decay_rate": 0.8
22 |         },
23 |         "start_decay_step": 0,
24 |         "end_decay_step": 16000,
25 |     },
26 | }
27 | 
28 | source_encoder_hparams = {
29 |     "encoder_minor_type": "BidirectionalRNNEncoder",
30 |     "encoder_minor_hparams": {
31 |         "rnn_cell_fw": {
32 |             "type": "GRUCell",
33 |             "kwargs": {
34 |                 "num_units": _hidden_size,
35 |             },
36 |         },
37 |         "rnn_cell_share_config": True
38 |     },
39 |     "encoder_major_type": "UnidirectionalRNNEncoder",
40 |     "encoder_major_hparams": {
41 |         "rnn_cell": {
42 |             "type": "GRUCell",
43 |             "kwargs": {
44 |                 "num_units": _hidden_size*2,
45 |             },
46 |         }
47 |     }
48 | }
49 | 
50 | target_encoder_hparams = {
51 |     "rnn_cell_fw": {
52 |         "type": "GRUCell",
53 |         "kwargs": {
54 |             "num_units": _hidden_size,
55 |         },
56 |     },
57 |     "rnn_cell_share_config": True
58 | }
59 | 
60 | target_kwencoder_hparams = {
61 |     "rnn_cell_fw": {
62 |         "type": "GRUCell",
63 |         "kwargs": {
64 |             "num_units": _hidden_size,
65 |         },
66 |     },
67 |     "rnn_cell_share_config": True
68 | }
69 | 
70 | context_encoder_hparams = {
71 |     "rnn_cell": {
72 |         "type": "GRUCell",
73 |         "kwargs": {
74 |             "num_units": _hidden_size,
75 |         },
76 |     }
77 | }
78 | 
79 | opt_hparams = {
80 |     "optimizer": {
81 |         "type": "AdamOptimizer",
82 |         "kwargs": {
83 |             "learning_rate": 0.001,
84 |         }
85 |     },
86 | }
--------------------------------------------------------------------------------
/chat.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import importlib
3 | import random
4 | from preprocess.data_utils import utter_preprocess, is_reach_goal
5 | 
6 | class Target_Chat():
7 |     def __init__(self, agent):
8 |         self.agent = agent
9 |         self.start_utter = config_data._start_corpus
10 |         with tf.Session(config=self.agent.gpu_config) as sess:
11 |             self.agent.retrieve_init(sess)
12 |             for i in range(int(FLAGS.times)):
13 |                 print('--------Session {} --------'.format(i))
14 |                 self.chat(sess)
15 | 
16 |     def chat(self, sess):
17 |         history = []
18 |         history.append(random.sample(self.start_utter, 1)[0])
19 |         target_kw = random.sample(target_set,1)[0]
20 |         self.agent.target = target_kw
21 |         self.agent.score = 0.
22 |         self.agent.reply_list = []
23 |         print('START: ' + history[0])
24 |         for i in range(config_data._max_turns):
25 |             history.append(input('HUMAN: '))
26 |             source = utter_preprocess(history, self.agent.data_config._max_seq_len)
27 |             reply = self.agent.retrieve(source, sess)
28 |             print('AGENT: ', reply)
29 |             # print('Keyword: {}, Similarity: {:.2f}'.format(self.agent.next_kw, self.agent.score))
30 |             history.append(reply)
31 |             if is_reach_goal(history[-2] + history[-1], target_kw):
32 |                 print('Successfully chat to the target \'{}\'.'.format(target_kw))
33 |                 return
34 |         print('Failed by reaching the maximum turn, target: \'{}\'.'.format(target_kw))
35 | 
36 | if __name__ == '__main__':
37 |     flags = tf.flags
38 |     # supports kernel / matrix / neural / retrieval / retrieval_stgy
39 |     flags.DEFINE_string('agent', 'kernel', 'The agent type')
40 |     flags.DEFINE_string('times', '100', 'Conversation times')
41 |     FLAGS = flags.FLAGS
42 | 
43 |     config_data = importlib.import_module('config.data_config')
44 |     config_model = importlib.import_module('config.'
+ FLAGS.agent)
45 |     model = importlib.import_module('model.' + FLAGS.agent)
46 |     predictor = model.Predictor(config_model, config_data, 'test')
47 | 
48 |     target_set = []
49 |     for line in open('tx_data/test/keywords.txt', 'r').readlines():
50 |         target_set = target_set + line.strip().split(' ')
51 | 
52 |     Target_Chat(predictor)
53 | 
--------------------------------------------------------------------------------
/preprocess/prepare_data.py:
--------------------------------------------------------------------------------
1 | from dataset import dts_Target
2 | from collections import Counter
3 | import pickle
4 | import random
5 | import os
6 | import shutil
7 | if not os.path.exists('../tx_data'):
8 |     os.mkdir('../tx_data')
9 |     os.mkdir('../tx_data/train')
10 |     os.mkdir('../tx_data/valid')
11 |     os.mkdir('../tx_data/test')
12 | 
13 | # import texar
14 | # if not os.path.exists('convai2/source'):
15 | #     print('Downloading source ConvAI2 data')
16 | #     texar.data.maybe_download('https://drive.google.com/file/d/1LPxNIVO52hZOwbV3Zply_ITi2Uacit-V/view?usp=sharing'
17 | #                               ,'convai2', extract=True)
18 | 
19 | shutil.copy('convai2/source/embedding.txt', '../tx_data/embedding.txt')
20 | dataset = dts_Target()
21 | dataset.make_dataset()
22 | 
23 | data = pickle.load(open("source_data.pk","rb"))
24 | max_utter = 9
25 | candidate_num = 20
26 | start_corpus_file = open("../tx_data/start_corpus.txt", "w")
27 | corpus_file = open("../tx_data/corpus.txt", "w")
28 | 
29 | for stage in ['train', 'valid', 'test']:
30 |     source_file = open("../tx_data/{}/source.txt".format(stage), "w")
31 |     target_file = open("../tx_data/{}/target.txt".format(stage), "w")
32 |     context_file = open("../tx_data/{}/context.txt".format(stage), "w")
33 |     keywords_file = open("../tx_data/{}/keywords.txt".format(stage), "w")
34 |     label_file = open("../tx_data/{}/label.txt".format(stage), "w")
35 |     keywords_vocab_file = open("../tx_data/{}/keywords_vocab.txt".format(stage), "w")
36 |     corpus = []
37 |     keywords_counter = Counter()
38 |     for sample in data[stage]:
39 |         corpus += sample['dialog'][1:]
40 |         start_corpus_file.write(sample['dialog'][0]+ '\n')
41 |         for kws in sample['kwlist']:
42 |             keywords_counter.update(kws)
43 |     for kw, _ in keywords_counter.most_common():
44 |         keywords_vocab_file.write(kw + '\n')
45 |     for sample in data[stage]:
46 |         for i in range(2, len(sample['dialog'])):
47 |             if len(sample['kwlist'][i]) > 0:
48 |                 source_list = sample['dialog'][max(0, i - max_utter):i]
49 |                 source_str = '|||'.join(source_list)
50 |                 while True:
51 |                     random_corpus = random.sample(corpus, candidate_num - 1)
52 |                     if sample['dialog'][i] not in random_corpus:
53 |                         break
54 |                 corpus_file.write(sample['dialog'][i] + '\n')
55 |                 target_list = [sample['dialog'][i]] + random_corpus
56 |                 target_str = '|||'.join(target_list)
57 |                 source_file.write(source_str + '\n')
58 |                 target_file.write(target_str + '\n')
59 |                 context_file.write(' '.join(sample['kwlist'][i-2] +
60 |                                             sample['kwlist'][i-1]) + '\n')
61 |                 keywords_file.write(' '.join(sample['kwlist'][i]) + '\n')
62 |                 label_file.write('0\n')
63 | 
64 |     source_file.close()
65 |     target_file.close()
66 |     keywords_file.close()
67 |     label_file.close()
68 |     keywords_vocab_file.close()
69 |     context_file.close()
70 | 
71 | start_corpus_file.close()
72 | corpus_file.close()
73 | 
--------------------------------------------------------------------------------
/preprocess/convai2/api.py:
--------------------------------------------------------------------------------
1 | import os
2 | data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
'source')
3 | 
4 | class dts_ConvAI2(object):
5 |     def __init__(self, path=data_path):
6 |         self.path = path
7 | 
8 |     def _txt_to_json(self, txt_path, mode, cands):
9 |         def pop_one_sample(lines):
10 |             self_persona = []
11 |             other_persona = []
12 |             dialog = []
13 |             candidates = []
14 | 
15 |             started = False
16 |             while len(lines) > 0:
17 |                 line = lines.pop()
18 |                 id, context = line.split(' ', 1)
19 |                 id = int(id)
20 |                 context = context.strip()
21 | 
22 |                 if started == False:  # not started
23 |                     assert id == 1
24 |                     started = True
25 |                 elif id == 1:  # break for next
26 |                     lines.append(line)
27 |                     break
28 | 
29 |                 if context.startswith('partner\'s persona: '):  # partner
30 |                     assert mode in ['both', 'other']
31 |                     other_persona.append(context[19:])
32 | 
33 |                 elif context.startswith('your persona: '):  # self
34 |                     assert mode in ['both', 'self']
35 |                     self_persona.append(context[14:])
36 | 
37 |                 elif cands == False:  # no cands
38 |                     try:
39 |                         uttr, response = context.split('\t', 2)[:2]
40 |                         dialog.append(uttr)
41 |                         dialog.append(response)
42 |                     except:
43 |                         uttr = context
44 |                         dialog.append(uttr)
45 |                 else:
46 |                     uttr, response, _, negs = context.split('\t', 4)[:4]
47 |                     dialog.append(uttr)
48 |                     dialog.append(response)
49 |                     candidates.append(negs.split('|'))
50 |                     candidates.append(None)
51 | 
52 |             return {
53 |                 'self_persona': self_persona,
54 |                 'other_persona': other_persona,
55 |                 'dialog': dialog,
56 |                 'candidates': candidates
57 |             }
58 | 
59 |         lines = open(txt_path, 'r').readlines()[::-1]
60 | 
61 |         samples = []
62 |         while len(lines) > 0:
63 |             samples.append(pop_one_sample(lines))
64 | 
65 |         return samples
66 | 
67 |     def get_data(self, mode='train', revised=False, cands=False):
68 |         txt_path = os.path.join(self.path, '{}_{}_{}{}.txt'.format(
69 |             mode,
70 |             'none',
71 |             'revised' if revised is True else 'original',
72 |             '' if cands is True else '_no_cands'))
73 |         assert mode in ['train', 'valid', 'test', 'all']
74 |         print("Get dialog from ", txt_path)
75 |         assert os.path.exists(txt_path)
76 |         return self._txt_to_json(txt_path, mode, cands)
77 | 
78 |     def get_dialogs(self, mode='all'):
79 |         dialogs = [sample['dialog'] for sample in self.get_data(mode, False, False)]
80 |         return dialogs
--------------------------------------------------------------------------------
/config/data_config.py:
--------------------------------------------------------------------------------
1 | import os
2 | data_root = './tx_data'
3 | _corpus = [x.strip() for x in open('tx_data/corpus.txt', 'r').readlines()]
4 | _start_corpus = [x.strip() for x in open('tx_data/start_corpus.txt', 'r').readlines()]
5 | _max_seq_len = 30
6 | _num_neg = 20
7 | _max_turns = 8
8 | _batch_size = 64
9 | _retrieval_candidates = 1000
10 | 
11 | data_hparams = {
12 |     stage: {
13 |         "num_epochs": 1,
14 |         "shuffle": stage != 'test',
15 |         "batch_size": _batch_size,
16 |         "datasets": [
17 |             {  # dialogue history
18 |                 "variable_utterance": True,
19 |                 "max_utterance_cnt": 9,
20 |                 "max_seq_length": _max_seq_len,
21 |                 "files": [os.path.join(data_root, '{}/source.txt'.format(stage))],
22 |                 "vocab_file": os.path.join(data_root, 'vocab.txt'),
23 |                 "embedding_init": {
24 |                     "file": os.path.join(data_root, 'embedding.txt'),
25 |                     "dim": 200,
26 |                     "read_fn": "load_glove"
27 |                 },
28 |                 "data_name": "source"
29 |             },
30 |             {  # candidate response
31 |                 "variable_utterance": True,
32 |                 "max_utterance_cnt": 20,
33 |                 "max_seq_length": _max_seq_len,
34 |                 "files": [os.path.join(data_root, '{}/target.txt'.format(stage))],
35 |                 "vocab_share_with": 0,
36 |                 "embedding_init_share_with" : 0,
37 | 
"data_name": "target" 38 | }, 39 | { # context (source keywords) 40 | "files": [os.path.join(data_root, '{}/context.txt'.format(stage))], 41 | "vocab_share_with": 0, 42 | "embedding_init_share_with": 0, 43 | "data_name": "context", 44 | "bos_token": '', 45 | "eos_token": '', 46 | }, 47 | { # target keywords 48 | "files": [os.path.join(data_root, '{}/keywords.txt'.format(stage))], 49 | "vocab_share_with": 0, 50 | "embedding_init_share_with": 0, 51 | "data_name": "keywords", 52 | "bos_token": '', 53 | "eos_token": '', 54 | }, 55 | { # label 56 | "files": [os.path.join(data_root, '{}/label.txt'.format(stage))], 57 | "data_type": "int", 58 | "data_name": "label" 59 | } 60 | ] 61 | } 62 | for stage in ['train','valid','test'] 63 | } 64 | 65 | 66 | corpus_hparams = { 67 | "batch_size": _batch_size*2, 68 | "shuffle": False, 69 | "dataset":{ 70 | "max_seq_length": _max_seq_len, 71 | "files": [os.path.join(data_root, 'corpus.txt')], 72 | "vocab_file": os.path.join(data_root, 'vocab.txt'), 73 | "data_name": "corpus" 74 | } 75 | } 76 | 77 | 78 | _keywords_path = 'tx_data/test/keywords_vocab.txt' 79 | _keywords_candi = [x.strip() for x in open(_keywords_path, 'r').readlines()] 80 | _keywords_num = len(_keywords_candi) 81 | _keywords_dict = {} 82 | for i in range(_keywords_num): 83 | _keywords_dict[_keywords_candi[i]] = i 84 | -------------------------------------------------------------------------------- /preprocess/dataset.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import collections 3 | import random 4 | import pickle 5 | from convai2 import dts_ConvAI2 6 | from extraction import KeywordExtractor 7 | from data_utils import * 8 | 9 | class dts_Target(dts_ConvAI2): 10 | def __init__(self, *args, **kwargs): 11 | super().__init__(*args, **kwargs) 12 | 13 | def get_vocab(self): 14 | counter = collections.Counter() 15 | dialogs = self.get_dialogs() 16 | for dialog in dialogs: 17 | for uttr in dialog: 18 | counter.update(simp_tokenize(uttr)) 19 | print('total vocab count: ', len(counter.items())) 20 | vocab = [token for token, times in sorted(list(counter.items()), key=lambda x: (-x[1], x[0]))] 21 | with open('../tx_data/vocab.txt','w') as f: 22 | for word in vocab: 23 | f.write(word + '\n') 24 | print('save vocab in vocab.txt') 25 | return vocab 26 | 27 | def get_kwsess(self, vocab, mode='all'): 28 | keyword_extractor = KeywordExtractor(vocab) 29 | corpus = self.get_data(mode = mode, cands=False) 30 | sess_set = [] 31 | for sess in corpus: 32 | data = {} 33 | data['history'] = '' 34 | data['dialog'] = [] 35 | for dialog in sess['dialog']: 36 | data['dialog'].append(dialog) 37 | data['history'] = data['history'] + ' ' + dialog 38 | data['kws'] = keyword_extractor.extract(data['history']) 39 | sess_set.append(data) 40 | return sess_set 41 | 42 | def cal_idf(self): 43 | counter = collections.Counter() 44 | dialogs = self.get_dialogs() 45 | total = 0. 
46 | for dialog in dialogs: 47 | for uttr in dialog: 48 | total += 1 49 | counter.update(set(kw_tokenize(uttr))) 50 | idf_dict = {} 51 | for k,v in counter.items(): 52 | idf_dict[k] = np.log10(total / (v+1.)) 53 | return idf_dict 54 | 55 | def make_dataset(self): 56 | vocab = self.get_vocab() 57 | idf_dict = self.cal_idf() 58 | kw_counter = collections.Counter() 59 | sess_set = self.get_kwsess(vocab) 60 | for data in sess_set: 61 | kw_counter.update(data['kws']) 62 | kw_freq = {} 63 | kw_sum = sum(kw_counter.values()) 64 | for k, v in kw_counter.most_common(): 65 | kw_freq[k] = v / kw_sum 66 | for data in sess_set: 67 | data['score'] = 0. 68 | for kw in set(data['kws']): 69 | data['score'] += kw_freq[kw] 70 | data['score'] /= len(set(data['kws'])) 71 | sess_set.sort(key=lambda x: x['score'], reverse=True) 72 | 73 | all_data = {'train':[], 'valid':[], 'test':[]} 74 | keyword_extractor = KeywordExtractor(idf_dict) 75 | for id, sess in enumerate(sess_set): 76 | type = 'train' 77 | if id < 500: 78 | type = 'test' 79 | elif random.random() < 0.05: 80 | type = 'valid' 81 | sample = {'dialog':sess['dialog'], 'kwlist':[]} 82 | for i in range(len(sess['dialog'])): 83 | sample['kwlist'].append(keyword_extractor.idf_extract(sess['dialog'][i])) 84 | all_data[type].append(sample) 85 | pickle.dump(all_data, open('source_data.pk','wb')) 86 | return all_data -------------------------------------------------------------------------------- /simulate.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import importlib 3 | import random 4 | from preprocess.data_utils import utter_preprocess, is_reach_goal 5 | from model import retrieval 6 | 7 | class Target_Simulation(): 8 | def __init__(self, config_model, config_data, config_retrieval): 9 | g1 = tf.Graph() 10 | with g1.as_default(): 11 | self.retrieval_agent = retrieval.Predictor(config_retrieval, config_data) 12 | sess1 = tf.Session(graph=g1, config=self.retrieval_agent.gpu_config) 13 | self.retrieval_agent.retrieve_init(sess1) 14 | g2 = tf.Graph() 15 | with g2.as_default(): 16 | self.target_agent = model.Predictor(config_model, config_data) 17 | sess2 = tf.Session(graph=g2, config=self.target_agent.gpu_config) 18 | self.target_agent.retrieve_init(sess2) 19 | self.start_utter = config_data._start_corpus 20 | success_cnt, turns_cnt = 0, 0 21 | for i in range(int(FLAGS.times)): 22 | print('--------Session {} --------'.format(i)) 23 | success, turns = self.simulate(sess1, sess2) 24 | success_cnt += success 25 | turns_cnt += turns 26 | print('success time {}, average turns {:.2f}'.format(success_cnt, turns_cnt / success_cnt)) 27 | 28 | def simulate(self, sess1, sess2): 29 | history = [] 30 | history.append(random.sample(self.start_utter,1)[0]) 31 | target_kw = random.sample(target_set,1)[0] 32 | self.target_agent.target = target_kw 33 | self.target_agent.score = 0. 
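        # score tracks the best keyword-to-target cosine similarity reached so
        # far in this session; the agents' retrieve() methods only switch to a
        # new keyword when its similarity to the target exceeds this value
        # (see the `tmp_score > self.score` checks in retrieval_stgy.py and
        # matrix.py), which forces monotonic progress toward the target.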
34 | self.target_agent.reply_list = [] 35 | self.retrieval_agent.reply_list = [] 36 | 37 | print('START: ' + history[0]) 38 | for i in range(config_data._max_turns): 39 | source = utter_preprocess(history, config_data._max_seq_len) 40 | reply = self.retrieval_agent.retrieve(source, sess1) 41 | print('retrieval_agent: ', reply) 42 | history.append(reply) 43 | source = utter_preprocess(history, config_data._max_seq_len) 44 | reply = self.target_agent.retrieve(source, sess2) 45 | print('{}_agent: '.format(FLAGS.agent), reply) 46 | print('Keyword: {}, Similarity: {:.2f}'.format(self.target_agent.next_kw, self.target_agent.score)) 47 | history.append(reply) 48 | if is_reach_goal(history[-2] + history[-1], target_kw): 49 | print('Successfully chat to the target \'{}\'.'.format(target_kw)) 50 | return (True, (len(history)+1)//2) 51 | 52 | print('Failed by reaching the maximum turn, target: \'{}\'.'.format(target_kw)) 53 | return (False, 0) 54 | 55 | if __name__ == '__main__': 56 | flags = tf.flags 57 | flags.DEFINE_string('agent', 'kernel', 'The agent type, supports kernel / matrix / neural / retrieval.') 58 | flags.DEFINE_string('times', '100', 'Simulation times.') 59 | 60 | FLAGS = flags.FLAGS 61 | config_data = importlib.import_module('config.data_config') 62 | config_model = importlib.import_module('config.' + FLAGS.agent) 63 | config_retrieval = importlib.import_module('config.retrieval') 64 | model = importlib.import_module('model.' + FLAGS.agent) 65 | 66 | target_set = [] 67 | for line in open('tx_data/test/keywords.txt', 'r').readlines(): 68 | target_set = target_set + line.strip().split(' ') 69 | 70 | Target_Simulation(config_model,config_data,config_retrieval) -------------------------------------------------------------------------------- /preprocess/data_utils.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import os 3 | from nltk.stem import WordNetLemmatizer 4 | 5 | _lemmatizer = WordNetLemmatizer() 6 | 7 | 8 | def tokenize(example, ppln): 9 | for fn in ppln: 10 | example = fn(example) 11 | return example 12 | 13 | 14 | def kw_tokenize(string): 15 | return tokenize(string, [nltk_tokenize, lower, pos_tag, to_basic_form]) 16 | 17 | 18 | def simp_tokenize(string): 19 | return tokenize(string, [nltk_tokenize, lower]) 20 | 21 | 22 | def nltk_tokenize(string): 23 | return nltk.word_tokenize(string) 24 | 25 | 26 | def lower(tokens): 27 | if not isinstance(tokens, str): 28 | return [lower(token) for token in tokens] 29 | return tokens.lower() 30 | 31 | 32 | def pos_tag(tokens): 33 | return nltk.pos_tag(tokens) 34 | 35 | 36 | def to_basic_form(tokens): 37 | if not isinstance(tokens, tuple): 38 | return [to_basic_form(token) for token in tokens] 39 | word, tag = tokens 40 | if tag.startswith('NN'): 41 | pos = 'n' 42 | elif tag.startswith('VB'): 43 | pos = 'v' 44 | elif tag.startswith('JJ'): 45 | pos = 'a' 46 | else: 47 | return word 48 | return _lemmatizer.lemmatize(word, pos) 49 | 50 | 51 | def truecasing(tokens): 52 | ret = [] 53 | is_start = True 54 | for word, tag in tokens: 55 | if word == 'i': 56 | ret.append('I') 57 | elif tag[0].isalpha(): 58 | if is_start: 59 | ret.append(word[0].upper() + word[1:]) 60 | else: 61 | ret.append(word) 62 | is_start = False 63 | else: 64 | if tag != ',': 65 | is_start = True 66 | ret.append(word) 67 | return ret 68 | 69 | 70 | candi_keyword_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'convai2/candi_keyword.txt') 71 | _candiwords = [x.strip() for x in 
open(candi_keyword_path).readlines()]
72 | 
73 | 
74 | def is_candiword(a):
75 |     if a in _candiwords:
76 |         return True
77 |     return False
78 | 
79 | 
80 | from nltk.corpus import wordnet as wn
81 | from nltk.corpus import wordnet_ic
82 | 
83 | brown_ic = wordnet_ic.ic('ic-brown.dat')
84 | 
85 | 
86 | def calculate_linsim(a, b):
87 |     linsim = -1
88 |     syna = wn.synsets(a)
89 |     synb = wn.synsets(b)
90 |     for sa in syna:
91 |         for sb in synb:
92 |             try:
93 |                 linsim = max(linsim, sa.lin_similarity(sb, brown_ic))
94 |             except:
95 |                 pass
96 |     return linsim
97 | 
98 | 
99 | def is_reach_goal(context, goal):
100 |     context = kw_tokenize(context)
101 |     if goal in context:
102 |         return True
103 |     for wd in context:
104 |         if is_candiword(wd):
105 |             rela = calculate_linsim(wd, goal)
106 |             if rela > 0.9:
107 |                 return True
108 |     return False
109 | 
110 | 
111 | def make_context(string):
112 |     string = kw_tokenize(string)
113 |     context = []
114 |     for word in string:
115 |         if is_candiword(word):
116 |             context.append(word)
117 |     return context
118 | 
119 | 
120 | def utter_preprocess(string_list, max_length):
121 |     source, minor_length = [], []
122 |     string_list = string_list[-9:]
123 |     major_length = len(string_list)
124 |     if major_length == 1:
125 |         context = make_context(string_list[-1])
126 |     else:
127 |         context = make_context(string_list[-2] + string_list[-1])
128 |     context_len = len(context)
129 |     while len(context) < 20:
130 |         context.append('')
131 |     for string in string_list:
132 |         string = simp_tokenize(string)
133 |         if len(string) > max_length:
134 |             string = string[:max_length]
135 |         string = [''] + string + ['']
136 |         minor_length.append(len(string))
137 |         while len(string) < max_length + 2:
138 |             string.append('')
139 |         source.append(string)
140 |     while len(source) < 9:
141 |         source.append([''] * (max_length + 2))
142 |         minor_length.append(0)
143 |     return (source, minor_length, major_length, context, context_len)
144 | 
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | # Target-Guided Open-Domain Conversation
2 | 
3 | This is the code for the following paper:
4 | 
5 | [Target-Guided Open-Domain Conversation](http://arxiv.org/abs/1905.11553)
6 | *Jianheng Tang, Tiancheng Zhao, Chenyan Xiong, Xiaodan Liang, Eric Xing, Zhiting Hu; ACL 2019*
7 | 
8 | ### Requirements
9 | 
10 | - `nltk==3.4`
11 | - `tensorflow==1.12`
12 | - `texar>=0.2.1` ([Texar](https://github.com/asyml/texar))
13 | 
14 | ### Usage
15 | 
16 | #### Data Preparation
17 | The dataset developed in the paper is on [Google Drive](https://drive.google.com/file/d/1oTjOQjm7iiUitOPLCmlkXOCbEPoSWDPX/view?usp=sharing). Download
18 | and unzip it into `preprocess/convai2`. Then run the following commands:
19 | ```shell
20 | cd preprocess
21 | python prepare_data.py
22 | ```
23 | By default, the processed data will be put in the `tx_data` directory.
24 | 
25 | #### Turn-level Supervised Learning
26 | This project provides 5 different types of agents: the kernel, neural, matrix, retrieval, and retrieval_stgy agents,
27 | all of which are described in the paper. You can modify the configuration of each agent in the `config` directory.
28 | 
29 | To train the kernel/neural/matrix agent, you need to first train/test the keyword prediction module,
30 | and then train/test the retrieval module of each agent specified by the `--agent` parameter.
31 | 
32 | ```shell
33 | python train.py --mode train_kw --agent kernel
34 | python train.py --mode train --agent kernel
35 | python train.py --mode test --agent kernel
36 | ```
37 | 
38 | The retrieval agent and the retrieval_stgy agent share the same retrieval module. You only need to train one of them:
39 | 
40 | ```shell
41 | python train.py --mode train --agent retrieval
42 | python train.py --mode test --agent retrieval
43 | ```
44 | 
45 | #### Target-guided Conversation
46 | 
47 | After turn-level training, you can start a target-guided conversation (human evaluation) with
48 | the kernel/neural/matrix/retrieval/retrieval_stgy agent specified by the `--agent` parameter.
49 | 
50 | ```shell
51 | python chat.py --agent kernel
52 | ```
53 | A demo of target-guided conversation (the user is required to input the response after `HUMAN:`):
54 | 
55 | ```
56 | --------Session 0 --------
57 | START: hello, how are you doing
58 | HUMAN: hi i am great just finishing up some homework. how about you
59 | AGENT: just got done eating some chicken and rice after i got done with my gym session
60 | HUMAN: i am gonna eat some vegetable for dinner
61 | AGENT: so you are vegetarian ? i am from los angeles so thereare lots of vegetarian restaurants
62 | HUMAN: i am not a vegetarian . fried chicken is my favorite.
63 | AGENT: oh boy , when i go to mcdonald i always get fries for my 2 dogs
64 | Successfully chat to the target 'mcdonald'.
65 | ```
66 | 
67 | You can also watch a simulated target-guided conversation
68 | between the retrieval agent acting as the user and the kernel/neural/matrix/retrieval_stgy agent specified by the `--agent` parameter. The number of simulations is set by the `--times` parameter. At the end, the program reports the number of successful sessions and the average number of turns.
69 | 
70 | ```shell
71 | python simulate.py --agent kernel --times 100
72 | ```
73 | 
74 | A demo of simulation:
75 | ```
76 | --------Session 0 --------
77 | START: hey hey good morning to you
78 | retrieval_agent: fine thanks . what do you do for a living ?
79 | kernel_agent: rewarding job , i work at a hospital
80 | Keyword: job, Similarity: 0.58
81 | Successfully chat to the target 'hospital'.
82 | 
83 | ...
84 | --------Session 99 --------
85 | START: hey hows it going ? i'm just cooking a steak
86 | retrieval_agent: i'm thinking of a bbq sandwich for lunch
87 | kernel_agent: nice i love to cook but now its just me and the fur babies
88 | Keyword: baby, Similarity: 0.45
89 | retrieval_agent: i love bagels however i own a dry cleaners
90 | kernel_agent: i love animals felix my cat and my dog emmy
91 | Keyword: cat, Similarity: 0.56
92 | retrieval_agent: sounds awesome i have all kind of pets my family own a farm
93 | kernel_agent: i love blue as well even my hair is blue
94 | Keyword: blue, Similarity: 1.00
95 | Successfully chat to the target 'blue'.
96 | 97 | success time 83, average turns 4.28 98 | ``` 99 | -------------------------------------------------------------------------------- /model/retrieval.py: -------------------------------------------------------------------------------- 1 | import texar as tx 2 | import tensorflow as tf 3 | import numpy as np 4 | 5 | 6 | class Predictor(): 7 | def __init__(self, config_model, config_data, mode=None): 8 | self.config = config_model 9 | self.data_config = config_data 10 | self.build_model() 11 | self.gpu_config = tf.ConfigProto() 12 | self.gpu_config.gpu_options.allow_growth = True 13 | 14 | def build_model(self): 15 | self.train_data = tx.data.MultiAlignedData(self.data_config.data_hparams['train']) 16 | self.valid_data = tx.data.MultiAlignedData(self.data_config.data_hparams['valid']) 17 | self.test_data = tx.data.MultiAlignedData(self.data_config.data_hparams['test']) 18 | self.iterator = tx.data.TrainTestDataIterator(train=self.train_data, val=self.valid_data, test=self.test_data) 19 | self.vocab = self.train_data.vocab(0) 20 | self.embedder = tx.modules.WordEmbedder(init_value=self.train_data.embedding_init_value(0).word_vecs) 21 | self.source_encoder = tx.modules.HierarchicalRNNEncoder(hparams=self.config.source_encoder_hparams) 22 | self.target_encoder = tx.modules.UnidirectionalRNNEncoder(hparams=self.config.target_encoder_hparams) 23 | self.linear_matcher = tx.modules.MLPTransformConnector(1) 24 | 25 | def forward(self, batch): 26 | source_embed = self.embedder(batch['source_text_ids']) 27 | target_embed = self.embedder(batch['target_text_ids']) 28 | target_embed = tf.reshape(target_embed, [-1, self.data_config._max_seq_len + 2, self.embedder.dim]) 29 | source_code = self.source_encoder(source_embed, 30 | sequence_length_minor=batch['source_length'], 31 | sequence_length_major=batch['source_utterance_cnt'])[1] 32 | target_length = tf.reshape(batch['target_length'], [-1]) 33 | target_code = self.target_encoder(target_embed, sequence_length=target_length)[1] 34 | target_code = tf.reshape(target_code, [-1, 20, self.config._code_len]) 35 | source_code = tf.expand_dims(source_code, 1) 36 | source_code = tf.tile(source_code, [1, 20, 1]) 37 | feature_code = target_code * source_code 38 | feature_code = tf.reshape(feature_code, [-1, self.config._code_len]) 39 | logits = self.linear_matcher(feature_code) 40 | logits = tf.reshape(logits, [-1, 20]) 41 | labels = tf.one_hot(batch['label'], 20) 42 | loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits)) 43 | ans = tf.arg_max(logits, -1) 44 | acc = tx.evals.accuracy(batch['label'], ans) 45 | rank = tf.nn.top_k(logits, k=20)[1] 46 | return loss, acc, rank 47 | 48 | def train(self): 49 | batch = self.iterator.get_next() 50 | loss, acc, _ = self.forward(batch) 51 | op_step = tf.Variable(0, name='op_step') 52 | train_op = tx.core.get_train_op(loss, global_step=op_step, hparams=self.config.opt_hparams) 53 | max_val_acc = 0. 
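        # max_val_acc implements best-checkpoint selection: after each training
        # epoch the validation acc@1 is measured, and the model is saved to
        # _save_path only when it improves, so test() later restores the
        # best-on-validation checkpoint rather than the last one.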
54 | self.saver = tf.train.Saver() 55 | with tf.Session(config=self.gpu_config) as sess: 56 | sess.run(tf.global_variables_initializer()) 57 | sess.run(tf.local_variables_initializer()) 58 | sess.run(tf.tables_initializer()) 59 | for epoch_id in range(self.config._max_epoch): 60 | self.iterator.switch_to_train_data(sess) 61 | cur_step = 0 62 | cnt_acc = [] 63 | while True: 64 | try: 65 | cur_step += 1 66 | feed = {tx.global_mode(): tf.estimator.ModeKeys.TRAIN} 67 | loss, acc_ = sess.run([train_op, acc], feed_dict=feed) 68 | cnt_acc.append(acc_) 69 | if cur_step % 200 == 0: 70 | print('batch {}, loss={}, acc1={}'.format(cur_step, loss, np.mean(cnt_acc[-200:]))) 71 | except tf.errors.OutOfRangeError: 72 | break 73 | op_step = op_step + 1 74 | self.iterator.switch_to_val_data(sess) 75 | cnt_acc = [] 76 | while True: 77 | try: 78 | feed = {tx.global_mode(): tf.estimator.ModeKeys.EVAL} 79 | acc_ = sess.run([acc], feed_dict=feed) 80 | cnt_acc.append(acc_) 81 | except tf.errors.OutOfRangeError: 82 | mean_acc = np.mean(cnt_acc) 83 | print('valid acc1={}'.format(mean_acc)) 84 | if mean_acc > max_val_acc: 85 | max_val_acc = mean_acc 86 | self.saver.save(sess, self.config._save_path) 87 | break 88 | 89 | def test(self): 90 | batch = self.iterator.get_next() 91 | loss, acc, rank = self.forward(batch) 92 | with tf.Session(config=self.gpu_config) as sess: 93 | sess.run(tf.tables_initializer()) 94 | self.saver = tf.train.Saver() 95 | self.saver.restore(sess, self.config._save_path) 96 | self.iterator.switch_to_test_data(sess) 97 | rank_cnt = [] 98 | while True: 99 | try: 100 | feed = {tx.global_mode(): tf.estimator.ModeKeys.PREDICT} 101 | ranks, labels = sess.run([rank, batch['label']], feed_dict=feed) 102 | for i in range(len(ranks)): 103 | rank_cnt.append(np.where(ranks[i]==labels[i])[0][0]) 104 | except tf.errors.OutOfRangeError: 105 | rec = [0,0,0,0,0] 106 | MRR = 0 107 | for rank in rank_cnt: 108 | for i in range(5): 109 | rec[i] += (rank <= i) 110 | MRR += 1 / (rank+1) 111 | print('test rec1@20={:.4f}, rec3@20={:.4f}, rec5@20={:.4f}, MRR={:.4f}'.format( 112 | rec[0]/len(rank_cnt), rec[2]/len(rank_cnt), rec[4]/len(rank_cnt), MRR/len(rank_cnt))) 113 | break 114 | 115 | def retrieve_init(self, sess): 116 | data_batch = self.iterator.get_next() 117 | loss, acc, _ = self.forward(data_batch) 118 | self.corpus = self.data_config._corpus 119 | self.corpus_data = tx.data.MonoTextData(self.data_config.corpus_hparams) 120 | corpus_iterator = tx.data.DataIterator(self.corpus_data) 121 | batch = corpus_iterator.get_next() 122 | corpus_embed = self.embedder(batch['corpus_text_ids']) 123 | utter_code = self.target_encoder(corpus_embed, sequence_length=batch['corpus_length'])[1] 124 | self.corpus_code = np.zeros([0, self.config._code_len]) 125 | corpus_iterator.switch_to_dataset(sess) 126 | sess.run(tf.tables_initializer()) 127 | saver = tf.train.Saver() 128 | saver.restore(sess, self.config._save_path) 129 | feed = {tx.global_mode(): tf.estimator.ModeKeys.PREDICT} 130 | while True: 131 | try: 132 | utter_code_ = sess.run(utter_code, feed_dict=feed) 133 | self.corpus_code = np.concatenate([self.corpus_code, utter_code_], axis=0) 134 | except tf.errors.OutOfRangeError: 135 | break 136 | 137 | self.minor_length_input = tf.placeholder(dtype=tf.int32, shape=(1, 9)) 138 | self.major_length_input = tf.placeholder(dtype=tf.int32, shape=(1)) 139 | self.history_input = tf.placeholder(dtype=object, shape=(9, self.data_config._max_seq_len + 2)) 140 | 141 | history_ids = self.vocab.map_tokens_to_ids(self.history_input) 142 | 
history_embed = self.embedder(history_ids) 143 | history_code = self.source_encoder(tf.expand_dims(history_embed, axis=0), 144 | sequence_length_minor=self.minor_length_input, 145 | sequence_length_major=self.major_length_input)[1] 146 | select_corpus = tf.cast(self.corpus_code, dtype=tf.float32) 147 | feature_code = self.linear_matcher(select_corpus * history_code) 148 | self.ans_output = tf.nn.top_k(tf.squeeze(feature_code, 1), k=self.data_config._retrieval_candidates)[1] 149 | 150 | def retrieve(self, source, sess): 151 | history, seq_len, turns, context, context_len = source 152 | ans = sess.run(self.ans_output, feed_dict={self.history_input: history, 153 | self.minor_length_input: [seq_len], 154 | self.major_length_input: [turns]}) 155 | for i in range(self.data_config._max_turns + 1): 156 | if ans[i] not in self.reply_list: # avoid repeat 157 | self.reply_list.append(ans[i]) 158 | reply = self.corpus[ans[i]] 159 | break 160 | return reply 161 | -------------------------------------------------------------------------------- /model/retrieval_stgy.py: -------------------------------------------------------------------------------- 1 | import texar as tx 2 | import tensorflow as tf 3 | import numpy as np 4 | from preprocess.data_utils import kw_tokenize 5 | 6 | class Predictor(): 7 | def __init__(self, config_model, config_data, mode=None): 8 | self.config = config_model 9 | self.data_config = config_data 10 | self.build_model() 11 | self.gpu_config = tf.ConfigProto() 12 | self.gpu_config.gpu_options.allow_growth = True 13 | 14 | def build_model(self): 15 | self.train_data = tx.data.MultiAlignedData(self.data_config.data_hparams['train']) 16 | self.valid_data = tx.data.MultiAlignedData(self.data_config.data_hparams['valid']) 17 | self.test_data = tx.data.MultiAlignedData(self.data_config.data_hparams['test']) 18 | self.iterator = tx.data.TrainTestDataIterator(train=self.train_data, val=self.valid_data, test=self.test_data) 19 | self.vocab = self.train_data.vocab(0) 20 | self.embedder = tx.modules.WordEmbedder(init_value=self.train_data.embedding_init_value(0).word_vecs) 21 | self.source_encoder = tx.modules.HierarchicalRNNEncoder(hparams=self.config.source_encoder_hparams) 22 | self.target_encoder = tx.modules.UnidirectionalRNNEncoder(hparams=self.config.target_encoder_hparams) 23 | self.linear_matcher = tx.modules.MLPTransformConnector(1) 24 | self.kw_list = self.vocab.map_tokens_to_ids(tf.convert_to_tensor(self.data_config._keywords_candi)) 25 | 26 | def forward(self, batch): 27 | source_embed = self.embedder(batch['source_text_ids']) 28 | target_embed = self.embedder(batch['target_text_ids']) 29 | target_embed = tf.reshape(target_embed, [-1, self.data_config._max_seq_len + 2, self.embedder.dim]) 30 | source_code = self.source_encoder(source_embed, 31 | sequence_length_minor=batch['source_length'], 32 | sequence_length_major=batch['source_utterance_cnt'])[1] 33 | target_length = tf.reshape(batch['target_length'], [-1]) 34 | target_code = self.target_encoder(target_embed, sequence_length=target_length)[1] 35 | target_code = tf.reshape(target_code, [-1, 20, self.config._code_len]) 36 | source_code = tf.expand_dims(source_code, 1) 37 | source_code = tf.tile(source_code, [1, 20, 1]) 38 | feature_code = target_code * source_code 39 | feature_code = tf.reshape(feature_code, [-1, self.config._code_len]) 40 | logits = self.linear_matcher(feature_code) 41 | logits = tf.reshape(logits, [-1, 20]) 42 | labels = tf.one_hot(batch['label'], 20) 43 | loss = 
tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits)) 44 | ans = tf.arg_max(logits, -1) 45 | acc = tx.evals.accuracy(batch['label'], ans) 46 | rank = tf.nn.top_k(logits, k=20)[1] 47 | return loss, acc, rank 48 | 49 | def train(self): 50 | batch = self.iterator.get_next() 51 | loss, acc, _ = self.forward(batch) 52 | op_step = tf.Variable(0, name='op_step') 53 | train_op = tx.core.get_train_op(loss, global_step=op_step, hparams=self.config.opt_hparams) 54 | max_val_acc = 0. 55 | self.saver = tf.train.Saver() 56 | with tf.Session(config=self.gpu_config) as sess: 57 | sess.run(tf.global_variables_initializer()) 58 | sess.run(tf.local_variables_initializer()) 59 | sess.run(tf.tables_initializer()) 60 | for epoch_id in range(self.config._max_epoch): 61 | self.iterator.switch_to_train_data(sess) 62 | cur_step = 0 63 | cnt_acc = [] 64 | while True: 65 | try: 66 | cur_step += 1 67 | feed = {tx.global_mode(): tf.estimator.ModeKeys.TRAIN} 68 | loss, acc_ = sess.run([train_op, acc], feed_dict=feed) 69 | cnt_acc.append(acc_) 70 | if cur_step % 200 == 0: 71 | print('batch {}, loss={}, acc1={}'.format(cur_step, loss, np.mean(cnt_acc[-200:]))) 72 | except tf.errors.OutOfRangeError: 73 | break 74 | op_step = op_step + 1 75 | self.iterator.switch_to_val_data(sess) 76 | cnt_acc = [] 77 | while True: 78 | try: 79 | feed = {tx.global_mode(): tf.estimator.ModeKeys.EVAL} 80 | acc_ = sess.run([acc], feed_dict=feed) 81 | cnt_acc.append(acc_) 82 | except tf.errors.OutOfRangeError: 83 | mean_acc = np.mean(cnt_acc) 84 | print('valid acc1={}'.format(mean_acc)) 85 | if mean_acc > max_val_acc: 86 | max_val_acc = mean_acc 87 | self.saver.save(sess, self.config._save_path) 88 | break 89 | 90 | def test(self): 91 | batch = self.iterator.get_next() 92 | loss, acc, rank = self.forward(batch) 93 | with tf.Session(config=self.gpu_config) as sess: 94 | sess.run(tf.tables_initializer()) 95 | self.saver = tf.train.Saver() 96 | self.saver.restore(sess, self.config._save_path) 97 | self.iterator.switch_to_test_data(sess) 98 | rank_cnt = [] 99 | while True: 100 | try: 101 | feed = {tx.global_mode(): tf.estimator.ModeKeys.PREDICT} 102 | ranks, labels = sess.run([rank, batch['label']], feed_dict=feed) 103 | for i in range(len(ranks)): 104 | rank_cnt.append(np.where(ranks[i]==labels[i])[0][0]) 105 | except tf.errors.OutOfRangeError: 106 | rec = [0,0,0,0,0] 107 | MRR = 0 108 | for rank in rank_cnt: 109 | for i in range(5): 110 | rec[i] += (rank <= i) 111 | MRR += 1 / (rank+1) 112 | print('test rec1@20={:.4f}, rec3@20={:.4f}, rec5@20={:.4f}, MRR={:.4f}'.format( 113 | rec[0]/len(rank_cnt), rec[2]/len(rank_cnt), rec[4]/len(rank_cnt), MRR/len(rank_cnt))) 114 | break 115 | 116 | def retrieve_init(self, sess): 117 | data_batch = self.iterator.get_next() 118 | loss, acc, _ = self.forward(data_batch) 119 | self.corpus = self.data_config._corpus 120 | self.corpus_data = tx.data.MonoTextData(self.data_config.corpus_hparams) 121 | corpus_iterator = tx.data.DataIterator(self.corpus_data) 122 | batch = corpus_iterator.get_next() 123 | corpus_embed = self.embedder(batch['corpus_text_ids']) 124 | utter_code = self.target_encoder(corpus_embed, sequence_length=batch['corpus_length'])[1] 125 | self.corpus_code = np.zeros([0, self.config._code_len]) 126 | 127 | corpus_iterator.switch_to_dataset(sess) 128 | sess.run(tf.tables_initializer()) 129 | saver = tf.train.Saver() 130 | saver.restore(sess, self.config._save_path) 131 | feed = {tx.global_mode(): tf.estimator.ModeKeys.PREDICT} 132 | while True: 133 | try: 134 | 
utter_code_ = sess.run(utter_code, feed_dict=feed) 135 | self.corpus_code = np.concatenate([self.corpus_code, utter_code_], axis=0) 136 | except tf.errors.OutOfRangeError: 137 | break 138 | 139 | self.keywords_embed = tf.nn.l2_normalize(self.embedder(self.kw_list), axis=1) 140 | self.kw_embedding = sess.run(self.keywords_embed) 141 | 142 | self.minor_length_input = tf.placeholder(dtype=tf.int32, shape=(1, 9)) 143 | self.major_length_input = tf.placeholder(dtype=tf.int32, shape=(1)) 144 | self.history_input = tf.placeholder(dtype=object, shape=(9, self.data_config._max_seq_len + 2)) 145 | 146 | history_ids = self.vocab.map_tokens_to_ids(self.history_input) 147 | history_embed = self.embedder(history_ids) 148 | history_code = self.source_encoder(tf.expand_dims(history_embed, axis=0), 149 | sequence_length_minor=self.minor_length_input, 150 | sequence_length_major=self.major_length_input)[1] 151 | select_corpus = tf.cast(self.corpus_code, dtype=tf.float32) 152 | feature_code = self.linear_matcher(select_corpus * history_code) 153 | self.ans_output = tf.nn.top_k(tf.squeeze(feature_code, 1), k=1000)[1] 154 | 155 | def retrieve(self, source, sess): 156 | history, seq_len, turns, context, context_len = source 157 | ans = sess.run(self.ans_output, feed_dict={self.history_input: history, 158 | self.minor_length_input: [seq_len], 159 | self.major_length_input: [turns]}) 160 | flag = 0 161 | reply = self.corpus[ans[0]] 162 | for i in ans: 163 | if i in self.reply_list: # avoid repeat 164 | continue 165 | for wd in kw_tokenize(self.corpus[i]): 166 | if wd in self.data_config._keywords_candi: 167 | tmp_score = sum(self.kw_embedding[self.data_config._keywords_dict[wd]] * 168 | self.kw_embedding[self.data_config._keywords_dict[self.target]]) 169 | if tmp_score > self.score: 170 | reply = self.corpus[i] 171 | self.score = tmp_score 172 | self.next_kw = wd 173 | flag = 1 174 | break 175 | if flag == 0: 176 | continue 177 | break 178 | return reply 179 | 180 | -------------------------------------------------------------------------------- /model/matrix.py: -------------------------------------------------------------------------------- 1 | import texar as tx 2 | import tensorflow as tf 3 | import numpy as np 4 | import pickle 5 | 6 | class Predictor(): 7 | def __init__(self, config_model, config_data, mode=None): 8 | self.config = config_model 9 | self.data_config = config_data 10 | self.gpu_config = tf.ConfigProto() 11 | self.gpu_config.gpu_options.allow_growth = True 12 | self.build_model(mode) 13 | 14 | def build_model(self, mode): 15 | self.train_data = tx.data.MultiAlignedData(self.data_config.data_hparams['train']) 16 | self.valid_data = tx.data.MultiAlignedData(self.data_config.data_hparams['valid']) 17 | self.test_data = tx.data.MultiAlignedData(self.data_config.data_hparams['test']) 18 | self.iterator = tx.data.TrainTestDataIterator(train=self.train_data, val=self.valid_data, test=self.test_data) 19 | self.vocab = self.train_data.vocab(0) 20 | self.source_encoder = tx.modules.HierarchicalRNNEncoder(hparams=self.config.source_encoder_hparams) 21 | self.target_encoder = tx.modules.BidirectionalRNNEncoder(hparams=self.config.target_encoder_hparams) 22 | self.target_kwencoder = tx.modules.BidirectionalRNNEncoder(hparams=self.config.target_kwencoder_hparams) 23 | self.linear_transform = tx.modules.MLPTransformConnector(self.config._code_len // 2) 24 | self.linear_matcher = tx.modules.MLPTransformConnector(1) 25 | self.embedder = 
tx.modules.WordEmbedder(init_value=self.train_data.embedding_init_value(0).word_vecs) 26 | self.kw_list = self.vocab.map_tokens_to_ids(tf.convert_to_tensor(self.data_config._keywords_candi)) 27 | self.kw_vocab = tx.data.Vocab(self.data_config._keywords_path) 28 | 29 | if mode == 'train_kw': 30 | self.pmi_matrix = np.zeros([self.config._vocab_size+4, self.data_config._keywords_num]) 31 | else: 32 | with open(self.config._matrix_save_path, 'rb') as f: 33 | matrix = pickle.load(f) 34 | self.pmi_matrix = tf.convert_to_tensor(matrix,dtype=tf.float32) 35 | 36 | def forward_matrix(self, context_ids): 37 | matching_score = tf.gather(self.pmi_matrix, context_ids) 38 | return tf.reduce_sum(tf.log(matching_score), axis=0) 39 | 40 | def predict_keywords(self, batch): 41 | keywords_ids = self.kw_vocab.map_tokens_to_ids(batch['keywords_text']) 42 | matching_score = tf.map_fn(lambda x: self.forward_matrix(x), batch['context_text_ids'], 43 | dtype=tf.float32, parallel_iterations=True) 44 | kw_labels = tf.map_fn(lambda x: tf.sparse_to_dense(x, [self.kw_vocab.size], 1., 0., False), 45 | keywords_ids, dtype=tf.float32, parallel_iterations=True)[:, 4:] 46 | kw_ans = tf.arg_max(matching_score, -1) 47 | acc_label = tf.map_fn(lambda x: tf.gather(x[0], x[1]), (kw_labels, kw_ans), dtype=tf.float32) 48 | acc = tf.reduce_mean(acc_label) 49 | kws = tf.nn.top_k(matching_score, k=5)[1] 50 | kws = tf.reshape(kws,[-1]) 51 | kws = tf.map_fn(lambda x: self.kw_list[x], kws, dtype=tf.int64) 52 | kws = tf.reshape(kws,[-1, 5]) 53 | return acc, kws 54 | 55 | def train_keywords(self): 56 | batch = self.iterator.get_next() 57 | acc, _ = self.predict_keywords(batch) 58 | with tf.Session(config=self.gpu_config) as sess: 59 | sess.run(tf.global_variables_initializer()) 60 | sess.run(tf.local_variables_initializer()) 61 | sess.run(tf.tables_initializer()) 62 | self.iterator.switch_to_train_data(sess) 63 | 64 | batchid = 0 65 | while True: 66 | try: 67 | batchid += 1 68 | feed = {tx.global_mode(): tf.estimator.ModeKeys.TRAIN} 69 | source_keywords, target_keywords = sess.run([batch['context_text_ids'], 70 | batch['keywords_text_ids']], feed_dict=feed) 71 | for i in range(len(source_keywords)): 72 | for skw_id in source_keywords[i]: 73 | if skw_id == 0: 74 | break 75 | for tkw_id in target_keywords[i]: 76 | if skw_id >= 3 and tkw_id >= 3: 77 | tkw = self.config._vocab[tkw_id-4] 78 | if tkw in self.data_config._keywords_candi: 79 | tkw_id = self.data_config._keywords_dict[tkw] 80 | self.pmi_matrix[skw_id][tkw_id] += 1 81 | 82 | except tf.errors.OutOfRangeError: 83 | break 84 | self.pmi_matrix += 0.5 85 | self.pmi_matrix = self.pmi_matrix / (np.sum(self.pmi_matrix, axis=0) + 1) 86 | with open(self.config._matrix_save_path,'wb') as f: 87 | pickle.dump(self.pmi_matrix, f) 88 | 89 | def test_keywords(self): 90 | batch = self.iterator.get_next() 91 | acc, kws = self.predict_keywords(batch) 92 | with tf.Session(config=self.gpu_config) as sess: 93 | sess.run(tf.global_variables_initializer()) 94 | sess.run(tf.local_variables_initializer()) 95 | sess.run(tf.tables_initializer()) 96 | self.iterator.switch_to_test_data(sess) 97 | cnt_acc, cnt_rec1, cnt_rec3, cnt_rec5 = [], [], [], [] 98 | while True: 99 | try: 100 | feed = {tx.global_mode(): tf.estimator.ModeKeys.PREDICT} 101 | acc_, kw_ans, kw_labels = sess.run([acc, kws, batch['keywords_text_ids']], feed_dict=feed) 102 | cnt_acc.append(acc_) 103 | rec = [0,0,0,0,0] 104 | sum_kws = 0 105 | for i in range(len(kw_ans)): 106 | sum_kws += sum(kw_labels[i] > 3) 107 | for j in range(5): 108 | if 
kw_ans[i][j] in kw_labels[i]: 109 | for k in range(j, 5): 110 | rec[k] += 1 111 | cnt_rec1.append(rec[0]/sum_kws) 112 | cnt_rec3.append(rec[2]/sum_kws) 113 | cnt_rec5.append(rec[4]/sum_kws) 114 | 115 | except tf.errors.OutOfRangeError: 116 | print('test_kw acc@1={:.4f}, rec@1={:.4f}, rec@3={:.4f}, rec@5={:.4f}'.format( 117 | np.mean(cnt_acc), np.mean(cnt_rec1), np.mean(cnt_rec3), np.mean(cnt_rec5))) 118 | break 119 | 120 | 121 | def forward(self, batch): 122 | matching_score = tf.map_fn(lambda x: self.forward_matrix(x), batch['context_text_ids'], 123 | dtype=tf.float32, parallel_iterations=True) 124 | kw_weight, predict_kw = tf.nn.top_k(matching_score, k=3) 125 | predict_kw = tf.reshape(predict_kw, [-1]) 126 | predict_kw = tf.map_fn(lambda x: self.kw_list[x], predict_kw, dtype=tf.int64) 127 | predict_kw = tf.reshape(predict_kw, [-1, 3]) 128 | embed_code = self.embedder(predict_kw) 129 | embed_code = tf.reduce_sum(embed_code, axis=1) 130 | embed_code = self.linear_transform(embed_code) 131 | 132 | source_embed = self.embedder(batch['source_text_ids']) 133 | target_embed = self.embedder(batch['target_text_ids']) 134 | target_embed = tf.reshape(target_embed, [-1, self.data_config._max_seq_len + 2, self.embedder.dim]) 135 | target_length = tf.reshape(batch['target_length'], [-1]) 136 | source_code = self.source_encoder( 137 | source_embed, 138 | sequence_length_minor=batch['source_length'], 139 | sequence_length_major=batch['source_utterance_cnt'])[1] 140 | target_code = self.target_encoder( 141 | target_embed, 142 | sequence_length=target_length)[1] 143 | target_kwcode = self.target_kwencoder( 144 | target_embed, 145 | sequence_length=target_length)[1] 146 | target_code = tf.concat([target_code[0], target_code[1], target_kwcode[0], target_kwcode[1]], -1) 147 | target_code = tf.reshape(target_code, [-1, 20, self.config._code_len]) 148 | 149 | source_code = tf.concat([source_code, embed_code], -1) 150 | source_code = tf.expand_dims(source_code, 1) 151 | source_code = tf.tile(source_code, [1, 20, 1]) 152 | feature_code = target_code * source_code 153 | feature_code = tf.reshape(feature_code, [-1, self.config._code_len]) 154 | 155 | logits = self.linear_matcher(feature_code) 156 | logits = tf.reshape(logits, [-1, 20]) 157 | labels = tf.one_hot(batch['label'], 20) 158 | loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits)) 159 | ans = tf.arg_max(logits, -1) 160 | acc = tx.evals.accuracy(batch['label'], ans) 161 | rank = tf.nn.top_k(logits, k=20)[1] 162 | return loss, acc, rank 163 | 164 | def train(self): 165 | batch = self.iterator.get_next() 166 | loss, acc, _ = self.forward(batch) 167 | op_step = tf.Variable(0, name='retrieval_step') 168 | train_op = tx.core.get_train_op(loss, global_step=op_step, hparams=self.config.opt_hparams) 169 | max_val_acc = 0. 
170 | with tf.Session(config=self.gpu_config) as sess: 171 | sess.run(tf.tables_initializer()) 172 | sess.run(tf.global_variables_initializer()) 173 | sess.run(tf.local_variables_initializer()) 174 | saver = tf.train.Saver() 175 | for epoch_id in range(self.config._max_epoch): 176 | self.iterator.switch_to_train_data(sess) 177 | cur_step = 0 178 | cnt_acc = [] 179 | while True: 180 | try: 181 | cur_step += 1 182 | feed = {tx.global_mode(): tf.estimator.ModeKeys.TRAIN} 183 | loss, acc_ = sess.run([train_op, acc], feed_dict=feed) 184 | cnt_acc.append(acc_) 185 | if cur_step % 200 == 0: 186 | print('batch {}, loss={}, acc1={}'.format(cur_step, loss, np.mean(cnt_acc[-200:]))) 187 | except tf.errors.OutOfRangeError: 188 | break 189 | self.iterator.switch_to_val_data(sess) 190 | 191 | cnt_acc= [] 192 | while True: 193 | try: 194 | feed = {tx.global_mode(): tf.estimator.ModeKeys.EVAL} 195 | acc_ = sess.run(acc, feed_dict=feed) 196 | cnt_acc.append(acc_) 197 | except tf.errors.OutOfRangeError: 198 | mean_acc = np.mean(cnt_acc) 199 | print('valid acc1={}'.format(mean_acc)) 200 | if mean_acc > max_val_acc: 201 | max_val_acc = mean_acc 202 | saver.save(sess, self.config._save_path) 203 | break 204 | 205 | def test(self): 206 | batch = self.iterator.get_next() 207 | loss, acc, rank = self.forward(batch) 208 | with tf.Session(config=self.gpu_config) as sess: 209 | sess.run(tf.tables_initializer()) 210 | self.saver = tf.train.Saver() 211 | self.saver.restore(sess, self.config._save_path) 212 | self.iterator.switch_to_test_data(sess) 213 | rank_cnt = [] 214 | while True: 215 | try: 216 | feed = {tx.global_mode(): tf.estimator.ModeKeys.PREDICT} 217 | ranks, labels = sess.run([rank, batch['label']], feed_dict=feed) 218 | for i in range(len(ranks)): 219 | rank_cnt.append(np.where(ranks[i]==labels[i])[0][0]) 220 | except tf.errors.OutOfRangeError: 221 | rec = [0,0,0,0,0] 222 | MRR = 0 223 | for rank in rank_cnt: 224 | for i in range(5): 225 | rec[i] += (rank <= i) 226 | MRR += 1 / (rank+1) 227 | print('test rec1@20={:.4f}, rec3@20={:.4f}, rec5@20={:.4f}, MRR={:.4f}'.format( 228 | rec[0]/len(rank_cnt), rec[2]/len(rank_cnt), rec[4]/len(rank_cnt), MRR/len(rank_cnt))) 229 | break 230 | 231 | def retrieve_init(self, sess): 232 | data_batch = self.iterator.get_next() 233 | loss, acc, _ = self.forward(data_batch) 234 | self.corpus = self.data_config._corpus 235 | self.corpus_data = tx.data.MonoTextData(self.data_config.corpus_hparams) 236 | corpus_iterator = tx.data.DataIterator(self.corpus_data) 237 | batch = corpus_iterator.get_next() 238 | corpus_embed = self.embedder(batch['corpus_text_ids']) 239 | utter_code = self.target_encoder(corpus_embed, sequence_length=batch['corpus_length'])[1] 240 | utter_kwcode = self.target_kwencoder(corpus_embed, sequence_length=batch['corpus_length'])[1] 241 | utter_code = tf.concat([utter_code[0], utter_code[1], utter_kwcode[0], utter_kwcode[1]], -1) 242 | self.corpus_code = np.zeros([0, self.config._code_len]) 243 | 244 | corpus_iterator.switch_to_dataset(sess) 245 | sess.run(tf.tables_initializer()) 246 | saver = tf.train.Saver() 247 | saver.restore(sess, self.config._save_path) 248 | feed = {tx.global_mode(): tf.estimator.ModeKeys.PREDICT} 249 | while True: 250 | try: 251 | utter_code_ = sess.run(utter_code, feed_dict=feed) 252 | self.corpus_code = np.concatenate([self.corpus_code, utter_code_], axis=0) 253 | except tf.errors.OutOfRangeError: 254 | break 255 | self.minor_length_input = tf.placeholder(dtype=tf.int32, shape=(1, 9)) 256 | self.major_length_input = 
tf.placeholder(dtype=tf.int32, shape=(1)) 257 | self.history_input = tf.placeholder(dtype=object, shape=(9, self.data_config._max_seq_len + 2)) 258 | self.keywords_embed = tf.nn.l2_normalize(self.embedder(self.kw_list), axis=1) 259 | self.kw_embedding = sess.run(self.keywords_embed) 260 | 261 | # predict keyword 262 | self.context_input = tf.placeholder(dtype=object) 263 | context_ids = self.vocab.map_tokens_to_ids(self.context_input) 264 | matching_score = self.forward_matrix(context_ids) 265 | self.candi_output =tf.nn.top_k(matching_score, self.data_config._keywords_num)[1] 266 | 267 | # retrieve 268 | self.minor_length_input = tf.placeholder(dtype=tf.int32, shape=(1, 9)) 269 | self.major_length_input = tf.placeholder(dtype=tf.int32, shape=(1)) 270 | self.history_input = tf.placeholder(dtype=object, shape=(9, self.data_config._max_seq_len + 2)) 271 | self.kw_input = tf.placeholder(dtype=tf.int32) 272 | history_ids = self.vocab.map_tokens_to_ids(self.history_input) 273 | history_embed = self.embedder(history_ids) 274 | history_code = self.source_encoder(tf.expand_dims(history_embed, axis=0), 275 | sequence_length_minor=self.minor_length_input, 276 | sequence_length_major=self.major_length_input)[1] 277 | self.next_kw_ids = self.kw_list[self.kw_input] 278 | embed_code = tf.expand_dims(self.embedder(self.next_kw_ids), 0) 279 | embed_code = self.linear_transform(embed_code) 280 | history_code = tf.concat([history_code, embed_code], 1) 281 | select_corpus = tf.cast(self.corpus_code, dtype=tf.float32) 282 | feature_code = self.linear_matcher(select_corpus * history_code) 283 | self.ans_output = tf.nn.top_k(tf.squeeze(feature_code,1), k=self.data_config._retrieval_candidates)[1] 284 | 285 | def retrieve(self, history_all, sess): 286 | history, seq_len, turns, context, context_len = history_all 287 | kw_candi = sess.run(self.candi_output, feed_dict={self.context_input: context[:context_len]}) 288 | for kw in kw_candi: 289 | tmp_score = sum(self.kw_embedding[kw] * self.kw_embedding[self.data_config._keywords_dict[self.target]]) 290 | if tmp_score > self.score: 291 | self.score = tmp_score 292 | self.next_kw = self.data_config._keywords_candi[kw] 293 | break 294 | ans = sess.run(self.ans_output, feed_dict={self.history_input: history, 295 | self.minor_length_input: [seq_len], self.major_length_input: [turns], 296 | self.kw_input: self.data_config._keywords_dict[self.next_kw]}) 297 | for i in range(self.data_config._max_turns + 1): 298 | if ans[i] not in self.reply_list: 299 | self.reply_list.append(ans[i]) 300 | reply = self.corpus[ans[i]] 301 | break 302 | return reply 303 | -------------------------------------------------------------------------------- /model/neural.py: -------------------------------------------------------------------------------- 1 | import texar as tx 2 | import tensorflow as tf 3 | import numpy as np 4 | from preprocess.data_utils import kw_tokenize 5 | 6 | 7 | class Predictor(): 8 | def __init__(self, config_model, config_data, mode=None): 9 | self.config = config_model 10 | self.data_config = config_data 11 | self.gpu_config = tf.ConfigProto() 12 | self.gpu_config.gpu_options.allow_growth = True 13 | self.build_model() 14 | 15 | def build_model(self): 16 | self.train_data = tx.data.MultiAlignedData(self.data_config.data_hparams['train']) 17 | self.valid_data = tx.data.MultiAlignedData(self.data_config.data_hparams['valid']) 18 | self.test_data = tx.data.MultiAlignedData(self.data_config.data_hparams['test']) 19 | self.iterator = 
tx.data.TrainTestDataIterator(train=self.train_data, val=self.valid_data, test=self.test_data) 20 | self.vocab = self.train_data.vocab(0) 21 | self.source_encoder = tx.modules.HierarchicalRNNEncoder(hparams=self.config.source_encoder_hparams) 22 | self.target_encoder = tx.modules.BidirectionalRNNEncoder(hparams=self.config.target_encoder_hparams) 23 | self.target_kwencoder = tx.modules.BidirectionalRNNEncoder(hparams=self.config.target_kwencoder_hparams) 24 | self.linear_transform = tx.modules.MLPTransformConnector(self.config._code_len // 2) 25 | self.linear_matcher = tx.modules.MLPTransformConnector(1) 26 | self.context_encoder = tx.modules.UnidirectionalRNNEncoder(hparams=self.config.context_encoder_hparams) 27 | self.predict_layer = tx.modules.MLPTransformConnector(self.data_config._keywords_num) 28 | self.embedder = tx.modules.WordEmbedder(init_value=self.train_data.embedding_init_value(0).word_vecs) 29 | self.kw_list = self.vocab.map_tokens_to_ids(tf.convert_to_tensor(self.data_config._keywords_candi)) 30 | self.kw_vocab = tx.data.Vocab(self.data_config._keywords_path) 31 | 32 | def forward_neural(self, context_ids, context_length): 33 | context_embed = self.embedder(context_ids) 34 | context_code = self.context_encoder(context_embed, sequence_length=context_length)[1] 35 | keyword_score = self.predict_layer(context_code) 36 | return keyword_score 37 | 38 | def predict_keywords(self, batch): 39 | matching_score = self.forward_neural(batch['context_text_ids'], batch['context_length']) 40 | keywords_ids = self.kw_vocab.map_tokens_to_ids(batch['keywords_text']) 41 | kw_labels = tf.map_fn(lambda x: tf.sparse_to_dense(x, [self.kw_vocab.size], 1., 0., False), 42 | keywords_ids, dtype=tf.float32, parallel_iterations=True)[:, 4:] 43 | loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=kw_labels, logits=matching_score) 44 | loss = tf.reduce_mean(loss) 45 | kw_ans = tf.arg_max(matching_score, -1) 46 | acc_label = tf.map_fn(lambda x: tf.gather(x[0], x[1]), (kw_labels, kw_ans), dtype=tf.float32) 47 | acc = tf.reduce_mean(acc_label) 48 | kws = tf.nn.top_k(matching_score, k=5)[1] 49 | kws = tf.reshape(kws,[-1]) 50 | kws = tf.map_fn(lambda x: self.kw_list[x], kws, dtype=tf.int64) 51 | kws = tf.reshape(kws,[-1, 5]) 52 | return loss, acc, kws 53 | 54 | def train_keywords(self): 55 | batch = self.iterator.get_next() 56 | loss, acc, _ = self.predict_keywords(batch) 57 | op_step = tf.Variable(0, name='op_step') 58 | train_op = tx.core.get_train_op(loss, global_step=op_step, hparams=self.config.neural_opt_hparams) 59 | max_val_acc = 0. 
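# Neural keyword-predictor training loop: optimizes the sigmoid cross-entropy
# loss from predict_keywords() over the keyword label vector, validates after
# every epoch, and keeps only the checkpoint (self.config._neural_save_path)
# whose validation acc@1 is the best observed.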
60 | self.saver = tf.train.Saver() 61 | with tf.Session(config=self.gpu_config) as sess: 62 | sess.run(tf.global_variables_initializer()) 63 | sess.run(tf.local_variables_initializer()) 64 | sess.run(tf.tables_initializer()) 65 | for epoch_id in range(self.config._max_epoch): 66 | self.iterator.switch_to_train_data(sess) 67 | cur_step = 0 68 | cnt_acc = [] 69 | while True: 70 | try: 71 | cur_step += 1 72 | feed = {tx.global_mode(): tf.estimator.ModeKeys.TRAIN} 73 | loss_, acc_ = sess.run([train_op, acc], feed_dict=feed) 74 | cnt_acc.append(acc_) 75 | if cur_step % 200 == 0: 76 | print('batch {}, loss={}, acc1={}'.format(cur_step, loss_, np.mean(cnt_acc[-200:]))) 77 | except tf.errors.OutOfRangeError: 78 | break 79 | self.iterator.switch_to_val_data(sess) 80 | cnt_acc = [] 81 | while True: 82 | try: 83 | feed = {tx.global_mode(): tf.estimator.ModeKeys.EVAL} 84 | acc_ = sess.run(acc, feed_dict=feed) 85 | cnt_acc.append(acc_) 86 | except tf.errors.OutOfRangeError: 87 | mean_acc = np.mean(cnt_acc) 88 | if mean_acc > max_val_acc: 89 | max_val_acc = mean_acc 90 | self.saver.save(sess, self.config._neural_save_path) 91 | print('epoch_id {}, valid acc1={}'.format(epoch_id+1, mean_acc)) 92 | break 93 | 94 | def test_keywords(self): 95 | batch = self.iterator.get_next() 96 | loss, acc, kws = self.predict_keywords(batch) 97 | saver = tf.train.Saver() 98 | with tf.Session(config=self.gpu_config) as sess: 99 | sess.run(tf.global_variables_initializer()) 100 | sess.run(tf.local_variables_initializer()) 101 | sess.run(tf.tables_initializer()) 102 | saver.restore(sess, self.config._neural_save_path) 103 | self.iterator.switch_to_test_data(sess) 104 | cnt_acc, cnt_rec1, cnt_rec3, cnt_rec5 = [], [], [], [] 105 | while True: 106 | try: 107 | feed = {tx.global_mode(): tf.estimator.ModeKeys.PREDICT} 108 | acc_, kw_ans, kw_labels = sess.run([acc, kws, batch['keywords_text_ids']], feed_dict=feed) 109 | cnt_acc.append(acc_) 110 | rec = [0,0,0,0,0] 111 | sum_kws = 0 112 | for i in range(len(kw_ans)): 113 | sum_kws += sum(kw_labels[i] > 3) 114 | for j in range(5): 115 | if kw_ans[i][j] in kw_labels[i]: 116 | for k in range(j, 5): 117 | rec[k] += 1 118 | cnt_rec1.append(rec[0]/sum_kws) 119 | cnt_rec3.append(rec[2]/sum_kws) 120 | cnt_rec5.append(rec[4]/sum_kws) 121 | 122 | except tf.errors.OutOfRangeError: 123 | print('test_kw acc@1={:.4f}, rec@1={:.4f}, rec@3={:.4f}, rec@5={:.4f}'.format( 124 | np.mean(cnt_acc), np.mean(cnt_rec1), np.mean(cnt_rec3), np.mean(cnt_rec5))) 125 | break 126 | 127 | def forward(self, batch): 128 | matching_score = self.forward_neural(batch['context_text_ids'], batch['context_length']) 129 | kw_weight, predict_kw = tf.nn.top_k(matching_score, k=3) 130 | predict_kw = tf.reshape(predict_kw, [-1]) 131 | predict_kw = tf.map_fn(lambda x: self.kw_list[x], predict_kw, dtype=tf.int64) 132 | predict_kw = tf.reshape(predict_kw, [-1, 3]) 133 | embed_code = self.embedder(predict_kw) 134 | embed_code = tf.reduce_sum(embed_code, axis=1) 135 | embed_code = self.linear_transform(embed_code) 136 | 137 | source_embed = self.embedder(batch['source_text_ids']) 138 | target_embed = self.embedder(batch['target_text_ids']) 139 | target_embed = tf.reshape(target_embed, [-1, self.data_config._max_seq_len + 2, self.embedder.dim]) 140 | target_length = tf.reshape(batch['target_length'], [-1]) 141 | source_code = self.source_encoder( 142 | source_embed, 143 | sequence_length_minor=batch['source_length'], 144 | sequence_length_major=batch['source_utterance_cnt'])[1] # 145 | target_code = self.target_encoder( 146 | 
target_embed, 147 | sequence_length=target_length)[1] 148 | target_kwcode = self.target_kwencoder( 149 | target_embed, 150 | sequence_length=target_length)[1] 151 | target_code = tf.concat([target_code[0], target_code[1], target_kwcode[0], target_kwcode[1]], -1) 152 | target_code = tf.reshape(target_code, [-1, 20, self.config._code_len]) 153 | 154 | source_code = tf.concat([source_code, embed_code], -1) 155 | source_code = tf.expand_dims(source_code, 1) 156 | source_code = tf.tile(source_code, [1, 20, 1]) 157 | feature_code = target_code * source_code 158 | feature_code = tf.reshape(feature_code, [-1, self.config._code_len]) 159 | 160 | logits = self.linear_matcher(feature_code) 161 | logits = tf.reshape(logits, [-1, 20]) 162 | labels = tf.one_hot(batch['label'], 20) 163 | loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits)) 164 | ans = tf.arg_max(logits, -1) 165 | acc = tx.evals.accuracy(batch['label'], ans) 166 | rank = tf.nn.top_k(logits, k=20)[1] 167 | return loss, acc, rank 168 | 169 | def train(self): 170 | batch = self.iterator.get_next() 171 | kw_loss, kw_acc, _ = self.predict_keywords(batch) 172 | kw_saver = tf.train.Saver() 173 | loss, acc, _ = self.forward(batch) 174 | op_step = tf.Variable(0, name='retrieval_step') 175 | train_op = tx.core.get_train_op(loss, global_step=op_step, hparams=self.config.opt_hparams) 176 | max_val_acc = 0. 177 | with tf.Session(config=self.gpu_config) as sess: 178 | sess.run(tf.tables_initializer()) 179 | sess.run(tf.global_variables_initializer()) 180 | sess.run(tf.local_variables_initializer()) 181 | kw_saver.restore(sess, self.config._neural_save_path) 182 | saver = tf.train.Saver() 183 | for epoch_id in range(self.config._max_epoch): 184 | self.iterator.switch_to_train_data(sess) 185 | cur_step = 0 186 | cnt_acc = [] 187 | while True: 188 | try: 189 | cur_step += 1 190 | feed = {tx.global_mode(): tf.estimator.ModeKeys.TRAIN} 191 | loss, acc_ = sess.run([train_op, acc], feed_dict=feed) 192 | cnt_acc.append(acc_) 193 | if cur_step % 200 == 0: 194 | print('batch {}, loss={}, acc1={}'.format(cur_step, loss, np.mean(cnt_acc[-200:]))) 195 | except tf.errors.OutOfRangeError: 196 | break 197 | 198 | self.iterator.switch_to_val_data(sess) 199 | cnt_acc, cnt_kwacc = [], [] 200 | while True: 201 | try: 202 | feed = {tx.global_mode(): tf.estimator.ModeKeys.EVAL} 203 | acc_, kw_acc_ = sess.run([acc, kw_acc], feed_dict=feed) 204 | cnt_acc.append(acc_) 205 | cnt_kwacc.append(kw_acc_) 206 | except tf.errors.OutOfRangeError: 207 | mean_acc = np.mean(cnt_acc) 208 | print('valid acc1={}, kw_acc1={}'.format(mean_acc, np.mean(cnt_kwacc))) 209 | if mean_acc > max_val_acc: 210 | max_val_acc = mean_acc 211 | saver.save(sess, self.config._save_path) 212 | break 213 | 214 | def test(self): 215 | batch = self.iterator.get_next() 216 | loss, acc, rank = self.forward(batch) 217 | with tf.Session(config=self.gpu_config) as sess: 218 | sess.run(tf.tables_initializer()) 219 | self.saver = tf.train.Saver() 220 | self.saver.restore(sess, self.config._save_path) 221 | self.iterator.switch_to_test_data(sess) 222 | rank_cnt = [] 223 | while True: 224 | try: 225 | feed = {tx.global_mode(): tf.estimator.ModeKeys.PREDICT} 226 | ranks, labels = sess.run([rank, batch['label']], feed_dict=feed) 227 | for i in range(len(ranks)): 228 | rank_cnt.append(np.where(ranks[i]==labels[i])[0][0]) 229 | except tf.errors.OutOfRangeError: 230 | rec = [0,0,0,0,0] 231 | MRR = 0 232 | for rank in rank_cnt: 233 | for i in range(5): 234 | rec[i] += (rank <= i) 235 | 
MRR += 1 / (rank+1) 236 | print('test rec1@20={:.4f}, rec3@20={:.4f}, rec5@20={:.4f}, MRR={:.4f}'.format( 237 | rec[0]/len(rank_cnt), rec[2]/len(rank_cnt), rec[4]/len(rank_cnt), MRR/len(rank_cnt))) 238 | break 239 | 240 | 241 | def retrieve_init(self, sess): 242 | data_batch = self.iterator.get_next() 243 | loss, acc, _ = self.forward(data_batch) 244 | self.corpus = self.data_config._corpus 245 | self.corpus_data = tx.data.MonoTextData(self.data_config.corpus_hparams) 246 | corpus_iterator = tx.data.DataIterator(self.corpus_data) 247 | batch = corpus_iterator.get_next() 248 | corpus_embed = self.embedder(batch['corpus_text_ids']) 249 | utter_code = self.target_encoder(corpus_embed, sequence_length=batch['corpus_length'])[1] 250 | utter_kwcode = self.target_kwencoder(corpus_embed, sequence_length=batch['corpus_length'])[1] 251 | utter_code = tf.concat([utter_code[0], utter_code[1], utter_kwcode[0], utter_kwcode[1]], -1) 252 | self.corpus_code = np.zeros([0, self.config._code_len]) 253 | 254 | corpus_iterator.switch_to_dataset(sess) 255 | sess.run(tf.tables_initializer()) 256 | saver = tf.train.Saver() 257 | saver.restore(sess, self.config._save_path) 258 | feed = {tx.global_mode(): tf.estimator.ModeKeys.PREDICT} 259 | while True: 260 | try: 261 | utter_code_ = sess.run(utter_code, feed_dict=feed) 262 | self.corpus_code = np.concatenate([self.corpus_code, utter_code_], axis=0) 263 | except tf.errors.OutOfRangeError: 264 | break 265 | self.keywords_embed = tf.nn.l2_normalize(self.embedder(self.kw_list), axis=1) 266 | self.kw_embedding = sess.run(self.keywords_embed) 267 | 268 | # predict keyword 269 | self.context_input = tf.placeholder(dtype=object, shape=(20)) 270 | self.context_length_input = tf.placeholder(dtype=tf.int32, shape=(1)) 271 | context_ids = tf.expand_dims(self.vocab.map_tokens_to_ids(self.context_input), 0) 272 | context_embed = self.embedder(context_ids) 273 | context_code = self.context_encoder(context_embed, sequence_length=self.context_length_input)[1] 274 | matching_score = self.predict_layer(context_code) 275 | self.candi_output =tf.nn.top_k(tf.squeeze(matching_score, 0), self.data_config._keywords_num)[1] 276 | 277 | # retrieve 278 | self.minor_length_input = tf.placeholder(dtype=tf.int32, shape=(1, 9)) 279 | self.major_length_input = tf.placeholder(dtype=tf.int32, shape=(1)) 280 | self.history_input = tf.placeholder(dtype=object, shape=(9, self.data_config._max_seq_len + 2)) 281 | self.kw_input = tf.placeholder(dtype=tf.int32) 282 | history_ids = self.vocab.map_tokens_to_ids(self.history_input) 283 | history_embed = self.embedder(history_ids) 284 | history_code = self.source_encoder(tf.expand_dims(history_embed, axis=0), 285 | sequence_length_minor=self.minor_length_input, 286 | sequence_length_major=self.major_length_input)[1] 287 | self.next_kw_ids = self.kw_list[self.kw_input] 288 | embed_code = tf.expand_dims(self.embedder(self.next_kw_ids), 0) 289 | embed_code = self.linear_transform(embed_code) 290 | history_code = tf.concat([history_code, embed_code], 1) 291 | select_corpus = tf.cast(self.corpus_code, dtype=tf.float32) 292 | feature_code = self.linear_matcher(select_corpus * history_code) 293 | self.ans_output = tf.nn.top_k(tf.squeeze(feature_code,1), k=self.data_config._retrieval_candidates)[1] 294 | 295 | def retrieve(self, history_all, sess): 296 | history, seq_len, turns, context, context_len = history_all 297 | kw_candi = sess.run(self.candi_output, feed_dict={self.context_input: context, 298 | self.context_length_input: [context_len]}) 299 | for kw in 
kw_candi: 300 | tmp_score = sum(self.kw_embedding[kw] * self.kw_embedding[self.data_config._keywords_dict[self.target]]) 301 | if tmp_score > self.score: 302 | self.score = tmp_score 303 | self.next_kw = self.data_config._keywords_candi[kw] 304 | break 305 | ans = sess.run(self.ans_output, feed_dict={self.history_input: history, 306 | self.minor_length_input: [seq_len], self.major_length_input: [turns], 307 | self.kw_input: self.data_config._keywords_dict[self.next_kw]}) 308 | flag = 0 309 | reply = self.corpus[ans[0]] 310 | for i in ans: 311 | if i in self.reply_list: # avoid repeat 312 | continue 313 | for wd in kw_tokenize(self.corpus[i]): 314 | if wd in self.data_config._keywords_candi: 315 | tmp_score = sum(self.kw_embedding[self.data_config._keywords_dict[wd]] * 316 | self.kw_embedding[self.data_config._keywords_dict[self.target]]) 317 | if tmp_score > self.score: 318 | reply = self.corpus[i] 319 | self.score = tmp_score 320 | self.next_kw = wd 321 | flag = 1 322 | break 323 | if flag == 0: 324 | continue 325 | break 326 | return reply 327 | -------------------------------------------------------------------------------- /model/kernel.py: -------------------------------------------------------------------------------- 1 | import texar as tx 2 | import tensorflow as tf 3 | import numpy as np 4 | from preprocess.data_utils import kw_tokenize 5 | 6 | 7 | class Predictor(): 8 | def __init__(self, config_model, config_data, mode=None): 9 | self.config = config_model 10 | self.data_config = config_data 11 | self.gpu_config = tf.ConfigProto() 12 | self.gpu_config.gpu_options.allow_growth = True 13 | self.build_model() 14 | 15 | def build_model(self): 16 | self.train_data = tx.data.MultiAlignedData(self.data_config.data_hparams['train']) 17 | self.valid_data = tx.data.MultiAlignedData(self.data_config.data_hparams['valid']) 18 | self.test_data = tx.data.MultiAlignedData(self.data_config.data_hparams['test']) 19 | self.iterator = tx.data.TrainTestDataIterator(train=self.train_data, val=self.valid_data, test=self.test_data) 20 | self.vocab = self.train_data.vocab(0) 21 | self.embedder = tx.modules.WordEmbedder(init_value=self.train_data.embedding_init_value(0).word_vecs) 22 | self.kw_embedder = tx.modules.WordEmbedder(init_value=self.train_data.embedding_init_value(0).word_vecs) 23 | self.source_encoder = tx.modules.HierarchicalRNNEncoder(hparams=self.config.source_encoder_hparams) 24 | self.target_encoder = tx.modules.BidirectionalRNNEncoder(hparams=self.config.target_encoder_hparams) 25 | self.target_kwencoder = tx.modules.BidirectionalRNNEncoder(hparams=self.config.target_kwencoder_hparams) 26 | self.linear_transform = tx.modules.MLPTransformConnector(self.config._code_len // 2) 27 | self.linear_matcher = tx.modules.MLPTransformConnector(1) 28 | self.linear_kernel = tx.modules.MLPTransformConnector(1) 29 | self.kw_list = self.vocab.map_tokens_to_ids(tf.convert_to_tensor(self.data_config._keywords_candi)) 30 | self.kw_vocab = tx.data.Vocab(self.data_config._keywords_path) 31 | self.keywords_embed = tf.nn.l2_normalize(self.kw_embedder(self.kw_list), axis=1) 32 | 33 | def forward_kernel(self, kw_embed, context_ids): 34 | kernel_sigma = self.config._kernel_sigma 35 | mu = tf.convert_to_tensor(self.config._kernel_mu) 36 | mask = tf.cast(context_ids > 3, dtype=tf.float32) 37 | context_embed = self.kw_embedder(context_ids) 38 | context_embed = tf.nn.l2_normalize(context_embed, axis=2) 39 | similarity_matrix = tf.reduce_sum(kw_embed * context_embed, axis=2) 40 | similarity_matrix = 
tf.tile(tf.expand_dims(similarity_matrix, 2), [1, 1, len(self.config._kernel_mu)]) 41 | matching_feature = tf.exp(-(similarity_matrix - mu) ** 2 / (kernel_sigma ** 2)) 42 | matching_feature = matching_feature * tf.tile(tf.expand_dims(mask, 2), [1, 1, len(self.config._kernel_mu)]) 43 | matching_feature = tf.reduce_sum(matching_feature, axis=1) 44 | matching_score = self.linear_kernel(matching_feature) 45 | matching_score = tf.squeeze(matching_score, 1) 46 | return matching_score 47 | 48 | def predict_keywords(self, batch): 49 | keywords_ids = self.kw_vocab.map_tokens_to_ids(batch['keywords_text']) 50 | matching_score = tf.map_fn(lambda kw_embed: self.forward_kernel(kw_embed, batch['context_text_ids']), 51 | self.keywords_embed, dtype=tf.float32, parallel_iterations=True) 52 | matching_score = tf.transpose(matching_score) 53 | matching_score = tf.nn.softmax(matching_score) 54 | kw_labels = tf.map_fn(lambda x: tf.sparse_to_dense(x, [self.kw_vocab.size], 1., 0., False), 55 | keywords_ids, dtype=tf.float32, parallel_iterations=True)[:, 4:] 56 | loss = tf.reduce_sum(-tf.log(matching_score) * kw_labels) / tf.reduce_sum(kw_labels) 57 | kw_ans = tf.arg_max(matching_score, -1) 58 | acc_label = tf.map_fn(lambda x: tf.gather(x[0], x[1]), (kw_labels, kw_ans), dtype=tf.float32) 59 | acc = tf.reduce_mean(acc_label) 60 | kws = tf.nn.top_k(matching_score, k=5)[1] 61 | kws = tf.reshape(kws,[-1]) 62 | kws = tf.map_fn(lambda x: self.kw_list[x], kws, dtype=tf.int64) 63 | kws = tf.reshape(kws,[-1, 5]) 64 | return loss, acc, kws 65 | 66 | def train_keywords(self): 67 | batch = self.iterator.get_next() 68 | loss, acc, _ = self.predict_keywords(batch) 69 | op_step = tf.Variable(0, name='op_step') 70 | train_op = tx.core.get_train_op(loss, global_step=op_step, hparams=self.config.kernel_opt_hparams) 71 | max_val_acc, stopping_flag = 0, 0 72 | self.saver = tf.train.Saver() 73 | with tf.Session(config=self.gpu_config) as sess: 74 | sess.run(tf.global_variables_initializer()) 75 | sess.run(tf.local_variables_initializer()) 76 | sess.run(tf.tables_initializer()) 77 | for epoch_id in range(self.config._max_epoch): 78 | self.iterator.switch_to_train_data(sess) 79 | cur_step = 0 80 | cnt_acc = [] 81 | while True: 82 | try: 83 | cur_step += 1 84 | feed = {tx.global_mode(): tf.estimator.ModeKeys.TRAIN} 85 | loss_, acc_ = sess.run([train_op, acc], feed_dict=feed) 86 | cnt_acc.append(acc_) 87 | if cur_step % 100 == 0: 88 | print('batch {}, loss={}, acc1={}'.format(cur_step, loss_, np.mean(cnt_acc[-100:]))) 89 | except tf.errors.OutOfRangeError: 90 | break 91 | 92 | self.iterator.switch_to_val_data(sess) 93 | cnt_acc = [] 94 | while True: 95 | try: 96 | feed = {tx.global_mode(): tf.estimator.ModeKeys.EVAL} 97 | acc_ = sess.run(acc, feed_dict=feed) 98 | cnt_acc.append(acc_) 99 | except tf.errors.OutOfRangeError: 100 | mean_acc = np.mean(cnt_acc) 101 | if mean_acc > max_val_acc: 102 | max_val_acc = mean_acc 103 | self.saver.save(sess, self.config._kernel_save_path) 104 | else: 105 | stopping_flag += 1 106 | print('epoch_id {}, valid acc1={}'.format(epoch_id+1, mean_acc)) 107 | break 108 | if stopping_flag >= self.config._early_stopping: 109 | break 110 | 111 | def test_keywords(self): 112 | batch = self.iterator.get_next() 113 | loss, acc, kws = self.predict_keywords(batch) 114 | saver = tf.train.Saver() 115 | with tf.Session(config=self.gpu_config) as sess: 116 | sess.run(tf.global_variables_initializer()) 117 | sess.run(tf.local_variables_initializer()) 118 | sess.run(tf.tables_initializer()) 119 | saver.restore(sess, 
self.config._kernel_save_path) 120 | self.iterator.switch_to_test_data(sess) 121 | cnt_acc, cnt_rec1, cnt_rec3, cnt_rec5 = [], [], [], [] 122 | while True: 123 | try: 124 | feed = {tx.global_mode(): tf.estimator.ModeKeys.PREDICT} 125 | acc_, kw_ans, kw_labels = sess.run([acc, kws, batch['keywords_text_ids']], feed_dict=feed) 126 | cnt_acc.append(acc_) 127 | rec = [0,0,0,0,0] 128 | sum_kws = 0 129 | for i in range(len(kw_ans)): 130 | sum_kws += sum(kw_labels[i] > 3) 131 | for j in range(5): 132 | if kw_ans[i][j] in kw_labels[i]: 133 | for k in range(j, 5): 134 | rec[k] += 1 135 | cnt_rec1.append(rec[0]/sum_kws) 136 | cnt_rec3.append(rec[2]/sum_kws) 137 | cnt_rec5.append(rec[4]/sum_kws) 138 | 139 | except tf.errors.OutOfRangeError: 140 | print('test_kw acc@1={:.4f}, rec@1={:.4f}, rec@3={:.4f}, rec@5={:.4f}'.format( 141 | np.mean(cnt_acc), np.mean(cnt_rec1), np.mean(cnt_rec3), np.mean(cnt_rec5))) 142 | break 143 | 144 | def forward(self, batch): 145 | matching_score = tf.map_fn(lambda kw_embed: self.forward_kernel(kw_embed, batch['context_text_ids']), 146 | self.keywords_embed, dtype=tf.float32, parallel_iterations=True) 147 | matching_score = tf.transpose(matching_score) 148 | 149 | kw_weight, predict_kw = tf.nn.top_k(matching_score, k=3) 150 | predict_kw = tf.reshape(predict_kw,[-1]) 151 | predict_kw = tf.map_fn(lambda x: self.kw_list[x], predict_kw, dtype=tf.int64) 152 | predict_kw = tf.reshape(predict_kw,[-1,3]) 153 | embed_code = self.embedder(predict_kw) 154 | embed_code = tf.reduce_sum(embed_code, axis=1) 155 | embed_code = self.linear_transform(embed_code) 156 | 157 | source_embed = self.embedder(batch['source_text_ids']) 158 | target_embed = self.embedder(batch['target_text_ids']) # bs * 20 * 32 * 200 159 | target_embed = tf.reshape(target_embed,[-1, self.data_config._max_seq_len+2, self.embedder.dim]) # (bs * 20) * 32 * 200 160 | target_length = tf.reshape(batch['target_length'],[-1]) # (bs * 20) * 32 * 200 161 | source_code = self.source_encoder( 162 | source_embed, 163 | sequence_length_minor=batch['source_length'], 164 | sequence_length_major=batch['source_utterance_cnt'])[1] 165 | target_code = self.target_encoder( 166 | target_embed, 167 | sequence_length=target_length)[1] 168 | target_kwcode = self.target_kwencoder( 169 | target_embed, 170 | sequence_length=target_length)[1] 171 | target_code = tf.concat([target_code[0], target_code[1], target_kwcode[0], target_kwcode[1]], -1) 172 | target_code = tf.reshape(target_code, [-1,20,self.config._code_len]) 173 | 174 | source_code = tf.concat([source_code,embed_code], -1) 175 | source_code = tf.expand_dims(source_code, 1) 176 | source_code = tf.tile(source_code, [1,20,1]) 177 | feature_code = target_code * source_code 178 | feature_code = tf.reshape(feature_code,[-1,self.config._code_len]) 179 | logits = self.linear_matcher(feature_code) 180 | logits = tf.reshape(logits,[-1,20]) 181 | labels = tf.one_hot(batch['label'], 20) 182 | loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits)) 183 | ans = tf.arg_max(logits, -1) 184 | acc = tx.evals.accuracy(batch['label'], ans) 185 | rank = tf.nn.top_k(logits, k=20)[1] 186 | return loss, acc, rank 187 | 188 | def train(self): 189 | batch = self.iterator.get_next() 190 | loss_t, acc_t, _ = self.predict_keywords(batch) 191 | kw_saver = tf.train.Saver() 192 | loss, acc, _ = self.forward(batch) 193 | retrieval_step = tf.Variable(0, name='retrieval_step') 194 | train_op = tx.core.get_train_op(loss, global_step=retrieval_step, hparams=self.config.opt_hparams) 
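# Two-stage training: kw_saver first restores the pretrained kernel keyword
# predictor from self.config._kernel_save_path, then the retrieval matcher is
# trained on top of it; training stops early once validation acc@1 fails to
# improve for self.config._early_stopping consecutive epochs.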
195 | max_val_acc, stopping_flag = 0, 0 196 | with tf.Session(config=self.gpu_config) as sess: 197 | sess.run(tf.tables_initializer()) 198 | sess.run(tf.global_variables_initializer()) 199 | sess.run(tf.local_variables_initializer()) 200 | kw_saver.restore(sess, self.config._kernel_save_path) 201 | saver = tf.train.Saver() 202 | for epoch_id in range(self.config._max_epoch): 203 | self.iterator.switch_to_train_data(sess) 204 | cur_step = 0 205 | cnt_acc, cnt_kwacc = [],[] 206 | while True: 207 | try: 208 | cur_step += 1 209 | feed = {tx.global_mode(): tf.estimator.ModeKeys.TRAIN} 210 | loss, acc_, acc_kw = sess.run([train_op, acc, acc_t], feed_dict=feed) 211 | cnt_acc.append(acc_) 212 | cnt_kwacc.append(acc_kw) 213 | if cur_step % 200 == 0: 214 | print('batch {}, loss={}, acc1={}, kw_acc1={}'.format(cur_step, loss, 215 | np.mean(cnt_acc[-200:]) ,np.mean(cnt_kwacc[-200:]))) 216 | except tf.errors.OutOfRangeError: 217 | break 218 | self.iterator.switch_to_val_data(sess) 219 | cnt_acc, cnt_kwacc = [],[] 220 | while True: 221 | try: 222 | feed = {tx.global_mode(): tf.estimator.ModeKeys.EVAL} 223 | acc_, acc_kw = sess.run([acc, acc_t], feed_dict=feed) 224 | cnt_acc.append(acc_) 225 | cnt_kwacc.append(acc_kw) 226 | except tf.errors.OutOfRangeError: 227 | mean_acc = np.mean(cnt_acc) 228 | print('valid acc1={}, kw_acc1={}'.format(mean_acc, np.mean(cnt_kwacc))) 229 | if mean_acc > max_val_acc: 230 | max_val_acc = mean_acc 231 | saver.save(sess, self.config._save_path) 232 | else: 233 | stopping_flag += 1 234 | break 235 | if stopping_flag >= self.config._early_stopping: 236 | break 237 | 238 | def test(self): 239 | batch = self.iterator.get_next() 240 | loss, acc, rank = self.forward(batch) 241 | with tf.Session(config=self.gpu_config) as sess: 242 | sess.run(tf.tables_initializer()) 243 | self.saver = tf.train.Saver() 244 | self.saver.restore(sess, self.config._save_path) 245 | self.iterator.switch_to_test_data(sess) 246 | rank_cnt = [] 247 | while True: 248 | try: 249 | feed = {tx.global_mode(): tf.estimator.ModeKeys.PREDICT} 250 | ranks, labels = sess.run([rank, batch['label']], feed_dict=feed) 251 | for i in range(len(ranks)): 252 | rank_cnt.append(np.where(ranks[i]==labels[i])[0][0]) 253 | except tf.errors.OutOfRangeError: 254 | rec = [0,0,0,0,0] 255 | MRR = 0 256 | for rank in rank_cnt: 257 | for i in range(5): 258 | rec[i] += (rank <= i) 259 | MRR += 1 / (rank+1) 260 | print('test rec1@20={:.4f}, rec3@20={:.4f}, rec5@20={:.4f}, MRR={:.4f}'.format( 261 | rec[0]/len(rank_cnt), rec[2]/len(rank_cnt), rec[4]/len(rank_cnt), MRR/len(rank_cnt))) 262 | break 263 | 264 | def retrieve_init(self, sess): 265 | data_batch = self.iterator.get_next() 266 | loss, acc, _ = self.forward(data_batch) 267 | self.corpus = self.data_config._corpus 268 | self.corpus_data = tx.data.MonoTextData(self.data_config.corpus_hparams) 269 | corpus_iterator = tx.data.DataIterator(self.corpus_data) 270 | batch = corpus_iterator.get_next() 271 | corpus_embed = self.embedder(batch['corpus_text_ids']) 272 | utter_code = self.target_encoder(corpus_embed, sequence_length=batch['corpus_length'])[1] 273 | utter_kwcode = self.target_kwencoder(corpus_embed, sequence_length=batch['corpus_length'])[1] 274 | utter_code = tf.concat([utter_code[0], utter_code[1], utter_kwcode[0], utter_kwcode[1]], -1) 275 | self.corpus_code = np.zeros([0, self.config._code_len]) 276 | corpus_iterator.switch_to_dataset(sess) 277 | sess.run(tf.tables_initializer()) 278 | saver = tf.train.Saver() 279 | saver.restore(sess, self.config._save_path) 280 | feed = 
{tx.global_mode(): tf.estimator.ModeKeys.PREDICT} 281 | while True: 282 | try: 283 | utter_code_ = sess.run(utter_code, feed_dict=feed) 284 | self.corpus_code = np.concatenate([self.corpus_code, utter_code_], axis=0) 285 | except tf.errors.OutOfRangeError: 286 | break 287 | self.kw_embedding = sess.run(self.keywords_embed) 288 | 289 | # predict keyword 290 | self.context_input = tf.placeholder(dtype=object) 291 | context_ids = tf.expand_dims(self.vocab.map_tokens_to_ids(self.context_input), 0) 292 | matching_score = tf.map_fn(lambda kw_embed: self.forward_kernel(kw_embed, context_ids), 293 | self.keywords_embed, dtype=tf.float32, parallel_iterations=True) 294 | self.candi_output = tf.nn.top_k(tf.squeeze(matching_score, 1), self.data_config._keywords_num)[1] 295 | 296 | # retrieve 297 | self.minor_length_input = tf.placeholder(dtype=tf.int32, shape=(1, 9)) 298 | self.major_length_input = tf.placeholder(dtype=tf.int32, shape=(1)) 299 | self.history_input = tf.placeholder(dtype=object, shape=(9, self.data_config._max_seq_len + 2)) 300 | self.kw_input = tf.placeholder(dtype=tf.int32) 301 | history_ids = self.vocab.map_tokens_to_ids(self.history_input) 302 | history_embed = self.embedder(history_ids) 303 | history_code = self.source_encoder(tf.expand_dims(history_embed, axis=0), 304 | sequence_length_minor=self.minor_length_input, 305 | sequence_length_major=self.major_length_input)[1] 306 | self.next_kw_ids = self.kw_list[self.kw_input] 307 | embed_code = tf.expand_dims(self.embedder(self.next_kw_ids), 0) 308 | embed_code = self.linear_transform(embed_code) 309 | history_code = tf.concat([history_code, embed_code], 1) 310 | select_corpus = tf.cast(self.corpus_code, dtype=tf.float32) 311 | feature_code = self.linear_matcher(select_corpus * history_code) 312 | self.ans_output = tf.nn.top_k(tf.squeeze(feature_code,1), k=self.data_config._retrieval_candidates)[1] 313 | 314 | def retrieve(self, history_all, sess): 315 | history, seq_len, turns, context, context_len = history_all 316 | kw_candi = sess.run(self.candi_output, feed_dict={self.context_input: context[:context_len]}) 317 | for kw in kw_candi: 318 | tmp_score = sum(self.kw_embedding[kw] * self.kw_embedding[self.data_config._keywords_dict[self.target]]) 319 | if tmp_score > self.score: 320 | self.score = tmp_score 321 | self.next_kw = self.data_config._keywords_candi[kw] 322 | break 323 | ans = sess.run(self.ans_output, feed_dict={self.history_input: history, 324 | self.minor_length_input: [seq_len], self.major_length_input: [turns], 325 | self.kw_input: self.data_config._keywords_dict[self.next_kw]}) 326 | flag = 0 327 | reply = self.corpus[ans[0]] 328 | for i in ans: 329 | if i in self.reply_list: # avoid repeat 330 | continue 331 | for wd in kw_tokenize(self.corpus[i]): 332 | if wd in self.data_config._keywords_candi: 333 | tmp_score = sum(self.kw_embedding[self.data_config._keywords_dict[wd]] * 334 | self.kw_embedding[self.data_config._keywords_dict[self.target]]) 335 | if tmp_score > self.score: 336 | reply = self.corpus[i] 337 | self.score = tmp_score 338 | self.next_kw = wd 339 | flag = 1 340 | break 341 | if flag == 0: 342 | continue 343 | break 344 | return reply 345 | -------------------------------------------------------------------------------- /preprocess/convai2/candi_keyword.txt: -------------------------------------------------------------------------------- 1 | favorite 2 | sound 3 | play 4 | dog 5 | music 6 | kid 7 | eat 8 | school 9 | enjoy 10 | job 11 | watch 12 | read 13 | food 14 | cat 15 | friend 16 | family 17 | 
hobby 18 | people 19 | pet 20 | car 21 | game 22 | hear 23 | movie 24 | travel 25 | book 26 | cook 27 | listen 28 | animal 29 | life 30 | color 31 | drive 32 | college 33 | hope 34 | living 35 | parent 36 | teach 37 | bad 38 | sport 39 | hard 40 | dad 41 | feel 42 | child 43 | hair 44 | band 45 | country 46 | money 47 | pizza 48 | marry 49 | bet 50 | hate 51 | walk 52 | stay 53 | study 54 | write 55 | husband 56 | fish 57 | start 58 | guess 59 | brother 60 | blue 61 | night 62 | dance 63 | busy 64 | red 65 | wife 66 | learn 67 | talk 68 | sing 69 | meet 70 | beach 71 | spend 72 | drink 73 | rock 74 | city 75 | video 76 | grow 77 | person 78 | house 79 | tv 80 | true 81 | shop 82 | teacher 83 | buy 84 | girl 85 | weekend 86 | prefer 87 | hike 88 | meat 89 | football 90 | free 91 | visit 92 | sister 93 | care 94 | plan 95 | art 96 | swim 97 | sweet 98 | paint 99 | stuff 100 | pretty 101 | vegan 102 | store 103 | class 104 | farm 105 | type 106 | funny 107 | understand 108 | son 109 | business 110 | happy 111 | mother 112 | ride 113 | coffee 114 | leave 115 | fan 116 | garden 117 | week 118 | boy 119 | bake 120 | green 121 | pay 122 | beautiful 123 | draw 124 | student 125 | horse 126 | sell 127 | sad 128 | summer 129 | wear 130 | truck 131 | black 132 | hot 133 | wait 134 | yea 135 | sleep 136 | cold 137 | agree 138 | single 139 | guy 140 | sibling 141 | real 142 | crazy 143 | healthy 144 | fine 145 | tall 146 | guitar 147 | lose 148 | cute 149 | hour 150 | cake 151 | purple 152 | month 153 | relax 154 | finish 155 | idea 156 | break 157 | company 158 | daughter 159 | happen 160 | dream 161 | team 162 | mind 163 | park 164 | woman 165 | reading 166 | restaurant 167 | bike 168 | short 169 | italian 170 | exercise 171 | hang 172 | chocolate 173 | wonderful 174 | basketball 175 | speak 176 | soccer 177 | weather 178 | graduate 179 | retire 180 | winter 181 | morning 182 | nurse 183 | bring 184 | die 185 | easy 186 | party 187 | grade 188 | father 189 | ready 190 | ice 191 | win 192 | spare 193 | doctor 194 | online 195 | song 196 | local 197 | degree 198 | chicken 199 | concert 200 | florida 201 | glad 202 | fall 203 | baseball 204 | eye 205 | office 206 | volunteer 207 | water 208 | girlfriend 209 | bear 210 | french 211 | shopping 212 | age 213 | luck 214 | fishing 215 | artist 216 | weird 217 | foot 218 | save 219 | surf 220 | dinner 221 | yoga 222 | lucky 223 | story 224 | hunt 225 | cooking 226 | pink 227 | suck 228 | cream 229 | boyfriend 230 | language 231 | beer 232 | outdoors 233 | change 234 | diet 235 | passion 236 | york 237 | major 238 | cheese 239 | collect 240 | imagine 241 | super 242 | english 243 | rap 244 | practice 245 | chat 246 | head 247 | california 248 | train 249 | lake 250 | clothes 251 | pass 252 | nature 253 | baby 254 | fry 255 | season 256 | apple 257 | piano 258 | sit 259 | hop 260 | scary 261 | taco 262 | law 263 | steak 264 | hockey 265 | comic 266 | brown 267 | bar 268 | gym 269 | smart 270 | huge 271 | clean 272 | fav 273 | close 274 | jazz 275 | vegetarian 276 | canada 277 | science 278 | career 279 | picture 280 | tea 281 | excite 282 | tough 283 | deal 284 | pick 285 | allergic 286 | neat 287 | church 288 | social 289 | sick 290 | shoe 291 | trip 292 | fly 293 | vacation 294 | catch 295 | bed 296 | raise 297 | boring 298 | rid 299 | ocean 300 | club 301 | town 302 | instrument 303 | check 304 | race 305 | dragon 306 | fast 307 | vegetable 308 | wrong 309 | boat 310 | terrible 311 | tennis 312 | candy 313 | rain 314 | worry 315 | veggie 316 | fruit 317 | 
metal 318 | perfect 319 | twin 320 | tattoo 321 | mountain 322 | tomorrow 323 | god 324 | stand 325 | hospital 326 | apartment 327 | build 328 | stick 329 | design 330 | japan 331 | texas 332 | meal 333 | grocery 334 | camp 335 | hit 336 | orange 337 | mexican 338 | allergy 339 | ta 340 | phone 341 | famous 342 | player 343 | flower 344 | bore 345 | army 346 | star 347 | share 348 | pool 349 | delicious 350 | relationship 351 | exciting 352 | math 353 | egg 354 | museum 355 | classic 356 | dress 357 | sushi 358 | taste 359 | married 360 | amaze 361 | classical 362 | lady 363 | shelter 364 | sense 365 | joke 366 | pop 367 | pie 368 | yellow 369 | american 370 | lawyer 371 | expensive 372 | stress 373 | hand 374 | cut 375 | pasta 376 | remember 377 | professional 378 | choice 379 | impressive 380 | youtube 381 | kinda 382 | yum 383 | chicago 384 | birthday 385 | cooky 386 | sunday 387 | follow 388 | divorce 389 | gon 390 | moment 391 | fresh 392 | tire 393 | hurt 394 | wedding 395 | fit 396 | weight 397 | health 398 | plant 399 | count 400 | chef 401 | heard 402 | ball 403 | scar 404 | sort 405 | smell 406 | dead 407 | special 408 | comedy 409 | couple 410 | rich 411 | hiking 412 | fave 413 | create 414 | accountant 415 | bird 416 | relaxing 417 | history 418 | blonde 419 | film 420 | everyday 421 | glass 422 | heart 423 | voice 424 | ballet 425 | vet 426 | military 427 | horror 428 | field 429 | fight 430 | mile 431 | salad 432 | extra 433 | afraid 434 | market 435 | christmas 436 | reason 437 | mexico 438 | attend 439 | cop 440 | chance 441 | potato 442 | snow 443 | halloween 444 | folk 445 | snake 446 | ski 447 | character 448 | card 449 | adopt 450 | figure 451 | tho 452 | hat 453 | nail 454 | ford 455 | fat 456 | warm 457 | swimming 458 | difficult 459 | sew 460 | suppose 461 | dye 462 | safe 463 | wine 464 | tend 465 | style 466 | afford 467 | recipe 468 | writer 469 | shower 470 | lunch 471 | remind 472 | painting 473 | news 474 | congrats 475 | photography 476 | rough 477 | road 478 | holiday 479 | join 480 | beat 481 | white 482 | throne 483 | michigan 484 | awful 485 | actor 486 | breakfast 487 | library 488 | congratulation 489 | goal 490 | shrimp 491 | singer 492 | middle 493 | horrible 494 | common 495 | plane 496 | profession 497 | bacon 498 | smoke 499 | coast 500 | kill 501 | future 502 | word 503 | female 504 | south 505 | craft 506 | neighbor 507 | violin 508 | fair 509 | paris 510 | fashion 511 | quiet 512 | service 513 | bank 514 | workout 515 | dish 516 | shape 517 | tour 518 | theater 519 | genre 520 | forget 521 | fiction 522 | makeup 523 | model 524 | clown 525 | author 526 | experience 527 | rest 528 | librarian 529 | king 530 | puppy 531 | german 532 | tree 533 | trust 534 | street 535 | yummy 536 | marketing 537 | quit 538 | hungry 539 | creative 540 | poor 541 | cali 542 | wild 543 | alright 544 | accident 545 | fantasy 546 | cartoon 547 | air 548 | pepper 549 | hurricane 550 | ton 551 | lab 552 | anime 553 | university 554 | drop 555 | factory 556 | iphone 557 | mystery 558 | active 559 | netflix 560 | stressful 561 | space 562 | trouble 563 | secret 564 | mcdonalds 565 | laugh 566 | gun 567 | support 568 | spanish 569 | pain 570 | casino 571 | france 572 | set 573 | spaghetti 574 | sale 575 | magic 576 | grill 577 | burger 578 | climb 579 | internet 580 | pig 581 | hold 582 | decide 583 | mustang 584 | program 585 | drum 586 | hmm 587 | focus 588 | seafood 589 | master 590 | medium 591 | strange 592 | lesson 593 | arm 594 | war 595 | serve 596 | van 597 | north 
598 | cow 599 | toe 600 | driver 601 | paper 602 | education 603 | table 604 | station 605 | dancer 606 | soda 607 | playing 608 | medical 609 | dark 610 | blog 611 | usa 612 | power 613 | wan 614 | bob 615 | simple 616 | zoo 617 | shoot 618 | fancy 619 | mechanic 620 | collection 621 | national 622 | motorcycle 623 | strong 624 | parrot 625 | nursing 626 | america 627 | harry 628 | blood 629 | ahahah 630 | body 631 | bible 632 | police 633 | energy 634 | alabama 635 | lazy 636 | death 637 | rose 638 | musician 639 | italy 640 | adventure 641 | worker 642 | writing 643 | pair 644 | activity 645 | bread 646 | throw 647 | issue 648 | europe 649 | london 650 | gross 651 | jealous 652 | male 653 | height 654 | disney 655 | treat 656 | marathon 657 | door 658 | bean 659 | bos 660 | boot 661 | aww 662 | cancer 663 | lover 664 | milk 665 | skill 666 | plenty 667 | beard 668 | light 669 | poetry 670 | subject 671 | sea 672 | romantic 673 | relate 674 | spending 675 | firm 676 | send 677 | seattle 678 | couch 679 | question 680 | flavor 681 | photo 682 | channel 683 | favourite 684 | match 685 | contact 686 | choose 687 | japanese 688 | singing 689 | lift 690 | boston 691 | wood 692 | grandmother 693 | farmer 694 | radio 695 | navy 696 | roommate 697 | bus 698 | politics 699 | grandchild 700 | jam 701 | chinese 702 | viking 703 | matter 704 | series 705 | competition 706 | designer 707 | gas 708 | san 709 | feeling 710 | knit 711 | lizard 712 | slow 713 | publish 714 | fantastic 715 | crochet 716 | donate 717 | add 718 | television 719 | insurance 720 | talent 721 | bowl 722 | human 723 | land 724 | rise 725 | hmmm 726 | test 727 | alive 728 | involve 729 | dangerous 730 | pro 731 | yesterday 732 | wind 733 | obsess 734 | golden 735 | stone 736 | post 737 | west 738 | nephew 739 | depend 740 | nervous 741 | breed 742 | bakery 743 | hip 744 | goodness 745 | baker 746 | surgery 747 | mall 748 | don 749 | honda 750 | colorado 751 | dancing 752 | dr 753 | dude 754 | lifestyle 755 | calm 756 | wake 757 | crime 758 | athlete 759 | skin 760 | beauty 761 | lie 762 | facebook 763 | mar 764 | reader 765 | lead 766 | mess 767 | snack 768 | shift 769 | employ 770 | spring 771 | size 772 | handle 773 | shy 774 | iron 775 | corvette 776 | evening 777 | superman 778 | board 779 | cheer 780 | traffic 781 | grand 782 | sun 783 | fee 784 | musical 785 | base 786 | public 787 | chip 788 | mad 789 | careful 790 | pound 791 | brand 792 | potter 793 | punk 794 | smile 795 | advice 796 | justin 797 | manager 798 | bass 799 | peaceful 800 | golf 801 | clothing 802 | john 803 | action 804 | buddy 805 | sunny 806 | box 807 | waitress 808 | gardening 809 | popular 810 | wing 811 | wall 812 | personal 813 | coach 814 | cover 815 | elementary 816 | mix 817 | pray 818 | odd 819 | stink 820 | minute 821 | key 822 | inspire 823 | event 824 | adorable 825 | community 826 | juice 827 | engineer 828 | karate 829 | skate 830 | saturday 831 | literature 832 | reward 833 | baking 834 | ghost 835 | ohio 836 | vega 837 | kitchen 838 | sugar 839 | chill 840 | tomato 841 | chase 842 | chevy 843 | roll 844 | la 845 | organize 846 | blame 847 | homework 848 | bee 849 | grandma 850 | beatles 851 | passionate 852 | soul 853 | knee 854 | anxiety 855 | nut 856 | spot 857 | ticket 858 | grandparent 859 | honest 860 | google 861 | junk 862 | product 863 | tech 864 | track 865 | normal 866 | role 867 | client 868 | veterinarian 869 | spicy 870 | building 871 | scare 872 | hawaii 873 | bummer 874 | reality 875 | iguana 876 | niece 877 | opera 878 | 
charlie 879 | worth 880 | pack 881 | trail 882 | senior 883 | toyota 884 | beet 885 | ireland 886 | finance 887 | chili 888 | officer 889 | pickle 890 | kinds 891 | manage 892 | tired 893 | christian 894 | daily 895 | nose 896 | cheap 897 | spider 898 | gig 899 | indian 900 | locate 901 | pleasure 902 | league 903 | hunting 904 | retired 905 | george 906 | tuna 907 | teaching 908 | attention 909 | step 910 | awe 911 | stock 912 | list 913 | portland 914 | sign 915 | avoid 916 | bug 917 | brain 918 | scientist 919 | dessert 920 | excellent 921 | finger 922 | religious 923 | superhero 924 | highschool 925 | kick 926 | rule 927 | salesman 928 | drawing 929 | handful 930 | pilot 931 | escape 932 | schedule 933 | tasty 934 | gosh 935 | burrito 936 | kansa 937 | commercial 938 | rescue 939 | mary 940 | fi 941 | retail 942 | retriever 943 | skydive 944 | husky 945 | gamble 946 | georgia 947 | industry 948 | meeting 949 | nyc 950 | walmart 951 | annoy 952 | cruise 953 | doubt 954 | forest 955 | bunch 956 | cashier 957 | actress 958 | kayak 959 | partner 960 | mac 961 | tiny 962 | friday 963 | spell 964 | culture 965 | kindergarten 966 | lay 967 | tiger 968 | deaf 969 | mention 970 | drinking 971 | accounting 972 | cup 973 | subway 974 | pancake 975 | habit 976 | prius 977 | blind 978 | familiar 979 | starbucks 980 | assistant 981 | pumpkin 982 | burn 983 | answer 984 | poodle 985 | metallica 986 | circus 987 | jewelry 988 | sock 989 | recommend 990 | sandwich 991 | journalist 992 | fear 993 | pretend 994 | grandson 995 | nfl 996 | brownie 997 | cupcake 998 | doll 999 | stephen 1000 | bother 1001 | desert 1002 | sci 1003 | moon 1004 | construction 1005 | reach 1006 | technology 1007 | option 1008 | marriage 1009 | pony 1010 | hell 1011 | romance 1012 | cuisine 1013 | strawberry 1014 | shirt 1015 | bbq 1016 | purse 1017 | leg 1018 | kitten 1019 | unemployed 1020 | project 1021 | running 1022 | deliver 1023 | natural 1024 | dang 1025 | control 1026 | hero 1027 | beef 1028 | trade 1029 | sunset 1030 | carrot 1031 | main 1032 | chess 1033 | grandkids 1034 | talented 1035 | explore 1036 | nasty 1037 | east 1038 | hamburger 1039 | private 1040 | yup 1041 | guard 1042 | choir 1043 | australia 1044 | pen 1045 | stamp 1046 | struggle 1047 | dive 1048 | bro 1049 | perry 1050 | omg 1051 | thinking 1052 | candle 1053 | fell 1054 | ink 1055 | wheel 1056 | ranch 1057 | owner 1058 | carry 1059 | biology 1060 | earn 1061 | regular 1062 | lion 1063 | peanut 1064 | broccoli 1065 | suit 1066 | medicine 1067 | speaking 1068 | chew 1069 | inspiration 1070 | sauce 1071 | hotel 1072 | camera 1073 | england 1074 | repair 1075 | turtle 1076 | athletic 1077 | unique 1078 | ray 1079 | sky 1080 | skateboard 1081 | river 1082 | tune 1083 | freak 1084 | estate 1085 | cheesecake 1086 | yuck 1087 | surfing 1088 | rent 1089 | ooh 1090 | grey 1091 | memory 1092 | perform 1093 | popcorn 1094 | dislike 1095 | childhood 1096 | adore 1097 | quick 1098 | comedian 1099 | sweater 1100 | antique 1101 | lottery 1102 | hows 1103 | mcdonald 1104 | jump 1105 | tutor 1106 | carolina 1107 | walking 1108 | pregnant 1109 | mike 1110 | vampire 1111 | decorate 1112 | bieber 1113 | alcohol 1114 | compete 1115 | mansion 1116 | owl 1117 | gotcha 1118 | jack 1119 | engineering 1120 | retirement 1121 | pot 1122 | airplane 1123 | ferrari 1124 | dry 1125 | dentist 1126 | russian 1127 | piece 1128 | security 1129 | spirit 1130 | offer 1131 | dorm 1132 | record 1133 | settle 1134 | lobster 1135 | foreign 1136 | software 1137 | honey 1138 | rice 1139 | 
princess 1140 | excited 1141 | amazon 1142 | baltimore 1143 | island 1144 | skiing 1145 | center 1146 | alien 1147 | butter 1148 | corn 1149 | civic 1150 | jane 1151 | view 1152 | nap 1153 | pit 1154 | bulldog 1155 | lovely 1156 | prince 1157 | loud 1158 | photograph 1159 | gift 1160 | coke 1161 | org 1162 | belt 1163 | festival 1164 | ugh 1165 | ahh 1166 | vintage 1167 | pug 1168 | birth 1169 | understandable 1170 | sweetheart 1171 | irma 1172 | deep 1173 | india 1174 | feed 1175 | ring 1176 | item 1177 | scene 1178 | spear 1179 | mushroom 1180 | position 1181 | afternoon 1182 | hire 1183 | trainer 1184 | distract 1185 | touch 1186 | unfortunate 1187 | alaska 1188 | expect 1189 | katy 1190 | scratch 1191 | cost 1192 | situation 1193 | gay 1194 | loss 1195 | organic 1196 | washington 1197 | joy 1198 | gummy 1199 | survive 1200 | med 1201 | bless 1202 | prepare 1203 | charity 1204 | sight 1205 | rare 1206 | heavy 1207 | rural 1208 | russia 1209 | newspaper 1210 | karaoke 1211 | driving 1212 | customer 1213 | watching 1214 | require 1215 | graphic 1216 | mood 1217 | maine 1218 | freelance 1219 | fitness 1220 | diner 1221 | pepsi 1222 | condo 1223 | miami 1224 | cross 1225 | runner 1226 | accept 1227 | panda 1228 | bunny 1229 | engage 1230 | uncle 1231 | commute 1232 | indie 1233 | cooler 1234 | straight 1235 | hollywood 1236 | bagel 1237 | terrify 1238 | training 1239 | boxer 1240 | left 1241 | protein 1242 | bull 1243 | jog 1244 | tom 1245 | shame 1246 | ouch 1247 | current 1248 | programmer 1249 | nerd 1250 | magazine 1251 | artistic 1252 | yikes 1253 | eating 1254 | skittle 1255 | furniture 1256 | eagle 1257 | form 1258 | fabulous 1259 | legal 1260 | agency 1261 | internship 1262 | cabin 1263 | drama 1264 | positive 1265 | addict 1266 | surprise 1267 | rewarding 1268 | tax 1269 | barista 1270 | fake 1271 | spain 1272 | crash 1273 | random 1274 | kale 1275 | bright 1276 | shark 1277 | studio 1278 | bow 1279 | boys 1280 | bell 1281 | brace 1282 | trick 1283 | wheelchair 1284 | cloud 1285 | southern 1286 | force 1287 | chair 1288 | spouse 1289 | thumb 1290 | frank 1291 | rapper 1292 | virginia 1293 | physical 1294 | bye 1295 | grad 1296 | soup 1297 | fiance 1298 | elvis 1299 | meatloaf 1300 | queen 1301 | united 1302 | income 1303 | salon 1304 | volleyball 1305 | target 1306 | chihuahua 1307 | limit 1308 | scholarship 1309 | direction 1310 | supply 1311 | canadian 1312 | daddy 1313 | toy 1314 | kentucky 1315 | respect 1316 | earth 1317 | political 1318 | binge 1319 | hilarious 1320 | blast 1321 | unhealthy 1322 | pant 1323 | cheeseburger 1324 | complete 1325 | unicorn 1326 | religion 1327 | dairy 1328 | drug 1329 | fl 1330 | whiskey 1331 | reject 1332 | iced 1333 | average 1334 | eater 1335 | dirty 1336 | rat 1337 | angry 1338 | heck 1339 | hide 1340 | competitive 1341 | gum 1342 | website 1343 | laptop 1344 | exhaust 1345 | robot 1346 | challenge 1347 | outdoor 1348 | raw 1349 | audition 1350 | cafe 1351 | onion 1352 | assume 1353 | opinion 1354 | horseback 1355 | zebra 1356 | philosophy 1357 | psychology 1358 | exact 1359 | spice 1360 | debt 1361 | reside 1362 | heat 1363 | hobbies 1364 | leather 1365 | rude 1366 | china 1367 | storm 1368 | grown 1369 | invite 1370 | instructor 1371 | steal 1372 | curly 1373 | wash 1374 | worm 1375 | bf 1376 | credit 1377 | greek 1378 | oregon 1379 | shot 1380 | riding 1381 | pride 1382 | speed 1383 | floor 1384 | intense 1385 | court 1386 | dental 1387 | bone 1388 | friendly 1389 | diving 1390 | lame 1391 | kitty 1392 | alternative 1393 | hubby 1394 | 
strict 1395 | ham 1396 | camping 1397 | pond 1398 | marine 1399 | grandpa 1400 | secretary 1401 | theme 1402 | judge 1403 | shade 1404 | complain 1405 | herb 1406 | advertising 1407 | celebrity 1408 | fascinate 1409 | lasagna 1410 | environment 1411 | painter 1412 | comfortable 1413 | beagle 1414 | recycle 1415 | bruno 1416 | invest 1417 | search 1418 | society 1419 | halo 1420 | pursue 1421 | hm 1422 | sam 1423 | connect 1424 | angeles 1425 | weed 1426 | grab 1427 | kiss 1428 | bald 1429 | em 1430 | britney 1431 | oil 1432 | conversation 1433 | mmm 1434 | yard 1435 | cash 1436 | jean 1437 | ship 1438 | exotic 1439 | vanilla 1440 | waste 1441 | photographer 1442 | adult 1443 | rolling 1444 | dig 1445 | cookie 1446 | tennessee 1447 | balance 1448 | cleaning 1449 | ocd 1450 | eggplant 1451 | archery 1452 | ma 1453 | meatball 1454 | dust 1455 | deer 1456 | gluten 1457 | agent 1458 | gaming 1459 | celebrate 1460 | helpful 1461 | tofu 1462 | campus 1463 | lord 1464 | evil 1465 | er 1466 | los 1467 | shake 1468 | martial 1469 | drummer 1470 | outfit 1471 | grass 1472 | wrestle 1473 | note 1474 | convertible 1475 | biking 1476 | taller 1477 | gorgeous 1478 | file 1479 | hectic 1480 | salsa 1481 | rush 1482 | awhile 1483 | distance 1484 | soft 1485 | homeless 1486 | daycare 1487 | process 1488 | patient 1489 | houston 1490 | crowd 1491 | stew 1492 | duty 1493 | bookstore 1494 | tie 1495 | neighborhood 1496 | professor 1497 | orleans 1498 | electric 1499 | original 1500 | opportunity 1501 | francisco 1502 | angel 1503 | fund 1504 | site 1505 | peace 1506 | picky 1507 | wisconsin 1508 | madonna 1509 | adam 1510 | iceland 1511 | blow 1512 | uh 1513 | total 1514 | urban 1515 | coincidence 1516 | entire 1517 | jello 1518 | compare 1519 | dollar 1520 | headache 1521 | blond 1522 | guilty 1523 | cure 1524 | electronic 1525 | grader 1526 | greece 1527 | correct 1528 | twilight 1529 | enjoyable 1530 | benefit 1531 | damn 1532 | coupon 1533 | cheat 1534 | thrift 1535 | print 1536 | scout 1537 | successful 1538 | bartender 1539 | stranger 1540 | cattle 1541 | mommy 1542 | bath 1543 | nascar 1544 | honor 1545 | suggestion 1546 | appalachian 1547 | ginger 1548 | hook 1549 | aquarium 1550 | james 1551 | scared 1552 | gamer 1553 | collie 1554 | creepy 1555 | upstate 1556 | therapist 1557 | bicycle 1558 | trophy 1559 | department 1560 | cheerleader 1561 | variety 1562 | suburb 1563 | explain 1564 | cuddle 1565 | flip 1566 | shepherd 1567 | tupac 1568 | whoa 1569 | iq 1570 | lifeguard 1571 | sunshine 1572 | jersey 1573 | rainy 1574 | vehicle 1575 | closet 1576 | rainbow 1577 | ross 1578 | specialty 1579 | attorney 1580 | interior 1581 | gf 1582 | crab 1583 | podcasts 1584 | fart 1585 | decent 1586 | insane 1587 | shepard 1588 | gossip 1589 | jimmy 1590 | blackjack 1591 | austin 1592 | nike 1593 | louisiana 1594 | brat 1595 | therapy 1596 | noble 1597 | skunk 1598 | listening 1599 | awww 1600 | pleasant 1601 | pittsburgh 1602 | barbie 1603 | specific 1604 | pas 1605 | um 1606 | avid 1607 | occasion 1608 | mustache 1609 | autograph 1610 | noise 1611 | diego 1612 | ruin 1613 | admit 1614 | fail 1615 | scotch 1616 | creature 1617 | costume 1618 | jeopardy 1619 | swift 1620 | africa 1621 | cd 1622 | account 1623 | edit 1624 | topic 1625 | handy 1626 | window 1627 | steelers 1628 | accent 1629 | activist 1630 | preacher 1631 | affect 1632 | apply 1633 | loose 1634 | refuse 1635 | bum 1636 | minnesota 1637 | gender 1638 | confuse 1639 | landscape 1640 | broadway 1641 | bmw 1642 | argue 1643 | foodie 1644 | trek 1645 | 
relieve 1646 | todd 1647 | path 1648 | freedom 1649 | pearl 1650 | tap 1651 | medication 1652 | vera 1653 | rapid 1654 | ohh 1655 | government 1656 | environmental 1657 | fault 1658 | ft 1659 | cope 1660 | carbs 1661 | standard 1662 | planet 1663 | mcqueen 1664 | nugget 1665 | pull 1666 | difference 1667 | babysit 1668 | teller 1669 | disappoint 1670 | mermaid 1671 | pageant 1672 | soap 1673 | midwest 1674 | giant 1675 | puerto 1676 | bowling 1677 | asian 1678 | arizona 1679 | happiness 1680 | provide 1681 | fond 1682 | bud 1683 | cell 1684 | entertain 1685 | butterfly 1686 | genius 1687 | scream 1688 | architect 1689 | jar 1690 | monkey 1691 | theatre 1692 | hah 1693 | purchase 1694 | yay 1695 | vote 1696 | level 1697 | cab 1698 | combo 1699 | tool 1700 | fluent 1701 | duck 1702 | creek 1703 | load 1704 | affair 1705 | humor 1706 | slave 1707 | publishing 1708 | painful 1709 | redhead 1710 | jerry 1711 | thursday 1712 | dungeon 1713 | cape 1714 | messy 1715 | approve 1716 | cousin 1717 | weekly 1718 | terrier 1719 | paddle 1720 | david 1721 | typical 1722 | waffle 1723 | desk 1724 | catholic 1725 | germany 1726 | instagram 1727 | admire 1728 | image 1729 | upset 1730 | muscle 1731 | monday 1732 | pic 1733 | basement 1734 | ground 1735 | wave 1736 | shellfish 1737 | technician 1738 | episode 1739 | jim 1740 | jacob 1741 | ballerina 1742 | loan 1743 | improve 1744 | garage 1745 | ibm 1746 | tooth 1747 | bachelor 1748 | thrill 1749 | hippie 1750 | notice 1751 | return 1752 | crush 1753 | jesus 1754 | stomach 1755 | techno 1756 | raven 1757 | nerve 1758 | denver 1759 | foster 1760 | thriller 1761 | rugby 1762 | daydream 1763 | oreo 1764 | discover 1765 | detroit 1766 | cult 1767 | atlanta 1768 | incredible 1769 | stable 1770 | poem 1771 | yr 1772 | oooh 1773 | wide 1774 | mango 1775 | snowboard 1776 | weapon 1777 | countryside 1778 | alcoholic 1779 | brunch 1780 | fisherman 1781 | aunt 1782 | toronto 1783 | sarah 1784 | addiction 1785 | surround 1786 | lactose 1787 | clinic 1788 | universe 1789 | pretzel 1790 | toto 1791 | blah 1792 | sir 1793 | sausage 1794 | cosplay 1795 | text 1796 | quality 1797 | millionaire 1798 | clerk 1799 | skinny 1800 | hendrix 1801 | disabled 1802 | puzzle 1803 | pepperoni 1804 | civil 1805 | surfer 1806 | larp 1807 | package 1808 | roof 1809 | pa 1810 | suspense 1811 | drone 1812 | mail 1813 | jason 1814 | exam 1815 | mark 1816 | financial 1817 | encyclopedia 1818 | cheetos 1819 | demand 1820 | shell 1821 | planning 1822 | stupid 1823 | yell 1824 | grateful 1825 | bingo 1826 | source 1827 | companion 1828 | director 1829 | bite 1830 | detective 1831 | biography 1832 | gospel 1833 | silly 1834 | pudding 1835 | pork 1836 | teeth 1837 | autobiography 1838 | salt 1839 | footstep 1840 | deserve 1841 | produce 1842 | swimmer 1843 | barbecue 1844 | maryland 1845 | btw 1846 | defense 1847 | fallon 1848 | teen 1849 | continue 1850 | cart 1851 | wizard 1852 | meditate 1853 | shelf 1854 | zombie 1855 | irish 1856 | pecan 1857 | bubble 1858 | discount 1859 | scooter 1860 | push 1861 | tutorial 1862 | scuba 1863 | homemade 1864 | weakness 1865 | translator 1866 | gymnastics 1867 | background 1868 | softball 1869 | kidding 1870 | mistake 1871 | realize 1872 | ironic 1873 | floyd 1874 | flight 1875 | surgeon 1876 | crack 1877 | desire 1878 | introvert 1879 | ted 1880 | knife 1881 | mba 1882 | pottery 1883 | orphan 1884 | recover 1885 | mini 1886 | bucket 1887 | perk 1888 | autism 1889 | moped 1890 | cycle 1891 | youth 1892 | spoil 1893 | attitude 1894 | injury 1895 | 
pennsylvania 1896 | aspire 1897 | loyal 1898 | attack 1899 | murder 1900 | price 1901 | tank 1902 | safety 1903 | olympics 1904 | rome 1905 | dj 1906 | carpenter 1907 | sesame 1908 | consume 1909 | protect 1910 | british 1911 | sword 1912 | cheetah 1913 | float 1914 | asia 1915 | mate 1916 | creed 1917 | xbox 1918 | lean 1919 | freckle 1920 | caffeine 1921 | hunter 1922 | trump 1923 | programming 1924 | picnic 1925 | ear 1926 | fridge 1927 | easter 1928 | asparagus 1929 | oldie 1930 | darn 1931 | disagree 1932 | mirror 1933 | hehe 1934 | ew 1935 | biscuit 1936 | freshman 1937 | whistle 1938 | usual 1939 | inch 1940 | deli 1941 | eclipse 1942 | cycling 1943 | modern 1944 | tease 1945 | review 1946 | pattern 1947 | award 1948 | belong 1949 | nickname 1950 | buff 1951 | sweden 1952 | cuz 1953 | selfish 1954 | personality 1955 | graduation 1956 | dolphin 1957 | album 1958 | pup 1959 | taylor 1960 | lease 1961 | educate 1962 | depress 1963 | paramedic 1964 | gila 1965 | purpose 1966 | prison 1967 | worried 1968 | vermont 1969 | block 1970 | buffalo 1971 | olive 1972 | identical 1973 | score 1974 | string 1975 | actual 1976 | intelligent 1977 | epilepsy 1978 | sand 1979 | plate 1980 | subtitle 1981 | border 1982 | cable 1983 | smooth 1984 | lexus 1985 | develop 1986 | vienna 1987 | brave 1988 | welfare 1989 | doberman 1990 | wealthy 1991 | switch 1992 | sneak 1993 | robert 1994 | hill 1995 | nacho 1996 | mugger 1997 | snap 1998 | dumb 1999 | coworker 2000 | peta 2001 | hr 2002 | chick 2003 | fur 2004 | goalie 2005 | range 2006 | introduce 2007 | costco 2008 | railroad 2009 | suffer 2010 | menu 2011 | soldier 2012 | asthma 2013 | sex 2014 | lindsey 2015 | utah 2016 | grandfather 2017 | documentary 2018 | admirable 2019 | traveling 2020 | dane 2021 | employee 2022 | article 2023 | caramel 2024 | harley 2025 | equipment 2026 | heal 2027 | basic 2028 | dabble 2029 | depression 2030 | attract 2031 | gaga 2032 | tale 2033 | tube 2034 | fried 2035 | doggy 2036 | mash 2037 | taught 2038 | receptionist 2039 | empire 2040 | intern 2041 | piercings 2042 | tackle 2043 | ease 2044 | blanket 2045 | participate 2046 | cheesy 2047 | gray 2048 | daisy 2049 | toddler 2050 | link 2051 | diamond 2052 | propose 2053 | dallas 2054 | president 2055 | ranger 2056 | wolf 2057 | gain 2058 | chai 2059 | annoying 2060 | earring 2061 | version 2062 | basket 2063 | lens 2064 | salary 2065 | corner 2066 | champion 2067 | firefighter 2068 | ferret 2069 | achieve 2070 | sears 2071 | mia 2072 | idol 2073 | joe 2074 | decade 2075 | emotion 2076 | koala 2077 | management 2078 | pharmacist 2079 | apps 2080 | depends 2081 | killer 2082 | fellow 2083 | uniform 2084 | gourmet 2085 | cleveland 2086 | motivate 2087 | hummus 2088 | entertainment 2089 | mmmm 2090 | bag 2091 | nashville 2092 | flirt 2093 | owen 2094 | trumpet 2095 | nevada 2096 | stage 2097 | jerky 2098 | responsibility 2099 | drake 2100 | bentley 2101 | gold 2102 | tx 2103 | arcade 2104 | ankle 2105 | vegas 2106 | kj 2107 | batman 2108 | unwind 2109 | keyboard 2110 | combination 2111 | leaf 2112 | koi 2113 | cello 2114 | minimum 2115 | adventurous 2116 | loving 2117 | kiddos 2118 | spill 2119 | diabetic 2120 | central 2121 | rob 2122 | trend 2123 | bubblegum 2124 | indoors 2125 | monster 2126 | drunk 2127 | confidence 2128 | pyramid 2129 | grunge 2130 | banker 2131 | breath 2132 | grasshopper 2133 | hoop 2134 | encourage 2135 | gutter 2136 | macaroni 2137 | robotics 2138 | double 2139 | seat 2140 | dew 2141 | uncomfortable 2142 | flash 2143 | bench 2144 | bomb 2145 | info 
2146 | semi 2147 | comfort 2148 | wildlife 2149 | geology 2150 | lonely 2151 | coz 2152 | edge 2153 | anniversary 2154 | vitamin 2155 | material 2156 | hotdog 2157 | gathering 2158 | socialize 2159 | machine 2160 | editor 2161 | droopy 2162 | brew 2163 | liberal 2164 | mercedes 2165 | quarterback 2166 | rn 2167 | planner 2168 | fortunate 2169 | ur 2170 | greenhouse 2171 | si 2172 | psychologist 2173 | promotion 2174 | def 2175 | discovery 2176 | carb 2177 | humane 2178 | broken 2179 | chanel 2180 | fluffy 2181 | chain 2182 | vision 2183 | stanford 2184 | pipe 2185 | ability 2186 | overweight 2187 | promote 2188 | labrador 2189 | veteran 2190 | preference 2191 | symphony 2192 | alpaca 2193 | ve 2194 | app 2195 | teenager 2196 | anne 2197 | promise 2198 | doo 2199 | publisher 2200 | curious 2201 | tiki 2202 | porsche 2203 | mixed 2204 | maid 2205 | legend 2206 | michael 2207 | supportive 2208 | pineapple 2209 | ariel 2210 | diabetes 2211 | consulting 2212 | starve 2213 | gal 2214 | collar 2215 | gable 2216 | battle 2217 | jacket 2218 | sexy 2219 | sleeve 2220 | felix 2221 | pastime 2222 | jamaica 2223 | mortal 2224 | weak 2225 | scrub 2226 | rabbit 2227 | godfather 2228 | sinatra 2229 | valley 2230 | despise 2231 | regret 2232 | goodwill 2233 | heaven 2234 | buddhist 2235 | smoking 2236 | oops 2237 | pitbulls 2238 | salmon 2239 | rick 2240 | bitcoin 2241 | dip 2242 | trout 2243 | pill 2244 | farming 2245 | thankful 2246 | tokyo 2247 | housewife 2248 | prayer 2249 | impala 2250 | valedictorian 2251 | plain 2252 | message 2253 | temper 2254 | flintstone 2255 | leprechaun 2256 | sucker 2257 | breathe 2258 | csi 2259 | criminal 2260 | rip 2261 | maiden 2262 | fascinating 2263 | rico 2264 | algeria 2265 | report 2266 | umm 2267 | patience 2268 | leader 2269 | curl 2270 | motivation 2271 | climbing 2272 | tahoe 2273 | ymca 2274 | relief 2275 | glacier 2276 | breast 2277 | enter 2278 | clutter 2279 | dull 2280 | fighter 2281 | tat 2282 | awake 2283 | brewery 2284 | victorian 2285 | volcano 2286 | friends 2287 | mount 2288 | pillage 2289 | magical 2290 | generation 2291 | clue 2292 | conscious 2293 | stare 2294 | silver 2295 | wrestling 2296 | levine 2297 | joint 2298 | restore 2299 | everest 2300 | dope 2301 | stray 2302 | international 2303 | parking 2304 | hampshire 2305 | hearse 2306 | warehouse 2307 | pitbull 2308 | nyu 2309 | outdoorsy 2310 | development 2311 | employment 2312 | drinker 2313 | zumba 2314 | paul 2315 | budget 2316 | daniel 2317 | eyesight 2318 | sour 2319 | mouth 2320 | stain 2321 | blogger 2322 | exist 2323 | rib 2324 | brush 2325 | interview 2326 | bff 2327 | custom 2328 | snuggle 2329 | vancouver 2330 | mario 2331 | ferraris 2332 | mural 2333 | poet 2334 | oriole 2335 | period 2336 | karma 2337 | damage 2338 | warmer 2339 | crossword 2340 | childrens 2341 | pomeranian 2342 | imaginary 2343 | dave 2344 | anatomy 2345 | tone 2346 | code 2347 | videogames 2348 | woodstock 2349 | convention 2350 | janitor 2351 | preschool 2352 | screen 2353 | prejudice 2354 | crystal 2355 | rage 2356 | tradition 2357 | chatting 2358 | traditional 2359 | parakeet 2360 | ramen 2361 | combat 2362 | multiple 2363 | crave 2364 | syrup 2365 | racing 2366 | highlight 2367 | communist 2368 | concentrate 2369 | waiter 2370 | ebooks 2371 | dodge 2372 | hp 2373 | boil 2374 | attic 2375 | medal 2376 | commitment 2377 | release 2378 | downtown 2379 | alligator 2380 | statement 2381 | debate 2382 | agreed 2383 | maga 2384 | homeschooled 2385 | strength 2386 | plumber 2387 | hippy 2388 | windy 2389 | condition 
2390 | smoothie 2391 | stair 2392 | content 2393 | depressed 2394 | ferrell 2395 | keto 2396 | remodel 2397 | donut 2398 | winner 2399 | playlist 2400 | wayne 2401 | nation 2402 | kpop 2403 | map 2404 | coon 2405 | junior 2406 | mum 2407 | tape 2408 | quake 2409 | smithsonian 2410 | washer 2411 | abigail 2412 | radiohead 2413 | humble 2414 | unicycle 2415 | administration 2416 | ontario 2417 | performance 2418 | truth 2419 | fred 2420 | ingredient 2421 | cucumber 2422 | beastie 2423 | orchestra 2424 | sewing 2425 | knock 2426 | culinary 2427 | sweat 2428 | seashell 2429 | impression 2430 | network 2431 | languages 2432 | tailgate 2433 | celebration 2434 | thomas 2435 | embarrass 2436 | born 2437 | mama 2438 | freeze 2439 | crap 2440 | fortune 2441 | figurine 2442 | confident 2443 | homebody 2444 | chemistry 2445 | collector 2446 | merna 2447 | arrive 2448 | titanic 2449 | meditation 2450 | bout 2451 | manta 2452 | announcer 2453 | solo 2454 | circle 2455 | md 2456 | funeral 2457 | engine 2458 | butt 2459 | delivery 2460 | ultimate 2461 | specialize 2462 | web 2463 | palm 2464 | absolute 2465 | investment 2466 | harsh 2467 | pistachio 2468 | loner 2469 | experiment 2470 | gut 2471 | austen 2472 | fuel 2473 | cramp 2474 | trauma 2475 | sleepy 2476 | celtic 2477 | press 2478 | draft 2479 | auto 2480 | sprite 2481 | obsession 2482 | sip 2483 | fifty 2484 | vinyl 2485 | swing 2486 | fool 2487 | hbu 2488 | harvey 2489 | copperfield 2490 | playoff 2491 | kite 2492 | lesbian 2493 | jerk 2494 | owe 2495 | democrat 2496 | mass 2497 | hamilton 2498 | ga 2499 | uk 2500 | luis 2501 | impress 2502 | slice 2503 | pita 2504 | hobbie 2505 | apologize 2506 | santa 2507 | tacos 2508 | landing 2509 | hometown 2510 | telecom 2511 | mater 2512 | mutt 2513 | deploy 2514 | del 2515 | sore 2516 | nancy 2517 | barbies 2518 | fam 2519 | clay 2520 | ethnic 2521 | pastry 2522 | hostage 2523 | tight 2524 | backyard 2525 | convince 2526 | maker 2527 | curry 2528 | android 2529 | pc 2530 | jessica 2531 | ignore 2532 | flow 2533 | sickness 2534 | elderly 2535 | chore 2536 | upholstery 2537 | sweetie 2538 | lettuce 2539 | cuba 2540 | gadget 2541 | animation 2542 | trooper 2543 | faith 2544 | tongue 2545 | success 2546 | gentle 2547 | portrait 2548 | sheeran 2549 | chevrolet 2550 | packer 2551 | risk 2552 | spark 2553 | frustrate 2554 | mouse 2555 | pitch 2556 | weld 2557 | eyebrow 2558 | bella 2559 | linebacker 2560 | bully 2561 | routine 2562 | spelling 2563 | bc 2564 | coat 2565 | saudi 2566 | arabia 2567 | tampa 2568 | emmy 2569 | samsung 2570 | mop 2571 | kevin 2572 | checker 2573 | teapot 2574 | weigh 2575 | suv 2576 | miserable 2577 | sevenfold 2578 | f150 2579 | lit 2580 | posse 2581 | thai 2582 | curator 2583 | steve 2584 | poop 2585 | historical 2586 | morty 2587 | cane 2588 | miley 2589 | wise 2590 | petition 2591 | tear 2592 | penn 2593 | astronaut 2594 | cod 2595 | colour 2596 | acting 2597 | precious 2598 | buck 2599 | lucy 2600 | muse 2601 | cosmetic 2602 | occupation 2603 | nba 2604 | ate 2605 | flexible 2606 | ideal 2607 | suspender 2608 | bang 2609 | direct 2610 | gotti 2611 | agitate 2612 | hairdresser 2613 | dealership 2614 | influence 2615 | cursive 2616 | sunfish 2617 | snorkel 2618 | shallow 2619 | root 2620 | pediatrician 2621 | compost 2622 | coaster 2623 | nearby 2624 | foreman 2625 | deadbeat 2626 | penny 2627 | jay 2628 | jasper 2629 | tarot 2630 | pressure 2631 | clarinet 2632 | supper 2633 | express 2634 | ai 2635 | martini 2636 | favor 2637 | chop 2638 | lutefisk 2639 | charge 2640 | dakota 
2641 | hitchhike 2642 | formal 2643 | ivy 2644 | raptor 2645 | battlestar 2646 | captain 2647 | disgust 2648 | task 2649 | sitcom 2650 | yorkie 2651 | coco 2652 | understood 2653 | naw 2654 | ant 2655 | stinky 2656 | speckle 2657 | title 2658 | corporate 2659 | wednesday 2660 | gambler 2661 | wage 2662 | multi 2663 | mma 2664 | cookbook 2665 | citizen 2666 | hazel 2667 | aspiration 2668 | goat 2669 | stuck 2670 | lumberjack 2671 | flag 2672 | wet 2673 | ufc 2674 | learning 2675 | stirling 2676 | dealer 2677 | grisham 2678 | acre 2679 | --------------------------------------------------------------------------------
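The file above is the candidate-keyword vocabulary used by the ConvAI2 preprocessing step, stored as one keyword per line. As a minimal sketch of how such a list might be loaded, assuming only that format (the repository's actual loader lives in preprocess/convai2/api.py and is not shown here, so the helper name below is hypothetical):

    # Hypothetical helper, not part of the repository: it assumes
    # candi_keyword.txt stores exactly one keyword per line.
    def load_candidate_keywords(path='preprocess/convai2/candi_keyword.txt'):
        with open(path, encoding='utf-8') as f:
            # Strip surrounding whitespace and skip any blank lines.
            return [line.strip() for line in f if line.strip()]

    if __name__ == '__main__':
        keywords = load_candidate_keywords()
        keyword_set = set(keywords)  # constant-time membership checks
        print(len(keywords), keywords[:5])

Keeping both the ordered list and a set is a common pattern here: the list preserves the file's ordering, while the set supports fast membership tests when filtering dialogue tokens against the candidate vocabulary.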