├── preprocess
│   ├── convai2
│   │   ├── __init__.py
│   │   ├── api.py
│   │   └── candi_keyword.txt
│   ├── extraction.py
│   ├── prepare_data.py
│   ├── dataset.py
│   └── data_utils.py
├── train.py
├── config
│   ├── retrieval.py
│   ├── retrieval_stgy.py
│   ├── matrix.py
│   ├── neural.py
│   ├── kernel.py
│   └── data_config.py
├── chat.py
├── simulate.py
├── readme.md
└── model
    ├── retrieval.py
    ├── retrieval_stgy.py
    ├── matrix.py
    ├── neural.py
    └── kernel.py
/preprocess/convai2/__init__.py:
--------------------------------------------------------------------------------
1 | from .api import *
2 | 
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import importlib
3 | import os
4 | if __name__ == '__main__':
5 |     flags = tf.flags
6 |     flags.DEFINE_string('data', 'data_config', 'The data config')
7 |     flags.DEFINE_string('agent', 'kernel', 'The predictor type')
8 |     flags.DEFINE_string('mode', 'train', 'The mode')
9 | 
10 |     FLAGS = flags.FLAGS
11 |     config_data = importlib.import_module('config.' + FLAGS.data)
12 |     config_model = importlib.import_module('config.' + FLAGS.agent)
13 |     model = importlib.import_module('model.' + FLAGS.agent)
14 |     predictor = model.Predictor(config_model, config_data, FLAGS.mode)
15 |     if not os.path.exists('save/'+FLAGS.agent):
16 |         os.makedirs('save/'+FLAGS.agent)
17 | 
18 |     if FLAGS.mode == 'train_kw':
19 |         predictor.train_keywords()
20 |     if FLAGS.mode == 'test_kw':
21 |         predictor.test_keywords()
22 |     if FLAGS.mode == 'train':
23 |         predictor.train()
24 |         predictor.test()
25 |     if FLAGS.mode == 'test':
26 |         predictor.test()
27 | 
--------------------------------------------------------------------------------
/config/retrieval.py:
--------------------------------------------------------------------------------
1 | _hidden_size = 200
2 | _code_len = 200
3 | _save_path = 'save/retrieval/model_1'
4 | _max_epoch = 10
5 | 
6 | source_encoder_hparams = {
7 |     "encoder_minor_type": "UnidirectionalRNNEncoder",
8 |     "encoder_minor_hparams": {
9 |         "rnn_cell": {
10 |             "type": "GRUCell",
11 |             "kwargs": {
12 |                 "num_units": _hidden_size,
13 |             },
14 |         },
15 |     },
16 |     "encoder_major_type": "UnidirectionalRNNEncoder",
17 |     "encoder_major_hparams": {
18 |         "rnn_cell": {
19 |             "type": "GRUCell",
20 |             "kwargs": {
21 |                 "num_units": _hidden_size,
22 |             },
23 |         }
24 |     }
25 | }
26 | 
27 | target_encoder_hparams = {
28 |     "rnn_cell": {
29 |         "type": "GRUCell",
30 |         "kwargs": {
31 |             "num_units": _hidden_size,
32 |         },
33 |     }
34 | }
35 | 
36 | opt_hparams = {
37 |     "optimizer": {
38 |         "type": "AdamOptimizer",
39 |         "kwargs": {
40 |             "learning_rate": 0.001,
41 |         }
42 |     },
43 | }
--------------------------------------------------------------------------------
/config/retrieval_stgy.py:
--------------------------------------------------------------------------------
1 | _hidden_size = 200
2 | _code_len = 200
3 | _save_path = 'save/retrieval/model_1'
4 | _max_epoch = 10
5 | 
6 | source_encoder_hparams = {
7 |     "encoder_minor_type": "UnidirectionalRNNEncoder",
8 |     "encoder_minor_hparams": {
9 |         "rnn_cell": {
10 |             "type": "GRUCell",
11 |             "kwargs": {
12 |                 "num_units": _hidden_size,
13 |             },
14 |         },
15 |     },
16 |     "encoder_major_type": "UnidirectionalRNNEncoder",
17 |     "encoder_major_hparams": {
18 |         "rnn_cell": {
19 |             "type": "GRUCell",
20 |             "kwargs": {
21 |                 "num_units": _hidden_size,
22 |             },
23 |         }
24 |     }
25 | }
26 | 
27 | target_encoder_hparams = {
28 |     "rnn_cell": {
29 |         "type": "GRUCell",
30 |         "kwargs": {
31 |             "num_units":
_hidden_size, 32 | }, 33 | } 34 | } 35 | 36 | opt_hparams = { 37 | "optimizer": { 38 | "type": "AdamOptimizer", 39 | "kwargs": { 40 | "learning_rate": 0.001, 41 | } 42 | }, 43 | } -------------------------------------------------------------------------------- /config/matrix.py: -------------------------------------------------------------------------------- 1 | _hidden_size = 200 2 | _code_len = 800 3 | _save_path = 'save/matrix/model_1' 4 | _matrix_save_path = 'save/matrix/matrix_1.pk' 5 | _max_epoch = 10 6 | 7 | _vocab_path = 'tx_data/vocab.txt' 8 | _vocab = [x.strip() for x in open(_vocab_path, 'r').readlines()] 9 | _vocab_size = len(_vocab) 10 | 11 | source_encoder_hparams = { 12 | "encoder_minor_type": "BidirectionalRNNEncoder", 13 | "encoder_minor_hparams": { 14 | "rnn_cell_fw": { 15 | "type": "GRUCell", 16 | "kwargs": { 17 | "num_units": _hidden_size, 18 | }, 19 | }, 20 | "rnn_cell_share_config": True 21 | }, 22 | "encoder_major_type": "UnidirectionalRNNEncoder", 23 | "encoder_major_hparams": { 24 | "rnn_cell": { 25 | "type": "GRUCell", 26 | "kwargs": { 27 | "num_units": _hidden_size*2, 28 | }, 29 | } 30 | } 31 | } 32 | 33 | target_encoder_hparams = { 34 | "rnn_cell_fw": { 35 | "type": "GRUCell", 36 | "kwargs": { 37 | "num_units": _hidden_size, 38 | }, 39 | }, 40 | "rnn_cell_share_config": True 41 | } 42 | 43 | target_kwencoder_hparams = { 44 | "rnn_cell_fw": { 45 | "type": "GRUCell", 46 | "kwargs": { 47 | "num_units": _hidden_size, 48 | }, 49 | }, 50 | "rnn_cell_share_config": True 51 | } 52 | 53 | opt_hparams = { 54 | "optimizer": { 55 | "type": "AdamOptimizer", 56 | "kwargs": { 57 | "learning_rate": 0.001, 58 | } 59 | } 60 | } -------------------------------------------------------------------------------- /preprocess/extraction.py: -------------------------------------------------------------------------------- 1 | from data_utils import * 2 | 3 | class KeywordExtractor(): 4 | def __init__(self, idf_dict = None): 5 | self.idf_dict = idf_dict 6 | 7 | @staticmethod 8 | def is_keyword_tag(tag): 9 | return tag.startswith('VB') or tag.startswith('NN') or tag.startswith('JJ') 10 | 11 | @staticmethod 12 | def cal_tag_score(tag): 13 | if tag.startswith('VB'): 14 | return 1. 15 | if tag.startswith('NN'): 16 | return 2. 17 | if tag.startswith('JJ'): 18 | return 0.5 19 | return 0. 
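    # How idf_extract (below) scores a candidate keyword w in an utterance of
    # seq_len tokens:
    #
    #     score(w) = tag_score(POS of w) * count(w) * (1 / seq_len) * idf(w)
    #
    # and w is kept as a keyword only when score(w) > 0.15. For instance, a
    # noun (tag_score 2.0) occurring once in a 10-token utterance with an idf
    # of 1.2 scores 2.0 * 1 * 0.1 * 1.2 = 0.24 and is kept, while an adjective
    # (tag_score 0.5) with the same counts scores 0.06 and is dropped (the idf
    # value 1.2 here is a hypothetical number for illustration).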
20 | 21 | def idf_extract(self, string, con_kw = None): 22 | tokens = simp_tokenize(string) 23 | seq_len = len(tokens) 24 | tokens = pos_tag(tokens) 25 | source = kw_tokenize(string) 26 | candi = [] 27 | result = [] 28 | for i, (word, tag) in enumerate(tokens): 29 | score = self.cal_tag_score(tag) 30 | if not is_candiword(source[i]) or score == 0.: 31 | continue 32 | if con_kw is not None and source[i] in con_kw: 33 | continue 34 | score *= source.count(source[i]) 35 | score *= 1 / seq_len 36 | score *= self.idf_dict[source[i]] 37 | candi.append((source[i], score)) 38 | if score > 0.15: 39 | result.append(source[i]) 40 | return list(set(result)) 41 | 42 | 43 | def extract(self, string): 44 | tokens = simp_tokenize(string) 45 | tokens = pos_tag(tokens) 46 | source = kw_tokenize(string) 47 | kwpos_alters = [] 48 | for i, (word, tag) in enumerate(tokens): 49 | if source[i] and self.is_keyword_tag(tag): 50 | kwpos_alters.append(i) 51 | kwpos, keywords = [], [] 52 | for id in kwpos_alters: 53 | if is_candiword(source[id]): 54 | keywords.append(source[id]) 55 | return list(set(keywords)) -------------------------------------------------------------------------------- /config/neural.py: -------------------------------------------------------------------------------- 1 | _hidden_size = 200 2 | _code_len = 800 3 | _save_path = 'save/neural/model_1' 4 | _neural_save_path = 'save/neural/keyword_1' 5 | _max_epoch = 10 6 | 7 | neural_opt_hparams = { 8 | "optimizer": { 9 | "type": "AdamOptimizer", 10 | "kwargs": { 11 | "learning_rate": 0.005, 12 | } 13 | }, 14 | "learning_rate_decay": { 15 | "type": "inverse_time_decay", 16 | "kwargs": { 17 | "decay_steps": 1600, 18 | "decay_rate": 0.8 19 | }, 20 | "start_decay_step": 0, 21 | "end_decay_step": 16000, 22 | }, 23 | } 24 | 25 | source_encoder_hparams = { 26 | "encoder_minor_type": "BidirectionalRNNEncoder", 27 | "encoder_minor_hparams": { 28 | "rnn_cell_fw": { 29 | "type": "GRUCell", 30 | "kwargs": { 31 | "num_units": _hidden_size, 32 | }, 33 | }, 34 | "rnn_cell_share_config": True 35 | }, 36 | "encoder_major_type": "UnidirectionalRNNEncoder", 37 | "encoder_major_hparams": { 38 | "rnn_cell": { 39 | "type": "GRUCell", 40 | "kwargs": { 41 | "num_units": _hidden_size*2, 42 | }, 43 | } 44 | } 45 | } 46 | 47 | target_encoder_hparams = { 48 | "rnn_cell_fw": { 49 | "type": "GRUCell", 50 | "kwargs": { 51 | "num_units": _hidden_size, 52 | }, 53 | }, 54 | "rnn_cell_share_config": True 55 | } 56 | 57 | target_kwencoder_hparams = { 58 | "rnn_cell_fw": { 59 | "type": "GRUCell", 60 | "kwargs": { 61 | "num_units": _hidden_size, 62 | }, 63 | }, 64 | "rnn_cell_share_config": True 65 | } 66 | 67 | context_encoder_hparams = { 68 | "rnn_cell": { 69 | "type": "GRUCell", 70 | "kwargs": { 71 | "num_units": _hidden_size, 72 | }, 73 | } 74 | } 75 | 76 | opt_hparams = { 77 | "optimizer": { 78 | "type": "AdamOptimizer", 79 | "kwargs": { 80 | "learning_rate": 0.001, 81 | } 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /config/kernel.py: -------------------------------------------------------------------------------- 1 | _hidden_size = 200 2 | _code_len = 800 3 | _save_path = 'save/kernel/model_1' 4 | _kernel_save_path = 'save/kernel/keyword_1' 5 | _kernel_mu = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.] 
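# _kernel_mu and _kernel_sigma parameterize a bank of Gaussian (RBF) kernels:
# each mu is a kernel center and sigma its width, so a similarity score s is
# mapped to one soft-match feature per kernel. A minimal sketch of that
# mapping, assuming model/kernel.py (not shown in this section) consumes these
# values in the usual KNRM style:
#
#     import numpy as np
#     def kernel_features(s, mu=_kernel_mu, sigma=_kernel_sigma):
#         # one feature per kernel center; s is a scalar similarity in [0, 1]
#         return np.exp(-(np.asarray(mu) - s) ** 2 / (2 * sigma ** 2))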
6 | _kernel_sigma = 0.1
7 | _max_epoch = 10
8 | _early_stopping = 2
9 | 
10 | kernel_opt_hparams = {
11 |     "optimizer": {
12 |         "type": "AdamOptimizer",
13 |         "kwargs": {
14 |             "learning_rate": 0.001,
15 |         }
16 |     },
17 |     "learning_rate_decay": {
18 |         "type": "inverse_time_decay",
19 |         "kwargs": {
20 |             "decay_steps": 1600,
21 |             "decay_rate": 0.8
22 |         },
23 |         "start_decay_step": 0,
24 |         "end_decay_step": 16000,
25 |     },
26 | }
27 | 
28 | source_encoder_hparams = {
29 |     "encoder_minor_type": "BidirectionalRNNEncoder",
30 |     "encoder_minor_hparams": {
31 |         "rnn_cell_fw": {
32 |             "type": "GRUCell",
33 |             "kwargs": {
34 |                 "num_units": _hidden_size,
35 |             },
36 |         },
37 |         "rnn_cell_share_config": True
38 |     },
39 |     "encoder_major_type": "UnidirectionalRNNEncoder",
40 |     "encoder_major_hparams": {
41 |         "rnn_cell": {
42 |             "type": "GRUCell",
43 |             "kwargs": {
44 |                 "num_units": _hidden_size*2,
45 |             },
46 |         }
47 |     }
48 | }
49 | 
50 | target_encoder_hparams = {
51 |     "rnn_cell_fw": {
52 |         "type": "GRUCell",
53 |         "kwargs": {
54 |             "num_units": _hidden_size,
55 |         },
56 |     },
57 |     "rnn_cell_share_config": True
58 | }
59 | 
60 | target_kwencoder_hparams = {
61 |     "rnn_cell_fw": {
62 |         "type": "GRUCell",
63 |         "kwargs": {
64 |             "num_units": _hidden_size,
65 |         },
66 |     },
67 |     "rnn_cell_share_config": True
68 | }
69 | 
70 | context_encoder_hparams = {
71 |     "rnn_cell": {
72 |         "type": "GRUCell",
73 |         "kwargs": {
74 |             "num_units": _hidden_size,
75 |         },
76 |     }
77 | }
78 | 
79 | opt_hparams = {
80 |     "optimizer": {
81 |         "type": "AdamOptimizer",
82 |         "kwargs": {
83 |             "learning_rate": 0.001,
84 |         }
85 |     },
86 | }
--------------------------------------------------------------------------------
/chat.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import importlib
3 | import random
4 | from preprocess.data_utils import utter_preprocess, is_reach_goal
5 | 
6 | class Target_Chat():
7 |     def __init__(self, agent):
8 |         self.agent = agent
9 |         self.start_utter = config_data._start_corpus
10 |         with tf.Session(config=self.agent.gpu_config) as sess:
11 |             self.agent.retrieve_init(sess)
12 |             for i in range(int(FLAGS.times)):
13 |                 print('--------Session {} --------'.format(i))
14 |                 self.chat(sess)
15 | 
16 |     def chat(self, sess):
17 |         history = []
18 |         history.append(random.sample(self.start_utter, 1)[0])
19 |         target_kw = random.sample(target_set,1)[0]
20 |         self.agent.target = target_kw
21 |         self.agent.score = 0.
22 |         self.agent.reply_list = []
23 |         print('START: ' + history[0])
24 |         for i in range(config_data._max_turns):
25 |             history.append(input('HUMAN: '))
26 |             source = utter_preprocess(history, self.agent.data_config._max_seq_len)
27 |             reply = self.agent.retrieve(source, sess)
28 |             print('AGENT: ', reply)
29 |             # print('Keyword: {}, Similarity: {:.2f}'.format(self.agent.next_kw, self.agent.score))
30 |             history.append(reply)
31 |             if is_reach_goal(history[-2] + history[-1], target_kw):
32 |                 print('Successfully chat to the target \'{}\'.'.format(target_kw))
33 |                 return
34 |         print('Failed by reaching the maximum turn, target: \'{}\'.'.format(target_kw))
35 | 
36 | if __name__ == '__main__':
37 |     flags = tf.flags
38 |     # supports kernel / matrix / neural / retrieval / retrieval_stgy
39 |     flags.DEFINE_string('agent', 'kernel', 'The agent type')
40 |     flags.DEFINE_string('times', '100', 'Conversation times')
41 |     FLAGS = flags.FLAGS
42 | 
43 |     config_data = importlib.import_module('config.data_config')
44 |     config_model = importlib.import_module('config.'
+ FLAGS.agent)
45 |     model = importlib.import_module('model.' + FLAGS.agent)
46 |     predictor = model.Predictor(config_model, config_data, 'test')
47 | 
48 |     target_set = []
49 |     for line in open('tx_data/test/keywords.txt', 'r').readlines():
50 |         target_set = target_set + line.strip().split(' ')
51 | 
52 |     Target_Chat(predictor)
53 | 
--------------------------------------------------------------------------------
/preprocess/prepare_data.py:
--------------------------------------------------------------------------------
1 | from dataset import dts_Target
2 | from collections import Counter
3 | import pickle
4 | import random
5 | import os
6 | import shutil
7 | if not os.path.exists('../tx_data'):
8 |     os.mkdir('../tx_data')
9 |     os.mkdir('../tx_data/train')
10 |     os.mkdir('../tx_data/valid')
11 |     os.mkdir('../tx_data/test')
12 | 
13 | # import texar
14 | # if not os.path.exists('convai2/source'):
15 | #     print('Downloading source ConvAI2 data')
16 | #     texar.data.maybe_download('https://drive.google.com/file/d/1LPxNIVO52hZOwbV3Zply_ITi2Uacit-V/view?usp=sharing'
17 | #                               ,'convai2', extract=True)
18 | 
19 | shutil.copy('convai2/source/embedding.txt', '../tx_data/embedding.txt')
20 | dataset = dts_Target()
21 | dataset.make_dataset()
22 | 
23 | data = pickle.load(open("source_data.pk","rb"))
24 | max_utter = 9
25 | candidate_num = 20
26 | start_corpus_file = open("../tx_data/start_corpus.txt", "w")
27 | corpus_file = open("../tx_data/corpus.txt", "w")
28 | 
29 | for stage in ['train', 'valid', 'test']:
30 |     source_file = open("../tx_data/{}/source.txt".format(stage), "w")
31 |     target_file = open("../tx_data/{}/target.txt".format(stage), "w")
32 |     context_file = open("../tx_data/{}/context.txt".format(stage), "w")
33 |     keywords_file = open("../tx_data/{}/keywords.txt".format(stage), "w")
34 |     label_file = open("../tx_data/{}/label.txt".format(stage), "w")
35 |     keywords_vocab_file = open("../tx_data/{}/keywords_vocab.txt".format(stage), "w")
36 |     corpus = []
37 |     keywords_counter = Counter()
38 |     for sample in data[stage]:
39 |         corpus += sample['dialog'][1:]
40 |         start_corpus_file.write(sample['dialog'][0]+ '\n')
41 |         for kws in sample['kwlist']:
42 |             keywords_counter.update(kws)
43 |     for kw, _ in keywords_counter.most_common():
44 |         keywords_vocab_file.write(kw + '\n')
45 |     for sample in data[stage]:
46 |         for i in range(2, len(sample['dialog'])):
47 |             if len(sample['kwlist'][i]) > 0:
48 |                 source_list = sample['dialog'][max(0, i - max_utter):i]
49 |                 source_str = '|||'.join(source_list)
50 |                 while True:
51 |                     random_corpus = random.sample(corpus, candidate_num - 1)
52 |                     if sample['dialog'][i] not in random_corpus:
53 |                         break
54 |                 corpus_file.write(sample['dialog'][i] + '\n')
55 |                 target_list = [sample['dialog'][i]] + random_corpus
56 |                 target_str = '|||'.join(target_list)
57 |                 source_file.write(source_str + '\n')
58 |                 target_file.write(target_str + '\n')
59 |                 context_file.write(' '.join(sample['kwlist'][i-2] +
60 |                                             sample['kwlist'][i-1]) + '\n')
61 |                 keywords_file.write(' '.join(sample['kwlist'][i]) + '\n')
62 |                 label_file.write('0\n')
63 | 
64 |     source_file.close()
65 |     target_file.close()
66 |     keywords_file.close()
67 |     label_file.close()
68 |     keywords_vocab_file.close()
69 |     context_file.close()
70 | 
71 | start_corpus_file.close()
72 | corpus_file.close()
73 | 
--------------------------------------------------------------------------------
/preprocess/convai2/api.py:
--------------------------------------------------------------------------------
1 | import os
2 | data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
'source')
3 | 
4 | class dts_ConvAI2(object):
5 |     def __init__(self, path=data_path):
6 |         self.path = path
7 | 
8 |     def _txt_to_json(self, txt_path, mode, cands):
9 |         def pop_one_sample(lines):
10 |             self_persona = []
11 |             other_persona = []
12 |             dialog = []
13 |             candidates = []
14 | 
15 |             started = False
16 |             while len(lines) > 0:
17 |                 line = lines.pop()
18 |                 id, context = line.split(' ', 1)
19 |                 id = int(id)
20 |                 context = context.strip()
21 | 
22 |                 if started == False:  # not started
23 |                     assert id == 1
24 |                     started = True
25 |                 elif id == 1:  # break for next
26 |                     lines.append(line)
27 |                     break
28 | 
29 |                 if context.startswith('partner\'s persona: '):  # partner
30 |                     assert mode in ['both', 'other']
31 |                     other_persona.append(context[19:])
32 | 
33 |                 elif context.startswith('your persona: '):  # self
34 |                     assert mode in ['both', 'self']
35 |                     self_persona.append(context[14:])
36 | 
37 |                 elif cands == False:  # no cands
38 |                     try:
39 |                         uttr, response = context.split('\t', 2)[:2]
40 |                         dialog.append(uttr)
41 |                         dialog.append(response)
42 |                     except:
43 |                         uttr = context
44 |                         dialog.append(uttr)
45 |                 else:
46 |                     uttr, response, _, negs = context.split('\t', 4)[:4]
47 |                     dialog.append(uttr)
48 |                     dialog.append(response)
49 |                     candidates.append(negs.split('|'))
50 |                     candidates.append(None)
51 | 
52 |             return {
53 |                 'self_persona': self_persona,
54 |                 'other_persona': other_persona,
55 |                 'dialog': dialog,
56 |                 'candidates': candidates
57 |             }
58 | 
59 |         lines = open(txt_path, 'r').readlines()[::-1]
60 | 
61 |         samples = []
62 |         while len(lines) > 0:
63 |             samples.append(pop_one_sample(lines))
64 | 
65 |         return samples
66 | 
67 |     def get_data(self, mode='train', revised=False, cands=False):
68 |         txt_path = os.path.join(self.path, '{}_{}_{}{}.txt'.format(
69 |             mode,
70 |             'none',
71 |             'revised' if revised is True else 'original',
72 |             '' if cands is True else '_no_cands'))
73 |         assert mode in ['train', 'valid', 'test', 'all']
74 |         print("Get dialog from ", txt_path)
75 |         assert os.path.exists(txt_path)
76 |         return self._txt_to_json(txt_path, mode, cands)
77 | 
78 |     def get_dialogs(self, mode='all'):
79 |         dialogs = [sample['dialog'] for sample in self.get_data(mode, False, False)]
80 |         return dialogs
--------------------------------------------------------------------------------
/config/data_config.py:
--------------------------------------------------------------------------------
1 | import os
2 | data_root = './tx_data'
3 | _corpus = [x.strip() for x in open('tx_data/corpus.txt', 'r').readlines()]
4 | _start_corpus = [x.strip() for x in open('tx_data/start_corpus.txt', 'r').readlines()]
5 | _max_seq_len = 30
6 | _num_neg = 20
7 | _max_turns = 8
8 | _batch_size = 64
9 | _retrieval_candidates = 1000
10 | 
11 | data_hparams = {
12 |     stage: {
13 |         "num_epochs": 1,
14 |         "shuffle": stage != 'test',
15 |         "batch_size": _batch_size,
16 |         "datasets": [
17 |             {  # dialogue history
18 |                 "variable_utterance": True,
19 |                 "max_utterance_cnt": 9,
20 |                 "max_seq_length": _max_seq_len,
21 |                 "files": [os.path.join(data_root, '{}/source.txt'.format(stage))],
22 |                 "vocab_file": os.path.join(data_root, 'vocab.txt'),
23 |                 "embedding_init": {
24 |                     "file": os.path.join(data_root, 'embedding.txt'),
25 |                     "dim": 200,
26 |                     "read_fn": "load_glove"
27 |                 },
28 |                 "data_name": "source"
29 |             },
30 |             {  # candidate response
31 |                 "variable_utterance": True,
32 |                 "max_utterance_cnt": 20,
33 |                 "max_seq_length": _max_seq_len,
34 |                 "files": [os.path.join(data_root, '{}/target.txt'.format(stage))],
35 |                 "vocab_share_with": 0,
36 |                 "embedding_init_share_with" : 0,
37 | 
"data_name": "target" 38 | }, 39 | { # context (source keywords) 40 | "files": [os.path.join(data_root, '{}/context.txt'.format(stage))], 41 | "vocab_share_with": 0, 42 | "embedding_init_share_with": 0, 43 | "data_name": "context", 44 | "bos_token": '', 45 | "eos_token": '', 46 | }, 47 | { # target keywords 48 | "files": [os.path.join(data_root, '{}/keywords.txt'.format(stage))], 49 | "vocab_share_with": 0, 50 | "embedding_init_share_with": 0, 51 | "data_name": "keywords", 52 | "bos_token": '', 53 | "eos_token": '', 54 | }, 55 | { # label 56 | "files": [os.path.join(data_root, '{}/label.txt'.format(stage))], 57 | "data_type": "int", 58 | "data_name": "label" 59 | } 60 | ] 61 | } 62 | for stage in ['train','valid','test'] 63 | } 64 | 65 | 66 | corpus_hparams = { 67 | "batch_size": _batch_size*2, 68 | "shuffle": False, 69 | "dataset":{ 70 | "max_seq_length": _max_seq_len, 71 | "files": [os.path.join(data_root, 'corpus.txt')], 72 | "vocab_file": os.path.join(data_root, 'vocab.txt'), 73 | "data_name": "corpus" 74 | } 75 | } 76 | 77 | 78 | _keywords_path = 'tx_data/test/keywords_vocab.txt' 79 | _keywords_candi = [x.strip() for x in open(_keywords_path, 'r').readlines()] 80 | _keywords_num = len(_keywords_candi) 81 | _keywords_dict = {} 82 | for i in range(_keywords_num): 83 | _keywords_dict[_keywords_candi[i]] = i 84 | -------------------------------------------------------------------------------- /preprocess/dataset.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import collections 3 | import random 4 | import pickle 5 | from convai2 import dts_ConvAI2 6 | from extraction import KeywordExtractor 7 | from data_utils import * 8 | 9 | class dts_Target(dts_ConvAI2): 10 | def __init__(self, *args, **kwargs): 11 | super().__init__(*args, **kwargs) 12 | 13 | def get_vocab(self): 14 | counter = collections.Counter() 15 | dialogs = self.get_dialogs() 16 | for dialog in dialogs: 17 | for uttr in dialog: 18 | counter.update(simp_tokenize(uttr)) 19 | print('total vocab count: ', len(counter.items())) 20 | vocab = [token for token, times in sorted(list(counter.items()), key=lambda x: (-x[1], x[0]))] 21 | with open('../tx_data/vocab.txt','w') as f: 22 | for word in vocab: 23 | f.write(word + '\n') 24 | print('save vocab in vocab.txt') 25 | return vocab 26 | 27 | def get_kwsess(self, vocab, mode='all'): 28 | keyword_extractor = KeywordExtractor(vocab) 29 | corpus = self.get_data(mode = mode, cands=False) 30 | sess_set = [] 31 | for sess in corpus: 32 | data = {} 33 | data['history'] = '' 34 | data['dialog'] = [] 35 | for dialog in sess['dialog']: 36 | data['dialog'].append(dialog) 37 | data['history'] = data['history'] + ' ' + dialog 38 | data['kws'] = keyword_extractor.extract(data['history']) 39 | sess_set.append(data) 40 | return sess_set 41 | 42 | def cal_idf(self): 43 | counter = collections.Counter() 44 | dialogs = self.get_dialogs() 45 | total = 0. 
46 | for dialog in dialogs: 47 | for uttr in dialog: 48 | total += 1 49 | counter.update(set(kw_tokenize(uttr))) 50 | idf_dict = {} 51 | for k,v in counter.items(): 52 | idf_dict[k] = np.log10(total / (v+1.)) 53 | return idf_dict 54 | 55 | def make_dataset(self): 56 | vocab = self.get_vocab() 57 | idf_dict = self.cal_idf() 58 | kw_counter = collections.Counter() 59 | sess_set = self.get_kwsess(vocab) 60 | for data in sess_set: 61 | kw_counter.update(data['kws']) 62 | kw_freq = {} 63 | kw_sum = sum(kw_counter.values()) 64 | for k, v in kw_counter.most_common(): 65 | kw_freq[k] = v / kw_sum 66 | for data in sess_set: 67 | data['score'] = 0. 68 | for kw in set(data['kws']): 69 | data['score'] += kw_freq[kw] 70 | data['score'] /= len(set(data['kws'])) 71 | sess_set.sort(key=lambda x: x['score'], reverse=True) 72 | 73 | all_data = {'train':[], 'valid':[], 'test':[]} 74 | keyword_extractor = KeywordExtractor(idf_dict) 75 | for id, sess in enumerate(sess_set): 76 | type = 'train' 77 | if id < 500: 78 | type = 'test' 79 | elif random.random() < 0.05: 80 | type = 'valid' 81 | sample = {'dialog':sess['dialog'], 'kwlist':[]} 82 | for i in range(len(sess['dialog'])): 83 | sample['kwlist'].append(keyword_extractor.idf_extract(sess['dialog'][i])) 84 | all_data[type].append(sample) 85 | pickle.dump(all_data, open('source_data.pk','wb')) 86 | return all_data -------------------------------------------------------------------------------- /simulate.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import importlib 3 | import random 4 | from preprocess.data_utils import utter_preprocess, is_reach_goal 5 | from model import retrieval 6 | 7 | class Target_Simulation(): 8 | def __init__(self, config_model, config_data, config_retrieval): 9 | g1 = tf.Graph() 10 | with g1.as_default(): 11 | self.retrieval_agent = retrieval.Predictor(config_retrieval, config_data) 12 | sess1 = tf.Session(graph=g1, config=self.retrieval_agent.gpu_config) 13 | self.retrieval_agent.retrieve_init(sess1) 14 | g2 = tf.Graph() 15 | with g2.as_default(): 16 | self.target_agent = model.Predictor(config_model, config_data) 17 | sess2 = tf.Session(graph=g2, config=self.target_agent.gpu_config) 18 | self.target_agent.retrieve_init(sess2) 19 | self.start_utter = config_data._start_corpus 20 | success_cnt, turns_cnt = 0, 0 21 | for i in range(int(FLAGS.times)): 22 | print('--------Session {} --------'.format(i)) 23 | success, turns = self.simulate(sess1, sess2) 24 | success_cnt += success 25 | turns_cnt += turns 26 | print('success time {}, average turns {:.2f}'.format(success_cnt, turns_cnt / success_cnt)) 27 | 28 | def simulate(self, sess1, sess2): 29 | history = [] 30 | history.append(random.sample(self.start_utter,1)[0]) 31 | target_kw = random.sample(target_set,1)[0] 32 | self.target_agent.target = target_kw 33 | self.target_agent.score = 0. 
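        # score tracks the best keyword-to-target cosine similarity reached so
        # far in this session; the agents' retrieve() methods only switch to a
        # new keyword when its similarity to the target exceeds this value
        # (see the `tmp_score > self.score` checks in retrieval_stgy.py and
        # matrix.py), which forces monotonic progress toward the target.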
34 | self.target_agent.reply_list = [] 35 | self.retrieval_agent.reply_list = [] 36 | 37 | print('START: ' + history[0]) 38 | for i in range(config_data._max_turns): 39 | source = utter_preprocess(history, config_data._max_seq_len) 40 | reply = self.retrieval_agent.retrieve(source, sess1) 41 | print('retrieval_agent: ', reply) 42 | history.append(reply) 43 | source = utter_preprocess(history, config_data._max_seq_len) 44 | reply = self.target_agent.retrieve(source, sess2) 45 | print('{}_agent: '.format(FLAGS.agent), reply) 46 | print('Keyword: {}, Similarity: {:.2f}'.format(self.target_agent.next_kw, self.target_agent.score)) 47 | history.append(reply) 48 | if is_reach_goal(history[-2] + history[-1], target_kw): 49 | print('Successfully chat to the target \'{}\'.'.format(target_kw)) 50 | return (True, (len(history)+1)//2) 51 | 52 | print('Failed by reaching the maximum turn, target: \'{}\'.'.format(target_kw)) 53 | return (False, 0) 54 | 55 | if __name__ == '__main__': 56 | flags = tf.flags 57 | flags.DEFINE_string('agent', 'kernel', 'The agent type, supports kernel / matrix / neural / retrieval.') 58 | flags.DEFINE_string('times', '100', 'Simulation times.') 59 | 60 | FLAGS = flags.FLAGS 61 | config_data = importlib.import_module('config.data_config') 62 | config_model = importlib.import_module('config.' + FLAGS.agent) 63 | config_retrieval = importlib.import_module('config.retrieval') 64 | model = importlib.import_module('model.' + FLAGS.agent) 65 | 66 | target_set = [] 67 | for line in open('tx_data/test/keywords.txt', 'r').readlines(): 68 | target_set = target_set + line.strip().split(' ') 69 | 70 | Target_Simulation(config_model,config_data,config_retrieval) -------------------------------------------------------------------------------- /preprocess/data_utils.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import os 3 | from nltk.stem import WordNetLemmatizer 4 | 5 | _lemmatizer = WordNetLemmatizer() 6 | 7 | 8 | def tokenize(example, ppln): 9 | for fn in ppln: 10 | example = fn(example) 11 | return example 12 | 13 | 14 | def kw_tokenize(string): 15 | return tokenize(string, [nltk_tokenize, lower, pos_tag, to_basic_form]) 16 | 17 | 18 | def simp_tokenize(string): 19 | return tokenize(string, [nltk_tokenize, lower]) 20 | 21 | 22 | def nltk_tokenize(string): 23 | return nltk.word_tokenize(string) 24 | 25 | 26 | def lower(tokens): 27 | if not isinstance(tokens, str): 28 | return [lower(token) for token in tokens] 29 | return tokens.lower() 30 | 31 | 32 | def pos_tag(tokens): 33 | return nltk.pos_tag(tokens) 34 | 35 | 36 | def to_basic_form(tokens): 37 | if not isinstance(tokens, tuple): 38 | return [to_basic_form(token) for token in tokens] 39 | word, tag = tokens 40 | if tag.startswith('NN'): 41 | pos = 'n' 42 | elif tag.startswith('VB'): 43 | pos = 'v' 44 | elif tag.startswith('JJ'): 45 | pos = 'a' 46 | else: 47 | return word 48 | return _lemmatizer.lemmatize(word, pos) 49 | 50 | 51 | def truecasing(tokens): 52 | ret = [] 53 | is_start = True 54 | for word, tag in tokens: 55 | if word == 'i': 56 | ret.append('I') 57 | elif tag[0].isalpha(): 58 | if is_start: 59 | ret.append(word[0].upper() + word[1:]) 60 | else: 61 | ret.append(word) 62 | is_start = False 63 | else: 64 | if tag != ',': 65 | is_start = True 66 | ret.append(word) 67 | return ret 68 | 69 | 70 | candi_keyword_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'convai2/candi_keyword.txt') 71 | _candiwords = [x.strip() for x in 
open(candi_keyword_path).readlines()]
72 | 
73 | 
74 | def is_candiword(a):
75 |     if a in _candiwords:
76 |         return True
77 |     return False
78 | 
79 | 
80 | from nltk.corpus import wordnet as wn
81 | from nltk.corpus import wordnet_ic
82 | 
83 | brown_ic = wordnet_ic.ic('ic-brown.dat')
84 | 
85 | 
86 | def calculate_linsim(a, b):
87 |     linsim = -1
88 |     syna = wn.synsets(a)
89 |     synb = wn.synsets(b)
90 |     for sa in syna:
91 |         for sb in synb:
92 |             try:
93 |                 linsim = max(linsim, sa.lin_similarity(sb, brown_ic))
94 |             except:
95 |                 pass
96 |     return linsim
97 | 
98 | 
99 | def is_reach_goal(context, goal):
100 |     context = kw_tokenize(context)
101 |     if goal in context:
102 |         return True
103 |     for wd in context:
104 |         if is_candiword(wd):
105 |             rela = calculate_linsim(wd, goal)
106 |             if rela > 0.9:
107 |                 return True
108 |     return False
109 | 
110 | 
111 | def make_context(string):
112 |     string = kw_tokenize(string)
113 |     context = []
114 |     for word in string:
115 |         if is_candiword(word):
116 |             context.append(word)
117 |     return context
118 | 
119 | 
120 | def utter_preprocess(string_list, max_length):
121 |     source, minor_length = [], []
122 |     string_list = string_list[-9:]
123 |     major_length = len(string_list)
124 |     if major_length == 1:
125 |         context = make_context(string_list[-1])
126 |     else:
127 |         context = make_context(string_list[-2] + string_list[-1])
128 |     context_len = len(context)
129 |     while len(context) < 20:
130 |         context.append('')
131 |     for string in string_list:
132 |         string = simp_tokenize(string)
133 |         if len(string) > max_length:
134 |             string = string[:max_length]
135 |         string = [''] + string + ['']
136 |         minor_length.append(len(string))
137 |         while len(string) < max_length + 2:
138 |             string.append('')
139 |         source.append(string)
140 |     while len(source) < 9:
141 |         source.append([''] * (max_length + 2))
142 |         minor_length.append(0)
143 |     return (source, minor_length, major_length, context, context_len)
144 | 
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | # Target-Guided Open-Domain Conversation
2 | 
3 | This is the code for the following paper:
4 | 
5 | [Target-Guided Open-Domain Conversation](http://arxiv.org/abs/1905.11553)
6 | *Jianheng Tang, Tiancheng Zhao, Chenyan Xiong, Xiaodan Liang, Eric Xing, Zhiting Hu; ACL 2019*
7 | 
8 | ### Requirements
9 | 
10 | - `nltk==3.4`
11 | - `tensorflow==1.12`
12 | - `texar>=0.2.1` ([Texar](https://github.com/asyml/texar))
13 | 
14 | ### Usage
15 | 
16 | #### Data Preparation
17 | The dataset developed in the paper is on [Google Drive](https://drive.google.com/file/d/1oTjOQjm7iiUitOPLCmlkXOCbEPoSWDPX/view?usp=sharing). Download
18 | and unzip it into `preprocess/convai2`. Then run the following commands:
19 | ```shell
20 | cd preprocess
21 | python prepare_data.py
22 | ```
23 | By default, the processed data will be put in the `tx_data` directory.
24 | 
25 | #### Turn-level Supervised Learning
26 | This project provides 5 different types of agents: the kernel, neural, matrix, retrieval, and retrieval_stgy agents,
27 | all of which are described in the paper. You can modify the configuration of each agent in the `config` directory.
28 | 
29 | To train the kernel/neural/matrix agent, you need to first train/test the keyword prediction module,
30 | and then train/test the retrieval module of each agent specified by the `--agent` parameter.
31 | 
32 | ```shell
33 | python train.py --mode train_kw --agent kernel
34 | python train.py --mode train --agent kernel
35 | python train.py --mode test --agent kernel
36 | ```
37 | 
38 | The retrieval agent and the retrieval_stgy agent share the same retrieval module. You only need to train one of them:
39 | 
40 | ```shell
41 | python train.py --mode train --agent retrieval
42 | python train.py --mode test --agent retrieval
43 | ```
44 | 
45 | #### Target-guided Conversation
46 | 
47 | After turn-level training, you can start a target-guided conversation (human evaluation) with
48 | the kernel/neural/matrix/retrieval/retrieval_stgy agent specified by the `--agent` parameter.
49 | 
50 | ```shell
51 | python chat.py --agent kernel
52 | ```
53 | A demo of target-guided conversation (the user is required to input the response after `HUMAN:`):
54 | 
55 | ```
56 | --------Session 0 --------
57 | START: hello, how are you doing
58 | HUMAN: hi i am great just finishing up some homework. how about you
59 | AGENT: just got done eating some chicken and rice after i got done with my gym session
60 | HUMAN: i am gonna eat some vegetable for dinner
61 | AGENT: so you are vegetarian ? i am from los angeles so thereare lots of vegetarian restaurants
62 | HUMAN: i am not a vegetarian . fried chicken is my favorite.
63 | AGENT: oh boy , when i go to mcdonald i always get fries for my 2 dogs
64 | Successfully chat to the target 'mcdonald'.
65 | ```
66 | 
67 | You can also watch a simulated target-guided conversation
68 | between the retrieval agent acting as the user and the kernel/neural/matrix/retrieval_stgy agent specified by the `--agent` parameter. The number of simulations is set by the `--times` parameter. At the end, the program reports the number of successful sessions and the average number of turns.
69 | 
70 | ```shell
71 | python simulate.py --agent kernel --times 100
72 | ```
73 | 
74 | A demo of simulation:
75 | ```
76 | --------Session 0 --------
77 | START: hey hey good morning to you
78 | retrieval_agent: fine thanks . what do you do for a living ?
79 | kernel_agent: rewarding job , i work at a hospital
80 | Keyword: job, Similarity: 0.58
81 | Successfully chat to the target 'hospital'.
82 | 
83 | ...
84 | --------Session 99 --------
85 | START: hey hows it going ? i'm just cooking a steak
86 | retrieval_agent: i'm thinking of a bbq sandwich for lunch
87 | kernel_agent: nice i love to cook but now its just me and the fur babies
88 | Keyword: baby, Similarity: 0.45
89 | retrieval_agent: i love bagels however i own a dry cleaners
90 | kernel_agent: i love animals felix my cat and my dog emmy
91 | Keyword: cat, Similarity: 0.56
92 | retrieval_agent: sounds awesome i have all kind of pets my family own a farm
93 | kernel_agent: i love blue as well even my hair is blue
94 | Keyword: blue, Similarity: 1.00
95 | Successfully chat to the target 'blue'.
96 | 97 | success time 83, average turns 4.28 98 | ``` 99 | -------------------------------------------------------------------------------- /model/retrieval.py: -------------------------------------------------------------------------------- 1 | import texar as tx 2 | import tensorflow as tf 3 | import numpy as np 4 | 5 | 6 | class Predictor(): 7 | def __init__(self, config_model, config_data, mode=None): 8 | self.config = config_model 9 | self.data_config = config_data 10 | self.build_model() 11 | self.gpu_config = tf.ConfigProto() 12 | self.gpu_config.gpu_options.allow_growth = True 13 | 14 | def build_model(self): 15 | self.train_data = tx.data.MultiAlignedData(self.data_config.data_hparams['train']) 16 | self.valid_data = tx.data.MultiAlignedData(self.data_config.data_hparams['valid']) 17 | self.test_data = tx.data.MultiAlignedData(self.data_config.data_hparams['test']) 18 | self.iterator = tx.data.TrainTestDataIterator(train=self.train_data, val=self.valid_data, test=self.test_data) 19 | self.vocab = self.train_data.vocab(0) 20 | self.embedder = tx.modules.WordEmbedder(init_value=self.train_data.embedding_init_value(0).word_vecs) 21 | self.source_encoder = tx.modules.HierarchicalRNNEncoder(hparams=self.config.source_encoder_hparams) 22 | self.target_encoder = tx.modules.UnidirectionalRNNEncoder(hparams=self.config.target_encoder_hparams) 23 | self.linear_matcher = tx.modules.MLPTransformConnector(1) 24 | 25 | def forward(self, batch): 26 | source_embed = self.embedder(batch['source_text_ids']) 27 | target_embed = self.embedder(batch['target_text_ids']) 28 | target_embed = tf.reshape(target_embed, [-1, self.data_config._max_seq_len + 2, self.embedder.dim]) 29 | source_code = self.source_encoder(source_embed, 30 | sequence_length_minor=batch['source_length'], 31 | sequence_length_major=batch['source_utterance_cnt'])[1] 32 | target_length = tf.reshape(batch['target_length'], [-1]) 33 | target_code = self.target_encoder(target_embed, sequence_length=target_length)[1] 34 | target_code = tf.reshape(target_code, [-1, 20, self.config._code_len]) 35 | source_code = tf.expand_dims(source_code, 1) 36 | source_code = tf.tile(source_code, [1, 20, 1]) 37 | feature_code = target_code * source_code 38 | feature_code = tf.reshape(feature_code, [-1, self.config._code_len]) 39 | logits = self.linear_matcher(feature_code) 40 | logits = tf.reshape(logits, [-1, 20]) 41 | labels = tf.one_hot(batch['label'], 20) 42 | loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits)) 43 | ans = tf.arg_max(logits, -1) 44 | acc = tx.evals.accuracy(batch['label'], ans) 45 | rank = tf.nn.top_k(logits, k=20)[1] 46 | return loss, acc, rank 47 | 48 | def train(self): 49 | batch = self.iterator.get_next() 50 | loss, acc, _ = self.forward(batch) 51 | op_step = tf.Variable(0, name='op_step') 52 | train_op = tx.core.get_train_op(loss, global_step=op_step, hparams=self.config.opt_hparams) 53 | max_val_acc = 0. 
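        # max_val_acc implements best-checkpoint selection: after each training
        # epoch the validation acc@1 is measured, and the model is saved to
        # _save_path only when it improves, so test() later restores the
        # best-on-validation checkpoint rather than the last one.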
54 | self.saver = tf.train.Saver() 55 | with tf.Session(config=self.gpu_config) as sess: 56 | sess.run(tf.global_variables_initializer()) 57 | sess.run(tf.local_variables_initializer()) 58 | sess.run(tf.tables_initializer()) 59 | for epoch_id in range(self.config._max_epoch): 60 | self.iterator.switch_to_train_data(sess) 61 | cur_step = 0 62 | cnt_acc = [] 63 | while True: 64 | try: 65 | cur_step += 1 66 | feed = {tx.global_mode(): tf.estimator.ModeKeys.TRAIN} 67 | loss, acc_ = sess.run([train_op, acc], feed_dict=feed) 68 | cnt_acc.append(acc_) 69 | if cur_step % 200 == 0: 70 | print('batch {}, loss={}, acc1={}'.format(cur_step, loss, np.mean(cnt_acc[-200:]))) 71 | except tf.errors.OutOfRangeError: 72 | break 73 | op_step = op_step + 1 74 | self.iterator.switch_to_val_data(sess) 75 | cnt_acc = [] 76 | while True: 77 | try: 78 | feed = {tx.global_mode(): tf.estimator.ModeKeys.EVAL} 79 | acc_ = sess.run([acc], feed_dict=feed) 80 | cnt_acc.append(acc_) 81 | except tf.errors.OutOfRangeError: 82 | mean_acc = np.mean(cnt_acc) 83 | print('valid acc1={}'.format(mean_acc)) 84 | if mean_acc > max_val_acc: 85 | max_val_acc = mean_acc 86 | self.saver.save(sess, self.config._save_path) 87 | break 88 | 89 | def test(self): 90 | batch = self.iterator.get_next() 91 | loss, acc, rank = self.forward(batch) 92 | with tf.Session(config=self.gpu_config) as sess: 93 | sess.run(tf.tables_initializer()) 94 | self.saver = tf.train.Saver() 95 | self.saver.restore(sess, self.config._save_path) 96 | self.iterator.switch_to_test_data(sess) 97 | rank_cnt = [] 98 | while True: 99 | try: 100 | feed = {tx.global_mode(): tf.estimator.ModeKeys.PREDICT} 101 | ranks, labels = sess.run([rank, batch['label']], feed_dict=feed) 102 | for i in range(len(ranks)): 103 | rank_cnt.append(np.where(ranks[i]==labels[i])[0][0]) 104 | except tf.errors.OutOfRangeError: 105 | rec = [0,0,0,0,0] 106 | MRR = 0 107 | for rank in rank_cnt: 108 | for i in range(5): 109 | rec[i] += (rank <= i) 110 | MRR += 1 / (rank+1) 111 | print('test rec1@20={:.4f}, rec3@20={:.4f}, rec5@20={:.4f}, MRR={:.4f}'.format( 112 | rec[0]/len(rank_cnt), rec[2]/len(rank_cnt), rec[4]/len(rank_cnt), MRR/len(rank_cnt))) 113 | break 114 | 115 | def retrieve_init(self, sess): 116 | data_batch = self.iterator.get_next() 117 | loss, acc, _ = self.forward(data_batch) 118 | self.corpus = self.data_config._corpus 119 | self.corpus_data = tx.data.MonoTextData(self.data_config.corpus_hparams) 120 | corpus_iterator = tx.data.DataIterator(self.corpus_data) 121 | batch = corpus_iterator.get_next() 122 | corpus_embed = self.embedder(batch['corpus_text_ids']) 123 | utter_code = self.target_encoder(corpus_embed, sequence_length=batch['corpus_length'])[1] 124 | self.corpus_code = np.zeros([0, self.config._code_len]) 125 | corpus_iterator.switch_to_dataset(sess) 126 | sess.run(tf.tables_initializer()) 127 | saver = tf.train.Saver() 128 | saver.restore(sess, self.config._save_path) 129 | feed = {tx.global_mode(): tf.estimator.ModeKeys.PREDICT} 130 | while True: 131 | try: 132 | utter_code_ = sess.run(utter_code, feed_dict=feed) 133 | self.corpus_code = np.concatenate([self.corpus_code, utter_code_], axis=0) 134 | except tf.errors.OutOfRangeError: 135 | break 136 | 137 | self.minor_length_input = tf.placeholder(dtype=tf.int32, shape=(1, 9)) 138 | self.major_length_input = tf.placeholder(dtype=tf.int32, shape=(1)) 139 | self.history_input = tf.placeholder(dtype=object, shape=(9, self.data_config._max_seq_len + 2)) 140 | 141 | history_ids = self.vocab.map_tokens_to_ids(self.history_input) 142 | 
history_embed = self.embedder(history_ids) 143 | history_code = self.source_encoder(tf.expand_dims(history_embed, axis=0), 144 | sequence_length_minor=self.minor_length_input, 145 | sequence_length_major=self.major_length_input)[1] 146 | select_corpus = tf.cast(self.corpus_code, dtype=tf.float32) 147 | feature_code = self.linear_matcher(select_corpus * history_code) 148 | self.ans_output = tf.nn.top_k(tf.squeeze(feature_code, 1), k=self.data_config._retrieval_candidates)[1] 149 | 150 | def retrieve(self, source, sess): 151 | history, seq_len, turns, context, context_len = source 152 | ans = sess.run(self.ans_output, feed_dict={self.history_input: history, 153 | self.minor_length_input: [seq_len], 154 | self.major_length_input: [turns]}) 155 | for i in range(self.data_config._max_turns + 1): 156 | if ans[i] not in self.reply_list: # avoid repeat 157 | self.reply_list.append(ans[i]) 158 | reply = self.corpus[ans[i]] 159 | break 160 | return reply 161 | -------------------------------------------------------------------------------- /model/retrieval_stgy.py: -------------------------------------------------------------------------------- 1 | import texar as tx 2 | import tensorflow as tf 3 | import numpy as np 4 | from preprocess.data_utils import kw_tokenize 5 | 6 | class Predictor(): 7 | def __init__(self, config_model, config_data, mode=None): 8 | self.config = config_model 9 | self.data_config = config_data 10 | self.build_model() 11 | self.gpu_config = tf.ConfigProto() 12 | self.gpu_config.gpu_options.allow_growth = True 13 | 14 | def build_model(self): 15 | self.train_data = tx.data.MultiAlignedData(self.data_config.data_hparams['train']) 16 | self.valid_data = tx.data.MultiAlignedData(self.data_config.data_hparams['valid']) 17 | self.test_data = tx.data.MultiAlignedData(self.data_config.data_hparams['test']) 18 | self.iterator = tx.data.TrainTestDataIterator(train=self.train_data, val=self.valid_data, test=self.test_data) 19 | self.vocab = self.train_data.vocab(0) 20 | self.embedder = tx.modules.WordEmbedder(init_value=self.train_data.embedding_init_value(0).word_vecs) 21 | self.source_encoder = tx.modules.HierarchicalRNNEncoder(hparams=self.config.source_encoder_hparams) 22 | self.target_encoder = tx.modules.UnidirectionalRNNEncoder(hparams=self.config.target_encoder_hparams) 23 | self.linear_matcher = tx.modules.MLPTransformConnector(1) 24 | self.kw_list = self.vocab.map_tokens_to_ids(tf.convert_to_tensor(self.data_config._keywords_candi)) 25 | 26 | def forward(self, batch): 27 | source_embed = self.embedder(batch['source_text_ids']) 28 | target_embed = self.embedder(batch['target_text_ids']) 29 | target_embed = tf.reshape(target_embed, [-1, self.data_config._max_seq_len + 2, self.embedder.dim]) 30 | source_code = self.source_encoder(source_embed, 31 | sequence_length_minor=batch['source_length'], 32 | sequence_length_major=batch['source_utterance_cnt'])[1] 33 | target_length = tf.reshape(batch['target_length'], [-1]) 34 | target_code = self.target_encoder(target_embed, sequence_length=target_length)[1] 35 | target_code = tf.reshape(target_code, [-1, 20, self.config._code_len]) 36 | source_code = tf.expand_dims(source_code, 1) 37 | source_code = tf.tile(source_code, [1, 20, 1]) 38 | feature_code = target_code * source_code 39 | feature_code = tf.reshape(feature_code, [-1, self.config._code_len]) 40 | logits = self.linear_matcher(feature_code) 41 | logits = tf.reshape(logits, [-1, 20]) 42 | labels = tf.one_hot(batch['label'], 20) 43 | loss = 
tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits)) 44 | ans = tf.arg_max(logits, -1) 45 | acc = tx.evals.accuracy(batch['label'], ans) 46 | rank = tf.nn.top_k(logits, k=20)[1] 47 | return loss, acc, rank 48 | 49 | def train(self): 50 | batch = self.iterator.get_next() 51 | loss, acc, _ = self.forward(batch) 52 | op_step = tf.Variable(0, name='op_step') 53 | train_op = tx.core.get_train_op(loss, global_step=op_step, hparams=self.config.opt_hparams) 54 | max_val_acc = 0. 55 | self.saver = tf.train.Saver() 56 | with tf.Session(config=self.gpu_config) as sess: 57 | sess.run(tf.global_variables_initializer()) 58 | sess.run(tf.local_variables_initializer()) 59 | sess.run(tf.tables_initializer()) 60 | for epoch_id in range(self.config._max_epoch): 61 | self.iterator.switch_to_train_data(sess) 62 | cur_step = 0 63 | cnt_acc = [] 64 | while True: 65 | try: 66 | cur_step += 1 67 | feed = {tx.global_mode(): tf.estimator.ModeKeys.TRAIN} 68 | loss, acc_ = sess.run([train_op, acc], feed_dict=feed) 69 | cnt_acc.append(acc_) 70 | if cur_step % 200 == 0: 71 | print('batch {}, loss={}, acc1={}'.format(cur_step, loss, np.mean(cnt_acc[-200:]))) 72 | except tf.errors.OutOfRangeError: 73 | break 74 | op_step = op_step + 1 75 | self.iterator.switch_to_val_data(sess) 76 | cnt_acc = [] 77 | while True: 78 | try: 79 | feed = {tx.global_mode(): tf.estimator.ModeKeys.EVAL} 80 | acc_ = sess.run([acc], feed_dict=feed) 81 | cnt_acc.append(acc_) 82 | except tf.errors.OutOfRangeError: 83 | mean_acc = np.mean(cnt_acc) 84 | print('valid acc1={}'.format(mean_acc)) 85 | if mean_acc > max_val_acc: 86 | max_val_acc = mean_acc 87 | self.saver.save(sess, self.config._save_path) 88 | break 89 | 90 | def test(self): 91 | batch = self.iterator.get_next() 92 | loss, acc, rank = self.forward(batch) 93 | with tf.Session(config=self.gpu_config) as sess: 94 | sess.run(tf.tables_initializer()) 95 | self.saver = tf.train.Saver() 96 | self.saver.restore(sess, self.config._save_path) 97 | self.iterator.switch_to_test_data(sess) 98 | rank_cnt = [] 99 | while True: 100 | try: 101 | feed = {tx.global_mode(): tf.estimator.ModeKeys.PREDICT} 102 | ranks, labels = sess.run([rank, batch['label']], feed_dict=feed) 103 | for i in range(len(ranks)): 104 | rank_cnt.append(np.where(ranks[i]==labels[i])[0][0]) 105 | except tf.errors.OutOfRangeError: 106 | rec = [0,0,0,0,0] 107 | MRR = 0 108 | for rank in rank_cnt: 109 | for i in range(5): 110 | rec[i] += (rank <= i) 111 | MRR += 1 / (rank+1) 112 | print('test rec1@20={:.4f}, rec3@20={:.4f}, rec5@20={:.4f}, MRR={:.4f}'.format( 113 | rec[0]/len(rank_cnt), rec[2]/len(rank_cnt), rec[4]/len(rank_cnt), MRR/len(rank_cnt))) 114 | break 115 | 116 | def retrieve_init(self, sess): 117 | data_batch = self.iterator.get_next() 118 | loss, acc, _ = self.forward(data_batch) 119 | self.corpus = self.data_config._corpus 120 | self.corpus_data = tx.data.MonoTextData(self.data_config.corpus_hparams) 121 | corpus_iterator = tx.data.DataIterator(self.corpus_data) 122 | batch = corpus_iterator.get_next() 123 | corpus_embed = self.embedder(batch['corpus_text_ids']) 124 | utter_code = self.target_encoder(corpus_embed, sequence_length=batch['corpus_length'])[1] 125 | self.corpus_code = np.zeros([0, self.config._code_len]) 126 | 127 | corpus_iterator.switch_to_dataset(sess) 128 | sess.run(tf.tables_initializer()) 129 | saver = tf.train.Saver() 130 | saver.restore(sess, self.config._save_path) 131 | feed = {tx.global_mode(): tf.estimator.ModeKeys.PREDICT} 132 | while True: 133 | try: 134 | 
utter_code_ = sess.run(utter_code, feed_dict=feed) 135 | self.corpus_code = np.concatenate([self.corpus_code, utter_code_], axis=0) 136 | except tf.errors.OutOfRangeError: 137 | break 138 | 139 | self.keywords_embed = tf.nn.l2_normalize(self.embedder(self.kw_list), axis=1) 140 | self.kw_embedding = sess.run(self.keywords_embed) 141 | 142 | self.minor_length_input = tf.placeholder(dtype=tf.int32, shape=(1, 9)) 143 | self.major_length_input = tf.placeholder(dtype=tf.int32, shape=(1)) 144 | self.history_input = tf.placeholder(dtype=object, shape=(9, self.data_config._max_seq_len + 2)) 145 | 146 | history_ids = self.vocab.map_tokens_to_ids(self.history_input) 147 | history_embed = self.embedder(history_ids) 148 | history_code = self.source_encoder(tf.expand_dims(history_embed, axis=0), 149 | sequence_length_minor=self.minor_length_input, 150 | sequence_length_major=self.major_length_input)[1] 151 | select_corpus = tf.cast(self.corpus_code, dtype=tf.float32) 152 | feature_code = self.linear_matcher(select_corpus * history_code) 153 | self.ans_output = tf.nn.top_k(tf.squeeze(feature_code, 1), k=1000)[1] 154 | 155 | def retrieve(self, source, sess): 156 | history, seq_len, turns, context, context_len = source 157 | ans = sess.run(self.ans_output, feed_dict={self.history_input: history, 158 | self.minor_length_input: [seq_len], 159 | self.major_length_input: [turns]}) 160 | flag = 0 161 | reply = self.corpus[ans[0]] 162 | for i in ans: 163 | if i in self.reply_list: # avoid repeat 164 | continue 165 | for wd in kw_tokenize(self.corpus[i]): 166 | if wd in self.data_config._keywords_candi: 167 | tmp_score = sum(self.kw_embedding[self.data_config._keywords_dict[wd]] * 168 | self.kw_embedding[self.data_config._keywords_dict[self.target]]) 169 | if tmp_score > self.score: 170 | reply = self.corpus[i] 171 | self.score = tmp_score 172 | self.next_kw = wd 173 | flag = 1 174 | break 175 | if flag == 0: 176 | continue 177 | break 178 | return reply 179 | 180 | -------------------------------------------------------------------------------- /model/matrix.py: -------------------------------------------------------------------------------- 1 | import texar as tx 2 | import tensorflow as tf 3 | import numpy as np 4 | import pickle 5 | 6 | class Predictor(): 7 | def __init__(self, config_model, config_data, mode=None): 8 | self.config = config_model 9 | self.data_config = config_data 10 | self.gpu_config = tf.ConfigProto() 11 | self.gpu_config.gpu_options.allow_growth = True 12 | self.build_model(mode) 13 | 14 | def build_model(self, mode): 15 | self.train_data = tx.data.MultiAlignedData(self.data_config.data_hparams['train']) 16 | self.valid_data = tx.data.MultiAlignedData(self.data_config.data_hparams['valid']) 17 | self.test_data = tx.data.MultiAlignedData(self.data_config.data_hparams['test']) 18 | self.iterator = tx.data.TrainTestDataIterator(train=self.train_data, val=self.valid_data, test=self.test_data) 19 | self.vocab = self.train_data.vocab(0) 20 | self.source_encoder = tx.modules.HierarchicalRNNEncoder(hparams=self.config.source_encoder_hparams) 21 | self.target_encoder = tx.modules.BidirectionalRNNEncoder(hparams=self.config.target_encoder_hparams) 22 | self.target_kwencoder = tx.modules.BidirectionalRNNEncoder(hparams=self.config.target_kwencoder_hparams) 23 | self.linear_transform = tx.modules.MLPTransformConnector(self.config._code_len // 2) 24 | self.linear_matcher = tx.modules.MLPTransformConnector(1) 25 | self.embedder = 
tx.modules.WordEmbedder(init_value=self.train_data.embedding_init_value(0).word_vecs) 26 | self.kw_list = self.vocab.map_tokens_to_ids(tf.convert_to_tensor(self.data_config._keywords_candi)) 27 | self.kw_vocab = tx.data.Vocab(self.data_config._keywords_path) 28 | 29 | if mode == 'train_kw': 30 | self.pmi_matrix = np.zeros([self.config._vocab_size+4, self.data_config._keywords_num]) 31 | else: 32 | with open(self.config._matrix_save_path, 'rb') as f: 33 | matrix = pickle.load(f) 34 | self.pmi_matrix = tf.convert_to_tensor(matrix,dtype=tf.float32) 35 | 36 | def forward_matrix(self, context_ids): 37 | matching_score = tf.gather(self.pmi_matrix, context_ids) 38 | return tf.reduce_sum(tf.log(matching_score), axis=0) 39 | 40 | def predict_keywords(self, batch): 41 | keywords_ids = self.kw_vocab.map_tokens_to_ids(batch['keywords_text']) 42 | matching_score = tf.map_fn(lambda x: self.forward_matrix(x), batch['context_text_ids'], 43 | dtype=tf.float32, parallel_iterations=True) 44 | kw_labels = tf.map_fn(lambda x: tf.sparse_to_dense(x, [self.kw_vocab.size], 1., 0., False), 45 | keywords_ids, dtype=tf.float32, parallel_iterations=True)[:, 4:] 46 | kw_ans = tf.arg_max(matching_score, -1) 47 | acc_label = tf.map_fn(lambda x: tf.gather(x[0], x[1]), (kw_labels, kw_ans), dtype=tf.float32) 48 | acc = tf.reduce_mean(acc_label) 49 | kws = tf.nn.top_k(matching_score, k=5)[1] 50 | kws = tf.reshape(kws,[-1]) 51 | kws = tf.map_fn(lambda x: self.kw_list[x], kws, dtype=tf.int64) 52 | kws = tf.reshape(kws,[-1, 5]) 53 | return acc, kws 54 | 55 | def train_keywords(self): 56 | batch = self.iterator.get_next() 57 | acc, _ = self.predict_keywords(batch) 58 | with tf.Session(config=self.gpu_config) as sess: 59 | sess.run(tf.global_variables_initializer()) 60 | sess.run(tf.local_variables_initializer()) 61 | sess.run(tf.tables_initializer()) 62 | self.iterator.switch_to_train_data(sess) 63 | 64 | batchid = 0 65 | while True: 66 | try: 67 | batchid += 1 68 | feed = {tx.global_mode(): tf.estimator.ModeKeys.TRAIN} 69 | source_keywords, target_keywords = sess.run([batch['context_text_ids'], 70 | batch['keywords_text_ids']], feed_dict=feed) 71 | for i in range(len(source_keywords)): 72 | for skw_id in source_keywords[i]: 73 | if skw_id == 0: 74 | break 75 | for tkw_id in target_keywords[i]: 76 | if skw_id >= 3 and tkw_id >= 3: 77 | tkw = self.config._vocab[tkw_id-4] 78 | if tkw in self.data_config._keywords_candi: 79 | tkw_id = self.data_config._keywords_dict[tkw] 80 | self.pmi_matrix[skw_id][tkw_id] += 1 81 | 82 | except tf.errors.OutOfRangeError: 83 | break 84 | self.pmi_matrix += 0.5 85 | self.pmi_matrix = self.pmi_matrix / (np.sum(self.pmi_matrix, axis=0) + 1) 86 | with open(self.config._matrix_save_path,'wb') as f: 87 | pickle.dump(self.pmi_matrix, f) 88 | 89 | def test_keywords(self): 90 | batch = self.iterator.get_next() 91 | acc, kws = self.predict_keywords(batch) 92 | with tf.Session(config=self.gpu_config) as sess: 93 | sess.run(tf.global_variables_initializer()) 94 | sess.run(tf.local_variables_initializer()) 95 | sess.run(tf.tables_initializer()) 96 | self.iterator.switch_to_test_data(sess) 97 | cnt_acc, cnt_rec1, cnt_rec3, cnt_rec5 = [], [], [], [] 98 | while True: 99 | try: 100 | feed = {tx.global_mode(): tf.estimator.ModeKeys.PREDICT} 101 | acc_, kw_ans, kw_labels = sess.run([acc, kws, batch['keywords_text_ids']], feed_dict=feed) 102 | cnt_acc.append(acc_) 103 | rec = [0,0,0,0,0] 104 | sum_kws = 0 105 | for i in range(len(kw_ans)): 106 | sum_kws += sum(kw_labels[i] > 3) 107 | for j in range(5): 108 | if 
kw_ans[i][j] in kw_labels[i]: 109 | for k in range(j, 5): 110 | rec[k] += 1 111 | cnt_rec1.append(rec[0]/sum_kws) 112 | cnt_rec3.append(rec[2]/sum_kws) 113 | cnt_rec5.append(rec[4]/sum_kws) 114 | 115 | except tf.errors.OutOfRangeError: 116 | print('test_kw acc@1={:.4f}, rec@1={:.4f}, rec@3={:.4f}, rec@5={:.4f}'.format( 117 | np.mean(cnt_acc), np.mean(cnt_rec1), np.mean(cnt_rec3), np.mean(cnt_rec5))) 118 | break 119 | 120 | 121 | def forward(self, batch): 122 | matching_score = tf.map_fn(lambda x: self.forward_matrix(x), batch['context_text_ids'], 123 | dtype=tf.float32, parallel_iterations=True) 124 | kw_weight, predict_kw = tf.nn.top_k(matching_score, k=3) 125 | predict_kw = tf.reshape(predict_kw, [-1]) 126 | predict_kw = tf.map_fn(lambda x: self.kw_list[x], predict_kw, dtype=tf.int64) 127 | predict_kw = tf.reshape(predict_kw, [-1, 3]) 128 | embed_code = self.embedder(predict_kw) 129 | embed_code = tf.reduce_sum(embed_code, axis=1) 130 | embed_code = self.linear_transform(embed_code) 131 | 132 | source_embed = self.embedder(batch['source_text_ids']) 133 | target_embed = self.embedder(batch['target_text_ids']) 134 | target_embed = tf.reshape(target_embed, [-1, self.data_config._max_seq_len + 2, self.embedder.dim]) 135 | target_length = tf.reshape(batch['target_length'], [-1]) 136 | source_code = self.source_encoder( 137 | source_embed, 138 | sequence_length_minor=batch['source_length'], 139 | sequence_length_major=batch['source_utterance_cnt'])[1] 140 | target_code = self.target_encoder( 141 | target_embed, 142 | sequence_length=target_length)[1] 143 | target_kwcode = self.target_kwencoder( 144 | target_embed, 145 | sequence_length=target_length)[1] 146 | target_code = tf.concat([target_code[0], target_code[1], target_kwcode[0], target_kwcode[1]], -1) 147 | target_code = tf.reshape(target_code, [-1, 20, self.config._code_len]) 148 | 149 | source_code = tf.concat([source_code, embed_code], -1) 150 | source_code = tf.expand_dims(source_code, 1) 151 | source_code = tf.tile(source_code, [1, 20, 1]) 152 | feature_code = target_code * source_code 153 | feature_code = tf.reshape(feature_code, [-1, self.config._code_len]) 154 | 155 | logits = self.linear_matcher(feature_code) 156 | logits = tf.reshape(logits, [-1, 20]) 157 | labels = tf.one_hot(batch['label'], 20) 158 | loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits)) 159 | ans = tf.arg_max(logits, -1) 160 | acc = tx.evals.accuracy(batch['label'], ans) 161 | rank = tf.nn.top_k(logits, k=20)[1] 162 | return loss, acc, rank 163 | 164 | def train(self): 165 | batch = self.iterator.get_next() 166 | loss, acc, _ = self.forward(batch) 167 | op_step = tf.Variable(0, name='retrieval_step') 168 | train_op = tx.core.get_train_op(loss, global_step=op_step, hparams=self.config.opt_hparams) 169 | max_val_acc = 0. 
170 | with tf.Session(config=self.gpu_config) as sess: 171 | sess.run(tf.tables_initializer()) 172 | sess.run(tf.global_variables_initializer()) 173 | sess.run(tf.local_variables_initializer()) 174 | saver = tf.train.Saver() 175 | for epoch_id in range(self.config._max_epoch): 176 | self.iterator.switch_to_train_data(sess) 177 | cur_step = 0 178 | cnt_acc = [] 179 | while True: 180 | try: 181 | cur_step += 1 182 | feed = {tx.global_mode(): tf.estimator.ModeKeys.TRAIN} 183 | loss, acc_ = sess.run([train_op, acc], feed_dict=feed) 184 | cnt_acc.append(acc_) 185 | if cur_step % 200 == 0: 186 | print('batch {}, loss={}, acc1={}'.format(cur_step, loss, np.mean(cnt_acc[-200:]))) 187 | except tf.errors.OutOfRangeError: 188 | break 189 | self.iterator.switch_to_val_data(sess) 190 | 191 | cnt_acc= [] 192 | while True: 193 | try: 194 | feed = {tx.global_mode(): tf.estimator.ModeKeys.EVAL} 195 | acc_ = sess.run(acc, feed_dict=feed) 196 | cnt_acc.append(acc_) 197 | except tf.errors.OutOfRangeError: 198 | mean_acc = np.mean(cnt_acc) 199 | print('valid acc1={}'.format(mean_acc)) 200 | if mean_acc > max_val_acc: 201 | max_val_acc = mean_acc 202 | saver.save(sess, self.config._save_path) 203 | break 204 | 205 | def test(self): 206 | batch = self.iterator.get_next() 207 | loss, acc, rank = self.forward(batch) 208 | with tf.Session(config=self.gpu_config) as sess: 209 | sess.run(tf.tables_initializer()) 210 | self.saver = tf.train.Saver() 211 | self.saver.restore(sess, self.config._save_path) 212 | self.iterator.switch_to_test_data(sess) 213 | rank_cnt = [] 214 | while True: 215 | try: 216 | feed = {tx.global_mode(): tf.estimator.ModeKeys.PREDICT} 217 | ranks, labels = sess.run([rank, batch['label']], feed_dict=feed) 218 | for i in range(len(ranks)): 219 | rank_cnt.append(np.where(ranks[i]==labels[i])[0][0]) 220 | except tf.errors.OutOfRangeError: 221 | rec = [0,0,0,0,0] 222 | MRR = 0 223 | for rank in rank_cnt: 224 | for i in range(5): 225 | rec[i] += (rank <= i) 226 | MRR += 1 / (rank+1) 227 | print('test rec1@20={:.4f}, rec3@20={:.4f}, rec5@20={:.4f}, MRR={:.4f}'.format( 228 | rec[0]/len(rank_cnt), rec[2]/len(rank_cnt), rec[4]/len(rank_cnt), MRR/len(rank_cnt))) 229 | break 230 | 231 | def retrieve_init(self, sess): 232 | data_batch = self.iterator.get_next() 233 | loss, acc, _ = self.forward(data_batch) 234 | self.corpus = self.data_config._corpus 235 | self.corpus_data = tx.data.MonoTextData(self.data_config.corpus_hparams) 236 | corpus_iterator = tx.data.DataIterator(self.corpus_data) 237 | batch = corpus_iterator.get_next() 238 | corpus_embed = self.embedder(batch['corpus_text_ids']) 239 | utter_code = self.target_encoder(corpus_embed, sequence_length=batch['corpus_length'])[1] 240 | utter_kwcode = self.target_kwencoder(corpus_embed, sequence_length=batch['corpus_length'])[1] 241 | utter_code = tf.concat([utter_code[0], utter_code[1], utter_kwcode[0], utter_kwcode[1]], -1) 242 | self.corpus_code = np.zeros([0, self.config._code_len]) 243 | 244 | corpus_iterator.switch_to_dataset(sess) 245 | sess.run(tf.tables_initializer()) 246 | saver = tf.train.Saver() 247 | saver.restore(sess, self.config._save_path) 248 | feed = {tx.global_mode(): tf.estimator.ModeKeys.PREDICT} 249 | while True: 250 | try: 251 | utter_code_ = sess.run(utter_code, feed_dict=feed) 252 | self.corpus_code = np.concatenate([self.corpus_code, utter_code_], axis=0) 253 | except tf.errors.OutOfRangeError: 254 | break 255 | self.minor_length_input = tf.placeholder(dtype=tf.int32, shape=(1, 9)) 256 | self.major_length_input = 
tf.placeholder(dtype=tf.int32, shape=(1)) 257 | self.history_input = tf.placeholder(dtype=object, shape=(9, self.data_config._max_seq_len + 2)) 258 | self.keywords_embed = tf.nn.l2_normalize(self.embedder(self.kw_list), axis=1) 259 | self.kw_embedding = sess.run(self.keywords_embed) 260 | 261 | # predict keyword 262 | self.context_input = tf.placeholder(dtype=object) 263 | context_ids = self.vocab.map_tokens_to_ids(self.context_input) 264 | matching_score = self.forward_matrix(context_ids) 265 | self.candi_output =tf.nn.top_k(matching_score, self.data_config._keywords_num)[1] 266 | 267 | # retrieve 268 | self.minor_length_input = tf.placeholder(dtype=tf.int32, shape=(1, 9)) 269 | self.major_length_input = tf.placeholder(dtype=tf.int32, shape=(1)) 270 | self.history_input = tf.placeholder(dtype=object, shape=(9, self.data_config._max_seq_len + 2)) 271 | self.kw_input = tf.placeholder(dtype=tf.int32) 272 | history_ids = self.vocab.map_tokens_to_ids(self.history_input) 273 | history_embed = self.embedder(history_ids) 274 | history_code = self.source_encoder(tf.expand_dims(history_embed, axis=0), 275 | sequence_length_minor=self.minor_length_input, 276 | sequence_length_major=self.major_length_input)[1] 277 | self.next_kw_ids = self.kw_list[self.kw_input] 278 | embed_code = tf.expand_dims(self.embedder(self.next_kw_ids), 0) 279 | embed_code = self.linear_transform(embed_code) 280 | history_code = tf.concat([history_code, embed_code], 1) 281 | select_corpus = tf.cast(self.corpus_code, dtype=tf.float32) 282 | feature_code = self.linear_matcher(select_corpus * history_code) 283 | self.ans_output = tf.nn.top_k(tf.squeeze(feature_code,1), k=self.data_config._retrieval_candidates)[1] 284 | 285 | def retrieve(self, history_all, sess): 286 | history, seq_len, turns, context, context_len = history_all 287 | kw_candi = sess.run(self.candi_output, feed_dict={self.context_input: context[:context_len]}) 288 | for kw in kw_candi: 289 | tmp_score = sum(self.kw_embedding[kw] * self.kw_embedding[self.data_config._keywords_dict[self.target]]) 290 | if tmp_score > self.score: 291 | self.score = tmp_score 292 | self.next_kw = self.data_config._keywords_candi[kw] 293 | break 294 | ans = sess.run(self.ans_output, feed_dict={self.history_input: history, 295 | self.minor_length_input: [seq_len], self.major_length_input: [turns], 296 | self.kw_input: self.data_config._keywords_dict[self.next_kw]}) 297 | for i in range(self.data_config._max_turns + 1): 298 | if ans[i] not in self.reply_list: 299 | self.reply_list.append(ans[i]) 300 | reply = self.corpus[ans[i]] 301 | break 302 | return reply 303 | -------------------------------------------------------------------------------- /model/neural.py: -------------------------------------------------------------------------------- 1 | import texar as tx 2 | import tensorflow as tf 3 | import numpy as np 4 | from preprocess.data_utils import kw_tokenize 5 | 6 | 7 | class Predictor(): 8 | def __init__(self, config_model, config_data, mode=None): 9 | self.config = config_model 10 | self.data_config = config_data 11 | self.gpu_config = tf.ConfigProto() 12 | self.gpu_config.gpu_options.allow_growth = True 13 | self.build_model() 14 | 15 | def build_model(self): 16 | self.train_data = tx.data.MultiAlignedData(self.data_config.data_hparams['train']) 17 | self.valid_data = tx.data.MultiAlignedData(self.data_config.data_hparams['valid']) 18 | self.test_data = tx.data.MultiAlignedData(self.data_config.data_hparams['test']) 19 | self.iterator = 
tx.data.TrainTestDataIterator(train=self.train_data, val=self.valid_data, test=self.test_data) 20 | self.vocab = self.train_data.vocab(0) 21 | self.source_encoder = tx.modules.HierarchicalRNNEncoder(hparams=self.config.source_encoder_hparams) 22 | self.target_encoder = tx.modules.BidirectionalRNNEncoder(hparams=self.config.target_encoder_hparams) 23 | self.target_kwencoder = tx.modules.BidirectionalRNNEncoder(hparams=self.config.target_kwencoder_hparams) 24 | self.linear_transform = tx.modules.MLPTransformConnector(self.config._code_len // 2) 25 | self.linear_matcher = tx.modules.MLPTransformConnector(1) 26 | self.context_encoder = tx.modules.UnidirectionalRNNEncoder(hparams=self.config.context_encoder_hparams) 27 | self.predict_layer = tx.modules.MLPTransformConnector(self.data_config._keywords_num) 28 | self.embedder = tx.modules.WordEmbedder(init_value=self.train_data.embedding_init_value(0).word_vecs) 29 | self.kw_list = self.vocab.map_tokens_to_ids(tf.convert_to_tensor(self.data_config._keywords_candi)) 30 | self.kw_vocab = tx.data.Vocab(self.data_config._keywords_path) 31 | 32 | def forward_neural(self, context_ids, context_length): 33 | context_embed = self.embedder(context_ids) 34 | context_code = self.context_encoder(context_embed, sequence_length=context_length)[1] 35 | keyword_score = self.predict_layer(context_code) 36 | return keyword_score 37 | 38 | def predict_keywords(self, batch): 39 | matching_score = self.forward_neural(batch['context_text_ids'], batch['context_length']) 40 | keywords_ids = self.kw_vocab.map_tokens_to_ids(batch['keywords_text']) 41 | kw_labels = tf.map_fn(lambda x: tf.sparse_to_dense(x, [self.kw_vocab.size], 1., 0., False), 42 | keywords_ids, dtype=tf.float32, parallel_iterations=True)[:, 4:] 43 | loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=kw_labels, logits=matching_score) 44 | loss = tf.reduce_mean(loss) 45 | kw_ans = tf.arg_max(matching_score, -1) 46 | acc_label = tf.map_fn(lambda x: tf.gather(x[0], x[1]), (kw_labels, kw_ans), dtype=tf.float32) 47 | acc = tf.reduce_mean(acc_label) 48 | kws = tf.nn.top_k(matching_score, k=5)[1] 49 | kws = tf.reshape(kws,[-1]) 50 | kws = tf.map_fn(lambda x: self.kw_list[x], kws, dtype=tf.int64) 51 | kws = tf.reshape(kws,[-1, 5]) 52 | return loss, acc, kws 53 | 54 | def train_keywords(self): 55 | batch = self.iterator.get_next() 56 | loss, acc, _ = self.predict_keywords(batch) 57 | op_step = tf.Variable(0, name='op_step') 58 | train_op = tx.core.get_train_op(loss, global_step=op_step, hparams=self.config.neural_opt_hparams) 59 | max_val_acc = 0. 
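# Neural keyword-predictor training loop: optimizes the sigmoid cross-entropy
# loss from predict_keywords() over the keyword label vector, validates after
# every epoch, and keeps only the checkpoint (self.config._neural_save_path)
# whose validation acc@1 is the best observed.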
60 | self.saver = tf.train.Saver() 61 | with tf.Session(config=self.gpu_config) as sess: 62 | sess.run(tf.global_variables_initializer()) 63 | sess.run(tf.local_variables_initializer()) 64 | sess.run(tf.tables_initializer()) 65 | for epoch_id in range(self.config._max_epoch): 66 | self.iterator.switch_to_train_data(sess) 67 | cur_step = 0 68 | cnt_acc = [] 69 | while True: 70 | try: 71 | cur_step += 1 72 | feed = {tx.global_mode(): tf.estimator.ModeKeys.TRAIN} 73 | loss_, acc_ = sess.run([train_op, acc], feed_dict=feed) 74 | cnt_acc.append(acc_) 75 | if cur_step % 200 == 0: 76 | print('batch {}, loss={}, acc1={}'.format(cur_step, loss_, np.mean(cnt_acc[-200:]))) 77 | except tf.errors.OutOfRangeError: 78 | break 79 | self.iterator.switch_to_val_data(sess) 80 | cnt_acc = [] 81 | while True: 82 | try: 83 | feed = {tx.global_mode(): tf.estimator.ModeKeys.EVAL} 84 | acc_ = sess.run(acc, feed_dict=feed) 85 | cnt_acc.append(acc_) 86 | except tf.errors.OutOfRangeError: 87 | mean_acc = np.mean(cnt_acc) 88 | if mean_acc > max_val_acc: 89 | max_val_acc = mean_acc 90 | self.saver.save(sess, self.config._neural_save_path) 91 | print('epoch_id {}, valid acc1={}'.format(epoch_id+1, mean_acc)) 92 | break 93 | 94 | def test_keywords(self): 95 | batch = self.iterator.get_next() 96 | loss, acc, kws = self.predict_keywords(batch) 97 | saver = tf.train.Saver() 98 | with tf.Session(config=self.gpu_config) as sess: 99 | sess.run(tf.global_variables_initializer()) 100 | sess.run(tf.local_variables_initializer()) 101 | sess.run(tf.tables_initializer()) 102 | saver.restore(sess, self.config._neural_save_path) 103 | self.iterator.switch_to_test_data(sess) 104 | cnt_acc, cnt_rec1, cnt_rec3, cnt_rec5 = [], [], [], [] 105 | while True: 106 | try: 107 | feed = {tx.global_mode(): tf.estimator.ModeKeys.PREDICT} 108 | acc_, kw_ans, kw_labels = sess.run([acc, kws, batch['keywords_text_ids']], feed_dict=feed) 109 | cnt_acc.append(acc_) 110 | rec = [0,0,0,0,0] 111 | sum_kws = 0 112 | for i in range(len(kw_ans)): 113 | sum_kws += sum(kw_labels[i] > 3) 114 | for j in range(5): 115 | if kw_ans[i][j] in kw_labels[i]: 116 | for k in range(j, 5): 117 | rec[k] += 1 118 | cnt_rec1.append(rec[0]/sum_kws) 119 | cnt_rec3.append(rec[2]/sum_kws) 120 | cnt_rec5.append(rec[4]/sum_kws) 121 | 122 | except tf.errors.OutOfRangeError: 123 | print('test_kw acc@1={:.4f}, rec@1={:.4f}, rec@3={:.4f}, rec@5={:.4f}'.format( 124 | np.mean(cnt_acc), np.mean(cnt_rec1), np.mean(cnt_rec3), np.mean(cnt_rec5))) 125 | break 126 | 127 | def forward(self, batch): 128 | matching_score = self.forward_neural(batch['context_text_ids'], batch['context_length']) 129 | kw_weight, predict_kw = tf.nn.top_k(matching_score, k=3) 130 | predict_kw = tf.reshape(predict_kw, [-1]) 131 | predict_kw = tf.map_fn(lambda x: self.kw_list[x], predict_kw, dtype=tf.int64) 132 | predict_kw = tf.reshape(predict_kw, [-1, 3]) 133 | embed_code = self.embedder(predict_kw) 134 | embed_code = tf.reduce_sum(embed_code, axis=1) 135 | embed_code = self.linear_transform(embed_code) 136 | 137 | source_embed = self.embedder(batch['source_text_ids']) 138 | target_embed = self.embedder(batch['target_text_ids']) 139 | target_embed = tf.reshape(target_embed, [-1, self.data_config._max_seq_len + 2, self.embedder.dim]) 140 | target_length = tf.reshape(batch['target_length'], [-1]) 141 | source_code = self.source_encoder( 142 | source_embed, 143 | sequence_length_minor=batch['source_length'], 144 | sequence_length_major=batch['source_utterance_cnt'])[1] # 145 | target_code = self.target_encoder( 146 | 
target_embed, 147 | sequence_length=target_length)[1] 148 | target_kwcode = self.target_kwencoder( 149 | target_embed, 150 | sequence_length=target_length)[1] 151 | target_code = tf.concat([target_code[0], target_code[1], target_kwcode[0], target_kwcode[1]], -1) 152 | target_code = tf.reshape(target_code, [-1, 20, self.config._code_len]) 153 | 154 | source_code = tf.concat([source_code, embed_code], -1) 155 | source_code = tf.expand_dims(source_code, 1) 156 | source_code = tf.tile(source_code, [1, 20, 1]) 157 | feature_code = target_code * source_code 158 | feature_code = tf.reshape(feature_code, [-1, self.config._code_len]) 159 | 160 | logits = self.linear_matcher(feature_code) 161 | logits = tf.reshape(logits, [-1, 20]) 162 | labels = tf.one_hot(batch['label'], 20) 163 | loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits)) 164 | ans = tf.arg_max(logits, -1) 165 | acc = tx.evals.accuracy(batch['label'], ans) 166 | rank = tf.nn.top_k(logits, k=20)[1] 167 | return loss, acc, rank 168 | 169 | def train(self): 170 | batch = self.iterator.get_next() 171 | kw_loss, kw_acc, _ = self.predict_keywords(batch) 172 | kw_saver = tf.train.Saver() 173 | loss, acc, _ = self.forward(batch) 174 | op_step = tf.Variable(0, name='retrieval_step') 175 | train_op = tx.core.get_train_op(loss, global_step=op_step, hparams=self.config.opt_hparams) 176 | max_val_acc = 0. 177 | with tf.Session(config=self.gpu_config) as sess: 178 | sess.run(tf.tables_initializer()) 179 | sess.run(tf.global_variables_initializer()) 180 | sess.run(tf.local_variables_initializer()) 181 | kw_saver.restore(sess, self.config._neural_save_path) 182 | saver = tf.train.Saver() 183 | for epoch_id in range(self.config._max_epoch): 184 | self.iterator.switch_to_train_data(sess) 185 | cur_step = 0 186 | cnt_acc = [] 187 | while True: 188 | try: 189 | cur_step += 1 190 | feed = {tx.global_mode(): tf.estimator.ModeKeys.TRAIN} 191 | loss, acc_ = sess.run([train_op, acc], feed_dict=feed) 192 | cnt_acc.append(acc_) 193 | if cur_step % 200 == 0: 194 | print('batch {}, loss={}, acc1={}'.format(cur_step, loss, np.mean(cnt_acc[-200:]))) 195 | except tf.errors.OutOfRangeError: 196 | break 197 | 198 | self.iterator.switch_to_val_data(sess) 199 | cnt_acc, cnt_kwacc = [], [] 200 | while True: 201 | try: 202 | feed = {tx.global_mode(): tf.estimator.ModeKeys.EVAL} 203 | acc_, kw_acc_ = sess.run([acc, kw_acc], feed_dict=feed) 204 | cnt_acc.append(acc_) 205 | cnt_kwacc.append(kw_acc_) 206 | except tf.errors.OutOfRangeError: 207 | mean_acc = np.mean(cnt_acc) 208 | print('valid acc1={}, kw_acc1={}'.format(mean_acc, np.mean(cnt_kwacc))) 209 | if mean_acc > max_val_acc: 210 | max_val_acc = mean_acc 211 | saver.save(sess, self.config._save_path) 212 | break 213 | 214 | def test(self): 215 | batch = self.iterator.get_next() 216 | loss, acc, rank = self.forward(batch) 217 | with tf.Session(config=self.gpu_config) as sess: 218 | sess.run(tf.tables_initializer()) 219 | self.saver = tf.train.Saver() 220 | self.saver.restore(sess, self.config._save_path) 221 | self.iterator.switch_to_test_data(sess) 222 | rank_cnt = [] 223 | while True: 224 | try: 225 | feed = {tx.global_mode(): tf.estimator.ModeKeys.PREDICT} 226 | ranks, labels = sess.run([rank, batch['label']], feed_dict=feed) 227 | for i in range(len(ranks)): 228 | rank_cnt.append(np.where(ranks[i]==labels[i])[0][0]) 229 | except tf.errors.OutOfRangeError: 230 | rec = [0,0,0,0,0] 231 | MRR = 0 232 | for rank in rank_cnt: 233 | for i in range(5): 234 | rec[i] += (rank <= i) 235 | 
MRR += 1 / (rank+1) 236 | print('test rec1@20={:.4f}, rec3@20={:.4f}, rec5@20={:.4f}, MRR={:.4f}'.format( 237 | rec[0]/len(rank_cnt), rec[2]/len(rank_cnt), rec[4]/len(rank_cnt), MRR/len(rank_cnt))) 238 | break 239 | 240 | 241 | def retrieve_init(self, sess): 242 | data_batch = self.iterator.get_next() 243 | loss, acc, _ = self.forward(data_batch) 244 | self.corpus = self.data_config._corpus 245 | self.corpus_data = tx.data.MonoTextData(self.data_config.corpus_hparams) 246 | corpus_iterator = tx.data.DataIterator(self.corpus_data) 247 | batch = corpus_iterator.get_next() 248 | corpus_embed = self.embedder(batch['corpus_text_ids']) 249 | utter_code = self.target_encoder(corpus_embed, sequence_length=batch['corpus_length'])[1] 250 | utter_kwcode = self.target_kwencoder(corpus_embed, sequence_length=batch['corpus_length'])[1] 251 | utter_code = tf.concat([utter_code[0], utter_code[1], utter_kwcode[0], utter_kwcode[1]], -1) 252 | self.corpus_code = np.zeros([0, self.config._code_len]) 253 | 254 | corpus_iterator.switch_to_dataset(sess) 255 | sess.run(tf.tables_initializer()) 256 | saver = tf.train.Saver() 257 | saver.restore(sess, self.config._save_path) 258 | feed = {tx.global_mode(): tf.estimator.ModeKeys.PREDICT} 259 | while True: 260 | try: 261 | utter_code_ = sess.run(utter_code, feed_dict=feed) 262 | self.corpus_code = np.concatenate([self.corpus_code, utter_code_], axis=0) 263 | except tf.errors.OutOfRangeError: 264 | break 265 | self.keywords_embed = tf.nn.l2_normalize(self.embedder(self.kw_list), axis=1) 266 | self.kw_embedding = sess.run(self.keywords_embed) 267 | 268 | # predict keyword 269 | self.context_input = tf.placeholder(dtype=object, shape=(20)) 270 | self.context_length_input = tf.placeholder(dtype=tf.int32, shape=(1)) 271 | context_ids = tf.expand_dims(self.vocab.map_tokens_to_ids(self.context_input), 0) 272 | context_embed = self.embedder(context_ids) 273 | context_code = self.context_encoder(context_embed, sequence_length=self.context_length_input)[1] 274 | matching_score = self.predict_layer(context_code) 275 | self.candi_output =tf.nn.top_k(tf.squeeze(matching_score, 0), self.data_config._keywords_num)[1] 276 | 277 | # retrieve 278 | self.minor_length_input = tf.placeholder(dtype=tf.int32, shape=(1, 9)) 279 | self.major_length_input = tf.placeholder(dtype=tf.int32, shape=(1)) 280 | self.history_input = tf.placeholder(dtype=object, shape=(9, self.data_config._max_seq_len + 2)) 281 | self.kw_input = tf.placeholder(dtype=tf.int32) 282 | history_ids = self.vocab.map_tokens_to_ids(self.history_input) 283 | history_embed = self.embedder(history_ids) 284 | history_code = self.source_encoder(tf.expand_dims(history_embed, axis=0), 285 | sequence_length_minor=self.minor_length_input, 286 | sequence_length_major=self.major_length_input)[1] 287 | self.next_kw_ids = self.kw_list[self.kw_input] 288 | embed_code = tf.expand_dims(self.embedder(self.next_kw_ids), 0) 289 | embed_code = self.linear_transform(embed_code) 290 | history_code = tf.concat([history_code, embed_code], 1) 291 | select_corpus = tf.cast(self.corpus_code, dtype=tf.float32) 292 | feature_code = self.linear_matcher(select_corpus * history_code) 293 | self.ans_output = tf.nn.top_k(tf.squeeze(feature_code,1), k=self.data_config._retrieval_candidates)[1] 294 | 295 | def retrieve(self, history_all, sess): 296 | history, seq_len, turns, context, context_len = history_all 297 | kw_candi = sess.run(self.candi_output, feed_dict={self.context_input: context, 298 | self.context_length_input: [context_len]}) 299 | for kw in 
kw_candi: 300 | tmp_score = sum(self.kw_embedding[kw] * self.kw_embedding[self.data_config._keywords_dict[self.target]]) 301 | if tmp_score > self.score: 302 | self.score = tmp_score 303 | self.next_kw = self.data_config._keywords_candi[kw] 304 | break 305 | ans = sess.run(self.ans_output, feed_dict={self.history_input: history, 306 | self.minor_length_input: [seq_len], self.major_length_input: [turns], 307 | self.kw_input: self.data_config._keywords_dict[self.next_kw]}) 308 | flag = 0 309 | reply = self.corpus[ans[0]] 310 | for i in ans: 311 | if i in self.reply_list: # avoid repeat 312 | continue 313 | for wd in kw_tokenize(self.corpus[i]): 314 | if wd in self.data_config._keywords_candi: 315 | tmp_score = sum(self.kw_embedding[self.data_config._keywords_dict[wd]] * 316 | self.kw_embedding[self.data_config._keywords_dict[self.target]]) 317 | if tmp_score > self.score: 318 | reply = self.corpus[i] 319 | self.score = tmp_score 320 | self.next_kw = wd 321 | flag = 1 322 | break 323 | if flag == 0: 324 | continue 325 | break 326 | return reply 327 | -------------------------------------------------------------------------------- /model/kernel.py: -------------------------------------------------------------------------------- 1 | import texar as tx 2 | import tensorflow as tf 3 | import numpy as np 4 | from preprocess.data_utils import kw_tokenize 5 | 6 | 7 | class Predictor(): 8 | def __init__(self, config_model, config_data, mode=None): 9 | self.config = config_model 10 | self.data_config = config_data 11 | self.gpu_config = tf.ConfigProto() 12 | self.gpu_config.gpu_options.allow_growth = True 13 | self.build_model() 14 | 15 | def build_model(self): 16 | self.train_data = tx.data.MultiAlignedData(self.data_config.data_hparams['train']) 17 | self.valid_data = tx.data.MultiAlignedData(self.data_config.data_hparams['valid']) 18 | self.test_data = tx.data.MultiAlignedData(self.data_config.data_hparams['test']) 19 | self.iterator = tx.data.TrainTestDataIterator(train=self.train_data, val=self.valid_data, test=self.test_data) 20 | self.vocab = self.train_data.vocab(0) 21 | self.embedder = tx.modules.WordEmbedder(init_value=self.train_data.embedding_init_value(0).word_vecs) 22 | self.kw_embedder = tx.modules.WordEmbedder(init_value=self.train_data.embedding_init_value(0).word_vecs) 23 | self.source_encoder = tx.modules.HierarchicalRNNEncoder(hparams=self.config.source_encoder_hparams) 24 | self.target_encoder = tx.modules.BidirectionalRNNEncoder(hparams=self.config.target_encoder_hparams) 25 | self.target_kwencoder = tx.modules.BidirectionalRNNEncoder(hparams=self.config.target_kwencoder_hparams) 26 | self.linear_transform = tx.modules.MLPTransformConnector(self.config._code_len // 2) 27 | self.linear_matcher = tx.modules.MLPTransformConnector(1) 28 | self.linear_kernel = tx.modules.MLPTransformConnector(1) 29 | self.kw_list = self.vocab.map_tokens_to_ids(tf.convert_to_tensor(self.data_config._keywords_candi)) 30 | self.kw_vocab = tx.data.Vocab(self.data_config._keywords_path) 31 | self.keywords_embed = tf.nn.l2_normalize(self.kw_embedder(self.kw_list), axis=1) 32 | 33 | def forward_kernel(self, kw_embed, context_ids): 34 | kernel_sigma = self.config._kernel_sigma 35 | mu = tf.convert_to_tensor(self.config._kernel_mu) 36 | mask = tf.cast(context_ids > 3, dtype=tf.float32) 37 | context_embed = self.kw_embedder(context_ids) 38 | context_embed = tf.nn.l2_normalize(context_embed, axis=2) 39 | similarity_matrix = tf.reduce_sum(kw_embed * context_embed, axis=2) 40 | similarity_matrix = 
tf.tile(tf.expand_dims(similarity_matrix, 2), [1, 1, len(self.config._kernel_mu)]) 41 | matching_feature = tf.exp(-(similarity_matrix - mu) ** 2 / (kernel_sigma ** 2)) 42 | matching_feature = matching_feature * tf.tile(tf.expand_dims(mask, 2), [1, 1, len(self.config._kernel_mu)]) 43 | matching_feature = tf.reduce_sum(matching_feature, axis=1) 44 | matching_score = self.linear_kernel(matching_feature) 45 | matching_score = tf.squeeze(matching_score, 1) 46 | return matching_score 47 | 48 | def predict_keywords(self, batch): 49 | keywords_ids = self.kw_vocab.map_tokens_to_ids(batch['keywords_text']) 50 | matching_score = tf.map_fn(lambda kw_embed: self.forward_kernel(kw_embed, batch['context_text_ids']), 51 | self.keywords_embed, dtype=tf.float32, parallel_iterations=True) 52 | matching_score = tf.transpose(matching_score) 53 | matching_score = tf.nn.softmax(matching_score) 54 | kw_labels = tf.map_fn(lambda x: tf.sparse_to_dense(x, [self.kw_vocab.size], 1., 0., False), 55 | keywords_ids, dtype=tf.float32, parallel_iterations=True)[:, 4:] 56 | loss = tf.reduce_sum(-tf.log(matching_score) * kw_labels) / tf.reduce_sum(kw_labels) 57 | kw_ans = tf.arg_max(matching_score, -1) 58 | acc_label = tf.map_fn(lambda x: tf.gather(x[0], x[1]), (kw_labels, kw_ans), dtype=tf.float32) 59 | acc = tf.reduce_mean(acc_label) 60 | kws = tf.nn.top_k(matching_score, k=5)[1] 61 | kws = tf.reshape(kws,[-1]) 62 | kws = tf.map_fn(lambda x: self.kw_list[x], kws, dtype=tf.int64) 63 | kws = tf.reshape(kws,[-1, 5]) 64 | return loss, acc, kws 65 | 66 | def train_keywords(self): 67 | batch = self.iterator.get_next() 68 | loss, acc, _ = self.predict_keywords(batch) 69 | op_step = tf.Variable(0, name='op_step') 70 | train_op = tx.core.get_train_op(loss, global_step=op_step, hparams=self.config.kernel_opt_hparams) 71 | max_val_acc, stopping_flag = 0, 0 72 | self.saver = tf.train.Saver() 73 | with tf.Session(config=self.gpu_config) as sess: 74 | sess.run(tf.global_variables_initializer()) 75 | sess.run(tf.local_variables_initializer()) 76 | sess.run(tf.tables_initializer()) 77 | for epoch_id in range(self.config._max_epoch): 78 | self.iterator.switch_to_train_data(sess) 79 | cur_step = 0 80 | cnt_acc = [] 81 | while True: 82 | try: 83 | cur_step += 1 84 | feed = {tx.global_mode(): tf.estimator.ModeKeys.TRAIN} 85 | loss_, acc_ = sess.run([train_op, acc], feed_dict=feed) 86 | cnt_acc.append(acc_) 87 | if cur_step % 100 == 0: 88 | print('batch {}, loss={}, acc1={}'.format(cur_step, loss_, np.mean(cnt_acc[-100:]))) 89 | except tf.errors.OutOfRangeError: 90 | break 91 | 92 | self.iterator.switch_to_val_data(sess) 93 | cnt_acc = [] 94 | while True: 95 | try: 96 | feed = {tx.global_mode(): tf.estimator.ModeKeys.EVAL} 97 | acc_ = sess.run(acc, feed_dict=feed) 98 | cnt_acc.append(acc_) 99 | except tf.errors.OutOfRangeError: 100 | mean_acc = np.mean(cnt_acc) 101 | if mean_acc > max_val_acc: 102 | max_val_acc = mean_acc 103 | self.saver.save(sess, self.config._kernel_save_path) 104 | else: 105 | stopping_flag += 1 106 | print('epoch_id {}, valid acc1={}'.format(epoch_id+1, mean_acc)) 107 | break 108 | if stopping_flag >= self.config._early_stopping: 109 | break 110 | 111 | def test_keywords(self): 112 | batch = self.iterator.get_next() 113 | loss, acc, kws = self.predict_keywords(batch) 114 | saver = tf.train.Saver() 115 | with tf.Session(config=self.gpu_config) as sess: 116 | sess.run(tf.global_variables_initializer()) 117 | sess.run(tf.local_variables_initializer()) 118 | sess.run(tf.tables_initializer()) 119 | saver.restore(sess, 
self.config._kernel_save_path) 120 | self.iterator.switch_to_test_data(sess) 121 | cnt_acc, cnt_rec1, cnt_rec3, cnt_rec5 = [], [], [], [] 122 | while True: 123 | try: 124 | feed = {tx.global_mode(): tf.estimator.ModeKeys.PREDICT} 125 | acc_, kw_ans, kw_labels = sess.run([acc, kws, batch['keywords_text_ids']], feed_dict=feed) 126 | cnt_acc.append(acc_) 127 | rec = [0,0,0,0,0] 128 | sum_kws = 0 129 | for i in range(len(kw_ans)): 130 | sum_kws += sum(kw_labels[i] > 3) 131 | for j in range(5): 132 | if kw_ans[i][j] in kw_labels[i]: 133 | for k in range(j, 5): 134 | rec[k] += 1 135 | cnt_rec1.append(rec[0]/sum_kws) 136 | cnt_rec3.append(rec[2]/sum_kws) 137 | cnt_rec5.append(rec[4]/sum_kws) 138 | 139 | except tf.errors.OutOfRangeError: 140 | print('test_kw acc@1={:.4f}, rec@1={:.4f}, rec@3={:.4f}, rec@5={:.4f}'.format( 141 | np.mean(cnt_acc), np.mean(cnt_rec1), np.mean(cnt_rec3), np.mean(cnt_rec5))) 142 | break 143 | 144 | def forward(self, batch): 145 | matching_score = tf.map_fn(lambda kw_embed: self.forward_kernel(kw_embed, batch['context_text_ids']), 146 | self.keywords_embed, dtype=tf.float32, parallel_iterations=True) 147 | matching_score = tf.transpose(matching_score) 148 | 149 | kw_weight, predict_kw = tf.nn.top_k(matching_score, k=3) 150 | predict_kw = tf.reshape(predict_kw,[-1]) 151 | predict_kw = tf.map_fn(lambda x: self.kw_list[x], predict_kw, dtype=tf.int64) 152 | predict_kw = tf.reshape(predict_kw,[-1,3]) 153 | embed_code = self.embedder(predict_kw) 154 | embed_code = tf.reduce_sum(embed_code, axis=1) 155 | embed_code = self.linear_transform(embed_code) 156 | 157 | source_embed = self.embedder(batch['source_text_ids']) 158 | target_embed = self.embedder(batch['target_text_ids']) # bs * 20 * 32 * 200 159 | target_embed = tf.reshape(target_embed,[-1, self.data_config._max_seq_len+2, self.embedder.dim]) # (bs * 20) * 32 * 200 160 | target_length = tf.reshape(batch['target_length'],[-1]) # (bs * 20) * 32 * 200 161 | source_code = self.source_encoder( 162 | source_embed, 163 | sequence_length_minor=batch['source_length'], 164 | sequence_length_major=batch['source_utterance_cnt'])[1] 165 | target_code = self.target_encoder( 166 | target_embed, 167 | sequence_length=target_length)[1] 168 | target_kwcode = self.target_kwencoder( 169 | target_embed, 170 | sequence_length=target_length)[1] 171 | target_code = tf.concat([target_code[0], target_code[1], target_kwcode[0], target_kwcode[1]], -1) 172 | target_code = tf.reshape(target_code, [-1,20,self.config._code_len]) 173 | 174 | source_code = tf.concat([source_code,embed_code], -1) 175 | source_code = tf.expand_dims(source_code, 1) 176 | source_code = tf.tile(source_code, [1,20,1]) 177 | feature_code = target_code * source_code 178 | feature_code = tf.reshape(feature_code,[-1,self.config._code_len]) 179 | logits = self.linear_matcher(feature_code) 180 | logits = tf.reshape(logits,[-1,20]) 181 | labels = tf.one_hot(batch['label'], 20) 182 | loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits)) 183 | ans = tf.arg_max(logits, -1) 184 | acc = tx.evals.accuracy(batch['label'], ans) 185 | rank = tf.nn.top_k(logits, k=20)[1] 186 | return loss, acc, rank 187 | 188 | def train(self): 189 | batch = self.iterator.get_next() 190 | loss_t, acc_t, _ = self.predict_keywords(batch) 191 | kw_saver = tf.train.Saver() 192 | loss, acc, _ = self.forward(batch) 193 | retrieval_step = tf.Variable(0, name='retrieval_step') 194 | train_op = tx.core.get_train_op(loss, global_step=retrieval_step, hparams=self.config.opt_hparams) 
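# Two-stage training: kw_saver first restores the pretrained kernel keyword
# predictor from self.config._kernel_save_path, then the retrieval matcher is
# trained on top of it; training stops early once validation acc@1 fails to
# improve for self.config._early_stopping consecutive epochs.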
195 | max_val_acc, stopping_flag = 0, 0 196 | with tf.Session(config=self.gpu_config) as sess: 197 | sess.run(tf.tables_initializer()) 198 | sess.run(tf.global_variables_initializer()) 199 | sess.run(tf.local_variables_initializer()) 200 | kw_saver.restore(sess, self.config._kernel_save_path) 201 | saver = tf.train.Saver() 202 | for epoch_id in range(self.config._max_epoch): 203 | self.iterator.switch_to_train_data(sess) 204 | cur_step = 0 205 | cnt_acc, cnt_kwacc = [],[] 206 | while True: 207 | try: 208 | cur_step += 1 209 | feed = {tx.global_mode(): tf.estimator.ModeKeys.TRAIN} 210 | loss, acc_, acc_kw = sess.run([train_op, acc, acc_t], feed_dict=feed) 211 | cnt_acc.append(acc_) 212 | cnt_kwacc.append(acc_kw) 213 | if cur_step % 200 == 0: 214 | print('batch {}, loss={}, acc1={}, kw_acc1={}'.format(cur_step, loss, 215 | np.mean(cnt_acc[-200:]) ,np.mean(cnt_kwacc[-200:]))) 216 | except tf.errors.OutOfRangeError: 217 | break 218 | self.iterator.switch_to_val_data(sess) 219 | cnt_acc, cnt_kwacc = [],[] 220 | while True: 221 | try: 222 | feed = {tx.global_mode(): tf.estimator.ModeKeys.EVAL} 223 | acc_, acc_kw = sess.run([acc, acc_t], feed_dict=feed) 224 | cnt_acc.append(acc_) 225 | cnt_kwacc.append(acc_kw) 226 | except tf.errors.OutOfRangeError: 227 | mean_acc = np.mean(cnt_acc) 228 | print('valid acc1={}, kw_acc1={}'.format(mean_acc, np.mean(cnt_kwacc))) 229 | if mean_acc > max_val_acc: 230 | max_val_acc = mean_acc 231 | saver.save(sess, self.config._save_path) 232 | else: 233 | stopping_flag += 1 234 | break 235 | if stopping_flag >= self.config._early_stopping: 236 | break 237 | 238 | def test(self): 239 | batch = self.iterator.get_next() 240 | loss, acc, rank = self.forward(batch) 241 | with tf.Session(config=self.gpu_config) as sess: 242 | sess.run(tf.tables_initializer()) 243 | self.saver = tf.train.Saver() 244 | self.saver.restore(sess, self.config._save_path) 245 | self.iterator.switch_to_test_data(sess) 246 | rank_cnt = [] 247 | while True: 248 | try: 249 | feed = {tx.global_mode(): tf.estimator.ModeKeys.PREDICT} 250 | ranks, labels = sess.run([rank, batch['label']], feed_dict=feed) 251 | for i in range(len(ranks)): 252 | rank_cnt.append(np.where(ranks[i]==labels[i])[0][0]) 253 | except tf.errors.OutOfRangeError: 254 | rec = [0,0,0,0,0] 255 | MRR = 0 256 | for rank in rank_cnt: 257 | for i in range(5): 258 | rec[i] += (rank <= i) 259 | MRR += 1 / (rank+1) 260 | print('test rec1@20={:.4f}, rec3@20={:.4f}, rec5@20={:.4f}, MRR={:.4f}'.format( 261 | rec[0]/len(rank_cnt), rec[2]/len(rank_cnt), rec[4]/len(rank_cnt), MRR/len(rank_cnt))) 262 | break 263 | 264 | def retrieve_init(self, sess): 265 | data_batch = self.iterator.get_next() 266 | loss, acc, _ = self.forward(data_batch) 267 | self.corpus = self.data_config._corpus 268 | self.corpus_data = tx.data.MonoTextData(self.data_config.corpus_hparams) 269 | corpus_iterator = tx.data.DataIterator(self.corpus_data) 270 | batch = corpus_iterator.get_next() 271 | corpus_embed = self.embedder(batch['corpus_text_ids']) 272 | utter_code = self.target_encoder(corpus_embed, sequence_length=batch['corpus_length'])[1] 273 | utter_kwcode = self.target_kwencoder(corpus_embed, sequence_length=batch['corpus_length'])[1] 274 | utter_code = tf.concat([utter_code[0], utter_code[1], utter_kwcode[0], utter_kwcode[1]], -1) 275 | self.corpus_code = np.zeros([0, self.config._code_len]) 276 | corpus_iterator.switch_to_dataset(sess) 277 | sess.run(tf.tables_initializer()) 278 | saver = tf.train.Saver() 279 | saver.restore(sess, self.config._save_path) 280 | feed = 
{tx.global_mode(): tf.estimator.ModeKeys.PREDICT} 281 | while True: 282 | try: 283 | utter_code_ = sess.run(utter_code, feed_dict=feed) 284 | self.corpus_code = np.concatenate([self.corpus_code, utter_code_], axis=0) 285 | except tf.errors.OutOfRangeError: 286 | break 287 | self.kw_embedding = sess.run(self.keywords_embed) 288 | 289 | # predict keyword 290 | self.context_input = tf.placeholder(dtype=object) 291 | context_ids = tf.expand_dims(self.vocab.map_tokens_to_ids(self.context_input), 0) 292 | matching_score = tf.map_fn(lambda kw_embed: self.forward_kernel(kw_embed, context_ids), 293 | self.keywords_embed, dtype=tf.float32, parallel_iterations=True) 294 | self.candi_output = tf.nn.top_k(tf.squeeze(matching_score, 1), self.data_config._keywords_num)[1] 295 | 296 | # retrieve 297 | self.minor_length_input = tf.placeholder(dtype=tf.int32, shape=(1, 9)) 298 | self.major_length_input = tf.placeholder(dtype=tf.int32, shape=(1)) 299 | self.history_input = tf.placeholder(dtype=object, shape=(9, self.data_config._max_seq_len + 2)) 300 | self.kw_input = tf.placeholder(dtype=tf.int32) 301 | history_ids = self.vocab.map_tokens_to_ids(self.history_input) 302 | history_embed = self.embedder(history_ids) 303 | history_code = self.source_encoder(tf.expand_dims(history_embed, axis=0), 304 | sequence_length_minor=self.minor_length_input, 305 | sequence_length_major=self.major_length_input)[1] 306 | self.next_kw_ids = self.kw_list[self.kw_input] 307 | embed_code = tf.expand_dims(self.embedder(self.next_kw_ids), 0) 308 | embed_code = self.linear_transform(embed_code) 309 | history_code = tf.concat([history_code, embed_code], 1) 310 | select_corpus = tf.cast(self.corpus_code, dtype=tf.float32) 311 | feature_code = self.linear_matcher(select_corpus * history_code) 312 | self.ans_output = tf.nn.top_k(tf.squeeze(feature_code,1), k=self.data_config._retrieval_candidates)[1] 313 | 314 | def retrieve(self, history_all, sess): 315 | history, seq_len, turns, context, context_len = history_all 316 | kw_candi = sess.run(self.candi_output, feed_dict={self.context_input: context[:context_len]}) 317 | for kw in kw_candi: 318 | tmp_score = sum(self.kw_embedding[kw] * self.kw_embedding[self.data_config._keywords_dict[self.target]]) 319 | if tmp_score > self.score: 320 | self.score = tmp_score 321 | self.next_kw = self.data_config._keywords_candi[kw] 322 | break 323 | ans = sess.run(self.ans_output, feed_dict={self.history_input: history, 324 | self.minor_length_input: [seq_len], self.major_length_input: [turns], 325 | self.kw_input: self.data_config._keywords_dict[self.next_kw]}) 326 | flag = 0 327 | reply = self.corpus[ans[0]] 328 | for i in ans: 329 | if i in self.reply_list: # avoid repeat 330 | continue 331 | for wd in kw_tokenize(self.corpus[i]): 332 | if wd in self.data_config._keywords_candi: 333 | tmp_score = sum(self.kw_embedding[self.data_config._keywords_dict[wd]] * 334 | self.kw_embedding[self.data_config._keywords_dict[self.target]]) 335 | if tmp_score > self.score: 336 | reply = self.corpus[i] 337 | self.score = tmp_score 338 | self.next_kw = wd 339 | flag = 1 340 | break 341 | if flag == 0: 342 | continue 343 | break 344 | return reply 345 | -------------------------------------------------------------------------------- /preprocess/convai2/candi_keyword.txt: -------------------------------------------------------------------------------- 1 | favorite 2 | sound 3 | play 4 | dog 5 | music 6 | kid 7 | eat 8 | school 9 | enjoy 10 | job 11 | watch 12 | read 13 | food 14 | cat 15 | friend 16 | family 17 | 
hobby 18 | people 19 | pet 20 | car 21 | game 22 | hear 23 | movie 24 | travel 25 | book 26 | cook 27 | listen 28 | animal 29 | life 30 | color 31 | drive 32 | college 33 | hope 34 | living 35 | parent 36 | teach 37 | bad 38 | sport 39 | hard 40 | dad 41 | feel 42 | child 43 | hair 44 | band 45 | country 46 | money 47 | pizza 48 | marry 49 | bet 50 | hate 51 | walk 52 | stay 53 | study 54 | write 55 | husband 56 | fish 57 | start 58 | guess 59 | brother 60 | blue 61 | night 62 | dance 63 | busy 64 | red 65 | wife 66 | learn 67 | talk 68 | sing 69 | meet 70 | beach 71 | spend 72 | drink 73 | rock 74 | city 75 | video 76 | grow 77 | person 78 | house 79 | tv 80 | true 81 | shop 82 | teacher 83 | buy 84 | girl 85 | weekend 86 | prefer 87 | hike 88 | meat 89 | football 90 | free 91 | visit 92 | sister 93 | care 94 | plan 95 | art 96 | swim 97 | sweet 98 | paint 99 | stuff 100 | pretty 101 | vegan 102 | store 103 | class 104 | farm 105 | type 106 | funny 107 | understand 108 | son 109 | business 110 | happy 111 | mother 112 | ride 113 | coffee 114 | leave 115 | fan 116 | garden 117 | week 118 | boy 119 | bake 120 | green 121 | pay 122 | beautiful 123 | draw 124 | student 125 | horse 126 | sell 127 | sad 128 | summer 129 | wear 130 | truck 131 | black 132 | hot 133 | wait 134 | yea 135 | sleep 136 | cold 137 | agree 138 | single 139 | guy 140 | sibling 141 | real 142 | crazy 143 | healthy 144 | fine 145 | tall 146 | guitar 147 | lose 148 | cute 149 | hour 150 | cake 151 | purple 152 | month 153 | relax 154 | finish 155 | idea 156 | break 157 | company 158 | daughter 159 | happen 160 | dream 161 | team 162 | mind 163 | park 164 | woman 165 | reading 166 | restaurant 167 | bike 168 | short 169 | italian 170 | exercise 171 | hang 172 | chocolate 173 | wonderful 174 | basketball 175 | speak 176 | soccer 177 | weather 178 | graduate 179 | retire 180 | winter 181 | morning 182 | nurse 183 | bring 184 | die 185 | easy 186 | party 187 | grade 188 | father 189 | ready 190 | ice 191 | win 192 | spare 193 | doctor 194 | online 195 | song 196 | local 197 | degree 198 | chicken 199 | concert 200 | florida 201 | glad 202 | fall 203 | baseball 204 | eye 205 | office 206 | volunteer 207 | water 208 | girlfriend 209 | bear 210 | french 211 | shopping 212 | age 213 | luck 214 | fishing 215 | artist 216 | weird 217 | foot 218 | save 219 | surf 220 | dinner 221 | yoga 222 | lucky 223 | story 224 | hunt 225 | cooking 226 | pink 227 | suck 228 | cream 229 | boyfriend 230 | language 231 | beer 232 | outdoors 233 | change 234 | diet 235 | passion 236 | york 237 | major 238 | cheese 239 | collect 240 | imagine 241 | super 242 | english 243 | rap 244 | practice 245 | chat 246 | head 247 | california 248 | train 249 | lake 250 | clothes 251 | pass 252 | nature 253 | baby 254 | fry 255 | season 256 | apple 257 | piano 258 | sit 259 | hop 260 | scary 261 | taco 262 | law 263 | steak 264 | hockey 265 | comic 266 | brown 267 | bar 268 | gym 269 | smart 270 | huge 271 | clean 272 | fav 273 | close 274 | jazz 275 | vegetarian 276 | canada 277 | science 278 | career 279 | picture 280 | tea 281 | excite 282 | tough 283 | deal 284 | pick 285 | allergic 286 | neat 287 | church 288 | social 289 | sick 290 | shoe 291 | trip 292 | fly 293 | vacation 294 | catch 295 | bed 296 | raise 297 | boring 298 | rid 299 | ocean 300 | club 301 | town 302 | instrument 303 | check 304 | race 305 | dragon 306 | fast 307 | vegetable 308 | wrong 309 | boat 310 | terrible 311 | tennis 312 | candy 313 | rain 314 | worry 315 | veggie 316 | fruit 317 | 
metal 318 | perfect 319 | twin 320 | tattoo 321 | mountain 322 | tomorrow 323 | god 324 | stand 325 | hospital 326 | apartment 327 | build 328 | stick 329 | design 330 | japan 331 | texas 332 | meal 333 | grocery 334 | camp 335 | hit 336 | orange 337 | mexican 338 | allergy 339 | ta 340 | phone 341 | famous 342 | player 343 | flower 344 | bore 345 | army 346 | star 347 | share 348 | pool 349 | delicious 350 | relationship 351 | exciting 352 | math 353 | egg 354 | museum 355 | classic 356 | dress 357 | sushi 358 | taste 359 | married 360 | amaze 361 | classical 362 | lady 363 | shelter 364 | sense 365 | joke 366 | pop 367 | pie 368 | yellow 369 | american 370 | lawyer 371 | expensive 372 | stress 373 | hand 374 | cut 375 | pasta 376 | remember 377 | professional 378 | choice 379 | impressive 380 | youtube 381 | kinda 382 | yum 383 | chicago 384 | birthday 385 | cooky 386 | sunday 387 | follow 388 | divorce 389 | gon 390 | moment 391 | fresh 392 | tire 393 | hurt 394 | wedding 395 | fit 396 | weight 397 | health 398 | plant 399 | count 400 | chef 401 | heard 402 | ball 403 | scar 404 | sort 405 | smell 406 | dead 407 | special 408 | comedy 409 | couple 410 | rich 411 | hiking 412 | fave 413 | create 414 | accountant 415 | bird 416 | relaxing 417 | history 418 | blonde 419 | film 420 | everyday 421 | glass 422 | heart 423 | voice 424 | ballet 425 | vet 426 | military 427 | horror 428 | field 429 | fight 430 | mile 431 | salad 432 | extra 433 | afraid 434 | market 435 | christmas 436 | reason 437 | mexico 438 | attend 439 | cop 440 | chance 441 | potato 442 | snow 443 | halloween 444 | folk 445 | snake 446 | ski 447 | character 448 | card 449 | adopt 450 | figure 451 | tho 452 | hat 453 | nail 454 | ford 455 | fat 456 | warm 457 | swimming 458 | difficult 459 | sew 460 | suppose 461 | dye 462 | safe 463 | wine 464 | tend 465 | style 466 | afford 467 | recipe 468 | writer 469 | shower 470 | lunch 471 | remind 472 | painting 473 | news 474 | congrats 475 | photography 476 | rough 477 | road 478 | holiday 479 | join 480 | beat 481 | white 482 | throne 483 | michigan 484 | awful 485 | actor 486 | breakfast 487 | library 488 | congratulation 489 | goal 490 | shrimp 491 | singer 492 | middle 493 | horrible 494 | common 495 | plane 496 | profession 497 | bacon 498 | smoke 499 | coast 500 | kill 501 | future 502 | word 503 | female 504 | south 505 | craft 506 | neighbor 507 | violin 508 | fair 509 | paris 510 | fashion 511 | quiet 512 | service 513 | bank 514 | workout 515 | dish 516 | shape 517 | tour 518 | theater 519 | genre 520 | forget 521 | fiction 522 | makeup 523 | model 524 | clown 525 | author 526 | experience 527 | rest 528 | librarian 529 | king 530 | puppy 531 | german 532 | tree 533 | trust 534 | street 535 | yummy 536 | marketing 537 | quit 538 | hungry 539 | creative 540 | poor 541 | cali 542 | wild 543 | alright 544 | accident 545 | fantasy 546 | cartoon 547 | air 548 | pepper 549 | hurricane 550 | ton 551 | lab 552 | anime 553 | university 554 | drop 555 | factory 556 | iphone 557 | mystery 558 | active 559 | netflix 560 | stressful 561 | space 562 | trouble 563 | secret 564 | mcdonalds 565 | laugh 566 | gun 567 | support 568 | spanish 569 | pain 570 | casino 571 | france 572 | set 573 | spaghetti 574 | sale 575 | magic 576 | grill 577 | burger 578 | climb 579 | internet 580 | pig 581 | hold 582 | decide 583 | mustang 584 | program 585 | drum 586 | hmm 587 | focus 588 | seafood 589 | master 590 | medium 591 | strange 592 | lesson 593 | arm 594 | war 595 | serve 596 | van 597 | north 
598 | cow 599 | toe 600 | driver 601 | paper 602 | education 603 | table 604 | station 605 | dancer 606 | soda 607 | playing 608 | medical 609 | dark 610 | blog 611 | usa 612 | power 613 | wan 614 | bob 615 | simple 616 | zoo 617 | shoot 618 | fancy 619 | mechanic 620 | collection 621 | national 622 | motorcycle 623 | strong 624 | parrot 625 | nursing 626 | america 627 | harry 628 | blood 629 | ahahah 630 | body 631 | bible 632 | police 633 | energy 634 | alabama 635 | lazy 636 | death 637 | rose 638 | musician 639 | italy 640 | adventure 641 | worker 642 | writing 643 | pair 644 | activity 645 | bread 646 | throw 647 | issue 648 | europe 649 | london 650 | gross 651 | jealous 652 | male 653 | height 654 | disney 655 | treat 656 | marathon 657 | door 658 | bean 659 | bos 660 | boot 661 | aww 662 | cancer 663 | lover 664 | milk 665 | skill 666 | plenty 667 | beard 668 | light 669 | poetry 670 | subject 671 | sea 672 | romantic 673 | relate 674 | spending 675 | firm 676 | send 677 | seattle 678 | couch 679 | question 680 | flavor 681 | photo 682 | channel 683 | favourite 684 | match 685 | contact 686 | choose 687 | japanese 688 | singing 689 | lift 690 | boston 691 | wood 692 | grandmother 693 | farmer 694 | radio 695 | navy 696 | roommate 697 | bus 698 | politics 699 | grandchild 700 | jam 701 | chinese 702 | viking 703 | matter 704 | series 705 | competition 706 | designer 707 | gas 708 | san 709 | feeling 710 | knit 711 | lizard 712 | slow 713 | publish 714 | fantastic 715 | crochet 716 | donate 717 | add 718 | television 719 | insurance 720 | talent 721 | bowl 722 | human 723 | land 724 | rise 725 | hmmm 726 | test 727 | alive 728 | involve 729 | dangerous 730 | pro 731 | yesterday 732 | wind 733 | obsess 734 | golden 735 | stone 736 | post 737 | west 738 | nephew 739 | depend 740 | nervous 741 | breed 742 | bakery 743 | hip 744 | goodness 745 | baker 746 | surgery 747 | mall 748 | don 749 | honda 750 | colorado 751 | dancing 752 | dr 753 | dude 754 | lifestyle 755 | calm 756 | wake 757 | crime 758 | athlete 759 | skin 760 | beauty 761 | lie 762 | facebook 763 | mar 764 | reader 765 | lead 766 | mess 767 | snack 768 | shift 769 | employ 770 | spring 771 | size 772 | handle 773 | shy 774 | iron 775 | corvette 776 | evening 777 | superman 778 | board 779 | cheer 780 | traffic 781 | grand 782 | sun 783 | fee 784 | musical 785 | base 786 | public 787 | chip 788 | mad 789 | careful 790 | pound 791 | brand 792 | potter 793 | punk 794 | smile 795 | advice 796 | justin 797 | manager 798 | bass 799 | peaceful 800 | golf 801 | clothing 802 | john 803 | action 804 | buddy 805 | sunny 806 | box 807 | waitress 808 | gardening 809 | popular 810 | wing 811 | wall 812 | personal 813 | coach 814 | cover 815 | elementary 816 | mix 817 | pray 818 | odd 819 | stink 820 | minute 821 | key 822 | inspire 823 | event 824 | adorable 825 | community 826 | juice 827 | engineer 828 | karate 829 | skate 830 | saturday 831 | literature 832 | reward 833 | baking 834 | ghost 835 | ohio 836 | vega 837 | kitchen 838 | sugar 839 | chill 840 | tomato 841 | chase 842 | chevy 843 | roll 844 | la 845 | organize 846 | blame 847 | homework 848 | bee 849 | grandma 850 | beatles 851 | passionate 852 | soul 853 | knee 854 | anxiety 855 | nut 856 | spot 857 | ticket 858 | grandparent 859 | honest 860 | google 861 | junk 862 | product 863 | tech 864 | track 865 | normal 866 | role 867 | client 868 | veterinarian 869 | spicy 870 | building 871 | scare 872 | hawaii 873 | bummer 874 | reality 875 | iguana 876 | niece 877 | opera 878 | 
charlie 879 | worth 880 | pack 881 | trail 882 | senior 883 | toyota 884 | beet 885 | ireland 886 | finance 887 | chili 888 | officer 889 | pickle 890 | kinds 891 | manage 892 | tired 893 | christian 894 | daily 895 | nose 896 | cheap 897 | spider 898 | gig 899 | indian 900 | locate 901 | pleasure 902 | league 903 | hunting 904 | retired 905 | george 906 | tuna 907 | teaching 908 | attention 909 | step 910 | awe 911 | stock 912 | list 913 | portland 914 | sign 915 | avoid 916 | bug 917 | brain 918 | scientist 919 | dessert 920 | excellent 921 | finger 922 | religious 923 | superhero 924 | highschool 925 | kick 926 | rule 927 | salesman 928 | drawing 929 | handful 930 | pilot 931 | escape 932 | schedule 933 | tasty 934 | gosh 935 | burrito 936 | kansa 937 | commercial 938 | rescue 939 | mary 940 | fi 941 | retail 942 | retriever 943 | skydive 944 | husky 945 | gamble 946 | georgia 947 | industry 948 | meeting 949 | nyc 950 | walmart 951 | annoy 952 | cruise 953 | doubt 954 | forest 955 | bunch 956 | cashier 957 | actress 958 | kayak 959 | partner 960 | mac 961 | tiny 962 | friday 963 | spell 964 | culture 965 | kindergarten 966 | lay 967 | tiger 968 | deaf 969 | mention 970 | drinking 971 | accounting 972 | cup 973 | subway 974 | pancake 975 | habit 976 | prius 977 | blind 978 | familiar 979 | starbucks 980 | assistant 981 | pumpkin 982 | burn 983 | answer 984 | poodle 985 | metallica 986 | circus 987 | jewelry 988 | sock 989 | recommend 990 | sandwich 991 | journalist 992 | fear 993 | pretend 994 | grandson 995 | nfl 996 | brownie 997 | cupcake 998 | doll 999 | stephen 1000 | bother 1001 | desert 1002 | sci 1003 | moon 1004 | construction 1005 | reach 1006 | technology 1007 | option 1008 | marriage 1009 | pony 1010 | hell 1011 | romance 1012 | cuisine 1013 | strawberry 1014 | shirt 1015 | bbq 1016 | purse 1017 | leg 1018 | kitten 1019 | unemployed 1020 | project 1021 | running 1022 | deliver 1023 | natural 1024 | dang 1025 | control 1026 | hero 1027 | beef 1028 | trade 1029 | sunset 1030 | carrot 1031 | main 1032 | chess 1033 | grandkids 1034 | talented 1035 | explore 1036 | nasty 1037 | east 1038 | hamburger 1039 | private 1040 | yup 1041 | guard 1042 | choir 1043 | australia 1044 | pen 1045 | stamp 1046 | struggle 1047 | dive 1048 | bro 1049 | perry 1050 | omg 1051 | thinking 1052 | candle 1053 | fell 1054 | ink 1055 | wheel 1056 | ranch 1057 | owner 1058 | carry 1059 | biology 1060 | earn 1061 | regular 1062 | lion 1063 | peanut 1064 | broccoli 1065 | suit 1066 | medicine 1067 | speaking 1068 | chew 1069 | inspiration 1070 | sauce 1071 | hotel 1072 | camera 1073 | england 1074 | repair 1075 | turtle 1076 | athletic 1077 | unique 1078 | ray 1079 | sky 1080 | skateboard 1081 | river 1082 | tune 1083 | freak 1084 | estate 1085 | cheesecake 1086 | yuck 1087 | surfing 1088 | rent 1089 | ooh 1090 | grey 1091 | memory 1092 | perform 1093 | popcorn 1094 | dislike 1095 | childhood 1096 | adore 1097 | quick 1098 | comedian 1099 | sweater 1100 | antique 1101 | lottery 1102 | hows 1103 | mcdonald 1104 | jump 1105 | tutor 1106 | carolina 1107 | walking 1108 | pregnant 1109 | mike 1110 | vampire 1111 | decorate 1112 | bieber 1113 | alcohol 1114 | compete 1115 | mansion 1116 | owl 1117 | gotcha 1118 | jack 1119 | engineering 1120 | retirement 1121 | pot 1122 | airplane 1123 | ferrari 1124 | dry 1125 | dentist 1126 | russian 1127 | piece 1128 | security 1129 | spirit 1130 | offer 1131 | dorm 1132 | record 1133 | settle 1134 | lobster 1135 | foreign 1136 | software 1137 | honey 1138 | rice 1139 | 
princess 1140 | excited 1141 | amazon 1142 | baltimore 1143 | island 1144 | skiing 1145 | center 1146 | alien 1147 | butter 1148 | corn 1149 | civic 1150 | jane 1151 | view 1152 | nap 1153 | pit 1154 | bulldog 1155 | lovely 1156 | prince 1157 | loud 1158 | photograph 1159 | gift 1160 | coke 1161 | org 1162 | belt 1163 | festival 1164 | ugh 1165 | ahh 1166 | vintage 1167 | pug 1168 | birth 1169 | understandable 1170 | sweetheart 1171 | irma 1172 | deep 1173 | india 1174 | feed 1175 | ring 1176 | item 1177 | scene 1178 | spear 1179 | mushroom 1180 | position 1181 | afternoon 1182 | hire 1183 | trainer 1184 | distract 1185 | touch 1186 | unfortunate 1187 | alaska 1188 | expect 1189 | katy 1190 | scratch 1191 | cost 1192 | situation 1193 | gay 1194 | loss 1195 | organic 1196 | washington 1197 | joy 1198 | gummy 1199 | survive 1200 | med 1201 | bless 1202 | prepare 1203 | charity 1204 | sight 1205 | rare 1206 | heavy 1207 | rural 1208 | russia 1209 | newspaper 1210 | karaoke 1211 | driving 1212 | customer 1213 | watching 1214 | require 1215 | graphic 1216 | mood 1217 | maine 1218 | freelance 1219 | fitness 1220 | diner 1221 | pepsi 1222 | condo 1223 | miami 1224 | cross 1225 | runner 1226 | accept 1227 | panda 1228 | bunny 1229 | engage 1230 | uncle 1231 | commute 1232 | indie 1233 | cooler 1234 | straight 1235 | hollywood 1236 | bagel 1237 | terrify 1238 | training 1239 | boxer 1240 | left 1241 | protein 1242 | bull 1243 | jog 1244 | tom 1245 | shame 1246 | ouch 1247 | current 1248 | programmer 1249 | nerd 1250 | magazine 1251 | artistic 1252 | yikes 1253 | eating 1254 | skittle 1255 | furniture 1256 | eagle 1257 | form 1258 | fabulous 1259 | legal 1260 | agency 1261 | internship 1262 | cabin 1263 | drama 1264 | positive 1265 | addict 1266 | surprise 1267 | rewarding 1268 | tax 1269 | barista 1270 | fake 1271 | spain 1272 | crash 1273 | random 1274 | kale 1275 | bright 1276 | shark 1277 | studio 1278 | bow 1279 | boys 1280 | bell 1281 | brace 1282 | trick 1283 | wheelchair 1284 | cloud 1285 | southern 1286 | force 1287 | chair 1288 | spouse 1289 | thumb 1290 | frank 1291 | rapper 1292 | virginia 1293 | physical 1294 | bye 1295 | grad 1296 | soup 1297 | fiance 1298 | elvis 1299 | meatloaf 1300 | queen 1301 | united 1302 | income 1303 | salon 1304 | volleyball 1305 | target 1306 | chihuahua 1307 | limit 1308 | scholarship 1309 | direction 1310 | supply 1311 | canadian 1312 | daddy 1313 | toy 1314 | kentucky 1315 | respect 1316 | earth 1317 | political 1318 | binge 1319 | hilarious 1320 | blast 1321 | unhealthy 1322 | pant 1323 | cheeseburger 1324 | complete 1325 | unicorn 1326 | religion 1327 | dairy 1328 | drug 1329 | fl 1330 | whiskey 1331 | reject 1332 | iced 1333 | average 1334 | eater 1335 | dirty 1336 | rat 1337 | angry 1338 | heck 1339 | hide 1340 | competitive 1341 | gum 1342 | website 1343 | laptop 1344 | exhaust 1345 | robot 1346 | challenge 1347 | outdoor 1348 | raw 1349 | audition 1350 | cafe 1351 | onion 1352 | assume 1353 | opinion 1354 | horseback 1355 | zebra 1356 | philosophy 1357 | psychology 1358 | exact 1359 | spice 1360 | debt 1361 | reside 1362 | heat 1363 | hobbies 1364 | leather 1365 | rude 1366 | china 1367 | storm 1368 | grown 1369 | invite 1370 | instructor 1371 | steal 1372 | curly 1373 | wash 1374 | worm 1375 | bf 1376 | credit 1377 | greek 1378 | oregon 1379 | shot 1380 | riding 1381 | pride 1382 | speed 1383 | floor 1384 | intense 1385 | court 1386 | dental 1387 | bone 1388 | friendly 1389 | diving 1390 | lame 1391 | kitty 1392 | alternative 1393 | hubby 1394 | 
strict 1395 | ham 1396 | camping 1397 | pond 1398 | marine 1399 | grandpa 1400 | secretary 1401 | theme 1402 | judge 1403 | shade 1404 | complain 1405 | herb 1406 | advertising 1407 | celebrity 1408 | fascinate 1409 | lasagna 1410 | environment 1411 | painter 1412 | comfortable 1413 | beagle 1414 | recycle 1415 | bruno 1416 | invest 1417 | search 1418 | society 1419 | halo 1420 | pursue 1421 | hm 1422 | sam 1423 | connect 1424 | angeles 1425 | weed 1426 | grab 1427 | kiss 1428 | bald 1429 | em 1430 | britney 1431 | oil 1432 | conversation 1433 | mmm 1434 | yard 1435 | cash 1436 | jean 1437 | ship 1438 | exotic 1439 | vanilla 1440 | waste 1441 | photographer 1442 | adult 1443 | rolling 1444 | dig 1445 | cookie 1446 | tennessee 1447 | balance 1448 | cleaning 1449 | ocd 1450 | eggplant 1451 | archery 1452 | ma 1453 | meatball 1454 | dust 1455 | deer 1456 | gluten 1457 | agent 1458 | gaming 1459 | celebrate 1460 | helpful 1461 | tofu 1462 | campus 1463 | lord 1464 | evil 1465 | er 1466 | los 1467 | shake 1468 | martial 1469 | drummer 1470 | outfit 1471 | grass 1472 | wrestle 1473 | note 1474 | convertible 1475 | biking 1476 | taller 1477 | gorgeous 1478 | file 1479 | hectic 1480 | salsa 1481 | rush 1482 | awhile 1483 | distance 1484 | soft 1485 | homeless 1486 | daycare 1487 | process 1488 | patient 1489 | houston 1490 | crowd 1491 | stew 1492 | duty 1493 | bookstore 1494 | tie 1495 | neighborhood 1496 | professor 1497 | orleans 1498 | electric 1499 | original 1500 | opportunity 1501 | francisco 1502 | angel 1503 | fund 1504 | site 1505 | peace 1506 | picky 1507 | wisconsin 1508 | madonna 1509 | adam 1510 | iceland 1511 | blow 1512 | uh 1513 | total 1514 | urban 1515 | coincidence 1516 | entire 1517 | jello 1518 | compare 1519 | dollar 1520 | headache 1521 | blond 1522 | guilty 1523 | cure 1524 | electronic 1525 | grader 1526 | greece 1527 | correct 1528 | twilight 1529 | enjoyable 1530 | benefit 1531 | damn 1532 | coupon 1533 | cheat 1534 | thrift 1535 | print 1536 | scout 1537 | successful 1538 | bartender 1539 | stranger 1540 | cattle 1541 | mommy 1542 | bath 1543 | nascar 1544 | honor 1545 | suggestion 1546 | appalachian 1547 | ginger 1548 | hook 1549 | aquarium 1550 | james 1551 | scared 1552 | gamer 1553 | collie 1554 | creepy 1555 | upstate 1556 | therapist 1557 | bicycle 1558 | trophy 1559 | department 1560 | cheerleader 1561 | variety 1562 | suburb 1563 | explain 1564 | cuddle 1565 | flip 1566 | shepherd 1567 | tupac 1568 | whoa 1569 | iq 1570 | lifeguard 1571 | sunshine 1572 | jersey 1573 | rainy 1574 | vehicle 1575 | closet 1576 | rainbow 1577 | ross 1578 | specialty 1579 | attorney 1580 | interior 1581 | gf 1582 | crab 1583 | podcasts 1584 | fart 1585 | decent 1586 | insane 1587 | shepard 1588 | gossip 1589 | jimmy 1590 | blackjack 1591 | austin 1592 | nike 1593 | louisiana 1594 | brat 1595 | therapy 1596 | noble 1597 | skunk 1598 | listening 1599 | awww 1600 | pleasant 1601 | pittsburgh 1602 | barbie 1603 | specific 1604 | pas 1605 | um 1606 | avid 1607 | occasion 1608 | mustache 1609 | autograph 1610 | noise 1611 | diego 1612 | ruin 1613 | admit 1614 | fail 1615 | scotch 1616 | creature 1617 | costume 1618 | jeopardy 1619 | swift 1620 | africa 1621 | cd 1622 | account 1623 | edit 1624 | topic 1625 | handy 1626 | window 1627 | steelers 1628 | accent 1629 | activist 1630 | preacher 1631 | affect 1632 | apply 1633 | loose 1634 | refuse 1635 | bum 1636 | minnesota 1637 | gender 1638 | confuse 1639 | landscape 1640 | broadway 1641 | bmw 1642 | argue 1643 | foodie 1644 | trek 1645 | 
relieve 1646 | todd 1647 | path 1648 | freedom 1649 | pearl 1650 | tap 1651 | medication 1652 | vera 1653 | rapid 1654 | ohh 1655 | government 1656 | environmental 1657 | fault 1658 | ft 1659 | cope 1660 | carbs 1661 | standard 1662 | planet 1663 | mcqueen 1664 | nugget 1665 | pull 1666 | difference 1667 | babysit 1668 | teller 1669 | disappoint 1670 | mermaid 1671 | pageant 1672 | soap 1673 | midwest 1674 | giant 1675 | puerto 1676 | bowling 1677 | asian 1678 | arizona 1679 | happiness 1680 | provide 1681 | fond 1682 | bud 1683 | cell 1684 | entertain 1685 | butterfly 1686 | genius 1687 | scream 1688 | architect 1689 | jar 1690 | monkey 1691 | theatre 1692 | hah 1693 | purchase 1694 | yay 1695 | vote 1696 | level 1697 | cab 1698 | combo 1699 | tool 1700 | fluent 1701 | duck 1702 | creek 1703 | load 1704 | affair 1705 | humor 1706 | slave 1707 | publishing 1708 | painful 1709 | redhead 1710 | jerry 1711 | thursday 1712 | dungeon 1713 | cape 1714 | messy 1715 | approve 1716 | cousin 1717 | weekly 1718 | terrier 1719 | paddle 1720 | david 1721 | typical 1722 | waffle 1723 | desk 1724 | catholic 1725 | germany 1726 | instagram 1727 | admire 1728 | image 1729 | upset 1730 | muscle 1731 | monday 1732 | pic 1733 | basement 1734 | ground 1735 | wave 1736 | shellfish 1737 | technician 1738 | episode 1739 | jim 1740 | jacob 1741 | ballerina 1742 | loan 1743 | improve 1744 | garage 1745 | ibm 1746 | tooth 1747 | bachelor 1748 | thrill 1749 | hippie 1750 | notice 1751 | return 1752 | crush 1753 | jesus 1754 | stomach 1755 | techno 1756 | raven 1757 | nerve 1758 | denver 1759 | foster 1760 | thriller 1761 | rugby 1762 | daydream 1763 | oreo 1764 | discover 1765 | detroit 1766 | cult 1767 | atlanta 1768 | incredible 1769 | stable 1770 | poem 1771 | yr 1772 | oooh 1773 | wide 1774 | mango 1775 | snowboard 1776 | weapon 1777 | countryside 1778 | alcoholic 1779 | brunch 1780 | fisherman 1781 | aunt 1782 | toronto 1783 | sarah 1784 | addiction 1785 | surround 1786 | lactose 1787 | clinic 1788 | universe 1789 | pretzel 1790 | toto 1791 | blah 1792 | sir 1793 | sausage 1794 | cosplay 1795 | text 1796 | quality 1797 | millionaire 1798 | clerk 1799 | skinny 1800 | hendrix 1801 | disabled 1802 | puzzle 1803 | pepperoni 1804 | civil 1805 | surfer 1806 | larp 1807 | package 1808 | roof 1809 | pa 1810 | suspense 1811 | drone 1812 | mail 1813 | jason 1814 | exam 1815 | mark 1816 | financial 1817 | encyclopedia 1818 | cheetos 1819 | demand 1820 | shell 1821 | planning 1822 | stupid 1823 | yell 1824 | grateful 1825 | bingo 1826 | source 1827 | companion 1828 | director 1829 | bite 1830 | detective 1831 | biography 1832 | gospel 1833 | silly 1834 | pudding 1835 | pork 1836 | teeth 1837 | autobiography 1838 | salt 1839 | footstep 1840 | deserve 1841 | produce 1842 | swimmer 1843 | barbecue 1844 | maryland 1845 | btw 1846 | defense 1847 | fallon 1848 | teen 1849 | continue 1850 | cart 1851 | wizard 1852 | meditate 1853 | shelf 1854 | zombie 1855 | irish 1856 | pecan 1857 | bubble 1858 | discount 1859 | scooter 1860 | push 1861 | tutorial 1862 | scuba 1863 | homemade 1864 | weakness 1865 | translator 1866 | gymnastics 1867 | background 1868 | softball 1869 | kidding 1870 | mistake 1871 | realize 1872 | ironic 1873 | floyd 1874 | flight 1875 | surgeon 1876 | crack 1877 | desire 1878 | introvert 1879 | ted 1880 | knife 1881 | mba 1882 | pottery 1883 | orphan 1884 | recover 1885 | mini 1886 | bucket 1887 | perk 1888 | autism 1889 | moped 1890 | cycle 1891 | youth 1892 | spoil 1893 | attitude 1894 | injury 1895 | 
pennsylvania 1896 | aspire 1897 | loyal 1898 | attack 1899 | murder 1900 | price 1901 | tank 1902 | safety 1903 | olympics 1904 | rome 1905 | dj 1906 | carpenter 1907 | sesame 1908 | consume 1909 | protect 1910 | british 1911 | sword 1912 | cheetah 1913 | float 1914 | asia 1915 | mate 1916 | creed 1917 | xbox 1918 | lean 1919 | freckle 1920 | caffeine 1921 | hunter 1922 | trump 1923 | programming 1924 | picnic 1925 | ear 1926 | fridge 1927 | easter 1928 | asparagus 1929 | oldie 1930 | darn 1931 | disagree 1932 | mirror 1933 | hehe 1934 | ew 1935 | biscuit 1936 | freshman 1937 | whistle 1938 | usual 1939 | inch 1940 | deli 1941 | eclipse 1942 | cycling 1943 | modern 1944 | tease 1945 | review 1946 | pattern 1947 | award 1948 | belong 1949 | nickname 1950 | buff 1951 | sweden 1952 | cuz 1953 | selfish 1954 | personality 1955 | graduation 1956 | dolphin 1957 | album 1958 | pup 1959 | taylor 1960 | lease 1961 | educate 1962 | depress 1963 | paramedic 1964 | gila 1965 | purpose 1966 | prison 1967 | worried 1968 | vermont 1969 | block 1970 | buffalo 1971 | olive 1972 | identical 1973 | score 1974 | string 1975 | actual 1976 | intelligent 1977 | epilepsy 1978 | sand 1979 | plate 1980 | subtitle 1981 | border 1982 | cable 1983 | smooth 1984 | lexus 1985 | develop 1986 | vienna 1987 | brave 1988 | welfare 1989 | doberman 1990 | wealthy 1991 | switch 1992 | sneak 1993 | robert 1994 | hill 1995 | nacho 1996 | mugger 1997 | snap 1998 | dumb 1999 | coworker 2000 | peta 2001 | hr 2002 | chick 2003 | fur 2004 | goalie 2005 | range 2006 | introduce 2007 | costco 2008 | railroad 2009 | suffer 2010 | menu 2011 | soldier 2012 | asthma 2013 | sex 2014 | lindsey 2015 | utah 2016 | grandfather 2017 | documentary 2018 | admirable 2019 | traveling 2020 | dane 2021 | employee 2022 | article 2023 | caramel 2024 | harley 2025 | equipment 2026 | heal 2027 | basic 2028 | dabble 2029 | depression 2030 | attract 2031 | gaga 2032 | tale 2033 | tube 2034 | fried 2035 | doggy 2036 | mash 2037 | taught 2038 | receptionist 2039 | empire 2040 | intern 2041 | piercings 2042 | tackle 2043 | ease 2044 | blanket 2045 | participate 2046 | cheesy 2047 | gray 2048 | daisy 2049 | toddler 2050 | link 2051 | diamond 2052 | propose 2053 | dallas 2054 | president 2055 | ranger 2056 | wolf 2057 | gain 2058 | chai 2059 | annoying 2060 | earring 2061 | version 2062 | basket 2063 | lens 2064 | salary 2065 | corner 2066 | champion 2067 | firefighter 2068 | ferret 2069 | achieve 2070 | sears 2071 | mia 2072 | idol 2073 | joe 2074 | decade 2075 | emotion 2076 | koala 2077 | management 2078 | pharmacist 2079 | apps 2080 | depends 2081 | killer 2082 | fellow 2083 | uniform 2084 | gourmet 2085 | cleveland 2086 | motivate 2087 | hummus 2088 | entertainment 2089 | mmmm 2090 | bag 2091 | nashville 2092 | flirt 2093 | owen 2094 | trumpet 2095 | nevada 2096 | stage 2097 | jerky 2098 | responsibility 2099 | drake 2100 | bentley 2101 | gold 2102 | tx 2103 | arcade 2104 | ankle 2105 | vegas 2106 | kj 2107 | batman 2108 | unwind 2109 | keyboard 2110 | combination 2111 | leaf 2112 | koi 2113 | cello 2114 | minimum 2115 | adventurous 2116 | loving 2117 | kiddos 2118 | spill 2119 | diabetic 2120 | central 2121 | rob 2122 | trend 2123 | bubblegum 2124 | indoors 2125 | monster 2126 | drunk 2127 | confidence 2128 | pyramid 2129 | grunge 2130 | banker 2131 | breath 2132 | grasshopper 2133 | hoop 2134 | encourage 2135 | gutter 2136 | macaroni 2137 | robotics 2138 | double 2139 | seat 2140 | dew 2141 | uncomfortable 2142 | flash 2143 | bench 2144 | bomb 2145 | info 
2146 | semi 2147 | comfort 2148 | wildlife 2149 | geology 2150 | lonely 2151 | coz 2152 | edge 2153 | anniversary 2154 | vitamin 2155 | material 2156 | hotdog 2157 | gathering 2158 | socialize 2159 | machine 2160 | editor 2161 | droopy 2162 | brew 2163 | liberal 2164 | mercedes 2165 | quarterback 2166 | rn 2167 | planner 2168 | fortunate 2169 | ur 2170 | greenhouse 2171 | si 2172 | psychologist 2173 | promotion 2174 | def 2175 | discovery 2176 | carb 2177 | humane 2178 | broken 2179 | chanel 2180 | fluffy 2181 | chain 2182 | vision 2183 | stanford 2184 | pipe 2185 | ability 2186 | overweight 2187 | promote 2188 | labrador 2189 | veteran 2190 | preference 2191 | symphony 2192 | alpaca 2193 | ve 2194 | app 2195 | teenager 2196 | anne 2197 | promise 2198 | doo 2199 | publisher 2200 | curious 2201 | tiki 2202 | porsche 2203 | mixed 2204 | maid 2205 | legend 2206 | michael 2207 | supportive 2208 | pineapple 2209 | ariel 2210 | diabetes 2211 | consulting 2212 | starve 2213 | gal 2214 | collar 2215 | gable 2216 | battle 2217 | jacket 2218 | sexy 2219 | sleeve 2220 | felix 2221 | pastime 2222 | jamaica 2223 | mortal 2224 | weak 2225 | scrub 2226 | rabbit 2227 | godfather 2228 | sinatra 2229 | valley 2230 | despise 2231 | regret 2232 | goodwill 2233 | heaven 2234 | buddhist 2235 | smoking 2236 | oops 2237 | pitbulls 2238 | salmon 2239 | rick 2240 | bitcoin 2241 | dip 2242 | trout 2243 | pill 2244 | farming 2245 | thankful 2246 | tokyo 2247 | housewife 2248 | prayer 2249 | impala 2250 | valedictorian 2251 | plain 2252 | message 2253 | temper 2254 | flintstone 2255 | leprechaun 2256 | sucker 2257 | breathe 2258 | csi 2259 | criminal 2260 | rip 2261 | maiden 2262 | fascinating 2263 | rico 2264 | algeria 2265 | report 2266 | umm 2267 | patience 2268 | leader 2269 | curl 2270 | motivation 2271 | climbing 2272 | tahoe 2273 | ymca 2274 | relief 2275 | glacier 2276 | breast 2277 | enter 2278 | clutter 2279 | dull 2280 | fighter 2281 | tat 2282 | awake 2283 | brewery 2284 | victorian 2285 | volcano 2286 | friends 2287 | mount 2288 | pillage 2289 | magical 2290 | generation 2291 | clue 2292 | conscious 2293 | stare 2294 | silver 2295 | wrestling 2296 | levine 2297 | joint 2298 | restore 2299 | everest 2300 | dope 2301 | stray 2302 | international 2303 | parking 2304 | hampshire 2305 | hearse 2306 | warehouse 2307 | pitbull 2308 | nyu 2309 | outdoorsy 2310 | development 2311 | employment 2312 | drinker 2313 | zumba 2314 | paul 2315 | budget 2316 | daniel 2317 | eyesight 2318 | sour 2319 | mouth 2320 | stain 2321 | blogger 2322 | exist 2323 | rib 2324 | brush 2325 | interview 2326 | bff 2327 | custom 2328 | snuggle 2329 | vancouver 2330 | mario 2331 | ferraris 2332 | mural 2333 | poet 2334 | oriole 2335 | period 2336 | karma 2337 | damage 2338 | warmer 2339 | crossword 2340 | childrens 2341 | pomeranian 2342 | imaginary 2343 | dave 2344 | anatomy 2345 | tone 2346 | code 2347 | videogames 2348 | woodstock 2349 | convention 2350 | janitor 2351 | preschool 2352 | screen 2353 | prejudice 2354 | crystal 2355 | rage 2356 | tradition 2357 | chatting 2358 | traditional 2359 | parakeet 2360 | ramen 2361 | combat 2362 | multiple 2363 | crave 2364 | syrup 2365 | racing 2366 | highlight 2367 | communist 2368 | concentrate 2369 | waiter 2370 | ebooks 2371 | dodge 2372 | hp 2373 | boil 2374 | attic 2375 | medal 2376 | commitment 2377 | release 2378 | downtown 2379 | alligator 2380 | statement 2381 | debate 2382 | agreed 2383 | maga 2384 | homeschooled 2385 | strength 2386 | plumber 2387 | hippy 2388 | windy 2389 | condition 
2390 | smoothie 2391 | stair 2392 | content 2393 | depressed 2394 | ferrell 2395 | keto 2396 | remodel 2397 | donut 2398 | winner 2399 | playlist 2400 | wayne 2401 | nation 2402 | kpop 2403 | map 2404 | coon 2405 | junior 2406 | mum 2407 | tape 2408 | quake 2409 | smithsonian 2410 | washer 2411 | abigail 2412 | radiohead 2413 | humble 2414 | unicycle 2415 | administration 2416 | ontario 2417 | performance 2418 | truth 2419 | fred 2420 | ingredient 2421 | cucumber 2422 | beastie 2423 | orchestra 2424 | sewing 2425 | knock 2426 | culinary 2427 | sweat 2428 | seashell 2429 | impression 2430 | network 2431 | languages 2432 | tailgate 2433 | celebration 2434 | thomas 2435 | embarrass 2436 | born 2437 | mama 2438 | freeze 2439 | crap 2440 | fortune 2441 | figurine 2442 | confident 2443 | homebody 2444 | chemistry 2445 | collector 2446 | merna 2447 | arrive 2448 | titanic 2449 | meditation 2450 | bout 2451 | manta 2452 | announcer 2453 | solo 2454 | circle 2455 | md 2456 | funeral 2457 | engine 2458 | butt 2459 | delivery 2460 | ultimate 2461 | specialize 2462 | web 2463 | palm 2464 | absolute 2465 | investment 2466 | harsh 2467 | pistachio 2468 | loner 2469 | experiment 2470 | gut 2471 | austen 2472 | fuel 2473 | cramp 2474 | trauma 2475 | sleepy 2476 | celtic 2477 | press 2478 | draft 2479 | auto 2480 | sprite 2481 | obsession 2482 | sip 2483 | fifty 2484 | vinyl 2485 | swing 2486 | fool 2487 | hbu 2488 | harvey 2489 | copperfield 2490 | playoff 2491 | kite 2492 | lesbian 2493 | jerk 2494 | owe 2495 | democrat 2496 | mass 2497 | hamilton 2498 | ga 2499 | uk 2500 | luis 2501 | impress 2502 | slice 2503 | pita 2504 | hobbie 2505 | apologize 2506 | santa 2507 | tacos 2508 | landing 2509 | hometown 2510 | telecom 2511 | mater 2512 | mutt 2513 | deploy 2514 | del 2515 | sore 2516 | nancy 2517 | barbies 2518 | fam 2519 | clay 2520 | ethnic 2521 | pastry 2522 | hostage 2523 | tight 2524 | backyard 2525 | convince 2526 | maker 2527 | curry 2528 | android 2529 | pc 2530 | jessica 2531 | ignore 2532 | flow 2533 | sickness 2534 | elderly 2535 | chore 2536 | upholstery 2537 | sweetie 2538 | lettuce 2539 | cuba 2540 | gadget 2541 | animation 2542 | trooper 2543 | faith 2544 | tongue 2545 | success 2546 | gentle 2547 | portrait 2548 | sheeran 2549 | chevrolet 2550 | packer 2551 | risk 2552 | spark 2553 | frustrate 2554 | mouse 2555 | pitch 2556 | weld 2557 | eyebrow 2558 | bella 2559 | linebacker 2560 | bully 2561 | routine 2562 | spelling 2563 | bc 2564 | coat 2565 | saudi 2566 | arabia 2567 | tampa 2568 | emmy 2569 | samsung 2570 | mop 2571 | kevin 2572 | checker 2573 | teapot 2574 | weigh 2575 | suv 2576 | miserable 2577 | sevenfold 2578 | f150 2579 | lit 2580 | posse 2581 | thai 2582 | curator 2583 | steve 2584 | poop 2585 | historical 2586 | morty 2587 | cane 2588 | miley 2589 | wise 2590 | petition 2591 | tear 2592 | penn 2593 | astronaut 2594 | cod 2595 | colour 2596 | acting 2597 | precious 2598 | buck 2599 | lucy 2600 | muse 2601 | cosmetic 2602 | occupation 2603 | nba 2604 | ate 2605 | flexible 2606 | ideal 2607 | suspender 2608 | bang 2609 | direct 2610 | gotti 2611 | agitate 2612 | hairdresser 2613 | dealership 2614 | influence 2615 | cursive 2616 | sunfish 2617 | snorkel 2618 | shallow 2619 | root 2620 | pediatrician 2621 | compost 2622 | coaster 2623 | nearby 2624 | foreman 2625 | deadbeat 2626 | penny 2627 | jay 2628 | jasper 2629 | tarot 2630 | pressure 2631 | clarinet 2632 | supper 2633 | express 2634 | ai 2635 | martini 2636 | favor 2637 | chop 2638 | lutefisk 2639 | charge 2640 | dakota 
2641 | hitchhike 2642 | formal 2643 | ivy 2644 | raptor 2645 | battlestar 2646 | captain 2647 | disgust 2648 | task 2649 | sitcom 2650 | yorkie 2651 | coco 2652 | understood 2653 | naw 2654 | ant 2655 | stinky 2656 | speckle 2657 | title 2658 | corporate 2659 | wednesday 2660 | gambler 2661 | wage 2662 | multi 2663 | mma 2664 | cookbook 2665 | citizen 2666 | hazel 2667 | aspiration 2668 | goat 2669 | stuck 2670 | lumberjack 2671 | flag 2672 | wet 2673 | ufc 2674 | learning 2675 | stirling 2676 | dealer 2677 | grisham 2678 | acre 2679 | --------------------------------------------------------------------------------
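The file above is the candidate-keyword vocabulary used by the ConvAI2 preprocessing step, stored as one keyword per line. As a minimal sketch of how such a list might be loaded, assuming only that format (the repository's actual loader lives in preprocess/convai2/api.py and is not shown here, so the helper name below is hypothetical):

    # Hypothetical helper, not part of the repository: it assumes
    # candi_keyword.txt stores exactly one keyword per line.
    def load_candidate_keywords(path='preprocess/convai2/candi_keyword.txt'):
        with open(path, encoding='utf-8') as f:
            # Strip surrounding whitespace and skip any blank lines.
            return [line.strip() for line in f if line.strip()]

    if __name__ == '__main__':
        keywords = load_candidate_keywords()
        keyword_set = set(keywords)  # constant-time membership checks
        print(len(keywords), keywords[:5])

Keeping both the ordered list and a set is a common pattern here: the list preserves the file's ordering, while the set supports fast membership tests when filtering dialogue tokens against the candidate vocabulary.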