├── models ├── __init__.py ├── helpers.pyc ├── __init__.pyc ├── dual_encoder.pyc ├── __pycache__ │ ├── helpers.cpython-35.pyc │ ├── __init__.cpython-35.pyc │ └── dual_encoder.cpython-35.pyc ├── helpers.py └── dual_encoder.py ├── Images └── model.jpg ├── train.sh ├── predict.sh ├── udc_metrics.py ├── requirements.txt ├── README.md ├── udc_test.py ├── data_sample └── persona │ ├── train.csv │ ├── test.csv │ └── valid.csv ├── udc_hparams.py ├── udc_train.py ├── udc_inputs.py ├── udc_predict.py ├── udc_model.py └── prepare_data.py /models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Images/model.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fannn1217/persona.chatbot/HEAD/Images/model.jpg -------------------------------------------------------------------------------- /models/helpers.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fannn1217/persona.chatbot/HEAD/models/helpers.pyc -------------------------------------------------------------------------------- /models/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fannn1217/persona.chatbot/HEAD/models/__init__.pyc -------------------------------------------------------------------------------- /models/dual_encoder.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fannn1217/persona.chatbot/HEAD/models/dual_encoder.pyc -------------------------------------------------------------------------------- /models/__pycache__/helpers.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fannn1217/persona.chatbot/HEAD/models/__pycache__/helpers.cpython-35.pyc -------------------------------------------------------------------------------- /models/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fannn1217/persona.chatbot/HEAD/models/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /models/__pycache__/dual_encoder.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fannn1217/persona.chatbot/HEAD/models/__pycache__/dual_encoder.cpython-35.pyc -------------------------------------------------------------------------------- /train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64" 3 | export CUDA_HOME=/usr/local/cuda 4 | export CUDA_VISIBLE_DEVICES=0 5 | python3 udc_train.py 6 | -------------------------------------------------------------------------------- /predict.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64" 3 | export CUDA_HOME=/usr/local/cuda 4 | export CUDA_VISIBLE_DEVICES=0 5 | python3 udc_predict.py --model_dir=./runs/1480971822/ -------------------------------------------------------------------------------- /udc_metrics.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import functools 3 | from tensorflow.contrib.learn.python.learn.metric_spec import MetricSpec 4 | 5 | 6 | def create_evaluation_metrics(): 7 | eval_metrics = {} 8 | for k in [1, 2, 5, 10]: 9 | eval_metrics["recall_at_%d" % k] = MetricSpec(metric_fn=functools.partial( 10 | tf.contrib.metrics.streaming_sparse_recall_at_k, 11 | k=k)) 12 | return eval_metrics 13 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | appnope 2 | backports.shutil-get-terminal-size 3 | cycler 4 | decorator 5 | entrypoints 6 | ipykernel 7 | ipython 8 | ipython-genutils 9 | ipywidgets 10 | Jinja2 11 | jsonschema 12 | jupyter 13 | jupyter-client 14 | jupyter-console 15 | jupyter-core 16 | numpy 17 | MarkupSafe 18 | matplotlib 19 | mistune 20 | nbconvert 21 | nbformat 22 | notebook 23 | pexpect 24 | pandas 25 | pickleshare 26 | protobuf 27 | ptyprocess 28 | Pygments 29 | pyparsing 30 | python-dateutil 31 | pytz 32 | pyzmq 33 | qtconsole 34 | scikit-learn 35 | scipy 36 | simplegeneric 37 | six 38 | terminado 39 | termcolor 40 | tornado 41 | traitlets 42 | widgetsnbextension 43 | 44 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # persona.chatbot 2 | 3 | #### almost change the code from [wildml](https://github.com/dennybritz/chatbot-retrieval/), the code implements the Dual LSTM Encoder model from [The Ubuntu Dialogue Corpus: A Large Dataset for Research in Unstructured Multi-Turn Dialogue Systems](http://arxiv.org/abs/1506.08909). 4 | 5 | ### Overview 6 | 7 | The code here simply implements the [Personalizing Dialogue Agents: I have a dog, do you have pets too? ](https://arxiv.org/abs/1801.07243) 8 | 9 | Use the first persona sentence as input 10 | 11 | ![image](https://github.com/fannn1217/persona.chatbot/blob/master/Images/model.jpg) 12 | 13 | ### Setup 14 | 15 | ``` 16 | Python 3 17 | tensorflow 1.3 18 | ``` 19 | 20 | ### Get the Data 21 | 22 | from ParlAI [github](https://github.com/facebookresearch/ParlAI/tree/master/projects/personachat) 23 | 24 | 25 | ### Training 26 | 27 | ``` 28 | python udc_train.py 29 | ``` 30 | 31 | ### Evaluation 32 | 33 | ``` 34 | python udc_test.py --model_dir=... 35 | ``` 36 | 37 | ### Prediction 38 | 39 | ``` 40 | python udc_predict.py --model_dir=... 41 | ``` 42 | -------------------------------------------------------------------------------- /udc_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import itertools 4 | import sys 5 | import tensorflow as tf 6 | import udc_model 7 | import udc_hparams 8 | import udc_metrics 9 | import udc_inputs 10 | from models.dual_encoder import dual_encoder_model 11 | os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID" 12 | os.environ["CUDA_VISIBLE_DEVICES"]="1" 13 | 14 | 15 | tf.flags.DEFINE_string("test_file", "./data/persona/test.tfrecords", "Path of test data in TFRecords format") 16 | tf.flags.DEFINE_string("model_dir", "./runs/1542774662", "Directory to load model checkpoints from") 17 | tf.flags.DEFINE_integer("loglevel", 20, "Tensorflow log level") 18 | tf.flags.DEFINE_integer("test_batch_size", 8, "Batch size for testing") 19 | FLAGS = tf.flags.FLAGS 20 | 21 | if not FLAGS.model_dir: 22 | print("You must specify a model directory") 23 | sys.exit(1) 24 | 25 | tf.logging.set_verbosity(FLAGS.loglevel) 26 | 27 | if __name__ == "__main__": 28 | hparams = udc_hparams.create_hparams() 29 | model_fn = udc_model.create_model_fn(hparams, model_impl=dual_encoder_model) 30 | estimator = tf.contrib.learn.Estimator( 31 | model_fn=model_fn, 32 | model_dir=FLAGS.model_dir, 33 | config=tf.contrib.learn.RunConfig()) 34 | 35 | input_fn_test = udc_inputs.create_input_fn( 36 | mode=tf.contrib.learn.ModeKeys.EVAL, 37 | input_files=[FLAGS.test_file], 38 | batch_size=FLAGS.test_batch_size, 39 | num_epochs=1) 40 | 41 | eval_metrics = udc_metrics.create_evaluation_metrics() 42 | estimator.evaluate(input_fn=input_fn_test, steps=None, metrics=eval_metrics) 43 | -------------------------------------------------------------------------------- /models/helpers.py: -------------------------------------------------------------------------------- 1 | import array 2 | import numpy as np 3 | import tensorflow as tf 4 | from collections import defaultdict 5 | 6 | def load_vocab(filename): 7 | vocab = None 8 | with open(filename) as f: 9 | vocab = f.read().splitlines() 10 | dct = defaultdict(int) 11 | for idx, word in enumerate(vocab): 12 | dct[word] = idx 13 | return [vocab, dct] 14 | 15 | def load_glove_vectors(filename, vocab): 16 | """ 17 | Load glove vectors from a .txt file. 18 | Optionally limit the vocabulary to save memory. `vocab` should be a set. 19 | """ 20 | dct = {} 21 | vectors = array.array('d') 22 | current_idx = 0 23 | with open(filename, "r", encoding="utf-8") as f: 24 | for _, line in enumerate(f): 25 | tokens = line.split(" ") 26 | word = tokens[0] 27 | entries = tokens[1:] 28 | if not vocab or word in vocab: 29 | dct[word] = current_idx 30 | vectors.extend(float(x) for x in entries) 31 | current_idx += 1 32 | word_dim = len(entries) 33 | num_vectors = len(dct) 34 | tf.logging.info("Found {} out of {} vectors in Glove".format(num_vectors, len(vocab))) 35 | return [np.array(vectors).reshape(num_vectors, word_dim), dct] 36 | 37 | 38 | def build_initial_embedding_matrix(vocab_dict, glove_dict, glove_vectors, embedding_dim): 39 | initial_embeddings = np.random.uniform(-0.25, 0.25, (len(vocab_dict), embedding_dim)).astype("float32") 40 | for word, glove_word_idx in glove_dict.items(): 41 | word_idx = vocab_dict.get(word) 42 | initial_embeddings[word_idx, :] = glove_vectors[glove_word_idx] -------------------------------------------------------------------------------- /data_sample/persona/train.csv: -------------------------------------------------------------------------------- 1 | Context,Utterance,Persona,Label 2 | "hi , how are you doing ? i am getting ready to do some cheetah chasing to stay in shape .","my mom was single with 3 boys , so we never left the projects .","i like to remodel homes.",0 3 | "hi , how are you doing ? i am getting ready to do some cheetah chasing to stay in shape .","i try to wear all black every day . it makes me feel comfortable .","i like to remodel homes.",0 4 | "hi , how are you doing ? i am getting ready to do some cheetah chasing to stay in shape .","well nursing stresses you out so i wish luck with sister","i like to remodel homes.",0 5 | "hi , how are you doing ? i am getting ready to do some cheetah chasing to stay in shape .","yeah just want to pick up nba nfl getting old","i like to remodel homes.",0 6 | "hi , how are you doing ? i am getting ready to do some cheetah chasing to stay in shape .","i really like celine dion . what about you ?","i like to remodel homes.",0 7 | "hi , how are you doing ? i am getting ready to do some cheetah chasing to stay in shape .","no . i live near farms .","i like to remodel homes.",0 8 | "hi , how are you doing ? i am getting ready to do some cheetah chasing to stay in shape .","i wish i had a daughter , i am a boy mom . they are beautiful boys though still lucky","i like to remodel homes.",0 9 | "hi , how are you doing ? i am getting ready to do some cheetah chasing to stay in shape .","yeah when i get bored i play gone with the wind my favorite movie .","i like to remodel homes.",0 10 | "hi , how are you doing ? i am getting ready to do some cheetah chasing to stay in shape .","hi how are you ? i am eating dinner with my hubby and 2 kids .","i like to remodel homes.",0 11 | "hi , how are you doing ? i am getting ready to do some cheetah chasing to stay in shape .","you must be very fast . hunting is one of my favorite hobbies .","i like to remodel homes.",1 -------------------------------------------------------------------------------- /udc_hparams.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from collections import namedtuple 3 | 4 | # Model Parameters 5 | tf.flags.DEFINE_integer( 6 | "vocab_size", 7 | #91620, 8 | #16758, 9 | #1727, 10 | 18537, 11 | "The size of the vocabulary. Only change this if you changed the preprocessing") 12 | 13 | # Model Parameters 14 | tf.flags.DEFINE_integer("embedding_dim", 100, "Dimensionality of the embeddings") 15 | tf.flags.DEFINE_integer("rnn_dim", 256, "Dimensionality of the RNN cell") 16 | tf.flags.DEFINE_integer("max_context_len", 100, "Truncate contexts to this length") 17 | tf.flags.DEFINE_integer("max_utterance_len", 50, "Truncate utterance to this length") 18 | tf.flags.DEFINE_integer("max_persona_len", 50, "Truncate persona to this length") 19 | 20 | # Pre-trained embeddings 21 | tf.flags.DEFINE_string("glove_path", None, "Path to pre-trained Glove vectors") 22 | tf.flags.DEFINE_string("vocab_path", None, "Path to vocabulary.txt file") 23 | 24 | # Training Parameters 25 | tf.flags.DEFINE_float("learning_rate", 0.001, "Learning rate") 26 | tf.flags.DEFINE_integer("batch_size", 64, "Batch size during training") 27 | tf.flags.DEFINE_integer("eval_batch_size", 8, "Batch size during evaluation") 28 | tf.flags.DEFINE_string("optimizer", "Adam", "Optimizer Name (Adam, Adagrad, etc)") 29 | 30 | FLAGS = tf.flags.FLAGS 31 | 32 | HParams = namedtuple( 33 | "HParams", 34 | [ 35 | "batch_size", 36 | "embedding_dim", 37 | "eval_batch_size", 38 | "learning_rate", 39 | "max_context_len", 40 | "max_utterance_len", 41 | "max_persona_len", 42 | "optimizer", 43 | "rnn_dim", 44 | "vocab_size", 45 | "glove_path", 46 | "vocab_path" 47 | ]) 48 | 49 | def create_hparams(): 50 | return HParams( 51 | batch_size=FLAGS.batch_size, 52 | eval_batch_size=FLAGS.eval_batch_size, 53 | vocab_size=FLAGS.vocab_size, 54 | optimizer=FLAGS.optimizer, 55 | learning_rate=FLAGS.learning_rate, 56 | embedding_dim=FLAGS.embedding_dim, 57 | max_context_len=FLAGS.max_context_len, 58 | max_utterance_len=FLAGS.max_utterance_len, 59 | max_persona_len=FLAGS.max_persona_len, 60 | glove_path=FLAGS.glove_path, 61 | vocab_path=FLAGS.vocab_path, 62 | rnn_dim=FLAGS.rnn_dim) -------------------------------------------------------------------------------- /udc_train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import itertools 4 | import tensorflow as tf 5 | import udc_model 6 | import udc_hparams 7 | import udc_metrics 8 | import udc_inputs 9 | from models.dual_encoder import dual_encoder_model 10 | os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID" 11 | os.environ["CUDA_VISIBLE_DEVICES"]="1" 12 | 13 | tf.flags.DEFINE_string("input_dir", "./data/persona", "Directory containing input data files 'train.tfrecords' and 'validation.tfrecords'") 14 | tf.flags.DEFINE_string("model_dir", None, "Directory to store model checkpoints (defaults to ./runs)") 15 | tf.flags.DEFINE_integer("loglevel", 20, "Tensorflow log level") 16 | tf.flags.DEFINE_integer("num_epochs", None, "Number of training Epochs. Defaults to indefinite.") 17 | tf.flags.DEFINE_integer("eval_every", 2000, "Evaluate after this many train steps") 18 | FLAGS = tf.flags.FLAGS 19 | 20 | TIMESTAMP = int(time.time()) 21 | 22 | if FLAGS.model_dir: 23 | MODEL_DIR = FLAGS.model_dir 24 | else: 25 | MODEL_DIR = os.path.abspath(os.path.join("./runs", str(TIMESTAMP))) 26 | 27 | TRAIN_FILE = os.path.abspath(os.path.join(FLAGS.input_dir, "train.tfrecords")) 28 | VALIDATION_FILE = os.path.abspath(os.path.join(FLAGS.input_dir, "validation.tfrecords")) 29 | 30 | tf.logging.set_verbosity(FLAGS.loglevel) 31 | 32 | def main(unused_argv): 33 | hparams = udc_hparams.create_hparams() 34 | 35 | model_fn = udc_model.create_model_fn( 36 | hparams, 37 | model_impl=dual_encoder_model) 38 | 39 | estimator = tf.contrib.learn.Estimator( 40 | model_fn=model_fn, 41 | model_dir=MODEL_DIR, 42 | config=tf.contrib.learn.RunConfig()) 43 | 44 | input_fn_train = udc_inputs.create_input_fn( 45 | mode=tf.contrib.learn.ModeKeys.TRAIN, 46 | input_files=[TRAIN_FILE], 47 | batch_size=hparams.batch_size, 48 | num_epochs=FLAGS.num_epochs) 49 | 50 | input_fn_eval = udc_inputs.create_input_fn( 51 | mode=tf.contrib.learn.ModeKeys.EVAL, 52 | input_files=[VALIDATION_FILE], 53 | batch_size=hparams.eval_batch_size, 54 | num_epochs=1) 55 | 56 | eval_metrics = udc_metrics.create_evaluation_metrics() 57 | 58 | eval_monitor = tf.contrib.learn.monitors.ValidationMonitor( 59 | input_fn=input_fn_eval, 60 | every_n_steps=FLAGS.eval_every, 61 | metrics=eval_metrics) 62 | 63 | estimator.fit(input_fn=input_fn_train, steps=None, monitors=[eval_monitor]) 64 | 65 | if __name__ == "__main__": 66 | tf.app.run() 67 | -------------------------------------------------------------------------------- /data_sample/persona/test.csv: -------------------------------------------------------------------------------- 1 | Context,Ground Truth Utterance,Persona,Distractor_0,Distractor_1,Distractor_2,Distractor_3,Distractor_4,Distractor_5,Distractor_6,Distractor_7,Distractor_8 2 | "hello , how are you doing tonight ?",i am well an loving this interaction how are you ?,i love to meet new people.,god help you and make you rich one day .,it is okay . nothing special !,do you live in a large or small city ? i am in large,i like blue hair . is it dark blue or light blue ?,hey ! coach johnny here ! how are you,great ! i do work on my fathers hobby farm in my spare time,yes ! one photo is going in an ad at my agency too,i try to go to the gym every once in a while,"that is so nice ! i wanted girls but have two boys . would not trade them , though !" 3 | i am great . i just got back from the club .,this is my favorite time of the year season wise,i love to meet new people.,neat i follow the stock market carefully since becoming a stock broker,have a cat . i wanted a dog but i am allergic .,"oh nice ! i live in new york but i do not drive , though .",cannot blame you there . i know the feeling every time i take a camping trip,i still have roommates here in seattle,i am from california . where are you from ?,i am not really into sports . i like robots though !,i do not blame you . i work as a registered nurse .,no i do not . i like theatre though 4 | i would rather eat chocolate cake during this season .,what club did you go to ? me an timothy watched tv,i love to meet new people.,fine how are you on this cold day ?,what do you paint ? and it is not the same .,the only language i speak is english,i like some romance shows . they have to be realistic though .,i do how tall are you ?,business administration . is it hard to work as consulting manager,hi there ! how you doing today ? i just got home from school .,i use to have anxiety but then i took tae kwan do . i am a champ at it,i only have a turtle . 5 | i went to club chino . what show are you watching ?,lol oh okay kind of random,i love to meet new people.,"when i move to nyc , i hope to hear some jazz on the streets .",it is very pretty ! do you like asia ?,like water or oceans ? i am terrified .,layla is at great cat name . i am huge and tall,in screenplays i like reading horror . love to see difference versus movie,since when am i a dog ? that is quite rude !,no i am just on my lunch break,"hello , how are you today ?",her name is ellenor rose . we alos have a 4th family member whose a dog names milly -------------------------------------------------------------------------------- /data_sample/persona/valid.csv: -------------------------------------------------------------------------------- 1 | Context,Ground Truth Utterance,Persona,Distractor_0,Distractor_1,Distractor_2,Distractor_3,Distractor_4,Distractor_5,Distractor_6,Distractor_7,Distractor_8 2 | hello what are doing today ?,"i am good , i just got off work and tired , i have two jobs .",i read twenty books a year.,oh really ? i am actually in high school and i am graduating as class of 2019 !,that is an interesting choice . i would have to pick french fries,i just got a pet fish for my 18th birthday yesterday from my parents .,"yeah , well what about you ?",my favorite watch is the rolex ? what is yours ?,what is in spain that is so interesting,i do not like clowns . they are scary to a kid like me,poetry . roses are red . violet are . . . ?,"my father is a member of the army , served for 10 years now ." 3 | i just got done watching a horror movie,"i rather read , i have read about 20 books this year .",i read twenty books a year.,why have you not sent help ? ! the scorpions are stinging my legs ! ree ! ! ! ! ! ! !,that is great i am expecting twins in two months . will these be your first kids ?,do you live on a farm or ranch ?,hi how are you doing tonight i am fine .,i would love to see her do that .,i do not . but i am so glad you do something that brings you joy,"it is hard , buy my dog keeps me company . do you have dogs ?","sounds like a good plan , what would you like to teach ?",i like rap music and i also produce for music artists . 4 | wow ! i do love a good horror movie . loving this cooler weather,but a good movie is always good .,i read twenty books a year.,oh i am sure they are,that would be great ! we just bought a house so no travel soon,"lol , i must go too . the imelda marcos shoe collection on qvc is on",i am so sorry to hear that . my father also passed away . he drove for nascar .,yes we do . my hair also goes down to my waist,"i am just fine . my name is priya , and you ?","gotta celebrate ! i am so old , i recall when nobody owned a tv .","very carefully , did you get flooded in texas ?",with kids in my house i can not listen to rap anymore . 5 | yes ! my son is in junior high and i just started letting him watch them too,i work in the movies as well .,i read twenty books a year.,that is great ! are you going to college ?,ugh . i do not think i could live with them .,i sing folk music . my parents are not supportive .,i love piano music . do you play jazz ?,"sure , your parents will have to take you here , i am also a teacher in kindergarten","that is nice , i need a new car , how is the bmw ?",all natural . . . second time . . . twins this time .,one is a tabby yet overweight and the other is a black and white cat,go to national parks . think about what i missed out on in high school . you ? -------------------------------------------------------------------------------- /udc_inputs.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | TEXT_FEATURE_SIZE = 100 4 | 5 | def get_feature_columns(mode): 6 | feature_columns = [] 7 | 8 | feature_columns.append(tf.contrib.layers.real_valued_column( 9 | column_name="context", dimension=TEXT_FEATURE_SIZE, dtype=tf.int64)) 10 | feature_columns.append(tf.contrib.layers.real_valued_column( 11 | column_name="context_len", dimension=1, dtype=tf.int64)) 12 | feature_columns.append(tf.contrib.layers.real_valued_column( 13 | column_name="utterance", dimension=TEXT_FEATURE_SIZE, dtype=tf.int64)) 14 | feature_columns.append(tf.contrib.layers.real_valued_column( 15 | column_name="utterance_len", dimension=1, dtype=tf.int64)) 16 | feature_columns.append(tf.contrib.layers.real_valued_column( 17 | column_name="persona", dimension=TEXT_FEATURE_SIZE, dtype=tf.int64)) 18 | feature_columns.append(tf.contrib.layers.real_valued_column( 19 | column_name="persona_len", dimension=1, dtype=tf.int64)) 20 | 21 | if mode == tf.contrib.learn.ModeKeys.TRAIN: 22 | # During training we have a label feature 23 | feature_columns.append(tf.contrib.layers.real_valued_column( 24 | column_name="label", dimension=1, dtype=tf.int64)) 25 | 26 | if mode == tf.contrib.learn.ModeKeys.EVAL: 27 | # During evaluation we have distractors 28 | for i in range(9): 29 | feature_columns.append(tf.contrib.layers.real_valued_column( 30 | column_name="distractor_{}".format(i), dimension=TEXT_FEATURE_SIZE, dtype=tf.int64)) 31 | feature_columns.append(tf.contrib.layers.real_valued_column( 32 | column_name="distractor_{}_len".format(i), dimension=1, dtype=tf.int64)) 33 | 34 | return set(feature_columns) 35 | 36 | 37 | def create_input_fn(mode, input_files, batch_size, num_epochs): 38 | def input_fn(): 39 | features = tf.contrib.layers.create_feature_spec_for_parsing( 40 | get_feature_columns(mode)) 41 | 42 | feature_map = tf.contrib.learn.io.read_batch_features( 43 | file_pattern=input_files, 44 | batch_size=batch_size, 45 | features=features, 46 | reader=tf.TFRecordReader, 47 | randomize_input=True, 48 | num_epochs=num_epochs, 49 | queue_capacity=200000 + batch_size * 10, 50 | name="read_batch_features_{}".format(mode)) 51 | 52 | # This is an ugly hack because of a current bug in tf.learn 53 | # During evaluation TF tries to restore the epoch variable which isn't defined during training 54 | # So we define the variable manually here 55 | if mode == tf.contrib.learn.ModeKeys.TRAIN: 56 | tf.get_variable( 57 | "read_batch_features_eval/file_name_queue/limit_epochs/epochs", 58 | initializer=tf.constant(0, dtype=tf.int64)) 59 | 60 | if mode == tf.contrib.learn.ModeKeys.TRAIN: 61 | target = feature_map.pop("label") 62 | else: 63 | # In evaluation we have 10 classes (utterances). 64 | # The first one (index 0) is always the correct one 65 | target = tf.zeros([batch_size, 1], dtype=tf.int64) 66 | return feature_map, target 67 | return input_fn 68 | -------------------------------------------------------------------------------- /models/dual_encoder.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | from models import helpers 4 | 5 | FLAGS = tf.flags.FLAGS 6 | 7 | def get_embeddings(hparams): 8 | if hparams.glove_path and hparams.vocab_path: 9 | tf.logging.info("Loading Glove embeddings...") 10 | vocab_array, vocab_dict = helpers.load_vocab(hparams.vocab_path) 11 | glove_vectors, glove_dict = helpers.load_glove_vectors(hparams.glove_path, vocab=set(vocab_array)) 12 | initializer = helpers.build_initial_embedding_matrix(vocab_dict, glove_dict, glove_vectors, hparams.embedding_dim) 13 | else: 14 | tf.logging.info("No glove/vocab path specificed, starting with random embeddings.") 15 | initializer = tf.random_uniform_initializer(-0.25, 0.25) 16 | 17 | return tf.get_variable( 18 | "word_embeddings", 19 | shape=[hparams.vocab_size, hparams.embedding_dim], 20 | initializer=initializer) 21 | 22 | 23 | def dual_encoder_model( 24 | hparams, 25 | mode, 26 | context, 27 | context_len, 28 | utterance, 29 | utterance_len, 30 | persona, 31 | persona_len, 32 | targets): 33 | 34 | # Initialize embedidngs randomly or with pre-trained vectors if available 35 | embeddings_W = get_embeddings(hparams) 36 | 37 | # Embed the context and the utterance 38 | context_embedded = tf.nn.embedding_lookup( 39 | embeddings_W, context, name="embed_context") 40 | utterance_embedded = tf.nn.embedding_lookup( 41 | embeddings_W, utterance, name="embed_utterance") 42 | persona_embedded = tf.nn.embedding_lookup( 43 | embeddings_W, persona, name="embed_persona") 44 | 45 | 46 | # Build the RNN 47 | with tf.variable_scope("rnn") as vs: 48 | # We use an LSTM Cell 49 | cell = tf.nn.rnn_cell.LSTMCell( 50 | hparams.rnn_dim, 51 | forget_bias=2.0, 52 | use_peepholes=True, 53 | state_is_tuple=True) 54 | 55 | # Run the utterance and context through the RNN 56 | rnn_outputs, rnn_states = tf.nn.dynamic_rnn( 57 | cell, 58 | tf.concat([context_embedded, utterance_embedded, persona_embedded], 0), 59 | sequence_length=tf.concat([context_len, utterance_len, persona_len,], 0), 60 | dtype=tf.float32) 61 | encoding_context, encoding_utterance , encoding_persona = tf.split(rnn_states.h,3,0) 62 | 63 | with tf.variable_scope("prediction") as vs: 64 | M = tf.get_variable("M", 65 | shape=[hparams.rnn_dim, hparams.rnn_dim], 66 | initializer=tf.truncated_normal_initializer()) 67 | 68 | # sum encoding_context, encoding_persona 69 | pc = tf.add(encoding_context, encoding_persona) 70 | 71 | # "Predict" a response: c * M 72 | generated_response = tf.matmul(pc, M) 73 | generated_response = tf.expand_dims(generated_response, 2) 74 | encoding_utterance = tf.expand_dims(encoding_utterance, 2) 75 | 76 | # Dot product between generated response and actual response 77 | # (c * M) * r 78 | logits = tf.matmul(generated_response, encoding_utterance, True) 79 | logits = tf.squeeze(logits, [2]) 80 | 81 | # Apply sigmoid to convert logits to probabilities 82 | probs = tf.sigmoid(logits) 83 | 84 | if mode == tf.contrib.learn.ModeKeys.INFER: 85 | return probs, None 86 | 87 | # Calculate the binary cross-entropy loss 88 | losses = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=tf.to_float(targets)) 89 | 90 | # Mean loss across the batch of examples 91 | mean_loss = tf.reduce_mean(losses, name="mean_loss") 92 | return probs, mean_loss 93 | -------------------------------------------------------------------------------- /udc_predict.py: -------------------------------------------------------------------------------- 1 | import time 2 | import sys 3 | import numpy as np 4 | import tensorflow as tf 5 | import udc_model 6 | import udc_hparams 7 | from models.dual_encoder import dual_encoder_model 8 | import pandas as pd 9 | from termcolor import colored 10 | import os 11 | os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID" 12 | os.environ["CUDA_VISIBLE_DEVICES"]="1" 13 | 14 | tf.flags.DEFINE_string("model_dir", "./runs/1542774662", "Directory to load model checkpoints from") 15 | tf.flags.DEFINE_string("vocab_processor_file", "./data/persona/vocab_processor.bin", "Saved vocabulary processor file") 16 | FLAGS = tf.flags.FLAGS 17 | 18 | if not FLAGS.model_dir: 19 | print("You must specify a model directory") 20 | sys.exit(1) 21 | 22 | def tokenizer_fn(iterator): 23 | return (x.split(" ") for x in iterator) 24 | 25 | def get_features(context, persona, utterances): 26 | context_matrix = np.array(list(vp.transform([context]))) 27 | persona_matrix = np.array(list(vp.transform([persona]))) 28 | utterance_matrix = np.array(list(vp.transform([utterances[0]]))) 29 | context_len = len(context.split(" ")) 30 | persona_len = len(persona.split(" ")) 31 | utterance_len = len(utterances[0].split(" ")) 32 | features = { 33 | "context": tf.convert_to_tensor(context_matrix, dtype=tf.int64), 34 | "context_len": tf.constant(context_len, shape=[1,1], dtype=tf.int64), 35 | "persona": tf.convert_to_tensor(persona_matrix, dtype=tf.int64), 36 | "persona_len": tf.constant(persona_len, shape=[1,1], dtype=tf.int64), 37 | "utterance": tf.convert_to_tensor(utterance_matrix, dtype=tf.int64), 38 | "utterance_len": tf.constant(utterance_len, shape=[1,1], dtype=tf.int64), 39 | "len":len(utterances) 40 | } 41 | 42 | for i in range(1,len(utterances)): 43 | utterance = utterances[i]; 44 | 45 | utterance_matrix = np.array(list(vp.transform([utterance]))) 46 | utterance_len = len(utterance.split(" ")) 47 | 48 | features["utterance_{}".format(i)] = tf.convert_to_tensor(utterance_matrix, dtype=tf.int64) 49 | features["utterance_{}_len".format(i)] = tf.constant(utterance_len, shape=[1,1], dtype=tf.int64) 50 | 51 | return features, None 52 | 53 | if __name__ == "__main__": 54 | # tf.logging.set_verbosity(tf.logging.INFO) 55 | # Load vocabulary 56 | vp = tf.contrib.learn.preprocessing.VocabularyProcessor.restore( 57 | FLAGS.vocab_processor_file) 58 | 59 | # Load data for predict 60 | test_df = pd.read_csv("./data/persona/predict.csv") 61 | #elementId = 0 62 | #INPUT_CONTEXT = test_df.Context[elementId] 63 | #INPUT_PERSONA = test_df.Persona[elementId] 64 | #POTENTIAL_RESPONSES = test_df.iloc[elementId,2:].values 65 | 66 | hparams = udc_hparams.create_hparams() 67 | model_fn = udc_model.create_model_fn(hparams, model_impl=dual_encoder_model) 68 | 69 | estimator = tf.contrib.learn.Estimator(model_fn=model_fn, model_dir=FLAGS.model_dir) 70 | 71 | #starttime = time.time() 72 | for elementId in range(10): 73 | INPUT_CONTEXT = test_df.Context[elementId] 74 | INPUT_PERSONA = test_df.Persona[elementId] 75 | POTENTIAL_RESPONSES = test_df.iloc[elementId,2:].values 76 | prob = estimator.predict(input_fn=lambda: get_features(INPUT_CONTEXT, INPUT_PERSONA, POTENTIAL_RESPONSES)) 77 | results = next(prob) 78 | print('\n') 79 | print(colored('[ Context]', on_color='on_blue',color="white"),INPUT_CONTEXT) 80 | print(colored('[ Persona]', on_color='on_blue',color="white"),INPUT_PERSONA) 81 | #print("[Results value ]",results) 82 | answerId = results.argmax(axis=0) 83 | if answerId==0: 84 | print(colored('[ Answer]', on_color='on_green'), POTENTIAL_RESPONSES[answerId]) 85 | else: 86 | print (colored('[ Answer]', on_color='on_red'),POTENTIAL_RESPONSES[answerId]) 87 | print (colored('[Right answer]', on_color='on_green'), POTENTIAL_RESPONSES[0]) 88 | 89 | #endtime = time.time() 90 | print('\n') 91 | #print(colored('[Predict time]', on_color='on_blue',color="white"),"%.2f sec" % round(endtime - starttime,2)) -------------------------------------------------------------------------------- /udc_model.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import sys 3 | 4 | def get_id_feature(features, key, len_key, max_len): 5 | ids = features[key] 6 | ids_len = tf.squeeze(features[len_key], [1]) 7 | ids_len = tf.minimum(ids_len, tf.constant(max_len, dtype=tf.int64)) 8 | return ids, ids_len 9 | 10 | def create_train_op(loss, hparams): 11 | train_op = tf.contrib.layers.optimize_loss( 12 | loss=loss, 13 | global_step=tf.contrib.framework.get_global_step(), 14 | learning_rate=hparams.learning_rate, 15 | clip_gradients=10.0, 16 | optimizer=hparams.optimizer) 17 | return train_op 18 | 19 | 20 | def create_model_fn(hparams, model_impl): 21 | 22 | def model_fn(features, targets, mode): 23 | context, context_len = get_id_feature( 24 | features, "context", "context_len", hparams.max_context_len) 25 | 26 | utterance, utterance_len = get_id_feature( 27 | features, "utterance", "utterance_len", hparams.max_utterance_len) 28 | 29 | persona, persona_len = get_id_feature( 30 | features, "persona", "persona_len", hparams.max_persona_len) 31 | 32 | if mode == tf.contrib.learn.ModeKeys.TRAIN: 33 | probs, loss = model_impl( 34 | hparams, 35 | mode, 36 | context, 37 | context_len, 38 | utterance, 39 | utterance_len, 40 | persona, 41 | persona_len, 42 | targets) 43 | train_op = create_train_op(loss, hparams) 44 | return probs, loss, train_op 45 | 46 | if mode == tf.contrib.learn.ModeKeys.INFER: 47 | 48 | all_contexts = [context] 49 | all_context_lens = [context_len] 50 | all_utterances = [utterance] 51 | all_utterance_lens = [utterance_len] 52 | all_personas = [persona] 53 | all_persona_lens = [persona_len] 54 | 55 | for i in range(1,features["len"]): 56 | distractor, distractor_len = get_id_feature(features, 57 | "utterance_{}".format(i), 58 | "utterance_{}_len".format(i), 59 | hparams.max_utterance_len) 60 | all_contexts.append(context) 61 | all_context_lens.append(context_len) 62 | all_utterances.append(distractor) 63 | all_utterance_lens.append(distractor_len) 64 | all_personas.append(persona) 65 | all_persona_lens.append(persona_len) 66 | 67 | probs, loss = model_impl( 68 | hparams, 69 | mode, 70 | tf.concat(all_contexts,0), 71 | tf.concat(all_context_lens,0), 72 | tf.concat(all_utterances,0), 73 | tf.concat(all_utterance_lens,0), 74 | tf.concat(all_personas,0), 75 | tf.concat(all_persona_lens,0), 76 | None) 77 | 78 | split_probs = tf.split(probs, features["len"],0) 79 | probs = tf.concat(split_probs,1) 80 | 81 | return probs, 0.0, None 82 | 83 | if mode == tf.contrib.learn.ModeKeys.EVAL: 84 | batch_size = targets.get_shape().as_list()[0] 85 | 86 | # We have 10 exampels per record, so we accumulate them 87 | all_contexts = [context] 88 | all_context_lens = [context_len] 89 | all_utterances = [utterance] 90 | all_utterance_lens = [utterance_len] 91 | all_personas = [persona] 92 | all_persona_lens = [persona_len] 93 | all_targets = [tf.ones([batch_size, 1], dtype=tf.int64)] 94 | 95 | for i in range(9): 96 | distractor, distractor_len = get_id_feature(features, 97 | "distractor_{}".format(i), 98 | "distractor_{}_len".format(i), 99 | hparams.max_utterance_len) 100 | all_contexts.append(context) 101 | all_context_lens.append(context_len) 102 | all_utterances.append(distractor) 103 | all_utterance_lens.append(distractor_len) 104 | all_personas.append(persona) 105 | all_persona_lens.append(persona_len) 106 | all_targets.append( 107 | tf.zeros([batch_size, 1], dtype=tf.int64) 108 | ) 109 | 110 | probs, loss = model_impl( 111 | hparams, 112 | mode, 113 | tf.concat(all_contexts,0), 114 | tf.concat(all_context_lens,0), 115 | tf.concat(all_utterances,0), 116 | tf.concat(all_utterance_lens,0), 117 | tf.concat(all_personas,0), 118 | tf.concat(all_persona_lens,0), 119 | tf.concat(all_targets,0)) 120 | 121 | split_probs = tf.split(probs,10,0) 122 | shaped_probs = tf.concat(split_probs,1) 123 | 124 | # Add summaries 125 | tf.summary.histogram("eval_correct_probs_hist", split_probs[0]) 126 | tf.summary.scalar("eval_correct_probs_average", tf.reduce_mean(split_probs[0])) 127 | tf.summary.histogram("eval_incorrect_probs_hist", split_probs[1]) 128 | tf.summary.scalar("eval_incorrect_probs_average", tf.reduce_mean(split_probs[1])) 129 | 130 | return shaped_probs, loss, None 131 | 132 | return model_fn 133 | -------------------------------------------------------------------------------- /prepare_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import itertools 4 | import functools 5 | import tensorflow as tf 6 | import numpy as np 7 | import array 8 | 9 | tf.flags.DEFINE_integer( 10 | "min_word_frequency", 5, "Minimum frequency of words in the vocabulary") 11 | 12 | tf.flags.DEFINE_integer("max_sentence_len", 100, "Maximum Sentence Length") 13 | 14 | tf.flags.DEFINE_string( 15 | "input_dir", os.path.abspath("./data/persona"), 16 | "Input directory containing original CSV data files (default = './data')") 17 | 18 | tf.flags.DEFINE_string( 19 | "output_dir", os.path.abspath("./data/persona"), 20 | "Output directory for TFrEcord files (default = './data')") 21 | 22 | FLAGS = tf.flags.FLAGS 23 | 24 | TRAIN_PATH = os.path.join(FLAGS.input_dir, "train_shuf.csv") 25 | VALIDATION_PATH = os.path.join(FLAGS.input_dir, "valid.csv") 26 | TEST_PATH = os.path.join(FLAGS.input_dir, "test.csv") 27 | 28 | def tokenizer_fn(iterator): 29 | return (x.split(" ") for x in iterator) 30 | 31 | def create_csv_iter(filename): 32 | """ 33 | Returns an iterator over a CSV file. Skips the header. 34 | """ 35 | with open(filename) as csvfile: 36 | reader = csv.reader(csvfile) 37 | # Skip the header 38 | next(reader) 39 | for row in reader: 40 | yield row 41 | 42 | 43 | def create_vocab(input_iter, min_frequency): 44 | """ 45 | Creates and returns a VocabularyProcessor object with the vocabulary 46 | for the input iterator. 47 | """ 48 | vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor( 49 | FLAGS.max_sentence_len, 50 | min_frequency=min_frequency, 51 | tokenizer_fn=tokenizer_fn) 52 | vocab_processor.fit(input_iter) 53 | return vocab_processor 54 | 55 | 56 | def transform_sentence(sequence, vocab_processor): 57 | """ 58 | Maps a single sentence into the integer vocabulary. Returns a python array. 59 | """ 60 | return next(vocab_processor.transform([sequence])).tolist() 61 | 62 | 63 | def create_text_sequence_feature(fl, sentence, sentence_len, vocab): 64 | """ 65 | Writes a sentence to FeatureList protocol buffer 66 | """ 67 | sentence_transformed = transform_sentence(sentence, vocab) 68 | for word_id in sentence_transformed: 69 | fl.feature.add().int64_list.value.extend([word_id]) 70 | return fl 71 | 72 | 73 | def create_example_train(row, vocab): 74 | """ 75 | Creates a training example for the Ubuntu Dialog Corpus dataset. 76 | Returnsthe a tensorflow.Example Protocol Buffer object. 77 | """ 78 | context, utterance, persona, label = row 79 | context_transformed = transform_sentence(context, vocab) 80 | utterance_transformed = transform_sentence(utterance, vocab) 81 | persona_transformed = transform_sentence(persona, vocab) 82 | context_len = len(next(vocab._tokenizer([context]))) 83 | utterance_len = len(next(vocab._tokenizer([utterance]))) 84 | persona_len = len(next(vocab._tokenizer([persona]))) 85 | label = int(float(label)) 86 | 87 | # New Example 88 | example = tf.train.Example() 89 | example.features.feature["context"].int64_list.value.extend(context_transformed) 90 | example.features.feature["utterance"].int64_list.value.extend(utterance_transformed) 91 | example.features.feature["persona"].int64_list.value.extend(persona_transformed) 92 | example.features.feature["context_len"].int64_list.value.extend([context_len]) 93 | example.features.feature["utterance_len"].int64_list.value.extend([utterance_len]) 94 | example.features.feature["persona_len"].int64_list.value.extend([persona_len]) 95 | example.features.feature["label"].int64_list.value.extend([label]) 96 | return example 97 | 98 | 99 | def create_example_test(row, vocab): 100 | """ 101 | Creates a test/validation example for the Ubuntu Dialog Corpus dataset. 102 | Returnsthe a tensorflow.Example Protocol Buffer object. 103 | """ 104 | context, utterance, persona = row[:3] 105 | distractors = row[3:] 106 | context_len = len(next(vocab._tokenizer([context]))) 107 | utterance_len = len(next(vocab._tokenizer([utterance]))) 108 | persona_len = len(next(vocab._tokenizer([persona]))) 109 | context_transformed = transform_sentence(context, vocab) 110 | utterance_transformed = transform_sentence(utterance, vocab) 111 | persona_transformed = transform_sentence(persona, vocab) 112 | 113 | # New Example 114 | example = tf.train.Example() 115 | example.features.feature["context"].int64_list.value.extend(context_transformed) 116 | example.features.feature["utterance"].int64_list.value.extend(utterance_transformed) 117 | example.features.feature["persona"].int64_list.value.extend(persona_transformed) 118 | example.features.feature["context_len"].int64_list.value.extend([context_len]) 119 | example.features.feature["utterance_len"].int64_list.value.extend([utterance_len]) 120 | example.features.feature["persona_len"].int64_list.value.extend([persona_len]) 121 | 122 | # Distractor sequences 123 | for i, distractor in enumerate(distractors): 124 | dis_key = "distractor_{}".format(i) 125 | dis_len_key = "distractor_{}_len".format(i) 126 | # Distractor Length Feature 127 | dis_len = len(next(vocab._tokenizer([distractor]))) 128 | example.features.feature[dis_len_key].int64_list.value.extend([dis_len]) 129 | # Distractor Text Feature 130 | dis_transformed = transform_sentence(distractor, vocab) 131 | example.features.feature[dis_key].int64_list.value.extend(dis_transformed) 132 | return example 133 | 134 | 135 | def create_tfrecords_file(input_filename, output_filename, example_fn): 136 | """ 137 | Creates a TFRecords file for the given input data and 138 | example transofmration function 139 | """ 140 | writer = tf.python_io.TFRecordWriter(output_filename) 141 | print("Creating TFRecords file at {}...".format(output_filename)) 142 | for i, row in enumerate(create_csv_iter(input_filename)): 143 | x = example_fn(row) 144 | writer.write(x.SerializeToString()) 145 | writer.close() 146 | print("Wrote to {}".format(output_filename)) 147 | 148 | 149 | def write_vocabulary(vocab_processor, outfile): 150 | """ 151 | Writes the vocabulary to a file, one word per line. 152 | """ 153 | vocab_size = len(vocab_processor.vocabulary_) 154 | with open(outfile, "w") as vocabfile: 155 | for id in range(vocab_size): 156 | word = vocab_processor.vocabulary_._reverse_mapping[id] 157 | vocabfile.write(word + "\n") 158 | print("Saved vocabulary to {}".format(outfile)) 159 | 160 | 161 | if __name__ == "__main__": 162 | print("Creating vocabulary...") 163 | input_iter = create_csv_iter(TRAIN_PATH) 164 | input_iter = (x[0] + " " + x[1] + " " + x[2]for x in input_iter) #context+utter+persona 165 | vocab = create_vocab(input_iter, min_frequency=FLAGS.min_word_frequency) 166 | print("Total vocabulary size: {}".format(len(vocab.vocabulary_))) 167 | 168 | # Create vocabulary.txt file 169 | write_vocabulary( 170 | vocab, os.path.join(FLAGS.output_dir, "vocabulary.txt")) 171 | 172 | # Save vocab processor 173 | vocab.save(os.path.join(FLAGS.output_dir, "vocab_processor.bin")) 174 | 175 | # Create validation.tfrecords 176 | create_tfrecords_file( 177 | input_filename=VALIDATION_PATH, 178 | output_filename=os.path.join(FLAGS.output_dir, "validation.tfrecords"), 179 | example_fn=functools.partial(create_example_test, vocab=vocab)) 180 | 181 | # Create test.tfrecords 182 | create_tfrecords_file( 183 | input_filename=TEST_PATH, 184 | output_filename=os.path.join(FLAGS.output_dir, "test.tfrecords"), 185 | example_fn=functools.partial(create_example_test, vocab=vocab)) 186 | 187 | # Create train.tfrecords 188 | create_tfrecords_file( 189 | input_filename=TRAIN_PATH, 190 | output_filename=os.path.join(FLAGS.output_dir, "train.tfrecords"), 191 | example_fn=functools.partial(create_example_train, vocab=vocab)) 192 | --------------------------------------------------------------------------------