├── models
    ├── __init__.py
    ├── helpers.pyc
    ├── __init__.pyc
    ├── dual_encoder.pyc
    ├── __pycache__
    │   ├── helpers.cpython-35.pyc
    │   ├── __init__.cpython-35.pyc
    │   └── dual_encoder.cpython-35.pyc
    ├── helpers.py
    └── dual_encoder.py
├── Images
    └── model.jpg
├── train.sh
├── predict.sh
├── udc_metrics.py
├── requirements.txt
├── README.md
├── udc_test.py
├── data_sample
    └── persona
    │   ├── train.csv
    │   ├── test.csv
    │   └── valid.csv
├── udc_hparams.py
├── udc_train.py
├── udc_inputs.py
├── udc_predict.py
├── udc_model.py
└── prepare_data.py


/models/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/Images/model.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fannn1217/persona.chatbot/HEAD/Images/model.jpg


--------------------------------------------------------------------------------
/models/helpers.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fannn1217/persona.chatbot/HEAD/models/helpers.pyc


--------------------------------------------------------------------------------
/models/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fannn1217/persona.chatbot/HEAD/models/__init__.pyc


--------------------------------------------------------------------------------
/models/dual_encoder.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fannn1217/persona.chatbot/HEAD/models/dual_encoder.pyc


--------------------------------------------------------------------------------
/models/__pycache__/helpers.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fannn1217/persona.chatbot/HEAD/models/__pycache__/helpers.cpython-35.pyc


--------------------------------------------------------------------------------
/models/__pycache__/__init__.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fannn1217/persona.chatbot/HEAD/models/__pycache__/__init__.cpython-35.pyc


--------------------------------------------------------------------------------
/models/__pycache__/dual_encoder.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fannn1217/persona.chatbot/HEAD/models/__pycache__/dual_encoder.cpython-35.pyc


--------------------------------------------------------------------------------
/train.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
# Launch training on a CUDA-enabled machine.
# Any extra command-line arguments are forwarded to udc_train.py.

# Append the CUDA library directories. The ${VAR:+...} form avoids a leading
# ':' when LD_LIBRARY_PATH is unset or empty; an empty entry would make the
# dynamic linker search the current working directory.
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+$LD_LIBRARY_PATH:}/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
export CUDA_HOME=/usr/local/cuda
export CUDA_VISIBLE_DEVICES=0
python3 udc_train.py "$@"
6 | 


--------------------------------------------------------------------------------
/predict.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
# Run prediction on a CUDA-enabled machine.
# Extra arguments are forwarded after the default --model_dir, so a later
# --model_dir=... on the command line takes precedence over the default.

# Append the CUDA library directories. The ${VAR:+...} form avoids a leading
# ':' when LD_LIBRARY_PATH is unset or empty; an empty entry would make the
# dynamic linker search the current working directory.
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+$LD_LIBRARY_PATH:}/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
export CUDA_HOME=/usr/local/cuda
export CUDA_VISIBLE_DEVICES=0
python3 udc_predict.py --model_dir=./runs/1480971822/ "$@"


--------------------------------------------------------------------------------
/udc_metrics.py:
--------------------------------------------------------------------------------
 1 | import tensorflow as tf
 2 | import functools
 3 | from tensorflow.contrib.learn.python.learn.metric_spec import MetricSpec
 4 | 
 5 | 
 6 | def create_evaluation_metrics():
 7 |     eval_metrics = {}
 8 |     for k in [1, 2, 5, 10]:
 9 |         eval_metrics["recall_at_%d" % k] = MetricSpec(metric_fn=functools.partial(
10 |             tf.contrib.metrics.streaming_sparse_recall_at_k,
11 |             k=k))
12 |     return eval_metrics
13 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | appnope
 2 | backports.shutil-get-terminal-size
 3 | cycler
 4 | decorator
 5 | entrypoints
 6 | ipykernel
 7 | ipython
 8 | ipython-genutils
 9 | ipywidgets
10 | Jinja2
11 | jsonschema
12 | jupyter
13 | jupyter-client
14 | jupyter-console
15 | jupyter-core
16 | numpy
17 | MarkupSafe
18 | matplotlib
19 | mistune
20 | nbconvert
21 | nbformat
22 | notebook
23 | pexpect
24 | pandas
25 | pickleshare
26 | protobuf
27 | ptyprocess
28 | Pygments
29 | pyparsing
30 | python-dateutil
31 | pytz
32 | pyzmq
33 | qtconsole
34 | scikit-learn
35 | scipy
36 | simplegeneric
37 | six
38 | terminado
39 | termcolor
40 | tornado
41 | traitlets
42 | widgetsnbextension
43 | 
44 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # persona.chatbot
 2 | 
 3 | #### Most of the code is adapted from [wildml](https://github.com/dennybritz/chatbot-retrieval/), which implements the Dual LSTM Encoder model from [The Ubuntu Dialogue Corpus: A Large Dataset for Research in Unstructured Multi-Turn Dialogue Systems](http://arxiv.org/abs/1506.08909).
 4 | 
 5 | ### Overview
 6 | 
 7 | The code here is a simple implementation of [Personalizing Dialogue Agents: I have a dog, do you have pets too?](https://arxiv.org/abs/1801.07243).
 8 | 
 9 | Only the first persona sentence is used as input.
10 | 
11 | ![image](https://github.com/fannn1217/persona.chatbot/blob/master/Images/model.jpg)
12 | 
13 | ### Setup
14 | 
15 | ```
16 | Python 3
17 | tensorflow 1.3
18 | ```
19 | 
20 | ### Get the Data
21 | 
22 | Download the data from the ParlAI [GitHub repository](https://github.com/facebookresearch/ParlAI/tree/master/projects/personachat).
23 | 
24 | 
25 | ### Training
26 | 
27 | ```
28 | python udc_train.py
29 | ```
30 | 
31 | ### Evaluation
32 | 
33 | ```
34 | python udc_test.py --model_dir=...
35 | ```
36 | 
37 | ### Prediction
38 | 
39 | ```
40 | python udc_predict.py --model_dir=...
41 | ```
42 | 


--------------------------------------------------------------------------------
/udc_test.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import time
 3 | import itertools
 4 | import sys
 5 | import tensorflow as tf
 6 | import udc_model
 7 | import udc_hparams
 8 | import udc_metrics
 9 | import udc_inputs
10 | from models.dual_encoder import dual_encoder_model
# Pin the GPU only when the caller has not already chosen one, so an exported
# CUDA_VISIBLE_DEVICES is respected instead of being silently overwritten.
os.environ.setdefault("CUDA_DEVICE_ORDER", "PCI_BUS_ID")
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "1")


# Command-line flags for the evaluation script.
tf.flags.DEFINE_string("test_file", "./data/persona/test.tfrecords", "Path of test data in TFRecords format")
tf.flags.DEFINE_string("model_dir", "./runs/1542774662", "Directory to load model checkpoints from")
tf.flags.DEFINE_integer("loglevel", 20, "Tensorflow log level")
tf.flags.DEFINE_integer("test_batch_size", 8, "Batch size for testing")
FLAGS = tf.flags.FLAGS

# model_dir has a non-empty default, so this only triggers when the flag is
# explicitly passed as an empty string.
if not FLAGS.model_dir:
  print("You must specify a model directory")
  sys.exit(1)

tf.logging.set_verbosity(FLAGS.loglevel)
26 | 
if __name__ == "__main__":
  # Build an Estimator over the checkpoints in --model_dir and run a single
  # pass over the test TFRecords, reporting the recall@k metrics.
  estimator = tf.contrib.learn.Estimator(
    model_fn=udc_model.create_model_fn(
      udc_hparams.create_hparams(),
      model_impl=dual_encoder_model),
    model_dir=FLAGS.model_dir,
    config=tf.contrib.learn.RunConfig())

  test_input_fn = udc_inputs.create_input_fn(
    mode=tf.contrib.learn.ModeKeys.EVAL,
    input_files=[FLAGS.test_file],
    batch_size=FLAGS.test_batch_size,
    num_epochs=1)

  # steps=None: evaluate until the input is exhausted (one epoch).
  estimator.evaluate(
    input_fn=test_input_fn,
    steps=None,
    metrics=udc_metrics.create_evaluation_metrics())
43 | 


--------------------------------------------------------------------------------
/models/helpers.py:
--------------------------------------------------------------------------------
 1 | import array
 2 | import numpy as np
 3 | import tensorflow as tf
 4 | from collections import defaultdict
 5 | 
 6 | def load_vocab(filename):
 7 |   vocab = None
 8 |   with open(filename) as f:
 9 |     vocab = f.read().splitlines()
10 |   dct = defaultdict(int)
11 |   for idx, word in enumerate(vocab):
12 |     dct[word] = idx
13 |   return [vocab, dct]
14 | 
def load_glove_vectors(filename, vocab):
  """
  Load glove vectors from a .txt file.

  Each line is parsed as `word v1 v2 ... vN`, with fields separated by single
  spaces.

  Args:
    filename: Path to the GloVe text file (read as UTF-8).
    vocab: Optional set of words to keep, to save memory. A falsy value
      (None or an empty set) keeps every word in the file.

  Returns:
    A two-element list `[vectors, dct]`. `vectors` is a float64 numpy array
    of shape (num_vectors, word_dim) and `dct` maps each kept word to its row
    index in `vectors`.
  """
  dct = {}
  vectors = array.array('d')
  # Stays 0 for an empty file instead of raising NameError after the loop.
  word_dim = 0
  with open(filename, "r", encoding="utf-8") as f:
    for line in f:
      tokens = line.rstrip("\n").split(" ")
      word = tokens[0]
      entries = tokens[1:]
      word_dim = len(entries)
      if vocab and word not in vocab:
        continue
      if word in dct:
        # Keep the first occurrence only; storing a duplicate would add a row
        # without adding a key, and the reshape below would then fail.
        continue
      dct[word] = len(dct)
      vectors.extend(float(x) for x in entries)
  num_vectors = len(dct)
  # With no vocabulary filter every vector found is wanted, so report that
  # count rather than calling len() on None.
  wanted = len(vocab) if vocab else num_vectors
  tf.logging.info("Found {} out of {} vectors in Glove".format(num_vectors, wanted))
  return [np.array(vectors).reshape(num_vectors, word_dim), dct]
36 | 
37 | 
def build_initial_embedding_matrix(vocab_dict, glove_dict, glove_vectors, embedding_dim):
  """Create an embedding matrix seeded with pre-trained vectors.

  Rows start as uniform random values in [-0.25, 0.25); rows of words that
  have a pre-trained vector are then overwritten with that vector.

  Args:
    vocab_dict: Mapping from word to row index in the embedding matrix.
    glove_dict: Mapping from word to row index in `glove_vectors`.
    glove_vectors: 2-D array of pre-trained vectors; each row must have
      `embedding_dim` entries.
    embedding_dim: Number of columns of the returned matrix.

  Returns:
    A float32 numpy array of shape (len(vocab_dict), embedding_dim).
  """
  initial_embeddings = np.random.uniform(-0.25, 0.25, (len(vocab_dict), embedding_dim)).astype("float32")
  for word, glove_word_idx in glove_dict.items():
    word_idx = vocab_dict.get(word)
    if word_idx is None:
      # Indexing with None would broadcast the vector over every row, so
      # words outside the vocabulary are skipped.
      continue
    initial_embeddings[word_idx, :] = glove_vectors[glove_word_idx]
  return initial_embeddings


--------------------------------------------------------------------------------
/data_sample/persona/train.csv:
--------------------------------------------------------------------------------
 1 | Context,Utterance,Persona,Label
 2 | "hi , how are you doing ? i am getting ready to do some cheetah chasing to stay in shape .","my mom was single with 3 boys , so we never left the projects .","i like to remodel homes.",0
 3 | "hi , how are you doing ? i am getting ready to do some cheetah chasing to stay in shape .","i try to wear all black every day . it makes me feel comfortable .","i like to remodel homes.",0
 4 | "hi , how are you doing ? i am getting ready to do some cheetah chasing to stay in shape .","well nursing stresses you out so i wish luck with sister","i like to remodel homes.",0
 5 | "hi , how are you doing ? i am getting ready to do some cheetah chasing to stay in shape .","yeah just want to pick up nba nfl getting old","i like to remodel homes.",0
 6 | "hi , how are you doing ? i am getting ready to do some cheetah chasing to stay in shape .","i really like celine dion . what about you ?","i like to remodel homes.",0
 7 | "hi , how are you doing ? i am getting ready to do some cheetah chasing to stay in shape .","no . i live near farms .","i like to remodel homes.",0
 8 | "hi , how are you doing ? i am getting ready to do some cheetah chasing to stay in shape .","i wish i had a daughter , i am a boy mom . they are beautiful boys though still lucky","i like to remodel homes.",0
 9 | "hi , how are you doing ? i am getting ready to do some cheetah chasing to stay in shape .","yeah when i get bored i play gone with the wind my favorite movie .","i like to remodel homes.",0
10 | "hi , how are you doing ? i am getting ready to do some cheetah chasing to stay in shape .","hi how are you ? i am eating dinner with my hubby and 2 kids .","i like to remodel homes.",0
11 | "hi , how are you doing ? i am getting ready to do some cheetah chasing to stay in shape .","you must be very fast . hunting is one of my favorite hobbies .","i like to remodel homes.",1


--------------------------------------------------------------------------------
/udc_hparams.py:
--------------------------------------------------------------------------------
 1 | import tensorflow as tf
 2 | from collections import namedtuple
 3 | 
 4 | # Model Parameters
# Vocabulary size flag.
tf.flags.DEFINE_integer(
  "vocab_size",
  # Alternative sizes left by the author: 91620, 16758, 1727
  # (presumably from other preprocessing runs; confirm before reusing).
  18537,
  "The size of the vocabulary. Only change this if you changed the preprocessing")

# Model Parameters
tf.flags.DEFINE_integer("embedding_dim", 100, "Dimensionality of the embeddings")
tf.flags.DEFINE_integer("rnn_dim", 256, "Dimensionality of the RNN cell")
tf.flags.DEFINE_integer("max_context_len", 100, "Truncate contexts to this length")
tf.flags.DEFINE_integer("max_utterance_len", 50, "Truncate utterance to this length")
tf.flags.DEFINE_integer("max_persona_len", 50, "Truncate persona to this length")

# Pre-trained embeddings (both default to None, i.e. not provided)
tf.flags.DEFINE_string("glove_path", None, "Path to pre-trained Glove vectors")
tf.flags.DEFINE_string("vocab_path", None, "Path to vocabulary.txt file")

# Training Parameters
tf.flags.DEFINE_float("learning_rate", 0.001, "Learning rate")
tf.flags.DEFINE_integer("batch_size", 64, "Batch size during training")
tf.flags.DEFINE_integer("eval_batch_size", 8, "Batch size during evaluation")
tf.flags.DEFINE_string("optimizer", "Adam", "Optimizer Name (Adam, Adagrad, etc)")

# Shared flag container; create_hparams reads its values from here.
FLAGS = tf.flags.FLAGS
31 | 
# Immutable bundle of the model and training hyper-parameters.
HParams = namedtuple(
  "HParams",
  "batch_size embedding_dim eval_batch_size learning_rate "
  "max_context_len max_utterance_len max_persona_len optimizer "
  "rnn_dim vocab_size glove_path vocab_path")
48 | 
def create_hparams():
  """Snapshot the current flag values into an HParams tuple.

  Every HParams field is filled from the flag of the same name.

  Returns:
    An HParams instance.
  """
  return HParams(**{name: getattr(FLAGS, name) for name in HParams._fields})


--------------------------------------------------------------------------------
/udc_train.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import time
 3 | import itertools
 4 | import tensorflow as tf
 5 | import udc_model
 6 | import udc_hparams
 7 | import udc_metrics
 8 | import udc_inputs
 9 | from models.dual_encoder import dual_encoder_model
# Pin the GPU only when the caller has not already chosen one, so a value
# exported beforehand (e.g. by train.sh) is respected instead of overwritten.
os.environ.setdefault("CUDA_DEVICE_ORDER", "PCI_BUS_ID")
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "1")

# Command-line flags for the training script.
tf.flags.DEFINE_string("input_dir", "./data/persona", "Directory containing input data files 'train.tfrecords' and 'validation.tfrecords'")
tf.flags.DEFINE_string("model_dir", None, "Directory to store model checkpoints (defaults to ./runs)")
tf.flags.DEFINE_integer("loglevel", 20, "Tensorflow log level")
tf.flags.DEFINE_integer("num_epochs", None, "Number of training Epochs. Defaults to indefinite.")
tf.flags.DEFINE_integer("eval_every", 2000, "Evaluate after this many train steps")
FLAGS = tf.flags.FLAGS

# Used to name the run directory when --model_dir is not given.
TIMESTAMP = int(time.time())

if FLAGS.model_dir:
  MODEL_DIR = FLAGS.model_dir
else:
  MODEL_DIR = os.path.abspath(os.path.join("./runs", str(TIMESTAMP)))

TRAIN_FILE = os.path.abspath(os.path.join(FLAGS.input_dir, "train.tfrecords"))
VALIDATION_FILE = os.path.abspath(os.path.join(FLAGS.input_dir, "validation.tfrecords"))

tf.logging.set_verbosity(FLAGS.loglevel)
31 | 
def main(unused_argv):
  """Train the dual encoder, evaluating on the validation set periodically.

  Args:
    unused_argv: Positional arguments supplied by tf.app.run; ignored.
  """
  hparams = udc_hparams.create_hparams()

  # Input pipelines: training reads for --num_epochs epochs, validation makes
  # a single pass each time it runs.
  train_input_fn = udc_inputs.create_input_fn(
    mode=tf.contrib.learn.ModeKeys.TRAIN,
    input_files=[TRAIN_FILE],
    batch_size=hparams.batch_size,
    num_epochs=FLAGS.num_epochs)
  validation_input_fn = udc_inputs.create_input_fn(
    mode=tf.contrib.learn.ModeKeys.EVAL,
    input_files=[VALIDATION_FILE],
    batch_size=hparams.eval_batch_size,
    num_epochs=1)

  # Runs the recall@k metrics every --eval_every training steps.
  validation_monitor = tf.contrib.learn.monitors.ValidationMonitor(
        input_fn=validation_input_fn,
        every_n_steps=FLAGS.eval_every,
        metrics=udc_metrics.create_evaluation_metrics())

  estimator = tf.contrib.learn.Estimator(
    model_fn=udc_model.create_model_fn(hparams, model_impl=dual_encoder_model),
    model_dir=MODEL_DIR,
    config=tf.contrib.learn.RunConfig())

  estimator.fit(input_fn=train_input_fn, steps=None, monitors=[validation_monitor])


if __name__ == "__main__":
  tf.app.run()
67 | 


--------------------------------------------------------------------------------
/data_sample/persona/test.csv:
--------------------------------------------------------------------------------
1 | Context,Ground Truth Utterance,Persona,Distractor_0,Distractor_1,Distractor_2,Distractor_3,Distractor_4,Distractor_5,Distractor_6,Distractor_7,Distractor_8
2 | "hello , how are you doing tonight ?",i am well an loving this interaction how are you ?,i love to meet new people.,god help you and make you rich one day .,it is okay . nothing special !,do you live in a large or small city ? i am in large,i like blue hair . is it dark blue or light blue ?,hey ! coach johnny here ! how are you,great ! i do work on my fathers hobby farm in my spare time,yes ! one photo is going in an ad at my agency too,i try to go to the gym every once in a while,"that is so nice ! i wanted girls but have two boys . would not trade them , though !"
3 | i am great . i just got back from the club .,this is my favorite time of the year season wise,i love to meet new people.,neat i follow the stock market carefully since becoming a stock broker,have a cat . i wanted a dog but i am allergic .,"oh nice ! i live in new york but i do not drive , though .",cannot blame you there . i know the feeling every time i take a camping trip,i still have roommates here in seattle,i am from california . where are you from ?,i am not really into sports . i like robots though !,i do not blame you . i work as a registered nurse .,no i do not . i like theatre though
4 | i would rather eat chocolate cake during this season .,what club did you go to ? me an timothy watched tv,i love to meet new people.,fine how are you on this cold day ?,what do you paint ? and it is not the same .,the only language i speak is english,i like some romance shows . they have to be realistic though .,i do how tall are you ?,business administration . is it hard to work as consulting manager,hi there ! how you doing today ? i just got home from school .,i use to have anxiety but then i took tae kwan do . i am a champ at it,i only have a turtle .
5 | i went to club chino . what show are you watching ?,lol oh okay kind of random,i love to meet new people.,"when i move to nyc , i hope to hear some jazz on the streets .",it is very pretty ! do you like asia ?,like water or oceans ? i am terrified .,layla is at great cat name . i am huge and tall,in screenplays i like reading horror . love to see difference versus movie,since when am i a dog ? that is quite rude !,no i am just on my lunch break,"hello , how are you today ?",her name is ellenor rose . we alos have a 4th family member whose a dog names milly


--------------------------------------------------------------------------------
/data_sample/persona/valid.csv:
--------------------------------------------------------------------------------
1 | Context,Ground Truth Utterance,Persona,Distractor_0,Distractor_1,Distractor_2,Distractor_3,Distractor_4,Distractor_5,Distractor_6,Distractor_7,Distractor_8
2 | hello what are doing today ?,"i am good , i just got off work and tired , i have two jobs .",i read twenty books a year.,oh really ? i am actually in high school and i am graduating as class of 2019 !,that is an interesting choice . i would have to pick french fries,i just got a pet fish for my 18th birthday yesterday from my parents .,"yeah , well what about you ?",my favorite watch is the rolex ? what is yours ?,what is in spain that is so interesting,i do not like clowns . they are scary to a kid like me,poetry . roses are red . violet are . . . ?,"my father is a member of the army , served for 10 years now ."
3 | i just got done watching a horror movie,"i rather read , i have read about 20 books this year .",i read twenty books a year.,why have you not sent help ? ! the scorpions are stinging my legs ! ree ! ! ! ! ! ! !,that is great i am expecting twins in two months . will these be your first kids ?,do you live on a farm or ranch ?,hi how are you doing tonight i am fine .,i would love to see her do that .,i do not . but i am so glad you do something that brings you joy,"it is hard , buy my dog keeps me company . do you have dogs ?","sounds like a good plan , what would you like to teach ?",i like rap music and i also produce for music artists .
4 | wow ! i do love a good horror movie . loving this cooler weather,but a good movie is always good .,i read twenty books a year.,oh i am sure they are,that would be great ! we just bought a house so no travel soon,"lol , i must go too . the imelda marcos shoe collection on qvc is on",i am so sorry to hear that . my father also passed away . he drove for nascar .,yes we do . my hair also goes down to my waist,"i am just fine . my name is priya , and you ?","gotta celebrate ! i am so old , i recall when nobody owned a tv .","very carefully , did you get flooded in texas ?",with kids in my house i can not listen to rap anymore .
5 | yes ! my son is in junior high and i just started letting him watch them too,i work in the movies as well .,i read twenty books a year.,that is great ! are you going to college ?,ugh . i do not think i could live with them .,i sing folk music . my parents are not supportive .,i love piano music . do you play jazz ?,"sure , your parents will have to take you here , i am also a teacher in kindergarten","that is nice , i need a new car , how is the bmw ?",all natural . . . second time . . . twins this time .,one is a tabby yet overweight and the other is a black and white cat,go to national parks . think about what i missed out on in high school . you ?


--------------------------------------------------------------------------------
/udc_inputs.py:
--------------------------------------------------------------------------------
 1 | import tensorflow as tf
 2 | 
 3 | TEXT_FEATURE_SIZE = 100
 4 | 
 5 | def get_feature_columns(mode):
 6 |   feature_columns = []
 7 | 
 8 |   feature_columns.append(tf.contrib.layers.real_valued_column(
 9 |     column_name="context", dimension=TEXT_FEATURE_SIZE, dtype=tf.int64))
10 |   feature_columns.append(tf.contrib.layers.real_valued_column(
11 |       column_name="context_len", dimension=1, dtype=tf.int64))
12 |   feature_columns.append(tf.contrib.layers.real_valued_column(
13 |       column_name="utterance", dimension=TEXT_FEATURE_SIZE, dtype=tf.int64))
14 |   feature_columns.append(tf.contrib.layers.real_valued_column(
15 |       column_name="utterance_len", dimension=1, dtype=tf.int64))
16 |   feature_columns.append(tf.contrib.layers.real_valued_column(
17 |       column_name="persona", dimension=TEXT_FEATURE_SIZE, dtype=tf.int64))
18 |   feature_columns.append(tf.contrib.layers.real_valued_column(
19 |       column_name="persona_len", dimension=1, dtype=tf.int64))
20 | 
21 |   if mode == tf.contrib.learn.ModeKeys.TRAIN:
22 |     # During training we have a label feature
23 |     feature_columns.append(tf.contrib.layers.real_valued_column(
24 |       column_name="label", dimension=1, dtype=tf.int64))
25 | 
26 |   if mode == tf.contrib.learn.ModeKeys.EVAL:
27 |     # During evaluation we have distractors
28 |     for i in range(9):
29 |       feature_columns.append(tf.contrib.layers.real_valued_column(
30 |         column_name="distractor_{}".format(i), dimension=TEXT_FEATURE_SIZE, dtype=tf.int64))
31 |       feature_columns.append(tf.contrib.layers.real_valued_column(
32 |         column_name="distractor_{}_len".format(i), dimension=1, dtype=tf.int64))
33 | 
34 |   return set(feature_columns)
35 | 
36 | 
def create_input_fn(mode, input_files, batch_size, num_epochs):
  """Create an input function that reads batched features from TFRecords.

  Args:
    mode: A tf.contrib.learn.ModeKeys value; selects the feature columns and
      how the target is produced.
    input_files: File pattern(s) passed to read_batch_features.
    batch_size: Number of examples per batch.
    num_epochs: Number of passes over the data, forwarded to the reader.

  Returns:
    A zero-argument function returning `(feature_map, target)`.
  """
  def input_fn():
    parsing_spec = tf.contrib.layers.create_feature_spec_for_parsing(
        get_feature_columns(mode))

    batch = tf.contrib.learn.io.read_batch_features(
        file_pattern=input_files,
        batch_size=batch_size,
        features=parsing_spec,
        reader=tf.TFRecordReader,
        randomize_input=True,
        num_epochs=num_epochs,
        queue_capacity=200000 + batch_size * 10,
        name="read_batch_features_{}".format(mode))

    if mode != tf.contrib.learn.ModeKeys.TRAIN:
      # In evaluation we have 10 classes (utterances).
      # The first one (index 0) is always the correct one
      return batch, tf.zeros([batch_size, 1], dtype=tf.int64)

    # This is an ugly hack because of a current bug in tf.learn
    # During evaluation TF tries to restore the epoch variable which isn't defined during training
    # So we define the variable manually here
    tf.get_variable(
      "read_batch_features_eval/file_name_queue/limit_epochs/epochs",
      initializer=tf.constant(0, dtype=tf.int64))

    # The label column is the training target, not a model input.
    label = batch.pop("label")
    return batch, label
  return input_fn
68 | 


--------------------------------------------------------------------------------
/models/dual_encoder.py:
--------------------------------------------------------------------------------
 1 | import tensorflow as tf
 2 | import numpy as np
 3 | from models import helpers
 4 | 
 5 | FLAGS = tf.flags.FLAGS
 6 | 
 7 | def get_embeddings(hparams):
 8 |   if hparams.glove_path and hparams.vocab_path:
 9 |     tf.logging.info("Loading Glove embeddings...")
10 |     vocab_array, vocab_dict = helpers.load_vocab(hparams.vocab_path)
11 |     glove_vectors, glove_dict = helpers.load_glove_vectors(hparams.glove_path, vocab=set(vocab_array))
12 |     initializer = helpers.build_initial_embedding_matrix(vocab_dict, glove_dict, glove_vectors, hparams.embedding_dim)
13 |   else:
14 |     tf.logging.info("No glove/vocab path specificed, starting with random embeddings.")
15 |     initializer = tf.random_uniform_initializer(-0.25, 0.25)
16 | 
17 |   return tf.get_variable(
18 |     "word_embeddings",
19 |     shape=[hparams.vocab_size, hparams.embedding_dim],
20 |     initializer=initializer)
21 | 
22 | 
def dual_encoder_model(
    hparams,
    mode,
    context,
    context_len,
    utterance,
    utterance_len,
    persona,
    persona_len,
    targets):
  """Build the dual-encoder graph that scores an utterance against context + persona.

  Context, utterance and persona are embedded with one shared embedding
  matrix and encoded by one shared LSTM. The context and persona encodings
  are summed, multiplied by a learned matrix M, and the dot product with the
  utterance encoding is used as the logit.

  Args:
    hparams: Hyper-parameters; `rnn_dim` is read here, the rest by get_embeddings.
    mode: A tf.contrib.learn.ModeKeys value; INFER skips the loss.
    context, utterance, persona: Token id tensors passed to embedding_lookup.
      They are concatenated along axis 0 after embedding, so they presumably
      share the same padded length -- TODO confirm against the input pipeline.
    context_len, utterance_len, persona_len: Sequence lengths, concatenated
      along axis 0 in the same order.
    targets: Labels for the cross-entropy loss; not used in INFER mode.

  Returns:
    `(probs, None)` in INFER mode, otherwise `(probs, mean_loss)`.
  """

  # Initialize embeddings randomly or with pre-trained vectors if available
  embeddings_W = get_embeddings(hparams)

  # Embed the context, the utterance and the persona
  context_embedded = tf.nn.embedding_lookup(
      embeddings_W, context, name="embed_context")
  utterance_embedded = tf.nn.embedding_lookup(
      embeddings_W, utterance, name="embed_utterance")
  persona_embedded = tf.nn.embedding_lookup(
      embeddings_W, persona, name="embed_persona")


  # Build the RNN
  with tf.variable_scope("rnn") as vs:
    # We use an LSTM Cell
    cell = tf.nn.rnn_cell.LSTMCell(
        hparams.rnn_dim,
        forget_bias=2.0,
        use_peepholes=True,
        state_is_tuple=True)

    # Run context, utterance and persona through the same RNN in one call by
    # stacking them along axis 0, then split the final hidden state back into
    # its three parts in the same order.
    rnn_outputs, rnn_states = tf.nn.dynamic_rnn(
        cell,
        tf.concat([context_embedded, utterance_embedded, persona_embedded], 0),
        sequence_length=tf.concat([context_len, utterance_len, persona_len,], 0),
        dtype=tf.float32)
    encoding_context, encoding_utterance , encoding_persona = tf.split(rnn_states.h,3,0)

  with tf.variable_scope("prediction") as vs:
    M = tf.get_variable("M",
      shape=[hparams.rnn_dim, hparams.rnn_dim],
      initializer=tf.truncated_normal_initializer())

    # Sum the context and persona encodings
    pc = tf.add(encoding_context, encoding_persona)

    # "Predict" a response: (c + p) * M
    generated_response = tf.matmul(pc, M)
    generated_response = tf.expand_dims(generated_response, 2)
    encoding_utterance = tf.expand_dims(encoding_utterance, 2)

    # Dot product between generated response and actual response
    # ((c + p) * M) * r; the third positional argument transposes the first operand
    logits = tf.matmul(generated_response, encoding_utterance, True)
    logits = tf.squeeze(logits, [2])

    # Apply sigmoid to convert logits to probabilities
    probs = tf.sigmoid(logits)

    # No targets are needed at inference time, so return before the loss is built
    if mode == tf.contrib.learn.ModeKeys.INFER:
      return probs, None

    # Calculate the binary cross-entropy loss
    losses = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=tf.to_float(targets))

  # Mean loss across the batch of examples
  mean_loss = tf.reduce_mean(losses, name="mean_loss")
  return probs, mean_loss
93 | 


--------------------------------------------------------------------------------
/udc_predict.py:
--------------------------------------------------------------------------------
 1 | import time
 2 | import sys
 3 | import numpy as np
 4 | import tensorflow as tf
 5 | import udc_model
 6 | import udc_hparams
 7 | from models.dual_encoder import dual_encoder_model
 8 | import pandas as pd
 9 | from termcolor import colored
10 | import os
# Pin the GPU only when the caller has not already chosen one, so a value
# exported beforehand (e.g. by predict.sh) is respected instead of overwritten.
os.environ.setdefault("CUDA_DEVICE_ORDER", "PCI_BUS_ID")
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "1")

# Command-line flags for the prediction script.
tf.flags.DEFINE_string("model_dir", "./runs/1542774662", "Directory to load model checkpoints from")
tf.flags.DEFINE_string("vocab_processor_file", "./data/persona/vocab_processor.bin", "Saved vocabulary processor file")
FLAGS = tf.flags.FLAGS

# model_dir has a non-empty default, so this only triggers when the flag is
# explicitly passed as an empty string.
if not FLAGS.model_dir:
  print("You must specify a model directory")
  sys.exit(1)
21 | 
def tokenizer_fn(iterator):
  """Lazily split each string of `iterator` on single spaces.

  NOTE(review): not called directly in this file; presumably kept so the
  saved VocabularyProcessor can resolve its tokenizer on restore -- confirm.
  """
  return map(lambda text: text.split(" "), iterator)
24 | 
def get_features(context, persona, utterances):
  """Build the feature dict for scoring candidate utterances.

  Relies on the module-level vocabulary processor `vp`, which is created in
  the `__main__` section before this function is called.

  Args:
    context: Context string, tokens separated by single spaces.
    persona: Persona string, tokens separated by single spaces.
    utterances: Sequence of candidate response strings. The first is stored
      under "utterance"; candidate i >= 1 under "utterance_<i>".

  Returns:
    `(features, None)`. `features` holds an int64 id tensor and a [1, 1]
    int64 length tensor for each text, plus "len", the number of candidates
    as a plain Python int.
  """
  def encode(text):
    ids = np.array(list(vp.transform([text])))
    # The id matrix has a fixed width, so a text with more tokens than that
    # is cut off; clip the reported length so it never exceeds the width.
    length = min(len(text.split(" ")), ids.shape[1])
    return (tf.convert_to_tensor(ids, dtype=tf.int64),
            tf.constant(length, shape=[1,1], dtype=tf.int64))

  features = {"len": len(utterances)}
  features["context"], features["context_len"] = encode(context)
  features["persona"], features["persona_len"] = encode(persona)
  features["utterance"], features["utterance_len"] = encode(utterances[0])

  for i in range(1, len(utterances)):
      ids, length = encode(utterances[i])
      features["utterance_{}".format(i)] = ids
      features["utterance_{}_len".format(i)] = length

  return features, None
52 | 
if __name__ == "__main__":
  # tf.logging.set_verbosity(tf.logging.INFO)
  # Load vocabulary. `vp` is deliberately a module-level name: get_features()
  # reads it as a global.
  vp = tf.contrib.learn.preprocessing.VocabularyProcessor.restore(
    FLAGS.vocab_processor_file)

  # Load data for predict. Columns 0 and 1 are read as Context and Persona;
  # every remaining column is treated as a candidate response, with the first
  # of them taken as the correct answer.
  test_df = pd.read_csv("./data/persona/predict.csv")
  #elementId = 0
  #INPUT_CONTEXT = test_df.Context[elementId]
  #INPUT_PERSONA = test_df.Persona[elementId]
  #POTENTIAL_RESPONSES = test_df.iloc[elementId,2:].values

  hparams = udc_hparams.create_hparams()
  model_fn = udc_model.create_model_fn(hparams, model_impl=dual_encoder_model)

  estimator = tf.contrib.learn.Estimator(model_fn=model_fn, model_dir=FLAGS.model_dir)

  # Score at most the first 10 rows. The bound is capped by the number of rows
  # so a predict.csv with fewer than 10 examples no longer raises on lookup.
  num_examples = min(10, len(test_df))

  #starttime = time.time()
  for elementId in range(num_examples):
    INPUT_CONTEXT = test_df.Context[elementId]
    INPUT_PERSONA = test_df.Persona[elementId]
    POTENTIAL_RESPONSES = test_df.iloc[elementId,2:].values
    # The lambda reads the three globals above; the prediction is consumed
    # right away via next(), so it always sees this iteration's values.
    prob = estimator.predict(input_fn=lambda: get_features(INPUT_CONTEXT, INPUT_PERSONA, POTENTIAL_RESPONSES))
    results = next(prob)
    print('\n')
    print(colored('[     Context]', on_color='on_blue',color="white"),INPUT_CONTEXT)
    print(colored('[     Persona]', on_color='on_blue',color="white"),INPUT_PERSONA)
    #print("[Results value ]",results)
    # Index of the highest-scoring candidate; index 0 is the correct response.
    answerId = results.argmax(axis=0)
    if answerId==0:
        print(colored('[      Answer]', on_color='on_green'), POTENTIAL_RESPONSES[answerId])
    else:
        print (colored('[      Answer]', on_color='on_red'),POTENTIAL_RESPONSES[answerId])
        print (colored('[Right answer]', on_color='on_green'), POTENTIAL_RESPONSES[0])

  #endtime = time.time()
  print('\n')
  #print(colored('[Predict time]', on_color='on_blue',color="white"),"%.2f sec" % round(endtime - starttime,2))


--------------------------------------------------------------------------------
/udc_model.py:
--------------------------------------------------------------------------------
  1 | import tensorflow as tf
  2 | import sys
  3 | 
  4 | def get_id_feature(features, key, len_key, max_len):
  5 |   ids = features[key]
  6 |   ids_len = tf.squeeze(features[len_key], [1])
  7 |   ids_len = tf.minimum(ids_len, tf.constant(max_len, dtype=tf.int64))
  8 |   return ids, ids_len
  9 | 
 10 | def create_train_op(loss, hparams):
 11 |   train_op = tf.contrib.layers.optimize_loss(
 12 |       loss=loss,
 13 |       global_step=tf.contrib.framework.get_global_step(),
 14 |       learning_rate=hparams.learning_rate,
 15 |       clip_gradients=10.0,
 16 |       optimizer=hparams.optimizer)
 17 |   return train_op
 18 | 
 19 | 
 20 | def create_model_fn(hparams, model_impl):
 21 | 
 22 |   def model_fn(features, targets, mode):
 23 |     context, context_len = get_id_feature(
 24 |         features, "context", "context_len", hparams.max_context_len)
 25 | 
 26 |     utterance, utterance_len = get_id_feature(
 27 |         features, "utterance", "utterance_len", hparams.max_utterance_len)
 28 | 
 29 |     persona, persona_len = get_id_feature(
 30 |         features, "persona", "persona_len", hparams.max_persona_len)
 31 | 
 32 |     if mode == tf.contrib.learn.ModeKeys.TRAIN:
 33 |       probs, loss = model_impl(
 34 |           hparams,
 35 |           mode,
 36 |           context,
 37 |           context_len,
 38 |           utterance,
 39 |           utterance_len,
 40 |           persona,
 41 |           persona_len,
 42 |           targets)
 43 |       train_op = create_train_op(loss, hparams)
 44 |       return probs, loss, train_op
 45 | 
 46 |     if mode == tf.contrib.learn.ModeKeys.INFER:
 47 | 
 48 |       all_contexts = [context]
 49 |       all_context_lens = [context_len]
 50 |       all_utterances = [utterance]
 51 |       all_utterance_lens = [utterance_len]
 52 |       all_personas = [persona]
 53 |       all_persona_lens = [persona_len]
 54 | 
 55 |       for i in range(1,features["len"]):
 56 |         distractor, distractor_len = get_id_feature(features,
 57 |             "utterance_{}".format(i),
 58 |             "utterance_{}_len".format(i),
 59 |             hparams.max_utterance_len)
 60 |         all_contexts.append(context)
 61 |         all_context_lens.append(context_len)
 62 |         all_utterances.append(distractor)
 63 |         all_utterance_lens.append(distractor_len)
 64 |         all_personas.append(persona)
 65 |         all_persona_lens.append(persona_len)
 66 | 
 67 |       probs, loss = model_impl(
 68 |           hparams,
 69 |           mode,
 70 |           tf.concat(all_contexts,0),
 71 |           tf.concat(all_context_lens,0),
 72 |           tf.concat(all_utterances,0),
 73 |           tf.concat(all_utterance_lens,0),
 74 |           tf.concat(all_personas,0),
 75 |           tf.concat(all_persona_lens,0),
 76 |           None)
 77 | 
 78 |       split_probs = tf.split(probs, features["len"],0)
 79 |       probs = tf.concat(split_probs,1)
 80 | 
 81 |       return probs, 0.0, None
 82 | 
 83 |     if mode == tf.contrib.learn.ModeKeys.EVAL:
 84 |       batch_size = targets.get_shape().as_list()[0]
 85 | 
 86 |       # We have 10 exampels per record, so we accumulate them
 87 |       all_contexts = [context]
 88 |       all_context_lens = [context_len]
 89 |       all_utterances = [utterance]
 90 |       all_utterance_lens = [utterance_len]
 91 |       all_personas = [persona]
 92 |       all_persona_lens = [persona_len]
 93 |       all_targets = [tf.ones([batch_size, 1], dtype=tf.int64)]
 94 | 
 95 |       for i in range(9):
 96 |         distractor, distractor_len = get_id_feature(features,
 97 |             "distractor_{}".format(i),
 98 |             "distractor_{}_len".format(i),
 99 |             hparams.max_utterance_len)
100 |         all_contexts.append(context)
101 |         all_context_lens.append(context_len)
102 |         all_utterances.append(distractor)
103 |         all_utterance_lens.append(distractor_len)
104 |         all_personas.append(persona)
105 |         all_persona_lens.append(persona_len)
106 |         all_targets.append(
107 |           tf.zeros([batch_size, 1], dtype=tf.int64)
108 |         )
109 | 
110 |       probs, loss = model_impl(
111 |           hparams,
112 |           mode,
113 |           tf.concat(all_contexts,0),
114 |           tf.concat(all_context_lens,0),
115 |           tf.concat(all_utterances,0),
116 |           tf.concat(all_utterance_lens,0),
117 |           tf.concat(all_personas,0),
118 |           tf.concat(all_persona_lens,0),
119 |           tf.concat(all_targets,0))
120 | 
121 |       split_probs = tf.split(probs,10,0)
122 |       shaped_probs = tf.concat(split_probs,1)
123 | 
124 |       # Add summaries
125 |       tf.summary.histogram("eval_correct_probs_hist", split_probs[0])
126 |       tf.summary.scalar("eval_correct_probs_average", tf.reduce_mean(split_probs[0]))
127 |       tf.summary.histogram("eval_incorrect_probs_hist", split_probs[1])
128 |       tf.summary.scalar("eval_incorrect_probs_average", tf.reduce_mean(split_probs[1]))
129 | 
130 |       return shaped_probs, loss, None
131 | 
132 |   return model_fn
133 | 


--------------------------------------------------------------------------------
/prepare_data.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import csv
  3 | import itertools
  4 | import functools
  5 | import tensorflow as tf
  6 | import numpy as np
  7 | import array
  8 | 
# Words seen fewer than this many times are left out of the vocabulary
# (passed as min_frequency to the VocabularyProcessor in create_vocab).
tf.flags.DEFINE_integer(
  "min_word_frequency", 5, "Minimum frequency of words in the vocabulary")

# Passed as the first argument of the VocabularyProcessor in create_vocab.
tf.flags.DEFINE_integer("max_sentence_len", 100, "Maximum Sentence Length")

# NOTE: the help texts below say the default is './data', but the actual
# default for both directories is the absolute path of ./data/persona.
tf.flags.DEFINE_string(
  "input_dir", os.path.abspath("./data/persona"),
  "Input directory containing original CSV data files (default = './data')")

tf.flags.DEFINE_string(
  "output_dir", os.path.abspath("./data/persona"),
  "Output directory for TFrEcord files (default = './data')")

FLAGS = tf.flags.FLAGS

# Input CSV locations, resolved from FLAGS.input_dir when the module is loaded.
# The training file is expected under the name train_shuf.csv.
TRAIN_PATH = os.path.join(FLAGS.input_dir, "train_shuf.csv")
VALIDATION_PATH = os.path.join(FLAGS.input_dir, "valid.csv")
TEST_PATH = os.path.join(FLAGS.input_dir, "test.csv")
 27 | 
 28 | def tokenizer_fn(iterator):
 29 |   return (x.split(" ") for x in iterator)
 30 | 
 31 | def create_csv_iter(filename):
 32 |   """
 33 |   Returns an iterator over a CSV file. Skips the header.
 34 |   """
 35 |   with open(filename) as csvfile:
 36 |     reader = csv.reader(csvfile)
 37 |     # Skip the header
 38 |     next(reader)
 39 |     for row in reader:
 40 |       yield row
 41 | 
 42 | 
 43 | def create_vocab(input_iter, min_frequency):
 44 |   """
 45 |   Creates and returns a VocabularyProcessor object with the vocabulary
 46 |   for the input iterator.
 47 |   """
 48 |   vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(
 49 |       FLAGS.max_sentence_len,
 50 |       min_frequency=min_frequency,
 51 |       tokenizer_fn=tokenizer_fn)
 52 |   vocab_processor.fit(input_iter)
 53 |   return vocab_processor
 54 | 
 55 | 
 56 | def transform_sentence(sequence, vocab_processor):
 57 |   """
 58 |   Maps a single sentence into the integer vocabulary. Returns a python array.
 59 |   """
 60 |   return next(vocab_processor.transform([sequence])).tolist()
 61 | 
 62 | 
 63 | def create_text_sequence_feature(fl, sentence, sentence_len, vocab):
 64 |   """
 65 |   Writes a sentence to FeatureList protocol buffer
 66 |   """
 67 |   sentence_transformed = transform_sentence(sentence, vocab)
 68 |   for word_id in sentence_transformed:
 69 |     fl.feature.add().int64_list.value.extend([word_id])
 70 |   return fl
 71 | 
 72 | 
 73 | def create_example_train(row, vocab):
 74 |   """
 75 |   Creates a training example for the Ubuntu Dialog Corpus dataset.
 76 |   Returnsthe a tensorflow.Example Protocol Buffer object.
 77 |   """
 78 |   context, utterance, persona, label = row
 79 |   context_transformed = transform_sentence(context, vocab)
 80 |   utterance_transformed = transform_sentence(utterance, vocab)
 81 |   persona_transformed = transform_sentence(persona, vocab)
 82 |   context_len = len(next(vocab._tokenizer([context])))
 83 |   utterance_len = len(next(vocab._tokenizer([utterance])))
 84 |   persona_len = len(next(vocab._tokenizer([persona])))
 85 |   label = int(float(label))
 86 | 
 87 |   # New Example
 88 |   example = tf.train.Example()
 89 |   example.features.feature["context"].int64_list.value.extend(context_transformed)
 90 |   example.features.feature["utterance"].int64_list.value.extend(utterance_transformed)
 91 |   example.features.feature["persona"].int64_list.value.extend(persona_transformed)
 92 |   example.features.feature["context_len"].int64_list.value.extend([context_len])
 93 |   example.features.feature["utterance_len"].int64_list.value.extend([utterance_len])
 94 |   example.features.feature["persona_len"].int64_list.value.extend([persona_len])
 95 |   example.features.feature["label"].int64_list.value.extend([label])
 96 |   return example
 97 | 
 98 | 
 99 | def create_example_test(row, vocab):
100 |   """
101 |   Creates a test/validation example for the Ubuntu Dialog Corpus dataset.
102 |   Returnsthe a tensorflow.Example Protocol Buffer object.
103 |   """
104 |   context, utterance, persona = row[:3]
105 |   distractors = row[3:]
106 |   context_len = len(next(vocab._tokenizer([context])))
107 |   utterance_len = len(next(vocab._tokenizer([utterance])))
108 |   persona_len = len(next(vocab._tokenizer([persona])))
109 |   context_transformed = transform_sentence(context, vocab)
110 |   utterance_transformed = transform_sentence(utterance, vocab)
111 |   persona_transformed = transform_sentence(persona, vocab)
112 | 
113 |   # New Example
114 |   example = tf.train.Example()
115 |   example.features.feature["context"].int64_list.value.extend(context_transformed)
116 |   example.features.feature["utterance"].int64_list.value.extend(utterance_transformed)
117 |   example.features.feature["persona"].int64_list.value.extend(persona_transformed)
118 |   example.features.feature["context_len"].int64_list.value.extend([context_len])
119 |   example.features.feature["utterance_len"].int64_list.value.extend([utterance_len])
120 |   example.features.feature["persona_len"].int64_list.value.extend([persona_len])
121 | 
122 |   # Distractor sequences
123 |   for i, distractor in enumerate(distractors):
124 |     dis_key = "distractor_{}".format(i)
125 |     dis_len_key = "distractor_{}_len".format(i)
126 |     # Distractor Length Feature
127 |     dis_len = len(next(vocab._tokenizer([distractor])))
128 |     example.features.feature[dis_len_key].int64_list.value.extend([dis_len])
129 |     # Distractor Text Feature
130 |     dis_transformed = transform_sentence(distractor, vocab)
131 |     example.features.feature[dis_key].int64_list.value.extend(dis_transformed)
132 |   return example
133 | 
134 | 
def create_tfrecords_file(input_filename, output_filename, example_fn):
  """
  Creates a TFRecords file for the given input data and
  example transformation function.

  Args:
    input_filename: path of the CSV file to read (header row is skipped
      by create_csv_iter).
    output_filename: path of the TFRecords file to write.
    example_fn: callable taking one CSV row and returning an object with
      a SerializeToString() method (a tf.train.Example here).

  The writer is closed even when converting or writing a row raises, so
  the output file handle is not leaked on failure; the exception itself
  still propagates to the caller.
  """
  writer = tf.python_io.TFRecordWriter(output_filename)
  print("Creating TFRecords file at {}...".format(output_filename))
  try:
    for row in create_csv_iter(input_filename):
      writer.write(example_fn(row).SerializeToString())
  finally:
    writer.close()
  print("Wrote to {}".format(output_filename))
147 | 
148 | 
def write_vocabulary(vocab_processor, outfile):
  """
  Dumps the vocabulary to outfile in id order, one word per line, and
  prints a confirmation message afterwards.
  """
  vocabulary = vocab_processor.vocabulary_
  total_words = len(vocabulary)
  with open(outfile, "w") as out:
    out.writelines(
        vocabulary._reverse_mapping[word_id] + "\n"
        for word_id in range(total_words))
  print("Saved vocabulary to {}".format(outfile))
159 | 
160 | 
if __name__ == "__main__":
  print("Creating vocabulary...")
  # Each training row contributes context + utterance + persona as one document
  vocab = create_vocab(
      (" ".join((row[0], row[1], row[2])) for row in create_csv_iter(TRAIN_PATH)),
      min_frequency=FLAGS.min_word_frequency)
  print("Total vocabulary size: {}".format(len(vocab.vocabulary_)))

  # Persist the vocabulary: plain-text word list plus the processor itself
  write_vocabulary(
    vocab, os.path.join(FLAGS.output_dir, "vocabulary.txt"))
  vocab.save(os.path.join(FLAGS.output_dir, "vocab_processor.bin"))

  # Convert the CSV splits in the order validation, test, train
  conversions = (
      (VALIDATION_PATH, "validation.tfrecords", create_example_test),
      (TEST_PATH, "test.tfrecords", create_example_test),
      (TRAIN_PATH, "train.tfrecords", create_example_train),
  )
  for csv_path, record_name, builder in conversions:
    create_tfrecords_file(
        input_filename=csv_path,
        output_filename=os.path.join(FLAGS.output_dir, record_name),
        example_fn=functools.partial(builder, vocab=vocab))
192 | 


--------------------------------------------------------------------------------