├── README.md ├── images └── 20180128-035256-plot.png ├── paraphraser ├── __init__.py ├── dataset_generator.py ├── download_models.py ├── embeddings.py ├── inference.py ├── inference_frozen_graph.py ├── inspect_checkpoint.py ├── lstm_model.py ├── nlp_pipeline.py ├── paraphraser.py ├── preprocess_data.py ├── sample_embedding_helper.py ├── synonym_model.py ├── training_pipeline.py └── utils.py ├── requirements.txt ├── setup.cfg └── setup.py /README.md: -------------------------------------------------------------------------------- 1 | # Paraphraser 2 | 3 | This project provides users the ability to do paraphrase generation for sentences through a clean and simple API. A demo can be seen here: [pair-a-phrase](http://pair-a-phrase.it) 4 | 5 | The paraphraser was developed under the [Insight Data Science Artificial Intelligence](http://insightdata.ai/) program. 6 | 7 | ## Model 8 | 9 | The underlying model is a bidirectional LSTM encoder and LSTM decoder with attention trained using Tensorflow. Downloadable link here: [paraphrase model](https://drive.google.com/open?id=18uOQsosF4uVGvUgp6pB4BKrQZ1FktlmM) 10 | 11 | ### Prerequisites 12 | 13 | * python 3.5 14 | * Tensorflow 1.4.1 15 | * spacy 16 | 17 | ### Inference Execution 18 | 19 | Download the model checkpoint from the link above and run: 20 | 21 | ``` 22 | python inference.py --checkpoint=<path to checkpoint> 23 | ``` 24 | 25 | ### Datasets 26 | 27 | The dataset used to train this model is an aggregation of many different public datasets. To name a few: 28 | * para-nmt-5m 29 | * Quora question pair 30 | * SNLI 31 | * Semeval 32 | * And more! 33 | 34 | I have not included the aggregated dataset as part of this repo. If you're curious and would like to know more, contact me. Pretrained embeddings come from [John Wieting](http://www.cs.cmu.edu/~jwieting)'s [para-nmt-50m](https://github.com/jwieting/para-nmt-50m) project. 35 | 36 | ### Training 37 | 38 | Training was done for 2 epochs on an Nvidia GTX 1080 and evaluated using the BLEU score. The Tensorboard training curves can be seen below. The grey curve is train and the orange curve is dev.
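To launch a comparable training run, `training_pipeline.py` exposes the training hyperparameters as command line flags (see `parse_arguments` in that file). The invocation below is only an illustrative sketch: the flag values are examples rather than the exact settings behind the released checkpoint, and the dataset files referenced by `utils.dataset_config()` must already exist.

```
python training_pipeline.py --mode train --epochs 2 --batch_size 64 --lr 1e-3 --keep_prob 0.8 --log_dir logs
```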
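For programmatic use, the `Paraphraser` class in `inference.py` wraps checkpoint loading and both decoding modes. The snippet below is a minimal sketch that mirrors the `main()` function of `inference.py`; it assumes it is run from inside the `paraphraser/` directory (as `paraphraser.py` does with `from inference import Paraphraser`), and the checkpoint path and input sentence are placeholders.

```
from inference import Paraphraser

# Placeholder path; point this at the downloaded model checkpoint
paraphraser = Paraphraser('/path/to/paraphrase-model/checkpoint')

# Sample several candidate paraphrases from the decoder's output distribution
candidates = paraphraser.sample_paraphrase('a dog is running through the park',
                                           sampling_temp=0.75, how_many=5)
for i, candidate in enumerate(candidates):
    print(i, candidate)

# Greedy (argmax) decoding; the result comes back as a single-element list
print(paraphraser.greedy_paraphrase('a dog is running through the park')[0])
```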
39 | 40 | ![Tensorboard training curves](images/20180128-035256-plot.png) 41 | 42 | ## TODOs 43 | 44 | * pip installable package 45 | * Explore deeper architectures (more layers) 46 | * Recurrent layer dropout 47 | * Greater dataset augmentation 48 | * Try residual layers 49 | * Model compression 50 | * Byte pair encoding for out-of-vocabulary words 51 | 52 | ## Citations 53 | 54 | ``` 55 | @inproceedings { wieting-17-millions, 56 | author = {John Wieting and Kevin Gimpel}, 57 | title = {Pushing the Limits of Paraphrastic Sentence Embeddings with Millions of Machine Translations}, 58 | booktitle = {arXiv preprint arXiv:1711.05732}, year = {2017} 59 | } 60 | 61 | @inproceedings { wieting-17-backtrans, 62 | author = {John Wieting and Jonathan Mallinson and Kevin Gimpel}, 63 | title = {Learning Paraphrastic Sentence Embeddings from Back-Translated Bitext}, 64 | booktitle = {Proceedings of Empirical Methods in Natural Language Processing}, 65 | year = {2017} 66 | } 67 | ``` 68 | 69 | -------------------------------------------------------------------------------- /images/20180128-035256-plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vincent9514/Text-Variant-Generation/29f4507baecee11c72b8b1f3c66686fff008a9e8/images/20180128-035256-plot.png -------------------------------------------------------------------------------- /paraphraser/__init__.py: -------------------------------------------------------------------------------- 1 | from .synonym_model import synonym_paraphrase 2 | -------------------------------------------------------------------------------- /paraphraser/dataset_generator.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from keras.preprocessing.sequence import pad_sequences 3 | from embeddings import load_sentence_embeddings 4 | from six.moves import xrange 5 | from six import iteritems 6 | from random import shuffle 7 | 8 | class ParaphraseDataset(object): 9 | """This class is responsible for batching the paraphrase dataset into mini batches 10 | for train, dev, and test. The dataset itself must be partitioned into files 11 | beforehand and must follow this format: 12 | 13 | "source sentence\tsource sentence token ids\treference sentence\treference sentence token ids" 14 | 15 | The separator within each field is a single space. 16 | """ 17 | 18 | def __init__(self, dataset_metadata, batch_size, embeddings, word_to_id, start_id, end_id, unk_id, mask_id): 19 | """ Constructor initialization. 20 | 21 | Args: 22 | dataset_metadata: metadata list that follows the format [ 23 | { 24 | 'maxlen': X, 25 | 'train': training filename with sentences of length X, 26 | 'dev': dev filename with sentences of length X, 27 | 'test': test filename with sentences of length X, 28 | }, 29 | ]. Each element is a dict that describes the train, dev, and 30 | test files for sentences of maximum length X. 31 | batch_size: mini batch size 32 | embeddings: pretrained embeddings 33 | word_to_id: vocabulary index 34 | start_id: start of sentence token id 35 | end_id: end of sentence token id 36 | unk_id: unknown token id 37 | mask_id: pad token id applied after the end of sentence.
38 | """ 39 | 40 | # batch size 41 | self.batch_size = batch_size 42 | 43 | # Special tokens 44 | self.start_id = start_id 45 | self.end_id = end_id 46 | self.unk_id = unk_id 47 | self.mask_id = mask_id 48 | 49 | # Word embeddings, vocab 50 | self.embeddings = embeddings 51 | self.word_to_id = word_to_id 52 | self.vocab_size, self.embedding_size = embeddings.shape 53 | 54 | # dataset 55 | self.lengths = sorted([ v for d in dataset_metadata for k, v in iteritems(d) if k == 'maxlen' ]) 56 | self.dataset_metadata = {} 57 | for dm in dataset_metadata: 58 | for k, v in iteritems(dm): 59 | if k == 'maxlen': 60 | self.dataset_metadata[v] = dm 61 | self.dataset = {} 62 | 63 | def load_dataset_into_memory(self, dataset_type): 64 | """Load dataset into memory and partition by train, dev, and test.""" 65 | 66 | if dataset_type not in set(['train', 'test', 'dev']): 67 | raise ValueError("Invalid dataset type.") 68 | 69 | self.dataset[dataset_type] = {} 70 | self.dataset[dataset_type]['all_source_words'] = [] 71 | self.dataset[dataset_type]['all_source_ids'] = [] 72 | self.dataset[dataset_type]['all_source_len'] = [] 73 | self.dataset[dataset_type]['all_ref_words'] = [] 74 | self.dataset[dataset_type]['all_ref_ids'] = [] 75 | self.dataset[dataset_type]['all_ref_len'] = [] 76 | 77 | batch_source_words = [] 78 | batch_source_ids = [] 79 | batch_source_len = [] 80 | batch_ref_words = [] 81 | batch_ref_ids = [] 82 | batch_ref_len = [] 83 | 84 | for length in self.lengths: 85 | with open(self.dataset_metadata[length][dataset_type], 'r') as f: 86 | for i, line in enumerate(f): 87 | source_words, source_ids, ref_words, ref_ids = line.split('\t') 88 | batch_source_words.append(source_words.strip().split(' ')) 89 | batch_source_ids.append(source_ids.strip().split(' ')) 90 | batch_ref_words.append(ref_words.strip().split(' ')) 91 | batch_ref_ids.append(ref_ids.strip().split(' ')) 92 | 93 | if i % self.batch_size != 0 and i != 0: 94 | continue 95 | 96 | batch_source_len = [ len(source_ids) for source_ids in batch_source_ids ] 97 | batch_ref_len = [ len(ref_ids) for ref_ids in batch_ref_ids ] 98 | 99 | self.dataset[dataset_type]['all_source_ids'].append(self.pad_batch(batch_source_ids, length)) 100 | self.dataset[dataset_type]['all_source_words'].append(batch_source_words) 101 | self.dataset[dataset_type]['all_source_len'].append(batch_source_len) 102 | self.dataset[dataset_type]['all_ref_ids'].append(self.pad_batch(batch_ref_ids, length)) 103 | self.dataset[dataset_type]['all_ref_words'].append(batch_ref_words) 104 | self.dataset[dataset_type]['all_ref_len'].append(batch_ref_len) 105 | 106 | batch_source_words = [] 107 | batch_source_ids = [] 108 | batch_source_len = [] 109 | batch_ref_words = [] 110 | batch_ref_ids = [] 111 | batch_ref_len = [] 112 | 113 | if len(batch_source_words) > 0: 114 | batch_source_len = [ len(source_ids) for source_ids in batch_source_ids ] 115 | batch_ref_len = [ len(ref_ids) for ref_ids in batch_ref_ids ] 116 | 117 | self.dataset[dataset_type]['all_source_ids'].append(self.pad_batch(batch_source_ids, length)) 118 | self.dataset[dataset_type]['all_source_words'].append(batch_source_words) 119 | self.dataset[dataset_type]['all_source_len'].append(batch_source_len) 120 | self.dataset[dataset_type]['all_ref_ids'].append(self.pad_batch(batch_ref_ids, length)) 121 | self.dataset[dataset_type]['all_ref_words'].append(batch_ref_words) 122 | self.dataset[dataset_type]['all_ref_len'].append(batch_ref_len) 123 | 124 | batch_source_words = [] 125 | batch_source_ids = [] 126 | batch_source_len = 
[] 127 | batch_ref_words = [] 128 | batch_ref_ids = [] 129 | batch_ref_len = [] 130 | 131 | def generate_batch(self, dataset_type): 132 | """Return a generator that yields a mini batch of size self.batch_size. 133 | 134 | Args: 135 | dataset_type: 'train', 'test', or 'dev' 136 | """ 137 | 138 | if dataset_type not in set(['train', 'test', 'dev']): 139 | raise ValueError("Invalid dataset type.") 140 | 141 | if dataset_type not in self.dataset: 142 | self.load_dataset_into_memory(dataset_type) 143 | 144 | dataset_size = len(self.dataset[dataset_type]['all_source_ids']) 145 | 146 | rs = np.random.get_state() 147 | np.random.shuffle(self.dataset[dataset_type]['all_source_ids']) 148 | np.random.set_state(rs) 149 | np.random.shuffle(self.dataset[dataset_type]['all_source_words']) 150 | np.random.set_state(rs) 151 | np.random.shuffle(self.dataset[dataset_type]['all_source_len']) 152 | np.random.set_state(rs) 153 | np.random.shuffle(self.dataset[dataset_type]['all_ref_ids']) 154 | np.random.set_state(rs) 155 | np.random.shuffle(self.dataset[dataset_type]['all_ref_words']) 156 | np.random.set_state(rs) 157 | np.random.shuffle(self.dataset[dataset_type]['all_ref_len']) 158 | np.random.set_state(rs) 159 | 160 | for i in xrange(dataset_size): 161 | yield { 162 | 'seq_source_ids': self.dataset[dataset_type]['all_source_ids'][i], 163 | 'seq_source_words': self.dataset[dataset_type]['all_source_words'][i], 164 | 'seq_source_len': self.dataset[dataset_type]['all_source_len'][i], 165 | 'seq_ref_ids': self.dataset[dataset_type]['all_ref_ids'][i], 166 | 'seq_ref_words': self.dataset[dataset_type]['all_ref_words'][i], 167 | 'seq_ref_len': self.dataset[dataset_type]['all_ref_len'][i] 168 | } 169 | 170 | def pad_batch(self, batch_ids, max_len): 171 | """ Pad a mini batch with mask_id. This is intended to fill in any 172 | remaining time steps after the end of sentence tokens. 173 | 174 | Args: 175 | batch_ids: The mini batch of token ids of shape (batch_size, time_steps) 176 | max_len: The maximum number of time steps. 
177 | 178 | Returns: 179 | a batch of samples padded with mask_id 180 | """ 181 | padded_batch = np.array(pad_sequences(batch_ids, maxlen=max_len, padding='post', value=self.mask_id)) 182 | return padded_batch 183 | 184 | 185 | if __name__ == '__main__': 186 | from pprint import pprint as pp 187 | from utils import dataset_config 188 | dataset = dataset_config() 189 | word_to_id, idx_to_word, embeddings, start_id, end_id, unk_id, mask_id = load_sentence_embeddings() 190 | pd = ParaphraseDataset(dataset, 10, embeddings, word_to_id, start_id, end_id, unk_id, mask_id) 191 | generator = pd.generate_batch('train') 192 | for i, d in enumerate(generator): 193 | if i == 5: 194 | break 195 | print("=== seq source ids ===") 196 | print(d['seq_source_ids'].shape, flush=True) 197 | print(d['seq_source_ids'], flush=True) 198 | for i in d['seq_source_words']: 199 | print(i) 200 | print(d['seq_source_len'], flush=True) 201 | 202 | print("=== seq ref ids ===") 203 | print(d['seq_ref_ids'].shape, flush=True) 204 | print(d['seq_ref_ids'], flush=True) 205 | for i in d['seq_ref_words']: 206 | print(i) 207 | print(d['seq_ref_len']) 208 | 209 | -------------------------------------------------------------------------------- /paraphraser/download_models.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import requests 4 | import logging 5 | 6 | logging.basicConfig(format = u'[LINE:%(lineno)d]# %(levelname)-8s [%(asctime)s] %(message)s', level = logging.NOTSET) 7 | 8 | def download_file_from_google_drive(id, destination): 9 | URL = "https://docs.google.com/uc?export=download" 10 | logging.info("Downloading "+id + " to "+destination) 11 | logging.info("Please be patient, it may take a while...") 12 | session = requests.Session() 13 | 14 | response = session.get(URL, params = { 'id' : id }, stream = True) 15 | token = get_confirm_token(response) 16 | logging.info("...") 17 | if token: 18 | params = { 'id' : id, 'confirm' : token } 19 | response = session.get(URL, params = params, stream = True) 20 | 21 | save_response_content(response, destination) 22 | logging.info("Done with " + id) 23 | 24 | def get_confirm_token(response): 25 | for key, value in response.cookies.items(): 26 | if key.startswith('download_warning'): 27 | return value 28 | 29 | return None 30 | 31 | def save_response_content(response, destination): 32 | CHUNK_SIZE = 32768 33 | 34 | with open(destination, "wb") as f: 35 | for chunk in response.iter_content(CHUNK_SIZE): 36 | if chunk: # filter out keep-alive new chunks 37 | f.write(chunk) 38 | 39 | ''' 40 | to download the .t7 NTS models used for text simplification 41 | if for some reason, this doanload fails, please use the direct urls: 42 | - for NTS: 43 | https://drive.google.com/open?id=0B_pjS_ZjPfT9dEtrbV85UXhSelU 44 | -for NTS-w2v: 45 | https://drive.google.com/open?id=0B_pjS_ZjPfT9ZTRfSFp4Ql92U0E 46 | ''' 47 | 48 | if __name__ == "__main__": 49 | try: 50 | out_dir = sys.argv[1] 51 | logging.info("Saving files to: " + out_dir) 52 | except: 53 | out_dir = os.path.dirname(os.path.realpath(__file__)) 54 | logging.info("Saving files to: " + out_dir) 55 | 56 | #NTS_model = '0B_pjS_ZjPfT9dEtrbV85UXhSelU' 57 | #NTS_model_output = 'NTS_epoch11_10.19.t7' 58 | #download_file_from_google_drive(NTS_model, os.path.join(out_dir, NTS_model_output)) 59 | 60 | #NTS_w2v_model = '0B_pjS_ZjPfT9ZTRfSFp4Ql92U0E' 61 | #NTS_w2v_model_output = 'NTS-w2v_epoch11_10.20.t7' 62 | #download_file_from_google_drive(NTS_w2v_model, os.path.join(out_dir, 
NTS_w2v_model_output)) 63 | 64 | #model = '1_JsQ_iMnHwvnyd5vZM-6BMqe9hnVqrPi' 65 | model = '19QDCd4UMgt3FtlYYwu0qZU3G1F9_XCvk' 66 | download_file_from_google_drive(model, 'paraphrase-model.tar.gz') 67 | 68 | -------------------------------------------------------------------------------- /paraphraser/embeddings.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | from six import iteritems 4 | from pprint import pprint as pp 5 | #from keras.layers.embeddings import Embedding 6 | 7 | def load_sentence_embeddings(): 8 | '''Load John Wieting sentence embeddings''' 9 | with open("../../para-nmt-50m/data/ngram-word-concat-40.pickle", 'rb') as f: 10 | # [ numpy.ndarray(95283, 300), numpy.ndarray(74664, 300), (trigram_dict, word_dict)] 11 | x = pickle.load(f, encoding='latin1') 12 | word_vocab_size, embedding_size = x[1].shape 13 | 14 | trigram_embeddings, word_embeddings, _ = x 15 | trigram_to_id, word_to_id = x[2] 16 | 17 | word_to_id[''] = word_vocab_size 18 | word_to_id[''] = word_vocab_size + 1 19 | 20 | idx_to_word = { idx: word for word, idx in iteritems(word_to_id) } 21 | 22 | word_embeddings = np.vstack((word_embeddings, np.random.randn(2, embedding_size))) 23 | 24 | return (word_to_id, idx_to_word, word_embeddings, word_to_id[''], 25 | word_to_id[''], word_to_id['UUUNKKK'], word_to_id['★']) 26 | 27 | def load_glove_embeddings(): 28 | with open("/media/sdb/datasets/glove.6B/glove.6B.300d.pickle", "rb") as f: 29 | word_to_id, id_to_word, word_embeddings = pickle.load(f, encoding='latin1') 30 | word_vocab_size, embedding_size = word_embeddings.shape 31 | word_to_id[''] = word_vocab_size 32 | word_to_id[''] = word_vocab_size + 1 33 | word_to_id['UUUNKKK'] = word_vocab_size + 2 34 | word_to_id['★'] = word_vocab_size + 3 35 | id_to_word[word_vocab_size] = '' 36 | id_to_word[word_vocab_size+1] = '' 37 | id_to_word[word_vocab_size+2] = 'UUUNKKK' 38 | id_to_word[word_vocab_size+3] = '★' 39 | word_embeddings = np.vstack((word_embeddings, np.random.randn(4, embedding_size))) 40 | return (word_to_id, id_to_word, word_embeddings, word_to_id[''], 41 | word_to_id[''], word_to_id['UUUNKKK'], word_to_id['★']) 42 | 43 | 44 | if __name__ == '__main__': 45 | word_to_id, idx_to_word, embedding, start_id, end_id, unk_id, mask_id = load_sentence_embeddings() 46 | pp(idx_to_word[mask_id]) 47 | #pp(idx_to_word) 48 | #pp(word_to_id) 49 | #print(embedding.shape) 50 | 51 | -------------------------------------------------------------------------------- /paraphraser/inference.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from embeddings import load_sentence_embeddings 3 | from preprocess_data import preprocess_batch 4 | from six.moves import input 5 | from lstm_model import lstm_model 6 | import numpy as np 7 | from pprint import pprint as pp 8 | 9 | 10 | class Paraphraser(object): 11 | '''Heart of the paraphraser model. This class loads the checkpoint 12 | into the Tensorflow runtime environment and is responsible for inference. 13 | Greedy and sampling based approaches are supported 14 | ''' 15 | 16 | def __init__(self, checkpoint): 17 | """Constructor. Load vocabulary index, start token, end token, unk id, 18 | mask_id. Restore checkpoint. 
19 | 20 | Args: 21 | checkpoint: A path to the checkpoint 22 | """ 23 | self.word_to_id, self.idx_to_word, self.embedding, self.start_id, self.end_id, self.unk_id, self.mask_id = load_sentence_embeddings() 24 | self.checkpoint = checkpoint 25 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.5) 26 | self.sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) 27 | self.model = lstm_model(self.sess, 'infer', 300, self.embedding, self.start_id, self.end_id, self.mask_id) 28 | saver = tf.train.Saver() 29 | saver.restore(self.sess, checkpoint) 30 | 31 | def sample_paraphrase(self, sentence, sampling_temp=1.0, how_many=1): 32 | """Paraphrase by sampling a distribution 33 | 34 | Args: 35 | sentence (str): A sentence input that will be paraphrased by 36 | sampling from distribution. 37 | sampling_temp (int) : A number between 0 an 1 38 | 39 | Returns: 40 | str: a candidate paraphrase of the `sentence` 41 | """ 42 | 43 | return self.infer(1, sentence, self.idx_to_word, sampling_temp, how_many) 44 | 45 | def greedy_paraphrase(self, sentence): 46 | """Paraphrase using greedy sampler 47 | 48 | Args: 49 | sentence : The source sentence to be paraphrased. 50 | 51 | Returns: 52 | str : a candidate paraphrase of the `sentence` 53 | """ 54 | 55 | return self.infer(0, sentence, self.idx_to_word, 0., 1) 56 | 57 | 58 | def infer(self, decoder, source_sent, id_to_vocab, temp, how_many): 59 | """ Perform inferencing. In other words, generate a paraphrase 60 | for the source sentence. 61 | 62 | Args: 63 | decoder : 0 for greedy, 1 for sampling 64 | source_sent : source sentence to generate a paraphrase for 65 | id_to_vocab : dict of vocabulary index to word 66 | end_id : the end token 67 | temp : the sampling temperature to use when `decoder` is 1 68 | 69 | Returns: 70 | str : for the generated paraphrase 71 | """ 72 | 73 | seq_source_words, seq_source_ids = preprocess_batch([ source_sent ] * how_many) 74 | #print(seq_source_words) 75 | #print(seq_source_ids) 76 | seq_source_len = [ len(seq_source) for seq_source in seq_source_ids ] 77 | #print(seq_source_len) 78 | 79 | feed_dict = { 80 | self.model['seq_source_ids']: seq_source_ids, 81 | self.model['seq_source_lengths']: seq_source_len, 82 | self.model['decoder_technique']: decoder, 83 | self.model['sampling_temperature']: temp 84 | } 85 | 86 | feeds = [ 87 | self.model['predictions'] 88 | #model['final_sequence_lengths'] 89 | ] 90 | 91 | predictions = self.sess.run(feeds, feed_dict)[0] 92 | #print(predictions) 93 | return self.translate(predictions, decoder, id_to_vocab, seq_source_words[0]) 94 | 95 | def translate(self, predictions, decoder, id_to_vocab, seq_source_words): 96 | """ Translate the vocabulary ids in `predictions` to actual words 97 | that compose the paraphrase. 
98 | 99 | Args: 100 | predictions : arrays of vocabulary ids 101 | decoder : 0 for greedy, 1 for sample, 2 for beam 102 | id_to_vocab : dict of vocabulary index to word 103 | 104 | Returns: 105 | str : the paraphrase 106 | """ 107 | translated_predictions = [] 108 | #np_end = np.where(translated_predictions == end_id) 109 | for sent_pred in predictions: 110 | translated = [] 111 | for pred in sent_pred: 112 | word = 'UUNNKK' 113 | if pred == self.end_id: 114 | break 115 | if pred == self.unk_id: 116 | # Search for rare word 117 | for seq_source_word in seq_source_words: 118 | if seq_source_word not in self.word_to_id: 119 | word = seq_source_word 120 | else: 121 | word = id_to_vocab[pred] 122 | translated.append(word) 123 | translated_predictions.append(' '.join(translated)) 124 | return translated_predictions 125 | 126 | def main(): 127 | import argparse 128 | parser = argparse.ArgumentParser() 129 | parser.add_argument('--checkpoint', type=str, help='Checkpoint path') 130 | args = parser.parse_args() 131 | paraphraser = Paraphraser(args.checkpoint) 132 | 133 | while 1: 134 | source_sentence = input("Source: ") 135 | #p = paraphraser.greedy_paraphrase(source_sentence) 136 | #print(p) 137 | paraphrases = paraphraser.sample_paraphrase(source_sentence, sampling_temp=0.75, how_many=10) 138 | for i, paraphrase in enumerate(paraphrases): 139 | print("Paraph #{}: {}".format(i, paraphrase)) 140 | 141 | if __name__ == '__main__': 142 | main() 143 | 144 | -------------------------------------------------------------------------------- /paraphraser/inference_frozen_graph.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from load_sent_embeddings import load_sentence_embeddings 3 | from preprocess_data import preprocess_batch 4 | from six.moves import input 5 | 6 | word_to_id, idx_to_word, embedding, start_id, end_id, unk_id = load_sentence_embeddings() 7 | mask_id = 5800 8 | 9 | with open('/media/sdb/models/paraphraser/frozen_model.pb', 'rb') as f: 10 | graph_def = tf.GraphDef() 11 | graph_def.ParseFromString(f.read()) 12 | 13 | with tf.Graph().as_default() as graph: 14 | predictions = tf.import_graph_def( 15 | graph_def=graph_def, 16 | return_elements=['predictions:0'], 17 | name='') 18 | 19 | print([op.name for op in graph.get_operations()]) 20 | 21 | seq_source_ids = graph.get_tensor_by_name('placeholders/source_ids:0') 22 | seq_source_lengths = graph.get_tensor_by_name('placeholders/sequence_source_lengths:0') 23 | decoder_technique = graph.get_tensor_by_name('placeholders/decoder_technique:0') 24 | sampling_temperature = graph.get_tensor_by_name('placeholders/sampling_temperature:0') 25 | keep_prob = graph.get_tensor_by_name('placeholders/keep_prob:0') 26 | 27 | model = { 28 | 'seq_source_ids': seq_source_ids, 29 | 'seq_source_lengths': seq_source_lengths, 30 | 'predictions': predictions, 31 | 'decoder_technique': decoder_technique, 32 | 'sampling_temperature': sampling_temperature 33 | } 34 | 35 | sess = tf.Session() 36 | 37 | def restore_model(checkpoint): 38 | model = lstm_model(sess, 'infer', 300, embedding, start_id, end_id, mask_id) 39 | saver = tf.train.Saver() 40 | saver.restore(sess, checkpoint) 41 | 42 | def translate(predictions, decoder, id_to_vocab, end_id): 43 | """ Translate the vocabulary ids in `predictions` to actual words 44 | that compose the paraphrase. 
45 | 46 | Args: 47 | predictions : arrays of vocabulary ids 48 | decoder : 0 for greedy, 1 for sample, 2 for beam 49 | id_to_vocab : dict of vocabulary index to word 50 | end_id : end token index 51 | 52 | Returns: 53 | str : the paraphrase 54 | """ 55 | if decoder == 2: 56 | _, sentence_length, num_samples = predictions.shape 57 | for i in xrange(num_samples): 58 | sent_pred = [] 59 | for j in xrange(sentence_length): 60 | sent_pred.append(predictions[0][j][i]) 61 | try: 62 | end_index = sent_pred.index(end_id) 63 | sent_pred = sent_pred[:end_index] 64 | except Exception as e: 65 | pass 66 | return ' '.join([ id_to_vocab[pred] for pred in sent_pred ]) 67 | else: 68 | for sent_pred in predictions: 69 | if sent_pred[-1] == end_id: 70 | sent_pred = sent_pred[0:-1] 71 | return ' '.join([ id_to_vocab[pred] for pred in sent_pred ]) 72 | 73 | 74 | def infer(sess, model, decoder, source_sent, id_to_vocab, end_id, temp): 75 | """ Perform inferencing. In other words, generate a paraphrase 76 | for the source sentence. 77 | 78 | Args: 79 | sess : Tensorflow session. 80 | model : dict of tensor to value 81 | decoder : 0 for greedy, 1 for sampling 82 | source_sent : source sentence to generate a paraphrase for 83 | id_to_vocab : dict of vocabulary index to word 84 | end_id : the end token 85 | temp : the sampling temperature to use when `decoder` is 1 86 | 87 | Returns: 88 | str : for the generated paraphrase 89 | """ 90 | 91 | seq_source_words, seq_source_ids = preprocess_batch([ source_sent ]) 92 | seq_source_len = [ len(seq_source) for seq_source in seq_source_ids ] 93 | 94 | feed_dict = { 95 | model['seq_source_ids']: seq_source_ids, 96 | model['seq_source_lengths']: seq_source_len, 97 | model['decoder_technique']: decoder, 98 | model['sampling_temperature']: temp 99 | } 100 | 101 | feeds = [ 102 | model['predictions'] 103 | #model['final_sequence_lengths'] 104 | ] 105 | 106 | predictions = sess.run(feeds, feed_dict)[0][0] 107 | return translate(predictions, decoder, id_to_vocab, end_id) 108 | 109 | def greedy_paraphrase(sentence): 110 | """Paraphrase using greedy sampler 111 | 112 | Args: 113 | sentence : The source sentence to be paraphrased. 114 | 115 | Returns: 116 | str : a candidate paraphrase of the `sentence` 117 | """ 118 | 119 | with tf.Session(graph=graph) as sess: 120 | return infer(sess, model, 0, sentence, idx_to_word, end_id, 0.) 121 | 122 | def sampler_paraphrase(sentence, sampling_temp=1.0): 123 | """Paraphrase by sampling a distribution 124 | 125 | Args: 126 | sentence (str): A sentence input that will be paraphrased by 127 | sampling from distribution. 128 | sampling_temp (int) : A number between 0 an 1 129 | 130 | Returns: 131 | str: a candidate paraphrase of the `sentence` 132 | """ 133 | 134 | with tf.Session(graph=graph) as sess: 135 | return infer(sess, model, 1, sentence, idx_to_word, end_id, sampling_temp) 136 | 137 | def main(): 138 | while 1: 139 | source_sentence = input("Source: ") 140 | #print("Paraph: {}".format(sampler_paraphrase('hello world.'))) 141 | print("Paraph: {}".format(greedy_paraphrase('hello world.'))) 142 | 143 | if __name__ == '__main__': 144 | main() 145 | 146 | 147 | -------------------------------------------------------------------------------- /paraphraser/inspect_checkpoint.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """A simple script for inspect checkpoint files.""" 16 | from __future__ import absolute_import 17 | from __future__ import division 18 | from __future__ import print_function 19 | 20 | import argparse 21 | import sys 22 | 23 | import numpy as np 24 | 25 | from tensorflow.python import pywrap_tensorflow 26 | from tensorflow.python.platform import app 27 | from tensorflow.python.platform import flags 28 | 29 | FLAGS = None 30 | 31 | 32 | def print_tensors_in_checkpoint_file(file_name, tensor_name, all_tensors, 33 | all_tensor_names): 34 | """Prints tensors in a checkpoint file. 35 | 36 | If no `tensor_name` is provided, prints the tensor names and shapes 37 | in the checkpoint file. 38 | 39 | If `tensor_name` is provided, prints the content of the tensor. 40 | 41 | Args: 42 | file_name: Name of the checkpoint file. 43 | tensor_name: Name of the tensor in the checkpoint file to print. 44 | all_tensors: Boolean indicating whether to print all tensors. 45 | all_tensor_names: Boolean indicating whether to print all tensor names. 46 | """ 47 | try: 48 | reader = pywrap_tensorflow.NewCheckpointReader(file_name) 49 | if all_tensors or all_tensor_names: 50 | var_to_shape_map = reader.get_variable_to_shape_map() 51 | for key in sorted(var_to_shape_map): 52 | print("tensor_name: ", key) 53 | if all_tensors: 54 | print(reader.get_tensor(key)) 55 | elif not tensor_name: 56 | print(reader.debug_string().decode("utf-8")) 57 | else: 58 | print("tensor_name: ", tensor_name) 59 | print(reader.get_tensor(tensor_name)) 60 | except Exception as e: # pylint: disable=broad-except 61 | print(str(e)) 62 | if "corrupted compressed block contents" in str(e): 63 | print("It's likely that your checkpoint file has been compressed " 64 | "with SNAPPY.") 65 | if ("Data loss" in str(e) and 66 | (any([e in file_name for e in [".index", ".meta", ".data"]]))): 67 | proposed_file = ".".join(file_name.split(".")[0:-1]) 68 | v2_file_error_template = """ 69 | It's likely that this is a V2 checkpoint and you need to provide the filename 70 | *prefix*. Try removing the '.' and extension. Try: 71 | inspect checkpoint --file_name = {}""" 72 | print(v2_file_error_template.format(proposed_file)) 73 | 74 | 75 | def parse_numpy_printoption(kv_str): 76 | """Sets a single numpy printoption from a string of the form 'x=y'. 77 | 78 | See documentation on numpy.set_printoptions() for details about what values 79 | x and y can take. x can be any option listed there other than 'formatter'. 80 | 81 | Args: 82 | kv_str: A string of the form 'x=y', such as 'threshold=100000' 83 | 84 | Raises: 85 | argparse.ArgumentTypeError: If the string couldn't be used to set any 86 | nump printoption. 87 | """ 88 | k_v_str = kv_str.split("=", 1) 89 | if len(k_v_str) != 2 or not k_v_str[0]: 90 | raise argparse.ArgumentTypeError("'%s' is not in the form k=v." 
% kv_str) 91 | k, v_str = k_v_str 92 | printoptions = np.get_printoptions() 93 | if k not in printoptions: 94 | raise argparse.ArgumentTypeError("'%s' is not a valid printoption." % k) 95 | v_type = type(printoptions[k]) 96 | if v_type is type(None): 97 | raise argparse.ArgumentTypeError( 98 | "Setting '%s' from the command line is not supported." % k) 99 | try: 100 | v = (v_type(v_str) if v_type is not bool 101 | else flags.BooleanParser().parse(v_str)) 102 | except ValueError as e: 103 | raise argparse.ArgumentTypeError(e.message) 104 | np.set_printoptions(**{k: v}) 105 | 106 | 107 | def main(unused_argv): 108 | if not FLAGS.file_name: 109 | print("Usage: inspect_checkpoint --file_name=checkpoint_file_name " 110 | "[--tensor_name=tensor_to_print] " 111 | "[--all_tensors] " 112 | "[--all_tensor_names] " 113 | "[--printoptions]") 114 | sys.exit(1) 115 | else: 116 | print_tensors_in_checkpoint_file(FLAGS.file_name, FLAGS.tensor_name, 117 | FLAGS.all_tensors, FLAGS.all_tensor_names) 118 | 119 | 120 | if __name__ == "__main__": 121 | parser = argparse.ArgumentParser() 122 | parser.register("type", "bool", lambda v: v.lower() == "true") 123 | parser.add_argument( 124 | "--file_name", type=str, default="", help="Checkpoint filename. " 125 | "Note, if using Checkpoint V2 format, file_name is the " 126 | "shared prefix between all files in the checkpoint.") 127 | parser.add_argument( 128 | "--tensor_name", 129 | type=str, 130 | default="", 131 | help="Name of the tensor to inspect") 132 | parser.add_argument( 133 | "--all_tensors", 134 | nargs="?", 135 | const=True, 136 | type="bool", 137 | default=False, 138 | help="If True, print the values of all the tensors.") 139 | parser.add_argument( 140 | "--all_tensor_names", 141 | nargs="?", 142 | const=True, 143 | type="bool", 144 | default=False, 145 | help="If True, print the names of all the tensors.") 146 | parser.add_argument( 147 | "--printoptions", 148 | nargs="*", 149 | type=parse_numpy_printoption, 150 | help="Argument for numpy.set_printoptions(), in the form 'k=v'.") 151 | FLAGS, unparsed = parser.parse_known_args() 152 | app.run(main=main, argv=[sys.argv[0]] + unparsed) 153 | -------------------------------------------------------------------------------- /paraphraser/lstm_model.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import datetime as dt 3 | import sys 4 | import numpy as np 5 | from tensorflow.python.layers import core as layers_core 6 | from sample_embedding_helper import MySampleEmbeddingHelper 7 | 8 | #def lstm_model(args, np_embeddings, start_id, end_id, mask_id, mode): 9 | def lstm_model(sess, mode, cell_hidden_size, np_embeddings, start_id, end_id, mask_id): 10 | vocab_size, hidden_size = np_embeddings.shape 11 | 12 | # Embeddings 13 | with tf.variable_scope('embeddings'): 14 | encoder_embeddings = tf.get_variable(name="encoder_embeddings", shape=np_embeddings.shape, initializer=tf.constant_initializer(np_embeddings), trainable=True) 15 | decoder_embeddings = tf.get_variable(name="decoder_embeddings", shape=np_embeddings.shape, initializer=tf.constant_initializer(np_embeddings), trainable=True) 16 | #embeddings = tf.get_variable(name="embeddings", shape=np_embeddings.shape, initializer=tf.constant_initializer(np_embeddings), trainable=True) 17 | 18 | # Define placeholders 19 | with tf.variable_scope('placeholders'): 20 | lr = tf.placeholder(tf.float32, shape=(), name="learning_rate") 21 | seq_source_ids = tf.placeholder(tf.int32, shape=(None, None), 
name="source_ids") 22 | seq_source_lengths = tf.placeholder(tf.int32, [None], name="sequence_source_lengths") 23 | keep_prob = tf.placeholder_with_default(1.0, shape=(), name="keep_prob") 24 | # 0: greedy, 1: sampling, 2: beam 25 | sampling_temperature = tf.placeholder_with_default(0.5, shape=(), name="sampling_temperature") 26 | decoder_technique = tf.placeholder_with_default(1, shape=(), name="decoder_technique") 27 | #beam_width = tf.placeholder_with_default(5, shape=(), name="beam_width") 28 | dummy = tf.add(sampling_temperature, 1, name="dummy") 29 | 30 | if mode in set(['train', 'dev', 'test']): 31 | seq_reference_ids = tf.placeholder(tf.int32, shape=(None, None), name="reference_ids") 32 | seq_reference_lengths = tf.placeholder(tf.int32, [None], name="sequence_reference_lengths") 33 | paddings = tf.constant([[0, 0], [0, 1]]) 34 | seq_output_ids = tf.pad(seq_reference_ids[:, 1:], paddings, mode="CONSTANT", name="seq_output_ids", constant_values=mask_id) 35 | else: 36 | seq_reference_ids = None 37 | seq_reference_lengths = None 38 | seq_output_ids = None 39 | 40 | #batch_size = tf.cast(tf.shape(seq_source_ids)[0], tf.float32) 41 | batch_size = tf.shape(seq_source_ids)[0] 42 | 43 | # Encoder 44 | #with tf.variable_scope('encoder'): 45 | encoder_embedding = tf.nn.embedding_lookup(encoder_embeddings, seq_source_ids, name="encoder_embedding") 46 | encoder_fw_cell = tf.contrib.rnn.DropoutWrapper(tf.nn.rnn_cell.BasicLSTMCell(cell_hidden_size), input_keep_prob=keep_prob, output_keep_prob=keep_prob) 47 | encoder_bw_cell = tf.contrib.rnn.DropoutWrapper(tf.nn.rnn_cell.BasicLSTMCell(cell_hidden_size), input_keep_prob=keep_prob, output_keep_prob=keep_prob) 48 | encoder_outputs, encoder_states = tf.nn.bidirectional_dynamic_rnn(cell_fw=encoder_fw_cell, 49 | cell_bw=encoder_bw_cell, 50 | inputs=encoder_embedding, 51 | sequence_length=seq_source_lengths, 52 | dtype=tf.float32) 53 | concat_encoder_outputs = tf.concat(encoder_outputs, 2) 54 | encoder_fw_state, encoder_bw_state = encoder_states 55 | encoder_state_c = tf.concat((encoder_fw_state.c, encoder_bw_state.c), axis=1, name="encoder_state_c") 56 | encoder_state_h = tf.concat((encoder_fw_state.h, encoder_bw_state.h), axis=1, name="encoder_state_h") 57 | joined_encoder_state = tf.contrib.rnn.LSTMStateTuple(encoder_state_c, encoder_state_h) 58 | 59 | fc_layer = layers_core.Dense(vocab_size, use_bias=False) 60 | attention = tf.contrib.seq2seq.BahdanauAttention(num_units=cell_hidden_size, memory=concat_encoder_outputs) 61 | decoder_cell = tf.contrib.rnn.DropoutWrapper(tf.nn.rnn_cell.BasicLSTMCell(cell_hidden_size * 2), input_keep_prob=keep_prob, output_keep_prob=keep_prob) 62 | attn_cell = tf.contrib.seq2seq.AttentionWrapper(decoder_cell, attention, attention_layer_size=cell_hidden_size) 63 | zero_state = attn_cell.zero_state(batch_size, tf.float32) 64 | decoder_initial_state = zero_state.clone(cell_state=joined_encoder_state) 65 | 66 | ''' Beam search 67 | tiled_joined_encoder_state = tf.contrib.seq2seq.tile_batch(joined_encoder_state, multiplier=beam_width) 68 | tiled_concat_encoder_outputs = tf.contrib.seq2seq.tile_batch(concat_encoder_outputs, multiplier=beam_width) 69 | beam_attention_mechanism = tf.contrib.seq2seq.BahdanauAttention( 70 | num_units=hidden_size, 71 | memory=tiled_concat_encoder_outputs) 72 | #decoder_cell = tf.contrib.rnn.DropoutWrapper(tf.nn.rnn_cell.BasicLSTMCell(hidden_size * 2), input_keep_prob=1.0, output_keep_prob=1.0) 73 | beam_attn_wrapper = tf.contrib.seq2seq.AttentionWrapper( 74 | 
cell=tf.nn.rnn_cell.BasicLSTMCell(hidden_size * 2), 75 | attention_mechanism=beam_attention_mechanism, 76 | attention_layer_size=hidden_size) 77 | ''' 78 | 79 | # Train, dev, test 80 | if mode in set(['train', 'dev', 'test']): 81 | # Decoder 82 | decoder_embedding = tf.nn.embedding_lookup(decoder_embeddings, seq_reference_ids, name="decoder_embedding") 83 | helper = tf.contrib.seq2seq.TrainingHelper(decoder_embedding, seq_reference_lengths) 84 | decoder = tf.contrib.seq2seq.BasicDecoder(attn_cell, helper, decoder_initial_state, fc_layer) 85 | final_outputs, final_state, final_sequence_lengths = tf.contrib.seq2seq.dynamic_decode(decoder, swap_memory=True) 86 | logits = final_outputs.rnn_output 87 | predictions = final_outputs.sample_id 88 | 89 | with tf.variable_scope('train_loss'): 90 | max_output_len = tf.shape(logits)[1] 91 | seq_output_ids = seq_output_ids[:, :max_output_len] 92 | pad = tf.fill((tf.shape(seq_output_ids)[0], max_output_len), -1) #mask_id 93 | boolean_mask = tf.not_equal(seq_output_ids, pad) 94 | mask = tf.cast(boolean_mask, tf.float32) 95 | labels = tf.reshape(seq_output_ids, shape=(-1, 1)) 96 | crossent = tf.nn.softmax_cross_entropy_with_logits(labels=tf.one_hot(labels, vocab_size), logits=logits) 97 | loss = (tf.reduce_sum(crossent * mask) / tf.cast(batch_size, tf.float32)) 98 | 99 | with tf.variable_scope('summaries'): 100 | tf.summary.scalar("batch_loss", loss) 101 | summaries = tf.summary.merge_all() 102 | 103 | train_step = tf.train.AdamOptimizer(lr).minimize(loss) 104 | 105 | # Test 106 | elif mode == 'infer': 107 | loss = None 108 | train_step = None 109 | labels = None 110 | summaries = None 111 | start_tokens = tf.fill([batch_size], start_id) 112 | 113 | # Beach search decoder 114 | ''' 115 | beam_search_decoder = tf.contrib.seq2seq.BeamSearchDecoder( 116 | cell=beam_attn_wrapper, 117 | embedding=decoder_embeddings, 118 | start_tokens=start_tokens, 119 | end_token=end_id, 120 | initial_state=beam_attn_wrapper.zero_state(batch_size * beam_width, tf.float32).clone(cell_state=tiled_joined_encoder_state), 121 | beam_width=beam_width.eval(), 122 | output_layer=fc_layer, 123 | length_penalty_weight=0.0) 124 | ''' 125 | 126 | # Distribution sampling 127 | #sample_helper = MySampleEmbeddingHelper(decoder_embeddings, start_tokens, end_id, softmax_temperature=sampling_temperature) 128 | sample_helper = tf.contrib.seq2seq.SampleEmbeddingHelper(decoder_embeddings, start_tokens, end_id, softmax_temperature=sampling_temperature) 129 | sample_decoder = tf.contrib.seq2seq.BasicDecoder(attn_cell, sample_helper, decoder_initial_state, output_layer=fc_layer) 130 | 131 | # Greedy argmax decoder 132 | greedy_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(decoder_embeddings, start_tokens, end_id) 133 | # applied per timestep 134 | greedy_decoder = tf.contrib.seq2seq.BasicDecoder(attn_cell, greedy_helper, decoder_initial_state, output_layer=fc_layer) 135 | 136 | # Decode! 
137 | greedy_outputs, greedy_final_state, greedy_fsl = tf.contrib.seq2seq.dynamic_decode( 138 | greedy_decoder, 139 | #maximum_iterations=maximum_iterations, 140 | swap_memory=True) 141 | greedy_logits = greedy_outputs.rnn_output 142 | greedy_predictions = tf.identity(greedy_outputs.sample_id, name="greedy_predictions") 143 | 144 | sample_outputs, sample_final_state, sample_fsl = tf.contrib.seq2seq.dynamic_decode( 145 | sample_decoder, 146 | swap_memory=True) 147 | sample_logits = sample_outputs.rnn_output 148 | sample_predictions = tf.identity(sample_outputs.sample_id, name="sample_predictions") 149 | 150 | ''' 151 | beam_search_outputs, beam_search_final_state, beam_search_fsl = tf.contrib.seq2seq.dynamic_decode( 152 | beam_search_decoder, 153 | swap_memory=True) 154 | beam_search_logits = tf.no_op() 155 | beam_search_predictions = tf.identity(beam_search_outputs.predicted_ids, name="beam_search_predictions") 156 | print(beam_search_predictions) 157 | ''' 158 | z,y,a = tf.case( 159 | pred_fn_pairs={ 160 | tf.equal(sampling_temperature, tf.constant(0.0)): lambda: (greedy_predictions, greedy_fsl, greedy_logits), 161 | tf.equal(sampling_temperature, tf.constant(1.0)): lambda: (sample_predictions, sample_fsl, sample_logits), 162 | }, 163 | default = lambda: (sample_predictions, sample_fsl, sample_logits), 164 | exclusive=True 165 | ) 166 | 167 | predictions, final_sequence_lengths, logits = tf.case( 168 | pred_fn_pairs={ 169 | tf.equal(decoder_technique, tf.constant(0)): lambda: (greedy_predictions, greedy_fsl, greedy_logits), 170 | tf.equal(decoder_technique, tf.constant(1)): lambda: (sample_predictions, sample_fsl, sample_logits), 171 | #tf.equal(decoder_technique, tf.constant(2)): lambda: (beam_search_predictions, beam_search_fsl) 172 | }, 173 | exclusive=True) 174 | 175 | predictions = tf.identity(predictions, name='predictions') 176 | final_sequence_lengths = tf.identity(final_sequence_lengths, name='final_sequence_lengths') 177 | logits = tf.identity(logits, name='logits') 178 | 179 | return { 180 | 'lr': lr, 181 | 'keep_prob': keep_prob, 182 | 'decoder_technique': decoder_technique, 183 | 'sampling_temperature': sampling_temperature, 184 | #'beam_width': beam_width, 185 | 'seq_source_ids': seq_source_ids, 186 | 'seq_source_lengths': seq_source_lengths, 187 | 'seq_reference_ids': seq_reference_ids, 188 | 'seq_reference_lengths': seq_reference_lengths, 189 | #'final_state': final_state, 190 | 'final_sequence_lengths': final_sequence_lengths, 191 | 'embedding_source': encoder_embedding, 192 | 'encoder_states': encoder_states, 193 | 'loss': loss, 194 | 'predictions': predictions, 195 | 'labels': labels, 196 | 'summaries': summaries, 197 | 'train_step': train_step, 198 | 'dummy': dummy 199 | } 200 | 201 | -------------------------------------------------------------------------------- /paraphraser/nlp_pipeline.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import spacy 3 | from spacy.tokenizer import Tokenizer 4 | import datetime as dt 5 | import multiprocessing as mp 6 | 7 | nlp = spacy.load('en') 8 | tokenizer = Tokenizer(nlp.vocab) 9 | 10 | def nlp_pipeline(sentence, word_to_id, unk_id): 11 | ''' Convert word tokens into their vocab ids ''' 12 | return [ word_to_id.get(token.lower_, unk_id) for token in nlp_pipeline_0(sentence) ] 13 | 14 | def nlp_pipeline_0(sentence): 15 | ''' Execute spacy pipeline, single thread ''' 16 | return nlp(sentence, disable=['parser', 'tagger', 'ner']) 17 | 18 | def mp_nlp_pipeline(pool, lines): 19 | 
''' Execute spacy pipeline, multiprocessing style ''' 20 | return pool.map(nlp_pipeline_0, lines, 1) 21 | 22 | def openmp_nlp_pipeline(lines, n_threads=12): 23 | ''' Execute spacy's openmp nlp pipeline ''' 24 | return [ [ token.lower_ for token in doc ] for doc in nlp.pipe(lines, n_threads=n_threads, disable=['parser', 'tagger', 'ner']) ] 25 | 26 | def single_thread_nlp_pipeline(lines): 27 | ''' Another single thread pipeline ''' 28 | return [ nlp(line) for line in lines ] 29 | 30 | def main(): 31 | import datetime as dt 32 | from embeddings import load_sentence_embeddings 33 | #pool = mp.Pool(10) 34 | 35 | word_to_id, idx_to_word, embedding, start_id, end_id, unk_id = load_sentence_embeddings() 36 | print(unk_id) 37 | 38 | with open('/media/sdb/datasets/para-nmt-5m-processed/para-nmt-5m-processed.txt', 'r') as f: 39 | lines = [] 40 | for i, line in enumerate(f): 41 | lines.append(line.strip()) 42 | 43 | if i % 64 == 0: 44 | start = dt.datetime.now() 45 | #docs = mp_nlp_pipeline(pool, lines) 46 | docs = openmp_nlp_pipeline(lines, word_to_id, unk_id) 47 | #docs = single_thread_nlp_pipeline(lines) 48 | #doc = nlp_pipeline_0(line) 49 | print(docs) 50 | 51 | end = dt.datetime.now() 52 | print(end - start, flush=True) 53 | lines = [] 54 | else: 55 | continue 56 | 57 | 58 | if __name__ == '__main__': 59 | main() 60 | 61 | -------------------------------------------------------------------------------- /paraphraser/paraphraser.py: -------------------------------------------------------------------------------- 1 | from synonym_model import synonym_paraphrase 2 | from inference import Paraphraser 3 | -------------------------------------------------------------------------------- /paraphraser/preprocess_data.py: -------------------------------------------------------------------------------- 1 | """Dataset preprocessing and generation. 2 | 3 | This module's purpose is to consume raw paraphrase text and output a dataset 4 | in an optimal form to later be consumed by ParaphraseDataset class in 5 | dataset_generator.py. The raw text are assumed to be valid paraphrases 6 | and must follow the following format each line: 7 | 8 | source sentence\treference sentence 9 | 10 | The number of tokens within a sentence are counted so that samples can be 11 | grouped into the same file by similar length. After nlp preprocessing and 12 | tokenization, the resulting new format per line is: 13 | 14 | source sentence tokens\tsource sentence token ids\treference tokens\treference token ids 15 | 16 | This format is consumed directly into ParaphraseDataset to generate mini 17 | batches where each batch contains similar length sentences. 
18 | 19 | """ 20 | 21 | import os 22 | from six import iteritems 23 | from nlp_pipeline import openmp_nlp_pipeline 24 | from embeddings import load_sentence_embeddings 25 | 26 | word_to_id, idx_to_word, embedding, start_id, end_id, unk_id, mask_id = load_sentence_embeddings() 27 | 28 | def generate_length_index(max_lengths): 29 | l = [] 30 | prev = None 31 | for ml in max_lengths: 32 | if prev == None: 33 | a = (ml+1) * [ml] 34 | else: 35 | a = (ml - prev) * [ml] 36 | prev = ml 37 | l.extend(a) 38 | return l 39 | 40 | def word_to_token_ids(batch_docs): 41 | batch_token_ids = [ [ word_to_id.get(word, unk_id) for word in doc ] for doc in batch_docs ] 42 | return batch_token_ids 43 | 44 | def preprocess_batch(batch_sentences): 45 | # NLP Pipleine 46 | batch_words = openmp_nlp_pipeline(batch_sentences) 47 | batch_ids_ = word_to_token_ids(batch_words) 48 | 49 | # Create reference, preprend start id, append end id 50 | batch_ids = [ [start_id] + ids + [end_id] for ids in batch_ids_ ] 51 | 52 | return (batch_words, batch_ids) 53 | 54 | def fsave_data(filename, batch_source_words, batch_source_ids, batch_ref_words, batch_ref_ids): 55 | max_lengths = [5, 10, 20, 30, 40, 50] 56 | 57 | for length in max_lengths: 58 | try: 59 | os.remove(filename + "." + str(length)) 60 | except: 61 | pass 62 | 63 | files = { length: open(filename + "." + str(length), 'a') for length in max_lengths } 64 | l = generate_length_index(max_lengths) 65 | 66 | z = zip(batch_source_words, batch_source_ids, batch_ref_words, batch_ref_ids) 67 | 68 | for source_words, source_ids, ref_words, ref_ids in z: 69 | max_len = max(len(source_ids), len(ref_ids)) 70 | try: 71 | files[l[max_len]].write("{}\t{}\t{}\t{}\n".format(' '.join(source_words), 72 | ' '.join([ str(source_id) for source_id in source_ids ]), 73 | ' '.join(ref_words), 74 | ' '.join([ str(ref_id) for ref_id in ref_ids ]))) 75 | except Exception as e: 76 | print(e) 77 | print("Error writing {} {} {} {}".format(' '.join(source_words), 78 | ' '.join([ str(source_id) for source_id in source_ids ]), 79 | ' '.join(ref_words), 80 | ' '.join([ str(ref_id) for ref_id in ref_ids ]))) 81 | continue 82 | 83 | for length, f in iteritems(files): 84 | f.close() 85 | 86 | def preprocess_data(filename): 87 | batch_source_sentences = [] 88 | batch_ref_sentences = [] 89 | 90 | with open(filename, 'r') as f: 91 | for i, line in enumerate(f): 92 | source, ref = line.split('\t') 93 | batch_source_sentences.append(source.strip()) 94 | batch_ref_sentences.append(ref.strip()) 95 | 96 | batch_source_words, batch_source_ids = preprocess_batch(batch_source_sentences) 97 | batch_ref_words, batch_ref_ids = preprocess_batch(batch_ref_sentences) 98 | 99 | fsave_data(filename, batch_source_words, batch_source_ids, batch_ref_words, batch_ref_ids) 100 | 101 | def main(): 102 | import sys 103 | preprocess_data(sys.argv[1]) 104 | 105 | if __name__ == '__main__': 106 | main() 107 | 108 | -------------------------------------------------------------------------------- /paraphraser/sample_embedding_helper.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import abc 6 | 7 | import six 8 | 9 | from tensorflow.contrib.seq2seq.python.ops import decoder 10 | from tensorflow.python.framework import dtypes 11 | from tensorflow.python.framework import ops 12 | from tensorflow.python.framework import tensor_shape 13 | from tensorflow.python.ops import 
array_ops 14 | from tensorflow.python.ops import control_flow_ops 15 | from tensorflow.python.ops import embedding_ops 16 | from tensorflow.python.ops import gen_array_ops 17 | from tensorflow.python.ops import math_ops 18 | from tensorflow.python.ops import tensor_array_ops 19 | from tensorflow.python.ops.distributions import bernoulli 20 | from tensorflow.python.ops.distributions import categorical 21 | from tensorflow.python.util import nest 22 | from tensorflow.contrib.seq2seq.python.ops.helper import GreedyEmbeddingHelper 23 | 24 | 25 | class MySampleEmbeddingHelper(GreedyEmbeddingHelper): 26 | """A helper for use during inference. 27 | Uses sampling (from a distribution) instead of argmax and passes the 28 | result through an embedding layer to get the next input. 29 | """ 30 | 31 | def __init__(self, embedding, start_tokens, end_token, 32 | softmax_temperature=None, seed=None): 33 | """Initializer. 34 | Args: 35 | embedding: A callable that takes a vector tensor of `ids` (argmax ids), 36 | or the `params` argument for `embedding_lookup`. The returned tensor 37 | will be passed to the decoder input. 38 | start_tokens: `int32` vector shaped `[batch_size]`, the start tokens. 39 | end_token: `int32` scalar, the token that marks end of decoding. 40 | softmax_temperature: (Optional) `float32` scalar, value to divide the 41 | logits by before computing the softmax. Larger values (above 1.0) result 42 | in more random samples, while smaller values push the sampling 43 | distribution towards the argmax. Must be strictly greater than 0. 44 | Defaults to 1.0. 45 | seed: (Optional) The sampling seed. 46 | Raises: 47 | ValueError: if `start_tokens` is not a 1D tensor or `end_token` is not a 48 | scalar. 49 | """ 50 | super(MySampleEmbeddingHelper, self).__init__( 51 | embedding, start_tokens, end_token) 52 | self._softmax_temperature = softmax_temperature 53 | self._seed = seed 54 | 55 | def sample(self, time, outputs, state, name=None): 56 | """sample for SampleEmbeddingHelper.""" 57 | del time, state # unused by sample_fn 58 | # Outputs are logits, we sample instead of argmax (greedy). 
59 | if not isinstance(outputs, ops.Tensor): 60 | raise TypeError("Expected outputs to be a single Tensor, got: %s" % 61 | type(outputs)) 62 | if self._softmax_temperature is None: 63 | logits = outputs 64 | else: 65 | #logits = outputs / self._softmax_temperature 66 | logits = math_ops.divide(outputs, self._softmax_temperature) 67 | 68 | sample_id_sampler = categorical.Categorical(logits=logits) 69 | sample_ids = sample_id_sampler.sample(seed=self._seed) 70 | 71 | return sample_ids 72 | 73 | -------------------------------------------------------------------------------- /paraphraser/synonym_model.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import spacy 3 | from pprint import pprint 4 | from spacy.tokens.token import Token 5 | from nltk.corpus import wordnet as wn 6 | from six.moves import xrange 7 | import random 8 | 9 | nlp = spacy.load('en') 10 | 11 | def generate_sentence(original_doc, new_tokens): 12 | new_sentence = ' '.join(new_tokens).replace('_', ' ') 13 | new_doc = nlp(new_sentence) 14 | similarity_score = original_doc.similarity(new_doc) 15 | return (new_sentence, similarity_score) 16 | 17 | def synonym_model(s): 18 | generated_sentences = set([]) 19 | 20 | doc = nlp(s) 21 | original_tokens = [ token.text for token in doc ] 22 | 23 | index_to_lemmas = {} 24 | 25 | for index, token in enumerate(doc): 26 | index_to_lemmas[index] = set([]) 27 | index_to_lemmas[index].add(token) 28 | 29 | if token.pos_ == 'NOUN' and len(token.text) >= 3: 30 | pos = wn.NOUN 31 | elif token.pos_ == 'VERB' and len(token.text) >= 3: 32 | pos = wn.VERB 33 | elif token.pos_ == 'ADV' and len(token.text) >= 3: 34 | pos = wn.ADV 35 | elif token.pos_ == 'ADJ' and len(token.text) >= 3: 36 | pos = wn.ADJ 37 | else: 38 | continue 39 | 40 | # Synsets 41 | for synset in wn.synsets(token.text, pos): 42 | for lemma in synset.lemmas(): 43 | new_tokens = original_tokens.copy() 44 | new_tokens[index] = lemma.name() 45 | sentence_and_score = generate_sentence(doc, new_tokens) 46 | generated_sentences.add(sentence_and_score) 47 | index_to_lemmas[index].add(lemma.name()) 48 | 49 | count = sum([ len(words) for words in index_to_lemmas.values() ]) 50 | 51 | for i in xrange(min(count, 40)): 52 | new_tokens = [] 53 | for index, words in sorted(index_to_lemmas.items(), key=lambda x: x[0]): 54 | token = random.sample(index_to_lemmas[index], 1)[0] 55 | new_tokens.append(str(token)) 56 | sentence_and_score = generate_sentence(doc, new_tokens) 57 | generated_sentences.add(sentence_and_score) 58 | 59 | #print(generated_sentences) 60 | return generated_sentences 61 | 62 | def synonym_paraphrase(s): 63 | return synonym_model(s) 64 | 65 | if __name__ == '__main__': 66 | #x = synonym_model('I am discussing my homework with the teacher.') 67 | #x = synonym_model('the rabbit quickly ran down the hole') 68 | #x = synonym_model('John tried to fix his computer by hacking away at it.') 69 | x = synonym_model('team based multiplayer online first person shooter video game') 70 | print(x) 71 | 72 | -------------------------------------------------------------------------------- /paraphraser/training_pipeline.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import tensorflow as tf 3 | import numpy as np 4 | import os 5 | import sys 6 | import datetime as dt 7 | from six.moves import xrange, input 8 | from lstm_model_beam import lstm_model 9 | from embeddings import load_sentence_embeddings 10 | from dataset_generator import 
ParaphraseDataset 11 | from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, SmoothingFunction 12 | from utils import dataset_config, debug_data, summarize_scalar 13 | import logging 14 | 15 | logging.basicConfig(format = u'[%(asctime)s] %(levelname)-8s : %(message)s', level = logging.INFO) 16 | 17 | def evaluate(sess, model, dataset_generator, mode, id_to_vocab): 18 | """Evaluate current model on the dev or test set. 19 | 20 | Args: 21 | sess: Tensorflow session 22 | model: dictionary containing model's tensors of interest for evaluation 23 | dataset_generator: dataset batch generator 24 | mode: 'dev' or 'test' 25 | id_to_vocab: voabulary dictionary id -> word 26 | 27 | Returns: 28 | loss: the loss after evaluating the dataset 29 | bleu_score: BLEU score after evaluation 30 | """ 31 | 32 | batch_generator = dataset_generator.generate_batch(mode) 33 | chencherry = SmoothingFunction() 34 | batch_losses = [] 35 | all_seq_ref_words = [] 36 | all_bleu_pred_words = [] 37 | 38 | for batch in batch_generator: 39 | seq_source_ids = batch['seq_source_ids'] 40 | seq_source_words = batch['seq_source_words'] 41 | seq_source_len = batch['seq_source_len'] 42 | seq_ref_ids = batch['seq_ref_ids'] 43 | seq_ref_words = batch['seq_ref_words'] 44 | seq_ref_len = batch['seq_ref_len'] 45 | 46 | feed_dict = { 47 | model['seq_source_ids']: seq_source_ids, 48 | model['seq_source_lengths']: seq_source_len, 49 | model['seq_reference_ids']: seq_ref_ids, 50 | model['seq_reference_lengths']: seq_ref_len 51 | } 52 | 53 | feeds = [ 54 | model['loss'], 55 | model['predictions'], 56 | model['final_sequence_lengths'] 57 | ] 58 | 59 | try: 60 | batch_loss, predictions, fsl = sess.run(feeds, feed_dict) 61 | except Exception as e: 62 | debug_data(seq_source_ids, seq_ref_ids, seq_source_len, seq_ref_len, id_to_vocab) 63 | raise e 64 | 65 | # batch losses 66 | batch_losses.append(batch_loss) 67 | 68 | # all ref words 69 | seq_ref_words = [ [ref_words] for ref_words in seq_ref_words ] 70 | all_seq_ref_words.extend(seq_ref_words) 71 | 72 | # all prediction words to compute bleu on 73 | bleu_pred_words = [ [ id_to_vocab[vocab_id] for vocab_id in prediction if vocab_id in id_to_vocab ] for prediction in predictions ] 74 | bleu_pred_words = [ pred_words[:pred_words.index('') if '' in pred_words else len(pred_words) ] for pred_words in bleu_pred_words ] 75 | all_bleu_pred_words.extend(bleu_pred_words) 76 | 77 | bleu_score = corpus_bleu(all_seq_ref_words, all_bleu_pred_words, smoothing_function=chencherry.method1) 78 | loss = sum(batch_losses) / len(batch_losses) 79 | logging.info("{} : Evaluating on {} set loss={:.4f} bleu={:.4f}".format(dt.datetime.now(), mode, loss, bleu_score)) 80 | return loss, bleu_score 81 | 82 | def infer(sess, args, model, id_to_vocab, end_id): 83 | """Perform inference on a model. This is intended to be interactive. 84 | A user will run this from the command line to provide an input sentence 85 | and receive a paraphrase as output continuously within a loop. 
86 | 87 | Args: 88 | sess: Tensorflow session 89 | args: ArgumentParser object configuration 90 | model: a dictionary containing the model tensors 91 | id_to_vocab: vocabulary index of id_to_vocab 92 | end_id: the end of sentence token 93 | 94 | """ 95 | from preprocess_data import preprocess_batch 96 | 97 | while 1: 98 | source_sent = input("Enter source sentence: ") 99 | seq_source_words, seq_source_ids = preprocess_batch([ source_sent ]) 100 | seq_source_len = [ len(seq_source) for seq_source in seq_source_ids ] 101 | 102 | if args.decoder == 'greedy': 103 | decoder = 0 104 | elif args.decoder == 'sample': 105 | decoder = 1 106 | 107 | feed_dict = { 108 | model['seq_source_ids']: seq_source_ids, 109 | model['seq_source_lengths']: seq_source_len, 110 | model['decoder_technique']: decoder, 111 | model['sampling_temperature']: args.sampling_temperature, 112 | } 113 | 114 | feeds = [ 115 | model['predictions'], 116 | model['final_sequence_lengths'] 117 | ] 118 | 119 | predictions, final_sequence_lengths = sess.run(feeds, feed_dict) 120 | 121 | for sent_pred in predictions: 122 | if sent_pred[-1] == end_id: 123 | sent_pred = sent_pred[0:-1] 124 | print("Paraphrase : {}".format(' '.join([ id_to_vocab[pred] for pred in sent_pred ]))) 125 | 126 | def compress_graph(sess, args, model): 127 | """After training has completed, this function can be called to compress 128 | the model. The computation graph is frozen turning the checkpoint 129 | variables into constants. Finally, optimization is done by stripping 130 | away all unnecessary nodes from the graph if they are not used at 131 | inference time. 132 | 133 | Args: 134 | sess: Tensorflow session 135 | args: ArgumentParser config object 136 | model: model dictionary containing tensors of interest 137 | 138 | """ 139 | from tensorflow.python.tools import freeze_graph 140 | from tensorflow.python.tools import optimize_for_inference_lib 141 | 142 | tf.train.write_graph(sess.graph_def, '/media/sdb/models/paraphraser', 'model.pb', as_text=False) 143 | 144 | freeze_graph.freeze_graph( 145 | #input_graph='/tmp/model.pbtxt', 146 | input_graph='/media/sdb/models/paraphraser/model.pb', 147 | input_saver='', 148 | input_binary=True, 149 | input_checkpoint=args.checkpoint, 150 | output_node_names='predictions', 151 | restore_op_name='save/restore_all', 152 | filename_tensor_name='save/Const:0', 153 | output_graph='/media/sdb/models/paraphraser/frozen_model.pb', 154 | clear_devices=True, 155 | initializer_nodes='') 156 | 157 | ''' 158 | input_graph_def = tf.GraphDef() 159 | #with tf.gfile.Open('/media/sdb/models/paraphraser/frozen_model.pb', 'rb') as f: 160 | with tf.gfile.Open('/tmp/frozen_model.pb', 'rb') as f: 161 | data = f.read() 162 | input_graph_def.ParseFromString(data) 163 | with tf.Graph().as_default() as graph: 164 | tf.import_graph_def(input_graph_def) 165 | print(dir(graph)) 166 | print(graph.find_tensor_by_name('placeholders/sampling_temperature')) 167 | 168 | output_graph_def = optimize_for_inference_lib.optimize_for_inference( 169 | input_graph_def, 170 | ['placeholders/source_ids', 'placeholders/sequence_source_lengths'], 171 | ['predictions'], 172 | tf.float32.as_datatype_enum) 173 | 174 | f = tf.gfile.FastGFile('/tmp/optimized_model.pb', "w") 175 | f.write(output_graph_def.SerializeToString()) 176 | ''' 177 | 178 | 179 | def parse_arguments(): 180 | """Argument parser configuration.""" 181 | parser = argparse.ArgumentParser() 182 | 183 | parser.add_argument('--log_dir', type=str, default="logs", help="Log directory to store tensorboard 
summary and model checkpoints") 184 | parser.add_argument('--epochs', type=int, default=3, help="Number of epochs to train") 185 | parser.add_argument('--lr', type=float, default=1e-3, help="Learning rate") 186 | parser.add_argument('--batch_size', type=int, default=64, help="Mini batch size") 187 | parser.add_argument('--max_seq_length', type=int, default=40, help="Maximum sequence length. Sentence lengths beyond this are truncated.") 188 | parser.add_argument('--hidden_size', type=int, default=300, help="Hidden dimension size") 189 | parser.add_argument('--keep_prob', type=float, default=0.8, help="Keep probability for dropout") 190 | parser.add_argument('--decoder', type=str, choices=['greedy', 'sample'], help="Decoder type") 191 | parser.add_argument('--sampling_temperature', type=float, default=0.0, help="Sampling temperature") 192 | parser.add_argument('--mode', type=str, default=None, choices=['train', 'dev', 'test', 'infer'], help='train or dev or test or infer or minimize') 193 | parser.add_argument('--checkpoint', type=str, default=None, help="Model checkpoint file") 194 | parser.add_argument('--minimize_graph', type=bool, default=False, help="Save existing checkpoint to minimal graph") 195 | 196 | return parser.parse_args() 197 | 198 | def main(): 199 | """Entry point for all training, evaluation, and model compression begins here""" 200 | args = parse_arguments() 201 | word_to_id, id_to_vocab, embeddings, start_id, end_id, unk_id, mask_id = load_sentence_embeddings() 202 | vocab_size, embedding_size = embeddings.shape 203 | lr = args.lr 204 | 205 | dataset = dataset_config() 206 | 207 | if args.mode not in set(['train', 'dev', 'test', 'infer', 'minimize']): 208 | raise ValueError("{} is not a valid mode".format(args.mode)) 209 | 210 | with tf.Session() as sess: 211 | start = dt.datetime.now() 212 | model = lstm_model(sess, args.mode, args.hidden_size, embeddings, start_id, end_id, mask_id) 213 | 214 | # Saver object 215 | saver = tf.train.Saver() 216 | name_to_var_map = {var.op.name: var for var in tf.global_variables()} 217 | 218 | # Restore checkpoint 219 | if args.checkpoint: 220 | saver.restore(sess, args.checkpoint) 221 | 222 | # Save minimal graph 223 | if args.minimize_graph: 224 | compress_graph(sess, args, model) 225 | return 226 | 227 | # Load dataset only in train, dev, or test mode 228 | if args.mode in set(['train', 'dev', 'test']): 229 | logging.info("{}: Loading dataset into memory.".format(dt.datetime.now())) 230 | dataset_generator = ParaphraseDataset(dataset, args.batch_size, embeddings, word_to_id, start_id, end_id, unk_id, mask_id) 231 | 232 | # Evaluate on dev or test 233 | if args.mode == 'dev' or args.mode == 'test': 234 | evaluate(sess, model, dataset_generator, args.mode, id_to_vocab) 235 | return 236 | 237 | # Perform inferencing 238 | if args.mode == 'infer': 239 | infer(sess, args, model, id_to_vocab, end_id) 240 | return 241 | 242 | ################################### 243 | # Training run proceeds from here # 244 | ################################### 245 | 246 | # Training summary writer 247 | train_logdir = os.path.join(args.log_dir, "train-" + start.strftime("%Y%m%d-%H%M%S")) 248 | train_writer = tf.summary.FileWriter(train_logdir) 249 | 250 | # Dev summary writer 251 | dev_logdir = os.path.join(args.log_dir, "dev-" + start.strftime("%Y%m%d-%H%M%S")) 252 | dev_writer = tf.summary.FileWriter(dev_logdir) 253 | 254 | chencherry = SmoothingFunction() 255 | global_step = 0 256 | tf.global_variables_initializer().run() 257 | sess.run(model['dummy'], 
{model['sampling_temperature']: 7.5}) 258 | 259 | # Training per epoch 260 | for epoch in xrange(args.epochs): 261 | train_losses = [] 262 | train_batch_generator = dataset_generator.generate_batch('train') 263 | for train_batch in train_batch_generator: 264 | seq_source_ids = train_batch['seq_source_ids'] 265 | seq_source_words = train_batch['seq_source_words'] 266 | seq_source_len = train_batch['seq_source_len'] 267 | seq_ref_ids = train_batch['seq_ref_ids'] 268 | seq_ref_words = train_batch['seq_ref_words'] 269 | seq_ref_len = train_batch['seq_ref_len'] 270 | 271 | feed_dict = { 272 | model['lr']: lr, 273 | model['seq_source_ids']: seq_source_ids, 274 | model['seq_source_lengths']: seq_source_len, 275 | model['seq_reference_ids']: seq_ref_ids, 276 | model['seq_reference_lengths']: seq_ref_len, 277 | model['keep_prob']: args.keep_prob 278 | } 279 | 280 | feeds = [ 281 | model['train_step'], 282 | model['loss'], 283 | model['predictions'], 284 | model['summaries'], 285 | model['final_sequence_lengths'] 286 | ] 287 | 288 | try: 289 | _, batch_loss, predictions, summary, fsl = sess.run(feeds, feed_dict) 290 | except Exception as e: 291 | debug_data(seq_source_ids, seq_ref_ids, seq_source_len, seq_ref_len, id_to_vocab) 292 | raise e 293 | 294 | train_losses.append(batch_loss) 295 | 296 | # Status update 297 | if global_step % 25 == 0: 298 | train_writer.add_summary(summary, global_step) 299 | train_writer.flush() 300 | seq_ref_words = [ [ref_words] for ref_words in seq_ref_words ] 301 | bleu_pred_words = [ [ id_to_vocab[vocab_id] for vocab_id in prediction if vocab_id in id_to_vocab ] for prediction in predictions ] 302 | bleu_pred_words = [ pred_words[:pred_words.index('') if '' in pred_words else len(pred_words) ] for pred_words in bleu_pred_words ] 303 | bleu_score = corpus_bleu(seq_ref_words, bleu_pred_words, smoothing_function=chencherry.method1) 304 | summarize_scalar(train_writer, 'bleu_score', bleu_score, global_step) 305 | train_loss = sum(train_losses) / len(train_losses) 306 | summarize_scalar(train_writer, 'loss', train_loss, global_step) 307 | logging.info("step={} epoch={} batch_loss={:.4f} train_loss={:.4f} bleu={:.4f}".format(global_step, epoch, batch_loss, train_loss, bleu_score)) 308 | 309 | # Print predictions for this batch every 1000 steps 310 | # Evaluate on dev set 311 | if global_step % 1000 == 0 and global_step != 0: 312 | debug_data(seq_source_ids, seq_ref_ids, seq_source_len, seq_ref_len, id_to_vocab) 313 | logging.info("PREDICTIONS!") 314 | logging.info("final_seq_lengths: " + str(fsl)) 315 | logging.info("len(predictions): " + str(len(predictions))) 316 | for prediction in predictions: 317 | logging.info(str(len(prediction)) + ' ' + ' '.join([id_to_vocab[vocab_id] for vocab_id in prediction if vocab_id in id_to_vocab])) 318 | 319 | dev_loss, bleu_score = evaluate(sess, model, dataset_generator, 'dev', id_to_vocab) 320 | summarize_scalar(dev_writer, 'bleu_score', bleu_score, global_step) 321 | summarize_scalar(dev_writer, 'loss', dev_loss, global_step) 322 | dev_writer.flush() 323 | 324 | # Checkpoint. 325 | #if global_step % 50 == 0 and global_step != 0: 326 | if global_step % 5000 == 0 and global_step != 0: 327 | saver.save(sess, os.path.join(train_logdir, 'model'), global_step=global_step) 328 | 329 | global_step += 1 330 | # End train batch 331 | 332 | saver.save(sess, os.path.join(train_logdir, 'model'), global_step=global_step) 333 | lr /= 10. 
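# (The division above anneals the learning rate: it is divided by 10 at the end of each training epoch.)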
334 | # End epoch 335 | 336 | evaluate(sess, model, dataset_generator, 'test', id_to_vocab) 337 | # End sess 338 | 339 | if __name__ == '__main__': 340 | main() 341 | 342 | -------------------------------------------------------------------------------- /paraphraser/utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import tensorflow as tf 3 | 4 | def summarize_scalar(writer, tag, value, step): 5 | """Prepare data to be written to a protobuf event file. This is later 6 | read into tensorboard for visualization. 7 | 8 | Args: 9 | writer: summary writer 10 | tag: identifier name of the data in question 11 | value: the value the data takes on 12 | step: global step during training 13 | """ 14 | summary = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=value)]) 15 | writer.add_summary(summary, step) 16 | 17 | 18 | def debug_data(seq_source_ids, seq_ref_ids, seq_source_len, seq_ref_len, id_to_vocab): 19 | """Debug dataset batch samples to ensure they take on intended values""" 20 | logging.info("==============================================================") 21 | logging.info("SOURCE!") 22 | #logging.info(seq_source_ids) 23 | for source_ids in seq_source_ids: 24 | logging.info(' '.join([id_to_vocab[source_id] for source_id in source_ids])) 25 | logging.info(seq_source_len) 26 | logging.info("REFERENCE!") 27 | #logging.info(seq_ref_ids) 28 | for i in seq_ref_ids: 29 | logging.info(' '.join([id_to_vocab[label] for label in i if label != -1])) 30 | logging.info(seq_ref_len) 31 | logging.info("==============================================================") 32 | 33 | def dataset_config(): 34 | """Dataset configuration. Dataset files are grouped by sentences of maximum 35 | length for train, dev, and test.
""" 36 | 37 | dataset = [ 38 | { 39 | 'maxlen': 5, 40 | 'train': '/media/sdb/datasets/aggregate_paraphrase_corpus_0/dataset.train.5', 41 | 'dev': '/media/sdb/datasets/aggregate_paraphrase_corpus_0/dataset.dev.5', 42 | 'test': '/media/sdb/datasets/aggregate_paraphrase_corpus_0/dataset.test.5' 43 | }, 44 | { 45 | 'maxlen': 10, 46 | 'train': '/media/sdb/datasets/aggregate_paraphrase_corpus_0/dataset.train.10', 47 | 'dev': '/media/sdb/datasets/aggregate_paraphrase_corpus_0/dataset.dev.10', 48 | 'test': '/media/sdb/datasets/aggregate_paraphrase_corpus_0/dataset.test.10' 49 | }, 50 | { 51 | 'maxlen': 20, 52 | 'train': '/media/sdb/datasets/aggregate_paraphrase_corpus_0/dataset.train.20', 53 | 'dev': '/media/sdb/datasets/aggregate_paraphrase_corpus_0/dataset.dev.20', 54 | 'test': '/media/sdb/datasets/aggregate_paraphrase_corpus_0/dataset.test.20' 55 | }, 56 | { 57 | 'maxlen': 30, 58 | 'train': '/media/sdb/datasets/aggregate_paraphrase_corpus_0/dataset.train.30', 59 | 'dev': '/media/sdb/datasets/aggregate_paraphrase_corpus_0/dataset.dev.30', 60 | 'test': '/media/sdb/datasets/aggregate_paraphrase_corpus_0/dataset.test.30' 61 | }, 62 | { 63 | 'maxlen': 40, 64 | 'train': '/media/sdb/datasets/aggregate_paraphrase_corpus_0/dataset.train.40', 65 | 'dev': '/media/sdb/datasets/aggregate_paraphrase_corpus_0/dataset.dev.40', 66 | 'test': '/media/sdb/datasets/aggregate_paraphrase_corpus_0/dataset.test.40' 67 | }, 68 | { 69 | 'maxlen': 50, 70 | 'train': '/media/sdb/datasets/aggregate_paraphrase_corpus_0/dataset.train.50', 71 | 'dev': '/media/sdb/datasets/aggregate_paraphrase_corpus_0/dataset.dev.50', 72 | 'test': '/media/sdb/datasets/aggregate_paraphrase_corpus_0/dataset.test.50' 73 | } 74 | ] 75 | 76 | return dataset 77 | 78 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | backports.weakref==1.0rc1 2 | bleach==1.5.0 3 | blessings==1.6 4 | blis==0.0.13 5 | bottle==0.12.13 6 | certifi==2017.11.5 7 | chardet==3.0.4 8 | cycler==0.10.0 9 | cymem==1.31.2 10 | cytoolz==0.8.2 11 | decorator==4.1.1 12 | dill==0.2.7.1 13 | en-core-web-sm==2.0.0 14 | enum34==1.1.6 15 | ftfy==4.4.3 16 | future==0.16.0 17 | gpustat==0.4.1 18 | h5py==2.7.1 19 | html5lib==0.9999999 20 | idna==2.6 21 | ipython==6.1.0 22 | ipython-genutils==0.2.0 23 | jedi==0.10.2 24 | Keras==2.1.3 25 | Markdown==2.6.10 26 | matplotlib==2.1.2 27 | msgpack-numpy==0.4.1 28 | msgpack-python==0.5.1 29 | murmurhash==0.28.0 30 | nltk==3.2.5 31 | numpy==1.14.0 32 | nvidia-ml-py3==7.352.0 33 | paraphraser==0.1.0a1 34 | pathlib==1.0.1 35 | pexpect==4.2.1 36 | pickleshare==0.7.4 37 | plac==0.9.6 38 | preshed==1.0.0 39 | prompt-toolkit==1.0.14 40 | protobuf==3.5.0.post1 41 | psutil==5.4.2 42 | ptyprocess==0.5.2 43 | Pygments==2.2.0 44 | pyparsing==2.2.0 45 | python-dateutil==2.6.1 46 | pytz==2017.3 47 | PyYAML==3.12 48 | regex==2017.4.5 49 | requests==2.18.4 50 | scipy==1.0.0 51 | simplegeneric==0.8.1 52 | simplejson==3.13.2 53 | six==1.11.0 54 | spacy==2.0.5 55 | tensorflow-gpu==1.4.1 56 | tensorflow-tensorboard==0.4.0rc3 57 | termcolor==1.1.0 58 | thinc==6.10.2 59 | toolz==0.9.0 60 | tqdm==4.19.5 61 | traitlets==4.3.2 62 | ujson==1.35 63 | urllib3==1.22 64 | uWSGI==2.0.15 65 | wcwidth==0.1.7 66 | Werkzeug==0.13 67 | wrapt==1.10.11 68 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | # 
This flag says to generate wheels that support both Python 2 and Python 3 | # 3. If your code will not run unchanged on both Python 2 and 3, you will 4 | # need to generate separate wheels for each Python version that you 5 | # support. 6 | universal=1 7 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | from codecs import open 3 | from os import path 4 | from setuptools.command.install import install 5 | 6 | here = path.abspath(path.dirname(__file__)) 7 | 8 | with open(path.join(here, 'README.md'), encoding='utf-8') as f: 9 | long_desc = f.read() 10 | 11 | class DownloadCorpora(install): 12 | def run(self): 13 | install.run(self) 14 | import spacy 15 | import nltk 16 | nltk.download('wordnet') 17 | spacy.cli.download('en') 18 | 19 | class DownloadParaphraseModel(install): 20 | def run(self): 21 | install.run(self) 22 | from paraphraser.download_models import download_file_from_google_drive 23 | download_file_from_google_drive('19QDCd4UMgt3FtlYYwu0qZU3G1F9_XCvk', 24 | 'paraphrase-model.tar.gz') 25 | 26 | setup( 27 | name='paraphraser', 28 | version='0.1.0', 29 | description='Generate sentence paraphrases given an input sentence', 30 | long_description=long_desc, 31 | url='https://github.com/vsuthichai/paraphraser', 32 | author='Victor Suthichai', 33 | author_email='victor.suthichai@gmail.com', 34 | 35 | # https://pypi.python.org/pypi?%3Aaction=list_classifiers 36 | classifiers=[ 37 | 'Development Status :: 3 - Alpha', 38 | 'Intended Audience :: Developers', 39 | 'Programming Language :: Python :: 2.7', 40 | 'Programming Language :: Python :: 3', 41 | 'Programming Language :: Python :: 3.4', 42 | 'Programming Language :: Python :: 3.5', 43 | 'Programming Language :: Python :: 3.6' 44 | ], 45 | 46 | keywords=[ 47 | 'paraphraser' 48 | ], 49 | 50 | py_modules=['paraphraser.synonym_model', 'paraphraser.inference', 'paraphraser.download_models'], 51 | #packages=find_packages(exclude=['contrib', 'docs', 'tests']), 52 | #install_requires=['nltk', 'spacy', 'ipython'], 53 | install_requires=[], 54 | extras_require={ 55 | 56 | }, 57 | package_data={ 58 | 59 | }, 60 | data_files=[], 61 | entry_points={ 62 | }, 63 | cmdclass={ 64 | 'download_model': DownloadParaphraseModel 65 | #'download_corpora': DownloadCorpora 66 | } 67 | ) 68 | 69 | --------------------------------------------------------------------------------
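For quick reference, the following is a minimal usage sketch of the WordNet synonym paraphraser defined in paraphraser/synonym_model.py. It assumes the package has been installed and that the spaCy 'en' model and the NLTK 'wordnet' corpus have already been downloaded; keeping only the top five candidates is purely illustrative.

```
# Minimal sketch: score-ranked synonym paraphrases for one input sentence.
from paraphraser.synonym_model import synonym_paraphrase

# synonym_paraphrase returns a set of (sentence, similarity_score) tuples.
candidates = synonym_paraphrase('the rabbit quickly ran down the hole')

# Print the five candidates most similar to the original sentence.
for sentence, score in sorted(candidates, key=lambda pair: pair[1], reverse=True)[:5]:
    print('{:.3f}  {}'.format(score, sentence))
```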