├── README.md ├── images └── 20180128-035256-plot.png ├── paraphraser ├── __init__.py ├── dataset_generator.py ├── download_models.py ├── embeddings.py ├── inference.py ├── inference_frozen_graph.py ├── inspect_checkpoint.py ├── lstm_model.py ├── nlp_pipeline.py ├── paraphraser.py ├── preprocess_data.py ├── sample_embedding_helper.py ├── synonym_model.py ├── training_pipeline.py └── utils.py ├── requirements.txt ├── setup.cfg └── setup.py /README.md: -------------------------------------------------------------------------------- 1 | # Paraphraser 2 | 3 | This project provides users the ability to do paraphrase generation for sentences through a clean and simple API. A demo can be seen here: [pair-a-phrase](http://pair-a-phrase.it) 4 | 5 | The paraphraser was developed under the [Insight Data Science Artificial Intelligence](http://insightdata.ai/) program. 6 | 7 | ## Model 8 | 9 | The underlying model is a bidirectional LSTM encoder and LSTM decoder with attention trained using Tensorflow. Downloadable link here: [paraphrase model](https://drive.google.com/open?id=18uOQsosF4uVGvUgp6pB4BKrQZ1FktlmM) 10 | 11 | ### Prerequisites 12 | 13 | * python 3.5 14 | * Tensorflow 1.4.1 15 | * spacy 16 | 17 | ### Inference Execution 18 | 19 | Download the model checkpoint from the link above and run: 20 | 21 | ``` 22 | python inference.py --checkpoint=<path to checkpoint> 23 | ``` 24 | 25 | ### Datasets 26 | 27 | The dataset used to train this model is an aggregation of many different public datasets. To name a few: 28 | * para-nmt-5m 29 | * Quora question pair 30 | * SNLI 31 | * Semeval 32 | * And more! 33 | 34 | I have not included the aggregated dataset as part of this repo. If you're curious and would like to know more, contact me. Pretrained embeddings come from [John Wieting](http://www.cs.cmu.edu/~jwieting)'s [para-nmt-50m](https://github.com/jwieting/para-nmt-50m) project. 35 | 36 | ### Training 37 | 38 | Training was done for 2 epochs on an Nvidia GTX 1080 and evaluated using the BLEU score. The Tensorboard training curves can be seen below. The grey curve is train and the orange curve is dev.
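To launch a comparable training run, `training_pipeline.py` exposes the training hyperparameters as command line flags (see `parse_arguments` in that file). The invocation below is only an illustrative sketch: the flag values are examples rather than the exact settings behind the released checkpoint, and the dataset files referenced by `utils.dataset_config()` must already exist.

```
python training_pipeline.py --mode train --epochs 2 --batch_size 64 --lr 1e-3 --keep_prob 0.8 --log_dir logs
```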
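For programmatic use, the `Paraphraser` class in `inference.py` wraps checkpoint loading and both decoding modes. The snippet below is a minimal sketch that mirrors the `main()` function of `inference.py`; it assumes it is run from inside the `paraphraser/` directory (as `paraphraser.py` does with `from inference import Paraphraser`), and the checkpoint path and input sentence are placeholders.

```
from inference import Paraphraser

# Placeholder path; point this at the downloaded model checkpoint
paraphraser = Paraphraser('/path/to/paraphrase-model/checkpoint')

# Sample several candidate paraphrases from the decoder's output distribution
candidates = paraphraser.sample_paraphrase('a dog is running through the park',
                                           sampling_temp=0.75, how_many=5)
for i, candidate in enumerate(candidates):
    print(i, candidate)

# Greedy (argmax) decoding; the result comes back as a single-element list
print(paraphraser.greedy_paraphrase('a dog is running through the park')[0])
```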
39 | 40 | ![Tensorboard training curves](images/20180128-035256-plot.png) 41 | 42 | ## TODOs 43 | 44 | * pip installable package 45 | * Explore deeper architectures (more layers) 46 | * Recurrent layer dropout 47 | * Greater dataset augmentation 48 | * Try residual layers 49 | * Model compression 50 | * Byte pair encoding for out-of-vocabulary words 51 | 52 | ## Citations 53 | 54 | ``` 55 | @inproceedings { wieting-17-millions, 56 | author = {John Wieting and Kevin Gimpel}, 57 | title = {Pushing the Limits of Paraphrastic Sentence Embeddings with Millions of Machine Translations}, 58 | booktitle = {arXiv preprint arXiv:1711.05732}, year = {2017} 59 | } 60 | 61 | @inproceedings { wieting-17-backtrans, 62 | author = {John Wieting and Jonathan Mallinson and Kevin Gimpel}, 63 | title = {Learning Paraphrastic Sentence Embeddings from Back-Translated Bitext}, 64 | booktitle = {Proceedings of Empirical Methods in Natural Language Processing}, 65 | year = {2017} 66 | } 67 | ``` 68 | 69 | -------------------------------------------------------------------------------- /images/20180128-035256-plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vincent9514/Text-Variant-Generation/29f4507baecee11c72b8b1f3c66686fff008a9e8/images/20180128-035256-plot.png -------------------------------------------------------------------------------- /paraphraser/__init__.py: -------------------------------------------------------------------------------- 1 | from .synonym_model import synonym_paraphrase 2 | -------------------------------------------------------------------------------- /paraphraser/dataset_generator.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from keras.preprocessing.sequence import pad_sequences 3 | from embeddings import load_sentence_embeddings 4 | from six.moves import xrange 5 | from six import iteritems 6 | from random import shuffle 7 | 8 | class ParaphraseDataset(object): 9 | """This class is responsible for batching the paraphrase dataset into mini batches 10 | for train, dev, and test. The dataset itself must be partitioned into files 11 | beforehand and must follow this format: 12 | 13 | "source sentence\tsource sentence token ids\treference sentence\treference sentence token ids" 14 | 15 | The separator within each field is a single space. 16 | """ 17 | 18 | def __init__(self, dataset_metadata, batch_size, embeddings, word_to_id, start_id, end_id, unk_id, mask_id): 19 | """ Constructor initialization. 20 | 21 | Args: 22 | dataset_metadata: metadata list that follows the format [ 23 | { 24 | 'maxlen': X, 25 | 'train': training filename with sentences of length X, 26 | 'dev': dev filename with sentences of length X, 27 | 'test': test filename with sentences of length X, 28 | }, 29 | ]. Each element is a dict that describes the train, dev, and 30 | test files for sentences of maximum length X. 31 | batch_size: mini batch size 32 | embeddings: pretrained embeddings 33 | word_to_id: vocabulary index 34 | start_id: start of sentence token id 35 | end_id: end of sentence token id 36 | unk_id: unknown token id 37 | mask_id: pad token id applied after the end of sentence.
38 | """ 39 | 40 | # batch size 41 | self.batch_size = batch_size 42 | 43 | # Special tokens 44 | self.start_id = start_id 45 | self.end_id = end_id 46 | self.unk_id = unk_id 47 | self.mask_id = mask_id 48 | 49 | # Word embeddings, vocab 50 | self.embeddings = embeddings 51 | self.word_to_id = word_to_id 52 | self.vocab_size, self.embedding_size = embeddings.shape 53 | 54 | # dataset 55 | self.lengths = sorted([ v for d in dataset_metadata for k, v in iteritems(d) if k == 'maxlen' ]) 56 | self.dataset_metadata = {} 57 | for dm in dataset_metadata: 58 | for k, v in iteritems(dm): 59 | if k == 'maxlen': 60 | self.dataset_metadata[v] = dm 61 | self.dataset = {} 62 | 63 | def load_dataset_into_memory(self, dataset_type): 64 | """Load dataset into memory and partition by train, dev, and test.""" 65 | 66 | if dataset_type not in set(['train', 'test', 'dev']): 67 | raise ValueError("Invalid dataset type.") 68 | 69 | self.dataset[dataset_type] = {} 70 | self.dataset[dataset_type]['all_source_words'] = [] 71 | self.dataset[dataset_type]['all_source_ids'] = [] 72 | self.dataset[dataset_type]['all_source_len'] = [] 73 | self.dataset[dataset_type]['all_ref_words'] = [] 74 | self.dataset[dataset_type]['all_ref_ids'] = [] 75 | self.dataset[dataset_type]['all_ref_len'] = [] 76 | 77 | batch_source_words = [] 78 | batch_source_ids = [] 79 | batch_source_len = [] 80 | batch_ref_words = [] 81 | batch_ref_ids = [] 82 | batch_ref_len = [] 83 | 84 | for length in self.lengths: 85 | with open(self.dataset_metadata[length][dataset_type], 'r') as f: 86 | for i, line in enumerate(f): 87 | source_words, source_ids, ref_words, ref_ids = line.split('\t') 88 | batch_source_words.append(source_words.strip().split(' ')) 89 | batch_source_ids.append(source_ids.strip().split(' ')) 90 | batch_ref_words.append(ref_words.strip().split(' ')) 91 | batch_ref_ids.append(ref_ids.strip().split(' ')) 92 | 93 | if i % self.batch_size != 0 and i != 0: 94 | continue 95 | 96 | batch_source_len = [ len(source_ids) for source_ids in batch_source_ids ] 97 | batch_ref_len = [ len(ref_ids) for ref_ids in batch_ref_ids ] 98 | 99 | self.dataset[dataset_type]['all_source_ids'].append(self.pad_batch(batch_source_ids, length)) 100 | self.dataset[dataset_type]['all_source_words'].append(batch_source_words) 101 | self.dataset[dataset_type]['all_source_len'].append(batch_source_len) 102 | self.dataset[dataset_type]['all_ref_ids'].append(self.pad_batch(batch_ref_ids, length)) 103 | self.dataset[dataset_type]['all_ref_words'].append(batch_ref_words) 104 | self.dataset[dataset_type]['all_ref_len'].append(batch_ref_len) 105 | 106 | batch_source_words = [] 107 | batch_source_ids = [] 108 | batch_source_len = [] 109 | batch_ref_words = [] 110 | batch_ref_ids = [] 111 | batch_ref_len = [] 112 | 113 | if len(batch_source_words) > 0: 114 | batch_source_len = [ len(source_ids) for source_ids in batch_source_ids ] 115 | batch_ref_len = [ len(ref_ids) for ref_ids in batch_ref_ids ] 116 | 117 | self.dataset[dataset_type]['all_source_ids'].append(self.pad_batch(batch_source_ids, length)) 118 | self.dataset[dataset_type]['all_source_words'].append(batch_source_words) 119 | self.dataset[dataset_type]['all_source_len'].append(batch_source_len) 120 | self.dataset[dataset_type]['all_ref_ids'].append(self.pad_batch(batch_ref_ids, length)) 121 | self.dataset[dataset_type]['all_ref_words'].append(batch_ref_words) 122 | self.dataset[dataset_type]['all_ref_len'].append(batch_ref_len) 123 | 124 | batch_source_words = [] 125 | batch_source_ids = [] 126 | batch_source_len = 
[] 127 | batch_ref_words = [] 128 | batch_ref_ids = [] 129 | batch_ref_len = [] 130 | 131 | def generate_batch(self, dataset_type): 132 | """Return a generator that yields a mini batch of size self.batch_size. 133 | 134 | Args: 135 | dataset_type: 'train', 'test', or 'dev' 136 | """ 137 | 138 | if dataset_type not in set(['train', 'test', 'dev']): 139 | raise ValueError("Invalid dataset type.") 140 | 141 | if dataset_type not in self.dataset: 142 | self.load_dataset_into_memory(dataset_type) 143 | 144 | dataset_size = len(self.dataset[dataset_type]['all_source_ids']) 145 | 146 | rs = np.random.get_state() 147 | np.random.shuffle(self.dataset[dataset_type]['all_source_ids']) 148 | np.random.set_state(rs) 149 | np.random.shuffle(self.dataset[dataset_type]['all_source_words']) 150 | np.random.set_state(rs) 151 | np.random.shuffle(self.dataset[dataset_type]['all_source_len']) 152 | np.random.set_state(rs) 153 | np.random.shuffle(self.dataset[dataset_type]['all_ref_ids']) 154 | np.random.set_state(rs) 155 | np.random.shuffle(self.dataset[dataset_type]['all_ref_words']) 156 | np.random.set_state(rs) 157 | np.random.shuffle(self.dataset[dataset_type]['all_ref_len']) 158 | np.random.set_state(rs) 159 | 160 | for i in xrange(dataset_size): 161 | yield { 162 | 'seq_source_ids': self.dataset[dataset_type]['all_source_ids'][i], 163 | 'seq_source_words': self.dataset[dataset_type]['all_source_words'][i], 164 | 'seq_source_len': self.dataset[dataset_type]['all_source_len'][i], 165 | 'seq_ref_ids': self.dataset[dataset_type]['all_ref_ids'][i], 166 | 'seq_ref_words': self.dataset[dataset_type]['all_ref_words'][i], 167 | 'seq_ref_len': self.dataset[dataset_type]['all_ref_len'][i] 168 | } 169 | 170 | def pad_batch(self, batch_ids, max_len): 171 | """ Pad a mini batch with mask_id. This is intended to fill in any 172 | remaining time steps after the end of sentence tokens. 173 | 174 | Args: 175 | batch_ids: The mini batch of token ids of shape (batch_size, time_steps) 176 | max_len: The maximum number of time steps. 
177 | 178 | Returns: 179 | a batch of samples padded with mask_id 180 | """ 181 | padded_batch = np.array(pad_sequences(batch_ids, maxlen=max_len, padding='post', value=self.mask_id)) 182 | return padded_batch 183 | 184 | 185 | if __name__ == '__main__': 186 | from pprint import pprint as pp 187 | from utils import dataset_config 188 | dataset = dataset_config() 189 | word_to_id, idx_to_word, embeddings, start_id, end_id, unk_id, mask_id = load_sentence_embeddings() 190 | pd = ParaphraseDataset(dataset, 10, embeddings, word_to_id, start_id, end_id, unk_id, mask_id) 191 | generator = pd.generate_batch('train') 192 | for i, d in enumerate(generator): 193 | if i == 5: 194 | break 195 | print("=== seq source ids ===") 196 | print(d['seq_source_ids'].shape, flush=True) 197 | print(d['seq_source_ids'], flush=True) 198 | for i in d['seq_source_words']: 199 | print(i) 200 | print(d['seq_source_len'], flush=True) 201 | 202 | print("=== seq ref ids ===") 203 | print(d['seq_ref_ids'].shape, flush=True) 204 | print(d['seq_ref_ids'], flush=True) 205 | for i in d['seq_ref_words']: 206 | print(i) 207 | print(d['seq_ref_len']) 208 | 209 | -------------------------------------------------------------------------------- /paraphraser/download_models.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import requests 4 | import logging 5 | 6 | logging.basicConfig(format = u'[LINE:%(lineno)d]# %(levelname)-8s [%(asctime)s] %(message)s', level = logging.NOTSET) 7 | 8 | def download_file_from_google_drive(id, destination): 9 | URL = "https://docs.google.com/uc?export=download" 10 | logging.info("Downloading "+id + " to "+destination) 11 | logging.info("Please be patient, it may take a while...") 12 | session = requests.Session() 13 | 14 | response = session.get(URL, params = { 'id' : id }, stream = True) 15 | token = get_confirm_token(response) 16 | logging.info("...") 17 | if token: 18 | params = { 'id' : id, 'confirm' : token } 19 | response = session.get(URL, params = params, stream = True) 20 | 21 | save_response_content(response, destination) 22 | logging.info("Done with " + id) 23 | 24 | def get_confirm_token(response): 25 | for key, value in response.cookies.items(): 26 | if key.startswith('download_warning'): 27 | return value 28 | 29 | return None 30 | 31 | def save_response_content(response, destination): 32 | CHUNK_SIZE = 32768 33 | 34 | with open(destination, "wb") as f: 35 | for chunk in response.iter_content(CHUNK_SIZE): 36 | if chunk: # filter out keep-alive new chunks 37 | f.write(chunk) 38 | 39 | ''' 40 | to download the .t7 NTS models used for text simplification 41 | if for some reason, this doanload fails, please use the direct urls: 42 | - for NTS: 43 | https://drive.google.com/open?id=0B_pjS_ZjPfT9dEtrbV85UXhSelU 44 | -for NTS-w2v: 45 | https://drive.google.com/open?id=0B_pjS_ZjPfT9ZTRfSFp4Ql92U0E 46 | ''' 47 | 48 | if __name__ == "__main__": 49 | try: 50 | out_dir = sys.argv[1] 51 | logging.info("Saving files to: " + out_dir) 52 | except: 53 | out_dir = os.path.dirname(os.path.realpath(__file__)) 54 | logging.info("Saving files to: " + out_dir) 55 | 56 | #NTS_model = '0B_pjS_ZjPfT9dEtrbV85UXhSelU' 57 | #NTS_model_output = 'NTS_epoch11_10.19.t7' 58 | #download_file_from_google_drive(NTS_model, os.path.join(out_dir, NTS_model_output)) 59 | 60 | #NTS_w2v_model = '0B_pjS_ZjPfT9ZTRfSFp4Ql92U0E' 61 | #NTS_w2v_model_output = 'NTS-w2v_epoch11_10.20.t7' 62 | #download_file_from_google_drive(NTS_w2v_model, os.path.join(out_dir, 
NTS_w2v_model_output)) 63 | 64 | #model = '1_JsQ_iMnHwvnyd5vZM-6BMqe9hnVqrPi' 65 | model = '19QDCd4UMgt3FtlYYwu0qZU3G1F9_XCvk' 66 | download_file_from_google_drive(model, 'paraphrase-model.tar.gz') 67 | 68 | -------------------------------------------------------------------------------- /paraphraser/embeddings.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | from six import iteritems 4 | from pprint import pprint as pp 5 | #from keras.layers.embeddings import Embedding 6 | 7 | def load_sentence_embeddings(): 8 | '''Load John Wieting sentence embeddings''' 9 | with open("../../para-nmt-50m/data/ngram-word-concat-40.pickle", 'rb') as f: 10 | # [ numpy.ndarray(95283, 300), numpy.ndarray(74664, 300), (trigram_dict, word_dict)] 11 | x = pickle.load(f, encoding='latin1') 12 | word_vocab_size, embedding_size = x[1].shape 13 | 14 | trigram_embeddings, word_embeddings, _ = x 15 | trigram_to_id, word_to_id = x[2] 16 | 17 | word_to_id[''] = word_vocab_size 18 | word_to_id[''] = word_vocab_size + 1 19 | 20 | idx_to_word = { idx: word for word, idx in iteritems(word_to_id) } 21 | 22 | word_embeddings = np.vstack((word_embeddings, np.random.randn(2, embedding_size))) 23 | 24 | return (word_to_id, idx_to_word, word_embeddings, word_to_id[''], 25 | word_to_id[''], word_to_id['UUUNKKK'], word_to_id['★']) 26 | 27 | def load_glove_embeddings(): 28 | with open("/media/sdb/datasets/glove.6B/glove.6B.300d.pickle", "rb") as f: 29 | word_to_id, id_to_word, word_embeddings = pickle.load(f, encoding='latin1') 30 | word_vocab_size, embedding_size = word_embeddings.shape 31 | word_to_id[''] = word_vocab_size 32 | word_to_id[''] = word_vocab_size + 1 33 | word_to_id['UUUNKKK'] = word_vocab_size + 2 34 | word_to_id['★'] = word_vocab_size + 3 35 | id_to_word[word_vocab_size] = '' 36 | id_to_word[word_vocab_size+1] = '' 37 | id_to_word[word_vocab_size+2] = 'UUUNKKK' 38 | id_to_word[word_vocab_size+3] = '★' 39 | word_embeddings = np.vstack((word_embeddings, np.random.randn(4, embedding_size))) 40 | return (word_to_id, id_to_word, word_embeddings, word_to_id[''], 41 | word_to_id[''], word_to_id['UUUNKKK'], word_to_id['★']) 42 | 43 | 44 | if __name__ == '__main__': 45 | word_to_id, idx_to_word, embedding, start_id, end_id, unk_id, mask_id = load_sentence_embeddings() 46 | pp(idx_to_word[mask_id]) 47 | #pp(idx_to_word) 48 | #pp(word_to_id) 49 | #print(embedding.shape) 50 | 51 | -------------------------------------------------------------------------------- /paraphraser/inference.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from embeddings import load_sentence_embeddings 3 | from preprocess_data import preprocess_batch 4 | from six.moves import input 5 | from lstm_model import lstm_model 6 | import numpy as np 7 | from pprint import pprint as pp 8 | 9 | 10 | class Paraphraser(object): 11 | '''Heart of the paraphraser model. This class loads the checkpoint 12 | into the Tensorflow runtime environment and is responsible for inference. 13 | Greedy and sampling based approaches are supported 14 | ''' 15 | 16 | def __init__(self, checkpoint): 17 | """Constructor. Load vocabulary index, start token, end token, unk id, 18 | mask_id. Restore checkpoint. 
19 | 20 | Args: 21 | checkpoint: A path to the checkpoint 22 | """ 23 | self.word_to_id, self.idx_to_word, self.embedding, self.start_id, self.end_id, self.unk_id, self.mask_id = load_sentence_embeddings() 24 | self.checkpoint = checkpoint 25 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.5) 26 | self.sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) 27 | self.model = lstm_model(self.sess, 'infer', 300, self.embedding, self.start_id, self.end_id, self.mask_id) 28 | saver = tf.train.Saver() 29 | saver.restore(self.sess, checkpoint) 30 | 31 | def sample_paraphrase(self, sentence, sampling_temp=1.0, how_many=1): 32 | """Paraphrase by sampling a distribution 33 | 34 | Args: 35 | sentence (str): A sentence input that will be paraphrased by 36 | sampling from distribution. 37 | sampling_temp (int) : A number between 0 an 1 38 | 39 | Returns: 40 | str: a candidate paraphrase of the `sentence` 41 | """ 42 | 43 | return self.infer(1, sentence, self.idx_to_word, sampling_temp, how_many) 44 | 45 | def greedy_paraphrase(self, sentence): 46 | """Paraphrase using greedy sampler 47 | 48 | Args: 49 | sentence : The source sentence to be paraphrased. 50 | 51 | Returns: 52 | str : a candidate paraphrase of the `sentence` 53 | """ 54 | 55 | return self.infer(0, sentence, self.idx_to_word, 0., 1) 56 | 57 | 58 | def infer(self, decoder, source_sent, id_to_vocab, temp, how_many): 59 | """ Perform inferencing. In other words, generate a paraphrase 60 | for the source sentence. 61 | 62 | Args: 63 | decoder : 0 for greedy, 1 for sampling 64 | source_sent : source sentence to generate a paraphrase for 65 | id_to_vocab : dict of vocabulary index to word 66 | end_id : the end token 67 | temp : the sampling temperature to use when `decoder` is 1 68 | 69 | Returns: 70 | str : for the generated paraphrase 71 | """ 72 | 73 | seq_source_words, seq_source_ids = preprocess_batch([ source_sent ] * how_many) 74 | #print(seq_source_words) 75 | #print(seq_source_ids) 76 | seq_source_len = [ len(seq_source) for seq_source in seq_source_ids ] 77 | #print(seq_source_len) 78 | 79 | feed_dict = { 80 | self.model['seq_source_ids']: seq_source_ids, 81 | self.model['seq_source_lengths']: seq_source_len, 82 | self.model['decoder_technique']: decoder, 83 | self.model['sampling_temperature']: temp 84 | } 85 | 86 | feeds = [ 87 | self.model['predictions'] 88 | #model['final_sequence_lengths'] 89 | ] 90 | 91 | predictions = self.sess.run(feeds, feed_dict)[0] 92 | #print(predictions) 93 | return self.translate(predictions, decoder, id_to_vocab, seq_source_words[0]) 94 | 95 | def translate(self, predictions, decoder, id_to_vocab, seq_source_words): 96 | """ Translate the vocabulary ids in `predictions` to actual words 97 | that compose the paraphrase. 
98 | 99 | Args: 100 | predictions : arrays of vocabulary ids 101 | decoder : 0 for greedy, 1 for sample, 2 for beam 102 | id_to_vocab : dict of vocabulary index to word 103 | 104 | Returns: 105 | str : the paraphrase 106 | """ 107 | translated_predictions = [] 108 | #np_end = np.where(translated_predictions == end_id) 109 | for sent_pred in predictions: 110 | translated = [] 111 | for pred in sent_pred: 112 | word = 'UUNNKK' 113 | if pred == self.end_id: 114 | break 115 | if pred == self.unk_id: 116 | # Search for rare word 117 | for seq_source_word in seq_source_words: 118 | if seq_source_word not in self.word_to_id: 119 | word = seq_source_word 120 | else: 121 | word = id_to_vocab[pred] 122 | translated.append(word) 123 | translated_predictions.append(' '.join(translated)) 124 | return translated_predictions 125 | 126 | def main(): 127 | import argparse 128 | parser = argparse.ArgumentParser() 129 | parser.add_argument('--checkpoint', type=str, help='Checkpoint path') 130 | args = parser.parse_args() 131 | paraphraser = Paraphraser(args.checkpoint) 132 | 133 | while 1: 134 | source_sentence = input("Source: ") 135 | #p = paraphraser.greedy_paraphrase(source_sentence) 136 | #print(p) 137 | paraphrases = paraphraser.sample_paraphrase(source_sentence, sampling_temp=0.75, how_many=10) 138 | for i, paraphrase in enumerate(paraphrases): 139 | print("Paraph #{}: {}".format(i, paraphrase)) 140 | 141 | if __name__ == '__main__': 142 | main() 143 | 144 | -------------------------------------------------------------------------------- /paraphraser/inference_frozen_graph.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from load_sent_embeddings import load_sentence_embeddings 3 | from preprocess_data import preprocess_batch 4 | from six.moves import input 5 | 6 | word_to_id, idx_to_word, embedding, start_id, end_id, unk_id = load_sentence_embeddings() 7 | mask_id = 5800 8 | 9 | with open('/media/sdb/models/paraphraser/frozen_model.pb', 'rb') as f: 10 | graph_def = tf.GraphDef() 11 | graph_def.ParseFromString(f.read()) 12 | 13 | with tf.Graph().as_default() as graph: 14 | predictions = tf.import_graph_def( 15 | graph_def=graph_def, 16 | return_elements=['predictions:0'], 17 | name='') 18 | 19 | print([op.name for op in graph.get_operations()]) 20 | 21 | seq_source_ids = graph.get_tensor_by_name('placeholders/source_ids:0') 22 | seq_source_lengths = graph.get_tensor_by_name('placeholders/sequence_source_lengths:0') 23 | decoder_technique = graph.get_tensor_by_name('placeholders/decoder_technique:0') 24 | sampling_temperature = graph.get_tensor_by_name('placeholders/sampling_temperature:0') 25 | keep_prob = graph.get_tensor_by_name('placeholders/keep_prob:0') 26 | 27 | model = { 28 | 'seq_source_ids': seq_source_ids, 29 | 'seq_source_lengths': seq_source_lengths, 30 | 'predictions': predictions, 31 | 'decoder_technique': decoder_technique, 32 | 'sampling_temperature': sampling_temperature 33 | } 34 | 35 | sess = tf.Session() 36 | 37 | def restore_model(checkpoint): 38 | model = lstm_model(sess, 'infer', 300, embedding, start_id, end_id, mask_id) 39 | saver = tf.train.Saver() 40 | saver.restore(sess, checkpoint) 41 | 42 | def translate(predictions, decoder, id_to_vocab, end_id): 43 | """ Translate the vocabulary ids in `predictions` to actual words 44 | that compose the paraphrase. 
45 | 46 | Args: 47 | predictions : arrays of vocabulary ids 48 | decoder : 0 for greedy, 1 for sample, 2 for beam 49 | id_to_vocab : dict of vocabulary index to word 50 | end_id : end token index 51 | 52 | Returns: 53 | str : the paraphrase 54 | """ 55 | if decoder == 2: 56 | _, sentence_length, num_samples = predictions.shape 57 | for i in xrange(num_samples): 58 | sent_pred = [] 59 | for j in xrange(sentence_length): 60 | sent_pred.append(predictions[0][j][i]) 61 | try: 62 | end_index = sent_pred.index(end_id) 63 | sent_pred = sent_pred[:end_index] 64 | except Exception as e: 65 | pass 66 | return ' '.join([ id_to_vocab[pred] for pred in sent_pred ]) 67 | else: 68 | for sent_pred in predictions: 69 | if sent_pred[-1] == end_id: 70 | sent_pred = sent_pred[0:-1] 71 | return ' '.join([ id_to_vocab[pred] for pred in sent_pred ]) 72 | 73 | 74 | def infer(sess, model, decoder, source_sent, id_to_vocab, end_id, temp): 75 | """ Perform inferencing. In other words, generate a paraphrase 76 | for the source sentence. 77 | 78 | Args: 79 | sess : Tensorflow session. 80 | model : dict of tensor to value 81 | decoder : 0 for greedy, 1 for sampling 82 | source_sent : source sentence to generate a paraphrase for 83 | id_to_vocab : dict of vocabulary index to word 84 | end_id : the end token 85 | temp : the sampling temperature to use when `decoder` is 1 86 | 87 | Returns: 88 | str : for the generated paraphrase 89 | """ 90 | 91 | seq_source_words, seq_source_ids = preprocess_batch([ source_sent ]) 92 | seq_source_len = [ len(seq_source) for seq_source in seq_source_ids ] 93 | 94 | feed_dict = { 95 | model['seq_source_ids']: seq_source_ids, 96 | model['seq_source_lengths']: seq_source_len, 97 | model['decoder_technique']: decoder, 98 | model['sampling_temperature']: temp 99 | } 100 | 101 | feeds = [ 102 | model['predictions'] 103 | #model['final_sequence_lengths'] 104 | ] 105 | 106 | predictions = sess.run(feeds, feed_dict)[0][0] 107 | return translate(predictions, decoder, id_to_vocab, end_id) 108 | 109 | def greedy_paraphrase(sentence): 110 | """Paraphrase using greedy sampler 111 | 112 | Args: 113 | sentence : The source sentence to be paraphrased. 114 | 115 | Returns: 116 | str : a candidate paraphrase of the `sentence` 117 | """ 118 | 119 | with tf.Session(graph=graph) as sess: 120 | return infer(sess, model, 0, sentence, idx_to_word, end_id, 0.) 121 | 122 | def sampler_paraphrase(sentence, sampling_temp=1.0): 123 | """Paraphrase by sampling a distribution 124 | 125 | Args: 126 | sentence (str): A sentence input that will be paraphrased by 127 | sampling from distribution. 128 | sampling_temp (int) : A number between 0 an 1 129 | 130 | Returns: 131 | str: a candidate paraphrase of the `sentence` 132 | """ 133 | 134 | with tf.Session(graph=graph) as sess: 135 | return infer(sess, model, 1, sentence, idx_to_word, end_id, sampling_temp) 136 | 137 | def main(): 138 | while 1: 139 | source_sentence = input("Source: ") 140 | #print("Paraph: {}".format(sampler_paraphrase('hello world.'))) 141 | print("Paraph: {}".format(greedy_paraphrase('hello world.'))) 142 | 143 | if __name__ == '__main__': 144 | main() 145 | 146 | 147 | -------------------------------------------------------------------------------- /paraphraser/inspect_checkpoint.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """A simple script for inspect checkpoint files.""" 16 | from __future__ import absolute_import 17 | from __future__ import division 18 | from __future__ import print_function 19 | 20 | import argparse 21 | import sys 22 | 23 | import numpy as np 24 | 25 | from tensorflow.python import pywrap_tensorflow 26 | from tensorflow.python.platform import app 27 | from tensorflow.python.platform import flags 28 | 29 | FLAGS = None 30 | 31 | 32 | def print_tensors_in_checkpoint_file(file_name, tensor_name, all_tensors, 33 | all_tensor_names): 34 | """Prints tensors in a checkpoint file. 35 | 36 | If no `tensor_name` is provided, prints the tensor names and shapes 37 | in the checkpoint file. 38 | 39 | If `tensor_name` is provided, prints the content of the tensor. 40 | 41 | Args: 42 | file_name: Name of the checkpoint file. 43 | tensor_name: Name of the tensor in the checkpoint file to print. 44 | all_tensors: Boolean indicating whether to print all tensors. 45 | all_tensor_names: Boolean indicating whether to print all tensor names. 46 | """ 47 | try: 48 | reader = pywrap_tensorflow.NewCheckpointReader(file_name) 49 | if all_tensors or all_tensor_names: 50 | var_to_shape_map = reader.get_variable_to_shape_map() 51 | for key in sorted(var_to_shape_map): 52 | print("tensor_name: ", key) 53 | if all_tensors: 54 | print(reader.get_tensor(key)) 55 | elif not tensor_name: 56 | print(reader.debug_string().decode("utf-8")) 57 | else: 58 | print("tensor_name: ", tensor_name) 59 | print(reader.get_tensor(tensor_name)) 60 | except Exception as e: # pylint: disable=broad-except 61 | print(str(e)) 62 | if "corrupted compressed block contents" in str(e): 63 | print("It's likely that your checkpoint file has been compressed " 64 | "with SNAPPY.") 65 | if ("Data loss" in str(e) and 66 | (any([e in file_name for e in [".index", ".meta", ".data"]]))): 67 | proposed_file = ".".join(file_name.split(".")[0:-1]) 68 | v2_file_error_template = """ 69 | It's likely that this is a V2 checkpoint and you need to provide the filename 70 | *prefix*. Try removing the '.' and extension. Try: 71 | inspect checkpoint --file_name = {}""" 72 | print(v2_file_error_template.format(proposed_file)) 73 | 74 | 75 | def parse_numpy_printoption(kv_str): 76 | """Sets a single numpy printoption from a string of the form 'x=y'. 77 | 78 | See documentation on numpy.set_printoptions() for details about what values 79 | x and y can take. x can be any option listed there other than 'formatter'. 80 | 81 | Args: 82 | kv_str: A string of the form 'x=y', such as 'threshold=100000' 83 | 84 | Raises: 85 | argparse.ArgumentTypeError: If the string couldn't be used to set any 86 | nump printoption. 87 | """ 88 | k_v_str = kv_str.split("=", 1) 89 | if len(k_v_str) != 2 or not k_v_str[0]: 90 | raise argparse.ArgumentTypeError("'%s' is not in the form k=v." 
% kv_str) 91 | k, v_str = k_v_str 92 | printoptions = np.get_printoptions() 93 | if k not in printoptions: 94 | raise argparse.ArgumentTypeError("'%s' is not a valid printoption." % k) 95 | v_type = type(printoptions[k]) 96 | if v_type is type(None): 97 | raise argparse.ArgumentTypeError( 98 | "Setting '%s' from the command line is not supported." % k) 99 | try: 100 | v = (v_type(v_str) if v_type is not bool 101 | else flags.BooleanParser().parse(v_str)) 102 | except ValueError as e: 103 | raise argparse.ArgumentTypeError(e.message) 104 | np.set_printoptions(**{k: v}) 105 | 106 | 107 | def main(unused_argv): 108 | if not FLAGS.file_name: 109 | print("Usage: inspect_checkpoint --file_name=checkpoint_file_name " 110 | "[--tensor_name=tensor_to_print] " 111 | "[--all_tensors] " 112 | "[--all_tensor_names] " 113 | "[--printoptions]") 114 | sys.exit(1) 115 | else: 116 | print_tensors_in_checkpoint_file(FLAGS.file_name, FLAGS.tensor_name, 117 | FLAGS.all_tensors, FLAGS.all_tensor_names) 118 | 119 | 120 | if __name__ == "__main__": 121 | parser = argparse.ArgumentParser() 122 | parser.register("type", "bool", lambda v: v.lower() == "true") 123 | parser.add_argument( 124 | "--file_name", type=str, default="", help="Checkpoint filename. " 125 | "Note, if using Checkpoint V2 format, file_name is the " 126 | "shared prefix between all files in the checkpoint.") 127 | parser.add_argument( 128 | "--tensor_name", 129 | type=str, 130 | default="", 131 | help="Name of the tensor to inspect") 132 | parser.add_argument( 133 | "--all_tensors", 134 | nargs="?", 135 | const=True, 136 | type="bool", 137 | default=False, 138 | help="If True, print the values of all the tensors.") 139 | parser.add_argument( 140 | "--all_tensor_names", 141 | nargs="?", 142 | const=True, 143 | type="bool", 144 | default=False, 145 | help="If True, print the names of all the tensors.") 146 | parser.add_argument( 147 | "--printoptions", 148 | nargs="*", 149 | type=parse_numpy_printoption, 150 | help="Argument for numpy.set_printoptions(), in the form 'k=v'.") 151 | FLAGS, unparsed = parser.parse_known_args() 152 | app.run(main=main, argv=[sys.argv[0]] + unparsed) 153 | -------------------------------------------------------------------------------- /paraphraser/lstm_model.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import datetime as dt 3 | import sys 4 | import numpy as np 5 | from tensorflow.python.layers import core as layers_core 6 | from sample_embedding_helper import MySampleEmbeddingHelper 7 | 8 | #def lstm_model(args, np_embeddings, start_id, end_id, mask_id, mode): 9 | def lstm_model(sess, mode, cell_hidden_size, np_embeddings, start_id, end_id, mask_id): 10 | vocab_size, hidden_size = np_embeddings.shape 11 | 12 | # Embeddings 13 | with tf.variable_scope('embeddings'): 14 | encoder_embeddings = tf.get_variable(name="encoder_embeddings", shape=np_embeddings.shape, initializer=tf.constant_initializer(np_embeddings), trainable=True) 15 | decoder_embeddings = tf.get_variable(name="decoder_embeddings", shape=np_embeddings.shape, initializer=tf.constant_initializer(np_embeddings), trainable=True) 16 | #embeddings = tf.get_variable(name="embeddings", shape=np_embeddings.shape, initializer=tf.constant_initializer(np_embeddings), trainable=True) 17 | 18 | # Define placeholders 19 | with tf.variable_scope('placeholders'): 20 | lr = tf.placeholder(tf.float32, shape=(), name="learning_rate") 21 | seq_source_ids = tf.placeholder(tf.int32, shape=(None, None), 
name="source_ids") 22 | seq_source_lengths = tf.placeholder(tf.int32, [None], name="sequence_source_lengths") 23 | keep_prob = tf.placeholder_with_default(1.0, shape=(), name="keep_prob") 24 | # 0: greedy, 1: sampling, 2: beam 25 | sampling_temperature = tf.placeholder_with_default(0.5, shape=(), name="sampling_temperature") 26 | decoder_technique = tf.placeholder_with_default(1, shape=(), name="decoder_technique") 27 | #beam_width = tf.placeholder_with_default(5, shape=(), name="beam_width") 28 | dummy = tf.add(sampling_temperature, 1, name="dummy") 29 | 30 | if mode in set(['train', 'dev', 'test']): 31 | seq_reference_ids = tf.placeholder(tf.int32, shape=(None, None), name="reference_ids") 32 | seq_reference_lengths = tf.placeholder(tf.int32, [None], name="sequence_reference_lengths") 33 | paddings = tf.constant([[0, 0], [0, 1]]) 34 | seq_output_ids = tf.pad(seq_reference_ids[:, 1:], paddings, mode="CONSTANT", name="seq_output_ids", constant_values=mask_id) 35 | else: 36 | seq_reference_ids = None 37 | seq_reference_lengths = None 38 | seq_output_ids = None 39 | 40 | #batch_size = tf.cast(tf.shape(seq_source_ids)[0], tf.float32) 41 | batch_size = tf.shape(seq_source_ids)[0] 42 | 43 | # Encoder 44 | #with tf.variable_scope('encoder'): 45 | encoder_embedding = tf.nn.embedding_lookup(encoder_embeddings, seq_source_ids, name="encoder_embedding") 46 | encoder_fw_cell = tf.contrib.rnn.DropoutWrapper(tf.nn.rnn_cell.BasicLSTMCell(cell_hidden_size), input_keep_prob=keep_prob, output_keep_prob=keep_prob) 47 | encoder_bw_cell = tf.contrib.rnn.DropoutWrapper(tf.nn.rnn_cell.BasicLSTMCell(cell_hidden_size), input_keep_prob=keep_prob, output_keep_prob=keep_prob) 48 | encoder_outputs, encoder_states = tf.nn.bidirectional_dynamic_rnn(cell_fw=encoder_fw_cell, 49 | cell_bw=encoder_bw_cell, 50 | inputs=encoder_embedding, 51 | sequence_length=seq_source_lengths, 52 | dtype=tf.float32) 53 | concat_encoder_outputs = tf.concat(encoder_outputs, 2) 54 | encoder_fw_state, encoder_bw_state = encoder_states 55 | encoder_state_c = tf.concat((encoder_fw_state.c, encoder_bw_state.c), axis=1, name="encoder_state_c") 56 | encoder_state_h = tf.concat((encoder_fw_state.h, encoder_bw_state.h), axis=1, name="encoder_state_h") 57 | joined_encoder_state = tf.contrib.rnn.LSTMStateTuple(encoder_state_c, encoder_state_h) 58 | 59 | fc_layer = layers_core.Dense(vocab_size, use_bias=False) 60 | attention = tf.contrib.seq2seq.BahdanauAttention(num_units=cell_hidden_size, memory=concat_encoder_outputs) 61 | decoder_cell = tf.contrib.rnn.DropoutWrapper(tf.nn.rnn_cell.BasicLSTMCell(cell_hidden_size * 2), input_keep_prob=keep_prob, output_keep_prob=keep_prob) 62 | attn_cell = tf.contrib.seq2seq.AttentionWrapper(decoder_cell, attention, attention_layer_size=cell_hidden_size) 63 | zero_state = attn_cell.zero_state(batch_size, tf.float32) 64 | decoder_initial_state = zero_state.clone(cell_state=joined_encoder_state) 65 | 66 | ''' Beam search 67 | tiled_joined_encoder_state = tf.contrib.seq2seq.tile_batch(joined_encoder_state, multiplier=beam_width) 68 | tiled_concat_encoder_outputs = tf.contrib.seq2seq.tile_batch(concat_encoder_outputs, multiplier=beam_width) 69 | beam_attention_mechanism = tf.contrib.seq2seq.BahdanauAttention( 70 | num_units=hidden_size, 71 | memory=tiled_concat_encoder_outputs) 72 | #decoder_cell = tf.contrib.rnn.DropoutWrapper(tf.nn.rnn_cell.BasicLSTMCell(hidden_size * 2), input_keep_prob=1.0, output_keep_prob=1.0) 73 | beam_attn_wrapper = tf.contrib.seq2seq.AttentionWrapper( 74 | 
cell=tf.nn.rnn_cell.BasicLSTMCell(hidden_size * 2), 75 | attention_mechanism=beam_attention_mechanism, 76 | attention_layer_size=hidden_size) 77 | ''' 78 | 79 | # Train, dev, test 80 | if mode in set(['train', 'dev', 'test']): 81 | # Decoder 82 | decoder_embedding = tf.nn.embedding_lookup(decoder_embeddings, seq_reference_ids, name="decoder_embedding") 83 | helper = tf.contrib.seq2seq.TrainingHelper(decoder_embedding, seq_reference_lengths) 84 | decoder = tf.contrib.seq2seq.BasicDecoder(attn_cell, helper, decoder_initial_state, fc_layer) 85 | final_outputs, final_state, final_sequence_lengths = tf.contrib.seq2seq.dynamic_decode(decoder, swap_memory=True) 86 | logits = final_outputs.rnn_output 87 | predictions = final_outputs.sample_id 88 | 89 | with tf.variable_scope('train_loss'): 90 | max_output_len = tf.shape(logits)[1] 91 | seq_output_ids = seq_output_ids[:, :max_output_len] 92 | pad = tf.fill((tf.shape(seq_output_ids)[0], max_output_len), -1) #mask_id 93 | boolean_mask = tf.not_equal(seq_output_ids, pad) 94 | mask = tf.cast(boolean_mask, tf.float32) 95 | labels = tf.reshape(seq_output_ids, shape=(-1, 1)) 96 | crossent = tf.nn.softmax_cross_entropy_with_logits(labels=tf.one_hot(labels, vocab_size), logits=logits) 97 | loss = (tf.reduce_sum(crossent * mask) / tf.cast(batch_size, tf.float32)) 98 | 99 | with tf.variable_scope('summaries'): 100 | tf.summary.scalar("batch_loss", loss) 101 | summaries = tf.summary.merge_all() 102 | 103 | train_step = tf.train.AdamOptimizer(lr).minimize(loss) 104 | 105 | # Test 106 | elif mode == 'infer': 107 | loss = None 108 | train_step = None 109 | labels = None 110 | summaries = None 111 | start_tokens = tf.fill([batch_size], start_id) 112 | 113 | # Beach search decoder 114 | ''' 115 | beam_search_decoder = tf.contrib.seq2seq.BeamSearchDecoder( 116 | cell=beam_attn_wrapper, 117 | embedding=decoder_embeddings, 118 | start_tokens=start_tokens, 119 | end_token=end_id, 120 | initial_state=beam_attn_wrapper.zero_state(batch_size * beam_width, tf.float32).clone(cell_state=tiled_joined_encoder_state), 121 | beam_width=beam_width.eval(), 122 | output_layer=fc_layer, 123 | length_penalty_weight=0.0) 124 | ''' 125 | 126 | # Distribution sampling 127 | #sample_helper = MySampleEmbeddingHelper(decoder_embeddings, start_tokens, end_id, softmax_temperature=sampling_temperature) 128 | sample_helper = tf.contrib.seq2seq.SampleEmbeddingHelper(decoder_embeddings, start_tokens, end_id, softmax_temperature=sampling_temperature) 129 | sample_decoder = tf.contrib.seq2seq.BasicDecoder(attn_cell, sample_helper, decoder_initial_state, output_layer=fc_layer) 130 | 131 | # Greedy argmax decoder 132 | greedy_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(decoder_embeddings, start_tokens, end_id) 133 | # applied per timestep 134 | greedy_decoder = tf.contrib.seq2seq.BasicDecoder(attn_cell, greedy_helper, decoder_initial_state, output_layer=fc_layer) 135 | 136 | # Decode! 
137 | greedy_outputs, greedy_final_state, greedy_fsl = tf.contrib.seq2seq.dynamic_decode( 138 | greedy_decoder, 139 | #maximum_iterations=maximum_iterations, 140 | swap_memory=True) 141 | greedy_logits = greedy_outputs.rnn_output 142 | greedy_predictions = tf.identity(greedy_outputs.sample_id, name="greedy_predictions") 143 | 144 | sample_outputs, sample_final_state, sample_fsl = tf.contrib.seq2seq.dynamic_decode( 145 | sample_decoder, 146 | swap_memory=True) 147 | sample_logits = sample_outputs.rnn_output 148 | sample_predictions = tf.identity(sample_outputs.sample_id, name="sample_predictions") 149 | 150 | ''' 151 | beam_search_outputs, beam_search_final_state, beam_search_fsl = tf.contrib.seq2seq.dynamic_decode( 152 | beam_search_decoder, 153 | swap_memory=True) 154 | beam_search_logits = tf.no_op() 155 | beam_search_predictions = tf.identity(beam_search_outputs.predicted_ids, name="beam_search_predictions") 156 | print(beam_search_predictions) 157 | ''' 158 | z,y,a = tf.case( 159 | pred_fn_pairs={ 160 | tf.equal(sampling_temperature, tf.constant(0.0)): lambda: (greedy_predictions, greedy_fsl, greedy_logits), 161 | tf.equal(sampling_temperature, tf.constant(1.0)): lambda: (sample_predictions, sample_fsl, sample_logits), 162 | }, 163 | default = lambda: (sample_predictions, sample_fsl, sample_logits), 164 | exclusive=True 165 | ) 166 | 167 | predictions, final_sequence_lengths, logits = tf.case( 168 | pred_fn_pairs={ 169 | tf.equal(decoder_technique, tf.constant(0)): lambda: (greedy_predictions, greedy_fsl, greedy_logits), 170 | tf.equal(decoder_technique, tf.constant(1)): lambda: (sample_predictions, sample_fsl, sample_logits), 171 | #tf.equal(decoder_technique, tf.constant(2)): lambda: (beam_search_predictions, beam_search_fsl) 172 | }, 173 | exclusive=True) 174 | 175 | predictions = tf.identity(predictions, name='predictions') 176 | final_sequence_lengths = tf.identity(final_sequence_lengths, name='final_sequence_lengths') 177 | logits = tf.identity(logits, name='logits') 178 | 179 | return { 180 | 'lr': lr, 181 | 'keep_prob': keep_prob, 182 | 'decoder_technique': decoder_technique, 183 | 'sampling_temperature': sampling_temperature, 184 | #'beam_width': beam_width, 185 | 'seq_source_ids': seq_source_ids, 186 | 'seq_source_lengths': seq_source_lengths, 187 | 'seq_reference_ids': seq_reference_ids, 188 | 'seq_reference_lengths': seq_reference_lengths, 189 | #'final_state': final_state, 190 | 'final_sequence_lengths': final_sequence_lengths, 191 | 'embedding_source': encoder_embedding, 192 | 'encoder_states': encoder_states, 193 | 'loss': loss, 194 | 'predictions': predictions, 195 | 'labels': labels, 196 | 'summaries': summaries, 197 | 'train_step': train_step, 198 | 'dummy': dummy 199 | } 200 | 201 | -------------------------------------------------------------------------------- /paraphraser/nlp_pipeline.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import spacy 3 | from spacy.tokenizer import Tokenizer 4 | import datetime as dt 5 | import multiprocessing as mp 6 | 7 | nlp = spacy.load('en') 8 | tokenizer = Tokenizer(nlp.vocab) 9 | 10 | def nlp_pipeline(sentence, word_to_id, unk_id): 11 | ''' Convert word tokens into their vocab ids ''' 12 | return [ word_to_id.get(token.lower_, unk_id) for token in nlp_pipeline_0(sentence) ] 13 | 14 | def nlp_pipeline_0(sentence): 15 | ''' Execute spacy pipeline, single thread ''' 16 | return nlp(sentence, disable=['parser', 'tagger', 'ner']) 17 | 18 | def mp_nlp_pipeline(pool, lines): 19 | 
''' Execute spacy pipeline, multiprocessing style ''' 20 | return pool.map(nlp_pipeline_0, lines, 1) 21 | 22 | def openmp_nlp_pipeline(lines, n_threads=12): 23 | ''' Execute spacy's openmp nlp pipeline ''' 24 | return [ [ token.lower_ for token in doc ] for doc in nlp.pipe(lines, n_threads=n_threads, disable=['parser', 'tagger', 'ner']) ] 25 | 26 | def single_thread_nlp_pipeline(lines): 27 | ''' Another single thread pipeline ''' 28 | return [ nlp(line) for line in lines ] 29 | 30 | def main(): 31 | import datetime as dt 32 | from embeddings import load_sentence_embeddings 33 | #pool = mp.Pool(10) 34 | 35 | word_to_id, idx_to_word, embedding, start_id, end_id, unk_id = load_sentence_embeddings() 36 | print(unk_id) 37 | 38 | with open('/media/sdb/datasets/para-nmt-5m-processed/para-nmt-5m-processed.txt', 'r') as f: 39 | lines = [] 40 | for i, line in enumerate(f): 41 | lines.append(line.strip()) 42 | 43 | if i % 64 == 0: 44 | start = dt.datetime.now() 45 | #docs = mp_nlp_pipeline(pool, lines) 46 | docs = openmp_nlp_pipeline(lines, word_to_id, unk_id) 47 | #docs = single_thread_nlp_pipeline(lines) 48 | #doc = nlp_pipeline_0(line) 49 | print(docs) 50 | 51 | end = dt.datetime.now() 52 | print(end - start, flush=True) 53 | lines = [] 54 | else: 55 | continue 56 | 57 | 58 | if __name__ == '__main__': 59 | main() 60 | 61 | -------------------------------------------------------------------------------- /paraphraser/paraphraser.py: -------------------------------------------------------------------------------- 1 | from synonym_model import synonym_paraphrase 2 | from inference import Paraphraser 3 | -------------------------------------------------------------------------------- /paraphraser/preprocess_data.py: -------------------------------------------------------------------------------- 1 | """Dataset preprocessing and generation. 2 | 3 | This module's purpose is to consume raw paraphrase text and output a dataset 4 | in an optimal form to later be consumed by ParaphraseDataset class in 5 | dataset_generator.py. The raw text are assumed to be valid paraphrases 6 | and must follow the following format each line: 7 | 8 | source sentence\treference sentence 9 | 10 | The number of tokens within a sentence are counted so that samples can be 11 | grouped into the same file by similar length. After nlp preprocessing and 12 | tokenization, the resulting new format per line is: 13 | 14 | source sentence tokens\tsource sentence token ids\treference tokens\treference token ids 15 | 16 | This format is consumed directly into ParaphraseDataset to generate mini 17 | batches where each batch contains similar length sentences. 
18 | 19 | """ 20 | 21 | import os 22 | from six import iteritems 23 | from nlp_pipeline import openmp_nlp_pipeline 24 | from embeddings import load_sentence_embeddings 25 | 26 | word_to_id, idx_to_word, embedding, start_id, end_id, unk_id, mask_id = load_sentence_embeddings() 27 | 28 | def generate_length_index(max_lengths): 29 | l = [] 30 | prev = None 31 | for ml in max_lengths: 32 | if prev == None: 33 | a = (ml+1) * [ml] 34 | else: 35 | a = (ml - prev) * [ml] 36 | prev = ml 37 | l.extend(a) 38 | return l 39 | 40 | def word_to_token_ids(batch_docs): 41 | batch_token_ids = [ [ word_to_id.get(word, unk_id) for word in doc ] for doc in batch_docs ] 42 | return batch_token_ids 43 | 44 | def preprocess_batch(batch_sentences): 45 | # NLP Pipleine 46 | batch_words = openmp_nlp_pipeline(batch_sentences) 47 | batch_ids_ = word_to_token_ids(batch_words) 48 | 49 | # Create reference, preprend start id, append end id 50 | batch_ids = [ [start_id] + ids + [end_id] for ids in batch_ids_ ] 51 | 52 | return (batch_words, batch_ids) 53 | 54 | def fsave_data(filename, batch_source_words, batch_source_ids, batch_ref_words, batch_ref_ids): 55 | max_lengths = [5, 10, 20, 30, 40, 50] 56 | 57 | for length in max_lengths: 58 | try: 59 | os.remove(filename + "." + str(length)) 60 | except: 61 | pass 62 | 63 | files = { length: open(filename + "." + str(length), 'a') for length in max_lengths } 64 | l = generate_length_index(max_lengths) 65 | 66 | z = zip(batch_source_words, batch_source_ids, batch_ref_words, batch_ref_ids) 67 | 68 | for source_words, source_ids, ref_words, ref_ids in z: 69 | max_len = max(len(source_ids), len(ref_ids)) 70 | try: 71 | files[l[max_len]].write("{}\t{}\t{}\t{}\n".format(' '.join(source_words), 72 | ' '.join([ str(source_id) for source_id in source_ids ]), 73 | ' '.join(ref_words), 74 | ' '.join([ str(ref_id) for ref_id in ref_ids ]))) 75 | except Exception as e: 76 | print(e) 77 | print("Error writing {} {} {} {}".format(' '.join(source_words), 78 | ' '.join([ str(source_id) for source_id in source_ids ]), 79 | ' '.join(ref_words), 80 | ' '.join([ str(ref_id) for ref_id in ref_ids ]))) 81 | continue 82 | 83 | for length, f in iteritems(files): 84 | f.close() 85 | 86 | def preprocess_data(filename): 87 | batch_source_sentences = [] 88 | batch_ref_sentences = [] 89 | 90 | with open(filename, 'r') as f: 91 | for i, line in enumerate(f): 92 | source, ref = line.split('\t') 93 | batch_source_sentences.append(source.strip()) 94 | batch_ref_sentences.append(ref.strip()) 95 | 96 | batch_source_words, batch_source_ids = preprocess_batch(batch_source_sentences) 97 | batch_ref_words, batch_ref_ids = preprocess_batch(batch_ref_sentences) 98 | 99 | fsave_data(filename, batch_source_words, batch_source_ids, batch_ref_words, batch_ref_ids) 100 | 101 | def main(): 102 | import sys 103 | preprocess_data(sys.argv[1]) 104 | 105 | if __name__ == '__main__': 106 | main() 107 | 108 | -------------------------------------------------------------------------------- /paraphraser/sample_embedding_helper.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import abc 6 | 7 | import six 8 | 9 | from tensorflow.contrib.seq2seq.python.ops import decoder 10 | from tensorflow.python.framework import dtypes 11 | from tensorflow.python.framework import ops 12 | from tensorflow.python.framework import tensor_shape 13 | from tensorflow.python.ops import 
array_ops 14 | from tensorflow.python.ops import control_flow_ops 15 | from tensorflow.python.ops import embedding_ops 16 | from tensorflow.python.ops import gen_array_ops 17 | from tensorflow.python.ops import math_ops 18 | from tensorflow.python.ops import tensor_array_ops 19 | from tensorflow.python.ops.distributions import bernoulli 20 | from tensorflow.python.ops.distributions import categorical 21 | from tensorflow.python.util import nest 22 | from tensorflow.contrib.seq2seq.python.ops.helper import GreedyEmbeddingHelper 23 | 24 | 25 | class MySampleEmbeddingHelper(GreedyEmbeddingHelper): 26 | """A helper for use during inference. 27 | Uses sampling (from a distribution) instead of argmax and passes the 28 | result through an embedding layer to get the next input. 29 | """ 30 | 31 | def __init__(self, embedding, start_tokens, end_token, 32 | softmax_temperature=None, seed=None): 33 | """Initializer. 34 | Args: 35 | embedding: A callable that takes a vector tensor of `ids` (argmax ids), 36 | or the `params` argument for `embedding_lookup`. The returned tensor 37 | will be passed to the decoder input. 38 | start_tokens: `int32` vector shaped `[batch_size]`, the start tokens. 39 | end_token: `int32` scalar, the token that marks end of decoding. 40 | softmax_temperature: (Optional) `float32` scalar, value to divide the 41 | logits by before computing the softmax. Larger values (above 1.0) result 42 | in more random samples, while smaller values push the sampling 43 | distribution towards the argmax. Must be strictly greater than 0. 44 | Defaults to 1.0. 45 | seed: (Optional) The sampling seed. 46 | Raises: 47 | ValueError: if `start_tokens` is not a 1D tensor or `end_token` is not a 48 | scalar. 49 | """ 50 | super(MySampleEmbeddingHelper, self).__init__( 51 | embedding, start_tokens, end_token) 52 | self._softmax_temperature = softmax_temperature 53 | self._seed = seed 54 | 55 | def sample(self, time, outputs, state, name=None): 56 | """sample for SampleEmbeddingHelper.""" 57 | del time, state # unused by sample_fn 58 | # Outputs are logits, we sample instead of argmax (greedy). 
59 | if not isinstance(outputs, ops.Tensor): 60 | raise TypeError("Expected outputs to be a single Tensor, got: %s" % 61 | type(outputs)) 62 | if self._softmax_temperature is None: 63 | logits = outputs 64 | else: 65 | #logits = outputs / self._softmax_temperature 66 | logits = math_ops.divide(outputs, self._softmax_temperature) 67 | 68 | sample_id_sampler = categorical.Categorical(logits=logits) 69 | sample_ids = sample_id_sampler.sample(seed=self._seed) 70 | 71 | return sample_ids 72 | 73 | -------------------------------------------------------------------------------- /paraphraser/synonym_model.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import spacy 3 | from pprint import pprint 4 | from spacy.tokens.token import Token 5 | from nltk.corpus import wordnet as wn 6 | from six.moves import xrange 7 | import random 8 | 9 | nlp = spacy.load('en') 10 | 11 | def generate_sentence(original_doc, new_tokens): 12 | new_sentence = ' '.join(new_tokens).replace('_', ' ') 13 | new_doc = nlp(new_sentence) 14 | similarity_score = original_doc.similarity(new_doc) 15 | return (new_sentence, similarity_score) 16 | 17 | def synonym_model(s): 18 | generated_sentences = set([]) 19 | 20 | doc = nlp(s) 21 | original_tokens = [ token.text for token in doc ] 22 | 23 | index_to_lemmas = {} 24 | 25 | for index, token in enumerate(doc): 26 | index_to_lemmas[index] = set([]) 27 | index_to_lemmas[index].add(token) 28 | 29 | if token.pos_ == 'NOUN' and len(token.text) >= 3: 30 | pos = wn.NOUN 31 | elif token.pos_ == 'VERB' and len(token.text) >= 3: 32 | pos = wn.VERB 33 | elif token.pos_ == 'ADV' and len(token.text) >= 3: 34 | pos = wn.ADV 35 | elif token.pos_ == 'ADJ' and len(token.text) >= 3: 36 | pos = wn.ADJ 37 | else: 38 | continue 39 | 40 | # Synsets 41 | for synset in wn.synsets(token.text, pos): 42 | for lemma in synset.lemmas(): 43 | new_tokens = original_tokens.copy() 44 | new_tokens[index] = lemma.name() 45 | sentence_and_score = generate_sentence(doc, new_tokens) 46 | generated_sentences.add(sentence_and_score) 47 | index_to_lemmas[index].add(lemma.name()) 48 | 49 | count = sum([ len(words) for words in index_to_lemmas.values() ]) 50 | 51 | for i in xrange(min(count, 40)): 52 | new_tokens = [] 53 | for index, words in sorted(index_to_lemmas.items(), key=lambda x: x[0]): 54 | token = random.sample(index_to_lemmas[index], 1)[0] 55 | new_tokens.append(str(token)) 56 | sentence_and_score = generate_sentence(doc, new_tokens) 57 | generated_sentences.add(sentence_and_score) 58 | 59 | #print(generated_sentences) 60 | return generated_sentences 61 | 62 | def synonym_paraphrase(s): 63 | return synonym_model(s) 64 | 65 | if __name__ == '__main__': 66 | #x = synonym_model('I am discussing my homework with the teacher.') 67 | #x = synonym_model('the rabbit quickly ran down the hole') 68 | #x = synonym_model('John tried to fix his computer by hacking away at it.') 69 | x = synonym_model('team based multiplayer online first person shooter video game') 70 | print(x) 71 | 72 | -------------------------------------------------------------------------------- /paraphraser/training_pipeline.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import tensorflow as tf 3 | import numpy as np 4 | import os 5 | import sys 6 | import datetime as dt 7 | from six.moves import xrange, input 8 | from lstm_model_beam import lstm_model 9 | from embeddings import load_sentence_embeddings 10 | from dataset_generator import 
ParaphraseDataset 11 | from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, SmoothingFunction 12 | from utils import dataset_config, debug_data, summarize_scalar 13 | import logging 14 | 15 | logging.basicConfig(format = u'[%(asctime)s] %(levelname)-8s : %(message)s', level = logging.INFO) 16 | 17 | def evaluate(sess, model, dataset_generator, mode, id_to_vocab): 18 | """Evaluate current model on the dev or test set. 19 | 20 | Args: 21 | sess: Tensorflow session 22 | model: dictionary containing model's tensors of interest for evaluation 23 | dataset_generator: dataset batch generator 24 | mode: 'dev' or 'test' 25 | id_to_vocab: voabulary dictionary id -> word 26 | 27 | Returns: 28 | loss: the loss after evaluating the dataset 29 | bleu_score: BLEU score after evaluation 30 | """ 31 | 32 | batch_generator = dataset_generator.generate_batch(mode) 33 | chencherry = SmoothingFunction() 34 | batch_losses = [] 35 | all_seq_ref_words = [] 36 | all_bleu_pred_words = [] 37 | 38 | for batch in batch_generator: 39 | seq_source_ids = batch['seq_source_ids'] 40 | seq_source_words = batch['seq_source_words'] 41 | seq_source_len = batch['seq_source_len'] 42 | seq_ref_ids = batch['seq_ref_ids'] 43 | seq_ref_words = batch['seq_ref_words'] 44 | seq_ref_len = batch['seq_ref_len'] 45 | 46 | feed_dict = { 47 | model['seq_source_ids']: seq_source_ids, 48 | model['seq_source_lengths']: seq_source_len, 49 | model['seq_reference_ids']: seq_ref_ids, 50 | model['seq_reference_lengths']: seq_ref_len 51 | } 52 | 53 | feeds = [ 54 | model['loss'], 55 | model['predictions'], 56 | model['final_sequence_lengths'] 57 | ] 58 | 59 | try: 60 | batch_loss, predictions, fsl = sess.run(feeds, feed_dict) 61 | except Exception as e: 62 | debug_data(seq_source_ids, seq_ref_ids, seq_source_len, seq_ref_len, id_to_vocab) 63 | raise e 64 | 65 | # batch losses 66 | batch_losses.append(batch_loss) 67 | 68 | # all ref words 69 | seq_ref_words = [ [ref_words] for ref_words in seq_ref_words ] 70 | all_seq_ref_words.extend(seq_ref_words) 71 | 72 | # all prediction words to compute bleu on 73 | bleu_pred_words = [ [ id_to_vocab[vocab_id] for vocab_id in prediction if vocab_id in id_to_vocab ] for prediction in predictions ] 74 | bleu_pred_words = [ pred_words[:pred_words.index('') if '' in pred_words else len(pred_words) ] for pred_words in bleu_pred_words ] 75 | all_bleu_pred_words.extend(bleu_pred_words) 76 | 77 | bleu_score = corpus_bleu(all_seq_ref_words, all_bleu_pred_words, smoothing_function=chencherry.method1) 78 | loss = sum(batch_losses) / len(batch_losses) 79 | logging.info("{} : Evaluating on {} set loss={:.4f} bleu={:.4f}".format(dt.datetime.now(), mode, loss, bleu_score)) 80 | return loss, bleu_score 81 | 82 | def infer(sess, args, model, id_to_vocab, end_id): 83 | """Perform inference on a model. This is intended to be interactive. 84 | A user will run this from the command line to provide an input sentence 85 | and receive a paraphrase as output continuously within a loop. 
86 | 87 | Args: 88 | sess: Tensorflow session 89 | args: ArgumentParser object configuration 90 | model: a dictionary containing the model tensors 91 | id_to_vocab: vocabulary index of id_to_vocab 92 | end_id: the end of sentence token 93 | 94 | """ 95 | from preprocess_data import preprocess_batch 96 | 97 | while 1: 98 | source_sent = input("Enter source sentence: ") 99 | seq_source_words, seq_source_ids = preprocess_batch([ source_sent ]) 100 | seq_source_len = [ len(seq_source) for seq_source in seq_source_ids ] 101 | 102 | if args.decoder == 'greedy': 103 | decoder = 0 104 | elif args.decoder == 'sample': 105 | decoder = 1 106 | 107 | feed_dict = { 108 | model['seq_source_ids']: seq_source_ids, 109 | model['seq_source_lengths']: seq_source_len, 110 | model['decoder_technique']: decoder, 111 | model['sampling_temperature']: args.sampling_temperature, 112 | } 113 | 114 | feeds = [ 115 | model['predictions'], 116 | model['final_sequence_lengths'] 117 | ] 118 | 119 | predictions, final_sequence_lengths = sess.run(feeds, feed_dict) 120 | 121 | for sent_pred in predictions: 122 | if sent_pred[-1] == end_id: 123 | sent_pred = sent_pred[0:-1] 124 | print("Paraphrase : {}".format(' '.join([ id_to_vocab[pred] for pred in sent_pred ]))) 125 | 126 | def compress_graph(sess, args, model): 127 | """After training has completed, this function can be called to compress 128 | the model. The computation graph is frozen turning the checkpoint 129 | variables into constants. Finally, optimization is done by stripping 130 | away all unnecessary nodes from the graph if they are not used at 131 | inference time. 132 | 133 | Args: 134 | sess: Tensorflow session 135 | args: ArgumentParser config object 136 | model: model dictionary containing tensors of interest 137 | 138 | """ 139 | from tensorflow.python.tools import freeze_graph 140 | from tensorflow.python.tools import optimize_for_inference_lib 141 | 142 | tf.train.write_graph(sess.graph_def, '/media/sdb/models/paraphraser', 'model.pb', as_text=False) 143 | 144 | freeze_graph.freeze_graph( 145 | #input_graph='/tmp/model.pbtxt', 146 | input_graph='/media/sdb/models/paraphraser/model.pb', 147 | input_saver='', 148 | input_binary=True, 149 | input_checkpoint=args.checkpoint, 150 | output_node_names='predictions', 151 | restore_op_name='save/restore_all', 152 | filename_tensor_name='save/Const:0', 153 | output_graph='/media/sdb/models/paraphraser/frozen_model.pb', 154 | clear_devices=True, 155 | initializer_nodes='') 156 | 157 | ''' 158 | input_graph_def = tf.GraphDef() 159 | #with tf.gfile.Open('/media/sdb/models/paraphraser/frozen_model.pb', 'rb') as f: 160 | with tf.gfile.Open('/tmp/frozen_model.pb', 'rb') as f: 161 | data = f.read() 162 | input_graph_def.ParseFromString(data) 163 | with tf.Graph().as_default() as graph: 164 | tf.import_graph_def(input_graph_def) 165 | print(dir(graph)) 166 | print(graph.find_tensor_by_name('placeholders/sampling_temperature')) 167 | 168 | output_graph_def = optimize_for_inference_lib.optimize_for_inference( 169 | input_graph_def, 170 | ['placeholders/source_ids', 'placeholders/sequence_source_lengths'], 171 | ['predictions'], 172 | tf.float32.as_datatype_enum) 173 | 174 | f = tf.gfile.FastGFile('/tmp/optimized_model.pb', "w") 175 | f.write(output_graph_def.SerializeToString()) 176 | ''' 177 | 178 | 179 | def parse_arguments(): 180 | """Argument parser configuration.""" 181 | parser = argparse.ArgumentParser() 182 | 183 | parser.add_argument('--log_dir', type=str, default="logs", help="Log directory to store tensorboard 
summary and model checkpoints") 184 | parser.add_argument('--epochs', type=int, default=3, help="Number of epochs to train") 185 | parser.add_argument('--lr', type=float, default=1e-3, help="Learning rate") 186 | parser.add_argument('--batch_size', type=int, default=64, help="Mini batch size") 187 | parser.add_argument('--max_seq_length', type=int, default=40, help="Maximum sequence length. Sentence lengths beyond this are truncated.") 188 | parser.add_argument('--hidden_size', type=int, default=300, help="Hidden dimension size") 189 | parser.add_argument('--keep_prob', type=float, default=0.8, help="Keep probability for dropout") 190 | parser.add_argument('--decoder', type=str, choices=['greedy', 'sample'], help="Decoder type") 191 | parser.add_argument('--sampling_temperature', type=float, default=0.0, help="Sampling temperature") 192 | parser.add_argument('--mode', type=str, default=None, choices=['train', 'dev', 'test', 'infer'], help='train or dev or test or infer or minimize') 193 | parser.add_argument('--checkpoint', type=str, default=None, help="Model checkpoint file") 194 | parser.add_argument('--minimize_graph', type=bool, default=False, help="Save existing checkpoint to minimal graph") 195 | 196 | return parser.parse_args() 197 | 198 | def main(): 199 | """Entry point for all training, evaluation, and model compression begins here""" 200 | args = parse_arguments() 201 | word_to_id, id_to_vocab, embeddings, start_id, end_id, unk_id, mask_id = load_sentence_embeddings() 202 | vocab_size, embedding_size = embeddings.shape 203 | lr = args.lr 204 | 205 | dataset = dataset_config() 206 | 207 | if args.mode not in set(['train', 'dev', 'test', 'infer', 'minimize']): 208 | raise ValueError("{} is not a valid mode".format(args.mode)) 209 | 210 | with tf.Session() as sess: 211 | start = dt.datetime.now() 212 | model = lstm_model(sess, args.mode, args.hidden_size, embeddings, start_id, end_id, mask_id) 213 | 214 | # Saver object 215 | saver = tf.train.Saver() 216 | name_to_var_map = {var.op.name: var for var in tf.global_variables()} 217 | 218 | # Restore checkpoint 219 | if args.checkpoint: 220 | saver.restore(sess, args.checkpoint) 221 | 222 | # Save minimal graph 223 | if args.minimize_graph: 224 | compress_graph(sess, args, model) 225 | return 226 | 227 | # Load dataset only in train, dev, or test mode 228 | if args.mode in set(['train', 'dev', 'test']): 229 | logging.info("{}: Loading dataset into memory.".format(dt.datetime.now())) 230 | dataset_generator = ParaphraseDataset(dataset, args.batch_size, embeddings, word_to_id, start_id, end_id, unk_id, mask_id) 231 | 232 | # Evaluate on dev or test 233 | if args.mode == 'dev' or args.mode == 'test': 234 | evaluate(sess, model, dataset_generator, args.mode, id_to_vocab) 235 | return 236 | 237 | # Perform inferencing 238 | if args.mode == 'infer': 239 | infer(sess, args, model, id_to_vocab, end_id) 240 | return 241 | 242 | ################################### 243 | # Training run proceeds from here # 244 | ################################### 245 | 246 | # Training summary writer 247 | train_logdir = os.path.join(args.log_dir, "train-" + start.strftime("%Y%m%d-%H%M%S")) 248 | train_writer = tf.summary.FileWriter(train_logdir) 249 | 250 | # Dev summary writer 251 | dev_logdir = os.path.join(args.log_dir, "dev-" + start.strftime("%Y%m%d-%H%M%S")) 252 | dev_writer = tf.summary.FileWriter(dev_logdir) 253 | 254 | chencherry = SmoothingFunction() 255 | global_step = 0 256 | tf.global_variables_initializer().run() 257 | sess.run(model['dummy'], 
{model['sampling_temperature']: 7.5}) 258 | 259 | # Training per epoch 260 | for epoch in xrange(args.epochs): 261 | train_losses = [] 262 | train_batch_generator = dataset_generator.generate_batch('train') 263 | for train_batch in train_batch_generator: 264 | seq_source_ids = train_batch['seq_source_ids'] 265 | seq_source_words = train_batch['seq_source_words'] 266 | seq_source_len = train_batch['seq_source_len'] 267 | seq_ref_ids = train_batch['seq_ref_ids'] 268 | seq_ref_words = train_batch['seq_ref_words'] 269 | seq_ref_len = train_batch['seq_ref_len'] 270 | 271 | feed_dict = { 272 | model['lr']: lr, 273 | model['seq_source_ids']: seq_source_ids, 274 | model['seq_source_lengths']: seq_source_len, 275 | model['seq_reference_ids']: seq_ref_ids, 276 | model['seq_reference_lengths']: seq_ref_len, 277 | model['keep_prob']: args.keep_prob 278 | } 279 | 280 | feeds = [ 281 | model['train_step'], 282 | model['loss'], 283 | model['predictions'], 284 | model['summaries'], 285 | model['final_sequence_lengths'] 286 | ] 287 | 288 | try: 289 | _, batch_loss, predictions, summary, fsl = sess.run(feeds, feed_dict) 290 | except Exception as e: 291 | debug_data(seq_source_ids, seq_ref_ids, seq_source_len, seq_ref_len, id_to_vocab) 292 | raise e 293 | 294 | train_losses.append(batch_loss) 295 | 296 | # Status update 297 | if global_step % 25 == 0: 298 | train_writer.add_summary(summary, global_step) 299 | train_writer.flush() 300 | seq_ref_words = [ [ref_words] for ref_words in seq_ref_words ] 301 | bleu_pred_words = [ [ id_to_vocab[vocab_id] for vocab_id in prediction if vocab_id in id_to_vocab ] for prediction in predictions ] 302 | bleu_pred_words = [ pred_words[:pred_words.index('') if '' in pred_words else len(pred_words) ] for pred_words in bleu_pred_words ] 303 | bleu_score = corpus_bleu(seq_ref_words, bleu_pred_words, smoothing_function=chencherry.method1) 304 | summarize_scalar(train_writer, 'bleu_score', bleu_score, global_step) 305 | train_loss = sum(train_losses) / len(train_losses) 306 | summarize_scalar(train_writer, 'loss', train_loss, global_step) 307 | logging.info("step={} epoch={} batch_loss={:.4f} train_loss={:.4f} bleu={:.4f}".format(global_step, epoch, batch_loss, train_loss, bleu_score)) 308 | 309 | # Print predictions for this batch every 1000 steps 310 | # Evaluate on dev set 311 | if global_step % 1000 == 0 and global_step != 0: 312 | debug_data(seq_source_ids, seq_ref_ids, seq_source_len, seq_ref_len, id_to_vocab) 313 | logging.info("PREDICTIONS!") 314 | logging.info("final_seq_lengths: " + str(fsl)) 315 | logging.info("len(predictions): " + str(len(predictions))) 316 | for prediction in predictions: 317 | logging.info(str(len(prediction)) + ' ' + ' '.join([id_to_vocab[vocab_id] for vocab_id in prediction if vocab_id in id_to_vocab])) 318 | 319 | dev_loss, bleu_score = evaluate(sess, model, dataset_generator, 'dev', id_to_vocab) 320 | summarize_scalar(dev_writer, 'bleu_score', bleu_score, global_step) 321 | summarize_scalar(dev_writer, 'loss', dev_loss, global_step) 322 | dev_writer.flush() 323 | 324 | # Checkpoint. 325 | #if global_step % 50 == 0 and global_step != 0: 326 | if global_step % 5000 == 0 and global_step != 0: 327 | saver.save(sess, os.path.join(train_logdir, 'model'), global_step=global_step) 328 | 329 | global_step += 1 330 | # End train batch 331 | 332 | saver.save(sess, os.path.join(train_logdir, 'model'), global_step=global_step) 333 | lr /= 10. 
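# (The division above anneals the learning rate: it is divided by 10 at the end of each training epoch.)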
334 | # End epoch 335 | 336 | evaluate(sess, model, dataset_generator, 'test', id_to_vocab) 337 | # End sess 338 | 339 | if __name__ == '__main__': 340 | main() 341 | 342 | -------------------------------------------------------------------------------- /paraphraser/utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import tensorflow as tf 3 | 4 | def summarize_scalar(writer, tag, value, step): 5 | """Prepare data to be written to a protobuf event file. This is later 6 | read into tensorboard for visualization. 7 | 8 | Args: 9 | writer: summary writer 10 | tag: identifier name of the data in question 11 | value: the value the data takes on 12 | step: global step during training 13 | """ 14 | summary = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=value)]) 15 | writer.add_summary(summary, step) 16 | 17 | 18 | def debug_data(seq_source_ids, seq_ref_ids, seq_source_len, seq_ref_len, id_to_vocab): 19 | """Debug dataset batch samples to ensure they take on intended values""" 20 | logging.info("==============================================================") 21 | logging.info("SOURCE!") 22 | #logging.info(seq_source_ids) 23 | for source_ids in seq_source_ids: 24 | logging.info(' '.join([id_to_vocab[source_id] for source_id in source_ids])) 25 | logging.info(seq_source_len) 26 | logging.info("REFERENCE!") 27 | #logging.info(seq_ref_ids) 28 | for i in seq_ref_ids: 29 | logging.info(' '.join([id_to_vocab[label] for label in i if label != -1])) 30 | logging.info(seq_ref_len) 31 | logging.info("==============================================================") 32 | 33 | def dataset_config(): 34 | """Dataset configuration. Dataset files are grouped by sentences of maximum 35 | length for train, dev, and test.
""" 36 | 37 | dataset = [ 38 | { 39 | 'maxlen': 5, 40 | 'train': '/media/sdb/datasets/aggregate_paraphrase_corpus_0/dataset.train.5', 41 | 'dev': '/media/sdb/datasets/aggregate_paraphrase_corpus_0/dataset.dev.5', 42 | 'test': '/media/sdb/datasets/aggregate_paraphrase_corpus_0/dataset.test.5' 43 | }, 44 | { 45 | 'maxlen': 10, 46 | 'train': '/media/sdb/datasets/aggregate_paraphrase_corpus_0/dataset.train.10', 47 | 'dev': '/media/sdb/datasets/aggregate_paraphrase_corpus_0/dataset.dev.10', 48 | 'test': '/media/sdb/datasets/aggregate_paraphrase_corpus_0/dataset.test.10' 49 | }, 50 | { 51 | 'maxlen': 20, 52 | 'train': '/media/sdb/datasets/aggregate_paraphrase_corpus_0/dataset.train.20', 53 | 'dev': '/media/sdb/datasets/aggregate_paraphrase_corpus_0/dataset.dev.20', 54 | 'test': '/media/sdb/datasets/aggregate_paraphrase_corpus_0/dataset.test.20' 55 | }, 56 | { 57 | 'maxlen': 30, 58 | 'train': '/media/sdb/datasets/aggregate_paraphrase_corpus_0/dataset.train.30', 59 | 'dev': '/media/sdb/datasets/aggregate_paraphrase_corpus_0/dataset.dev.30', 60 | 'test': '/media/sdb/datasets/aggregate_paraphrase_corpus_0/dataset.test.30' 61 | }, 62 | { 63 | 'maxlen': 40, 64 | 'train': '/media/sdb/datasets/aggregate_paraphrase_corpus_0/dataset.train.40', 65 | 'dev': '/media/sdb/datasets/aggregate_paraphrase_corpus_0/dataset.dev.40', 66 | 'test': '/media/sdb/datasets/aggregate_paraphrase_corpus_0/dataset.test.40' 67 | }, 68 | { 69 | 'maxlen': 50, 70 | 'train': '/media/sdb/datasets/aggregate_paraphrase_corpus_0/dataset.train.50', 71 | 'dev': '/media/sdb/datasets/aggregate_paraphrase_corpus_0/dataset.dev.50', 72 | 'test': '/media/sdb/datasets/aggregate_paraphrase_corpus_0/dataset.test.50' 73 | } 74 | ] 75 | 76 | return dataset 77 | 78 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | backports.weakref==1.0rc1 2 | bleach==1.5.0 3 | blessings==1.6 4 | blis==0.0.13 5 | bottle==0.12.13 6 | certifi==2017.11.5 7 | chardet==3.0.4 8 | cycler==0.10.0 9 | cymem==1.31.2 10 | cytoolz==0.8.2 11 | decorator==4.1.1 12 | dill==0.2.7.1 13 | en-core-web-sm==2.0.0 14 | enum34==1.1.6 15 | ftfy==4.4.3 16 | future==0.16.0 17 | gpustat==0.4.1 18 | h5py==2.7.1 19 | html5lib==0.9999999 20 | idna==2.6 21 | ipython==6.1.0 22 | ipython-genutils==0.2.0 23 | jedi==0.10.2 24 | Keras==2.1.3 25 | Markdown==2.6.10 26 | matplotlib==2.1.2 27 | msgpack-numpy==0.4.1 28 | msgpack-python==0.5.1 29 | murmurhash==0.28.0 30 | nltk==3.2.5 31 | numpy==1.14.0 32 | nvidia-ml-py3==7.352.0 33 | paraphraser==0.1.0a1 34 | pathlib==1.0.1 35 | pexpect==4.2.1 36 | pickleshare==0.7.4 37 | plac==0.9.6 38 | preshed==1.0.0 39 | prompt-toolkit==1.0.14 40 | protobuf==3.5.0.post1 41 | psutil==5.4.2 42 | ptyprocess==0.5.2 43 | Pygments==2.2.0 44 | pyparsing==2.2.0 45 | python-dateutil==2.6.1 46 | pytz==2017.3 47 | PyYAML==3.12 48 | regex==2017.4.5 49 | requests==2.18.4 50 | scipy==1.0.0 51 | simplegeneric==0.8.1 52 | simplejson==3.13.2 53 | six==1.11.0 54 | spacy==2.0.5 55 | tensorflow-gpu==1.4.1 56 | tensorflow-tensorboard==0.4.0rc3 57 | termcolor==1.1.0 58 | thinc==6.10.2 59 | toolz==0.9.0 60 | tqdm==4.19.5 61 | traitlets==4.3.2 62 | ujson==1.35 63 | urllib3==1.22 64 | uWSGI==2.0.15 65 | wcwidth==0.1.7 66 | Werkzeug==0.13 67 | wrapt==1.10.11 68 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | # 
This flag says to generate wheels that support both Python 2 and Python 3 | # 3. If your code will not run unchanged on both Python 2 and 3, you will 4 | # need to generate separate wheels for each Python version that you 5 | # support. 6 | universal=1 7 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | from codecs import open 3 | from os import path 4 | from setuptools.command.install import install 5 | 6 | here = path.abspath(path.dirname(__file__)) 7 | 8 | with open(path.join(here, 'README.md'), encoding='utf-8') as f: 9 | long_desc = f.read() 10 | 11 | class DownloadCorpora(install): 12 | def run(self): 13 | install.run(self) 14 | import spacy 15 | import nltk 16 | nltk.download('wordnet') 17 | spacy.cli.download('en') 18 | 19 | class DownloadParaphraseModel(install): 20 | def run(self): 21 | install.run(self) 22 | from paraphraser.download_models import download_file_from_google_drive 23 | download_file_from_google_drive('19QDCd4UMgt3FtlYYwu0qZU3G1F9_XCvk', 24 | 'paraphrase-model.tar.gz') 25 | 26 | setup( 27 | name='paraphraser', 28 | version='0.1.0', 29 | description='Generate sentence paraphrases given an input sentence', 30 | long_description=long_desc, 31 | url='https://github.com/vsuthichai/paraphraser', 32 | author='Victor Suthichai', 33 | author_email='victor.suthichai@gmail.com', 34 | 35 | # https://pypi.python.org/pypi?%3Aaction=list_classifiers 36 | classifiers=[ 37 | 'Development Status :: 3 - Alpha', 38 | 'Intended Audience :: Developers', 39 | 'Programming Language :: Python :: 2.7', 40 | 'Programming Language :: Python :: 3', 41 | 'Programming Language :: Python :: 3.4', 42 | 'Programming Language :: Python :: 3.5', 43 | 'Programming Language :: Python :: 3.6' 44 | ], 45 | 46 | keywords=[ 47 | 'paraphraser' 48 | ], 49 | 50 | py_modules=['paraphraser.synonym_model', 'paraphraser.inference', 'paraphraser.download_models'], 51 | #packages=find_packages(exclude=['contrib', 'docs', 'tests']), 52 | #install_requires=['nltk', 'spacy', 'ipython'], 53 | install_requires=[], 54 | extras_require={ 55 | 56 | }, 57 | package_data={ 58 | 59 | }, 60 | data_files=[], 61 | entry_points={ 62 | }, 63 | cmdclass={ 64 | 'download_model': DownloadParaphraseModel 65 | #'download_corpora': DownloadCorpora 66 | } 67 | ) 68 | 69 | --------------------------------------------------------------------------------
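For quick reference, the following is a minimal usage sketch of the WordNet synonym paraphraser defined in paraphraser/synonym_model.py. It assumes the package has been installed and that the spaCy 'en' model and the NLTK 'wordnet' corpus have already been downloaded; keeping only the top five candidates is purely illustrative.

```
# Minimal sketch: score-ranked synonym paraphrases for one input sentence.
from paraphraser.synonym_model import synonym_paraphrase

# synonym_paraphrase returns a set of (sentence, similarity_score) tuples.
candidates = synonym_paraphrase('the rabbit quickly ran down the hole')

# Print the five candidates most similar to the original sentence.
for sentence, score in sorted(candidates, key=lambda pair: pair[1], reverse=True)[:5]:
    print('{:.3f}  {}'.format(score, sentence))
```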