├── pycocoevalcap ├── __init__.py ├── bleu │ ├── __init__.py │ ├── bleu.pyc │ ├── __init__.pyc │ ├── bleu_scorer.pyc │ ├── LICENSE │ ├── bleu.py │ └── bleu_scorer.py ├── cider │ ├── __init__.py │ ├── cider.pyc │ ├── __init__.pyc │ ├── cider_scorer.pyc │ ├── cider.py │ └── cider_scorer.py ├── meteor │ ├── __init__.py │ ├── __init__.pyc │ ├── meteor.pyc │ └── meteor.py ├── rouge │ ├── __init__.py │ ├── rouge.pyc │ ├── __init__.pyc │ └── rouge.py ├── tokenizer │ ├── __init__.py │ ├── __init__.pyc │ ├── ptbtokenizer.pyc │ └── ptbtokenizer.py ├── eval.pyc ├── __init__.pyc └── eval.py ├── requirements.txt ├── .DS_Store ├── README.md ├── char_preprocessing.py ├── denoise.py ├── rougescore.py ├── error_rate.py ├── demo.py ├── data_utils.py ├── utils.py ├── char_correction.py ├── auto_encoding_cnn_denoise.py ├── model.py └── semi_supervised.py /pycocoevalcap/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow==1.15.4 2 | scipy 3 | nltk -------------------------------------------------------------------------------- /pycocoevalcap/bleu/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /pycocoevalcap/cider/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /pycocoevalcap/meteor/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /pycocoevalcap/rouge/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'vrama91' 2 | -------------------------------------------------------------------------------- /pycocoevalcap/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'hfang' 2 | -------------------------------------------------------------------------------- /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dreasysnail/deconv_paragraph_represention/HEAD/.DS_Store -------------------------------------------------------------------------------- /pycocoevalcap/eval.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dreasysnail/deconv_paragraph_represention/HEAD/pycocoevalcap/eval.pyc -------------------------------------------------------------------------------- /pycocoevalcap/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dreasysnail/deconv_paragraph_represention/HEAD/pycocoevalcap/__init__.pyc -------------------------------------------------------------------------------- /pycocoevalcap/bleu/bleu.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dreasysnail/deconv_paragraph_represention/HEAD/pycocoevalcap/bleu/bleu.pyc 
-------------------------------------------------------------------------------- /pycocoevalcap/cider/cider.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dreasysnail/deconv_paragraph_represention/HEAD/pycocoevalcap/cider/cider.pyc -------------------------------------------------------------------------------- /pycocoevalcap/rouge/rouge.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dreasysnail/deconv_paragraph_represention/HEAD/pycocoevalcap/rouge/rouge.pyc -------------------------------------------------------------------------------- /pycocoevalcap/bleu/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dreasysnail/deconv_paragraph_represention/HEAD/pycocoevalcap/bleu/__init__.pyc -------------------------------------------------------------------------------- /pycocoevalcap/cider/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dreasysnail/deconv_paragraph_represention/HEAD/pycocoevalcap/cider/__init__.pyc -------------------------------------------------------------------------------- /pycocoevalcap/meteor/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dreasysnail/deconv_paragraph_represention/HEAD/pycocoevalcap/meteor/__init__.pyc -------------------------------------------------------------------------------- /pycocoevalcap/meteor/meteor.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dreasysnail/deconv_paragraph_represention/HEAD/pycocoevalcap/meteor/meteor.pyc -------------------------------------------------------------------------------- /pycocoevalcap/rouge/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dreasysnail/deconv_paragraph_represention/HEAD/pycocoevalcap/rouge/__init__.pyc -------------------------------------------------------------------------------- /pycocoevalcap/bleu/bleu_scorer.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dreasysnail/deconv_paragraph_represention/HEAD/pycocoevalcap/bleu/bleu_scorer.pyc -------------------------------------------------------------------------------- /pycocoevalcap/cider/cider_scorer.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dreasysnail/deconv_paragraph_represention/HEAD/pycocoevalcap/cider/cider_scorer.pyc -------------------------------------------------------------------------------- /pycocoevalcap/tokenizer/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dreasysnail/deconv_paragraph_represention/HEAD/pycocoevalcap/tokenizer/__init__.pyc -------------------------------------------------------------------------------- /pycocoevalcap/tokenizer/ptbtokenizer.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dreasysnail/deconv_paragraph_represention/HEAD/pycocoevalcap/tokenizer/ptbtokenizer.pyc -------------------------------------------------------------------------------- /pycocoevalcap/bleu/LICENSE: 
-------------------------------------------------------------------------------- 1 | Copyright (c) 2015 Xinlei Chen, Hao Fang, Tsung-Yi Lin, and Ramakrishna Vedantam 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /pycocoevalcap/bleu/bleu.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # File Name : bleu.py 4 | # 5 | # Description : Wrapper for BLEU scorer. 6 | # 7 | # Creation Date : 06-01-2015 8 | # Last Modified : Thu 19 Mar 2015 09:13:28 PM PDT 9 | # Authors : Hao Fang and Tsung-Yi Lin 10 | 11 | from bleu_scorer import BleuScorer 12 | 13 | 14 | class Bleu: 15 | def __init__(self, n=4): 16 | # default compute Blue score up to 4 17 | self._n = n 18 | self._hypo_for_image = {} 19 | self.ref_for_image = {} 20 | 21 | def compute_score(self, gts, res): 22 | 23 | assert(gts.keys() == res.keys()) 24 | imgIds = gts.keys() 25 | 26 | bleu_scorer = BleuScorer(n=self._n) 27 | for id in imgIds: 28 | hypo = res[id] 29 | ref = gts[id] 30 | 31 | # Sanity check. 
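# (res[id] must be a single-element list containing one hypothesis sentence; gts[id] a list of reference sentences)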
32 | assert(type(hypo) is list)
33 | assert(len(hypo) == 1)
34 | assert(type(ref) is list)
35 | #print(ref)
36 | #assert(len(ref) > 1)
37 |
38 | bleu_scorer += (hypo[0], ref)
39 |
40 | #score, scores = bleu_scorer.compute_score(option='shortest')
41 | score, scores = bleu_scorer.compute_score(option='closest', verbose=1)
42 | #score, scores = bleu_scorer.compute_score(option='average', verbose=1)
43 |
44 | # return (bleu, bleu_info)
45 | return score, scores
46 |
47 | def method(self):
48 | return "Bleu"
49 |
-------------------------------------------------------------------------------- /pycocoevalcap/cider/cider.py: --------------------------------------------------------------------------------
1 | # Filename: cider.py
2 | #
3 | # Description: Describes the class to compute the CIDEr (Consensus-Based Image Description Evaluation) Metric
4 | # by Vedantam, Zitnick, and Parikh (http://arxiv.org/abs/1411.5726)
5 | #
6 | # Creation Date: Sun Feb 8 14:16:54 2015
7 | #
8 | # Authors: Ramakrishna Vedantam and Tsung-Yi Lin
9 |
10 | from cider_scorer import CiderScorer
11 | import pdb
12 |
13 | class Cider:
14 | """
15 | Main Class to compute the CIDEr metric
16 |
17 | """
18 | def __init__(self, test=None, refs=None, n=4, sigma=6.0):
19 | # set cider to sum over 1 to 4-grams
20 | self._n = n
21 | # set the standard deviation parameter for gaussian penalty
22 | self._sigma = sigma
23 |
24 | def compute_score(self, gts, res):
25 | """
26 | Main function to compute CIDEr score
27 | :param hypo_for_image (dict) : dictionary with image ids as keys and a single-element list holding the tokenized candidate sentence as values
28 | ref_for_image (dict) : dictionary with image ids as keys and the list of tokenized reference sentences as values
29 | :return: cider (float) : computed CIDEr score for the corpus
30 | """
31 |
32 | assert(gts.keys() == res.keys())
33 | imgIds = gts.keys()
34 |
35 | cider_scorer = CiderScorer(n=self._n, sigma=self._sigma)
36 |
37 | for id in imgIds:
38 | hypo = res[id]
39 | ref = gts[id]
40 |
41 | # Sanity check.
42 | assert(type(hypo) is list)
43 | assert(len(hypo) == 1)
44 | assert(type(ref) is list)
45 | assert(len(ref) > 0)
46 |
47 | cider_scorer += (hypo[0], ref)
48 |
49 | (score, scores) = cider_scorer.compute_score()
50 |
51 | return score, scores
52 |
53 | def method(self):
54 | return "CIDEr"
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # Deconvolutional Paragraph Representation Learning
2 |
3 | Implementations of the models in the paper "Deconvolutional Paragraph Representation Learning" by Yizhe Zhang, Dinghan Shen, Guoyin Wang, Zhe Gan, Ricardo Henao and Lawrence Carin, NIPS 2017.
4 |
5 | ## Prerequisites:
6 | * CUDA, cuDNN
7 | * Tensorflow (version > 1.0). We used tensorflow 1.2; `requirements.txt` pins tensorflow==1.15.4.
8 | Run `pip install -r requirements.txt` to install the requirements.
9 |
10 |
11 | ## Run
12 | * Run `python demo.py` for the reconstruction task
13 | * Run `python char_correction.py` for the character-level correction task
14 | * Run `python semi_supervised.py` for the semi-supervised classification task
15 | * Options: set options by editing the `Options` class in `demo.py` (a sketch of the commonly tuned fields is given below).
16 |
17 | - `opt.n_hidden`: number of hidden units.
18 | - `opt.layer`: number of CNN/DCNN layers (2, 3 or 4).
19 | - `opt.lr`: learning rate.
20 | - `opt.batch_size`: batch size.
21 |
22 | * Training the reconstruction task roughly takes 6-7 hours (around 10-20 epochs) to converge on a K80 GPU machine.
23 | * See `output.txt` for a sample of the screen output for the reconstruction task.
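For example, a minimal sketch of the fields one would typically edit (only a subset of the full `Options` class; the values shown are the defaults in `demo.py`):

```python
class Options(object):
    def __init__(self):
        # commonly tuned hyper-parameters (see demo.py for the complete class)
        self.layer = 3            # number of CNN/DCNN layers: 2, 3 or 4
        self.lr = 1e-5            # learning rate
        self.batch_size = 32      # minibatch size
        self.max_epochs = 100     # maximum number of training epochs
        self.substitution = 's'   # input noise: (d)eletion, (a) insertion, (s)ubstitution, (p)ermutation
        self.permutation = 0      # how many positions the noise touches (see denoise.py)
```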
24 |
25 | ## Data:
26 | * Download from:
27 | * Reconstruction: [Hotel review (1.52GB)](https://drive.google.com/file/d/0B52eYWrYWqIpQzhBNkVxaV9mMjQ/view)
28 | * Char-level correction: [Yahoo! review (character-level, 451MB)](https://drive.google.com/open?id=1kBIAWyi3kvcMme-_1q4OU881yWH_j3ki)
29 | * Semi-supervised classification: [Yelp review (629MB)](https://drive.google.com/open?id=1qKos_wB45MzMu7Sn8RdvE6SRVAKCTC6e)
30 |
31 |
32 | ## Citation
33 | Please cite our paper if it helps with your research:
34 | * Arxiv link: [https://arxiv.org/abs/1708.04729](https://arxiv.org/abs/1708.04729)
35 | ```latex
36 | @inproceedings{zhang2017deconvolutional,
37 | title={Deconvolutional Paragraph Representation Learning},
38 | author={Zhang, Yizhe and Shen, Dinghan and Wang, Guoyin and Gan, Zhe and Henao, Ricardo and Carin, Lawrence},
39 | booktitle={NIPS},
40 | year={2017}
41 | }
42 | ```
43 | For any questions or suggestions, feel free to contact yizhe.zhang@microsoft.com
44 |
-------------------------------------------------------------------------------- /char_preprocessing.py: --------------------------------------------------------------------------------
1 | import cPickle
2 | import pdb
3 | import numpy as np
4 | def idx2sent(text, alphabet):
5 | char_seq = []
6 | # print(text)
7 | for it in text:
8 | it_list = list(it)
9 | # padded = pad_sentence(it_list)
10 | # text_int8_repr = string_to_int8_conversion(padded, alphabet)
11 | text_int8_repr = string_to_int8_conversion(it_list, alphabet)
12 | char_seq.append(text_int8_repr)
13 | return char_seq
14 |
15 |
16 | def pad_sentence(char_seq, padding_char=" ", char_seq_length=301):
17 | # char_seq_length = 1014
18 |
19 | num_padding = char_seq_length - len(char_seq)
20 |
21 | new_char_seq = char_seq + [padding_char] * num_padding
22 | return new_char_seq
23 |
24 |
25 | def string_to_int8_conversion(char_seq, alphabet):
26 | x = [alphabet.find(char) + 2 for char in char_seq]
27 | # x = np.array([alphabet.find(char) for char in char_seq], dtype=np.int8)
28 | return x
29 |
30 | def prepare_data_for_charCNN(loadpath = "./data/yahoo4char.p"):
31 |
32 | x = cPickle.load(open(loadpath,"rb"))
33 |
34 | train, val, test = x[0], x[1], x[2]
35 | train_text, val_text, test_text = x[3], x[4], x[5]
36 | train_lab, val_lab, test_lab = x[6], x[7], x[8]
37 | wordtoix, ixtoword = x[9], x[10]
38 |
39 | # alphabet = "abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}\n"
40 |
41 | alphabet = "abcdefghijklmnopqrstuvwxyz0,.!?()"
42 |
43 |
44 |
45 | train_char = idx2sent(train_text, alphabet)
46 | val_char = idx2sent(val_text, alphabet)
47 | test_char = idx2sent(test_text, alphabet)
48 | chartoix = { c: i + 2 for i, c in enumerate(alphabet)} # alphabet indices start at 2 (1 is reserved for space, 0 for the padding character)
49 | chartoix[' '] = 1
50 | ixtochar = { i+2:c for i, c in enumerate(alphabet)}
51 | ixtochar[1] = ' '
52 |
53 |
54 | # add padding character
55 | chartoix['N'] = 0
56 | ixtochar[0] = 'N'
57 |
58 | with open('./data/yahoo_char.p', 'w+') as f:
59 | cPickle.dump([train_char, val_char, test_char, train_text, val_text, test_text, train_lab, val_lab, test_lab, chartoix, ixtochar, alphabet, ], f)
60 |
61 | if __name__ == '__main__':
62 |
63 | prepare_data_for_charCNN()
64 |
-------------------------------------------------------------------------------- /pycocoevalcap/eval.py: --------------------------------------------------------------------------------
1 | __author__ = 'tylin'
2 | from tokenizer.ptbtokenizer import PTBTokenizer
3 | from bleu.bleu import Bleu
4 | from meteor.meteor import Meteor
5 | from rouge.rouge import Rouge 6 | from cider.cider import Cider 7 | 8 | class COCOEvalCap: 9 | def __init__(self, coco, cocoRes): 10 | self.evalImgs = [] 11 | self.eval = {} 12 | self.imgToEval = {} 13 | self.coco = coco 14 | self.cocoRes = cocoRes 15 | self.params = {'image_id': coco.getImgIds()} 16 | 17 | def evaluate(self): 18 | imgIds = self.params['image_id'] 19 | # imgIds = self.coco.getImgIds() 20 | gts = {} 21 | res = {} 22 | for imgId in imgIds: 23 | gts[imgId] = self.coco.imgToAnns[imgId] 24 | res[imgId] = self.cocoRes.imgToAnns[imgId] 25 | 26 | # ================================================= 27 | # Set up scorers 28 | # ================================================= 29 | print 'tokenization...' 30 | tokenizer = PTBTokenizer() 31 | gts = tokenizer.tokenize(gts) 32 | res = tokenizer.tokenize(res) 33 | 34 | # ================================================= 35 | # Set up scorers 36 | # ================================================= 37 | print 'setting up scorers...' 38 | scorers = [ 39 | (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]), 40 | (Meteor(),"METEOR"), 41 | (Rouge(), "ROUGE_L"), 42 | (Cider(), "CIDEr") 43 | ] 44 | 45 | # ================================================= 46 | # Compute scores 47 | # ================================================= 48 | for scorer, method in scorers: 49 | print 'computing %s score...'%(scorer.method()) 50 | score, scores = scorer.compute_score(gts, res) 51 | if type(method) == list: 52 | for sc, scs, m in zip(score, scores, method): 53 | self.setEval(sc, m) 54 | self.setImgToEvalImgs(scs, gts.keys(), m) 55 | print "%s: %0.3f"%(m, sc) 56 | else: 57 | self.setEval(score, method) 58 | self.setImgToEvalImgs(scores, gts.keys(), method) 59 | print "%s: %0.3f"%(method, score) 60 | self.setEvalImgs() 61 | 62 | def setEval(self, score, method): 63 | self.eval[method] = score 64 | 65 | def setImgToEvalImgs(self, scores, imgIds, method): 66 | for imgId, score in zip(imgIds, scores): 67 | if not imgId in self.imgToEval: 68 | self.imgToEval[imgId] = {} 69 | self.imgToEval[imgId]["image_id"] = imgId 70 | self.imgToEval[imgId][method] = score 71 | 72 | def setEvalImgs(self): 73 | self.evalImgs = [eval for imgId, eval in self.imgToEval.items()] -------------------------------------------------------------------------------- /pycocoevalcap/tokenizer/ptbtokenizer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # File Name : ptbtokenizer.py 4 | # 5 | # Description : Do the PTB Tokenization and remove punctuations. 
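# Note : expects stanford-corenlp-3.4.1.jar (not included in the repository listing) next to this script; see STANFORD_CORENLP_3_4_1_JAR below.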
6 | # 7 | # Creation Date : 29-12-2014 8 | # Last Modified : Thu Mar 19 09:53:35 2015 9 | # Authors : Hao Fang and Tsung-Yi Lin 10 | 11 | import os 12 | import sys 13 | import subprocess 14 | import tempfile 15 | import itertools 16 | 17 | # path to the stanford corenlp jar 18 | STANFORD_CORENLP_3_4_1_JAR = 'stanford-corenlp-3.4.1.jar' 19 | 20 | # punctuations to be removed from the sentences 21 | PUNCTUATIONS = ["''", "'", "``", "`", "-LRB-", "-RRB-", "-LCB-", "-RCB-", \ 22 | ".", "?", "!", ",", ":", "-", "--", "...", ";"] 23 | 24 | class PTBTokenizer: 25 | """Python wrapper of Stanford PTBTokenizer""" 26 | 27 | def tokenize(self, captions_for_image): 28 | cmd = ['java', '-cp', STANFORD_CORENLP_3_4_1_JAR, \ 29 | 'edu.stanford.nlp.process.PTBTokenizer', \ 30 | '-preserveLines', '-lowerCase'] 31 | 32 | # ====================================================== 33 | # prepare data for PTB Tokenizer 34 | # ====================================================== 35 | final_tokenized_captions_for_image = {} 36 | image_id = [k for k, v in captions_for_image.items() for _ in range(len(v))] 37 | sentences = '\n'.join([c['caption'].replace('\n', ' ') for k, v in captions_for_image.items() for c in v]) 38 | 39 | # ====================================================== 40 | # save sentences to temporary file 41 | # ====================================================== 42 | path_to_jar_dirname=os.path.dirname(os.path.abspath(__file__)) 43 | tmp_file = tempfile.NamedTemporaryFile(delete=False, dir=path_to_jar_dirname) 44 | tmp_file.write(sentences) 45 | tmp_file.close() 46 | 47 | # ====================================================== 48 | # tokenize sentence 49 | # ====================================================== 50 | cmd.append(os.path.basename(tmp_file.name)) 51 | p_tokenizer = subprocess.Popen(cmd, cwd=path_to_jar_dirname, \ 52 | stdout=subprocess.PIPE) 53 | token_lines = p_tokenizer.communicate(input=sentences.rstrip())[0] 54 | lines = token_lines.split('\n') 55 | # remove temp file 56 | os.remove(tmp_file.name) 57 | 58 | # ====================================================== 59 | # create dictionary for tokenized captions 60 | # ====================================================== 61 | for k, line in zip(image_id, lines): 62 | if not k in final_tokenized_captions_for_image: 63 | final_tokenized_captions_for_image[k] = [] 64 | tokenized_caption = ' '.join([w for w in line.rstrip().split(' ') \ 65 | if w not in PUNCTUATIONS]) 66 | final_tokenized_captions_for_image[k].append(tokenized_caption) 67 | 68 | return final_tokenized_captions_for_image 69 | -------------------------------------------------------------------------------- /pycocoevalcap/meteor/meteor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Python wrapper for METEOR implementation, by Xinlei Chen 4 | # Acknowledge Michael Denkowski for the generous discussion and help 5 | 6 | import os 7 | import sys 8 | import subprocess 9 | import threading 10 | 11 | # Assumes meteor-1.5.jar is in the same directory as meteor.py. Change as needed. 
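# (METEOR_JAR is resolved relative to this module's directory via the cwd= argument of subprocess.Popen below)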
12 | METEOR_JAR = 'meteor-1.5.jar' 13 | # print METEOR_JAR 14 | 15 | class Meteor: 16 | 17 | def __init__(self): 18 | self.meteor_cmd = ['java', '-jar', '-Xmx2G', METEOR_JAR, \ 19 | '-', '-', '-stdio', '-l', 'en', '-norm'] 20 | self.meteor_p = subprocess.Popen(self.meteor_cmd, \ 21 | cwd=os.path.dirname(os.path.abspath(__file__)), \ 22 | stdin=subprocess.PIPE, \ 23 | stdout=subprocess.PIPE, \ 24 | stderr=subprocess.PIPE) 25 | # Used to guarantee thread safety 26 | self.lock = threading.Lock() 27 | 28 | def compute_score(self, gts, res): 29 | assert(gts.keys() == res.keys()) 30 | imgIds = gts.keys() 31 | scores = [] 32 | 33 | eval_line = 'EVAL' 34 | self.lock.acquire() 35 | for i in imgIds: 36 | assert(len(res[i]) == 1) 37 | stat = self._stat(res[i][0], gts[i]) 38 | eval_line += ' ||| {}'.format(stat) 39 | 40 | self.meteor_p.stdin.write('{}\n'.format(eval_line)) 41 | for i in range(0,len(imgIds)): 42 | scores.append(float(self.meteor_p.stdout.readline().strip())) 43 | score = float(self.meteor_p.stdout.readline().strip()) 44 | self.lock.release() 45 | 46 | return score, scores 47 | 48 | def method(self): 49 | return "METEOR" 50 | 51 | def _stat(self, hypothesis_str, reference_list): 52 | # SCORE ||| reference 1 words ||| reference n words ||| hypothesis words 53 | hypothesis_str = hypothesis_str.replace('|||','').replace(' ',' ') 54 | score_line = ' ||| '.join(('SCORE', ' ||| '.join(reference_list), hypothesis_str)) 55 | self.meteor_p.stdin.write('{}\n'.format(score_line)) 56 | return self.meteor_p.stdout.readline().strip() 57 | 58 | def _score(self, hypothesis_str, reference_list): 59 | self.lock.acquire() 60 | # SCORE ||| reference 1 words ||| reference n words ||| hypothesis words 61 | hypothesis_str = hypothesis_str.replace('|||','').replace(' ',' ') 62 | score_line = ' ||| '.join(('SCORE', ' ||| '.join(reference_list), hypothesis_str)) 63 | self.meteor_p.stdin.write('{}\n'.format(score_line)) 64 | stats = self.meteor_p.stdout.readline().strip() 65 | eval_line = 'EVAL ||| {}'.format(stats) 66 | # EVAL ||| stats 67 | self.meteor_p.stdin.write('{}\n'.format(eval_line)) 68 | score = float(self.meteor_p.stdout.readline().strip()) 69 | # bug fix: there are two values returned by the jar file, one average, and one all, so do it twice 70 | # thanks for Andrej for pointing this out 71 | score = float(self.meteor_p.stdout.readline().strip()) 72 | self.lock.release() 73 | return score 74 | 75 | def __exit__(self): 76 | self.lock.acquire() 77 | self.meteor_p.stdin.close() 78 | self.meteor_p.kill() 79 | self.meteor_p.wait() 80 | self.lock.release() 81 | -------------------------------------------------------------------------------- /denoise.py: -------------------------------------------------------------------------------- 1 | """ 2 | Yizhe Zhang 3 | 4 | Perturbation to the input 5 | """ 6 | import numpy as np 7 | import os 8 | import scipy.io as sio 9 | from math import floor 10 | import pdb 11 | 12 | def add_noise(sents, opt): 13 | if opt.substitution == 's': 14 | sents_permutated= substitute_sent(sents, opt) 15 | elif opt.substitution == 'p': 16 | sents_permutated= permutate_sent(sents, opt) 17 | elif opt.substitution == 'a': 18 | sents_permutated= add_sent(sents, opt) 19 | elif opt.substitution == 'd': 20 | sents_permutated= delete_sent(sents, opt) 21 | elif opt.substitution == 'm': 22 | sents_permutated= mixed_noise_sent(sents, opt) 23 | elif opt.substitution == 'sc': 24 | sents_permutated = substitute_sent_char(sents, opt) 25 | else: 26 | sents_permutated= sents 27 | 28 | return 
sents_permutated 29 | 30 | 31 | def permutate_sent(sents, opt): 32 | sents_p = [] 33 | for ss in range(len(sents)): 34 | sent_temp = sents[ss][:] 35 | if len(sent_temp) <= 1: 36 | sents_p.append(sent_temp) 37 | continue 38 | idx_s= np.random.choice(len(sent_temp)-1, size=opt.permutation, replace=True) 39 | temp = sent_temp[idx_s[0]] 40 | for ii in range(opt.permutation-1): 41 | sent_temp[idx_s[ii]] = sent_temp[idx_s[ii+1]] 42 | sent_temp[idx_s[opt.permutation-1]] = temp 43 | sents_p.append(sent_temp) 44 | return sents_p 45 | 46 | 47 | def substitute_sent(sents, opt): 48 | # substitute single word 49 | sents_p = [] 50 | for ss in range(len(sents)): 51 | sent_temp = sents[ss][:] 52 | if len(sent_temp) <= 1: 53 | sents_p.append(sent_temp) 54 | continue 55 | idx_s= np.random.choice(len(sent_temp)-1, size=opt.permutation, replace=True) 56 | for ii in range(opt.permutation): 57 | sent_temp[idx_s[ii]] = np.random.choice(opt.n_words) 58 | sents_p.append(sent_temp) 59 | return sents_p 60 | 61 | def delete_sent(sents, opt): 62 | # substitute single word 63 | sents_p = [] 64 | for ss in range(len(sents)): 65 | sent_temp = sents[ss][:] 66 | if len(sent_temp) <= 1: 67 | sents_p.append(sent_temp) 68 | continue 69 | idx_s= np.random.choice(len(sent_temp)-1, size=opt.permutation, replace=True) 70 | for ii in range(opt.permutation): 71 | sent_temp[idx_s[ii]] = -1 72 | sents_p.append([s for s in sent_temp if s!=-1]) 73 | return sents_p 74 | 75 | def add_sent(sents, opt): 76 | # substitute single word 77 | sents_p = [] 78 | for ss in range(len(sents)): 79 | sent_temp = sents[ss][:] 80 | if len(sent_temp) <= 1: 81 | sents_p.append(sent_temp) 82 | continue 83 | idx_s= np.random.choice(len(sent_temp)-1, size=opt.permutation, replace=True) 84 | for ii in range(opt.permutation): 85 | sent_temp.insert(idx_s[ii], np.random.choice(opt.n_words)) 86 | sents_p.append(sent_temp[:opt.maxlen]) 87 | return sents_p 88 | 89 | 90 | def mixed_noise_sent(sents, opt): 91 | sents = delete_sent(sents, opt) 92 | sents = add_sent(sents, opt) 93 | sents = substitute_sent(sents, opt) 94 | return sents 95 | 96 | def substitute_sent_char(sents, opt): 97 | # substitute single word 98 | sents_p = [] 99 | for ss in range(len(sents)): 100 | sent_temp = sents[ss][:] 101 | if len(sent_temp) <= 1: 102 | sents_p.append(sent_temp) 103 | continue 104 | permute_choice = [ic for ic in range(len(sent_temp)) if sent_temp[ic] != 1] 105 | idx_s= np.random.choice(permute_choice, size=int(opt.permutation * (len(permute_choice))), replace=True) 106 | 107 | for ii in range(len(idx_s)): 108 | sent_temp[idx_s[ii]] = np.random.choice(list(range(2,28))) 109 | sents_p.append(sent_temp) 110 | return sents_p -------------------------------------------------------------------------------- /pycocoevalcap/rouge/rouge.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # File Name : rouge.py 4 | # 5 | # Description : Computes ROUGE-L metric as described by Lin and Hovey (2004) 6 | # 7 | # Creation Date : 2015-01-07 06:03 8 | # Author : Ramakrishna Vedantam 9 | 10 | import numpy as np 11 | import pdb 12 | 13 | def my_lcs(string, sub): 14 | """ 15 | Calculates longest common subsequence for a pair of tokenized strings 16 | :param string : list of str : tokens from a string split using whitespace 17 | :param sub : list of str : shorter string, also split using whitespace 18 | :returns: length (list of int): length of the longest common subsequence between the two strings 19 | 20 | Note: my_lcs only 
gives length of the longest common subsequence, not the actual LCS 21 | """ 22 | if(len(string)< len(sub)): 23 | sub, string = string, sub 24 | 25 | lengths = [[0 for i in range(0,len(sub)+1)] for j in range(0,len(string)+1)] 26 | 27 | for j in range(1,len(sub)+1): 28 | for i in range(1,len(string)+1): 29 | if(string[i-1] == sub[j-1]): 30 | lengths[i][j] = lengths[i-1][j-1] + 1 31 | else: 32 | lengths[i][j] = max(lengths[i-1][j] , lengths[i][j-1]) 33 | 34 | return lengths[len(string)][len(sub)] 35 | 36 | class Rouge(): 37 | ''' 38 | Class for computing ROUGE-L score for a set of candidate sentences for the MS COCO test set 39 | 40 | ''' 41 | def __init__(self): 42 | # vrama91: updated the value below based on discussion with Hovey 43 | self.beta = 1.2 44 | 45 | def calc_score(self, candidate, refs): 46 | """ 47 | Compute ROUGE-L score given one candidate and references for an image 48 | :param candidate: str : candidate sentence to be evaluated 49 | :param refs: list of str : COCO reference sentences for the particular image to be evaluated 50 | :returns score: int (ROUGE-L score for the candidate evaluated against references) 51 | """ 52 | assert(len(candidate)==1) 53 | assert(len(refs)>0) 54 | prec = [] 55 | rec = [] 56 | 57 | # split into tokens 58 | token_c = candidate[0].split(" ") 59 | 60 | for reference in refs: 61 | # split into tokens 62 | token_r = reference.split(" ") 63 | # compute the longest common subsequence 64 | lcs = my_lcs(token_r, token_c) 65 | prec.append(lcs/float(len(token_c))) 66 | rec.append(lcs/float(len(token_r))) 67 | 68 | prec_max = max(prec) 69 | rec_max = max(rec) 70 | 71 | if(prec_max!=0 and rec_max !=0): 72 | score = ((1 + self.beta**2)*prec_max*rec_max)/float(rec_max + self.beta**2*prec_max) 73 | else: 74 | score = 0.0 75 | return score 76 | 77 | def compute_score(self, gts, res): 78 | """ 79 | Computes Rouge-L score given a set of reference and candidate sentences for the dataset 80 | Invoked by evaluate_captions.py 81 | :param hypo_for_image: dict : candidate / test sentences with "image name" key and "tokenized sentences" as values 82 | :param ref_for_image: dict : reference MS-COCO sentences with "image name" key and "tokenized sentences" as values 83 | :returns: average_score: float (mean ROUGE-L score computed by averaging scores for all the images) 84 | """ 85 | assert(gts.keys() == res.keys()) 86 | imgIds = gts.keys() 87 | 88 | score = [] 89 | for id in imgIds: 90 | hypo = res[id] 91 | ref = gts[id] 92 | 93 | score.append(self.calc_score(hypo, ref)) 94 | 95 | # Sanity check. 
96 | assert(type(hypo) is list) 97 | assert(len(hypo) == 1) 98 | assert(type(ref) is list) 99 | assert(len(ref) > 0) 100 | 101 | average_score = np.mean(np.array(score)) 102 | return average_score, np.array(score) 103 | 104 | def method(self): 105 | return "Rouge" 106 | -------------------------------------------------------------------------------- /rougescore.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import collections 3 | 4 | import six 5 | 6 | def _ngrams(words, n): 7 | queue = collections.deque(maxlen=n) 8 | for w in words: 9 | queue.append(w) 10 | if len(queue) == n: 11 | yield tuple(queue) 12 | 13 | def _ngram_counts(words, n): 14 | return collections.Counter(_ngrams(words, n)) 15 | 16 | def _ngram_count(words, n): 17 | return max(len(words) - n + 1, 0) 18 | 19 | def _counter_overlap(counter1, counter2): 20 | result = 0 21 | for k, v in six.iteritems(counter1): 22 | result += min(v, counter2[k]) 23 | return result 24 | 25 | def _safe_divide(numerator, denominator): 26 | if denominator > 0: 27 | return numerator / denominator 28 | else: 29 | return 0 30 | 31 | def _safe_f1(matches, recall_total, precision_total, alpha): 32 | recall_score = _safe_divide(matches, recall_total) 33 | precision_score = _safe_divide(matches, precision_total) 34 | denom = (1.0 - alpha) * precision_score + alpha * recall_score 35 | if denom > 0.0: 36 | return (precision_score * recall_score) / denom 37 | else: 38 | return 0.0 39 | 40 | def rouge_n(peer, models, n, alpha): 41 | """ 42 | Compute the ROUGE-N score of a peer with respect to one or more models, for 43 | a given value of `n`. 44 | """ 45 | matches = 0 46 | recall_total = 0 47 | peer_counter = _ngram_counts(peer, n) 48 | for model in models: 49 | model_counter = _ngram_counts(model, n) 50 | matches += _counter_overlap(peer_counter, model_counter) 51 | recall_total += _ngram_count(model, n) 52 | precision_total = len(models) * _ngram_count(peer, n) 53 | return _safe_f1(matches, recall_total, precision_total, alpha) 54 | 55 | def rouge_1(peer, models, alpha): 56 | """ 57 | Compute the ROUGE-1 (unigram) score of a peer with respect to one or more 58 | models. 59 | """ 60 | return rouge_n(peer, models, 1, alpha) 61 | 62 | def rouge_2(peer, models, alpha): 63 | """ 64 | Compute the ROUGE-2 (bigram) score of a peer with respect to one or more 65 | models. 66 | """ 67 | return rouge_n(peer, models, 2, alpha) 68 | 69 | def rouge_3(peer, models, alpha): 70 | """ 71 | Compute the ROUGE-3 (trigram) score of a peer with respect to one or more 72 | models. 73 | """ 74 | return rouge_n(peer, models, 3, alpha) 75 | 76 | def lcs(a, b): 77 | """ 78 | Compute the length of the longest common subsequence between two sequences. 79 | 80 | Time complexity: O(len(a) * len(b)) 81 | Space complexity: O(min(len(a), len(b))) 82 | """ 83 | # This is an adaptation of the standard LCS dynamic programming algorithm 84 | # tweaked for lower memory consumption. 85 | # Sequence a is laid out along the rows, b along the columns. 
86 | # Minimize number of columns to minimize required memory
87 | if len(a) < len(b):
88 | a, b = b, a
89 | # Sequence b now has the minimum length
90 | # Quit early if one sequence is empty
91 | if len(b) == 0:
92 | return 0
93 | # Use a single buffer to store the counts for the current row, and
94 | # overwrite it on each pass
95 | row = [0] * len(b)
96 | for ai in a:
97 | left = 0
98 | diag = 0
99 | for j, bj in enumerate(b):
100 | up = row[j]
101 | if ai == bj:
102 | value = diag + 1
103 | else:
104 | value = max(left, up)
105 | row[j] = value
106 | left = value
107 | diag = up
108 | # Return the last cell of the last row
109 | return left
110 |
111 | def rouge_l(peer, models, alpha):
112 | """
113 | Compute the ROUGE-L score of a peer with respect to one or more models.
114 | """
115 | matches = 0
116 | recall_total = 0
117 | for model in models:
118 | matches += lcs(model, peer)
119 | recall_total += len(model)
120 | precision_total = len(models) * len(peer)
121 | return _safe_f1(matches, recall_total, precision_total, alpha)
122 |
-------------------------------------------------------------------------------- /error_rate.py: --------------------------------------------------------------------------------
1 | import math
2 | import numpy
3 | from collections import defaultdict
4 |
5 | COPY = 0
6 | INSERTION = 1
7 | DELETION = 2
8 | SUBSTITUTION = 3
9 |
10 | INFINITY = 10 ** 9
11 |
12 |
13 | def _edit_distance_matrix(y, y_hat, special_tokens=None):
14 | """Returns the matrix of edit distances.
15 | Parameters
16 | ----------
17 | y, y_hat : sequences
18 | The groundtruth and the recognition candidate.
19 | special_tokens : set
20 | Tokens for which insertion and deletion are free.
21 | Returns
22 | -------
23 | dist : numpy.ndarray
24 | dist[i, j] is the edit distance between the first
25 | i characters of y and the first j characters of y_hat.
26 | (Note: only `dist` is returned; no `action` matrix is computed here.)
27 | """ 28 | if not special_tokens: 29 | special_tokens = set() 30 | dist = numpy.zeros((len(y) + 1, len(y_hat) + 1), dtype='int64') 31 | insertion_cost = numpy.ones(len(y)) 32 | deletion_cost = numpy.ones(len(y_hat)) 33 | for i in range(len(y)): 34 | if y[i] in special_tokens: 35 | insertion_cost[i] = 0 36 | for j in range(len(y_hat)): 37 | if y_hat[j] in special_tokens: 38 | deletion_cost[j] = 0 39 | dist[1:, 0] = insertion_cost.cumsum() 40 | dist[0, 1:] = deletion_cost.cumsum() 41 | 42 | for i in xrange(1, len(y) + 1): 43 | for j in xrange(1, len(y_hat) + 1): 44 | if y[i - 1] != y_hat[j - 1]: 45 | cost = 1 46 | else: 47 | cost = 0 48 | insertion_dist = dist[i - 1][j] + insertion_cost[i - 1] 49 | deletion_dist = dist[i][j - 1] + deletion_cost[j - 1] 50 | substitution_dist = dist[i - 1][j - 1] + 1 if cost else INFINITY 51 | copy_dist = dist[i - 1][j - 1] if not cost else INFINITY 52 | best = min(insertion_dist, deletion_dist, 53 | substitution_dist, copy_dist) 54 | 55 | dist[i][j] = best 56 | 57 | return dist 58 | 59 | 60 | def _bleu(y, y_hat, n=4): 61 | """ BLEU score between the reference sequence y 62 | and y_hat for each partial sequence ranging 63 | from the first input token to the last 64 | Parameters 65 | ---------- 66 | y : vector 67 | The reference matrix with dimensions of number 68 | of words (rows) by batch size (columns) 69 | y_hat : vector 70 | The predicted matrix with same dimensions 71 | n : integer 72 | highest n-gram order in the Bleu sense 73 | (e.g Bleu-4) 74 | Returns 75 | ------- 76 | results : vector (len y_hat) 77 | Bleu scores for each partial sequence 78 | y_hat_1..T from T = 1 to len(y_hat) 79 | """ 80 | bleu_scores = numpy.zeros((len(y_hat), n)) 81 | 82 | # count reference ngrams 83 | ref_counts = defaultdict(int) 84 | for k in xrange(1, n+1): 85 | for i in xrange(len(y) - k + 1): 86 | ref_counts[tuple(y[i:i + k])] += 1 87 | 88 | # for each partial sequence, 1) compute addition to # of correct 89 | # 2) apply brevity penalty 90 | # ngrams, magic stability numbers from pycocoeval 91 | ref_len = len(y) 92 | pred_counts = defaultdict(int) 93 | correct = numpy.zeros(4) 94 | for i in xrange(1, len(y_hat) + 1): 95 | for k in xrange(i, max(-1, i - n), -1): 96 | # print i, k 97 | ngram = tuple(y_hat[k-1:i]) 98 | # UNK token hack. Must work for both indices 99 | # and words. Very ugly, I know. 100 | if 0 in ngram or 'UNK' in ngram: 101 | continue 102 | pred_counts[ngram] += 1 103 | if pred_counts[ngram] <= ref_counts.get(ngram, 0): 104 | correct[len(ngram)-1] += 1 105 | 106 | # compute partial bleu score 107 | bleu = 1. 108 | for j in xrange(n): 109 | possible = max(0, i - j) 110 | bleu *= float(correct[j] + 1.) / (possible + 1.) 111 | bleu_scores[i - 1, j] = bleu ** (1./(j+1)) 112 | 113 | # brevity penalty 114 | if i < ref_len: 115 | ratio = (i + 1e-15)/(ref_len + 1e-9) 116 | bleu_scores[i - 1, :] *= math.exp(1 - 1/ratio) 117 | 118 | return bleu_scores.astype('float32'), correct, pred_counts, ref_counts 119 | 120 | 121 | def edit_distance(y, y_hat): 122 | """Edit distance between two sequences. 123 | Parameters 124 | ---------- 125 | y : str 126 | The groundtruth. 127 | y_hat : str 128 | The recognition candidate. 129 | the minimum number of symbol edits (i.e. insertions, 130 | deletions or substitutions) required to change one 131 | word into the other. 
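For example, with unit costs edit_distance('kitten', 'sitting') returns 3.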
132 | """ 133 | return _edit_distance_matrix(y, y_hat)[-1, -1] 134 | 135 | 136 | def wer(y, y_hat): 137 | return edit_distance(y, y_hat) / float(len(y)) 138 | 139 | def cal_cer(y_total, y_hat_total): 140 | error = 0 141 | length = 0 142 | for y, y_hat in zip(y_total, y_hat_total): 143 | y_len = len(''.join(y.split())) 144 | error += min(y_len/float((len(y))), wer(y, y_hat) )* len(y) 145 | length += y_len 146 | return error/length 147 | 148 | 149 | 150 | 151 | 152 | def reward_matrix(y, y_hat, alphabet, eos_label): 153 | dist, _, = _edit_distance_matrix(y, y_hat) 154 | y_alphabet_indices = [alphabet.index(c) for c in y] 155 | if y_alphabet_indices[-1] != eos_label: 156 | raise ValueError("Last character of the groundtruth must be EOS") 157 | 158 | # Optimistic edit distance for every y_hat prefix 159 | optim_dist = dist.min(axis=0) 160 | pess_reward = -optim_dist 161 | 162 | # Optimistic edit distance for every y_hat prefix plus a character 163 | optim_dist_char = numpy.tile( 164 | optim_dist[:, None], [1, len(alphabet)]) + 1 165 | pess_char_reward = numpy.tile( 166 | pess_reward[:, None], [1, len(alphabet)]) - 1 167 | for i in range(len(y)): 168 | for j in range(len(y_hat) + 1): 169 | c = y_alphabet_indices[i] 170 | cand_dist = dist[i, j] 171 | if cand_dist < optim_dist_char[j, c]: 172 | optim_dist_char[j, c] = cand_dist 173 | pess_char_reward[j, c] = -cand_dist 174 | for j in range(len(y_hat) + 1): 175 | # Here we rely on y[-1] being eos_label 176 | pess_char_reward[j, eos_label] = -dist[len(y) - 1, j] 177 | return pess_char_reward 178 | 179 | def gain_matrix(y, y_hat, alphabet=None, given_reward_matrix=None, 180 | eos_label=None): 181 | y_hat_indices = [alphabet.index(c) for c in y_hat] 182 | reward = (given_reward_matrix.copy() if given_reward_matrix is not None 183 | else reward_matrix(y, y_hat, alphabet, eos_label)) 184 | reward[1:] -= reward[:-1][numpy.arange(len(y_hat)), y_hat_indices][:, None] 185 | return reward 186 | 187 | 188 | def prepare_for_cer(sentence, ixtoword): 189 | sent=[x for x in sentence if x!=0] 190 | while len(sent)<4: 191 | sent.append(0) 192 | #sent = ' '.join([ixtoword[x] for x in sent]) 193 | sent = ''.join([ixtoword[x] for x in sent]) 194 | return sent 195 | -------------------------------------------------------------------------------- /pycocoevalcap/cider/cider_scorer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Tsung-Yi Lin 3 | # Ramakrishna Vedantam 4 | 5 | import copy 6 | from collections import defaultdict 7 | import numpy as np 8 | import pdb 9 | import math 10 | 11 | def precook(s, n=4, out=False): 12 | """ 13 | Takes a string as input and returns an object that can be given to 14 | either cook_refs or cook_test. This is optional: cook_refs and cook_test 15 | can take string arguments as well. 16 | :param s: string : sentence to be converted into ngrams 17 | :param n: int : number of ngrams for which representation is calculated 18 | :return: term frequency vector for occuring ngrams 19 | """ 20 | words = s.split() 21 | counts = defaultdict(int) 22 | for k in xrange(1,n+1): 23 | for i in xrange(len(words)-k+1): 24 | ngram = tuple(words[i:i+k]) 25 | counts[ngram] += 1 26 | return counts 27 | 28 | def cook_refs(refs, n=4): ## lhuang: oracle will call with "average" 29 | '''Takes a list of reference sentences for a single segment 30 | and returns an object that encapsulates everything that BLEU 31 | needs to know about them. 
32 | :param refs: list of string : reference sentences for some image 33 | :param n: int : number of ngrams for which (ngram) representation is calculated 34 | :return: result (list of dict) 35 | ''' 36 | return [precook(ref, n) for ref in refs] 37 | 38 | def cook_test(test, n=4): 39 | '''Takes a test sentence and returns an object that 40 | encapsulates everything that BLEU needs to know about it. 41 | :param test: list of string : hypothesis sentence for some image 42 | :param n: int : number of ngrams for which (ngram) representation is calculated 43 | :return: result (dict) 44 | ''' 45 | return precook(test, n, True) 46 | 47 | class CiderScorer(object): 48 | """CIDEr scorer. 49 | """ 50 | 51 | def copy(self): 52 | ''' copy the refs.''' 53 | new = CiderScorer(n=self.n) 54 | new.ctest = copy.copy(self.ctest) 55 | new.crefs = copy.copy(self.crefs) 56 | return new 57 | 58 | def __init__(self, test=None, refs=None, n=4, sigma=6.0): 59 | ''' singular instance ''' 60 | self.n = n 61 | self.sigma = sigma 62 | self.crefs = [] 63 | self.ctest = [] 64 | self.document_frequency = defaultdict(float) 65 | self.cook_append(test, refs) 66 | self.ref_len = None 67 | 68 | def cook_append(self, test, refs): 69 | '''called by constructor and __iadd__ to avoid creating new instances.''' 70 | 71 | if refs is not None: 72 | self.crefs.append(cook_refs(refs)) 73 | if test is not None: 74 | self.ctest.append(cook_test(test)) ## N.B.: -1 75 | else: 76 | self.ctest.append(None) # lens of crefs and ctest have to match 77 | 78 | def size(self): 79 | assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest)) 80 | return len(self.crefs) 81 | 82 | def __iadd__(self, other): 83 | '''add an instance (e.g., from another sentence).''' 84 | 85 | if type(other) is tuple: 86 | ## avoid creating new CiderScorer instances 87 | self.cook_append(other[0], other[1]) 88 | else: 89 | self.ctest.extend(other.ctest) 90 | self.crefs.extend(other.crefs) 91 | 92 | return self 93 | def compute_doc_freq(self): 94 | ''' 95 | Compute term frequency for reference data. 96 | This will be used to compute idf (inverse document frequency later) 97 | The term frequency is stored in the object 98 | :return: None 99 | ''' 100 | for refs in self.crefs: 101 | # refs, k ref captions of one image 102 | for ngram in set([ngram for ref in refs for (ngram,count) in ref.iteritems()]): 103 | self.document_frequency[ngram] += 1 104 | # maxcounts[ngram] = max(maxcounts.get(ngram,0), count) 105 | 106 | def compute_cider(self): 107 | def counts2vec(cnts): 108 | """ 109 | Function maps counts of ngram to vector of tfidf weights. 110 | The function returns vec, an array of dictionary that store mapping of n-gram and tf-idf weights. 111 | The n-th entry of array denotes length of n-grams. 112 | :param cnts: 113 | :return: vec (array of dict), norm (array of float), length (int) 114 | """ 115 | vec = [defaultdict(float) for _ in range(self.n)] 116 | length = 0 117 | norm = [0.0 for _ in range(self.n)] 118 | for (ngram,term_freq) in cnts.iteritems(): 119 | # give word count 1 if it doesn't appear in reference corpus 120 | df = np.log(max(1.0, self.document_frequency[ngram])) 121 | # ngram index 122 | n = len(ngram)-1 123 | # tf (term_freq) * idf (precomputed idf) for n-grams 124 | vec[n][ngram] = float(term_freq)*(self.ref_len - df) 125 | # compute norm for the vector. 
the norm will be used for computing similarity 126 | norm[n] += pow(vec[n][ngram], 2) 127 | 128 | if n == 1: 129 | length += term_freq 130 | norm = [np.sqrt(n) for n in norm] 131 | return vec, norm, length 132 | 133 | def sim(vec_hyp, vec_ref, norm_hyp, norm_ref, length_hyp, length_ref): 134 | ''' 135 | Compute the cosine similarity of two vectors. 136 | :param vec_hyp: array of dictionary for vector corresponding to hypothesis 137 | :param vec_ref: array of dictionary for vector corresponding to reference 138 | :param norm_hyp: array of float for vector corresponding to hypothesis 139 | :param norm_ref: array of float for vector corresponding to reference 140 | :param length_hyp: int containing length of hypothesis 141 | :param length_ref: int containing length of reference 142 | :return: array of score for each n-grams cosine similarity 143 | ''' 144 | delta = float(length_hyp - length_ref) 145 | # measure consine similarity 146 | val = np.array([0.0 for _ in range(self.n)]) 147 | for n in range(self.n): 148 | # ngram 149 | for (ngram,count) in vec_hyp[n].iteritems(): 150 | # vrama91 : added clipping 151 | val[n] += min(vec_hyp[n][ngram], vec_ref[n][ngram]) * vec_ref[n][ngram] 152 | 153 | if (norm_hyp[n] != 0) and (norm_ref[n] != 0): 154 | val[n] /= (norm_hyp[n]*norm_ref[n]) 155 | 156 | assert(not math.isnan(val[n])) 157 | # vrama91: added a length based gaussian penalty 158 | val[n] *= np.e**(-(delta**2)/(2*self.sigma**2)) 159 | return val 160 | 161 | # compute log reference length 162 | self.ref_len = np.log(float(len(self.crefs))) 163 | 164 | scores = [] 165 | for test, refs in zip(self.ctest, self.crefs): 166 | # compute vector for test captions 167 | vec, norm, length = counts2vec(test) 168 | # compute vector for ref captions 169 | score = np.array([0.0 for _ in range(self.n)]) 170 | for ref in refs: 171 | vec_ref, norm_ref, length_ref = counts2vec(ref) 172 | score += sim(vec, vec_ref, norm, norm_ref, length, length_ref) 173 | # change by vrama91 - mean of ngram scores, instead of sum 174 | score_avg = np.mean(score) 175 | # divide by number of references 176 | score_avg /= len(refs) 177 | # multiply score by 10 178 | score_avg *= 10.0 179 | # append score of an image to the score list 180 | scores.append(score_avg) 181 | return scores 182 | 183 | def compute_score(self, option=None, verbose=0): 184 | # compute idf 185 | self.compute_doc_freq() 186 | # assert to check document frequency 187 | assert(len(self.ctest) >= max(self.document_frequency.values())) 188 | # compute cider score 189 | score = self.compute_cider() 190 | # debug 191 | # print score 192 | return np.mean(np.array(score)), np.array(score) -------------------------------------------------------------------------------- /pycocoevalcap/bleu/bleu_scorer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # bleu_scorer.py 4 | # David Chiang 5 | 6 | # Copyright (c) 2004-2006 University of Maryland. All rights 7 | # reserved. Do not redistribute without permission from the 8 | # author. Not for commercial use. 9 | 10 | # Modified by: 11 | # Hao Fang 12 | # Tsung-Yi Lin 13 | 14 | '''Provides: 15 | cook_refs(refs, n=4): Transform a list of reference sentences as strings into a form usable by cook_test(). 16 | cook_test(test, refs, n=4): Transform a test sentence as a string (together with the cooked reference sentences) into a form usable by score_cooked(). 
17 | ''' 18 | 19 | import copy 20 | import sys, math, re 21 | from collections import defaultdict 22 | 23 | def precook(s, n=4, out=False): 24 | """Takes a string as input and returns an object that can be given to 25 | either cook_refs or cook_test. This is optional: cook_refs and cook_test 26 | can take string arguments as well.""" 27 | words = s.split() 28 | counts = defaultdict(int) 29 | for k in xrange(1,n+1): 30 | for i in xrange(len(words)-k+1): 31 | ngram = tuple(words[i:i+k]) 32 | counts[ngram] += 1 33 | return (len(words), counts) 34 | 35 | def cook_refs(refs, eff=None, n=4): ## lhuang: oracle will call with "average" 36 | '''Takes a list of reference sentences for a single segment 37 | and returns an object that encapsulates everything that BLEU 38 | needs to know about them.''' 39 | 40 | reflen = [] 41 | maxcounts = {} 42 | for ref in refs: 43 | rl, counts = precook(ref, n) 44 | reflen.append(rl) 45 | for (ngram,count) in counts.iteritems(): 46 | maxcounts[ngram] = max(maxcounts.get(ngram,0), count) 47 | 48 | # Calculate effective reference sentence length. 49 | if eff == "shortest": 50 | reflen = min(reflen) 51 | elif eff == "average": 52 | reflen = float(sum(reflen))/len(reflen) 53 | 54 | ## lhuang: N.B.: leave reflen computaiton to the very end!! 55 | 56 | ## lhuang: N.B.: in case of "closest", keep a list of reflens!! (bad design) 57 | 58 | return (reflen, maxcounts) 59 | 60 | def cook_test(test, (reflen, refmaxcounts), eff=None, n=4): 61 | '''Takes a test sentence and returns an object that 62 | encapsulates everything that BLEU needs to know about it.''' 63 | 64 | testlen, counts = precook(test, n, True) 65 | 66 | result = {} 67 | 68 | # Calculate effective reference sentence length. 69 | 70 | if eff == "closest": 71 | result["reflen"] = min((abs(l-testlen), l) for l in reflen)[1] 72 | else: ## i.e., "average" or "shortest" or None 73 | result["reflen"] = reflen 74 | 75 | result["testlen"] = testlen 76 | 77 | result["guess"] = [max(0,testlen-k+1) for k in xrange(1,n+1)] 78 | 79 | result['correct'] = [0]*n 80 | for (ngram, count) in counts.iteritems(): 81 | result["correct"][len(ngram)-1] += min(refmaxcounts.get(ngram,0), count) 82 | 83 | return result 84 | 85 | class BleuScorer(object): 86 | """Bleu scorer. 87 | """ 88 | 89 | __slots__ = "n", "crefs", "ctest", "_score", "_ratio", "_testlen", "_reflen", "special_reflen" 90 | # special_reflen is used in oracle (proportional effective ref len for a node). 
91 | 92 | def copy(self): 93 | ''' copy the refs.''' 94 | new = BleuScorer(n=self.n) 95 | new.ctest = copy.copy(self.ctest) 96 | new.crefs = copy.copy(self.crefs) 97 | new._score = None 98 | return new 99 | 100 | def __init__(self, test=None, refs=None, n=4, special_reflen=None): 101 | ''' singular instance ''' 102 | 103 | self.n = n 104 | self.crefs = [] 105 | self.ctest = [] 106 | self.cook_append(test, refs) 107 | self.special_reflen = special_reflen 108 | 109 | def cook_append(self, test, refs): 110 | '''called by constructor and __iadd__ to avoid creating new instances.''' 111 | 112 | if refs is not None: 113 | self.crefs.append(cook_refs(refs)) 114 | if test is not None: 115 | cooked_test = cook_test(test, self.crefs[-1]) 116 | self.ctest.append(cooked_test) ## N.B.: -1 117 | else: 118 | self.ctest.append(None) # lens of crefs and ctest have to match 119 | 120 | self._score = None ## need to recompute 121 | 122 | def ratio(self, option=None): 123 | self.compute_score(option=option) 124 | return self._ratio 125 | 126 | def score_ratio(self, option=None): 127 | '''return (bleu, len_ratio) pair''' 128 | return (self.fscore(option=option), self.ratio(option=option)) 129 | 130 | def score_ratio_str(self, option=None): 131 | return "%.4f (%.2f)" % self.score_ratio(option) 132 | 133 | def reflen(self, option=None): 134 | self.compute_score(option=option) 135 | return self._reflen 136 | 137 | def testlen(self, option=None): 138 | self.compute_score(option=option) 139 | return self._testlen 140 | 141 | def retest(self, new_test): 142 | if type(new_test) is str: 143 | new_test = [new_test] 144 | assert len(new_test) == len(self.crefs), new_test 145 | self.ctest = [] 146 | for t, rs in zip(new_test, self.crefs): 147 | self.ctest.append(cook_test(t, rs)) 148 | self._score = None 149 | 150 | return self 151 | 152 | def rescore(self, new_test): 153 | ''' replace test(s) with new test(s), and returns the new score.''' 154 | 155 | return self.retest(new_test).compute_score() 156 | 157 | def size(self): 158 | assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest)) 159 | return len(self.crefs) 160 | 161 | def __iadd__(self, other): 162 | '''add an instance (e.g., from another sentence).''' 163 | 164 | if type(other) is tuple: 165 | ## avoid creating new BleuScorer instances 166 | self.cook_append(other[0], other[1]) 167 | else: 168 | assert self.compatible(other), "incompatible BLEUs." 
169 | self.ctest.extend(other.ctest) 170 | self.crefs.extend(other.crefs) 171 | self._score = None ## need to recompute 172 | 173 | return self 174 | 175 | def compatible(self, other): 176 | return isinstance(other, BleuScorer) and self.n == other.n 177 | 178 | def single_reflen(self, option="average"): 179 | return self._single_reflen(self.crefs[0][0], option) 180 | 181 | def _single_reflen(self, reflens, option=None, testlen=None): 182 | 183 | if option == "shortest": 184 | reflen = min(reflens) 185 | elif option == "average": 186 | reflen = float(sum(reflens))/len(reflens) 187 | elif option == "closest": 188 | reflen = min((abs(l-testlen), l) for l in reflens)[1] 189 | else: 190 | assert False, "unsupported reflen option %s" % option 191 | 192 | return reflen 193 | 194 | def recompute_score(self, option=None, verbose=0): 195 | self._score = None 196 | return self.compute_score(option, verbose) 197 | 198 | def compute_score(self, option=None, verbose=0): 199 | n = self.n 200 | small = 1e-9 201 | tiny = 1e-15 ## so that if guess is 0 still return 0 202 | bleu_list = [[] for _ in range(n)] 203 | 204 | if self._score is not None: 205 | return self._score 206 | 207 | if option is None: 208 | option = "average" if len(self.crefs) == 1 else "closest" 209 | 210 | self._testlen = 0 211 | self._reflen = 0 212 | totalcomps = {'testlen':0, 'reflen':0, 'guess':[0]*n, 'correct':[0]*n} 213 | 214 | # for each sentence 215 | for comps in self.ctest: 216 | testlen = comps['testlen'] 217 | self._testlen += testlen 218 | 219 | if self.special_reflen is None: ## need computation 220 | reflen = self._single_reflen(comps['reflen'], option, testlen) 221 | else: 222 | reflen = self.special_reflen 223 | 224 | self._reflen += reflen 225 | 226 | for key in ['guess','correct']: 227 | for k in xrange(n): 228 | totalcomps[key][k] += comps[key][k] 229 | 230 | # append per image bleu score 231 | bleu = 1. 232 | for k in xrange(n): 233 | bleu *= (float(comps['correct'][k]) + tiny) \ 234 | /(float(comps['guess'][k]) + small) 235 | bleu_list[k].append(bleu ** (1./(k+1))) 236 | ratio = (testlen + tiny) / (reflen + small) ## N.B.: avoid zero division 237 | if ratio < 1: 238 | for k in xrange(n): 239 | bleu_list[k][-1] *= math.exp(1 - 1/ratio) 240 | 241 | if verbose > 1: 242 | print comps, reflen 243 | 244 | totalcomps['reflen'] = self._reflen 245 | totalcomps['testlen'] = self._testlen 246 | 247 | bleus = [] 248 | bleu = 1. 
249 | for k in xrange(n): 250 | bleu *= float(totalcomps['correct'][k] + tiny) \ 251 | / (totalcomps['guess'][k] + small) 252 | bleus.append(bleu ** (1./(k+1))) 253 | ratio = (self._testlen + tiny) / (self._reflen + small) ## N.B.: avoid zero division 254 | if ratio < 1: 255 | for k in xrange(n): 256 | bleus[k] *= math.exp(1 - 1/ratio) 257 | 258 | #if verbose > 0: 259 | # print totalcomps 260 | # print "ratio:", ratio 261 | 262 | self._score = bleus 263 | return self._score, bleu_list 264 | -------------------------------------------------------------------------------- /demo.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Yizhe Zhang 4 | 5 | TextCNN 6 | """ 7 | ## 152.3.214.203/6006 8 | 9 | import os 10 | 11 | GPUID = 1 12 | os.environ['CUDA_VISIBLE_DEVICES'] = str(GPUID) 13 | 14 | import tensorflow as tf 15 | from tensorflow.contrib import learn 16 | from tensorflow.contrib import layers 17 | # from tensorflow.contrib import metrics 18 | # from tensorflow.contrib.learn import monitors 19 | from tensorflow.contrib import framework 20 | from tensorflow.contrib.learn.python.learn import learn_runner 21 | from tensorflow.python.platform import tf_logging as logging 22 | # from tensorflow.contrib.learn.python.learn.metric_spec import MetricSpec 23 | import cPickle 24 | import numpy as np 25 | import os 26 | import scipy.io as sio 27 | from math import floor 28 | import pdb 29 | 30 | from model import * 31 | from utils import prepare_data_for_cnn, prepare_data_for_rnn, get_minibatches_idx, normalizing, restore_from_save, \ 32 | prepare_for_bleu, cal_BLEU, sent2idx 33 | from denoise import * 34 | 35 | # import tempfile 36 | # from tensorflow.examples.tutorials.mnist import input_data 37 | 38 | logging.set_verbosity(logging.INFO) 39 | # Basic model parameters as external flags. 
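# NOTE: tf.app.flags is initialized here but no flags are defined in this file; all
# hyper-parameters live in the Options class below. Also, since CUDA_VISIBLE_DEVICES is
# restricted to a single GPU above, that GPU is exposed to TensorFlow as '/gpu:0', so the
# later tf.device('/gpu:1') placement relies on allow_soft_placement=True to fall back.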
40 | flags = tf.app.flags 41 | FLAGS = flags.FLAGS 42 | 43 | 44 | class Options(object): 45 | def __init__(self): 46 | self.fix_emb = False 47 | self.reuse_w = False 48 | self.reuse_cnn = False 49 | self.reuse_discrimination = True # reuse cnn for discrimination 50 | self.restore = True 51 | self.tanh = True # activation fun for the top layer of cnn, otherwise relu 52 | self.model = 'cnn_deconv' # 'cnn_rnn', 'rnn_rnn' , default: cnn_deconv 53 | 54 | self.permutation = 0 55 | self.substitution = 's' # Deletion(d), Insertion(a), Substitution(s) and Permutation(p) 56 | 57 | self.W_emb = None 58 | self.cnn_W = None 59 | self.cnn_b = None 60 | self.maxlen = 253 61 | self.n_words = None 62 | self.filter_shape = 5 63 | self.filter_size = 300 64 | self.embed_size = 300 65 | self.lr = 1e-5 66 | self.layer = 3 67 | self.stride = [2, 2, 2] # for two layer cnn/deconv , use self.stride[0] 68 | self.batch_size = 32 69 | self.max_epochs = 100 70 | self.n_gan = 900 # self.filter_size * 3 71 | self.L = 100 72 | 73 | self.save_path = "./save/" + "hotel_" + str(self.n_gan) + "_dim_" + self.model + "_" + self.substitution + str( 74 | self.permutation) 75 | self.log_path = "./log" 76 | self.print_freq = 100 77 | self.valid_freq = 100 78 | 79 | # batch norm & dropout 80 | self.batch_norm = False 81 | self.cnn_layer_dropout = False 82 | self.dropout = True 83 | self.dropout_ratio = 1.0 84 | self.is_train = True 85 | 86 | self.discrimination = False 87 | self.H_dis = 300 88 | 89 | self.sent_len = self.maxlen + 2 * (self.filter_shape - 1) 90 | self.sent_len2 = np.int32(floor((self.sent_len - self.filter_shape) / self.stride[0]) + 1) 91 | self.sent_len3 = np.int32(floor((self.sent_len2 - self.filter_shape) / self.stride[1]) + 1) 92 | self.sent_len4 = np.int32(floor((self.sent_len3 - self.filter_shape)/self.stride[2]) + 1) 93 | print ('Use model %s' % self.model) 94 | print ('Use %d conv/deconv layers' % self.layer) 95 | 96 | def __iter__(self): 97 | for attr, value in self.__dict__.iteritems(): 98 | yield attr, value 99 | 100 | 101 | def auto_encoder(x, x_org, opt, opt_t=None): 102 | # print x.get_shape() # batch L 103 | if not opt_t: opt_t = opt 104 | x_emb, W_norm = embedding(x, opt) # batch L emb 105 | x_emb = tf.expand_dims(x_emb, 3) # batch L emb 1 106 | 107 | res = {} 108 | 109 | # cnn encoder 110 | if opt.layer == 4: 111 | H_enc = conv_model_4layer(x_emb, opt) 112 | elif opt.layer == 3: 113 | H_enc = conv_model_3layer(x_emb, opt) 114 | else: 115 | H_enc = conv_model(x_emb, opt) 116 | 117 | H_dec = H_enc 118 | # deconv decoder 119 | if opt.layer == 4: 120 | x_rec = deconv_model_4layer(H_dec, opt_t) # batch L emb 1 121 | elif opt.layer == 3: 122 | x_rec = deconv_model_3layer(H_dec, opt_t) # batch L emb 1 123 | else: 124 | x_rec = deconv_model(H_dec, opt_t) # batch L emb 1 125 | print("Encoder len %d Decoder len %d Output len %d" % ( 126 | x_emb.get_shape()[1], x_rec.get_shape()[1], x_org.get_shape()[1])) 127 | tf.assert_equal(x_rec.get_shape(), x_emb.get_shape()) 128 | tf.assert_equal(x_emb.get_shape()[1], x_org.get_shape()[1]) 129 | x_rec_norm = normalizing(x_rec, 2) # batch L emb 130 | 131 | if opt.fix_emb: 132 | # cosine sim 133 | # Batch L emb 134 | loss = -tf.reduce_sum(x_rec_norm * x_emb) 135 | rec_sent = tf.argmax(tf.tensordot(tf.squeeze(x_rec_norm), W_norm, [[2], [1]]), 2) 136 | res['rec_sents'] = rec_sent 137 | 138 | 139 | else: 140 | x_temp = tf.reshape(x_org, [-1, ]) 141 | prob_logits = tf.tensordot(tf.squeeze(x_rec_norm), W_norm, [[2], [1]]) # c_blv = sum_e x_ble W_ve 142 | 143 | prob = 
tf.nn.log_softmax(prob_logits * opt_t.L, dim=-1, name=None) 144 | rec_sent = tf.squeeze(tf.argmax(prob, 2)) 145 | prob = tf.reshape(prob, [-1, opt_t.n_words]) 146 | 147 | idx = tf.range(opt.batch_size * opt_t.sent_len) 148 | 149 | all_idx = tf.transpose(tf.stack(values=[idx, x_temp])) 150 | all_prob = tf.gather_nd(prob, all_idx) 151 | 152 | gen_temp = tf.cast(tf.reshape(rec_sent, [-1, ]), tf.int32) 153 | gen_idx = tf.transpose(tf.stack(values=[idx, gen_temp])) 154 | gen_prob = tf.gather_nd(prob, gen_idx) 155 | 156 | res['rec_sents'] = rec_sent 157 | 158 | res['gen_p'] = tf.exp(gen_prob[0:opt.sent_len]) 159 | res['all_p'] = tf.exp(all_prob[0:opt.sent_len]) 160 | 161 | if opt.discrimination: 162 | logits_real, _ = discriminator(x_org, W_norm, opt_t) 163 | prob_one_hot = tf.nn.log_softmax(prob_logits * opt_t.L * 100, dim=-1, name=None) 164 | logits_syn, _ = discriminator(tf.exp(prob_one_hot), W_norm, opt_t, is_prob=True, is_reuse=True) 165 | 166 | res['prob_r'] = tf.reduce_mean(tf.nn.sigmoid(logits_real)) 167 | res['prob_f'] = tf.reduce_mean(tf.nn.sigmoid(logits_syn)) 168 | 169 | loss = tf.reduce_mean( 170 | tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.ones_like(logits_real), logits=logits_real)) + \ 171 | tf.reduce_mean( 172 | tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.zeros_like(logits_syn), logits=logits_syn)) 173 | else: 174 | loss = -tf.reduce_mean(all_prob) 175 | 176 | tf.summary.scalar('loss', loss) 177 | 178 | train_op = layers.optimize_loss( 179 | loss, 180 | framework.get_global_step(), 181 | optimizer='Adam', 182 | learning_rate=opt.lr) 183 | return res, loss, train_op 184 | 185 | 186 | def main(): 187 | # global n_words 188 | # Prepare training and testing data 189 | loadpath = "./data/hotel_reviews.p" 190 | x = cPickle.load(open(loadpath, "rb")) 191 | train, val = x[0], x[1] 192 | wordtoix, ixtoword = x[2], x[3] 193 | train = [list(s) for s in train] 194 | val = [list(s) for s in val] 195 | opt = Options() 196 | opt.n_words = len(ixtoword) + 1 197 | ixtoword[opt.n_words - 1] = 'GO_' 198 | print dict(opt) 199 | print('Total words: %d' % opt.n_words) 200 | 201 | try: 202 | params = np.load('./param_g.npz') 203 | if params['Wemb'].shape == (opt.n_words, opt.embed_size): 204 | print('Use saved embedding.') 205 | opt.W_emb = params['Wemb'] 206 | else: 207 | print('Emb Dimension mismatch: param_g.npz:' + str(params['Wemb'].shape) + ' opt: ' + str( 208 | (opt.n_words, opt.embed_size))) 209 | opt.fix_emb = False 210 | except IOError: 211 | print('No embedding file found.') 212 | opt.fix_emb = False 213 | 214 | with tf.device('/gpu:1'): 215 | x_ = tf.placeholder(tf.int32, shape=[opt.batch_size, opt.sent_len]) 216 | x_org_ = tf.placeholder(tf.int32, shape=[opt.batch_size, opt.sent_len]) 217 | res_, loss_, train_op = auto_encoder(x_, x_org_, opt) 218 | merged = tf.summary.merge_all() 219 | 220 | 221 | 222 | uidx = 0 223 | config = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True) 224 | config.gpu_options.allow_growth = True 225 | np.set_printoptions(precision=3) 226 | np.set_printoptions(threshold=np.inf) 227 | saver = tf.train.Saver() 228 | 229 | with tf.Session(config=config) as sess: 230 | train_writer = tf.summary.FileWriter(opt.log_path + '/train', sess.graph) 231 | test_writer = tf.summary.FileWriter(opt.log_path + '/test', sess.graph) 232 | sess.run(tf.global_variables_initializer()) 233 | if opt.restore: 234 | try: 235 | t_vars = tf.trainable_variables() 236 | loader = restore_from_save(t_vars, sess, opt) 237 | 238 | except Exception as e: 239 | 
print(e) 240 | print("No saving session, using random initialization") 241 | sess.run(tf.global_variables_initializer()) 242 | 243 | for epoch in range(opt.max_epochs): 244 | print("Starting epoch %d" % epoch) 245 | kf = get_minibatches_idx(len(train), opt.batch_size, shuffle=True) 246 | for _, train_index in kf: 247 | uidx += 1 248 | sents = [train[t] for t in train_index] 249 | 250 | sents_permutated = add_noise(sents, opt) 251 | 252 | if opt.model != 'rnn_rnn' and opt.model != 'cnn_rnn': 253 | x_batch_org = prepare_data_for_cnn(sents, opt) # Batch L 254 | else: 255 | x_batch_org = prepare_data_for_rnn(sents, opt) # Batch L 256 | 257 | if opt.model != 'rnn_rnn': 258 | x_batch = prepare_data_for_cnn(sents_permutated, opt) # Batch L 259 | else: 260 | x_batch = prepare_data_for_rnn(sents_permutated, opt, is_add_GO=False) # Batch L 261 | 262 | _, loss = sess.run([train_op, loss_], feed_dict={x_: x_batch, x_org_: x_batch_org}) 263 | 264 | if uidx % opt.valid_freq == 0: 265 | opt.is_train = False 266 | valid_index = np.random.choice(len(val), opt.batch_size) 267 | val_sents = [val[t] for t in valid_index] 268 | 269 | val_sents_permutated = add_noise(val_sents, opt) 270 | 271 | if opt.model != 'rnn_rnn' and opt.model != 'cnn_rnn': 272 | x_val_batch_org = prepare_data_for_cnn(val_sents, opt) 273 | else: 274 | x_val_batch_org = prepare_data_for_rnn(val_sents, opt) 275 | 276 | if opt.model != 'rnn_rnn': 277 | x_val_batch = prepare_data_for_cnn(val_sents_permutated, opt) 278 | else: 279 | x_val_batch = prepare_data_for_rnn(val_sents_permutated, opt, is_add_GO=False) 280 | 281 | loss_val = sess.run(loss_, feed_dict={x_: x_val_batch, x_org_: x_val_batch_org}) 282 | print("Validation loss %f " % (loss_val)) 283 | res = sess.run(res_, feed_dict={x_: x_val_batch, x_org_: x_val_batch_org}) 284 | if opt.discrimination: 285 | print ("Real Prob %f Fake Prob %f" % (res['prob_r'], res['prob_f'])) 286 | print "Val Orig :" + " ".join([ixtoword[x] for x in val_sents[0] if x != 0]) 287 | print "Val Perm :" + " ".join([ixtoword[x] for x in val_sents_permutated[0] if x != 0]) 288 | print "Val Recon:" + " ".join([ixtoword[x] for x in res['rec_sents'][0] if x != 0]) 289 | 290 | val_set = [prepare_for_bleu(s) for s in val_sents] 291 | [bleu2s, bleu3s, bleu4s] = cal_BLEU([prepare_for_bleu(s) for s in res['rec_sents']], {0: val_set}) 292 | print 'Val BLEU (2,3,4): ' + ' '.join([str(round(it, 3)) for it in (bleu2s, bleu3s, bleu4s)]) 293 | summary = sess.run(merged, feed_dict={x_: x_val_batch, x_org_: x_val_batch_org}) 294 | test_writer.add_summary(summary, uidx) 295 | opt.is_train = True 296 | 297 | 298 | if uidx % opt.print_freq == 0: 299 | print("Iteration %d: loss %f " % (uidx, loss)) 300 | res = sess.run(res_, feed_dict={x_: x_batch, x_org_: x_batch_org}) 301 | print "Original :" + " ".join([ixtoword[x] for x in sents[0] if x != 0]) 302 | print "Permutated :" + " ".join([ixtoword[x] for x in sents_permutated[0] if x != 0]) 303 | if opt.model == 'rnn_rnn' or opt.model == 'cnn_rnn': 304 | print "Reconstructed:" + " ".join([ixtoword[x] for x in res['rec_sents_feed_y'][0] if x != 0]) 305 | print "Reconstructed:" + " ".join([ixtoword[x] for x in res['rec_sents'][0] if x != 0]) 306 | 307 | 308 | summary = sess.run(merged, feed_dict={x_: x_batch, x_org_: x_batch_org}) 309 | train_writer.add_summary(summary, uidx) 310 | 311 | saver.save(sess, opt.save_path, global_step=epoch) 312 | 313 | 314 | if __name__ == '__main__': 315 | main() 316 | -------------------------------------------------------------------------------- 
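Note on demo.py: main() expects ./data/hotel_reviews.p to be a pickled list whose first four entries are the training sentences, the validation sentences, the word-to-index dict and the index-to-word dict (see the unpacking at the top of main()); sentences are sequences of integer word ids, with id 0 used for padding. The exact preprocessing behind this pickle is not shown in this section, so the sketch below only illustrates the expected layout; the tokenization, the 'END' name for id 0 and the helper name build_hotel_pickle are illustrative assumptions, not the procedure used for the paper.

import cPickle

def build_hotel_pickle(train_sents, val_sents, out_path='./data/hotel_reviews.p'):
    # train_sents / val_sents: lists of already-tokenized sentences (lists of word strings)
    words = ['END'] + sorted({w for s in train_sents + val_sents for w in s})
    wordtoix = {w: i for i, w in enumerate(words)}   # id 0 = padding / end symbol (assumed name)
    ixtoword = {i: w for i, w in enumerate(words)}   # demo.main() appends the 'GO_' symbol itself
    train = [[wordtoix[w] for w in s] for s in train_sents]
    val = [[wordtoix[w] for w in s] for s in val_sents]
    with open(out_path, 'wb') as f:
        cPickle.dump([train, val, wordtoix, ixtoword], f)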
/data_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Utilities for downloading data from WMT, tokenizing, vocabularies.""" 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import gzip 22 | import os 23 | import re 24 | import tarfile 25 | import pdb 26 | 27 | from six.moves import urllib 28 | 29 | from tensorflow.python.platform import gfile 30 | import tensorflow as tf 31 | 32 | # Special vocabulary symbols - we always put them at the start. 33 | _PAD = b"_PAD" 34 | _GO = b"_GO" 35 | _EOS = b"_EOS" 36 | _UNK = b"_UNK" 37 | _START_VOCAB = [_PAD, _GO, _EOS, _UNK] 38 | 39 | PAD_ID = 0 40 | GO_ID = 1 41 | EOS_ID = 2 42 | UNK_ID = 3 43 | 44 | # Regular expressions used to tokenize. 45 | _WORD_SPLIT = re.compile(b"([.,!?\"':;)(])") 46 | _DIGIT_RE = re.compile(br"\d") 47 | 48 | # URLs for WMT data. 49 | _WMT_ENFR_TRAIN_URL = "http://www.statmt.org/wmt10/training-giga-fren.tar" 50 | _WMT_ENFR_DEV_URL = "http://www.statmt.org/wmt15/dev-v2.tgz" 51 | 52 | 53 | def maybe_download(directory, filename, url): 54 | """Download filename from url unless it's already in directory.""" 55 | if not os.path.exists(directory): 56 | print("Creating directory %s" % directory) 57 | os.mkdir(directory) 58 | filepath = os.path.join(directory, filename) 59 | if not os.path.exists(filepath): 60 | print("Downloading %s to %s" % (url, filepath)) 61 | filepath, _ = urllib.request.urlretrieve(url, filepath) 62 | statinfo = os.stat(filepath) 63 | print("Successfully downloaded", filename, statinfo.st_size, "bytes") 64 | return filepath 65 | 66 | 67 | def gunzip_file(gz_path, new_path): 68 | """Unzips from gz_path into new_path.""" 69 | print("Unpacking %s to %s" % (gz_path, new_path)) 70 | with gzip.open(gz_path, "rb") as gz_file: 71 | with open(new_path, "wb") as new_file: 72 | for line in gz_file: 73 | new_file.write(line) 74 | 75 | 76 | def get_wmt_enfr_train_set(directory): 77 | """Download the WMT en-fr training corpus to directory unless it's there.""" 78 | train_path = os.path.join(directory, "giga-fren.release2.fixed") 79 | if not (gfile.Exists(train_path +".fr") and gfile.Exists(train_path +".en")): 80 | corpus_file = maybe_download(directory, "training-giga-fren.tar", 81 | _WMT_ENFR_TRAIN_URL) 82 | print("Extracting tar file %s" % corpus_file) 83 | with tarfile.open(corpus_file, "r") as corpus_tar: 84 | corpus_tar.extractall(directory) 85 | gunzip_file(train_path + ".fr.gz", train_path + ".fr") 86 | gunzip_file(train_path + ".en.gz", train_path + ".en") 87 | return train_path 88 | 89 | 90 | def get_wmt_enfr_dev_set(directory): 91 | """Download the WMT en-fr training corpus to directory unless it's there.""" 92 | dev_name = "newstest2013" 93 | 
dev_path = os.path.join(directory, dev_name) 94 | if not (gfile.Exists(dev_path + ".fr") and gfile.Exists(dev_path + ".en")): 95 | dev_file = maybe_download(directory, "dev-v2.tgz", _WMT_ENFR_DEV_URL) 96 | print("Extracting tgz file %s" % dev_file) 97 | with tarfile.open(dev_file, "r:gz") as dev_tar: 98 | fr_dev_file = dev_tar.getmember("dev/" + dev_name + ".fr") 99 | en_dev_file = dev_tar.getmember("dev/" + dev_name + ".en") 100 | fr_dev_file.name = dev_name + ".fr" # Extract without "dev/" prefix. 101 | en_dev_file.name = dev_name + ".en" 102 | dev_tar.extract(fr_dev_file, directory) 103 | dev_tar.extract(en_dev_file, directory) 104 | return dev_path 105 | 106 | 107 | def basic_tokenizer(sentence): 108 | """Very basic tokenizer: split the sentence into a list of tokens.""" 109 | words = [] 110 | for space_separated_fragment in sentence.strip().split(): 111 | words.extend(_WORD_SPLIT.split(space_separated_fragment)) 112 | return [w for w in words if w] 113 | 114 | 115 | def create_vocabulary(vocabulary_path, data_path, max_vocabulary_size, 116 | tokenizer=None, normalize_digits=True): 117 | """Create vocabulary file (if it does not exist yet) from data file. 118 | 119 | Data file is assumed to contain one sentence per line. Each sentence is 120 | tokenized and digits are normalized (if normalize_digits is set). 121 | Vocabulary contains the most-frequent tokens up to max_vocabulary_size. 122 | We write it to vocabulary_path in a one-token-per-line format, so that later 123 | token in the first line gets id=0, second line gets id=1, and so on. 124 | 125 | Args: 126 | vocabulary_path: path where the vocabulary will be created. 127 | data_path: data file that will be used to create vocabulary. 128 | max_vocabulary_size: limit on the size of the created vocabulary. 129 | tokenizer: a function to use to tokenize each data sentence; 130 | if None, basic_tokenizer will be used. 131 | normalize_digits: Boolean; if true, all digits are replaced by 0s. 132 | """ 133 | if not gfile.Exists(vocabulary_path): 134 | print("Creating vocabulary %s from data %s" % (vocabulary_path, data_path)) 135 | vocab = {} 136 | #pdb.set_trace() 137 | with gfile.GFile(data_path, mode="rb") as f: 138 | counter = 0 139 | for line in f: 140 | counter += 1 141 | if counter % 100000 == 0: 142 | print(" processing line %d" % counter) 143 | line = tf.compat.as_bytes(line) 144 | tokens = tokenizer(line) if tokenizer else basic_tokenizer(line) 145 | for w in tokens: 146 | word = _DIGIT_RE.sub(b"0", w) if normalize_digits else w 147 | if word in vocab: 148 | vocab[word] += 1 149 | else: 150 | vocab[word] = 1 151 | vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True) 152 | if len(vocab_list) > max_vocabulary_size: 153 | vocab_list = vocab_list[:max_vocabulary_size] 154 | with gfile.GFile(vocabulary_path, mode="wb") as vocab_file: 155 | for w in vocab_list: 156 | vocab_file.write(w + b"\n") 157 | 158 | 159 | def initialize_vocabulary(vocabulary_path): 160 | """Initialize vocabulary from file. 161 | 162 | We assume the vocabulary is stored one-item-per-line, so a file: 163 | dog 164 | cat 165 | will result in a vocabulary {"dog": 0, "cat": 1}, and this function will 166 | also return the reversed-vocabulary ["dog", "cat"]. 167 | 168 | Args: 169 | vocabulary_path: path to the file containing the vocabulary. 170 | 171 | Returns: 172 | a pair: the vocabulary (a dictionary mapping string to integers), and 173 | the reversed vocabulary (a list, which reverses the vocabulary mapping). 
174 | 175 | Raises: 176 | ValueError: if the provided vocabulary_path does not exist. 177 | """ 178 | if gfile.Exists(vocabulary_path): 179 | rev_vocab = [] 180 | with gfile.GFile(vocabulary_path, mode="rb") as f: 181 | rev_vocab.extend(f.readlines()) 182 | rev_vocab = [tf.compat.as_bytes(line.strip()) for line in rev_vocab] 183 | vocab = dict([(x, y) for (y, x) in enumerate(rev_vocab)]) 184 | idtoword = dict([(y, x) for (y, x) in enumerate(rev_vocab)]) 185 | return vocab, idtoword 186 | else: 187 | raise ValueError("Vocabulary file %s not found.", vocabulary_path) 188 | 189 | 190 | def sentence_to_token_ids(sentence, vocabulary, 191 | tokenizer=None, normalize_digits=True): 192 | """Convert a string to list of integers representing token-ids. 193 | 194 | For example, a sentence "I have a dog" may become tokenized into 195 | ["I", "have", "a", "dog"] and with vocabulary {"I": 1, "have": 2, 196 | "a": 4, "dog": 7"} this function will return [1, 2, 4, 7]. 197 | 198 | Args: 199 | sentence: the sentence in bytes format to convert to token-ids. 200 | vocabulary: a dictionary mapping tokens to integers. 201 | tokenizer: a function to use to tokenize each sentence; 202 | if None, basic_tokenizer will be used. 203 | normalize_digits: Boolean; if true, all digits are replaced by 0s. 204 | 205 | Returns: 206 | a list of integers, the token-ids for the sentence. 207 | """ 208 | 209 | if tokenizer: 210 | words = tokenizer(sentence) 211 | else: 212 | words = basic_tokenizer(sentence) 213 | if not normalize_digits: 214 | return [vocabulary.get(w, UNK_ID) for w in words] 215 | # Normalize digits by 0 before looking words up in the vocabulary. 216 | return [vocabulary.get(_DIGIT_RE.sub(b"0", w), UNK_ID) for w in words] 217 | 218 | 219 | def data_to_token_ids(data_path, target_path, vocabulary_path, 220 | tokenizer=None, normalize_digits=True): 221 | """Tokenize data file and turn into token-ids using given vocabulary file. 222 | 223 | This function loads data line-by-line from data_path, calls the above 224 | sentence_to_token_ids, and saves the result to target_path. See comment 225 | for sentence_to_token_ids on the details of token-ids format. 226 | 227 | Args: 228 | data_path: path to the data file in one-sentence-per-line format. 229 | target_path: path where the file with token-ids will be created. 230 | vocabulary_path: path to the vocabulary file. 231 | tokenizer: a function to use to tokenize each sentence; 232 | if None, basic_tokenizer will be used. 233 | normalize_digits: Boolean; if true, all digits are replaced by 0s. 234 | """ 235 | if not gfile.Exists(target_path): 236 | print("Tokenizing data in %s" % data_path) 237 | vocab, _ = initialize_vocabulary(vocabulary_path) 238 | with gfile.GFile(data_path, mode="rb") as data_file: 239 | with gfile.GFile(target_path, mode="w") as tokens_file: 240 | counter = 0 241 | for line in data_file: 242 | counter += 1 243 | if counter % 100000 == 0: 244 | print(" tokenizing line %d" % counter) 245 | token_ids = sentence_to_token_ids(tf.compat.as_bytes(line), vocab, 246 | tokenizer, normalize_digits) 247 | tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n") 248 | 249 | 250 | def prepare_wmt_data(data_dir, en_vocabulary_size, fr_vocabulary_size, tokenizer=None): 251 | """Get WMT data into data_dir, create vocabularies and tokenize data. 252 | 253 | Args: 254 | data_dir: directory in which the data sets will be stored. 255 | en_vocabulary_size: size of the English vocabulary to create and use. 
256 | fr_vocabulary_size: size of the French vocabulary to create and use. 257 | tokenizer: a function to use to tokenize each data sentence; 258 | if None, basic_tokenizer will be used. 259 | 260 | Returns: 261 | A tuple of 6 elements: 262 | (1) path to the token-ids for English training data-set, 263 | (2) path to the token-ids for French training data-set, 264 | (3) path to the token-ids for English development data-set, 265 | (4) path to the token-ids for French development data-set, 266 | (5) path to the English vocabulary file, 267 | (6) path to the French vocabulary file. 268 | """ 269 | # Get wmt data to the specified directory. 270 | train_path = get_wmt_enfr_train_set(data_dir) 271 | dev_path = get_wmt_enfr_dev_set(data_dir) 272 | 273 | from_train_path = train_path + ".en" 274 | to_train_path = train_path + ".fr" 275 | from_dev_path = dev_path + ".en" 276 | to_dev_path = dev_path + ".fr" 277 | return prepare_data(data_dir, from_train_path, to_train_path, from_dev_path, to_dev_path, en_vocabulary_size, 278 | fr_vocabulary_size, tokenizer) 279 | 280 | 281 | def prepare_data(data_dir, from_train_path, to_train_path, from_dev_path, to_dev_path, from_vocabulary_size, 282 | to_vocabulary_size, tokenizer=None): 283 | """Preapre all necessary files that are required for the training. 284 | 285 | Args: 286 | data_dir: directory in which the data sets will be stored. 287 | from_train_path: path to the file that includes "from" training samples. 288 | to_train_path: path to the file that includes "to" training samples. 289 | from_dev_path: path to the file that includes "from" dev samples. 290 | to_dev_path: path to the file that includes "to" dev samples. 291 | from_vocabulary_size: size of the "from language" vocabulary to create and use. 292 | to_vocabulary_size: size of the "to language" vocabulary to create and use. 293 | tokenizer: a function to use to tokenize each data sentence; 294 | if None, basic_tokenizer will be used. 295 | 296 | Returns: 297 | A tuple of 6 elements: 298 | (1) path to the token-ids for "from language" training data-set, 299 | (2) path to the token-ids for "to language" training data-set, 300 | (3) path to the token-ids for "from language" development data-set, 301 | (4) path to the token-ids for "to language" development data-set, 302 | (5) path to the "from language" vocabulary file, 303 | (6) path to the "to language" vocabulary file. 304 | """ 305 | # Create vocabularies of the appropriate sizes. 306 | to_vocab_path = os.path.join(data_dir, "vocab%d.to" % to_vocabulary_size) 307 | from_vocab_path = os.path.join(data_dir, "vocab%d.from" % from_vocabulary_size) 308 | create_vocabulary(to_vocab_path, to_train_path , to_vocabulary_size, tokenizer) 309 | create_vocabulary(from_vocab_path, from_train_path , from_vocabulary_size, tokenizer) 310 | 311 | # Create token ids for the training data. 312 | to_train_ids_path = to_train_path + (".ids%d" % to_vocabulary_size) 313 | from_train_ids_path = from_train_path + (".ids%d" % from_vocabulary_size) 314 | data_to_token_ids(to_train_path, to_train_ids_path, to_vocab_path, tokenizer) 315 | data_to_token_ids(from_train_path, from_train_ids_path, from_vocab_path, tokenizer) 316 | 317 | # Create token ids for the development data. 
318 | to_dev_ids_path = to_dev_path + (".ids%d" % to_vocabulary_size) 319 | from_dev_ids_path = from_dev_path + (".ids%d" % from_vocabulary_size) 320 | data_to_token_ids(to_dev_path, to_dev_ids_path, to_vocab_path, tokenizer) 321 | data_to_token_ids(from_dev_path, from_dev_ids_path, from_vocab_path, tokenizer) 322 | 323 | return (from_train_ids_path, to_train_ids_path, 324 | from_dev_ids_path, to_dev_ids_path, 325 | from_vocab_path, to_vocab_path) 326 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | # import theano 3 | # from theano import config 4 | import tensorflow as tf 5 | from collections import OrderedDict 6 | import nltk 7 | from pycocoevalcap.bleu.bleu import Bleu 8 | from pycocoevalcap.rouge.rouge import Rouge 9 | from tensorflow.python import pywrap_tensorflow 10 | import pdb 11 | import data_utils 12 | import sys 13 | from tensorflow.python.ops import clip_ops 14 | from rougescore import rouge_n, rouge_1, rouge_2, rouge_l 15 | 16 | def lrelu(x, leak=0.2, name="lrelu"): 17 | with tf.variable_scope(name): 18 | f1 = 0.5 * (1 + leak) 19 | f2 = 0.5 * (1 - leak) 20 | return f1 * x + f2 * tf.abs(x) 21 | 22 | def sent2idx(text, wordtoix, opt, is_cnn = True): 23 | 24 | sent = [wordtoix[x] for x in text.split()] 25 | 26 | return prepare_data_for_cnn([sent for i in range(opt.batch_size)], opt) 27 | 28 | 29 | 30 | def prepare_data_for_cnn(seqs_x, opt): 31 | maxlen=opt.maxlen 32 | filter_h=opt.filter_shape 33 | lengths_x = [len(s) for s in seqs_x] 34 | # print lengths_x 35 | if maxlen != None: 36 | new_seqs_x = [] 37 | new_lengths_x = [] 38 | for l_x, s_x in zip(lengths_x, seqs_x): 39 | if l_x < maxlen: 40 | new_seqs_x.append(s_x) 41 | new_lengths_x.append(l_x) 42 | lengths_x = new_lengths_x 43 | seqs_x = new_seqs_x 44 | 45 | if len(lengths_x) < 1 : 46 | return None, None 47 | 48 | pad = filter_h -1 49 | x = [] 50 | for rev in seqs_x: 51 | xx = [] 52 | for i in xrange(pad): 53 | xx.append(0) 54 | for idx in rev: 55 | xx.append(idx) 56 | while len(xx) < maxlen + 2*pad: 57 | xx.append(0) 58 | x.append(xx) 59 | x = np.array(x,dtype='int32') 60 | return x 61 | 62 | 63 | def prepare_data_for_rnn(seqs_x, opt, is_add_GO = True): 64 | 65 | maxlen=opt.maxlen 66 | lengths_x = [len(s) for s in seqs_x] 67 | # print lengths_x 68 | if maxlen != None: 69 | new_seqs_x = [] 70 | new_lengths_x = [] 71 | for l_x, s_x in zip(lengths_x, seqs_x): 72 | if l_x < maxlen: 73 | new_seqs_x.append(s_x) 74 | new_lengths_x.append(l_x) 75 | lengths_x = new_lengths_x 76 | seqs_x = new_seqs_x 77 | 78 | if len(lengths_x) < 1 : 79 | return None, None 80 | 81 | n_samples = len(seqs_x) 82 | maxlen_x = np.max(lengths_x) 83 | x = np.zeros(( n_samples, opt.sent_len)).astype('int32') 84 | for idx, s_x in enumerate(seqs_x): 85 | if is_add_GO: 86 | x[idx, 0] = 1 # GO symbol 87 | x[idx, 1:lengths_x[idx]+1] = s_x 88 | else: 89 | x[idx, :lengths_x[idx]] = s_x 90 | return x 91 | 92 | 93 | 94 | def restore_from_save(t_vars, sess, opt): 95 | save_keys = tensors_key_in_file(opt.save_path) 96 | #print(save_keys.keys()) 97 | ss = set([var.name for var in t_vars])&set([s+":0" for s in save_keys.keys()]) 98 | cc = {var.name:var for var in t_vars} 99 | ss_right_shape = set([s for s in ss if cc[s].get_shape() == save_keys[s[:-2]]]) # only restore variables with correct shape 100 | 101 | if opt.reuse_discrimination: 102 | ss2 = set([var.name[2:] for var in t_vars])&set([s+":0" for s in 
save_keys.keys()]) 103 | cc2 = {var.name[2:][:-2]:var for var in t_vars if var.name[2:] in ss2 if var.get_shape() == save_keys[var.name[2:][:-2]]} 104 | for s_iter in ss_right_shape: 105 | cc2[s_iter[:-2]] = cc[s_iter] 106 | 107 | loader = tf.train.Saver(var_list=cc2) 108 | loader.restore(sess, opt.save_path) 109 | print("Loaded variables for discriminator:"+str(cc2.keys())) 110 | 111 | else: 112 | # for var in t_vars: 113 | # if var.name[:-2] in ss: 114 | # tf.assign(t_vars, save_keys[var.name[:-2]]) 115 | loader = tf.train.Saver(var_list= [var for var in t_vars if var.name in ss_right_shape]) 116 | loader.restore(sess, opt.save_path) 117 | print("Loading variables from '%s'." % opt.save_path) 118 | print("Loaded variables:"+str(ss_right_shape)) 119 | 120 | 121 | 122 | 123 | 124 | 125 | return loader 126 | 127 | 128 | _buckets = [(60,60)] 129 | 130 | def read_data(source_path, target_path, opt): 131 | """ 132 | From tensorflow tutorial translate.py 133 | Read data from source and target files and put into buckets. 134 | Args: 135 | source_path: path to the files with token-ids for the source language. 136 | target_path: path to the file with token-ids for the target language; 137 | it must be aligned with the source file: n-th line contains the desired 138 | output for n-th line from the source_path. 139 | max_size: maximum number of lines to read, all other will be ignored; 140 | if 0 or None, data files will be read completely (no limit). 141 | 142 | Returns: 143 | data_set: a list of length len(_buckets); data_set[n] contains a list of 144 | (source, target) pairs read from the provided data files that fit 145 | into the n-th bucket, i.e., such that len(source) < _buckets[n][0] and 146 | len(target) < _buckets[n][1]; source and target are lists of token-ids. 147 | """ 148 | data_set = [[] for _ in _buckets] 149 | with tf.gfile.GFile(source_path, mode="r") as source_file: 150 | with tf.gfile.GFile(target_path, mode="r") as target_file: 151 | source, target = source_file.readline(), target_file.readline() 152 | counter = 0 153 | while source and target and (not opt.max_train_data_size or counter < opt.max_train_data_size): 154 | counter += 1 155 | if counter % 100000 == 0: 156 | print(" reading data line %d" % counter) 157 | sys.stdout.flush() 158 | source_ids = [int(x) for x in source.split()] 159 | target_ids = [int(x) for x in target.split()] 160 | target_ids.append(data_utils.EOS_ID) 161 | for bucket_id, (source_size, target_size) in enumerate(_buckets): 162 | if opt.minlen = maxlen: 215 | # return None 216 | # else: 217 | # new_p.append([0]*pad + it + [0]*(maxlen-len(it)+pad)) 218 | # return np.array(new_p) 219 | # return [padding(pair) for pair in pair_x] 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | def tensors_key_in_file(file_name): 231 | """Return tensors key in a checkpoint file. 232 | Args: 233 | file_name: Name of the checkpoint file. 
234 | """ 235 | try: 236 | reader = pywrap_tensorflow.NewCheckpointReader(file_name) 237 | return reader.get_variable_to_shape_map() 238 | except Exception as e: # pylint: disable=broad-except 239 | print(str(e)) 240 | return None 241 | 242 | 243 | def get_minibatches_idx(n, minibatch_size, shuffle=False): 244 | idx_list = np.arange(n, dtype="int32") 245 | 246 | if shuffle: 247 | np.random.shuffle(idx_list) 248 | 249 | minibatches = [] 250 | minibatch_start = 0 251 | for i in range(n // minibatch_size): 252 | minibatches.append(idx_list[minibatch_start: 253 | minibatch_start + minibatch_size]) 254 | minibatch_start += minibatch_size 255 | 256 | # if (minibatch_start != n): 257 | # # Make a minibatch out of what is left 258 | # minibatches.append(idx_list[minibatch_start:]) 259 | 260 | return zip(range(len(minibatches)), minibatches) 261 | 262 | 263 | # def normalizing_L1(x, axis): 264 | # norm = tf.sqrt(tf.reduce_sum(tf.square(x), axis=axis, keep_dims=True)) 265 | # normalized = x / (norm) 266 | # return normalized 267 | 268 | def normalizing(x, axis): 269 | norm = tf.sqrt(tf.reduce_sum(tf.square(x), axis=axis, keep_dims=True)) 270 | normalized = x / (norm) 271 | return normalized 272 | 273 | def _p(pp, name): 274 | return '%s_%s' % (pp, name) 275 | 276 | def dropout(X, trng, p=0.): 277 | if p != 0: 278 | retain_prob = 1 - p 279 | X = X / retain_prob * trng.binomial(X.shape, p=retain_prob, dtype=theano.config.floatX) 280 | return X 281 | 282 | """ used for initialization of the parameters. """ 283 | 284 | def ortho_weight(ndim): 285 | W = np.random.randn(ndim, ndim) 286 | u, s, v = np.linalg.svd(W) 287 | return u.astype(config.floatX) 288 | 289 | def uniform_weight(nin,nout=None, scale=0.05): 290 | if nout == None: 291 | nout = nin 292 | W = np.random.uniform(low=-scale, high=scale, size=(nin, nout)) 293 | return W.astype(config.floatX) 294 | 295 | def normal_weight(nin,nout=None, scale=0.05): 296 | if nout == None: 297 | nout = nin 298 | W = np.random.randn(nin, nout) * scale 299 | return W.astype(config.floatX) 300 | 301 | def zero_bias(ndim): 302 | b = np.zeros((ndim,)) 303 | return b.astype(config.floatX) 304 | 305 | """auxiliary function for KDE""" 306 | def log_mean_exp(A,b,sigma): 307 | a=-0.5*((A-theano.tensor.tile(b,[A.shape[0],1]))**2).sum(1)/(sigma**2) 308 | max_=a.max() 309 | return max_+theano.tensor.log(theano.tensor.exp(a-theano.tensor.tile(max_,a.shape[0])).mean()) 310 | 311 | '''calculate KDE''' 312 | def cal_nkde(X,mu,sigma): 313 | s1,updates=theano.scan(lambda i,s: s+log_mean_exp(mu,X[i,:],sigma), sequences=[theano.tensor.arange(X.shape[0])],outputs_info=[np.asarray(0.,dtype="float32")]) 314 | E=s1[-1] 315 | Z=mu.shape[0]*theano.tensor.log(sigma*np.sqrt(np.pi*2)) 316 | return (Z-E)/mu.shape[0] 317 | 318 | 319 | """ BLEU score""" 320 | # def cal_BLEU(generated, reference): 321 | # #the maximum is bigram, so assign the weight into 2 half. 
322 | # BLEUscore = 0.0 323 | # for g in generated: 324 | # BLEUscore += nltk.translate.bleu_score.sentence_bleu(reference, g) 325 | # BLEUscore = BLEUscore/len(generated) 326 | # return BLEUscore 327 | 328 | def cal_ROUGE(generated, reference, is_corpus = False): 329 | # ref and sample are both dict 330 | # scorers = [ 331 | # (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]), 332 | # (Meteor(),"METEOR"), 333 | # (Rouge(), "ROUGE_L"), 334 | # (Cider(), "CIDEr") 335 | # ] 336 | # output rouge 1-4 and rouge L and rouge L from pycocoevaluate 337 | 338 | 339 | ROUGEscore = [0.0]*6 340 | for idx, g in enumerate(generated): 341 | score = [0.0]*6 342 | if is_corpus: 343 | for order in range(4): 344 | score[order] = rouge_n(g.split(), [x.split() for x in reference[0]], order+1, 0.5) 345 | score[4] = rouge_l(g.split(), [x.split() for x in reference[0]], 0.5) 346 | score[5], _ = Rouge().compute_score(reference, {0: [g]}) 347 | 348 | 349 | else: 350 | for order in range(4): 351 | score[order] = rouge_n(g.split(), [reference[0][idx].split()], order+1, 0.5) 352 | score[4] = rouge_l(g.split(), [reference[0][idx].split()], 0.5) 353 | score[5], _ = Rouge().compute_score({0: [reference[0][idx]]}, {0: [g]}) 354 | #pdb.set_trace() 355 | #print g, score 356 | ROUGEscore = [ r+score[idx] for idx,r in enumerate(ROUGEscore)] 357 | #BLEUscore += nltk.translate.bleu_score.sentence_bleu(reference, g, weight) 358 | ROUGEscore = [r/len(generated) for r in ROUGEscore] 359 | return ROUGEscore 360 | 361 | 362 | 363 | 364 | def cal_BLEU(generated, reference, is_corpus = False): 365 | #print 'in BLEU score calculation' 366 | #the maximum is bigram, so assign the weight into 2 half. 367 | BLEUscore = [0.0,0.0,0.0] 368 | for idx, g in enumerate(generated): 369 | if is_corpus: 370 | score, scores = Bleu(4).compute_score(reference, {0: [g]}) 371 | else: 372 | score, scores = Bleu(4).compute_score({0: [reference[0][idx]]} , {0: [g]}) 373 | #print g, score 374 | for i, s in zip([0,1,2],score[1:]): 375 | BLEUscore[i]+=s 376 | #BLEUscore += nltk.translate.bleu_score.sentence_bleu(reference, g, weight) 377 | BLEUscore[0] = BLEUscore[0]/len(generated) 378 | BLEUscore[1] = BLEUscore[1]/len(generated) 379 | BLEUscore[2] = BLEUscore[2]/len(generated) 380 | return BLEUscore 381 | 382 | def cal_BLEU_4(generated, reference, is_corpus = False): 383 | #print 'in BLEU score calculation' 384 | #the maximum is bigram, so assign the weight into 2 half. 
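    # cal_BLEU_4 scores one hypothesis at a time with pycocoevalcap's Bleu(4) and averages the
    # per-sentence BLEU-1..4 over the batch (the "bigram ... 2 half" note above appears to be
    # left over from the earlier nltk-based version commented out further up). Both arguments
    # are space-joined id strings, e.g. (illustrative):
    #   hyp = [prepare_for_bleu(s) for s in rec_sents]
    #   ref = {0: [prepare_for_bleu(s) for s in val_sents]}
    #   bleu1, bleu2, bleu3, bleu4 = cal_BLEU_4(hyp, ref)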
385 | BLEUscore = [0.0,0.0,0.0,0.0] 386 | for idx, g in enumerate(generated): 387 | if is_corpus: 388 | score, scores = Bleu(4).compute_score(reference, {0: [g]}) 389 | else: 390 | score, scores = Bleu(4).compute_score({0: [reference[0][idx]]} , {0: [g]}) 391 | #print g, score 392 | for i, s in zip([0,1,2,3],score): 393 | BLEUscore[i]+=s 394 | #BLEUscore += nltk.translate.bleu_score.sentence_bleu(reference, g, weight) 395 | BLEUscore[0] = BLEUscore[0]/len(generated) 396 | BLEUscore[1] = BLEUscore[1]/len(generated) 397 | BLEUscore[2] = BLEUscore[2]/len(generated) 398 | BLEUscore[3] = BLEUscore[3]/len(generated) 399 | return BLEUscore 400 | 401 | def prepare_for_bleu(sentence): 402 | sent=[x for x in sentence if x!=0] 403 | while len(sent)<4: 404 | sent.append(0) 405 | #sent = ' '.join([ixtoword[x] for x in sent]) 406 | sent = ' '.join([str(x) for x in sent]) 407 | return sent 408 | 409 | 410 | 411 | def _clip_gradients_seperate_norm(grads_and_vars, clip_gradients): 412 | """Clips gradients by global norm.""" 413 | gradients, variables = zip(*grads_and_vars) 414 | clipped_gradients = [clip_ops.clip_by_norm(grad, clip_gradients) for grad in gradients] 415 | return list(zip(clipped_gradients, variables)) 416 | -------------------------------------------------------------------------------- /char_correction.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Yizhe Zhang, Dinghan Shen, Guoyin Wang 4 | 5 | TextCNN 6 | """ 7 | 8 | import os 9 | import tensorflow as tf 10 | from tensorflow.contrib import learn 11 | from tensorflow.contrib import layers 12 | from tensorflow.contrib import framework 13 | from tensorflow.contrib.learn.python.learn import learn_runner 14 | from tensorflow.python.platform import tf_logging as logging 15 | import cPickle 16 | import numpy as np 17 | import os 18 | import scipy.io as sio 19 | from math import floor 20 | import pdb 21 | from model import * 22 | from utils import prepare_data_for_cnn, prepare_data_for_rnn, \ 23 | get_minibatches_idx, normalizing, restore_from_save, \ 24 | prepare_for_bleu, cal_BLEU, sent2idx, _clip_gradients_seperate_norm 25 | from denoise import * 26 | from error_rate import prepare_for_cer, cal_cer 27 | 28 | 29 | GPUID = 0 30 | os.environ['CUDA_VISIBLE_DEVICES'] = str(GPUID) 31 | profile = False 32 | 33 | logging.set_verbosity(logging.INFO) 34 | # Basic model parameters as external flags. 
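# NOTE: this script runs the same CNN encoder / deconvolutional decoder at the character level:
# Options.char = True, the noise setting 'sc' adds character-level substitution, and evaluation
# reports character error rate (cal_cer) in addition to BLEU. As in demo.py, tf.app.flags is
# initialized but no flags are defined; configuration lives entirely in the Options class below.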
35 | flags = tf.app.flags 36 | FLAGS = flags.FLAGS 37 | 38 | 39 | class Options(object): 40 | def __init__(self): 41 | # self.fix_emb = False 42 | self.reuse_w = False 43 | self.reuse_cnn = False 44 | self.reuse_discrimination = False # reuse cnn for discrimination 45 | self.restore = True 46 | self.tanh = False # activation fun for the top layer of cnn, otherwise relu 47 | self.model = 'cnn_deconv' #'cnn_deconv' # 'cnn_rnn', 'rnn_rnn' , default: cnn_deconv 48 | 49 | self.permutation = 0.3 50 | self.substitution = 'sc' # Deletion(d), Insertion(a), Substitution(s) and Permutation(p), c for char special 51 | 52 | self.W_emb = None 53 | self.cnn_W = None 54 | self.cnn_b = None 55 | self.maxlen = 221 56 | self.n_words = None 57 | self.filter_shape = 5 58 | self.filter_size = 300 59 | self.multiplier = 2 60 | self.lr = 1e-4 61 | 62 | self.layer = 3 63 | self.stride = [2,2] # for two layer cnn/deconv , use self.stride[0] 64 | self.batch_size = 32 65 | self.max_epochs = 100 66 | self.n_gan = 900 # self.filter_size * 3 67 | self.L = 50 68 | 69 | self.optimizer = 'Adam' #tf.train.AdamOptimizer(beta1=0.9) #'Adam' # 'Momentum' , 'RMSProp' 70 | self.clip_grad = None #100 # 20# 71 | self.attentive_emb = False 72 | self.decay_rate = 0.99 73 | self.relu_w = True 74 | 75 | self.save_path = "./save/" +str(self.n_gan) + "_dim_" + self.model + "_" + self.substitution + str(self.permutation) 76 | self.log_path = "./log" 77 | 78 | self.print_freq = 1 79 | self.valid_freq = 1 80 | 81 | # batch norm & dropout 82 | self.batch_norm = False 83 | self.cnn_layer_dropout = False 84 | self.dropout = False 85 | self.dropout_ratio = 0.5 86 | 87 | self.discrimination = False 88 | 89 | self.H_dis = 300 90 | 91 | self.sent_len = self.maxlen + 2*(self.filter_shape-1) 92 | self.sent_len2 = np.int32(floor((self.sent_len - self.filter_shape) 93 | / self.stride[0]) + 1) 94 | self.sent_len3 = np.int32(floor((self.sent_len2 - self.filter_shape) 95 | / self.stride[1]) + 1) 96 | 97 | # add char label 98 | self.char = True 99 | # dataset label 100 | self.data = 'yahoo' # option is three_small, three_char, imdb 101 | print('Use model %s' % self.model) 102 | print('Use %d conv/deconv layers' % self.layer) 103 | 104 | def __iter__(self): 105 | for attr, value in self.__dict__.iteritems(): 106 | yield attr, value 107 | 108 | def auto_encoder(x, x_org, is_train, opt, opt_t=None): 109 | if not opt_t: 110 | opt_t = opt 111 | x_emb, W_norm = embedding(x, opt) # batch L emb 112 | x_emb = tf.expand_dims(x_emb, 3) # batch L emb 1 113 | res = {} 114 | # cnn encoder 115 | 116 | H_enc, res = conv_encoder(x_emb, is_train, opt, res) 117 | 118 | H_dec = H_enc 119 | 120 | if opt.model == 'rnn_rnn': 121 | loss, rec_sent_1, _ = seq2seq(x, x_org, opt) 122 | _, rec_sent_2, _ = seq2seq(x, x_org, opt, feed_previous=True, is_reuse=True) 123 | 124 | res['rec_sents_feed_y'] = rec_sent_1 125 | res['rec_sents'] = rec_sent_2 126 | 127 | 128 | elif opt.model == 'cnn_rnn': 129 | # lstm decoder 130 | H_dec2 = tf.identity(H_dec) 131 | loss, rec_sent_1, _ = lstm_decoder(H_dec, x_org, opt) # 132 | 133 | _, rec_sent_2, _ = lstm_decoder(H_dec, x_org, opt, feed_previous=True, is_reuse=True) 134 | 135 | res['rec_sents_feed_y'] = rec_sent_1 136 | res['rec_sents'] = rec_sent_2 137 | 138 | else: 139 | 140 | # deconv decoder 141 | loss, res = deconv_decoder(H_dec, x_org, W_norm, is_train, opt_t, res) 142 | 143 | tf.summary.scalar('loss', loss) 144 | summaries = [ 145 | "learning_rate", 146 | "loss", 147 | "gradients", 148 | "gradient_norm", 149 | ] 150 | 151 | global_step = 
tf.Variable(0, trainable=False) 152 | 153 | 154 | train_op = layers.optimize_loss( 155 | loss, 156 | global_step=global_step, 157 | #aggregation_method=tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N, 158 | #framework.get_global_step(), 159 | optimizer=opt.optimizer, 160 | clip_gradients=(lambda grad: _clip_gradients_seperate_norm(grad, opt.clip_grad)) if opt.clip_grad else None, 161 | learning_rate_decay_fn=lambda lr,g: tf.train.exponential_decay(learning_rate=lr, global_step=g, decay_rate=opt.decay_rate, decay_steps=3000), 162 | learning_rate=opt.lr, 163 | summaries=summaries 164 | ) 165 | return res, loss, train_op 166 | 167 | 168 | def run_model(opt, train, val, test, wordtoix, ixtoword): 169 | 170 | with tf.device('/gpu:1'): 171 | x_ = tf.placeholder(tf.int32, shape=[opt.batch_size, opt.sent_len]) 172 | x_org_ = tf.placeholder(tf.int32, shape=[opt.batch_size, opt.sent_len]) 173 | is_train_ = tf.placeholder(tf.bool, name='is_train_') 174 | res_, loss_, train_op = auto_encoder(x_, x_org_, is_train_, opt) 175 | merged = tf.summary.merge_all() 176 | summary_ext = tf.Summary() 177 | 178 | uidx = 0 179 | config = tf.ConfigProto(log_device_placement=False, 180 | allow_soft_placement=True, 181 | graph_options=tf.GraphOptions(build_cost_model=1)) 182 | config.gpu_options.allow_growth = True 183 | np.set_printoptions(precision=3) 184 | np.set_printoptions(threshold=np.inf) 185 | saver = tf.train.Saver() 186 | 187 | run_metadata = tf.RunMetadata() 188 | 189 | with tf.Session(config=config) as sess: 190 | train_writer = tf.summary.FileWriter(opt.log_path + '/train', sess.graph) 191 | test_writer = tf.summary.FileWriter(opt.log_path + '/test', sess.graph) 192 | sess.run(tf.global_variables_initializer()) 193 | if opt.restore: 194 | try: 195 | t_vars = tf.trainable_variables() 196 | loader = restore_from_save(t_vars, sess, opt) 197 | except Exception as e: 198 | print(e) 199 | print("No saving session, using random initialization") 200 | sess.run(tf.global_variables_initializer()) 201 | 202 | for epoch in range(opt.max_epochs): 203 | print("Starting epoch %d" % epoch) 204 | kf = get_minibatches_idx(len(train), opt.batch_size, shuffle=True) 205 | for _, train_index in kf: 206 | uidx += 1 207 | sents = [train[t] for t in train_index] 208 | 209 | sents_permutated = add_noise(sents, opt) 210 | 211 | if opt.model != 'rnn_rnn' and opt.model != 'cnn_rnn': 212 | x_batch_org = prepare_data_for_cnn(sents, opt) # Batch L 213 | else: 214 | x_batch_org = prepare_data_for_rnn(sents, opt) # Batch L 215 | 216 | if opt.model != 'rnn_rnn': 217 | x_batch = prepare_data_for_cnn(sents_permutated, opt) # Batch L 218 | else: 219 | x_batch = prepare_data_for_rnn(sents_permutated, opt, is_add_GO = False) # Batch L 220 | # x_print = sess.run([x_emb],feed_dict={x_: x_train} ) 221 | # print x_print 222 | 223 | 224 | # res = sess.run(res_, feed_dict={x_: x_batch, x_org_:x_batch_org}) 225 | # pdb.set_trace() 226 | 227 | # 228 | if profile: 229 | _, loss = sess.run([train_op, loss_], feed_dict={x_: x_batch, x_org_: x_batch_org, is_train_:1},options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE),run_metadata=run_metadata) 230 | else: 231 | _, loss = sess.run([train_op, loss_], feed_dict={x_: x_batch, x_org_: x_batch_org, is_train_:1}) 232 | 233 | #pdb.set_trace() 234 | 235 | if uidx % opt.valid_freq == 0: 236 | is_train = None 237 | valid_index = np.random.choice(len(val), opt.batch_size) 238 | val_sents = [val[t] for t in valid_index] 239 | 240 | val_sents_permutated = add_noise(val_sents, opt) 241 | 242 | if opt.model != 
'rnn_rnn' and opt.model != 'cnn_rnn': 243 | x_val_batch_org = prepare_data_for_cnn(val_sents, opt) 244 | else: 245 | x_val_batch_org = prepare_data_for_rnn(val_sents, opt) 246 | 247 | if opt.model != 'rnn_rnn': 248 | x_val_batch = prepare_data_for_cnn(val_sents_permutated, opt) 249 | else: 250 | x_val_batch = prepare_data_for_rnn(val_sents_permutated, opt, is_add_GO=False) 251 | 252 | loss_val = sess.run(loss_, feed_dict={x_: x_val_batch, x_org_: x_val_batch_org, is_train_:is_train }) 253 | print("Validation loss %f " % (loss_val)) 254 | res = sess.run(res_, feed_dict={x_: x_val_batch, x_org_: x_val_batch_org, is_train_:is_train }) 255 | if opt.discrimination: 256 | print ("Real Prob %f Fake Prob %f"%(res['prob_r'], res['prob_f'])) 257 | 258 | if opt.char: 259 | print "Val Orig :" + "".join([ixtoword[x] for x in val_sents[0] if x != 0]) 260 | print "Val Perm :" + "".join([ixtoword[x] for x in val_sents_permutated[0] if x != 0]) 261 | print "Val Recon:" + "".join([ixtoword[x] for x in res['rec_sents'][0] if x != 0]) 262 | # print "Val Recon one hot:" + "".join([ixtoword[x] for x in res['rec_sents_one_hot'][0] if x != 0]) 263 | else: 264 | print "Val Orig :" + " ".join([ixtoword[x] for x in val_sents[0] if x != 0]) 265 | print "Val Perm :" + " ".join([ixtoword[x] for x in val_sents_permutated[0] if x != 0]) 266 | print "Val Recon:" + " ".join([ixtoword[x] for x in res['rec_sents'][0] if x != 0]) 267 | 268 | 269 | val_set = [prepare_for_bleu(s) for s in val_sents] 270 | [bleu2s,bleu3s,bleu4s] = cal_BLEU([prepare_for_bleu(s) for s in res['rec_sents']], {0: val_set}) 271 | print 'Val BLEU (2,3,4): ' + ' '.join([str(round(it, 3)) for it in (bleu2s,bleu3s,bleu4s)]) 272 | 273 | 274 | val_set_char = [prepare_for_cer(s, ixtoword) for s in val_sents] 275 | cer = cal_cer([prepare_for_cer(s, ixtoword) for s in res['rec_sents']], val_set_char) 276 | print 'Val CER: ' + str(round(cer, 3)) 277 | # summary_ext.Value(tag='CER', simple_value=cer) 278 | summary_ext = tf.Summary(value=[tf.Summary.Value(tag='CER', simple_value=cer)]) 279 | # tf.summary.scalar('CER', cer) 280 | 281 | #if opt.model != 'rnn_rnn' and opt.model != 'cnn_rnn': 282 | #print "Gen Probs:" + " ".join([str(np.round(res['gen_p'][i], 1)) for i in range(len(res['rec_sents'][0])) if res['rec_sents'][0][i] != 0]) 283 | summary = sess.run(merged, feed_dict={x_: x_val_batch, x_org_: x_val_batch_org, is_train_:is_train }) 284 | test_writer.add_summary(summary, uidx) 285 | test_writer.add_summary(summary_ext, uidx) 286 | is_train = True 287 | 288 | 289 | if uidx%opt.print_freq == 0: 290 | print("Iteration %d: loss %f " %(uidx, loss)) 291 | res = sess.run(res_, feed_dict={x_: x_batch, x_org_: x_batch_org, is_train_:1}) 292 | 293 | # if 1 in res['rec_sents'][0] or 1 in sents[0]: 294 | # pdb.set_trace() 295 | if opt.char: 296 | print "Original :" + "".join([ixtoword[x] for x in sents[0] if x != 0]) 297 | print "Permutated :" + "".join([ixtoword[x] for x in sents_permutated[0] if x != 0]) 298 | if opt.model == 'rnn_rnn' or opt.model == 'cnn_rnn': 299 | print "Reconstructed:" + " ".join([ixtoword[x] for x in res['rec_sents_feed_y'][0] if x != 0]) 300 | print "Reconstructed:" + "".join([ixtoword[x] for x in res['rec_sents'][0] if x != 0]) 301 | 302 | 303 | else: 304 | print "Original :" + " ".join([ixtoword[x] for x in sents[0] if x != 0]) 305 | print "Permutated :" + " ".join([ixtoword[x] for x in sents_permutated[0] if x != 0]) 306 | if opt.model == 'rnn_rnn' or opt.model == 'cnn_rnn': 307 | print "Reconstructed:" + " ".join([ixtoword[x] for x in 
res['rec_sents_feed_y'][0] if x != 0]) 308 | print "Reconstructed:" + " ".join([ixtoword[x] for x in res['rec_sents'][0] if x != 0]) 309 | 310 | 311 | summary = sess.run(merged, feed_dict={x_: x_batch, x_org_: x_batch_org, is_train_:1}) 312 | train_writer.add_summary(summary, uidx) 313 | # print res['x_rec'][0][0] 314 | # print res['x_emb'][0][0] 315 | if profile: 316 | tf.contrib.tfprof.model_analyzer.print_model_analysis( 317 | tf.get_default_graph(), 318 | run_meta=run_metadata, 319 | tfprof_options=tf.contrib.tfprof.model_analyzer.PRINT_ALL_TIMING_MEMORY) 320 | 321 | saver.save(sess, opt.save_path) 322 | 323 | 324 | 325 | def main(): 326 | 327 | 328 | opt = Options() 329 | if opt.char: 330 | opt.n_words = 35 331 | opt.embed_size = 35 332 | opt.fix_emb = False 333 | opt.filter_size= 300 334 | 335 | if opt.data == 'three_char': 336 | loadpath = './data/three_corpus_correct_large_char.p' 337 | elif opt.data == 'yahoo': 338 | loadpath = './data/yahoo_char.p' 339 | 340 | # loadpath = "./data/three_corpus_corrected_large.p" 341 | x = cPickle.load(open(loadpath,"rb")) 342 | train, val, test = x[0], x[1], x[2] 343 | train_text, val_text, test_text = x[3], x[4], x[5] 344 | train_lab, val_lab, test_lab = x[6], x[7], x[8] 345 | # wordtoix, ixtoword = x[9], x[10] 346 | if opt.char: 347 | wordtoix, ixtoword, alphabet = x[9], x[10], x[11] 348 | else: 349 | wordtoix, ixtoword = x[9], x[10] 350 | 351 | 352 | # opt = Options() 353 | if not opt.char: 354 | opt.n_words = len(ixtoword) + 1 355 | ixtoword[opt.n_words-1] = 'GO_' 356 | print dict(opt) 357 | print('Total words: %d' % opt.n_words) 358 | 359 | 360 | run_model(opt, train, val, test, wordtoix, ixtoword) 361 | 362 | 363 | 364 | if __name__ == '__main__': 365 | main() 366 | -------------------------------------------------------------------------------- /auto_encoding_cnn_denoise.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Yizhe Zhang 4 | 5 | TextCNN 6 | """ 7 | ## 152.3.214.203/6006 8 | 9 | import os 10 | GPUID = 0 11 | os.environ['CUDA_VISIBLE_DEVICES'] = str(GPUID) 12 | 13 | import tensorflow as tf 14 | from tensorflow.contrib import learn 15 | from tensorflow.contrib import layers 16 | #from tensorflow.contrib import metrics 17 | #from tensorflow.contrib.learn import monitors 18 | from tensorflow.contrib import framework 19 | from tensorflow.contrib.learn.python.learn import learn_runner 20 | from tensorflow.python.platform import tf_logging as logging 21 | #from tensorflow.contrib.learn.python.learn.metric_spec import MetricSpec 22 | import cPickle 23 | import numpy as np 24 | import os 25 | import scipy.io as sio 26 | from math import floor 27 | import pdb 28 | 29 | from model import * 30 | from utils import prepare_data_for_cnn, prepare_data_for_rnn, get_minibatches_idx, normalizing, restore_from_save, \ 31 | prepare_for_bleu, cal_BLEU, sent2idx, _clip_gradients_seperate_norm 32 | from denoise import * 33 | 34 | profile = False 35 | #import tempfile 36 | #from tensorflow.examples.tutorials.mnist import input_data 37 | 38 | logging.set_verbosity(logging.INFO) 39 | #tf.logging.verbosity(1) 40 | # Basic model parameters as external flags. 
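# NOTE: this is the word-level denoising auto-encoder variant. Options.model selects the
# architecture: 'cnn_deconv' (CNN encoder + deconvolutional decoder), 'cnn_rnn' (CNN encoder +
# LSTM decoder) or 'rnn_rnn' (a seq2seq baseline); the default set below is 'rnn_rnn'. As in the
# other training scripts, no tf.app.flags are defined and all hyper-parameters live in Options.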
41 | flags = tf.app.flags 42 | FLAGS = flags.FLAGS 43 | #flags.DEFINE_string('train_dir', 'data', 'Directory to put the training data.') 44 | 45 | 46 | 47 | 48 | 49 | class Options(object): 50 | def __init__(self): 51 | self.fix_emb = False 52 | self.reuse_w = False 53 | self.reuse_cnn = False 54 | self.reuse_discrimination = False # reuse cnn for discrimination 55 | self.restore = True 56 | self.tanh = False # activation fun for the top layer of cnn, otherwise relu 57 | self.model = 'rnn_rnn' #'cnn_deconv' # 'cnn_rnn', 'rnn_rnn' , default: cnn_deconv 58 | 59 | self.permutation = 0 60 | self.substitution = 's' # Deletion(d), Insertion(a), Substitution(s) and Permutation(p) 61 | 62 | self.W_emb = None 63 | self.cnn_W = None 64 | self.cnn_b = None 65 | self.maxlen = 61 66 | self.n_words = None 67 | self.filter_shape = 5 68 | self.filter_size = 300 69 | self.multiplier = 2 70 | self.embed_size = 300 71 | self.lr = 1e-4 72 | 73 | self.layer = 3 74 | self.stride = [2, 2, 2] # for two layer cnn/deconv , use self.stride[0] 75 | self.batch_size = 32 76 | self.max_epochs = 100 77 | self.n_gan = 100 # self.filter_size * 3 78 | self.L = 100 79 | 80 | self.optimizer = 'Adam' #tf.train.AdamOptimizer(beta1=0.9) #'Adam' # 'Momentum' , 'RMSProp' 81 | self.clip_grad = None #None #100 # 20# 82 | self.attentive_emb = False 83 | self.decay_rate = 0.99 84 | self.relu_w = False 85 | 86 | self.save_path = "./save/" +str(self.n_gan) + "_dim_" + self.model + "_" + self.substitution + str(self.permutation) 87 | self.log_path = "./log" 88 | self.print_freq = 1000 89 | self.valid_freq = 1000 90 | 91 | # batch norm & dropout 92 | self.batch_norm = False 93 | self.dropout = False 94 | self.dropout_ratio = 0.5 95 | 96 | self.discrimination = False 97 | self.H_dis = 300 98 | 99 | self.sent_len = self.maxlen + 2*(self.filter_shape-1) 100 | self.sent_len2 = np.int32(floor((self.sent_len - self.filter_shape)/self.stride[0]) + 1) 101 | self.sent_len3 = np.int32(floor((self.sent_len2 - self.filter_shape)/self.stride[1]) + 1) 102 | self.sent_len4 = np.int32(floor((self.sent_len3 - self.filter_shape)/self.stride[2]) + 1) 103 | print ('Use model %s' % self.model) 104 | print ('Use %d conv/deconv layers' % self.layer) 105 | 106 | def __iter__(self): 107 | for attr, value in self.__dict__.iteritems(): 108 | yield attr, value 109 | 110 | def auto_encoder(x, x_org, is_train, opt, opt_t=None): 111 | # print x.get_shape() # batch L 112 | if not opt_t: opt_t = opt 113 | x_emb, W_norm = embedding(x, opt) # batch L emb 114 | x_emb = tf.expand_dims(x_emb, 3) # batch L emb 1 115 | 116 | res = {} 117 | #res['W'] = W_norm 118 | # cnn encoder 119 | H_enc, res = conv_encoder(x_emb, is_train, opt, res) 120 | 121 | # H_dec = layers.relu(Y4, 200, biases_initializer=biasInit) 122 | H_dec = H_enc 123 | # print x_rec.get_shape() 124 | if opt.model == 'rnn_rnn': 125 | loss, rec_sent_1, _ = seq2seq(x, x_org, opt) 126 | _, rec_sent_2, _ = seq2seq(x, x_org, opt, feed_previous=True, is_reuse=True) 127 | #res['logits'] = logits 128 | res['rec_sents_feed_y'] = rec_sent_1 129 | res['rec_sents'] = rec_sent_2 130 | 131 | 132 | elif opt.model == 'cnn_rnn': 133 | # lstm decoder 134 | H_dec2 = tf.identity(H_dec) 135 | if opt.rnn_share_emb: 136 | loss, rec_sent_1, _ = lstm_decoder_embedding(H_dec2, x_org, W_norm, opt_t) # 137 | _, rec_sent_2, _ = lstm_decoder_embedding(H_dec2, x_org, W_norm, opt_t, feed_previous=True, is_reuse=True) 138 | else: 139 | loss, rec_sent_1, _ = lstm_decoder(H_dec2, x_org, opt_t) # 140 | _, rec_sent_2, _ = lstm_decoder(H_dec2, x_org, 
opt_t, feed_previous=True, is_reuse=True) 141 | 142 | 143 | res['rec_sents_feed_y'] = rec_sent_1 144 | res['rec_sents'] = rec_sent_2 145 | # res['H1'],res['H2'],res['o1'],res['o2'] = H1, H2, o1, o2 146 | 147 | else: 148 | 149 | # deconv decoder 150 | loss, res = deconv_decoder(H_dec, x_org, W_norm, is_train, opt_t, res) 151 | 152 | # *tf.cast(tf.not_equal(x_temp,0), tf.float32) 153 | tf.summary.scalar('loss', loss) 154 | summaries = [ 155 | "learning_rate", 156 | "loss", 157 | # "gradients", 158 | # "gradient_norm", 159 | ] 160 | global_step = tf.Variable(0, trainable=False) 161 | 162 | 163 | train_op = layers.optimize_loss( 164 | loss, 165 | global_step = global_step, 166 | #aggregation_method=tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N, 167 | #framework.get_global_step(), 168 | optimizer=opt.optimizer, 169 | clip_gradients=(lambda grad: _clip_gradients_seperate_norm(grad, opt.clip_grad)) if opt.clip_grad else None, 170 | learning_rate_decay_fn=lambda lr,g: tf.train.exponential_decay(learning_rate=lr, global_step = g, decay_rate=opt.decay_rate, decay_steps=3000), 171 | learning_rate=opt.lr, 172 | summaries = summaries 173 | ) 174 | 175 | # optimizer = tf.train.AdamOptimizer(learning_rate=opt.lr) # Or another optimization algorithm. 176 | # train_op = optimizer.minimize( 177 | # loss, 178 | # aggregation_method=tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N) 179 | 180 | 181 | return res, loss, train_op 182 | 183 | 184 | def run_model(opt, train, val, test, wordtoix, ixtoword): 185 | 186 | 187 | try: 188 | params = np.load('./param_g.npz') 189 | if params['Wemb'].shape == (opt.n_words, opt.embed_size): 190 | print('Use saved embedding.') 191 | opt.W_emb = params['Wemb'] 192 | else: 193 | print('Emb Dimension mismatch: param_g.npz:'+ str(params['Wemb'].shape) + ' opt: ' + str((opt.n_words, opt.embed_size))) 194 | opt.fix_emb = False 195 | except IOError: 196 | print('No embedding file found.') 197 | opt.fix_emb = False 198 | 199 | with tf.device('/gpu:1'): 200 | x_ = tf.placeholder(tf.int32, shape=[opt.batch_size, opt.sent_len]) 201 | x_org_ = tf.placeholder(tf.int32, shape=[opt.batch_size, opt.sent_len]) 202 | is_train_ = tf.placeholder(tf.bool, name='is_train_') 203 | res_, loss_, train_op = auto_encoder(x_, x_org_, is_train_, opt) 204 | merged = tf.summary.merge_all() 205 | # opt.is_train = False 206 | # res_val_, loss_val_, _ = auto_encoder(x_, x_org_, opt) 207 | # merged_val = tf.summary.merge_all() 208 | 209 | #tensorboard --logdir=run1:/tmp/tensorflow/ --port 6006 210 | #writer = tf.train.SummaryWriter(opt.log_path, graph=tf.get_default_graph()) 211 | 212 | 213 | uidx = 0 214 | config = tf.ConfigProto(log_device_placement = False, allow_soft_placement=True, graph_options=tf.GraphOptions(build_cost_model=1)) 215 | #config = tf.ConfigProto(device_count={'GPU':0}) 216 | config.gpu_options.allow_growth = True 217 | np.set_printoptions(precision=3) 218 | np.set_printoptions(threshold=np.inf) 219 | saver = tf.train.Saver() 220 | 221 | 222 | 223 | run_metadata = tf.RunMetadata() 224 | 225 | 226 | with tf.Session(config = config) as sess: 227 | train_writer = tf.summary.FileWriter(opt.log_path + '/train', sess.graph) 228 | test_writer = tf.summary.FileWriter(opt.log_path + '/test', sess.graph) 229 | sess.run(tf.global_variables_initializer()) 230 | if opt.restore: 231 | try: 232 | #pdb.set_trace() 233 | 234 | t_vars = tf.trainable_variables() 235 | #print([var.name[:-2] for var in t_vars]) 236 | loader = restore_from_save(t_vars, sess, opt) 237 | 238 | 239 | except Exception as e: 240 | 
print(e) 241 | print("No saving session, using random initialization") 242 | sess.run(tf.global_variables_initializer()) 243 | 244 | for epoch in range(opt.max_epochs): 245 | print("Starting epoch %d" % epoch) 246 | # if epoch >= 10: 247 | # print("Relax embedding ") 248 | # opt.fix_emb = False 249 | # opt.batch_size = 2 250 | kf = get_minibatches_idx(len(train), opt.batch_size, shuffle=True) 251 | for _, train_index in kf: 252 | uidx += 1 253 | sents = [train[t] for t in train_index] 254 | 255 | sents_permutated = add_noise(sents, opt) 256 | 257 | #sents[0] = np.random.permutation(sents[0]) 258 | 259 | if opt.model != 'rnn_rnn' and opt.model != 'cnn_rnn': 260 | x_batch_org = prepare_data_for_cnn(sents, opt) # Batch L 261 | else: 262 | x_batch_org = prepare_data_for_rnn(sents, opt) # Batch L 263 | 264 | if opt.model != 'rnn_rnn': 265 | x_batch = prepare_data_for_cnn(sents_permutated, opt) # Batch L 266 | else: 267 | x_batch = prepare_data_for_rnn(sents_permutated, opt, is_add_GO = False) # Batch L 268 | # x_print = sess.run([x_emb],feed_dict={x_: x_train} ) 269 | # print x_print 270 | 271 | 272 | # res = sess.run(res_, feed_dict={x_: x_batch, x_org_:x_batch_org}) 273 | # pdb.set_trace() 274 | 275 | # 276 | if profile: 277 | _, loss = sess.run([train_op, loss_], feed_dict={x_: x_batch, x_org_: x_batch_org, is_train_:1},options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE),run_metadata=run_metadata) 278 | else: 279 | _, loss = sess.run([train_op, loss_], feed_dict={x_: x_batch, x_org_: x_batch_org, is_train_:1}) 280 | 281 | #pdb.set_trace() 282 | 283 | if uidx % opt.valid_freq == 0: 284 | is_train = None 285 | valid_index = np.random.choice(len(val), opt.batch_size) 286 | val_sents = [val[t] for t in valid_index] 287 | 288 | val_sents_permutated = add_noise(val_sents, opt) 289 | 290 | 291 | if opt.model != 'rnn_rnn' and opt.model != 'cnn_rnn': 292 | x_val_batch_org = prepare_data_for_cnn(val_sents, opt) 293 | else: 294 | x_val_batch_org = prepare_data_for_rnn(val_sents, opt) 295 | 296 | if opt.model != 'rnn_rnn': 297 | x_val_batch = prepare_data_for_cnn(val_sents_permutated, opt) 298 | else: 299 | x_val_batch = prepare_data_for_rnn(val_sents_permutated, opt, is_add_GO=False) 300 | 301 | loss_val = sess.run(loss_, feed_dict={x_: x_val_batch, x_org_: x_val_batch_org, is_train_:is_train }) 302 | print("Validation loss %f " % (loss_val)) 303 | res = sess.run(res_, feed_dict={x_: x_val_batch, x_org_: x_val_batch_org, is_train_:is_train }) 304 | if opt.discrimination: 305 | print ("Real Prob %f Fake Prob %f" % (res['prob_r'], res['prob_f'])) 306 | print "Val Orig :" + " ".join([ixtoword[x] for x in val_sents[0] if x != 0]) 307 | #print "Val Perm :" + " ".join([ixtoword[x] for x in val_sents_permutated[0] if x != 0]) 308 | print "Val Recon:" + " ".join([ixtoword[x] for x in res['rec_sents'][0] if x != 0]) 309 | 310 | val_set = [prepare_for_bleu(s) for s in val_sents] 311 | [bleu2s, bleu3s, bleu4s] = cal_BLEU([prepare_for_bleu(s) for s in res['rec_sents']], {0: val_set}) 312 | print 'Val BLEU (2,3,4): ' + ' '.join([str(round(it, 3)) for it in (bleu2s, bleu3s, bleu4s)]) 313 | 314 | # if opt.model != 'rnn_rnn' and opt.model != 'cnn_rnn': 315 | # print "Org Probs:" + " ".join( 316 | # [ixtoword[x_val_batch_org[0][i]] + '(' + str(np.round(res['all_p'][i], 1)) + ')' for i in 317 | # range(len(res['rec_sents'][0])) if res['rec_sents'][0][i] != 0]) 318 | # print "Gen Probs:" + " ".join( 319 | # [ixtoword[res['rec_sents'][0][i]] + '(' + str(np.round(res['gen_p'][i], 1)) + ')' for i in 320 | # 
range(len(res['rec_sents'][0])) if res['rec_sents'][0][i] != 0]) 321 | 322 | summary = sess.run(merged, feed_dict={x_: x_val_batch, x_org_: x_val_batch_org, is_train_:is_train }) 323 | test_writer.add_summary(summary, uidx) 324 | is_train = True 325 | 326 | def test_input(text): 327 | x_input = sent2idx(text, wordtoix, opt) 328 | res = sess.run(res_, feed_dict={x_: x_input, x_org_: x_batch_org}) 329 | print "Reconstructed:" + " ".join([ixtoword[x] for x in res['rec_sents'][0] if x != 0]) 330 | 331 | if uidx % opt.print_freq == 0: 332 | #pdb.set_trace() 333 | print("Iteration %d: loss %f " % (uidx, loss)) 334 | res = sess.run(res_, feed_dict={x_: x_batch, x_org_: x_batch_org, is_train_:1}) 335 | print "Original :" + " ".join([ixtoword[x] for x in sents[0] if x != 0]) 336 | #print "Permutated :" + " ".join([ixtoword[x] for x in sents_permutated[0] if x != 0]) 337 | if opt.model == 'rnn_rnn' or opt.model == 'cnn_rnn': 338 | print "Reconstructed:" + " ".join([ixtoword[x] for x in res['rec_sents_feed_y'][0] if x != 0]) 339 | print "Reconstructed:" + " ".join([ixtoword[x] for x in res['rec_sents'][0] if x != 0]) 340 | 341 | # print "Probs:" + " ".join([ixtoword[res['rec_sents'][0][i]] +'(' +str(np.round(res['all_p'][i],2))+')' for i in range(len(res['rec_sents'][0])) if res['rec_sents'][0][i] != 0]) 342 | 343 | summary = sess.run(merged, feed_dict={x_: x_batch, x_org_: x_batch_org, is_train_:1}) 344 | train_writer.add_summary(summary, uidx) 345 | # print res['x_rec'][0][0] 346 | # print res['x_emb'][0][0] 347 | if profile: 348 | tf.contrib.tfprof.model_analyzer.print_model_analysis( 349 | tf.get_default_graph(), 350 | run_meta=run_metadata, 351 | tfprof_options=tf.contrib.tfprof.model_analyzer.PRINT_ALL_TIMING_MEMORY) 352 | 353 | saver.save(sess, opt.save_path, global_step=epoch) 354 | 355 | 356 | 357 | def main(): 358 | #global n_words 359 | # Prepare training and testing data 360 | #loadpath = "./data/three_corpus_small.p" 361 | loadpath = "./data/three_corpus_corrected_large.p" 362 | x = cPickle.load(open(loadpath,"rb")) 363 | train, val, test = x[0], x[1], x[2] 364 | train_text, val_text, test_text = x[3], x[4], x[5] 365 | train_lab, val_lab, test_lab = x[6], x[7], x[8] 366 | wordtoix, ixtoword = x[9], x[10] 367 | 368 | opt = Options() 369 | opt.n_words = len(ixtoword) + 1 370 | ixtoword[opt.n_words-1] = 'GO_' 371 | print dict(opt) 372 | print('Total words: %d' % opt.n_words) 373 | 374 | 375 | run_model(opt, train, val, test, wordtoix, ixtoword) 376 | 377 | # model_fn = auto_encoder 378 | # ae = learn.Estimator(model_fn=model_fn) 379 | # ae.fit(train, opt , steps=opt.max_epochs) 380 | 381 | 382 | # 383 | # def main(argv=None): 384 | # learn_runner.run(experiment_fn, FLAGS.train_dir) 385 | 386 | if __name__ == '__main__': 387 | main() 388 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | """ 2 | Yizhe Zhang 3 | 4 | Main model file 5 | """ 6 | import tensorflow as tf 7 | from tensorflow.contrib import learn 8 | from tensorflow.contrib import layers 9 | from tensorflow.contrib import metrics 10 | #from tensorflow.contrib.learn import monitors 11 | from tensorflow.contrib import framework 12 | from tensorflow.contrib.learn.python.learn import learn_runner 13 | from tensorflow.python.platform import tf_logging as logging 14 | from tensorflow.contrib.learn.python.learn.metric_spec import MetricSpec 15 | from tensorflow.contrib.legacy_seq2seq import rnn_decoder, 
embedding_rnn_decoder, sequence_loss, embedding_rnn_seq2seq, embedding_tied_rnn_seq2seq 16 | import pdb 17 | import copy 18 | from utils import normalizing, lrelu 19 | from tensorflow.python.framework import ops 20 | from tensorflow.python.ops import nn_ops, math_ops, embedding_ops, variable_scope 21 | 22 | 23 | 24 | def embedding(features, opt, prefix = '', is_reuse = None): 25 | """Customized function to transform batched x into embeddings.""" 26 | # Convert indexes of words into embeddings. 27 | 28 | 29 | 30 | 31 | # b = tf.get_variable('b', [opt.embed_size], initializer = tf,random_uniform_initializer(-0.01, 0.01)) 32 | with tf.variable_scope(prefix+'embed', reuse=is_reuse): 33 | if opt.fix_emb: 34 | assert(hasattr(opt,'emb')) 35 | assert(np.shape(np.array(opt.emb))==(opt.n_words, opt.embed_size)) 36 | W = tf.get_variable('W', [opt.n_words, opt.embed_size], weights_initializer = opt.emb, is_trainable = False) 37 | else: 38 | weightInit = tf.random_uniform_initializer(-0.001, 0.001) 39 | W = tf.get_variable('W', [opt.n_words, opt.embed_size], initializer = weightInit) 40 | # tf.stop_gradient(W) 41 | if hasattr(opt, 'relu_w') and opt.relu_w: 42 | W = tf.nn.relu(W) 43 | 44 | W_norm = normalizing(W, 1) 45 | word_vectors = tf.nn.embedding_lookup(W_norm, features) 46 | 47 | 48 | return word_vectors, W_norm 49 | 50 | 51 | def embedding_only(opt, prefix = '', is_reuse = None): 52 | """Customized function to transform batched x into embeddings.""" 53 | # Convert indexes of words into embeddings. 54 | with tf.variable_scope(prefix+'embed', reuse=is_reuse): 55 | if opt.fix_emb: 56 | assert(hasattr(opt,'emb')) 57 | assert(np.shape(np.array(opt.emb))==(opt.n_words, opt.embed_size)) 58 | W = tf.get_variable('W', [opt.n_words, opt.embed_size], weights_initializer = opt.emb, is_trainable = False) 59 | else: 60 | weightInit = tf.random_uniform_initializer(-0.001, 0.001) 61 | W = tf.get_variable('W', [opt.n_words, opt.embed_size], initializer = weightInit) 62 | # b = tf.get_variable('b', [opt.embed_size], initializer = tf,random_uniform_initializer(-0.01, 0.01)) 63 | if hasattr(opt, 'relu_w') and opt.relu_w: 64 | W = tf.nn.relu(W) 65 | 66 | W_norm = normalizing(W, 1) 67 | 68 | return W_norm 69 | 70 | def classifier_2layer(H, opt, dropout = 1, prefix = '', num_outputs=1, is_reuse= None): 71 | # last layer must be linear 72 | H = tf.squeeze(H) 73 | biasInit = tf.constant_initializer(0.001, dtype=tf.float32) 74 | H_dis = layers.fully_connected(tf.nn.dropout(H, keep_prob = dropout), num_outputs = opt.H_dis, biases_initializer=biasInit, activation_fn = tf.nn.relu, scope = prefix + 'dis_1', reuse = is_reuse) 75 | logits = layers.linear(tf.nn.dropout(H_dis, keep_prob = dropout), num_outputs = num_outputs, biases_initializer=biasInit, scope = prefix + 'dis_2', reuse = is_reuse) 76 | return logits 77 | 78 | 79 | 80 | def discriminator(x, W, opt, prefix = 'd_', is_prob = False, is_reuse = None): 81 | W_norm_d = tf.identity(W) # deep copy 82 | tf.stop_gradient(W_norm_d) # the discriminator won't update W 83 | if is_prob: 84 | x_emb = tf.tensordot(x, W_norm_d, [[2],[0]]) # batch L emb 85 | else: 86 | x_emb = tf.nn.embedding_lookup(W_norm_d, x) # batch L emb 87 | 88 | # print x_emb.get_shape() 89 | x_emb = tf.expand_dims(x_emb,3) # batch L emb 1 90 | 91 | 92 | if opt.layer == 4: 93 | H = conv_model_4layer(x_emb, opt, prefix = prefix, is_reuse = is_reuse) 94 | elif opt.layer == 3: 95 | H = conv_model_3layer(x_emb, opt, prefix = prefix, is_reuse = is_reuse) 96 | else: # layer == 2 97 | H = conv_model(x_emb, opt, 
prefix = prefix, is_reuse = is_reuse) 98 | 99 | logits = discriminator_2layer(H, opt, prefix= prefix, is_reuse = is_reuse) 100 | return logits, tf.squeeze(H) 101 | 102 | 103 | def conv_encoder(x_emb, is_train, opt, res, is_reuse = None, prefix = ''): 104 | if hasattr(opt, 'multiplier'): 105 | multiplier = opt.multiplier 106 | else: 107 | multiplier = 2 108 | if opt.layer == 4: 109 | H_enc = conv_model_4layer(x_emb, opt, is_train = is_train, is_reuse = is_reuse, prefix = prefix) 110 | elif opt.layer == 3: 111 | H_enc = conv_model_3layer(x_emb, opt, is_train = is_train, multiplier = multiplier, is_reuse = is_reuse, prefix = prefix) 112 | elif opt.layer == 0: 113 | H_enc = conv_model_3layer_old(x_emb, opt, is_reuse = is_reuse, prefix = prefix) 114 | else: 115 | H_enc = conv_model(x_emb, opt, is_train = is_train, is_reuse = is_reuse, prefix = prefix) 116 | return H_enc, res 117 | 118 | def deconv_decoder(H_dec, x_org, W_norm, is_train, opt, res, prefix = '', is_reuse = None): 119 | if hasattr(opt, 'multiplier'): 120 | multiplier = opt.multiplier 121 | else: 122 | multiplier = 2 123 | # H_dec batch 1 1 n_gan 124 | if opt.layer == 4: 125 | x_rec = deconv_model_4layer(H_dec, opt, is_train = is_train, prefix = prefix, is_reuse = is_reuse) # batch L emb 1 126 | elif opt.layer == 3: 127 | x_rec = deconv_model_3layer(H_dec, opt, is_train = is_train, multiplier = multiplier, prefix= prefix, is_reuse = is_reuse) # batch L emb 1 128 | elif opt.layer == 0: 129 | x_rec = deconv_model_3layer(H_dec, opt, prefix= prefix, is_reuse = is_reuse) # batch L emb 1 130 | else: 131 | x_rec = deconv_model(H_dec, opt, is_train = is_train, prefix= prefix, is_reuse = is_reuse) # batch L emb 1 132 | print("Decoder len %d Output len %d" % (x_rec.get_shape()[1], x_org.get_shape()[1])) 133 | tf.assert_equal(x_rec.get_shape()[1], x_org.get_shape()[1]) 134 | x_rec_norm = normalizing(x_rec, 2) # batch L emb 135 | #W_reshape = tf.reshape(tf.transpose(W),[1,1,opt.embed_size,opt.n_words]) 136 | #print all_idx.get_shape() 137 | 138 | # if opt.fix_emb: 139 | # 140 | # #loss = tf.reduce_sum((x_emb-x_rec)**2) # L2 is bad 141 | # # cosine sim 142 | # # Batch L emb 143 | # loss = -tf.reduce_sum(x_rec_norm * x_emb) 144 | # rec_sent = tf.argmax(tf.tensordot(tf.squeeze(x_rec_norm) , W_norm, [[2],[1]]),2) 145 | # res['rec_sents'] = rec_sent 146 | # 147 | # else: 148 | x_temp = tf.reshape(x_org, [-1,]) 149 | if hasattr(opt, 'attentive_emb') and opt.attentive_emb: 150 | emb_att = tf.get_variable(prefix+'emb_att', [1,opt.embed_size], initializer = tf.constant_initializer(1.0, dtype=tf.float32)) 151 | prob_logits = tf.tensordot(tf.squeeze(x_rec_norm), emb_att*W_norm, [[2],[1]]) # c_blv = sum_e x_ble W_ve 152 | else: 153 | prob_logits = tf.tensordot(tf.squeeze(x_rec_norm), W_norm, [[2],[1]]) # c_blv = sum_e x_ble W_ve 154 | 155 | prob = tf.nn.log_softmax(prob_logits*opt.L, dim=-1, name=None) 156 | #prob = normalizing(tf.reduce_sum(x_rec_norm * W_reshape, 2), 2) 157 | #prob = softmax_prediction(x_rec_norm, opt) 158 | rec_sent = tf.squeeze(tf.argmax(prob,2)) 159 | prob = tf.reshape(prob, [-1,opt.n_words]) 160 | 161 | idx = tf.range(opt.batch_size * opt.sent_len) 162 | #print idx.get_shape(), idx.dtype 163 | 164 | all_idx = tf.transpose(tf.stack(values=[idx,x_temp])) 165 | all_prob = tf.gather_nd(prob, all_idx) 166 | 167 | #pdb.set_trace() 168 | 169 | gen_temp = tf.cast(tf.reshape(rec_sent, [-1,]), tf.int32) 170 | gen_idx = tf.transpose(tf.stack(values=[idx,gen_temp])) 171 | gen_prob = tf.gather_nd(prob, gen_idx) 172 | 173 | res['rec_sents'] = 
rec_sent 174 | 175 | #res['gen_p'] = tf.exp(gen_prob[0:opt.sent_len]) 176 | #res['all_p'] = tf.exp(all_prob[0:opt.sent_len]) 177 | 178 | if opt.discrimination: 179 | logits_real, _ = discriminator(x_org, W_norm, opt) 180 | prob_one_hot = tf.nn.log_softmax(prob_logits*opt.L, dim=-1, name=None) 181 | logits_syn, _ = discriminator(tf.exp(prob_one_hot), W_norm, opt, is_prob = True, is_reuse = True) 182 | 183 | res['prob_r'] = tf.reduce_mean(tf.nn.sigmoid(logits_real)) 184 | res['prob_f'] = tf.reduce_mean(tf.nn.sigmoid(logits_syn)) 185 | 186 | loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels = tf.ones_like(logits_real), logits = logits_real)) + \ 187 | tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels = tf.zeros_like(logits_syn), logits = logits_syn)) 188 | else: 189 | loss = -tf.reduce_mean( all_prob) 190 | return loss, res 191 | 192 | 193 | 194 | 195 | 196 | 197 | def regularization(X, opt, is_train, prefix= '', is_reuse= None): 198 | if '_X' not in prefix and '_H_dec' not in prefix: 199 | if opt.batch_norm: 200 | X = layers.batch_norm(X, decay=0.9, center=True, scale=True, is_training=is_train, scope=prefix+'_bn', reuse = is_reuse) 201 | X = tf.nn.relu(X) 202 | X = X if not opt.cnn_layer_dropout else layers.dropout(X, keep_prob = opt.dropout_ratio, scope=prefix + '_dropout') 203 | 204 | return X 205 | 206 | 207 | conv_acf = tf.nn.tanh # tf.nn.relu 208 | 209 | def conv_model(X, opt, prefix = '', is_reuse= None, is_train = True): # 2layers 210 | #XX = tf.reshape(X, [-1, , 28, 1]) 211 | #X shape: batchsize L emb 1 212 | if opt.reuse_cnn: 213 | biasInit = opt.cnn_b 214 | weightInit = opt.cnn_W 215 | else: 216 | biasInit = None if opt.batch_norm else tf.constant_initializer(0.001, dtype=tf.float32) 217 | weightInit = tf.constant_initializer(0.001, dtype=tf.float32) 218 | 219 | X = regularization(X, opt, prefix= prefix + 'reg_X', is_reuse= is_reuse, is_train = is_train) 220 | H1 = layers.conv2d(X, num_outputs=opt.filter_size, kernel_size=[opt.filter_shape, opt.embed_size], stride = [opt.stride[0],1], weights_initializer = weightInit, biases_initializer=biasInit, activation_fn=None, padding = 'VALID', scope = prefix + 'H1', reuse = is_reuse) # batch L-3 1 Filtersize 221 | 222 | H1 = regularization(H1, opt, prefix= prefix + 'reg_H1', is_reuse= is_reuse, is_train = is_train) 223 | H2 = layers.conv2d(H1, num_outputs=opt.filter_size*2, kernel_size=[opt.sent_len2, 1], activation_fn=conv_acf , padding = 'VALID', scope = prefix + 'H2', reuse = is_reuse) # batch 1 1 2*Filtersize 224 | return H2 225 | 226 | 227 | def conv_model_3layer(X, opt, prefix = '', is_reuse= None, num_outputs = None, is_train = True, multiplier = 2): 228 | #XX = tf.reshape(X, [-1, , 28, 1]) 229 | #X shape: batchsize L emb 1 230 | if opt.reuse_cnn: 231 | biasInit = opt.cnn_b 232 | weightInit = opt.cnn_W 233 | else: 234 | biasInit = None if opt.batch_norm else tf.constant_initializer(0.001, dtype=tf.float32) 235 | weightInit = tf.constant_initializer(0.001, dtype=tf.float32) 236 | 237 | X = regularization(X, opt, prefix= prefix + 'reg_X', is_reuse= is_reuse, is_train = is_train) 238 | H1 = layers.conv2d(X, num_outputs=opt.filter_size, kernel_size=[opt.filter_shape, opt.embed_size], stride = [opt.stride[0],1], weights_initializer = weightInit, biases_initializer=biasInit, activation_fn=None, padding = 'VALID', scope = prefix + 'H1_3', reuse = is_reuse) # batch L-3 1 Filtersize 239 | 240 | H1 = regularization(H1, opt, prefix= prefix + 'reg_H1', is_reuse= is_reuse, is_train = is_train) 241 | H2 = 
layers.conv2d(H1, num_outputs=opt.filter_size*multiplier, kernel_size=[opt.filter_shape, 1], stride = [opt.stride[1],1], biases_initializer=biasInit, activation_fn=None, padding = 'VALID', scope = prefix + 'H2_3', reuse = is_reuse) 242 | #print H2.get_shape() 243 | H2 = regularization(H2, opt, prefix= prefix + 'reg_H2', is_reuse= is_reuse, is_train = is_train) 244 | H3 = layers.conv2d(H2, num_outputs= (num_outputs if num_outputs else opt.n_gan), kernel_size=[opt.sent_len3, 1], activation_fn=tf.nn.tanh , padding = 'VALID', scope = prefix + 'H3_3', reuse = is_reuse) # batch 1 1 2*Filtersize 245 | 246 | #pdb.set_trace() 247 | return H3 248 | 249 | 250 | def conv_model_3layer_old(X, opt, prefix = '', is_reuse= None, num_outputs = None): 251 | #XX = tf.reshape(X, [-1, , 28, 1]) 252 | #X shape: batchsize L emb 1 253 | 254 | biasInit = tf.constant_initializer(0.001, dtype=tf.float32) 255 | weightInit = tf.constant_initializer(0.001, dtype=tf.float32) 256 | 257 | H1 = layers.conv2d(X, num_outputs=opt.filter_size, kernel_size=[opt.filter_shape, opt.embed_size], stride = [opt.stride[0],1], weights_initializer = weightInit, biases_initializer=biasInit, activation_fn=tf.nn.relu, padding = 'VALID', scope = prefix + 'H1_3', reuse = is_reuse) # batch L-3 1 Filtersize 258 | H2 = layers.conv2d(H1, num_outputs=opt.filter_size*2, kernel_size=[opt.filter_shape, 1], stride = [opt.stride[1],1], biases_initializer=biasInit, activation_fn=tf.nn.relu, padding = 'VALID', scope = prefix + 'H2_3', reuse = is_reuse) 259 | #print H2.get_shape() 260 | H3 = layers.conv2d(H2, num_outputs= (num_outputs if num_outputs else opt.n_gan), kernel_size=[opt.sent_len3, 1], biases_initializer=biasInit, activation_fn=tf.nn.tanh, padding = 'VALID', scope = prefix + 'H3_3', reuse = is_reuse) # batch 1 1 2*Filtersize 261 | return H3 262 | 263 | 264 | def conv_model_4layer(X, opt, prefix = '', is_reuse= None, num_outputs = None, is_train = True): 265 | #XX = tf.reshape(X, [-1, , 28, 1]) 266 | #X shape: batchsize L emb 1 267 | if opt.reuse_cnn: 268 | biasInit = opt.cnn_b 269 | weightInit = opt.cnn_W 270 | else: 271 | biasInit = None if opt.batch_norm else tf.constant_initializer(0.001, dtype=tf.float32) 272 | weightInit = tf.constant_initializer(0.001, dtype=tf.float32) 273 | 274 | X = regularization(X, opt, prefix= prefix + 'reg_X', is_reuse= is_reuse, is_train = is_train) 275 | H1 = layers.conv2d(X, num_outputs=opt.filter_size, kernel_size=[opt.filter_shape, opt.embed_size], stride = [opt.stride[0],1], weights_initializer = weightInit, biases_initializer=biasInit, activation_fn=None, padding = 'VALID', scope = prefix + 'H1_3', reuse = is_reuse) # batch L-3 1 Filtersize 276 | 277 | H1 = regularization(H1, opt, prefix= prefix + 'reg_H1', is_reuse= is_reuse, is_train = is_train) 278 | H2 = layers.conv2d(H1, num_outputs=opt.filter_size*2, kernel_size=[opt.filter_shape, 1], stride = [opt.stride[1],1], biases_initializer=biasInit, activation_fn=None, padding = 'VALID', scope = prefix + 'H2_3', reuse = is_reuse) 279 | 280 | H2 = regularization(H2, opt, prefix= prefix + 'reg_H2', is_reuse= is_reuse, is_train = is_train) 281 | H3 = layers.conv2d(H2, num_outputs=opt.filter_size*4, kernel_size=[opt.filter_shape, 1], stride = [opt.stride[2],1], biases_initializer=biasInit, activation_fn=None, padding = 'VALID', scope = prefix + 'H3_3', reuse = is_reuse) 282 | #print H2.get_shape() 283 | H3 = regularization(H3, opt, prefix= prefix + 'reg_H3', is_reuse= is_reuse, is_train = is_train) 284 | H4 = layers.conv2d(H3, num_outputs= (num_outputs if 
num_outputs else opt.n_gan), kernel_size=[opt.sent_len4, 1], activation_fn=conv_acf , padding = 'VALID', scope = prefix + 'H4', reuse = is_reuse) # batch 1 1 2*Filtersize 285 | return H4 286 | 287 | 288 | dec_acf = tf.nn.relu #tf.nn.tanh 289 | dec_bias = None # tf.constant_initializer(0.001, dtype=tf.float32) 290 | 291 | def deconv_model(H, opt, prefix = '', is_reuse= None, is_train = True): 292 | biasInit = None if opt.batch_norm else tf.constant_initializer(0.001, dtype=tf.float32) 293 | #H2t = tf.reshape(H, [H.shape[0],1,1,H.shape[1]]) 294 | # print tf.shape(H) 295 | # H2t = tf.expand_dims(H,1) 296 | # H2t = tf.expand_dims(H,1) 297 | 298 | H2t = H 299 | 300 | H2t = regularization(H2t, opt, prefix= prefix + 'reg_H_dec', is_reuse= is_reuse, is_train = is_train) 301 | H1t = layers.conv2d_transpose(H2t, num_outputs=opt.filter_size, kernel_size=[opt.sent_len2, 1], biases_initializer=biasInit, activation_fn=None ,padding = 'VALID', scope = prefix + 'H1_t', reuse = is_reuse) 302 | 303 | H1t = regularization(H1t, opt, prefix= prefix + 'reg_H1_dec', is_reuse= is_reuse, is_train = is_train) 304 | Xhat = layers.conv2d_transpose(H1t, num_outputs=1, kernel_size=[opt.filter_shape, opt.embed_size], stride = [opt.stride[0],1], biases_initializer=dec_bias, activation_fn=dec_acf, padding = 'VALID',scope = prefix + 'Xhat_t', reuse = is_reuse) 305 | #print H2t.get_shape(), H1t.get_shape(), Xhat.get_shape() 306 | return Xhat 307 | 308 | def deconv_model_3layer(H, opt, prefix = '', is_reuse= None, is_train = True, multiplier = 2): 309 | #XX = tf.reshape(X, [-1, , 28, 1]) 310 | #X shape: batchsize L emb 1 311 | biasInit = None if opt.batch_norm else tf.constant_initializer(0.001, dtype=tf.float32) 312 | 313 | H3t = H 314 | 315 | H3t = regularization(H3t, opt, prefix= prefix + 'reg_H_dec', is_reuse= is_reuse, is_train = is_train) 316 | H2t = layers.conv2d_transpose(H3t, num_outputs=opt.filter_size*multiplier, kernel_size=[opt.sent_len3, 1], biases_initializer=biasInit, activation_fn=None ,padding = 'VALID', scope = prefix + 'H2_t_3', reuse = is_reuse) 317 | 318 | H2t = regularization(H2t, opt, prefix= prefix + 'reg_H2_dec', is_reuse= is_reuse, is_train = is_train) 319 | H1t = layers.conv2d_transpose(H2t, num_outputs=opt.filter_size, kernel_size=[opt.filter_shape, 1], stride = [opt.stride[1],1], biases_initializer=biasInit, activation_fn=None ,padding = 'VALID', scope = prefix + 'H1_t_3', reuse = is_reuse) 320 | 321 | H1t = regularization(H1t, opt, prefix= prefix + 'reg_H1_dec', is_reuse= is_reuse, is_train = is_train) 322 | Xhat = layers.conv2d_transpose(H1t, num_outputs=1, kernel_size=[opt.filter_shape, opt.embed_size], stride = [opt.stride[0],1], biases_initializer=dec_bias, activation_fn=dec_acf, padding = 'VALID',scope = prefix + 'Xhat_t_3', reuse = is_reuse) 323 | #print H2t.get_shape(),H1t.get_shape(),Xhat.get_shape() 324 | 325 | return Xhat 326 | 327 | 328 | 329 | 330 | def deconv_model_4layer(H, opt, prefix = '', is_reuse= None, is_train = True): 331 | #XX = tf.reshape(X, [-1, , 28, 1]) 332 | #X shape: batchsize L emb 1 333 | biasInit = None if opt.batch_norm else tf.constant_initializer(0.001, dtype=tf.float32) 334 | 335 | H4t = H 336 | 337 | H4t = regularization(H4t, opt, prefix= prefix + 'reg_H_dec', is_reuse= is_reuse, is_train = is_train) 338 | H3t = layers.conv2d_transpose(H4t, num_outputs=opt.filter_size*4, kernel_size=[opt.sent_len4, 1], biases_initializer=biasInit, activation_fn=None ,padding = 'VALID', scope = prefix + 'H3_t_3', reuse = is_reuse) 339 | 340 | H3t = regularization(H3t, opt, 
prefix= prefix + 'reg_H3_dec', is_reuse= is_reuse, is_train = is_train) 341 | H2t = layers.conv2d_transpose(H3t, num_outputs=opt.filter_size*2, kernel_size=[opt.filter_shape, 1], stride = [opt.stride[2],1], biases_initializer=biasInit, activation_fn=None ,padding = 'VALID', scope = prefix + 'H2_t_3', reuse = is_reuse) 342 | 343 | H2t = regularization(H2t, opt, prefix= prefix + 'reg_H2_dec', is_reuse= is_reuse, is_train = is_train) 344 | H1t = layers.conv2d_transpose(H2t, num_outputs=opt.filter_size, kernel_size=[opt.filter_shape, 1], stride = [opt.stride[1],1], biases_initializer=biasInit, activation_fn=None ,padding = 'VALID', scope = prefix + 'H1_t_3', reuse = is_reuse) 345 | 346 | H1t = regularization(H1t, opt, prefix= prefix + 'reg_H1_dec', is_reuse= is_reuse, is_train = is_train) 347 | Xhat = layers.conv2d_transpose(H1t, num_outputs=1, kernel_size=[opt.filter_shape, opt.embed_size], stride = [opt.stride[0],1], biases_initializer=dec_bias, activation_fn=dec_acf, padding = 'VALID',scope = prefix + 'Xhat_t_3', reuse = is_reuse) 348 | #print H2t.get_shape(),H1t.get_shape(),Xhat.get_shape() 349 | return Xhat 350 | 351 | 352 | 353 | 354 | 355 | 356 | -------------------------------------------------------------------------------- /semi_supervised.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Yizhe Zhang 4 | 5 | TextCNN 6 | """ 7 | ## 152.3.214.203/6006 8 | 9 | import os 10 | 11 | GPUID = 1 12 | os.environ['CUDA_VISIBLE_DEVICES'] = str(GPUID) 13 | 14 | import tensorflow as tf 15 | from tensorflow.contrib import learn 16 | from tensorflow.contrib import layers 17 | # from tensorflow.contrib import metrics 18 | # from tensorflow.contrib.learn import monitors 19 | from tensorflow.contrib import framework 20 | from tensorflow.contrib.learn.python.learn import learn_runner 21 | from tensorflow.python.platform import tf_logging as logging 22 | # from tensorflow.contrib.learn.python.learn.metric_spec import MetricSpec 23 | import cPickle 24 | import numpy as np 25 | import os 26 | import scipy.io as sio 27 | from math import floor 28 | import pdb 29 | 30 | from model import * 31 | from utils import prepare_data_for_cnn, prepare_data_for_rnn, get_minibatches_idx, normalizing, restore_from_save, \ 32 | prepare_for_bleu, cal_BLEU, sent2idx, _clip_gradients_seperate_norm 33 | from denoise import * 34 | 35 | # import tempfile 36 | # from tensorflow.examples.tutorials.mnist import input_data 37 | 38 | logging.set_verbosity(logging.INFO) 39 | # Basic model parameters as external flags. 
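As in the file above, flags and an Options class follow; the part worth calling out in advance is the semi-supervised objective used further down in this file. semi_classifier() returns loss = alpha * rec_loss + (1 - alpha) * dis_loss, and run_model() keeps alpha (opt.rec_alpha) at 1 for the first pretrain_step updates, then lowers it by 0.01 every rec_decay_freq minibatches, gradually shifting weight from reconstruction to the classifier. A minimal standalone sketch of that schedule is below; the function names are illustrative, not from the repository.

# Sketch of the loss mixing and alpha schedule used later in this file.
def mixed_loss(alpha, rec_loss, dis_loss):
    # semi_classifier(): total loss is a convex combination of the
    # reconstruction loss and the sigmoid cross-entropy classification loss.
    return alpha * rec_loss + (1.0 - alpha) * dis_loss

def updated_alpha(alpha, uidx, pretrain_step=50000, rec_decay_freq=50, step=0.01):
    # run_model(): after the reconstruction-only pretraining phase,
    # decay alpha towards 0 so the classifier term gradually dominates.
    if alpha > 0 and uidx > pretrain_step and uidx % rec_decay_freq == 0:
        alpha -= step
    return alpha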
40 | flags = tf.app.flags 41 | FLAGS = flags.FLAGS 42 | 43 | 44 | # flags.DEFINE_string('train_dir', 'data', 'Directory to put the training data.') 45 | 46 | class Options(object): 47 | def __init__(self): 48 | self.fix_emb = False 49 | self.reuse_w = True 50 | self.reuse_cnn = False 51 | self.reuse_discrimination = True # reuse cnn for discrimination 52 | self.restore = True 53 | self.tanh = True # activation fun for the top layer of cnn, otherwise relu 54 | self.model = 'cnn_deconv' # 'cnn_rnn', 'rnn_rnn' , default: cnn_deconv 55 | 56 | self.permutation = 0 57 | self.substitution = 's' # Deletion(d), Insertion(a), Substitution(s) and Permutation(p) 58 | 59 | self.W_emb = None 60 | self.cnn_W = None 61 | self.cnn_b = None 62 | self.maxlen = 305 63 | self.n_words = None 64 | self.filter_shape = 5 65 | self.filter_size = 300 66 | self.multiplier = 1 # filtersize multiplier 67 | self.embed_size = 300 68 | self.lr = 2e-4 69 | self.layer = 3 70 | self.stride = [2, 2 ,2] # for two layer cnn/deconv , use self.stride[0] 71 | self.batch_size = 64 72 | self.dis_batch_size = 64 73 | self.max_epochs = 1000 74 | self.n_gan = 500 # self.filter_size * 3 75 | self.L = 100 76 | 77 | self.optimizer = 'Adam' # tf.train.AdamOptimizer(beta1=0.9) #'Adam' # 'Momentum' , 'RMSProp' 78 | self.clip_grad = None # None #100 # 20# 79 | self.attentive_emb = False 80 | self.decay_rate = 1 81 | 82 | self.save_path = "./save/yelp" #"./save/yelp_500_new" 83 | self.log_path = "./log" 84 | self.print_freq = 100 85 | self.valid_freq = 1000 86 | 87 | self.part_data = False 88 | #self.portion = float(sys.argv[1]) # 10% 1% 89 | self.portion = 1.0 # 10% 1% 90 | 91 | # batch norm & dropout 92 | self.batch_norm = False 93 | self.cnn_layer_dropout = False 94 | self.dropout_ratio = 0.5 # keep probability. 
95 | self.rec_alpha = 1 96 | self.rec_decay_freq = 50 97 | self.pretrain_step = 50000 98 | 99 | self.discrimination = False 100 | self.H_dis = 300 101 | 102 | self.sent_len = self.maxlen + 2 * (self.filter_shape - 1) 103 | self.sent_len2 = np.int32(floor((self.sent_len - self.filter_shape) / self.stride[0]) + 1) 104 | self.sent_len3 = np.int32(floor((self.sent_len2 - self.filter_shape) / self.stride[1]) + 1) 105 | self.sent_len4 = np.int32(floor((self.sent_len3 - self.filter_shape) / self.stride[2]) + 1) 106 | print ('Use model %s' % self.model) 107 | print ('Use %d conv/deconv layers' % self.layer) 108 | 109 | def __iter__(self): 110 | for attr, value in self.__dict__.iteritems(): 111 | yield attr, value 112 | 113 | 114 | def semi_classifier(alpha, x, x_org, x_lab, y, dp_ratio, opt, opt_t=None): 115 | # print x.get_shape() # batch L 116 | is_train = True 117 | if not opt_t: opt_t = opt 118 | x_lab_emb, W_norm = embedding(x_lab, opt) # batch L emb 119 | x_emb = tf.nn.embedding_lookup(W_norm, x) 120 | x_emb = tf.expand_dims(x_emb, 3) # batch L emb 1 121 | x_lab_emb = tf.expand_dims(x_lab_emb, 3) # batch L emb 1 122 | x_lab_emb= tf.nn.dropout(x_lab_emb, dp_ratio) 123 | res = {} 124 | 125 | # cnn encoder 126 | H_enc, res = conv_encoder(x_emb, is_train, opt, res) 127 | H_lab_enc, res = conv_encoder(x_lab_emb, is_train, opt, res, is_reuse = True) 128 | H_dec = H_enc 129 | 130 | #H_lab_enc = tf.nn.dropout(H_lab_enc, opt.dropout_ratio) 131 | logits = classifier_2layer(H_lab_enc, opt, dropout = dp_ratio, prefix='classify', is_reuse=None) 132 | dis_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logits)) 133 | 134 | # calculate the accuracy 135 | prob = tf.nn.sigmoid(logits) 136 | 137 | # if opt.model == 'rnn_rnn': 138 | # rec_loss, rec_sent_1, _ = seq2seq(x, x_org, opt) 139 | # _, rec_sent_2, _ = seq2seq(x, x_org, opt, feed_previous=True, is_reuse=True) 140 | # res['rec_sents_feed_y'] = rec_sent_1 141 | # res['rec_sents'] = rec_sent_2 142 | 143 | # elif opt.model == 'cnn_rnn': 144 | # # lstm decoder 145 | # H_dec2 = tf.identity(H_dec) 146 | # rec_loss, rec_sent_1, _ = lstm_decoder(H_dec, x_org, opt) # 147 | 148 | # _, rec_sent_2, _ = lstm_decoder(H_dec, x_org, opt, feed_previous=True, is_reuse=True) 149 | 150 | # res['rec_sents_feed_y'] = rec_sent_1 151 | # res['rec_sents'] = rec_sent_2 152 | 153 | # else: 154 | 155 | # # deconv decoder 156 | rec_loss, res = deconv_decoder(H_dec, x_org, W_norm, is_train, opt_t, res) 157 | 158 | correct_prediction = tf.equal(tf.round(prob), tf.round(y)) 159 | accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) 160 | 161 | # calculate the total loss 162 | loss = alpha * rec_loss + (1-alpha) * dis_loss 163 | 164 | tf.summary.scalar('loss', loss) 165 | tf.summary.scalar('rec_loss', rec_loss) 166 | tf.summary.scalar('dis_loss', dis_loss) 167 | summaries = [ 168 | # "learning_rate", 169 | "loss" 170 | # "gradients", 171 | # "gradient_norm", 172 | ] 173 | global_step = tf.Variable(0, trainable=False) 174 | train_op = layers.optimize_loss( 175 | loss, 176 | global_step=global_step, 177 | # framework.get_global_step(), 178 | optimizer=opt.optimizer, 179 | clip_gradients=(lambda grad: _clip_gradients_seperate_norm(grad, opt.clip_grad)) if opt.clip_grad else None, 180 | #learning_rate_decay_fn=lambda lr, g: tf.train.exponential_decay(learning_rate=lr, global_step=g, 181 | # decay_rate=opt.decay_rate, decay_steps=3000), 182 | learning_rate=opt.lr, 183 | summaries=summaries 184 | ) 185 | return res, dis_loss, rec_loss, loss, 
train_op, prob, accuracy 186 | 187 | 188 | def run_model(opt, train_unlab_x, train_lab_x, train_lab, val_unlab_x, val_lab_x, val_lab, test, test_y, wordtoix, ixtoword): 189 | try: 190 | params = np.load('./param_g.npz') 191 | if params['Wemb'].shape == (opt.n_words, opt.embed_size): 192 | print('Use saved embedding.') 193 | opt.W_emb = params['Wemb'] 194 | else: 195 | print('Emb Dimension mismatch: param_g.npz:' + str(params['Wemb'].shape) + ' opt: ' + str( 196 | (opt.n_words, opt.embed_size))) 197 | opt.fix_emb = False 198 | except IOError: 199 | print('No embedding file found.') 200 | opt.fix_emb = False 201 | 202 | with tf.device('/gpu:1'): 203 | alpha_ = tf.placeholder(tf.float32, shape=()) 204 | x_ = tf.placeholder(tf.int32, shape=[opt.batch_size, opt.sent_len]) 205 | x_org_ = tf.placeholder(tf.int32, shape=[opt.batch_size, opt.sent_len]) 206 | x_lab_ = tf.placeholder(tf.int32, shape=[opt.dis_batch_size, opt.sent_len]) 207 | y_ = tf.placeholder(tf.float32, shape=[opt.dis_batch_size, 1]) 208 | dp_ratio_ = tf.placeholder(tf.float32, name='dp_ratio_') 209 | res_, dis_loss_, rec_loss_, loss_, train_op, prob_, acc_ = semi_classifier(alpha_, x_, x_org_, x_lab_, y_, dp_ratio_, opt) 210 | merged = tf.summary.merge_all() 211 | 212 | uidx = 0 213 | max_val_accuracy = 0.0 214 | max_test_accuracy = 0.0 215 | config = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True) 216 | # config = tf.ConfigProto(device_count={'GPU':0}) 217 | config.gpu_options.allow_growth = True 218 | np.set_printoptions(precision=3) 219 | np.set_printoptions(threshold=np.inf) 220 | saver = tf.train.Saver() 221 | 222 | with tf.Session(config=config) as sess: 223 | train_writer = tf.summary.FileWriter(opt.log_path + '/train', sess.graph) 224 | test_writer = tf.summary.FileWriter(opt.log_path + '/test', sess.graph) 225 | sess.run(tf.global_variables_initializer()) 226 | if opt.restore: 227 | try: 228 | t_vars = tf.trainable_variables() 229 | loader = restore_from_save(t_vars, sess, opt) 230 | 231 | except Exception as e: 232 | print(e) 233 | print("No saving session, using random initialization") 234 | sess.run(tf.global_variables_initializer()) 235 | 236 | for epoch in range(opt.max_epochs): 237 | 238 | print("Starting epoch %d" % epoch) 239 | 240 | kf = get_minibatches_idx(len(train_unlab_x), opt.batch_size, shuffle=True) 241 | for _, train_index in kf: 242 | uidx += 1 243 | 244 | if opt.rec_alpha > 0 and uidx > opt.pretrain_step and uidx % opt.rec_decay_freq == 0: 245 | opt.rec_alpha -= 0.01 246 | print "alpha: "+ str(opt.rec_alpha) 247 | 248 | sents = [train_unlab_x[t] for t in train_index] 249 | 250 | lab_index = np.random.choice(len(train_lab), opt.dis_batch_size, replace=False) 251 | lab_sents = [train_lab_x[t] for t in lab_index] 252 | batch_lab = [train_lab[t] for t in lab_index] 253 | batch_lab = np.array(batch_lab) 254 | batch_lab = batch_lab.reshape((len(batch_lab), 1)) 255 | x_batch_lab = prepare_data_for_cnn(lab_sents, opt) 256 | 257 | sents_permutated = add_noise(sents, opt) 258 | 259 | if opt.model != 'rnn_rnn' and opt.model != 'cnn_rnn': 260 | x_batch_org = prepare_data_for_cnn(sents, opt) # Batch L 261 | else: 262 | x_batch_org = prepare_data_for_rnn(sents, opt) # Batch L 263 | 264 | if opt.model != 'rnn_rnn': 265 | x_batch = prepare_data_for_cnn(sents_permutated, opt) # Batch L 266 | else: 267 | x_batch = prepare_data_for_rnn(sents_permutated, opt, is_add_GO=False) # Batch L 268 | 269 | _, dis_loss, rec_loss, loss, acc = sess.run([train_op, dis_loss_, rec_loss_, loss_, acc_], 270 | 
feed_dict= {alpha_: opt.rec_alpha, x_: x_batch, x_org_: x_batch_org, x_lab_: x_batch_lab, y_: batch_lab, dp_ratio_: opt.dropout_ratio}) 271 | summary = sess.run(merged, feed_dict={alpha_: opt.rec_alpha, x_: x_batch, x_org_: x_batch_org, x_lab_: x_batch_lab, y_: batch_lab, dp_ratio_: opt.dropout_ratio}) 272 | train_writer.add_summary(summary, uidx) 273 | 274 | 275 | if uidx % opt.print_freq == 0: 276 | print("Iteration %d: dis_loss %f, rec_loss %f, loss %f, acc %f " % (uidx, dis_loss, rec_loss, loss, acc)) 277 | 278 | if uidx % opt.valid_freq == 0: 279 | #print("Iteration %d: dis_loss %f, rec_loss %f, loss %f " % (uidx, dis_loss, rec_loss, loss)) 280 | valid_index = np.random.choice(len(val_unlab_x), opt.batch_size) 281 | val_sents = [val_unlab_x[t] for t in valid_index] 282 | 283 | val_sents_permutated = add_noise(val_sents, opt) 284 | 285 | if opt.model != 'rnn_rnn' and opt.model != 'cnn_rnn': 286 | x_val_batch_org = prepare_data_for_cnn(val_sents, opt) 287 | else: 288 | x_val_batch_org = prepare_data_for_rnn(val_sents, opt) 289 | 290 | if opt.model != 'rnn_rnn': 291 | x_val_batch = prepare_data_for_cnn(val_sents_permutated, opt) 292 | else: 293 | x_val_batch = prepare_data_for_rnn(val_sents_permutated, opt, is_add_GO=False) 294 | 295 | rec_loss_val = sess.run(rec_loss_, feed_dict={x_: x_val_batch, 296 | x_org_: x_val_batch_org, dp_ratio_: 1.0}) 297 | print("Validation rec loss %f " % rec_loss_val) 298 | 299 | kf_val = get_minibatches_idx(len(val_lab_x), opt.dis_batch_size, shuffle=False) 300 | 301 | prob_val = [] 302 | for _, val_ind in kf_val: 303 | val_sents = [val_lab_x[t] for t in val_ind] 304 | x_val_dis = prepare_data_for_cnn(val_sents, opt) 305 | val_y = np.array([val_lab[t] for t in val_ind]).reshape((opt.dis_batch_size, 1)) 306 | val_prob = sess.run(prob_, feed_dict={x_lab_: x_val_dis, dp_ratio_: 1.0}) 307 | for x in val_prob: 308 | prob_val.append(x) 309 | 310 | ##### DON'T UNDERSTAND :error val_index 311 | # probs = [] 312 | # val_truth = [] 313 | # for i in range(len(val_lab)): 314 | # val_truth.append(val_lab[i]) 315 | # if type(val_index[i]) != int: 316 | # temp = [] 317 | # for j in val_index[i]: 318 | # temp.append(prob_val[j]) 319 | # aver = sum(temp) * 1.0 / len(temp) 320 | # probs.append(aver) 321 | # else: 322 | # probs.append(prob_val[val_index[i]]) 323 | 324 | probs = [] 325 | val_truth = [] 326 | for i in range(len(prob_val)): 327 | val_truth.append(val_lab[i]) 328 | probs.append(prob_val[i]) 329 | 330 | count = 0.0 331 | for i in range(len(probs)): 332 | p = probs[i] 333 | if p > 0.5: 334 | if val_truth[i] == 1: 335 | count += 1.0 336 | else: 337 | if val_truth[i] == 0: 338 | count += 1.0 339 | 340 | val_accuracy = count * 1.0 / len(probs) 341 | 342 | 343 | 344 | print("Validation accuracy %f " % val_accuracy) 345 | 346 | summary = sess.run(merged, 347 | feed_dict={alpha_: opt.rec_alpha, x_: x_val_batch, x_org_: x_val_batch_org, x_lab_: x_val_dis, y_: val_y, dp_ratio_: 1.0}) 348 | test_writer.add_summary(summary, uidx) 349 | 350 | if val_accuracy >= max_val_accuracy: 351 | max_val_accuracy = val_accuracy 352 | 353 | kf_test = get_minibatches_idx(len(test), opt.dis_batch_size, shuffle=False) 354 | prob_test = [] 355 | for _, test_ind in kf_test: 356 | test_sents = [test[t] for t in test_ind] 357 | x_test_batch = prepare_data_for_cnn(test_sents, opt) 358 | test_prob = sess.run(prob_, feed_dict={x_lab_: x_test_batch, dp_ratio_: 1.0}) 359 | for x in test_prob: 360 | prob_test.append(x) 361 | 362 | probs = [] 363 | test_truth = [] 364 | for i in range(len(prob_test)): 
365 | test_truth.append(test_y[i]) 366 | probs.append(prob_test[i]) 367 | 368 | # probs = [] 369 | # test_truth = [] 370 | # for i in range(len(test_y)): 371 | # test_truth.append(test_y[i]) 372 | # if type(test_index[i]) != int: 373 | # temp = [prob_test[j] for j in test_index[i]] 374 | # aver = sum(temp) * 1.0 / len(temp) 375 | # probs.append(aver) 376 | # else: 377 | # probs.append(prob_test[test_index[i]]) 378 | 379 | count = 0.0 380 | for i in range(len(probs)): 381 | p = probs[i] 382 | if p > 0.5: 383 | if test_truth[i] == 1.0: 384 | count += 1.0 385 | else: 386 | if test_truth[i] == 0.0: 387 | count += 1.0 388 | 389 | test_accuracy = count * 1.0 / len(probs) 390 | 391 | print("Test accuracy %f " % test_accuracy) 392 | 393 | max_test_accuracy = test_accuracy 394 | 395 | def test_input(text): 396 | x_input = sent2idx(text, wordtoix, opt) 397 | res = sess.run(res_, feed_dict={x_: x_input, x_org_: x_batch_org}) 398 | print "Reconstructed:" + " ".join([ixtoword[x] for x in res['rec_sents'][0] if x != 0]) 399 | 400 | 401 | # res = sess.run(res_, feed_dict={x_: x_batch, x_org_: x_batch_org, is_train_: 1}) 402 | # print "Original :" + " ".join([ixtoword[x] for x in sents[0] if x != 0]) 403 | # # print "Permutated :" + " ".join([ixtoword[x] for x in sents_permutated[0] if x != 0]) 404 | # if opt.model == 'rnn_rnn' or opt.model == 'cnn_rnn': 405 | # print "Reconstructed:" + " ".join([ixtoword[x] for x in res['rec_sents_feed_y'][0] if x != 0]) 406 | # print "Reconstructed:" + " ".join([ixtoword[x] for x in res['rec_sents'][0] if x != 0]) 407 | 408 | # print "Probs:" + " ".join([ixtoword[res['rec_sents'][0][i]] +'(' +str(np.round(res['all_p'][i],2))+')' for i in range(len(res['rec_sents'][0])) if res['rec_sents'][0][i] != 0]) 409 | 410 | 411 | print(opt.rec_alpha) 412 | print("Epoch %d: Max Valid accuracy %f" % (epoch, max_val_accuracy)) 413 | print("Epoch %d: Max Test accuracy %f" % (epoch, max_test_accuracy)) 414 | 415 | 416 | 417 | saver.save(sess, opt.save_path, global_step=epoch) 418 | 419 | 420 | def main(): 421 | # global n_words 422 | # Prepare training and testing data 423 | loadpath = "./data/yelp.p" 424 | x = cPickle.load(open(loadpath, "rb")) 425 | train, val, test = x[0], x[1], x[2] 426 | train_lab, val_lab, test_lab = x[3], x[4], x[5] 427 | wordtoix, ixtoword = x[6], x[7] 428 | 429 | train_unlab_x = [list(s) for s in train] 430 | train_lab_x = [list(s) for s in train] 431 | val_unlab_x = [list(s) for s in val] 432 | val_lab_x = [list(s) for s in val] 433 | test = [list(s) for s in test] 434 | 435 | train_lab = np.array(train_lab, dtype='float32') 436 | val_lab = np.array(val_lab, dtype='float32') 437 | test_lab = np.array(test_lab, dtype='float32') 438 | 439 | opt = Options() 440 | opt.n_words = len(ixtoword) 441 | print dict(opt) 442 | print('Total words: %d' % opt.n_words) 443 | 444 | run_model(opt, train_unlab_x, train_lab_x, train_lab, val_unlab_x, val_lab_x, val_lab, 445 | test, test_lab, wordtoix, ixtoword) 446 | 447 | 448 | 449 | 450 | if __name__ == '__main__': 451 | main() 452 | --------------------------------------------------------------------------------
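For completeness, here is a small NumPy sketch, not part of the repository, of the word-level decoding step inside deconv_decoder() in model.py, assuming utils.normalizing() performs L2 normalization along the given axis: the reconstructed embeddings are normalized, matched against the normalized embedding matrix W_norm by inner product (c_blv = sum_e x_ble W_ve), sharpened by the temperature opt.L, and passed through a log-softmax; the reconstruction loss is the negative mean log-probability of the original tokens, and rec_sents is the per-position argmax.

# Illustrative NumPy version of the decoding/loss step in deconv_decoder().
# Shapes: x_rec (batch, sent_len, embed_size), W_norm (n_words, embed_size),
# x_org (batch, sent_len) integer token ids.
import numpy as np
from scipy.special import logsumexp

def decode_and_loss(x_rec, W_norm, x_org, L=100):
    # L2-normalize the reconstructed embeddings along the embedding axis
    # (the role played by normalizing(x_rec, 2) in the TensorFlow code).
    x_rec_norm = x_rec / (np.linalg.norm(x_rec, axis=2, keepdims=True) + 1e-12)
    # Cosine-similarity logits against the vocabulary, scaled by the
    # temperature opt.L before the log-softmax.
    logits = L * np.einsum('ble,ve->blv', x_rec_norm, W_norm)
    log_prob = logits - logsumexp(logits, axis=2, keepdims=True)
    rec_sents = log_prob.argmax(axis=2)
    # Negative mean log-probability assigned to the original tokens.
    b, l = x_org.shape
    loss = -log_prob[np.arange(b)[:, None], np.arange(l)[None, :], x_org].mean()
    return rec_sents, loss

The large temperature (opt.L = 100) makes the softmax close to a hard nearest-neighbour lookup in embedding space, which is what allows the argmax decoding to recover discrete words from the continuous deconvolution output.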