├── pycocoevalcap
    ├── __init__.py
    ├── bleu
    │   ├── __init__.py
    │   ├── bleu.pyc
    │   ├── __init__.pyc
    │   ├── bleu_scorer.pyc
    │   ├── LICENSE
    │   ├── bleu.py
    │   └── bleu_scorer.py
    ├── cider
    │   ├── __init__.py
    │   ├── cider.pyc
    │   ├── __init__.pyc
    │   ├── cider_scorer.pyc
    │   ├── cider.py
    │   └── cider_scorer.py
    ├── meteor
    │   ├── __init__.py
    │   ├── __init__.pyc
    │   ├── meteor.pyc
    │   └── meteor.py
    ├── rouge
    │   ├── __init__.py
    │   ├── rouge.pyc
    │   ├── __init__.pyc
    │   └── rouge.py
    ├── tokenizer
    │   ├── __init__.py
    │   ├── __init__.pyc
    │   ├── ptbtokenizer.pyc
    │   └── ptbtokenizer.py
    ├── eval.pyc
    ├── __init__.pyc
    └── eval.py
├── requirements.txt
├── .DS_Store
├── README.md
├── char_preprocessing.py
├── denoise.py
├── rougescore.py
├── error_rate.py
├── demo.py
├── data_utils.py
├── utils.py
├── char_correction.py
├── auto_encoding_cnn_denoise.py
├── model.py
└── semi_supervised.py


/pycocoevalcap/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'tylin'
2 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | tensorflow==1.15.4
2 | scipy
3 | nltk


--------------------------------------------------------------------------------
/pycocoevalcap/bleu/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'tylin'
2 | 


--------------------------------------------------------------------------------
/pycocoevalcap/cider/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'tylin'
2 | 


--------------------------------------------------------------------------------
/pycocoevalcap/meteor/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'tylin'
2 | 


--------------------------------------------------------------------------------
/pycocoevalcap/rouge/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'vrama91'
2 | 


--------------------------------------------------------------------------------
/pycocoevalcap/tokenizer/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'hfang'
2 | 


--------------------------------------------------------------------------------
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dreasysnail/deconv_paragraph_represention/HEAD/.DS_Store


--------------------------------------------------------------------------------
/pycocoevalcap/eval.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dreasysnail/deconv_paragraph_represention/HEAD/pycocoevalcap/eval.pyc


--------------------------------------------------------------------------------
/pycocoevalcap/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dreasysnail/deconv_paragraph_represention/HEAD/pycocoevalcap/__init__.pyc


--------------------------------------------------------------------------------
/pycocoevalcap/bleu/bleu.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dreasysnail/deconv_paragraph_represention/HEAD/pycocoevalcap/bleu/bleu.pyc


--------------------------------------------------------------------------------
/pycocoevalcap/cider/cider.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dreasysnail/deconv_paragraph_represention/HEAD/pycocoevalcap/cider/cider.pyc


--------------------------------------------------------------------------------
/pycocoevalcap/rouge/rouge.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dreasysnail/deconv_paragraph_represention/HEAD/pycocoevalcap/rouge/rouge.pyc


--------------------------------------------------------------------------------
/pycocoevalcap/bleu/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dreasysnail/deconv_paragraph_represention/HEAD/pycocoevalcap/bleu/__init__.pyc


--------------------------------------------------------------------------------
/pycocoevalcap/cider/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dreasysnail/deconv_paragraph_represention/HEAD/pycocoevalcap/cider/__init__.pyc


--------------------------------------------------------------------------------
/pycocoevalcap/meteor/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dreasysnail/deconv_paragraph_represention/HEAD/pycocoevalcap/meteor/__init__.pyc


--------------------------------------------------------------------------------
/pycocoevalcap/meteor/meteor.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dreasysnail/deconv_paragraph_represention/HEAD/pycocoevalcap/meteor/meteor.pyc


--------------------------------------------------------------------------------
/pycocoevalcap/rouge/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dreasysnail/deconv_paragraph_represention/HEAD/pycocoevalcap/rouge/__init__.pyc


--------------------------------------------------------------------------------
/pycocoevalcap/bleu/bleu_scorer.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dreasysnail/deconv_paragraph_represention/HEAD/pycocoevalcap/bleu/bleu_scorer.pyc


--------------------------------------------------------------------------------
/pycocoevalcap/cider/cider_scorer.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dreasysnail/deconv_paragraph_represention/HEAD/pycocoevalcap/cider/cider_scorer.pyc


--------------------------------------------------------------------------------
/pycocoevalcap/tokenizer/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dreasysnail/deconv_paragraph_represention/HEAD/pycocoevalcap/tokenizer/__init__.pyc


--------------------------------------------------------------------------------
/pycocoevalcap/tokenizer/ptbtokenizer.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dreasysnail/deconv_paragraph_represention/HEAD/pycocoevalcap/tokenizer/ptbtokenizer.pyc


--------------------------------------------------------------------------------
/pycocoevalcap/bleu/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2015 Xinlei Chen, Hao Fang, Tsung-Yi Lin, and Ramakrishna Vedantam
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining a copy
 4 | of this software and associated documentation files (the "Software"), to deal
 5 | in the Software without restriction, including without limitation the rights
 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 7 | copies of the Software, and to permit persons to whom the Software is
 8 | furnished to do so, subject to the following conditions:
 9 | 
10 | The above copyright notice and this permission notice shall be included in
11 | all copies or substantial portions of the Software.
12 | 
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | THE SOFTWARE.
20 | 


--------------------------------------------------------------------------------
/pycocoevalcap/bleu/bleu.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # 
 3 | # File Name : bleu.py
 4 | #
 5 | # Description : Wrapper for BLEU scorer.
 6 | #
 7 | # Creation Date : 06-01-2015
 8 | # Last Modified : Thu 19 Mar 2015 09:13:28 PM PDT
 9 | # Authors : Hao Fang <hfang@uw.edu> and Tsung-Yi Lin <tl483@cornell.edu>
10 | 
11 | from bleu_scorer import BleuScorer
12 | 
13 | 
14 | class Bleu:
15 |     def __init__(self, n=4):
16 |         # default compute Blue score up to 4
17 |         self._n = n
18 |         self._hypo_for_image = {}
19 |         self.ref_for_image = {}
20 | 
21 |     def compute_score(self, gts, res):
22 | 
23 |         assert(gts.keys() == res.keys())
24 |         imgIds = gts.keys()
25 | 
26 |         bleu_scorer = BleuScorer(n=self._n)
27 |         for id in imgIds:
28 |             hypo = res[id]
29 |             ref = gts[id]
30 | 
31 |             # Sanity check.
32 |             assert(type(hypo) is list)
33 |             assert(len(hypo) == 1)
34 |             assert(type(ref) is list)
35 |             #print(ref)
36 |             #assert(len(ref) > 1)
37 | 
38 |             bleu_scorer += (hypo[0], ref)
39 | 
40 |         #score, scores = bleu_scorer.compute_score(option='shortest')
41 |         score, scores = bleu_scorer.compute_score(option='closest', verbose=1)
42 |         #score, scores = bleu_scorer.compute_score(option='average', verbose=1)
43 | 
44 |         # return (bleu, bleu_info)
45 |         return score, scores
46 | 
47 |     def method(self):
48 |         return "Bleu"
49 | 


--------------------------------------------------------------------------------
/pycocoevalcap/cider/cider.py:
--------------------------------------------------------------------------------
 1 | # Filename: cider.py
 2 | #
 3 | # Description: Describes the class to compute the CIDEr (Consensus-Based Image Description Evaluation) Metric 
 4 | #               by Vedantam, Zitnick, and Parikh (http://arxiv.org/abs/1411.5726)
 5 | #
 6 | # Creation Date: Sun Feb  8 14:16:54 2015
 7 | #
 8 | # Authors: Ramakrishna Vedantam <vrama91@vt.edu> and Tsung-Yi Lin <tl483@cornell.edu>
 9 | 
10 | from cider_scorer import CiderScorer
11 | import pdb
12 | 
13 | class Cider:
14 |     """
15 |     Main Class to compute the CIDEr metric 
16 | 
17 |     """
18 |     def __init__(self, test=None, refs=None, n=4, sigma=6.0):
19 |         # set cider to sum over 1 to 4-grams
20 |         self._n = n
21 |         # set the standard deviation parameter for gaussian penalty
22 |         self._sigma = sigma
23 | 
24 |     def compute_score(self, gts, res):
25 |         """
26 |         Main function to compute CIDEr score
27 |         :param  hypo_for_image (dict) : dictionary with key <image> and value <tokenized hypothesis / candidate sentence>
28 |                 ref_for_image (dict)  : dictionary with key <image> and value <tokenized reference sentence>
29 |         :return: cider (float) : computed CIDEr score for the corpus 
30 |         """
31 | 
32 |         assert(gts.keys() == res.keys())
33 |         imgIds = gts.keys()
34 | 
35 |         cider_scorer = CiderScorer(n=self._n, sigma=self._sigma)
36 | 
37 |         for id in imgIds:
38 |             hypo = res[id]
39 |             ref = gts[id]
40 | 
41 |             # Sanity check.
42 |             assert(type(hypo) is list)
43 |             assert(len(hypo) == 1)
44 |             assert(type(ref) is list)
45 |             assert(len(ref) > 0)
46 | 
47 |             cider_scorer += (hypo[0], ref)
48 | 
49 |         (score, scores) = cider_scorer.compute_score()
50 | 
51 |         return score, scores
52 | 
53 |     def method(self):
54 |         return "CIDEr"


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Deconvolutional Paragraph Representation Learning
 2 | 
 3 | Implementations of the models in the paper "Deconvolutional Paragraph Representation Learning" by Yizhe Zhang, Dinghan Shen, Guoyin Wang, Zhe Gan, Ricardo Henao and Lawrence Carin, NIPS 2017
 4 | 
 5 | ## Prerequisite: 
 6 | * CUDA, cudnn
 7 | * Tensorflow (version >1.0). We used tensorflow 1.2.
 8 | Run: `pip install -r requirements.txt` to install requirements
 9 | 
10 | 
11 | ## Run 
12 | * Run: `python demo.py` for reconstruction task
13 | * Run: `python char_correction.py` for character-level correction task
14 | * Run: `python semi_supervised.py` for semi-supervised task
15 | * Options: options can be made by changing `option` class in the demo.py code. 
16 | 
17 | - `opt.n_hidden`: number of hidden units.
18 | - `opt.layer`: number of CNN/DCNN layer [2,3,4].
19 | - `opt.lr`: learning rate.
20 | - `opt.batch_size`: number of batchsize.
21 | 
22 | * Training roughly takes 6-7 hours (around 10-20 epochs) (for recontruction task) to converge on a K80 GPU machine.
23 | * See `output.txt` for a sample of screen output for reconstruction task.
24 | 
25 | ## Data: 
26 | * Download from :
27 | 	* Reconstruction: [Hotel review (1.52GB)](https://drive.google.com/file/d/0B52eYWrYWqIpQzhBNkVxaV9mMjQ/view)
28 | 	* Char-level correction: [Yahoo! review (character-level, 451MB)](https://drive.google.com/open?id=1kBIAWyi3kvcMme-_1q4OU881yWH_j3ki)
29 | 	* Semi-supervised classification: [Yelp review (629MB)](https://drive.google.com/open?id=1qKos_wB45MzMu7Sn8RdvE6SRVAKCTC6e)
30 | 
31 | 
32 | ## Citation 
33 | Please cite our paper if it helps with your research
34 | * Arxiv link: [https://arxiv.org/abs/1708.04729](https://arxiv.org/abs/1708.04729)
35 | ```latex
36 | @inproceedings{zhang2017deconvolutional,
37 |   title={Deconvolutional Paragraph Representation Learning},
38 |   author={Zhang, Yizhe and Shen, Dinghan and Wang, Guoyin and Gan, Zhe and Henao, Ricardo and Carin, Lawrence},
39 |   Booktitle={NIPS},
40 |   year={2017}
41 | }
42 | ```
43 | For any question or suggestions, feel free to contact yizhe.zhang@microsoft.com
44 | 


--------------------------------------------------------------------------------
/char_preprocessing.py:
--------------------------------------------------------------------------------
 1 | import cPickle
 2 | import pdb
 3 | import numpy as np
 4 | def idx2sent(text, alphabet):
 5 |     char_seq = []
 6 |     # print(text)
 7 |     for it in text:
 8 |         it_list = list(it)
 9 |         # padded = pad_sentence(it_list)
10 |         # text_int8_repr = string_to_int8_conversion(padded, alphabet)
11 |         text_int8_repr = string_to_int8_conversion(it_list, alphabet)
12 |         char_seq.append(text_int8_repr)
13 |     return char_seq
14 | 
15 | 
16 | def pad_sentence(char_seq, padding_char=" ", char_seq_length=301):
17 |     # char_seq_length = 1014
18 | 
19 |     num_padding = char_seq_length - len(char_seq)
20 | 
21 |     new_char_seq = char_seq + [padding_char] * num_padding
22 |     return new_char_seq
23 | 
24 | 
25 | def string_to_int8_conversion(char_seq, alphabet):
26 |     x = [alphabet.find(char) + 2 for char in char_seq]
27 |     # x = np.array([alphabet.find(char) for char in char_seq], dtype=np.int8)
28 |     return x
29 | 
30 | def prepare_data_for_charCNN(loadpath = "./data/yahoo4char.p"):
31 | 
32 |     x = cPickle.load(open(loadpath,"rb"))
33 | 
34 |     train, val, test                    = x[0], x[1], x[2]
35 |     train_text, val_text, test_text     = x[3], x[4], x[5]
36 |     train_lab, val_lab, test_lab        = x[6], x[7], x[8]
37 |     wordtoix, ixtoword                  = x[9], x[10]
38 | 
39 |     # alphabet = "abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}\n"
40 | 
41 |     alphabet = "abcdefghijklmnopqrstuvwxyz0,.!?()"
42 | 
43 | 
44 | 
45 |     train_char = idx2sent(train_text, alphabet)
46 |     val_char = idx2sent(val_text, alphabet)
47 |     test_char = idx2sent(test_text, alphabet)
48 |     chartoix = { c: i + 2 for i, c in enumerate(alphabet)} # make sure 0 is space
49 |     chartoix[' '] = 1
50 |     ixtochar = { i+2:c for i, c in enumerate(alphabet)}
51 |     ixtochar[1] = ' '
52 | 
53 | 
54 |     # add padding character
55 |     chartoix['N'] = 0
56 |     ixtochar[0] = 'N'
57 | 
58 |     with open('./data/yahoo_char.p', 'w+') as f:
59 |         cPickle.dump([train_char, val_char, test_char, train_text, val_text, test_text, train_lab, val_lab, test_lab, chartoix, ixtochar, alphabet, ], f)
60 | 
61 | if __name__ == '__main__':
62 | 
63 |     prepare_data_for_charCNN()
64 | 


--------------------------------------------------------------------------------
/pycocoevalcap/eval.py:
--------------------------------------------------------------------------------
 1 | __author__ = 'tylin'
 2 | from tokenizer.ptbtokenizer import PTBTokenizer
 3 | from bleu.bleu import Bleu
 4 | from meteor.meteor import Meteor
 5 | from rouge.rouge import Rouge
 6 | from cider.cider import Cider
 7 | 
 8 | class COCOEvalCap:
 9 |     def __init__(self, coco, cocoRes):
10 |         self.evalImgs = []
11 |         self.eval = {}
12 |         self.imgToEval = {}
13 |         self.coco = coco
14 |         self.cocoRes = cocoRes
15 |         self.params = {'image_id': coco.getImgIds()}
16 | 
17 |     def evaluate(self):
18 |         imgIds = self.params['image_id']
19 |         # imgIds = self.coco.getImgIds()
20 |         gts = {}
21 |         res = {}
22 |         for imgId in imgIds:
23 |             gts[imgId] = self.coco.imgToAnns[imgId]
24 |             res[imgId] = self.cocoRes.imgToAnns[imgId]
25 | 
26 |         # =================================================
27 |         # Set up scorers
28 |         # =================================================
29 |         print 'tokenization...'
30 |         tokenizer = PTBTokenizer()
31 |         gts  = tokenizer.tokenize(gts)
32 |         res = tokenizer.tokenize(res)
33 | 
34 |         # =================================================
35 |         # Set up scorers
36 |         # =================================================
37 |         print 'setting up scorers...'
38 |         scorers = [
39 |             (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
40 |             (Meteor(),"METEOR"),
41 |             (Rouge(), "ROUGE_L"),
42 |             (Cider(), "CIDEr")
43 |         ]
44 | 
45 |         # =================================================
46 |         # Compute scores
47 |         # =================================================
48 |         for scorer, method in scorers:
49 |             print 'computing %s score...'%(scorer.method())
50 |             score, scores = scorer.compute_score(gts, res)
51 |             if type(method) == list:
52 |                 for sc, scs, m in zip(score, scores, method):
53 |                     self.setEval(sc, m)
54 |                     self.setImgToEvalImgs(scs, gts.keys(), m)
55 |                     print "%s: %0.3f"%(m, sc)
56 |             else:
57 |                 self.setEval(score, method)
58 |                 self.setImgToEvalImgs(scores, gts.keys(), method)
59 |                 print "%s: %0.3f"%(method, score)
60 |         self.setEvalImgs()
61 | 
62 |     def setEval(self, score, method):
63 |         self.eval[method] = score
64 | 
65 |     def setImgToEvalImgs(self, scores, imgIds, method):
66 |         for imgId, score in zip(imgIds, scores):
67 |             if not imgId in self.imgToEval:
68 |                 self.imgToEval[imgId] = {}
69 |                 self.imgToEval[imgId]["image_id"] = imgId
70 |             self.imgToEval[imgId][method] = score
71 | 
72 |     def setEvalImgs(self):
73 |         self.evalImgs = [eval for imgId, eval in self.imgToEval.items()]


--------------------------------------------------------------------------------
/pycocoevalcap/tokenizer/ptbtokenizer.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # 
 3 | # File Name : ptbtokenizer.py
 4 | #
 5 | # Description : Do the PTB Tokenization and remove punctuations.
 6 | #
 7 | # Creation Date : 29-12-2014
 8 | # Last Modified : Thu Mar 19 09:53:35 2015
 9 | # Authors : Hao Fang <hfang@uw.edu> and Tsung-Yi Lin <tl483@cornell.edu>
10 | 
11 | import os
12 | import sys
13 | import subprocess
14 | import tempfile
15 | import itertools
16 | 
17 | # path to the stanford corenlp jar
18 | STANFORD_CORENLP_3_4_1_JAR = 'stanford-corenlp-3.4.1.jar'
19 | 
20 | # punctuations to be removed from the sentences
21 | PUNCTUATIONS = ["''", "'", "``", "`", "-LRB-", "-RRB-", "-LCB-", "-RCB-", \
22 |         ".", "?", "!", ",", ":", "-", "--", "...", ";"] 
23 | 
24 | class PTBTokenizer:
25 |     """Python wrapper of Stanford PTBTokenizer"""
26 | 
27 |     def tokenize(self, captions_for_image):
28 |         cmd = ['java', '-cp', STANFORD_CORENLP_3_4_1_JAR, \
29 |                 'edu.stanford.nlp.process.PTBTokenizer', \
30 |                 '-preserveLines', '-lowerCase']
31 | 
32 |         # ======================================================
33 |         # prepare data for PTB Tokenizer
34 |         # ======================================================
35 |         final_tokenized_captions_for_image = {}
36 |         image_id = [k for k, v in captions_for_image.items() for _ in range(len(v))]
37 |         sentences = '\n'.join([c['caption'].replace('\n', ' ') for k, v in captions_for_image.items() for c in v])
38 | 
39 |         # ======================================================
40 |         # save sentences to temporary file
41 |         # ======================================================
42 |         path_to_jar_dirname=os.path.dirname(os.path.abspath(__file__))
43 |         tmp_file = tempfile.NamedTemporaryFile(delete=False, dir=path_to_jar_dirname)
44 |         tmp_file.write(sentences)
45 |         tmp_file.close()
46 | 
47 |         # ======================================================
48 |         # tokenize sentence
49 |         # ======================================================
50 |         cmd.append(os.path.basename(tmp_file.name))
51 |         p_tokenizer = subprocess.Popen(cmd, cwd=path_to_jar_dirname, \
52 |                 stdout=subprocess.PIPE)
53 |         token_lines = p_tokenizer.communicate(input=sentences.rstrip())[0]
54 |         lines = token_lines.split('\n')
55 |         # remove temp file
56 |         os.remove(tmp_file.name)
57 | 
58 |         # ======================================================
59 |         # create dictionary for tokenized captions
60 |         # ======================================================
61 |         for k, line in zip(image_id, lines):
62 |             if not k in final_tokenized_captions_for_image:
63 |                 final_tokenized_captions_for_image[k] = []
64 |             tokenized_caption = ' '.join([w for w in line.rstrip().split(' ') \
65 |                     if w not in PUNCTUATIONS])
66 |             final_tokenized_captions_for_image[k].append(tokenized_caption)
67 | 
68 |         return final_tokenized_captions_for_image
69 | 


--------------------------------------------------------------------------------
/pycocoevalcap/meteor/meteor.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | # Python wrapper for METEOR implementation, by Xinlei Chen
 4 | # Acknowledge Michael Denkowski for the generous discussion and help 
 5 | 
 6 | import os
 7 | import sys
 8 | import subprocess
 9 | import threading
10 | 
11 | # Assumes meteor-1.5.jar is in the same directory as meteor.py.  Change as needed.
12 | METEOR_JAR = 'meteor-1.5.jar'
13 | # print METEOR_JAR
14 | 
15 | class Meteor:
16 | 
17 |     def __init__(self):
18 |         self.meteor_cmd = ['java', '-jar', '-Xmx2G', METEOR_JAR, \
19 |                 '-', '-', '-stdio', '-l', 'en', '-norm']
20 |         self.meteor_p = subprocess.Popen(self.meteor_cmd, \
21 |                 cwd=os.path.dirname(os.path.abspath(__file__)), \
22 |                 stdin=subprocess.PIPE, \
23 |                 stdout=subprocess.PIPE, \
24 |                 stderr=subprocess.PIPE)
25 |         # Used to guarantee thread safety
26 |         self.lock = threading.Lock()
27 | 
28 |     def compute_score(self, gts, res):
29 |         assert(gts.keys() == res.keys())
30 |         imgIds = gts.keys()
31 |         scores = []
32 | 
33 |         eval_line = 'EVAL'
34 |         self.lock.acquire()
35 |         for i in imgIds:
36 |             assert(len(res[i]) == 1)
37 |             stat = self._stat(res[i][0], gts[i])
38 |             eval_line += ' ||| {}'.format(stat)
39 | 
40 |         self.meteor_p.stdin.write('{}\n'.format(eval_line))
41 |         for i in range(0,len(imgIds)):
42 |             scores.append(float(self.meteor_p.stdout.readline().strip()))
43 |         score = float(self.meteor_p.stdout.readline().strip())
44 |         self.lock.release()
45 | 
46 |         return score, scores
47 | 
48 |     def method(self):
49 |         return "METEOR"
50 | 
51 |     def _stat(self, hypothesis_str, reference_list):
52 |         # SCORE ||| reference 1 words ||| reference n words ||| hypothesis words
53 |         hypothesis_str = hypothesis_str.replace('|||','').replace('  ',' ')
54 |         score_line = ' ||| '.join(('SCORE', ' ||| '.join(reference_list), hypothesis_str))
55 |         self.meteor_p.stdin.write('{}\n'.format(score_line))
56 |         return self.meteor_p.stdout.readline().strip()
57 | 
58 |     def _score(self, hypothesis_str, reference_list):
59 |         self.lock.acquire()
60 |         # SCORE ||| reference 1 words ||| reference n words ||| hypothesis words
61 |         hypothesis_str = hypothesis_str.replace('|||','').replace('  ',' ')
62 |         score_line = ' ||| '.join(('SCORE', ' ||| '.join(reference_list), hypothesis_str))
63 |         self.meteor_p.stdin.write('{}\n'.format(score_line))
64 |         stats = self.meteor_p.stdout.readline().strip()
65 |         eval_line = 'EVAL ||| {}'.format(stats)
66 |         # EVAL ||| stats 
67 |         self.meteor_p.stdin.write('{}\n'.format(eval_line))
68 |         score = float(self.meteor_p.stdout.readline().strip())
69 |         # bug fix: there are two values returned by the jar file, one average, and one all, so do it twice
70 |         # thanks for Andrej for pointing this out
71 |         score = float(self.meteor_p.stdout.readline().strip())
72 |         self.lock.release()
73 |         return score
74 |  
75 |     def __exit__(self):
76 |         self.lock.acquire()
77 |         self.meteor_p.stdin.close()
78 |         self.meteor_p.kill()
79 |         self.meteor_p.wait()
80 |         self.lock.release()
81 | 


--------------------------------------------------------------------------------
/denoise.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Yizhe Zhang
  3 | 
  4 | Perturbation to the input
  5 | """
  6 | import numpy as np
  7 | import os
  8 | import scipy.io as sio
  9 | from math import floor
 10 | import pdb
 11 | 
 12 | def add_noise(sents, opt):
 13 |     if opt.substitution == 's':
 14 |         sents_permutated= substitute_sent(sents, opt)
 15 |     elif opt.substitution == 'p':
 16 |         sents_permutated= permutate_sent(sents, opt)
 17 |     elif opt.substitution == 'a':
 18 |         sents_permutated= add_sent(sents, opt)   
 19 |     elif opt.substitution == 'd':
 20 |         sents_permutated= delete_sent(sents, opt) 
 21 |     elif opt.substitution == 'm':
 22 |         sents_permutated= mixed_noise_sent(sents, opt)
 23 |     elif opt.substitution == 'sc':
 24 |         sents_permutated = substitute_sent_char(sents, opt)
 25 |     else:
 26 |         sents_permutated= sents
 27 |         
 28 |     return sents_permutated
 29 | 
 30 | 
 31 | def permutate_sent(sents, opt):
 32 |     sents_p = []
 33 |     for ss in range(len(sents)):
 34 |         sent_temp = sents[ss][:]
 35 |         if len(sent_temp) <= 1: 
 36 |             sents_p.append(sent_temp)
 37 |             continue
 38 |         idx_s= np.random.choice(len(sent_temp)-1, size=opt.permutation, replace=True)
 39 |         temp = sent_temp[idx_s[0]]
 40 |         for ii in range(opt.permutation-1):
 41 |             sent_temp[idx_s[ii]] = sent_temp[idx_s[ii+1]]
 42 |         sent_temp[idx_s[opt.permutation-1]] = temp
 43 |         sents_p.append(sent_temp)
 44 |     return sents_p
 45 |     
 46 |     
 47 | def substitute_sent(sents, opt):
 48 |     # substitute single word 
 49 |     sents_p = []
 50 |     for ss in range(len(sents)):
 51 |         sent_temp = sents[ss][:]
 52 |         if len(sent_temp) <= 1: 
 53 |             sents_p.append(sent_temp)
 54 |             continue
 55 |         idx_s= np.random.choice(len(sent_temp)-1, size=opt.permutation, replace=True)   
 56 |         for ii in range(opt.permutation):
 57 |             sent_temp[idx_s[ii]] = np.random.choice(opt.n_words)
 58 |         sents_p.append(sent_temp)
 59 |     return sents_p       
 60 | 
 61 | def delete_sent(sents, opt):
 62 |     # substitute single word 
 63 |     sents_p = []
 64 |     for ss in range(len(sents)):
 65 |         sent_temp = sents[ss][:]
 66 |         if len(sent_temp) <= 1: 
 67 |             sents_p.append(sent_temp)
 68 |             continue
 69 |         idx_s= np.random.choice(len(sent_temp)-1, size=opt.permutation, replace=True)   
 70 |         for ii in range(opt.permutation):
 71 |             sent_temp[idx_s[ii]] = -1
 72 |         sents_p.append([s for s in sent_temp if s!=-1])
 73 |     return sents_p 
 74 |     
 75 | def add_sent(sents, opt):
 76 |     # substitute single word 
 77 |     sents_p = []
 78 |     for ss in range(len(sents)):
 79 |         sent_temp = sents[ss][:]
 80 |         if len(sent_temp) <= 1: 
 81 |             sents_p.append(sent_temp)
 82 |             continue
 83 |         idx_s= np.random.choice(len(sent_temp)-1, size=opt.permutation, replace=True)   
 84 |         for ii in range(opt.permutation):
 85 |             sent_temp.insert(idx_s[ii], np.random.choice(opt.n_words))
 86 |         sents_p.append(sent_temp[:opt.maxlen])
 87 |     return sents_p  
 88 | 
 89 | 
 90 | def mixed_noise_sent(sents, opt):
 91 |     sents = delete_sent(sents, opt)
 92 |     sents = add_sent(sents, opt)
 93 |     sents = substitute_sent(sents, opt)
 94 |     return sents
 95 |     
 96 | def substitute_sent_char(sents, opt):
 97 |     # substitute single word
 98 |     sents_p = []
 99 |     for ss in range(len(sents)):
100 |         sent_temp = sents[ss][:]
101 |         if len(sent_temp) <= 1: 
102 |             sents_p.append(sent_temp)
103 |             continue
104 |         permute_choice = [ic for ic in range(len(sent_temp)) if sent_temp[ic] != 1]
105 |         idx_s= np.random.choice(permute_choice, size=int(opt.permutation * (len(permute_choice))), replace=True)
106 | 
107 |         for ii in range(len(idx_s)):
108 |             sent_temp[idx_s[ii]] = np.random.choice(list(range(2,28)))
109 |         sents_p.append(sent_temp)
110 |     return sents_p


--------------------------------------------------------------------------------
/pycocoevalcap/rouge/rouge.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # 
  3 | # File Name : rouge.py
  4 | #
  5 | # Description : Computes ROUGE-L metric as described by Lin and Hovey (2004)
  6 | #
  7 | # Creation Date : 2015-01-07 06:03
  8 | # Author : Ramakrishna Vedantam <vrama91@vt.edu>
  9 | 
 10 | import numpy as np
 11 | import pdb
 12 | 
 13 | def my_lcs(string, sub):
 14 |     """
 15 |     Calculates longest common subsequence for a pair of tokenized strings
 16 |     :param string : list of str : tokens from a string split using whitespace
 17 |     :param sub : list of str : shorter string, also split using whitespace
 18 |     :returns: length (list of int): length of the longest common subsequence between the two strings
 19 | 
 20 |     Note: my_lcs only gives length of the longest common subsequence, not the actual LCS
 21 |     """
 22 |     if(len(string)< len(sub)):
 23 |         sub, string = string, sub
 24 | 
 25 |     lengths = [[0 for i in range(0,len(sub)+1)] for j in range(0,len(string)+1)]
 26 | 
 27 |     for j in range(1,len(sub)+1):
 28 |         for i in range(1,len(string)+1):
 29 |             if(string[i-1] == sub[j-1]):
 30 |                 lengths[i][j] = lengths[i-1][j-1] + 1
 31 |             else:
 32 |                 lengths[i][j] = max(lengths[i-1][j] , lengths[i][j-1])
 33 | 
 34 |     return lengths[len(string)][len(sub)]
 35 | 
 36 | class Rouge():
 37 |     '''
 38 |     Class for computing ROUGE-L score for a set of candidate sentences for the MS COCO test set
 39 | 
 40 |     '''
 41 |     def __init__(self):
 42 |         # vrama91: updated the value below based on discussion with Hovey
 43 |         self.beta = 1.2
 44 | 
 45 |     def calc_score(self, candidate, refs):
 46 |         """
 47 |         Compute ROUGE-L score given one candidate and references for an image
 48 |         :param candidate: str : candidate sentence to be evaluated
 49 |         :param refs: list of str : COCO reference sentences for the particular image to be evaluated
 50 |         :returns score: int (ROUGE-L score for the candidate evaluated against references)
 51 |         """
 52 |         assert(len(candidate)==1)	
 53 |         assert(len(refs)>0)         
 54 |         prec = []
 55 |         rec = []
 56 | 
 57 |         # split into tokens
 58 |         token_c = candidate[0].split(" ")
 59 |     	
 60 |         for reference in refs:
 61 |             # split into tokens
 62 |             token_r = reference.split(" ")
 63 |             # compute the longest common subsequence
 64 |             lcs = my_lcs(token_r, token_c)
 65 |             prec.append(lcs/float(len(token_c)))
 66 |             rec.append(lcs/float(len(token_r)))
 67 | 
 68 |         prec_max = max(prec)
 69 |         rec_max = max(rec)
 70 | 
 71 |         if(prec_max!=0 and rec_max !=0):
 72 |             score = ((1 + self.beta**2)*prec_max*rec_max)/float(rec_max + self.beta**2*prec_max)
 73 |         else:
 74 |             score = 0.0
 75 |         return score
 76 | 
 77 |     def compute_score(self, gts, res):
 78 |         """
 79 |         Computes Rouge-L score given a set of reference and candidate sentences for the dataset
 80 |         Invoked by evaluate_captions.py 
 81 |         :param hypo_for_image: dict : candidate / test sentences with "image name" key and "tokenized sentences" as values 
 82 |         :param ref_for_image: dict : reference MS-COCO sentences with "image name" key and "tokenized sentences" as values
 83 |         :returns: average_score: float (mean ROUGE-L score computed by averaging scores for all the images)
 84 |         """
 85 |         assert(gts.keys() == res.keys())
 86 |         imgIds = gts.keys()
 87 | 
 88 |         score = []
 89 |         for id in imgIds:
 90 |             hypo = res[id]
 91 |             ref  = gts[id]
 92 | 
 93 |             score.append(self.calc_score(hypo, ref))
 94 | 
 95 |             # Sanity check.
 96 |             assert(type(hypo) is list)
 97 |             assert(len(hypo) == 1)
 98 |             assert(type(ref) is list)
 99 |             assert(len(ref) > 0)
100 | 
101 |         average_score = np.mean(np.array(score))
102 |         return average_score, np.array(score)
103 | 
104 |     def method(self):
105 |         return "Rouge"
106 | 


--------------------------------------------------------------------------------
/rougescore.py:
--------------------------------------------------------------------------------
  1 | from __future__ import division
  2 | import collections
  3 | 
  4 | import six
  5 | 
  6 | def _ngrams(words, n):
  7 |     queue = collections.deque(maxlen=n)
  8 |     for w in words:
  9 |         queue.append(w)
 10 |         if len(queue) == n:
 11 |             yield tuple(queue)
 12 | 
 13 | def _ngram_counts(words, n):
 14 |     return collections.Counter(_ngrams(words, n))
 15 | 
 16 | def _ngram_count(words, n):
 17 |     return max(len(words) - n + 1, 0)
 18 | 
 19 | def _counter_overlap(counter1, counter2):
 20 |     result = 0
 21 |     for k, v in six.iteritems(counter1):
 22 |         result += min(v, counter2[k])
 23 |     return result
 24 | 
 25 | def _safe_divide(numerator, denominator):
 26 |     if denominator > 0:
 27 |         return numerator / denominator
 28 |     else:
 29 |         return 0
 30 | 
 31 | def _safe_f1(matches, recall_total, precision_total, alpha):
 32 |     recall_score = _safe_divide(matches, recall_total)
 33 |     precision_score = _safe_divide(matches, precision_total)
 34 |     denom = (1.0 - alpha) * precision_score + alpha * recall_score
 35 |     if denom > 0.0:
 36 |         return (precision_score * recall_score) / denom
 37 |     else:
 38 |         return 0.0
 39 | 
 40 | def rouge_n(peer, models, n, alpha):
 41 |     """
 42 |     Compute the ROUGE-N score of a peer with respect to one or more models, for
 43 |     a given value of `n`.
 44 |     """
 45 |     matches = 0
 46 |     recall_total = 0
 47 |     peer_counter = _ngram_counts(peer, n)
 48 |     for model in models:
 49 |         model_counter = _ngram_counts(model, n)
 50 |         matches += _counter_overlap(peer_counter, model_counter)
 51 |         recall_total += _ngram_count(model, n)
 52 |     precision_total = len(models) * _ngram_count(peer, n)
 53 |     return _safe_f1(matches, recall_total, precision_total, alpha)
 54 | 
 55 | def rouge_1(peer, models, alpha):
 56 |     """
 57 |     Compute the ROUGE-1 (unigram) score of a peer with respect to one or more
 58 |     models.
 59 |     """
 60 |     return rouge_n(peer, models, 1, alpha)
 61 | 
 62 | def rouge_2(peer, models, alpha):
 63 |     """
 64 |     Compute the ROUGE-2 (bigram) score of a peer with respect to one or more
 65 |     models.
 66 |     """
 67 |     return rouge_n(peer, models, 2, alpha)
 68 | 
 69 | def rouge_3(peer, models, alpha):
 70 |     """
 71 |     Compute the ROUGE-3 (trigram) score of a peer with respect to one or more
 72 |     models.
 73 |     """
 74 |     return rouge_n(peer, models, 3, alpha)
 75 | 
 76 | def lcs(a, b):
 77 |     """
 78 |     Compute the length of the longest common subsequence between two sequences.
 79 | 
 80 |     Time complexity: O(len(a) * len(b))
 81 |     Space complexity: O(min(len(a), len(b)))
 82 |     """
 83 |     # This is an adaptation of the standard LCS dynamic programming algorithm
 84 |     # tweaked for lower memory consumption.
 85 |     # Sequence a is laid out along the rows, b along the columns.
 86 |     # Minimize number of columns to minimize required memory
 87 |     if len(a) < len(b):
 88 |         a, b = b, a
 89 |     # Sequence b now has the minimum length
 90 |     # Quit early if one sequence is empty
 91 |     if len(b) == 0:
 92 |         return 0
 93 |     # Use a single buffer to store the counts for the current row, and
 94 |     # overwrite it on each pass
 95 |     row = [0] * len(b)
 96 |     for ai in a:
 97 |         left = 0
 98 |         diag = 0
 99 |         for j, bj in enumerate(b):
100 |             up = row[j]
101 |             if ai == bj:
102 |                 value = diag + 1
103 |             else:
104 |                 value = max(left, up)
105 |             row[j] = value
106 |             left = value
107 |             diag = up
108 |     # Return the last cell of the last row
109 |     return left
110 | 
111 | def rouge_l(peer, models, alpha):
112 |     """
113 |     Compute the ROUGE-L score of a peer with respect to one or more models.
114 |     """
115 |     matches = 0
116 |     recall_total = 0
117 |     for model in models:
118 |         matches += lcs(model, peer)
119 |         recall_total += len(model)
120 |     precision_total = len(models) * len(peer)
121 |     return _safe_f1(matches, recall_total, precision_total, alpha)
122 | 


--------------------------------------------------------------------------------
/error_rate.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | import numpy
  3 | from collections import defaultdict
  4 | 
  5 | COPY = 0
  6 | INSERTION = 1
  7 | DELETION = 2
  8 | SUBSTITUTION = 3
  9 | 
 10 | INFINITY = 10 ** 9
 11 | 
 12 | 
 13 | def _edit_distance_matrix(y, y_hat, special_tokens=None):
 14 |     """Returns the matrix of edit distances.
 15 |     Parameters
 16 |     ----------
 17 |     Returns
 18 |     -------
 19 |     dist : numpy.ndarray
 20 |         dist[i, j] is the edit distance between the first
 21 |     action : numpy.ndarray
 22 |         action[i, j] is the action applied to y_hat[j - 1]  in a chain of
 23 |         optimal actions transducing y_hat[:j] into y[:i].
 24 |         i characters of y and the first j characters of y_hat.
 25 |     special_tokens : set
 26 |         Tokens, for which deletion is free.
 27 |     """
 28 |     if not special_tokens:
 29 |         special_tokens = set()
 30 |     dist = numpy.zeros((len(y) + 1, len(y_hat) + 1), dtype='int64')
 31 |     insertion_cost = numpy.ones(len(y))
 32 |     deletion_cost = numpy.ones(len(y_hat))
 33 |     for i in range(len(y)):
 34 |         if y[i] in special_tokens:
 35 |             insertion_cost[i] = 0
 36 |     for j in range(len(y_hat)):
 37 |         if y_hat[j] in special_tokens:
 38 |             deletion_cost[j] = 0
 39 |     dist[1:, 0] = insertion_cost.cumsum()
 40 |     dist[0, 1:] = deletion_cost.cumsum()
 41 | 
 42 |     for i in xrange(1, len(y) + 1):
 43 |         for j in xrange(1, len(y_hat) + 1):
 44 |             if y[i - 1] != y_hat[j - 1]:
 45 |                 cost = 1
 46 |             else:
 47 |                 cost = 0
 48 |             insertion_dist = dist[i - 1][j] + insertion_cost[i - 1]
 49 |             deletion_dist = dist[i][j - 1] + deletion_cost[j - 1]
 50 |             substitution_dist = dist[i - 1][j - 1] + 1 if cost else INFINITY
 51 |             copy_dist = dist[i - 1][j - 1] if not cost else INFINITY
 52 |             best = min(insertion_dist, deletion_dist,
 53 |                        substitution_dist, copy_dist)
 54 | 
 55 |             dist[i][j] = best
 56 | 
 57 |     return dist
 58 | 
 59 | 
 60 | def _bleu(y, y_hat, n=4):
 61 |     """ BLEU score between the reference sequence y
 62 |     and y_hat for each partial sequence ranging
 63 |     from the first input token to the last
 64 |     Parameters
 65 |     ----------
 66 |     y : vector
 67 |         The reference matrix with dimensions of number
 68 |         of words (rows) by batch size (columns)
 69 |     y_hat : vector
 70 |         The predicted matrix with same dimensions
 71 |     n : integer
 72 |         highest n-gram order in the Bleu sense
 73 |         (e.g Bleu-4)
 74 |     Returns
 75 |     -------
 76 |     results : vector (len y_hat)
 77 |         Bleu scores for each partial sequence
 78 |         y_hat_1..T from T = 1 to len(y_hat)
 79 |     """
 80 |     bleu_scores = numpy.zeros((len(y_hat), n))
 81 | 
 82 |     # count reference ngrams
 83 |     ref_counts = defaultdict(int)
 84 |     for k in xrange(1, n+1):
 85 |         for i in xrange(len(y) - k + 1):
 86 |             ref_counts[tuple(y[i:i + k])] += 1
 87 | 
 88 |     # for each partial sequence, 1) compute addition to # of correct
 89 |     # 2) apply brevity penalty
 90 |     # ngrams, magic stability numbers from pycocoeval
 91 |     ref_len = len(y)
 92 |     pred_counts = defaultdict(int)
 93 |     correct = numpy.zeros(4)
 94 |     for i in xrange(1, len(y_hat) + 1):
 95 |         for k in xrange(i, max(-1, i - n), -1):
 96 |             # print i, k
 97 |             ngram = tuple(y_hat[k-1:i])
 98 |             # UNK token hack. Must work for both indices
 99 |             # and words. Very ugly, I know.
100 |             if 0 in ngram or 'UNK' in ngram:
101 |                 continue
102 |             pred_counts[ngram] += 1
103 |             if pred_counts[ngram] <= ref_counts.get(ngram, 0):
104 |                 correct[len(ngram)-1] += 1
105 | 
106 |         # compute partial bleu score
107 |         bleu = 1.
108 |         for j in xrange(n):
109 |             possible = max(0, i - j)
110 |             bleu *= float(correct[j] + 1.) / (possible + 1.)
111 |             bleu_scores[i - 1, j] = bleu ** (1./(j+1))
112 | 
113 |         # brevity penalty
114 |         if i < ref_len:
115 |             ratio = (i + 1e-15)/(ref_len + 1e-9)
116 |             bleu_scores[i - 1, :] *= math.exp(1 - 1/ratio)
117 | 
118 |     return bleu_scores.astype('float32'), correct, pred_counts, ref_counts
119 | 
120 | 
121 | def edit_distance(y, y_hat):
122 |     """Edit distance between two sequences.
123 |     Parameters
124 |     ----------
125 |     y : str
126 |         The groundtruth.
127 |     y_hat : str
128 |         The recognition candidate.
129 |    the minimum number of symbol edits (i.e. insertions,
130 |     deletions or substitutions) required to change one
131 |     word into the other.
132 |     """
133 |     return _edit_distance_matrix(y, y_hat)[-1, -1]
134 | 
135 | 
136 | def wer(y, y_hat):
137 |     return edit_distance(y, y_hat) / float(len(y))
138 | 
139 | def cal_cer(y_total, y_hat_total):
140 |     error = 0
141 |     length = 0
142 |     for y, y_hat in zip(y_total, y_hat_total):
143 |         y_len = len(''.join(y.split()))
144 |         error += min(y_len/float((len(y))), wer(y, y_hat) )* len(y)
145 |         length += y_len
146 |     return error/length
147 | 
148 | 
149 | 
150 | 
151 | 
152 | def reward_matrix(y, y_hat, alphabet, eos_label):
153 |     dist, _,  = _edit_distance_matrix(y, y_hat)
154 |     y_alphabet_indices = [alphabet.index(c) for c in y]
155 |     if y_alphabet_indices[-1] != eos_label:
156 |         raise ValueError("Last character of the groundtruth must be EOS")
157 | 
158 |     # Optimistic edit distance for every y_hat prefix
159 |     optim_dist = dist.min(axis=0)
160 |     pess_reward = -optim_dist
161 | 
162 |     # Optimistic edit distance for every y_hat prefix plus a character
163 |     optim_dist_char = numpy.tile(
164 |         optim_dist[:, None], [1, len(alphabet)]) + 1
165 |     pess_char_reward = numpy.tile(
166 |         pess_reward[:, None], [1, len(alphabet)]) - 1
167 |     for i in range(len(y)):
168 |         for j in range(len(y_hat) + 1):
169 |             c = y_alphabet_indices[i]
170 |             cand_dist = dist[i, j]
171 |             if cand_dist < optim_dist_char[j, c]:
172 |                 optim_dist_char[j, c] = cand_dist
173 |                 pess_char_reward[j, c] = -cand_dist
174 |     for j in range(len(y_hat) + 1):
175 |         # Here we rely on y[-1] being eos_label
176 |         pess_char_reward[j, eos_label] = -dist[len(y) - 1, j]
177 |     return pess_char_reward
178 | 
179 | def gain_matrix(y, y_hat, alphabet=None, given_reward_matrix=None,
180 |                 eos_label=None):
181 |     y_hat_indices = [alphabet.index(c) for c in y_hat]
182 |     reward = (given_reward_matrix.copy() if given_reward_matrix is not None
183 |               else reward_matrix(y, y_hat, alphabet, eos_label))
184 |     reward[1:] -= reward[:-1][numpy.arange(len(y_hat)), y_hat_indices][:, None]
185 |     return reward
186 | 
187 | 
188 | def prepare_for_cer(sentence, ixtoword):
189 |     sent=[x for x in sentence if x!=0]
190 |     while len(sent)<4:
191 |         sent.append(0)
192 |     #sent = ' '.join([ixtoword[x] for x in sent])
193 |     sent = ''.join([ixtoword[x] for x in sent])
194 |     return sent
195 | 


--------------------------------------------------------------------------------
/pycocoevalcap/cider/cider_scorer.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # Tsung-Yi Lin <tl483@cornell.edu>
  3 | # Ramakrishna Vedantam <vrama91@vt.edu>
  4 | 
  5 | import copy
  6 | from collections import defaultdict
  7 | import numpy as np
  8 | import pdb
  9 | import math
 10 | 
 11 | def precook(s, n=4, out=False):
 12 |     """
 13 |     Takes a string as input and returns an object that can be given to
 14 |     either cook_refs or cook_test. This is optional: cook_refs and cook_test
 15 |     can take string arguments as well.
 16 |     :param s: string : sentence to be converted into ngrams
 17 |     :param n: int    : number of ngrams for which representation is calculated
 18 |     :return: term frequency vector for occuring ngrams
 19 |     """
 20 |     words = s.split()
 21 |     counts = defaultdict(int)
 22 |     for k in xrange(1,n+1):
 23 |         for i in xrange(len(words)-k+1):
 24 |             ngram = tuple(words[i:i+k])
 25 |             counts[ngram] += 1
 26 |     return counts
 27 | 
 28 | def cook_refs(refs, n=4): ## lhuang: oracle will call with "average"
 29 |     '''Takes a list of reference sentences for a single segment
 30 |     and returns an object that encapsulates everything that BLEU
 31 |     needs to know about them.
 32 |     :param refs: list of string : reference sentences for some image
 33 |     :param n: int : number of ngrams for which (ngram) representation is calculated
 34 |     :return: result (list of dict)
 35 |     '''
 36 |     return [precook(ref, n) for ref in refs]
 37 | 
 38 | def cook_test(test, n=4):
 39 |     '''Takes a test sentence and returns an object that
 40 |     encapsulates everything that BLEU needs to know about it.
 41 |     :param test: list of string : hypothesis sentence for some image
 42 |     :param n: int : number of ngrams for which (ngram) representation is calculated
 43 |     :return: result (dict)
 44 |     '''
 45 |     return precook(test, n, True)
 46 | 
 47 | class CiderScorer(object):
 48 |     """CIDEr scorer.
 49 |     """
 50 | 
 51 |     def copy(self):
 52 |         ''' copy the refs.'''
 53 |         new = CiderScorer(n=self.n)
 54 |         new.ctest = copy.copy(self.ctest)
 55 |         new.crefs = copy.copy(self.crefs)
 56 |         return new
 57 | 
 58 |     def __init__(self, test=None, refs=None, n=4, sigma=6.0):
 59 |         ''' singular instance '''
 60 |         self.n = n
 61 |         self.sigma = sigma
 62 |         self.crefs = []
 63 |         self.ctest = []
 64 |         self.document_frequency = defaultdict(float)
 65 |         self.cook_append(test, refs)
 66 |         self.ref_len = None
 67 | 
 68 |     def cook_append(self, test, refs):
 69 |         '''called by constructor and __iadd__ to avoid creating new instances.'''
 70 | 
 71 |         if refs is not None:
 72 |             self.crefs.append(cook_refs(refs))
 73 |             if test is not None:
 74 |                 self.ctest.append(cook_test(test)) ## N.B.: -1
 75 |             else:
 76 |                 self.ctest.append(None) # lens of crefs and ctest have to match
 77 | 
 78 |     def size(self):
 79 |         assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest))
 80 |         return len(self.crefs)
 81 | 
 82 |     def __iadd__(self, other):
 83 |         '''add an instance (e.g., from another sentence).'''
 84 | 
 85 |         if type(other) is tuple:
 86 |             ## avoid creating new CiderScorer instances
 87 |             self.cook_append(other[0], other[1])
 88 |         else:
 89 |             self.ctest.extend(other.ctest)
 90 |             self.crefs.extend(other.crefs)
 91 | 
 92 |         return self
 93 |     def compute_doc_freq(self):
 94 |         '''
 95 |         Compute term frequency for reference data.
 96 |         This will be used to compute idf (inverse document frequency later)
 97 |         The term frequency is stored in the object
 98 |         :return: None
 99 |         '''
100 |         for refs in self.crefs:
101 |             # refs, k ref captions of one image
102 |             for ngram in set([ngram for ref in refs for (ngram,count) in ref.iteritems()]):
103 |                 self.document_frequency[ngram] += 1
104 |             # maxcounts[ngram] = max(maxcounts.get(ngram,0), count)
105 | 
106 |     def compute_cider(self):
107 |         def counts2vec(cnts):
108 |             """
109 |             Function maps counts of ngram to vector of tfidf weights.
110 |             The function returns vec, an array of dictionary that store mapping of n-gram and tf-idf weights.
111 |             The n-th entry of array denotes length of n-grams.
112 |             :param cnts:
113 |             :return: vec (array of dict), norm (array of float), length (int)
114 |             """
115 |             vec = [defaultdict(float) for _ in range(self.n)]
116 |             length = 0
117 |             norm = [0.0 for _ in range(self.n)]
118 |             for (ngram,term_freq) in cnts.iteritems():
119 |                 # give word count 1 if it doesn't appear in reference corpus
120 |                 df = np.log(max(1.0, self.document_frequency[ngram]))
121 |                 # ngram index
122 |                 n = len(ngram)-1
123 |                 # tf (term_freq) * idf (precomputed idf) for n-grams
124 |                 vec[n][ngram] = float(term_freq)*(self.ref_len - df)
125 |                 # compute norm for the vector.  the norm will be used for computing similarity
126 |                 norm[n] += pow(vec[n][ngram], 2)
127 | 
128 |                 if n == 1:
129 |                     length += term_freq
130 |             norm = [np.sqrt(n) for n in norm]
131 |             return vec, norm, length
132 | 
133 |         def sim(vec_hyp, vec_ref, norm_hyp, norm_ref, length_hyp, length_ref):
134 |             '''
135 |             Compute the cosine similarity of two vectors.
136 |             :param vec_hyp: array of dictionary for vector corresponding to hypothesis
137 |             :param vec_ref: array of dictionary for vector corresponding to reference
138 |             :param norm_hyp: array of float for vector corresponding to hypothesis
139 |             :param norm_ref: array of float for vector corresponding to reference
140 |             :param length_hyp: int containing length of hypothesis
141 |             :param length_ref: int containing length of reference
142 |             :return: array of score for each n-grams cosine similarity
143 |             '''
144 |             delta = float(length_hyp - length_ref)
145 |             # measure consine similarity
146 |             val = np.array([0.0 for _ in range(self.n)])
147 |             for n in range(self.n):
148 |                 # ngram
149 |                 for (ngram,count) in vec_hyp[n].iteritems():
150 |                     # vrama91 : added clipping
151 |                     val[n] += min(vec_hyp[n][ngram], vec_ref[n][ngram]) * vec_ref[n][ngram]
152 | 
153 |                 if (norm_hyp[n] != 0) and (norm_ref[n] != 0):
154 |                     val[n] /= (norm_hyp[n]*norm_ref[n])
155 | 
156 |                 assert(not math.isnan(val[n]))
157 |                 # vrama91: added a length based gaussian penalty
158 |                 val[n] *= np.e**(-(delta**2)/(2*self.sigma**2))
159 |             return val
160 | 
161 |         # compute log reference length
162 |         self.ref_len = np.log(float(len(self.crefs)))
163 | 
164 |         scores = []
165 |         for test, refs in zip(self.ctest, self.crefs):
166 |             # compute vector for test captions
167 |             vec, norm, length = counts2vec(test)
168 |             # compute vector for ref captions
169 |             score = np.array([0.0 for _ in range(self.n)])
170 |             for ref in refs:
171 |                 vec_ref, norm_ref, length_ref = counts2vec(ref)
172 |                 score += sim(vec, vec_ref, norm, norm_ref, length, length_ref)
173 |             # change by vrama91 - mean of ngram scores, instead of sum
174 |             score_avg = np.mean(score)
175 |             # divide by number of references
176 |             score_avg /= len(refs)
177 |             # multiply score by 10
178 |             score_avg *= 10.0
179 |             # append score of an image to the score list
180 |             scores.append(score_avg)
181 |         return scores
182 | 
183 |     def compute_score(self, option=None, verbose=0):
184 |         # compute idf
185 |         self.compute_doc_freq()
186 |         # assert to check document frequency
187 |         assert(len(self.ctest) >= max(self.document_frequency.values()))
188 |         # compute cider score
189 |         score = self.compute_cider()
190 |         # debug
191 |         # print score
192 |         return np.mean(np.array(score)), np.array(score)


--------------------------------------------------------------------------------
/pycocoevalcap/bleu/bleu_scorer.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | # bleu_scorer.py
  4 | # David Chiang <chiang@isi.edu>
  5 | 
  6 | # Copyright (c) 2004-2006 University of Maryland. All rights
  7 | # reserved. Do not redistribute without permission from the
  8 | # author. Not for commercial use.
  9 | 
 10 | # Modified by: 
 11 | # Hao Fang <hfang@uw.edu>
 12 | # Tsung-Yi Lin <tl483@cornell.edu>
 13 | 
 14 | '''Provides:
 15 | cook_refs(refs, n=4): Transform a list of reference sentences as strings into a form usable by cook_test().
 16 | cook_test(test, refs, n=4): Transform a test sentence as a string (together with the cooked reference sentences) into a form usable by score_cooked().
 17 | '''
 18 | 
 19 | import copy
 20 | import sys, math, re
 21 | from collections import defaultdict
 22 | 
 23 | def precook(s, n=4, out=False):
 24 |     """Takes a string as input and returns an object that can be given to
 25 |     either cook_refs or cook_test. This is optional: cook_refs and cook_test
 26 |     can take string arguments as well."""
 27 |     words = s.split()
 28 |     counts = defaultdict(int)
 29 |     for k in xrange(1,n+1):
 30 |         for i in xrange(len(words)-k+1):
 31 |             ngram = tuple(words[i:i+k])
 32 |             counts[ngram] += 1
 33 |     return (len(words), counts)
 34 | 
 35 | def cook_refs(refs, eff=None, n=4): ## lhuang: oracle will call with "average"
 36 |     '''Takes a list of reference sentences for a single segment
 37 |     and returns an object that encapsulates everything that BLEU
 38 |     needs to know about them.'''
 39 | 
 40 |     reflen = []
 41 |     maxcounts = {}
 42 |     for ref in refs:
 43 |         rl, counts = precook(ref, n)
 44 |         reflen.append(rl)
 45 |         for (ngram,count) in counts.iteritems():
 46 |             maxcounts[ngram] = max(maxcounts.get(ngram,0), count)
 47 | 
 48 |     # Calculate effective reference sentence length.
 49 |     if eff == "shortest":
 50 |         reflen = min(reflen)
 51 |     elif eff == "average":
 52 |         reflen = float(sum(reflen))/len(reflen)
 53 | 
 54 |     ## lhuang: N.B.: leave reflen computaiton to the very end!!
 55 |     
 56 |     ## lhuang: N.B.: in case of "closest", keep a list of reflens!! (bad design)
 57 | 
 58 |     return (reflen, maxcounts)
 59 | 
 60 | def cook_test(test, (reflen, refmaxcounts), eff=None, n=4):
 61 |     '''Takes a test sentence and returns an object that
 62 |     encapsulates everything that BLEU needs to know about it.'''
 63 | 
 64 |     testlen, counts = precook(test, n, True)
 65 | 
 66 |     result = {}
 67 | 
 68 |     # Calculate effective reference sentence length.
 69 |     
 70 |     if eff == "closest":
 71 |         result["reflen"] = min((abs(l-testlen), l) for l in reflen)[1]
 72 |     else: ## i.e., "average" or "shortest" or None
 73 |         result["reflen"] = reflen
 74 | 
 75 |     result["testlen"] = testlen
 76 | 
 77 |     result["guess"] = [max(0,testlen-k+1) for k in xrange(1,n+1)]
 78 | 
 79 |     result['correct'] = [0]*n
 80 |     for (ngram, count) in counts.iteritems():
 81 |         result["correct"][len(ngram)-1] += min(refmaxcounts.get(ngram,0), count)
 82 | 
 83 |     return result
 84 | 
 85 | class BleuScorer(object):
 86 |     """Bleu scorer.
 87 |     """
 88 | 
 89 |     __slots__ = "n", "crefs", "ctest", "_score", "_ratio", "_testlen", "_reflen", "special_reflen"
 90 |     # special_reflen is used in oracle (proportional effective ref len for a node).
 91 | 
 92 |     def copy(self):
 93 |         ''' copy the refs.'''
 94 |         new = BleuScorer(n=self.n)
 95 |         new.ctest = copy.copy(self.ctest)
 96 |         new.crefs = copy.copy(self.crefs)
 97 |         new._score = None
 98 |         return new
 99 | 
100 |     def __init__(self, test=None, refs=None, n=4, special_reflen=None):
101 |         ''' singular instance '''
102 | 
103 |         self.n = n
104 |         self.crefs = []
105 |         self.ctest = []
106 |         self.cook_append(test, refs)
107 |         self.special_reflen = special_reflen
108 | 
109 |     def cook_append(self, test, refs):
110 |         '''called by constructor and __iadd__ to avoid creating new instances.'''
111 |         
112 |         if refs is not None:
113 |             self.crefs.append(cook_refs(refs))
114 |             if test is not None:
115 |                 cooked_test = cook_test(test, self.crefs[-1])
116 |                 self.ctest.append(cooked_test) ## N.B.: -1
117 |             else:
118 |                 self.ctest.append(None) # lens of crefs and ctest have to match
119 | 
120 |         self._score = None ## need to recompute
121 | 
122 |     def ratio(self, option=None):
123 |         self.compute_score(option=option)
124 |         return self._ratio
125 | 
126 |     def score_ratio(self, option=None):
127 |         '''return (bleu, len_ratio) pair'''
128 |         return (self.fscore(option=option), self.ratio(option=option))
129 | 
130 |     def score_ratio_str(self, option=None):
131 |         return "%.4f (%.2f)" % self.score_ratio(option)
132 | 
133 |     def reflen(self, option=None):
134 |         self.compute_score(option=option)
135 |         return self._reflen
136 | 
137 |     def testlen(self, option=None):
138 |         self.compute_score(option=option)
139 |         return self._testlen        
140 | 
141 |     def retest(self, new_test):
142 |         if type(new_test) is str:
143 |             new_test = [new_test]
144 |         assert len(new_test) == len(self.crefs), new_test
145 |         self.ctest = []
146 |         for t, rs in zip(new_test, self.crefs):
147 |             self.ctest.append(cook_test(t, rs))
148 |         self._score = None
149 | 
150 |         return self
151 | 
152 |     def rescore(self, new_test):
153 |         ''' replace test(s) with new test(s), and returns the new score.'''
154 |         
155 |         return self.retest(new_test).compute_score()
156 | 
157 |     def size(self):
158 |         assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest))
159 |         return len(self.crefs)
160 | 
161 |     def __iadd__(self, other):
162 |         '''add an instance (e.g., from another sentence).'''
163 | 
164 |         if type(other) is tuple:
165 |             ## avoid creating new BleuScorer instances
166 |             self.cook_append(other[0], other[1])
167 |         else:
168 |             assert self.compatible(other), "incompatible BLEUs."
169 |             self.ctest.extend(other.ctest)
170 |             self.crefs.extend(other.crefs)
171 |             self._score = None ## need to recompute
172 | 
173 |         return self        
174 | 
175 |     def compatible(self, other):
176 |         return isinstance(other, BleuScorer) and self.n == other.n
177 | 
178 |     def single_reflen(self, option="average"):
179 |         return self._single_reflen(self.crefs[0][0], option)
180 | 
181 |     def _single_reflen(self, reflens, option=None, testlen=None):
182 |         
183 |         if option == "shortest":
184 |             reflen = min(reflens)
185 |         elif option == "average":
186 |             reflen = float(sum(reflens))/len(reflens)
187 |         elif option == "closest":
188 |             reflen = min((abs(l-testlen), l) for l in reflens)[1]
189 |         else:
190 |             assert False, "unsupported reflen option %s" % option
191 | 
192 |         return reflen
193 | 
194 |     def recompute_score(self, option=None, verbose=0):
195 |         self._score = None
196 |         return self.compute_score(option, verbose)
197 |         
198 |     def compute_score(self, option=None, verbose=0):
199 |         n = self.n
200 |         small = 1e-9
201 |         tiny = 1e-15 ## so that if guess is 0 still return 0
202 |         bleu_list = [[] for _ in range(n)]
203 | 
204 |         if self._score is not None:
205 |             return self._score
206 | 
207 |         if option is None:
208 |             option = "average" if len(self.crefs) == 1 else "closest"
209 | 
210 |         self._testlen = 0
211 |         self._reflen = 0
212 |         totalcomps = {'testlen':0, 'reflen':0, 'guess':[0]*n, 'correct':[0]*n}
213 | 
214 |         # for each sentence
215 |         for comps in self.ctest:            
216 |             testlen = comps['testlen']
217 |             self._testlen += testlen
218 | 
219 |             if self.special_reflen is None: ## need computation
220 |                 reflen = self._single_reflen(comps['reflen'], option, testlen)
221 |             else:
222 |                 reflen = self.special_reflen
223 | 
224 |             self._reflen += reflen
225 |                 
226 |             for key in ['guess','correct']:
227 |                 for k in xrange(n):
228 |                     totalcomps[key][k] += comps[key][k]
229 | 
230 |             # append per image bleu score
231 |             bleu = 1.
232 |             for k in xrange(n):
233 |                 bleu *= (float(comps['correct'][k]) + tiny) \
234 |                         /(float(comps['guess'][k]) + small) 
235 |                 bleu_list[k].append(bleu ** (1./(k+1)))
236 |             ratio = (testlen + tiny) / (reflen + small) ## N.B.: avoid zero division
237 |             if ratio < 1:
238 |                 for k in xrange(n):
239 |                     bleu_list[k][-1] *= math.exp(1 - 1/ratio)
240 | 
241 |             if verbose > 1:
242 |                 print comps, reflen
243 | 
244 |         totalcomps['reflen'] = self._reflen
245 |         totalcomps['testlen'] = self._testlen
246 | 
247 |         bleus = []
248 |         bleu = 1.
249 |         for k in xrange(n):
250 |             bleu *= float(totalcomps['correct'][k] + tiny) \
251 |                     / (totalcomps['guess'][k] + small)
252 |             bleus.append(bleu ** (1./(k+1)))
253 |         ratio = (self._testlen + tiny) / (self._reflen + small) ## N.B.: avoid zero division
254 |         if ratio < 1:
255 |             for k in xrange(n):
256 |                 bleus[k] *= math.exp(1 - 1/ratio)
257 | 
258 |         #if verbose > 0:
259 |         #    print totalcomps
260 |         #    print "ratio:", ratio
261 | 
262 |         self._score = bleus
263 |         return self._score, bleu_list
264 | 


--------------------------------------------------------------------------------
/demo.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Yizhe Zhang
  4 | 
  5 | TextCNN
  6 | """
  7 | ## 152.3.214.203/6006
  8 | 
  9 | import os
 10 | 
 11 | GPUID = 1
 12 | os.environ['CUDA_VISIBLE_DEVICES'] = str(GPUID)
 13 | 
 14 | import tensorflow as tf
 15 | from tensorflow.contrib import learn
 16 | from tensorflow.contrib import layers
 17 | # from tensorflow.contrib import metrics
 18 | # from tensorflow.contrib.learn import monitors
 19 | from tensorflow.contrib import framework
 20 | from tensorflow.contrib.learn.python.learn import learn_runner
 21 | from tensorflow.python.platform import tf_logging as logging
 22 | # from tensorflow.contrib.learn.python.learn.metric_spec import MetricSpec
 23 | import cPickle
 24 | import numpy as np
 25 | import os
 26 | import scipy.io as sio
 27 | from math import floor
 28 | import pdb
 29 | 
 30 | from model import *
 31 | from utils import prepare_data_for_cnn, prepare_data_for_rnn, get_minibatches_idx, normalizing, restore_from_save, \
 32 |     prepare_for_bleu, cal_BLEU, sent2idx
 33 | from denoise import *
 34 | 
 35 | # import tempfile
 36 | # from tensorflow.examples.tutorials.mnist import input_data
 37 | 
 38 | logging.set_verbosity(logging.INFO)
 39 | # Basic model parameters as external flags.
 40 | flags = tf.app.flags
 41 | FLAGS = flags.FLAGS
 42 | 
 43 | 
 44 | class Options(object):
 45 |     def __init__(self):
 46 |         self.fix_emb = False
 47 |         self.reuse_w = False
 48 |         self.reuse_cnn = False
 49 |         self.reuse_discrimination = True  # reuse cnn for discrimination
 50 |         self.restore = True
 51 |         self.tanh = True  # activation fun for the top layer of cnn, otherwise relu
 52 |         self.model = 'cnn_deconv'  # 'cnn_rnn', 'rnn_rnn' , default: cnn_deconv
 53 | 
 54 |         self.permutation = 0
 55 |         self.substitution = 's'  # Deletion(d), Insertion(a), Substitution(s) and Permutation(p)
 56 | 
 57 |         self.W_emb = None
 58 |         self.cnn_W = None
 59 |         self.cnn_b = None
 60 |         self.maxlen = 253
 61 |         self.n_words = None
 62 |         self.filter_shape = 5
 63 |         self.filter_size = 300
 64 |         self.embed_size = 300
 65 |         self.lr = 1e-5
 66 |         self.layer = 3
 67 |         self.stride = [2, 2, 2]  # for two layer cnn/deconv , use self.stride[0]
 68 |         self.batch_size = 32
 69 |         self.max_epochs = 100
 70 |         self.n_gan = 900  # self.filter_size * 3
 71 |         self.L = 100
 72 | 
 73 |         self.save_path = "./save/" + "hotel_" + str(self.n_gan) + "_dim_" + self.model + "_" + self.substitution + str(
 74 |             self.permutation)
 75 |         self.log_path = "./log"
 76 |         self.print_freq = 100
 77 |         self.valid_freq = 100
 78 | 
 79 |         # batch norm & dropout
 80 |         self.batch_norm = False
 81 |         self.cnn_layer_dropout = False
 82 |         self.dropout = True
 83 |         self.dropout_ratio = 1.0
 84 |         self.is_train = True
 85 | 
 86 |         self.discrimination = False
 87 |         self.H_dis = 300
 88 | 
 89 |         self.sent_len = self.maxlen + 2 * (self.filter_shape - 1)
 90 |         self.sent_len2 = np.int32(floor((self.sent_len - self.filter_shape) / self.stride[0]) + 1)
 91 |         self.sent_len3 = np.int32(floor((self.sent_len2 - self.filter_shape) / self.stride[1]) + 1)
 92 |         self.sent_len4 = np.int32(floor((self.sent_len3 - self.filter_shape)/self.stride[2]) + 1)
 93 |         print ('Use model %s' % self.model)
 94 |         print ('Use %d conv/deconv layers' % self.layer)
 95 | 
 96 |     def __iter__(self):
 97 |         for attr, value in self.__dict__.iteritems():
 98 |             yield attr, value
 99 | 
100 | 
101 | def auto_encoder(x, x_org, opt, opt_t=None):
102 |     # print x.get_shape()  # batch L
103 |     if not opt_t: opt_t = opt
104 |     x_emb, W_norm = embedding(x, opt)  # batch L emb
105 |     x_emb = tf.expand_dims(x_emb, 3)  # batch L emb 1
106 | 
107 |     res = {}
108 | 
109 |     # cnn encoder
110 |     if opt.layer == 4:
111 |         H_enc = conv_model_4layer(x_emb, opt)
112 |     elif opt.layer == 3:
113 |         H_enc = conv_model_3layer(x_emb, opt)
114 |     else:
115 |         H_enc = conv_model(x_emb, opt)
116 | 
117 |     H_dec = H_enc
118 |     # deconv decoder
119 |     if opt.layer == 4:
120 |         x_rec = deconv_model_4layer(H_dec, opt_t)  # batch L emb 1
121 |     elif opt.layer == 3:
122 |         x_rec = deconv_model_3layer(H_dec, opt_t)  # batch L emb 1
123 |     else:
124 |         x_rec = deconv_model(H_dec, opt_t)  # batch L emb 1
125 |     print("Encoder len %d Decoder len %d Output len %d" % (
126 |     x_emb.get_shape()[1], x_rec.get_shape()[1], x_org.get_shape()[1]))
127 |     tf.assert_equal(x_rec.get_shape(), x_emb.get_shape())
128 |     tf.assert_equal(x_emb.get_shape()[1], x_org.get_shape()[1])
129 |     x_rec_norm = normalizing(x_rec, 2)  # batch L emb
130 | 
131 |     if opt.fix_emb:
132 |         # cosine sim
133 |         # Batch L emb
134 |         loss = -tf.reduce_sum(x_rec_norm * x_emb)
135 |         rec_sent = tf.argmax(tf.tensordot(tf.squeeze(x_rec_norm), W_norm, [[2], [1]]), 2)
136 |         res['rec_sents'] = rec_sent
137 | 
138 | 
139 |     else:
140 |         x_temp = tf.reshape(x_org, [-1, ])
141 |         prob_logits = tf.tensordot(tf.squeeze(x_rec_norm), W_norm, [[2], [1]])  # c_blv = sum_e x_ble W_ve
142 | 
143 |         prob = tf.nn.log_softmax(prob_logits * opt_t.L, dim=-1, name=None)
144 |         rec_sent = tf.squeeze(tf.argmax(prob, 2))
145 |         prob = tf.reshape(prob, [-1, opt_t.n_words])
146 | 
147 |         idx = tf.range(opt.batch_size * opt_t.sent_len)
148 | 
149 |         all_idx = tf.transpose(tf.stack(values=[idx, x_temp]))
150 |         all_prob = tf.gather_nd(prob, all_idx)
151 | 
152 |         gen_temp = tf.cast(tf.reshape(rec_sent, [-1, ]), tf.int32)
153 |         gen_idx = tf.transpose(tf.stack(values=[idx, gen_temp]))
154 |         gen_prob = tf.gather_nd(prob, gen_idx)
155 | 
156 |         res['rec_sents'] = rec_sent
157 | 
158 |         res['gen_p'] = tf.exp(gen_prob[0:opt.sent_len])
159 |         res['all_p'] = tf.exp(all_prob[0:opt.sent_len])
160 | 
161 |         if opt.discrimination:
162 |             logits_real, _ = discriminator(x_org, W_norm, opt_t)
163 |             prob_one_hot = tf.nn.log_softmax(prob_logits * opt_t.L * 100, dim=-1, name=None)
164 |             logits_syn, _ = discriminator(tf.exp(prob_one_hot), W_norm, opt_t, is_prob=True, is_reuse=True)
165 | 
166 |             res['prob_r'] = tf.reduce_mean(tf.nn.sigmoid(logits_real))
167 |             res['prob_f'] = tf.reduce_mean(tf.nn.sigmoid(logits_syn))
168 | 
169 |             loss = tf.reduce_mean(
170 |                 tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.ones_like(logits_real), logits=logits_real)) + \
171 |                    tf.reduce_mean(
172 |                        tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.zeros_like(logits_syn), logits=logits_syn))
173 |         else:
174 |             loss = -tf.reduce_mean(all_prob)
175 |             
176 |     tf.summary.scalar('loss', loss)
177 | 
178 |     train_op = layers.optimize_loss(
179 |         loss,
180 |         framework.get_global_step(),
181 |         optimizer='Adam',
182 |         learning_rate=opt.lr)
183 |     return res, loss, train_op
184 | 
185 | 
186 | def main():
187 |     # global n_words
188 |     # Prepare training and testing data
189 |     loadpath = "./data/hotel_reviews.p"
190 |     x = cPickle.load(open(loadpath, "rb"))
191 |     train, val = x[0], x[1]
192 |     wordtoix, ixtoword = x[2], x[3]
193 |     train = [list(s) for s in train]
194 |     val = [list(s) for s in val]
195 |     opt = Options()
196 |     opt.n_words = len(ixtoword) + 1
197 |     ixtoword[opt.n_words - 1] = 'GO_'
198 |     print dict(opt)
199 |     print('Total words: %d' % opt.n_words)
200 | 
201 |     try:
202 |         params = np.load('./param_g.npz')
203 |         if params['Wemb'].shape == (opt.n_words, opt.embed_size):
204 |             print('Use saved embedding.')
205 |             opt.W_emb = params['Wemb']
206 |         else:
207 |             print('Emb Dimension mismatch: param_g.npz:' + str(params['Wemb'].shape) + ' opt: ' + str(
208 |                 (opt.n_words, opt.embed_size)))
209 |             opt.fix_emb = False
210 |     except IOError:
211 |         print('No embedding file found.')
212 |         opt.fix_emb = False
213 | 
214 |     with tf.device('/gpu:1'):
215 |         x_ = tf.placeholder(tf.int32, shape=[opt.batch_size, opt.sent_len])
216 |         x_org_ = tf.placeholder(tf.int32, shape=[opt.batch_size, opt.sent_len])
217 |         res_, loss_, train_op = auto_encoder(x_, x_org_, opt)
218 |         merged = tf.summary.merge_all()
219 | 
220 | 
221 | 
222 |     uidx = 0
223 |     config = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)
224 |     config.gpu_options.allow_growth = True
225 |     np.set_printoptions(precision=3)
226 |     np.set_printoptions(threshold=np.inf)
227 |     saver = tf.train.Saver()
228 | 
229 |     with tf.Session(config=config) as sess:
230 |         train_writer = tf.summary.FileWriter(opt.log_path + '/train', sess.graph)
231 |         test_writer = tf.summary.FileWriter(opt.log_path + '/test', sess.graph)
232 |         sess.run(tf.global_variables_initializer())
233 |         if opt.restore:
234 |             try:
235 |                 t_vars = tf.trainable_variables()
236 |                 loader = restore_from_save(t_vars, sess, opt)
237 | 
238 |             except Exception as e:
239 |                 print(e)
240 |                 print("No saving session, using random initialization")
241 |                 sess.run(tf.global_variables_initializer())
242 | 
243 |         for epoch in range(opt.max_epochs):
244 |             print("Starting epoch %d" % epoch)
245 |             kf = get_minibatches_idx(len(train), opt.batch_size, shuffle=True)
246 |             for _, train_index in kf:
247 |                 uidx += 1
248 |                 sents = [train[t] for t in train_index]
249 | 
250 |                 sents_permutated = add_noise(sents, opt)
251 | 
252 |                 if opt.model != 'rnn_rnn' and opt.model != 'cnn_rnn':
253 |                     x_batch_org = prepare_data_for_cnn(sents, opt)  # Batch L
254 |                 else:
255 |                     x_batch_org = prepare_data_for_rnn(sents, opt)  # Batch L
256 | 
257 |                 if opt.model != 'rnn_rnn':
258 |                     x_batch = prepare_data_for_cnn(sents_permutated, opt)  # Batch L
259 |                 else:
260 |                     x_batch = prepare_data_for_rnn(sents_permutated, opt, is_add_GO=False)  # Batch L
261 | 
262 |                 _, loss = sess.run([train_op, loss_], feed_dict={x_: x_batch, x_org_: x_batch_org})
263 | 
264 |                 if uidx % opt.valid_freq == 0:
265 |                     opt.is_train = False
266 |                     valid_index = np.random.choice(len(val), opt.batch_size)
267 |                     val_sents = [val[t] for t in valid_index]
268 | 
269 |                     val_sents_permutated = add_noise(val_sents, opt)
270 | 
271 |                     if opt.model != 'rnn_rnn' and opt.model != 'cnn_rnn':
272 |                         x_val_batch_org = prepare_data_for_cnn(val_sents, opt)
273 |                     else:
274 |                         x_val_batch_org = prepare_data_for_rnn(val_sents, opt)
275 | 
276 |                     if opt.model != 'rnn_rnn':
277 |                         x_val_batch = prepare_data_for_cnn(val_sents_permutated, opt)
278 |                     else:
279 |                         x_val_batch = prepare_data_for_rnn(val_sents_permutated, opt, is_add_GO=False)
280 | 
281 |                     loss_val = sess.run(loss_, feed_dict={x_: x_val_batch, x_org_: x_val_batch_org})
282 |                     print("Validation loss %f " % (loss_val))
283 |                     res = sess.run(res_, feed_dict={x_: x_val_batch, x_org_: x_val_batch_org})
284 |                     if opt.discrimination:
285 |                         print ("Real Prob %f Fake Prob %f" % (res['prob_r'], res['prob_f']))
286 |                     print "Val Orig :" + " ".join([ixtoword[x] for x in val_sents[0] if x != 0])
287 |                     print "Val Perm :" + " ".join([ixtoword[x] for x in val_sents_permutated[0] if x != 0])
288 |                     print "Val Recon:" + " ".join([ixtoword[x] for x in res['rec_sents'][0] if x != 0])
289 | 
290 |                     val_set = [prepare_for_bleu(s) for s in val_sents]
291 |                     [bleu2s, bleu3s, bleu4s] = cal_BLEU([prepare_for_bleu(s) for s in res['rec_sents']], {0: val_set})
292 |                     print 'Val BLEU (2,3,4): ' + ' '.join([str(round(it, 3)) for it in (bleu2s, bleu3s, bleu4s)])
293 |                     summary = sess.run(merged, feed_dict={x_: x_val_batch, x_org_: x_val_batch_org})
294 |                     test_writer.add_summary(summary, uidx)
295 |                     opt.is_train = True
296 | 
297 | 
298 |                 if uidx % opt.print_freq == 0:
299 |                     print("Iteration %d: loss %f " % (uidx, loss))
300 |                     res = sess.run(res_, feed_dict={x_: x_batch, x_org_: x_batch_org})
301 |                     print "Original     :" + " ".join([ixtoword[x] for x in sents[0] if x != 0])
302 |                     print "Permutated   :" + " ".join([ixtoword[x] for x in sents_permutated[0] if x != 0])
303 |                     if opt.model == 'rnn_rnn' or opt.model == 'cnn_rnn':
304 |                         print "Reconstructed:" + " ".join([ixtoword[x] for x in res['rec_sents_feed_y'][0] if x != 0])
305 |                     print "Reconstructed:" + " ".join([ixtoword[x] for x in res['rec_sents'][0] if x != 0])
306 | 
307 | 
308 |                     summary = sess.run(merged, feed_dict={x_: x_batch, x_org_: x_batch_org})
309 |                     train_writer.add_summary(summary, uidx)
310 | 
311 |             saver.save(sess, opt.save_path, global_step=epoch)
312 | 
313 | 
314 | if __name__ == '__main__':
315 |     main()
316 | 


--------------------------------------------------------------------------------
/data_utils.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2015 The TensorFlow Authors. All Rights Reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | # ==============================================================================
 15 | 
 16 | """Utilities for downloading data from WMT, tokenizing, vocabularies."""
 17 | from __future__ import absolute_import
 18 | from __future__ import division
 19 | from __future__ import print_function
 20 | 
 21 | import gzip
 22 | import os
 23 | import re
 24 | import tarfile
 25 | import pdb
 26 | 
 27 | from six.moves import urllib
 28 | 
 29 | from tensorflow.python.platform import gfile
 30 | import tensorflow as tf
 31 | 
 32 | # Special vocabulary symbols - we always put them at the start.
 33 | _PAD = b"_PAD"
 34 | _GO = b"_GO"
 35 | _EOS = b"_EOS"
 36 | _UNK = b"_UNK"
 37 | _START_VOCAB = [_PAD, _GO, _EOS, _UNK]
 38 | 
 39 | PAD_ID = 0
 40 | GO_ID = 1
 41 | EOS_ID = 2
 42 | UNK_ID = 3
 43 | 
 44 | # Regular expressions used to tokenize.
 45 | _WORD_SPLIT = re.compile(b"([.,!?\"':;)(])")
 46 | _DIGIT_RE = re.compile(br"\d")
 47 | 
 48 | # URLs for WMT data.
 49 | _WMT_ENFR_TRAIN_URL = "http://www.statmt.org/wmt10/training-giga-fren.tar"
 50 | _WMT_ENFR_DEV_URL = "http://www.statmt.org/wmt15/dev-v2.tgz"
 51 | 
 52 | 
 53 | def maybe_download(directory, filename, url):
 54 |   """Download filename from url unless it's already in directory."""
 55 |   if not os.path.exists(directory):
 56 |     print("Creating directory %s" % directory)
 57 |     os.mkdir(directory)
 58 |   filepath = os.path.join(directory, filename)
 59 |   if not os.path.exists(filepath):
 60 |     print("Downloading %s to %s" % (url, filepath))
 61 |     filepath, _ = urllib.request.urlretrieve(url, filepath)
 62 |     statinfo = os.stat(filepath)
 63 |     print("Successfully downloaded", filename, statinfo.st_size, "bytes")
 64 |   return filepath
 65 | 
 66 | 
 67 | def gunzip_file(gz_path, new_path):
 68 |   """Unzips from gz_path into new_path."""
 69 |   print("Unpacking %s to %s" % (gz_path, new_path))
 70 |   with gzip.open(gz_path, "rb") as gz_file:
 71 |     with open(new_path, "wb") as new_file:
 72 |       for line in gz_file:
 73 |         new_file.write(line)
 74 | 
 75 | 
 76 | def get_wmt_enfr_train_set(directory):
 77 |   """Download the WMT en-fr training corpus to directory unless it's there."""
 78 |   train_path = os.path.join(directory, "giga-fren.release2.fixed")
 79 |   if not (gfile.Exists(train_path +".fr") and gfile.Exists(train_path +".en")):
 80 |     corpus_file = maybe_download(directory, "training-giga-fren.tar",
 81 |                                  _WMT_ENFR_TRAIN_URL)
 82 |     print("Extracting tar file %s" % corpus_file)
 83 |     with tarfile.open(corpus_file, "r") as corpus_tar:
 84 |       corpus_tar.extractall(directory)
 85 |     gunzip_file(train_path + ".fr.gz", train_path + ".fr")
 86 |     gunzip_file(train_path + ".en.gz", train_path + ".en")
 87 |   return train_path
 88 | 
 89 | 
 90 | def get_wmt_enfr_dev_set(directory):
 91 |   """Download the WMT en-fr training corpus to directory unless it's there."""
 92 |   dev_name = "newstest2013"
 93 |   dev_path = os.path.join(directory, dev_name)
 94 |   if not (gfile.Exists(dev_path + ".fr") and gfile.Exists(dev_path + ".en")):
 95 |     dev_file = maybe_download(directory, "dev-v2.tgz", _WMT_ENFR_DEV_URL)
 96 |     print("Extracting tgz file %s" % dev_file)
 97 |     with tarfile.open(dev_file, "r:gz") as dev_tar:
 98 |       fr_dev_file = dev_tar.getmember("dev/" + dev_name + ".fr")
 99 |       en_dev_file = dev_tar.getmember("dev/" + dev_name + ".en")
100 |       fr_dev_file.name = dev_name + ".fr"  # Extract without "dev/" prefix.
101 |       en_dev_file.name = dev_name + ".en"
102 |       dev_tar.extract(fr_dev_file, directory)
103 |       dev_tar.extract(en_dev_file, directory)
104 |   return dev_path
105 | 
106 | 
107 | def basic_tokenizer(sentence):
108 |   """Very basic tokenizer: split the sentence into a list of tokens."""
109 |   words = []
110 |   for space_separated_fragment in sentence.strip().split():
111 |     words.extend(_WORD_SPLIT.split(space_separated_fragment))
112 |   return [w for w in words if w]
113 | 
114 | 
115 | def create_vocabulary(vocabulary_path, data_path, max_vocabulary_size,
116 |                       tokenizer=None, normalize_digits=True):
117 |   """Create vocabulary file (if it does not exist yet) from data file.
118 | 
119 |   Data file is assumed to contain one sentence per line. Each sentence is
120 |   tokenized and digits are normalized (if normalize_digits is set).
121 |   Vocabulary contains the most-frequent tokens up to max_vocabulary_size.
122 |   We write it to vocabulary_path in a one-token-per-line format, so that later
123 |   token in the first line gets id=0, second line gets id=1, and so on.
124 | 
125 |   Args:
126 |     vocabulary_path: path where the vocabulary will be created.
127 |     data_path: data file that will be used to create vocabulary.
128 |     max_vocabulary_size: limit on the size of the created vocabulary.
129 |     tokenizer: a function to use to tokenize each data sentence;
130 |       if None, basic_tokenizer will be used.
131 |     normalize_digits: Boolean; if true, all digits are replaced by 0s.
132 |   """
133 |   if not gfile.Exists(vocabulary_path):
134 |     print("Creating vocabulary %s from data %s" % (vocabulary_path, data_path))
135 |     vocab = {}
136 |     #pdb.set_trace()
137 |     with gfile.GFile(data_path, mode="rb") as f:
138 |       counter = 0
139 |       for line in f:
140 |         counter += 1
141 |         if counter % 100000 == 0:
142 |           print("  processing line %d" % counter)
143 |         line = tf.compat.as_bytes(line)
144 |         tokens = tokenizer(line) if tokenizer else basic_tokenizer(line)
145 |         for w in tokens:
146 |           word = _DIGIT_RE.sub(b"0", w) if normalize_digits else w
147 |           if word in vocab:
148 |             vocab[word] += 1
149 |           else:
150 |             vocab[word] = 1
151 |       vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
152 |       if len(vocab_list) > max_vocabulary_size:
153 |         vocab_list = vocab_list[:max_vocabulary_size]
154 |       with gfile.GFile(vocabulary_path, mode="wb") as vocab_file:
155 |         for w in vocab_list:
156 |           vocab_file.write(w + b"\n")
157 | 
158 | 
159 | def initialize_vocabulary(vocabulary_path):
160 |   """Initialize vocabulary from file.
161 | 
162 |   We assume the vocabulary is stored one-item-per-line, so a file:
163 |     dog
164 |     cat
165 |   will result in a vocabulary {"dog": 0, "cat": 1}, and this function will
166 |   also return the reversed-vocabulary ["dog", "cat"].
167 | 
168 |   Args:
169 |     vocabulary_path: path to the file containing the vocabulary.
170 | 
171 |   Returns:
172 |     a pair: the vocabulary (a dictionary mapping string to integers), and
173 |     the reversed vocabulary (a list, which reverses the vocabulary mapping).
174 | 
175 |   Raises:
176 |     ValueError: if the provided vocabulary_path does not exist.
177 |   """
178 |   if gfile.Exists(vocabulary_path):
179 |     rev_vocab = []
180 |     with gfile.GFile(vocabulary_path, mode="rb") as f:
181 |       rev_vocab.extend(f.readlines())
182 |     rev_vocab = [tf.compat.as_bytes(line.strip()) for line in rev_vocab]
183 |     vocab = dict([(x, y) for (y, x) in enumerate(rev_vocab)])
184 |     idtoword = dict([(y, x) for (y, x) in enumerate(rev_vocab)])
185 |     return vocab, idtoword
186 |   else:
187 |     raise ValueError("Vocabulary file %s not found.", vocabulary_path)
188 | 
189 | 
190 | def sentence_to_token_ids(sentence, vocabulary,
191 |                           tokenizer=None, normalize_digits=True):
192 |   """Convert a string to list of integers representing token-ids.
193 | 
194 |   For example, a sentence "I have a dog" may become tokenized into
195 |   ["I", "have", "a", "dog"] and with vocabulary {"I": 1, "have": 2,
196 |   "a": 4, "dog": 7"} this function will return [1, 2, 4, 7].
197 | 
198 |   Args:
199 |     sentence: the sentence in bytes format to convert to token-ids.
200 |     vocabulary: a dictionary mapping tokens to integers.
201 |     tokenizer: a function to use to tokenize each sentence;
202 |       if None, basic_tokenizer will be used.
203 |     normalize_digits: Boolean; if true, all digits are replaced by 0s.
204 | 
205 |   Returns:
206 |     a list of integers, the token-ids for the sentence.
207 |   """
208 | 
209 |   if tokenizer:
210 |     words = tokenizer(sentence)
211 |   else:
212 |     words = basic_tokenizer(sentence)
213 |   if not normalize_digits:
214 |     return [vocabulary.get(w, UNK_ID) for w in words]
215 |   # Normalize digits by 0 before looking words up in the vocabulary.
216 |   return [vocabulary.get(_DIGIT_RE.sub(b"0", w), UNK_ID) for w in words]
217 | 
218 | 
219 | def data_to_token_ids(data_path, target_path, vocabulary_path,
220 |                       tokenizer=None, normalize_digits=True):
221 |   """Tokenize data file and turn into token-ids using given vocabulary file.
222 | 
223 |   This function loads data line-by-line from data_path, calls the above
224 |   sentence_to_token_ids, and saves the result to target_path. See comment
225 |   for sentence_to_token_ids on the details of token-ids format.
226 | 
227 |   Args:
228 |     data_path: path to the data file in one-sentence-per-line format.
229 |     target_path: path where the file with token-ids will be created.
230 |     vocabulary_path: path to the vocabulary file.
231 |     tokenizer: a function to use to tokenize each sentence;
232 |       if None, basic_tokenizer will be used.
233 |     normalize_digits: Boolean; if true, all digits are replaced by 0s.
234 |   """
235 |   if not gfile.Exists(target_path):
236 |     print("Tokenizing data in %s" % data_path)
237 |     vocab, _ = initialize_vocabulary(vocabulary_path)
238 |     with gfile.GFile(data_path, mode="rb") as data_file:
239 |       with gfile.GFile(target_path, mode="w") as tokens_file:
240 |         counter = 0
241 |         for line in data_file:
242 |           counter += 1
243 |           if counter % 100000 == 0:
244 |             print("  tokenizing line %d" % counter)
245 |           token_ids = sentence_to_token_ids(tf.compat.as_bytes(line), vocab,
246 |                                             tokenizer, normalize_digits)
247 |           tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")
248 | 
249 | 
250 | def prepare_wmt_data(data_dir, en_vocabulary_size, fr_vocabulary_size, tokenizer=None):
251 |   """Get WMT data into data_dir, create vocabularies and tokenize data.
252 | 
253 |   Args:
254 |     data_dir: directory in which the data sets will be stored.
255 |     en_vocabulary_size: size of the English vocabulary to create and use.
256 |     fr_vocabulary_size: size of the French vocabulary to create and use.
257 |     tokenizer: a function to use to tokenize each data sentence;
258 |       if None, basic_tokenizer will be used.
259 | 
260 |   Returns:
261 |     A tuple of 6 elements:
262 |       (1) path to the token-ids for English training data-set,
263 |       (2) path to the token-ids for French training data-set,
264 |       (3) path to the token-ids for English development data-set,
265 |       (4) path to the token-ids for French development data-set,
266 |       (5) path to the English vocabulary file,
267 |       (6) path to the French vocabulary file.
268 |   """
269 |   # Get wmt data to the specified directory.
270 |   train_path = get_wmt_enfr_train_set(data_dir)
271 |   dev_path = get_wmt_enfr_dev_set(data_dir)
272 | 
273 |   from_train_path = train_path + ".en"
274 |   to_train_path = train_path + ".fr"
275 |   from_dev_path = dev_path + ".en"
276 |   to_dev_path = dev_path + ".fr"
277 |   return prepare_data(data_dir, from_train_path, to_train_path, from_dev_path, to_dev_path, en_vocabulary_size,
278 |                       fr_vocabulary_size, tokenizer)
279 | 
280 | 
281 | def prepare_data(data_dir, from_train_path, to_train_path, from_dev_path, to_dev_path, from_vocabulary_size,
282 |                  to_vocabulary_size, tokenizer=None):
283 |   """Preapre all necessary files that are required for the training.
284 | 
285 |     Args:
286 |       data_dir: directory in which the data sets will be stored.
287 |       from_train_path: path to the file that includes "from" training samples.
288 |       to_train_path: path to the file that includes "to" training samples.
289 |       from_dev_path: path to the file that includes "from" dev samples.
290 |       to_dev_path: path to the file that includes "to" dev samples.
291 |       from_vocabulary_size: size of the "from language" vocabulary to create and use.
292 |       to_vocabulary_size: size of the "to language" vocabulary to create and use.
293 |       tokenizer: a function to use to tokenize each data sentence;
294 |         if None, basic_tokenizer will be used.
295 | 
296 |     Returns:
297 |       A tuple of 6 elements:
298 |         (1) path to the token-ids for "from language" training data-set,
299 |         (2) path to the token-ids for "to language" training data-set,
300 |         (3) path to the token-ids for "from language" development data-set,
301 |         (4) path to the token-ids for "to language" development data-set,
302 |         (5) path to the "from language" vocabulary file,
303 |         (6) path to the "to language" vocabulary file.
304 |     """
305 |   # Create vocabularies of the appropriate sizes.
306 |   to_vocab_path = os.path.join(data_dir, "vocab%d.to" % to_vocabulary_size)
307 |   from_vocab_path = os.path.join(data_dir, "vocab%d.from" % from_vocabulary_size)
308 |   create_vocabulary(to_vocab_path, to_train_path , to_vocabulary_size, tokenizer)
309 |   create_vocabulary(from_vocab_path, from_train_path , from_vocabulary_size, tokenizer)
310 | 
311 |   # Create token ids for the training data.
312 |   to_train_ids_path = to_train_path + (".ids%d" % to_vocabulary_size)
313 |   from_train_ids_path = from_train_path + (".ids%d" % from_vocabulary_size)
314 |   data_to_token_ids(to_train_path, to_train_ids_path, to_vocab_path, tokenizer)
315 |   data_to_token_ids(from_train_path, from_train_ids_path, from_vocab_path, tokenizer)
316 | 
317 |   # Create token ids for the development data.
318 |   to_dev_ids_path = to_dev_path + (".ids%d" % to_vocabulary_size)
319 |   from_dev_ids_path = from_dev_path + (".ids%d" % from_vocabulary_size)
320 |   data_to_token_ids(to_dev_path, to_dev_ids_path, to_vocab_path, tokenizer)
321 |   data_to_token_ids(from_dev_path, from_dev_ids_path, from_vocab_path, tokenizer)
322 | 
323 |   return (from_train_ids_path, to_train_ids_path,
324 |           from_dev_ids_path, to_dev_ids_path,
325 |           from_vocab_path, to_vocab_path)
326 | 


--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | # import theano
  3 | # from theano import config
  4 | import tensorflow as tf
  5 | from collections import OrderedDict
  6 | import nltk
  7 | from pycocoevalcap.bleu.bleu import Bleu
  8 | from pycocoevalcap.rouge.rouge import Rouge
  9 | from tensorflow.python import pywrap_tensorflow
 10 | import pdb
 11 | import data_utils
 12 | import sys
 13 | from tensorflow.python.ops import clip_ops
 14 | from rougescore import rouge_n, rouge_1, rouge_2, rouge_l
 15 | 
 16 | def lrelu(x, leak=0.2, name="lrelu"):
 17 |     with tf.variable_scope(name):
 18 |         f1 = 0.5 * (1 + leak)
 19 |         f2 = 0.5 * (1 - leak)
 20 |         return f1 * x + f2 * tf.abs(x)
 21 | 
 22 | def sent2idx(text, wordtoix, opt, is_cnn = True):
 23 |     
 24 |     sent = [wordtoix[x] for x in text.split()]
 25 |     
 26 |     return prepare_data_for_cnn([sent for i in range(opt.batch_size)], opt)
 27 |     
 28 | 
 29 | 
 30 | def prepare_data_for_cnn(seqs_x, opt): 
 31 |     maxlen=opt.maxlen
 32 |     filter_h=opt.filter_shape
 33 |     lengths_x = [len(s) for s in seqs_x]
 34 |     # print lengths_x
 35 |     if maxlen != None:
 36 |         new_seqs_x = []
 37 |         new_lengths_x = []
 38 |         for l_x, s_x in zip(lengths_x, seqs_x):
 39 |             if l_x < maxlen:
 40 |                 new_seqs_x.append(s_x)
 41 |                 new_lengths_x.append(l_x)
 42 |         lengths_x = new_lengths_x
 43 |         seqs_x = new_seqs_x
 44 |         
 45 |         if len(lengths_x) < 1  :
 46 |             return None, None
 47 |     
 48 |     pad = filter_h -1
 49 |     x = []   
 50 |     for rev in seqs_x:    
 51 |         xx = []
 52 |         for i in xrange(pad):
 53 |             xx.append(0)
 54 |         for idx in rev:
 55 |             xx.append(idx)
 56 |         while len(xx) < maxlen + 2*pad:
 57 |             xx.append(0)
 58 |         x.append(xx)
 59 |     x = np.array(x,dtype='int32')
 60 |     return x   
 61 |     
 62 |     
 63 | def prepare_data_for_rnn(seqs_x, opt, is_add_GO = True):
 64 |     
 65 |     maxlen=opt.maxlen
 66 |     lengths_x = [len(s) for s in seqs_x]
 67 |     # print lengths_x
 68 |     if maxlen != None:
 69 |         new_seqs_x = []
 70 |         new_lengths_x = []
 71 |         for l_x, s_x in zip(lengths_x, seqs_x):
 72 |             if l_x < maxlen:
 73 |                 new_seqs_x.append(s_x)
 74 |                 new_lengths_x.append(l_x)
 75 |         lengths_x = new_lengths_x
 76 |         seqs_x = new_seqs_x
 77 |         
 78 |         if len(lengths_x) < 1  :
 79 |             return None, None
 80 | 
 81 |     n_samples = len(seqs_x)
 82 |     maxlen_x = np.max(lengths_x)
 83 |     x = np.zeros(( n_samples, opt.sent_len)).astype('int32')
 84 |     for idx, s_x in enumerate(seqs_x):
 85 |         if is_add_GO:
 86 |             x[idx, 0] = 1 # GO symbol
 87 |             x[idx, 1:lengths_x[idx]+1] = s_x
 88 |         else:
 89 |             x[idx, :lengths_x[idx]] = s_x
 90 |     return x   
 91 |     
 92 | 
 93 | 
 94 | def restore_from_save(t_vars, sess, opt):
 95 |     save_keys = tensors_key_in_file(opt.save_path)
 96 |     #print(save_keys.keys()) 
 97 |     ss = set([var.name for var in t_vars])&set([s+":0" for s in save_keys.keys()])
 98 |     cc = {var.name:var for var in t_vars}
 99 |     ss_right_shape = set([s for s in ss if cc[s].get_shape() == save_keys[s[:-2]]])  # only restore variables with correct shape
100 |     
101 |     if opt.reuse_discrimination:
102 |         ss2 = set([var.name[2:] for var in t_vars])&set([s+":0" for s in save_keys.keys()])
103 |         cc2 = {var.name[2:][:-2]:var for var in t_vars if var.name[2:] in ss2 if var.get_shape() == save_keys[var.name[2:][:-2]]}
104 |         for s_iter in ss_right_shape:
105 |             cc2[s_iter[:-2]] = cc[s_iter]
106 |         
107 |         loader = tf.train.Saver(var_list=cc2)
108 |         loader.restore(sess, opt.save_path)
109 |         print("Loaded variables for discriminator:"+str(cc2.keys()))
110 |     
111 |     else:    
112 |         # for var in t_vars:
113 |         #     if var.name[:-2] in ss:
114 |         #         tf.assign(t_vars, save_keys[var.name[:-2]])
115 |         loader = tf.train.Saver(var_list= [var for var in t_vars if var.name in ss_right_shape])
116 |         loader.restore(sess, opt.save_path)
117 |         print("Loading variables from '%s'." % opt.save_path)
118 |         print("Loaded variables:"+str(ss_right_shape))
119 |     
120 | 
121 |     
122 |     
123 | 
124 |     
125 |     return loader
126 |     
127 |     
128 | _buckets = [(60,60)]    
129 |     
130 | def read_data(source_path, target_path, opt):
131 |     """
132 |     From tensorflow tutorial translate.py
133 |     Read data from source and target files and put into buckets.
134 |     Args:
135 |     source_path: path to the files with token-ids for the source language.
136 |     target_path: path to the file with token-ids for the target language;
137 |       it must be aligned with the source file: n-th line contains the desired
138 |       output for n-th line from the source_path.
139 |     max_size: maximum number of lines to read, all other will be ignored;
140 |       if 0 or None, data files will be read completely (no limit).
141 | 
142 |     Returns:
143 |     data_set: a list of length len(_buckets); data_set[n] contains a list of
144 |       (source, target) pairs read from the provided data files that fit
145 |       into the n-th bucket, i.e., such that len(source) < _buckets[n][0] and
146 |       len(target) < _buckets[n][1]; source and target are lists of token-ids.
147 |     """
148 |     data_set = [[] for _ in _buckets]
149 |     with tf.gfile.GFile(source_path, mode="r") as source_file:
150 |         with tf.gfile.GFile(target_path, mode="r") as target_file:
151 |             source, target = source_file.readline(), target_file.readline()            
152 |             counter = 0
153 |             while source and target and (not opt.max_train_data_size or counter < opt.max_train_data_size):
154 |                 counter += 1
155 |                 if counter % 100000 == 0:
156 |                     print("  reading data line %d" % counter)
157 |                     sys.stdout.flush()
158 |                 source_ids = [int(x) for x in source.split()]
159 |                 target_ids = [int(x) for x in target.split()]
160 |                 target_ids.append(data_utils.EOS_ID)
161 |                 for bucket_id, (source_size, target_size) in enumerate(_buckets):
162 |                     if opt.minlen <len(source_ids) < min(source_size, opt.maxlen) and opt.minlen <len(target_ids) < min(target_size, opt.maxlen):
163 |                         data_set[bucket_id].append([source_ids, target_ids])
164 |                         break
165 |                 source, target = source_file.readline(), target_file.readline()
166 |             
167 |             
168 |             
169 |     return data_set    
170 |     
171 |     
172 |     
173 | def prepare_data_for_cnn(seqs_x, opt): 
174 |     maxlen=opt.maxlen
175 |     filter_h=opt.filter_shape
176 |     lengths_x = [len(s) for s in seqs_x]
177 |     # print lengths_x
178 |     if maxlen != None:
179 |         new_seqs_x = []
180 |         new_lengths_x = []
181 |         for l_x, s_x in zip(lengths_x, seqs_x):
182 |             if l_x < maxlen:
183 |                 new_seqs_x.append(s_x)
184 |                 new_lengths_x.append(l_x)
185 |         lengths_x = new_lengths_x
186 |         seqs_x = new_seqs_x
187 |         
188 |         if len(lengths_x) < 1  :
189 |             return None, None
190 |     
191 |     pad = filter_h -1
192 |     x = []   
193 |     for rev in seqs_x:    
194 |         xx = []
195 |         for i in xrange(pad):
196 |             xx.append(0)
197 |         for idx in rev:
198 |             xx.append(idx)
199 |         while len(xx) < maxlen + 2*pad:
200 |             xx.append(0)
201 |         x.append(xx)
202 |     x = np.array(x,dtype='int32')
203 |     return x
204 |     
205 |     
206 | # def prepare_data_for_machine_translation(pair_x, opt):
207 | #     maxlen=opt.maxlen
208 | #     filter_h=opt.filter_shape
209 | #     def padding(p):
210 | #         pad = filter_h -1
211 | #         new_p = []
212 | #         pdb.set_trace()
213 | #         for it in p:
214 | #             if len(it)>= maxlen:
215 | #                 return None
216 | #             else:
217 | #                 new_p.append([0]*pad + it + [0]*(maxlen-len(it)+pad))
218 | #         return np.array(new_p)
219 | #     return [padding(pair) for pair in pair_x]
220 |     
221 |     
222 |     
223 |     
224 | 
225 |     
226 |     
227 |           
228 |     
229 | 
230 | def tensors_key_in_file(file_name):
231 |     """Return tensors key in a checkpoint file.
232 |     Args:
233 |     file_name: Name of the checkpoint file.
234 |     """
235 |     try:
236 |         reader = pywrap_tensorflow.NewCheckpointReader(file_name)
237 |         return reader.get_variable_to_shape_map()
238 |     except Exception as e:  # pylint: disable=broad-except
239 |         print(str(e))
240 |         return None
241 | 
242 |      
243 | def get_minibatches_idx(n, minibatch_size, shuffle=False):
244 |     idx_list = np.arange(n, dtype="int32")
245 | 
246 |     if shuffle:
247 |         np.random.shuffle(idx_list)
248 | 
249 |     minibatches = []
250 |     minibatch_start = 0
251 |     for i in range(n // minibatch_size):
252 |         minibatches.append(idx_list[minibatch_start:
253 |                                     minibatch_start + minibatch_size])
254 |         minibatch_start += minibatch_size
255 | 
256 |     # if (minibatch_start != n):
257 |     #     # Make a minibatch out of what is left
258 |     #     minibatches.append(idx_list[minibatch_start:])
259 | 
260 |     return zip(range(len(minibatches)), minibatches)
261 |     
262 |     
263 | # def normalizing_L1(x, axis):
264 | #     norm = tf.sqrt(tf.reduce_sum(tf.square(x), axis=axis, keep_dims=True))
265 | #     normalized = x / (norm)
266 | #     return normalized   
267 |     
268 | def normalizing(x, axis):    
269 |     norm = tf.sqrt(tf.reduce_sum(tf.square(x), axis=axis, keep_dims=True))
270 |     normalized = x / (norm)   
271 |     return normalized
272 |     
273 | def _p(pp, name):
274 |     return '%s_%s' % (pp, name)
275 | 
276 | def dropout(X, trng, p=0.):
277 |     if p != 0:
278 |         retain_prob = 1 - p
279 |         X = X / retain_prob * trng.binomial(X.shape, p=retain_prob, dtype=theano.config.floatX)
280 |     return X
281 | 
282 | """ used for initialization of the parameters. """
283 | 
284 | def ortho_weight(ndim):
285 |     W = np.random.randn(ndim, ndim)
286 |     u, s, v = np.linalg.svd(W)
287 |     return u.astype(config.floatX)
288 |     
289 | def uniform_weight(nin,nout=None, scale=0.05):
290 |     if nout == None:
291 |         nout = nin
292 |     W = np.random.uniform(low=-scale, high=scale, size=(nin, nout))
293 |     return W.astype(config.floatX)
294 |     
295 | def normal_weight(nin,nout=None, scale=0.05):
296 |     if nout == None:
297 |         nout = nin
298 |     W = np.random.randn(nin, nout) * scale
299 |     return W.astype(config.floatX)
300 |     
301 | def zero_bias(ndim):
302 |     b = np.zeros((ndim,))
303 |     return b.astype(config.floatX)
304 | 
305 | """auxiliary function for KDE"""
306 | def log_mean_exp(A,b,sigma):
307 |     a=-0.5*((A-theano.tensor.tile(b,[A.shape[0],1]))**2).sum(1)/(sigma**2)
308 |     max_=a.max()
309 |     return max_+theano.tensor.log(theano.tensor.exp(a-theano.tensor.tile(max_,a.shape[0])).mean())
310 | 
311 | '''calculate KDE'''
312 | def cal_nkde(X,mu,sigma):
313 |     s1,updates=theano.scan(lambda i,s: s+log_mean_exp(mu,X[i,:],sigma), sequences=[theano.tensor.arange(X.shape[0])],outputs_info=[np.asarray(0.,dtype="float32")])
314 |     E=s1[-1]
315 |     Z=mu.shape[0]*theano.tensor.log(sigma*np.sqrt(np.pi*2))
316 |     return (Z-E)/mu.shape[0]
317 | 
318 | 
319 | """ BLEU score"""
320 | # def cal_BLEU(generated, reference):
321 | #     #the maximum is bigram, so assign the weight into 2 half.
322 | #     BLEUscore = 0.0
323 | #     for g in generated:
324 | #         BLEUscore += nltk.translate.bleu_score.sentence_bleu(reference, g)
325 | #     BLEUscore = BLEUscore/len(generated)
326 | #     return BLEUscore
327 | 
328 | def cal_ROUGE(generated, reference, is_corpus = False):
329 |     # ref and sample are both dict
330 |     # scorers = [
331 |     #     (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
332 |     #     (Meteor(),"METEOR"),
333 |     #     (Rouge(), "ROUGE_L"),
334 |     #     (Cider(), "CIDEr")
335 |     # ]
336 |     # output rouge 1-4 and rouge L and rouge L from pycocoevaluate
337 |     
338 |     
339 |     ROUGEscore = [0.0]*6
340 |     for idx, g in enumerate(generated):
341 |         score = [0.0]*6
342 |         if is_corpus:
343 |             for order in range(4):
344 |                 score[order] = rouge_n(g.split(), [x.split() for x in reference[0]], order+1, 0.5)
345 |             score[4] = rouge_l(g.split(), [x.split() for x in reference[0]], 0.5)
346 |             score[5], _ = Rouge().compute_score(reference, {0: [g]})
347 |             
348 |             
349 |         else:
350 |             for order in range(4):
351 |                 score[order] = rouge_n(g.split(), [reference[0][idx].split()], order+1, 0.5)
352 |             score[4] = rouge_l(g.split(), [reference[0][idx].split()], 0.5)
353 |             score[5], _ = Rouge().compute_score({0: [reference[0][idx]]}, {0: [g]})
354 |             #pdb.set_trace()
355 |         #print g, score
356 |         ROUGEscore = [ r+score[idx]  for idx,r in enumerate(ROUGEscore)] 
357 |         #BLEUscore += nltk.translate.bleu_score.sentence_bleu(reference, g, weight)
358 |     ROUGEscore = [r/len(generated) for r in ROUGEscore]
359 |     return ROUGEscore
360 |     
361 |     
362 | 
363 | 
364 | def cal_BLEU(generated, reference, is_corpus = False):
365 |     #print 'in BLEU score calculation'
366 |     #the maximum is bigram, so assign the weight into 2 half.
367 |     BLEUscore = [0.0,0.0,0.0]
368 |     for idx, g in enumerate(generated):
369 |         if is_corpus:
370 |             score, scores = Bleu(4).compute_score(reference, {0: [g]})
371 |         else:
372 |             score, scores = Bleu(4).compute_score({0: [reference[0][idx]]} , {0: [g]})
373 |         #print g, score
374 |         for i, s in zip([0,1,2],score[1:]):
375 |             BLEUscore[i]+=s
376 |         #BLEUscore += nltk.translate.bleu_score.sentence_bleu(reference, g, weight)
377 |     BLEUscore[0] = BLEUscore[0]/len(generated)
378 |     BLEUscore[1] = BLEUscore[1]/len(generated)
379 |     BLEUscore[2] = BLEUscore[2]/len(generated)
380 |     return BLEUscore
381 |     
382 | def cal_BLEU_4(generated, reference, is_corpus = False):
383 |     #print 'in BLEU score calculation'
384 |     #the maximum is bigram, so assign the weight into 2 half.
385 |     BLEUscore = [0.0,0.0,0.0,0.0]
386 |     for idx, g in enumerate(generated):
387 |         if is_corpus:
388 |             score, scores = Bleu(4).compute_score(reference, {0: [g]})
389 |         else:
390 |             score, scores = Bleu(4).compute_score({0: [reference[0][idx]]} , {0: [g]})
391 |         #print g, score
392 |         for i, s in zip([0,1,2,3],score):
393 |             BLEUscore[i]+=s
394 |         #BLEUscore += nltk.translate.bleu_score.sentence_bleu(reference, g, weight)
395 |     BLEUscore[0] = BLEUscore[0]/len(generated)
396 |     BLEUscore[1] = BLEUscore[1]/len(generated)
397 |     BLEUscore[2] = BLEUscore[2]/len(generated)
398 |     BLEUscore[3] = BLEUscore[3]/len(generated)
399 |     return BLEUscore
400 |  
401 | def prepare_for_bleu(sentence):
402 |     sent=[x for x in sentence if x!=0]
403 |     while len(sent)<4:
404 |         sent.append(0)
405 |     #sent = ' '.join([ixtoword[x] for x in sent])
406 |     sent = ' '.join([str(x) for x in sent])
407 |     return sent
408 | 
409 | 
410 | 
411 | def _clip_gradients_seperate_norm(grads_and_vars, clip_gradients):
412 |   """Clips gradients by global norm."""
413 |   gradients, variables = zip(*grads_and_vars)
414 |   clipped_gradients = [clip_ops.clip_by_norm(grad, clip_gradients) for grad in gradients]
415 |   return list(zip(clipped_gradients, variables))
416 | 


--------------------------------------------------------------------------------
/char_correction.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Yizhe Zhang, Dinghan Shen, Guoyin Wang
  4 | 
  5 | TextCNN
  6 | """
  7 | 
  8 | import os
  9 | import tensorflow as tf
 10 | from tensorflow.contrib import learn
 11 | from tensorflow.contrib import layers
 12 | from tensorflow.contrib import framework
 13 | from tensorflow.contrib.learn.python.learn import learn_runner
 14 | from tensorflow.python.platform import tf_logging as logging
 15 | import cPickle
 16 | import numpy as np
 17 | import os
 18 | import scipy.io as sio
 19 | from math import floor
 20 | import pdb
 21 | from model import *
 22 | from utils import prepare_data_for_cnn, prepare_data_for_rnn, \
 23 |     get_minibatches_idx, normalizing, restore_from_save, \
 24 |     prepare_for_bleu, cal_BLEU, sent2idx, _clip_gradients_seperate_norm
 25 | from denoise import *
 26 | from error_rate import prepare_for_cer, cal_cer
 27 | 
 28 | 
 29 | GPUID = 0
 30 | os.environ['CUDA_VISIBLE_DEVICES'] = str(GPUID)
 31 | profile = False
 32 | 
 33 | logging.set_verbosity(logging.INFO)
 34 | # Basic model parameters as external flags.
 35 | flags = tf.app.flags
 36 | FLAGS = flags.FLAGS
 37 | 
 38 | 
 39 | class Options(object):
 40 |     def __init__(self):
 41 |         # self.fix_emb = False
 42 |         self.reuse_w = False
 43 |         self.reuse_cnn = False
 44 |         self.reuse_discrimination = False  # reuse cnn for discrimination
 45 |         self.restore = True
 46 |         self.tanh = False  # activation fun for the top layer of cnn, otherwise relu
 47 |         self.model = 'cnn_deconv' #'cnn_deconv'  # 'cnn_rnn', 'rnn_rnn' , default: cnn_deconv
 48 | 
 49 |         self.permutation = 0.3
 50 |         self.substitution = 'sc'  # Deletion(d), Insertion(a), Substitution(s) and Permutation(p), c for char special
 51 | 
 52 |         self.W_emb = None
 53 |         self.cnn_W = None
 54 |         self.cnn_b = None
 55 |         self.maxlen = 221
 56 |         self.n_words = None
 57 |         self.filter_shape = 5
 58 |         self.filter_size = 300
 59 |         self.multiplier = 2
 60 |         self.lr = 1e-4
 61 | 
 62 |         self.layer = 3
 63 |         self.stride = [2,2]   # for two layer cnn/deconv , use self.stride[0]
 64 |         self.batch_size = 32
 65 |         self.max_epochs = 100
 66 |         self.n_gan = 900  # self.filter_size * 3
 67 |         self.L = 50
 68 | 
 69 |         self.optimizer = 'Adam' #tf.train.AdamOptimizer(beta1=0.9) #'Adam' # 'Momentum' , 'RMSProp'
 70 |         self.clip_grad = None  #100  #  20#
 71 |         self.attentive_emb = False
 72 |         self.decay_rate = 0.99
 73 |         self.relu_w = True
 74 | 
 75 |         self.save_path = "./save/" +str(self.n_gan) + "_dim_" + self.model + "_" + self.substitution + str(self.permutation)
 76 |         self.log_path = "./log"
 77 | 
 78 |         self.print_freq = 1
 79 |         self.valid_freq = 1
 80 | 
 81 |         # batch norm & dropout
 82 |         self.batch_norm = False
 83 |         self.cnn_layer_dropout = False
 84 |         self.dropout = False
 85 |         self.dropout_ratio = 0.5
 86 | 
 87 |         self.discrimination = False
 88 | 
 89 |         self.H_dis = 300
 90 | 
 91 |         self.sent_len = self.maxlen + 2*(self.filter_shape-1)
 92 |         self.sent_len2 = np.int32(floor((self.sent_len - self.filter_shape)
 93 |                             / self.stride[0]) + 1)
 94 |         self.sent_len3 = np.int32(floor((self.sent_len2 - self.filter_shape)
 95 |                             / self.stride[1]) + 1)
 96 | 
 97 |         # add char label
 98 |         self.char = True
 99 |         # dataset label
100 |         self.data = 'yahoo'  # option is three_small, three_char, imdb
101 |         print('Use model %s' % self.model)
102 |         print('Use %d conv/deconv layers' % self.layer)
103 | 
104 |     def __iter__(self):
105 |         for attr, value in self.__dict__.iteritems():
106 |             yield attr, value
107 | 
108 | def auto_encoder(x, x_org, is_train, opt, opt_t=None):
109 |     if not opt_t:
110 |         opt_t = opt
111 |     x_emb, W_norm = embedding(x, opt)   # batch L emb
112 |     x_emb = tf.expand_dims(x_emb, 3)   # batch L emb 1
113 |     res = {}
114 |     # cnn encoder
115 | 
116 |     H_enc, res = conv_encoder(x_emb, is_train, opt, res)
117 | 
118 |     H_dec = H_enc
119 | 
120 |     if opt.model == 'rnn_rnn':
121 |         loss, rec_sent_1, _ = seq2seq(x, x_org, opt)
122 |         _, rec_sent_2, _ = seq2seq(x, x_org, opt, feed_previous=True, is_reuse=True)
123 | 
124 |         res['rec_sents_feed_y'] = rec_sent_1
125 |         res['rec_sents'] = rec_sent_2
126 | 
127 | 
128 |     elif opt.model == 'cnn_rnn':
129 |         # lstm decoder
130 |         H_dec2 = tf.identity(H_dec)
131 |         loss, rec_sent_1, _ = lstm_decoder(H_dec, x_org, opt)  #
132 | 
133 |         _, rec_sent_2, _ = lstm_decoder(H_dec, x_org, opt, feed_previous=True, is_reuse=True)
134 | 
135 |         res['rec_sents_feed_y'] = rec_sent_1
136 |         res['rec_sents'] = rec_sent_2
137 | 
138 |     else:
139 | 
140 |         # deconv decoder
141 |         loss, res = deconv_decoder(H_dec, x_org, W_norm, is_train, opt_t, res)
142 | 
143 |     tf.summary.scalar('loss', loss)
144 |     summaries = [
145 |                 "learning_rate",
146 |                 "loss",
147 |                 "gradients",
148 |                 "gradient_norm",
149 |                 ]
150 | 
151 |     global_step = tf.Variable(0, trainable=False)
152 | 
153 | 
154 |     train_op = layers.optimize_loss(
155 |         loss,
156 |         global_step=global_step,
157 |         #aggregation_method=tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N,
158 |         #framework.get_global_step(),
159 |         optimizer=opt.optimizer,
160 |         clip_gradients=(lambda grad: _clip_gradients_seperate_norm(grad, opt.clip_grad)) if opt.clip_grad else None,
161 |         learning_rate_decay_fn=lambda lr,g: tf.train.exponential_decay(learning_rate=lr, global_step=g, decay_rate=opt.decay_rate, decay_steps=3000),
162 |         learning_rate=opt.lr,
163 |         summaries=summaries
164 |         )
165 |     return res, loss, train_op
166 | 
167 | 
168 | def run_model(opt, train, val, test, wordtoix, ixtoword):
169 | 
170 |     with tf.device('/gpu:1'):
171 |         x_ = tf.placeholder(tf.int32, shape=[opt.batch_size, opt.sent_len])
172 |         x_org_ = tf.placeholder(tf.int32, shape=[opt.batch_size, opt.sent_len])
173 |         is_train_ = tf.placeholder(tf.bool, name='is_train_')
174 |         res_, loss_, train_op = auto_encoder(x_, x_org_, is_train_, opt)
175 |         merged = tf.summary.merge_all()
176 |         summary_ext = tf.Summary()
177 | 
178 |     uidx = 0
179 |     config = tf.ConfigProto(log_device_placement=False,
180 |                             allow_soft_placement=True,
181 |                             graph_options=tf.GraphOptions(build_cost_model=1))
182 |     config.gpu_options.allow_growth = True
183 |     np.set_printoptions(precision=3)
184 |     np.set_printoptions(threshold=np.inf)
185 |     saver = tf.train.Saver()
186 | 
187 |     run_metadata = tf.RunMetadata()
188 | 
189 |     with tf.Session(config=config) as sess:
190 |         train_writer = tf.summary.FileWriter(opt.log_path + '/train', sess.graph)
191 |         test_writer = tf.summary.FileWriter(opt.log_path + '/test', sess.graph)
192 |         sess.run(tf.global_variables_initializer())
193 |         if opt.restore:
194 |             try:
195 |                 t_vars = tf.trainable_variables()
196 |                 loader = restore_from_save(t_vars, sess, opt)
197 |             except Exception as e:
198 |                 print(e)
199 |                 print("No saving session, using random initialization")
200 |                 sess.run(tf.global_variables_initializer())
201 | 
202 |         for epoch in range(opt.max_epochs):
203 |             print("Starting epoch %d" % epoch)
204 |             kf = get_minibatches_idx(len(train), opt.batch_size, shuffle=True)
205 |             for _, train_index in kf:
206 |                 uidx += 1
207 |                 sents = [train[t] for t in train_index]
208 | 
209 |                 sents_permutated = add_noise(sents, opt)
210 | 
211 |                 if opt.model != 'rnn_rnn' and opt.model != 'cnn_rnn':
212 |                     x_batch_org = prepare_data_for_cnn(sents, opt) # Batch L
213 |                 else:
214 |                     x_batch_org = prepare_data_for_rnn(sents, opt) # Batch L
215 | 
216 |                 if opt.model != 'rnn_rnn':
217 |                     x_batch = prepare_data_for_cnn(sents_permutated, opt) # Batch L
218 |                 else:
219 |                     x_batch = prepare_data_for_rnn(sents_permutated, opt, is_add_GO = False) # Batch L
220 |                 # x_print = sess.run([x_emb],feed_dict={x_: x_train} )
221 |                 # print x_print
222 | 
223 | 
224 |                 # res = sess.run(res_, feed_dict={x_: x_batch, x_org_:x_batch_org})
225 |                 # pdb.set_trace()
226 | 
227 |                 #
228 |                 if profile:
229 |                     _, loss = sess.run([train_op, loss_], feed_dict={x_: x_batch, x_org_: x_batch_org, is_train_:1},options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE),run_metadata=run_metadata)
230 |                 else:
231 |                     _, loss = sess.run([train_op, loss_], feed_dict={x_: x_batch, x_org_: x_batch_org, is_train_:1})
232 | 
233 |                 #pdb.set_trace()
234 | 
235 |                 if uidx % opt.valid_freq == 0:
236 |                     is_train = None
237 |                     valid_index = np.random.choice(len(val), opt.batch_size)
238 |                     val_sents = [val[t] for t in valid_index]
239 | 
240 |                     val_sents_permutated = add_noise(val_sents, opt)
241 | 
242 |                     if opt.model != 'rnn_rnn' and opt.model != 'cnn_rnn':
243 |                         x_val_batch_org = prepare_data_for_cnn(val_sents, opt)
244 |                     else:
245 |                         x_val_batch_org = prepare_data_for_rnn(val_sents, opt)
246 | 
247 |                     if opt.model != 'rnn_rnn':
248 |                         x_val_batch = prepare_data_for_cnn(val_sents_permutated, opt)
249 |                     else:
250 |                         x_val_batch = prepare_data_for_rnn(val_sents_permutated, opt, is_add_GO=False)
251 | 
252 |                     loss_val = sess.run(loss_, feed_dict={x_: x_val_batch, x_org_: x_val_batch_org, is_train_:is_train })
253 |                     print("Validation loss %f " % (loss_val))
254 |                     res = sess.run(res_, feed_dict={x_: x_val_batch, x_org_: x_val_batch_org, is_train_:is_train })
255 |                     if opt.discrimination:
256 |                         print ("Real Prob %f Fake Prob %f"%(res['prob_r'], res['prob_f']))
257 | 
258 |                     if opt.char:
259 |                         print "Val Orig :" + "".join([ixtoword[x] for x in val_sents[0] if x != 0])
260 |                         print "Val Perm :" + "".join([ixtoword[x] for x in val_sents_permutated[0] if x != 0])
261 |                         print "Val Recon:" + "".join([ixtoword[x] for x in res['rec_sents'][0] if x != 0])
262 |                         # print "Val Recon one hot:" + "".join([ixtoword[x] for x in res['rec_sents_one_hot'][0] if x != 0])
263 |                     else:
264 |                         print "Val Orig :" + " ".join([ixtoword[x] for x in val_sents[0] if x != 0])
265 |                         print "Val Perm :" + " ".join([ixtoword[x] for x in val_sents_permutated[0] if x != 0])
266 |                         print "Val Recon:" + " ".join([ixtoword[x] for x in res['rec_sents'][0] if x != 0])
267 | 
268 | 
269 |                     val_set = [prepare_for_bleu(s) for s in val_sents]
270 |                     [bleu2s,bleu3s,bleu4s] = cal_BLEU([prepare_for_bleu(s) for s in res['rec_sents']], {0: val_set})
271 |                     print 'Val BLEU (2,3,4): ' + ' '.join([str(round(it, 3)) for it in (bleu2s,bleu3s,bleu4s)])
272 | 
273 | 
274 |                     val_set_char = [prepare_for_cer(s, ixtoword) for s in val_sents]
275 |                     cer = cal_cer([prepare_for_cer(s, ixtoword) for s in res['rec_sents']], val_set_char)
276 |                     print 'Val CER: ' + str(round(cer, 3))
277 |                     # summary_ext.Value(tag='CER', simple_value=cer)
278 |                     summary_ext = tf.Summary(value=[tf.Summary.Value(tag='CER', simple_value=cer)])
279 |                     # tf.summary.scalar('CER', cer)
280 | 
281 |                     #if opt.model != 'rnn_rnn' and opt.model != 'cnn_rnn':
282 |                         #print "Gen Probs:" + " ".join([str(np.round(res['gen_p'][i], 1)) for i in range(len(res['rec_sents'][0])) if res['rec_sents'][0][i] != 0])
283 |                     summary = sess.run(merged, feed_dict={x_: x_val_batch, x_org_: x_val_batch_org, is_train_:is_train })
284 |                     test_writer.add_summary(summary, uidx)
285 |                     test_writer.add_summary(summary_ext, uidx)
286 |                     is_train = True
287 | 
288 | 
289 |                 if uidx%opt.print_freq == 0:
290 |                     print("Iteration %d: loss %f " %(uidx, loss))
291 |                     res = sess.run(res_, feed_dict={x_: x_batch, x_org_: x_batch_org, is_train_:1})
292 | 
293 |                     # if 1 in res['rec_sents'][0] or 1 in sents[0]:
294 |                     #     pdb.set_trace()
295 |                     if opt.char:
296 |                         print "Original     :" + "".join([ixtoword[x] for x in sents[0] if x != 0])
297 |                         print "Permutated   :" + "".join([ixtoword[x] for x in sents_permutated[0] if x != 0])
298 |                         if opt.model == 'rnn_rnn' or opt.model == 'cnn_rnn':
299 |                             print "Reconstructed:" + " ".join([ixtoword[x] for x in res['rec_sents_feed_y'][0] if x != 0])
300 |                         print "Reconstructed:" + "".join([ixtoword[x] for x in res['rec_sents'][0] if x != 0])
301 | 
302 | 
303 |                     else:
304 |                         print "Original     :" + " ".join([ixtoword[x] for x in sents[0] if x != 0])
305 |                         print "Permutated   :" + " ".join([ixtoword[x] for x in sents_permutated[0] if x != 0])
306 |                         if opt.model == 'rnn_rnn' or opt.model == 'cnn_rnn':
307 |                             print "Reconstructed:" + " ".join([ixtoword[x] for x in res['rec_sents_feed_y'][0] if x != 0])
308 |                         print "Reconstructed:" + " ".join([ixtoword[x] for x in res['rec_sents'][0] if x != 0])
309 | 
310 | 
311 |                     summary = sess.run(merged, feed_dict={x_: x_batch, x_org_: x_batch_org, is_train_:1})
312 |                     train_writer.add_summary(summary, uidx)
313 |                     # print res['x_rec'][0][0]
314 |                     # print res['x_emb'][0][0]
315 |                     if profile:
316 |                         tf.contrib.tfprof.model_analyzer.print_model_analysis(
317 |                         tf.get_default_graph(),
318 |                         run_meta=run_metadata,
319 |                         tfprof_options=tf.contrib.tfprof.model_analyzer.PRINT_ALL_TIMING_MEMORY)
320 | 
321 |             saver.save(sess, opt.save_path)
322 | 
323 | 
324 | 
325 | def main():
326 | 
327 | 
328 |     opt = Options()
329 |     if opt.char:
330 |         opt.n_words = 35
331 |         opt.embed_size = 35
332 |         opt.fix_emb = False
333 |         opt.filter_size= 300
334 | 
335 |     if opt.data == 'three_char':
336 |         loadpath = './data/three_corpus_correct_large_char.p'
337 |     elif opt.data == 'yahoo':
338 |         loadpath = './data/yahoo_char.p'
339 | 
340 |     # loadpath = "./data/three_corpus_corrected_large.p"
341 |     x = cPickle.load(open(loadpath,"rb"))
342 |     train, val, test                    = x[0], x[1], x[2]
343 |     train_text, val_text, test_text     = x[3], x[4], x[5]
344 |     train_lab, val_lab, test_lab        = x[6], x[7], x[8]
345 |     # wordtoix, ixtoword                  = x[9], x[10]
346 |     if opt.char:
347 |         wordtoix, ixtoword, alphabet = x[9], x[10], x[11]
348 |     else:
349 |         wordtoix, ixtoword = x[9], x[10]
350 | 
351 | 
352 |     # opt = Options()
353 |     if not opt.char:
354 |         opt.n_words = len(ixtoword) + 1
355 |         ixtoword[opt.n_words-1] = 'GO_'
356 |     print dict(opt)
357 |     print('Total words: %d' % opt.n_words)
358 | 
359 | 
360 |     run_model(opt, train, val, test, wordtoix, ixtoword)
361 | 
362 | 
363 | 
364 | if __name__ == '__main__':
365 |     main()
366 | 


--------------------------------------------------------------------------------
/auto_encoding_cnn_denoise.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Yizhe Zhang
  4 | 
  5 | TextCNN
  6 | """
  7 | ## 152.3.214.203/6006
  8 | 
  9 | import os
 10 | GPUID = 0
 11 | os.environ['CUDA_VISIBLE_DEVICES'] = str(GPUID)
 12 | 
 13 | import tensorflow as tf
 14 | from tensorflow.contrib import learn
 15 | from tensorflow.contrib import layers
 16 | #from tensorflow.contrib import metrics
 17 | #from tensorflow.contrib.learn import monitors
 18 | from tensorflow.contrib import framework
 19 | from tensorflow.contrib.learn.python.learn import learn_runner
 20 | from tensorflow.python.platform import tf_logging as logging
 21 | #from tensorflow.contrib.learn.python.learn.metric_spec import MetricSpec
 22 | import cPickle
 23 | import numpy as np
 24 | import os
 25 | import scipy.io as sio
 26 | from math import floor
 27 | import pdb
 28 | 
 29 | from model import *
 30 | from utils import prepare_data_for_cnn, prepare_data_for_rnn, get_minibatches_idx, normalizing, restore_from_save, \
 31 |     prepare_for_bleu, cal_BLEU, sent2idx, _clip_gradients_seperate_norm
 32 | from denoise import *
 33 | 
 34 | profile = False
 35 | #import tempfile
 36 | #from tensorflow.examples.tutorials.mnist import input_data
 37 | 
 38 | logging.set_verbosity(logging.INFO)
 39 | #tf.logging.verbosity(1)
 40 | # Basic model parameters as external flags.
 41 | flags = tf.app.flags
 42 | FLAGS = flags.FLAGS
 43 | #flags.DEFINE_string('train_dir', 'data', 'Directory to put the training data.')
 44 | 
 45 | 
 46 | 
 47 | 
 48 | 
 49 | class Options(object):
 50 |     def __init__(self):
 51 |         self.fix_emb = False
 52 |         self.reuse_w = False
 53 |         self.reuse_cnn = False
 54 |         self.reuse_discrimination = False  # reuse cnn for discrimination
 55 |         self.restore = True
 56 |         self.tanh = False  # activation fun for the top layer of cnn, otherwise relu
 57 |         self.model = 'rnn_rnn' #'cnn_deconv'  # 'cnn_rnn', 'rnn_rnn' , default: cnn_deconv
 58 | 
 59 |         self.permutation = 0
 60 |         self.substitution = 's'  # Deletion(d), Insertion(a), Substitution(s) and Permutation(p)
 61 | 
 62 |         self.W_emb = None
 63 |         self.cnn_W = None
 64 |         self.cnn_b = None
 65 |         self.maxlen = 61
 66 |         self.n_words = None
 67 |         self.filter_shape = 5
 68 |         self.filter_size = 300
 69 |         self.multiplier = 2
 70 |         self.embed_size = 300
 71 |         self.lr = 1e-4
 72 | 
 73 |         self.layer = 3
 74 |         self.stride = [2, 2, 2]   # for two layer cnn/deconv , use self.stride[0]
 75 |         self.batch_size = 32
 76 |         self.max_epochs = 100
 77 |         self.n_gan = 100  # self.filter_size * 3
 78 |         self.L = 100
 79 | 
 80 |         self.optimizer = 'Adam' #tf.train.AdamOptimizer(beta1=0.9) #'Adam' # 'Momentum' , 'RMSProp'
 81 |         self.clip_grad = None #None  #100  #  20#
 82 |         self.attentive_emb = False
 83 |         self.decay_rate = 0.99
 84 |         self.relu_w = False
 85 | 
 86 |         self.save_path = "./save/" +str(self.n_gan) + "_dim_" + self.model + "_" + self.substitution + str(self.permutation)
 87 |         self.log_path = "./log"
 88 |         self.print_freq = 1000
 89 |         self.valid_freq = 1000
 90 | 
 91 |         # batch norm & dropout
 92 |         self.batch_norm = False
 93 |         self.dropout = False
 94 |         self.dropout_ratio = 0.5
 95 | 
 96 |         self.discrimination = False
 97 |         self.H_dis = 300
 98 | 
 99 |         self.sent_len = self.maxlen + 2*(self.filter_shape-1)
100 |         self.sent_len2 = np.int32(floor((self.sent_len - self.filter_shape)/self.stride[0]) + 1)
101 |         self.sent_len3 = np.int32(floor((self.sent_len2 - self.filter_shape)/self.stride[1]) + 1)
102 |         self.sent_len4 = np.int32(floor((self.sent_len3 - self.filter_shape)/self.stride[2]) + 1)
103 |         print ('Use model %s' % self.model)
104 |         print ('Use %d conv/deconv layers' % self.layer)
105 | 
106 |     def __iter__(self):
107 |         for attr, value in self.__dict__.iteritems():
108 |             yield attr, value
109 | 
110 | def auto_encoder(x, x_org, is_train, opt, opt_t=None):
111 |     # print x.get_shape()  # batch L
112 |     if not opt_t: opt_t = opt
113 |     x_emb, W_norm = embedding(x, opt)  # batch L emb
114 |     x_emb = tf.expand_dims(x_emb, 3)  # batch L emb 1
115 | 
116 |     res = {}
117 |     #res['W'] = W_norm
118 |     # cnn encoder
119 |     H_enc, res = conv_encoder(x_emb, is_train, opt, res)
120 | 
121 |     # H_dec = layers.relu(Y4, 200, biases_initializer=biasInit)
122 |     H_dec = H_enc
123 |     # print x_rec.get_shape()
124 |     if opt.model == 'rnn_rnn':
125 |         loss, rec_sent_1, _ = seq2seq(x, x_org, opt)
126 |         _, rec_sent_2, _ = seq2seq(x, x_org, opt, feed_previous=True, is_reuse=True)
127 |         #res['logits'] = logits
128 |         res['rec_sents_feed_y'] = rec_sent_1
129 |         res['rec_sents'] = rec_sent_2
130 | 
131 | 
132 |     elif opt.model == 'cnn_rnn':
133 |         # lstm decoder
134 |         H_dec2 = tf.identity(H_dec)
135 |         if opt.rnn_share_emb:
136 |             loss, rec_sent_1, _ = lstm_decoder_embedding(H_dec2, x_org, W_norm, opt_t)  #
137 |             _, rec_sent_2, _ = lstm_decoder_embedding(H_dec2, x_org, W_norm, opt_t, feed_previous=True, is_reuse=True)
138 |         else:
139 |             loss, rec_sent_1, _ = lstm_decoder(H_dec2, x_org, opt_t)  #
140 |             _, rec_sent_2, _ = lstm_decoder(H_dec2, x_org, opt_t, feed_previous=True, is_reuse=True)
141 | 
142 | 
143 |         res['rec_sents_feed_y'] = rec_sent_1
144 |         res['rec_sents'] = rec_sent_2
145 |         # res['H1'],res['H2'],res['o1'],res['o2'] = H1, H2, o1, o2
146 | 
147 |     else:
148 | 
149 |         # deconv decoder
150 |         loss, res = deconv_decoder(H_dec, x_org, W_norm, is_train, opt_t, res)
151 | 
152 |     # *tf.cast(tf.not_equal(x_temp,0), tf.float32)
153 |     tf.summary.scalar('loss', loss)
154 |     summaries = [
155 |                 "learning_rate",
156 |                 "loss",
157 |                 # "gradients",
158 |                 # "gradient_norm",
159 |                 ]
160 |     global_step = tf.Variable(0, trainable=False)
161 | 
162 | 
163 |     train_op = layers.optimize_loss(
164 |         loss,
165 |         global_step = global_step,
166 |         #aggregation_method=tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N,
167 |         #framework.get_global_step(),
168 |         optimizer=opt.optimizer,
169 |         clip_gradients=(lambda grad: _clip_gradients_seperate_norm(grad, opt.clip_grad)) if opt.clip_grad else None,
170 |         learning_rate_decay_fn=lambda lr,g: tf.train.exponential_decay(learning_rate=lr, global_step = g, decay_rate=opt.decay_rate, decay_steps=3000),
171 |         learning_rate=opt.lr,
172 |         summaries = summaries
173 |         )
174 | 
175 |     # optimizer = tf.train.AdamOptimizer(learning_rate=opt.lr)  # Or another optimization algorithm.
176 |     # train_op = optimizer.minimize(
177 |     #     loss,
178 |     #     aggregation_method=tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N)
179 | 
180 | 
181 |     return res, loss, train_op
182 | 
183 | 
184 | def run_model(opt, train, val, test, wordtoix, ixtoword):
185 | 
186 | 
187 |     try:
188 |         params = np.load('./param_g.npz')
189 |         if params['Wemb'].shape == (opt.n_words, opt.embed_size):
190 |             print('Use saved embedding.')
191 |             opt.W_emb = params['Wemb']
192 |         else:
193 |             print('Emb Dimension mismatch: param_g.npz:'+ str(params['Wemb'].shape) + ' opt: ' + str((opt.n_words, opt.embed_size)))
194 |             opt.fix_emb = False
195 |     except IOError:
196 |         print('No embedding file found.')
197 |         opt.fix_emb = False
198 | 
199 |     with tf.device('/gpu:1'):
200 |         x_ = tf.placeholder(tf.int32, shape=[opt.batch_size, opt.sent_len])
201 |         x_org_ = tf.placeholder(tf.int32, shape=[opt.batch_size, opt.sent_len])
202 |         is_train_ = tf.placeholder(tf.bool, name='is_train_')
203 |         res_, loss_, train_op = auto_encoder(x_, x_org_, is_train_, opt)
204 |         merged = tf.summary.merge_all()
205 |         # opt.is_train = False
206 |         # res_val_, loss_val_, _ = auto_encoder(x_, x_org_, opt)
207 |         # merged_val = tf.summary.merge_all()
208 | 
209 |     #tensorboard --logdir=run1:/tmp/tensorflow/ --port 6006
210 |     #writer = tf.train.SummaryWriter(opt.log_path, graph=tf.get_default_graph())
211 | 
212 | 
213 |     uidx = 0
214 |     config = tf.ConfigProto(log_device_placement = False, allow_soft_placement=True, graph_options=tf.GraphOptions(build_cost_model=1))
215 |     #config = tf.ConfigProto(device_count={'GPU':0})
216 |     config.gpu_options.allow_growth = True
217 |     np.set_printoptions(precision=3)
218 |     np.set_printoptions(threshold=np.inf)
219 |     saver = tf.train.Saver()
220 | 
221 | 
222 | 
223 |     run_metadata = tf.RunMetadata()
224 | 
225 | 
226 |     with tf.Session(config = config) as sess:
227 |         train_writer = tf.summary.FileWriter(opt.log_path + '/train', sess.graph)
228 |         test_writer = tf.summary.FileWriter(opt.log_path + '/test', sess.graph)
229 |         sess.run(tf.global_variables_initializer())
230 |         if opt.restore:
231 |             try:
232 |                 #pdb.set_trace()
233 | 
234 |                 t_vars = tf.trainable_variables()
235 |                 #print([var.name[:-2] for var in t_vars])
236 |                 loader = restore_from_save(t_vars, sess, opt)
237 | 
238 | 
239 |             except Exception as e:
240 |                 print(e)
241 |                 print("No saving session, using random initialization")
242 |                 sess.run(tf.global_variables_initializer())
243 | 
244 |         for epoch in range(opt.max_epochs):
245 |             print("Starting epoch %d" % epoch)
246 |             # if epoch >= 10:
247 |             #     print("Relax embedding ")
248 |             #     opt.fix_emb = False
249 |             #     opt.batch_size = 2
250 |             kf = get_minibatches_idx(len(train), opt.batch_size, shuffle=True)
251 |             for _, train_index in kf:
252 |                 uidx += 1
253 |                 sents = [train[t] for t in train_index]
254 | 
255 |                 sents_permutated = add_noise(sents, opt)
256 | 
257 |                 #sents[0] = np.random.permutation(sents[0])
258 | 
259 |                 if opt.model != 'rnn_rnn' and opt.model != 'cnn_rnn':
260 |                     x_batch_org = prepare_data_for_cnn(sents, opt) # Batch L
261 |                 else:
262 |                     x_batch_org = prepare_data_for_rnn(sents, opt) # Batch L
263 | 
264 |                 if opt.model != 'rnn_rnn':
265 |                     x_batch = prepare_data_for_cnn(sents_permutated, opt) # Batch L
266 |                 else:
267 |                     x_batch = prepare_data_for_rnn(sents_permutated, opt, is_add_GO = False) # Batch L
268 |                 # x_print = sess.run([x_emb],feed_dict={x_: x_train} )
269 |                 # print x_print
270 | 
271 | 
272 |                 # res = sess.run(res_, feed_dict={x_: x_batch, x_org_:x_batch_org})
273 |                 # pdb.set_trace()
274 | 
275 |                 #
276 |                 if profile:
277 |                     _, loss = sess.run([train_op, loss_], feed_dict={x_: x_batch, x_org_: x_batch_org, is_train_:1},options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE),run_metadata=run_metadata)
278 |                 else:
279 |                     _, loss = sess.run([train_op, loss_], feed_dict={x_: x_batch, x_org_: x_batch_org, is_train_:1})
280 | 
281 |                 #pdb.set_trace()
282 | 
283 |                 if uidx % opt.valid_freq == 0:
284 |                     is_train = None
285 |                     valid_index = np.random.choice(len(val), opt.batch_size)
286 |                     val_sents = [val[t] for t in valid_index]
287 | 
288 |                     val_sents_permutated = add_noise(val_sents, opt)
289 | 
290 | 
291 |                     if opt.model != 'rnn_rnn' and opt.model != 'cnn_rnn':
292 |                         x_val_batch_org = prepare_data_for_cnn(val_sents, opt)
293 |                     else:
294 |                         x_val_batch_org = prepare_data_for_rnn(val_sents, opt)
295 | 
296 |                     if opt.model != 'rnn_rnn':
297 |                         x_val_batch = prepare_data_for_cnn(val_sents_permutated, opt)
298 |                     else:
299 |                         x_val_batch = prepare_data_for_rnn(val_sents_permutated, opt, is_add_GO=False)
300 | 
301 |                     loss_val = sess.run(loss_, feed_dict={x_: x_val_batch, x_org_: x_val_batch_org, is_train_:is_train })
302 |                     print("Validation loss %f " % (loss_val))
303 |                     res = sess.run(res_, feed_dict={x_: x_val_batch, x_org_: x_val_batch_org, is_train_:is_train })
304 |                     if opt.discrimination:
305 |                         print ("Real Prob %f Fake Prob %f" % (res['prob_r'], res['prob_f']))
306 |                     print "Val Orig :" + " ".join([ixtoword[x] for x in val_sents[0] if x != 0])
307 |                     #print "Val Perm :" + " ".join([ixtoword[x] for x in val_sents_permutated[0] if x != 0])
308 |                     print "Val Recon:" + " ".join([ixtoword[x] for x in res['rec_sents'][0] if x != 0])
309 | 
310 |                     val_set = [prepare_for_bleu(s) for s in val_sents]
311 |                     [bleu2s, bleu3s, bleu4s] = cal_BLEU([prepare_for_bleu(s) for s in res['rec_sents']], {0: val_set})
312 |                     print 'Val BLEU (2,3,4): ' + ' '.join([str(round(it, 3)) for it in (bleu2s, bleu3s, bleu4s)])
313 | 
314 |                     # if opt.model != 'rnn_rnn' and opt.model != 'cnn_rnn':
315 |                     #     print "Org Probs:" + " ".join(
316 |                     #         [ixtoword[x_val_batch_org[0][i]] + '(' + str(np.round(res['all_p'][i], 1)) + ')' for i in
317 |                     #          range(len(res['rec_sents'][0])) if res['rec_sents'][0][i] != 0])
318 |                     #     print "Gen Probs:" + " ".join(
319 |                     #         [ixtoword[res['rec_sents'][0][i]] + '(' + str(np.round(res['gen_p'][i], 1)) + ')' for i in
320 |                     #          range(len(res['rec_sents'][0])) if res['rec_sents'][0][i] != 0])
321 | 
322 |                     summary = sess.run(merged, feed_dict={x_: x_val_batch, x_org_: x_val_batch_org, is_train_:is_train })
323 |                     test_writer.add_summary(summary, uidx)
324 |                     is_train = True
325 | 
326 |                 def test_input(text):
327 |                     x_input = sent2idx(text, wordtoix, opt)
328 |                     res = sess.run(res_, feed_dict={x_: x_input, x_org_: x_batch_org})
329 |                     print "Reconstructed:" + " ".join([ixtoword[x] for x in res['rec_sents'][0] if x != 0])
330 | 
331 |                 if uidx % opt.print_freq == 0:
332 |                     #pdb.set_trace()
333 |                     print("Iteration %d: loss %f " % (uidx, loss))
334 |                     res = sess.run(res_, feed_dict={x_: x_batch, x_org_: x_batch_org, is_train_:1})
335 |                     print "Original     :" + " ".join([ixtoword[x] for x in sents[0] if x != 0])
336 |                     #print "Permutated   :" + " ".join([ixtoword[x] for x in sents_permutated[0] if x != 0])
337 |                     if opt.model == 'rnn_rnn' or opt.model == 'cnn_rnn':
338 |                         print "Reconstructed:" + " ".join([ixtoword[x] for x in res['rec_sents_feed_y'][0] if x != 0])
339 |                     print "Reconstructed:" + " ".join([ixtoword[x] for x in res['rec_sents'][0] if x != 0])
340 | 
341 |                     # print "Probs:" + " ".join([ixtoword[res['rec_sents'][0][i]] +'(' +str(np.round(res['all_p'][i],2))+')' for i in range(len(res['rec_sents'][0])) if res['rec_sents'][0][i] != 0])
342 | 
343 |                     summary = sess.run(merged, feed_dict={x_: x_batch, x_org_: x_batch_org, is_train_:1})
344 |                     train_writer.add_summary(summary, uidx)
345 |                     # print res['x_rec'][0][0]
346 |                     # print res['x_emb'][0][0]
347 |                     if profile:
348 |                         tf.contrib.tfprof.model_analyzer.print_model_analysis(
349 |                         tf.get_default_graph(),
350 |                         run_meta=run_metadata,
351 |                         tfprof_options=tf.contrib.tfprof.model_analyzer.PRINT_ALL_TIMING_MEMORY)
352 | 
353 |             saver.save(sess, opt.save_path, global_step=epoch)
354 | 
355 | 
356 | 
357 | def main():
358 |     #global n_words
359 |     # Prepare training and testing data
360 |     #loadpath = "./data/three_corpus_small.p"
361 |     loadpath = "./data/three_corpus_corrected_large.p"
362 |     x = cPickle.load(open(loadpath,"rb"))
363 |     train, val, test                    = x[0], x[1], x[2]
364 |     train_text, val_text, test_text     = x[3], x[4], x[5]
365 |     train_lab, val_lab, test_lab        = x[6], x[7], x[8]
366 |     wordtoix, ixtoword                  = x[9], x[10]
367 | 
368 |     opt = Options()
369 |     opt.n_words = len(ixtoword) + 1
370 |     ixtoword[opt.n_words-1] = 'GO_'
371 |     print dict(opt)
372 |     print('Total words: %d' % opt.n_words)
373 | 
374 | 
375 |     run_model(opt, train, val, test, wordtoix, ixtoword)
376 | 
377 |     # model_fn = auto_encoder
378 |     # ae = learn.Estimator(model_fn=model_fn)
379 |     # ae.fit(train, opt , steps=opt.max_epochs)
380 | 
381 | 
382 | #
383 | # def main(argv=None):
384 | #     learn_runner.run(experiment_fn, FLAGS.train_dir)
385 | 
386 | if __name__ == '__main__':
387 |     main()
388 | 


--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Yizhe Zhang
  3 | 
  4 | Main model file
  5 | """
  6 | import tensorflow as tf
  7 | from tensorflow.contrib import learn
  8 | from tensorflow.contrib import layers
  9 | from tensorflow.contrib import metrics
 10 | #from tensorflow.contrib.learn import monitors
 11 | from tensorflow.contrib import framework
 12 | from tensorflow.contrib.learn.python.learn import learn_runner
 13 | from tensorflow.python.platform import tf_logging as logging
 14 | from tensorflow.contrib.learn.python.learn.metric_spec import MetricSpec
 15 | from tensorflow.contrib.legacy_seq2seq import rnn_decoder, embedding_rnn_decoder, sequence_loss, embedding_rnn_seq2seq, embedding_tied_rnn_seq2seq
 16 | import pdb
 17 | import copy
 18 | from utils import normalizing, lrelu
 19 | from tensorflow.python.framework import ops
 20 | from tensorflow.python.ops import nn_ops, math_ops, embedding_ops, variable_scope
 21 | 
 22 | 
 23 | 
 24 | def embedding(features, opt, prefix = '', is_reuse = None):
 25 |     """Customized function to transform batched x into embeddings."""
 26 |     # Convert indexes of words into embeddings.
 27 | 
 28 | 
 29 | 
 30 | 
 31 |     #    b = tf.get_variable('b', [opt.embed_size], initializer = tf,random_uniform_initializer(-0.01, 0.01))
 32 |     with tf.variable_scope(prefix+'embed', reuse=is_reuse):
 33 |         if opt.fix_emb:
 34 |             assert(hasattr(opt,'emb'))
 35 |             assert(np.shape(np.array(opt.emb))==(opt.n_words, opt.embed_size))
 36 |             W = tf.get_variable('W', [opt.n_words, opt.embed_size], weights_initializer = opt.emb, is_trainable = False)
 37 |         else:
 38 |             weightInit = tf.random_uniform_initializer(-0.001, 0.001)
 39 |             W = tf.get_variable('W', [opt.n_words, opt.embed_size], initializer = weightInit)
 40 |         # tf.stop_gradient(W)
 41 |     if hasattr(opt, 'relu_w') and opt.relu_w:
 42 |         W = tf.nn.relu(W)
 43 | 
 44 |     W_norm = normalizing(W, 1)
 45 |     word_vectors = tf.nn.embedding_lookup(W_norm, features)
 46 | 
 47 | 
 48 |     return word_vectors, W_norm
 49 | 
 50 | 
 51 | def embedding_only(opt, prefix = '', is_reuse = None):
 52 |     """Customized function to transform batched x into embeddings."""
 53 |     # Convert indexes of words into embeddings.
 54 |     with tf.variable_scope(prefix+'embed', reuse=is_reuse):
 55 |         if opt.fix_emb:
 56 |             assert(hasattr(opt,'emb'))
 57 |             assert(np.shape(np.array(opt.emb))==(opt.n_words, opt.embed_size))
 58 |             W = tf.get_variable('W', [opt.n_words, opt.embed_size], weights_initializer = opt.emb, is_trainable = False)
 59 |         else:
 60 |             weightInit = tf.random_uniform_initializer(-0.001, 0.001)
 61 |             W = tf.get_variable('W', [opt.n_words, opt.embed_size], initializer = weightInit)
 62 |     #    b = tf.get_variable('b', [opt.embed_size], initializer = tf,random_uniform_initializer(-0.01, 0.01))
 63 |     if hasattr(opt, 'relu_w') and opt.relu_w:
 64 |         W = tf.nn.relu(W)
 65 | 
 66 |     W_norm = normalizing(W, 1)
 67 | 
 68 |     return W_norm
 69 | 
 70 | def classifier_2layer(H, opt, dropout = 1, prefix = '', num_outputs=1, is_reuse= None):
 71 |     # last layer must be linear
 72 |     H = tf.squeeze(H)
 73 |     biasInit = tf.constant_initializer(0.001, dtype=tf.float32)
 74 |     H_dis = layers.fully_connected(tf.nn.dropout(H, keep_prob = dropout), num_outputs = opt.H_dis, biases_initializer=biasInit, activation_fn = tf.nn.relu, scope = prefix + 'dis_1', reuse = is_reuse)
 75 |     logits = layers.linear(tf.nn.dropout(H_dis, keep_prob = dropout), num_outputs = num_outputs, biases_initializer=biasInit, scope = prefix + 'dis_2', reuse = is_reuse)
 76 |     return logits
 77 | 
 78 | 
 79 | 
 80 | def discriminator(x, W, opt, prefix = 'd_', is_prob = False, is_reuse = None):
 81 |     W_norm_d = tf.identity(W)   # deep copy
 82 |     tf.stop_gradient(W_norm_d)  # the discriminator won't update W
 83 |     if is_prob:
 84 |         x_emb = tf.tensordot(x, W_norm_d, [[2],[0]])  # batch L emb
 85 |     else:
 86 |         x_emb = tf.nn.embedding_lookup(W_norm_d, x)   # batch L emb
 87 | 
 88 |     # print x_emb.get_shape()
 89 |     x_emb = tf.expand_dims(x_emb,3)   # batch L emb 1
 90 | 
 91 | 
 92 |     if opt.layer == 4:
 93 |         H = conv_model_4layer(x_emb, opt, prefix = prefix, is_reuse = is_reuse)
 94 |     elif opt.layer == 3:
 95 |         H = conv_model_3layer(x_emb, opt, prefix = prefix, is_reuse = is_reuse)
 96 |     else: # layer == 2
 97 |         H = conv_model(x_emb, opt, prefix = prefix, is_reuse = is_reuse)
 98 | 
 99 |     logits = discriminator_2layer(H, opt, prefix= prefix, is_reuse = is_reuse)
100 |     return logits, tf.squeeze(H)
101 | 
102 | 
103 | def conv_encoder(x_emb, is_train, opt, res, is_reuse = None, prefix = ''):
104 |     if hasattr(opt, 'multiplier'):
105 |         multiplier = opt.multiplier
106 |     else:
107 |         multiplier = 2
108 |     if opt.layer == 4:
109 |         H_enc = conv_model_4layer(x_emb, opt, is_train = is_train, is_reuse = is_reuse, prefix = prefix)
110 |     elif opt.layer == 3:
111 |         H_enc = conv_model_3layer(x_emb, opt, is_train = is_train, multiplier = multiplier, is_reuse = is_reuse, prefix = prefix)
112 |     elif opt.layer == 0:
113 |         H_enc = conv_model_3layer_old(x_emb, opt, is_reuse = is_reuse, prefix = prefix)
114 |     else:
115 |         H_enc = conv_model(x_emb, opt, is_train = is_train, is_reuse = is_reuse, prefix = prefix)
116 |     return H_enc, res
117 | 
118 | def deconv_decoder(H_dec, x_org, W_norm, is_train, opt, res, prefix = '', is_reuse = None):
119 |     if hasattr(opt, 'multiplier'):
120 |         multiplier = opt.multiplier
121 |     else:
122 |         multiplier = 2
123 |     # H_dec  batch 1 1 n_gan
124 |     if opt.layer == 4:
125 |         x_rec = deconv_model_4layer(H_dec, opt, is_train = is_train, prefix = prefix, is_reuse = is_reuse)  #  batch L emb 1
126 |     elif opt.layer == 3:
127 |         x_rec = deconv_model_3layer(H_dec, opt, is_train = is_train, multiplier = multiplier, prefix= prefix, is_reuse = is_reuse)  #  batch L emb 1
128 |     elif opt.layer == 0:
129 |         x_rec = deconv_model_3layer(H_dec, opt, prefix= prefix, is_reuse = is_reuse)  #  batch L emb 1
130 |     else:
131 |         x_rec = deconv_model(H_dec, opt, is_train = is_train, prefix= prefix, is_reuse = is_reuse)  #  batch L emb 1
132 |     print("Decoder len %d Output len %d" % (x_rec.get_shape()[1], x_org.get_shape()[1]))
133 |     tf.assert_equal(x_rec.get_shape()[1], x_org.get_shape()[1])
134 |     x_rec_norm = normalizing(x_rec, 2)    # batch L emb
135 |     #W_reshape = tf.reshape(tf.transpose(W),[1,1,opt.embed_size,opt.n_words])
136 |     #print all_idx.get_shape()
137 | 
138 |     # if opt.fix_emb:
139 |     #
140 |     #     #loss = tf.reduce_sum((x_emb-x_rec)**2) # L2 is bad
141 |     #     # cosine sim
142 |     #       # Batch L emb
143 |     #     loss = -tf.reduce_sum(x_rec_norm * x_emb)
144 |     #     rec_sent = tf.argmax(tf.tensordot(tf.squeeze(x_rec_norm) , W_norm, [[2],[1]]),2)
145 |     #     res['rec_sents'] = rec_sent
146 |     #
147 |     # else:
148 |     x_temp = tf.reshape(x_org, [-1,])
149 |     if hasattr(opt, 'attentive_emb') and opt.attentive_emb:
150 |         emb_att = tf.get_variable(prefix+'emb_att', [1,opt.embed_size], initializer = tf.constant_initializer(1.0, dtype=tf.float32))
151 |         prob_logits = tf.tensordot(tf.squeeze(x_rec_norm), emb_att*W_norm, [[2],[1]])  # c_blv = sum_e x_ble W_ve
152 |     else:
153 |         prob_logits = tf.tensordot(tf.squeeze(x_rec_norm), W_norm, [[2],[1]])  # c_blv = sum_e x_ble W_ve
154 | 
155 |     prob = tf.nn.log_softmax(prob_logits*opt.L, dim=-1, name=None)
156 |     #prob = normalizing(tf.reduce_sum(x_rec_norm * W_reshape, 2), 2)
157 |     #prob = softmax_prediction(x_rec_norm, opt)
158 |     rec_sent = tf.squeeze(tf.argmax(prob,2))
159 |     prob = tf.reshape(prob, [-1,opt.n_words])
160 | 
161 |     idx = tf.range(opt.batch_size * opt.sent_len)
162 |     #print idx.get_shape(), idx.dtype
163 | 
164 |     all_idx = tf.transpose(tf.stack(values=[idx,x_temp]))
165 |     all_prob = tf.gather_nd(prob, all_idx)
166 | 
167 |     #pdb.set_trace()
168 | 
169 |     gen_temp = tf.cast(tf.reshape(rec_sent, [-1,]), tf.int32)
170 |     gen_idx = tf.transpose(tf.stack(values=[idx,gen_temp]))
171 |     gen_prob = tf.gather_nd(prob, gen_idx)
172 | 
173 |     res['rec_sents'] = rec_sent
174 | 
175 |     #res['gen_p'] = tf.exp(gen_prob[0:opt.sent_len])
176 |     #res['all_p'] = tf.exp(all_prob[0:opt.sent_len])
177 | 
178 |     if opt.discrimination:
179 |         logits_real, _ = discriminator(x_org, W_norm, opt)
180 |         prob_one_hot = tf.nn.log_softmax(prob_logits*opt.L, dim=-1, name=None)
181 |         logits_syn, _ = discriminator(tf.exp(prob_one_hot), W_norm, opt, is_prob = True, is_reuse = True)
182 | 
183 |         res['prob_r'] =  tf.reduce_mean(tf.nn.sigmoid(logits_real))
184 |         res['prob_f'] = tf.reduce_mean(tf.nn.sigmoid(logits_syn))
185 | 
186 |         loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels = tf.ones_like(logits_real), logits = logits_real)) + \
187 |                      tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels = tf.zeros_like(logits_syn), logits = logits_syn))
188 |     else:
189 |         loss = -tf.reduce_mean( all_prob)
190 |     return loss, res
191 | 
192 | 
193 | 
194 | 
195 | 
196 | 
197 | def regularization(X, opt, is_train, prefix= '', is_reuse= None):
198 |     if '_X' not in prefix and '_H_dec' not in prefix:
199 |         if opt.batch_norm:
200 |             X = layers.batch_norm(X, decay=0.9, center=True, scale=True, is_training=is_train, scope=prefix+'_bn', reuse = is_reuse)
201 |         X = tf.nn.relu(X)
202 |     X = X if not opt.cnn_layer_dropout else layers.dropout(X, keep_prob = opt.dropout_ratio, scope=prefix + '_dropout')
203 | 
204 |     return X
205 | 
206 | 
207 | conv_acf = tf.nn.tanh # tf.nn.relu
208 | 
209 | def conv_model(X, opt, prefix = '', is_reuse= None, is_train = True):  # 2layers
210 |     #XX = tf.reshape(X, [-1, , 28, 1])
211 |     #X shape: batchsize L emb 1
212 |     if opt.reuse_cnn:
213 |         biasInit = opt.cnn_b
214 |         weightInit = opt.cnn_W
215 |     else:
216 |         biasInit = None if opt.batch_norm else tf.constant_initializer(0.001, dtype=tf.float32)
217 |         weightInit = tf.constant_initializer(0.001, dtype=tf.float32)
218 | 
219 |     X = regularization(X, opt,  prefix= prefix + 'reg_X', is_reuse= is_reuse, is_train = is_train)
220 |     H1 = layers.conv2d(X,  num_outputs=opt.filter_size,  kernel_size=[opt.filter_shape, opt.embed_size], stride = [opt.stride[0],1],  weights_initializer = weightInit, biases_initializer=biasInit, activation_fn=None, padding = 'VALID', scope = prefix + 'H1', reuse = is_reuse)  # batch L-3 1 Filtersize
221 | 
222 |     H1 = regularization(H1, opt, prefix= prefix + 'reg_H1', is_reuse= is_reuse, is_train = is_train)
223 |     H2 = layers.conv2d(H1,  num_outputs=opt.filter_size*2,  kernel_size=[opt.sent_len2, 1],  activation_fn=conv_acf , padding = 'VALID', scope = prefix + 'H2', reuse = is_reuse) # batch 1 1 2*Filtersize
224 |     return H2
225 | 
226 | 
227 | def conv_model_3layer(X, opt, prefix = '', is_reuse= None, num_outputs = None, is_train = True, multiplier = 2):
228 |     #XX = tf.reshape(X, [-1, , 28, 1])
229 |     #X shape: batchsize L emb 1
230 |     if opt.reuse_cnn:
231 |         biasInit = opt.cnn_b
232 |         weightInit = opt.cnn_W
233 |     else:
234 |         biasInit = None if opt.batch_norm else tf.constant_initializer(0.001, dtype=tf.float32)
235 |         weightInit = tf.constant_initializer(0.001, dtype=tf.float32)
236 | 
237 |     X = regularization(X, opt,  prefix= prefix + 'reg_X', is_reuse= is_reuse, is_train = is_train)
238 |     H1 = layers.conv2d(X,  num_outputs=opt.filter_size,  kernel_size=[opt.filter_shape, opt.embed_size], stride = [opt.stride[0],1],  weights_initializer = weightInit, biases_initializer=biasInit, activation_fn=None, padding = 'VALID', scope = prefix + 'H1_3', reuse = is_reuse)  # batch L-3 1 Filtersize
239 | 
240 |     H1 = regularization(H1, opt, prefix= prefix + 'reg_H1', is_reuse= is_reuse, is_train = is_train)
241 |     H2 = layers.conv2d(H1,  num_outputs=opt.filter_size*multiplier,  kernel_size=[opt.filter_shape, 1], stride = [opt.stride[1],1],  biases_initializer=biasInit, activation_fn=None, padding = 'VALID', scope = prefix + 'H2_3', reuse = is_reuse)
242 |     #print H2.get_shape()
243 |     H2 = regularization(H2, opt,  prefix= prefix + 'reg_H2', is_reuse= is_reuse, is_train = is_train)
244 |     H3 = layers.conv2d(H2,  num_outputs= (num_outputs if num_outputs else opt.n_gan),  kernel_size=[opt.sent_len3, 1], activation_fn=tf.nn.tanh , padding = 'VALID', scope = prefix + 'H3_3', reuse = is_reuse) # batch 1 1 2*Filtersize
245 | 
246 |     #pdb.set_trace()
247 |     return H3
248 | 
249 | 
250 | def conv_model_3layer_old(X, opt, prefix = '', is_reuse= None, num_outputs = None):
251 |     #XX = tf.reshape(X, [-1, , 28, 1])
252 |     #X shape: batchsize L emb 1
253 | 
254 |     biasInit = tf.constant_initializer(0.001, dtype=tf.float32)
255 |     weightInit = tf.constant_initializer(0.001, dtype=tf.float32)
256 | 
257 |     H1 = layers.conv2d(X,  num_outputs=opt.filter_size,  kernel_size=[opt.filter_shape, opt.embed_size], stride = [opt.stride[0],1], weights_initializer = weightInit, biases_initializer=biasInit, activation_fn=tf.nn.relu, padding = 'VALID', scope = prefix + 'H1_3', reuse = is_reuse)  # batch L-3 1 Filtersize
258 |     H2 = layers.conv2d(H1,  num_outputs=opt.filter_size*2,  kernel_size=[opt.filter_shape, 1], stride = [opt.stride[1],1], biases_initializer=biasInit, activation_fn=tf.nn.relu, padding = 'VALID', scope = prefix + 'H2_3', reuse = is_reuse)
259 |     #print H2.get_shape()
260 |     H3 = layers.conv2d(H2,  num_outputs= (num_outputs if num_outputs else opt.n_gan),  kernel_size=[opt.sent_len3, 1], biases_initializer=biasInit, activation_fn=tf.nn.tanh, padding = 'VALID', scope = prefix + 'H3_3', reuse = is_reuse) # batch 1 1 2*Filtersize
261 |     return H3
262 | 
263 | 
264 | def conv_model_4layer(X, opt, prefix = '', is_reuse= None, num_outputs = None, is_train = True):
265 |     #XX = tf.reshape(X, [-1, , 28, 1])
266 |     #X shape: batchsize L emb 1
267 |     if opt.reuse_cnn:
268 |         biasInit = opt.cnn_b
269 |         weightInit = opt.cnn_W
270 |     else:
271 |         biasInit = None if opt.batch_norm else tf.constant_initializer(0.001, dtype=tf.float32)
272 |         weightInit = tf.constant_initializer(0.001, dtype=tf.float32)
273 | 
274 |     X = regularization(X, opt,  prefix= prefix + 'reg_X', is_reuse= is_reuse, is_train = is_train)
275 |     H1 = layers.conv2d(X,  num_outputs=opt.filter_size,  kernel_size=[opt.filter_shape, opt.embed_size], stride = [opt.stride[0],1],  weights_initializer = weightInit, biases_initializer=biasInit, activation_fn=None, padding = 'VALID', scope = prefix + 'H1_3', reuse = is_reuse)  # batch L-3 1 Filtersize
276 | 
277 |     H1 = regularization(H1, opt, prefix= prefix + 'reg_H1', is_reuse= is_reuse, is_train = is_train)
278 |     H2 = layers.conv2d(H1,  num_outputs=opt.filter_size*2,  kernel_size=[opt.filter_shape, 1], stride = [opt.stride[1],1],  biases_initializer=biasInit, activation_fn=None, padding = 'VALID', scope = prefix + 'H2_3', reuse = is_reuse)
279 | 
280 |     H2 = regularization(H2, opt, prefix= prefix + 'reg_H2', is_reuse= is_reuse, is_train = is_train)
281 |     H3 = layers.conv2d(H2,  num_outputs=opt.filter_size*4,  kernel_size=[opt.filter_shape, 1], stride = [opt.stride[2],1],  biases_initializer=biasInit, activation_fn=None, padding = 'VALID', scope = prefix + 'H3_3', reuse = is_reuse)
282 |     #print H2.get_shape()
283 |     H3 = regularization(H3, opt, prefix= prefix + 'reg_H3', is_reuse= is_reuse, is_train = is_train)
284 |     H4 = layers.conv2d(H3,  num_outputs= (num_outputs if num_outputs else opt.n_gan),  kernel_size=[opt.sent_len4, 1], activation_fn=conv_acf , padding = 'VALID', scope = prefix + 'H4', reuse = is_reuse) # batch 1 1 2*Filtersize
285 |     return H4
286 | 
287 | 
288 | dec_acf = tf.nn.relu #tf.nn.tanh
289 | dec_bias = None # tf.constant_initializer(0.001, dtype=tf.float32)
290 | 
291 | def deconv_model(H, opt, prefix = '', is_reuse= None, is_train = True):
292 |     biasInit = None if opt.batch_norm else tf.constant_initializer(0.001, dtype=tf.float32)
293 |     #H2t = tf.reshape(H, [H.shape[0],1,1,H.shape[1]])
294 | #    print tf.shape(H)
295 | #    H2t = tf.expand_dims(H,1)
296 | #    H2t = tf.expand_dims(H,1)
297 | 
298 |     H2t = H
299 | 
300 |     H2t = regularization(H2t, opt, prefix= prefix + 'reg_H_dec', is_reuse= is_reuse, is_train = is_train)
301 |     H1t = layers.conv2d_transpose(H2t, num_outputs=opt.filter_size,  kernel_size=[opt.sent_len2, 1],  biases_initializer=biasInit, activation_fn=None ,padding = 'VALID', scope =  prefix + 'H1_t', reuse = is_reuse)
302 | 
303 |     H1t = regularization(H1t, opt, prefix= prefix + 'reg_H1_dec', is_reuse= is_reuse, is_train = is_train)
304 |     Xhat = layers.conv2d_transpose(H1t, num_outputs=1,  kernel_size=[opt.filter_shape, opt.embed_size], stride = [opt.stride[0],1], biases_initializer=dec_bias, activation_fn=dec_acf, padding = 'VALID',scope = prefix + 'Xhat_t', reuse = is_reuse)
305 |     #print H2t.get_shape(), H1t.get_shape(), Xhat.get_shape()
306 |     return Xhat
307 | 
308 | def deconv_model_3layer(H, opt, prefix = '', is_reuse= None, is_train = True, multiplier = 2):
309 |     #XX = tf.reshape(X, [-1, , 28, 1])
310 |     #X shape: batchsize L emb 1
311 |     biasInit = None if opt.batch_norm else tf.constant_initializer(0.001, dtype=tf.float32)
312 | 
313 |     H3t = H
314 | 
315 |     H3t = regularization(H3t, opt, prefix= prefix + 'reg_H_dec', is_reuse= is_reuse, is_train = is_train)
316 |     H2t = layers.conv2d_transpose(H3t, num_outputs=opt.filter_size*multiplier,  kernel_size=[opt.sent_len3, 1],  biases_initializer=biasInit, activation_fn=None ,padding = 'VALID', scope = prefix + 'H2_t_3', reuse = is_reuse)
317 | 
318 |     H2t = regularization(H2t, opt, prefix= prefix + 'reg_H2_dec', is_reuse= is_reuse, is_train = is_train)
319 |     H1t = layers.conv2d_transpose(H2t, num_outputs=opt.filter_size,  kernel_size=[opt.filter_shape, 1], stride = [opt.stride[1],1],  biases_initializer=biasInit, activation_fn=None ,padding = 'VALID', scope = prefix + 'H1_t_3', reuse = is_reuse)
320 | 
321 |     H1t = regularization(H1t, opt, prefix= prefix + 'reg_H1_dec', is_reuse= is_reuse, is_train = is_train)
322 |     Xhat = layers.conv2d_transpose(H1t, num_outputs=1,  kernel_size=[opt.filter_shape, opt.embed_size], stride = [opt.stride[0],1],  biases_initializer=dec_bias, activation_fn=dec_acf, padding = 'VALID',scope = prefix + 'Xhat_t_3', reuse = is_reuse)
323 |     #print H2t.get_shape(),H1t.get_shape(),Xhat.get_shape()
324 | 
325 |     return Xhat
326 | 
327 | 
328 | 
329 | 
330 | def deconv_model_4layer(H, opt, prefix = '', is_reuse= None, is_train = True):
331 |     #XX = tf.reshape(X, [-1, , 28, 1])
332 |     #X shape: batchsize L emb 1
333 |     biasInit = None if opt.batch_norm else tf.constant_initializer(0.001, dtype=tf.float32)
334 | 
335 |     H4t = H
336 | 
337 |     H4t = regularization(H4t, opt, prefix= prefix + 'reg_H_dec', is_reuse= is_reuse, is_train = is_train)
338 |     H3t = layers.conv2d_transpose(H4t, num_outputs=opt.filter_size*4,  kernel_size=[opt.sent_len4, 1],   biases_initializer=biasInit, activation_fn=None ,padding = 'VALID', scope = prefix + 'H3_t_3', reuse = is_reuse)
339 | 
340 |     H3t = regularization(H3t, opt, prefix= prefix + 'reg_H3_dec', is_reuse= is_reuse, is_train = is_train)
341 |     H2t = layers.conv2d_transpose(H3t, num_outputs=opt.filter_size*2,  kernel_size=[opt.filter_shape, 1], stride = [opt.stride[2],1],  biases_initializer=biasInit, activation_fn=None ,padding = 'VALID', scope = prefix + 'H2_t_3', reuse = is_reuse)
342 | 
343 |     H2t = regularization(H2t, opt, prefix= prefix + 'reg_H2_dec', is_reuse= is_reuse, is_train = is_train)
344 |     H1t = layers.conv2d_transpose(H2t, num_outputs=opt.filter_size,  kernel_size=[opt.filter_shape, 1], stride = [opt.stride[1],1],  biases_initializer=biasInit, activation_fn=None ,padding = 'VALID', scope = prefix + 'H1_t_3', reuse = is_reuse)
345 | 
346 |     H1t = regularization(H1t, opt, prefix= prefix + 'reg_H1_dec', is_reuse= is_reuse, is_train = is_train)
347 |     Xhat = layers.conv2d_transpose(H1t, num_outputs=1,  kernel_size=[opt.filter_shape, opt.embed_size], stride = [opt.stride[0],1],  biases_initializer=dec_bias, activation_fn=dec_acf, padding = 'VALID',scope = prefix + 'Xhat_t_3', reuse = is_reuse)
348 |     #print H2t.get_shape(),H1t.get_shape(),Xhat.get_shape()
349 |     return Xhat
350 | 
351 | 
352 | 
353 | 
354 | 
355 | 
356 | 


--------------------------------------------------------------------------------
/semi_supervised.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Yizhe Zhang
  4 | 
  5 | TextCNN
  6 | """
  7 | ## 152.3.214.203/6006
  8 | 
  9 | import os
 10 | 
 11 | GPUID = 1
 12 | os.environ['CUDA_VISIBLE_DEVICES'] = str(GPUID)
 13 | 
 14 | import tensorflow as tf
 15 | from tensorflow.contrib import learn
 16 | from tensorflow.contrib import layers
 17 | # from tensorflow.contrib import metrics
 18 | # from tensorflow.contrib.learn import monitors
 19 | from tensorflow.contrib import framework
 20 | from tensorflow.contrib.learn.python.learn import learn_runner
 21 | from tensorflow.python.platform import tf_logging as logging
 22 | # from tensorflow.contrib.learn.python.learn.metric_spec import MetricSpec
 23 | import cPickle
 24 | import numpy as np
 25 | import os
 26 | import scipy.io as sio
 27 | from math import floor
 28 | import pdb
 29 | 
 30 | from model import *
 31 | from utils import prepare_data_for_cnn, prepare_data_for_rnn, get_minibatches_idx, normalizing, restore_from_save, \
 32 |     prepare_for_bleu, cal_BLEU, sent2idx, _clip_gradients_seperate_norm
 33 | from denoise import *
 34 | 
 35 | # import tempfile
 36 | # from tensorflow.examples.tutorials.mnist import input_data
 37 | 
 38 | logging.set_verbosity(logging.INFO)
 39 | # Basic model parameters as external flags.
 40 | flags = tf.app.flags
 41 | FLAGS = flags.FLAGS
 42 | 
 43 | 
 44 | # flags.DEFINE_string('train_dir', 'data', 'Directory to put the training data.')
 45 | 
 46 | class Options(object):
 47 |     def __init__(self):
 48 |         self.fix_emb = False
 49 |         self.reuse_w = True
 50 |         self.reuse_cnn = False
 51 |         self.reuse_discrimination = True  # reuse cnn for discrimination
 52 |         self.restore = True
 53 |         self.tanh = True  # activation fun for the top layer of cnn, otherwise relu
 54 |         self.model = 'cnn_deconv'  # 'cnn_rnn', 'rnn_rnn' , default: cnn_deconv
 55 | 
 56 |         self.permutation = 0
 57 |         self.substitution = 's'  # Deletion(d), Insertion(a), Substitution(s) and Permutation(p)
 58 | 
 59 |         self.W_emb = None
 60 |         self.cnn_W = None
 61 |         self.cnn_b = None
 62 |         self.maxlen = 305
 63 |         self.n_words = None
 64 |         self.filter_shape = 5
 65 |         self.filter_size = 300
 66 |         self.multiplier = 1 # filtersize multiplier
 67 |         self.embed_size = 300
 68 |         self.lr = 2e-4
 69 |         self.layer = 3
 70 |         self.stride = [2, 2 ,2]  # for two layer cnn/deconv , use self.stride[0]
 71 |         self.batch_size = 64
 72 |         self.dis_batch_size = 64
 73 |         self.max_epochs = 1000
 74 |         self.n_gan = 500  # self.filter_size * 3
 75 |         self.L = 100
 76 | 
 77 |         self.optimizer = 'Adam'  # tf.train.AdamOptimizer(beta1=0.9) #'Adam' # 'Momentum' , 'RMSProp'
 78 |         self.clip_grad = None  # None  #100  #  20#
 79 |         self.attentive_emb = False
 80 |         self.decay_rate = 1
 81 | 
 82 |         self.save_path = "./save/yelp" #"./save/yelp_500_new"
 83 |         self.log_path = "./log"
 84 |         self.print_freq = 100
 85 |         self.valid_freq = 1000
 86 |         
 87 |         self.part_data = False
 88 |         #self.portion = float(sys.argv[1])  # 10%  1%
 89 |         self.portion = 1.0  # 10%  1%
 90 | 
 91 |         # batch norm & dropout
 92 |         self.batch_norm = False
 93 |         self.cnn_layer_dropout = False
 94 |         self.dropout_ratio = 0.5 # keep probability.
 95 |         self.rec_alpha = 1
 96 |         self.rec_decay_freq = 50
 97 |         self.pretrain_step = 50000
 98 | 
 99 |         self.discrimination = False
100 |         self.H_dis = 300
101 | 
102 |         self.sent_len = self.maxlen + 2 * (self.filter_shape - 1)
103 |         self.sent_len2 = np.int32(floor((self.sent_len - self.filter_shape) / self.stride[0]) + 1)
104 |         self.sent_len3 = np.int32(floor((self.sent_len2 - self.filter_shape) / self.stride[1]) + 1)
105 |         self.sent_len4 = np.int32(floor((self.sent_len3 - self.filter_shape) / self.stride[2]) + 1)
106 |         print ('Use model %s' % self.model)
107 |         print ('Use %d conv/deconv layers' % self.layer)
108 | 
109 |     def __iter__(self):
110 |         for attr, value in self.__dict__.iteritems():
111 |             yield attr, value
112 | 
113 | 
114 | def semi_classifier(alpha, x, x_org, x_lab, y, dp_ratio, opt, opt_t=None):
115 |     # print x.get_shape()  # batch L
116 |     is_train = True
117 |     if not opt_t: opt_t = opt
118 |     x_lab_emb, W_norm = embedding(x_lab, opt)  # batch L emb
119 |     x_emb = tf.nn.embedding_lookup(W_norm, x)
120 |     x_emb = tf.expand_dims(x_emb, 3)  # batch L emb 1
121 |     x_lab_emb = tf.expand_dims(x_lab_emb, 3)  # batch L emb 1
122 |     x_lab_emb= tf.nn.dropout(x_lab_emb, dp_ratio)
123 |     res = {}
124 | 
125 |     # cnn encoder
126 |     H_enc, res = conv_encoder(x_emb, is_train, opt, res)
127 |     H_lab_enc, res = conv_encoder(x_lab_emb, is_train, opt, res, is_reuse = True)
128 |     H_dec = H_enc
129 | 
130 |     #H_lab_enc = tf.nn.dropout(H_lab_enc, opt.dropout_ratio)
131 |     logits = classifier_2layer(H_lab_enc, opt, dropout = dp_ratio, prefix='classify', is_reuse=None)
132 |     dis_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logits))
133 |     
134 |     # calculate the accuracy
135 |     prob = tf.nn.sigmoid(logits)
136 | 
137 |     # if opt.model == 'rnn_rnn':
138 |     #     rec_loss, rec_sent_1, _ = seq2seq(x, x_org, opt)
139 |     #     _, rec_sent_2, _ = seq2seq(x, x_org, opt, feed_previous=True, is_reuse=True)
140 |     #     res['rec_sents_feed_y'] = rec_sent_1
141 |     #     res['rec_sents'] = rec_sent_2
142 | 
143 |     # elif opt.model == 'cnn_rnn':
144 |     #     # lstm decoder
145 |     #     H_dec2 = tf.identity(H_dec)
146 |     #     rec_loss, rec_sent_1, _ = lstm_decoder(H_dec, x_org, opt)  #
147 | 
148 |     #     _, rec_sent_2, _ = lstm_decoder(H_dec, x_org, opt, feed_previous=True, is_reuse=True)
149 | 
150 |     #     res['rec_sents_feed_y'] = rec_sent_1
151 |     #     res['rec_sents'] = rec_sent_2
152 | 
153 |     # else:
154 | 
155 |     #     # deconv decoder
156 |     rec_loss, res = deconv_decoder(H_dec, x_org, W_norm, is_train, opt_t, res)
157 | 
158 |     correct_prediction = tf.equal(tf.round(prob), tf.round(y))
159 |     accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
160 | 
161 |     # calculate the total loss
162 |     loss = alpha * rec_loss + (1-alpha) * dis_loss
163 | 
164 |     tf.summary.scalar('loss', loss)
165 |     tf.summary.scalar('rec_loss', rec_loss)
166 |     tf.summary.scalar('dis_loss', dis_loss)
167 |     summaries = [
168 |         # "learning_rate",
169 |         "loss"
170 |         # "gradients",
171 |         # "gradient_norm",
172 |     ]
173 |     global_step = tf.Variable(0, trainable=False)
174 |     train_op = layers.optimize_loss(
175 |         loss,
176 |         global_step=global_step,
177 |         # framework.get_global_step(),
178 |         optimizer=opt.optimizer,
179 |         clip_gradients=(lambda grad: _clip_gradients_seperate_norm(grad, opt.clip_grad)) if opt.clip_grad else None,
180 |         #learning_rate_decay_fn=lambda lr, g: tf.train.exponential_decay(learning_rate=lr, global_step=g,
181 |         #                                                                decay_rate=opt.decay_rate, decay_steps=3000),
182 |         learning_rate=opt.lr,
183 |         summaries=summaries
184 |     )
185 |     return res, dis_loss, rec_loss, loss, train_op, prob, accuracy
186 | 
187 | 
188 | def run_model(opt, train_unlab_x, train_lab_x, train_lab, val_unlab_x, val_lab_x, val_lab, test, test_y, wordtoix, ixtoword):
189 |     try:
190 |         params = np.load('./param_g.npz')
191 |         if params['Wemb'].shape == (opt.n_words, opt.embed_size):
192 |             print('Use saved embedding.')
193 |             opt.W_emb = params['Wemb']
194 |         else:
195 |             print('Emb Dimension mismatch: param_g.npz:' + str(params['Wemb'].shape) + ' opt: ' + str(
196 |                 (opt.n_words, opt.embed_size)))
197 |             opt.fix_emb = False
198 |     except IOError:
199 |         print('No embedding file found.')
200 |         opt.fix_emb = False
201 | 
202 |     with tf.device('/gpu:1'):
203 |         alpha_ = tf.placeholder(tf.float32, shape=())
204 |         x_ = tf.placeholder(tf.int32, shape=[opt.batch_size, opt.sent_len])
205 |         x_org_ = tf.placeholder(tf.int32, shape=[opt.batch_size, opt.sent_len])
206 |         x_lab_ = tf.placeholder(tf.int32, shape=[opt.dis_batch_size, opt.sent_len])
207 |         y_ = tf.placeholder(tf.float32, shape=[opt.dis_batch_size, 1])
208 |         dp_ratio_ = tf.placeholder(tf.float32, name='dp_ratio_')
209 |         res_, dis_loss_, rec_loss_, loss_, train_op, prob_, acc_ = semi_classifier(alpha_, x_, x_org_, x_lab_, y_, dp_ratio_, opt)
210 |         merged = tf.summary.merge_all()
211 | 
212 |     uidx = 0
213 |     max_val_accuracy = 0.0
214 |     max_test_accuracy = 0.0
215 |     config = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)
216 |     # config = tf.ConfigProto(device_count={'GPU':0})
217 |     config.gpu_options.allow_growth = True
218 |     np.set_printoptions(precision=3)
219 |     np.set_printoptions(threshold=np.inf)
220 |     saver = tf.train.Saver()
221 | 
222 |     with tf.Session(config=config) as sess:
223 |         train_writer = tf.summary.FileWriter(opt.log_path + '/train', sess.graph)
224 |         test_writer = tf.summary.FileWriter(opt.log_path + '/test', sess.graph)
225 |         sess.run(tf.global_variables_initializer())
226 |         if opt.restore:
227 |             try:
228 |                 t_vars = tf.trainable_variables()
229 |                 loader = restore_from_save(t_vars, sess, opt)
230 | 
231 |             except Exception as e:
232 |                 print(e)
233 |                 print("No saving session, using random initialization")
234 |                 sess.run(tf.global_variables_initializer())
235 | 
236 |         for epoch in range(opt.max_epochs):
237 | 
238 |             print("Starting epoch %d" % epoch)
239 | 
240 |             kf = get_minibatches_idx(len(train_unlab_x), opt.batch_size, shuffle=True)
241 |             for _, train_index in kf:
242 |                 uidx += 1
243 |                 
244 |                 if opt.rec_alpha > 0 and uidx > opt.pretrain_step and uidx % opt.rec_decay_freq == 0:
245 |                     opt.rec_alpha -= 0.01
246 |                     print "alpha: "+  str(opt.rec_alpha)
247 |                 
248 |                 sents = [train_unlab_x[t] for t in train_index]
249 | 
250 |                 lab_index = np.random.choice(len(train_lab), opt.dis_batch_size, replace=False)
251 |                 lab_sents = [train_lab_x[t] for t in lab_index]
252 |                 batch_lab = [train_lab[t] for t in lab_index]
253 |                 batch_lab = np.array(batch_lab)
254 |                 batch_lab = batch_lab.reshape((len(batch_lab), 1))
255 |                 x_batch_lab = prepare_data_for_cnn(lab_sents, opt)
256 | 
257 |                 sents_permutated = add_noise(sents, opt)
258 | 
259 |                 if opt.model != 'rnn_rnn' and opt.model != 'cnn_rnn':
260 |                     x_batch_org = prepare_data_for_cnn(sents, opt)  # Batch L
261 |                 else:
262 |                     x_batch_org = prepare_data_for_rnn(sents, opt)  # Batch L
263 | 
264 |                 if opt.model != 'rnn_rnn':
265 |                     x_batch = prepare_data_for_cnn(sents_permutated, opt)  # Batch L
266 |                 else:
267 |                     x_batch = prepare_data_for_rnn(sents_permutated, opt, is_add_GO=False)  # Batch L
268 | 
269 |                 _, dis_loss, rec_loss, loss, acc = sess.run([train_op, dis_loss_, rec_loss_, loss_, acc_],
270 |                                                        feed_dict= {alpha_: opt.rec_alpha, x_: x_batch, x_org_: x_batch_org, x_lab_: x_batch_lab, y_: batch_lab, dp_ratio_: opt.dropout_ratio})
271 |                 summary = sess.run(merged, feed_dict={alpha_: opt.rec_alpha, x_: x_batch, x_org_: x_batch_org, x_lab_: x_batch_lab, y_: batch_lab, dp_ratio_: opt.dropout_ratio})
272 |                 train_writer.add_summary(summary, uidx)
273 | 
274 | 
275 |                 if uidx % opt.print_freq == 0:
276 |                     print("Iteration %d: dis_loss %f, rec_loss %f, loss %f, acc %f " % (uidx, dis_loss, rec_loss, loss, acc))
277 | 
278 |                 if uidx % opt.valid_freq == 0:
279 |                     #print("Iteration %d: dis_loss %f, rec_loss %f, loss %f " % (uidx, dis_loss, rec_loss, loss))
280 |                     valid_index = np.random.choice(len(val_unlab_x), opt.batch_size)
281 |                     val_sents = [val_unlab_x[t] for t in valid_index]
282 |                     
283 |                     val_sents_permutated = add_noise(val_sents, opt)
284 | 
285 |                     if opt.model != 'rnn_rnn' and opt.model != 'cnn_rnn':
286 |                         x_val_batch_org = prepare_data_for_cnn(val_sents, opt)
287 |                     else:
288 |                         x_val_batch_org = prepare_data_for_rnn(val_sents, opt)
289 | 
290 |                     if opt.model != 'rnn_rnn':
291 |                         x_val_batch = prepare_data_for_cnn(val_sents_permutated, opt)
292 |                     else:
293 |                         x_val_batch = prepare_data_for_rnn(val_sents_permutated, opt, is_add_GO=False)
294 | 
295 |                     rec_loss_val = sess.run(rec_loss_, feed_dict={x_: x_val_batch,
296 |                                                                   x_org_: x_val_batch_org, dp_ratio_: 1.0})
297 |                     print("Validation rec loss %f " % rec_loss_val)
298 | 
299 |                     kf_val = get_minibatches_idx(len(val_lab_x), opt.dis_batch_size, shuffle=False)
300 |                     
301 |                     prob_val = []
302 |                     for _, val_ind in kf_val:
303 |                         val_sents = [val_lab_x[t] for t in val_ind]
304 |                         x_val_dis = prepare_data_for_cnn(val_sents, opt)
305 |                         val_y = np.array([val_lab[t] for t in val_ind]).reshape((opt.dis_batch_size, 1))
306 |                         val_prob = sess.run(prob_, feed_dict={x_lab_: x_val_dis, dp_ratio_: 1.0})
307 |                         for x in val_prob:
308 |                             prob_val.append(x)
309 | 
310 |                     ##### DON'T UNDERSTAND :error   val_index
311 |                     # probs = []
312 |                     # val_truth = []
313 |                     # for i in range(len(val_lab)):
314 |                     #     val_truth.append(val_lab[i])
315 |                     #     if type(val_index[i]) != int:
316 |                     #         temp = []
317 |                     #         for j in val_index[i]:
318 |                     #             temp.append(prob_val[j])
319 |                     #         aver = sum(temp) * 1.0 / len(temp)
320 |                     #         probs.append(aver)
321 |                     #     else:
322 |                     #         probs.append(prob_val[val_index[i]])
323 | 
324 |                     probs = []
325 |                     val_truth = []
326 |                     for i in range(len(prob_val)):
327 |                         val_truth.append(val_lab[i])
328 |                         probs.append(prob_val[i])        
329 | 
330 |                     count = 0.0
331 |                     for i in range(len(probs)):
332 |                         p = probs[i]
333 |                         if p > 0.5:
334 |                             if val_truth[i] == 1:
335 |                                 count += 1.0
336 |                         else:
337 |                             if val_truth[i] == 0:
338 |                                 count += 1.0
339 | 
340 |                     val_accuracy = count * 1.0 / len(probs)
341 | 
342 | 
343 | 
344 |                     print("Validation accuracy %f " % val_accuracy)
345 | 
346 |                     summary = sess.run(merged,
347 |                                        feed_dict={alpha_: opt.rec_alpha, x_: x_val_batch, x_org_: x_val_batch_org, x_lab_: x_val_dis, y_: val_y, dp_ratio_: 1.0})
348 |                     test_writer.add_summary(summary, uidx)
349 | 
350 |                     if val_accuracy >= max_val_accuracy:
351 |                         max_val_accuracy = val_accuracy
352 | 
353 |                         kf_test = get_minibatches_idx(len(test), opt.dis_batch_size, shuffle=False)
354 |                         prob_test = []
355 |                         for _, test_ind in kf_test:
356 |                             test_sents = [test[t] for t in test_ind]
357 |                             x_test_batch = prepare_data_for_cnn(test_sents, opt)
358 |                             test_prob = sess.run(prob_, feed_dict={x_lab_: x_test_batch, dp_ratio_: 1.0})
359 |                             for x in test_prob:
360 |                                 prob_test.append(x)
361 | 
362 |                         probs = []
363 |                         test_truth = []
364 |                         for i in range(len(prob_test)):
365 |                             test_truth.append(test_y[i])
366 |                             probs.append(prob_test[i])
367 | 
368 |                         # probs = []
369 |                         # test_truth = []
370 |                         # for i in range(len(test_y)):
371 |                         #     test_truth.append(test_y[i])
372 |                         #     if type(test_index[i]) != int:
373 |                         #         temp = [prob_test[j] for j in test_index[i]]
374 |                         #         aver = sum(temp) * 1.0 / len(temp)
375 |                         #         probs.append(aver)
376 |                         #     else:
377 |                         #         probs.append(prob_test[test_index[i]])
378 | 
379 |                         count = 0.0
380 |                         for i in range(len(probs)):
381 |                             p = probs[i]
382 |                             if p > 0.5:
383 |                                 if test_truth[i] == 1.0:
384 |                                     count += 1.0
385 |                             else:
386 |                                 if test_truth[i] == 0.0:
387 |                                     count += 1.0
388 | 
389 |                         test_accuracy = count * 1.0 / len(probs)
390 | 
391 |                         print("Test accuracy %f " % test_accuracy)
392 | 
393 |                         max_test_accuracy = test_accuracy
394 | 
395 |                 def test_input(text):
396 |                     x_input = sent2idx(text, wordtoix, opt)
397 |                     res = sess.run(res_, feed_dict={x_: x_input, x_org_: x_batch_org})
398 |                     print "Reconstructed:" + " ".join([ixtoword[x] for x in res['rec_sents'][0] if x != 0])
399 | 
400 | 
401 |                     # res = sess.run(res_, feed_dict={x_: x_batch, x_org_: x_batch_org, is_train_: 1})
402 |                     # print "Original     :" + " ".join([ixtoword[x] for x in sents[0] if x != 0])
403 |                     # # print "Permutated   :" + " ".join([ixtoword[x] for x in sents_permutated[0] if x != 0])
404 |                     # if opt.model == 'rnn_rnn' or opt.model == 'cnn_rnn':
405 |                     #     print "Reconstructed:" + " ".join([ixtoword[x] for x in res['rec_sents_feed_y'][0] if x != 0])
406 |                     # print "Reconstructed:" + " ".join([ixtoword[x] for x in res['rec_sents'][0] if x != 0])
407 | 
408 |                     # print "Probs:" + " ".join([ixtoword[res['rec_sents'][0][i]] +'(' +str(np.round(res['all_p'][i],2))+')' for i in range(len(res['rec_sents'][0])) if res['rec_sents'][0][i] != 0])
409 | 
410 | 
411 |             print(opt.rec_alpha)
412 |             print("Epoch %d: Max Valid accuracy %f" % (epoch, max_val_accuracy))
413 |             print("Epoch %d: Max Test accuracy %f" % (epoch, max_test_accuracy))
414 | 
415 | 
416 | 
417 |             saver.save(sess, opt.save_path, global_step=epoch)
418 | 
419 | 
420 | def main():
421 |     # global n_words
422 |     # Prepare training and testing data
423 |     loadpath = "./data/yelp.p"
424 |     x = cPickle.load(open(loadpath, "rb"))
425 |     train, val, test = x[0], x[1], x[2]
426 |     train_lab, val_lab, test_lab = x[3], x[4], x[5]
427 |     wordtoix, ixtoword = x[6], x[7]
428 |     
429 |     train_unlab_x = [list(s) for s in train]
430 |     train_lab_x = [list(s) for s in train]
431 |     val_unlab_x = [list(s) for s in val]
432 |     val_lab_x = [list(s) for s in val]
433 |     test = [list(s) for s in test]
434 | 
435 |     train_lab = np.array(train_lab, dtype='float32')
436 |     val_lab = np.array(val_lab, dtype='float32')
437 |     test_lab = np.array(test_lab, dtype='float32')
438 | 
439 |     opt = Options()
440 |     opt.n_words = len(ixtoword)
441 |     print dict(opt)
442 |     print('Total words: %d' % opt.n_words)
443 | 
444 |     run_model(opt, train_unlab_x, train_lab_x, train_lab, val_unlab_x, val_lab_x, val_lab,
445 |               test, test_lab, wordtoix, ixtoword)
446 | 
447 | 
448 | 
449 | 
450 | if __name__ == '__main__':
451 |     main()
452 | 


--------------------------------------------------------------------------------