├── README.md ├── picture └── s2vt.png └── s2vt ├── .gitignore ├── cnn_util.py ├── coco_eval ├── create_reference.py ├── create_result_json.py ├── eval.py ├── parse_video_csv.py ├── pycocoevalcap │ ├── __init__.py │ ├── __init__.pyc │ ├── bleu │ │ ├── LICENSE │ │ ├── __init__.py │ │ ├── __init__.pyc │ │ ├── bleu.py │ │ ├── bleu.pyc │ │ ├── bleu_scorer.py │ │ └── bleu_scorer.pyc │ ├── cider │ │ ├── __init__.py │ │ ├── __init__.pyc │ │ ├── cider.py │ │ ├── cider.pyc │ │ ├── cider_scorer.py │ │ └── cider_scorer.pyc │ ├── eval.py │ ├── eval.pyc │ ├── meteor │ │ ├── __init__.py │ │ ├── __init__.pyc │ │ ├── data │ │ │ └── paraphrase-en.gz │ │ ├── meteor-1.5.jar │ │ ├── meteor.py │ │ └── meteor.pyc │ ├── rouge │ │ ├── __init__.py │ │ ├── __init__.pyc │ │ ├── rouge.py │ │ └── rouge.pyc │ └── tokenizer │ │ ├── __init__.py │ │ ├── __init__.pyc │ │ ├── ptbtokenizer.py │ │ ├── ptbtokenizer.pyc │ │ ├── stanford-corenlp-3.4.1.jar │ │ ├── tmp0vwKED │ │ ├── tmp3FFDJw │ │ ├── tmp_jaQxJ │ │ ├── tmpc6XoAB │ │ ├── tmpcRaDxK │ │ ├── tmpkJaFVH │ │ ├── tmpoInl8I │ │ └── tmprhiH1L └── pycocotools │ ├── __init__.py │ ├── __init__.pyc │ ├── coco.py │ └── coco.pyc ├── data └── readme.md ├── extract_RGB_feats.py ├── model_temp └── readme.md ├── readme.md ├── rgb_test_features └── readme.md ├── rgb_train_features └── readme.md └── s2vt.py /README.md: -------------------------------------------------------------------------------- 1 | # pytorch_video_caption 2 | Some models for video captioning, implemented in PyTorch. 3 | ### s2vt 4 | Sequence to sequence: from video to text (S2VT), implemented in PyTorch. 5 | Some other implementations: 6 | - [tensorflow](https://github.com/chenxinpeng/S2VT) by chenxinpeng 7 | - [caffe](https://gist.github.com/vsubhashini/38d087e140854fee4b14) by the original authors.
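A minimal sketch of the S2VT layout in PyTorch (illustrative only: the layer sizes, variable names, and zero-padding shortcut below are assumptions, not the exact model defined in s2vt.py):

```python
import torch
import torch.nn as nn

class S2VTSketch(nn.Module):
    """Two stacked LSTMs: the first reads frame features, the second reads the
    first LSTM's hidden states concatenated with word embeddings."""
    def __init__(self, feat_dim=4096, embed_dim=500, hidden_dim=1000, vocab_size=3000):
        super(S2VTSketch, self).__init__()
        self.frame_embed = nn.Linear(feat_dim, embed_dim)            # project CNN features
        self.word_embed = nn.Embedding(vocab_size, embed_dim)
        self.lstm1 = nn.LSTM(embed_dim, hidden_dim, batch_first=True)               # video LSTM
        self.lstm2 = nn.LSTM(embed_dim + hidden_dim, hidden_dim, batch_first=True)  # language LSTM
        self.out = nn.Linear(hidden_dim, vocab_size)

    def forward(self, feats, captions):
        # feats: (batch, n_frames, feat_dim); captions: (batch, n_words) word ids
        batch, n_frames, _ = feats.size()
        n_words = captions.size(1)
        embed_dim = self.word_embed.embedding_dim
        # encoding stage: frames go into lstm1 while zeros stand in for words;
        # decoding stage: zeros stand in for frames while words go into lstm2
        v = torch.cat([self.frame_embed(feats),
                       feats.new_zeros(batch, n_words, embed_dim)], dim=1)
        h1, _ = self.lstm1(v)
        w = torch.cat([feats.new_zeros(batch, n_frames, embed_dim),
                       self.word_embed(captions)], dim=1)
        h2, _ = self.lstm2(torch.cat([w, h1], dim=2))
        # score only the decoding-stage outputs against the next-word targets
        return self.out(h2[:, n_frames:, :])
```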
8 | 9 | -------------------------------------------------------------------------------- /picture/s2vt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smallflyingpig/pytorch_video_caption/c725e0a1662c6c606f3c25b0e46b7cf576fb86e6/picture/s2vt.png -------------------------------------------------------------------------------- /s2vt/.gitignore: -------------------------------------------------------------------------------- 1 | # python 2 | 3 | # data 4 | *.npy 5 | *.csv 6 | *.pyc 7 | -------------------------------------------------------------------------------- /s2vt/cnn_util.py: -------------------------------------------------------------------------------- 1 | 2 | import sys 3 | sys.path.append('/root/workspace/caffe/python') 4 | import caffe 5 | 6 | #import ipdb 7 | import cv2 8 | import numpy as np 9 | import skimage 10 | 11 | deploy = '/root/workspace/caffe/models/vgg_16/VGG_ILSVRC_16_layers_deploy.prototxt' 12 | model = '/root/workspace/caffe/models/vgg/VGG_ILSVRC_16_layers.caffemodel' 13 | mean = '/root/workspace/caffe/python/caffe/imagenet/ilsvrc_2012_mean.npy' 14 | 15 | class CNN(object): 16 | 17 | def __init__(self, deploy=deploy, model=model, mean=mean, batch_size=10, width=227, height=227): 18 | 19 | self.deploy = deploy 20 | self.model = model 21 | self.mean = mean 22 | 23 | self.batch_size = batch_size 24 | self.net, self.transformer = self.get_net() 25 | self.net.blobs['data'].reshape(self.batch_size, 3, height, width) 26 | 27 | self.width = width 28 | self.height = height 29 | 30 | def get_net(self): 31 | caffe.set_mode_gpu() 32 | net = caffe.Net(self.deploy, self.model, caffe.TEST) 33 | 34 | transformer = caffe.io.Transformer({'data':net.blobs['data'].data.shape}) 35 | transformer.set_transpose('data', (2,0,1)) 36 | transformer.set_mean('data', np.load(self.mean).mean(1).mean(1)) 37 | transformer.set_raw_scale('data', 255) 38 | transformer.set_channel_swap('data', (2,1,0)) 39 | 40 | return net, transformer 41 | 42 | def get_features(self, image_list, layers='fc7', layer_sizes=[4096]): 43 | iter_until = len(image_list) + self.batch_size 44 | # we fill the zeros 45 | #num_frames = 80 46 | #all_feats = np.zeros([num_frames] + layer_sizes) 47 | all_feats = np.zeros([len(image_list)] + layer_sizes) 48 | 49 | for start, end in zip(range(0, iter_until, self.batch_size), \ 50 | range(self.batch_size, iter_until, self.batch_size)): 51 | 52 | image_batch = image_list[start:end] 53 | 54 | caffe_in = np.zeros(np.array(image_batch.shape)[[0,3,1,2]], dtype=np.float32) 55 | 56 | for idx, in_ in enumerate(image_batch): 57 | caffe_in[idx] = self.transformer.preprocess('data', in_) 58 | 59 | out = self.net.forward_all(blobs=[layers], **{'data':caffe_in}) 60 | feats = out[layers] 61 | 62 | all_feats[start:end] = feats 63 | 64 | return all_feats 65 | 66 | -------------------------------------------------------------------------------- /s2vt/coco_eval/create_reference.py: -------------------------------------------------------------------------------- 1 | # encoding: UTF-8 2 | 3 | import os 4 | import glob 5 | import numpy as np 6 | import pandas as pd 7 | import cPickle as pickle 8 | 9 | 10 | video_src_dir = "../data/video_corpus.csv" 11 | 12 | video_data = pd.read_csv(video_src_dir, sep=',') 13 | video_data = video_data[video_data['Language'] == 'English'] 14 | 15 | videoID_lists = list(video_data['VideoID']) 16 | videoID_start_lists = list(video_data['Start']) 17 | videoID_end_lists = list(video_data['End']) 18 | 
video_descriptions_lists = list(video_data['Description']) 19 | 20 | videoID_with_Frames = [] 21 | for idx, item in enumerate(videoID_lists): 22 | temp = videoID_lists[idx] + '_' + str(int(videoID_start_lists[idx])) + '_' + str(int(videoID_end_lists[idx])) 23 | videoID_with_Frames.append(temp) 24 | 25 | # videoID map 26 | videoID_shrinked = list(set(videoID_with_Frames)) 27 | 28 | map_videoID = {} 29 | for idx, item in enumerate(videoID_shrinked): 30 | map_videoID[idx] = item 31 | 32 | with open('map_videoID.pkl', 'w') as f: 33 | pickle.dump(map_videoID, f) 34 | 35 | ########################################################################################################### 36 | # judge the ascii 37 | ########################################################################################################### 38 | def is_ascii(s): 39 | return all(ord(c) < 128 for c in s) 40 | 41 | 42 | json_fd = open('reference.json', 'w') 43 | json_fd.write('{"info": {"description": "test", "url": "https://github.com/chenxinpeng", "version": "1.0", "year": 2017, "contributor": "ChenXinpeng", "date_created": "2017-01-27"}, "images": [') 44 | 45 | 46 | tmp_idx = 1 47 | for key in map_videoID: 48 | if tmp_idx != len(map_videoID): 49 | json_fd.write('{"license": 1, "file_name": "' + str(map_videoID[key]) + '", "id": ' + str(key) + '}, ') 50 | if tmp_idx == len(map_videoID): 51 | json_fd.write('{"license": 1, "file_name": "' + str(map_videoID[key]) + '", "id": ' + str(key) + '}], ') 52 | tmp_idx += 1 53 | 54 | json_fd.write('"licenses": [{"url": "http://creativecommons.org/licenses/by-nc-sa/2.0/", "id": 1, "name": "Attribution-NonCommercial-ShareAlike License"}], ') 55 | json_fd.write('"type": "captions", "annotations": [') 56 | 57 | id_count = 0 58 | for count, key in enumerate(map_videoID): 59 | video_frame = map_videoID[key] 60 | indices = [i for i, x in enumerate(videoID_with_Frames) if x == video_frame] 61 | if count != len(map_videoID)-1: 62 | for idx in indices: 63 | if type(video_descriptions_lists[idx]) == type(1.0): 64 | continue 65 | 66 | if '\\' in video_descriptions_lists[idx]: 67 | print video_descriptions_lists[idx] 68 | continue 69 | 70 | if '"' in video_descriptions_lists[idx]: 71 | print video_descriptions_lists[int(idx)] 72 | continue 73 | 74 | if "\n" in video_descriptions_lists[idx]: 75 | print video_descriptions_lists[int(idx)] 76 | continue 77 | 78 | if is_ascii(video_descriptions_lists[idx]): 79 | json_fd.write('{"image_id": ' + str(key) + ', "id": ' + str(id_count) + ', "caption": "' + str(video_descriptions_lists[idx]) + '"}, ') 80 | id_count = id_count + 1 81 | 82 | if count == len(map_videoID)-1: 83 | for ii, idx in enumerate(indices): 84 | if type(video_descriptions_lists[idx]) == type(1.0): 85 | continue 86 | 87 | if '\\' in video_descriptions_lists[idx]: 88 | print video_descriptions_lists[idx] 89 | continue 90 | 91 | if '"' in video_descriptions_lists[idx]: 92 | print video_descriptions_lists[int(idx)] 93 | continue 94 | 95 | if "\n" in video_descriptions_lists[idx]: 96 | print video_descriptions_lists[int(idx)] 97 | continue 98 | 99 | if ii != len(indices)-1: 100 | json_fd.write('{"image_id": ' + str(key) + ', "id": ' + str(id_count) + ', "caption": "' + str(video_descriptions_lists[idx]) + '"}, ') 101 | id_count = id_count + 1 102 | if ii == len(indices)-1: 103 | json_fd.write('{"image_id": ' + str(key) + ', "id": ' + str(id_count) + ', "caption": "' + str(video_descriptions_lists[idx]) + '"}]}') 104 | 105 | json_fd.close() 106 | 
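The script above assembles reference.json through raw string concatenation, which breaks on stray quotes, backslashes, or newlines in a caption. A sketch of an equivalent reference file built with the json module (a hypothetical helper reusing the script's map_videoID, videoID_with_Frames, and video_descriptions_lists) is:

```python
# Sketch: build the same COCO-style reference file with json.dump instead of
# manual string concatenation (names mirror create_reference.py above).
import json

def write_reference(map_videoID, videoID_with_Frames, video_descriptions_lists,
                    out_path='reference.json'):
    images, annotations, ann_id = [], [], 0
    for key, video_frame in map_videoID.items():
        images.append({"license": 1, "file_name": video_frame, "id": key})
        for idx, name in enumerate(videoID_with_Frames):
            caption = video_descriptions_lists[idx]
            # keep only ASCII string captions belonging to this video clip
            if name != video_frame or not isinstance(caption, str):
                continue
            if not all(ord(c) < 128 for c in caption):
                continue
            annotations.append({"image_id": key, "id": ann_id, "caption": caption})
            ann_id += 1
    ref = {"info": {"description": "test"},
           "licenses": [{"url": "http://creativecommons.org/licenses/by-nc-sa/2.0/",
                         "id": 1, "name": "Attribution-NonCommercial-ShareAlike License"}],
           "type": "captions", "images": images, "annotations": annotations}
    with open(out_path, 'w') as f:
        json.dump(ref, f)
```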
-------------------------------------------------------------------------------- /s2vt/coco_eval/create_result_json.py: -------------------------------------------------------------------------------- 1 | # encoding: UTF-8 2 | 3 | import os 4 | import glob 5 | import numpy as np 6 | 7 | import cPickle 8 | 9 | # change the output file 10 | output_txt_dir = "../test_result.txt" 11 | output = open(output_txt_dir).read().splitlines() 12 | 13 | num_all_output = len(output) 14 | 15 | avi_names_lists = [] 16 | machine_produced_sentences = [] 17 | 18 | for idx, item in enumerate(output): 19 | if (idx % 3) == 0: 20 | avi_names_lists.append(item) 21 | if (idx % 3) == 1: 22 | machine_produced_sentences.append(item) 23 | 24 | avi_npy_basenames = map(lambda item: os.path.basename(item), avi_names_lists) 25 | 26 | avi_names = [] 27 | for each_avi in avi_npy_basenames: 28 | tmp1, tmp2, tmp3 = each_avi.split('.') 29 | avi_names.append(tmp1) 30 | 31 | fd = open('map_videoID.pkl', 'rb') 32 | map_videoID = cPickle.load(fd) 33 | 34 | map_videoID_reverse = {} 35 | for key in map_videoID: 36 | val = map_videoID[key] 37 | map_videoID_reverse[val] = key 38 | 39 | json_fd = open('generation.json', 'w') 40 | json_fd.write('[') 41 | for idx, item in enumerate(avi_names): 42 | if idx != len(avi_names)-1: 43 | json_fd.write('{"image_id": ' + str(map_videoID_reverse[item]) + ', "caption": "' + str(machine_produced_sentences[idx]) + '"}, ') 44 | if idx == len(avi_names)-1: 45 | json_fd.write('{"image_id": ' + str(map_videoID_reverse[item]) + ', "caption": "' + str(machine_produced_sentences[idx]) + '"}]') 46 | 47 | json_fd.close() 48 | -------------------------------------------------------------------------------- /s2vt/coco_eval/eval.py: -------------------------------------------------------------------------------- 1 | #! 
encoding: UTF-8 2 | 3 | import os 4 | from pycocotools.coco import COCO 5 | from pycocoevalcap.eval import COCOEvalCap 6 | 7 | annFile = 'reference.json' 8 | resFile = 'generation.json' 9 | 10 | # create coco object and cocoRes object 11 | coco = COCO(annFile) 12 | cocoRes = coco.loadRes(resFile) 13 | 14 | # create cocoEval object by taking coco and cocoRes 15 | cocoEval = COCOEvalCap(coco, cocoRes) 16 | 17 | # evaluate on a subset of images by setting 18 | # cocoEval.params['image_id'] = cocoRes.getImgIds() 19 | # please remove this line when evaluating the full validation set 20 | cocoEval.params['image_id'] = cocoRes.getImgIds() 21 | 22 | # evaluate results 23 | cocoEval.evaluate() 24 | 25 | # print output evaluation scores 26 | for metric, score in cocoEval.eval.items(): 27 | print '%s: %.3f'%(metric, score) 28 | 29 | 30 | -------------------------------------------------------------------------------- /s2vt/coco_eval/parse_video_csv.py: -------------------------------------------------------------------------------- 1 | # encoding: UTF-8 2 | 3 | import os 4 | import glob 5 | 6 | import numpy as np 7 | import pandas as pd 8 | 9 | video_src_dir = "../data/video_corpus.csv" 10 | 11 | video_data = pd.read_csv(video_src_dir, sep=',') 12 | video_data = video_data[video_data['Language'] == 'English'] 13 | 14 | videoID_lists = list(video_data['VideoID']) 15 | videoID_start_lists = list(video_data['Start']) 16 | videoID_end_lists = list(video_data['End']) 17 | video_descriptions_lists = list(video_data['Description']) 18 | 19 | videoID_with_Frames = [] 20 | for idx, item in enumerate(videoID_lists): 21 | temp = videoID_lists[idx] + '_' + str(videoID_start_lists[idx]) + '_' + str(videoID_end_lists[idx]) 22 | videoID_with_Frames.append(temp) 23 | 24 | #print video_descriptions_lists 25 | -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smallflyingpig/pytorch_video_caption/c725e0a1662c6c606f3c25b0e46b7cf576fb86e6/s2vt/coco_eval/pycocoevalcap/__init__.pyc -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/bleu/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015 Xinlei Chen, Hao Fang, Tsung-Yi Lin, and Ramakrishna Vedantam 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/bleu/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/bleu/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smallflyingpig/pytorch_video_caption/c725e0a1662c6c606f3c25b0e46b7cf576fb86e6/s2vt/coco_eval/pycocoevalcap/bleu/__init__.pyc -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/bleu/bleu.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # File Name : bleu.py 4 | # 5 | # Description : Wrapper for BLEU scorer. 6 | # 7 | # Creation Date : 06-01-2015 8 | # Last Modified : Thu 19 Mar 2015 09:13:28 PM PDT 9 | # Authors : Hao Fang and Tsung-Yi Lin 10 | 11 | from bleu_scorer import BleuScorer 12 | 13 | 14 | class Bleu: 15 | def __init__(self, n=4): 16 | # default compute Blue score up to 4 17 | self._n = n 18 | self._hypo_for_image = {} 19 | self.ref_for_image = {} 20 | 21 | def compute_score(self, gts, res): 22 | 23 | assert(gts.keys() == res.keys()) 24 | imgIds = gts.keys() 25 | 26 | bleu_scorer = BleuScorer(n=self._n) 27 | for id in imgIds: 28 | hypo = res[id] 29 | ref = gts[id] 30 | 31 | # Sanity check. 32 | assert(type(hypo) is list) 33 | assert(len(hypo) == 1) 34 | assert(type(ref) is list) 35 | assert(len(ref) >= 1) 36 | 37 | bleu_scorer += (hypo[0], ref) 38 | 39 | #score, scores = bleu_scorer.compute_score(option='shortest') 40 | score, scores = bleu_scorer.compute_score(option='closest', verbose=1) 41 | #score, scores = bleu_scorer.compute_score(option='average', verbose=1) 42 | 43 | # return (bleu, bleu_info) 44 | return score, scores 45 | 46 | def method(self): 47 | return "Bleu" 48 | -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/bleu/bleu.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smallflyingpig/pytorch_video_caption/c725e0a1662c6c606f3c25b0e46b7cf576fb86e6/s2vt/coco_eval/pycocoevalcap/bleu/bleu.pyc -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/bleu/bleu_scorer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # bleu_scorer.py 4 | # David Chiang 5 | 6 | # Copyright (c) 2004-2006 University of Maryland. All rights 7 | # reserved. Do not redistribute without permission from the 8 | # author. Not for commercial use. 9 | 10 | # Modified by: 11 | # Hao Fang 12 | # Tsung-Yi Lin 13 | 14 | '''Provides: 15 | cook_refs(refs, n=4): Transform a list of reference sentences as strings into a form usable by cook_test(). 16 | cook_test(test, refs, n=4): Transform a test sentence as a string (together with the cooked reference sentences) into a form usable by score_cooked(). 
17 | ''' 18 | 19 | import copy 20 | import sys, math, re 21 | from collections import defaultdict 22 | 23 | def precook(s, n=4, out=False): 24 | """Takes a string as input and returns an object that can be given to 25 | either cook_refs or cook_test. This is optional: cook_refs and cook_test 26 | can take string arguments as well.""" 27 | words = s.split() 28 | counts = defaultdict(int) 29 | for k in xrange(1,n+1): 30 | for i in xrange(len(words)-k+1): 31 | ngram = tuple(words[i:i+k]) 32 | counts[ngram] += 1 33 | return (len(words), counts) 34 | 35 | def cook_refs(refs, eff=None, n=4): ## lhuang: oracle will call with "average" 36 | '''Takes a list of reference sentences for a single segment 37 | and returns an object that encapsulates everything that BLEU 38 | needs to know about them.''' 39 | 40 | reflen = [] 41 | maxcounts = {} 42 | for ref in refs: 43 | rl, counts = precook(ref, n) 44 | reflen.append(rl) 45 | for (ngram,count) in counts.iteritems(): 46 | maxcounts[ngram] = max(maxcounts.get(ngram,0), count) 47 | 48 | # Calculate effective reference sentence length. 49 | if eff == "shortest": 50 | reflen = min(reflen) 51 | elif eff == "average": 52 | reflen = float(sum(reflen))/len(reflen) 53 | 54 | ## lhuang: N.B.: leave reflen computaiton to the very end!! 55 | 56 | ## lhuang: N.B.: in case of "closest", keep a list of reflens!! (bad design) 57 | 58 | return (reflen, maxcounts) 59 | 60 | def cook_test(test, (reflen, refmaxcounts), eff=None, n=4): 61 | '''Takes a test sentence and returns an object that 62 | encapsulates everything that BLEU needs to know about it.''' 63 | 64 | testlen, counts = precook(test, n, True) 65 | 66 | result = {} 67 | 68 | # Calculate effective reference sentence length. 69 | 70 | if eff == "closest": 71 | result["reflen"] = min((abs(l-testlen), l) for l in reflen)[1] 72 | else: ## i.e., "average" or "shortest" or None 73 | result["reflen"] = reflen 74 | 75 | result["testlen"] = testlen 76 | 77 | result["guess"] = [max(0,testlen-k+1) for k in xrange(1,n+1)] 78 | 79 | result['correct'] = [0]*n 80 | for (ngram, count) in counts.iteritems(): 81 | result["correct"][len(ngram)-1] += min(refmaxcounts.get(ngram,0), count) 82 | 83 | return result 84 | 85 | class BleuScorer(object): 86 | """Bleu scorer. 87 | """ 88 | 89 | __slots__ = "n", "crefs", "ctest", "_score", "_ratio", "_testlen", "_reflen", "special_reflen" 90 | # special_reflen is used in oracle (proportional effective ref len for a node). 
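    # compute_score() below implements smoothed corpus-level BLEU: for each order
    # k = 1..n the modified precision is
    #     p_k = (clipped n-gram matches + tiny) / (n-gram guesses + small),
    # BLEU_k is the running geometric mean (p_1 * ... * p_k) ** (1./k), and the
    # score is multiplied by the brevity penalty exp(1 - reflen/testlen) whenever
    # the hypothesis is shorter than the effective reference length.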
91 | 92 | def copy(self): 93 | ''' copy the refs.''' 94 | new = BleuScorer(n=self.n) 95 | new.ctest = copy.copy(self.ctest) 96 | new.crefs = copy.copy(self.crefs) 97 | new._score = None 98 | return new 99 | 100 | def __init__(self, test=None, refs=None, n=4, special_reflen=None): 101 | ''' singular instance ''' 102 | 103 | self.n = n 104 | self.crefs = [] 105 | self.ctest = [] 106 | self.cook_append(test, refs) 107 | self.special_reflen = special_reflen 108 | 109 | def cook_append(self, test, refs): 110 | '''called by constructor and __iadd__ to avoid creating new instances.''' 111 | 112 | if refs is not None: 113 | self.crefs.append(cook_refs(refs)) 114 | if test is not None: 115 | cooked_test = cook_test(test, self.crefs[-1]) 116 | self.ctest.append(cooked_test) ## N.B.: -1 117 | else: 118 | self.ctest.append(None) # lens of crefs and ctest have to match 119 | 120 | self._score = None ## need to recompute 121 | 122 | def ratio(self, option=None): 123 | self.compute_score(option=option) 124 | return self._ratio 125 | 126 | def score_ratio(self, option=None): 127 | '''return (bleu, len_ratio) pair''' 128 | return (self.fscore(option=option), self.ratio(option=option)) 129 | 130 | def score_ratio_str(self, option=None): 131 | return "%.4f (%.2f)" % self.score_ratio(option) 132 | 133 | def reflen(self, option=None): 134 | self.compute_score(option=option) 135 | return self._reflen 136 | 137 | def testlen(self, option=None): 138 | self.compute_score(option=option) 139 | return self._testlen 140 | 141 | def retest(self, new_test): 142 | if type(new_test) is str: 143 | new_test = [new_test] 144 | assert len(new_test) == len(self.crefs), new_test 145 | self.ctest = [] 146 | for t, rs in zip(new_test, self.crefs): 147 | self.ctest.append(cook_test(t, rs)) 148 | self._score = None 149 | 150 | return self 151 | 152 | def rescore(self, new_test): 153 | ''' replace test(s) with new test(s), and returns the new score.''' 154 | 155 | return self.retest(new_test).compute_score() 156 | 157 | def size(self): 158 | assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest)) 159 | return len(self.crefs) 160 | 161 | def __iadd__(self, other): 162 | '''add an instance (e.g., from another sentence).''' 163 | 164 | if type(other) is tuple: 165 | ## avoid creating new BleuScorer instances 166 | self.cook_append(other[0], other[1]) 167 | else: 168 | assert self.compatible(other), "incompatible BLEUs." 
169 | self.ctest.extend(other.ctest) 170 | self.crefs.extend(other.crefs) 171 | self._score = None ## need to recompute 172 | 173 | return self 174 | 175 | def compatible(self, other): 176 | return isinstance(other, BleuScorer) and self.n == other.n 177 | 178 | def single_reflen(self, option="average"): 179 | return self._single_reflen(self.crefs[0][0], option) 180 | 181 | def _single_reflen(self, reflens, option=None, testlen=None): 182 | 183 | if option == "shortest": 184 | reflen = min(reflens) 185 | elif option == "average": 186 | reflen = float(sum(reflens))/len(reflens) 187 | elif option == "closest": 188 | reflen = min((abs(l-testlen), l) for l in reflens)[1] 189 | else: 190 | assert False, "unsupported reflen option %s" % option 191 | 192 | return reflen 193 | 194 | def recompute_score(self, option=None, verbose=0): 195 | self._score = None 196 | return self.compute_score(option, verbose) 197 | 198 | def compute_score(self, option=None, verbose=0): 199 | n = self.n 200 | small = 1e-9 201 | tiny = 1e-15 ## so that if guess is 0 still return 0 202 | bleu_list = [[] for _ in range(n)] 203 | 204 | if self._score is not None: 205 | return self._score 206 | 207 | if option is None: 208 | option = "average" if len(self.crefs) == 1 else "closest" 209 | 210 | self._testlen = 0 211 | self._reflen = 0 212 | totalcomps = {'testlen':0, 'reflen':0, 'guess':[0]*n, 'correct':[0]*n} 213 | 214 | # for each sentence 215 | for comps in self.ctest: 216 | testlen = comps['testlen'] 217 | self._testlen += testlen 218 | 219 | if self.special_reflen is None: ## need computation 220 | reflen = self._single_reflen(comps['reflen'], option, testlen) 221 | else: 222 | reflen = self.special_reflen 223 | 224 | self._reflen += reflen 225 | 226 | for key in ['guess','correct']: 227 | for k in xrange(n): 228 | totalcomps[key][k] += comps[key][k] 229 | 230 | # append per image bleu score 231 | bleu = 1. 232 | for k in xrange(n): 233 | bleu *= (float(comps['correct'][k]) + tiny) \ 234 | /(float(comps['guess'][k]) + small) 235 | bleu_list[k].append(bleu ** (1./(k+1))) 236 | ratio = (testlen + tiny) / (reflen + small) ## N.B.: avoid zero division 237 | if ratio < 1: 238 | for k in xrange(n): 239 | bleu_list[k][-1] *= math.exp(1 - 1/ratio) 240 | 241 | if verbose > 1: 242 | print comps, reflen 243 | 244 | totalcomps['reflen'] = self._reflen 245 | totalcomps['testlen'] = self._testlen 246 | 247 | bleus = [] 248 | bleu = 1. 
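        # corpus-level score: the same per-order formula as above, computed from
        # the match/guess totals accumulated over all sentences in totalcomps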
249 | for k in xrange(n): 250 | bleu *= float(totalcomps['correct'][k] + tiny) \ 251 | / (totalcomps['guess'][k] + small) 252 | bleus.append(bleu ** (1./(k+1))) 253 | ratio = (self._testlen + tiny) / (self._reflen + small) ## N.B.: avoid zero division 254 | if ratio < 1: 255 | for k in xrange(n): 256 | bleus[k] *= math.exp(1 - 1/ratio) 257 | 258 | if verbose > 0: 259 | print totalcomps 260 | print "ratio:", ratio 261 | 262 | self._score = bleus 263 | return self._score, bleu_list 264 | -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/bleu/bleu_scorer.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smallflyingpig/pytorch_video_caption/c725e0a1662c6c606f3c25b0e46b7cf576fb86e6/s2vt/coco_eval/pycocoevalcap/bleu/bleu_scorer.pyc -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/cider/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/cider/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smallflyingpig/pytorch_video_caption/c725e0a1662c6c606f3c25b0e46b7cf576fb86e6/s2vt/coco_eval/pycocoevalcap/cider/__init__.pyc -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/cider/cider.py: -------------------------------------------------------------------------------- 1 | # Filename: cider.py 2 | # 3 | # Description: Describes the class to compute the CIDEr (Consensus-Based Image Description Evaluation) Metric 4 | # by Vedantam, Zitnick, and Parikh (http://arxiv.org/abs/1411.5726) 5 | # 6 | # Creation Date: Sun Feb 8 14:16:54 2015 7 | # 8 | # Authors: Ramakrishna Vedantam and Tsung-Yi Lin 9 | 10 | from cider_scorer import CiderScorer 11 | import pdb 12 | 13 | class Cider: 14 | """ 15 | Main Class to compute the CIDEr metric 16 | 17 | """ 18 | def __init__(self, test=None, refs=None, n=4, sigma=6.0): 19 | # set cider to sum over 1 to 4-grams 20 | self._n = n 21 | # set the standard deviation parameter for gaussian penalty 22 | self._sigma = sigma 23 | 24 | def compute_score(self, gts, res): 25 | """ 26 | Main function to compute CIDEr score 27 | :param hypo_for_image (dict) : dictionary with key and value 28 | ref_for_image (dict) : dictionary with key and value 29 | :return: cider (float) : computed CIDEr score for the corpus 30 | """ 31 | 32 | assert(gts.keys() == res.keys()) 33 | imgIds = gts.keys() 34 | 35 | cider_scorer = CiderScorer(n=self._n, sigma=self._sigma) 36 | 37 | for id in imgIds: 38 | hypo = res[id] 39 | ref = gts[id] 40 | 41 | # Sanity check. 
42 | assert(type(hypo) is list) 43 | assert(len(hypo) == 1) 44 | assert(type(ref) is list) 45 | assert(len(ref) > 0) 46 | 47 | cider_scorer += (hypo[0], ref) 48 | 49 | (score, scores) = cider_scorer.compute_score() 50 | 51 | return score, scores 52 | 53 | def method(self): 54 | return "CIDEr" -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/cider/cider.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smallflyingpig/pytorch_video_caption/c725e0a1662c6c606f3c25b0e46b7cf576fb86e6/s2vt/coco_eval/pycocoevalcap/cider/cider.pyc -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/cider/cider_scorer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Tsung-Yi Lin 3 | # Ramakrishna Vedantam 4 | 5 | import copy 6 | from collections import defaultdict 7 | import numpy as np 8 | import pdb 9 | import math 10 | 11 | def precook(s, n=4, out=False): 12 | """ 13 | Takes a string as input and returns an object that can be given to 14 | either cook_refs or cook_test. This is optional: cook_refs and cook_test 15 | can take string arguments as well. 16 | :param s: string : sentence to be converted into ngrams 17 | :param n: int : number of ngrams for which representation is calculated 18 | :return: term frequency vector for occuring ngrams 19 | """ 20 | words = s.split() 21 | counts = defaultdict(int) 22 | for k in xrange(1,n+1): 23 | for i in xrange(len(words)-k+1): 24 | ngram = tuple(words[i:i+k]) 25 | counts[ngram] += 1 26 | return counts 27 | 28 | def cook_refs(refs, n=4): ## lhuang: oracle will call with "average" 29 | '''Takes a list of reference sentences for a single segment 30 | and returns an object that encapsulates everything that BLEU 31 | needs to know about them. 32 | :param refs: list of string : reference sentences for some image 33 | :param n: int : number of ngrams for which (ngram) representation is calculated 34 | :return: result (list of dict) 35 | ''' 36 | return [precook(ref, n) for ref in refs] 37 | 38 | def cook_test(test, n=4): 39 | '''Takes a test sentence and returns an object that 40 | encapsulates everything that BLEU needs to know about it. 41 | :param test: list of string : hypothesis sentence for some image 42 | :param n: int : number of ngrams for which (ngram) representation is calculated 43 | :return: result (dict) 44 | ''' 45 | return precook(test, n, True) 46 | 47 | class CiderScorer(object): 48 | """CIDEr scorer. 
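    Each candidate is scored against its references by the cosine similarity of
    TF-IDF weighted n-gram vectors (n = 1..4 by default); the per-n scores are
    averaged, divided by the number of references, and scaled by 10
    (see compute_cider below).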
49 | """ 50 | 51 | def copy(self): 52 | ''' copy the refs.''' 53 | new = CiderScorer(n=self.n) 54 | new.ctest = copy.copy(self.ctest) 55 | new.crefs = copy.copy(self.crefs) 56 | return new 57 | 58 | def __init__(self, test=None, refs=None, n=4, sigma=6.0): 59 | ''' singular instance ''' 60 | self.n = n 61 | self.sigma = sigma 62 | self.crefs = [] 63 | self.ctest = [] 64 | self.document_frequency = defaultdict(float) 65 | self.cook_append(test, refs) 66 | self.ref_len = None 67 | 68 | def cook_append(self, test, refs): 69 | '''called by constructor and __iadd__ to avoid creating new instances.''' 70 | 71 | if refs is not None: 72 | self.crefs.append(cook_refs(refs)) 73 | if test is not None: 74 | self.ctest.append(cook_test(test)) ## N.B.: -1 75 | else: 76 | self.ctest.append(None) # lens of crefs and ctest have to match 77 | 78 | def size(self): 79 | assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest)) 80 | return len(self.crefs) 81 | 82 | def __iadd__(self, other): 83 | '''add an instance (e.g., from another sentence).''' 84 | 85 | if type(other) is tuple: 86 | ## avoid creating new CiderScorer instances 87 | self.cook_append(other[0], other[1]) 88 | else: 89 | self.ctest.extend(other.ctest) 90 | self.crefs.extend(other.crefs) 91 | 92 | return self 93 | def compute_doc_freq(self): 94 | ''' 95 | Compute term frequency for reference data. 96 | This will be used to compute idf (inverse document frequency later) 97 | The term frequency is stored in the object 98 | :return: None 99 | ''' 100 | for refs in self.crefs: 101 | # refs, k ref captions of one image 102 | for ngram in set([ngram for ref in refs for (ngram,count) in ref.iteritems()]): 103 | self.document_frequency[ngram] += 1 104 | # maxcounts[ngram] = max(maxcounts.get(ngram,0), count) 105 | 106 | def compute_cider(self): 107 | def counts2vec(cnts): 108 | """ 109 | Function maps counts of ngram to vector of tfidf weights. 110 | The function returns vec, an array of dictionary that store mapping of n-gram and tf-idf weights. 111 | The n-th entry of array denotes length of n-grams. 112 | :param cnts: 113 | :return: vec (array of dict), norm (array of float), length (int) 114 | """ 115 | vec = [defaultdict(float) for _ in range(self.n)] 116 | length = 0 117 | norm = [0.0 for _ in range(self.n)] 118 | for (ngram,term_freq) in cnts.iteritems(): 119 | # give word count 1 if it doesn't appear in reference corpus 120 | df = np.log(max(1.0, self.document_frequency[ngram])) 121 | # ngram index 122 | n = len(ngram)-1 123 | # tf (term_freq) * idf (precomputed idf) for n-grams 124 | vec[n][ngram] = float(term_freq)*(self.ref_len - df) 125 | # compute norm for the vector. the norm will be used for computing similarity 126 | norm[n] += pow(vec[n][ngram], 2) 127 | 128 | if n == 1: 129 | length += term_freq 130 | norm = [np.sqrt(n) for n in norm] 131 | return vec, norm, length 132 | 133 | def sim(vec_hyp, vec_ref, norm_hyp, norm_ref, length_hyp, length_ref): 134 | ''' 135 | Compute the cosine similarity of two vectors. 
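            For each n, the score is the clipped dot product
            sum(min(vec_hyp[n][ngram], vec_ref[n][ngram]) * vec_ref[n][ngram]),
            normalized by norm_hyp[n] * norm_ref[n] and damped by the length
            penalty exp(-delta**2 / (2 * sigma**2)), delta = length_hyp - length_ref.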
136 | :param vec_hyp: array of dictionary for vector corresponding to hypothesis 137 | :param vec_ref: array of dictionary for vector corresponding to reference 138 | :param norm_hyp: array of float for vector corresponding to hypothesis 139 | :param norm_ref: array of float for vector corresponding to reference 140 | :param length_hyp: int containing length of hypothesis 141 | :param length_ref: int containing length of reference 142 | :return: array of score for each n-grams cosine similarity 143 | ''' 144 | delta = float(length_hyp - length_ref) 145 | # measure consine similarity 146 | val = np.array([0.0 for _ in range(self.n)]) 147 | for n in range(self.n): 148 | # ngram 149 | for (ngram,count) in vec_hyp[n].iteritems(): 150 | # vrama91 : added clipping 151 | val[n] += min(vec_hyp[n][ngram], vec_ref[n][ngram]) * vec_ref[n][ngram] 152 | 153 | if (norm_hyp[n] != 0) and (norm_ref[n] != 0): 154 | val[n] /= (norm_hyp[n]*norm_ref[n]) 155 | 156 | assert(not math.isnan(val[n])) 157 | # vrama91: added a length based gaussian penalty 158 | val[n] *= np.e**(-(delta**2)/(2*self.sigma**2)) 159 | return val 160 | 161 | # compute log reference length 162 | self.ref_len = np.log(float(len(self.crefs))) 163 | 164 | scores = [] 165 | for test, refs in zip(self.ctest, self.crefs): 166 | # compute vector for test captions 167 | vec, norm, length = counts2vec(test) 168 | # compute vector for ref captions 169 | score = np.array([0.0 for _ in range(self.n)]) 170 | for ref in refs: 171 | vec_ref, norm_ref, length_ref = counts2vec(ref) 172 | score += sim(vec, vec_ref, norm, norm_ref, length, length_ref) 173 | # change by vrama91 - mean of ngram scores, instead of sum 174 | score_avg = np.mean(score) 175 | # divide by number of references 176 | score_avg /= len(refs) 177 | # multiply score by 10 178 | score_avg *= 10.0 179 | # append score of an image to the score list 180 | scores.append(score_avg) 181 | return scores 182 | 183 | def compute_score(self, option=None, verbose=0): 184 | # compute idf 185 | self.compute_doc_freq() 186 | # assert to check document frequency 187 | assert(len(self.ctest) >= max(self.document_frequency.values())) 188 | # compute cider score 189 | score = self.compute_cider() 190 | # debug 191 | # print score 192 | return np.mean(np.array(score)), np.array(score) -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/cider/cider_scorer.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smallflyingpig/pytorch_video_caption/c725e0a1662c6c606f3c25b0e46b7cf576fb86e6/s2vt/coco_eval/pycocoevalcap/cider/cider_scorer.pyc -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/eval.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | from tokenizer.ptbtokenizer import PTBTokenizer 3 | from bleu.bleu import Bleu 4 | from meteor.meteor import Meteor 5 | from rouge.rouge import Rouge 6 | from cider.cider import Cider 7 | 8 | class COCOEvalCap: 9 | def __init__(self, coco, cocoRes): 10 | self.evalImgs = [] 11 | self.eval = {} 12 | self.imgToEval = {} 13 | self.coco = coco 14 | self.cocoRes = cocoRes 15 | self.params = {'image_id': coco.getImgIds()} 16 | 17 | def evaluate(self): 18 | imgIds = self.params['image_id'] 19 | # imgIds = self.coco.getImgIds() 20 | gts = {} 21 | res = {} 22 | for imgId in imgIds: 23 | gts[imgId] = self.coco.imgToAnns[imgId] 24 | 
res[imgId] = self.cocoRes.imgToAnns[imgId] 25 | 26 | # ================================================= 27 | # Set up scorers 28 | # ================================================= 29 | print 'tokenization...' 30 | tokenizer = PTBTokenizer() 31 | gts = tokenizer.tokenize(gts) 32 | res = tokenizer.tokenize(res) 33 | 34 | # ================================================= 35 | # Set up scorers 36 | # ================================================= 37 | print 'setting up scorers...' 38 | scorers = [ 39 | (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]), 40 | (Meteor(),"METEOR"), 41 | (Rouge(), "ROUGE_L"), 42 | (Cider(), "CIDEr") 43 | ] 44 | 45 | # ================================================= 46 | # Compute scores 47 | # ================================================= 48 | for scorer, method in scorers: 49 | print 'computing %s score...'%(scorer.method()) 50 | score, scores = scorer.compute_score(gts, res) 51 | if type(method) == list: 52 | for sc, scs, m in zip(score, scores, method): 53 | self.setEval(sc, m) 54 | self.setImgToEvalImgs(scs, gts.keys(), m) 55 | print "%s: %0.3f"%(m, sc) 56 | else: 57 | self.setEval(score, method) 58 | self.setImgToEvalImgs(scores, gts.keys(), method) 59 | print "%s: %0.3f"%(method, score) 60 | self.setEvalImgs() 61 | 62 | def setEval(self, score, method): 63 | self.eval[method] = score 64 | 65 | def setImgToEvalImgs(self, scores, imgIds, method): 66 | for imgId, score in zip(imgIds, scores): 67 | if not imgId in self.imgToEval: 68 | self.imgToEval[imgId] = {} 69 | self.imgToEval[imgId]["image_id"] = imgId 70 | self.imgToEval[imgId][method] = score 71 | 72 | def setEvalImgs(self): 73 | self.evalImgs = [eval for imgId, eval in self.imgToEval.items()] -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/eval.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smallflyingpig/pytorch_video_caption/c725e0a1662c6c606f3c25b0e46b7cf576fb86e6/s2vt/coco_eval/pycocoevalcap/eval.pyc -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/meteor/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/meteor/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smallflyingpig/pytorch_video_caption/c725e0a1662c6c606f3c25b0e46b7cf576fb86e6/s2vt/coco_eval/pycocoevalcap/meteor/__init__.pyc -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/meteor/data/paraphrase-en.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smallflyingpig/pytorch_video_caption/c725e0a1662c6c606f3c25b0e46b7cf576fb86e6/s2vt/coco_eval/pycocoevalcap/meteor/data/paraphrase-en.gz -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/meteor/meteor-1.5.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smallflyingpig/pytorch_video_caption/c725e0a1662c6c606f3c25b0e46b7cf576fb86e6/s2vt/coco_eval/pycocoevalcap/meteor/meteor-1.5.jar -------------------------------------------------------------------------------- 
/s2vt/coco_eval/pycocoevalcap/meteor/meteor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Python wrapper for METEOR implementation, by Xinlei Chen 4 | # Acknowledge Michael Denkowski for the generous discussion and help 5 | 6 | import os 7 | import sys 8 | import subprocess 9 | import threading 10 | 11 | # Assumes meteor-1.5.jar is in the same directory as meteor.py. Change as needed. 12 | METEOR_JAR = 'meteor-1.5.jar' 13 | # print METEOR_JAR 14 | 15 | class Meteor: 16 | 17 | def __init__(self): 18 | self.meteor_cmd = ['java', '-jar', '-Xmx2G', METEOR_JAR, \ 19 | '-', '-', '-stdio', '-l', 'en', '-norm'] 20 | self.meteor_p = subprocess.Popen(self.meteor_cmd, \ 21 | cwd=os.path.dirname(os.path.abspath(__file__)), \ 22 | stdin=subprocess.PIPE, \ 23 | stdout=subprocess.PIPE, \ 24 | stderr=subprocess.PIPE) 25 | # Used to guarantee thread safety 26 | self.lock = threading.Lock() 27 | 28 | def compute_score(self, gts, res): 29 | assert(gts.keys() == res.keys()) 30 | imgIds = gts.keys() 31 | scores = [] 32 | 33 | eval_line = 'EVAL' 34 | self.lock.acquire() 35 | for i in imgIds: 36 | assert(len(res[i]) == 1) 37 | stat = self._stat(res[i][0], gts[i]) 38 | eval_line += ' ||| {}'.format(stat) 39 | 40 | self.meteor_p.stdin.write('{}\n'.format(eval_line)) 41 | for i in range(0,len(imgIds)): 42 | scores.append(float(self.meteor_p.stdout.readline().strip())) 43 | score = float(self.meteor_p.stdout.readline().strip()) 44 | self.lock.release() 45 | 46 | return score, scores 47 | 48 | def method(self): 49 | return "METEOR" 50 | 51 | def _stat(self, hypothesis_str, reference_list): 52 | # SCORE ||| reference 1 words ||| reference n words ||| hypothesis words 53 | hypothesis_str = hypothesis_str.replace('|||','').replace(' ',' ') 54 | score_line = ' ||| '.join(('SCORE', ' ||| '.join(reference_list), hypothesis_str)) 55 | self.meteor_p.stdin.write('{}\n'.format(score_line)) 56 | return self.meteor_p.stdout.readline().strip() 57 | 58 | def _score(self, hypothesis_str, reference_list): 59 | self.lock.acquire() 60 | # SCORE ||| reference 1 words ||| reference n words ||| hypothesis words 61 | hypothesis_str = hypothesis_str.replace('|||','').replace(' ',' ') 62 | score_line = ' ||| '.join(('SCORE', ' ||| '.join(reference_list), hypothesis_str)) 63 | self.meteor_p.stdin.write('{}\n'.format(score_line)) 64 | stats = self.meteor_p.stdout.readline().strip() 65 | eval_line = 'EVAL ||| {}'.format(stats) 66 | # EVAL ||| stats 67 | self.meteor_p.stdin.write('{}\n'.format(eval_line)) 68 | score = float(self.meteor_p.stdout.readline().strip()) 69 | # bug fix: there are two values returned by the jar file, one average, and one all, so do it twice 70 | # thanks for Andrej for pointing this out 71 | score = float(self.meteor_p.stdout.readline().strip()) 72 | self.lock.release() 73 | return score 74 | 75 | def __del__(self): 76 | self.lock.acquire() 77 | self.meteor_p.stdin.close() 78 | self.meteor_p.kill() 79 | self.meteor_p.wait() 80 | self.lock.release() 81 | -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/meteor/meteor.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smallflyingpig/pytorch_video_caption/c725e0a1662c6c606f3c25b0e46b7cf576fb86e6/s2vt/coco_eval/pycocoevalcap/meteor/meteor.pyc -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/rouge/__init__.py: 
-------------------------------------------------------------------------------- 1 | __author__ = 'vrama91' 2 | -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/rouge/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smallflyingpig/pytorch_video_caption/c725e0a1662c6c606f3c25b0e46b7cf576fb86e6/s2vt/coco_eval/pycocoevalcap/rouge/__init__.pyc -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/rouge/rouge.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # File Name : rouge.py 4 | # 5 | # Description : Computes ROUGE-L metric as described by Lin and Hovey (2004) 6 | # 7 | # Creation Date : 2015-01-07 06:03 8 | # Author : Ramakrishna Vedantam 9 | 10 | import numpy as np 11 | import pdb 12 | 13 | def my_lcs(string, sub): 14 | """ 15 | Calculates longest common subsequence for a pair of tokenized strings 16 | :param string : list of str : tokens from a string split using whitespace 17 | :param sub : list of str : shorter string, also split using whitespace 18 | :returns: length (list of int): length of the longest common subsequence between the two strings 19 | 20 | Note: my_lcs only gives length of the longest common subsequence, not the actual LCS 21 | """ 22 | if(len(string)< len(sub)): 23 | sub, string = string, sub 24 | 25 | lengths = [[0 for i in range(0,len(sub)+1)] for j in range(0,len(string)+1)] 26 | 27 | for j in range(1,len(sub)+1): 28 | for i in range(1,len(string)+1): 29 | if(string[i-1] == sub[j-1]): 30 | lengths[i][j] = lengths[i-1][j-1] + 1 31 | else: 32 | lengths[i][j] = max(lengths[i-1][j] , lengths[i][j-1]) 33 | 34 | return lengths[len(string)][len(sub)] 35 | 36 | class Rouge(): 37 | ''' 38 | Class for computing ROUGE-L score for a set of candidate sentences for the MS COCO test set 39 | 40 | ''' 41 | def __init__(self): 42 | # vrama91: updated the value below based on discussion with Hovey 43 | self.beta = 1.2 44 | 45 | def calc_score(self, candidate, refs): 46 | """ 47 | Compute ROUGE-L score given one candidate and references for an image 48 | :param candidate: str : candidate sentence to be evaluated 49 | :param refs: list of str : COCO reference sentences for the particular image to be evaluated 50 | :returns score: int (ROUGE-L score for the candidate evaluated against references) 51 | """ 52 | assert(len(candidate)==1) 53 | assert(len(refs)>0) 54 | prec = [] 55 | rec = [] 56 | 57 | # split into tokens 58 | token_c = candidate[0].split(" ") 59 | 60 | for reference in refs: 61 | # split into tokens 62 | token_r = reference.split(" ") 63 | # compute the longest common subsequence 64 | lcs = my_lcs(token_r, token_c) 65 | prec.append(lcs/float(len(token_c))) 66 | rec.append(lcs/float(len(token_r))) 67 | 68 | prec_max = max(prec) 69 | rec_max = max(rec) 70 | 71 | if(prec_max!=0 and rec_max !=0): 72 | score = ((1 + self.beta**2)*prec_max*rec_max)/float(rec_max + self.beta**2*prec_max) 73 | else: 74 | score = 0.0 75 | return score 76 | 77 | def compute_score(self, gts, res): 78 | """ 79 | Computes Rouge-L score given a set of reference and candidate sentences for the dataset 80 | Invoked by evaluate_captions.py 81 | :param hypo_for_image: dict : candidate / test sentences with "image name" key and "tokenized sentences" as values 82 | :param ref_for_image: dict : reference MS-COCO sentences with "image name" 
key and "tokenized sentences" as values 83 | :returns: average_score: float (mean ROUGE-L score computed by averaging scores for all the images) 84 | """ 85 | assert(gts.keys() == res.keys()) 86 | imgIds = gts.keys() 87 | 88 | score = [] 89 | for id in imgIds: 90 | hypo = res[id] 91 | ref = gts[id] 92 | 93 | score.append(self.calc_score(hypo, ref)) 94 | 95 | # Sanity check. 96 | assert(type(hypo) is list) 97 | assert(len(hypo) == 1) 98 | assert(type(ref) is list) 99 | assert(len(ref) > 0) 100 | 101 | average_score = np.mean(np.array(score)) 102 | return average_score, np.array(score) 103 | 104 | def method(self): 105 | return "Rouge" 106 | -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/rouge/rouge.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smallflyingpig/pytorch_video_caption/c725e0a1662c6c606f3c25b0e46b7cf576fb86e6/s2vt/coco_eval/pycocoevalcap/rouge/rouge.pyc -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'hfang' 2 | -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/tokenizer/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smallflyingpig/pytorch_video_caption/c725e0a1662c6c606f3c25b0e46b7cf576fb86e6/s2vt/coco_eval/pycocoevalcap/tokenizer/__init__.pyc -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/tokenizer/ptbtokenizer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # File Name : ptbtokenizer.py 4 | # 5 | # Description : Do the PTB Tokenization and remove punctuations. 
6 | # 7 | # Creation Date : 29-12-2014 8 | # Last Modified : Thu Mar 19 09:53:35 2015 9 | # Authors : Hao Fang and Tsung-Yi Lin 10 | 11 | import os 12 | import sys 13 | import subprocess 14 | import tempfile 15 | import itertools 16 | 17 | # path to the stanford corenlp jar 18 | STANFORD_CORENLP_3_4_1_JAR = 'stanford-corenlp-3.4.1.jar' 19 | 20 | # punctuations to be removed from the sentences 21 | PUNCTUATIONS = ["''", "'", "``", "`", "-LRB-", "-RRB-", "-LCB-", "-RCB-", \ 22 | ".", "?", "!", ",", ":", "-", "--", "...", ";"] 23 | 24 | class PTBTokenizer: 25 | """Python wrapper of Stanford PTBTokenizer""" 26 | 27 | def tokenize(self, captions_for_image): 28 | cmd = ['java', '-cp', STANFORD_CORENLP_3_4_1_JAR, \ 29 | 'edu.stanford.nlp.process.PTBTokenizer', \ 30 | '-preserveLines', '-lowerCase'] 31 | 32 | # ====================================================== 33 | # prepare data for PTB Tokenizer 34 | # ====================================================== 35 | final_tokenized_captions_for_image = {} 36 | image_id = [k for k, v in captions_for_image.items() for _ in range(len(v))] 37 | sentences = '\n'.join([c['caption'].replace('\n', ' ') for k, v in captions_for_image.items() for c in v]) 38 | 39 | # ====================================================== 40 | # save sentences to temporary file 41 | # ====================================================== 42 | path_to_jar_dirname=os.path.dirname(os.path.abspath(__file__)) 43 | tmp_file = tempfile.NamedTemporaryFile(delete=False, dir=path_to_jar_dirname) 44 | tmp_file.write(sentences) 45 | tmp_file.close() 46 | 47 | # ====================================================== 48 | # tokenize sentence 49 | # ====================================================== 50 | cmd.append(os.path.basename(tmp_file.name)) 51 | p_tokenizer = subprocess.Popen(cmd, cwd=path_to_jar_dirname, \ 52 | stdout=subprocess.PIPE) 53 | token_lines = p_tokenizer.communicate(input=sentences.rstrip())[0] 54 | lines = token_lines.split('\n') 55 | # remove temp file 56 | os.remove(tmp_file.name) 57 | 58 | # ====================================================== 59 | # create dictionary for tokenized captions 60 | # ====================================================== 61 | for k, line in zip(image_id, lines): 62 | if not k in final_tokenized_captions_for_image: 63 | final_tokenized_captions_for_image[k] = [] 64 | tokenized_caption = ' '.join([w for w in line.rstrip().split(' ') \ 65 | if w not in PUNCTUATIONS]) 66 | final_tokenized_captions_for_image[k].append(tokenized_caption) 67 | 68 | return final_tokenized_captions_for_image 69 | -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/tokenizer/ptbtokenizer.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smallflyingpig/pytorch_video_caption/c725e0a1662c6c606f3c25b0e46b7cf576fb86e6/s2vt/coco_eval/pycocoevalcap/tokenizer/ptbtokenizer.pyc -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/tokenizer/stanford-corenlp-3.4.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smallflyingpig/pytorch_video_caption/c725e0a1662c6c606f3c25b0e46b7cf576fb86e6/s2vt/coco_eval/pycocoevalcap/tokenizer/stanford-corenlp-3.4.1.jar -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocotools/__init__.py: 
-------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocotools/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smallflyingpig/pytorch_video_caption/c725e0a1662c6c606f3c25b0e46b7cf576fb86e6/s2vt/coco_eval/pycocotools/__init__.pyc -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocotools/coco.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | __version__ = '1.0.1' 3 | # Interface for accessing the Microsoft COCO dataset. 4 | 5 | # Microsoft COCO is a large image dataset designed for object detection, 6 | # segmentation, and caption generation. pycocotools is a Python API that 7 | # assists in loading, parsing and visualizing the annotations in COCO. 8 | # Please visit http://mscoco.org/ for more information on COCO, including 9 | # for the data, paper, and tutorials. The exact format of the annotations 10 | # is also described on the COCO website. For example usage of the pycocotools 11 | # please see pycocotools_demo.ipynb. In addition to this API, please download both 12 | # the COCO images and annotations in order to run the demo. 13 | 14 | # An alternative to using the API is to load the annotations directly 15 | # into Python dictionary 16 | # Using the API provides additional utility functions. Note that this API 17 | # supports both *instance* and *caption* annotations. In the case of 18 | # captions not all functions are defined (e.g. categories are undefined). 19 | 20 | # The following API functions are defined: 21 | # COCO - COCO api class that loads COCO annotation file and prepare data structures. 22 | # decodeMask - Decode binary mask M encoded via run-length encoding. 23 | # encodeMask - Encode binary mask M using run-length encoding. 24 | # getAnnIds - Get ann ids that satisfy given filter conditions. 25 | # getCatIds - Get cat ids that satisfy given filter conditions. 26 | # getImgIds - Get img ids that satisfy given filter conditions. 27 | # loadAnns - Load anns with the specified ids. 28 | # loadCats - Load cats with the specified ids. 29 | # loadImgs - Load imgs with the specified ids. 30 | # segToMask - Convert polygon segmentation to binary mask. 31 | # showAnns - Display the specified annotations. 32 | # loadRes - Load result file and create result api object. 33 | # Throughout the API "ann"=annotation, "cat"=category, and "img"=image. 34 | # Help on each functions can be accessed by: "help COCO>function". 35 | 36 | # See also COCO>decodeMask, 37 | # COCO>encodeMask, COCO>getAnnIds, COCO>getCatIds, 38 | # COCO>getImgIds, COCO>loadAnns, COCO>loadCats, 39 | # COCO>loadImgs, COCO>segToMask, COCO>showAnns 40 | 41 | # Microsoft COCO Toolbox. Version 1.0 42 | # Data, paper, and tutorials available at: http://mscoco.org/ 43 | # Code written by Piotr Dollar and Tsung-Yi Lin, 2014. 
44 | # Licensed under the Simplified BSD License [see bsd.txt] 45 | 46 | import json 47 | import datetime 48 | import matplotlib.pyplot as plt 49 | from matplotlib.collections import PatchCollection 50 | from matplotlib.patches import Polygon 51 | import numpy as np 52 | from skimage.draw import polygon 53 | import copy 54 | 55 | class COCO: 56 | def __init__(self, annotation_file=None): 57 | """ 58 | Constructor of Microsoft COCO helper class for reading and visualizing annotations. 59 | :param annotation_file (str): location of annotation file 60 | :param image_folder (str): location to the folder that hosts images. 61 | :return: 62 | """ 63 | # load dataset 64 | self.dataset = {} 65 | self.anns = [] 66 | self.imgToAnns = {} 67 | self.catToImgs = {} 68 | self.imgs = [] 69 | self.cats = [] 70 | if not annotation_file == None: 71 | print 'loading annotations into memory...' 72 | time_t = datetime.datetime.utcnow() 73 | dataset = json.load(open(annotation_file, 'r')) 74 | print datetime.datetime.utcnow() - time_t 75 | self.dataset = dataset 76 | self.createIndex() 77 | 78 | def createIndex(self): 79 | # create index 80 | print 'creating index...' 81 | imgToAnns = {ann['image_id']: [] for ann in self.dataset['annotations']} 82 | anns = {ann['id']: [] for ann in self.dataset['annotations']} 83 | for ann in self.dataset['annotations']: 84 | imgToAnns[ann['image_id']] += [ann] 85 | anns[ann['id']] = ann 86 | 87 | imgs = {im['id']: {} for im in self.dataset['images']} 88 | for img in self.dataset['images']: 89 | imgs[img['id']] = img 90 | 91 | cats = [] 92 | catToImgs = [] 93 | if self.dataset['type'] == 'instances': 94 | cats = {cat['id']: [] for cat in self.dataset['categories']} 95 | for cat in self.dataset['categories']: 96 | cats[cat['id']] = cat 97 | catToImgs = {cat['id']: [] for cat in self.dataset['categories']} 98 | for ann in self.dataset['annotations']: 99 | catToImgs[ann['category_id']] += [ann['image_id']] 100 | 101 | print 'index created!' 102 | 103 | # create class members 104 | self.anns = anns 105 | self.imgToAnns = imgToAnns 106 | self.catToImgs = catToImgs 107 | self.imgs = imgs 108 | self.cats = cats 109 | 110 | def info(self): 111 | """ 112 | Print information about the annotation file. 113 | :return: 114 | """ 115 | for key, value in self.datset['info'].items(): 116 | print '%s: %s'%(key, value) 117 | 118 | def getAnnIds(self, imgIds=[], catIds=[], areaRng=[], iscrowd=None): 119 | """ 120 | Get ann ids that satisfy given filter conditions. default skips that filter 121 | :param imgIds (int array) : get anns for given imgs 122 | catIds (int array) : get anns for given cats 123 | areaRng (float array) : get anns for given area range (e.g. 
[0 inf]) 124 | iscrowd (boolean) : get anns for given crowd label (False or True) 125 | :return: ids (int array) : integer array of ann ids 126 | """ 127 | imgIds = imgIds if type(imgIds) == list else [imgIds] 128 | catIds = catIds if type(catIds) == list else [catIds] 129 | 130 | if len(imgIds) == len(catIds) == len(areaRng) == 0: 131 | anns = self.dataset['annotations'] 132 | else: 133 | if not len(imgIds) == 0: 134 | anns = sum([self.imgToAnns[imgId] for imgId in imgIds if imgId in self.imgToAnns],[]) 135 | else: 136 | anns = self.dataset['annotations'] 137 | anns = anns if len(catIds) == 0 else [ann for ann in anns if ann['category_id'] in catIds] 138 | anns = anns if len(areaRng) == 0 else [ann for ann in anns if ann['area'] > areaRng[0] and ann['area'] < areaRng[1]] 139 | if self.dataset['type'] == 'instances': 140 | if not iscrowd == None: 141 | ids = [ann['id'] for ann in anns if ann['iscrowd'] == iscrowd] 142 | else: 143 | ids = [ann['id'] for ann in anns] 144 | else: 145 | ids = [ann['id'] for ann in anns] 146 | return ids 147 | 148 | def getCatIds(self, catNms=[], supNms=[], catIds=[]): 149 | """ 150 | filtering parameters. default skips that filter. 151 | :param catNms (str array) : get cats for given cat names 152 | :param supNms (str array) : get cats for given supercategory names 153 | :param catIds (int array) : get cats for given cat ids 154 | :return: ids (int array) : integer array of cat ids 155 | """ 156 | catNms = catNms if type(catNms) == list else [catNms] 157 | supNms = supNms if type(supNms) == list else [supNms] 158 | catIds = catIds if type(catIds) == list else [catIds] 159 | 160 | if len(catNms) == len(supNms) == len(catIds) == 0: 161 | cats = self.dataset['categories'] 162 | else: 163 | cats = self.dataset['categories'] 164 | cats = cats if len(catNms) == 0 else [cat for cat in cats if cat['name'] in catNms] 165 | cats = cats if len(supNms) == 0 else [cat for cat in cats if cat['supercategory'] in supNms] 166 | cats = cats if len(catIds) == 0 else [cat for cat in cats if cat['id'] in catIds] 167 | ids = [cat['id'] for cat in cats] 168 | return ids 169 | 170 | def getImgIds(self, imgIds=[], catIds=[]): 171 | ''' 172 | Get img ids that satisfy given filter conditions. 173 | :param imgIds (int array) : get imgs for given ids 174 | :param catIds (int array) : get imgs with all given cats 175 | :return: ids (int array) : integer array of img ids 176 | ''' 177 | imgIds = imgIds if type(imgIds) == list else [imgIds] 178 | catIds = catIds if type(catIds) == list else [catIds] 179 | 180 | if len(imgIds) == len(catIds) == 0: 181 | ids = self.imgs.keys() 182 | else: 183 | ids = set(imgIds) 184 | for catId in catIds: 185 | if len(ids) == 0: 186 | ids = set(self.catToImgs[catId]) 187 | else: 188 | ids &= set(self.catToImgs[catId]) 189 | return list(ids) 190 | 191 | def loadAnns(self, ids=[]): 192 | """ 193 | Load anns with the specified ids. 194 | :param ids (int array) : integer ids specifying anns 195 | :return: anns (object array) : loaded ann objects 196 | """ 197 | if type(ids) == list: 198 | return [self.anns[id] for id in ids] 199 | elif type(ids) == int: 200 | return [self.anns[ids]] 201 | 202 | def loadCats(self, ids=[]): 203 | """ 204 | Load cats with the specified ids. 
205 | :param ids (int array) : integer ids specifying cats 206 | :return: cats (object array) : loaded cat objects 207 | """ 208 | if type(ids) == list: 209 | return [self.cats[id] for id in ids] 210 | elif type(ids) == int: 211 | return [self.cats[ids]] 212 | 213 | def loadImgs(self, ids=[]): 214 | """ 215 | Load anns with the specified ids. 216 | :param ids (int array) : integer ids specifying img 217 | :return: imgs (object array) : loaded img objects 218 | """ 219 | if type(ids) == list: 220 | return [self.imgs[id] for id in ids] 221 | elif type(ids) == int: 222 | return [self.imgs[ids]] 223 | 224 | def showAnns(self, anns): 225 | """ 226 | Display the specified annotations. 227 | :param anns (array of object): annotations to display 228 | :return: None 229 | """ 230 | if len(anns) == 0: 231 | return 0 232 | if self.dataset['type'] == 'instances': 233 | ax = plt.gca() 234 | polygons = [] 235 | color = [] 236 | for ann in anns: 237 | c = np.random.random((1, 3)).tolist()[0] 238 | if type(ann['segmentation']) == list: 239 | # polygon 240 | for seg in ann['segmentation']: 241 | poly = np.array(seg).reshape((len(seg)/2, 2)) 242 | polygons.append(Polygon(poly, True,alpha=0.4)) 243 | color.append(c) 244 | else: 245 | # mask 246 | mask = COCO.decodeMask(ann['segmentation']) 247 | img = np.ones( (mask.shape[0], mask.shape[1], 3) ) 248 | if ann['iscrowd'] == 1: 249 | color_mask = np.array([2.0,166.0,101.0])/255 250 | if ann['iscrowd'] == 0: 251 | color_mask = np.random.random((1, 3)).tolist()[0] 252 | for i in range(3): 253 | img[:,:,i] = color_mask[i] 254 | ax.imshow(np.dstack( (img, mask*0.5) )) 255 | p = PatchCollection(polygons, facecolors=color, edgecolors=(0,0,0,1), linewidths=3, alpha=0.4) 256 | ax.add_collection(p) 257 | if self.dataset['type'] == 'captions': 258 | for ann in anns: 259 | print ann['caption'] 260 | 261 | def loadRes(self, resFile): 262 | """ 263 | Load result file and return a result api object. 264 | :param resFile (str) : file name of result file 265 | :return: res (obj) : result api object 266 | """ 267 | res = COCO() 268 | res.dataset['images'] = [img for img in self.dataset['images']] 269 | res.dataset['info'] = copy.deepcopy(self.dataset['info']) 270 | res.dataset['type'] = copy.deepcopy(self.dataset['type']) 271 | res.dataset['licenses'] = copy.deepcopy(self.dataset['licenses']) 272 | 273 | print 'Loading and preparing results... 
' 274 | time_t = datetime.datetime.utcnow() 275 | anns = json.load(open(resFile)) 276 | assert type(anns) == list, 'results in not an array of objects' 277 | annsImgIds = [ann['image_id'] for ann in anns] 278 | assert set(annsImgIds) == (set(annsImgIds) & set(self.getImgIds())), \ 279 | 'Results do not correspond to current coco set' 280 | if 'caption' in anns[0]: 281 | imgIds = set([img['id'] for img in res.dataset['images']]) & set([ann['image_id'] for ann in anns]) 282 | res.dataset['images'] = [img for img in res.dataset['images'] if img['id'] in imgIds] 283 | for id, ann in enumerate(anns): 284 | ann['id'] = id 285 | elif 'bbox' in anns[0] and not anns[0]['bbox'] == []: 286 | res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) 287 | for id, ann in enumerate(anns): 288 | bb = ann['bbox'] 289 | x1, x2, y1, y2 = [bb[0], bb[0]+bb[2], bb[1], bb[1]+bb[3]] 290 | ann['segmentation'] = [[x1, y1, x1, y2, x2, y2, x2, y1]] 291 | ann['area'] = bb[2]*bb[3] 292 | ann['id'] = id 293 | ann['iscrowd'] = 0 294 | elif 'segmentation' in anns[0]: 295 | res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) 296 | for id, ann in enumerate(anns): 297 | ann['area']=sum(ann['segmentation']['counts'][2:-1:2]) 298 | ann['bbox'] = [] 299 | ann['id'] = id 300 | ann['iscrowd'] = 0 301 | print 'DONE (t=%0.2fs)'%((datetime.datetime.utcnow() - time_t).total_seconds()) 302 | 303 | res.dataset['annotations'] = anns 304 | res.createIndex() 305 | return res 306 | 307 | 308 | @staticmethod 309 | def decodeMask(R): 310 | """ 311 | Decode binary mask M encoded via run-length encoding. 312 | :param R (object RLE) : run-length encoding of binary mask 313 | :return: M (bool 2D array) : decoded binary mask 314 | """ 315 | N = len(R['counts']) 316 | M = np.zeros( (R['size'][0]*R['size'][1], )) 317 | n = 0 318 | val = 1 319 | for pos in range(N): 320 | val = not val 321 | for c in range(R['counts'][pos]): 322 | R['counts'][pos] 323 | M[n] = val 324 | n += 1 325 | return M.reshape((R['size']), order='F') 326 | 327 | @staticmethod 328 | def encodeMask(M): 329 | """ 330 | Encode binary mask M using run-length encoding. 331 | :param M (bool 2D array) : binary mask to encode 332 | :return: R (object RLE) : run-length encoding of binary mask 333 | """ 334 | [h, w] = M.shape 335 | M = M.flatten(order='F') 336 | N = len(M) 337 | counts_list = [] 338 | pos = 0 339 | # counts 340 | counts_list.append(1) 341 | diffs = np.logical_xor(M[0:N-1], M[1:N]) 342 | for diff in diffs: 343 | if diff: 344 | pos +=1 345 | counts_list.append(1) 346 | else: 347 | counts_list[pos] += 1 348 | # if array starts from 1. start with 0 counts for 0 349 | if M[0] == 1: 350 | counts_list = [0] + counts_list 351 | return {'size': [h, w], 352 | 'counts': counts_list , 353 | } 354 | 355 | @staticmethod 356 | def segToMask( S, h, w ): 357 | """ 358 | Convert polygon segmentation to binary mask. 
359 | :param S (float array) : polygon segmentation mask 360 | :param h (int) : target mask height 361 | :param w (int) : target mask width 362 | :return: M (bool 2D array) : binary mask 363 | """ 364 | M = np.zeros((h,w), dtype=np.bool) 365 | for s in S: 366 | N = len(s) 367 | rr, cc = polygon(np.array(s[1:N:2]), np.array(s[0:N:2])) # (y, x) 368 | M[rr, cc] = 1 369 | return M -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocotools/coco.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smallflyingpig/pytorch_video_caption/c725e0a1662c6c606f3c25b0e46b7cf576fb86e6/s2vt/coco_eval/pycocotools/coco.pyc -------------------------------------------------------------------------------- /s2vt/data/readme.md: -------------------------------------------------------------------------------- 1 | put .csv to this folder 2 | -------------------------------------------------------------------------------- /s2vt/extract_RGB_feats.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -*- 2 | 3 | import cv2 4 | import os 5 | #import ipdb 6 | import numpy as np 7 | import pandas as pd 8 | import skimage 9 | from cnn_util import * 10 | 11 | 12 | def preprocess_frame(image, target_height=224, target_width=224): 13 | 14 | if len(image.shape) == 2: 15 | image = np.tile(image[:,:,None], 3) 16 | elif len(image.shape) == 4: 17 | image = image[:,:,:,0] 18 | 19 | image = skimage.img_as_float(image).astype(np.float32) 20 | height, width, rgb = image.shape 21 | if width == height: 22 | resized_image = cv2.resize(image, (target_height,target_width)) 23 | 24 | elif height < width: 25 | resized_image = cv2.resize(image, (int(width * float(target_height)/height), target_width)) 26 | cropping_length = int((resized_image.shape[1] - target_height) / 2) 27 | resized_image = resized_image[:,cropping_length:resized_image.shape[1] - cropping_length] 28 | 29 | else: 30 | resized_image = cv2.resize(image, (target_height, int(height * float(target_width) / width))) 31 | cropping_length = int((resized_image.shape[0] - target_width) / 2) 32 | resized_image = resized_image[cropping_length:resized_image.shape[0] - cropping_length,:] 33 | 34 | return cv2.resize(resized_image, (target_height, target_width)) 35 | 36 | def main(): 37 | num_frames = 80 38 | vgg_model = '/root/workspace/caffe/models/vgg_16/VGG_ILSVRC_16_layers.caffemodel' 39 | vgg_deploy = '/root/workspace/caffe/models/vgg_16/VGG_ILSVRC_16_layers_deploy.prototxt' 40 | video_path = '/root/Downloads/YouTubeClips/' 41 | video_save_path = './rgb_feats' 42 | videos = os.listdir(video_path) 43 | videos = filter(lambda x: x.endswith('avi'), videos) 44 | 45 | cnn = CNN(model=vgg_model, deploy=vgg_deploy, width=224, height=224) 46 | 47 | for idx, video in enumerate(videos): 48 | print idx, video 49 | 50 | if os.path.exists( os.path.join(video_save_path, video) ): 51 | print "Already processed ... 
" 52 | continue 53 | 54 | video_fullpath = os.path.join(video_path, video) 55 | try: 56 | cap = cv2.VideoCapture( video_fullpath ) 57 | except: 58 | pass 59 | 60 | frame_count = 0 61 | frame_list = [] 62 | 63 | while True: 64 | ret, frame = cap.read() 65 | if ret is False: 66 | break 67 | 68 | frame_list.append(frame) 69 | frame_count += 1 70 | 71 | frame_list = np.array(frame_list) 72 | 73 | if frame_count > 80: 74 | frame_indices = np.linspace(0, frame_count, num=num_frames, endpoint=False).astype(int) 75 | frame_list = frame_list[frame_indices] 76 | 77 | cropped_frame_list = np.array(map(lambda x: preprocess_frame(x), frame_list)) 78 | feats = cnn.get_features(cropped_frame_list) 79 | 80 | save_full_path = os.path.join(video_save_path, video + '.npy') 81 | np.save(save_full_path, feats) 82 | 83 | if __name__=="__main__": 84 | main() 85 | 86 | -------------------------------------------------------------------------------- /s2vt/model_temp/readme.md: -------------------------------------------------------------------------------- 1 | the snapshots in training process will be put here 2 | -------------------------------------------------------------------------------- /s2vt/readme.md: -------------------------------------------------------------------------------- 1 | ### environment 2 | python2.7 3 | pytorch 4 | 5 | ### prepare for train 6 | 1. download the video clips from [here](http://www.cs.utexas.edu/users/ml/clamp/videoDescription/) 7 | 8 | 2. download caffe and VGG model 9 | download caffe from [BVLC/caffe](https://github.com/BVLC/caffe) to "/root/workspace/" 10 | download [VGG16](http://www.robots.ox.ac.uk/~vgg/software/very_deep/caffe/VGG_ILSVRC_16_layers.caffemodel) model and its [prototxt](https://gist.githubusercontent.com/ksimonyan/211839e770f7b538e2d8/raw/0067c9b32f60362c74f4c445a080beed06b07eb3/VGG_ILSVRC_16_layers_deploy.prototxt) from [caffe/model zoo](https://github.com/BVLC/caffe/wiki/Model-Zoo#models-used-by-the-vgg-team-in-ilsvrc-2014) and put it into "/root/workspace/caffe/models/vgg_16" 11 | 12 | 3. extract feature from video clip 13 | ``` 14 | python extract_RGB_feats.py 15 | ``` 16 | then put these feature data into training data (put into this folder "./rgb_train_features") and test data (put into this folder "./rgb_test_features"). 17 | 18 | ### how to train/test 19 | ``` 20 | python s2vt.py train 21 | python s2vt.py test 22 | ``` 23 | and you will get a file named "test_result.txt" in the root folder of this project. 24 | 25 | ### how to evaluation 26 | 1. install java 27 | ``` 28 | conda install -c cyclus java-jdk 29 | ``` 30 | 2. download data 31 | MSVD data set:[Microsoft Research Video Description Corpus](https://www.microsoft.com/en-us/download/details.aspx?id=52422&from=http%3A%2F%2Fresearch.microsoft.com%2Fen-us%2Fdownloads%2F38cf15fd-b8df-477e-a4e4-a4680caa75af%2Fdefault.aspx) 32 | put it to ./data folder. 33 | 34 | 3. parse video csv 35 | ``` 36 | python ./coco_eval/parse_video_csv.py 37 | ``` 38 | 4. create reference and result .json file 39 | ``` 40 | python ./coco_eval/create_reference.py 41 | python ./coco_eval/create_result_json.py 42 | ``` 43 | 5. evalute the result 44 | ``` 45 | python ./coco_eval/eval.py 46 | ``` 47 | 48 | and you will get the evalution result. 
49 | 50 | ### my model 51 | a PyTorch model trained by me is available [here]() 52 | ### acknowledgement 53 | some of the code is adapted from chenxinpeng's [S2VT](https://github.com/chenxinpeng/S2VT) 54 | 55 | ### note 56 | the project is not working at the moment (2018-01-19) 57 | 58 | ### related paper 59 | [Sequence to Sequence - Video to Text](https://vsubhashini.github.io/s2vt.html) (ICCV 2015) 60 | 61 | ### contact 62 | for any questions, please contact me: jiguo.li@vipl.ict.ac.cn or jgli@pku.edu.cn 63 | 64 | -------------------------------------------------------------------------------- /s2vt/rgb_test_features/readme.md: -------------------------------------------------------------------------------- 1 | this folder is for test video features 2 | -------------------------------------------------------------------------------- /s2vt/rgb_train_features/readme.md: -------------------------------------------------------------------------------- 1 | this folder is for training video features 2 | -------------------------------------------------------------------------------- /s2vt/s2vt.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import torch 4 | from torch import nn 5 | from torch.autograd import Variable 6 | from torchvision import transforms 7 | import numpy as np 8 | import time 9 | #from model_RGB import get_video_train_data,get_video_test_data,preProBuildWordVocab 10 | import pandas as pd 11 | from keras.preprocessing import sequence 12 | import matplotlib.pyplot as plt 13 | import os 14 | import re 15 | import sys 16 | 17 | class DataSet_MSVD(): 18 | def __init__(self,csv_path,video_step,caption_step,image_dim,batch_size=50,video_train_data_path="./data/video_corpus.csv",\ 19 | video_train_feat_path="./rgb_train_features",video_test_data_path="./data/video_corpus.csv",\ 20 | video_test_feat_path="./rgb_test_features"): 21 | self.csv_path=csv_path; 22 | self.batch_size=batch_size; 23 | self.video_train_data_path = video_train_data_path; 24 | self.video_train_feat_path = video_train_feat_path; 25 | self.video_test_data_path = video_test_data_path; 26 | self.video_test_feat_path = video_test_feat_path; 27 | self.video_step = video_step; 28 | self.caption_step = caption_step; 29 | self.image_dim = image_dim; 30 | 31 | 32 | train_data = self.get_video_train_data(video_train_data_path, video_train_feat_path); 33 | train_captions = train_data['Description'].values; 34 | test_data = self.get_video_test_data(video_test_data_path, video_test_feat_path); 35 | test_captions = test_data['Description'].values; 36 | if not os.path.exists("./data/wordtoix.npy") or not os.path.exists('./data/ixtoword.npy') \ 37 | or not os.path.exists("./data/bias_init_vector.npy"): 38 | captions_list = list(train_captions) + list(test_captions); 39 | captions = np.asarray(captions_list, dtype=np.object); 40 | 41 | #strip punctuation and special characters from the captions (replace them with the empty string) 42 | captions = map(lambda x: x.replace('.', ''), captions); 43 | captions = map(lambda x: x.replace(',', ''), captions) 44 | captions = map(lambda x: x.replace('"', ''), captions) 45 | captions = map(lambda x: x.replace('\n', ''), captions) 46 | captions = map(lambda x: x.replace('?', ''), captions) 47 | captions = map(lambda x: x.replace('!', ''), captions) 48 | captions = map(lambda x: x.replace('\\', ''), captions) 49 | captions = map(lambda x: x.replace('/', ''), captions) 50 | 51 | #word to index, index to word 52 | wordtoix, ixtoword, bias_init_vector = self.preProBuildWordVocab(captions, word_count_threshold=0) 53 | 54 | 
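# Cache the vocabulary mappings and the bias-initialisation vector under ./data so
# that later runs (including the test phase) reuse exactly the same word indices
# instead of rebuilding them from the corpus.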
np.save("./data/wordtoix", wordtoix) 55 | np.save('./data/ixtoword', ixtoword) 56 | np.save("./data/bias_init_vector", bias_init_vector) 57 | else: 58 | wordtoix, ixtoword, bias_init_vector = np.load("./data/wordtoix.npy").tolist(),\ 59 | np.load('./data/ixtoword.npy').tolist(),\ 60 | np.load("./data/bias_init_vector.npy").tolist(); 61 | #reset dataset 62 | current_train_data = self.reset_train_data(train_data); 63 | 64 | self.train_data = train_data; 65 | self.current_train_data = current_train_data; 66 | self.test_data = test_data; 67 | self.word2idx = wordtoix; 68 | self.idx2word = ixtoword; 69 | self.bias_init_vector = bias_init_vector; 70 | 71 | self.minibatch_start=0; 72 | self.train_data_size = len(current_train_data); 73 | self.word_num = len(self.word2idx); 74 | 75 | def next_batch(self): 76 | if self.minibatch_start+self.batch_size ' + x, current_captions) 115 | current_captions = map(lambda x: x.replace('.', ''), current_captions) 116 | current_captions = map(lambda x: x.replace(',', ''), current_captions) 117 | current_captions = map(lambda x: x.replace('"', ''), current_captions) 118 | current_captions = map(lambda x: x.replace('\n', ''), current_captions) 119 | current_captions = map(lambda x: x.replace('?', ''), current_captions) 120 | current_captions = map(lambda x: x.replace('!', ''), current_captions) 121 | current_captions = map(lambda x: x.replace('\\', ''), current_captions) 122 | current_captions = map(lambda x: x.replace('/', ''), current_captions) 123 | 124 | for idx, each_cap in enumerate(current_captions): 125 | word = each_cap.lower().split(' ') 126 | if len(word) < self.caption_step: 127 | current_captions[idx] = current_captions[idx] + ' ' 128 | else: 129 | new_word = '' 130 | for i in range(self.caption_step-1): 131 | new_word = new_word + word[i] + ' ' 132 | current_captions[idx] = new_word + '' 133 | 134 | current_caption_ind = [] 135 | for cap in current_captions: 136 | current_word_ind = [] 137 | for word in cap.lower().split(' '): 138 | if word in self.word2idx: 139 | current_word_ind.append(self.word2idx[word]) 140 | else: 141 | current_word_ind.append(self.word2idx['']); 142 | current_caption_ind.append(current_word_ind) 143 | 144 | current_caption_matrix = sequence.pad_sequences(current_caption_ind, padding='post', maxlen=self.caption_step) 145 | current_caption_matrix = np.hstack( [current_caption_matrix, np.zeros( [len(current_caption_matrix), 1] ) ] ).astype(int) 146 | 147 | return current_caption_matrix; 148 | 149 | def reset_train_data(self,train_data): 150 | index = list(train_data.index) 151 | np.random.shuffle(index) 152 | train_data = train_data.loc[index]; 153 | 154 | #group_data = train_data.groupby('video_path'); 155 | #print group_data; 156 | current_train_data = train_data.groupby('video_path').apply(lambda x: x.iloc[np.random.choice(len(x))]) 157 | current_train_data = current_train_data.reset_index(drop=True); 158 | return current_train_data; 159 | 160 | def get_video_train_data(self,video_data_path, video_feat_path): 161 | #read csv file using panda 162 | video_data = pd.read_csv(video_data_path, sep=',') 163 | #filte Language==English 164 | video_data = video_data[video_data['Language'] == 'English'] 165 | #get file name VideoID_Start_End.avi.npy 166 | video_data['video_path'] = video_data.apply(lambda row: row['VideoID']+'_'+str(int(row['Start']))+'_'+str(int(row['End']))+'.avi.npy', axis=1) 167 | #add video path 168 | video_data['video_path'] = video_data['video_path'].map(lambda x: os.path.join(video_feat_path, x)) 169 | #filte 
video path exist 170 | video_data = video_data[video_data['video_path'].map(lambda x: os.path.exists( x ))] 171 | #filte Description is a string (not none or other) 172 | video_data = video_data[video_data['Description'].map(lambda x: isinstance(x, str))] 173 | #filte video is unique 174 | unique_filenames = sorted(video_data['video_path'].unique()) 175 | train_data = video_data[video_data['video_path'].map(lambda x: x in unique_filenames)] 176 | return train_data 177 | 178 | def get_video_test_data(self,video_data_path, video_feat_path): 179 | video_data = pd.read_csv(video_data_path, sep=',') 180 | video_data = video_data[video_data['Language'] == 'English'] 181 | video_data['video_path'] = video_data.apply(lambda row: row['VideoID']+'_'+str(int(row['Start']))+'_'+str(int(row['End']))+'.avi.npy', axis=1) 182 | video_data['video_path'] = video_data['video_path'].map(lambda x: os.path.join(video_feat_path, x)) 183 | video_data = video_data[video_data['video_path'].map(lambda x: os.path.exists( x ))] 184 | video_data = video_data[video_data['Description'].map(lambda x: isinstance(x, str))] 185 | 186 | unique_filenames = sorted(video_data['video_path'].unique()) 187 | test_data = video_data[video_data['video_path'].map(lambda x: x in unique_filenames)] 188 | return test_data 189 | 190 | def preProBuildWordVocab(self,sentence_iterator, word_count_threshold=5): 191 | # borrowed this function from NeuralTalk 192 | print 'preprocessing word counts and creating vocab based on word count threshold %d' % (word_count_threshold) 193 | word_counts = {} 194 | nsents = 0 195 | #statistic the word 196 | for sent in sentence_iterator: 197 | nsents += 1 198 | for w in sent.lower().split(' '): 199 | word_counts[w] = word_counts.get(w, 0) + 1 200 | #filte the word whose number is lower than the threshold 201 | #vocab is a list containing all words which is more than threshold 202 | vocab = [w for w in word_counts if word_counts[w] >= word_count_threshold] 203 | print 'filtered words from %d to %d' % (len(word_counts), len(vocab)) 204 | 205 | ixtoword = {} 206 | ixtoword[0] = '' 207 | ixtoword[1] = '' 208 | ixtoword[2] = '' 209 | ixtoword[3] = '' 210 | 211 | wordtoix = {} 212 | wordtoix[''] = 0 213 | wordtoix[''] = 1 214 | wordtoix[''] = 2 215 | wordtoix[''] = 3 216 | 217 | #get the index and content, that is (idx,word) 218 | for idx, w in enumerate(vocab): 219 | wordtoix[w] = idx+4 220 | ixtoword[idx+4] = w 221 | 222 | word_counts[''] = nsents 223 | word_counts[''] = nsents 224 | word_counts[''] = nsents 225 | word_counts[''] = nsents 226 | 227 | #how to use this variable? bias_init_vector? 
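# (Answering the question above.) bias_init_vector follows the NeuralTalk recipe: the
# log of each word's corpus frequency, shifted so its maximum is zero, intended to
# initialise the decoder's output-layer bias so that an untrained model already predicts
# words at roughly their empirical frequency. In this PyTorch port the vector is computed
# and saved but apparently never applied -- VideoCaption initialises vec2word.bias to a
# constant 0 (see nn.init.constant(self.vec2word.bias, 0) below).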
228 | bias_init_vector = np.array([1.0 * word_counts[ ixtoword[i] ] for i in ixtoword]) 229 | bias_init_vector /= np.sum(bias_init_vector) # normalize to frequencies 230 | bias_init_vector = np.log(bias_init_vector) 231 | bias_init_vector -= np.max(bias_init_vector) # shift to nice numeric range 232 | 233 | return wordtoix, ixtoword, bias_init_vector 234 | 235 | class VideoCaption(nn.Module): 236 | def __init__(self,batch_size,image_dim,word_num,img_embed_dim,word_embed_dim,hidden_dim,video_step,caption_step): 237 | super(VideoCaption,self).__init__(); 238 | 239 | self.batch_size = batch_size; 240 | self.image_dim = image_dim; 241 | self.img_embed_dim = img_embed_dim; 242 | self.word_embed_dim = word_embed_dim; 243 | self.hidden_dim = hidden_dim; 244 | self.video_step = video_step; 245 | self.caption_step = caption_step; 246 | self.word_num = word_num; 247 | self.word2vec = nn.Embedding(word_num,word_embed_dim); 248 | 249 | nn.init.uniform(self.word2vec.weight,-0.1,0.1); 250 | self.vec2word = nn.Linear(hidden_dim,word_num); 251 | nn.init.uniform(self.vec2word.weight,-0.1,0.1); 252 | nn.init.constant(self.vec2word.bias,0); 253 | self.img_embed = nn.Linear(image_dim,img_embed_dim); 254 | nn.init.uniform(self.img_embed.weight,-0.1,0.1); 255 | nn.init.constant(self.img_embed.bias,0); 256 | self.lstm1 = nn.LSTMCell(input_size=img_embed_dim,hidden_size=hidden_dim); 257 | #nn.init.uniform(self.lstm1.weight_hh,-0.1,0.1); 258 | #nn.init.uniform(self.lstm1.weight_ih,-0.1,0.1); 259 | nn.init.orthogonal(self.lstm1.weight_hh); 260 | nn.init.orthogonal(self.lstm1.weight_ih); 261 | 262 | self.lstm2 = nn.LSTMCell(input_size=word_embed_dim+hidden_dim, hidden_size=hidden_dim); 263 | #nn.init.uniform(self.lstm2.weight_hh,-0.1,0.1); 264 | #nn.init.uniform(self.lstm2.weight_ih,-0.1,0.1); 265 | nn.init.orthogonal(self.lstm2.weight_hh); 266 | nn.init.orthogonal(self.lstm2.weight_ih); 267 | 268 | def forward(self, input_image, input_caption, caption_mask): 269 | ''' 270 | input_image: int Variable, batch_size x video_step x image_dim 271 | input_caption: int Variable, batch_size x (1+caption_step) x 1 (word is idx, so the dim is 1) 272 | ''' 273 | image_embeded_vector = self.img_embed(input_image); 274 | word_vec = self.word2vec(input_caption); 275 | 276 | #encoding 277 | state1 = Variable(torch.zeros(self.batch_size,self.lstm1.hidden_size)).cuda(); 278 | state2 = Variable(torch.zeros(self.batch_size,self.lstm2.hidden_size)).cuda(); 279 | output1 = Variable(torch.zeros(self.batch_size,self.lstm1.hidden_size)).cuda(); 280 | output2 = Variable(torch.zeros(self.batch_size,self.lstm2.hidden_size)).cuda(); 281 | padding_for_lstm1 = Variable(torch.zeros(self.batch_size,self.img_embed_dim)).cuda(); 282 | padding_for_lstm2 = Variable(torch.zeros(self.batch_size,self.word_embed_dim)).cuda(); 283 | 284 | for step in xrange(self.video_step): 285 | output1,state1 = self.lstm1(image_embeded_vector[:,step,:],(output1,state1)); 286 | output2,state2 = self.lstm2(torch.cat((padding_for_lstm2,output1),1),(output2,state2)); 287 | 288 | 289 | loss=Variable(torch.FloatTensor([0])).cuda(); 290 | #decoding 291 | #one_hot_eye = np.eye(self.word_num).astype('int64'); 292 | for step in xrange(self.caption_step): 293 | output1,state1 = self.lstm1(padding_for_lstm1,(output1, state1)); 294 | output2,state2 = self.lstm2(torch.cat((word_vec[:,step,:],output1),1),(output2, state2)); 295 | 296 | word_onehot = self.vec2word(output2); 297 | #word_onehot_softmax = nn.Softmax(dim=1)(word_onehot); 298 | labels = input_caption[:,step+1]; 299 | 300 | 
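# The per-step loss below uses CrossEntropyLoss(reduce=False) so that one loss value is
# returned per sample; multiplying by caption_mask[:, step+1] zeroes out positions beyond
# each caption's true length, so the zero padding never contributes to the gradient. The
# target is input_caption[:, step+1] because the caption tensor is shifted by one: the
# word fed in at this step sits at index `step`, and the word to be predicted is the next.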
#one_hot_labels = np.zeros((labels.data.shape[0],self.word_num),dtype='int64'); 301 | #for idx,data in enumerate(labels.data): 302 | # one_hot_labels[idx]=one_hot_eye[data]; 303 | 304 | #labels_onehot = Variable(torch.FloatTensor(one_hot_labels)).cuda(); 305 | #labels_onehot_list = labels_onehot.cpu().data.tolist() 306 | #print len(labels_onehot_list) 307 | #loss_func = nn.BCEWithLogitsLoss()*caption_mask[:,step]; 308 | loss_func = nn.CrossEntropyLoss(reduce=False); 309 | 310 | loss_temp = loss_func(word_onehot,labels)*(caption_mask[:,step+1].float()); 311 | loss += torch.sum(loss_temp)/self.batch_size; 312 | 313 | return loss; 314 | 315 | def generate_cpu(self, input_image): 316 | image_embeded_vector = self.img_embed(input_image); 317 | 318 | #encoding 319 | state1 = Variable(torch.zeros(1,self.lstm1.hidden_size)); 320 | state2 = Variable(torch.zeros(1,self.lstm2.hidden_size)); 321 | output1 = Variable(torch.zeros(1,self.lstm1.hidden_size)); 322 | output2 = Variable(torch.zeros(1,self.lstm2.hidden_size)); 323 | padding_for_lstm1 = Variable(torch.zeros(1,self.img_embed_dim)); 324 | padding_for_lstm2 = Variable(torch.zeros(1,self.word_embed_dim)); 325 | 326 | for step in xrange(self.video_step): 327 | output1,state1 = self.lstm1(image_embeded_vector[step,:],(output1,state1)); 328 | output2,state2 = self.lstm2(torch.cat((padding_for_lstm2,output1),1),(output2,state2)); 329 | 330 | 331 | words=[] 332 | #decoding 333 | #set '' 334 | previous_word = self.word2vec(Variable(torch.LongTensor([1]))); 335 | for step in xrange(self.caption_step): 336 | output1,state1 = self.lstm1(padding_for_lstm1,(output1, state1)); 337 | output2,state2 = self.lstm2(torch.cat((previous_word,output1),1),(output2, state2)); 338 | #previous_word = output2; 339 | 340 | word_onehot = self.vec2word(output2); 341 | #print word_onehot.shape 342 | _,word_idx = torch.max(word_onehot,1); 343 | #print word_idx.data[0]; 344 | 345 | words.append(word_idx.data[0]); 346 | 347 | previous_word = self.word2vec(word_idx); 348 | 349 | return words; 350 | 351 | batch_size = 100; 352 | image_dim = 4096; 353 | img_embed_dim = 1000; 354 | word_embed_dim = 1000; 355 | hidden_dim = 1000; 356 | video_step = 80; 357 | caption_step = 20; 358 | epoches=1001; 359 | csv_path = "./data/video_corpus.csv" 360 | 361 | 362 | def train(check_point=None): 363 | #parameters 364 | 365 | loss_log_file = "./loss.txt"; 366 | data_set = DataSet_MSVD(csv_path=csv_path,video_step=video_step,caption_step=caption_step,\ 367 | image_dim=image_dim,batch_size=batch_size); 368 | word_num = data_set.word_num; 369 | video_caption_net = VideoCaption(batch_size=batch_size, image_dim=image_dim,\ 370 | word_num = word_num, img_embed_dim=img_embed_dim,\ 371 | word_embed_dim=word_embed_dim,\ 372 | hidden_dim=hidden_dim,video_step=video_step,\ 373 | caption_step=caption_step); 374 | if check_point != None: 375 | video_caption_net.load_state_dict(state_dict=torch.load(check_point)); 376 | start_epoche = int(re.search(r"(\d*)\Z",check_point).groups()[-1]); 377 | else: 378 | start_epoche = int(0); 379 | 380 | video_caption_net.cuda(); 381 | print video_caption_net; 382 | 383 | optimizer = torch.optim.Adam(video_caption_net.parameters(),lr=10e-4); 384 | 385 | loss_list=[]; 386 | log_fp = open(loss_log_file,"w"); 387 | #torch.backends.cudnn = False; 388 | 389 | for epoche in range(start_epoche,epoches): 390 | mini_batch_idx=0; 391 | while True: 392 | start_time = time.time(); 393 | mini_batch_idx += 1; 394 | video_data,caption_data = data_set.next_batch(); 395 | if len(video_data) 
== 0: 396 | break; 397 | 398 | caption_mask = np.zeros( (caption_data.shape[0], caption_data.shape[1]) ) 399 | nonzeros = np.array( map(lambda x: (x != 0).sum() + 1, caption_data ) ) 400 | 401 | for ind, row in enumerate(caption_mask): 402 | row[:nonzeros[ind]] = 1 403 | 404 | video_data,caption_data,caption_mask = Variable(torch.FloatTensor(video_data)),Variable(torch.LongTensor(caption_data)),\ 405 | Variable(torch.LongTensor(caption_mask)); 406 | video_data,caption_data,caption_mask = video_data.cuda(),caption_data.cuda(),caption_mask.cuda(); 407 | loss = video_caption_net(video_data,caption_data,caption_mask); 408 | optimizer.zero_grad(); 409 | loss.backward(); 410 | optimizer.step(); 411 | 412 | loss_list.append(loss.data[0]); 413 | 414 | print("epoche:{0},mini_batch:{1},loss:{2},Escape time:{3}".format(\ 415 | epoche,mini_batch_idx,loss.data[0],str(time.time()-start_time))); 416 | log_fp.write("epoche:{0},mini_batch:{1},loss:{2},Escape time:{3}\n".format(\ 417 | epoche,mini_batch_idx,loss.data[0],str(time.time()-start_time))); 418 | if epoche%10 == 0: 419 | torch.save(video_caption_net.state_dict(),"./model_temp/s2vt.pytorch.{0}".format(epoche)); 420 | ax=plt.subplot(111) 421 | plt.plot(range(len(loss_list)),loss_list,color='black'); 422 | #plt.plot(range(epoches),test_loss_list,color="red"); 423 | plt.show(); 424 | 425 | #save model 426 | torch.save(video_caption_net.state_dict(),"./s2vt.pytorch"); 427 | 428 | ax=plt.subplot(111) 429 | plt.plot(range(len(loss_list)),loss_list,color='black'); 430 | #plt.plot(range(epoches),test_loss_list,color="red"); 431 | plt.show(); 432 | log_fp.close(); 433 | 434 | 435 | def test(state_dict_path): 436 | #parameters 437 | 438 | data_set = DataSet_MSVD(csv_path=csv_path,video_step=video_step,caption_step=caption_step,\ 439 | image_dim=image_dim,batch_size=batch_size); 440 | #test 441 | word_num = data_set.word_num; 442 | video_caption_net = VideoCaption(batch_size=batch_size, image_dim=image_dim,\ 443 | word_num = word_num, img_embed_dim=img_embed_dim,\ 444 | word_embed_dim=word_embed_dim,\ 445 | hidden_dim=hidden_dim,video_step=video_step,\ 446 | caption_step=caption_step); 447 | #video_caption_net.cuda(); 448 | print video_caption_net; 449 | video_caption_net.load_state_dict(state_dict=torch.load(state_dict_path)); 450 | 451 | #test data 452 | test_output_txt_fd = open("./test_result.txt","wb"); 453 | test_data_path = data_set.test_data["video_path"].unique(); 454 | for idx,data_path in enumerate(test_data_path): 455 | print("idx:{0},data_path:{1}".format(idx,data_path)) 456 | video_feature = np.load(data_path); 457 | if video_feature.shape[0] != video_step: 458 | continue; 459 | video_feature = Variable(torch.FloatTensor(video_feature)); 460 | #video_feature.cuda(); 461 | words = video_caption_net.generate_cpu(video_feature); 462 | 463 | generated_words = [data_set.idx2word[word] for word in words]; 464 | 465 | punctuation = np.argmax(np.array(generated_words) == '') + 1 466 | generated_words = generated_words[:punctuation] 467 | 468 | generated_sentence = ' '.join(generated_words) 469 | generated_sentence = generated_sentence.replace(' ', '') 470 | generated_sentence = generated_sentence.replace(' ', '') 471 | print generated_sentence,'\n' 472 | test_output_txt_fd.write(data_path + '\n') 473 | test_output_txt_fd.write(generated_sentence + '\n\n') 474 | 475 | if __name__=="__main__": 476 | if len(sys.argv)>=2: 477 | if sys.argv[1]=='train': 478 | train(check_point=None); 479 | elif sys.argv[1]=='test': 480 | 
test("./model_temp/s2vt.pytorch.1000"); 481 | else: 482 | print("use 'python s2vt.py train' to train the model and use 'python s2vt.py test' to test the model"); 483 | else: 484 | print("use 'python s2vt.py train' to train the model and use 'python s2vt.py test' to test the model"); 485 | 486 | 487 | 488 | 489 | 490 | 491 | --------------------------------------------------------------------------------