├── README.md ├── picture └── s2vt.png └── s2vt ├── .gitignore ├── cnn_util.py ├── coco_eval ├── create_reference.py ├── create_result_json.py ├── eval.py ├── parse_video_csv.py ├── pycocoevalcap │ ├── __init__.py │ ├── __init__.pyc │ ├── bleu │ │ ├── LICENSE │ │ ├── __init__.py │ │ ├── __init__.pyc │ │ ├── bleu.py │ │ ├── bleu.pyc │ │ ├── bleu_scorer.py │ │ └── bleu_scorer.pyc │ ├── cider │ │ ├── __init__.py │ │ ├── __init__.pyc │ │ ├── cider.py │ │ ├── cider.pyc │ │ ├── cider_scorer.py │ │ └── cider_scorer.pyc │ ├── eval.py │ ├── eval.pyc │ ├── meteor │ │ ├── __init__.py │ │ ├── __init__.pyc │ │ ├── data │ │ │ └── paraphrase-en.gz │ │ ├── meteor-1.5.jar │ │ ├── meteor.py │ │ └── meteor.pyc │ ├── rouge │ │ ├── __init__.py │ │ ├── __init__.pyc │ │ ├── rouge.py │ │ └── rouge.pyc │ └── tokenizer │ │ ├── __init__.py │ │ ├── __init__.pyc │ │ ├── ptbtokenizer.py │ │ ├── ptbtokenizer.pyc │ │ ├── stanford-corenlp-3.4.1.jar │ │ ├── tmp0vwKED │ │ ├── tmp3FFDJw │ │ ├── tmp_jaQxJ │ │ ├── tmpc6XoAB │ │ ├── tmpcRaDxK │ │ ├── tmpkJaFVH │ │ ├── tmpoInl8I │ │ └── tmprhiH1L └── pycocotools │ ├── __init__.py │ ├── __init__.pyc │ ├── coco.py │ └── coco.pyc ├── data └── readme.md ├── extract_RGB_feats.py ├── model_temp └── readme.md ├── readme.md ├── rgb_test_features └── readme.md ├── rgb_train_features └── readme.md └── s2vt.py /README.md: -------------------------------------------------------------------------------- 1 | # pytorch_video_caption 2 | Some models for video captioning, implemented in PyTorch. 3 | ### s2vt 4 | Sequence to sequence: from video to text (S2VT), implemented in PyTorch. 5 | Some other implementations: 6 | - [tensorflow](https://github.com/chenxinpeng/S2VT) by chenxinpeng 7 | - [caffe](https://gist.github.com/vsubhashini/38d087e140854fee4b14) by the original authors.
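A minimal sketch of the S2VT layout in PyTorch (illustrative only: the layer sizes, variable names, and zero-padding shortcut below are assumptions, not the exact model defined in s2vt.py):

```python
import torch
import torch.nn as nn

class S2VTSketch(nn.Module):
    """Two stacked LSTMs: the first reads frame features, the second reads the
    first LSTM's hidden states concatenated with word embeddings."""
    def __init__(self, feat_dim=4096, embed_dim=500, hidden_dim=1000, vocab_size=3000):
        super(S2VTSketch, self).__init__()
        self.frame_embed = nn.Linear(feat_dim, embed_dim)            # project CNN features
        self.word_embed = nn.Embedding(vocab_size, embed_dim)
        self.lstm1 = nn.LSTM(embed_dim, hidden_dim, batch_first=True)               # video LSTM
        self.lstm2 = nn.LSTM(embed_dim + hidden_dim, hidden_dim, batch_first=True)  # language LSTM
        self.out = nn.Linear(hidden_dim, vocab_size)

    def forward(self, feats, captions):
        # feats: (batch, n_frames, feat_dim); captions: (batch, n_words) word ids
        batch, n_frames, _ = feats.size()
        n_words = captions.size(1)
        embed_dim = self.word_embed.embedding_dim
        # encoding stage: frames go into lstm1 while zeros stand in for words;
        # decoding stage: zeros stand in for frames while words go into lstm2
        v = torch.cat([self.frame_embed(feats),
                       feats.new_zeros(batch, n_words, embed_dim)], dim=1)
        h1, _ = self.lstm1(v)
        w = torch.cat([feats.new_zeros(batch, n_frames, embed_dim),
                       self.word_embed(captions)], dim=1)
        h2, _ = self.lstm2(torch.cat([w, h1], dim=2))
        # score only the decoding-stage outputs against the next-word targets
        return self.out(h2[:, n_frames:, :])
```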
8 | 9 | -------------------------------------------------------------------------------- /picture/s2vt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smallflyingpig/pytorch_video_caption/c725e0a1662c6c606f3c25b0e46b7cf576fb86e6/picture/s2vt.png -------------------------------------------------------------------------------- /s2vt/.gitignore: -------------------------------------------------------------------------------- 1 | # python 2 | 3 | # data 4 | *.npy 5 | *.csv 6 | *.pyc 7 | -------------------------------------------------------------------------------- /s2vt/cnn_util.py: -------------------------------------------------------------------------------- 1 | 2 | import sys 3 | sys.path.append('/root/workspace/caffe/python') 4 | import caffe 5 | 6 | #import ipdb 7 | import cv2 8 | import numpy as np 9 | import skimage 10 | 11 | deploy = '/root/workspace/caffe/models/vgg_16/VGG_ILSVRC_16_layers_deploy.prototxt' 12 | model = '/root/workspace/caffe/models/vgg/VGG_ILSVRC_16_layers.caffemodel' 13 | mean = '/root/workspace/caffe/python/caffe/imagenet/ilsvrc_2012_mean.npy' 14 | 15 | class CNN(object): 16 | 17 | def __init__(self, deploy=deploy, model=model, mean=mean, batch_size=10, width=227, height=227): 18 | 19 | self.deploy = deploy 20 | self.model = model 21 | self.mean = mean 22 | 23 | self.batch_size = batch_size 24 | self.net, self.transformer = self.get_net() 25 | self.net.blobs['data'].reshape(self.batch_size, 3, height, width) 26 | 27 | self.width = width 28 | self.height = height 29 | 30 | def get_net(self): 31 | caffe.set_mode_gpu() 32 | net = caffe.Net(self.deploy, self.model, caffe.TEST) 33 | 34 | transformer = caffe.io.Transformer({'data':net.blobs['data'].data.shape}) 35 | transformer.set_transpose('data', (2,0,1)) 36 | transformer.set_mean('data', np.load(self.mean).mean(1).mean(1)) 37 | transformer.set_raw_scale('data', 255) 38 | transformer.set_channel_swap('data', (2,1,0)) 39 | 40 | return net, transformer 41 | 42 | def get_features(self, image_list, layers='fc7', layer_sizes=[4096]): 43 | iter_until = len(image_list) + self.batch_size 44 | # we fill the zeros 45 | #num_frames = 80 46 | #all_feats = np.zeros([num_frames] + layer_sizes) 47 | all_feats = np.zeros([len(image_list)] + layer_sizes) 48 | 49 | for start, end in zip(range(0, iter_until, self.batch_size), \ 50 | range(self.batch_size, iter_until, self.batch_size)): 51 | 52 | image_batch = image_list[start:end] 53 | 54 | caffe_in = np.zeros(np.array(image_batch.shape)[[0,3,1,2]], dtype=np.float32) 55 | 56 | for idx, in_ in enumerate(image_batch): 57 | caffe_in[idx] = self.transformer.preprocess('data', in_) 58 | 59 | out = self.net.forward_all(blobs=[layers], **{'data':caffe_in}) 60 | feats = out[layers] 61 | 62 | all_feats[start:end] = feats 63 | 64 | return all_feats 65 | 66 | -------------------------------------------------------------------------------- /s2vt/coco_eval/create_reference.py: -------------------------------------------------------------------------------- 1 | # encoding: UTF-8 2 | 3 | import os 4 | import glob 5 | import numpy as np 6 | import pandas as pd 7 | import cPickle as pickle 8 | 9 | 10 | video_src_dir = "../data/video_corpus.csv" 11 | 12 | video_data = pd.read_csv(video_src_dir, sep=',') 13 | video_data = video_data[video_data['Language'] == 'English'] 14 | 15 | videoID_lists = list(video_data['VideoID']) 16 | videoID_start_lists = list(video_data['Start']) 17 | videoID_end_lists = list(video_data['End']) 18 | 
video_descriptions_lists = list(video_data['Description']) 19 | 20 | videoID_with_Frames = [] 21 | for idx, item in enumerate(videoID_lists): 22 | temp = videoID_lists[idx] + '_' + str(int(videoID_start_lists[idx])) + '_' + str(int(videoID_end_lists[idx])) 23 | videoID_with_Frames.append(temp) 24 | 25 | # videoID map 26 | videoID_shrinked = list(set(videoID_with_Frames)) 27 | 28 | map_videoID = {} 29 | for idx, item in enumerate(videoID_shrinked): 30 | map_videoID[idx] = item 31 | 32 | with open('map_videoID.pkl', 'w') as f: 33 | pickle.dump(map_videoID, f) 34 | 35 | ########################################################################################################### 36 | # judge the ascii 37 | ########################################################################################################### 38 | def is_ascii(s): 39 | return all(ord(c) < 128 for c in s) 40 | 41 | 42 | json_fd = open('reference.json', 'w') 43 | json_fd.write('{"info": {"description": "test", "url": "https://github.com/chenxinpeng", "version": "1.0", "year": 2017, "contributor": "ChenXinpeng", "date_created": "2017-01-27"}, "images": [') 44 | 45 | 46 | tmp_idx = 1 47 | for key in map_videoID: 48 | if tmp_idx != len(map_videoID): 49 | json_fd.write('{"license": 1, "file_name": "' + str(map_videoID[key]) + '", "id": ' + str(key) + '}, ') 50 | if tmp_idx == len(map_videoID): 51 | json_fd.write('{"license": 1, "file_name": "' + str(map_videoID[key]) + '", "id": ' + str(key) + '}], ') 52 | tmp_idx += 1 53 | 54 | json_fd.write('"licenses": [{"url": "http://creativecommons.org/licenses/by-nc-sa/2.0/", "id": 1, "name": "Attribution-NonCommercial-ShareAlike License"}], ') 55 | json_fd.write('"type": "captions", "annotations": [') 56 | 57 | id_count = 0 58 | for count, key in enumerate(map_videoID): 59 | video_frame = map_videoID[key] 60 | indices = [i for i, x in enumerate(videoID_with_Frames) if x == video_frame] 61 | if count != len(map_videoID)-1: 62 | for idx in indices: 63 | if type(video_descriptions_lists[idx]) == type(1.0): 64 | continue 65 | 66 | if '\\' in video_descriptions_lists[idx]: 67 | print video_descriptions_lists[idx] 68 | continue 69 | 70 | if '"' in video_descriptions_lists[idx]: 71 | print video_descriptions_lists[int(idx)] 72 | continue 73 | 74 | if "\n" in video_descriptions_lists[idx]: 75 | print video_descriptions_lists[int(idx)] 76 | continue 77 | 78 | if is_ascii(video_descriptions_lists[idx]): 79 | json_fd.write('{"image_id": ' + str(key) + ', "id": ' + str(id_count) + ', "caption": "' + str(video_descriptions_lists[idx]) + '"}, ') 80 | id_count = id_count + 1 81 | 82 | if count == len(map_videoID)-1: 83 | for ii, idx in enumerate(indices): 84 | if type(video_descriptions_lists[idx]) == type(1.0): 85 | continue 86 | 87 | if '\\' in video_descriptions_lists[idx]: 88 | print video_descriptions_lists[idx] 89 | continue 90 | 91 | if '"' in video_descriptions_lists[idx]: 92 | print video_descriptions_lists[int(idx)] 93 | continue 94 | 95 | if "\n" in video_descriptions_lists[idx]: 96 | print video_descriptions_lists[int(idx)] 97 | continue 98 | 99 | if ii != len(indices)-1: 100 | json_fd.write('{"image_id": ' + str(key) + ', "id": ' + str(id_count) + ', "caption": "' + str(video_descriptions_lists[idx]) + '"}, ') 101 | id_count = id_count + 1 102 | if ii == len(indices)-1: 103 | json_fd.write('{"image_id": ' + str(key) + ', "id": ' + str(id_count) + ', "caption": "' + str(video_descriptions_lists[idx]) + '"}]}') 104 | 105 | json_fd.close() 106 | 
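The script above assembles reference.json through raw string concatenation, which breaks on stray quotes, backslashes, or newlines in a caption. A sketch of an equivalent reference file built with the json module (a hypothetical helper reusing the script's map_videoID, videoID_with_Frames, and video_descriptions_lists) is:

```python
# Sketch: build the same COCO-style reference file with json.dump instead of
# manual string concatenation (names mirror create_reference.py above).
import json

def write_reference(map_videoID, videoID_with_Frames, video_descriptions_lists,
                    out_path='reference.json'):
    images, annotations, ann_id = [], [], 0
    for key, video_frame in map_videoID.items():
        images.append({"license": 1, "file_name": video_frame, "id": key})
        for idx, name in enumerate(videoID_with_Frames):
            caption = video_descriptions_lists[idx]
            # keep only ASCII string captions belonging to this video clip
            if name != video_frame or not isinstance(caption, str):
                continue
            if not all(ord(c) < 128 for c in caption):
                continue
            annotations.append({"image_id": key, "id": ann_id, "caption": caption})
            ann_id += 1
    ref = {"info": {"description": "test"},
           "licenses": [{"url": "http://creativecommons.org/licenses/by-nc-sa/2.0/",
                         "id": 1, "name": "Attribution-NonCommercial-ShareAlike License"}],
           "type": "captions", "images": images, "annotations": annotations}
    with open(out_path, 'w') as f:
        json.dump(ref, f)
```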
-------------------------------------------------------------------------------- /s2vt/coco_eval/create_result_json.py: -------------------------------------------------------------------------------- 1 | # encoding: UTF-8 2 | 3 | import os 4 | import glob 5 | import numpy as np 6 | 7 | import cPickle 8 | 9 | # change the output file 10 | output_txt_dir = "../test_result.txt" 11 | output = open(output_txt_dir).read().splitlines() 12 | 13 | num_all_output = len(output) 14 | 15 | avi_names_lists = [] 16 | machine_produced_sentences = [] 17 | 18 | for idx, item in enumerate(output): 19 | if (idx % 3) == 0: 20 | avi_names_lists.append(item) 21 | if (idx % 3) == 1: 22 | machine_produced_sentences.append(item) 23 | 24 | avi_npy_basenames = map(lambda item: os.path.basename(item), avi_names_lists) 25 | 26 | avi_names = [] 27 | for each_avi in avi_npy_basenames: 28 | tmp1, tmp2, tmp3 = each_avi.split('.') 29 | avi_names.append(tmp1) 30 | 31 | fd = open('map_videoID.pkl', 'rb') 32 | map_videoID = cPickle.load(fd) 33 | 34 | map_videoID_reverse = {} 35 | for key in map_videoID: 36 | val = map_videoID[key] 37 | map_videoID_reverse[val] = key 38 | 39 | json_fd = open('generation.json', 'w') 40 | json_fd.write('[') 41 | for idx, item in enumerate(avi_names): 42 | if idx != len(avi_names)-1: 43 | json_fd.write('{"image_id": ' + str(map_videoID_reverse[item]) + ', "caption": "' + str(machine_produced_sentences[idx]) + '"}, ') 44 | if idx == len(avi_names)-1: 45 | json_fd.write('{"image_id": ' + str(map_videoID_reverse[item]) + ', "caption": "' + str(machine_produced_sentences[idx]) + '"}]') 46 | 47 | json_fd.close() 48 | -------------------------------------------------------------------------------- /s2vt/coco_eval/eval.py: -------------------------------------------------------------------------------- 1 | #! 
encoding: UTF-8 2 | 3 | import os 4 | from pycocotools.coco import COCO 5 | from pycocoevalcap.eval import COCOEvalCap 6 | 7 | annFile = 'reference.json' 8 | resFile = 'generation.json' 9 | 10 | # create coco object and cocoRes object 11 | coco = COCO(annFile) 12 | cocoRes = coco.loadRes(resFile) 13 | 14 | # create cocoEval object by taking coco and cocoRes 15 | cocoEval = COCOEvalCap(coco, cocoRes) 16 | 17 | # evaluate on a subset of images by setting 18 | # cocoEval.params['image_id'] = cocoRes.getImgIds() 19 | # please remove this line when evaluating the full validation set 20 | cocoEval.params['image_id'] = cocoRes.getImgIds() 21 | 22 | # evaluate results 23 | cocoEval.evaluate() 24 | 25 | # print output evaluation scores 26 | for metric, score in cocoEval.eval.items(): 27 | print '%s: %.3f'%(metric, score) 28 | 29 | 30 | -------------------------------------------------------------------------------- /s2vt/coco_eval/parse_video_csv.py: -------------------------------------------------------------------------------- 1 | # encoding: UTF-8 2 | 3 | import os 4 | import glob 5 | 6 | import numpy as np 7 | import pandas as pd 8 | 9 | video_src_dir = "../data/video_corpus.csv" 10 | 11 | video_data = pd.read_csv(video_src_dir, sep=',') 12 | video_data = video_data[video_data['Language'] == 'English'] 13 | 14 | videoID_lists = list(video_data['VideoID']) 15 | videoID_start_lists = list(video_data['Start']) 16 | videoID_end_lists = list(video_data['End']) 17 | video_descriptions_lists = list(video_data['Description']) 18 | 19 | videoID_with_Frames = [] 20 | for idx, item in enumerate(videoID_lists): 21 | temp = videoID_lists[idx] + '_' + str(videoID_start_lists[idx]) + '_' + str(videoID_end_lists[idx]) 22 | videoID_with_Frames.append(temp) 23 | 24 | #print video_descriptions_lists 25 | -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smallflyingpig/pytorch_video_caption/c725e0a1662c6c606f3c25b0e46b7cf576fb86e6/s2vt/coco_eval/pycocoevalcap/__init__.pyc -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/bleu/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015 Xinlei Chen, Hao Fang, Tsung-Yi Lin, and Ramakrishna Vedantam 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/bleu/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/bleu/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smallflyingpig/pytorch_video_caption/c725e0a1662c6c606f3c25b0e46b7cf576fb86e6/s2vt/coco_eval/pycocoevalcap/bleu/__init__.pyc -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/bleu/bleu.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # File Name : bleu.py 4 | # 5 | # Description : Wrapper for BLEU scorer. 6 | # 7 | # Creation Date : 06-01-2015 8 | # Last Modified : Thu 19 Mar 2015 09:13:28 PM PDT 9 | # Authors : Hao Fang and Tsung-Yi Lin 10 | 11 | from bleu_scorer import BleuScorer 12 | 13 | 14 | class Bleu: 15 | def __init__(self, n=4): 16 | # default compute Blue score up to 4 17 | self._n = n 18 | self._hypo_for_image = {} 19 | self.ref_for_image = {} 20 | 21 | def compute_score(self, gts, res): 22 | 23 | assert(gts.keys() == res.keys()) 24 | imgIds = gts.keys() 25 | 26 | bleu_scorer = BleuScorer(n=self._n) 27 | for id in imgIds: 28 | hypo = res[id] 29 | ref = gts[id] 30 | 31 | # Sanity check. 32 | assert(type(hypo) is list) 33 | assert(len(hypo) == 1) 34 | assert(type(ref) is list) 35 | assert(len(ref) >= 1) 36 | 37 | bleu_scorer += (hypo[0], ref) 38 | 39 | #score, scores = bleu_scorer.compute_score(option='shortest') 40 | score, scores = bleu_scorer.compute_score(option='closest', verbose=1) 41 | #score, scores = bleu_scorer.compute_score(option='average', verbose=1) 42 | 43 | # return (bleu, bleu_info) 44 | return score, scores 45 | 46 | def method(self): 47 | return "Bleu" 48 | -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/bleu/bleu.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smallflyingpig/pytorch_video_caption/c725e0a1662c6c606f3c25b0e46b7cf576fb86e6/s2vt/coco_eval/pycocoevalcap/bleu/bleu.pyc -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/bleu/bleu_scorer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # bleu_scorer.py 4 | # David Chiang 5 | 6 | # Copyright (c) 2004-2006 University of Maryland. All rights 7 | # reserved. Do not redistribute without permission from the 8 | # author. Not for commercial use. 9 | 10 | # Modified by: 11 | # Hao Fang 12 | # Tsung-Yi Lin 13 | 14 | '''Provides: 15 | cook_refs(refs, n=4): Transform a list of reference sentences as strings into a form usable by cook_test(). 16 | cook_test(test, refs, n=4): Transform a test sentence as a string (together with the cooked reference sentences) into a form usable by score_cooked(). 
17 | ''' 18 | 19 | import copy 20 | import sys, math, re 21 | from collections import defaultdict 22 | 23 | def precook(s, n=4, out=False): 24 | """Takes a string as input and returns an object that can be given to 25 | either cook_refs or cook_test. This is optional: cook_refs and cook_test 26 | can take string arguments as well.""" 27 | words = s.split() 28 | counts = defaultdict(int) 29 | for k in xrange(1,n+1): 30 | for i in xrange(len(words)-k+1): 31 | ngram = tuple(words[i:i+k]) 32 | counts[ngram] += 1 33 | return (len(words), counts) 34 | 35 | def cook_refs(refs, eff=None, n=4): ## lhuang: oracle will call with "average" 36 | '''Takes a list of reference sentences for a single segment 37 | and returns an object that encapsulates everything that BLEU 38 | needs to know about them.''' 39 | 40 | reflen = [] 41 | maxcounts = {} 42 | for ref in refs: 43 | rl, counts = precook(ref, n) 44 | reflen.append(rl) 45 | for (ngram,count) in counts.iteritems(): 46 | maxcounts[ngram] = max(maxcounts.get(ngram,0), count) 47 | 48 | # Calculate effective reference sentence length. 49 | if eff == "shortest": 50 | reflen = min(reflen) 51 | elif eff == "average": 52 | reflen = float(sum(reflen))/len(reflen) 53 | 54 | ## lhuang: N.B.: leave reflen computaiton to the very end!! 55 | 56 | ## lhuang: N.B.: in case of "closest", keep a list of reflens!! (bad design) 57 | 58 | return (reflen, maxcounts) 59 | 60 | def cook_test(test, (reflen, refmaxcounts), eff=None, n=4): 61 | '''Takes a test sentence and returns an object that 62 | encapsulates everything that BLEU needs to know about it.''' 63 | 64 | testlen, counts = precook(test, n, True) 65 | 66 | result = {} 67 | 68 | # Calculate effective reference sentence length. 69 | 70 | if eff == "closest": 71 | result["reflen"] = min((abs(l-testlen), l) for l in reflen)[1] 72 | else: ## i.e., "average" or "shortest" or None 73 | result["reflen"] = reflen 74 | 75 | result["testlen"] = testlen 76 | 77 | result["guess"] = [max(0,testlen-k+1) for k in xrange(1,n+1)] 78 | 79 | result['correct'] = [0]*n 80 | for (ngram, count) in counts.iteritems(): 81 | result["correct"][len(ngram)-1] += min(refmaxcounts.get(ngram,0), count) 82 | 83 | return result 84 | 85 | class BleuScorer(object): 86 | """Bleu scorer. 87 | """ 88 | 89 | __slots__ = "n", "crefs", "ctest", "_score", "_ratio", "_testlen", "_reflen", "special_reflen" 90 | # special_reflen is used in oracle (proportional effective ref len for a node). 
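    # compute_score() below implements smoothed corpus-level BLEU: for each order
    # k = 1..n the modified precision is
    #     p_k = (clipped n-gram matches + tiny) / (n-gram guesses + small),
    # BLEU_k is the running geometric mean (p_1 * ... * p_k) ** (1./k), and the
    # score is multiplied by the brevity penalty exp(1 - reflen/testlen) whenever
    # the hypothesis is shorter than the effective reference length.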
91 | 92 | def copy(self): 93 | ''' copy the refs.''' 94 | new = BleuScorer(n=self.n) 95 | new.ctest = copy.copy(self.ctest) 96 | new.crefs = copy.copy(self.crefs) 97 | new._score = None 98 | return new 99 | 100 | def __init__(self, test=None, refs=None, n=4, special_reflen=None): 101 | ''' singular instance ''' 102 | 103 | self.n = n 104 | self.crefs = [] 105 | self.ctest = [] 106 | self.cook_append(test, refs) 107 | self.special_reflen = special_reflen 108 | 109 | def cook_append(self, test, refs): 110 | '''called by constructor and __iadd__ to avoid creating new instances.''' 111 | 112 | if refs is not None: 113 | self.crefs.append(cook_refs(refs)) 114 | if test is not None: 115 | cooked_test = cook_test(test, self.crefs[-1]) 116 | self.ctest.append(cooked_test) ## N.B.: -1 117 | else: 118 | self.ctest.append(None) # lens of crefs and ctest have to match 119 | 120 | self._score = None ## need to recompute 121 | 122 | def ratio(self, option=None): 123 | self.compute_score(option=option) 124 | return self._ratio 125 | 126 | def score_ratio(self, option=None): 127 | '''return (bleu, len_ratio) pair''' 128 | return (self.fscore(option=option), self.ratio(option=option)) 129 | 130 | def score_ratio_str(self, option=None): 131 | return "%.4f (%.2f)" % self.score_ratio(option) 132 | 133 | def reflen(self, option=None): 134 | self.compute_score(option=option) 135 | return self._reflen 136 | 137 | def testlen(self, option=None): 138 | self.compute_score(option=option) 139 | return self._testlen 140 | 141 | def retest(self, new_test): 142 | if type(new_test) is str: 143 | new_test = [new_test] 144 | assert len(new_test) == len(self.crefs), new_test 145 | self.ctest = [] 146 | for t, rs in zip(new_test, self.crefs): 147 | self.ctest.append(cook_test(t, rs)) 148 | self._score = None 149 | 150 | return self 151 | 152 | def rescore(self, new_test): 153 | ''' replace test(s) with new test(s), and returns the new score.''' 154 | 155 | return self.retest(new_test).compute_score() 156 | 157 | def size(self): 158 | assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest)) 159 | return len(self.crefs) 160 | 161 | def __iadd__(self, other): 162 | '''add an instance (e.g., from another sentence).''' 163 | 164 | if type(other) is tuple: 165 | ## avoid creating new BleuScorer instances 166 | self.cook_append(other[0], other[1]) 167 | else: 168 | assert self.compatible(other), "incompatible BLEUs." 
169 | self.ctest.extend(other.ctest) 170 | self.crefs.extend(other.crefs) 171 | self._score = None ## need to recompute 172 | 173 | return self 174 | 175 | def compatible(self, other): 176 | return isinstance(other, BleuScorer) and self.n == other.n 177 | 178 | def single_reflen(self, option="average"): 179 | return self._single_reflen(self.crefs[0][0], option) 180 | 181 | def _single_reflen(self, reflens, option=None, testlen=None): 182 | 183 | if option == "shortest": 184 | reflen = min(reflens) 185 | elif option == "average": 186 | reflen = float(sum(reflens))/len(reflens) 187 | elif option == "closest": 188 | reflen = min((abs(l-testlen), l) for l in reflens)[1] 189 | else: 190 | assert False, "unsupported reflen option %s" % option 191 | 192 | return reflen 193 | 194 | def recompute_score(self, option=None, verbose=0): 195 | self._score = None 196 | return self.compute_score(option, verbose) 197 | 198 | def compute_score(self, option=None, verbose=0): 199 | n = self.n 200 | small = 1e-9 201 | tiny = 1e-15 ## so that if guess is 0 still return 0 202 | bleu_list = [[] for _ in range(n)] 203 | 204 | if self._score is not None: 205 | return self._score 206 | 207 | if option is None: 208 | option = "average" if len(self.crefs) == 1 else "closest" 209 | 210 | self._testlen = 0 211 | self._reflen = 0 212 | totalcomps = {'testlen':0, 'reflen':0, 'guess':[0]*n, 'correct':[0]*n} 213 | 214 | # for each sentence 215 | for comps in self.ctest: 216 | testlen = comps['testlen'] 217 | self._testlen += testlen 218 | 219 | if self.special_reflen is None: ## need computation 220 | reflen = self._single_reflen(comps['reflen'], option, testlen) 221 | else: 222 | reflen = self.special_reflen 223 | 224 | self._reflen += reflen 225 | 226 | for key in ['guess','correct']: 227 | for k in xrange(n): 228 | totalcomps[key][k] += comps[key][k] 229 | 230 | # append per image bleu score 231 | bleu = 1. 232 | for k in xrange(n): 233 | bleu *= (float(comps['correct'][k]) + tiny) \ 234 | /(float(comps['guess'][k]) + small) 235 | bleu_list[k].append(bleu ** (1./(k+1))) 236 | ratio = (testlen + tiny) / (reflen + small) ## N.B.: avoid zero division 237 | if ratio < 1: 238 | for k in xrange(n): 239 | bleu_list[k][-1] *= math.exp(1 - 1/ratio) 240 | 241 | if verbose > 1: 242 | print comps, reflen 243 | 244 | totalcomps['reflen'] = self._reflen 245 | totalcomps['testlen'] = self._testlen 246 | 247 | bleus = [] 248 | bleu = 1. 
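        # corpus-level score: the same per-order formula as above, computed from
        # the match/guess totals accumulated over all sentences in totalcomps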
249 | for k in xrange(n): 250 | bleu *= float(totalcomps['correct'][k] + tiny) \ 251 | / (totalcomps['guess'][k] + small) 252 | bleus.append(bleu ** (1./(k+1))) 253 | ratio = (self._testlen + tiny) / (self._reflen + small) ## N.B.: avoid zero division 254 | if ratio < 1: 255 | for k in xrange(n): 256 | bleus[k] *= math.exp(1 - 1/ratio) 257 | 258 | if verbose > 0: 259 | print totalcomps 260 | print "ratio:", ratio 261 | 262 | self._score = bleus 263 | return self._score, bleu_list 264 | -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/bleu/bleu_scorer.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smallflyingpig/pytorch_video_caption/c725e0a1662c6c606f3c25b0e46b7cf576fb86e6/s2vt/coco_eval/pycocoevalcap/bleu/bleu_scorer.pyc -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/cider/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/cider/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smallflyingpig/pytorch_video_caption/c725e0a1662c6c606f3c25b0e46b7cf576fb86e6/s2vt/coco_eval/pycocoevalcap/cider/__init__.pyc -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/cider/cider.py: -------------------------------------------------------------------------------- 1 | # Filename: cider.py 2 | # 3 | # Description: Describes the class to compute the CIDEr (Consensus-Based Image Description Evaluation) Metric 4 | # by Vedantam, Zitnick, and Parikh (http://arxiv.org/abs/1411.5726) 5 | # 6 | # Creation Date: Sun Feb 8 14:16:54 2015 7 | # 8 | # Authors: Ramakrishna Vedantam and Tsung-Yi Lin 9 | 10 | from cider_scorer import CiderScorer 11 | import pdb 12 | 13 | class Cider: 14 | """ 15 | Main Class to compute the CIDEr metric 16 | 17 | """ 18 | def __init__(self, test=None, refs=None, n=4, sigma=6.0): 19 | # set cider to sum over 1 to 4-grams 20 | self._n = n 21 | # set the standard deviation parameter for gaussian penalty 22 | self._sigma = sigma 23 | 24 | def compute_score(self, gts, res): 25 | """ 26 | Main function to compute CIDEr score 27 | :param hypo_for_image (dict) : dictionary with key and value 28 | ref_for_image (dict) : dictionary with key and value 29 | :return: cider (float) : computed CIDEr score for the corpus 30 | """ 31 | 32 | assert(gts.keys() == res.keys()) 33 | imgIds = gts.keys() 34 | 35 | cider_scorer = CiderScorer(n=self._n, sigma=self._sigma) 36 | 37 | for id in imgIds: 38 | hypo = res[id] 39 | ref = gts[id] 40 | 41 | # Sanity check. 
42 | assert(type(hypo) is list) 43 | assert(len(hypo) == 1) 44 | assert(type(ref) is list) 45 | assert(len(ref) > 0) 46 | 47 | cider_scorer += (hypo[0], ref) 48 | 49 | (score, scores) = cider_scorer.compute_score() 50 | 51 | return score, scores 52 | 53 | def method(self): 54 | return "CIDEr" -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/cider/cider.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smallflyingpig/pytorch_video_caption/c725e0a1662c6c606f3c25b0e46b7cf576fb86e6/s2vt/coco_eval/pycocoevalcap/cider/cider.pyc -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/cider/cider_scorer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Tsung-Yi Lin 3 | # Ramakrishna Vedantam 4 | 5 | import copy 6 | from collections import defaultdict 7 | import numpy as np 8 | import pdb 9 | import math 10 | 11 | def precook(s, n=4, out=False): 12 | """ 13 | Takes a string as input and returns an object that can be given to 14 | either cook_refs or cook_test. This is optional: cook_refs and cook_test 15 | can take string arguments as well. 16 | :param s: string : sentence to be converted into ngrams 17 | :param n: int : number of ngrams for which representation is calculated 18 | :return: term frequency vector for occuring ngrams 19 | """ 20 | words = s.split() 21 | counts = defaultdict(int) 22 | for k in xrange(1,n+1): 23 | for i in xrange(len(words)-k+1): 24 | ngram = tuple(words[i:i+k]) 25 | counts[ngram] += 1 26 | return counts 27 | 28 | def cook_refs(refs, n=4): ## lhuang: oracle will call with "average" 29 | '''Takes a list of reference sentences for a single segment 30 | and returns an object that encapsulates everything that BLEU 31 | needs to know about them. 32 | :param refs: list of string : reference sentences for some image 33 | :param n: int : number of ngrams for which (ngram) representation is calculated 34 | :return: result (list of dict) 35 | ''' 36 | return [precook(ref, n) for ref in refs] 37 | 38 | def cook_test(test, n=4): 39 | '''Takes a test sentence and returns an object that 40 | encapsulates everything that BLEU needs to know about it. 41 | :param test: list of string : hypothesis sentence for some image 42 | :param n: int : number of ngrams for which (ngram) representation is calculated 43 | :return: result (dict) 44 | ''' 45 | return precook(test, n, True) 46 | 47 | class CiderScorer(object): 48 | """CIDEr scorer. 
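    Each candidate is scored against its references by the cosine similarity of
    TF-IDF weighted n-gram vectors (n = 1..4 by default); the per-n scores are
    averaged, divided by the number of references, and scaled by 10
    (see compute_cider below).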
49 | """ 50 | 51 | def copy(self): 52 | ''' copy the refs.''' 53 | new = CiderScorer(n=self.n) 54 | new.ctest = copy.copy(self.ctest) 55 | new.crefs = copy.copy(self.crefs) 56 | return new 57 | 58 | def __init__(self, test=None, refs=None, n=4, sigma=6.0): 59 | ''' singular instance ''' 60 | self.n = n 61 | self.sigma = sigma 62 | self.crefs = [] 63 | self.ctest = [] 64 | self.document_frequency = defaultdict(float) 65 | self.cook_append(test, refs) 66 | self.ref_len = None 67 | 68 | def cook_append(self, test, refs): 69 | '''called by constructor and __iadd__ to avoid creating new instances.''' 70 | 71 | if refs is not None: 72 | self.crefs.append(cook_refs(refs)) 73 | if test is not None: 74 | self.ctest.append(cook_test(test)) ## N.B.: -1 75 | else: 76 | self.ctest.append(None) # lens of crefs and ctest have to match 77 | 78 | def size(self): 79 | assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest)) 80 | return len(self.crefs) 81 | 82 | def __iadd__(self, other): 83 | '''add an instance (e.g., from another sentence).''' 84 | 85 | if type(other) is tuple: 86 | ## avoid creating new CiderScorer instances 87 | self.cook_append(other[0], other[1]) 88 | else: 89 | self.ctest.extend(other.ctest) 90 | self.crefs.extend(other.crefs) 91 | 92 | return self 93 | def compute_doc_freq(self): 94 | ''' 95 | Compute term frequency for reference data. 96 | This will be used to compute idf (inverse document frequency later) 97 | The term frequency is stored in the object 98 | :return: None 99 | ''' 100 | for refs in self.crefs: 101 | # refs, k ref captions of one image 102 | for ngram in set([ngram for ref in refs for (ngram,count) in ref.iteritems()]): 103 | self.document_frequency[ngram] += 1 104 | # maxcounts[ngram] = max(maxcounts.get(ngram,0), count) 105 | 106 | def compute_cider(self): 107 | def counts2vec(cnts): 108 | """ 109 | Function maps counts of ngram to vector of tfidf weights. 110 | The function returns vec, an array of dictionary that store mapping of n-gram and tf-idf weights. 111 | The n-th entry of array denotes length of n-grams. 112 | :param cnts: 113 | :return: vec (array of dict), norm (array of float), length (int) 114 | """ 115 | vec = [defaultdict(float) for _ in range(self.n)] 116 | length = 0 117 | norm = [0.0 for _ in range(self.n)] 118 | for (ngram,term_freq) in cnts.iteritems(): 119 | # give word count 1 if it doesn't appear in reference corpus 120 | df = np.log(max(1.0, self.document_frequency[ngram])) 121 | # ngram index 122 | n = len(ngram)-1 123 | # tf (term_freq) * idf (precomputed idf) for n-grams 124 | vec[n][ngram] = float(term_freq)*(self.ref_len - df) 125 | # compute norm for the vector. the norm will be used for computing similarity 126 | norm[n] += pow(vec[n][ngram], 2) 127 | 128 | if n == 1: 129 | length += term_freq 130 | norm = [np.sqrt(n) for n in norm] 131 | return vec, norm, length 132 | 133 | def sim(vec_hyp, vec_ref, norm_hyp, norm_ref, length_hyp, length_ref): 134 | ''' 135 | Compute the cosine similarity of two vectors. 
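            For each n, the score is the clipped dot product
            sum(min(vec_hyp[n][ngram], vec_ref[n][ngram]) * vec_ref[n][ngram]),
            normalized by norm_hyp[n] * norm_ref[n] and damped by the length
            penalty exp(-delta**2 / (2 * sigma**2)), delta = length_hyp - length_ref.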
136 | :param vec_hyp: array of dictionary for vector corresponding to hypothesis 137 | :param vec_ref: array of dictionary for vector corresponding to reference 138 | :param norm_hyp: array of float for vector corresponding to hypothesis 139 | :param norm_ref: array of float for vector corresponding to reference 140 | :param length_hyp: int containing length of hypothesis 141 | :param length_ref: int containing length of reference 142 | :return: array of score for each n-grams cosine similarity 143 | ''' 144 | delta = float(length_hyp - length_ref) 145 | # measure consine similarity 146 | val = np.array([0.0 for _ in range(self.n)]) 147 | for n in range(self.n): 148 | # ngram 149 | for (ngram,count) in vec_hyp[n].iteritems(): 150 | # vrama91 : added clipping 151 | val[n] += min(vec_hyp[n][ngram], vec_ref[n][ngram]) * vec_ref[n][ngram] 152 | 153 | if (norm_hyp[n] != 0) and (norm_ref[n] != 0): 154 | val[n] /= (norm_hyp[n]*norm_ref[n]) 155 | 156 | assert(not math.isnan(val[n])) 157 | # vrama91: added a length based gaussian penalty 158 | val[n] *= np.e**(-(delta**2)/(2*self.sigma**2)) 159 | return val 160 | 161 | # compute log reference length 162 | self.ref_len = np.log(float(len(self.crefs))) 163 | 164 | scores = [] 165 | for test, refs in zip(self.ctest, self.crefs): 166 | # compute vector for test captions 167 | vec, norm, length = counts2vec(test) 168 | # compute vector for ref captions 169 | score = np.array([0.0 for _ in range(self.n)]) 170 | for ref in refs: 171 | vec_ref, norm_ref, length_ref = counts2vec(ref) 172 | score += sim(vec, vec_ref, norm, norm_ref, length, length_ref) 173 | # change by vrama91 - mean of ngram scores, instead of sum 174 | score_avg = np.mean(score) 175 | # divide by number of references 176 | score_avg /= len(refs) 177 | # multiply score by 10 178 | score_avg *= 10.0 179 | # append score of an image to the score list 180 | scores.append(score_avg) 181 | return scores 182 | 183 | def compute_score(self, option=None, verbose=0): 184 | # compute idf 185 | self.compute_doc_freq() 186 | # assert to check document frequency 187 | assert(len(self.ctest) >= max(self.document_frequency.values())) 188 | # compute cider score 189 | score = self.compute_cider() 190 | # debug 191 | # print score 192 | return np.mean(np.array(score)), np.array(score) -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/cider/cider_scorer.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smallflyingpig/pytorch_video_caption/c725e0a1662c6c606f3c25b0e46b7cf576fb86e6/s2vt/coco_eval/pycocoevalcap/cider/cider_scorer.pyc -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/eval.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | from tokenizer.ptbtokenizer import PTBTokenizer 3 | from bleu.bleu import Bleu 4 | from meteor.meteor import Meteor 5 | from rouge.rouge import Rouge 6 | from cider.cider import Cider 7 | 8 | class COCOEvalCap: 9 | def __init__(self, coco, cocoRes): 10 | self.evalImgs = [] 11 | self.eval = {} 12 | self.imgToEval = {} 13 | self.coco = coco 14 | self.cocoRes = cocoRes 15 | self.params = {'image_id': coco.getImgIds()} 16 | 17 | def evaluate(self): 18 | imgIds = self.params['image_id'] 19 | # imgIds = self.coco.getImgIds() 20 | gts = {} 21 | res = {} 22 | for imgId in imgIds: 23 | gts[imgId] = self.coco.imgToAnns[imgId] 24 | 
res[imgId] = self.cocoRes.imgToAnns[imgId] 25 | 26 | # ================================================= 27 | # Set up scorers 28 | # ================================================= 29 | print 'tokenization...' 30 | tokenizer = PTBTokenizer() 31 | gts = tokenizer.tokenize(gts) 32 | res = tokenizer.tokenize(res) 33 | 34 | # ================================================= 35 | # Set up scorers 36 | # ================================================= 37 | print 'setting up scorers...' 38 | scorers = [ 39 | (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]), 40 | (Meteor(),"METEOR"), 41 | (Rouge(), "ROUGE_L"), 42 | (Cider(), "CIDEr") 43 | ] 44 | 45 | # ================================================= 46 | # Compute scores 47 | # ================================================= 48 | for scorer, method in scorers: 49 | print 'computing %s score...'%(scorer.method()) 50 | score, scores = scorer.compute_score(gts, res) 51 | if type(method) == list: 52 | for sc, scs, m in zip(score, scores, method): 53 | self.setEval(sc, m) 54 | self.setImgToEvalImgs(scs, gts.keys(), m) 55 | print "%s: %0.3f"%(m, sc) 56 | else: 57 | self.setEval(score, method) 58 | self.setImgToEvalImgs(scores, gts.keys(), method) 59 | print "%s: %0.3f"%(method, score) 60 | self.setEvalImgs() 61 | 62 | def setEval(self, score, method): 63 | self.eval[method] = score 64 | 65 | def setImgToEvalImgs(self, scores, imgIds, method): 66 | for imgId, score in zip(imgIds, scores): 67 | if not imgId in self.imgToEval: 68 | self.imgToEval[imgId] = {} 69 | self.imgToEval[imgId]["image_id"] = imgId 70 | self.imgToEval[imgId][method] = score 71 | 72 | def setEvalImgs(self): 73 | self.evalImgs = [eval for imgId, eval in self.imgToEval.items()] -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/eval.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smallflyingpig/pytorch_video_caption/c725e0a1662c6c606f3c25b0e46b7cf576fb86e6/s2vt/coco_eval/pycocoevalcap/eval.pyc -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/meteor/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/meteor/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smallflyingpig/pytorch_video_caption/c725e0a1662c6c606f3c25b0e46b7cf576fb86e6/s2vt/coco_eval/pycocoevalcap/meteor/__init__.pyc -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/meteor/data/paraphrase-en.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smallflyingpig/pytorch_video_caption/c725e0a1662c6c606f3c25b0e46b7cf576fb86e6/s2vt/coco_eval/pycocoevalcap/meteor/data/paraphrase-en.gz -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/meteor/meteor-1.5.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smallflyingpig/pytorch_video_caption/c725e0a1662c6c606f3c25b0e46b7cf576fb86e6/s2vt/coco_eval/pycocoevalcap/meteor/meteor-1.5.jar -------------------------------------------------------------------------------- 
/s2vt/coco_eval/pycocoevalcap/meteor/meteor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Python wrapper for METEOR implementation, by Xinlei Chen 4 | # Acknowledge Michael Denkowski for the generous discussion and help 5 | 6 | import os 7 | import sys 8 | import subprocess 9 | import threading 10 | 11 | # Assumes meteor-1.5.jar is in the same directory as meteor.py. Change as needed. 12 | METEOR_JAR = 'meteor-1.5.jar' 13 | # print METEOR_JAR 14 | 15 | class Meteor: 16 | 17 | def __init__(self): 18 | self.meteor_cmd = ['java', '-jar', '-Xmx2G', METEOR_JAR, \ 19 | '-', '-', '-stdio', '-l', 'en', '-norm'] 20 | self.meteor_p = subprocess.Popen(self.meteor_cmd, \ 21 | cwd=os.path.dirname(os.path.abspath(__file__)), \ 22 | stdin=subprocess.PIPE, \ 23 | stdout=subprocess.PIPE, \ 24 | stderr=subprocess.PIPE) 25 | # Used to guarantee thread safety 26 | self.lock = threading.Lock() 27 | 28 | def compute_score(self, gts, res): 29 | assert(gts.keys() == res.keys()) 30 | imgIds = gts.keys() 31 | scores = [] 32 | 33 | eval_line = 'EVAL' 34 | self.lock.acquire() 35 | for i in imgIds: 36 | assert(len(res[i]) == 1) 37 | stat = self._stat(res[i][0], gts[i]) 38 | eval_line += ' ||| {}'.format(stat) 39 | 40 | self.meteor_p.stdin.write('{}\n'.format(eval_line)) 41 | for i in range(0,len(imgIds)): 42 | scores.append(float(self.meteor_p.stdout.readline().strip())) 43 | score = float(self.meteor_p.stdout.readline().strip()) 44 | self.lock.release() 45 | 46 | return score, scores 47 | 48 | def method(self): 49 | return "METEOR" 50 | 51 | def _stat(self, hypothesis_str, reference_list): 52 | # SCORE ||| reference 1 words ||| reference n words ||| hypothesis words 53 | hypothesis_str = hypothesis_str.replace('|||','').replace(' ',' ') 54 | score_line = ' ||| '.join(('SCORE', ' ||| '.join(reference_list), hypothesis_str)) 55 | self.meteor_p.stdin.write('{}\n'.format(score_line)) 56 | return self.meteor_p.stdout.readline().strip() 57 | 58 | def _score(self, hypothesis_str, reference_list): 59 | self.lock.acquire() 60 | # SCORE ||| reference 1 words ||| reference n words ||| hypothesis words 61 | hypothesis_str = hypothesis_str.replace('|||','').replace(' ',' ') 62 | score_line = ' ||| '.join(('SCORE', ' ||| '.join(reference_list), hypothesis_str)) 63 | self.meteor_p.stdin.write('{}\n'.format(score_line)) 64 | stats = self.meteor_p.stdout.readline().strip() 65 | eval_line = 'EVAL ||| {}'.format(stats) 66 | # EVAL ||| stats 67 | self.meteor_p.stdin.write('{}\n'.format(eval_line)) 68 | score = float(self.meteor_p.stdout.readline().strip()) 69 | # bug fix: there are two values returned by the jar file, one average, and one all, so do it twice 70 | # thanks for Andrej for pointing this out 71 | score = float(self.meteor_p.stdout.readline().strip()) 72 | self.lock.release() 73 | return score 74 | 75 | def __del__(self): 76 | self.lock.acquire() 77 | self.meteor_p.stdin.close() 78 | self.meteor_p.kill() 79 | self.meteor_p.wait() 80 | self.lock.release() 81 | -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/meteor/meteor.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smallflyingpig/pytorch_video_caption/c725e0a1662c6c606f3c25b0e46b7cf576fb86e6/s2vt/coco_eval/pycocoevalcap/meteor/meteor.pyc -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/rouge/__init__.py: 
-------------------------------------------------------------------------------- 1 | __author__ = 'vrama91' 2 | -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/rouge/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smallflyingpig/pytorch_video_caption/c725e0a1662c6c606f3c25b0e46b7cf576fb86e6/s2vt/coco_eval/pycocoevalcap/rouge/__init__.pyc -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/rouge/rouge.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # File Name : rouge.py 4 | # 5 | # Description : Computes ROUGE-L metric as described by Lin and Hovey (2004) 6 | # 7 | # Creation Date : 2015-01-07 06:03 8 | # Author : Ramakrishna Vedantam 9 | 10 | import numpy as np 11 | import pdb 12 | 13 | def my_lcs(string, sub): 14 | """ 15 | Calculates longest common subsequence for a pair of tokenized strings 16 | :param string : list of str : tokens from a string split using whitespace 17 | :param sub : list of str : shorter string, also split using whitespace 18 | :returns: length (list of int): length of the longest common subsequence between the two strings 19 | 20 | Note: my_lcs only gives length of the longest common subsequence, not the actual LCS 21 | """ 22 | if(len(string)< len(sub)): 23 | sub, string = string, sub 24 | 25 | lengths = [[0 for i in range(0,len(sub)+1)] for j in range(0,len(string)+1)] 26 | 27 | for j in range(1,len(sub)+1): 28 | for i in range(1,len(string)+1): 29 | if(string[i-1] == sub[j-1]): 30 | lengths[i][j] = lengths[i-1][j-1] + 1 31 | else: 32 | lengths[i][j] = max(lengths[i-1][j] , lengths[i][j-1]) 33 | 34 | return lengths[len(string)][len(sub)] 35 | 36 | class Rouge(): 37 | ''' 38 | Class for computing ROUGE-L score for a set of candidate sentences for the MS COCO test set 39 | 40 | ''' 41 | def __init__(self): 42 | # vrama91: updated the value below based on discussion with Hovey 43 | self.beta = 1.2 44 | 45 | def calc_score(self, candidate, refs): 46 | """ 47 | Compute ROUGE-L score given one candidate and references for an image 48 | :param candidate: str : candidate sentence to be evaluated 49 | :param refs: list of str : COCO reference sentences for the particular image to be evaluated 50 | :returns score: int (ROUGE-L score for the candidate evaluated against references) 51 | """ 52 | assert(len(candidate)==1) 53 | assert(len(refs)>0) 54 | prec = [] 55 | rec = [] 56 | 57 | # split into tokens 58 | token_c = candidate[0].split(" ") 59 | 60 | for reference in refs: 61 | # split into tokens 62 | token_r = reference.split(" ") 63 | # compute the longest common subsequence 64 | lcs = my_lcs(token_r, token_c) 65 | prec.append(lcs/float(len(token_c))) 66 | rec.append(lcs/float(len(token_r))) 67 | 68 | prec_max = max(prec) 69 | rec_max = max(rec) 70 | 71 | if(prec_max!=0 and rec_max !=0): 72 | score = ((1 + self.beta**2)*prec_max*rec_max)/float(rec_max + self.beta**2*prec_max) 73 | else: 74 | score = 0.0 75 | return score 76 | 77 | def compute_score(self, gts, res): 78 | """ 79 | Computes Rouge-L score given a set of reference and candidate sentences for the dataset 80 | Invoked by evaluate_captions.py 81 | :param hypo_for_image: dict : candidate / test sentences with "image name" key and "tokenized sentences" as values 82 | :param ref_for_image: dict : reference MS-COCO sentences with "image name" 
key and "tokenized sentences" as values 83 | :returns: average_score: float (mean ROUGE-L score computed by averaging scores for all the images) 84 | """ 85 | assert(gts.keys() == res.keys()) 86 | imgIds = gts.keys() 87 | 88 | score = [] 89 | for id in imgIds: 90 | hypo = res[id] 91 | ref = gts[id] 92 | 93 | score.append(self.calc_score(hypo, ref)) 94 | 95 | # Sanity check. 96 | assert(type(hypo) is list) 97 | assert(len(hypo) == 1) 98 | assert(type(ref) is list) 99 | assert(len(ref) > 0) 100 | 101 | average_score = np.mean(np.array(score)) 102 | return average_score, np.array(score) 103 | 104 | def method(self): 105 | return "Rouge" 106 | -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/rouge/rouge.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smallflyingpig/pytorch_video_caption/c725e0a1662c6c606f3c25b0e46b7cf576fb86e6/s2vt/coco_eval/pycocoevalcap/rouge/rouge.pyc -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'hfang' 2 | -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/tokenizer/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smallflyingpig/pytorch_video_caption/c725e0a1662c6c606f3c25b0e46b7cf576fb86e6/s2vt/coco_eval/pycocoevalcap/tokenizer/__init__.pyc -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/tokenizer/ptbtokenizer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # File Name : ptbtokenizer.py 4 | # 5 | # Description : Do the PTB Tokenization and remove punctuations. 
6 | # 7 | # Creation Date : 29-12-2014 8 | # Last Modified : Thu Mar 19 09:53:35 2015 9 | # Authors : Hao Fang and Tsung-Yi Lin 10 | 11 | import os 12 | import sys 13 | import subprocess 14 | import tempfile 15 | import itertools 16 | 17 | # path to the stanford corenlp jar 18 | STANFORD_CORENLP_3_4_1_JAR = 'stanford-corenlp-3.4.1.jar' 19 | 20 | # punctuations to be removed from the sentences 21 | PUNCTUATIONS = ["''", "'", "``", "`", "-LRB-", "-RRB-", "-LCB-", "-RCB-", \ 22 | ".", "?", "!", ",", ":", "-", "--", "...", ";"] 23 | 24 | class PTBTokenizer: 25 | """Python wrapper of Stanford PTBTokenizer""" 26 | 27 | def tokenize(self, captions_for_image): 28 | cmd = ['java', '-cp', STANFORD_CORENLP_3_4_1_JAR, \ 29 | 'edu.stanford.nlp.process.PTBTokenizer', \ 30 | '-preserveLines', '-lowerCase'] 31 | 32 | # ====================================================== 33 | # prepare data for PTB Tokenizer 34 | # ====================================================== 35 | final_tokenized_captions_for_image = {} 36 | image_id = [k for k, v in captions_for_image.items() for _ in range(len(v))] 37 | sentences = '\n'.join([c['caption'].replace('\n', ' ') for k, v in captions_for_image.items() for c in v]) 38 | 39 | # ====================================================== 40 | # save sentences to temporary file 41 | # ====================================================== 42 | path_to_jar_dirname=os.path.dirname(os.path.abspath(__file__)) 43 | tmp_file = tempfile.NamedTemporaryFile(delete=False, dir=path_to_jar_dirname) 44 | tmp_file.write(sentences) 45 | tmp_file.close() 46 | 47 | # ====================================================== 48 | # tokenize sentence 49 | # ====================================================== 50 | cmd.append(os.path.basename(tmp_file.name)) 51 | p_tokenizer = subprocess.Popen(cmd, cwd=path_to_jar_dirname, \ 52 | stdout=subprocess.PIPE) 53 | token_lines = p_tokenizer.communicate(input=sentences.rstrip())[0] 54 | lines = token_lines.split('\n') 55 | # remove temp file 56 | os.remove(tmp_file.name) 57 | 58 | # ====================================================== 59 | # create dictionary for tokenized captions 60 | # ====================================================== 61 | for k, line in zip(image_id, lines): 62 | if not k in final_tokenized_captions_for_image: 63 | final_tokenized_captions_for_image[k] = [] 64 | tokenized_caption = ' '.join([w for w in line.rstrip().split(' ') \ 65 | if w not in PUNCTUATIONS]) 66 | final_tokenized_captions_for_image[k].append(tokenized_caption) 67 | 68 | return final_tokenized_captions_for_image 69 | -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/tokenizer/ptbtokenizer.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smallflyingpig/pytorch_video_caption/c725e0a1662c6c606f3c25b0e46b7cf576fb86e6/s2vt/coco_eval/pycocoevalcap/tokenizer/ptbtokenizer.pyc -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocoevalcap/tokenizer/stanford-corenlp-3.4.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smallflyingpig/pytorch_video_caption/c725e0a1662c6c606f3c25b0e46b7cf576fb86e6/s2vt/coco_eval/pycocoevalcap/tokenizer/stanford-corenlp-3.4.1.jar -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocotools/__init__.py: 
-------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocotools/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smallflyingpig/pytorch_video_caption/c725e0a1662c6c606f3c25b0e46b7cf576fb86e6/s2vt/coco_eval/pycocotools/__init__.pyc -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocotools/coco.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | __version__ = '1.0.1' 3 | # Interface for accessing the Microsoft COCO dataset. 4 | 5 | # Microsoft COCO is a large image dataset designed for object detection, 6 | # segmentation, and caption generation. pycocotools is a Python API that 7 | # assists in loading, parsing and visualizing the annotations in COCO. 8 | # Please visit http://mscoco.org/ for more information on COCO, including 9 | # for the data, paper, and tutorials. The exact format of the annotations 10 | # is also described on the COCO website. For example usage of the pycocotools 11 | # please see pycocotools_demo.ipynb. In addition to this API, please download both 12 | # the COCO images and annotations in order to run the demo. 13 | 14 | # An alternative to using the API is to load the annotations directly 15 | # into Python dictionary 16 | # Using the API provides additional utility functions. Note that this API 17 | # supports both *instance* and *caption* annotations. In the case of 18 | # captions not all functions are defined (e.g. categories are undefined). 19 | 20 | # The following API functions are defined: 21 | # COCO - COCO api class that loads COCO annotation file and prepare data structures. 22 | # decodeMask - Decode binary mask M encoded via run-length encoding. 23 | # encodeMask - Encode binary mask M using run-length encoding. 24 | # getAnnIds - Get ann ids that satisfy given filter conditions. 25 | # getCatIds - Get cat ids that satisfy given filter conditions. 26 | # getImgIds - Get img ids that satisfy given filter conditions. 27 | # loadAnns - Load anns with the specified ids. 28 | # loadCats - Load cats with the specified ids. 29 | # loadImgs - Load imgs with the specified ids. 30 | # segToMask - Convert polygon segmentation to binary mask. 31 | # showAnns - Display the specified annotations. 32 | # loadRes - Load result file and create result api object. 33 | # Throughout the API "ann"=annotation, "cat"=category, and "img"=image. 34 | # Help on each functions can be accessed by: "help COCO>function". 35 | 36 | # See also COCO>decodeMask, 37 | # COCO>encodeMask, COCO>getAnnIds, COCO>getCatIds, 38 | # COCO>getImgIds, COCO>loadAnns, COCO>loadCats, 39 | # COCO>loadImgs, COCO>segToMask, COCO>showAnns 40 | 41 | # Microsoft COCO Toolbox. Version 1.0 42 | # Data, paper, and tutorials available at: http://mscoco.org/ 43 | # Code written by Piotr Dollar and Tsung-Yi Lin, 2014. 
44 | # Licensed under the Simplified BSD License [see bsd.txt] 45 | 46 | import json 47 | import datetime 48 | import matplotlib.pyplot as plt 49 | from matplotlib.collections import PatchCollection 50 | from matplotlib.patches import Polygon 51 | import numpy as np 52 | from skimage.draw import polygon 53 | import copy 54 | 55 | class COCO: 56 | def __init__(self, annotation_file=None): 57 | """ 58 | Constructor of Microsoft COCO helper class for reading and visualizing annotations. 59 | :param annotation_file (str): location of annotation file 60 | :param image_folder (str): location to the folder that hosts images. 61 | :return: 62 | """ 63 | # load dataset 64 | self.dataset = {} 65 | self.anns = [] 66 | self.imgToAnns = {} 67 | self.catToImgs = {} 68 | self.imgs = [] 69 | self.cats = [] 70 | if not annotation_file == None: 71 | print 'loading annotations into memory...' 72 | time_t = datetime.datetime.utcnow() 73 | dataset = json.load(open(annotation_file, 'r')) 74 | print datetime.datetime.utcnow() - time_t 75 | self.dataset = dataset 76 | self.createIndex() 77 | 78 | def createIndex(self): 79 | # create index 80 | print 'creating index...' 81 | imgToAnns = {ann['image_id']: [] for ann in self.dataset['annotations']} 82 | anns = {ann['id']: [] for ann in self.dataset['annotations']} 83 | for ann in self.dataset['annotations']: 84 | imgToAnns[ann['image_id']] += [ann] 85 | anns[ann['id']] = ann 86 | 87 | imgs = {im['id']: {} for im in self.dataset['images']} 88 | for img in self.dataset['images']: 89 | imgs[img['id']] = img 90 | 91 | cats = [] 92 | catToImgs = [] 93 | if self.dataset['type'] == 'instances': 94 | cats = {cat['id']: [] for cat in self.dataset['categories']} 95 | for cat in self.dataset['categories']: 96 | cats[cat['id']] = cat 97 | catToImgs = {cat['id']: [] for cat in self.dataset['categories']} 98 | for ann in self.dataset['annotations']: 99 | catToImgs[ann['category_id']] += [ann['image_id']] 100 | 101 | print 'index created!' 102 | 103 | # create class members 104 | self.anns = anns 105 | self.imgToAnns = imgToAnns 106 | self.catToImgs = catToImgs 107 | self.imgs = imgs 108 | self.cats = cats 109 | 110 | def info(self): 111 | """ 112 | Print information about the annotation file. 113 | :return: 114 | """ 115 | for key, value in self.datset['info'].items(): 116 | print '%s: %s'%(key, value) 117 | 118 | def getAnnIds(self, imgIds=[], catIds=[], areaRng=[], iscrowd=None): 119 | """ 120 | Get ann ids that satisfy given filter conditions. default skips that filter 121 | :param imgIds (int array) : get anns for given imgs 122 | catIds (int array) : get anns for given cats 123 | areaRng (float array) : get anns for given area range (e.g. 
[0 inf]) 124 | iscrowd (boolean) : get anns for given crowd label (False or True) 125 | :return: ids (int array) : integer array of ann ids 126 | """ 127 | imgIds = imgIds if type(imgIds) == list else [imgIds] 128 | catIds = catIds if type(catIds) == list else [catIds] 129 | 130 | if len(imgIds) == len(catIds) == len(areaRng) == 0: 131 | anns = self.dataset['annotations'] 132 | else: 133 | if not len(imgIds) == 0: 134 | anns = sum([self.imgToAnns[imgId] for imgId in imgIds if imgId in self.imgToAnns],[]) 135 | else: 136 | anns = self.dataset['annotations'] 137 | anns = anns if len(catIds) == 0 else [ann for ann in anns if ann['category_id'] in catIds] 138 | anns = anns if len(areaRng) == 0 else [ann for ann in anns if ann['area'] > areaRng[0] and ann['area'] < areaRng[1]] 139 | if self.dataset['type'] == 'instances': 140 | if not iscrowd == None: 141 | ids = [ann['id'] for ann in anns if ann['iscrowd'] == iscrowd] 142 | else: 143 | ids = [ann['id'] for ann in anns] 144 | else: 145 | ids = [ann['id'] for ann in anns] 146 | return ids 147 | 148 | def getCatIds(self, catNms=[], supNms=[], catIds=[]): 149 | """ 150 | filtering parameters. default skips that filter. 151 | :param catNms (str array) : get cats for given cat names 152 | :param supNms (str array) : get cats for given supercategory names 153 | :param catIds (int array) : get cats for given cat ids 154 | :return: ids (int array) : integer array of cat ids 155 | """ 156 | catNms = catNms if type(catNms) == list else [catNms] 157 | supNms = supNms if type(supNms) == list else [supNms] 158 | catIds = catIds if type(catIds) == list else [catIds] 159 | 160 | if len(catNms) == len(supNms) == len(catIds) == 0: 161 | cats = self.dataset['categories'] 162 | else: 163 | cats = self.dataset['categories'] 164 | cats = cats if len(catNms) == 0 else [cat for cat in cats if cat['name'] in catNms] 165 | cats = cats if len(supNms) == 0 else [cat for cat in cats if cat['supercategory'] in supNms] 166 | cats = cats if len(catIds) == 0 else [cat for cat in cats if cat['id'] in catIds] 167 | ids = [cat['id'] for cat in cats] 168 | return ids 169 | 170 | def getImgIds(self, imgIds=[], catIds=[]): 171 | ''' 172 | Get img ids that satisfy given filter conditions. 173 | :param imgIds (int array) : get imgs for given ids 174 | :param catIds (int array) : get imgs with all given cats 175 | :return: ids (int array) : integer array of img ids 176 | ''' 177 | imgIds = imgIds if type(imgIds) == list else [imgIds] 178 | catIds = catIds if type(catIds) == list else [catIds] 179 | 180 | if len(imgIds) == len(catIds) == 0: 181 | ids = self.imgs.keys() 182 | else: 183 | ids = set(imgIds) 184 | for catId in catIds: 185 | if len(ids) == 0: 186 | ids = set(self.catToImgs[catId]) 187 | else: 188 | ids &= set(self.catToImgs[catId]) 189 | return list(ids) 190 | 191 | def loadAnns(self, ids=[]): 192 | """ 193 | Load anns with the specified ids. 194 | :param ids (int array) : integer ids specifying anns 195 | :return: anns (object array) : loaded ann objects 196 | """ 197 | if type(ids) == list: 198 | return [self.anns[id] for id in ids] 199 | elif type(ids) == int: 200 | return [self.anns[ids]] 201 | 202 | def loadCats(self, ids=[]): 203 | """ 204 | Load cats with the specified ids. 
205 | :param ids (int array) : integer ids specifying cats 206 | :return: cats (object array) : loaded cat objects 207 | """ 208 | if type(ids) == list: 209 | return [self.cats[id] for id in ids] 210 | elif type(ids) == int: 211 | return [self.cats[ids]] 212 | 213 | def loadImgs(self, ids=[]): 214 | """ 215 | Load anns with the specified ids. 216 | :param ids (int array) : integer ids specifying img 217 | :return: imgs (object array) : loaded img objects 218 | """ 219 | if type(ids) == list: 220 | return [self.imgs[id] for id in ids] 221 | elif type(ids) == int: 222 | return [self.imgs[ids]] 223 | 224 | def showAnns(self, anns): 225 | """ 226 | Display the specified annotations. 227 | :param anns (array of object): annotations to display 228 | :return: None 229 | """ 230 | if len(anns) == 0: 231 | return 0 232 | if self.dataset['type'] == 'instances': 233 | ax = plt.gca() 234 | polygons = [] 235 | color = [] 236 | for ann in anns: 237 | c = np.random.random((1, 3)).tolist()[0] 238 | if type(ann['segmentation']) == list: 239 | # polygon 240 | for seg in ann['segmentation']: 241 | poly = np.array(seg).reshape((len(seg)/2, 2)) 242 | polygons.append(Polygon(poly, True,alpha=0.4)) 243 | color.append(c) 244 | else: 245 | # mask 246 | mask = COCO.decodeMask(ann['segmentation']) 247 | img = np.ones( (mask.shape[0], mask.shape[1], 3) ) 248 | if ann['iscrowd'] == 1: 249 | color_mask = np.array([2.0,166.0,101.0])/255 250 | if ann['iscrowd'] == 0: 251 | color_mask = np.random.random((1, 3)).tolist()[0] 252 | for i in range(3): 253 | img[:,:,i] = color_mask[i] 254 | ax.imshow(np.dstack( (img, mask*0.5) )) 255 | p = PatchCollection(polygons, facecolors=color, edgecolors=(0,0,0,1), linewidths=3, alpha=0.4) 256 | ax.add_collection(p) 257 | if self.dataset['type'] == 'captions': 258 | for ann in anns: 259 | print ann['caption'] 260 | 261 | def loadRes(self, resFile): 262 | """ 263 | Load result file and return a result api object. 264 | :param resFile (str) : file name of result file 265 | :return: res (obj) : result api object 266 | """ 267 | res = COCO() 268 | res.dataset['images'] = [img for img in self.dataset['images']] 269 | res.dataset['info'] = copy.deepcopy(self.dataset['info']) 270 | res.dataset['type'] = copy.deepcopy(self.dataset['type']) 271 | res.dataset['licenses'] = copy.deepcopy(self.dataset['licenses']) 272 | 273 | print 'Loading and preparing results... 
' 274 | time_t = datetime.datetime.utcnow() 275 | anns = json.load(open(resFile)) 276 | assert type(anns) == list, 'results in not an array of objects' 277 | annsImgIds = [ann['image_id'] for ann in anns] 278 | assert set(annsImgIds) == (set(annsImgIds) & set(self.getImgIds())), \ 279 | 'Results do not correspond to current coco set' 280 | if 'caption' in anns[0]: 281 | imgIds = set([img['id'] for img in res.dataset['images']]) & set([ann['image_id'] for ann in anns]) 282 | res.dataset['images'] = [img for img in res.dataset['images'] if img['id'] in imgIds] 283 | for id, ann in enumerate(anns): 284 | ann['id'] = id 285 | elif 'bbox' in anns[0] and not anns[0]['bbox'] == []: 286 | res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) 287 | for id, ann in enumerate(anns): 288 | bb = ann['bbox'] 289 | x1, x2, y1, y2 = [bb[0], bb[0]+bb[2], bb[1], bb[1]+bb[3]] 290 | ann['segmentation'] = [[x1, y1, x1, y2, x2, y2, x2, y1]] 291 | ann['area'] = bb[2]*bb[3] 292 | ann['id'] = id 293 | ann['iscrowd'] = 0 294 | elif 'segmentation' in anns[0]: 295 | res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) 296 | for id, ann in enumerate(anns): 297 | ann['area']=sum(ann['segmentation']['counts'][2:-1:2]) 298 | ann['bbox'] = [] 299 | ann['id'] = id 300 | ann['iscrowd'] = 0 301 | print 'DONE (t=%0.2fs)'%((datetime.datetime.utcnow() - time_t).total_seconds()) 302 | 303 | res.dataset['annotations'] = anns 304 | res.createIndex() 305 | return res 306 | 307 | 308 | @staticmethod 309 | def decodeMask(R): 310 | """ 311 | Decode binary mask M encoded via run-length encoding. 312 | :param R (object RLE) : run-length encoding of binary mask 313 | :return: M (bool 2D array) : decoded binary mask 314 | """ 315 | N = len(R['counts']) 316 | M = np.zeros( (R['size'][0]*R['size'][1], )) 317 | n = 0 318 | val = 1 319 | for pos in range(N): 320 | val = not val 321 | for c in range(R['counts'][pos]): 322 | R['counts'][pos] 323 | M[n] = val 324 | n += 1 325 | return M.reshape((R['size']), order='F') 326 | 327 | @staticmethod 328 | def encodeMask(M): 329 | """ 330 | Encode binary mask M using run-length encoding. 331 | :param M (bool 2D array) : binary mask to encode 332 | :return: R (object RLE) : run-length encoding of binary mask 333 | """ 334 | [h, w] = M.shape 335 | M = M.flatten(order='F') 336 | N = len(M) 337 | counts_list = [] 338 | pos = 0 339 | # counts 340 | counts_list.append(1) 341 | diffs = np.logical_xor(M[0:N-1], M[1:N]) 342 | for diff in diffs: 343 | if diff: 344 | pos +=1 345 | counts_list.append(1) 346 | else: 347 | counts_list[pos] += 1 348 | # if array starts from 1. start with 0 counts for 0 349 | if M[0] == 1: 350 | counts_list = [0] + counts_list 351 | return {'size': [h, w], 352 | 'counts': counts_list , 353 | } 354 | 355 | @staticmethod 356 | def segToMask( S, h, w ): 357 | """ 358 | Convert polygon segmentation to binary mask. 
359 | :param S (float array) : polygon segmentation mask 360 | :param h (int) : target mask height 361 | :param w (int) : target mask width 362 | :return: M (bool 2D array) : binary mask 363 | """ 364 | M = np.zeros((h,w), dtype=np.bool) 365 | for s in S: 366 | N = len(s) 367 | rr, cc = polygon(np.array(s[1:N:2]), np.array(s[0:N:2])) # (y, x) 368 | M[rr, cc] = 1 369 | return M -------------------------------------------------------------------------------- /s2vt/coco_eval/pycocotools/coco.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smallflyingpig/pytorch_video_caption/c725e0a1662c6c606f3c25b0e46b7cf576fb86e6/s2vt/coco_eval/pycocotools/coco.pyc -------------------------------------------------------------------------------- /s2vt/data/readme.md: -------------------------------------------------------------------------------- 1 | put .csv to this folder 2 | -------------------------------------------------------------------------------- /s2vt/extract_RGB_feats.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -*- 2 | 3 | import cv2 4 | import os 5 | #import ipdb 6 | import numpy as np 7 | import pandas as pd 8 | import skimage 9 | from cnn_util import * 10 | 11 | 12 | def preprocess_frame(image, target_height=224, target_width=224): 13 | 14 | if len(image.shape) == 2: 15 | image = np.tile(image[:,:,None], 3) 16 | elif len(image.shape) == 4: 17 | image = image[:,:,:,0] 18 | 19 | image = skimage.img_as_float(image).astype(np.float32) 20 | height, width, rgb = image.shape 21 | if width == height: 22 | resized_image = cv2.resize(image, (target_height,target_width)) 23 | 24 | elif height < width: 25 | resized_image = cv2.resize(image, (int(width * float(target_height)/height), target_width)) 26 | cropping_length = int((resized_image.shape[1] - target_height) / 2) 27 | resized_image = resized_image[:,cropping_length:resized_image.shape[1] - cropping_length] 28 | 29 | else: 30 | resized_image = cv2.resize(image, (target_height, int(height * float(target_width) / width))) 31 | cropping_length = int((resized_image.shape[0] - target_width) / 2) 32 | resized_image = resized_image[cropping_length:resized_image.shape[0] - cropping_length,:] 33 | 34 | return cv2.resize(resized_image, (target_height, target_width)) 35 | 36 | def main(): 37 | num_frames = 80 38 | vgg_model = '/root/workspace/caffe/models/vgg_16/VGG_ILSVRC_16_layers.caffemodel' 39 | vgg_deploy = '/root/workspace/caffe/models/vgg_16/VGG_ILSVRC_16_layers_deploy.prototxt' 40 | video_path = '/root/Downloads/YouTubeClips/' 41 | video_save_path = './rgb_feats' 42 | videos = os.listdir(video_path) 43 | videos = filter(lambda x: x.endswith('avi'), videos) 44 | 45 | cnn = CNN(model=vgg_model, deploy=vgg_deploy, width=224, height=224) 46 | 47 | for idx, video in enumerate(videos): 48 | print idx, video 49 | 50 | if os.path.exists( os.path.join(video_save_path, video) ): 51 | print "Already processed ... 
" 52 | continue 53 | 54 | video_fullpath = os.path.join(video_path, video) 55 | try: 56 | cap = cv2.VideoCapture( video_fullpath ) 57 | except: 58 | pass 59 | 60 | frame_count = 0 61 | frame_list = [] 62 | 63 | while True: 64 | ret, frame = cap.read() 65 | if ret is False: 66 | break 67 | 68 | frame_list.append(frame) 69 | frame_count += 1 70 | 71 | frame_list = np.array(frame_list) 72 | 73 | if frame_count > 80: 74 | frame_indices = np.linspace(0, frame_count, num=num_frames, endpoint=False).astype(int) 75 | frame_list = frame_list[frame_indices] 76 | 77 | cropped_frame_list = np.array(map(lambda x: preprocess_frame(x), frame_list)) 78 | feats = cnn.get_features(cropped_frame_list) 79 | 80 | save_full_path = os.path.join(video_save_path, video + '.npy') 81 | np.save(save_full_path, feats) 82 | 83 | if __name__=="__main__": 84 | main() 85 | 86 | -------------------------------------------------------------------------------- /s2vt/model_temp/readme.md: -------------------------------------------------------------------------------- 1 | the snapshots in training process will be put here 2 | -------------------------------------------------------------------------------- /s2vt/readme.md: -------------------------------------------------------------------------------- 1 | ### environment 2 | python2.7 3 | pytorch 4 | 5 | ### prepare for train 6 | 1. download the video clips from [here](http://www.cs.utexas.edu/users/ml/clamp/videoDescription/) 7 | 8 | 2. download caffe and VGG model 9 | download caffe from [BVLC/caffe](https://github.com/BVLC/caffe) to "/root/workspace/" 10 | download [VGG16](http://www.robots.ox.ac.uk/~vgg/software/very_deep/caffe/VGG_ILSVRC_16_layers.caffemodel) model and its [prototxt](https://gist.githubusercontent.com/ksimonyan/211839e770f7b538e2d8/raw/0067c9b32f60362c74f4c445a080beed06b07eb3/VGG_ILSVRC_16_layers_deploy.prototxt) from [caffe/model zoo](https://github.com/BVLC/caffe/wiki/Model-Zoo#models-used-by-the-vgg-team-in-ilsvrc-2014) and put it into "/root/workspace/caffe/models/vgg_16" 11 | 12 | 3. extract feature from video clip 13 | ``` 14 | python extract_RGB_feats.py 15 | ``` 16 | then put these feature data into training data (put into this folder "./rgb_train_features") and test data (put into this folder "./rgb_test_features"). 17 | 18 | ### how to train/test 19 | ``` 20 | python s2vt.py train 21 | python s2vt.py test 22 | ``` 23 | and you will get a file named "test_result.txt" in the root folder of this project. 24 | 25 | ### how to evaluation 26 | 1. install java 27 | ``` 28 | conda install -c cyclus java-jdk 29 | ``` 30 | 2. download data 31 | MSVD data set:[Microsoft Research Video Description Corpus](https://www.microsoft.com/en-us/download/details.aspx?id=52422&from=http%3A%2F%2Fresearch.microsoft.com%2Fen-us%2Fdownloads%2F38cf15fd-b8df-477e-a4e4-a4680caa75af%2Fdefault.aspx) 32 | put it to ./data folder. 33 | 34 | 3. parse video csv 35 | ``` 36 | python ./coco_eval/parse_video_csv.py 37 | ``` 38 | 4. create reference and result .json file 39 | ``` 40 | python ./coco_eval/create_reference.py 41 | python ./coco_eval/create_result_json.py 42 | ``` 43 | 5. evalute the result 44 | ``` 45 | python ./coco_eval/eval.py 46 | ``` 47 | 48 | and you will get the evalution result. 
49 | 50 | ### my model 51 | a PyTorch model trained by me is available [here]() 52 | ### acknowledgement 53 | some of the code is adapted from chenxinpeng's [S2VT](https://github.com/chenxinpeng/S2VT) 54 | 55 | ### note 56 | the project is not working at the moment (2018-01-19) 57 | 58 | ### related paper 59 | [Sequence to Sequence - Video to Text](https://vsubhashini.github.io/s2vt.html) (ICCV 2015) 60 | 61 | ### contact 62 | for any questions, please contact me: jiguo.li@vipl.ict.ac.cn or jgli@pku.edu.cn 63 | 64 | -------------------------------------------------------------------------------- /s2vt/rgb_test_features/readme.md: -------------------------------------------------------------------------------- 1 | this folder is for test video features 2 | -------------------------------------------------------------------------------- /s2vt/rgb_train_features/readme.md: -------------------------------------------------------------------------------- 1 | this folder is for training video features 2 | -------------------------------------------------------------------------------- /s2vt/s2vt.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import torch 4 | from torch import nn 5 | from torch.autograd import Variable 6 | from torchvision import transforms 7 | import numpy as np 8 | import time 9 | #from model_RGB import get_video_train_data,get_video_test_data,preProBuildWordVocab 10 | import pandas as pd 11 | from keras.preprocessing import sequence 12 | import matplotlib.pyplot as plt 13 | import os 14 | import re 15 | import sys 16 | 17 | class DataSet_MSVD(): 18 | def __init__(self,csv_path,video_step,caption_step,image_dim,batch_size=50,video_train_data_path="./data/video_corpus.csv",\ 19 | video_train_feat_path="./rgb_train_features",video_test_data_path="./data/video_corpus.csv",\ 20 | video_test_feat_path="./rgb_test_features"): 21 | self.csv_path=csv_path; 22 | self.batch_size=batch_size; 23 | self.video_train_data_path = video_train_data_path; 24 | self.video_train_feat_path = video_train_feat_path; 25 | self.video_test_data_path = video_test_data_path; 26 | self.video_test_feat_path = video_test_feat_path; 27 | self.video_step = video_step; 28 | self.caption_step = caption_step; 29 | self.image_dim = image_dim; 30 | 31 | 32 | train_data = self.get_video_train_data(video_train_data_path, video_train_feat_path); 33 | train_captions = train_data['Description'].values; 34 | test_data = self.get_video_test_data(video_test_data_path, video_test_feat_path); 35 | test_captions = test_data['Description'].values; 36 | if not os.path.exists("./data/wordtoix.npy") or not os.path.exists('./data/ixtoword.npy') \ 37 | or not os.path.exists("./data/bias_init_vector.npy"): 38 | captions_list = list(train_captions) + list(test_captions); 39 | captions = np.asarray(captions_list, dtype=np.object); 40 | 41 | #strip punctuation and special characters from the captions (replace them with the empty string) 42 | captions = map(lambda x: x.replace('.', ''), captions); 43 | captions = map(lambda x: x.replace(',', ''), captions) 44 | captions = map(lambda x: x.replace('"', ''), captions) 45 | captions = map(lambda x: x.replace('\n', ''), captions) 46 | captions = map(lambda x: x.replace('?', ''), captions) 47 | captions = map(lambda x: x.replace('!', ''), captions) 48 | captions = map(lambda x: x.replace('\\', ''), captions) 49 | captions = map(lambda x: x.replace('/', ''), captions) 50 | 51 | #word to index, index to word 52 | wordtoix, ixtoword, bias_init_vector = self.preProBuildWordVocab(captions, word_count_threshold=0) 53 | 54 | 
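# Cache the vocabulary mappings and the bias-initialisation vector under ./data so
# that later runs (including the test phase) reuse exactly the same word indices
# instead of rebuilding them from the corpus.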
np.save("./data/wordtoix", wordtoix) 55 | np.save('./data/ixtoword', ixtoword) 56 | np.save("./data/bias_init_vector", bias_init_vector) 57 | else: 58 | wordtoix, ixtoword, bias_init_vector = np.load("./data/wordtoix.npy").tolist(),\ 59 | np.load('./data/ixtoword.npy').tolist(),\ 60 | np.load("./data/bias_init_vector.npy").tolist(); 61 | #reset dataset 62 | current_train_data = self.reset_train_data(train_data); 63 | 64 | self.train_data = train_data; 65 | self.current_train_data = current_train_data; 66 | self.test_data = test_data; 67 | self.word2idx = wordtoix; 68 | self.idx2word = ixtoword; 69 | self.bias_init_vector = bias_init_vector; 70 | 71 | self.minibatch_start=0; 72 | self.train_data_size = len(current_train_data); 73 | self.word_num = len(self.word2idx); 74 | 75 | def next_batch(self): 76 | if self.minibatch_start+self.batch_size ' + x, current_captions) 115 | current_captions = map(lambda x: x.replace('.', ''), current_captions) 116 | current_captions = map(lambda x: x.replace(',', ''), current_captions) 117 | current_captions = map(lambda x: x.replace('"', ''), current_captions) 118 | current_captions = map(lambda x: x.replace('\n', ''), current_captions) 119 | current_captions = map(lambda x: x.replace('?', ''), current_captions) 120 | current_captions = map(lambda x: x.replace('!', ''), current_captions) 121 | current_captions = map(lambda x: x.replace('\\', ''), current_captions) 122 | current_captions = map(lambda x: x.replace('/', ''), current_captions) 123 | 124 | for idx, each_cap in enumerate(current_captions): 125 | word = each_cap.lower().split(' ') 126 | if len(word) < self.caption_step: 127 | current_captions[idx] = current_captions[idx] + ' ' 128 | else: 129 | new_word = '' 130 | for i in range(self.caption_step-1): 131 | new_word = new_word + word[i] + ' ' 132 | current_captions[idx] = new_word + '' 133 | 134 | current_caption_ind = [] 135 | for cap in current_captions: 136 | current_word_ind = [] 137 | for word in cap.lower().split(' '): 138 | if word in self.word2idx: 139 | current_word_ind.append(self.word2idx[word]) 140 | else: 141 | current_word_ind.append(self.word2idx['']); 142 | current_caption_ind.append(current_word_ind) 143 | 144 | current_caption_matrix = sequence.pad_sequences(current_caption_ind, padding='post', maxlen=self.caption_step) 145 | current_caption_matrix = np.hstack( [current_caption_matrix, np.zeros( [len(current_caption_matrix), 1] ) ] ).astype(int) 146 | 147 | return current_caption_matrix; 148 | 149 | def reset_train_data(self,train_data): 150 | index = list(train_data.index) 151 | np.random.shuffle(index) 152 | train_data = train_data.loc[index]; 153 | 154 | #group_data = train_data.groupby('video_path'); 155 | #print group_data; 156 | current_train_data = train_data.groupby('video_path').apply(lambda x: x.iloc[np.random.choice(len(x))]) 157 | current_train_data = current_train_data.reset_index(drop=True); 158 | return current_train_data; 159 | 160 | def get_video_train_data(self,video_data_path, video_feat_path): 161 | #read csv file using panda 162 | video_data = pd.read_csv(video_data_path, sep=',') 163 | #filte Language==English 164 | video_data = video_data[video_data['Language'] == 'English'] 165 | #get file name VideoID_Start_End.avi.npy 166 | video_data['video_path'] = video_data.apply(lambda row: row['VideoID']+'_'+str(int(row['Start']))+'_'+str(int(row['End']))+'.avi.npy', axis=1) 167 | #add video path 168 | video_data['video_path'] = video_data['video_path'].map(lambda x: os.path.join(video_feat_path, x)) 169 | #filte 
video path exist 170 | video_data = video_data[video_data['video_path'].map(lambda x: os.path.exists( x ))] 171 | #filte Description is a string (not none or other) 172 | video_data = video_data[video_data['Description'].map(lambda x: isinstance(x, str))] 173 | #filte video is unique 174 | unique_filenames = sorted(video_data['video_path'].unique()) 175 | train_data = video_data[video_data['video_path'].map(lambda x: x in unique_filenames)] 176 | return train_data 177 | 178 | def get_video_test_data(self,video_data_path, video_feat_path): 179 | video_data = pd.read_csv(video_data_path, sep=',') 180 | video_data = video_data[video_data['Language'] == 'English'] 181 | video_data['video_path'] = video_data.apply(lambda row: row['VideoID']+'_'+str(int(row['Start']))+'_'+str(int(row['End']))+'.avi.npy', axis=1) 182 | video_data['video_path'] = video_data['video_path'].map(lambda x: os.path.join(video_feat_path, x)) 183 | video_data = video_data[video_data['video_path'].map(lambda x: os.path.exists( x ))] 184 | video_data = video_data[video_data['Description'].map(lambda x: isinstance(x, str))] 185 | 186 | unique_filenames = sorted(video_data['video_path'].unique()) 187 | test_data = video_data[video_data['video_path'].map(lambda x: x in unique_filenames)] 188 | return test_data 189 | 190 | def preProBuildWordVocab(self,sentence_iterator, word_count_threshold=5): 191 | # borrowed this function from NeuralTalk 192 | print 'preprocessing word counts and creating vocab based on word count threshold %d' % (word_count_threshold) 193 | word_counts = {} 194 | nsents = 0 195 | #statistic the word 196 | for sent in sentence_iterator: 197 | nsents += 1 198 | for w in sent.lower().split(' '): 199 | word_counts[w] = word_counts.get(w, 0) + 1 200 | #filte the word whose number is lower than the threshold 201 | #vocab is a list containing all words which is more than threshold 202 | vocab = [w for w in word_counts if word_counts[w] >= word_count_threshold] 203 | print 'filtered words from %d to %d' % (len(word_counts), len(vocab)) 204 | 205 | ixtoword = {} 206 | ixtoword[0] = '' 207 | ixtoword[1] = '' 208 | ixtoword[2] = '' 209 | ixtoword[3] = '' 210 | 211 | wordtoix = {} 212 | wordtoix[''] = 0 213 | wordtoix[''] = 1 214 | wordtoix[''] = 2 215 | wordtoix[''] = 3 216 | 217 | #get the index and content, that is (idx,word) 218 | for idx, w in enumerate(vocab): 219 | wordtoix[w] = idx+4 220 | ixtoword[idx+4] = w 221 | 222 | word_counts[''] = nsents 223 | word_counts[''] = nsents 224 | word_counts[''] = nsents 225 | word_counts[''] = nsents 226 | 227 | #how to use this variable? bias_init_vector? 
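# (Answering the question above.) bias_init_vector follows the NeuralTalk recipe: the
# log of each word's corpus frequency, shifted so its maximum is zero, intended to
# initialise the decoder's output-layer bias so that an untrained model already predicts
# words at roughly their empirical frequency. In this PyTorch port the vector is computed
# and saved but apparently never applied -- VideoCaption initialises vec2word.bias to a
# constant 0 (see nn.init.constant(self.vec2word.bias, 0) below).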
228 | bias_init_vector = np.array([1.0 * word_counts[ ixtoword[i] ] for i in ixtoword]) 229 | bias_init_vector /= np.sum(bias_init_vector) # normalize to frequencies 230 | bias_init_vector = np.log(bias_init_vector) 231 | bias_init_vector -= np.max(bias_init_vector) # shift to nice numeric range 232 | 233 | return wordtoix, ixtoword, bias_init_vector 234 | 235 | class VideoCaption(nn.Module): 236 | def __init__(self,batch_size,image_dim,word_num,img_embed_dim,word_embed_dim,hidden_dim,video_step,caption_step): 237 | super(VideoCaption,self).__init__(); 238 | 239 | self.batch_size = batch_size; 240 | self.image_dim = image_dim; 241 | self.img_embed_dim = img_embed_dim; 242 | self.word_embed_dim = word_embed_dim; 243 | self.hidden_dim = hidden_dim; 244 | self.video_step = video_step; 245 | self.caption_step = caption_step; 246 | self.word_num = word_num; 247 | self.word2vec = nn.Embedding(word_num,word_embed_dim); 248 | 249 | nn.init.uniform(self.word2vec.weight,-0.1,0.1); 250 | self.vec2word = nn.Linear(hidden_dim,word_num); 251 | nn.init.uniform(self.vec2word.weight,-0.1,0.1); 252 | nn.init.constant(self.vec2word.bias,0); 253 | self.img_embed = nn.Linear(image_dim,img_embed_dim); 254 | nn.init.uniform(self.img_embed.weight,-0.1,0.1); 255 | nn.init.constant(self.img_embed.bias,0); 256 | self.lstm1 = nn.LSTMCell(input_size=img_embed_dim,hidden_size=hidden_dim); 257 | #nn.init.uniform(self.lstm1.weight_hh,-0.1,0.1); 258 | #nn.init.uniform(self.lstm1.weight_ih,-0.1,0.1); 259 | nn.init.orthogonal(self.lstm1.weight_hh); 260 | nn.init.orthogonal(self.lstm1.weight_ih); 261 | 262 | self.lstm2 = nn.LSTMCell(input_size=word_embed_dim+hidden_dim, hidden_size=hidden_dim); 263 | #nn.init.uniform(self.lstm2.weight_hh,-0.1,0.1); 264 | #nn.init.uniform(self.lstm2.weight_ih,-0.1,0.1); 265 | nn.init.orthogonal(self.lstm2.weight_hh); 266 | nn.init.orthogonal(self.lstm2.weight_ih); 267 | 268 | def forward(self, input_image, input_caption, caption_mask): 269 | ''' 270 | input_image: int Variable, batch_size x video_step x image_dim 271 | input_caption: int Variable, batch_size x (1+caption_step) x 1 (word is idx, so the dim is 1) 272 | ''' 273 | image_embeded_vector = self.img_embed(input_image); 274 | word_vec = self.word2vec(input_caption); 275 | 276 | #encoding 277 | state1 = Variable(torch.zeros(self.batch_size,self.lstm1.hidden_size)).cuda(); 278 | state2 = Variable(torch.zeros(self.batch_size,self.lstm2.hidden_size)).cuda(); 279 | output1 = Variable(torch.zeros(self.batch_size,self.lstm1.hidden_size)).cuda(); 280 | output2 = Variable(torch.zeros(self.batch_size,self.lstm2.hidden_size)).cuda(); 281 | padding_for_lstm1 = Variable(torch.zeros(self.batch_size,self.img_embed_dim)).cuda(); 282 | padding_for_lstm2 = Variable(torch.zeros(self.batch_size,self.word_embed_dim)).cuda(); 283 | 284 | for step in xrange(self.video_step): 285 | output1,state1 = self.lstm1(image_embeded_vector[:,step,:],(output1,state1)); 286 | output2,state2 = self.lstm2(torch.cat((padding_for_lstm2,output1),1),(output2,state2)); 287 | 288 | 289 | loss=Variable(torch.FloatTensor([0])).cuda(); 290 | #decoding 291 | #one_hot_eye = np.eye(self.word_num).astype('int64'); 292 | for step in xrange(self.caption_step): 293 | output1,state1 = self.lstm1(padding_for_lstm1,(output1, state1)); 294 | output2,state2 = self.lstm2(torch.cat((word_vec[:,step,:],output1),1),(output2, state2)); 295 | 296 | word_onehot = self.vec2word(output2); 297 | #word_onehot_softmax = nn.Softmax(dim=1)(word_onehot); 298 | labels = input_caption[:,step+1]; 299 | 300 | 
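# The per-step loss below uses CrossEntropyLoss(reduce=False) so that one loss value is
# returned per sample; multiplying by caption_mask[:, step+1] zeroes out positions beyond
# each caption's true length, so the zero padding never contributes to the gradient. The
# target is input_caption[:, step+1] because the caption tensor is shifted by one: the
# word fed in at this step sits at index `step`, and the word to be predicted is the next.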
#one_hot_labels = np.zeros((labels.data.shape[0],self.word_num),dtype='int64'); 301 | #for idx,data in enumerate(labels.data): 302 | # one_hot_labels[idx]=one_hot_eye[data]; 303 | 304 | #labels_onehot = Variable(torch.FloatTensor(one_hot_labels)).cuda(); 305 | #labels_onehot_list = labels_onehot.cpu().data.tolist() 306 | #print len(labels_onehot_list) 307 | #loss_func = nn.BCEWithLogitsLoss()*caption_mask[:,step]; 308 | loss_func = nn.CrossEntropyLoss(reduce=False); 309 | 310 | loss_temp = loss_func(word_onehot,labels)*(caption_mask[:,step+1].float()); 311 | loss += torch.sum(loss_temp)/self.batch_size; 312 | 313 | return loss; 314 | 315 | def generate_cpu(self, input_image): 316 | image_embeded_vector = self.img_embed(input_image); 317 | 318 | #encoding 319 | state1 = Variable(torch.zeros(1,self.lstm1.hidden_size)); 320 | state2 = Variable(torch.zeros(1,self.lstm2.hidden_size)); 321 | output1 = Variable(torch.zeros(1,self.lstm1.hidden_size)); 322 | output2 = Variable(torch.zeros(1,self.lstm2.hidden_size)); 323 | padding_for_lstm1 = Variable(torch.zeros(1,self.img_embed_dim)); 324 | padding_for_lstm2 = Variable(torch.zeros(1,self.word_embed_dim)); 325 | 326 | for step in xrange(self.video_step): 327 | output1,state1 = self.lstm1(image_embeded_vector[step,:],(output1,state1)); 328 | output2,state2 = self.lstm2(torch.cat((padding_for_lstm2,output1),1),(output2,state2)); 329 | 330 | 331 | words=[] 332 | #decoding 333 | #set '' 334 | previous_word = self.word2vec(Variable(torch.LongTensor([1]))); 335 | for step in xrange(self.caption_step): 336 | output1,state1 = self.lstm1(padding_for_lstm1,(output1, state1)); 337 | output2,state2 = self.lstm2(torch.cat((previous_word,output1),1),(output2, state2)); 338 | #previous_word = output2; 339 | 340 | word_onehot = self.vec2word(output2); 341 | #print word_onehot.shape 342 | _,word_idx = torch.max(word_onehot,1); 343 | #print word_idx.data[0]; 344 | 345 | words.append(word_idx.data[0]); 346 | 347 | previous_word = self.word2vec(word_idx); 348 | 349 | return words; 350 | 351 | batch_size = 100; 352 | image_dim = 4096; 353 | img_embed_dim = 1000; 354 | word_embed_dim = 1000; 355 | hidden_dim = 1000; 356 | video_step = 80; 357 | caption_step = 20; 358 | epoches=1001; 359 | csv_path = "./data/video_corpus.csv" 360 | 361 | 362 | def train(check_point=None): 363 | #parameters 364 | 365 | loss_log_file = "./loss.txt"; 366 | data_set = DataSet_MSVD(csv_path=csv_path,video_step=video_step,caption_step=caption_step,\ 367 | image_dim=image_dim,batch_size=batch_size); 368 | word_num = data_set.word_num; 369 | video_caption_net = VideoCaption(batch_size=batch_size, image_dim=image_dim,\ 370 | word_num = word_num, img_embed_dim=img_embed_dim,\ 371 | word_embed_dim=word_embed_dim,\ 372 | hidden_dim=hidden_dim,video_step=video_step,\ 373 | caption_step=caption_step); 374 | if check_point != None: 375 | video_caption_net.load_state_dict(state_dict=torch.load(check_point)); 376 | start_epoche = int(re.search(r"(\d*)\Z",check_point).groups()[-1]); 377 | else: 378 | start_epoche = int(0); 379 | 380 | video_caption_net.cuda(); 381 | print video_caption_net; 382 | 383 | optimizer = torch.optim.Adam(video_caption_net.parameters(),lr=10e-4); 384 | 385 | loss_list=[]; 386 | log_fp = open(loss_log_file,"w"); 387 | #torch.backends.cudnn = False; 388 | 389 | for epoche in range(start_epoche,epoches): 390 | mini_batch_idx=0; 391 | while True: 392 | start_time = time.time(); 393 | mini_batch_idx += 1; 394 | video_data,caption_data = data_set.next_batch(); 395 | if len(video_data) 
== 0: 396 | break; 397 | 398 | caption_mask = np.zeros( (caption_data.shape[0], caption_data.shape[1]) ) 399 | nonzeros = np.array( map(lambda x: (x != 0).sum() + 1, caption_data ) ) 400 | 401 | for ind, row in enumerate(caption_mask): 402 | row[:nonzeros[ind]] = 1 403 | 404 | video_data,caption_data,caption_mask = Variable(torch.FloatTensor(video_data)),Variable(torch.LongTensor(caption_data)),\ 405 | Variable(torch.LongTensor(caption_mask)); 406 | video_data,caption_data,caption_mask = video_data.cuda(),caption_data.cuda(),caption_mask.cuda(); 407 | loss = video_caption_net(video_data,caption_data,caption_mask); 408 | optimizer.zero_grad(); 409 | loss.backward(); 410 | optimizer.step(); 411 | 412 | loss_list.append(loss.data[0]); 413 | 414 | print("epoche:{0},mini_batch:{1},loss:{2},Escape time:{3}".format(\ 415 | epoche,mini_batch_idx,loss.data[0],str(time.time()-start_time))); 416 | log_fp.write("epoche:{0},mini_batch:{1},loss:{2},Escape time:{3}\n".format(\ 417 | epoche,mini_batch_idx,loss.data[0],str(time.time()-start_time))); 418 | if epoche%10 == 0: 419 | torch.save(video_caption_net.state_dict(),"./model_temp/s2vt.pytorch.{0}".format(epoche)); 420 | ax=plt.subplot(111) 421 | plt.plot(range(len(loss_list)),loss_list,color='black'); 422 | #plt.plot(range(epoches),test_loss_list,color="red"); 423 | plt.show(); 424 | 425 | #save model 426 | torch.save(video_caption_net.state_dict(),"./s2vt.pytorch"); 427 | 428 | ax=plt.subplot(111) 429 | plt.plot(range(len(loss_list)),loss_list,color='black'); 430 | #plt.plot(range(epoches),test_loss_list,color="red"); 431 | plt.show(); 432 | log_fp.close(); 433 | 434 | 435 | def test(state_dict_path): 436 | #parameters 437 | 438 | data_set = DataSet_MSVD(csv_path=csv_path,video_step=video_step,caption_step=caption_step,\ 439 | image_dim=image_dim,batch_size=batch_size); 440 | #test 441 | word_num = data_set.word_num; 442 | video_caption_net = VideoCaption(batch_size=batch_size, image_dim=image_dim,\ 443 | word_num = word_num, img_embed_dim=img_embed_dim,\ 444 | word_embed_dim=word_embed_dim,\ 445 | hidden_dim=hidden_dim,video_step=video_step,\ 446 | caption_step=caption_step); 447 | #video_caption_net.cuda(); 448 | print video_caption_net; 449 | video_caption_net.load_state_dict(state_dict=torch.load(state_dict_path)); 450 | 451 | #test data 452 | test_output_txt_fd = open("./test_result.txt","wb"); 453 | test_data_path = data_set.test_data["video_path"].unique(); 454 | for idx,data_path in enumerate(test_data_path): 455 | print("idx:{0},data_path:{1}".format(idx,data_path)) 456 | video_feature = np.load(data_path); 457 | if video_feature.shape[0] != video_step: 458 | continue; 459 | video_feature = Variable(torch.FloatTensor(video_feature)); 460 | #video_feature.cuda(); 461 | words = video_caption_net.generate_cpu(video_feature); 462 | 463 | generated_words = [data_set.idx2word[word] for word in words]; 464 | 465 | punctuation = np.argmax(np.array(generated_words) == '') + 1 466 | generated_words = generated_words[:punctuation] 467 | 468 | generated_sentence = ' '.join(generated_words) 469 | generated_sentence = generated_sentence.replace(' ', '') 470 | generated_sentence = generated_sentence.replace(' ', '') 471 | print generated_sentence,'\n' 472 | test_output_txt_fd.write(data_path + '\n') 473 | test_output_txt_fd.write(generated_sentence + '\n\n') 474 | 475 | if __name__=="__main__": 476 | if len(sys.argv)>=2: 477 | if sys.argv[1]=='train': 478 | train(check_point=None); 479 | elif sys.argv[1]=='test': 480 | 
test("./model_temp/s2vt.pytorch.1000"); 481 | else: 482 | print("use 'python s2vt.py train' to train the model and use 'python s2vt.py test' to test the model"); 483 | else: 484 | print("use 'python s2vt.py train' to train the model and use 'python s2vt.py test' to test the model"); 485 | 486 | 487 | 488 | 489 | 490 | 491 | --------------------------------------------------------------------------------