├── .gitignore ├── 2d_feat_extract.sh ├── LICENSE ├── README.md ├── c3d_feat_extract ├── LICENSE ├── README.md ├── c3d_feat_extract.sh ├── class_names_list ├── classify.py ├── dataset.py ├── input ├── main.py ├── mean.py ├── model.py ├── models │ ├── __init__.py │ ├── densenet.py │ ├── pre_act_resnet.py │ ├── resnet.py │ ├── resnext.py │ └── wide_resnet.py ├── opts.py ├── spatial_transforms.py ├── temporal_transforms.py ├── test.py ├── train.py └── validation.py ├── caffe_feat_extract.py ├── caffe_feat_extract.sh ├── coco-caption ├── LICENSE ├── pyciderevalcap │ ├── __init__.py │ ├── cider │ │ ├── __init__.py │ │ ├── cider.py │ │ └── cider_scorer.py │ ├── ciderD │ │ ├── __init__.py │ │ ├── ciderD.py │ │ └── ciderD_scorer.py │ ├── eval.py │ └── tokenizer │ │ ├── __init__.py │ │ ├── ptbtokenizer.py │ │ ├── stanford-corenlp-3.4.1.jar │ │ ├── tmpBF49XX │ │ ├── tmpql9uU7 │ │ ├── tmpuCp_T0 │ │ ├── tmpxAmV_C │ │ └── tmpzNW4I2 ├── pycocoevalcap │ ├── __init__.py │ ├── bleu │ │ ├── LICENSE │ │ ├── __init__.py │ │ ├── bleu.py │ │ └── bleu_scorer.py │ ├── cider │ │ ├── __init__.py │ │ ├── cider.py │ │ └── cider_scorer.py │ ├── eval.py │ ├── meteor │ │ ├── __init__.py │ │ ├── meteor-1.5.jar │ │ └── meteor.py │ ├── rouge │ │ ├── __init__.py │ │ └── rouge.py │ └── tokenizer │ │ ├── __init__.py │ │ ├── ptbtokenizer.py │ │ └── stanford-corenlp-3.4.1.jar └── pycocotools │ ├── __init__.py │ ├── _mask.c │ ├── _mask.pyx │ ├── coco.py │ ├── cocoeval.py │ └── mask.py ├── dataloader.py ├── eval.py ├── eval_s2vt.sh ├── finetune_cnn.py ├── misc ├── __init__.py ├── cocoeval.py ├── rewards.py └── utils.py ├── models ├── Attention.py ├── DecoderRNN.py ├── EncoderRNN.py ├── S2VTAttModel.py ├── S2VTModel.py └── __init__.py ├── opts.py ├── prepro_coco.py ├── prepro_feats.py ├── prepro_ngrams.py ├── prepro_vocab.py ├── train.py ├── train_s2vt.sh └── train_s2vt_att.sh /.gitignore: -------------------------------------------------------------------------------- 1 | feats/ 2 | save*/ 3 | result/ 4 | results/ 5 | foo/ 6 | log/ 7 | data/ 8 | checkpoint/ 9 | pretrained_models/ 10 | *video* 11 | *.json 12 | *.ipynb 13 | .idea/ 14 | !scripts/*.py 15 | *.pth 16 | # Byte-compiled / optimized / DLL files 17 | __pycache__/ 18 | *.py[cod] 19 | *$py.class 20 | 21 | # C extensions 22 | *.so 23 | 24 | # Distribution / packaging 25 | .Python 26 | build/ 27 | develop-eggs/ 28 | dist/ 29 | downloads/ 30 | eggs/ 31 | .eggs/ 32 | lib/ 33 | lib64/ 34 | parts/ 35 | sdist/ 36 | var/ 37 | wheels/ 38 | *.egg-info/ 39 | .installed.cfg 40 | *.egg 41 | MANIFEST 42 | 43 | # PyInstaller 44 | # Usually these files are written by a python script from a template 45 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
46 | *.manifest 47 | *.spec 48 | 49 | # Installer logs 50 | pip-log.txt 51 | pip-delete-this-directory.txt 52 | 53 | # Unit test / coverage reports 54 | htmlcov/ 55 | .tox/ 56 | .coverage 57 | .coverage.* 58 | .cache 59 | nosetests.xml 60 | coverage.xml 61 | *.cover 62 | .hypothesis/ 63 | 64 | # Translations 65 | *.mo 66 | *.pot 67 | 68 | # Django stuff: 69 | *.log 70 | .static_storage/ 71 | .media/ 72 | local_settings.py 73 | 74 | # Flask stuff: 75 | instance/ 76 | .webassets-cache 77 | 78 | # Scrapy stuff: 79 | .scrapy 80 | 81 | # Sphinx documentation 82 | docs/_build/ 83 | 84 | # PyBuilder 85 | target/ 86 | 87 | # Jupyter Notebook 88 | .ipynb_checkpoints 89 | 90 | # pyenv 91 | .python-version 92 | 93 | # celery beat schedule file 94 | celerybeat-schedule 95 | 96 | # SageMath parsed files 97 | *.sage.py 98 | 99 | # Environments 100 | .env 101 | .venv 102 | env/ 103 | venv/ 104 | ENV/ 105 | env.bak/ 106 | venv.bak/ 107 | 108 | # Spyder project settings 109 | .spyderproject 110 | .spyproject 111 | 112 | # Rope project settings 113 | .ropeproject 114 | 115 | # mkdocs documentation 116 | /site 117 | 118 | # mypy 119 | .mypy_cache/ 120 | -------------------------------------------------------------------------------- /2d_feat_extract.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | feat=nasnet 3 | python prepro_feats.py \ 4 | --video_path data/videos \ 5 | --model ${feat} \ 6 | --output_dir data/feats/${feat} \ 7 | --n_frame_steps 80 \ 8 | --gpu 0 \ 9 | 10 | 11 | # --saved_model pretrain_models/resnet152-b121ed2d.pth \ 12 | # vgg16-397923af.pth 13 | # resnet101-5d3b4d8f.pth 14 | # resnet152-b121ed2d.pth -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 DingXia 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 |
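The `2d_feat_extract.sh` script above drives `prepro_feats.py`: it samples `--n_frame_steps` frames from each video, runs them through the 2D CNN selected with `--model` (nasnet, resnet101, resnet152, vgg16, ...), and writes one `videoxxx.npy` feature file per video under `--output_dir`. The snippet below is a minimal sketch of that idea for a single video; the frame-folder layout, the output path, and the torchvision ResNet-152 backbone are illustrative assumptions, not the repository's actual implementation (which relies on ffmpeg and the `pretrainedmodels` package listed in the README).

```python
# Hypothetical sketch of 2D feature extraction for ONE video.
# FRAME_DIR / OUT_PATH and the ResNet-152 backbone are illustrative assumptions.
import glob
import os

import numpy as np
import torch
import torch.nn as nn
from PIL import Image
from torchvision import models, transforms

N_FRAME_STEPS = 80                                # mirrors --n_frame_steps 80 in the script
FRAME_DIR = "data/frames/video0"                  # assumed: JPEG frames already extracted with ffmpeg
OUT_PATH = "data/feats/resnet152/video0.npy"

# ResNet-152 without its classification head -> 2048-dim pooled features per frame.
backbone = models.resnet152(pretrained=True)
backbone = nn.Sequential(*list(backbone.children())[:-1]).eval()

preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])

frames = sorted(glob.glob(os.path.join(FRAME_DIR, "*.jpg")))
# Pick N_FRAME_STEPS indices spread uniformly over the whole clip.
indices = np.linspace(0, len(frames) - 1, N_FRAME_STEPS).astype(int)

with torch.no_grad():
    batch = torch.stack([preprocess(Image.open(frames[i]).convert("RGB"))
                         for i in indices])
    feats = backbone(batch).squeeze(-1).squeeze(-1)   # (N_FRAME_STEPS, 2048)

os.makedirs(os.path.dirname(OUT_PATH), exist_ok=True)
np.save(OUT_PATH, feats.cpu().numpy())
```

Repeating this over all videos yields the `data/feats/<model>/videoxxx.npy` layout shown in the file tree of the README below, which training later consumes via `--feats_dir`.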
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # requirements # 2 | 3 | - cuda 4 | - pytorch 0.3.1 5 | - python3 (untested) or python2 (tested; best to stick with Python 2 throughout) 6 | - ffmpeg (can install using anaconda) 7 | 8 | # usage # 9 | 10 | 1. 2D feature extraction, e.g. resnet101, nasnet, etc. 11 | ```bash 12 | sh ./2d_feat_extract.sh 13 | # model: which CNN to extract features with 14 | # n_frame_steps: how many frames to sample from each video (80 is a good default) 15 | ``` 16 | 17 | 2. 3D feature extraction 18 | ```bash 19 | cd c3d_feat_extract 20 | sh ./c3d_feat_extract.sh 21 | # --mode feature: feature-extraction mode, no need to change this 22 | # change the options below according to the chosen model 23 | # --model_name resnext \ 24 | # --model_depth 101 \ 25 | # --resnext_cardinality 32 \ 26 | # --resnet_shortcut B \ 27 | # --model pretrained_models/resnext-101-64f-kinetics.pth 28 | ``` 29 | 3. Training 30 | 31 | ```bash 32 | ./train_s2vt.sh 33 | # adjust the options to your setup; see opts.py for what each option means 34 | ``` 35 | 36 | 4. Testing and scoring 37 | 38 | ```bash 39 | ./eval_s2vt.sh 40 | # adjust the options to your setup; see eval.py for what each option means 41 | ``` 42 | # file tree # 43 | 44 | The related files can be downloaded here: 45 | link: https://pan.baidu.com/s/1RDNygrWtz_PtVH8nh4vG3w password: nxyk 46 | ``` 47 | data 48 | │ all_caption.json 49 | │ all_info.json 50 | │ all_videodatainfo_2017.json 51 | └───feats 52 | │ └───nasnet 53 | │ │ │ videoxxx.npy 54 | │ │ │ ... 55 | │ └───resnet 56 | │ │ │ videoxxx.npy 57 | │ │ │ ... 58 | │ └───xxnet 59 | │ │ videoxxx.npy 60 | │ │ ... 61 | └───videos 62 | │ │ videoxxx.mp4 63 | │ │ ... 64 | │ 65 | │ 66 | also create these directories: 67 | log 68 | checkpoint 69 | result 70 | 71 | ``` 72 | 73 | # pytorch implementation of video captioning 74 | 75 | We recommend installing PyTorch and the Python packages using Anaconda. 76 | 77 | 78 | ### python packages 79 | 80 | - tqdm 81 | - pillow 82 | - pretrainedmodels 83 | - nltk 84 | 85 | ## Data 86 | 87 | MSR-VTT. The test videos don't have captions, so I split train-video into train/val/test. Extract the archives and put them in the `./data/` directory. 88 | 89 | - train-video: [download link](https://drive.google.com/file/d/1Qi6Gn_l93SzrvmKQQu-drI90L-x8B0ly/view?usp=sharing) 90 | - test-video: [download link](https://drive.google.com/file/d/10fPbEhD-ENVQihrRvKFvxcMzkDlhvf4Q/view?usp=sharing) 91 | - json info of train-video: [download link](https://drive.google.com/file/d/1LcTtsAvfnHhUfHMiI4YkDgN7lF1-_-m7/view?usp=sharing) 92 | - json info of test-video: [download link](https://drive.google.com/file/d/1Kgra0uMKDQssclNZXRLfbj9UQgBv-1YE/view?usp=sharing) 93 | 94 | ## Options 95 | 96 | All default options are defined in opts.py or the corresponding code file; change them as you like. 97 | 98 | ## Usage 99 | 100 | ### (Optional) c3d features 101 | You can use [video-classification-3d-cnn-pytorch](https://github.com/kenshohara/video-classification-3d-cnn-pytorch) to extract features from the videos, then mean-pool them to get a 2048-dim feature for each video (see the short pooling sketch after this README). 102 | 103 | ### Steps 104 | 105 | 1. preprocess videos and labels 106 | 107 | This step takes about 3 hours for the MSR-VTT dataset using one Titan XP GPU. 108 | 109 | ```bash 110 | python prepro_feats.py --output_dir data/feats/resnet152 --model resnet152 --n_frame_steps 40 --gpu 4,5 111 | 112 | python prepro_vocab.py 113 | ``` 114 | 115 | 2. Training a model 116 | 117 | ```bash 118 | 119 | python train.py --gpu 5,6,7 --epochs 9001 --batch_size 450 --checkpoint_path data/save --feats_dir data/feats/resnet152 --dim_vid 2048 --model S2VTAttModel 120 | ``` 121 | 122 | 3. test 123 | 124 | opt_info.json will be in the same directory as the saved model. 125 | 126 | ```bash 127 | python eval.py --recover_opt data/save/opt_info.json --saved_model data/save/model_1000.pth --batch_size 100 --gpu 1,0 128 | ``` 129 | 130 | ## Metrics 131 | 132 | I forked [coco-caption from XgDuan](https://github.com/XgDuan/coco-caption/tree/python3). Thanks for porting it to Python 3. 133 | 134 | ## TODO 135 | - lstm 136 | - beam search 137 | - reinforcement learning 138 | 139 | ## Note 140 | This repository is no longer maintained; please see my other repository [video-caption-openNMT.pytorch](https://github.com/xiadingZ/video-caption-openNMT.pytorch). It has higher performance and test scores. 141 |
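As noted in the "(Optional) c3d features" section of the README above, the 3D-CNN features come out as one row per 16-frame clip (that is what the feature mode of `c3d_feat_extract` saves for each video), and they can be mean-pooled into a single vector per video. Below is a minimal sketch of that pooling step, assuming per-video `.npy` files of shape `(n_clips, feat_dim)`; the directory names are placeholders, not a fixed layout of this repository.

```python
# Hypothetical sketch: average per-clip 3D-CNN features into one vector per video.
# CLIP_FEAT_DIR and POOLED_DIR are placeholder paths.
import glob
import os

import numpy as np

CLIP_FEAT_DIR = "data/feats/c3d"          # per-video .npy files of shape (n_clips, feat_dim)
POOLED_DIR = "data/feats/c3d_pooled"      # output: one (feat_dim,) vector per video
os.makedirs(POOLED_DIR, exist_ok=True)

for path in glob.glob(os.path.join(CLIP_FEAT_DIR, "*.npy")):
    clip_feats = np.load(path)            # e.g. (n_clips, 2048) for ResNeXt-101
    video_feat = clip_feats.mean(axis=0)  # temporal mean pooling over the clips
    np.save(os.path.join(POOLED_DIR, os.path.basename(path)), video_feat)
```

Whatever dimensionality comes out of this step is presumably what the `--dim_vid` option of `train.py` has to match (2048 in the training example above).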
-------------------------------------------------------------------------------- /c3d_feat_extract/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Kensho Hara 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /c3d_feat_extract/README.md: -------------------------------------------------------------------------------- 1 | # Video Classification Using 3D ResNet 2 | This is PyTorch code for video (action) classification using a 3D ResNet trained by [this code](https://github.com/kenshohara/3D-ResNets-PyTorch). 3 | The 3D ResNet is trained on the Kinetics dataset, which includes 400 action classes. 4 | This code takes videos as input and, in score mode, outputs class names and predicted class scores for every 16 frames. 5 | In feature mode, it outputs 512-dimensional features (after global average pooling) for every 16 frames. 6 | 7 | **A Torch (Lua) version of this code is available [here](https://github.com/kenshohara/video-classification-3d-cnn).** 8 | 9 | ## Requirements 10 | * [PyTorch](http://pytorch.org/) 11 | ``` 12 | conda install pytorch torchvision cuda80 -c soumith 13 | ``` 14 | * FFmpeg, FFprobe 15 | ``` 16 | wget http://johnvansickle.com/ffmpeg/releases/ffmpeg-release-64bit-static.tar.xz 17 | tar xvf ffmpeg-release-64bit-static.tar.xz 18 | cd ./ffmpeg-3.3.3-64bit-static/; sudo cp ffmpeg ffprobe /usr/local/bin; 19 | ``` 20 | * Python 3 21 | 22 | ## Preparation 23 | * Download this code. 24 | * Download the [pretrained model](https://drive.google.com/drive/folders/14KRBqT8ySfPtFSuLsFS2U4I-ihTDs0Y9?usp=sharing). 25 | * ResNeXt-101 achieved the best performance in our experiments. (See the [paper](https://arxiv.org/abs/1711.09577) for details.) 26 | 27 | ## Usage 28 | Assume the input video files are located in ```./videos```. 29 | 30 | To calculate class scores for every 16 frames, use ```--mode score```. 31 | ``` 32 | python main.py --input ./input --video_root ./videos --output ./output.json --model ./resnet-34-kinetics.pth --mode score 33 | ``` 34 | To visualize the classification results, use ```generate_result_video/generate_result_video.py```. 35 | 36 | To calculate video features for every 16 frames, use ```--mode feature```.
37 | ``` 38 | python main.py --input ./input --video_root ./videos --output ./output.json --model ./resnet-34-kinetics.pth --mode feature 39 | ``` 40 | 41 | 42 | ## Citation 43 | If you use this code, please cite the following: 44 | ``` 45 | @article{hara3dcnns, 46 | author={Kensho Hara and Hirokatsu Kataoka and Yutaka Satoh}, 47 | title={Can Spatiotemporal 3D CNNs Retrace the History of 2D CNNs and ImageNet?}, 48 | journal={arXiv preprint}, 49 | volume={arXiv:1711.09577}, 50 | year={2017}, 51 | } 52 | ``` 53 | -------------------------------------------------------------------------------- /c3d_feat_extract/c3d_feat_extract.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python main.py \ 3 | --n_classes 20 \ 4 | --gpu 1 \ 5 | --input input \ 6 | --model_name resnext \ 7 | --model_depth 101 \ 8 | --resnext_cardinality 32 \ 9 | --resnet_shortcut B \ 10 | --feat_dir /home/rgh/Matches/video-caption.pytorch/data/feats/c3d_kinectics_msrvtt \ 11 | --video_root /home/rgh/Matches/video-caption.pytorch/data/videos \ 12 | --output output.json \ 13 | --model pretrained_models/resnext-101-MSR-VTT-finetuned-25-epochs.pth \ 14 | --mode feature 15 | 16 | 17 | # python main.py \ 18 | # --n_classes 101 \ 19 | # --gpu 1 \ 20 | # --input input \ 21 | # --model_name resnext \ 22 | # --model_depth 101 \ 23 | # --resnext_cardinality 32 \ 24 | # --resnet_shortcut B \ 25 | # --feat_dir /home/rgh/Matches/video-caption.pytorch/data/feats/c3d_kinectics_hmdb \ 26 | # --video_root /home/rgh/Matches/video-caption.pytorch/data/videos \ 27 | # --output output.json \ 28 | # --model pretrained_models/resnext-101-kinetics-hmdb51_split1.pth \ 29 | # --mode feature 30 | 31 | # python main.py \ 32 | # --n_classes 51 \ 33 | # --gpu 1 \ 34 | # --input input \ 35 | # --model_name resnext \ 36 | # --model_depth 101 \ 37 | # --resnext_cardinality 32 \ 38 | # --resnet_shortcut B \ 39 | # --feat_dir /home/rgh/Matches/video-caption.pytorch/data/feats/c3d_kinectics_ucf \ 40 | # --video_root /home/rgh/Matches/video-caption.pytorch/data/videos \ 41 | # --output output.json \ 42 | # --model pretrained_models/resnext-101-kinetics-ucf101_split1.pth \ 43 | # --mode feature 44 | 45 | # python main.py \ 46 | # --n_classes 400 \ 47 | # --gpu 0 \ 48 | # --input input \ 49 | # --model_name resnext \ 50 | # --model_depth 101 \ 51 | # --resnext_cardinality 32 \ 52 | # --resnet_shortcut B \ 53 | # --feat_dir /home/rgh/Matches/video-caption.pytorch/data/feats/c3d_kinectics \ 54 | # --video_root /home/rgh/Matches/video-caption.pytorch/data/videos \ 55 | # --output output.json \ 56 | # --model pretrained_models/resnext-101-kinetics.pth \ 57 | # --mode feature 58 | 59 | # python main.py \ 60 | # --n_classes 400 \ 61 | # --sample_duration 64 \ 62 | # --gpu 0 \ 63 | # --input input \ 64 | # --model_name resnext \ 65 | # --model_depth 101 \ 66 | # --resnext_cardinality 32 \ 67 | # --resnet_shortcut B \ 68 | # --feat_dir /home/rgh/Matches/video-caption.pytorch/data/feats/c3d_kinectics_64f \ 69 | # --video_root /home/rgh/Matches/video-caption.pytorch/data/videos \ 70 | # --output output.json \ 71 | # --model pretrained_models/resnext-101-64f-kinetics.pth \ 72 | # --mode feature 73 | 74 | -------------------------------------------------------------------------------- /c3d_feat_extract/class_names_list: -------------------------------------------------------------------------------- 1 | abseiling 2 | air drumming 3 | answering questions 4 | applauding 5 | applying cream 6 | archery 7 | arm wrestling 8 | 
arranging flowers 9 | assembling computer 10 | auctioning 11 | baby waking up 12 | baking cookies 13 | balloon blowing 14 | bandaging 15 | barbequing 16 | bartending 17 | beatboxing 18 | bee keeping 19 | belly dancing 20 | bench pressing 21 | bending back 22 | bending metal 23 | biking through snow 24 | blasting sand 25 | blowing glass 26 | blowing leaves 27 | blowing nose 28 | blowing out candles 29 | bobsledding 30 | bookbinding 31 | bouncing on trampoline 32 | bowling 33 | braiding hair 34 | breading or breadcrumbing 35 | breakdancing 36 | brush painting 37 | brushing hair 38 | brushing teeth 39 | building cabinet 40 | building shed 41 | bungee jumping 42 | busking 43 | canoeing or kayaking 44 | capoeira 45 | carrying baby 46 | cartwheeling 47 | carving pumpkin 48 | catching fish 49 | catching or throwing baseball 50 | catching or throwing frisbee 51 | catching or throwing softball 52 | celebrating 53 | changing oil 54 | changing wheel 55 | checking tires 56 | cheerleading 57 | chopping wood 58 | clapping 59 | clay pottery making 60 | clean and jerk 61 | cleaning floor 62 | cleaning gutters 63 | cleaning pool 64 | cleaning shoes 65 | cleaning toilet 66 | cleaning windows 67 | climbing a rope 68 | climbing ladder 69 | climbing tree 70 | contact juggling 71 | cooking chicken 72 | cooking egg 73 | cooking on campfire 74 | cooking sausages 75 | counting money 76 | country line dancing 77 | cracking neck 78 | crawling baby 79 | crossing river 80 | crying 81 | curling hair 82 | cutting nails 83 | cutting pineapple 84 | cutting watermelon 85 | dancing ballet 86 | dancing charleston 87 | dancing gangnam style 88 | dancing macarena 89 | deadlifting 90 | decorating the christmas tree 91 | digging 92 | dining 93 | disc golfing 94 | diving cliff 95 | dodgeball 96 | doing aerobics 97 | doing laundry 98 | doing nails 99 | drawing 100 | dribbling basketball 101 | drinking 102 | drinking beer 103 | drinking shots 104 | driving car 105 | driving tractor 106 | drop kicking 107 | drumming fingers 108 | dunking basketball 109 | dying hair 110 | eating burger 111 | eating cake 112 | eating carrots 113 | eating chips 114 | eating doughnuts 115 | eating hotdog 116 | eating ice cream 117 | eating spaghetti 118 | eating watermelon 119 | egg hunting 120 | exercising arm 121 | exercising with an exercise ball 122 | extinguishing fire 123 | faceplanting 124 | feeding birds 125 | feeding fish 126 | feeding goats 127 | filling eyebrows 128 | finger snapping 129 | fixing hair 130 | flipping pancake 131 | flying kite 132 | folding clothes 133 | folding napkins 134 | folding paper 135 | front raises 136 | frying vegetables 137 | garbage collecting 138 | gargling 139 | getting a haircut 140 | getting a tattoo 141 | giving or receiving award 142 | golf chipping 143 | golf driving 144 | golf putting 145 | grinding meat 146 | grooming dog 147 | grooming horse 148 | gymnastics tumbling 149 | hammer throw 150 | headbanging 151 | headbutting 152 | high jump 153 | high kick 154 | hitting baseball 155 | hockey stop 156 | holding snake 157 | hopscotch 158 | hoverboarding 159 | hugging 160 | hula hooping 161 | hurdling 162 | hurling (sport) 163 | ice climbing 164 | ice fishing 165 | ice skating 166 | ironing 167 | javelin throw 168 | jetskiing 169 | jogging 170 | juggling balls 171 | juggling fire 172 | juggling soccer ball 173 | jumping into pool 174 | jumpstyle dancing 175 | kicking field goal 176 | kicking soccer ball 177 | kissing 178 | kitesurfing 179 | knitting 180 | krumping 181 | laughing 182 | laying bricks 183 | long 
jump 184 | lunge 185 | making a cake 186 | making a sandwich 187 | making bed 188 | making jewelry 189 | making pizza 190 | making snowman 191 | making sushi 192 | making tea 193 | marching 194 | massaging back 195 | massaging feet 196 | massaging legs 197 | massaging person's head 198 | milking cow 199 | mopping floor 200 | motorcycling 201 | moving furniture 202 | mowing lawn 203 | news anchoring 204 | opening bottle 205 | opening present 206 | paragliding 207 | parasailing 208 | parkour 209 | passing American football (in game) 210 | passing American football (not in game) 211 | peeling apples 212 | peeling potatoes 213 | petting animal (not cat) 214 | petting cat 215 | picking fruit 216 | planting trees 217 | plastering 218 | playing accordion 219 | playing badminton 220 | playing bagpipes 221 | playing basketball 222 | playing bass guitar 223 | playing cards 224 | playing cello 225 | playing chess 226 | playing clarinet 227 | playing controller 228 | playing cricket 229 | playing cymbals 230 | playing didgeridoo 231 | playing drums 232 | playing flute 233 | playing guitar 234 | playing harmonica 235 | playing harp 236 | playing ice hockey 237 | playing keyboard 238 | playing kickball 239 | playing monopoly 240 | playing organ 241 | playing paintball 242 | playing piano 243 | playing poker 244 | playing recorder 245 | playing saxophone 246 | playing squash or racquetball 247 | playing tennis 248 | playing trombone 249 | playing trumpet 250 | playing ukulele 251 | playing violin 252 | playing volleyball 253 | playing xylophone 254 | pole vault 255 | presenting weather forecast 256 | pull ups 257 | pumping fist 258 | pumping gas 259 | punching bag 260 | punching person (boxing) 261 | push up 262 | pushing car 263 | pushing cart 264 | pushing wheelchair 265 | reading book 266 | reading newspaper 267 | recording music 268 | riding a bike 269 | riding camel 270 | riding elephant 271 | riding mechanical bull 272 | riding mountain bike 273 | riding mule 274 | riding or walking with horse 275 | riding scooter 276 | riding unicycle 277 | ripping paper 278 | robot dancing 279 | rock climbing 280 | rock scissors paper 281 | roller skating 282 | running on treadmill 283 | sailing 284 | salsa dancing 285 | sanding floor 286 | scrambling eggs 287 | scuba diving 288 | setting table 289 | shaking hands 290 | shaking head 291 | sharpening knives 292 | sharpening pencil 293 | shaving head 294 | shaving legs 295 | shearing sheep 296 | shining shoes 297 | shooting basketball 298 | shooting goal (soccer) 299 | shot put 300 | shoveling snow 301 | shredding paper 302 | shuffling cards 303 | side kick 304 | sign language interpreting 305 | singing 306 | situp 307 | skateboarding 308 | ski jumping 309 | skiing (not slalom or crosscountry) 310 | skiing crosscountry 311 | skiing slalom 312 | skipping rope 313 | skydiving 314 | slacklining 315 | slapping 316 | sled dog racing 317 | smoking 318 | smoking hookah 319 | snatch weight lifting 320 | sneezing 321 | sniffing 322 | snorkeling 323 | snowboarding 324 | snowkiting 325 | snowmobiling 326 | somersaulting 327 | spinning poi 328 | spray painting 329 | spraying 330 | springboard diving 331 | squat 332 | sticking tongue out 333 | stomping grapes 334 | stretching arm 335 | stretching leg 336 | strumming guitar 337 | surfing crowd 338 | surfing water 339 | sweeping floor 340 | swimming backstroke 341 | swimming breast stroke 342 | swimming butterfly stroke 343 | swing dancing 344 | swinging legs 345 | swinging on something 346 | sword fighting 347 | tai chi 348 | 
taking a shower 349 | tango dancing 350 | tap dancing 351 | tapping guitar 352 | tapping pen 353 | tasting beer 354 | tasting food 355 | testifying 356 | texting 357 | throwing axe 358 | throwing ball 359 | throwing discus 360 | tickling 361 | tobogganing 362 | tossing coin 363 | tossing salad 364 | training dog 365 | trapezing 366 | trimming or shaving beard 367 | trimming trees 368 | triple jump 369 | tying bow tie 370 | tying knot (not on a tie) 371 | tying tie 372 | unboxing 373 | unloading truck 374 | using computer 375 | using remote controller (not gaming) 376 | using segway 377 | vault 378 | waiting in line 379 | walking the dog 380 | washing dishes 381 | washing feet 382 | washing hair 383 | washing hands 384 | water skiing 385 | water sliding 386 | watering plants 387 | waxing back 388 | waxing chest 389 | waxing eyebrows 390 | waxing legs 391 | weaving basket 392 | welding 393 | whistling 394 | windsurfing 395 | wrapping present 396 | wrestling 397 | writing 398 | yawning 399 | yoga 400 | zumba 401 | -------------------------------------------------------------------------------- /c3d_feat_extract/classify.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Variable 3 | 4 | from dataset import Video 5 | from spatial_transforms import (Compose, Normalize, Scale, CenterCrop, ToTensor) 6 | from temporal_transforms import LoopPadding 7 | 8 | def classify_video(video_dir, video_name, class_names, model, opt): 9 | assert opt.mode in ['score', 'feature'] 10 | 11 | spatial_transform = Compose([Scale(opt.sample_size), 12 | CenterCrop(opt.sample_size), 13 | ToTensor(), 14 | Normalize(opt.mean, [1, 1, 1])]) 15 | temporal_transform = LoopPadding(opt.sample_duration) 16 | data = Video(video_dir, spatial_transform=spatial_transform, 17 | temporal_transform=temporal_transform, 18 | sample_duration=opt.sample_duration) 19 | data_loader = torch.utils.data.DataLoader(data, batch_size=opt.batch_size, 20 | shuffle=False, num_workers=opt.n_threads, pin_memory=True) 21 | 22 | video_outputs = [] 23 | video_segments = [] 24 | for i, (inputs, segments) in enumerate(data_loader): 25 | inputs = Variable(inputs, volatile=True) 26 | outputs = model(inputs) 27 | 28 | video_outputs.append(outputs.cpu().data) 29 | video_segments.append(segments) 30 | 31 | video_outputs = torch.cat(video_outputs) 32 | video_segments = torch.cat(video_segments) 33 | if opt.mode == 'feature': 34 | return video_outputs.numpy() 35 | elif opt.mode == 'score': 36 | results = { 37 | 'video': video_name, 38 | 'clips': [] 39 | } 40 | 41 | _, max_indices = video_outputs.max(dim=1) 42 | for i in range(video_outputs.size(0)): 43 | clip_results = { 44 | 'segment': video_segments[i].tolist(), 45 | } 46 | clip_results['label'] = class_names[max_indices[i]] 47 | clip_results['scores'] = video_outputs[i].tolist() 48 | results['clips'].append(clip_results) 49 | return results 50 | -------------------------------------------------------------------------------- /c3d_feat_extract/dataset.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.utils.data as data 3 | from PIL import Image 4 | import os 5 | import math 6 | import functools 7 | import copy 8 | import numpy as np 9 | 10 | def pil_loader(path): 11 | # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835) 12 | with open(path, 'rb') as f: 13 | with Image.open(f) as img: 14 | return img.convert('RGB') 15 | 
16 | 17 | def accimage_loader(path): 18 | try: 19 | return accimage.Image(path) 20 | except IOError: 21 | # Potentially a decoding problem, fall back to PIL.Image 22 | return pil_loader(path) 23 | 24 | 25 | def get_default_image_loader(): 26 | from torchvision import get_image_backend 27 | if get_image_backend() == 'accimage': 28 | import accimage 29 | return accimage_loader 30 | else: 31 | return pil_loader 32 | 33 | 34 | def video_loader(video_dir_path, frame_indices, image_loader): 35 | video = [] 36 | for i in frame_indices: 37 | image_path = os.path.join(video_dir_path, 'image_{:05d}.jpg'.format(i)) 38 | if os.path.exists(image_path): 39 | video.append(image_loader(image_path)) 40 | else: 41 | return video 42 | 43 | return video 44 | 45 | 46 | def get_default_video_loader(): 47 | image_loader = get_default_image_loader() 48 | return functools.partial(video_loader, image_loader=image_loader) 49 | 50 | 51 | def load_annotation_data(data_file_path): 52 | with open(data_file_path, 'r') as data_file: 53 | return json.load(data_file) 54 | 55 | 56 | def get_class_labels(data): 57 | class_labels_map = {} 58 | index = 0 59 | for class_label in data['labels']: 60 | class_labels_map[class_label] = index 61 | index += 1 62 | return class_labels_map 63 | 64 | 65 | def get_video_names_and_annotations(data, subset): 66 | video_names = [] 67 | annotations = [] 68 | 69 | for key, value in data['database'].items(): 70 | this_subset = value['subset'] 71 | if this_subset == subset: 72 | if subset == 'testing': 73 | video_names.append('test/{}'.format(key)) 74 | else: 75 | label = value['annotations']['label'] 76 | video_names.append('{}/{}'.format(label, key)) 77 | annotations.append(value['annotations']) 78 | 79 | return video_names, annotations 80 | 81 | 82 | def make_dataset(video_path, sample_duration): 83 | dataset = [] 84 | 85 | n_frames = len(os.listdir(video_path)) 86 | 87 | begin_t = 1 88 | end_t = n_frames 89 | sample = { 90 | 'video': video_path, 91 | 'segment': [begin_t, end_t], 92 | 'n_frames': n_frames, 93 | } 94 | 95 | step = sample_duration 96 | for i in range(1, (n_frames - sample_duration + 1), step): 97 | sample_i = copy.deepcopy(sample) 98 | sample_i['frame_indices'] = list(range(i, i + sample_duration)) # [i: i + sample_duration) same as segment 99 | sample_i['segment'] = torch.IntTensor([i, i + sample_duration - 1]) 100 | dataset.append(sample_i) 101 | if n_frames % sample_duration != 0: 102 | sample_i = copy.deepcopy(sample) 103 | if n_frames - sample_duration + 1 >= 1: 104 | sample_i['frame_indices'] = list(range(n_frames - sample_duration + 1, n_frames + 1)) 105 | sample_i['segment'] = torch.IntTensor([n_frames - sample_duration + 1, n_frames]) 106 | else: 107 | sample_i['frame_indices'] = np.round(np.linspace(1, n_frames, sample_duration))\ 108 | .astype(np.int32).tolist() 109 | sample_i['segment'] = torch.IntTensor([1, n_frames]) 110 | dataset.append(sample_i) 111 | return dataset 112 | 113 | 114 | class Video(data.Dataset): 115 | def __init__(self, video_path, 116 | spatial_transform=None, temporal_transform=None, 117 | sample_duration=16, get_loader=get_default_video_loader): 118 | self.data = make_dataset(video_path, sample_duration) 119 | 120 | self.spatial_transform = spatial_transform 121 | self.temporal_transform = temporal_transform 122 | self.loader = get_loader() 123 | 124 | def __getitem__(self, index): 125 | """ 126 | Args: 127 | index (int): Index 128 | Returns: 129 | tuple: (image, target) where target is class_index of the target class. 
130 | """ 131 | path = self.data[index]['video'] 132 | 133 | frame_indices = self.data[index]['frame_indices'] 134 | if self.temporal_transform is not None: 135 | frame_indices = self.temporal_transform(frame_indices) 136 | clip = self.loader(path, frame_indices) 137 | if self.spatial_transform is not None: 138 | clip = [self.spatial_transform(img) for img in clip] 139 | clip = torch.stack(clip, 0).permute(1, 0, 2, 3) 140 | 141 | target = self.data[index]['segment'] 142 | 143 | return clip, target 144 | 145 | def __len__(self): 146 | return len(self.data) 147 | -------------------------------------------------------------------------------- /c3d_feat_extract/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import subprocess 5 | import numpy as np 6 | import torch 7 | from torch import nn 8 | 9 | from opts import parse_opts 10 | from model import generate_model 11 | from mean import get_mean 12 | from classify import classify_video 13 | 14 | if __name__=="__main__": 15 | opt = parse_opts() 16 | opt.mean = get_mean() 17 | opt.arch = '{}-{}'.format(opt.model_name, opt.model_depth) 18 | opt.sample_size = 112 19 | # if opt.model.find('64f') != -1: 20 | # opt.sample_duration = 64 21 | # else: 22 | # opt.sample_duration = 16 23 | os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu 24 | model = generate_model(opt) 25 | print('loading model {}'.format(opt.model)) 26 | model_data = torch.load(opt.model) 27 | assert opt.arch == model_data['arch'] 28 | model.load_state_dict(model_data['state_dict']) 29 | model.eval() 30 | if opt.verbose: 31 | print(model) 32 | 33 | input_files = [] 34 | with open(opt.input, 'r') as f: 35 | for row in f: 36 | input_files.append(row[:-1]) 37 | 38 | class_names = [] 39 | with open('class_names_list') as f: 40 | for row in f: 41 | class_names.append(row[:-1]) 42 | 43 | ffmpeg_loglevel = 'quiet' 44 | if opt.verbose: 45 | ffmpeg_loglevel = 'info' 46 | 47 | if os.path.exists('tmp'): 48 | subprocess.call('rm -rf tmp', shell=True) 49 | 50 | outputs = [] 51 | for input_file in input_files: 52 | video_path = os.path.join(opt.video_root, input_file) 53 | if os.path.exists(video_path): 54 | print(video_path) 55 | subprocess.call('mkdir tmp', shell=True) 56 | subprocess.call('ffmpeg -i {} tmp/image_%05d.jpg'.format(video_path), 57 | shell=True) 58 | 59 | result = classify_video('tmp', input_file, class_names, model, opt) 60 | if opt.mode == 'score': 61 | outputs.append(result) 62 | elif opt.mode == 'feature': 63 | feat_path = os.path.join(opt.feat_dir, input_file.split('.')[0]+'.npy') 64 | np.save(feat_path,result) 65 | subprocess.call('rm -rf tmp', shell=True) 66 | else: 67 | print('{} does not exist'.format(input_file)) 68 | 69 | if os.path.exists('tmp'): 70 | subprocess.call('rm -rf tmp', shell=True) 71 | if opt.mode == 'score': 72 | with open(opt.output, 'w') as f: 73 | json.dump(outputs, f) 74 | -------------------------------------------------------------------------------- /c3d_feat_extract/mean.py: -------------------------------------------------------------------------------- 1 | def get_mean(): 2 | return [114.7748, 107.7354, 99.4750] 3 | -------------------------------------------------------------------------------- /c3d_feat_extract/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | from models import resnet, pre_act_resnet, wide_resnet, resnext, densenet 5 | 6 | 7 | def generate_model(opt): 8 | 
assert opt.mode in ['score', 'feature'] 9 | if opt.mode == 'score': 10 | last_fc = True 11 | elif opt.mode == 'feature': 12 | last_fc = False 13 | 14 | assert opt.model_name in ['resnet', 'preresnet', 'wideresnet', 'resnext', 'densenet'] 15 | 16 | if opt.model_name == 'resnet': 17 | assert opt.model_depth in [10, 18, 34, 50, 101, 152, 200] 18 | 19 | if opt.model_depth == 10: 20 | model = resnet.resnet10(num_classes=opt.n_classes, shortcut_type=opt.resnet_shortcut, 21 | sample_size=opt.sample_size, sample_duration=opt.sample_duration, 22 | last_fc=last_fc) 23 | elif opt.model_depth == 18: 24 | model = resnet.resnet18(num_classes=opt.n_classes, shortcut_type=opt.resnet_shortcut, 25 | sample_size=opt.sample_size, sample_duration=opt.sample_duration, 26 | last_fc=last_fc) 27 | elif opt.model_depth == 34: 28 | model = resnet.resnet34(num_classes=opt.n_classes, shortcut_type=opt.resnet_shortcut, 29 | sample_size=opt.sample_size, sample_duration=opt.sample_duration, 30 | last_fc=last_fc) 31 | elif opt.model_depth == 50: 32 | model = resnet.resnet50(num_classes=opt.n_classes, shortcut_type=opt.resnet_shortcut, 33 | sample_size=opt.sample_size, sample_duration=opt.sample_duration, 34 | last_fc=last_fc) 35 | elif opt.model_depth == 101: 36 | model = resnet.resnet101(num_classes=opt.n_classes, shortcut_type=opt.resnet_shortcut, 37 | sample_size=opt.sample_size, sample_duration=opt.sample_duration, 38 | last_fc=last_fc) 39 | elif opt.model_depth == 152: 40 | model = resnet.resnet152(num_classes=opt.n_classes, shortcut_type=opt.resnet_shortcut, 41 | sample_size=opt.sample_size, sample_duration=opt.sample_duration, 42 | last_fc=last_fc) 43 | elif opt.model_depth == 200: 44 | model = resnet.resnet200(num_classes=opt.n_classes, shortcut_type=opt.resnet_shortcut, 45 | sample_size=opt.sample_size, sample_duration=opt.sample_duration, 46 | last_fc=last_fc) 47 | elif opt.model_name == 'wideresnet': 48 | assert opt.model_depth in [50] 49 | 50 | if opt.model_depth == 50: 51 | model = wide_resnet.resnet50(num_classes=opt.n_classes, shortcut_type=opt.resnet_shortcut, k=opt.wide_resnet_k, 52 | sample_size=opt.sample_size, sample_duration=opt.sample_duration, 53 | last_fc=last_fc) 54 | elif opt.model_name == 'resnext': 55 | assert opt.model_depth in [50, 101, 152] 56 | 57 | if opt.model_depth == 50: 58 | model = resnext.resnet50(num_classes=opt.n_classes, shortcut_type=opt.resnet_shortcut, cardinality=opt.resnext_cardinality, 59 | sample_size=opt.sample_size, sample_duration=opt.sample_duration, 60 | last_fc=last_fc) 61 | elif opt.model_depth == 101: 62 | model = resnext.resnet101(num_classes=opt.n_classes, shortcut_type=opt.resnet_shortcut, cardinality=opt.resnext_cardinality, 63 | sample_size=opt.sample_size, sample_duration=opt.sample_duration, 64 | last_fc=last_fc) 65 | elif opt.model_depth == 152: 66 | model = resnext.resnet152(num_classes=opt.n_classes, shortcut_type=opt.resnet_shortcut, cardinality=opt.resnext_cardinality, 67 | sample_size=opt.sample_size, sample_duration=opt.sample_duration, 68 | last_fc=last_fc) 69 | elif opt.model_name == 'preresnet': 70 | assert opt.model_depth in [18, 34, 50, 101, 152, 200] 71 | 72 | if opt.model_depth == 18: 73 | model = pre_act_resnet.resnet18(num_classes=opt.n_classes, shortcut_type=opt.resnet_shortcut, 74 | sample_size=opt.sample_size, sample_duration=opt.sample_duration, 75 | last_fc=last_fc) 76 | elif opt.model_depth == 34: 77 | model = pre_act_resnet.resnet34(num_classes=opt.n_classes, shortcut_type=opt.resnet_shortcut, 78 | sample_size=opt.sample_size, 
sample_duration=opt.sample_duration, 79 | last_fc=last_fc) 80 | elif opt.model_depth == 50: 81 | model = pre_act_resnet.resnet50(num_classes=opt.n_classes, shortcut_type=opt.resnet_shortcut, 82 | sample_size=opt.sample_size, sample_duration=opt.sample_duration, 83 | last_fc=last_fc) 84 | elif opt.model_depth == 101: 85 | model = pre_act_resnet.resnet101(num_classes=opt.n_classes, shortcut_type=opt.resnet_shortcut, 86 | sample_size=opt.sample_size, sample_duration=opt.sample_duration, 87 | last_fc=last_fc) 88 | elif opt.model_depth == 152: 89 | model = pre_act_resnet.resnet152(num_classes=opt.n_classes, shortcut_type=opt.resnet_shortcut, 90 | sample_size=opt.sample_size, sample_duration=opt.sample_duration, 91 | last_fc=last_fc) 92 | elif opt.model_depth == 200: 93 | model = pre_act_resnet.resnet200(num_classes=opt.n_classes, shortcut_type=opt.resnet_shortcut, 94 | sample_size=opt.sample_size, sample_duration=opt.sample_duration, 95 | last_fc=last_fc) 96 | elif opt.model_name == 'densenet': 97 | assert opt.model_depth in [121, 169, 201, 264] 98 | 99 | if opt.model_depth == 121: 100 | model = densenet.densenet121(num_classes=opt.n_classes, 101 | sample_size=opt.sample_size, sample_duration=opt.sample_duration, 102 | last_fc=last_fc) 103 | elif opt.model_depth == 169: 104 | model = densenet.densenet169(num_classes=opt.n_classes, 105 | sample_size=opt.sample_size, sample_duration=opt.sample_duration, 106 | last_fc=last_fc) 107 | elif opt.model_depth == 201: 108 | model = densenet.densenet201(num_classes=opt.n_classes, 109 | sample_size=opt.sample_size, sample_duration=opt.sample_duration, 110 | last_fc=last_fc) 111 | elif opt.model_depth == 264: 112 | model = densenet.densenet264(num_classes=opt.n_classes, 113 | sample_size=opt.sample_size, sample_duration=opt.sample_duration, 114 | last_fc=last_fc) 115 | 116 | if not opt.no_cuda: 117 | model = model.cuda() 118 | model = nn.DataParallel(model, device_ids=None) 119 | 120 | return model 121 | -------------------------------------------------------------------------------- /c3d_feat_extract/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sundrops/video-caption.pytorch/66430862696678f7233a1fc67af22f0525b22e52/c3d_feat_extract/models/__init__.py -------------------------------------------------------------------------------- /c3d_feat_extract/models/densenet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from collections import OrderedDict 5 | import math 6 | 7 | __all__ = ['DenseNet', 'densenet121', 'densenet169', 'densenet201', 'densenet264'] 8 | 9 | 10 | def densenet121(**kwargs): 11 | model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 24, 16), 12 | **kwargs) 13 | return model 14 | 15 | 16 | def densenet169(**kwargs): 17 | model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 32, 32), 18 | **kwargs) 19 | return model 20 | 21 | 22 | def densenet201(**kwargs): 23 | model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 48, 32), 24 | **kwargs) 25 | return model 26 | 27 | 28 | def densenet264(**kwargs): 29 | model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 64, 48), 30 | **kwargs) 31 | return model 32 | 33 | 34 | def get_fine_tuning_parameters(model, ft_begin_index): 35 | if ft_begin_index == 0: 36 | return model.parameters() 37 | 38 | ft_module_names = 
[] 39 | for i in range(ft_begin_index, 5): 40 | ft_module_names.append('denseblock{}'.format(ft_begin_index)) 41 | ft_module_names.append('transition{}'.format(ft_begin_index)) 42 | ft_module_names.append('norm5') 43 | ft_module_names.append('classifier') 44 | 45 | parameters = [] 46 | for k, v in model.named_parameters(): 47 | for ft_module in ft_module_names: 48 | if ft_module in k: 49 | parameters.append({'params': v}) 50 | break 51 | else: 52 | parameters.append({'params': v, 'lr': 0.0}) 53 | 54 | return parameters 55 | 56 | 57 | class _DenseLayer(nn.Sequential): 58 | def __init__(self, num_input_features, growth_rate, bn_size, drop_rate): 59 | super(_DenseLayer, self).__init__() 60 | self.add_module('norm.1', nn.BatchNorm3d(num_input_features)) 61 | self.add_module('relu.1', nn.ReLU(inplace=True)) 62 | self.add_module('conv.1', nn.Conv3d(num_input_features, bn_size * growth_rate, 63 | kernel_size=1, stride=1, bias=False)) 64 | self.add_module('norm.2', nn.BatchNorm3d(bn_size * growth_rate)) 65 | self.add_module('relu.2', nn.ReLU(inplace=True)) 66 | self.add_module('conv.2', nn.Conv3d(bn_size * growth_rate, growth_rate, 67 | kernel_size=3, stride=1, padding=1, bias=False)) 68 | self.drop_rate = drop_rate 69 | 70 | def forward(self, x): 71 | new_features = super(_DenseLayer, self).forward(x) 72 | if self.drop_rate > 0: 73 | new_features = F.dropout(new_features, p=self.drop_rate, training=self.training) 74 | return torch.cat([x, new_features], 1) 75 | 76 | 77 | class _DenseBlock(nn.Sequential): 78 | def __init__(self, num_layers, num_input_features, bn_size, growth_rate, drop_rate): 79 | super(_DenseBlock, self).__init__() 80 | for i in range(num_layers): 81 | layer = _DenseLayer(num_input_features + i * growth_rate, growth_rate, bn_size, drop_rate) 82 | self.add_module('denselayer%d' % (i + 1), layer) 83 | 84 | 85 | class _Transition(nn.Sequential): 86 | def __init__(self, num_input_features, num_output_features): 87 | super(_Transition, self).__init__() 88 | self.add_module('norm', nn.BatchNorm3d(num_input_features)) 89 | self.add_module('relu', nn.ReLU(inplace=True)) 90 | self.add_module('conv', nn.Conv3d(num_input_features, num_output_features, 91 | kernel_size=1, stride=1, bias=False)) 92 | self.add_module('pool', nn.AvgPool3d(kernel_size=2, stride=2)) 93 | 94 | 95 | class DenseNet(nn.Module): 96 | """Densenet-BC model class 97 | Args: 98 | growth_rate (int) - how many filters to add each layer (k in paper) 99 | block_config (list of 4 ints) - how many layers in each pooling block 100 | num_init_features (int) - the number of filters to learn in the first convolution layer 101 | bn_size (int) - multiplicative factor for number of bottle neck layers 102 | (i.e. 
bn_size * k features in the bottleneck layer) 103 | drop_rate (float) - dropout rate after each dense layer 104 | num_classes (int) - number of classification classes 105 | """ 106 | def __init__(self, sample_size, sample_duration, growth_rate=32, block_config=(6, 12, 24, 16), 107 | num_init_features=64, bn_size=4, drop_rate=0, num_classes=1000, last_fc=True): 108 | 109 | super(DenseNet, self).__init__() 110 | 111 | self.last_fc = last_fc 112 | 113 | self.sample_size = sample_size 114 | self.sample_duration = sample_duration 115 | 116 | # First convolution 117 | self.features = nn.Sequential(OrderedDict([ 118 | ('conv0', nn.Conv3d(3, num_init_features, kernel_size=7, 119 | stride=(1, 2, 2), padding=(3, 3, 3), bias=False)), 120 | ('norm0', nn.BatchNorm3d(num_init_features)), 121 | ('relu0', nn.ReLU(inplace=True)), 122 | ('pool0', nn.MaxPool3d(kernel_size=3, stride=2, padding=1)), 123 | ])) 124 | 125 | # Each denseblock 126 | num_features = num_init_features 127 | for i, num_layers in enumerate(block_config): 128 | block = _DenseBlock(num_layers=num_layers, num_input_features=num_features, 129 | bn_size=bn_size, growth_rate=growth_rate, drop_rate=drop_rate) 130 | self.features.add_module('denseblock%d' % (i + 1), block) 131 | num_features = num_features + num_layers * growth_rate 132 | if i != len(block_config) - 1: 133 | trans = _Transition(num_input_features=num_features, num_output_features=num_features // 2) 134 | self.features.add_module('transition%d' % (i + 1), trans) 135 | num_features = num_features // 2 136 | 137 | # Final batch norm 138 | self.features.add_module('norm5', nn.BatchNorm2d(num_features)) 139 | 140 | # Linear layer 141 | self.classifier = nn.Linear(num_features, num_classes) 142 | 143 | def forward(self, x): 144 | features = self.features(x) 145 | out = F.relu(features, inplace=True) 146 | last_duration = math.ceil(self.sample_duration / 16) 147 | last_size = math.floor(self.sample_size / 32) 148 | out = F.avg_pool3d(out, kernel_size=(last_duration, last_size, last_size)).view(features.size(0), -1) 149 | if self.last_fc: 150 | out = self.classifier(out) 151 | return out 152 | -------------------------------------------------------------------------------- /c3d_feat_extract/models/pre_act_resnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | import math 6 | from functools import partial 7 | 8 | __all__ = ['PreActivationResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152', 'resnet200'] 9 | 10 | 11 | def conv3x3x3(in_planes, out_planes, stride=1): 12 | # 3x3x3 convolution with padding 13 | return nn.Conv3d(in_planes, out_planes, kernel_size=3, 14 | stride=stride, padding=1, bias=False) 15 | 16 | 17 | def downsample_basic_block(x, planes, stride): 18 | out = F.avg_pool3d(x, kernel_size=1, stride=stride) 19 | zero_pads = torch.Tensor(out.size(0), planes - out.size(1), 20 | out.size(2), out.size(3), 21 | out.size(4)).zero_() 22 | if isinstance(out.data, torch.cuda.FloatTensor): 23 | zero_pads = zero_pads.cuda() 24 | 25 | out = Variable(torch.cat([out.data, zero_pads], dim=1)) 26 | 27 | return out 28 | 29 | 30 | class PreActivationBasicBlock(nn.Module): 31 | expansion = 1 32 | 33 | def __init__(self, inplanes, planes, stride=1, downsample=None): 34 | super(PreActivationBasicBlock, self).__init__() 35 | self.bn1 = nn.BatchNorm3d(inplanes) 36 | self.conv1 = conv3x3x3(inplanes, planes, stride) 37 | 
self.bn2 = nn.BatchNorm3d(planes) 38 | self.conv2 = conv3x3x3(planes, planes) 39 | self.relu = nn.ReLU(inplace=True) 40 | self.downsample = downsample 41 | self.stride = stride 42 | 43 | def forward(self, x): 44 | residual = x 45 | 46 | out = self.bn1(x) 47 | out = self.relu(out) 48 | out = self.conv1(out) 49 | 50 | out = self.bn2(out) 51 | out = self.relu(out) 52 | out = self.conv2(out) 53 | 54 | if self.downsample is not None: 55 | residual = self.downsample(x) 56 | 57 | out += residual 58 | 59 | return out 60 | 61 | 62 | class PreActivationBottleneck(nn.Module): 63 | expansion = 4 64 | 65 | def __init__(self, inplanes, planes, stride=1, downsample=None): 66 | super(PreActivationBottleneck, self).__init__() 67 | self.bn1 = nn.BatchNorm3d(inplanes) 68 | self.conv1 = nn.Conv3d(inplanes, planes, kernel_size=1, bias=False) 69 | self.bn2 = nn.BatchNorm3d(planes) 70 | self.conv2 = nn.Conv3d(planes, planes, kernel_size=3, stride=stride, 71 | padding=1, bias=False) 72 | self.bn3 = nn.BatchNorm3d(planes) 73 | self.conv3 = nn.Conv3d(planes, planes * 4, kernel_size=1, bias=False) 74 | self.relu = nn.ReLU(inplace=True) 75 | self.downsample = downsample 76 | self.stride = stride 77 | 78 | def forward(self, x): 79 | residual = x 80 | 81 | out = self.bn1(x) 82 | out = self.relu(out) 83 | out = self.conv1(out) 84 | 85 | out = self.bn2(out) 86 | out = self.relu(out) 87 | out = self.conv2(out) 88 | 89 | out = self.bn3(out) 90 | out = self.relu(out) 91 | out = self.conv3(out) 92 | 93 | if self.downsample is not None: 94 | residual = self.downsample(x) 95 | 96 | out += residual 97 | 98 | return out 99 | 100 | 101 | class PreActivationResNet(nn.Module): 102 | 103 | def __init__(self, block, layers, sample_size, sample_duration, shortcut_type='B', num_classes=400, last_fc=True): 104 | self.last_fc = last_fc 105 | 106 | self.inplanes = 64 107 | super(PreActivationResNet, self).__init__() 108 | self.conv1 = nn.Conv3d(3, 64, kernel_size=7, stride=(1, 2, 2), 109 | padding=(3, 3, 3), bias=False) 110 | self.bn1 = nn.BatchNorm3d(64) 111 | self.relu = nn.ReLU(inplace=True) 112 | self.maxpool = nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1) 113 | self.layer1 = self._make_layer(block, 64, layers[0], shortcut_type) 114 | self.layer2 = self._make_layer(block, 128, layers[1], shortcut_type, stride=2) 115 | self.layer3 = self._make_layer(block, 256, layers[2], shortcut_type, stride=2) 116 | self.layer4 = self._make_layer(block, 512, layers[3], shortcut_type, stride=2) 117 | last_duration = math.ceil(sample_duration / 16) 118 | last_size = math.ceil(sample_size / 32) 119 | self.avgpool = nn.AvgPool3d((last_duration, last_size, last_size), stride=1) 120 | self.fc = nn.Linear(512 * block.expansion, num_classes) 121 | 122 | for m in self.modules(): 123 | if isinstance(m, nn.Conv3d): 124 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 125 | m.weight.data.normal_(0, math.sqrt(2. 
/ n)) 126 | elif isinstance(m, nn.BatchNorm3d): 127 | m.weight.data.fill_(1) 128 | m.bias.data.zero_() 129 | 130 | def _make_layer(self, block, planes, blocks, shortcut_type, stride=1): 131 | downsample = None 132 | if stride != 1 or self.inplanes != planes * block.expansion: 133 | if shortcut_type == 'A': 134 | downsample = partial(downsample_basic_block, 135 | planes=planes * block.expansion, 136 | stride=stride) 137 | else: 138 | downsample = nn.Sequential( 139 | nn.Conv3d(self.inplanes, planes * block.expansion, 140 | kernel_size=1, stride=stride, bias=False), 141 | nn.BatchNorm3d(planes * block.expansion) 142 | ) 143 | 144 | layers = [] 145 | layers.append(block(self.inplanes, planes, stride, downsample)) 146 | self.inplanes = planes * block.expansion 147 | for i in range(1, blocks): 148 | layers.append(block(self.inplanes, planes)) 149 | 150 | return nn.Sequential(*layers) 151 | 152 | def forward(self, x): 153 | x = self.conv1(x) 154 | x = self.bn1(x) 155 | x = self.relu(x) 156 | x = self.maxpool(x) 157 | 158 | x = self.layer1(x) 159 | x = self.layer2(x) 160 | x = self.layer3(x) 161 | x = self.layer4(x) 162 | 163 | x = self.avgpool(x) 164 | 165 | x = x.view(x.size(0), -1) 166 | if self.last_fc: 167 | x = self.fc(x) 168 | 169 | return x 170 | 171 | def get_fine_tuning_parameters(model, ft_begin_index): 172 | if ft_begin_index == 0: 173 | return model.parameters() 174 | 175 | ft_module_names = [] 176 | for i in range(ft_begin_index, 5): 177 | ft_module_names.append('layer{}'.format(ft_begin_index)) 178 | ft_module_names.append('fc') 179 | 180 | parameters = [] 181 | for k, v in model.named_parameters(): 182 | for ft_module in ft_module_names: 183 | if ft_module in k: 184 | parameters.append({'params': v}) 185 | break 186 | else: 187 | parameters.append({'params': v, 'lr': 0.0}) 188 | 189 | return parameters 190 | 191 | def resnet18(**kwargs): 192 | """Constructs a ResNet-18 model. 193 | """ 194 | model = PreActivationResNet(PreActivationBasicBlock, [2, 2, 2, 2], **kwargs) 195 | return model 196 | 197 | def resnet34(**kwargs): 198 | """Constructs a ResNet-34 model. 199 | """ 200 | model = PreActivationResNet(PreActivationBasicBlock, [3, 4, 6, 3], **kwargs) 201 | return model 202 | 203 | 204 | def resnet50(**kwargs): 205 | """Constructs a ResNet-50 model. 206 | """ 207 | model = PreActivationResNet(PreActivationBottleneck, [3, 4, 6, 3], **kwargs) 208 | return model 209 | 210 | def resnet101(**kwargs): 211 | """Constructs a ResNet-101 model. 212 | """ 213 | model = PreActivationResNet(PreActivationBottleneck, [3, 4, 23, 3], **kwargs) 214 | return model 215 | 216 | def resnet152(**kwargs): 217 | """Constructs a ResNet-101 model. 218 | """ 219 | model = PreActivationResNet(PreActivationBottleneck, [3, 8, 36, 3], **kwargs) 220 | return model 221 | 222 | def resnet200(**kwargs): 223 | """Constructs a ResNet-101 model. 
224 | """ 225 | model = PreActivationResNet(PreActivationBottleneck, [3, 24, 36, 3], **kwargs) 226 | return model 227 | -------------------------------------------------------------------------------- /c3d_feat_extract/models/resnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | import math 6 | from functools import partial 7 | 8 | __all__ = ['ResNet', 'resnet10', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152', 'resnet200'] 9 | 10 | 11 | def conv3x3x3(in_planes, out_planes, stride=1): 12 | # 3x3x3 convolution with padding 13 | return nn.Conv3d(in_planes, out_planes, kernel_size=3, 14 | stride=stride, padding=1, bias=False) 15 | 16 | 17 | def downsample_basic_block(x, planes, stride): 18 | out = F.avg_pool3d(x, kernel_size=1, stride=stride) 19 | zero_pads = torch.Tensor(out.size(0), planes - out.size(1), 20 | out.size(2), out.size(3), 21 | out.size(4)).zero_() 22 | if isinstance(out.data, torch.cuda.FloatTensor): 23 | zero_pads = zero_pads.cuda() 24 | 25 | out = Variable(torch.cat([out.data, zero_pads], dim=1)) 26 | 27 | return out 28 | 29 | 30 | class BasicBlock(nn.Module): 31 | expansion = 1 32 | 33 | def __init__(self, inplanes, planes, stride=1, downsample=None): 34 | super(BasicBlock, self).__init__() 35 | self.conv1 = conv3x3x3(inplanes, planes, stride) 36 | self.bn1 = nn.BatchNorm3d(planes) 37 | self.relu = nn.ReLU(inplace=True) 38 | self.conv2 = conv3x3x3(planes, planes) 39 | self.bn2 = nn.BatchNorm3d(planes) 40 | self.downsample = downsample 41 | self.stride = stride 42 | 43 | def forward(self, x): 44 | residual = x 45 | 46 | out = self.conv1(x) 47 | out = self.bn1(out) 48 | out = self.relu(out) 49 | 50 | out = self.conv2(out) 51 | out = self.bn2(out) 52 | 53 | if self.downsample is not None: 54 | residual = self.downsample(x) 55 | 56 | out += residual 57 | out = self.relu(out) 58 | 59 | return out 60 | 61 | 62 | class Bottleneck(nn.Module): 63 | expansion = 4 64 | 65 | def __init__(self, inplanes, planes, stride=1, downsample=None): 66 | super(Bottleneck, self).__init__() 67 | self.conv1 = nn.Conv3d(inplanes, planes, kernel_size=1, bias=False) 68 | self.bn1 = nn.BatchNorm3d(planes) 69 | self.conv2 = nn.Conv3d(planes, planes, kernel_size=3, stride=stride, 70 | padding=1, bias=False) 71 | self.bn2 = nn.BatchNorm3d(planes) 72 | self.conv3 = nn.Conv3d(planes, planes * 4, kernel_size=1, bias=False) 73 | self.bn3 = nn.BatchNorm3d(planes * 4) 74 | self.relu = nn.ReLU(inplace=True) 75 | self.downsample = downsample 76 | self.stride = stride 77 | 78 | def forward(self, x): 79 | residual = x 80 | 81 | out = self.conv1(x) 82 | out = self.bn1(out) 83 | out = self.relu(out) 84 | 85 | out = self.conv2(out) 86 | out = self.bn2(out) 87 | out = self.relu(out) 88 | 89 | out = self.conv3(out) 90 | out = self.bn3(out) 91 | 92 | if self.downsample is not None: 93 | residual = self.downsample(x) 94 | 95 | out += residual 96 | out = self.relu(out) 97 | 98 | return out 99 | 100 | 101 | class ResNet(nn.Module): 102 | 103 | def __init__(self, block, layers, sample_size, sample_duration, shortcut_type='B', num_classes=400, last_fc=True): 104 | self.last_fc = last_fc 105 | 106 | self.inplanes = 64 107 | super(ResNet, self).__init__() 108 | self.conv1 = nn.Conv3d(3, 64, kernel_size=7, stride=(1, 2, 2), 109 | padding=(3, 3, 3), bias=False) 110 | self.bn1 = nn.BatchNorm3d(64) 111 | self.relu = nn.ReLU(inplace=True) 112 | self.maxpool = 
nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1) 113 | self.layer1 = self._make_layer(block, 64, layers[0], shortcut_type) 114 | self.layer2 = self._make_layer(block, 128, layers[1], shortcut_type, stride=2) 115 | self.layer3 = self._make_layer(block, 256, layers[2], shortcut_type, stride=2) 116 | self.layer4 = self._make_layer(block, 512, layers[3], shortcut_type, stride=2) 117 | last_duration = math.ceil(sample_duration / 16) 118 | last_size = math.ceil(sample_size / 32) 119 | self.avgpool = nn.AvgPool3d((last_duration, last_size, last_size), stride=1) 120 | self.fc = nn.Linear(512 * block.expansion, num_classes) 121 | 122 | for m in self.modules(): 123 | if isinstance(m, nn.Conv3d): 124 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 125 | m.weight.data.normal_(0, math.sqrt(2. / n)) 126 | elif isinstance(m, nn.BatchNorm3d): 127 | m.weight.data.fill_(1) 128 | m.bias.data.zero_() 129 | 130 | def _make_layer(self, block, planes, blocks, shortcut_type, stride=1): 131 | downsample = None 132 | if stride != 1 or self.inplanes != planes * block.expansion: 133 | if shortcut_type == 'A': 134 | downsample = partial(downsample_basic_block, 135 | planes=planes * block.expansion, 136 | stride=stride) 137 | else: 138 | downsample = nn.Sequential( 139 | nn.Conv3d(self.inplanes, planes * block.expansion, 140 | kernel_size=1, stride=stride, bias=False), 141 | nn.BatchNorm3d(planes * block.expansion) 142 | ) 143 | 144 | layers = [] 145 | layers.append(block(self.inplanes, planes, stride, downsample)) 146 | self.inplanes = planes * block.expansion 147 | for i in range(1, blocks): 148 | layers.append(block(self.inplanes, planes)) 149 | 150 | return nn.Sequential(*layers) 151 | 152 | def forward(self, x): 153 | x = self.conv1(x) 154 | x = self.bn1(x) 155 | x = self.relu(x) 156 | x = self.maxpool(x) 157 | 158 | x = self.layer1(x) 159 | x = self.layer2(x) 160 | x = self.layer3(x) 161 | x = self.layer4(x) 162 | 163 | x = self.avgpool(x) 164 | 165 | x = x.view(x.size(0), -1) 166 | if self.last_fc: 167 | x = self.fc(x) 168 | 169 | return x 170 | 171 | 172 | def get_fine_tuning_parameters(model, ft_begin_index): 173 | if ft_begin_index == 0: 174 | return model.parameters() 175 | 176 | ft_module_names = [] 177 | for i in range(ft_begin_index, 5): 178 | ft_module_names.append('layer{}'.format(ft_begin_index)) 179 | ft_module_names.append('fc') 180 | 181 | parameters = [] 182 | for k, v in model.named_parameters(): 183 | for ft_module in ft_module_names: 184 | if ft_module in k: 185 | parameters.append({'params': v}) 186 | break 187 | else: 188 | parameters.append({'params': v, 'lr': 0.0}) 189 | 190 | return parameters 191 | 192 | 193 | def resnet10(**kwargs): 194 | """Constructs a ResNet-18 model. 195 | """ 196 | model = ResNet(BasicBlock, [1, 1, 1, 1], **kwargs) 197 | return model 198 | 199 | def resnet18(**kwargs): 200 | """Constructs a ResNet-18 model. 201 | """ 202 | model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs) 203 | return model 204 | 205 | def resnet34(**kwargs): 206 | """Constructs a ResNet-34 model. 207 | """ 208 | model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs) 209 | return model 210 | 211 | def resnet50(**kwargs): 212 | """Constructs a ResNet-50 model. 213 | """ 214 | model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs) 215 | return model 216 | 217 | def resnet101(**kwargs): 218 | """Constructs a ResNet-101 model. 219 | """ 220 | model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs) 221 | return model 222 | 223 | def resnet152(**kwargs): 224 | """Constructs a ResNet-101 model. 
225 | """ 226 | model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs) 227 | return model 228 | 229 | def resnet200(**kwargs): 230 | """Constructs a ResNet-101 model. 231 | """ 232 | model = ResNet(Bottleneck, [3, 24, 36, 3], **kwargs) 233 | return model 234 | -------------------------------------------------------------------------------- /c3d_feat_extract/models/resnext.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch.autograd import Variable 6 | import math 7 | from functools import partial 8 | 9 | __all__ = ['ResNeXt', 'resnet50', 'resnet101'] 10 | 11 | 12 | def conv3x3x3(in_planes, out_planes, stride=1): 13 | # 3x3x3 convolution with padding 14 | return nn.Conv3d(in_planes, out_planes, kernel_size=3, 15 | stride=stride, padding=1, bias=False) 16 | 17 | 18 | def downsample_basic_block(x, planes, stride): 19 | out = F.avg_pool3d(x, kernel_size=1, stride=stride) 20 | zero_pads = torch.Tensor(out.size(0), planes - out.size(1), 21 | out.size(2), out.size(3), 22 | out.size(4)).zero_() 23 | if isinstance(out.data, torch.cuda.FloatTensor): 24 | zero_pads = zero_pads.cuda() 25 | 26 | out = Variable(torch.cat([out.data, zero_pads], dim=1)) 27 | 28 | return out 29 | 30 | 31 | class ResNeXtBottleneck(nn.Module): 32 | expansion = 2 33 | 34 | def __init__(self, inplanes, planes, cardinality, stride=1, downsample=None): 35 | super(ResNeXtBottleneck, self).__init__() 36 | mid_planes = cardinality * int(planes / 32) 37 | self.conv1 = nn.Conv3d(inplanes, mid_planes, kernel_size=1, bias=False) 38 | self.bn1 = nn.BatchNorm3d(mid_planes) 39 | self.conv2 = nn.Conv3d(mid_planes, mid_planes, kernel_size=3, stride=stride, 40 | padding=1, groups=cardinality, bias=False) 41 | self.bn2 = nn.BatchNorm3d(mid_planes) 42 | self.conv3 = nn.Conv3d(mid_planes, planes * self.expansion, kernel_size=1, bias=False) 43 | self.bn3 = nn.BatchNorm3d(planes * self.expansion) 44 | self.relu = nn.ReLU(inplace=True) 45 | self.downsample = downsample 46 | self.stride = stride 47 | 48 | def forward(self, x): 49 | residual = x 50 | 51 | out = self.conv1(x) 52 | out = self.bn1(out) 53 | out = self.relu(out) 54 | 55 | out = self.conv2(out) 56 | out = self.bn2(out) 57 | out = self.relu(out) 58 | 59 | out = self.conv3(out) 60 | out = self.bn3(out) 61 | 62 | if self.downsample is not None: 63 | residual = self.downsample(x) 64 | 65 | out += residual 66 | out = self.relu(out) 67 | 68 | return out 69 | 70 | 71 | class ResNeXt(nn.Module): 72 | 73 | def __init__(self, block, layers, sample_size, sample_duration, shortcut_type='B', cardinality=32, num_classes=400, last_fc=True): 74 | self.last_fc = last_fc 75 | 76 | self.inplanes = 64 77 | super(ResNeXt, self).__init__() 78 | self.conv1 = nn.Conv3d(3, 64, kernel_size=7, stride=(1, 2, 2), 79 | padding=(3, 3, 3), bias=False) 80 | self.bn1 = nn.BatchNorm3d(64) 81 | self.relu = nn.ReLU(inplace=True) 82 | self.maxpool = nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1) 83 | self.layer1 = self._make_layer(block, 128, layers[0], shortcut_type, cardinality) 84 | self.layer2 = self._make_layer(block, 256, layers[1], shortcut_type, cardinality, stride=2) 85 | self.layer3 = self._make_layer(block, 512, layers[2], shortcut_type, cardinality, stride=2) 86 | self.layer4 = self._make_layer(block, 1024, layers[3], shortcut_type, cardinality, stride=2) 87 | last_duration = int(math.ceil(sample_duration / 16)) 88 | last_size = 
int(math.ceil(sample_size / 32)) 89 | self.avgpool = nn.AvgPool3d((last_duration, last_size, last_size), stride=1) 90 | self.fc = nn.Linear(cardinality * 32 * block.expansion, num_classes) 91 | 92 | for m in self.modules(): 93 | if isinstance(m, nn.Conv3d): 94 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 95 | m.weight.data.normal_(0, math.sqrt(2. / n)) 96 | elif isinstance(m, nn.BatchNorm3d): 97 | m.weight.data.fill_(1) 98 | m.bias.data.zero_() 99 | 100 | def _make_layer(self, block, planes, blocks, shortcut_type, cardinality, stride=1): 101 | downsample = None 102 | if stride != 1 or self.inplanes != planes * block.expansion: 103 | if shortcut_type == 'A': 104 | downsample = partial(downsample_basic_block, 105 | planes=planes * block.expansion, 106 | stride=stride) 107 | else: 108 | downsample = nn.Sequential( 109 | nn.Conv3d(self.inplanes, planes * block.expansion, 110 | kernel_size=1, stride=stride, bias=False), 111 | nn.BatchNorm3d(planes * block.expansion) 112 | ) 113 | 114 | layers = [] 115 | layers.append(block(self.inplanes, planes, cardinality, stride, downsample)) 116 | self.inplanes = planes * block.expansion 117 | for i in range(1, blocks): 118 | layers.append(block(self.inplanes, planes, cardinality)) 119 | 120 | return nn.Sequential(*layers) 121 | 122 | def forward(self, x): 123 | x = self.conv1(x) 124 | x = self.bn1(x) 125 | x = self.relu(x) 126 | x = self.maxpool(x) 127 | 128 | x = self.layer1(x) 129 | x = self.layer2(x) 130 | x = self.layer3(x) 131 | x = self.layer4(x) 132 | 133 | x = self.avgpool(x) 134 | 135 | x = x.view(x.size(0), -1) 136 | if self.last_fc: 137 | x = self.fc(x) 138 | 139 | return x 140 | 141 | def get_fine_tuning_parameters(model, ft_begin_index): 142 | if ft_begin_index == 0: 143 | return model.parameters() 144 | 145 | ft_module_names = [] 146 | for i in range(ft_begin_index, 5): 147 | ft_module_names.append('layer{}'.format(ft_begin_index)) 148 | ft_module_names.append('fc') 149 | 150 | parameters = [] 151 | for k, v in model.named_parameters(): 152 | for ft_module in ft_module_names: 153 | if ft_module in k: 154 | parameters.append({'params': v}) 155 | break 156 | else: 157 | parameters.append({'params': v, 'lr': 0.0}) 158 | 159 | return parameters 160 | 161 | def resnet50(**kwargs): 162 | """Constructs a ResNet-50 model. 163 | """ 164 | model = ResNeXt(ResNeXtBottleneck, [3, 4, 6, 3], **kwargs) 165 | return model 166 | 167 | def resnet101(**kwargs): 168 | """Constructs a ResNet-101 model. 169 | """ 170 | model = ResNeXt(ResNeXtBottleneck, [3, 4, 23, 3], **kwargs) 171 | return model 172 | 173 | def resnet152(**kwargs): 174 | """Constructs a ResNet-101 model. 
175 | """ 176 | model = ResNeXt(ResNeXtBottleneck, [3, 8, 36, 3], **kwargs) 177 | return model 178 | -------------------------------------------------------------------------------- /c3d_feat_extract/models/wide_resnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | import math 6 | from functools import partial 7 | 8 | __all__ = ['WideResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101'] 9 | 10 | 11 | def conv3x3x3(in_planes, out_planes, stride=1): 12 | # 3x3x3 convolution with padding 13 | return nn.Conv3d(in_planes, out_planes, kernel_size=3, 14 | stride=stride, padding=1, bias=False) 15 | 16 | 17 | def downsample_basic_block(x, planes, stride): 18 | out = F.avg_pool3d(x, kernel_size=1, stride=stride) 19 | zero_pads = torch.Tensor(out.size(0), planes - out.size(1), 20 | out.size(2), out.size(3), 21 | out.size(4)).zero_() 22 | if isinstance(out.data, torch.cuda.FloatTensor): 23 | zero_pads = zero_pads.cuda() 24 | 25 | out = Variable(torch.cat([out.data, zero_pads], dim=1)) 26 | 27 | return out 28 | 29 | 30 | class WideBottleneck(nn.Module): 31 | expansion = 2 32 | 33 | def __init__(self, inplanes, planes, stride=1, downsample=None): 34 | super(WideBottleneck, self).__init__() 35 | self.conv1 = nn.Conv3d(inplanes, planes, kernel_size=1, bias=False) 36 | self.bn1 = nn.BatchNorm3d(planes) 37 | self.conv2 = nn.Conv3d(planes, planes, kernel_size=3, stride=stride, 38 | padding=1, bias=False) 39 | self.bn2 = nn.BatchNorm3d(planes) 40 | self.conv3 = nn.Conv3d(planes, planes * self.expansion, kernel_size=1, bias=False) 41 | self.bn3 = nn.BatchNorm3d(planes * self.expansion) 42 | self.relu = nn.ReLU(inplace=True) 43 | self.downsample = downsample 44 | self.stride = stride 45 | 46 | def forward(self, x): 47 | residual = x 48 | 49 | out = self.conv1(x) 50 | out = self.bn1(out) 51 | out = self.relu(out) 52 | 53 | out = self.conv2(out) 54 | out = self.bn2(out) 55 | out = self.relu(out) 56 | 57 | out = self.conv3(out) 58 | out = self.bn3(out) 59 | 60 | if self.downsample is not None: 61 | residual = self.downsample(x) 62 | 63 | out += residual 64 | out = self.relu(out) 65 | 66 | return out 67 | 68 | 69 | class WideResNet(nn.Module): 70 | 71 | def __init__(self, block, layers, sample_size, sample_duration, k=1, shortcut_type='B', num_classes=400, last_fc=True): 72 | self.last_fc = last_fc 73 | 74 | self.inplanes = 64 75 | super(WideResNet, self).__init__() 76 | self.conv1 = nn.Conv3d(3, 64, kernel_size=7, stride=(1, 2, 2), 77 | padding=(3, 3, 3), bias=False) 78 | self.bn1 = nn.BatchNorm3d(64) 79 | self.relu = nn.ReLU(inplace=True) 80 | self.maxpool = nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1) 81 | self.layer1 = self._make_layer(block, 64 * k, layers[0], shortcut_type) 82 | self.layer2 = self._make_layer(block, 128 * k, layers[1], shortcut_type, stride=2) 83 | self.layer3 = self._make_layer(block, 256 * k, layers[2], shortcut_type, stride=2) 84 | self.layer4 = self._make_layer(block, 512 * k, layers[3], shortcut_type, stride=2) 85 | last_duration = math.ceil(sample_duration / 16) 86 | last_size = math.ceil(sample_size / 32) 87 | self.avgpool = nn.AvgPool3d((last_duration, last_size, last_size), stride=1) 88 | self.fc = nn.Linear(512 * k * block.expansion, num_classes) 89 | 90 | for m in self.modules(): 91 | if isinstance(m, nn.Conv3d): 92 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 93 | m.weight.data.normal_(0, 
math.sqrt(2. / n)) 94 | elif isinstance(m, nn.BatchNorm3d): 95 | m.weight.data.fill_(1) 96 | m.bias.data.zero_() 97 | 98 | def _make_layer(self, block, planes, blocks, shortcut_type, stride=1): 99 | downsample = None 100 | if stride != 1 or self.inplanes != planes * block.expansion: 101 | if shortcut_type == 'A': 102 | downsample = partial(downsample_basic_block, 103 | planes=planes * block.expansion, 104 | stride=stride) 105 | else: 106 | downsample = nn.Sequential( 107 | nn.Conv3d(self.inplanes, planes * block.expansion, 108 | kernel_size=1, stride=stride, bias=False), 109 | nn.BatchNorm3d(planes * block.expansion) 110 | ) 111 | 112 | layers = [] 113 | layers.append(block(self.inplanes, planes, stride, downsample)) 114 | self.inplanes = planes * block.expansion 115 | for i in range(1, blocks): 116 | layers.append(block(self.inplanes, planes)) 117 | 118 | return nn.Sequential(*layers) 119 | 120 | def forward(self, x): 121 | x = self.conv1(x) 122 | x = self.bn1(x) 123 | x = self.relu(x) 124 | x = self.maxpool(x) 125 | 126 | x = self.layer1(x) 127 | x = self.layer2(x) 128 | x = self.layer3(x) 129 | x = self.layer4(x) 130 | 131 | x = self.avgpool(x) 132 | 133 | x = x.view(x.size(0), -1) 134 | if self.last_fc: 135 | x = self.fc(x) 136 | 137 | return x 138 | 139 | def get_fine_tuning_parameters(model, ft_begin_index): 140 | if ft_begin_index == 0: 141 | return model.parameters() 142 | 143 | ft_module_names = [] 144 | for i in range(ft_begin_index, 5): 145 | ft_module_names.append('layer{}'.format(ft_begin_index)) 146 | ft_module_names.append('fc') 147 | 148 | parameters = [] 149 | for k, v in model.named_parameters(): 150 | for ft_module in ft_module_names: 151 | if ft_module in k: 152 | parameters.append({'params': v}) 153 | break 154 | else: 155 | parameters.append({'params': v, 'lr': 0.0}) 156 | 157 | return parameters 158 | 159 | def resnet50(**kwargs): 160 | """Constructs a ResNet-50 model. 161 | """ 162 | model = WideResNet(WideBottleneck, [3, 4, 6, 3], **kwargs) 163 | return model 164 | -------------------------------------------------------------------------------- /c3d_feat_extract/opts.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | def parse_opts(): 4 | parser = argparse.ArgumentParser() 5 | parser.add_argument('--input', default='input', type=str, help='Input file path') 6 | parser.add_argument('--video_root', default='', type=str, help='Root path of input videos') 7 | parser.add_argument('--model', default='', type=str, help='Model file path') 8 | parser.add_argument('--output', default='output.json', type=str, help='Output file path') 9 | parser.add_argument('--feat_dir', default='./feat', type=str, help='c3d feat file path') 10 | parser.add_argument('--mode', default='score', type=str, help='Mode (score | feature). score outputs class scores. 
feature outputs features (after global average pooling).') 11 | parser.add_argument('--batch_size', default=32, type=int, help='Batch Size') 12 | parser.add_argument('--n_threads', default=4, type=int, help='Number of threads for multi-thread loading') 13 | parser.add_argument('--model_name', default='resnet', type=str, help='Currently only support resnet') 14 | parser.add_argument('--model_depth', default=34, type=int, help='Depth of resnet (10 | 18 | 34 | 50 | 101)') 15 | parser.add_argument('--resnet_shortcut', default='A', type=str, help='Shortcut type of resnet (A | B)') 16 | parser.add_argument('--wide_resnet_k', default=2, type=int, help='Wide resnet k') 17 | parser.add_argument('--resnext_cardinality', default=32, type=int, help='ResNeXt cardinality') 18 | parser.add_argument('--no_cuda', action='store_true', help='If true, cuda is not used.') 19 | parser.add_argument('--gpu', type=str, default='0', help='gpu device number') 20 | parser.add_argument('--n_classes', type=int, default=400, help='numbers of video class') 21 | parser.add_argument('--sample_duration', type=int, default=16, help='sample_duration') 22 | parser.set_defaults(verbose=False) 23 | parser.add_argument('--verbose', action='store_true', help='') 24 | parser.set_defaults(verbose=False) 25 | 26 | args = parser.parse_args() 27 | 28 | return args 29 | -------------------------------------------------------------------------------- /c3d_feat_extract/spatial_transforms.py: -------------------------------------------------------------------------------- 1 | import random 2 | import math 3 | import numbers 4 | import collections 5 | import numpy as np 6 | import torch 7 | from PIL import Image, ImageOps 8 | try: 9 | import accimage 10 | except ImportError: 11 | accimage = None 12 | 13 | 14 | class Compose(object): 15 | """Composes several transforms together. 16 | Args: 17 | transforms (list of ``Transform`` objects): list of transforms to compose. 18 | Example: 19 | >>> transforms.Compose([ 20 | >>> transforms.CenterCrop(10), 21 | >>> transforms.ToTensor(), 22 | >>> ]) 23 | """ 24 | 25 | def __init__(self, transforms): 26 | self.transforms = transforms 27 | 28 | def __call__(self, img): 29 | for t in self.transforms: 30 | img = t(img) 31 | return img 32 | 33 | 34 | class ToTensor(object): 35 | """Convert a ``PIL.Image`` or ``numpy.ndarray`` to tensor. 36 | Converts a PIL.Image or numpy.ndarray (H x W x C) in the range 37 | [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0]. 38 | """ 39 | 40 | def __call__(self, pic): 41 | """ 42 | Args: 43 | pic (PIL.Image or numpy.ndarray): Image to be converted to tensor. 44 | Returns: 45 | Tensor: Converted image. 
46 | """ 47 | if isinstance(pic, np.ndarray): 48 | # handle numpy array 49 | img = torch.from_numpy(pic.transpose((2, 0, 1))) 50 | # backward compatibility 51 | return img.float() 52 | 53 | if accimage is not None and isinstance(pic, accimage.Image): 54 | nppic = np.zeros([pic.channels, pic.height, pic.width], dtype=np.float32) 55 | pic.copyto(nppic) 56 | return torch.from_numpy(nppic) 57 | 58 | # handle PIL Image 59 | if pic.mode == 'I': 60 | img = torch.from_numpy(np.array(pic, np.int32, copy=False)) 61 | elif pic.mode == 'I;16': 62 | img = torch.from_numpy(np.array(pic, np.int16, copy=False)) 63 | else: 64 | img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes())) 65 | # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK 66 | if pic.mode == 'YCbCr': 67 | nchannel = 3 68 | elif pic.mode == 'I;16': 69 | nchannel = 1 70 | else: 71 | nchannel = len(pic.mode) 72 | img = img.view(pic.size[1], pic.size[0], nchannel) 73 | # put it from HWC to CHW format 74 | # yikes, this transpose takes 80% of the loading time/CPU 75 | img = img.transpose(0, 1).transpose(0, 2).contiguous() 76 | if isinstance(img, torch.ByteTensor): 77 | return img.float() 78 | else: 79 | return img 80 | 81 | 82 | class Normalize(object): 83 | """Normalize an tensor image with mean and standard deviation. 84 | Given mean: (R, G, B) and std: (R, G, B), 85 | will normalize each channel of the torch.*Tensor, i.e. 86 | channel = (channel - mean) / std 87 | Args: 88 | mean (sequence): Sequence of means for R, G, B channels respecitvely. 89 | std (sequence): Sequence of standard deviations for R, G, B channels 90 | respecitvely. 91 | """ 92 | 93 | def __init__(self, mean, std): 94 | self.mean = mean 95 | self.std = std 96 | 97 | def __call__(self, tensor): 98 | """ 99 | Args: 100 | tensor (Tensor): Tensor image of size (C, H, W) to be normalized. 101 | Returns: 102 | Tensor: Normalized image. 103 | """ 104 | # TODO: make efficient 105 | for t, m, s in zip(tensor, self.mean, self.std): 106 | t.sub_(m).div_(s) 107 | return tensor 108 | 109 | 110 | class Scale(object): 111 | """Rescale the input PIL.Image to the given size. 112 | Args: 113 | size (sequence or int): Desired output size. If size is a sequence like 114 | (w, h), output size will be matched to this. If size is an int, 115 | smaller edge of the image will be matched to this number. 116 | i.e, if height > width, then image will be rescaled to 117 | (size * height / width, size) 118 | interpolation (int, optional): Desired interpolation. Default is 119 | ``PIL.Image.BILINEAR`` 120 | """ 121 | 122 | def __init__(self, size, interpolation=Image.BILINEAR): 123 | assert isinstance(size, int) or (isinstance(size, collections.Iterable) and len(size) == 2) 124 | self.size = size 125 | self.interpolation = interpolation 126 | 127 | def __call__(self, img): 128 | """ 129 | Args: 130 | img (PIL.Image): Image to be scaled. 131 | Returns: 132 | PIL.Image: Rescaled image. 133 | """ 134 | if isinstance(self.size, int): 135 | w, h = img.size 136 | if (w <= h and w == self.size) or (h <= w and h == self.size): 137 | return img 138 | if w < h: 139 | ow = self.size 140 | oh = int(self.size * h / w) 141 | return img.resize((ow, oh), self.interpolation) 142 | else: 143 | oh = self.size 144 | ow = int(self.size * w / h) 145 | return img.resize((ow, oh), self.interpolation) 146 | else: 147 | return img.resize(self.size, self.interpolation) 148 | 149 | 150 | class CenterCrop(object): 151 | """Crops the given PIL.Image at the center. 
152 | Args: 153 | size (sequence or int): Desired output size of the crop. If size is an 154 | int instead of sequence like (h, w), a square crop (size, size) is 155 | made. 156 | """ 157 | 158 | def __init__(self, size): 159 | if isinstance(size, numbers.Number): 160 | self.size = (int(size), int(size)) 161 | else: 162 | self.size = size 163 | 164 | def __call__(self, img): 165 | """ 166 | Args: 167 | img (PIL.Image): Image to be cropped. 168 | Returns: 169 | PIL.Image: Cropped image. 170 | """ 171 | w, h = img.size 172 | th, tw = self.size 173 | x1 = int(round((w - tw) / 2.)) 174 | y1 = int(round((h - th) / 2.)) 175 | return img.crop((x1, y1, x1 + tw, y1 + th)) 176 | -------------------------------------------------------------------------------- /c3d_feat_extract/temporal_transforms.py: -------------------------------------------------------------------------------- 1 | import random 2 | import math 3 | 4 | 5 | class LoopPadding(object): 6 | def __init__(self, size): 7 | self.size = size 8 | 9 | def __call__(self, frame_indices): 10 | out = frame_indices 11 | 12 | for index in out: 13 | if len(out) >= self.size: 14 | break 15 | out.append(index) 16 | 17 | return out 18 | 19 | 20 | class TemporalCenterCrop(object): 21 | """Temporally crop the given frame indices at a center. 22 | 23 | If the number of frames is less than the size, 24 | loop the indices as many times as necessary to satisfy the size. 25 | 26 | Args: 27 | size (int): Desired output size of the crop. 28 | """ 29 | 30 | def __init__(self, size): 31 | self.size = size 32 | 33 | def __call__(self, frame_indices): 34 | """ 35 | Args: 36 | frame_indices (list): frame indices to be cropped. 37 | Returns: 38 | list: Cropped frame indices. 39 | """ 40 | 41 | center_index = len(frame_indices) // 2 42 | begin_index = max(0, center_index - (self.size // 2)) 43 | end_index = min(begin_index + self.size, len(frame_indices)) 44 | 45 | out = frame_indices[begin_index:end_index] 46 | 47 | for index in out: 48 | if len(out) >= self.size: 49 | break 50 | out.append(index) 51 | 52 | return out 53 | -------------------------------------------------------------------------------- /c3d_feat_extract/test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Variable 3 | import time 4 | import os 5 | import sys 6 | import json 7 | 8 | from utils import AverageMeter 9 | 10 | 11 | def calculate_video_results(output_buffer, video_id, test_results, class_names): 12 | video_outputs = torch.stack(output_buffer) 13 | average_scores = torch.mean(video_outputs, dim=0) 14 | sorted_scores, locs = torch.topk(average_scores, k=10) 15 | 16 | video_results = [] 17 | for i in range(sorted_scores.size(0)): 18 | video_results.append({'label': class_names[locs[i]], 'score': sorted_scores[i]}) 19 | 20 | test_results['results'][video_id] = video_results 21 | 22 | 23 | def test(data_loader, model, opt, class_names): 24 | print('test') 25 | 26 | model.eval() 27 | 28 | batch_time = AverageMeter() 29 | data_time = AverageMeter() 30 | 31 | end_time = time.time() 32 | output_buffer = [] 33 | previous_video_id = '' 34 | test_results = {'results': {}} 35 | for i, (inputs, targets) in enumerate(data_loader): 36 | data_time.update(time.time() - end_time) 37 | 38 | inputs = Variable(inputs, volatile=True) 39 | outputs = model(inputs) 40 | 41 | for j in range(outputs.size(0)): 42 | if not (i == 0 and j == 0) and targets[j] != previous_video_id: 43 | calculate_video_results(output_buffer, 
previous_video_id, 44 | test_results, class_names) 45 | output_buffer = [] 46 | output_buffer.append(outputs[j].data.cpu()) 47 | previous_video_id = targets[j] 48 | 49 | if (i % 100) == 0: 50 | with open(os.path.join(opt.result_path, 51 | '{}.json'.format(opt.test_subset)), 52 | 'w') as f: 53 | json.dump(test_results, f) 54 | 55 | batch_time.update(time.time() - end_time) 56 | end_time = time.time() 57 | 58 | print('[{}/{}]\t' 59 | 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 60 | 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'.format( 61 | i + 1, len(data_loader), batch_time=batch_time, data_time=data_time)) 62 | with open(os.path.join(opt.result_path, 63 | '{}.json'.format(opt.test_subset)), 64 | 'w') as f: 65 | json.dump(test_results, f) 66 | -------------------------------------------------------------------------------- /c3d_feat_extract/train.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Variable 3 | import time 4 | import os 5 | import sys 6 | 7 | from utils import AverageMeter, calculate_accuracy 8 | 9 | 10 | def train_epoch(epoch, data_loader, model, criterion, optimizer, opt, 11 | epoch_logger, batch_logger): 12 | print('train at epoch {}'.format(epoch)) 13 | 14 | model.train() 15 | 16 | batch_time = AverageMeter() 17 | data_time = AverageMeter() 18 | losses = AverageMeter() 19 | accuracies = AverageMeter() 20 | 21 | end_time = time.time() 22 | for i, (inputs, targets) in enumerate(data_loader): 23 | data_time.update(time.time() - end_time) 24 | 25 | if not opt.no_cuda: 26 | targets = targets.cuda(async=True) 27 | inputs = Variable(inputs) 28 | targets = Variable(targets) 29 | outputs = model(inputs) 30 | loss = criterion(outputs, targets) 31 | acc = calculate_accuracy(outputs, targets) 32 | 33 | losses.update(loss.data[0], inputs.size(0)) 34 | accuracies.update(acc, inputs.size(0)) 35 | 36 | optimizer.zero_grad() 37 | loss.backward() 38 | optimizer.step() 39 | 40 | batch_time.update(time.time() - end_time) 41 | end_time = time.time() 42 | 43 | batch_logger.log({ 44 | 'epoch': epoch, 45 | 'batch': i + 1, 46 | 'iter': (epoch - 1) * len(data_loader) + (i + 1), 47 | 'loss': losses.val, 48 | 'acc': accuracies.val, 49 | 'lr': optimizer.param_groups[0]['lr'] 50 | }) 51 | 52 | print('Epoch: [{0}][{1}/{2}]\t' 53 | 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 54 | 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' 55 | 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 56 | 'Acc {acc.val:.3f} ({acc.avg:.3f})'.format( 57 | epoch, i + 1, len(data_loader), batch_time=batch_time, 58 | data_time=data_time, loss=losses, acc=accuracies)) 59 | 60 | epoch_logger.log({ 61 | 'epoch': epoch, 62 | 'loss': losses.avg, 63 | 'acc': accuracies.avg, 64 | 'lr': optimizer.param_groups[0]['lr'] 65 | }) 66 | 67 | if epoch % opt.checkpoint == 0: 68 | save_file_path = os.path.join(opt.result_path, 'save_{}.pth'.format(epoch)) 69 | states = { 70 | 'epoch': epoch + 1, 71 | 'arch': opt.arch, 72 | 'state_dict': model.state_dict(), 73 | 'optimizer' : optimizer.state_dict(), 74 | } 75 | torch.save(states, save_file_path) 76 | -------------------------------------------------------------------------------- /c3d_feat_extract/validation.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Variable 3 | import time 4 | import sys 5 | 6 | from utils import AverageMeter, calculate_accuracy 7 | 8 | 9 | def val_epoch(epoch, data_loader, model, criterion, 
opt, logger): 10 | print('validation at epoch {}'.format(epoch)) 11 | 12 | model.eval() 13 | 14 | batch_time = AverageMeter() 15 | data_time = AverageMeter() 16 | losses = AverageMeter() 17 | accuracies = AverageMeter() 18 | 19 | end_time = time.time() 20 | for i, (inputs, targets) in enumerate(data_loader): 21 | data_time.update(time.time() - end_time) 22 | 23 | if not opt.no_cuda: 24 | targets = targets.cuda(async=True) 25 | inputs = Variable(inputs, volatile=True) 26 | targets = Variable(targets, volatile=True) 27 | outputs = model(inputs) 28 | loss = criterion(outputs, targets) 29 | acc = calculate_accuracy(outputs, targets) 30 | 31 | losses.update(loss.data[0], inputs.size(0)) 32 | accuracies.update(acc, inputs.size(0)) 33 | 34 | batch_time.update(time.time() - end_time) 35 | end_time = time.time() 36 | 37 | print('Epoch: [{0}][{1}/{2}]\t' 38 | 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 39 | 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' 40 | 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 41 | 'Acc {acc.val:.3f} ({acc.avg:.3f})'.format( 42 | epoch, i + 1, len(data_loader), batch_time=batch_time, 43 | data_time=data_time, loss=losses, acc=accuracies)) 44 | 45 | logger.log({ 46 | 'epoch': epoch, 47 | 'loss': losses.avg, 48 | 'acc': accuracies.avg 49 | }) 50 | 51 | return losses.avg 52 | -------------------------------------------------------------------------------- /caffe_feat_extract.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import subprocess 3 | import glob 4 | import os 5 | import numpy as np 6 | import os 7 | import argparse 8 | import sys 9 | from tqdm import tqdm 10 | from PIL import Image 11 | # caffe_root = 'your caffe root' 12 | # sys.path.insert(0, caffe_root + '/python') 13 | import caffe 14 | 15 | def extract_frames(video, dst): 16 | with open(os.devnull, "w") as ffmpeg_log: 17 | if os.path.exists(dst): 18 | print(" cleanup: " + dst + "/") 19 | shutil.rmtree(dst) 20 | os.makedirs(dst) 21 | video_to_frames_command = ["ffmpeg", 22 | # (optional) overwrite output file if it exists 23 | '-y', 24 | '-i', video, # input file 25 | '-vf', "scale=400:300", # input file 26 | '-qscale:v', "2", # quality for JPEG 27 | '{0}/%06d.jpg'.format(dst)] 28 | subprocess.call(video_to_frames_command, 29 | stdout=ffmpeg_log, stderr=ffmpeg_log) 30 | 31 | 32 | def extract_feats(params, net): 33 | dir_fc = params['output_dir'] 34 | if not os.path.isdir(dir_fc): 35 | os.mkdir(dir_fc) 36 | print("save video feats to %s" % (dir_fc)) 37 | video_list = glob.glob(os.path.join(params['video_path'], '*.mp4')) 38 | mean_value = np.array([104.00698793, 116.66876762, 122.67891434]) 39 | # np.array((102.144, 102.144, 108.64)) 40 | for video in tqdm(video_list): 41 | video_id = video.split("/")[-1].split(".")[0] 42 | dst = video_id 43 | extract_frames(video, dst) 44 | image_list = sorted(glob.glob(os.path.join(dst, '*.jpg'))) 45 | samples = np.round(np.linspace( 46 | 0, len(image_list) - 1, params['n_frame_steps'])) 47 | image_list = [image_list[int(sample)] for sample in samples] 48 | ims = [] 49 | img_feats = [] 50 | for index, iImg in enumerate(range(len(image_list))): 51 | im = Image.open(image_list[iImg]) 52 | im = im.resize((224, 224), Image.BILINEAR) 53 | im = np.array(im, dtype=np.float32) 54 | im = im[:, :, ::-1] # RGB->BGR 55 | im -= mean_value # BGR 56 | im = im.transpose((2, 0, 1)) # im:(c,h,w) 57 | im = im[np.newaxis, ...] 
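            # Added notes on the preprocessing above: PIL loads frames as RGB,
            # so the channel flip yields the BGR order Caffe models expect, the
            # BGR mean is then subtracted, and the array is moved from
            # (H, W, C) to (1, C, H, W) so frames can be concatenated into a
            # batch. Below, frames are accumulated until batch_size is reached,
            # pushed through the network in one forward pass, and the squeezed
            # 'pool5' activations are kept as the per-frame features.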
58 | ims.append(im) 59 | if (index+1) % params['batch_size'] == 0: 60 | ims = np.concatenate(ims, axis=0) 61 | net.blobs['data'].reshape(*ims.shape) 62 | net.blobs['data'].data[...] = ims 63 | output = net.forward() 64 | img_feats.append(net.blobs['pool5'].data.squeeze()) 65 | ims = [] 66 | img_feats = np.concatenate(img_feats, axis=0) 67 | # Save the inception features 68 | outfile = os.path.join(dir_fc, video_id + '.npy') 69 | np.save(outfile, img_feats) 70 | # cleanup 71 | shutil.rmtree(dst) 72 | 73 | 74 | if __name__ == '__main__': 75 | parser = argparse.ArgumentParser() 76 | parser.add_argument("--gpu", dest='gpu', type=int, default=0, 77 | help='Set CUDA_VISIBLE_DEVICES environment variable, optional') 78 | parser.add_argument('--batch_size', type=int, default=20, help='minibatch size') 79 | parser.add_argument("--output_dir", dest='output_dir', type=str, 80 | default='data/feats/resnet152', help='directory to store features') 81 | parser.add_argument("--n_frame_steps", dest='n_frame_steps', type=int, default=80, 82 | help='how many frames to sampler per video') 83 | parser.add_argument("--video_path", dest='video_path', type=str, 84 | default='data/videos', help='path to video dataset') 85 | parser.add_argument("--model_weight", dest="model_weight", type=str, 86 | default='pretrained_models/resnet152_places365.caffemodel', 87 | help='model_weight') 88 | parser.add_argument("--model_deploy", dest="model_deploy", type=str, 89 | default='pretrained_models/deploy_resnet152_places365.prototxt', 90 | help='deploy') 91 | args = parser.parse_args() 92 | params = vars(args) 93 | # TODO: remove this limit 94 | assert params['n_frame_steps'] % params['batch_size'] == 0, 'For simplicity, n_frame_steps%batch_size must = 0' 95 | caffe.set_device(params['gpu']) 96 | caffe.set_mode_gpu() 97 | model_weights = params['model_weight'] 98 | model_def = params['model_deploy'] 99 | net = caffe.Net(model_def, model_weights, caffe.TEST) 100 | extract_feats(params, net) 101 | -------------------------------------------------------------------------------- /caffe_feat_extract.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python caffe_feat_extract.py \ 4 | --video_path data/videos \ 5 | --output_dir data/feats/resnet269 \ 6 | --model_weight pretrained_models/resnet269-v2.caffemodel \ 7 | --model_deploy pretrained_models/deploy_resnet269-v2.prototxt \ 8 | --n_frame_steps 80 \ 9 | --gpu 1 \ 10 | --batch_size 10 \ 11 | 12 | 13 | #python caffe_feat_extract.py \ 14 | #--video_path data/videos \ 15 | #--output_dir data/feats/resnet152_places365 \ 16 | #--model_weight pretrained_models/resnet152_places365.caffemodel \ 17 | #--model_deploy pretrained_models/deploy_resnet152_places365.prototxt \ 18 | #--n_frame_steps 80 \ 19 | #--gpu 0 \ 20 | -------------------------------------------------------------------------------- /coco-caption/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 DingXia 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright 
notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /coco-caption/pyciderevalcap/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /coco-caption/pyciderevalcap/cider/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /coco-caption/pyciderevalcap/cider/cider.py: -------------------------------------------------------------------------------- 1 | # Filename: cider.py 2 | # 3 | # 4 | # Description: Describes the class to compute the CIDEr 5 | # (Consensus-Based Image Description Evaluation) Metric 6 | # by Vedantam, Zitnick, and Parikh (http://arxiv.org/abs/1411.5726) 7 | # 8 | # Creation Date: Sun Feb 8 14:16:54 2015 9 | # 10 | # Authors: Ramakrishna Vedantam and 11 | # Tsung-Yi Lin 12 | 13 | from cider_scorer import CiderScorer 14 | 15 | 16 | class Cider: 17 | """ 18 | Main Class to compute the CIDEr metric 19 | 20 | """ 21 | def __init__(self, n=4, df="corpus"): 22 | """ 23 | Initialize the CIDEr scoring function 24 | : param n (int): n-gram size 25 | : param df (string): specifies where to get the IDF values from 26 | takes values 'corpus', 'coco-train' 27 | : return: None 28 | """ 29 | # set cider to sum over 1 to 4-grams 30 | self._n = n 31 | self._df = df 32 | self.cider_scorer = CiderScorer(n=self._n, df_mode=self._df) 33 | 34 | def compute_score(self, gts, res): 35 | """ 36 | Main function to compute CIDEr score 37 | : param gts (dict) : {image:tokenized reference sentence} 38 | : param res (dict) : {image:tokenized candidate sentence} 39 | : return: cider (float) : computed CIDEr score for the corpus 40 | """ 41 | 42 | # clear all the previous hypos and refs 43 | self.cider_scorer.clear() 44 | 45 | for res_id in res: 46 | 47 | hypo = res_id['caption'] 48 | ref = gts[res_id['image_id']] 49 | 50 | # Sanity check. 
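            # Added note: unlike the docstring above, `res` here is a list of
            # {'image_id': ..., 'caption': [candidate]} entries, each holding
            # exactly one tokenized candidate sentence, while gts[image_id] is
            # a non-empty list of tokenized references; the asserts below
            # enforce exactly this shape.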
51 | assert(type(hypo) is list) 52 | assert(len(hypo) == 1) 53 | assert(type(ref) is list) 54 | assert(len(ref) > 0) 55 | self.cider_scorer += (hypo[0], ref) 56 | 57 | (score, scores) = self.cider_scorer.compute_score() 58 | 59 | return score, scores 60 | 61 | def method(self): 62 | return "CIDEr" 63 | -------------------------------------------------------------------------------- /coco-caption/pyciderevalcap/ciderD/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /coco-caption/pyciderevalcap/ciderD/ciderD.py: -------------------------------------------------------------------------------- 1 | # Filename: ciderD.py 2 | # 3 | # Description: Describes the class to compute the CIDEr-D (Consensus-Based Image Description Evaluation) Metric 4 | # by Vedantam, Zitnick, and Parikh (http://arxiv.org/abs/1411.5726) 5 | # 6 | # Creation Date: Sun Feb 8 14:16:54 2015 7 | # 8 | # Authors: Ramakrishna Vedantam and Tsung-Yi Lin 9 | 10 | from .ciderD_scorer import CiderScorer 11 | 12 | 13 | class CiderD: 14 | """ 15 | Main Class to compute the CIDEr metric 16 | 17 | """ 18 | def __init__(self, n=4, sigma=6.0, df="corpus"): 19 | # set cider to sum over 1 to 4-grams 20 | self._n = n 21 | # set the standard deviation parameter for gaussian penalty 22 | self._sigma = sigma 23 | # set which where to compute document frequencies from 24 | self._df = df 25 | self.cider_scorer = CiderScorer(n=self._n, df_mode=self._df) 26 | 27 | def compute_score(self, gts, res): 28 | """ 29 | Main function to compute CIDEr score 30 | :param hypo_for_image (dict) : dictionary with key and value 31 | ref_for_image (dict) : dictionary with key and value 32 | :return: cider (float) : computed CIDEr score for the corpus 33 | """ 34 | 35 | # clear all the previous hypos and refs 36 | self.cider_scorer.clear() 37 | for res_id in res: 38 | 39 | hypo = res_id['caption'] 40 | ref = gts[res_id['image_id']] 41 | 42 | # Sanity check. 43 | assert(type(hypo) is list) 44 | assert(len(hypo) == 1) 45 | assert(type(ref) is list) 46 | assert(len(ref) > 0) 47 | self.cider_scorer += (hypo[0], ref) 48 | 49 | (score, scores) = self.cider_scorer.compute_score() 50 | 51 | return score, scores 52 | 53 | def method(self): 54 | return "CIDEr-D" 55 | -------------------------------------------------------------------------------- /coco-caption/pyciderevalcap/eval.py: -------------------------------------------------------------------------------- 1 | __author__ = 'rama' 2 | from tokenizer.ptbtokenizer import PTBTokenizer 3 | from cider.cider import Cider 4 | from ciderD.ciderD import CiderD 5 | 6 | 7 | class CIDErEvalCap: 8 | def __init__(self, gts, res, df): 9 | print 'tokenization...' 10 | tokenizer = PTBTokenizer('gts') 11 | _gts = tokenizer.tokenize(gts) 12 | print 'tokenized refs' 13 | tokenizer = PTBTokenizer('res') 14 | _res = tokenizer.tokenize(res) 15 | print 'tokenized cands' 16 | 17 | self.gts = _gts 18 | self.res = _res 19 | self.df = df 20 | 21 | def evaluate(self): 22 | # ================================================= 23 | # Set up scorers 24 | # ================================================= 25 | 26 | print 'setting up scorers...' 
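        # Added note: `self.df` selects the document-frequency source used for
        # the IDF weights; per cider.Cider.__init__ it takes the values
        # 'corpus' or 'coco-train'.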
27 | scorers = [ 28 | (Cider(df=self.df), "CIDEr"), (CiderD(df=self.df), "CIDErD") 29 | ] 30 | 31 | # ================================================= 32 | # Compute scores 33 | # ================================================= 34 | metric_scores = {} 35 | for scorer, method in scorers: 36 | print 'computing %s score...' % (scorer.method()) 37 | score, scores = scorer.compute_score(self.gts, self.res) 38 | print "Mean %s score: %0.3f" % (method, score) 39 | metric_scores[method] = list(scores) 40 | return metric_scores 41 | -------------------------------------------------------------------------------- /coco-caption/pyciderevalcap/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'hfang' 2 | -------------------------------------------------------------------------------- /coco-caption/pyciderevalcap/tokenizer/ptbtokenizer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # File Name : ptbtokenizer.py 4 | # 5 | # Description : Do the PTB Tokenization and remove punctuations. 6 | # 7 | # Creation Date : 29-12-2014 8 | # Last Modified : Thu Mar 19 09:53:35 2015 9 | # Authors : Hao Fang and Tsung-Yi Lin 10 | 11 | import os 12 | import pdb # python debugger 13 | import sys 14 | import subprocess 15 | import re 16 | import tempfile 17 | import itertools 18 | 19 | # path to the stanford corenlp jar 20 | STANFORD_CORENLP_3_4_1_JAR = 'stanford-corenlp-3.4.1.jar' 21 | 22 | # punctuations to be removed from the sentences 23 | PUNCTUATIONS = ["''", "'", "``", "`", "-LRB-", "-RRB-", "-LCB-", "-RCB-", \ 24 | ".", "?", "!", ",", ":", "-", "--", "...", ";"] 25 | 26 | class PTBTokenizer: 27 | """Python wrapper of Stanford PTBTokenizer""" 28 | def __init__(self, _source='gts'): 29 | self.source = _source 30 | 31 | def tokenize(self, captions_for_image): 32 | cmd = ['java', '-cp', STANFORD_CORENLP_3_4_1_JAR, \ 33 | 'edu.stanford.nlp.process.PTBTokenizer', \ 34 | '-preserveLines', '-lowerCase'] 35 | 36 | # ====================================================== 37 | # prepare data for PTB Tokenizer 38 | # ====================================================== 39 | 40 | if self.source == 'gts': 41 | image_id = [k for k, v in captions_for_image.items() for _ in range(len(v))] 42 | sentences = '\n'.join([c['caption'].replace('\n', ' ') for k, v in captions_for_image.items() for c in v]) 43 | final_tokenized_captions_for_image = {} 44 | 45 | elif self.source == 'res': 46 | index = [i for i, v in enumerate(captions_for_image)] 47 | image_id = [v["image_id"] for v in captions_for_image] 48 | sentences = '\n'.join(v["caption"].replace('\n', ' ') for v in captions_for_image ) 49 | final_tokenized_captions_for_index = [] 50 | 51 | # ====================================================== 52 | # save sentences to temporary file 53 | # ====================================================== 54 | path_to_jar_dirname=os.path.dirname(os.path.abspath(__file__)) 55 | tmp_file = tempfile.NamedTemporaryFile(delete=False, dir=path_to_jar_dirname) 56 | tmp_file.write(sentences) 57 | tmp_file.close() 58 | 59 | # ====================================================== 60 | # tokenize sentence 61 | # ====================================================== 62 | cmd.append(os.path.basename(tmp_file.name)) 63 | p_tokenizer = subprocess.Popen(cmd, cwd=path_to_jar_dirname, \ 64 | stdout=subprocess.PIPE) 65 | token_lines = p_tokenizer.communicate(input=sentences.rstrip())[0] 66 | lines = 
token_lines.split('\n') 67 | # remove temp file 68 | os.remove(tmp_file.name) 69 | 70 | # ====================================================== 71 | # create dictionary for tokenized captions 72 | # ====================================================== 73 | if self.source == 'gts': 74 | for k, line in zip(image_id, lines): 75 | if not k in final_tokenized_captions_for_image: 76 | final_tokenized_captions_for_image[k] = [] 77 | tokenized_caption = ' '.join([w for w in line.rstrip().split(' ') \ 78 | if w not in PUNCTUATIONS]) 79 | final_tokenized_captions_for_image[k].append(tokenized_caption) 80 | 81 | return final_tokenized_captions_for_image 82 | 83 | elif self.source == 'res': 84 | for k, img, line in zip(index, image_id, lines): 85 | tokenized_caption = ' '.join([w for w in line.rstrip().split(' ') \ 86 | if w not in PUNCTUATIONS]) 87 | final_tokenized_captions_for_index.append({'image_id': img, 'caption': [tokenized_caption]}) 88 | 89 | return final_tokenized_captions_for_index 90 | -------------------------------------------------------------------------------- /coco-caption/pyciderevalcap/tokenizer/stanford-corenlp-3.4.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sundrops/video-caption.pytorch/66430862696678f7233a1fc67af22f0525b22e52/coco-caption/pyciderevalcap/tokenizer/stanford-corenlp-3.4.1.jar -------------------------------------------------------------------------------- /coco-caption/pyciderevalcap/tokenizer/tmpBF49XX: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sundrops/video-caption.pytorch/66430862696678f7233a1fc67af22f0525b22e52/coco-caption/pyciderevalcap/tokenizer/tmpBF49XX -------------------------------------------------------------------------------- /coco-caption/pyciderevalcap/tokenizer/tmpql9uU7: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sundrops/video-caption.pytorch/66430862696678f7233a1fc67af22f0525b22e52/coco-caption/pyciderevalcap/tokenizer/tmpql9uU7 -------------------------------------------------------------------------------- /coco-caption/pyciderevalcap/tokenizer/tmpuCp_T0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sundrops/video-caption.pytorch/66430862696678f7233a1fc67af22f0525b22e52/coco-caption/pyciderevalcap/tokenizer/tmpuCp_T0 -------------------------------------------------------------------------------- /coco-caption/pyciderevalcap/tokenizer/tmpxAmV_C: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sundrops/video-caption.pytorch/66430862696678f7233a1fc67af22f0525b22e52/coco-caption/pyciderevalcap/tokenizer/tmpxAmV_C -------------------------------------------------------------------------------- /coco-caption/pyciderevalcap/tokenizer/tmpzNW4I2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sundrops/video-caption.pytorch/66430862696678f7233a1fc67af22f0525b22e52/coco-caption/pyciderevalcap/tokenizer/tmpzNW4I2 -------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- 
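The pycocoevalcap scorers that follow (Bleu, Cider, and the other metrics) share the same `compute_score(gts, res)` interface. As a hedged illustration only, a minimal driver might look like the sketch below; the video id and captions are invented, the sentences are assumed to be already tokenized (PTBTokenizer normally handles this), and the `coco-caption` directory is assumed to be on `sys.path`. With a toy single-video corpus the CIDEr IDF statistics are degenerate, so the numbers only demonstrate the calling convention.

```python
# Minimal usage sketch (illustrative assumptions: captions are already
# tokenized, and the coco-caption directory has been added to sys.path).
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.cider.cider import Cider

# gts maps an id to a list of reference sentences,
# res maps the same id to a single-item list with the candidate sentence.
gts = {'video0': ['a man is playing a guitar', 'someone plays the guitar']}
res = {'video0': ['a man plays a guitar']}

for scorer, name in [(Bleu(4), 'Bleu'), (Cider(), 'CIDEr')]:
    score, _ = scorer.compute_score(gts, res)
    print('%s: %s' % (name, score))
```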
/coco-caption/pycocoevalcap/bleu/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015 Xinlei Chen, Hao Fang, Tsung-Yi Lin, and Ramakrishna Vedantam 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/bleu/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/bleu/bleu.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # File Name : bleu.py 4 | # 5 | # Description : Wrapper for BLEU scorer. 6 | # 7 | # Creation Date : 06-01-2015 8 | # Last Modified : Thu 19 Mar 2015 09:13:28 PM PDT 9 | # Authors : Hao Fang and Tsung-Yi Lin 10 | 11 | from .bleu_scorer import BleuScorer 12 | 13 | 14 | class Bleu: 15 | def __init__(self, n=4): 16 | # default compute Blue score up to 4 17 | self._n = n 18 | self._hypo_for_image = {} 19 | self.ref_for_image = {} 20 | 21 | def compute_score(self, gts, res): 22 | 23 | assert(sorted(gts.keys()) == sorted(res.keys())) 24 | imgIds = sorted(gts.keys()) 25 | 26 | bleu_scorer = BleuScorer(n=self._n) 27 | for id in imgIds: 28 | hypo = res[id] 29 | ref = gts[id] 30 | 31 | # Sanity check. 
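            # Added note: each hypothesis list must hold exactly one tokenized
            # candidate sentence and each reference list at least one; the
            # asserts below enforce this before feeding the BLEU scorer.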
32 | assert(type(hypo) is list) 33 | assert(len(hypo) == 1) 34 | assert(type(ref) is list) 35 | assert(len(ref) >= 1) 36 | 37 | bleu_scorer += (hypo[0], ref) 38 | 39 | #score, scores = bleu_scorer.compute_score(option='shortest') 40 | score, scores = bleu_scorer.compute_score(option='closest', verbose=1) 41 | #score, scores = bleu_scorer.compute_score(option='average', verbose=1) 42 | 43 | # return (bleu, bleu_info) 44 | return score, scores 45 | 46 | def method(self): 47 | return "Bleu" 48 | -------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/cider/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/cider/cider.py: -------------------------------------------------------------------------------- 1 | # Filename: cider.py 2 | # 3 | # Description: Describes the class to compute the CIDEr (Consensus-Based Image Description Evaluation) Metric 4 | # by Vedantam, Zitnick, and Parikh (http://arxiv.org/abs/1411.5726) 5 | # 6 | # Creation Date: Sun Feb 8 14:16:54 2015 7 | # 8 | # Authors: Ramakrishna Vedantam and Tsung-Yi Lin 9 | 10 | from .cider_scorer import CiderScorer 11 | import pdb 12 | 13 | class Cider: 14 | """ 15 | Main Class to compute the CIDEr metric 16 | 17 | """ 18 | def __init__(self, test=None, refs=None, n=4, sigma=6.0): 19 | # set cider to sum over 1 to 4-grams 20 | self._n = n 21 | # set the standard deviation parameter for gaussian penalty 22 | self._sigma = sigma 23 | 24 | def compute_score(self, gts, res): 25 | """ 26 | Main function to compute CIDEr score 27 | :param hypo_for_image (dict) : dictionary with key and value 28 | ref_for_image (dict) : dictionary with key and value 29 | :return: cider (float) : computed CIDEr score for the corpus 30 | """ 31 | 32 | assert(sorted(gts.keys()) == sorted(res.keys())) 33 | imgIds = sorted(gts.keys()) 34 | 35 | cider_scorer = CiderScorer(n=self._n, sigma=self._sigma) 36 | 37 | for id in imgIds: 38 | hypo = res[id] 39 | ref = gts[id] 40 | 41 | # Sanity check. 42 | assert(type(hypo) is list) 43 | assert(len(hypo) == 1) 44 | assert(type(ref) is list) 45 | assert(len(ref) >= 1) 46 | 47 | cider_scorer += (hypo[0], ref) 48 | 49 | (score, scores) = cider_scorer.compute_score() 50 | 51 | return score, scores 52 | 53 | def method(self): 54 | return "CIDEr" 55 | -------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/cider/cider_scorer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Tsung-Yi Lin 3 | # Ramakrishna Vedantam 4 | 5 | import copy 6 | from collections import defaultdict 7 | import numpy as np 8 | import pdb 9 | import math 10 | 11 | def precook(s, n=4, out=False): 12 | """ 13 | Takes a string as input and returns an object that can be given to 14 | either cook_refs or cook_test. This is optional: cook_refs and cook_test 15 | can take string arguments as well. 
16 | :param s: string : sentence to be converted into ngrams 17 | :param n: int : number of ngrams for which representation is calculated 18 | :return: term frequency vector for occuring ngrams 19 | """ 20 | words = s.split() 21 | counts = defaultdict(int) 22 | for k in range(1,n+1): 23 | for i in range(len(words)-k+1): 24 | ngram = tuple(words[i:i+k]) 25 | counts[ngram] += 1 26 | return counts 27 | 28 | def cook_refs(refs, n=4): ## lhuang: oracle will call with "average" 29 | '''Takes a list of reference sentences for a single segment 30 | and returns an object that encapsulates everything that BLEU 31 | needs to know about them. 32 | :param refs: list of string : reference sentences for some image 33 | :param n: int : number of ngrams for which (ngram) representation is calculated 34 | :return: result (list of dict) 35 | ''' 36 | return [precook(ref, n) for ref in refs] 37 | 38 | def cook_test(test, n=4): 39 | '''Takes a test sentence and returns an object that 40 | encapsulates everything that BLEU needs to know about it. 41 | :param test: list of string : hypothesis sentence for some image 42 | :param n: int : number of ngrams for which (ngram) representation is calculated 43 | :return: result (dict) 44 | ''' 45 | return precook(test, n, True) 46 | 47 | class CiderScorer(object): 48 | """CIDEr scorer. 49 | """ 50 | 51 | def copy(self): 52 | ''' copy the refs.''' 53 | new = CiderScorer(n=self.n) 54 | new.ctest = copy.copy(self.ctest) 55 | new.crefs = copy.copy(self.crefs) 56 | return new 57 | 58 | def __init__(self, test=None, refs=None, n=4, sigma=6.0): 59 | ''' singular instance ''' 60 | self.n = n 61 | self.sigma = sigma 62 | self.crefs = [] 63 | self.ctest = [] 64 | self.document_frequency = defaultdict(float) 65 | self.cook_append(test, refs) 66 | self.ref_len = None 67 | 68 | def cook_append(self, test, refs): 69 | '''called by constructor and __iadd__ to avoid creating new instances.''' 70 | 71 | if refs is not None: 72 | self.crefs.append(cook_refs(refs)) 73 | if test is not None: 74 | self.ctest.append(cook_test(test)) ## N.B.: -1 75 | else: 76 | self.ctest.append(None) # lens of crefs and ctest have to match 77 | 78 | def size(self): 79 | assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest)) 80 | return len(self.crefs) 81 | 82 | def __iadd__(self, other): 83 | '''add an instance (e.g., from another sentence).''' 84 | 85 | if type(other) is tuple: 86 | ## avoid creating new CiderScorer instances 87 | self.cook_append(other[0], other[1]) 88 | else: 89 | self.ctest.extend(other.ctest) 90 | self.crefs.extend(other.crefs) 91 | 92 | return self 93 | def compute_doc_freq(self): 94 | ''' 95 | Compute term frequency for reference data. 96 | This will be used to compute idf (inverse document frequency later) 97 | The term frequency is stored in the object 98 | :return: None 99 | ''' 100 | for refs in self.crefs: 101 | # refs, k ref captions of one image 102 | for ngram in set([ngram for ref in refs for (ngram,count) in ref.items()]): 103 | self.document_frequency[ngram] += 1 104 | # maxcounts[ngram] = max(maxcounts.get(ngram,0), count) 105 | 106 | def compute_cider(self): 107 | def counts2vec(cnts): 108 | """ 109 | Function maps counts of ngram to vector of tfidf weights. 110 | The function returns vec, an array of dictionary that store mapping of n-gram and tf-idf weights. 111 | The n-th entry of array denotes length of n-grams. 
112 | :param cnts: 113 | :return: vec (array of dict), norm (array of float), length (int) 114 | """ 115 | vec = [defaultdict(float) for _ in range(self.n)] 116 | length = 0 117 | norm = [0.0 for _ in range(self.n)] 118 | for (ngram,term_freq) in cnts.items(): 119 | # give word count 1 if it doesn't appear in reference corpus 120 | df = np.log(max(1.0, self.document_frequency[ngram])) 121 | # ngram index 122 | n = len(ngram)-1 123 | # tf (term_freq) * idf (precomputed idf) for n-grams 124 | vec[n][ngram] = float(term_freq)*(self.ref_len - df) 125 | # compute norm for the vector. the norm will be used for computing similarity 126 | norm[n] += pow(vec[n][ngram], 2) 127 | 128 | if n == 1: 129 | length += term_freq 130 | norm = [np.sqrt(n) for n in norm] 131 | return vec, norm, length 132 | 133 | def sim(vec_hyp, vec_ref, norm_hyp, norm_ref, length_hyp, length_ref): 134 | ''' 135 | Compute the cosine similarity of two vectors. 136 | :param vec_hyp: array of dictionary for vector corresponding to hypothesis 137 | :param vec_ref: array of dictionary for vector corresponding to reference 138 | :param norm_hyp: array of float for vector corresponding to hypothesis 139 | :param norm_ref: array of float for vector corresponding to reference 140 | :param length_hyp: int containing length of hypothesis 141 | :param length_ref: int containing length of reference 142 | :return: array of score for each n-grams cosine similarity 143 | ''' 144 | delta = float(length_hyp - length_ref) 145 | # measure consine similarity 146 | val = np.array([0.0 for _ in range(self.n)]) 147 | for n in range(self.n): 148 | # ngram 149 | for (ngram,count) in vec_hyp[n].items(): 150 | # vrama91 : added clipping 151 | val[n] += min(vec_hyp[n][ngram], vec_ref[n][ngram]) * vec_ref[n][ngram] 152 | 153 | if (norm_hyp[n] != 0) and (norm_ref[n] != 0): 154 | val[n] /= (norm_hyp[n]*norm_ref[n]) 155 | 156 | assert(not math.isnan(val[n])) 157 | # vrama91: added a length based gaussian penalty 158 | val[n] *= np.e**(-(delta**2)/(2*self.sigma**2)) 159 | return val 160 | 161 | # compute log reference length 162 | self.ref_len = np.log(float(len(self.crefs))) 163 | 164 | scores = [] 165 | for test, refs in zip(self.ctest, self.crefs): 166 | # compute vector for test captions 167 | vec, norm, length = counts2vec(test) 168 | # compute vector for ref captions 169 | score = np.array([0.0 for _ in range(self.n)]) 170 | for ref in refs: 171 | vec_ref, norm_ref, length_ref = counts2vec(ref) 172 | score += sim(vec, vec_ref, norm, norm_ref, length, length_ref) 173 | # change by vrama91 - mean of ngram scores, instead of sum 174 | score_avg = np.mean(score) 175 | # divide by number of references 176 | score_avg /= len(refs) 177 | # multiply score by 10 178 | score_avg *= 10.0 179 | # append score of an image to the score list 180 | scores.append(score_avg) 181 | return scores 182 | 183 | def compute_score(self, option=None, verbose=0): 184 | # compute idf 185 | self.compute_doc_freq() 186 | # assert to check document frequency 187 | assert(len(self.ctest) >= max(self.document_frequency.values())) 188 | # compute cider score 189 | score = self.compute_cider() 190 | # debug 191 | # print score 192 | return np.mean(np.array(score)), np.array(score) -------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/eval.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | from .tokenizer.ptbtokenizer import PTBTokenizer 3 | from .bleu.bleu import Bleu 4 | 
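To make the weighting in `compute_cider` above concrete: each n-gram is weighted by tf * (log N_images - log max(1, df)), and each per-n cosine similarity is scaled by the Gaussian length penalty exp(-delta^2 / (2 * sigma^2)) with sigma = 6. A small numeric sketch (the corpus size and document frequencies are invented):

```python
import numpy as np

num_images = 1000                     # hypothetical corpus size
ref_len = np.log(float(num_images))   # as set in compute_cider

print(ref_len - np.log(800.0))   # ~0.22 -- very common n-gram, tiny idf weight
print(ref_len - np.log(3.0))     # ~5.81 -- rare n-gram, large idf weight

sigma = 6.0
for delta in (0, 3, 10):   # length difference between hypothesis and reference
    print(delta, np.e ** (-(delta ** 2) / (2 * sigma ** 2)))
# 0 -> 1.00, 3 -> ~0.88, 10 -> ~0.25
```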
from .meteor.meteor import Meteor 5 | from .rouge.rouge import Rouge 6 | from .cider.cider import Cider 7 | from .spice.spice import Spice 8 | 9 | class COCOEvalCap: 10 | def __init__(self, coco, cocoRes): 11 | self.evalImgs = [] 12 | self.eval = {} 13 | self.imgToEval = {} 14 | self.coco = coco 15 | self.cocoRes = cocoRes 16 | self.params = {'image_id': coco.getImgIds()} 17 | 18 | def evaluate(self): 19 | imgIds = self.params['image_id'] 20 | # imgIds = self.coco.getImgIds() 21 | gts = {} 22 | res = {} 23 | for imgId in imgIds: 24 | gts[imgId] = self.coco.imgToAnns[imgId] 25 | res[imgId] = self.cocoRes.imgToAnns[imgId] 26 | 27 | # ================================================= 28 | # Set up scorers 29 | # ================================================= 30 | print('tokenization...') 31 | tokenizer = PTBTokenizer() 32 | gts = tokenizer.tokenize(gts) 33 | res = tokenizer.tokenize(res) 34 | 35 | # ================================================= 36 | # Set up scorers 37 | # ================================================= 38 | print('setting up scorers...') 39 | scorers = [ 40 | (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]), 41 | (Meteor(),"METEOR"), 42 | (Rouge(), "ROUGE_L"), 43 | (Cider(), "CIDEr"), 44 | (Spice(), "SPICE") 45 | ] 46 | 47 | # ================================================= 48 | # Compute scores 49 | # ================================================= 50 | for scorer, method in scorers: 51 | print('computing %s score...'%(scorer.method())) 52 | score, scores = scorer.compute_score(gts, res) 53 | if type(method) == list: 54 | for sc, scs, m in zip(score, scores, method): 55 | self.setEval(sc, m) 56 | self.setImgToEvalImgs(scs, gts.keys(), m) 57 | print("%s: %0.3f"%(m, sc)) 58 | else: 59 | self.setEval(score, method) 60 | self.setImgToEvalImgs(scores, gts.keys(), method) 61 | print("%s: %0.3f"%(method, score)) 62 | self.setEvalImgs() 63 | 64 | def setEval(self, score, method): 65 | self.eval[method] = score 66 | 67 | def setImgToEvalImgs(self, scores, imgIds, method): 68 | for imgId, score in zip(sorted(imgIds), scores): 69 | if not imgId in self.imgToEval: 70 | self.imgToEval[imgId] = {} 71 | self.imgToEval[imgId]["image_id"] = imgId 72 | self.imgToEval[imgId][method] = score 73 | 74 | def setEvalImgs(self): 75 | self.evalImgs = [self.imgToEval[imgId] for imgId in sorted(self.imgToEval.keys())] 76 | -------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/meteor/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/meteor/meteor-1.5.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sundrops/video-caption.pytorch/66430862696678f7233a1fc67af22f0525b22e52/coco-caption/pycocoevalcap/meteor/meteor-1.5.jar -------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/meteor/meteor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Python wrapper for METEOR implementation, by Xinlei Chen 4 | # Acknowledge Michael Denkowski for the generous discussion and help 5 | 6 | import os 7 | import sys 8 | import subprocess 9 | import threading 10 | 11 | # Assumes meteor-1.5.jar is in the same directory as meteor.py. Change as needed. 
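A minimal sketch of driving the `COCOEvalCap` class above with the `COCO` loader bundled under `pycocotools` (the annotation and result paths are placeholders; METEOR needs Java on the PATH, and the `Spice` import above additionally requires a `pycocoevalcap/spice` module to be present):

```python
import sys
sys.path.append('coco-caption')

from pycocotools.coco import COCO
from pycocoevalcap.eval import COCOEvalCap

# placeholder files: ground-truth annotations plus a result file of the
# form [{"image_id": ..., "caption": ...}, ...]
coco = COCO('annotations/captions_val2014.json')
coco_res = coco.loadRes('results/captions_val2014_results.json')

coco_eval = COCOEvalCap(coco, coco_res)
# score only the images we actually produced captions for
coco_eval.params['image_id'] = coco_res.getImgIds()
coco_eval.evaluate()

for metric, value in coco_eval.eval.items():
    print('%s: %.3f' % (metric, value))
```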
12 | METEOR_JAR = 'meteor-1.5.jar' 13 | # print METEOR_JAR 14 | 15 | class Meteor: 16 | 17 | def __init__(self): 18 | self.meteor_cmd = ['java', '-jar', '-Xmx2G', METEOR_JAR, \ 19 | '-', '-', '-stdio', '-l', 'en', '-norm'] 20 | self.meteor_p = subprocess.Popen(self.meteor_cmd, \ 21 | cwd=os.path.dirname(os.path.abspath(__file__)), \ 22 | stdin=subprocess.PIPE, \ 23 | stdout=subprocess.PIPE, \ 24 | stderr=subprocess.PIPE) 25 | # Used to guarantee thread safety 26 | self.lock = threading.Lock() 27 | 28 | def compute_score(self, gts, res): 29 | assert(sorted(gts.keys()) == sorted(res.keys())) 30 | imgIds = sorted(gts.keys()) 31 | scores = [] 32 | 33 | eval_line = 'EVAL' 34 | self.lock.acquire() 35 | for i in imgIds: 36 | assert(len(res[i]) == 1) 37 | stat = self._stat(res[i][0], gts[i]) 38 | eval_line += ' ||| {}'.format(stat) 39 | 40 | self.meteor_p.stdin.write('{}\n'.format(eval_line).encode()) 41 | self.meteor_p.stdin.flush() 42 | for i in range(0, len(imgIds)): 43 | scores.append(float(self.meteor_p.stdout.readline().decode().strip())) 44 | score = float(self.meteor_p.stdout.readline().decode().strip()) 45 | self.lock.release() 46 | 47 | return score, scores 48 | 49 | def method(self): 50 | return "METEOR" 51 | 52 | def _stat(self, hypothesis_str, reference_list): 53 | # SCORE ||| reference 1 words ||| reference n words ||| hypothesis words 54 | hypothesis_str = hypothesis_str.replace('|||','').replace(' ',' ') 55 | score_line = ' ||| '.join(('SCORE', ' ||| '.join(reference_list), hypothesis_str)) 56 | self.meteor_p.stdin.write('{}\n'.format(score_line).encode()) 57 | self.meteor_p.stdin.flush() 58 | return self.meteor_p.stdout.readline().decode().strip() 59 | 60 | def _score(self, hypothesis_str, reference_list): 61 | self.lock.acquire() 62 | # SCORE ||| reference 1 words ||| reference n words ||| hypothesis words 63 | hypothesis_str = hypothesis_str.replace('|||','').replace(' ',' ') 64 | score_line = ' ||| '.join(('SCORE', ' ||| '.join(reference_list), hypothesis_str)) 65 | self.meteor_p.stdin.write('{}\n'.format(score_line).encode()) 66 | self.meteor_p.stdin.flush() 67 | stats = self.meteor_p.stdout.readline().decode().strip() 68 | eval_line = 'EVAL ||| {}'.format(stats) 69 | # EVAL ||| stats 70 | self.meteor_p.stdin.write('{}\n'.format(eval_line).encode()) 71 | self.meteor_p.stdin.flush() 72 | score = float(self.meteor_p.stdout.readline().decode().strip()) 73 | # bug fix: there are two values returned by the jar file, one average, and one all, so do it twice 74 | # thanks for Andrej for pointing this out 75 | score = float(self.meteor_p.stdout.readline().strip()) 76 | self.lock.release() 77 | return score 78 | 79 | def __exit__(self): 80 | self.lock.acquire() 81 | self.meteor_p.stdin.close() 82 | self.meteor_p.kill() 83 | self.meteor_p.wait() 84 | self.lock.release() 85 | -------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/rouge/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'vrama91' 2 | -------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/rouge/rouge.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # File Name : rouge.py 4 | # 5 | # Description : Computes ROUGE-L metric as described by Lin and Hovey (2004) 6 | # 7 | # Creation Date : 2015-01-07 06:03 8 | # Author : Ramakrishna Vedantam 9 | 10 | import numpy as np 11 | import pdb 12 | 13 
| def my_lcs(string, sub): 14 | """ 15 | Calculates longest common subsequence for a pair of tokenized strings 16 | :param string : list of str : tokens from a string split using whitespace 17 | :param sub : list of str : shorter string, also split using whitespace 18 | :returns: length (list of int): length of the longest common subsequence between the two strings 19 | 20 | Note: my_lcs only gives length of the longest common subsequence, not the actual LCS 21 | """ 22 | if(len(string)< len(sub)): 23 | sub, string = string, sub 24 | 25 | lengths = [[0 for i in range(0,len(sub)+1)] for j in range(0,len(string)+1)] 26 | 27 | for j in range(1,len(sub)+1): 28 | for i in range(1,len(string)+1): 29 | if(string[i-1] == sub[j-1]): 30 | lengths[i][j] = lengths[i-1][j-1] + 1 31 | else: 32 | lengths[i][j] = max(lengths[i-1][j] , lengths[i][j-1]) 33 | 34 | return lengths[len(string)][len(sub)] 35 | 36 | class Rouge(): 37 | ''' 38 | Class for computing ROUGE-L score for a set of candidate sentences for the MS COCO test set 39 | 40 | ''' 41 | def __init__(self): 42 | # vrama91: updated the value below based on discussion with Hovey 43 | self.beta = 1.2 44 | 45 | def calc_score(self, candidate, refs): 46 | """ 47 | Compute ROUGE-L score given one candidate and references for an image 48 | :param candidate: str : candidate sentence to be evaluated 49 | :param refs: list of str : COCO reference sentences for the particular image to be evaluated 50 | :returns score: int (ROUGE-L score for the candidate evaluated against references) 51 | """ 52 | assert(len(candidate)==1) 53 | assert(len(refs)>0) 54 | prec = [] 55 | rec = [] 56 | 57 | # split into tokens 58 | token_c = candidate[0].split(" ") 59 | 60 | for reference in refs: 61 | # split into tokens 62 | token_r = reference.split(" ") 63 | # compute the longest common subsequence 64 | lcs = my_lcs(token_r, token_c) 65 | prec.append(lcs/float(len(token_c))) 66 | rec.append(lcs/float(len(token_r))) 67 | 68 | prec_max = max(prec) 69 | rec_max = max(rec) 70 | 71 | if(prec_max!=0 and rec_max !=0): 72 | score = ((1 + self.beta**2)*prec_max*rec_max)/float(rec_max + self.beta**2*prec_max) 73 | else: 74 | score = 0.0 75 | return score 76 | 77 | def compute_score(self, gts, res): 78 | """ 79 | Computes Rouge-L score given a set of reference and candidate sentences for the dataset 80 | Invoked by evaluate_captions.py 81 | :param hypo_for_image: dict : candidate / test sentences with "image name" key and "tokenized sentences" as values 82 | :param ref_for_image: dict : reference MS-COCO sentences with "image name" key and "tokenized sentences" as values 83 | :returns: average_score: float (mean ROUGE-L score computed by averaging scores for all the images) 84 | """ 85 | assert(sorted(gts.keys()) == sorted(res.keys())) 86 | imgIds = sorted(gts.keys()) 87 | 88 | score = [] 89 | for id in imgIds: 90 | hypo = res[id] 91 | ref = gts[id] 92 | 93 | score.append(self.calc_score(hypo, ref)) 94 | 95 | # Sanity check. 
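As a worked toy example of the LCS-based scoring in `my_lcs` / `calc_score` above (same `sys.path` assumption as elsewhere in this repo):

```python
import sys
sys.path.append('coco-caption')

from pycocoevalcap.rouge.rouge import Rouge, my_lcs

candidate = ['a man is playing a guitar']
refs = ['a man plays the guitar', 'someone is playing guitar']

# the candidate and the first reference share ['a', 'man', 'guitar'] in order
print(my_lcs(refs[0].split(), candidate[0].split()))   # 3

# F-measure (beta = 1.2) of the best precision/recall over all references
print(Rouge().calc_score(candidate, refs))             # ~0.62
```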
96 | assert(type(hypo) is list) 97 | assert(len(hypo) == 1) 98 | assert(type(ref) is list) 99 | assert(len(ref) >= 1) 100 | 101 | average_score = np.mean(np.array(score)) 102 | return average_score, np.array(score) 103 | 104 | def method(self): 105 | return "Rouge" 106 | -------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'hfang' 2 | -------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/tokenizer/ptbtokenizer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # File Name : ptbtokenizer.py 4 | # 5 | # Description : Do the PTB Tokenization and remove punctuations. 6 | # 7 | # Creation Date : 29-12-2014 8 | # Last Modified : Thu Mar 19 09:53:35 2015 9 | # Authors : Hao Fang and Tsung-Yi Lin 10 | 11 | import os 12 | import sys 13 | import subprocess 14 | import tempfile 15 | import itertools 16 | 17 | # path to the stanford corenlp jar 18 | STANFORD_CORENLP_3_4_1_JAR = 'stanford-corenlp-3.4.1.jar' 19 | 20 | # punctuations to be removed from the sentences 21 | PUNCTUATIONS = ["''", "'", "``", "`", "-LRB-", "-RRB-", "-LCB-", "-RCB-", \ 22 | ".", "?", "!", ",", ":", "-", "--", "...", ";"] 23 | 24 | class PTBTokenizer: 25 | """Python wrapper of Stanford PTBTokenizer""" 26 | 27 | def tokenize(self, captions_for_image): 28 | cmd = ['java', '-cp', STANFORD_CORENLP_3_4_1_JAR, \ 29 | 'edu.stanford.nlp.process.PTBTokenizer', \ 30 | '-preserveLines', '-lowerCase'] 31 | 32 | # ====================================================== 33 | # prepare data for PTB Tokenizer 34 | # ====================================================== 35 | final_tokenized_captions_for_image = {} 36 | image_id = [k for k, v in captions_for_image.items() for _ in range(len(v))] 37 | sentences = '\n'.join([c['caption'].replace('\n', ' ') for k, v in captions_for_image.items() for c in v]) 38 | 39 | # ====================================================== 40 | # save sentences to temporary file 41 | # ====================================================== 42 | path_to_jar_dirname=os.path.dirname(os.path.abspath(__file__)) 43 | tmp_file = tempfile.NamedTemporaryFile(mode='w+', delete=False, dir=path_to_jar_dirname) 44 | tmp_file.write(sentences) 45 | tmp_file.close() 46 | 47 | # ====================================================== 48 | # tokenize sentence 49 | # ====================================================== 50 | cmd.append(os.path.basename(tmp_file.name)) 51 | p_tokenizer = subprocess.Popen(cmd, cwd=path_to_jar_dirname, \ 52 | stdout=subprocess.PIPE) 53 | token_lines = p_tokenizer.communicate(input=sentences.rstrip())[0] 54 | lines = token_lines.decode().split('\n') 55 | # remove temp file 56 | os.remove(tmp_file.name) 57 | 58 | # ====================================================== 59 | # create dictionary for tokenized captions 60 | # ====================================================== 61 | for k, line in zip(image_id, lines): 62 | if not k in final_tokenized_captions_for_image: 63 | final_tokenized_captions_for_image[k] = [] 64 | tokenized_caption = ' '.join([w for w in line.rstrip().split(' ') \ 65 | if w not in PUNCTUATIONS]) 66 | final_tokenized_captions_for_image[k].append(tokenized_caption) 67 | 68 | return final_tokenized_captions_for_image 69 | 
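The tokenizer above shells out to the bundled CoreNLP jar, so Java must be on the PATH. A minimal sketch of its input/output contract, with toy captions:

```python
import sys
sys.path.append('coco-caption')

from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer

# input: {image_id: [{'caption': raw sentence}, ...]}
captions = {
    'video0': [{'caption': 'A man is playing a guitar!'},
               {'caption': 'Someone plays the guitar.'}],
}

tokenized = PTBTokenizer().tokenize(captions)
print(tokenized['video0'])
# ['a man is playing a guitar', 'someone plays the guitar']
# -- lower-cased, punctuation stripped, one plain string per caption
```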
-------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/tokenizer/stanford-corenlp-3.4.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sundrops/video-caption.pytorch/66430862696678f7233a1fc67af22f0525b22e52/coco-caption/pycocoevalcap/tokenizer/stanford-corenlp-3.4.1.jar -------------------------------------------------------------------------------- /coco-caption/pycocotools/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /coco-caption/pycocotools/mask.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tsungyi' 2 | 3 | import pycocotools._mask as _mask 4 | 5 | # Interface for manipulating masks stored in RLE format. 6 | # 7 | # RLE is a simple yet efficient format for storing binary masks. RLE 8 | # first divides a vector (or vectorized image) into a series of piecewise 9 | # constant regions and then for each piece simply stores the length of 10 | # that piece. For example, given M=[0 0 1 1 1 0 1] the RLE counts would 11 | # be [2 3 1 1], or for M=[1 1 1 1 1 1 0] the counts would be [0 6 1] 12 | # (note that the odd counts are always the numbers of zeros). Instead of 13 | # storing the counts directly, additional compression is achieved with a 14 | # variable bitrate representation based on a common scheme called LEB128. 15 | # 16 | # Compression is greatest given large piecewise constant regions. 17 | # Specifically, the size of the RLE is proportional to the number of 18 | # *boundaries* in M (or for an image the number of boundaries in the y 19 | # direction). Assuming fairly simple shapes, the RLE representation is 20 | # O(sqrt(n)) where n is number of pixels in the object. Hence space usage 21 | # is substantially lower, especially for large simple objects (large n). 22 | # 23 | # Many common operations on masks can be computed directly using the RLE 24 | # (without need for decoding). This includes computations such as area, 25 | # union, intersection, etc. All of these operations are linear in the 26 | # size of the RLE, in other words they are O(sqrt(n)) where n is the area 27 | # of the object. Computing these operations on the original mask is O(n). 28 | # Thus, using the RLE can result in substantial computational savings. 29 | # 30 | # The following API functions are defined: 31 | # encode - Encode binary masks using RLE. 32 | # decode - Decode binary masks encoded via RLE. 33 | # merge - Compute union or intersection of encoded masks. 34 | # iou - Compute intersection over union between masks. 35 | # area - Compute area of encoded masks. 36 | # toBbox - Get bounding boxes surrounding encoded masks. 37 | # frPyObjects - Convert polygon, bbox, and uncompressed RLE to encoded RLE mask. 38 | # 39 | # Usage: 40 | # Rs = encode( masks ) 41 | # masks = decode( Rs ) 42 | # R = merge( Rs, intersect=false ) 43 | # o = iou( dt, gt, iscrowd ) 44 | # a = area( Rs ) 45 | # bbs = toBbox( Rs ) 46 | # Rs = frPyObjects( [pyObjects], h, w ) 47 | # 48 | # In the API the following formats are used: 49 | # Rs - [dict] Run-length encoding of binary masks 50 | # R - dict Run-length encoding of binary mask 51 | # masks - [hxwxn] Binary mask(s) (must have type np.ndarray(dtype=uint8) in column-major order) 52 | # iscrowd - [nx1] list of np.ndarray. 
1 indicates corresponding gt image has crowd region to ignore 53 | # bbs - [nx4] Bounding box(es) stored as [x y w h] 54 | # poly - Polygon stored as [[x1 y1 x2 y2...],[x1 y1 ...],...] (2D list) 55 | # dt,gt - May be either bounding boxes or encoded masks 56 | # Both poly and bbs are 0-indexed (bbox=[0 0 1 1] encloses first pixel). 57 | # 58 | # Finally, a note about the intersection over union (iou) computation. 59 | # The standard iou of a ground truth (gt) and detected (dt) object is 60 | # iou(gt,dt) = area(intersect(gt,dt)) / area(union(gt,dt)) 61 | # For "crowd" regions, we use a modified criteria. If a gt object is 62 | # marked as "iscrowd", we allow a dt to match any subregion of the gt. 63 | # Choosing gt' in the crowd gt that best matches the dt can be done using 64 | # gt'=intersect(dt,gt). Since by definition union(gt',dt)=dt, computing 65 | # iou(gt,dt,iscrowd) = iou(gt',dt) = area(intersect(gt,dt)) / area(dt) 66 | # For crowd gt regions we use this modified criteria above for the iou. 67 | # 68 | # To compile run "python setup.py build_ext --inplace" 69 | # Please do not contact us for help with compiling. 70 | # 71 | # Microsoft COCO Toolbox. version 2.0 72 | # Data, paper, and tutorials available at: http://mscoco.org/ 73 | # Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 74 | # Licensed under the Simplified BSD License [see coco/license.txt] 75 | 76 | iou = _mask.iou 77 | merge = _mask.merge 78 | frPyObjects = _mask.frPyObjects 79 | 80 | def encode(bimask): 81 | if len(bimask.shape) == 3: 82 | return _mask.encode(bimask) 83 | elif len(bimask.shape) == 2: 84 | h, w = bimask.shape 85 | return _mask.encode(bimask.reshape((h, w, 1), order='F'))[0] 86 | 87 | def decode(rleObjs): 88 | if type(rleObjs) == list: 89 | return _mask.decode(rleObjs) 90 | else: 91 | return _mask.decode([rleObjs])[:,:,0] 92 | 93 | def area(rleObjs): 94 | if type(rleObjs) == list: 95 | return _mask.area(rleObjs) 96 | else: 97 | return _mask.area([rleObjs])[0] 98 | 99 | def toBbox(rleObjs): 100 | if type(rleObjs) == list: 101 | return _mask.toBbox(rleObjs) 102 | else: 103 | return _mask.toBbox([rleObjs])[0] -------------------------------------------------------------------------------- /dataloader.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import random 4 | import os 5 | import numpy as np 6 | import torch 7 | from torch.utils.data import Dataset 8 | 9 | 10 | class CocoDataset(Dataset): 11 | 12 | def __init__(self, coco_labels): 13 | # python 3 14 | # super().__init__() 15 | super(CocoDataset, self).__init__() 16 | self.coco_labels = list(coco_labels['labels'].items()) 17 | self.num_classes = coco_labels['num_classes'] 18 | 19 | def __getitem__(self, ix): 20 | labels = torch.zeros(self.num_classes) 21 | image_id, labels_ids = self.coco_labels[ix] 22 | labels[labels_ids] = 1 23 | data = {} 24 | data['image_ids'] = image_id 25 | data['labels'] = labels 26 | return data 27 | 28 | def __len__(self): 29 | return len(self.coco_labels) 30 | 31 | 32 | class VideoDataset(Dataset): 33 | 34 | def get_vocab_size(self): 35 | return len(self.get_vocab()) 36 | 37 | def get_vocab(self): 38 | return self.ix_to_word 39 | 40 | def get_seq_length(self): 41 | return self.seq_length 42 | 43 | def __init__(self, opt, mode): 44 | # python 3 45 | # super().__init__() 46 | super(VideoDataset, self).__init__() 47 | self.mode = mode # to load train/val/test data 48 | 49 | # load the json file which contains information about the dataset 50 | self.captions = 
json.load(open(opt["caption_json"])) 51 | info = json.load(open(opt["info_json"])) 52 | self.ix_to_word = info['ix_to_word'] 53 | self.word_to_ix = info['word_to_ix'] 54 | print('vocab size is ', len(self.ix_to_word)) 55 | self.splits = info['videos'] 56 | print('number of train videos: ', len(self.splits['train'])) 57 | print('number of val videos: ', len(self.splits['val'])) 58 | print('number of test videos: ', len(self.splits['test'])) 59 | self.n_frame_steps = opt['n_frame_steps'] 60 | self.feats_dir = opt["feats_dir"] 61 | self.c3d_feats_dir = opt['c3d_feats_dir'] 62 | self.with_c3d = opt['with_c3d'] 63 | print('load feats from %s' % (self.feats_dir)) 64 | # load in the sequence data 65 | self.max_len = opt["max_len"] 66 | print('max sequence length in data is', self.max_len) 67 | 68 | def __getitem__(self, ix): 69 | """This function returns a tuple that is further passed to collate_fn 70 | """ 71 | # which part of data to load 72 | if self.mode == 'val': 73 | ix += len(self.splits['train']) 74 | elif self.mode == 'test': 75 | ix = ix + len(self.splits['train']) + len(self.splits['val']) 76 | fc_feat = [] 77 | for dir in self.feats_dir: 78 | fc_feat.append(np.load(os.path.join(dir, 'video%i.npy' % (ix)))) 79 | fc_feat = np.concatenate(fc_feat, axis=1) 80 | samples = np.round(np.linspace( 81 | 0, fc_feat.shape[0] - 1, self.n_frame_steps)).astype(np.int32) 82 | fc_feat = fc_feat[samples, :] 83 | if self.with_c3d == 1: 84 | c3d_feat = np.load(os.path.join(self.c3d_feats_dir, 'video%i.npy'%(ix))) 85 | if len(c3d_feat.shape) == 1: 86 | fc_feat = np.concatenate((fc_feat, np.tile(c3d_feat, (fc_feat.shape[0], 1))), axis=1) 87 | elif len(c3d_feat.shape) == 2: 88 | samples = np.round(np.linspace( 89 | 0, c3d_feat.shape[0] - 1, fc_feat.shape[0])).astype(np.int32) 90 | fc_feat = np.concatenate((fc_feat, c3d_feat[samples, :]), axis=1) 91 | # label = torch.zeros(self.max_len) 92 | mask = torch.zeros(self.max_len) 93 | captions = self.captions['video%i'%(ix)]['final_captions'] 94 | gts = torch.zeros(len(captions), self.max_len).long() 95 | for i, cap in enumerate(captions): 96 | if len(cap) > self.max_len: 97 | cap = cap[:self.max_len] 98 | cap[-1] = '' 99 | for j, w in enumerate(cap): 100 | gts[i, j] = self.word_to_ix[w] 101 | # # add by rgh 102 | # if w in self.word_to_ix.keys(): 103 | # gts[i, j] = self.word_to_ix[w] 104 | # else: 105 | # gts[i, j] = 0 106 | 107 | # random select a caption for this video 108 | cap_ix = random.randint(0, len(captions) - 1) 109 | label = gts[cap_ix] 110 | non_zero = (label == 0).nonzero() 111 | mask[:int(non_zero[0]) + 1] = 1 112 | 113 | data = {} 114 | data['fc_feats'] = torch.from_numpy(fc_feat).type(torch.FloatTensor) 115 | data['labels'] = label 116 | data['masks'] = mask 117 | data['gts'] = gts 118 | data['video_ids'] = 'video%i'%(ix) 119 | return data 120 | 121 | def __len__(self): 122 | return len(self.splits[self.mode]) 123 | -------------------------------------------------------------------------------- /eval.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import argparse 4 | import torch 5 | from torch import nn 6 | from torch.autograd import Variable 7 | from torch.utils.data import DataLoader 8 | from models import EncoderRNN, DecoderRNN, S2VTAttModel, S2VTModel 9 | from dataloader import VideoDataset 10 | import misc.utils as utils 11 | from misc.cocoeval import suppress_stdout_stderr, COCOScorer 12 | from collections import OrderedDict 13 | from pandas.io.json import 
json_normalize 14 | 15 | 16 | def convert_data_to_coco_scorer_format(data_frame): 17 | gts = {} 18 | for row in zip(data_frame["caption"], data_frame["video_id"]): 19 | if row[1] in gts: 20 | gts[row[1]].append( 21 | {'image_id': row[1], 'cap_id': len(gts[row[1]]), 'caption': row[0]}) 22 | else: 23 | gts[row[1]] = [] 24 | gts[row[1]].append( 25 | {'image_id': row[1], 'cap_id': len(gts[row[1]]), 'caption': row[0]}) 26 | return gts 27 | 28 | 29 | def test(model, crit, dataset, vocab, opt): 30 | model.eval() 31 | loader = DataLoader(dataset, batch_size=opt["batch_size"], shuffle=False) 32 | scorer = COCOScorer() 33 | gt_dataframe = json_normalize( 34 | json.load(open(opt["input_json"]))['sentences']) 35 | gts = convert_data_to_coco_scorer_format(gt_dataframe) 36 | #results = [] 37 | samples = {} 38 | for index, data in enumerate(loader): 39 | print 'batch: '+str((index+1)*opt["batch_size"]) 40 | # forward the model to get loss 41 | fc_feats = Variable(data['fc_feats'], volatile=True).cuda() 42 | labels = Variable(data['labels'], volatile=True).long().cuda() 43 | masks = Variable(data['masks'], volatile=True).cuda() 44 | video_ids = data['video_ids'] 45 | 46 | # forward the model to also get generated samples for each image 47 | seq_probs, seq_preds = model( 48 | fc_feats, mode='inference', opt=opt) 49 | # print(seq_preds) 50 | 51 | sents = utils.decode_sequence(vocab, seq_preds) 52 | 53 | for k, sent in enumerate(sents): 54 | video_id = video_ids[k] 55 | samples[video_id] = [{'image_id': video_id, 'caption': sent}] 56 | # break 57 | with suppress_stdout_stderr(): 58 | valid_score = scorer.score(gts, samples, samples.keys()) 59 | #results.append(valid_score) 60 | #print(valid_score) 61 | 62 | if not os.path.exists(opt["results_path"]): 63 | os.makedirs(opt["results_path"]) 64 | result = OrderedDict() 65 | result['checkpoint'] = opt["saved_model"][opt["saved_model"].rfind('/')+1:] 66 | score_sum = 0 67 | for key, value in valid_score.items(): 68 | score_sum += float(value) 69 | result['sum'] = str(score_sum) 70 | #result = OrderedDict(result, **valid_score) 71 | result = OrderedDict(result.items() + valid_score.items()) 72 | print result 73 | if not os.path.exists(opt["results_path"]): 74 | os.makedirs(opt["results_path"]) 75 | with open(os.path.join(opt["results_path"], "scores.txt"), 'a') as scores_table: 76 | scores_table.write(json.dumps(result) + "\n") 77 | with open(os.path.join(opt["results_path"], 78 | opt["model"].split("/")[-1].split('.')[0] + ".json"), 'w') as prediction_results: 79 | json.dump({"predictions": samples, "scores": valid_score}, 80 | prediction_results) 81 | 82 | 83 | def main(opt): 84 | dataset = VideoDataset(opt, "test") 85 | opt["vocab_size"] = dataset.get_vocab_size() 86 | opt["seq_length"] = dataset.max_len 87 | if opt['beam_size'] != 1: 88 | assert opt["batch_size"] == 1 89 | if opt["model"] == 'S2VTModel': 90 | model = S2VTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"], opt['dim_vid'], 91 | n_layers=opt['num_layers'], 92 | rnn_cell=opt['rnn_type'], 93 | bidirectional=opt["bidirectional"], 94 | rnn_dropout_p=opt["rnn_dropout_p"]).cuda() 95 | elif opt["model"] == "S2VTAttModel": 96 | encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"], 97 | n_layers=opt['num_layers'], 98 | rnn_cell=opt['rnn_type'], bidirectional=opt["bidirectional"], 99 | input_dropout_p=opt["input_dropout_p"], rnn_dropout_p=opt["rnn_dropout_p"]) 100 | decoder = DecoderRNN(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"], 101 | 
n_layers=opt['num_layers'], 102 | rnn_cell=opt['rnn_type'], input_dropout_p=opt["input_dropout_p"], 103 | rnn_dropout_p=opt["rnn_dropout_p"], bidirectional=opt["bidirectional"]) 104 | model = S2VTAttModel(encoder, decoder).cuda() 105 | model = nn.DataParallel(model) 106 | # Setup the model 107 | model.load_state_dict(torch.load(opt["saved_model"])) 108 | crit = utils.LanguageModelCriterion() 109 | 110 | test(model, crit, dataset, dataset.get_vocab(), opt) 111 | 112 | 113 | if __name__ == '__main__': 114 | parser = argparse.ArgumentParser() 115 | parser.add_argument('--recover_opt', type=str, required=True, 116 | help='recover train opts from saved opt_json') 117 | parser.add_argument('--saved_model', type=str, default='', 118 | help='path to saved model to evaluate') 119 | # parser.add_argument('--rnn_type', type=str, default='gru', help='lstm or gru') 120 | parser.add_argument('--dump_json', type=int, default=1, 121 | help='Dump json with predictions into vis folder? (1=yes,0=no)') 122 | parser.add_argument('--results_path', type=str, default='results/') 123 | parser.add_argument('--dump_path', type=int, default=0, 124 | help='Write image paths along with predictions into vis json? (1=yes,0=no)') 125 | parser.add_argument('--gpu', type=str, default='0', 126 | help='gpu device number') 127 | parser.add_argument('--batch_size', type=int, default=128, 128 | help='minibatch size') 129 | parser.add_argument('--sample_max', type=int, default=1, 130 | help='0/1. whether sample max probs to get next word in inference stage') 131 | parser.add_argument('--temperature', type=float, default=1.0) 132 | parser.add_argument('--beam_size', type=int, default=1, 133 | help='used when sample_max = 1. Usually 2 or 3 works well.') 134 | 135 | args = parser.parse_args() 136 | args = vars((args)) 137 | opt = json.load(open(args["recover_opt"])) 138 | for k, v in args.items(): 139 | opt[k] = v 140 | os.environ['CUDA_VISIBLE_DEVICES'] = opt["gpu"] 141 | main(opt) 142 | -------------------------------------------------------------------------------- /eval_s2vt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ### nasnet_resnet101_40frames 4 | feat=nasnet_resnet101_40frames 5 | epoch=60 6 | python eval.py \ 7 | --rnn_type lstm \ 8 | --results_path result/$feat/s2vt \ 9 | --recover_opt checkpoint/$feat/s2vt/opt_info.json \ 10 | --saved_model checkpoint/$feat/s2vt/model_$epoch.pth \ 11 | --batch_size 100 \ 12 | --gpu 0 13 | 14 | 15 | 16 | # ### nasnet_40frames 17 | # feat=nasnet_40frames 18 | # epoch=250 19 | # python eval.py \ 20 | # --rnn_type lstm \ 21 | # --results_path result/$feat/s2vt \ 22 | # --recover_opt checkpoint/$feat/s2vt/opt_info.json \ 23 | # --saved_model checkpoint/$feat/s2vt/model_$epoch.pth \ 24 | # --batch_size 100 \ 25 | # --gpu 0 26 | 27 | 28 | ### inception_v4 40frames 29 | # feat=inception_v4_40frames 30 | # epoch=300 31 | # python eval.py \ 32 | # --rnn_type lstm \ 33 | # --results_path result/$feat/s2vt \ 34 | # --recover_opt checkpoint/$feat/s2vt/opt_info.json \ 35 | # --saved_model checkpoint/$feat/s2vt/model_$epoch.pth \ 36 | # --batch_size 100 \ 37 | # --gpu 0 38 | 39 | 40 | # feat=resnet101_40frames 41 | # epoch=150 42 | # python eval.py \ 43 | # --rnn_type lstm \ 44 | # --results_path result/$feat/s2vt \ 45 | # --recover_opt checkpoint/$feat/s2vt/opt_info.json \ 46 | # --saved_model checkpoint/$feat/s2vt/model_$epoch.pth \ 47 | # --batch_size 100 \ 48 | # --gpu 1 49 | 50 | 51 | # feat=resnet101_c3d_fc7_wo_ft 52 | # epoch=150 
53 | # python eval.py \ 54 | # --rnn_type lstm \ 55 | # --results_path result/$feat/s2vt \ 56 | # --recover_opt checkpoint/$feat/s2vt/opt_info.json \ 57 | # --saved_model checkpoint/$feat/s2vt/model_$epoch.pth \ 58 | # --batch_size 100 \ 59 | # --gpu 1 60 | 61 | 62 | # feat=resnet101_80frames 63 | # feat=resnet101 64 | # epoch=150 65 | # python eval.py \ 66 | # --rnn_type lstm \ 67 | # --results_path result/$feat/s2vt \ 68 | # --recover_opt checkpoint/$feat/s2vt/opt_info.json \ 69 | # --saved_model checkpoint/$feat/s2vt/model_$epoch.pth \ 70 | # --batch_size 100 \ 71 | # --gpu 1 72 | -------------------------------------------------------------------------------- /finetune_cnn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.data import DataLoader 3 | from torch.autograd import Variable 4 | from torch import nn 5 | import torch.optim as optim 6 | import os 7 | import json 8 | import argparse 9 | from dataloader import CocoDataset 10 | import pretrainedmodels 11 | from pretrainedmodels import utils 12 | 13 | 14 | C, H, W = 3, 224, 224 15 | 16 | 17 | class MILModel(nn.Module): 18 | def __init__(self, cnn_model, dim_hidden, num_classes): 19 | # python 3 20 | # super().__init__() 21 | super(MILModel, self).__init__() 22 | self.cnn_model = cnn_model 23 | self.num_classes = num_classes 24 | self.dim_hidden = dim_hidden 25 | self.linear = nn.Linear(dim_hidden, num_classes) 26 | 27 | def forward(self, x): 28 | feature_map = self.cnn_model.features(x) 29 | feature_map = feature_map.permute(0, 2, 3, 1) 30 | b, x, y, h = feature_map.size() 31 | feature_map = feature_map.contiguous().view(b, x * y, h) 32 | logits = self.linear(feature_map) 33 | logits = 1 - logits 34 | probs = Variable(torch.ones(logits.shape[0], logits.shape[2])).cuda() 35 | for i in range(x * y): 36 | probs = probs * logits[:, i, :] 37 | probs = 1 - probs 38 | return probs 39 | 40 | 41 | def train(dataloader, model, crit, optimizer, lr_scheduler, load_image_fn, params): 42 | model.train() 43 | model = nn.DataParallel(model) 44 | images_path = json.load(open(params.coco_path)) 45 | 46 | for epoch in range(params.epochs): 47 | lr_scheduler.step() 48 | iteration = 0 49 | for data in dataloader: 50 | iteration += 1 51 | image_ids, image_labels = data['image_ids'], data['labels'] 52 | images = torch.zeros(image_labels.shape[0], C, H, W) 53 | for i, image_id in enumerate(image_ids): 54 | image_path = os.path.join( 55 | params.coco_dir, images_path[image_id]) 56 | images[i] = load_image_fn(image_path) 57 | logits = model(Variable(images).cuda()) 58 | loss = crit(logits, Variable(image_labels).cuda()) 59 | 60 | optimizer.zero_grad() 61 | loss.backward() 62 | optimizer.step() 63 | train_loss = loss.data[0] 64 | torch.cuda.synchronize() 65 | 66 | print("iter %d (epoch %d), train_loss = %.6f" % 67 | (iteration, epoch, train_loss)) 68 | 69 | if epoch % params.save_checkpoint_every == 0: 70 | checkpoint_path = os.path.join( 71 | params.checkpoint_path, 'cnn_model_%d.pth' % (epoch)) 72 | torch.save(model.state_dict(), checkpoint_path) 73 | print("model saved to %s" % (checkpoint_path)) 74 | 75 | 76 | def main(args): 77 | global C, H, W 78 | coco_labels = json.load(open(args.coco_labels)) 79 | num_classes = coco_labels['num_classes'] 80 | if args.model == 'inception_v3': 81 | C, H, W = 3, 299, 299 82 | model = pretrainedmodels.inceptionv3(pretrained='imagenet') 83 | 84 | elif args.model == 'resnet152': 85 | C, H, W = 3, 224, 224 86 | model = 
pretrainedmodels.resnet152(pretrained='imagenet') 87 | 88 | elif args.model == 'inception_v4': 89 | C, H, W = 3, 299, 299 90 | model = pretrainedmodels.inceptionv4( 91 | num_classes=1000, pretrained='imagenet') 92 | 93 | else: 94 | print("doesn't support %s" % (args['model'])) 95 | 96 | load_image_fn = utils.LoadTransformImage(model) 97 | dim_feats = model.last_linear.in_features 98 | model = MILModel(model, dim_feats, num_classes) 99 | model = model.cuda() 100 | dataset = CocoDataset(coco_labels) 101 | dataloader = DataLoader( 102 | dataset, batch_size=args.batch_size, shuffle=True) 103 | optimizer = optim.Adam( 104 | model.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay) 105 | exp_lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=args.learning_rate_decay_every, 106 | gamma=args.learning_rate_decay_rate) 107 | 108 | crit = nn.MultiLabelSoftMarginLoss() 109 | if not os.path.isdir(args.checkpoint_path): 110 | os.mkdir(args.checkpoint_path) 111 | train(dataloader, model, crit, optimizer, 112 | exp_lr_scheduler, load_image_fn, args) 113 | 114 | 115 | if __name__ == '__main__': 116 | parser = argparse.ArgumentParser() 117 | parser.add_argument('--coco_path', type=str, 118 | default='data/coco_path.json', help='') 119 | parser.add_argument('--coco_labels', type=str, 120 | default='data/coco_labels.json', help='path to processed coco caption json') 121 | parser.add_argument('--coco_dir', type=str, 122 | default='data/mscoco/train2014') 123 | parser.add_argument('--epochs', type=int, default=200, 124 | help='number of epochs') 125 | parser.add_argument('--checkpoint_path', type=str, 126 | help='path to trained model') 127 | parser.add_argument("--gpu", dest='gpu', type=str, default='0', 128 | help='Set CUDA_VISIBLE_DEVICES environment variable, optional') 129 | parser.add_argument("--model", dest="model", type=str, default='resnet152', 130 | help='the CNN model you want to use to extract_feats') 131 | 132 | parser.add_argument('--save_checkpoint_every', type=int, default=20, 133 | help='how often to save a model checkpoint (in epoch)?') 134 | parser.add_argument('--batch_size', type=int, default=512) 135 | parser.add_argument('--learning_rate', type=float, default=1e-5, 136 | help='learning rate') 137 | 138 | parser.add_argument('--learning_rate_decay_every', type=int, default=2, 139 | help='every how many epoch thereafter to drop LR?') 140 | parser.add_argument('--learning_rate_decay_rate', type=float, default=0.8) 141 | parser.add_argument('--optim_alpha', type=float, default=0.9, 142 | help='alpha for adam') 143 | parser.add_argument('--optim_beta', type=float, default=0.999, 144 | help='beta used for adam') 145 | parser.add_argument('--optim_epsilon', type=float, default=1e-8, 146 | help='epsilon that goes into denominator for smoothing') 147 | parser.add_argument('--weight_decay', type=float, default=5e-4, 148 | help='weight_decay') 149 | args = parser.parse_args() 150 | os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu 151 | 152 | main(args) 153 | -------------------------------------------------------------------------------- /misc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sundrops/video-caption.pytorch/66430862696678f7233a1fc67af22f0525b22e52/misc/__init__.py -------------------------------------------------------------------------------- /misc/cocoeval.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Wrapper for evaluation on 
CIDEr, ROUGE_L, METEOR and Bleu_N 3 | using coco-caption repo https://github.com/tylin/coco-caption 4 | 5 | class COCOScorer is taken from https://github.com/yaoli/arctic-capgen-vid 6 | ''' 7 | 8 | import json 9 | import os 10 | import sys 11 | sys.path.append('coco-caption') 12 | 13 | from pycocoevalcap.bleu.bleu import Bleu 14 | from pycocoevalcap.rouge.rouge import Rouge 15 | from pycocoevalcap.cider.cider import Cider 16 | from pycocoevalcap.meteor.meteor import Meteor 17 | from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer 18 | from collections import OrderedDict 19 | # Define a context manager to suppress stdout and stderr. 20 | 21 | 22 | class suppress_stdout_stderr: 23 | ''' 24 | A context manager for doing a "deep suppression" of stdout and stderr in 25 | Python, i.e. will suppress all print, even if the print originates in a 26 | compiled C/Fortran sub-function. 27 | This will not suppress raised exceptions, since exceptions are printed 28 | to stderr just before a script exits, and after the context manager has 29 | exited (at least, I think that is why it lets exceptions through). 30 | 31 | ''' 32 | 33 | def __init__(self): 34 | # Open a pair of null files 35 | self.null_fds = [os.open(os.devnull, os.O_RDWR) for x in range(2)] 36 | # Save the actual stdout (1) and stderr (2) file descriptors. 37 | self.save_fds = (os.dup(1), os.dup(2)) 38 | 39 | def __enter__(self): 40 | # Assign the null pointers to stdout and stderr. 41 | os.dup2(self.null_fds[0], 1) 42 | os.dup2(self.null_fds[1], 2) 43 | 44 | def __exit__(self, *_): 45 | # Re-assign the real stdout/stderr back to (1) and (2) 46 | os.dup2(self.save_fds[0], 1) 47 | os.dup2(self.save_fds[1], 2) 48 | # Close the null files 49 | os.close(self.null_fds[0]) 50 | os.close(self.null_fds[1]) 51 | 52 | 53 | class COCOScorer(object): 54 | def __init__(self): 55 | print('init COCO-EVAL scorer') 56 | 57 | def score(self, GT, RES, IDs): 58 | # edited by rgh 59 | #self.eval = {} 60 | self.eval = OrderedDict() 61 | self.imgToEval = {} 62 | gts = {} 63 | res = {} 64 | for ID in IDs: 65 | # print ID 66 | gts[ID] = GT[ID] 67 | res[ID] = RES[ID] 68 | print('tokenization...') 69 | tokenizer = PTBTokenizer() 70 | gts = tokenizer.tokenize(gts) 71 | res = tokenizer.tokenize(res) 72 | 73 | # ================================================= 74 | # Set up scorers 75 | # ================================================= 76 | print('setting up scorers...') 77 | # edited by rgh 78 | # scorers = [ 79 | # (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]), 80 | # (Meteor(),"METEOR"), 81 | # (Rouge(), "ROUGE_L"), 82 | # (Cider(), "CIDEr"), 83 | # #(Spice(), "SPICE") 84 | # ] 85 | scorers = [ 86 | (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]), 87 | (Meteor(), "METEOR"), 88 | (Cider(), "CIDEr"), 89 | (Rouge(), "ROUGE_L"), 90 | # (Spice(), "SPICE") 91 | ] 92 | 93 | # ================================================= 94 | # Compute scores 95 | # ================================================= 96 | eval = {} 97 | for scorer, method in scorers: 98 | print('computing %s score...' 
% (scorer.method())) 99 | score, scores = scorer.compute_score(gts, res) 100 | if type(method) == list: 101 | # added by rgh 102 | # for sc, scs, m in zip(score, scores, method): 103 | # self.setEval(sc, m) 104 | # self.setImgToEvalImgs(scs, IDs, m) 105 | # print("%s: %0.3f" % (m, sc)) 106 | self.setEval("%.4f" % score[-1], method[-1]) 107 | self.setImgToEvalImgs(scores[-1], IDs, method[-1]) 108 | print("%s: %0.4f" % (method[-1], score[-1])) 109 | else: 110 | self.setEval("%.4f" % score, method) 111 | self.setImgToEvalImgs(scores, IDs, method) 112 | print("%s: %0.4f" % (method, score)) 113 | 114 | # for metric, score in self.eval.items(): 115 | # print '%s: %.3f'%(metric, score) 116 | return self.eval 117 | 118 | def setEval(self, score, method): 119 | self.eval[method] = score 120 | 121 | def setImgToEvalImgs(self, scores, imgIds, method): 122 | for imgId, score in zip(imgIds, scores): 123 | if imgId not in self.imgToEval: 124 | self.imgToEval[imgId] = {} 125 | self.imgToEval[imgId]["image_id"] = imgId 126 | self.imgToEval[imgId][method] = score 127 | 128 | 129 | def score(ref, sample): 130 | # ref and sample are both dict 131 | scorers = [ 132 | (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]), 133 | (Rouge(), "ROUGE_L"), 134 | (Cider(), "CIDEr") 135 | ] 136 | final_scores = {} 137 | for scorer, method in scorers: 138 | print('computing %s score with COCO-EVAL...' % (scorer.method())) 139 | score, scores = scorer.compute_score(ref, sample) 140 | if type(score) == list: 141 | for m, s in zip(method, score): 142 | final_scores[m] = s 143 | else: 144 | final_scores[method] = score 145 | return final_scores 146 | 147 | 148 | def test_cocoscorer(): 149 | '''gts = { 150 | 184321:[ 151 | {u'image_id': 184321, u'id': 352188, u'caption': u'A train traveling down-tracks next to lights.'}, 152 | {u'image_id': 184321, u'id': 356043, u'caption': u"A blue and silver train next to train's station and trees."}, 153 | {u'image_id': 184321, u'id': 356382, u'caption': u'A blue train is next to a sidewalk on the rails.'}, 154 | {u'image_id': 184321, u'id': 361110, u'caption': u'A passenger train pulls into a train station.'}, 155 | {u'image_id': 184321, u'id': 362544, u'caption': u'A train coming down the tracks arriving at a station.'}], 156 | 81922: [ 157 | {u'image_id': 81922, u'id': 86779, u'caption': u'A large jetliner flying over a traffic filled street.'}, 158 | {u'image_id': 81922, u'id': 90172, u'caption': u'An airplane flies low in the sky over a city street. 
'}, 159 | {u'image_id': 81922, u'id': 91615, u'caption': u'An airplane flies over a street with many cars.'}, 160 | {u'image_id': 81922, u'id': 92689, u'caption': u'An airplane comes in to land over a road full of cars'}, 161 | {u'image_id': 81922, u'id': 823814, u'caption': u'The plane is flying over top of the cars'}] 162 | } 163 | 164 | samples = { 165 | 184321: [{u'image_id': 184321, 'id': 111, u'caption': u'train traveling down a track in front of a road'}], 166 | 81922: [{u'image_id': 81922, 'id': 219, u'caption': u'plane is flying through the sky'}], 167 | } 168 | ''' 169 | gts = { 170 | '184321': [ 171 | {u'image_id': '184321', u'cap_id': 0, u'caption': u'A train traveling down tracks next to lights.', 172 | 'tokenized': 'a train traveling down tracks next to lights'}, 173 | {u'image_id': '184321', u'cap_id': 1, u'caption': u'A train coming down the tracks arriving at a station.', 174 | 'tokenized': 'a train coming down the tracks arriving at a station'}], 175 | '81922': [ 176 | {u'image_id': '81922', u'cap_id': 0, u'caption': u'A large jetliner flying over a traffic filled street.', 177 | 'tokenized': 'a large jetliner flying over a traffic filled street'}, 178 | {u'image_id': '81922', u'cap_id': 1, u'caption': u'The plane is flying over top of the cars', 179 | 'tokenized': 'the plan is flying over top of the cars'}, ] 180 | } 181 | 182 | samples = { 183 | '184321': [{u'image_id': '184321', u'caption': u'train traveling down a track in front of a road'}], 184 | '81922': [{u'image_id': '81922', u'caption': u'plane is flying through the sky'}], 185 | } 186 | IDs = ['184321', '81922'] 187 | scorer = COCOScorer() 188 | scorer.score(gts, samples, IDs) 189 | 190 | 191 | if __name__ == '__main__': 192 | test_cocoscorer() 193 | -------------------------------------------------------------------------------- /misc/rewards.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import OrderedDict 3 | import torch 4 | import sys 5 | sys.path.append("coco-caption") 6 | from pyciderevalcap.ciderD.ciderD import CiderD 7 | 8 | CiderD_scorer = None 9 | # CiderD_scorer = CiderD(df='corpus') 10 | 11 | 12 | def init_cider_scorer(cached_tokens): 13 | global CiderD_scorer 14 | CiderD_scorer = CiderD_scorer or CiderD(df=cached_tokens) 15 | 16 | 17 | def array_to_str(arr): 18 | out = '' 19 | for i in range(len(arr)): 20 | out += str(arr[i]) + ' ' 21 | if arr[i] == 0: 22 | break 23 | return out.strip() 24 | 25 | 26 | def get_self_critical_reward(model, fc_feats, data, gen_result): 27 | batch_size = gen_result.size(0) 28 | 29 | # get greedy decoding baseline 30 | _, greedy_res = model(fc_feats, mode='inference') 31 | 32 | res = OrderedDict() 33 | 34 | gen_result = gen_result.cpu().data.numpy() 35 | greedy_res = greedy_res.cpu().data.numpy() 36 | for i in range(batch_size): 37 | res[i] = [array_to_str(gen_result[i])] 38 | for i in range(batch_size): 39 | res[batch_size + i] = [array_to_str(greedy_res[i])] 40 | 41 | gts = OrderedDict() 42 | for i in range(data['gts'].size(0)): 43 | gts[i] = [array_to_str(data['gts'][i][j]) 44 | for j in range(data['gts'].size(1))] 45 | 46 | res = [{'image_id': i, 'caption': res[i]} for i in range(2 * batch_size)] 47 | gts = {i: gts[i % batch_size] for i in range(2 * batch_size)} 48 | _, scores = CiderD_scorer.compute_score(gts, res) 49 | print('Cider scores:', _) 50 | 51 | scores = scores[:batch_size] - scores[batch_size:] 52 | 53 | rewards = np.repeat(scores[:, np.newaxis], gen_result.shape[1], 1) 54 
| 55 | return rewards 56 | -------------------------------------------------------------------------------- /misc/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.autograd import Variable 4 | 5 | 6 | # Input: seq, N*D numpy array, with element 0 .. vocab_size. 0 is END token. 7 | def decode_sequence(ix_to_word, seq): 8 | N, D = seq.size() 9 | out = [] 10 | for i in range(N): 11 | txt = '' 12 | for j in range(D): 13 | ix = seq[i, j].data.cpu().numpy()[0] 14 | if ix > 0: 15 | if j >= 1: 16 | txt = txt + ' ' 17 | txt = txt + ix_to_word[str(ix)] 18 | else: 19 | break 20 | out.append(txt) 21 | return out 22 | 23 | 24 | def to_contiguous(tensor): 25 | if tensor.is_contiguous(): 26 | return tensor 27 | else: 28 | return tensor.contiguous() 29 | 30 | 31 | class RewardCriterion(nn.Module): 32 | 33 | def __init__(self): 34 | super(RewardCriterion, self).__init__() 35 | 36 | def forward(self, input, seq, reward): 37 | input = to_contiguous(input).view(-1) 38 | reward = to_contiguous(reward).view(-1) 39 | mask = (seq > 0).float() 40 | mask = to_contiguous(torch.cat([Variable(mask.data.new(mask.size(0), 1).fill_(1)).cuda(), 41 | mask[:, :-1]], 1)).view(-1) 42 | output = - input * reward * mask 43 | output = torch.sum(output) / torch.sum(mask) 44 | 45 | return output 46 | 47 | 48 | class LanguageModelCriterion(nn.Module): 49 | 50 | def __init__(self): 51 | # python 3 52 | # super().__init__() 53 | super(LanguageModelCriterion, self).__init__() 54 | self.loss_fn = nn.NLLLoss(reduce=False) 55 | 56 | def forward(self, logits, target, mask): 57 | """ 58 | logits: shape of (N, seq_len, vocab_size) 59 | target: shape of (N, seq_len) 60 | mask: shape of (N, seq_len) 61 | """ 62 | # truncate to the same size 63 | batch_size = logits.shape[0] 64 | target = target[:, :logits.shape[1]] 65 | mask = mask[:, :logits.shape[1]] 66 | logits = to_contiguous(logits).view(-1, logits.shape[2]) 67 | target = to_contiguous(target).view(-1) 68 | mask = to_contiguous(mask).view(-1) 69 | loss = self.loss_fn(logits, target) 70 | output = torch.sum(loss * mask) / batch_size 71 | return output 72 | 73 | 74 | def clip_gradient(optimizer, grad_clip): 75 | for group in optimizer.param_groups: 76 | for param in group['params']: 77 | param.grad.data.clamp_(-grad_clip, grad_clip) 78 | -------------------------------------------------------------------------------- /models/Attention.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class Attention(nn.Module): 7 | """ 8 | Applies an attention mechanism on the output features from the decoder. 
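The branch of `forward()` left uncommented below is plain dot-product attention over the encoder outputs. A shape-level sketch with toy tensors (batch=2, seq_len=4, dim=8; random data, illustration only):

```python
import torch
import torch.nn.functional as F

batch, seq_len, dim = 2, 4, 8
encoder_outputs = torch.randn(batch, seq_len, dim)
hidden_state = torch.randn(batch, dim)

# (batch, seq_len, dim) x (batch, dim, 1) -> (batch, seq_len): one logit per frame
attn = torch.bmm(encoder_outputs, hidden_state.unsqueeze(2)).squeeze(2)
alpha = F.softmax(attn, dim=1).unsqueeze(1)               # (batch, 1, seq_len)
context = torch.bmm(alpha, encoder_outputs).squeeze(1)    # (batch, dim)

print(context.shape)   # torch.Size([2, 8])
```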
9 | """ 10 | 11 | def __init__(self, dim): 12 | # python 3 13 | # super().__init__() 14 | super(Attention, self).__init__() 15 | #self.dim = dim 16 | #self.linear1 = nn.Linear(dim * 2, dim) 17 | #self.linear2 = nn.Linear(dim, 1, bias=False) 18 | #self._init_hidden() 19 | #self.dk = dim/2 20 | # self.contextW = nn.Linear(dim, self.dk) 21 | # nn.init.xavier_normal(self.contextW.weight) 22 | # self.hidderW = nn.Linear(dim, self.dk) 23 | # nn.init.xavier_normal(self.hidderW.weight) 24 | def _init_hidden(self): 25 | nn.init.xavier_normal(self.linear1.weight) 26 | nn.init.xavier_normal(self.linear2.weight) 27 | 28 | def forward(self, hidden_state, encoder_outputs): 29 | """ 30 | Arguments: 31 | hidden_state {Variable} -- batch_size x dim 32 | encoder_outputs {Variable} -- batch_size x seq_len x dim 33 | 34 | Returns: 35 | Variable -- context vector of size batch_size x dim 36 | """ 37 | ############### original ################### 38 | ''' 39 | batch_size, seq_len, _ = encoder_outputs.size() 40 | hidden_state = hidden_state.unsqueeze(1).repeat(1, seq_len, 1) 41 | (batch, seq_len, dim*2) 42 | inputs = torch.cat((encoder_outputs, hidden_state), 43 | 2).view(-1, self.dim * 2) 44 | (batch, seq_len, dim*2)->(batch, seq_len, dim)->(batch, seq_len, 1) 45 | o = self.linear2(F.tanh(self.linear1(inputs))) 46 | e = o.view(batch_size, seq_len) 47 | alpha = F.softmax(e, dim=1) 48 | context = torch.bmm(alpha.unsqueeze(1), encoder_outputs).squeeze(1) 49 | return context 50 | ''' 51 | ################# seq2seq ####################### 52 | ''' 53 | batch_size, seq_len, hidden_size = encoder_outputs.size() 54 | # batch, seq_len, dim 55 | hidden_state = hidden_state.unsqueeze(1).repeat(1, seq_len, 1) 56 | # (batch, seq_len, dim) * (batch, dim, seq_len) -> (batch, seq_len, seq_len) 57 | attn = torch.bmm(hidden_state, encoder_outputs.transpose(1, 2)) 58 | attn = F.softmax(attn.view(-1, seq_len)).view(batch_size, -1, seq_len) 59 | # (batch, seq_len, seq_len) * (batch, seq_len, dim) -> (batch, seq_len, dim) 60 | mix = torch.bmm(attn, encoder_outputs) 61 | # concat -> (batch, seq_len, 2*dim) 62 | combined = torch.cat((mix, hidden_state), dim=2) 63 | # output -> (batch, out_len, dim) 64 | output = F.tanh(self.linear_out(combined.view(-1, 2 * hidden_size))).view(batch_size, -1, hidden_size) 65 | return output 66 | ''' 67 | ######## after reducing dim, calculate the similarity of between encoder_outputs and hidden_state ######### 68 | ''' 69 | batch_size, seq_len, hidden_size = encoder_outputs.size() 70 | # (batch, seq_len, self.dk) 71 | encoder_outputs_dk = self.contextW(encoder_outputs) 72 | # (batch, self.dk) 73 | hidden_state_dk = self.hidderW(hidden_state) 74 | # (batch, seq_len, self.dk) * (batch, self.dk, 1) -> (batch, seq_len, 1)-> (batch, seq_len) 75 | attn = torch.bmm(encoder_outputs_dk, hidden_state_dk.unsqueeze(2)).squeeze(2) 76 | # (batch, seq_len)-> (batch, 1, seq_len) 77 | attn = F.softmax(attn, dim=1).unsqueeze(1) 78 | # (batch, 1, seq_len) * (batch, seq_len, dim) -> (batch, 1, dim) 79 | context = torch.bmm(attn, encoder_outputs).squeeze(1) 80 | return context 81 | ''' 82 | ######### directly calculate the similarity of between encoder_outputs and hidden_state ############ 83 | # batch_size, seq_len, hidden_size = encoder_outputs.size() 84 | # (batch, seq_len, dim) * (batch, dim, 1) -> (batch, seq_len, 1)-> (batch, seq_len) 85 | attn = torch.bmm(encoder_outputs, hidden_state.unsqueeze(2)).squeeze(2) 86 | # (batch, seq_len)-> (batch, 1, seq_len) 87 | attn = F.softmax(attn, dim=1).unsqueeze(1) 88 | # (batch, 
1, seq_len) * (batch, seq_len, dim) -> (batch, 1, dim) 89 | context = torch.bmm(attn, encoder_outputs).squeeze(1) 90 | return context -------------------------------------------------------------------------------- /models/EncoderRNN.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | 4 | class EncoderRNN(nn.Module): 5 | def __init__(self, dim_vid, dim_hidden, input_dropout_p=0.2, rnn_dropout_p=0.5, 6 | n_layers=1, bidirectional=False, rnn_cell='gru'): 7 | """ 8 | 9 | Args: 10 | dim_hidden (int): dim of the hidden state of the rnn 11 | input_dropout_p (float): dropout probability for the input sequence 12 | rnn_dropout_p (float): dropout probability for the rnn output sequence 13 | n_layers (int): number of rnn layers 14 | rnn_cell (str): type of RNN cell ('lstm'/'gru') 15 | """ 16 | # python 3 17 | # super().__init__() 18 | super(EncoderRNN, self).__init__() 19 | self.dim_vid = dim_vid 20 | self.dim_hidden = dim_hidden 21 | self.input_dropout_p = input_dropout_p 22 | self.rnn_dropout_p = rnn_dropout_p 23 | self.n_layers = n_layers 24 | self.bidirectional = bidirectional 25 | self.rnn_cell = rnn_cell 26 | 27 | self.vid2hid = nn.Linear(dim_vid, dim_hidden) 28 | self.input_dropout = nn.Dropout(input_dropout_p) 29 | 30 | if rnn_cell.lower() == 'lstm': 31 | self.rnn_cell = nn.LSTM 32 | elif rnn_cell.lower() == 'gru': 33 | self.rnn_cell = nn.GRU 34 | 35 | self.rnn = self.rnn_cell(dim_hidden, dim_hidden, n_layers, batch_first=True, 36 | bidirectional=bidirectional, dropout=self.rnn_dropout_p) 37 | 38 | self._init_hidden() 39 | 40 | def _init_hidden(self): 41 | nn.init.xavier_normal(self.vid2hid.weight) 42 | 43 | def forward(self, vid_feats): 44 | """ 45 | Applies a multi-layer RNN to an input sequence. 46 | Args: 47 | vid_feats (batch, seq_len, dim_vid): tensor containing the features of the input video frames. 
48 | The frame features are projected from dim_vid to dim_hidden and passed 49 | through input dropout before being fed to the rnn. 50 | Returns: output, hidden 51 | - **output** (batch, seq_len, hidden_size): variable containing the encoded features of the input sequence 52 | - **hidden** (num_layers * num_directions, batch, hidden_size): variable containing the features in the hidden state h 53 | """ 54 | batch_size, seq_len, dim_vid = vid_feats.size() 55 | vid_feats = self.vid2hid(vid_feats.view(-1, dim_vid)) 56 | vid_feats = self.input_dropout(vid_feats) 57 | vid_feats = vid_feats.view(batch_size, seq_len, self.dim_hidden) 58 | self.rnn.flatten_parameters() 59 | output, hidden = self.rnn(vid_feats) 60 | return output, hidden 61 | -------------------------------------------------------------------------------- /models/S2VTAttModel.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | 4 | class S2VTAttModel(nn.Module): 5 | def __init__(self, encoder, decoder): 6 | """ 7 | 8 | Args: 9 | encoder (nn.Module): Encoder rnn 10 | decoder (nn.Module): Decoder rnn 11 | """ 12 | # python 3 13 | # super().__init__() 14 | super(S2VTAttModel, self).__init__() 15 | self.encoder = encoder 16 | self.decoder = decoder 17 | 18 | def forward(self, vid_feats, target_variable=None, 19 | mode='train', opt={}): 20 | """ 21 | 22 | Args: 23 | vid_feats (Variable): video feats of shape [batch_size, seq_len, dim_vid] 24 | target_variable (None, optional): ground truth labels 25 | 26 | Returns: 27 | seq_prob: Variable of shape [batch_size, max_len-1, vocab_size] 28 | seq_preds: [] or Variable of shape [batch_size, max_len-1] 29 | """ 30 | encoder_outputs, encoder_hidden = self.encoder(vid_feats) 31 | seq_prob, seq_preds = self.decoder(encoder_outputs, encoder_hidden, target_variable, mode, opt) 32 | return seq_prob, seq_preds 33 | -------------------------------------------------------------------------------- /models/S2VTModel.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torch.nn.functional as F 4 | import random 5 | from torch.autograd import Variable 6 | 7 | 8 | class S2VTModel(nn.Module): 9 | def __init__(self, vocab_size, max_len, dim_hidden, dim_word, dim_vid=2048, sos_id=1, eos_id=0, 10 | n_layers=1, bidirectional=False, rnn_cell='gru', rnn_dropout_p=0.2): 11 | # python 3 12 | # super().__init__() 13 | super(S2VTModel, self).__init__() 14 | if rnn_cell.lower() == 'lstm': 15 | self.rnn_cell = nn.LSTM 16 | elif rnn_cell.lower() == 'gru': 17 | self.rnn_cell = nn.GRU 18 | # hidden_size * num_directions 19 | # num_directions = 2 if bidirectional else 1 20 | rnn_output_size = dim_hidden * 2 if bidirectional else dim_hidden 21 | 22 | self.rnn1 = self.rnn_cell(dim_vid, dim_hidden, n_layers, bidirectional=bidirectional, 23 | batch_first=True, dropout=rnn_dropout_p) 24 | self.rnn2 = self.rnn_cell(rnn_output_size + dim_word, dim_hidden, n_layers, bidirectional=bidirectional, 25 | batch_first=True, dropout=rnn_dropout_p) 26 | self.rnn_cell_type = rnn_cell.lower() 27 | self.n_layers = n_layers 28 | self.dim_vid = dim_vid 29 | self.dim_output = vocab_size 30 | self.dim_hidden = dim_hidden 31 | self.dim_word = dim_word 32 | self.max_length = max_len 33 | self.sos_id = sos_id 34 | self.eos_id = eos_id 35 | self.embedding = nn.Embedding(self.dim_output, self.dim_word) 36 | 37 | self.out = nn.Linear(rnn_output_size, self.dim_output) 38 | 39 | def forward(self, vid_feats, 
target_variable=None, 40 | mode='train', opt={}): 41 | 42 | batch_size, n_frames, _ = vid_feats.shape 43 | padding_words = Variable(vid_feats.data.new(batch_size, n_frames, self.dim_word)).zero_() 44 | state1 = None 45 | state2 = None 46 | self.rnn1.flatten_parameters() 47 | self.rnn2.flatten_parameters() 48 | output1, state1 = self.rnn1(vid_feats, state1) 49 | input2 = torch.cat((output1, padding_words), dim=2) 50 | output2, state2 = self.rnn2(input2, state2) 51 | 52 | padding_frames = Variable(vid_feats.data.new(batch_size, 1, self.dim_vid)).zero_() 53 | seq_probs = [] 54 | seq_preds = [] 55 | if mode == 'train': 56 | for i in range(self.max_length - 1): 57 | # doesn't input to the network 58 | current_words = self.embedding(target_variable[:, i]) 59 | self.rnn1.flatten_parameters() 60 | self.rnn2.flatten_parameters() 61 | output1, state1 = self.rnn1(padding_frames, state1) 62 | input2 = torch.cat( 63 | (output1, current_words.unsqueeze(1)), dim=2) 64 | output2, state2 = self.rnn2(input2, state2) 65 | logits = self.out(output2.squeeze(1)) 66 | logits = F.log_softmax(logits, dim=1) 67 | seq_probs.append(logits.unsqueeze(1)) 68 | seq_probs = torch.cat(seq_probs, 1) 69 | else: 70 | beam_size = opt.get('beam_size', 1) 71 | if beam_size == 1: 72 | current_words = self.embedding(Variable(torch.LongTensor([self.sos_id] * batch_size)).cuda()) 73 | for i in range(self.max_length - 1): 74 | self.rnn1.flatten_parameters() 75 | self.rnn2.flatten_parameters() 76 | output1, state1 = self.rnn1(padding_frames, state1) 77 | input2 = torch.cat( 78 | (output1, current_words.unsqueeze(1)), dim=2) 79 | output2, state2 = self.rnn2(input2, state2) 80 | logits = self.out(output2.squeeze(1)) 81 | logits = F.log_softmax(logits, dim=1) 82 | seq_probs.append(logits.unsqueeze(1)) 83 | _, preds = torch.max(logits, 1) 84 | current_words = self.embedding(preds) 85 | seq_preds.append(preds.unsqueeze(1)) 86 | seq_probs = torch.cat(seq_probs, 1) 87 | seq_preds = torch.cat(seq_preds, 1) 88 | else: 89 | # batch*dim_word 90 | start = [Variable(torch.LongTensor([self.sos_id] * batch_size)).cuda()] 91 | current_words = [[start, 0.0, state2]] 92 | for i in range(self.max_length - 1): 93 | self.rnn1.flatten_parameters() 94 | self.rnn2.flatten_parameters() 95 | # output1: batch*1*dim_hidden 96 | output1, state1 = self.rnn1(padding_frames, state1) 97 | temp = [] 98 | for s in current_words: 99 | # s: [[batch*word_embed1, batch*word_embed2...], prob, state2] 100 | input2 = torch.cat( 101 | (output1, self.embedding(s[0][-1]).unsqueeze(1)), dim=2) 102 | output2, s[2] = self.rnn2(input2, s[2]) 103 | logits = self.out(output2.squeeze(1)) 104 | # batch*voc_size 105 | logits = F.log_softmax(logits, dim=1) 106 | # batch*beam 107 | topk_prob, topk_word = torch.topk(logits, k=beam_size, dim=1) 108 | # batch*beam -> beam*batch 109 | topk_prob = topk_prob.permute(1, 0) 110 | topk_word = topk_word.permute(1, 0) 111 | # Getting the top (n) predictions and creating a 112 | # new list so as to put them via the model again 113 | for prob, word in zip(topk_prob, topk_word): 114 | next_cap = s[0][:] 115 | next_cap.append(word) 116 | temp.append([next_cap, s[1]+prob, 117 | (s[2][0].clone(), s[2][1].clone()) if isinstance(s[2], tuple) 118 | else s[2].clone()]) 119 | current_words = temp 120 | # sort by prob 121 | current_words = sorted(current_words, reverse=False, cmp=lambda x,y:cmp(int(x[1]),int(y[1]))) 122 | # get the top words 123 | current_words = current_words[-beam_size:] 124 | seq_preds = torch.cat(current_words[-1][0][1:], 0).unsqueeze(0) 125 
| return seq_probs, seq_preds 126 | 127 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- 1 | from .EncoderRNN import EncoderRNN 2 | from .DecoderRNN import DecoderRNN 3 | from .S2VTAttModel import S2VTAttModel 4 | from .S2VTModel import S2VTModel 5 | -------------------------------------------------------------------------------- /opts.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | 4 | def parse_opt(): 5 | parser = argparse.ArgumentParser() 6 | # Data input settings 7 | parser.add_argument( 8 | '--input_json', 9 | type=str, 10 | default='data/videodatainfo_2017.json', 11 | help='path to the json file containing video info') 12 | parser.add_argument( 13 | '--info_json', 14 | type=str, 15 | default='data/info.json', 16 | help='path to the json file containing additional info and vocab') 17 | parser.add_argument( 18 | '--caption_json', 19 | type=str, 20 | default='data/caption.json', 21 | help='path to the processed video caption json') 22 | 23 | parser.add_argument( 24 | '--feats_dir', 25 | nargs='+', 26 | type=str, 27 | default=['data/feats/resnet152/'], 28 | help='path to the directory containing the preprocessed fc feats') 29 | 30 | parser.add_argument('--c3d_feats_dir', type=str, default='data/c3d_feats') 31 | parser.add_argument( 32 | '--with_c3d', type=int, default=0, help='whether to use c3d features') 33 | 34 | parser.add_argument( 35 | '--cached_tokens', 36 | type=str, 37 | default='msr-all-idxs', 38 | help='Cached token file for calculating cider score \ 39 | during self critical training.') 40 | 41 | # Model settings 42 | parser.add_argument( 43 | "--model", type=str, default='S2VTModel', help="which model to use") 44 | 45 | parser.add_argument( 46 | "--max_len", 47 | type=int, 48 | default=28, 49 | help='max length of captions (containing <sos>, <eos>)') 50 | parser.add_argument( 51 | "--bidirectional", 52 | type=int, 53 | default=0, 54 | help="0 for disable, 1 for enable. encoder/decoder bidirectional.") 55 | parser.add_argument( 56 | '--n_frame_steps', 57 | type=int, 58 | default=80, 59 | help='how many frames to sample per video') 60 | parser.add_argument( 61 | '--dim_hidden', 62 | type=int, 63 | default=512, 64 | help='size of the rnn hidden layer') 65 | parser.add_argument( 66 | '--num_layers', type=int, default=1, help='number of layers in the RNN') 67 | parser.add_argument( 68 | '--input_dropout_p', 69 | type=float, 70 | default=0.2, 71 | help='strength of dropout in the Language Model RNN') 72 | parser.add_argument( 73 | '--rnn_type', type=str, default='gru', help='lstm or gru') 74 | parser.add_argument( 75 | '--rnn_dropout_p', 76 | type=float, 77 | default=0.5, 78 | help='strength of dropout in the Language Model RNN') 79 | parser.add_argument( 80 | '--dim_word', 81 | type=int, 82 | default=512, 83 | help='the encoding size of each token in the vocabulary, and the video.'
84 | ) 85 | 86 | parser.add_argument( 87 | '--dim_vid', 88 | type=int, 89 | default=2048, 90 | help='dim of features of video frames') 91 | 92 | # Optimization: General 93 | 94 | parser.add_argument( 95 | '--epochs', type=int, default=6001, help='number of epochs') 96 | parser.add_argument( 97 | '--batch_size', type=int, default=128, help='minibatch size') 98 | parser.add_argument( 99 | '--grad_clip', 100 | type=float, 101 | default=5, # 5., 102 | help='clip gradients at this value') 103 | 104 | parser.add_argument( 105 | '--self_crit_after', 106 | type=int, 107 | default=-1, 108 | help='After what epoch do we start self-critical training? \ 109 | (-1 = disable; never use it, 0 = use it from the start)' 110 | ) 111 | 112 | parser.add_argument( 113 | '--learning_rate', type=float, default=4e-4, help='learning rate') 114 | 115 | parser.add_argument( 116 | '--learning_rate_decay_every', 117 | type=int, 118 | default=200, 119 | help='how often to decay the learning rate (in epochs)') 120 | parser.add_argument('--learning_rate_decay_rate', type=float, default=0.8) 121 | parser.add_argument( 122 | '--optim_alpha', type=float, default=0.9, help='alpha for adam') 123 | parser.add_argument( 124 | '--optim_beta', type=float, default=0.999, help='beta used for adam') 125 | parser.add_argument( 126 | '--optim_epsilon', 127 | type=float, 128 | default=1e-8, 129 | help='epsilon that goes into denominator for smoothing') 130 | parser.add_argument( 131 | '--weight_decay', 132 | type=float, 133 | default=5e-4, 134 | help='weight_decay. strength of weight regularization') 135 | 136 | parser.add_argument( 137 | '--save_checkpoint_every', 138 | type=int, 139 | default=50, 140 | help='how often to save a model checkpoint (in epochs)?') 141 | parser.add_argument( 142 | '--checkpoint_path', 143 | type=str, 144 | default='save', 145 | help='directory to store checkpointed models') 146 | 147 | parser.add_argument( 148 | '--gpu', type=str, default='0', help='gpu device number') 149 | 150 | args = parser.parse_args() 151 | 152 | return args 153 | -------------------------------------------------------------------------------- /prepro_coco.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | import nltk 4 | nltk.download('stopwords') 5 | from nltk.corpus import stopwords 6 | from collections import Counter 7 | from tqdm import tqdm 8 | 9 | 10 | def main(args): 11 | coco = json.load( 12 | open(args.coco_json))['annotations'] 13 | msr = json.load(open(args.msr_caption_json)) 14 | stopWords = set(stopwords.words('english')) 15 | coco_wordcounts = open(args.coco_wordcounts) 16 | coco_words = [] 17 | for i in coco_wordcounts: 18 | w = i.split()[0] 19 | coco_words.append(w) 20 | msr_wordcounts = [] 21 | for i in msr.values(): 22 | for j in i['final_captions']: 23 | msr_wordcounts += j 24 | msr_wordcounts = Counter(msr_wordcounts).most_common() 25 | labels = [i for i in msr_wordcounts if i[0] 26 | not in stopWords and i[0] in coco_words][:args.num_classes] 27 | for i in tqdm(coco): 28 | l = [] 29 | for j, w in enumerate(labels): 30 | if w[0] in i['caption']: 31 | l.append(j) 32 | i['labels'] = l 33 | coco_labels = {} 34 | for i in tqdm(coco): 35 | if i['image_id'] in coco_labels: 36 | coco_labels[i['image_id']] = coco_labels[i['image_id']] + \ 37 | list(set(i['labels']) - set(coco_labels[i['image_id']])) 38 | else: 39 | coco_labels[i['image_id']] = i['labels'] 40 | info = {'num_classes': args.num_classes, 'labels': coco_labels} 41 | with 
open(args.coco_labels_json, 'w') as f: 42 | json.dump(info, f) 43 | 44 | coco = json.load( 45 | open(args.coco_json))['images'] 46 | coco_path = {} 47 | for i in tqdm(coco): 48 | coco_path[i['id']] = i['file_name'] 49 | with open(args.coco_path_json, 'w') as f: 50 | json.dump(coco_path, f) 51 | 52 | 53 | if __name__ == '__main__': 54 | parser = argparse.ArgumentParser() 55 | parser.add_argument('--coco_json', type=str, 56 | default='data/mscoco/annotations/captions_train2014.json', help='path to coco train json') 57 | parser.add_argument('-coco_wordcounts', type=str, 58 | default='data/mscoco/word_counts.txt', help='word_counts.txt of coco dataset') 59 | parser.add_argument('--msr_caption_json', type=str, 60 | default='data/caption.json', help='path to processed msr vtt caption json') 61 | parser.add_argument('--num_classes', type=int, default=1000, 62 | help='number of classes each image') 63 | parser.add_argument('--coco_labels_json', type=str, default='data/coco_labels.json', 64 | help='path to processed coco train caption json') 65 | parser.add_argument('--coco_path_json', type=str, default='data/coco_path.json', 66 | help='image id and image file name pairs') 67 | args = parser.parse_args() 68 | main(args) 69 | -------------------------------------------------------------------------------- /prepro_feats.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import subprocess 3 | import glob 4 | from tqdm import tqdm 5 | import numpy as np 6 | import os 7 | import argparse 8 | 9 | import torch 10 | from torch import nn 11 | import torch.nn.functional as F 12 | from torch.autograd import Variable 13 | import pretrainedmodels 14 | from pretrainedmodels import utils 15 | 16 | C, H, W = 3, 224, 224 17 | 18 | 19 | def extract_frames(video, dst): 20 | with open(os.devnull, "w") as ffmpeg_log: 21 | if os.path.exists(dst): 22 | print(" cleanup: " + dst + "/") 23 | shutil.rmtree(dst) 24 | os.makedirs(dst) 25 | video_to_frames_command = ["ffmpeg", 26 | # (optional) overwrite output file if it exists 27 | '-y', 28 | '-i', video, # input file 29 | '-vf', "scale=400:300", # input file 30 | '-qscale:v', "2", # quality for JPEG 31 | '{0}/%06d.jpg'.format(dst)] 32 | subprocess.call(video_to_frames_command, 33 | stdout=ffmpeg_log, stderr=ffmpeg_log) 34 | 35 | 36 | def extract_feats(params, model, load_image_fn): 37 | global C, H, W 38 | model.eval() 39 | 40 | dir_fc = params['output_dir'] 41 | if not os.path.isdir(dir_fc): 42 | os.mkdir(dir_fc) 43 | print("save video feats to %s" % (dir_fc)) 44 | video_list = glob.glob(os.path.join(params['video_path'], '*.mp4')) 45 | for video in tqdm(video_list): 46 | video_id = video.split("/")[-1].split(".")[0] 47 | dst = params['model'] + '_' + video_id 48 | extract_frames(video, dst) 49 | 50 | image_list = sorted(glob.glob(os.path.join(dst, '*.jpg'))) 51 | samples = np.round(np.linspace( 52 | 0, len(image_list) - 1, params['n_frame_steps'])) 53 | image_list = [image_list[int(sample)] for sample in samples] 54 | images = torch.zeros((len(image_list), C, H, W)) 55 | for iImg in range(len(image_list)): 56 | img = load_image_fn(image_list[iImg]) 57 | images[iImg] = img 58 | fc_feats = model(Variable(images, volatile=True).cuda()).squeeze() 59 | img_feats = fc_feats.data.cpu().numpy() 60 | # Save the inception features 61 | outfile = os.path.join(dir_fc, video_id + '.npy') 62 | np.save(outfile, img_feats) 63 | # cleanup 64 | shutil.rmtree(dst) 65 | 66 | 67 | if __name__ == '__main__': 68 | parser = 
argparse.ArgumentParser() 69 | parser.add_argument("--gpu", dest='gpu', type=str, default='0', 70 | help='Set CUDA_VISIBLE_DEVICES environment variable, optional') 71 | parser.add_argument("--output_dir", dest='output_dir', type=str, 72 | default='data/feats/resnet152', help='directory to store features') 73 | parser.add_argument("--n_frame_steps", dest='n_frame_steps', type=int, default=40, 74 | help='how many frames to sampler per video') 75 | parser.add_argument("--video_path", dest='video_path', type=str, 76 | default='data/train-video', help='path to video dataset') 77 | parser.add_argument("--model", dest="model", type=str, default='resnet152', 78 | help='the CNN model you want to use to extract_feats') 79 | parser.add_argument("--saved_model", dest="saved_model", type=str, default='', 80 | help='the pretrained CNN model you want to use to extract_feats') 81 | 82 | args = parser.parse_args() 83 | os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu 84 | params = vars(args) 85 | if params['model'] == 'inception_v3': 86 | C, H, W = 3, 299, 299 87 | model = pretrainedmodels.inceptionv3(pretrained='imagenet') 88 | load_image_fn = utils.LoadTransformImage(model) 89 | elif params['model'] == 'vgg16': 90 | C, H, W = 3, 224, 224 91 | model = pretrainedmodels.vgg16(pretrained='imagenet') 92 | load_image_fn = utils.LoadTransformImage(model) 93 | elif params['model'] == 'vgg19': 94 | C, H, W = 3, 224, 224 95 | model = pretrainedmodels.vgg19(pretrained='imagenet') 96 | load_image_fn = utils.LoadTransformImage(model) 97 | elif params['model'] == 'resnet50': 98 | C, H, W = 3, 224, 224 99 | model = pretrainedmodels.resnet50(pretrained='imagenet') 100 | load_image_fn = utils.LoadTransformImage(model) 101 | elif params['model'] == 'resnet101': 102 | C, H, W = 3, 224, 224 103 | model = pretrainedmodels.resnet101(pretrained='imagenet') 104 | load_image_fn = utils.LoadTransformImage(model) 105 | elif params['model'] == 'resnet152': 106 | C, H, W = 3, 224, 224 107 | model = pretrainedmodels.resnet152(pretrained='imagenet') 108 | load_image_fn = utils.LoadTransformImage(model) 109 | elif params['model'] == 'inception_v4': 110 | C, H, W = 3, 299, 299 111 | model = pretrainedmodels.inceptionv4( 112 | num_classes=1000, pretrained='imagenet') 113 | load_image_fn = utils.LoadTransformImage(model) 114 | elif params['model'] == 'nasnet': 115 | C, H, W = 3, 331, 331 116 | model = pretrainedmodels.nasnetalarge(num_classes=1001, pretrained='imagenet+background') 117 | load_image_fn = utils.LoadTransformImage(model) 118 | else: 119 | print("doesn't support %s" % (params['model'])) 120 | 121 | model.last_linear = utils.Identity() 122 | model = nn.DataParallel(model) 123 | if params['saved_model'] != '': 124 | model.load_state_dict(torch.load(params['saved_model']), strict=False) 125 | model = model.cuda() 126 | extract_feats(params, model, load_image_fn) 127 | -------------------------------------------------------------------------------- /prepro_ngrams.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | import pickle as pkl 4 | from collections import defaultdict 5 | 6 | 7 | def precook(s, n=4): 8 | """ 9 | Takes a string as input and returns an object that can be given to 10 | either cook_refs or cook_test. This is optional: cook_refs and cook_test 11 | can take string arguments as well. 
12 | :param s: string : sentence to be converted into ngrams 13 | :param n: int : number of ngrams for which representation is calculated 14 | :return: term frequency vector for occurring ngrams 15 | """ 16 | words = s.split() 17 | counts = defaultdict(int) 18 | for k in range(1, n+1): 19 | for i in range(len(words)-k+1): 20 | ngram = tuple(words[i:i+k]) 21 | counts[ngram] += 1 22 | return counts 23 | 24 | 25 | def cook_refs(refs, n=4): # lhuang: oracle will call with "average" 26 | '''Takes a list of reference sentences for a single segment 27 | and returns an object that encapsulates everything that BLEU 28 | needs to know about them. 29 | :param refs: list of string : reference sentences for some image 30 | :param n: int : number of ngrams for which (ngram) representation is calculated 31 | :return: result (list of dict) 32 | ''' 33 | return [precook(ref, n) for ref in refs] 34 | 35 | 36 | def create_crefs(refs): 37 | crefs = [] 38 | for ref in refs: 39 | # ref is a list of reference captions for one video 40 | crefs.append(cook_refs(ref)) 41 | return crefs 42 | 43 | 44 | def compute_doc_freq(crefs): 45 | ''' 46 | Compute document frequency for the reference captions. 47 | This will be used to compute idf (inverse document frequency) later. 48 | Each ngram is counted once per reference set. 49 | :return: document_frequency 50 | ''' 51 | document_frequency = defaultdict(float) 52 | for refs in crefs: 53 | # refs, k ref captions of one video 54 | for ngram in set([ngram for ref in refs for (ngram, count) in ref.items()]): 55 | document_frequency[ngram] += 1 56 | # maxcounts[ngram] = max(maxcounts.get(ngram,0), count) 57 | return document_frequency 58 | 59 | 60 | def build_dict(vids, wtoi): 61 | refs_words = [] 62 | refs_idxs = [] 63 | count_vids = 0 64 | for vid in vids: 65 | ref_words = [] 66 | ref_idxs = [] 67 | for cap in vids[vid]['final_captions']: 68 | tmp_tokens = cap 69 | tmp_tokens = [_ if _ in wtoi else '<UNK>' for _ in tmp_tokens] 70 | ref_words.append(' '.join(tmp_tokens)) 71 | ref_idxs.append(' '.join([str(wtoi[_]) for _ in tmp_tokens])) 72 | refs_words.append(ref_words) 73 | refs_idxs.append(ref_idxs) 74 | count_vids += 1 75 | ngram_words = compute_doc_freq(create_crefs(refs_words)) 76 | ngram_idxs = compute_doc_freq(create_crefs(refs_idxs)) 77 | return ngram_words, ngram_idxs, count_vids 78 | 79 | 80 | def main(params): 81 | vids = json.load(open(params['caption_json'])) 82 | wtoi = json.load(open(params['info_json']))['word_to_ix'] 83 | 84 | ngram_words, ngram_idxs, ref_len = build_dict(vids, wtoi) 85 | 86 | pkl.dump({'document_frequency': ngram_words, 'ref_len': ref_len}, open( 87 | params['output_pkl']+'-words.p', 'wb')) 88 | pkl.dump({'document_frequency': ngram_idxs, 'ref_len': ref_len}, open( 89 | params['output_pkl']+'-idxs.p', 'wb')) 90 | 91 | if __name__ == "__main__": 92 | 93 | parser = argparse.ArgumentParser() 94 | 95 | # input json 96 | parser.add_argument('--caption_json', default='data/caption.json', 97 | help='input json file containing video captions') 98 | parser.add_argument('--info_json', default='data/info.json', help='vocab info json file') 99 | parser.add_argument('--output_pkl', default='data/msr-all', help='output pickle file') 100 | args = parser.parse_args() 101 | params = vars(args) # convert to ordinary dict 102 | 103 | main(params) 104 | -------------------------------------------------------------------------------- /prepro_vocab.py: -------------------------------------------------------------------------------- 1 | import re 2 | import json 3 | import argparse 4 | import numpy as np 5 | 6 | 7 | 
def build_vocab(vids, params): 8 | count_thr = params['word_count_threshold'] 9 | # count up the number of words 10 | counts = {} 11 | for vid, caps in vids.items(): 12 | for cap in caps['captions']: 13 | ws = re.sub(r'[.!,;?]', ' ', cap).split() 14 | for w in ws: 15 | counts[w] = counts.get(w, 0) + 1 16 | # cw = sorted([(count, w) for w, count in counts.items()], reverse=True) 17 | total_words = sum(counts.values()) 18 | bad_words = [w for w, n in counts.items() if n <= count_thr] 19 | vocab = [w for w, n in counts.items() if n > count_thr] 20 | bad_count = sum(counts[w] for w in bad_words) 21 | print('number of bad words: %d/%d = %.2f%%' % 22 | (len(bad_words), len(counts), len(bad_words) * 100.0 / len(counts))) 23 | print('number of words in vocab would be %d' % (len(vocab), )) 24 | print('number of UNKs: %d/%d = %.2f%%' % 25 | (bad_count, total_words, bad_count * 100.0 / total_words)) 26 | # let's now produce the final annotations 27 | if bad_count > 0: 28 | # additional special UNK token we will use below to map infrequent words to 29 | print('inserting the special UNK token') 30 | vocab.append('<UNK>') 31 | for vid, caps in vids.items(): 32 | caps = caps['captions'] 33 | vids[vid]['final_captions'] = [] 34 | for cap in caps: 35 | ws = re.sub(r'[.!,;?]', ' ', cap).split() 36 | caption = [ 37 | '<sos>'] + [w if counts.get(w, 0) > count_thr else '<UNK>' for w in ws] + ['<eos>'] 38 | vids[vid]['final_captions'].append(caption) 39 | return vocab 40 | 41 | 42 | def main(params): 43 | videos = json.load(open(params['input_json'], 'r'))['sentences'] 44 | video_caption = {} 45 | for i in videos: 46 | if i['video_id'] not in video_caption.keys(): 47 | video_caption[i['video_id']] = {'captions': []} 48 | video_caption[i['video_id']]['captions'].append(i['caption']) 49 | # create the vocab 50 | vocab = build_vocab(video_caption, params) 51 | itow = {i + 2: w for i, w in enumerate(vocab)} 52 | wtoi = {w: i + 2 for i, w in enumerate(vocab)} # inverse table 53 | wtoi['<eos>'] = 0 54 | itow[0] = '<eos>' 55 | wtoi['<sos>'] = 1 56 | itow[1] = '<sos>' 57 | 58 | out = {} 59 | out['ix_to_word'] = itow 60 | out['word_to_ix'] = wtoi 61 | out['videos'] = {'train': [], 'val': [], 'test': []} 62 | videos = json.load(open(params['input_json'], 'r'))['videos'] 63 | for i in videos: 64 | out['videos'][i['split']].append(int(i['id'])) 65 | json.dump(out, open(params['info_json'], 'w')) 66 | json.dump(video_caption, open(params['caption_json'], 'w')) 67 | 68 | 69 | if __name__ == "__main__": 70 | parser = argparse.ArgumentParser() 71 | 72 | # input json 73 | parser.add_argument('--input_json', type=str, default='data/all_videodatainfo_2017.json', 74 | help='msr_vtt videoinfo json') 75 | parser.add_argument('--info_json', default='data/all_info.json', 76 | help='info about ix_to_word and word_to_ix') 77 | parser.add_argument('--caption_json', default='data/all_caption.json', help='caption json file') 78 | 79 | 80 | parser.add_argument('--word_count_threshold', default=1, type=int, 81 | help='only words that occur more than this number of times will be put in vocab') 82 | 83 | args = parser.parse_args() 84 | params = vars(args) # convert to ordinary dict 85 | main(params) 86 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | import numpy as np 5 | 6 | import misc.utils as utils 7 | import opts 8 | import torch 9 | import torch.optim as optim 10 | from dataloader import VideoDataset 11 | from 
misc.rewards import get_self_critical_reward, init_cider_scorer 12 | from models import DecoderRNN, EncoderRNN, S2VTAttModel, S2VTModel 13 | from torch import nn 14 | from torch.autograd import Variable 15 | from torch.utils.data import DataLoader 16 | 17 | 18 | def train(loader, model, crit, optimizer, lr_scheduler, opt, rl_crit=None): 19 | model.train() 20 | model = nn.DataParallel(model) 21 | for epoch in range(opt["epochs"]): 22 | lr_scheduler.step() 23 | 24 | iteration = 0 25 | # If start self crit training 26 | if opt["self_crit_after"] != -1 and epoch >= opt["self_crit_after"]: 27 | sc_flag = True 28 | init_cider_scorer(opt["cached_tokens"]) 29 | else: 30 | sc_flag = False 31 | 32 | for data in loader: 33 | torch.cuda.synchronize() 34 | fc_feats = Variable(data['fc_feats']).cuda() 35 | labels = Variable(data['labels']).long().cuda() 36 | masks = Variable(data['masks']).cuda() 37 | 38 | optimizer.zero_grad() 39 | if not sc_flag: 40 | seq_probs, _ = model(fc_feats, labels, 'train') 41 | loss = crit(seq_probs, labels[:, 1:], masks[:, 1:]) 42 | else: 43 | seq_probs, seq_preds = model( 44 | fc_feats, mode='inference', opt=opt) 45 | reward = get_self_critical_reward(model, fc_feats, data, 46 | seq_preds) 47 | print(reward.shape) 48 | loss = rl_crit(seq_probs, seq_preds, 49 | Variable( 50 | torch.from_numpy(reward).float().cuda())) 51 | 52 | loss.backward() 53 | utils.clip_gradient(optimizer, opt["grad_clip"]) 54 | optimizer.step() 55 | train_loss = loss.data[0] 56 | torch.cuda.synchronize() 57 | iteration += 1 58 | 59 | if not sc_flag: 60 | print("iter %d (epoch %d), train_loss = %.6f" % 61 | (iteration, epoch, train_loss)) 62 | else: 63 | print("iter %d (epoch %d), avg_reward = %.6f" % 64 | (iteration, epoch, np.mean(reward[:, 0]))) 65 | 66 | if epoch != 0 and epoch % opt["save_checkpoint_every"] == 0: 67 | model_path = os.path.join(opt["checkpoint_path"], 68 | 'model_%d.pth' % (epoch)) 69 | model_info_path = os.path.join(opt["checkpoint_path"], 70 | 'model_score.txt') 71 | torch.save(model.state_dict(), model_path) 72 | print("model saved to %s" % (model_path)) 73 | with open(model_info_path, 'a') as f: 74 | f.write("model_%d, loss: %.6f\n" % (epoch, train_loss)) 75 | 76 | 77 | def main(opt): 78 | dataset = VideoDataset(opt, 'train') 79 | dataloader = DataLoader(dataset, batch_size=opt["batch_size"], shuffle=True) 80 | opt["vocab_size"] = dataset.get_vocab_size() 81 | if opt["model"] == 'S2VTModel': 82 | model = S2VTModel( 83 | opt["vocab_size"], 84 | opt["max_len"], 85 | opt["dim_hidden"], 86 | opt["dim_word"], 87 | opt['dim_vid'], 88 | rnn_cell=opt['rnn_type'], 89 | n_layers=opt['num_layers'], 90 | bidirectional=opt["bidirectional"], 91 | rnn_dropout_p=opt["rnn_dropout_p"]).cuda() 92 | elif opt["model"] == "S2VTAttModel": 93 | encoder = EncoderRNN( 94 | opt["dim_vid"], 95 | opt["dim_hidden"], 96 | n_layers=opt['num_layers'], 97 | bidirectional=opt["bidirectional"], 98 | input_dropout_p=opt["input_dropout_p"], 99 | rnn_cell=opt['rnn_type'], 100 | rnn_dropout_p=opt["rnn_dropout_p"]) 101 | decoder = DecoderRNN( 102 | opt["vocab_size"], 103 | opt["max_len"], 104 | opt["dim_hidden"], 105 | opt["dim_word"], 106 | n_layers=opt['num_layers'], 107 | input_dropout_p=opt["input_dropout_p"], 108 | rnn_cell=opt['rnn_type'], 109 | rnn_dropout_p=opt["rnn_dropout_p"], 110 | bidirectional=opt["bidirectional"]) 111 | model = S2VTAttModel(encoder, decoder).cuda() 112 | crit = utils.LanguageModelCriterion() 113 | rl_crit = utils.RewardCriterion() 114 | optimizer = optim.Adam( 115 | model.parameters(), 
116 | lr=opt["learning_rate"], 117 | weight_decay=opt["weight_decay"]) 118 | exp_lr_scheduler = optim.lr_scheduler.StepLR( 119 | optimizer, 120 | step_size=opt["learning_rate_decay_every"], 121 | gamma=opt["learning_rate_decay_rate"]) 122 | 123 | train(dataloader, model, crit, optimizer, exp_lr_scheduler, opt, rl_crit) 124 | 125 | 126 | if __name__ == '__main__': 127 | opt = opts.parse_opt() 128 | opt = vars(opt) 129 | for key, value in opt.items(): 130 | print key, value 131 | os.environ['CUDA_VISIBLE_DEVICES'] = opt["gpu"] 132 | opt_json = os.path.join(opt["checkpoint_path"], 'opt_info.json') 133 | if not os.path.exists(opt["checkpoint_path"]): 134 | os.makedirs(opt["checkpoint_path"]) 135 | with open(opt_json, 'w') as f: 136 | json.dump(opt, f) 137 | print('save opt details to %s' % (opt_json)) 138 | main(opt) 139 | -------------------------------------------------------------------------------- /train_s2vt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ### nasnet_resnet101 4 | feat=nasnet_resnet101 5 | LOG=log/s2vt_${feat}_40frames-`date +%Y-%m-%d_%H-%M-%S`.log 6 | python train.py \ 7 | --gpu 0 \ 8 | --save_checkpoint_every 20 \ 9 | --epochs 1000 \ 10 | --n_frame_steps 40 \ 11 | --batch_size 100 \ 12 | --input_json data/all_videodatainfo_2017.json \ 13 | --info_json data/all_info.json \ 14 | --caption_json data/all_caption.json \ 15 | --checkpoint_path checkpoint/${feat}_40frames/s2vt \ 16 | --feats_dir data/feats/nasnet data/feats/resnet101 \ 17 | --dim_vid 6080 \ 18 | --rnn_type lstm \ 19 | --learning_rate_decay_every 100 \ 20 | --model S2VTModel \ 21 | 2>&1 | tee $LOG 22 | 23 | # ### nasnet 24 | # feat=nasnet 25 | # LOG=log/s2vt_${feat}_40frames-`date +%Y-%m-%d_%H-%M-%S`.log 26 | # python train.py \ 27 | # --gpu 0 \ 28 | # --save_checkpoint_every 50 \ 29 | # --epochs 1000 \ 30 | # --n_frame_steps 40 \ 31 | # --batch_size 100 \ 32 | # --input_json data/all_videodatainfo_2017.json \ 33 | # --info_json data/all_info.json \ 34 | # --caption_json data/all_caption.json \ 35 | # --checkpoint_path checkpoint/${feat}_40frames/s2vt \ 36 | # --feats_dir data/feats/$feat/ \ 37 | # --dim_vid 4032 \ 38 | # --rnn_type lstm \ 39 | # --learning_rate_decay_every 100 \ 40 | # --model S2VTModel \ 41 | # 2>&1 | tee $LOG 42 | 43 | 44 | ### inception_v4 40frames 45 | # feat=inception_v4 46 | # LOG=log/s2vt_${feat}_40frames-`date +%Y-%m-%d_%H-%M-%S`.log 47 | # python train.py \ 48 | # --gpu 0 \ 49 | # --save_checkpoint_every 50 \ 50 | # --epochs 1000 \ 51 | # --n_frame_steps 40 \ 52 | # --batch_size 100 \ 53 | # --input_json data/all_videodatainfo_2017.json \ 54 | # --info_json data/all_info.json \ 55 | # --caption_json data/all_caption.json \ 56 | # --checkpoint_path checkpoint/${feat}_40frames/s2vt \ 57 | # --feats_dir data/feats/$feat/ \ 58 | # --dim_vid 1536 \ 59 | # --rnn_type lstm \ 60 | # --learning_rate_decay_every 100 \ 61 | # --model S2VTModel \ 62 | # 2>&1 | tee $LOG 63 | 64 | ### resnet101 40frames 65 | # feat=resnet101 66 | # LOG=log/s2vt_${feat}_40frames-`date +%Y-%m-%d_%H-%M-%S`.log 67 | # python train.py \ 68 | # --gpu 1 \ 69 | # --save_checkpoint_every 50 \ 70 | # --epochs 1000 \ 71 | # --n_frame_steps 40 \ 72 | # --batch_size 100 \ 73 | # --input_json data/all_videodatainfo_2017.json \ 74 | # --info_json data/all_info.json \ 75 | # --caption_json data/all_caption.json \ 76 | # --checkpoint_path checkpoint/${feat}_40frames/s2vt \ 77 | # --feats_dir data/feats/$feat/ \ 78 | # --dim_vid 2048 \ 79 | # --rnn_type lstm \ 80 | # 
--learning_rate_decay_every 100 \ 81 | # --model S2VTModel \ 82 | # 2>&1 | tee $LOG 83 | 84 | 85 | ## resnet101_c3d_fc7_wo_ft 86 | # feat=resnet101 87 | # LOG=log/s2vt_resnet101_c3d_fc7_wo_ft-`date +%Y-%m-%d_%H-%M-%S`.log 88 | # python train.py \ 89 | # --gpu 1 \ 90 | # --save_checkpoint_every 50 \ 91 | # --epochs 1000 \ 92 | # --n_frame_steps 80 \ 93 | # --batch_size 100 \ 94 | # --input_json data/all_videodatainfo_2017.json \ 95 | # --info_json data/all_info.json \ 96 | # --caption_json data/all_caption.json \ 97 | # --checkpoint_path checkpoint/resnet101_c3d_fc7_wo_ft/s2vt \ 98 | # --feats_dir data/feats/$feat/ \ 99 | # --dim_vid 6144 \ 100 | # --with_c3d 1 \ 101 | # --c3d_feats_dir data/feats/c3d_fc7_wo_ft \ 102 | # --rnn_type lstm \ 103 | # --learning_rate_decay_every 200 \ 104 | # --model S2VTModel \ 105 | # 2>&1 | tee $LOG 106 | 107 | # ### resnet101 80frames 108 | # feat=resnet101 109 | # LOG=log/s2vt_${feat}_80frames-`date +%Y-%m-%d_%H-%M-%S`.log 110 | # python train.py \ 111 | # --gpu 0 \ 112 | # --save_checkpoint_every 50 \ 113 | # --epochs 500 \ 114 | # --n_frame_steps 80 \ 115 | # --batch_size 200 \ 116 | # --input_json data/all_videodatainfo_2017.json \ 117 | # --info_json data/all_info.json \ 118 | # --caption_json data/all_caption.json \ 119 | # --checkpoint_path checkpoint/$feat/s2vt \ 120 | # --feats_dir data/feats/$feat/ \ 121 | # --dim_vid 2048 \ 122 | # --rnn_type lstm \ 123 | # --learning_rate_decay_every 100 \ 124 | # --model S2VTModel \ 125 | # 2>&1 | tee $LOG -------------------------------------------------------------------------------- /train_s2vt_att.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | LOG=log/s2vt_att-`date +%Y-%m-%d_%H-%M-%S`.log 3 | python train.py \ 4 | --gpu 0,1 \ 5 | --save_checkpoint_every 10 \ 6 | --epochs 1000 \ 7 | --batch_size 80 \ 8 | --input_json data/all_videodatainfo_2017.json \ 9 | --info_json data/all_info.json \ 10 | --caption_json data/all_caption.json \ 11 | --checkpoint_path checkpoint/vgg16/s2vt_att \ 12 | --feats_dir data/feats/vgg16/trainval/ \ 13 | --dim_vid 4096 \ 14 | --rnn_type lstm \ 15 | --learning_rate_decay_every 100 \ 16 | --model S2VTAttModel \ 17 | 2>&1 | tee $LOG --------------------------------------------------------------------------------
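For reference, a minimal sketch of the dot-product attention path that is left uncommented in models/Attention.py: each encoder output is scored by its dot product with the decoder hidden state, the scores are softmax-normalized over the frame axis, and the encoder outputs are summed with those weights into a single context vector. The tensor sizes are invented for illustration and the snippet uses the current torch API (no Variable wrapper), so it is a sketch of the idea rather than code from the repository.

```python
import torch
import torch.nn.functional as F

# Invented sizes, for illustration only.
batch_size, seq_len, dim = 4, 40, 512
encoder_outputs = torch.randn(batch_size, seq_len, dim)  # per-frame features
hidden_state = torch.randn(batch_size, dim)              # current decoder state

# (batch, seq_len, dim) x (batch, dim, 1) -> (batch, seq_len): one score per frame
scores = torch.bmm(encoder_outputs, hidden_state.unsqueeze(2)).squeeze(2)
# Normalize over the frame axis, then take the weighted sum of encoder outputs.
alpha = F.softmax(scores, dim=1).unsqueeze(1)            # (batch, 1, seq_len)
context = torch.bmm(alpha, encoder_outputs).squeeze(1)   # (batch, dim)

assert context.shape == (batch_size, dim)
```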
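The special-token layout written by prepro_vocab.py is what S2VTModel relies on through its sos_id=1 and eos_id=0 defaults: index 0 is `<eos>`, index 1 is `<sos>`, and real vocabulary words start at index 2. A tiny sketch with an invented two-word vocabulary:

```python
# Hypothetical two-word vocabulary, just to show the index layout.
vocab = ['a', 'dog']                      # words that survive the count threshold
itow = {i + 2: w for i, w in enumerate(vocab)}
wtoi = {w: i + 2 for i, w in enumerate(vocab)}
wtoi['<eos>'], itow[0] = 0, '<eos>'       # matches eos_id=0 in S2VTModel
wtoi['<sos>'], itow[1] = 1, '<sos>'       # matches sos_id=1 in S2VTModel

caption = ['<sos>', 'a', 'dog', '<eos>']
print([wtoi[w] for w in caption])         # [1, 2, 3, 0]
```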
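precook() in prepro_ngrams.py turns a tokenized caption into a term-frequency dict over all 1- to 4-grams, which feeds the document-frequency counts used by the CIDEr scorer during self-critical training. A quick check on an invented sentence, assuming the script is importable from the repository root:

```python
from prepro_ngrams import precook  # hypothetical usage; run from the repo root

counts = precook('a dog runs')
print(counts[('a',)])                 # 1
print(counts[('a', 'dog')])           # 1
print(counts[('a', 'dog', 'runs')])   # 1
print(len(counts))                    # 3 unigrams + 2 bigrams + 1 trigram = 6
```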