├── .gitignore ├── 2d_feat_extract.sh ├── LICENSE ├── README.md ├── c3d_feat_extract ├── LICENSE ├── README.md ├── c3d_feat_extract.sh ├── class_names_list ├── classify.py ├── dataset.py ├── input ├── main.py ├── mean.py ├── model.py ├── models │ ├── __init__.py │ ├── densenet.py │ ├── pre_act_resnet.py │ ├── resnet.py │ ├── resnext.py │ └── wide_resnet.py ├── opts.py ├── spatial_transforms.py ├── temporal_transforms.py ├── test.py ├── train.py └── validation.py ├── caffe_feat_extract.py ├── caffe_feat_extract.sh ├── coco-caption ├── LICENSE ├── pyciderevalcap │ ├── __init__.py │ ├── cider │ │ ├── __init__.py │ │ ├── cider.py │ │ └── cider_scorer.py │ ├── ciderD │ │ ├── __init__.py │ │ ├── ciderD.py │ │ └── ciderD_scorer.py │ ├── eval.py │ └── tokenizer │ │ ├── __init__.py │ │ ├── ptbtokenizer.py │ │ ├── stanford-corenlp-3.4.1.jar │ │ ├── tmpBF49XX │ │ ├── tmpql9uU7 │ │ ├── tmpuCp_T0 │ │ ├── tmpxAmV_C │ │ └── tmpzNW4I2 ├── pycocoevalcap │ ├── __init__.py │ ├── bleu │ │ ├── LICENSE │ │ ├── __init__.py │ │ ├── bleu.py │ │ └── bleu_scorer.py │ ├── cider │ │ ├── __init__.py │ │ ├── cider.py │ │ └── cider_scorer.py │ ├── eval.py │ ├── meteor │ │ ├── __init__.py │ │ ├── meteor-1.5.jar │ │ └── meteor.py │ ├── rouge │ │ ├── __init__.py │ │ └── rouge.py │ └── tokenizer │ │ ├── __init__.py │ │ ├── ptbtokenizer.py │ │ └── stanford-corenlp-3.4.1.jar └── pycocotools │ ├── __init__.py │ ├── _mask.c │ ├── _mask.pyx │ ├── coco.py │ ├── cocoeval.py │ └── mask.py ├── dataloader.py ├── eval.py ├── eval_s2vt.sh ├── finetune_cnn.py ├── misc ├── __init__.py ├── cocoeval.py ├── rewards.py └── utils.py ├── models ├── Attention.py ├── DecoderRNN.py ├── EncoderRNN.py ├── S2VTAttModel.py ├── S2VTModel.py └── __init__.py ├── opts.py ├── prepro_coco.py ├── prepro_feats.py ├── prepro_ngrams.py ├── prepro_vocab.py ├── train.py ├── train_s2vt.sh └── train_s2vt_att.sh /.gitignore: -------------------------------------------------------------------------------- 1 | feats/ 2 | save*/ 3 | result/ 4 | results/ 5 | foo/ 6 | log/ 7 | data/ 8 | checkpoint/ 9 | pretrained_models/ 10 | *video* 11 | *.json 12 | *.ipynb 13 | .idea/ 14 | !scripts/*.py 15 | *.pth 16 | # Byte-compiled / optimized / DLL files 17 | __pycache__/ 18 | *.py[cod] 19 | *$py.class 20 | 21 | # C extensions 22 | *.so 23 | 24 | # Distribution / packaging 25 | .Python 26 | build/ 27 | develop-eggs/ 28 | dist/ 29 | downloads/ 30 | eggs/ 31 | .eggs/ 32 | lib/ 33 | lib64/ 34 | parts/ 35 | sdist/ 36 | var/ 37 | wheels/ 38 | *.egg-info/ 39 | .installed.cfg 40 | *.egg 41 | MANIFEST 42 | 43 | # PyInstaller 44 | # Usually these files are written by a python script from a template 45 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
46 | *.manifest 47 | *.spec 48 | 49 | # Installer logs 50 | pip-log.txt 51 | pip-delete-this-directory.txt 52 | 53 | # Unit test / coverage reports 54 | htmlcov/ 55 | .tox/ 56 | .coverage 57 | .coverage.* 58 | .cache 59 | nosetests.xml 60 | coverage.xml 61 | *.cover 62 | .hypothesis/ 63 | 64 | # Translations 65 | *.mo 66 | *.pot 67 | 68 | # Django stuff: 69 | *.log 70 | .static_storage/ 71 | .media/ 72 | local_settings.py 73 | 74 | # Flask stuff: 75 | instance/ 76 | .webassets-cache 77 | 78 | # Scrapy stuff: 79 | .scrapy 80 | 81 | # Sphinx documentation 82 | docs/_build/ 83 | 84 | # PyBuilder 85 | target/ 86 | 87 | # Jupyter Notebook 88 | .ipynb_checkpoints 89 | 90 | # pyenv 91 | .python-version 92 | 93 | # celery beat schedule file 94 | celerybeat-schedule 95 | 96 | # SageMath parsed files 97 | *.sage.py 98 | 99 | # Environments 100 | .env 101 | .venv 102 | env/ 103 | venv/ 104 | ENV/ 105 | env.bak/ 106 | venv.bak/ 107 | 108 | # Spyder project settings 109 | .spyderproject 110 | .spyproject 111 | 112 | # Rope project settings 113 | .ropeproject 114 | 115 | # mkdocs documentation 116 | /site 117 | 118 | # mypy 119 | .mypy_cache/ 120 | -------------------------------------------------------------------------------- /2d_feat_extract.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | feat=nasnet 3 | python prepro_feats.py \ 4 | --video_path data/videos \ 5 | --model ${feat} \ 6 | --output_dir data/feats/${feat} \ 7 | --n_frame_steps 80 \ 8 | --gpu 0 \ 9 | 10 | 11 | # --saved_model pretrain_models/resnet152-b121ed2d.pth \ 12 | # vgg16-397923af.pth 13 | # resnet101-5d3b4d8f.pth 14 | # resnet152-b121ed2d.pth -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 DingXia 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 |
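The `2d_feat_extract.sh` script above drives `prepro_feats.py`: it samples `--n_frame_steps` frames from each video, runs them through the 2D CNN selected with `--model` (nasnet, resnet101, resnet152, vgg16, ...), and writes one `videoxxx.npy` feature file per video under `--output_dir`. The snippet below is a minimal sketch of that idea for a single video; the frame-folder layout, the output path, and the torchvision ResNet-152 backbone are illustrative assumptions, not the repository's actual implementation (which relies on ffmpeg and the `pretrainedmodels` package listed in the README).

```python
# Hypothetical sketch of 2D feature extraction for ONE video.
# FRAME_DIR / OUT_PATH and the ResNet-152 backbone are illustrative assumptions.
import glob
import os

import numpy as np
import torch
import torch.nn as nn
from PIL import Image
from torchvision import models, transforms

N_FRAME_STEPS = 80                                # mirrors --n_frame_steps 80 in the script
FRAME_DIR = "data/frames/video0"                  # assumed: JPEG frames already extracted with ffmpeg
OUT_PATH = "data/feats/resnet152/video0.npy"

# ResNet-152 without its classification head -> 2048-dim pooled features per frame.
backbone = models.resnet152(pretrained=True)
backbone = nn.Sequential(*list(backbone.children())[:-1]).eval()

preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])

frames = sorted(glob.glob(os.path.join(FRAME_DIR, "*.jpg")))
# Pick N_FRAME_STEPS indices spread uniformly over the whole clip.
indices = np.linspace(0, len(frames) - 1, N_FRAME_STEPS).astype(int)

with torch.no_grad():
    batch = torch.stack([preprocess(Image.open(frames[i]).convert("RGB"))
                         for i in indices])
    feats = backbone(batch).squeeze(-1).squeeze(-1)   # (N_FRAME_STEPS, 2048)

os.makedirs(os.path.dirname(OUT_PATH), exist_ok=True)
np.save(OUT_PATH, feats.cpu().numpy())
```

Repeating this over all videos yields the `data/feats/<model>/videoxxx.npy` layout shown in the file tree of the README below, which training later consumes via `--feats_dir`.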
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # requirements # 2 | 3 | - cuda 4 | - pytorch 0.3.1 5 | - python3 (untested) or python2 (tested; best to stick with Python 2 throughout) 6 | - ffmpeg (can install using anaconda) 7 | 8 | # usage # 9 | 10 | 1. 2D feature extraction, e.g. resnet101, nasnet, etc. 11 | ```bash 12 | sh ./2d_feat_extract.sh 13 | # model: which CNN to extract features with 14 | # n_frame_steps: how many frames to sample from each video (80 is a good default) 15 | ``` 16 | 17 | 2. 3D feature extraction 18 | ```bash 19 | cd c3d_feat_extract 20 | sh ./c3d_feat_extract.sh 21 | # --mode feature: feature-extraction mode, no need to change this 22 | # change the options below according to the chosen model 23 | # --model_name resnext \ 24 | # --model_depth 101 \ 25 | # --resnext_cardinality 32 \ 26 | # --resnet_shortcut B \ 27 | # --model pretrained_models/resnext-101-64f-kinetics.pth 28 | ``` 29 | 3. Training 30 | 31 | ```bash 32 | ./train_s2vt.sh 33 | # adjust the options to your setup; see opts.py for what each option means 34 | ``` 35 | 36 | 4. Testing and scoring 37 | 38 | ```bash 39 | ./eval_s2vt.sh 40 | # adjust the options to your setup; see eval.py for what each option means 41 | ``` 42 | # file tree # 43 | 44 | The related files can be downloaded here: 45 | link: https://pan.baidu.com/s/1RDNygrWtz_PtVH8nh4vG3w password: nxyk 46 | ``` 47 | data 48 | │ all_caption.json 49 | │ all_info.json 50 | │ all_videodatainfo_2017.json 51 | └───feats 52 | │ └───nasnet 53 | │ │ │ videoxxx.npy 54 | │ │ │ ... 55 | │ └───resnet 56 | │ │ │ videoxxx.npy 57 | │ │ │ ... 58 | │ └───xxnet 59 | │ │ videoxxx.npy 60 | │ │ ... 61 | └───videos 62 | │ │ videoxxx.mp4 63 | │ │ ... 64 | │ 65 | │ 66 | also create these directories: 67 | log 68 | checkpoint 69 | result 70 | 71 | ``` 72 | 73 | # pytorch implementation of video captioning 74 | 75 | We recommend installing PyTorch and the Python packages using Anaconda. 76 | 77 | 78 | ### python packages 79 | 80 | - tqdm 81 | - pillow 82 | - pretrainedmodels 83 | - nltk 84 | 85 | ## Data 86 | 87 | MSR-VTT. The test videos don't have captions, so I split train-video into train/val/test. Extract the archives and put them in the `./data/` directory. 88 | 89 | - train-video: [download link](https://drive.google.com/file/d/1Qi6Gn_l93SzrvmKQQu-drI90L-x8B0ly/view?usp=sharing) 90 | - test-video: [download link](https://drive.google.com/file/d/10fPbEhD-ENVQihrRvKFvxcMzkDlhvf4Q/view?usp=sharing) 91 | - json info of train-video: [download link](https://drive.google.com/file/d/1LcTtsAvfnHhUfHMiI4YkDgN7lF1-_-m7/view?usp=sharing) 92 | - json info of test-video: [download link](https://drive.google.com/file/d/1Kgra0uMKDQssclNZXRLfbj9UQgBv-1YE/view?usp=sharing) 93 | 94 | ## Options 95 | 96 | All default options are defined in opts.py or the corresponding code file; change them as you like. 97 | 98 | ## Usage 99 | 100 | ### (Optional) c3d features 101 | You can use [video-classification-3d-cnn-pytorch](https://github.com/kenshohara/video-classification-3d-cnn-pytorch) to extract features from the videos, then mean-pool them to get a 2048-dim feature for each video (see the short pooling sketch after this README). 102 | 103 | ### Steps 104 | 105 | 1. preprocess videos and labels 106 | 107 | This step takes about 3 hours for the MSR-VTT dataset using one Titan XP GPU. 108 | 109 | ```bash 110 | python prepro_feats.py --output_dir data/feats/resnet152 --model resnet152 --n_frame_steps 40 --gpu 4,5 111 | 112 | python prepro_vocab.py 113 | ``` 114 | 115 | 2. Training a model 116 | 117 | ```bash 118 | 119 | python train.py --gpu 5,6,7 --epochs 9001 --batch_size 450 --checkpoint_path data/save --feats_dir data/feats/resnet152 --dim_vid 2048 --model S2VTAttModel 120 | ``` 121 | 122 | 3. test 123 | 124 | opt_info.json will be in the same directory as the saved model. 125 | 126 | ```bash 127 | python eval.py --recover_opt data/save/opt_info.json --saved_model data/save/model_1000.pth --batch_size 100 --gpu 1,0 128 | ``` 129 | 130 | ## Metrics 131 | 132 | I forked [coco-caption from XgDuan](https://github.com/XgDuan/coco-caption/tree/python3). Thanks for porting it to Python 3. 133 | 134 | ## TODO 135 | - lstm 136 | - beam search 137 | - reinforcement learning 138 | 139 | ## Note 140 | This repository is no longer maintained; please see my other repository [video-caption-openNMT.pytorch](https://github.com/xiadingZ/video-caption-openNMT.pytorch). It has higher performance and test scores. 141 |
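As noted in the "(Optional) c3d features" section of the README above, the 3D-CNN features come out as one row per 16-frame clip (that is what the feature mode of `c3d_feat_extract` saves for each video), and they can be mean-pooled into a single vector per video. Below is a minimal sketch of that pooling step, assuming per-video `.npy` files of shape `(n_clips, feat_dim)`; the directory names are placeholders, not a fixed layout of this repository.

```python
# Hypothetical sketch: average per-clip 3D-CNN features into one vector per video.
# CLIP_FEAT_DIR and POOLED_DIR are placeholder paths.
import glob
import os

import numpy as np

CLIP_FEAT_DIR = "data/feats/c3d"          # per-video .npy files of shape (n_clips, feat_dim)
POOLED_DIR = "data/feats/c3d_pooled"      # output: one (feat_dim,) vector per video
os.makedirs(POOLED_DIR, exist_ok=True)

for path in glob.glob(os.path.join(CLIP_FEAT_DIR, "*.npy")):
    clip_feats = np.load(path)            # e.g. (n_clips, 2048) for ResNeXt-101
    video_feat = clip_feats.mean(axis=0)  # temporal mean pooling over the clips
    np.save(os.path.join(POOLED_DIR, os.path.basename(path)), video_feat)
```

Whatever dimensionality comes out of this step is presumably what the `--dim_vid` option of `train.py` has to match (2048 in the training example above).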
-------------------------------------------------------------------------------- /c3d_feat_extract/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Kensho Hara 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /c3d_feat_extract/README.md: -------------------------------------------------------------------------------- 1 | # Video Classification Using 3D ResNet 2 | This is PyTorch code for video (action) classification using a 3D ResNet trained by [this code](https://github.com/kenshohara/3D-ResNets-PyTorch). 3 | The 3D ResNet is trained on the Kinetics dataset, which includes 400 action classes. 4 | This code takes videos as input and, in score mode, outputs class names and predicted class scores for every 16 frames. 5 | In feature mode, it outputs 512-dimensional features (after global average pooling) for every 16 frames. 6 | 7 | **A Torch (Lua) version of this code is available [here](https://github.com/kenshohara/video-classification-3d-cnn).** 8 | 9 | ## Requirements 10 | * [PyTorch](http://pytorch.org/) 11 | ``` 12 | conda install pytorch torchvision cuda80 -c soumith 13 | ``` 14 | * FFmpeg, FFprobe 15 | ``` 16 | wget http://johnvansickle.com/ffmpeg/releases/ffmpeg-release-64bit-static.tar.xz 17 | tar xvf ffmpeg-release-64bit-static.tar.xz 18 | cd ./ffmpeg-3.3.3-64bit-static/; sudo cp ffmpeg ffprobe /usr/local/bin; 19 | ``` 20 | * Python 3 21 | 22 | ## Preparation 23 | * Download this code. 24 | * Download the [pretrained model](https://drive.google.com/drive/folders/14KRBqT8ySfPtFSuLsFS2U4I-ihTDs0Y9?usp=sharing). 25 | * ResNeXt-101 achieved the best performance in our experiments. (See the [paper](https://arxiv.org/abs/1711.09577) for details.) 26 | 27 | ## Usage 28 | Assume the input video files are located in ```./videos```. 29 | 30 | To calculate class scores for every 16 frames, use ```--mode score```. 31 | ``` 32 | python main.py --input ./input --video_root ./videos --output ./output.json --model ./resnet-34-kinetics.pth --mode score 33 | ``` 34 | To visualize the classification results, use ```generate_result_video/generate_result_video.py```. 35 | 36 | To calculate video features for every 16 frames, use ```--mode feature```.
37 | ``` 38 | python main.py --input ./input --video_root ./videos --output ./output.json --model ./resnet-34-kinetics.pth --mode feature 39 | ``` 40 | 41 | 42 | ## Citation 43 | If you use this code, please cite the following: 44 | ``` 45 | @article{hara3dcnns, 46 | author={Kensho Hara and Hirokatsu Kataoka and Yutaka Satoh}, 47 | title={Can Spatiotemporal 3D CNNs Retrace the History of 2D CNNs and ImageNet?}, 48 | journal={arXiv preprint}, 49 | volume={arXiv:1711.09577}, 50 | year={2017}, 51 | } 52 | ``` 53 | -------------------------------------------------------------------------------- /c3d_feat_extract/c3d_feat_extract.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python main.py \ 3 | --n_classes 20 \ 4 | --gpu 1 \ 5 | --input input \ 6 | --model_name resnext \ 7 | --model_depth 101 \ 8 | --resnext_cardinality 32 \ 9 | --resnet_shortcut B \ 10 | --feat_dir /home/rgh/Matches/video-caption.pytorch/data/feats/c3d_kinectics_msrvtt \ 11 | --video_root /home/rgh/Matches/video-caption.pytorch/data/videos \ 12 | --output output.json \ 13 | --model pretrained_models/resnext-101-MSR-VTT-finetuned-25-epochs.pth \ 14 | --mode feature 15 | 16 | 17 | # python main.py \ 18 | # --n_classes 101 \ 19 | # --gpu 1 \ 20 | # --input input \ 21 | # --model_name resnext \ 22 | # --model_depth 101 \ 23 | # --resnext_cardinality 32 \ 24 | # --resnet_shortcut B \ 25 | # --feat_dir /home/rgh/Matches/video-caption.pytorch/data/feats/c3d_kinectics_hmdb \ 26 | # --video_root /home/rgh/Matches/video-caption.pytorch/data/videos \ 27 | # --output output.json \ 28 | # --model pretrained_models/resnext-101-kinetics-hmdb51_split1.pth \ 29 | # --mode feature 30 | 31 | # python main.py \ 32 | # --n_classes 51 \ 33 | # --gpu 1 \ 34 | # --input input \ 35 | # --model_name resnext \ 36 | # --model_depth 101 \ 37 | # --resnext_cardinality 32 \ 38 | # --resnet_shortcut B \ 39 | # --feat_dir /home/rgh/Matches/video-caption.pytorch/data/feats/c3d_kinectics_ucf \ 40 | # --video_root /home/rgh/Matches/video-caption.pytorch/data/videos \ 41 | # --output output.json \ 42 | # --model pretrained_models/resnext-101-kinetics-ucf101_split1.pth \ 43 | # --mode feature 44 | 45 | # python main.py \ 46 | # --n_classes 400 \ 47 | # --gpu 0 \ 48 | # --input input \ 49 | # --model_name resnext \ 50 | # --model_depth 101 \ 51 | # --resnext_cardinality 32 \ 52 | # --resnet_shortcut B \ 53 | # --feat_dir /home/rgh/Matches/video-caption.pytorch/data/feats/c3d_kinectics \ 54 | # --video_root /home/rgh/Matches/video-caption.pytorch/data/videos \ 55 | # --output output.json \ 56 | # --model pretrained_models/resnext-101-kinetics.pth \ 57 | # --mode feature 58 | 59 | # python main.py \ 60 | # --n_classes 400 \ 61 | # --sample_duration 64 \ 62 | # --gpu 0 \ 63 | # --input input \ 64 | # --model_name resnext \ 65 | # --model_depth 101 \ 66 | # --resnext_cardinality 32 \ 67 | # --resnet_shortcut B \ 68 | # --feat_dir /home/rgh/Matches/video-caption.pytorch/data/feats/c3d_kinectics_64f \ 69 | # --video_root /home/rgh/Matches/video-caption.pytorch/data/videos \ 70 | # --output output.json \ 71 | # --model pretrained_models/resnext-101-64f-kinetics.pth \ 72 | # --mode feature 73 | 74 | -------------------------------------------------------------------------------- /c3d_feat_extract/class_names_list: -------------------------------------------------------------------------------- 1 | abseiling 2 | air drumming 3 | answering questions 4 | applauding 5 | applying cream 6 | archery 7 | arm wrestling 8 | 
arranging flowers 9 | assembling computer 10 | auctioning 11 | baby waking up 12 | baking cookies 13 | balloon blowing 14 | bandaging 15 | barbequing 16 | bartending 17 | beatboxing 18 | bee keeping 19 | belly dancing 20 | bench pressing 21 | bending back 22 | bending metal 23 | biking through snow 24 | blasting sand 25 | blowing glass 26 | blowing leaves 27 | blowing nose 28 | blowing out candles 29 | bobsledding 30 | bookbinding 31 | bouncing on trampoline 32 | bowling 33 | braiding hair 34 | breading or breadcrumbing 35 | breakdancing 36 | brush painting 37 | brushing hair 38 | brushing teeth 39 | building cabinet 40 | building shed 41 | bungee jumping 42 | busking 43 | canoeing or kayaking 44 | capoeira 45 | carrying baby 46 | cartwheeling 47 | carving pumpkin 48 | catching fish 49 | catching or throwing baseball 50 | catching or throwing frisbee 51 | catching or throwing softball 52 | celebrating 53 | changing oil 54 | changing wheel 55 | checking tires 56 | cheerleading 57 | chopping wood 58 | clapping 59 | clay pottery making 60 | clean and jerk 61 | cleaning floor 62 | cleaning gutters 63 | cleaning pool 64 | cleaning shoes 65 | cleaning toilet 66 | cleaning windows 67 | climbing a rope 68 | climbing ladder 69 | climbing tree 70 | contact juggling 71 | cooking chicken 72 | cooking egg 73 | cooking on campfire 74 | cooking sausages 75 | counting money 76 | country line dancing 77 | cracking neck 78 | crawling baby 79 | crossing river 80 | crying 81 | curling hair 82 | cutting nails 83 | cutting pineapple 84 | cutting watermelon 85 | dancing ballet 86 | dancing charleston 87 | dancing gangnam style 88 | dancing macarena 89 | deadlifting 90 | decorating the christmas tree 91 | digging 92 | dining 93 | disc golfing 94 | diving cliff 95 | dodgeball 96 | doing aerobics 97 | doing laundry 98 | doing nails 99 | drawing 100 | dribbling basketball 101 | drinking 102 | drinking beer 103 | drinking shots 104 | driving car 105 | driving tractor 106 | drop kicking 107 | drumming fingers 108 | dunking basketball 109 | dying hair 110 | eating burger 111 | eating cake 112 | eating carrots 113 | eating chips 114 | eating doughnuts 115 | eating hotdog 116 | eating ice cream 117 | eating spaghetti 118 | eating watermelon 119 | egg hunting 120 | exercising arm 121 | exercising with an exercise ball 122 | extinguishing fire 123 | faceplanting 124 | feeding birds 125 | feeding fish 126 | feeding goats 127 | filling eyebrows 128 | finger snapping 129 | fixing hair 130 | flipping pancake 131 | flying kite 132 | folding clothes 133 | folding napkins 134 | folding paper 135 | front raises 136 | frying vegetables 137 | garbage collecting 138 | gargling 139 | getting a haircut 140 | getting a tattoo 141 | giving or receiving award 142 | golf chipping 143 | golf driving 144 | golf putting 145 | grinding meat 146 | grooming dog 147 | grooming horse 148 | gymnastics tumbling 149 | hammer throw 150 | headbanging 151 | headbutting 152 | high jump 153 | high kick 154 | hitting baseball 155 | hockey stop 156 | holding snake 157 | hopscotch 158 | hoverboarding 159 | hugging 160 | hula hooping 161 | hurdling 162 | hurling (sport) 163 | ice climbing 164 | ice fishing 165 | ice skating 166 | ironing 167 | javelin throw 168 | jetskiing 169 | jogging 170 | juggling balls 171 | juggling fire 172 | juggling soccer ball 173 | jumping into pool 174 | jumpstyle dancing 175 | kicking field goal 176 | kicking soccer ball 177 | kissing 178 | kitesurfing 179 | knitting 180 | krumping 181 | laughing 182 | laying bricks 183 | long 
jump 184 | lunge 185 | making a cake 186 | making a sandwich 187 | making bed 188 | making jewelry 189 | making pizza 190 | making snowman 191 | making sushi 192 | making tea 193 | marching 194 | massaging back 195 | massaging feet 196 | massaging legs 197 | massaging person's head 198 | milking cow 199 | mopping floor 200 | motorcycling 201 | moving furniture 202 | mowing lawn 203 | news anchoring 204 | opening bottle 205 | opening present 206 | paragliding 207 | parasailing 208 | parkour 209 | passing American football (in game) 210 | passing American football (not in game) 211 | peeling apples 212 | peeling potatoes 213 | petting animal (not cat) 214 | petting cat 215 | picking fruit 216 | planting trees 217 | plastering 218 | playing accordion 219 | playing badminton 220 | playing bagpipes 221 | playing basketball 222 | playing bass guitar 223 | playing cards 224 | playing cello 225 | playing chess 226 | playing clarinet 227 | playing controller 228 | playing cricket 229 | playing cymbals 230 | playing didgeridoo 231 | playing drums 232 | playing flute 233 | playing guitar 234 | playing harmonica 235 | playing harp 236 | playing ice hockey 237 | playing keyboard 238 | playing kickball 239 | playing monopoly 240 | playing organ 241 | playing paintball 242 | playing piano 243 | playing poker 244 | playing recorder 245 | playing saxophone 246 | playing squash or racquetball 247 | playing tennis 248 | playing trombone 249 | playing trumpet 250 | playing ukulele 251 | playing violin 252 | playing volleyball 253 | playing xylophone 254 | pole vault 255 | presenting weather forecast 256 | pull ups 257 | pumping fist 258 | pumping gas 259 | punching bag 260 | punching person (boxing) 261 | push up 262 | pushing car 263 | pushing cart 264 | pushing wheelchair 265 | reading book 266 | reading newspaper 267 | recording music 268 | riding a bike 269 | riding camel 270 | riding elephant 271 | riding mechanical bull 272 | riding mountain bike 273 | riding mule 274 | riding or walking with horse 275 | riding scooter 276 | riding unicycle 277 | ripping paper 278 | robot dancing 279 | rock climbing 280 | rock scissors paper 281 | roller skating 282 | running on treadmill 283 | sailing 284 | salsa dancing 285 | sanding floor 286 | scrambling eggs 287 | scuba diving 288 | setting table 289 | shaking hands 290 | shaking head 291 | sharpening knives 292 | sharpening pencil 293 | shaving head 294 | shaving legs 295 | shearing sheep 296 | shining shoes 297 | shooting basketball 298 | shooting goal (soccer) 299 | shot put 300 | shoveling snow 301 | shredding paper 302 | shuffling cards 303 | side kick 304 | sign language interpreting 305 | singing 306 | situp 307 | skateboarding 308 | ski jumping 309 | skiing (not slalom or crosscountry) 310 | skiing crosscountry 311 | skiing slalom 312 | skipping rope 313 | skydiving 314 | slacklining 315 | slapping 316 | sled dog racing 317 | smoking 318 | smoking hookah 319 | snatch weight lifting 320 | sneezing 321 | sniffing 322 | snorkeling 323 | snowboarding 324 | snowkiting 325 | snowmobiling 326 | somersaulting 327 | spinning poi 328 | spray painting 329 | spraying 330 | springboard diving 331 | squat 332 | sticking tongue out 333 | stomping grapes 334 | stretching arm 335 | stretching leg 336 | strumming guitar 337 | surfing crowd 338 | surfing water 339 | sweeping floor 340 | swimming backstroke 341 | swimming breast stroke 342 | swimming butterfly stroke 343 | swing dancing 344 | swinging legs 345 | swinging on something 346 | sword fighting 347 | tai chi 348 | 
taking a shower 349 | tango dancing 350 | tap dancing 351 | tapping guitar 352 | tapping pen 353 | tasting beer 354 | tasting food 355 | testifying 356 | texting 357 | throwing axe 358 | throwing ball 359 | throwing discus 360 | tickling 361 | tobogganing 362 | tossing coin 363 | tossing salad 364 | training dog 365 | trapezing 366 | trimming or shaving beard 367 | trimming trees 368 | triple jump 369 | tying bow tie 370 | tying knot (not on a tie) 371 | tying tie 372 | unboxing 373 | unloading truck 374 | using computer 375 | using remote controller (not gaming) 376 | using segway 377 | vault 378 | waiting in line 379 | walking the dog 380 | washing dishes 381 | washing feet 382 | washing hair 383 | washing hands 384 | water skiing 385 | water sliding 386 | watering plants 387 | waxing back 388 | waxing chest 389 | waxing eyebrows 390 | waxing legs 391 | weaving basket 392 | welding 393 | whistling 394 | windsurfing 395 | wrapping present 396 | wrestling 397 | writing 398 | yawning 399 | yoga 400 | zumba 401 | -------------------------------------------------------------------------------- /c3d_feat_extract/classify.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Variable 3 | 4 | from dataset import Video 5 | from spatial_transforms import (Compose, Normalize, Scale, CenterCrop, ToTensor) 6 | from temporal_transforms import LoopPadding 7 | 8 | def classify_video(video_dir, video_name, class_names, model, opt): 9 | assert opt.mode in ['score', 'feature'] 10 | 11 | spatial_transform = Compose([Scale(opt.sample_size), 12 | CenterCrop(opt.sample_size), 13 | ToTensor(), 14 | Normalize(opt.mean, [1, 1, 1])]) 15 | temporal_transform = LoopPadding(opt.sample_duration) 16 | data = Video(video_dir, spatial_transform=spatial_transform, 17 | temporal_transform=temporal_transform, 18 | sample_duration=opt.sample_duration) 19 | data_loader = torch.utils.data.DataLoader(data, batch_size=opt.batch_size, 20 | shuffle=False, num_workers=opt.n_threads, pin_memory=True) 21 | 22 | video_outputs = [] 23 | video_segments = [] 24 | for i, (inputs, segments) in enumerate(data_loader): 25 | inputs = Variable(inputs, volatile=True) 26 | outputs = model(inputs) 27 | 28 | video_outputs.append(outputs.cpu().data) 29 | video_segments.append(segments) 30 | 31 | video_outputs = torch.cat(video_outputs) 32 | video_segments = torch.cat(video_segments) 33 | if opt.mode == 'feature': 34 | return video_outputs.numpy() 35 | elif opt.mode == 'score': 36 | results = { 37 | 'video': video_name, 38 | 'clips': [] 39 | } 40 | 41 | _, max_indices = video_outputs.max(dim=1) 42 | for i in range(video_outputs.size(0)): 43 | clip_results = { 44 | 'segment': video_segments[i].tolist(), 45 | } 46 | clip_results['label'] = class_names[max_indices[i]] 47 | clip_results['scores'] = video_outputs[i].tolist() 48 | results['clips'].append(clip_results) 49 | return results 50 | -------------------------------------------------------------------------------- /c3d_feat_extract/dataset.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.utils.data as data 3 | from PIL import Image 4 | import os 5 | import math 6 | import functools 7 | import copy 8 | import numpy as np 9 | 10 | def pil_loader(path): 11 | # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835) 12 | with open(path, 'rb') as f: 13 | with Image.open(f) as img: 14 | return img.convert('RGB') 15 | 
16 | 17 | def accimage_loader(path): 18 | try: 19 | return accimage.Image(path) 20 | except IOError: 21 | # Potentially a decoding problem, fall back to PIL.Image 22 | return pil_loader(path) 23 | 24 | 25 | def get_default_image_loader(): 26 | from torchvision import get_image_backend 27 | if get_image_backend() == 'accimage': 28 | import accimage 29 | return accimage_loader 30 | else: 31 | return pil_loader 32 | 33 | 34 | def video_loader(video_dir_path, frame_indices, image_loader): 35 | video = [] 36 | for i in frame_indices: 37 | image_path = os.path.join(video_dir_path, 'image_{:05d}.jpg'.format(i)) 38 | if os.path.exists(image_path): 39 | video.append(image_loader(image_path)) 40 | else: 41 | return video 42 | 43 | return video 44 | 45 | 46 | def get_default_video_loader(): 47 | image_loader = get_default_image_loader() 48 | return functools.partial(video_loader, image_loader=image_loader) 49 | 50 | 51 | def load_annotation_data(data_file_path): 52 | with open(data_file_path, 'r') as data_file: 53 | return json.load(data_file) 54 | 55 | 56 | def get_class_labels(data): 57 | class_labels_map = {} 58 | index = 0 59 | for class_label in data['labels']: 60 | class_labels_map[class_label] = index 61 | index += 1 62 | return class_labels_map 63 | 64 | 65 | def get_video_names_and_annotations(data, subset): 66 | video_names = [] 67 | annotations = [] 68 | 69 | for key, value in data['database'].items(): 70 | this_subset = value['subset'] 71 | if this_subset == subset: 72 | if subset == 'testing': 73 | video_names.append('test/{}'.format(key)) 74 | else: 75 | label = value['annotations']['label'] 76 | video_names.append('{}/{}'.format(label, key)) 77 | annotations.append(value['annotations']) 78 | 79 | return video_names, annotations 80 | 81 | 82 | def make_dataset(video_path, sample_duration): 83 | dataset = [] 84 | 85 | n_frames = len(os.listdir(video_path)) 86 | 87 | begin_t = 1 88 | end_t = n_frames 89 | sample = { 90 | 'video': video_path, 91 | 'segment': [begin_t, end_t], 92 | 'n_frames': n_frames, 93 | } 94 | 95 | step = sample_duration 96 | for i in range(1, (n_frames - sample_duration + 1), step): 97 | sample_i = copy.deepcopy(sample) 98 | sample_i['frame_indices'] = list(range(i, i + sample_duration)) # [i: i + sample_duration) same as segment 99 | sample_i['segment'] = torch.IntTensor([i, i + sample_duration - 1]) 100 | dataset.append(sample_i) 101 | if n_frames % sample_duration != 0: 102 | sample_i = copy.deepcopy(sample) 103 | if n_frames - sample_duration + 1 >= 1: 104 | sample_i['frame_indices'] = list(range(n_frames - sample_duration + 1, n_frames + 1)) 105 | sample_i['segment'] = torch.IntTensor([n_frames - sample_duration + 1, n_frames]) 106 | else: 107 | sample_i['frame_indices'] = np.round(np.linspace(1, n_frames, sample_duration))\ 108 | .astype(np.int32).tolist() 109 | sample_i['segment'] = torch.IntTensor([1, n_frames]) 110 | dataset.append(sample_i) 111 | return dataset 112 | 113 | 114 | class Video(data.Dataset): 115 | def __init__(self, video_path, 116 | spatial_transform=None, temporal_transform=None, 117 | sample_duration=16, get_loader=get_default_video_loader): 118 | self.data = make_dataset(video_path, sample_duration) 119 | 120 | self.spatial_transform = spatial_transform 121 | self.temporal_transform = temporal_transform 122 | self.loader = get_loader() 123 | 124 | def __getitem__(self, index): 125 | """ 126 | Args: 127 | index (int): Index 128 | Returns: 129 | tuple: (image, target) where target is class_index of the target class. 
130 | """ 131 | path = self.data[index]['video'] 132 | 133 | frame_indices = self.data[index]['frame_indices'] 134 | if self.temporal_transform is not None: 135 | frame_indices = self.temporal_transform(frame_indices) 136 | clip = self.loader(path, frame_indices) 137 | if self.spatial_transform is not None: 138 | clip = [self.spatial_transform(img) for img in clip] 139 | clip = torch.stack(clip, 0).permute(1, 0, 2, 3) 140 | 141 | target = self.data[index]['segment'] 142 | 143 | return clip, target 144 | 145 | def __len__(self): 146 | return len(self.data) 147 | -------------------------------------------------------------------------------- /c3d_feat_extract/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import subprocess 5 | import numpy as np 6 | import torch 7 | from torch import nn 8 | 9 | from opts import parse_opts 10 | from model import generate_model 11 | from mean import get_mean 12 | from classify import classify_video 13 | 14 | if __name__=="__main__": 15 | opt = parse_opts() 16 | opt.mean = get_mean() 17 | opt.arch = '{}-{}'.format(opt.model_name, opt.model_depth) 18 | opt.sample_size = 112 19 | # if opt.model.find('64f') != -1: 20 | # opt.sample_duration = 64 21 | # else: 22 | # opt.sample_duration = 16 23 | os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu 24 | model = generate_model(opt) 25 | print('loading model {}'.format(opt.model)) 26 | model_data = torch.load(opt.model) 27 | assert opt.arch == model_data['arch'] 28 | model.load_state_dict(model_data['state_dict']) 29 | model.eval() 30 | if opt.verbose: 31 | print(model) 32 | 33 | input_files = [] 34 | with open(opt.input, 'r') as f: 35 | for row in f: 36 | input_files.append(row[:-1]) 37 | 38 | class_names = [] 39 | with open('class_names_list') as f: 40 | for row in f: 41 | class_names.append(row[:-1]) 42 | 43 | ffmpeg_loglevel = 'quiet' 44 | if opt.verbose: 45 | ffmpeg_loglevel = 'info' 46 | 47 | if os.path.exists('tmp'): 48 | subprocess.call('rm -rf tmp', shell=True) 49 | 50 | outputs = [] 51 | for input_file in input_files: 52 | video_path = os.path.join(opt.video_root, input_file) 53 | if os.path.exists(video_path): 54 | print(video_path) 55 | subprocess.call('mkdir tmp', shell=True) 56 | subprocess.call('ffmpeg -i {} tmp/image_%05d.jpg'.format(video_path), 57 | shell=True) 58 | 59 | result = classify_video('tmp', input_file, class_names, model, opt) 60 | if opt.mode == 'score': 61 | outputs.append(result) 62 | elif opt.mode == 'feature': 63 | feat_path = os.path.join(opt.feat_dir, input_file.split('.')[0]+'.npy') 64 | np.save(feat_path,result) 65 | subprocess.call('rm -rf tmp', shell=True) 66 | else: 67 | print('{} does not exist'.format(input_file)) 68 | 69 | if os.path.exists('tmp'): 70 | subprocess.call('rm -rf tmp', shell=True) 71 | if opt.mode == 'score': 72 | with open(opt.output, 'w') as f: 73 | json.dump(outputs, f) 74 | -------------------------------------------------------------------------------- /c3d_feat_extract/mean.py: -------------------------------------------------------------------------------- 1 | def get_mean(): 2 | return [114.7748, 107.7354, 99.4750] 3 | -------------------------------------------------------------------------------- /c3d_feat_extract/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | from models import resnet, pre_act_resnet, wide_resnet, resnext, densenet 5 | 6 | 7 | def generate_model(opt): 8 | 
assert opt.mode in ['score', 'feature'] 9 | if opt.mode == 'score': 10 | last_fc = True 11 | elif opt.mode == 'feature': 12 | last_fc = False 13 | 14 | assert opt.model_name in ['resnet', 'preresnet', 'wideresnet', 'resnext', 'densenet'] 15 | 16 | if opt.model_name == 'resnet': 17 | assert opt.model_depth in [10, 18, 34, 50, 101, 152, 200] 18 | 19 | if opt.model_depth == 10: 20 | model = resnet.resnet10(num_classes=opt.n_classes, shortcut_type=opt.resnet_shortcut, 21 | sample_size=opt.sample_size, sample_duration=opt.sample_duration, 22 | last_fc=last_fc) 23 | elif opt.model_depth == 18: 24 | model = resnet.resnet18(num_classes=opt.n_classes, shortcut_type=opt.resnet_shortcut, 25 | sample_size=opt.sample_size, sample_duration=opt.sample_duration, 26 | last_fc=last_fc) 27 | elif opt.model_depth == 34: 28 | model = resnet.resnet34(num_classes=opt.n_classes, shortcut_type=opt.resnet_shortcut, 29 | sample_size=opt.sample_size, sample_duration=opt.sample_duration, 30 | last_fc=last_fc) 31 | elif opt.model_depth == 50: 32 | model = resnet.resnet50(num_classes=opt.n_classes, shortcut_type=opt.resnet_shortcut, 33 | sample_size=opt.sample_size, sample_duration=opt.sample_duration, 34 | last_fc=last_fc) 35 | elif opt.model_depth == 101: 36 | model = resnet.resnet101(num_classes=opt.n_classes, shortcut_type=opt.resnet_shortcut, 37 | sample_size=opt.sample_size, sample_duration=opt.sample_duration, 38 | last_fc=last_fc) 39 | elif opt.model_depth == 152: 40 | model = resnet.resnet152(num_classes=opt.n_classes, shortcut_type=opt.resnet_shortcut, 41 | sample_size=opt.sample_size, sample_duration=opt.sample_duration, 42 | last_fc=last_fc) 43 | elif opt.model_depth == 200: 44 | model = resnet.resnet200(num_classes=opt.n_classes, shortcut_type=opt.resnet_shortcut, 45 | sample_size=opt.sample_size, sample_duration=opt.sample_duration, 46 | last_fc=last_fc) 47 | elif opt.model_name == 'wideresnet': 48 | assert opt.model_depth in [50] 49 | 50 | if opt.model_depth == 50: 51 | model = wide_resnet.resnet50(num_classes=opt.n_classes, shortcut_type=opt.resnet_shortcut, k=opt.wide_resnet_k, 52 | sample_size=opt.sample_size, sample_duration=opt.sample_duration, 53 | last_fc=last_fc) 54 | elif opt.model_name == 'resnext': 55 | assert opt.model_depth in [50, 101, 152] 56 | 57 | if opt.model_depth == 50: 58 | model = resnext.resnet50(num_classes=opt.n_classes, shortcut_type=opt.resnet_shortcut, cardinality=opt.resnext_cardinality, 59 | sample_size=opt.sample_size, sample_duration=opt.sample_duration, 60 | last_fc=last_fc) 61 | elif opt.model_depth == 101: 62 | model = resnext.resnet101(num_classes=opt.n_classes, shortcut_type=opt.resnet_shortcut, cardinality=opt.resnext_cardinality, 63 | sample_size=opt.sample_size, sample_duration=opt.sample_duration, 64 | last_fc=last_fc) 65 | elif opt.model_depth == 152: 66 | model = resnext.resnet152(num_classes=opt.n_classes, shortcut_type=opt.resnet_shortcut, cardinality=opt.resnext_cardinality, 67 | sample_size=opt.sample_size, sample_duration=opt.sample_duration, 68 | last_fc=last_fc) 69 | elif opt.model_name == 'preresnet': 70 | assert opt.model_depth in [18, 34, 50, 101, 152, 200] 71 | 72 | if opt.model_depth == 18: 73 | model = pre_act_resnet.resnet18(num_classes=opt.n_classes, shortcut_type=opt.resnet_shortcut, 74 | sample_size=opt.sample_size, sample_duration=opt.sample_duration, 75 | last_fc=last_fc) 76 | elif opt.model_depth == 34: 77 | model = pre_act_resnet.resnet34(num_classes=opt.n_classes, shortcut_type=opt.resnet_shortcut, 78 | sample_size=opt.sample_size, 
sample_duration=opt.sample_duration, 79 | last_fc=last_fc) 80 | elif opt.model_depth == 50: 81 | model = pre_act_resnet.resnet50(num_classes=opt.n_classes, shortcut_type=opt.resnet_shortcut, 82 | sample_size=opt.sample_size, sample_duration=opt.sample_duration, 83 | last_fc=last_fc) 84 | elif opt.model_depth == 101: 85 | model = pre_act_resnet.resnet101(num_classes=opt.n_classes, shortcut_type=opt.resnet_shortcut, 86 | sample_size=opt.sample_size, sample_duration=opt.sample_duration, 87 | last_fc=last_fc) 88 | elif opt.model_depth == 152: 89 | model = pre_act_resnet.resnet152(num_classes=opt.n_classes, shortcut_type=opt.resnet_shortcut, 90 | sample_size=opt.sample_size, sample_duration=opt.sample_duration, 91 | last_fc=last_fc) 92 | elif opt.model_depth == 200: 93 | model = pre_act_resnet.resnet200(num_classes=opt.n_classes, shortcut_type=opt.resnet_shortcut, 94 | sample_size=opt.sample_size, sample_duration=opt.sample_duration, 95 | last_fc=last_fc) 96 | elif opt.model_name == 'densenet': 97 | assert opt.model_depth in [121, 169, 201, 264] 98 | 99 | if opt.model_depth == 121: 100 | model = densenet.densenet121(num_classes=opt.n_classes, 101 | sample_size=opt.sample_size, sample_duration=opt.sample_duration, 102 | last_fc=last_fc) 103 | elif opt.model_depth == 169: 104 | model = densenet.densenet169(num_classes=opt.n_classes, 105 | sample_size=opt.sample_size, sample_duration=opt.sample_duration, 106 | last_fc=last_fc) 107 | elif opt.model_depth == 201: 108 | model = densenet.densenet201(num_classes=opt.n_classes, 109 | sample_size=opt.sample_size, sample_duration=opt.sample_duration, 110 | last_fc=last_fc) 111 | elif opt.model_depth == 264: 112 | model = densenet.densenet264(num_classes=opt.n_classes, 113 | sample_size=opt.sample_size, sample_duration=opt.sample_duration, 114 | last_fc=last_fc) 115 | 116 | if not opt.no_cuda: 117 | model = model.cuda() 118 | model = nn.DataParallel(model, device_ids=None) 119 | 120 | return model 121 | -------------------------------------------------------------------------------- /c3d_feat_extract/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sundrops/video-caption.pytorch/66430862696678f7233a1fc67af22f0525b22e52/c3d_feat_extract/models/__init__.py -------------------------------------------------------------------------------- /c3d_feat_extract/models/densenet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from collections import OrderedDict 5 | import math 6 | 7 | __all__ = ['DenseNet', 'densenet121', 'densenet169', 'densenet201', 'densenet264'] 8 | 9 | 10 | def densenet121(**kwargs): 11 | model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 24, 16), 12 | **kwargs) 13 | return model 14 | 15 | 16 | def densenet169(**kwargs): 17 | model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 32, 32), 18 | **kwargs) 19 | return model 20 | 21 | 22 | def densenet201(**kwargs): 23 | model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 48, 32), 24 | **kwargs) 25 | return model 26 | 27 | 28 | def densenet264(**kwargs): 29 | model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 64, 48), 30 | **kwargs) 31 | return model 32 | 33 | 34 | def get_fine_tuning_parameters(model, ft_begin_index): 35 | if ft_begin_index == 0: 36 | return model.parameters() 37 | 38 | ft_module_names = 
[] 39 | for i in range(ft_begin_index, 5): 40 | ft_module_names.append('denseblock{}'.format(ft_begin_index)) 41 | ft_module_names.append('transition{}'.format(ft_begin_index)) 42 | ft_module_names.append('norm5') 43 | ft_module_names.append('classifier') 44 | 45 | parameters = [] 46 | for k, v in model.named_parameters(): 47 | for ft_module in ft_module_names: 48 | if ft_module in k: 49 | parameters.append({'params': v}) 50 | break 51 | else: 52 | parameters.append({'params': v, 'lr': 0.0}) 53 | 54 | return parameters 55 | 56 | 57 | class _DenseLayer(nn.Sequential): 58 | def __init__(self, num_input_features, growth_rate, bn_size, drop_rate): 59 | super(_DenseLayer, self).__init__() 60 | self.add_module('norm.1', nn.BatchNorm3d(num_input_features)) 61 | self.add_module('relu.1', nn.ReLU(inplace=True)) 62 | self.add_module('conv.1', nn.Conv3d(num_input_features, bn_size * growth_rate, 63 | kernel_size=1, stride=1, bias=False)) 64 | self.add_module('norm.2', nn.BatchNorm3d(bn_size * growth_rate)) 65 | self.add_module('relu.2', nn.ReLU(inplace=True)) 66 | self.add_module('conv.2', nn.Conv3d(bn_size * growth_rate, growth_rate, 67 | kernel_size=3, stride=1, padding=1, bias=False)) 68 | self.drop_rate = drop_rate 69 | 70 | def forward(self, x): 71 | new_features = super(_DenseLayer, self).forward(x) 72 | if self.drop_rate > 0: 73 | new_features = F.dropout(new_features, p=self.drop_rate, training=self.training) 74 | return torch.cat([x, new_features], 1) 75 | 76 | 77 | class _DenseBlock(nn.Sequential): 78 | def __init__(self, num_layers, num_input_features, bn_size, growth_rate, drop_rate): 79 | super(_DenseBlock, self).__init__() 80 | for i in range(num_layers): 81 | layer = _DenseLayer(num_input_features + i * growth_rate, growth_rate, bn_size, drop_rate) 82 | self.add_module('denselayer%d' % (i + 1), layer) 83 | 84 | 85 | class _Transition(nn.Sequential): 86 | def __init__(self, num_input_features, num_output_features): 87 | super(_Transition, self).__init__() 88 | self.add_module('norm', nn.BatchNorm3d(num_input_features)) 89 | self.add_module('relu', nn.ReLU(inplace=True)) 90 | self.add_module('conv', nn.Conv3d(num_input_features, num_output_features, 91 | kernel_size=1, stride=1, bias=False)) 92 | self.add_module('pool', nn.AvgPool3d(kernel_size=2, stride=2)) 93 | 94 | 95 | class DenseNet(nn.Module): 96 | """Densenet-BC model class 97 | Args: 98 | growth_rate (int) - how many filters to add each layer (k in paper) 99 | block_config (list of 4 ints) - how many layers in each pooling block 100 | num_init_features (int) - the number of filters to learn in the first convolution layer 101 | bn_size (int) - multiplicative factor for number of bottle neck layers 102 | (i.e. 
bn_size * k features in the bottleneck layer) 103 | drop_rate (float) - dropout rate after each dense layer 104 | num_classes (int) - number of classification classes 105 | """ 106 | def __init__(self, sample_size, sample_duration, growth_rate=32, block_config=(6, 12, 24, 16), 107 | num_init_features=64, bn_size=4, drop_rate=0, num_classes=1000, last_fc=True): 108 | 109 | super(DenseNet, self).__init__() 110 | 111 | self.last_fc = last_fc 112 | 113 | self.sample_size = sample_size 114 | self.sample_duration = sample_duration 115 | 116 | # First convolution 117 | self.features = nn.Sequential(OrderedDict([ 118 | ('conv0', nn.Conv3d(3, num_init_features, kernel_size=7, 119 | stride=(1, 2, 2), padding=(3, 3, 3), bias=False)), 120 | ('norm0', nn.BatchNorm3d(num_init_features)), 121 | ('relu0', nn.ReLU(inplace=True)), 122 | ('pool0', nn.MaxPool3d(kernel_size=3, stride=2, padding=1)), 123 | ])) 124 | 125 | # Each denseblock 126 | num_features = num_init_features 127 | for i, num_layers in enumerate(block_config): 128 | block = _DenseBlock(num_layers=num_layers, num_input_features=num_features, 129 | bn_size=bn_size, growth_rate=growth_rate, drop_rate=drop_rate) 130 | self.features.add_module('denseblock%d' % (i + 1), block) 131 | num_features = num_features + num_layers * growth_rate 132 | if i != len(block_config) - 1: 133 | trans = _Transition(num_input_features=num_features, num_output_features=num_features // 2) 134 | self.features.add_module('transition%d' % (i + 1), trans) 135 | num_features = num_features // 2 136 | 137 | # Final batch norm 138 | self.features.add_module('norm5', nn.BatchNorm2d(num_features)) 139 | 140 | # Linear layer 141 | self.classifier = nn.Linear(num_features, num_classes) 142 | 143 | def forward(self, x): 144 | features = self.features(x) 145 | out = F.relu(features, inplace=True) 146 | last_duration = math.ceil(self.sample_duration / 16) 147 | last_size = math.floor(self.sample_size / 32) 148 | out = F.avg_pool3d(out, kernel_size=(last_duration, last_size, last_size)).view(features.size(0), -1) 149 | if self.last_fc: 150 | out = self.classifier(out) 151 | return out 152 | -------------------------------------------------------------------------------- /c3d_feat_extract/models/pre_act_resnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | import math 6 | from functools import partial 7 | 8 | __all__ = ['PreActivationResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152', 'resnet200'] 9 | 10 | 11 | def conv3x3x3(in_planes, out_planes, stride=1): 12 | # 3x3x3 convolution with padding 13 | return nn.Conv3d(in_planes, out_planes, kernel_size=3, 14 | stride=stride, padding=1, bias=False) 15 | 16 | 17 | def downsample_basic_block(x, planes, stride): 18 | out = F.avg_pool3d(x, kernel_size=1, stride=stride) 19 | zero_pads = torch.Tensor(out.size(0), planes - out.size(1), 20 | out.size(2), out.size(3), 21 | out.size(4)).zero_() 22 | if isinstance(out.data, torch.cuda.FloatTensor): 23 | zero_pads = zero_pads.cuda() 24 | 25 | out = Variable(torch.cat([out.data, zero_pads], dim=1)) 26 | 27 | return out 28 | 29 | 30 | class PreActivationBasicBlock(nn.Module): 31 | expansion = 1 32 | 33 | def __init__(self, inplanes, planes, stride=1, downsample=None): 34 | super(PreActivationBasicBlock, self).__init__() 35 | self.bn1 = nn.BatchNorm3d(inplanes) 36 | self.conv1 = conv3x3x3(inplanes, planes, stride) 37 | 
self.bn2 = nn.BatchNorm3d(planes) 38 | self.conv2 = conv3x3x3(planes, planes) 39 | self.relu = nn.ReLU(inplace=True) 40 | self.downsample = downsample 41 | self.stride = stride 42 | 43 | def forward(self, x): 44 | residual = x 45 | 46 | out = self.bn1(x) 47 | out = self.relu(out) 48 | out = self.conv1(out) 49 | 50 | out = self.bn2(out) 51 | out = self.relu(out) 52 | out = self.conv2(out) 53 | 54 | if self.downsample is not None: 55 | residual = self.downsample(x) 56 | 57 | out += residual 58 | 59 | return out 60 | 61 | 62 | class PreActivationBottleneck(nn.Module): 63 | expansion = 4 64 | 65 | def __init__(self, inplanes, planes, stride=1, downsample=None): 66 | super(PreActivationBottleneck, self).__init__() 67 | self.bn1 = nn.BatchNorm3d(inplanes) 68 | self.conv1 = nn.Conv3d(inplanes, planes, kernel_size=1, bias=False) 69 | self.bn2 = nn.BatchNorm3d(planes) 70 | self.conv2 = nn.Conv3d(planes, planes, kernel_size=3, stride=stride, 71 | padding=1, bias=False) 72 | self.bn3 = nn.BatchNorm3d(planes) 73 | self.conv3 = nn.Conv3d(planes, planes * 4, kernel_size=1, bias=False) 74 | self.relu = nn.ReLU(inplace=True) 75 | self.downsample = downsample 76 | self.stride = stride 77 | 78 | def forward(self, x): 79 | residual = x 80 | 81 | out = self.bn1(x) 82 | out = self.relu(out) 83 | out = self.conv1(out) 84 | 85 | out = self.bn2(out) 86 | out = self.relu(out) 87 | out = self.conv2(out) 88 | 89 | out = self.bn3(out) 90 | out = self.relu(out) 91 | out = self.conv3(out) 92 | 93 | if self.downsample is not None: 94 | residual = self.downsample(x) 95 | 96 | out += residual 97 | 98 | return out 99 | 100 | 101 | class PreActivationResNet(nn.Module): 102 | 103 | def __init__(self, block, layers, sample_size, sample_duration, shortcut_type='B', num_classes=400, last_fc=True): 104 | self.last_fc = last_fc 105 | 106 | self.inplanes = 64 107 | super(PreActivationResNet, self).__init__() 108 | self.conv1 = nn.Conv3d(3, 64, kernel_size=7, stride=(1, 2, 2), 109 | padding=(3, 3, 3), bias=False) 110 | self.bn1 = nn.BatchNorm3d(64) 111 | self.relu = nn.ReLU(inplace=True) 112 | self.maxpool = nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1) 113 | self.layer1 = self._make_layer(block, 64, layers[0], shortcut_type) 114 | self.layer2 = self._make_layer(block, 128, layers[1], shortcut_type, stride=2) 115 | self.layer3 = self._make_layer(block, 256, layers[2], shortcut_type, stride=2) 116 | self.layer4 = self._make_layer(block, 512, layers[3], shortcut_type, stride=2) 117 | last_duration = math.ceil(sample_duration / 16) 118 | last_size = math.ceil(sample_size / 32) 119 | self.avgpool = nn.AvgPool3d((last_duration, last_size, last_size), stride=1) 120 | self.fc = nn.Linear(512 * block.expansion, num_classes) 121 | 122 | for m in self.modules(): 123 | if isinstance(m, nn.Conv3d): 124 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 125 | m.weight.data.normal_(0, math.sqrt(2. 
/ n)) 126 | elif isinstance(m, nn.BatchNorm3d): 127 | m.weight.data.fill_(1) 128 | m.bias.data.zero_() 129 | 130 | def _make_layer(self, block, planes, blocks, shortcut_type, stride=1): 131 | downsample = None 132 | if stride != 1 or self.inplanes != planes * block.expansion: 133 | if shortcut_type == 'A': 134 | downsample = partial(downsample_basic_block, 135 | planes=planes * block.expansion, 136 | stride=stride) 137 | else: 138 | downsample = nn.Sequential( 139 | nn.Conv3d(self.inplanes, planes * block.expansion, 140 | kernel_size=1, stride=stride, bias=False), 141 | nn.BatchNorm3d(planes * block.expansion) 142 | ) 143 | 144 | layers = [] 145 | layers.append(block(self.inplanes, planes, stride, downsample)) 146 | self.inplanes = planes * block.expansion 147 | for i in range(1, blocks): 148 | layers.append(block(self.inplanes, planes)) 149 | 150 | return nn.Sequential(*layers) 151 | 152 | def forward(self, x): 153 | x = self.conv1(x) 154 | x = self.bn1(x) 155 | x = self.relu(x) 156 | x = self.maxpool(x) 157 | 158 | x = self.layer1(x) 159 | x = self.layer2(x) 160 | x = self.layer3(x) 161 | x = self.layer4(x) 162 | 163 | x = self.avgpool(x) 164 | 165 | x = x.view(x.size(0), -1) 166 | if self.last_fc: 167 | x = self.fc(x) 168 | 169 | return x 170 | 171 | def get_fine_tuning_parameters(model, ft_begin_index): 172 | if ft_begin_index == 0: 173 | return model.parameters() 174 | 175 | ft_module_names = [] 176 | for i in range(ft_begin_index, 5): 177 | ft_module_names.append('layer{}'.format(ft_begin_index)) 178 | ft_module_names.append('fc') 179 | 180 | parameters = [] 181 | for k, v in model.named_parameters(): 182 | for ft_module in ft_module_names: 183 | if ft_module in k: 184 | parameters.append({'params': v}) 185 | break 186 | else: 187 | parameters.append({'params': v, 'lr': 0.0}) 188 | 189 | return parameters 190 | 191 | def resnet18(**kwargs): 192 | """Constructs a ResNet-18 model. 193 | """ 194 | model = PreActivationResNet(PreActivationBasicBlock, [2, 2, 2, 2], **kwargs) 195 | return model 196 | 197 | def resnet34(**kwargs): 198 | """Constructs a ResNet-34 model. 199 | """ 200 | model = PreActivationResNet(PreActivationBasicBlock, [3, 4, 6, 3], **kwargs) 201 | return model 202 | 203 | 204 | def resnet50(**kwargs): 205 | """Constructs a ResNet-50 model. 206 | """ 207 | model = PreActivationResNet(PreActivationBottleneck, [3, 4, 6, 3], **kwargs) 208 | return model 209 | 210 | def resnet101(**kwargs): 211 | """Constructs a ResNet-101 model. 212 | """ 213 | model = PreActivationResNet(PreActivationBottleneck, [3, 4, 23, 3], **kwargs) 214 | return model 215 | 216 | def resnet152(**kwargs): 217 | """Constructs a ResNet-101 model. 218 | """ 219 | model = PreActivationResNet(PreActivationBottleneck, [3, 8, 36, 3], **kwargs) 220 | return model 221 | 222 | def resnet200(**kwargs): 223 | """Constructs a ResNet-101 model. 
224 | """ 225 | model = PreActivationResNet(PreActivationBottleneck, [3, 24, 36, 3], **kwargs) 226 | return model 227 | -------------------------------------------------------------------------------- /c3d_feat_extract/models/resnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | import math 6 | from functools import partial 7 | 8 | __all__ = ['ResNet', 'resnet10', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152', 'resnet200'] 9 | 10 | 11 | def conv3x3x3(in_planes, out_planes, stride=1): 12 | # 3x3x3 convolution with padding 13 | return nn.Conv3d(in_planes, out_planes, kernel_size=3, 14 | stride=stride, padding=1, bias=False) 15 | 16 | 17 | def downsample_basic_block(x, planes, stride): 18 | out = F.avg_pool3d(x, kernel_size=1, stride=stride) 19 | zero_pads = torch.Tensor(out.size(0), planes - out.size(1), 20 | out.size(2), out.size(3), 21 | out.size(4)).zero_() 22 | if isinstance(out.data, torch.cuda.FloatTensor): 23 | zero_pads = zero_pads.cuda() 24 | 25 | out = Variable(torch.cat([out.data, zero_pads], dim=1)) 26 | 27 | return out 28 | 29 | 30 | class BasicBlock(nn.Module): 31 | expansion = 1 32 | 33 | def __init__(self, inplanes, planes, stride=1, downsample=None): 34 | super(BasicBlock, self).__init__() 35 | self.conv1 = conv3x3x3(inplanes, planes, stride) 36 | self.bn1 = nn.BatchNorm3d(planes) 37 | self.relu = nn.ReLU(inplace=True) 38 | self.conv2 = conv3x3x3(planes, planes) 39 | self.bn2 = nn.BatchNorm3d(planes) 40 | self.downsample = downsample 41 | self.stride = stride 42 | 43 | def forward(self, x): 44 | residual = x 45 | 46 | out = self.conv1(x) 47 | out = self.bn1(out) 48 | out = self.relu(out) 49 | 50 | out = self.conv2(out) 51 | out = self.bn2(out) 52 | 53 | if self.downsample is not None: 54 | residual = self.downsample(x) 55 | 56 | out += residual 57 | out = self.relu(out) 58 | 59 | return out 60 | 61 | 62 | class Bottleneck(nn.Module): 63 | expansion = 4 64 | 65 | def __init__(self, inplanes, planes, stride=1, downsample=None): 66 | super(Bottleneck, self).__init__() 67 | self.conv1 = nn.Conv3d(inplanes, planes, kernel_size=1, bias=False) 68 | self.bn1 = nn.BatchNorm3d(planes) 69 | self.conv2 = nn.Conv3d(planes, planes, kernel_size=3, stride=stride, 70 | padding=1, bias=False) 71 | self.bn2 = nn.BatchNorm3d(planes) 72 | self.conv3 = nn.Conv3d(planes, planes * 4, kernel_size=1, bias=False) 73 | self.bn3 = nn.BatchNorm3d(planes * 4) 74 | self.relu = nn.ReLU(inplace=True) 75 | self.downsample = downsample 76 | self.stride = stride 77 | 78 | def forward(self, x): 79 | residual = x 80 | 81 | out = self.conv1(x) 82 | out = self.bn1(out) 83 | out = self.relu(out) 84 | 85 | out = self.conv2(out) 86 | out = self.bn2(out) 87 | out = self.relu(out) 88 | 89 | out = self.conv3(out) 90 | out = self.bn3(out) 91 | 92 | if self.downsample is not None: 93 | residual = self.downsample(x) 94 | 95 | out += residual 96 | out = self.relu(out) 97 | 98 | return out 99 | 100 | 101 | class ResNet(nn.Module): 102 | 103 | def __init__(self, block, layers, sample_size, sample_duration, shortcut_type='B', num_classes=400, last_fc=True): 104 | self.last_fc = last_fc 105 | 106 | self.inplanes = 64 107 | super(ResNet, self).__init__() 108 | self.conv1 = nn.Conv3d(3, 64, kernel_size=7, stride=(1, 2, 2), 109 | padding=(3, 3, 3), bias=False) 110 | self.bn1 = nn.BatchNorm3d(64) 111 | self.relu = nn.ReLU(inplace=True) 112 | self.maxpool = 
nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1) 113 | self.layer1 = self._make_layer(block, 64, layers[0], shortcut_type) 114 | self.layer2 = self._make_layer(block, 128, layers[1], shortcut_type, stride=2) 115 | self.layer3 = self._make_layer(block, 256, layers[2], shortcut_type, stride=2) 116 | self.layer4 = self._make_layer(block, 512, layers[3], shortcut_type, stride=2) 117 | last_duration = math.ceil(sample_duration / 16) 118 | last_size = math.ceil(sample_size / 32) 119 | self.avgpool = nn.AvgPool3d((last_duration, last_size, last_size), stride=1) 120 | self.fc = nn.Linear(512 * block.expansion, num_classes) 121 | 122 | for m in self.modules(): 123 | if isinstance(m, nn.Conv3d): 124 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 125 | m.weight.data.normal_(0, math.sqrt(2. / n)) 126 | elif isinstance(m, nn.BatchNorm3d): 127 | m.weight.data.fill_(1) 128 | m.bias.data.zero_() 129 | 130 | def _make_layer(self, block, planes, blocks, shortcut_type, stride=1): 131 | downsample = None 132 | if stride != 1 or self.inplanes != planes * block.expansion: 133 | if shortcut_type == 'A': 134 | downsample = partial(downsample_basic_block, 135 | planes=planes * block.expansion, 136 | stride=stride) 137 | else: 138 | downsample = nn.Sequential( 139 | nn.Conv3d(self.inplanes, planes * block.expansion, 140 | kernel_size=1, stride=stride, bias=False), 141 | nn.BatchNorm3d(planes * block.expansion) 142 | ) 143 | 144 | layers = [] 145 | layers.append(block(self.inplanes, planes, stride, downsample)) 146 | self.inplanes = planes * block.expansion 147 | for i in range(1, blocks): 148 | layers.append(block(self.inplanes, planes)) 149 | 150 | return nn.Sequential(*layers) 151 | 152 | def forward(self, x): 153 | x = self.conv1(x) 154 | x = self.bn1(x) 155 | x = self.relu(x) 156 | x = self.maxpool(x) 157 | 158 | x = self.layer1(x) 159 | x = self.layer2(x) 160 | x = self.layer3(x) 161 | x = self.layer4(x) 162 | 163 | x = self.avgpool(x) 164 | 165 | x = x.view(x.size(0), -1) 166 | if self.last_fc: 167 | x = self.fc(x) 168 | 169 | return x 170 | 171 | 172 | def get_fine_tuning_parameters(model, ft_begin_index): 173 | if ft_begin_index == 0: 174 | return model.parameters() 175 | 176 | ft_module_names = [] 177 | for i in range(ft_begin_index, 5): 178 | ft_module_names.append('layer{}'.format(ft_begin_index)) 179 | ft_module_names.append('fc') 180 | 181 | parameters = [] 182 | for k, v in model.named_parameters(): 183 | for ft_module in ft_module_names: 184 | if ft_module in k: 185 | parameters.append({'params': v}) 186 | break 187 | else: 188 | parameters.append({'params': v, 'lr': 0.0}) 189 | 190 | return parameters 191 | 192 | 193 | def resnet10(**kwargs): 194 | """Constructs a ResNet-18 model. 195 | """ 196 | model = ResNet(BasicBlock, [1, 1, 1, 1], **kwargs) 197 | return model 198 | 199 | def resnet18(**kwargs): 200 | """Constructs a ResNet-18 model. 201 | """ 202 | model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs) 203 | return model 204 | 205 | def resnet34(**kwargs): 206 | """Constructs a ResNet-34 model. 207 | """ 208 | model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs) 209 | return model 210 | 211 | def resnet50(**kwargs): 212 | """Constructs a ResNet-50 model. 213 | """ 214 | model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs) 215 | return model 216 | 217 | def resnet101(**kwargs): 218 | """Constructs a ResNet-101 model. 219 | """ 220 | model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs) 221 | return model 222 | 223 | def resnet152(**kwargs): 224 | """Constructs a ResNet-101 model. 
225 | """ 226 | model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs) 227 | return model 228 | 229 | def resnet200(**kwargs): 230 | """Constructs a ResNet-101 model. 231 | """ 232 | model = ResNet(Bottleneck, [3, 24, 36, 3], **kwargs) 233 | return model 234 | -------------------------------------------------------------------------------- /c3d_feat_extract/models/resnext.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch.autograd import Variable 6 | import math 7 | from functools import partial 8 | 9 | __all__ = ['ResNeXt', 'resnet50', 'resnet101'] 10 | 11 | 12 | def conv3x3x3(in_planes, out_planes, stride=1): 13 | # 3x3x3 convolution with padding 14 | return nn.Conv3d(in_planes, out_planes, kernel_size=3, 15 | stride=stride, padding=1, bias=False) 16 | 17 | 18 | def downsample_basic_block(x, planes, stride): 19 | out = F.avg_pool3d(x, kernel_size=1, stride=stride) 20 | zero_pads = torch.Tensor(out.size(0), planes - out.size(1), 21 | out.size(2), out.size(3), 22 | out.size(4)).zero_() 23 | if isinstance(out.data, torch.cuda.FloatTensor): 24 | zero_pads = zero_pads.cuda() 25 | 26 | out = Variable(torch.cat([out.data, zero_pads], dim=1)) 27 | 28 | return out 29 | 30 | 31 | class ResNeXtBottleneck(nn.Module): 32 | expansion = 2 33 | 34 | def __init__(self, inplanes, planes, cardinality, stride=1, downsample=None): 35 | super(ResNeXtBottleneck, self).__init__() 36 | mid_planes = cardinality * int(planes / 32) 37 | self.conv1 = nn.Conv3d(inplanes, mid_planes, kernel_size=1, bias=False) 38 | self.bn1 = nn.BatchNorm3d(mid_planes) 39 | self.conv2 = nn.Conv3d(mid_planes, mid_planes, kernel_size=3, stride=stride, 40 | padding=1, groups=cardinality, bias=False) 41 | self.bn2 = nn.BatchNorm3d(mid_planes) 42 | self.conv3 = nn.Conv3d(mid_planes, planes * self.expansion, kernel_size=1, bias=False) 43 | self.bn3 = nn.BatchNorm3d(planes * self.expansion) 44 | self.relu = nn.ReLU(inplace=True) 45 | self.downsample = downsample 46 | self.stride = stride 47 | 48 | def forward(self, x): 49 | residual = x 50 | 51 | out = self.conv1(x) 52 | out = self.bn1(out) 53 | out = self.relu(out) 54 | 55 | out = self.conv2(out) 56 | out = self.bn2(out) 57 | out = self.relu(out) 58 | 59 | out = self.conv3(out) 60 | out = self.bn3(out) 61 | 62 | if self.downsample is not None: 63 | residual = self.downsample(x) 64 | 65 | out += residual 66 | out = self.relu(out) 67 | 68 | return out 69 | 70 | 71 | class ResNeXt(nn.Module): 72 | 73 | def __init__(self, block, layers, sample_size, sample_duration, shortcut_type='B', cardinality=32, num_classes=400, last_fc=True): 74 | self.last_fc = last_fc 75 | 76 | self.inplanes = 64 77 | super(ResNeXt, self).__init__() 78 | self.conv1 = nn.Conv3d(3, 64, kernel_size=7, stride=(1, 2, 2), 79 | padding=(3, 3, 3), bias=False) 80 | self.bn1 = nn.BatchNorm3d(64) 81 | self.relu = nn.ReLU(inplace=True) 82 | self.maxpool = nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1) 83 | self.layer1 = self._make_layer(block, 128, layers[0], shortcut_type, cardinality) 84 | self.layer2 = self._make_layer(block, 256, layers[1], shortcut_type, cardinality, stride=2) 85 | self.layer3 = self._make_layer(block, 512, layers[2], shortcut_type, cardinality, stride=2) 86 | self.layer4 = self._make_layer(block, 1024, layers[3], shortcut_type, cardinality, stride=2) 87 | last_duration = int(math.ceil(sample_duration / 16)) 88 | last_size = 
int(math.ceil(sample_size / 32)) 89 | self.avgpool = nn.AvgPool3d((last_duration, last_size, last_size), stride=1) 90 | self.fc = nn.Linear(cardinality * 32 * block.expansion, num_classes) 91 | 92 | for m in self.modules(): 93 | if isinstance(m, nn.Conv3d): 94 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 95 | m.weight.data.normal_(0, math.sqrt(2. / n)) 96 | elif isinstance(m, nn.BatchNorm3d): 97 | m.weight.data.fill_(1) 98 | m.bias.data.zero_() 99 | 100 | def _make_layer(self, block, planes, blocks, shortcut_type, cardinality, stride=1): 101 | downsample = None 102 | if stride != 1 or self.inplanes != planes * block.expansion: 103 | if shortcut_type == 'A': 104 | downsample = partial(downsample_basic_block, 105 | planes=planes * block.expansion, 106 | stride=stride) 107 | else: 108 | downsample = nn.Sequential( 109 | nn.Conv3d(self.inplanes, planes * block.expansion, 110 | kernel_size=1, stride=stride, bias=False), 111 | nn.BatchNorm3d(planes * block.expansion) 112 | ) 113 | 114 | layers = [] 115 | layers.append(block(self.inplanes, planes, cardinality, stride, downsample)) 116 | self.inplanes = planes * block.expansion 117 | for i in range(1, blocks): 118 | layers.append(block(self.inplanes, planes, cardinality)) 119 | 120 | return nn.Sequential(*layers) 121 | 122 | def forward(self, x): 123 | x = self.conv1(x) 124 | x = self.bn1(x) 125 | x = self.relu(x) 126 | x = self.maxpool(x) 127 | 128 | x = self.layer1(x) 129 | x = self.layer2(x) 130 | x = self.layer3(x) 131 | x = self.layer4(x) 132 | 133 | x = self.avgpool(x) 134 | 135 | x = x.view(x.size(0), -1) 136 | if self.last_fc: 137 | x = self.fc(x) 138 | 139 | return x 140 | 141 | def get_fine_tuning_parameters(model, ft_begin_index): 142 | if ft_begin_index == 0: 143 | return model.parameters() 144 | 145 | ft_module_names = [] 146 | for i in range(ft_begin_index, 5): 147 | ft_module_names.append('layer{}'.format(ft_begin_index)) 148 | ft_module_names.append('fc') 149 | 150 | parameters = [] 151 | for k, v in model.named_parameters(): 152 | for ft_module in ft_module_names: 153 | if ft_module in k: 154 | parameters.append({'params': v}) 155 | break 156 | else: 157 | parameters.append({'params': v, 'lr': 0.0}) 158 | 159 | return parameters 160 | 161 | def resnet50(**kwargs): 162 | """Constructs a ResNet-50 model. 163 | """ 164 | model = ResNeXt(ResNeXtBottleneck, [3, 4, 6, 3], **kwargs) 165 | return model 166 | 167 | def resnet101(**kwargs): 168 | """Constructs a ResNet-101 model. 169 | """ 170 | model = ResNeXt(ResNeXtBottleneck, [3, 4, 23, 3], **kwargs) 171 | return model 172 | 173 | def resnet152(**kwargs): 174 | """Constructs a ResNet-101 model. 
175 | """ 176 | model = ResNeXt(ResNeXtBottleneck, [3, 8, 36, 3], **kwargs) 177 | return model 178 | -------------------------------------------------------------------------------- /c3d_feat_extract/models/wide_resnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | import math 6 | from functools import partial 7 | 8 | __all__ = ['WideResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101'] 9 | 10 | 11 | def conv3x3x3(in_planes, out_planes, stride=1): 12 | # 3x3x3 convolution with padding 13 | return nn.Conv3d(in_planes, out_planes, kernel_size=3, 14 | stride=stride, padding=1, bias=False) 15 | 16 | 17 | def downsample_basic_block(x, planes, stride): 18 | out = F.avg_pool3d(x, kernel_size=1, stride=stride) 19 | zero_pads = torch.Tensor(out.size(0), planes - out.size(1), 20 | out.size(2), out.size(3), 21 | out.size(4)).zero_() 22 | if isinstance(out.data, torch.cuda.FloatTensor): 23 | zero_pads = zero_pads.cuda() 24 | 25 | out = Variable(torch.cat([out.data, zero_pads], dim=1)) 26 | 27 | return out 28 | 29 | 30 | class WideBottleneck(nn.Module): 31 | expansion = 2 32 | 33 | def __init__(self, inplanes, planes, stride=1, downsample=None): 34 | super(WideBottleneck, self).__init__() 35 | self.conv1 = nn.Conv3d(inplanes, planes, kernel_size=1, bias=False) 36 | self.bn1 = nn.BatchNorm3d(planes) 37 | self.conv2 = nn.Conv3d(planes, planes, kernel_size=3, stride=stride, 38 | padding=1, bias=False) 39 | self.bn2 = nn.BatchNorm3d(planes) 40 | self.conv3 = nn.Conv3d(planes, planes * self.expansion, kernel_size=1, bias=False) 41 | self.bn3 = nn.BatchNorm3d(planes * self.expansion) 42 | self.relu = nn.ReLU(inplace=True) 43 | self.downsample = downsample 44 | self.stride = stride 45 | 46 | def forward(self, x): 47 | residual = x 48 | 49 | out = self.conv1(x) 50 | out = self.bn1(out) 51 | out = self.relu(out) 52 | 53 | out = self.conv2(out) 54 | out = self.bn2(out) 55 | out = self.relu(out) 56 | 57 | out = self.conv3(out) 58 | out = self.bn3(out) 59 | 60 | if self.downsample is not None: 61 | residual = self.downsample(x) 62 | 63 | out += residual 64 | out = self.relu(out) 65 | 66 | return out 67 | 68 | 69 | class WideResNet(nn.Module): 70 | 71 | def __init__(self, block, layers, sample_size, sample_duration, k=1, shortcut_type='B', num_classes=400, last_fc=True): 72 | self.last_fc = last_fc 73 | 74 | self.inplanes = 64 75 | super(WideResNet, self).__init__() 76 | self.conv1 = nn.Conv3d(3, 64, kernel_size=7, stride=(1, 2, 2), 77 | padding=(3, 3, 3), bias=False) 78 | self.bn1 = nn.BatchNorm3d(64) 79 | self.relu = nn.ReLU(inplace=True) 80 | self.maxpool = nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1) 81 | self.layer1 = self._make_layer(block, 64 * k, layers[0], shortcut_type) 82 | self.layer2 = self._make_layer(block, 128 * k, layers[1], shortcut_type, stride=2) 83 | self.layer3 = self._make_layer(block, 256 * k, layers[2], shortcut_type, stride=2) 84 | self.layer4 = self._make_layer(block, 512 * k, layers[3], shortcut_type, stride=2) 85 | last_duration = math.ceil(sample_duration / 16) 86 | last_size = math.ceil(sample_size / 32) 87 | self.avgpool = nn.AvgPool3d((last_duration, last_size, last_size), stride=1) 88 | self.fc = nn.Linear(512 * k * block.expansion, num_classes) 89 | 90 | for m in self.modules(): 91 | if isinstance(m, nn.Conv3d): 92 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 93 | m.weight.data.normal_(0, 
math.sqrt(2. / n)) 94 | elif isinstance(m, nn.BatchNorm3d): 95 | m.weight.data.fill_(1) 96 | m.bias.data.zero_() 97 | 98 | def _make_layer(self, block, planes, blocks, shortcut_type, stride=1): 99 | downsample = None 100 | if stride != 1 or self.inplanes != planes * block.expansion: 101 | if shortcut_type == 'A': 102 | downsample = partial(downsample_basic_block, 103 | planes=planes * block.expansion, 104 | stride=stride) 105 | else: 106 | downsample = nn.Sequential( 107 | nn.Conv3d(self.inplanes, planes * block.expansion, 108 | kernel_size=1, stride=stride, bias=False), 109 | nn.BatchNorm3d(planes * block.expansion) 110 | ) 111 | 112 | layers = [] 113 | layers.append(block(self.inplanes, planes, stride, downsample)) 114 | self.inplanes = planes * block.expansion 115 | for i in range(1, blocks): 116 | layers.append(block(self.inplanes, planes)) 117 | 118 | return nn.Sequential(*layers) 119 | 120 | def forward(self, x): 121 | x = self.conv1(x) 122 | x = self.bn1(x) 123 | x = self.relu(x) 124 | x = self.maxpool(x) 125 | 126 | x = self.layer1(x) 127 | x = self.layer2(x) 128 | x = self.layer3(x) 129 | x = self.layer4(x) 130 | 131 | x = self.avgpool(x) 132 | 133 | x = x.view(x.size(0), -1) 134 | if self.last_fc: 135 | x = self.fc(x) 136 | 137 | return x 138 | 139 | def get_fine_tuning_parameters(model, ft_begin_index): 140 | if ft_begin_index == 0: 141 | return model.parameters() 142 | 143 | ft_module_names = [] 144 | for i in range(ft_begin_index, 5): 145 | ft_module_names.append('layer{}'.format(ft_begin_index)) 146 | ft_module_names.append('fc') 147 | 148 | parameters = [] 149 | for k, v in model.named_parameters(): 150 | for ft_module in ft_module_names: 151 | if ft_module in k: 152 | parameters.append({'params': v}) 153 | break 154 | else: 155 | parameters.append({'params': v, 'lr': 0.0}) 156 | 157 | return parameters 158 | 159 | def resnet50(**kwargs): 160 | """Constructs a ResNet-50 model. 161 | """ 162 | model = WideResNet(WideBottleneck, [3, 4, 6, 3], **kwargs) 163 | return model 164 | -------------------------------------------------------------------------------- /c3d_feat_extract/opts.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | def parse_opts(): 4 | parser = argparse.ArgumentParser() 5 | parser.add_argument('--input', default='input', type=str, help='Input file path') 6 | parser.add_argument('--video_root', default='', type=str, help='Root path of input videos') 7 | parser.add_argument('--model', default='', type=str, help='Model file path') 8 | parser.add_argument('--output', default='output.json', type=str, help='Output file path') 9 | parser.add_argument('--feat_dir', default='./feat', type=str, help='c3d feat file path') 10 | parser.add_argument('--mode', default='score', type=str, help='Mode (score | feature). score outputs class scores. 
feature outputs features (after global average pooling).') 11 | parser.add_argument('--batch_size', default=32, type=int, help='Batch Size') 12 | parser.add_argument('--n_threads', default=4, type=int, help='Number of threads for multi-thread loading') 13 | parser.add_argument('--model_name', default='resnet', type=str, help='Currently only support resnet') 14 | parser.add_argument('--model_depth', default=34, type=int, help='Depth of resnet (10 | 18 | 34 | 50 | 101)') 15 | parser.add_argument('--resnet_shortcut', default='A', type=str, help='Shortcut type of resnet (A | B)') 16 | parser.add_argument('--wide_resnet_k', default=2, type=int, help='Wide resnet k') 17 | parser.add_argument('--resnext_cardinality', default=32, type=int, help='ResNeXt cardinality') 18 | parser.add_argument('--no_cuda', action='store_true', help='If true, cuda is not used.') 19 | parser.add_argument('--gpu', type=str, default='0', help='gpu device number') 20 | parser.add_argument('--n_classes', type=int, default=400, help='numbers of video class') 21 | parser.add_argument('--sample_duration', type=int, default=16, help='sample_duration') 22 | parser.set_defaults(verbose=False) 23 | parser.add_argument('--verbose', action='store_true', help='') 24 | parser.set_defaults(verbose=False) 25 | 26 | args = parser.parse_args() 27 | 28 | return args 29 | -------------------------------------------------------------------------------- /c3d_feat_extract/spatial_transforms.py: -------------------------------------------------------------------------------- 1 | import random 2 | import math 3 | import numbers 4 | import collections 5 | import numpy as np 6 | import torch 7 | from PIL import Image, ImageOps 8 | try: 9 | import accimage 10 | except ImportError: 11 | accimage = None 12 | 13 | 14 | class Compose(object): 15 | """Composes several transforms together. 16 | Args: 17 | transforms (list of ``Transform`` objects): list of transforms to compose. 18 | Example: 19 | >>> transforms.Compose([ 20 | >>> transforms.CenterCrop(10), 21 | >>> transforms.ToTensor(), 22 | >>> ]) 23 | """ 24 | 25 | def __init__(self, transforms): 26 | self.transforms = transforms 27 | 28 | def __call__(self, img): 29 | for t in self.transforms: 30 | img = t(img) 31 | return img 32 | 33 | 34 | class ToTensor(object): 35 | """Convert a ``PIL.Image`` or ``numpy.ndarray`` to tensor. 36 | Converts a PIL.Image or numpy.ndarray (H x W x C) in the range 37 | [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0]. 38 | """ 39 | 40 | def __call__(self, pic): 41 | """ 42 | Args: 43 | pic (PIL.Image or numpy.ndarray): Image to be converted to tensor. 44 | Returns: 45 | Tensor: Converted image. 
46 | """ 47 | if isinstance(pic, np.ndarray): 48 | # handle numpy array 49 | img = torch.from_numpy(pic.transpose((2, 0, 1))) 50 | # backward compatibility 51 | return img.float() 52 | 53 | if accimage is not None and isinstance(pic, accimage.Image): 54 | nppic = np.zeros([pic.channels, pic.height, pic.width], dtype=np.float32) 55 | pic.copyto(nppic) 56 | return torch.from_numpy(nppic) 57 | 58 | # handle PIL Image 59 | if pic.mode == 'I': 60 | img = torch.from_numpy(np.array(pic, np.int32, copy=False)) 61 | elif pic.mode == 'I;16': 62 | img = torch.from_numpy(np.array(pic, np.int16, copy=False)) 63 | else: 64 | img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes())) 65 | # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK 66 | if pic.mode == 'YCbCr': 67 | nchannel = 3 68 | elif pic.mode == 'I;16': 69 | nchannel = 1 70 | else: 71 | nchannel = len(pic.mode) 72 | img = img.view(pic.size[1], pic.size[0], nchannel) 73 | # put it from HWC to CHW format 74 | # yikes, this transpose takes 80% of the loading time/CPU 75 | img = img.transpose(0, 1).transpose(0, 2).contiguous() 76 | if isinstance(img, torch.ByteTensor): 77 | return img.float() 78 | else: 79 | return img 80 | 81 | 82 | class Normalize(object): 83 | """Normalize an tensor image with mean and standard deviation. 84 | Given mean: (R, G, B) and std: (R, G, B), 85 | will normalize each channel of the torch.*Tensor, i.e. 86 | channel = (channel - mean) / std 87 | Args: 88 | mean (sequence): Sequence of means for R, G, B channels respecitvely. 89 | std (sequence): Sequence of standard deviations for R, G, B channels 90 | respecitvely. 91 | """ 92 | 93 | def __init__(self, mean, std): 94 | self.mean = mean 95 | self.std = std 96 | 97 | def __call__(self, tensor): 98 | """ 99 | Args: 100 | tensor (Tensor): Tensor image of size (C, H, W) to be normalized. 101 | Returns: 102 | Tensor: Normalized image. 103 | """ 104 | # TODO: make efficient 105 | for t, m, s in zip(tensor, self.mean, self.std): 106 | t.sub_(m).div_(s) 107 | return tensor 108 | 109 | 110 | class Scale(object): 111 | """Rescale the input PIL.Image to the given size. 112 | Args: 113 | size (sequence or int): Desired output size. If size is a sequence like 114 | (w, h), output size will be matched to this. If size is an int, 115 | smaller edge of the image will be matched to this number. 116 | i.e, if height > width, then image will be rescaled to 117 | (size * height / width, size) 118 | interpolation (int, optional): Desired interpolation. Default is 119 | ``PIL.Image.BILINEAR`` 120 | """ 121 | 122 | def __init__(self, size, interpolation=Image.BILINEAR): 123 | assert isinstance(size, int) or (isinstance(size, collections.Iterable) and len(size) == 2) 124 | self.size = size 125 | self.interpolation = interpolation 126 | 127 | def __call__(self, img): 128 | """ 129 | Args: 130 | img (PIL.Image): Image to be scaled. 131 | Returns: 132 | PIL.Image: Rescaled image. 133 | """ 134 | if isinstance(self.size, int): 135 | w, h = img.size 136 | if (w <= h and w == self.size) or (h <= w and h == self.size): 137 | return img 138 | if w < h: 139 | ow = self.size 140 | oh = int(self.size * h / w) 141 | return img.resize((ow, oh), self.interpolation) 142 | else: 143 | oh = self.size 144 | ow = int(self.size * w / h) 145 | return img.resize((ow, oh), self.interpolation) 146 | else: 147 | return img.resize(self.size, self.interpolation) 148 | 149 | 150 | class CenterCrop(object): 151 | """Crops the given PIL.Image at the center. 
152 | Args: 153 | size (sequence or int): Desired output size of the crop. If size is an 154 | int instead of sequence like (h, w), a square crop (size, size) is 155 | made. 156 | """ 157 | 158 | def __init__(self, size): 159 | if isinstance(size, numbers.Number): 160 | self.size = (int(size), int(size)) 161 | else: 162 | self.size = size 163 | 164 | def __call__(self, img): 165 | """ 166 | Args: 167 | img (PIL.Image): Image to be cropped. 168 | Returns: 169 | PIL.Image: Cropped image. 170 | """ 171 | w, h = img.size 172 | th, tw = self.size 173 | x1 = int(round((w - tw) / 2.)) 174 | y1 = int(round((h - th) / 2.)) 175 | return img.crop((x1, y1, x1 + tw, y1 + th)) 176 | -------------------------------------------------------------------------------- /c3d_feat_extract/temporal_transforms.py: -------------------------------------------------------------------------------- 1 | import random 2 | import math 3 | 4 | 5 | class LoopPadding(object): 6 | def __init__(self, size): 7 | self.size = size 8 | 9 | def __call__(self, frame_indices): 10 | out = frame_indices 11 | 12 | for index in out: 13 | if len(out) >= self.size: 14 | break 15 | out.append(index) 16 | 17 | return out 18 | 19 | 20 | class TemporalCenterCrop(object): 21 | """Temporally crop the given frame indices at a center. 22 | 23 | If the number of frames is less than the size, 24 | loop the indices as many times as necessary to satisfy the size. 25 | 26 | Args: 27 | size (int): Desired output size of the crop. 28 | """ 29 | 30 | def __init__(self, size): 31 | self.size = size 32 | 33 | def __call__(self, frame_indices): 34 | """ 35 | Args: 36 | frame_indices (list): frame indices to be cropped. 37 | Returns: 38 | list: Cropped frame indices. 39 | """ 40 | 41 | center_index = len(frame_indices) // 2 42 | begin_index = max(0, center_index - (self.size // 2)) 43 | end_index = min(begin_index + self.size, len(frame_indices)) 44 | 45 | out = frame_indices[begin_index:end_index] 46 | 47 | for index in out: 48 | if len(out) >= self.size: 49 | break 50 | out.append(index) 51 | 52 | return out 53 | -------------------------------------------------------------------------------- /c3d_feat_extract/test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Variable 3 | import time 4 | import os 5 | import sys 6 | import json 7 | 8 | from utils import AverageMeter 9 | 10 | 11 | def calculate_video_results(output_buffer, video_id, test_results, class_names): 12 | video_outputs = torch.stack(output_buffer) 13 | average_scores = torch.mean(video_outputs, dim=0) 14 | sorted_scores, locs = torch.topk(average_scores, k=10) 15 | 16 | video_results = [] 17 | for i in range(sorted_scores.size(0)): 18 | video_results.append({'label': class_names[locs[i]], 'score': sorted_scores[i]}) 19 | 20 | test_results['results'][video_id] = video_results 21 | 22 | 23 | def test(data_loader, model, opt, class_names): 24 | print('test') 25 | 26 | model.eval() 27 | 28 | batch_time = AverageMeter() 29 | data_time = AverageMeter() 30 | 31 | end_time = time.time() 32 | output_buffer = [] 33 | previous_video_id = '' 34 | test_results = {'results': {}} 35 | for i, (inputs, targets) in enumerate(data_loader): 36 | data_time.update(time.time() - end_time) 37 | 38 | inputs = Variable(inputs, volatile=True) 39 | outputs = model(inputs) 40 | 41 | for j in range(outputs.size(0)): 42 | if not (i == 0 and j == 0) and targets[j] != previous_video_id: 43 | calculate_video_results(output_buffer, 
previous_video_id, 44 | test_results, class_names) 45 | output_buffer = [] 46 | output_buffer.append(outputs[j].data.cpu()) 47 | previous_video_id = targets[j] 48 | 49 | if (i % 100) == 0: 50 | with open(os.path.join(opt.result_path, 51 | '{}.json'.format(opt.test_subset)), 52 | 'w') as f: 53 | json.dump(test_results, f) 54 | 55 | batch_time.update(time.time() - end_time) 56 | end_time = time.time() 57 | 58 | print('[{}/{}]\t' 59 | 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 60 | 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'.format( 61 | i + 1, len(data_loader), batch_time=batch_time, data_time=data_time)) 62 | with open(os.path.join(opt.result_path, 63 | '{}.json'.format(opt.test_subset)), 64 | 'w') as f: 65 | json.dump(test_results, f) 66 | -------------------------------------------------------------------------------- /c3d_feat_extract/train.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Variable 3 | import time 4 | import os 5 | import sys 6 | 7 | from utils import AverageMeter, calculate_accuracy 8 | 9 | 10 | def train_epoch(epoch, data_loader, model, criterion, optimizer, opt, 11 | epoch_logger, batch_logger): 12 | print('train at epoch {}'.format(epoch)) 13 | 14 | model.train() 15 | 16 | batch_time = AverageMeter() 17 | data_time = AverageMeter() 18 | losses = AverageMeter() 19 | accuracies = AverageMeter() 20 | 21 | end_time = time.time() 22 | for i, (inputs, targets) in enumerate(data_loader): 23 | data_time.update(time.time() - end_time) 24 | 25 | if not opt.no_cuda: 26 | targets = targets.cuda(async=True) 27 | inputs = Variable(inputs) 28 | targets = Variable(targets) 29 | outputs = model(inputs) 30 | loss = criterion(outputs, targets) 31 | acc = calculate_accuracy(outputs, targets) 32 | 33 | losses.update(loss.data[0], inputs.size(0)) 34 | accuracies.update(acc, inputs.size(0)) 35 | 36 | optimizer.zero_grad() 37 | loss.backward() 38 | optimizer.step() 39 | 40 | batch_time.update(time.time() - end_time) 41 | end_time = time.time() 42 | 43 | batch_logger.log({ 44 | 'epoch': epoch, 45 | 'batch': i + 1, 46 | 'iter': (epoch - 1) * len(data_loader) + (i + 1), 47 | 'loss': losses.val, 48 | 'acc': accuracies.val, 49 | 'lr': optimizer.param_groups[0]['lr'] 50 | }) 51 | 52 | print('Epoch: [{0}][{1}/{2}]\t' 53 | 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 54 | 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' 55 | 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 56 | 'Acc {acc.val:.3f} ({acc.avg:.3f})'.format( 57 | epoch, i + 1, len(data_loader), batch_time=batch_time, 58 | data_time=data_time, loss=losses, acc=accuracies)) 59 | 60 | epoch_logger.log({ 61 | 'epoch': epoch, 62 | 'loss': losses.avg, 63 | 'acc': accuracies.avg, 64 | 'lr': optimizer.param_groups[0]['lr'] 65 | }) 66 | 67 | if epoch % opt.checkpoint == 0: 68 | save_file_path = os.path.join(opt.result_path, 'save_{}.pth'.format(epoch)) 69 | states = { 70 | 'epoch': epoch + 1, 71 | 'arch': opt.arch, 72 | 'state_dict': model.state_dict(), 73 | 'optimizer' : optimizer.state_dict(), 74 | } 75 | torch.save(states, save_file_path) 76 | -------------------------------------------------------------------------------- /c3d_feat_extract/validation.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Variable 3 | import time 4 | import sys 5 | 6 | from utils import AverageMeter, calculate_accuracy 7 | 8 | 9 | def val_epoch(epoch, data_loader, model, criterion, 
opt, logger): 10 | print('validation at epoch {}'.format(epoch)) 11 | 12 | model.eval() 13 | 14 | batch_time = AverageMeter() 15 | data_time = AverageMeter() 16 | losses = AverageMeter() 17 | accuracies = AverageMeter() 18 | 19 | end_time = time.time() 20 | for i, (inputs, targets) in enumerate(data_loader): 21 | data_time.update(time.time() - end_time) 22 | 23 | if not opt.no_cuda: 24 | targets = targets.cuda(async=True) 25 | inputs = Variable(inputs, volatile=True) 26 | targets = Variable(targets, volatile=True) 27 | outputs = model(inputs) 28 | loss = criterion(outputs, targets) 29 | acc = calculate_accuracy(outputs, targets) 30 | 31 | losses.update(loss.data[0], inputs.size(0)) 32 | accuracies.update(acc, inputs.size(0)) 33 | 34 | batch_time.update(time.time() - end_time) 35 | end_time = time.time() 36 | 37 | print('Epoch: [{0}][{1}/{2}]\t' 38 | 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 39 | 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' 40 | 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 41 | 'Acc {acc.val:.3f} ({acc.avg:.3f})'.format( 42 | epoch, i + 1, len(data_loader), batch_time=batch_time, 43 | data_time=data_time, loss=losses, acc=accuracies)) 44 | 45 | logger.log({ 46 | 'epoch': epoch, 47 | 'loss': losses.avg, 48 | 'acc': accuracies.avg 49 | }) 50 | 51 | return losses.avg 52 | -------------------------------------------------------------------------------- /caffe_feat_extract.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import subprocess 3 | import glob 4 | import os 5 | import numpy as np 6 | import os 7 | import argparse 8 | import sys 9 | from tqdm import tqdm 10 | from PIL import Image 11 | # caffe_root = 'your caffe root' 12 | # sys.path.insert(0, caffe_root + '/python') 13 | import caffe 14 | 15 | def extract_frames(video, dst): 16 | with open(os.devnull, "w") as ffmpeg_log: 17 | if os.path.exists(dst): 18 | print(" cleanup: " + dst + "/") 19 | shutil.rmtree(dst) 20 | os.makedirs(dst) 21 | video_to_frames_command = ["ffmpeg", 22 | # (optional) overwrite output file if it exists 23 | '-y', 24 | '-i', video, # input file 25 | '-vf', "scale=400:300", # input file 26 | '-qscale:v', "2", # quality for JPEG 27 | '{0}/%06d.jpg'.format(dst)] 28 | subprocess.call(video_to_frames_command, 29 | stdout=ffmpeg_log, stderr=ffmpeg_log) 30 | 31 | 32 | def extract_feats(params, net): 33 | dir_fc = params['output_dir'] 34 | if not os.path.isdir(dir_fc): 35 | os.mkdir(dir_fc) 36 | print("save video feats to %s" % (dir_fc)) 37 | video_list = glob.glob(os.path.join(params['video_path'], '*.mp4')) 38 | mean_value = np.array([104.00698793, 116.66876762, 122.67891434]) 39 | # np.array((102.144, 102.144, 108.64)) 40 | for video in tqdm(video_list): 41 | video_id = video.split("/")[-1].split(".")[0] 42 | dst = video_id 43 | extract_frames(video, dst) 44 | image_list = sorted(glob.glob(os.path.join(dst, '*.jpg'))) 45 | samples = np.round(np.linspace( 46 | 0, len(image_list) - 1, params['n_frame_steps'])) 47 | image_list = [image_list[int(sample)] for sample in samples] 48 | ims = [] 49 | img_feats = [] 50 | for index, iImg in enumerate(range(len(image_list))): 51 | im = Image.open(image_list[iImg]) 52 | im = im.resize((224, 224), Image.BILINEAR) 53 | im = np.array(im, dtype=np.float32) 54 | im = im[:, :, ::-1] # RGB->BGR 55 | im -= mean_value # BGR 56 | im = im.transpose((2, 0, 1)) # im:(c,h,w) 57 | im = im[np.newaxis, ...] 
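            # Added notes on the preprocessing above: PIL loads frames as RGB,
            # so the channel flip yields the BGR order Caffe models expect, the
            # BGR mean is then subtracted, and the array is moved from
            # (H, W, C) to (1, C, H, W) so frames can be concatenated into a
            # batch. Below, frames are accumulated until batch_size is reached,
            # pushed through the network in one forward pass, and the squeezed
            # 'pool5' activations are kept as the per-frame features.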
58 | ims.append(im) 59 | if (index+1) % params['batch_size'] == 0: 60 | ims = np.concatenate(ims, axis=0) 61 | net.blobs['data'].reshape(*ims.shape) 62 | net.blobs['data'].data[...] = ims 63 | output = net.forward() 64 | img_feats.append(net.blobs['pool5'].data.squeeze()) 65 | ims = [] 66 | img_feats = np.concatenate(img_feats, axis=0) 67 | # Save the inception features 68 | outfile = os.path.join(dir_fc, video_id + '.npy') 69 | np.save(outfile, img_feats) 70 | # cleanup 71 | shutil.rmtree(dst) 72 | 73 | 74 | if __name__ == '__main__': 75 | parser = argparse.ArgumentParser() 76 | parser.add_argument("--gpu", dest='gpu', type=int, default=0, 77 | help='Set CUDA_VISIBLE_DEVICES environment variable, optional') 78 | parser.add_argument('--batch_size', type=int, default=20, help='minibatch size') 79 | parser.add_argument("--output_dir", dest='output_dir', type=str, 80 | default='data/feats/resnet152', help='directory to store features') 81 | parser.add_argument("--n_frame_steps", dest='n_frame_steps', type=int, default=80, 82 | help='how many frames to sampler per video') 83 | parser.add_argument("--video_path", dest='video_path', type=str, 84 | default='data/videos', help='path to video dataset') 85 | parser.add_argument("--model_weight", dest="model_weight", type=str, 86 | default='pretrained_models/resnet152_places365.caffemodel', 87 | help='model_weight') 88 | parser.add_argument("--model_deploy", dest="model_deploy", type=str, 89 | default='pretrained_models/deploy_resnet152_places365.prototxt', 90 | help='deploy') 91 | args = parser.parse_args() 92 | params = vars(args) 93 | # TODO: remove this limit 94 | assert params['n_frame_steps'] % params['batch_size'] == 0, 'For simplicity, n_frame_steps%batch_size must = 0' 95 | caffe.set_device(params['gpu']) 96 | caffe.set_mode_gpu() 97 | model_weights = params['model_weight'] 98 | model_def = params['model_deploy'] 99 | net = caffe.Net(model_def, model_weights, caffe.TEST) 100 | extract_feats(params, net) 101 | -------------------------------------------------------------------------------- /caffe_feat_extract.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python caffe_feat_extract.py \ 4 | --video_path data/videos \ 5 | --output_dir data/feats/resnet269 \ 6 | --model_weight pretrained_models/resnet269-v2.caffemodel \ 7 | --model_deploy pretrained_models/deploy_resnet269-v2.prototxt \ 8 | --n_frame_steps 80 \ 9 | --gpu 1 \ 10 | --batch_size 10 \ 11 | 12 | 13 | #python caffe_feat_extract.py \ 14 | #--video_path data/videos \ 15 | #--output_dir data/feats/resnet152_places365 \ 16 | #--model_weight pretrained_models/resnet152_places365.caffemodel \ 17 | #--model_deploy pretrained_models/deploy_resnet152_places365.prototxt \ 18 | #--n_frame_steps 80 \ 19 | #--gpu 0 \ 20 | -------------------------------------------------------------------------------- /coco-caption/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 DingXia 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright 
notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /coco-caption/pyciderevalcap/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /coco-caption/pyciderevalcap/cider/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /coco-caption/pyciderevalcap/cider/cider.py: -------------------------------------------------------------------------------- 1 | # Filename: cider.py 2 | # 3 | # 4 | # Description: Describes the class to compute the CIDEr 5 | # (Consensus-Based Image Description Evaluation) Metric 6 | # by Vedantam, Zitnick, and Parikh (http://arxiv.org/abs/1411.5726) 7 | # 8 | # Creation Date: Sun Feb 8 14:16:54 2015 9 | # 10 | # Authors: Ramakrishna Vedantam and 11 | # Tsung-Yi Lin 12 | 13 | from cider_scorer import CiderScorer 14 | 15 | 16 | class Cider: 17 | """ 18 | Main Class to compute the CIDEr metric 19 | 20 | """ 21 | def __init__(self, n=4, df="corpus"): 22 | """ 23 | Initialize the CIDEr scoring function 24 | : param n (int): n-gram size 25 | : param df (string): specifies where to get the IDF values from 26 | takes values 'corpus', 'coco-train' 27 | : return: None 28 | """ 29 | # set cider to sum over 1 to 4-grams 30 | self._n = n 31 | self._df = df 32 | self.cider_scorer = CiderScorer(n=self._n, df_mode=self._df) 33 | 34 | def compute_score(self, gts, res): 35 | """ 36 | Main function to compute CIDEr score 37 | : param gts (dict) : {image:tokenized reference sentence} 38 | : param res (dict) : {image:tokenized candidate sentence} 39 | : return: cider (float) : computed CIDEr score for the corpus 40 | """ 41 | 42 | # clear all the previous hypos and refs 43 | self.cider_scorer.clear() 44 | 45 | for res_id in res: 46 | 47 | hypo = res_id['caption'] 48 | ref = gts[res_id['image_id']] 49 | 50 | # Sanity check. 
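            # Added note: unlike the docstring above, `res` here is a list of
            # {'image_id': ..., 'caption': [candidate]} entries, each holding
            # exactly one tokenized candidate sentence, while gts[image_id] is
            # a non-empty list of tokenized references; the asserts below
            # enforce exactly this shape.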
51 | assert(type(hypo) is list) 52 | assert(len(hypo) == 1) 53 | assert(type(ref) is list) 54 | assert(len(ref) > 0) 55 | self.cider_scorer += (hypo[0], ref) 56 | 57 | (score, scores) = self.cider_scorer.compute_score() 58 | 59 | return score, scores 60 | 61 | def method(self): 62 | return "CIDEr" 63 | -------------------------------------------------------------------------------- /coco-caption/pyciderevalcap/ciderD/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /coco-caption/pyciderevalcap/ciderD/ciderD.py: -------------------------------------------------------------------------------- 1 | # Filename: ciderD.py 2 | # 3 | # Description: Describes the class to compute the CIDEr-D (Consensus-Based Image Description Evaluation) Metric 4 | # by Vedantam, Zitnick, and Parikh (http://arxiv.org/abs/1411.5726) 5 | # 6 | # Creation Date: Sun Feb 8 14:16:54 2015 7 | # 8 | # Authors: Ramakrishna Vedantam and Tsung-Yi Lin 9 | 10 | from .ciderD_scorer import CiderScorer 11 | 12 | 13 | class CiderD: 14 | """ 15 | Main Class to compute the CIDEr metric 16 | 17 | """ 18 | def __init__(self, n=4, sigma=6.0, df="corpus"): 19 | # set cider to sum over 1 to 4-grams 20 | self._n = n 21 | # set the standard deviation parameter for gaussian penalty 22 | self._sigma = sigma 23 | # set which where to compute document frequencies from 24 | self._df = df 25 | self.cider_scorer = CiderScorer(n=self._n, df_mode=self._df) 26 | 27 | def compute_score(self, gts, res): 28 | """ 29 | Main function to compute CIDEr score 30 | :param hypo_for_image (dict) : dictionary with key and value 31 | ref_for_image (dict) : dictionary with key and value 32 | :return: cider (float) : computed CIDEr score for the corpus 33 | """ 34 | 35 | # clear all the previous hypos and refs 36 | self.cider_scorer.clear() 37 | for res_id in res: 38 | 39 | hypo = res_id['caption'] 40 | ref = gts[res_id['image_id']] 41 | 42 | # Sanity check. 43 | assert(type(hypo) is list) 44 | assert(len(hypo) == 1) 45 | assert(type(ref) is list) 46 | assert(len(ref) > 0) 47 | self.cider_scorer += (hypo[0], ref) 48 | 49 | (score, scores) = self.cider_scorer.compute_score() 50 | 51 | return score, scores 52 | 53 | def method(self): 54 | return "CIDEr-D" 55 | -------------------------------------------------------------------------------- /coco-caption/pyciderevalcap/eval.py: -------------------------------------------------------------------------------- 1 | __author__ = 'rama' 2 | from tokenizer.ptbtokenizer import PTBTokenizer 3 | from cider.cider import Cider 4 | from ciderD.ciderD import CiderD 5 | 6 | 7 | class CIDErEvalCap: 8 | def __init__(self, gts, res, df): 9 | print 'tokenization...' 10 | tokenizer = PTBTokenizer('gts') 11 | _gts = tokenizer.tokenize(gts) 12 | print 'tokenized refs' 13 | tokenizer = PTBTokenizer('res') 14 | _res = tokenizer.tokenize(res) 15 | print 'tokenized cands' 16 | 17 | self.gts = _gts 18 | self.res = _res 19 | self.df = df 20 | 21 | def evaluate(self): 22 | # ================================================= 23 | # Set up scorers 24 | # ================================================= 25 | 26 | print 'setting up scorers...' 
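        # Added note: `self.df` selects the document-frequency source used for
        # the IDF weights; per cider.Cider.__init__ it takes the values
        # 'corpus' or 'coco-train'.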
27 | scorers = [ 28 | (Cider(df=self.df), "CIDEr"), (CiderD(df=self.df), "CIDErD") 29 | ] 30 | 31 | # ================================================= 32 | # Compute scores 33 | # ================================================= 34 | metric_scores = {} 35 | for scorer, method in scorers: 36 | print 'computing %s score...' % (scorer.method()) 37 | score, scores = scorer.compute_score(self.gts, self.res) 38 | print "Mean %s score: %0.3f" % (method, score) 39 | metric_scores[method] = list(scores) 40 | return metric_scores 41 | -------------------------------------------------------------------------------- /coco-caption/pyciderevalcap/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'hfang' 2 | -------------------------------------------------------------------------------- /coco-caption/pyciderevalcap/tokenizer/ptbtokenizer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # File Name : ptbtokenizer.py 4 | # 5 | # Description : Do the PTB Tokenization and remove punctuations. 6 | # 7 | # Creation Date : 29-12-2014 8 | # Last Modified : Thu Mar 19 09:53:35 2015 9 | # Authors : Hao Fang and Tsung-Yi Lin 10 | 11 | import os 12 | import pdb # python debugger 13 | import sys 14 | import subprocess 15 | import re 16 | import tempfile 17 | import itertools 18 | 19 | # path to the stanford corenlp jar 20 | STANFORD_CORENLP_3_4_1_JAR = 'stanford-corenlp-3.4.1.jar' 21 | 22 | # punctuations to be removed from the sentences 23 | PUNCTUATIONS = ["''", "'", "``", "`", "-LRB-", "-RRB-", "-LCB-", "-RCB-", \ 24 | ".", "?", "!", ",", ":", "-", "--", "...", ";"] 25 | 26 | class PTBTokenizer: 27 | """Python wrapper of Stanford PTBTokenizer""" 28 | def __init__(self, _source='gts'): 29 | self.source = _source 30 | 31 | def tokenize(self, captions_for_image): 32 | cmd = ['java', '-cp', STANFORD_CORENLP_3_4_1_JAR, \ 33 | 'edu.stanford.nlp.process.PTBTokenizer', \ 34 | '-preserveLines', '-lowerCase'] 35 | 36 | # ====================================================== 37 | # prepare data for PTB Tokenizer 38 | # ====================================================== 39 | 40 | if self.source == 'gts': 41 | image_id = [k for k, v in captions_for_image.items() for _ in range(len(v))] 42 | sentences = '\n'.join([c['caption'].replace('\n', ' ') for k, v in captions_for_image.items() for c in v]) 43 | final_tokenized_captions_for_image = {} 44 | 45 | elif self.source == 'res': 46 | index = [i for i, v in enumerate(captions_for_image)] 47 | image_id = [v["image_id"] for v in captions_for_image] 48 | sentences = '\n'.join(v["caption"].replace('\n', ' ') for v in captions_for_image ) 49 | final_tokenized_captions_for_index = [] 50 | 51 | # ====================================================== 52 | # save sentences to temporary file 53 | # ====================================================== 54 | path_to_jar_dirname=os.path.dirname(os.path.abspath(__file__)) 55 | tmp_file = tempfile.NamedTemporaryFile(delete=False, dir=path_to_jar_dirname) 56 | tmp_file.write(sentences) 57 | tmp_file.close() 58 | 59 | # ====================================================== 60 | # tokenize sentence 61 | # ====================================================== 62 | cmd.append(os.path.basename(tmp_file.name)) 63 | p_tokenizer = subprocess.Popen(cmd, cwd=path_to_jar_dirname, \ 64 | stdout=subprocess.PIPE) 65 | token_lines = p_tokenizer.communicate(input=sentences.rstrip())[0] 66 | lines = 
token_lines.split('\n') 67 | # remove temp file 68 | os.remove(tmp_file.name) 69 | 70 | # ====================================================== 71 | # create dictionary for tokenized captions 72 | # ====================================================== 73 | if self.source == 'gts': 74 | for k, line in zip(image_id, lines): 75 | if not k in final_tokenized_captions_for_image: 76 | final_tokenized_captions_for_image[k] = [] 77 | tokenized_caption = ' '.join([w for w in line.rstrip().split(' ') \ 78 | if w not in PUNCTUATIONS]) 79 | final_tokenized_captions_for_image[k].append(tokenized_caption) 80 | 81 | return final_tokenized_captions_for_image 82 | 83 | elif self.source == 'res': 84 | for k, img, line in zip(index, image_id, lines): 85 | tokenized_caption = ' '.join([w for w in line.rstrip().split(' ') \ 86 | if w not in PUNCTUATIONS]) 87 | final_tokenized_captions_for_index.append({'image_id': img, 'caption': [tokenized_caption]}) 88 | 89 | return final_tokenized_captions_for_index 90 | -------------------------------------------------------------------------------- /coco-caption/pyciderevalcap/tokenizer/stanford-corenlp-3.4.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sundrops/video-caption.pytorch/66430862696678f7233a1fc67af22f0525b22e52/coco-caption/pyciderevalcap/tokenizer/stanford-corenlp-3.4.1.jar -------------------------------------------------------------------------------- /coco-caption/pyciderevalcap/tokenizer/tmpBF49XX: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sundrops/video-caption.pytorch/66430862696678f7233a1fc67af22f0525b22e52/coco-caption/pyciderevalcap/tokenizer/tmpBF49XX -------------------------------------------------------------------------------- /coco-caption/pyciderevalcap/tokenizer/tmpql9uU7: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sundrops/video-caption.pytorch/66430862696678f7233a1fc67af22f0525b22e52/coco-caption/pyciderevalcap/tokenizer/tmpql9uU7 -------------------------------------------------------------------------------- /coco-caption/pyciderevalcap/tokenizer/tmpuCp_T0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sundrops/video-caption.pytorch/66430862696678f7233a1fc67af22f0525b22e52/coco-caption/pyciderevalcap/tokenizer/tmpuCp_T0 -------------------------------------------------------------------------------- /coco-caption/pyciderevalcap/tokenizer/tmpxAmV_C: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sundrops/video-caption.pytorch/66430862696678f7233a1fc67af22f0525b22e52/coco-caption/pyciderevalcap/tokenizer/tmpxAmV_C -------------------------------------------------------------------------------- /coco-caption/pyciderevalcap/tokenizer/tmpzNW4I2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sundrops/video-caption.pytorch/66430862696678f7233a1fc67af22f0525b22e52/coco-caption/pyciderevalcap/tokenizer/tmpzNW4I2 -------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- 
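The pycocoevalcap scorers that follow (Bleu, Cider, and the other metrics) share the same `compute_score(gts, res)` interface. As a hedged illustration only, a minimal driver might look like the sketch below; the video id and captions are invented, the sentences are assumed to be already tokenized (PTBTokenizer normally handles this), and the `coco-caption` directory is assumed to be on `sys.path`. With a toy single-video corpus the CIDEr IDF statistics are degenerate, so the numbers only demonstrate the calling convention.

```python
# Minimal usage sketch (illustrative assumptions: captions are already
# tokenized, and the coco-caption directory has been added to sys.path).
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.cider.cider import Cider

# gts maps an id to a list of reference sentences,
# res maps the same id to a single-item list with the candidate sentence.
gts = {'video0': ['a man is playing a guitar', 'someone plays the guitar']}
res = {'video0': ['a man plays a guitar']}

for scorer, name in [(Bleu(4), 'Bleu'), (Cider(), 'CIDEr')]:
    score, _ = scorer.compute_score(gts, res)
    print('%s: %s' % (name, score))
```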
/coco-caption/pycocoevalcap/bleu/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015 Xinlei Chen, Hao Fang, Tsung-Yi Lin, and Ramakrishna Vedantam 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/bleu/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/bleu/bleu.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # File Name : bleu.py 4 | # 5 | # Description : Wrapper for BLEU scorer. 6 | # 7 | # Creation Date : 06-01-2015 8 | # Last Modified : Thu 19 Mar 2015 09:13:28 PM PDT 9 | # Authors : Hao Fang and Tsung-Yi Lin 10 | 11 | from .bleu_scorer import BleuScorer 12 | 13 | 14 | class Bleu: 15 | def __init__(self, n=4): 16 | # default compute Blue score up to 4 17 | self._n = n 18 | self._hypo_for_image = {} 19 | self.ref_for_image = {} 20 | 21 | def compute_score(self, gts, res): 22 | 23 | assert(sorted(gts.keys()) == sorted(res.keys())) 24 | imgIds = sorted(gts.keys()) 25 | 26 | bleu_scorer = BleuScorer(n=self._n) 27 | for id in imgIds: 28 | hypo = res[id] 29 | ref = gts[id] 30 | 31 | # Sanity check. 
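            # Added note: each hypothesis list must hold exactly one tokenized
            # candidate sentence and each reference list at least one; the
            # asserts below enforce this before feeding the BLEU scorer.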
32 | assert(type(hypo) is list) 33 | assert(len(hypo) == 1) 34 | assert(type(ref) is list) 35 | assert(len(ref) >= 1) 36 | 37 | bleu_scorer += (hypo[0], ref) 38 | 39 | #score, scores = bleu_scorer.compute_score(option='shortest') 40 | score, scores = bleu_scorer.compute_score(option='closest', verbose=1) 41 | #score, scores = bleu_scorer.compute_score(option='average', verbose=1) 42 | 43 | # return (bleu, bleu_info) 44 | return score, scores 45 | 46 | def method(self): 47 | return "Bleu" 48 | -------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/cider/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/cider/cider.py: -------------------------------------------------------------------------------- 1 | # Filename: cider.py 2 | # 3 | # Description: Describes the class to compute the CIDEr (Consensus-Based Image Description Evaluation) Metric 4 | # by Vedantam, Zitnick, and Parikh (http://arxiv.org/abs/1411.5726) 5 | # 6 | # Creation Date: Sun Feb 8 14:16:54 2015 7 | # 8 | # Authors: Ramakrishna Vedantam and Tsung-Yi Lin 9 | 10 | from .cider_scorer import CiderScorer 11 | import pdb 12 | 13 | class Cider: 14 | """ 15 | Main Class to compute the CIDEr metric 16 | 17 | """ 18 | def __init__(self, test=None, refs=None, n=4, sigma=6.0): 19 | # set cider to sum over 1 to 4-grams 20 | self._n = n 21 | # set the standard deviation parameter for gaussian penalty 22 | self._sigma = sigma 23 | 24 | def compute_score(self, gts, res): 25 | """ 26 | Main function to compute CIDEr score 27 | :param hypo_for_image (dict) : dictionary with key and value 28 | ref_for_image (dict) : dictionary with key and value 29 | :return: cider (float) : computed CIDEr score for the corpus 30 | """ 31 | 32 | assert(sorted(gts.keys()) == sorted(res.keys())) 33 | imgIds = sorted(gts.keys()) 34 | 35 | cider_scorer = CiderScorer(n=self._n, sigma=self._sigma) 36 | 37 | for id in imgIds: 38 | hypo = res[id] 39 | ref = gts[id] 40 | 41 | # Sanity check. 42 | assert(type(hypo) is list) 43 | assert(len(hypo) == 1) 44 | assert(type(ref) is list) 45 | assert(len(ref) >= 1) 46 | 47 | cider_scorer += (hypo[0], ref) 48 | 49 | (score, scores) = cider_scorer.compute_score() 50 | 51 | return score, scores 52 | 53 | def method(self): 54 | return "CIDEr" 55 | -------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/cider/cider_scorer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Tsung-Yi Lin 3 | # Ramakrishna Vedantam 4 | 5 | import copy 6 | from collections import defaultdict 7 | import numpy as np 8 | import pdb 9 | import math 10 | 11 | def precook(s, n=4, out=False): 12 | """ 13 | Takes a string as input and returns an object that can be given to 14 | either cook_refs or cook_test. This is optional: cook_refs and cook_test 15 | can take string arguments as well. 
16 | :param s: string : sentence to be converted into ngrams 17 | :param n: int : number of ngrams for which representation is calculated 18 | :return: term frequency vector for occuring ngrams 19 | """ 20 | words = s.split() 21 | counts = defaultdict(int) 22 | for k in range(1,n+1): 23 | for i in range(len(words)-k+1): 24 | ngram = tuple(words[i:i+k]) 25 | counts[ngram] += 1 26 | return counts 27 | 28 | def cook_refs(refs, n=4): ## lhuang: oracle will call with "average" 29 | '''Takes a list of reference sentences for a single segment 30 | and returns an object that encapsulates everything that BLEU 31 | needs to know about them. 32 | :param refs: list of string : reference sentences for some image 33 | :param n: int : number of ngrams for which (ngram) representation is calculated 34 | :return: result (list of dict) 35 | ''' 36 | return [precook(ref, n) for ref in refs] 37 | 38 | def cook_test(test, n=4): 39 | '''Takes a test sentence and returns an object that 40 | encapsulates everything that BLEU needs to know about it. 41 | :param test: list of string : hypothesis sentence for some image 42 | :param n: int : number of ngrams for which (ngram) representation is calculated 43 | :return: result (dict) 44 | ''' 45 | return precook(test, n, True) 46 | 47 | class CiderScorer(object): 48 | """CIDEr scorer. 49 | """ 50 | 51 | def copy(self): 52 | ''' copy the refs.''' 53 | new = CiderScorer(n=self.n) 54 | new.ctest = copy.copy(self.ctest) 55 | new.crefs = copy.copy(self.crefs) 56 | return new 57 | 58 | def __init__(self, test=None, refs=None, n=4, sigma=6.0): 59 | ''' singular instance ''' 60 | self.n = n 61 | self.sigma = sigma 62 | self.crefs = [] 63 | self.ctest = [] 64 | self.document_frequency = defaultdict(float) 65 | self.cook_append(test, refs) 66 | self.ref_len = None 67 | 68 | def cook_append(self, test, refs): 69 | '''called by constructor and __iadd__ to avoid creating new instances.''' 70 | 71 | if refs is not None: 72 | self.crefs.append(cook_refs(refs)) 73 | if test is not None: 74 | self.ctest.append(cook_test(test)) ## N.B.: -1 75 | else: 76 | self.ctest.append(None) # lens of crefs and ctest have to match 77 | 78 | def size(self): 79 | assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest)) 80 | return len(self.crefs) 81 | 82 | def __iadd__(self, other): 83 | '''add an instance (e.g., from another sentence).''' 84 | 85 | if type(other) is tuple: 86 | ## avoid creating new CiderScorer instances 87 | self.cook_append(other[0], other[1]) 88 | else: 89 | self.ctest.extend(other.ctest) 90 | self.crefs.extend(other.crefs) 91 | 92 | return self 93 | def compute_doc_freq(self): 94 | ''' 95 | Compute term frequency for reference data. 96 | This will be used to compute idf (inverse document frequency later) 97 | The term frequency is stored in the object 98 | :return: None 99 | ''' 100 | for refs in self.crefs: 101 | # refs, k ref captions of one image 102 | for ngram in set([ngram for ref in refs for (ngram,count) in ref.items()]): 103 | self.document_frequency[ngram] += 1 104 | # maxcounts[ngram] = max(maxcounts.get(ngram,0), count) 105 | 106 | def compute_cider(self): 107 | def counts2vec(cnts): 108 | """ 109 | Function maps counts of ngram to vector of tfidf weights. 110 | The function returns vec, an array of dictionary that store mapping of n-gram and tf-idf weights. 111 | The n-th entry of array denotes length of n-grams. 
112 | :param cnts: 113 | :return: vec (array of dict), norm (array of float), length (int) 114 | """ 115 | vec = [defaultdict(float) for _ in range(self.n)] 116 | length = 0 117 | norm = [0.0 for _ in range(self.n)] 118 | for (ngram,term_freq) in cnts.items(): 119 | # give word count 1 if it doesn't appear in reference corpus 120 | df = np.log(max(1.0, self.document_frequency[ngram])) 121 | # ngram index 122 | n = len(ngram)-1 123 | # tf (term_freq) * idf (precomputed idf) for n-grams 124 | vec[n][ngram] = float(term_freq)*(self.ref_len - df) 125 | # compute norm for the vector. the norm will be used for computing similarity 126 | norm[n] += pow(vec[n][ngram], 2) 127 | 128 | if n == 1: 129 | length += term_freq 130 | norm = [np.sqrt(n) for n in norm] 131 | return vec, norm, length 132 | 133 | def sim(vec_hyp, vec_ref, norm_hyp, norm_ref, length_hyp, length_ref): 134 | ''' 135 | Compute the cosine similarity of two vectors. 136 | :param vec_hyp: array of dictionary for vector corresponding to hypothesis 137 | :param vec_ref: array of dictionary for vector corresponding to reference 138 | :param norm_hyp: array of float for vector corresponding to hypothesis 139 | :param norm_ref: array of float for vector corresponding to reference 140 | :param length_hyp: int containing length of hypothesis 141 | :param length_ref: int containing length of reference 142 | :return: array of score for each n-grams cosine similarity 143 | ''' 144 | delta = float(length_hyp - length_ref) 145 | # measure consine similarity 146 | val = np.array([0.0 for _ in range(self.n)]) 147 | for n in range(self.n): 148 | # ngram 149 | for (ngram,count) in vec_hyp[n].items(): 150 | # vrama91 : added clipping 151 | val[n] += min(vec_hyp[n][ngram], vec_ref[n][ngram]) * vec_ref[n][ngram] 152 | 153 | if (norm_hyp[n] != 0) and (norm_ref[n] != 0): 154 | val[n] /= (norm_hyp[n]*norm_ref[n]) 155 | 156 | assert(not math.isnan(val[n])) 157 | # vrama91: added a length based gaussian penalty 158 | val[n] *= np.e**(-(delta**2)/(2*self.sigma**2)) 159 | return val 160 | 161 | # compute log reference length 162 | self.ref_len = np.log(float(len(self.crefs))) 163 | 164 | scores = [] 165 | for test, refs in zip(self.ctest, self.crefs): 166 | # compute vector for test captions 167 | vec, norm, length = counts2vec(test) 168 | # compute vector for ref captions 169 | score = np.array([0.0 for _ in range(self.n)]) 170 | for ref in refs: 171 | vec_ref, norm_ref, length_ref = counts2vec(ref) 172 | score += sim(vec, vec_ref, norm, norm_ref, length, length_ref) 173 | # change by vrama91 - mean of ngram scores, instead of sum 174 | score_avg = np.mean(score) 175 | # divide by number of references 176 | score_avg /= len(refs) 177 | # multiply score by 10 178 | score_avg *= 10.0 179 | # append score of an image to the score list 180 | scores.append(score_avg) 181 | return scores 182 | 183 | def compute_score(self, option=None, verbose=0): 184 | # compute idf 185 | self.compute_doc_freq() 186 | # assert to check document frequency 187 | assert(len(self.ctest) >= max(self.document_frequency.values())) 188 | # compute cider score 189 | score = self.compute_cider() 190 | # debug 191 | # print score 192 | return np.mean(np.array(score)), np.array(score) -------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/eval.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | from .tokenizer.ptbtokenizer import PTBTokenizer 3 | from .bleu.bleu import Bleu 4 | 
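To make the weighting in `compute_cider` above concrete: each n-gram is weighted by tf * (log N_images - log max(1, df)), and each per-n cosine similarity is scaled by the Gaussian length penalty exp(-delta^2 / (2 * sigma^2)) with sigma = 6. A small numeric sketch (the corpus size and document frequencies are invented):

```python
import numpy as np

num_images = 1000                     # hypothetical corpus size
ref_len = np.log(float(num_images))   # as set in compute_cider

print(ref_len - np.log(800.0))   # ~0.22 -- very common n-gram, tiny idf weight
print(ref_len - np.log(3.0))     # ~5.81 -- rare n-gram, large idf weight

sigma = 6.0
for delta in (0, 3, 10):   # length difference between hypothesis and reference
    print(delta, np.e ** (-(delta ** 2) / (2 * sigma ** 2)))
# 0 -> 1.00, 3 -> ~0.88, 10 -> ~0.25
```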
from .meteor.meteor import Meteor 5 | from .rouge.rouge import Rouge 6 | from .cider.cider import Cider 7 | from .spice.spice import Spice 8 | 9 | class COCOEvalCap: 10 | def __init__(self, coco, cocoRes): 11 | self.evalImgs = [] 12 | self.eval = {} 13 | self.imgToEval = {} 14 | self.coco = coco 15 | self.cocoRes = cocoRes 16 | self.params = {'image_id': coco.getImgIds()} 17 | 18 | def evaluate(self): 19 | imgIds = self.params['image_id'] 20 | # imgIds = self.coco.getImgIds() 21 | gts = {} 22 | res = {} 23 | for imgId in imgIds: 24 | gts[imgId] = self.coco.imgToAnns[imgId] 25 | res[imgId] = self.cocoRes.imgToAnns[imgId] 26 | 27 | # ================================================= 28 | # Set up scorers 29 | # ================================================= 30 | print('tokenization...') 31 | tokenizer = PTBTokenizer() 32 | gts = tokenizer.tokenize(gts) 33 | res = tokenizer.tokenize(res) 34 | 35 | # ================================================= 36 | # Set up scorers 37 | # ================================================= 38 | print('setting up scorers...') 39 | scorers = [ 40 | (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]), 41 | (Meteor(),"METEOR"), 42 | (Rouge(), "ROUGE_L"), 43 | (Cider(), "CIDEr"), 44 | (Spice(), "SPICE") 45 | ] 46 | 47 | # ================================================= 48 | # Compute scores 49 | # ================================================= 50 | for scorer, method in scorers: 51 | print('computing %s score...'%(scorer.method())) 52 | score, scores = scorer.compute_score(gts, res) 53 | if type(method) == list: 54 | for sc, scs, m in zip(score, scores, method): 55 | self.setEval(sc, m) 56 | self.setImgToEvalImgs(scs, gts.keys(), m) 57 | print("%s: %0.3f"%(m, sc)) 58 | else: 59 | self.setEval(score, method) 60 | self.setImgToEvalImgs(scores, gts.keys(), method) 61 | print("%s: %0.3f"%(method, score)) 62 | self.setEvalImgs() 63 | 64 | def setEval(self, score, method): 65 | self.eval[method] = score 66 | 67 | def setImgToEvalImgs(self, scores, imgIds, method): 68 | for imgId, score in zip(sorted(imgIds), scores): 69 | if not imgId in self.imgToEval: 70 | self.imgToEval[imgId] = {} 71 | self.imgToEval[imgId]["image_id"] = imgId 72 | self.imgToEval[imgId][method] = score 73 | 74 | def setEvalImgs(self): 75 | self.evalImgs = [self.imgToEval[imgId] for imgId in sorted(self.imgToEval.keys())] 76 | -------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/meteor/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/meteor/meteor-1.5.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sundrops/video-caption.pytorch/66430862696678f7233a1fc67af22f0525b22e52/coco-caption/pycocoevalcap/meteor/meteor-1.5.jar -------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/meteor/meteor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Python wrapper for METEOR implementation, by Xinlei Chen 4 | # Acknowledge Michael Denkowski for the generous discussion and help 5 | 6 | import os 7 | import sys 8 | import subprocess 9 | import threading 10 | 11 | # Assumes meteor-1.5.jar is in the same directory as meteor.py. Change as needed. 
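A minimal sketch of driving the `COCOEvalCap` class above with the `COCO` loader bundled under `pycocotools` (the annotation and result paths are placeholders; METEOR needs Java on the PATH, and the `Spice` import above additionally requires a `pycocoevalcap/spice` module to be present):

```python
import sys
sys.path.append('coco-caption')

from pycocotools.coco import COCO
from pycocoevalcap.eval import COCOEvalCap

# placeholder files: ground-truth annotations plus a result file of the
# form [{"image_id": ..., "caption": ...}, ...]
coco = COCO('annotations/captions_val2014.json')
coco_res = coco.loadRes('results/captions_val2014_results.json')

coco_eval = COCOEvalCap(coco, coco_res)
# score only the images we actually produced captions for
coco_eval.params['image_id'] = coco_res.getImgIds()
coco_eval.evaluate()

for metric, value in coco_eval.eval.items():
    print('%s: %.3f' % (metric, value))
```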
12 | METEOR_JAR = 'meteor-1.5.jar' 13 | # print METEOR_JAR 14 | 15 | class Meteor: 16 | 17 | def __init__(self): 18 | self.meteor_cmd = ['java', '-jar', '-Xmx2G', METEOR_JAR, \ 19 | '-', '-', '-stdio', '-l', 'en', '-norm'] 20 | self.meteor_p = subprocess.Popen(self.meteor_cmd, \ 21 | cwd=os.path.dirname(os.path.abspath(__file__)), \ 22 | stdin=subprocess.PIPE, \ 23 | stdout=subprocess.PIPE, \ 24 | stderr=subprocess.PIPE) 25 | # Used to guarantee thread safety 26 | self.lock = threading.Lock() 27 | 28 | def compute_score(self, gts, res): 29 | assert(sorted(gts.keys()) == sorted(res.keys())) 30 | imgIds = sorted(gts.keys()) 31 | scores = [] 32 | 33 | eval_line = 'EVAL' 34 | self.lock.acquire() 35 | for i in imgIds: 36 | assert(len(res[i]) == 1) 37 | stat = self._stat(res[i][0], gts[i]) 38 | eval_line += ' ||| {}'.format(stat) 39 | 40 | self.meteor_p.stdin.write('{}\n'.format(eval_line).encode()) 41 | self.meteor_p.stdin.flush() 42 | for i in range(0, len(imgIds)): 43 | scores.append(float(self.meteor_p.stdout.readline().decode().strip())) 44 | score = float(self.meteor_p.stdout.readline().decode().strip()) 45 | self.lock.release() 46 | 47 | return score, scores 48 | 49 | def method(self): 50 | return "METEOR" 51 | 52 | def _stat(self, hypothesis_str, reference_list): 53 | # SCORE ||| reference 1 words ||| reference n words ||| hypothesis words 54 | hypothesis_str = hypothesis_str.replace('|||','').replace(' ',' ') 55 | score_line = ' ||| '.join(('SCORE', ' ||| '.join(reference_list), hypothesis_str)) 56 | self.meteor_p.stdin.write('{}\n'.format(score_line).encode()) 57 | self.meteor_p.stdin.flush() 58 | return self.meteor_p.stdout.readline().decode().strip() 59 | 60 | def _score(self, hypothesis_str, reference_list): 61 | self.lock.acquire() 62 | # SCORE ||| reference 1 words ||| reference n words ||| hypothesis words 63 | hypothesis_str = hypothesis_str.replace('|||','').replace(' ',' ') 64 | score_line = ' ||| '.join(('SCORE', ' ||| '.join(reference_list), hypothesis_str)) 65 | self.meteor_p.stdin.write('{}\n'.format(score_line).encode()) 66 | self.meteor_p.stdin.flush() 67 | stats = self.meteor_p.stdout.readline().decode().strip() 68 | eval_line = 'EVAL ||| {}'.format(stats) 69 | # EVAL ||| stats 70 | self.meteor_p.stdin.write('{}\n'.format(eval_line).encode()) 71 | self.meteor_p.stdin.flush() 72 | score = float(self.meteor_p.stdout.readline().decode().strip()) 73 | # bug fix: there are two values returned by the jar file, one average, and one all, so do it twice 74 | # thanks for Andrej for pointing this out 75 | score = float(self.meteor_p.stdout.readline().strip()) 76 | self.lock.release() 77 | return score 78 | 79 | def __exit__(self): 80 | self.lock.acquire() 81 | self.meteor_p.stdin.close() 82 | self.meteor_p.kill() 83 | self.meteor_p.wait() 84 | self.lock.release() 85 | -------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/rouge/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'vrama91' 2 | -------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/rouge/rouge.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # File Name : rouge.py 4 | # 5 | # Description : Computes ROUGE-L metric as described by Lin and Hovey (2004) 6 | # 7 | # Creation Date : 2015-01-07 06:03 8 | # Author : Ramakrishna Vedantam 9 | 10 | import numpy as np 11 | import pdb 12 | 13 
| def my_lcs(string, sub): 14 | """ 15 | Calculates longest common subsequence for a pair of tokenized strings 16 | :param string : list of str : tokens from a string split using whitespace 17 | :param sub : list of str : shorter string, also split using whitespace 18 | :returns: length (list of int): length of the longest common subsequence between the two strings 19 | 20 | Note: my_lcs only gives length of the longest common subsequence, not the actual LCS 21 | """ 22 | if(len(string)< len(sub)): 23 | sub, string = string, sub 24 | 25 | lengths = [[0 for i in range(0,len(sub)+1)] for j in range(0,len(string)+1)] 26 | 27 | for j in range(1,len(sub)+1): 28 | for i in range(1,len(string)+1): 29 | if(string[i-1] == sub[j-1]): 30 | lengths[i][j] = lengths[i-1][j-1] + 1 31 | else: 32 | lengths[i][j] = max(lengths[i-1][j] , lengths[i][j-1]) 33 | 34 | return lengths[len(string)][len(sub)] 35 | 36 | class Rouge(): 37 | ''' 38 | Class for computing ROUGE-L score for a set of candidate sentences for the MS COCO test set 39 | 40 | ''' 41 | def __init__(self): 42 | # vrama91: updated the value below based on discussion with Hovey 43 | self.beta = 1.2 44 | 45 | def calc_score(self, candidate, refs): 46 | """ 47 | Compute ROUGE-L score given one candidate and references for an image 48 | :param candidate: str : candidate sentence to be evaluated 49 | :param refs: list of str : COCO reference sentences for the particular image to be evaluated 50 | :returns score: int (ROUGE-L score for the candidate evaluated against references) 51 | """ 52 | assert(len(candidate)==1) 53 | assert(len(refs)>0) 54 | prec = [] 55 | rec = [] 56 | 57 | # split into tokens 58 | token_c = candidate[0].split(" ") 59 | 60 | for reference in refs: 61 | # split into tokens 62 | token_r = reference.split(" ") 63 | # compute the longest common subsequence 64 | lcs = my_lcs(token_r, token_c) 65 | prec.append(lcs/float(len(token_c))) 66 | rec.append(lcs/float(len(token_r))) 67 | 68 | prec_max = max(prec) 69 | rec_max = max(rec) 70 | 71 | if(prec_max!=0 and rec_max !=0): 72 | score = ((1 + self.beta**2)*prec_max*rec_max)/float(rec_max + self.beta**2*prec_max) 73 | else: 74 | score = 0.0 75 | return score 76 | 77 | def compute_score(self, gts, res): 78 | """ 79 | Computes Rouge-L score given a set of reference and candidate sentences for the dataset 80 | Invoked by evaluate_captions.py 81 | :param hypo_for_image: dict : candidate / test sentences with "image name" key and "tokenized sentences" as values 82 | :param ref_for_image: dict : reference MS-COCO sentences with "image name" key and "tokenized sentences" as values 83 | :returns: average_score: float (mean ROUGE-L score computed by averaging scores for all the images) 84 | """ 85 | assert(sorted(gts.keys()) == sorted(res.keys())) 86 | imgIds = sorted(gts.keys()) 87 | 88 | score = [] 89 | for id in imgIds: 90 | hypo = res[id] 91 | ref = gts[id] 92 | 93 | score.append(self.calc_score(hypo, ref)) 94 | 95 | # Sanity check. 
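As a worked toy example of the LCS-based scoring in `my_lcs` / `calc_score` above (same `sys.path` assumption as elsewhere in this repo):

```python
import sys
sys.path.append('coco-caption')

from pycocoevalcap.rouge.rouge import Rouge, my_lcs

candidate = ['a man is playing a guitar']
refs = ['a man plays the guitar', 'someone is playing guitar']

# the candidate and the first reference share ['a', 'man', 'guitar'] in order
print(my_lcs(refs[0].split(), candidate[0].split()))   # 3

# F-measure (beta = 1.2) of the best precision/recall over all references
print(Rouge().calc_score(candidate, refs))             # ~0.62
```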
96 | assert(type(hypo) is list) 97 | assert(len(hypo) == 1) 98 | assert(type(ref) is list) 99 | assert(len(ref) >= 1) 100 | 101 | average_score = np.mean(np.array(score)) 102 | return average_score, np.array(score) 103 | 104 | def method(self): 105 | return "Rouge" 106 | -------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'hfang' 2 | -------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/tokenizer/ptbtokenizer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # File Name : ptbtokenizer.py 4 | # 5 | # Description : Do the PTB Tokenization and remove punctuations. 6 | # 7 | # Creation Date : 29-12-2014 8 | # Last Modified : Thu Mar 19 09:53:35 2015 9 | # Authors : Hao Fang and Tsung-Yi Lin 10 | 11 | import os 12 | import sys 13 | import subprocess 14 | import tempfile 15 | import itertools 16 | 17 | # path to the stanford corenlp jar 18 | STANFORD_CORENLP_3_4_1_JAR = 'stanford-corenlp-3.4.1.jar' 19 | 20 | # punctuations to be removed from the sentences 21 | PUNCTUATIONS = ["''", "'", "``", "`", "-LRB-", "-RRB-", "-LCB-", "-RCB-", \ 22 | ".", "?", "!", ",", ":", "-", "--", "...", ";"] 23 | 24 | class PTBTokenizer: 25 | """Python wrapper of Stanford PTBTokenizer""" 26 | 27 | def tokenize(self, captions_for_image): 28 | cmd = ['java', '-cp', STANFORD_CORENLP_3_4_1_JAR, \ 29 | 'edu.stanford.nlp.process.PTBTokenizer', \ 30 | '-preserveLines', '-lowerCase'] 31 | 32 | # ====================================================== 33 | # prepare data for PTB Tokenizer 34 | # ====================================================== 35 | final_tokenized_captions_for_image = {} 36 | image_id = [k for k, v in captions_for_image.items() for _ in range(len(v))] 37 | sentences = '\n'.join([c['caption'].replace('\n', ' ') for k, v in captions_for_image.items() for c in v]) 38 | 39 | # ====================================================== 40 | # save sentences to temporary file 41 | # ====================================================== 42 | path_to_jar_dirname=os.path.dirname(os.path.abspath(__file__)) 43 | tmp_file = tempfile.NamedTemporaryFile(mode='w+', delete=False, dir=path_to_jar_dirname) 44 | tmp_file.write(sentences) 45 | tmp_file.close() 46 | 47 | # ====================================================== 48 | # tokenize sentence 49 | # ====================================================== 50 | cmd.append(os.path.basename(tmp_file.name)) 51 | p_tokenizer = subprocess.Popen(cmd, cwd=path_to_jar_dirname, \ 52 | stdout=subprocess.PIPE) 53 | token_lines = p_tokenizer.communicate(input=sentences.rstrip())[0] 54 | lines = token_lines.decode().split('\n') 55 | # remove temp file 56 | os.remove(tmp_file.name) 57 | 58 | # ====================================================== 59 | # create dictionary for tokenized captions 60 | # ====================================================== 61 | for k, line in zip(image_id, lines): 62 | if not k in final_tokenized_captions_for_image: 63 | final_tokenized_captions_for_image[k] = [] 64 | tokenized_caption = ' '.join([w for w in line.rstrip().split(' ') \ 65 | if w not in PUNCTUATIONS]) 66 | final_tokenized_captions_for_image[k].append(tokenized_caption) 67 | 68 | return final_tokenized_captions_for_image 69 | 
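The tokenizer above shells out to the bundled CoreNLP jar, so Java must be on the PATH. A minimal sketch of its input/output contract, with toy captions:

```python
import sys
sys.path.append('coco-caption')

from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer

# input: {image_id: [{'caption': raw sentence}, ...]}
captions = {
    'video0': [{'caption': 'A man is playing a guitar!'},
               {'caption': 'Someone plays the guitar.'}],
}

tokenized = PTBTokenizer().tokenize(captions)
print(tokenized['video0'])
# ['a man is playing a guitar', 'someone plays the guitar']
# -- lower-cased, punctuation stripped, one plain string per caption
```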
-------------------------------------------------------------------------------- /coco-caption/pycocoevalcap/tokenizer/stanford-corenlp-3.4.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sundrops/video-caption.pytorch/66430862696678f7233a1fc67af22f0525b22e52/coco-caption/pycocoevalcap/tokenizer/stanford-corenlp-3.4.1.jar -------------------------------------------------------------------------------- /coco-caption/pycocotools/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /coco-caption/pycocotools/mask.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tsungyi' 2 | 3 | import pycocotools._mask as _mask 4 | 5 | # Interface for manipulating masks stored in RLE format. 6 | # 7 | # RLE is a simple yet efficient format for storing binary masks. RLE 8 | # first divides a vector (or vectorized image) into a series of piecewise 9 | # constant regions and then for each piece simply stores the length of 10 | # that piece. For example, given M=[0 0 1 1 1 0 1] the RLE counts would 11 | # be [2 3 1 1], or for M=[1 1 1 1 1 1 0] the counts would be [0 6 1] 12 | # (note that the odd counts are always the numbers of zeros). Instead of 13 | # storing the counts directly, additional compression is achieved with a 14 | # variable bitrate representation based on a common scheme called LEB128. 15 | # 16 | # Compression is greatest given large piecewise constant regions. 17 | # Specifically, the size of the RLE is proportional to the number of 18 | # *boundaries* in M (or for an image the number of boundaries in the y 19 | # direction). Assuming fairly simple shapes, the RLE representation is 20 | # O(sqrt(n)) where n is number of pixels in the object. Hence space usage 21 | # is substantially lower, especially for large simple objects (large n). 22 | # 23 | # Many common operations on masks can be computed directly using the RLE 24 | # (without need for decoding). This includes computations such as area, 25 | # union, intersection, etc. All of these operations are linear in the 26 | # size of the RLE, in other words they are O(sqrt(n)) where n is the area 27 | # of the object. Computing these operations on the original mask is O(n). 28 | # Thus, using the RLE can result in substantial computational savings. 29 | # 30 | # The following API functions are defined: 31 | # encode - Encode binary masks using RLE. 32 | # decode - Decode binary masks encoded via RLE. 33 | # merge - Compute union or intersection of encoded masks. 34 | # iou - Compute intersection over union between masks. 35 | # area - Compute area of encoded masks. 36 | # toBbox - Get bounding boxes surrounding encoded masks. 37 | # frPyObjects - Convert polygon, bbox, and uncompressed RLE to encoded RLE mask. 38 | # 39 | # Usage: 40 | # Rs = encode( masks ) 41 | # masks = decode( Rs ) 42 | # R = merge( Rs, intersect=false ) 43 | # o = iou( dt, gt, iscrowd ) 44 | # a = area( Rs ) 45 | # bbs = toBbox( Rs ) 46 | # Rs = frPyObjects( [pyObjects], h, w ) 47 | # 48 | # In the API the following formats are used: 49 | # Rs - [dict] Run-length encoding of binary masks 50 | # R - dict Run-length encoding of binary mask 51 | # masks - [hxwxn] Binary mask(s) (must have type np.ndarray(dtype=uint8) in column-major order) 52 | # iscrowd - [nx1] list of np.ndarray. 
1 indicates corresponding gt image has crowd region to ignore 53 | # bbs - [nx4] Bounding box(es) stored as [x y w h] 54 | # poly - Polygon stored as [[x1 y1 x2 y2...],[x1 y1 ...],...] (2D list) 55 | # dt,gt - May be either bounding boxes or encoded masks 56 | # Both poly and bbs are 0-indexed (bbox=[0 0 1 1] encloses first pixel). 57 | # 58 | # Finally, a note about the intersection over union (iou) computation. 59 | # The standard iou of a ground truth (gt) and detected (dt) object is 60 | # iou(gt,dt) = area(intersect(gt,dt)) / area(union(gt,dt)) 61 | # For "crowd" regions, we use a modified criteria. If a gt object is 62 | # marked as "iscrowd", we allow a dt to match any subregion of the gt. 63 | # Choosing gt' in the crowd gt that best matches the dt can be done using 64 | # gt'=intersect(dt,gt). Since by definition union(gt',dt)=dt, computing 65 | # iou(gt,dt,iscrowd) = iou(gt',dt) = area(intersect(gt,dt)) / area(dt) 66 | # For crowd gt regions we use this modified criteria above for the iou. 67 | # 68 | # To compile run "python setup.py build_ext --inplace" 69 | # Please do not contact us for help with compiling. 70 | # 71 | # Microsoft COCO Toolbox. version 2.0 72 | # Data, paper, and tutorials available at: http://mscoco.org/ 73 | # Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 74 | # Licensed under the Simplified BSD License [see coco/license.txt] 75 | 76 | iou = _mask.iou 77 | merge = _mask.merge 78 | frPyObjects = _mask.frPyObjects 79 | 80 | def encode(bimask): 81 | if len(bimask.shape) == 3: 82 | return _mask.encode(bimask) 83 | elif len(bimask.shape) == 2: 84 | h, w = bimask.shape 85 | return _mask.encode(bimask.reshape((h, w, 1), order='F'))[0] 86 | 87 | def decode(rleObjs): 88 | if type(rleObjs) == list: 89 | return _mask.decode(rleObjs) 90 | else: 91 | return _mask.decode([rleObjs])[:,:,0] 92 | 93 | def area(rleObjs): 94 | if type(rleObjs) == list: 95 | return _mask.area(rleObjs) 96 | else: 97 | return _mask.area([rleObjs])[0] 98 | 99 | def toBbox(rleObjs): 100 | if type(rleObjs) == list: 101 | return _mask.toBbox(rleObjs) 102 | else: 103 | return _mask.toBbox([rleObjs])[0] -------------------------------------------------------------------------------- /dataloader.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import random 4 | import os 5 | import numpy as np 6 | import torch 7 | from torch.utils.data import Dataset 8 | 9 | 10 | class CocoDataset(Dataset): 11 | 12 | def __init__(self, coco_labels): 13 | # python 3 14 | # super().__init__() 15 | super(CocoDataset, self).__init__() 16 | self.coco_labels = list(coco_labels['labels'].items()) 17 | self.num_classes = coco_labels['num_classes'] 18 | 19 | def __getitem__(self, ix): 20 | labels = torch.zeros(self.num_classes) 21 | image_id, labels_ids = self.coco_labels[ix] 22 | labels[labels_ids] = 1 23 | data = {} 24 | data['image_ids'] = image_id 25 | data['labels'] = labels 26 | return data 27 | 28 | def __len__(self): 29 | return len(self.coco_labels) 30 | 31 | 32 | class VideoDataset(Dataset): 33 | 34 | def get_vocab_size(self): 35 | return len(self.get_vocab()) 36 | 37 | def get_vocab(self): 38 | return self.ix_to_word 39 | 40 | def get_seq_length(self): 41 | return self.seq_length 42 | 43 | def __init__(self, opt, mode): 44 | # python 3 45 | # super().__init__() 46 | super(VideoDataset, self).__init__() 47 | self.mode = mode # to load train/val/test data 48 | 49 | # load the json file which contains information about the dataset 50 | self.captions = 
json.load(open(opt["caption_json"])) 51 | info = json.load(open(opt["info_json"])) 52 | self.ix_to_word = info['ix_to_word'] 53 | self.word_to_ix = info['word_to_ix'] 54 | print('vocab size is ', len(self.ix_to_word)) 55 | self.splits = info['videos'] 56 | print('number of train videos: ', len(self.splits['train'])) 57 | print('number of val videos: ', len(self.splits['val'])) 58 | print('number of test videos: ', len(self.splits['test'])) 59 | self.n_frame_steps = opt['n_frame_steps'] 60 | self.feats_dir = opt["feats_dir"] 61 | self.c3d_feats_dir = opt['c3d_feats_dir'] 62 | self.with_c3d = opt['with_c3d'] 63 | print('load feats from %s' % (self.feats_dir)) 64 | # load in the sequence data 65 | self.max_len = opt["max_len"] 66 | print('max sequence length in data is', self.max_len) 67 | 68 | def __getitem__(self, ix): 69 | """This function returns a tuple that is further passed to collate_fn 70 | """ 71 | # which part of data to load 72 | if self.mode == 'val': 73 | ix += len(self.splits['train']) 74 | elif self.mode == 'test': 75 | ix = ix + len(self.splits['train']) + len(self.splits['val']) 76 | fc_feat = [] 77 | for dir in self.feats_dir: 78 | fc_feat.append(np.load(os.path.join(dir, 'video%i.npy' % (ix)))) 79 | fc_feat = np.concatenate(fc_feat, axis=1) 80 | samples = np.round(np.linspace( 81 | 0, fc_feat.shape[0] - 1, self.n_frame_steps)).astype(np.int32) 82 | fc_feat = fc_feat[samples, :] 83 | if self.with_c3d == 1: 84 | c3d_feat = np.load(os.path.join(self.c3d_feats_dir, 'video%i.npy'%(ix))) 85 | if len(c3d_feat.shape) == 1: 86 | fc_feat = np.concatenate((fc_feat, np.tile(c3d_feat, (fc_feat.shape[0], 1))), axis=1) 87 | elif len(c3d_feat.shape) == 2: 88 | samples = np.round(np.linspace( 89 | 0, c3d_feat.shape[0] - 1, fc_feat.shape[0])).astype(np.int32) 90 | fc_feat = np.concatenate((fc_feat, c3d_feat[samples, :]), axis=1) 91 | # label = torch.zeros(self.max_len) 92 | mask = torch.zeros(self.max_len) 93 | captions = self.captions['video%i'%(ix)]['final_captions'] 94 | gts = torch.zeros(len(captions), self.max_len).long() 95 | for i, cap in enumerate(captions): 96 | if len(cap) > self.max_len: 97 | cap = cap[:self.max_len] 98 | cap[-1] = '' 99 | for j, w in enumerate(cap): 100 | gts[i, j] = self.word_to_ix[w] 101 | # # add by rgh 102 | # if w in self.word_to_ix.keys(): 103 | # gts[i, j] = self.word_to_ix[w] 104 | # else: 105 | # gts[i, j] = 0 106 | 107 | # random select a caption for this video 108 | cap_ix = random.randint(0, len(captions) - 1) 109 | label = gts[cap_ix] 110 | non_zero = (label == 0).nonzero() 111 | mask[:int(non_zero[0]) + 1] = 1 112 | 113 | data = {} 114 | data['fc_feats'] = torch.from_numpy(fc_feat).type(torch.FloatTensor) 115 | data['labels'] = label 116 | data['masks'] = mask 117 | data['gts'] = gts 118 | data['video_ids'] = 'video%i'%(ix) 119 | return data 120 | 121 | def __len__(self): 122 | return len(self.splits[self.mode]) 123 | -------------------------------------------------------------------------------- /eval.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import argparse 4 | import torch 5 | from torch import nn 6 | from torch.autograd import Variable 7 | from torch.utils.data import DataLoader 8 | from models import EncoderRNN, DecoderRNN, S2VTAttModel, S2VTModel 9 | from dataloader import VideoDataset 10 | import misc.utils as utils 11 | from misc.cocoeval import suppress_stdout_stderr, COCOScorer 12 | from collections import OrderedDict 13 | from pandas.io.json import 
json_normalize 14 | 15 | 16 | def convert_data_to_coco_scorer_format(data_frame): 17 | gts = {} 18 | for row in zip(data_frame["caption"], data_frame["video_id"]): 19 | if row[1] in gts: 20 | gts[row[1]].append( 21 | {'image_id': row[1], 'cap_id': len(gts[row[1]]), 'caption': row[0]}) 22 | else: 23 | gts[row[1]] = [] 24 | gts[row[1]].append( 25 | {'image_id': row[1], 'cap_id': len(gts[row[1]]), 'caption': row[0]}) 26 | return gts 27 | 28 | 29 | def test(model, crit, dataset, vocab, opt): 30 | model.eval() 31 | loader = DataLoader(dataset, batch_size=opt["batch_size"], shuffle=False) 32 | scorer = COCOScorer() 33 | gt_dataframe = json_normalize( 34 | json.load(open(opt["input_json"]))['sentences']) 35 | gts = convert_data_to_coco_scorer_format(gt_dataframe) 36 | #results = [] 37 | samples = {} 38 | for index, data in enumerate(loader): 39 | print 'batch: '+str((index+1)*opt["batch_size"]) 40 | # forward the model to get loss 41 | fc_feats = Variable(data['fc_feats'], volatile=True).cuda() 42 | labels = Variable(data['labels'], volatile=True).long().cuda() 43 | masks = Variable(data['masks'], volatile=True).cuda() 44 | video_ids = data['video_ids'] 45 | 46 | # forward the model to also get generated samples for each image 47 | seq_probs, seq_preds = model( 48 | fc_feats, mode='inference', opt=opt) 49 | # print(seq_preds) 50 | 51 | sents = utils.decode_sequence(vocab, seq_preds) 52 | 53 | for k, sent in enumerate(sents): 54 | video_id = video_ids[k] 55 | samples[video_id] = [{'image_id': video_id, 'caption': sent}] 56 | # break 57 | with suppress_stdout_stderr(): 58 | valid_score = scorer.score(gts, samples, samples.keys()) 59 | #results.append(valid_score) 60 | #print(valid_score) 61 | 62 | if not os.path.exists(opt["results_path"]): 63 | os.makedirs(opt["results_path"]) 64 | result = OrderedDict() 65 | result['checkpoint'] = opt["saved_model"][opt["saved_model"].rfind('/')+1:] 66 | score_sum = 0 67 | for key, value in valid_score.items(): 68 | score_sum += float(value) 69 | result['sum'] = str(score_sum) 70 | #result = OrderedDict(result, **valid_score) 71 | result = OrderedDict(result.items() + valid_score.items()) 72 | print result 73 | if not os.path.exists(opt["results_path"]): 74 | os.makedirs(opt["results_path"]) 75 | with open(os.path.join(opt["results_path"], "scores.txt"), 'a') as scores_table: 76 | scores_table.write(json.dumps(result) + "\n") 77 | with open(os.path.join(opt["results_path"], 78 | opt["model"].split("/")[-1].split('.')[0] + ".json"), 'w') as prediction_results: 79 | json.dump({"predictions": samples, "scores": valid_score}, 80 | prediction_results) 81 | 82 | 83 | def main(opt): 84 | dataset = VideoDataset(opt, "test") 85 | opt["vocab_size"] = dataset.get_vocab_size() 86 | opt["seq_length"] = dataset.max_len 87 | if opt['beam_size'] != 1: 88 | assert opt["batch_size"] == 1 89 | if opt["model"] == 'S2VTModel': 90 | model = S2VTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"], opt['dim_vid'], 91 | n_layers=opt['num_layers'], 92 | rnn_cell=opt['rnn_type'], 93 | bidirectional=opt["bidirectional"], 94 | rnn_dropout_p=opt["rnn_dropout_p"]).cuda() 95 | elif opt["model"] == "S2VTAttModel": 96 | encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"], 97 | n_layers=opt['num_layers'], 98 | rnn_cell=opt['rnn_type'], bidirectional=opt["bidirectional"], 99 | input_dropout_p=opt["input_dropout_p"], rnn_dropout_p=opt["rnn_dropout_p"]) 100 | decoder = DecoderRNN(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"], 101 | 
n_layers=opt['num_layers'], 102 | rnn_cell=opt['rnn_type'], input_dropout_p=opt["input_dropout_p"], 103 | rnn_dropout_p=opt["rnn_dropout_p"], bidirectional=opt["bidirectional"]) 104 | model = S2VTAttModel(encoder, decoder).cuda() 105 | model = nn.DataParallel(model) 106 | # Setup the model 107 | model.load_state_dict(torch.load(opt["saved_model"])) 108 | crit = utils.LanguageModelCriterion() 109 | 110 | test(model, crit, dataset, dataset.get_vocab(), opt) 111 | 112 | 113 | if __name__ == '__main__': 114 | parser = argparse.ArgumentParser() 115 | parser.add_argument('--recover_opt', type=str, required=True, 116 | help='recover train opts from saved opt_json') 117 | parser.add_argument('--saved_model', type=str, default='', 118 | help='path to saved model to evaluate') 119 | # parser.add_argument('--rnn_type', type=str, default='gru', help='lstm or gru') 120 | parser.add_argument('--dump_json', type=int, default=1, 121 | help='Dump json with predictions into vis folder? (1=yes,0=no)') 122 | parser.add_argument('--results_path', type=str, default='results/') 123 | parser.add_argument('--dump_path', type=int, default=0, 124 | help='Write image paths along with predictions into vis json? (1=yes,0=no)') 125 | parser.add_argument('--gpu', type=str, default='0', 126 | help='gpu device number') 127 | parser.add_argument('--batch_size', type=int, default=128, 128 | help='minibatch size') 129 | parser.add_argument('--sample_max', type=int, default=1, 130 | help='0/1. whether sample max probs to get next word in inference stage') 131 | parser.add_argument('--temperature', type=float, default=1.0) 132 | parser.add_argument('--beam_size', type=int, default=1, 133 | help='used when sample_max = 1. Usually 2 or 3 works well.') 134 | 135 | args = parser.parse_args() 136 | args = vars((args)) 137 | opt = json.load(open(args["recover_opt"])) 138 | for k, v in args.items(): 139 | opt[k] = v 140 | os.environ['CUDA_VISIBLE_DEVICES'] = opt["gpu"] 141 | main(opt) 142 | -------------------------------------------------------------------------------- /eval_s2vt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ### nasnet_resnet101_40frames 4 | feat=nasnet_resnet101_40frames 5 | epoch=60 6 | python eval.py \ 7 | --rnn_type lstm \ 8 | --results_path result/$feat/s2vt \ 9 | --recover_opt checkpoint/$feat/s2vt/opt_info.json \ 10 | --saved_model checkpoint/$feat/s2vt/model_$epoch.pth \ 11 | --batch_size 100 \ 12 | --gpu 0 13 | 14 | 15 | 16 | # ### nasnet_40frames 17 | # feat=nasnet_40frames 18 | # epoch=250 19 | # python eval.py \ 20 | # --rnn_type lstm \ 21 | # --results_path result/$feat/s2vt \ 22 | # --recover_opt checkpoint/$feat/s2vt/opt_info.json \ 23 | # --saved_model checkpoint/$feat/s2vt/model_$epoch.pth \ 24 | # --batch_size 100 \ 25 | # --gpu 0 26 | 27 | 28 | ### inception_v4 40frames 29 | # feat=inception_v4_40frames 30 | # epoch=300 31 | # python eval.py \ 32 | # --rnn_type lstm \ 33 | # --results_path result/$feat/s2vt \ 34 | # --recover_opt checkpoint/$feat/s2vt/opt_info.json \ 35 | # --saved_model checkpoint/$feat/s2vt/model_$epoch.pth \ 36 | # --batch_size 100 \ 37 | # --gpu 0 38 | 39 | 40 | # feat=resnet101_40frames 41 | # epoch=150 42 | # python eval.py \ 43 | # --rnn_type lstm \ 44 | # --results_path result/$feat/s2vt \ 45 | # --recover_opt checkpoint/$feat/s2vt/opt_info.json \ 46 | # --saved_model checkpoint/$feat/s2vt/model_$epoch.pth \ 47 | # --batch_size 100 \ 48 | # --gpu 1 49 | 50 | 51 | # feat=resnet101_c3d_fc7_wo_ft 52 | # epoch=150 
53 | # python eval.py \ 54 | # --rnn_type lstm \ 55 | # --results_path result/$feat/s2vt \ 56 | # --recover_opt checkpoint/$feat/s2vt/opt_info.json \ 57 | # --saved_model checkpoint/$feat/s2vt/model_$epoch.pth \ 58 | # --batch_size 100 \ 59 | # --gpu 1 60 | 61 | 62 | # feat=resnet101_80frames 63 | # feat=resnet101 64 | # epoch=150 65 | # python eval.py \ 66 | # --rnn_type lstm \ 67 | # --results_path result/$feat/s2vt \ 68 | # --recover_opt checkpoint/$feat/s2vt/opt_info.json \ 69 | # --saved_model checkpoint/$feat/s2vt/model_$epoch.pth \ 70 | # --batch_size 100 \ 71 | # --gpu 1 72 | -------------------------------------------------------------------------------- /finetune_cnn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.data import DataLoader 3 | from torch.autograd import Variable 4 | from torch import nn 5 | import torch.optim as optim 6 | import os 7 | import json 8 | import argparse 9 | from dataloader import CocoDataset 10 | import pretrainedmodels 11 | from pretrainedmodels import utils 12 | 13 | 14 | C, H, W = 3, 224, 224 15 | 16 | 17 | class MILModel(nn.Module): 18 | def __init__(self, cnn_model, dim_hidden, num_classes): 19 | # python 3 20 | # super().__init__() 21 | super(MILModel, self).__init__() 22 | self.cnn_model = cnn_model 23 | self.num_classes = num_classes 24 | self.dim_hidden = dim_hidden 25 | self.linear = nn.Linear(dim_hidden, num_classes) 26 | 27 | def forward(self, x): 28 | feature_map = self.cnn_model.features(x) 29 | feature_map = feature_map.permute(0, 2, 3, 1) 30 | b, x, y, h = feature_map.size() 31 | feature_map = feature_map.contiguous().view(b, x * y, h) 32 | logits = self.linear(feature_map) 33 | logits = 1 - logits 34 | probs = Variable(torch.ones(logits.shape[0], logits.shape[2])).cuda() 35 | for i in range(x * y): 36 | probs = probs * logits[:, i, :] 37 | probs = 1 - probs 38 | return probs 39 | 40 | 41 | def train(dataloader, model, crit, optimizer, lr_scheduler, load_image_fn, params): 42 | model.train() 43 | model = nn.DataParallel(model) 44 | images_path = json.load(open(params.coco_path)) 45 | 46 | for epoch in range(params.epochs): 47 | lr_scheduler.step() 48 | iteration = 0 49 | for data in dataloader: 50 | iteration += 1 51 | image_ids, image_labels = data['image_ids'], data['labels'] 52 | images = torch.zeros(image_labels.shape[0], C, H, W) 53 | for i, image_id in enumerate(image_ids): 54 | image_path = os.path.join( 55 | params.coco_dir, images_path[image_id]) 56 | images[i] = load_image_fn(image_path) 57 | logits = model(Variable(images).cuda()) 58 | loss = crit(logits, Variable(image_labels).cuda()) 59 | 60 | optimizer.zero_grad() 61 | loss.backward() 62 | optimizer.step() 63 | train_loss = loss.data[0] 64 | torch.cuda.synchronize() 65 | 66 | print("iter %d (epoch %d), train_loss = %.6f" % 67 | (iteration, epoch, train_loss)) 68 | 69 | if epoch % params.save_checkpoint_every == 0: 70 | checkpoint_path = os.path.join( 71 | params.checkpoint_path, 'cnn_model_%d.pth' % (epoch)) 72 | torch.save(model.state_dict(), checkpoint_path) 73 | print("model saved to %s" % (checkpoint_path)) 74 | 75 | 76 | def main(args): 77 | global C, H, W 78 | coco_labels = json.load(open(args.coco_labels)) 79 | num_classes = coco_labels['num_classes'] 80 | if args.model == 'inception_v3': 81 | C, H, W = 3, 299, 299 82 | model = pretrainedmodels.inceptionv3(pretrained='imagenet') 83 | 84 | elif args.model == 'resnet152': 85 | C, H, W = 3, 224, 224 86 | model = 
pretrainedmodels.resnet152(pretrained='imagenet') 87 | 88 | elif args.model == 'inception_v4': 89 | C, H, W = 3, 299, 299 90 | model = pretrainedmodels.inceptionv4( 91 | num_classes=1000, pretrained='imagenet') 92 | 93 | else: 94 | print("doesn't support %s" % (args['model'])) 95 | 96 | load_image_fn = utils.LoadTransformImage(model) 97 | dim_feats = model.last_linear.in_features 98 | model = MILModel(model, dim_feats, num_classes) 99 | model = model.cuda() 100 | dataset = CocoDataset(coco_labels) 101 | dataloader = DataLoader( 102 | dataset, batch_size=args.batch_size, shuffle=True) 103 | optimizer = optim.Adam( 104 | model.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay) 105 | exp_lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=args.learning_rate_decay_every, 106 | gamma=args.learning_rate_decay_rate) 107 | 108 | crit = nn.MultiLabelSoftMarginLoss() 109 | if not os.path.isdir(args.checkpoint_path): 110 | os.mkdir(args.checkpoint_path) 111 | train(dataloader, model, crit, optimizer, 112 | exp_lr_scheduler, load_image_fn, args) 113 | 114 | 115 | if __name__ == '__main__': 116 | parser = argparse.ArgumentParser() 117 | parser.add_argument('--coco_path', type=str, 118 | default='data/coco_path.json', help='') 119 | parser.add_argument('--coco_labels', type=str, 120 | default='data/coco_labels.json', help='path to processed coco caption json') 121 | parser.add_argument('--coco_dir', type=str, 122 | default='data/mscoco/train2014') 123 | parser.add_argument('--epochs', type=int, default=200, 124 | help='number of epochs') 125 | parser.add_argument('--checkpoint_path', type=str, 126 | help='path to trained model') 127 | parser.add_argument("--gpu", dest='gpu', type=str, default='0', 128 | help='Set CUDA_VISIBLE_DEVICES environment variable, optional') 129 | parser.add_argument("--model", dest="model", type=str, default='resnet152', 130 | help='the CNN model you want to use to extract_feats') 131 | 132 | parser.add_argument('--save_checkpoint_every', type=int, default=20, 133 | help='how often to save a model checkpoint (in epoch)?') 134 | parser.add_argument('--batch_size', type=int, default=512) 135 | parser.add_argument('--learning_rate', type=float, default=1e-5, 136 | help='learning rate') 137 | 138 | parser.add_argument('--learning_rate_decay_every', type=int, default=2, 139 | help='every how many epoch thereafter to drop LR?') 140 | parser.add_argument('--learning_rate_decay_rate', type=float, default=0.8) 141 | parser.add_argument('--optim_alpha', type=float, default=0.9, 142 | help='alpha for adam') 143 | parser.add_argument('--optim_beta', type=float, default=0.999, 144 | help='beta used for adam') 145 | parser.add_argument('--optim_epsilon', type=float, default=1e-8, 146 | help='epsilon that goes into denominator for smoothing') 147 | parser.add_argument('--weight_decay', type=float, default=5e-4, 148 | help='weight_decay') 149 | args = parser.parse_args() 150 | os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu 151 | 152 | main(args) 153 | -------------------------------------------------------------------------------- /misc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sundrops/video-caption.pytorch/66430862696678f7233a1fc67af22f0525b22e52/misc/__init__.py -------------------------------------------------------------------------------- /misc/cocoeval.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Wrapper for evaluation on 
CIDEr, ROUGE_L, METEOR and Bleu_N 3 | using coco-caption repo https://github.com/tylin/coco-caption 4 | 5 | class COCOScorer is taken from https://github.com/yaoli/arctic-capgen-vid 6 | ''' 7 | 8 | import json 9 | import os 10 | import sys 11 | sys.path.append('coco-caption') 12 | 13 | from pycocoevalcap.bleu.bleu import Bleu 14 | from pycocoevalcap.rouge.rouge import Rouge 15 | from pycocoevalcap.cider.cider import Cider 16 | from pycocoevalcap.meteor.meteor import Meteor 17 | from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer 18 | from collections import OrderedDict 19 | # Define a context manager to suppress stdout and stderr. 20 | 21 | 22 | class suppress_stdout_stderr: 23 | ''' 24 | A context manager for doing a "deep suppression" of stdout and stderr in 25 | Python, i.e. will suppress all print, even if the print originates in a 26 | compiled C/Fortran sub-function. 27 | This will not suppress raised exceptions, since exceptions are printed 28 | to stderr just before a script exits, and after the context manager has 29 | exited (at least, I think that is why it lets exceptions through). 30 | 31 | ''' 32 | 33 | def __init__(self): 34 | # Open a pair of null files 35 | self.null_fds = [os.open(os.devnull, os.O_RDWR) for x in range(2)] 36 | # Save the actual stdout (1) and stderr (2) file descriptors. 37 | self.save_fds = (os.dup(1), os.dup(2)) 38 | 39 | def __enter__(self): 40 | # Assign the null pointers to stdout and stderr. 41 | os.dup2(self.null_fds[0], 1) 42 | os.dup2(self.null_fds[1], 2) 43 | 44 | def __exit__(self, *_): 45 | # Re-assign the real stdout/stderr back to (1) and (2) 46 | os.dup2(self.save_fds[0], 1) 47 | os.dup2(self.save_fds[1], 2) 48 | # Close the null files 49 | os.close(self.null_fds[0]) 50 | os.close(self.null_fds[1]) 51 | 52 | 53 | class COCOScorer(object): 54 | def __init__(self): 55 | print('init COCO-EVAL scorer') 56 | 57 | def score(self, GT, RES, IDs): 58 | # edited by rgh 59 | #self.eval = {} 60 | self.eval = OrderedDict() 61 | self.imgToEval = {} 62 | gts = {} 63 | res = {} 64 | for ID in IDs: 65 | # print ID 66 | gts[ID] = GT[ID] 67 | res[ID] = RES[ID] 68 | print('tokenization...') 69 | tokenizer = PTBTokenizer() 70 | gts = tokenizer.tokenize(gts) 71 | res = tokenizer.tokenize(res) 72 | 73 | # ================================================= 74 | # Set up scorers 75 | # ================================================= 76 | print('setting up scorers...') 77 | # edited by rgh 78 | # scorers = [ 79 | # (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]), 80 | # (Meteor(),"METEOR"), 81 | # (Rouge(), "ROUGE_L"), 82 | # (Cider(), "CIDEr"), 83 | # #(Spice(), "SPICE") 84 | # ] 85 | scorers = [ 86 | (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]), 87 | (Meteor(), "METEOR"), 88 | (Cider(), "CIDEr"), 89 | (Rouge(), "ROUGE_L"), 90 | # (Spice(), "SPICE") 91 | ] 92 | 93 | # ================================================= 94 | # Compute scores 95 | # ================================================= 96 | eval = {} 97 | for scorer, method in scorers: 98 | print('computing %s score...' 
% (scorer.method())) 99 | score, scores = scorer.compute_score(gts, res) 100 | if type(method) == list: 101 | # added by rgh 102 | # for sc, scs, m in zip(score, scores, method): 103 | # self.setEval(sc, m) 104 | # self.setImgToEvalImgs(scs, IDs, m) 105 | # print("%s: %0.3f" % (m, sc)) 106 | self.setEval("%.4f" % score[-1], method[-1]) 107 | self.setImgToEvalImgs(scores[-1], IDs, method[-1]) 108 | print("%s: %0.4f" % (method[-1], score[-1])) 109 | else: 110 | self.setEval("%.4f" % score, method) 111 | self.setImgToEvalImgs(scores, IDs, method) 112 | print("%s: %0.4f" % (method, score)) 113 | 114 | # for metric, score in self.eval.items(): 115 | # print '%s: %.3f'%(metric, score) 116 | return self.eval 117 | 118 | def setEval(self, score, method): 119 | self.eval[method] = score 120 | 121 | def setImgToEvalImgs(self, scores, imgIds, method): 122 | for imgId, score in zip(imgIds, scores): 123 | if imgId not in self.imgToEval: 124 | self.imgToEval[imgId] = {} 125 | self.imgToEval[imgId]["image_id"] = imgId 126 | self.imgToEval[imgId][method] = score 127 | 128 | 129 | def score(ref, sample): 130 | # ref and sample are both dict 131 | scorers = [ 132 | (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]), 133 | (Rouge(), "ROUGE_L"), 134 | (Cider(), "CIDEr") 135 | ] 136 | final_scores = {} 137 | for scorer, method in scorers: 138 | print('computing %s score with COCO-EVAL...' % (scorer.method())) 139 | score, scores = scorer.compute_score(ref, sample) 140 | if type(score) == list: 141 | for m, s in zip(method, score): 142 | final_scores[m] = s 143 | else: 144 | final_scores[method] = score 145 | return final_scores 146 | 147 | 148 | def test_cocoscorer(): 149 | '''gts = { 150 | 184321:[ 151 | {u'image_id': 184321, u'id': 352188, u'caption': u'A train traveling down-tracks next to lights.'}, 152 | {u'image_id': 184321, u'id': 356043, u'caption': u"A blue and silver train next to train's station and trees."}, 153 | {u'image_id': 184321, u'id': 356382, u'caption': u'A blue train is next to a sidewalk on the rails.'}, 154 | {u'image_id': 184321, u'id': 361110, u'caption': u'A passenger train pulls into a train station.'}, 155 | {u'image_id': 184321, u'id': 362544, u'caption': u'A train coming down the tracks arriving at a station.'}], 156 | 81922: [ 157 | {u'image_id': 81922, u'id': 86779, u'caption': u'A large jetliner flying over a traffic filled street.'}, 158 | {u'image_id': 81922, u'id': 90172, u'caption': u'An airplane flies low in the sky over a city street. 
'}, 159 | {u'image_id': 81922, u'id': 91615, u'caption': u'An airplane flies over a street with many cars.'}, 160 | {u'image_id': 81922, u'id': 92689, u'caption': u'An airplane comes in to land over a road full of cars'}, 161 | {u'image_id': 81922, u'id': 823814, u'caption': u'The plane is flying over top of the cars'}] 162 | } 163 | 164 | samples = { 165 | 184321: [{u'image_id': 184321, 'id': 111, u'caption': u'train traveling down a track in front of a road'}], 166 | 81922: [{u'image_id': 81922, 'id': 219, u'caption': u'plane is flying through the sky'}], 167 | } 168 | ''' 169 | gts = { 170 | '184321': [ 171 | {u'image_id': '184321', u'cap_id': 0, u'caption': u'A train traveling down tracks next to lights.', 172 | 'tokenized': 'a train traveling down tracks next to lights'}, 173 | {u'image_id': '184321', u'cap_id': 1, u'caption': u'A train coming down the tracks arriving at a station.', 174 | 'tokenized': 'a train coming down the tracks arriving at a station'}], 175 | '81922': [ 176 | {u'image_id': '81922', u'cap_id': 0, u'caption': u'A large jetliner flying over a traffic filled street.', 177 | 'tokenized': 'a large jetliner flying over a traffic filled street'}, 178 | {u'image_id': '81922', u'cap_id': 1, u'caption': u'The plane is flying over top of the cars', 179 | 'tokenized': 'the plan is flying over top of the cars'}, ] 180 | } 181 | 182 | samples = { 183 | '184321': [{u'image_id': '184321', u'caption': u'train traveling down a track in front of a road'}], 184 | '81922': [{u'image_id': '81922', u'caption': u'plane is flying through the sky'}], 185 | } 186 | IDs = ['184321', '81922'] 187 | scorer = COCOScorer() 188 | scorer.score(gts, samples, IDs) 189 | 190 | 191 | if __name__ == '__main__': 192 | test_cocoscorer() 193 | -------------------------------------------------------------------------------- /misc/rewards.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import OrderedDict 3 | import torch 4 | import sys 5 | sys.path.append("coco-caption") 6 | from pyciderevalcap.ciderD.ciderD import CiderD 7 | 8 | CiderD_scorer = None 9 | # CiderD_scorer = CiderD(df='corpus') 10 | 11 | 12 | def init_cider_scorer(cached_tokens): 13 | global CiderD_scorer 14 | CiderD_scorer = CiderD_scorer or CiderD(df=cached_tokens) 15 | 16 | 17 | def array_to_str(arr): 18 | out = '' 19 | for i in range(len(arr)): 20 | out += str(arr[i]) + ' ' 21 | if arr[i] == 0: 22 | break 23 | return out.strip() 24 | 25 | 26 | def get_self_critical_reward(model, fc_feats, data, gen_result): 27 | batch_size = gen_result.size(0) 28 | 29 | # get greedy decoding baseline 30 | _, greedy_res = model(fc_feats, mode='inference') 31 | 32 | res = OrderedDict() 33 | 34 | gen_result = gen_result.cpu().data.numpy() 35 | greedy_res = greedy_res.cpu().data.numpy() 36 | for i in range(batch_size): 37 | res[i] = [array_to_str(gen_result[i])] 38 | for i in range(batch_size): 39 | res[batch_size + i] = [array_to_str(greedy_res[i])] 40 | 41 | gts = OrderedDict() 42 | for i in range(data['gts'].size(0)): 43 | gts[i] = [array_to_str(data['gts'][i][j]) 44 | for j in range(data['gts'].size(1))] 45 | 46 | res = [{'image_id': i, 'caption': res[i]} for i in range(2 * batch_size)] 47 | gts = {i: gts[i % batch_size] for i in range(2 * batch_size)} 48 | _, scores = CiderD_scorer.compute_score(gts, res) 49 | print('Cider scores:', _) 50 | 51 | scores = scores[:batch_size] - scores[batch_size:] 52 | 53 | rewards = np.repeat(scores[:, np.newaxis], gen_result.shape[1], 1) 54 
| 55 | return rewards 56 | -------------------------------------------------------------------------------- /misc/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.autograd import Variable 4 | 5 | 6 | # Input: seq, N*D numpy array, with element 0 .. vocab_size. 0 is END token. 7 | def decode_sequence(ix_to_word, seq): 8 | N, D = seq.size() 9 | out = [] 10 | for i in range(N): 11 | txt = '' 12 | for j in range(D): 13 | ix = seq[i, j].data.cpu().numpy()[0] 14 | if ix > 0: 15 | if j >= 1: 16 | txt = txt + ' ' 17 | txt = txt + ix_to_word[str(ix)] 18 | else: 19 | break 20 | out.append(txt) 21 | return out 22 | 23 | 24 | def to_contiguous(tensor): 25 | if tensor.is_contiguous(): 26 | return tensor 27 | else: 28 | return tensor.contiguous() 29 | 30 | 31 | class RewardCriterion(nn.Module): 32 | 33 | def __init__(self): 34 | super(RewardCriterion, self).__init__() 35 | 36 | def forward(self, input, seq, reward): 37 | input = to_contiguous(input).view(-1) 38 | reward = to_contiguous(reward).view(-1) 39 | mask = (seq > 0).float() 40 | mask = to_contiguous(torch.cat([Variable(mask.data.new(mask.size(0), 1).fill_(1)).cuda(), 41 | mask[:, :-1]], 1)).view(-1) 42 | output = - input * reward * mask 43 | output = torch.sum(output) / torch.sum(mask) 44 | 45 | return output 46 | 47 | 48 | class LanguageModelCriterion(nn.Module): 49 | 50 | def __init__(self): 51 | # python 3 52 | # super().__init__() 53 | super(LanguageModelCriterion, self).__init__() 54 | self.loss_fn = nn.NLLLoss(reduce=False) 55 | 56 | def forward(self, logits, target, mask): 57 | """ 58 | logits: shape of (N, seq_len, vocab_size) 59 | target: shape of (N, seq_len) 60 | mask: shape of (N, seq_len) 61 | """ 62 | # truncate to the same size 63 | batch_size = logits.shape[0] 64 | target = target[:, :logits.shape[1]] 65 | mask = mask[:, :logits.shape[1]] 66 | logits = to_contiguous(logits).view(-1, logits.shape[2]) 67 | target = to_contiguous(target).view(-1) 68 | mask = to_contiguous(mask).view(-1) 69 | loss = self.loss_fn(logits, target) 70 | output = torch.sum(loss * mask) / batch_size 71 | return output 72 | 73 | 74 | def clip_gradient(optimizer, grad_clip): 75 | for group in optimizer.param_groups: 76 | for param in group['params']: 77 | param.grad.data.clamp_(-grad_clip, grad_clip) 78 | -------------------------------------------------------------------------------- /models/Attention.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class Attention(nn.Module): 7 | """ 8 | Applies an attention mechanism on the output features from the decoder. 
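The branch of `forward()` left uncommented below is plain dot-product attention over the encoder outputs. A shape-level sketch with toy tensors (batch=2, seq_len=4, dim=8; random data, illustration only):

```python
import torch
import torch.nn.functional as F

batch, seq_len, dim = 2, 4, 8
encoder_outputs = torch.randn(batch, seq_len, dim)
hidden_state = torch.randn(batch, dim)

# (batch, seq_len, dim) x (batch, dim, 1) -> (batch, seq_len): one logit per frame
attn = torch.bmm(encoder_outputs, hidden_state.unsqueeze(2)).squeeze(2)
alpha = F.softmax(attn, dim=1).unsqueeze(1)               # (batch, 1, seq_len)
context = torch.bmm(alpha, encoder_outputs).squeeze(1)    # (batch, dim)

print(context.shape)   # torch.Size([2, 8])
```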
9 | """ 10 | 11 | def __init__(self, dim): 12 | # python 3 13 | # super().__init__() 14 | super(Attention, self).__init__() 15 | #self.dim = dim 16 | #self.linear1 = nn.Linear(dim * 2, dim) 17 | #self.linear2 = nn.Linear(dim, 1, bias=False) 18 | #self._init_hidden() 19 | #self.dk = dim/2 20 | # self.contextW = nn.Linear(dim, self.dk) 21 | # nn.init.xavier_normal(self.contextW.weight) 22 | # self.hidderW = nn.Linear(dim, self.dk) 23 | # nn.init.xavier_normal(self.hidderW.weight) 24 | def _init_hidden(self): 25 | nn.init.xavier_normal(self.linear1.weight) 26 | nn.init.xavier_normal(self.linear2.weight) 27 | 28 | def forward(self, hidden_state, encoder_outputs): 29 | """ 30 | Arguments: 31 | hidden_state {Variable} -- batch_size x dim 32 | encoder_outputs {Variable} -- batch_size x seq_len x dim 33 | 34 | Returns: 35 | Variable -- context vector of size batch_size x dim 36 | """ 37 | ############### original ################### 38 | ''' 39 | batch_size, seq_len, _ = encoder_outputs.size() 40 | hidden_state = hidden_state.unsqueeze(1).repeat(1, seq_len, 1) 41 | (batch, seq_len, dim*2) 42 | inputs = torch.cat((encoder_outputs, hidden_state), 43 | 2).view(-1, self.dim * 2) 44 | (batch, seq_len, dim*2)->(batch, seq_len, dim)->(batch, seq_len, 1) 45 | o = self.linear2(F.tanh(self.linear1(inputs))) 46 | e = o.view(batch_size, seq_len) 47 | alpha = F.softmax(e, dim=1) 48 | context = torch.bmm(alpha.unsqueeze(1), encoder_outputs).squeeze(1) 49 | return context 50 | ''' 51 | ################# seq2seq ####################### 52 | ''' 53 | batch_size, seq_len, hidden_size = encoder_outputs.size() 54 | # batch, seq_len, dim 55 | hidden_state = hidden_state.unsqueeze(1).repeat(1, seq_len, 1) 56 | # (batch, seq_len, dim) * (batch, dim, seq_len) -> (batch, seq_len, seq_len) 57 | attn = torch.bmm(hidden_state, encoder_outputs.transpose(1, 2)) 58 | attn = F.softmax(attn.view(-1, seq_len)).view(batch_size, -1, seq_len) 59 | # (batch, seq_len, seq_len) * (batch, seq_len, dim) -> (batch, seq_len, dim) 60 | mix = torch.bmm(attn, encoder_outputs) 61 | # concat -> (batch, seq_len, 2*dim) 62 | combined = torch.cat((mix, hidden_state), dim=2) 63 | # output -> (batch, out_len, dim) 64 | output = F.tanh(self.linear_out(combined.view(-1, 2 * hidden_size))).view(batch_size, -1, hidden_size) 65 | return output 66 | ''' 67 | ######## after reducing dim, calculate the similarity of between encoder_outputs and hidden_state ######### 68 | ''' 69 | batch_size, seq_len, hidden_size = encoder_outputs.size() 70 | # (batch, seq_len, self.dk) 71 | encoder_outputs_dk = self.contextW(encoder_outputs) 72 | # (batch, self.dk) 73 | hidden_state_dk = self.hidderW(hidden_state) 74 | # (batch, seq_len, self.dk) * (batch, self.dk, 1) -> (batch, seq_len, 1)-> (batch, seq_len) 75 | attn = torch.bmm(encoder_outputs_dk, hidden_state_dk.unsqueeze(2)).squeeze(2) 76 | # (batch, seq_len)-> (batch, 1, seq_len) 77 | attn = F.softmax(attn, dim=1).unsqueeze(1) 78 | # (batch, 1, seq_len) * (batch, seq_len, dim) -> (batch, 1, dim) 79 | context = torch.bmm(attn, encoder_outputs).squeeze(1) 80 | return context 81 | ''' 82 | ######### directly calculate the similarity of between encoder_outputs and hidden_state ############ 83 | # batch_size, seq_len, hidden_size = encoder_outputs.size() 84 | # (batch, seq_len, dim) * (batch, dim, 1) -> (batch, seq_len, 1)-> (batch, seq_len) 85 | attn = torch.bmm(encoder_outputs, hidden_state.unsqueeze(2)).squeeze(2) 86 | # (batch, seq_len)-> (batch, 1, seq_len) 87 | attn = F.softmax(attn, dim=1).unsqueeze(1) 88 | # (batch, 
1, seq_len) * (batch, seq_len, dim) -> (batch, 1, dim) 89 | context = torch.bmm(attn, encoder_outputs).squeeze(1) 90 | return context -------------------------------------------------------------------------------- /models/EncoderRNN.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | 4 | class EncoderRNN(nn.Module): 5 | def __init__(self, dim_vid, dim_hidden, input_dropout_p=0.2, rnn_dropout_p=0.5, 6 | n_layers=1, bidirectional=False, rnn_cell='gru'): 7 | """ 8 | 9 | Args: 10 | dim_hidden (int): dim of the hidden state of the rnn 11 | input_dropout_p (float): dropout probability for the input sequence 12 | rnn_dropout_p (float): dropout probability for the rnn output sequence 13 | n_layers (int): number of rnn layers 14 | rnn_cell (str): type of RNN cell ('lstm'/'gru') 15 | """ 16 | # python 3 17 | # super().__init__() 18 | super(EncoderRNN, self).__init__() 19 | self.dim_vid = dim_vid 20 | self.dim_hidden = dim_hidden 21 | self.input_dropout_p = input_dropout_p 22 | self.rnn_dropout_p = rnn_dropout_p 23 | self.n_layers = n_layers 24 | self.bidirectional = bidirectional 25 | self.rnn_cell = rnn_cell 26 | 27 | self.vid2hid = nn.Linear(dim_vid, dim_hidden) 28 | self.input_dropout = nn.Dropout(input_dropout_p) 29 | 30 | if rnn_cell.lower() == 'lstm': 31 | self.rnn_cell = nn.LSTM 32 | elif rnn_cell.lower() == 'gru': 33 | self.rnn_cell = nn.GRU 34 | 35 | self.rnn = self.rnn_cell(dim_hidden, dim_hidden, n_layers, batch_first=True, 36 | bidirectional=bidirectional, dropout=self.rnn_dropout_p) 37 | 38 | self._init_hidden() 39 | 40 | def _init_hidden(self): 41 | nn.init.xavier_normal(self.vid2hid.weight) 42 | 43 | def forward(self, vid_feats): 44 | """ 45 | Applies a multi-layer RNN to an input sequence. 46 | Args: 47 | vid_feats (batch, seq_len, dim_vid): tensor containing the features of the input video frames. 
48 | The frame features are projected from dim_vid to dim_hidden and passed 49 | through input dropout before being fed to the rnn. 50 | Returns: output, hidden 51 | - **output** (batch, seq_len, hidden_size): variable containing the encoded features of the input sequence 52 | - **hidden** (num_layers * num_directions, batch, hidden_size): variable containing the features in the hidden state h 53 | """ 54 | batch_size, seq_len, dim_vid = vid_feats.size() 55 | vid_feats = self.vid2hid(vid_feats.view(-1, dim_vid)) 56 | vid_feats = self.input_dropout(vid_feats) 57 | vid_feats = vid_feats.view(batch_size, seq_len, self.dim_hidden) 58 | self.rnn.flatten_parameters() 59 | output, hidden = self.rnn(vid_feats) 60 | return output, hidden 61 | -------------------------------------------------------------------------------- /models/S2VTAttModel.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | 4 | class S2VTAttModel(nn.Module): 5 | def __init__(self, encoder, decoder): 6 | """ 7 | 8 | Args: 9 | encoder (nn.Module): Encoder rnn 10 | decoder (nn.Module): Decoder rnn 11 | """ 12 | # python 3 13 | # super().__init__() 14 | super(S2VTAttModel, self).__init__() 15 | self.encoder = encoder 16 | self.decoder = decoder 17 | 18 | def forward(self, vid_feats, target_variable=None, 19 | mode='train', opt={}): 20 | """ 21 | 22 | Args: 23 | vid_feats (Variable): video feats of shape [batch_size, seq_len, dim_vid] 24 | target_variable (None, optional): ground truth labels 25 | 26 | Returns: 27 | seq_prob: Variable of shape [batch_size, max_len-1, vocab_size] 28 | seq_preds: [] or Variable of shape [batch_size, max_len-1] 29 | """ 30 | encoder_outputs, encoder_hidden = self.encoder(vid_feats) 31 | seq_prob, seq_preds = self.decoder(encoder_outputs, encoder_hidden, target_variable, mode, opt) 32 | return seq_prob, seq_preds 33 | -------------------------------------------------------------------------------- /models/S2VTModel.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torch.nn.functional as F 4 | import random 5 | from torch.autograd import Variable 6 | 7 | 8 | class S2VTModel(nn.Module): 9 | def __init__(self, vocab_size, max_len, dim_hidden, dim_word, dim_vid=2048, sos_id=1, eos_id=0, 10 | n_layers=1, bidirectional=False, rnn_cell='gru', rnn_dropout_p=0.2): 11 | # python 3 12 | # super().__init__() 13 | super(S2VTModel, self).__init__() 14 | if rnn_cell.lower() == 'lstm': 15 | self.rnn_cell = nn.LSTM 16 | elif rnn_cell.lower() == 'gru': 17 | self.rnn_cell = nn.GRU 18 | # hidden_size * num_directions 19 | # num_directions = 2 if bidirectional else 1 20 | rnn_output_size = dim_hidden * 2 if bidirectional else dim_hidden 21 | 22 | self.rnn1 = self.rnn_cell(dim_vid, dim_hidden, n_layers, bidirectional=bidirectional, 23 | batch_first=True, dropout=rnn_dropout_p) 24 | self.rnn2 = self.rnn_cell(rnn_output_size + dim_word, dim_hidden, n_layers, bidirectional=bidirectional, 25 | batch_first=True, dropout=rnn_dropout_p) 26 | self.rnn_cell_type = rnn_cell.lower() 27 | self.n_layers = n_layers 28 | self.dim_vid = dim_vid 29 | self.dim_output = vocab_size 30 | self.dim_hidden = dim_hidden 31 | self.dim_word = dim_word 32 | self.max_length = max_len 33 | self.sos_id = sos_id 34 | self.eos_id = eos_id 35 | self.embedding = nn.Embedding(self.dim_output, self.dim_word) 36 | 37 | self.out = nn.Linear(rnn_output_size, self.dim_output) 38 | 39 | def forward(self, vid_feats, 
target_variable=None, 40 | mode='train', opt={}): 41 | 42 | batch_size, n_frames, _ = vid_feats.shape 43 | padding_words = Variable(vid_feats.data.new(batch_size, n_frames, self.dim_word)).zero_() 44 | state1 = None 45 | state2 = None 46 | self.rnn1.flatten_parameters() 47 | self.rnn2.flatten_parameters() 48 | output1, state1 = self.rnn1(vid_feats, state1) 49 | input2 = torch.cat((output1, padding_words), dim=2) 50 | output2, state2 = self.rnn2(input2, state2) 51 | 52 | padding_frames = Variable(vid_feats.data.new(batch_size, 1, self.dim_vid)).zero_() 53 | seq_probs = [] 54 | seq_preds = [] 55 | if mode == 'train': 56 | for i in range(self.max_length - 1): 57 | # doesn't input to the network 58 | current_words = self.embedding(target_variable[:, i]) 59 | self.rnn1.flatten_parameters() 60 | self.rnn2.flatten_parameters() 61 | output1, state1 = self.rnn1(padding_frames, state1) 62 | input2 = torch.cat( 63 | (output1, current_words.unsqueeze(1)), dim=2) 64 | output2, state2 = self.rnn2(input2, state2) 65 | logits = self.out(output2.squeeze(1)) 66 | logits = F.log_softmax(logits, dim=1) 67 | seq_probs.append(logits.unsqueeze(1)) 68 | seq_probs = torch.cat(seq_probs, 1) 69 | else: 70 | beam_size = opt.get('beam_size', 1) 71 | if beam_size == 1: 72 | current_words = self.embedding(Variable(torch.LongTensor([self.sos_id] * batch_size)).cuda()) 73 | for i in range(self.max_length - 1): 74 | self.rnn1.flatten_parameters() 75 | self.rnn2.flatten_parameters() 76 | output1, state1 = self.rnn1(padding_frames, state1) 77 | input2 = torch.cat( 78 | (output1, current_words.unsqueeze(1)), dim=2) 79 | output2, state2 = self.rnn2(input2, state2) 80 | logits = self.out(output2.squeeze(1)) 81 | logits = F.log_softmax(logits, dim=1) 82 | seq_probs.append(logits.unsqueeze(1)) 83 | _, preds = torch.max(logits, 1) 84 | current_words = self.embedding(preds) 85 | seq_preds.append(preds.unsqueeze(1)) 86 | seq_probs = torch.cat(seq_probs, 1) 87 | seq_preds = torch.cat(seq_preds, 1) 88 | else: 89 | # batch*dim_word 90 | start = [Variable(torch.LongTensor([self.sos_id] * batch_size)).cuda()] 91 | current_words = [[start, 0.0, state2]] 92 | for i in range(self.max_length - 1): 93 | self.rnn1.flatten_parameters() 94 | self.rnn2.flatten_parameters() 95 | # output1: batch*1*dim_hidden 96 | output1, state1 = self.rnn1(padding_frames, state1) 97 | temp = [] 98 | for s in current_words: 99 | # s: [[batch*word_embed1, batch*word_embed2...], prob, state2] 100 | input2 = torch.cat( 101 | (output1, self.embedding(s[0][-1]).unsqueeze(1)), dim=2) 102 | output2, s[2] = self.rnn2(input2, s[2]) 103 | logits = self.out(output2.squeeze(1)) 104 | # batch*voc_size 105 | logits = F.log_softmax(logits, dim=1) 106 | # batch*beam 107 | topk_prob, topk_word = torch.topk(logits, k=beam_size, dim=1) 108 | # batch*beam -> beam*batch 109 | topk_prob = topk_prob.permute(1, 0) 110 | topk_word = topk_word.permute(1, 0) 111 | # Getting the top (n) predictions and creating a 112 | # new list so as to put them via the model again 113 | for prob, word in zip(topk_prob, topk_word): 114 | next_cap = s[0][:] 115 | next_cap.append(word) 116 | temp.append([next_cap, s[1]+prob, 117 | (s[2][0].clone(), s[2][1].clone()) if isinstance(s[2], tuple) 118 | else s[2].clone()]) 119 | current_words = temp 120 | # sort by prob 121 | current_words = sorted(current_words, reverse=False, cmp=lambda x,y:cmp(int(x[1]),int(y[1]))) 122 | # get the top words 123 | current_words = current_words[-beam_size:] 124 | seq_preds = torch.cat(current_words[-1][0][1:], 0).unsqueeze(0) 125 
| return seq_probs, seq_preds 126 | 127 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- 1 | from .EncoderRNN import EncoderRNN 2 | from .DecoderRNN import DecoderRNN 3 | from .S2VTAttModel import S2VTAttModel 4 | from .S2VTModel import S2VTModel 5 | -------------------------------------------------------------------------------- /opts.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | 4 | def parse_opt(): 5 | parser = argparse.ArgumentParser() 6 | # Data input settings 7 | parser.add_argument( 8 | '--input_json', 9 | type=str, 10 | default='data/videodatainfo_2017.json', 11 | help='path to the json file containing video info') 12 | parser.add_argument( 13 | '--info_json', 14 | type=str, 15 | default='data/info.json', 16 | help='path to the json file containing additional info and vocab') 17 | parser.add_argument( 18 | '--caption_json', 19 | type=str, 20 | default='data/caption.json', 21 | help='path to the processed video caption json') 22 | 23 | parser.add_argument( 24 | '--feats_dir', 25 | nargs='+', 26 | type=str, 27 | default=['data/feats/resnet152/'], 28 | help='path to the directory containing the preprocessed fc feats') 29 | 30 | parser.add_argument('--c3d_feats_dir', type=str, default='data/c3d_feats') 31 | parser.add_argument( 32 | '--with_c3d', type=int, default=0, help='whether to use c3d features') 33 | 34 | parser.add_argument( 35 | '--cached_tokens', 36 | type=str, 37 | default='msr-all-idxs', 38 | help='Cached token file for calculating cider score \ 39 | during self critical training.') 40 | 41 | # Model settings 42 | parser.add_argument( 43 | "--model", type=str, default='S2VTModel', help="which model to use") 44 | 45 | parser.add_argument( 46 | "--max_len", 47 | type=int, 48 | default=28, 49 | help='max length of captions (containing <sos>, <eos>)') 50 | parser.add_argument( 51 | "--bidirectional", 52 | type=int, 53 | default=0, 54 | help="0 for disable, 1 for enable. encoder/decoder bidirectional.") 55 | parser.add_argument( 56 | '--n_frame_steps', 57 | type=int, 58 | default=80, 59 | help='how many frames to sample per video') 60 | parser.add_argument( 61 | '--dim_hidden', 62 | type=int, 63 | default=512, 64 | help='size of the rnn hidden layer') 65 | parser.add_argument( 66 | '--num_layers', type=int, default=1, help='number of layers in the RNN') 67 | parser.add_argument( 68 | '--input_dropout_p', 69 | type=float, 70 | default=0.2, 71 | help='strength of dropout in the Language Model RNN') 72 | parser.add_argument( 73 | '--rnn_type', type=str, default='gru', help='lstm or gru') 74 | parser.add_argument( 75 | '--rnn_dropout_p', 76 | type=float, 77 | default=0.5, 78 | help='strength of dropout in the Language Model RNN') 79 | parser.add_argument( 80 | '--dim_word', 81 | type=int, 82 | default=512, 83 | help='the encoding size of each token in the vocabulary, and the video.'
84 | ) 85 | 86 | parser.add_argument( 87 | '--dim_vid', 88 | type=int, 89 | default=2048, 90 | help='dim of features of video frames') 91 | 92 | # Optimization: General 93 | 94 | parser.add_argument( 95 | '--epochs', type=int, default=6001, help='number of epochs') 96 | parser.add_argument( 97 | '--batch_size', type=int, default=128, help='minibatch size') 98 | parser.add_argument( 99 | '--grad_clip', 100 | type=float, 101 | default=5, # 5., 102 | help='clip gradients at this value') 103 | 104 | parser.add_argument( 105 | '--self_crit_after', 106 | type=int, 107 | default=-1, 108 | help='After what epoch do we start self-critical training? \ 109 | (-1 = disable; never use it, 0 = use it from the start)' 110 | ) 111 | 112 | parser.add_argument( 113 | '--learning_rate', type=float, default=4e-4, help='learning rate') 114 | 115 | parser.add_argument( 116 | '--learning_rate_decay_every', 117 | type=int, 118 | default=200, 119 | help='how often to decay the learning rate (in epochs)') 120 | parser.add_argument('--learning_rate_decay_rate', type=float, default=0.8) 121 | parser.add_argument( 122 | '--optim_alpha', type=float, default=0.9, help='alpha for adam') 123 | parser.add_argument( 124 | '--optim_beta', type=float, default=0.999, help='beta used for adam') 125 | parser.add_argument( 126 | '--optim_epsilon', 127 | type=float, 128 | default=1e-8, 129 | help='epsilon that goes into denominator for smoothing') 130 | parser.add_argument( 131 | '--weight_decay', 132 | type=float, 133 | default=5e-4, 134 | help='weight_decay. strength of weight regularization') 135 | 136 | parser.add_argument( 137 | '--save_checkpoint_every', 138 | type=int, 139 | default=50, 140 | help='how often to save a model checkpoint (in epochs)?') 141 | parser.add_argument( 142 | '--checkpoint_path', 143 | type=str, 144 | default='save', 145 | help='directory to store checkpointed models') 146 | 147 | parser.add_argument( 148 | '--gpu', type=str, default='0', help='gpu device number') 149 | 150 | args = parser.parse_args() 151 | 152 | return args 153 | -------------------------------------------------------------------------------- /prepro_coco.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | import nltk 4 | nltk.download('stopwords') 5 | from nltk.corpus import stopwords 6 | from collections import Counter 7 | from tqdm import tqdm 8 | 9 | 10 | def main(args): 11 | coco = json.load( 12 | open(args.coco_json))['annotations'] 13 | msr = json.load(open(args.msr_caption_json)) 14 | stopWords = set(stopwords.words('english')) 15 | coco_wordcounts = open(args.coco_wordcounts) 16 | coco_words = [] 17 | for i in coco_wordcounts: 18 | w = i.split()[0] 19 | coco_words.append(w) 20 | msr_wordcounts = [] 21 | for i in msr.values(): 22 | for j in i['final_captions']: 23 | msr_wordcounts += j 24 | msr_wordcounts = Counter(msr_wordcounts).most_common() 25 | labels = [i for i in msr_wordcounts if i[0] 26 | not in stopWords and i[0] in coco_words][:args.num_classes] 27 | for i in tqdm(coco): 28 | l = [] 29 | for j, w in enumerate(labels): 30 | if w[0] in i['caption']: 31 | l.append(j) 32 | i['labels'] = l 33 | coco_labels = {} 34 | for i in tqdm(coco): 35 | if i['image_id'] in coco_labels: 36 | coco_labels[i['image_id']] = coco_labels[i['image_id']] + \ 37 | list(set(i['labels']) - set(coco_labels[i['image_id']])) 38 | else: 39 | coco_labels[i['image_id']] = i['labels'] 40 | info = {'num_classes': args.num_classes, 'labels': coco_labels} 41 | with 
open(args.coco_labels_json, 'w') as f: 42 | json.dump(info, f) 43 | 44 | coco = json.load( 45 | open(args.coco_json))['images'] 46 | coco_path = {} 47 | for i in tqdm(coco): 48 | coco_path[i['id']] = i['file_name'] 49 | with open(args.coco_path_json, 'w') as f: 50 | json.dump(coco_path, f) 51 | 52 | 53 | if __name__ == '__main__': 54 | parser = argparse.ArgumentParser() 55 | parser.add_argument('--coco_json', type=str, 56 | default='data/mscoco/annotations/captions_train2014.json', help='path to coco train json') 57 | parser.add_argument('-coco_wordcounts', type=str, 58 | default='data/mscoco/word_counts.txt', help='word_counts.txt of coco dataset') 59 | parser.add_argument('--msr_caption_json', type=str, 60 | default='data/caption.json', help='path to processed msr vtt caption json') 61 | parser.add_argument('--num_classes', type=int, default=1000, 62 | help='number of classes each image') 63 | parser.add_argument('--coco_labels_json', type=str, default='data/coco_labels.json', 64 | help='path to processed coco train caption json') 65 | parser.add_argument('--coco_path_json', type=str, default='data/coco_path.json', 66 | help='image id and image file name pairs') 67 | args = parser.parse_args() 68 | main(args) 69 | -------------------------------------------------------------------------------- /prepro_feats.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import subprocess 3 | import glob 4 | from tqdm import tqdm 5 | import numpy as np 6 | import os 7 | import argparse 8 | 9 | import torch 10 | from torch import nn 11 | import torch.nn.functional as F 12 | from torch.autograd import Variable 13 | import pretrainedmodels 14 | from pretrainedmodels import utils 15 | 16 | C, H, W = 3, 224, 224 17 | 18 | 19 | def extract_frames(video, dst): 20 | with open(os.devnull, "w") as ffmpeg_log: 21 | if os.path.exists(dst): 22 | print(" cleanup: " + dst + "/") 23 | shutil.rmtree(dst) 24 | os.makedirs(dst) 25 | video_to_frames_command = ["ffmpeg", 26 | # (optional) overwrite output file if it exists 27 | '-y', 28 | '-i', video, # input file 29 | '-vf', "scale=400:300", # input file 30 | '-qscale:v', "2", # quality for JPEG 31 | '{0}/%06d.jpg'.format(dst)] 32 | subprocess.call(video_to_frames_command, 33 | stdout=ffmpeg_log, stderr=ffmpeg_log) 34 | 35 | 36 | def extract_feats(params, model, load_image_fn): 37 | global C, H, W 38 | model.eval() 39 | 40 | dir_fc = params['output_dir'] 41 | if not os.path.isdir(dir_fc): 42 | os.mkdir(dir_fc) 43 | print("save video feats to %s" % (dir_fc)) 44 | video_list = glob.glob(os.path.join(params['video_path'], '*.mp4')) 45 | for video in tqdm(video_list): 46 | video_id = video.split("/")[-1].split(".")[0] 47 | dst = params['model'] + '_' + video_id 48 | extract_frames(video, dst) 49 | 50 | image_list = sorted(glob.glob(os.path.join(dst, '*.jpg'))) 51 | samples = np.round(np.linspace( 52 | 0, len(image_list) - 1, params['n_frame_steps'])) 53 | image_list = [image_list[int(sample)] for sample in samples] 54 | images = torch.zeros((len(image_list), C, H, W)) 55 | for iImg in range(len(image_list)): 56 | img = load_image_fn(image_list[iImg]) 57 | images[iImg] = img 58 | fc_feats = model(Variable(images, volatile=True).cuda()).squeeze() 59 | img_feats = fc_feats.data.cpu().numpy() 60 | # Save the inception features 61 | outfile = os.path.join(dir_fc, video_id + '.npy') 62 | np.save(outfile, img_feats) 63 | # cleanup 64 | shutil.rmtree(dst) 65 | 66 | 67 | if __name__ == '__main__': 68 | parser = 
argparse.ArgumentParser() 69 | parser.add_argument("--gpu", dest='gpu', type=str, default='0', 70 | help='Set CUDA_VISIBLE_DEVICES environment variable, optional') 71 | parser.add_argument("--output_dir", dest='output_dir', type=str, 72 | default='data/feats/resnet152', help='directory to store features') 73 | parser.add_argument("--n_frame_steps", dest='n_frame_steps', type=int, default=40, 74 | help='how many frames to sampler per video') 75 | parser.add_argument("--video_path", dest='video_path', type=str, 76 | default='data/train-video', help='path to video dataset') 77 | parser.add_argument("--model", dest="model", type=str, default='resnet152', 78 | help='the CNN model you want to use to extract_feats') 79 | parser.add_argument("--saved_model", dest="saved_model", type=str, default='', 80 | help='the pretrained CNN model you want to use to extract_feats') 81 | 82 | args = parser.parse_args() 83 | os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu 84 | params = vars(args) 85 | if params['model'] == 'inception_v3': 86 | C, H, W = 3, 299, 299 87 | model = pretrainedmodels.inceptionv3(pretrained='imagenet') 88 | load_image_fn = utils.LoadTransformImage(model) 89 | elif params['model'] == 'vgg16': 90 | C, H, W = 3, 224, 224 91 | model = pretrainedmodels.vgg16(pretrained='imagenet') 92 | load_image_fn = utils.LoadTransformImage(model) 93 | elif params['model'] == 'vgg19': 94 | C, H, W = 3, 224, 224 95 | model = pretrainedmodels.vgg19(pretrained='imagenet') 96 | load_image_fn = utils.LoadTransformImage(model) 97 | elif params['model'] == 'resnet50': 98 | C, H, W = 3, 224, 224 99 | model = pretrainedmodels.resnet50(pretrained='imagenet') 100 | load_image_fn = utils.LoadTransformImage(model) 101 | elif params['model'] == 'resnet101': 102 | C, H, W = 3, 224, 224 103 | model = pretrainedmodels.resnet101(pretrained='imagenet') 104 | load_image_fn = utils.LoadTransformImage(model) 105 | elif params['model'] == 'resnet152': 106 | C, H, W = 3, 224, 224 107 | model = pretrainedmodels.resnet152(pretrained='imagenet') 108 | load_image_fn = utils.LoadTransformImage(model) 109 | elif params['model'] == 'inception_v4': 110 | C, H, W = 3, 299, 299 111 | model = pretrainedmodels.inceptionv4( 112 | num_classes=1000, pretrained='imagenet') 113 | load_image_fn = utils.LoadTransformImage(model) 114 | elif params['model'] == 'nasnet': 115 | C, H, W = 3, 331, 331 116 | model = pretrainedmodels.nasnetalarge(num_classes=1001, pretrained='imagenet+background') 117 | load_image_fn = utils.LoadTransformImage(model) 118 | else: 119 | print("doesn't support %s" % (params['model'])) 120 | 121 | model.last_linear = utils.Identity() 122 | model = nn.DataParallel(model) 123 | if params['saved_model'] != '': 124 | model.load_state_dict(torch.load(params['saved_model']), strict=False) 125 | model = model.cuda() 126 | extract_feats(params, model, load_image_fn) 127 | -------------------------------------------------------------------------------- /prepro_ngrams.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | import pickle as pkl 4 | from collections import defaultdict 5 | 6 | 7 | def precook(s, n=4): 8 | """ 9 | Takes a string as input and returns an object that can be given to 10 | either cook_refs or cook_test. This is optional: cook_refs and cook_test 11 | can take string arguments as well. 
12 | :param s: string : sentence to be converted into ngrams 13 | :param n: int : number of ngrams for which representation is calculated 14 | :return: term frequency vector for occurring ngrams 15 | """ 16 | words = s.split() 17 | counts = defaultdict(int) 18 | for k in range(1, n+1): 19 | for i in range(len(words)-k+1): 20 | ngram = tuple(words[i:i+k]) 21 | counts[ngram] += 1 22 | return counts 23 | 24 | 25 | def cook_refs(refs, n=4): # lhuang: oracle will call with "average" 26 | '''Takes a list of reference sentences for a single segment 27 | and returns an object that encapsulates everything that BLEU 28 | needs to know about them. 29 | :param refs: list of string : reference sentences for some image 30 | :param n: int : number of ngrams for which (ngram) representation is calculated 31 | :return: result (list of dict) 32 | ''' 33 | return [precook(ref, n) for ref in refs] 34 | 35 | 36 | def create_crefs(refs): 37 | crefs = [] 38 | for ref in refs: 39 | # ref is a list of reference captions for one video 40 | crefs.append(cook_refs(ref)) 41 | return crefs 42 | 43 | 44 | def compute_doc_freq(crefs): 45 | ''' 46 | Compute document frequency for the reference captions. 47 | This will be used to compute idf (inverse document frequency) later. 48 | Each ngram is counted once per reference set. 49 | :return: document_frequency 50 | ''' 51 | document_frequency = defaultdict(float) 52 | for refs in crefs: 53 | # refs, k ref captions of one video 54 | for ngram in set([ngram for ref in refs for (ngram, count) in ref.items()]): 55 | document_frequency[ngram] += 1 56 | # maxcounts[ngram] = max(maxcounts.get(ngram,0), count) 57 | return document_frequency 58 | 59 | 60 | def build_dict(vids, wtoi): 61 | refs_words = [] 62 | refs_idxs = [] 63 | count_vids = 0 64 | for vid in vids: 65 | ref_words = [] 66 | ref_idxs = [] 67 | for cap in vids[vid]['final_captions']: 68 | tmp_tokens = cap 69 | tmp_tokens = [_ if _ in wtoi else '<UNK>' for _ in tmp_tokens] 70 | ref_words.append(' '.join(tmp_tokens)) 71 | ref_idxs.append(' '.join([str(wtoi[_]) for _ in tmp_tokens])) 72 | refs_words.append(ref_words) 73 | refs_idxs.append(ref_idxs) 74 | count_vids += 1 75 | ngram_words = compute_doc_freq(create_crefs(refs_words)) 76 | ngram_idxs = compute_doc_freq(create_crefs(refs_idxs)) 77 | return ngram_words, ngram_idxs, count_vids 78 | 79 | 80 | def main(params): 81 | vids = json.load(open(params['caption_json'])) 82 | wtoi = json.load(open(params['info_json']))['word_to_ix'] 83 | 84 | ngram_words, ngram_idxs, ref_len = build_dict(vids, wtoi) 85 | 86 | pkl.dump({'document_frequency': ngram_words, 'ref_len': ref_len}, open( 87 | params['output_pkl']+'-words.p', 'wb')) 88 | pkl.dump({'document_frequency': ngram_idxs, 'ref_len': ref_len}, open( 89 | params['output_pkl']+'-idxs.p', 'wb')) 90 | 91 | if __name__ == "__main__": 92 | 93 | parser = argparse.ArgumentParser() 94 | 95 | # input json 96 | parser.add_argument('--caption_json', default='data/caption.json', 97 | help='input json file containing video captions') 98 | parser.add_argument('--info_json', default='data/info.json', help='vocab info json file') 99 | parser.add_argument('--output_pkl', default='data/msr-all', help='output pickle file') 100 | args = parser.parse_args() 101 | params = vars(args) # convert to ordinary dict 102 | 103 | main(params) 104 | -------------------------------------------------------------------------------- /prepro_vocab.py: -------------------------------------------------------------------------------- 1 | import re 2 | import json 3 | import argparse 4 | import numpy as np 5 | 6 | 7 | 
def build_vocab(vids, params): 8 | count_thr = params['word_count_threshold'] 9 | # count up the number of words 10 | counts = {} 11 | for vid, caps in vids.items(): 12 | for cap in caps['captions']: 13 | ws = re.sub(r'[.!,;?]', ' ', cap).split() 14 | for w in ws: 15 | counts[w] = counts.get(w, 0) + 1 16 | # cw = sorted([(count, w) for w, count in counts.items()], reverse=True) 17 | total_words = sum(counts.values()) 18 | bad_words = [w for w, n in counts.items() if n <= count_thr] 19 | vocab = [w for w, n in counts.items() if n > count_thr] 20 | bad_count = sum(counts[w] for w in bad_words) 21 | print('number of bad words: %d/%d = %.2f%%' % 22 | (len(bad_words), len(counts), len(bad_words) * 100.0 / len(counts))) 23 | print('number of words in vocab would be %d' % (len(vocab), )) 24 | print('number of UNKs: %d/%d = %.2f%%' % 25 | (bad_count, total_words, bad_count * 100.0 / total_words)) 26 | # let's now produce the final annotations 27 | if bad_count > 0: 28 | # additional special UNK token we will use below to map infrequent words to 29 | print('inserting the special UNK token') 30 | vocab.append('<UNK>') 31 | for vid, caps in vids.items(): 32 | caps = caps['captions'] 33 | vids[vid]['final_captions'] = [] 34 | for cap in caps: 35 | ws = re.sub(r'[.!,;?]', ' ', cap).split() 36 | caption = [ 37 | '<sos>'] + [w if counts.get(w, 0) > count_thr else '<UNK>' for w in ws] + ['<eos>'] 38 | vids[vid]['final_captions'].append(caption) 39 | return vocab 40 | 41 | 42 | def main(params): 43 | videos = json.load(open(params['input_json'], 'r'))['sentences'] 44 | video_caption = {} 45 | for i in videos: 46 | if i['video_id'] not in video_caption.keys(): 47 | video_caption[i['video_id']] = {'captions': []} 48 | video_caption[i['video_id']]['captions'].append(i['caption']) 49 | # create the vocab 50 | vocab = build_vocab(video_caption, params) 51 | itow = {i + 2: w for i, w in enumerate(vocab)} 52 | wtoi = {w: i + 2 for i, w in enumerate(vocab)} # inverse table 53 | wtoi['<eos>'] = 0 54 | itow[0] = '<eos>' 55 | wtoi['<sos>'] = 1 56 | itow[1] = '<sos>' 57 | 58 | out = {} 59 | out['ix_to_word'] = itow 60 | out['word_to_ix'] = wtoi 61 | out['videos'] = {'train': [], 'val': [], 'test': []} 62 | videos = json.load(open(params['input_json'], 'r'))['videos'] 63 | for i in videos: 64 | out['videos'][i['split']].append(int(i['id'])) 65 | json.dump(out, open(params['info_json'], 'w')) 66 | json.dump(video_caption, open(params['caption_json'], 'w')) 67 | 68 | 69 | if __name__ == "__main__": 70 | parser = argparse.ArgumentParser() 71 | 72 | # input json 73 | parser.add_argument('--input_json', type=str, default='data/all_videodatainfo_2017.json', 74 | help='msr_vtt videoinfo json') 75 | parser.add_argument('--info_json', default='data/all_info.json', 76 | help='info about ix_to_word and word_to_ix') 77 | parser.add_argument('--caption_json', default='data/all_caption.json', help='caption json file') 78 | 79 | 80 | parser.add_argument('--word_count_threshold', default=1, type=int, 81 | help='only words that occur more than this number of times will be put in vocab') 82 | 83 | args = parser.parse_args() 84 | params = vars(args) # convert to ordinary dict 85 | main(params) 86 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | import numpy as np 5 | 6 | import misc.utils as utils 7 | import opts 8 | import torch 9 | import torch.optim as optim 10 | from dataloader import VideoDataset 11 | from 
misc.rewards import get_self_critical_reward, init_cider_scorer 12 | from models import DecoderRNN, EncoderRNN, S2VTAttModel, S2VTModel 13 | from torch import nn 14 | from torch.autograd import Variable 15 | from torch.utils.data import DataLoader 16 | 17 | 18 | def train(loader, model, crit, optimizer, lr_scheduler, opt, rl_crit=None): 19 | model.train() 20 | model = nn.DataParallel(model) 21 | for epoch in range(opt["epochs"]): 22 | lr_scheduler.step() 23 | 24 | iteration = 0 25 | # If start self crit training 26 | if opt["self_crit_after"] != -1 and epoch >= opt["self_crit_after"]: 27 | sc_flag = True 28 | init_cider_scorer(opt["cached_tokens"]) 29 | else: 30 | sc_flag = False 31 | 32 | for data in loader: 33 | torch.cuda.synchronize() 34 | fc_feats = Variable(data['fc_feats']).cuda() 35 | labels = Variable(data['labels']).long().cuda() 36 | masks = Variable(data['masks']).cuda() 37 | 38 | optimizer.zero_grad() 39 | if not sc_flag: 40 | seq_probs, _ = model(fc_feats, labels, 'train') 41 | loss = crit(seq_probs, labels[:, 1:], masks[:, 1:]) 42 | else: 43 | seq_probs, seq_preds = model( 44 | fc_feats, mode='inference', opt=opt) 45 | reward = get_self_critical_reward(model, fc_feats, data, 46 | seq_preds) 47 | print(reward.shape) 48 | loss = rl_crit(seq_probs, seq_preds, 49 | Variable( 50 | torch.from_numpy(reward).float().cuda())) 51 | 52 | loss.backward() 53 | utils.clip_gradient(optimizer, opt["grad_clip"]) 54 | optimizer.step() 55 | train_loss = loss.data[0] 56 | torch.cuda.synchronize() 57 | iteration += 1 58 | 59 | if not sc_flag: 60 | print("iter %d (epoch %d), train_loss = %.6f" % 61 | (iteration, epoch, train_loss)) 62 | else: 63 | print("iter %d (epoch %d), avg_reward = %.6f" % 64 | (iteration, epoch, np.mean(reward[:, 0]))) 65 | 66 | if epoch != 0 and epoch % opt["save_checkpoint_every"] == 0: 67 | model_path = os.path.join(opt["checkpoint_path"], 68 | 'model_%d.pth' % (epoch)) 69 | model_info_path = os.path.join(opt["checkpoint_path"], 70 | 'model_score.txt') 71 | torch.save(model.state_dict(), model_path) 72 | print("model saved to %s" % (model_path)) 73 | with open(model_info_path, 'a') as f: 74 | f.write("model_%d, loss: %.6f\n" % (epoch, train_loss)) 75 | 76 | 77 | def main(opt): 78 | dataset = VideoDataset(opt, 'train') 79 | dataloader = DataLoader(dataset, batch_size=opt["batch_size"], shuffle=True) 80 | opt["vocab_size"] = dataset.get_vocab_size() 81 | if opt["model"] == 'S2VTModel': 82 | model = S2VTModel( 83 | opt["vocab_size"], 84 | opt["max_len"], 85 | opt["dim_hidden"], 86 | opt["dim_word"], 87 | opt['dim_vid'], 88 | rnn_cell=opt['rnn_type'], 89 | n_layers=opt['num_layers'], 90 | bidirectional=opt["bidirectional"], 91 | rnn_dropout_p=opt["rnn_dropout_p"]).cuda() 92 | elif opt["model"] == "S2VTAttModel": 93 | encoder = EncoderRNN( 94 | opt["dim_vid"], 95 | opt["dim_hidden"], 96 | n_layers=opt['num_layers'], 97 | bidirectional=opt["bidirectional"], 98 | input_dropout_p=opt["input_dropout_p"], 99 | rnn_cell=opt['rnn_type'], 100 | rnn_dropout_p=opt["rnn_dropout_p"]) 101 | decoder = DecoderRNN( 102 | opt["vocab_size"], 103 | opt["max_len"], 104 | opt["dim_hidden"], 105 | opt["dim_word"], 106 | n_layers=opt['num_layers'], 107 | input_dropout_p=opt["input_dropout_p"], 108 | rnn_cell=opt['rnn_type'], 109 | rnn_dropout_p=opt["rnn_dropout_p"], 110 | bidirectional=opt["bidirectional"]) 111 | model = S2VTAttModel(encoder, decoder).cuda() 112 | crit = utils.LanguageModelCriterion() 113 | rl_crit = utils.RewardCriterion() 114 | optimizer = optim.Adam( 115 | model.parameters(), 
116 | lr=opt["learning_rate"], 117 | weight_decay=opt["weight_decay"]) 118 | exp_lr_scheduler = optim.lr_scheduler.StepLR( 119 | optimizer, 120 | step_size=opt["learning_rate_decay_every"], 121 | gamma=opt["learning_rate_decay_rate"]) 122 | 123 | train(dataloader, model, crit, optimizer, exp_lr_scheduler, opt, rl_crit) 124 | 125 | 126 | if __name__ == '__main__': 127 | opt = opts.parse_opt() 128 | opt = vars(opt) 129 | for key, value in opt.items(): 130 | print key, value 131 | os.environ['CUDA_VISIBLE_DEVICES'] = opt["gpu"] 132 | opt_json = os.path.join(opt["checkpoint_path"], 'opt_info.json') 133 | if not os.path.exists(opt["checkpoint_path"]): 134 | os.makedirs(opt["checkpoint_path"]) 135 | with open(opt_json, 'w') as f: 136 | json.dump(opt, f) 137 | print('save opt details to %s' % (opt_json)) 138 | main(opt) 139 | -------------------------------------------------------------------------------- /train_s2vt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ### nasnet_resnet101 4 | feat=nasnet_resnet101 5 | LOG=log/s2vt_${feat}_40frames-`date +%Y-%m-%d_%H-%M-%S`.log 6 | python train.py \ 7 | --gpu 0 \ 8 | --save_checkpoint_every 20 \ 9 | --epochs 1000 \ 10 | --n_frame_steps 40 \ 11 | --batch_size 100 \ 12 | --input_json data/all_videodatainfo_2017.json \ 13 | --info_json data/all_info.json \ 14 | --caption_json data/all_caption.json \ 15 | --checkpoint_path checkpoint/${feat}_40frames/s2vt \ 16 | --feats_dir data/feats/nasnet data/feats/resnet101 \ 17 | --dim_vid 6080 \ 18 | --rnn_type lstm \ 19 | --learning_rate_decay_every 100 \ 20 | --model S2VTModel \ 21 | 2>&1 | tee $LOG 22 | 23 | # ### nasnet 24 | # feat=nasnet 25 | # LOG=log/s2vt_${feat}_40frames-`date +%Y-%m-%d_%H-%M-%S`.log 26 | # python train.py \ 27 | # --gpu 0 \ 28 | # --save_checkpoint_every 50 \ 29 | # --epochs 1000 \ 30 | # --n_frame_steps 40 \ 31 | # --batch_size 100 \ 32 | # --input_json data/all_videodatainfo_2017.json \ 33 | # --info_json data/all_info.json \ 34 | # --caption_json data/all_caption.json \ 35 | # --checkpoint_path checkpoint/${feat}_40frames/s2vt \ 36 | # --feats_dir data/feats/$feat/ \ 37 | # --dim_vid 4032 \ 38 | # --rnn_type lstm \ 39 | # --learning_rate_decay_every 100 \ 40 | # --model S2VTModel \ 41 | # 2>&1 | tee $LOG 42 | 43 | 44 | ### inception_v4 40frames 45 | # feat=inception_v4 46 | # LOG=log/s2vt_${feat}_40frames-`date +%Y-%m-%d_%H-%M-%S`.log 47 | # python train.py \ 48 | # --gpu 0 \ 49 | # --save_checkpoint_every 50 \ 50 | # --epochs 1000 \ 51 | # --n_frame_steps 40 \ 52 | # --batch_size 100 \ 53 | # --input_json data/all_videodatainfo_2017.json \ 54 | # --info_json data/all_info.json \ 55 | # --caption_json data/all_caption.json \ 56 | # --checkpoint_path checkpoint/${feat}_40frames/s2vt \ 57 | # --feats_dir data/feats/$feat/ \ 58 | # --dim_vid 1536 \ 59 | # --rnn_type lstm \ 60 | # --learning_rate_decay_every 100 \ 61 | # --model S2VTModel \ 62 | # 2>&1 | tee $LOG 63 | 64 | ### resnet101 40frames 65 | # feat=resnet101 66 | # LOG=log/s2vt_${feat}_40frames-`date +%Y-%m-%d_%H-%M-%S`.log 67 | # python train.py \ 68 | # --gpu 1 \ 69 | # --save_checkpoint_every 50 \ 70 | # --epochs 1000 \ 71 | # --n_frame_steps 40 \ 72 | # --batch_size 100 \ 73 | # --input_json data/all_videodatainfo_2017.json \ 74 | # --info_json data/all_info.json \ 75 | # --caption_json data/all_caption.json \ 76 | # --checkpoint_path checkpoint/${feat}_40frames/s2vt \ 77 | # --feats_dir data/feats/$feat/ \ 78 | # --dim_vid 2048 \ 79 | # --rnn_type lstm \ 80 | # 
--learning_rate_decay_every 100 \ 81 | # --model S2VTModel \ 82 | # 2>&1 | tee $LOG 83 | 84 | 85 | ## resnet101_c3d_fc7_wo_ft 86 | # feat=resnet101 87 | # LOG=log/s2vt_resnet101_c3d_fc7_wo_ft-`date +%Y-%m-%d_%H-%M-%S`.log 88 | # python train.py \ 89 | # --gpu 1 \ 90 | # --save_checkpoint_every 50 \ 91 | # --epochs 1000 \ 92 | # --n_frame_steps 80 \ 93 | # --batch_size 100 \ 94 | # --input_json data/all_videodatainfo_2017.json \ 95 | # --info_json data/all_info.json \ 96 | # --caption_json data/all_caption.json \ 97 | # --checkpoint_path checkpoint/resnet101_c3d_fc7_wo_ft/s2vt \ 98 | # --feats_dir data/feats/$feat/ \ 99 | # --dim_vid 6144 \ 100 | # --with_c3d 1 \ 101 | # --c3d_feats_dir data/feats/c3d_fc7_wo_ft \ 102 | # --rnn_type lstm \ 103 | # --learning_rate_decay_every 200 \ 104 | # --model S2VTModel \ 105 | # 2>&1 | tee $LOG 106 | 107 | # ### resnet101 80frames 108 | # feat=resnet101 109 | # LOG=log/s2vt_${feat}_80frames-`date +%Y-%m-%d_%H-%M-%S`.log 110 | # python train.py \ 111 | # --gpu 0 \ 112 | # --save_checkpoint_every 50 \ 113 | # --epochs 500 \ 114 | # --n_frame_steps 80 \ 115 | # --batch_size 200 \ 116 | # --input_json data/all_videodatainfo_2017.json \ 117 | # --info_json data/all_info.json \ 118 | # --caption_json data/all_caption.json \ 119 | # --checkpoint_path checkpoint/$feat/s2vt \ 120 | # --feats_dir data/feats/$feat/ \ 121 | # --dim_vid 2048 \ 122 | # --rnn_type lstm \ 123 | # --learning_rate_decay_every 100 \ 124 | # --model S2VTModel \ 125 | # 2>&1 | tee $LOG -------------------------------------------------------------------------------- /train_s2vt_att.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | LOG=log/s2vt_att-`date +%Y-%m-%d_%H-%M-%S`.log 3 | python train.py \ 4 | --gpu 0,1 \ 5 | --save_checkpoint_every 10 \ 6 | --epochs 1000 \ 7 | --batch_size 80 \ 8 | --input_json data/all_videodatainfo_2017.json \ 9 | --info_json data/all_info.json \ 10 | --caption_json data/all_caption.json \ 11 | --checkpoint_path checkpoint/vgg16/s2vt_att \ 12 | --feats_dir data/feats/vgg16/trainval/ \ 13 | --dim_vid 4096 \ 14 | --rnn_type lstm \ 15 | --learning_rate_decay_every 100 \ 16 | --model S2VTAttModel \ 17 | 2>&1 | tee $LOG --------------------------------------------------------------------------------
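For reference, a minimal sketch of the dot-product attention path that is left uncommented in models/Attention.py: each encoder output is scored by its dot product with the decoder hidden state, the scores are softmax-normalized over the frame axis, and the encoder outputs are summed with those weights into a single context vector. The tensor sizes are invented for illustration and the snippet uses the current torch API (no Variable wrapper), so it is a sketch of the idea rather than code from the repository.

```python
import torch
import torch.nn.functional as F

# Invented sizes, for illustration only.
batch_size, seq_len, dim = 4, 40, 512
encoder_outputs = torch.randn(batch_size, seq_len, dim)  # per-frame features
hidden_state = torch.randn(batch_size, dim)              # current decoder state

# (batch, seq_len, dim) x (batch, dim, 1) -> (batch, seq_len): one score per frame
scores = torch.bmm(encoder_outputs, hidden_state.unsqueeze(2)).squeeze(2)
# Normalize over the frame axis, then take the weighted sum of encoder outputs.
alpha = F.softmax(scores, dim=1).unsqueeze(1)            # (batch, 1, seq_len)
context = torch.bmm(alpha, encoder_outputs).squeeze(1)   # (batch, dim)

assert context.shape == (batch_size, dim)
```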
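The special-token layout written by prepro_vocab.py is what S2VTModel relies on through its sos_id=1 and eos_id=0 defaults: index 0 is `<eos>`, index 1 is `<sos>`, and real vocabulary words start at index 2. A tiny sketch with an invented two-word vocabulary:

```python
# Hypothetical two-word vocabulary, just to show the index layout.
vocab = ['a', 'dog']                      # words that survive the count threshold
itow = {i + 2: w for i, w in enumerate(vocab)}
wtoi = {w: i + 2 for i, w in enumerate(vocab)}
wtoi['<eos>'], itow[0] = 0, '<eos>'       # matches eos_id=0 in S2VTModel
wtoi['<sos>'], itow[1] = 1, '<sos>'       # matches sos_id=1 in S2VTModel

caption = ['<sos>', 'a', 'dog', '<eos>']
print([wtoi[w] for w in caption])         # [1, 2, 3, 0]
```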
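precook() in prepro_ngrams.py turns a tokenized caption into a term-frequency dict over all 1- to 4-grams, which feeds the document-frequency counts used by the CIDEr scorer during self-critical training. A quick check on an invented sentence, assuming the script is importable from the repository root:

```python
from prepro_ngrams import precook  # hypothetical usage; run from the repo root

counts = precook('a dog runs')
print(counts[('a',)])                 # 1
print(counts[('a', 'dog')])           # 1
print(counts[('a', 'dog', 'runs')])   # 1
print(len(counts))                    # 3 unigrams + 2 bigrams + 1 trigram = 6
```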