├── .gitignore ├── .gitmodules ├── LICENSE ├── README.md ├── cfgs ├── anet_c3d_pdvc.yml ├── anet_c3d_pdvc_gt.yml ├── anet_c3d_pdvcl.yml ├── anet_c3d_pdvcl_gt.yml ├── anet_c3d_props.yml ├── anet_i3dvgg_pdvc.yml ├── anet_i3dvgg_pdvc_gt.yml ├── anet_tsn_pdvc.yml ├── anet_tsn_pdvc_gt.yml ├── anet_tsn_pdvcl.yml ├── anet_tsn_pdvcl_gt.yml ├── anet_tsp_pdvc.yml ├── anet_tsp_pdvc_gt.yml ├── anet_tsp_pdvcl.yml ├── yc2_tsn_pdvc.yml ├── yc2_tsn_pdvc_gt.yml ├── yc2_tsn_pdvcl.yml └── yc2_tsn_pdvcl_gt.yml ├── data ├── anet │ ├── captiondata │ │ ├── fake_test.json │ │ ├── para │ │ │ ├── anet_entities_test_1_para.json │ │ │ ├── anet_entities_test_2_para.json │ │ │ ├── anet_entities_val_1_para.json │ │ │ ├── anet_entities_val_2_para.json │ │ │ └── readme.txt │ │ ├── train_modified.json │ │ ├── val_1.json │ │ └── val_2.json │ ├── features │ │ ├── I3D_vggish_invalid_videos.json │ │ ├── convert_c3d_h5_to_npy.py │ │ ├── convert_tsp_h5_to_npy.py │ │ ├── download_c3d_features.sh │ │ ├── download_i3d_vggish_features.sh │ │ ├── download_tsn_features.sh │ │ ├── download_tsp_features.sh │ │ └── resnet_bn_invalid_videos.json │ └── vocabulary_activitynet.json ├── video_dataset.py └── yc2 │ ├── captiondata │ ├── para │ │ ├── convert_to_para.py │ │ └── para_yc2_val.json │ ├── yc2_test.json │ ├── yc2_train.json │ └── yc2_val.json │ ├── features │ └── download_yc2_tsn_features.sh │ └── vocabulary_youcook2.json ├── densevid_eval3 ├── eval_dvc.py ├── eval_para.py ├── eval_soda.py ├── evaluate2018.py ├── evaluate2021.py └── para_evaluate.py ├── eval.py ├── eval_utils.py ├── misc ├── build_vocab.py ├── detr_utils │ ├── box_ops.py │ └── misc.py └── utils.py ├── opts.py ├── pdvc.jpg ├── pdvc ├── CaptioningHead │ ├── LSTM.py │ ├── LSTM_DSA.py │ ├── Puppet.py │ └── __init__.py ├── __init__.py ├── base_encoder.py ├── criterion.py ├── deformable_transformer.py ├── matcher.py ├── ops │ ├── __init__.py │ ├── functions │ │ ├── __init__.py │ │ └── ms_deform_attn_func.py │ ├── make.sh │ ├── modules │ │ ├── __init__.py │ │ ├── ms_deform_attn.py │ │ └── ms_deform_attn_for_caption.py │ ├── setup.py │ ├── src │ │ ├── cpu │ │ │ ├── ms_deform_attn_cpu.cpp │ │ │ └── ms_deform_attn_cpu.h │ │ ├── cuda │ │ │ ├── ms_deform_attn_cuda.cu │ │ │ ├── ms_deform_attn_cuda.h │ │ │ └── ms_deform_im2col_cuda.cuh │ │ ├── ms_deform_attn.h │ │ └── vision.cpp │ └── test.py ├── pdvc.py └── position_encoding.py ├── requirement.txt ├── test_and_visualize.sh ├── train.py ├── video_backbone ├── TSP │ ├── .gitignore │ ├── LICENSE │ ├── README.md │ ├── __init__.py │ ├── common │ │ ├── __init__.py │ │ ├── scheduler.py │ │ ├── transforms.py │ │ └── utils.py │ ├── data │ │ ├── README.md │ │ ├── activitynet │ │ │ ├── activitynet_v1-3_action_label_mapping.json │ │ │ ├── activitynet_v1-3_temporal_region_label_mapping.json │ │ │ ├── activitynet_v1-3_test_metadata.csv │ │ │ ├── activitynet_v1-3_train_metadata.csv │ │ │ ├── activitynet_v1-3_train_tsp_groundtruth.csv │ │ │ ├── activitynet_v1-3_valid_metadata.csv │ │ │ └── activitynet_v1-3_valid_tsp_groundtruth.csv │ │ ├── generate_metadata_csv.py │ │ └── standardize_videos_to_constant_30fps_mp4.sh │ ├── environment.yml │ ├── extract_features │ │ ├── README.md │ │ ├── __init__.py │ │ ├── eval_video_dataset.py │ │ ├── extract_features.py │ │ ├── extract_features_from_a_local_checkpoint.sh │ │ ├── extract_features_from_a_released_checkpoint.sh │ │ ├── merge_pkl_files_into_one_h5_feature_file.py │ │ └── opts.py │ ├── img │ │ └── tsp.png │ ├── models │ │ ├── __init__.py │ │ ├── backbone.py │ │ └── model.py │ └── train │ │ ├── 
README.md │ │ ├── __init__.py │ │ ├── opts.py │ │ ├── train.py │ │ ├── train_tac_on_activitynet.sh │ │ ├── train_tac_on_thumos14.sh │ │ ├── train_tsp_on_activitynet.sh │ │ ├── train_tsp_on_thumos14.sh │ │ └── untrimmed_video_dataset.py └── __init__.py └── visualization ├── Arial.ttf ├── NotoSansCJK-Bold.otf ├── videos └── xukun.mp4 ├── visualization.py ├── xukun_cn.gif └── xukun_en.gif /.gitignore: -------------------------------------------------------------------------------- 1 | save/ 2 | save* 3 | *.hdf5 4 | *.npy 5 | data/anet/features/c3d 6 | data/anet/features/resnet_bn 7 | data/yc2/features/resnet_bn 8 | data/densevid_eval3 9 | *.tmp 10 | 11 | *.Ink 12 | .idea/ 13 | .DS_Store 14 | *.pyc 15 | cfgs/proposal/debug/ 16 | *.out 17 | # Byte-compiled / optimized / DLL files 18 | __pycache__/ 19 | *.py[cod] 20 | *$py.class 21 | 22 | # C extensions 23 | *.so 24 | 25 | # Distribution / packaging 26 | .Python 27 | build/ 28 | develop-eggs/ 29 | dist/ 30 | downloads/ 31 | eggs/ 32 | .eggs/ 33 | lib/ 34 | lib64/ 35 | parts/ 36 | sdist/ 37 | var/ 38 | wheels/ 39 | *.egg-info/ 40 | .installed.cfg 41 | *.egg 42 | MANIFEST 43 | 44 | # PyInstaller 45 | # Usually these files are written by a python misc from a template 46 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 47 | *.manifest 48 | *.spec 49 | 50 | # Installer logs 51 | pip-log.txt 52 | pip-delete-this-directory.txt 53 | 54 | # Unit debug / coverage reports 55 | htmlcov/ 56 | .tox/ 57 | .coverage 58 | .coverage.* 59 | .cache 60 | nosetests.xml 61 | coverage.xml 62 | *.cover 63 | .hypothesis/ 64 | .pytest_cache/ 65 | 66 | # Translations 67 | *.mo 68 | *.pot 69 | 70 | # Django stuff: 71 | *.log 72 | 73 | local_settings.py 74 | db.sqlite3 75 | 76 | # Flask stuff: 77 | instance/ 78 | .webassets-cache 79 | 80 | # Scrapy stuff: 81 | .scrapy 82 | 83 | # Sphinx documentation 84 | docs/_build/ 85 | 86 | # PyBuilder 87 | target/ 88 | 89 | # Jupyter Notebook 90 | .ipynb_checkpoints 91 | 92 | # pyenv 93 | .python-version 94 | 95 | # celery beat schedule file 96 | celerybeat-schedule 97 | 98 | # SageMath parsed files 99 | *.sage.py 100 | 101 | # Environments 102 | .env 103 | .venv 104 | env/ 105 | venv/ 106 | ENV/ 107 | env.bak/ 108 | venv.bak/ 109 | 110 | # Spyder project settings 111 | .spyderproject 112 | .spyproject 113 | 114 | # Rope project settings 115 | .ropeproject 116 | 117 | # mkdocs documentation 118 | /site 119 | 120 | # mypy 121 | .mypy_cache/ 122 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "densevid_eval3/SODA"] 2 | path = densevid_eval3/SODA 3 | url = https://github.com/fujiso/SODA.git 4 | [submodule "densevid_eval3/pycocoevalcap"] 5 | path = densevid_eval3/pycocoevalcap 6 | url = https://github.com/salaniz/pycocoevalcap.git 7 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Teng Wang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to 
do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /cfgs/anet_c3d_pdvc.yml: -------------------------------------------------------------------------------- 1 | id: anet_c3d_pdvc # the results and logs will saved in this folder ./save/id 2 | base_cfg_path: cfgs/anet_c3d_pdvcl.yml 3 | 4 | caption_decoder_type: standard 5 | cap_nheads: 1 6 | cap_dec_n_points: 4 7 | cap_num_feature_levels: 4 8 | soft_attention: 1 9 | att_hid_size: 512 10 | 11 | ec_alpha: 1.0 -------------------------------------------------------------------------------- /cfgs/anet_c3d_pdvc_gt.yml: -------------------------------------------------------------------------------- 1 | id: anet_c3d_pdvc_gt 2 | base_cfg_path: cfgs/anet_c3d_pdvcl_gt.yml 3 | 4 | caption_decoder_type: standard 5 | cap_nheads: 1 6 | cap_dec_n_points: 4 7 | cap_num_feature_levels: 4 8 | soft_attention: 1 9 | att_hid_size: 512 -------------------------------------------------------------------------------- /cfgs/anet_c3d_pdvcl.yml: -------------------------------------------------------------------------------- 1 | id: anet_c3d_pdvcl 2 | 3 | visual_feature_type: c3d 4 | visual_feature_folder: 'data/anet/features/c3d' 5 | feature_dim: 500 6 | invalid_video_json: [] 7 | train_proposal_file: data/generated_proposals/dbg_trainval_top100.json 8 | eval_proposal_file: data/generated_proposals/dbg_trainval_top100.json 9 | gt_file_for_eval: ['data/anet/captiondata/val_1.json', 'data/anet/captiondata/val_2.json'] 10 | gt_file_for_para_eval: ['data/anet/captiondata/para/anet_entities_val_1_para.json', 'data/anet/captiondata/para/anet_entities_val_2_para.json'] 11 | 12 | train_proposal_type: gt 13 | gt_proposal_sample_num: 30 14 | sample_method: nearest 15 | 16 | batch_size: 1 17 | lr: 0.00005 18 | learning_rate_decay_start: 8 19 | learning_rate_decay_every: 3 20 | learning_rate_decay_rate: 0.5 21 | weight_decay: 0.0001 22 | save_all_checkpoint: 0 23 | 24 | num_queries: 10 25 | dec_layers: 2 26 | enc_layers: 2 27 | transformer_ff_dim: 512 28 | transformer_dropout_prob: 0.1 29 | frame_embedding_num: 100 30 | caption_decoder_type: light 31 | att_hid_size: 0 32 | 33 | with_box_refine: 1 34 | 35 | fix_xcw: 1 36 | set_cost_caption: 0 37 | set_cost_giou: 4 38 | set_cost_bbox: 0 39 | set_cost_class: 2 40 | #cost_alpha: 0.5 41 | #cost_gamma: 1 42 | #focal_alpha: 0.5 43 | #focal_gamma: 1 44 | caption_loss_coef: 2 45 | giou_loss_coef: 4 46 | bbox_loss_coef: 0 47 | cls_loss_coef: 2 48 | count_loss_coef: 0.5 49 | max_eseq_length: 10 50 | lloss_cross_entropy: 0 51 | lloss_focal_loss: 0 52 | lloss_gau_mask: 1 -------------------------------------------------------------------------------- /cfgs/anet_c3d_pdvcl_gt.yml: -------------------------------------------------------------------------------- 1 | id: anet_c3d_pdvcl_gt 2 | 3 | visual_feature_type: c3d 4 | 
visual_feature_folder: 'data/anet/features/c3d' 5 | feature_dim: 500 6 | invalid_video_json: [] 7 | train_proposal_file: data/generated_proposals/dbg_trainval_top100.json 8 | eval_proposal_file: data/generated_proposals/dbg_trainval_top100.json 9 | gt_file_for_eval: ['data/anet/captiondata/val_1.json', 'data/anet/captiondata/val_2.json'] 10 | gt_file_for_para_eval: ['data/anet/captiondata/para/anet_entities_val_1_para.json', 'data/anet/captiondata/para/anet_entities_val_2_para.json'] 11 | 12 | train_proposal_type: gt 13 | gt_proposal_sample_num: 30 14 | sample_method: nearest 15 | 16 | batch_size: 1 17 | lr: 0.00005 18 | learning_rate_decay_start: 8 19 | learning_rate_decay_every: 3 20 | learning_rate_decay_rate: 0.5 21 | weight_decay: 0.0001 22 | save_all_checkpoint: 0 23 | 24 | num_queries: 10 25 | dec_layers: 2 26 | enc_layers: 2 27 | transformer_ff_dim: 512 28 | transformer_dropout_prob: 0.1 29 | frame_embedding_num: 100 30 | caption_decoder_type: light 31 | att_hid_size: 0 32 | 33 | #with_box_refine: 1 34 | 35 | fix_xcw: 1 36 | set_cost_caption: 0 37 | set_cost_giou: 4 38 | set_cost_bbox: 0.00001 39 | set_cost_class: 0 40 | #cost_alpha: 0.5 41 | #cost_gamma: 1 42 | #focal_alpha: 0.5 43 | #focal_gamma: 1 44 | caption_loss_coef: 2 45 | giou_loss_coef: 0 46 | bbox_loss_coef: 0 47 | cls_loss_coef: 0 48 | count_loss_coef: 0 49 | #max_eseq_length: 10 50 | #lloss_cross_entropy: 0 51 | #lloss_focal_loss: 0 52 | #lloss_gau_mask: 1 53 | 54 | #two_stage: 1 55 | transformer_input_type: gt_proposals -------------------------------------------------------------------------------- /cfgs/anet_c3d_props.yml: -------------------------------------------------------------------------------- 1 | id: anet_c3d_props 2 | visual_feature_type: c3d 3 | visual_feature_folder: 'data/anet/features/c3d' 4 | feature_dim: 500 5 | invalid_video_json: [] 6 | train_proposal_file: data/generated_proposals/dbg_trainval_top100.json 7 | eval_proposal_file: data/generated_proposals/dbg_trainval_top100.json 8 | gt_file_for_eval: ['data/anet/captiondata/val_1.json', 'data/anet/captiondata/val_2.json'] 9 | gt_file_for_para_eval: ['data/anet/captiondata/para/anet_entities_val_1_para.json', 'data/anet/captiondata/para/anet_entities_val_2_para.json'] 10 | 11 | train_proposal_type: gt 12 | train_proposal_sample_num: 15 13 | sample_method: nearest 14 | 15 | batch_size: 1 16 | lr: 0.00005 17 | learning_rate_decay_start: 8 18 | learning_rate_decay_every: 3 19 | learning_rate_decay_rate: 0.5 20 | weight_decay: 0.0001 21 | save_all_checkpoint: 0 22 | 23 | num_queries: 10 24 | dec_layers: 2 25 | enc_layers: 2 26 | transformer_ff_dim: 512 27 | transformer_dropout_prob: 0.1 28 | frame_embedding_num: 100 29 | caption_decoder_type: none 30 | att_hid_size: 0 31 | 32 | with_box_refine: 1 33 | 34 | fix_xcw: 1 35 | set_cost_caption: 0 36 | set_cost_giou: 4 37 | set_cost_bbox: 0 38 | set_cost_class: 2 39 | #cost_alpha: 0.5 40 | #cost_gamma: 1 41 | #focal_alpha: 0.5 42 | #focal_gamma: 1 43 | caption_loss_coef: 0 44 | giou_loss_coef: 4 45 | bbox_loss_coef: 0 46 | cls_loss_coef: 2 47 | count_loss_coef: 0.5 48 | max_eseq_length: 10 49 | lloss_cross_entropy: 0 50 | lloss_focal_loss: 0 51 | lloss_gau_mask: 1 -------------------------------------------------------------------------------- /cfgs/anet_i3dvgg_pdvc.yml: -------------------------------------------------------------------------------- 1 | id: anet_i3dvgg_pdvc 2 | base_cfg_path: cfgs/anet_c3d_pdvc.yml 3 | visual_feature_type: ['i3d_rgb', 'i3d_flow', 'vggish'] 4 | visual_feature_folder: 
['data/anet/features/i3d/', 'data/anet/features/i3d/', 'data/anet/features/vggish/'] 5 | invalid_video_json: ['data/anet/features/I3D_vggish_invalid_videos.json'] 6 | feature_dim: 2176 -------------------------------------------------------------------------------- /cfgs/anet_i3dvgg_pdvc_gt.yml: -------------------------------------------------------------------------------- 1 | id: anet_i3dvgg_pdvc_gt 2 | base_cfg_path: cfgs/anet_c3d_pdvc_gt.yml 3 | visual_feature_type: ['i3d_rgb', 'i3d_flow', 'vggish'] 4 | visual_feature_folder: ['data/anet/features/i3d_25fps_stack64step64_2stream_npy/', 'data/anet/features/i3d_25fps_stack64step64_2stream_npy/', 'data/anet/features/vggish_npy/'] 5 | invalid_video_json: ['data/anet/features/I3D_vggish_invalid_videos.json'] 6 | feature_dim: 2176 -------------------------------------------------------------------------------- /cfgs/anet_tsn_pdvc.yml: -------------------------------------------------------------------------------- 1 | id: anet_tsn_pdvc 2 | base_cfg_path: cfgs/anet_c3d_pdvc.yml 3 | visual_feature_type: ['resnet', 'bn'] 4 | visual_feature_folder: ['data/anet/features/resnet_bn', 'data/anet/features/resnet_bn'] 5 | invalid_video_json: ['data/anet/features/resnet_bn_invalid_videos.json'] 6 | feature_dim: 3072 -------------------------------------------------------------------------------- /cfgs/anet_tsn_pdvc_gt.yml: -------------------------------------------------------------------------------- 1 | id: anet_tsn_pdvc_gt 2 | base_cfg_path: cfgs/anet_c3d_pdvc_gt.yml 3 | visual_feature_type: ['resnet', 'bn'] 4 | visual_feature_folder: ['data/anet/features/resnet_bn', 'data/anet/features/resnet_bn'] 5 | invalid_video_json: ['data/anet/features/resnet_bn_invalid_videos.json'] 6 | feature_dim: 3072 -------------------------------------------------------------------------------- /cfgs/anet_tsn_pdvcl.yml: -------------------------------------------------------------------------------- 1 | id: anet_tsn_pdvcl 2 | base_cfg_path: cfgs/anet_c3d_pdvcl.yml 3 | visual_feature_type: ['resnet', 'bn'] 4 | visual_feature_folder: ['data/anet/features/resnet_bn', 'data/anet/features/resnet_bn'] 5 | invalid_video_json: ['data/anet/features/resnet_bn_invalid_videos.json'] 6 | feature_dim: 3072 -------------------------------------------------------------------------------- /cfgs/anet_tsn_pdvcl_gt.yml: -------------------------------------------------------------------------------- 1 | id: anet_tsn_pdvcl_gt 2 | base_cfg_path: cfgs/anet_c3d_pdvcl_gt.yml 3 | visual_feature_type: ['resnet', 'bn'] 4 | visual_feature_folder: ['data/anet/features/resnet_bn', 'data/anet/features/resnet_bn'] 5 | invalid_video_json: ['data/anet/features/resnet_bn_invalid_videos.json'] 6 | feature_dim: 3072 -------------------------------------------------------------------------------- /cfgs/anet_tsp_pdvc.yml: -------------------------------------------------------------------------------- 1 | id: anet_tsp_pdvc 2 | base_cfg_path: cfgs/anet_c3d_pdvc.yml 3 | visual_feature_type: ['tsp'] 4 | visual_feature_folder: ['data/anet/features/tsp'] 5 | invalid_video_json: [] 6 | feature_dim: 512 -------------------------------------------------------------------------------- /cfgs/anet_tsp_pdvc_gt.yml: -------------------------------------------------------------------------------- 1 | id: anet_tsp_pdvc_gt 2 | base_cfg_path: cfgs/anet_c3d_pdvc_gt.yml 3 | visual_feature_type: ['tsp'] 4 | visual_feature_folder: ['data/anet/features/tsp'] 5 | invalid_video_json: [] 6 | feature_dim: 512 7 | 
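Many of the YAML files under cfgs/ above are thin overrides: they set an id plus a few feature-related keys and point at a parent file through base_cfg_path (for example, anet_tsp_pdvc.yml builds on anet_c3d_pdvc.yml, which in turn builds on anet_c3d_pdvcl.yml). The sketch below only illustrates that override relationship, under the assumption that a child's keys win over its parent's; the repository's real merging presumably happens in opts.py together with misc.utils.update_values, so this loader is not the project's actual code.

import yaml

def load_cfg(path):
    # Load a config and, if it declares base_cfg_path, merge it on top of its parent.
    with open(path) as f:
        cfg = yaml.safe_load(f) or {}
    base_path = cfg.pop('base_cfg_path', None)
    if base_path:
        merged = load_cfg(base_path)   # recurse: a parent may itself declare a base
        merged.update(cfg)             # assumption: the child's values override the parent's
        return merged
    return cfg

if __name__ == '__main__':
    # e.g. anet_tsp_pdvc.yml -> anet_c3d_pdvc.yml -> anet_c3d_pdvcl.yml
    cfg = load_cfg('cfgs/anet_tsp_pdvc.yml')
    print(cfg['id'], cfg['visual_feature_type'], cfg['feature_dim'])  # anet_tsp_pdvc ['tsp'] 512

Read this way, anet_tsp_pdvc keeps the training and loss settings of the C3D variant and only swaps the feature type, folder and dimensionality, which matches what the files above show.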
-------------------------------------------------------------------------------- /cfgs/anet_tsp_pdvcl.yml: -------------------------------------------------------------------------------- 1 | id: anet_tsp_pdvcl 2 | base_cfg_path: cfgs/anet_c3d_pdvcl.yml 3 | visual_feature_type: ['tsp'] 4 | visual_feature_folder: ['data/anet/features/tsp'] 5 | invalid_video_json: [] 6 | feature_dim: 512 -------------------------------------------------------------------------------- /cfgs/yc2_tsn_pdvc.yml: -------------------------------------------------------------------------------- 1 | id: yc2_tsn_pdvc 2 | base_cfg_path: cfgs/yc2_tsn_pdvcl.yml 3 | 4 | caption_decoder_type: standard 5 | cap_nheads: 1 6 | cap_dec_n_points: 4 7 | cap_num_feature_levels: 4 8 | soft_attention: 1 9 | att_hid_size: 512 10 | 11 | ec_alpha: 1.0 -------------------------------------------------------------------------------- /cfgs/yc2_tsn_pdvc_gt.yml: -------------------------------------------------------------------------------- 1 | id: yc2_tsn_pdvc_gt 2 | base_cfg_path: cfgs/yc2_tsn_pdvcl_gt.yml 3 | 4 | caption_decoder_type: standard 5 | cap_nheads: 1 6 | cap_dec_n_points: 4 7 | cap_num_feature_levels: 4 8 | soft_attention: 1 9 | att_hid_size: 512 -------------------------------------------------------------------------------- /cfgs/yc2_tsn_pdvcl.yml: -------------------------------------------------------------------------------- 1 | id: yc2_tsn_pdvcl 2 | 3 | visual_feature_type: ['resnet', 'bn'] 4 | visual_feature_folder: ['data/yc2/features/resnet_bn/', 'data/yc2/features/resnet_bn/'] 5 | feature_dim: 3072 6 | invalid_video_json: [] 7 | train_caption_file: 'data/yc2/captiondata/yc2_train.json' 8 | val_caption_file: 'data/yc2/captiondata/yc2_val.json' 9 | gt_file_for_eval: ['data/yc2/captiondata/yc2_val.json'] 10 | gt_file_for_para_eval: ['data/yc2/captiondata/para/para_yc2_val.json'] 11 | dict_file: data/yc2/vocabulary_youcook2.json 12 | vocab_size: 1607 13 | 14 | train_proposal_type: gt 15 | train_proposal_sample_num: 30 16 | sample_method: nearest 17 | 18 | batch_size: 1 19 | lr: 0.00005 20 | learning_rate_decay_start: 8 21 | learning_rate_decay_every: 3 22 | learning_rate_decay_rate: 0.5 23 | weight_decay: 0.0001 24 | save_all_checkpoint: 0 25 | 26 | num_queries: 100 27 | dec_layers: 2 28 | enc_layers: 2 29 | transformer_ff_dim: 512 30 | transformer_dropout_prob: 0.1 31 | frame_embedding_num: 200 32 | caption_decoder_type: light 33 | att_hid_size: 0 34 | 35 | with_box_refine: 1 36 | 37 | fix_xcw: 1 38 | set_cost_caption: 0 39 | set_cost_giou: 4 40 | set_cost_bbox: 0 41 | set_cost_class: 2 42 | #cost_alpha: 0.5 43 | #cost_gamma: 1 44 | #focal_alpha: 0.5 45 | #focal_gamma: 1 46 | caption_loss_coef: 2 47 | giou_loss_coef: 4 48 | bbox_loss_coef: 0 49 | cls_loss_coef: 2 50 | count_loss_coef: 0.5 51 | max_eseq_length: 20 52 | lloss_cross_entropy: 0 53 | lloss_focal_loss: 0 54 | lloss_gau_mask: 1 -------------------------------------------------------------------------------- /cfgs/yc2_tsn_pdvcl_gt.yml: -------------------------------------------------------------------------------- 1 | id: yc2_tsn_pdvcl_gt 2 | 3 | visual_feature_type: ['resnet', 'bn'] 4 | visual_feature_folder: ['data/yc2/features/resnet_bn/', 'data/yc2/features/resnet_bn/'] 5 | feature_dim: 3072 6 | invalid_video_json: [] 7 | train_caption_file: 'data/yc2/captiondata/yc2_train.json' 8 | val_caption_file: 'data/yc2/captiondata/yc2_val.json' 9 | gt_file_for_eval: ['data/yc2/captiondata/yc2_val.json'] 10 | gt_file_for_para_eval: 
['data/yc2/captiondata/para/para_yc2_val.json'] 11 | dict_file: data/yc2/vocabulary_youcook2.json 12 | vocab_size: 1607 13 | 14 | train_proposal_type: gt 15 | gt_proposal_sample_num: 30 16 | sample_method: nearest 17 | 18 | batch_size: 1 19 | lr: 0.00005 20 | learning_rate_decay_start: 8 21 | learning_rate_decay_every: 3 22 | learning_rate_decay_rate: 0.5 23 | weight_decay: 0.0001 24 | save_all_checkpoint: 0 25 | 26 | num_queries: 100 27 | dec_layers: 2 28 | enc_layers: 2 29 | transformer_ff_dim: 512 30 | transformer_dropout_prob: 0.1 31 | frame_embedding_num: 200 32 | caption_decoder_type: light 33 | att_hid_size: 0 34 | 35 | #with_box_refine: 1 36 | 37 | fix_xcw: 1 38 | set_cost_caption: 0 39 | set_cost_giou: 4 40 | set_cost_bbox: 0.0001 41 | set_cost_class: 0 42 | #cost_alpha: 0.5 43 | #cost_gamma: 1 44 | #focal_alpha: 0.5 45 | #focal_gamma: 1 46 | caption_loss_coef: 2 47 | giou_loss_coef: 0 48 | bbox_loss_coef: 0 49 | cls_loss_coef: 0 50 | count_loss_coef: 0 51 | #max_eseq_length: 10 52 | #lloss_cross_entropy: 0 53 | #lloss_focal_loss: 0 54 | #lloss_gau_mask: 1 55 | 56 | #two_stage: 1 57 | transformer_input_type: gt_proposals -------------------------------------------------------------------------------- /data/anet/captiondata/para/readme.txt: -------------------------------------------------------------------------------- 1 | ANet-Entities val/test splits (re-split from ANet-caption val_1 and val_2 splits): 2 | https://dl.fbaipublicfiles.com/ActivityNet-Entities/ActivityNet-Entities/anet_entities_captions.tar.gz 3 | 4 | ANet-caption original splits: 5 | http://cs.stanford.edu/people/ranjaykrishna/densevid/captions.zip 6 | 7 | Experiment settings: 8 | Training: use GT segments/sentences in `train.json`, 9 | Validation: use GT segments in `anet_entities_val_1.json`, evaluate against references `anet_entities_val_1_para.json` and `anet_entities_val_2_para.json` 10 | Test: use GT segments in `anet_entities_test_1.json`, evaluate against references `anet_entities_test_1_para.json` and `anet_entities_test_2_para.json` 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /data/anet/features/convert_c3d_h5_to_npy.py: -------------------------------------------------------------------------------- 1 | import os 2 | import h5py 3 | import numpy as np 4 | 5 | in_path = 'sub_activitynet_v1-3.c3d.hdf5' 6 | out_path = 'c3d' 7 | 8 | if not os.path.exists(out_path): 9 | os.mkdir(out_path) 10 | 11 | d = h5py.File(in_path) 12 | for key in d.keys(): 13 | v_d = d[key]['c3d_features'][:].astype('float32') 14 | np.save(os.path.join(out_path, key+'.npy'), v_d) -------------------------------------------------------------------------------- /data/anet/features/convert_tsp_h5_to_npy.py: -------------------------------------------------------------------------------- 1 | import os 2 | import h5py 3 | import numpy as np 4 | 5 | in_paths = [ 6 | 'r2plus1d_34-tsp_on_activitynet-test_features.h5', 7 | 'r2plus1d_34-tsp_on_activitynet-train_features.h5', 8 | 'r2plus1d_34-tsp_on_activitynet-valid_features.h5' 9 | ] 10 | out_path = 'tsp' 11 | 12 | if not os.path.exists(out_path): 13 | os.mkdir(out_path) 14 | 15 | for in_path in in_paths: 16 | d = h5py.File(in_path) 17 | for key in d.keys(): 18 | v_d = d[key][:] 19 | np.save(os.path.join(out_path, key+'.npy'), v_d) 20 | -------------------------------------------------------------------------------- /data/anet/features/download_c3d_features.sh: 
-------------------------------------------------------------------------------- 1 | # Download the C3D feature files , refer to http://activity-net.org/challenges/2016/download.html#c3d to more details. 2 | wget http://ec2-52-25-205-214.us-west-2.compute.amazonaws.com/data/challenge16/features/c3d/activitynet_v1-3.part-00 3 | wget http://ec2-52-25-205-214.us-west-2.compute.amazonaws.com/data/challenge16/features/c3d/activitynet_v1-3.part-01 4 | wget http://ec2-52-25-205-214.us-west-2.compute.amazonaws.com/data/challenge16/features/c3d/activitynet_v1-3.part-02 5 | wget http://ec2-52-25-205-214.us-west-2.compute.amazonaws.com/data/challenge16/features/c3d/activitynet_v1-3.part-03 6 | wget http://ec2-52-25-205-214.us-west-2.compute.amazonaws.com/data/challenge16/features/c3d/activitynet_v1-3.part-04 7 | wget http://ec2-52-25-205-214.us-west-2.compute.amazonaws.com/data/challenge16/features/c3d/activitynet_v1-3.part-05 8 | cat activitynet_v1-3.part-* > c3d_features.zip && unzip c3d_features.zip 9 | python convert_c3d_h5_to_npy.py -------------------------------------------------------------------------------- /data/anet/features/download_i3d_vggish_features.sh: -------------------------------------------------------------------------------- 1 | # download i3d features (rgb+flow) and vggish features of ActivityNet Captions 2 | # Modified from https://github.com/v-iashin/BMT/blob/master/download_data.sh 3 | # Copyright (c) 2020 Vladimir Iashin 4 | 5 | 6 | # checking if wget is installed on a computer 7 | if ! command -v wget &> /dev/null 8 | then 9 | echo "wget: command not found" 10 | echo "" 11 | echo "wget command could not be found on your computer. Please, install it first." 12 | echo "If you cannot/dontwantto install wget, you may try to download the features manually." 13 | echo "You may find the links and correct paths in this file." 
14 | echo "Make sure to check the md5 sums after manual download:" 15 | echo "./data/i3d_25fps_stack64step64_2stream_npy.zip d7266e440f8c616acbc0d8aaa4a336dc" 16 | echo "./data/vggish_npy.zip 9a654ad785e801aceb70af2a5e1cffbe" 17 | echo "./.vector_cache/glove.840B.300d.zip 2ffafcc9f9ae46fc8c95f32372976137" 18 | exit 19 | fi 20 | 21 | 22 | echo "Downloading i3d features" 23 | wget https://a3s.fi/swift/v1/AUTH_a235c0f452d648828f745589cde1219a/bmt/i3d_25fps_stack64step64_2stream_npy.zip -q --show-progress 24 | echo "Downloading vggish features" 25 | wget https://a3s.fi/swift/v1/AUTH_a235c0f452d648828f745589cde1219a/bmt/vggish_npy.zip -q --show-progress 26 | 27 | #echo "Downloading GloVe embeddings" 28 | #mkdir .vector_cache 29 | #cd .vector_cache 30 | #wget https://a3s.fi/swift/v1/AUTH_a235c0f452d648828f745589cde1219a/bmt/glove.840B.300d.zip -q --show-progress 31 | #cd ../ 32 | 33 | echo "Checking for correctness of the downloaded files" 34 | 35 | i3d_md5=($(md5sum ./data/i3d_25fps_stack64step64_2stream_npy.zip)) 36 | if [ "$i3d_md5" == "d7266e440f8c616acbc0d8aaa4a336dc" ]; then 37 | echo "OK: i3d features" 38 | else 39 | echo "ERROR: .zip file with i3d features is corrupted" 40 | exit 1 41 | fi 42 | 43 | vggish_md5=($(md5sum ./data/vggish_npy.zip)) 44 | if [ "$vggish_md5" == "9a654ad785e801aceb70af2a5e1cffbe" ]; then 45 | echo "OK: vggish features" 46 | else 47 | echo "ERROR: .zip file with vggish features is corrupted" 48 | exit 1 49 | fi 50 | 51 | glove_md5=($(md5sum ./.vector_cache/glove.840B.300d.zip)) 52 | if [ "$glove_md5" == "2ffafcc9f9ae46fc8c95f32372976137" ]; then 53 | echo "OK: glove embeddings" 54 | else 55 | echo "ERROR: .zip file with glove embeddings is corrupted" 56 | exit 1 57 | fi 58 | 59 | echo "Unpacking i3d (~1 min)" 60 | 61 | unzip -q i3d_25fps_stack64step64_2stream_npy.zip 62 | echo "Unpacking vggish features" 63 | unzip -q vggish_npy.zip 64 | 65 | echo "Done" -------------------------------------------------------------------------------- /data/anet/features/download_tsn_features.sh: -------------------------------------------------------------------------------- 1 | # Download TSN feature files, refer to https://github.com/salesforce/densecap#data-preparation for more details about feature extraction. 
2 | wget http://youcook2.eecs.umich.edu/static/dat/anet_densecap/training_feat_anet.tar.gz 3 | wget http://youcook2.eecs.umich.edu/static/dat/anet_densecap/validation_feat_anet.tar.gz 4 | wget http://youcook2.eecs.umich.edu/static/dat/anet_densecap/testing_feat_anet.tar.gz 5 | 6 | tar xvzf training_feat_anet.tar.gz 7 | tar xvzf validation_feat_anet.tar.gz 8 | tar xvzf testing_feat_anet.tar.gz 9 | mkdir resnet_bn 10 | mv testing/* resnet_bn 11 | mv training/* resnet_bn 12 | mv validation/* resnet_bn 13 | -------------------------------------------------------------------------------- /data/anet/features/download_tsp_features.sh: -------------------------------------------------------------------------------- 1 | # TSP features from https://github.com/HumamAlwassel/TSP 2 | # download the following files and reformat them into data/features/tsp/VIDEO_ID.npy where VIDEO_ID starts with 'v_' 3 | wget https://github.com/HumamAlwassel/TSP/releases/download/activitynet_features/r2plus1d_34-tsp_on_activitynet-train_features.h5 4 | wget https://github.com/HumamAlwassel/TSP/releases/download/activitynet_features/r2plus1d_34-tsp_on_activitynet-valid_features.h5 5 | wget https://github.com/HumamAlwassel/TSP/releases/download/activitynet_features/r2plus1d_34-tsp_on_activitynet-test_features.h5 6 | python convert_tsp_h5_to_npy.py 7 | -------------------------------------------------------------------------------- /data/anet/features/resnet_bn_invalid_videos.json: -------------------------------------------------------------------------------- 1 | ["v_iVVatZsgnGo", "v_0dkIbKXXFzI", "v_xeOHoiH-dmo", "v_j73Wh1olDsA", "v_IeBCgi4xPIE"] -------------------------------------------------------------------------------- /data/yc2/captiondata/para/convert_to_para.py: -------------------------------------------------------------------------------- 1 | import json 2 | split='val' 3 | p='yc2_{}.json'.format(split) 4 | out_p = 'para_yc2_{}.json'.format(split) 5 | 6 | d = json.load(open(p)) 7 | out = {} 8 | for k,v in d.items(): 9 | para = '. '.join(v['sentences']) 10 | out[k] = para 11 | json.dump(out, open(out_p, 'w')) 12 | -------------------------------------------------------------------------------- /data/yc2/features/download_yc2_tsn_features.sh: -------------------------------------------------------------------------------- 1 | http://youcook2.eecs.umich.edu/static/dat/yc2_densecap/training_feat_yc2.tar.gz 2 | 3 | # Download TSN feature files for the youcook2 dataset, refer to https://github.com/salesforce/densecap#data-preparation for more details about feature extraction. 
4 | wget http://youcook2.eecs.umich.edu/static/dat/yc2_densecap/training_feat_yc2.tar.gz 5 | wget http://youcook2.eecs.umich.edu/static/dat/yc2_densecap/validation_feat_yc2.tar.gz 6 | wget http://youcook2.eecs.umich.edu/static/dat/yc2_densecap/testing_feat_yc2.tar.gz 7 | 8 | tar xvzf training_feat_yc2.tar.gz 9 | tar xvzf validation_feat_yc2.tar.gz 10 | tar xvzf testing_feat_yc2.tar.gz 11 | mkdir resnet_bn 12 | mv testing/* resnet_bn 13 | mv training/* resnet_bn 14 | mv validation/* resnet_bn 15 | -------------------------------------------------------------------------------- /densevid_eval3/eval_dvc.py: -------------------------------------------------------------------------------- 1 | from densevid_eval3.evaluate2018 import main as eval2018 2 | from densevid_eval3.evaluate2021 import main as eval2021 3 | 4 | def eval_dvc(json_path, reference, no_lang_eval=False, topN=1000, version='2018'): 5 | args = type('args', (object,), {})() 6 | args.submission = json_path 7 | args.max_proposals_per_video = topN 8 | args.tious = [0.3,0.5,0.7,0.9] 9 | args.verbose = False 10 | args.no_lang_eval = no_lang_eval 11 | args.references = reference 12 | eval_func = eval2018 if version=='2018' else eval2021 13 | score = eval_func(args) 14 | return score 15 | 16 | if __name__ == '__main__': 17 | p = '../save/pretrained_models/anet_c3d_pdvc/2021-08-21-23-40-05_debug_2021-08-21_20-46-20_epoch8_num4917_score0.json.top3.json' 18 | ref = ['../data/anet/captiondata/val_1.json', '../data/anet/captiondata/val_2.json'] 19 | score = eval_dvc(p, ref, no_lang_eval=False, version='2018') 20 | print(score) -------------------------------------------------------------------------------- /densevid_eval3/eval_para.py: -------------------------------------------------------------------------------- 1 | from densevid_eval3.para_evaluate import ANETcaptions 2 | 3 | def eval_para(prediction, referneces, verbose=False): 4 | args = type('args', (object,), {})() 5 | args.submission = prediction 6 | args.references = referneces 7 | args.all_scorer = True 8 | args.verbose = verbose 9 | 10 | evaluator = ANETcaptions(ground_truth_filenames=args.references, 11 | prediction_filename=args.submission, 12 | verbose=args.verbose, 13 | all_scorer=args.all_scorer) 14 | evaluator.evaluate() 15 | output = {} 16 | 17 | for metric, score in evaluator.scores.items(): 18 | # print ('| %s: %2.4f'%(metric, 100*score)) 19 | output['para_'+metric] = score 20 | return output -------------------------------------------------------------------------------- /densevid_eval3/eval_soda.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | from os.path import dirname, abspath 4 | 5 | pdvc_dir = dirname(dirname(abspath(__file__))) 6 | sys.path.append(pdvc_dir) 7 | sys.path.append(os.path.join(pdvc_dir, 'densevid_eval3/SODA')) 8 | 9 | import numpy as np 10 | from densevid_eval3.SODA.soda import SODA 11 | from densevid_eval3.SODA.dataset import ANETCaptions 12 | from densevid_eval3.eval_para import eval_para 13 | 14 | def eval_tool(prediction, referneces=None, metric='Meteor', soda_type='c', verbose=False): 15 | 16 | args = type('args', (object,), {})() 17 | args.prediction = prediction 18 | args.references = referneces 19 | args.metric = metric 20 | args.soda_type = soda_type 21 | args.tious = [0.3, 0.5, 0.7, 0.9] 22 | args.verbose = verbose 23 | args.multi_reference = False 24 | 25 | data = ANETCaptions.from_load_files(args.references, 26 | args.prediction, 27 | multi_reference=args.multi_reference, 
28 | verbose=args.verbose, 29 | ) 30 | data.preprocess() 31 | if args.soda_type == 'a': 32 | tious = args.tious 33 | else: 34 | tious = None 35 | evaluator = SODA(data, 36 | soda_type=args.soda_type, 37 | tious=tious, 38 | scorer=args.metric, 39 | verbose=args.verbose 40 | ) 41 | result = evaluator.evaluate() 42 | 43 | return result 44 | 45 | def eval_soda(p, ref_list,verbose=False): 46 | score_sum = [] 47 | for ref in ref_list: 48 | r = eval_tool(prediction=p, referneces=[ref], verbose=verbose, soda_type='c') 49 | score_sum.append(r['Meteor']) 50 | soda_avg = np.mean(score_sum, axis=0) #[avg_pre, avg_rec, avg_f1] 51 | soda_c_avg = soda_avg[-1] 52 | results = {'soda_c': soda_c_avg} 53 | return results 54 | 55 | 56 | if __name__ == '__main__': 57 | 58 | p_new = '../save/old/cfgs--base_config_v2_0427--anet_c3d_pdvc_seed358/2021-08-21-21-47-13_debug_2021-08-21_20-46-20_epoch8_num4917_score0_top1000.json' 59 | p_vitr= '../save/old/cfgs--base_config_v2_0427--anet_c3d_pdvc_seed358/2021-08-21-21-47-20_cfgs--base_config_v2_0427--anet_c3d_pdvc_seed358_epoch8_num4917_score0_top1000.json.tmp' 60 | 61 | for p in [p_new, p_vitr]: 62 | print('\n') 63 | print(p) 64 | ref_list = ['data/anet/captiondata/val_1.json', 'data/anet/captiondata/val_2.json'] 65 | score=eval_soda(p, ref_list, verbose=False) 66 | print(score) 67 | para_score = get_para_score(p, referneces=['../data/anet/captiondata/para/anet_entities_val_1_para.json', '../data/anet/captiondata/para/anet_entities_val_2_para.json']) 68 | print(para_score) 69 | 70 | 71 | # metric = ['Meteor', 'Cider'] 72 | # score_type = ['standard_score', 'precision_recall', 'paragraph_score'] 73 | # dvc_score = soda3.eval_tool(predictions=[p], referneces=ref_list, metric=metric,score_type=score_type)[0] 74 | -------------------------------------------------------------------------------- /eval.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import argparse 6 | import json 7 | import os 8 | import sys 9 | import torch 10 | import numpy as np 11 | import time 12 | from os.path import dirname, abspath 13 | 14 | pdvc_dir = dirname(abspath(__file__)) 15 | sys.path.insert(0, pdvc_dir) 16 | sys.path.insert(0, os.path.join(pdvc_dir, 'densevid_eval3')) 17 | sys.path.insert(0, os.path.join(pdvc_dir, 'densevid_eval3/SODA')) 18 | # print(sys.path) 19 | 20 | from eval_utils import evaluate 21 | from pdvc.pdvc import build 22 | from misc.utils import create_logger 23 | from data.video_dataset import PropSeqDataset, collate_fn 24 | from torch.utils.data import DataLoader 25 | from os.path import basename 26 | import pandas as pd 27 | 28 | def create_fake_test_caption_file(metadata_csv_path): 29 | out = {} 30 | df = pd.read_csv(metadata_csv_path) 31 | for i, row in df.iterrows(): 32 | out[basename(row['filename']).split('.')[0]] = {'duration': row['video-duration'], "timestamps": [[0, 0.5]], "sentences":["None"]} 33 | fake_test_json = '.fake_test_json.tmp' 34 | json.dump(out, open(fake_test_json, 'w')) 35 | return fake_test_json 36 | 37 | def main(opt): 38 | folder_path = os.path.join(opt.eval_save_dir, opt.eval_folder) 39 | if opt.eval_mode == 'test': 40 | if not os.path.exists(folder_path): 41 | os.makedirs(folder_path) 42 | logger = create_logger(folder_path, 'val.log') 43 | if opt.eval_model_path: 44 | model_path = opt.eval_model_path 45 | infos_path = os.path.join('/'.join(opt.eval_model_path.split('/')[:-1]), 
'info.json') 46 | else: 47 | model_path = os.path.join(folder_path, 'model-best.pth') 48 | infos_path = os.path.join(folder_path, 'info.json') 49 | 50 | logger.info(vars(opt)) 51 | 52 | with open(infos_path, 'rb') as f: 53 | logger.info('load info from {}'.format(infos_path)) 54 | old_opt = json.load(f)['best']['opt'] 55 | 56 | for k, v in old_opt.items(): 57 | if k[:4] != 'eval': 58 | vars(opt).update({k: v}) 59 | 60 | opt.transformer_input_type = opt.eval_transformer_input_type 61 | 62 | if not torch.cuda.is_available(): 63 | opt.nthreads = 0 64 | # Create the Data Loader instance 65 | 66 | if opt.eval_mode == 'test': 67 | opt.eval_caption_file = create_fake_test_caption_file(opt.test_video_meta_data_csv_path) 68 | opt.visual_feature_folder = opt.test_video_feature_folder 69 | 70 | val_dataset = PropSeqDataset(opt.eval_caption_file, 71 | opt.visual_feature_folder, 72 | opt.dict_file, False, opt.eval_proposal_type, 73 | opt) 74 | loader = DataLoader(val_dataset, batch_size=opt.batch_size_for_eval, 75 | shuffle=False, num_workers=opt.nthreads, collate_fn=collate_fn) 76 | 77 | 78 | model, criterion, postprocessors = build(opt) 79 | model.translator = val_dataset.translator 80 | 81 | 82 | 83 | while not os.path.exists(model_path): 84 | raise AssertionError('File {} does not exist'.format(model_path)) 85 | 86 | logger.debug('Loading model from {}'.format(model_path)) 87 | loaded_pth = torch.load(model_path, map_location=opt.eval_device) 88 | epoch = loaded_pth['epoch'] 89 | 90 | # loaded_pth = transfer(model, loaded_pth, model_path+'.transfer.pth') 91 | model.load_state_dict(loaded_pth['model'], strict=True) 92 | model.eval() 93 | 94 | model.to(opt.eval_device) 95 | 96 | if opt.eval_mode == 'test': 97 | out_json_path = os.path.join(folder_path, 'dvc_results.json') 98 | evaluate(model, criterion, postprocessors, loader, out_json_path, 99 | logger, alpha=opt.ec_alpha, dvc_eval_version=opt.eval_tool_version, device=opt.eval_device, debug=False, skip_lang_eval=True) 100 | 101 | 102 | else: 103 | out_json_path = os.path.join(folder_path, '{}_epoch{}_num{}_alpha{}.json'.format( 104 | time.strftime("%Y-%m-%d-%H-%M-%S_", time.localtime()) + str(opt.id), epoch, len(loader.dataset), 105 | opt.ec_alpha)) 106 | caption_scores, eval_loss = evaluate(model, criterion, postprocessors, loader, out_json_path, 107 | logger, alpha=opt.ec_alpha, dvc_eval_version=opt.eval_tool_version, device=opt.eval_device, debug=False, skip_lang_eval=False) 108 | avg_eval_score = {key: np.array(value).mean() for key, value in caption_scores.items() if key !='tiou'} 109 | avg_eval_score2 = {key: np.array(value).mean() * 4917 / len(loader.dataset) for key, value in caption_scores.items() if key != 'tiou'} 110 | 111 | logger.info( 112 | '\nValidation result based on all 4917 val videos:\n {}\n avg_score:\n{}'.format( 113 | caption_scores.items(), 114 | avg_eval_score)) 115 | 116 | logger.info( 117 | '\nValidation result based on {} available val videos:\n avg_score:\n{}'.format(len(loader.dataset), 118 | avg_eval_score2)) 119 | 120 | logger.info('saving reults json to {}'.format(out_json_path)) 121 | 122 | if __name__ == '__main__': 123 | parser = argparse.ArgumentParser() 124 | parser.add_argument('--eval_save_dir', type=str, default='save') 125 | parser.add_argument('--eval_mode', type=str, default='eval', choices=['eval', 'test']) 126 | parser.add_argument('--test_video_feature_folder', type=str, nargs='+', default=None) 127 | parser.add_argument('--test_video_meta_data_csv_path', type=str, default=None) 128 | 
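    # Illustrative invocation (hypothetical values, not taken from the repository's documentation):
    #   python eval.py --eval_folder anet_tsp_pdvc --eval_transformer_input_type queries --gpu_id 0
    # --eval_folder names a directory under --eval_save_dir (default 'save') that holds
    # model-best.pth and info.json; alternatively, point --eval_model_path at a checkpoint directly.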
parser.add_argument('--eval_folder', type=str, required=True) 129 | parser.add_argument('--eval_model_path', type=str, default='') 130 | parser.add_argument('--eval_tool_version', type=str, default='2018', choices=['2018', '2021']) 131 | parser.add_argument('--eval_caption_file', type=str, default='data/anet/captiondata/val_1.json') 132 | parser.add_argument('--eval_proposal_type', type=str, default='gt') 133 | parser.add_argument('--eval_transformer_input_type', type=str, default='queries', choices=['gt_proposals', 'queries']) 134 | parser.add_argument('--gpu_id', type=str, nargs='+', default=['0']) 135 | parser.add_argument('--eval_device', type=str, default='cuda') 136 | opt = parser.parse_args() 137 | 138 | os.environ["CUDA_VISIBLE_DEVICES"] = ",".join([str(i) for i in opt.gpu_id]) 139 | os.environ['KMP_DUPLICATE_LIB_OK'] = 'True' 140 | if True: 141 | torch.backends.cudnn.enabled = False 142 | main(opt) 143 | -------------------------------------------------------------------------------- /misc/build_vocab.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | import json 3 | 4 | # file_path_list = ["data/captiondata/train_modified.json", "data/captiondata/val_1.json", "data/captiondata/val_2.json"] 5 | file_path_list = ["data/captiondata/yc2/yc2_train.json", "data/captiondata/yc2/yc2_val.json"] 6 | 7 | count_threshold = 2 # 4 for anet, 2 for youcook2 8 | # output_path = './data/vocabulary_activitynet.json' 9 | output_path = './data/vocabulary_youcook2.json' 10 | 11 | mark = [',', ':', '!', '_', ';', '-', '.', '?', '/', '"', '\\n', '\\'] 12 | 13 | count_vocal = {} 14 | 15 | for file_path in file_path_list: 16 | data = json.load(open(file_path)) 17 | video_ids = data.keys() 18 | print('video num of ' + file_path.split('/')[-1], len(video_ids)) 19 | for video_id in video_ids: 20 | sentences = data[video_id]["sentences"] 21 | for sentence in sentences: 22 | for m in mark: 23 | if m in sentence: 24 | sentence = sentence.replace(m, " ") 25 | sentence = sentence.replace(" ", " ") 26 | sentence = sentence.replace(" ", " ") 27 | sentence = sentence.replace(" ", " ") 28 | 29 | sentence = sentence.lstrip() 30 | sentence = sentence.rstrip() 31 | sentence = sentence.lower() 32 | sentence = sentence.split(" ") 33 | length = len(sentence) 34 | 35 | # print(sentence) 36 | for word in sentence: 37 | # print(type(word)) 38 | for m in word: 39 | if m == ' ': 40 | print('warning !') 41 | word = word.replace(m, '') 42 | if word == '': 43 | print('warning !') 44 | pass 45 | count_vocal[word] = count_vocal.get(word, 0) + 1 46 | 47 | print("total word:", sum(count_vocal.values())) 48 | count_vocal[''] = 1e10 49 | count_vocal[''] = 1e10 50 | vocab = [word for word, n in count_vocal.items() if n >= count_threshold] 51 | bad_word = [word for word, n in count_vocal.items() if n < count_threshold] 52 | bad_count = sum(count_vocal[word] for word in bad_word) 53 | 54 | vocab.append('UNK') 55 | print("number of vocab:", len(vocab)) 56 | print("number of bad word:", len(bad_word)) 57 | print("number of unks:", bad_count) 58 | 59 | itow = {i + 1: w for i, w in enumerate(vocab)} 60 | wtoi = {w: i + 1 for i, w in enumerate(vocab)} 61 | print(len(itow)) 62 | print(len(wtoi)) 63 | 64 | json.dump({'ix_to_word': itow, 65 | 'word_to_ix': wtoi}, open(output_path, 'w')) 66 | print("saving vocabulary file to {}".format(output_path)) -------------------------------------------------------------------------------- /misc/detr_utils/box_ops.py: 
-------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | """ 3 | Utilities for bounding box manipulation and GIoU. 4 | """ 5 | import torch 6 | from torchvision.ops.boxes import box_area 7 | 8 | def box_cl_to_xy(x): 9 | c, l = x.unbind(-1) 10 | b = [c - 0.5 * l, c + 0.5 * l] 11 | return torch.stack(b, dim=-1) 12 | 13 | def box_xy_to_cl(x): 14 | x0, x1 = x.unbind(-1) 15 | b = [(x0 + x1) / 2, (x1 - x0)] 16 | return torch.stack(b, dim=-1) 17 | 18 | # modified from torchvision to also return the union 19 | def box_iou(boxes1, boxes2): 20 | area1 = boxes1[:, 1] - boxes1[:, 0] 21 | area2 = boxes2[:, 1] - boxes2[:, 0] 22 | lt = torch.max(boxes1[:, None, 0], boxes2[:, 0]) # [N,M,2] 23 | rb = torch.min(boxes1[:, None, 1], boxes2[:, 1]) # [N,M,2] 24 | inter = (rb - lt).clamp(min=0) # [N,M,2] 25 | union = area1[:, None] + area2 - inter 26 | iou = inter / (union + 1e-5) 27 | return iou, union 28 | 29 | 30 | def generalized_box_iou(boxes1, boxes2): 31 | """ 32 | Generalized IoU from https://giou.stanford.edu/ 33 | 34 | The boxes should be in [x0, y0, x1, y1] format 35 | 36 | Returns a [N, M] pairwise matrix, where N = len(boxes1) 37 | and M = len(boxes2) 38 | """ 39 | # degenerate boxes gives inf / nan results 40 | # so do an early check 41 | assert (boxes1[:, 1:] >= boxes1[:, :1]).all() 42 | assert (boxes2[:, 1:] >= boxes2[:, :1]).all() 43 | iou, union = box_iou(boxes1, boxes2) 44 | lt = torch.min(boxes1[:, None, 0], boxes2[:, 0]) 45 | rb = torch.max(boxes1[:, None, 1], boxes2[:, 1]) 46 | area = (rb - lt).clamp(min=0) # [N,M,2] 47 | giou = iou - (area - union) / (area + 1e-5) 48 | return giou -------------------------------------------------------------------------------- /misc/utils.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | # from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | 6 | import time 7 | import torch 8 | import numpy as np 9 | import glob 10 | import shutil 11 | import os 12 | import colorlog 13 | import random 14 | import six 15 | from six.moves import cPickle 16 | import matplotlib as mpl 17 | 18 | mpl.use('Agg') 19 | import matplotlib.pyplot as plt 20 | 21 | 22 | def match_name_keywords(n, name_keywords): 23 | out = False 24 | for b in name_keywords: 25 | if b in n: 26 | out = True 27 | break 28 | return out 29 | 30 | 31 | def decide_two_stage(transformer_input_type, dt, criterion): 32 | if transformer_input_type == 'gt_proposals': 33 | two_stage = True 34 | proposals = dt['gt_boxes'] 35 | proposals_mask = dt['gt_boxes_mask'] 36 | criterion.matcher.cost_caption = 0 37 | for q_k in ['loss_length', 'loss_ce', 'loss_bbox', 'loss_giou']: 38 | for key in criterion.weight_dict.keys(): 39 | if q_k in key: 40 | criterion.weight_dict[key] = 0 41 | disable_iterative_refine = True 42 | elif transformer_input_type == 'queries': # 43 | two_stage = False 44 | proposals = None 45 | proposals_mask = None 46 | disable_iterative_refine = False 47 | else: 48 | raise ValueError('Wrong value of transformer_input_type, got {}'.format(transformer_input_type)) 49 | return two_stage, disable_iterative_refine, proposals, proposals_mask 50 | 51 | 52 | def pickle_load(f): 53 | """ Load a pickle. 
54 | Parameters 55 | ---------- 56 | f: file-like object 57 | """ 58 | if six.PY3: 59 | return cPickle.load(f, encoding='latin-1') 60 | else: 61 | return cPickle.load(f) 62 | 63 | 64 | def pickle_dump(obj, f): 65 | """ Dump a pickle. 66 | Parameters 67 | ---------- 68 | obj: pickled object 69 | f: file-like object 70 | """ 71 | if six.PY3: 72 | return cPickle.dump(obj, f, protocol=2) 73 | else: 74 | return cPickle.dump(obj, f) 75 | 76 | 77 | def set_seed(seed): 78 | random.seed(seed) 79 | np.random.seed(seed) 80 | torch.manual_seed(seed) 81 | torch.cuda.manual_seed(seed) 82 | torch.cuda.manual_seed_all(seed) 83 | torch.backends.cudnn.deterministic = True 84 | torch.backends.cudnn.benchmark = False 85 | 86 | 87 | def update_values(dict_from, dict_to): 88 | for key, value in dict_from.items(): 89 | if key not in dict_to.keys(): 90 | raise AssertionError('key mismatching: {}'.format(key)) 91 | if isinstance(value, dict): 92 | update_values(dict_from[key], dict_to[key]) 93 | elif value is not None: 94 | dict_to[key] = dict_from[key] 95 | 96 | 97 | def print_opt(opt, model, logger): 98 | print_alert_message('All args:', logger) 99 | for key, item in opt._get_kwargs(): 100 | logger.info('{} = {}'.format(key, item)) 101 | print_alert_message('Model structure:', logger) 102 | logger.info(model) 103 | 104 | 105 | def build_floder(opt): 106 | if opt.start_from: 107 | print('Start training from id:{}'.format(opt.start_from)) 108 | save_folder = os.path.join(opt.save_dir, opt.start_from) 109 | assert os.path.exists(save_folder) 110 | else: 111 | if not os.path.exists(opt.save_dir): 112 | os.mkdir(opt.save_dir) 113 | save_folder = os.path.join(opt.save_dir, opt.id) 114 | if os.path.exists(save_folder): 115 | # wait_flag = input('Warning! ID {} already exists, rename it? 
(Y/N) : '.format(opt.id)) 116 | wait_flag = 'Y' 117 | if wait_flag in ['Y', 'y']: 118 | opt.id = opt.id + '_v_{}'.format(time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())) 119 | save_folder = os.path.join(opt.save_dir, opt.id) 120 | print('Rename opt.id as "{}".'.format(opt.id)) 121 | else: 122 | raise AssertionError('ID already exists, folder {} exists'.format(save_folder)) 123 | print('Results folder "{}" does not exist, creating folder...'.format(save_folder)) 124 | os.mkdir(save_folder) 125 | os.mkdir(os.path.join(save_folder, 'prediction')) 126 | return save_folder 127 | 128 | 129 | def backup_envir(save_folder): 130 | backup_folders = ['cfgs', 'misc', 'pdvc'] 131 | backup_files = glob.glob('./*.py') 132 | for folder in backup_folders: 133 | shutil.copytree(folder, os.path.join(save_folder, 'backup', folder)) 134 | for file in backup_files: 135 | shutil.copyfile(file, os.path.join(save_folder, 'backup', file)) 136 | 137 | 138 | def create_logger(folder, filename): 139 | log_colors = { 140 | 'DEBUG': 'blue', 141 | 'INFO': 'white', 142 | 'WARNING': 'green', 143 | 'ERROR': 'red', 144 | 'CRITICAL': 'yellow', 145 | } 146 | 147 | import logging 148 | logger = logging.getLogger('DVC') 149 | # %(filename)s$RESET:%(lineno)d 150 | # LOGFORMAT = "%(log_color)s%(asctime)s [%(log_color)s%(filename)s:%(lineno)d] | %(log_color)s%(message)s%(reset)s |" 151 | LOGFORMAT = "" 152 | LOG_LEVEL = logging.DEBUG 153 | logging.root.setLevel(LOG_LEVEL) 154 | stream = logging.StreamHandler() 155 | stream.setLevel(LOG_LEVEL) 156 | stream.setFormatter(colorlog.ColoredFormatter(LOGFORMAT, datefmt='%d %H:%M', log_colors=log_colors)) 157 | 158 | # print to log file 159 | hdlr = logging.FileHandler(os.path.join(folder, filename)) 160 | hdlr.setLevel(LOG_LEVEL) 161 | # hdlr.setFormatter(logging.Formatter("[%(asctime)s] %(message)s")) 162 | hdlr.setFormatter(logging.Formatter("%(message)s")) 163 | logger.addHandler(hdlr) 164 | logger.addHandler(stream) 165 | return logger 166 | 167 | 168 | def print_alert_message(str, logger=None): 169 | msg = '*' * 20 + ' ' + str + ' ' + '*' * (58 - len(str)) 170 | if logger: 171 | logger.info('\n\n' + msg) 172 | else: 173 | print(msg) 174 | 175 | 176 | def set_lr(optimizer, lr): 177 | for group in optimizer.param_groups: 178 | group['lr'] = lr 179 | 180 | 181 | def clip_gradient(optimizer, grad_clip): 182 | for group in optimizer.param_groups: 183 | for i, param in enumerate(group['params']): 184 | if param.grad is not None: 185 | param.grad.data.clamp_(-grad_clip, grad_clip) 186 | 187 | 188 | if __name__ == '__main__': 189 | # import opts 190 | # 191 | # info = {'opt': vars(opts.parse_opts()), 192 | # 'loss': {'tap_loss': 0, 'tap_reg_loss': 0, 'tap_conf_loss': 0, 'lm_loss': 0}} 193 | # record_this_run_to_csv(info, 'save/results_all_runs.csv') 194 | 195 | logger = create_logger('./', 'mylogger.log') 196 | logger.info('debug') 197 | logger.info('test2') 198 | -------------------------------------------------------------------------------- /pdvc.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ttengwang/PDVC/0b025c84f42fe27da51c312e8871c4b19628a04c/pdvc.jpg -------------------------------------------------------------------------------- /pdvc/CaptioningHead/LSTM.py: -------------------------------------------------------------------------------- 1 | # This file contains ShowAttendTell and AllImg model 2 | 3 | # ShowAttendTell is from Show, Attend and Tell: Neural Image Caption Generation with Visual Attention 4 | 
# https://arxiv.org/abs/1502.03044 5 | 6 | # AllImg is a model where 7 | # img feature is concatenated with word embedding at every time step as the input of lstm 8 | from __future__ import absolute_import 9 | from __future__ import division 10 | from __future__ import print_function 11 | 12 | import pdb 13 | 14 | import numpy 15 | import torch 16 | import torch.nn as nn 17 | import torch.nn.functional as F 18 | from torch.autograd import * 19 | 20 | class Captioner(nn.Module): 21 | def __init__(self, opt): 22 | super(Captioner, self).__init__() 23 | self.opt = opt 24 | 25 | self.vocab_size = opt.vocab_size 26 | self.input_encoding_size = opt.input_encoding_size 27 | self.rnn_size = opt.rnn_size 28 | self.num_layers = opt.num_layers 29 | self.drop_prob_lm = opt.drop_prob 30 | self.max_caption_len = opt.max_caption_len 31 | 32 | self.ss_prob = 0.0 # Schedule sampling probability 33 | self.embed = nn.Embedding(self.vocab_size + 1, self.input_encoding_size) 34 | 35 | self.logit = nn.Linear(self.rnn_size, self.vocab_size + 1) 36 | self.dropout = nn.Dropout(self.drop_prob_lm) 37 | 38 | self.init_weights() 39 | 40 | def init_weights(self): 41 | initrange = 0.1 42 | self.embed.weight.data.uniform_(-initrange, initrange) 43 | self.logit.bias.data.fill_(0) 44 | self.logit.weight.data.uniform_(-initrange, initrange) 45 | 46 | def init_hidden(self, batch_size): 47 | weight = next(self.parameters()).data 48 | return (weight.new(self.num_layers, batch_size, self.rnn_size).zero_(), 49 | weight.new(self.num_layers, batch_size, self.rnn_size).zero_()) # (h0, c0) 50 | 51 | def build_loss(self, input, target, mask): 52 | one_hot = torch.nn.functional.one_hot(target, self.opt.vocab_size+1) 53 | max_len = input.shape[1] 54 | output = - (one_hot[:, :max_len] * input * mask[:, :max_len, None]).sum(2).sum(1) / (mask.sum(1) + 1e-6) 55 | return output 56 | 57 | def forward(self, event, clip, clip_mask, seq): 58 | batch_size = clip.shape[0] 59 | 60 | state = self.init_hidden(batch_size) 61 | outputs = [] 62 | seq = seq.long() 63 | 64 | for i in range(seq.size(1) - 1): 65 | if self.training and i >= 1 and self.ss_prob > 0.0: # otherwiste no need to sample 66 | sample_prob = clip.data.new(batch_size).uniform_(0, 1) 67 | sample_mask = sample_prob < self.ss_prob 68 | if sample_mask.sum() == 0: 69 | it = seq[:, i].clone() 70 | else: 71 | sample_ind = sample_mask.nonzero().view(-1) 72 | it = seq[:, i].data.clone() 73 | prob_prev = torch.exp(outputs[-1].data) # fetch prev distribution: shape Nx(M+1) 74 | it.index_copy_(0, sample_ind, torch.multinomial(prob_prev, 1).view(-1).index_select(0, sample_ind)) 75 | it = Variable(it, requires_grad=False) 76 | else: 77 | it = seq[:, i].clone() 78 | # break if all the sequences end 79 | if i >= 1 and seq[:, i].data.sum() == 0: 80 | break 81 | 82 | output, state = self.get_logprobs_state(it, event, clip, clip_mask, state) 83 | outputs.append(output) 84 | 85 | return torch.cat([_.unsqueeze(1) for _ in outputs], 1) 86 | 87 | 88 | def get_logprobs_state(self, it, event , clip, clip_mask, state): 89 | xt = self.embed(it) 90 | output, state = self.core(xt, event , clip, clip_mask, state) 91 | logprobs = F.log_softmax(self.logit(self.dropout(output)), dim=1) 92 | return logprobs, state 93 | 94 | def sample(self, event , clip, clip_mask, opt={}): 95 | 96 | sample_max = opt.get('sample_max', 1) 97 | beam_size = opt.get('beam_size', 1) 98 | temperature = opt.get('temperature', 1.0) 99 | 100 | batch_size = clip.shape[0] 101 | 102 | state = self.init_hidden(batch_size) 103 | 104 | seq = [] 105 
| seqLogprobs = [] 106 | 107 | for t in range(self.max_caption_len + 1): 108 | if t == 0: # input 109 | it = clip.data.new(batch_size).long().zero_() 110 | elif sample_max: 111 | sampleLogprobs, it = torch.max(logprobs.data, 1) 112 | it = it.view(-1).long() 113 | else: 114 | if temperature == 1.0: 115 | prob_prev = torch.exp(logprobs.data) # fetch prev distribution: shape Nx(M+1) 116 | else: 117 | # scale logprobs by temperature 118 | prob_prev = torch.exp(torch.div(logprobs.data, temperature)) 119 | it = torch.multinomial(prob_prev, 1) 120 | sampleLogprobs = logprobs.gather(1, it) # gather the logprobs at sampled positions 121 | it = it.view(-1).long() # and flatten indices for downstream processing 122 | 123 | logprobs, state = self.get_logprobs_state(it, event , clip, clip_mask, state) 124 | 125 | if t >= 1: 126 | # stop when all finished 127 | if t == 1: 128 | unfinished = it > 0 129 | else: 130 | unfinished = unfinished & (it > 0) 131 | if unfinished.sum() == 0: 132 | break 133 | it = it * unfinished.type_as(it) 134 | seq.append(it) #seq[t] the input of t+2 time step 135 | seqLogprobs.append(sampleLogprobs.view(-1)) 136 | 137 | if seq==[] or len(seq)==0: 138 | return [],[] 139 | return torch.cat([_.unsqueeze(1) for _ in seq], 1), torch.cat([_.unsqueeze(1) for _ in seqLogprobs], 1) 140 | 141 | class AllImgCore(nn.Module): 142 | def __init__(self, opt): 143 | super(AllImgCore, self).__init__() 144 | self.input_encoding_size = opt.input_encoding_size 145 | self.rnn_size = opt.rnn_size 146 | self.num_layers = opt.num_layers 147 | self.drop_prob_lm = opt.drop_prob 148 | self.att_feat_size = opt.clip_context_dim 149 | 150 | self.opt = opt 151 | self.wordRNN_input_feats_type = opt.wordRNN_input_feats_type 152 | self.input_dim = self.decide_input_feats_dim() 153 | self.rnn = nn.LSTM(self.input_encoding_size + self.input_dim, 154 | self.rnn_size, self.num_layers, bias=False, dropout=self.drop_prob_lm) 155 | assert self.wordRNN_input_feats_type == 'C' 156 | 157 | def decide_input_feats_dim(self): 158 | dim = 0 159 | if 'E' in self.wordRNN_input_feats_type: 160 | dim += self.opt.event_context_dim 161 | if 'C' in self.wordRNN_input_feats_type: 162 | dim += self.opt.clip_context_dim 163 | return dim 164 | 165 | def forward(self, xt, event, clip, clip_mask, state): 166 | input_feats = (clip * clip_mask.unsqueeze(2)).sum(1) / (clip_mask.sum(1, keepdims=True) + 1e-5) 167 | output, state = self.rnn(torch.cat([xt, input_feats], 1).unsqueeze(0), state) 168 | return output.squeeze(0), state 169 | 170 | 171 | class LightCaptioner(Captioner): 172 | def __init__(self, opt): 173 | super(LightCaptioner, self).__init__(opt) 174 | self.core = AllImgCore(opt) 175 | -------------------------------------------------------------------------------- /pdvc/CaptioningHead/Puppet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class PuppetCaptionModel(nn.Module): 6 | def __init__(self, opt): 7 | super(PuppetCaptionModel, self).__init__() 8 | self.vocab_size = opt.vocab_size 9 | self.opt = opt 10 | self.puppet_layer= nn.Linear(1,1) 11 | 12 | def forward(self, event, clip, clip_mask, seq): 13 | N, L = seq.shape 14 | output = torch.zeros((N, L-1, self.vocab_size + 1), device=seq.device) 15 | return output 16 | 17 | def sample(self, event, clip, clip_mask, opt={}): 18 | N, _, C = clip.shape 19 | output = torch.zeros((N, 3), device=clip.device) 20 | prob = torch.zeros((N, 3), device=clip.device) 21 | return output, prob 22 | 23 | def 
build_loss(self, input, target, mask): 24 | one_hot = torch.nn.functional.one_hot(target, self.opt.vocab_size+1) 25 | output = - (one_hot * input * mask[..., None]).sum(2).sum(1) / (mask.sum(1) + 1e-6) 26 | return output -------------------------------------------------------------------------------- /pdvc/CaptioningHead/__init__.py: -------------------------------------------------------------------------------- 1 | from .LSTM import LightCaptioner 2 | from .Puppet import PuppetCaptionModel 3 | from .LSTM_DSA import LSTMDSACaptioner 4 | 5 | def build_captioner(opt): 6 | if opt.caption_decoder_type == 'none': 7 | caption_embed = PuppetCaptionModel(opt) 8 | 9 | elif opt.caption_decoder_type == 'light': 10 | opt.event_context_dim = None 11 | opt.clip_context_dim = opt.hidden_dim 12 | caption_embed = LightCaptioner(opt) 13 | 14 | elif opt.caption_decoder_type == 'standard': 15 | opt.event_context_dim = None 16 | opt.clip_context_dim = opt.hidden_dim 17 | caption_embed = LSTMDSACaptioner(opt) 18 | 19 | else: 20 | raise ValueError('caption decoder type is invalid') 21 | return caption_embed 22 | 23 | -------------------------------------------------------------------------------- /pdvc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ttengwang/PDVC/0b025c84f42fe27da51c312e8871c4b19628a04c/pdvc/__init__.py -------------------------------------------------------------------------------- /pdvc/base_encoder.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # PDVC 3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 7 | # ------------------------------------------------------------------------ 8 | # Modified from DETR (https://github.com/facebookresearch/detr) 9 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 10 | # ------------------------------------------------------------------------ 11 | 12 | """ 13 | Base Encoder to create multi-level conv features and positional embedding. 
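As an aside on the captioning heads defined above: a hedged sketch of how `build_captioner` (from `pdvc/CaptioningHead/__init__.py`) would be driven by a config. The `argparse.Namespace` and every value in it are stand-ins for the project's real option object, chosen only to satisfy the constructors shown above.

```python
# Hedged usage sketch for build_captioner; the Namespace and its values are illustrative stand-ins.
from argparse import Namespace
from pdvc.CaptioningHead import build_captioner      # assumes the repo root is on PYTHONPATH

opt = Namespace(
    caption_decoder_type='light',    # 'none' -> PuppetCaptionModel, 'light' -> LightCaptioner,
                                     # 'standard' -> LSTMDSACaptioner
    hidden_dim=512,                  # becomes opt.clip_context_dim inside build_captioner
    vocab_size=5747,                 # illustrative; the real value comes from the vocabulary json
    input_encoding_size=512,
    rnn_size=512,
    num_layers=1,
    drop_prob=0.5,
    max_caption_len=30,
    wordRNN_input_feats_type='C',    # LightCaptioner asserts clip-only ('C') input features
)
captioner = build_captioner(opt)
print(type(captioner).__name__)      # LightCaptioner
```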
14 | """ 15 | 16 | import torch 17 | import torch.nn.functional as F 18 | from torch import nn 19 | from misc.detr_utils.misc import NestedTensor 20 | from .position_encoding import PositionEmbeddingSine 21 | 22 | 23 | class BaseEncoder(nn.Module): 24 | def __init__(self, num_feature_levels, vf_dim, hidden_dim): 25 | super(BaseEncoder, self).__init__() 26 | self.pos_embed = PositionEmbeddingSine(hidden_dim//2, normalize=True) 27 | self.num_feature_levels = num_feature_levels 28 | self.hidden_dim = hidden_dim 29 | 30 | if num_feature_levels > 1: 31 | input_proj_list = [] 32 | in_channels = vf_dim 33 | input_proj_list.append(nn.Sequential( 34 | nn.Conv1d(in_channels, hidden_dim, kernel_size=1), 35 | nn.GroupNorm(32, hidden_dim), 36 | )) 37 | for _ in range(num_feature_levels - 1): 38 | input_proj_list.append(nn.Sequential( 39 | nn.Conv1d(in_channels, hidden_dim, kernel_size=3, stride=2, padding=1), 40 | nn.GroupNorm(32, hidden_dim), 41 | )) 42 | in_channels = hidden_dim 43 | self.input_proj = nn.ModuleList(input_proj_list) 44 | else: 45 | self.input_proj = nn.ModuleList([ 46 | nn.Sequential( 47 | nn.Conv2d(vf_dim, hidden_dim, kernel_size=1), 48 | nn.GroupNorm(32, hidden_dim), 49 | )]) 50 | 51 | for proj in self.input_proj: 52 | nn.init.xavier_uniform_(proj[0].weight, gain=1) 53 | nn.init.constant_(proj[0].bias, 0) 54 | 55 | def forward(self, vf, mask, duration): 56 | # vf: (N, L, C), mask: (N, L), duration: (N) 57 | vf = vf.transpose(1, 2) # (N, L, C) --> (N, C, L) 58 | vf_nt = NestedTensor(vf, mask, duration) 59 | pos0 = self.pos_embed(vf_nt) 60 | 61 | srcs = [] 62 | masks = [] 63 | poses = [] 64 | 65 | src0, mask0 = vf_nt.decompose() 66 | srcs.append(self.input_proj[0](src0)) 67 | masks.append(mask0) 68 | poses.append(pos0) 69 | assert mask is not None 70 | 71 | for l in range(1, self.num_feature_levels): 72 | if l == 1: 73 | src = self.input_proj[l](vf_nt.tensors) 74 | else: 75 | src = self.input_proj[l](srcs[-1]) 76 | m = vf_nt.mask 77 | mask = F.interpolate(m[None].float(), size=src.shape[-1:]).to(torch.bool)[0] 78 | pos_l = self.pos_embed(NestedTensor(src, mask, duration)).to(src.dtype) 79 | srcs.append(src) 80 | masks.append(mask) 81 | poses.append(pos_l) 82 | return srcs, masks, poses 83 | 84 | def build_base_encoder(args): 85 | base_encoder = BaseEncoder(args.num_feature_levels, args.feature_dim, args.hidden_dim) 86 | return base_encoder 87 | -------------------------------------------------------------------------------- /pdvc/matcher.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | """ 11 | Modules to compute the matching cost and solve the corresponding LSAP. 
12 | """ 13 | import torch 14 | from scipy.optimize import linear_sum_assignment 15 | from torch import nn 16 | 17 | from misc.detr_utils.box_ops import box_cl_to_xy, generalized_box_iou 18 | 19 | 20 | class HungarianMatcher(nn.Module): 21 | """This class computes an assignment between the targets and the predictions of the network 22 | 23 | For efficiency reasons, the targets don't include the no_object. Because of this, in general, 24 | there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, 25 | while the others are un-matched (and thus treated as non-objects). 26 | """ 27 | 28 | def __init__(self, 29 | cost_class: float = 1, 30 | cost_bbox: float = 1, 31 | cost_giou: float = 1, 32 | cost_alpha = 0.25, 33 | cost_gamma = 2): 34 | """Creates the matcher 35 | 36 | Params: 37 | cost_class: This is the relative weight of the classification error in the matching cost 38 | cost_bbox: This is the relative weight of the L1 error of the bounding box coordinates in the matching cost 39 | cost_giou: This is the relative weight of the giou loss of the bounding box in the matching cost 40 | """ 41 | super().__init__() 42 | self.cost_class = cost_class 43 | self.cost_bbox = cost_bbox 44 | self.cost_giou = cost_giou 45 | # self.cost_caption = cost_caption 46 | self.cost_alpha = cost_alpha 47 | self.cost_gamma = cost_gamma 48 | 49 | assert cost_class != 0 or cost_bbox != 0 or cost_giou != 0 or cost_caption!=0, "all costs cant be 0" 50 | 51 | def forward(self, outputs, targets, verbose=False, many_to_one=False): 52 | """ Performs the matching 53 | 54 | Params: 55 | outputs: This is a dict that contains at least these entries: 56 | "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits 57 | "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates 58 | 59 | targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing: 60 | "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth 61 | objects in the target) containing the class labels 62 | "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates 63 | 64 | Returns: 65 | A list of size batch_size, containing tuples of (index_i, index_j) where: 66 | - index_i is the indices of the selected predictions (in order) 67 | - index_j is the indices of the corresponding selected targets (in order) 68 | For each batch element, it holds: 69 | len(index_i) = len(index_j) = min(num_queries, num_target_boxes) 70 | """ 71 | with torch.no_grad(): 72 | bs, num_queries = outputs["pred_logits"].shape[:2] 73 | 74 | # We flatten to compute the cost matrices in a batch 75 | out_prob = outputs["pred_logits"].flatten(0, 1).sigmoid() 76 | out_bbox = outputs["pred_boxes"].flatten(0, 1) # [batch_size * num_queries, 4] 77 | 78 | # Also concat the target labels and boxes 79 | tgt_ids = torch.cat([v["labels"] for v in targets]) 80 | tgt_bbox = torch.cat([v["boxes"] for v in targets]) 81 | 82 | # Compute the classification cost. 
83 | # alpha = 0.25 84 | alpha = self.cost_alpha 85 | gamma = self.cost_gamma 86 | neg_cost_class = (1 - alpha) * (out_prob ** gamma) * (-(1 - out_prob + 1e-8).log()) 87 | pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log()) 88 | cost_class = pos_cost_class[:, tgt_ids] - neg_cost_class[:, tgt_ids] 89 | 90 | # Compute the L1 cost between boxes 91 | cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1) 92 | 93 | # Compute the giou cost betwen boxes 94 | cost_giou = -generalized_box_iou(box_cl_to_xy(out_bbox), 95 | box_cl_to_xy(tgt_bbox)) 96 | 97 | # cost_caption = outputs['caption_costs'].flatten(0, 1) 98 | 99 | # Final cost matrix 100 | C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou 101 | 102 | costs = {'cost_bbox': cost_bbox, 103 | 'cost_class': cost_class, 104 | 'cost_giou': cost_giou, 105 | # 'cost_caption': cost_caption, 106 | 'out_bbox': out_bbox[:, 0::2]} 107 | 108 | if verbose: 109 | print('\n') 110 | print(self.cost_bbox, cost_bbox.var(dim=0), cost_bbox.max(dim=0)[0] - cost_bbox.min(dim=0)[0]) 111 | print(self.cost_class, cost_class.var(dim=0), cost_class.max(dim=0)[0] - cost_class.min(dim=0)[0]) 112 | print(self.cost_giou, cost_giou.var(dim=0), cost_giou.max(dim=0)[0] - cost_giou.min(dim=0)[0]) 113 | # print(self.cost_caption, cost_caption.var(dim=0), cost_caption.max(dim=0)[0] - cost_caption.min(dim=0)[0]) 114 | 115 | C = C.view(bs, num_queries, -1).cpu() 116 | 117 | sizes = [len(v["boxes"]) for v in targets] 118 | # pdb.set_trace() 119 | indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))] 120 | m2o_rate = 4 121 | rl_indices = [linear_sum_assignment(torch.cat([c[i]]*m2o_rate, -1)) for i, c in enumerate(C.split(sizes, -1))] 122 | rl_indices = [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j%sizes[ii], dtype=torch.int64)) for ii,(i, j) in 123 | enumerate(rl_indices)] 124 | 125 | indices = [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices] 126 | 127 | if verbose: 128 | print('------matching results:') 129 | print(indices) 130 | for indice in indices: 131 | for i, j in zip(*indice): 132 | print(out_bbox[i][0::2], tgt_bbox[j][0::2]) 133 | print('-----topK scores:') 134 | topk_indices = out_prob.topk(10, dim=0) 135 | print(topk_indices) 136 | for i,(v,ids) in enumerate(zip(*topk_indices)): 137 | print('top {}'.format(i)) 138 | s= '' 139 | for name,cost in costs.items(): 140 | s += name + ':{} '.format(cost[ids]) 141 | print(s) 142 | 143 | return indices, rl_indices 144 | 145 | 146 | def build_matcher(args): 147 | return HungarianMatcher(cost_class=args.set_cost_class, 148 | cost_bbox=args.set_cost_bbox, 149 | cost_giou=args.set_cost_giou, 150 | cost_alpha = args.cost_alpha, 151 | cost_gamma = args.cost_gamma 152 | ) 153 | -------------------------------------------------------------------------------- /pdvc/ops/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ttengwang/PDVC/0b025c84f42fe27da51c312e8871c4b19628a04c/pdvc/ops/__init__.py -------------------------------------------------------------------------------- /pdvc/ops/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
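A self-contained sketch of the matching step in `HungarianMatcher` above, using dummy numbers: the focal-style classification cost and the L1 segment cost are combined and handed to `linear_sum_assignment`. The gIoU term is omitted here for brevity, and the cost weights below are illustrative rather than the repo's configured values.

```python
# Hungarian matching on dummy predictions, mirroring HungarianMatcher.forward for one video.
import torch
from scipy.optimize import linear_sum_assignment

num_queries, num_targets = 4, 2
out_prob = torch.rand(num_queries, 1)              # sigmoid scores for the single foreground class
out_bbox = torch.rand(num_queries, 2)              # predicted (center, length) segments
tgt_ids = torch.zeros(num_targets, dtype=torch.long)
tgt_bbox = torch.rand(num_targets, 2)

alpha, gamma = 0.25, 2                              # cost_alpha / cost_gamma defaults
neg = (1 - alpha) * out_prob ** gamma * (-(1 - out_prob + 1e-8).log())
pos = alpha * (1 - out_prob) ** gamma * (-(out_prob + 1e-8).log())
cost_class = pos[:, tgt_ids] - neg[:, tgt_ids]      # focal-style classification cost
cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1)    # L1 cost on (center, length)

C = 2 * cost_class + 5 * cost_bbox                  # illustrative weights; the repo reads them from opts
row, col = linear_sum_assignment(C.numpy())
print(list(zip(row.tolist(), col.tolist())))        # matched (query, target) pairs
```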
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from .ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch 10 | 11 | -------------------------------------------------------------------------------- /pdvc/ops/functions/ms_deform_attn_func.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from __future__ import absolute_import 10 | from __future__ import print_function 11 | from __future__ import division 12 | 13 | import torch 14 | import torch.nn.functional as F 15 | from torch.autograd import Function 16 | from torch.autograd.function import once_differentiable 17 | 18 | try: 19 | import MultiScaleDeformableAttention as MSDA 20 | except: 21 | pass 22 | 23 | class MSDeformAttnFunction(Function): 24 | @staticmethod 25 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): 26 | # sampling_locations:(...,2), the first item of last dim means x axis corresponding to w, and second item of the last dim means y, corresponding to h. 
27 | ctx.im2col_step = im2col_step 28 | output = MSDA.ms_deform_attn_forward( 29 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) 30 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) 31 | return output 32 | 33 | @staticmethod 34 | @once_differentiable 35 | def backward(ctx, grad_output): 36 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors 37 | grad_value, grad_sampling_loc, grad_attn_weight = \ 38 | MSDA.ms_deform_attn_backward( 39 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) 40 | 41 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None 42 | 43 | 44 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights, return_value=False): 45 | # for debug and test only, 46 | # need to use cuda version instead 47 | N_, S_, M_, D_ = value.shape # N_: batch size , S_: \sum_H*W, M_ : head number, D_: feature dim of each head 48 | 49 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape # Lq_: \sum H*W, L_: multi-scale number, P_: number of sampled key points 50 | 51 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) 52 | sampling_grids = 2 * sampling_locations - 1 # convert value from range[0,1] to [-1, 1] 53 | sampling_value_list = [] 54 | for lid_, (H_, W_) in enumerate(value_spatial_shapes): 55 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ 56 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) 57 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 58 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) 59 | # sampling_grid_l_: (...,2), the first item of last dim means x axis corresponding to w, and second item of the last dim means y, corresponding to h. 60 | # N_*M_, D_, Lq_, P_ 61 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, 62 | mode='bilinear', padding_mode='border', align_corners=False) 63 | sampling_value_list.append(sampling_value_l_) 64 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) 65 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) 66 | 67 | if return_value: 68 | return torch.stack(sampling_value_list, dim=-2) 69 | #(N_ * M_, D_, Lq_, L_* P_) * (N_*M_, 1, Lq_, L_*P_) --> (N_*M_, D_, Lq_) 70 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) 71 | return output.transpose(1, 2).contiguous() 72 | -------------------------------------------------------------------------------- /pdvc/ops/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
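The pure-PyTorch fallback `ms_deform_attn_core_pytorch` above can be exercised without the compiled CUDA extension. A hedged sketch with small shapes follows, storing each temporal level as a (1, T) grid; the repo's own `pdvc/ops/test.py` runs the same kind of check against the CUDA kernel.

```python
# CPU reference run of ms_deform_attn_core_pytorch (illustrative shapes; assumes the repo
# root is on PYTHONPATH so pdvc.ops.functions is importable).
import torch
from pdvc.ops.functions import ms_deform_attn_core_pytorch

N, M, D = 1, 2, 4                      # batch, heads, channels per head
L, P, Lq = 2, 4, 3                     # levels, points per level, number of queries
shapes = [(1, 16), (1, 8)]             # each temporal level stored as an (H, W) = (1, T) grid
S = sum(h * w for h, w in shapes)

value = torch.rand(N, S, M, D)
sampling_locations = torch.rand(N, Lq, M, L, P, 2)            # (x, y) in [0, 1]
attention_weights = torch.rand(N, Lq, M, L, P)
attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)

out = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights)
print(out.shape)                       # torch.Size([1, 3, 8]) == (N, Lq, M * D)
```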
5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------------------------------ 7 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | # ------------------------------------------------------------------------------------------------ 9 | python setup.py build install 10 | -------------------------------------------------------------------------------- /pdvc/ops/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from .ms_deform_attn import MSDeformAttn 10 | from .ms_deform_attn_for_caption import MSDeformAttnCap -------------------------------------------------------------------------------- /pdvc/ops/modules/ms_deform_attn.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from __future__ import absolute_import 10 | from __future__ import print_function 11 | from __future__ import division 12 | 13 | import warnings 14 | import math 15 | 16 | import torch 17 | from torch import nn 18 | import torch.nn.functional as F 19 | from torch.nn.init import xavier_uniform_, constant_ 20 | 21 | from ..functions import MSDeformAttnFunction, ms_deform_attn_core_pytorch 22 | 23 | 24 | def _is_power_of_2(n): 25 | if (not isinstance(n, int)) or (n < 0): 26 | raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n))) 27 | return (n & (n-1) == 0) and n != 0 28 | 29 | 30 | class MSDeformAttn(nn.Module): 31 | def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4): 32 | """ 33 | Multi-Scale Deformable Attention Module 34 | :param d_model hidden dimension 35 | :param n_levels number of feature levels 36 | :param n_heads number of attention heads 37 | :param n_points number of sampling points per attention head per feature level 38 | """ 39 | super().__init__() 40 | if d_model % n_heads != 0: 41 | raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads)) 42 | _d_per_head = d_model // n_heads 43 | # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation 44 | if not _is_power_of_2(_d_per_head): 45 | warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 " 46 | "which 
is more efficient in our CUDA implementation.") 47 | 48 | self.im2col_step = 64 49 | 50 | self.d_model = d_model 51 | self.n_levels = n_levels 52 | self.n_heads = n_heads 53 | self.n_points = n_points 54 | 55 | self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points ) 56 | self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points) 57 | self.value_proj = nn.Linear(d_model, d_model) 58 | self.output_proj = nn.Linear(d_model, d_model) 59 | 60 | self._reset_parameters() 61 | 62 | def _reset_parameters(self): 63 | constant_(self.sampling_offsets.weight.data, 0.) 64 | thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2 * math.pi / self.n_heads) 65 | grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) 66 | grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2) 67 | grid_init = grid_init[..., 0].repeat(1, self.n_levels, self.n_points) 68 | for i in range(self.n_points): 69 | grid_init[:, :, i] *= i + 1 70 | with torch.no_grad(): 71 | self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) 72 | constant_(self.attention_weights.weight.data, 0.) 73 | constant_(self.attention_weights.bias.data, 0.) 74 | xavier_uniform_(self.value_proj.weight.data) 75 | constant_(self.value_proj.bias.data, 0.) 76 | xavier_uniform_(self.output_proj.weight.data) 77 | constant_(self.output_proj.bias.data, 0.) 78 | 79 | def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None): 80 | """ 81 | :param query (N, Length_{query}, C) 82 | :param reference_points (N, Length_{query}, n_levels, 1), range in [0, 1], including padding area 83 | or (N, Length_{query}, n_levels, 2), add additional (c, l) to form reference boxes 84 | :param input_flatten (N, \sum_{l=0}^{L-1} T_l, C) 85 | :param input_spatial_shapes (n_levels ), [T_0, T_1, ..., T_{L-1}] 86 | :param input_level_start_index (n_levels ), [0, 1_0, T_0+T_1, ...] 
87 | :param input_padding_mask (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements 88 | 89 | :return output (N, Length_{query}, C) 90 | """ 91 | N, Len_q, _ = query.shape 92 | N, Len_in, _ = input_flatten.shape 93 | assert input_spatial_shapes.sum() == Len_in 94 | 95 | value = self.value_proj(input_flatten) 96 | if input_padding_mask is not None: 97 | value = value.masked_fill(input_padding_mask[..., None], float(0)) 98 | value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads) 99 | sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points) 100 | attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points) 101 | attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points) 102 | # N, Len_q, n_heads, n_levels, n_points, 2 103 | if reference_points.shape[-1] == 1: 104 | offset_normalizer = input_spatial_shapes 105 | sampling_locations = reference_points[:, :, None, :, None, 0] \ 106 | + sampling_offsets / offset_normalizer[None, None, None, :, None] 107 | elif reference_points.shape[-1] == 2: 108 | sampling_locations = reference_points[:, :, None, :, None, 0] \ 109 | + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 1] * 0.5 110 | else: 111 | raise ValueError( 112 | 'Last dim of reference_points must be 1 or 2, but get {} instead.'.format(reference_points.shape[-1])) 113 | 114 | if True: 115 | sampling_locations = torch.stack( 116 | (sampling_locations, 0.5 * sampling_locations.new_ones(sampling_locations.shape)), -1) 117 | input_spatial_shapes = torch.stack([input_spatial_shapes.new_ones(input_spatial_shapes.shape), input_spatial_shapes], -1) 118 | 119 | if query.device.type == 'cuda': 120 | output = MSDeformAttnFunction.apply( 121 | value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, 122 | self.im2col_step) 123 | else: 124 | output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights) 125 | output = self.output_proj(output) 126 | return output 127 | -------------------------------------------------------------------------------- /pdvc/ops/modules/ms_deform_attn_for_caption.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
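The `if True:` block at the end of `MSDeformAttn.forward` above is the 1-D-to-2-D lift that lets the 2-D deformable-attention kernel be reused for temporal features: every sampling position gets a fixed y = 0.5, and every level of length T is declared to be a (1, T) grid. A small sketch of just that lift, with illustrative shapes:

```python
# The 1-D -> 2-D lift applied before calling the attention kernel (shapes are illustrative).
import torch

N, Lq, n_heads, n_levels, n_points = 1, 3, 2, 2, 4
sampling_locations = torch.rand(N, Lq, n_heads, n_levels, n_points)   # temporal x positions in [0, 1]
input_spatial_shapes = torch.as_tensor([16, 8], dtype=torch.long)     # temporal length per level

sampling_locations_2d = torch.stack(
    (sampling_locations, 0.5 * sampling_locations.new_ones(sampling_locations.shape)), -1)
input_spatial_shapes_2d = torch.stack(
    [input_spatial_shapes.new_ones(input_spatial_shapes.shape), input_spatial_shapes], -1)

print(sampling_locations_2d.shape)       # (1, 3, 2, 2, 4, 2): last dim is (x, y) with y fixed at 0.5
print(input_spatial_shapes_2d.tolist())  # [[1, 16], [1, 8]]
```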
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from __future__ import absolute_import 10 | from __future__ import print_function 11 | from __future__ import division 12 | 13 | import warnings 14 | import math 15 | 16 | import torch 17 | from torch import nn 18 | import torch.nn.functional as F 19 | from torch.nn.init import xavier_uniform_, constant_ 20 | 21 | from ..functions import MSDeformAttnFunction, ms_deform_attn_core_pytorch 22 | 23 | 24 | def _is_power_of_2(n): 25 | if (not isinstance(n, int)) or (n < 0): 26 | raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n))) 27 | return (n & (n-1) == 0) and n != 0 28 | 29 | 30 | class MSDeformAttnCap(nn.Module): 31 | def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4,): 32 | """ 33 | Multi-Scale Deformable Attention Module 34 | :param d_model hidden dimension 35 | :param n_levels number of feature levels 36 | :param n_heads number of attention heads 37 | :param n_points number of sampling points per attention head per feature level 38 | """ 39 | super().__init__() 40 | if d_model % n_heads != 0: 41 | raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads)) 42 | _d_per_head = d_model // n_heads 43 | # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation 44 | if not _is_power_of_2(_d_per_head): 45 | warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 " 46 | "which is more efficient in our CUDA implementation.") 47 | 48 | self.im2col_step = 64 49 | self.d_model = d_model 50 | self.n_levels = n_levels 51 | self.n_heads = n_heads 52 | self.n_points = n_points 53 | 54 | self.sampling_offsets = nn.Linear(2 * d_model, n_heads * n_levels * n_points) 55 | self.attention_weights = nn.Linear(2 * d_model, n_heads * n_levels * n_points) 56 | self.value_proj = nn.Linear(d_model, d_model) 57 | self.output_proj = nn.Linear(d_model, d_model) 58 | self._reset_parameters() 59 | 60 | def _reset_parameters(self): 61 | constant_(self.sampling_offsets.weight.data, 0.) 62 | thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads) 63 | grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) 64 | grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2) 65 | grid_init = grid_init[..., 0].repeat(1, self.n_levels, self.n_points) 66 | for i in range(self.n_points): 67 | grid_init[:, :, i] *= i + 1 68 | grid_init = grid_init - grid_init.mean(2, keepdim=True) 69 | with torch.no_grad(): 70 | self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) 71 | constant_(self.attention_weights.weight.data, 0.) 72 | constant_(self.attention_weights.bias.data, 0.) 73 | xavier_uniform_(self.value_proj.weight.data) 74 | constant_(self.value_proj.bias.data, 0.) 75 | xavier_uniform_(self.output_proj.weight.data) 76 | constant_(self.output_proj.bias.data, 0.) 
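The `_reset_parameters` just above seeds `sampling_offsets.bias` so that, before any training, each attention head looks in its own direction and successive sampling points step progressively further along it; the caption variant additionally centers the grid. A sketch of that initialization, reproduced with illustrative sizes:

```python
# Sampling-offset bias initialization, mirroring MSDeformAttnCap._reset_parameters above.
import math
import torch

n_heads, n_levels, n_points = 8, 2, 4
thetas = torch.arange(n_heads, dtype=torch.float32) * (2.0 * math.pi / n_heads)
grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(n_heads, 1, 1, 2)
grid_init = grid_init[..., 0].repeat(1, n_levels, n_points)   # temporal case keeps only the x component
for i in range(n_points):
    grid_init[:, :, i] *= i + 1                               # point i starts (i + 1) steps out
grid_init = grid_init - grid_init.mean(2, keepdim=True)       # centering used by the caption variant
print(grid_init.shape)                                        # torch.Size([8, 2, 4]) -> flattened into the bias
```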
77 | 78 | def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None): 79 | """ 80 | :param query (N, Length_{query}, C) 81 | :param reference_points (N, Length_{query}, n_levels, 1), range in [0, 1], including padding area 82 | or (N, Length_{query}, n_levels, 2), add additional (c, l) to form reference boxes 83 | :param input_flatten (N, \sum_{l=0}^{L-1} T_l, C) 84 | :param input_spatial_shapes (n_levels ), [T_0, T_1, ..., T_{L-1}] 85 | :param input_level_start_index (n_levels ), [0, 1_0, T_0+T_1, ...] 86 | :param input_padding_mask (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements 87 | 88 | :return output (N, Length_{query}, C) 89 | """ 90 | N, Len_q, _ = query.shape 91 | N, Len_in, _ = input_flatten.shape 92 | assert input_spatial_shapes.sum() == Len_in 93 | 94 | value = self.value_proj(input_flatten) 95 | if input_padding_mask is not None: 96 | value = value.masked_fill(input_padding_mask[..., None], float(0)) 97 | value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads) 98 | sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points) 99 | attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points) 100 | attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points) 101 | # N, Len_q, n_heads, n_levels, n_points, 1 102 | if reference_points.shape[-1] == 1: 103 | offset_normalizer = input_spatial_shapes 104 | sampling_locations = reference_points[:, :, None, :, None, 0] \ 105 | + sampling_offsets / offset_normalizer[None, None, None, :, None] 106 | elif reference_points.shape[-1] == 2: 107 | sampling_locations = reference_points[:, :, None, :, None, 0] \ 108 | + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 1] * 0.5 109 | else: 110 | raise ValueError( 111 | 'Last dim of reference_points must be 1 or 2, but get {} instead.'.format(reference_points.shape[-1])) 112 | 113 | 114 | 115 | if True: 116 | sampling_locations = torch.stack( 117 | (sampling_locations, 0.5 * sampling_locations.new_ones(sampling_locations.shape)), -1) 118 | input_spatial_shapes = torch.stack([input_spatial_shapes.new_ones(input_spatial_shapes.shape), input_spatial_shapes], -1) 119 | 120 | output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights, 121 | return_value=True) 122 | 123 | return output 124 | -------------------------------------------------------------------------------- /pdvc/ops/setup.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
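`MSDeformAttnCap` calls the reference op with `return_value=True`, so instead of the weighted sum it hands the raw per-level, per-point sampled features to the captioning head. A hedged, self-contained sketch of that shape difference (illustrative sizes; assumes the repo root is on PYTHONPATH):

```python
# return_value=True skips the weighted sum and returns the sampled features themselves.
import torch
from pdvc.ops.functions import ms_deform_attn_core_pytorch

N, M, D, Lq, L, P = 1, 2, 4, 3, 2, 4
shapes = [(1, 16), (1, 8)]
value = torch.rand(N, sum(h * w for h, w in shapes), M, D)
loc = torch.rand(N, Lq, M, L, P, 2)
w = torch.softmax(torch.rand(N, Lq, M, L * P), -1).view(N, Lq, M, L, P)

sampled = ms_deform_attn_core_pytorch(value, shapes, loc, w, return_value=True)
print(sampled.shape)    # torch.Size([2, 4, 3, 2, 4]) == (N * M, D, Lq, L, P)
```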
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | import os 10 | import glob 11 | 12 | import torch 13 | 14 | from torch.utils.cpp_extension import CUDA_HOME 15 | from torch.utils.cpp_extension import CppExtension 16 | from torch.utils.cpp_extension import CUDAExtension 17 | 18 | from setuptools import find_packages 19 | from setuptools import setup 20 | 21 | requirements = ["torch", "torchvision"] 22 | 23 | def get_extensions(): 24 | this_dir = os.path.dirname(os.path.abspath(__file__)) 25 | extensions_dir = os.path.join(this_dir, "src") 26 | 27 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 28 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 29 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 30 | 31 | sources = main_file + source_cpu 32 | extension = CppExtension 33 | extra_compile_args = {"cxx": []} 34 | define_macros = [] 35 | 36 | if torch.cuda.is_available() and CUDA_HOME is not None: 37 | extension = CUDAExtension 38 | sources += source_cuda 39 | define_macros += [("WITH_CUDA", None)] 40 | extra_compile_args["nvcc"] = [ 41 | "-DCUDA_HAS_FP16=1", 42 | "-D__CUDA_NO_HALF_OPERATORS__", 43 | "-D__CUDA_NO_HALF_CONVERSIONS__", 44 | "-D__CUDA_NO_HALF2_OPERATORS__", 45 | ] 46 | else: 47 | raise NotImplementedError('Cuda is not availabel') 48 | 49 | sources = [os.path.join(extensions_dir, s) for s in sources] 50 | include_dirs = [extensions_dir] 51 | ext_modules = [ 52 | extension( 53 | "MultiScaleDeformableAttention", 54 | sources, 55 | include_dirs=include_dirs, 56 | define_macros=define_macros, 57 | extra_compile_args=extra_compile_args, 58 | ) 59 | ] 60 | return ext_modules 61 | 62 | setup( 63 | name="MultiScaleDeformableAttention", 64 | version="1.0", 65 | author="Weijie Su", 66 | url="https://github.com/fundamentalvision/Deformable-DETR", 67 | description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention", 68 | packages=find_packages(exclude=("configs", "tests",)), 69 | ext_modules=get_extensions(), 70 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 71 | ) 72 | -------------------------------------------------------------------------------- /pdvc/ops/src/cpu/ms_deform_attn_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
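`setup.py` above refuses to build without CUDA, so before running `pdvc/ops/make.sh` it is worth confirming that PyTorch sees a GPU and a CUDA toolkit. A quick check mirroring the guard in `get_extensions`:

```python
# Pre-build sanity check mirroring the CUDA guard in pdvc/ops/setup.py.
import torch
from torch.utils.cpp_extension import CUDA_HOME

print(torch.cuda.is_available())   # must be True, otherwise setup.py raises NotImplementedError
print(CUDA_HOME)                   # must point at a CUDA toolkit, e.g. /usr/local/cuda
```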
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include 12 | 13 | #include 14 | #include 15 | 16 | 17 | at::Tensor 18 | ms_deform_attn_cpu_forward( 19 | const at::Tensor &value, 20 | const at::Tensor &spatial_shapes, 21 | const at::Tensor &level_start_index, 22 | const at::Tensor &sampling_loc, 23 | const at::Tensor &attn_weight, 24 | const int im2col_step) 25 | { 26 | AT_ERROR("Not implement on cpu"); 27 | } 28 | 29 | std::vector 30 | ms_deform_attn_cpu_backward( 31 | const at::Tensor &value, 32 | const at::Tensor &spatial_shapes, 33 | const at::Tensor &level_start_index, 34 | const at::Tensor &sampling_loc, 35 | const at::Tensor &attn_weight, 36 | const at::Tensor &grad_output, 37 | const int im2col_step) 38 | { 39 | AT_ERROR("Not implement on cpu"); 40 | } 41 | 42 | -------------------------------------------------------------------------------- /pdvc/ops/src/cpu/ms_deform_attn_cpu.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | #include 13 | 14 | at::Tensor 15 | ms_deform_attn_cpu_forward( 16 | const at::Tensor &value, 17 | const at::Tensor &spatial_shapes, 18 | const at::Tensor &level_start_index, 19 | const at::Tensor &sampling_loc, 20 | const at::Tensor &attn_weight, 21 | const int im2col_step); 22 | 23 | std::vector 24 | ms_deform_attn_cpu_backward( 25 | const at::Tensor &value, 26 | const at::Tensor &spatial_shapes, 27 | const at::Tensor &level_start_index, 28 | const at::Tensor &sampling_loc, 29 | const at::Tensor &attn_weight, 30 | const at::Tensor &grad_output, 31 | const int im2col_step); 32 | 33 | 34 | -------------------------------------------------------------------------------- /pdvc/ops/src/cuda/ms_deform_attn_cuda.cu: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include 12 | #include "cuda/ms_deform_im2col_cuda.cuh" 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | 20 | at::Tensor ms_deform_attn_cuda_forward( 21 | const at::Tensor &value, 22 | const at::Tensor &spatial_shapes, 23 | const at::Tensor &level_start_index, 24 | const at::Tensor &sampling_loc, 25 | const at::Tensor &attn_weight, 26 | const int im2col_step) 27 | { 28 | AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous"); 29 | AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous"); 30 | AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous"); 31 | AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous"); 32 | AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous"); 33 | 34 | AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor"); 35 | AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor"); 36 | AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor"); 37 | AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor"); 38 | AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor"); 39 | 40 | const int batch = value.size(0); 41 | const int spatial_size = value.size(1); 42 | const int num_heads = value.size(2); 43 | const int channels = value.size(3); 44 | 45 | const int num_levels = spatial_shapes.size(0); 46 | 47 | const int num_query = sampling_loc.size(1); 48 | const int num_point = sampling_loc.size(4); 49 | 50 | const int im2col_step_ = std::min(batch, im2col_step); 51 | 52 | AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_); 53 | 54 | auto output = at::zeros({batch, num_query, num_heads, channels}, value.options()); 55 | 56 | const int batch_n = im2col_step_; 57 | auto output_n = output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels}); 58 | auto per_value_size = spatial_size * num_heads * channels; 59 | auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; 60 | auto per_attn_weight_size = num_query * num_heads * num_levels * num_point; 61 | for (int n = 0; n < batch/im2col_step_; ++n) 62 | { 63 | auto columns = output_n.select(0, n); 64 | AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_forward_cuda", ([&] { 65 | ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(), 66 | value.data() + n * im2col_step_ * per_value_size, 67 | spatial_shapes.data(), 68 | level_start_index.data(), 69 | sampling_loc.data() + n * im2col_step_ * per_sample_loc_size, 70 | attn_weight.data() + n * im2col_step_ * per_attn_weight_size, 71 | batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point, 72 | columns.data()); 73 | 74 | })); 75 | } 76 | 77 | output = output.view({batch, num_query, num_heads*channels}); 78 | 79 | return output; 80 | } 81 | 82 | 83 | std::vector ms_deform_attn_cuda_backward( 84 | const at::Tensor &value, 85 | const at::Tensor &spatial_shapes, 86 | const at::Tensor &level_start_index, 87 | 
const at::Tensor &sampling_loc, 88 | const at::Tensor &attn_weight, 89 | const at::Tensor &grad_output, 90 | const int im2col_step) 91 | { 92 | 93 | AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous"); 94 | AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous"); 95 | AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous"); 96 | AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous"); 97 | AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous"); 98 | AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous"); 99 | 100 | AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor"); 101 | AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor"); 102 | AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor"); 103 | AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor"); 104 | AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor"); 105 | AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor"); 106 | 107 | const int batch = value.size(0); 108 | const int spatial_size = value.size(1); 109 | const int num_heads = value.size(2); 110 | const int channels = value.size(3); 111 | 112 | const int num_levels = spatial_shapes.size(0); 113 | 114 | const int num_query = sampling_loc.size(1); 115 | const int num_point = sampling_loc.size(4); 116 | 117 | const int im2col_step_ = std::min(batch, im2col_step); 118 | 119 | AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_); 120 | 121 | auto grad_value = at::zeros_like(value); 122 | auto grad_sampling_loc = at::zeros_like(sampling_loc); 123 | auto grad_attn_weight = at::zeros_like(attn_weight); 124 | 125 | const int batch_n = im2col_step_; 126 | auto per_value_size = spatial_size * num_heads * channels; 127 | auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; 128 | auto per_attn_weight_size = num_query * num_heads * num_levels * num_point; 129 | auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels}); 130 | 131 | for (int n = 0; n < batch/im2col_step_; ++n) 132 | { 133 | auto grad_output_g = grad_output_n.select(0, n); 134 | AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_backward_cuda", ([&] { 135 | ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(), 136 | grad_output_g.data(), 137 | value.data() + n * im2col_step_ * per_value_size, 138 | spatial_shapes.data(), 139 | level_start_index.data(), 140 | sampling_loc.data() + n * im2col_step_ * per_sample_loc_size, 141 | attn_weight.data() + n * im2col_step_ * per_attn_weight_size, 142 | batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point, 143 | grad_value.data() + n * im2col_step_ * per_value_size, 144 | grad_sampling_loc.data() + n * im2col_step_ * per_sample_loc_size, 145 | grad_attn_weight.data() + n * im2col_step_ * per_attn_weight_size); 146 | 147 | })); 148 | } 149 | 150 | return { 151 | grad_value, grad_sampling_loc, grad_attn_weight 152 | }; 153 | } -------------------------------------------------------------------------------- /pdvc/ops/src/cuda/ms_deform_attn_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 
2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | #include 13 | 14 | at::Tensor ms_deform_attn_cuda_forward( 15 | const at::Tensor &value, 16 | const at::Tensor &spatial_shapes, 17 | const at::Tensor &level_start_index, 18 | const at::Tensor &sampling_loc, 19 | const at::Tensor &attn_weight, 20 | const int im2col_step); 21 | 22 | std::vector ms_deform_attn_cuda_backward( 23 | const at::Tensor &value, 24 | const at::Tensor &spatial_shapes, 25 | const at::Tensor &level_start_index, 26 | const at::Tensor &sampling_loc, 27 | const at::Tensor &attn_weight, 28 | const at::Tensor &grad_output, 29 | const int im2col_step); 30 | 31 | -------------------------------------------------------------------------------- /pdvc/ops/src/ms_deform_attn.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | 13 | #include "cpu/ms_deform_attn_cpu.h" 14 | 15 | #ifdef WITH_CUDA 16 | #include "cuda/ms_deform_attn_cuda.h" 17 | #endif 18 | 19 | 20 | at::Tensor 21 | ms_deform_attn_forward( 22 | const at::Tensor &value, 23 | const at::Tensor &spatial_shapes, 24 | const at::Tensor &level_start_index, 25 | const at::Tensor &sampling_loc, 26 | const at::Tensor &attn_weight, 27 | const int im2col_step) 28 | { 29 | if (value.type().is_cuda()) 30 | { 31 | #ifdef WITH_CUDA 32 | return ms_deform_attn_cuda_forward( 33 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step); 34 | #else 35 | AT_ERROR("Not compiled with GPU support"); 36 | #endif 37 | } 38 | AT_ERROR("Not implemented on the CPU"); 39 | } 40 | 41 | std::vector 42 | ms_deform_attn_backward( 43 | const at::Tensor &value, 44 | const at::Tensor &spatial_shapes, 45 | const at::Tensor &level_start_index, 46 | const at::Tensor &sampling_loc, 47 | const at::Tensor &attn_weight, 48 | const at::Tensor &grad_output, 49 | const int im2col_step) 50 | { 51 | if (value.type().is_cuda()) 52 | { 53 | #ifdef WITH_CUDA 54 | return ms_deform_attn_cuda_backward( 55 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step); 56 | #else 57 | AT_ERROR("Not compiled with GPU support"); 58 | #endif 59 | } 60 | AT_ERROR("Not implemented on the CPU"); 61 | } 62 | 63 | -------------------------------------------------------------------------------- /pdvc/ops/src/vision.cpp: -------------------------------------------------------------------------------- 1 
| /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include "ms_deform_attn.h" 12 | 13 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 14 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); 15 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); 16 | } 17 | -------------------------------------------------------------------------------- /pdvc/ops/test.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from __future__ import absolute_import 10 | from __future__ import print_function 11 | from __future__ import division 12 | 13 | import time 14 | import torch 15 | import torch.nn as nn 16 | from torch.autograd import gradcheck 17 | 18 | from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch 19 | 20 | 21 | N, M, D = 1, 2, 2 22 | Lq, L, P = 2, 2, 2 23 | shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda() 24 | level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) 25 | S = sum([(H*W).item() for H, W in shapes]) 26 | 27 | 28 | torch.manual_seed(3) 29 | 30 | 31 | @torch.no_grad() 32 | def check_forward_equal_with_pytorch_double(): 33 | value = torch.rand(N, S, M, D).cuda() * 0.01 34 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 35 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 36 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 37 | im2col_step = 2 38 | output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu() 39 | output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu() 40 | fwdok = torch.allclose(output_cuda, output_pytorch) 41 | max_abs_err = (output_cuda - output_pytorch).abs().max() 42 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 43 | 44 | print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 45 | 46 | 47 | @torch.no_grad() 48 | def check_forward_equal_with_pytorch_float(): 49 | value = torch.rand(N, S, M, D).cuda() * 0.01 50 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 51 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 
52 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 53 | im2col_step = 2 54 | output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu() 55 | output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu() 56 | fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3) 57 | max_abs_err = (output_cuda - output_pytorch).abs().max() 58 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 59 | 60 | print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 61 | 62 | 63 | def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True): 64 | 65 | value = torch.rand(N, S, M, channels).cuda() * 0.01 66 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 67 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 68 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 69 | im2col_step = 2 70 | func = MSDeformAttnFunction.apply 71 | 72 | value.requires_grad = grad_value 73 | sampling_locations.requires_grad = grad_sampling_loc 74 | attention_weights.requires_grad = grad_attn_weight 75 | 76 | gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step)) 77 | 78 | print(f'* {gradok} check_gradient_numerical(D={channels})') 79 | 80 | 81 | if __name__ == '__main__': 82 | check_forward_equal_with_pytorch_double() 83 | check_forward_equal_with_pytorch_float() 84 | 85 | for channels in [30, 32, 64, 71, 1025, 2048, 3096]: 86 | check_gradient_numerical(channels, True, True, True) 87 | 88 | 89 | 90 | -------------------------------------------------------------------------------- /pdvc/position_encoding.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | """ 11 | Various positional encodings for the transformer. 12 | """ 13 | import math 14 | import torch 15 | from torch import nn 16 | 17 | from misc.detr_utils.misc import NestedTensor 18 | 19 | 20 | class PositionEmbeddingSine(nn.Module): 21 | """ 22 | This is a more standard version of the position embedding, very similar to the one 23 | used by the Attention is all you need paper, generalized to work on images. 
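`test.py` above only passes once the extension from `pdvc/ops/setup.py` has been built and is importable. A minimal hedged check that the compiled module and the two functions bound in `vision.cpp` are actually visible from Python:

```python
# Post-build check for the compiled extension (run after pdvc/ops/make.sh succeeds).
import MultiScaleDeformableAttention as MSDA   # module name registered in setup.py / vision.cpp

print(hasattr(MSDA, "ms_deform_attn_forward"))   # True
print(hasattr(MSDA, "ms_deform_attn_backward"))  # True
```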
24 | """ 25 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 26 | super().__init__() 27 | self.num_pos_feats = num_pos_feats 28 | self.temperature = temperature 29 | self.normalize = normalize 30 | if scale is not None and normalize is False: 31 | raise ValueError("normalize should be True if scale is passed") 32 | if scale is None: 33 | scale = 2 * math.pi 34 | self.scale = scale 35 | self.max_duration = 256 36 | self.duration_embed_layer = nn.Linear(self.max_duration, self.max_duration) 37 | 38 | def forward(self, tensor_list: NestedTensor): 39 | x = tensor_list.tensors 40 | mask = tensor_list.mask 41 | duration = tensor_list.duration 42 | assert mask is not None 43 | not_mask = ~mask 44 | x_embed = not_mask.cumsum(1, dtype=torch.float32) 45 | if self.normalize: 46 | eps = 1e-6 47 | x_embed = (x_embed - 0.5) / (x_embed[:, -1:] + eps) * self.scale 48 | 49 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 50 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 51 | pos_x = x_embed[:, :, None] / dim_t 52 | pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2) 53 | 54 | dur_embed = self.duration_embedding(duration).reshape(-1,1,self.max_duration).expand_as(pos_x) 55 | pos = torch.cat((pos_x, dur_embed), dim=2).permute(0, 2, 1) 56 | return pos 57 | 58 | def duration_embedding(self, durations): 59 | out = torch.zeros(len(durations), self.max_duration, device=durations.device) 60 | durations = durations.int() 61 | for ii in range(len(durations)): 62 | out[ii, :durations[ii]] = 1 63 | out = self.duration_embed_layer(out) 64 | return out 65 | 66 | 67 | 68 | def build_position_encoding(position_embedding, N_steps): 69 | if position_embedding in ('v2', 'sine'): 70 | # TODO find a better way of exposing other arguments 71 | position_embedding = PositionEmbeddingSine(N_steps, normalize=True) 72 | else: 73 | raise ValueError(f"not supported {position_embedding}") 74 | 75 | return position_embedding 76 | -------------------------------------------------------------------------------- /requirement.txt: -------------------------------------------------------------------------------- 1 | h5py 2 | matplotlib 3 | numpy 4 | pandas 5 | Pillow 6 | PyYAML 7 | six 8 | tqdm 9 | tensorboardX 10 | colorlog 11 | scipy 12 | jupyter notebook 13 | pandas 14 | h5py 15 | av 16 | joblib 17 | tqdm 18 | google_trans_new 19 | -------------------------------------------------------------------------------- /test_and_visualize.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -i 2 | curdir=`pwd` 3 | export PYTHONPATH=$PYTHONPATH:$curdir/video_backbone/TSP 4 | export PYTHONPATH=$PYTHONPATH:$curdir/video_backbone/TSP/data 5 | export PYTHONPATH=$PYTHONPATH:$curdir/video_backbone/TSP/extract_features 6 | export PYTHONPATH=$PYTHONPATH:$curdir/visualization 7 | 8 | DATA_PATH=$1 # path of the raw video folder 9 | OUTPUT_FOLDER=$2 # path of the output folder to save generated captions 10 | PDVC_MODEL_PATH=$3 11 | OUTPUT_LANGUAGE=$4 12 | 13 | if [ -z "$DATA_PATH" ]; then 14 | echo "DATA_PATH variable is not set." 15 | echo "Please set DATA_PATH to the folder containing the videos you want to process." 16 | exit 1 17 | fi 18 | 19 | if [ -z "$OUTPUT_FOLDER" ]; then 20 | echo "OUTPUT_FOLDER variable is not set." 21 | echo "Please set OUTPUT_FOLDER to the folder you want to save generate captions." 
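Editor's note: the temporal half of `PositionEmbeddingSine` above is the standard sine/cosine encoding over frame positions, and the duration embedding is then concatenated along the channel dimension. Below is a standalone sketch (illustrative sizes, no `NestedTensor` plumbing) of the sine part for one fully unmasked sequence.
```
import math
import torch

T, num_pos_feats, temperature = 5, 64, 10000
x_embed = torch.arange(1, T + 1, dtype=torch.float32)              # cumsum of an all-ones (unmasked) row
x_embed = (x_embed - 0.5) / (x_embed[-1] + 1e-6) * (2 * math.pi)   # same normalization as normalize=True

dim_t = torch.arange(num_pos_feats, dtype=torch.float32)
dim_t = temperature ** (2 * (dim_t // 2) / num_pos_feats)

pos_x = x_embed[:, None] / dim_t
pos_x = torch.stack((pos_x[:, 0::2].sin(), pos_x[:, 1::2].cos()), dim=2).flatten(1)
print(pos_x.shape)   # (T, num_pos_feats); the module then concatenates the duration embedding
```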
22 | exit 1 23 | 24 | fi 25 | 26 | if [ -z "$PDVC_MODEL_PATH" ]; then 27 | echo "PDVC_MODEL_PATH variable is not set." 28 | echo "Please set the pretrained PDVC model path (only support PDVC with TSP features)." 29 | exit 1 30 | fi 31 | 32 | #################################################################################### 33 | ########################## PARAMETERS THAT NEED TO BE SET ########################## 34 | #################################################################################### 35 | 36 | METADATA_CSV_FILENAME=$DATA_PATH/"metadata.csv" # path/to/metadata/csv/file. Use the ones provided in the data folder. 37 | RELEASED_CHECKPOINT=r2plus1d_34-tsp_on_activitynet 38 | 39 | 40 | # Choose the stride between clips, e.g. 16 for non-overlapping clips and 1 for dense overlapping clips 41 | STRIDE=16 42 | 43 | # Optional: Split the videos into multiple shards for parallel feature extraction 44 | # Increase the number of shards and run this script independently on separate GPU devices, 45 | # each with a different SHARD_ID from 0 to NUM_SHARDS-1. 46 | # Each shard will process (num_videos / NUM_SHARDS) videos. 47 | SHARD_ID=0 48 | NUM_SHARDS=1 49 | DEVICE=cuda 50 | WORKER_NUM=8 51 | 52 | echo "START GENERATE METADATA" 53 | python video_backbone/TSP/data/generate_metadata_csv.py --video-folder $DATA_PATH --output-csv $METADATA_CSV_FILENAME 54 | 55 | FEATURE_DIR=$OUTPUT_FOLDER/${RELEASED_CHECKPOINT}_stride_${STRIDE}/ 56 | mkdir -p $FEATURE_DIR 57 | 58 | echo "START EXTRACT VIDEO FEATURES" 59 | python video_backbone/TSP/extract_features/extract_features.py \ 60 | --data-path $DATA_PATH \ 61 | --metadata-csv-filename $METADATA_CSV_FILENAME \ 62 | --released-checkpoint $RELEASED_CHECKPOINT \ 63 | --stride $STRIDE \ 64 | --shard-id $SHARD_ID \ 65 | --num-shards $NUM_SHARDS \ 66 | --device $DEVICE \ 67 | --output-dir $FEATURE_DIR \ 68 | --workers $WORKER_NUM 69 | 70 | echo "START Dense-Captioning" 71 | python eval.py --eval_mode test --eval_save_dir $OUTPUT_FOLDER --eval_folder generated_captions --eval_model_path $PDVC_MODEL_PATH --test_video_feature_folder $FEATURE_DIR --test_video_meta_data_csv_path $METADATA_CSV_FILENAME 72 | 73 | echo "START VISUALIZATION" 74 | python visualization/visualization.py --input_mp4_folder $DATA_PATH --output_mp4_folder $OUTPUT_FOLDER/vis_videos --dvc_file $OUTPUT_FOLDER/generated_captions/dvc_results.json --output_language $OUTPUT_LANGUAGE 75 | -------------------------------------------------------------------------------- /video_backbone/TSP/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
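Editor's note on usage: test_and_visualize.sh reads its four inputs positionally, so a typical call looks like `bash test_and_visualize.sh path/to/raw_videos path/to/output path/to/pdvc_tsp_checkpoint.pth en`. All four values here are illustrative placeholders; the checkpoint must be a PDVC model trained on TSP features, as the prompt inside the script states, and the last argument is the output-language code (for example `en`) that is passed straight through to visualization.py.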
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /video_backbone/TSP/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Humam Alwassel 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /video_backbone/TSP/README.md: -------------------------------------------------------------------------------- 1 | [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/tsp-temporally-sensitive-pretraining-of-video/temporal-action-localization-on-activitynet)](https://paperswithcode.com/sota/temporal-action-localization-on-activitynet?p=tsp-temporally-sensitive-pretraining-of-video) 2 | [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/tsp-temporally-sensitive-pretraining-of-video/temporal-action-proposal-generation-on)](https://paperswithcode.com/sota/temporal-action-proposal-generation-on?p=tsp-temporally-sensitive-pretraining-of-video) 3 | [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/tsp-temporally-sensitive-pretraining-of-video/dense-video-captioning-on-activitynet)](https://paperswithcode.com/sota/dense-video-captioning-on-activitynet?p=tsp-temporally-sensitive-pretraining-of-video) 4 | [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/tsp-temporally-sensitive-pretraining-of-video/temporal-action-localization-on-thumos14)](https://paperswithcode.com/sota/temporal-action-localization-on-thumos14?p=tsp-temporally-sensitive-pretraining-of-video) 5 | 6 | # TSP: Temporally-Sensitive Pretraining of Video Encoders for Localization Tasks 7 | 8 | 9 | 10 | [[Paper]](https://arxiv.org/pdf/2011.11479.pdf) 11 | [[Project Website]](http://humamalwassel.com/publication/tsp/) 12 | 13 | This repository holds the source code, pretrained models, and pre-extracted features for the TSP method. 14 | 15 | Please cite this work if you find TSP useful for your research. 16 | ``` 17 | @inproceedings{alwassel_2021_tsp, 18 | title={TSP: Temporally-Sensitive Pretraining of Video Encoders for Localization Tasks}, 19 | author={Alwassel, Humam and Giancola, Silvio and Ghanem, Bernard}, 20 | booktitle={Proceedings of the IEEE/CVF International 21 | Conference on Computer Vision (ICCV) Workshops}, 22 | year={2021} 23 | } 24 | ``` 25 | 26 | ## Pre-extracted TSP Features 27 | 28 | We provide pre-extracted features for ActivityNet v1.3 and THUMOS14 videos. The feature files are saved in H5 format, where we map each `video-name` to a features tensor of size `N x 512`, where `N` is the number of features and `512` is the feature size. Use `h5py` python package to read the feature files. Not familiar with H5 files or `h5py`? here is a quick start [guide](https://docs.h5py.org/en/stable/). 29 | 30 | ### For ActivityNet v1.3 dataset 31 | **Download**: 32 | [[train subset]](https://github.com/HumamAlwassel/TSP/releases/download/activitynet_features/r2plus1d_34-tsp_on_activitynet-train_features.h5) 33 | [[valid subset]](https://github.com/HumamAlwassel/TSP/releases/download/activitynet_features/r2plus1d_34-tsp_on_activitynet-valid_features.h5) 34 | [[test subset]](https://github.com/HumamAlwassel/TSP/releases/download/activitynet_features/r2plus1d_34-tsp_on_activitynet-test_features.h5) 35 | 36 | **Details**: The features are extracted from the R(2+1)D-34 encoder pretrained with TSP on ActivityNet ([released model](https://github.com/HumamAlwassel/TSP/releases/download/model_weights/r2plus1d_34-tsp_on_activitynet-max_gvf-backbone_lr_0.0001-fc_lr_0.002-epoch_5-0d2cf854.pth)) using clips of `16 frames` at a frame rate of `15 fps` and a stride of `16 frames` (*i.e.,* **non-overlapping** clips). 
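Editor's note: as a companion to the quick-start pointer above, here is a minimal `h5py` reading sketch (assuming the ActivityNet valid-subset file linked above has been downloaded into the working directory). Keys are video names and each value is an `N x 512` feature matrix.
```
import h5py

with h5py.File('r2plus1d_34-tsp_on_activitynet-valid_features.h5', 'r') as f:
    video_names = list(f.keys())
    feats = f[video_names[0]][:]          # one video's features as a numpy array of shape (N, 512)
    print(video_names[0], feats.shape)
```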
This gives one feature vector per `16/15 ~= 1.067` seconds. 37 | 38 | 39 | ### For THUMOS14 dataset 40 | 41 | **Download**: 42 | [[valid subset]](https://github.com/HumamAlwassel/TSP/releases/download/thumos14_features/r2plus1d_34-tsp_on_thumos14-valid_features.h5) 43 | [[test subset]](https://github.com/HumamAlwassel/TSP/releases/download/thumos14_features/r2plus1d_34-tsp_on_thumos14-test_features.h5) 44 | 45 | **Details**: The features are extracted from the R(2+1)D-34 encoder pretrained with TSP on THUMOS14 ([released model](https://github.com/HumamAlwassel/TSP/releases/download/model_weights/r2plus1d_34-tsp_on_thumos14-max_gvf-backbone_lr_0.0001-fc_lr_0.004-epoch_4-e6a30b2f.pth)) using clips of `16 frames` at a frame rate of `15 fps` and a stride of `1 frame` (*i.e.,* dense **overlapping** clips). This gives one feature vector per `1/15 ~= 0.067` seconds. 46 | 47 | ## Setup 48 | Clone this repository and create the conda environment. 49 | ``` 50 | git clone https://github.com/HumamAlwassel/TSP.git 51 | cd TSP 52 | conda env create -f environment.yml 53 | conda activate tsp 54 | ``` 55 | 56 | ## Data Preprocessing 57 | Follow the instructions [here](data) to download and preprocess the input data. 58 | 59 | ## Training 60 | We provide training scripts for the TSP models and the TAC baselines [here](train). 61 | 62 | ## Feature Extraction 63 | You can extract features from released pretrained models or from local checkpoints using the scripts [here](extract_features). 64 | 65 | **Acknowledgment**: Our source code borrows implementation ideas from [pytorch/vision](https://github.com/pytorch/vision) and [facebookresearch/VMZ](https://github.com/facebookresearch/VMZ) repositories. 66 | -------------------------------------------------------------------------------- /video_backbone/TSP/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ttengwang/PDVC/0b025c84f42fe27da51c312e8871c4b19628a04c/video_backbone/TSP/__init__.py -------------------------------------------------------------------------------- /video_backbone/TSP/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ttengwang/PDVC/0b025c84f42fe27da51c312e8871c4b19628a04c/video_backbone/TSP/common/__init__.py -------------------------------------------------------------------------------- /video_backbone/TSP/common/scheduler.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from bisect import bisect_right 3 | 4 | 5 | class WarmupMultiStepLR(torch.optim.lr_scheduler._LRScheduler): 6 | def __init__( 7 | self, 8 | optimizer, 9 | milestones, 10 | gamma=0.1, 11 | warmup_factor=1.0 / 3, 12 | warmup_iters=5, 13 | warmup_method='linear', 14 | last_epoch=-1, 15 | ): 16 | if not milestones == sorted(milestones): 17 | raise ValueError( 18 | f'Milestones should be a list of increasing integers. 
' 19 | f'Got {milestones}', 20 | ) 21 | 22 | if warmup_method not in ('constant', 'linear'): 23 | raise ValueError( 24 | f'Only "constant" or "linear" warmup_method accepted' 25 | f'got {warmup_method}' 26 | ) 27 | self.milestones = milestones 28 | self.gamma = gamma 29 | self.warmup_factor = warmup_factor 30 | self.warmup_iters = warmup_iters 31 | self.warmup_method = warmup_method 32 | super(WarmupMultiStepLR, self).__init__(optimizer, last_epoch) 33 | 34 | def get_lr(self): 35 | warmup_factor = 1 36 | if self.last_epoch < self.warmup_iters: 37 | if self.warmup_method == 'constant': 38 | warmup_factor = self.warmup_factor 39 | elif self.warmup_method == 'linear': 40 | alpha = float(self.last_epoch) / self.warmup_iters 41 | warmup_factor = self.warmup_factor * (1 - alpha) + alpha 42 | return [ 43 | base_lr * 44 | warmup_factor * 45 | self.gamma ** bisect_right(self.milestones, self.last_epoch) 46 | for base_lr in self.base_lrs 47 | ] 48 | -------------------------------------------------------------------------------- /video_backbone/TSP/common/transforms.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import random 3 | 4 | 5 | def crop(vid, i, j, h, w): 6 | return vid[..., i:(i + h), j:(j + w)] 7 | 8 | 9 | def center_crop(vid, output_size): 10 | h, w = vid.shape[-2:] 11 | th, tw = output_size 12 | 13 | i = int(round((h - th) / 2.)) 14 | j = int(round((w - tw) / 2.)) 15 | return crop(vid, i, j, th, tw) 16 | 17 | 18 | def hflip(vid): 19 | return vid.flip(dims=(-1,)) 20 | 21 | 22 | # NOTE: for those functions, which generally expect mini-batches, we keep them 23 | # as non-minibatch so that they are applied as if they were 4d (thus image). 24 | # this way, we only apply the transformation in the spatial domain 25 | def resize(vid, size, interpolation='bilinear'): 26 | # NOTE: using bilinear interpolation because we don't work on minibatches 27 | # at this level 28 | scale = None 29 | if isinstance(size, int): 30 | scale = float(size) / min(vid.shape[-2:]) 31 | size = None 32 | return torch.nn.functional.interpolate( 33 | vid, size=size, scale_factor=scale, mode=interpolation, align_corners=False) 34 | 35 | 36 | def pad(vid, padding, fill=0, padding_mode="constant"): 37 | # NOTE: don't want to pad on temporal dimension, so let as non-batch 38 | # (4d) before padding. This works as expected 39 | return torch.nn.functional.pad(vid, padding, value=fill, mode=padding_mode) 40 | 41 | 42 | def to_normalized_float_tensor(vid): 43 | return vid.permute(3, 0, 1, 2).to(torch.float32) / 255 44 | 45 | 46 | def normalize(vid, mean, std): 47 | shape = (-1,) + (1,) * (vid.dim() - 1) 48 | mean = torch.as_tensor(mean).reshape(shape) 49 | std = torch.as_tensor(std).reshape(shape) 50 | return (vid - mean) / std 51 | 52 | 53 | # Class interface 54 | 55 | class RandomCrop(object): 56 | def __init__(self, size): 57 | self.size = size 58 | 59 | @staticmethod 60 | def get_params(vid, output_size): 61 | ''' 62 | Get parameters for ``crop`` for a random crop. 
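Editor's note: the `WarmupMultiStepLR.get_lr` logic shown earlier in this listing is easiest to read with concrete numbers. The sketch below (editorial; every hyperparameter value is an assumption) reproduces the per-epoch scale factor outside of any optimizer: the learning rate ramps up linearly for `warmup_iters` epochs and is multiplied by `gamma` at each milestone.
```
from bisect import bisect_right

base_lr, gamma = 0.01, 0.1
milestones, warmup_factor, warmup_iters = [8, 11], 1.0 / 3, 5

for epoch in range(13):
    factor = 1.0
    if epoch < warmup_iters:                       # linear warmup branch of get_lr
        alpha = epoch / warmup_iters
        factor = warmup_factor * (1 - alpha) + alpha
    lr = base_lr * factor * gamma ** bisect_right(milestones, epoch)
    print(epoch, round(lr, 6))
```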
63 | ''' 64 | h, w = vid.shape[-2:] 65 | th, tw = output_size 66 | if w == tw and h == th: 67 | return 0, 0, h, w 68 | i = random.randint(0, h - th) 69 | j = random.randint(0, w - tw) 70 | return i, j, th, tw 71 | 72 | def __call__(self, vid): 73 | i, j, h, w = self.get_params(vid, self.size) 74 | return crop(vid, i, j, h, w) 75 | 76 | 77 | class CenterCrop(object): 78 | def __init__(self, size): 79 | self.size = size 80 | 81 | def __call__(self, vid): 82 | return center_crop(vid, self.size) 83 | 84 | 85 | class Resize(object): 86 | def __init__(self, size): 87 | self.size = size 88 | 89 | def __call__(self, vid): 90 | return resize(vid, self.size) 91 | 92 | 93 | class ToFloatTensorInZeroOne(object): 94 | def __call__(self, vid): 95 | return to_normalized_float_tensor(vid) 96 | 97 | 98 | class Normalize(object): 99 | def __init__(self, mean, std): 100 | self.mean = mean 101 | self.std = std 102 | 103 | def __call__(self, vid): 104 | return normalize(vid, self.mean, self.std) 105 | 106 | 107 | class RandomHorizontalFlip(object): 108 | def __init__(self, p=0.5): 109 | self.p = p 110 | 111 | def __call__(self, vid): 112 | if random.random() < self.p: 113 | return hflip(vid) 114 | return vid 115 | 116 | 117 | class Pad(object): 118 | def __init__(self, padding, fill=0): 119 | self.padding = padding 120 | self.fill = fill 121 | 122 | def __call__(self, vid): 123 | return pad(vid, self.padding, self.fill) 124 | -------------------------------------------------------------------------------- /video_backbone/TSP/data/README.md: -------------------------------------------------------------------------------- 1 | # Data Preprocessing 2 | 3 | **Step 1**: Download the ActivityNet v1.3 and THUMOS14 videos. For ActivityNet, you can submit a data request [here](https://docs.google.com/forms/d/e/1FAIpQLSeKaFq9ZfcmZ7W0B0PbEhfbTHY41GeEgwsa7WobJgGUhn4DTQ/viewform). For THUMOS14, you can download it directly from the [official website](http://crcv.ucf.edu/THUMOS14/download.html). 4 | 5 | **Step 2**: Standardize all videos to MP4 format with a constant frame rate of 30fps using the script `standardize_videos_to_constant_30fps_mp4.sh`: 6 | ``` 7 | bash standardize_video_to_constant_30fps_mp4.sh 8 | ``` 9 | 10 | **Step 3**: Split the ActivityNet videos into three subfolders: `train` (10024 videos), `valid` (4926 videos), and `test` (5044 videos) using the official splits. Similarly, split THUMOS14 into `valid` (200 videos) and `test` (213 videos) subfolders. 11 | 12 | **Step 4**: Generate metadata CSV files for each ActivityNet and THUMOS14 subset using the script `generate_metadata_csv.py`. _This step is already pre-computed for the standardized ActivityNet and THUMOS14 videos and saved in the `activitynet` and `thumos14` folders_. 
13 | ``` 14 | python generate_metadata_csv.py --video-folder --output-csv 15 | ``` 16 | -------------------------------------------------------------------------------- /video_backbone/TSP/data/activitynet/activitynet_v1-3_action_label_mapping.json: -------------------------------------------------------------------------------- 1 | ["Applying sunscreen", 2 | "Archery", 3 | "Arm wrestling", 4 | "Assembling bicycle", 5 | "BMX", 6 | "Baking cookies", 7 | "Ballet", 8 | "Bathing dog", 9 | "Baton twirling", 10 | "Beach soccer", 11 | "Beer pong", 12 | "Belly dance", 13 | "Blow-drying hair", 14 | "Blowing leaves", 15 | "Braiding hair", 16 | "Breakdancing", 17 | "Brushing hair", 18 | "Brushing teeth", 19 | "Building sandcastles", 20 | "Bullfighting", 21 | "Bungee jumping", 22 | "Calf roping", 23 | "Camel ride", 24 | "Canoeing", 25 | "Capoeira", 26 | "Carving jack-o-lanterns", 27 | "Changing car wheel", 28 | "Cheerleading", 29 | "Chopping wood", 30 | "Clean and jerk", 31 | "Cleaning shoes", 32 | "Cleaning sink", 33 | "Cleaning windows", 34 | "Clipping cat claws", 35 | "Cricket", 36 | "Croquet", 37 | "Cumbia", 38 | "Curling", 39 | "Cutting the grass", 40 | "Decorating the Christmas tree", 41 | "Disc dog", 42 | "Discus throw", 43 | "Dodgeball", 44 | "Doing a powerbomb", 45 | "Doing crunches", 46 | "Doing fencing", 47 | "Doing karate", 48 | "Doing kickboxing", 49 | "Doing motocross", 50 | "Doing nails", 51 | "Doing step aerobics", 52 | "Drinking beer", 53 | "Drinking coffee", 54 | "Drum corps", 55 | "Elliptical trainer", 56 | "Fixing bicycle", 57 | "Fixing the roof", 58 | "Fun sliding down", 59 | "Futsal", 60 | "Gargling mouthwash", 61 | "Getting a haircut", 62 | "Getting a piercing", 63 | "Getting a tattoo", 64 | "Grooming dog", 65 | "Grooming horse", 66 | "Hammer throw", 67 | "Hand car wash", 68 | "Hand washing clothes", 69 | "Hanging wallpaper", 70 | "Having an ice cream", 71 | "High jump", 72 | "Hitting a pinata", 73 | "Hopscotch", 74 | "Horseback riding", 75 | "Hula hoop", 76 | "Hurling", 77 | "Ice fishing", 78 | "Installing carpet", 79 | "Ironing clothes", 80 | "Javelin throw", 81 | "Kayaking", 82 | "Kite flying", 83 | "Kneeling", 84 | "Knitting", 85 | "Laying tile", 86 | "Layup drill in basketball", 87 | "Long jump", 88 | "Longboarding", 89 | "Making a cake", 90 | "Making a lemonade", 91 | "Making a sandwich", 92 | "Making an omelette", 93 | "Mixing drinks", 94 | "Mooping floor", 95 | "Mowing the lawn", 96 | "Paintball", 97 | "Painting", 98 | "Painting fence", 99 | "Painting furniture", 100 | "Peeling potatoes", 101 | "Ping-pong", 102 | "Plastering", 103 | "Plataform diving", 104 | "Playing accordion", 105 | "Playing badminton", 106 | "Playing bagpipes", 107 | "Playing beach volleyball", 108 | "Playing blackjack", 109 | "Playing congas", 110 | "Playing drums", 111 | "Playing field hockey", 112 | "Playing flauta", 113 | "Playing guitarra", 114 | "Playing harmonica", 115 | "Playing ice hockey", 116 | "Playing kickball", 117 | "Playing lacrosse", 118 | "Playing piano", 119 | "Playing polo", 120 | "Playing pool", 121 | "Playing racquetball", 122 | "Playing rubik cube", 123 | "Playing saxophone", 124 | "Playing squash", 125 | "Playing ten pins", 126 | "Playing violin", 127 | "Playing water polo", 128 | "Pole vault", 129 | "Polishing forniture", 130 | "Polishing shoes", 131 | "Powerbocking", 132 | "Preparing pasta", 133 | "Preparing salad", 134 | "Putting in contact lenses", 135 | "Putting on makeup", 136 | "Putting on shoes", 137 | "Rafting", 138 | "Raking leaves", 139 | "Removing curlers", 140 | 
"Removing ice from car", 141 | "Riding bumper cars", 142 | "River tubing", 143 | "Rock climbing", 144 | "Rock-paper-scissors", 145 | "Rollerblading", 146 | "Roof shingle removal", 147 | "Rope skipping", 148 | "Running a marathon", 149 | "Sailing", 150 | "Scuba diving", 151 | "Sharpening knives", 152 | "Shaving", 153 | "Shaving legs", 154 | "Shot put", 155 | "Shoveling snow", 156 | "Shuffleboard", 157 | "Skateboarding", 158 | "Skiing", 159 | "Slacklining", 160 | "Smoking a cigarette", 161 | "Smoking hookah", 162 | "Snatch", 163 | "Snow tubing", 164 | "Snowboarding", 165 | "Spinning", 166 | "Spread mulch", 167 | "Springboard diving", 168 | "Starting a campfire", 169 | "Sumo", 170 | "Surfing", 171 | "Swimming", 172 | "Swinging at the playground", 173 | "Table soccer", 174 | "Tai chi", 175 | "Tango", 176 | "Tennis serve with ball bouncing", 177 | "Throwing darts", 178 | "Trimming branches or hedges", 179 | "Triple jump", 180 | "Tug of war", 181 | "Tumbling", 182 | "Using parallel bars", 183 | "Using the balance beam", 184 | "Using the monkey bar", 185 | "Using the pommel horse", 186 | "Using the rowing machine", 187 | "Using uneven bars", 188 | "Vacuuming floor", 189 | "Volleyball", 190 | "Wakeboarding", 191 | "Walking the dog", 192 | "Washing dishes", 193 | "Washing face", 194 | "Washing hands", 195 | "Waterskiing", 196 | "Waxing skis", 197 | "Welding", 198 | "Windsurfing", 199 | "Wrapping presents", 200 | "Zumba"] -------------------------------------------------------------------------------- /video_backbone/TSP/data/activitynet/activitynet_v1-3_temporal_region_label_mapping.json: -------------------------------------------------------------------------------- 1 | ["Action", 2 | "No action"] -------------------------------------------------------------------------------- /video_backbone/TSP/data/generate_metadata_csv.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, print_function 2 | 3 | import argparse 4 | import os 5 | import glob 6 | import pandas as pd 7 | 8 | from torchvision.io import read_video_timestamps 9 | from joblib import Parallel, delayed 10 | 11 | 12 | def get_video_stats(filename): 13 | pts, video_fps = read_video_timestamps(filename=filename, pts_unit='sec') 14 | if video_fps: 15 | stats = {'filename': os.path.basename(filename), 16 | 'video-duration': len(pts)/video_fps, 17 | 'fps': video_fps, 18 | 'video-frames': len(pts)} 19 | else: 20 | stats = {'filename': os.path.basename(filename), 21 | 'video-duration': None, 22 | 'fps': None, 23 | 'video-frames': None} 24 | print(f'WARNING: {filename} has an issue. 
video_fps = {video_fps}, len(pts) = {len(pts)}.') 25 | return stats 26 | 27 | 28 | def main(args): 29 | print(args) 30 | 31 | filenames = glob.glob(os.path.join(args.video_folder, f'*.{args.ext}')) 32 | print(f'Number of video files: {len(filenames)}') 33 | 34 | all_stats = Parallel(n_jobs=args.workers)( 35 | delayed(get_video_stats)( 36 | filename=filename, 37 | ) for filename in filenames) 38 | 39 | df = pd.DataFrame(all_stats) 40 | df.to_csv(args.output_csv, index=False) 41 | print(f'Saved metadata to {args.output_csv}') 42 | 43 | if __name__ == '__main__': 44 | parser = argparse.ArgumentParser(description='Generates a metadata CSV file with columns ' 45 | '[filename, video-duration, fps, video-frames] ' 46 | 'for a given input video folder.') 47 | 48 | parser.add_argument('--video-folder', required=True, type=str, 49 | help='Path to folder containing the raw video files') 50 | parser.add_argument('--ext', default='mp4', type=str, 51 | help='Video files extension (default: mp4)') 52 | parser.add_argument('--output-csv', required=True, type=str, 53 | help='Where to save the metadata CSV file') 54 | parser.add_argument('--workers', default=20, type=int, 55 | help='Number of parallel processes to use to generate the output (default: 20)') 56 | 57 | args = parser.parse_args() 58 | 59 | main(args) 60 | -------------------------------------------------------------------------------- /video_backbone/TSP/data/standardize_videos_to_constant_30fps_mp4.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Script to convert all videos in to mp4 videos with constant frame rate of 30fps. 4 | # The output videos are saved in . 5 | # 6 | # usage: bash standardize_video_to_constant_30fps_mp4.sh 7 | 8 | if [ "$#" -ne 2 ]; then 9 | echo "Illegal number of parameters" 10 | echo "usage: bash standardize_video_to_constant_30fps_mp4.sh " 11 | exit 1 12 | fi 13 | 14 | 15 | INPUT_FOLDER=$1 16 | OUTPUT_FOLDER=$2 17 | 18 | echo "INPUT_FOLDER=$INPUT_FOLDER" 19 | echo "OUTPUT_FOLDER=$OUTPUT_FOLDER" 20 | 21 | mkdir -p $OUTPUT_FOLDER 22 | 23 | for input_video_path in $INPUT_FOLDER/*; 24 | do 25 | video_filename=$(basename $input_video_path) 26 | video_name="${video_filename%.*}" 27 | output_video_path="$OUTPUT_FOLDER/$video_name.mp4" 28 | 29 | echo "ffmpeg -y -i $input_video_path -filter:v fps=fps=30 $output_video_path" 30 | ffmpeg -y -i $input_video_path -filter:v fps=fps=30 $output_video_path 31 | done 32 | -------------------------------------------------------------------------------- /video_backbone/TSP/environment.yml: -------------------------------------------------------------------------------- 1 | name: tsp 2 | channels: 3 | - pytorch 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - torchvision=0.5.0 8 | - pytorch=1.4.0 9 | - cudatoolkit=10.1 10 | - pandas 11 | - h5py 12 | - av 13 | - joblib 14 | - tqdm 15 | -------------------------------------------------------------------------------- /video_backbone/TSP/extract_features/README.md: -------------------------------------------------------------------------------- 1 | # TSP Feature Extraction 2 | 3 | Follow the data preprocessing instructions described [here](../data) before extracting features. We provide scripts for feature extraction using the released pretrained models or using a local checkpoint. 4 | 5 | ### From Released Pretrained Models 6 | Use the `extract_features_from_a_released_checkpoint.sh` script to extract features from the official released models. 
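Editor's note: `generate_metadata_csv.py` above writes one row per video with the columns `[filename, video-duration, fps, video-frames]`, and both the feature-extraction scripts and `EvalVideoDataset` later consume that file. A small editorial sketch of inspecting it with pandas (the path is a placeholder):
```
import pandas as pd

df = pd.read_csv('metadata.csv')
print(df[['filename', 'fps', 'video-frames']].head())
print('total duration (s):', df['video-duration'].sum())
print('videos with unreadable streams:', df['fps'].isna().sum())
```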
You need to manually set the following variables: 7 | - `DATA_PATH`: Path to the video folder. 8 | - `METADATA_CSV_FILENAME`: Path to a metadata CSV file. For ActivityNet and THUMOS14, use the CSV files precomputed in the [data](../data) folder. If you want to extract features for other video datasets, first standardized the videos and then generate the metadata files as per the instructions [here](../data), specifically step 2 and 4. 9 | - `RELEASED_CHECKPOINT`: Name of the one of the `13` released pretrained model. Refer to the tables below for more details. 10 | - `STRIDE`: Choose the stride between clips, *e.g.,* `16` for non-overlapping clips and `1` for dense overlapping clips. 11 | - (Optional) `SHARD_ID`, `NUM_SHARDS`, `DEVICE`: Split the videos in the CSV into multiple shards for parallel feature extraction. Increase the number of shards and run the script independently on separate GPU devices, each with a different `SHARD_ID` from `0` to `NUM_SHARDS-1`. Each shard will process `num_videos / NUM_SHARDS` videos. 12 | 13 | ### From a Local Checkpoint 14 | Use the `extract_features_from_a_local_checkpoint.sh` script to extract features from a local checkpoint. You need to manually set the same variables above plus the following 2 variables instead of `RELEASED_CHECKPOINT`: 15 | - `LOCAL_CHECKPOINT`: Path to the local checkpoint `.pth` file. 16 | - `BACKBONE`: The backbone used in the local checkpoint: `r2plus1d_34`, `r2plus1d_18`, or `r3d_18`. 17 | 18 | ## Post Processing Output 19 | The feature extraction script will output a `.pkl` file for each video. Merge all the `.pkl` files into one `.h5` file as follows: 20 | 21 | ``` 22 | python merge_pkl_files_into_one_h5_feature_file.py --features-folder --output-h5 23 | ``` 24 | 25 | ------ 26 | 27 | **Released Pretrained Models** 28 | 29 | **Main TSP models** 30 | | Name | Description | Weights | 31 | | ---------------------------------------- | ----------------------------------------------------------- | ------- | 32 | | `r2plus1d_34-tsp_on_activitynet` | R(2+1)D-34 pretrained with TSP on ActivityNet | [checkpoint](https://github.com/HumamAlwassel/TSP/releases/download/model_weights/r2plus1d_34-tsp_on_activitynet-max_gvf-backbone_lr_0.0001-fc_lr_0.002-epoch_5-0d2cf854.pth) | 33 | | `r2plus1d_34-tsp_on_thumos14` | R(2+1)D-34 pretrained with TSP on THUMOS14 | [checkpoint](https://github.com/HumamAlwassel/TSP/releases/download/model_weights/r2plus1d_34-tsp_on_thumos14-max_gvf-backbone_lr_0.0001-fc_lr_0.004-epoch_4-e6a30b2f.pth) | 34 | 35 | **Main TAC baseline models** 36 | | Name | Description | Weights | 37 | | ---------------------------------------- | ----------------------------------------------------------- | ------- | 38 | | `r2plus1d_34-tac_on_activitynet` | R(2+1)D-34 pretrained with TAC on ActivityNet | [checkpoint](https://github.com/HumamAlwassel/TSP/releases/download/model_weights/r2plus1d_34-tac_on_activitynet-backbone_lr_0.0001-fc_lr_0.002-epoch_5-98ccac94.pth) | 39 | | `r2plus1d_34-tac_on_thumos14` | R(2+1)D-34 pretrained with TAC on THUMOS14 | [checkpoint](https://github.com/HumamAlwassel/TSP/releases/download/model_weights/r2plus1d_34-tac_on_thumos14-backbone_lr_0.00001-fc_lr_0.002-epoch_3-54b5c8aa.pth) | 40 | | `r2plus1d_34-tac_on_kinetics` | R(2+1)D-34 pretrained with TAC on Kinetics | [checkpoint](https://github.com/HumamAlwassel/TSP/releases/download/model_weights/r2plus1d_34-tac_on_kinetics-0547130e.pth) | 41 | 42 | **Other models from the GVF and backbone architecture ablation studies** 43 | | Name | Description | 
Weights | 44 | | ---------------------------------------- | ----------------------------------------------------------- | ------- | 45 | | `r2plus1d_34-tsp_on_activitynet-avg_gvf` | R(2+1)D-34 pretrained with TSP on ActivityNet (average GVF) | [checkpoint](https://github.com/HumamAlwassel/TSP/releases/download/model_weights/r2plus1d_34-tsp_on_activitynet-avg_gvf-backbone_lr_0.0001-fc_lr_0.004-epoch_5-8b74eaa2.pth) | 46 | | `r2plus1d_34-tsp_on_activitynet-no_gvf` | R(2+1)D-34 pretrained with TSP on ActivityNet (without GVF) | [checkpoint](https://github.com/HumamAlwassel/TSP/releases/download/model_weights/r2plus1d_34-tsp_on_activitynet-no_gvf-backbone_lr_0.0001-fc_lr_0.004-epoch_5-fb38fdd2.pth) | 47 | | `r2plus1d_18-tsp_on_activitynet` | R(2+1)D-18 pretrained with TSP on ActivityNet | [checkpoint](https://github.com/HumamAlwassel/TSP/releases/download/model_weights/r2plus1d_18-tsp_on_activitynet-max_gvf-backbone_lr_0.0001-fc_lr_0.002-epoch_6-22835b73.pth) | 48 | | `r2plus1d_18-tac_on_activitynet` | R(2+1)D-18 pretrained with TAC on ActivityNet | [checkpoint](https://github.com/HumamAlwassel/TSP/releases/download/model_weights/r2plus1d_18-tac_on_activitynet-backbone_lr_0.0001-fc_lr_0.004-epoch_5-9f56941a.pth) | 49 | | `r2plus1d_18-tac_on_kinetics` | R(2+1)D-18 pretrained with TAC on Kinetics | [checkpoint](https://github.com/HumamAlwassel/TSP/releases/download/model_weights/r2plus1d_18-tac_on_kinetics-76ce975c.pth) | 50 | | `r3d_18-tsp_on_activitynet` | R3D-18 pretrained with TSP on ActivityNet | [checkpoint](https://github.com/HumamAlwassel/TSP/releases/download/model_weights/r3d_18-tsp_on_activitynet-max_gvf-backbone_lr_0.0001-fc_lr_0.002-epoch_6-85584422.pth) | 51 | | `r3d_18-tac_on_activitynet` | R3D-18 pretrained with TAC on ActivityNet | [checkpoint](https://github.com/HumamAlwassel/TSP/releases/download/model_weights/r3d_18-tac_on_activitynet-backbone_lr_0.001-fc_lr_0.01-epoch_5-31fd6e95.pth) | 52 | | `r3d_18-tac_on_kinetics` | R3D-18 pretrained with TAC on Kinetics | [checkpoint](https://github.com/HumamAlwassel/TSP/releases/download/model_weights/r3d_18-tac_on_kinetics-dcd952c6.pth) | 53 | 54 | -------------------------------------------------------------------------------- /video_backbone/TSP/extract_features/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ttengwang/PDVC/0b025c84f42fe27da51c312e8871c4b19628a04c/video_backbone/TSP/extract_features/__init__.py -------------------------------------------------------------------------------- /video_backbone/TSP/extract_features/eval_video_dataset.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, print_function 2 | 3 | import os 4 | import pandas as pd 5 | import numpy as np 6 | import torch 7 | import h5py 8 | import pickle as pkl 9 | 10 | from torch.utils.data import Dataset 11 | from torchvision.io import read_video 12 | 13 | 14 | class EvalVideoDataset(Dataset): 15 | ''' 16 | EvalVideoDataset: 17 | This dataset takes in a list of videos and return all clips with the given length and stride 18 | Each item in the dataset is a dictionary with the keys: 19 | - "clip": a Tensor (dtype=torch.float) of the clip frames after applying transforms 20 | - "filename": the video filename 21 | - "is-last-clip": a flag to mark the last clip in the video 22 | ''' 23 | 24 | def __init__(self, metadata_df, root_dir, clip_length, frame_rate, stride, output_dir, transforms=None): 25 | ''' 26 | Args: 27 
| metadata_df (pandas.DataFrame): a DataFrame with the following video metadata columns: 28 | [filename, fps, video-frames]. 29 | root_dir (string): Directory with all the video files. 30 | clip_length (int): The number of frames per clip. 31 | frame_rate (int): The effective frame rate (fps) to sample clips. 32 | stride (int): The number of frames (after resampling with frame_rate) between consecutive clips. 33 | For example, `stride`=1 will generate dense clips, while `stride`=`clip_length` will generate non-overlapping clips 34 | output_dir (string): Path to the directory where video features will be saved 35 | transforms (callable): A function/transform that takes in a TxHxWxC video 36 | and returns a transformed version. 37 | ''' 38 | metadata_df = EvalVideoDataset._append_root_dir_to_filenames_and_check_files_exist(metadata_df, root_dir) 39 | self.clip_metadata_df = EvalVideoDataset._generate_clips_metadata(metadata_df, clip_length, frame_rate, stride) 40 | self.clip_length = clip_length 41 | self.frame_rate = frame_rate 42 | self.stride = stride 43 | self.output_dir = output_dir 44 | self.transforms = transforms 45 | 46 | # Holds clip features for a given video until all clips are processed and the 47 | # full video features are ready to be saved to disk 48 | self.saved_features = {} 49 | self.saved_results = {} 50 | 51 | def __len__(self): 52 | return len(self.clip_metadata_df) 53 | 54 | def __getitem__(self, idx): 55 | sample = {} 56 | row = self.clip_metadata_df.iloc[idx] 57 | filename, fps = row['filename'], row['fps'] 58 | 59 | filename, fps, clip_t_start, is_last_clip = row['filename'], row['fps'], row['clip-t-start'], row['is-last-clip'] 60 | 61 | # compute clip_t_start and clip_t_end 62 | clip_length_in_sec = self.clip_length / self.frame_rate 63 | clip_t_end = clip_t_start + clip_length_in_sec 64 | 65 | # get a tensor [clip_length, H, W, C] of the video frames between clip_t_start and clip_t_end seconds 66 | vframes, _, _ = read_video(filename=filename, start_pts=clip_t_start, end_pts=clip_t_end, pts_unit='sec') 67 | idxs = EvalVideoDataset._resample_video_idx(self.clip_length, fps, self.frame_rate) 68 | vframes = vframes[idxs][:self.clip_length] # [:self.clip_length] for removing extra frames if isinstance(idxs, slice) 69 | if vframes.shape[0] != self.clip_length: 70 | raise RuntimeError(f': got clip of length {vframes.shape[0]} != {self.clip_length}.' 
71 | f'filename={filename}, clip_t_start={clip_t_start}, clip_t_end={clip_t_end}, ' 72 | f'fps={fps}') 73 | 74 | sample['clip'] = self.transforms(vframes) 75 | sample['filename'] = filename 76 | sample['is-last-clip'] = is_last_clip 77 | 78 | return sample 79 | 80 | def save_output(self, batch_output, batch_input, label_columns): 81 | batch_output = [x.detach().cpu().numpy() for x in batch_output] 82 | 83 | for i in range(batch_output[0].shape[0]): 84 | filename, is_last_clip = batch_input['filename'][i], batch_input['is-last-clip'][i] 85 | if not (filename in self.saved_results): 86 | self.saved_results[filename] = {l: [] for l in label_columns} 87 | for j, label in enumerate(label_columns): 88 | self.saved_results[filename][label].append(batch_output[j][i,...]) 89 | 90 | if is_last_clip: 91 | # dump results in disk at self.output_dir and then remove from self.saved_results 92 | output_filename = os.path.join(self.output_dir, os.path.basename(filename).split('.')[0] + '_output.pkl') 93 | for label in label_columns: 94 | self.saved_results[filename][label] = np.stack(self.saved_results[filename][label]) 95 | # np.save(output_filename, self.saved_results[filename]) 96 | with open(output_filename, 'wb') as fobj: 97 | pkl.dump(self.saved_results[filename], fobj) 98 | del self.saved_results[filename] 99 | 100 | def save_features(self, batch_features, batch_input): 101 | batch_features = batch_features.detach().cpu().numpy() 102 | 103 | for i in range(batch_features.shape[0]): 104 | filename, is_last_clip = batch_input['filename'][i], batch_input['is-last-clip'][i] 105 | if not (filename in self.saved_features): 106 | self.saved_features[filename] = [] 107 | self.saved_features[filename].append(batch_features[i,...]) 108 | 109 | if is_last_clip: 110 | # dump features to disk at self.output_dir and remove them from self.saved_features 111 | output_filename = os.path.join(self.output_dir, os.path.basename(filename).split('.')[0] + '.npy') 112 | self.saved_features[filename] = np.stack(self.saved_features[filename]) 113 | np.save(output_filename, self.saved_features[filename]) 114 | # with open(output_filename, 'wb') as fobj: 115 | # pkl.dump(self.saved_features[filename], fobj) 116 | del self.saved_features[filename] 117 | 118 | 119 | @staticmethod 120 | def _append_root_dir_to_filenames_and_check_files_exist(df, root_dir): 121 | df['filename'] = df['filename'].map(lambda f: os.path.join(root_dir, f)) 122 | filenames = df.drop_duplicates('filename')['filename'].values 123 | for f in filenames: 124 | if not os.path.exists(f): 125 | raise ValueError(f': file={f} does not exists. 
' 126 | f'Double-check root_dir and metadata_df inputs') 127 | return df 128 | 129 | @staticmethod 130 | def _generate_clips_metadata(df, clip_length, frame_rate, stride): 131 | clip_metadata = { 132 | 'filename': [], 133 | 'fps': [], 134 | 'clip-t-start': [], 135 | 'is-last-clip': [], 136 | } 137 | for i, row in df.iterrows(): 138 | total_frames_after_resampling = int(row['video-frames'] * (float(frame_rate) / row['fps'])) 139 | idxs = EvalVideoDataset._resample_video_idx(total_frames_after_resampling, row['fps'], frame_rate) 140 | if isinstance(idxs, slice): 141 | frame_idxs = np.arange(row['video-frames'])[idxs] 142 | else: 143 | frame_idxs = idxs.numpy() 144 | clip_t_start = list(frame_idxs[np.arange(0,frame_idxs.shape[0]-clip_length+1,stride)]/row['fps']) 145 | num_clips = len(clip_t_start) 146 | 147 | clip_metadata['filename'].extend([row['filename']]*num_clips) 148 | clip_metadata['fps'].extend([row['fps']]*num_clips) 149 | clip_metadata['clip-t-start'].extend(clip_t_start) 150 | is_last_clip = [0] * num_clips 151 | is_last_clip[-1] = 1 152 | clip_metadata['is-last-clip'].extend(is_last_clip) 153 | 154 | return pd.DataFrame(clip_metadata) 155 | 156 | @staticmethod 157 | def _resample_video_idx(num_frames, original_fps, new_fps): 158 | step = float(original_fps) / new_fps 159 | if step.is_integer(): 160 | # optimization: if step is integer, don't need to perform 161 | # advanced indexing 162 | step = int(step) 163 | return slice(None, None, step) 164 | idxs = torch.arange(num_frames, dtype=torch.float32) * step 165 | idxs = idxs.floor().to(torch.int64) 166 | return idxs 167 | -------------------------------------------------------------------------------- /video_backbone/TSP/extract_features/extract_features.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, print_function 2 | 3 | import os 4 | import torch 5 | import torchvision 6 | import json 7 | import datetime 8 | import time 9 | import numpy as np 10 | import pandas as pd 11 | import pickle as pkl 12 | import sys 13 | 14 | from torchvision import transforms 15 | from torch import nn 16 | from eval_video_dataset import EvalVideoDataset 17 | sys.path.insert(0, '..') 18 | from common import utils 19 | from common import transforms as T 20 | from models.model import Model 21 | 22 | 23 | MODEL_URLS = { 24 | # main TSP models 25 | 'r2plus1d_34-tsp_on_activitynet' : 'https://github.com/HumamAlwassel/TSP/releases/download/model_weights/r2plus1d_34-tsp_on_activitynet-max_gvf-backbone_lr_0.0001-fc_lr_0.002-epoch_5-0d2cf854.pth', 26 | 'r2plus1d_34-tsp_on_thumos14' : 'https://github.com/HumamAlwassel/TSP/releases/download/model_weights/r2plus1d_34-tsp_on_thumos14-max_gvf-backbone_lr_0.0001-fc_lr_0.004-epoch_4-e6a30b2f.pth', 27 | 28 | # main TAC baseline models 29 | 'r2plus1d_34-tac_on_activitynet' : 'https://github.com/HumamAlwassel/TSP/releases/download/model_weights/r2plus1d_34-tac_on_activitynet-backbone_lr_0.0001-fc_lr_0.002-epoch_5-98ccac94.pth', 30 | 'r2plus1d_34-tac_on_thumos14' : 'https://github.com/HumamAlwassel/TSP/releases/download/model_weights/r2plus1d_34-tac_on_thumos14-backbone_lr_0.00001-fc_lr_0.002-epoch_3-54b5c8aa.pth', 31 | 'r2plus1d_34-tac_on_kinetics' : 'https://github.com/HumamAlwassel/TSP/releases/download/model_weights/r2plus1d_34-tac_on_kinetics-0547130e.pth', 32 | 33 | # other models from the GVF and backbone architecture ablation studies 34 | 'r2plus1d_34-tsp_on_activitynet-avg_gvf': 
'https://github.com/HumamAlwassel/TSP/releases/download/model_weights/r2plus1d_34-tsp_on_activitynet-avg_gvf-backbone_lr_0.0001-fc_lr_0.004-epoch_5-8b74eaa2.pth', 35 | 'r2plus1d_34-tsp_on_activitynet-no_gvf' : 'https://github.com/HumamAlwassel/TSP/releases/download/model_weights/r2plus1d_34-tsp_on_activitynet-no_gvf-backbone_lr_0.0001-fc_lr_0.004-epoch_5-fb38fdd2.pth', 36 | 37 | 'r2plus1d_18-tsp_on_activitynet' : 'https://github.com/HumamAlwassel/TSP/releases/download/model_weights/r2plus1d_18-tsp_on_activitynet-max_gvf-backbone_lr_0.0001-fc_lr_0.002-epoch_6-22835b73.pth', 38 | 'r2plus1d_18-tac_on_activitynet' : 'https://github.com/HumamAlwassel/TSP/releases/download/model_weights/r2plus1d_18-tac_on_activitynet-backbone_lr_0.0001-fc_lr_0.004-epoch_5-9f56941a.pth', 39 | 'r2plus1d_18-tac_on_kinetics' : 'https://github.com/HumamAlwassel/TSP/releases/download/model_weights/r2plus1d_18-tac_on_kinetics-76ce975c.pth', 40 | 41 | 'r3d_18-tsp_on_activitynet' : 'https://github.com/HumamAlwassel/TSP/releases/download/model_weights/r3d_18-tsp_on_activitynet-max_gvf-backbone_lr_0.0001-fc_lr_0.002-epoch_6-85584422.pth', 42 | 'r3d_18-tac_on_activitynet' : 'https://github.com/HumamAlwassel/TSP/releases/download/model_weights/r3d_18-tac_on_activitynet-backbone_lr_0.001-fc_lr_0.01-epoch_5-31fd6e95.pth', 43 | 'r3d_18-tac_on_kinetics' : 'https://github.com/HumamAlwassel/TSP/releases/download/model_weights/r3d_18-tac_on_kinetics-dcd952c6.pth', 44 | } 45 | 46 | 47 | def evaluate(model, data_loader, device): 48 | model.eval() 49 | metric_logger = utils.MetricLogger(delimiter=' ') 50 | header = 'Feature extraction:' 51 | with torch.no_grad(): 52 | for sample in metric_logger.log_every(data_loader, 10, header, device=device): 53 | clip = sample['clip'].to(device, non_blocking=True) 54 | logits, features = model(clip, return_features=True) 55 | data_loader.dataset.save_features(features, sample) 56 | # print(len(logits)) 57 | # print(logits[0].shape, logits[1].shape) 58 | data_loader.dataset.save_output(logits, sample, ["action-label"]) 59 | 60 | 61 | def main(args): 62 | print(args) 63 | print('TORCH VERSION: ', torch.__version__) 64 | print('TORCHVISION VERSION: ', torchvision.__version__) 65 | torch.backends.cudnn.benchmark = True 66 | 67 | device = torch.device(args.device) 68 | os.makedirs(args.output_dir, exist_ok=True) 69 | 70 | print('LOADING DATA') 71 | normalize = T.Normalize(mean=[0.43216, 0.394666, 0.37645], 72 | std=[0.22803, 0.22145, 0.216989]) 73 | 74 | transform = torchvision.transforms.Compose([ 75 | T.ToFloatTensorInZeroOne(), 76 | T.Resize((128, 171)), 77 | normalize, 78 | T.CenterCrop((112, 112)) 79 | ]) 80 | 81 | metadata_df = pd.read_csv(args.metadata_csv_filename) 82 | shards = np.linspace(0,len(metadata_df),args.num_shards+1).astype(int) 83 | start_idx, end_idx = shards[args.shard_id], shards[args.shard_id+1] 84 | print(f'shard-id: {args.shard_id + 1} out of {args.num_shards}, ' 85 | f'total number of videos: {len(metadata_df)}, shard size {end_idx-start_idx} videos') 86 | 87 | metadata_df = metadata_df.iloc[start_idx:end_idx].reset_index() 88 | metadata_df['is-computed-already'] = metadata_df['filename'].map(lambda f: 89 | os.path.exists(os.path.join(args.output_dir, os.path.basename(f).split('.')[0] + '.npy'))) 90 | metadata_df = metadata_df[metadata_df['is-computed-already']==False].reset_index(drop=True) 91 | print(f'Number of videos to process after excluding the ones already computed on disk: {len(metadata_df)}') 92 | 93 | dataset = EvalVideoDataset( 94 | metadata_df=metadata_df, 95 | 
root_dir=args.data_path, 96 | clip_length=args.clip_len, 97 | frame_rate=args.frame_rate, 98 | stride=args.stride, 99 | output_dir=args.output_dir, 100 | transforms=transform) 101 | 102 | print('CREATING DATA LOADER') 103 | data_loader = torch.utils.data.DataLoader( 104 | dataset, batch_size=args.batch_size, shuffle=False, 105 | num_workers=args.workers, pin_memory=True) 106 | 107 | print(f'LOADING MODEL') 108 | if args.local_checkpoint: 109 | print(f'from the local checkpoint: {args.local_checkpoint}') 110 | pretrained_state_dict = torch.load(args.local_checkpoint, map_location='cpu')['model'] 111 | else: 112 | print(f'from the GitHub released model: {args.released_checkpoint}') 113 | args.backbone = args.released_checkpoint.split('-')[0] 114 | pretrained_state_dict = torch.hub.load_state_dict_from_url( 115 | MODEL_URLS[args.released_checkpoint], progress=True, check_hash=True, map_location='cpu' 116 | )['model'] 117 | 118 | # model with a dummy classifier layer 119 | model = Model(backbone=args.backbone, num_classes=[1], num_heads=1, concat_gvf=False) 120 | model.to(device) 121 | 122 | # remove the classifier layers from the pretrained model and load the backbone weights 123 | pretrained_state_dict = {k: v for k,v in pretrained_state_dict.items() if 'fc' not in k} 124 | state_dict = model.state_dict() 125 | pretrained_state_dict['fc.weight'] = state_dict['fc.weight'] 126 | pretrained_state_dict['fc.bias'] = state_dict['fc.bias'] 127 | model.load_state_dict(pretrained_state_dict) 128 | 129 | print('START FEATURE EXTRACTION') 130 | evaluate(model, data_loader, device) 131 | 132 | 133 | if __name__ == '__main__': 134 | from opts import parse_args 135 | args = parse_args() 136 | main(args) 137 | -------------------------------------------------------------------------------- /video_backbone/TSP/extract_features/extract_features_from_a_local_checkpoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -i 2 | 3 | #################################################################################### 4 | ########################## PARAMETERS THAT NEED TO BE SET ########################## 5 | #################################################################################### 6 | 7 | DATA_PATH= # path/to/video/folder/ 8 | METADATA_CSV_FILENAME= # path/to/metadata/csv/file. Use the ones provided in the data folder. 9 | 10 | LOCAL_CHECKPOINT= # path/to/local/checkpoint/file.pth 11 | BACKBONE= # Set the backbone used in the LOCAL_CHECKPOINT: r2plus1d_34, r2plus1d_18, or r3d_18 12 | 13 | # Choose the stride between clips, e.g. 16 for non-overlapping clips and 1 for dense overlapping clips 14 | STRIDE=16 15 | 16 | # Optional: Split the videos into multiple shards for parallel feature extraction 17 | # Increase the number of shards and run this script independently on separate GPU devices, 18 | # each with a different SHARD_ID from 0 to NUM_SHARDS-1. 19 | # Each shard will process (num_videos / NUM_SHARDS) videos. 20 | SHARD_ID=0 21 | NUM_SHARDS=1 22 | DEVICE=cuda:0 23 | 24 | if [ -z "$DATA_PATH" ]; then 25 | echo "DATA_PATH variable is not set." 26 | echo "Please set DATA_PATH to the folder containing the videos you want to process." 27 | exit 1 28 | fi 29 | 30 | if [ -z "$METADATA_CSV_FILENAME" ]; then 31 | echo "METADATA_CSV_FILENAME variable is not set." 32 | echo "We provide metadata CSV files for ActivityNet and THUMOS14 in the data folder." 
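Editor's note: the sharding logic in extract_features.py above (`np.linspace` over the metadata rows) is what the SHARD_ID and NUM_SHARDS variables in these scripts index into. A tiny editorial example of the arithmetic, with assumed counts:
```
import numpy as np

num_videos, num_shards = 10, 3
shards = np.linspace(0, num_videos, num_shards + 1).astype(int)   # array([ 0,  3,  6, 10])
for shard_id in range(num_shards):
    start_idx, end_idx = shards[shard_id], shards[shard_id + 1]
    print(f'shard {shard_id}: videos [{start_idx}, {end_idx})')   # 3, 3, and 4 videos respectively
```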
33 | exit 1 34 | fi 35 | 36 | if [ -z "$LOCAL_CHECKPOINT" ]; then 37 | echo "LOCAL_CHECKPOINT variable is not set." 38 | echo "Please set LOCAL_CHECKPOINT to the location of the local checkpoint .pth file." 39 | echo "Make sure to set the correct BACKBONE variable as well." 40 | exit 1 41 | fi 42 | 43 | if [ -z "$BACKBONE" ]; then 44 | echo "BACKBONE variable is not set." 45 | exit 1 46 | fi 47 | 48 | #################################################################################### 49 | ############################# PARAMETERS TO KEEP AS IS ############################# 50 | #################################################################################### 51 | 52 | OUTPUT_DIR=output/local_checkpoint_${BACKBONE}_features/stride_${STRIDE}/ 53 | 54 | source activate tsp 55 | mkdir -p $OUTPUT_DIR 56 | 57 | python extract_features.py \ 58 | --data-path $DATA_PATH \ 59 | --metadata-csv-filename $METADATA_CSV_FILENAME \ 60 | --local-checkpoint $LOCAL_CHECKPOINT \ 61 | --backbone $BACKBONE \ 62 | --stride $STRIDE \ 63 | --shard-id $SHARD_ID \ 64 | --num-shards $NUM_SHARDS \ 65 | --device $DEVICE \ 66 | --output-dir $OUTPUT_DIR 67 | -------------------------------------------------------------------------------- /video_backbone/TSP/extract_features/extract_features_from_a_released_checkpoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -i 2 | 3 | #################################################################################### 4 | ########################## PARAMETERS THAT NEED TO BE SET ########################## 5 | #################################################################################### 6 | 7 | DATA_PATH= # path/to/video/folder 8 | METADATA_CSV_FILENAME= # path/to/metadata/csv/file. Use the ones provided in the data folder. 9 | 10 | ############################## 11 | ### RELEASED GITHUB MODELS ### 12 | ############################## 13 | ## main TSP models -> 14 | # r2plus1d_34-tsp_on_activitynet (default) 15 | # r2plus1d_34-tsp_on_thumos14 16 | # 17 | ## main TAC baseline models -> 18 | # r2plus1d_34-tac_on_activitynet 19 | # r2plus1d_34-tac_on_thumos14 20 | # r2plus1d_34-tac_on_kinetics 21 | # 22 | ## other models from the GVF and backbone architecture ablation studies -> 23 | # r2plus1d_34-tsp_on_activitynet-avg_gvf 24 | # r2plus1d_34-tsp_on_activitynet-no_gvf 25 | # r2plus1d_18-tsp_on_activitynet 26 | # r2plus1d_18-tac_on_activitynet 27 | # r2plus1d_18-tac_on_kinetics 28 | # r3d_18-tsp_on_activitynet 29 | # r3d_18-tac_on_activitynet 30 | # r3d_18-tac_on_kinetics 31 | RELEASED_CHECKPOINT=r2plus1d_34-tsp_on_activitynet # choose one of the models above 32 | 33 | # Choose the stride between clips, e.g. 16 for non-overlapping clips and 1 for dense overlapping clips 34 | STRIDE=16 35 | 36 | # Optional: Split the videos into multiple shards for parallel feature extraction 37 | # Increase the number of shards and run this script independently on separate GPU devices, 38 | # each with a different SHARD_ID from 0 to NUM_SHARDS-1. 39 | # Each shard will process (num_videos / NUM_SHARDS) videos. 40 | SHARD_ID=0 41 | NUM_SHARDS=1 42 | DEVICE=cuda:0 43 | 44 | if [ -z "$DATA_PATH" ]; then 45 | echo "DATA_PATH variable is not set." 46 | echo "Please set DATA_PATH to the folder containing the videos you want to process." 47 | exit 1 48 | fi 49 | 50 | if [ -z "$METADATA_CSV_FILENAME" ]; then 51 | echo "METADATA_CSV_FILENAME variable is not set." 
52 | echo "We provide metadata CSV files for ActivityNet and THUMOS14 in the data folder." 53 | exit 1 54 | fi 55 | 56 | #################################################################################### 57 | ############################# PARAMETERS TO KEEP AS IS ############################# 58 | #################################################################################### 59 | 60 | OUTPUT_DIR=output/${RELEASED_CHECKPOINT}_features/stride_${STRIDE}/ 61 | 62 | source activate tsp 63 | mkdir -p $OUTPUT_DIR 64 | 65 | python extract_features.py \ 66 | --data-path $DATA_PATH \ 67 | --metadata-csv-filename $METADATA_CSV_FILENAME \ 68 | --released-checkpoint $RELEASED_CHECKPOINT \ 69 | --stride $STRIDE \ 70 | --shard-id $SHARD_ID \ 71 | --num-shards $NUM_SHARDS \ 72 | --device $DEVICE \ 73 | --output-dir $OUTPUT_DIR 74 | -------------------------------------------------------------------------------- /video_backbone/TSP/extract_features/merge_pkl_files_into_one_h5_feature_file.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, print_function 2 | 3 | import argparse 4 | import pickle as pkl 5 | import h5py 6 | import glob 7 | import os 8 | 9 | from tqdm import tqdm 10 | 11 | 12 | def main(args): 13 | print(args) 14 | compression_flags = dict(compression='gzip', compression_opts=9) 15 | filenames = glob.glob(os.path.join(args.features_folder, '*.pkl')) 16 | print(f'Number of pkl files: {len(filenames)}') 17 | 18 | output = h5py.File(args.output_h5, 'w') 19 | for f in tqdm(filenames): 20 | video_name = os.path.basename(f).split('.pkl')[0] 21 | with open(f, 'rb') as fobj: 22 | data = pkl.load(fobj) 23 | output.create_dataset(video_name, data=data, chunks=True, **compression_flags) 24 | 25 | output.close() 26 | print(f'The h5 feature file is saved to {args.output_h5}') 27 | 28 | 29 | if __name__ == '__main__': 30 | parser = argparse.ArgumentParser(description='Merge the feature pkl files of different videos into one ' 31 | 'h5 feature file mapping video name to feature tensor.') 32 | 33 | parser.add_argument('--features-folder', required=True, type=str, 34 | help='Path to the folder containing the pkl feature files') 35 | parser.add_argument('--output-h5', required=True, type=str, 36 | help='Where to save the combined metadata CSV file') 37 | 38 | args = parser.parse_args() 39 | 40 | main(args) 41 | -------------------------------------------------------------------------------- /video_backbone/TSP/extract_features/opts.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | 4 | RELEASED_GITHUB_MODELS = [ 5 | # main TSP models 6 | 'r2plus1d_34-tsp_on_activitynet', 7 | 'r2plus1d_34-tsp_on_thumos14', 8 | 9 | # main TAC baseline models 10 | 'r2plus1d_34-tac_on_activitynet', 11 | 'r2plus1d_34-tac_on_thumos14', 12 | 'r2plus1d_34-tac_on_kinetics', 13 | 14 | # other models from the GVF and backbone architecture ablation studies 15 | 'r2plus1d_34-tsp_on_activitynet-avg_gvf', 16 | 'r2plus1d_34-tsp_on_activitynet-no_gvf', 17 | 18 | 'r2plus1d_18-tsp_on_activitynet', 19 | 'r2plus1d_18-tac_on_activitynet', 20 | 'r2plus1d_18-tac_on_kinetics', 21 | 22 | 'r3d_18-tsp_on_activitynet', 23 | 'r3d_18-tac_on_activitynet', 24 | 'r3d_18-tac_on_kinetics', 25 | ] 26 | 27 | 28 | def parse_args(): 29 | parser = argparse.ArgumentParser(description='Features extraction script') 30 | 31 | parser.add_argument('--data-path', required=True, 32 | help='Path to the directory containing the videos 
files') 33 | parser.add_argument('--metadata-csv-filename', required=True, 34 | help='Path to the metadata CSV file') 35 | 36 | parser.add_argument('--backbone', default='r2plus1d_34', 37 | choices=['r2plus1d_34', 'r2plus1d_18', 'r3d_18'], 38 | help='Encoder backbone architecture (default r2plus1d_34). ' 39 | 'Supported backbones are r2plus1d_34, r2plus1d_18, and r3d_18') 40 | parser.add_argument('--device', default='cuda', 41 | help='Device to train on (default: cuda)') 42 | 43 | parser.add_argument('--released-checkpoint', default='r2plus1d_34-tsp_on_activitynet', 44 | choices=RELEASED_GITHUB_MODELS, 45 | help='Model checkpoint name to load from the released GitHub pretrained models. ' 46 | 'The backbone parameter is set automatically if loading from a released model. ' 47 | 'If `local-checkpoint` flag is not None, then this parameter is ignored and ' 48 | 'a checkpoint is loaded from the given `local-checkpoint` path on disk.') 49 | parser.add_argument('--local-checkpoint', default=None, 50 | help='Path to checkpoint on disk. If set, then read checkpoint from local disk. ' 51 | 'Otherwise, load checkpoint from the released GitHub models.') 52 | 53 | parser.add_argument('--clip-len', default=16, type=int, 54 | help='Number of frames per clip (default: 16)') 55 | parser.add_argument('--frame-rate', default=15, type=int, 56 | help='Frames-per-second rate at which the videos are sampled (default: 15)') 57 | parser.add_argument('--stride', default=16, type=int, 58 | help='Number of frames (after resampling with frame-rate) between consecutive clips (default: 16)') 59 | 60 | parser.add_argument('--batch-size', default=32, type=int, 61 | help='Batch size per GPU (default: 32)') 62 | parser.add_argument('--workers', default=6, type=int, 63 | help='Number of data loading workers (default: 6)') 64 | 65 | parser.add_argument('--output-dir', required=True, 66 | help='Path for saving features') 67 | parser.add_argument('--shard-id', default=0, type=int, 68 | help='Shard id number. 
Must be between [0, num-shards)') 69 | parser.add_argument('--num-shards', default=1, type=int, 70 | help='Number of shards to split the metadata-csv-filename') 71 | 72 | args = parser.parse_args() 73 | 74 | return args 75 | -------------------------------------------------------------------------------- /video_backbone/TSP/img/tsp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ttengwang/PDVC/0b025c84f42fe27da51c312e8871c4b19628a04c/video_backbone/TSP/img/tsp.png -------------------------------------------------------------------------------- /video_backbone/TSP/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ttengwang/PDVC/0b025c84f42fe27da51c312e8871c4b19628a04c/video_backbone/TSP/models/__init__.py -------------------------------------------------------------------------------- /video_backbone/TSP/models/backbone.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from torchvision.models.video import r2plus1d_18 as _r2plus1d_18 5 | from torchvision.models.video import r3d_18 as _r3d_18 6 | from torchvision.models.video.resnet import VideoResNet, R2Plus1dStem, BasicBlock 7 | 8 | __all__ = ['r2plus1d_34', 'r2plus1d_18', 'r3d_18'] 9 | 10 | R2PLUS1D_34_MODEL_URL="https://github.com/moabitcoin/ig65m-pytorch/releases/download/v1.0.0/r2plus1d_34_clip8_ft_kinetics_from_ig65m-0aa0550b.pth" 11 | 12 | 13 | def r2plus1d_34(pretrained=True, progress=False, **kwargs): 14 | model = VideoResNet( 15 | block=BasicBlock, 16 | conv_makers=[Conv2Plus1D] * 4, 17 | layers=[3, 4, 6, 3], 18 | stem=R2Plus1dStem, 19 | **kwargs, 20 | ) 21 | 22 | # We need exact Caffe2 momentum for BatchNorm scaling 23 | for m in model.modules(): 24 | if isinstance(m, nn.BatchNorm3d): 25 | m.eps = 1e-3 26 | m.momentum = 0.9 27 | 28 | if pretrained: 29 | state_dict = torch.hub.load_state_dict_from_url( 30 | R2PLUS1D_34_MODEL_URL, progress=progress 31 | ) 32 | model.load_state_dict(state_dict) 33 | 34 | return model 35 | 36 | 37 | def r2plus1d_18(pretrained=True, progress=False, **kwargs): 38 | return _r2plus1d_18(pretrained=pretrained, progress=progress, **kwargs) 39 | 40 | 41 | def r3d_18(pretrained=True, progress=False, **kwargs): 42 | return _r3d_18(pretrained=pretrained, progress=progress, **kwargs) 43 | 44 | 45 | class Conv2Plus1D(nn.Sequential): 46 | def __init__(self, in_planes, out_planes, midplanes, stride=1, padding=1): 47 | 48 | midplanes = (in_planes * out_planes * 3 * 3 * 3) // ( 49 | in_planes * 3 * 3 + 3 * out_planes 50 | ) 51 | super(Conv2Plus1D, self).__init__( 52 | nn.Conv3d( 53 | in_planes, 54 | midplanes, 55 | kernel_size=(1, 3, 3), 56 | stride=(1, stride, stride), 57 | padding=(0, padding, padding), 58 | bias=False, 59 | ), 60 | nn.BatchNorm3d(midplanes), 61 | nn.ReLU(inplace=True), 62 | nn.Conv3d( 63 | midplanes, 64 | out_planes, 65 | kernel_size=(3, 1, 1), 66 | stride=(stride, 1, 1), 67 | padding=(padding, 0, 0), 68 | bias=False, 69 | ), 70 | ) 71 | 72 | @staticmethod 73 | def get_downsample_stride(stride): 74 | return (stride, stride, stride) -------------------------------------------------------------------------------- /video_backbone/TSP/models/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from .backbone import r2plus1d_34, r2plus1d_18, r3d_18 4 | 5 | 6 | class 
Model(nn.Module): 7 | 8 | def __init__(self, backbone, num_classes, num_heads=1, concat_gvf=False, progress=True, **kwargs): 9 | ''' 10 | Args: 11 | backbone (string): The name of the backbone architecture. Supported architectures: r2plus1d_34, r2plus1d_18, and r3d_18. 12 | num_heads (int): The number of output heads 13 | num_classes (list of int): The number of labels per head 14 | concat_gvf (bool): If True and num_heads == 2, then concat global video features (GVF) to clip 15 | features before applying the second head FC layer. 16 | progress (bool): If True, displays a progress bar of the download to stderr 17 | **kwargs: keyword arguments to pass to backbone architecture constructor 18 | ''' 19 | super().__init__() 20 | print(f'<Model>: backbone {backbone} num_classes {num_classes} num_heads {num_heads} kwargs {kwargs}') 21 | assert len(num_classes) == num_heads, f'<Model>: incompatible configuration. len(num_classes) must be equal to num_heads' 22 | assert num_heads == 1 or num_heads == 2, f'<Model>: num_heads = {num_heads} must be either 1 or 2' 23 | 24 | self.backbone = backbone 25 | self.num_classes = num_classes 26 | self.num_heads = num_heads 27 | self.concat_gvf = concat_gvf 28 | 29 | self.features, self.feature_size = Model._build_feature_backbone(backbone, progress, **kwargs) 30 | 31 | if self.num_heads == 1: 32 | self.fc = Model._build_fc(self.feature_size, num_classes[0]) 33 | else: 34 | self.fc1 = Model._build_fc(self.feature_size, num_classes[0]) 35 | self.fc2 = Model._build_fc(2 * self.feature_size if self.concat_gvf else self.feature_size, num_classes[1]) 36 | 37 | def forward(self, x, gvf=None, return_features=False): 38 | features = self.features(x) 39 | if self.num_heads == 1: 40 | logits = [self.fc(features)] 41 | else: 42 | logits = [self.fc1(features)] 43 | if self.concat_gvf: 44 | assert gvf is not None, 'Forward pass expects a global video feature input but got None' 45 | logits.append(self.fc2(torch.cat([features, gvf], dim=-1))) 46 | else: 47 | logits.append(self.fc2(features)) 48 | 49 | return (logits, features) if return_features else logits 50 | 51 | @staticmethod 52 | def _build_feature_backbone(backbone, progress, **kwargs): 53 | if backbone == 'r2plus1d_34': builder = r2plus1d_34 54 | elif backbone == 'r2plus1d_18': builder = r2plus1d_18 55 | elif backbone == 'r3d_18': builder = r3d_18 56 | else: 57 | raise ValueError(f'<Model>: {backbone} is an invalid architecture type. 
' 58 | f'Supported architectures: r2plus1d_34, r2plus1d_18, and r3d_18') 59 | 60 | feature_backbone = builder(pretrained=True, progress=progress, **kwargs) 61 | 62 | # remove the FC layer of the backbone 63 | feature_size = feature_backbone.fc.in_features 64 | feature_backbone.fc = nn.Sequential() 65 | 66 | return feature_backbone, feature_size 67 | 68 | @staticmethod 69 | def _build_fc(in_features, out_features): 70 | fc = nn.Linear(in_features, out_features) 71 | nn.init.normal_(fc.weight, 0, 0.01) 72 | nn.init.constant_(fc.bias, 0) 73 | return fc 74 | -------------------------------------------------------------------------------- /video_backbone/TSP/train/README.md: -------------------------------------------------------------------------------- 1 | # TSP Training 2 | 3 | We provide four training scripts: 4 | - `train_tsp_on_activitynet.sh`: pretraining R(2+1)D-34 encoder with TSP on ActivityNet 5 | - `train_tsp_on_thumos14.sh`: pretraining R(2+1)D-34 encoder with TSP on THUMOS14 6 | - `train_tac_on_activitynet.sh`: pretraining R(2+1)D-34 encoder with TAC on ActivityNet (baseline) 7 | - `train_tac_on_thumos14.sh`: pretraining R(2+1)D-34 encoder with TAC on THUMOS14 (baseline) 8 | 9 | ## Launching the Training Scripts 10 | 11 | Before launching each script, you need to manually set **3 variables** inside each file: 12 | - `ROOT_DIR`: The root directory of either the ActivityNet or THUMOS14 videos. Follow the data preprocessing instructions and subfolders naming described [here](../data). 13 | - `NUM_GPUS`: The number of GPUs to use for training. We used 2 V100 (32G) GPUs in our TSP experiments, but the code is generic and can be run on any number of GPUs. 14 | - `DOWNSCALE_FACTOR`: The default batch size and learning rates were optimized for a GPU with 32G memory. We understand that such GPUs might not be accessible to all of the community. Thus, the training code can seamlessly be adapt to run on a smaller GPU memory size by adjusting this variable. Set `DOWNSCALE_FACTOR` to `1`, `2`, or `4` if you have a GPU with 32G, 16G, or 8G memory, respectively. The script will automatically downscale the batch size and the learning rate accordingly to keep the same expected performance. 15 | 16 | ## Experiment Output 17 | 18 | - Checkpoint per epoch (*e.g.,* `epoch_3.pth`): a `.pth` file containing the state dictionary of the model, optimizer, and learning rate scheduler. The checkpoint files can be used to resume the training (use `--resume` and `--start-epoch` input parameters in `train.py`) or to extract features (use the scripts [here](../extract_features)). 19 | - Metric results file (`results.txt`): A log of the metrics results on the validation subset after each epoch. We choose the best pretrained model based on the epoch with the highest `Avg Accuracy` value. 20 | 21 | ## Interested in Reproducing the Ablation Studies? 22 | 23 | Train with different encoder architectures? Change the variable `BACKBONE` to either `r2plus1d_18` or `r3d_18`. 24 | Train without GVF? Remove the line `--global-video-features $GLOBAL_VIDEO_FEATURES \` from the `train.py` call at the end. 25 | Train with average GVF? Set `GLOBAL_VIDEO_FEATURES=../data/activitynet/global_video_features/r2plus1d_34-avg_gvf.h5`. 26 | Train with only the temporal region classification head? Set `LABEL_COLUMNS=temporal-region-label` and `LABEL_MAPPING_JSONS=../data/activitynet/activitynet_v1-3_temporal_region_label_mapping.json`. 
Finally, make sure to rename `OUTPUT_DIR` to avoid overwriting previous experiments when reproducing the ablation studies. 27 | -------------------------------------------------------------------------------- /video_backbone/TSP/train/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ttengwang/PDVC/0b025c84f42fe27da51c312e8871c4b19628a04c/video_backbone/TSP/train/__init__.py -------------------------------------------------------------------------------- /video_backbone/TSP/train/opts.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | 4 | def parse_args(): 5 | parser = argparse.ArgumentParser(description='Training script for "TSP: Temporally-Sensitive Pretraining of Video Encoders for Localization Tasks"') 6 | 7 | parser.add_argument('--root-dir', required=True, 8 | help='Path to root directory containing the video files') 9 | parser.add_argument('--train-subdir', default='train', 10 | help='Training subdirectory inside the root directory (default: train)') 11 | parser.add_argument('--valid-subdir', default='valid', 12 | help='Validation subdirectory inside the root directory (default: valid)') 13 | parser.add_argument('--train-csv-filename', required=True, 14 | help='Path to the training CSV file') 15 | parser.add_argument('--valid-csv-filename', required=True, 16 | help='Path to the validation CSV file') 17 | parser.add_argument('--label-columns', nargs='+', required=True, 18 | help='Names of the label columns in the CSV files') 19 | parser.add_argument('--label-mapping-jsons', nargs='+', required=True, 20 | help='Path to the mapping of each label column') 21 | parser.add_argument('--loss-alphas', nargs='+', default=[1.0, 1.0], type=float, 22 | help='A list of the scalar alpha with which to weight each label loss') 23 | parser.add_argument('--global-video-features', 24 | help='Path to the h5 file containing global video features (GVF). ' 25 | 'If not given, then train without GVF.') 26 | 27 | parser.add_argument('--backbone', default='r2plus1d_34', 28 | choices=['r2plus1d_34', 'r2plus1d_18', 'r3d_18'], 29 | help='Encoder backbone architecture (default r2plus1d_34). 
' 30 | 'Supported backbones are r2plus1d_34, r2plus1d_18, and r3d_18') 31 | parser.add_argument('--device', default='cuda', 32 | help='Device to train on (default: cuda)') 33 | 34 | parser.add_argument('--clip-len', default=16, type=int, 35 | help='Number of frames per clip (default: 16)') 36 | parser.add_argument('--frame-rate', default=15, type=int, 37 | help='Frames-per-second rate at which the videos are sampled (default: 15)') 38 | parser.add_argument('--clips-per-segment', default=5, type=int, 39 | help='Number of clips sampled per video segment (default: 5)') 40 | parser.add_argument('--batch-size', default=32, type=int, 41 | help='Batch size per GPU (default: 32)') 42 | parser.add_argument('--workers', default=6, type=int, 43 | help='Number of data loading workers (default: 6)') 44 | 45 | parser.add_argument('--epochs', default=8, type=int, 46 | help='Number of total epochs to run') 47 | parser.add_argument('--backbone-lr', default=0.0001, type=float, 48 | help='Backbone layers learning rate') 49 | parser.add_argument('--fc-lr', default=0.002, type=float, 50 | help='Fully-connected classifiers learning rate') 51 | parser.add_argument('--lr-warmup-epochs', default=2, type=int, 52 | help='Number of warmup epochs') 53 | parser.add_argument('--lr-milestones', nargs='+', default=[4, 6], type=int, 54 | help='Decrease lr on milestone epoch') 55 | parser.add_argument('--lr-gamma', default=0.01, type=float, 56 | help='Decrease lr by a factor of lr-gamma at each milestone epoch') 57 | parser.add_argument('--momentum', default=0.9, type=float, 58 | help='Momentum (default: 0.9)') 59 | parser.add_argument('--weight-decay', default=0.005, type=float, 60 | help='Weight decay (default: 0.005)') 61 | 62 | parser.add_argument('--valid-only', action='store_true', 63 | help='Test the model on the validation subset and exit') 64 | parser.add_argument('--train-only-one-epoch', action='store_true', 65 | help='Train the model for only one epoch without testing on validation subset') 66 | 67 | parser.add_argument('--print-freq', default=100, type=int, 68 | help='Print frequency in number of batches') 69 | parser.add_argument('--output-dir', required=True, 70 | help='Path for saving checkpoints and results output') 71 | parser.add_argument('--resume', default='', 72 | help='Resume from checkpoint') 73 | parser.add_argument('--start-epoch', default=0, type=int, 74 | help='Start epoch (default: 0)') 75 | 76 | parser.add_argument('--dist-url', default='env://', 77 | help='URL used to set up distributed training') 78 | parser.add_argument('--sync-bn', action='store_true', 79 | help='Use sync batch norm (default: False)') 80 | 81 | parser.add_argument('--debug', action='store_true', 82 | help='Run the training over 100 samples only with batch size of 4') 83 | 84 | args = parser.parse_args() 85 | 86 | assert len(args.label_columns) == len(args.label_mapping_jsons) and len(args.label_columns) == len(args.loss_alphas), \ 87 | (f'The parameters label-columns, label-mapping-jsons, and loss-alphas must have the same length. 
' 88 | f'Got len(label-columns)={len(args.label_columns)}, len(label-mapping-jsons)={len(args.label_mapping_jsons)}, ' 89 | f'and len(loss-alphas)={len(args.loss_alphas)}') 90 | 91 | if args.debug: 92 | print('####### DEBUG MODE #######') 93 | args.batch_size = 4 94 | args.print_freq = 4 95 | 96 | return args 97 | -------------------------------------------------------------------------------- /video_backbone/TSP/train/train_tac_on_activitynet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -i 2 | 3 | #################################################################################### 4 | ########################## PARAMETERS THAT NEED TO BE SET ########################## 5 | #################################################################################### 6 | 7 | ROOT_DIR= 8 | NUM_GPUS= 9 | 10 | # Choose the appropriate batch size downscale factor for your GPU memory size 11 | # DOWNSCALE_FACTOR=1 --> a 32G memory GPU (default) 12 | # DOWNSCALE_FACTOR=2 --> a 16G memory GPU 13 | # DOWNSCALE_FACTOR=4 --> a 8G memory GPU 14 | DOWNSCALE_FACTOR=1 15 | 16 | if [ -z "$ROOT_DIR" ]; then 17 | echo "ROOT_DIR variable is not set." 18 | echo "Please set ROOT_DIR to the location of the ActivityNet videos." 19 | echo "The directory must contain two subdirectories: train and valid." 20 | exit 1 21 | fi 22 | 23 | if [ -z "$NUM_GPUS" ]; then 24 | echo "NUM_GPUS variable is not set." 25 | exit 1 26 | fi 27 | 28 | #################################################################################### 29 | ############################# PARAMETERS TO KEEP AS IS ############################# 30 | #################################################################################### 31 | 32 | TRAIN_SUBDIR=train 33 | VALID_SUBDIR=valid 34 | TRAIN_CSV_FILENAME=../data/activitynet/activitynet_v1-3_train_tsp_groundtruth.csv 35 | VALID_CSV_FILENAME=../data/activitynet/activitynet_v1-3_valid_tsp_groundtruth.csv 36 | LABEL_COLUMNS=action-label 37 | LABEL_MAPPING_JSONS=../data/activitynet/activitynet_v1-3_action_label_mapping.json 38 | LOSS_ALPHAS=1.0 39 | 40 | BACKBONE=r2plus1d_34 41 | 42 | BATCH_SIZE=32 43 | BACKBONE_LR=0.0001 44 | FC_LR=0.002 45 | 46 | OUTPUT_DIR=output/${BACKBONE}-tac_on_activitynet/backbone_lr_${BACKBONE_LR}-fc_lr_${FC_LR}/ 47 | 48 | MY_MASTER_ADDR=127.0.0.1 49 | MY_MASTER_PORT=$(shuf -i 30000-60000 -n 1) 50 | 51 | # downscaling 52 | BATCH_SIZE=$(bc <<< $BATCH_SIZE/$DOWNSCALE_FACTOR) 53 | BACKBONE_LR=$(bc -l <<< $BACKBONE_LR/$DOWNSCALE_FACTOR) 54 | FC_LR=$(bc -l <<< $FC_LR/$DOWNSCALE_FACTOR) 55 | 56 | source activate tsp 57 | mkdir -p $OUTPUT_DIR 58 | export OMP_NUM_THREADS=6 59 | 60 | python -m torch.distributed.launch --nproc_per_node=$NUM_GPUS \ 61 | --master_addr $MY_MASTER_ADDR --master_port $MY_MASTER_PORT --use_env \ 62 | train.py \ 63 | --root-dir $ROOT_DIR \ 64 | --train-subdir $TRAIN_SUBDIR \ 65 | --valid-subdir $VALID_SUBDIR \ 66 | --train-csv-filename $TRAIN_CSV_FILENAME \ 67 | --valid-csv-filename $VALID_CSV_FILENAME \ 68 | --label-mapping-jsons $LABEL_MAPPING_JSONS \ 69 | --label-columns $LABEL_COLUMNS \ 70 | --loss-alphas $LOSS_ALPHAS \ 71 | --backbone $BACKBONE \ 72 | --batch-size $BATCH_SIZE \ 73 | --backbone-lr $BACKBONE_LR \ 74 | --fc-lr $FC_LR \ 75 | --output-dir $OUTPUT_DIR \ 76 | -------------------------------------------------------------------------------- /video_backbone/TSP/train/train_tac_on_thumos14.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -i 2 | 3 | 
#################################################################################### 4 | ########################## PARAMETERS THAT NEED TO BE SET ########################## 5 | #################################################################################### 6 | 7 | ROOT_DIR= 8 | NUM_GPUS= 9 | 10 | # Choose the appropriate batch size downscale factor for your GPU memory size 11 | # DOWNSCALE_FACTOR=1 --> a 32G memory GPU (default) 12 | # DOWNSCALE_FACTOR=2 --> a 16G memory GPU 13 | # DOWNSCALE_FACTOR=4 --> a 8G memory GPU 14 | DOWNSCALE_FACTOR=1 15 | 16 | if [ -z "$ROOT_DIR" ]; then 17 | echo "ROOT_DIR variable is not set." 18 | echo "Please set ROOT_DIR to the location of the THUMOS14 videos." 19 | echo "The directory must contain two subdirectories: valid and test" 20 | exit 1 21 | fi 22 | 23 | if [ -z "$NUM_GPUS" ]; then 24 | echo "NUM_GPUS variable is not set." 25 | exit 1 26 | fi 27 | 28 | #################################################################################### 29 | ############################# PARAMETERS TO KEEP AS IS ############################# 30 | #################################################################################### 31 | 32 | TRAIN_SUBDIR=valid 33 | VALID_SUBDIR=test 34 | TRAIN_CSV_FILENAME=../data/thumos14/thumos14_valid_tsp_groundtruth.csv 35 | VALID_CSV_FILENAME=../data/thumos14/thumos14_test_tsp_groundtruth.csv 36 | LABEL_COLUMNS=action-label 37 | LABEL_MAPPING_JSONS=../data/thumos14/thumos14_action_label_mapping.json 38 | LOSS_ALPHAS=1.0 39 | 40 | BACKBONE=r2plus1d_34 41 | 42 | BATCH_SIZE=32 43 | BACKBONE_LR=0.00001 44 | FC_LR=0.002 45 | 46 | OUTPUT_DIR=output/${BACKBONE}-tac_on_thumos14/backbone_lr_${BACKBONE_LR}-fc_lr_${FC_LR}/ 47 | 48 | MY_MASTER_ADDR=127.0.0.1 49 | MY_MASTER_PORT=$(shuf -i 30000-60000 -n 1) 50 | 51 | # downscaling 52 | BATCH_SIZE=$(bc <<< $BATCH_SIZE/$DOWNSCALE_FACTOR) 53 | BACKBONE_LR=$(bc -l <<< $BACKBONE_LR/$DOWNSCALE_FACTOR) 54 | FC_LR=$(bc -l <<< $FC_LR/$DOWNSCALE_FACTOR) 55 | 56 | source activate tsp 57 | mkdir -p $OUTPUT_DIR 58 | export OMP_NUM_THREADS=6 59 | 60 | python -m torch.distributed.launch --nproc_per_node=$NUM_GPUS \ 61 | --master_addr $MY_MASTER_ADDR --master_port $MY_MASTER_PORT --use_env \ 62 | train.py \ 63 | --root-dir $ROOT_DIR \ 64 | --train-subdir $TRAIN_SUBDIR \ 65 | --valid-subdir $VALID_SUBDIR \ 66 | --train-csv-filename $TRAIN_CSV_FILENAME \ 67 | --valid-csv-filename $VALID_CSV_FILENAME \ 68 | --label-mapping-jsons $LABEL_MAPPING_JSONS \ 69 | --label-columns $LABEL_COLUMNS \ 70 | --loss-alphas $LOSS_ALPHAS \ 71 | --backbone $BACKBONE \ 72 | --batch-size $BATCH_SIZE \ 73 | --backbone-lr $BACKBONE_LR \ 74 | --fc-lr $FC_LR \ 75 | --output-dir $OUTPUT_DIR \ 76 | -------------------------------------------------------------------------------- /video_backbone/TSP/train/train_tsp_on_activitynet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -i 2 | 3 | #################################################################################### 4 | ########################## PARAMETERS THAT NEED TO BE SET ########################## 5 | #################################################################################### 6 | 7 | ROOT_DIR= 8 | NUM_GPUS= 9 | 10 | # Choose the appropriate batch size downscale factor for your GPU memory size 11 | # DOWNSCALE_FACTOR=1 --> a 32G memory GPU (default) 12 | # DOWNSCALE_FACTOR=2 --> a 16G memory GPU 13 | # DOWNSCALE_FACTOR=4 --> a 8G memory GPU 14 | DOWNSCALE_FACTOR=1 15 | 16 | if [ -z "$ROOT_DIR" ]; then 17 | echo 
"ROOT_DIR variable is not set." 18 | echo "Please set ROOT_DIR to the location of the ActivityNet videos." 19 | echo "The directory must contain two subdirectories: train and valid." 20 | exit 1 21 | fi 22 | 23 | if [ -z "$NUM_GPUS" ]; then 24 | echo "NUM_GPUS variable is not set." 25 | exit 1 26 | fi 27 | 28 | #################################################################################### 29 | ############################# PARAMETERS TO KEEP AS IS ############################# 30 | #################################################################################### 31 | 32 | TRAIN_SUBDIR=train 33 | VALID_SUBDIR=valid 34 | TRAIN_CSV_FILENAME=../data/activitynet/activitynet_v1-3_train_tsp_groundtruth.csv 35 | VALID_CSV_FILENAME=../data/activitynet/activitynet_v1-3_valid_tsp_groundtruth.csv 36 | LABEL_COLUMNS="action-label temporal-region-label" 37 | LABEL_MAPPING_JSONS="../data/activitynet/activitynet_v1-3_action_label_mapping.json \ 38 | ../data/activitynet/activitynet_v1-3_temporal_region_label_mapping.json" 39 | LOSS_ALPHAS="1.0 1.0" 40 | GLOBAL_VIDEO_FEATURES=../data/activitynet/global_video_features/r2plus1d_34-max_gvf.h5 41 | 42 | BACKBONE=r2plus1d_34 43 | 44 | BATCH_SIZE=32 45 | BACKBONE_LR=0.0001 46 | FC_LR=0.002 47 | 48 | OUTPUT_DIR=output/${BACKBONE}-tsp_on_activitynet/backbone_lr_${BACKBONE_LR}-fc_lr_${FC_LR}/ 49 | 50 | MY_MASTER_ADDR=127.0.0.1 51 | MY_MASTER_PORT=$(shuf -i 30000-60000 -n 1) 52 | 53 | # downscaling 54 | BATCH_SIZE=$(bc <<< $BATCH_SIZE/$DOWNSCALE_FACTOR) 55 | BACKBONE_LR=$(bc -l <<< $BACKBONE_LR/$DOWNSCALE_FACTOR) 56 | FC_LR=$(bc -l <<< $FC_LR/$DOWNSCALE_FACTOR) 57 | 58 | source activate tsp 59 | mkdir -p $OUTPUT_DIR 60 | export OMP_NUM_THREADS=6 61 | 62 | python -m torch.distributed.launch --nproc_per_node=$NUM_GPUS \ 63 | --master_addr $MY_MASTER_ADDR --master_port $MY_MASTER_PORT --use_env \ 64 | train.py \ 65 | --root-dir $ROOT_DIR \ 66 | --train-subdir $TRAIN_SUBDIR \ 67 | --valid-subdir $VALID_SUBDIR \ 68 | --train-csv-filename $TRAIN_CSV_FILENAME \ 69 | --valid-csv-filename $VALID_CSV_FILENAME \ 70 | --label-mapping-jsons $LABEL_MAPPING_JSONS \ 71 | --label-columns $LABEL_COLUMNS \ 72 | --loss-alphas $LOSS_ALPHAS \ 73 | --global-video-features $GLOBAL_VIDEO_FEATURES \ 74 | --backbone $BACKBONE \ 75 | --batch-size $BATCH_SIZE \ 76 | --backbone-lr $BACKBONE_LR \ 77 | --fc-lr $FC_LR \ 78 | --output-dir $OUTPUT_DIR \ 79 | -------------------------------------------------------------------------------- /video_backbone/TSP/train/train_tsp_on_thumos14.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -i 2 | 3 | #################################################################################### 4 | ########################## PARAMETERS THAT NEED TO BE SET ########################## 5 | #################################################################################### 6 | 7 | ROOT_DIR= 8 | NUM_GPUS= 9 | 10 | # Choose the appropriate batch size downscale factor for your GPU memory size 11 | # DOWNSCALE_FACTOR=1 --> a 32G memory GPU (default) 12 | # DOWNSCALE_FACTOR=2 --> a 16G memory GPU 13 | # DOWNSCALE_FACTOR=4 --> a 8G memory GPU 14 | DOWNSCALE_FACTOR=1 15 | 16 | if [ -z "$ROOT_DIR" ]; then 17 | echo "ROOT_DIR variable is not set." 18 | echo "Please set ROOT_DIR to the location of the THUMOS14 videos." 19 | echo "The directory must contain two subdirectories: valid and test" 20 | exit 1 21 | fi 22 | 23 | if [ -z "$NUM_GPUS" ]; then 24 | echo "NUM_GPUS variable is not set." 
25 | exit 1 26 | fi 27 | 28 | #################################################################################### 29 | ############################# PARAMETERS TO KEEP AS IS ############################# 30 | #################################################################################### 31 | 32 | TRAIN_SUBDIR=valid 33 | VALID_SUBDIR=test 34 | TRAIN_CSV_FILENAME=../data/thumos14/thumos14_valid_tsp_groundtruth.csv 35 | VALID_CSV_FILENAME=../data/thumos14/thumos14_test_tsp_groundtruth.csv 36 | LABEL_COLUMNS="action-label temporal-region-label" 37 | LABEL_MAPPING_JSONS="../data/thumos14/thumos14_action_label_mapping.json \ 38 | ../data/thumos14/thumos14_temporal_region_label_mapping.json" 39 | LOSS_ALPHAS="1.0 1.0" 40 | GLOBAL_VIDEO_FEATURES=../data/thumos14/global_video_features/r2plus1d_34-max_gvf.h5 41 | 42 | BACKBONE=r2plus1d_34 43 | 44 | BATCH_SIZE=32 45 | BACKBONE_LR=0.0001 46 | FC_LR=0.004 47 | 48 | OUTPUT_DIR=output/${BACKBONE}-tsp_on_thumos14/backbone_lr_${BACKBONE_LR}-fc_lr_${FC_LR}/ 49 | 50 | MY_MASTER_ADDR=127.0.0.1 51 | MY_MASTER_PORT=$(shuf -i 30000-60000 -n 1) 52 | 53 | # downscaling 54 | BATCH_SIZE=$(bc <<< $BATCH_SIZE/$DOWNSCALE_FACTOR) 55 | BACKBONE_LR=$(bc -l <<< $BACKBONE_LR/$DOWNSCALE_FACTOR) 56 | FC_LR=$(bc -l <<< $FC_LR/$DOWNSCALE_FACTOR) 57 | 58 | source activate tsp 59 | mkdir -p $OUTPUT_DIR 60 | export OMP_NUM_THREADS=6 61 | 62 | python -m torch.distributed.launch --nproc_per_node=$NUM_GPUS \ 63 | --master_addr $MY_MASTER_ADDR --master_port $MY_MASTER_PORT --use_env \ 64 | train.py \ 65 | --root-dir $ROOT_DIR \ 66 | --train-subdir $TRAIN_SUBDIR \ 67 | --valid-subdir $VALID_SUBDIR \ 68 | --train-csv-filename $TRAIN_CSV_FILENAME \ 69 | --valid-csv-filename $VALID_CSV_FILENAME \ 70 | --label-mapping-jsons $LABEL_MAPPING_JSONS \ 71 | --label-columns $LABEL_COLUMNS \ 72 | --loss-alphas $LOSS_ALPHAS \ 73 | --global-video-features $GLOBAL_VIDEO_FEATURES \ 74 | --backbone $BACKBONE \ 75 | --batch-size $BATCH_SIZE \ 76 | --backbone-lr $BACKBONE_LR \ 77 | --fc-lr $FC_LR \ 78 | --output-dir $OUTPUT_DIR \ 79 | -------------------------------------------------------------------------------- /video_backbone/TSP/train/untrimmed_video_dataset.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, print_function 2 | 3 | import os 4 | import pandas as pd 5 | import numpy as np 6 | import torch 7 | import h5py 8 | 9 | from torch.utils.data import Dataset 10 | from torchvision.io import read_video 11 | 12 | 13 | class UntrimmedVideoDataset(Dataset): 14 | ''' 15 | UntrimmedVideoDataset: 16 | This dataset takes in temporal segments from untrimmed videos and samples fixed-length 17 | clips from each segment. Each item in the dataset is a dictionary with the keys: 18 | - "clip": A Tensor (dtype=torch.float) of the clip frames after applying transforms 19 | - "label-Y": A label from the `label_columns` (one key for each label) or -1 if label is missing for that clip 20 | - "gvf": The global video feature (GVF) vector if `global_video_features` parameter is not None 21 | ''' 22 | 23 | def __init__(self, csv_filename, root_dir, clip_length, frame_rate, clips_per_segment, temporal_jittering, 24 | label_columns, label_mappings, seed=42, transforms=None, global_video_features=None, debug=False): 25 | ''' 26 | Args: 27 | csv_filename (string): Path to the CSV file with temporal segments information and annotations. 
28 | The CSV file must include the columns [filename, fps, t-start, t-end, video-duration] and 29 | the label columns given by the parameter `label_columns`. 30 | root_dir (string): Directory with all the video files. 31 | clip_length (int): The number of frames per clip. 32 | frame_rate (int): The effective frame rate (fps) to sample clips. 33 | clips_per_segment (int): The number of clips to sample per segment in the CSV file. 34 | temporal_jittering (bool): If True, clips are randomly sampled between t-start and t-end of 35 | each segment. Otherwise, clips are are sampled uniformly between t-start and t-end. 36 | seed (int): Seed of the random number generator used for the temporal jittering. 37 | transforms (callable): A function/transform that takes in a TxHxWxC video 38 | and returns a transformed version. 39 | label_columns (list of string): A list of the label columns in the CSV file. 40 | If more than one column is specified, the sample return a label for each. 41 | label_mappings (list of dict): A list of dictionaries to map the corresponding label 42 | from `label_columns` from a category string to an integer ID value. 43 | global_video_features (string): Path to h5 file containing global video features (optional) 44 | debug (bool): If true, create a debug dataset with 100 samples. 45 | ''' 46 | df = UntrimmedVideoDataset._clean_df_and_remove_short_segments(pd.read_csv(csv_filename), clip_length, frame_rate) 47 | self.df = UntrimmedVideoDataset._append_root_dir_to_filenames_and_check_files_exist(df, root_dir) 48 | self.clip_length = clip_length 49 | self.frame_rate = frame_rate 50 | self.clips_per_segment = clips_per_segment 51 | 52 | self.temporal_jittering = temporal_jittering 53 | self.rng = np.random.RandomState(seed=seed) 54 | self.uniform_sampling = np.linspace(0, 1, clips_per_segment) 55 | 56 | self.transforms = transforms 57 | 58 | self.label_columns = label_columns 59 | self.label_mappings = label_mappings 60 | for label_column, label_mapping in zip(label_columns, label_mappings): 61 | self.df[label_column] = self.df[label_column].map(lambda x: -1 if pd.isnull(x) else label_mapping[x]) 62 | 63 | self.global_video_features = global_video_features 64 | self.debug = debug 65 | 66 | def __len__(self): 67 | return len(self.df) * self.clips_per_segment if not self.debug else 100 68 | 69 | def __getitem__(self, idx): 70 | sample = {} 71 | row = self.df.iloc[idx % len(self.df)] 72 | filename, fps, t_start, t_end = row['filename'], row['fps'], row['t-start'], row['t-end'] 73 | 74 | # compute clip_t_start and clip_t_end 75 | clip_length_in_sec = self.clip_length / self.frame_rate 76 | ratio = self.rng.uniform() if self.temporal_jittering else self.uniform_sampling[idx//len(self.df)] 77 | clip_t_start = t_start + ratio * (t_end - t_start - clip_length_in_sec) 78 | clip_t_end = clip_t_start + clip_length_in_sec 79 | 80 | # get a tensor [clip_length, H, W, C] of the video frames between clip_t_start and clip_t_end seconds 81 | vframes, _, _ = read_video(filename=filename, start_pts=clip_t_start, end_pts=clip_t_end, pts_unit='sec') 82 | idxs = UntrimmedVideoDataset._resample_video_idx(self.clip_length, fps, self.frame_rate) 83 | vframes = vframes[idxs][:self.clip_length] # [:self.clip_length] for removing extra frames if isinstance(idxs, slice) 84 | if vframes.shape[0] != self.clip_length: 85 | raise RuntimeError(f': got clip of length {vframes.shape[0]} != {self.clip_length}.' 
86 | f'filename={filename}, clip_t_start={clip_t_start}, clip_t_end={clip_t_end}, ' 87 | f'fps={fps}, t_start={t_start}, t_end={t_end}') 88 | 89 | # apply transforms 90 | sample['clip'] = self.transforms(vframes) 91 | 92 | # add labels 93 | for label_column in self.label_columns: 94 | sample[label_column] = row[label_column] 95 | 96 | # add global video feature if it exists 97 | if self.global_video_features: 98 | f = h5py.File(self.global_video_features, 'r') 99 | sample['gvf'] = torch.tensor(f[os.path.basename(filename).split('.')[0]][()]) 100 | f.close() 101 | 102 | return sample 103 | 104 | @staticmethod 105 | def _clean_df_and_remove_short_segments(df, clip_length, frame_rate): 106 | # restrict all segments to be between [0, video-duration] 107 | df['t-end'] = np.minimum(df['t-end'], df['video-duration']) 108 | df['t-start'] = np.maximum(df['t-start'], 0) 109 | 110 | # remove segments that are too short to fit at least one clip 111 | segment_length = (df['t-end'] - df['t-start']) * frame_rate 112 | mask = segment_length >= clip_length 113 | num_segments = len(df) 114 | num_segments_to_keep = sum(mask) 115 | if num_segments - num_segments_to_keep > 0: 116 | df = df[mask].reset_index(drop=True) 117 | print(f': removed {num_segments - num_segments_to_keep}=' 118 | f'{100*(1 - num_segments_to_keep/num_segments):.2f}% from the {num_segments} ' 119 | f'segments from the input CSV file because they are shorter than ' 120 | f'clip_length={clip_length} frames using frame_rate={frame_rate} fps.') 121 | 122 | return df 123 | 124 | @staticmethod 125 | def _append_root_dir_to_filenames_and_check_files_exist(df, root_dir): 126 | df['filename'] = df['filename'].map(lambda f: os.path.join(root_dir, f)) 127 | filenames = df.drop_duplicates('filename')['filename'].values 128 | for f in filenames: 129 | if not os.path.exists(f): 130 | raise ValueError(f': file={f} does not exists. 
' 131 | f'Double-check root_dir and csv_filename inputs.') 132 | return df 133 | 134 | @staticmethod 135 | def _resample_video_idx(num_frames, original_fps, new_fps): 136 | step = float(original_fps) / new_fps 137 | if step.is_integer(): 138 | # optimization: if step is integer, don't need to perform 139 | # advanced indexing 140 | step = int(step) 141 | return slice(None, None, step) 142 | idxs = torch.arange(num_frames, dtype=torch.float32) * step 143 | idxs = idxs.floor().to(torch.int64) 144 | return idxs 145 | -------------------------------------------------------------------------------- /video_backbone/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ttengwang/PDVC/0b025c84f42fe27da51c312e8871c4b19628a04c/video_backbone/__init__.py -------------------------------------------------------------------------------- /visualization/Arial.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ttengwang/PDVC/0b025c84f42fe27da51c312e8871c4b19628a04c/visualization/Arial.ttf -------------------------------------------------------------------------------- /visualization/NotoSansCJK-Bold.otf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ttengwang/PDVC/0b025c84f42fe27da51c312e8871c4b19628a04c/visualization/NotoSansCJK-Bold.otf -------------------------------------------------------------------------------- /visualization/videos/xukun.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ttengwang/PDVC/0b025c84f42fe27da51c312e8871c4b19628a04c/visualization/videos/xukun.mp4 -------------------------------------------------------------------------------- /visualization/visualization.py: -------------------------------------------------------------------------------- 1 | # from IPython.display import clear_output, Image, display, HTML 2 | # %matplotlib notebook 3 | import matplotlib.pyplot as plt 4 | import time 5 | import numpy as np 6 | import cv2 7 | import base64 8 | import json 9 | from PIL import Image, ImageFont, ImageDraw 10 | import pdb 11 | import argparse 12 | import os 13 | from tqdm import tqdm 14 | 15 | 16 | def get_frame_caption(frame_time, dense_captions, ranking=False): 17 | frame_captions = [] 18 | idx_list = [] 19 | for event in dense_captions: 20 | s, e = event['timestamp'] 21 | if frame_time >= s and frame_time <= e: 22 | frame_captions.append(event) 23 | idx_list.append(event['original_id']) 24 | temperature = 1 25 | if ranking: 26 | scorer = lambda p: p['sentence_score'] / (float(len(p['sentence'].split())) ** (temperature) + 1e-5) + \ 27 | 1.0 * p['proposal_score'] * (1 - np.abs(frame_time - 0.5 * (p['timestamp'][0] + p['timestamp'][1])) / ( 28 | p['timestamp'][1] - p['timestamp'][0] + 1e-8)) 29 | 30 | frame_captions = sorted(frame_captions, key=scorer, reverse=True) 31 | else: 32 | frame_captions = sorted(frame_captions, key=lambda p: p['timestamp']) 33 | return frame_captions, idx_list 34 | 35 | 36 | def paint_text(im, chinese, font, pos, color): 37 | img_PIL = Image.fromarray(cv2.cvtColor(im, cv2.COLOR_BGR2RGB)) 38 | fillColor = color # (255,0,0) 39 | position = pos # (100,100) 40 | if not isinstance(chinese, str): 41 | chinese = chinese.decode('utf-8') 42 | draw = ImageDraw.Draw(img_PIL) 43 | draw.text(position, chinese, font=font, fill=fillColor) 44 | 45 | img = cv2.cvtColor(np.asarray(img_PIL), 
cv2.COLOR_RGB2BGR) 46 | return img 47 | 48 | def processImg(img, cur_time, title, dense_captions, prop_idx, n_caption=3, output_language='en'): 49 | scale = 1.0 50 | basic_text_height = 50 51 | text_height = int(basic_text_height * scale) 52 | font_size = int(text_height * 0.8) 53 | 54 | h, w, c = img.shape 55 | last_time = cur_time 56 | cur_time = time.time() 57 | img_fps = 1. / (cur_time - last_time + 1e-8) 58 | bg_img = np.zeros_like(img) 59 | cv2.rectangle(bg_img, (0, 0), (len(title) * text_height // 2, text_height), (120, 120, 120), -1, 1, 0) 60 | cv2.rectangle(bg_img, (0, h - text_height * n_caption), (w, h), (120, 120, 120), -1, 1, 0) 61 | mask = bg_img / 255. 62 | alpha = 0.5 63 | img = img * (mask == 0) + alpha * img * (mask > 0) + (1 - alpha) * mask 64 | img = img.astype('uint8') 65 | if output_language == 'zh-cn': 66 | font = ImageFont.truetype('visualization/NotoSansCJK-Bold.otf', font_size) 67 | elif output_language == 'en': 68 | font = ImageFont.truetype("visualization/Arial.ttf", font_size) 69 | else: 70 | font = ImageFont.truetype("/path/to/your.font.ttf", font_size) 71 | img = paint_text(img, title, font, (10, 0), color=(255, 255, 255)) 72 | for i, (proposal) in enumerate(dense_captions): 73 | caption, timestamp = proposal['sentence'], proposal['timestamp'] 74 | caption = '{:2.1f}s-{:2.1f}s: {}'.format(timestamp[0], timestamp[1], caption) 75 | ptText = (10, h - text_height * n_caption + i * text_height) 76 | if i in prop_idx: 77 | img = paint_text(img, caption, font, ptText, color=(255, 0, 0)) 78 | else: 79 | img = paint_text(img, caption, font, ptText, color=(255, 255, 255)) 80 | 81 | return img, cur_time, img_fps 82 | 83 | def vid_show(vid_path, captions, save_mp4, save_mp4_path, output_language='en'): 84 | start_time = time.time() 85 | cur_time = time.time() 86 | video = cv2.VideoCapture(vid_path) 87 | fps = video.get(cv2.CAP_PROP_FPS) 88 | frame_count = video.get(cv2.CAP_PROP_FRAME_COUNT) 89 | duration = frame_count / fps 90 | print('fps: {}, duration: {}, frames: {}'.format(fps, duration, frame_count)) 91 | img_fps = fps 92 | n = 0 93 | if save_mp4: 94 | fourcc = cv2.VideoWriter_fourcc(*"mp4v") 95 | videoWriter = cv2.VideoWriter(save_mp4_path, fourcc, fps, (1280, 720)) 96 | 97 | if not output_language == 'en': 98 | for proposal in captions: 99 | caption = translator.translate(proposal['sentence'], lang_src='en', lang_tgt=output_language) 100 | proposal['sentence'] = caption 101 | for i, proposal in enumerate(captions): 102 | proposal['original_id'] = i 103 | captions = sorted(captions, key=lambda p: p['timestamp']) 104 | 105 | for frame_id in tqdm(range(int(frame_count))): 106 | ret, frame = video.read() 107 | if n >= int(fps / img_fps) or save_mp4: 108 | n = 0 109 | # clear_output(wait=True) 110 | else: 111 | n += 1 112 | continue 113 | if not ret: 114 | break 115 | lines, columns, _ = frame.shape 116 | frame = cv2.resize(frame, (1280, 720)) 117 | frame_time = frame_id / fps 118 | if opt.show_all_caption_per_frame: 119 | frame_captions, highlight_idx = get_frame_caption(frame_time, captions, ranking=False) 120 | captions_to_show = captions 121 | n_caption = len(captions) 122 | else: 123 | frame_captions, highlight_idx = get_frame_caption(frame_time, captions, ranking=True) 124 | captions_to_show = frame_captions 125 | n_caption = min(3, len(captions_to_show)) 126 | 127 | title = '{:.1f}s/{:.1f}s'.format(frame_time, duration) 128 | frame, cur_time, img_fps = processImg(frame, cur_time, title, captions_to_show, highlight_idx, output_language=output_language, 
n_caption=n_caption) 129 | if not save_mp4: 130 | plt.axis('off') 131 | plt.imshow(frame[:, :, ::-1]) 132 | plt.show() 133 | # control fps 134 | if save_mp4: 135 | videoWriter.write(frame) 136 | 137 | if save_mp4: 138 | videoWriter.release() 139 | print('output videos saved at {}, process time: {} s'.format(save_mp4_path, cur_time - start_time)) 140 | 141 | 142 | if __name__ == '__main__': 143 | parser = argparse.ArgumentParser() 144 | parser.add_argument('--output_language', type=str, default='en', 145 | help='refer to /path/to/miniconda3/envs/PDVC/lib/python3.7/site-packages/google_trans_new/constant.py for more information') 146 | parser.add_argument('--output_mp4_folder', type=str, default=None) 147 | parser.add_argument('--input_mp4_folder', type=str, required=True) 148 | parser.add_argument('--dvc_file', type=str, required=True) 149 | parser.add_argument('--show_all_caption_per_frame', type=int, default=False) 150 | opt = parser.parse_args() 151 | if not opt.output_language == 'en': 152 | from google_trans_new import google_translator 153 | translator = google_translator() 154 | d = json.load(open(opt.dvc_file))['results'] 155 | for vid, dense_captions in d.items(): 156 | if opt.output_mp4_folder is None: 157 | opt.output_mp4_folder = opt.input_mp4_folder + '_output' 158 | if not os.path.exists(opt.output_mp4_folder): 159 | os.mkdir(opt.output_mp4_folder) 160 | output_mp4_path = os.path.join(opt.output_mp4_folder, vid + '.mp4') 161 | 162 | input_mp4_path = os.path.join(opt.input_mp4_folder, vid + '.mp4') 163 | print('process video: {} --> output: {}'.format(input_mp4_path, output_mp4_path)) 164 | if not os.path.exists(input_mp4_path): 165 | print('video {} does not exist, skip it.'.format(input_mp4_path)) 166 | continue 167 | vid_show(input_mp4_path, dense_captions, save_mp4=True, save_mp4_path=output_mp4_path, 168 | output_language=opt.output_language) 169 | -------------------------------------------------------------------------------- /visualization/xukun_cn.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ttengwang/PDVC/0b025c84f42fe27da51c312e8871c4b19628a04c/visualization/xukun_cn.gif -------------------------------------------------------------------------------- /visualization/xukun_en.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ttengwang/PDVC/0b025c84f42fe27da51c312e8871c4b19628a04c/visualization/xukun_en.gif --------------------------------------------------------------------------------
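A minimal usage sketch for the visualization script above. The flags mirror the argparse block in visualization.py; the prediction and output paths below are placeholders for your own dense-captioning result JSON (whose 'results' field maps each video id to its predicted events) and an output folder of your choice. Run it from the repository root so the bundled fonts under visualization/ are found:

python visualization/visualization.py \
    --input_mp4_folder visualization/videos \
    --output_mp4_folder visualization/videos_output \
    --dvc_file path/to/your_dvc_predictions.json \
    --output_language en \
    --show_all_caption_per_frame 0

Captioned videos are written to the output folder as <video_id>.mp4. Any --output_language other than en additionally requires the google_trans_new package imported in the __main__ block.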