├── .gitignore ├── .gitmodules ├── LICENSE ├── README.md ├── cfgs ├── anet_c3d_pdvc.yml ├── anet_c3d_pdvc_gt.yml ├── anet_c3d_pdvcl.yml ├── anet_c3d_pdvcl_gt.yml ├── anet_c3d_props.yml ├── anet_i3dvgg_pdvc.yml ├── anet_i3dvgg_pdvc_gt.yml ├── anet_tsn_pdvc.yml ├── anet_tsn_pdvc_gt.yml ├── anet_tsn_pdvcl.yml ├── anet_tsn_pdvcl_gt.yml ├── anet_tsp_pdvc.yml ├── anet_tsp_pdvc_gt.yml ├── anet_tsp_pdvcl.yml ├── yc2_tsn_pdvc.yml ├── yc2_tsn_pdvc_gt.yml ├── yc2_tsn_pdvcl.yml └── yc2_tsn_pdvcl_gt.yml ├── data ├── anet │ ├── captiondata │ │ ├── fake_test.json │ │ ├── para │ │ │ ├── anet_entities_test_1_para.json │ │ │ ├── anet_entities_test_2_para.json │ │ │ ├── anet_entities_val_1_para.json │ │ │ ├── anet_entities_val_2_para.json │ │ │ └── readme.txt │ │ ├── train_modified.json │ │ ├── val_1.json │ │ └── val_2.json │ ├── features │ │ ├── I3D_vggish_invalid_videos.json │ │ ├── convert_c3d_h5_to_npy.py │ │ ├── convert_tsp_h5_to_npy.py │ │ ├── download_c3d_features.sh │ │ ├── download_i3d_vggish_features.sh │ │ ├── download_tsn_features.sh │ │ ├── download_tsp_features.sh │ │ └── resnet_bn_invalid_videos.json │ └── vocabulary_activitynet.json ├── video_dataset.py └── yc2 │ ├── captiondata │ ├── para │ │ ├── convert_to_para.py │ │ └── para_yc2_val.json │ ├── yc2_test.json │ ├── yc2_train.json │ └── yc2_val.json │ ├── features │ └── download_yc2_tsn_features.sh │ └── vocabulary_youcook2.json ├── densevid_eval3 ├── eval_dvc.py ├── eval_para.py ├── eval_soda.py ├── evaluate2018.py ├── evaluate2021.py └── para_evaluate.py ├── eval.py ├── eval_utils.py ├── misc ├── build_vocab.py ├── detr_utils │ ├── box_ops.py │ └── misc.py └── utils.py ├── opts.py ├── pdvc.jpg ├── pdvc ├── CaptioningHead │ ├── LSTM.py │ ├── LSTM_DSA.py │ ├── Puppet.py │ └── __init__.py ├── __init__.py ├── base_encoder.py ├── criterion.py ├── deformable_transformer.py ├── matcher.py ├── ops │ ├── __init__.py │ ├── functions │ │ ├── __init__.py │ │ └── ms_deform_attn_func.py │ ├── make.sh │ ├── modules │ │ ├── __init__.py │ │ ├── ms_deform_attn.py │ │ └── ms_deform_attn_for_caption.py │ ├── setup.py │ ├── src │ │ ├── cpu │ │ │ ├── ms_deform_attn_cpu.cpp │ │ │ └── ms_deform_attn_cpu.h │ │ ├── cuda │ │ │ ├── ms_deform_attn_cuda.cu │ │ │ ├── ms_deform_attn_cuda.h │ │ │ └── ms_deform_im2col_cuda.cuh │ │ ├── ms_deform_attn.h │ │ └── vision.cpp │ └── test.py ├── pdvc.py └── position_encoding.py ├── requirement.txt ├── test_and_visualize.sh ├── train.py ├── video_backbone ├── TSP │ ├── .gitignore │ ├── LICENSE │ ├── README.md │ ├── __init__.py │ ├── common │ │ ├── __init__.py │ │ ├── scheduler.py │ │ ├── transforms.py │ │ └── utils.py │ ├── data │ │ ├── README.md │ │ ├── activitynet │ │ │ ├── activitynet_v1-3_action_label_mapping.json │ │ │ ├── activitynet_v1-3_temporal_region_label_mapping.json │ │ │ ├── activitynet_v1-3_test_metadata.csv │ │ │ ├── activitynet_v1-3_train_metadata.csv │ │ │ ├── activitynet_v1-3_train_tsp_groundtruth.csv │ │ │ ├── activitynet_v1-3_valid_metadata.csv │ │ │ └── activitynet_v1-3_valid_tsp_groundtruth.csv │ │ ├── generate_metadata_csv.py │ │ └── standardize_videos_to_constant_30fps_mp4.sh │ ├── environment.yml │ ├── extract_features │ │ ├── README.md │ │ ├── __init__.py │ │ ├── eval_video_dataset.py │ │ ├── extract_features.py │ │ ├── extract_features_from_a_local_checkpoint.sh │ │ ├── extract_features_from_a_released_checkpoint.sh │ │ ├── merge_pkl_files_into_one_h5_feature_file.py │ │ └── opts.py │ ├── img │ │ └── tsp.png │ ├── models │ │ ├── __init__.py │ │ ├── backbone.py │ │ └── model.py │ └── train │ │ ├── 
README.md │ │ ├── __init__.py │ │ ├── opts.py │ │ ├── train.py │ │ ├── train_tac_on_activitynet.sh │ │ ├── train_tac_on_thumos14.sh │ │ ├── train_tsp_on_activitynet.sh │ │ ├── train_tsp_on_thumos14.sh │ │ └── untrimmed_video_dataset.py └── __init__.py └── visualization ├── Arial.ttf ├── NotoSansCJK-Bold.otf ├── videos └── xukun.mp4 ├── visualization.py ├── xukun_cn.gif └── xukun_en.gif /.gitignore: -------------------------------------------------------------------------------- 1 | save/ 2 | save* 3 | *.hdf5 4 | *.npy 5 | data/anet/features/c3d 6 | data/anet/features/resnet_bn 7 | data/yc2/features/resnet_bn 8 | data/densevid_eval3 9 | *.tmp 10 | 11 | *.Ink 12 | .idea/ 13 | .DS_Store 14 | *.pyc 15 | cfgs/proposal/debug/ 16 | *.out 17 | # Byte-compiled / optimized / DLL files 18 | __pycache__/ 19 | *.py[cod] 20 | *$py.class 21 | 22 | # C extensions 23 | *.so 24 | 25 | # Distribution / packaging 26 | .Python 27 | build/ 28 | develop-eggs/ 29 | dist/ 30 | downloads/ 31 | eggs/ 32 | .eggs/ 33 | lib/ 34 | lib64/ 35 | parts/ 36 | sdist/ 37 | var/ 38 | wheels/ 39 | *.egg-info/ 40 | .installed.cfg 41 | *.egg 42 | MANIFEST 43 | 44 | # PyInstaller 45 | # Usually these files are written by a python misc from a template 46 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 47 | *.manifest 48 | *.spec 49 | 50 | # Installer logs 51 | pip-log.txt 52 | pip-delete-this-directory.txt 53 | 54 | # Unit debug / coverage reports 55 | htmlcov/ 56 | .tox/ 57 | .coverage 58 | .coverage.* 59 | .cache 60 | nosetests.xml 61 | coverage.xml 62 | *.cover 63 | .hypothesis/ 64 | .pytest_cache/ 65 | 66 | # Translations 67 | *.mo 68 | *.pot 69 | 70 | # Django stuff: 71 | *.log 72 | 73 | local_settings.py 74 | db.sqlite3 75 | 76 | # Flask stuff: 77 | instance/ 78 | .webassets-cache 79 | 80 | # Scrapy stuff: 81 | .scrapy 82 | 83 | # Sphinx documentation 84 | docs/_build/ 85 | 86 | # PyBuilder 87 | target/ 88 | 89 | # Jupyter Notebook 90 | .ipynb_checkpoints 91 | 92 | # pyenv 93 | .python-version 94 | 95 | # celery beat schedule file 96 | celerybeat-schedule 97 | 98 | # SageMath parsed files 99 | *.sage.py 100 | 101 | # Environments 102 | .env 103 | .venv 104 | env/ 105 | venv/ 106 | ENV/ 107 | env.bak/ 108 | venv.bak/ 109 | 110 | # Spyder project settings 111 | .spyderproject 112 | .spyproject 113 | 114 | # Rope project settings 115 | .ropeproject 116 | 117 | # mkdocs documentation 118 | /site 119 | 120 | # mypy 121 | .mypy_cache/ 122 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "densevid_eval3/SODA"] 2 | path = densevid_eval3/SODA 3 | url = https://github.com/fujiso/SODA.git 4 | [submodule "densevid_eval3/pycocoevalcap"] 5 | path = densevid_eval3/pycocoevalcap 6 | url = https://github.com/salaniz/pycocoevalcap.git 7 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Teng Wang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to 
do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /cfgs/anet_c3d_pdvc.yml: -------------------------------------------------------------------------------- 1 | id: anet_c3d_pdvc # the results and logs will saved in this folder ./save/id 2 | base_cfg_path: cfgs/anet_c3d_pdvcl.yml 3 | 4 | caption_decoder_type: standard 5 | cap_nheads: 1 6 | cap_dec_n_points: 4 7 | cap_num_feature_levels: 4 8 | soft_attention: 1 9 | att_hid_size: 512 10 | 11 | ec_alpha: 1.0 -------------------------------------------------------------------------------- /cfgs/anet_c3d_pdvc_gt.yml: -------------------------------------------------------------------------------- 1 | id: anet_c3d_pdvc_gt 2 | base_cfg_path: cfgs/anet_c3d_pdvcl_gt.yml 3 | 4 | caption_decoder_type: standard 5 | cap_nheads: 1 6 | cap_dec_n_points: 4 7 | cap_num_feature_levels: 4 8 | soft_attention: 1 9 | att_hid_size: 512 -------------------------------------------------------------------------------- /cfgs/anet_c3d_pdvcl.yml: -------------------------------------------------------------------------------- 1 | id: anet_c3d_pdvcl 2 | 3 | visual_feature_type: c3d 4 | visual_feature_folder: 'data/anet/features/c3d' 5 | feature_dim: 500 6 | invalid_video_json: [] 7 | train_proposal_file: data/generated_proposals/dbg_trainval_top100.json 8 | eval_proposal_file: data/generated_proposals/dbg_trainval_top100.json 9 | gt_file_for_eval: ['data/anet/captiondata/val_1.json', 'data/anet/captiondata/val_2.json'] 10 | gt_file_for_para_eval: ['data/anet/captiondata/para/anet_entities_val_1_para.json', 'data/anet/captiondata/para/anet_entities_val_2_para.json'] 11 | 12 | train_proposal_type: gt 13 | gt_proposal_sample_num: 30 14 | sample_method: nearest 15 | 16 | batch_size: 1 17 | lr: 0.00005 18 | learning_rate_decay_start: 8 19 | learning_rate_decay_every: 3 20 | learning_rate_decay_rate: 0.5 21 | weight_decay: 0.0001 22 | save_all_checkpoint: 0 23 | 24 | num_queries: 10 25 | dec_layers: 2 26 | enc_layers: 2 27 | transformer_ff_dim: 512 28 | transformer_dropout_prob: 0.1 29 | frame_embedding_num: 100 30 | caption_decoder_type: light 31 | att_hid_size: 0 32 | 33 | with_box_refine: 1 34 | 35 | fix_xcw: 1 36 | set_cost_caption: 0 37 | set_cost_giou: 4 38 | set_cost_bbox: 0 39 | set_cost_class: 2 40 | #cost_alpha: 0.5 41 | #cost_gamma: 1 42 | #focal_alpha: 0.5 43 | #focal_gamma: 1 44 | caption_loss_coef: 2 45 | giou_loss_coef: 4 46 | bbox_loss_coef: 0 47 | cls_loss_coef: 2 48 | count_loss_coef: 0.5 49 | max_eseq_length: 10 50 | lloss_cross_entropy: 0 51 | lloss_focal_loss: 0 52 | lloss_gau_mask: 1 -------------------------------------------------------------------------------- /cfgs/anet_c3d_pdvcl_gt.yml: -------------------------------------------------------------------------------- 1 | id: anet_c3d_pdvcl_gt 2 | 3 | visual_feature_type: c3d 4 | 
visual_feature_folder: 'data/anet/features/c3d' 5 | feature_dim: 500 6 | invalid_video_json: [] 7 | train_proposal_file: data/generated_proposals/dbg_trainval_top100.json 8 | eval_proposal_file: data/generated_proposals/dbg_trainval_top100.json 9 | gt_file_for_eval: ['data/anet/captiondata/val_1.json', 'data/anet/captiondata/val_2.json'] 10 | gt_file_for_para_eval: ['data/anet/captiondata/para/anet_entities_val_1_para.json', 'data/anet/captiondata/para/anet_entities_val_2_para.json'] 11 | 12 | train_proposal_type: gt 13 | gt_proposal_sample_num: 30 14 | sample_method: nearest 15 | 16 | batch_size: 1 17 | lr: 0.00005 18 | learning_rate_decay_start: 8 19 | learning_rate_decay_every: 3 20 | learning_rate_decay_rate: 0.5 21 | weight_decay: 0.0001 22 | save_all_checkpoint: 0 23 | 24 | num_queries: 10 25 | dec_layers: 2 26 | enc_layers: 2 27 | transformer_ff_dim: 512 28 | transformer_dropout_prob: 0.1 29 | frame_embedding_num: 100 30 | caption_decoder_type: light 31 | att_hid_size: 0 32 | 33 | #with_box_refine: 1 34 | 35 | fix_xcw: 1 36 | set_cost_caption: 0 37 | set_cost_giou: 4 38 | set_cost_bbox: 0.00001 39 | set_cost_class: 0 40 | #cost_alpha: 0.5 41 | #cost_gamma: 1 42 | #focal_alpha: 0.5 43 | #focal_gamma: 1 44 | caption_loss_coef: 2 45 | giou_loss_coef: 0 46 | bbox_loss_coef: 0 47 | cls_loss_coef: 0 48 | count_loss_coef: 0 49 | #max_eseq_length: 10 50 | #lloss_cross_entropy: 0 51 | #lloss_focal_loss: 0 52 | #lloss_gau_mask: 1 53 | 54 | #two_stage: 1 55 | transformer_input_type: gt_proposals -------------------------------------------------------------------------------- /cfgs/anet_c3d_props.yml: -------------------------------------------------------------------------------- 1 | id: anet_c3d_props 2 | visual_feature_type: c3d 3 | visual_feature_folder: 'data/anet/features/c3d' 4 | feature_dim: 500 5 | invalid_video_json: [] 6 | train_proposal_file: data/generated_proposals/dbg_trainval_top100.json 7 | eval_proposal_file: data/generated_proposals/dbg_trainval_top100.json 8 | gt_file_for_eval: ['data/anet/captiondata/val_1.json', 'data/anet/captiondata/val_2.json'] 9 | gt_file_for_para_eval: ['data/anet/captiondata/para/anet_entities_val_1_para.json', 'data/anet/captiondata/para/anet_entities_val_2_para.json'] 10 | 11 | train_proposal_type: gt 12 | train_proposal_sample_num: 15 13 | sample_method: nearest 14 | 15 | batch_size: 1 16 | lr: 0.00005 17 | learning_rate_decay_start: 8 18 | learning_rate_decay_every: 3 19 | learning_rate_decay_rate: 0.5 20 | weight_decay: 0.0001 21 | save_all_checkpoint: 0 22 | 23 | num_queries: 10 24 | dec_layers: 2 25 | enc_layers: 2 26 | transformer_ff_dim: 512 27 | transformer_dropout_prob: 0.1 28 | frame_embedding_num: 100 29 | caption_decoder_type: none 30 | att_hid_size: 0 31 | 32 | with_box_refine: 1 33 | 34 | fix_xcw: 1 35 | set_cost_caption: 0 36 | set_cost_giou: 4 37 | set_cost_bbox: 0 38 | set_cost_class: 2 39 | #cost_alpha: 0.5 40 | #cost_gamma: 1 41 | #focal_alpha: 0.5 42 | #focal_gamma: 1 43 | caption_loss_coef: 0 44 | giou_loss_coef: 4 45 | bbox_loss_coef: 0 46 | cls_loss_coef: 2 47 | count_loss_coef: 0.5 48 | max_eseq_length: 10 49 | lloss_cross_entropy: 0 50 | lloss_focal_loss: 0 51 | lloss_gau_mask: 1 -------------------------------------------------------------------------------- /cfgs/anet_i3dvgg_pdvc.yml: -------------------------------------------------------------------------------- 1 | id: anet_i3dvgg_pdvc 2 | base_cfg_path: cfgs/anet_c3d_pdvc.yml 3 | visual_feature_type: ['i3d_rgb', 'i3d_flow', 'vggish'] 4 | visual_feature_folder: 
['data/anet/features/i3d/', 'data/anet/features/i3d/', 'data/anet/features/vggish/'] 5 | invalid_video_json: ['data/anet/features/I3D_vggish_invalid_videos.json'] 6 | feature_dim: 2176 -------------------------------------------------------------------------------- /cfgs/anet_i3dvgg_pdvc_gt.yml: -------------------------------------------------------------------------------- 1 | id: anet_i3dvgg_pdvc_gt 2 | base_cfg_path: cfgs/anet_c3d_pdvc_gt.yml 3 | visual_feature_type: ['i3d_rgb', 'i3d_flow', 'vggish'] 4 | visual_feature_folder: ['data/anet/features/i3d_25fps_stack64step64_2stream_npy/', 'data/anet/features/i3d_25fps_stack64step64_2stream_npy/', 'data/anet/features/vggish_npy/'] 5 | invalid_video_json: ['data/anet/features/I3D_vggish_invalid_videos.json'] 6 | feature_dim: 2176 -------------------------------------------------------------------------------- /cfgs/anet_tsn_pdvc.yml: -------------------------------------------------------------------------------- 1 | id: anet_tsn_pdvc 2 | base_cfg_path: cfgs/anet_c3d_pdvc.yml 3 | visual_feature_type: ['resnet', 'bn'] 4 | visual_feature_folder: ['data/anet/features/resnet_bn', 'data/anet/features/resnet_bn'] 5 | invalid_video_json: ['data/anet/features/resnet_bn_invalid_videos.json'] 6 | feature_dim: 3072 -------------------------------------------------------------------------------- /cfgs/anet_tsn_pdvc_gt.yml: -------------------------------------------------------------------------------- 1 | id: anet_tsn_pdvc_gt 2 | base_cfg_path: cfgs/anet_c3d_pdvc_gt.yml 3 | visual_feature_type: ['resnet', 'bn'] 4 | visual_feature_folder: ['data/anet/features/resnet_bn', 'data/anet/features/resnet_bn'] 5 | invalid_video_json: ['data/anet/features/resnet_bn_invalid_videos.json'] 6 | feature_dim: 3072 -------------------------------------------------------------------------------- /cfgs/anet_tsn_pdvcl.yml: -------------------------------------------------------------------------------- 1 | id: anet_tsn_pdvcl 2 | base_cfg_path: cfgs/anet_c3d_pdvcl.yml 3 | visual_feature_type: ['resnet', 'bn'] 4 | visual_feature_folder: ['data/anet/features/resnet_bn', 'data/anet/features/resnet_bn'] 5 | invalid_video_json: ['data/anet/features/resnet_bn_invalid_videos.json'] 6 | feature_dim: 3072 -------------------------------------------------------------------------------- /cfgs/anet_tsn_pdvcl_gt.yml: -------------------------------------------------------------------------------- 1 | id: anet_tsn_pdvcl_gt 2 | base_cfg_path: cfgs/anet_c3d_pdvcl_gt.yml 3 | visual_feature_type: ['resnet', 'bn'] 4 | visual_feature_folder: ['data/anet/features/resnet_bn', 'data/anet/features/resnet_bn'] 5 | invalid_video_json: ['data/anet/features/resnet_bn_invalid_videos.json'] 6 | feature_dim: 3072 -------------------------------------------------------------------------------- /cfgs/anet_tsp_pdvc.yml: -------------------------------------------------------------------------------- 1 | id: anet_tsp_pdvc 2 | base_cfg_path: cfgs/anet_c3d_pdvc.yml 3 | visual_feature_type: ['tsp'] 4 | visual_feature_folder: ['data/anet/features/tsp'] 5 | invalid_video_json: [] 6 | feature_dim: 512 -------------------------------------------------------------------------------- /cfgs/anet_tsp_pdvc_gt.yml: -------------------------------------------------------------------------------- 1 | id: anet_tsp_pdvc_gt 2 | base_cfg_path: cfgs/anet_c3d_pdvc_gt.yml 3 | visual_feature_type: ['tsp'] 4 | visual_feature_folder: ['data/anet/features/tsp'] 5 | invalid_video_json: [] 6 | feature_dim: 512 7 | 
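Many of the YAML files under cfgs/ above are thin overrides: they set an id plus a few feature-related keys and point at a parent file through base_cfg_path (for example, anet_tsp_pdvc.yml builds on anet_c3d_pdvc.yml, which in turn builds on anet_c3d_pdvcl.yml). The sketch below only illustrates that override relationship, under the assumption that a child's keys win over its parent's; the repository's real merging presumably happens in opts.py together with misc.utils.update_values, so this loader is not the project's actual code.

import yaml

def load_cfg(path):
    # Load a config and, if it declares base_cfg_path, merge it on top of its parent.
    with open(path) as f:
        cfg = yaml.safe_load(f) or {}
    base_path = cfg.pop('base_cfg_path', None)
    if base_path:
        merged = load_cfg(base_path)   # recurse: a parent may itself declare a base
        merged.update(cfg)             # assumption: the child's values override the parent's
        return merged
    return cfg

if __name__ == '__main__':
    # e.g. anet_tsp_pdvc.yml -> anet_c3d_pdvc.yml -> anet_c3d_pdvcl.yml
    cfg = load_cfg('cfgs/anet_tsp_pdvc.yml')
    print(cfg['id'], cfg['visual_feature_type'], cfg['feature_dim'])  # anet_tsp_pdvc ['tsp'] 512

Read this way, anet_tsp_pdvc keeps the training and loss settings of the C3D variant and only swaps the feature type, folder and dimensionality, which matches what the files above show.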
-------------------------------------------------------------------------------- /cfgs/anet_tsp_pdvcl.yml: -------------------------------------------------------------------------------- 1 | id: anet_tsp_pdvcl 2 | base_cfg_path: cfgs/anet_c3d_pdvcl.yml 3 | visual_feature_type: ['tsp'] 4 | visual_feature_folder: ['data/anet/features/tsp'] 5 | invalid_video_json: [] 6 | feature_dim: 512 -------------------------------------------------------------------------------- /cfgs/yc2_tsn_pdvc.yml: -------------------------------------------------------------------------------- 1 | id: yc2_tsn_pdvc 2 | base_cfg_path: cfgs/yc2_tsn_pdvcl.yml 3 | 4 | caption_decoder_type: standard 5 | cap_nheads: 1 6 | cap_dec_n_points: 4 7 | cap_num_feature_levels: 4 8 | soft_attention: 1 9 | att_hid_size: 512 10 | 11 | ec_alpha: 1.0 -------------------------------------------------------------------------------- /cfgs/yc2_tsn_pdvc_gt.yml: -------------------------------------------------------------------------------- 1 | id: yc2_tsn_pdvc_gt 2 | base_cfg_path: cfgs/yc2_tsn_pdvcl_gt.yml 3 | 4 | caption_decoder_type: standard 5 | cap_nheads: 1 6 | cap_dec_n_points: 4 7 | cap_num_feature_levels: 4 8 | soft_attention: 1 9 | att_hid_size: 512 -------------------------------------------------------------------------------- /cfgs/yc2_tsn_pdvcl.yml: -------------------------------------------------------------------------------- 1 | id: yc2_tsn_pdvcl 2 | 3 | visual_feature_type: ['resnet', 'bn'] 4 | visual_feature_folder: ['data/yc2/features/resnet_bn/', 'data/yc2/features/resnet_bn/'] 5 | feature_dim: 3072 6 | invalid_video_json: [] 7 | train_caption_file: 'data/yc2/captiondata/yc2_train.json' 8 | val_caption_file: 'data/yc2/captiondata/yc2_val.json' 9 | gt_file_for_eval: ['data/yc2/captiondata/yc2_val.json'] 10 | gt_file_for_para_eval: ['data/yc2/captiondata/para/para_yc2_val.json'] 11 | dict_file: data/yc2/vocabulary_youcook2.json 12 | vocab_size: 1607 13 | 14 | train_proposal_type: gt 15 | train_proposal_sample_num: 30 16 | sample_method: nearest 17 | 18 | batch_size: 1 19 | lr: 0.00005 20 | learning_rate_decay_start: 8 21 | learning_rate_decay_every: 3 22 | learning_rate_decay_rate: 0.5 23 | weight_decay: 0.0001 24 | save_all_checkpoint: 0 25 | 26 | num_queries: 100 27 | dec_layers: 2 28 | enc_layers: 2 29 | transformer_ff_dim: 512 30 | transformer_dropout_prob: 0.1 31 | frame_embedding_num: 200 32 | caption_decoder_type: light 33 | att_hid_size: 0 34 | 35 | with_box_refine: 1 36 | 37 | fix_xcw: 1 38 | set_cost_caption: 0 39 | set_cost_giou: 4 40 | set_cost_bbox: 0 41 | set_cost_class: 2 42 | #cost_alpha: 0.5 43 | #cost_gamma: 1 44 | #focal_alpha: 0.5 45 | #focal_gamma: 1 46 | caption_loss_coef: 2 47 | giou_loss_coef: 4 48 | bbox_loss_coef: 0 49 | cls_loss_coef: 2 50 | count_loss_coef: 0.5 51 | max_eseq_length: 20 52 | lloss_cross_entropy: 0 53 | lloss_focal_loss: 0 54 | lloss_gau_mask: 1 -------------------------------------------------------------------------------- /cfgs/yc2_tsn_pdvcl_gt.yml: -------------------------------------------------------------------------------- 1 | id: yc2_tsn_pdvcl_gt 2 | 3 | visual_feature_type: ['resnet', 'bn'] 4 | visual_feature_folder: ['data/yc2/features/resnet_bn/', 'data/yc2/features/resnet_bn/'] 5 | feature_dim: 3072 6 | invalid_video_json: [] 7 | train_caption_file: 'data/yc2/captiondata/yc2_train.json' 8 | val_caption_file: 'data/yc2/captiondata/yc2_val.json' 9 | gt_file_for_eval: ['data/yc2/captiondata/yc2_val.json'] 10 | gt_file_for_para_eval: 
['data/yc2/captiondata/para/para_yc2_val.json'] 11 | dict_file: data/yc2/vocabulary_youcook2.json 12 | vocab_size: 1607 13 | 14 | train_proposal_type: gt 15 | gt_proposal_sample_num: 30 16 | sample_method: nearest 17 | 18 | batch_size: 1 19 | lr: 0.00005 20 | learning_rate_decay_start: 8 21 | learning_rate_decay_every: 3 22 | learning_rate_decay_rate: 0.5 23 | weight_decay: 0.0001 24 | save_all_checkpoint: 0 25 | 26 | num_queries: 100 27 | dec_layers: 2 28 | enc_layers: 2 29 | transformer_ff_dim: 512 30 | transformer_dropout_prob: 0.1 31 | frame_embedding_num: 200 32 | caption_decoder_type: light 33 | att_hid_size: 0 34 | 35 | #with_box_refine: 1 36 | 37 | fix_xcw: 1 38 | set_cost_caption: 0 39 | set_cost_giou: 4 40 | set_cost_bbox: 0.0001 41 | set_cost_class: 0 42 | #cost_alpha: 0.5 43 | #cost_gamma: 1 44 | #focal_alpha: 0.5 45 | #focal_gamma: 1 46 | caption_loss_coef: 2 47 | giou_loss_coef: 0 48 | bbox_loss_coef: 0 49 | cls_loss_coef: 0 50 | count_loss_coef: 0 51 | #max_eseq_length: 10 52 | #lloss_cross_entropy: 0 53 | #lloss_focal_loss: 0 54 | #lloss_gau_mask: 1 55 | 56 | #two_stage: 1 57 | transformer_input_type: gt_proposals -------------------------------------------------------------------------------- /data/anet/captiondata/para/readme.txt: -------------------------------------------------------------------------------- 1 | ANet-Entities val/test splits (re-split from ANet-caption val_1 and val_2 splits): 2 | https://dl.fbaipublicfiles.com/ActivityNet-Entities/ActivityNet-Entities/anet_entities_captions.tar.gz 3 | 4 | ANet-caption original splits: 5 | http://cs.stanford.edu/people/ranjaykrishna/densevid/captions.zip 6 | 7 | Experiment settings: 8 | Training: use GT segments/sentences in `train.json`, 9 | Validation: use GT segments in `anet_entities_val_1.json`, evaluate against references `anet_entities_val_1_para.json` and `anet_entities_val_2_para.json` 10 | Test: use GT segments in `anet_entities_test_1.json`, evaluate against references `anet_entities_test_1_para.json` and `anet_entities_test_2_para.json` 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /data/anet/features/convert_c3d_h5_to_npy.py: -------------------------------------------------------------------------------- 1 | import os 2 | import h5py 3 | import numpy as np 4 | 5 | in_path = 'sub_activitynet_v1-3.c3d.hdf5' 6 | out_path = 'c3d' 7 | 8 | if not os.path.exists(out_path): 9 | os.mkdir(out_path) 10 | 11 | d = h5py.File(in_path) 12 | for key in d.keys(): 13 | v_d = d[key]['c3d_features'][:].astype('float32') 14 | np.save(os.path.join(out_path, key+'.npy'), v_d) -------------------------------------------------------------------------------- /data/anet/features/convert_tsp_h5_to_npy.py: -------------------------------------------------------------------------------- 1 | import os 2 | import h5py 3 | import numpy as np 4 | 5 | in_paths = [ 6 | 'r2plus1d_34-tsp_on_activitynet-test_features.h5', 7 | 'r2plus1d_34-tsp_on_activitynet-train_features.h5', 8 | 'r2plus1d_34-tsp_on_activitynet-valid_features.h5' 9 | ] 10 | out_path = 'tsp' 11 | 12 | if not os.path.exists(out_path): 13 | os.mkdir(out_path) 14 | 15 | for in_path in in_paths: 16 | d = h5py.File(in_path) 17 | for key in d.keys(): 18 | v_d = d[key][:] 19 | np.save(os.path.join(out_path, key+'.npy'), v_d) 20 | -------------------------------------------------------------------------------- /data/anet/features/download_c3d_features.sh: 
-------------------------------------------------------------------------------- 1 | # Download the C3D feature files , refer to http://activity-net.org/challenges/2016/download.html#c3d to more details. 2 | wget http://ec2-52-25-205-214.us-west-2.compute.amazonaws.com/data/challenge16/features/c3d/activitynet_v1-3.part-00 3 | wget http://ec2-52-25-205-214.us-west-2.compute.amazonaws.com/data/challenge16/features/c3d/activitynet_v1-3.part-01 4 | wget http://ec2-52-25-205-214.us-west-2.compute.amazonaws.com/data/challenge16/features/c3d/activitynet_v1-3.part-02 5 | wget http://ec2-52-25-205-214.us-west-2.compute.amazonaws.com/data/challenge16/features/c3d/activitynet_v1-3.part-03 6 | wget http://ec2-52-25-205-214.us-west-2.compute.amazonaws.com/data/challenge16/features/c3d/activitynet_v1-3.part-04 7 | wget http://ec2-52-25-205-214.us-west-2.compute.amazonaws.com/data/challenge16/features/c3d/activitynet_v1-3.part-05 8 | cat activitynet_v1-3.part-* > c3d_features.zip && unzip c3d_features.zip 9 | python convert_c3d_h5_to_npy.py -------------------------------------------------------------------------------- /data/anet/features/download_i3d_vggish_features.sh: -------------------------------------------------------------------------------- 1 | # download i3d features (rgb+flow) and vggish features of ActivityNet Captions 2 | # Modified from https://github.com/v-iashin/BMT/blob/master/download_data.sh 3 | # Copyright (c) 2020 Vladimir Iashin 4 | 5 | 6 | # checking if wget is installed on a computer 7 | if ! command -v wget &> /dev/null 8 | then 9 | echo "wget: command not found" 10 | echo "" 11 | echo "wget command could not be found on your computer. Please, install it first." 12 | echo "If you cannot/dontwantto install wget, you may try to download the features manually." 13 | echo "You may find the links and correct paths in this file." 
14 | echo "Make sure to check the md5 sums after manual download:" 15 | echo "./data/i3d_25fps_stack64step64_2stream_npy.zip d7266e440f8c616acbc0d8aaa4a336dc" 16 | echo "./data/vggish_npy.zip 9a654ad785e801aceb70af2a5e1cffbe" 17 | echo "./.vector_cache/glove.840B.300d.zip 2ffafcc9f9ae46fc8c95f32372976137" 18 | exit 19 | fi 20 | 21 | 22 | echo "Downloading i3d features" 23 | wget https://a3s.fi/swift/v1/AUTH_a235c0f452d648828f745589cde1219a/bmt/i3d_25fps_stack64step64_2stream_npy.zip -q --show-progress 24 | echo "Downloading vggish features" 25 | wget https://a3s.fi/swift/v1/AUTH_a235c0f452d648828f745589cde1219a/bmt/vggish_npy.zip -q --show-progress 26 | 27 | #echo "Downloading GloVe embeddings" 28 | #mkdir .vector_cache 29 | #cd .vector_cache 30 | #wget https://a3s.fi/swift/v1/AUTH_a235c0f452d648828f745589cde1219a/bmt/glove.840B.300d.zip -q --show-progress 31 | #cd ../ 32 | 33 | echo "Checking for correctness of the downloaded files" 34 | 35 | i3d_md5=($(md5sum ./data/i3d_25fps_stack64step64_2stream_npy.zip)) 36 | if [ "$i3d_md5" == "d7266e440f8c616acbc0d8aaa4a336dc" ]; then 37 | echo "OK: i3d features" 38 | else 39 | echo "ERROR: .zip file with i3d features is corrupted" 40 | exit 1 41 | fi 42 | 43 | vggish_md5=($(md5sum ./data/vggish_npy.zip)) 44 | if [ "$vggish_md5" == "9a654ad785e801aceb70af2a5e1cffbe" ]; then 45 | echo "OK: vggish features" 46 | else 47 | echo "ERROR: .zip file with vggish features is corrupted" 48 | exit 1 49 | fi 50 | 51 | glove_md5=($(md5sum ./.vector_cache/glove.840B.300d.zip)) 52 | if [ "$glove_md5" == "2ffafcc9f9ae46fc8c95f32372976137" ]; then 53 | echo "OK: glove embeddings" 54 | else 55 | echo "ERROR: .zip file with glove embeddings is corrupted" 56 | exit 1 57 | fi 58 | 59 | echo "Unpacking i3d (~1 min)" 60 | 61 | unzip -q i3d_25fps_stack64step64_2stream_npy.zip 62 | echo "Unpacking vggish features" 63 | unzip -q vggish_npy.zip 64 | 65 | echo "Done" -------------------------------------------------------------------------------- /data/anet/features/download_tsn_features.sh: -------------------------------------------------------------------------------- 1 | # Download TSN feature files, refer to https://github.com/salesforce/densecap#data-preparation for more details about feature extraction. 
2 | wget http://youcook2.eecs.umich.edu/static/dat/anet_densecap/training_feat_anet.tar.gz 3 | wget http://youcook2.eecs.umich.edu/static/dat/anet_densecap/validation_feat_anet.tar.gz 4 | wget http://youcook2.eecs.umich.edu/static/dat/anet_densecap/testing_feat_anet.tar.gz 5 | 6 | tar xvzf training_feat_anet.tar.gz 7 | tar xvzf validation_feat_anet.tar.gz 8 | tar xvzf testing_feat_anet.tar.gz 9 | mkdir resnet_bn 10 | mv testing/* resnet_bn 11 | mv training/* resnet_bn 12 | mv validation/* resnet_bn 13 | -------------------------------------------------------------------------------- /data/anet/features/download_tsp_features.sh: -------------------------------------------------------------------------------- 1 | # TSP features from https://github.com/HumamAlwassel/TSP 2 | # download the following files and reformat them into data/features/tsp/VIDEO_ID.npy where VIDEO_ID starts with 'v_' 3 | wget https://github.com/HumamAlwassel/TSP/releases/download/activitynet_features/r2plus1d_34-tsp_on_activitynet-train_features.h5 4 | wget https://github.com/HumamAlwassel/TSP/releases/download/activitynet_features/r2plus1d_34-tsp_on_activitynet-valid_features.h5 5 | wget https://github.com/HumamAlwassel/TSP/releases/download/activitynet_features/r2plus1d_34-tsp_on_activitynet-test_features.h5 6 | python convert_tsp_h5_to_npy.py 7 | -------------------------------------------------------------------------------- /data/anet/features/resnet_bn_invalid_videos.json: -------------------------------------------------------------------------------- 1 | ["v_iVVatZsgnGo", "v_0dkIbKXXFzI", "v_xeOHoiH-dmo", "v_j73Wh1olDsA", "v_IeBCgi4xPIE"] -------------------------------------------------------------------------------- /data/yc2/captiondata/para/convert_to_para.py: -------------------------------------------------------------------------------- 1 | import json 2 | split='val' 3 | p='yc2_{}.json'.format(split) 4 | out_p = 'para_yc2_{}.json'.format(split) 5 | 6 | d = json.load(open(p)) 7 | out = {} 8 | for k,v in d.items(): 9 | para = '. '.join(v['sentences']) 10 | out[k] = para 11 | json.dump(out, open(out_p, 'w')) 12 | -------------------------------------------------------------------------------- /data/yc2/features/download_yc2_tsn_features.sh: -------------------------------------------------------------------------------- 1 | http://youcook2.eecs.umich.edu/static/dat/yc2_densecap/training_feat_yc2.tar.gz 2 | 3 | # Download TSN feature files for the youcook2 dataset, refer to https://github.com/salesforce/densecap#data-preparation for more details about feature extraction. 
4 | wget http://youcook2.eecs.umich.edu/static/dat/yc2_densecap/training_feat_yc2.tar.gz 5 | wget http://youcook2.eecs.umich.edu/static/dat/yc2_densecap/validation_feat_yc2.tar.gz 6 | wget http://youcook2.eecs.umich.edu/static/dat/yc2_densecap/testing_feat_yc2.tar.gz 7 | 8 | tar xvzf training_feat_yc2.tar.gz 9 | tar xvzf validation_feat_yc2.tar.gz 10 | tar xvzf testing_feat_yc2.tar.gz 11 | mkdir resnet_bn 12 | mv testing/* resnet_bn 13 | mv training/* resnet_bn 14 | mv validation/* resnet_bn 15 | -------------------------------------------------------------------------------- /densevid_eval3/eval_dvc.py: -------------------------------------------------------------------------------- 1 | from densevid_eval3.evaluate2018 import main as eval2018 2 | from densevid_eval3.evaluate2021 import main as eval2021 3 | 4 | def eval_dvc(json_path, reference, no_lang_eval=False, topN=1000, version='2018'): 5 | args = type('args', (object,), {})() 6 | args.submission = json_path 7 | args.max_proposals_per_video = topN 8 | args.tious = [0.3,0.5,0.7,0.9] 9 | args.verbose = False 10 | args.no_lang_eval = no_lang_eval 11 | args.references = reference 12 | eval_func = eval2018 if version=='2018' else eval2021 13 | score = eval_func(args) 14 | return score 15 | 16 | if __name__ == '__main__': 17 | p = '../save/pretrained_models/anet_c3d_pdvc/2021-08-21-23-40-05_debug_2021-08-21_20-46-20_epoch8_num4917_score0.json.top3.json' 18 | ref = ['../data/anet/captiondata/val_1.json', '../data/anet/captiondata/val_2.json'] 19 | score = eval_dvc(p, ref, no_lang_eval=False, version='2018') 20 | print(score) -------------------------------------------------------------------------------- /densevid_eval3/eval_para.py: -------------------------------------------------------------------------------- 1 | from densevid_eval3.para_evaluate import ANETcaptions 2 | 3 | def eval_para(prediction, referneces, verbose=False): 4 | args = type('args', (object,), {})() 5 | args.submission = prediction 6 | args.references = referneces 7 | args.all_scorer = True 8 | args.verbose = verbose 9 | 10 | evaluator = ANETcaptions(ground_truth_filenames=args.references, 11 | prediction_filename=args.submission, 12 | verbose=args.verbose, 13 | all_scorer=args.all_scorer) 14 | evaluator.evaluate() 15 | output = {} 16 | 17 | for metric, score in evaluator.scores.items(): 18 | # print ('| %s: %2.4f'%(metric, 100*score)) 19 | output['para_'+metric] = score 20 | return output -------------------------------------------------------------------------------- /densevid_eval3/eval_soda.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | from os.path import dirname, abspath 4 | 5 | pdvc_dir = dirname(dirname(abspath(__file__))) 6 | sys.path.append(pdvc_dir) 7 | sys.path.append(os.path.join(pdvc_dir, 'densevid_eval3/SODA')) 8 | 9 | import numpy as np 10 | from densevid_eval3.SODA.soda import SODA 11 | from densevid_eval3.SODA.dataset import ANETCaptions 12 | from densevid_eval3.eval_para import eval_para 13 | 14 | def eval_tool(prediction, referneces=None, metric='Meteor', soda_type='c', verbose=False): 15 | 16 | args = type('args', (object,), {})() 17 | args.prediction = prediction 18 | args.references = referneces 19 | args.metric = metric 20 | args.soda_type = soda_type 21 | args.tious = [0.3, 0.5, 0.7, 0.9] 22 | args.verbose = verbose 23 | args.multi_reference = False 24 | 25 | data = ANETCaptions.from_load_files(args.references, 26 | args.prediction, 27 | multi_reference=args.multi_reference, 
28 | verbose=args.verbose, 29 | ) 30 | data.preprocess() 31 | if args.soda_type == 'a': 32 | tious = args.tious 33 | else: 34 | tious = None 35 | evaluator = SODA(data, 36 | soda_type=args.soda_type, 37 | tious=tious, 38 | scorer=args.metric, 39 | verbose=args.verbose 40 | ) 41 | result = evaluator.evaluate() 42 | 43 | return result 44 | 45 | def eval_soda(p, ref_list,verbose=False): 46 | score_sum = [] 47 | for ref in ref_list: 48 | r = eval_tool(prediction=p, referneces=[ref], verbose=verbose, soda_type='c') 49 | score_sum.append(r['Meteor']) 50 | soda_avg = np.mean(score_sum, axis=0) #[avg_pre, avg_rec, avg_f1] 51 | soda_c_avg = soda_avg[-1] 52 | results = {'soda_c': soda_c_avg} 53 | return results 54 | 55 | 56 | if __name__ == '__main__': 57 | 58 | p_new = '../save/old/cfgs--base_config_v2_0427--anet_c3d_pdvc_seed358/2021-08-21-21-47-13_debug_2021-08-21_20-46-20_epoch8_num4917_score0_top1000.json' 59 | p_vitr= '../save/old/cfgs--base_config_v2_0427--anet_c3d_pdvc_seed358/2021-08-21-21-47-20_cfgs--base_config_v2_0427--anet_c3d_pdvc_seed358_epoch8_num4917_score0_top1000.json.tmp' 60 | 61 | for p in [p_new, p_vitr]: 62 | print('\n') 63 | print(p) 64 | ref_list = ['data/anet/captiondata/val_1.json', 'data/anet/captiondata/val_2.json'] 65 | score=eval_soda(p, ref_list, verbose=False) 66 | print(score) 67 | para_score = get_para_score(p, referneces=['../data/anet/captiondata/para/anet_entities_val_1_para.json', '../data/anet/captiondata/para/anet_entities_val_2_para.json']) 68 | print(para_score) 69 | 70 | 71 | # metric = ['Meteor', 'Cider'] 72 | # score_type = ['standard_score', 'precision_recall', 'paragraph_score'] 73 | # dvc_score = soda3.eval_tool(predictions=[p], referneces=ref_list, metric=metric,score_type=score_type)[0] 74 | -------------------------------------------------------------------------------- /eval.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import argparse 6 | import json 7 | import os 8 | import sys 9 | import torch 10 | import numpy as np 11 | import time 12 | from os.path import dirname, abspath 13 | 14 | pdvc_dir = dirname(abspath(__file__)) 15 | sys.path.insert(0, pdvc_dir) 16 | sys.path.insert(0, os.path.join(pdvc_dir, 'densevid_eval3')) 17 | sys.path.insert(0, os.path.join(pdvc_dir, 'densevid_eval3/SODA')) 18 | # print(sys.path) 19 | 20 | from eval_utils import evaluate 21 | from pdvc.pdvc import build 22 | from misc.utils import create_logger 23 | from data.video_dataset import PropSeqDataset, collate_fn 24 | from torch.utils.data import DataLoader 25 | from os.path import basename 26 | import pandas as pd 27 | 28 | def create_fake_test_caption_file(metadata_csv_path): 29 | out = {} 30 | df = pd.read_csv(metadata_csv_path) 31 | for i, row in df.iterrows(): 32 | out[basename(row['filename']).split('.')[0]] = {'duration': row['video-duration'], "timestamps": [[0, 0.5]], "sentences":["None"]} 33 | fake_test_json = '.fake_test_json.tmp' 34 | json.dump(out, open(fake_test_json, 'w')) 35 | return fake_test_json 36 | 37 | def main(opt): 38 | folder_path = os.path.join(opt.eval_save_dir, opt.eval_folder) 39 | if opt.eval_mode == 'test': 40 | if not os.path.exists(folder_path): 41 | os.makedirs(folder_path) 42 | logger = create_logger(folder_path, 'val.log') 43 | if opt.eval_model_path: 44 | model_path = opt.eval_model_path 45 | infos_path = os.path.join('/'.join(opt.eval_model_path.split('/')[:-1]), 
'info.json') 46 | else: 47 | model_path = os.path.join(folder_path, 'model-best.pth') 48 | infos_path = os.path.join(folder_path, 'info.json') 49 | 50 | logger.info(vars(opt)) 51 | 52 | with open(infos_path, 'rb') as f: 53 | logger.info('load info from {}'.format(infos_path)) 54 | old_opt = json.load(f)['best']['opt'] 55 | 56 | for k, v in old_opt.items(): 57 | if k[:4] != 'eval': 58 | vars(opt).update({k: v}) 59 | 60 | opt.transformer_input_type = opt.eval_transformer_input_type 61 | 62 | if not torch.cuda.is_available(): 63 | opt.nthreads = 0 64 | # Create the Data Loader instance 65 | 66 | if opt.eval_mode == 'test': 67 | opt.eval_caption_file = create_fake_test_caption_file(opt.test_video_meta_data_csv_path) 68 | opt.visual_feature_folder = opt.test_video_feature_folder 69 | 70 | val_dataset = PropSeqDataset(opt.eval_caption_file, 71 | opt.visual_feature_folder, 72 | opt.dict_file, False, opt.eval_proposal_type, 73 | opt) 74 | loader = DataLoader(val_dataset, batch_size=opt.batch_size_for_eval, 75 | shuffle=False, num_workers=opt.nthreads, collate_fn=collate_fn) 76 | 77 | 78 | model, criterion, postprocessors = build(opt) 79 | model.translator = val_dataset.translator 80 | 81 | 82 | 83 | while not os.path.exists(model_path): 84 | raise AssertionError('File {} does not exist'.format(model_path)) 85 | 86 | logger.debug('Loading model from {}'.format(model_path)) 87 | loaded_pth = torch.load(model_path, map_location=opt.eval_device) 88 | epoch = loaded_pth['epoch'] 89 | 90 | # loaded_pth = transfer(model, loaded_pth, model_path+'.transfer.pth') 91 | model.load_state_dict(loaded_pth['model'], strict=True) 92 | model.eval() 93 | 94 | model.to(opt.eval_device) 95 | 96 | if opt.eval_mode == 'test': 97 | out_json_path = os.path.join(folder_path, 'dvc_results.json') 98 | evaluate(model, criterion, postprocessors, loader, out_json_path, 99 | logger, alpha=opt.ec_alpha, dvc_eval_version=opt.eval_tool_version, device=opt.eval_device, debug=False, skip_lang_eval=True) 100 | 101 | 102 | else: 103 | out_json_path = os.path.join(folder_path, '{}_epoch{}_num{}_alpha{}.json'.format( 104 | time.strftime("%Y-%m-%d-%H-%M-%S_", time.localtime()) + str(opt.id), epoch, len(loader.dataset), 105 | opt.ec_alpha)) 106 | caption_scores, eval_loss = evaluate(model, criterion, postprocessors, loader, out_json_path, 107 | logger, alpha=opt.ec_alpha, dvc_eval_version=opt.eval_tool_version, device=opt.eval_device, debug=False, skip_lang_eval=False) 108 | avg_eval_score = {key: np.array(value).mean() for key, value in caption_scores.items() if key !='tiou'} 109 | avg_eval_score2 = {key: np.array(value).mean() * 4917 / len(loader.dataset) for key, value in caption_scores.items() if key != 'tiou'} 110 | 111 | logger.info( 112 | '\nValidation result based on all 4917 val videos:\n {}\n avg_score:\n{}'.format( 113 | caption_scores.items(), 114 | avg_eval_score)) 115 | 116 | logger.info( 117 | '\nValidation result based on {} available val videos:\n avg_score:\n{}'.format(len(loader.dataset), 118 | avg_eval_score2)) 119 | 120 | logger.info('saving reults json to {}'.format(out_json_path)) 121 | 122 | if __name__ == '__main__': 123 | parser = argparse.ArgumentParser() 124 | parser.add_argument('--eval_save_dir', type=str, default='save') 125 | parser.add_argument('--eval_mode', type=str, default='eval', choices=['eval', 'test']) 126 | parser.add_argument('--test_video_feature_folder', type=str, nargs='+', default=None) 127 | parser.add_argument('--test_video_meta_data_csv_path', type=str, default=None) 128 | 
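    # Illustrative invocation (hypothetical values, not taken from the repository's documentation):
    #   python eval.py --eval_folder anet_tsp_pdvc --eval_transformer_input_type queries --gpu_id 0
    # --eval_folder names a directory under --eval_save_dir (default 'save') that holds
    # model-best.pth and info.json; alternatively, point --eval_model_path at a checkpoint directly.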
parser.add_argument('--eval_folder', type=str, required=True) 129 | parser.add_argument('--eval_model_path', type=str, default='') 130 | parser.add_argument('--eval_tool_version', type=str, default='2018', choices=['2018', '2021']) 131 | parser.add_argument('--eval_caption_file', type=str, default='data/anet/captiondata/val_1.json') 132 | parser.add_argument('--eval_proposal_type', type=str, default='gt') 133 | parser.add_argument('--eval_transformer_input_type', type=str, default='queries', choices=['gt_proposals', 'queries']) 134 | parser.add_argument('--gpu_id', type=str, nargs='+', default=['0']) 135 | parser.add_argument('--eval_device', type=str, default='cuda') 136 | opt = parser.parse_args() 137 | 138 | os.environ["CUDA_VISIBLE_DEVICES"] = ",".join([str(i) for i in opt.gpu_id]) 139 | os.environ['KMP_DUPLICATE_LIB_OK'] = 'True' 140 | if True: 141 | torch.backends.cudnn.enabled = False 142 | main(opt) 143 | -------------------------------------------------------------------------------- /misc/build_vocab.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | import json 3 | 4 | # file_path_list = ["data/captiondata/train_modified.json", "data/captiondata/val_1.json", "data/captiondata/val_2.json"] 5 | file_path_list = ["data/captiondata/yc2/yc2_train.json", "data/captiondata/yc2/yc2_val.json"] 6 | 7 | count_threshold = 2 # 4 for anet, 2 for youcook2 8 | # output_path = './data/vocabulary_activitynet.json' 9 | output_path = './data/vocabulary_youcook2.json' 10 | 11 | mark = [',', ':', '!', '_', ';', '-', '.', '?', '/', '"', '\\n', '\\'] 12 | 13 | count_vocal = {} 14 | 15 | for file_path in file_path_list: 16 | data = json.load(open(file_path)) 17 | video_ids = data.keys() 18 | print('video num of ' + file_path.split('/')[-1], len(video_ids)) 19 | for video_id in video_ids: 20 | sentences = data[video_id]["sentences"] 21 | for sentence in sentences: 22 | for m in mark: 23 | if m in sentence: 24 | sentence = sentence.replace(m, " ") 25 | sentence = sentence.replace(" ", " ") 26 | sentence = sentence.replace(" ", " ") 27 | sentence = sentence.replace(" ", " ") 28 | 29 | sentence = sentence.lstrip() 30 | sentence = sentence.rstrip() 31 | sentence = sentence.lower() 32 | sentence = sentence.split(" ") 33 | length = len(sentence) 34 | 35 | # print(sentence) 36 | for word in sentence: 37 | # print(type(word)) 38 | for m in word: 39 | if m == ' ': 40 | print('warning !') 41 | word = word.replace(m, '') 42 | if word == '': 43 | print('warning !') 44 | pass 45 | count_vocal[word] = count_vocal.get(word, 0) + 1 46 | 47 | print("total word:", sum(count_vocal.values())) 48 | count_vocal[''] = 1e10 49 | count_vocal[''] = 1e10 50 | vocab = [word for word, n in count_vocal.items() if n >= count_threshold] 51 | bad_word = [word for word, n in count_vocal.items() if n < count_threshold] 52 | bad_count = sum(count_vocal[word] for word in bad_word) 53 | 54 | vocab.append('UNK') 55 | print("number of vocab:", len(vocab)) 56 | print("number of bad word:", len(bad_word)) 57 | print("number of unks:", bad_count) 58 | 59 | itow = {i + 1: w for i, w in enumerate(vocab)} 60 | wtoi = {w: i + 1 for i, w in enumerate(vocab)} 61 | print(len(itow)) 62 | print(len(wtoi)) 63 | 64 | json.dump({'ix_to_word': itow, 65 | 'word_to_ix': wtoi}, open(output_path, 'w')) 66 | print("saving vocabulary file to {}".format(output_path)) -------------------------------------------------------------------------------- /misc/detr_utils/box_ops.py: 
-------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | """ 3 | Utilities for bounding box manipulation and GIoU. 4 | """ 5 | import torch 6 | from torchvision.ops.boxes import box_area 7 | 8 | def box_cl_to_xy(x): 9 | c, l = x.unbind(-1) 10 | b = [c - 0.5 * l, c + 0.5 * l] 11 | return torch.stack(b, dim=-1) 12 | 13 | def box_xy_to_cl(x): 14 | x0, x1 = x.unbind(-1) 15 | b = [(x0 + x1) / 2, (x1 - x0)] 16 | return torch.stack(b, dim=-1) 17 | 18 | # modified from torchvision to also return the union 19 | def box_iou(boxes1, boxes2): 20 | area1 = boxes1[:, 1] - boxes1[:, 0] 21 | area2 = boxes2[:, 1] - boxes2[:, 0] 22 | lt = torch.max(boxes1[:, None, 0], boxes2[:, 0]) # [N,M,2] 23 | rb = torch.min(boxes1[:, None, 1], boxes2[:, 1]) # [N,M,2] 24 | inter = (rb - lt).clamp(min=0) # [N,M,2] 25 | union = area1[:, None] + area2 - inter 26 | iou = inter / (union + 1e-5) 27 | return iou, union 28 | 29 | 30 | def generalized_box_iou(boxes1, boxes2): 31 | """ 32 | Generalized IoU from https://giou.stanford.edu/ 33 | 34 | The boxes should be in [x0, y0, x1, y1] format 35 | 36 | Returns a [N, M] pairwise matrix, where N = len(boxes1) 37 | and M = len(boxes2) 38 | """ 39 | # degenerate boxes gives inf / nan results 40 | # so do an early check 41 | assert (boxes1[:, 1:] >= boxes1[:, :1]).all() 42 | assert (boxes2[:, 1:] >= boxes2[:, :1]).all() 43 | iou, union = box_iou(boxes1, boxes2) 44 | lt = torch.min(boxes1[:, None, 0], boxes2[:, 0]) 45 | rb = torch.max(boxes1[:, None, 1], boxes2[:, 1]) 46 | area = (rb - lt).clamp(min=0) # [N,M,2] 47 | giou = iou - (area - union) / (area + 1e-5) 48 | return giou -------------------------------------------------------------------------------- /misc/utils.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | # from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | 6 | import time 7 | import torch 8 | import numpy as np 9 | import glob 10 | import shutil 11 | import os 12 | import colorlog 13 | import random 14 | import six 15 | from six.moves import cPickle 16 | import matplotlib as mpl 17 | 18 | mpl.use('Agg') 19 | import matplotlib.pyplot as plt 20 | 21 | 22 | def match_name_keywords(n, name_keywords): 23 | out = False 24 | for b in name_keywords: 25 | if b in n: 26 | out = True 27 | break 28 | return out 29 | 30 | 31 | def decide_two_stage(transformer_input_type, dt, criterion): 32 | if transformer_input_type == 'gt_proposals': 33 | two_stage = True 34 | proposals = dt['gt_boxes'] 35 | proposals_mask = dt['gt_boxes_mask'] 36 | criterion.matcher.cost_caption = 0 37 | for q_k in ['loss_length', 'loss_ce', 'loss_bbox', 'loss_giou']: 38 | for key in criterion.weight_dict.keys(): 39 | if q_k in key: 40 | criterion.weight_dict[key] = 0 41 | disable_iterative_refine = True 42 | elif transformer_input_type == 'queries': # 43 | two_stage = False 44 | proposals = None 45 | proposals_mask = None 46 | disable_iterative_refine = False 47 | else: 48 | raise ValueError('Wrong value of transformer_input_type, got {}'.format(transformer_input_type)) 49 | return two_stage, disable_iterative_refine, proposals, proposals_mask 50 | 51 | 52 | def pickle_load(f): 53 | """ Load a pickle. 
54 | Parameters 55 | ---------- 56 | f: file-like object 57 | """ 58 | if six.PY3: 59 | return cPickle.load(f, encoding='latin-1') 60 | else: 61 | return cPickle.load(f) 62 | 63 | 64 | def pickle_dump(obj, f): 65 | """ Dump a pickle. 66 | Parameters 67 | ---------- 68 | obj: pickled object 69 | f: file-like object 70 | """ 71 | if six.PY3: 72 | return cPickle.dump(obj, f, protocol=2) 73 | else: 74 | return cPickle.dump(obj, f) 75 | 76 | 77 | def set_seed(seed): 78 | random.seed(seed) 79 | np.random.seed(seed) 80 | torch.manual_seed(seed) 81 | torch.cuda.manual_seed(seed) 82 | torch.cuda.manual_seed_all(seed) 83 | torch.backends.cudnn.deterministic = True 84 | torch.backends.cudnn.benchmark = False 85 | 86 | 87 | def update_values(dict_from, dict_to): 88 | for key, value in dict_from.items(): 89 | if key not in dict_to.keys(): 90 | raise AssertionError('key mismatching: {}'.format(key)) 91 | if isinstance(value, dict): 92 | update_values(dict_from[key], dict_to[key]) 93 | elif value is not None: 94 | dict_to[key] = dict_from[key] 95 | 96 | 97 | def print_opt(opt, model, logger): 98 | print_alert_message('All args:', logger) 99 | for key, item in opt._get_kwargs(): 100 | logger.info('{} = {}'.format(key, item)) 101 | print_alert_message('Model structure:', logger) 102 | logger.info(model) 103 | 104 | 105 | def build_floder(opt): 106 | if opt.start_from: 107 | print('Start training from id:{}'.format(opt.start_from)) 108 | save_folder = os.path.join(opt.save_dir, opt.start_from) 109 | assert os.path.exists(save_folder) 110 | else: 111 | if not os.path.exists(opt.save_dir): 112 | os.mkdir(opt.save_dir) 113 | save_folder = os.path.join(opt.save_dir, opt.id) 114 | if os.path.exists(save_folder): 115 | # wait_flag = input('Warning! ID {} already exists, rename it? 
(Y/N) : '.format(opt.id)) 116 | wait_flag = 'Y' 117 | if wait_flag in ['Y', 'y']: 118 | opt.id = opt.id + '_v_{}'.format(time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())) 119 | save_folder = os.path.join(opt.save_dir, opt.id) 120 | print('Rename opt.id as "{}".'.format(opt.id)) 121 | else: 122 | raise AssertionError('ID already exists, folder {} exists'.format(save_folder)) 123 | print('Results folder "{}" does not exist, creating folder...'.format(save_folder)) 124 | os.mkdir(save_folder) 125 | os.mkdir(os.path.join(save_folder, 'prediction')) 126 | return save_folder 127 | 128 | 129 | def backup_envir(save_folder): 130 | backup_folders = ['cfgs', 'misc', 'pdvc'] 131 | backup_files = glob.glob('./*.py') 132 | for folder in backup_folders: 133 | shutil.copytree(folder, os.path.join(save_folder, 'backup', folder)) 134 | for file in backup_files: 135 | shutil.copyfile(file, os.path.join(save_folder, 'backup', file)) 136 | 137 | 138 | def create_logger(folder, filename): 139 | log_colors = { 140 | 'DEBUG': 'blue', 141 | 'INFO': 'white', 142 | 'WARNING': 'green', 143 | 'ERROR': 'red', 144 | 'CRITICAL': 'yellow', 145 | } 146 | 147 | import logging 148 | logger = logging.getLogger('DVC') 149 | # %(filename)s$RESET:%(lineno)d 150 | # LOGFORMAT = "%(log_color)s%(asctime)s [%(log_color)s%(filename)s:%(lineno)d] | %(log_color)s%(message)s%(reset)s |" 151 | LOGFORMAT = "" 152 | LOG_LEVEL = logging.DEBUG 153 | logging.root.setLevel(LOG_LEVEL) 154 | stream = logging.StreamHandler() 155 | stream.setLevel(LOG_LEVEL) 156 | stream.setFormatter(colorlog.ColoredFormatter(LOGFORMAT, datefmt='%d %H:%M', log_colors=log_colors)) 157 | 158 | # print to log file 159 | hdlr = logging.FileHandler(os.path.join(folder, filename)) 160 | hdlr.setLevel(LOG_LEVEL) 161 | # hdlr.setFormatter(logging.Formatter("[%(asctime)s] %(message)s")) 162 | hdlr.setFormatter(logging.Formatter("%(message)s")) 163 | logger.addHandler(hdlr) 164 | logger.addHandler(stream) 165 | return logger 166 | 167 | 168 | def print_alert_message(str, logger=None): 169 | msg = '*' * 20 + ' ' + str + ' ' + '*' * (58 - len(str)) 170 | if logger: 171 | logger.info('\n\n' + msg) 172 | else: 173 | print(msg) 174 | 175 | 176 | def set_lr(optimizer, lr): 177 | for group in optimizer.param_groups: 178 | group['lr'] = lr 179 | 180 | 181 | def clip_gradient(optimizer, grad_clip): 182 | for group in optimizer.param_groups: 183 | for i, param in enumerate(group['params']): 184 | if param.grad is not None: 185 | param.grad.data.clamp_(-grad_clip, grad_clip) 186 | 187 | 188 | if __name__ == '__main__': 189 | # import opts 190 | # 191 | # info = {'opt': vars(opts.parse_opts()), 192 | # 'loss': {'tap_loss': 0, 'tap_reg_loss': 0, 'tap_conf_loss': 0, 'lm_loss': 0}} 193 | # record_this_run_to_csv(info, 'save/results_all_runs.csv') 194 | 195 | logger = create_logger('./', 'mylogger.log') 196 | logger.info('debug') 197 | logger.info('test2') 198 | -------------------------------------------------------------------------------- /pdvc.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ttengwang/PDVC/0b025c84f42fe27da51c312e8871c4b19628a04c/pdvc.jpg -------------------------------------------------------------------------------- /pdvc/CaptioningHead/LSTM.py: -------------------------------------------------------------------------------- 1 | # This file contains ShowAttendTell and AllImg model 2 | 3 | # ShowAttendTell is from Show, Attend and Tell: Neural Image Caption Generation with Visual Attention 4 | 
# https://arxiv.org/abs/1502.03044 5 | 6 | # AllImg is a model where 7 | # img feature is concatenated with word embedding at every time step as the input of lstm 8 | from __future__ import absolute_import 9 | from __future__ import division 10 | from __future__ import print_function 11 | 12 | import pdb 13 | 14 | import numpy 15 | import torch 16 | import torch.nn as nn 17 | import torch.nn.functional as F 18 | from torch.autograd import * 19 | 20 | class Captioner(nn.Module): 21 | def __init__(self, opt): 22 | super(Captioner, self).__init__() 23 | self.opt = opt 24 | 25 | self.vocab_size = opt.vocab_size 26 | self.input_encoding_size = opt.input_encoding_size 27 | self.rnn_size = opt.rnn_size 28 | self.num_layers = opt.num_layers 29 | self.drop_prob_lm = opt.drop_prob 30 | self.max_caption_len = opt.max_caption_len 31 | 32 | self.ss_prob = 0.0 # Schedule sampling probability 33 | self.embed = nn.Embedding(self.vocab_size + 1, self.input_encoding_size) 34 | 35 | self.logit = nn.Linear(self.rnn_size, self.vocab_size + 1) 36 | self.dropout = nn.Dropout(self.drop_prob_lm) 37 | 38 | self.init_weights() 39 | 40 | def init_weights(self): 41 | initrange = 0.1 42 | self.embed.weight.data.uniform_(-initrange, initrange) 43 | self.logit.bias.data.fill_(0) 44 | self.logit.weight.data.uniform_(-initrange, initrange) 45 | 46 | def init_hidden(self, batch_size): 47 | weight = next(self.parameters()).data 48 | return (weight.new(self.num_layers, batch_size, self.rnn_size).zero_(), 49 | weight.new(self.num_layers, batch_size, self.rnn_size).zero_()) # (h0, c0) 50 | 51 | def build_loss(self, input, target, mask): 52 | one_hot = torch.nn.functional.one_hot(target, self.opt.vocab_size+1) 53 | max_len = input.shape[1] 54 | output = - (one_hot[:, :max_len] * input * mask[:, :max_len, None]).sum(2).sum(1) / (mask.sum(1) + 1e-6) 55 | return output 56 | 57 | def forward(self, event, clip, clip_mask, seq): 58 | batch_size = clip.shape[0] 59 | 60 | state = self.init_hidden(batch_size) 61 | outputs = [] 62 | seq = seq.long() 63 | 64 | for i in range(seq.size(1) - 1): 65 | if self.training and i >= 1 and self.ss_prob > 0.0: # otherwiste no need to sample 66 | sample_prob = clip.data.new(batch_size).uniform_(0, 1) 67 | sample_mask = sample_prob < self.ss_prob 68 | if sample_mask.sum() == 0: 69 | it = seq[:, i].clone() 70 | else: 71 | sample_ind = sample_mask.nonzero().view(-1) 72 | it = seq[:, i].data.clone() 73 | prob_prev = torch.exp(outputs[-1].data) # fetch prev distribution: shape Nx(M+1) 74 | it.index_copy_(0, sample_ind, torch.multinomial(prob_prev, 1).view(-1).index_select(0, sample_ind)) 75 | it = Variable(it, requires_grad=False) 76 | else: 77 | it = seq[:, i].clone() 78 | # break if all the sequences end 79 | if i >= 1 and seq[:, i].data.sum() == 0: 80 | break 81 | 82 | output, state = self.get_logprobs_state(it, event, clip, clip_mask, state) 83 | outputs.append(output) 84 | 85 | return torch.cat([_.unsqueeze(1) for _ in outputs], 1) 86 | 87 | 88 | def get_logprobs_state(self, it, event , clip, clip_mask, state): 89 | xt = self.embed(it) 90 | output, state = self.core(xt, event , clip, clip_mask, state) 91 | logprobs = F.log_softmax(self.logit(self.dropout(output)), dim=1) 92 | return logprobs, state 93 | 94 | def sample(self, event , clip, clip_mask, opt={}): 95 | 96 | sample_max = opt.get('sample_max', 1) 97 | beam_size = opt.get('beam_size', 1) 98 | temperature = opt.get('temperature', 1.0) 99 | 100 | batch_size = clip.shape[0] 101 | 102 | state = self.init_hidden(batch_size) 103 | 104 | seq = [] 105 
| seqLogprobs = [] 106 | 107 | for t in range(self.max_caption_len + 1): 108 | if t == 0: # input 109 | it = clip.data.new(batch_size).long().zero_() 110 | elif sample_max: 111 | sampleLogprobs, it = torch.max(logprobs.data, 1) 112 | it = it.view(-1).long() 113 | else: 114 | if temperature == 1.0: 115 | prob_prev = torch.exp(logprobs.data) # fetch prev distribution: shape Nx(M+1) 116 | else: 117 | # scale logprobs by temperature 118 | prob_prev = torch.exp(torch.div(logprobs.data, temperature)) 119 | it = torch.multinomial(prob_prev, 1) 120 | sampleLogprobs = logprobs.gather(1, it) # gather the logprobs at sampled positions 121 | it = it.view(-1).long() # and flatten indices for downstream processing 122 | 123 | logprobs, state = self.get_logprobs_state(it, event , clip, clip_mask, state) 124 | 125 | if t >= 1: 126 | # stop when all finished 127 | if t == 1: 128 | unfinished = it > 0 129 | else: 130 | unfinished = unfinished & (it > 0) 131 | if unfinished.sum() == 0: 132 | break 133 | it = it * unfinished.type_as(it) 134 | seq.append(it) #seq[t] the input of t+2 time step 135 | seqLogprobs.append(sampleLogprobs.view(-1)) 136 | 137 | if seq==[] or len(seq)==0: 138 | return [],[] 139 | return torch.cat([_.unsqueeze(1) for _ in seq], 1), torch.cat([_.unsqueeze(1) for _ in seqLogprobs], 1) 140 | 141 | class AllImgCore(nn.Module): 142 | def __init__(self, opt): 143 | super(AllImgCore, self).__init__() 144 | self.input_encoding_size = opt.input_encoding_size 145 | self.rnn_size = opt.rnn_size 146 | self.num_layers = opt.num_layers 147 | self.drop_prob_lm = opt.drop_prob 148 | self.att_feat_size = opt.clip_context_dim 149 | 150 | self.opt = opt 151 | self.wordRNN_input_feats_type = opt.wordRNN_input_feats_type 152 | self.input_dim = self.decide_input_feats_dim() 153 | self.rnn = nn.LSTM(self.input_encoding_size + self.input_dim, 154 | self.rnn_size, self.num_layers, bias=False, dropout=self.drop_prob_lm) 155 | assert self.wordRNN_input_feats_type == 'C' 156 | 157 | def decide_input_feats_dim(self): 158 | dim = 0 159 | if 'E' in self.wordRNN_input_feats_type: 160 | dim += self.opt.event_context_dim 161 | if 'C' in self.wordRNN_input_feats_type: 162 | dim += self.opt.clip_context_dim 163 | return dim 164 | 165 | def forward(self, xt, event, clip, clip_mask, state): 166 | input_feats = (clip * clip_mask.unsqueeze(2)).sum(1) / (clip_mask.sum(1, keepdims=True) + 1e-5) 167 | output, state = self.rnn(torch.cat([xt, input_feats], 1).unsqueeze(0), state) 168 | return output.squeeze(0), state 169 | 170 | 171 | class LightCaptioner(Captioner): 172 | def __init__(self, opt): 173 | super(LightCaptioner, self).__init__(opt) 174 | self.core = AllImgCore(opt) 175 | -------------------------------------------------------------------------------- /pdvc/CaptioningHead/Puppet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class PuppetCaptionModel(nn.Module): 6 | def __init__(self, opt): 7 | super(PuppetCaptionModel, self).__init__() 8 | self.vocab_size = opt.vocab_size 9 | self.opt = opt 10 | self.puppet_layer= nn.Linear(1,1) 11 | 12 | def forward(self, event, clip, clip_mask, seq): 13 | N, L = seq.shape 14 | output = torch.zeros((N, L-1, self.vocab_size + 1), device=seq.device) 15 | return output 16 | 17 | def sample(self, event, clip, clip_mask, opt={}): 18 | N, _, C = clip.shape 19 | output = torch.zeros((N, 3), device=clip.device) 20 | prob = torch.zeros((N, 3), device=clip.device) 21 | return output, prob 22 | 23 | def 
build_loss(self, input, target, mask): 24 | one_hot = torch.nn.functional.one_hot(target, self.opt.vocab_size+1) 25 | output = - (one_hot * input * mask[..., None]).sum(2).sum(1) / (mask.sum(1) + 1e-6) 26 | return output -------------------------------------------------------------------------------- /pdvc/CaptioningHead/__init__.py: -------------------------------------------------------------------------------- 1 | from .LSTM import LightCaptioner 2 | from .Puppet import PuppetCaptionModel 3 | from .LSTM_DSA import LSTMDSACaptioner 4 | 5 | def build_captioner(opt): 6 | if opt.caption_decoder_type == 'none': 7 | caption_embed = PuppetCaptionModel(opt) 8 | 9 | elif opt.caption_decoder_type == 'light': 10 | opt.event_context_dim = None 11 | opt.clip_context_dim = opt.hidden_dim 12 | caption_embed = LightCaptioner(opt) 13 | 14 | elif opt.caption_decoder_type == 'standard': 15 | opt.event_context_dim = None 16 | opt.clip_context_dim = opt.hidden_dim 17 | caption_embed = LSTMDSACaptioner(opt) 18 | 19 | else: 20 | raise ValueError('caption decoder type is invalid') 21 | return caption_embed 22 | 23 | -------------------------------------------------------------------------------- /pdvc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ttengwang/PDVC/0b025c84f42fe27da51c312e8871c4b19628a04c/pdvc/__init__.py -------------------------------------------------------------------------------- /pdvc/base_encoder.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # PDVC 3 | # ------------------------------------------------------------------------ 4 | # Modified from Deformable DETR 5 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 6 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 7 | # ------------------------------------------------------------------------ 8 | # Modified from DETR (https://github.com/facebookresearch/detr) 9 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 10 | # ------------------------------------------------------------------------ 11 | 12 | """ 13 | Base Encoder to create multi-level conv features and positional embedding. 
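As an aside on the captioning heads defined above: a hedged sketch of how `build_captioner` (from `pdvc/CaptioningHead/__init__.py`) would be driven by a config. The `argparse.Namespace` and every value in it are stand-ins for the project's real option object, chosen only to satisfy the constructors shown above.

```python
# Hedged usage sketch for build_captioner; the Namespace and its values are illustrative stand-ins.
from argparse import Namespace
from pdvc.CaptioningHead import build_captioner      # assumes the repo root is on PYTHONPATH

opt = Namespace(
    caption_decoder_type='light',    # 'none' -> PuppetCaptionModel, 'light' -> LightCaptioner,
                                     # 'standard' -> LSTMDSACaptioner
    hidden_dim=512,                  # becomes opt.clip_context_dim inside build_captioner
    vocab_size=5747,                 # illustrative; the real value comes from the vocabulary json
    input_encoding_size=512,
    rnn_size=512,
    num_layers=1,
    drop_prob=0.5,
    max_caption_len=30,
    wordRNN_input_feats_type='C',    # LightCaptioner asserts clip-only ('C') input features
)
captioner = build_captioner(opt)
print(type(captioner).__name__)      # LightCaptioner
```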
14 | """ 15 | 16 | import torch 17 | import torch.nn.functional as F 18 | from torch import nn 19 | from misc.detr_utils.misc import NestedTensor 20 | from .position_encoding import PositionEmbeddingSine 21 | 22 | 23 | class BaseEncoder(nn.Module): 24 | def __init__(self, num_feature_levels, vf_dim, hidden_dim): 25 | super(BaseEncoder, self).__init__() 26 | self.pos_embed = PositionEmbeddingSine(hidden_dim//2, normalize=True) 27 | self.num_feature_levels = num_feature_levels 28 | self.hidden_dim = hidden_dim 29 | 30 | if num_feature_levels > 1: 31 | input_proj_list = [] 32 | in_channels = vf_dim 33 | input_proj_list.append(nn.Sequential( 34 | nn.Conv1d(in_channels, hidden_dim, kernel_size=1), 35 | nn.GroupNorm(32, hidden_dim), 36 | )) 37 | for _ in range(num_feature_levels - 1): 38 | input_proj_list.append(nn.Sequential( 39 | nn.Conv1d(in_channels, hidden_dim, kernel_size=3, stride=2, padding=1), 40 | nn.GroupNorm(32, hidden_dim), 41 | )) 42 | in_channels = hidden_dim 43 | self.input_proj = nn.ModuleList(input_proj_list) 44 | else: 45 | self.input_proj = nn.ModuleList([ 46 | nn.Sequential( 47 | nn.Conv2d(vf_dim, hidden_dim, kernel_size=1), 48 | nn.GroupNorm(32, hidden_dim), 49 | )]) 50 | 51 | for proj in self.input_proj: 52 | nn.init.xavier_uniform_(proj[0].weight, gain=1) 53 | nn.init.constant_(proj[0].bias, 0) 54 | 55 | def forward(self, vf, mask, duration): 56 | # vf: (N, L, C), mask: (N, L), duration: (N) 57 | vf = vf.transpose(1, 2) # (N, L, C) --> (N, C, L) 58 | vf_nt = NestedTensor(vf, mask, duration) 59 | pos0 = self.pos_embed(vf_nt) 60 | 61 | srcs = [] 62 | masks = [] 63 | poses = [] 64 | 65 | src0, mask0 = vf_nt.decompose() 66 | srcs.append(self.input_proj[0](src0)) 67 | masks.append(mask0) 68 | poses.append(pos0) 69 | assert mask is not None 70 | 71 | for l in range(1, self.num_feature_levels): 72 | if l == 1: 73 | src = self.input_proj[l](vf_nt.tensors) 74 | else: 75 | src = self.input_proj[l](srcs[-1]) 76 | m = vf_nt.mask 77 | mask = F.interpolate(m[None].float(), size=src.shape[-1:]).to(torch.bool)[0] 78 | pos_l = self.pos_embed(NestedTensor(src, mask, duration)).to(src.dtype) 79 | srcs.append(src) 80 | masks.append(mask) 81 | poses.append(pos_l) 82 | return srcs, masks, poses 83 | 84 | def build_base_encoder(args): 85 | base_encoder = BaseEncoder(args.num_feature_levels, args.feature_dim, args.hidden_dim) 86 | return base_encoder 87 | -------------------------------------------------------------------------------- /pdvc/matcher.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | """ 11 | Modules to compute the matching cost and solve the corresponding LSAP. 
12 | """ 13 | import torch 14 | from scipy.optimize import linear_sum_assignment 15 | from torch import nn 16 | 17 | from misc.detr_utils.box_ops import box_cl_to_xy, generalized_box_iou 18 | 19 | 20 | class HungarianMatcher(nn.Module): 21 | """This class computes an assignment between the targets and the predictions of the network 22 | 23 | For efficiency reasons, the targets don't include the no_object. Because of this, in general, 24 | there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, 25 | while the others are un-matched (and thus treated as non-objects). 26 | """ 27 | 28 | def __init__(self, 29 | cost_class: float = 1, 30 | cost_bbox: float = 1, 31 | cost_giou: float = 1, 32 | cost_alpha = 0.25, 33 | cost_gamma = 2): 34 | """Creates the matcher 35 | 36 | Params: 37 | cost_class: This is the relative weight of the classification error in the matching cost 38 | cost_bbox: This is the relative weight of the L1 error of the bounding box coordinates in the matching cost 39 | cost_giou: This is the relative weight of the giou loss of the bounding box in the matching cost 40 | """ 41 | super().__init__() 42 | self.cost_class = cost_class 43 | self.cost_bbox = cost_bbox 44 | self.cost_giou = cost_giou 45 | # self.cost_caption = cost_caption 46 | self.cost_alpha = cost_alpha 47 | self.cost_gamma = cost_gamma 48 | 49 | assert cost_class != 0 or cost_bbox != 0 or cost_giou != 0 or cost_caption!=0, "all costs cant be 0" 50 | 51 | def forward(self, outputs, targets, verbose=False, many_to_one=False): 52 | """ Performs the matching 53 | 54 | Params: 55 | outputs: This is a dict that contains at least these entries: 56 | "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits 57 | "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates 58 | 59 | targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing: 60 | "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth 61 | objects in the target) containing the class labels 62 | "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates 63 | 64 | Returns: 65 | A list of size batch_size, containing tuples of (index_i, index_j) where: 66 | - index_i is the indices of the selected predictions (in order) 67 | - index_j is the indices of the corresponding selected targets (in order) 68 | For each batch element, it holds: 69 | len(index_i) = len(index_j) = min(num_queries, num_target_boxes) 70 | """ 71 | with torch.no_grad(): 72 | bs, num_queries = outputs["pred_logits"].shape[:2] 73 | 74 | # We flatten to compute the cost matrices in a batch 75 | out_prob = outputs["pred_logits"].flatten(0, 1).sigmoid() 76 | out_bbox = outputs["pred_boxes"].flatten(0, 1) # [batch_size * num_queries, 4] 77 | 78 | # Also concat the target labels and boxes 79 | tgt_ids = torch.cat([v["labels"] for v in targets]) 80 | tgt_bbox = torch.cat([v["boxes"] for v in targets]) 81 | 82 | # Compute the classification cost. 
83 | # alpha = 0.25 84 | alpha = self.cost_alpha 85 | gamma = self.cost_gamma 86 | neg_cost_class = (1 - alpha) * (out_prob ** gamma) * (-(1 - out_prob + 1e-8).log()) 87 | pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log()) 88 | cost_class = pos_cost_class[:, tgt_ids] - neg_cost_class[:, tgt_ids] 89 | 90 | # Compute the L1 cost between boxes 91 | cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1) 92 | 93 | # Compute the giou cost betwen boxes 94 | cost_giou = -generalized_box_iou(box_cl_to_xy(out_bbox), 95 | box_cl_to_xy(tgt_bbox)) 96 | 97 | # cost_caption = outputs['caption_costs'].flatten(0, 1) 98 | 99 | # Final cost matrix 100 | C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou 101 | 102 | costs = {'cost_bbox': cost_bbox, 103 | 'cost_class': cost_class, 104 | 'cost_giou': cost_giou, 105 | # 'cost_caption': cost_caption, 106 | 'out_bbox': out_bbox[:, 0::2]} 107 | 108 | if verbose: 109 | print('\n') 110 | print(self.cost_bbox, cost_bbox.var(dim=0), cost_bbox.max(dim=0)[0] - cost_bbox.min(dim=0)[0]) 111 | print(self.cost_class, cost_class.var(dim=0), cost_class.max(dim=0)[0] - cost_class.min(dim=0)[0]) 112 | print(self.cost_giou, cost_giou.var(dim=0), cost_giou.max(dim=0)[0] - cost_giou.min(dim=0)[0]) 113 | # print(self.cost_caption, cost_caption.var(dim=0), cost_caption.max(dim=0)[0] - cost_caption.min(dim=0)[0]) 114 | 115 | C = C.view(bs, num_queries, -1).cpu() 116 | 117 | sizes = [len(v["boxes"]) for v in targets] 118 | # pdb.set_trace() 119 | indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))] 120 | m2o_rate = 4 121 | rl_indices = [linear_sum_assignment(torch.cat([c[i]]*m2o_rate, -1)) for i, c in enumerate(C.split(sizes, -1))] 122 | rl_indices = [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j%sizes[ii], dtype=torch.int64)) for ii,(i, j) in 123 | enumerate(rl_indices)] 124 | 125 | indices = [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices] 126 | 127 | if verbose: 128 | print('------matching results:') 129 | print(indices) 130 | for indice in indices: 131 | for i, j in zip(*indice): 132 | print(out_bbox[i][0::2], tgt_bbox[j][0::2]) 133 | print('-----topK scores:') 134 | topk_indices = out_prob.topk(10, dim=0) 135 | print(topk_indices) 136 | for i,(v,ids) in enumerate(zip(*topk_indices)): 137 | print('top {}'.format(i)) 138 | s= '' 139 | for name,cost in costs.items(): 140 | s += name + ':{} '.format(cost[ids]) 141 | print(s) 142 | 143 | return indices, rl_indices 144 | 145 | 146 | def build_matcher(args): 147 | return HungarianMatcher(cost_class=args.set_cost_class, 148 | cost_bbox=args.set_cost_bbox, 149 | cost_giou=args.set_cost_giou, 150 | cost_alpha = args.cost_alpha, 151 | cost_gamma = args.cost_gamma 152 | ) 153 | -------------------------------------------------------------------------------- /pdvc/ops/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ttengwang/PDVC/0b025c84f42fe27da51c312e8871c4b19628a04c/pdvc/ops/__init__.py -------------------------------------------------------------------------------- /pdvc/ops/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
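A self-contained sketch of the matching step in `HungarianMatcher` above, using dummy numbers: the focal-style classification cost and the L1 segment cost are combined and handed to `linear_sum_assignment`. The gIoU term is omitted here for brevity, and the cost weights below are illustrative rather than the repo's configured values.

```python
# Hungarian matching on dummy predictions, mirroring HungarianMatcher.forward for one video.
import torch
from scipy.optimize import linear_sum_assignment

num_queries, num_targets = 4, 2
out_prob = torch.rand(num_queries, 1)              # sigmoid scores for the single foreground class
out_bbox = torch.rand(num_queries, 2)              # predicted (center, length) segments
tgt_ids = torch.zeros(num_targets, dtype=torch.long)
tgt_bbox = torch.rand(num_targets, 2)

alpha, gamma = 0.25, 2                              # cost_alpha / cost_gamma defaults
neg = (1 - alpha) * out_prob ** gamma * (-(1 - out_prob + 1e-8).log())
pos = alpha * (1 - out_prob) ** gamma * (-(out_prob + 1e-8).log())
cost_class = pos[:, tgt_ids] - neg[:, tgt_ids]      # focal-style classification cost
cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1)    # L1 cost on (center, length)

C = 2 * cost_class + 5 * cost_bbox                  # illustrative weights; the repo reads them from opts
row, col = linear_sum_assignment(C.numpy())
print(list(zip(row.tolist(), col.tolist())))        # matched (query, target) pairs
```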
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from .ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch 10 | 11 | -------------------------------------------------------------------------------- /pdvc/ops/functions/ms_deform_attn_func.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from __future__ import absolute_import 10 | from __future__ import print_function 11 | from __future__ import division 12 | 13 | import torch 14 | import torch.nn.functional as F 15 | from torch.autograd import Function 16 | from torch.autograd.function import once_differentiable 17 | 18 | try: 19 | import MultiScaleDeformableAttention as MSDA 20 | except: 21 | pass 22 | 23 | class MSDeformAttnFunction(Function): 24 | @staticmethod 25 | def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): 26 | # sampling_locations:(...,2), the first item of last dim means x axis corresponding to w, and second item of the last dim means y, corresponding to h. 
27 | ctx.im2col_step = im2col_step 28 | output = MSDA.ms_deform_attn_forward( 29 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) 30 | ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) 31 | return output 32 | 33 | @staticmethod 34 | @once_differentiable 35 | def backward(ctx, grad_output): 36 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors 37 | grad_value, grad_sampling_loc, grad_attn_weight = \ 38 | MSDA.ms_deform_attn_backward( 39 | value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) 40 | 41 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None 42 | 43 | 44 | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights, return_value=False): 45 | # for debug and test only, 46 | # need to use cuda version instead 47 | N_, S_, M_, D_ = value.shape # N_: batch size , S_: \sum_H*W, M_ : head number, D_: feature dim of each head 48 | 49 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape # Lq_: \sum H*W, L_: multi-scale number, P_: number of sampled key points 50 | 51 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) 52 | sampling_grids = 2 * sampling_locations - 1 # convert value from range[0,1] to [-1, 1] 53 | sampling_value_list = [] 54 | for lid_, (H_, W_) in enumerate(value_spatial_shapes): 55 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ 56 | value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) 57 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 58 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) 59 | # sampling_grid_l_: (...,2), the first item of last dim means x axis corresponding to w, and second item of the last dim means y, corresponding to h. 60 | # N_*M_, D_, Lq_, P_ 61 | sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, 62 | mode='bilinear', padding_mode='border', align_corners=False) 63 | sampling_value_list.append(sampling_value_l_) 64 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) 65 | attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) 66 | 67 | if return_value: 68 | return torch.stack(sampling_value_list, dim=-2) 69 | #(N_ * M_, D_, Lq_, L_* P_) * (N_*M_, 1, Lq_, L_*P_) --> (N_*M_, D_, Lq_) 70 | output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) 71 | return output.transpose(1, 2).contiguous() 72 | -------------------------------------------------------------------------------- /pdvc/ops/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
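The pure-PyTorch fallback `ms_deform_attn_core_pytorch` above can be exercised without the compiled CUDA extension. A hedged sketch with small shapes follows, storing each temporal level as a (1, T) grid; the repo's own `pdvc/ops/test.py` runs the same kind of check against the CUDA kernel.

```python
# CPU reference run of ms_deform_attn_core_pytorch (illustrative shapes; assumes the repo
# root is on PYTHONPATH so pdvc.ops.functions is importable).
import torch
from pdvc.ops.functions import ms_deform_attn_core_pytorch

N, M, D = 1, 2, 4                      # batch, heads, channels per head
L, P, Lq = 2, 4, 3                     # levels, points per level, number of queries
shapes = [(1, 16), (1, 8)]             # each temporal level stored as an (H, W) = (1, T) grid
S = sum(h * w for h, w in shapes)

value = torch.rand(N, S, M, D)
sampling_locations = torch.rand(N, Lq, M, L, P, 2)            # (x, y) in [0, 1]
attention_weights = torch.rand(N, Lq, M, L, P)
attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)

out = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights)
print(out.shape)                       # torch.Size([1, 3, 8]) == (N, Lq, M * D)
```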
5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------------------------------ 7 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | # ------------------------------------------------------------------------------------------------ 9 | python setup.py build install 10 | -------------------------------------------------------------------------------- /pdvc/ops/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from .ms_deform_attn import MSDeformAttn 10 | from .ms_deform_attn_for_caption import MSDeformAttnCap -------------------------------------------------------------------------------- /pdvc/ops/modules/ms_deform_attn.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from __future__ import absolute_import 10 | from __future__ import print_function 11 | from __future__ import division 12 | 13 | import warnings 14 | import math 15 | 16 | import torch 17 | from torch import nn 18 | import torch.nn.functional as F 19 | from torch.nn.init import xavier_uniform_, constant_ 20 | 21 | from ..functions import MSDeformAttnFunction, ms_deform_attn_core_pytorch 22 | 23 | 24 | def _is_power_of_2(n): 25 | if (not isinstance(n, int)) or (n < 0): 26 | raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n))) 27 | return (n & (n-1) == 0) and n != 0 28 | 29 | 30 | class MSDeformAttn(nn.Module): 31 | def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4): 32 | """ 33 | Multi-Scale Deformable Attention Module 34 | :param d_model hidden dimension 35 | :param n_levels number of feature levels 36 | :param n_heads number of attention heads 37 | :param n_points number of sampling points per attention head per feature level 38 | """ 39 | super().__init__() 40 | if d_model % n_heads != 0: 41 | raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads)) 42 | _d_per_head = d_model // n_heads 43 | # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation 44 | if not _is_power_of_2(_d_per_head): 45 | warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 " 46 | "which 
is more efficient in our CUDA implementation.") 47 | 48 | self.im2col_step = 64 49 | 50 | self.d_model = d_model 51 | self.n_levels = n_levels 52 | self.n_heads = n_heads 53 | self.n_points = n_points 54 | 55 | self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points ) 56 | self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points) 57 | self.value_proj = nn.Linear(d_model, d_model) 58 | self.output_proj = nn.Linear(d_model, d_model) 59 | 60 | self._reset_parameters() 61 | 62 | def _reset_parameters(self): 63 | constant_(self.sampling_offsets.weight.data, 0.) 64 | thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2 * math.pi / self.n_heads) 65 | grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) 66 | grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2) 67 | grid_init = grid_init[..., 0].repeat(1, self.n_levels, self.n_points) 68 | for i in range(self.n_points): 69 | grid_init[:, :, i] *= i + 1 70 | with torch.no_grad(): 71 | self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) 72 | constant_(self.attention_weights.weight.data, 0.) 73 | constant_(self.attention_weights.bias.data, 0.) 74 | xavier_uniform_(self.value_proj.weight.data) 75 | constant_(self.value_proj.bias.data, 0.) 76 | xavier_uniform_(self.output_proj.weight.data) 77 | constant_(self.output_proj.bias.data, 0.) 78 | 79 | def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None): 80 | """ 81 | :param query (N, Length_{query}, C) 82 | :param reference_points (N, Length_{query}, n_levels, 1), range in [0, 1], including padding area 83 | or (N, Length_{query}, n_levels, 2), add additional (c, l) to form reference boxes 84 | :param input_flatten (N, \sum_{l=0}^{L-1} T_l, C) 85 | :param input_spatial_shapes (n_levels ), [T_0, T_1, ..., T_{L-1}] 86 | :param input_level_start_index (n_levels ), [0, 1_0, T_0+T_1, ...] 
87 | :param input_padding_mask (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements 88 | 89 | :return output (N, Length_{query}, C) 90 | """ 91 | N, Len_q, _ = query.shape 92 | N, Len_in, _ = input_flatten.shape 93 | assert input_spatial_shapes.sum() == Len_in 94 | 95 | value = self.value_proj(input_flatten) 96 | if input_padding_mask is not None: 97 | value = value.masked_fill(input_padding_mask[..., None], float(0)) 98 | value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads) 99 | sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points) 100 | attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points) 101 | attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points) 102 | # N, Len_q, n_heads, n_levels, n_points, 2 103 | if reference_points.shape[-1] == 1: 104 | offset_normalizer = input_spatial_shapes 105 | sampling_locations = reference_points[:, :, None, :, None, 0] \ 106 | + sampling_offsets / offset_normalizer[None, None, None, :, None] 107 | elif reference_points.shape[-1] == 2: 108 | sampling_locations = reference_points[:, :, None, :, None, 0] \ 109 | + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 1] * 0.5 110 | else: 111 | raise ValueError( 112 | 'Last dim of reference_points must be 1 or 2, but get {} instead.'.format(reference_points.shape[-1])) 113 | 114 | if True: 115 | sampling_locations = torch.stack( 116 | (sampling_locations, 0.5 * sampling_locations.new_ones(sampling_locations.shape)), -1) 117 | input_spatial_shapes = torch.stack([input_spatial_shapes.new_ones(input_spatial_shapes.shape), input_spatial_shapes], -1) 118 | 119 | if query.device.type == 'cuda': 120 | output = MSDeformAttnFunction.apply( 121 | value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, 122 | self.im2col_step) 123 | else: 124 | output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights) 125 | output = self.output_proj(output) 126 | return output 127 | -------------------------------------------------------------------------------- /pdvc/ops/modules/ms_deform_attn_for_caption.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
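The `if True:` block at the end of `MSDeformAttn.forward` above is the 1-D-to-2-D lift that lets the 2-D deformable-attention kernel be reused for temporal features: every sampling position gets a fixed y = 0.5, and every level of length T is declared to be a (1, T) grid. A small sketch of just that lift, with illustrative shapes:

```python
# The 1-D -> 2-D lift applied before calling the attention kernel (shapes are illustrative).
import torch

N, Lq, n_heads, n_levels, n_points = 1, 3, 2, 2, 4
sampling_locations = torch.rand(N, Lq, n_heads, n_levels, n_points)   # temporal x positions in [0, 1]
input_spatial_shapes = torch.as_tensor([16, 8], dtype=torch.long)     # temporal length per level

sampling_locations_2d = torch.stack(
    (sampling_locations, 0.5 * sampling_locations.new_ones(sampling_locations.shape)), -1)
input_spatial_shapes_2d = torch.stack(
    [input_spatial_shapes.new_ones(input_spatial_shapes.shape), input_spatial_shapes], -1)

print(sampling_locations_2d.shape)       # (1, 3, 2, 2, 4, 2): last dim is (x, y) with y fixed at 0.5
print(input_spatial_shapes_2d.tolist())  # [[1, 16], [1, 8]]
```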
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from __future__ import absolute_import 10 | from __future__ import print_function 11 | from __future__ import division 12 | 13 | import warnings 14 | import math 15 | 16 | import torch 17 | from torch import nn 18 | import torch.nn.functional as F 19 | from torch.nn.init import xavier_uniform_, constant_ 20 | 21 | from ..functions import MSDeformAttnFunction, ms_deform_attn_core_pytorch 22 | 23 | 24 | def _is_power_of_2(n): 25 | if (not isinstance(n, int)) or (n < 0): 26 | raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n))) 27 | return (n & (n-1) == 0) and n != 0 28 | 29 | 30 | class MSDeformAttnCap(nn.Module): 31 | def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4,): 32 | """ 33 | Multi-Scale Deformable Attention Module 34 | :param d_model hidden dimension 35 | :param n_levels number of feature levels 36 | :param n_heads number of attention heads 37 | :param n_points number of sampling points per attention head per feature level 38 | """ 39 | super().__init__() 40 | if d_model % n_heads != 0: 41 | raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads)) 42 | _d_per_head = d_model // n_heads 43 | # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation 44 | if not _is_power_of_2(_d_per_head): 45 | warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 " 46 | "which is more efficient in our CUDA implementation.") 47 | 48 | self.im2col_step = 64 49 | self.d_model = d_model 50 | self.n_levels = n_levels 51 | self.n_heads = n_heads 52 | self.n_points = n_points 53 | 54 | self.sampling_offsets = nn.Linear(2 * d_model, n_heads * n_levels * n_points) 55 | self.attention_weights = nn.Linear(2 * d_model, n_heads * n_levels * n_points) 56 | self.value_proj = nn.Linear(d_model, d_model) 57 | self.output_proj = nn.Linear(d_model, d_model) 58 | self._reset_parameters() 59 | 60 | def _reset_parameters(self): 61 | constant_(self.sampling_offsets.weight.data, 0.) 62 | thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads) 63 | grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) 64 | grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2) 65 | grid_init = grid_init[..., 0].repeat(1, self.n_levels, self.n_points) 66 | for i in range(self.n_points): 67 | grid_init[:, :, i] *= i + 1 68 | grid_init = grid_init - grid_init.mean(2, keepdim=True) 69 | with torch.no_grad(): 70 | self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) 71 | constant_(self.attention_weights.weight.data, 0.) 72 | constant_(self.attention_weights.bias.data, 0.) 73 | xavier_uniform_(self.value_proj.weight.data) 74 | constant_(self.value_proj.bias.data, 0.) 75 | xavier_uniform_(self.output_proj.weight.data) 76 | constant_(self.output_proj.bias.data, 0.) 
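The `_reset_parameters` just above seeds `sampling_offsets.bias` so that, before any training, each attention head looks in its own direction and successive sampling points step progressively further along it; the caption variant additionally centers the grid. A sketch of that initialization, reproduced with illustrative sizes:

```python
# Sampling-offset bias initialization, mirroring MSDeformAttnCap._reset_parameters above.
import math
import torch

n_heads, n_levels, n_points = 8, 2, 4
thetas = torch.arange(n_heads, dtype=torch.float32) * (2.0 * math.pi / n_heads)
grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(n_heads, 1, 1, 2)
grid_init = grid_init[..., 0].repeat(1, n_levels, n_points)   # temporal case keeps only the x component
for i in range(n_points):
    grid_init[:, :, i] *= i + 1                               # point i starts (i + 1) steps out
grid_init = grid_init - grid_init.mean(2, keepdim=True)       # centering used by the caption variant
print(grid_init.shape)                                        # torch.Size([8, 2, 4]) -> flattened into the bias
```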
77 | 78 | def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None): 79 | """ 80 | :param query (N, Length_{query}, C) 81 | :param reference_points (N, Length_{query}, n_levels, 1), range in [0, 1], including padding area 82 | or (N, Length_{query}, n_levels, 2), add additional (c, l) to form reference boxes 83 | :param input_flatten (N, \sum_{l=0}^{L-1} T_l, C) 84 | :param input_spatial_shapes (n_levels ), [T_0, T_1, ..., T_{L-1}] 85 | :param input_level_start_index (n_levels ), [0, 1_0, T_0+T_1, ...] 86 | :param input_padding_mask (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements 87 | 88 | :return output (N, Length_{query}, C) 89 | """ 90 | N, Len_q, _ = query.shape 91 | N, Len_in, _ = input_flatten.shape 92 | assert input_spatial_shapes.sum() == Len_in 93 | 94 | value = self.value_proj(input_flatten) 95 | if input_padding_mask is not None: 96 | value = value.masked_fill(input_padding_mask[..., None], float(0)) 97 | value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads) 98 | sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points) 99 | attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points) 100 | attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points) 101 | # N, Len_q, n_heads, n_levels, n_points, 1 102 | if reference_points.shape[-1] == 1: 103 | offset_normalizer = input_spatial_shapes 104 | sampling_locations = reference_points[:, :, None, :, None, 0] \ 105 | + sampling_offsets / offset_normalizer[None, None, None, :, None] 106 | elif reference_points.shape[-1] == 2: 107 | sampling_locations = reference_points[:, :, None, :, None, 0] \ 108 | + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 1] * 0.5 109 | else: 110 | raise ValueError( 111 | 'Last dim of reference_points must be 1 or 2, but get {} instead.'.format(reference_points.shape[-1])) 112 | 113 | 114 | 115 | if True: 116 | sampling_locations = torch.stack( 117 | (sampling_locations, 0.5 * sampling_locations.new_ones(sampling_locations.shape)), -1) 118 | input_spatial_shapes = torch.stack([input_spatial_shapes.new_ones(input_spatial_shapes.shape), input_spatial_shapes], -1) 119 | 120 | output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights, 121 | return_value=True) 122 | 123 | return output 124 | -------------------------------------------------------------------------------- /pdvc/ops/setup.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
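`MSDeformAttnCap` calls the reference op with `return_value=True`, so instead of the weighted sum it hands the raw per-level, per-point sampled features to the captioning head. A hedged, self-contained sketch of that shape difference (illustrative sizes; assumes the repo root is on PYTHONPATH):

```python
# return_value=True skips the weighted sum and returns the sampled features themselves.
import torch
from pdvc.ops.functions import ms_deform_attn_core_pytorch

N, M, D, Lq, L, P = 1, 2, 4, 3, 2, 4
shapes = [(1, 16), (1, 8)]
value = torch.rand(N, sum(h * w for h, w in shapes), M, D)
loc = torch.rand(N, Lq, M, L, P, 2)
w = torch.softmax(torch.rand(N, Lq, M, L * P), -1).view(N, Lq, M, L, P)

sampled = ms_deform_attn_core_pytorch(value, shapes, loc, w, return_value=True)
print(sampled.shape)    # torch.Size([2, 4, 3, 2, 4]) == (N * M, D, Lq, L, P)
```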
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | import os 10 | import glob 11 | 12 | import torch 13 | 14 | from torch.utils.cpp_extension import CUDA_HOME 15 | from torch.utils.cpp_extension import CppExtension 16 | from torch.utils.cpp_extension import CUDAExtension 17 | 18 | from setuptools import find_packages 19 | from setuptools import setup 20 | 21 | requirements = ["torch", "torchvision"] 22 | 23 | def get_extensions(): 24 | this_dir = os.path.dirname(os.path.abspath(__file__)) 25 | extensions_dir = os.path.join(this_dir, "src") 26 | 27 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 28 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 29 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 30 | 31 | sources = main_file + source_cpu 32 | extension = CppExtension 33 | extra_compile_args = {"cxx": []} 34 | define_macros = [] 35 | 36 | if torch.cuda.is_available() and CUDA_HOME is not None: 37 | extension = CUDAExtension 38 | sources += source_cuda 39 | define_macros += [("WITH_CUDA", None)] 40 | extra_compile_args["nvcc"] = [ 41 | "-DCUDA_HAS_FP16=1", 42 | "-D__CUDA_NO_HALF_OPERATORS__", 43 | "-D__CUDA_NO_HALF_CONVERSIONS__", 44 | "-D__CUDA_NO_HALF2_OPERATORS__", 45 | ] 46 | else: 47 | raise NotImplementedError('Cuda is not availabel') 48 | 49 | sources = [os.path.join(extensions_dir, s) for s in sources] 50 | include_dirs = [extensions_dir] 51 | ext_modules = [ 52 | extension( 53 | "MultiScaleDeformableAttention", 54 | sources, 55 | include_dirs=include_dirs, 56 | define_macros=define_macros, 57 | extra_compile_args=extra_compile_args, 58 | ) 59 | ] 60 | return ext_modules 61 | 62 | setup( 63 | name="MultiScaleDeformableAttention", 64 | version="1.0", 65 | author="Weijie Su", 66 | url="https://github.com/fundamentalvision/Deformable-DETR", 67 | description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention", 68 | packages=find_packages(exclude=("configs", "tests",)), 69 | ext_modules=get_extensions(), 70 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 71 | ) 72 | -------------------------------------------------------------------------------- /pdvc/ops/src/cpu/ms_deform_attn_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
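`setup.py` above refuses to build without CUDA, so before running `pdvc/ops/make.sh` it is worth confirming that PyTorch sees a GPU and a CUDA toolkit. A quick check mirroring the guard in `get_extensions`:

```python
# Pre-build sanity check mirroring the CUDA guard in pdvc/ops/setup.py.
import torch
from torch.utils.cpp_extension import CUDA_HOME

print(torch.cuda.is_available())   # must be True, otherwise setup.py raises NotImplementedError
print(CUDA_HOME)                   # must point at a CUDA toolkit, e.g. /usr/local/cuda
```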
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include 12 | 13 | #include 14 | #include 15 | 16 | 17 | at::Tensor 18 | ms_deform_attn_cpu_forward( 19 | const at::Tensor &value, 20 | const at::Tensor &spatial_shapes, 21 | const at::Tensor &level_start_index, 22 | const at::Tensor &sampling_loc, 23 | const at::Tensor &attn_weight, 24 | const int im2col_step) 25 | { 26 | AT_ERROR("Not implement on cpu"); 27 | } 28 | 29 | std::vector 30 | ms_deform_attn_cpu_backward( 31 | const at::Tensor &value, 32 | const at::Tensor &spatial_shapes, 33 | const at::Tensor &level_start_index, 34 | const at::Tensor &sampling_loc, 35 | const at::Tensor &attn_weight, 36 | const at::Tensor &grad_output, 37 | const int im2col_step) 38 | { 39 | AT_ERROR("Not implement on cpu"); 40 | } 41 | 42 | -------------------------------------------------------------------------------- /pdvc/ops/src/cpu/ms_deform_attn_cpu.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | #include 13 | 14 | at::Tensor 15 | ms_deform_attn_cpu_forward( 16 | const at::Tensor &value, 17 | const at::Tensor &spatial_shapes, 18 | const at::Tensor &level_start_index, 19 | const at::Tensor &sampling_loc, 20 | const at::Tensor &attn_weight, 21 | const int im2col_step); 22 | 23 | std::vector 24 | ms_deform_attn_cpu_backward( 25 | const at::Tensor &value, 26 | const at::Tensor &spatial_shapes, 27 | const at::Tensor &level_start_index, 28 | const at::Tensor &sampling_loc, 29 | const at::Tensor &attn_weight, 30 | const at::Tensor &grad_output, 31 | const int im2col_step); 32 | 33 | 34 | -------------------------------------------------------------------------------- /pdvc/ops/src/cuda/ms_deform_attn_cuda.cu: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include 12 | #include "cuda/ms_deform_im2col_cuda.cuh" 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | 20 | at::Tensor ms_deform_attn_cuda_forward( 21 | const at::Tensor &value, 22 | const at::Tensor &spatial_shapes, 23 | const at::Tensor &level_start_index, 24 | const at::Tensor &sampling_loc, 25 | const at::Tensor &attn_weight, 26 | const int im2col_step) 27 | { 28 | AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous"); 29 | AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous"); 30 | AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous"); 31 | AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous"); 32 | AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous"); 33 | 34 | AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor"); 35 | AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor"); 36 | AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor"); 37 | AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor"); 38 | AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor"); 39 | 40 | const int batch = value.size(0); 41 | const int spatial_size = value.size(1); 42 | const int num_heads = value.size(2); 43 | const int channels = value.size(3); 44 | 45 | const int num_levels = spatial_shapes.size(0); 46 | 47 | const int num_query = sampling_loc.size(1); 48 | const int num_point = sampling_loc.size(4); 49 | 50 | const int im2col_step_ = std::min(batch, im2col_step); 51 | 52 | AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_); 53 | 54 | auto output = at::zeros({batch, num_query, num_heads, channels}, value.options()); 55 | 56 | const int batch_n = im2col_step_; 57 | auto output_n = output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels}); 58 | auto per_value_size = spatial_size * num_heads * channels; 59 | auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; 60 | auto per_attn_weight_size = num_query * num_heads * num_levels * num_point; 61 | for (int n = 0; n < batch/im2col_step_; ++n) 62 | { 63 | auto columns = output_n.select(0, n); 64 | AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_forward_cuda", ([&] { 65 | ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(), 66 | value.data() + n * im2col_step_ * per_value_size, 67 | spatial_shapes.data(), 68 | level_start_index.data(), 69 | sampling_loc.data() + n * im2col_step_ * per_sample_loc_size, 70 | attn_weight.data() + n * im2col_step_ * per_attn_weight_size, 71 | batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point, 72 | columns.data()); 73 | 74 | })); 75 | } 76 | 77 | output = output.view({batch, num_query, num_heads*channels}); 78 | 79 | return output; 80 | } 81 | 82 | 83 | std::vector ms_deform_attn_cuda_backward( 84 | const at::Tensor &value, 85 | const at::Tensor &spatial_shapes, 86 | const at::Tensor &level_start_index, 87 | 
const at::Tensor &sampling_loc, 88 | const at::Tensor &attn_weight, 89 | const at::Tensor &grad_output, 90 | const int im2col_step) 91 | { 92 | 93 | AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous"); 94 | AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous"); 95 | AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous"); 96 | AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous"); 97 | AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous"); 98 | AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous"); 99 | 100 | AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor"); 101 | AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor"); 102 | AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor"); 103 | AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor"); 104 | AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor"); 105 | AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor"); 106 | 107 | const int batch = value.size(0); 108 | const int spatial_size = value.size(1); 109 | const int num_heads = value.size(2); 110 | const int channels = value.size(3); 111 | 112 | const int num_levels = spatial_shapes.size(0); 113 | 114 | const int num_query = sampling_loc.size(1); 115 | const int num_point = sampling_loc.size(4); 116 | 117 | const int im2col_step_ = std::min(batch, im2col_step); 118 | 119 | AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_); 120 | 121 | auto grad_value = at::zeros_like(value); 122 | auto grad_sampling_loc = at::zeros_like(sampling_loc); 123 | auto grad_attn_weight = at::zeros_like(attn_weight); 124 | 125 | const int batch_n = im2col_step_; 126 | auto per_value_size = spatial_size * num_heads * channels; 127 | auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; 128 | auto per_attn_weight_size = num_query * num_heads * num_levels * num_point; 129 | auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels}); 130 | 131 | for (int n = 0; n < batch/im2col_step_; ++n) 132 | { 133 | auto grad_output_g = grad_output_n.select(0, n); 134 | AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_backward_cuda", ([&] { 135 | ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(), 136 | grad_output_g.data(), 137 | value.data() + n * im2col_step_ * per_value_size, 138 | spatial_shapes.data(), 139 | level_start_index.data(), 140 | sampling_loc.data() + n * im2col_step_ * per_sample_loc_size, 141 | attn_weight.data() + n * im2col_step_ * per_attn_weight_size, 142 | batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point, 143 | grad_value.data() + n * im2col_step_ * per_value_size, 144 | grad_sampling_loc.data() + n * im2col_step_ * per_sample_loc_size, 145 | grad_attn_weight.data() + n * im2col_step_ * per_attn_weight_size); 146 | 147 | })); 148 | } 149 | 150 | return { 151 | grad_value, grad_sampling_loc, grad_attn_weight 152 | }; 153 | } -------------------------------------------------------------------------------- /pdvc/ops/src/cuda/ms_deform_attn_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 
2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | #include 13 | 14 | at::Tensor ms_deform_attn_cuda_forward( 15 | const at::Tensor &value, 16 | const at::Tensor &spatial_shapes, 17 | const at::Tensor &level_start_index, 18 | const at::Tensor &sampling_loc, 19 | const at::Tensor &attn_weight, 20 | const int im2col_step); 21 | 22 | std::vector ms_deform_attn_cuda_backward( 23 | const at::Tensor &value, 24 | const at::Tensor &spatial_shapes, 25 | const at::Tensor &level_start_index, 26 | const at::Tensor &sampling_loc, 27 | const at::Tensor &attn_weight, 28 | const at::Tensor &grad_output, 29 | const int im2col_step); 30 | 31 | -------------------------------------------------------------------------------- /pdvc/ops/src/ms_deform_attn.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | 13 | #include "cpu/ms_deform_attn_cpu.h" 14 | 15 | #ifdef WITH_CUDA 16 | #include "cuda/ms_deform_attn_cuda.h" 17 | #endif 18 | 19 | 20 | at::Tensor 21 | ms_deform_attn_forward( 22 | const at::Tensor &value, 23 | const at::Tensor &spatial_shapes, 24 | const at::Tensor &level_start_index, 25 | const at::Tensor &sampling_loc, 26 | const at::Tensor &attn_weight, 27 | const int im2col_step) 28 | { 29 | if (value.type().is_cuda()) 30 | { 31 | #ifdef WITH_CUDA 32 | return ms_deform_attn_cuda_forward( 33 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step); 34 | #else 35 | AT_ERROR("Not compiled with GPU support"); 36 | #endif 37 | } 38 | AT_ERROR("Not implemented on the CPU"); 39 | } 40 | 41 | std::vector 42 | ms_deform_attn_backward( 43 | const at::Tensor &value, 44 | const at::Tensor &spatial_shapes, 45 | const at::Tensor &level_start_index, 46 | const at::Tensor &sampling_loc, 47 | const at::Tensor &attn_weight, 48 | const at::Tensor &grad_output, 49 | const int im2col_step) 50 | { 51 | if (value.type().is_cuda()) 52 | { 53 | #ifdef WITH_CUDA 54 | return ms_deform_attn_cuda_backward( 55 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step); 56 | #else 57 | AT_ERROR("Not compiled with GPU support"); 58 | #endif 59 | } 60 | AT_ERROR("Not implemented on the CPU"); 61 | } 62 | 63 | -------------------------------------------------------------------------------- /pdvc/ops/src/vision.cpp: -------------------------------------------------------------------------------- 1 
| /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include "ms_deform_attn.h" 12 | 13 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 14 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); 15 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); 16 | } 17 | -------------------------------------------------------------------------------- /pdvc/ops/test.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from __future__ import absolute_import 10 | from __future__ import print_function 11 | from __future__ import division 12 | 13 | import time 14 | import torch 15 | import torch.nn as nn 16 | from torch.autograd import gradcheck 17 | 18 | from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch 19 | 20 | 21 | N, M, D = 1, 2, 2 22 | Lq, L, P = 2, 2, 2 23 | shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda() 24 | level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) 25 | S = sum([(H*W).item() for H, W in shapes]) 26 | 27 | 28 | torch.manual_seed(3) 29 | 30 | 31 | @torch.no_grad() 32 | def check_forward_equal_with_pytorch_double(): 33 | value = torch.rand(N, S, M, D).cuda() * 0.01 34 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 35 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 36 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 37 | im2col_step = 2 38 | output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu() 39 | output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu() 40 | fwdok = torch.allclose(output_cuda, output_pytorch) 41 | max_abs_err = (output_cuda - output_pytorch).abs().max() 42 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 43 | 44 | print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 45 | 46 | 47 | @torch.no_grad() 48 | def check_forward_equal_with_pytorch_float(): 49 | value = torch.rand(N, S, M, D).cuda() * 0.01 50 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 51 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 
52 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 53 | im2col_step = 2 54 | output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu() 55 | output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu() 56 | fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3) 57 | max_abs_err = (output_cuda - output_pytorch).abs().max() 58 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 59 | 60 | print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 61 | 62 | 63 | def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True): 64 | 65 | value = torch.rand(N, S, M, channels).cuda() * 0.01 66 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 67 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 68 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 69 | im2col_step = 2 70 | func = MSDeformAttnFunction.apply 71 | 72 | value.requires_grad = grad_value 73 | sampling_locations.requires_grad = grad_sampling_loc 74 | attention_weights.requires_grad = grad_attn_weight 75 | 76 | gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step)) 77 | 78 | print(f'* {gradok} check_gradient_numerical(D={channels})') 79 | 80 | 81 | if __name__ == '__main__': 82 | check_forward_equal_with_pytorch_double() 83 | check_forward_equal_with_pytorch_float() 84 | 85 | for channels in [30, 32, 64, 71, 1025, 2048, 3096]: 86 | check_gradient_numerical(channels, True, True, True) 87 | 88 | 89 | 90 | -------------------------------------------------------------------------------- /pdvc/position_encoding.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | """ 11 | Various positional encodings for the transformer. 12 | """ 13 | import math 14 | import torch 15 | from torch import nn 16 | 17 | from misc.detr_utils.misc import NestedTensor 18 | 19 | 20 | class PositionEmbeddingSine(nn.Module): 21 | """ 22 | This is a more standard version of the position embedding, very similar to the one 23 | used by the Attention is all you need paper, generalized to work on images. 
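`test.py` above only passes once the extension from `pdvc/ops/setup.py` has been built and is importable. A minimal hedged check that the compiled module and the two functions bound in `vision.cpp` are actually visible from Python:

```python
# Post-build check for the compiled extension (run after pdvc/ops/make.sh succeeds).
import MultiScaleDeformableAttention as MSDA   # module name registered in setup.py / vision.cpp

print(hasattr(MSDA, "ms_deform_attn_forward"))   # True
print(hasattr(MSDA, "ms_deform_attn_backward"))  # True
```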
24 | """ 25 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 26 | super().__init__() 27 | self.num_pos_feats = num_pos_feats 28 | self.temperature = temperature 29 | self.normalize = normalize 30 | if scale is not None and normalize is False: 31 | raise ValueError("normalize should be True if scale is passed") 32 | if scale is None: 33 | scale = 2 * math.pi 34 | self.scale = scale 35 | self.max_duration = 256 36 | self.duration_embed_layer = nn.Linear(self.max_duration, self.max_duration) 37 | 38 | def forward(self, tensor_list: NestedTensor): 39 | x = tensor_list.tensors 40 | mask = tensor_list.mask 41 | duration = tensor_list.duration 42 | assert mask is not None 43 | not_mask = ~mask 44 | x_embed = not_mask.cumsum(1, dtype=torch.float32) 45 | if self.normalize: 46 | eps = 1e-6 47 | x_embed = (x_embed - 0.5) / (x_embed[:, -1:] + eps) * self.scale 48 | 49 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 50 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 51 | pos_x = x_embed[:, :, None] / dim_t 52 | pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2) 53 | 54 | dur_embed = self.duration_embedding(duration).reshape(-1,1,self.max_duration).expand_as(pos_x) 55 | pos = torch.cat((pos_x, dur_embed), dim=2).permute(0, 2, 1) 56 | return pos 57 | 58 | def duration_embedding(self, durations): 59 | out = torch.zeros(len(durations), self.max_duration, device=durations.device) 60 | durations = durations.int() 61 | for ii in range(len(durations)): 62 | out[ii, :durations[ii]] = 1 63 | out = self.duration_embed_layer(out) 64 | return out 65 | 66 | 67 | 68 | def build_position_encoding(position_embedding, N_steps): 69 | if position_embedding in ('v2', 'sine'): 70 | # TODO find a better way of exposing other arguments 71 | position_embedding = PositionEmbeddingSine(N_steps, normalize=True) 72 | else: 73 | raise ValueError(f"not supported {position_embedding}") 74 | 75 | return position_embedding 76 | -------------------------------------------------------------------------------- /requirement.txt: -------------------------------------------------------------------------------- 1 | h5py 2 | matplotlib 3 | numpy 4 | pandas 5 | Pillow 6 | PyYAML 7 | six 8 | tqdm 9 | tensorboardX 10 | colorlog 11 | scipy 12 | jupyter notebook 13 | pandas 14 | h5py 15 | av 16 | joblib 17 | tqdm 18 | google_trans_new 19 | -------------------------------------------------------------------------------- /test_and_visualize.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -i 2 | curdir=`pwd` 3 | export PYTHONPATH=$PYTHONPATH:$curdir/video_backbone/TSP 4 | export PYTHONPATH=$PYTHONPATH:$curdir/video_backbone/TSP/data 5 | export PYTHONPATH=$PYTHONPATH:$curdir/video_backbone/TSP/extract_features 6 | export PYTHONPATH=$PYTHONPATH:$curdir/visualization 7 | 8 | DATA_PATH=$1 # path of the raw video folder 9 | OUTPUT_FOLDER=$2 # path of the output folder to save generated captions 10 | PDVC_MODEL_PATH=$3 11 | OUTPUT_LANGUAGE=$4 12 | 13 | if [ -z "$DATA_PATH" ]; then 14 | echo "DATA_PATH variable is not set." 15 | echo "Please set DATA_PATH to the folder containing the videos you want to process." 16 | exit 1 17 | fi 18 | 19 | if [ -z "$OUTPUT_FOLDER" ]; then 20 | echo "OUTPUT_FOLDER variable is not set." 21 | echo "Please set OUTPUT_FOLDER to the folder you want to save generate captions." 
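Editor's note: the temporal half of `PositionEmbeddingSine` above is the standard sine/cosine encoding over frame positions, and the duration embedding is then concatenated along the channel dimension. Below is a standalone sketch (illustrative sizes, no `NestedTensor` plumbing) of the sine part for one fully unmasked sequence.
```
import math
import torch

T, num_pos_feats, temperature = 5, 64, 10000
x_embed = torch.arange(1, T + 1, dtype=torch.float32)              # cumsum of an all-ones (unmasked) row
x_embed = (x_embed - 0.5) / (x_embed[-1] + 1e-6) * (2 * math.pi)   # same normalization as normalize=True

dim_t = torch.arange(num_pos_feats, dtype=torch.float32)
dim_t = temperature ** (2 * (dim_t // 2) / num_pos_feats)

pos_x = x_embed[:, None] / dim_t
pos_x = torch.stack((pos_x[:, 0::2].sin(), pos_x[:, 1::2].cos()), dim=2).flatten(1)
print(pos_x.shape)   # (T, num_pos_feats); the module then concatenates the duration embedding
```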
22 | exit 1 23 | 24 | fi 25 | 26 | if [ -z "$PDVC_MODEL_PATH" ]; then 27 | echo "PDVC_MODEL_PATH variable is not set." 28 | echo "Please set the pretrained PDVC model path (only support PDVC with TSP features)." 29 | exit 1 30 | fi 31 | 32 | #################################################################################### 33 | ########################## PARAMETERS THAT NEED TO BE SET ########################## 34 | #################################################################################### 35 | 36 | METADATA_CSV_FILENAME=$DATA_PATH/"metadata.csv" # path/to/metadata/csv/file. Use the ones provided in the data folder. 37 | RELEASED_CHECKPOINT=r2plus1d_34-tsp_on_activitynet 38 | 39 | 40 | # Choose the stride between clips, e.g. 16 for non-overlapping clips and 1 for dense overlapping clips 41 | STRIDE=16 42 | 43 | # Optional: Split the videos into multiple shards for parallel feature extraction 44 | # Increase the number of shards and run this script independently on separate GPU devices, 45 | # each with a different SHARD_ID from 0 to NUM_SHARDS-1. 46 | # Each shard will process (num_videos / NUM_SHARDS) videos. 47 | SHARD_ID=0 48 | NUM_SHARDS=1 49 | DEVICE=cuda 50 | WORKER_NUM=8 51 | 52 | echo "START GENERATE METADATA" 53 | python video_backbone/TSP/data/generate_metadata_csv.py --video-folder $DATA_PATH --output-csv $METADATA_CSV_FILENAME 54 | 55 | FEATURE_DIR=$OUTPUT_FOLDER/${RELEASED_CHECKPOINT}_stride_${STRIDE}/ 56 | mkdir -p $FEATURE_DIR 57 | 58 | echo "START EXTRACT VIDEO FEATURES" 59 | python video_backbone/TSP/extract_features/extract_features.py \ 60 | --data-path $DATA_PATH \ 61 | --metadata-csv-filename $METADATA_CSV_FILENAME \ 62 | --released-checkpoint $RELEASED_CHECKPOINT \ 63 | --stride $STRIDE \ 64 | --shard-id $SHARD_ID \ 65 | --num-shards $NUM_SHARDS \ 66 | --device $DEVICE \ 67 | --output-dir $FEATURE_DIR \ 68 | --workers $WORKER_NUM 69 | 70 | echo "START Dense-Captioning" 71 | python eval.py --eval_mode test --eval_save_dir $OUTPUT_FOLDER --eval_folder generated_captions --eval_model_path $PDVC_MODEL_PATH --test_video_feature_folder $FEATURE_DIR --test_video_meta_data_csv_path $METADATA_CSV_FILENAME 72 | 73 | echo "START VISUALIZATION" 74 | python visualization/visualization.py --input_mp4_folder $DATA_PATH --output_mp4_folder $OUTPUT_FOLDER/vis_videos --dvc_file $OUTPUT_FOLDER/generated_captions/dvc_results.json --output_language $OUTPUT_LANGUAGE 75 | -------------------------------------------------------------------------------- /video_backbone/TSP/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
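Editor's note on usage: test_and_visualize.sh reads its four inputs positionally, so a typical call looks like `bash test_and_visualize.sh path/to/raw_videos path/to/output path/to/pdvc_tsp_checkpoint.pth en`. All four values here are illustrative placeholders; the checkpoint must be a PDVC model trained on TSP features, as the prompt inside the script states, and the last argument is the output-language code (for example `en`) that is passed straight through to visualization.py.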
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /video_backbone/TSP/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Humam Alwassel 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /video_backbone/TSP/README.md: -------------------------------------------------------------------------------- 1 | [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/tsp-temporally-sensitive-pretraining-of-video/temporal-action-localization-on-activitynet)](https://paperswithcode.com/sota/temporal-action-localization-on-activitynet?p=tsp-temporally-sensitive-pretraining-of-video) 2 | [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/tsp-temporally-sensitive-pretraining-of-video/temporal-action-proposal-generation-on)](https://paperswithcode.com/sota/temporal-action-proposal-generation-on?p=tsp-temporally-sensitive-pretraining-of-video) 3 | [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/tsp-temporally-sensitive-pretraining-of-video/dense-video-captioning-on-activitynet)](https://paperswithcode.com/sota/dense-video-captioning-on-activitynet?p=tsp-temporally-sensitive-pretraining-of-video) 4 | [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/tsp-temporally-sensitive-pretraining-of-video/temporal-action-localization-on-thumos14)](https://paperswithcode.com/sota/temporal-action-localization-on-thumos14?p=tsp-temporally-sensitive-pretraining-of-video) 5 | 6 | # TSP: Temporally-Sensitive Pretraining of Video Encoders for Localization Tasks 7 | 8 | 9 | 10 | [[Paper]](https://arxiv.org/pdf/2011.11479.pdf) 11 | [[Project Website]](http://humamalwassel.com/publication/tsp/) 12 | 13 | This repository holds the source code, pretrained models, and pre-extracted features for the TSP method. 14 | 15 | Please cite this work if you find TSP useful for your research. 16 | ``` 17 | @inproceedings{alwassel_2021_tsp, 18 | title={TSP: Temporally-Sensitive Pretraining of Video Encoders for Localization Tasks}, 19 | author={Alwassel, Humam and Giancola, Silvio and Ghanem, Bernard}, 20 | booktitle={Proceedings of the IEEE/CVF International 21 | Conference on Computer Vision (ICCV) Workshops}, 22 | year={2021} 23 | } 24 | ``` 25 | 26 | ## Pre-extracted TSP Features 27 | 28 | We provide pre-extracted features for ActivityNet v1.3 and THUMOS14 videos. The feature files are saved in H5 format, where we map each `video-name` to a features tensor of size `N x 512`, where `N` is the number of features and `512` is the feature size. Use `h5py` python package to read the feature files. Not familiar with H5 files or `h5py`? here is a quick start [guide](https://docs.h5py.org/en/stable/). 29 | 30 | ### For ActivityNet v1.3 dataset 31 | **Download**: 32 | [[train subset]](https://github.com/HumamAlwassel/TSP/releases/download/activitynet_features/r2plus1d_34-tsp_on_activitynet-train_features.h5) 33 | [[valid subset]](https://github.com/HumamAlwassel/TSP/releases/download/activitynet_features/r2plus1d_34-tsp_on_activitynet-valid_features.h5) 34 | [[test subset]](https://github.com/HumamAlwassel/TSP/releases/download/activitynet_features/r2plus1d_34-tsp_on_activitynet-test_features.h5) 35 | 36 | **Details**: The features are extracted from the R(2+1)D-34 encoder pretrained with TSP on ActivityNet ([released model](https://github.com/HumamAlwassel/TSP/releases/download/model_weights/r2plus1d_34-tsp_on_activitynet-max_gvf-backbone_lr_0.0001-fc_lr_0.002-epoch_5-0d2cf854.pth)) using clips of `16 frames` at a frame rate of `15 fps` and a stride of `16 frames` (*i.e.,* **non-overlapping** clips). 
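Editor's note: as a companion to the quick-start pointer above, here is a minimal `h5py` reading sketch (assuming the ActivityNet valid-subset file linked above has been downloaded into the working directory). Keys are video names and each value is an `N x 512` feature matrix.
```
import h5py

with h5py.File('r2plus1d_34-tsp_on_activitynet-valid_features.h5', 'r') as f:
    video_names = list(f.keys())
    feats = f[video_names[0]][:]          # one video's features as a numpy array of shape (N, 512)
    print(video_names[0], feats.shape)
```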
This gives one feature vector per `16/15 ~= 1.067` seconds. 37 | 38 | 39 | ### For THUMOS14 dataset 40 | 41 | **Download**: 42 | [[valid subset]](https://github.com/HumamAlwassel/TSP/releases/download/thumos14_features/r2plus1d_34-tsp_on_thumos14-valid_features.h5) 43 | [[test subset]](https://github.com/HumamAlwassel/TSP/releases/download/thumos14_features/r2plus1d_34-tsp_on_thumos14-test_features.h5) 44 | 45 | **Details**: The features are extracted from the R(2+1)D-34 encoder pretrained with TSP on THUMOS14 ([released model](https://github.com/HumamAlwassel/TSP/releases/download/model_weights/r2plus1d_34-tsp_on_thumos14-max_gvf-backbone_lr_0.0001-fc_lr_0.004-epoch_4-e6a30b2f.pth)) using clips of `16 frames` at a frame rate of `15 fps` and a stride of `1 frame` (*i.e.,* dense **overlapping** clips). This gives one feature vector per `1/15 ~= 0.067` seconds. 46 | 47 | ## Setup 48 | Clone this repository and create the conda environment. 49 | ``` 50 | git clone https://github.com/HumamAlwassel/TSP.git 51 | cd TSP 52 | conda env create -f environment.yml 53 | conda activate tsp 54 | ``` 55 | 56 | ## Data Preprocessing 57 | Follow the instructions [here](data) to download and preprocess the input data. 58 | 59 | ## Training 60 | We provide training scripts for the TSP models and the TAC baselines [here](train). 61 | 62 | ## Feature Extraction 63 | You can extract features from released pretrained models or from local checkpoints using the scripts [here](extract_features). 64 | 65 | **Acknowledgment**: Our source code borrows implementation ideas from [pytorch/vision](https://github.com/pytorch/vision) and [facebookresearch/VMZ](https://github.com/facebookresearch/VMZ) repositories. 66 | -------------------------------------------------------------------------------- /video_backbone/TSP/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ttengwang/PDVC/0b025c84f42fe27da51c312e8871c4b19628a04c/video_backbone/TSP/__init__.py -------------------------------------------------------------------------------- /video_backbone/TSP/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ttengwang/PDVC/0b025c84f42fe27da51c312e8871c4b19628a04c/video_backbone/TSP/common/__init__.py -------------------------------------------------------------------------------- /video_backbone/TSP/common/scheduler.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from bisect import bisect_right 3 | 4 | 5 | class WarmupMultiStepLR(torch.optim.lr_scheduler._LRScheduler): 6 | def __init__( 7 | self, 8 | optimizer, 9 | milestones, 10 | gamma=0.1, 11 | warmup_factor=1.0 / 3, 12 | warmup_iters=5, 13 | warmup_method='linear', 14 | last_epoch=-1, 15 | ): 16 | if not milestones == sorted(milestones): 17 | raise ValueError( 18 | f'Milestones should be a list of increasing integers. 
' 19 | f'Got {milestones}', 20 | ) 21 | 22 | if warmup_method not in ('constant', 'linear'): 23 | raise ValueError( 24 | f'Only "constant" or "linear" warmup_method accepted' 25 | f'got {warmup_method}' 26 | ) 27 | self.milestones = milestones 28 | self.gamma = gamma 29 | self.warmup_factor = warmup_factor 30 | self.warmup_iters = warmup_iters 31 | self.warmup_method = warmup_method 32 | super(WarmupMultiStepLR, self).__init__(optimizer, last_epoch) 33 | 34 | def get_lr(self): 35 | warmup_factor = 1 36 | if self.last_epoch < self.warmup_iters: 37 | if self.warmup_method == 'constant': 38 | warmup_factor = self.warmup_factor 39 | elif self.warmup_method == 'linear': 40 | alpha = float(self.last_epoch) / self.warmup_iters 41 | warmup_factor = self.warmup_factor * (1 - alpha) + alpha 42 | return [ 43 | base_lr * 44 | warmup_factor * 45 | self.gamma ** bisect_right(self.milestones, self.last_epoch) 46 | for base_lr in self.base_lrs 47 | ] 48 | -------------------------------------------------------------------------------- /video_backbone/TSP/common/transforms.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import random 3 | 4 | 5 | def crop(vid, i, j, h, w): 6 | return vid[..., i:(i + h), j:(j + w)] 7 | 8 | 9 | def center_crop(vid, output_size): 10 | h, w = vid.shape[-2:] 11 | th, tw = output_size 12 | 13 | i = int(round((h - th) / 2.)) 14 | j = int(round((w - tw) / 2.)) 15 | return crop(vid, i, j, th, tw) 16 | 17 | 18 | def hflip(vid): 19 | return vid.flip(dims=(-1,)) 20 | 21 | 22 | # NOTE: for those functions, which generally expect mini-batches, we keep them 23 | # as non-minibatch so that they are applied as if they were 4d (thus image). 24 | # this way, we only apply the transformation in the spatial domain 25 | def resize(vid, size, interpolation='bilinear'): 26 | # NOTE: using bilinear interpolation because we don't work on minibatches 27 | # at this level 28 | scale = None 29 | if isinstance(size, int): 30 | scale = float(size) / min(vid.shape[-2:]) 31 | size = None 32 | return torch.nn.functional.interpolate( 33 | vid, size=size, scale_factor=scale, mode=interpolation, align_corners=False) 34 | 35 | 36 | def pad(vid, padding, fill=0, padding_mode="constant"): 37 | # NOTE: don't want to pad on temporal dimension, so let as non-batch 38 | # (4d) before padding. This works as expected 39 | return torch.nn.functional.pad(vid, padding, value=fill, mode=padding_mode) 40 | 41 | 42 | def to_normalized_float_tensor(vid): 43 | return vid.permute(3, 0, 1, 2).to(torch.float32) / 255 44 | 45 | 46 | def normalize(vid, mean, std): 47 | shape = (-1,) + (1,) * (vid.dim() - 1) 48 | mean = torch.as_tensor(mean).reshape(shape) 49 | std = torch.as_tensor(std).reshape(shape) 50 | return (vid - mean) / std 51 | 52 | 53 | # Class interface 54 | 55 | class RandomCrop(object): 56 | def __init__(self, size): 57 | self.size = size 58 | 59 | @staticmethod 60 | def get_params(vid, output_size): 61 | ''' 62 | Get parameters for ``crop`` for a random crop. 
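Editor's note: the `WarmupMultiStepLR.get_lr` logic shown earlier in this listing is easiest to read with concrete numbers. The sketch below (editorial; every hyperparameter value is an assumption) reproduces the per-epoch scale factor outside of any optimizer: the learning rate ramps up linearly for `warmup_iters` epochs and is multiplied by `gamma` at each milestone.
```
from bisect import bisect_right

base_lr, gamma = 0.01, 0.1
milestones, warmup_factor, warmup_iters = [8, 11], 1.0 / 3, 5

for epoch in range(13):
    factor = 1.0
    if epoch < warmup_iters:                       # linear warmup branch of get_lr
        alpha = epoch / warmup_iters
        factor = warmup_factor * (1 - alpha) + alpha
    lr = base_lr * factor * gamma ** bisect_right(milestones, epoch)
    print(epoch, round(lr, 6))
```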
63 | ''' 64 | h, w = vid.shape[-2:] 65 | th, tw = output_size 66 | if w == tw and h == th: 67 | return 0, 0, h, w 68 | i = random.randint(0, h - th) 69 | j = random.randint(0, w - tw) 70 | return i, j, th, tw 71 | 72 | def __call__(self, vid): 73 | i, j, h, w = self.get_params(vid, self.size) 74 | return crop(vid, i, j, h, w) 75 | 76 | 77 | class CenterCrop(object): 78 | def __init__(self, size): 79 | self.size = size 80 | 81 | def __call__(self, vid): 82 | return center_crop(vid, self.size) 83 | 84 | 85 | class Resize(object): 86 | def __init__(self, size): 87 | self.size = size 88 | 89 | def __call__(self, vid): 90 | return resize(vid, self.size) 91 | 92 | 93 | class ToFloatTensorInZeroOne(object): 94 | def __call__(self, vid): 95 | return to_normalized_float_tensor(vid) 96 | 97 | 98 | class Normalize(object): 99 | def __init__(self, mean, std): 100 | self.mean = mean 101 | self.std = std 102 | 103 | def __call__(self, vid): 104 | return normalize(vid, self.mean, self.std) 105 | 106 | 107 | class RandomHorizontalFlip(object): 108 | def __init__(self, p=0.5): 109 | self.p = p 110 | 111 | def __call__(self, vid): 112 | if random.random() < self.p: 113 | return hflip(vid) 114 | return vid 115 | 116 | 117 | class Pad(object): 118 | def __init__(self, padding, fill=0): 119 | self.padding = padding 120 | self.fill = fill 121 | 122 | def __call__(self, vid): 123 | return pad(vid, self.padding, self.fill) 124 | -------------------------------------------------------------------------------- /video_backbone/TSP/data/README.md: -------------------------------------------------------------------------------- 1 | # Data Preprocessing 2 | 3 | **Step 1**: Download the ActivityNet v1.3 and THUMOS14 videos. For ActivityNet, you can submit a data request [here](https://docs.google.com/forms/d/e/1FAIpQLSeKaFq9ZfcmZ7W0B0PbEhfbTHY41GeEgwsa7WobJgGUhn4DTQ/viewform). For THUMOS14, you can download it directly from the [official website](http://crcv.ucf.edu/THUMOS14/download.html). 4 | 5 | **Step 2**: Standardize all videos to MP4 format with a constant frame rate of 30fps using the script `standardize_videos_to_constant_30fps_mp4.sh`: 6 | ``` 7 | bash standardize_video_to_constant_30fps_mp4.sh 8 | ``` 9 | 10 | **Step 3**: Split the ActivityNet videos into three subfolders: `train` (10024 videos), `valid` (4926 videos), and `test` (5044 videos) using the official splits. Similarly, split THUMOS14 into `valid` (200 videos) and `test` (213 videos) subfolders. 11 | 12 | **Step 4**: Generate metadata CSV files for each ActivityNet and THUMOS14 subset using the script `generate_metadata_csv.py`. _This step is already pre-computed for the standardized ActivityNet and THUMOS14 videos and saved in the `activitynet` and `thumos14` folders_. 
13 | ``` 14 | python generate_metadata_csv.py --video-folder --output-csv 15 | ``` 16 | -------------------------------------------------------------------------------- /video_backbone/TSP/data/activitynet/activitynet_v1-3_action_label_mapping.json: -------------------------------------------------------------------------------- 1 | ["Applying sunscreen", 2 | "Archery", 3 | "Arm wrestling", 4 | "Assembling bicycle", 5 | "BMX", 6 | "Baking cookies", 7 | "Ballet", 8 | "Bathing dog", 9 | "Baton twirling", 10 | "Beach soccer", 11 | "Beer pong", 12 | "Belly dance", 13 | "Blow-drying hair", 14 | "Blowing leaves", 15 | "Braiding hair", 16 | "Breakdancing", 17 | "Brushing hair", 18 | "Brushing teeth", 19 | "Building sandcastles", 20 | "Bullfighting", 21 | "Bungee jumping", 22 | "Calf roping", 23 | "Camel ride", 24 | "Canoeing", 25 | "Capoeira", 26 | "Carving jack-o-lanterns", 27 | "Changing car wheel", 28 | "Cheerleading", 29 | "Chopping wood", 30 | "Clean and jerk", 31 | "Cleaning shoes", 32 | "Cleaning sink", 33 | "Cleaning windows", 34 | "Clipping cat claws", 35 | "Cricket", 36 | "Croquet", 37 | "Cumbia", 38 | "Curling", 39 | "Cutting the grass", 40 | "Decorating the Christmas tree", 41 | "Disc dog", 42 | "Discus throw", 43 | "Dodgeball", 44 | "Doing a powerbomb", 45 | "Doing crunches", 46 | "Doing fencing", 47 | "Doing karate", 48 | "Doing kickboxing", 49 | "Doing motocross", 50 | "Doing nails", 51 | "Doing step aerobics", 52 | "Drinking beer", 53 | "Drinking coffee", 54 | "Drum corps", 55 | "Elliptical trainer", 56 | "Fixing bicycle", 57 | "Fixing the roof", 58 | "Fun sliding down", 59 | "Futsal", 60 | "Gargling mouthwash", 61 | "Getting a haircut", 62 | "Getting a piercing", 63 | "Getting a tattoo", 64 | "Grooming dog", 65 | "Grooming horse", 66 | "Hammer throw", 67 | "Hand car wash", 68 | "Hand washing clothes", 69 | "Hanging wallpaper", 70 | "Having an ice cream", 71 | "High jump", 72 | "Hitting a pinata", 73 | "Hopscotch", 74 | "Horseback riding", 75 | "Hula hoop", 76 | "Hurling", 77 | "Ice fishing", 78 | "Installing carpet", 79 | "Ironing clothes", 80 | "Javelin throw", 81 | "Kayaking", 82 | "Kite flying", 83 | "Kneeling", 84 | "Knitting", 85 | "Laying tile", 86 | "Layup drill in basketball", 87 | "Long jump", 88 | "Longboarding", 89 | "Making a cake", 90 | "Making a lemonade", 91 | "Making a sandwich", 92 | "Making an omelette", 93 | "Mixing drinks", 94 | "Mooping floor", 95 | "Mowing the lawn", 96 | "Paintball", 97 | "Painting", 98 | "Painting fence", 99 | "Painting furniture", 100 | "Peeling potatoes", 101 | "Ping-pong", 102 | "Plastering", 103 | "Plataform diving", 104 | "Playing accordion", 105 | "Playing badminton", 106 | "Playing bagpipes", 107 | "Playing beach volleyball", 108 | "Playing blackjack", 109 | "Playing congas", 110 | "Playing drums", 111 | "Playing field hockey", 112 | "Playing flauta", 113 | "Playing guitarra", 114 | "Playing harmonica", 115 | "Playing ice hockey", 116 | "Playing kickball", 117 | "Playing lacrosse", 118 | "Playing piano", 119 | "Playing polo", 120 | "Playing pool", 121 | "Playing racquetball", 122 | "Playing rubik cube", 123 | "Playing saxophone", 124 | "Playing squash", 125 | "Playing ten pins", 126 | "Playing violin", 127 | "Playing water polo", 128 | "Pole vault", 129 | "Polishing forniture", 130 | "Polishing shoes", 131 | "Powerbocking", 132 | "Preparing pasta", 133 | "Preparing salad", 134 | "Putting in contact lenses", 135 | "Putting on makeup", 136 | "Putting on shoes", 137 | "Rafting", 138 | "Raking leaves", 139 | "Removing curlers", 140 | 
"Removing ice from car", 141 | "Riding bumper cars", 142 | "River tubing", 143 | "Rock climbing", 144 | "Rock-paper-scissors", 145 | "Rollerblading", 146 | "Roof shingle removal", 147 | "Rope skipping", 148 | "Running a marathon", 149 | "Sailing", 150 | "Scuba diving", 151 | "Sharpening knives", 152 | "Shaving", 153 | "Shaving legs", 154 | "Shot put", 155 | "Shoveling snow", 156 | "Shuffleboard", 157 | "Skateboarding", 158 | "Skiing", 159 | "Slacklining", 160 | "Smoking a cigarette", 161 | "Smoking hookah", 162 | "Snatch", 163 | "Snow tubing", 164 | "Snowboarding", 165 | "Spinning", 166 | "Spread mulch", 167 | "Springboard diving", 168 | "Starting a campfire", 169 | "Sumo", 170 | "Surfing", 171 | "Swimming", 172 | "Swinging at the playground", 173 | "Table soccer", 174 | "Tai chi", 175 | "Tango", 176 | "Tennis serve with ball bouncing", 177 | "Throwing darts", 178 | "Trimming branches or hedges", 179 | "Triple jump", 180 | "Tug of war", 181 | "Tumbling", 182 | "Using parallel bars", 183 | "Using the balance beam", 184 | "Using the monkey bar", 185 | "Using the pommel horse", 186 | "Using the rowing machine", 187 | "Using uneven bars", 188 | "Vacuuming floor", 189 | "Volleyball", 190 | "Wakeboarding", 191 | "Walking the dog", 192 | "Washing dishes", 193 | "Washing face", 194 | "Washing hands", 195 | "Waterskiing", 196 | "Waxing skis", 197 | "Welding", 198 | "Windsurfing", 199 | "Wrapping presents", 200 | "Zumba"] -------------------------------------------------------------------------------- /video_backbone/TSP/data/activitynet/activitynet_v1-3_temporal_region_label_mapping.json: -------------------------------------------------------------------------------- 1 | ["Action", 2 | "No action"] -------------------------------------------------------------------------------- /video_backbone/TSP/data/generate_metadata_csv.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, print_function 2 | 3 | import argparse 4 | import os 5 | import glob 6 | import pandas as pd 7 | 8 | from torchvision.io import read_video_timestamps 9 | from joblib import Parallel, delayed 10 | 11 | 12 | def get_video_stats(filename): 13 | pts, video_fps = read_video_timestamps(filename=filename, pts_unit='sec') 14 | if video_fps: 15 | stats = {'filename': os.path.basename(filename), 16 | 'video-duration': len(pts)/video_fps, 17 | 'fps': video_fps, 18 | 'video-frames': len(pts)} 19 | else: 20 | stats = {'filename': os.path.basename(filename), 21 | 'video-duration': None, 22 | 'fps': None, 23 | 'video-frames': None} 24 | print(f'WARNING: {filename} has an issue. 
video_fps = {video_fps}, len(pts) = {len(pts)}.') 25 | return stats 26 | 27 | 28 | def main(args): 29 | print(args) 30 | 31 | filenames = glob.glob(os.path.join(args.video_folder, f'*.{args.ext}')) 32 | print(f'Number of video files: {len(filenames)}') 33 | 34 | all_stats = Parallel(n_jobs=args.workers)( 35 | delayed(get_video_stats)( 36 | filename=filename, 37 | ) for filename in filenames) 38 | 39 | df = pd.DataFrame(all_stats) 40 | df.to_csv(args.output_csv, index=False) 41 | print(f'Saved metadata to {args.output_csv}') 42 | 43 | if __name__ == '__main__': 44 | parser = argparse.ArgumentParser(description='Generates a metadata CSV file with columns ' 45 | '[filename, video-duration, fps, video-frames] ' 46 | 'for a given input video folder.') 47 | 48 | parser.add_argument('--video-folder', required=True, type=str, 49 | help='Path to folder containing the raw video files') 50 | parser.add_argument('--ext', default='mp4', type=str, 51 | help='Video files extension (default: mp4)') 52 | parser.add_argument('--output-csv', required=True, type=str, 53 | help='Where to save the metadata CSV file') 54 | parser.add_argument('--workers', default=20, type=int, 55 | help='Number of parallel processes to use to generate the output (default: 20)') 56 | 57 | args = parser.parse_args() 58 | 59 | main(args) 60 | -------------------------------------------------------------------------------- /video_backbone/TSP/data/standardize_videos_to_constant_30fps_mp4.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Script to convert all videos in to mp4 videos with constant frame rate of 30fps. 4 | # The output videos are saved in . 5 | # 6 | # usage: bash standardize_video_to_constant_30fps_mp4.sh 7 | 8 | if [ "$#" -ne 2 ]; then 9 | echo "Illegal number of parameters" 10 | echo "usage: bash standardize_video_to_constant_30fps_mp4.sh " 11 | exit 1 12 | fi 13 | 14 | 15 | INPUT_FOLDER=$1 16 | OUTPUT_FOLDER=$2 17 | 18 | echo "INPUT_FOLDER=$INPUT_FOLDER" 19 | echo "OUTPUT_FOLDER=$OUTPUT_FOLDER" 20 | 21 | mkdir -p $OUTPUT_FOLDER 22 | 23 | for input_video_path in $INPUT_FOLDER/*; 24 | do 25 | video_filename=$(basename $input_video_path) 26 | video_name="${video_filename%.*}" 27 | output_video_path="$OUTPUT_FOLDER/$video_name.mp4" 28 | 29 | echo "ffmpeg -y -i $input_video_path -filter:v fps=fps=30 $output_video_path" 30 | ffmpeg -y -i $input_video_path -filter:v fps=fps=30 $output_video_path 31 | done 32 | -------------------------------------------------------------------------------- /video_backbone/TSP/environment.yml: -------------------------------------------------------------------------------- 1 | name: tsp 2 | channels: 3 | - pytorch 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - torchvision=0.5.0 8 | - pytorch=1.4.0 9 | - cudatoolkit=10.1 10 | - pandas 11 | - h5py 12 | - av 13 | - joblib 14 | - tqdm 15 | -------------------------------------------------------------------------------- /video_backbone/TSP/extract_features/README.md: -------------------------------------------------------------------------------- 1 | # TSP Feature Extraction 2 | 3 | Follow the data preprocessing instructions described [here](../data) before extracting features. We provide scripts for feature extraction using the released pretrained models or using a local checkpoint. 4 | 5 | ### From Released Pretrained Models 6 | Use the `extract_features_from_a_released_checkpoint.sh` script to extract features from the official released models. 
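Editor's note: `generate_metadata_csv.py` above writes one row per video with the columns `[filename, video-duration, fps, video-frames]`, and both the feature-extraction scripts and `EvalVideoDataset` later consume that file. A small editorial sketch of inspecting it with pandas (the path is a placeholder):
```
import pandas as pd

df = pd.read_csv('metadata.csv')
print(df[['filename', 'fps', 'video-frames']].head())
print('total duration (s):', df['video-duration'].sum())
print('videos with unreadable streams:', df['fps'].isna().sum())
```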
You need to manually set the following variables: 7 | - `DATA_PATH`: Path to the video folder. 8 | - `METADATA_CSV_FILENAME`: Path to a metadata CSV file. For ActivityNet and THUMOS14, use the CSV files precomputed in the [data](../data) folder. If you want to extract features for other video datasets, first standardized the videos and then generate the metadata files as per the instructions [here](../data), specifically step 2 and 4. 9 | - `RELEASED_CHECKPOINT`: Name of the one of the `13` released pretrained model. Refer to the tables below for more details. 10 | - `STRIDE`: Choose the stride between clips, *e.g.,* `16` for non-overlapping clips and `1` for dense overlapping clips. 11 | - (Optional) `SHARD_ID`, `NUM_SHARDS`, `DEVICE`: Split the videos in the CSV into multiple shards for parallel feature extraction. Increase the number of shards and run the script independently on separate GPU devices, each with a different `SHARD_ID` from `0` to `NUM_SHARDS-1`. Each shard will process `num_videos / NUM_SHARDS` videos. 12 | 13 | ### From a Local Checkpoint 14 | Use the `extract_features_from_a_local_checkpoint.sh` script to extract features from a local checkpoint. You need to manually set the same variables above plus the following 2 variables instead of `RELEASED_CHECKPOINT`: 15 | - `LOCAL_CHECKPOINT`: Path to the local checkpoint `.pth` file. 16 | - `BACKBONE`: The backbone used in the local checkpoint: `r2plus1d_34`, `r2plus1d_18`, or `r3d_18`. 17 | 18 | ## Post Processing Output 19 | The feature extraction script will output a `.pkl` file for each video. Merge all the `.pkl` files into one `.h5` file as follows: 20 | 21 | ``` 22 | python merge_pkl_files_into_one_h5_feature_file.py --features-folder --output-h5 23 | ``` 24 | 25 | ------ 26 | 27 | **Released Pretrained Models** 28 | 29 | **Main TSP models** 30 | | Name | Description | Weights | 31 | | ---------------------------------------- | ----------------------------------------------------------- | ------- | 32 | | `r2plus1d_34-tsp_on_activitynet` | R(2+1)D-34 pretrained with TSP on ActivityNet | [checkpoint](https://github.com/HumamAlwassel/TSP/releases/download/model_weights/r2plus1d_34-tsp_on_activitynet-max_gvf-backbone_lr_0.0001-fc_lr_0.002-epoch_5-0d2cf854.pth) | 33 | | `r2plus1d_34-tsp_on_thumos14` | R(2+1)D-34 pretrained with TSP on THUMOS14 | [checkpoint](https://github.com/HumamAlwassel/TSP/releases/download/model_weights/r2plus1d_34-tsp_on_thumos14-max_gvf-backbone_lr_0.0001-fc_lr_0.004-epoch_4-e6a30b2f.pth) | 34 | 35 | **Main TAC baseline models** 36 | | Name | Description | Weights | 37 | | ---------------------------------------- | ----------------------------------------------------------- | ------- | 38 | | `r2plus1d_34-tac_on_activitynet` | R(2+1)D-34 pretrained with TAC on ActivityNet | [checkpoint](https://github.com/HumamAlwassel/TSP/releases/download/model_weights/r2plus1d_34-tac_on_activitynet-backbone_lr_0.0001-fc_lr_0.002-epoch_5-98ccac94.pth) | 39 | | `r2plus1d_34-tac_on_thumos14` | R(2+1)D-34 pretrained with TAC on THUMOS14 | [checkpoint](https://github.com/HumamAlwassel/TSP/releases/download/model_weights/r2plus1d_34-tac_on_thumos14-backbone_lr_0.00001-fc_lr_0.002-epoch_3-54b5c8aa.pth) | 40 | | `r2plus1d_34-tac_on_kinetics` | R(2+1)D-34 pretrained with TAC on Kinetics | [checkpoint](https://github.com/HumamAlwassel/TSP/releases/download/model_weights/r2plus1d_34-tac_on_kinetics-0547130e.pth) | 41 | 42 | **Other models from the GVF and backbone architecture ablation studies** 43 | | Name | Description | 
Weights | 44 | | ---------------------------------------- | ----------------------------------------------------------- | ------- | 45 | | `r2plus1d_34-tsp_on_activitynet-avg_gvf` | R(2+1)D-34 pretrained with TSP on ActivityNet (average GVF) | [checkpoint](https://github.com/HumamAlwassel/TSP/releases/download/model_weights/r2plus1d_34-tsp_on_activitynet-avg_gvf-backbone_lr_0.0001-fc_lr_0.004-epoch_5-8b74eaa2.pth) | 46 | | `r2plus1d_34-tsp_on_activitynet-no_gvf` | R(2+1)D-34 pretrained with TSP on ActivityNet (without GVF) | [checkpoint](https://github.com/HumamAlwassel/TSP/releases/download/model_weights/r2plus1d_34-tsp_on_activitynet-no_gvf-backbone_lr_0.0001-fc_lr_0.004-epoch_5-fb38fdd2.pth) | 47 | | `r2plus1d_18-tsp_on_activitynet` | R(2+1)D-18 pretrained with TSP on ActivityNet | [checkpoint](https://github.com/HumamAlwassel/TSP/releases/download/model_weights/r2plus1d_18-tsp_on_activitynet-max_gvf-backbone_lr_0.0001-fc_lr_0.002-epoch_6-22835b73.pth) | 48 | | `r2plus1d_18-tac_on_activitynet` | R(2+1)D-18 pretrained with TAC on ActivityNet | [checkpoint](https://github.com/HumamAlwassel/TSP/releases/download/model_weights/r2plus1d_18-tac_on_activitynet-backbone_lr_0.0001-fc_lr_0.004-epoch_5-9f56941a.pth) | 49 | | `r2plus1d_18-tac_on_kinetics` | R(2+1)D-18 pretrained with TAC on Kinetics | [checkpoint](https://github.com/HumamAlwassel/TSP/releases/download/model_weights/r2plus1d_18-tac_on_kinetics-76ce975c.pth) | 50 | | `r3d_18-tsp_on_activitynet` | R3D-18 pretrained with TSP on ActivityNet | [checkpoint](https://github.com/HumamAlwassel/TSP/releases/download/model_weights/r3d_18-tsp_on_activitynet-max_gvf-backbone_lr_0.0001-fc_lr_0.002-epoch_6-85584422.pth) | 51 | | `r3d_18-tac_on_activitynet` | R3D-18 pretrained with TAC on ActivityNet | [checkpoint](https://github.com/HumamAlwassel/TSP/releases/download/model_weights/r3d_18-tac_on_activitynet-backbone_lr_0.001-fc_lr_0.01-epoch_5-31fd6e95.pth) | 52 | | `r3d_18-tac_on_kinetics` | R3D-18 pretrained with TAC on Kinetics | [checkpoint](https://github.com/HumamAlwassel/TSP/releases/download/model_weights/r3d_18-tac_on_kinetics-dcd952c6.pth) | 53 | 54 | -------------------------------------------------------------------------------- /video_backbone/TSP/extract_features/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ttengwang/PDVC/0b025c84f42fe27da51c312e8871c4b19628a04c/video_backbone/TSP/extract_features/__init__.py -------------------------------------------------------------------------------- /video_backbone/TSP/extract_features/eval_video_dataset.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, print_function 2 | 3 | import os 4 | import pandas as pd 5 | import numpy as np 6 | import torch 7 | import h5py 8 | import pickle as pkl 9 | 10 | from torch.utils.data import Dataset 11 | from torchvision.io import read_video 12 | 13 | 14 | class EvalVideoDataset(Dataset): 15 | ''' 16 | EvalVideoDataset: 17 | This dataset takes in a list of videos and return all clips with the given length and stride 18 | Each item in the dataset is a dictionary with the keys: 19 | - "clip": a Tensor (dtype=torch.float) of the clip frames after applying transforms 20 | - "filename": the video filename 21 | - "is-last-clip": a flag to mark the last clip in the video 22 | ''' 23 | 24 | def __init__(self, metadata_df, root_dir, clip_length, frame_rate, stride, output_dir, transforms=None): 25 | ''' 26 | Args: 27 
| metadata_df (pandas.DataFrame): a DataFrame with the following video metadata columns: 28 | [filename, fps, video-frames]. 29 | root_dir (string): Directory with all the video files. 30 | clip_length (int): The number of frames per clip. 31 | frame_rate (int): The effective frame rate (fps) to sample clips. 32 | stride (int): The number of frames (after resampling with frame_rate) between consecutive clips. 33 | For example, `stride`=1 will generate dense clips, while `stride`=`clip_length` will generate non-overlapping clips 34 | output_dir (string): Path to the directory where video features will be saved 35 | transforms (callable): A function/transform that takes in a TxHxWxC video 36 | and returns a transformed version. 37 | ''' 38 | metadata_df = EvalVideoDataset._append_root_dir_to_filenames_and_check_files_exist(metadata_df, root_dir) 39 | self.clip_metadata_df = EvalVideoDataset._generate_clips_metadata(metadata_df, clip_length, frame_rate, stride) 40 | self.clip_length = clip_length 41 | self.frame_rate = frame_rate 42 | self.stride = stride 43 | self.output_dir = output_dir 44 | self.transforms = transforms 45 | 46 | # Holds clip features for a given video until all clips are processed and the 47 | # full video features are ready to be saved to disk 48 | self.saved_features = {} 49 | self.saved_results = {} 50 | 51 | def __len__(self): 52 | return len(self.clip_metadata_df) 53 | 54 | def __getitem__(self, idx): 55 | sample = {} 56 | row = self.clip_metadata_df.iloc[idx] 57 | filename, fps = row['filename'], row['fps'] 58 | 59 | filename, fps, clip_t_start, is_last_clip = row['filename'], row['fps'], row['clip-t-start'], row['is-last-clip'] 60 | 61 | # compute clip_t_start and clip_t_end 62 | clip_length_in_sec = self.clip_length / self.frame_rate 63 | clip_t_end = clip_t_start + clip_length_in_sec 64 | 65 | # get a tensor [clip_length, H, W, C] of the video frames between clip_t_start and clip_t_end seconds 66 | vframes, _, _ = read_video(filename=filename, start_pts=clip_t_start, end_pts=clip_t_end, pts_unit='sec') 67 | idxs = EvalVideoDataset._resample_video_idx(self.clip_length, fps, self.frame_rate) 68 | vframes = vframes[idxs][:self.clip_length] # [:self.clip_length] for removing extra frames if isinstance(idxs, slice) 69 | if vframes.shape[0] != self.clip_length: 70 | raise RuntimeError(f': got clip of length {vframes.shape[0]} != {self.clip_length}.' 
71 | f'filename={filename}, clip_t_start={clip_t_start}, clip_t_end={clip_t_end}, ' 72 | f'fps={fps}') 73 | 74 | sample['clip'] = self.transforms(vframes) 75 | sample['filename'] = filename 76 | sample['is-last-clip'] = is_last_clip 77 | 78 | return sample 79 | 80 | def save_output(self, batch_output, batch_input, label_columns): 81 | batch_output = [x.detach().cpu().numpy() for x in batch_output] 82 | 83 | for i in range(batch_output[0].shape[0]): 84 | filename, is_last_clip = batch_input['filename'][i], batch_input['is-last-clip'][i] 85 | if not (filename in self.saved_results): 86 | self.saved_results[filename] = {l: [] for l in label_columns} 87 | for j, label in enumerate(label_columns): 88 | self.saved_results[filename][label].append(batch_output[j][i,...]) 89 | 90 | if is_last_clip: 91 | # dump results in disk at self.output_dir and then remove from self.saved_results 92 | output_filename = os.path.join(self.output_dir, os.path.basename(filename).split('.')[0] + '_output.pkl') 93 | for label in label_columns: 94 | self.saved_results[filename][label] = np.stack(self.saved_results[filename][label]) 95 | # np.save(output_filename, self.saved_results[filename]) 96 | with open(output_filename, 'wb') as fobj: 97 | pkl.dump(self.saved_results[filename], fobj) 98 | del self.saved_results[filename] 99 | 100 | def save_features(self, batch_features, batch_input): 101 | batch_features = batch_features.detach().cpu().numpy() 102 | 103 | for i in range(batch_features.shape[0]): 104 | filename, is_last_clip = batch_input['filename'][i], batch_input['is-last-clip'][i] 105 | if not (filename in self.saved_features): 106 | self.saved_features[filename] = [] 107 | self.saved_features[filename].append(batch_features[i,...]) 108 | 109 | if is_last_clip: 110 | # dump features to disk at self.output_dir and remove them from self.saved_features 111 | output_filename = os.path.join(self.output_dir, os.path.basename(filename).split('.')[0] + '.npy') 112 | self.saved_features[filename] = np.stack(self.saved_features[filename]) 113 | np.save(output_filename, self.saved_features[filename]) 114 | # with open(output_filename, 'wb') as fobj: 115 | # pkl.dump(self.saved_features[filename], fobj) 116 | del self.saved_features[filename] 117 | 118 | 119 | @staticmethod 120 | def _append_root_dir_to_filenames_and_check_files_exist(df, root_dir): 121 | df['filename'] = df['filename'].map(lambda f: os.path.join(root_dir, f)) 122 | filenames = df.drop_duplicates('filename')['filename'].values 123 | for f in filenames: 124 | if not os.path.exists(f): 125 | raise ValueError(f': file={f} does not exists. 
' 126 | f'Double-check root_dir and metadata_df inputs') 127 | return df 128 | 129 | @staticmethod 130 | def _generate_clips_metadata(df, clip_length, frame_rate, stride): 131 | clip_metadata = { 132 | 'filename': [], 133 | 'fps': [], 134 | 'clip-t-start': [], 135 | 'is-last-clip': [], 136 | } 137 | for i, row in df.iterrows(): 138 | total_frames_after_resampling = int(row['video-frames'] * (float(frame_rate) / row['fps'])) 139 | idxs = EvalVideoDataset._resample_video_idx(total_frames_after_resampling, row['fps'], frame_rate) 140 | if isinstance(idxs, slice): 141 | frame_idxs = np.arange(row['video-frames'])[idxs] 142 | else: 143 | frame_idxs = idxs.numpy() 144 | clip_t_start = list(frame_idxs[np.arange(0,frame_idxs.shape[0]-clip_length+1,stride)]/row['fps']) 145 | num_clips = len(clip_t_start) 146 | 147 | clip_metadata['filename'].extend([row['filename']]*num_clips) 148 | clip_metadata['fps'].extend([row['fps']]*num_clips) 149 | clip_metadata['clip-t-start'].extend(clip_t_start) 150 | is_last_clip = [0] * num_clips 151 | is_last_clip[-1] = 1 152 | clip_metadata['is-last-clip'].extend(is_last_clip) 153 | 154 | return pd.DataFrame(clip_metadata) 155 | 156 | @staticmethod 157 | def _resample_video_idx(num_frames, original_fps, new_fps): 158 | step = float(original_fps) / new_fps 159 | if step.is_integer(): 160 | # optimization: if step is integer, don't need to perform 161 | # advanced indexing 162 | step = int(step) 163 | return slice(None, None, step) 164 | idxs = torch.arange(num_frames, dtype=torch.float32) * step 165 | idxs = idxs.floor().to(torch.int64) 166 | return idxs 167 | -------------------------------------------------------------------------------- /video_backbone/TSP/extract_features/extract_features.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, print_function 2 | 3 | import os 4 | import torch 5 | import torchvision 6 | import json 7 | import datetime 8 | import time 9 | import numpy as np 10 | import pandas as pd 11 | import pickle as pkl 12 | import sys 13 | 14 | from torchvision import transforms 15 | from torch import nn 16 | from eval_video_dataset import EvalVideoDataset 17 | sys.path.insert(0, '..') 18 | from common import utils 19 | from common import transforms as T 20 | from models.model import Model 21 | 22 | 23 | MODEL_URLS = { 24 | # main TSP models 25 | 'r2plus1d_34-tsp_on_activitynet' : 'https://github.com/HumamAlwassel/TSP/releases/download/model_weights/r2plus1d_34-tsp_on_activitynet-max_gvf-backbone_lr_0.0001-fc_lr_0.002-epoch_5-0d2cf854.pth', 26 | 'r2plus1d_34-tsp_on_thumos14' : 'https://github.com/HumamAlwassel/TSP/releases/download/model_weights/r2plus1d_34-tsp_on_thumos14-max_gvf-backbone_lr_0.0001-fc_lr_0.004-epoch_4-e6a30b2f.pth', 27 | 28 | # main TAC baseline models 29 | 'r2plus1d_34-tac_on_activitynet' : 'https://github.com/HumamAlwassel/TSP/releases/download/model_weights/r2plus1d_34-tac_on_activitynet-backbone_lr_0.0001-fc_lr_0.002-epoch_5-98ccac94.pth', 30 | 'r2plus1d_34-tac_on_thumos14' : 'https://github.com/HumamAlwassel/TSP/releases/download/model_weights/r2plus1d_34-tac_on_thumos14-backbone_lr_0.00001-fc_lr_0.002-epoch_3-54b5c8aa.pth', 31 | 'r2plus1d_34-tac_on_kinetics' : 'https://github.com/HumamAlwassel/TSP/releases/download/model_weights/r2plus1d_34-tac_on_kinetics-0547130e.pth', 32 | 33 | # other models from the GVF and backbone architecture ablation studies 34 | 'r2plus1d_34-tsp_on_activitynet-avg_gvf': 
'https://github.com/HumamAlwassel/TSP/releases/download/model_weights/r2plus1d_34-tsp_on_activitynet-avg_gvf-backbone_lr_0.0001-fc_lr_0.004-epoch_5-8b74eaa2.pth', 35 | 'r2plus1d_34-tsp_on_activitynet-no_gvf' : 'https://github.com/HumamAlwassel/TSP/releases/download/model_weights/r2plus1d_34-tsp_on_activitynet-no_gvf-backbone_lr_0.0001-fc_lr_0.004-epoch_5-fb38fdd2.pth', 36 | 37 | 'r2plus1d_18-tsp_on_activitynet' : 'https://github.com/HumamAlwassel/TSP/releases/download/model_weights/r2plus1d_18-tsp_on_activitynet-max_gvf-backbone_lr_0.0001-fc_lr_0.002-epoch_6-22835b73.pth', 38 | 'r2plus1d_18-tac_on_activitynet' : 'https://github.com/HumamAlwassel/TSP/releases/download/model_weights/r2plus1d_18-tac_on_activitynet-backbone_lr_0.0001-fc_lr_0.004-epoch_5-9f56941a.pth', 39 | 'r2plus1d_18-tac_on_kinetics' : 'https://github.com/HumamAlwassel/TSP/releases/download/model_weights/r2plus1d_18-tac_on_kinetics-76ce975c.pth', 40 | 41 | 'r3d_18-tsp_on_activitynet' : 'https://github.com/HumamAlwassel/TSP/releases/download/model_weights/r3d_18-tsp_on_activitynet-max_gvf-backbone_lr_0.0001-fc_lr_0.002-epoch_6-85584422.pth', 42 | 'r3d_18-tac_on_activitynet' : 'https://github.com/HumamAlwassel/TSP/releases/download/model_weights/r3d_18-tac_on_activitynet-backbone_lr_0.001-fc_lr_0.01-epoch_5-31fd6e95.pth', 43 | 'r3d_18-tac_on_kinetics' : 'https://github.com/HumamAlwassel/TSP/releases/download/model_weights/r3d_18-tac_on_kinetics-dcd952c6.pth', 44 | } 45 | 46 | 47 | def evaluate(model, data_loader, device): 48 | model.eval() 49 | metric_logger = utils.MetricLogger(delimiter=' ') 50 | header = 'Feature extraction:' 51 | with torch.no_grad(): 52 | for sample in metric_logger.log_every(data_loader, 10, header, device=device): 53 | clip = sample['clip'].to(device, non_blocking=True) 54 | logits, features = model(clip, return_features=True) 55 | data_loader.dataset.save_features(features, sample) 56 | # print(len(logits)) 57 | # print(logits[0].shape, logits[1].shape) 58 | data_loader.dataset.save_output(logits, sample, ["action-label"]) 59 | 60 | 61 | def main(args): 62 | print(args) 63 | print('TORCH VERSION: ', torch.__version__) 64 | print('TORCHVISION VERSION: ', torchvision.__version__) 65 | torch.backends.cudnn.benchmark = True 66 | 67 | device = torch.device(args.device) 68 | os.makedirs(args.output_dir, exist_ok=True) 69 | 70 | print('LOADING DATA') 71 | normalize = T.Normalize(mean=[0.43216, 0.394666, 0.37645], 72 | std=[0.22803, 0.22145, 0.216989]) 73 | 74 | transform = torchvision.transforms.Compose([ 75 | T.ToFloatTensorInZeroOne(), 76 | T.Resize((128, 171)), 77 | normalize, 78 | T.CenterCrop((112, 112)) 79 | ]) 80 | 81 | metadata_df = pd.read_csv(args.metadata_csv_filename) 82 | shards = np.linspace(0,len(metadata_df),args.num_shards+1).astype(int) 83 | start_idx, end_idx = shards[args.shard_id], shards[args.shard_id+1] 84 | print(f'shard-id: {args.shard_id + 1} out of {args.num_shards}, ' 85 | f'total number of videos: {len(metadata_df)}, shard size {end_idx-start_idx} videos') 86 | 87 | metadata_df = metadata_df.iloc[start_idx:end_idx].reset_index() 88 | metadata_df['is-computed-already'] = metadata_df['filename'].map(lambda f: 89 | os.path.exists(os.path.join(args.output_dir, os.path.basename(f).split('.')[0] + '.npy'))) 90 | metadata_df = metadata_df[metadata_df['is-computed-already']==False].reset_index(drop=True) 91 | print(f'Number of videos to process after excluding the ones already computed on disk: {len(metadata_df)}') 92 | 93 | dataset = EvalVideoDataset( 94 | metadata_df=metadata_df, 95 | 
root_dir=args.data_path, 96 | clip_length=args.clip_len, 97 | frame_rate=args.frame_rate, 98 | stride=args.stride, 99 | output_dir=args.output_dir, 100 | transforms=transform) 101 | 102 | print('CREATING DATA LOADER') 103 | data_loader = torch.utils.data.DataLoader( 104 | dataset, batch_size=args.batch_size, shuffle=False, 105 | num_workers=args.workers, pin_memory=True) 106 | 107 | print(f'LOADING MODEL') 108 | if args.local_checkpoint: 109 | print(f'from the local checkpoint: {args.local_checkpoint}') 110 | pretrained_state_dict = torch.load(args.local_checkpoint, map_location='cpu')['model'] 111 | else: 112 | print(f'from the GitHub released model: {args.released_checkpoint}') 113 | args.backbone = args.released_checkpoint.split('-')[0] 114 | pretrained_state_dict = torch.hub.load_state_dict_from_url( 115 | MODEL_URLS[args.released_checkpoint], progress=True, check_hash=True, map_location='cpu' 116 | )['model'] 117 | 118 | # model with a dummy classifier layer 119 | model = Model(backbone=args.backbone, num_classes=[1], num_heads=1, concat_gvf=False) 120 | model.to(device) 121 | 122 | # remove the classifier layers from the pretrained model and load the backbone weights 123 | pretrained_state_dict = {k: v for k,v in pretrained_state_dict.items() if 'fc' not in k} 124 | state_dict = model.state_dict() 125 | pretrained_state_dict['fc.weight'] = state_dict['fc.weight'] 126 | pretrained_state_dict['fc.bias'] = state_dict['fc.bias'] 127 | model.load_state_dict(pretrained_state_dict) 128 | 129 | print('START FEATURE EXTRACTION') 130 | evaluate(model, data_loader, device) 131 | 132 | 133 | if __name__ == '__main__': 134 | from opts import parse_args 135 | args = parse_args() 136 | main(args) 137 | -------------------------------------------------------------------------------- /video_backbone/TSP/extract_features/extract_features_from_a_local_checkpoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -i 2 | 3 | #################################################################################### 4 | ########################## PARAMETERS THAT NEED TO BE SET ########################## 5 | #################################################################################### 6 | 7 | DATA_PATH= # path/to/video/folder/ 8 | METADATA_CSV_FILENAME= # path/to/metadata/csv/file. Use the ones provided in the data folder. 9 | 10 | LOCAL_CHECKPOINT= # path/to/local/checkpoint/file.pth 11 | BACKBONE= # Set the backbone used in the LOCAL_CHECKPOINT: r2plus1d_34, r2plus1d_18, or r3d_18 12 | 13 | # Choose the stride between clips, e.g. 16 for non-overlapping clips and 1 for dense overlapping clips 14 | STRIDE=16 15 | 16 | # Optional: Split the videos into multiple shards for parallel feature extraction 17 | # Increase the number of shards and run this script independently on separate GPU devices, 18 | # each with a different SHARD_ID from 0 to NUM_SHARDS-1. 19 | # Each shard will process (num_videos / NUM_SHARDS) videos. 20 | SHARD_ID=0 21 | NUM_SHARDS=1 22 | DEVICE=cuda:0 23 | 24 | if [ -z "$DATA_PATH" ]; then 25 | echo "DATA_PATH variable is not set." 26 | echo "Please set DATA_PATH to the folder containing the videos you want to process." 27 | exit 1 28 | fi 29 | 30 | if [ -z "$METADATA_CSV_FILENAME" ]; then 31 | echo "METADATA_CSV_FILENAME variable is not set." 32 | echo "We provide metadata CSV files for ActivityNet and THUMOS14 in the data folder." 
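Editor's note: the sharding logic in extract_features.py above (`np.linspace` over the metadata rows) is what the SHARD_ID and NUM_SHARDS variables in these scripts index into. A tiny editorial example of the arithmetic, with assumed counts:
```
import numpy as np

num_videos, num_shards = 10, 3
shards = np.linspace(0, num_videos, num_shards + 1).astype(int)   # array([ 0,  3,  6, 10])
for shard_id in range(num_shards):
    start_idx, end_idx = shards[shard_id], shards[shard_id + 1]
    print(f'shard {shard_id}: videos [{start_idx}, {end_idx})')   # 3, 3, and 4 videos respectively
```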
33 | exit 1 34 | fi 35 | 36 | if [ -z "$LOCAL_CHECKPOINT" ]; then 37 | echo "LOCAL_CHECKPOINT variable is not set." 38 | echo "Please set LOCAL_CHECKPOINT to the location of the local checkpoint .pth file." 39 | echo "Make sure to set the correct BACKBONE variable as well." 40 | exit 1 41 | fi 42 | 43 | if [ -z "$BACKBONE" ]; then 44 | echo "BACKBONE variable is not set." 45 | exit 1 46 | fi 47 | 48 | #################################################################################### 49 | ############################# PARAMETERS TO KEEP AS IS ############################# 50 | #################################################################################### 51 | 52 | OUTPUT_DIR=output/local_checkpoint_${BACKBONE}_features/stride_${STRIDE}/ 53 | 54 | source activate tsp 55 | mkdir -p $OUTPUT_DIR 56 | 57 | python extract_features.py \ 58 | --data-path $DATA_PATH \ 59 | --metadata-csv-filename $METADATA_CSV_FILENAME \ 60 | --local-checkpoint $LOCAL_CHECKPOINT \ 61 | --backbone $BACKBONE \ 62 | --stride $STRIDE \ 63 | --shard-id $SHARD_ID \ 64 | --num-shards $NUM_SHARDS \ 65 | --device $DEVICE \ 66 | --output-dir $OUTPUT_DIR 67 | -------------------------------------------------------------------------------- /video_backbone/TSP/extract_features/extract_features_from_a_released_checkpoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -i 2 | 3 | #################################################################################### 4 | ########################## PARAMETERS THAT NEED TO BE SET ########################## 5 | #################################################################################### 6 | 7 | DATA_PATH= # path/to/video/folder 8 | METADATA_CSV_FILENAME= # path/to/metadata/csv/file. Use the ones provided in the data folder. 9 | 10 | ############################## 11 | ### RELEASED GITHUB MODELS ### 12 | ############################## 13 | ## main TSP models -> 14 | # r2plus1d_34-tsp_on_activitynet (default) 15 | # r2plus1d_34-tsp_on_thumos14 16 | # 17 | ## main TAC baseline models -> 18 | # r2plus1d_34-tac_on_activitynet 19 | # r2plus1d_34-tac_on_thumos14 20 | # r2plus1d_34-tac_on_kinetics 21 | # 22 | ## other models from the GVF and backbone architecture ablation studies -> 23 | # r2plus1d_34-tsp_on_activitynet-avg_gvf 24 | # r2plus1d_34-tsp_on_activitynet-no_gvf 25 | # r2plus1d_18-tsp_on_activitynet 26 | # r2plus1d_18-tac_on_activitynet 27 | # r2plus1d_18-tac_on_kinetics 28 | # r3d_18-tsp_on_activitynet 29 | # r3d_18-tac_on_activitynet 30 | # r3d_18-tac_on_kinetics 31 | RELEASED_CHECKPOINT=r2plus1d_34-tsp_on_activitynet # choose one of the models above 32 | 33 | # Choose the stride between clips, e.g. 16 for non-overlapping clips and 1 for dense overlapping clips 34 | STRIDE=16 35 | 36 | # Optional: Split the videos into multiple shards for parallel feature extraction 37 | # Increase the number of shards and run this script independently on separate GPU devices, 38 | # each with a different SHARD_ID from 0 to NUM_SHARDS-1. 39 | # Each shard will process (num_videos / NUM_SHARDS) videos. 40 | SHARD_ID=0 41 | NUM_SHARDS=1 42 | DEVICE=cuda:0 43 | 44 | if [ -z "$DATA_PATH" ]; then 45 | echo "DATA_PATH variable is not set." 46 | echo "Please set DATA_PATH to the folder containing the videos you want to process." 47 | exit 1 48 | fi 49 | 50 | if [ -z "$METADATA_CSV_FILENAME" ]; then 51 | echo "METADATA_CSV_FILENAME variable is not set." 
52 | echo "We provide metadata CSV files for ActivityNet and THUMOS14 in the data folder." 53 | exit 1 54 | fi 55 | 56 | #################################################################################### 57 | ############################# PARAMETERS TO KEEP AS IS ############################# 58 | #################################################################################### 59 | 60 | OUTPUT_DIR=output/${RELEASED_CHECKPOINT}_features/stride_${STRIDE}/ 61 | 62 | source activate tsp 63 | mkdir -p $OUTPUT_DIR 64 | 65 | python extract_features.py \ 66 | --data-path $DATA_PATH \ 67 | --metadata-csv-filename $METADATA_CSV_FILENAME \ 68 | --released-checkpoint $RELEASED_CHECKPOINT \ 69 | --stride $STRIDE \ 70 | --shard-id $SHARD_ID \ 71 | --num-shards $NUM_SHARDS \ 72 | --device $DEVICE \ 73 | --output-dir $OUTPUT_DIR 74 | -------------------------------------------------------------------------------- /video_backbone/TSP/extract_features/merge_pkl_files_into_one_h5_feature_file.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, print_function 2 | 3 | import argparse 4 | import pickle as pkl 5 | import h5py 6 | import glob 7 | import os 8 | 9 | from tqdm import tqdm 10 | 11 | 12 | def main(args): 13 | print(args) 14 | compression_flags = dict(compression='gzip', compression_opts=9) 15 | filenames = glob.glob(os.path.join(args.features_folder, '*.pkl')) 16 | print(f'Number of pkl files: {len(filenames)}') 17 | 18 | output = h5py.File(args.output_h5, 'w') 19 | for f in tqdm(filenames): 20 | video_name = os.path.basename(f).split('.pkl')[0] 21 | with open(f, 'rb') as fobj: 22 | data = pkl.load(fobj) 23 | output.create_dataset(video_name, data=data, chunks=True, **compression_flags) 24 | 25 | output.close() 26 | print(f'The h5 feature file is saved to {args.output_h5}') 27 | 28 | 29 | if __name__ == '__main__': 30 | parser = argparse.ArgumentParser(description='Merge the feature pkl files of different videos into one ' 31 | 'h5 feature file mapping video name to feature tensor.') 32 | 33 | parser.add_argument('--features-folder', required=True, type=str, 34 | help='Path to the folder containing the pkl feature files') 35 | parser.add_argument('--output-h5', required=True, type=str, 36 | help='Where to save the combined metadata CSV file') 37 | 38 | args = parser.parse_args() 39 | 40 | main(args) 41 | -------------------------------------------------------------------------------- /video_backbone/TSP/extract_features/opts.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | 4 | RELEASED_GITHUB_MODELS = [ 5 | # main TSP models 6 | 'r2plus1d_34-tsp_on_activitynet', 7 | 'r2plus1d_34-tsp_on_thumos14', 8 | 9 | # main TAC baseline models 10 | 'r2plus1d_34-tac_on_activitynet', 11 | 'r2plus1d_34-tac_on_thumos14', 12 | 'r2plus1d_34-tac_on_kinetics', 13 | 14 | # other models from the GVF and backbone architecture ablation studies 15 | 'r2plus1d_34-tsp_on_activitynet-avg_gvf', 16 | 'r2plus1d_34-tsp_on_activitynet-no_gvf', 17 | 18 | 'r2plus1d_18-tsp_on_activitynet', 19 | 'r2plus1d_18-tac_on_activitynet', 20 | 'r2plus1d_18-tac_on_kinetics', 21 | 22 | 'r3d_18-tsp_on_activitynet', 23 | 'r3d_18-tac_on_activitynet', 24 | 'r3d_18-tac_on_kinetics', 25 | ] 26 | 27 | 28 | def parse_args(): 29 | parser = argparse.ArgumentParser(description='Features extraction script') 30 | 31 | parser.add_argument('--data-path', required=True, 32 | help='Path to the directory containing the videos 
files') 33 | parser.add_argument('--metadata-csv-filename', required=True, 34 | help='Path to the metadata CSV file') 35 | 36 | parser.add_argument('--backbone', default='r2plus1d_34', 37 | choices=['r2plus1d_34', 'r2plus1d_18', 'r3d_18'], 38 | help='Encoder backbone architecture (default r2plus1d_34). ' 39 | 'Supported backbones are r2plus1d_34, r2plus1d_18, and r3d_18') 40 | parser.add_argument('--device', default='cuda', 41 | help='Device to train on (default: cuda)') 42 | 43 | parser.add_argument('--released-checkpoint', default='r2plus1d_34-tsp_on_activitynet', 44 | choices=RELEASED_GITHUB_MODELS, 45 | help='Model checkpoint name to load from the released GitHub pretrained models. ' 46 | 'The backbone parameter is set automatically if loading from a released model. ' 47 | 'If `local-checkpoint` flag is not None, then this parameter is ignored and ' 48 | 'a checkpoint is loaded from the given `local-checkpoint` path on disk.') 49 | parser.add_argument('--local-checkpoint', default=None, 50 | help='Path to checkpoint on disk. If set, then read checkpoint from local disk. ' 51 | 'Otherwise, load checkpoint from the released GitHub models.') 52 | 53 | parser.add_argument('--clip-len', default=16, type=int, 54 | help='Number of frames per clip (default: 16)') 55 | parser.add_argument('--frame-rate', default=15, type=int, 56 | help='Frames-per-second rate at which the videos are sampled (default: 15)') 57 | parser.add_argument('--stride', default=16, type=int, 58 | help='Number of frames (after resampling with frame-rate) between consecutive clips (default: 16)') 59 | 60 | parser.add_argument('--batch-size', default=32, type=int, 61 | help='Batch size per GPU (default: 32)') 62 | parser.add_argument('--workers', default=6, type=int, 63 | help='Number of data loading workers (default: 6)') 64 | 65 | parser.add_argument('--output-dir', required=True, 66 | help='Path for saving features') 67 | parser.add_argument('--shard-id', default=0, type=int, 68 | help='Shard id number. 
Must be between [0, num-shards)') 69 | parser.add_argument('--num-shards', default=1, type=int, 70 | help='Number of shards to split the metadata-csv-filename') 71 | 72 | args = parser.parse_args() 73 | 74 | return args 75 | -------------------------------------------------------------------------------- /video_backbone/TSP/img/tsp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ttengwang/PDVC/0b025c84f42fe27da51c312e8871c4b19628a04c/video_backbone/TSP/img/tsp.png -------------------------------------------------------------------------------- /video_backbone/TSP/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ttengwang/PDVC/0b025c84f42fe27da51c312e8871c4b19628a04c/video_backbone/TSP/models/__init__.py -------------------------------------------------------------------------------- /video_backbone/TSP/models/backbone.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from torchvision.models.video import r2plus1d_18 as _r2plus1d_18 5 | from torchvision.models.video import r3d_18 as _r3d_18 6 | from torchvision.models.video.resnet import VideoResNet, R2Plus1dStem, BasicBlock 7 | 8 | __all__ = ['r2plus1d_34', 'r2plus1d_18', 'r3d_18'] 9 | 10 | R2PLUS1D_34_MODEL_URL="https://github.com/moabitcoin/ig65m-pytorch/releases/download/v1.0.0/r2plus1d_34_clip8_ft_kinetics_from_ig65m-0aa0550b.pth" 11 | 12 | 13 | def r2plus1d_34(pretrained=True, progress=False, **kwargs): 14 | model = VideoResNet( 15 | block=BasicBlock, 16 | conv_makers=[Conv2Plus1D] * 4, 17 | layers=[3, 4, 6, 3], 18 | stem=R2Plus1dStem, 19 | **kwargs, 20 | ) 21 | 22 | # We need exact Caffe2 momentum for BatchNorm scaling 23 | for m in model.modules(): 24 | if isinstance(m, nn.BatchNorm3d): 25 | m.eps = 1e-3 26 | m.momentum = 0.9 27 | 28 | if pretrained: 29 | state_dict = torch.hub.load_state_dict_from_url( 30 | R2PLUS1D_34_MODEL_URL, progress=progress 31 | ) 32 | model.load_state_dict(state_dict) 33 | 34 | return model 35 | 36 | 37 | def r2plus1d_18(pretrained=True, progress=False, **kwargs): 38 | return _r2plus1d_18(pretrained=pretrained, progress=progress, **kwargs) 39 | 40 | 41 | def r3d_18(pretrained=True, progress=False, **kwargs): 42 | return _r3d_18(pretrained=pretrained, progress=progress, **kwargs) 43 | 44 | 45 | class Conv2Plus1D(nn.Sequential): 46 | def __init__(self, in_planes, out_planes, midplanes, stride=1, padding=1): 47 | 48 | midplanes = (in_planes * out_planes * 3 * 3 * 3) // ( 49 | in_planes * 3 * 3 + 3 * out_planes 50 | ) 51 | super(Conv2Plus1D, self).__init__( 52 | nn.Conv3d( 53 | in_planes, 54 | midplanes, 55 | kernel_size=(1, 3, 3), 56 | stride=(1, stride, stride), 57 | padding=(0, padding, padding), 58 | bias=False, 59 | ), 60 | nn.BatchNorm3d(midplanes), 61 | nn.ReLU(inplace=True), 62 | nn.Conv3d( 63 | midplanes, 64 | out_planes, 65 | kernel_size=(3, 1, 1), 66 | stride=(stride, 1, 1), 67 | padding=(padding, 0, 0), 68 | bias=False, 69 | ), 70 | ) 71 | 72 | @staticmethod 73 | def get_downsample_stride(stride): 74 | return (stride, stride, stride) -------------------------------------------------------------------------------- /video_backbone/TSP/models/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from .backbone import r2plus1d_34, r2plus1d_18, r3d_18 4 | 5 | 6 | class 
Model(nn.Module): 7 | 8 | def __init__(self, backbone, num_classes, num_heads=1, concat_gvf=False, progress=True, **kwargs): 9 | ''' 10 | Args: 11 | backbone (string): The name of the backbone architecture. Supported architectures: r2plus1d_34, r2plus1d_18, and r3d_18. 12 | num_heads (int): The number of output heads 13 | num_classes (list of int): The number of labels per head 14 | concat_gvf (bool): If True and num_heads == 2, then concat global video features (GVF) to clip 15 | features before applying the second head FC layer. 16 | progress (bool): If True, displays a progress bar of the download to stderr 17 | **kwargs: keyword arguments to pass to backbone architecture constructor 18 | ''' 19 | super().__init__() 20 | print(f'<Model>: backbone {backbone} num_classes {num_classes} num_heads {num_heads} kwargs {kwargs}') 21 | assert len(num_classes) == num_heads, f'<Model>: incompatible configuration. len(num_classes) must be equal to num_heads' 22 | assert num_heads == 1 or num_heads == 2, f'<Model>: num_heads = {num_heads} must be either 1 or 2' 23 | 24 | self.backbone = backbone 25 | self.num_classes = num_classes 26 | self.num_heads = num_heads 27 | self.concat_gvf = concat_gvf 28 | 29 | self.features, self.feature_size = Model._build_feature_backbone(backbone, progress, **kwargs) 30 | 31 | if self.num_heads == 1: 32 | self.fc = Model._build_fc(self.feature_size, num_classes[0]) 33 | else: 34 | self.fc1 = Model._build_fc(self.feature_size, num_classes[0]) 35 | self.fc2 = Model._build_fc(2 * self.feature_size if self.concat_gvf else self.feature_size, num_classes[1]) 36 | 37 | def forward(self, x, gvf=None, return_features=False): 38 | features = self.features(x) 39 | if self.num_heads == 1: 40 | logits = [self.fc(features)] 41 | else: 42 | logits = [self.fc1(features)] 43 | if self.concat_gvf: 44 | assert gvf is not None, 'Forward pass expects a global video feature input but got None' 45 | logits.append(self.fc2(torch.cat([features, gvf], dim=-1))) 46 | else: 47 | logits.append(self.fc2(features)) 48 | 49 | return (logits, features) if return_features else logits 50 | 51 | @staticmethod 52 | def _build_feature_backbone(backbone, progress, **kwargs): 53 | if backbone == 'r2plus1d_34': builder = r2plus1d_34 54 | elif backbone == 'r2plus1d_18': builder = r2plus1d_18 55 | elif backbone == 'r3d_18': builder = r3d_18 56 | else: 57 | raise ValueError(f'<Model>: {backbone} is an invalid architecture type. 
' 58 | f'Supported architectures: r2plus1d_34, r2plus1d_18, and r3d_18') 59 | 60 | feature_backbone = builder(pretrained=True, progress=progress, **kwargs) 61 | 62 | # remove the FC layer of the backbone 63 | feature_size = feature_backbone.fc.in_features 64 | feature_backbone.fc = nn.Sequential() 65 | 66 | return feature_backbone, feature_size 67 | 68 | @staticmethod 69 | def _build_fc(in_features, out_features): 70 | fc = nn.Linear(in_features, out_features) 71 | nn.init.normal_(fc.weight, 0, 0.01) 72 | nn.init.constant_(fc.bias, 0) 73 | return fc 74 | -------------------------------------------------------------------------------- /video_backbone/TSP/train/README.md: -------------------------------------------------------------------------------- 1 | # TSP Training 2 | 3 | We provide four training scripts: 4 | - `train_tsp_on_activitynet.sh`: pretraining R(2+1)D-34 encoder with TSP on ActivityNet 5 | - `train_tsp_on_thumos14.sh`: pretraining R(2+1)D-34 encoder with TSP on THUMOS14 6 | - `train_tac_on_activitynet.sh`: pretraining R(2+1)D-34 encoder with TAC on ActivityNet (baseline) 7 | - `train_tac_on_thumos14.sh`: pretraining R(2+1)D-34 encoder with TAC on THUMOS14 (baseline) 8 | 9 | ## Launching the Training Scripts 10 | 11 | Before launching each script, you need to manually set **3 variables** inside each file: 12 | - `ROOT_DIR`: The root directory of either the ActivityNet or THUMOS14 videos. Follow the data preprocessing instructions and subfolders naming described [here](../data). 13 | - `NUM_GPUS`: The number of GPUs to use for training. We used 2 V100 (32G) GPUs in our TSP experiments, but the code is generic and can be run on any number of GPUs. 14 | - `DOWNSCALE_FACTOR`: The default batch size and learning rates were optimized for a GPU with 32G memory. We understand that such GPUs might not be accessible to all of the community. Thus, the training code can seamlessly be adapt to run on a smaller GPU memory size by adjusting this variable. Set `DOWNSCALE_FACTOR` to `1`, `2`, or `4` if you have a GPU with 32G, 16G, or 8G memory, respectively. The script will automatically downscale the batch size and the learning rate accordingly to keep the same expected performance. 15 | 16 | ## Experiment Output 17 | 18 | - Checkpoint per epoch (*e.g.,* `epoch_3.pth`): a `.pth` file containing the state dictionary of the model, optimizer, and learning rate scheduler. The checkpoint files can be used to resume the training (use `--resume` and `--start-epoch` input parameters in `train.py`) or to extract features (use the scripts [here](../extract_features)). 19 | - Metric results file (`results.txt`): A log of the metrics results on the validation subset after each epoch. We choose the best pretrained model based on the epoch with the highest `Avg Accuracy` value. 20 | 21 | ## Interested in Reproducing the Ablation Studies? 22 | 23 | Train with different encoder architectures? Change the variable `BACKBONE` to either `r2plus1d_18` or `r3d_18`. 24 | Train without GVF? Remove the line `--global-video-features $GLOBAL_VIDEO_FEATURES \` from the `train.py` call at the end. 25 | Train with average GVF? Set `GLOBAL_VIDEO_FEATURES=../data/activitynet/global_video_features/r2plus1d_34-avg_gvf.h5`. 26 | Train with only the temporal region classification head? Set `LABEL_COLUMNS=temporal-region-label` and `LABEL_MAPPING_JSONS=../data/activitynet/activitynet_v1-3_temporal_region_label_mapping.json`. 
Finally, make sure to rename `OUTPUT_DIR` to avoid overwriting previous experiments when reproducing the ablation studies. 27 | -------------------------------------------------------------------------------- /video_backbone/TSP/train/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ttengwang/PDVC/0b025c84f42fe27da51c312e8871c4b19628a04c/video_backbone/TSP/train/__init__.py -------------------------------------------------------------------------------- /video_backbone/TSP/train/opts.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | 4 | def parse_args(): 5 | parser = argparse.ArgumentParser(description='Training script for "TSP: Temporally-Sensitive Pretraining of Video Encoders for Localization Tasks"') 6 | 7 | parser.add_argument('--root-dir', required=True, 8 | help='Path to root directory containing the video files') 9 | parser.add_argument('--train-subdir', default='train', 10 | help='Training subdirectory inside the root directory (default: train)') 11 | parser.add_argument('--valid-subdir', default='valid', 12 | help='Validation subdirectory inside the root directory (default: valid)') 13 | parser.add_argument('--train-csv-filename', required=True, 14 | help='Path to the training CSV file') 15 | parser.add_argument('--valid-csv-filename', required=True, 16 | help='Path to the validation CSV file') 17 | parser.add_argument('--label-columns', nargs='+', required=True, 18 | help='Names of the label columns in the CSV files') 19 | parser.add_argument('--label-mapping-jsons', nargs='+', required=True, 20 | help='Path to the mapping of each label column') 21 | parser.add_argument('--loss-alphas', nargs='+', default=[1.0, 1.0], type=float, 22 | help='A list of the scalar alpha with which to weight each label loss') 23 | parser.add_argument('--global-video-features', 24 | help='Path to the h5 file containing global video features (GVF). ' 25 | 'If not given, then train without GVF.') 26 | 27 | parser.add_argument('--backbone', default='r2plus1d_34', 28 | choices=['r2plus1d_34', 'r2plus1d_18', 'r3d_18'], 29 | help='Encoder backbone architecture (default r2plus1d_34). 
' 30 | 'Supported backbones are r2plus1d_34, r2plus1d_18, and r3d_18') 31 | parser.add_argument('--device', default='cuda', 32 | help='Device to train on (default: cuda)') 33 | 34 | parser.add_argument('--clip-len', default=16, type=int, 35 | help='Number of frames per clip (default: 16)') 36 | parser.add_argument('--frame-rate', default=15, type=int, 37 | help='Frames-per-second rate at which the videos are sampled (default: 15)') 38 | parser.add_argument('--clips-per-segment', default=5, type=int, 39 | help='Number of clips sampled per video segment (default: 5)') 40 | parser.add_argument('--batch-size', default=32, type=int, 41 | help='Batch size per GPU (default: 32)') 42 | parser.add_argument('--workers', default=6, type=int, 43 | help='Number of data loading workers (default: 6)') 44 | 45 | parser.add_argument('--epochs', default=8, type=int, 46 | help='Number of total epochs to run') 47 | parser.add_argument('--backbone-lr', default=0.0001, type=float, 48 | help='Backbone layers learning rate') 49 | parser.add_argument('--fc-lr', default=0.002, type=float, 50 | help='Fully-connected classifiers learning rate') 51 | parser.add_argument('--lr-warmup-epochs', default=2, type=int, 52 | help='Number of warmup epochs') 53 | parser.add_argument('--lr-milestones', nargs='+', default=[4, 6], type=int, 54 | help='Decrease lr on milestone epoch') 55 | parser.add_argument('--lr-gamma', default=0.01, type=float, 56 | help='Decrease lr by a factor of lr-gamma at each milestone epoch') 57 | parser.add_argument('--momentum', default=0.9, type=float, 58 | help='Momentum (default: 0.9)') 59 | parser.add_argument('--weight-decay', default=0.005, type=float, 60 | help='Weight decay (default: 0.005)') 61 | 62 | parser.add_argument('--valid-only', action='store_true', 63 | help='Test the model on the validation subset and exit') 64 | parser.add_argument('--train-only-one-epoch', action='store_true', 65 | help='Train the model for only one epoch without testing on validation subset') 66 | 67 | parser.add_argument('--print-freq', default=100, type=int, 68 | help='Print frequency in number of batches') 69 | parser.add_argument('--output-dir', required=True, 70 | help='Path for saving checkpoints and results output') 71 | parser.add_argument('--resume', default='', 72 | help='Resume from checkpoint') 73 | parser.add_argument('--start-epoch', default=0, type=int, 74 | help='Start epoch (default: 0)') 75 | 76 | parser.add_argument('--dist-url', default='env://', 77 | help='URL used to set up distributed training') 78 | parser.add_argument('--sync-bn', action='store_true', 79 | help='Use sync batch norm (default: False)') 80 | 81 | parser.add_argument('--debug', action='store_true', 82 | help='Run the training over 100 samples only with batch size of 4') 83 | 84 | args = parser.parse_args() 85 | 86 | assert len(args.label_columns) == len(args.label_mapping_jsons) and len(args.label_columns) == len(args.loss_alphas), \ 87 | (f'The parameters label-columns, label-mapping-jsons, and loss-alphas must have the same length. 
' 88 | f'Got len(label-columns)={len(args.label_columns)}, len(label-mapping-jsons)={len(args.label_mapping_jsons)}, ' 89 | f'and len(loss-alphas)={len(args.loss_alphas)}') 90 | 91 | if args.debug: 92 | print('####### DEBUG MODE #######') 93 | args.batch_size = 4 94 | args.print_freq = 4 95 | 96 | return args 97 | -------------------------------------------------------------------------------- /video_backbone/TSP/train/train_tac_on_activitynet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -i 2 | 3 | #################################################################################### 4 | ########################## PARAMETERS THAT NEED TO BE SET ########################## 5 | #################################################################################### 6 | 7 | ROOT_DIR= 8 | NUM_GPUS= 9 | 10 | # Choose the appropriate batch size downscale factor for your GPU memory size 11 | # DOWNSCALE_FACTOR=1 --> a 32G memory GPU (default) 12 | # DOWNSCALE_FACTOR=2 --> a 16G memory GPU 13 | # DOWNSCALE_FACTOR=4 --> a 8G memory GPU 14 | DOWNSCALE_FACTOR=1 15 | 16 | if [ -z "$ROOT_DIR" ]; then 17 | echo "ROOT_DIR variable is not set." 18 | echo "Please set ROOT_DIR to the location of the ActivityNet videos." 19 | echo "The directory must contain two subdirectories: train and valid." 20 | exit 1 21 | fi 22 | 23 | if [ -z "$NUM_GPUS" ]; then 24 | echo "NUM_GPUS variable is not set." 25 | exit 1 26 | fi 27 | 28 | #################################################################################### 29 | ############################# PARAMETERS TO KEEP AS IS ############################# 30 | #################################################################################### 31 | 32 | TRAIN_SUBDIR=train 33 | VALID_SUBDIR=valid 34 | TRAIN_CSV_FILENAME=../data/activitynet/activitynet_v1-3_train_tsp_groundtruth.csv 35 | VALID_CSV_FILENAME=../data/activitynet/activitynet_v1-3_valid_tsp_groundtruth.csv 36 | LABEL_COLUMNS=action-label 37 | LABEL_MAPPING_JSONS=../data/activitynet/activitynet_v1-3_action_label_mapping.json 38 | LOSS_ALPHAS=1.0 39 | 40 | BACKBONE=r2plus1d_34 41 | 42 | BATCH_SIZE=32 43 | BACKBONE_LR=0.0001 44 | FC_LR=0.002 45 | 46 | OUTPUT_DIR=output/${BACKBONE}-tac_on_activitynet/backbone_lr_${BACKBONE_LR}-fc_lr_${FC_LR}/ 47 | 48 | MY_MASTER_ADDR=127.0.0.1 49 | MY_MASTER_PORT=$(shuf -i 30000-60000 -n 1) 50 | 51 | # downscaling 52 | BATCH_SIZE=$(bc <<< $BATCH_SIZE/$DOWNSCALE_FACTOR) 53 | BACKBONE_LR=$(bc -l <<< $BACKBONE_LR/$DOWNSCALE_FACTOR) 54 | FC_LR=$(bc -l <<< $FC_LR/$DOWNSCALE_FACTOR) 55 | 56 | source activate tsp 57 | mkdir -p $OUTPUT_DIR 58 | export OMP_NUM_THREADS=6 59 | 60 | python -m torch.distributed.launch --nproc_per_node=$NUM_GPUS \ 61 | --master_addr $MY_MASTER_ADDR --master_port $MY_MASTER_PORT --use_env \ 62 | train.py \ 63 | --root-dir $ROOT_DIR \ 64 | --train-subdir $TRAIN_SUBDIR \ 65 | --valid-subdir $VALID_SUBDIR \ 66 | --train-csv-filename $TRAIN_CSV_FILENAME \ 67 | --valid-csv-filename $VALID_CSV_FILENAME \ 68 | --label-mapping-jsons $LABEL_MAPPING_JSONS \ 69 | --label-columns $LABEL_COLUMNS \ 70 | --loss-alphas $LOSS_ALPHAS \ 71 | --backbone $BACKBONE \ 72 | --batch-size $BATCH_SIZE \ 73 | --backbone-lr $BACKBONE_LR \ 74 | --fc-lr $FC_LR \ 75 | --output-dir $OUTPUT_DIR \ 76 | -------------------------------------------------------------------------------- /video_backbone/TSP/train/train_tac_on_thumos14.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -i 2 | 3 | 
#################################################################################### 4 | ########################## PARAMETERS THAT NEED TO BE SET ########################## 5 | #################################################################################### 6 | 7 | ROOT_DIR= 8 | NUM_GPUS= 9 | 10 | # Choose the appropriate batch size downscale factor for your GPU memory size 11 | # DOWNSCALE_FACTOR=1 --> a 32G memory GPU (default) 12 | # DOWNSCALE_FACTOR=2 --> a 16G memory GPU 13 | # DOWNSCALE_FACTOR=4 --> a 8G memory GPU 14 | DOWNSCALE_FACTOR=1 15 | 16 | if [ -z "$ROOT_DIR" ]; then 17 | echo "ROOT_DIR variable is not set." 18 | echo "Please set ROOT_DIR to the location of the THUMOS14 videos." 19 | echo "The directory must contain two subdirectories: valid and test" 20 | exit 1 21 | fi 22 | 23 | if [ -z "$NUM_GPUS" ]; then 24 | echo "NUM_GPUS variable is not set." 25 | exit 1 26 | fi 27 | 28 | #################################################################################### 29 | ############################# PARAMETERS TO KEEP AS IS ############################# 30 | #################################################################################### 31 | 32 | TRAIN_SUBDIR=valid 33 | VALID_SUBDIR=test 34 | TRAIN_CSV_FILENAME=../data/thumos14/thumos14_valid_tsp_groundtruth.csv 35 | VALID_CSV_FILENAME=../data/thumos14/thumos14_test_tsp_groundtruth.csv 36 | LABEL_COLUMNS=action-label 37 | LABEL_MAPPING_JSONS=../data/thumos14/thumos14_action_label_mapping.json 38 | LOSS_ALPHAS=1.0 39 | 40 | BACKBONE=r2plus1d_34 41 | 42 | BATCH_SIZE=32 43 | BACKBONE_LR=0.00001 44 | FC_LR=0.002 45 | 46 | OUTPUT_DIR=output/${BACKBONE}-tac_on_thumos14/backbone_lr_${BACKBONE_LR}-fc_lr_${FC_LR}/ 47 | 48 | MY_MASTER_ADDR=127.0.0.1 49 | MY_MASTER_PORT=$(shuf -i 30000-60000 -n 1) 50 | 51 | # downscaling 52 | BATCH_SIZE=$(bc <<< $BATCH_SIZE/$DOWNSCALE_FACTOR) 53 | BACKBONE_LR=$(bc -l <<< $BACKBONE_LR/$DOWNSCALE_FACTOR) 54 | FC_LR=$(bc -l <<< $FC_LR/$DOWNSCALE_FACTOR) 55 | 56 | source activate tsp 57 | mkdir -p $OUTPUT_DIR 58 | export OMP_NUM_THREADS=6 59 | 60 | python -m torch.distributed.launch --nproc_per_node=$NUM_GPUS \ 61 | --master_addr $MY_MASTER_ADDR --master_port $MY_MASTER_PORT --use_env \ 62 | train.py \ 63 | --root-dir $ROOT_DIR \ 64 | --train-subdir $TRAIN_SUBDIR \ 65 | --valid-subdir $VALID_SUBDIR \ 66 | --train-csv-filename $TRAIN_CSV_FILENAME \ 67 | --valid-csv-filename $VALID_CSV_FILENAME \ 68 | --label-mapping-jsons $LABEL_MAPPING_JSONS \ 69 | --label-columns $LABEL_COLUMNS \ 70 | --loss-alphas $LOSS_ALPHAS \ 71 | --backbone $BACKBONE \ 72 | --batch-size $BATCH_SIZE \ 73 | --backbone-lr $BACKBONE_LR \ 74 | --fc-lr $FC_LR \ 75 | --output-dir $OUTPUT_DIR \ 76 | -------------------------------------------------------------------------------- /video_backbone/TSP/train/train_tsp_on_activitynet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -i 2 | 3 | #################################################################################### 4 | ########################## PARAMETERS THAT NEED TO BE SET ########################## 5 | #################################################################################### 6 | 7 | ROOT_DIR= 8 | NUM_GPUS= 9 | 10 | # Choose the appropriate batch size downscale factor for your GPU memory size 11 | # DOWNSCALE_FACTOR=1 --> a 32G memory GPU (default) 12 | # DOWNSCALE_FACTOR=2 --> a 16G memory GPU 13 | # DOWNSCALE_FACTOR=4 --> a 8G memory GPU 14 | DOWNSCALE_FACTOR=1 15 | 16 | if [ -z "$ROOT_DIR" ]; then 17 | echo 
"ROOT_DIR variable is not set." 18 | echo "Please set ROOT_DIR to the location of the ActivityNet videos." 19 | echo "The directory must contain two subdirectories: train and valid." 20 | exit 1 21 | fi 22 | 23 | if [ -z "$NUM_GPUS" ]; then 24 | echo "NUM_GPUS variable is not set." 25 | exit 1 26 | fi 27 | 28 | #################################################################################### 29 | ############################# PARAMETERS TO KEEP AS IS ############################# 30 | #################################################################################### 31 | 32 | TRAIN_SUBDIR=train 33 | VALID_SUBDIR=valid 34 | TRAIN_CSV_FILENAME=../data/activitynet/activitynet_v1-3_train_tsp_groundtruth.csv 35 | VALID_CSV_FILENAME=../data/activitynet/activitynet_v1-3_valid_tsp_groundtruth.csv 36 | LABEL_COLUMNS="action-label temporal-region-label" 37 | LABEL_MAPPING_JSONS="../data/activitynet/activitynet_v1-3_action_label_mapping.json \ 38 | ../data/activitynet/activitynet_v1-3_temporal_region_label_mapping.json" 39 | LOSS_ALPHAS="1.0 1.0" 40 | GLOBAL_VIDEO_FEATURES=../data/activitynet/global_video_features/r2plus1d_34-max_gvf.h5 41 | 42 | BACKBONE=r2plus1d_34 43 | 44 | BATCH_SIZE=32 45 | BACKBONE_LR=0.0001 46 | FC_LR=0.002 47 | 48 | OUTPUT_DIR=output/${BACKBONE}-tsp_on_activitynet/backbone_lr_${BACKBONE_LR}-fc_lr_${FC_LR}/ 49 | 50 | MY_MASTER_ADDR=127.0.0.1 51 | MY_MASTER_PORT=$(shuf -i 30000-60000 -n 1) 52 | 53 | # downscaling 54 | BATCH_SIZE=$(bc <<< $BATCH_SIZE/$DOWNSCALE_FACTOR) 55 | BACKBONE_LR=$(bc -l <<< $BACKBONE_LR/$DOWNSCALE_FACTOR) 56 | FC_LR=$(bc -l <<< $FC_LR/$DOWNSCALE_FACTOR) 57 | 58 | source activate tsp 59 | mkdir -p $OUTPUT_DIR 60 | export OMP_NUM_THREADS=6 61 | 62 | python -m torch.distributed.launch --nproc_per_node=$NUM_GPUS \ 63 | --master_addr $MY_MASTER_ADDR --master_port $MY_MASTER_PORT --use_env \ 64 | train.py \ 65 | --root-dir $ROOT_DIR \ 66 | --train-subdir $TRAIN_SUBDIR \ 67 | --valid-subdir $VALID_SUBDIR \ 68 | --train-csv-filename $TRAIN_CSV_FILENAME \ 69 | --valid-csv-filename $VALID_CSV_FILENAME \ 70 | --label-mapping-jsons $LABEL_MAPPING_JSONS \ 71 | --label-columns $LABEL_COLUMNS \ 72 | --loss-alphas $LOSS_ALPHAS \ 73 | --global-video-features $GLOBAL_VIDEO_FEATURES \ 74 | --backbone $BACKBONE \ 75 | --batch-size $BATCH_SIZE \ 76 | --backbone-lr $BACKBONE_LR \ 77 | --fc-lr $FC_LR \ 78 | --output-dir $OUTPUT_DIR \ 79 | -------------------------------------------------------------------------------- /video_backbone/TSP/train/train_tsp_on_thumos14.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -i 2 | 3 | #################################################################################### 4 | ########################## PARAMETERS THAT NEED TO BE SET ########################## 5 | #################################################################################### 6 | 7 | ROOT_DIR= 8 | NUM_GPUS= 9 | 10 | # Choose the appropriate batch size downscale factor for your GPU memory size 11 | # DOWNSCALE_FACTOR=1 --> a 32G memory GPU (default) 12 | # DOWNSCALE_FACTOR=2 --> a 16G memory GPU 13 | # DOWNSCALE_FACTOR=4 --> a 8G memory GPU 14 | DOWNSCALE_FACTOR=1 15 | 16 | if [ -z "$ROOT_DIR" ]; then 17 | echo "ROOT_DIR variable is not set." 18 | echo "Please set ROOT_DIR to the location of the THUMOS14 videos." 19 | echo "The directory must contain two subdirectories: valid and test" 20 | exit 1 21 | fi 22 | 23 | if [ -z "$NUM_GPUS" ]; then 24 | echo "NUM_GPUS variable is not set." 
25 | exit 1 26 | fi 27 | 28 | #################################################################################### 29 | ############################# PARAMETERS TO KEEP AS IS ############################# 30 | #################################################################################### 31 | 32 | TRAIN_SUBDIR=valid 33 | VALID_SUBDIR=test 34 | TRAIN_CSV_FILENAME=../data/thumos14/thumos14_valid_tsp_groundtruth.csv 35 | VALID_CSV_FILENAME=../data/thumos14/thumos14_test_tsp_groundtruth.csv 36 | LABEL_COLUMNS="action-label temporal-region-label" 37 | LABEL_MAPPING_JSONS="../data/thumos14/thumos14_action_label_mapping.json \ 38 | ../data/thumos14/thumos14_temporal_region_label_mapping.json" 39 | LOSS_ALPHAS="1.0 1.0" 40 | GLOBAL_VIDEO_FEATURES=../data/thumos14/global_video_features/r2plus1d_34-max_gvf.h5 41 | 42 | BACKBONE=r2plus1d_34 43 | 44 | BATCH_SIZE=32 45 | BACKBONE_LR=0.0001 46 | FC_LR=0.004 47 | 48 | OUTPUT_DIR=output/${BACKBONE}-tsp_on_thumos14/backbone_lr_${BACKBONE_LR}-fc_lr_${FC_LR}/ 49 | 50 | MY_MASTER_ADDR=127.0.0.1 51 | MY_MASTER_PORT=$(shuf -i 30000-60000 -n 1) 52 | 53 | # downscaling 54 | BATCH_SIZE=$(bc <<< $BATCH_SIZE/$DOWNSCALE_FACTOR) 55 | BACKBONE_LR=$(bc -l <<< $BACKBONE_LR/$DOWNSCALE_FACTOR) 56 | FC_LR=$(bc -l <<< $FC_LR/$DOWNSCALE_FACTOR) 57 | 58 | source activate tsp 59 | mkdir -p $OUTPUT_DIR 60 | export OMP_NUM_THREADS=6 61 | 62 | python -m torch.distributed.launch --nproc_per_node=$NUM_GPUS \ 63 | --master_addr $MY_MASTER_ADDR --master_port $MY_MASTER_PORT --use_env \ 64 | train.py \ 65 | --root-dir $ROOT_DIR \ 66 | --train-subdir $TRAIN_SUBDIR \ 67 | --valid-subdir $VALID_SUBDIR \ 68 | --train-csv-filename $TRAIN_CSV_FILENAME \ 69 | --valid-csv-filename $VALID_CSV_FILENAME \ 70 | --label-mapping-jsons $LABEL_MAPPING_JSONS \ 71 | --label-columns $LABEL_COLUMNS \ 72 | --loss-alphas $LOSS_ALPHAS \ 73 | --global-video-features $GLOBAL_VIDEO_FEATURES \ 74 | --backbone $BACKBONE \ 75 | --batch-size $BATCH_SIZE \ 76 | --backbone-lr $BACKBONE_LR \ 77 | --fc-lr $FC_LR \ 78 | --output-dir $OUTPUT_DIR \ 79 | -------------------------------------------------------------------------------- /video_backbone/TSP/train/untrimmed_video_dataset.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, print_function 2 | 3 | import os 4 | import pandas as pd 5 | import numpy as np 6 | import torch 7 | import h5py 8 | 9 | from torch.utils.data import Dataset 10 | from torchvision.io import read_video 11 | 12 | 13 | class UntrimmedVideoDataset(Dataset): 14 | ''' 15 | UntrimmedVideoDataset: 16 | This dataset takes in temporal segments from untrimmed videos and samples fixed-length 17 | clips from each segment. Each item in the dataset is a dictionary with the keys: 18 | - "clip": A Tensor (dtype=torch.float) of the clip frames after applying transforms 19 | - "label-Y": A label from the `label_columns` (one key for each label) or -1 if label is missing for that clip 20 | - "gvf": The global video feature (GVF) vector if `global_video_features` parameter is not None 21 | ''' 22 | 23 | def __init__(self, csv_filename, root_dir, clip_length, frame_rate, clips_per_segment, temporal_jittering, 24 | label_columns, label_mappings, seed=42, transforms=None, global_video_features=None, debug=False): 25 | ''' 26 | Args: 27 | csv_filename (string): Path to the CSV file with temporal segments information and annotations. 
28 | The CSV file must include the columns [filename, fps, t-start, t-end, video-duration] and 29 | the label columns given by the parameter `label_columns`. 30 | root_dir (string): Directory with all the video files. 31 | clip_length (int): The number of frames per clip. 32 | frame_rate (int): The effective frame rate (fps) to sample clips. 33 | clips_per_segment (int): The number of clips to sample per segment in the CSV file. 34 | temporal_jittering (bool): If True, clips are randomly sampled between t-start and t-end of 35 | each segment. Otherwise, clips are are sampled uniformly between t-start and t-end. 36 | seed (int): Seed of the random number generator used for the temporal jittering. 37 | transforms (callable): A function/transform that takes in a TxHxWxC video 38 | and returns a transformed version. 39 | label_columns (list of string): A list of the label columns in the CSV file. 40 | If more than one column is specified, the sample return a label for each. 41 | label_mappings (list of dict): A list of dictionaries to map the corresponding label 42 | from `label_columns` from a category string to an integer ID value. 43 | global_video_features (string): Path to h5 file containing global video features (optional) 44 | debug (bool): If true, create a debug dataset with 100 samples. 45 | ''' 46 | df = UntrimmedVideoDataset._clean_df_and_remove_short_segments(pd.read_csv(csv_filename), clip_length, frame_rate) 47 | self.df = UntrimmedVideoDataset._append_root_dir_to_filenames_and_check_files_exist(df, root_dir) 48 | self.clip_length = clip_length 49 | self.frame_rate = frame_rate 50 | self.clips_per_segment = clips_per_segment 51 | 52 | self.temporal_jittering = temporal_jittering 53 | self.rng = np.random.RandomState(seed=seed) 54 | self.uniform_sampling = np.linspace(0, 1, clips_per_segment) 55 | 56 | self.transforms = transforms 57 | 58 | self.label_columns = label_columns 59 | self.label_mappings = label_mappings 60 | for label_column, label_mapping in zip(label_columns, label_mappings): 61 | self.df[label_column] = self.df[label_column].map(lambda x: -1 if pd.isnull(x) else label_mapping[x]) 62 | 63 | self.global_video_features = global_video_features 64 | self.debug = debug 65 | 66 | def __len__(self): 67 | return len(self.df) * self.clips_per_segment if not self.debug else 100 68 | 69 | def __getitem__(self, idx): 70 | sample = {} 71 | row = self.df.iloc[idx % len(self.df)] 72 | filename, fps, t_start, t_end = row['filename'], row['fps'], row['t-start'], row['t-end'] 73 | 74 | # compute clip_t_start and clip_t_end 75 | clip_length_in_sec = self.clip_length / self.frame_rate 76 | ratio = self.rng.uniform() if self.temporal_jittering else self.uniform_sampling[idx//len(self.df)] 77 | clip_t_start = t_start + ratio * (t_end - t_start - clip_length_in_sec) 78 | clip_t_end = clip_t_start + clip_length_in_sec 79 | 80 | # get a tensor [clip_length, H, W, C] of the video frames between clip_t_start and clip_t_end seconds 81 | vframes, _, _ = read_video(filename=filename, start_pts=clip_t_start, end_pts=clip_t_end, pts_unit='sec') 82 | idxs = UntrimmedVideoDataset._resample_video_idx(self.clip_length, fps, self.frame_rate) 83 | vframes = vframes[idxs][:self.clip_length] # [:self.clip_length] for removing extra frames if isinstance(idxs, slice) 84 | if vframes.shape[0] != self.clip_length: 85 | raise RuntimeError(f': got clip of length {vframes.shape[0]} != {self.clip_length}.' 
86 | f'filename={filename}, clip_t_start={clip_t_start}, clip_t_end={clip_t_end}, ' 87 | f'fps={fps}, t_start={t_start}, t_end={t_end}') 88 | 89 | # apply transforms 90 | sample['clip'] = self.transforms(vframes) 91 | 92 | # add labels 93 | for label_column in self.label_columns: 94 | sample[label_column] = row[label_column] 95 | 96 | # add global video feature if it exists 97 | if self.global_video_features: 98 | f = h5py.File(self.global_video_features, 'r') 99 | sample['gvf'] = torch.tensor(f[os.path.basename(filename).split('.')[0]][()]) 100 | f.close() 101 | 102 | return sample 103 | 104 | @staticmethod 105 | def _clean_df_and_remove_short_segments(df, clip_length, frame_rate): 106 | # restrict all segments to be between [0, video-duration] 107 | df['t-end'] = np.minimum(df['t-end'], df['video-duration']) 108 | df['t-start'] = np.maximum(df['t-start'], 0) 109 | 110 | # remove segments that are too short to fit at least one clip 111 | segment_length = (df['t-end'] - df['t-start']) * frame_rate 112 | mask = segment_length >= clip_length 113 | num_segments = len(df) 114 | num_segments_to_keep = sum(mask) 115 | if num_segments - num_segments_to_keep > 0: 116 | df = df[mask].reset_index(drop=True) 117 | print(f': removed {num_segments - num_segments_to_keep}=' 118 | f'{100*(1 - num_segments_to_keep/num_segments):.2f}% from the {num_segments} ' 119 | f'segments from the input CSV file because they are shorter than ' 120 | f'clip_length={clip_length} frames using frame_rate={frame_rate} fps.') 121 | 122 | return df 123 | 124 | @staticmethod 125 | def _append_root_dir_to_filenames_and_check_files_exist(df, root_dir): 126 | df['filename'] = df['filename'].map(lambda f: os.path.join(root_dir, f)) 127 | filenames = df.drop_duplicates('filename')['filename'].values 128 | for f in filenames: 129 | if not os.path.exists(f): 130 | raise ValueError(f': file={f} does not exists. 
' 131 | f'Double-check root_dir and csv_filename inputs.') 132 | return df 133 | 134 | @staticmethod 135 | def _resample_video_idx(num_frames, original_fps, new_fps): 136 | step = float(original_fps) / new_fps 137 | if step.is_integer(): 138 | # optimization: if step is integer, don't need to perform 139 | # advanced indexing 140 | step = int(step) 141 | return slice(None, None, step) 142 | idxs = torch.arange(num_frames, dtype=torch.float32) * step 143 | idxs = idxs.floor().to(torch.int64) 144 | return idxs 145 | -------------------------------------------------------------------------------- /video_backbone/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ttengwang/PDVC/0b025c84f42fe27da51c312e8871c4b19628a04c/video_backbone/__init__.py -------------------------------------------------------------------------------- /visualization/Arial.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ttengwang/PDVC/0b025c84f42fe27da51c312e8871c4b19628a04c/visualization/Arial.ttf -------------------------------------------------------------------------------- /visualization/NotoSansCJK-Bold.otf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ttengwang/PDVC/0b025c84f42fe27da51c312e8871c4b19628a04c/visualization/NotoSansCJK-Bold.otf -------------------------------------------------------------------------------- /visualization/videos/xukun.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ttengwang/PDVC/0b025c84f42fe27da51c312e8871c4b19628a04c/visualization/videos/xukun.mp4 -------------------------------------------------------------------------------- /visualization/visualization.py: -------------------------------------------------------------------------------- 1 | # from IPython.display import clear_output, Image, display, HTML 2 | # %matplotlib notebook 3 | import matplotlib.pyplot as plt 4 | import time 5 | import numpy as np 6 | import cv2 7 | import base64 8 | import json 9 | from PIL import Image, ImageFont, ImageDraw 10 | import pdb 11 | import argparse 12 | import os 13 | from tqdm import tqdm 14 | 15 | 16 | def get_frame_caption(frame_time, dense_captions, ranking=False): 17 | frame_captions = [] 18 | idx_list = [] 19 | for event in dense_captions: 20 | s, e = event['timestamp'] 21 | if frame_time >= s and frame_time <= e: 22 | frame_captions.append(event) 23 | idx_list.append(event['original_id']) 24 | temperature = 1 25 | if ranking: 26 | scorer = lambda p: p['sentence_score'] / (float(len(p['sentence'].split())) ** (temperature) + 1e-5) + \ 27 | 1.0 * p['proposal_score'] * (1 - np.abs(frame_time - 0.5 * (p['timestamp'][0] + p['timestamp'][1])) / ( 28 | p['timestamp'][1] - p['timestamp'][0] + 1e-8)) 29 | 30 | frame_captions = sorted(frame_captions, key=scorer, reverse=True) 31 | else: 32 | frame_captions = sorted(frame_captions, key=lambda p: p['timestamp']) 33 | return frame_captions, idx_list 34 | 35 | 36 | def paint_text(im, chinese, font, pos, color): 37 | img_PIL = Image.fromarray(cv2.cvtColor(im, cv2.COLOR_BGR2RGB)) 38 | fillColor = color # (255,0,0) 39 | position = pos # (100,100) 40 | if not isinstance(chinese, str): 41 | chinese = chinese.decode('utf-8') 42 | draw = ImageDraw.Draw(img_PIL) 43 | draw.text(position, chinese, font=font, fill=fillColor) 44 | 45 | img = cv2.cvtColor(np.asarray(img_PIL), 
cv2.COLOR_RGB2BGR) 46 | return img 47 | 48 | def processImg(img, cur_time, title, dense_captions, prop_idx, n_caption=3, output_language='en'): 49 | scale = 1.0 50 | basic_text_height = 50 51 | text_height = int(basic_text_height * scale) 52 | font_size = int(text_height * 0.8) 53 | 54 | h, w, c = img.shape 55 | last_time = cur_time 56 | cur_time = time.time() 57 | img_fps = 1. / (cur_time - last_time + 1e-8) 58 | bg_img = np.zeros_like(img) 59 | cv2.rectangle(bg_img, (0, 0), (len(title) * text_height // 2, text_height), (120, 120, 120), -1, 1, 0) 60 | cv2.rectangle(bg_img, (0, h - text_height * n_caption), (w, h), (120, 120, 120), -1, 1, 0) 61 | mask = bg_img / 255. 62 | alpha = 0.5 63 | img = img * (mask == 0) + alpha * img * (mask > 0) + (1 - alpha) * mask 64 | img = img.astype('uint8') 65 | if output_language == 'zh-cn': 66 | font = ImageFont.truetype('visualization/NotoSansCJK-Bold.otf', font_size) 67 | elif output_language == 'en': 68 | font = ImageFont.truetype("visualization/Arial.ttf", font_size) 69 | else: 70 | font = ImageFont.truetype("/path/to/your.font.ttf", font_size) 71 | img = paint_text(img, title, font, (10, 0), color=(255, 255, 255)) 72 | for i, (proposal) in enumerate(dense_captions): 73 | caption, timestamp = proposal['sentence'], proposal['timestamp'] 74 | caption = '{:2.1f}s-{:2.1f}s: {}'.format(timestamp[0], timestamp[1], caption) 75 | ptText = (10, h - text_height * n_caption + i * text_height) 76 | if i in prop_idx: 77 | img = paint_text(img, caption, font, ptText, color=(255, 0, 0)) 78 | else: 79 | img = paint_text(img, caption, font, ptText, color=(255, 255, 255)) 80 | 81 | return img, cur_time, img_fps 82 | 83 | def vid_show(vid_path, captions, save_mp4, save_mp4_path, output_language='en'): 84 | start_time = time.time() 85 | cur_time = time.time() 86 | video = cv2.VideoCapture(vid_path) 87 | fps = video.get(cv2.CAP_PROP_FPS) 88 | frame_count = video.get(cv2.CAP_PROP_FRAME_COUNT) 89 | duration = frame_count / fps 90 | print('fps: {}, duration: {}, frames: {}'.format(fps, duration, frame_count)) 91 | img_fps = fps 92 | n = 0 93 | if save_mp4: 94 | fourcc = cv2.VideoWriter_fourcc(*"mp4v") 95 | videoWriter = cv2.VideoWriter(save_mp4_path, fourcc, fps, (1280, 720)) 96 | 97 | if not output_language == 'en': 98 | for proposal in captions: 99 | caption = translator.translate(proposal['sentence'], lang_src='en', lang_tgt=output_language) 100 | proposal['sentence'] = caption 101 | for i, proposal in enumerate(captions): 102 | proposal['original_id'] = i 103 | captions = sorted(captions, key=lambda p: p['timestamp']) 104 | 105 | for frame_id in tqdm(range(int(frame_count))): 106 | ret, frame = video.read() 107 | if n >= int(fps / img_fps) or save_mp4: 108 | n = 0 109 | # clear_output(wait=True) 110 | else: 111 | n += 1 112 | continue 113 | if not ret: 114 | break 115 | lines, columns, _ = frame.shape 116 | frame = cv2.resize(frame, (1280, 720)) 117 | frame_time = frame_id / fps 118 | if opt.show_all_caption_per_frame: 119 | frame_captions, highlight_idx = get_frame_caption(frame_time, captions, ranking=False) 120 | captions_to_show = captions 121 | n_caption = len(captions) 122 | else: 123 | frame_captions, highlight_idx = get_frame_caption(frame_time, captions, ranking=True) 124 | captions_to_show = frame_captions 125 | n_caption = min(3, len(captions_to_show)) 126 | 127 | title = '{:.1f}s/{:.1f}s'.format(frame_time, duration) 128 | frame, cur_time, img_fps = processImg(frame, cur_time, title, captions_to_show, highlight_idx, output_language=output_language, 
n_caption=n_caption) 129 | if not save_mp4: 130 | plt.axis('off') 131 | plt.imshow(frame[:, :, ::-1]) 132 | plt.show() 133 | # control fps 134 | if save_mp4: 135 | videoWriter.write(frame) 136 | 137 | if save_mp4: 138 | videoWriter.release() 139 | print('output videos saved at {}, process time: {} s'.format(save_mp4_path, cur_time - start_time)) 140 | 141 | 142 | if __name__ == '__main__': 143 | parser = argparse.ArgumentParser() 144 | parser.add_argument('--output_language', type=str, default='en', 145 | help='refer to /path/to/miniconda3/envs/PDVC/lib/python3.7/site-packages/google_trans_new/constant.py for more information') 146 | parser.add_argument('--output_mp4_folder', type=str, default=None) 147 | parser.add_argument('--input_mp4_folder', type=str, required=True) 148 | parser.add_argument('--dvc_file', type=str, required=True) 149 | parser.add_argument('--show_all_caption_per_frame', type=int, default=False) 150 | opt = parser.parse_args() 151 | if not opt.output_language == 'en': 152 | from google_trans_new import google_translator 153 | translator = google_translator() 154 | d = json.load(open(opt.dvc_file))['results'] 155 | for vid, dense_captions in d.items(): 156 | if opt.output_mp4_folder is None: 157 | opt.output_mp4_folder = opt.input_mp4_folder + '_output' 158 | if not os.path.exists(opt.output_mp4_folder): 159 | os.mkdir(opt.output_mp4_folder) 160 | output_mp4_path = os.path.join(opt.output_mp4_folder, vid + '.mp4') 161 | 162 | input_mp4_path = os.path.join(opt.input_mp4_folder, vid + '.mp4') 163 | print('process video: {} --> output: {}'.format(input_mp4_path, output_mp4_path)) 164 | if not os.path.exists(input_mp4_path): 165 | print('video {} does not exist, skip it.'.format(input_mp4_path)) 166 | continue 167 | vid_show(input_mp4_path, dense_captions, save_mp4=True, save_mp4_path=output_mp4_path, 168 | output_language=opt.output_language) 169 | -------------------------------------------------------------------------------- /visualization/xukun_cn.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ttengwang/PDVC/0b025c84f42fe27da51c312e8871c4b19628a04c/visualization/xukun_cn.gif -------------------------------------------------------------------------------- /visualization/xukun_en.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ttengwang/PDVC/0b025c84f42fe27da51c312e8871c4b19628a04c/visualization/xukun_en.gif --------------------------------------------------------------------------------
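A minimal usage sketch for the visualization script above. The flags mirror the argparse block in visualization.py; the prediction and output paths below are placeholders for your own dense-captioning result JSON (whose 'results' field maps each video id to its predicted events) and an output folder of your choice. Run it from the repository root so the bundled fonts under visualization/ are found:

python visualization/visualization.py \
    --input_mp4_folder visualization/videos \
    --output_mp4_folder visualization/videos_output \
    --dvc_file path/to/your_dvc_predictions.json \
    --output_language en \
    --show_all_caption_per_frame 0

Captioned videos are written to the output folder as <video_id>.mp4. Any --output_language other than en additionally requires the google_trans_new package imported in the __main__ block.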