├── .gitattributes ├── README.md ├── args.py ├── args_kmeans.py ├── attention.py ├── cvpr19_dataloader.py ├── eval.py ├── eval_avlnet.sh ├── eval_cross.py ├── gen_loader.py ├── hmdb_dataloader.py ├── local_eval.py ├── loss.py ├── loss_mil.py ├── loss_sink.py ├── lsmdc_dataloader.py ├── metrics.py ├── minY_dataloader.py ├── model.py ├── model_davenet.py ├── model_kmeans_ICCV.py ├── model_tri_c.py ├── model_tri_c_clean_sp.py ├── model_tri_kmeans.py ├── msrvtt_dataloader.py ├── script.txt ├── train_avlnet.sh ├── train_tri_c.py ├── train_tri_cos_mil.py ├── train_tri_kmeans.py ├── ucf_dataloader.py ├── video_evaluation.py ├── youcook_dataloader.py ├── youtube_dataloader.py └── youtube_mil_dataloader.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Multimodal-Clustering-Network 2 | ICCV 2021 3 | 4 | This repo contains the implementation of our ICCV 2021 paper "Multimodal Clustering Networks for Self-supervised Learning from Unlabeled Videos": https://arxiv.org/abs/2104.12671. 5 | 6 | 7 | 8 | Command for pretraining: 9 | 10 | ``` 11 | 12 | model1=MCN_sep_recon_r 13 | 14 | python -u train_tri_kmeans.py --num_thread_reader=74 --epochs=30 --batch_size=128 \ 15 | --n_pair=32 --embd_dim=6144 --howto_audio_frames=1000 --min_time=10.0 --random_audio_windows=0 \ 16 | --lr=0.0001 --tri_modal=1 --apex_level=1 --kmeans=1 --use_queue=1 --queue_size=20 --fastC=1 --recon=1 --recon_size=1024 \ 17 | --features_path=/nobackup/users/kaudhkha/sightsound/data/howto/parsed_videos \ 18 | --features_path_audio=/nobackup/projects/public/howto100m/parsed_videos \ 19 | --checkpoint_dir=model_me/$model1 >> logs/$model1 20 | ``` 21 | Pretrained weights (an example evaluation command using them is sketched in a comment in the args.py section below): 22 | https://drive.google.com/drive/folders/1J8v3Ya_H9ciX1KsLUtlqeiGaSjqVbp7j?usp=sharing -------------------------------------------------------------------------------- /args.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | def get_args(description='Youtube-Text-Video'): 4 | parser = argparse.ArgumentParser(description=description) 5 | parser.add_argument( 6 | '--train_csv', 7 | type=str, 8 | default='data/HowTo100M_1166_videopaths.txt', 9 | #default='/home/brian27/nobackup/data/howto100m/HowTo100M_1166_videopaths.txt', 10 | help='train csv') 11 | parser.add_argument( 12 | '--features_path', 13 | type=str, 14 | default='parsed_videos/', 15 | help='path for visual (2D, 3D) features') 16 | parser.add_argument( 17 | '--features_path_audio', 18 | type=str, 19 | default='', 20 | help='path for audio files (defaults to --features_path)') 21 | parser.add_argument( 22 | '--caption_path', 23 | type=str, 24 | default='data/caption.pickle', 25 | help='HowTo100M caption pickle file path') 26 | parser.add_argument( 27 | '--word2vec_path', 28 | type=str, 29 | default='data/GoogleNews-vectors-negative300.bin', 30 | help='word embedding path') 31 | parser.add_argument( 32 | '--pretrain_path', 33 | type=str, 34 | default='', 35 | help='pre-trained model path') 36 | parser.add_argument( 37 | '--checkpoint_dir', 38 | type=str, 39 | default='', 40 | help='checkpoint model folder') 41 | parser.add_argument('--eval_lang_retrieval', type=int, default=0, 42 | help='if 1, eval language retrieval instead of video retrieval')
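# ---------------------------------------------------------------------------
# Editorial usage sketch (not part of the original args.py): how the released
# weights linked in the README can be plugged into the evaluation path. The
# flag values below mirror the eval commands in eval_avlnet.sh; the
# --pretrain_path value is a placeholder for wherever the downloaded
# checkpoint is stored, and --num_thread_reader should match your machine.
# The --features_path / --features_path_audio flags used in eval_avlnet.sh
# may also be required depending on which evaluation set is selected.
#
#   python train_tri_kmeans.py --eval_youcook=1 --epochs=0 --batch_size=512 \
#       --num_thread_reader=74 --n_pair=32 --embd_dim=6144 \
#       --howto_audio_frames=1000 --min_time=10.0 --random_audio_windows=0 \
#       --tri_modal=1 --kmeans=1 --use_queue=1 --queue_size=20 --fastC=1 \
#       --mean=1 --recon=0 --recon_size=1024 \
#       --pretrain_path=<path/to/downloaded_checkpoint.pth>
# ---------------------------------------------------------------------------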
43 | parser.add_argument('--tri_modal', type=int, default=0, 44 | help='use vision, speech, and text') 45 | parser.add_argument('--tri_modal_fuse', type=int, default=0, 46 | help='use speech and text features (tri_modal must be 1)') 47 | parser.add_argument('--fuse_videoaudio_additive', type=int, default=0, 48 | help='eval T->A+V with tri-modal model \ 49 | set tri_modal=1, tri_modal_fuse=0') 50 | parser.add_argument('--loss', type=int, default=0, 51 | help='0 for Masked Margin Softmax (MMS) loss') 52 | parser.add_argument('--apex_level', type=int, default=0, 53 | help='Apex (mixed precision) level: choose 0 for none, 1 for O1.') 54 | parser.add_argument('--random_audio_windows', type=int, default=1, 55 | help='1 to use random audio windows, 0 to use HowTo100M ASR clips') 56 | parser.add_argument('--howto_audio_frames', type=int, default=1024, 57 | help='number of frames to use for loading howto100m audio') 58 | parser.add_argument('--youcook_num_frames_multiplier', type=int, default=5, 59 | help='use 1024 * x audio frames for youcook2') 60 | parser.add_argument('--msrvtt_num_frames_multiplier', type=int, default=3, 61 | help='use 1024 * x audio frames for msrvtt') 62 | parser.add_argument('--lsmdc_num_frames_multiplier', type=int, default=3, 63 | help='use 1024 * x audio frames for lsmdc') 64 | parser.add_argument('--num_thread_reader', type=int, default=1, 65 | help='') 66 | parser.add_argument('--embd_dim', type=int, default=2048, 67 | help='embedding dim') 68 | parser.add_argument('--lr', type=float, default=0.0001, 69 | help='initial learning rate') 70 | parser.add_argument('--epochs', type=int, default=20, 71 | help='upper epoch limit') 72 | parser.add_argument('--batch_size', type=int, default=256, 73 | help='batch size') 74 | parser.add_argument('--batch_size_val', type=int, default=3500, 75 | help='batch size eval') 76 | parser.add_argument('--lr_decay', type=float, default=0.9, 77 | help='Learning rate exp epoch decay') 78 | parser.add_argument('--n_display', type=int, default=200, 79 | help='Information display frequency') 80 | parser.add_argument('--feature_dim', type=int, default=4096, 81 | help='video feature dimension') 82 | parser.add_argument('--we_dim', type=int, default=300, 83 | help='word embedding dimension') 84 | parser.add_argument('--seed', type=int, default=1, 85 | help='random seed') 86 | parser.add_argument('--verbose', type=int, default=1, 87 | help='') 88 | parser.add_argument('--max_words', type=int, default=20, 89 | help='') 90 | parser.add_argument('--min_words', type=int, default=0, 91 | help='') 92 | parser.add_argument('--feature_framerate', type=int, default=1, 93 | help='') 94 | parser.add_argument('--min_time', type=float, default=5.0, 95 | help='Gather small clips') 96 | parser.add_argument('--n_pair', type=int, default=1, 97 | help='Number of video clips to use per video') 98 | parser.add_argument('--lsmdc', type=int, default=0, 99 | help='Train on LSMDC data') 100 | parser.add_argument('--youcook', type=int, default=0, 101 | help='Train on YouCook2 data') 102 | parser.add_argument('--msrvtt', type=int, default=0, 103 | help='Train on MSRVTT data') 104 | parser.add_argument('--eval_lsmdc', type=int, default=0, 105 | help='Evaluate on LSMDC data') 106 | parser.add_argument('--eval_msrvtt', type=int, default=0, 107 | help='Evaluate on MSRVTT data') 108 | parser.add_argument('--eval_youcook', type=int, default=0, 109 | help='Evaluate on YouCook2 data') 110 | parser.add_argument('--eval_how', type=int, default=0, 111 | help='Evaluate on how2 data') 112 |
parser.add_argument('--sentence_dim', type=int, default=-1, 113 | help='sentence dimension') 114 | parser.add_argument('--cluster', type=int, default=0, 115 | help='cluster loss') 116 | parser.add_argument('--queue_size', type=int, default=3, 117 | help='queue size') 118 | parser.add_argument('--start_queue', type=int, default=0, 119 | help='start_queue') 120 | parser.add_argument('--start_cluster', type=int, default=0, 121 | help='start_cluster') 122 | parser.add_argument('--num_candidates', type=int, default=1, 123 | help='num candidates for MILNCE loss') 124 | parser.add_argument('--use_queue', type=int, default=0, 125 | help='use_queue') 126 | parser.add_argument('--cluster_size', type=int, default=256, 127 | help='cluster_size') 128 | parser.add_argument('--layer', type=int, default=0, 129 | help='classification layer') 130 | parser.add_argument('--soft_label', type=int, default=0, 131 | help='soft_label') 132 | parser.add_argument('--multi_cluster', type=int, default=0, 133 | help='multi_cluster') 134 | parser.add_argument('--pure_cluster', type=int, default=0, 135 | help='pure_cluster') 136 | parser.add_argument('--project', type=int, default=0, 137 | help='project') 138 | parser.add_argument('--proto_nce', type=int, default=0, 139 | help='proto_nce') 140 | parser.add_argument('--switch_loss_h', type=int, default=0, 141 | help='switch_loss_h') 142 | parser.add_argument('--switch_loss_s', type=int, default=0, 143 | help='switch_loss_s') 144 | parser.add_argument('--self_prediction', type=int, default=0, 145 | help='self_prediction') 146 | parser.add_argument('--soft_contrast', type=int, default=0, 147 | help='soft_contrast') 148 | parser.add_argument('--soft_contrast_only', type=int, default=0, 149 | help='soft_contrast_only') 150 | parser.add_argument('--nce', type=int, default=0, 151 | help='nce') 152 | parser.add_argument('--nce_only', type=int, default=0, 153 | help='nce_only') 154 | parser.add_argument('--pseudo_contrast', type=int, default=0, 155 | help='pseudo_contrast') 156 | parser.add_argument('--cooperative', type=int, default=0, 157 | help='cooperative') 158 | parser.add_argument('--project_dim', type=int, default=6000, 159 | help='project_dim') 160 | parser.add_argument('--no_audio', type=int, default=0, 161 | help='no_audio') 162 | parser.add_argument('--no_video', type=int, default=0, 163 | help='no_video') 164 | parser.add_argument('--no_va', type=int, default=0, 165 | help='no_va') 166 | parser.add_argument('--rand', type=int, default=0, 167 | help='random drop') 168 | parser.add_argument('--joint', type=int, default=0, 169 | help='joint cluster') 170 | parser.add_argument('--kmeans', type=int, default=0, 171 | help='kmeans cluster') 172 | parser.add_argument('--fastC', type=int, default=0, 173 | help='fast cluster') 174 | parser.add_argument('--withMLP', type=int, default=0, 175 | help='withMLP cluster') 176 | parser.add_argument('--recon', type=int, default=0, 177 | help='recon ') 178 | parser.add_argument('--mms', type=int, default=0, 179 | help='mms ') 180 | parser.add_argument('--mean', type=int, default=0, 181 | help='mean ') 182 | parser.add_argument('--lamb', type=float, default=0.5, 183 | help='lambda ') 184 | parser.add_argument('--tri_loss', type=int, default=0, 185 | help='tri_loss ') 186 | parser.add_argument('--recon_size', type=int, default=768, 187 | help='recon_size ') 188 | parser.add_argument('--clu_lamb', type=int, default=1, 189 | help='clu_lamb ') 190 | parser.add_argument('--noC', type=int, default=0, 191 | help='noC ') 192 | 
parser.add_argument('--cos', type=int, default=1, 193 | help='cos ') 194 | parser.add_argument("--base_lr", default=4.8, type=float, help="base learning rate") 195 | parser.add_argument("--final_lr", type=float, default=0, help="final learning rate") 196 | parser.add_argument("--freeze_prototypes_niters", default=313, type=int, 197 | help="freeze the prototypes during this many iterations from the start") 198 | parser.add_argument("--wd", default=1e-6, type=float, help="weight decay") 199 | parser.add_argument("--warmup_epochs", default=10, type=int, help="number of warmup epochs") 200 | parser.add_argument("--start_warmup", default=0, type=float, 201 | help="initial warmup learning rate") 202 | parser.add_argument('--warmup_steps', type=int, default=5000, 203 | help='') 204 | parser.add_argument( 205 | '--youcook_train_path', 206 | type=str, 207 | default='data/youcook_train_audio.pkl', 208 | help='') 209 | parser.add_argument( 210 | '--youcook_val_path', 211 | type=str, 212 | default='data/youcook_val_audio.pkl', 213 | help='') 214 | parser.add_argument( 215 | '--msrvtt_test_path', 216 | type=str, 217 | default='data/msrvtt_jsfusion_test.pkl', 218 | help='') 219 | parser.add_argument( 220 | '--msrvtt_train_path', 221 | type=str, 222 | default='data/msrvtt_train.pkl', 223 | help='') 224 | parser.add_argument( 225 | '--lsmdc_test_path', 226 | type=str, 227 | default='data/lsmdc_test.pkl', 228 | help='') 229 | parser.add_argument( 230 | '--lsmdc_train_path', 231 | type=str, 232 | default='data/lsmdc_train.pkl', 233 | help='') 234 | args = parser.parse_args() 235 | return args -------------------------------------------------------------------------------- /args_kmeans.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | def get_args(description='Youtube-Text-Video'): 4 | parser = argparse.ArgumentParser(description=description) 5 | parser.add_argument( 6 | '--train_csv', 7 | type=str, 8 | default='data/HowTo100M_1166_videopaths.txt', 9 | #default='/home/brian27/nobackup/data/howto100m/HowTo100M_1166_videopaths.txt', 10 | help='train csv') 11 | parser.add_argument( 12 | '--features_path', 13 | type=str, 14 | default='parsed_videos/', 15 | help='path for visual features (2D, 3D) visual features') 16 | parser.add_argument( 17 | '--features_path_audio', 18 | type=str, 19 | default='', 20 | help='path for audio files (defaults to --features_path)') 21 | parser.add_argument( 22 | '--caption_path', 23 | type=str, 24 | default='data/caption.pickle', 25 | help='HowTo100M caption pickle file path') 26 | parser.add_argument( 27 | '--word2vec_path', 28 | type=str, 29 | default='data/GoogleNews-vectors-negative300.bin', 30 | help='word embedding path') 31 | parser.add_argument( 32 | '--pretrain_path', 33 | type=str, 34 | default='', 35 | help='pre train model path') 36 | parser.add_argument( 37 | '--checkpoint_dir', 38 | type=str, 39 | default='', 40 | help='checkpoint model folder') 41 | parser.add_argument('--eval_lang_retrieval', type=int, default=0, 42 | help='if 1, eval language retrieval instead of video retrieval') 43 | parser.add_argument('--tri_modal', type=int, default=0, 44 | help='use vision, speech, and text') 45 | parser.add_argument('--tri_modal_fuse', type=int, default=0, 46 | help='use speech and text features (tri_modal must be 1)') 47 | parser.add_argument('--fuse_videoaudio_additive', type=int, default=0, 48 | help='eval T->A+V with tri-modal modal \ 49 | set tri_modal=1, tri_modal_fuse=0') 50 | parser.add_argument('--loss', 
type=int, default=0, 51 | help='0 for Masked Margin Softmax (MMS) loss') 52 | parser.add_argument('--apex_level', type=int, default=0, 53 | help='Apex (mixed precision) level: chose 0 for none, 1 for O1.') 54 | parser.add_argument('--random_audio_windows', type=int, default=1, 55 | help='1 to use random audio windows, 0 to use HowTo100M ASR clips') 56 | parser.add_argument('--howto_audio_frames', type=int, default=1024, 57 | help='number of frames to use for loading howto100m audio') 58 | parser.add_argument('--youcook_num_frames_multiplier', type=int, default=5, 59 | help='use 1024 * x audio frames for youcook2') 60 | parser.add_argument('--msrvtt_num_frames_multiplier', type=int, default=3, 61 | help='use 1024 * x audio frames for msrvtt') 62 | parser.add_argument('--lsmdc_num_frames_multiplier', type=int, default=3, 63 | help='use 1024 * x audio frames for lsmdc') 64 | parser.add_argument('--num_thread_reader', type=int, default=1, 65 | help='') 66 | parser.add_argument('--embd_dim', type=int, default=2048, 67 | help='embedding dim') 68 | parser.add_argument('--lr', type=float, default=0.0001, 69 | help='initial learning rate') 70 | parser.add_argument('--epochs', type=int, default=20, 71 | help='upper epoch limit') 72 | parser.add_argument('--batch_size', type=int, default=256, 73 | help='batch size') 74 | parser.add_argument('--batch_size_val', type=int, default=3500, 75 | help='batch size eval') 76 | parser.add_argument('--lr_decay', type=float, default=0.9, 77 | help='Learning rate exp epoch decay') 78 | parser.add_argument('--n_display', type=int, default=200, 79 | help='Information display frequence') 80 | parser.add_argument('--feature_dim', type=int, default=4096, 81 | help='video feature dimension') 82 | parser.add_argument('--we_dim', type=int, default=300, 83 | help='word embedding dimension') 84 | parser.add_argument('--seed', type=int, default=1, 85 | help='random seed') 86 | parser.add_argument('--verbose', type=int, default=1, 87 | help='') 88 | parser.add_argument('--max_words', type=int, default=20, 89 | help='') 90 | parser.add_argument('--min_words', type=int, default=0, 91 | help='') 92 | parser.add_argument('--feature_framerate', type=int, default=1, 93 | help='') 94 | parser.add_argument('--min_time', type=float, default=5.0, 95 | help='Gather small clips') 96 | parser.add_argument('--n_pair', type=int, default=1, 97 | help='Number of video clips to use per video') 98 | parser.add_argument('--lsmdc', type=int, default=0, 99 | help='Train on LSDMC data') 100 | parser.add_argument('--youcook', type=int, default=0, 101 | help='Train on YouCook2 data') 102 | parser.add_argument('--msrvtt', type=int, default=0, 103 | help='Train on MSRVTT data') 104 | parser.add_argument('--eval_lsmdc', type=int, default=0, 105 | help='Evaluate on LSMDC data') 106 | parser.add_argument('--eval_msrvtt', type=int, default=0, 107 | help='Evaluate on MSRVTT data') 108 | parser.add_argument('--eval_youcook', type=int, default=0, 109 | help='Evaluate on YouCook2 data') 110 | parser.add_argument('--eval_ucf', type=int, default=0, 111 | help='Evaluate on UCF-101 data') 112 | parser.add_argument('--eval_hmdb', type=int, default=0, 113 | help='Evaluate on HMDB data') 114 | parser.add_argument('--eval_cross', type=int, default=0, 115 | help='Evaluate on CrossTask data') 116 | parser.add_argument('--eval_how', type=int, default=0, 117 | help='Evaluate on how2 data') 118 | parser.add_argument('--sentence_dim', type=int, default=-1, 119 | help='sentence dimension') 120 | 
parser.add_argument('--cluster', type=int, default=0, 121 | help='cluster loss') 122 | parser.add_argument('--queue_size', type=int, default=3, 123 | help='queue size') 124 | parser.add_argument('--start_queue', type=int, default=0, 125 | help='start_queue') 126 | parser.add_argument('--start_cluster', type=int, default=0, 127 | help='start_cluster') 128 | parser.add_argument('--num_candidates', type=int, default=1, 129 | help='num candidates for MILNCE loss') 130 | parser.add_argument('--use_queue', type=int, default=0, 131 | help='use_queue') 132 | parser.add_argument('--cluster_size', type=int, default=256, 133 | help='cluster_size') 134 | parser.add_argument('--layer', type=int, default=0, 135 | help='classification layer') 136 | parser.add_argument('--soft_label', type=int, default=0, 137 | help='soft_label') 138 | parser.add_argument('--multi_cluster', type=int, default=0, 139 | help='multi_cluster') 140 | parser.add_argument('--pure_cluster', type=int, default=0, 141 | help='pure_cluster') 142 | parser.add_argument('--project', type=int, default=0, 143 | help='project') 144 | parser.add_argument('--proto_nce', type=int, default=0, 145 | help='proto_nce') 146 | parser.add_argument('--switch_loss_h', type=int, default=0, 147 | help='switch_loss_h') 148 | parser.add_argument('--switch_loss_s', type=int, default=0, 149 | help='switch_loss_s') 150 | parser.add_argument('--self_prediction', type=int, default=0, 151 | help='self_prediction') 152 | parser.add_argument('--soft_contrast', type=int, default=0, 153 | help='soft_contrast') 154 | parser.add_argument('--soft_contrast_only', type=int, default=0, 155 | help='soft_contrast_only') 156 | parser.add_argument('--nce', type=int, default=0, 157 | help='nce') 158 | parser.add_argument('--nce_only', type=int, default=0, 159 | help='nce_only') 160 | parser.add_argument('--pseudo_contrast', type=int, default=0, 161 | help='pseudo_contrast') 162 | parser.add_argument('--cooperative', type=int, default=0, 163 | help='cooperative') 164 | parser.add_argument('--project_dim', type=int, default=6000, 165 | help='project_dim') 166 | parser.add_argument('--no_audio', type=int, default=0, 167 | help='no_audio') 168 | parser.add_argument('--no_va', type=int, default=0, 169 | help='no_va') 170 | parser.add_argument('--rand', type=int, default=0, 171 | help='random drop') 172 | parser.add_argument('--joint', type=int, default=0, 173 | help='joint cluster') 174 | parser.add_argument('--kmeans', type=int, default=0, 175 | help='kmeans cluster') 176 | parser.add_argument('--fastC', type=int, default=0, 177 | help='fast cluster') 178 | parser.add_argument('--withMLP', type=int, default=0, 179 | help='withMLP cluster') 180 | parser.add_argument('--recon', type=int, default=0, 181 | help='recon ') 182 | parser.add_argument('--mms', type=int, default=0, 183 | help='mms ') 184 | parser.add_argument('--mean', type=int, default=0, 185 | help='mean ') 186 | parser.add_argument('--lamb', type=float, default=0.5, 187 | help='lambda ') 188 | parser.add_argument('--tri_loss', type=int, default=0, 189 | help='tri_loss ') 190 | parser.add_argument('--recon_size', type=int, default=768, 191 | help='recon_size ') 192 | parser.add_argument('--clu_lamb', type=int, default=1, 193 | help='clu_lamb ') 194 | parser.add_argument('--noC', type=int, default=0, 195 | help='noC ') 196 | parser.add_argument('--cos', type=int, default=1, 197 | help='cos ') 198 | parser.add_argument("--base_lr", default=4.8, type=float, help="base learning rate") 199 | parser.add_argument("--final_lr", 
type=float, default=0, help="final learning rate") 200 | parser.add_argument("--freeze_prototypes_niters", default=313, type=int, 201 | help="freeze the prototypes during this many iterations from the start") 202 | parser.add_argument("--wd", default=1e-6, type=float, help="weight decay") 203 | parser.add_argument("--warmup_epochs", default=10, type=int, help="number of warmup epochs") 204 | parser.add_argument("--start_warmup", default=0, type=float, 205 | help="initial warmup learning rate") 206 | parser.add_argument('--warmup_steps', type=int, default=5000, 207 | help='') 208 | parser.add_argument( 209 | '--youcook_train_path', 210 | type=str, 211 | default='data/youcook_train_audio.pkl', 212 | help='') 213 | parser.add_argument( 214 | '--youcook_val_path', 215 | type=str, 216 | default='data/youcook_val_audio.pkl', 217 | help='') 218 | parser.add_argument( 219 | '--msrvtt_test_path', 220 | type=str, 221 | default='data/msrvtt_jsfusion_test.pkl', 222 | help='') 223 | parser.add_argument( 224 | '--msrvtt_train_path', 225 | type=str, 226 | default='data/msrvtt_train.pkl', 227 | help='') 228 | parser.add_argument( 229 | '--lsmdc_test_path', 230 | type=str, 231 | default='data/lsmdc_test.pkl', 232 | help='') 233 | parser.add_argument( 234 | '--lsmdc_train_path', 235 | type=str, 236 | default='data/lsmdc_train.pkl', 237 | help='') 238 | parser.add_argument( 239 | '--ucf_test_path', 240 | type=str, 241 | default='data/UCF101_data.pkl', 242 | help='') 243 | parser.add_argument( 244 | '--hmdb_test_path', 245 | type=str, 246 | default='data/HMDB_data.pkl', 247 | help='') 248 | args = parser.parse_args() 249 | return args -------------------------------------------------------------------------------- /attention.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from torch.nn import functional as F 3 | import torch 4 | import math 5 | 6 | 7 | class MultiHeadAttention(nn.Module): 8 | def __init__(self, heads, d_model, dropout = 0.1): 9 | super().__init__() 10 | 11 | self.d_model = d_model 12 | self.d_k = d_model // heads 13 | self.h = heads 14 | 15 | self.q_linear = nn.Linear(d_model, d_model) 16 | self.v_linear = nn.Linear(d_model, d_model) 17 | self.k_linear = nn.Linear(d_model, d_model) 18 | self.dropout = nn.Dropout(dropout) 19 | self.out = nn.Linear(d_model, d_model) 20 | 21 | def forward(self, q, k, v, mask=None): 22 | 23 | bs = q.size(0) 24 | 25 | # perform linear operation and split into h heads 26 | 27 | k = self.k_linear(k).view(bs, -1, self.h, self.d_k) 28 | q = self.q_linear(q).view(bs, -1, self.h, self.d_k) 29 | v = self.v_linear(v).view(bs, -1, self.h, self.d_k) 30 | 31 | # transpose to get dimensions bs * h * sl * d_model 32 | 33 | k = k.transpose(1 ,2) 34 | q = q.transpose(1 ,2) 35 | v = v.transpose(1 ,2) 36 | # calculate attention using function we will define next 37 | scores = attention(q, k, v, self.d_k, mask, self.dropout) 38 | 39 | # concatenate heads and put through final linear layer 40 | concat = scores.transpose(1 ,2).contiguous() \ 41 | .view(bs, -1, self.d_model) 42 | 43 | output = self.out(concat) 44 | 45 | return output 46 | 47 | 48 | def attention(q, k, v, d_k, mask=None, dropout=None): 49 | scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k) 50 | 51 | if mask is not None: 52 | mask = mask.unsqueeze(1) 53 | scores = scores.masked_fill(mask == 0, -1e9) 54 | scores = F.softmax(scores, dim=-1) 55 | 56 | if dropout is not None: 57 | scores = dropout(scores) 58 | 59 | output = torch.matmul(scores, 
v) 60 | return output 61 | 62 | -------------------------------------------------------------------------------- /cvpr19_dataloader.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | from __future__ import print_function 5 | 6 | import torch as th 7 | from torch.nn.functional import adaptive_max_pool1d 8 | from torch.utils.data import Dataset 9 | import pandas as pd 10 | import os 11 | import numpy as np 12 | import re 13 | import random 14 | import torch.nn.functional as F 15 | import json 16 | import librosa 17 | import math 18 | 19 | class CVPR19_DataLoader(Dataset): 20 | """CVPR19 testset loader.""" 21 | 22 | def __init__( 23 | self, 24 | csv, 25 | features_path, 26 | annot_path, 27 | steps_path, 28 | audio_path, 29 | annot_path_time, 30 | cook_path, 31 | with_audio, 32 | we, 33 | we_dim=300, 34 | max_words=30, 35 | features_path_3D=None, 36 | feature_framerate=1.0, 37 | feature_framerate_3D=24.0 / 16.0, 38 | num_audio_frames=1024, 39 | zeus=0, 40 | ): 41 | """ 42 | Args: 43 | """ 44 | self.csv = pd.read_csv(csv) 45 | self.annot_path = annot_path 46 | self.steps_path = steps_path 47 | self.audio_path = audio_path 48 | self.annot_path_time = annot_path_time 49 | self.we = we 50 | self.we_dim = we_dim 51 | self.max_words = max_words 52 | self.feature_framerate = feature_framerate 53 | self.num_audio_frames = num_audio_frames 54 | self.zeus = zeus 55 | self.fps = {'2d': feature_framerate, '3d': feature_framerate_3D} 56 | self.feature_path = features_path 57 | #if features_path_3D: 58 | # self.feature_path['3d'] = features_path_3D 59 | self.steps = {} 60 | self.cook_path = cook_path 61 | self.cook_set = set() 62 | self.with_audio = with_audio 63 | 64 | file1 = open(cook_path) 65 | for line in file1: 66 | data = line.strip() 67 | self.cook_set.add(data) 68 | # for task in self.csv['task'].unique(): 69 | # with open (os.path.join(self.steps_path,str(task)),'r') as f: 70 | # self.steps[str(task)] = th.cat([self._words_to_we(self._tokenize_text(line.strip()))[None,:,:] for line in f],dim=0) 71 | with open(steps_path, "r") as read_file: 72 | # print("Converting JSON encoded data into Python dictionary") 73 | step_dict = json.load(read_file) 74 | for task, y in step_dict.items(): 75 | self.steps[str(task)] = th.cat([self._words_to_we(self._tokenize_text(step))[None, :, :] for step in y], 76 | dim=0) 77 | 78 | def __len__(self): 79 | return len(self.csv) 80 | 81 | def _zero_pad_tensor(self, tensor, size): 82 | if len(tensor) >= size: 83 | return tensor[:size] 84 | else: 85 | zero = np.zeros((size - len(tensor), self.we_dim), dtype=np.float32) 86 | return np.concatenate((tensor, zero), axis=0) 87 | 88 | def _tokenize_text(self, sentence): 89 | w = re.findall(r"[\w']+", str(sentence)) 90 | return w 91 | 92 | def _words_to_we(self, words): 93 | words = [word for word in words if word in self.we.vocab] 94 | if words: 95 | we = self._zero_pad_tensor(self.we[words], self.max_words) 96 | return th.from_numpy(we) 97 | else: 98 | return th.zeros(self.max_words, self.we_dim) 99 | 100 | def _zero_pad_audio(self, audio, max_frames): 101 | n_frames = audio.shape[1] 102 | if n_frames >= max_frames: 103 | return audio[:, 0:max_frames], int(max_frames) 104 | else: 105 | p = max_frames - n_frames 106 | audio_padded = np.pad(audio, ((0, 0), (0, p)), 'constant', constant_values=(0, 0)) 107 | return audio_padded, n_frames 108 | 109 | #""" 110 | def 
_get_video(self, feature_path): 111 | if self.zeus: 112 | video = th.load(feature_path).float() 113 | else: 114 | video = np.load(feature_path) 115 | return video if self.zeus else th.from_numpy(video).float() 116 | #""" 117 | 118 | def _get_video_me(self, vid_path, s, e, fps): 119 | feature_path = {} 120 | video = {} 121 | output = {} 122 | video = np.load(vid_path) 123 | video = th.from_numpy(video).float() 124 | 125 | output = th.zeros(len(s), video.shape[-1]) 126 | for i in range(len(s)): 127 | # start = int(s[i] * fps) 128 | # end = int(e[i] * fps) 129 | start = int(i * fps) 130 | end = int((i + 1) * fps) 131 | slice = video[start:end] 132 | 133 | output[i] = F.normalize(th.max(slice, dim=0)[0], dim=0) 134 | 135 | return output # th.cat([output[k] for k in output], dim=1) 136 | 137 | def _get_audio_and_text(self, k, mel_spec): 138 | # n_caption = len(caption['start']) 139 | # k = n_pair_max 140 | starts = np.zeros(k) 141 | ends = np.zeros(k) 142 | # text = th.zeros(k, self.max_words, self.we_dim) 143 | audio = [0 for i in range(k)] 144 | 145 | nframes = np.zeros(k) 146 | # r_ind = np.random.choice(range(n_caption), k, replace=True) 147 | dur = 4 148 | for i in range(k): 149 | # ind = r_ind[i] 150 | if i < dur: 151 | start = 0 152 | end = 2 * dur 153 | elif i > k - dur: 154 | start = k - 2 * dur 155 | end = k 156 | else: 157 | start = i - dur 158 | end = i + dur 159 | # print('time',start,end) 160 | audio[i], nframes[i], starts[i], ends[i] = self._get_single_audio_text(start, end, mel_spec) 161 | # print('nframes',nframes) 162 | audio = th.cat([i.unsqueeze(0) for i in audio], dim=0) 163 | return audio, nframes, starts, ends 164 | 165 | def _get_single_audio_text(self, start, end, mel_spec): 166 | 167 | # words = self._tokenize_text(caption['text'][ind]) 168 | 169 | frames = librosa.core.time_to_frames([start, end], sr=16000, hop_length=160, n_fft=400) 170 | # print('frames',frames[0], frames[1]) 171 | if frames[0] < 0: 172 | frames[0] = 0 173 | padded_mel_spec, nframes = self._zero_pad_audio(mel_spec[:, frames[0]: frames[1]], self.num_audio_frames) 174 | return th.from_numpy( 175 | padded_mel_spec), nframes, start, end # , nframes#, caption['start'][start], caption['end'][end], self._words_to_we(words) 176 | 177 | def read_assignment(self, T, K, path): 178 | Y = np.zeros([T, K], dtype=np.uint8) 179 | with open(path, 'r') as f: 180 | for line in f: 181 | step, start, end = line.strip().split(',') 182 | start = int(math.floor(float(start))) 183 | end = int(math.ceil(float(end))) 184 | step = int(step) - 1 185 | Y[start:end, step] = 1 186 | return Y 187 | 188 | def __getitem__(self, idx): 189 | video_id = self.csv['video_id'][idx] 190 | task = str(self.csv['task'][idx]) 191 | if self.zeus: 192 | vid_path_2d = os.path.join(self.feature_path['2d'], self.csv['path'][idx].split('.')[0] + '.pth') 193 | vid_path_3d = os.path.join(self.feature_path['3d'], self.csv['path'][idx].split('.')[0] + '.pth') 194 | else: 195 | # vid_path_2d = os.path.join(self.feature_path['2d'], self.csv['path'][idx]) 196 | # vid_path_3d = os.path.join(self.feature_path['3d'], self.csv['path'][idx]) 197 | vid_path_2d = os.path.join(self.feature_path, self.csv['video_id'][idx] + '_2d.npy') 198 | vid_path_3d = os.path.join(self.feature_path, self.csv['video_id'][idx] + '_3d.npy') 199 | 200 | annot = th.from_numpy(np.load(os.path.join(self.annot_path, task + '_' + video_id + '.npy'))) 201 | T = annot.size()[0] # number of frames 202 | # video[frame,2048] -> [1,2048,frame] 203 | """ 204 | video_2d = 
adaptive_max_pool1d(video_2d.transpose(1,0)[None,:,:],T).view(-1,T).transpose(1,0) 205 | 206 | s = [i for i in range(T)] 207 | e = [i+1 for i in range(T)] 208 | video_3d_r = th.zeros(T, video_3d.shape[-1]) 209 | for i in range(len(s)): 210 | start = int(s[i] * self.fps['3d']) 211 | end = int(e[i] * self.fps['3d']) + 1 212 | slice_v = video_3d[start:end] 213 | if len(slice_v) < 1: 214 | print("error") 215 | else: 216 | video_3d_r[i] = F.normalize(th.max(slice_v, dim=0)[0], dim=0) 217 | video_3d = video_3d_r#adaptive_max_pool1d(video_3d.transpose(1,0)[None,:,:],T).view(-1,T).transpose(1,0) 218 | """ 219 | # video_3d = adaptive_max_pool1d(video_3d.transpose(1,0)[None,:,:],T).view(-1,T).transpose(1,0) 220 | # 221 | 222 | # """ 223 | # audio 224 | au_path = os.path.join(self.audio_path, self.csv['video_id'][idx] + '.npz') 225 | mel_spec = np.load(au_path)['arr_0'] 226 | audio, nframes, starts, ends = self._get_audio_and_text(T, mel_spec) 227 | #video_2d = self._get_video_me(vid_path_2d, starts, ends, self.fps['2d']) 228 | #video_3d = self._get_video_me(vid_path_3d, starts, ends, self.fps['3d']) 229 | video_2d = self._get_video(vid_path_2d) 230 | video_3d = self._get_video(vid_path_3d) 231 | annot = th.from_numpy(np.load(os.path.join(self.annot_path, task + '_' + video_id + '.npy'))) 232 | T = annot.size()[0] 233 | video_2d = adaptive_max_pool1d(video_2d.transpose(1, 0)[None, :, :], T).view(-1, T).transpose(1, 0) 234 | video_3d = adaptive_max_pool1d(video_3d.transpose(1, 0)[None, :, :], T).view(-1, T).transpose(1, 0) 235 | #video = th.cat((F.normalize(video_2d, dim=1), F.normalize(video_3d, dim=1)), dim=1) 236 | 237 | video = th.cat((F.normalize(video_2d, dim=1), F.normalize(video_3d, dim=1)), dim=1) 238 | #video = th.cat(video_2d,video_3d) 239 | 240 | frames = len(video_2d) 241 | step_num = len(self.steps[task]) 242 | #annot = self.read_assignment(frames,step_num,os.path.join(self.annot_path_time, task + '_' + video_id + '.csv')) 243 | # print(video.shape) 244 | if task in self.cook_set: 245 | iscook = 1 246 | else: 247 | iscook = 0 248 | if not self.with_audio: 249 | return {'video': video, 'nframes': th.IntTensor(nframes), 'steps': self.steps[task], 'video_id': video_id, 250 | 'task': task, 'Y': annot, 'cook': iscook} 251 | else: 252 | return {'video': video, 'audio': th.FloatTensor(audio.float()), \ 253 | 'nframes': th.IntTensor(nframes), 'steps': self.steps[task], 'video_id': video_id, \ 254 | 'task': task, 'Y': annot, 'cook': iscook} 255 | -------------------------------------------------------------------------------- /eval.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | from __future__ import print_function 5 | 6 | import torch as th 7 | from torch.utils.data import DataLoader 8 | from args import get_args 9 | from model import Net 10 | from metrics import compute_metrics, print_computed_metrics 11 | from gensim.models.keyedvectors import KeyedVectors 12 | import pickle 13 | import glob 14 | from lsmdc_dataloader import LSMDC_DataLoader 15 | from msrvtt_dataloader import MSRVTT_DataLoader 16 | from youcook_dataloader import Youcook_DataLoader 17 | from video_evaluation import evaluate_recall_youcook 18 | 19 | 20 | args = get_args() 21 | if args.verbose: 22 | print(args) 23 | 24 | assert args.pretrain_path != '', 'Need to specify pretrain_path argument' 25 | 26 | print('Loading word vectors: {}'.format(args.word2vec_path)) 27 | 
we = KeyedVectors.load_word2vec_format(args.word2vec_path, binary=True) 28 | print('done') 29 | 30 | 31 | if args.eval_youcook: 32 | dataset_val = Youcook_DataLoader( 33 | data=args.youcook_val_path, 34 | we=we, 35 | max_words=args.max_words, 36 | we_dim=args.we_dim, 37 | tri_modal=True 38 | ) 39 | dataloader_val = DataLoader( 40 | dataset_val, 41 | batch_size=args.batch_size_val, 42 | num_workers=args.num_thread_reader, 43 | shuffle=False, 44 | ) 45 | if args.eval_lsmdc: 46 | dataset_lsmdc = LSMDC_DataLoader( 47 | csv_path=args.lsmdc_test_csv_path, 48 | features_path=args.lsmdc_test_features_path, 49 | we=we, 50 | max_words=args.max_words, 51 | we_dim=args.we_dim, 52 | ) 53 | dataloader_lsmdc = DataLoader( 54 | dataset_lsmdc, 55 | batch_size=args.batch_size_val, 56 | num_workers=args.num_thread_reader, 57 | shuffle=False, 58 | ) 59 | if args.eval_msrvtt: 60 | msrvtt_testset = MSRVTT_DataLoader( 61 | csv_path=args.msrvtt_test_csv_path, 62 | features_path=args.msrvtt_test_features_path, 63 | we=we, 64 | max_words=args.max_words, 65 | we_dim=args.we_dim, 66 | ) 67 | dataloader_msrvtt = DataLoader( 68 | msrvtt_testset, 69 | batch_size=3000, 70 | num_workers=args.num_thread_reader, 71 | shuffle=False, 72 | drop_last=False, 73 | ) 74 | net = Net( 75 | video_dim=args.feature_dim, 76 | embd_dim=args.embd_dim, 77 | we_dim=args.we_dim, 78 | max_words=args.max_words, 79 | ) 80 | net.eval() 81 | net.cuda() 82 | 83 | pytorch_total_params = sum(p.numel() for p in net.parameters() if p.requires_grad) 84 | print(pytorch_total_params) 85 | exit() 86 | 87 | if args.verbose: 88 | print('Starting evaluation loop ...') 89 | 90 | 91 | def Eval_retrieval(model, eval_dataloader, dataset_name): 92 | model.eval() 93 | print('Evaluating Text-Video retrieval on {} data'.format(dataset_name)) 94 | with th.no_grad(): 95 | for i_batch, data in enumerate(eval_dataloader): 96 | text = data['text'].cuda() 97 | video = data['video'].cuda() 98 | vid = data['video_id'] 99 | m = model(video, text) 100 | m = m.cpu().detach().numpy() 101 | metrics = compute_metrics(m) 102 | print_computed_metrics(metrics) 103 | if args.eval_youcook: 104 | evaluate_recall_youcook(None, None, data['video_id'], m) 105 | 106 | 107 | all_checkpoints = glob.glob(args.pretrain_path) 108 | 109 | for c in all_checkpoints: 110 | print('Eval checkpoint: {}'.format(c)) 111 | print('Loading checkpoint: {}'.format(c)) 112 | net.load_checkpoint(c) 113 | if args.eval_youcook: 114 | Eval_retrieval(net, dataloader_val, 'YouCook2') 115 | if args.eval_msrvtt: 116 | Eval_retrieval(net, dataloader_msrvtt, 'MSR-VTT') 117 | if args.eval_lsmdc: 118 | Eval_retrieval(net, dataloader_lsmdc, 'LSMDC') 119 | -------------------------------------------------------------------------------- /eval_avlnet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --qos=sched_level_2 3 | #SBATCH --gres=gpu:4 4 | #SBATCH --gpus-per-node=4 5 | #SBATCH --nodes=1 6 | #SBATCH --time=2:00:00 7 | #SBATCH --cpus-per-task 74 8 | #SBATCH --ntasks-per-node=1 9 | #SBATCH --mem=1T 10 | #SBATCH --exclusive 11 | #SBATCH --job-name="ht" 12 | #SBATCH --output logs/ht-%j.out 13 | #SBATCH --error logs/ht-%j.err 14 | ## NOTE: adjust the dependency if needed for the 2nd and 3rd run 15 | ##SBATCH --dependency=afterok:12625 16 | 17 | ## Number of total processes 18 | echo " " 19 | echo " Nodelist:= " $SLURM_JOB_NODELIST 20 | echo " Number of nodes:= " $SLURM_JOB_NUM_NODES 21 | echo " GPUs per node:= " $SLURM_JOB_GPUS 22 | echo " Ntasks per node:= " 
$SLURM_NTASKS_PER_NODE 23 | 24 | echo " Running on multiple nodes/GPU devices" 25 | echo "" 26 | echo " Run started at:- " 27 | date 28 | 29 | source /nobackup/users/duartek/anaconda3/bin/activate 30 | conda activate wmlce-1.6.2 31 | 32 | 33 | nvidia-smi 34 | pwd 35 | 36 | ##################### 37 | 38 | #python gen_loader.py 39 | 40 | #python eval.py --eval_youcook=1 --num_thread_reader=74 --embd_dim=6144 --pretrain_path=/nobackup/users/brian27/howto100m/model/howto100m_pt_model.pth 41 | 42 | python eval.py --eval_youcook=1 --num_thread_reader=74 --embd_dim=4096 --pretrain_path=/nobackup/users/brian27/howto100m/model_me/mil_nce_two/e18.pth 43 | 44 | 45 | #python train_tri_kmeans.py --eval_youcook=1 --num_thread_reader=74 --epochs=0 --batch_size=512 \ 46 | #--n_pair=32 --embd_dim=6144 --howto_audio_frames=1000 --min_time=10.0 --random_audio_windows=0 \ 47 | #--lr=0.0001 --tri_modal=1 --kmeans=1 --use_queue=1 --queue_size=20 --fastC=1 --mean=1 --recon=0 --recon_size=1024 \ 48 | #--features_path=/nobackup/users/kaudhkha/sightsound/data/howto/parsed_videos \ 49 | #--features_path_audio=/nobackup/projects/public/howto100m/parsed_videos \ 50 | #--pretrain_path=model_mcn/MCN_KMeans/e16.pth 51 | 52 | #python train_tri_kmeans.py --eval_msrvtt=1 --num_thread_reader=74 --epochs=0 --batch_size=512 \ 53 | #--n_pair=32 --embd_dim=6144 --howto_audio_frames=1000 --min_time=10.0 --random_audio_windows=0 \ 54 | #--lr=0.0001 --tri_modal=1 --kmeans=1 --use_queue=1 --queue_size=20 --fastC=1 --mean=1 --recon=0 --recon_size=1024 \ 55 | #--features_path=/nobackup/users/kaudhkha/sightsound/data/howto/parsed_videos \ 56 | #--features_path_audio=/nobackup/projects/public/howto100m/parsed_videos \ 57 | #--pretrain_path=model_mcn/MCN_KMeans/e16.pth 58 | 59 | 60 | echo "Weights 16" 61 | 62 | #python train_tri_kmeans.py --eval_ucf=1 --num_thread_reader=74 --epochs=0 --batch_size=512 \ 63 | #--n_pair=32 --embd_dim=6144 --howto_audio_frames=1000 --min_time=10.0 --random_audio_windows=0 \ 64 | #--lr=0.0001 --tri_modal=1 --kmeans=1 --use_queue=1 --queue_size=20 --fastC=1 --mean=1 --recon=0 --recon_size=1024 \ 65 | #--features_path=/nobackup/users/kaudhkha/sightsound/data/howto/parsed_videos \ 66 | #--features_path_audio=/nobackup/projects/public/howto100m/parsed_videos \ 67 | #--pretrain_path=model_mcn/MCN_KMeans/e16.pth 68 | 69 | python train_tri_kmeans.py --eval_hmdb=1 --num_thread_reader=74 --epochs=0 --batch_size=512 \ 70 | --n_pair=32 --embd_dim=6144 --howto_audio_frames=1000 --min_time=10.0 --random_audio_windows=0 \ 71 | --lr=0.0001 --tri_modal=1 --kmeans=1 --use_queue=1 --queue_size=20 --fastC=1 --mean=1 --recon=0 --recon_size=1024 \ 72 | --features_path=/nobackup/users/kaudhkha/sightsound/data/howto/parsed_videos \ 73 | --features_path_audio=/nobackup/projects/public/howto100m/parsed_videos \ 74 | --pretrain_path=model_mcn/MCN_KMeans/e16.pth 75 | 76 | 77 | 78 | #echo "Weights 21" 79 | 80 | #python train_tri_kmeans.py --eval_ucf=1 --num_thread_reader=74 --epochs=0 --batch_size=512 \ 81 | #--n_pair=32 --embd_dim=6144 --howto_audio_frames=1000 --min_time=10.0 --random_audio_windows=0 \ 82 | #--lr=0.0001 --tri_modal=1 --kmeans=1 --use_queue=1 --queue_size=20 --fastC=1 --mean=1 --recon=0 --recon_size=1024 \ 83 | #--features_path=/nobackup/users/kaudhkha/sightsound/data/howto/parsed_videos \ 84 | #--features_path_audio=/nobackup/projects/public/howto100m/parsed_videos \ 85 | #--pretrain_path=model_mcn/MCN_KMeans/e21.pth 86 | 87 | #python train_tri_kmeans.py --eval_hmdb=1 --num_thread_reader=74 --epochs=0 
--batch_size=512 \ 88 | #--n_pair=32 --embd_dim=6144 --howto_audio_frames=1000 --min_time=10.0 --random_audio_windows=0 \ 89 | #--lr=0.0001 --tri_modal=1 --kmeans=1 --use_queue=1 --queue_size=20 --fastC=1 --mean=1 --recon=0 --recon_size=1024 \ 90 | #--features_path=/nobackup/users/kaudhkha/sightsound/data/howto/parsed_videos \ 91 | #--features_path_audio=/nobackup/projects/public/howto100m/parsed_videos \ 92 | #--pretrain_path=model_mcn/MCN_KMeans/e21.pth 93 | 94 | #echo "Weights 24" 95 | 96 | #python train_tri_kmeans.py --eval_ucf=1 --num_thread_reader=74 --epochs=0 --batch_size=512 \ 97 | #--n_pair=32 --embd_dim=6144 --howto_audio_frames=1000 --min_time=10.0 --random_audio_windows=0 \ 98 | #--lr=0.0001 --tri_modal=1 --kmeans=1 --use_queue=1 --queue_size=20 --fastC=1 --mean=1 --recon=0 --recon_size=1024 \ 99 | #--features_path=/nobackup/users/kaudhkha/sightsound/data/howto/parsed_videos \ 100 | #--features_path_audio=/nobackup/projects/public/howto100m/parsed_videos \ 101 | #--pretrain_path=model_mcn/MCN_KMeans/e24.pth 102 | 103 | #python train_tri_kmeans.py --eval_hmdb=1 --num_thread_reader=74 --epochs=0 --batch_size=512 \ 104 | #--n_pair=32 --embd_dim=6144 --howto_audio_frames=1000 --min_time=10.0 --random_audio_windows=0 \ 105 | #--lr=0.0001 --tri_modal=1 --kmeans=1 --use_queue=1 --queue_size=20 --fastC=1 --mean=1 --recon=0 --recon_size=1024 \ 106 | #--features_path=/nobackup/users/kaudhkha/sightsound/data/howto/parsed_videos \ 107 | #--features_path_audio=/nobackup/projects/public/howto100m/parsed_videos \ 108 | #--pretrain_path=model_mcn/MCN_KMeans/e24.pth 109 | 110 | 111 | echo "Weights 26" 112 | 113 | #python train_tri_kmeans.py --eval_ucf=1 --num_thread_reader=74 --epochs=0 --batch_size=512 \ 114 | #--n_pair=32 --embd_dim=6144 --howto_audio_frames=1000 --min_time=10.0 --random_audio_windows=0 \ 115 | #--lr=0.0001 --tri_modal=1 --kmeans=1 --use_queue=1 --queue_size=20 --fastC=1 --mean=1 --recon=0 --recon_size=1024 \ 116 | #--features_path=/nobackup/users/kaudhkha/sightsound/data/howto/parsed_videos \ 117 | #--features_path_audio=/nobackup/projects/public/howto100m/parsed_videos \ 118 | #--pretrain_path=model_mcn/MCN_KMeans/e26.pth 119 | 120 | python train_tri_kmeans.py --eval_hmdb=1 --num_thread_reader=74 --epochs=0 --batch_size=512 \ 121 | --n_pair=32 --embd_dim=6144 --howto_audio_frames=1000 --min_time=10.0 --random_audio_windows=0 \ 122 | --lr=0.0001 --tri_modal=1 --kmeans=1 --use_queue=1 --queue_size=20 --fastC=1 --mean=1 --recon=0 --recon_size=1024 \ 123 | --features_path=/nobackup/users/kaudhkha/sightsound/data/howto/parsed_videos \ 124 | --features_path_audio=/nobackup/projects/public/howto100m/parsed_videos \ 125 | --pretrain_path=model_mcn/MCN_KMeans/e26.pth 126 | 127 | 128 | 129 | 130 | 131 | #python train_tri_kmeans.py --eval_cross=1 --num_thread_reader=74 --epochs=0 --batch_size=512 \ 132 | #--n_pair=32 --embd_dim=6144 --howto_audio_frames=1000 --min_time=10.0 --random_audio_windows=0 \ 133 | #--lr=0.0001 --tri_modal=1 --kmeans=1 --use_queue=1 --queue_size=20 --fastC=1 --mean=1 --recon=0 --recon_size=1024 \ 134 | #--features_path=/nobackup/users/kaudhkha/sightsound/data/howto/parsed_videos \ 135 | #--features_path_audio=/nobackup/projects/public/howto100m/parsed_videos \ 136 | #--pretrain_path=model_mcn/MCN_KMeans/e16.pth 137 | 138 | #python train_tri_cos_mil.py --eval_cross=1 --num_thread_reader=74 --batch_size=512 --epochs=0 --project=1 --project_dim=8000 \ 139 | #--lr_decay=1.0 --embd_dim=6144 --pretrain_path=model_mcn/MCN_Joint_Recon_Hard/e15.pth \ 140 | #--lr=1e-5 
--tri_modal=1 --finetune_video=0 --video_interp=0 --output_norm=1 --joint_cluster=1 --multi_head=0 \ 141 | #--features_path=/nobackup/users/kaudhkha/sightsound/data/howto/parsed_videos \ 142 | #--features_path_audio=/nobackup/projects/public/howto100m/parsed_videos 143 | 144 | #python train_tri_cos_mil.py --eval_youcook=1 --num_thread_reader=74 --batch_size=512 --epochs=0 --project=1 --project_dim=8000 \ 145 | #--lr_decay=1.0 --embd_dim=6144 --pretrain_path=model_mcn/MCN_Joint_Recon_Cross_Hard/e9.pth \ 146 | #--lr=1e-5 --tri_modal=1 --finetune_video=0 --video_interp=0 --output_norm=1 --joint_cluster=1 --multi_head=0 \ 147 | #--features_path=/nobackup/users/kaudhkha/sightsound/data/howto/parsed_videos \ 148 | #--features_path_audio=/nobackup/projects/public/howto100m/parsed_videos 149 | 150 | #python train_tri_cos_mil.py --eval_msrvtt=1 --num_thread_reader=74 --batch_size=512 --epochs=0 --project=1 --project_dim=8000 \ 151 | #--lr_decay=1.0 --embd_dim=6144 --pretrain_path=model_mcn/MCN_Joint_Recon_Cross_Hard/e9.pth \ 152 | #--lr=1e-5 --tri_modal=1 --finetune_video=0 --video_interp=0 --output_norm=1 --joint_cluster=1 --multi_head=0 \ 153 | #--features_path=/nobackup/users/kaudhkha/sightsound/data/howto/parsed_videos \ 154 | #--features_path_audio=/nobackup/projects/public/howto100m/parsed_videos 155 | 156 | #python train_tri_cos_mil.py --eval_ucf=1 --num_thread_reader=74 --batch_size=512 --epochs=0 --project=1 --project_dim=8000 \ 157 | #--lr_decay=1.0 --embd_dim=6144 --pretrain_path=model_mcn/MCN_Sports/e20.pth \ 158 | #--lr=1e-5 --tri_modal=1 --finetune_video=0 --video_interp=0 --output_norm=1 --joint_cluster=1 --multi_head=0 \ 159 | #--features_path=/nobackup/users/kaudhkha/sightsound/data/howto/parsed_videos \ 160 | #--features_path_audio=/nobackup/projects/public/howto100m/parsed_videos 161 | 162 | #python train_tri_cos_mil.py --eval_hmdb=1 --num_thread_reader=74 --batch_size=512 --epochs=0 --project=1 --project_dim=8000 \ 163 | #--lr_decay=1.0 --embd_dim=6144 --pretrain_path=model_mcn/MCN_Sports/e20.pth \ 164 | #--lr=1e-5 --tri_modal=1 --finetune_video=0 --video_interp=0 --output_norm=1 --joint_cluster=1 --multi_head=0 \ 165 | #--features_path=/nobackup/users/kaudhkha/sightsound/data/howto/parsed_videos \ 166 | #--features_path_audio=/nobackup/projects/public/howto100m/parsed_videos 167 | 168 | #python train_tri_cos_mil.py --eval_msrvtt=1 --num_thread_reader=74 --batch_size=512 --epochs=0 --project=1 --project_dim=8000 \ 169 | #python train_tri_cos_mil.py --eval_youcook=1 --num_thread_reader=74 --batch_size=512 --epochs=0 --project=1 --project_dim=8000 \ 170 | #python train_tri_cos_mil.py --eval_ucf=1 --num_thread_reader=74 --batch_size=512 --epochs=0 --project=1 --project_dim=8000 \ 171 | #python train_tri_cos_mil.py --eval_hmdb=1 --num_thread_reader=74 --batch_size=512 --epochs=0 --project=1 --project_dim=8000 \ 172 | #--lr_decay=1.0 --embd_dim=6144 --pretrain_path=model_mcn/MCN_Joint_Recon/e11.pth \ 173 | #--lr=1e-5 --tri_modal=1 --finetune_video=0 --video_interp=0 --output_norm=1 --joint_cluster=1 --multi_head=0 \ 174 | #--features_path=/nobackup/users/kaudhkha/sightsound/data/howto/parsed_videos \ 175 | #--features_path_audio=/nobackup/projects/public/howto100m/parsed_videos 176 | 177 | 178 | #python local_eval.py 179 | 180 | # model_mcn/MCN1/e9.pth 181 | 182 | #python train_tri_c.py --eval_youcook=1 --num_thread_reader=74 --batch_size=512 --epochs=0 --project=1 --project_dim=8000 \ 183 | #--lr_decay=1.0 --embd_dim=6144 --pretrain_path=model_mcn/MCN_Recon2/e10.pth \ 184 | 
#--lr=1e-5 --tri_modal=1 185 | 186 | 187 | 188 | #python train_tri_c.py --eval_msrvtt=1 --num_thread_reader=74 --batch_size=512 --epochs=0 --project=1 --project_dim=8000 \ 189 | #--lr_decay=1.0 --embd_dim=6144 --pretrain_path=model_mcn/MCN_Recon2/e14.pth \ 190 | #--lr=1e-5 --tri_modal=1 \ 191 | #--features_path=/nobackup/users/kaudhkha/sightsound/data/howto/parsed_videos \ 192 | #--features_path_audio=/nobackup/projects/public/howto100m/parsed_videos 193 | 194 | # model_mcn/MCN1/e9.pth 195 | 196 | #python train_tri_c.py --eval_youcook=1 --num_thread_reader=74 --batch_size=512 --epochs=0 --project=1 --project_dim=8000 \ 197 | #--lr_decay=1.0 --embd_dim=6144 --pretrain_path=model_mcn/MCN_Recon2/e14.pth \ 198 | #--lr=1e-5 --tri_modal=1 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | ## Wait for all commands to finish 210 | wait 211 | echo "Run completed at:- " 212 | date 213 | -------------------------------------------------------------------------------- /eval_cross.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | from __future__ import print_function 5 | 6 | import torch as th 7 | from torch.utils.data import DataLoader 8 | from args import get_args 9 | import numpy as np 10 | from dp.dp import dp 11 | from tqdm import tqdm 12 | from sklearn.metrics import average_precision_score 13 | from tqdm import tqdm as std_tqdm 14 | from functools import partial 15 | tqdm = partial(std_tqdm, dynamic_ncols=True) 16 | import torch.nn as nn 17 | from metrics import compute_metrics, print_computed_metrics 18 | from gensim.models.keyedvectors import KeyedVectors 19 | import pickle 20 | import glob 21 | from lsmdc_dataloader import LSMDC_DataLoader 22 | from msrvtt_dataloader import MSRVTT_DataLoader 23 | from youcook_dataloader import Youcook_DataLoader 24 | from cvpr19_dataloader import CVPR19_DataLoader 25 | from mining_dataloader import Mining_DataLoader 26 | import pprint 27 | 28 | #th.backends.cudnn.enabled = False 29 | 30 | pp = pprint.PrettyPrinter(indent=4) 31 | 32 | args = get_args() 33 | if args.verbose: 34 | print(args) 35 | 36 | assert args.pretrain_path != '', 'Need to specify pretrain_path argument' 37 | 38 | 39 | 40 | print('Loading word vectors: {}'.format(args.word2vec_path)) 41 | we = KeyedVectors.load_word2vec_format(args.word2vec_path, binary=True) 42 | print('done') 43 | 44 | if args.save_feature==1: 45 | step_path = 'step_all.json' 46 | else: 47 | step_path = 'step.json' 48 | 49 | if args.cross: 50 | cross_testset = CVPR19_DataLoader( 51 | #csv='vids_27.csv', 52 | csv='/nobackup/users/brian27/CrossTask/howto100m_crosstask_eval/cvpr19_test.csv', 53 | features_path='vids_feature', 54 | #features_path = '/nobackup/users/brian27/CrossTask/howto100m_crosstask_eval/features_2d', 55 | #features_path_3D = '/nobackup/users/brian27/CrossTask/howto100m_crosstask_eval/features_3d', 56 | annot_path = 'anno', #'/nobackup/users/brian27/CrossTask/crosstask_release/Y-1', 57 | steps_path = step_path, 58 | audio_path = 'audio_feature',#'/home/brian27/nobackup/CrossTask/audio_feature_new',# 59 | annot_path_time='/nobackup/users/brian27/CrossTask/crosstask_release/annotations', 60 | cook_path = '/home/brian27/nobackup/CrossTask/crosstask_release/cook.txt', 61 | with_audio = args.with_audio, 62 | we=we 63 | #features_path_3D='howto100m_crosstask_eval/features_3d' 64 | ) 65 | if args.mining: 66 | cross_testset = 
Mining_DataLoader( 67 | csv='/nobackup/users/brian27/Weak_YouTube_dataset/mining.csv', 68 | features_path='/nobackup/users/brian27/Weak_YouTube_dataset/test_new_f', 69 | annot_path='/nobackup/users/brian27/Weak_YouTube_dataset/anno', 70 | steps_path='/nobackup/users/brian27/Weak_YouTube_dataset/'+step_path, 71 | audio_path='/nobackup/users/brian27/Weak_YouTube_dataset/test_new_a_f', # 'audio_feature', 72 | we=we 73 | # features_path_3D='howto100m_crosstask_eval/features_3d' 74 | ) 75 | #print(cross_testset) 76 | dataloader_cross = DataLoader( 77 | cross_testset, 78 | batch_size=1, 79 | num_workers=args.num_thread_reader, 80 | shuffle=False, 81 | drop_last=False, 82 | ) 83 | 84 | 85 | #def cvpr19_score(X, steps, model): 86 | def cvpr19_score_a(X, audio, nframes, steps, model): 87 | #sim_matrix = model.forward(X.cuda(),steps.cuda()).transpose(1,0) #[frame,class] 88 | #print('video',X.shape) 89 | #print('audio',audio.shape) 90 | #print('text',steps.shape) 91 | if args.v_only==1: 92 | sim_matrix = model.forward(X, audio, nframes, args.v_only, steps) 93 | return sim_matrix.transpose(1, 0).detach().cpu().numpy() 94 | 95 | #sim_matrix,s2,s3 = model.forward(X, audio, nframes, steps).transpose(1, 0) 96 | sim_matrix,s2,s3 = model.forward(X, audio, nframes, args.v_only, steps) # [frame,class] 97 | #v,a,t = model.forward(X, audio, nframes, steps)#.transpose(1, 0) # [frame,class] 98 | #print('sim_matrix',sim_matrix.shape) 99 | return sim_matrix.transpose(1, 0).detach().cpu().numpy(),s2.transpose(1, 0).detach().cpu().numpy(),s3.transpose(1, 0).detach().cpu().numpy() 100 | #return v,a,t 101 | 102 | def cvpr19_score(X, steps, model): 103 | sim_matrix = model.forward(X.cuda(),steps.cuda()).transpose(1,0) #[frame,class] 104 | #print('video',X.shape) 105 | #print('audio',audio.shape) 106 | #print('text',steps.shape) 107 | #sim_matrix = model.forward(X, audio, nframes, steps).transpose(1, 0) # [frame,class] 108 | #print('sim_matrix',sim_matrix.shape) 109 | return sim_matrix.detach().cpu().numpy() 110 | 111 | def cvpr19_predict(scores): 112 | C = -scores#.cpu().detach().numpy() 113 | y = np.empty(scores.shape, dtype=np.float32) 114 | dp(y, C, exactly_one=True) #[frame,class] 115 | return y 116 | 117 | def arg_max_predict(scores): 118 | y_final = np.zeros((scores.shape[0], scores.shape[1])) 119 | arg_y = np.argmax(scores, axis=1) 120 | for i in range(scores.shape[0]): 121 | y_final[i][arg_y[i]] = 1 122 | return y_final 123 | 124 | def get_recall(y_true, y): 125 | #return ((y*y_true).sum(axis=1)>0).sum() / (y_true.sum(axis=1)>0).sum() 126 | if args.recall_frame==0: 127 | return ((y*y_true).sum(axis=0)>0).sum() / (y_true.sum(axis=0)>0).sum() 128 | else: 129 | return ((y * y_true).sum(axis=0) > 0).sum() / (y_true.sum(axis=0) > 0).sum() 130 | 131 | def align_eval(model, dataloader, gpu_mode=1): 132 | print('start cross') 133 | recalls = {} 134 | counts = {} 135 | recalls_m = 0 136 | counts_m = 0 137 | task_scores = {} 138 | task_gt = {} 139 | for sample in tqdm(dataloader): 140 | with th.no_grad(): 141 | 142 | #print(sample) 143 | #for sample in batch: 144 | 145 | 146 | 147 | video = sample['video'].cuda() if gpu_mode else sample['video'] 148 | text = sample['steps'].cuda() if gpu_mode else sample['steps'] 149 | 150 | video = video.view(-1, video.shape[-1]) 151 | text = th.squeeze(text)# class x emb 152 | #n_frame = th.tensor([]) 153 | n_frame = sample['nframes'].cuda()#th.ones(video.shape[0],1)*1#.cuda() 154 | n_frame = n_frame.view(-1) 155 | #print(n_frame.shape) 156 | 157 | #print('n_frame',n_frame.shape) 158 | 
if args.tri==1: 159 | audio = sample['audio'].cuda() if gpu_mode else sample['video'] 160 | audio = audio.view(-1, audio.shape[-2], audio.shape[-1]) 161 | #print(audio.shape) 162 | scores_list = [] 163 | split = 15 164 | batch_size = 25 165 | #print(video.shape[0]) 166 | b_s = int(video.shape[0] / batch_size) 167 | # for i in range(video.shape[0]): 168 | # video_1 = th.unsqueeze(video[:half],0) 169 | # audio_1 = th.unsqueeze(audio[:half],0) 170 | if video.shape[0] < batch_size: 171 | if args.v_only==0: 172 | scores,s2,s3 = cvpr19_score_a(video, audio, n_frame, text, model) 173 | else: 174 | scores = cvpr19_score_a(video, audio, n_frame, text, model) 175 | else: 176 | for i in range(b_s): 177 | if i == b_s - 1: 178 | video_1 = video[batch_size * i:] 179 | audio_1 = audio[batch_size * i:] 180 | n_frame_1 = n_frame[batch_size * i:] 181 | else: 182 | video_1 = video[batch_size * i:batch_size * (i + 1)] 183 | audio_1 = audio[batch_size * i:batch_size * (i + 1)] 184 | n_frame_1 = n_frame[batch_size * i:batch_size * (i + 1)] 185 | # text_1 = th.unsqueeze(text[i]) 186 | if args.v_only==0: 187 | scores,s2,s3 = cvpr19_score_a(video_1, audio_1, n_frame_1, text, model) 188 | else: 189 | scores = cvpr19_score_a(video_1, audio_1, n_frame_1, text, model) 190 | scores_list.append(scores) 191 | scores = np.vstack(scores_list) 192 | if args.save_feature==1: 193 | 194 | scores = th.from_numpy(scores) 195 | m = nn.LogSoftmax(dim=1) 196 | scores = m(scores).detach().cpu().numpy() 197 | #print(scores) 198 | method = args.method_name 199 | if args.mining == 1: 200 | path = 'mining_score_'+method+'/' 201 | else: 202 | path = 'cross_score_' + method + '/' 203 | from pathlib import Path 204 | Path(path).mkdir(parents=True, exist_ok=True) 205 | 206 | file1 = open(path + sample['video_id'][0] + '.probs', 'w') 207 | for i in range(scores.shape[0]): 208 | for j in range(scores.shape[1]): 209 | # for k in range(30): 210 | file1.write(str(scores[i][j]) + ' ') 211 | file1.write('\n') 212 | file1.close() 213 | 214 | else: 215 | scores = cvpr19_score(video, text, model) #[time,class] 216 | if args.save_feature == 1: 217 | scores = th.from_numpy(scores) 218 | m = nn.LogSoftmax(dim=1) 219 | scores = m(scores).detach().cpu().numpy() 220 | from pathlib import Path 221 | path = 'mining_score_ver/' 222 | Path(path).mkdir(parents=True, exist_ok=True) 223 | file1 = open(path + sample['video_id'][0] + '.probs', 'w') 224 | for i in range(scores.shape[0]): 225 | for j in range(scores.shape[1]): 226 | # for k in range(30): 227 | file1.write(str(scores[i][j]) + ' ') 228 | file1.write('\n') 229 | file1.close() 230 | #""" 231 | if args.save_feature == 0: 232 | if args.recall_frame==0: 233 | #scores = np.log(scores) 234 | #""" 235 | if args.mining==1: 236 | m = nn.LogSoftmax(dim=1) 237 | #m = nn.LogSigmoid() 238 | scores = th.from_numpy(scores) 239 | scores = m(scores).detach().cpu().numpy() 240 | #""" 241 | y = cvpr19_predict(scores) #[time,class] 242 | else: 243 | y = arg_max_predict(scores) 244 | y_true = th.squeeze(sample['Y']).numpy() 245 | 246 | if args.cross==1: 247 | task = sample['task'] 248 | #y_true = y_true.view(-1, y_true.shape[-1]) 249 | 250 | task = task[0]#.view(-1, task.shape[-1]) 251 | 252 | if task not in recalls: 253 | recalls[task] = 0. 
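# (Editorial comment, not in the original source) get_recall above counts a
# ground-truth step as recovered when at least one of its annotated frames is
# assigned to that step, so the per-video value accumulated into recalls[task]
# below is |steps with >= 1 correctly predicted frame| / |annotated steps|;
# after the loop it is divided by counts[task] to give a per-task average.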
254 | recalls[task] += get_recall(y_true, y) 255 | if task not in counts: 256 | counts[task] = 0 257 | counts[task] += 1 258 | 259 | # mAP ---------------------------------------- 260 | if task not in task_scores: 261 | task_scores[task] = [] 262 | task_gt[task] = [] 263 | task_scores[task].append(scores) 264 | task_gt[task].append(y_true) 265 | else: 266 | recalls_m += get_recall(y_true, y) 267 | counts_m += 1 268 | 269 | #if task == '77721': 270 | # print('recall:', recalls['77721']) 271 | # print('counts:', counts['77721']) 272 | # print(sample['video_id']) 273 | #print(recalls) 274 | #""" 275 | # -------------------------------------------- 276 | #""" 277 | if args.save_feature == 0: 278 | if args.cross==1: 279 | recalls = {task: recall / counts[task] for task,recall in recalls.items()} 280 | # mAP ---------------------------------------- 281 | task_scores = {task: np.concatenate(scores) for task,scores in task_scores.items()} 282 | task_gt = {task: np.concatenate(y) for task,y in task_gt.items()} 283 | mAPs = {task: average_precision_score(task_gt[task],scores) for task,scores in task_scores.items()} 284 | # -------------------------------------------- 285 | #""" 286 | return recalls, mAPs 287 | else: 288 | print(recalls_m/counts_m) 289 | return recalls_m, None 290 | 291 | 292 | 293 | if args.tri == 0: 294 | from model import Net 295 | else: 296 | from model_avl import Net 297 | 298 | net = Net( 299 | embd_dim=args.embd_dim, #2048 300 | video_dim=args.feature_dim, #4096 301 | we_dim=args.we_dim, 302 | ratio=args.ratio, 303 | ) 304 | 305 | net.eval() 306 | net.cuda() 307 | 308 | if args.verbose: 309 | print('Starting evaluation loop ...') 310 | 311 | 312 | all_checkpoints = glob.glob(args.pretrain_path) 313 | 314 | for c in all_checkpoints: 315 | print('Eval checkpoint: {}'.format(c)) 316 | print('Loading checkpoint: {}'.format(c)) 317 | net.load_checkpoint(c) 318 | 319 | if args.save_feature == 1: 320 | align_eval(net, dataloader_cross) 321 | elif args.save_feature == 0: 322 | recall, mAPs = align_eval(net, dataloader_cross) 323 | 324 | pp.pprint(recall) 325 | if args.cross==1: 326 | 327 | pp.pprint(mAPs) 328 | sum = 0 329 | count = 0 330 | sum_c = 0 331 | count_c = 0 332 | sum_nc = 0 333 | count_nc = 0 334 | 335 | cook_set=set() 336 | file1 = open('/home/brian27/nobackup/CrossTask/crosstask_release/cook.txt') 337 | for line in file1: 338 | data = line.strip() 339 | cook_set.add(data) 340 | 341 | for x,y in recall.items(): 342 | sum+=y 343 | count+=1 344 | if x in cook_set: 345 | sum_c += y 346 | count_c += 1 347 | else: 348 | sum_nc += y 349 | count_nc += 1 350 | 351 | print('recall',sum/float(count)) 352 | print('recall cook', sum_c / float(count_c)) 353 | print('recall not cook', sum_nc / float(count_nc)) 354 | sum = 0 355 | count = 0 356 | for x,y in mAPs.items(): 357 | sum+=y 358 | count+=1 359 | print('mAPs',sum/float(count)) 360 | #""" 361 | -------------------------------------------------------------------------------- /gen_loader.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import numpy as np 4 | 5 | 6 | #a = np.load('./temp_data/v_ApplyEyeMakeup_g01_c01.npz') 7 | #print(a['arr_0'].shape) 8 | #exit() 9 | 10 | 11 | def generate_ucf101_pickle(): 12 | data_dir = '/nobackup/users/brian27/data/UCF-101_feature/' 13 | audio_dir = '/nobackup/users/brian27/data/UCF-101_audio/' 14 | #data_dir = '../Data/' 15 | 16 | feature_list = os.listdir(data_dir) 17 | #print(feature_list) 18 | videos = sorted(set([v[:-6] 
for v in feature_list])) 19 | print('# Videos', len(videos)) 20 | 21 | train_list = open('./data/ucf_trainlist01.txt').readlines() 22 | 23 | #print(videos) 24 | #v_Basketball_g07_c02_2d.npy 25 | data = [] 26 | for video_name in videos: 27 | training = 0 28 | for tr_vid in train_list: 29 | if video_name[:-1] in tr_vid: 30 | training = 1 31 | try: 32 | feats_3d = np.load(data_dir + video_name + '3d.npy') 33 | #print(feats_3d.shape) 34 | feats_2d = np.load(data_dir.replace('brian27', 'duartek') + video_name + '2d.npy') 35 | #print(feats_2d.shape) 36 | except: 37 | continue 38 | try: 39 | audio = np.load(audio_dir + video_name[:-1] + '.npz') 40 | print(audio.files, audio_dir + video_name + '.npz', audio['arr_0'].shape) 41 | audio = audio['arr_0'] 42 | has_audio = 1 43 | except: 44 | audio = np.zeros((40, 1), dtype=np.float32) 45 | has_audio = 0 46 | 47 | data.append({'2d': feats_2d, 48 | '3d': feats_3d, 49 | '2d_pooled': np.mean(feats_2d, 0), 50 | '3d_pooled': np.mean(feats_3d, 0), 51 | 'class': video_name.split('_')[1], 52 | 'video': video_name, 53 | 'audio': audio, 54 | 'has_audio': has_audio, 55 | 'training': training 56 | }) 57 | pickle.dump(data, open('./data/UCF101_data.pkl', 'wb')) 58 | print('# Videos with features extracted:', len(data)) 59 | #a = os.listdir('/nobackup/users/brian27/data/hmdb51_feature/') 60 | 61 | 62 | def generate_hmdb_pickle(): 63 | data_dir = '/nobackup/users/brian27/data/hmdb51_feature/' 64 | folders_dir = '/nobackup/users/brian27/data/hmdb51_org/' 65 | 66 | classes = os.listdir(folders_dir) 67 | 68 | feature_list = os.listdir(data_dir) 69 | videos = sorted(set([v[:-6] for v in feature_list])) 70 | print('# Videos', len(videos)) 71 | 72 | train_list = open('./data/hmdb_train_split1.txt').readlines() 73 | test_list = open('./data/hmdb_test_split1.txt').readlines() 74 | 75 | n_samples = np.zeros((len(classes), )) 76 | data = [] 77 | for video_name in videos: 78 | training = 0 79 | for tr_vid in train_list: 80 | if video_name[:-1] in tr_vid: 81 | training = 1 82 | 83 | testing = 0 84 | for te_vid in test_list: 85 | if video_name[:-1] in te_vid: 86 | testing = 1 87 | 88 | if training == 0 and testing == 0: 89 | training = 2 90 | 91 | try: 92 | feats_3d = np.load(data_dir + video_name + '3d.npy') 93 | #print(feats_3d.shape) 94 | feats_2d = np.load(data_dir.replace('brian27', 'duartek') + video_name + '2d.npy') 95 | #print(feats_2d.shape) 96 | except: 97 | continue 98 | 99 | split_name = '_'.join(video_name.split('_')[:-7]) + '_' 100 | class_name = [cls for cls in classes if '_'+cls+'_' == split_name[-(len(cls)+2):]] 101 | class_name = sorted(class_name, key=lambda x: len(x)) 102 | #print(class_name, class_name[-1]) 103 | class_name = class_name[-1] 104 | n_samples[classes.index(class_name)] += 1 105 | data.append({'2d': feats_2d, 106 | '3d': feats_3d, 107 | '2d_pooled': np.mean(feats_2d, 0), 108 | '3d_pooled': np.mean(feats_3d, 0), 109 | 'class': class_name, 110 | 'video': video_name, 111 | 'training': training 112 | }) 113 | pickle.dump(data, open('./data/HMDB_data.pkl', 'wb')) 114 | print('# Videos with features extracted:', len(data)) 115 | for i, cls in enumerate(classes): 116 | print(cls, n_samples[i]) 117 | #print(n_samples) 118 | 119 | generate_ucf101_pickle() 120 | generate_hmdb_pickle() 121 | -------------------------------------------------------------------------------- /hmdb_dataloader.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from 
__future__ import unicode_literals 4 | from __future__ import print_function 5 | 6 | import torch as th 7 | from torch.utils.data import Dataset 8 | import pickle 9 | import torch.nn.functional as F 10 | import numpy as np 11 | import re 12 | import pandas as pd 13 | from collections import defaultdict 14 | from torch.utils.data.dataloader import default_collate 15 | import json 16 | import random 17 | 18 | 19 | def name_to_stringlist(name): 20 | change = {'claping': ['clapping']} 21 | if name in change: 22 | name_vec = change[name] 23 | else: 24 | name_vec = name.split('_') 25 | return name_vec 26 | 27 | 28 | class HMDB_DataLoader(Dataset): 29 | """MSRVTT dataset loader.""" 30 | 31 | def __init__( 32 | self, 33 | data_path, 34 | we, 35 | we_dim=300, 36 | max_words=30, 37 | num_frames_multiplier=5, 38 | training=True, 39 | tri_modal=False, 40 | finetune_video=False, 41 | video_interp=False 42 | ): 43 | """ 44 | Args: 45 | """ 46 | self.data = pickle.load(open(data_path, 'rb')) # contains a list of video names 47 | self.we = we 48 | self.we_dim = we_dim 49 | self.max_words = max_words 50 | self.max_video = 30 51 | self.num_frames_multiplier = num_frames_multiplier 52 | self.training = training 53 | self.tri_modal = tri_modal 54 | self.finetune_video = finetune_video 55 | self.max_frames = 16 56 | self.video_interp = video_interp 57 | 58 | names = [] 59 | for vid in self.data: 60 | names.append(vid['class']) 61 | 62 | self.classes = sorted(set(names)) 63 | print('# Classes', len(self.classes)) 64 | 65 | self.class_embeds = [] 66 | for name in self.classes: 67 | word_list = name_to_stringlist(name) 68 | caption = ' '.join(word_list) 69 | self.class_embeds.append(self._get_caption(caption)) 70 | self.class_embeds = th.stack(self.class_embeds, 0) 71 | print('Shape of class embeds', self.class_embeds.shape) 72 | 73 | def __len__(self): 74 | return len(self.data) 75 | 76 | def custom_collate(self, batch): 77 | return default_collate(batch) 78 | 79 | def _zero_pad_tensor(self, tensor, size): 80 | if len(tensor) >= size: 81 | return tensor[:size] 82 | else: 83 | zero = np.zeros((size - len(tensor), self.we_dim), dtype=np.float32) 84 | return np.concatenate((tensor, zero), axis=0) 85 | 86 | def _tokenize_text(self, sentence): 87 | w = re.findall(r"[\w']+", str(sentence)) 88 | return w 89 | 90 | def _words_to_we(self, words): 91 | words = [word for word in words if word in self.we.vocab] 92 | if words: 93 | we = self._zero_pad_tensor(self.we[words], self.max_words) 94 | return th.from_numpy(we) 95 | else: 96 | return th.zeros(self.max_words, self.we_dim) 97 | 98 | def _get_caption(self, idx): 99 | """Chooses random caption if training. 
Uses set caption if evaluating.""" 100 | if self.training: 101 | captions = idx 102 | caption = self._words_to_we(self._tokenize_text(random.choice(captions))) 103 | return caption 104 | else: 105 | caption = idx 106 | return self._words_to_we(self._tokenize_text(caption)) 107 | 108 | def __getitem__(self, idx): 109 | data = self.data[idx] 110 | # load 2d and 3d features (features are pooled over the time dimension) 111 | 112 | if self.finetune_video: 113 | feat_2d = th.from_numpy(self.data[idx]['2d']).float() 114 | feat_3d = th.from_numpy(self.data[idx]['3d']).float() 115 | if self.video_interp: 116 | feat_2d = F.interpolate(feat_2d.transpose(1, 0).unsqueeze(0), self.max_frames, mode='linear', 117 | align_corners=True).squeeze(0) 118 | feat_3d = F.interpolate(feat_3d.transpose(1, 0).unsqueeze(0), self.max_frames, mode='linear', 119 | align_corners=True).squeeze(0) 120 | else: 121 | feat2d_buffer = th.zeros(self.max_frames, feat_2d.shape[-1]) 122 | feat_2d = feat_2d[:self.max_frames] 123 | feat2d_buffer[:len(feat_2d)] = feat_2d 124 | 125 | feat3d_buffer = th.zeros(self.max_frames, feat_3d.shape[-1]) 126 | feat_3d = feat_3d[:self.max_frames] 127 | feat3d_buffer[:len(feat_3d)] = feat_3d 128 | 129 | feat_2d = feat2d_buffer.transpose(1, 0) 130 | feat_3d = feat3d_buffer.transpose(1, 0) 131 | 132 | feat_2d = F.normalize(feat_2d, dim=0) 133 | feat_3d = F.normalize(feat_3d, dim=0) 134 | video = th.cat((feat_2d, feat_3d), dim=0) 135 | else: 136 | feat_2d = F.normalize(th.from_numpy(self.data[idx]['2d_pooled']).float(), dim=0) 137 | feat_3d = F.normalize(th.from_numpy(self.data[idx]['3d_pooled']).float(), dim=0) 138 | video = th.cat((feat_2d, feat_3d)) 139 | 140 | # load audio and zero pad/truncate if necessary 141 | audio = th.FloatTensor(th.from_numpy(np.zeros((40, 1000), dtype=np.float32))) 142 | 143 | # choose a caption 144 | caption = '' 145 | name = self.data[idx]['class'] 146 | if self.tri_modal: 147 | word_list = name_to_stringlist(name) 148 | caption = ' '.join(word_list) 149 | caption = self._get_caption(caption) 150 | 151 | return {'video': video, 'text': caption, 'video_id': idx, 152 | 'audio': audio, 'nframes': 32, 'class_name': name, 153 | 'class_id': th.ones(1)*self.classes.index(name), 154 | 'has_audio': th.zeros(1), 155 | 'video_name': self.data[idx]['video'], 156 | 'training': th.ones(1)*self.data[idx]['training']} 157 | 158 | 159 | class MSRVTT_DataLoader_label(Dataset): 160 | """MSRVTT dataset loader.""" 161 | 162 | def __init__( 163 | self, 164 | data_path, 165 | we, 166 | pseudo_v, 167 | pseudo_a, 168 | we_dim=300, 169 | max_words=30, 170 | num_frames_multiplier=5, 171 | training=True, 172 | tri_modal=False, 173 | ): 174 | """ 175 | Args: 176 | """ 177 | self.data = pickle.load(open(data_path, 'rb')) 178 | self.we = we 179 | self.we_dim = we_dim 180 | self.max_words = max_words 181 | self.max_video = 30 182 | self.num_frames_multiplier = num_frames_multiplier 183 | self.training = training 184 | self.tri_modal = tri_modal 185 | self.pseudo_v = pseudo_v 186 | self.pseudo_a = pseudo_a 187 | 188 | 189 | 190 | def __len__(self): 191 | return len(self.data) 192 | 193 | def custom_collate(self, batch): 194 | return default_collate(batch) 195 | 196 | def _zero_pad_tensor(self, tensor, size): 197 | if len(tensor) >= size: 198 | return tensor[:size] 199 | else: 200 | zero = np.zeros((size - len(tensor), self.we_dim), dtype=np.float32) 201 | return np.concatenate((tensor, zero), axis=0) 202 | 203 | def _tokenize_text(self, sentence): 204 | w = re.findall(r"[\w']+", str(sentence)) 205 | 
return w 206 | 207 | def _words_to_we(self, words): 208 | words = [word for word in words if word in self.we.vocab] 209 | if words: 210 | we = self._zero_pad_tensor(self.we[words], self.max_words) 211 | return th.from_numpy(we) 212 | else: 213 | return th.zeros(self.max_words, self.we_dim) 214 | 215 | def _get_caption(self, idx): 216 | """Chooses random caption if training. Uses set caption if evaluating.""" 217 | if self.training: 218 | captions = self.data[idx]['caption'] 219 | caption = self._words_to_we(self._tokenize_text(random.choice(captions))) 220 | return caption 221 | else: 222 | caption = self.data[idx]['eval_caption'] 223 | return self._words_to_we(self._tokenize_text(caption)) 224 | 225 | def __getitem__(self, idx): 226 | video_id = self.data[idx]['id'] 227 | # load 2d and 3d features (features are pooled over the time dimension) 228 | feat_2d = F.normalize(th.from_numpy(self.data[idx]['2d_pooled']).float(), dim=0) 229 | feat_3d = F.normalize(th.from_numpy(self.data[idx]['3d_pooled']).float(), dim=0) 230 | video = th.cat((feat_2d, feat_3d)) 231 | 232 | # load audio and zero pad/truncate if necessary 233 | audio = self.data[idx]['audio'] 234 | target_length = 1024 * self.num_frames_multiplier 235 | nframes = audio.numpy().shape[1] 236 | p = target_length - nframes 237 | if p > 0: 238 | audio = np.pad(audio, ((0, 0), (0, p)), 'constant', constant_values=(0, 0)) 239 | elif p < 0: 240 | audio = audio[:, 0:p] 241 | audio = th.FloatTensor(audio) 242 | 243 | # choose a caption 244 | caption = '' 245 | if self.tri_modal: 246 | caption = self._get_caption(idx) 247 | 248 | return {'video': video, 'text': caption, 'video_id': self.data[idx]['id'], 249 | 'audio': audio, 'nframes': nframes, 'pseudo_v': self.pseudo_v[idx], 'pseudo_a': self.pseudo_a[idx]} 250 | -------------------------------------------------------------------------------- /local_eval.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from metrics import compute_metrics, print_computed_metrics 3 | import pickle 4 | import torch 5 | 6 | 7 | eval_lang_retrieval = 0 8 | eval_msrvtt = 1 9 | 10 | data = pickle.load(open('temp_data/MSR-VTT.pkl', 'rb')) 11 | #data = pickle.load(open('temp_data/YouCook2.pkl', 'rb')) 12 | 13 | text = data['text'] 14 | video = data['video'] 15 | audio = data['audio'] 16 | 17 | text2 = data['out_t'] 18 | video2 = data['out_v'] 19 | audio2 = data['out_a'] 20 | 21 | text3 = data['out_t2'] 22 | video3 = data['out_v2'] 23 | audio3 = data['out_a2'] 24 | 25 | #m = np.matmul(text, video.T) #+ np.matmul(text2, video2.T) 26 | #m = np.matmul(text, (video+audio).T) #+ np.matmul(text2, video2.T)#+ np.matmul(text, audio.T) 27 | m = np.matmul(text, (video).T)# + np.matmul(text, (audio).T) 28 | #m = np.matmul(text, (audio).T) 29 | 30 | metrics = compute_metrics(m, eval_lang_retrieval, eval_msrvtt) 31 | print('Combined Space') 32 | print_computed_metrics(metrics) 33 | 34 | def norm(mat, axis=-1): 35 | return np.sqrt(np.sum(mat**2, axis=axis, keepdims=True) + 1e-9) 36 | 37 | 38 | def softmax(x, axis=-1): 39 | return np.exp(x)/np.sum(np.exp(x)+1e-12, axis=axis, keepdims=True) 40 | 41 | #text2 = text3#softmax(text2*10) 42 | #video2 = video3#softmax(video2*10) 43 | 44 | m = np.matmul(text2, (video2).T)# + np.matmul(text2, (audio2).T) 45 | 46 | metrics = compute_metrics(m, eval_lang_retrieval, eval_msrvtt) 47 | print('Dot Product on Embedding 2') 48 | print_computed_metrics(metrics) 49 | 50 | text2 = softmax(text2*10) 51 | video2 = softmax(video2*10) 52 | m = 
np.matmul(text2, (video2).T)# + np.matmul(text2, (audio2).T) 53 | metrics = compute_metrics(m, eval_lang_retrieval, eval_msrvtt) 54 | print('Dot Product on softmax Embedding 2 x10 temp') 55 | print_computed_metrics(metrics) 56 | 57 | 58 | text2 = text3#softmax(text2*10) 59 | video2 = text3#softmax(video2*10) 60 | m = np.matmul(text3, (video3).T)# + np.matmul(text2, (audio2).T) 61 | metrics = compute_metrics(m, eval_lang_retrieval, eval_msrvtt) 62 | print('Dot Product on normalized Embedding') 63 | print_computed_metrics(metrics) 64 | 65 | exit() 66 | m = torch.zeros((text2.shape[0], video2.shape[0])) 67 | 68 | text2 = torch.from_numpy(text2) 69 | video2 = torch.from_numpy(video2) 70 | 71 | 72 | for i, v in enumerate(video2): 73 | diff = (text2 - torch.unsqueeze(v, 0)) ** 2 74 | diff = torch.sum(diff, -1) 75 | m[:, i] = 0-diff 76 | 77 | metrics = compute_metrics(m, eval_lang_retrieval, eval_msrvtt) 78 | print('Euclidian Distance Embedding 2') 79 | print_computed_metrics(metrics) 80 | 81 | -------------------------------------------------------------------------------- /loss.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | from __future__ import print_function 5 | 6 | import torch.nn.functional as F 7 | import torch as th 8 | import numpy as np 9 | 10 | class MMS_loss(th.nn.Module): 11 | def __init__(self): 12 | super(MMS_loss, self).__init__() 13 | 14 | def forward(self, S, margin=0.001): 15 | deltas = margin * th.eye(S.size(0)).to(S.device) 16 | S = S - deltas 17 | 18 | target = th.LongTensor(list(range(S.size(0)))).to(S.device) 19 | I2C_loss = F.nll_loss(F.log_softmax(S, dim=1), target) 20 | C2I_loss = F.nll_loss(F.log_softmax(S.t(), dim=1), target) 21 | loss = I2C_loss + C2I_loss 22 | return loss -------------------------------------------------------------------------------- /loss_mil.py: -------------------------------------------------------------------------------- 1 | import torch as th 2 | 3 | 4 | class MILNCELoss(th.nn.Module): 5 | def __init__(self): 6 | super(MILNCELoss, self).__init__() 7 | 8 | def forward(self, video_embd, text_embd): 9 | x = th.matmul(video_embd, text_embd.t()) 10 | x = x.view(video_embd.shape[0], video_embd.shape[0], -1) 11 | nominator = x * th.eye(x.shape[0])[:,:,None].cuda() 12 | nominator = nominator.sum(dim=1) 13 | nominator = th.logsumexp(nominator, dim=1) 14 | denominator = th.cat((x, x.permute(1,0,2)), dim=1).view(x.shape[0], -1) 15 | denominator = th.logsumexp(denominator, dim=1) 16 | return th.mean(denominator - nominator) -------------------------------------------------------------------------------- /loss_sink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | from __future__ import print_function 5 | 6 | import torch.nn.functional as F 7 | import torch as th 8 | import numpy as np 9 | eps = 1e-7 10 | 11 | class MMS_loss(th.nn.Module): 12 | def __init__(self): 13 | super(MMS_loss, self).__init__() 14 | 15 | def forward(self, S,audio,video, margin=0.001): 16 | #print(audio.shape) 17 | #print(video.shape) 18 | 19 | #video = video.view(64,4, video.shape[-1])[:,0,:].squeeze() 20 | #audio = audio.view(64, 4, audio.shape[-1])[:, 0, :].squeeze() 21 | 22 | #print(video.shape) 23 | #video = video.permute(1, 0, 2) #4*64*4096 24 | #audio = 
audio.view(64,4, audio.shape[-1]) 25 | video_embd = audio 26 | text_embd = video 27 | deltas = margin * th.eye(S.size(0)).cuda()#.to(S.device) #batch size eye 28 | #S = th.matmul(audio, video.t()) # 256*4096 29 | S = S - deltas #??? 30 | """ 31 | pseudo_v = pseudo_v.cpu().detach().numpy() 32 | soft = th.nn.Softmax(dim=1)(S) 33 | #pseudo_a = pseudo_v.cpu().detach().numpy() 34 | z_arr = np.ones((256, 256), dtype=float) 35 | for i in range(256): 36 | result, = np.where(pseudo_v == pseudo_v[i]) 37 | # print(result) 38 | for r in result: 39 | # print(r) 40 | if i==r: 41 | z_arr[i][r] = 1#-1000 42 | else: 43 | #if S[i][r]>0: 44 | z_arr[i][r] = 1#-soft[i][r]#0.001 45 | #print(1-soft[i][r]) 46 | # break 47 | #print(z_arr) 48 | z_arr = th.from_numpy(z_arr).type(th.FloatTensor).to(S.device)#z_arr.cuda() 49 | """ 50 | """ 51 | target = th.LongTensor(list(range(S.size(0)))).cuda()#.to(S.device) #0 to batch size list of numbers 52 | #print(target) 53 | #print(pseudo_a) 54 | #target_a = th.LongTensor(pseudo_a).to(S.device) 55 | #print(target_a) 56 | #target_v = th.LongTensor(pseudo_v).to(S.device) 57 | 58 | 59 | I2C_loss = F.nll_loss(F.log_softmax(S, dim=1), target) #softmax on feature 60 | C2I_loss = F.nll_loss(F.log_softmax(S.t(), dim=1), target) 61 | loss = I2C_loss + C2I_loss 62 | 63 | #I2C_loss = th.nn.BCELoss()(F.softmax(S, dim=1), z_arr) # softmax on feature 64 | #C2I_loss = th.nn.BCELoss()(F.softmax(S.t(), dim=1), z_arr) 65 | #loss = I2C_loss + C2I_loss 66 | 67 | #return loss 68 | #""" 69 | #""" 70 | #video_embd = pseudo_v 71 | #text_embd = pseudo_a 72 | x = th.matmul(video_embd, text_embd.t()) 73 | 74 | x = S 75 | x = x.view(video_embd.shape[0], video_embd.shape[0], -1) # batch*batch*1 76 | 77 | #print(S) 78 | #x = x.view(S.shape[0], S.shape[0], -1) # batch*batch*1 79 | nominator = x * th.eye(x.shape[0])[:, :, None].cuda() # correct pairs, assume batches are same video 80 | #nominator = x * z_arr[:, :, None] 81 | #print(z_arr) 82 | #print(nominator) 83 | # replace eye by our one hot cluster label 84 | nominator = nominator.sum(dim=1) 85 | nominator = th.logsumexp(nominator, dim=1) 86 | #print(nominator) 87 | #p = x * z_arr[:, :, None] 88 | #pos = th.logsumexp(pos, dim=1) 89 | 90 | #pos = th.cat((p, p.permute(1, 0, 2)), dim=1).view(p.shape[0], -1) 91 | #pos = th.logsumexp(pos, dim=1) 92 | 93 | #x = x * z_arr[:, :, None] 94 | denominator = th.cat((x, x.permute(1, 0, 2)), dim=1).view(x.shape[0], -1) 95 | denominator = th.logsumexp(denominator, dim=1) 96 | #print(nominator) 97 | #print(denominator) 98 | return th.mean(denominator- nominator ) 99 | #""" 100 | """ 101 | numerator = th.logsumexp(th.diag(S).view(-1, 1), dim=1) # only diagnal 102 | #print(th.diag(S).shape) 103 | #print(th.diag(S).view(-1, 1).shape) # 256*1 104 | #print(numerator.shape) #[256] 105 | denominator = th.logsumexp(th.cat([S, S.t()], dim=1), dim=1) 106 | #print(th.cat([S, S.t()], dim=1).shape) 107 | #print(denominator.shape) #256 108 | loss = th.mean(denominator - numerator) 109 | print(numerator) 110 | print(denominator) 111 | """ 112 | #return loss 113 | 114 | 115 | #return loss 116 | 117 | 118 | 119 | -------------------------------------------------------------------------------- /lsmdc_dataloader.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | from __future__ import print_function 5 | 6 | import torch as th 7 | from torch.utils.data import Dataset 8 | import 
pickle 9 | import torch.nn.functional as F 10 | import numpy as np 11 | import re 12 | from torch.utils.data.dataloader import default_collate 13 | 14 | class LSMDC_DataLoader(Dataset): 15 | """LSMDC dataset loader.""" 16 | 17 | def __init__( 18 | self, 19 | data_path, 20 | we, 21 | we_dim=300, 22 | max_words=30, 23 | num_frames_multiplier=5, 24 | tri_modal=False, 25 | ): 26 | """ 27 | Args: 28 | """ 29 | self.data = pickle.load(open(data_path, 'rb')) 30 | self.we = we 31 | self.we_dim = we_dim 32 | self.max_words = max_words 33 | self.max_video = 30 34 | self.num_frames_multiplier = num_frames_multiplier 35 | self.tri_modal = tri_modal 36 | 37 | def __len__(self): 38 | return len(self.data) 39 | 40 | def custom_collate(self, batch): 41 | return default_collate(batch) 42 | 43 | def _zero_pad_tensor(self, tensor, size): 44 | if len(tensor) >= size: 45 | return tensor[:size] 46 | else: 47 | zero = np.zeros((size - len(tensor), self.we_dim), dtype=np.float32) 48 | return np.concatenate((tensor, zero), axis=0) 49 | 50 | def _tokenize_text(self, sentence): 51 | w = re.findall(r"[\w']+", str(sentence)) 52 | return w 53 | 54 | def _words_to_we(self, words): 55 | words = [word for word in words if word in self.we.vocab] 56 | if words: 57 | we = self._zero_pad_tensor(self.we[words], self.max_words) 58 | return th.from_numpy(we) 59 | else: 60 | return th.zeros(self.max_words, self.we_dim) 61 | 62 | def __getitem__(self, idx): 63 | video_id = self.data[idx]['id'] 64 | # load 2d and 3d features (features are pooled over the time dimension) 65 | feat_2d = F.normalize(self.data[idx]['2d_pooled'].float(), dim=0) 66 | feat_3d = F.normalize(self.data[idx]['3d_pooled'].float(), dim=0) 67 | video = th.cat((feat_2d, feat_3d)) 68 | 69 | # load audio and zero pad/truncate if necessary 70 | audio = self.data[idx]['audio'] 71 | target_length = 1024 * self.num_frames_multiplier 72 | nframes = audio.numpy().shape[1] 73 | p = target_length - nframes 74 | if p > 0: 75 | audio = np.pad(audio, ((0,0),(0,p)), 'constant', constant_values=(0,0)) 76 | elif p < 0: 77 | audio = audio[:,0:p] 78 | audio = th.FloatTensor(audio) 79 | 80 | # choose a caption 81 | caption = '' 82 | if self.tri_modal: 83 | caption = self._words_to_we(self._tokenize_text(self.data[idx]['caption'])) 84 | 85 | return {'video': video, 'text': caption, 'video_id': video_id, 86 | 'audio': audio, 'nframes': nframes} 87 | 88 | 89 | class LSMDC_DataLoader_label(Dataset): 90 | """LSMDC dataset loader.""" 91 | 92 | def __init__( 93 | self, 94 | data_path, 95 | we, 96 | pseudo_v, 97 | pseudo_a, 98 | we_dim=300, 99 | max_words=30, 100 | num_frames_multiplier=5, 101 | tri_modal=False, 102 | ): 103 | """ 104 | Args: 105 | """ 106 | self.data = pickle.load(open(data_path, 'rb')) 107 | self.we = we 108 | self.we_dim = we_dim 109 | self.max_words = max_words 110 | self.max_video = 30 111 | self.num_frames_multiplier = num_frames_multiplier 112 | self.tri_modal = tri_modal 113 | self.pseudo_v = pseudo_v 114 | self.pseudo_a = pseudo_a 115 | 116 | def __len__(self): 117 | return len(self.data) 118 | 119 | def custom_collate(self, batch): 120 | return default_collate(batch) 121 | 122 | def _zero_pad_tensor(self, tensor, size): 123 | if len(tensor) >= size: 124 | return tensor[:size] 125 | else: 126 | zero = np.zeros((size - len(tensor), self.we_dim), dtype=np.float32) 127 | return np.concatenate((tensor, zero), axis=0) 128 | 129 | def _tokenize_text(self, sentence): 130 | w = re.findall(r"[\w']+", str(sentence)) 131 | return w 132 | 133 | def _words_to_we(self, words): 
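        # Keep only tokens present in the word2vec vocabulary, look up their
        # embeddings, and zero-pad / truncate the result to max_words rows;
        # fall back to an all-zero (max_words, we_dim) tensor when no token
        # is in the vocabulary.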
134 | words = [word for word in words if word in self.we.vocab] 135 | if words: 136 | we = self._zero_pad_tensor(self.we[words], self.max_words) 137 | return th.from_numpy(we) 138 | else: 139 | return th.zeros(self.max_words, self.we_dim) 140 | 141 | def __getitem__(self, idx): 142 | video_id = self.data[idx]['id'] 143 | # load 2d and 3d features (features are pooled over the time dimension) 144 | feat_2d = F.normalize(self.data[idx]['2d_pooled'].float(), dim=0) 145 | feat_3d = F.normalize(self.data[idx]['3d_pooled'].float(), dim=0) 146 | video = th.cat((feat_2d, feat_3d)) 147 | 148 | # load audio and zero pad/truncate if necessary 149 | audio = self.data[idx]['audio'] 150 | target_length = 1024 * self.num_frames_multiplier 151 | nframes = audio.numpy().shape[1] 152 | p = target_length - nframes 153 | if p > 0: 154 | audio = np.pad(audio, ((0, 0), (0, p)), 'constant', constant_values=(0, 0)) 155 | elif p < 0: 156 | audio = audio[:, 0:p] 157 | audio = th.FloatTensor(audio) 158 | 159 | # choose a caption 160 | caption = '' 161 | if self.tri_modal: 162 | caption = self._words_to_we(self._tokenize_text(self.data[idx]['caption'])) 163 | 164 | return {'video': video, 'text': caption, 'video_id': self.data[idx]['id'], 165 | 'audio': audio, 'nframes': nframes, 'pseudo_v': self.pseudo_v[idx], 'pseudo_a': self.pseudo_a[idx]} -------------------------------------------------------------------------------- /metrics.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | from __future__ import print_function 5 | 6 | import numpy as np 7 | 8 | def compute_metrics(x, eval_lang_retrieval=False, eval_msrvtt=False): 9 | if eval_lang_retrieval: 10 | print("Retrieving language given input video clips") 11 | x = x.T 12 | else: 13 | print("Retrieving video clips given input language") 14 | sx = np.sort(-x, axis=1) 15 | d = np.diag(-x) 16 | d = d[:, np.newaxis] 17 | ind = sx - d 18 | ind = np.where(ind == 0) 19 | ind = ind[1] 20 | metrics = {} 21 | test_set_size = x.shape[0] if not eval_msrvtt else 1000 22 | if eval_msrvtt: print("MSR-VTT: counting {} missing test clips as mistakes".format(1000 - x.shape[0])) 23 | metrics['R1'] = float(np.sum(ind == 0)) / test_set_size 24 | metrics['R5'] = float(np.sum(ind < 5)) / test_set_size 25 | metrics['R10'] = float(np.sum(ind < 10)) / test_set_size 26 | metrics['MR'] = np.median(ind) + 1 27 | return metrics 28 | 29 | def print_computed_metrics(metrics): 30 | r1 = metrics['R1'] 31 | r5 = metrics['R5'] 32 | r10 = metrics['R10'] 33 | mr = metrics['MR'] 34 | print('R@1: {:.4f} - R@5: {:.4f} - R@10: {:.4f} - Median R: {}'.format(r1, r5, r10, mr)) 35 | return r1,r5,r10,mr 36 | 37 | class AverageMeter(object): 38 | """Computes and stores the average and current value""" 39 | def __init__(self): 40 | self.reset() 41 | 42 | def reset(self): 43 | self.val = 0 44 | self.avg = 0 45 | self.sum = 0 46 | self.count = 0 47 | 48 | def update(self, val, n=1): 49 | self.val = val 50 | self.sum += val * n 51 | self.count += n 52 | self.avg = self.sum / self.count -------------------------------------------------------------------------------- /minY_dataloader.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | from __future__ import print_function 5 | 6 | import torch as th 7 | 
from torch.utils.data import Dataset 8 | import pickle 9 | import torch.nn.functional as F 10 | import numpy as np 11 | import re 12 | from torch.utils.data.dataloader import default_collate 13 | 14 | class Youcook_DataLoader(Dataset): 15 | """Youcook dataset loader.""" 16 | 17 | def __init__( 18 | self, 19 | data, 20 | we, 21 | we_dim=300, 22 | max_words=30, 23 | num_frames_multiplier=5, 24 | tri_modal=False, 25 | ): 26 | """ 27 | Args: 28 | """ 29 | self.data = pickle.load(open(data, 'rb')) 30 | self.we = we 31 | self.we_dim = we_dim 32 | self.max_words = max_words 33 | self.num_frames_multiplier = num_frames_multiplier 34 | self.tri_modal = tri_modal 35 | 36 | def __len__(self): 37 | return len(self.data) 38 | 39 | def custom_collate(self, batch): 40 | return default_collate(batch) 41 | 42 | def _zero_pad_tensor(self, tensor, size): 43 | if len(tensor) >= size: 44 | return tensor[:size] 45 | else: 46 | zero = np.zeros((size - len(tensor), self.we_dim), dtype=np.float32) 47 | return np.concatenate((tensor, zero), axis=0) 48 | 49 | def _tokenize_text(self, sentence): 50 | w = re.findall(r"[\w']+", str(sentence)) 51 | return w 52 | 53 | def _words_to_we(self, words): 54 | words = [word for word in words if word in self.we.vocab] 55 | if words: 56 | we = self._zero_pad_tensor(self.we[words], self.max_words) 57 | return th.from_numpy(we) 58 | else: 59 | return th.zeros(self.max_words, self.we_dim) 60 | 61 | def __getitem__(self, idx): 62 | # load 2d and 3d features (features are pooled over the time dimension) 63 | feat_2d = F.normalize(th.from_numpy(self.data[idx]['2d']).float(), dim=0) 64 | feat_3d = F.normalize(th.from_numpy(self.data[idx]['3d']).float(), dim=0) 65 | video = th.cat((feat_2d, feat_3d)) 66 | 67 | # load audio and zero pad/truncate if necessary 68 | audio = self.data[idx]['audio'] 69 | target_length = 1024 * self.num_frames_multiplier 70 | nframes = audio.numpy().shape[1] 71 | p = target_length - nframes 72 | if p > 0: 73 | audio = np.pad(audio, ((0,0),(0,p)), 'constant', constant_values=(0,0)) 74 | elif p < 0: 75 | audio = audio[:,0:p] 76 | audio = th.FloatTensor(audio) 77 | 78 | caption = '' 79 | if self.tri_modal: 80 | caption = self._words_to_we(self._tokenize_text(self.data[idx]['caption'])) 81 | 82 | return {'video': video, 'text': caption, 'video_id': self.data[idx]['id'], 83 | 'audio': audio, 'nframes': nframes} 84 | 85 | 86 | class Youcook_DataLoader_label(Dataset): 87 | """Youcook dataset loader.""" 88 | 89 | def __init__( 90 | self, 91 | data, 92 | we, 93 | pseudo_v, 94 | pseudo_a, 95 | we_dim=300, 96 | max_words=30, 97 | num_frames_multiplier=5, 98 | tri_modal=False, 99 | 100 | ): 101 | """ 102 | Args: 103 | """ 104 | self.data = pickle.load(open(data, 'rb')) #9000*4800 105 | self.we = we 106 | self.we_dim = we_dim 107 | self.max_words = max_words 108 | self.num_frames_multiplier = num_frames_multiplier 109 | self.tri_modal = tri_modal 110 | self.pseudo_v = pseudo_v 111 | self.pseudo_a = pseudo_a 112 | 113 | def __len__(self): 114 | return len(self.data) 115 | 116 | def custom_collate(self, batch): 117 | return default_collate(batch) 118 | 119 | def _zero_pad_tensor(self, tensor, size): 120 | if len(tensor) >= size: 121 | return tensor[:size] 122 | else: 123 | zero = np.zeros((size - len(tensor), self.we_dim), dtype=np.float32) 124 | return np.concatenate((tensor, zero), axis=0) 125 | 126 | def _tokenize_text(self, sentence): 127 | w = re.findall(r"[\w']+", str(sentence)) 128 | return w 129 | 130 | def _words_to_we(self, words): 131 | words = [word for word in 
words if word in self.we.vocab] 132 | if words: 133 | we = self._zero_pad_tensor(self.we[words], self.max_words) 134 | return th.from_numpy(we) 135 | else: 136 | return th.zeros(self.max_words, self.we_dim) 137 | 138 | def __getitem__(self, idx): 139 | # load 2d and 3d features (features are pooled over the time dimension) 140 | feat_2d = F.normalize(th.from_numpy(self.data[idx]['2d']).float(), dim=0) 141 | feat_3d = F.normalize(th.from_numpy(self.data[idx]['3d']).float(), dim=0) 142 | video = th.cat((feat_2d, feat_3d)) 143 | 144 | # load audio and zero pad/truncate if necessary 145 | audio = self.data[idx]['audio'] 146 | target_length = 1024 * self.num_frames_multiplier 147 | nframes = audio.numpy().shape[1] 148 | p = target_length - nframes 149 | if p > 0: 150 | audio = np.pad(audio, ((0, 0), (0, p)), 'constant', constant_values=(0, 0)) 151 | elif p < 0: 152 | audio = audio[:, 0:p] 153 | audio = th.FloatTensor(audio) 154 | 155 | caption = '' 156 | if self.tri_modal: 157 | caption = self._words_to_we(self._tokenize_text(self.data[idx]['caption'])) 158 | 159 | return {'video': video, 'text': caption, 'video_id': self.data[idx]['id'], 160 | 'audio': audio, 'nframes': nframes, 'pseudo_v':self.pseudo_v[idx], 'pseudo_a':self.pseudo_a[idx]} 161 | 162 | class Youcook_DataLoader_knn(Dataset): 163 | """Youcook dataset loader.""" 164 | 165 | def __init__( 166 | self, 167 | data, 168 | we, 169 | knn_v, 170 | knn_a, 171 | we_dim=300, 172 | max_words=30, 173 | num_frames_multiplier=5, 174 | tri_modal=False, 175 | 176 | ): 177 | """ 178 | Args: 179 | """ 180 | self.data = pickle.load(open(data, 'rb')) #9000*4800 181 | self.we = we 182 | self.we_dim = we_dim 183 | self.max_words = max_words 184 | self.num_frames_multiplier = num_frames_multiplier 185 | self.tri_modal = tri_modal 186 | self.knn_v = knn_v 187 | self.knn_a = knn_a 188 | 189 | def __len__(self): 190 | return len(self.data) 191 | 192 | def custom_collate(self, batch): 193 | return default_collate(batch) 194 | 195 | def _zero_pad_tensor(self, tensor, size): 196 | if len(tensor) >= size: 197 | return tensor[:size] 198 | else: 199 | zero = np.zeros((size - len(tensor), self.we_dim), dtype=np.float32) 200 | return np.concatenate((tensor, zero), axis=0) 201 | 202 | def _tokenize_text(self, sentence): 203 | w = re.findall(r"[\w']+", str(sentence)) 204 | return w 205 | 206 | def _words_to_we(self, words): 207 | words = [word for word in words if word in self.we.vocab] 208 | if words: 209 | we = self._zero_pad_tensor(self.we[words], self.max_words) 210 | return th.from_numpy(we) 211 | else: 212 | return th.zeros(self.max_words, self.we_dim) 213 | 214 | def __getitem__(self, idx): 215 | video_feature = [] 216 | text_feature = [] 217 | audio_feature = [] 218 | nframes_list = [] 219 | caption_text = [] 220 | for i in self.knn_v[idx]: 221 | # load 2d and 3d features (features are pooled over the time dimension) 222 | feat_2d = F.normalize(th.from_numpy(self.data[i]['2d']).float(), dim=0) 223 | feat_3d = F.normalize(th.from_numpy(self.data[i]['3d']).float(), dim=0) 224 | video = th.cat((feat_2d, feat_3d)) 225 | video_feature.append(video.numpy()) 226 | # load audio and zero pad/truncate if necessary 227 | audio = self.data[i]['audio'] 228 | target_length = 1024 * self.num_frames_multiplier 229 | nframes = audio.numpy().shape[1] 230 | nframes_list.append(nframes) 231 | p = target_length - nframes 232 | if p > 0: 233 | audio = np.pad(audio, ((0, 0), (0, p)), 'constant', constant_values=(0, 0)) 234 | elif p < 0: 235 | audio = audio[:, 0:p] 236 | audio = 
th.FloatTensor(audio) 237 | audio_feature.append(audio.numpy()) 238 | caption = '' 239 | if self.tri_modal: 240 | caption = self._words_to_we(self._tokenize_text(self.data[i]['caption'])) 241 | text_feature.append(caption.numpy()) 242 | video_f = np.asarray(video_feature) 243 | text_f = np.asarray(text_feature) 244 | audio_f = np.asarray(audio_feature) 245 | nframes_l = np.asarray(nframes_list) 246 | """ 247 | print('dataload') 248 | print(video_f.shape) 249 | print(text_f.shape) 250 | print(audio_f.shape) 251 | print(nframes_l.shape) 252 | print('dataload_fin') 253 | """ 254 | #caption_text = 255 | return {'video': video_f, 'text': text_f, 'video_id': self.data[i]['id'], 256 | 'audio': audio_f, 'nframes': nframes_l} -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | from __future__ import print_function 5 | 6 | import torch.nn as nn 7 | import torch as th 8 | import torch.nn.functional as F 9 | import re 10 | 11 | class Net(nn.Module): 12 | def __init__( 13 | self, 14 | embd_dim=1024, 15 | video_dim=2048, 16 | n_pair=1, 17 | we_dim=300, 18 | max_words=30, 19 | sentence_dim=-1, 20 | mil=0, 21 | no_norm=0, 22 | we=None, 23 | ): 24 | super(Net, self).__init__() 25 | if sentence_dim <= 0: 26 | self.text_pooling = Sentence_Maxpool(we_dim, embd_dim) 27 | else: 28 | self.text_pooling = Sentence_Maxpool(we_dim, sentence_dim) 29 | self.GU_text = Gated_Embedding_Unit( 30 | self.text_pooling.out_dim, embd_dim, gating=True) 31 | self.GU_video = Gated_Embedding_Unit( 32 | video_dim, embd_dim, gating=True) 33 | self.n_pair = n_pair 34 | self.embd_dim = embd_dim 35 | self.we = we 36 | self.we_dim = we_dim 37 | self.mil=mil 38 | self.no_norm = no_norm 39 | 40 | 41 | def save_checkpoint(self, path): 42 | th.save(self.state_dict(), path) 43 | 44 | def load_checkpoint(self, path, cpu=False): 45 | if cpu: 46 | self.load_state_dict(th.load(path, 47 | map_location=lambda storage, loc: storage)) 48 | else: 49 | self.load_state_dict(th.load(path)) 50 | 51 | def forward(self, video, text): 52 | video = self.GU_video(video) #[frames,emb] 53 | text = self.GU_text(self.text_pooling(text)) #[class,emb] 54 | if self.mil==0: 55 | return th.matmul(text, video.t()) #[class,frame] 56 | else: 57 | return video, text 58 | 59 | 60 | 61 | class Gated_Embedding_Unit(nn.Module): 62 | def __init__(self, input_dimension, output_dimension, gating=True): 63 | super(Gated_Embedding_Unit, self).__init__() 64 | self.fc = nn.Linear(input_dimension, output_dimension) 65 | self.cg = Context_Gating(output_dimension) 66 | self.gating = gating 67 | 68 | def forward(self, x): 69 | x = self.fc(x) 70 | if self.gating: 71 | x = self.cg(x) 72 | # un comment when inferencing 73 | x = F.normalize(x) 74 | return x 75 | 76 | class Sentence_Maxpool(nn.Module): 77 | def __init__(self, word_dimension, output_dim, relu=True): 78 | super(Sentence_Maxpool, self).__init__() 79 | self.fc = nn.Linear(word_dimension, output_dim) 80 | self.out_dim = output_dim 81 | self.relu = relu 82 | 83 | def forward(self, x): 84 | x = self.fc(x) 85 | if self.relu: 86 | x = F.relu(x) 87 | return th.max(x, dim=1)[0] 88 | 89 | 90 | class Context_Gating(nn.Module): 91 | def __init__(self, dimension, add_batch_norm=False): 92 | super(Context_Gating, self).__init__() 93 | self.fc = nn.Linear(dimension, dimension) 94 | 
self.add_batch_norm = add_batch_norm 95 | self.batch_norm = nn.BatchNorm1d(dimension) 96 | 97 | def forward(self, x): 98 | x1 = self.fc(x) 99 | if self.add_batch_norm: 100 | x1 = self.batch_norm(x1) 101 | x = th.cat((x, x1), 1) 102 | return F.glu(x, 1) 103 | -------------------------------------------------------------------------------- /model_davenet.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch.nn as nn 3 | import librosa 4 | import numpy as np 5 | import scipy.signal 6 | 7 | def conv1x9(in_planes, out_planes, stride=1): 8 | """1x9 convolution with padding""" 9 | return nn.Conv2d(in_planes, out_planes, kernel_size=(1,9), stride=stride, padding=(0,4), bias=False) 10 | 11 | def conv1d(in_planes, out_planes, width=9, stride=1, bias=False): 12 | """1xd convolution with padding""" 13 | if width % 2 == 0: 14 | pad_amt = int(width / 2) 15 | else: 16 | pad_amt = int((width - 1) / 2) 17 | return nn.Conv2d(in_planes, out_planes, kernel_size=(1,width), stride=stride, padding=(0,pad_amt), bias=bias) 18 | 19 | class SpeechBasicBlock(nn.Module): 20 | expansion = 1 21 | def __init__(self, inplanes, planes, width=9, stride=1, downsample=None): 22 | super(SpeechBasicBlock, self).__init__() 23 | self.conv1 = conv1d(inplanes, planes, width=width, stride=stride) 24 | self.bn1 = nn.BatchNorm2d(planes) 25 | self.relu = nn.ReLU(inplace=True) 26 | self.conv2 = conv1d(planes, planes, width=width) 27 | self.bn2 = nn.BatchNorm2d(planes) 28 | self.downsample = downsample 29 | self.stride = stride 30 | 31 | def forward(self, x): 32 | residual = x 33 | out = self.conv1(x) 34 | out = self.bn1(out) 35 | out = self.relu(out) 36 | out = self.conv2(out) 37 | out = self.bn2(out) 38 | if self.downsample is not None: 39 | residual = self.downsample(x) 40 | out += residual 41 | out = self.relu(out) 42 | return out 43 | 44 | class ResDavenet(nn.Module): 45 | def __init__(self, feat_dim=40, block=SpeechBasicBlock, layers=[2, 2, 2, 2], layer_widths=[128, 128, 256, 512, 1024], convsize=9): 46 | super(ResDavenet, self).__init__() 47 | self.feat_dim = feat_dim 48 | self.inplanes = layer_widths[0] 49 | self.batchnorm1 = nn.BatchNorm2d(1) 50 | self.conv1 = nn.Conv2d(1, self.inplanes, kernel_size=(self.feat_dim,1), stride=1, padding=(0,0), bias=False) 51 | self.bn1 = nn.BatchNorm2d(self.inplanes) 52 | self.relu = nn.ReLU(inplace=True) 53 | self.layer1 = self._make_layer(block, layer_widths[1], layers[0], width=convsize, stride=2) 54 | self.layer2 = self._make_layer(block, layer_widths[2], layers[1], width=convsize, stride=2) 55 | self.layer3 = self._make_layer(block, layer_widths[3], layers[2], width=convsize, stride=2) 56 | self.layer4 = self._make_layer(block, layer_widths[4], layers[3], width=convsize, stride=2) 57 | 58 | for m in self.modules(): 59 | if isinstance(m, nn.Conv2d): 60 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 61 | m.weight.data.normal_(0, math.sqrt(2. 
/ n)) 62 | elif isinstance(m, nn.BatchNorm2d): 63 | m.weight.data.fill_(1) 64 | m.bias.data.zero_() 65 | 66 | def _make_layer(self, block, planes, blocks, width=9, stride=1): 67 | downsample = None 68 | if stride != 1 or self.inplanes != planes * block.expansion: 69 | downsample = nn.Sequential( 70 | nn.Conv2d(self.inplanes, planes * block.expansion, 71 | kernel_size=1, stride=stride, bias=False), 72 | nn.BatchNorm2d(planes * block.expansion), 73 | ) 74 | layers = [] 75 | layers.append(block(self.inplanes, planes, width=width, stride=stride, downsample=downsample)) 76 | self.inplanes = planes * block.expansion 77 | for i in range(1, blocks): 78 | layers.append(block(self.inplanes, planes, width=width, stride=1)) 79 | return nn.Sequential(*layers) 80 | 81 | def forward(self, x): 82 | if x.dim() == 3: 83 | x = x.unsqueeze(1) 84 | x = self.conv1(x) 85 | x = self.bn1(x) 86 | x = self.relu(x) 87 | x = self.layer1(x) 88 | x = self.layer2(x) 89 | x = self.layer3(x) 90 | x = self.layer4(x) 91 | x = x.squeeze(2) 92 | return x 93 | 94 | def preemphasis(signal,coeff=0.97): 95 | """perform preemphasis on the input signal. 96 | 97 | :param signal: The signal to filter. 98 | :param coeff: The preemphasis coefficient. 0 is none, default 0.97. 99 | :returns: the filtered signal. 100 | """ 101 | return np.append(signal[0],signal[1:]-coeff*signal[:-1]) 102 | 103 | def load_DAVEnet(): 104 | layer_widths = [128,128,256,512,1024] 105 | layer_depths = [2,2,2,2] 106 | audio_model = ResDavenet(feat_dim=40, layers=layer_depths, convsize=9, layer_widths=layer_widths) 107 | 108 | return audio_model 109 | 110 | def LoadAudio(path, target_length=2048, use_raw_length=False): 111 | audio_type = 'melspectrogram' 112 | preemph_coef = 0.97 113 | sample_rate = 16000 114 | window_size = 0.025 115 | window_stride = 0.01 116 | window_type = 'hamming' 117 | num_mel_bins = 40 118 | padval = 0 119 | fmin = 20 120 | n_fft = int(sample_rate * window_size) 121 | win_length = int(sample_rate * window_size) 122 | hop_length = int(sample_rate * window_stride) 123 | 124 | windows = {'hamming': scipy.signal.hamming} 125 | # load audio, subtract DC, preemphasis 126 | # sr=None to avoid resampling (assuming audio already at 16 kHz sr) 127 | y, sr = librosa.load(path, sr=None) 128 | if y.size == 0: 129 | y = np.zeros(200) 130 | y = y - y.mean() 131 | y = preemphasis(y, preemph_coef) 132 | stft = librosa.stft(y, n_fft=n_fft, hop_length=hop_length, 133 | win_length=win_length, 134 | window=windows[window_type]) 135 | spec = np.abs(stft)**2 136 | if audio_type == 'melspectrogram': 137 | mel_basis = librosa.filters.mel(sr, n_fft, n_mels=num_mel_bins, fmin=fmin) 138 | melspec = np.dot(mel_basis, spec) 139 | feats = librosa.power_to_db(melspec, ref=np.max) 140 | n_frames = feats.shape[1] 141 | 142 | if use_raw_length: 143 | target_length = n_frames 144 | p = target_length - n_frames 145 | if p > 0: 146 | feats = np.pad(feats, ((0,0),(0,p)), 'constant', 147 | constant_values=(padval,padval)) 148 | elif p < 0: 149 | feats = feats[:,0:p] 150 | n_frames = target_length 151 | 152 | return feats, n_frames -------------------------------------------------------------------------------- /model_kmeans_ICCV.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | from __future__ import print_function 5 | 6 | import torch.nn as nn 7 | import torch as th 8 | import torch.nn.functional as F 9 | from 
model_davenet import load_DAVEnet 10 | 11 | 12 | class Net(nn.Module): 13 | def __init__( 14 | self, 15 | embd_dim=4096, 16 | video_dim=4096, 17 | we_dim=300, 18 | tri_modal=False, 19 | tri_modal_fuse=False, 20 | cluster_size=256, 21 | layer=0, 22 | project=0, 23 | project_dim=6000, 24 | multi_cluster=0, 25 | recon=0, 26 | withMLP=0, 27 | recon_size=768, 28 | 29 | ): 30 | super(Net, self).__init__() 31 | self.DAVEnet = load_DAVEnet() 32 | self.DAVEnet_projection = nn.Linear(1024, embd_dim) 33 | self.GU_audio = Gated_Embedding_Unit(1024, 1024) 34 | self.GU_video = Gated_Embedding_Unit(video_dim, embd_dim) 35 | if tri_modal and not tri_modal_fuse: 36 | self.text_pooling_caption = Sentence_Maxpool(we_dim, embd_dim) 37 | self.GU_text_captions = Gated_Embedding_Unit(embd_dim, embd_dim) 38 | 39 | elif tri_modal_fuse: 40 | self.DAVEnet_projection = nn.Linear(1024, embd_dim // 2) 41 | self.text_pooling_caption = Sentence_Maxpool(we_dim, embd_dim // 2) 42 | self.GU_audio_text = Fused_Gated_Unit(embd_dim // 2, embd_dim) 43 | self.tri_modal = tri_modal 44 | self.tri_modal_fuse = tri_modal_fuse 45 | self.project = project 46 | self.withMLP = withMLP 47 | self.recon_size = recon_size 48 | if withMLP==1: 49 | if project==0: 50 | self.classification = nn.Linear(embd_dim, project_dim, bias=False) #4096,256 51 | self.classification2 = nn.Linear(embd_dim, project_dim, bias=False) # 4096,256 52 | self.classification3 = nn.Linear(embd_dim, project_dim, bias=False) # 4096,256 53 | else: 54 | 55 | self.projection_head = nn.Sequential( 56 | nn.Linear(embd_dim, embd_dim//8), 57 | nn.BatchNorm1d(embd_dim//8), 58 | nn.ReLU(inplace=True), 59 | nn.Linear(embd_dim//8, cluster_size), 60 | ) 61 | 62 | self.classification = nn.Linear(cluster_size, project_dim, bias=False) 63 | 64 | 65 | self.layer=layer 66 | self.recon = recon 67 | if recon: 68 | inp_dim = embd_dim 69 | 70 | self.recon_v = nn.Sequential( 71 | nn.Linear(inp_dim, recon_size), 72 | nn.ReLU(inplace=True), 73 | nn.Linear(recon_size, video_dim), 74 | nn.ReLU(inplace=True) 75 | ) 76 | self.recon_a = nn.Sequential( 77 | nn.Linear(inp_dim, recon_size), 78 | nn.ReLU(inplace=True), 79 | nn.Linear(recon_size, 1024), 80 | nn.ReLU(inplace=True) 81 | ) 82 | self.recon_t = nn.Sequential( 83 | nn.Linear(inp_dim, recon_size), 84 | nn.ReLU(inplace=True), 85 | nn.Linear(recon_size, embd_dim), 86 | nn.ReLU(inplace=True) 87 | ) 88 | self.mse = nn.MSELoss(reduction='none') 89 | 90 | 91 | def save_checkpoint(self, path): 92 | th.save(self.state_dict(), path) 93 | 94 | def load_checkpoint(self, path): 95 | try: 96 | self.load_state_dict(th.load(path, map_location='cpu')) 97 | except Exception as e: 98 | print(e) 99 | print("IGNORING ERROR, LOADING MODEL USING STRICT=FALSE") 100 | self.load_state_dict(th.load(path, map_location='cpu'), strict=False) 101 | print("Loaded model checkpoint from {}".format(path)) 102 | 103 | def forward(self, video, audio_input, nframes, text=None): 104 | video_gt = video 105 | video = self.GU_video(video) 106 | if self.recon: 107 | video_recon = self.recon_v(video) 108 | audio = self.DAVEnet(audio_input) 109 | if not self.training: # controlled by net.train() / net.eval() (use for downstream tasks) 110 | # Mean-pool audio embeddings and disregard embeddings from input 0 padding 111 | pooling_ratio = round(audio_input.size(-1) / audio.size(-1)) 112 | nframes.div_(pooling_ratio) 113 | audioPoolfunc = th.nn.AdaptiveAvgPool2d((1, 1)) # 114 | #audioPoolfunc = th.nn.AdaptiveMaxPool2d((1, 1)) 115 | audio_outputs = audio.unsqueeze(2) 116 | 
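            # Eval-time audio pooling: pooling_ratio maps raw input frames to
            # DAVEnet output frames, nframes is rescaled in place, and each
            # clip is average-pooled only over its first nF valid output
            # frames so zero-padded frames do not dilute the embedding.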
pooled_audio_outputs_list = [] 117 | for idx in range(audio.shape[0]): 118 | nF = max(1, nframes[idx]) 119 | pooled_audio_outputs_list.append(audioPoolfunc(audio_outputs[idx][:, :, 0:nF]).unsqueeze(0)) 120 | audio = th.cat(pooled_audio_outputs_list).squeeze(3).squeeze(2) 121 | else: 122 | audio = audio.mean(dim=2) # this averages features from 0 padding too 123 | 124 | if self.tri_modal_fuse: 125 | text = self.text_pooling_caption(text) 126 | audio = self.DAVEnet_projection(audio) 127 | audio_text = self.GU_audio_text(audio, text) 128 | return audio_text, video 129 | 130 | # Gating in lower embedding dimension (1024 vs 4096) for stability with mixed-precision training 131 | audio_gt = audio 132 | audio = self.GU_audio(audio) 133 | audio = self.DAVEnet_projection(audio) 134 | if self.recon: 135 | audio_recon = self.recon_a(audio) 136 | if self.tri_modal and not self.tri_modal_fuse: 137 | text_gt = self.text_pooling_caption(text) 138 | text = self.GU_text_captions(text_gt) 139 | 140 | if self.recon: 141 | text_recon = self.recon_t(text) 142 | 143 | 144 | if self.layer==1: 145 | video_c = self.layer1(video) 146 | audio_c = self.layer2(audio) 147 | text_c = self.layer3(text) 148 | else: 149 | if self.withMLP==1: 150 | if self.project==1: 151 | video_c = self.projection_head(video) 152 | video_c = nn.functional.normalize(video_c, dim=1, p=2) 153 | else: 154 | video_c = nn.functional.normalize(video, dim=1, p=2) 155 | video_c = self.classification(video_c) 156 | 157 | # 158 | if self.project == 1: 159 | audio_c = self.projection_head(audio) 160 | audio_c = nn.functional.normalize(audio_c, dim=1, p=2) 161 | else: 162 | audio_c = nn.functional.normalize(audio, dim=1, p=2) 163 | audio_c = self.classification(audio_c) 164 | 165 | if self.project == 1: 166 | text_c = self.projection_head(text) 167 | text_c = nn.functional.normalize(text_c, dim=1, p=2) 168 | else: 169 | text_c = nn.functional.normalize(text, dim=1, p=2) 170 | text_c = self.classification(text_c) 171 | 172 | if self.recon: 173 | mse_v = th.mean(self.mse(video_recon, video_gt), dim=-1) 174 | mse_a = th.mean(self.mse(audio_recon, audio_gt), dim=-1) 175 | mse_t = th.mean(self.mse(text_recon, text_gt), dim=-1) 176 | if self.withMLP == 1: 177 | return audio, video, text, audio_c, video_c, text_c, mse_v + mse_a + mse_t 178 | else: 179 | return audio, video, text, mse_v + mse_a + mse_t 180 | return audio, video, text 181 | 182 | return audio, video 183 | 184 | 185 | class Gated_Embedding_Unit(nn.Module): 186 | def __init__(self, input_dimension, output_dimension): 187 | super(Gated_Embedding_Unit, self).__init__() 188 | self.fc = nn.Linear(input_dimension, output_dimension) 189 | self.cg = Context_Gating(output_dimension) 190 | 191 | def forward(self, x): 192 | x = self.fc(x) 193 | x = self.cg(x) 194 | return x 195 | 196 | 197 | class Fused_Gated_Unit(nn.Module): 198 | def __init__(self, input_dimension, output_dimension): 199 | super(Fused_Gated_Unit, self).__init__() 200 | self.fc_audio = nn.Linear(input_dimension, output_dimension) 201 | self.fc_text = nn.Linear(input_dimension, output_dimension) 202 | self.cg = Context_Gating(output_dimension) 203 | 204 | def forward(self, audio, text): 205 | audio = self.fc_audio(audio) 206 | text = self.fc_text(text) 207 | x = audio + text 208 | x = self.cg(x) 209 | return x 210 | 211 | 212 | class Context_Gating(nn.Module): 213 | def __init__(self, dimension): 214 | super(Context_Gating, self).__init__() 215 | self.fc = nn.Linear(dimension, dimension) 216 | 217 | def forward(self, x): 218 | x1 = 
self.fc(x) 219 | x = th.cat((x, x1), 1) 220 | return F.glu(x, 1) 221 | 222 | 223 | class Sentence_Maxpool(nn.Module): 224 | def __init__(self, word_dimension, output_dim): 225 | super(Sentence_Maxpool, self).__init__() 226 | self.fc = nn.Linear(word_dimension, output_dim) 227 | 228 | def forward(self, x): 229 | x = self.fc(x) 230 | x = F.relu(x) 231 | return th.max(x, dim=1)[0] -------------------------------------------------------------------------------- /model_tri_c.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | from __future__ import print_function 5 | 6 | import torch.nn as nn 7 | import torch as th 8 | import torch.nn.functional as F 9 | from model_davenet import load_DAVEnet 10 | 11 | 12 | class Net(nn.Module): 13 | def __init__( 14 | self, 15 | embd_dim=4096, 16 | video_dim=4096, 17 | we_dim=300, 18 | tri_modal=False, 19 | tri_modal_fuse=False, 20 | cluster_size=512, 21 | layer=0, 22 | project=0, 23 | project_dim=6000, 24 | multi_cluster=0, 25 | finetune_video=False 26 | 27 | ): 28 | super(Net, self).__init__() 29 | self.DAVEnet = load_DAVEnet() 30 | self.DAVEnet_projection = nn.Linear(1024, embd_dim) 31 | self.GU_audio = Gated_Embedding_Unit(1024, 1024) 32 | self.GU_video = Gated_Embedding_Unit(video_dim, embd_dim) 33 | if tri_modal and not tri_modal_fuse: 34 | self.text_pooling_caption = Sentence_Maxpool(we_dim, embd_dim) 35 | self.GU_text_captions = Gated_Embedding_Unit(embd_dim, embd_dim) 36 | 37 | elif tri_modal_fuse: 38 | self.DAVEnet_projection = nn.Linear(1024, embd_dim // 2) 39 | self.text_pooling_caption = Sentence_Maxpool(we_dim, embd_dim // 2) 40 | self.GU_audio_text = Fused_Gated_Unit(embd_dim // 2, embd_dim) 41 | self.tri_modal = tri_modal 42 | self.tri_modal_fuse = tri_modal_fuse 43 | self.project = project 44 | if project==0: 45 | self.classification = nn.Linear(embd_dim, project_dim, bias=False) #4096,256 46 | else: 47 | 48 | self.projection_head = nn.Sequential( 49 | nn.Linear(embd_dim, embd_dim), 50 | nn.BatchNorm1d(embd_dim), 51 | nn.ReLU(inplace=True), 52 | nn.Linear(embd_dim, cluster_size), 53 | ) 54 | self.classification = nn.Linear(cluster_size, project_dim, bias=False) 55 | if project_dim==8000: 56 | self.classification2 = nn.Linear(embd_dim, cluster_size) 57 | self.classification3 = nn.Linear(embd_dim, cluster_size) 58 | self.layer=layer 59 | if self.layer==1: 60 | self.layer1 = nn.Sequential( 61 | nn.Linear(embd_dim, 4096), 62 | nn.BatchNorm1d(4096), 63 | nn.ReLU(True), 64 | nn.Linear(4096, 256) 65 | ) 66 | self.layer2 = nn.Sequential( 67 | nn.Linear(embd_dim, 4096), 68 | nn.BatchNorm1d(4096), 69 | nn.ReLU(True), 70 | nn.Linear(4096, 256) 71 | ) 72 | self.layer3 = nn.Sequential( 73 | nn.Linear(embd_dim, 4096), 74 | nn.BatchNorm1d(4096), 75 | nn.ReLU(True), 76 | nn.Linear(4096, 256) 77 | ) 78 | self.finetune_video = finetune_video 79 | if self.finetune_video: 80 | self.video_encoder = nn.Sequential( 81 | nn.Conv1d(video_dim, embd_dim, 3, 2), 82 | nn.ReLU(inplace=True), 83 | nn.Conv1d(embd_dim, video_dim, 3, 2), 84 | nn.ReLU(inplace=True) 85 | ) 86 | 87 | def save_checkpoint(self, path): 88 | th.save(self.state_dict(), path) 89 | 90 | def load_checkpoint(self, path): 91 | try: 92 | self.load_state_dict(th.load(path, map_location='cpu')) 93 | except Exception as e: 94 | print(e) 95 | print("IGNORING ERROR, LOADING MODEL USING STRICT=FALSE") 96 | self.load_state_dict(th.load(path, 
map_location='cpu'), strict=False) 97 | print("Loaded model checkpoint from {}".format(path)) 98 | 99 | def forward(self, video, audio_input, nframes, text=None): 100 | if self.finetune_video: 101 | video = self.video_encoder(video) 102 | video = th.max(video, -1)[0] # Max pools along the last dimension 103 | video = self.GU_video(video) 104 | 105 | audio = self.DAVEnet(audio_input) 106 | if not self.training: # controlled by net.train() / net.eval() (use for downstream tasks) 107 | # Mean-pool audio embeddings and disregard embeddings from input 0 padding 108 | pooling_ratio = round(audio_input.size(-1) / audio.size(-1)) 109 | nframes.div_(pooling_ratio) 110 | audioPoolfunc = th.nn.AdaptiveAvgPool2d((1, 1)) # 111 | #audioPoolfunc = th.nn.AdaptiveMaxPool2d((1, 1)) 112 | audio_outputs = audio.unsqueeze(2) 113 | pooled_audio_outputs_list = [] 114 | for idx in range(audio.shape[0]): 115 | nF = max(1, nframes[idx]) 116 | pooled_audio_outputs_list.append(audioPoolfunc(audio_outputs[idx][:, :, 0:nF]).unsqueeze(0)) 117 | audio = th.cat(pooled_audio_outputs_list).squeeze(3).squeeze(2) 118 | else: 119 | audio = audio.mean(dim=2) # this averages features from 0 padding too 120 | 121 | if self.tri_modal_fuse: 122 | text = self.text_pooling_caption(text) 123 | audio = self.DAVEnet_projection(audio) 124 | audio_text = self.GU_audio_text(audio, text) 125 | return audio_text, video 126 | 127 | # Gating in lower embedding dimension (1024 vs 4096) for stability with mixed-precision training 128 | audio = self.GU_audio(audio) 129 | audio = self.DAVEnet_projection(audio) 130 | if self.tri_modal and not self.tri_modal_fuse: 131 | text = self.GU_text_captions(self.text_pooling_caption(text)) 132 | 133 | # video_c2 = self.layer2(video) 134 | #""" 135 | 136 | #""" 137 | if self.layer==1: 138 | video_c = self.layer1(video) 139 | audio_c = self.layer2(audio) 140 | text_c = self.layer3(text) 141 | else: 142 | if self.project==1: 143 | video_c = self.projection_head(video) 144 | video_c2 = nn.functional.normalize(video_c, dim=1, p=2) 145 | else: 146 | video_c2 = nn.functional.normalize(video, dim=1, p=2) 147 | video_c = self.classification(video_c2) 148 | 149 | # 150 | if self.project == 1: 151 | audio_c = self.projection_head(audio) 152 | audio_c2 = nn.functional.normalize(audio_c, dim=1, p=2) 153 | else: 154 | audio_c2 = nn.functional.normalize(audio, dim=1, p=2) 155 | audio_c = self.classification(audio_c2) 156 | 157 | #text_c = self.projection_head(text) 158 | if self.project == 1: 159 | text_c = self.projection_head(text) 160 | text_c2 = nn.functional.normalize(text_c, dim=1, p=2) 161 | else: 162 | text_c2 = nn.functional.normalize(text, dim=1, p=2) 163 | text_c = self.classification(text_c2) 164 | 165 | return audio, video, text, audio_c, video_c, text_c, audio_c2, video_c2, text_c2 166 | 167 | return audio, video 168 | 169 | 170 | class Gated_Embedding_Unit(nn.Module): 171 | def __init__(self, input_dimension, output_dimension): 172 | super(Gated_Embedding_Unit, self).__init__() 173 | self.fc = nn.Linear(input_dimension, output_dimension) 174 | self.cg = Context_Gating(output_dimension) 175 | 176 | def forward(self, x): 177 | x = self.fc(x) 178 | x = self.cg(x) 179 | return x 180 | 181 | 182 | class Fused_Gated_Unit(nn.Module): 183 | def __init__(self, input_dimension, output_dimension): 184 | super(Fused_Gated_Unit, self).__init__() 185 | self.fc_audio = nn.Linear(input_dimension, output_dimension) 186 | self.fc_text = nn.Linear(input_dimension, output_dimension) 187 | self.cg = Context_Gating(output_dimension)
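    # The forward pass below projects audio and pooled text features with fc_audio / fc_text,
    # sums them, and applies Context_Gating, i.e. x * sigmoid(W x + b): F.glu over
    # th.cat((x, self.fc(x)), 1) multiplies the first half by the sigmoid of the second.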
188 | 189 | def forward(self, audio, text): 190 | audio = self.fc_audio(audio) 191 | text = self.fc_text(text) 192 | x = audio + text 193 | x = self.cg(x) 194 | return x 195 | 196 | 197 | class Context_Gating(nn.Module): 198 | def __init__(self, dimension): 199 | super(Context_Gating, self).__init__() 200 | self.fc = nn.Linear(dimension, dimension) 201 | 202 | def forward(self, x): 203 | x1 = self.fc(x) 204 | x = th.cat((x, x1), 1) 205 | return F.glu(x, 1) 206 | 207 | 208 | class Sentence_Maxpool(nn.Module): 209 | def __init__(self, word_dimension, output_dim): 210 | super(Sentence_Maxpool, self).__init__() 211 | self.fc = nn.Linear(word_dimension, output_dim) 212 | 213 | def forward(self, x): 214 | x = self.fc(x) 215 | x = F.relu(x) 216 | return th.max(x, dim=1)[0] -------------------------------------------------------------------------------- /model_tri_c_clean_sp.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | from __future__ import print_function 5 | 6 | import torch.nn as nn 7 | import torch as th 8 | import torch.nn.functional as F 9 | from model_davenet import load_DAVEnet 10 | from attention import MultiHeadAttention 11 | 12 | 13 | class Net(nn.Module): 14 | def __init__( 15 | self, 16 | embd_dim=4096, 17 | video_dim=4096, 18 | we_dim=300, 19 | tri_modal=False, 20 | tri_modal_fuse=False, 21 | cluster_size=256, 22 | layer=0, 23 | project=0, 24 | project_dim=6000, 25 | multi_cluster=0, 26 | recon=0, 27 | recon_b=0, 28 | finetune_video=0, 29 | multi_head=0, 30 | joint_cluster=0, 31 | output_norm=0, 32 | recon_cross=0 33 | 34 | ): 35 | super(Net, self).__init__() 36 | self.DAVEnet = load_DAVEnet() 37 | self.DAVEnet_projection = nn.Linear(1024, embd_dim) 38 | self.GU_audio = Gated_Embedding_Unit(1024, 1024) 39 | self.GU_video = Gated_Embedding_Unit(video_dim, embd_dim) 40 | if tri_modal and not tri_modal_fuse: 41 | self.text_pooling_caption = Sentence_Maxpool(we_dim, embd_dim) 42 | self.GU_text_captions = Gated_Embedding_Unit(embd_dim, embd_dim) 43 | 44 | elif tri_modal_fuse: 45 | self.DAVEnet_projection = nn.Linear(1024, embd_dim // 2) 46 | self.text_pooling_caption = Sentence_Maxpool(we_dim, embd_dim // 2) 47 | self.GU_audio_text = Fused_Gated_Unit(embd_dim // 2, embd_dim) 48 | self.tri_modal = tri_modal 49 | self.tri_modal_fuse = tri_modal_fuse 50 | self.project = project 51 | if project==0: 52 | self.classification = nn.Linear(embd_dim, project_dim, bias=False) #4096,256 53 | else: 54 | 55 | self.projection_head = nn.Sequential( 56 | nn.Linear(embd_dim, embd_dim), 57 | nn.BatchNorm1d(embd_dim), 58 | nn.ReLU(inplace=True), 59 | nn.Linear(embd_dim, cluster_size), 60 | ) 61 | if joint_cluster: 62 | self.projection_head2 = self.projection_head 63 | self.projection_head3 = self.projection_head 64 | else: 65 | self.projection_head2 = nn.Sequential( 66 | nn.Linear(embd_dim, embd_dim), 67 | nn.BatchNorm1d(embd_dim), 68 | nn.ReLU(inplace=True), 69 | nn.Linear(embd_dim, cluster_size), 70 | ) 71 | self.projection_head3 = nn.Sequential( 72 | nn.Linear(embd_dim, embd_dim), 73 | nn.BatchNorm1d(embd_dim), 74 | nn.ReLU(inplace=True), 75 | nn.Linear(embd_dim, cluster_size), 76 | ) 77 | self.classification = nn.Linear(cluster_size, project_dim, bias=False) 78 | 79 | self.layer=layer 80 | 81 | self.output_norm = output_norm 82 | 83 | self.recon = recon 84 | self.recon_b = recon_b 85 | self.recon_cross = recon_cross 86 | if recon: 87 | if 
recon_b: 88 | inp_dim = cluster_size 89 | else: 90 | inp_dim = embd_dim 91 | 92 | self.recon_v = nn.Sequential( 93 | nn.Linear(inp_dim, embd_dim//8), 94 | nn.ReLU(inplace=True), 95 | nn.Linear(embd_dim//8, video_dim), 96 | nn.ReLU(inplace=True) 97 | ) 98 | self.recon_a = nn.Sequential( 99 | nn.Linear(inp_dim, embd_dim//8), 100 | nn.ReLU(inplace=True), 101 | nn.Linear(embd_dim//8, 1024), 102 | nn.ReLU(inplace=True) 103 | ) 104 | self.recon_t = nn.Sequential( 105 | nn.Linear(inp_dim, embd_dim//8), 106 | nn.ReLU(inplace=True), 107 | nn.Linear(embd_dim//8, embd_dim), 108 | nn.ReLU(inplace=True) 109 | ) 110 | self.mse = nn.MSELoss(reduction='none') 111 | 112 | self.finetune_video = finetune_video 113 | self.multi_head = multi_head 114 | if self.finetune_video: 115 | if self.multi_head: 116 | self.video_encoder = MultiHeadAttention(8, video_dim) 117 | else: 118 | self.video_encoder = nn.Sequential( 119 | nn.Conv1d(video_dim, embd_dim, 3, 2), 120 | nn.ReLU(inplace=True), 121 | nn.Conv1d(embd_dim, video_dim, 3, 2), 122 | nn.ReLU(inplace=True) 123 | ) 124 | 125 | def save_checkpoint(self, path): 126 | th.save(self.state_dict(), path) 127 | 128 | def load_checkpoint(self, path): 129 | try: 130 | self.load_state_dict(th.load(path, map_location='cpu')) 131 | except Exception as e: 132 | print(e) 133 | print("IGNORING ERROR, LOADING MODEL USING STRICT=FALSE") 134 | self.load_state_dict(th.load(path, map_location='cpu'), strict=False) 135 | print("Loaded model checkpoint from {}".format(path)) 136 | 137 | def forward(self, video, audio_input, nframes, text=None): 138 | if self.finetune_video: 139 | if self.multi_head: 140 | video = video.transpose(-1, -2) 141 | video = self.video_encoder(video, video, video) 142 | video = th.max(video, 1)[0] 143 | else: 144 | video = self.video_encoder(video) 145 | video = th.max(video, -1)[0] # Max pools along the last dimension 146 | video_gt = video 147 | video = self.GU_video(video) 148 | if self.recon and not self.recon_b: 149 | video_recon = self.recon_v(video) 150 | if self.recon_cross: 151 | audio_recon_v = self.recon_a(video) 152 | text_recon_v = self.recon_t(video) 153 | 154 | audio = self.DAVEnet(audio_input) 155 | if not self.training: # controlled by net.train() / net.eval() (use for downstream tasks) 156 | # Mean-pool audio embeddings and disregard embeddings from input 0 padding 157 | pooling_ratio = round(audio_input.size(-1) / audio.size(-1)) 158 | nframes.div_(pooling_ratio) 159 | audioPoolfunc = th.nn.AdaptiveAvgPool2d((1, 1)) # 160 | #audioPoolfunc = th.nn.AdaptiveMaxPool2d((1, 1)) 161 | audio_outputs = audio.unsqueeze(2) 162 | pooled_audio_outputs_list = [] 163 | for idx in range(audio.shape[0]): 164 | nF = max(1, nframes[idx]) 165 | pooled_audio_outputs_list.append(audioPoolfunc(audio_outputs[idx][:, :, 0:nF]).unsqueeze(0)) 166 | audio = th.cat(pooled_audio_outputs_list).squeeze(3).squeeze(2) 167 | else: 168 | audio = audio.mean(dim=2) # this averages features from 0 padding too 169 | 170 | if self.tri_modal_fuse: 171 | text = self.text_pooling_caption(text) 172 | audio = self.DAVEnet_projection(audio) 173 | audio_text = self.GU_audio_text(audio, text) 174 | return audio_text, video 175 | 176 | # Gating in lower embedding dimension (1024 vs 4096) for stability with mixed-precision training 177 | audio_gt = audio 178 | audio = self.GU_audio(audio) 179 | audio = self.DAVEnet_projection(audio) 180 | if self.recon and not self.recon_b: 181 | audio_recon = self.recon_a(audio) 182 | if self.recon_cross: 183 | video_recon_a = self.recon_v(audio) 184 | 
text_recon_a = self.recon_t(audio) 185 | if self.tri_modal and not self.tri_modal_fuse: 186 | text_gt = self.text_pooling_caption(text) 187 | text = self.GU_text_captions(text_gt) 188 | if self.recon and not self.recon_b: 189 | text_recon = self.recon_t(text) 190 | if self.recon_cross: 191 | audio_recon_t = self.recon_a(text) 192 | video_recon_t = self.recon_v(text) 193 | 194 | # video_c2 = self.layer2(video) 195 | #""" 196 | 197 | #""" 198 | if self.layer==1: 199 | video_c = self.layer1(video) 200 | audio_c = self.layer2(audio) 201 | text_c = self.layer3(text) 202 | else: 203 | if self.project==1: 204 | video_c = self.projection_head(video) 205 | video_c2 = nn.functional.normalize(video_c, dim=1, p=2) 206 | else: 207 | video_c2 = nn.functional.normalize(video, dim=1, p=2) 208 | if self.recon and self.recon_b: 209 | video_recon = self.recon_v(video_c2) 210 | video_c = self.classification(video_c2) 211 | 212 | # 213 | if self.project == 1: 214 | audio_c = self.projection_head2(audio) 215 | audio_c2 = nn.functional.normalize(audio_c, dim=1, p=2) 216 | else: 217 | audio_c2 = nn.functional.normalize(audio, dim=1, p=2) 218 | if self.recon and self.recon_b: 219 | audio_recon = self.recon_a(audio_c2) 220 | audio_c = self.classification(audio_c2) 221 | 222 | #text_c = self.projection_head(text) 223 | if self.project == 1: 224 | text_c = self.projection_head3(text) 225 | text_c2 = nn.functional.normalize(text_c, dim=1, p=2) 226 | else: 227 | text_c2 = nn.functional.normalize(text, dim=1, p=2) 228 | if self.recon and self.recon_b: 229 | text_recon = self.recon_t(text_c2) 230 | text_c = self.classification(text_c2) 231 | 232 | if self.recon: 233 | mse_v = th.mean(self.mse(video_recon, video_gt), dim=-1) 234 | mse_a = th.mean(self.mse(audio_recon, audio_gt), dim=-1) 235 | mse_t = th.mean(self.mse(text_recon, text_gt), dim=-1) 236 | mse = mse_v+mse_a+mse_t 237 | 238 | if self.recon_cross: 239 | mse = mse + th.mean(self.mse(video_recon_a, video_gt), dim=-1) 240 | mse = mse + th.mean(self.mse(video_recon_t, video_gt), dim=-1) 241 | mse = mse + th.mean(self.mse(audio_recon_v, audio_gt), dim=-1) 242 | mse = mse + th.mean(self.mse(audio_recon_t, audio_gt), dim=-1) 243 | mse = mse + th.mean(self.mse(text_recon_v, text_gt), dim=-1) 244 | mse = mse + th.mean(self.mse(text_recon_a, text_gt), dim=-1) 245 | 246 | return audio, video, text, audio_c, video_c, text_c, mse 247 | 248 | if self.output_norm: 249 | return audio, video, text, audio_c, video_c, text_c, audio_c2, video_c2, text_c2 250 | else: 251 | return audio, video, text, audio_c, video_c, text_c 252 | #return audio, video, text 253 | return audio, video 254 | 255 | 256 | class Gated_Embedding_Unit(nn.Module): 257 | def __init__(self, input_dimension, output_dimension): 258 | super(Gated_Embedding_Unit, self).__init__() 259 | self.fc = nn.Linear(input_dimension, output_dimension) 260 | self.cg = Context_Gating(output_dimension) 261 | 262 | def forward(self, x): 263 | x = self.fc(x) 264 | x = self.cg(x) 265 | return x 266 | 267 | 268 | class Fused_Gated_Unit(nn.Module): 269 | def __init__(self, input_dimension, output_dimension): 270 | super(Fused_Gated_Unit, self).__init__() 271 | self.fc_audio = nn.Linear(input_dimension, output_dimension) 272 | self.fc_text = nn.Linear(input_dimension, output_dimension) 273 | self.cg = Context_Gating(output_dimension) 274 | 275 | def forward(self, audio, text): 276 | audio = self.fc_audio(audio) 277 | text = self.fc_text(text) 278 | x = audio + text 279 | x = self.cg(x) 280 | return x 281 | 282 | 283 | class 
Context_Gating(nn.Module): 284 | def __init__(self, dimension): 285 | super(Context_Gating, self).__init__() 286 | self.fc = nn.Linear(dimension, dimension) 287 | 288 | def forward(self, x): 289 | x1 = self.fc(x) 290 | x = th.cat((x, x1), 1) 291 | return F.glu(x, 1) 292 | 293 | 294 | class Sentence_Maxpool(nn.Module): 295 | def __init__(self, word_dimension, output_dim): 296 | super(Sentence_Maxpool, self).__init__() 297 | self.fc = nn.Linear(word_dimension, output_dim) 298 | 299 | def forward(self, x): 300 | x = self.fc(x) 301 | x = F.relu(x) 302 | return th.max(x, dim=1)[0] -------------------------------------------------------------------------------- /model_tri_kmeans.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | from __future__ import print_function 5 | 6 | import torch.nn as nn 7 | import torch as th 8 | import torch.nn.functional as F 9 | from model_davenet import load_DAVEnet 10 | 11 | 12 | class Net(nn.Module): 13 | def __init__( 14 | self, 15 | embd_dim=4096, 16 | video_dim=4096, 17 | we_dim=300, 18 | tri_modal=False, 19 | tri_modal_fuse=False, 20 | cluster_size=256, 21 | layer=0, 22 | project=0, 23 | project_dim=6000, 24 | multi_cluster=0, 25 | recon=0, 26 | withMLP=0, 27 | recon_size=768, 28 | 29 | ): 30 | super(Net, self).__init__() 31 | self.DAVEnet = load_DAVEnet() 32 | self.DAVEnet_projection = nn.Linear(1024, embd_dim) 33 | self.GU_audio = Gated_Embedding_Unit(1024, 1024) 34 | self.GU_video = Gated_Embedding_Unit(video_dim, embd_dim) 35 | if tri_modal and not tri_modal_fuse: 36 | self.text_pooling_caption = Sentence_Maxpool(we_dim, embd_dim) 37 | self.GU_text_captions = Gated_Embedding_Unit(embd_dim, embd_dim) 38 | 39 | elif tri_modal_fuse: 40 | self.DAVEnet_projection = nn.Linear(1024, embd_dim // 2) 41 | self.text_pooling_caption = Sentence_Maxpool(we_dim, embd_dim // 2) 42 | self.GU_audio_text = Fused_Gated_Unit(embd_dim // 2, embd_dim) 43 | self.tri_modal = tri_modal 44 | self.tri_modal_fuse = tri_modal_fuse 45 | self.project = project 46 | self.withMLP = withMLP 47 | self.recon_size = recon_size 48 | if withMLP==1: 49 | if project==0: 50 | self.classification = nn.Linear(embd_dim, project_dim, bias=False) #4096,256 51 | self.classification2 = nn.Linear(embd_dim, project_dim, bias=False) # 4096,256 52 | self.classification3 = nn.Linear(embd_dim, project_dim, bias=False) # 4096,256 53 | else: 54 | 55 | self.projection_head = nn.Sequential( 56 | nn.Linear(embd_dim, embd_dim//8), 57 | nn.BatchNorm1d(embd_dim//8), 58 | nn.ReLU(inplace=True), 59 | nn.Linear(embd_dim//8, cluster_size), 60 | ) 61 | """ 62 | self.projection_head2 = nn.Sequential( 63 | nn.Linear(embd_dim, embd_dim), 64 | nn.BatchNorm1d(embd_dim), 65 | nn.ReLU(inplace=True), 66 | nn.Linear(embd_dim, cluster_size), 67 | ) 68 | self.projection_head3 = nn.Sequential( 69 | nn.Linear(embd_dim, embd_dim), 70 | nn.BatchNorm1d(embd_dim), 71 | nn.ReLU(inplace=True), 72 | nn.Linear(embd_dim, cluster_size), 73 | ) 74 | """ 75 | self.classification = nn.Linear(cluster_size, project_dim, bias=False) 76 | #self.classification2 = nn.Linear(cluster_size, project_dim, bias=False) # 4096,256 77 | #self.classification3 = nn.Linear(cluster_size, project_dim, bias=False) # 4096,256 78 | 79 | self.layer=layer 80 | self.recon = recon 81 | if recon: 82 | inp_dim = embd_dim 83 | 84 | self.recon_v = nn.Sequential( 85 | nn.Linear(inp_dim, recon_size), 86 | 
nn.ReLU(inplace=True), 87 | nn.Linear(recon_size, video_dim), 88 | nn.ReLU(inplace=True) 89 | ) 90 | self.recon_a = nn.Sequential( 91 | nn.Linear(inp_dim, recon_size), 92 | nn.ReLU(inplace=True), 93 | nn.Linear(recon_size, 1024), 94 | nn.ReLU(inplace=True) 95 | ) 96 | self.recon_t = nn.Sequential( 97 | nn.Linear(inp_dim, recon_size), 98 | nn.ReLU(inplace=True), 99 | nn.Linear(recon_size, embd_dim), 100 | nn.ReLU(inplace=True) 101 | ) 102 | self.mse = nn.MSELoss(reduction='none') 103 | 104 | 105 | def save_checkpoint(self, path): 106 | th.save(self.state_dict(), path) 107 | 108 | def load_checkpoint(self, path): 109 | try: 110 | self.load_state_dict(th.load(path, map_location='cpu')) 111 | except Exception as e: 112 | print(e) 113 | print("IGNORING ERROR, LOADING MODEL USING STRICT=FALSE") 114 | self.load_state_dict(th.load(path, map_location='cpu'), strict=False) 115 | print("Loaded model checkpoint from {}".format(path)) 116 | 117 | def forward(self, video, audio_input, nframes, text=None): 118 | video_gt = video 119 | video = self.GU_video(video) 120 | if self.recon: 121 | video_recon = self.recon_v(video) 122 | audio = self.DAVEnet(audio_input) 123 | if not self.training: # controlled by net.train() / net.eval() (use for downstream tasks) 124 | # Mean-pool audio embeddings and disregard embeddings from input 0 padding 125 | pooling_ratio = round(audio_input.size(-1) / audio.size(-1)) 126 | nframes.div_(pooling_ratio) 127 | audioPoolfunc = th.nn.AdaptiveAvgPool2d((1, 1)) # 128 | #audioPoolfunc = th.nn.AdaptiveMaxPool2d((1, 1)) 129 | audio_outputs = audio.unsqueeze(2) 130 | pooled_audio_outputs_list = [] 131 | for idx in range(audio.shape[0]): 132 | nF = max(1, nframes[idx]) 133 | pooled_audio_outputs_list.append(audioPoolfunc(audio_outputs[idx][:, :, 0:nF]).unsqueeze(0)) 134 | audio = th.cat(pooled_audio_outputs_list).squeeze(3).squeeze(2) 135 | else: 136 | audio = audio.mean(dim=2) # this averages features from 0 padding too 137 | 138 | if self.tri_modal_fuse: 139 | text = self.text_pooling_caption(text) 140 | audio = self.DAVEnet_projection(audio) 141 | audio_text = self.GU_audio_text(audio, text) 142 | return audio_text, video 143 | 144 | # Gating in lower embedding dimension (1024 vs 4096) for stability with mixed-precision training 145 | audio_gt = audio 146 | audio = self.GU_audio(audio) 147 | audio = self.DAVEnet_projection(audio) 148 | if self.recon: 149 | audio_recon = self.recon_a(audio) 150 | if self.tri_modal and not self.tri_modal_fuse: 151 | text_gt = self.text_pooling_caption(text) 152 | text = self.GU_text_captions(text_gt) 153 | #fushed = (audio+text+video)/3 154 | # video_c2 = self.layer2(video) 155 | #""" 156 | if self.recon: 157 | text_recon = self.recon_t(text) 158 | 159 | 160 | if self.layer==1: 161 | video_c = self.layer1(video) 162 | audio_c = self.layer2(audio) 163 | text_c = self.layer3(text) 164 | else: 165 | if self.withMLP==1: 166 | if self.project==1: 167 | video_c = self.projection_head(video) 168 | video_c = nn.functional.normalize(video_c, dim=1, p=2) 169 | else: 170 | video_c = nn.functional.normalize(video, dim=1, p=2) 171 | video_c = self.classification(video_c) 172 | 173 | # 174 | if self.project == 1: 175 | audio_c = self.projection_head(audio) 176 | audio_c = nn.functional.normalize(audio_c, dim=1, p=2) 177 | else: 178 | audio_c = nn.functional.normalize(audio, dim=1, p=2) 179 | audio_c = self.classification(audio_c) 180 | 181 | #text_c = self.projection_head(text) 182 | if self.project == 1: 183 | text_c = self.projection_head(text) 184 | text_c = 
nn.functional.normalize(text_c, dim=1, p=2) 185 | else: 186 | text_c = nn.functional.normalize(text, dim=1, p=2) 187 | text_c = self.classification(text_c) 188 | #else: 189 | # audio_c = video_c = text_c = audio 190 | #""" 191 | #fushed = (audio_c + text_c + video_c) / 3 192 | 193 | #fushed = self.projection_head(fushed) 194 | #fushed = nn.functional.normalize(fushed, dim=1, p=2) 195 | #video_c = audio_c = text_c= fushed#self.classification(fushed) 196 | if self.recon: 197 | mse_v = th.mean(self.mse(video_recon, video_gt), dim=-1) 198 | mse_a = th.mean(self.mse(audio_recon, audio_gt), dim=-1) 199 | mse_t = th.mean(self.mse(text_recon, text_gt), dim=-1) 200 | if self.withMLP == 1: 201 | return audio, video, text, audio_c, video_c, text_c, mse_v + mse_a + mse_t 202 | else: 203 | return audio, video, text, mse_v + mse_a + mse_t 204 | return audio, video, text, text#, audio_c, video_c, text_c 205 | #return audio, video, text 206 | return audio, video 207 | 208 | 209 | class Gated_Embedding_Unit(nn.Module): 210 | def __init__(self, input_dimension, output_dimension): 211 | super(Gated_Embedding_Unit, self).__init__() 212 | self.fc = nn.Linear(input_dimension, output_dimension) 213 | self.cg = Context_Gating(output_dimension) 214 | 215 | def forward(self, x): 216 | x = self.fc(x) 217 | x = self.cg(x) 218 | return x 219 | 220 | 221 | class Fused_Gated_Unit(nn.Module): 222 | def __init__(self, input_dimension, output_dimension): 223 | super(Fused_Gated_Unit, self).__init__() 224 | self.fc_audio = nn.Linear(input_dimension, output_dimension) 225 | self.fc_text = nn.Linear(input_dimension, output_dimension) 226 | self.cg = Context_Gating(output_dimension) 227 | 228 | def forward(self, audio, text): 229 | audio = self.fc_audio(audio) 230 | text = self.fc_text(text) 231 | x = audio + text 232 | x = self.cg(x) 233 | return x 234 | 235 | 236 | class Context_Gating(nn.Module): 237 | def __init__(self, dimension): 238 | super(Context_Gating, self).__init__() 239 | self.fc = nn.Linear(dimension, dimension) 240 | 241 | def forward(self, x): 242 | x1 = self.fc(x) 243 | x = th.cat((x, x1), 1) 244 | return F.glu(x, 1) 245 | 246 | 247 | class Sentence_Maxpool(nn.Module): 248 | def __init__(self, word_dimension, output_dim): 249 | super(Sentence_Maxpool, self).__init__() 250 | self.fc = nn.Linear(word_dimension, output_dim) 251 | 252 | def forward(self, x): 253 | x = self.fc(x) 254 | x = F.relu(x) 255 | return th.max(x, dim=1)[0] 256 | -------------------------------------------------------------------------------- /msrvtt_dataloader.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | from __future__ import print_function 5 | 6 | import torch as th 7 | from torch.utils.data import Dataset 8 | import pickle 9 | import torch.nn.functional as F 10 | import numpy as np 11 | import re 12 | import pandas as pd 13 | from collections import defaultdict 14 | from torch.utils.data.dataloader import default_collate 15 | import json 16 | import random 17 | 18 | class MSRVTT_DataLoader(Dataset): 19 | """MSRVTT dataset loader.""" 20 | 21 | def __init__( 22 | self, 23 | data_path, 24 | we, 25 | we_dim=300, 26 | max_words=30, 27 | num_frames_multiplier=5, 28 | training=True, 29 | tri_modal=False, 30 | ): 31 | """ 32 | Args: 33 | """ 34 | self.data = pickle.load(open(data_path, 'rb')) 35 | self.we = we 36 | self.we_dim = we_dim 37 | self.max_words = max_words 38 | 
self.max_video = 30 39 | self.num_frames_multiplier = num_frames_multiplier 40 | self.training = training 41 | self.tri_modal = tri_modal 42 | 43 | def __len__(self): 44 | return len(self.data) 45 | 46 | def custom_collate(self, batch): 47 | return default_collate(batch) 48 | 49 | def _zero_pad_tensor(self, tensor, size): 50 | if len(tensor) >= size: 51 | return tensor[:size] 52 | else: 53 | zero = np.zeros((size - len(tensor), self.we_dim), dtype=np.float32) 54 | return np.concatenate((tensor, zero), axis=0) 55 | 56 | def _tokenize_text(self, sentence): 57 | w = re.findall(r"[\w']+", str(sentence)) 58 | return w 59 | 60 | def _words_to_we(self, words): 61 | words = [word for word in words if word in self.we.vocab] 62 | if words: 63 | we = self._zero_pad_tensor(self.we[words], self.max_words) 64 | return th.from_numpy(we) 65 | else: 66 | return th.zeros(self.max_words, self.we_dim) 67 | 68 | def _get_caption(self, idx): 69 | """Chooses random caption if training. Uses set caption if evaluating.""" 70 | if self.training: 71 | captions = self.data[idx]['caption'] 72 | caption = self._words_to_we(self._tokenize_text(random.choice(captions))) 73 | return caption 74 | else: 75 | caption = self.data[idx]['eval_caption'] 76 | return self._words_to_we(self._tokenize_text(caption)) 77 | 78 | 79 | def __getitem__(self, idx): 80 | video_id = self.data[idx]['id'] 81 | # load 2d and 3d features (features are pooled over the time dimension) 82 | #""" 83 | feat_2d = F.normalize(th.from_numpy(self.data[idx]['2d_pooled']).float(), dim=0) 84 | feat_3d = F.normalize(th.from_numpy(self.data[idx]['3d_pooled']).float(), dim=0) 85 | video = th.cat((feat_2d, feat_3d)) 86 | """ 87 | feat_2d = th.from_numpy(self.data[idx]['2d']).float() 88 | feat_3d = th.from_numpy(self.data[idx]['3d']).float() 89 | feat_2d = feat_2d[:10] 90 | feat_3d = feat_3d[:10] 91 | #feat_2d = F.interpolate(feat_2d.transpose(1, 0).unsqueeze(0), self.max_frames, mode='linear', 92 | # align_corners=True).squeeze(0) 93 | #feat_3d = F.interpolate(feat_3d.transpose(1, 0).unsqueeze(0), self.max_frames, mode='linear', 94 | # align_corners=True).squeeze(0) 95 | 96 | feat_2d = F.normalize(feat_2d, dim=1) 97 | feat_3d = F.normalize(feat_3d, dim=1) 98 | video = th.cat((feat_2d, feat_3d), dim=1) 99 | #""" 100 | # load audio and zero pad/truncate if necessary 101 | audio = self.data[idx]['audio'] 102 | target_length = 1024 * self.num_frames_multiplier 103 | nframes = audio.numpy().shape[1] 104 | p = target_length - nframes 105 | if p > 0: 106 | audio = np.pad(audio, ((0,0),(0,p)), 'constant', constant_values=(0,0)) 107 | elif p < 0: 108 | audio = audio[:,0:p] 109 | audio = th.FloatTensor(audio) 110 | 111 | # choose a caption 112 | caption='' 113 | if self.tri_modal: 114 | caption = self._get_caption(idx) 115 | 116 | return {'video': video, 'text': caption, 'video_id': video_id, 117 | 'audio': audio, 'nframes': nframes} 118 | 119 | 120 | class MSRVTT_DataLoader_label(Dataset): 121 | """MSRVTT dataset loader.""" 122 | 123 | def __init__( 124 | self, 125 | data_path, 126 | we, 127 | pseudo_v, 128 | pseudo_a, 129 | we_dim=300, 130 | max_words=30, 131 | num_frames_multiplier=5, 132 | training=True, 133 | tri_modal=False, 134 | ): 135 | """ 136 | Args: 137 | """ 138 | self.data = pickle.load(open(data_path, 'rb')) 139 | self.we = we 140 | self.we_dim = we_dim 141 | self.max_words = max_words 142 | self.max_video = 30 143 | self.num_frames_multiplier = num_frames_multiplier 144 | self.training = training 145 | self.tri_modal = tri_modal 146 | self.pseudo_v = 
pseudo_v 147 | self.pseudo_a = pseudo_a 148 | 149 | def __len__(self): 150 | return len(self.data) 151 | 152 | def custom_collate(self, batch): 153 | return default_collate(batch) 154 | 155 | def _zero_pad_tensor(self, tensor, size): 156 | if len(tensor) >= size: 157 | return tensor[:size] 158 | else: 159 | zero = np.zeros((size - len(tensor), self.we_dim), dtype=np.float32) 160 | return np.concatenate((tensor, zero), axis=0) 161 | 162 | def _tokenize_text(self, sentence): 163 | w = re.findall(r"[\w']+", str(sentence)) 164 | return w 165 | 166 | def _words_to_we(self, words): 167 | words = [word for word in words if word in self.we.vocab] 168 | if words: 169 | we = self._zero_pad_tensor(self.we[words], self.max_words) 170 | return th.from_numpy(we) 171 | else: 172 | return th.zeros(self.max_words, self.we_dim) 173 | 174 | def _get_caption(self, idx): 175 | """Chooses random caption if training. Uses set caption if evaluating.""" 176 | if self.training: 177 | captions = self.data[idx]['caption'] 178 | caption = self._words_to_we(self._tokenize_text(random.choice(captions))) 179 | return caption 180 | else: 181 | caption = self.data[idx]['eval_caption'] 182 | return self._words_to_we(self._tokenize_text(caption)) 183 | 184 | def __getitem__(self, idx): 185 | video_id = self.data[idx]['id'] 186 | # load 2d and 3d features (features are pooled over the time dimension) 187 | feat_2d = F.normalize(th.from_numpy(self.data[idx]['2d_pooled']).float(), dim=0) 188 | feat_3d = F.normalize(th.from_numpy(self.data[idx]['3d_pooled']).float(), dim=0) 189 | video = th.cat((feat_2d, feat_3d)) 190 | 191 | # load audio and zero pad/truncate if necessary 192 | audio = self.data[idx]['audio'] 193 | target_length = 1024 * self.num_frames_multiplier 194 | nframes = audio.numpy().shape[1] 195 | p = target_length - nframes 196 | if p > 0: 197 | audio = np.pad(audio, ((0, 0), (0, p)), 'constant', constant_values=(0, 0)) 198 | elif p < 0: 199 | audio = audio[:, 0:p] 200 | audio = th.FloatTensor(audio) 201 | 202 | # choose a caption 203 | caption = '' 204 | if self.tri_modal: 205 | caption = self._get_caption(idx) 206 | 207 | return {'video': video, 'text': caption, 'video_id': self.data[idx]['id'], 208 | 'audio': audio, 'nframes': nframes, 'pseudo_v': self.pseudo_v[idx], 'pseudo_a': self.pseudo_a[idx]} 209 | -------------------------------------------------------------------------------- /script.txt: -------------------------------------------------------------------------------- 1 | # get machine 2 | 3 | srun --gres=gpu:4 -N 1 --exclusive --mem=1000G --time 24:00:00 --cpus-per-task=74 --qos=sched_level_2 --pty /bin/bash 4 | 5 | # training 6 | 7 | model1=AVLnet_tri_single_cluster_128_soft_8000_project_cos_mil_e4_sp_6144 8 | 9 | python -u train_tri_cos_mil.py --num_thread_reader=74 --epochs=30 --batch_size=128 \ 10 | --n_pair=32 --embd_dim=6144 --howto_audio_frames=1000 --min_time=10.0 --random_audio_windows=0 \ 11 | --lr=0.0001 --tri_modal=1 --apex_level=1 --cluster=1 --soft_label=1 --start_cluster=0 --project=1 --project_dim=8000 \ 12 | --features_path=/nobackup/users/kaudhkha/sightsound/data/howto/parsed_videos \ 13 | --features_path_audio=/nobackup/projects/public/howto100m/parsed_videos \ 14 | --pretrain_path=/nobackup/users/brian27/MCN_public/model_mcn/$model1/e9.pth \ 15 | --checkpoint_dir=model_mcn/$model1 >> logs/$model1 16 | 17 | # resume pretrain 18 | 19 | --pretrain_path=/nobackup/users/brian27/avlnet_private/model_mcn/$model1/e9.pth \ 20 | 21 | # test on youcook, MSR-VTT 22 | 23 | python train_tri_c.py 
--eval_msrvtt=1 --num_thread_reader=74 --batch_size=512 --epochs=30 --project=1 --project_dim=8000 \ 24 | --lr_decay=1.0 --embd_dim=6144 --pretrain_path=model_mcn/AVLnet_tri_single_cluster_128_soft_8000_project_cos_mil_e4_sp_6144/e9.pth \ 25 | --lr=1e-5 --tri_modal=1 26 | 27 | python train_tri_c.py --eval_youcook=1 --num_thread_reader=74 --batch_size=512 --epochs=30 --project=1 --project_dim=8000 \ 28 | --lr_decay=1.0 --embd_dim=6144 --pretrain_path=model_mcn/AVLnet_tri_single_cluster_128_soft_8000_project_cos_mil_e4_sp_6144/e9.pth \ 29 | --lr=1e-5 --tri_modal=1 -------------------------------------------------------------------------------- /train_avlnet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --qos=sched_level_2 3 | #SBATCH --gres=gpu:4 4 | #SBATCH --gpus-per-node=4 5 | #SBATCH --nodes=1 6 | #SBATCH --time=24:00:00 7 | #SBATCH --cpus-per-task 74 8 | #SBATCH --ntasks-per-node=1 9 | #SBATCH --mem=1T 10 | #SBATCH --exclusive 11 | #SBATCH --job-name="ht" 12 | #SBATCH --output logs/ht-%j.out 13 | #SBATCH --error logs/ht-%j.err 14 | ## NOTE: adjust the dependency if needed for the 2nd and 3rd run 15 | ##SBATCH --dependency=afterok:12625 16 | 17 | ## Number of total processes 18 | echo " " 19 | echo " Nodelist:= " $SLURM_JOB_NODELIST 20 | echo " Number of nodes:= " $SLURM_JOB_NUM_NODES 21 | echo " GPUs per node:= " $SLURM_JOB_GPUS 22 | echo " Ntasks per node:= " $SLURM_NTASKS_PER_NODE 23 | 24 | echo " Running on multiple nodes/GPU devices" 25 | echo "" 26 | echo " Run started at:- " 27 | date 28 | 29 | source /nobackup/users/duartek/anaconda3/bin/activate 30 | conda activate wmlce-1.6.2 31 | 32 | nvidia-smi 33 | pwd 34 | 35 | ##################### 36 | 37 | 38 | python -u train_tri_kmeans.py --num_thread_reader=74 --epochs=10 --batch_size=128 \ 39 | --n_pair=32 --embd_dim=6144 --howto_audio_frames=1000 --min_time=10.0 --random_audio_windows=0 \ 40 | --lr=0.0001 --tri_modal=1 --apex_level=1 --kmeans=1 --use_queue=1 --queue_size=20 --fastC=1 --mean=1 --recon=1 --recon_size=1024 \ 41 | --features_path=/nobackup/users/kaudhkha/sightsound/data/howto/parsed_videos \ 42 | --features_path_audio=/nobackup/projects/public/howto100m/parsed_videos \ 43 | --pretrain_path=model_mcn/MCN_KMeans/e16.pth --train_csv=data/HowTo100M_336_videopaths.txt \ 44 | --checkpoint_dir=model_mcn/MCN_KMeans >> logs/MCN_KMeans 45 | 46 | 47 | #python -u train_tri_cos_mil.py --num_thread_reader=74 --epochs=30 --batch_size=128 \ 48 | #--n_pair=32 --embd_dim=6144 --howto_audio_frames=1000 --min_time=10.0 --random_audio_windows=0 --finetune_video=0 --video_interp=0 \ 49 | #--recon=1 --recon_b=0 --recon_cross=0 --joint_cluster=1 --cluster_a=0 --multi_head=0 \ 50 | #--lr=0.0001 --tri_modal=1 --apex_level=1 --cluster=1 --soft_label=0 --start_cluster=0 --project=1 --project_dim=8000 \ 51 | #--features_path=/nobackup/users/kaudhkha/sightsound/data/howto/parsed_videos \ 52 | #--features_path_audio=/nobackup/projects/public/howto100m/parsed_videos \ 53 | #--pretrain_path=model_mcn/MCN_Sports/e10.pth --train_csv=data/HowTo100M_336_videopaths.txt \ 54 | #--checkpoint_dir=model_mcn/MCN_Sports >> logs/MCN_Sports 55 | 56 | # --pretrain_path=/nobackup/users/brian27/MCN_public/model_mcn/$model1/e9.pth \ 57 | ## Run two training commands in the background, each on two V100 GPUs 58 | #model1=AVLnet_test_code_release 59 | #model2=AVLnet_text_test_code_release 60 | 61 | 62 | #CUDA_VISIBLE_DEVICES=0,1 python -u train.py --num_thread_reader=20 --epochs=7 --batch_size=128 
--n_pair=32 --embd_dim=4096 --howto_audio_frames=1000 --lr=0.001 --apex_level=1 \ 63 | #--features_path=/nobackup/users/kaudhkha/sightsound/data/howto/parsed_videos --features_path_audio=/nobackup/projects/public/howto100m/parsed_videos \ 64 | #--checkpoint_dir=model/$model1 >> logs/$model1 & \ 65 | 66 | ## Add --pretrain_path to the command before the >> for the second run 67 | # --pretrain_path=model/$model1/e7.pth 68 | 69 | #CUDA_VISIBLE_DEVICES=2,3 python -u train.py --num_thread_reader=20 --epochs=7 --batch_size=128 --n_pair=32 --embd_dim=4096 --howto_audio_frames=1000 --min_time=10.0 --random_audio_windows=0 \ 70 | #--lr=0.0001 --tri_modal=1 --tri_modal_fuse=1 --apex_level=1 --features_path=/nobackup/users/kaudhkha/sightsound/data/howto/parsed_videos \ 71 | #--features_path_audio=/nobackup/projects/public/howto100m/parsed_videos --checkpoint_dir=model/$model2 >> logs/$model2 & \ 72 | 73 | ## Add --pretrain_path to the command before the >> for the second run 74 | # --pretrain_path=model/$model2/e7.pth 75 | 76 | ## Wait for all commands to finish 77 | wait 78 | echo "Run completed at:- " 79 | date 80 | -------------------------------------------------------------------------------- /ucf_dataloader.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | from __future__ import print_function 5 | 6 | import torch as th 7 | from torch.utils.data import Dataset 8 | import pickle 9 | import torch.nn.functional as F 10 | import numpy as np 11 | import re 12 | import pandas as pd 13 | from collections import defaultdict 14 | from torch.utils.data.dataloader import default_collate 15 | import json 16 | import random 17 | 18 | 19 | def name_to_stringlist(name): 20 | change = {} 21 | """ 22 | change = {'HandStandPushups': ['handstand', 'pushups'], 23 | 'HandstandPushups': ['handstand', 'pushups'], 24 | 'PushUps': ['pushups'], 25 | 'PullUps': ['pullups']} 26 | """ 27 | """ 28 | change = { 29 | 'CleanAndJerk': ['weight', 'lift'], 30 | 'Skijet': ['Skyjet'], 31 | 'HandStandPushups': ['handstand', 'pushups'], 32 | 'HandstandPushups': ['handstand', 'pushups'], 33 | 'PushUps': ['pushups'], 34 | 'PullUps': ['pullups'], 35 | 'WalkingWithDog': ['walk', 'dog'], 36 | 'ThrowDiscus': ['throw', 'disc'], 37 | 'TaiChi': ['taichi'], 38 | 'CuttingInKitchen': ['cut', 'kitchen'], 39 | 'YoYo': ['yoyo'], 40 | } 41 | """ 42 | if name in change: 43 | name_vec = change[name] 44 | else: 45 | upper_idx = np.where([x.isupper() for x in name])[0].tolist() 46 | upper_idx += [len(name)] 47 | name_vec = [] 48 | for i in range(len(upper_idx)-1): 49 | name_vec.append(name[upper_idx[i]: upper_idx[i+1]]) 50 | name_vec = [n.lower() for n in name_vec] 51 | #name_vec = verbs2basicform(name_vec) 52 | return name_vec 53 | 54 | 55 | class UCF_DataLoader(Dataset): 56 | """MSRVTT dataset loader.""" 57 | 58 | def __init__( 59 | self, 60 | data_path, 61 | we, 62 | we_dim=300, 63 | max_words=30, 64 | num_frames_multiplier=5, 65 | training=True, 66 | tri_modal=False, 67 | finetune_video=False, 68 | video_interp=False 69 | ): 70 | """ 71 | Args: 72 | """ 73 | self.data = pickle.load(open(data_path, 'rb')) # contains a list of video names 74 | self.we = we 75 | self.we_dim = we_dim 76 | self.max_words = max_words 77 | self.max_video = 30 78 | self.num_frames_multiplier = num_frames_multiplier 79 | self.training = training 80 | self.tri_modal = tri_modal 81 | self.finetune_video = 
finetune_video 82 | self.max_frames = 16 83 | self.video_interp = video_interp 84 | 85 | names = [] 86 | for vid in self.data: 87 | names.append(vid['class']) 88 | 89 | self.classes = sorted(set(names)) 90 | print('# Classes', len(self.classes)) 91 | 92 | self.class_embeds = [] 93 | for name in self.classes: 94 | word_list = name_to_stringlist(name) 95 | caption = ' '.join(word_list) 96 | self.class_embeds.append(self._get_caption(caption)) 97 | self.class_embeds = th.stack(self.class_embeds, 0) 98 | print('Shape of class embeds', self.class_embeds.shape) 99 | 100 | def __len__(self): 101 | return len(self.data) 102 | 103 | def custom_collate(self, batch): 104 | return default_collate(batch) 105 | 106 | def _zero_pad_tensor(self, tensor, size): 107 | if len(tensor) >= size: 108 | return tensor[:size] 109 | else: 110 | zero = np.zeros((size - len(tensor), self.we_dim), dtype=np.float32) 111 | return np.concatenate((tensor, zero), axis=0) 112 | 113 | def _tokenize_text(self, sentence): 114 | w = re.findall(r"[\w']+", str(sentence)) 115 | return w 116 | 117 | def _words_to_we(self, words): 118 | words = [word for word in words if word in self.we.vocab] 119 | if words: 120 | we = self._zero_pad_tensor(self.we[words], self.max_words) 121 | return th.from_numpy(we) 122 | else: 123 | return th.zeros(self.max_words, self.we_dim) 124 | 125 | def _get_caption(self, idx): 126 | """Chooses random caption if training. Uses set caption if evaluating.""" 127 | if self.training: 128 | captions = idx 129 | caption = self._words_to_we(self._tokenize_text(random.choice(captions))) 130 | return caption 131 | else: 132 | caption = idx 133 | return self._words_to_we(self._tokenize_text(caption)) 134 | 135 | def __getitem__(self, idx): 136 | data = self.data[idx] 137 | # load 2d and 3d features (features are pooled over the time dimension) 138 | 139 | if self.finetune_video: 140 | feat_2d = th.from_numpy(self.data[idx]['2d']).float() 141 | feat_3d = th.from_numpy(self.data[idx]['3d']).float() 142 | if self.video_interp: 143 | feat_2d = F.interpolate(feat_2d.transpose(1, 0).unsqueeze(0), self.max_frames, mode='linear', 144 | align_corners=True).squeeze(0) 145 | feat_3d = F.interpolate(feat_3d.transpose(1, 0).unsqueeze(0), self.max_frames, mode='linear', 146 | align_corners=True).squeeze(0) 147 | else: 148 | feat2d_buffer = th.zeros(self.max_frames, feat_2d.shape[-1]) 149 | feat_2d = feat_2d[:self.max_frames] 150 | feat2d_buffer[:len(feat_2d)] = feat_2d 151 | 152 | feat3d_buffer = th.zeros(self.max_frames, feat_3d.shape[-1]) 153 | feat_3d = feat_3d[:self.max_frames] 154 | feat3d_buffer[:len(feat_3d)] = feat_3d 155 | 156 | feat_2d = feat2d_buffer.transpose(1, 0) 157 | feat_3d = feat3d_buffer.transpose(1, 0) 158 | 159 | feat_2d = F.normalize(feat_2d, dim=0) 160 | feat_3d = F.normalize(feat_3d, dim=0) 161 | video = th.cat((feat_2d, feat_3d), dim=0) 162 | else: 163 | feat_2d = F.normalize(th.from_numpy(self.data[idx]['2d_pooled']).float(), dim=0) 164 | feat_3d = F.normalize(th.from_numpy(self.data[idx]['3d_pooled']).float(), dim=0) 165 | video = th.cat((feat_2d, feat_3d)) 166 | 167 | # load audio and zero pad/truncate if necessary 168 | audio = self.data[idx]['audio'] 169 | target_length = 1024 * self.num_frames_multiplier 170 | nframes = audio.shape[1] 171 | p = target_length - nframes 172 | if p > 0: 173 | audio = np.pad(audio, ((0, 0), (0, p)), 'constant', constant_values=(0, 0)) 174 | elif p < 0: 175 | audio = audio[:, 0:p] 176 | audio = th.FloatTensor(audio) 177 | 178 | # choose a caption 179 | caption = '' 180 | 
name = self.data[idx]['class'] 181 | if self.tri_modal: 182 | word_list = name_to_stringlist(name) 183 | caption = ' '.join(word_list) 184 | caption = self._get_caption(caption) 185 | 186 | return {'video': video, 'text': caption, 'video_id': idx, 187 | 'audio': audio, 'nframes': 32, 'class_name': name, 188 | 'class_id': th.ones(1)*self.classes.index(name), 189 | 'has_audio': th.ones(1)*self.data[idx]['has_audio'], 190 | 'video_name': self.data[idx]['video'], 191 | 'training': th.ones(1)*self.data[idx]['training']} 192 | 193 | 194 | class MSRVTT_DataLoader_label(Dataset): 195 | """MSRVTT dataset loader.""" 196 | 197 | def __init__( 198 | self, 199 | data_path, 200 | we, 201 | pseudo_v, 202 | pseudo_a, 203 | we_dim=300, 204 | max_words=30, 205 | num_frames_multiplier=5, 206 | training=True, 207 | tri_modal=False, 208 | ): 209 | """ 210 | Args: 211 | """ 212 | self.data = pickle.load(open(data_path, 'rb')) 213 | self.we = we 214 | self.we_dim = we_dim 215 | self.max_words = max_words 216 | self.max_video = 30 217 | self.num_frames_multiplier = num_frames_multiplier 218 | self.training = training 219 | self.tri_modal = tri_modal 220 | self.pseudo_v = pseudo_v 221 | self.pseudo_a = pseudo_a 222 | 223 | 224 | 225 | def __len__(self): 226 | return len(self.data) 227 | 228 | def custom_collate(self, batch): 229 | return default_collate(batch) 230 | 231 | def _zero_pad_tensor(self, tensor, size): 232 | if len(tensor) >= size: 233 | return tensor[:size] 234 | else: 235 | zero = np.zeros((size - len(tensor), self.we_dim), dtype=np.float32) 236 | return np.concatenate((tensor, zero), axis=0) 237 | 238 | def _tokenize_text(self, sentence): 239 | w = re.findall(r"[\w']+", str(sentence)) 240 | return w 241 | 242 | def _words_to_we(self, words): 243 | words = [word for word in words if word in self.we.vocab] 244 | if words: 245 | we = self._zero_pad_tensor(self.we[words], self.max_words) 246 | return th.from_numpy(we) 247 | else: 248 | return th.zeros(self.max_words, self.we_dim) 249 | 250 | def _get_caption(self, idx): 251 | """Chooses random caption if training. 
Uses set caption if evaluating.""" 252 | if self.training: 253 | captions = self.data[idx]['caption'] 254 | caption = self._words_to_we(self._tokenize_text(random.choice(captions))) 255 | return caption 256 | else: 257 | caption = self.data[idx]['eval_caption'] 258 | return self._words_to_we(self._tokenize_text(caption)) 259 | 260 | def __getitem__(self, idx): 261 | video_id = self.data[idx]['id'] 262 | # load 2d and 3d features (features are pooled over the time dimension) 263 | feat_2d = F.normalize(th.from_numpy(self.data[idx]['2d_pooled']).float(), dim=0) 264 | feat_3d = F.normalize(th.from_numpy(self.data[idx]['3d_pooled']).float(), dim=0) 265 | video = th.cat((feat_2d, feat_3d)) 266 | 267 | # load audio and zero pad/truncate if necessary 268 | audio = self.data[idx]['audio'] 269 | target_length = 1024 * self.num_frames_multiplier 270 | nframes = audio.numpy().shape[1] 271 | p = target_length - nframes 272 | if p > 0: 273 | audio = np.pad(audio, ((0, 0), (0, p)), 'constant', constant_values=(0, 0)) 274 | elif p < 0: 275 | audio = audio[:, 0:p] 276 | audio = th.FloatTensor(audio) 277 | 278 | # choose a caption 279 | caption = '' 280 | if self.tri_modal: 281 | caption = self._get_caption(idx) 282 | 283 | return {'video': video, 'text': caption, 'video_id': self.data[idx]['id'], 284 | 'audio': audio, 'nframes': nframes, 'pseudo_v': self.pseudo_v[idx], 'pseudo_a': self.pseudo_a[idx]} 285 | -------------------------------------------------------------------------------- /video_evaluation.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import numpy as np 3 | 4 | 5 | def recall(mat, gts): 6 | # mat is of shape (Queries, Targets), where higher=prediction 7 | # gts is of shape (Queries, ) 8 | 9 | predictions = np.argsort(mat, 1) # (Queries, Targets) 10 | 11 | top_1 = predictions[:, -1] 12 | 13 | recall = np.mean(top_1 == gts) 14 | print('NN Retrieval R@1:', recall) 15 | 16 | recall_top5 = np.mean([l in p for l, p in zip(gts, predictions[:, -5:])]) 17 | print('NN Retrieval R@5:', recall_top5) 18 | 19 | recall_top10 = np.mean([l in p for l, p in zip(gts, predictions[:, -10:])]) 20 | print('NN Retrieval R@10:', recall_top10) 21 | 22 | 23 | def evaluate_recall_youcook(text, video_audio, clip_ids, m=None): 24 | # text is of shape (n_clips, n_feats) 25 | # video_audio is of shape (n_clips, n_feats) 26 | # video_ids is a list of length n_clips with all the clip_ids 27 | full_videos = sorted(list(set([d[:11] for d in clip_ids]))) 28 | print('# Clips', len(clip_ids)) 29 | print('# Videos', len(full_videos)) 30 | 31 | n_clips = len(clip_ids) 32 | n_vids = len(full_videos) 33 | clip_to_video = [] 34 | [clip_to_video.extend([i for i, x in enumerate(full_videos) if x in clip_id]) for clip_id in clip_ids] 35 | clip_to_video = np.array(clip_to_video) 36 | 37 | if m is None: 38 | m = np.matmul(text, video_audio.T) # (n_clips, n_clips) 39 | print('Standard Retrieval | single caption -> single clip') 40 | recall(m, np.arange(m.shape[0])) 41 | 42 | predictions = np.argsort(m, 1) 43 | 44 | video_predictions = clip_to_video[predictions] 45 | video_gts = clip_to_video[np.arange(len(clip_to_video))] 46 | 47 | print('Retrieval single | single caption -> full video') 48 | recall_top1 = np.mean(video_predictions[:, -1] == video_gts) 49 | print('NN Retrieval R@1:', recall_top1) 50 | 51 | recall_top5 = np.mean([l in p for l, p in zip(video_gts, video_predictions[:, -5:])]) 52 | print('NN Retrieval R@5:', recall_top5) 53 | 54 | recall_top10 = np.mean([l in p for 
l, p in zip(video_gts, video_predictions[:, -10:])]) 55 | print('NN Retrieval R@10:', recall_top10) 56 | 57 | video_inds = [[i for i, x in enumerate(clip_ids) if video in x] for video in full_videos] # list of length n_vids, with the corresponding clip_inds 58 | 59 | video_preds_m = np.stack([np.max(m[:, v], axis=1) for v in video_inds], 1) # (n_clips, n_vids) 60 | video_preds_m2 = np.stack([np.mean(video_preds_m[v, :], axis=0) for v in video_inds], 0) # (n_vids, n_vids) 61 | 62 | print('Retrieval single | full caption -> full video | for each caption get max prediction over a video, then average over all captions of a video.') 63 | recall(video_preds_m2, np.arange(n_vids)) 64 | 65 | corr_preds = [] 66 | for video_id in range(len(full_videos)): 67 | vid_i_m = video_preds_m[video_gts == video_id] 68 | vid_i_pred = np.argsort(vid_i_m, 1) 69 | prs = [] 70 | for i in [1, 5, 10]: 71 | top_i_preds = vid_i_pred[:, -i:] 72 | unique_ids, counts = np.unique(top_i_preds, return_counts=True) 73 | id_pred = unique_ids[np.argsort(counts)[-i:]] 74 | #print(id_pred) 75 | prs.append(video_id in id_pred) 76 | corr_preds.append(prs) 77 | 78 | t1, t5, t10 = zip(*corr_preds) 79 | print('Retrieval single | full caption -> full video | for each caption get top_k video predictions, then get sorted majority vote for final top_k predictions.') 80 | print('NN Retrieval R@1:', np.mean(t1)) 81 | print('NN Retrieval R@5:', np.mean(t5)) 82 | print('NN Retrieval R@10:', np.mean(t10)) 83 | 84 | corr_preds = [] 85 | for video_id in range(len(full_videos)): 86 | vid_i_m = m[video_gts == video_id] 87 | vid_i_pred = clip_to_video[np.argsort(vid_i_m, 1)] 88 | 89 | prs = [] 90 | for i in [1, 5, 10]: 91 | top_i_preds = vid_i_pred[:, -i:] 92 | unique_ids, counts = np.unique(top_i_preds, return_counts=True) 93 | id_pred = unique_ids[np.argsort(counts)[-i:]] 94 | prs.append(video_id in id_pred) 95 | corr_preds.append(prs) 96 | 97 | t1, t5, t10 = zip(*corr_preds) 98 | print('Retrieval single | full caption -> full video | for each caption get top_k clip predictions, then get sorted majority vote for final top_k predictions.') 99 | print('NN Retrieval R@1:', np.mean(t1)) 100 | print('NN Retrieval R@5:', np.mean(t5)) 101 | print('NN Retrieval R@10:', np.mean(t10)) 102 | 103 | #data = pickle.load(open('temp_data/YouCook2.pkl', 'rb')) 104 | #print(data.keys()) 105 | #evaluate_recall(data['text'], data['audio']+data['video'], data['video_id']) 106 | 107 | -------------------------------------------------------------------------------- /youcook_dataloader.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | from __future__ import print_function 5 | 6 | import torch as th 7 | from torch.utils.data import Dataset 8 | import pickle 9 | import torch.nn.functional as F 10 | import numpy as np 11 | import re 12 | from torch.utils.data.dataloader import default_collate 13 | 14 | class Youcook_DataLoader(Dataset): 15 | """Youcook dataset loader.""" 16 | 17 | def __init__( 18 | self, 19 | data, 20 | we, 21 | we_dim=300, 22 | max_words=30, 23 | num_frames_multiplier=5, 24 | tri_modal=False, 25 | ): 26 | """ 27 | Args: 28 | """ 29 | self.data = pickle.load(open(data, 'rb')) 30 | self.we = we 31 | self.we_dim = we_dim 32 | self.max_words = max_words 33 | self.num_frames_multiplier = num_frames_multiplier 34 | self.tri_modal = tri_modal 35 | 36 | def __len__(self): 37 | return len(self.data) 38 
| 39 | def custom_collate(self, batch): 40 | return default_collate(batch) 41 | 42 | def _zero_pad_tensor(self, tensor, size): 43 | if len(tensor) >= size: 44 | return tensor[:size] 45 | else: 46 | zero = np.zeros((size - len(tensor), self.we_dim), dtype=np.float32) 47 | return np.concatenate((tensor, zero), axis=0) 48 | 49 | def _tokenize_text(self, sentence): 50 | w = re.findall(r"[\w']+", str(sentence)) 51 | return w 52 | 53 | def _words_to_we(self, words): 54 | words = [word for word in words if word in self.we.vocab] 55 | if words: 56 | we = self._zero_pad_tensor(self.we[words], self.max_words) 57 | return th.from_numpy(we) 58 | else: 59 | return th.zeros(self.max_words, self.we_dim) 60 | 61 | def __getitem__(self, idx): 62 | # load 2d and 3d features (features are pooled over the time dimension) 63 | feat_2d = F.normalize(th.from_numpy(self.data[idx]['2d']).float(), dim=0) 64 | feat_3d = F.normalize(th.from_numpy(self.data[idx]['3d']).float(), dim=0) 65 | video = th.cat((feat_2d, feat_3d)) 66 | 67 | # load audio and zero pad/truncate if necessary 68 | audio = self.data[idx]['audio'] 69 | target_length = 1024 * self.num_frames_multiplier 70 | nframes = audio.numpy().shape[1] 71 | p = target_length - nframes 72 | if p > 0: 73 | audio = np.pad(audio, ((0,0),(0,p)), 'constant', constant_values=(0,0)) 74 | elif p < 0: 75 | audio = audio[:,0:p] 76 | audio = th.FloatTensor(audio) 77 | 78 | caption = '' 79 | if self.tri_modal: 80 | caption = self._words_to_we(self._tokenize_text(self.data[idx]['caption'])) 81 | 82 | return {'video': video, 'text': caption, 'video_id': self.data[idx]['id'], 83 | 'audio': audio, 'nframes': nframes} 84 | 85 | 86 | class Youcook_DataLoader_label(Dataset): 87 | """Youcook dataset loader.""" 88 | 89 | def __init__( 90 | self, 91 | data, 92 | we, 93 | pseudo_v, 94 | pseudo_a, 95 | we_dim=300, 96 | max_words=30, 97 | num_frames_multiplier=5, 98 | tri_modal=False, 99 | 100 | ): 101 | """ 102 | Args: 103 | """ 104 | self.data = pickle.load(open(data, 'rb')) #9000*4800 105 | self.we = we 106 | self.we_dim = we_dim 107 | self.max_words = max_words 108 | self.num_frames_multiplier = num_frames_multiplier 109 | self.tri_modal = tri_modal 110 | self.pseudo_v = pseudo_v 111 | self.pseudo_a = pseudo_a 112 | 113 | def __len__(self): 114 | return len(self.data) 115 | 116 | def custom_collate(self, batch): 117 | return default_collate(batch) 118 | 119 | def _zero_pad_tensor(self, tensor, size): 120 | if len(tensor) >= size: 121 | return tensor[:size] 122 | else: 123 | zero = np.zeros((size - len(tensor), self.we_dim), dtype=np.float32) 124 | return np.concatenate((tensor, zero), axis=0) 125 | 126 | def _tokenize_text(self, sentence): 127 | w = re.findall(r"[\w']+", str(sentence)) 128 | return w 129 | 130 | def _words_to_we(self, words): 131 | words = [word for word in words if word in self.we.vocab] 132 | if words: 133 | we = self._zero_pad_tensor(self.we[words], self.max_words) 134 | return th.from_numpy(we) 135 | else: 136 | return th.zeros(self.max_words, self.we_dim) 137 | 138 | def __getitem__(self, idx): 139 | # load 2d and 3d features (features are pooled over the time dimension) 140 | feat_2d = F.normalize(th.from_numpy(self.data[idx]['2d']).float(), dim=0) 141 | feat_3d = F.normalize(th.from_numpy(self.data[idx]['3d']).float(), dim=0) 142 | video = th.cat((feat_2d, feat_3d)) 143 | 144 | # load audio and zero pad/truncate if necessary 145 | audio = self.data[idx]['audio'] 146 | target_length = 1024 * self.num_frames_multiplier 147 | nframes = audio.numpy().shape[1] 148 
| p = target_length - nframes 149 | if p > 0: 150 | audio = np.pad(audio, ((0, 0), (0, p)), 'constant', constant_values=(0, 0)) 151 | elif p < 0: 152 | audio = audio[:, 0:p] 153 | audio = th.FloatTensor(audio) 154 | 155 | caption = '' 156 | if self.tri_modal: 157 | caption = self._words_to_we(self._tokenize_text(self.data[idx]['caption'])) 158 | 159 | return {'video': video, 'text': caption, 'video_id': self.data[idx]['id'], 160 | 'audio': audio, 'nframes': nframes, 'pseudo_v':self.pseudo_v[idx], 'pseudo_a':self.pseudo_a[idx]} 161 | 162 | class Youcook_DataLoader_knn(Dataset): 163 | """Youcook dataset loader.""" 164 | 165 | def __init__( 166 | self, 167 | data, 168 | we, 169 | knn_v, 170 | knn_a, 171 | we_dim=300, 172 | max_words=30, 173 | num_frames_multiplier=5, 174 | tri_modal=False, 175 | 176 | ): 177 | """ 178 | Args: 179 | """ 180 | self.data = pickle.load(open(data, 'rb')) #9000*4800 181 | self.we = we 182 | self.we_dim = we_dim 183 | self.max_words = max_words 184 | self.num_frames_multiplier = num_frames_multiplier 185 | self.tri_modal = tri_modal 186 | self.knn_v = knn_v 187 | self.knn_a = knn_a 188 | 189 | def __len__(self): 190 | return len(self.data) 191 | 192 | def custom_collate(self, batch): 193 | return default_collate(batch) 194 | 195 | def _zero_pad_tensor(self, tensor, size): 196 | if len(tensor) >= size: 197 | return tensor[:size] 198 | else: 199 | zero = np.zeros((size - len(tensor), self.we_dim), dtype=np.float32) 200 | return np.concatenate((tensor, zero), axis=0) 201 | 202 | def _tokenize_text(self, sentence): 203 | w = re.findall(r"[\w']+", str(sentence)) 204 | return w 205 | 206 | def _words_to_we(self, words): 207 | words = [word for word in words if word in self.we.vocab] 208 | if words: 209 | we = self._zero_pad_tensor(self.we[words], self.max_words) 210 | return th.from_numpy(we) 211 | else: 212 | return th.zeros(self.max_words, self.we_dim) 213 | 214 | def __getitem__(self, idx): 215 | video_feature = [] 216 | text_feature = [] 217 | audio_feature = [] 218 | nframes_list = [] 219 | caption_text = [] 220 | for i in self.knn_v[idx]: 221 | # load 2d and 3d features (features are pooled over the time dimension) 222 | feat_2d = F.normalize(th.from_numpy(self.data[i]['2d']).float(), dim=0) 223 | feat_3d = F.normalize(th.from_numpy(self.data[i]['3d']).float(), dim=0) 224 | video = th.cat((feat_2d, feat_3d)) 225 | video_feature.append(video.numpy()) 226 | # load audio and zero pad/truncate if necessary 227 | audio = self.data[i]['audio'] 228 | target_length = 1024 * self.num_frames_multiplier 229 | nframes = audio.numpy().shape[1] 230 | nframes_list.append(nframes) 231 | p = target_length - nframes 232 | if p > 0: 233 | audio = np.pad(audio, ((0, 0), (0, p)), 'constant', constant_values=(0, 0)) 234 | elif p < 0: 235 | audio = audio[:, 0:p] 236 | audio = th.FloatTensor(audio) 237 | audio_feature.append(audio.numpy()) 238 | caption = '' 239 | if self.tri_modal: 240 | caption = self._words_to_we(self._tokenize_text(self.data[i]['caption'])) 241 | text_feature.append(caption.numpy()) 242 | video_f = np.asarray(video_feature) 243 | text_f = np.asarray(text_feature) 244 | audio_f = np.asarray(audio_feature) 245 | nframes_l = np.asarray(nframes_list) 246 | """ 247 | print('dataload') 248 | print(video_f.shape) 249 | print(text_f.shape) 250 | print(audio_f.shape) 251 | print(nframes_l.shape) 252 | print('dataload_fin') 253 | """ 254 | #caption_text = 255 | return {'video': video_f, 'text': text_f, 'video_id': self.data[i]['id'], 256 | 'audio': audio_f, 'nframes': nframes_l} 
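
The snippet below is a minimal usage sketch, not part of the repository: it shows how the Youcook_DataLoader defined above could be wrapped in a standard PyTorch DataLoader to iterate over (video, audio, text) features. The pickle and word2vec paths are placeholders, and the word embeddings are assumed to be a gensim 3.x KeyedVectors object, since the loader indexes it by word and reads its .vocab attribute.

```python
import torch as th
from torch.utils.data import DataLoader
from gensim.models import KeyedVectors

from youcook_dataloader import Youcook_DataLoader

# Placeholder paths -- substitute the real YouCook2 feature pickle and word2vec binary.
we = KeyedVectors.load_word2vec_format('path/to/word2vec.bin', binary=True)
dataset = Youcook_DataLoader(data='path/to/youcook_features.pkl', we=we, we_dim=300,
                             max_words=30, num_frames_multiplier=5, tri_modal=True)
loader = DataLoader(dataset, batch_size=16, shuffle=False, num_workers=4,
                    collate_fn=dataset.custom_collate)

for batch in loader:
    video = batch['video']      # (B, 2D+3D feature dim), per-modality L2-normalized
    audio = batch['audio']      # (B, freq_bins, 1024 * num_frames_multiplier), zero-padded/truncated
    text = batch['text']        # (B, max_words, we_dim) word embeddings when tri_modal=True
    nframes = batch['nframes']  # original (unpadded) number of audio frames per clip
```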
-------------------------------------------------------------------------------- /youtube_dataloader.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | from __future__ import print_function 5 | 6 | import torch as th 7 | from torch.utils.data import Dataset 8 | import torch.nn.functional as F 9 | import pandas as pd 10 | import os 11 | import numpy as np 12 | import re 13 | import random 14 | import librosa 15 | from model_davenet import LoadAudio 16 | 17 | 18 | class Youtube_DataLoader(Dataset): 19 | """Youtube dataset loader.""" 20 | 21 | def __init__( 22 | self, 23 | csv, 24 | features_path, 25 | features_path_audio, 26 | caption, 27 | we, 28 | min_time=10.0, 29 | feature_framerate=1.0, 30 | feature_framerate_3D=24.0 / 16.0, 31 | we_dim=300, 32 | max_words=30, 33 | min_words=0, 34 | n_pair=1, 35 | num_audio_frames=1024, 36 | random_audio_windows=False, 37 | ): 38 | """ 39 | Args: 40 | """ 41 | self.csv = pd.read_csv(csv) 42 | self.features_path = features_path 43 | self.features_path_audio = features_path_audio if features_path_audio != "" \ 44 | else features_path 45 | self.caption = caption 46 | self.min_time = min_time 47 | self.feature_framerate = feature_framerate 48 | self.feature_framerate_3D = feature_framerate_3D 49 | self.we_dim = we_dim 50 | self.max_words = max_words 51 | self.min_words = min_words 52 | self.num_audio_frames = num_audio_frames 53 | self.we = we 54 | self.n_pair = n_pair 55 | self.fps = {'2d': feature_framerate, '3d': feature_framerate_3D} 56 | self.feature_path = {'2d': features_path} 57 | if features_path != '': 58 | self.feature_path['3d'] = features_path 59 | self.random_audio_windows = random_audio_windows 60 | 61 | def __len__(self): 62 | return len(self.csv) 63 | 64 | def _zero_pad_tensor(self, tensor, size): 65 | if len(tensor) >= size: 66 | return tensor[:size] 67 | else: 68 | zero = np.zeros((size - len(tensor), self.we_dim), dtype=np.float32) 69 | return np.concatenate((tensor, zero), axis=0) 70 | 71 | def _zero_pad_audio(self, audio, max_frames): 72 | n_frames = audio.shape[1] 73 | if n_frames >= max_frames: 74 | return audio[:, 0:max_frames], int(max_frames) 75 | else: 76 | p = max_frames - n_frames 77 | audio_padded = np.pad(audio, ((0, 0), (0, p)), 'constant', constant_values=(0, 0)) 78 | return audio_padded, n_frames 79 | 80 | def _tokenize_text(self, sentence): 81 | w = re.findall(r"[\w']+", str(sentence)) 82 | return w 83 | 84 | def _words_to_we(self, words): 85 | words = [word for word in words if word in self.we.vocab] 86 | if words: 87 | we = self._zero_pad_tensor(self.we[words], self.max_words) 88 | return th.from_numpy(we) 89 | else: 90 | return th.zeros(self.max_words, self.we_dim) 91 | 92 | def _get_audio_and_text(self, caption, n_pair_max, mel_spec): 93 | n_caption = len(caption['start']) 94 | k = n_pair_max 95 | starts = np.zeros(k) 96 | ends = np.zeros(k) 97 | text = th.zeros(k, self.max_words, self.we_dim) 98 | audio = [0 for i in range(k)] 99 | nframes = np.zeros(k) 100 | r_ind = np.random.choice(range(n_caption), k, replace=True) 101 | 102 | for i in range(k): 103 | ind = r_ind[i] 104 | audio[i], nframes[i], starts[i], ends[i], text[i] = self._get_single_audio_text(caption, ind, mel_spec) 105 | 106 | audio = th.cat([i.unsqueeze(0) for i in audio], dim=0) 107 | return audio, nframes, starts, ends, text 108 | 109 | def _get_single_audio_text(self, caption, ind, mel_spec): 
110 | start, end = ind, ind 111 | words = self._tokenize_text(caption['text'][ind]) 112 | diff = caption['end'][end] - caption['start'][start] 113 | # Extend the video clip if shorter than the minimum desired clip duration 114 | while diff < self.min_time: 115 | if start > 0 and end < len(caption['end']) - 1: 116 | next_words = self._tokenize_text(caption['text'][end + 1]) 117 | prev_words = self._tokenize_text(caption['text'][start - 1]) 118 | d1 = caption['end'][end + 1] - caption['start'][start] 119 | d2 = caption['end'][end] - caption['start'][start - 1] 120 | # Use the closest neighboring video clip 121 | if d2 <= d1: 122 | start -= 1 123 | words.extend(prev_words) 124 | else: 125 | end += 1 126 | words.extend(next_words) 127 | # If no video clips after it, use the clip before it 128 | elif start > 0: 129 | words.extend(self._tokenize_text(caption['text'][start - 1])) 130 | start -= 1 131 | # If no video clips before it, use the clip after it. 132 | elif end < len(caption['end']) - 1: 133 | words.extend(self._tokenize_text(caption['text'][end + 1])) 134 | end += 1 135 | # If there's no clips before or after 136 | else: 137 | break 138 | diff = caption['end'][end] - caption['start'][start] 139 | 140 | frames = librosa.core.time_to_frames([caption['start'][start], caption['end'][end]], sr=16000, hop_length=160, 141 | n_fft=400) 142 | padded_mel_spec, nframes = self._zero_pad_audio(mel_spec[:, frames[0]: frames[1]], self.num_audio_frames) 143 | return th.from_numpy(padded_mel_spec), nframes, caption['start'][start], caption['end'][end], self._words_to_we( 144 | words) 145 | 146 | def _get_audio_random(self, n_pair_max, mel_spec): 147 | k = n_pair_max 148 | starts = np.zeros(k) 149 | ends = np.zeros(k) 150 | audio = [0 for i in range(k)] 151 | nframes = np.zeros(k) 152 | video_duration_seconds = int( 153 | librosa.core.frames_to_time(mel_spec.shape[1], sr=16000, hop_length=160, n_fft=400)) 154 | num_audio_seconds = int(librosa.core.frames_to_time(self.num_audio_frames, sr=16000, hop_length=160, n_fft=400)) 155 | # Sample clips that end before the end of the video 156 | # If the video is shorter than the desired window, use the entire video 157 | start_seconds = np.random.choice(range(max(1, video_duration_seconds - (num_audio_seconds + 1))), k, 158 | replace=True) 159 | 160 | for i in range(k): 161 | start_frame = max(0, librosa.core.time_to_frames(start_seconds[i], sr=16000, hop_length=160, n_fft=400)) 162 | audio_window = mel_spec[:, start_frame: start_frame + self.num_audio_frames] 163 | # Pad in the case that the audio wasn't long enough 164 | padded_mel_spec, nframes_spec = self._zero_pad_audio(audio_window, self.num_audio_frames) 165 | end_second = start_seconds[i] + num_audio_seconds 166 | audio[i], nframes[i], starts[i], ends[i] = th.from_numpy(padded_mel_spec), nframes_spec, start_seconds[ 167 | i], end_second 168 | 169 | audio = th.cat([i.unsqueeze(0) for i in audio], dim=0) 170 | return audio, nframes, starts, ends 171 | 172 | def _get_video(self, vid_path, s, e, video_id): 173 | feature_path = {} 174 | video = {} 175 | output = {} 176 | for k in self.feature_path: 177 | feature_path[k] = os.path.join(self.feature_path[k], vid_path, video_id + "_{}.npz".format(k)) 178 | np_arr = np.load(feature_path[k])['features'] 179 | video[k] = th.from_numpy(np_arr).float() 180 | output[k] = th.zeros(len(s), video[k].shape[-1]) 181 | for i in range(len(s)): 182 | start = int(s[i] * self.fps[k]) 183 | end = int(e[i] * self.fps[k]) + 1 184 | slice = video[k][start:end] 185 | if len(slice) < 
1: 186 | #print("missing visual feats; video_id: {}, start: {}, end: {}".format(feature_path[k], start, end)) 187 | missing=1 188 | else: 189 | output[k][i] = F.normalize(th.max(slice, dim=0)[0], dim=0) 190 | 191 | return th.cat([output[k] for k in output], dim=1) 192 | 193 | def __getitem__(self, idx): 194 | vid_path = self.csv['path'].values[idx].replace("None/", "") 195 | video_id = vid_path.split("/")[-1] 196 | audio_path = os.path.join(self.features_path_audio, vid_path, video_id + "_spec.npz") 197 | mel_spec = np.load(audio_path)['arr_0'] 198 | if self.random_audio_windows: 199 | audio, nframes, starts, ends = self._get_audio_random(self.n_pair, mel_spec) 200 | else: 201 | audio, nframes, starts, ends, text = self._get_audio_and_text(self.caption[video_id], self.n_pair, mel_spec) 202 | video = self._get_video(vid_path, starts, ends, video_id) 203 | if self.random_audio_windows: 204 | return {'video': video, 'audio': th.HalfTensor(audio), 'nframes': th.IntTensor(nframes), 205 | 'video_id': video_id} 206 | else: 207 | return {'video': video, 'audio': th.HalfTensor(audio), 'nframes': th.IntTensor(nframes), 208 | 'video_id': video_id, 209 | 'text': text} 210 | 211 | 212 | class Youtube_DataLoader_label(Dataset): 213 | """Youtube dataset loader.""" 214 | 215 | def __init__( 216 | self, 217 | csv, 218 | pseu_label_a, 219 | pseu_label_v, 220 | features_path, 221 | features_path_audio, 222 | caption, 223 | we, 224 | min_time=10.0, 225 | feature_framerate=1.0, 226 | feature_framerate_3D=24.0 / 16.0, 227 | we_dim=300, 228 | max_words=30, 229 | min_words=0, 230 | n_pair=1, 231 | num_audio_frames=1024, 232 | random_audio_windows=False, 233 | ): 234 | """ 235 | Args: 236 | """ 237 | self.csv = pd.read_csv(csv) 238 | self.features_path = features_path 239 | self.features_path_audio = features_path_audio if features_path_audio != "" \ 240 | else features_path 241 | self.caption = caption 242 | self.min_time = min_time 243 | self.feature_framerate = feature_framerate 244 | self.feature_framerate_3D = feature_framerate_3D 245 | self.we_dim = we_dim 246 | self.max_words = max_words 247 | self.min_words = min_words 248 | self.num_audio_frames = num_audio_frames 249 | self.we = we 250 | self.n_pair = n_pair 251 | self.fps = {'2d': feature_framerate, '3d': feature_framerate_3D} 252 | self.feature_path = {'2d': features_path} 253 | if features_path != '': 254 | self.feature_path['3d'] = features_path 255 | self.random_audio_windows = random_audio_windows 256 | self.pseu_label_a = pseu_label_a 257 | self.pseu_label_v = pseu_label_v 258 | 259 | def __len__(self): 260 | return len(self.csv) 261 | 262 | def _zero_pad_tensor(self, tensor, size): 263 | if len(tensor) >= size: 264 | return tensor[:size] 265 | else: 266 | zero = np.zeros((size - len(tensor), self.we_dim), dtype=np.float32) 267 | return np.concatenate((tensor, zero), axis=0) 268 | 269 | def _zero_pad_audio(self, audio, max_frames): 270 | n_frames = audio.shape[1] 271 | if n_frames >= max_frames: 272 | return audio[:, 0:max_frames], int(max_frames) 273 | else: 274 | p = max_frames - n_frames 275 | audio_padded = np.pad(audio, ((0, 0), (0, p)), 'constant', constant_values=(0, 0)) 276 | return audio_padded, n_frames 277 | 278 | def _tokenize_text(self, sentence): 279 | w = re.findall(r"[\w']+", str(sentence)) 280 | return w 281 | 282 | def _words_to_we(self, words): 283 | words = [word for word in words if word in self.we.vocab] 284 | if words: 285 | we = self._zero_pad_tensor(self.we[words], self.max_words) 286 | return th.from_numpy(we) 287 | else: 
288 | return th.zeros(self.max_words, self.we_dim) 289 | 290 | def _get_audio_and_text(self, caption, n_pair_max, mel_spec): 291 | n_caption = len(caption['start']) 292 | k = n_pair_max 293 | starts = np.zeros(k) 294 | ends = np.zeros(k) 295 | text = th.zeros(k, self.max_words, self.we_dim) 296 | audio = [0 for i in range(k)] 297 | nframes = np.zeros(k) 298 | r_ind = np.random.choice(range(n_caption), k, replace=True) 299 | 300 | for i in range(k): 301 | ind = r_ind[i] 302 | audio[i], nframes[i], starts[i], ends[i], text[i] = self._get_single_audio_text(caption, ind, mel_spec) 303 | 304 | audio = th.cat([i.unsqueeze(0) for i in audio], dim=0) 305 | return audio, nframes, starts, ends, text 306 | 307 | def _get_single_audio_text(self, caption, ind, mel_spec): 308 | start, end = ind, ind 309 | words = self._tokenize_text(caption['text'][ind]) 310 | diff = caption['end'][end] - caption['start'][start] 311 | # Extend the video clip if shorter than the minimum desired clip duration 312 | while diff < self.min_time: 313 | if start > 0 and end < len(caption['end']) - 1: 314 | next_words = self._tokenize_text(caption['text'][end + 1]) 315 | prev_words = self._tokenize_text(caption['text'][start - 1]) 316 | d1 = caption['end'][end + 1] - caption['start'][start] 317 | d2 = caption['end'][end] - caption['start'][start - 1] 318 | # Use the closest neighboring video clip 319 | if d2 <= d1: 320 | start -= 1 321 | words.extend(prev_words) 322 | else: 323 | end += 1 324 | words.extend(next_words) 325 | # If no video clips after it, use the clip before it 326 | elif start > 0: 327 | words.extend(self._tokenize_text(caption['text'][start - 1])) 328 | start -= 1 329 | # If no video clips before it, use the clip after it. 330 | elif end < len(caption['end']) - 1: 331 | words.extend(self._tokenize_text(caption['text'][end + 1])) 332 | end += 1 333 | # If there's no clips before or after 334 | else: 335 | break 336 | diff = caption['end'][end] - caption['start'][start] 337 | 338 | frames = librosa.core.time_to_frames([caption['start'][start], caption['end'][end]], sr=16000, hop_length=160, 339 | n_fft=400) 340 | padded_mel_spec, nframes = self._zero_pad_audio(mel_spec[:, frames[0]: frames[1]], self.num_audio_frames) 341 | return th.from_numpy(padded_mel_spec), nframes, caption['start'][start], caption['end'][end], self._words_to_we( 342 | words) 343 | 344 | def _get_audio_random(self, n_pair_max, mel_spec): 345 | k = n_pair_max 346 | starts = np.zeros(k) 347 | ends = np.zeros(k) 348 | audio = [0 for i in range(k)] 349 | nframes = np.zeros(k) 350 | video_duration_seconds = int( 351 | librosa.core.frames_to_time(mel_spec.shape[1], sr=16000, hop_length=160, n_fft=400)) 352 | num_audio_seconds = int(librosa.core.frames_to_time(self.num_audio_frames, sr=16000, hop_length=160, n_fft=400)) 353 | # Sample clips that end before the end of the video 354 | # If the video is shorter than the desired window, use the entire video 355 | start_seconds = np.random.choice(range(max(1, video_duration_seconds - (num_audio_seconds + 1))), k, 356 | replace=True) 357 | 358 | for i in range(k): 359 | start_frame = max(0, librosa.core.time_to_frames(start_seconds[i], sr=16000, hop_length=160, n_fft=400)) 360 | audio_window = mel_spec[:, start_frame: start_frame + self.num_audio_frames] 361 | # Pad in the case that the audio wasn't long enough 362 | padded_mel_spec, nframes_spec = self._zero_pad_audio(audio_window, self.num_audio_frames) 363 | end_second = start_seconds[i] + num_audio_seconds 364 | audio[i], nframes[i], starts[i], 
ends[i] = th.from_numpy(padded_mel_spec), nframes_spec, start_seconds[ 365 | i], end_second 366 | 367 | audio = th.cat([i.unsqueeze(0) for i in audio], dim=0) 368 | return audio, nframes, starts, ends 369 | 370 | def _get_video(self, vid_path, s, e, video_id): 371 | feature_path = {} 372 | video = {} 373 | output = {} 374 | for k in self.feature_path: 375 | feature_path[k] = os.path.join(self.feature_path[k], vid_path, video_id + "_{}.npz".format(k)) 376 | np_arr = np.load(feature_path[k])['features'] 377 | video[k] = th.from_numpy(np_arr).float() 378 | output[k] = th.zeros(len(s), video[k].shape[-1]) 379 | for i in range(len(s)): 380 | start = int(s[i] * self.fps[k]) 381 | end = int(e[i] * self.fps[k]) + 1 382 | slice = video[k][start:end] 383 | if len(slice) < 1: 384 | #print("missing visual feats; video_id: {}, start: {}, end: {}".format(feature_path[k], start, end)) 385 | missing = 1 386 | else: 387 | output[k][i] = F.normalize(th.max(slice, dim=0)[0], dim=0) 388 | 389 | return th.cat([output[k] for k in output], dim=1) 390 | 391 | def __getitem__(self, idx): 392 | vid_path = self.csv['path'].values[idx].replace("None/", "") 393 | video_id = vid_path.split("/")[-1] 394 | audio_path = os.path.join(self.features_path_audio, vid_path, video_id + "_spec.npz") 395 | mel_spec = np.load(audio_path)['arr_0'] 396 | if self.random_audio_windows: 397 | audio, nframes, starts, ends = self._get_audio_random(self.n_pair, mel_spec) 398 | else: 399 | audio, nframes, starts, ends, text = self._get_audio_and_text(self.caption[video_id], self.n_pair, mel_spec) 400 | video = self._get_video(vid_path, starts, ends, video_id) 401 | if self.random_audio_windows: 402 | return {'video': video, 'audio': th.HalfTensor(audio), 'nframes': th.IntTensor(nframes), 403 | 'video_id': video_id} 404 | else: 405 | return {'video': video, 'audio': th.HalfTensor(audio), 'nframes': th.IntTensor(nframes), 406 | 'video_id': video_id, 407 | 'text': text, 'pseudo_v': self.pseu_label_v[idx], 'pseudo_a': self.pseu_label_a[idx]} -------------------------------------------------------------------------------- /youtube_mil_dataloader.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | from __future__ import print_function 5 | 6 | import torch as th 7 | from torch.utils.data import Dataset 8 | import torch.nn.functional as F 9 | import pandas as pd 10 | import os 11 | import numpy as np 12 | import re 13 | import random 14 | import librosa 15 | from model_davenet import LoadAudio 16 | 17 | 18 | class Youtube_DataLoader(Dataset): 19 | """Youtube dataset loader.""" 20 | 21 | def __init__( 22 | self, 23 | csv, 24 | features_path, 25 | features_path_audio, 26 | caption, 27 | we, 28 | min_time=10.0, 29 | feature_framerate=1.0, 30 | feature_framerate_3D=24.0 / 16.0, 31 | we_dim=300, 32 | max_words=30, 33 | min_words=0, 34 | n_pair=1, 35 | num_audio_frames=1024, 36 | num_candidates=1, 37 | random_audio_windows=False, 38 | ): 39 | """ 40 | Args: 41 | """ 42 | self.csv = pd.read_csv(csv) 43 | self.features_path = features_path 44 | self.features_path_audio = features_path_audio if features_path_audio != "" \ 45 | else features_path 46 | self.caption = caption 47 | self.min_time = min_time 48 | self.feature_framerate = feature_framerate 49 | self.feature_framerate_3D = feature_framerate_3D 50 | self.we_dim = we_dim 51 | self.max_words = max_words 52 | self.min_words = min_words 53 |
self.num_audio_frames = num_audio_frames 54 | self.we = we 55 | self.n_pair = n_pair 56 | self.fps = {'2d': feature_framerate, '3d': feature_framerate_3D} 57 | self.feature_path = {'2d': features_path} 58 | if features_path != '': 59 | self.feature_path['3d'] = features_path 60 | self.num_candidates = num_candidates 61 | self.random_audio_windows = random_audio_windows 62 | 63 | def __len__(self): 64 | return len(self.csv) 65 | 66 | def _zero_pad_tensor(self, tensor, size): 67 | if len(tensor) >= size: 68 | return tensor[:size] 69 | else: 70 | zero = np.zeros((size - len(tensor), self.we_dim), dtype=np.float32) 71 | return np.concatenate((tensor, zero), axis=0) 72 | 73 | def _zero_pad_audio(self, audio, max_frames): 74 | n_frames = audio.shape[1] 75 | if n_frames >= max_frames: 76 | return audio[:, 0:max_frames], int(max_frames) 77 | else: 78 | p = max_frames - n_frames 79 | audio_padded = np.pad(audio, ((0, 0), (0, p)), 'constant', constant_values=(0, 0)) 80 | return audio_padded, n_frames 81 | 82 | def _tokenize_text(self, sentence): 83 | w = re.findall(r"[\w']+", str(sentence)) 84 | return w 85 | 86 | def _words_to_we(self, words): 87 | words = [word for word in words if word in self.we.vocab] 88 | if words: 89 | we = self._zero_pad_tensor(self.we[words], self.max_words) 90 | return th.from_numpy(we) 91 | else: 92 | return th.zeros(self.max_words, self.we_dim) 93 | """ 94 | def _get_text(self, caption, n_pair_max): 95 | n_caption = len(caption['start']) 96 | k = n_pair_max 97 | starts = np.zeros(k) 98 | ends = np.zeros(k) 99 | text = th.zeros(k, self.max_words, self.we_dim) 100 | r_ind = np.random.choice(range(n_caption), k, replace=True) 101 | 102 | for i in range(k): 103 | ind = r_ind[i] 104 | text[i], starts[i], ends[i] = self._get_single_text(caption, ind) 105 | 106 | return text, starts, ends 107 | """ 108 | def _get_single_text(self, caption, ind): 109 | start, end = ind, ind 110 | words = self._tokenize_text(caption['text'][ind]) 111 | diff = caption['end'][end] - caption['start'][start] 112 | while len(words) < self.min_words or diff < self.min_time: 113 | if start > 0 and end < len(caption['end']) - 1: 114 | next_words = self._tokenize_text(caption['text'][end + 1]) 115 | prev_words = self._tokenize_text(caption['text'][start - 1]) 116 | d1 = caption['end'][end + 1] - caption['start'][start] 117 | d2 = caption['end'][end] - caption['start'][start - 1] 118 | if (self.min_time > 0 and d2 <= d1) or \ 119 | (self.min_time == 0 and len(next_words) <= len(prev_words)): 120 | start -= 1 121 | words.extend(prev_words) 122 | else: 123 | end += 1 124 | words.extend(next_words) 125 | elif start > 0: 126 | words.extend(self._tokenize_text(caption['text'][start - 1])) 127 | start -= 1 128 | elif end < len(caption['end']) - 1: 129 | words.extend(self._tokenize_text(caption['text'][end + 1])) 130 | end += 1 131 | else: 132 | break 133 | diff = caption['end'][end] - caption['start'][start] 134 | return self._words_to_we(words), \ 135 | caption['start'][start], caption['end'][end] 136 | 137 | 138 | def _get_video(self, vid_path, s, e, video_id): 139 | feature_path = {} 140 | video = {} 141 | output = {} 142 | for k in self.feature_path: 143 | feature_path[k] = os.path.join(self.feature_path[k], vid_path, video_id + "_{}.npz".format(k)) 144 | np_arr = np.load(feature_path[k])['features'] 145 | video[k] = th.from_numpy(np_arr).float() 146 | output[k] = th.zeros(video[k].shape[-1]) # single clip here, so keep a 1-D feature vector per modality 147 | 148 | start = int(s * self.fps[k]) 149 | end = int(e * self.fps[k]) + 1 150 | slice = 
video[k][start:end] 151 | if len(slice) < 1: 152 | #print("missing visual feats; video_id: {}, start: {}, end: {}".format(feature_path[k], start, end)) 153 | missing=1 154 | else: 155 | output[k] = F.normalize(th.max(slice, dim=0)[0], dim=0) 156 | 157 | return th.cat([output[k] for k in output], dim=0) # concatenate the 1-D 2D/3D vectors of the single clip 158 | 159 | def _find_nearest_candidates(self, caption, ind): 160 | start, end = ind, ind 161 | diff = caption['end'][end] - caption['start'][start] 162 | n_candidate = 1 163 | while n_candidate < self.num_candidates: 164 | if start == 0: 165 | return 0 166 | elif end == len(caption['end']) - 1: 167 | return start - (self.num_candidates - n_candidate) 168 | elif caption['end'][end] - caption['start'][start - 1] < caption['end'][end + 1] - caption['start'][start]: 169 | start -= 1 170 | else: 171 | end += 1 172 | n_candidate += 1 173 | return start 174 | 175 | def _get_text(self, cap): 176 | #cap = pd.read_csv(caption) 177 | ind = random.randint(0, len(cap['start']) - 1) 178 | if self.num_candidates == 1: 179 | #words = self.words_to_ids(cap['text'].values[ind]) 180 | words = self._words_to_we(self._tokenize_text(cap['text'][ind])) 181 | else: 182 | #words = th.zeros(self.num_candidates, self.max_words, dtype=th.long) 183 | words = th.zeros(self.num_candidates, self.max_words, self.we_dim) 184 | cap_start = self._find_nearest_candidates(cap, ind) 185 | for i in range(self.num_candidates): 186 | candidate_w = cap['text'][max(0, min(len(cap['text']) - 1, cap_start + i))] 187 | word_token = self._tokenize_text(candidate_w) 188 | words[i] = self._words_to_we(word_token)#self.words_to_ids() 189 | start, end = cap['start'][ind], cap['end'][ind] 190 | # TODO: May need to be improved for edge cases. 191 | if end - start < self.min_time: 192 | diff = self.min_time - end + start 193 | start = max(0, start - diff / 2) 194 | end = start + self.min_time 195 | return words, int(start), int(end) 196 | 197 | def __getitem__(self, idx): 198 | vid_path = self.csv['path'].values[idx].replace("None/", "") 199 | video_id = vid_path.split("/")[-1] 200 | #audio_path = os.path.join(self.features_path_audio, vid_path, video_id + "_spec.npz") 201 | #mel_spec = np.load(audio_path)['arr_0'] 202 | 203 | #video_path = os.path.join(self.video_root, video_file) 204 | text, start, end = self._get_text(self.caption[video_id]) 205 | video = self._get_video(vid_path, start, end, video_id) 206 | #video = self._get_video(video_path, start, end) 207 | return {'video': video, 'text': text} 208 | --------------------------------------------------------------------------------
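The HowTo100M-style loaders above share one construction pattern: a CSV with a `path` column pointing at per-video feature folders, the caption pickle, and a gensim-3.x-style `KeyedVectors` word-embedding model (the loaders index `we.vocab`), with each audio clip cut or zero-padded to `num_audio_frames` spectrogram frames (at 16 kHz with a 160-sample hop one frame covers 10 ms, so 1024 frames is roughly a 10-second window). As a rough usage sketch, and with every path and hyper-parameter value below being an illustrative placeholder rather than a setting taken from this repository, `Youtube_DataLoader` from youtube_dataloader.py can be wrapped in a standard PyTorch `DataLoader` like this:

```python
# Illustrative usage sketch for the HowTo100M loader in youtube_dataloader.py.
# All paths and hyper-parameters here are placeholders chosen for the example.
import pickle
from torch.utils.data import DataLoader
from gensim.models import KeyedVectors  # the loaders use we.vocab, i.e. gensim 3.x KeyedVectors
from youtube_dataloader import Youtube_DataLoader

we = KeyedVectors.load_word2vec_format('data/word2vec.bin', binary=True)  # placeholder path
caption = pickle.load(open('data/caption.pickle', 'rb'))  # placeholder; {video_id: {'start', 'end', 'text'}}

dataset = Youtube_DataLoader(
    csv='data/howto_videopaths.csv',   # placeholder; needs a 'path' column, one row per video
    features_path='parsed_videos/',    # folders holding <video_id>_2d.npz / _3d.npz / _spec.npz
    features_path_audio='',            # empty string falls back to features_path
    caption=caption,
    we=we,
    min_time=10.0,
    n_pair=32,                         # clips sampled per video
    num_audio_frames=1024,             # ~10 s of spectrogram frames at a 10 ms hop
    random_audio_windows=False,        # use ASR clip boundaries instead of random windows
)

loader = DataLoader(dataset, batch_size=8, shuffle=True, num_workers=4, drop_last=True)
batch = next(iter(loader))
# batch['video']  : (B, n_pair, video_feature_dim)         max-pooled, L2-normalised 2D+3D features
# batch['audio']  : (B, n_pair, n_mels, num_audio_frames)  half-precision spectrogram windows
# batch['text']   : (B, n_pair, max_words, we_dim)         zero-padded word embeddings
# batch['nframes']: (B, n_pair)                            valid (unpadded) audio frames per clip
```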