├── .gitattributes ├── README.md ├── args.py ├── args_kmeans.py ├── attention.py ├── cvpr19_dataloader.py ├── eval.py ├── eval_avlnet.sh ├── eval_cross.py ├── gen_loader.py ├── hmdb_dataloader.py ├── local_eval.py ├── loss.py ├── loss_mil.py ├── loss_sink.py ├── lsmdc_dataloader.py ├── metrics.py ├── minY_dataloader.py ├── model.py ├── model_davenet.py ├── model_kmeans_ICCV.py ├── model_tri_c.py ├── model_tri_c_clean_sp.py ├── model_tri_kmeans.py ├── msrvtt_dataloader.py ├── script.txt ├── train_avlnet.sh ├── train_tri_c.py ├── train_tri_cos_mil.py ├── train_tri_kmeans.py ├── ucf_dataloader.py ├── video_evaluation.py ├── youcook_dataloader.py ├── youtube_dataloader.py └── youtube_mil_dataloader.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Multimodal-Clustering-Network 2 | ICCV 2021 3 | 4 | This repo contains the implementation of our ICCV 2021 paper "Multimodal Clustering Networks for Self-supervised Learning from Unlabeled Videos": https://arxiv.org/abs/2104.12671. 5 | 6 | 7 | 8 | Command for pretraining: 9 | 10 | ``` 11 | 12 | model1=MCN_sep_recon_r 13 | 14 | python -u train_tri_kmeans.py --num_thread_reader=74 --epochs=30 --batch_size=128 \ 15 | --n_pair=32 --embd_dim=6144 --howto_audio_frames=1000 --min_time=10.0 --random_audio_windows=0 \ 16 | --lr=0.0001 --tri_modal=1 --apex_level=1 --kmeans=1 --use_queue=1 --queue_size=20 --fastC=1 --recon=1 --recon_size=1024 \ 17 | --features_path=/nobackup/users/kaudhkha/sightsound/data/howto/parsed_videos \ 18 | --features_path_audio=/nobackup/projects/public/howto100m/parsed_videos \ 19 | --checkpoint_dir=model_me/$model1 >> logs/$model1 20 | ``` 21 | Pretrained weights (an example evaluation command using them is sketched in a comment in the args.py section below): 22 | https://drive.google.com/drive/folders/1J8v3Ya_H9ciX1KsLUtlqeiGaSjqVbp7j?usp=sharing -------------------------------------------------------------------------------- /args.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | def get_args(description='Youtube-Text-Video'): 4 | parser = argparse.ArgumentParser(description=description) 5 | parser.add_argument( 6 | '--train_csv', 7 | type=str, 8 | default='data/HowTo100M_1166_videopaths.txt', 9 | #default='/home/brian27/nobackup/data/howto100m/HowTo100M_1166_videopaths.txt', 10 | help='train csv') 11 | parser.add_argument( 12 | '--features_path', 13 | type=str, 14 | default='parsed_videos/', 15 | help='path for visual (2D, 3D) features') 16 | parser.add_argument( 17 | '--features_path_audio', 18 | type=str, 19 | default='', 20 | help='path for audio files (defaults to --features_path)') 21 | parser.add_argument( 22 | '--caption_path', 23 | type=str, 24 | default='data/caption.pickle', 25 | help='HowTo100M caption pickle file path') 26 | parser.add_argument( 27 | '--word2vec_path', 28 | type=str, 29 | default='data/GoogleNews-vectors-negative300.bin', 30 | help='word embedding path') 31 | parser.add_argument( 32 | '--pretrain_path', 33 | type=str, 34 | default='', 35 | help='pre-trained model path') 36 | parser.add_argument( 37 | '--checkpoint_dir', 38 | type=str, 39 | default='', 40 | help='checkpoint model folder') 41 | parser.add_argument('--eval_lang_retrieval', type=int, default=0, 42 | help='if 1, eval language retrieval instead of video retrieval')
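# ---------------------------------------------------------------------------
# Editorial usage sketch (not part of the original args.py): how the released
# weights linked in the README can be plugged into the evaluation path. The
# flag values below mirror the eval commands in eval_avlnet.sh; the
# --pretrain_path value is a placeholder for wherever the downloaded
# checkpoint is stored, and --num_thread_reader should match your machine.
# The --features_path / --features_path_audio flags used in eval_avlnet.sh
# may also be required depending on which evaluation set is selected.
#
#   python train_tri_kmeans.py --eval_youcook=1 --epochs=0 --batch_size=512 \
#       --num_thread_reader=74 --n_pair=32 --embd_dim=6144 \
#       --howto_audio_frames=1000 --min_time=10.0 --random_audio_windows=0 \
#       --tri_modal=1 --kmeans=1 --use_queue=1 --queue_size=20 --fastC=1 \
#       --mean=1 --recon=0 --recon_size=1024 \
#       --pretrain_path=<path/to/downloaded_checkpoint.pth>
# ---------------------------------------------------------------------------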
43 | parser.add_argument('--tri_modal', type=int, default=0, 44 | help='use vision, speech, and text') 45 | parser.add_argument('--tri_modal_fuse', type=int, default=0, 46 | help='use speech and text features (tri_modal must be 1)') 47 | parser.add_argument('--fuse_videoaudio_additive', type=int, default=0, 48 | help='eval T->A+V with tri-modal model \ 49 | set tri_modal=1, tri_modal_fuse=0') 50 | parser.add_argument('--loss', type=int, default=0, 51 | help='0 for Masked Margin Softmax (MMS) loss') 52 | parser.add_argument('--apex_level', type=int, default=0, 53 | help='Apex (mixed precision) level: choose 0 for none, 1 for O1.') 54 | parser.add_argument('--random_audio_windows', type=int, default=1, 55 | help='1 to use random audio windows, 0 to use HowTo100M ASR clips') 56 | parser.add_argument('--howto_audio_frames', type=int, default=1024, 57 | help='number of frames to use for loading howto100m audio') 58 | parser.add_argument('--youcook_num_frames_multiplier', type=int, default=5, 59 | help='use 1024 * x audio frames for youcook2') 60 | parser.add_argument('--msrvtt_num_frames_multiplier', type=int, default=3, 61 | help='use 1024 * x audio frames for msrvtt') 62 | parser.add_argument('--lsmdc_num_frames_multiplier', type=int, default=3, 63 | help='use 1024 * x audio frames for lsmdc') 64 | parser.add_argument('--num_thread_reader', type=int, default=1, 65 | help='') 66 | parser.add_argument('--embd_dim', type=int, default=2048, 67 | help='embedding dim') 68 | parser.add_argument('--lr', type=float, default=0.0001, 69 | help='initial learning rate') 70 | parser.add_argument('--epochs', type=int, default=20, 71 | help='upper epoch limit') 72 | parser.add_argument('--batch_size', type=int, default=256, 73 | help='batch size') 74 | parser.add_argument('--batch_size_val', type=int, default=3500, 75 | help='batch size eval') 76 | parser.add_argument('--lr_decay', type=float, default=0.9, 77 | help='Learning rate exp epoch decay') 78 | parser.add_argument('--n_display', type=int, default=200, 79 | help='Information display frequency') 80 | parser.add_argument('--feature_dim', type=int, default=4096, 81 | help='video feature dimension') 82 | parser.add_argument('--we_dim', type=int, default=300, 83 | help='word embedding dimension') 84 | parser.add_argument('--seed', type=int, default=1, 85 | help='random seed') 86 | parser.add_argument('--verbose', type=int, default=1, 87 | help='') 88 | parser.add_argument('--max_words', type=int, default=20, 89 | help='') 90 | parser.add_argument('--min_words', type=int, default=0, 91 | help='') 92 | parser.add_argument('--feature_framerate', type=int, default=1, 93 | help='') 94 | parser.add_argument('--min_time', type=float, default=5.0, 95 | help='Gather small clips') 96 | parser.add_argument('--n_pair', type=int, default=1, 97 | help='Number of video clips to use per video') 98 | parser.add_argument('--lsmdc', type=int, default=0, 99 | help='Train on LSMDC data') 100 | parser.add_argument('--youcook', type=int, default=0, 101 | help='Train on YouCook2 data') 102 | parser.add_argument('--msrvtt', type=int, default=0, 103 | help='Train on MSRVTT data') 104 | parser.add_argument('--eval_lsmdc', type=int, default=0, 105 | help='Evaluate on LSMDC data') 106 | parser.add_argument('--eval_msrvtt', type=int, default=0, 107 | help='Evaluate on MSRVTT data') 108 | parser.add_argument('--eval_youcook', type=int, default=0, 109 | help='Evaluate on YouCook2 data') 110 | parser.add_argument('--eval_how', type=int, default=0, 111 | help='Evaluate on how2 data') 112 |
parser.add_argument('--sentence_dim', type=int, default=-1, 113 | help='sentence dimension') 114 | parser.add_argument('--cluster', type=int, default=0, 115 | help='cluster loss') 116 | parser.add_argument('--queue_size', type=int, default=3, 117 | help='queue size') 118 | parser.add_argument('--start_queue', type=int, default=0, 119 | help='start_queue') 120 | parser.add_argument('--start_cluster', type=int, default=0, 121 | help='start_cluster') 122 | parser.add_argument('--num_candidates', type=int, default=1, 123 | help='num candidates for MILNCE loss') 124 | parser.add_argument('--use_queue', type=int, default=0, 125 | help='use_queue') 126 | parser.add_argument('--cluster_size', type=int, default=256, 127 | help='cluster_size') 128 | parser.add_argument('--layer', type=int, default=0, 129 | help='classification layer') 130 | parser.add_argument('--soft_label', type=int, default=0, 131 | help='soft_label') 132 | parser.add_argument('--multi_cluster', type=int, default=0, 133 | help='multi_cluster') 134 | parser.add_argument('--pure_cluster', type=int, default=0, 135 | help='pure_cluster') 136 | parser.add_argument('--project', type=int, default=0, 137 | help='project') 138 | parser.add_argument('--proto_nce', type=int, default=0, 139 | help='proto_nce') 140 | parser.add_argument('--switch_loss_h', type=int, default=0, 141 | help='switch_loss_h') 142 | parser.add_argument('--switch_loss_s', type=int, default=0, 143 | help='switch_loss_s') 144 | parser.add_argument('--self_prediction', type=int, default=0, 145 | help='self_prediction') 146 | parser.add_argument('--soft_contrast', type=int, default=0, 147 | help='soft_contrast') 148 | parser.add_argument('--soft_contrast_only', type=int, default=0, 149 | help='soft_contrast_only') 150 | parser.add_argument('--nce', type=int, default=0, 151 | help='nce') 152 | parser.add_argument('--nce_only', type=int, default=0, 153 | help='nce_only') 154 | parser.add_argument('--pseudo_contrast', type=int, default=0, 155 | help='pseudo_contrast') 156 | parser.add_argument('--cooperative', type=int, default=0, 157 | help='cooperative') 158 | parser.add_argument('--project_dim', type=int, default=6000, 159 | help='project_dim') 160 | parser.add_argument('--no_audio', type=int, default=0, 161 | help='no_audio') 162 | parser.add_argument('--no_video', type=int, default=0, 163 | help='no_video') 164 | parser.add_argument('--no_va', type=int, default=0, 165 | help='no_va') 166 | parser.add_argument('--rand', type=int, default=0, 167 | help='random drop') 168 | parser.add_argument('--joint', type=int, default=0, 169 | help='joint cluster') 170 | parser.add_argument('--kmeans', type=int, default=0, 171 | help='kmeans cluster') 172 | parser.add_argument('--fastC', type=int, default=0, 173 | help='fast cluster') 174 | parser.add_argument('--withMLP', type=int, default=0, 175 | help='withMLP cluster') 176 | parser.add_argument('--recon', type=int, default=0, 177 | help='recon ') 178 | parser.add_argument('--mms', type=int, default=0, 179 | help='mms ') 180 | parser.add_argument('--mean', type=int, default=0, 181 | help='mean ') 182 | parser.add_argument('--lamb', type=float, default=0.5, 183 | help='lambda ') 184 | parser.add_argument('--tri_loss', type=int, default=0, 185 | help='tri_loss ') 186 | parser.add_argument('--recon_size', type=int, default=768, 187 | help='recon_size ') 188 | parser.add_argument('--clu_lamb', type=int, default=1, 189 | help='clu_lamb ') 190 | parser.add_argument('--noC', type=int, default=0, 191 | help='noC ') 192 | 
parser.add_argument('--cos', type=int, default=1, 193 | help='cos ') 194 | parser.add_argument("--base_lr", default=4.8, type=float, help="base learning rate") 195 | parser.add_argument("--final_lr", type=float, default=0, help="final learning rate") 196 | parser.add_argument("--freeze_prototypes_niters", default=313, type=int, 197 | help="freeze the prototypes during this many iterations from the start") 198 | parser.add_argument("--wd", default=1e-6, type=float, help="weight decay") 199 | parser.add_argument("--warmup_epochs", default=10, type=int, help="number of warmup epochs") 200 | parser.add_argument("--start_warmup", default=0, type=float, 201 | help="initial warmup learning rate") 202 | parser.add_argument('--warmup_steps', type=int, default=5000, 203 | help='') 204 | parser.add_argument( 205 | '--youcook_train_path', 206 | type=str, 207 | default='data/youcook_train_audio.pkl', 208 | help='') 209 | parser.add_argument( 210 | '--youcook_val_path', 211 | type=str, 212 | default='data/youcook_val_audio.pkl', 213 | help='') 214 | parser.add_argument( 215 | '--msrvtt_test_path', 216 | type=str, 217 | default='data/msrvtt_jsfusion_test.pkl', 218 | help='') 219 | parser.add_argument( 220 | '--msrvtt_train_path', 221 | type=str, 222 | default='data/msrvtt_train.pkl', 223 | help='') 224 | parser.add_argument( 225 | '--lsmdc_test_path', 226 | type=str, 227 | default='data/lsmdc_test.pkl', 228 | help='') 229 | parser.add_argument( 230 | '--lsmdc_train_path', 231 | type=str, 232 | default='data/lsmdc_train.pkl', 233 | help='') 234 | args = parser.parse_args() 235 | return args -------------------------------------------------------------------------------- /args_kmeans.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | def get_args(description='Youtube-Text-Video'): 4 | parser = argparse.ArgumentParser(description=description) 5 | parser.add_argument( 6 | '--train_csv', 7 | type=str, 8 | default='data/HowTo100M_1166_videopaths.txt', 9 | #default='/home/brian27/nobackup/data/howto100m/HowTo100M_1166_videopaths.txt', 10 | help='train csv') 11 | parser.add_argument( 12 | '--features_path', 13 | type=str, 14 | default='parsed_videos/', 15 | help='path for visual features (2D, 3D) visual features') 16 | parser.add_argument( 17 | '--features_path_audio', 18 | type=str, 19 | default='', 20 | help='path for audio files (defaults to --features_path)') 21 | parser.add_argument( 22 | '--caption_path', 23 | type=str, 24 | default='data/caption.pickle', 25 | help='HowTo100M caption pickle file path') 26 | parser.add_argument( 27 | '--word2vec_path', 28 | type=str, 29 | default='data/GoogleNews-vectors-negative300.bin', 30 | help='word embedding path') 31 | parser.add_argument( 32 | '--pretrain_path', 33 | type=str, 34 | default='', 35 | help='pre train model path') 36 | parser.add_argument( 37 | '--checkpoint_dir', 38 | type=str, 39 | default='', 40 | help='checkpoint model folder') 41 | parser.add_argument('--eval_lang_retrieval', type=int, default=0, 42 | help='if 1, eval language retrieval instead of video retrieval') 43 | parser.add_argument('--tri_modal', type=int, default=0, 44 | help='use vision, speech, and text') 45 | parser.add_argument('--tri_modal_fuse', type=int, default=0, 46 | help='use speech and text features (tri_modal must be 1)') 47 | parser.add_argument('--fuse_videoaudio_additive', type=int, default=0, 48 | help='eval T->A+V with tri-modal modal \ 49 | set tri_modal=1, tri_modal_fuse=0') 50 | parser.add_argument('--loss', 
type=int, default=0, 51 | help='0 for Masked Margin Softmax (MMS) loss') 52 | parser.add_argument('--apex_level', type=int, default=0, 53 | help='Apex (mixed precision) level: chose 0 for none, 1 for O1.') 54 | parser.add_argument('--random_audio_windows', type=int, default=1, 55 | help='1 to use random audio windows, 0 to use HowTo100M ASR clips') 56 | parser.add_argument('--howto_audio_frames', type=int, default=1024, 57 | help='number of frames to use for loading howto100m audio') 58 | parser.add_argument('--youcook_num_frames_multiplier', type=int, default=5, 59 | help='use 1024 * x audio frames for youcook2') 60 | parser.add_argument('--msrvtt_num_frames_multiplier', type=int, default=3, 61 | help='use 1024 * x audio frames for msrvtt') 62 | parser.add_argument('--lsmdc_num_frames_multiplier', type=int, default=3, 63 | help='use 1024 * x audio frames for lsmdc') 64 | parser.add_argument('--num_thread_reader', type=int, default=1, 65 | help='') 66 | parser.add_argument('--embd_dim', type=int, default=2048, 67 | help='embedding dim') 68 | parser.add_argument('--lr', type=float, default=0.0001, 69 | help='initial learning rate') 70 | parser.add_argument('--epochs', type=int, default=20, 71 | help='upper epoch limit') 72 | parser.add_argument('--batch_size', type=int, default=256, 73 | help='batch size') 74 | parser.add_argument('--batch_size_val', type=int, default=3500, 75 | help='batch size eval') 76 | parser.add_argument('--lr_decay', type=float, default=0.9, 77 | help='Learning rate exp epoch decay') 78 | parser.add_argument('--n_display', type=int, default=200, 79 | help='Information display frequence') 80 | parser.add_argument('--feature_dim', type=int, default=4096, 81 | help='video feature dimension') 82 | parser.add_argument('--we_dim', type=int, default=300, 83 | help='word embedding dimension') 84 | parser.add_argument('--seed', type=int, default=1, 85 | help='random seed') 86 | parser.add_argument('--verbose', type=int, default=1, 87 | help='') 88 | parser.add_argument('--max_words', type=int, default=20, 89 | help='') 90 | parser.add_argument('--min_words', type=int, default=0, 91 | help='') 92 | parser.add_argument('--feature_framerate', type=int, default=1, 93 | help='') 94 | parser.add_argument('--min_time', type=float, default=5.0, 95 | help='Gather small clips') 96 | parser.add_argument('--n_pair', type=int, default=1, 97 | help='Number of video clips to use per video') 98 | parser.add_argument('--lsmdc', type=int, default=0, 99 | help='Train on LSDMC data') 100 | parser.add_argument('--youcook', type=int, default=0, 101 | help='Train on YouCook2 data') 102 | parser.add_argument('--msrvtt', type=int, default=0, 103 | help='Train on MSRVTT data') 104 | parser.add_argument('--eval_lsmdc', type=int, default=0, 105 | help='Evaluate on LSMDC data') 106 | parser.add_argument('--eval_msrvtt', type=int, default=0, 107 | help='Evaluate on MSRVTT data') 108 | parser.add_argument('--eval_youcook', type=int, default=0, 109 | help='Evaluate on YouCook2 data') 110 | parser.add_argument('--eval_ucf', type=int, default=0, 111 | help='Evaluate on UCF-101 data') 112 | parser.add_argument('--eval_hmdb', type=int, default=0, 113 | help='Evaluate on HMDB data') 114 | parser.add_argument('--eval_cross', type=int, default=0, 115 | help='Evaluate on CrossTask data') 116 | parser.add_argument('--eval_how', type=int, default=0, 117 | help='Evaluate on how2 data') 118 | parser.add_argument('--sentence_dim', type=int, default=-1, 119 | help='sentence dimension') 120 | 
parser.add_argument('--cluster', type=int, default=0, 121 | help='cluster loss') 122 | parser.add_argument('--queue_size', type=int, default=3, 123 | help='queue size') 124 | parser.add_argument('--start_queue', type=int, default=0, 125 | help='start_queue') 126 | parser.add_argument('--start_cluster', type=int, default=0, 127 | help='start_cluster') 128 | parser.add_argument('--num_candidates', type=int, default=1, 129 | help='num candidates for MILNCE loss') 130 | parser.add_argument('--use_queue', type=int, default=0, 131 | help='use_queue') 132 | parser.add_argument('--cluster_size', type=int, default=256, 133 | help='cluster_size') 134 | parser.add_argument('--layer', type=int, default=0, 135 | help='classification layer') 136 | parser.add_argument('--soft_label', type=int, default=0, 137 | help='soft_label') 138 | parser.add_argument('--multi_cluster', type=int, default=0, 139 | help='multi_cluster') 140 | parser.add_argument('--pure_cluster', type=int, default=0, 141 | help='pure_cluster') 142 | parser.add_argument('--project', type=int, default=0, 143 | help='project') 144 | parser.add_argument('--proto_nce', type=int, default=0, 145 | help='proto_nce') 146 | parser.add_argument('--switch_loss_h', type=int, default=0, 147 | help='switch_loss_h') 148 | parser.add_argument('--switch_loss_s', type=int, default=0, 149 | help='switch_loss_s') 150 | parser.add_argument('--self_prediction', type=int, default=0, 151 | help='self_prediction') 152 | parser.add_argument('--soft_contrast', type=int, default=0, 153 | help='soft_contrast') 154 | parser.add_argument('--soft_contrast_only', type=int, default=0, 155 | help='soft_contrast_only') 156 | parser.add_argument('--nce', type=int, default=0, 157 | help='nce') 158 | parser.add_argument('--nce_only', type=int, default=0, 159 | help='nce_only') 160 | parser.add_argument('--pseudo_contrast', type=int, default=0, 161 | help='pseudo_contrast') 162 | parser.add_argument('--cooperative', type=int, default=0, 163 | help='cooperative') 164 | parser.add_argument('--project_dim', type=int, default=6000, 165 | help='project_dim') 166 | parser.add_argument('--no_audio', type=int, default=0, 167 | help='no_audio') 168 | parser.add_argument('--no_va', type=int, default=0, 169 | help='no_va') 170 | parser.add_argument('--rand', type=int, default=0, 171 | help='random drop') 172 | parser.add_argument('--joint', type=int, default=0, 173 | help='joint cluster') 174 | parser.add_argument('--kmeans', type=int, default=0, 175 | help='kmeans cluster') 176 | parser.add_argument('--fastC', type=int, default=0, 177 | help='fast cluster') 178 | parser.add_argument('--withMLP', type=int, default=0, 179 | help='withMLP cluster') 180 | parser.add_argument('--recon', type=int, default=0, 181 | help='recon ') 182 | parser.add_argument('--mms', type=int, default=0, 183 | help='mms ') 184 | parser.add_argument('--mean', type=int, default=0, 185 | help='mean ') 186 | parser.add_argument('--lamb', type=float, default=0.5, 187 | help='lambda ') 188 | parser.add_argument('--tri_loss', type=int, default=0, 189 | help='tri_loss ') 190 | parser.add_argument('--recon_size', type=int, default=768, 191 | help='recon_size ') 192 | parser.add_argument('--clu_lamb', type=int, default=1, 193 | help='clu_lamb ') 194 | parser.add_argument('--noC', type=int, default=0, 195 | help='noC ') 196 | parser.add_argument('--cos', type=int, default=1, 197 | help='cos ') 198 | parser.add_argument("--base_lr", default=4.8, type=float, help="base learning rate") 199 | parser.add_argument("--final_lr", 
type=float, default=0, help="final learning rate") 200 | parser.add_argument("--freeze_prototypes_niters", default=313, type=int, 201 | help="freeze the prototypes during this many iterations from the start") 202 | parser.add_argument("--wd", default=1e-6, type=float, help="weight decay") 203 | parser.add_argument("--warmup_epochs", default=10, type=int, help="number of warmup epochs") 204 | parser.add_argument("--start_warmup", default=0, type=float, 205 | help="initial warmup learning rate") 206 | parser.add_argument('--warmup_steps', type=int, default=5000, 207 | help='') 208 | parser.add_argument( 209 | '--youcook_train_path', 210 | type=str, 211 | default='data/youcook_train_audio.pkl', 212 | help='') 213 | parser.add_argument( 214 | '--youcook_val_path', 215 | type=str, 216 | default='data/youcook_val_audio.pkl', 217 | help='') 218 | parser.add_argument( 219 | '--msrvtt_test_path', 220 | type=str, 221 | default='data/msrvtt_jsfusion_test.pkl', 222 | help='') 223 | parser.add_argument( 224 | '--msrvtt_train_path', 225 | type=str, 226 | default='data/msrvtt_train.pkl', 227 | help='') 228 | parser.add_argument( 229 | '--lsmdc_test_path', 230 | type=str, 231 | default='data/lsmdc_test.pkl', 232 | help='') 233 | parser.add_argument( 234 | '--lsmdc_train_path', 235 | type=str, 236 | default='data/lsmdc_train.pkl', 237 | help='') 238 | parser.add_argument( 239 | '--ucf_test_path', 240 | type=str, 241 | default='data/UCF101_data.pkl', 242 | help='') 243 | parser.add_argument( 244 | '--hmdb_test_path', 245 | type=str, 246 | default='data/HMDB_data.pkl', 247 | help='') 248 | args = parser.parse_args() 249 | return args -------------------------------------------------------------------------------- /attention.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from torch.nn import functional as F 3 | import torch 4 | import math 5 | 6 | 7 | class MultiHeadAttention(nn.Module): 8 | def __init__(self, heads, d_model, dropout = 0.1): 9 | super().__init__() 10 | 11 | self.d_model = d_model 12 | self.d_k = d_model // heads 13 | self.h = heads 14 | 15 | self.q_linear = nn.Linear(d_model, d_model) 16 | self.v_linear = nn.Linear(d_model, d_model) 17 | self.k_linear = nn.Linear(d_model, d_model) 18 | self.dropout = nn.Dropout(dropout) 19 | self.out = nn.Linear(d_model, d_model) 20 | 21 | def forward(self, q, k, v, mask=None): 22 | 23 | bs = q.size(0) 24 | 25 | # perform linear operation and split into h heads 26 | 27 | k = self.k_linear(k).view(bs, -1, self.h, self.d_k) 28 | q = self.q_linear(q).view(bs, -1, self.h, self.d_k) 29 | v = self.v_linear(v).view(bs, -1, self.h, self.d_k) 30 | 31 | # transpose to get dimensions bs * h * sl * d_model 32 | 33 | k = k.transpose(1 ,2) 34 | q = q.transpose(1 ,2) 35 | v = v.transpose(1 ,2) 36 | # calculate attention using function we will define next 37 | scores = attention(q, k, v, self.d_k, mask, self.dropout) 38 | 39 | # concatenate heads and put through final linear layer 40 | concat = scores.transpose(1 ,2).contiguous() \ 41 | .view(bs, -1, self.d_model) 42 | 43 | output = self.out(concat) 44 | 45 | return output 46 | 47 | 48 | def attention(q, k, v, d_k, mask=None, dropout=None): 49 | scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k) 50 | 51 | if mask is not None: 52 | mask = mask.unsqueeze(1) 53 | scores = scores.masked_fill(mask == 0, -1e9) 54 | scores = F.softmax(scores, dim=-1) 55 | 56 | if dropout is not None: 57 | scores = dropout(scores) 58 | 59 | output = torch.matmul(scores, 
v) 60 | return output 61 | 62 | -------------------------------------------------------------------------------- /cvpr19_dataloader.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | from __future__ import print_function 5 | 6 | import torch as th 7 | from torch.nn.functional import adaptive_max_pool1d 8 | from torch.utils.data import Dataset 9 | import pandas as pd 10 | import os 11 | import numpy as np 12 | import re 13 | import random 14 | import torch.nn.functional as F 15 | import json 16 | import librosa 17 | import math 18 | 19 | class CVPR19_DataLoader(Dataset): 20 | """CVPR19 testset loader.""" 21 | 22 | def __init__( 23 | self, 24 | csv, 25 | features_path, 26 | annot_path, 27 | steps_path, 28 | audio_path, 29 | annot_path_time, 30 | cook_path, 31 | with_audio, 32 | we, 33 | we_dim=300, 34 | max_words=30, 35 | features_path_3D=None, 36 | feature_framerate=1.0, 37 | feature_framerate_3D=24.0 / 16.0, 38 | num_audio_frames=1024, 39 | zeus=0, 40 | ): 41 | """ 42 | Args: 43 | """ 44 | self.csv = pd.read_csv(csv) 45 | self.annot_path = annot_path 46 | self.steps_path = steps_path 47 | self.audio_path = audio_path 48 | self.annot_path_time = annot_path_time 49 | self.we = we 50 | self.we_dim = we_dim 51 | self.max_words = max_words 52 | self.feature_framerate = feature_framerate 53 | self.num_audio_frames = num_audio_frames 54 | self.zeus = zeus 55 | self.fps = {'2d': feature_framerate, '3d': feature_framerate_3D} 56 | self.feature_path = features_path 57 | #if features_path_3D: 58 | # self.feature_path['3d'] = features_path_3D 59 | self.steps = {} 60 | self.cook_path = cook_path 61 | self.cook_set = set() 62 | self.with_audio = with_audio 63 | 64 | file1 = open(cook_path) 65 | for line in file1: 66 | data = line.strip() 67 | self.cook_set.add(data) 68 | # for task in self.csv['task'].unique(): 69 | # with open (os.path.join(self.steps_path,str(task)),'r') as f: 70 | # self.steps[str(task)] = th.cat([self._words_to_we(self._tokenize_text(line.strip()))[None,:,:] for line in f],dim=0) 71 | with open(steps_path, "r") as read_file: 72 | # print("Converting JSON encoded data into Python dictionary") 73 | step_dict = json.load(read_file) 74 | for task, y in step_dict.items(): 75 | self.steps[str(task)] = th.cat([self._words_to_we(self._tokenize_text(step))[None, :, :] for step in y], 76 | dim=0) 77 | 78 | def __len__(self): 79 | return len(self.csv) 80 | 81 | def _zero_pad_tensor(self, tensor, size): 82 | if len(tensor) >= size: 83 | return tensor[:size] 84 | else: 85 | zero = np.zeros((size - len(tensor), self.we_dim), dtype=np.float32) 86 | return np.concatenate((tensor, zero), axis=0) 87 | 88 | def _tokenize_text(self, sentence): 89 | w = re.findall(r"[\w']+", str(sentence)) 90 | return w 91 | 92 | def _words_to_we(self, words): 93 | words = [word for word in words if word in self.we.vocab] 94 | if words: 95 | we = self._zero_pad_tensor(self.we[words], self.max_words) 96 | return th.from_numpy(we) 97 | else: 98 | return th.zeros(self.max_words, self.we_dim) 99 | 100 | def _zero_pad_audio(self, audio, max_frames): 101 | n_frames = audio.shape[1] 102 | if n_frames >= max_frames: 103 | return audio[:, 0:max_frames], int(max_frames) 104 | else: 105 | p = max_frames - n_frames 106 | audio_padded = np.pad(audio, ((0, 0), (0, p)), 'constant', constant_values=(0, 0)) 107 | return audio_padded, n_frames 108 | 109 | #""" 110 | def 
_get_video(self, feature_path): 111 | if self.zeus: 112 | video = th.load(feature_path).float() 113 | else: 114 | video = np.load(feature_path) 115 | return video if self.zeus else th.from_numpy(video).float() 116 | #""" 117 | 118 | def _get_video_me(self, vid_path, s, e, fps): 119 | feature_path = {} 120 | video = {} 121 | output = {} 122 | video = np.load(vid_path) 123 | video = th.from_numpy(video).float() 124 | 125 | output = th.zeros(len(s), video.shape[-1]) 126 | for i in range(len(s)): 127 | # start = int(s[i] * fps) 128 | # end = int(e[i] * fps) 129 | start = int(i * fps) 130 | end = int((i + 1) * fps) 131 | slice = video[start:end] 132 | 133 | output[i] = F.normalize(th.max(slice, dim=0)[0], dim=0) 134 | 135 | return output # th.cat([output[k] for k in output], dim=1) 136 | 137 | def _get_audio_and_text(self, k, mel_spec): 138 | # n_caption = len(caption['start']) 139 | # k = n_pair_max 140 | starts = np.zeros(k) 141 | ends = np.zeros(k) 142 | # text = th.zeros(k, self.max_words, self.we_dim) 143 | audio = [0 for i in range(k)] 144 | 145 | nframes = np.zeros(k) 146 | # r_ind = np.random.choice(range(n_caption), k, replace=True) 147 | dur = 4 148 | for i in range(k): 149 | # ind = r_ind[i] 150 | if i < dur: 151 | start = 0 152 | end = 2 * dur 153 | elif i > k - dur: 154 | start = k - 2 * dur 155 | end = k 156 | else: 157 | start = i - dur 158 | end = i + dur 159 | # print('time',start,end) 160 | audio[i], nframes[i], starts[i], ends[i] = self._get_single_audio_text(start, end, mel_spec) 161 | # print('nframes',nframes) 162 | audio = th.cat([i.unsqueeze(0) for i in audio], dim=0) 163 | return audio, nframes, starts, ends 164 | 165 | def _get_single_audio_text(self, start, end, mel_spec): 166 | 167 | # words = self._tokenize_text(caption['text'][ind]) 168 | 169 | frames = librosa.core.time_to_frames([start, end], sr=16000, hop_length=160, n_fft=400) 170 | # print('frames',frames[0], frames[1]) 171 | if frames[0] < 0: 172 | frames[0] = 0 173 | padded_mel_spec, nframes = self._zero_pad_audio(mel_spec[:, frames[0]: frames[1]], self.num_audio_frames) 174 | return th.from_numpy( 175 | padded_mel_spec), nframes, start, end # , nframes#, caption['start'][start], caption['end'][end], self._words_to_we(words) 176 | 177 | def read_assignment(self, T, K, path): 178 | Y = np.zeros([T, K], dtype=np.uint8) 179 | with open(path, 'r') as f: 180 | for line in f: 181 | step, start, end = line.strip().split(',') 182 | start = int(math.floor(float(start))) 183 | end = int(math.ceil(float(end))) 184 | step = int(step) - 1 185 | Y[start:end, step] = 1 186 | return Y 187 | 188 | def __getitem__(self, idx): 189 | video_id = self.csv['video_id'][idx] 190 | task = str(self.csv['task'][idx]) 191 | if self.zeus: 192 | vid_path_2d = os.path.join(self.feature_path['2d'], self.csv['path'][idx].split('.')[0] + '.pth') 193 | vid_path_3d = os.path.join(self.feature_path['3d'], self.csv['path'][idx].split('.')[0] + '.pth') 194 | else: 195 | # vid_path_2d = os.path.join(self.feature_path['2d'], self.csv['path'][idx]) 196 | # vid_path_3d = os.path.join(self.feature_path['3d'], self.csv['path'][idx]) 197 | vid_path_2d = os.path.join(self.feature_path, self.csv['video_id'][idx] + '_2d.npy') 198 | vid_path_3d = os.path.join(self.feature_path, self.csv['video_id'][idx] + '_3d.npy') 199 | 200 | annot = th.from_numpy(np.load(os.path.join(self.annot_path, task + '_' + video_id + '.npy'))) 201 | T = annot.size()[0] # number of frames 202 | # video[frame,2048] -> [1,2048,frame] 203 | """ 204 | video_2d = 
adaptive_max_pool1d(video_2d.transpose(1,0)[None,:,:],T).view(-1,T).transpose(1,0) 205 | 206 | s = [i for i in range(T)] 207 | e = [i+1 for i in range(T)] 208 | video_3d_r = th.zeros(T, video_3d.shape[-1]) 209 | for i in range(len(s)): 210 | start = int(s[i] * self.fps['3d']) 211 | end = int(e[i] * self.fps['3d']) + 1 212 | slice_v = video_3d[start:end] 213 | if len(slice_v) < 1: 214 | print("error") 215 | else: 216 | video_3d_r[i] = F.normalize(th.max(slice_v, dim=0)[0], dim=0) 217 | video_3d = video_3d_r#adaptive_max_pool1d(video_3d.transpose(1,0)[None,:,:],T).view(-1,T).transpose(1,0) 218 | """ 219 | # video_3d = adaptive_max_pool1d(video_3d.transpose(1,0)[None,:,:],T).view(-1,T).transpose(1,0) 220 | # 221 | 222 | # """ 223 | # audio 224 | au_path = os.path.join(self.audio_path, self.csv['video_id'][idx] + '.npz') 225 | mel_spec = np.load(au_path)['arr_0'] 226 | audio, nframes, starts, ends = self._get_audio_and_text(T, mel_spec) 227 | #video_2d = self._get_video_me(vid_path_2d, starts, ends, self.fps['2d']) 228 | #video_3d = self._get_video_me(vid_path_3d, starts, ends, self.fps['3d']) 229 | video_2d = self._get_video(vid_path_2d) 230 | video_3d = self._get_video(vid_path_3d) 231 | annot = th.from_numpy(np.load(os.path.join(self.annot_path, task + '_' + video_id + '.npy'))) 232 | T = annot.size()[0] 233 | video_2d = adaptive_max_pool1d(video_2d.transpose(1, 0)[None, :, :], T).view(-1, T).transpose(1, 0) 234 | video_3d = adaptive_max_pool1d(video_3d.transpose(1, 0)[None, :, :], T).view(-1, T).transpose(1, 0) 235 | #video = th.cat((F.normalize(video_2d, dim=1), F.normalize(video_3d, dim=1)), dim=1) 236 | 237 | video = th.cat((F.normalize(video_2d, dim=1), F.normalize(video_3d, dim=1)), dim=1) 238 | #video = th.cat(video_2d,video_3d) 239 | 240 | frames = len(video_2d) 241 | step_num = len(self.steps[task]) 242 | #annot = self.read_assignment(frames,step_num,os.path.join(self.annot_path_time, task + '_' + video_id + '.csv')) 243 | # print(video.shape) 244 | if task in self.cook_set: 245 | iscook = 1 246 | else: 247 | iscook = 0 248 | if not self.with_audio: 249 | return {'video': video, 'nframes': th.IntTensor(nframes), 'steps': self.steps[task], 'video_id': video_id, 250 | 'task': task, 'Y': annot, 'cook': iscook} 251 | else: 252 | return {'video': video, 'audio': th.FloatTensor(audio.float()), \ 253 | 'nframes': th.IntTensor(nframes), 'steps': self.steps[task], 'video_id': video_id, \ 254 | 'task': task, 'Y': annot, 'cook': iscook} 255 | -------------------------------------------------------------------------------- /eval.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | from __future__ import print_function 5 | 6 | import torch as th 7 | from torch.utils.data import DataLoader 8 | from args import get_args 9 | from model import Net 10 | from metrics import compute_metrics, print_computed_metrics 11 | from gensim.models.keyedvectors import KeyedVectors 12 | import pickle 13 | import glob 14 | from lsmdc_dataloader import LSMDC_DataLoader 15 | from msrvtt_dataloader import MSRVTT_DataLoader 16 | from youcook_dataloader import Youcook_DataLoader 17 | from video_evaluation import evaluate_recall_youcook 18 | 19 | 20 | args = get_args() 21 | if args.verbose: 22 | print(args) 23 | 24 | assert args.pretrain_path != '', 'Need to specify pretrain_path argument' 25 | 26 | print('Loading word vectors: {}'.format(args.word2vec_path)) 27 | 
we = KeyedVectors.load_word2vec_format(args.word2vec_path, binary=True) 28 | print('done') 29 | 30 | 31 | if args.eval_youcook: 32 | dataset_val = Youcook_DataLoader( 33 | data=args.youcook_val_path, 34 | we=we, 35 | max_words=args.max_words, 36 | we_dim=args.we_dim, 37 | tri_modal=True 38 | ) 39 | dataloader_val = DataLoader( 40 | dataset_val, 41 | batch_size=args.batch_size_val, 42 | num_workers=args.num_thread_reader, 43 | shuffle=False, 44 | ) 45 | if args.eval_lsmdc: 46 | dataset_lsmdc = LSMDC_DataLoader( 47 | csv_path=args.lsmdc_test_csv_path, 48 | features_path=args.lsmdc_test_features_path, 49 | we=we, 50 | max_words=args.max_words, 51 | we_dim=args.we_dim, 52 | ) 53 | dataloader_lsmdc = DataLoader( 54 | dataset_lsmdc, 55 | batch_size=args.batch_size_val, 56 | num_workers=args.num_thread_reader, 57 | shuffle=False, 58 | ) 59 | if args.eval_msrvtt: 60 | msrvtt_testset = MSRVTT_DataLoader( 61 | csv_path=args.msrvtt_test_csv_path, 62 | features_path=args.msrvtt_test_features_path, 63 | we=we, 64 | max_words=args.max_words, 65 | we_dim=args.we_dim, 66 | ) 67 | dataloader_msrvtt = DataLoader( 68 | msrvtt_testset, 69 | batch_size=3000, 70 | num_workers=args.num_thread_reader, 71 | shuffle=False, 72 | drop_last=False, 73 | ) 74 | net = Net( 75 | video_dim=args.feature_dim, 76 | embd_dim=args.embd_dim, 77 | we_dim=args.we_dim, 78 | max_words=args.max_words, 79 | ) 80 | net.eval() 81 | net.cuda() 82 | 83 | pytorch_total_params = sum(p.numel() for p in net.parameters() if p.requires_grad) 84 | print(pytorch_total_params) 85 | exit() 86 | 87 | if args.verbose: 88 | print('Starting evaluation loop ...') 89 | 90 | 91 | def Eval_retrieval(model, eval_dataloader, dataset_name): 92 | model.eval() 93 | print('Evaluating Text-Video retrieval on {} data'.format(dataset_name)) 94 | with th.no_grad(): 95 | for i_batch, data in enumerate(eval_dataloader): 96 | text = data['text'].cuda() 97 | video = data['video'].cuda() 98 | vid = data['video_id'] 99 | m = model(video, text) 100 | m = m.cpu().detach().numpy() 101 | metrics = compute_metrics(m) 102 | print_computed_metrics(metrics) 103 | if args.eval_youcook: 104 | evaluate_recall_youcook(None, None, data['video_id'], m) 105 | 106 | 107 | all_checkpoints = glob.glob(args.pretrain_path) 108 | 109 | for c in all_checkpoints: 110 | print('Eval checkpoint: {}'.format(c)) 111 | print('Loading checkpoint: {}'.format(c)) 112 | net.load_checkpoint(c) 113 | if args.eval_youcook: 114 | Eval_retrieval(net, dataloader_val, 'YouCook2') 115 | if args.eval_msrvtt: 116 | Eval_retrieval(net, dataloader_msrvtt, 'MSR-VTT') 117 | if args.eval_lsmdc: 118 | Eval_retrieval(net, dataloader_lsmdc, 'LSMDC') 119 | -------------------------------------------------------------------------------- /eval_avlnet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --qos=sched_level_2 3 | #SBATCH --gres=gpu:4 4 | #SBATCH --gpus-per-node=4 5 | #SBATCH --nodes=1 6 | #SBATCH --time=2:00:00 7 | #SBATCH --cpus-per-task 74 8 | #SBATCH --ntasks-per-node=1 9 | #SBATCH --mem=1T 10 | #SBATCH --exclusive 11 | #SBATCH --job-name="ht" 12 | #SBATCH --output logs/ht-%j.out 13 | #SBATCH --error logs/ht-%j.err 14 | ## NOTE: adjust the dependency if needed for the 2nd and 3rd run 15 | ##SBATCH --dependency=afterok:12625 16 | 17 | ## Number of total processes 18 | echo " " 19 | echo " Nodelist:= " $SLURM_JOB_NODELIST 20 | echo " Number of nodes:= " $SLURM_JOB_NUM_NODES 21 | echo " GPUs per node:= " $SLURM_JOB_GPUS 22 | echo " Ntasks per node:= " 
$SLURM_NTASKS_PER_NODE 23 | 24 | echo " Running on multiple nodes/GPU devices" 25 | echo "" 26 | echo " Run started at:- " 27 | date 28 | 29 | source /nobackup/users/duartek/anaconda3/bin/activate 30 | conda activate wmlce-1.6.2 31 | 32 | 33 | nvidia-smi 34 | pwd 35 | 36 | ##################### 37 | 38 | #python gen_loader.py 39 | 40 | #python eval.py --eval_youcook=1 --num_thread_reader=74 --embd_dim=6144 --pretrain_path=/nobackup/users/brian27/howto100m/model/howto100m_pt_model.pth 41 | 42 | python eval.py --eval_youcook=1 --num_thread_reader=74 --embd_dim=4096 --pretrain_path=/nobackup/users/brian27/howto100m/model_me/mil_nce_two/e18.pth 43 | 44 | 45 | #python train_tri_kmeans.py --eval_youcook=1 --num_thread_reader=74 --epochs=0 --batch_size=512 \ 46 | #--n_pair=32 --embd_dim=6144 --howto_audio_frames=1000 --min_time=10.0 --random_audio_windows=0 \ 47 | #--lr=0.0001 --tri_modal=1 --kmeans=1 --use_queue=1 --queue_size=20 --fastC=1 --mean=1 --recon=0 --recon_size=1024 \ 48 | #--features_path=/nobackup/users/kaudhkha/sightsound/data/howto/parsed_videos \ 49 | #--features_path_audio=/nobackup/projects/public/howto100m/parsed_videos \ 50 | #--pretrain_path=model_mcn/MCN_KMeans/e16.pth 51 | 52 | #python train_tri_kmeans.py --eval_msrvtt=1 --num_thread_reader=74 --epochs=0 --batch_size=512 \ 53 | #--n_pair=32 --embd_dim=6144 --howto_audio_frames=1000 --min_time=10.0 --random_audio_windows=0 \ 54 | #--lr=0.0001 --tri_modal=1 --kmeans=1 --use_queue=1 --queue_size=20 --fastC=1 --mean=1 --recon=0 --recon_size=1024 \ 55 | #--features_path=/nobackup/users/kaudhkha/sightsound/data/howto/parsed_videos \ 56 | #--features_path_audio=/nobackup/projects/public/howto100m/parsed_videos \ 57 | #--pretrain_path=model_mcn/MCN_KMeans/e16.pth 58 | 59 | 60 | echo "Weights 16" 61 | 62 | #python train_tri_kmeans.py --eval_ucf=1 --num_thread_reader=74 --epochs=0 --batch_size=512 \ 63 | #--n_pair=32 --embd_dim=6144 --howto_audio_frames=1000 --min_time=10.0 --random_audio_windows=0 \ 64 | #--lr=0.0001 --tri_modal=1 --kmeans=1 --use_queue=1 --queue_size=20 --fastC=1 --mean=1 --recon=0 --recon_size=1024 \ 65 | #--features_path=/nobackup/users/kaudhkha/sightsound/data/howto/parsed_videos \ 66 | #--features_path_audio=/nobackup/projects/public/howto100m/parsed_videos \ 67 | #--pretrain_path=model_mcn/MCN_KMeans/e16.pth 68 | 69 | python train_tri_kmeans.py --eval_hmdb=1 --num_thread_reader=74 --epochs=0 --batch_size=512 \ 70 | --n_pair=32 --embd_dim=6144 --howto_audio_frames=1000 --min_time=10.0 --random_audio_windows=0 \ 71 | --lr=0.0001 --tri_modal=1 --kmeans=1 --use_queue=1 --queue_size=20 --fastC=1 --mean=1 --recon=0 --recon_size=1024 \ 72 | --features_path=/nobackup/users/kaudhkha/sightsound/data/howto/parsed_videos \ 73 | --features_path_audio=/nobackup/projects/public/howto100m/parsed_videos \ 74 | --pretrain_path=model_mcn/MCN_KMeans/e16.pth 75 | 76 | 77 | 78 | #echo "Weights 21" 79 | 80 | #python train_tri_kmeans.py --eval_ucf=1 --num_thread_reader=74 --epochs=0 --batch_size=512 \ 81 | #--n_pair=32 --embd_dim=6144 --howto_audio_frames=1000 --min_time=10.0 --random_audio_windows=0 \ 82 | #--lr=0.0001 --tri_modal=1 --kmeans=1 --use_queue=1 --queue_size=20 --fastC=1 --mean=1 --recon=0 --recon_size=1024 \ 83 | #--features_path=/nobackup/users/kaudhkha/sightsound/data/howto/parsed_videos \ 84 | #--features_path_audio=/nobackup/projects/public/howto100m/parsed_videos \ 85 | #--pretrain_path=model_mcn/MCN_KMeans/e21.pth 86 | 87 | #python train_tri_kmeans.py --eval_hmdb=1 --num_thread_reader=74 --epochs=0 
--batch_size=512 \ 88 | #--n_pair=32 --embd_dim=6144 --howto_audio_frames=1000 --min_time=10.0 --random_audio_windows=0 \ 89 | #--lr=0.0001 --tri_modal=1 --kmeans=1 --use_queue=1 --queue_size=20 --fastC=1 --mean=1 --recon=0 --recon_size=1024 \ 90 | #--features_path=/nobackup/users/kaudhkha/sightsound/data/howto/parsed_videos \ 91 | #--features_path_audio=/nobackup/projects/public/howto100m/parsed_videos \ 92 | #--pretrain_path=model_mcn/MCN_KMeans/e21.pth 93 | 94 | #echo "Weights 24" 95 | 96 | #python train_tri_kmeans.py --eval_ucf=1 --num_thread_reader=74 --epochs=0 --batch_size=512 \ 97 | #--n_pair=32 --embd_dim=6144 --howto_audio_frames=1000 --min_time=10.0 --random_audio_windows=0 \ 98 | #--lr=0.0001 --tri_modal=1 --kmeans=1 --use_queue=1 --queue_size=20 --fastC=1 --mean=1 --recon=0 --recon_size=1024 \ 99 | #--features_path=/nobackup/users/kaudhkha/sightsound/data/howto/parsed_videos \ 100 | #--features_path_audio=/nobackup/projects/public/howto100m/parsed_videos \ 101 | #--pretrain_path=model_mcn/MCN_KMeans/e24.pth 102 | 103 | #python train_tri_kmeans.py --eval_hmdb=1 --num_thread_reader=74 --epochs=0 --batch_size=512 \ 104 | #--n_pair=32 --embd_dim=6144 --howto_audio_frames=1000 --min_time=10.0 --random_audio_windows=0 \ 105 | #--lr=0.0001 --tri_modal=1 --kmeans=1 --use_queue=1 --queue_size=20 --fastC=1 --mean=1 --recon=0 --recon_size=1024 \ 106 | #--features_path=/nobackup/users/kaudhkha/sightsound/data/howto/parsed_videos \ 107 | #--features_path_audio=/nobackup/projects/public/howto100m/parsed_videos \ 108 | #--pretrain_path=model_mcn/MCN_KMeans/e24.pth 109 | 110 | 111 | echo "Weights 26" 112 | 113 | #python train_tri_kmeans.py --eval_ucf=1 --num_thread_reader=74 --epochs=0 --batch_size=512 \ 114 | #--n_pair=32 --embd_dim=6144 --howto_audio_frames=1000 --min_time=10.0 --random_audio_windows=0 \ 115 | #--lr=0.0001 --tri_modal=1 --kmeans=1 --use_queue=1 --queue_size=20 --fastC=1 --mean=1 --recon=0 --recon_size=1024 \ 116 | #--features_path=/nobackup/users/kaudhkha/sightsound/data/howto/parsed_videos \ 117 | #--features_path_audio=/nobackup/projects/public/howto100m/parsed_videos \ 118 | #--pretrain_path=model_mcn/MCN_KMeans/e26.pth 119 | 120 | python train_tri_kmeans.py --eval_hmdb=1 --num_thread_reader=74 --epochs=0 --batch_size=512 \ 121 | --n_pair=32 --embd_dim=6144 --howto_audio_frames=1000 --min_time=10.0 --random_audio_windows=0 \ 122 | --lr=0.0001 --tri_modal=1 --kmeans=1 --use_queue=1 --queue_size=20 --fastC=1 --mean=1 --recon=0 --recon_size=1024 \ 123 | --features_path=/nobackup/users/kaudhkha/sightsound/data/howto/parsed_videos \ 124 | --features_path_audio=/nobackup/projects/public/howto100m/parsed_videos \ 125 | --pretrain_path=model_mcn/MCN_KMeans/e26.pth 126 | 127 | 128 | 129 | 130 | 131 | #python train_tri_kmeans.py --eval_cross=1 --num_thread_reader=74 --epochs=0 --batch_size=512 \ 132 | #--n_pair=32 --embd_dim=6144 --howto_audio_frames=1000 --min_time=10.0 --random_audio_windows=0 \ 133 | #--lr=0.0001 --tri_modal=1 --kmeans=1 --use_queue=1 --queue_size=20 --fastC=1 --mean=1 --recon=0 --recon_size=1024 \ 134 | #--features_path=/nobackup/users/kaudhkha/sightsound/data/howto/parsed_videos \ 135 | #--features_path_audio=/nobackup/projects/public/howto100m/parsed_videos \ 136 | #--pretrain_path=model_mcn/MCN_KMeans/e16.pth 137 | 138 | #python train_tri_cos_mil.py --eval_cross=1 --num_thread_reader=74 --batch_size=512 --epochs=0 --project=1 --project_dim=8000 \ 139 | #--lr_decay=1.0 --embd_dim=6144 --pretrain_path=model_mcn/MCN_Joint_Recon_Hard/e15.pth \ 140 | #--lr=1e-5 
--tri_modal=1 --finetune_video=0 --video_interp=0 --output_norm=1 --joint_cluster=1 --multi_head=0 \ 141 | #--features_path=/nobackup/users/kaudhkha/sightsound/data/howto/parsed_videos \ 142 | #--features_path_audio=/nobackup/projects/public/howto100m/parsed_videos 143 | 144 | #python train_tri_cos_mil.py --eval_youcook=1 --num_thread_reader=74 --batch_size=512 --epochs=0 --project=1 --project_dim=8000 \ 145 | #--lr_decay=1.0 --embd_dim=6144 --pretrain_path=model_mcn/MCN_Joint_Recon_Cross_Hard/e9.pth \ 146 | #--lr=1e-5 --tri_modal=1 --finetune_video=0 --video_interp=0 --output_norm=1 --joint_cluster=1 --multi_head=0 \ 147 | #--features_path=/nobackup/users/kaudhkha/sightsound/data/howto/parsed_videos \ 148 | #--features_path_audio=/nobackup/projects/public/howto100m/parsed_videos 149 | 150 | #python train_tri_cos_mil.py --eval_msrvtt=1 --num_thread_reader=74 --batch_size=512 --epochs=0 --project=1 --project_dim=8000 \ 151 | #--lr_decay=1.0 --embd_dim=6144 --pretrain_path=model_mcn/MCN_Joint_Recon_Cross_Hard/e9.pth \ 152 | #--lr=1e-5 --tri_modal=1 --finetune_video=0 --video_interp=0 --output_norm=1 --joint_cluster=1 --multi_head=0 \ 153 | #--features_path=/nobackup/users/kaudhkha/sightsound/data/howto/parsed_videos \ 154 | #--features_path_audio=/nobackup/projects/public/howto100m/parsed_videos 155 | 156 | #python train_tri_cos_mil.py --eval_ucf=1 --num_thread_reader=74 --batch_size=512 --epochs=0 --project=1 --project_dim=8000 \ 157 | #--lr_decay=1.0 --embd_dim=6144 --pretrain_path=model_mcn/MCN_Sports/e20.pth \ 158 | #--lr=1e-5 --tri_modal=1 --finetune_video=0 --video_interp=0 --output_norm=1 --joint_cluster=1 --multi_head=0 \ 159 | #--features_path=/nobackup/users/kaudhkha/sightsound/data/howto/parsed_videos \ 160 | #--features_path_audio=/nobackup/projects/public/howto100m/parsed_videos 161 | 162 | #python train_tri_cos_mil.py --eval_hmdb=1 --num_thread_reader=74 --batch_size=512 --epochs=0 --project=1 --project_dim=8000 \ 163 | #--lr_decay=1.0 --embd_dim=6144 --pretrain_path=model_mcn/MCN_Sports/e20.pth \ 164 | #--lr=1e-5 --tri_modal=1 --finetune_video=0 --video_interp=0 --output_norm=1 --joint_cluster=1 --multi_head=0 \ 165 | #--features_path=/nobackup/users/kaudhkha/sightsound/data/howto/parsed_videos \ 166 | #--features_path_audio=/nobackup/projects/public/howto100m/parsed_videos 167 | 168 | #python train_tri_cos_mil.py --eval_msrvtt=1 --num_thread_reader=74 --batch_size=512 --epochs=0 --project=1 --project_dim=8000 \ 169 | #python train_tri_cos_mil.py --eval_youcook=1 --num_thread_reader=74 --batch_size=512 --epochs=0 --project=1 --project_dim=8000 \ 170 | #python train_tri_cos_mil.py --eval_ucf=1 --num_thread_reader=74 --batch_size=512 --epochs=0 --project=1 --project_dim=8000 \ 171 | #python train_tri_cos_mil.py --eval_hmdb=1 --num_thread_reader=74 --batch_size=512 --epochs=0 --project=1 --project_dim=8000 \ 172 | #--lr_decay=1.0 --embd_dim=6144 --pretrain_path=model_mcn/MCN_Joint_Recon/e11.pth \ 173 | #--lr=1e-5 --tri_modal=1 --finetune_video=0 --video_interp=0 --output_norm=1 --joint_cluster=1 --multi_head=0 \ 174 | #--features_path=/nobackup/users/kaudhkha/sightsound/data/howto/parsed_videos \ 175 | #--features_path_audio=/nobackup/projects/public/howto100m/parsed_videos 176 | 177 | 178 | #python local_eval.py 179 | 180 | # model_mcn/MCN1/e9.pth 181 | 182 | #python train_tri_c.py --eval_youcook=1 --num_thread_reader=74 --batch_size=512 --epochs=0 --project=1 --project_dim=8000 \ 183 | #--lr_decay=1.0 --embd_dim=6144 --pretrain_path=model_mcn/MCN_Recon2/e10.pth \ 184 | 
#--lr=1e-5 --tri_modal=1 185 | 186 | 187 | 188 | #python train_tri_c.py --eval_msrvtt=1 --num_thread_reader=74 --batch_size=512 --epochs=0 --project=1 --project_dim=8000 \ 189 | #--lr_decay=1.0 --embd_dim=6144 --pretrain_path=model_mcn/MCN_Recon2/e14.pth \ 190 | #--lr=1e-5 --tri_modal=1 \ 191 | #--features_path=/nobackup/users/kaudhkha/sightsound/data/howto/parsed_videos \ 192 | #--features_path_audio=/nobackup/projects/public/howto100m/parsed_videos 193 | 194 | # model_mcn/MCN1/e9.pth 195 | 196 | #python train_tri_c.py --eval_youcook=1 --num_thread_reader=74 --batch_size=512 --epochs=0 --project=1 --project_dim=8000 \ 197 | #--lr_decay=1.0 --embd_dim=6144 --pretrain_path=model_mcn/MCN_Recon2/e14.pth \ 198 | #--lr=1e-5 --tri_modal=1 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | ## Wait for all commands to finish 210 | wait 211 | echo "Run completed at:- " 212 | date 213 | -------------------------------------------------------------------------------- /eval_cross.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | from __future__ import print_function 5 | 6 | import torch as th 7 | from torch.utils.data import DataLoader 8 | from args import get_args 9 | import numpy as np 10 | from dp.dp import dp 11 | from tqdm import tqdm 12 | from sklearn.metrics import average_precision_score 13 | from tqdm import tqdm as std_tqdm 14 | from functools import partial 15 | tqdm = partial(std_tqdm, dynamic_ncols=True) 16 | import torch.nn as nn 17 | from metrics import compute_metrics, print_computed_metrics 18 | from gensim.models.keyedvectors import KeyedVectors 19 | import pickle 20 | import glob 21 | from lsmdc_dataloader import LSMDC_DataLoader 22 | from msrvtt_dataloader import MSRVTT_DataLoader 23 | from youcook_dataloader import Youcook_DataLoader 24 | from cvpr19_dataloader import CVPR19_DataLoader 25 | from mining_dataloader import Mining_DataLoader 26 | import pprint 27 | 28 | #th.backends.cudnn.enabled = False 29 | 30 | pp = pprint.PrettyPrinter(indent=4) 31 | 32 | args = get_args() 33 | if args.verbose: 34 | print(args) 35 | 36 | assert args.pretrain_path != '', 'Need to specify pretrain_path argument' 37 | 38 | 39 | 40 | print('Loading word vectors: {}'.format(args.word2vec_path)) 41 | we = KeyedVectors.load_word2vec_format(args.word2vec_path, binary=True) 42 | print('done') 43 | 44 | if args.save_feature==1: 45 | step_path = 'step_all.json' 46 | else: 47 | step_path = 'step.json' 48 | 49 | if args.cross: 50 | cross_testset = CVPR19_DataLoader( 51 | #csv='vids_27.csv', 52 | csv='/nobackup/users/brian27/CrossTask/howto100m_crosstask_eval/cvpr19_test.csv', 53 | features_path='vids_feature', 54 | #features_path = '/nobackup/users/brian27/CrossTask/howto100m_crosstask_eval/features_2d', 55 | #features_path_3D = '/nobackup/users/brian27/CrossTask/howto100m_crosstask_eval/features_3d', 56 | annot_path = 'anno', #'/nobackup/users/brian27/CrossTask/crosstask_release/Y-1', 57 | steps_path = step_path, 58 | audio_path = 'audio_feature',#'/home/brian27/nobackup/CrossTask/audio_feature_new',# 59 | annot_path_time='/nobackup/users/brian27/CrossTask/crosstask_release/annotations', 60 | cook_path = '/home/brian27/nobackup/CrossTask/crosstask_release/cook.txt', 61 | with_audio = args.with_audio, 62 | we=we 63 | #features_path_3D='howto100m_crosstask_eval/features_3d' 64 | ) 65 | if args.mining: 66 | cross_testset = 
Mining_DataLoader( 67 | csv='/nobackup/users/brian27/Weak_YouTube_dataset/mining.csv', 68 | features_path='/nobackup/users/brian27/Weak_YouTube_dataset/test_new_f', 69 | annot_path='/nobackup/users/brian27/Weak_YouTube_dataset/anno', 70 | steps_path='/nobackup/users/brian27/Weak_YouTube_dataset/'+step_path, 71 | audio_path='/nobackup/users/brian27/Weak_YouTube_dataset/test_new_a_f', # 'audio_feature', 72 | we=we 73 | # features_path_3D='howto100m_crosstask_eval/features_3d' 74 | ) 75 | #print(cross_testset) 76 | dataloader_cross = DataLoader( 77 | cross_testset, 78 | batch_size=1, 79 | num_workers=args.num_thread_reader, 80 | shuffle=False, 81 | drop_last=False, 82 | ) 83 | 84 | 85 | #def cvpr19_score(X, steps, model): 86 | def cvpr19_score_a(X, audio, nframes, steps, model): 87 | #sim_matrix = model.forward(X.cuda(),steps.cuda()).transpose(1,0) #[frame,class] 88 | #print('video',X.shape) 89 | #print('audio',audio.shape) 90 | #print('text',steps.shape) 91 | if args.v_only==1: 92 | sim_matrix = model.forward(X, audio, nframes, args.v_only, steps) 93 | return sim_matrix.transpose(1, 0).detach().cpu().numpy() 94 | 95 | #sim_matrix,s2,s3 = model.forward(X, audio, nframes, steps).transpose(1, 0) 96 | sim_matrix,s2,s3 = model.forward(X, audio, nframes, args.v_only, steps) # [frame,class] 97 | #v,a,t = model.forward(X, audio, nframes, steps)#.transpose(1, 0) # [frame,class] 98 | #print('sim_matrix',sim_matrix.shape) 99 | return sim_matrix.transpose(1, 0).detach().cpu().numpy(),s2.transpose(1, 0).detach().cpu().numpy(),s3.transpose(1, 0).detach().cpu().numpy() 100 | #return v,a,t 101 | 102 | def cvpr19_score(X, steps, model): 103 | sim_matrix = model.forward(X.cuda(),steps.cuda()).transpose(1,0) #[frame,class] 104 | #print('video',X.shape) 105 | #print('audio',audio.shape) 106 | #print('text',steps.shape) 107 | #sim_matrix = model.forward(X, audio, nframes, steps).transpose(1, 0) # [frame,class] 108 | #print('sim_matrix',sim_matrix.shape) 109 | return sim_matrix.detach().cpu().numpy() 110 | 111 | def cvpr19_predict(scores): 112 | C = -scores#.cpu().detach().numpy() 113 | y = np.empty(scores.shape, dtype=np.float32) 114 | dp(y, C, exactly_one=True) #[frame,class] 115 | return y 116 | 117 | def arg_max_predict(scores): 118 | y_final = np.zeros((scores.shape[0], scores.shape[1])) 119 | arg_y = np.argmax(scores, axis=1) 120 | for i in range(scores.shape[0]): 121 | y_final[i][arg_y[i]] = 1 122 | return y_final 123 | 124 | def get_recall(y_true, y): 125 | #return ((y*y_true).sum(axis=1)>0).sum() / (y_true.sum(axis=1)>0).sum() 126 | if args.recall_frame==0: 127 | return ((y*y_true).sum(axis=0)>0).sum() / (y_true.sum(axis=0)>0).sum() 128 | else: 129 | return ((y * y_true).sum(axis=0) > 0).sum() / (y_true.sum(axis=0) > 0).sum() 130 | 131 | def align_eval(model, dataloader, gpu_mode=1): 132 | print('start cross') 133 | recalls = {} 134 | counts = {} 135 | recalls_m = 0 136 | counts_m = 0 137 | task_scores = {} 138 | task_gt = {} 139 | for sample in tqdm(dataloader): 140 | with th.no_grad(): 141 | 142 | #print(sample) 143 | #for sample in batch: 144 | 145 | 146 | 147 | video = sample['video'].cuda() if gpu_mode else sample['video'] 148 | text = sample['steps'].cuda() if gpu_mode else sample['steps'] 149 | 150 | video = video.view(-1, video.shape[-1]) 151 | text = th.squeeze(text)# class x emb 152 | #n_frame = th.tensor([]) 153 | n_frame = sample['nframes'].cuda()#th.ones(video.shape[0],1)*1#.cuda() 154 | n_frame = n_frame.view(-1) 155 | #print(n_frame.shape) 156 | 157 | #print('n_frame',n_frame.shape) 158 | 
if args.tri==1: 159 | audio = sample['audio'].cuda() if gpu_mode else sample['video'] 160 | audio = audio.view(-1, audio.shape[-2], audio.shape[-1]) 161 | #print(audio.shape) 162 | scores_list = [] 163 | split = 15 164 | batch_size = 25 165 | #print(video.shape[0]) 166 | b_s = int(video.shape[0] / batch_size) 167 | # for i in range(video.shape[0]): 168 | # video_1 = th.unsqueeze(video[:half],0) 169 | # audio_1 = th.unsqueeze(audio[:half],0) 170 | if video.shape[0] < batch_size: 171 | if args.v_only==0: 172 | scores,s2,s3 = cvpr19_score_a(video, audio, n_frame, text, model) 173 | else: 174 | scores = cvpr19_score_a(video, audio, n_frame, text, model) 175 | else: 176 | for i in range(b_s): 177 | if i == b_s - 1: 178 | video_1 = video[batch_size * i:] 179 | audio_1 = audio[batch_size * i:] 180 | n_frame_1 = n_frame[batch_size * i:] 181 | else: 182 | video_1 = video[batch_size * i:batch_size * (i + 1)] 183 | audio_1 = audio[batch_size * i:batch_size * (i + 1)] 184 | n_frame_1 = n_frame[batch_size * i:batch_size * (i + 1)] 185 | # text_1 = th.unsqueeze(text[i]) 186 | if args.v_only==0: 187 | scores,s2,s3 = cvpr19_score_a(video_1, audio_1, n_frame_1, text, model) 188 | else: 189 | scores = cvpr19_score_a(video_1, audio_1, n_frame_1, text, model) 190 | scores_list.append(scores) 191 | scores = np.vstack(scores_list) 192 | if args.save_feature==1: 193 | 194 | scores = th.from_numpy(scores) 195 | m = nn.LogSoftmax(dim=1) 196 | scores = m(scores).detach().cpu().numpy() 197 | #print(scores) 198 | method = args.method_name 199 | if args.mining == 1: 200 | path = 'mining_score_'+method+'/' 201 | else: 202 | path = 'cross_score_' + method + '/' 203 | from pathlib import Path 204 | Path(path).mkdir(parents=True, exist_ok=True) 205 | 206 | file1 = open(path + sample['video_id'][0] + '.probs', 'w') 207 | for i in range(scores.shape[0]): 208 | for j in range(scores.shape[1]): 209 | # for k in range(30): 210 | file1.write(str(scores[i][j]) + ' ') 211 | file1.write('\n') 212 | file1.close() 213 | 214 | else: 215 | scores = cvpr19_score(video, text, model) #[time,class] 216 | if args.save_feature == 1: 217 | scores = th.from_numpy(scores) 218 | m = nn.LogSoftmax(dim=1) 219 | scores = m(scores).detach().cpu().numpy() 220 | from pathlib import Path 221 | path = 'mining_score_ver/' 222 | Path(path).mkdir(parents=True, exist_ok=True) 223 | file1 = open(path + sample['video_id'][0] + '.probs', 'w') 224 | for i in range(scores.shape[0]): 225 | for j in range(scores.shape[1]): 226 | # for k in range(30): 227 | file1.write(str(scores[i][j]) + ' ') 228 | file1.write('\n') 229 | file1.close() 230 | #""" 231 | if args.save_feature == 0: 232 | if args.recall_frame==0: 233 | #scores = np.log(scores) 234 | #""" 235 | if args.mining==1: 236 | m = nn.LogSoftmax(dim=1) 237 | #m = nn.LogSigmoid() 238 | scores = th.from_numpy(scores) 239 | scores = m(scores).detach().cpu().numpy() 240 | #""" 241 | y = cvpr19_predict(scores) #[time,class] 242 | else: 243 | y = arg_max_predict(scores) 244 | y_true = th.squeeze(sample['Y']).numpy() 245 | 246 | if args.cross==1: 247 | task = sample['task'] 248 | #y_true = y_true.view(-1, y_true.shape[-1]) 249 | 250 | task = task[0]#.view(-1, task.shape[-1]) 251 | 252 | if task not in recalls: 253 | recalls[task] = 0. 
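# (Editorial comment, not in the original source) get_recall above counts a
# ground-truth step as recovered when at least one of its annotated frames is
# assigned to that step, so the per-video value accumulated into recalls[task]
# below is |steps with >= 1 correctly predicted frame| / |annotated steps|;
# after the loop it is divided by counts[task] to give a per-task average.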
254 | recalls[task] += get_recall(y_true, y) 255 | if task not in counts: 256 | counts[task] = 0 257 | counts[task] += 1 258 | 259 | # mAP ---------------------------------------- 260 | if task not in task_scores: 261 | task_scores[task] = [] 262 | task_gt[task] = [] 263 | task_scores[task].append(scores) 264 | task_gt[task].append(y_true) 265 | else: 266 | recalls_m += get_recall(y_true, y) 267 | counts_m += 1 268 | 269 | #if task == '77721': 270 | # print('recall:', recalls['77721']) 271 | # print('counts:', counts['77721']) 272 | # print(sample['video_id']) 273 | #print(recalls) 274 | #""" 275 | # -------------------------------------------- 276 | #""" 277 | if args.save_feature == 0: 278 | if args.cross==1: 279 | recalls = {task: recall / counts[task] for task,recall in recalls.items()} 280 | # mAP ---------------------------------------- 281 | task_scores = {task: np.concatenate(scores) for task,scores in task_scores.items()} 282 | task_gt = {task: np.concatenate(y) for task,y in task_gt.items()} 283 | mAPs = {task: average_precision_score(task_gt[task],scores) for task,scores in task_scores.items()} 284 | # -------------------------------------------- 285 | #""" 286 | return recalls, mAPs 287 | else: 288 | print(recalls_m/counts_m) 289 | return recalls_m, None 290 | 291 | 292 | 293 | if args.tri == 0: 294 | from model import Net 295 | else: 296 | from model_avl import Net 297 | 298 | net = Net( 299 | embd_dim=args.embd_dim, #2048 300 | video_dim=args.feature_dim, #4096 301 | we_dim=args.we_dim, 302 | ratio=args.ratio, 303 | ) 304 | 305 | net.eval() 306 | net.cuda() 307 | 308 | if args.verbose: 309 | print('Starting evaluation loop ...') 310 | 311 | 312 | all_checkpoints = glob.glob(args.pretrain_path) 313 | 314 | for c in all_checkpoints: 315 | print('Eval checkpoint: {}'.format(c)) 316 | print('Loading checkpoint: {}'.format(c)) 317 | net.load_checkpoint(c) 318 | 319 | if args.save_feature == 1: 320 | align_eval(net, dataloader_cross) 321 | elif args.save_feature == 0: 322 | recall, mAPs = align_eval(net, dataloader_cross) 323 | 324 | pp.pprint(recall) 325 | if args.cross==1: 326 | 327 | pp.pprint(mAPs) 328 | sum = 0 329 | count = 0 330 | sum_c = 0 331 | count_c = 0 332 | sum_nc = 0 333 | count_nc = 0 334 | 335 | cook_set=set() 336 | file1 = open('/home/brian27/nobackup/CrossTask/crosstask_release/cook.txt') 337 | for line in file1: 338 | data = line.strip() 339 | cook_set.add(data) 340 | 341 | for x,y in recall.items(): 342 | sum+=y 343 | count+=1 344 | if x in cook_set: 345 | sum_c += y 346 | count_c += 1 347 | else: 348 | sum_nc += y 349 | count_nc += 1 350 | 351 | print('recall',sum/float(count)) 352 | print('recall cook', sum_c / float(count_c)) 353 | print('recall not cook', sum_nc / float(count_nc)) 354 | sum = 0 355 | count = 0 356 | for x,y in mAPs.items(): 357 | sum+=y 358 | count+=1 359 | print('mAPs',sum/float(count)) 360 | #""" 361 | -------------------------------------------------------------------------------- /gen_loader.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import numpy as np 4 | 5 | 6 | #a = np.load('./temp_data/v_ApplyEyeMakeup_g01_c01.npz') 7 | #print(a['arr_0'].shape) 8 | #exit() 9 | 10 | 11 | def generate_ucf101_pickle(): 12 | data_dir = '/nobackup/users/brian27/data/UCF-101_feature/' 13 | audio_dir = '/nobackup/users/brian27/data/UCF-101_audio/' 14 | #data_dir = '../Data/' 15 | 16 | feature_list = os.listdir(data_dir) 17 | #print(feature_list) 18 | videos = sorted(set([v[:-6] 
for v in feature_list])) 19 | print('# Videos', len(videos)) 20 | 21 | train_list = open('./data/ucf_trainlist01.txt').readlines() 22 | 23 | #print(videos) 24 | #v_Basketball_g07_c02_2d.npy 25 | data = [] 26 | for video_name in videos: 27 | training = 0 28 | for tr_vid in train_list: 29 | if video_name[:-1] in tr_vid: 30 | training = 1 31 | try: 32 | feats_3d = np.load(data_dir + video_name + '3d.npy') 33 | #print(feats_3d.shape) 34 | feats_2d = np.load(data_dir.replace('brian27', 'duartek') + video_name + '2d.npy') 35 | #print(feats_2d.shape) 36 | except: 37 | continue 38 | try: 39 | audio = np.load(audio_dir + video_name[:-1] + '.npz') 40 | print(audio.files, audio_dir + video_name + '.npz', audio['arr_0'].shape) 41 | audio = audio['arr_0'] 42 | has_audio = 1 43 | except: 44 | audio = np.zeros((40, 1), dtype=np.float32) 45 | has_audio = 0 46 | 47 | data.append({'2d': feats_2d, 48 | '3d': feats_3d, 49 | '2d_pooled': np.mean(feats_2d, 0), 50 | '3d_pooled': np.mean(feats_3d, 0), 51 | 'class': video_name.split('_')[1], 52 | 'video': video_name, 53 | 'audio': audio, 54 | 'has_audio': has_audio, 55 | 'training': training 56 | }) 57 | pickle.dump(data, open('./data/UCF101_data.pkl', 'wb')) 58 | print('# Videos with features extracted:', len(data)) 59 | #a = os.listdir('/nobackup/users/brian27/data/hmdb51_feature/') 60 | 61 | 62 | def generate_hmdb_pickle(): 63 | data_dir = '/nobackup/users/brian27/data/hmdb51_feature/' 64 | folders_dir = '/nobackup/users/brian27/data/hmdb51_org/' 65 | 66 | classes = os.listdir(folders_dir) 67 | 68 | feature_list = os.listdir(data_dir) 69 | videos = sorted(set([v[:-6] for v in feature_list])) 70 | print('# Videos', len(videos)) 71 | 72 | train_list = open('./data/hmdb_train_split1.txt').readlines() 73 | test_list = open('./data/hmdb_test_split1.txt').readlines() 74 | 75 | n_samples = np.zeros((len(classes), )) 76 | data = [] 77 | for video_name in videos: 78 | training = 0 79 | for tr_vid in train_list: 80 | if video_name[:-1] in tr_vid: 81 | training = 1 82 | 83 | testing = 0 84 | for te_vid in test_list: 85 | if video_name[:-1] in te_vid: 86 | testing = 1 87 | 88 | if training == 0 and testing == 0: 89 | training = 2 90 | 91 | try: 92 | feats_3d = np.load(data_dir + video_name + '3d.npy') 93 | #print(feats_3d.shape) 94 | feats_2d = np.load(data_dir.replace('brian27', 'duartek') + video_name + '2d.npy') 95 | #print(feats_2d.shape) 96 | except: 97 | continue 98 | 99 | split_name = '_'.join(video_name.split('_')[:-7]) + '_' 100 | class_name = [cls for cls in classes if '_'+cls+'_' == split_name[-(len(cls)+2):]] 101 | class_name = sorted(class_name, key=lambda x: len(x)) 102 | #print(class_name, class_name[-1]) 103 | class_name = class_name[-1] 104 | n_samples[classes.index(class_name)] += 1 105 | data.append({'2d': feats_2d, 106 | '3d': feats_3d, 107 | '2d_pooled': np.mean(feats_2d, 0), 108 | '3d_pooled': np.mean(feats_3d, 0), 109 | 'class': class_name, 110 | 'video': video_name, 111 | 'training': training 112 | }) 113 | pickle.dump(data, open('./data/HMDB_data.pkl', 'wb')) 114 | print('# Videos with features extracted:', len(data)) 115 | for i, cls in enumerate(classes): 116 | print(cls, n_samples[i]) 117 | #print(n_samples) 118 | 119 | generate_ucf101_pickle() 120 | generate_hmdb_pickle() 121 | -------------------------------------------------------------------------------- /hmdb_dataloader.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from 
__future__ import unicode_literals 4 | from __future__ import print_function 5 | 6 | import torch as th 7 | from torch.utils.data import Dataset 8 | import pickle 9 | import torch.nn.functional as F 10 | import numpy as np 11 | import re 12 | import pandas as pd 13 | from collections import defaultdict 14 | from torch.utils.data.dataloader import default_collate 15 | import json 16 | import random 17 | 18 | 19 | def name_to_stringlist(name): 20 | change = {'claping': ['clapping']} 21 | if name in change: 22 | name_vec = change[name] 23 | else: 24 | name_vec = name.split('_') 25 | return name_vec 26 | 27 | 28 | class HMDB_DataLoader(Dataset): 29 | """MSRVTT dataset loader.""" 30 | 31 | def __init__( 32 | self, 33 | data_path, 34 | we, 35 | we_dim=300, 36 | max_words=30, 37 | num_frames_multiplier=5, 38 | training=True, 39 | tri_modal=False, 40 | finetune_video=False, 41 | video_interp=False 42 | ): 43 | """ 44 | Args: 45 | """ 46 | self.data = pickle.load(open(data_path, 'rb')) # contains a list of video names 47 | self.we = we 48 | self.we_dim = we_dim 49 | self.max_words = max_words 50 | self.max_video = 30 51 | self.num_frames_multiplier = num_frames_multiplier 52 | self.training = training 53 | self.tri_modal = tri_modal 54 | self.finetune_video = finetune_video 55 | self.max_frames = 16 56 | self.video_interp = video_interp 57 | 58 | names = [] 59 | for vid in self.data: 60 | names.append(vid['class']) 61 | 62 | self.classes = sorted(set(names)) 63 | print('# Classes', len(self.classes)) 64 | 65 | self.class_embeds = [] 66 | for name in self.classes: 67 | word_list = name_to_stringlist(name) 68 | caption = ' '.join(word_list) 69 | self.class_embeds.append(self._get_caption(caption)) 70 | self.class_embeds = th.stack(self.class_embeds, 0) 71 | print('Shape of class embeds', self.class_embeds.shape) 72 | 73 | def __len__(self): 74 | return len(self.data) 75 | 76 | def custom_collate(self, batch): 77 | return default_collate(batch) 78 | 79 | def _zero_pad_tensor(self, tensor, size): 80 | if len(tensor) >= size: 81 | return tensor[:size] 82 | else: 83 | zero = np.zeros((size - len(tensor), self.we_dim), dtype=np.float32) 84 | return np.concatenate((tensor, zero), axis=0) 85 | 86 | def _tokenize_text(self, sentence): 87 | w = re.findall(r"[\w']+", str(sentence)) 88 | return w 89 | 90 | def _words_to_we(self, words): 91 | words = [word for word in words if word in self.we.vocab] 92 | if words: 93 | we = self._zero_pad_tensor(self.we[words], self.max_words) 94 | return th.from_numpy(we) 95 | else: 96 | return th.zeros(self.max_words, self.we_dim) 97 | 98 | def _get_caption(self, idx): 99 | """Chooses random caption if training. 
Uses set caption if evaluating.""" 100 | if self.training: 101 | captions = idx 102 | caption = self._words_to_we(self._tokenize_text(random.choice(captions))) 103 | return caption 104 | else: 105 | caption = idx 106 | return self._words_to_we(self._tokenize_text(caption)) 107 | 108 | def __getitem__(self, idx): 109 | data = self.data[idx] 110 | # load 2d and 3d features (features are pooled over the time dimension) 111 | 112 | if self.finetune_video: 113 | feat_2d = th.from_numpy(self.data[idx]['2d']).float() 114 | feat_3d = th.from_numpy(self.data[idx]['3d']).float() 115 | if self.video_interp: 116 | feat_2d = F.interpolate(feat_2d.transpose(1, 0).unsqueeze(0), self.max_frames, mode='linear', 117 | align_corners=True).squeeze(0) 118 | feat_3d = F.interpolate(feat_3d.transpose(1, 0).unsqueeze(0), self.max_frames, mode='linear', 119 | align_corners=True).squeeze(0) 120 | else: 121 | feat2d_buffer = th.zeros(self.max_frames, feat_2d.shape[-1]) 122 | feat_2d = feat_2d[:self.max_frames] 123 | feat2d_buffer[:len(feat_2d)] = feat_2d 124 | 125 | feat3d_buffer = th.zeros(self.max_frames, feat_3d.shape[-1]) 126 | feat_3d = feat_3d[:self.max_frames] 127 | feat3d_buffer[:len(feat_3d)] = feat_3d 128 | 129 | feat_2d = feat2d_buffer.transpose(1, 0) 130 | feat_3d = feat3d_buffer.transpose(1, 0) 131 | 132 | feat_2d = F.normalize(feat_2d, dim=0) 133 | feat_3d = F.normalize(feat_3d, dim=0) 134 | video = th.cat((feat_2d, feat_3d), dim=0) 135 | else: 136 | feat_2d = F.normalize(th.from_numpy(self.data[idx]['2d_pooled']).float(), dim=0) 137 | feat_3d = F.normalize(th.from_numpy(self.data[idx]['3d_pooled']).float(), dim=0) 138 | video = th.cat((feat_2d, feat_3d)) 139 | 140 | # load audio and zero pad/truncate if necessary 141 | audio = th.FloatTensor(th.from_numpy(np.zeros((40, 1000), dtype=np.float32))) 142 | 143 | # choose a caption 144 | caption = '' 145 | name = self.data[idx]['class'] 146 | if self.tri_modal: 147 | word_list = name_to_stringlist(name) 148 | caption = ' '.join(word_list) 149 | caption = self._get_caption(caption) 150 | 151 | return {'video': video, 'text': caption, 'video_id': idx, 152 | 'audio': audio, 'nframes': 32, 'class_name': name, 153 | 'class_id': th.ones(1)*self.classes.index(name), 154 | 'has_audio': th.zeros(1), 155 | 'video_name': self.data[idx]['video'], 156 | 'training': th.ones(1)*self.data[idx]['training']} 157 | 158 | 159 | class MSRVTT_DataLoader_label(Dataset): 160 | """MSRVTT dataset loader.""" 161 | 162 | def __init__( 163 | self, 164 | data_path, 165 | we, 166 | pseudo_v, 167 | pseudo_a, 168 | we_dim=300, 169 | max_words=30, 170 | num_frames_multiplier=5, 171 | training=True, 172 | tri_modal=False, 173 | ): 174 | """ 175 | Args: 176 | """ 177 | self.data = pickle.load(open(data_path, 'rb')) 178 | self.we = we 179 | self.we_dim = we_dim 180 | self.max_words = max_words 181 | self.max_video = 30 182 | self.num_frames_multiplier = num_frames_multiplier 183 | self.training = training 184 | self.tri_modal = tri_modal 185 | self.pseudo_v = pseudo_v 186 | self.pseudo_a = pseudo_a 187 | 188 | 189 | 190 | def __len__(self): 191 | return len(self.data) 192 | 193 | def custom_collate(self, batch): 194 | return default_collate(batch) 195 | 196 | def _zero_pad_tensor(self, tensor, size): 197 | if len(tensor) >= size: 198 | return tensor[:size] 199 | else: 200 | zero = np.zeros((size - len(tensor), self.we_dim), dtype=np.float32) 201 | return np.concatenate((tensor, zero), axis=0) 202 | 203 | def _tokenize_text(self, sentence): 204 | w = re.findall(r"[\w']+", str(sentence)) 205 | 
return w 206 | 207 | def _words_to_we(self, words): 208 | words = [word for word in words if word in self.we.vocab] 209 | if words: 210 | we = self._zero_pad_tensor(self.we[words], self.max_words) 211 | return th.from_numpy(we) 212 | else: 213 | return th.zeros(self.max_words, self.we_dim) 214 | 215 | def _get_caption(self, idx): 216 | """Chooses random caption if training. Uses set caption if evaluating.""" 217 | if self.training: 218 | captions = self.data[idx]['caption'] 219 | caption = self._words_to_we(self._tokenize_text(random.choice(captions))) 220 | return caption 221 | else: 222 | caption = self.data[idx]['eval_caption'] 223 | return self._words_to_we(self._tokenize_text(caption)) 224 | 225 | def __getitem__(self, idx): 226 | video_id = self.data[idx]['id'] 227 | # load 2d and 3d features (features are pooled over the time dimension) 228 | feat_2d = F.normalize(th.from_numpy(self.data[idx]['2d_pooled']).float(), dim=0) 229 | feat_3d = F.normalize(th.from_numpy(self.data[idx]['3d_pooled']).float(), dim=0) 230 | video = th.cat((feat_2d, feat_3d)) 231 | 232 | # load audio and zero pad/truncate if necessary 233 | audio = self.data[idx]['audio'] 234 | target_length = 1024 * self.num_frames_multiplier 235 | nframes = audio.numpy().shape[1] 236 | p = target_length - nframes 237 | if p > 0: 238 | audio = np.pad(audio, ((0, 0), (0, p)), 'constant', constant_values=(0, 0)) 239 | elif p < 0: 240 | audio = audio[:, 0:p] 241 | audio = th.FloatTensor(audio) 242 | 243 | # choose a caption 244 | caption = '' 245 | if self.tri_modal: 246 | caption = self._get_caption(idx) 247 | 248 | return {'video': video, 'text': caption, 'video_id': self.data[idx]['id'], 249 | 'audio': audio, 'nframes': nframes, 'pseudo_v': self.pseudo_v[idx], 'pseudo_a': self.pseudo_a[idx]} 250 | -------------------------------------------------------------------------------- /local_eval.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from metrics import compute_metrics, print_computed_metrics 3 | import pickle 4 | import torch 5 | 6 | 7 | eval_lang_retrieval = 0 8 | eval_msrvtt = 1 9 | 10 | data = pickle.load(open('temp_data/MSR-VTT.pkl', 'rb')) 11 | #data = pickle.load(open('temp_data/YouCook2.pkl', 'rb')) 12 | 13 | text = data['text'] 14 | video = data['video'] 15 | audio = data['audio'] 16 | 17 | text2 = data['out_t'] 18 | video2 = data['out_v'] 19 | audio2 = data['out_a'] 20 | 21 | text3 = data['out_t2'] 22 | video3 = data['out_v2'] 23 | audio3 = data['out_a2'] 24 | 25 | #m = np.matmul(text, video.T) #+ np.matmul(text2, video2.T) 26 | #m = np.matmul(text, (video+audio).T) #+ np.matmul(text2, video2.T)#+ np.matmul(text, audio.T) 27 | m = np.matmul(text, (video).T)# + np.matmul(text, (audio).T) 28 | #m = np.matmul(text, (audio).T) 29 | 30 | metrics = compute_metrics(m, eval_lang_retrieval, eval_msrvtt) 31 | print('Combined Space') 32 | print_computed_metrics(metrics) 33 | 34 | def norm(mat, axis=-1): 35 | return np.sqrt(np.sum(mat**2, axis=axis, keepdims=True) + 1e-9) 36 | 37 | 38 | def softmax(x, axis=-1): 39 | return np.exp(x)/np.sum(np.exp(x)+1e-12, axis=axis, keepdims=True) 40 | 41 | #text2 = text3#softmax(text2*10) 42 | #video2 = video3#softmax(video2*10) 43 | 44 | m = np.matmul(text2, (video2).T)# + np.matmul(text2, (audio2).T) 45 | 46 | metrics = compute_metrics(m, eval_lang_retrieval, eval_msrvtt) 47 | print('Dot Product on Embedding 2') 48 | print_computed_metrics(metrics) 49 | 50 | text2 = softmax(text2*10) 51 | video2 = softmax(video2*10) 52 | m = 
np.matmul(text2, (video2).T)# + np.matmul(text2, (audio2).T) 53 | metrics = compute_metrics(m, eval_lang_retrieval, eval_msrvtt) 54 | print('Dot Product on softmax Embedding 2 x10 temp') 55 | print_computed_metrics(metrics) 56 | 57 | 58 | text2 = text3#softmax(text2*10) 59 | video2 = text3#softmax(video2*10) 60 | m = np.matmul(text3, (video3).T)# + np.matmul(text2, (audio2).T) 61 | metrics = compute_metrics(m, eval_lang_retrieval, eval_msrvtt) 62 | print('Dot Product on normalized Embedding') 63 | print_computed_metrics(metrics) 64 | 65 | exit() 66 | m = torch.zeros((text2.shape[0], video2.shape[0])) 67 | 68 | text2 = torch.from_numpy(text2) 69 | video2 = torch.from_numpy(video2) 70 | 71 | 72 | for i, v in enumerate(video2): 73 | diff = (text2 - torch.unsqueeze(v, 0)) ** 2 74 | diff = torch.sum(diff, -1) 75 | m[:, i] = 0-diff 76 | 77 | metrics = compute_metrics(m, eval_lang_retrieval, eval_msrvtt) 78 | print('Euclidian Distance Embedding 2') 79 | print_computed_metrics(metrics) 80 | 81 | -------------------------------------------------------------------------------- /loss.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | from __future__ import print_function 5 | 6 | import torch.nn.functional as F 7 | import torch as th 8 | import numpy as np 9 | 10 | class MMS_loss(th.nn.Module): 11 | def __init__(self): 12 | super(MMS_loss, self).__init__() 13 | 14 | def forward(self, S, margin=0.001): 15 | deltas = margin * th.eye(S.size(0)).to(S.device) 16 | S = S - deltas 17 | 18 | target = th.LongTensor(list(range(S.size(0)))).to(S.device) 19 | I2C_loss = F.nll_loss(F.log_softmax(S, dim=1), target) 20 | C2I_loss = F.nll_loss(F.log_softmax(S.t(), dim=1), target) 21 | loss = I2C_loss + C2I_loss 22 | return loss -------------------------------------------------------------------------------- /loss_mil.py: -------------------------------------------------------------------------------- 1 | import torch as th 2 | 3 | 4 | class MILNCELoss(th.nn.Module): 5 | def __init__(self): 6 | super(MILNCELoss, self).__init__() 7 | 8 | def forward(self, video_embd, text_embd): 9 | x = th.matmul(video_embd, text_embd.t()) 10 | x = x.view(video_embd.shape[0], video_embd.shape[0], -1) 11 | nominator = x * th.eye(x.shape[0])[:,:,None].cuda() 12 | nominator = nominator.sum(dim=1) 13 | nominator = th.logsumexp(nominator, dim=1) 14 | denominator = th.cat((x, x.permute(1,0,2)), dim=1).view(x.shape[0], -1) 15 | denominator = th.logsumexp(denominator, dim=1) 16 | return th.mean(denominator - nominator) -------------------------------------------------------------------------------- /loss_sink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | from __future__ import print_function 5 | 6 | import torch.nn.functional as F 7 | import torch as th 8 | import numpy as np 9 | eps = 1e-7 10 | 11 | class MMS_loss(th.nn.Module): 12 | def __init__(self): 13 | super(MMS_loss, self).__init__() 14 | 15 | def forward(self, S,audio,video, margin=0.001): 16 | #print(audio.shape) 17 | #print(video.shape) 18 | 19 | #video = video.view(64,4, video.shape[-1])[:,0,:].squeeze() 20 | #audio = audio.view(64, 4, audio.shape[-1])[:, 0, :].squeeze() 21 | 22 | #print(video.shape) 23 | #video = video.permute(1, 0, 2) #4*64*4096 24 | #audio = 
audio.view(64,4, audio.shape[-1]) 25 | video_embd = audio 26 | text_embd = video 27 | deltas = margin * th.eye(S.size(0)).cuda()#.to(S.device) #batch size eye 28 | #S = th.matmul(audio, video.t()) # 256*4096 29 | S = S - deltas #??? 30 | """ 31 | pseudo_v = pseudo_v.cpu().detach().numpy() 32 | soft = th.nn.Softmax(dim=1)(S) 33 | #pseudo_a = pseudo_v.cpu().detach().numpy() 34 | z_arr = np.ones((256, 256), dtype=float) 35 | for i in range(256): 36 | result, = np.where(pseudo_v == pseudo_v[i]) 37 | # print(result) 38 | for r in result: 39 | # print(r) 40 | if i==r: 41 | z_arr[i][r] = 1#-1000 42 | else: 43 | #if S[i][r]>0: 44 | z_arr[i][r] = 1#-soft[i][r]#0.001 45 | #print(1-soft[i][r]) 46 | # break 47 | #print(z_arr) 48 | z_arr = th.from_numpy(z_arr).type(th.FloatTensor).to(S.device)#z_arr.cuda() 49 | """ 50 | """ 51 | target = th.LongTensor(list(range(S.size(0)))).cuda()#.to(S.device) #0 to batch size list of numbers 52 | #print(target) 53 | #print(pseudo_a) 54 | #target_a = th.LongTensor(pseudo_a).to(S.device) 55 | #print(target_a) 56 | #target_v = th.LongTensor(pseudo_v).to(S.device) 57 | 58 | 59 | I2C_loss = F.nll_loss(F.log_softmax(S, dim=1), target) #softmax on feature 60 | C2I_loss = F.nll_loss(F.log_softmax(S.t(), dim=1), target) 61 | loss = I2C_loss + C2I_loss 62 | 63 | #I2C_loss = th.nn.BCELoss()(F.softmax(S, dim=1), z_arr) # softmax on feature 64 | #C2I_loss = th.nn.BCELoss()(F.softmax(S.t(), dim=1), z_arr) 65 | #loss = I2C_loss + C2I_loss 66 | 67 | #return loss 68 | #""" 69 | #""" 70 | #video_embd = pseudo_v 71 | #text_embd = pseudo_a 72 | x = th.matmul(video_embd, text_embd.t()) 73 | 74 | x = S 75 | x = x.view(video_embd.shape[0], video_embd.shape[0], -1) # batch*batch*1 76 | 77 | #print(S) 78 | #x = x.view(S.shape[0], S.shape[0], -1) # batch*batch*1 79 | nominator = x * th.eye(x.shape[0])[:, :, None].cuda() # correct pairs, assume batches are same video 80 | #nominator = x * z_arr[:, :, None] 81 | #print(z_arr) 82 | #print(nominator) 83 | # replace eye by our one hot cluster label 84 | nominator = nominator.sum(dim=1) 85 | nominator = th.logsumexp(nominator, dim=1) 86 | #print(nominator) 87 | #p = x * z_arr[:, :, None] 88 | #pos = th.logsumexp(pos, dim=1) 89 | 90 | #pos = th.cat((p, p.permute(1, 0, 2)), dim=1).view(p.shape[0], -1) 91 | #pos = th.logsumexp(pos, dim=1) 92 | 93 | #x = x * z_arr[:, :, None] 94 | denominator = th.cat((x, x.permute(1, 0, 2)), dim=1).view(x.shape[0], -1) 95 | denominator = th.logsumexp(denominator, dim=1) 96 | #print(nominator) 97 | #print(denominator) 98 | return th.mean(denominator- nominator ) 99 | #""" 100 | """ 101 | numerator = th.logsumexp(th.diag(S).view(-1, 1), dim=1) # only diagnal 102 | #print(th.diag(S).shape) 103 | #print(th.diag(S).view(-1, 1).shape) # 256*1 104 | #print(numerator.shape) #[256] 105 | denominator = th.logsumexp(th.cat([S, S.t()], dim=1), dim=1) 106 | #print(th.cat([S, S.t()], dim=1).shape) 107 | #print(denominator.shape) #256 108 | loss = th.mean(denominator - numerator) 109 | print(numerator) 110 | print(denominator) 111 | """ 112 | #return loss 113 | 114 | 115 | #return loss 116 | 117 | 118 | 119 | -------------------------------------------------------------------------------- /lsmdc_dataloader.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | from __future__ import print_function 5 | 6 | import torch as th 7 | from torch.utils.data import Dataset 8 | import 
pickle 9 | import torch.nn.functional as F 10 | import numpy as np 11 | import re 12 | from torch.utils.data.dataloader import default_collate 13 | 14 | class LSMDC_DataLoader(Dataset): 15 | """LSMDC dataset loader.""" 16 | 17 | def __init__( 18 | self, 19 | data_path, 20 | we, 21 | we_dim=300, 22 | max_words=30, 23 | num_frames_multiplier=5, 24 | tri_modal=False, 25 | ): 26 | """ 27 | Args: 28 | """ 29 | self.data = pickle.load(open(data_path, 'rb')) 30 | self.we = we 31 | self.we_dim = we_dim 32 | self.max_words = max_words 33 | self.max_video = 30 34 | self.num_frames_multiplier = num_frames_multiplier 35 | self.tri_modal = tri_modal 36 | 37 | def __len__(self): 38 | return len(self.data) 39 | 40 | def custom_collate(self, batch): 41 | return default_collate(batch) 42 | 43 | def _zero_pad_tensor(self, tensor, size): 44 | if len(tensor) >= size: 45 | return tensor[:size] 46 | else: 47 | zero = np.zeros((size - len(tensor), self.we_dim), dtype=np.float32) 48 | return np.concatenate((tensor, zero), axis=0) 49 | 50 | def _tokenize_text(self, sentence): 51 | w = re.findall(r"[\w']+", str(sentence)) 52 | return w 53 | 54 | def _words_to_we(self, words): 55 | words = [word for word in words if word in self.we.vocab] 56 | if words: 57 | we = self._zero_pad_tensor(self.we[words], self.max_words) 58 | return th.from_numpy(we) 59 | else: 60 | return th.zeros(self.max_words, self.we_dim) 61 | 62 | def __getitem__(self, idx): 63 | video_id = self.data[idx]['id'] 64 | # load 2d and 3d features (features are pooled over the time dimension) 65 | feat_2d = F.normalize(self.data[idx]['2d_pooled'].float(), dim=0) 66 | feat_3d = F.normalize(self.data[idx]['3d_pooled'].float(), dim=0) 67 | video = th.cat((feat_2d, feat_3d)) 68 | 69 | # load audio and zero pad/truncate if necessary 70 | audio = self.data[idx]['audio'] 71 | target_length = 1024 * self.num_frames_multiplier 72 | nframes = audio.numpy().shape[1] 73 | p = target_length - nframes 74 | if p > 0: 75 | audio = np.pad(audio, ((0,0),(0,p)), 'constant', constant_values=(0,0)) 76 | elif p < 0: 77 | audio = audio[:,0:p] 78 | audio = th.FloatTensor(audio) 79 | 80 | # choose a caption 81 | caption = '' 82 | if self.tri_modal: 83 | caption = self._words_to_we(self._tokenize_text(self.data[idx]['caption'])) 84 | 85 | return {'video': video, 'text': caption, 'video_id': video_id, 86 | 'audio': audio, 'nframes': nframes} 87 | 88 | 89 | class LSMDC_DataLoader_label(Dataset): 90 | """LSMDC dataset loader.""" 91 | 92 | def __init__( 93 | self, 94 | data_path, 95 | we, 96 | pseudo_v, 97 | pseudo_a, 98 | we_dim=300, 99 | max_words=30, 100 | num_frames_multiplier=5, 101 | tri_modal=False, 102 | ): 103 | """ 104 | Args: 105 | """ 106 | self.data = pickle.load(open(data_path, 'rb')) 107 | self.we = we 108 | self.we_dim = we_dim 109 | self.max_words = max_words 110 | self.max_video = 30 111 | self.num_frames_multiplier = num_frames_multiplier 112 | self.tri_modal = tri_modal 113 | self.pseudo_v = pseudo_v 114 | self.pseudo_a = pseudo_a 115 | 116 | def __len__(self): 117 | return len(self.data) 118 | 119 | def custom_collate(self, batch): 120 | return default_collate(batch) 121 | 122 | def _zero_pad_tensor(self, tensor, size): 123 | if len(tensor) >= size: 124 | return tensor[:size] 125 | else: 126 | zero = np.zeros((size - len(tensor), self.we_dim), dtype=np.float32) 127 | return np.concatenate((tensor, zero), axis=0) 128 | 129 | def _tokenize_text(self, sentence): 130 | w = re.findall(r"[\w']+", str(sentence)) 131 | return w 132 | 133 | def _words_to_we(self, words): 
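        # Keep only tokens present in the word2vec vocabulary, look up their
        # embeddings, and zero-pad / truncate the result to max_words rows;
        # fall back to an all-zero (max_words, we_dim) tensor when no token
        # is in the vocabulary.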
134 | words = [word for word in words if word in self.we.vocab] 135 | if words: 136 | we = self._zero_pad_tensor(self.we[words], self.max_words) 137 | return th.from_numpy(we) 138 | else: 139 | return th.zeros(self.max_words, self.we_dim) 140 | 141 | def __getitem__(self, idx): 142 | video_id = self.data[idx]['id'] 143 | # load 2d and 3d features (features are pooled over the time dimension) 144 | feat_2d = F.normalize(self.data[idx]['2d_pooled'].float(), dim=0) 145 | feat_3d = F.normalize(self.data[idx]['3d_pooled'].float(), dim=0) 146 | video = th.cat((feat_2d, feat_3d)) 147 | 148 | # load audio and zero pad/truncate if necessary 149 | audio = self.data[idx]['audio'] 150 | target_length = 1024 * self.num_frames_multiplier 151 | nframes = audio.numpy().shape[1] 152 | p = target_length - nframes 153 | if p > 0: 154 | audio = np.pad(audio, ((0, 0), (0, p)), 'constant', constant_values=(0, 0)) 155 | elif p < 0: 156 | audio = audio[:, 0:p] 157 | audio = th.FloatTensor(audio) 158 | 159 | # choose a caption 160 | caption = '' 161 | if self.tri_modal: 162 | caption = self._words_to_we(self._tokenize_text(self.data[idx]['caption'])) 163 | 164 | return {'video': video, 'text': caption, 'video_id': self.data[idx]['id'], 165 | 'audio': audio, 'nframes': nframes, 'pseudo_v': self.pseudo_v[idx], 'pseudo_a': self.pseudo_a[idx]} -------------------------------------------------------------------------------- /metrics.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | from __future__ import print_function 5 | 6 | import numpy as np 7 | 8 | def compute_metrics(x, eval_lang_retrieval=False, eval_msrvtt=False): 9 | if eval_lang_retrieval: 10 | print("Retrieving language given input video clips") 11 | x = x.T 12 | else: 13 | print("Retrieving video clips given input language") 14 | sx = np.sort(-x, axis=1) 15 | d = np.diag(-x) 16 | d = d[:, np.newaxis] 17 | ind = sx - d 18 | ind = np.where(ind == 0) 19 | ind = ind[1] 20 | metrics = {} 21 | test_set_size = x.shape[0] if not eval_msrvtt else 1000 22 | if eval_msrvtt: print("MSR-VTT: counting {} missing test clips as mistakes".format(1000 - x.shape[0])) 23 | metrics['R1'] = float(np.sum(ind == 0)) / test_set_size 24 | metrics['R5'] = float(np.sum(ind < 5)) / test_set_size 25 | metrics['R10'] = float(np.sum(ind < 10)) / test_set_size 26 | metrics['MR'] = np.median(ind) + 1 27 | return metrics 28 | 29 | def print_computed_metrics(metrics): 30 | r1 = metrics['R1'] 31 | r5 = metrics['R5'] 32 | r10 = metrics['R10'] 33 | mr = metrics['MR'] 34 | print('R@1: {:.4f} - R@5: {:.4f} - R@10: {:.4f} - Median R: {}'.format(r1, r5, r10, mr)) 35 | return r1,r5,r10,mr 36 | 37 | class AverageMeter(object): 38 | """Computes and stores the average and current value""" 39 | def __init__(self): 40 | self.reset() 41 | 42 | def reset(self): 43 | self.val = 0 44 | self.avg = 0 45 | self.sum = 0 46 | self.count = 0 47 | 48 | def update(self, val, n=1): 49 | self.val = val 50 | self.sum += val * n 51 | self.count += n 52 | self.avg = self.sum / self.count -------------------------------------------------------------------------------- /minY_dataloader.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | from __future__ import print_function 5 | 6 | import torch as th 7 | 
from torch.utils.data import Dataset 8 | import pickle 9 | import torch.nn.functional as F 10 | import numpy as np 11 | import re 12 | from torch.utils.data.dataloader import default_collate 13 | 14 | class Youcook_DataLoader(Dataset): 15 | """Youcook dataset loader.""" 16 | 17 | def __init__( 18 | self, 19 | data, 20 | we, 21 | we_dim=300, 22 | max_words=30, 23 | num_frames_multiplier=5, 24 | tri_modal=False, 25 | ): 26 | """ 27 | Args: 28 | """ 29 | self.data = pickle.load(open(data, 'rb')) 30 | self.we = we 31 | self.we_dim = we_dim 32 | self.max_words = max_words 33 | self.num_frames_multiplier = num_frames_multiplier 34 | self.tri_modal = tri_modal 35 | 36 | def __len__(self): 37 | return len(self.data) 38 | 39 | def custom_collate(self, batch): 40 | return default_collate(batch) 41 | 42 | def _zero_pad_tensor(self, tensor, size): 43 | if len(tensor) >= size: 44 | return tensor[:size] 45 | else: 46 | zero = np.zeros((size - len(tensor), self.we_dim), dtype=np.float32) 47 | return np.concatenate((tensor, zero), axis=0) 48 | 49 | def _tokenize_text(self, sentence): 50 | w = re.findall(r"[\w']+", str(sentence)) 51 | return w 52 | 53 | def _words_to_we(self, words): 54 | words = [word for word in words if word in self.we.vocab] 55 | if words: 56 | we = self._zero_pad_tensor(self.we[words], self.max_words) 57 | return th.from_numpy(we) 58 | else: 59 | return th.zeros(self.max_words, self.we_dim) 60 | 61 | def __getitem__(self, idx): 62 | # load 2d and 3d features (features are pooled over the time dimension) 63 | feat_2d = F.normalize(th.from_numpy(self.data[idx]['2d']).float(), dim=0) 64 | feat_3d = F.normalize(th.from_numpy(self.data[idx]['3d']).float(), dim=0) 65 | video = th.cat((feat_2d, feat_3d)) 66 | 67 | # load audio and zero pad/truncate if necessary 68 | audio = self.data[idx]['audio'] 69 | target_length = 1024 * self.num_frames_multiplier 70 | nframes = audio.numpy().shape[1] 71 | p = target_length - nframes 72 | if p > 0: 73 | audio = np.pad(audio, ((0,0),(0,p)), 'constant', constant_values=(0,0)) 74 | elif p < 0: 75 | audio = audio[:,0:p] 76 | audio = th.FloatTensor(audio) 77 | 78 | caption = '' 79 | if self.tri_modal: 80 | caption = self._words_to_we(self._tokenize_text(self.data[idx]['caption'])) 81 | 82 | return {'video': video, 'text': caption, 'video_id': self.data[idx]['id'], 83 | 'audio': audio, 'nframes': nframes} 84 | 85 | 86 | class Youcook_DataLoader_label(Dataset): 87 | """Youcook dataset loader.""" 88 | 89 | def __init__( 90 | self, 91 | data, 92 | we, 93 | pseudo_v, 94 | pseudo_a, 95 | we_dim=300, 96 | max_words=30, 97 | num_frames_multiplier=5, 98 | tri_modal=False, 99 | 100 | ): 101 | """ 102 | Args: 103 | """ 104 | self.data = pickle.load(open(data, 'rb')) #9000*4800 105 | self.we = we 106 | self.we_dim = we_dim 107 | self.max_words = max_words 108 | self.num_frames_multiplier = num_frames_multiplier 109 | self.tri_modal = tri_modal 110 | self.pseudo_v = pseudo_v 111 | self.pseudo_a = pseudo_a 112 | 113 | def __len__(self): 114 | return len(self.data) 115 | 116 | def custom_collate(self, batch): 117 | return default_collate(batch) 118 | 119 | def _zero_pad_tensor(self, tensor, size): 120 | if len(tensor) >= size: 121 | return tensor[:size] 122 | else: 123 | zero = np.zeros((size - len(tensor), self.we_dim), dtype=np.float32) 124 | return np.concatenate((tensor, zero), axis=0) 125 | 126 | def _tokenize_text(self, sentence): 127 | w = re.findall(r"[\w']+", str(sentence)) 128 | return w 129 | 130 | def _words_to_we(self, words): 131 | words = [word for word in 
words if word in self.we.vocab] 132 | if words: 133 | we = self._zero_pad_tensor(self.we[words], self.max_words) 134 | return th.from_numpy(we) 135 | else: 136 | return th.zeros(self.max_words, self.we_dim) 137 | 138 | def __getitem__(self, idx): 139 | # load 2d and 3d features (features are pooled over the time dimension) 140 | feat_2d = F.normalize(th.from_numpy(self.data[idx]['2d']).float(), dim=0) 141 | feat_3d = F.normalize(th.from_numpy(self.data[idx]['3d']).float(), dim=0) 142 | video = th.cat((feat_2d, feat_3d)) 143 | 144 | # load audio and zero pad/truncate if necessary 145 | audio = self.data[idx]['audio'] 146 | target_length = 1024 * self.num_frames_multiplier 147 | nframes = audio.numpy().shape[1] 148 | p = target_length - nframes 149 | if p > 0: 150 | audio = np.pad(audio, ((0, 0), (0, p)), 'constant', constant_values=(0, 0)) 151 | elif p < 0: 152 | audio = audio[:, 0:p] 153 | audio = th.FloatTensor(audio) 154 | 155 | caption = '' 156 | if self.tri_modal: 157 | caption = self._words_to_we(self._tokenize_text(self.data[idx]['caption'])) 158 | 159 | return {'video': video, 'text': caption, 'video_id': self.data[idx]['id'], 160 | 'audio': audio, 'nframes': nframes, 'pseudo_v':self.pseudo_v[idx], 'pseudo_a':self.pseudo_a[idx]} 161 | 162 | class Youcook_DataLoader_knn(Dataset): 163 | """Youcook dataset loader.""" 164 | 165 | def __init__( 166 | self, 167 | data, 168 | we, 169 | knn_v, 170 | knn_a, 171 | we_dim=300, 172 | max_words=30, 173 | num_frames_multiplier=5, 174 | tri_modal=False, 175 | 176 | ): 177 | """ 178 | Args: 179 | """ 180 | self.data = pickle.load(open(data, 'rb')) #9000*4800 181 | self.we = we 182 | self.we_dim = we_dim 183 | self.max_words = max_words 184 | self.num_frames_multiplier = num_frames_multiplier 185 | self.tri_modal = tri_modal 186 | self.knn_v = knn_v 187 | self.knn_a = knn_a 188 | 189 | def __len__(self): 190 | return len(self.data) 191 | 192 | def custom_collate(self, batch): 193 | return default_collate(batch) 194 | 195 | def _zero_pad_tensor(self, tensor, size): 196 | if len(tensor) >= size: 197 | return tensor[:size] 198 | else: 199 | zero = np.zeros((size - len(tensor), self.we_dim), dtype=np.float32) 200 | return np.concatenate((tensor, zero), axis=0) 201 | 202 | def _tokenize_text(self, sentence): 203 | w = re.findall(r"[\w']+", str(sentence)) 204 | return w 205 | 206 | def _words_to_we(self, words): 207 | words = [word for word in words if word in self.we.vocab] 208 | if words: 209 | we = self._zero_pad_tensor(self.we[words], self.max_words) 210 | return th.from_numpy(we) 211 | else: 212 | return th.zeros(self.max_words, self.we_dim) 213 | 214 | def __getitem__(self, idx): 215 | video_feature = [] 216 | text_feature = [] 217 | audio_feature = [] 218 | nframes_list = [] 219 | caption_text = [] 220 | for i in self.knn_v[idx]: 221 | # load 2d and 3d features (features are pooled over the time dimension) 222 | feat_2d = F.normalize(th.from_numpy(self.data[i]['2d']).float(), dim=0) 223 | feat_3d = F.normalize(th.from_numpy(self.data[i]['3d']).float(), dim=0) 224 | video = th.cat((feat_2d, feat_3d)) 225 | video_feature.append(video.numpy()) 226 | # load audio and zero pad/truncate if necessary 227 | audio = self.data[i]['audio'] 228 | target_length = 1024 * self.num_frames_multiplier 229 | nframes = audio.numpy().shape[1] 230 | nframes_list.append(nframes) 231 | p = target_length - nframes 232 | if p > 0: 233 | audio = np.pad(audio, ((0, 0), (0, p)), 'constant', constant_values=(0, 0)) 234 | elif p < 0: 235 | audio = audio[:, 0:p] 236 | audio = 
th.FloatTensor(audio) 237 | audio_feature.append(audio.numpy()) 238 | caption = '' 239 | if self.tri_modal: 240 | caption = self._words_to_we(self._tokenize_text(self.data[i]['caption'])) 241 | text_feature.append(caption.numpy()) 242 | video_f = np.asarray(video_feature) 243 | text_f = np.asarray(text_feature) 244 | audio_f = np.asarray(audio_feature) 245 | nframes_l = np.asarray(nframes_list) 246 | """ 247 | print('dataload') 248 | print(video_f.shape) 249 | print(text_f.shape) 250 | print(audio_f.shape) 251 | print(nframes_l.shape) 252 | print('dataload_fin') 253 | """ 254 | #caption_text = 255 | return {'video': video_f, 'text': text_f, 'video_id': self.data[i]['id'], 256 | 'audio': audio_f, 'nframes': nframes_l} -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | from __future__ import print_function 5 | 6 | import torch.nn as nn 7 | import torch as th 8 | import torch.nn.functional as F 9 | import re 10 | 11 | class Net(nn.Module): 12 | def __init__( 13 | self, 14 | embd_dim=1024, 15 | video_dim=2048, 16 | n_pair=1, 17 | we_dim=300, 18 | max_words=30, 19 | sentence_dim=-1, 20 | mil=0, 21 | no_norm=0, 22 | we=None, 23 | ): 24 | super(Net, self).__init__() 25 | if sentence_dim <= 0: 26 | self.text_pooling = Sentence_Maxpool(we_dim, embd_dim) 27 | else: 28 | self.text_pooling = Sentence_Maxpool(we_dim, sentence_dim) 29 | self.GU_text = Gated_Embedding_Unit( 30 | self.text_pooling.out_dim, embd_dim, gating=True) 31 | self.GU_video = Gated_Embedding_Unit( 32 | video_dim, embd_dim, gating=True) 33 | self.n_pair = n_pair 34 | self.embd_dim = embd_dim 35 | self.we = we 36 | self.we_dim = we_dim 37 | self.mil=mil 38 | self.no_norm = no_norm 39 | 40 | 41 | def save_checkpoint(self, path): 42 | th.save(self.state_dict(), path) 43 | 44 | def load_checkpoint(self, path, cpu=False): 45 | if cpu: 46 | self.load_state_dict(th.load(path, 47 | map_location=lambda storage, loc: storage)) 48 | else: 49 | self.load_state_dict(th.load(path)) 50 | 51 | def forward(self, video, text): 52 | video = self.GU_video(video) #[frames,emb] 53 | text = self.GU_text(self.text_pooling(text)) #[class,emb] 54 | if self.mil==0: 55 | return th.matmul(text, video.t()) #[class,frame] 56 | else: 57 | return video, text 58 | 59 | 60 | 61 | class Gated_Embedding_Unit(nn.Module): 62 | def __init__(self, input_dimension, output_dimension, gating=True): 63 | super(Gated_Embedding_Unit, self).__init__() 64 | self.fc = nn.Linear(input_dimension, output_dimension) 65 | self.cg = Context_Gating(output_dimension) 66 | self.gating = gating 67 | 68 | def forward(self, x): 69 | x = self.fc(x) 70 | if self.gating: 71 | x = self.cg(x) 72 | # un comment when inferencing 73 | x = F.normalize(x) 74 | return x 75 | 76 | class Sentence_Maxpool(nn.Module): 77 | def __init__(self, word_dimension, output_dim, relu=True): 78 | super(Sentence_Maxpool, self).__init__() 79 | self.fc = nn.Linear(word_dimension, output_dim) 80 | self.out_dim = output_dim 81 | self.relu = relu 82 | 83 | def forward(self, x): 84 | x = self.fc(x) 85 | if self.relu: 86 | x = F.relu(x) 87 | return th.max(x, dim=1)[0] 88 | 89 | 90 | class Context_Gating(nn.Module): 91 | def __init__(self, dimension, add_batch_norm=False): 92 | super(Context_Gating, self).__init__() 93 | self.fc = nn.Linear(dimension, dimension) 94 | 
self.add_batch_norm = add_batch_norm 95 | self.batch_norm = nn.BatchNorm1d(dimension) 96 | 97 | def forward(self, x): 98 | x1 = self.fc(x) 99 | if self.add_batch_norm: 100 | x1 = self.batch_norm(x1) 101 | x = th.cat((x, x1), 1) 102 | return F.glu(x, 1) 103 | -------------------------------------------------------------------------------- /model_davenet.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch.nn as nn 3 | import librosa 4 | import numpy as np 5 | import scipy.signal 6 | 7 | def conv1x9(in_planes, out_planes, stride=1): 8 | """1x9 convolution with padding""" 9 | return nn.Conv2d(in_planes, out_planes, kernel_size=(1,9), stride=stride, padding=(0,4), bias=False) 10 | 11 | def conv1d(in_planes, out_planes, width=9, stride=1, bias=False): 12 | """1xd convolution with padding""" 13 | if width % 2 == 0: 14 | pad_amt = int(width / 2) 15 | else: 16 | pad_amt = int((width - 1) / 2) 17 | return nn.Conv2d(in_planes, out_planes, kernel_size=(1,width), stride=stride, padding=(0,pad_amt), bias=bias) 18 | 19 | class SpeechBasicBlock(nn.Module): 20 | expansion = 1 21 | def __init__(self, inplanes, planes, width=9, stride=1, downsample=None): 22 | super(SpeechBasicBlock, self).__init__() 23 | self.conv1 = conv1d(inplanes, planes, width=width, stride=stride) 24 | self.bn1 = nn.BatchNorm2d(planes) 25 | self.relu = nn.ReLU(inplace=True) 26 | self.conv2 = conv1d(planes, planes, width=width) 27 | self.bn2 = nn.BatchNorm2d(planes) 28 | self.downsample = downsample 29 | self.stride = stride 30 | 31 | def forward(self, x): 32 | residual = x 33 | out = self.conv1(x) 34 | out = self.bn1(out) 35 | out = self.relu(out) 36 | out = self.conv2(out) 37 | out = self.bn2(out) 38 | if self.downsample is not None: 39 | residual = self.downsample(x) 40 | out += residual 41 | out = self.relu(out) 42 | return out 43 | 44 | class ResDavenet(nn.Module): 45 | def __init__(self, feat_dim=40, block=SpeechBasicBlock, layers=[2, 2, 2, 2], layer_widths=[128, 128, 256, 512, 1024], convsize=9): 46 | super(ResDavenet, self).__init__() 47 | self.feat_dim = feat_dim 48 | self.inplanes = layer_widths[0] 49 | self.batchnorm1 = nn.BatchNorm2d(1) 50 | self.conv1 = nn.Conv2d(1, self.inplanes, kernel_size=(self.feat_dim,1), stride=1, padding=(0,0), bias=False) 51 | self.bn1 = nn.BatchNorm2d(self.inplanes) 52 | self.relu = nn.ReLU(inplace=True) 53 | self.layer1 = self._make_layer(block, layer_widths[1], layers[0], width=convsize, stride=2) 54 | self.layer2 = self._make_layer(block, layer_widths[2], layers[1], width=convsize, stride=2) 55 | self.layer3 = self._make_layer(block, layer_widths[3], layers[2], width=convsize, stride=2) 56 | self.layer4 = self._make_layer(block, layer_widths[4], layers[3], width=convsize, stride=2) 57 | 58 | for m in self.modules(): 59 | if isinstance(m, nn.Conv2d): 60 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 61 | m.weight.data.normal_(0, math.sqrt(2. 
/ n)) 62 | elif isinstance(m, nn.BatchNorm2d): 63 | m.weight.data.fill_(1) 64 | m.bias.data.zero_() 65 | 66 | def _make_layer(self, block, planes, blocks, width=9, stride=1): 67 | downsample = None 68 | if stride != 1 or self.inplanes != planes * block.expansion: 69 | downsample = nn.Sequential( 70 | nn.Conv2d(self.inplanes, planes * block.expansion, 71 | kernel_size=1, stride=stride, bias=False), 72 | nn.BatchNorm2d(planes * block.expansion), 73 | ) 74 | layers = [] 75 | layers.append(block(self.inplanes, planes, width=width, stride=stride, downsample=downsample)) 76 | self.inplanes = planes * block.expansion 77 | for i in range(1, blocks): 78 | layers.append(block(self.inplanes, planes, width=width, stride=1)) 79 | return nn.Sequential(*layers) 80 | 81 | def forward(self, x): 82 | if x.dim() == 3: 83 | x = x.unsqueeze(1) 84 | x = self.conv1(x) 85 | x = self.bn1(x) 86 | x = self.relu(x) 87 | x = self.layer1(x) 88 | x = self.layer2(x) 89 | x = self.layer3(x) 90 | x = self.layer4(x) 91 | x = x.squeeze(2) 92 | return x 93 | 94 | def preemphasis(signal,coeff=0.97): 95 | """perform preemphasis on the input signal. 96 | 97 | :param signal: The signal to filter. 98 | :param coeff: The preemphasis coefficient. 0 is none, default 0.97. 99 | :returns: the filtered signal. 100 | """ 101 | return np.append(signal[0],signal[1:]-coeff*signal[:-1]) 102 | 103 | def load_DAVEnet(): 104 | layer_widths = [128,128,256,512,1024] 105 | layer_depths = [2,2,2,2] 106 | audio_model = ResDavenet(feat_dim=40, layers=layer_depths, convsize=9, layer_widths=layer_widths) 107 | 108 | return audio_model 109 | 110 | def LoadAudio(path, target_length=2048, use_raw_length=False): 111 | audio_type = 'melspectrogram' 112 | preemph_coef = 0.97 113 | sample_rate = 16000 114 | window_size = 0.025 115 | window_stride = 0.01 116 | window_type = 'hamming' 117 | num_mel_bins = 40 118 | padval = 0 119 | fmin = 20 120 | n_fft = int(sample_rate * window_size) 121 | win_length = int(sample_rate * window_size) 122 | hop_length = int(sample_rate * window_stride) 123 | 124 | windows = {'hamming': scipy.signal.hamming} 125 | # load audio, subtract DC, preemphasis 126 | # sr=None to avoid resampling (assuming audio already at 16 kHz sr) 127 | y, sr = librosa.load(path, sr=None) 128 | if y.size == 0: 129 | y = np.zeros(200) 130 | y = y - y.mean() 131 | y = preemphasis(y, preemph_coef) 132 | stft = librosa.stft(y, n_fft=n_fft, hop_length=hop_length, 133 | win_length=win_length, 134 | window=windows[window_type]) 135 | spec = np.abs(stft)**2 136 | if audio_type == 'melspectrogram': 137 | mel_basis = librosa.filters.mel(sr, n_fft, n_mels=num_mel_bins, fmin=fmin) 138 | melspec = np.dot(mel_basis, spec) 139 | feats = librosa.power_to_db(melspec, ref=np.max) 140 | n_frames = feats.shape[1] 141 | 142 | if use_raw_length: 143 | target_length = n_frames 144 | p = target_length - n_frames 145 | if p > 0: 146 | feats = np.pad(feats, ((0,0),(0,p)), 'constant', 147 | constant_values=(padval,padval)) 148 | elif p < 0: 149 | feats = feats[:,0:p] 150 | n_frames = target_length 151 | 152 | return feats, n_frames -------------------------------------------------------------------------------- /model_kmeans_ICCV.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | from __future__ import print_function 5 | 6 | import torch.nn as nn 7 | import torch as th 8 | import torch.nn.functional as F 9 | from 
model_davenet import load_DAVEnet 10 | 11 | 12 | class Net(nn.Module): 13 | def __init__( 14 | self, 15 | embd_dim=4096, 16 | video_dim=4096, 17 | we_dim=300, 18 | tri_modal=False, 19 | tri_modal_fuse=False, 20 | cluster_size=256, 21 | layer=0, 22 | project=0, 23 | project_dim=6000, 24 | multi_cluster=0, 25 | recon=0, 26 | withMLP=0, 27 | recon_size=768, 28 | 29 | ): 30 | super(Net, self).__init__() 31 | self.DAVEnet = load_DAVEnet() 32 | self.DAVEnet_projection = nn.Linear(1024, embd_dim) 33 | self.GU_audio = Gated_Embedding_Unit(1024, 1024) 34 | self.GU_video = Gated_Embedding_Unit(video_dim, embd_dim) 35 | if tri_modal and not tri_modal_fuse: 36 | self.text_pooling_caption = Sentence_Maxpool(we_dim, embd_dim) 37 | self.GU_text_captions = Gated_Embedding_Unit(embd_dim, embd_dim) 38 | 39 | elif tri_modal_fuse: 40 | self.DAVEnet_projection = nn.Linear(1024, embd_dim // 2) 41 | self.text_pooling_caption = Sentence_Maxpool(we_dim, embd_dim // 2) 42 | self.GU_audio_text = Fused_Gated_Unit(embd_dim // 2, embd_dim) 43 | self.tri_modal = tri_modal 44 | self.tri_modal_fuse = tri_modal_fuse 45 | self.project = project 46 | self.withMLP = withMLP 47 | self.recon_size = recon_size 48 | if withMLP==1: 49 | if project==0: 50 | self.classification = nn.Linear(embd_dim, project_dim, bias=False) #4096,256 51 | self.classification2 = nn.Linear(embd_dim, project_dim, bias=False) # 4096,256 52 | self.classification3 = nn.Linear(embd_dim, project_dim, bias=False) # 4096,256 53 | else: 54 | 55 | self.projection_head = nn.Sequential( 56 | nn.Linear(embd_dim, embd_dim//8), 57 | nn.BatchNorm1d(embd_dim//8), 58 | nn.ReLU(inplace=True), 59 | nn.Linear(embd_dim//8, cluster_size), 60 | ) 61 | 62 | self.classification = nn.Linear(cluster_size, project_dim, bias=False) 63 | 64 | 65 | self.layer=layer 66 | self.recon = recon 67 | if recon: 68 | inp_dim = embd_dim 69 | 70 | self.recon_v = nn.Sequential( 71 | nn.Linear(inp_dim, recon_size), 72 | nn.ReLU(inplace=True), 73 | nn.Linear(recon_size, video_dim), 74 | nn.ReLU(inplace=True) 75 | ) 76 | self.recon_a = nn.Sequential( 77 | nn.Linear(inp_dim, recon_size), 78 | nn.ReLU(inplace=True), 79 | nn.Linear(recon_size, 1024), 80 | nn.ReLU(inplace=True) 81 | ) 82 | self.recon_t = nn.Sequential( 83 | nn.Linear(inp_dim, recon_size), 84 | nn.ReLU(inplace=True), 85 | nn.Linear(recon_size, embd_dim), 86 | nn.ReLU(inplace=True) 87 | ) 88 | self.mse = nn.MSELoss(reduction='none') 89 | 90 | 91 | def save_checkpoint(self, path): 92 | th.save(self.state_dict(), path) 93 | 94 | def load_checkpoint(self, path): 95 | try: 96 | self.load_state_dict(th.load(path, map_location='cpu')) 97 | except Exception as e: 98 | print(e) 99 | print("IGNORING ERROR, LOADING MODEL USING STRICT=FALSE") 100 | self.load_state_dict(th.load(path, map_location='cpu'), strict=False) 101 | print("Loaded model checkpoint from {}".format(path)) 102 | 103 | def forward(self, video, audio_input, nframes, text=None): 104 | video_gt = video 105 | video = self.GU_video(video) 106 | if self.recon: 107 | video_recon = self.recon_v(video) 108 | audio = self.DAVEnet(audio_input) 109 | if not self.training: # controlled by net.train() / net.eval() (use for downstream tasks) 110 | # Mean-pool audio embeddings and disregard embeddings from input 0 padding 111 | pooling_ratio = round(audio_input.size(-1) / audio.size(-1)) 112 | nframes.div_(pooling_ratio) 113 | audioPoolfunc = th.nn.AdaptiveAvgPool2d((1, 1)) # 114 | #audioPoolfunc = th.nn.AdaptiveMaxPool2d((1, 1)) 115 | audio_outputs = audio.unsqueeze(2) 116 | 
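            # Eval-time audio pooling: pooling_ratio maps raw input frames to
            # DAVEnet output frames, nframes is rescaled in place, and each
            # clip is average-pooled only over its first nF valid output
            # frames so zero-padded frames do not dilute the embedding.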
pooled_audio_outputs_list = [] 117 | for idx in range(audio.shape[0]): 118 | nF = max(1, nframes[idx]) 119 | pooled_audio_outputs_list.append(audioPoolfunc(audio_outputs[idx][:, :, 0:nF]).unsqueeze(0)) 120 | audio = th.cat(pooled_audio_outputs_list).squeeze(3).squeeze(2) 121 | else: 122 | audio = audio.mean(dim=2) # this averages features from 0 padding too 123 | 124 | if self.tri_modal_fuse: 125 | text = self.text_pooling_caption(text) 126 | audio = self.DAVEnet_projection(audio) 127 | audio_text = self.GU_audio_text(audio, text) 128 | return audio_text, video 129 | 130 | # Gating in lower embedding dimension (1024 vs 4096) for stability with mixed-precision training 131 | audio_gt = audio 132 | audio = self.GU_audio(audio) 133 | audio = self.DAVEnet_projection(audio) 134 | if self.recon: 135 | audio_recon = self.recon_a(audio) 136 | if self.tri_modal and not self.tri_modal_fuse: 137 | text_gt = self.text_pooling_caption(text) 138 | text = self.GU_text_captions(text_gt) 139 | 140 | if self.recon: 141 | text_recon = self.recon_t(text) 142 | 143 | 144 | if self.layer==1: 145 | video_c = self.layer1(video) 146 | audio_c = self.layer2(audio) 147 | text_c = self.layer3(text) 148 | else: 149 | if self.withMLP==1: 150 | if self.project==1: 151 | video_c = self.projection_head(video) 152 | video_c = nn.functional.normalize(video_c, dim=1, p=2) 153 | else: 154 | video_c = nn.functional.normalize(video, dim=1, p=2) 155 | video_c = self.classification(video_c) 156 | 157 | # 158 | if self.project == 1: 159 | audio_c = self.projection_head(audio) 160 | audio_c = nn.functional.normalize(audio_c, dim=1, p=2) 161 | else: 162 | audio_c = nn.functional.normalize(audio, dim=1, p=2) 163 | audio_c = self.classification(audio_c) 164 | 165 | if self.project == 1: 166 | text_c = self.projection_head(text) 167 | text_c = nn.functional.normalize(text_c, dim=1, p=2) 168 | else: 169 | text_c = nn.functional.normalize(text, dim=1, p=2) 170 | text_c = self.classification(text_c) 171 | 172 | if self.recon: 173 | mse_v = th.mean(self.mse(video_recon, video_gt), dim=-1) 174 | mse_a = th.mean(self.mse(audio_recon, audio_gt), dim=-1) 175 | mse_t = th.mean(self.mse(text_recon, text_gt), dim=-1) 176 | if self.withMLP == 1: 177 | return audio, video, text, audio_c, video_c, text_c, mse_v + mse_a + mse_t 178 | else: 179 | return audio, video, text, mse_v + mse_a + mse_t 180 | return audio, video, text 181 | 182 | return audio, video 183 | 184 | 185 | class Gated_Embedding_Unit(nn.Module): 186 | def __init__(self, input_dimension, output_dimension): 187 | super(Gated_Embedding_Unit, self).__init__() 188 | self.fc = nn.Linear(input_dimension, output_dimension) 189 | self.cg = Context_Gating(output_dimension) 190 | 191 | def forward(self, x): 192 | x = self.fc(x) 193 | x = self.cg(x) 194 | return x 195 | 196 | 197 | class Fused_Gated_Unit(nn.Module): 198 | def __init__(self, input_dimension, output_dimension): 199 | super(Fused_Gated_Unit, self).__init__() 200 | self.fc_audio = nn.Linear(input_dimension, output_dimension) 201 | self.fc_text = nn.Linear(input_dimension, output_dimension) 202 | self.cg = Context_Gating(output_dimension) 203 | 204 | def forward(self, audio, text): 205 | audio = self.fc_audio(audio) 206 | text = self.fc_text(text) 207 | x = audio + text 208 | x = self.cg(x) 209 | return x 210 | 211 | 212 | class Context_Gating(nn.Module): 213 | def __init__(self, dimension): 214 | super(Context_Gating, self).__init__() 215 | self.fc = nn.Linear(dimension, dimension) 216 | 217 | def forward(self, x): 218 | x1 = 
self.fc(x) 219 | x = th.cat((x, x1), 1) 220 | return F.glu(x, 1) 221 | 222 | 223 | class Sentence_Maxpool(nn.Module): 224 | def __init__(self, word_dimension, output_dim): 225 | super(Sentence_Maxpool, self).__init__() 226 | self.fc = nn.Linear(word_dimension, output_dim) 227 | 228 | def forward(self, x): 229 | x = self.fc(x) 230 | x = F.relu(x) 231 | return th.max(x, dim=1)[0] -------------------------------------------------------------------------------- /model_tri_c.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | from __future__ import print_function 5 | 6 | import torch.nn as nn 7 | import torch as th 8 | import torch.nn.functional as F 9 | from model_davenet import load_DAVEnet 10 | 11 | 12 | class Net(nn.Module): 13 | def __init__( 14 | self, 15 | embd_dim=4096, 16 | video_dim=4096, 17 | we_dim=300, 18 | tri_modal=False, 19 | tri_modal_fuse=False, 20 | cluster_size=512, 21 | layer=0, 22 | project=0, 23 | project_dim=6000, 24 | multi_cluster=0, 25 | finetune_video=False 26 | 27 | ): 28 | super(Net, self).__init__() 29 | self.DAVEnet = load_DAVEnet() 30 | self.DAVEnet_projection = nn.Linear(1024, embd_dim) 31 | self.GU_audio = Gated_Embedding_Unit(1024, 1024) 32 | self.GU_video = Gated_Embedding_Unit(video_dim, embd_dim) 33 | if tri_modal and not tri_modal_fuse: 34 | self.text_pooling_caption = Sentence_Maxpool(we_dim, embd_dim) 35 | self.GU_text_captions = Gated_Embedding_Unit(embd_dim, embd_dim) 36 | 37 | elif tri_modal_fuse: 38 | self.DAVEnet_projection = nn.Linear(1024, embd_dim // 2) 39 | self.text_pooling_caption = Sentence_Maxpool(we_dim, embd_dim // 2) 40 | self.GU_audio_text = Fused_Gated_Unit(embd_dim // 2, embd_dim) 41 | self.tri_modal = tri_modal 42 | self.tri_modal_fuse = tri_modal_fuse 43 | self.project = project 44 | if project==0: 45 | self.classification = nn.Linear(embd_dim, project_dim, bias=False) #4096,256 46 | else: 47 | 48 | self.projection_head = nn.Sequential( 49 | nn.Linear(embd_dim, embd_dim), 50 | nn.BatchNorm1d(embd_dim), 51 | nn.ReLU(inplace=True), 52 | nn.Linear(embd_dim, cluster_size), 53 | ) 54 | self.classification = nn.Linear(cluster_size, project_dim, bias=False) 55 | if project_dim==8000: 56 | self.classification2 = nn.Linear(embd_dim, cluster_size) 57 | self.classification3 = nn.Linear(embd_dim, cluster_size) 58 | self.layer=layer 59 | if self.layer==1: 60 | self.layer1 = nn.Sequential( 61 | nn.Linear(embd_dim, 4096), 62 | nn.BatchNorm1d(4096), 63 | nn.ReLU(True), 64 | nn.Linear(4096, 256) 65 | ) 66 | self.layer2 = nn.Sequential( 67 | nn.Linear(embd_dim, 4096), 68 | nn.BatchNorm1d(4096), 69 | nn.ReLU(True), 70 | nn.Linear(4096, 256) 71 | ) 72 | self.layer3 = nn.Sequential( 73 | nn.Linear(embd_dim, 4096), 74 | nn.BatchNorm1d(4096), 75 | nn.ReLU(True), 76 | nn.Linear(4096, 256) 77 | ) 78 | self.finetune_video = finetune_video 79 | if self.finetune_video: 80 | self.video_encoder = nn.Sequential( 81 | nn.Conv1d(video_dim, embd_dim, 3, 2), 82 | nn.ReLU(inplace=True), 83 | nn.Conv1d(embd_dim, video_dim, 3, 2), 84 | nn.ReLU(inplace=True) 85 | ) 86 | 87 | def save_checkpoint(self, path): 88 | th.save(self.state_dict(), path) 89 | 90 | def load_checkpoint(self, path): 91 | try: 92 | self.load_state_dict(th.load(path, map_location='cpu')) 93 | except Exception as e: 94 | print(e) 95 | print("IGNORING ERROR, LOADING MODEL USING STRICT=FALSE") 96 | self.load_state_dict(th.load(path, 
map_location='cpu'), strict=False) 97 | print("Loaded model checkpoint from {}".format(path)) 98 | 99 | def forward(self, video, audio_input, nframes, text=None): 100 | if self.finetune_video: 101 | video = self.video_encoder(video) 102 | video = th.max(video, -1)[0] # Max pools along the last dimension 103 | video = self.GU_video(video) 104 | 105 | audio = self.DAVEnet(audio_input) 106 | if not self.training: # controlled by net.train() / net.eval() (use for downstream tasks) 107 | # Mean-pool audio embeddings and disregard embeddings from input 0 padding 108 | pooling_ratio = round(audio_input.size(-1) / audio.size(-1)) 109 | nframes.div_(pooling_ratio) 110 | audioPoolfunc = th.nn.AdaptiveAvgPool2d((1, 1)) # 111 | #audioPoolfunc = th.nn.AdaptiveMaxPool2d((1, 1)) 112 | audio_outputs = audio.unsqueeze(2) 113 | pooled_audio_outputs_list = [] 114 | for idx in range(audio.shape[0]): 115 | nF = max(1, nframes[idx]) 116 | pooled_audio_outputs_list.append(audioPoolfunc(audio_outputs[idx][:, :, 0:nF]).unsqueeze(0)) 117 | audio = th.cat(pooled_audio_outputs_list).squeeze(3).squeeze(2) 118 | else: 119 | audio = audio.mean(dim=2) # this averages features from 0 padding too 120 | 121 | if self.tri_modal_fuse: 122 | text = self.text_pooling_caption(text) 123 | audio = self.DAVEnet_projection(audio) 124 | audio_text = self.GU_audio_text(audio, text) 125 | return audio_text, video 126 | 127 | # Gating in lower embedding dimension (1024 vs 4096) for stability with mixed-precision training 128 | audio = self.GU_audio(audio) 129 | audio = self.DAVEnet_projection(audio) 130 | if self.tri_modal and not self.tri_modal_fuse: 131 | text = self.GU_text_captions(self.text_pooling_caption(text)) 132 | 133 | # video_c2 = self.layer2(video) 134 | #""" 135 | 136 | #""" 137 | if self.layer==1: 138 | video_c = self.layer1(video) 139 | audio_c = self.layer2(audio) 140 | text_c = self.layer3(text) 141 | else: 142 | if self.project==1: 143 | video_c = self.projection_head(video) 144 | video_c2 = nn.functional.normalize(video_c, dim=1, p=2) 145 | else: 146 | video_c2 = nn.functional.normalize(video, dim=1, p=2) 147 | video_c = self.classification(video_c2) 148 | 149 | # 150 | if self.project == 1: 151 | audio_c = self.projection_head(audio) 152 | audio_c2 = nn.functional.normalize(audio_c, dim=1, p=2) 153 | else: 154 | audio_c2 = nn.functional.normalize(audio, dim=1, p=2) 155 | audio_c = self.classification(audio_c2) 156 | 157 | #text_c = self.projection_head(text) 158 | if self.project == 1: 159 | text_c = self.projection_head(text) 160 | text_c2 = nn.functional.normalize(text_c, dim=1, p=2) 161 | else: 162 | text_c2 = nn.functional.normalize(text, dim=1, p=2) 163 | text_c = self.classification(text_c2) 164 | 165 | return audio, video, text, audio_c, video_c, text_c, audio_c2, video_c2, text_c2 166 | 167 | return audio, video 168 | 169 | 170 | class Gated_Embedding_Unit(nn.Module): 171 | def __init__(self, input_dimension, output_dimension): 172 | super(Gated_Embedding_Unit, self).__init__() 173 | self.fc = nn.Linear(input_dimension, output_dimension) 174 | self.cg = Context_Gating(output_dimension) 175 | 176 | def forward(self, x): 177 | x = self.fc(x) 178 | x = self.cg(x) 179 | return x 180 | 181 | 182 | class Fused_Gated_Unit(nn.Module): 183 | def __init__(self, input_dimension, output_dimension): 184 | super(Fused_Gated_Unit, self).__init__() 185 | self.fc_audio = nn.Linear(input_dimension, output_dimension) 186 | self.fc_text = nn.Linear(input_dimension, output_dimension) 187 | self.cg = Context_Gating(output_dimension)
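    # The forward pass below projects audio and pooled text features with fc_audio / fc_text,
    # sums them, and applies Context_Gating, i.e. x * sigmoid(W x + b): F.glu over
    # th.cat((x, self.fc(x)), 1) multiplies the first half by the sigmoid of the second.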
188 | 189 | def forward(self, audio, text): 190 | audio = self.fc_audio(audio) 191 | text = self.fc_text(text) 192 | x = audio + text 193 | x = self.cg(x) 194 | return x 195 | 196 | 197 | class Context_Gating(nn.Module): 198 | def __init__(self, dimension): 199 | super(Context_Gating, self).__init__() 200 | self.fc = nn.Linear(dimension, dimension) 201 | 202 | def forward(self, x): 203 | x1 = self.fc(x) 204 | x = th.cat((x, x1), 1) 205 | return F.glu(x, 1) 206 | 207 | 208 | class Sentence_Maxpool(nn.Module): 209 | def __init__(self, word_dimension, output_dim): 210 | super(Sentence_Maxpool, self).__init__() 211 | self.fc = nn.Linear(word_dimension, output_dim) 212 | 213 | def forward(self, x): 214 | x = self.fc(x) 215 | x = F.relu(x) 216 | return th.max(x, dim=1)[0] -------------------------------------------------------------------------------- /model_tri_c_clean_sp.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | from __future__ import print_function 5 | 6 | import torch.nn as nn 7 | import torch as th 8 | import torch.nn.functional as F 9 | from model_davenet import load_DAVEnet 10 | from attention import MultiHeadAttention 11 | 12 | 13 | class Net(nn.Module): 14 | def __init__( 15 | self, 16 | embd_dim=4096, 17 | video_dim=4096, 18 | we_dim=300, 19 | tri_modal=False, 20 | tri_modal_fuse=False, 21 | cluster_size=256, 22 | layer=0, 23 | project=0, 24 | project_dim=6000, 25 | multi_cluster=0, 26 | recon=0, 27 | recon_b=0, 28 | finetune_video=0, 29 | multi_head=0, 30 | joint_cluster=0, 31 | output_norm=0, 32 | recon_cross=0 33 | 34 | ): 35 | super(Net, self).__init__() 36 | self.DAVEnet = load_DAVEnet() 37 | self.DAVEnet_projection = nn.Linear(1024, embd_dim) 38 | self.GU_audio = Gated_Embedding_Unit(1024, 1024) 39 | self.GU_video = Gated_Embedding_Unit(video_dim, embd_dim) 40 | if tri_modal and not tri_modal_fuse: 41 | self.text_pooling_caption = Sentence_Maxpool(we_dim, embd_dim) 42 | self.GU_text_captions = Gated_Embedding_Unit(embd_dim, embd_dim) 43 | 44 | elif tri_modal_fuse: 45 | self.DAVEnet_projection = nn.Linear(1024, embd_dim // 2) 46 | self.text_pooling_caption = Sentence_Maxpool(we_dim, embd_dim // 2) 47 | self.GU_audio_text = Fused_Gated_Unit(embd_dim // 2, embd_dim) 48 | self.tri_modal = tri_modal 49 | self.tri_modal_fuse = tri_modal_fuse 50 | self.project = project 51 | if project==0: 52 | self.classification = nn.Linear(embd_dim, project_dim, bias=False) #4096,256 53 | else: 54 | 55 | self.projection_head = nn.Sequential( 56 | nn.Linear(embd_dim, embd_dim), 57 | nn.BatchNorm1d(embd_dim), 58 | nn.ReLU(inplace=True), 59 | nn.Linear(embd_dim, cluster_size), 60 | ) 61 | if joint_cluster: 62 | self.projection_head2 = self.projection_head 63 | self.projection_head3 = self.projection_head 64 | else: 65 | self.projection_head2 = nn.Sequential( 66 | nn.Linear(embd_dim, embd_dim), 67 | nn.BatchNorm1d(embd_dim), 68 | nn.ReLU(inplace=True), 69 | nn.Linear(embd_dim, cluster_size), 70 | ) 71 | self.projection_head3 = nn.Sequential( 72 | nn.Linear(embd_dim, embd_dim), 73 | nn.BatchNorm1d(embd_dim), 74 | nn.ReLU(inplace=True), 75 | nn.Linear(embd_dim, cluster_size), 76 | ) 77 | self.classification = nn.Linear(cluster_size, project_dim, bias=False) 78 | 79 | self.layer=layer 80 | 81 | self.output_norm = output_norm 82 | 83 | self.recon = recon 84 | self.recon_b = recon_b 85 | self.recon_cross = recon_cross 86 | if recon: 87 | if 
recon_b: 88 | inp_dim = cluster_size 89 | else: 90 | inp_dim = embd_dim 91 | 92 | self.recon_v = nn.Sequential( 93 | nn.Linear(inp_dim, embd_dim//8), 94 | nn.ReLU(inplace=True), 95 | nn.Linear(embd_dim//8, video_dim), 96 | nn.ReLU(inplace=True) 97 | ) 98 | self.recon_a = nn.Sequential( 99 | nn.Linear(inp_dim, embd_dim//8), 100 | nn.ReLU(inplace=True), 101 | nn.Linear(embd_dim//8, 1024), 102 | nn.ReLU(inplace=True) 103 | ) 104 | self.recon_t = nn.Sequential( 105 | nn.Linear(inp_dim, embd_dim//8), 106 | nn.ReLU(inplace=True), 107 | nn.Linear(embd_dim//8, embd_dim), 108 | nn.ReLU(inplace=True) 109 | ) 110 | self.mse = nn.MSELoss(reduction='none') 111 | 112 | self.finetune_video = finetune_video 113 | self.multi_head = multi_head 114 | if self.finetune_video: 115 | if self.multi_head: 116 | self.video_encoder = MultiHeadAttention(8, video_dim) 117 | else: 118 | self.video_encoder = nn.Sequential( 119 | nn.Conv1d(video_dim, embd_dim, 3, 2), 120 | nn.ReLU(inplace=True), 121 | nn.Conv1d(embd_dim, video_dim, 3, 2), 122 | nn.ReLU(inplace=True) 123 | ) 124 | 125 | def save_checkpoint(self, path): 126 | th.save(self.state_dict(), path) 127 | 128 | def load_checkpoint(self, path): 129 | try: 130 | self.load_state_dict(th.load(path, map_location='cpu')) 131 | except Exception as e: 132 | print(e) 133 | print("IGNORING ERROR, LOADING MODEL USING STRICT=FALSE") 134 | self.load_state_dict(th.load(path, map_location='cpu'), strict=False) 135 | print("Loaded model checkpoint from {}".format(path)) 136 | 137 | def forward(self, video, audio_input, nframes, text=None): 138 | if self.finetune_video: 139 | if self.multi_head: 140 | video = video.transpose(-1, -2) 141 | video = self.video_encoder(video, video, video) 142 | video = th.max(video, 1)[0] 143 | else: 144 | video = self.video_encoder(video) 145 | video = th.max(video, -1)[0] # Max pools along the last dimension 146 | video_gt = video 147 | video = self.GU_video(video) 148 | if self.recon and not self.recon_b: 149 | video_recon = self.recon_v(video) 150 | if self.recon_cross: 151 | audio_recon_v = self.recon_a(video) 152 | text_recon_v = self.recon_t(video) 153 | 154 | audio = self.DAVEnet(audio_input) 155 | if not self.training: # controlled by net.train() / net.eval() (use for downstream tasks) 156 | # Mean-pool audio embeddings and disregard embeddings from input 0 padding 157 | pooling_ratio = round(audio_input.size(-1) / audio.size(-1)) 158 | nframes.div_(pooling_ratio) 159 | audioPoolfunc = th.nn.AdaptiveAvgPool2d((1, 1)) # 160 | #audioPoolfunc = th.nn.AdaptiveMaxPool2d((1, 1)) 161 | audio_outputs = audio.unsqueeze(2) 162 | pooled_audio_outputs_list = [] 163 | for idx in range(audio.shape[0]): 164 | nF = max(1, nframes[idx]) 165 | pooled_audio_outputs_list.append(audioPoolfunc(audio_outputs[idx][:, :, 0:nF]).unsqueeze(0)) 166 | audio = th.cat(pooled_audio_outputs_list).squeeze(3).squeeze(2) 167 | else: 168 | audio = audio.mean(dim=2) # this averages features from 0 padding too 169 | 170 | if self.tri_modal_fuse: 171 | text = self.text_pooling_caption(text) 172 | audio = self.DAVEnet_projection(audio) 173 | audio_text = self.GU_audio_text(audio, text) 174 | return audio_text, video 175 | 176 | # Gating in lower embedding dimension (1024 vs 4096) for stability with mixed-precision training 177 | audio_gt = audio 178 | audio = self.GU_audio(audio) 179 | audio = self.DAVEnet_projection(audio) 180 | if self.recon and not self.recon_b: 181 | audio_recon = self.recon_a(audio) 182 | if self.recon_cross: 183 | video_recon_a = self.recon_v(audio) 184 | 
text_recon_a = self.recon_t(audio) 185 | if self.tri_modal and not self.tri_modal_fuse: 186 | text_gt = self.text_pooling_caption(text) 187 | text = self.GU_text_captions(text_gt) 188 | if self.recon and not self.recon_b: 189 | text_recon = self.recon_t(text) 190 | if self.recon_cross: 191 | audio_recon_t = self.recon_a(text) 192 | video_recon_t = self.recon_v(text) 193 | 194 | # video_c2 = self.layer2(video) 195 | #""" 196 | 197 | #""" 198 | if self.layer==1: 199 | video_c = self.layer1(video) 200 | audio_c = self.layer2(audio) 201 | text_c = self.layer3(text) 202 | else: 203 | if self.project==1: 204 | video_c = self.projection_head(video) 205 | video_c2 = nn.functional.normalize(video_c, dim=1, p=2) 206 | else: 207 | video_c2 = nn.functional.normalize(video, dim=1, p=2) 208 | if self.recon and self.recon_b: 209 | video_recon = self.recon_v(video_c2) 210 | video_c = self.classification(video_c2) 211 | 212 | # 213 | if self.project == 1: 214 | audio_c = self.projection_head2(audio) 215 | audio_c2 = nn.functional.normalize(audio_c, dim=1, p=2) 216 | else: 217 | audio_c2 = nn.functional.normalize(audio, dim=1, p=2) 218 | if self.recon and self.recon_b: 219 | audio_recon = self.recon_a(audio_c2) 220 | audio_c = self.classification(audio_c2) 221 | 222 | #text_c = self.projection_head(text) 223 | if self.project == 1: 224 | text_c = self.projection_head3(text) 225 | text_c2 = nn.functional.normalize(text_c, dim=1, p=2) 226 | else: 227 | text_c2 = nn.functional.normalize(text, dim=1, p=2) 228 | if self.recon and self.recon_b: 229 | text_recon = self.recon_t(text_c2) 230 | text_c = self.classification(text_c2) 231 | 232 | if self.recon: 233 | mse_v = th.mean(self.mse(video_recon, video_gt), dim=-1) 234 | mse_a = th.mean(self.mse(audio_recon, audio_gt), dim=-1) 235 | mse_t = th.mean(self.mse(text_recon, text_gt), dim=-1) 236 | mse = mse_v+mse_a+mse_t 237 | 238 | if self.recon_cross: 239 | mse = mse + th.mean(self.mse(video_recon_a, video_gt), dim=-1) 240 | mse = mse + th.mean(self.mse(video_recon_t, video_gt), dim=-1) 241 | mse = mse + th.mean(self.mse(audio_recon_v, audio_gt), dim=-1) 242 | mse = mse + th.mean(self.mse(audio_recon_t, audio_gt), dim=-1) 243 | mse = mse + th.mean(self.mse(text_recon_v, text_gt), dim=-1) 244 | mse = mse + th.mean(self.mse(text_recon_a, text_gt), dim=-1) 245 | 246 | return audio, video, text, audio_c, video_c, text_c, mse 247 | 248 | if self.output_norm: 249 | return audio, video, text, audio_c, video_c, text_c, audio_c2, video_c2, text_c2 250 | else: 251 | return audio, video, text, audio_c, video_c, text_c 252 | #return audio, video, text 253 | return audio, video 254 | 255 | 256 | class Gated_Embedding_Unit(nn.Module): 257 | def __init__(self, input_dimension, output_dimension): 258 | super(Gated_Embedding_Unit, self).__init__() 259 | self.fc = nn.Linear(input_dimension, output_dimension) 260 | self.cg = Context_Gating(output_dimension) 261 | 262 | def forward(self, x): 263 | x = self.fc(x) 264 | x = self.cg(x) 265 | return x 266 | 267 | 268 | class Fused_Gated_Unit(nn.Module): 269 | def __init__(self, input_dimension, output_dimension): 270 | super(Fused_Gated_Unit, self).__init__() 271 | self.fc_audio = nn.Linear(input_dimension, output_dimension) 272 | self.fc_text = nn.Linear(input_dimension, output_dimension) 273 | self.cg = Context_Gating(output_dimension) 274 | 275 | def forward(self, audio, text): 276 | audio = self.fc_audio(audio) 277 | text = self.fc_text(text) 278 | x = audio + text 279 | x = self.cg(x) 280 | return x 281 | 282 | 283 | class 
Context_Gating(nn.Module): 284 | def __init__(self, dimension): 285 | super(Context_Gating, self).__init__() 286 | self.fc = nn.Linear(dimension, dimension) 287 | 288 | def forward(self, x): 289 | x1 = self.fc(x) 290 | x = th.cat((x, x1), 1) 291 | return F.glu(x, 1) 292 | 293 | 294 | class Sentence_Maxpool(nn.Module): 295 | def __init__(self, word_dimension, output_dim): 296 | super(Sentence_Maxpool, self).__init__() 297 | self.fc = nn.Linear(word_dimension, output_dim) 298 | 299 | def forward(self, x): 300 | x = self.fc(x) 301 | x = F.relu(x) 302 | return th.max(x, dim=1)[0] -------------------------------------------------------------------------------- /model_tri_kmeans.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | from __future__ import print_function 5 | 6 | import torch.nn as nn 7 | import torch as th 8 | import torch.nn.functional as F 9 | from model_davenet import load_DAVEnet 10 | 11 | 12 | class Net(nn.Module): 13 | def __init__( 14 | self, 15 | embd_dim=4096, 16 | video_dim=4096, 17 | we_dim=300, 18 | tri_modal=False, 19 | tri_modal_fuse=False, 20 | cluster_size=256, 21 | layer=0, 22 | project=0, 23 | project_dim=6000, 24 | multi_cluster=0, 25 | recon=0, 26 | withMLP=0, 27 | recon_size=768, 28 | 29 | ): 30 | super(Net, self).__init__() 31 | self.DAVEnet = load_DAVEnet() 32 | self.DAVEnet_projection = nn.Linear(1024, embd_dim) 33 | self.GU_audio = Gated_Embedding_Unit(1024, 1024) 34 | self.GU_video = Gated_Embedding_Unit(video_dim, embd_dim) 35 | if tri_modal and not tri_modal_fuse: 36 | self.text_pooling_caption = Sentence_Maxpool(we_dim, embd_dim) 37 | self.GU_text_captions = Gated_Embedding_Unit(embd_dim, embd_dim) 38 | 39 | elif tri_modal_fuse: 40 | self.DAVEnet_projection = nn.Linear(1024, embd_dim // 2) 41 | self.text_pooling_caption = Sentence_Maxpool(we_dim, embd_dim // 2) 42 | self.GU_audio_text = Fused_Gated_Unit(embd_dim // 2, embd_dim) 43 | self.tri_modal = tri_modal 44 | self.tri_modal_fuse = tri_modal_fuse 45 | self.project = project 46 | self.withMLP = withMLP 47 | self.recon_size = recon_size 48 | if withMLP==1: 49 | if project==0: 50 | self.classification = nn.Linear(embd_dim, project_dim, bias=False) #4096,256 51 | self.classification2 = nn.Linear(embd_dim, project_dim, bias=False) # 4096,256 52 | self.classification3 = nn.Linear(embd_dim, project_dim, bias=False) # 4096,256 53 | else: 54 | 55 | self.projection_head = nn.Sequential( 56 | nn.Linear(embd_dim, embd_dim//8), 57 | nn.BatchNorm1d(embd_dim//8), 58 | nn.ReLU(inplace=True), 59 | nn.Linear(embd_dim//8, cluster_size), 60 | ) 61 | """ 62 | self.projection_head2 = nn.Sequential( 63 | nn.Linear(embd_dim, embd_dim), 64 | nn.BatchNorm1d(embd_dim), 65 | nn.ReLU(inplace=True), 66 | nn.Linear(embd_dim, cluster_size), 67 | ) 68 | self.projection_head3 = nn.Sequential( 69 | nn.Linear(embd_dim, embd_dim), 70 | nn.BatchNorm1d(embd_dim), 71 | nn.ReLU(inplace=True), 72 | nn.Linear(embd_dim, cluster_size), 73 | ) 74 | """ 75 | self.classification = nn.Linear(cluster_size, project_dim, bias=False) 76 | #self.classification2 = nn.Linear(cluster_size, project_dim, bias=False) # 4096,256 77 | #self.classification3 = nn.Linear(cluster_size, project_dim, bias=False) # 4096,256 78 | 79 | self.layer=layer 80 | self.recon = recon 81 | if recon: 82 | inp_dim = embd_dim 83 | 84 | self.recon_v = nn.Sequential( 85 | nn.Linear(inp_dim, recon_size), 86 | 
nn.ReLU(inplace=True), 87 | nn.Linear(recon_size, video_dim), 88 | nn.ReLU(inplace=True) 89 | ) 90 | self.recon_a = nn.Sequential( 91 | nn.Linear(inp_dim, recon_size), 92 | nn.ReLU(inplace=True), 93 | nn.Linear(recon_size, 1024), 94 | nn.ReLU(inplace=True) 95 | ) 96 | self.recon_t = nn.Sequential( 97 | nn.Linear(inp_dim, recon_size), 98 | nn.ReLU(inplace=True), 99 | nn.Linear(recon_size, embd_dim), 100 | nn.ReLU(inplace=True) 101 | ) 102 | self.mse = nn.MSELoss(reduction='none') 103 | 104 | 105 | def save_checkpoint(self, path): 106 | th.save(self.state_dict(), path) 107 | 108 | def load_checkpoint(self, path): 109 | try: 110 | self.load_state_dict(th.load(path, map_location='cpu')) 111 | except Exception as e: 112 | print(e) 113 | print("IGNORING ERROR, LOADING MODEL USING STRICT=FALSE") 114 | self.load_state_dict(th.load(path, map_location='cpu'), strict=False) 115 | print("Loaded model checkpoint from {}".format(path)) 116 | 117 | def forward(self, video, audio_input, nframes, text=None): 118 | video_gt = video 119 | video = self.GU_video(video) 120 | if self.recon: 121 | video_recon = self.recon_v(video) 122 | audio = self.DAVEnet(audio_input) 123 | if not self.training: # controlled by net.train() / net.eval() (use for downstream tasks) 124 | # Mean-pool audio embeddings and disregard embeddings from input 0 padding 125 | pooling_ratio = round(audio_input.size(-1) / audio.size(-1)) 126 | nframes.div_(pooling_ratio) 127 | audioPoolfunc = th.nn.AdaptiveAvgPool2d((1, 1)) # 128 | #audioPoolfunc = th.nn.AdaptiveMaxPool2d((1, 1)) 129 | audio_outputs = audio.unsqueeze(2) 130 | pooled_audio_outputs_list = [] 131 | for idx in range(audio.shape[0]): 132 | nF = max(1, nframes[idx]) 133 | pooled_audio_outputs_list.append(audioPoolfunc(audio_outputs[idx][:, :, 0:nF]).unsqueeze(0)) 134 | audio = th.cat(pooled_audio_outputs_list).squeeze(3).squeeze(2) 135 | else: 136 | audio = audio.mean(dim=2) # this averages features from 0 padding too 137 | 138 | if self.tri_modal_fuse: 139 | text = self.text_pooling_caption(text) 140 | audio = self.DAVEnet_projection(audio) 141 | audio_text = self.GU_audio_text(audio, text) 142 | return audio_text, video 143 | 144 | # Gating in lower embedding dimension (1024 vs 4096) for stability with mixed-precision training 145 | audio_gt = audio 146 | audio = self.GU_audio(audio) 147 | audio = self.DAVEnet_projection(audio) 148 | if self.recon: 149 | audio_recon = self.recon_a(audio) 150 | if self.tri_modal and not self.tri_modal_fuse: 151 | text_gt = self.text_pooling_caption(text) 152 | text = self.GU_text_captions(text_gt) 153 | #fushed = (audio+text+video)/3 154 | # video_c2 = self.layer2(video) 155 | #""" 156 | if self.recon: 157 | text_recon = self.recon_t(text) 158 | 159 | 160 | if self.layer==1: 161 | video_c = self.layer1(video) 162 | audio_c = self.layer2(audio) 163 | text_c = self.layer3(text) 164 | else: 165 | if self.withMLP==1: 166 | if self.project==1: 167 | video_c = self.projection_head(video) 168 | video_c = nn.functional.normalize(video_c, dim=1, p=2) 169 | else: 170 | video_c = nn.functional.normalize(video, dim=1, p=2) 171 | video_c = self.classification(video_c) 172 | 173 | # 174 | if self.project == 1: 175 | audio_c = self.projection_head(audio) 176 | audio_c = nn.functional.normalize(audio_c, dim=1, p=2) 177 | else: 178 | audio_c = nn.functional.normalize(audio, dim=1, p=2) 179 | audio_c = self.classification(audio_c) 180 | 181 | #text_c = self.projection_head(text) 182 | if self.project == 1: 183 | text_c = self.projection_head(text) 184 | text_c = 
nn.functional.normalize(text_c, dim=1, p=2) 185 | else: 186 | text_c = nn.functional.normalize(text, dim=1, p=2) 187 | text_c = self.classification(text_c) 188 | #else: 189 | # audio_c = video_c = text_c = audio 190 | #""" 191 | #fushed = (audio_c + text_c + video_c) / 3 192 | 193 | #fushed = self.projection_head(fushed) 194 | #fushed = nn.functional.normalize(fushed, dim=1, p=2) 195 | #video_c = audio_c = text_c= fushed#self.classification(fushed) 196 | if self.recon: 197 | mse_v = th.mean(self.mse(video_recon, video_gt), dim=-1) 198 | mse_a = th.mean(self.mse(audio_recon, audio_gt), dim=-1) 199 | mse_t = th.mean(self.mse(text_recon, text_gt), dim=-1) 200 | if self.withMLP == 1: 201 | return audio, video, text, audio_c, video_c, text_c, mse_v + mse_a + mse_t 202 | else: 203 | return audio, video, text, mse_v + mse_a + mse_t 204 | return audio, video, text, text#, audio_c, video_c, text_c 205 | #return audio, video, text 206 | return audio, video 207 | 208 | 209 | class Gated_Embedding_Unit(nn.Module): 210 | def __init__(self, input_dimension, output_dimension): 211 | super(Gated_Embedding_Unit, self).__init__() 212 | self.fc = nn.Linear(input_dimension, output_dimension) 213 | self.cg = Context_Gating(output_dimension) 214 | 215 | def forward(self, x): 216 | x = self.fc(x) 217 | x = self.cg(x) 218 | return x 219 | 220 | 221 | class Fused_Gated_Unit(nn.Module): 222 | def __init__(self, input_dimension, output_dimension): 223 | super(Fused_Gated_Unit, self).__init__() 224 | self.fc_audio = nn.Linear(input_dimension, output_dimension) 225 | self.fc_text = nn.Linear(input_dimension, output_dimension) 226 | self.cg = Context_Gating(output_dimension) 227 | 228 | def forward(self, audio, text): 229 | audio = self.fc_audio(audio) 230 | text = self.fc_text(text) 231 | x = audio + text 232 | x = self.cg(x) 233 | return x 234 | 235 | 236 | class Context_Gating(nn.Module): 237 | def __init__(self, dimension): 238 | super(Context_Gating, self).__init__() 239 | self.fc = nn.Linear(dimension, dimension) 240 | 241 | def forward(self, x): 242 | x1 = self.fc(x) 243 | x = th.cat((x, x1), 1) 244 | return F.glu(x, 1) 245 | 246 | 247 | class Sentence_Maxpool(nn.Module): 248 | def __init__(self, word_dimension, output_dim): 249 | super(Sentence_Maxpool, self).__init__() 250 | self.fc = nn.Linear(word_dimension, output_dim) 251 | 252 | def forward(self, x): 253 | x = self.fc(x) 254 | x = F.relu(x) 255 | return th.max(x, dim=1)[0] 256 | -------------------------------------------------------------------------------- /msrvtt_dataloader.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | from __future__ import print_function 5 | 6 | import torch as th 7 | from torch.utils.data import Dataset 8 | import pickle 9 | import torch.nn.functional as F 10 | import numpy as np 11 | import re 12 | import pandas as pd 13 | from collections import defaultdict 14 | from torch.utils.data.dataloader import default_collate 15 | import json 16 | import random 17 | 18 | class MSRVTT_DataLoader(Dataset): 19 | """MSRVTT dataset loader.""" 20 | 21 | def __init__( 22 | self, 23 | data_path, 24 | we, 25 | we_dim=300, 26 | max_words=30, 27 | num_frames_multiplier=5, 28 | training=True, 29 | tri_modal=False, 30 | ): 31 | """ 32 | Args: 33 | """ 34 | self.data = pickle.load(open(data_path, 'rb')) 35 | self.we = we 36 | self.we_dim = we_dim 37 | self.max_words = max_words 38 | 
self.max_video = 30 39 | self.num_frames_multiplier = num_frames_multiplier 40 | self.training = training 41 | self.tri_modal = tri_modal 42 | 43 | def __len__(self): 44 | return len(self.data) 45 | 46 | def custom_collate(self, batch): 47 | return default_collate(batch) 48 | 49 | def _zero_pad_tensor(self, tensor, size): 50 | if len(tensor) >= size: 51 | return tensor[:size] 52 | else: 53 | zero = np.zeros((size - len(tensor), self.we_dim), dtype=np.float32) 54 | return np.concatenate((tensor, zero), axis=0) 55 | 56 | def _tokenize_text(self, sentence): 57 | w = re.findall(r"[\w']+", str(sentence)) 58 | return w 59 | 60 | def _words_to_we(self, words): 61 | words = [word for word in words if word in self.we.vocab] 62 | if words: 63 | we = self._zero_pad_tensor(self.we[words], self.max_words) 64 | return th.from_numpy(we) 65 | else: 66 | return th.zeros(self.max_words, self.we_dim) 67 | 68 | def _get_caption(self, idx): 69 | """Chooses random caption if training. Uses set caption if evaluating.""" 70 | if self.training: 71 | captions = self.data[idx]['caption'] 72 | caption = self._words_to_we(self._tokenize_text(random.choice(captions))) 73 | return caption 74 | else: 75 | caption = self.data[idx]['eval_caption'] 76 | return self._words_to_we(self._tokenize_text(caption)) 77 | 78 | 79 | def __getitem__(self, idx): 80 | video_id = self.data[idx]['id'] 81 | # load 2d and 3d features (features are pooled over the time dimension) 82 | #""" 83 | feat_2d = F.normalize(th.from_numpy(self.data[idx]['2d_pooled']).float(), dim=0) 84 | feat_3d = F.normalize(th.from_numpy(self.data[idx]['3d_pooled']).float(), dim=0) 85 | video = th.cat((feat_2d, feat_3d)) 86 | """ 87 | feat_2d = th.from_numpy(self.data[idx]['2d']).float() 88 | feat_3d = th.from_numpy(self.data[idx]['3d']).float() 89 | feat_2d = feat_2d[:10] 90 | feat_3d = feat_3d[:10] 91 | #feat_2d = F.interpolate(feat_2d.transpose(1, 0).unsqueeze(0), self.max_frames, mode='linear', 92 | # align_corners=True).squeeze(0) 93 | #feat_3d = F.interpolate(feat_3d.transpose(1, 0).unsqueeze(0), self.max_frames, mode='linear', 94 | # align_corners=True).squeeze(0) 95 | 96 | feat_2d = F.normalize(feat_2d, dim=1) 97 | feat_3d = F.normalize(feat_3d, dim=1) 98 | video = th.cat((feat_2d, feat_3d), dim=1) 99 | #""" 100 | # load audio and zero pad/truncate if necessary 101 | audio = self.data[idx]['audio'] 102 | target_length = 1024 * self.num_frames_multiplier 103 | nframes = audio.numpy().shape[1] 104 | p = target_length - nframes 105 | if p > 0: 106 | audio = np.pad(audio, ((0,0),(0,p)), 'constant', constant_values=(0,0)) 107 | elif p < 0: 108 | audio = audio[:,0:p] 109 | audio = th.FloatTensor(audio) 110 | 111 | # choose a caption 112 | caption='' 113 | if self.tri_modal: 114 | caption = self._get_caption(idx) 115 | 116 | return {'video': video, 'text': caption, 'video_id': video_id, 117 | 'audio': audio, 'nframes': nframes} 118 | 119 | 120 | class MSRVTT_DataLoader_label(Dataset): 121 | """MSRVTT dataset loader.""" 122 | 123 | def __init__( 124 | self, 125 | data_path, 126 | we, 127 | pseudo_v, 128 | pseudo_a, 129 | we_dim=300, 130 | max_words=30, 131 | num_frames_multiplier=5, 132 | training=True, 133 | tri_modal=False, 134 | ): 135 | """ 136 | Args: 137 | """ 138 | self.data = pickle.load(open(data_path, 'rb')) 139 | self.we = we 140 | self.we_dim = we_dim 141 | self.max_words = max_words 142 | self.max_video = 30 143 | self.num_frames_multiplier = num_frames_multiplier 144 | self.training = training 145 | self.tri_modal = tri_modal 146 | self.pseudo_v = 
pseudo_v 147 | self.pseudo_a = pseudo_a 148 | 149 | def __len__(self): 150 | return len(self.data) 151 | 152 | def custom_collate(self, batch): 153 | return default_collate(batch) 154 | 155 | def _zero_pad_tensor(self, tensor, size): 156 | if len(tensor) >= size: 157 | return tensor[:size] 158 | else: 159 | zero = np.zeros((size - len(tensor), self.we_dim), dtype=np.float32) 160 | return np.concatenate((tensor, zero), axis=0) 161 | 162 | def _tokenize_text(self, sentence): 163 | w = re.findall(r"[\w']+", str(sentence)) 164 | return w 165 | 166 | def _words_to_we(self, words): 167 | words = [word for word in words if word in self.we.vocab] 168 | if words: 169 | we = self._zero_pad_tensor(self.we[words], self.max_words) 170 | return th.from_numpy(we) 171 | else: 172 | return th.zeros(self.max_words, self.we_dim) 173 | 174 | def _get_caption(self, idx): 175 | """Chooses random caption if training. Uses set caption if evaluating.""" 176 | if self.training: 177 | captions = self.data[idx]['caption'] 178 | caption = self._words_to_we(self._tokenize_text(random.choice(captions))) 179 | return caption 180 | else: 181 | caption = self.data[idx]['eval_caption'] 182 | return self._words_to_we(self._tokenize_text(caption)) 183 | 184 | def __getitem__(self, idx): 185 | video_id = self.data[idx]['id'] 186 | # load 2d and 3d features (features are pooled over the time dimension) 187 | feat_2d = F.normalize(th.from_numpy(self.data[idx]['2d_pooled']).float(), dim=0) 188 | feat_3d = F.normalize(th.from_numpy(self.data[idx]['3d_pooled']).float(), dim=0) 189 | video = th.cat((feat_2d, feat_3d)) 190 | 191 | # load audio and zero pad/truncate if necessary 192 | audio = self.data[idx]['audio'] 193 | target_length = 1024 * self.num_frames_multiplier 194 | nframes = audio.numpy().shape[1] 195 | p = target_length - nframes 196 | if p > 0: 197 | audio = np.pad(audio, ((0, 0), (0, p)), 'constant', constant_values=(0, 0)) 198 | elif p < 0: 199 | audio = audio[:, 0:p] 200 | audio = th.FloatTensor(audio) 201 | 202 | # choose a caption 203 | caption = '' 204 | if self.tri_modal: 205 | caption = self._get_caption(idx) 206 | 207 | return {'video': video, 'text': caption, 'video_id': self.data[idx]['id'], 208 | 'audio': audio, 'nframes': nframes, 'pseudo_v': self.pseudo_v[idx], 'pseudo_a': self.pseudo_a[idx]} 209 | -------------------------------------------------------------------------------- /script.txt: -------------------------------------------------------------------------------- 1 | # get machine 2 | 3 | srun --gres=gpu:4 -N 1 --exclusive --mem=1000G --time 24:00:00 --cpus-per-task=74 --qos=sched_level_2 --pty /bin/bash 4 | 5 | # training 6 | 7 | model1=AVLnet_tri_single_cluster_128_soft_8000_project_cos_mil_e4_sp_6144 8 | 9 | python -u train_tri_cos_mil.py --num_thread_reader=74 --epochs=30 --batch_size=128 \ 10 | --n_pair=32 --embd_dim=6144 --howto_audio_frames=1000 --min_time=10.0 --random_audio_windows=0 \ 11 | --lr=0.0001 --tri_modal=1 --apex_level=1 --cluster=1 --soft_label=1 --start_cluster=0 --project=1 --project_dim=8000 \ 12 | --features_path=/nobackup/users/kaudhkha/sightsound/data/howto/parsed_videos \ 13 | --features_path_audio=/nobackup/projects/public/howto100m/parsed_videos \ 14 | --pretrain_path=/nobackup/users/brian27/MCN_public/model_mcn/$model1/e9.pth \ 15 | --checkpoint_dir=model_mcn/$model1 >> logs/$model1 16 | 17 | # resume pretrain 18 | 19 | --pretrain_path=/nobackup/users/brian27/avlnet_private/model_mcn/$model1/e9.pth \ 20 | 21 | # test on youcook, MSR-VTT 22 | 23 | python train_tri_c.py 
--eval_msrvtt=1 --num_thread_reader=74 --batch_size=512 --epochs=30 --project=1 --project_dim=8000 \ 24 | --lr_decay=1.0 --embd_dim=6144 --pretrain_path=model_mcn/AVLnet_tri_single_cluster_128_soft_8000_project_cos_mil_e4_sp_6144/e9.pth \ 25 | --lr=1e-5 --tri_modal=1 26 | 27 | python train_tri_c.py --eval_youcook=1 --num_thread_reader=74 --batch_size=512 --epochs=30 --project=1 --project_dim=8000 \ 28 | --lr_decay=1.0 --embd_dim=6144 --pretrain_path=model_mcn/AVLnet_tri_single_cluster_128_soft_8000_project_cos_mil_e4_sp_6144/e9.pth \ 29 | --lr=1e-5 --tri_modal=1 -------------------------------------------------------------------------------- /train_avlnet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --qos=sched_level_2 3 | #SBATCH --gres=gpu:4 4 | #SBATCH --gpus-per-node=4 5 | #SBATCH --nodes=1 6 | #SBATCH --time=24:00:00 7 | #SBATCH --cpus-per-task 74 8 | #SBATCH --ntasks-per-node=1 9 | #SBATCH --mem=1T 10 | #SBATCH --exclusive 11 | #SBATCH --job-name="ht" 12 | #SBATCH --output logs/ht-%j.out 13 | #SBATCH --error logs/ht-%j.err 14 | ## NOTE: adjust the dependency if needed for the 2nd and 3rd run 15 | ##SBATCH --dependency=afterok:12625 16 | 17 | ## Number of total processes 18 | echo " " 19 | echo " Nodelist:= " $SLURM_JOB_NODELIST 20 | echo " Number of nodes:= " $SLURM_JOB_NUM_NODES 21 | echo " GPUs per node:= " $SLURM_JOB_GPUS 22 | echo " Ntasks per node:= " $SLURM_NTASKS_PER_NODE 23 | 24 | echo " Running on multiple nodes/GPU devices" 25 | echo "" 26 | echo " Run started at:- " 27 | date 28 | 29 | source /nobackup/users/duartek/anaconda3/bin/activate 30 | conda activate wmlce-1.6.2 31 | 32 | nvidia-smi 33 | pwd 34 | 35 | ##################### 36 | 37 | 38 | python -u train_tri_kmeans.py --num_thread_reader=74 --epochs=10 --batch_size=128 \ 39 | --n_pair=32 --embd_dim=6144 --howto_audio_frames=1000 --min_time=10.0 --random_audio_windows=0 \ 40 | --lr=0.0001 --tri_modal=1 --apex_level=1 --kmeans=1 --use_queue=1 --queue_size=20 --fastC=1 --mean=1 --recon=1 --recon_size=1024 \ 41 | --features_path=/nobackup/users/kaudhkha/sightsound/data/howto/parsed_videos \ 42 | --features_path_audio=/nobackup/projects/public/howto100m/parsed_videos \ 43 | --pretrain_path=model_mcn/MCN_KMeans/e16.pth --train_csv=data/HowTo100M_336_videopaths.txt \ 44 | --checkpoint_dir=model_mcn/MCN_KMeans >> logs/MCN_KMeans 45 | 46 | 47 | #python -u train_tri_cos_mil.py --num_thread_reader=74 --epochs=30 --batch_size=128 \ 48 | #--n_pair=32 --embd_dim=6144 --howto_audio_frames=1000 --min_time=10.0 --random_audio_windows=0 --finetune_video=0 --video_interp=0 \ 49 | #--recon=1 --recon_b=0 --recon_cross=0 --joint_cluster=1 --cluster_a=0 --multi_head=0 \ 50 | #--lr=0.0001 --tri_modal=1 --apex_level=1 --cluster=1 --soft_label=0 --start_cluster=0 --project=1 --project_dim=8000 \ 51 | #--features_path=/nobackup/users/kaudhkha/sightsound/data/howto/parsed_videos \ 52 | #--features_path_audio=/nobackup/projects/public/howto100m/parsed_videos \ 53 | #--pretrain_path=model_mcn/MCN_Sports/e10.pth --train_csv=data/HowTo100M_336_videopaths.txt \ 54 | #--checkpoint_dir=model_mcn/MCN_Sports >> logs/MCN_Sports 55 | 56 | # --pretrain_path=/nobackup/users/brian27/MCN_public/model_mcn/$model1/e9.pth \ 57 | ## Run two training commands in the background, each on two V100 GPUs 58 | #model1=AVLnet_test_code_release 59 | #model2=AVLnet_text_test_code_release 60 | 61 | 62 | #CUDA_VISIBLE_DEVICES=0,1 python -u train.py --num_thread_reader=20 --epochs=7 --batch_size=128 
--n_pair=32 --embd_dim=4096 --howto_audio_frames=1000 --lr=0.001 --apex_level=1 \ 63 | #--features_path=/nobackup/users/kaudhkha/sightsound/data/howto/parsed_videos --features_path_audio=/nobackup/projects/public/howto100m/parsed_videos \ 64 | #--checkpoint_dir=model/$model1 >> logs/$model1 & \ 65 | 66 | ## Add --pretrain_path to the command before the >> for the second run 67 | # --pretrain_path=model/$model1/e7.pth 68 | 69 | #CUDA_VISIBLE_DEVICES=2,3 python -u train.py --num_thread_reader=20 --epochs=7 --batch_size=128 --n_pair=32 --embd_dim=4096 --howto_audio_frames=1000 --min_time=10.0 --random_audio_windows=0 \ 70 | #--lr=0.0001 --tri_modal=1 --tri_modal_fuse=1 --apex_level=1 --features_path=/nobackup/users/kaudhkha/sightsound/data/howto/parsed_videos \ 71 | #--features_path_audio=/nobackup/projects/public/howto100m/parsed_videos --checkpoint_dir=model/$model2 >> logs/$model2 & \ 72 | 73 | ## Add --pretrain_path to the command before the >> for the second run 74 | # --pretrain_path=model/$model2/e7.pth 75 | 76 | ## Wait for all commands to finish 77 | wait 78 | echo "Run completed at:- " 79 | date 80 | -------------------------------------------------------------------------------- /ucf_dataloader.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | from __future__ import print_function 5 | 6 | import torch as th 7 | from torch.utils.data import Dataset 8 | import pickle 9 | import torch.nn.functional as F 10 | import numpy as np 11 | import re 12 | import pandas as pd 13 | from collections import defaultdict 14 | from torch.utils.data.dataloader import default_collate 15 | import json 16 | import random 17 | 18 | 19 | def name_to_stringlist(name): 20 | change = {} 21 | """ 22 | change = {'HandStandPushups': ['handstand', 'pushups'], 23 | 'HandstandPushups': ['handstand', 'pushups'], 24 | 'PushUps': ['pushups'], 25 | 'PullUps': ['pullups']} 26 | """ 27 | """ 28 | change = { 29 | 'CleanAndJerk': ['weight', 'lift'], 30 | 'Skijet': ['Skyjet'], 31 | 'HandStandPushups': ['handstand', 'pushups'], 32 | 'HandstandPushups': ['handstand', 'pushups'], 33 | 'PushUps': ['pushups'], 34 | 'PullUps': ['pullups'], 35 | 'WalkingWithDog': ['walk', 'dog'], 36 | 'ThrowDiscus': ['throw', 'disc'], 37 | 'TaiChi': ['taichi'], 38 | 'CuttingInKitchen': ['cut', 'kitchen'], 39 | 'YoYo': ['yoyo'], 40 | } 41 | """ 42 | if name in change: 43 | name_vec = change[name] 44 | else: 45 | upper_idx = np.where([x.isupper() for x in name])[0].tolist() 46 | upper_idx += [len(name)] 47 | name_vec = [] 48 | for i in range(len(upper_idx)-1): 49 | name_vec.append(name[upper_idx[i]: upper_idx[i+1]]) 50 | name_vec = [n.lower() for n in name_vec] 51 | #name_vec = verbs2basicform(name_vec) 52 | return name_vec 53 | 54 | 55 | class UCF_DataLoader(Dataset): 56 | """MSRVTT dataset loader.""" 57 | 58 | def __init__( 59 | self, 60 | data_path, 61 | we, 62 | we_dim=300, 63 | max_words=30, 64 | num_frames_multiplier=5, 65 | training=True, 66 | tri_modal=False, 67 | finetune_video=False, 68 | video_interp=False 69 | ): 70 | """ 71 | Args: 72 | """ 73 | self.data = pickle.load(open(data_path, 'rb')) # contains a list of video names 74 | self.we = we 75 | self.we_dim = we_dim 76 | self.max_words = max_words 77 | self.max_video = 30 78 | self.num_frames_multiplier = num_frames_multiplier 79 | self.training = training 80 | self.tri_modal = tri_modal 81 | self.finetune_video = 
finetune_video 82 | self.max_frames = 16 83 | self.video_interp = video_interp 84 | 85 | names = [] 86 | for vid in self.data: 87 | names.append(vid['class']) 88 | 89 | self.classes = sorted(set(names)) 90 | print('# Classes', len(self.classes)) 91 | 92 | self.class_embeds = [] 93 | for name in self.classes: 94 | word_list = name_to_stringlist(name) 95 | caption = ' '.join(word_list) 96 | self.class_embeds.append(self._get_caption(caption)) 97 | self.class_embeds = th.stack(self.class_embeds, 0) 98 | print('Shape of class embeds', self.class_embeds.shape) 99 | 100 | def __len__(self): 101 | return len(self.data) 102 | 103 | def custom_collate(self, batch): 104 | return default_collate(batch) 105 | 106 | def _zero_pad_tensor(self, tensor, size): 107 | if len(tensor) >= size: 108 | return tensor[:size] 109 | else: 110 | zero = np.zeros((size - len(tensor), self.we_dim), dtype=np.float32) 111 | return np.concatenate((tensor, zero), axis=0) 112 | 113 | def _tokenize_text(self, sentence): 114 | w = re.findall(r"[\w']+", str(sentence)) 115 | return w 116 | 117 | def _words_to_we(self, words): 118 | words = [word for word in words if word in self.we.vocab] 119 | if words: 120 | we = self._zero_pad_tensor(self.we[words], self.max_words) 121 | return th.from_numpy(we) 122 | else: 123 | return th.zeros(self.max_words, self.we_dim) 124 | 125 | def _get_caption(self, idx): 126 | """Chooses random caption if training. Uses set caption if evaluating.""" 127 | if self.training: 128 | captions = idx 129 | caption = self._words_to_we(self._tokenize_text(random.choice(captions))) 130 | return caption 131 | else: 132 | caption = idx 133 | return self._words_to_we(self._tokenize_text(caption)) 134 | 135 | def __getitem__(self, idx): 136 | data = self.data[idx] 137 | # load 2d and 3d features (features are pooled over the time dimension) 138 | 139 | if self.finetune_video: 140 | feat_2d = th.from_numpy(self.data[idx]['2d']).float() 141 | feat_3d = th.from_numpy(self.data[idx]['3d']).float() 142 | if self.video_interp: 143 | feat_2d = F.interpolate(feat_2d.transpose(1, 0).unsqueeze(0), self.max_frames, mode='linear', 144 | align_corners=True).squeeze(0) 145 | feat_3d = F.interpolate(feat_3d.transpose(1, 0).unsqueeze(0), self.max_frames, mode='linear', 146 | align_corners=True).squeeze(0) 147 | else: 148 | feat2d_buffer = th.zeros(self.max_frames, feat_2d.shape[-1]) 149 | feat_2d = feat_2d[:self.max_frames] 150 | feat2d_buffer[:len(feat_2d)] = feat_2d 151 | 152 | feat3d_buffer = th.zeros(self.max_frames, feat_3d.shape[-1]) 153 | feat_3d = feat_3d[:self.max_frames] 154 | feat3d_buffer[:len(feat_3d)] = feat_3d 155 | 156 | feat_2d = feat2d_buffer.transpose(1, 0) 157 | feat_3d = feat3d_buffer.transpose(1, 0) 158 | 159 | feat_2d = F.normalize(feat_2d, dim=0) 160 | feat_3d = F.normalize(feat_3d, dim=0) 161 | video = th.cat((feat_2d, feat_3d), dim=0) 162 | else: 163 | feat_2d = F.normalize(th.from_numpy(self.data[idx]['2d_pooled']).float(), dim=0) 164 | feat_3d = F.normalize(th.from_numpy(self.data[idx]['3d_pooled']).float(), dim=0) 165 | video = th.cat((feat_2d, feat_3d)) 166 | 167 | # load audio and zero pad/truncate if necessary 168 | audio = self.data[idx]['audio'] 169 | target_length = 1024 * self.num_frames_multiplier 170 | nframes = audio.shape[1] 171 | p = target_length - nframes 172 | if p > 0: 173 | audio = np.pad(audio, ((0, 0), (0, p)), 'constant', constant_values=(0, 0)) 174 | elif p < 0: 175 | audio = audio[:, 0:p] 176 | audio = th.FloatTensor(audio) 177 | 178 | # choose a caption 179 | caption = '' 180 | 
name = self.data[idx]['class'] 181 | if self.tri_modal: 182 | word_list = name_to_stringlist(name) 183 | caption = ' '.join(word_list) 184 | caption = self._get_caption(caption) 185 | 186 | return {'video': video, 'text': caption, 'video_id': idx, 187 | 'audio': audio, 'nframes': 32, 'class_name': name, 188 | 'class_id': th.ones(1)*self.classes.index(name), 189 | 'has_audio': th.ones(1)*self.data[idx]['has_audio'], 190 | 'video_name': self.data[idx]['video'], 191 | 'training': th.ones(1)*self.data[idx]['training']} 192 | 193 | 194 | class MSRVTT_DataLoader_label(Dataset): 195 | """MSRVTT dataset loader.""" 196 | 197 | def __init__( 198 | self, 199 | data_path, 200 | we, 201 | pseudo_v, 202 | pseudo_a, 203 | we_dim=300, 204 | max_words=30, 205 | num_frames_multiplier=5, 206 | training=True, 207 | tri_modal=False, 208 | ): 209 | """ 210 | Args: 211 | """ 212 | self.data = pickle.load(open(data_path, 'rb')) 213 | self.we = we 214 | self.we_dim = we_dim 215 | self.max_words = max_words 216 | self.max_video = 30 217 | self.num_frames_multiplier = num_frames_multiplier 218 | self.training = training 219 | self.tri_modal = tri_modal 220 | self.pseudo_v = pseudo_v 221 | self.pseudo_a = pseudo_a 222 | 223 | 224 | 225 | def __len__(self): 226 | return len(self.data) 227 | 228 | def custom_collate(self, batch): 229 | return default_collate(batch) 230 | 231 | def _zero_pad_tensor(self, tensor, size): 232 | if len(tensor) >= size: 233 | return tensor[:size] 234 | else: 235 | zero = np.zeros((size - len(tensor), self.we_dim), dtype=np.float32) 236 | return np.concatenate((tensor, zero), axis=0) 237 | 238 | def _tokenize_text(self, sentence): 239 | w = re.findall(r"[\w']+", str(sentence)) 240 | return w 241 | 242 | def _words_to_we(self, words): 243 | words = [word for word in words if word in self.we.vocab] 244 | if words: 245 | we = self._zero_pad_tensor(self.we[words], self.max_words) 246 | return th.from_numpy(we) 247 | else: 248 | return th.zeros(self.max_words, self.we_dim) 249 | 250 | def _get_caption(self, idx): 251 | """Chooses random caption if training. 
Uses set caption if evaluating.""" 252 | if self.training: 253 | captions = self.data[idx]['caption'] 254 | caption = self._words_to_we(self._tokenize_text(random.choice(captions))) 255 | return caption 256 | else: 257 | caption = self.data[idx]['eval_caption'] 258 | return self._words_to_we(self._tokenize_text(caption)) 259 | 260 | def __getitem__(self, idx): 261 | video_id = self.data[idx]['id'] 262 | # load 2d and 3d features (features are pooled over the time dimension) 263 | feat_2d = F.normalize(th.from_numpy(self.data[idx]['2d_pooled']).float(), dim=0) 264 | feat_3d = F.normalize(th.from_numpy(self.data[idx]['3d_pooled']).float(), dim=0) 265 | video = th.cat((feat_2d, feat_3d)) 266 | 267 | # load audio and zero pad/truncate if necessary 268 | audio = self.data[idx]['audio'] 269 | target_length = 1024 * self.num_frames_multiplier 270 | nframes = audio.numpy().shape[1] 271 | p = target_length - nframes 272 | if p > 0: 273 | audio = np.pad(audio, ((0, 0), (0, p)), 'constant', constant_values=(0, 0)) 274 | elif p < 0: 275 | audio = audio[:, 0:p] 276 | audio = th.FloatTensor(audio) 277 | 278 | # choose a caption 279 | caption = '' 280 | if self.tri_modal: 281 | caption = self._get_caption(idx) 282 | 283 | return {'video': video, 'text': caption, 'video_id': self.data[idx]['id'], 284 | 'audio': audio, 'nframes': nframes, 'pseudo_v': self.pseudo_v[idx], 'pseudo_a': self.pseudo_a[idx]} 285 | -------------------------------------------------------------------------------- /video_evaluation.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import numpy as np 3 | 4 | 5 | def recall(mat, gts): 6 | # mat is of shape (Queries, Targets), where higher=prediction 7 | # gts is of shape (Queries, ) 8 | 9 | predictions = np.argsort(mat, 1) # (Queries, Targets) 10 | 11 | top_1 = predictions[:, -1] 12 | 13 | recall = np.mean(top_1 == gts) 14 | print('NN Retrieval R@1:', recall) 15 | 16 | recall_top5 = np.mean([l in p for l, p in zip(gts, predictions[:, -5:])]) 17 | print('NN Retrieval R@5:', recall_top5) 18 | 19 | recall_top10 = np.mean([l in p for l, p in zip(gts, predictions[:, -10:])]) 20 | print('NN Retrieval R@10:', recall_top10) 21 | 22 | 23 | def evaluate_recall_youcook(text, video_audio, clip_ids, m=None): 24 | # text is of shape (n_clips, n_feats) 25 | # video_audio is of shape (n_clips, n_feats) 26 | # video_ids is a list of length n_clips with all the clip_ids 27 | full_videos = sorted(list(set([d[:11] for d in clip_ids]))) 28 | print('# Clips', len(clip_ids)) 29 | print('# Videos', len(full_videos)) 30 | 31 | n_clips = len(clip_ids) 32 | n_vids = len(full_videos) 33 | clip_to_video = [] 34 | [clip_to_video.extend([i for i, x in enumerate(full_videos) if x in clip_id]) for clip_id in clip_ids] 35 | clip_to_video = np.array(clip_to_video) 36 | 37 | if m is None: 38 | m = np.matmul(text, video_audio.T) # (n_clips, n_clips) 39 | print('Standard Retrieval | single caption -> single clip') 40 | recall(m, np.arange(m.shape[0])) 41 | 42 | predictions = np.argsort(m, 1) 43 | 44 | video_predictions = clip_to_video[predictions] 45 | video_gts = clip_to_video[np.arange(len(clip_to_video))] 46 | 47 | print('Retrieval single | single caption -> full video') 48 | recall_top1 = np.mean(video_predictions[:, -1] == video_gts) 49 | print('NN Retrieval R@1:', recall_top1) 50 | 51 | recall_top5 = np.mean([l in p for l, p in zip(video_gts, video_predictions[:, -5:])]) 52 | print('NN Retrieval R@5:', recall_top5) 53 | 54 | recall_top10 = np.mean([l in p for 
l, p in zip(video_gts, video_predictions[:, -10:])]) 55 | print('NN Retrieval R@10:', recall_top10) 56 | 57 | video_inds = [[i for i, x in enumerate(clip_ids) if video in x] for video in full_videos] # list of length n_vids, with the corresponding clip_inds 58 | 59 | video_preds_m = np.stack([np.max(m[:, v], axis=1) for v in video_inds], 1) # (n_clips, n_vids) 60 | video_preds_m2 = np.stack([np.mean(video_preds_m[v, :], axis=0) for v in video_inds], 0) # (n_vids, n_vids) 61 | 62 | print('Retrieval single | full caption -> full video | for each caption get max prediction over a video, then average over all captions of a video.') 63 | recall(video_preds_m2, np.arange(n_vids)) 64 | 65 | corr_preds = [] 66 | for video_id in range(len(full_videos)): 67 | vid_i_m = video_preds_m[video_gts == video_id] 68 | vid_i_pred = np.argsort(vid_i_m, 1) 69 | prs = [] 70 | for i in [1, 5, 10]: 71 | top_i_preds = vid_i_pred[:, -i:] 72 | unique_ids, counts = np.unique(top_i_preds, return_counts=True) 73 | id_pred = unique_ids[np.argsort(counts)[-i:]] 74 | #print(id_pred) 75 | prs.append(video_id in id_pred) 76 | corr_preds.append(prs) 77 | 78 | t1, t5, t10 = zip(*corr_preds) 79 | print('Retrieval single | full caption -> full video | for each caption get top_k video predictions, then get sorted majority vote for final top_k predictions.') 80 | print('NN Retrieval R@1:', np.mean(t1)) 81 | print('NN Retrieval R@5:', np.mean(t5)) 82 | print('NN Retrieval R@10:', np.mean(t10)) 83 | 84 | corr_preds = [] 85 | for video_id in range(len(full_videos)): 86 | vid_i_m = m[video_gts == video_id] 87 | vid_i_pred = clip_to_video[np.argsort(vid_i_m, 1)] 88 | 89 | prs = [] 90 | for i in [1, 5, 10]: 91 | top_i_preds = vid_i_pred[:, -i:] 92 | unique_ids, counts = np.unique(top_i_preds, return_counts=True) 93 | id_pred = unique_ids[np.argsort(counts)[-i:]] 94 | prs.append(video_id in id_pred) 95 | corr_preds.append(prs) 96 | 97 | t1, t5, t10 = zip(*corr_preds) 98 | print('Retrieval single | full caption -> full video | for each caption get top_k clip predictions, then get sorted majority vote for final top_k predictions.') 99 | print('NN Retrieval R@1:', np.mean(t1)) 100 | print('NN Retrieval R@5:', np.mean(t5)) 101 | print('NN Retrieval R@10:', np.mean(t10)) 102 | 103 | #data = pickle.load(open('temp_data/YouCook2.pkl', 'rb')) 104 | #print(data.keys()) 105 | #evaluate_recall(data['text'], data['audio']+data['video'], data['video_id']) 106 | 107 | -------------------------------------------------------------------------------- /youcook_dataloader.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | from __future__ import print_function 5 | 6 | import torch as th 7 | from torch.utils.data import Dataset 8 | import pickle 9 | import torch.nn.functional as F 10 | import numpy as np 11 | import re 12 | from torch.utils.data.dataloader import default_collate 13 | 14 | class Youcook_DataLoader(Dataset): 15 | """Youcook dataset loader.""" 16 | 17 | def __init__( 18 | self, 19 | data, 20 | we, 21 | we_dim=300, 22 | max_words=30, 23 | num_frames_multiplier=5, 24 | tri_modal=False, 25 | ): 26 | """ 27 | Args: 28 | """ 29 | self.data = pickle.load(open(data, 'rb')) 30 | self.we = we 31 | self.we_dim = we_dim 32 | self.max_words = max_words 33 | self.num_frames_multiplier = num_frames_multiplier 34 | self.tri_modal = tri_modal 35 | 36 | def __len__(self): 37 | return len(self.data) 38 
| 39 | def custom_collate(self, batch): 40 | return default_collate(batch) 41 | 42 | def _zero_pad_tensor(self, tensor, size): 43 | if len(tensor) >= size: 44 | return tensor[:size] 45 | else: 46 | zero = np.zeros((size - len(tensor), self.we_dim), dtype=np.float32) 47 | return np.concatenate((tensor, zero), axis=0) 48 | 49 | def _tokenize_text(self, sentence): 50 | w = re.findall(r"[\w']+", str(sentence)) 51 | return w 52 | 53 | def _words_to_we(self, words): 54 | words = [word for word in words if word in self.we.vocab] 55 | if words: 56 | we = self._zero_pad_tensor(self.we[words], self.max_words) 57 | return th.from_numpy(we) 58 | else: 59 | return th.zeros(self.max_words, self.we_dim) 60 | 61 | def __getitem__(self, idx): 62 | # load 2d and 3d features (features are pooled over the time dimension) 63 | feat_2d = F.normalize(th.from_numpy(self.data[idx]['2d']).float(), dim=0) 64 | feat_3d = F.normalize(th.from_numpy(self.data[idx]['3d']).float(), dim=0) 65 | video = th.cat((feat_2d, feat_3d)) 66 | 67 | # load audio and zero pad/truncate if necessary 68 | audio = self.data[idx]['audio'] 69 | target_length = 1024 * self.num_frames_multiplier 70 | nframes = audio.numpy().shape[1] 71 | p = target_length - nframes 72 | if p > 0: 73 | audio = np.pad(audio, ((0,0),(0,p)), 'constant', constant_values=(0,0)) 74 | elif p < 0: 75 | audio = audio[:,0:p] 76 | audio = th.FloatTensor(audio) 77 | 78 | caption = '' 79 | if self.tri_modal: 80 | caption = self._words_to_we(self._tokenize_text(self.data[idx]['caption'])) 81 | 82 | return {'video': video, 'text': caption, 'video_id': self.data[idx]['id'], 83 | 'audio': audio, 'nframes': nframes} 84 | 85 | 86 | class Youcook_DataLoader_label(Dataset): 87 | """Youcook dataset loader.""" 88 | 89 | def __init__( 90 | self, 91 | data, 92 | we, 93 | pseudo_v, 94 | pseudo_a, 95 | we_dim=300, 96 | max_words=30, 97 | num_frames_multiplier=5, 98 | tri_modal=False, 99 | 100 | ): 101 | """ 102 | Args: 103 | """ 104 | self.data = pickle.load(open(data, 'rb')) #9000*4800 105 | self.we = we 106 | self.we_dim = we_dim 107 | self.max_words = max_words 108 | self.num_frames_multiplier = num_frames_multiplier 109 | self.tri_modal = tri_modal 110 | self.pseudo_v = pseudo_v 111 | self.pseudo_a = pseudo_a 112 | 113 | def __len__(self): 114 | return len(self.data) 115 | 116 | def custom_collate(self, batch): 117 | return default_collate(batch) 118 | 119 | def _zero_pad_tensor(self, tensor, size): 120 | if len(tensor) >= size: 121 | return tensor[:size] 122 | else: 123 | zero = np.zeros((size - len(tensor), self.we_dim), dtype=np.float32) 124 | return np.concatenate((tensor, zero), axis=0) 125 | 126 | def _tokenize_text(self, sentence): 127 | w = re.findall(r"[\w']+", str(sentence)) 128 | return w 129 | 130 | def _words_to_we(self, words): 131 | words = [word for word in words if word in self.we.vocab] 132 | if words: 133 | we = self._zero_pad_tensor(self.we[words], self.max_words) 134 | return th.from_numpy(we) 135 | else: 136 | return th.zeros(self.max_words, self.we_dim) 137 | 138 | def __getitem__(self, idx): 139 | # load 2d and 3d features (features are pooled over the time dimension) 140 | feat_2d = F.normalize(th.from_numpy(self.data[idx]['2d']).float(), dim=0) 141 | feat_3d = F.normalize(th.from_numpy(self.data[idx]['3d']).float(), dim=0) 142 | video = th.cat((feat_2d, feat_3d)) 143 | 144 | # load audio and zero pad/truncate if necessary 145 | audio = self.data[idx]['audio'] 146 | target_length = 1024 * self.num_frames_multiplier 147 | nframes = audio.numpy().shape[1] 148 
| p = target_length - nframes 149 | if p > 0: 150 | audio = np.pad(audio, ((0, 0), (0, p)), 'constant', constant_values=(0, 0)) 151 | elif p < 0: 152 | audio = audio[:, 0:p] 153 | audio = th.FloatTensor(audio) 154 | 155 | caption = '' 156 | if self.tri_modal: 157 | caption = self._words_to_we(self._tokenize_text(self.data[idx]['caption'])) 158 | 159 | return {'video': video, 'text': caption, 'video_id': self.data[idx]['id'], 160 | 'audio': audio, 'nframes': nframes, 'pseudo_v':self.pseudo_v[idx], 'pseudo_a':self.pseudo_a[idx]} 161 | 162 | class Youcook_DataLoader_knn(Dataset): 163 | """Youcook dataset loader.""" 164 | 165 | def __init__( 166 | self, 167 | data, 168 | we, 169 | knn_v, 170 | knn_a, 171 | we_dim=300, 172 | max_words=30, 173 | num_frames_multiplier=5, 174 | tri_modal=False, 175 | 176 | ): 177 | """ 178 | Args: 179 | """ 180 | self.data = pickle.load(open(data, 'rb')) #9000*4800 181 | self.we = we 182 | self.we_dim = we_dim 183 | self.max_words = max_words 184 | self.num_frames_multiplier = num_frames_multiplier 185 | self.tri_modal = tri_modal 186 | self.knn_v = knn_v 187 | self.knn_a = knn_a 188 | 189 | def __len__(self): 190 | return len(self.data) 191 | 192 | def custom_collate(self, batch): 193 | return default_collate(batch) 194 | 195 | def _zero_pad_tensor(self, tensor, size): 196 | if len(tensor) >= size: 197 | return tensor[:size] 198 | else: 199 | zero = np.zeros((size - len(tensor), self.we_dim), dtype=np.float32) 200 | return np.concatenate((tensor, zero), axis=0) 201 | 202 | def _tokenize_text(self, sentence): 203 | w = re.findall(r"[\w']+", str(sentence)) 204 | return w 205 | 206 | def _words_to_we(self, words): 207 | words = [word for word in words if word in self.we.vocab] 208 | if words: 209 | we = self._zero_pad_tensor(self.we[words], self.max_words) 210 | return th.from_numpy(we) 211 | else: 212 | return th.zeros(self.max_words, self.we_dim) 213 | 214 | def __getitem__(self, idx): 215 | video_feature = [] 216 | text_feature = [] 217 | audio_feature = [] 218 | nframes_list = [] 219 | caption_text = [] 220 | for i in self.knn_v[idx]: 221 | # load 2d and 3d features (features are pooled over the time dimension) 222 | feat_2d = F.normalize(th.from_numpy(self.data[i]['2d']).float(), dim=0) 223 | feat_3d = F.normalize(th.from_numpy(self.data[i]['3d']).float(), dim=0) 224 | video = th.cat((feat_2d, feat_3d)) 225 | video_feature.append(video.numpy()) 226 | # load audio and zero pad/truncate if necessary 227 | audio = self.data[i]['audio'] 228 | target_length = 1024 * self.num_frames_multiplier 229 | nframes = audio.numpy().shape[1] 230 | nframes_list.append(nframes) 231 | p = target_length - nframes 232 | if p > 0: 233 | audio = np.pad(audio, ((0, 0), (0, p)), 'constant', constant_values=(0, 0)) 234 | elif p < 0: 235 | audio = audio[:, 0:p] 236 | audio = th.FloatTensor(audio) 237 | audio_feature.append(audio.numpy()) 238 | caption = '' 239 | if self.tri_modal: 240 | caption = self._words_to_we(self._tokenize_text(self.data[i]['caption'])) 241 | text_feature.append(caption.numpy()) 242 | video_f = np.asarray(video_feature) 243 | text_f = np.asarray(text_feature) 244 | audio_f = np.asarray(audio_feature) 245 | nframes_l = np.asarray(nframes_list) 246 | """ 247 | print('dataload') 248 | print(video_f.shape) 249 | print(text_f.shape) 250 | print(audio_f.shape) 251 | print(nframes_l.shape) 252 | print('dataload_fin') 253 | """ 254 | #caption_text = 255 | return {'video': video_f, 'text': text_f, 'video_id': self.data[i]['id'], 256 | 'audio': audio_f, 'nframes': nframes_l} 
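
The snippet below is a minimal usage sketch, not part of the repository: it shows how the Youcook_DataLoader defined above could be wrapped in a standard PyTorch DataLoader to iterate over (video, audio, text) features. The pickle and word2vec paths are placeholders, and the word embeddings are assumed to be a gensim 3.x KeyedVectors object, since the loader indexes it by word and reads its .vocab attribute.

```python
import torch as th
from torch.utils.data import DataLoader
from gensim.models import KeyedVectors

from youcook_dataloader import Youcook_DataLoader

# Placeholder paths -- substitute the real YouCook2 feature pickle and word2vec binary.
we = KeyedVectors.load_word2vec_format('path/to/word2vec.bin', binary=True)
dataset = Youcook_DataLoader(data='path/to/youcook_features.pkl', we=we, we_dim=300,
                             max_words=30, num_frames_multiplier=5, tri_modal=True)
loader = DataLoader(dataset, batch_size=16, shuffle=False, num_workers=4,
                    collate_fn=dataset.custom_collate)

for batch in loader:
    video = batch['video']      # (B, 2D+3D feature dim), per-modality L2-normalized
    audio = batch['audio']      # (B, freq_bins, 1024 * num_frames_multiplier), zero-padded/truncated
    text = batch['text']        # (B, max_words, we_dim) word embeddings when tri_modal=True
    nframes = batch['nframes']  # original (unpadded) number of audio frames per clip
```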
-------------------------------------------------------------------------------- /youtube_dataloader.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | from __future__ import print_function 5 | 6 | import torch as th 7 | from torch.utils.data import Dataset 8 | import torch.nn.functional as F 9 | import pandas as pd 10 | import os 11 | import numpy as np 12 | import re 13 | import random 14 | import librosa 15 | from model_davenet import LoadAudio 16 | 17 | 18 | class Youtube_DataLoader(Dataset): 19 | """Youtube dataset loader.""" 20 | 21 | def __init__( 22 | self, 23 | csv, 24 | features_path, 25 | features_path_audio, 26 | caption, 27 | we, 28 | min_time=10.0, 29 | feature_framerate=1.0, 30 | feature_framerate_3D=24.0 / 16.0, 31 | we_dim=300, 32 | max_words=30, 33 | min_words=0, 34 | n_pair=1, 35 | num_audio_frames=1024, 36 | random_audio_windows=False, 37 | ): 38 | """ 39 | Args: 40 | """ 41 | self.csv = pd.read_csv(csv) 42 | self.features_path = features_path 43 | self.features_path_audio = features_path_audio if features_path_audio != "" \ 44 | else features_path 45 | self.caption = caption 46 | self.min_time = min_time 47 | self.feature_framerate = feature_framerate 48 | self.feature_framerate_3D = feature_framerate_3D 49 | self.we_dim = we_dim 50 | self.max_words = max_words 51 | self.min_words = min_words 52 | self.num_audio_frames = num_audio_frames 53 | self.we = we 54 | self.n_pair = n_pair 55 | self.fps = {'2d': feature_framerate, '3d': feature_framerate_3D} 56 | self.feature_path = {'2d': features_path} 57 | if features_path != '': 58 | self.feature_path['3d'] = features_path 59 | self.random_audio_windows = random_audio_windows 60 | 61 | def __len__(self): 62 | return len(self.csv) 63 | 64 | def _zero_pad_tensor(self, tensor, size): 65 | if len(tensor) >= size: 66 | return tensor[:size] 67 | else: 68 | zero = np.zeros((size - len(tensor), self.we_dim), dtype=np.float32) 69 | return np.concatenate((tensor, zero), axis=0) 70 | 71 | def _zero_pad_audio(self, audio, max_frames): 72 | n_frames = audio.shape[1] 73 | if n_frames >= max_frames: 74 | return audio[:, 0:max_frames], int(max_frames) 75 | else: 76 | p = max_frames - n_frames 77 | audio_padded = np.pad(audio, ((0, 0), (0, p)), 'constant', constant_values=(0, 0)) 78 | return audio_padded, n_frames 79 | 80 | def _tokenize_text(self, sentence): 81 | w = re.findall(r"[\w']+", str(sentence)) 82 | return w 83 | 84 | def _words_to_we(self, words): 85 | words = [word for word in words if word in self.we.vocab] 86 | if words: 87 | we = self._zero_pad_tensor(self.we[words], self.max_words) 88 | return th.from_numpy(we) 89 | else: 90 | return th.zeros(self.max_words, self.we_dim) 91 | 92 | def _get_audio_and_text(self, caption, n_pair_max, mel_spec): 93 | n_caption = len(caption['start']) 94 | k = n_pair_max 95 | starts = np.zeros(k) 96 | ends = np.zeros(k) 97 | text = th.zeros(k, self.max_words, self.we_dim) 98 | audio = [0 for i in range(k)] 99 | nframes = np.zeros(k) 100 | r_ind = np.random.choice(range(n_caption), k, replace=True) 101 | 102 | for i in range(k): 103 | ind = r_ind[i] 104 | audio[i], nframes[i], starts[i], ends[i], text[i] = self._get_single_audio_text(caption, ind, mel_spec) 105 | 106 | audio = th.cat([i.unsqueeze(0) for i in audio], dim=0) 107 | return audio, nframes, starts, ends, text 108 | 109 | def _get_single_audio_text(self, caption, ind, mel_spec): 
110 | start, end = ind, ind 111 | words = self._tokenize_text(caption['text'][ind]) 112 | diff = caption['end'][end] - caption['start'][start] 113 | # Extend the video clip if shorter than the minimum desired clip duration 114 | while diff < self.min_time: 115 | if start > 0 and end < len(caption['end']) - 1: 116 | next_words = self._tokenize_text(caption['text'][end + 1]) 117 | prev_words = self._tokenize_text(caption['text'][start - 1]) 118 | d1 = caption['end'][end + 1] - caption['start'][start] 119 | d2 = caption['end'][end] - caption['start'][start - 1] 120 | # Use the closest neighboring video clip 121 | if d2 <= d1: 122 | start -= 1 123 | words.extend(prev_words) 124 | else: 125 | end += 1 126 | words.extend(next_words) 127 | # If no video clips after it, use the clip before it 128 | elif start > 0: 129 | words.extend(self._tokenize_text(caption['text'][start - 1])) 130 | start -= 1 131 | # If no video clips before it, use the clip after it. 132 | elif end < len(caption['end']) - 1: 133 | words.extend(self._tokenize_text(caption['text'][end + 1])) 134 | end += 1 135 | # If there's no clips before or after 136 | else: 137 | break 138 | diff = caption['end'][end] - caption['start'][start] 139 | 140 | frames = librosa.core.time_to_frames([caption['start'][start], caption['end'][end]], sr=16000, hop_length=160, 141 | n_fft=400) 142 | padded_mel_spec, nframes = self._zero_pad_audio(mel_spec[:, frames[0]: frames[1]], self.num_audio_frames) 143 | return th.from_numpy(padded_mel_spec), nframes, caption['start'][start], caption['end'][end], self._words_to_we( 144 | words) 145 | 146 | def _get_audio_random(self, n_pair_max, mel_spec): 147 | k = n_pair_max 148 | starts = np.zeros(k) 149 | ends = np.zeros(k) 150 | audio = [0 for i in range(k)] 151 | nframes = np.zeros(k) 152 | video_duration_seconds = int( 153 | librosa.core.frames_to_time(mel_spec.shape[1], sr=16000, hop_length=160, n_fft=400)) 154 | num_audio_seconds = int(librosa.core.frames_to_time(self.num_audio_frames, sr=16000, hop_length=160, n_fft=400)) 155 | # Sample clips that end before the end of the video 156 | # If the video is shorter than the desired window, use the entire video 157 | start_seconds = np.random.choice(range(max(1, video_duration_seconds - (num_audio_seconds + 1))), k, 158 | replace=True) 159 | 160 | for i in range(k): 161 | start_frame = max(0, librosa.core.time_to_frames(start_seconds[i], sr=16000, hop_length=160, n_fft=400)) 162 | audio_window = mel_spec[:, start_frame: start_frame + self.num_audio_frames] 163 | # Pad in the case that the audio wasn't long enough 164 | padded_mel_spec, nframes_spec = self._zero_pad_audio(audio_window, self.num_audio_frames) 165 | end_second = start_seconds[i] + num_audio_seconds 166 | audio[i], nframes[i], starts[i], ends[i] = th.from_numpy(padded_mel_spec), nframes_spec, start_seconds[ 167 | i], end_second 168 | 169 | audio = th.cat([i.unsqueeze(0) for i in audio], dim=0) 170 | return audio, nframes, starts, ends 171 | 172 | def _get_video(self, vid_path, s, e, video_id): 173 | feature_path = {} 174 | video = {} 175 | output = {} 176 | for k in self.feature_path: 177 | feature_path[k] = os.path.join(self.feature_path[k], vid_path, video_id + "_{}.npz".format(k)) 178 | np_arr = np.load(feature_path[k])['features'] 179 | video[k] = th.from_numpy(np_arr).float() 180 | output[k] = th.zeros(len(s), video[k].shape[-1]) 181 | for i in range(len(s)): 182 | start = int(s[i] * self.fps[k]) 183 | end = int(e[i] * self.fps[k]) + 1 184 | slice = video[k][start:end] 185 | if len(slice) < 
1: 186 | #print("missing visual feats; video_id: {}, start: {}, end: {}".format(feature_path[k], start, end)) 187 | missing=1 188 | else: 189 | output[k][i] = F.normalize(th.max(slice, dim=0)[0], dim=0) 190 | 191 | return th.cat([output[k] for k in output], dim=1) 192 | 193 | def __getitem__(self, idx): 194 | vid_path = self.csv['path'].values[idx].replace("None/", "") 195 | video_id = vid_path.split("/")[-1] 196 | audio_path = os.path.join(self.features_path_audio, vid_path, video_id + "_spec.npz") 197 | mel_spec = np.load(audio_path)['arr_0'] 198 | if self.random_audio_windows: 199 | audio, nframes, starts, ends = self._get_audio_random(self.n_pair, mel_spec) 200 | else: 201 | audio, nframes, starts, ends, text = self._get_audio_and_text(self.caption[video_id], self.n_pair, mel_spec) 202 | video = self._get_video(vid_path, starts, ends, video_id) 203 | if self.random_audio_windows: 204 | return {'video': video, 'audio': th.HalfTensor(audio), 'nframes': th.IntTensor(nframes), 205 | 'video_id': video_id} 206 | else: 207 | return {'video': video, 'audio': th.HalfTensor(audio), 'nframes': th.IntTensor(nframes), 208 | 'video_id': video_id, 209 | 'text': text} 210 | 211 | 212 | class Youtube_DataLoader_label(Dataset): 213 | """Youtube dataset loader.""" 214 | 215 | def __init__( 216 | self, 217 | csv, 218 | pseu_label_a, 219 | pseu_label_v, 220 | features_path, 221 | features_path_audio, 222 | caption, 223 | we, 224 | min_time=10.0, 225 | feature_framerate=1.0, 226 | feature_framerate_3D=24.0 / 16.0, 227 | we_dim=300, 228 | max_words=30, 229 | min_words=0, 230 | n_pair=1, 231 | num_audio_frames=1024, 232 | random_audio_windows=False, 233 | ): 234 | """ 235 | Args: 236 | """ 237 | self.csv = pd.read_csv(csv) 238 | self.features_path = features_path 239 | self.features_path_audio = features_path_audio if features_path_audio != "" \ 240 | else features_path 241 | self.caption = caption 242 | self.min_time = min_time 243 | self.feature_framerate = feature_framerate 244 | self.feature_framerate_3D = feature_framerate_3D 245 | self.we_dim = we_dim 246 | self.max_words = max_words 247 | self.min_words = min_words 248 | self.num_audio_frames = num_audio_frames 249 | self.we = we 250 | self.n_pair = n_pair 251 | self.fps = {'2d': feature_framerate, '3d': feature_framerate_3D} 252 | self.feature_path = {'2d': features_path} 253 | if features_path != '': 254 | self.feature_path['3d'] = features_path 255 | self.random_audio_windows = random_audio_windows 256 | self.pseu_label_a = pseu_label_a 257 | self.pseu_label_v = pseu_label_v 258 | 259 | def __len__(self): 260 | return len(self.csv) 261 | 262 | def _zero_pad_tensor(self, tensor, size): 263 | if len(tensor) >= size: 264 | return tensor[:size] 265 | else: 266 | zero = np.zeros((size - len(tensor), self.we_dim), dtype=np.float32) 267 | return np.concatenate((tensor, zero), axis=0) 268 | 269 | def _zero_pad_audio(self, audio, max_frames): 270 | n_frames = audio.shape[1] 271 | if n_frames >= max_frames: 272 | return audio[:, 0:max_frames], int(max_frames) 273 | else: 274 | p = max_frames - n_frames 275 | audio_padded = np.pad(audio, ((0, 0), (0, p)), 'constant', constant_values=(0, 0)) 276 | return audio_padded, n_frames 277 | 278 | def _tokenize_text(self, sentence): 279 | w = re.findall(r"[\w']+", str(sentence)) 280 | return w 281 | 282 | def _words_to_we(self, words): 283 | words = [word for word in words if word in self.we.vocab] 284 | if words: 285 | we = self._zero_pad_tensor(self.we[words], self.max_words) 286 | return th.from_numpy(we) 287 | else: 
288 | return th.zeros(self.max_words, self.we_dim) 289 | 290 | def _get_audio_and_text(self, caption, n_pair_max, mel_spec): 291 | n_caption = len(caption['start']) 292 | k = n_pair_max 293 | starts = np.zeros(k) 294 | ends = np.zeros(k) 295 | text = th.zeros(k, self.max_words, self.we_dim) 296 | audio = [0 for i in range(k)] 297 | nframes = np.zeros(k) 298 | r_ind = np.random.choice(range(n_caption), k, replace=True) 299 | 300 | for i in range(k): 301 | ind = r_ind[i] 302 | audio[i], nframes[i], starts[i], ends[i], text[i] = self._get_single_audio_text(caption, ind, mel_spec) 303 | 304 | audio = th.cat([i.unsqueeze(0) for i in audio], dim=0) 305 | return audio, nframes, starts, ends, text 306 | 307 | def _get_single_audio_text(self, caption, ind, mel_spec): 308 | start, end = ind, ind 309 | words = self._tokenize_text(caption['text'][ind]) 310 | diff = caption['end'][end] - caption['start'][start] 311 | # Extend the video clip if shorter than the minimum desired clip duration 312 | while diff < self.min_time: 313 | if start > 0 and end < len(caption['end']) - 1: 314 | next_words = self._tokenize_text(caption['text'][end + 1]) 315 | prev_words = self._tokenize_text(caption['text'][start - 1]) 316 | d1 = caption['end'][end + 1] - caption['start'][start] 317 | d2 = caption['end'][end] - caption['start'][start - 1] 318 | # Use the closest neighboring video clip 319 | if d2 <= d1: 320 | start -= 1 321 | words.extend(prev_words) 322 | else: 323 | end += 1 324 | words.extend(next_words) 325 | # If no video clips after it, use the clip before it 326 | elif start > 0: 327 | words.extend(self._tokenize_text(caption['text'][start - 1])) 328 | start -= 1 329 | # If no video clips before it, use the clip after it. 330 | elif end < len(caption['end']) - 1: 331 | words.extend(self._tokenize_text(caption['text'][end + 1])) 332 | end += 1 333 | # If there's no clips before or after 334 | else: 335 | break 336 | diff = caption['end'][end] - caption['start'][start] 337 | 338 | frames = librosa.core.time_to_frames([caption['start'][start], caption['end'][end]], sr=16000, hop_length=160, 339 | n_fft=400) 340 | padded_mel_spec, nframes = self._zero_pad_audio(mel_spec[:, frames[0]: frames[1]], self.num_audio_frames) 341 | return th.from_numpy(padded_mel_spec), nframes, caption['start'][start], caption['end'][end], self._words_to_we( 342 | words) 343 | 344 | def _get_audio_random(self, n_pair_max, mel_spec): 345 | k = n_pair_max 346 | starts = np.zeros(k) 347 | ends = np.zeros(k) 348 | audio = [0 for i in range(k)] 349 | nframes = np.zeros(k) 350 | video_duration_seconds = int( 351 | librosa.core.frames_to_time(mel_spec.shape[1], sr=16000, hop_length=160, n_fft=400)) 352 | num_audio_seconds = int(librosa.core.frames_to_time(self.num_audio_frames, sr=16000, hop_length=160, n_fft=400)) 353 | # Sample clips that end before the end of the video 354 | # If the video is shorter than the desired window, use the entire video 355 | start_seconds = np.random.choice(range(max(1, video_duration_seconds - (num_audio_seconds + 1))), k, 356 | replace=True) 357 | 358 | for i in range(k): 359 | start_frame = max(0, librosa.core.time_to_frames(start_seconds[i], sr=16000, hop_length=160, n_fft=400)) 360 | audio_window = mel_spec[:, start_frame: start_frame + self.num_audio_frames] 361 | # Pad in the case that the audio wasn't long enough 362 | padded_mel_spec, nframes_spec = self._zero_pad_audio(audio_window, self.num_audio_frames) 363 | end_second = start_seconds[i] + num_audio_seconds 364 | audio[i], nframes[i], starts[i], 
ends[i] = th.from_numpy(padded_mel_spec), nframes_spec, start_seconds[ 365 | i], end_second 366 | 367 | audio = th.cat([i.unsqueeze(0) for i in audio], dim=0) 368 | return audio, nframes, starts, ends 369 | 370 | def _get_video(self, vid_path, s, e, video_id): 371 | feature_path = {} 372 | video = {} 373 | output = {} 374 | for k in self.feature_path: 375 | feature_path[k] = os.path.join(self.feature_path[k], vid_path, video_id + "_{}.npz".format(k)) 376 | np_arr = np.load(feature_path[k])['features'] 377 | video[k] = th.from_numpy(np_arr).float() 378 | output[k] = th.zeros(len(s), video[k].shape[-1]) 379 | for i in range(len(s)): 380 | start = int(s[i] * self.fps[k]) 381 | end = int(e[i] * self.fps[k]) + 1 382 | slice = video[k][start:end] 383 | if len(slice) < 1: 384 | #print("missing visual feats; video_id: {}, start: {}, end: {}".format(feature_path[k], start, end)) 385 | missing = 1 386 | else: 387 | output[k][i] = F.normalize(th.max(slice, dim=0)[0], dim=0) 388 | 389 | return th.cat([output[k] for k in output], dim=1) 390 | 391 | def __getitem__(self, idx): 392 | vid_path = self.csv['path'].values[idx].replace("None/", "") 393 | video_id = vid_path.split("/")[-1] 394 | audio_path = os.path.join(self.features_path_audio, vid_path, video_id + "_spec.npz") 395 | mel_spec = np.load(audio_path)['arr_0'] 396 | if self.random_audio_windows: 397 | audio, nframes, starts, ends = self._get_audio_random(self.n_pair, mel_spec) 398 | else: 399 | audio, nframes, starts, ends, text = self._get_audio_and_text(self.caption[video_id], self.n_pair, mel_spec) 400 | video = self._get_video(vid_path, starts, ends, video_id) 401 | if self.random_audio_windows: 402 | return {'video': video, 'audio': th.HalfTensor(audio), 'nframes': th.IntTensor(nframes), 403 | 'video_id': video_id} 404 | else: 405 | return {'video': video, 'audio': th.HalfTensor(audio), 'nframes': th.IntTensor(nframes), 406 | 'video_id': video_id, 407 | 'text': text, 'pseudo_v': self.pseu_label_v[idx], 'pseudo_a': self.pseu_label_a[idx]} -------------------------------------------------------------------------------- /youtube_mil_dataloader.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | from __future__ import print_function 5 | 6 | import torch as th 7 | from torch.utils.data import Dataset 8 | import torch.nn.functional as F 9 | import pandas as pd 10 | import os 11 | import numpy as np 12 | import re 13 | import random 14 | import librosa 15 | from model_davenet import LoadAudio 16 | 17 | 18 | class Youtube_DataLoader(Dataset): 19 | """Youtube dataset loader.""" 20 | 21 | def __init__( 22 | self, 23 | csv, 24 | features_path, 25 | features_path_audio, 26 | caption, 27 | we, 28 | min_time=10.0, 29 | feature_framerate=1.0, 30 | feature_framerate_3D=24.0 / 16.0, 31 | we_dim=300, 32 | max_words=30, 33 | min_words=0, 34 | n_pair=1, 35 | num_audio_frames=1024, 36 | num_candidates=1, 37 | random_audio_windows=False, 38 | ): 39 | """ 40 | Args: 41 | """ 42 | self.csv = pd.read_csv(csv) 43 | self.features_path = features_path 44 | self.features_path_audio = features_path_audio if features_path_audio != "" \ 45 | else features_path 46 | self.caption = caption 47 | self.min_time = min_time 48 | self.feature_framerate = feature_framerate 49 | self.feature_framerate_3D = feature_framerate_3D 50 | self.we_dim = we_dim 51 | self.max_words = max_words 52 | self.min_words = min_words 53 |
self.num_audio_frames = num_audio_frames 54 | self.we = we 55 | self.n_pair = n_pair 56 | self.fps = {'2d': feature_framerate, '3d': feature_framerate_3D} 57 | self.feature_path = {'2d': features_path} 58 | if features_path != '': 59 | self.feature_path['3d'] = features_path 60 | self.num_candidates = num_candidates 61 | self.random_audio_windows = random_audio_windows 62 | 63 | def __len__(self): 64 | return len(self.csv) 65 | 66 | def _zero_pad_tensor(self, tensor, size): 67 | if len(tensor) >= size: 68 | return tensor[:size] 69 | else: 70 | zero = np.zeros((size - len(tensor), self.we_dim), dtype=np.float32) 71 | return np.concatenate((tensor, zero), axis=0) 72 | 73 | def _zero_pad_audio(self, audio, max_frames): 74 | n_frames = audio.shape[1] 75 | if n_frames >= max_frames: 76 | return audio[:, 0:max_frames], int(max_frames) 77 | else: 78 | p = max_frames - n_frames 79 | audio_padded = np.pad(audio, ((0, 0), (0, p)), 'constant', constant_values=(0, 0)) 80 | return audio_padded, n_frames 81 | 82 | def _tokenize_text(self, sentence): 83 | w = re.findall(r"[\w']+", str(sentence)) 84 | return w 85 | 86 | def _words_to_we(self, words): 87 | words = [word for word in words if word in self.we.vocab] 88 | if words: 89 | we = self._zero_pad_tensor(self.we[words], self.max_words) 90 | return th.from_numpy(we) 91 | else: 92 | return th.zeros(self.max_words, self.we_dim) 93 | """ 94 | def _get_text(self, caption, n_pair_max): 95 | n_caption = len(caption['start']) 96 | k = n_pair_max 97 | starts = np.zeros(k) 98 | ends = np.zeros(k) 99 | text = th.zeros(k, self.max_words, self.we_dim) 100 | r_ind = np.random.choice(range(n_caption), k, replace=True) 101 | 102 | for i in range(k): 103 | ind = r_ind[i] 104 | text[i], starts[i], ends[i] = self._get_single_text(caption, ind) 105 | 106 | return text, starts, ends 107 | """ 108 | def _get_single_text(self, caption, ind): 109 | start, end = ind, ind 110 | words = self._tokenize_text(caption['text'][ind]) 111 | diff = caption['end'][end] - caption['start'][start] 112 | while len(words) < self.min_words or diff < self.min_time: 113 | if start > 0 and end < len(caption['end']) - 1: 114 | next_words = self._tokenize_text(caption['text'][end + 1]) 115 | prev_words = self._tokenize_text(caption['text'][start - 1]) 116 | d1 = caption['end'][end + 1] - caption['start'][start] 117 | d2 = caption['end'][end] - caption['start'][start - 1] 118 | if (self.min_time > 0 and d2 <= d1) or \ 119 | (self.min_time == 0 and len(next_words) <= len(prev_words)): 120 | start -= 1 121 | words.extend(prev_words) 122 | else: 123 | end += 1 124 | words.extend(next_words) 125 | elif start > 0: 126 | words.extend(self._tokenize_text(caption['text'][start - 1])) 127 | start -= 1 128 | elif end < len(caption['end']) - 1: 129 | words.extend(self._tokenize_text(caption['text'][end + 1])) 130 | end += 1 131 | else: 132 | break 133 | diff = caption['end'][end] - caption['start'][start] 134 | return self._words_to_we(words), \ 135 | caption['start'][start], caption['end'][end] 136 | 137 | 138 | def _get_video(self, vid_path, s, e, video_id): 139 | feature_path = {} 140 | video = {} 141 | output = {} 142 | for k in self.feature_path: 143 | feature_path[k] = os.path.join(self.feature_path[k], vid_path, video_id + "_{}.npz".format(k)) 144 | np_arr = np.load(feature_path[k])['features'] 145 | video[k] = th.from_numpy(np_arr).float() 146 | output[k] = th.zeros(video[k].shape[-1]) # single clip here, so keep a 1-D feature vector per modality 147 | 148 | start = int(s * self.fps[k]) 149 | end = int(e * self.fps[k]) + 1 150 | slice = 
video[k][start:end] 151 | if len(slice) < 1: 152 | #print("missing visual feats; video_id: {}, start: {}, end: {}".format(feature_path[k], start, end)) 153 | missing=1 154 | else: 155 | output[k] = F.normalize(th.max(slice, dim=0)[0], dim=0) 156 | 157 | return th.cat([output[k] for k in output], dim=0) # concatenate the 1-D 2D/3D vectors of the single clip 158 | 159 | def _find_nearest_candidates(self, caption, ind): 160 | start, end = ind, ind 161 | diff = caption['end'][end] - caption['start'][start] 162 | n_candidate = 1 163 | while n_candidate < self.num_candidates: 164 | if start == 0: 165 | return 0 166 | elif end == len(caption['end']) - 1: 167 | return start - (self.num_candidates - n_candidate) 168 | elif caption['end'][end] - caption['start'][start - 1] < caption['end'][end + 1] - caption['start'][start]: 169 | start -= 1 170 | else: 171 | end += 1 172 | n_candidate += 1 173 | return start 174 | 175 | def _get_text(self, cap): 176 | #cap = pd.read_csv(caption) 177 | ind = random.randint(0, len(cap['start']) - 1) 178 | if self.num_candidates == 1: 179 | #words = self.words_to_ids(cap['text'].values[ind]) 180 | words = self._words_to_we(self._tokenize_text(cap['text'][ind])) 181 | else: 182 | #words = th.zeros(self.num_candidates, self.max_words, dtype=th.long) 183 | words = th.zeros(self.num_candidates, self.max_words, self.we_dim) 184 | cap_start = self._find_nearest_candidates(cap, ind) 185 | for i in range(self.num_candidates): 186 | candidate_w = cap['text'][max(0, min(len(cap['text']) - 1, cap_start + i))] 187 | word_token = self._tokenize_text(candidate_w) 188 | words[i] = self._words_to_we(word_token)#self.words_to_ids() 189 | start, end = cap['start'][ind], cap['end'][ind] 190 | # TODO: May need to be improved for edge cases. 191 | if end - start < self.min_time: 192 | diff = self.min_time - end + start 193 | start = max(0, start - diff / 2) 194 | end = start + self.min_time 195 | return words, int(start), int(end) 196 | 197 | def __getitem__(self, idx): 198 | vid_path = self.csv['path'].values[idx].replace("None/", "") 199 | video_id = vid_path.split("/")[-1] 200 | #audio_path = os.path.join(self.features_path_audio, vid_path, video_id + "_spec.npz") 201 | #mel_spec = np.load(audio_path)['arr_0'] 202 | 203 | #video_path = os.path.join(self.video_root, video_file) 204 | text, start, end = self._get_text(self.caption[video_id]) 205 | video = self._get_video(vid_path, start, end, video_id) 206 | #video = self._get_video(video_path, start, end) 207 | return {'video': video, 'text': text} 208 | --------------------------------------------------------------------------------
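The HowTo100M-style loaders above share one construction pattern: a CSV with a `path` column pointing at per-video feature folders, the caption pickle, and a gensim-3.x-style `KeyedVectors` word-embedding model (the loaders index `we.vocab`), with each audio clip cut or zero-padded to `num_audio_frames` spectrogram frames (at 16 kHz with a 160-sample hop one frame covers 10 ms, so 1024 frames is roughly a 10-second window). As a rough usage sketch, and with every path and hyper-parameter value below being an illustrative placeholder rather than a setting taken from this repository, `Youtube_DataLoader` from youtube_dataloader.py can be wrapped in a standard PyTorch `DataLoader` like this:

```python
# Illustrative usage sketch for the HowTo100M loader in youtube_dataloader.py.
# All paths and hyper-parameters here are placeholders chosen for the example.
import pickle
from torch.utils.data import DataLoader
from gensim.models import KeyedVectors  # the loaders use we.vocab, i.e. gensim 3.x KeyedVectors
from youtube_dataloader import Youtube_DataLoader

we = KeyedVectors.load_word2vec_format('data/word2vec.bin', binary=True)  # placeholder path
caption = pickle.load(open('data/caption.pickle', 'rb'))  # placeholder; {video_id: {'start', 'end', 'text'}}

dataset = Youtube_DataLoader(
    csv='data/howto_videopaths.csv',   # placeholder; needs a 'path' column, one row per video
    features_path='parsed_videos/',    # folders holding <video_id>_2d.npz / _3d.npz / _spec.npz
    features_path_audio='',            # empty string falls back to features_path
    caption=caption,
    we=we,
    min_time=10.0,
    n_pair=32,                         # clips sampled per video
    num_audio_frames=1024,             # ~10 s of spectrogram frames at a 10 ms hop
    random_audio_windows=False,        # use ASR clip boundaries instead of random windows
)

loader = DataLoader(dataset, batch_size=8, shuffle=True, num_workers=4, drop_last=True)
batch = next(iter(loader))
# batch['video']  : (B, n_pair, video_feature_dim)         max-pooled, L2-normalised 2D+3D features
# batch['audio']  : (B, n_pair, n_mels, num_audio_frames)  half-precision spectrogram windows
# batch['text']   : (B, n_pair, max_words, we_dim)         zero-padded word embeddings
# batch['nframes']: (B, n_pair)                            valid (unpadded) audio frames per clip
```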