├── misc ├── __init__.py ├── text_embedding_models.json ├── datasets │ ├── querydsegments │ │ ├── tar_include.txt │ │ └── README.md │ ├── activity-net │ │ ├── tar_include.txt │ │ └── README.md │ ├── clotho │ │ └── tar_include.txt │ ├── queryd │ │ ├── tar_include.txt │ │ ├── README.md │ │ ├── val_list.txt │ │ └── test_list.txt │ └── audiocaps │ │ └── tar_include.txt ├── experiments-queryd.json ├── yaspi_gpu_defaults.json ├── exps-names.md ├── find_latest_checkpoints.py ├── experiments-audiocaps.json ├── experiments_teachText.json ├── generate_exps.py ├── aggregate_logs_and_stats.py ├── launch_exps_from_list.py └── gen_tar_lists.py ├── utils ├── __init__.py ├── cos_restart.py ├── gen_ablations_for_dataset.py ├── datastructures.py ├── visualizer.py ├── html.py ├── ranger.py └── radam.py ├── trainer └── __init__.py ├── model ├── __init__.py ├── mil_nce_net.py ├── text_embedding_models.json ├── loss.py └── net_vlad.py ├── logger ├── __init__.py ├── logger_config.json ├── logger.py ├── visualization.py └── log_parser.py ├── base ├── __init__.py └── base_model.py ├── launch_ablations_txt ├── audio_experts.txt ├── all_audio_experts.txt └── single_audio_experts.txt ├── configs ├── clotho │ ├── train-vggish-vggsound.json │ ├── train-vggish-vggsound-moee.json │ └── train-full-ce-only-audio.json ├── audiocaps │ ├── train-vggish-vggsound-train_list_10.json │ ├── train-vggish-vggsound-train_list_25.json │ ├── train-vggish-vggsound-train_list_50.json │ ├── train-vggish-vggsound-train_list_75.json │ ├── train-vggish-vggsound.json │ ├── train-full-ce-scene-r2p1d.json │ ├── train-full-ce-r2p1d-inst.json │ ├── train-full-ce-scene-inst.json │ ├── train-full-ce-r2p1d-inst-vggish.json │ ├── train-full-ce-r2p1d-inst-vggsound.json │ ├── train-full-ce-scene-r2p1d-inst.json │ ├── train-full-ce-r2p1d-inst-vggish-vggsound.json │ ├── train-full-ce-scene-r2p1d-inst-vggsound.json │ ├── train-vggish-vggsound-moee.json │ ├── train-full-ce-only-audio.json │ ├── train-only-vggsound.json │ ├── train-full-ce-only-r2p1d.json │ ├── train-full-ce-only-scene.json │ ├── train-full-ce-scene-r2p1d-inst-vggish-vggsound.json │ └── train-full-ce-only-inst.json ├── queryd │ └── train-full-ce-only-audio.json ├── querydsegments │ └── train-full-ce-only-audio.json ├── activity-net │ └── train-full-ce-audio-only.json ├── data_loader_activity-net.json ├── data_loader_queryd.json ├── data_loader_querydsegments.json ├── data_loader_clotho.json └── data_loader_audiocaps.json ├── .gitignore ├── requirements └── requirements.txt ├── data_loader ├── QuerYDSegments_dataset.py ├── QuerYD_dataset.py ├── CLOTHO_dataset.py ├── ActivityNet_dataset.py ├── AudioCaps_dataset.py └── data_loaders.py ├── exp_to_seed_time.json ├── eval.py └── dataset_stats └── get_videoid_perclass.py /misc/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .util import * 2 | -------------------------------------------------------------------------------- /trainer/__init__.py: -------------------------------------------------------------------------------- 1 | from .trainer import * 2 | -------------------------------------------------------------------------------- /model/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import CENet 2 | from .mil_nce_net import MNNet 
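Both classes exported here (`CENet` and `MNNet`, defined in `mil_nce_net.py` and used by the `*-mnnet` runs listed in `misc/experiments-queryd.json`) are referenced by name from the experiment configs under `configs/`. As a point of reference, this is the `"arch"` fragment that the audio-only configs in this repository use to select `CENet`; the `"type"` field presumably names the class to instantiate from this package, and the full configs additionally set `"inherit_from"` and the expert `"modalities"`:

```
"arch": {
    "type": "CENet",
    "args": {
        "use_ce": "",
        "mimic_ce_dims": 1
    }
}
```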
-------------------------------------------------------------------------------- /logger/__init__.py: -------------------------------------------------------------------------------- 1 | from .logger import * 2 | from .visualization import * 3 | from .log_parser import * -------------------------------------------------------------------------------- /base/__init__.py: -------------------------------------------------------------------------------- 1 | # from .base_data_loader import * 2 | from .base_model import * 3 | from .base_trainer import * 4 | -------------------------------------------------------------------------------- /launch_ablations_txt/audio_experts.txt: -------------------------------------------------------------------------------- 1 | --config configs/audiocaps/train-ce-audio-speech.json --group_seed 0|1|2 --device 0 2 | -------------------------------------------------------------------------------- /launch_ablations_txt/all_audio_experts.txt: -------------------------------------------------------------------------------- 1 | --config configs/audiocaps/train-full-ce-only-audio_sophia_pann_soundnet.json --group_seed 0|1|2 --device 0 2 | -------------------------------------------------------------------------------- /configs/clotho/train-vggish-vggsound.json: -------------------------------------------------------------------------------- 1 | { 2 | "inherit_from": "configs/data_loader_clotho.json", 3 | "experts": { 4 | "modalities": [ 5 | "audio", 6 | "vggsound" 7 | ] 8 | } 9 | } -------------------------------------------------------------------------------- /launch_ablations_txt/single_audio_experts.txt: -------------------------------------------------------------------------------- 1 | --config configs/clotho/train-full-ce-only-audio.json --group_seed 0|1|2 --device 0 --resume data/saved/models/audiocaps-train-full-ce-only-audio/2021-03-23_09-06-26/trained_model.pth 2 | -------------------------------------------------------------------------------- /configs/audiocaps/train-vggish-vggsound-train_list_10.json: -------------------------------------------------------------------------------- 1 | {"inherit_from": "configs/data_loader_audiocaps.json", "experts": {"modalities": ["audio", "vggsound"]}, "testing_file": "final_filtered_test_list.txt", "training_file": "train_list_10.txt"} -------------------------------------------------------------------------------- /configs/audiocaps/train-vggish-vggsound-train_list_25.json: -------------------------------------------------------------------------------- 1 | {"inherit_from": "configs/data_loader_audiocaps.json", "experts": {"modalities": ["audio", "vggsound"]}, "testing_file": "final_filtered_test_list.txt", "training_file": "train_list_25.txt"} -------------------------------------------------------------------------------- /configs/audiocaps/train-vggish-vggsound-train_list_50.json: -------------------------------------------------------------------------------- 1 | {"inherit_from": "configs/data_loader_audiocaps.json", "experts": {"modalities": ["audio", "vggsound"]}, "testing_file": "final_filtered_test_list.txt", "training_file": "train_list_50.txt"} -------------------------------------------------------------------------------- /configs/audiocaps/train-vggish-vggsound-train_list_75.json: -------------------------------------------------------------------------------- 1 | {"inherit_from": "configs/data_loader_audiocaps.json", "experts": {"modalities": ["audio", "vggsound"]}, "testing_file": 
"final_filtered_test_list.txt", "training_file": "train_list_75.txt"} -------------------------------------------------------------------------------- /misc/text_embedding_models.json: -------------------------------------------------------------------------------- 1 | { 2 | "w2v": { 3 | "weights_path": "data/text_models/GoogleNews-vectors-negative300.bin.gz", 4 | "dim": 300, 5 | "mirror": "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz" 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /configs/audiocaps/train-vggish-vggsound.json: -------------------------------------------------------------------------------- 1 | { 2 | "inherit_from": "configs/data_loader_audiocaps.json", 3 | "experts": { 4 | "modalities": [ 5 | "audio", 6 | "vggsound" 7 | ] 8 | }, 9 | "testing_file": "final_filtered_test_list.txt", 10 | "training_file": "train_list.txt" 11 | } -------------------------------------------------------------------------------- /configs/audiocaps/train-full-ce-scene-r2p1d.json: -------------------------------------------------------------------------------- 1 | { 2 | "inherit_from": "configs/data_loader_audiocaps.json", 3 | "experts": { 4 | "modalities": [ 5 | "scene.densenet161.0", 6 | "r2p1d.r2p1d-ig65m.0" 7 | ] 8 | }, 9 | "testing_file": "final_filtered_test_list.txt", 10 | "training_file": "train_list.txt" 11 | } -------------------------------------------------------------------------------- /configs/clotho/train-vggish-vggsound-moee.json: -------------------------------------------------------------------------------- 1 | { 2 | "inherit_from": "configs/data_loader_clotho.json", 3 | "experts": { 4 | "modalities": [ 5 | "audio", 6 | "vggsound" 7 | ] 8 | }, 9 | "arch": { 10 | "type": "CENet", 11 | "args": { 12 | "use_ce": "" 13 | } 14 | } 15 | } -------------------------------------------------------------------------------- /configs/clotho/train-full-ce-only-audio.json: -------------------------------------------------------------------------------- 1 | { 2 | "inherit_from": "configs/data_loader_clotho.json", 3 | "experts": { 4 | "modalities": [ 5 | "audio" 6 | ] 7 | }, 8 | "arch": { 9 | "type": "CENet", 10 | "args": { 11 | "use_ce": "", 12 | "mimic_ce_dims": 1 13 | } 14 | } 15 | } -------------------------------------------------------------------------------- /configs/queryd/train-full-ce-only-audio.json: -------------------------------------------------------------------------------- 1 | { 2 | "inherit_from": "configs/data_loader_queryd.json", 3 | "experts": { 4 | "modalities": [ 5 | "audio" 6 | ] 7 | }, 8 | "arch": { 9 | "type": "CENet", 10 | "args": { 11 | "use_ce": "", 12 | "mimic_ce_dims": 1 13 | } 14 | } 15 | } -------------------------------------------------------------------------------- /configs/audiocaps/train-full-ce-r2p1d-inst.json: -------------------------------------------------------------------------------- 1 | { 2 | "inherit_from": "configs/data_loader_audiocaps.json", 3 | "experts": { 4 | "modalities": [ 5 | "r2p1d.r2p1d-ig65m.0", 6 | "imagenet.resnext101_32x48d.0" 7 | ] 8 | }, 9 | "testing_file": "final_filtered_test_list.txt", 10 | "training_file": "train_list.txt" 11 | } -------------------------------------------------------------------------------- /configs/audiocaps/train-full-ce-scene-inst.json: -------------------------------------------------------------------------------- 1 | { 2 | "inherit_from": "configs/data_loader_audiocaps.json", 3 | "experts": { 4 | "modalities": [ 5 | 
"scene.densenet161.0", 6 | "imagenet.resnext101_32x48d.0" 7 | ] 8 | }, 9 | "testing_file": "final_filtered_test_list.txt", 10 | "training_file": "train_list.txt" 11 | } -------------------------------------------------------------------------------- /configs/querydsegments/train-full-ce-only-audio.json: -------------------------------------------------------------------------------- 1 | { 2 | "inherit_from": "configs/data_loader_querydsegments.json", 3 | "experts": { 4 | "modalities": [ 5 | "audio" 6 | ] 7 | }, 8 | "arch": { 9 | "type": "CENet", 10 | "args": { 11 | "use_ce": "", 12 | "mimic_ce_dims": 1 13 | } 14 | } 15 | } -------------------------------------------------------------------------------- /configs/audiocaps/train-full-ce-r2p1d-inst-vggish.json: -------------------------------------------------------------------------------- 1 | { 2 | "inherit_from": "configs/data_loader_audiocaps.json", 3 | "experts": { 4 | "modalities": [ 5 | "audio", 6 | "r2p1d.r2p1d-ig65m.0", 7 | "imagenet.resnext101_32x48d.0" 8 | ] 9 | }, 10 | "testing_file": "final_filtered_test_list.txt", 11 | "training_file": "train_list.txt" 12 | } -------------------------------------------------------------------------------- /configs/audiocaps/train-full-ce-r2p1d-inst-vggsound.json: -------------------------------------------------------------------------------- 1 | { 2 | "inherit_from": "configs/data_loader_audiocaps.json", 3 | "experts": { 4 | "modalities": [ 5 | "vggsound", 6 | "r2p1d.r2p1d-ig65m.0", 7 | "imagenet.resnext101_32x48d.0" 8 | ] 9 | }, 10 | "testing_file": "final_filtered_test_list.txt", 11 | "training_file": "train_list.txt" 12 | } -------------------------------------------------------------------------------- /configs/audiocaps/train-full-ce-scene-r2p1d-inst.json: -------------------------------------------------------------------------------- 1 | { 2 | "inherit_from": "configs/data_loader_audiocaps.json", 3 | "experts": { 4 | "modalities": [ 5 | "scene.densenet161.0", 6 | "r2p1d.r2p1d-ig65m.0", 7 | "imagenet.resnext101_32x48d.0" 8 | ] 9 | }, 10 | "testing_file": "final_filtered_test_list.txt", 11 | "training_file": "train_list.txt" 12 | } -------------------------------------------------------------------------------- /configs/audiocaps/train-full-ce-r2p1d-inst-vggish-vggsound.json: -------------------------------------------------------------------------------- 1 | { 2 | "inherit_from": "configs/data_loader_audiocaps.json", 3 | "experts": { 4 | "modalities": [ 5 | "audio", 6 | "r2p1d.r2p1d-ig65m.0", 7 | "imagenet.resnext101_32x48d.0", 8 | "vggsound" 9 | ] 10 | }, 11 | "testing_file": "final_filtered_test_list.txt", 12 | "training_file": "train_list.txt" 13 | } -------------------------------------------------------------------------------- /misc/datasets/querydsegments/tar_include.txt: -------------------------------------------------------------------------------- 1 | data/QuerYDSegments/structured-symlinks/aggregated_audio/vggish-raw.hickle 2 | data/QuerYDSegments/structured-symlinks/structured-symlinks/split_raw_captions_filtered.pkl 3 | data/QuerYDSegments/structured-symlinks/test_list.txt 4 | data/QuerYDSegments/structured-symlinks/text_embeddings/w2v.pkl 5 | data/QuerYDSegments/structured-symlinks/train_list.txt 6 | data/QuerYDSegments/structured-symlinks/val_list.txt 7 | -------------------------------------------------------------------------------- /configs/audiocaps/train-full-ce-scene-r2p1d-inst-vggsound.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "inherit_from": "configs/data_loader_audiocaps.json", 3 | "experts": { 4 | "modalities": [ 5 | "scene.densenet161.0", 6 | "vggsound", 7 | "r2p1d.r2p1d-ig65m.0", 8 | "imagenet.resnext101_32x48d.0" 9 | ] 10 | }, 11 | "testing_file": "final_filtered_test_list.txt", 12 | "training_file": "train_list.txt" 13 | } -------------------------------------------------------------------------------- /configs/audiocaps/train-vggish-vggsound-moee.json: -------------------------------------------------------------------------------- 1 | { 2 | "inherit_from": "configs/data_loader_audiocaps.json", 3 | "experts": { 4 | "modalities": [ 5 | "audio", 6 | "vggsound" 7 | ] 8 | }, 9 | "arch": { 10 | "type": "CENet", 11 | "args": { 12 | "use_ce": "" 13 | } 14 | }, 15 | "testing_file": "final_filtered_test_list.txt", 16 | "training_file": "train_list.txt" 17 | } -------------------------------------------------------------------------------- /configs/audiocaps/train-full-ce-only-audio.json: -------------------------------------------------------------------------------- 1 | { 2 | "inherit_from": "configs/data_loader_audiocaps.json", 3 | "experts": { 4 | "modalities": [ 5 | "audio" 6 | ] 7 | }, 8 | "arch": { 9 | "type": "CENet", 10 | "args": { 11 | "use_ce": "", 12 | "mimic_ce_dims": 1 13 | } 14 | }, 15 | "testing_file": "final_filtered_test_list.txt", 16 | "training_file": "train_list.txt" 17 | } -------------------------------------------------------------------------------- /configs/audiocaps/train-only-vggsound.json: -------------------------------------------------------------------------------- 1 | { 2 | "inherit_from": "configs/data_loader_audiocaps.json", 3 | "experts": { 4 | "modalities": [ 5 | "vggsound" 6 | ] 7 | }, 8 | "arch": { 9 | "type": "CENet", 10 | "args": { 11 | "use_ce": "", 12 | "mimic_ce_dims": 1 13 | } 14 | }, 15 | "testing_file": "final_filtered_test_list.txt", 16 | "training_file": "train_list.txt" 17 | } -------------------------------------------------------------------------------- /configs/audiocaps/train-full-ce-only-r2p1d.json: -------------------------------------------------------------------------------- 1 | { 2 | "inherit_from": "configs/data_loader_audiocaps.json", 3 | "experts": { 4 | "modalities": [ 5 | "r2p1d.r2p1d-ig65m.0" 6 | ] 7 | }, 8 | "arch": { 9 | "type": "CENet", 10 | "args": { 11 | "use_ce": "", 12 | "mimic_ce_dims": 1 13 | } 14 | }, 15 | "testing_file": "final_filtered_test_list.txt", 16 | "training_file": "train_list.txt" 17 | } -------------------------------------------------------------------------------- /configs/audiocaps/train-full-ce-only-scene.json: -------------------------------------------------------------------------------- 1 | { 2 | "inherit_from": "configs/data_loader_audiocaps.json", 3 | "experts": { 4 | "modalities": [ 5 | "scene.densenet161.0" 6 | ] 7 | }, 8 | "arch": { 9 | "type": "CENet", 10 | "args": { 11 | "use_ce": "", 12 | "mimic_ce_dims": 1 13 | } 14 | }, 15 | "testing_file": "final_filtered_test_list.txt", 16 | "training_file": "train_list.txt" 17 | } -------------------------------------------------------------------------------- /configs/audiocaps/train-full-ce-scene-r2p1d-inst-vggish-vggsound.json: -------------------------------------------------------------------------------- 1 | { 2 | "inherit_from": "configs/data_loader_audiocaps.json", 3 | "experts": { 4 | "modalities": [ 5 | "scene.densenet161.0", 6 | "audio", 7 | "r2p1d.r2p1d-ig65m.0", 8 | 
"imagenet.resnext101_32x48d.0", 9 | "vggsound" 10 | ] 11 | }, 12 | "testing_file": "final_filtered_test_list.txt", 13 | "training_file": "train_list.txt" 14 | } -------------------------------------------------------------------------------- /configs/audiocaps/train-full-ce-only-inst.json: -------------------------------------------------------------------------------- 1 | { 2 | "inherit_from": "configs/data_loader_audiocaps.json", 3 | "experts": { 4 | "modalities": [ 5 | "imagenet.resnext101_32x48d.0" 6 | ] 7 | }, 8 | "arch": { 9 | "type": "CENet", 10 | "args": { 11 | "use_ce": "", 12 | "mimic_ce_dims": 1 13 | } 14 | }, 15 | "testing_file": "final_filtered_test_list.txt", 16 | "training_file": "train_list.txt" 17 | } -------------------------------------------------------------------------------- /misc/datasets/activity-net/tar_include.txt: -------------------------------------------------------------------------------- 1 | data/activity-net/structured-symlinks/aggregated_audio/vggish-audio-raw.pickle 2 | data/activity-net/structured-symlinks/aggregated_facefeats_25fps_256px_stride1/face-avg.pickle 3 | data/activity-net/structured-symlinks/aggregated_ocr_feats/ocr-w2v.pkl 4 | data/activity-net/structured-symlinks/aggregated_text_feats/w2v.pkl 5 | data/activity-net/structured-symlinks/raw-captions-train-val_1.pkl 6 | data/activity-net/structured-symlinks/train_list.txt 7 | data/activity-net/structured-symlinks/val_1_list.txt 8 | -------------------------------------------------------------------------------- /misc/datasets/clotho/tar_include.txt: -------------------------------------------------------------------------------- 1 | data/CLOTHO/structured-symlinks/aggregated_audio/vggish-raw.hickle 2 | data/CLOTHO/structured-symlinks/aggregated_pann/pann-raw.hickle 3 | data/CLOTHO/structured-symlinks/aggregated_syncnet/syncnet-raw.hickle 4 | data/CLOTHO/structured-symlinks/aggregated_vggsound/vggsound-raw.hickle 5 | data/CLOTHO/structured-symlinks/structured-symlinks/raw-captions.pkl 6 | data/CLOTHO/structured-symlinks/test_list.txt 7 | data/CLOTHO/structured-symlinks/text_embeddings/w2v.pkl 8 | data/CLOTHO/structured-symlinks/train_list.txt 9 | data/CLOTHO/structured-symlinks/val_list.txt 10 | -------------------------------------------------------------------------------- /base/base_model.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import numpy as np 3 | from abc import abstractmethod 4 | 5 | 6 | class BaseModel(nn.Module): 7 | """ 8 | Base class for all models 9 | """ 10 | @abstractmethod 11 | def forward(self, *inputs): 12 | """ 13 | Forward pass logic 14 | 15 | :return: Model output 16 | """ 17 | raise NotImplementedError 18 | 19 | def __str__(self): 20 | """ 21 | Model prints with number of trainable parameters 22 | """ 23 | model_parameters = filter(lambda p: p.requires_grad, self.parameters()) 24 | params = sum([np.prod(p.size()) for p in model_parameters]) 25 | return super().__str__() + f"\nTrainable parameters: {params}" 26 | -------------------------------------------------------------------------------- /configs/activity-net/train-full-ce-audio-only.json: -------------------------------------------------------------------------------- 1 | { 2 | "inherit_from": "configs/data_loader_activity-net.json", 3 | "experts": { 4 | "modalities": [ 5 | "audio" 6 | ] 7 | }, 8 | "arch": { 9 | "type": "CENet", 10 | "args": { 11 | "use_ce": "", 12 | "mimic_ce_dims": 1 13 | } 14 | }, 15 | "trainer": { 16 | "epochs": 20 17 | }, 
18 | "optimizer": { 19 | "type": "Ranger", 20 | "args": { 21 | "lr": 0.01, 22 | "weight_decay": 1E-03 23 | } 24 | }, 25 | "loss": { 26 | "type": "MaxMarginRankingLoss", 27 | "args": { 28 | "margin": 0.2, 29 | "fix_norm": true 30 | } 31 | } 32 | } -------------------------------------------------------------------------------- /misc/experiments-queryd.json: -------------------------------------------------------------------------------- 1 | { 2 | 3 | "queryd-train-full-ce": ["c50d3616", "2021-05-28_15-24-39"], 4 | "queryd-train-full-ce-only-scene": ["766c0b81", "2021-05-28_15-39-29"], 5 | "queryd-train-full-ce-only-scene-audio": ["e576753f", "2021-05-28_16-20-15"], 6 | "queryd-train-full-ce-only-scene-audio-inst": ["e40f68bf", "2021-05-28_16-21-50"], 7 | "queryd-train-full-ce-only-scene-audio-inst-r2p1d": ["54ca249c", "2021-05-28_16-24-04"], 8 | "queryd-train-full-mnnet": ["7e1a7420", "2021-05-28_16-38-33"], 9 | "queryd-train-full-moee": ["ab5db961", "2021-05-28_15-32-38"], 10 | 11 | "querydsegments-train-full-ce": ["0d1b703c", "2021-05-28_15-26-57"], 12 | "querydsegments-train-full-mnnet": ["1404fc28", "2021-05-28_16-38-32"], 13 | "querydsegments-train-full-moee": ["7b3b466e", "2021-05-28_15-32-44"] 14 | 15 | 16 | } -------------------------------------------------------------------------------- /misc/yaspi_gpu_defaults.json: -------------------------------------------------------------------------------- 1 | { 2 | "recipe": "gpu-proc", 3 | "partition": "gpu", 4 | "time_limit": "96:00:00", 5 | "gen_script_dir": "data/slurm-gen-scripts", 6 | "mem": "100G", 7 | "gpus_per_task": 1, 8 | "cpus_per_task": 5, 9 | "throttle_array": 20, 10 | "ssh_forward": "", 11 | "log_dir": "data/slurm-logs", 12 | "use_custom_ray_tmp_dir": false, 13 | "refresh_logs": false, 14 | "exclude": "gnodef1,gnodee7,gnodef2,gnodee1,gnodee2,gnodee3,gnodee4,gnodee5,gnodee6,gnodee8,gnodeb1,gnodeb2,gnodeb3,gnodeb4,gnodeb5,gnodec1,gnodec2,gnodec3,gnodec4,gnodec5,gnodej1", 15 | "constraint_str": "", 16 | "prep": "", 17 | "env_setup": "export PYTHONPATH=\"${BASE}\":$PYTHONPATH; export PATH=\"${HOME}\"/local/anaconda3/condabin/:$PATH; source ~/local/anaconda3/etc/profile.d/conda.sh; conda activate pt37" 18 | } 19 | -------------------------------------------------------------------------------- /misc/datasets/queryd/tar_include.txt: -------------------------------------------------------------------------------- 1 | data/QuerYD/structured-symlinks/aggregated_audio/vggish-raw.hickle 2 | data/QuerYD/structured-symlinks/aggregated_imagenet_25fps_256px_stride1_offset0/resnext101_32x48d-avg.pickle 3 | data/QuerYD/structured-symlinks/aggregated_r2p1d_30fps_256px_stride32_offset0_inner_stride1/r2p1d-ig65m-avg.pickle 4 | data/QuerYD/structured-symlinks/aggregated_s3dg_10fps_256px_stride16_offset0_inner_stride1/s3dg-avg-logits.pickle 5 | data/QuerYD/structured-symlinks/aggregated_scene_25fps_256px_stride1_offset0/densenet161-avg.pickle 6 | data/QuerYD/structured-symlinks/raw_captions_combined_filtered.pkl 7 | data/QuerYD/structured-symlinks/text_embeddings/howto100m_mil_nce.pkl 8 | data/QuerYD/structured-symlinks/text_embeddings/w2v.pkl 9 | data/QuerYD/structured-symlinks/test_list.txt 10 | data/QuerYD/structured-symlinks/train_list.txt 11 | data/QuerYD/structured-symlinks/val_list.txt 12 | -------------------------------------------------------------------------------- /misc/datasets/audiocaps/tar_include.txt: -------------------------------------------------------------------------------- 1 | 
data/AudioCaps/structured-symlinks/aggregated_audio/vggish-raw.hickle 2 | data/AudioCaps/structured-symlinks/aggregated_imagenet_25fps_256px_stride1_offset0/resnext101_32x48d-avg.pickle 3 | data/AudioCaps/structured-symlinks/aggregated_pann/pann-raw.hickle 4 | data/AudioCaps/structured-symlinks/aggregated_r2p1d_30fps_256px_stride32_offset0_inner_stride1/r2p1d-ig65m-avg.pickle 5 | data/AudioCaps/structured-symlinks/aggregated_scene_25fps_256px_stride1_offset0/densenet161-avg.pickle 6 | data/AudioCaps/structured-symlinks/aggregated_syncnet/syncnet-raw.hickle 7 | data/AudioCaps/structured-symlinks/aggregated_vggsound/vggsound-raw.hickle 8 | data/AudioCaps/structured-symlinks/filtered_val_list.txt 9 | data/AudioCaps/structured-symlinks/final_filtered_test_list.txt 10 | data/AudioCaps/structured-symlinks/structured-symlinks/raw-captions.pkl 11 | data/AudioCaps/structured-symlinks/text_embeddings/w2v.pkl 12 | data/AudioCaps/structured-symlinks/train_list.txt 13 | -------------------------------------------------------------------------------- /logger/logger_config.json: -------------------------------------------------------------------------------- 1 | 2 | { 3 | "version": 1, 4 | "disable_existing_loggers": false, 5 | "formatters": { 6 | "simple": {"format": "%(message)s"}, 7 | "datetime": {"format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s"} 8 | }, 9 | "handlers": { 10 | "console": { 11 | "class": "logging.StreamHandler", 12 | "level": "DEBUG", 13 | "formatter": "simple", 14 | "stream": "ext://sys.stdout" 15 | }, 16 | "info_file_handler": { 17 | "class": "logging.handlers.RotatingFileHandler", 18 | "level": "INFO", 19 | "formatter": "datetime", 20 | "filename": "info.log", 21 | "maxBytes": 10485760, 22 | "backupCount": 20, "encoding": "utf8" 23 | } 24 | }, 25 | "root": { 26 | "level": "INFO", 27 | "handlers": [ 28 | "console", 29 | "info_file_handler" 30 | ] 31 | } 32 | } -------------------------------------------------------------------------------- /logger/logger.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import logging.config 4 | from pathlib import Path 5 | from utils import read_json 6 | 7 | 8 | def setup_logging(save_dir, log_config='logger/logger_config.json', 9 | default_level=logging.INFO): 10 | """Setup logging configuration.""" 11 | print(os.getcwd()) 12 | log_config = Path(log_config) 13 | print(f"log config: {log_config} exists: {log_config.exists()}") 14 | if log_config.is_file(): 15 | config = read_json(log_config) 16 | # modify logging paths based on run config 17 | for _, handler in config['handlers'].items(): 18 | if 'filename' in handler: 19 | handler['filename'] = str(save_dir / handler['filename']) 20 | 21 | logging.config.dictConfig(config) 22 | else: 23 | print(f"Warning: logging configuration file is not found in {log_config}.") 24 | logging.basicConfig(level=default_level) 25 | return config["handlers"]["info_file_handler"]["filename"] 26 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # symlinked data 2 | data 3 | 4 | # ignore predictions 5 | pred 6 | 7 | # local dev files 8 | scratch/ 9 | 10 | # nuisance files 11 | *.DS_Store 12 | .nfs* 13 | __pycache__ 14 | 15 | # generated files 16 | misc/slurm/scripts/slurm-job.sh 17 | .vscode/tags 18 | 19 | # exclude files that are not for release 20 | hp* 21 | hc* 22 | 23 | # exclude unsupported datasets 24 
| configs/mit 25 | configs/mmit 26 | data_loader/MIT* 27 | data_loader/MMIT* 28 | configs/templates 29 | misc/ablations-template.md 30 | misc/ablations.md 31 | misc/README-ablations-template.md 32 | misc/README-audiocaps-ablations-template.md 33 | misc/README-model-study.md 34 | misc/README-queryd-ablations-template.md 35 | misc/README-template.md 36 | 37 | slurm 38 | 39 | .vscode 40 | 41 | # exclude long video tar lists to avoid an overly heavy git repo 42 | misc/cvpr2020_challenge/datasets/activity-net/challenge-release-1/video_tar_include.txt 43 | misc/cvpr2020_challenge/datasets/MSVD/challenge-release-1/video_tar_include.txt 44 | misc/cvpr2020_challenge/datasets/DiDeMo/challenge-release-1/video_tar_include.txt 45 | misc/cvpr2020_challenge/datasets/MSRVTT/challenge-release-1/video_tar_include.txt 46 | misc/cvpr2020_challenge/datasets/YouCook2/challenge-release-1/video_tar_include.txt 47 | -------------------------------------------------------------------------------- /requirements/requirements.txt: -------------------------------------------------------------------------------- 1 | backcall==0.2.0 2 | beartype==0.7.0 3 | certifi==2021.5.30 4 | chardet==4.0.0 5 | colored==1.4.2 6 | common-cmplr-lib-rt==2021.2.0 7 | common-cmplr-lic-rt==2021.2.0 8 | cycler==0.10.0 9 | decorator==5.0.9 10 | dill==0.3.3 11 | dominate==2.6.0 12 | dpcpp-cpp-rt==2021.2.0 13 | h5py==2.10.0 14 | hickle==4.0.4 15 | humanize==3.7.1 16 | idna==2.10 17 | intel-openmp==2021.2.0 18 | ipdb==0.13.9 19 | ipython==7.24.1 20 | ipython-genutils==0.2.0 21 | jedi==0.18.0 22 | joblib==1.0.1 23 | kiwisolver==1.3.1 24 | matplotlib==3.4.2 25 | matplotlib-inline==0.1.2 26 | mergedeep==1.3.4 27 | mkl==2021.2.0 28 | mkl-fft==1.3.0 29 | mkl-random==1.2.2 30 | mkl-service==2.4.0 31 | mock==4.0.3 32 | msgpack==1.0.2 33 | msgpack-numpy==0.4.7.1 34 | numpy==1.20.3 35 | opencl-rt==2021.2.0 36 | ordered-set==4.0.2 37 | pandas==1.0.3 38 | parso==0.8.2 39 | pexpect==4.8.0 40 | pickleshare==0.7.5 41 | Pillow==8.2.0 42 | prompt-toolkit==3.0.18 43 | protobuf==3.17.3 44 | psutil==5.8.0 45 | ptyprocess==0.7.0 46 | Pygments==2.9.0 47 | PyLaTeX==1.4.1 48 | pyparsing==2.4.7 49 | python-dateutil==2.8.1 50 | pytorch-swats==0.1.0 51 | pytz==2021.1 52 | PyYAML==5.4.1 53 | requests==2.25.1 54 | scikit-learn==0.24.2 55 | scipy==1.6.3 56 | seaborn==0.11.1 57 | six==1.16.0 58 | tailf==0.3.2 59 | tbb==2021.2.0 60 | tensorboardX==2.2 61 | threadpoolctl==2.1.0 62 | toml==0.10.2 63 | torch==1.7.1 64 | torchvision==0.8.2 65 | tqdm==4.61.1 66 | traitlets==5.0.5 67 | typeguard==2.12.1 68 | typing-extensions==3.10.0.0 69 | urllib3==1.26.5 70 | watchlogs==0.1.3.21 71 | wcwidth==0.2.5 72 | wget==3.2 73 | yaspi==0.0.5 74 | zsvision==0.7.8 75 | -------------------------------------------------------------------------------- /misc/datasets/queryd/README.md: -------------------------------------------------------------------------------- 1 | ## Pretrained Experts 2 | 3 | This folder contains a collection of features, extracted from the QuerYD [2] dataset as part of the paper: 4 | *QuerYD: A video dataset with high-quality textual and audio narrations*. 
5 | 6 | ### Training splits 7 | 8 | The training splits are given in the files linked below: 9 | 10 | * [train_list.txt](train_list.txt) (1796 videos) 11 | * [val_list.txt](val_list.txt) (384 videos) 12 | * [test_list.txt](test_list.txt) (386 videos) 13 | 14 | 15 | **Tar contents** 16 | 17 | The compressed tar file (402MB) can be downloaded from: 18 | 19 | ``` 20 | http://www.robots.ox.ac.uk/~vgg/research/collaborative-experts/data/features-v2/QuerYD-experts.tar.gz 21 | sha1sum: 0207ea85eeb52a4f50b06a31af28484afe4d9e86 22 | ``` 23 | A list of the contents of the tar file is given in [tar_include.txt](tar_include.txt). 24 | 25 | 26 | ### References: 27 | 28 | [1] If you use these features, please consider citing: 29 | ``` 30 | @inproceedings{Liu2019a, 31 | author = {Liu, Y. and Albanie, S. and Nagrani, A. and Zisserman, A.}, 32 | booktitle = {British Machine Vision Conference}, 33 | title = {Use What You Have: Video retrieval using representations from collaborative experts}, 34 | date = {2019}, 35 | } 36 | ``` 37 | 38 | [2] Please also consider citing the original QuerYD dataset, which was described in: 39 | 40 | ``` 41 | @misc{oncescu2020queryd, 42 | title={QuerYD: A video dataset with high-quality textual and audio narrations}, 43 | author={Andreea-Maria Oncescu and João F. Henriques and Yang Liu and Andrew Zisserman and Samuel Albanie}, 44 | year={2020}, 45 | } 46 | ``` -------------------------------------------------------------------------------- /misc/datasets/querydsegments/README.md: -------------------------------------------------------------------------------- 1 | ## Pretrained Experts 2 | 3 | This folder contains a collection of features, extracted from the QuerYD [2] dataset as part of the paper: 4 | *QuerYD: A video dataset with high-quality textual and audio narrations*. 5 | 6 | ### Training splits 7 | 8 | The training splits are given in the files linked below: 9 | 10 | * [train_list.txt](train_list.txt) (9113 videos) 11 | * [val_list.txt](val_list.txt) (1952 videos) 12 | * [test_list.txt](test_list.txt) (1954 videos) 13 | 14 | 15 | **Tar contents** 16 | 17 | The compressed tar file (244MB) can be downloaded from: 18 | 19 | ``` 20 | https://www.robots.ox.ac.uk/~vgg/research/collaborative-experts/data/features-v2/QuerYDSegments-experts.tar.gz 21 | sha1sum: f2be088890294f92355ccfe109f824d814cf2cd5 22 | ``` 23 | A list of the contents of the tar file is given in [tar_include.txt](tar_include.txt). 24 | 25 | 26 | ### References: 27 | 28 | [1] If you use these features, please consider citing: 29 | ``` 30 | @inproceedings{Liu2019a, 31 | author = {Liu, Y. and Albanie, S. and Nagrani, A. and Zisserman, A.}, 32 | booktitle = {British Machine Vision Conference}, 33 | title = {Use What You Have: Video retrieval using representations from collaborative experts}, 34 | date = {2019}, 35 | } 36 | ``` 37 | 38 | [2] Please also consider citing the original QuerYD dataset, which was described in: 39 | 40 | ``` 41 | @misc{oncescu2020queryd, 42 | title={QuerYD: A video dataset with high-quality textual and audio narrations}, 43 | author={Andreea-Maria Oncescu and João F.
Henriques and Yang Liu and Andrew Zisserman and Samuel Albanie}, 44 | year={2020}, 45 | } 46 | ``` -------------------------------------------------------------------------------- /misc/exps-names.md: -------------------------------------------------------------------------------- 1 | ## Additional instructions for running the commands provided in the main README 2 | 3 | ### Downloading required features and splits: 4 | ``` 5 | python3 misc/sync_experts.py --dataset AudioCaps 6 | python3 misc/sync_experts.py --dataset CLOTHO 7 | python3 misc/sync_experts.py --dataset activity-net 8 | python3 misc/sync_experts.py --dataset QuerYDSegments 9 | ``` 10 | 11 | ### Finding the corresponding .json file names for evaluation of pre-trained models 12 | 13 | #### AudioCaps: 14 | |Experiment type | Model name| 15 | |---|---| 16 | |CE VGGish only | audiocaps-train-full-ce-only-audio| 17 | |CE VGGSound only | audiocaps-train-only-vggsound| 18 | |CE VGGish + VGGSound | audiocaps-train-vggish-vggsound| 19 | |MoEE VGGish + VGGSound | audiocaps-train-vggish-vggsound-moee| 20 | |CE Scene | audiocaps-train-full-ce-only-scene| 21 | |CE R2P1D | audiocaps-train-full-ce-only-r2p1d| 22 | |CE Inst | audiocaps-train-full-ce-only-inst| 23 | |CE Scene + R2P1D | audiocaps-train-full-ce-scene-r2p1d| 24 | |CE Scene + Inst | audiocaps-train-full-ce-scene-inst| 25 | |CE R2P1D + Inst | audiocaps-train-full-ce-r2p1d-inst| 26 | |CE R2P1D + Inst + VGGish | audiocaps-train-full-ce-r2p1d-inst-vggish| 27 | |CE R2P1D + Inst + VGGSound | audiocaps-train-full-ce-r2p1d-inst-vggsound| 28 | |CE R2P1D + Inst + VGGish + VGGSound | audiocaps-train-full-ce-r2p1d-inst-vggish-vggsound| 29 | 30 | #### CLOTHO: 31 | |Experiment type | Model name| 32 | |---|---| 33 | |CE VGGish only | clotho-train-full-ce-only-audio| 34 | |CE VGGish + VGGSound | clotho-train-vggish-vggsound| 35 | |MoEE VGGish + VGGSound | clotho-train-vggish-vggsound-moee| 36 | 37 | #### Activity-net: 38 | |Experiment type | Model name| 39 | |---|---| 40 | |CE VGGish only | activity-net-train-full-ce-audio-only| 41 | 42 | #### QuerYDSegments: 43 | |Experiment type | Model name| 44 | |---|---| 45 | |CE VGGish only | querydsegments-train-full-ce-only-audio| 46 | -------------------------------------------------------------------------------- /misc/find_latest_checkpoints.py: -------------------------------------------------------------------------------- 1 | """Simple aggregation script for experiments 2 | 3 | ipy misc/find_latest_checkpoints.py -- --dataset audiocaps 4 | """ 5 | import argparse 6 | from pathlib import Path 7 | from datetime import datetime 8 | 9 | 10 | def formatted_summary(dataset, exp_root, fname): 11 | try: 12 | summaries = list(Path(exp_root).glob(f"**/*{fname}")) 13 | summaries = [x for x in summaries if dataset in str(x)] 14 | except FileNotFoundError: 15 | fname = "summary-seed-1_seed-2_seed-3.json" 16 | summaries = list(Path(exp_root).glob(f"**/*{fname}")) 17 | summaries = [x for x in summaries if dataset in str(x)] 18 | print(f"Found {len(summaries)}") 19 | latest = {} 20 | time_format = "%Y-%m-%d_%H-%M-%S" 21 | for summary in summaries: 22 | rel_path = summary.relative_to(exp_root) 23 | key, group, timestamp = rel_path.parts[0], rel_path.parts[1], rel_path.parts[3] 24 | val = {"timestamp": timestamp, "group": group} 25 | if key in latest: 26 | prev_ts = datetime.strptime(latest[key]["timestamp"], time_format) 27 | curr_ts = datetime.strptime(timestamp, time_format) 28 | if curr_ts > prev_ts: 29 | latest[key] = val 30 |
else: 31 | latest[key] = val 32 | for key, val in sorted(latest.items()): 33 | ts, group = val["timestamp"], val["group"] 34 | print(f'"{key}": ["{group}", "{ts}"],') 35 | 36 | 37 | def main(): 38 | parser = argparse.ArgumentParser() 39 | parser.add_argument("--dataset", default="audiocaps") 40 | parser.add_argument("--exp_root", default="data/saved/log") 41 | parser.add_argument("--fname", default="summary-seed-0_seed-1_seed-2.json") 42 | args = parser.parse_args() 43 | 44 | formatted_summary( 45 | fname=args.fname, 46 | dataset=args.dataset, 47 | exp_root=args.exp_root, 48 | ) 49 | 50 | 51 | if __name__ == "__main__": 52 | main() 53 | -------------------------------------------------------------------------------- /misc/experiments-audiocaps.json: -------------------------------------------------------------------------------- 1 | { 2 | "audiocaps-train-full-ce-only-audio": ["c0b5bc86", "2021-06-10_15-34-48"], 3 | "audiocaps-train-full-ce-only-inst": ["5ee05383", "2021-06-10_15-32-29"], 4 | "audiocaps-train-full-ce-only-r2p1d": ["88d3ab9e", "2021-06-10_15-30-03"], 5 | "audiocaps-train-full-ce-only-scene": ["74d71d8b", "2021-06-10_15-27-11"], 6 | "audiocaps-train-full-ce-r2p1d-inst": ["cf11d710", "2021-06-10_15-23-04"], 7 | "audiocaps-train-full-ce-r2p1d-inst-vggish": ["74991f95", "2021-06-10_15-06-31"], 8 | "audiocaps-train-full-ce-r2p1d-inst-vggish-vggsound": ["b51f941a", "2021-06-10_14-56-45"], 9 | "audiocaps-train-full-ce-r2p1d-inst-vggsound": ["1b623fdc", "2021-06-10_14-49-00"], 10 | "audiocaps-train-full-ce-scene-inst": ["55c40cc6", "2021-06-10_15-18-50"], 11 | "audiocaps-train-full-ce-scene-r2p1d": ["b2b14107", "2021-06-10_15-13-04"], 12 | "audiocaps-train-only-vggsound": ["afab0e0c", "2021-06-16_01-21-37"], 13 | "audiocaps-train-vggish-vggsound": ["7e2eda12", "2021-06-09_17-06-26"], 14 | "audiocaps-train-vggish-vggsound-moee": ["f66525f8", "2021-06-09_16-44-00"], 15 | "audiocaps-train-vggish-vggsound-train_list_10": ["68747f8c", "2021-06-10_11-02-21"], 16 | "audiocaps-train-vggish-vggsound-train_list_25": ["0151ad7f", "2021-06-10_11-14-25"], 17 | "audiocaps-train-vggish-vggsound-train_list_50": ["4aeeaa0d", "2021-06-10_11-27-36"], 18 | "audiocaps-train-vggish-vggsound-train_list_75": ["3a8d0584", "2021-06-10_11-45-26"], 19 | 20 | "clotho-train-full-ce-only-audio": ["4f58ef05", "2021-06-10_15-38-28"], 21 | "clotho-train-vggish-vggsound": ["dec0c820", "2021-06-10_14-45-51"], 22 | "clotho-train-vggish-vggsound-moee": ["fafa3e91", "2021-06-10_14-44-51"], 23 | "clotho-train-vggish-vggsound-finetuned": ["74560a6c", "2021-06-10_16-38-40"], 24 | "clotho-train-vggish-vggsound-moee-finetuned": ["5395fa47", "2021-06-10_16-36-13"], 25 | 26 | 27 | "querydsegments-train-full-ce-only-audio": ["70111434", "2021-06-10_14-33-03"], 28 | "activity-net-train-full-ce-audio-only": ["e8639db7", "2021-06-11_12-23-42"] 29 | } -------------------------------------------------------------------------------- /misc/experiments_teachText.json: -------------------------------------------------------------------------------- 1 | { 2 | 3 | "msrvtt-train-gpt2-xl-finetuned-mte-denoising-adam-less80": ["c58ecf3b", "2020-11-21_14-00-26"], 4 | "msrvtt-train-gpt2-xl-finetuned-denoising-adam": ["a61447a9", "2020-11-11_05-31-29"], 5 | "msrvtt-train-gpt2-xl-finetuned-mte-denoising-adam": ["2cc98676", "2020-11-11_06-21-03"], 6 | "msrvtt-train-full-ce": ["6becbb74", "2020-06-28_18-31-21"], 7 | "msrvtt-train-ce-intra-mte": ["4d4508a2", "2020-11-06_17-27-00"], 8 | "msrvtt-train-gpt2-xl-finetuned-adam": ["244af891", 
"2020-10-01_12-22-00"], 9 | "msrvtt-train-gpt2-xl-finetuned-mte-adam": ["6427fd41", "2020-09-30_20-34-12"], 10 | 11 | "msvd-train-full-ce": ["2ae80bea", "2020-11-11_13-16-14"], 12 | "msvd-train-gpt2-xl-finetuned-adam": ["db396303", "2020-10-01_13-17-33"], 13 | "msvd-train-gpt2-xl-finetuned-mte-adam": ["0af2a1ed", "2020-09-30_21-30-15"], 14 | "msvd-train-ce-intra-mte": ["a3026a07", "2020-11-13_00-19-59"], 15 | "msvd-train-gpt2-xl-finetuned-denoising-adam": ["71686a77", "2020-11-11_12-19-27"], 16 | "msvd-train-gpt2-xl-finetuned-mte-denoising-adam": ["66dc5dff", "2020-11-11_12-57-29"], 17 | 18 | "didemo-train-full-ce": ["4ea49b50", "2020-06-28_20-04-46"], 19 | "didemo-train-gpt2-xl-finetuned-adam": ["616cf11b", "2020-10-01_13-31-57"], 20 | "didemo-train-gpt2-xl-finetuned-mte-adam": ["f004e587", "2020-09-30_20-19-13"], 21 | "didemo-train-ce-intra-mte": ["1a5a249f", "2020-11-06_19-12-39"], 22 | 23 | 24 | "lsmdc-train-full-ce": ["7af368b1", "2020-06-28_20-40-54"], 25 | "lsmdc-train-gpt2-xl-finetuned-mte-adam": ["38e65732", "2020-09-30_20-52-52"], 26 | "lsmdc-train-gpt2-xl-finetuned-adam": ["9e2c8afd", "2020-10-01_13-48-49"], 27 | "lsmdc-train-ce-intra-mte": ["1a5555af", "2020-11-06_19-32-23"], 28 | 29 | 30 | "activity-net-train-full-ce": ["9601c704", "2020-07-31_00-23-01"], 31 | "activity-net-train-gpt2-xl-finetuned-adam": ["a791f27d", "2020-10-01_13-42-29"], 32 | "activity-net-train-gpt2-xl-finetuned-mte-adam": ["87d04a50", "2020-10-01_08-48-36"], 33 | "activity-net-train-ce-intra-mte": ["620ad6b4", "2020-11-06_19-12-39"] 34 | } 35 | -------------------------------------------------------------------------------- /configs/data_loader_activity-net.json: -------------------------------------------------------------------------------- 1 | { 2 | "inherit_from": "configs/base_config.json", 3 | "eval_mode": "test_run", 4 | "experts": { 5 | "face_dim": 512, 6 | "text_feat": "w2v", 7 | "modalities": [ 8 | "imagenet.resnext101_32x48d.0", 9 | "imagenet.senet154.0", 10 | "scene.densenet161.0", 11 | "r2p1d.r2p1d-ig65m.0", 12 | "i3d.i3d.0", 13 | "face", 14 | "ocr", 15 | "audio", 16 | "speech" 17 | ] 18 | }, 19 | "arch": { 20 | "args": { 21 | "test_caption_mode": "indep", 22 | "use_ce": "pairwise", 23 | "use_mish": 1, 24 | "use_bn_reason": 1, 25 | "num_g_layers": 3, 26 | "num_h_layers": 0, 27 | "include_self": 1, 28 | "l2renorm": false, 29 | "randomise_feats": "", 30 | "vlad_clusters": { 31 | "text": 20, 32 | "audio": 16 33 | }, 34 | "ghost_clusters": { 35 | "text": 1 36 | } 37 | } 38 | }, 39 | "data_loader": { 40 | "args":{ 41 | "dataset_name": "ActivityNet", 42 | "root_feat_folder": "structured-symlinks", 43 | "data_dir": "data/activity-net", 44 | "split_name": "val1", 45 | "batch_size": 128, 46 | "fuse_captions": true, 47 | "num_test_captions": 1, 48 | "max_tokens": { 49 | "text": 20, 50 | "audio": 29 51 | } 52 | } 53 | }, 54 | "trainer": { 55 | "epochs": 40 56 | }, 57 | "optimizer": { 58 | "type": "Ranger", 59 | "args":{ 60 | "lr": 0.1, 61 | "weight_decay": 1e-3 62 | } 63 | }, 64 | "loss": { 65 | "type": "MaxMarginRankingLoss", 66 | "args": { 67 | "margin": 0.060496613740311816, 68 | "fix_norm": true 69 | } 70 | }, 71 | "eval_settings": { 72 | "data_loader": { 73 | "args":{ 74 | "split_name": "val1" 75 | } 76 | }, 77 | "tester": { 78 | "save_dir": "data/saved/", 79 | "verbosity": 2 80 | }, 81 | "disable_gpu": true 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /utils/cos_restart.py: 
-------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import numpy as np 4 | from torch.optim.optimizer import Optimizer, required 5 | from torch.optim.lr_scheduler import _LRScheduler 6 | 7 | class CosineAnnealingWithRestartsLR(_LRScheduler): 8 | 9 | r"""Set the learning rate of each parameter group using a cosine annealing 10 | schedule, where :math:`\eta_{max}` is set to the initial lr and 11 | :math:`T_{cur}` is the number of epochs since the last restart in SGDR: 12 | .. math:: 13 | \eta_t = \eta_{min} + \frac{1}{2}(\eta_{max} - \eta_{min})(1 + 14 | \cos(\frac{T_{cur}}{T_{max}}\pi)) 15 | When last_epoch=-1, sets initial lr as lr. 16 | It has been proposed in 17 | `SGDR: Stochastic Gradient Descent with Warm Restarts`_. This implements 18 | the cosine annealing part of SGDR, the restarts and number of iterations multiplier. 19 | Args: 20 | optimizer (Optimizer): Wrapped optimizer. 21 | T_max (int): Maximum number of iterations. 22 | T_mult (float): Multiply T_max by this number after each restart. Default: 1. 23 | eta_min (float): Minimum learning rate. Default: 0. 24 | last_epoch (int): The index of last epoch. Default: -1. 25 | .. _SGDR\: Stochastic Gradient Descent with Warm Restarts: 26 | https://arxiv.org/abs/1608.03983 27 | 28 | src: https://github.com/lkhphuc/pytorch-3d-point-cloud-generation/blob/ 29 | master/custom_scheduler.py 30 | """ 31 | 32 | def __init__(self, optimizer, T_max, eta_min=0, last_epoch=-1, T_mult=1): 33 | self.T_max = T_max 34 | self.T_mult = T_mult 35 | self.restart_every = T_max 36 | self.eta_min = eta_min 37 | self.restarts = 0 38 | self.restarted_at = 0 39 | super().__init__(optimizer, last_epoch) 40 | 41 | def restart(self): 42 | self.restart_every *= self.T_mult 43 | self.restarted_at = self.last_epoch 44 | 45 | def cosine(self, base_lr): 46 | return self.eta_min + (base_lr - self.eta_min) * \ 47 | (1 + math.cos(math.pi * self.step_n / self.restart_every)) / 2 48 | 49 | @property 50 | def step_n(self): 51 | return self.last_epoch - self.restarted_at 52 | 53 | def get_lr(self): 54 | if self.step_n >= self.restart_every: 55 | self.restart() 56 | return [self.cosine(base_lr) for base_lr in self.base_lrs] -------------------------------------------------------------------------------- /misc/generate_exps.py: -------------------------------------------------------------------------------- 1 | """A utility for generating experiment config files. 
2 | """ 3 | import json 4 | import copy 5 | import argparse 6 | import itertools 7 | from pathlib import Path 8 | from datetime import datetime 9 | from collections import OrderedDict 10 | 11 | 12 | def generate_configs(base_config, grid): 13 | job_queue = [] 14 | timestamp = datetime.now().strftime(r"%Y-%m-%d_%H-%M-%S") 15 | hparam_vals = [x for x in grid.values()] 16 | grid_vals = list(itertools.product(*hparam_vals)) 17 | hparams = list(grid.keys()) 18 | 19 | for cfg_vals in grid_vals: 20 | custom_tokens = [f"{hparam}@{val}" for hparam, val in zip(hparams, cfg_vals)] 21 | custom_args = "+".join(custom_tokens) 22 | job = f"--config {base_config} --custom_args {custom_args}" 23 | job_queue.append(job) 24 | 25 | job_queue_path = f"data/job-queues/latest.txt" 26 | Path(job_queue_path).parent.mkdir(exist_ok=True, parents=True) 27 | with open(str(job_queue_path), "w") as f: 28 | f.write("\n".join(job_queue)) 29 | print(f"Wrote {len(job_queue)} jobs to queue at {job_queue_path}") 30 | job_queue_path = f"data/job-queues/{Path(base_config).stem}-{timestamp}.txt" 31 | with open(str(job_queue_path), "w") as f: 32 | f.write("\n".join(job_queue)) 33 | print(f"Wrote backup {len(job_queue)} jobs to queue at {job_queue_path}") 34 | 35 | 36 | def parse_grid(key_val_strs): 37 | print(f"parsing grid str: {key_val_strs}") 38 | key_val_pairs = key_val_strs.split("+") 39 | parsed = OrderedDict() 40 | for pair in key_val_pairs: 41 | key, val_str = pair.split("@") 42 | vals = [] 43 | opts = [x for x in val_str.split(":")] 44 | for token in opts: 45 | if "," in token: 46 | val = [x for x in token.split(",") if x] 47 | else: 48 | val = token 49 | vals.append(val) 50 | parsed[key] = vals 51 | return parsed 52 | 53 | 54 | def main(): 55 | parser = argparse.ArgumentParser() 56 | parser.add_argument('--grid', default="") 57 | parser.add_argument('--config', default="configs/msrvtt/only-i3d.json") 58 | args = parser.parse_args() 59 | 60 | grid = parse_grid(args.grid) 61 | generate_configs( 62 | grid=grid, 63 | base_config=args.config, 64 | ) 65 | 66 | 67 | if __name__ == "__main__": 68 | main() 69 | -------------------------------------------------------------------------------- /model/mil_nce_net.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Tuple 2 | 3 | import torch 4 | from typeguard import typechecked 5 | 6 | from base import BaseModel 7 | 8 | 9 | class MNNet(BaseModel): 10 | 11 | @typechecked 12 | def __init__( 13 | self, 14 | text_dim: int, 15 | expert_dims: Dict[str, Tuple[int, int]], 16 | **_unused, 17 | ): 18 | self.text_dim = text_dim 19 | self.expert_dims = expert_dims 20 | self.modalities = list(expert_dims.keys()) 21 | super().__init__() 22 | self.dummy_param = torch.nn.Parameter(torch.ones(1) * 1E-5) 23 | 24 | @typechecked 25 | def forward( 26 | self, 27 | text: torch.Tensor, 28 | ind: Dict[str, torch.Tensor], 29 | experts: Dict[str, torch.Tensor], 30 | **_unused, 31 | ): 32 | self.sanity_checks(text=text, experts=experts, ind=ind) 33 | vid_embedding = next(iter(experts.values())) 34 | vid_embedding = self.dummy_param + vid_embedding 35 | text = text.view(text.shape[0] * text.shape[1], text.shape[-1]) 36 | # text = text / torch.norm(text, p=2, dim=1).reshape(-1, 1) 37 | # vid_embedding = vid_embedding / torch.norm(vid_embedding, p=2, 38 | # dim=1).reshape(-1, 1) 39 | sims = torch.matmul(text, vid_embedding.t()) 40 | return { 41 | "modalities": self.modalities, 42 | "cross_view_conf_matrix": sims, 43 | "text_embds": {self.modalities[0]: 
text}, 44 | "vid_embds": {self.modalities[0]: vid_embedding}, 45 | } 46 | 47 | @typechecked 48 | def sanity_checks( 49 | self, 50 | text: torch.Tensor, 51 | ind: Dict[str, torch.Tensor], 52 | experts: Dict[str, torch.Tensor], 53 | ): 54 | msg = f"Text dim {text.shape[-1]} did not match expected {self.text_dim}" 55 | assert text.shape[-1] == self.text_dim, msg 56 | assert len(experts) == 1, "Expected single modality experts" 57 | assert len(text.shape) == 4, "Expected four axes for text input" 58 | assert text.shape[2] == 1, "Expected singleton for text input on dim 2" 59 | for expert in self.expert_dims: 60 | msg = f"Expected all features to be present for {expert}" 61 | assert ind[expert].sum() == len(ind[expert]), msg 62 | feats = experts[expert] 63 | expected = self.expert_dims[expert] 64 | msg = f"Feature shape {feats.shape[1]} did not match expected {expected}" 65 | assert feats.shape[1] == expected[-1], msg 66 | -------------------------------------------------------------------------------- /misc/aggregate_logs_and_stats.py: -------------------------------------------------------------------------------- 1 | """Aggregate logs across multiple seeded runs and summarise their statistics. 2 | 3 | ipy misc/aggregate_logs_and_stats.py -- --group_id 3b737e0d 4 | """ 5 | import argparse 6 | import logging 7 | from pathlib import Path 8 | from collections import OrderedDict 9 | from utils.util import read_json 10 | from glob import glob 11 | from logger.log_parser import log_summary 12 | 13 | 14 | def summarise(group_id, log_dir="data/saved/log", model_dir="data/saved/models"): 15 | seeded_runs = sorted(list(Path(log_dir).glob(f"**/{group_id}/seed-*"))) 16 | print(f"Found a total of {len(seeded_runs)} seed runs in {group_id}") 17 | msg = f"Found no seeded runs for group_id: {group_id} in {log_dir}" 18 | assert len(seeded_runs) > 0, msg 19 | 20 | info_logs = OrderedDict() 21 | for seeded_run in seeded_runs: 22 | info_log_matches = list(Path(seeded_run).glob("**/info.log")) 23 | msg = f"expected to find a single info.log file, found {len(info_log_matches)}" 24 | assert len(info_log_matches) == 1, msg 25 | info_logs[seeded_run.stem] = info_log_matches[0] 26 | 27 | summary_log = [] 28 | for seeded_run, info_log_path in info_logs.items(): 29 | with open(info_log_path, "r") as f: 30 | log = f.read().splitlines() 31 | summary_log.extend(log) 32 | first_info_log = list(info_logs.values())[0] 33 | summary_log_name = f"summary-{'_'.join(list(info_logs.keys()))}.json" 34 | summary_log_path = first_info_log.parent / summary_log_name 35 | with open(summary_log_path, "w") as f: 36 | f.write("\n".join(summary_log)) 37 | print(f"Wrote concatenated logs to {summary_log_path}") 38 | 39 | # retrieve the config from the first run 40 | rel_path = first_info_log.relative_to(log_dir).parent 41 | config_path = Path(model_dir) / rel_path / "config.json" 42 | assert config_path.exists(), f"Could not find config at {config_path}" 43 | config = read_json(config_path) 44 | 45 | logger = logging.getLogger("summary") 46 | 47 | # some care is required with logging to avoid sending all experiment logs 48 | # to the same file. 
We avoid this by essentially resetting the logging utility 49 | 50 | # Remove all handlers associated with the root logger object 51 | for handler in logging.root.handlers[:]: 52 | logging.root.removeHandler(handler) 53 | logging.basicConfig(filename=summary_log_path, level=logging.INFO) 54 | if not logger.handlers: 55 | logger.addHandler(logging.StreamHandler()) 56 | 57 | log_summary( 58 | logger=logger, 59 | log_path=summary_log_path, 60 | eval_mode=config["eval_mode"], 61 | fixed_num_epochs=config["trainer"]["epochs"], 62 | ) 63 | 64 | 65 | def main(): 66 | parser = argparse.ArgumentParser() 67 | parser.add_argument("--group_id", default="ed53d01d") 68 | args = parser.parse_args() 69 | summarise(group_id=args.group_id) 70 | 71 | 72 | if __name__ == '__main__': 73 | main() 74 | -------------------------------------------------------------------------------- /misc/datasets/activity-net/README.md: -------------------------------------------------------------------------------- 1 | ## Pretrained Experts 2 | 3 | This folder contains a collection of features, extracted from the ActivityNet [2] and ActivityNet-captions [3] datasets as part of the paper: 4 | *Use what you have: Video retrieval using representations from collaborative experts*. 5 | 6 | ### Training splits 7 | 8 | The training splits were taken from [3] and are given in the files linked below: 9 | 10 | * [train_list.txt](train_list.txt) (10009 videos) 11 | * [val_1_list.txt](val_1_list.txt) (4917 videos) 12 | * [val_2_list.txt](val_2_list.txt) (4885 videos) 13 | 14 | In our work, we use the `train` split for training and the `val_1` split for evaluation (the `val_1` split forms a superset of the `val_2` split, with differing captions). 15 | 16 | 17 | **Tar contents** 18 | 19 | The compressed tar file (3.7 GiB) can be downloaded from: 20 | 21 | ``` 22 | http://www.robots.ox.ac.uk/~vgg/research/collaborative-experts/data/features-v2/activity-net-experts.tar.gz 23 | sha1sum: 2901046fa6a3d6f6393ee0047818e960fcfabd69 24 | ``` 25 | 26 | A list of the contents of the tar file is given in [tar_include.txt](tar_include.txt). 27 | 28 | [**Deprecated**] *The features made available with the previous code release are also available as a compressed tar file (3.8 GiB). These should be considered deprecated, since they are incompatible with the current codebase, but are still available and can be downloaded from:* 29 | 30 | ``` 31 | http://www.robots.ox.ac.uk/~vgg/research/collaborative-experts/data-deprecated/features/activity-net-experts.tar.gz 32 | sha1sum: b16685576c97cdec2783fb89ea30ca7d17abb021 33 | ``` 34 | 35 | 36 | ### References: 37 | 38 | [1] If you use these features, please consider citing: 39 | ``` 40 | @inproceedings{Liu2019a, 41 | author = {Liu, Y. and Albanie, S. and Nagrani, A.
and Zisserman, A.}, 42 | booktitle = {British Machine Vision Conference}, 43 | title = {Use What You Have: Video retrieval using representations from collaborative experts}, 44 | date = {2019}, 45 | } 46 | ``` 47 | 48 | [2] Please also consider citing the original ActivityNet dataset, which was described in: 49 | 50 | ``` 51 | @inproceedings{caba2015activitynet, 52 | title={Activitynet: A large-scale video benchmark for human activity understanding}, 53 | author={Caba Heilbron, Fabian and Escorcia, Victor and Ghanem, Bernard and Carlos Niebles, Juan}, 54 | booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition}, 55 | pages={961--970}, 56 | year={2015} 57 | } 58 | ``` 59 | 60 | [3] In addition, please consider citing the ActivityNet-captions dataset, which provides the text descriptions, and which was described in: 61 | 62 | ``` 63 | @inproceedings{krishna2017dense, 64 | title={Dense-captioning events in videos}, 65 | author={Krishna, Ranjay and Hata, Kenji and Ren, Frederic and Fei-Fei, Li and Carlos Niebles, Juan}, 66 | booktitle={Proceedings of the IEEE international conference on computer vision}, 67 | pages={706--715}, 68 | year={2017} 69 | } 70 | ``` -------------------------------------------------------------------------------- /misc/launch_exps_from_list.py: -------------------------------------------------------------------------------- 1 | """Launch a collection of experiments on SLURM from a text file. 2 | 3 | EXP_LIST=audio-retrieval-exps.txt 4 | ipy misc/launch_exps_from_list.py -- --exp_list "slurm/${EXP_LIST}" --yaspify 5 | 6 | """ 7 | import os 8 | import sys 9 | import json 10 | import argparse 11 | from pathlib import Path 12 | 13 | from yaspi.yaspi import Yaspi 14 | from utils.util import parse_grid, filter_cmd_args 15 | from misc.aggregate_logs_and_stats import summarise 16 | 17 | def main(): 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument("--exp_list", default="data/job-queues/latest.txt") 20 | parser.add_argument("--yaspify", action="store_true", help="launch via slurm") 21 | parser.add_argument("--slurm", action="store_true") 22 | parser.add_argument("--limit", type=int, default=0) 23 | parser.add_argument('--mini_train', action="store_true") 24 | parser.add_argument("--use_cnodes", action="store_true") 25 | parser.add_argument('--train_single_epoch', action="store_true") 26 | parser.add_argument("--yaspi_defaults_path", type=Path, 27 | default="misc/yaspi_gpu_defaults.json") 28 | parser.add_argument("--evaluation", type=str, default='train', choices=['train', 'test']) 29 | args = parser.parse_args() 30 | 31 | # construct list of experiments from text file 32 | with open(args.exp_list, "r") as f: 33 | custom_args = f.read().splitlines() 34 | # remove blank lines 35 | custom_args = [x for x in custom_args if x] 36 | 37 | if args.limit: 38 | custom_args = custom_args[:args.limit] 39 | 40 | parsed = {} 41 | for line in custom_args: 42 | parsed.update(parse_grid(line, args.evaluation)) 43 | 44 | # flatten all parsed experiments 45 | custom_args = [x for group in parsed.values() for x in group] 46 | 47 | cmd_args = sys.argv[1:] 48 | remove = ["--yaspify", "--exp_list", "--use_cnodes", "--evaluation"] 49 | cmd_args = filter_cmd_args(cmd_args, remove=remove) 50 | base_cmd = f"python {args.evaluation}.py {' '.join(cmd_args)}" 51 | 52 | if args.yaspify: 53 | with open(args.yaspi_defaults_path, "r") as f: 54 | yaspi_defaults = json.load(f) 55 | if args.use_cnodes: 56 | yaspi_defaults.update({"partition": "compute", 
"gpus_per_task": 0}) 57 | job_name = f"{Path(args.exp_list).stem}-{len(custom_args)}-exps" 58 | job_queue = [f'"{x}"' for x in custom_args] 59 | job_queue = " ".join(job_queue) 60 | job = Yaspi( 61 | cmd=base_cmd, 62 | job_queue=job_queue, 63 | job_name=job_name, 64 | job_array_size=len(custom_args), 65 | **yaspi_defaults, 66 | ) 67 | job.submit(watch=True, conserve_resources=5) 68 | else: 69 | for custom_args_ in custom_args: 70 | base_cmd = f"{base_cmd} {custom_args_}" 71 | print(f"Running cmd: {base_cmd}") 72 | os.system(base_cmd) 73 | if args.evaluation =='train': 74 | for group_id in parsed: 75 | summarise(group_id=group_id) 76 | 77 | 78 | if __name__ == "__main__": 79 | main() 80 | -------------------------------------------------------------------------------- /logger/visualization.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | from utils import Timer 3 | 4 | 5 | class TensorboardWriter(): 6 | def __init__(self, log_dir, logger, enabled): 7 | self.writer = None 8 | self.selected_module = "" 9 | 10 | if enabled: 11 | log_dir = str(log_dir) 12 | 13 | # Retrieve vizualization writer 14 | succeeded = False 15 | for module in ["torch.utils.tensorboard", "tensorboardX"]: 16 | try: 17 | self.writer = importlib.import_module(module).SummaryWriter(log_dir) 18 | succeeded = True 19 | break 20 | except ImportError: 21 | succeeded = False 22 | self.selected_module = module 23 | 24 | if not succeeded: 25 | message = ("Warning: visualization (Tensorboard) is configured to use, " 26 | "but currently not installed on this machine. Please install" 27 | " either TensorboardX with 'pip install tensorboardx', " 28 | " upgrade PyTorch to version >= 1.1 for using " 29 | "'torch.utils.tensorboard' or turn off the option in " 30 | "the 'config.json' file.") 31 | logger.warning(message) 32 | 33 | self.step = 0 34 | self.mode = '' 35 | 36 | self.tb_writer_ftns = { 37 | 'add_scalar', 'add_scalars', 'add_image', 'add_images', 'add_audio', 38 | 'add_text', 'add_histogram', 'add_pr_curve', 'add_embedding' 39 | } 40 | self.tag_mode_exceptions = {'add_histogram', 'add_embedding'} 41 | 42 | self.timer = Timer() 43 | 44 | def set_step(self, step, mode='train'): 45 | self.mode = mode 46 | self.step = step 47 | if step == 0: 48 | self.timer.reset() 49 | else: 50 | duration = self.timer.check() 51 | self.add_scalar('steps_per_sec', 1 / duration) 52 | 53 | def __getattr__(self, name): 54 | """ 55 | If visualization is configured to use: 56 | return add_data() methods of tensorboard with additional information 57 | (step, tag) added. 58 | Otherwise: 59 | return a blank function handle that does nothing 60 | """ 61 | if name in self.tb_writer_ftns: 62 | add_data = getattr(self.writer, name, None) 63 | 64 | def wrapper(tag, data, *args, **kwargs): 65 | if add_data is not None: 66 | # add mode(train/valid) tag 67 | if name not in self.tag_mode_exceptions: 68 | tag = '{}/{}'.format(tag, self.mode) 69 | add_data(tag, data, self.step, *args, **kwargs) 70 | return wrapper 71 | else: 72 | # default action for returning methods defined in this class, set_step() 73 | # for instance. 
74 | try: 75 | attr = object.__getattr__(name) 76 | except AttributeError: 77 | msg = "type object '{}' has no attribute '{}'" 78 | raise AttributeError(msg.format(self.selected_module, name)) 79 | return attr 80 | -------------------------------------------------------------------------------- /model/text_embedding_models.json: -------------------------------------------------------------------------------- 1 | { 2 | "w2v": { 3 | "weights_path": "data/text_models/GoogleNews-vectors-negative300.bin.gz", 4 | "dim": 300, 5 | "force_cpu": true, 6 | "remove_stopwords": false, 7 | "mirror": "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz" 8 | }, 9 | "grovle": { 10 | "dim": 300, 11 | "weights_path": "data/text_models/grovle.zip", 12 | "force_cpu": true, 13 | "remove_stopwords": false, 14 | "mirror": "http://www.robots.ox.ac.uk/~albanie/data/mirrors/GrOVLE" 15 | }, 16 | "mt_grovle": { 17 | "dim": 300, 18 | "weights_path": "data/text_models/mt_grovle.zip", 19 | "force_cpu": true, 20 | "remove_stopwords": false, 21 | "mirror": "http://www.robots.ox.ac.uk/~albanie/data/mirrors/GrOVLE" 22 | }, 23 | "hglmm_300d": { 24 | "dim": 300, 25 | "weights_path": "data/text_models/hglmm_300d.zip", 26 | "force_cpu": true, 27 | "remove_stopwords": false, 28 | "mirror": "http://www.robots.ox.ac.uk/~albanie/data/mirrors/GrOVLE" 29 | }, 30 | "hglmm_6kd": { 31 | "dim": 6000, 32 | "weights_path": "data/text_models/hglmm_6kd.zip", 33 | "force_cpu": true, 34 | "remove_stopwords": false, 35 | "mirror": "http://www.robots.ox.ac.uk/~albanie/data/mirrors/GrOVLE" 36 | }, 37 | "howto100m_mil_nce": { 38 | "word_dict_path": "data/text_models/howto100m/s3d_dict.npy", 39 | "weights_path": "data/text_models/howto100m/s3d_howto100m.pth", 40 | "dim": 512, 41 | "mirror": "https://www.rocq.inria.fr/cluster-willow/amiech/howto100m" 42 | }, 43 | "openai": { 44 | "dim": 768, 45 | "custom_pipeline": true 46 | }, 47 | "electra": { 48 | "dim": 256 49 | }, 50 | "openai-gpt": { 51 | "dim": 768 52 | }, 53 | "gpt2": { 54 | "dim": 768 55 | }, 56 | "gpt2-medium": { 57 | "dim": 1024 58 | }, 59 | "gpt2-large": { 60 | "dim": 1280 61 | }, 62 | "gpt2-xl": { 63 | "dim": 1600 64 | }, 65 | "gpt2-xl-finetune": { 66 | "dim": 1600 67 | }, 68 | "bert-base-uncased": { 69 | "dim": 768 70 | }, 71 | "t5-small": { 72 | "dim": 512 73 | }, 74 | "t5-base": { 75 | "dim": 768 76 | }, 77 | "t5-large": { 78 | "dim": 1024 79 | }, 80 | "t5-3b": { 81 | "dim": 1024 82 | }, 83 | "t5-11b": { 84 | "force_cpu": true, 85 | "dim": 1024 86 | }, 87 | "albert-base-v2": { 88 | "dim": 768 89 | }, 90 | "albert-large-v2": { 91 | "dim": 1024 92 | }, 93 | "albert-xlarge-v2": { 94 | "dim": 2048 95 | }, 96 | "ctrl": { 97 | "dim": 1280 98 | }, 99 | "roberta-base": { 100 | "dim": 768 101 | }, 102 | "roberta-large": { 103 | "dim": 1024 104 | }, 105 | "xlnet-base-cased": { 106 | "dim": 768 107 | }, 108 | "xlnet-large-cased": { 109 | "dim": 1024 110 | }, 111 | "transfo-xl-wt103": { 112 | "dim": 1024 113 | } 114 | } 115 | -------------------------------------------------------------------------------- /model/loss.py: -------------------------------------------------------------------------------- 1 | """This module contains an implementation of the max margin ranking loss, slightly 2 | modified from this code: 3 | https://github.com/antoine77340/Mixture-of-Embedding-Experts/blob/master/loss.py 4 | 5 | The modification is the `fix_norm` conditional, which removes zero terms from the 6 | diagonal when performing the averaging calculation. 
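For orientation, here is a minimal usage sketch. It is an illustration only, not part of the original module: the 4 x 4 similarity matrix is made up, and it assumes the `MaxMarginRankingLoss` class defined further down this file. The loss expects a square text-video similarity matrix whose diagonal holds the matching pairs.

```python
import torch as th

# toy batch of 4 text-video pairs: sims[i, j] is the similarity between
# caption i and video j, so the diagonal holds the ground-truth matches
sims = th.randn(4, 4, requires_grad=True)

loss_fn = MaxMarginRankingLoss(margin=1, fix_norm=True)  # class defined below
loss = loss_fn(sims)  # mean hinge violation over (positive, negative) pairs
loss.backward()
print(float(loss))
```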
7 | 8 | Original licence below. 9 | """ 10 | # Copyright 2018 Antoine Miech All Rights Reserved. 11 | # 12 | # Licensed under the Apache License, Version 2.0 (the "License"); 13 | # you may not use this file except in compliance with the License. 14 | # You may obtain a copy of the License at 15 | # 16 | # http://www.apache.org/licenses/LICENSE-2.0 17 | # 18 | # Unless required by applicable law or agreed to in writing, software 19 | # distributed under the License is distributed on an "AS-IS" BASIS, 20 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 21 | # See the License for the specific language governing permissions and 22 | # limitations under the License. 23 | 24 | import torch.nn as nn 25 | import torch as th 26 | import torch.nn.functional as F 27 | 28 | 29 | class MaxMarginRankingLoss(nn.Module): 30 | 31 | def __init__(self, margin=1, fix_norm=True): 32 | super().__init__() 33 | self.fix_norm = fix_norm 34 | self.loss = th.nn.MarginRankingLoss(margin) 35 | self.margin = margin 36 | 37 | def forward(self, x): 38 | n = x.size()[0] 39 | 40 | x1 = th.diag(x) 41 | x1 = x1.unsqueeze(1) 42 | x1 = x1.expand(n, n) 43 | x1 = x1.contiguous().view(-1, 1) 44 | x1 = th.cat((x1, x1), 0) 45 | 46 | x2 = x.view(-1, 1) 47 | x3 = x.transpose(0, 1).contiguous().view(-1, 1) 48 | 49 | x2 = th.cat((x2, x3), 0) 50 | max_margin = F.relu(self.margin - (x1 - x2)) 51 | 52 | if self.fix_norm: 53 | # remove the elements from the diagonal 54 | keep = th.ones(x.shape) - th.eye(x.shape[0]) # 128 x 128 55 | keep1 = keep.view(-1, 1) 56 | keep2 = keep.transpose(0, 1).contiguous().view(-1, 1) 57 | keep_idx = th.nonzero(th.cat((keep1, keep2), 0).flatten()).flatten() 58 | if x1.is_cuda: 59 | keep_idx = keep_idx.cuda() 60 | x1_ = th.index_select(x1, dim=0, index=keep_idx) 61 | x2_ = th.index_select(x2, dim=0, index=keep_idx) 62 | max_margin = F.relu(self.margin - (x1_ - x2_)) 63 | 64 | return max_margin.mean() 65 | 66 | 67 | class BCEWithLogitsLoss(nn.Module): 68 | 69 | def __init__(self, weight=None): 70 | super().__init__() 71 | self.loss = th.nn.BCEWithLogitsLoss(weight=weight) 72 | 73 | def forward(self, x, target): 74 | return self.loss(x, target) 75 | 76 | 77 | class CrossEntropyLoss(nn.Module): 78 | 79 | def __init__(self, weight=None): 80 | super().__init__() 81 | self.loss = th.nn.CrossEntropyLoss(weight=weight) 82 | 83 | def forward(self, x, target): 84 | return self.loss(x, target.long().to(x.device)) 85 | 86 | 87 | if __name__ == "__main__": 88 | loss = BCEWithLogitsLoss() 89 | x = th.randn(3, requires_grad=True) 90 | target = th.empty(3).random_(2) 91 | output = loss(x, target) 92 | output.backward() 93 | print(target) 94 | -------------------------------------------------------------------------------- /model/net_vlad.py: -------------------------------------------------------------------------------- 1 | """NetVLAD implementation. 2 | """ 3 | # Copyright 2018 Antoine Miech All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS-IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
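Before the implementation, a shape-level sketch of how the `NetVLAD` module defined below is typically driven. The batch size, descriptor count and dimensions are illustrative only and are not taken from any config in this repository.

```python
import torch as th

# B=2 clips, each with N=7 local descriptors of dimension D=128, aggregated
# over K=16 visual words into a fixed-size B x (D*K) representation
vlad = NetVLAD(cluster_size=16, feature_size=128, ghost_clusters=1)
feats = th.randn(2, 7, 128)
out = vlad(feats)
print(out.shape)  # torch.Size([2, 2048])
```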
16 | 17 | 18 | import math 19 | import ipdb 20 | import torch.nn as nn 21 | import torch.nn.functional as F 22 | import torch as th 23 | 24 | 25 | class NetVLAD(nn.Module): 26 | def __init__(self, cluster_size, feature_size, ghost_clusters=0, 27 | add_batch_norm=True): 28 | super().__init__() 29 | 30 | self.feature_size = feature_size 31 | self.cluster_size = cluster_size 32 | self.ghost_clusters = ghost_clusters 33 | 34 | init_sc = (1 / math.sqrt(feature_size)) 35 | clusters = cluster_size + ghost_clusters 36 | 37 | # The `clusters` weights are the `(w,b)` in the paper 38 | self.clusters = nn.Parameter(init_sc * th.randn(feature_size, clusters)) 39 | self.batch_norm = nn.BatchNorm1d(clusters) if add_batch_norm else None 40 | # The `clusters2` weights are the visual words `c_k` in the paper 41 | self.clusters2 = nn.Parameter(init_sc * th.randn(1, feature_size, cluster_size)) 42 | self.out_dim = self.cluster_size * feature_size 43 | 44 | def forward(self, x, mask=None): 45 | """Aggregates feature maps into a fixed size representation. In the following 46 | notation, B = batch_size, N = num_features, K = num_clusters, D = feature_size. 47 | 48 | Args: 49 | x (th.Tensor): B x N x D 50 | 51 | Returns: 52 | (th.Tensor): B x DK 53 | """ 54 | self.sanity_checks(x) 55 | max_sample = x.size()[1] 56 | x = x.view(-1, self.feature_size) # B x N x D -> BN x D 57 | 58 | if x.device != self.clusters.device: 59 | msg = f"x.device {x.device} != cluster.device {self.clusters.device}" 60 | raise ValueError(msg) 61 | 62 | assignment = th.matmul(x, self.clusters) # (BN x D) x (D x (K+G)) -> BN x (K+G) 63 | 64 | if self.batch_norm: 65 | assignment = self.batch_norm(assignment) 66 | 67 | assignment = F.softmax(assignment, dim=1) # BN x (K+G) -> BN x (K+G) 68 | # remove ghost assigments 69 | assignment = assignment[:, :self.cluster_size] 70 | assignment = assignment.view(-1, max_sample, self.cluster_size) # -> B x N x K 71 | a_sum = th.sum(assignment, dim=1, keepdim=True) # B x N x K -> B x 1 x K 72 | a = a_sum * self.clusters2 73 | 74 | assignment = assignment.transpose(1, 2) # B x N x K -> B x K x N 75 | 76 | x = x.view(-1, max_sample, self.feature_size) # BN x D -> B x N x D 77 | vlad = th.matmul(assignment, x) # (B x K x N) x (B x N x D) -> B x K x D 78 | vlad = vlad.transpose(1, 2) # -> B x D x K 79 | vlad = vlad - a 80 | 81 | # L2 intra norm 82 | vlad = F.normalize(vlad) 83 | 84 | # flattening + L2 norm 85 | vlad = vlad.reshape(-1, self.cluster_size * self.feature_size) # -> B x DK 86 | vlad = F.normalize(vlad) 87 | return vlad # B x DK 88 | 89 | def sanity_checks(self, x): 90 | """Catch any nans in the inputs/clusters""" 91 | if th.isnan(th.sum(x)): 92 | print("nan inputs") 93 | ipdb.set_trace() 94 | if th.isnan(self.clusters[0][0]): 95 | print("nan clusters") 96 | ipdb.set_trace() 97 | -------------------------------------------------------------------------------- /utils/gen_ablations_for_dataset.py: -------------------------------------------------------------------------------- 1 | """Generate a set of ablations for each dataset, using the config structure of the 2 | MSRVTT experiments. 3 | 4 | ipy utils/gen_ablations_for_dataset.py -- --refresh --dest_dataset didemo \ 5 | --update_ablation_list 1 6 | 7 | """ 8 | import json 9 | import argparse 10 | from pathlib import Path 11 | 12 | 13 | def handle_moee_config(config): 14 | """For the official ablations on MSRVTT, we provide MoEE with the same hyperparam 15 | budget as CE and run a search to find the best hyperparams. 
For the unofficial 16 | ablations, we use the same padding/VLAD settings as CE. 17 | """ 18 | config = { 19 | "inherit_from": config["inherit_from"], 20 | "arch": {"type": "CENet", "args": {"use_ce": ""}}, 21 | } 22 | return config 23 | 24 | 25 | def remove_audio_streams(config, dest_path): 26 | """Prune audio-based features from the config and dest_path name (necessary for 27 | datasets like MSVD which do not possess sound.) If the audio feature was the control 28 | variable in the experiment, we return False for the dest_path, such that the ablation 29 | is removed altogether. 30 | """ 31 | audio_tags = ["audio", "speech"] 32 | for audio_tag in audio_tags: 33 | if f"-{audio_tag}." in dest_path: 34 | return config, False 35 | 36 | dest_path = dest_path.replace(f"-{audio_tag}", "") 37 | if "experts" in config and "modalities" in config["experts"]: 38 | if audio_tag in config["experts"]["modalities"]: 39 | config["experts"]["modalities"].remove(audio_tag) 40 | return config, dest_path 41 | 42 | 43 | def main(): 44 | parser = argparse.ArgumentParser() 45 | parser.add_argument('--refresh', action="store_true") 46 | parser.add_argument('--update_ablation_list', type=int, default=1) 47 | parser.add_argument('--src_dataset', default="msrvtt") 48 | parser.add_argument('--dest_dataset', default="lsmdc") 49 | parser.add_argument('--exp_list', default="slurm/msrvtt-ablations.txt") 50 | args = parser.parse_args() 51 | 52 | with open(args.exp_list, "r") as f: 53 | exps = [x for x in f.read().splitlines() if x] 54 | 55 | print(f"Found {len(exps)} experiments in {args.exp_list}") 56 | dest_exp_path = Path(args.exp_list.replace("msrvtt", args.dest_dataset)) 57 | if dest_exp_path.exists() and not args.refresh: 58 | print(f"experiment list found at {dest_exp_path}, skipping...") 59 | return 60 | 61 | output_rows = [] 62 | exclude = ["miech", "jsfusion"] 63 | for row in exps: 64 | flag, config_path, seed_flag, seed_opts = row.split() 65 | if any([x in config_path for x in exclude]): 66 | continue 67 | with open(config_path, "r") as f: 68 | config = json.load(f) 69 | if Path(config_path).stem == "train-full-moee": 70 | config = handle_moee_config(config) 71 | dest_path = config_path.replace(args.src_dataset, args.dest_dataset) 72 | config["inherit_from"] = config["inherit_from"].replace(args.src_dataset, 73 | args.dest_dataset) 74 | if args.dest_dataset == "msvd": 75 | config, dest_path = remove_audio_streams(config, dest_path) 76 | if not dest_path: 77 | continue 78 | 79 | print(f"writing config to {dest_path}") 80 | with open(dest_path, "w") as f: 81 | json.dump(config, f, indent=4, sort_keys=False) 82 | output_rows.append([flag, dest_path, seed_flag, seed_opts]) 83 | 84 | if args.update_ablation_list: 85 | print(f"Writing new experiment list to {dest_exp_path}") 86 | output_rows = [" ".join(x) for x in output_rows] 87 | with open(dest_exp_path, "w") as f: 88 | for row in sorted(list(set(output_rows))): 89 | f.write(f"{row}\n") 90 | 91 | 92 | 93 | if __name__ == "__main__": 94 | main() 95 | -------------------------------------------------------------------------------- /data_loader/QuerYDSegments_dataset.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from typing import Dict, Union, List 3 | from pathlib import Path 4 | 5 | from zsvision.zs_utils import memcache, concat_features 6 | from typeguard import typechecked 7 | 8 | from utils import memory_summary 9 | from base.base_dataset import BaseDataset 10 | 11 | 12 | class 
QuerYDSegments(BaseDataset): 13 | 14 | @staticmethod 15 | @typechecked 16 | def dataset_paths(training_file=None) -> Dict[str, Union[str, List[str], Path, Dict]]: 17 | subset_paths = {} 18 | test_splits = { 19 | "val": "val_list.txt", 20 | "test": "test_list.txt", 21 | } 22 | for split_name, fname in test_splits.items(): 23 | subset_paths[split_name] = {"train": "train_list.txt", "val": fname} 24 | 25 | feature_names = BaseDataset.common_feat_names() 26 | feature_names.append("audio.vggish.0") 27 | text_feat_paths = BaseDataset.common_text_feat_paths() 28 | text_feat_paths = {key: Path("text_embeddings") / fname 29 | for key, fname in text_feat_paths.items()} 30 | challenge_text_feat_paths = {key: f"text_embeddings/{key}.pkl" 31 | for key in text_feat_paths} 32 | custom_paths = { 33 | "audio": ["aggregated_audio/vggish-raw.hickle"], 34 | } 35 | feature_info = { 36 | "custom_paths": custom_paths, 37 | "feature_names": feature_names, 38 | "subset_list_paths": subset_paths, 39 | "text_feat_paths": text_feat_paths, 40 | "challenge_text_feat_paths": challenge_text_feat_paths, 41 | "raw_captions_path": "structured-symlinks/split_raw_captions_filtered.pkl", 42 | } 43 | return feature_info 44 | 45 | def load_features(self): 46 | root_feat = self.root_feat 47 | # import pdb; pdb.set_trace() 48 | feat_names = {key: self.visual_feat_paths(key) for key in 49 | self.paths["feature_names"]} 50 | feat_names.update(self.paths["custom_paths"]) 51 | features = {} 52 | for expert, rel_names in feat_names.items(): 53 | if expert not in self.ordered_experts: 54 | continue 55 | feat_paths = tuple([Path(root_feat) / rel_name for rel_name in rel_names]) 56 | if len(feat_paths) == 1: 57 | features[expert] = memcache(feat_paths[0]) 58 | else: 59 | # support multiple forms of feature (e.g. max and avg pooling). 
For 60 | # now, we only support direct concatenation 61 | msg = f"{expert}: Only direct concatenation of muliple feats is possible" 62 | print(f"Concatenating aggregates for {expert}....") 63 | assert self.feat_aggregation[expert]["aggregate"] == "concat", msg 64 | axis = self.feat_aggregation[expert]["aggregate-axis"] 65 | x = concat_features.cache_info() # pylint: disable=no-value-for-parameter 66 | print(f"concat cache info: {x}") 67 | features_ = concat_features(feat_paths, axis=axis) 68 | memory_summary() 69 | 70 | # Make separate feature copies for each split to allow in-place filtering 71 | features[expert] = copy.deepcopy(features_) 72 | 73 | self.features = features 74 | if self.challenge_mode: 75 | self.load_challenge_text_features() 76 | else: 77 | self.raw_captions = memcache(root_feat / self.paths["raw_captions_path"]) 78 | text_feat_path = root_feat / self.paths["text_feat_paths"][self.text_feat] 79 | self.text_features = memcache(text_feat_path) 80 | 81 | 82 | # overload video paths 83 | self.video_path_retrieval = [f"videos/{x}.mp4" 84 | for x in self.partition_lists["val"]] 85 | 86 | def sanity_checks(self): 87 | msg = (f"Expected to have single test caption for QuerYD, since we assume" 88 | f"that the captions are fused (but using {self.num_test_captions})") 89 | assert self.num_test_captions == 1, msg 90 | -------------------------------------------------------------------------------- /exp_to_seed_time.json: -------------------------------------------------------------------------------- 1 | {"audiocaps-train-full-ce-only-audio": [["c0b5bc86", "seed-0", "2021-06-10_15-34-48"], ["c0b5bc86", "seed-1", "2021-06-10_15-36-14"], ["c0b5bc86", "seed-2", "2021-06-10_15-36-15"]], "audiocaps-train-full-ce-only-inst": [["5ee05383", "seed-0", "2021-06-10_15-32-29"], ["5ee05383", "seed-1", "2021-06-10_15-33-37"], ["5ee05383", "seed-2", "2021-06-10_15-33-51"]], "audiocaps-train-full-ce-only-r2p1d": [["88d3ab9e", "seed-0", "2021-06-10_15-30-03"], ["88d3ab9e", "seed-1", "2021-06-10_15-31-11"], ["88d3ab9e", "seed-2", "2021-06-10_15-31-32"]], "audiocaps-train-full-ce-only-scene": [["74d71d8b", "seed-0", "2021-06-10_15-27-11"], ["74d71d8b", "seed-1", "2021-06-10_15-27-40"], ["74d71d8b", "seed-2", "2021-06-10_15-29-16"]], "audiocaps-train-full-ce-r2p1d-inst": [["cf11d710", "seed-0", "2021-06-10_15-23-04"], ["cf11d710", "seed-1", "2021-06-10_15-23-25"], ["cf11d710", "seed-2", "2021-06-10_15-26-45"]], "audiocaps-train-full-ce-r2p1d-inst-vggish": [["74991f95", "seed-0", "2021-06-10_15-06-31"], ["74991f95", "seed-1", "2021-06-10_15-07-40"], ["74991f95", "seed-2", "2021-06-10_15-12-39"]], "audiocaps-train-full-ce-r2p1d-inst-vggish-vggsound": [["b51f941a", "seed-0", "2021-06-10_14-56-45"], ["b51f941a", "seed-1", "2021-06-10_14-57-08"], ["b51f941a", "seed-2", "2021-06-10_14-59-04"]], "audiocaps-train-full-ce-r2p1d-inst-vggsound": [["1b623fdc", "seed-0", "2021-06-10_14-49-00"], ["1b623fdc", "seed-1", "2021-06-10_14-49-00"], ["1b623fdc", "seed-2", "2021-06-10_14-48-59"]], "audiocaps-train-full-ce-scene-inst": [["55c40cc6", "seed-0", "2021-06-10_15-18-50"], ["55c40cc6", "seed-1", "2021-06-10_15-18-51"], ["55c40cc6", "seed-2", "2021-06-10_15-22-00"]], "audiocaps-train-full-ce-scene-r2p1d": [["b2b14107", "seed-0", "2021-06-10_15-13-04"], ["b2b14107", "seed-1", "2021-06-10_15-14-38"], ["b2b14107", "seed-2", "2021-06-10_15-17-36"]], "audiocaps-train-only-vggsound": [["afab0e0c", "seed-0", "2021-06-16_01-21-37"], ["afab0e0c", "seed-1", "2021-06-16_01-28-08"], ["afab0e0c", "seed-2", 
"2021-06-16_01-33-51"]], "audiocaps-train-vggish-vggsound": [["7e2eda12", "seed-0", "2021-06-09_17-06-26"], ["7e2eda12", "seed-1", "2021-06-09_17-15-12"], ["7e2eda12", "seed-2", "2021-06-09_17-24-01"]], "audiocaps-train-vggish-vggsound-moee": [["f66525f8", "seed-0", "2021-06-09_16-44-00"], ["f66525f8", "seed-1", "2021-06-09_16-51-31"], ["f66525f8", "seed-2", "2021-06-09_16-59-01"]], "audiocaps-train-vggish-vggsound-train_list_10": [["68747f8c", "seed-0", "2021-06-10_11-02-21"], ["68747f8c", "seed-1", "2021-06-10_11-07-21"], ["68747f8c", "seed-2", "2021-06-10_11-10-54"]], "audiocaps-train-vggish-vggsound-train_list_25": [["0151ad7f", "seed-0", "2021-06-10_11-14-25"], ["0151ad7f", "seed-1", "2021-06-10_11-18-48"], ["0151ad7f", "seed-2", "2021-06-10_11-23-12"]], "audiocaps-train-vggish-vggsound-train_list_50": [["4aeeaa0d", "seed-0", "2021-06-10_11-27-36"], ["4aeeaa0d", "seed-1", "2021-06-10_11-33-28"], ["4aeeaa0d", "seed-2", "2021-06-10_11-39-36"]], "audiocaps-train-vggish-vggsound-train_list_75": [["3a8d0584", "seed-0", "2021-06-10_11-45-26"], ["3a8d0584", "seed-1", "2021-06-10_11-52-47"], ["3a8d0584", "seed-2", "2021-06-10_12-00-02"]], "clotho-train-full-ce-only-audio": [["4f58ef05", "seed-0", "2021-06-10_15-38-28"], ["4f58ef05", "seed-1", "2021-06-10_15-39-02"], ["4f58ef05", "seed-2", "2021-06-10_15-39-33"]], "clotho-train-vggish-vggsound": [["dec0c820", "seed-0", "2021-06-10_14-45-51"], ["dec0c820", "seed-1", "2021-06-10_14-45-59"], ["dec0c820", "seed-2", "2021-06-10_14-46-07"]], "clotho-train-vggish-vggsound-moee": [["fafa3e91", "seed-0", "2021-06-10_14-44-51"], ["fafa3e91", "seed-1", "2021-06-10_14-44-51"], ["fafa3e91", "seed-2", "2021-06-10_14-44-51"]], "clotho-train-vggish-vggsound-finetuned": [["74560a6c", "seed-0", "2021-06-10_16-38-40"], ["74560a6c", "seed-1", "2021-06-10_16-39-29"], ["74560a6c", "seed-2", "2021-06-10_16-47-02"]], "clotho-train-vggish-vggsound-moee-finetuned": [["5395fa47", "seed-0", "2021-06-10_16-36-13"], ["5395fa47", "seed-1", "2021-06-10_16-37-11"], ["5395fa47", "seed-2", "2021-06-10_16-37-55"]], "querydsegments-train-full-ce-only-audio": [["70111434", "seed-0", "2021-06-10_14-33-03"], ["70111434", "seed-1", "2021-06-10_14-36-34"], ["70111434", "seed-2", "2021-06-10_14-40-01"]], "activity-net-train-full-ce-audio-only": [["f3ebaada", "seed-0", "2021-07-22_12-44-19"], ["f3ebaada", "seed-1", "2021-07-22_12-46-48"], ["f3ebaada", "seed-2", "2021-07-22_12-49-19"]]} 2 | -------------------------------------------------------------------------------- /data_loader/QuerYD_dataset.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import itertools 3 | from pathlib import Path 4 | from typing import Dict, List, Union 5 | 6 | from base.base_dataset import BaseDataset 7 | from typeguard import typechecked 8 | from utils import memory_summary 9 | from zsvision.zs_utils import concat_features, memcache 10 | 11 | 12 | class QuerYD(BaseDataset): 13 | 14 | @staticmethod 15 | @typechecked 16 | def dataset_paths(training_file=None) -> Dict[str, Union[str, List[str], Path, Dict]]: 17 | subset_paths = {} 18 | test_splits = { 19 | "val": "val_list.txt", 20 | "test": "test_list.txt", 21 | } 22 | for split_name, fname in test_splits.items(): 23 | subset_paths[split_name] = {"train": "train_list.txt", "val": fname} 24 | 25 | feature_names = BaseDataset.common_feat_names() 26 | feature_names.append("audio.vggish.0") 27 | text_feat_paths = BaseDataset.common_text_feat_paths() 28 | text_feat_paths = {key: Path("text_embeddings") / fname 
29 | for key, fname in text_feat_paths.items()} 30 | challenge_text_feat_paths = {key: f"text_embeddings/{key}.pkl" 31 | for key in text_feat_paths} 32 | custom_paths = { 33 | "audio": ["aggregated_audio/vggish-raw.hickle"], 34 | } 35 | feature_info = { 36 | "custom_paths": custom_paths, 37 | "feature_names": feature_names, 38 | "subset_list_paths": subset_paths, 39 | "text_feat_paths": text_feat_paths, 40 | "challenge_text_feat_paths": challenge_text_feat_paths, 41 | "raw_captions_path": "structured-symlinks/raw_captions_combined_filtered.pkl", 42 | } 43 | return feature_info 44 | 45 | def load_features(self): 46 | root_feat = self.root_feat 47 | feat_names = {key: self.visual_feat_paths(key) for key in 48 | self.paths["feature_names"]} 49 | feat_names.update(self.paths["custom_paths"]) 50 | features = {} 51 | for expert, rel_names in feat_names.items(): 52 | if expert not in self.ordered_experts: 53 | continue 54 | feat_paths = tuple([Path(root_feat) / rel_name for rel_name in rel_names]) 55 | if len(feat_paths) == 1: 56 | features[expert] = memcache(feat_paths[0]) 57 | else: 58 | # support multiple forms of feature (e.g. max and avg pooling). For 59 | # now, we only support direct concatenation 60 | msg = f"{expert}: Only direct concatenation of muliple feats is possible" 61 | print(f"Concatenating aggregates for {expert}....") 62 | assert self.feat_aggregation[expert]["aggregate"] == "concat", msg 63 | axis = self.feat_aggregation[expert]["aggregate-axis"] 64 | x = concat_features.cache_info() # pylint: disable=no-value-for-parameter 65 | print(f"concat cache info: {x}") 66 | features_ = concat_features(feat_paths, axis=axis) 67 | memory_summary() 68 | 69 | # Make separate feature copies for each split to allow in-place filtering 70 | features[expert] = copy.deepcopy(features_) 71 | 72 | self.features = features 73 | if self.challenge_mode: 74 | self.load_challenge_text_features() 75 | else: 76 | self.raw_captions = memcache(root_feat / self.paths["raw_captions_path"]) 77 | # keys = list(raw_captions.keys()) 78 | # raw_captions_fused = {} 79 | # for key in keys: 80 | # raw_captions_fused[key] = list(itertools.chain.from_iterable(raw_captions[key])) 81 | # self.raw_captions = raw_captions_fused 82 | text_feat_path = root_feat / self.paths["text_feat_paths"][self.text_feat] 83 | self.text_features = memcache(text_feat_path) 84 | 85 | # overload video paths, which are structured differently for YouCook2 86 | self.video_path_retrieval = [f"videos/{x}.mp4" 87 | for x in self.partition_lists["val"]] 88 | 89 | def sanity_checks(self): 90 | msg = (f"Expected to have single test caption for QuerYD, since we assume" 91 | f"that the captions are fused (but using {self.num_test_captions})") 92 | assert self.num_test_captions == 1, msg 93 | -------------------------------------------------------------------------------- /utils/datastructures.py: -------------------------------------------------------------------------------- 1 | """This module defines a datastructure for storing pre-computed features for datasets. 2 | 3 | It provides key-value access, but is backed by a monolithic array to prevent memory 4 | fragmentation. This can be useful for loading large feature sets into memory (e.g. 5 | those that are > 100 GiB) in a manner that minimises OOM issues. 
6 | """ 7 | 8 | import pickle 9 | import argparse 10 | import numpy as np 11 | import humanize 12 | 13 | 14 | class ExpertStore: 15 | 16 | def __init__(self, keylist, dim, dtype=np.float16): 17 | self.keys = keylist 18 | self.dim = dim 19 | self.store_dtype = dtype 20 | self.store = np.zeros((len(keylist), dim), dtype=dtype) 21 | self.keymap = {} 22 | self.missing = set() 23 | self.rebuild_keymap() 24 | 25 | def __setitem__(self, key, value): 26 | idx = self.keymap[key] 27 | if isinstance(value, np.ndarray): 28 | # non-nan values must be vectors of the appropriate size 29 | assert value.size == self.dim, f"cannot set value with size {value.size}" 30 | else: 31 | assert np.isnan(value) 32 | self.store[idx] = value 33 | 34 | def rebuild_keymap(self): 35 | for idx, key in enumerate(self.keys): 36 | self.keymap[key] = idx 37 | 38 | def filter_keys(self, keys, tag, allow_mismatch="", exceptions=None): 39 | keyset = set(keys) 40 | missing = keyset - set(self.keys) 41 | if exceptions is not None and missing: 42 | excluded = missing.intersection(set(exceptions)) 43 | print(f"filter_keys >>> applying exceptions for {len(excluded)} videos") 44 | missing = missing - excluded 45 | print(f"filter_keys >>> {tag}") 46 | if allow_mismatch and missing: 47 | print(f"Key mismatch (missing {len(missing)}) {allow_mismatch}") 48 | else: 49 | samples = list(missing)[:3] 50 | msg = f"cannot apply filter since missing {len(missing)} keys e.g. {samples}" 51 | assert not missing, msg 52 | keep = np.array([x in keyset for x in self.keys]) 53 | filtered_keys = np.array(self.keys)[keep] 54 | print(f"Filtering from {len(self.keys)} keys to {len(filtered_keys)} keys") 55 | self.keys = filtered_keys 56 | self.store = self.store[keep] 57 | self.rebuild_keymap() 58 | 59 | def __getitem__(self, key): 60 | return self.store[self.keymap[key]] 61 | 62 | def __len__(self): 63 | return len(self.keys) 64 | 65 | def __repr__(self): 66 | keep_samples = 3 67 | samples = list(self.keymap.items())[:keep_samples] 68 | sample_str = "\n".join([f"{key}: {val}" for key, val in samples]) 69 | summary = ( 70 | f"ExpertStore object with {len(self.keys)} features (dim: {self.dim})" 71 | f" (storage is using {humanize.naturalsize(self.store.nbytes)})" 72 | f"\nFirst {keep_samples} elements of keymap: \n{sample_str}" 73 | ) 74 | return summary 75 | 76 | 77 | def gen_dict_store(keylist, dim): 78 | store = dict() 79 | for key in keylist: 80 | store[key] = np.random.rand(1, dim).astype(np.float16) 81 | return store 82 | 83 | 84 | def main(): 85 | parser = argparse.ArgumentParser() 86 | parser.add_argument("--dataset", default="moments-in-time") 87 | parser.add_argument("--dim", type=int, default=2048) 88 | args = parser.parse_args() 89 | 90 | from config import get_data_paths 91 | data_paths = get_data_paths(args.dataset) 92 | relevant_path = data_paths["relevant-id-list"] 93 | with open(relevant_path, "r") as f: 94 | relevant_ids = sorted(f.read().splitlines()) 95 | 96 | for store_name in "dict", "np", "expert_store": 97 | if store_name == "dict": 98 | store = gen_dict_store(keylist=relevant_ids, dim=args.dim) 99 | elif store_name == "np": 100 | store = np.random.rand(len(relevant_ids), args.dim).astype(np.float16) 101 | elif store_name == "expert_store": 102 | store = ExpertStore(keylist=relevant_ids, dim=args.dim) 103 | print(store) 104 | serialised = pickle.dumps(store) 105 | print(f"Memory needs for {store_name}: {humanize.naturalsize(len(serialised))}") 106 | 107 | 108 | 109 | if __name__ == "__main__": 110 | main() 111 | 
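As a quick illustration of the `ExpertStore` interface described above (a sketch only: the video ids and the 4-d feature size are hypothetical, and it assumes `ExpertStore` from this module is in scope):

```python
import numpy as np

# three hypothetical video ids with 4-d features, backed by one contiguous array
store = ExpertStore(keylist=["vid_a", "vid_b", "vid_c"], dim=4, dtype=np.float16)
store["vid_b"] = np.arange(4, dtype=np.float16)  # assign a single feature vector
store["vid_a"] = np.nan                          # mark a row as missing

print(store["vid_b"])  # -> [0. 1. 2. 3.]
print(store)           # __repr__ reports the key count and memory footprint
store.filter_keys(["vid_a", "vid_b"], tag="toy-filter")  # keep a subset, in place
print(len(store))      # -> 2
```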
-------------------------------------------------------------------------------- /misc/gen_tar_lists.py: -------------------------------------------------------------------------------- 1 | """ 2 | ipy misc/gen_tar_lists.py -- --dataset YouCook2 3 | """ 4 | import copy 5 | import json 6 | import argparse 7 | from typing import Dict, List, Tuple 8 | from pathlib import Path 9 | 10 | import tqdm 11 | from beartype import beartype 12 | from zsvision.zs_utils import load_json_config 13 | from gen_readme import dataset_paths, model_specs2path 14 | 15 | 16 | @beartype 17 | def generate_tar_lists( 18 | save_dir: Path, 19 | experiments: Dict[str, Tuple[str, str]], 20 | datasets: List[str], 21 | refresh: bool, 22 | ): 23 | all_feat_paths = {} 24 | # import pdb; pdb.set_trace() 25 | for exp_name, (group_id, timestamp) in tqdm.tqdm(experiments.items()): 26 | rel_path = Path(group_id) / "seed-0" / timestamp / "config.json" 27 | config_path = Path(save_dir) / "models" / exp_name / rel_path 28 | try: 29 | with open(config_path, "r") as f: 30 | config = json.load(f) 31 | except FileNotFoundError: 32 | rel_path = Path(group_id) / "seed-1" / timestamp / "config.json" 33 | config_path = Path(save_dir) / "models" / exp_name / rel_path 34 | with open(config_path, "r") as f: 35 | config = json.load(f) 36 | 37 | feat_aggregation = config["data_loader"]["args"]["feat_aggregation"] 38 | dataset_name = exp_name.split("-train")[0] 39 | if dataset_name not in [x.lower() for x in datasets]: 40 | continue 41 | if dataset_name not in all_feat_paths: 42 | all_feat_paths[dataset_name] = set() 43 | split_names = [config["data_loader"]["args"]["split_name"]] 44 | if "eval_settings" in config and config["eval_settings"]: 45 | test_split = config["eval_settings"]["data_loader"]["args"]["split_name"] 46 | split_names.append(test_split) 47 | keep = set(config["experts"]["modalities"]) 48 | text_feat = config["experts"]["text_feat"] 49 | root_feat, paths = dataset_paths(dataset_name) 50 | modern_feat_agg = {key: val for key, val in feat_aggregation.items() 51 | if key in paths["feature_names"]} 52 | feat_paths = model_specs2path(modern_feat_agg, keep) 53 | all_feat_paths[dataset_name].update({root_feat / x for x in feat_paths}) 54 | for key, feat_list in paths["custom_paths"].items(): 55 | for feat_path in feat_list: 56 | all_feat_paths[dataset_name].add(root_feat / feat_path) 57 | # import pdb; pdb.set_trace() 58 | text_paths = [root_feat / paths["text_feat_paths"][text_feat]] 59 | all_feat_paths[dataset_name].update(set(text_paths)) 60 | all_feat_paths[dataset_name].add(root_feat / paths["raw_captions_path"]) 61 | if "dict_youtube_mapping_path" in paths: 62 | all_feat_paths[dataset_name].add( 63 | root_feat / paths["dict_youtube_mapping_path"]) 64 | for split_name in split_names: 65 | split_paths = set(root_feat / x for x in 66 | paths["subset_list_paths"][split_name].values()) 67 | all_feat_paths[dataset_name].update(split_paths) 68 | 69 | for dataset_name, paths in all_feat_paths.items(): 70 | tar_include_list = Path("misc") / "datasets" / dataset_name / "tar_include.txt" 71 | tar_include_list.parent.mkdir(exist_ok=True, parents=True) 72 | if tar_include_list.exists() and not refresh: 73 | print(f"Found existing tar include list at {tar_include_list}, skipping...") 74 | continue 75 | with open(tar_include_list, "w") as f: 76 | for path in sorted(paths): 77 | if "aggregated_speech" not in str(path): 78 | print(f"Writing {path} to {tar_include_list}") 79 | f.write(f"{path}\n") 80 | 81 | 82 | def main(): 83 | parser = 
argparse.ArgumentParser() 84 | parser.add_argument("--save_dir", default="data/saved", type=Path) 85 | parser.add_argument("--refresh", action="store_true") 86 | parser.add_argument("--experiments_path", default="misc/experiments.json") 87 | parser.add_argument("--target", default="main") 88 | parser.add_argument("--data_dir", type=Path, default="data") 89 | parser.add_argument("--challenge_phase", default="public_server_val", 90 | choices=["public_server_val", "public_server_test"]) 91 | parser.add_argument("--datasets", nargs="+", 92 | default=["activity-net", 93 | "QuerYD", "QuerYDSegments"]) 94 | args = parser.parse_args() 95 | 96 | with open(args.experiments_path, "r") as f: 97 | experiments = json.load(f) 98 | 99 | generate_tar_lists( 100 | save_dir=args.save_dir, 101 | datasets=args.datasets, 102 | experiments=experiments, 103 | refresh=args.refresh, 104 | ) 105 | 106 | 107 | if __name__ == "__main__": 108 | main() 109 | -------------------------------------------------------------------------------- /data_loader/CLOTHO_dataset.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import itertools 3 | from pathlib import Path 4 | from typing import Dict, List, Union 5 | 6 | from base.base_dataset import BaseDataset 7 | from typeguard import typechecked 8 | from utils import memory_summary 9 | from zsvision.zs_utils import concat_features, memcache 10 | 11 | 12 | class CLOTHO(BaseDataset): 13 | 14 | @staticmethod 15 | @typechecked 16 | def dataset_paths(training_file=None) -> Dict[str, Union[str, List[str], Path, Dict]]: 17 | subset_paths = {} 18 | test_splits = { 19 | "val": "val_list.txt", 20 | "test": "test_list.txt", 21 | } 22 | for split_name, fname in test_splits.items(): 23 | subset_paths[split_name] = {"train": "train_list.txt", "val": fname} 24 | 25 | feature_names = BaseDataset.common_feat_names() 26 | feature_names.append("audio.vggish.0") 27 | text_feat_paths = BaseDataset.common_text_feat_paths() 28 | text_feat_paths = {key: Path("text_embeddings") / fname 29 | for key, fname in text_feat_paths.items()} 30 | challenge_text_feat_paths = {key: f"text_embeddings/{key}.pkl" 31 | for key in text_feat_paths} 32 | custom_paths = { 33 | "audio": ["aggregated_audio/vggish-raw.hickle"], 34 | "pann": ["aggregated_pann/pann-raw.hickle"], 35 | "syncnet": ["aggregated_syncnet/syncnet-raw.hickle"], 36 | "vggsound": ["aggregated_vggsound/vggsound-raw.hickle"], 37 | # "vggsound": ["aggregated_vggsound/vggsound-avg.pickle"], 38 | "speech": ["aggregated_speech/w2v_mean.pkl"] 39 | } 40 | feature_info = { 41 | "custom_paths": custom_paths, 42 | "feature_names": feature_names, 43 | "subset_list_paths": subset_paths, 44 | "text_feat_paths": text_feat_paths, 45 | "challenge_text_feat_paths": challenge_text_feat_paths, 46 | "raw_captions_path": "structured-symlinks/raw-captions.pkl", 47 | } 48 | return feature_info 49 | 50 | def load_features(self): 51 | root_feat = self.root_feat 52 | feat_names = {key: self.visual_feat_paths(key) for key in 53 | self.paths["feature_names"]} 54 | feat_names.update(self.paths["custom_paths"]) 55 | features = {} 56 | for expert, rel_names in feat_names.items(): 57 | if expert not in self.ordered_experts: 58 | continue 59 | feat_paths = tuple([Path(root_feat) / rel_name for rel_name in rel_names]) 60 | if len(feat_paths) == 1: 61 | features[expert] = memcache(feat_paths[0]) 62 | else: 63 | # support multiple forms of feature (e.g. max and avg pooling). 
For 64 | # now, we only support direct concatenation 65 | msg = f"{expert}: Only direct concatenation of muliple feats is possible" 66 | print(f"Concatenating aggregates for {expert}....") 67 | assert self.feat_aggregation[expert]["aggregate"] == "concat", msg 68 | axis = self.feat_aggregation[expert]["aggregate-axis"] 69 | x = concat_features.cache_info() # pylint: disable=no-value-for-parameter 70 | print(f"concat cache info: {x}") 71 | features_ = concat_features(feat_paths, axis=axis) 72 | memory_summary() 73 | 74 | # if expert == "speech": 75 | # features_defaults = defaultdict(lambda: np.zeros((1, 300))) 76 | # features_defaults.update(features_) 77 | # features_ = features_defaults 78 | # Make separate feature copies for each split to allow in-place filtering 79 | features[expert] = copy.deepcopy(features_) 80 | 81 | self.features = features 82 | if self.challenge_mode: 83 | self.load_challenge_text_features() 84 | else: 85 | self.raw_captions = memcache(root_feat / self.paths["raw_captions_path"]) 86 | # keys = list(raw_captions.keys()) 87 | # raw_captions_fused = {} 88 | # for key in keys: 89 | # raw_captions_fused[key] = list(itertools.chain.from_iterable(raw_captions[key])) 90 | # self.raw_captions = raw_captions_fused 91 | text_feat_path = root_feat / self.paths["text_feat_paths"][self.text_feat] 92 | self.text_features = memcache(text_feat_path) 93 | 94 | # overload video paths, which are structured differently for YouCook2 95 | self.video_path_retrieval = [f"videos/{x}.mp4" 96 | for x in self.partition_lists["val"]] 97 | 98 | def sanity_checks(self): 99 | msg = (f"Expected to have single test caption for AudioCaps, since we assume" 100 | f"that the captions are fused (but using {self.num_test_captions})") 101 | if self.fuse_captions is True: 102 | assert self.num_test_captions == 1, msg 103 | -------------------------------------------------------------------------------- /logger/log_parser.py: -------------------------------------------------------------------------------- 1 | import re 2 | import scipy.stats 3 | import logging 4 | import numpy as np 5 | from collections import defaultdict 6 | 7 | 8 | def log_summary(logger, log_path, eval_mode="test_run", fixed_num_epochs=None): 9 | """Extract performace statistics from experiment log files. 10 | 11 | Args: 12 | logger (logger): reference to primary logging instance 13 | log_path (Path): the path to the log file 14 | eval_mode (str): the method use to collect the statistics. Can be one of: 15 | `test_run`, `fixed_num_epochs` or `geometric_mean` 16 | 17 | NOTE: The `eval_mode` argument differs by dataset: for datasets which provide a 18 | validation set, we use validation set performance to complete a single test run. For 19 | datasets where no validation set is available, we aim to match prior work by either 20 | fixing the number of training epochs, or selecting directly from validation set 21 | performance (Details can be found in the supplementary material of the paper.) 
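To make the matching below concrete, a small synthetic sketch of the two row formats this function scans for (the timestamps and scores here are invented):

```python
import re

rows = [
    "2021-04-03 11:48:50 Setting experiment random seed to 0",
    "2021-04-03 12:01:10 val_t2v_metrics_R1: 12.3 val_t2v_metrics_R5: 34.5",
]
seed = re.search(r"Setting experiment random seed to (\d+)$", rows[0]).groups()[0]
tokens = rows[1].split(" ")
r1 = float(tokens[tokens.index("val_t2v_metrics_R1:") + 1])
print(seed, r1)  # -> 0 12.3
```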
22 | """ 23 | with open(str(log_path), "r") as f: 24 | log = f.read().splitlines() 25 | 26 | # keep track of the random seed used for the part of the logfile being processed 27 | current_seed = None 28 | 29 | # Regex tag for finding the seed 30 | seed_tag = "Setting experiment random seed to" 31 | 32 | if eval_mode == "test_run": 33 | subset = "test" 34 | else: 35 | subset = "val" 36 | 37 | for mode in "t2v", "v2t": 38 | logger.info("") 39 | logger.info("----------------------------------------------------") 40 | logger.info(f"[{mode}] loaded log file with {len(log)} lines....") 41 | logger.info("----------------------------------------------------") 42 | 43 | # Search for the following metrics 44 | scores = { 45 | "R1": defaultdict(list), 46 | "R5": defaultdict(list), 47 | "R10": defaultdict(list), 48 | "R50": defaultdict(list), 49 | "MedR": defaultdict(list), 50 | "MeanR": defaultdict(list), 51 | } 52 | 53 | for row in log: 54 | if seed_tag in row: 55 | # Search for the log file entry describing the current random seed 56 | match = re.search(seed_tag + " (\d+)$", row) # NOQA 57 | assert len(match.groups()) == 1, "expected a single regex match" 58 | current_seed = match.groups()[0] 59 | 60 | if f"{subset}_{mode}_metrics" in row: 61 | tokens = row.split(" ") 62 | for key in scores: 63 | tag = f"{subset}_{mode}_metrics_{key}:" 64 | if tag in tokens: 65 | pos = tokens.index(tag) + 1 66 | val = tokens[pos] 67 | val = float(val) 68 | assert current_seed is not None, "failed to determine the seed" 69 | scores[key][current_seed].append(val) 70 | 71 | agg_scores = {"R1": [], "R5": [], "R10": [], "R50": [], "MedR": [], "MeanR": []} 72 | 73 | # compute the best performance for a single epoch (i.e. sharing the same model 74 | # to compute all stats) 75 | geometric_stats = defaultdict(list) 76 | best_epochs = {} 77 | if eval_mode == "geometric_mean": 78 | raise NotImplementedError("Need to fix this for new log format") 79 | consider = ["R1", "R5", "R10"] 80 | seeds = list(scores["R1"].keys()) 81 | for seed in seeds: 82 | for metric, subdict in scores.items(): 83 | if metric in consider: 84 | geometric_stats[seed].append(subdict[seed]) 85 | gms_raw = np.array(geometric_stats[seed]) 86 | geo_means = scipy.stats.mstats.gmean(gms_raw, axis=0) 87 | best_epochs[seed] = np.argmax(geo_means) 88 | 89 | for metric, subdict in scores.items(): 90 | for seed, values in subdict.items(): 91 | if eval_mode == "test_run": 92 | stat = values[0] 93 | elif eval_mode == "fixed_num_epochs": 94 | stat = values[fixed_num_epochs - 1] 95 | else: 96 | raise ValueError(f"unrecognised eval_mode: {eval_mode}") 97 | agg_scores[metric].append(stat) 98 | 99 | if eval_mode == "fixed_num_epochs": 100 | logger.info(f"Reporting stats with fixed training length: {fixed_num_epochs}") 101 | for metric, values in agg_scores.items(): 102 | logger.info(f"{metric}: {np.mean(values):.1f}, {np.std(values, ddof=1):.1f}") 103 | 104 | 105 | if __name__ == "__main__": 106 | sample_path = "data/saved/log/audiocaps-train-vggish-vggsound/2021-04-03_11-48-50/info.log" 107 | logger_ = logging.getLogger("parser") 108 | logging.basicConfig(level=logging.INFO) 109 | log_summary( 110 | logger=logger_, 111 | log_path=sample_path, 112 | eval_mode="fixed_num_epochs", 113 | fixed_num_epochs=9, 114 | ) 115 | -------------------------------------------------------------------------------- /utils/visualizer.py: -------------------------------------------------------------------------------- 1 | """A simple HTML visualizer. 
2 | 3 | It is based on the Cycle-GAN codebase: 4 | https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix 5 | """ 6 | import os 7 | import numpy as np 8 | from pathlib import Path 9 | from . import util, html 10 | 11 | 12 | class Visualizer: 13 | """This class includes several functions that can display/save images. 14 | 15 | It uses a Python library 'visdom' for display, and a Python library 'dominate' 16 | (wrapped in 'HTML') for creating HTML files with images. 17 | """ 18 | 19 | def __init__(self, exp_name, web_dir, src_video_dir, vis_vid_freq, num_samples=50): 20 | """Initialize the Visualizer class 21 | Create an HTML object for saveing HTML filters 22 | """ 23 | self.name = exp_name 24 | self.web_dir = web_dir 25 | self.vis_vid_freq = vis_vid_freq 26 | self.img_dir = os.path.join(self.web_dir, "images") 27 | self.num_samples = num_samples 28 | print(f"create web directory {self.web_dir}...") 29 | util.mkdirs([self.web_dir, self.img_dir]) 30 | src_dir = Path(src_video_dir).absolute() 31 | print(f"symlinking videos from {src_dir}...") 32 | sym_dir = (Path(self.web_dir) / "videos").absolute() 33 | if sym_dir.is_symlink(): 34 | os.remove(sym_dir) 35 | sym_dir.symlink_to(src_dir) 36 | 37 | def visualize_ranking(self, sims, epoch, meta, nested_metrics): 38 | if not (self.vis_vid_freq and epoch % self.vis_vid_freq == 0): 39 | return 40 | 41 | dists = -sims 42 | np.random.seed(0) 43 | sorted_ranks = np.argsort(dists, axis=1) 44 | gt_dists = np.diag(dists) 45 | rankings = [] 46 | vis_top_k = 5 47 | hide_gt = False 48 | # num_indep_samples = 1 49 | # random_seeds = np.arange(num_indep_samples) 50 | sample = np.random.choice(np.arange(dists.shape[0]), size=self.num_samples, 51 | replace=False) 52 | for ii in sample: 53 | ranked_idx = sorted_ranks[ii][:vis_top_k] 54 | gt_captions = meta["raw_captions"][ii] 55 | # if args.sample_single_gt_caption: 56 | # gt_captions = np.random.choice(gt_captions, 1).tolist() 57 | 58 | datum = { 59 | "gt-sim": -gt_dists[ii], 60 | "gt-captions": gt_captions, 61 | "gt-rank": np.where(sorted_ranks[ii] == ii)[0][0], 62 | "gt-path": meta["paths"][ii], 63 | "top-k-sims": -dists[ii][ranked_idx], 64 | "top-k-paths": np.array(meta["paths"])[ranked_idx], 65 | "hide-gt": hide_gt, 66 | } 67 | rankings.append(datum) 68 | self.display_current_results( 69 | rankings, 70 | epoch=epoch, 71 | metrics=nested_metrics["t2v_metrics"], 72 | ) 73 | 74 | def display_current_results(self, rankings, epoch, metrics): 75 | """Display current results on visdom; save current results to an HTML file. 
76 | 77 | Parameters: 78 | rankings (list) - - per-query ranking dicts produced by visualize_ranking 79 | epoch (int) - - the current epoch 80 | metrics (dict) - - retrieval metrics (R1/R5/R10/MedR) shown in the page header 81 | """ 82 | if not Path(self.web_dir).exists(): 83 | Path(self.web_dir).mkdir(exist_ok=True, parents=True) 84 | print(f"updating webpage at {self.web_dir}") 85 | title = f"Experiment name = {self.name}" 86 | refresh = True 87 | if not refresh: 88 | print("DISABLING WEB PAGE REFRESH") 89 | webpage = html.HTML(web_dir=self.web_dir, title=title, refresh=refresh) 90 | 91 | msg = f"epoch [{epoch}] - {self.name}" 92 | webpage.add_header(msg) 93 | msg = (f"R1: {metrics['R1']:.1f}, " 94 | f"R5: {metrics['R5']:.1f}, " 95 | f"R10: {metrics['R10']:.1f}, " 96 | f"MedR: {metrics['MedR']}") 97 | webpage.add_header(msg) 98 | print(f"Top {len(rankings[0])} retrieved videos at epoch: {epoch}") 99 | 100 | for ranking in rankings: 101 | vids, txts, links = [], [], [] 102 | gt_vid_path = ranking["gt-path"] 103 | gt_captions = [" ".join(x) for x in ranking["gt-captions"]] 104 | gt_captions = "<br>".join(gt_captions) 105 | if ranking["hide-gt"]: 106 | txts.append(gt_captions) 107 | links.append("hidden") 108 | vids.append("hidden") 109 | else: 110 | txt = (f"{gt_captions}<br>
Rank: {ranking['gt-rank']}, " 111 | f"Sim: {ranking['gt-sim']:.3f} [{Path(ranking['gt-path']).stem}]") 112 | txts.append(txt) 113 | links.append(gt_vid_path) 114 | vids.append(gt_vid_path) 115 | 116 | for idx, (vid_path, sim) in enumerate(zip(ranking["top-k-paths"], 117 | ranking["top-k-sims"])): 118 | vid_path = Path(vid_path) 119 | if ranking["hide-gt"]: 120 | txt = f"choice: {idx}" 121 | else: 122 | txt = f"Rank: {idx}, Sim: {sim:.3f}, [{Path(vid_path).stem}]" 123 | txts.append(txt) 124 | vids.append(vid_path) 125 | links.append(vid_path) 126 | webpage.add_videos(vids, txts, links, width=200) 127 | print(f"added {len(vids)} videos") 128 | webpage.save() 129 | -------------------------------------------------------------------------------- /data_loader/ActivityNet_dataset.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from typing import Dict, Union, List 3 | from pathlib import Path 4 | 5 | from zsvision.zs_utils import memcache, concat_features 6 | from typeguard import typechecked 7 | 8 | from utils import memory_summary 9 | from base.base_dataset import BaseDataset 10 | 11 | 12 | class ActivityNet(BaseDataset): 13 | 14 | @staticmethod 15 | @typechecked 16 | def dataset_paths(training_file=None) -> Dict[str, Union[str, List[str], Path, Dict]]: 17 | subset_paths = {} 18 | test_splits = { 19 | "val1": "val_1_list.txt", 20 | "val": "val_list.txt", 21 | "public_server_val": "public_server_val.txt", 22 | "public_server_test": "public_server_test.txt", 23 | } 24 | for split_name, fname in test_splits.items(): 25 | if training_file is None: 26 | subset_paths[split_name] = {"train": "train_list.txt", "val": fname} 27 | else: 28 | subset_paths[split_name] = {"train": training_file, "val": fname} 29 | 30 | 31 | feature_names = BaseDataset.common_feat_names() 32 | custom_paths = { 33 | "audio": ["aggregated_audio/vggish-audio-raw.pickle"], 34 | "speech": ["aggregated_speech/goog_w2v-speech-raw.pickle"], 35 | "ocr": ["aggregated_ocr_feats/ocr-w2v.pkl"], 36 | "face": ["aggregated_facefeats_25fps_256px_stride1/face-avg.pickle"], 37 | } 38 | text_feat_paths = BaseDataset.common_text_feat_paths() 39 | text_feat_dir = Path("aggregated_text_feats") 40 | 41 | text_feat_paths = {key: text_feat_dir / fname 42 | for key, fname in text_feat_paths.items()} 43 | challenge_text_feat_paths = {} 44 | # include non-standard text features 45 | for text_feat in ("openai", ): 46 | text_feat_names = {key: f"{text_feat}-{key}" 47 | for key in {"train", "val1"}} 48 | text_feat_paths[text_feat] = {key: f"aggregated_text_feats/{val}.pkl" 49 | for key, val in text_feat_names.items()} 50 | challenge_text_feat_paths[text_feat] = \ 51 | f"aggregated_text_feats/{text_feat}.pkl" 52 | feature_info = { 53 | "custom_paths": custom_paths, 54 | "feature_names": feature_names, 55 | "subset_list_paths": subset_paths, 56 | "text_feat_paths": text_feat_paths, 57 | "challenge_text_feat_paths": challenge_text_feat_paths, 58 | "raw_captions_path": "raw-captions-train-val_1.pkl", 59 | } 60 | return feature_info 61 | 62 | def load_features(self): 63 | root_feat = self.root_feat 64 | if self.distil_params is not None: 65 | self.distil_features = {} 66 | d_base_path = self.distil_params['base_path'] 67 | 68 | teachers = list(map(lambda x: root_feat / Path(d_base_path + x), self.distil_params['teachers'])) 69 | 70 | for i, f_name in enumerate(teachers): 71 | self.distil_features[i] = memcache(f_name) 72 | 73 | feat_names = {key: self.visual_feat_paths(key) for key in 74 | 
self.paths["feature_names"]} 75 | feat_names.update(self.paths["custom_paths"]) 76 | features = {} 77 | for expert, rel_names in feat_names.items(): 78 | if expert not in self.ordered_experts: 79 | continue 80 | feat_paths = tuple([Path(root_feat) / rel_name for rel_name in rel_names]) 81 | if len(feat_paths) == 1: 82 | features[expert] = memcache(feat_paths[0]) 83 | else: 84 | # support multiple forms of feature (e.g. max and avg pooling). For 85 | # now, we only support direct concatenation 86 | msg = f"{expert}: Only direct concatenation of muliple feats is possible" 87 | print(f"Concatenating aggregates for {expert}....") 88 | assert self.feat_aggregation[expert]["aggregate"] == "concat", msg 89 | axis = self.feat_aggregation[expert]["aggregate-axis"] 90 | x = concat_features.cache_info() # pylint: disable=no-value-for-parameter 91 | print(f"concat cache info: {x}") 92 | features_ = concat_features(feat_paths, axis=axis) 93 | memory_summary() 94 | 95 | # Make separate feature copies for each split to allow in-place filtering 96 | features[expert] = copy.deepcopy(features_) 97 | 98 | self.features = features 99 | if self.challenge_mode: 100 | self.load_challenge_text_features() 101 | else: 102 | text_feat_paths = self.paths["text_feat_paths"][self.text_feat] 103 | if isinstance(text_feat_paths, dict): 104 | text_features = memcache(root_feat / text_feat_paths["train"]) 105 | text_features.update(memcache( 106 | root_feat / text_feat_paths[self.split_name])) 107 | elif isinstance(text_feat_paths, (Path, str)): 108 | text_features = memcache(root_feat / text_feat_paths) 109 | else: 110 | raise TypeError(f"Unexpected type {type(text_feat_paths)}") 111 | self.text_features = text_features 112 | self.raw_captions = memcache(root_feat / self.paths["raw_captions_path"]) 113 | 114 | def sanity_checks(self): 115 | msg = (f"Expected to have single test caption for ANet, since we assume" 116 | f"that the captions are fused (but using {self.num_test_captions})") 117 | assert self.num_test_captions == 1, msg 118 | -------------------------------------------------------------------------------- /eval.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import logging 4 | import os 5 | import subprocess 6 | import tqdm 7 | import wget 8 | from collections import defaultdict 9 | from datetime import datetime 10 | from pathlib import Path 11 | 12 | import numpy as np 13 | import torch 14 | import random 15 | 16 | def extracting_log_info(log_files, experiment, logging): 17 | metrics_t2v = defaultdict(list) 18 | metrics_v2t = defaultdict(list) 19 | 20 | for file_name in log_files: 21 | output_string = f"{experiment}:\n" 22 | with open(Path("logs_eval") / file_name, 'r') as f: 23 | content_lines = f.read().splitlines() 24 | content_lines = content_lines[-14:] 25 | for line in content_lines: 26 | if 't2v' in line: 27 | metric_entry = line.split('test_t2v_metrics_')[1].split(':')[0] 28 | metrics_t2v[metric_entry].append(float(line.split('test_t2v_metrics_')[1].split(':')[1])) 29 | elif 'v2t' in line: 30 | metric_entry = line.split('test_v2t_metrics_')[1].split(':')[0] 31 | metrics_v2t[metric_entry].append(float(line.split('test_v2t_metrics_')[1].split(':')[1])) 32 | keys = list(metrics_t2v.keys()) 33 | 34 | for key in keys: 35 | output_string += f"{key}_t2v: {np.mean(metrics_t2v[key]):.1f}, {np.std(metrics_t2v[key], ddof=1):.1f}\n" 36 | for key in keys: 37 | output_string += f"{key}_v2t: {np.mean(metrics_v2t[key]):.1f}, 
{np.std(metrics_v2t[key], ddof=1):.1f}\n" 38 | logging.info(output_string) 39 | with open(Path("logs_eval") / f"{experiment}_summary.txt", 'w') as f: 40 | f.write(output_string) 41 | 42 | def run_exp(experiments, logging): 43 | for experiment in experiments: 44 | logging.info(f"Now running {experiment}") 45 | run_one_exp(experiment, experiments, logging) 46 | 47 | 48 | def download_configs(experiment, trained_model_path, group_id, seed, timestamp): 49 | new_folder = str(trained_model_path).split('/trained_model.pth')[0] 50 | url_config = f"http://www.robots.ox.ac.uk/~vgg/research/collaborative-experts/data/models/{experiment}/{group_id}/{seed}/{timestamp}/config.json" 51 | config_path = Path(new_folder) / 'config.json' 52 | wget.download(url_config, out=str(config_path)) 53 | with open(config_path, 'r') as f: 54 | config_content = json.load(f) 55 | config_content['seed'] = int(seed[-1]) 56 | with open(config_path, 'w') as f: 57 | json.dump(config_content, f) 58 | 59 | 60 | def download_models(experiment, logging, trained_model_path, 61 | group_id, seed, timestamp): 62 | new_folder = str(trained_model_path).split('/trained_model.pth')[0] 63 | if os.path.exists(trained_model_path) is False: 64 | logging.info(f"Downloading model for {seed} since it does not exist on the local machine") 65 | url = f"http://www.robots.ox.ac.uk/~vgg/research/collaborative-experts/data/models/{experiment}/{group_id}/{seed}/{timestamp}/trained_model.pth" 66 | # import pdb; pdb.set_trace() 67 | Path(new_folder).mkdir(exist_ok=True, parents=True) 68 | wget.download(url, out=str(trained_model_path)) 69 | else: 70 | logging.info(f"Model already downloaded for {experiment} seed {seed}") 71 | if os.path.exists(Path(new_folder) / 'config.json') is False: 72 | download_configs(experiment, trained_model_path, group_id, seed, timestamp) 73 | else: 74 | logging.info(f"Config already downloaded for {experiment} seed {seed}") 75 | 76 | def run_one_exp(experiment, experiments, logging): 77 | group_id = experiments[experiment][0] 78 | 79 | with open('exp_to_seed_time.json', 'r') as f: 80 | json_dict = json.load(f) 81 | log_files = [] 82 | for (group_id, seed, timestamp) in json_dict[experiment]: 83 | 84 | group_id_path = Path("data/saved/models") / experiment / group_id 85 | logging.info("Running evaluation on existent seeds") 86 | (Path("logs_eval")).mkdir(exist_ok=True, parents=True) 87 | trained_model_path = group_id_path / seed / timestamp / 'trained_model.pth' 88 | download_models(experiment, logging, trained_model_path, 89 | group_id, seed, timestamp) 90 | config_path = group_id_path / seed / timestamp / 'config.json' 91 | cmd = f"python test.py --config {config_path} --resume {trained_model_path} --device 0 --eval_from_training_config >&1 | tee logs_eval/log_{group_id}_{seed}.txt" 92 | 93 | log_files.append(f"log_{group_id}_{seed}.txt") 94 | logging.info(cmd) 95 | subprocess.call(cmd, shell=True) 96 | logging.info("Now averaging results") 97 | 98 | extracting_log_info(log_files, experiment, logging) 99 | 100 | 101 | 102 | def main(): 103 | parser = argparse.ArgumentParser() 104 | parser.add_argument("--experiments_path", default="misc/experiments-audiocaps.json") 105 | parser.add_argument("--experiment", type=str, default=None) 106 | parser.add_argument( 107 | "--data_dir", 108 | type=Path, 109 | default="data", 110 | ) 111 | parser.add_argument( 112 | "--dataset", 113 | type=str, 114 | default="data", 115 | ) 116 | parser.add_argument( 117 | "--refresh", 118 | action="store_true", 119 | ) 120 | args = parser.parse_args() 
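# Usage sketch: with the defaults above, `python eval.py` evaluates every experiment
# listed in misc/experiments-audiocaps.json, while `python eval.py --experiment NAME`
# evaluates a single entry; NAME is a placeholder and must be a key of that file
# (and of exp_to_seed_time.json).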
121 | os.makedirs('logs', exist_ok=True) 122 | logging.basicConfig(filename=f"logs/{datetime.now().strftime(r'%m%d_%H%M%S')}.log", 123 | level=logging.INFO) 124 | logging.getLogger().addHandler(logging.StreamHandler()) 125 | logging.info(args) 126 | 127 | with open(args.experiments_path, "r") as f: 128 | experiments = json.load(f) 129 | 130 | if args.experiment is None: 131 | run_exp(experiments, logging) 132 | else: 133 | run_one_exp(args.experiment, experiments, logging) 134 | 135 | 136 | 137 | if __name__ == "__main__": 138 | main() 139 | -------------------------------------------------------------------------------- /configs/data_loader_queryd.json: -------------------------------------------------------------------------------- 1 | { 2 | "inherit_from": "configs/base_config_queryd.json", 3 | "eval_mode": "test_run", 4 | "experts": { 5 | "text_feat": "w2v", 6 | "modalities": [ 7 | "imagenet.resnext101_32x48d.0", 8 | "r2p1d.r2p1d-ig65m.0", 9 | "scene.densenet161.0", 10 | "audio" 11 | ] 12 | }, 13 | "arch": { 14 | "type": "CENet", 15 | "args": { 16 | "test_caption_mode": "indep", 17 | "use_ce": "pairwise", 18 | "use_mish": 1, 19 | "use_bn_reason": 1, 20 | "num_g_layers": 3, 21 | "num_h_layers": 0, 22 | "include_self": 1, 23 | "l2renorm": false, 24 | "randomise_feats": "", 25 | "vlad_clusters": { 26 | "text": 20, 27 | "audio": 16 28 | }, 29 | "ghost_clusters": { 30 | "text": 1 31 | }, 32 | "mimic_ce_dims": 0 33 | } 34 | }, 35 | "optimizer": { 36 | "type": "Ranger", 37 | "args": { 38 | "lr": 0.01, 39 | "weight_decay": 1E-03 40 | } 41 | }, 42 | "loss": { 43 | "type": "MaxMarginRankingLoss", 44 | "args": { 45 | "margin": 0.2, 46 | "fix_norm": true 47 | } 48 | }, 49 | "data_loader": { 50 | "type": "ExpertDataLoader", 51 | "args":{ 52 | "dataset_name": "QuerYD", 53 | "data_dir": "data/QuerYD", 54 | "root_feat_folder": "structured-symlinks", 55 | "trn_cat": 0, 56 | "batch_size": 128, 57 | "split_name": "val", 58 | "fuse_captions": true, 59 | "num_test_captions": 1, 60 | "max_tokens": { 61 | "text": 70, 62 | "audio": 500 63 | }, 64 | "feat_aggregation": { 65 | "imagenet.senet154.0": { 66 | "fps": 25, 67 | "stride": 1, 68 | "pixel_dim": 256, 69 | "aggregate-axis": 1, 70 | "offset": 0, 71 | "temporal": "avg", 72 | "aggregate": "concat", 73 | "type": "embed", 74 | "feat_dims": { 75 | "embed": 2048, 76 | "logits": 1000 77 | } 78 | }, 79 | "imagenet.resnext101_32x48d.0": { 80 | "fps": 25, 81 | "stride": 1, 82 | "offset": 0, 83 | "pixel_dim": 256, 84 | "temporal": "avg", 85 | "aggregate": "concat", 86 | "aggregate-axis": 1, 87 | "type": "embed", 88 | "feat_dims": { 89 | "embed": 2048, 90 | "logits": 1000 91 | } 92 | }, 93 | "scene.densenet161.0": { 94 | "stride": 1, 95 | "fps": 25, 96 | "offset": 0, 97 | "temporal": "avg", 98 | "pixel_dim": 256, 99 | "aggregate": "concat", 100 | "aggregate-axis": 1, 101 | "type": "embed", 102 | "feat_dims": { 103 | "embed": 2208, 104 | "logits": 1000 105 | } 106 | }, 107 | "i3d.i3d.0": { 108 | "fps": 25, 109 | "offset": 0, 110 | "stride": 25, 111 | "inner_stride": 1, 112 | "pixel_dim": 256, 113 | "temporal": "avg", 114 | "aggregate": "concat", 115 | "aggregate-axis": 1, 116 | "type": "embed", 117 | "feat_dims": { 118 | "embed": 1024, 119 | "logits": 400 120 | } 121 | }, 122 | "r2p1d.r2p1d-ig65m.0": { 123 | "fps": 30, 124 | "offset": 0, 125 | "stride": 32, 126 | "inner_stride": 1, 127 | "pixel_dim": 256, 128 | "temporal": "avg", 129 | "aggregate": "concat", 130 | "aggregate-axis": 1, 131 | "type": "embed", 132 | "feat_dims": { 133 | "embed": 512, 134 | "logits": 359 
135 | } 136 | }, 137 | "r2p1d.r2p1d-ig65m-kinetics.0": { 138 | "fps": 30, 139 | "offset": 0, 140 | "stride": 32, 141 | "inner_stride": 1, 142 | "pixel_dim": 256, 143 | "temporal": "avg", 144 | "aggregate": "concat", 145 | "aggregate-axis": 1, 146 | "type": "embed", 147 | "feat_dims": { 148 | "embed": 512, 149 | "logits": 400 150 | } 151 | } 152 | } 153 | } 154 | }, 155 | "trainer": { 156 | "epochs": 20 157 | }, 158 | "eval_settings": { 159 | "data_loader": { 160 | "args": { 161 | "split_name": "test" 162 | } 163 | }, 164 | "tester": { 165 | "save_dir": "data/saved/", 166 | "verbosity": 2 167 | }, 168 | "disable_gpu": true 169 | }, 170 | "visualizer": { 171 | "type": "Visualizer", 172 | "args":{ 173 | "src_video_dir": "data/QuerYD/videos", 174 | "vis_vid_freq": 500, 175 | "num_samples": 100 176 | } 177 | } 178 | } 179 | -------------------------------------------------------------------------------- /utils/html.py: -------------------------------------------------------------------------------- 1 | import dominate 2 | from dominate.tags import meta, h3, table, tr, td, p, a, img, br, video, source, attr 3 | from dominate.tags import span 4 | import os 5 | 6 | 7 | class HTML: 8 | """This HTML class allows us to save images and write texts into a single HTML file. 9 | 10 | It consists of functions such as add_header (add a text header to the HTML file), 11 | add_images (add a row of images to the HTML file), and save (save the HTML to the disk). 12 | It is based on Python library 'dominate', a Python library for creating and 13 | manipulating HTML documents using a DOM API. 14 | """ 15 | 16 | def __init__(self, web_dir, title, refresh=0): 17 | """Initialize the HTML classes 18 | 19 | Parameters: 20 | web_dir (str) -- a directory that stores the webpage. HTML file will be 21 | created at <web_dir>/index.html; images will be saved at <web_dir>/images/ 22 | title (str) -- the webpage name 23 | refresh (int) -- how often the website refreshes itself; if 0, no refreshing 24 | """ 25 | self.title = title 26 | self.web_dir = web_dir 27 | self.img_dir = os.path.join(self.web_dir, "images") 28 | if not os.path.exists(self.web_dir): 29 | os.makedirs(self.web_dir) 30 | if not os.path.exists(self.img_dir): 31 | os.makedirs(self.img_dir) 32 | 33 | self.doc = dominate.document(title=title) 34 | if refresh > 0: 35 | with self.doc.head: 36 | meta(http_equiv="refresh", content=str(refresh)) 37 | 38 | def get_image_dir(self): 39 | """Return the directory that stores images""" 40 | return self.img_dir 41 | 42 | def add_header(self, text): 43 | """Insert a header to the HTML file 44 | 45 | Parameters: 46 | text (str) -- the header text 47 | """ 48 | with self.doc: 49 | h3(text) 50 | 51 | def add_videos(self, vids, txts, links, width=400, hidden_tag="hidden"): 52 | """add videos to the HTML file 53 | 54 | Parameters: 55 | vids (str list) -- a list of video paths 56 | txts (str list) -- a list of video names shown on the website 57 | links (str list) -- a list of hyperref links; when you click a video, 58 | it will redirect you to a new page 59 | """ 60 | self.t = table(border=1, style="table-layout: fixed;") # Insert a table 61 | self.doc.add(self.t) 62 | colors = ["red", "blue", "gold", "salmon"] 63 | with self.t: 64 | with tr(): 65 | for vid, txt, link in zip(vids, txts, links): 66 | td_style = "word-wrap: break-word; width:{}px".format(width) 67 | with td(style=td_style, halign="center", valign="top"): 68 | with p(): 69 | vid_path = str(vid) 70 | if vid_path == hidden_tag: 71 | p_style = "font-weight: bold; width:{}px;" 72 | p_style = p_style.format(width * 3) 73 | p("hidden video", style=p_style) 74 | else: 75 | with a(href=str(link)): 76 | with video(): 77 | attr(controls="controls") 78 | source(src=vid_path, type="video/mp4") 79 | br() 80 | rows = txt.split("<br>
") 81 | for idx, row in enumerate(rows): 82 | color = colors[idx % len(colors)] 83 | bold_tag = "<b>" 84 | if not row.startswith(bold_tag): 85 | s_style = "color:{};".format(color) 86 | else: 87 | s_style = "color:black; font-weight: bold;" 88 | row = row[len(bold_tag):] 89 | span(row, style=s_style) 90 | 91 | def add_images(self, ims, txts, links, width=400): 92 | """add images to the HTML file 93 | 94 | Parameters: 95 | ims (str list) -- a list of image paths 96 | txts (str list) -- a list of image names shown on the website 97 | links (str list) -- a list of hyperref links; when you click an image, 98 | it will redirect you to a new page 99 | """ 100 | self.t = table(border=1, style="table-layout: fixed;") # Insert a table 101 | self.doc.add(self.t) 102 | with self.t: 103 | with tr(): 104 | for im, txt, link in zip(ims, txts, links): 105 | td_style = "word-wrap: break-word;" 106 | with td(style=td_style, halign="center", valign="top"): 107 | with p(): 108 | with a(href=os.path.join("images", link)): 109 | img( 110 | style="width:%dpx" % width, 111 | src=os.path.join("images", im), 112 | ) 113 | br() 114 | p(txt) 115 | 116 | def save(self): 117 | """save the current content to the HTML file""" 118 | html_file = "%s/index.html" % self.web_dir 119 | f = open(html_file, "wt") 120 | f.write(self.doc.render()) 121 | f.close() 122 | 123 | 124 | if __name__ == "__main__": # we show an example usage here. 125 | html = HTML("web/", "test_html") 126 | html.add_header("hello world") 127 | 128 | ims, txts, links = [], [], [] 129 | for n in range(4): 130 | ims.append("image_%d.png" % n) 131 | txts.append("text_%d" % n) 132 | links.append("image_%d.png" % n) 133 | html.add_images(ims, txts, links) 134 | html.save() 135 | -------------------------------------------------------------------------------- /configs/data_loader_querydsegments.json: -------------------------------------------------------------------------------- 1 | { 2 | "inherit_from": "configs/base_config_queryd.json", 3 | "eval_mode": "test_run", 4 | "experts": { 5 | "text_feat": "w2v", 6 | "modalities": [ 7 | "imagenet.resnext101_32x48d.0", 8 | "r2p1d.r2p1d-ig65m.0", 9 | "scene.densenet161.0", 10 | "audio" 11 | ] 12 | }, 13 | "arch": { 14 | "type": "CENet", 15 | "args": { 16 | "test_caption_mode": "indep", 17 | "use_ce": "pairwise", 18 | "use_mish": 1, 19 | "use_bn_reason": 1, 20 | "num_g_layers": 3, 21 | "num_h_layers": 0, 22 | "include_self": 1, 23 | "l2renorm": false, 24 | "randomise_feats": "", 25 | "vlad_clusters": { 26 | "text": 20, 27 | "audio": 16 28 | }, 29 | "ghost_clusters": { 30 | "text": 1 31 | }, 32 | "mimic_ce_dims": 0 33 | } 34 | }, 35 | "optimizer": { 36 | "type": "Ranger", 37 | "args": { 38 | "lr": 0.01, 39 | "weight_decay": 1E-03 40 | } 41 | }, 42 | "loss": { 43 | "type": "MaxMarginRankingLoss", 44 | "args": { 45 | "margin": 0.2, 46 | "fix_norm": true 47 | } 48 | }, 49 | "data_loader": { 50 | "type": "ExpertDataLoader", 51 | "args":{ 52 | "dataset_name": "QuerYDSegments", 53 | "data_dir": "data/QuerYDSegments", 54 | "root_feat_folder": "structured-symlinks", 55 | "trn_cat": 0, 56 | "batch_size": 128, 57 | "split_name": "val", 58 | "fuse_captions": false, 59 | "num_test_captions": 1, 60 | "max_tokens": { 61 | "text": 70, 62 | "audio": 500 63 | }, 64 | "feat_aggregation": { 65 | "imagenet.senet154.0": { 66 | "fps": 25, 67 | "stride": 1, 68 | "pixel_dim": 256, 69 | "aggregate-axis": 1, 70 | "offset": 0, 71 | "temporal": "avg", 72 | "aggregate": "concat", 73 | "type": "embed", 74 | "feat_dims": { 75 | "embed": 2048, 76 |
"logits": 1000 77 | } 78 | }, 79 | "imagenet.resnext101_32x48d.0": { 80 | "fps": 25, 81 | "stride": 1, 82 | "offset": 0, 83 | "pixel_dim": 256, 84 | "temporal": "avg", 85 | "aggregate": "concat", 86 | "aggregate-axis": 1, 87 | "type": "embed", 88 | "feat_dims": { 89 | "embed": 2048, 90 | "logits": 1000 91 | } 92 | }, 93 | "scene.densenet161.0": { 94 | "stride": 1, 95 | "fps": 25, 96 | "offset": 0, 97 | "temporal": "avg", 98 | "pixel_dim": 256, 99 | "aggregate": "concat", 100 | "aggregate-axis": 1, 101 | "type": "embed", 102 | "feat_dims": { 103 | "embed": 2208, 104 | "logits": 1000 105 | } 106 | }, 107 | "i3d.i3d.0": { 108 | "fps": 25, 109 | "offset": 0, 110 | "stride": 25, 111 | "inner_stride": 1, 112 | "pixel_dim": 256, 113 | "temporal": "avg", 114 | "aggregate": "concat", 115 | "aggregate-axis": 1, 116 | "type": "embed", 117 | "feat_dims": { 118 | "embed": 1024, 119 | "logits": 400 120 | } 121 | }, 122 | "r2p1d.r2p1d-ig65m.0": { 123 | "fps": 30, 124 | "offset": 0, 125 | "stride": 32, 126 | "inner_stride": 1, 127 | "pixel_dim": 256, 128 | "temporal": "avg", 129 | "aggregate": "concat", 130 | "aggregate-axis": 1, 131 | "type": "embed", 132 | "feat_dims": { 133 | "embed": 512, 134 | "logits": 359 135 | } 136 | }, 137 | "r2p1d.r2p1d-ig65m-kinetics.0": { 138 | "fps": 30, 139 | "offset": 0, 140 | "stride": 32, 141 | "inner_stride": 1, 142 | "pixel_dim": 256, 143 | "temporal": "avg", 144 | "aggregate": "concat", 145 | "aggregate-axis": 1, 146 | "type": "embed", 147 | "feat_dims": { 148 | "embed": 512, 149 | "logits": 400 150 | } 151 | }, 152 | "s3dg.s3dg.0": { 153 | "fps": 10, 154 | "offset": 0, 155 | "stride": 16, 156 | "num_segments": null, 157 | "pixel_dim": 256, 158 | "inner_stride": 1, 159 | "temporal": "avg", 160 | "aggregate": "concat", 161 | "aggregate-axis": 1, 162 | "type": "embed", 163 | "feat_dims": { 164 | "embed": 1024, 165 | "logits": 512 166 | } 167 | } 168 | } 169 | } 170 | }, 171 | "trainer": { 172 | "epochs": 20 173 | }, 174 | "eval_settings": { 175 | "data_loader": { 176 | "args": { 177 | "split_name": "test" 178 | } 179 | }, 180 | "tester": { 181 | "save_dir": "data/saved/", 182 | "verbosity": 2 183 | }, 184 | "disable_gpu": true 185 | } 186 | } 187 | -------------------------------------------------------------------------------- /utils/ranger.py: -------------------------------------------------------------------------------- 1 | #Ranger deep learning optimizer - RAdam + Lookahead combined. 2 | #https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer 3 | 4 | #Ranger has now been used to capture 12 records on the FastAI leaderboard. 5 | 6 | #This version = 9.3.19 7 | 8 | #Credits: 9 | #RAdam --> https://github.com/LiyuanLucasLiu/RAdam 10 | #Lookahead --> rewritten by lessw2020, but big thanks to Github @LonePatient and @RWightman for ideas from their code. 11 | #Lookahead paper --> MZhang,G Hinton https://arxiv.org/abs/1907.08610 12 | 13 | #summary of changes: 14 | #full code integration with all updates at param level instead of group, moves slow weights into state dict (from generic weights), 15 | #supports group learning rates (thanks @SHolderbach), fixes sporadic load from saved model issues. 16 | #changes 8/31/19 - fix references to *self*.N_sma_threshold; 17 | #changed eps to 1e-5 as better default than 1e-8. 
18 | 19 | import math 20 | import torch 21 | from torch.optim.optimizer import Optimizer, required 22 | import itertools as it 23 | 24 | 25 | 26 | class Ranger(Optimizer): 27 | 28 | def __init__(self, params, lr=1e-3, alpha=0.5, k=6, N_sma_threshhold=5, betas=(.95,0.999), eps=1e-5, weight_decay=0): 29 | #parameter checks 30 | if not 0.0 <= alpha <= 1.0: 31 | raise ValueError(f'Invalid slow update rate: {alpha}') 32 | if not 1 <= k: 33 | raise ValueError(f'Invalid lookahead steps: {k}') 34 | if not lr > 0: 35 | raise ValueError(f'Invalid Learning Rate: {lr}') 36 | if not eps > 0: 37 | raise ValueError(f'Invalid eps: {eps}') 38 | 39 | #parameter comments: 40 | # beta1 (momentum) of .95 seems to work better than .90... 41 | #N_sma_threshold of 5 seems better in testing than 4. 42 | #In both cases, worth testing on your dataset (.90 vs .95, 4 vs 5) to make sure which works best for you. 43 | 44 | #prep defaults and init torch.optim base 45 | defaults = dict(lr=lr, alpha=alpha, k=k, step_counter=0, betas=betas, N_sma_threshhold=N_sma_threshhold, eps=eps, weight_decay=weight_decay) 46 | super().__init__(params,defaults) 47 | 48 | #adjustable threshold 49 | self.N_sma_threshhold = N_sma_threshhold 50 | 51 | #now we can get to work... 52 | #removed as we now use step from RAdam...no need for duplicate step counting 53 | #for group in self.param_groups: 54 | # group["step_counter"] = 0 55 | #print("group step counter init") 56 | 57 | #look ahead params 58 | self.alpha = alpha 59 | self.k = k 60 | 61 | #radam buffer for state 62 | self.radam_buffer = [[None,None,None] for ind in range(10)] 63 | 64 | #self.first_run_check=0 65 | 66 | #lookahead weights 67 | #9/2/19 - lookahead param tensors have been moved to state storage. 68 | #This should resolve issues with load/save where weights were left in GPU memory from first load, slowing down future runs. 69 | 70 | #self.slow_weights = [[p.clone().detach() for p in group['params']] 71 | # for group in self.param_groups] 72 | 73 | #don't use grad for lookahead weights 74 | #for w in it.chain(*self.slow_weights): 75 | # w.requires_grad = False 76 | 77 | def __setstate__(self, state): 78 | print("set state called") 79 | super(Ranger, self).__setstate__(state) 80 | 81 | 82 | def step(self, closure=None): 83 | loss = None 84 | #note - below is commented out b/c I have other work that passes back the loss as a float, and thus not a callable closure. 85 | #Uncomment if you need to use the actual closure... 
86 | 87 | #if closure is not None: 88 | #loss = closure() 89 | 90 | #Evaluate averages and grad, update param tensors 91 | for group in self.param_groups: 92 | 93 | for p in group['params']: 94 | if p.grad is None: 95 | continue 96 | grad = p.grad.data.float() 97 | if grad.is_sparse: 98 | raise RuntimeError('Ranger optimizer does not support sparse gradients') 99 | 100 | p_data_fp32 = p.data.float() 101 | 102 | state = self.state[p] #get state dict for this param 103 | 104 | if len(state) == 0: #if first time to run...init dictionary with our desired entries 105 | #if self.first_run_check==0: 106 | #self.first_run_check=1 107 | #print("Initializing slow buffer...should not see this at load from saved model!") 108 | state['step'] = 0 109 | state['exp_avg'] = torch.zeros_like(p_data_fp32) 110 | state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) 111 | 112 | #look ahead weight storage now in state dict 113 | state['slow_buffer'] = torch.empty_like(p.data) 114 | state['slow_buffer'].copy_(p.data) 115 | 116 | else: 117 | state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) 118 | state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) 119 | 120 | #begin computations 121 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 122 | beta1, beta2 = group['betas'] 123 | 124 | #compute variance mov avg 125 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 126 | #compute mean moving avg 127 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 128 | 129 | state['step'] += 1 130 | 131 | 132 | buffered = self.radam_buffer[int(state['step'] % 10)] 133 | if state['step'] == buffered[0]: 134 | N_sma, step_size = buffered[1], buffered[2] 135 | else: 136 | buffered[0] = state['step'] 137 | beta2_t = beta2 ** state['step'] 138 | N_sma_max = 2 / (1 - beta2) - 1 139 | N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) 140 | buffered[1] = N_sma 141 | if N_sma > self.N_sma_threshhold: 142 | step_size = math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step']) 143 | else: 144 | step_size = 1.0 / (1 - beta1 ** state['step']) 145 | buffered[2] = step_size 146 | 147 | if group['weight_decay'] != 0: 148 | p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) 149 | 150 | if N_sma > self.N_sma_threshhold: 151 | denom = exp_avg_sq.sqrt().add_(group['eps']) 152 | p_data_fp32.addcdiv_(-step_size * group['lr'], exp_avg, denom) 153 | else: 154 | p_data_fp32.add_(-step_size * group['lr'], exp_avg) 155 | 156 | p.data.copy_(p_data_fp32) 157 | 158 | #integrated look ahead... 
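#in update form: every k steps, slow_buffer <- slow_buffer + alpha * (p.data - slow_buffer),
#then p.data <- slow_buffer, i.e. the slow weights move a fraction alpha towards the fast
#(RAdam) weights and the fast weights are reset to that interpolated point.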
159 | #we do it at the param level instead of group level 160 | if state['step'] % group['k'] == 0: 161 | slow_p = state['slow_buffer'] #get access to slow param tensor 162 | slow_p.add_(self.alpha, p.data - slow_p) #(fast weights - slow weights) * alpha 163 | p.data.copy_(slow_p) #copy interpolated weights to RAdam param tensor 164 | 165 | return loss -------------------------------------------------------------------------------- /data_loader/AudioCaps_dataset.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import itertools 3 | from pathlib import Path 4 | from typing import Dict, List, Union 5 | 6 | from base.base_dataset import BaseDataset 7 | from typeguard import typechecked 8 | from utils import memory_summary 9 | from zsvision.zs_utils import concat_features, memcache 10 | import time 11 | import data_loader 12 | 13 | 14 | class AudioCaps(BaseDataset): 15 | 16 | @typechecked 17 | def __init__(self, testing_file: Union[None, str]=None, **kwargs): 18 | self.testing_file = testing_file 19 | super().__init__(**kwargs) 20 | 21 | print(f"self.testing_file: {self.testing_file}") 22 | 23 | @staticmethod 24 | @typechecked 25 | def dataset_paths(training_file=None, testing_file=None) -> Dict[str, Union[str, List[str], Path, Dict]]: 26 | subset_paths = {} 27 | # import pdb; pdb.set_trace() 28 | if testing_file is None: 29 | test_splits = { 30 | "val": "filtered_val_list.txt", 31 | "test": "final_filtered_test_list.txt", 32 | } 33 | using_testing_file = False 34 | else: 35 | test_splits = { 36 | "val": "filtered_val_list.txt", 37 | "test": testing_file, 38 | } 39 | using_testing_file = True 40 | print(f"using {testing_file}") 41 | if training_file is not None: 42 | try: 43 | val_per = training_file.split('.txt')[0].split('train_list_')[1] 44 | test_splits['val'] = f"filtered_val_list_{val_per}.txt" 45 | except IndexError: 46 | pass 47 | for split_name, fname in test_splits.items(): 48 | if training_file is None: 49 | print(f"using {test_splits['test']} is {using_testing_file} split {split_name}") 50 | subset_paths[split_name] = {"train": "train_list.txt", "val": fname} 51 | print(f"using {subset_paths[split_name]['train']} and {subset_paths[split_name]['val']}") 52 | else: 53 | print(f"using {test_splits['test']} is {using_testing_file} split {split_name}") 54 | subset_paths[split_name] = {"train": training_file, "val": fname} 55 | print(f"using {subset_paths[split_name]['train']} and {subset_paths[split_name]['val']}") 56 | 57 | feature_names = BaseDataset.common_feat_names() 58 | feature_names.append("audio.vggish.0") 59 | feature_names.append("pann.pann.0") 60 | feature_names.append("syncnet.syncnet.0") 61 | feature_names.append("vggsound.vggsound.0") 62 | text_feat_paths = BaseDataset.common_text_feat_paths() 63 | text_feat_paths = {key: Path("text_embeddings") / fname 64 | for key, fname in text_feat_paths.items()} 65 | challenge_text_feat_paths = {key: f"text_embeddings/{key}.pkl" 66 | for key in text_feat_paths} 67 | custom_paths = { 68 | "audio": ["aggregated_audio/vggish-raw.hickle"], 69 | "pann": ["aggregated_pann/pann-raw.hickle"], 70 | "syncnet": ["aggregated_syncnet/syncnet-raw.hickle"], 71 | "vggsound": ["aggregated_vggsound/vggsound-raw.hickle"], 72 | "speech": ["aggregated_speech/w2v_mean.pkl"] 73 | } 74 | feature_info = { 75 | "custom_paths": custom_paths, 76 | "feature_names": feature_names, 77 | "subset_list_paths": subset_paths, 78 | "text_feat_paths": text_feat_paths, 79 | "challenge_text_feat_paths": 
challenge_text_feat_paths, 80 | "raw_captions_path": "structured-symlinks/raw-captions.pkl", 81 | } 82 | return feature_info 83 | 84 | def load_features(self): 85 | root_feat = self.root_feat 86 | feat_names = {key: self.visual_feat_paths(key) for key in 87 | self.paths["feature_names"]} 88 | feat_names.update(self.paths["custom_paths"]) 89 | features = {} 90 | for expert, rel_names in feat_names.items(): 91 | if expert not in self.ordered_experts: 92 | continue 93 | feat_paths = tuple([Path(root_feat) / rel_name for rel_name in rel_names]) 94 | if len(feat_paths) == 1: 95 | features[expert] = memcache(feat_paths[0]) 96 | else: 97 | # support multiple forms of feature (e.g. max and avg pooling). For 98 | # now, we only support direct concatenation 99 | msg = f"{expert}: Only direct concatenation of multiple feats is possible" 100 | print(f"Concatenating aggregates for {expert}....") 101 | assert self.feat_aggregation[expert]["aggregate"] == "concat", msg 102 | axis = self.feat_aggregation[expert]["aggregate-axis"] 103 | x = concat_features.cache_info() # pylint: disable=no-value-for-parameter 104 | print(f"concat cache info: {x}") 105 | features_ = concat_features(feat_paths, axis=axis) 106 | 107 | memory_summary() 108 | 109 | #if expert == "speech": 110 | # features_defaults = defaultdict(lambda: np.zeros((1, 300))) 111 | # features_defaults.update(features_) 112 | # features_ = features_defaults 113 | # Make separate feature copies for each split to allow in-place filtering 114 | features[expert] = copy.deepcopy(features_) 115 | 116 | self.features = features 117 | if self.challenge_mode: 118 | self.load_challenge_text_features() 119 | else: 120 | self.raw_captions = memcache(root_feat / self.paths["raw_captions_path"]) 121 | # keys = list(raw_captions.keys()) 122 | # raw_captions_fused = {} 123 | # for key in keys: 124 | # raw_captions_fused[key] = list(itertools.chain.from_iterable(raw_captions[key])) 125 | # self.raw_captions = raw_captions_fused 126 | text_feat_path = root_feat / self.paths["text_feat_paths"][self.text_feat] 127 | self.text_features = memcache(text_feat_path) 128 | 129 | # overload video paths, which are structured differently for YouCook2 130 | self.video_path_retrieval = [f"videos/{x}.mp4" 131 | for x in self.partition_lists["val"]] 132 | 133 | def sanity_checks(self): 134 | msg = (f"Expected to have a single test caption for AudioCaps, since we assume " 135 | f"that the captions are fused (but using {self.num_test_captions})") 136 | if self.fuse_captions is True: 137 | assert self.num_test_captions == 1, msg 138 | 139 | def configure_train_test_splits(self, split_name): 140 | """Partition the dataset into train/val/test splits.
141 | 142 | Args: 143 | split_name (str): the name of the split 144 | """ 145 | print(f"Now working on {split_name}") 146 | # import pdb; pdb.set_trace() 147 | self.paths = type(self).dataset_paths(training_file=self.training_file, testing_file=self.testing_file) 148 | print("loading training/val splits....") 149 | tic = time.time() 150 | for subset, path in self.paths["subset_list_paths"][split_name].items(): 151 | if self.challenge_mode and split_name == "public_server_test" \ 152 | and subset == "val": 153 | root_feat = Path(self.challenge_test_root_feat_folder) 154 | else: 155 | root_feat = Path(self.root_feat) 156 | subset_list_path = root_feat / path 157 | if subset == "train" and self.eval_only: 158 | rows = [] 159 | else: 160 | with open(subset_list_path) as f: 161 | rows = f.read().splitlines() 162 | self.partition_lists[subset] = rows 163 | print("done in {:.3f}s".format(time.time() - tic)) 164 | self.split_name = split_name 165 | -------------------------------------------------------------------------------- /data_loader/data_loaders.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import functools 3 | from pathlib import Path 4 | from typing import Dict, List, Union 5 | 6 | import torch 7 | from typeguard import typechecked 8 | from torch.utils.data import DataLoader 9 | from zsvision.zs_utils import memcache 10 | 11 | from zsvision.zs_data_structures import HashableDict, HashableOrderedDict 12 | from data_loader.ActivityNet_dataset import ActivityNet 13 | from data_loader.QuerYD_dataset import QuerYD 14 | from data_loader.QuerYDSegments_dataset import QuerYDSegments 15 | from data_loader.AudioCaps_dataset import AudioCaps 16 | from data_loader.CLOTHO_dataset import CLOTHO 17 | 18 | @functools.lru_cache(maxsize=64, typed=False) 19 | def dataset_loader( 20 | text_dropout: float, 21 | fuse_captions: bool, 22 | spatial_feats: bool, 23 | use_zeros_for_missing: bool, 24 | challenge_mode: bool, 25 | eval_only: bool, 26 | task: str, 27 | data_dir: str, 28 | text_agg: str, 29 | text_feat: str, 30 | split_name: str, 31 | dataset_name: str, 32 | cls_partition: str, 33 | root_feat_folder: str, 34 | challenge_test_root_feat_folder: str, 35 | text_dim: int, 36 | num_test_captions: int, 37 | restrict_train_captions: int, 38 | logger: logging.Logger, 39 | max_tokens: Dict[str, int], 40 | raw_input_dims: HashableOrderedDict, 41 | feat_aggregation: HashableDict, 42 | distil_params: Union[None, Dict], 43 | training_file: Union[None, str], 44 | caption_masks: Union[None, str], 45 | ce_shared_dim: Union[None, int], 46 | **args, 47 | ): 48 | print(f"refreshing cache for {dataset_name} data loader [{split_name}]") 49 | kwargs = dict( 50 | task=task, 51 | data_dir=Path(data_dir), 52 | text_dim=text_dim, 53 | logger=logger, 54 | eval_only=eval_only, 55 | text_agg=text_agg, 56 | text_feat=text_feat, 57 | max_tokens=max_tokens, 58 | split_name=split_name, 59 | cls_partition=cls_partition, 60 | spatial_feats=spatial_feats, 61 | text_dropout=text_dropout, 62 | fuse_captions=fuse_captions, 63 | raw_input_dims=raw_input_dims, 64 | challenge_mode=challenge_mode, 65 | root_feat_folder=root_feat_folder, 66 | feat_aggregation=feat_aggregation, 67 | num_test_captions=num_test_captions, 68 | use_zeros_for_missing=use_zeros_for_missing, 69 | restrict_train_captions=restrict_train_captions, 70 | challenge_test_root_feat_folder=challenge_test_root_feat_folder, 71 | distil_params=distil_params, 72 | training_file=training_file, 73 | 
caption_masks=caption_masks, 74 | ce_shared_dim=ce_shared_dim, 75 | **args, 76 | ) 77 | if dataset_name == "ActivityNet": 78 | dataset = ActivityNet(**kwargs) 79 | elif dataset_name == "QuerYD": 80 | dataset = QuerYD(**kwargs) 81 | elif dataset_name == "QuerYDSegments": 82 | dataset = QuerYDSegments(**kwargs) 83 | elif dataset_name == "AudioCaps": 84 | dataset = AudioCaps(**kwargs) 85 | elif dataset_name == "CLOTHO": 86 | dataset = CLOTHO(**kwargs) 87 | return dataset 88 | 89 | 90 | class ExpertDataLoader: 91 | 92 | @typechecked 93 | def __init__( 94 | self, 95 | eval_only: bool, 96 | fuse_captions: bool, 97 | challenge_mode: bool, 98 | use_zeros_for_missing: bool, 99 | trn_cat: int, 100 | text_dim: int, 101 | batch_size: int, 102 | num_workers: int, 103 | num_test_captions: int, 104 | task: str, 105 | data_dir: str, 106 | text_agg: str, 107 | text_feat: str, 108 | split_name: str, 109 | dataset_name: str, 110 | root_feat_folder: str, 111 | text_dropout: float, 112 | max_tokens: Dict[str, int], 113 | raw_input_dims: Dict[str, int], 114 | feat_aggregation: Dict[str, Dict], 115 | logger: logging.Logger, 116 | spatial_feats: bool = False, 117 | restrict_train_captions: int = 0, 118 | drop_last: bool = False, 119 | refresh_lru_cache: bool = False, 120 | cls_partitions: List[str] = ["train", "val", "tiny", "challenge"], 121 | challenge_test_root_feat_folder: str = "challenge", 122 | distil_params: Union[None, Dict] = None, 123 | training_file: Union[None, str] = None, 124 | caption_masks: Union[None, str] = None, 125 | ce_shared_dim: Union[None, int] = None, 126 | **args, 127 | ): 128 | 129 | # Ensure that the dictionaries are hashable to allow use of caching 130 | raw_input_dims = HashableOrderedDict(raw_input_dims) 131 | feat_aggregation = HashableDict(feat_aggregation) 132 | if distil_params is not None: 133 | distil_params = HashableDict(distil_params) 134 | max_tokens = HashableDict(max_tokens) 135 | 136 | if refresh_lru_cache: 137 | logger.info("Explicitly refreshing dataloader and cuda cache") 138 | dataset_loader.cache_clear() 139 | torch.cuda.empty_cache() 140 | memcache.cache_clear() 141 | 142 | if trn_cat: 143 | raise NotImplementedError(f"Support for trn cat will need to be re-added") 144 | 145 | common_kwargs = dict( 146 | task=task, 147 | logger=logger, 148 | data_dir=data_dir, 149 | text_dim=text_dim, 150 | text_agg=text_agg, 151 | eval_only=eval_only, 152 | text_feat=text_feat, 153 | max_tokens=max_tokens, 154 | dataset_name=dataset_name, 155 | text_dropout=text_dropout, 156 | fuse_captions=fuse_captions, 157 | spatial_feats=spatial_feats, 158 | split_name=split_name, 159 | challenge_mode=challenge_mode, 160 | root_feat_folder=root_feat_folder, 161 | use_zeros_for_missing=use_zeros_for_missing, 162 | challenge_test_root_feat_folder=challenge_test_root_feat_folder, 163 | num_test_captions=num_test_captions, 164 | raw_input_dims=raw_input_dims, 165 | feat_aggregation=feat_aggregation, 166 | restrict_train_captions=restrict_train_captions, 167 | distil_params=distil_params, 168 | training_file=training_file, 169 | caption_masks=caption_masks, 170 | ce_shared_dim=ce_shared_dim, 171 | **args, 172 | ) 173 | 174 | if "retrieval" in task: 175 | # import pdb; pdb.set_trace() 176 | dataset = dataset_loader(cls_partition="train", **common_kwargs) 177 | x = dataset_loader.cache_info() # pylint: disable=no-value-for-parameter 178 | logger.info(f"cache info {x}") 179 | self.dataloaders = {"dataset": dataset} 180 | self.dataloaders["retrieval"] = dataset.get_retrieval_data() 181 | if not 
eval_only: 182 | train_loader = DataLoader( 183 | dataset=dataset, 184 | batch_size=batch_size, 185 | num_workers=num_workers, 186 | collate_fn=dataset.collate_data, 187 | drop_last=drop_last, 188 | shuffle=True, 189 | ) 190 | self.dataloaders["train"] = train_loader 191 | else: 192 | self.dataloaders = {} 193 | for cls_partition in cls_partitions: 194 | cls_dataset = dataset_loader(cls_partition=cls_partition, **common_kwargs) 195 | x = dataset_loader.cache_info() # pylint: disable=no-value-for-parameter 196 | logger.info(f"cache info [{cls_partition}] {x}") 197 | loader = DataLoader( 198 | dataset=cls_dataset, 199 | batch_size=batch_size, 200 | num_workers=num_workers, 201 | collate_fn=cls_dataset.collate_data, 202 | drop_last=False, 203 | shuffle=False, 204 | ) 205 | self.dataloaders[cls_partition] = loader 206 | 207 | logger.info(f"Loading data loaders with {num_workers} workers") 208 | self.num_test_captions = num_test_captions 209 | self.dataset_name = dataset_name 210 | 211 | def __getitem__(self, key): 212 | return self.dataloaders[key] 213 | -------------------------------------------------------------------------------- /misc/datasets/queryd/val_list.txt: -------------------------------------------------------------------------------- 1 | video-RXFilHLLqPM 2 | video-epKCqDN9fBo 3 | video-HkKRouJqGCg 4 | video-Q1CQUivEths 5 | video-0P1Td5OTS-A 6 | video-X2-S3pN1pt0 7 | video-UPA3bwVVzGI 8 | video-qi2m4V21bw4 9 | video-eRT_mIpXjbs 10 | video-qjZtHyPLQCE 11 | video-je4nDvNJXsg 12 | video-_iQ-Rb1ohDU 13 | video-r1FbiXDKonk 14 | video-zXBmZLmfQZ4 15 | video-bwzLiQZDw2I 16 | video-yf9dyfeFsFg 17 | video--rgDvP39Lqw 18 | video-VssqNaBnWoM 19 | video-KdpoLklTozo 20 | video-tyxYHIcIJoc 21 | video-zRgZ3sWvnqs 22 | video-vXccpwytjL8 23 | video-YkAX7Vk3JEw 24 | video-yZyxJxR6RCA 25 | video-5z7fKiO5Uzg 26 | video-81Y1Ligkpb4 27 | video-K_P8kQg1Qq8 28 | video-T0NPYZyI7V8 29 | video-_vUG5rqC6qI 30 | video-KD9vAYYLItg 31 | video-XTW3LVp4pWA 32 | video-oVS7kHYlLBc 33 | video-DvaPRlZtfyc 34 | video-8j3NmTv9AWg 35 | video-jTXOEBHC0HY 36 | video-gBdyU1b0ADQ 37 | video-uEu6r8MkQ0o 38 | video-1qM0p24SNhc 39 | video-SFQB8hJdLZw 40 | video-V4rufe1J-Q8 41 | video-sq83Saeop9Y 42 | video-HK9vbhTTwWU 43 | video-cT_Wuzag6VU 44 | video-ahj8Vef9L24 45 | video-KIZProYn7R4 46 | video-VYOjWnS4cMY 47 | video-OINa46HeWg8 48 | video-SNTcxD_xPfk 49 | video-jhfLlamufKE 50 | video-d61MkuYttDI 51 | video-JgWgJa1NtAY 52 | video-iDnE3PV4YNc 53 | video-UTLCo4PHRAw 54 | video-jcBLKdsmpo0 55 | video-F1NR-_YqOgE 56 | video-YoMEedm1DXM 57 | video-EYmN3Sjgvts 58 | video-HodFzcJHIYI 59 | video-bJw0_Fj4PGY 60 | video-RUkMwGquH_A 61 | video-sc5aZsS0-0Y 62 | video-p4cc4LkJFjg 63 | video-ineZXLbL7s8 64 | video-GaSRAzyxLKs 65 | video-2PEvPfsNDrw 66 | video-tRWbo2x5lnA 67 | video-czTG7JxOruo 68 | video-Y-Z7LObUlwA 69 | video-RBAmLm_jYyY 70 | video-ymc30meWzfg 71 | video-sHLKy3z7HwM 72 | video-d25HklopoSs 73 | video-uqxzi_ghjgc 74 | video-v9wRrYhlRgs 75 | video-zd2xn5U6e-E 76 | video-1I8ICdOySkw 77 | video-RcM0NG2Fuxo 78 | video-CBlaiBV_yJs 79 | video-_tKp2eARy3o 80 | video-I8ZvdblLcnk 81 | video-Gu0wmiIngAw 82 | video-s9jX0S7mvB8 83 | video-cX03usETYI0 84 | video-4FHckDWnDKI 85 | video-pYekAIt9wW8 86 | video-LiK2fhOY0nE 87 | video-ndA4YL-bBAQ 88 | video-uQp0Eihw2WA 89 | video-A5WeiYHnvNY 90 | video-b6yYd6Pq7Ic 91 | video-YTszmB9fqEs 92 | video-Bk4MR0IItiQ 93 | video-OsNH9Tm-A04 94 | video-GbycvPwr1Wg 95 | video--FcsIyqJDzc 96 | video-9pX1hxYW3YY 97 | video-gupNRww6vFc 98 | video-oJsYwehp_r4 99 | video-37aUB92yvHI 100 | 
video-a5V6gdu5ih8 101 | video-AWjBNSshF3s 102 | video-s_RV4Btuv2c 103 | video-sOnqjkJTMaA 104 | video-aLackFf0Zjw 105 | video-On7TvTDOyMQ 106 | video-LvigW4InYyk 107 | video-3yd_1z6OsrE 108 | video-pU0GSbe6r_4 109 | video-x9FzWnWW95U 110 | video-2y1QQWNZxZM 111 | video-beECZjCRLmQ 112 | video-hseWMRV3lA8 113 | video-LSfJQkA-bKE 114 | video-uwCbe2yBqTI 115 | video-kBeggSzwKQ4 116 | video-paXvS0cnQM4 117 | video-_zOX6BO2zjc 118 | video-vI_B7dtF7Q0 119 | video-ka8-Nefp_gk 120 | video-dsf_z4urc4s 121 | video-xrlgfC0SJ9g 122 | video-KzWYP4LsaJw 123 | video-FTL9gQ0pux0 124 | video-c5gLf3_SK3Q 125 | video-DfPMxdHZKsw 126 | video-Dev6T3ZCrY8 127 | video-vU6Ay6yvaLo 128 | video-baM917Zy04A 129 | video-5jjeIH8Y6XM 130 | video-7N98N0GkGjY 131 | video-DCM-sEpyh1Q 132 | video-3JNLwlcPBPI 133 | video-1Ez6dw3ywcc 134 | video-yhofIxEfld0 135 | video-Cdpf1Dl5b_4 136 | video-2DHYhZNHtck 137 | video-L3MtFGWRXAA 138 | video--CR65sS1Frw 139 | video-HJENMThDg0k 140 | video-nIs4S9YDPRs 141 | video-5OBvbyAQ68g 142 | video-ih3CbjixhoI 143 | video-PV34pW-53Os 144 | video-XOqqP5Ww9lE 145 | video-SQBGJr8THGk 146 | video-YlrGp1YxMrc 147 | video-6US4AyvEO_A 148 | video-RF99-5G-Hrk 149 | video-JuKCOthud68 150 | video-Co4dLH29PvM 151 | video-3yQUzU8c4us 152 | video-Y8ZWX2NP3i0 153 | video-xqEqAQadKqE 154 | video-ghg2AP3i5TI 155 | video-jLeZc7li5HM 156 | video-RaRpFuSLyPI 157 | video-T7o0KMXccEA 158 | video-TdUsyXQ8Wrs 159 | video-zYX7iexkODw 160 | video-dkjbMoj0JY4 161 | video-mOaRH-aVFb4 162 | video-wekSrZ-d1bM 163 | video-GyTxtJ4gVLE 164 | video-4GfBVEoxStA 165 | video-7VS1wPeWqAA 166 | video-skKUzMST92g 167 | video-pzAZnOyMTI4 168 | video-J3iVxb8cwOU 169 | video-OZd9jf5nV7I 170 | video-xrhkfADEtMU 171 | video-CRxshNHF98U 172 | video-7WhJ2L5xUqI 173 | video-ClW-SQ7GdiM 174 | video-NjyWl-Bz6Q8 175 | video-rCZ3SN65kIs 176 | video-osP9iJjvlAE 177 | video-30qOijVBS7o 178 | video-63fcAemH_wg 179 | video-MsOzAbUt8n8 180 | video-ndhwbt9OQ0Q 181 | video-WkxE_Fs_mHI 182 | video-i4eADcCnFjo 183 | video--cf_-i_gCdY 184 | video-NB69vdkxn4Y 185 | video-WxXiQqul4io 186 | video-nojC6fP56VI 187 | video-R3qJ-u4b5W4 188 | video-r1AOXI0eBL8 189 | video-oM_M_d9OiHc 190 | video-U5oHhI_GmJs 191 | video-KAyVk_sH42k 192 | video-ezX-a1FT_ns 193 | video-GNZBSZD16cY 194 | video-eWXOurnVTYg 195 | video-sdbHXKlpPAM 196 | video-vfCddWB_Jlw 197 | video-DexH4oCXw-Y 198 | video-fmOaYJ4K09k 199 | video-duF40iZq464 200 | video-z5UScMQUO6Q 201 | video-KHiR4qVpcG8 202 | video-zFCNUW0TfqE 203 | video-Ujg7vcIa7kM 204 | video-yM6UU6QTt4M 205 | video-phQDinMbmic 206 | video-r5L7Iokg5RY 207 | video-DIBw9dSVKdU 208 | video-_La7IMssNOA 209 | video-7Fjt-mlIlTY 210 | video-EpcDZbXslfw 211 | video-prZuZlP4Pqo 212 | video-ndh11VDx_J8 213 | video-X2niZRgGZ7E 214 | video-8jaxiha8-rY 215 | video-C7uAB94aRrQ 216 | video-AoKlbyqbEGM 217 | video-Mv1FKi_-A1I 218 | video-dx0-pNkwOv8 219 | video-KWNEc1Igadg 220 | video-Hf_2ilitep0 221 | video-F5jNkpjGh8A 222 | video-VsY834tcKw8 223 | video-mG4Y2Snygfk 224 | video-vxjW8sfUCCU 225 | video-yCXSgVFsQnQ 226 | video-KYazqIHYqNI 227 | video-5lgHJB1lwYs 228 | video-GPQwSEzXBXU 229 | video-kg-EEBIe7Lk 230 | video-_YoeHOTJBI4 231 | video-yonJuvlA34U 232 | video-kshxs2WBjmE 233 | video-UUlaseGrkLc 234 | video-LlndnhlJnIw 235 | video-4o-qnznd10Y 236 | video-q3wJ32w4s_A 237 | video-qCNodlSc6Hw 238 | video-ml9EdqgtVfU 239 | video-83Wu5xmstn4 240 | video-mrA0oL6wLQA 241 | video-WbmXpHfabuQ 242 | video-Suv9QImeAog 243 | video-jF3I7VpfCEU 244 | video-IYlOZxb0ViI 245 | video-oY6tCnu-1Do 246 | video-fmnUwvZAMjQ 247 | video-g2wsNw07wRY 248 | 
video-L9KC5W7A2yI 249 | video-db3Ep-jM6ZE 250 | video-9NvPlA3G53I 251 | video-uGl4PRmhRxk 252 | video-UZO5q0B5wfw 253 | video-KItqbZXlrdY 254 | video-W4qMyGXTcsg 255 | video-pIyTWg9oV0M 256 | video-9_5wHw6l11o 257 | video-Juj026QZGDo 258 | video-25lxyul1lb0 259 | video-V-AFUpW3oNg 260 | video-W-xuEJVEraw 261 | video-tFf6pt9HOq8 262 | video-FlYf0F1fuTc 263 | video-Xl6yy6a3emw 264 | video-Ta9K22D0o5Q 265 | video-xA9uSxI36Ik 266 | video-MqIJKnUkGLY 267 | video-L-LE-j2zkCU 268 | video-ZdzD897w11s 269 | video-s0GtT-vN33I 270 | video-fWDaRN490BI 271 | video-XbxRqlHtKUE 272 | video-S0zfR9DTwSY 273 | video-p-_UlScFrQ8 274 | video-JMKHbmwltWQ 275 | video-g-uBt1SoCRQ 276 | video-2b7aoZsavu0 277 | video-A0Wk08f8mUU 278 | video-a7CpzJ-sNl8 279 | video-B1yJuGQOUaY 280 | video-gryenlQKTbE 281 | video-txqiwrbYGrs 282 | video-k-Z8xxygd2Y 283 | video-Qo95rTt9ikU 284 | video-xiSIQzwIPzQ 285 | video-ZtCZGwLH5_o 286 | video-ghLkwSlWSXw 287 | video-Ui7jeZSsgFs 288 | video-rVu1oVDRLgE 289 | video-hDjzdFXmH4g 290 | video-f8PXvqYpGCM 291 | video-fAr-ZYq4RmU 292 | video-6MW7bkk3MZ0 293 | video-tNeYTDLZUJA 294 | video-b7cSIiKxEt4 295 | video-_6PNGyfwjTA 296 | video-pGQd630EpLU 297 | video-ZGc06DUIpaA 298 | video-OKaD4EcmZO8 299 | video-_F6h0yH7EyU 300 | video--A8EERSVAdk 301 | video-II5UsqP2JAk 302 | video-60kjpwyQhqg 303 | video-GR2o6k8aPlI 304 | video-vmzlLFAf3M4 305 | video-mm3rTwAxH20 306 | video-9lZlt-SlABw 307 | video-Yz9u-oG3BgM 308 | video-TuE6z8X-rGg 309 | video-QeYISW-Jplw 310 | video-C6a9AQY_srk 311 | video-aWOCk_57xj8 312 | video-VUs6l9p34v8 313 | video-MdEV-jWMGWw 314 | video-T0rrS51ry2s 315 | video-ODDZPV-Avfw 316 | video-g8Ir5rQJeX0 317 | video-x1kQ-38-Drg 318 | video-T47vNsMjjn8 319 | video-vT-naHyejL8 320 | video-UTO0ogdNMdY 321 | video-xxuQnCg9ML0 322 | video-VfDWQG47pAQ 323 | video-ebyf6FzKEiI 324 | video-6XfdHj11-N0 325 | video-lLYp4b_p_wg 326 | video-RxtzQg57O7w 327 | video-qDMMRpSmzq4 328 | video-_4BT6iLtzUM 329 | video-F6j0EbS7skc 330 | video-0LHxvxdRnYc 331 | video-BDnXdeoiYRE 332 | video-vbuq7w3ZDUQ 333 | video-uE74-8YAV9E 334 | video-6ri6gK4FcnE 335 | video-pRfZmKKX2c4 336 | video-bln98NpCLQA 337 | video-gU2vD-FewGM 338 | video-w17iS0AJHjU 339 | video-6yd2tv4Ni4c 340 | video-GSvbZyWXsME 341 | video-t2mU6USTBRE 342 | video-ccgW5CHFg0Y 343 | video-G0E31788Nfg 344 | video-S7VYhBwbprE 345 | video-xSz0zs0v6e8 346 | video-V7QYLEusDU4 347 | video-mwm0OwqWvF4 348 | video-OxrBik16Hzg 349 | video-63d0vV0kk_Y 350 | video-74v22joL7J4 351 | video-RFinNxS5KN4 352 | video-xBD_s0RhUko 353 | video-M1GO1X09Gec 354 | video-LkjuS_tzmIE 355 | video-AB2oAgjjt3g 356 | video-4yG8caPPY1Y 357 | video-4cR7tNWsuNM 358 | video-ztim_RY82G8 359 | video-HF0Ev_skUAY 360 | video-nGkUzdNi_gs 361 | video-TzZuNeRdFIc 362 | video-8zkVKHy1hyM 363 | video-VjOSxus84WY 364 | video-kupuUVYxZxU 365 | video-khvaIwonxUk 366 | video-c_Ex_qS5Djo 367 | video--W37TDK6dBM 368 | video-cN6uZkmGSLM 369 | video-k431Cy2-kkA 370 | video-JCOqo88eW1E 371 | video-8Fu5pKcrTZI 372 | video-_Z2iurLVDEQ 373 | video-zC2G6lf9fCs 374 | video-V3wLiAmIrGk 375 | video-g_QG77WomHo 376 | video-KLzYvzQbBLI 377 | video-QYsg3rtT79o 378 | video-Q8fUy8qwV3M 379 | video-__5k7e0f3r4 380 | video-GRi80V8ire8 381 | video-ckEoLBiE3Xs 382 | video-24f0OwnZE-Q 383 | video-1WpStml5fe8 384 | video-UJfqp1dmJ3I 385 | -------------------------------------------------------------------------------- /utils/radam.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch.optim.optimizer import Optimizer, 
required 4 | 5 | class RAdam(Optimizer): 6 | 7 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0): 8 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) 9 | self.buffer = [[None, None, None] for ind in range(10)] 10 | super(RAdam, self).__init__(params, defaults) 11 | 12 | def __setstate__(self, state): 13 | super(RAdam, self).__setstate__(state) 14 | 15 | def step(self, closure=None): 16 | 17 | loss = None 18 | if closure is not None: 19 | loss = closure() 20 | 21 | for group in self.param_groups: 22 | 23 | for p in group['params']: 24 | if p.grad is None: 25 | continue 26 | grad = p.grad.data.float() 27 | if grad.is_sparse: 28 | raise RuntimeError('RAdam does not support sparse gradients') 29 | 30 | p_data_fp32 = p.data.float() 31 | 32 | state = self.state[p] 33 | 34 | if len(state) == 0: 35 | state['step'] = 0 36 | state['exp_avg'] = torch.zeros_like(p_data_fp32) 37 | state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) 38 | else: 39 | state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) 40 | state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) 41 | 42 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 43 | beta1, beta2 = group['betas'] 44 | 45 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 46 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 47 | 48 | state['step'] += 1 49 | buffered = self.buffer[int(state['step'] % 10)] 50 | if state['step'] == buffered[0]: 51 | N_sma, step_size = buffered[1], buffered[2] 52 | else: 53 | buffered[0] = state['step'] 54 | beta2_t = beta2 ** state['step'] 55 | N_sma_max = 2 / (1 - beta2) - 1 56 | N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) 57 | buffered[1] = N_sma 58 | 59 | # more conservative since it's an approximated value 60 | if N_sma >= 5: 61 | step_size = math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step']) 62 | else: 63 | step_size = 1.0 / (1 - beta1 ** state['step']) 64 | buffered[2] = step_size 65 | 66 | if group['weight_decay'] != 0: 67 | p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) 68 | 69 | # more conservative since it's an approximated value 70 | if N_sma >= 5: 71 | denom = exp_avg_sq.sqrt().add_(group['eps']) 72 | p_data_fp32.addcdiv_(-step_size * group['lr'], exp_avg, denom) 73 | else: 74 | p_data_fp32.add_(-step_size * group['lr'], exp_avg) 75 | 76 | p.data.copy_(p_data_fp32) 77 | 78 | return loss 79 | 80 | class PlainRAdam(Optimizer): 81 | 82 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0): 83 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) 84 | 85 | super(PlainRAdam, self).__init__(params, defaults) 86 | 87 | def __setstate__(self, state): 88 | super(PlainRAdam, self).__setstate__(state) 89 | 90 | def step(self, closure=None): 91 | 92 | loss = None 93 | if closure is not None: 94 | loss = closure() 95 | 96 | for group in self.param_groups: 97 | 98 | for p in group['params']: 99 | if p.grad is None: 100 | continue 101 | grad = p.grad.data.float() 102 | if grad.is_sparse: 103 | raise RuntimeError('RAdam does not support sparse gradients') 104 | 105 | p_data_fp32 = p.data.float() 106 | 107 | state = self.state[p] 108 | 109 | if len(state) == 0: 110 | state['step'] = 0 111 | state['exp_avg'] = torch.zeros_like(p_data_fp32) 112 | state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) 113 | else: 114 | state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) 115 | 
state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) 116 | 117 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 118 | beta1, beta2 = group['betas'] 119 | 120 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 121 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 122 | 123 | state['step'] += 1 124 | beta2_t = beta2 ** state['step'] 125 | N_sma_max = 2 / (1 - beta2) - 1 126 | N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) 127 | 128 | if group['weight_decay'] != 0: 129 | p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) 130 | 131 | # more conservative since it's an approximated value 132 | if N_sma >= 5: 133 | step_size = group['lr'] * math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step']) 134 | denom = exp_avg_sq.sqrt().add_(group['eps']) 135 | p_data_fp32.addcdiv_(-step_size, exp_avg, denom) 136 | else: 137 | step_size = group['lr'] / (1 - beta1 ** state['step']) 138 | p_data_fp32.add_(-step_size, exp_avg) 139 | 140 | p.data.copy_(p_data_fp32) 141 | 142 | return loss 143 | 144 | 145 | class AdamW(Optimizer): 146 | 147 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, warmup = 0): 148 | defaults = dict(lr=lr, betas=betas, eps=eps, 149 | weight_decay=weight_decay, warmup = warmup) 150 | super(AdamW, self).__init__(params, defaults) 151 | 152 | def __setstate__(self, state): 153 | super(AdamW, self).__setstate__(state) 154 | 155 | def step(self, closure=None): 156 | loss = None 157 | if closure is not None: 158 | loss = closure() 159 | 160 | for group in self.param_groups: 161 | 162 | for p in group['params']: 163 | if p.grad is None: 164 | continue 165 | grad = p.grad.data.float() 166 | if grad.is_sparse: 167 | raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') 168 | 169 | p_data_fp32 = p.data.float() 170 | 171 | state = self.state[p] 172 | 173 | if len(state) == 0: 174 | state['step'] = 0 175 | state['exp_avg'] = torch.zeros_like(p_data_fp32) 176 | state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) 177 | else: 178 | state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) 179 | state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) 180 | 181 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 182 | beta1, beta2 = group['betas'] 183 | 184 | state['step'] += 1 185 | 186 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 187 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 188 | 189 | denom = exp_avg_sq.sqrt().add_(group['eps']) 190 | bias_correction1 = 1 - beta1 ** state['step'] 191 | bias_correction2 = 1 - beta2 ** state['step'] 192 | 193 | if group['warmup'] > state['step']: 194 | scheduled_lr = 1e-8 + state['step'] * group['lr'] / group['warmup'] 195 | else: 196 | scheduled_lr = group['lr'] 197 | 198 | step_size = scheduled_lr * math.sqrt(bias_correction2) / bias_correction1 199 | 200 | if group['weight_decay'] != 0: 201 | p_data_fp32.add_(-group['weight_decay'] * scheduled_lr, p_data_fp32) 202 | 203 | p_data_fp32.addcdiv_(-step_size, exp_avg, denom) 204 | 205 | p.data.copy_(p_data_fp32) 206 | 207 | return loss 208 | -------------------------------------------------------------------------------- /misc/datasets/queryd/test_list.txt: -------------------------------------------------------------------------------- 1 | video-uc0O6cqYbyk 2 | video-vzJqeyxye_E 3 | video-1wn-5-HaKaI 4 | video-BhmvJuTc4aY 5 | video-DqilSuFK3B8 6 | video-BxyXk1sr2io 7 | 
video-rMXsAtYtLTo 8 | video-HX8KX5u0gkg 9 | video-cWX9iR_PzIQ 10 | video-aooZ26YKH-8 11 | video-eOrNdBpGMv8 12 | video-alLd1Bobkf8 13 | video-G_TwXO0yFeM 14 | video-fbri4or6Uhk 15 | video-uWuOBonC1ds 16 | video-lm0OZ8cMW6M 17 | video-Ro7tuDVP6Ks 18 | video-dNJdJIwCF_Y 19 | video-avl2RPsmlnI 20 | video-9AMPsDXGAxY 21 | video-MXVz4izTWZY 22 | video-W2asVll3OSU 23 | video-x152oloLhVM 24 | video-cNi_HC839Wo 25 | video-8gy5tYVR-28 26 | video-Ceu5cSJiAic 27 | video-6fan6ggvh4U 28 | video-DYldBHQw3s4 29 | video-Kqr2ibw5FKA 30 | video-KJiCalhfN6k 31 | video-Ahvl7V82ycE 32 | video-bSfDQr-FFJU 33 | video-1WsDtn-feuI 34 | video-pQD6gJMnWVs 35 | video-5xJ6h6OTveU 36 | video-Iqtv2oqNNIE 37 | video-z41fUWaNwO0 38 | video-wAnrYQGSHMw 39 | video-oyv6hjQr8MA 40 | video-mvD5wvlIE7c 41 | video-H5MKL1duTrg 42 | video-vN1fuDgj-Qk 43 | video-c38r-SAnTWM 44 | video-q6rAllJAdWk 45 | video-F-a5tZ_5g9E 46 | video-USGLDwaLnaI 47 | video-UE2Pe5DaKiE 48 | video-c6KD-kOqwpE 49 | video-NLlGopyXT_g 50 | video-_NWYeVyZz9I 51 | video--IRNEMiRg0Y 52 | video-lTxn2BuqyzU 53 | video-AZvdGSr7roA 54 | video-TvYCwP1U0xs 55 | video-MovSMelAxWg 56 | video-ER8yEOY0NCc 57 | video-8GGfE7zsD-0 58 | video-IWntTYTdXG8 59 | video-Q_P8WCbhC6s 60 | video-akYVGzeS7_A 61 | video-HZ21eT9lyog 62 | video-kPm0_jB5EQI 63 | video-3EBfr6KdnQc 64 | video-yq3Z9msqnOg 65 | video-T_FdjXqSZlc 66 | video-H7muODd2pCo 67 | video-gJWo7Z5m6e8 68 | video-V49ENdZlOx4 69 | video-E0pemP7JGV4 70 | video-AMjMFbhyhwY 71 | video-cEfD-Mr7m2c 72 | video-j8waYyUSSxg 73 | video-SBTL1vI4-mc 74 | video-4w7sVSMbjyM 75 | video-QPe3QSw49_Y 76 | video-cx-T137FKM4 77 | video-twKxRnoGxoM 78 | video-G9hIzIG1sPY 79 | video-oSO9q-2JjUs 80 | video-cOvModiiUjs 81 | video-Q-yA3q0mGCM 82 | video-yUkgydZh0Bk 83 | video-IDIQKPApIxM 84 | video-7HDtvRHyq2M 85 | video-g6KUxEfUVm0 86 | video-oCLTRjF2eq0 87 | video-9-k5J4RxQdE 88 | video-7-i6uxo4HS0 89 | video-GF60Iuh643I 90 | video-h56Cr2ho1Y8 91 | video-UdNZzQD0qY0 92 | video-bh-dRJaZdgM 93 | video-HTj2n52jz94 94 | video-gsYL4PC0hyk 95 | video-agqgBkpbCoY 96 | video-CrqIVVd3hp0 97 | video-XHGWTDHchVQ 98 | video-Ei6TjvfGMpE 99 | video-_MC3XuMvsDI 100 | video-ikUora2uPnU 101 | video-bGph2eX8RMI 102 | video-cd_AOrSEeRc 103 | video-3heXk6Oj6hU 104 | video-_tw7PR89IOI 105 | video-KUGf-irpTMQ 106 | video-z1fbwPHv-wA 107 | video-hhaNVna7eQs 108 | video-KwOdedfBqHE 109 | video-1g4AUYiz0LU 110 | video-_lmKuKrsKRA 111 | video-07d2dXHYb94 112 | video-pQu3dufotPM 113 | video-VpPJP7o7NnA 114 | video-hpsh8dYl7PE 115 | video-XrgVtuDRBjM 116 | video-0nB9BcZTBag 117 | video-nB-444rPm_8 118 | video-FwqsdGLhdgA 119 | video-5ZYcY-KX2JA 120 | video-yEvhDTWSRec 121 | video-3aZadizk_Io 122 | video-3CJl_S7uzqU 123 | video-u3VVUu-lZsM 124 | video-x4sadYeLHKU 125 | video-jQHVrDFNmJE 126 | video-AZGaCqDAlsU 127 | video-VqrBsMFRaLA 128 | video-CtBBL7Pb9Q0 129 | video-3EFiduilmn8 130 | video-EyVuypKJOq0 131 | video-p5u-vBV8NUU 132 | video-NvgYhf2LnVI 133 | video-9Ky2nyzOnMw 134 | video-SkB4gG8ke7Q 135 | video-GHDz-XDD8OU 136 | video-xyor66WBWPk 137 | video-UDcrGE3le20 138 | video-vYLaKMpqnOc 139 | video-Wu-gd9tLpmU 140 | video-BeGOgA18NIQ 141 | video-JNvpcGV1frQ 142 | video-jHe5vPlKgJA 143 | video-ZSd_IpzmcLM 144 | video-B8ISzf2pryI 145 | video-GZ0Bey4YUGI 146 | video-cEx5bSYJxtg 147 | video-IAq8pEFNeJs 148 | video-uhUUXMWoC_4 149 | video-DS8yeXFeEPA 150 | video-v3iPrBrGSJM 151 | video-uc1Hn4INDjk 152 | video-YPFGT4ecnIU 153 | video-D0a0aNqTehM 154 | video-tEZzagPGls4 155 | video-4QdmRufojsU 156 | video-4SkWU60v9Cg 157 | video-pjJEXkbeL-o 158 | video-W0_tLK37W24 159 | 
video-LJ9KtxNZdWE 160 | video-N_AcmtmegKI 161 | video-I6lZBoR5gvs 162 | video-8jucxdaifbs 163 | video-zW39WTnHCc8 164 | video-7RhQIZmkgDQ 165 | video-QNJx7Vi4Sg4 166 | video-Vlb2udqPx-M 167 | video-tUN-8TvevGU 168 | video-3SIfsFz_kMQ 169 | video-7KWKxe5HvLw 170 | video-VZaqHyHFCzc 171 | video-dZiJkicepzM 172 | video-94NanQuVkA4 173 | video-fhZo17Pxq1A 174 | video-HvPbH30KWLE 175 | video-J5R18MzrtKU 176 | video-sdUUx5FdySs 177 | video-88VViI5gNA4 178 | video-LsjNWQQOmNg 179 | video-LF71pZXhYrw 180 | video-gRD53bcAM8E 181 | video-CKwUNBEFI0E 182 | video-LnJwH_PZXnM 183 | video-Bzua8Zvlppo 184 | video-o2VFgHGKzx4 185 | video-G9YuKs3Jitk 186 | video-VGNFvm-YCEk 187 | video-huT5__BqY_U 188 | video-hywRdDVR76A 189 | video-WUG-x1TFewA 190 | video-osVxO-RA-pE 191 | video-t1PGWO2Lvmw 192 | video-1awM6kmpd2g 193 | video-KK-Mff60ZIE 194 | video-StAF3NSro-w 195 | video-makIgB4X3q8 196 | video-B_bdAJXsjvk 197 | video-jnaPpgK33Lo 198 | video-6EiRjwjp30I 199 | video-p_Rrovk5nsk 200 | video-EthCVn45VyU 201 | video-DeKXFHPr_oE 202 | video-kJzNZ10I1MY 203 | video-uceySVBjKNw 204 | video-CnOJgDW0gPI 205 | video-XxhKUP9Ixco 206 | video-wt62ayeVd44 207 | video-DSBhSywLRNA 208 | video-ZmdOe1hjW-s 209 | video-Qb2xoiVM7UA 210 | video-qniwI2hNhDs 211 | video-nPo2B-vjZ28 212 | video-4b6ttHSgIFM 213 | video-URTR3AtKTM8 214 | video-4XNjwKBqvxE 215 | video-V3L1qrisKFE 216 | video-0Cqt__04bAk 217 | video-CFq441el_ls 218 | video-R9Puz5RFl5o 219 | video-b1XGPvbWn0A 220 | video-zSlhbBBBi3A 221 | video-JNyn_w3hdZ0 222 | video-JnR2dpLnS14 223 | video-ENaJcHwQEVs 224 | video-QjqS7jzjX34 225 | video-4X1DieuShKI 226 | video-qI34nBlJxP8 227 | video-vjW4UOC7U3w 228 | video-aB_s9lw9E-M 229 | video-NXXkBSuIAl8 230 | video-y07at1bU89Q 231 | video-763brdRmWuc 232 | video-R4WDWpR4oRM 233 | video-Y_dXFLaEVJk 234 | video-d6g1c18Cy-8 235 | video-FRgDsBFC2IA 236 | video-1NunXMcaslA 237 | video-9Byx6TxOPx4 238 | video-xqZQ9KM_LjY 239 | video-tloBMf_KmX4 240 | video-JZIerGNMtnk 241 | video-4oETtq9w9Zg 242 | video-HLRxoMiagO8 243 | video-cdg193GvnBA 244 | video-O_HyZ5aW76c 245 | video-gLax3zOBN40 246 | video-ZVXz6ymCSIo 247 | video-wWVppdfYOx8 248 | video-jKXrOTdbtVQ 249 | video-KauXf6nihPY 250 | video-XeSW_3JEeTs 251 | video-kbzEFa7fiOE 252 | video-Z-E3cRZCne4 253 | video-peXSoTlkwVY 254 | video-G6fMV1UPzkg 255 | video-sBdqOWSZ56w 256 | video-OA1ZRGFKRVI 257 | video-4ClKFnnzSRA 258 | video-ygzR-ltUWug 259 | video-kltuUtE6jQo 260 | video-aI51UWF8_9Y 261 | video-cda7mSowTEI 262 | video-5NVYg2HNAdA 263 | video-Ezg4sr67OGA 264 | video-6ZjRKYPfO8g 265 | video-XPKf24_pXfQ 266 | video-ansWZq7yULE 267 | video-J0HiXwK5s2k 268 | video-QEpCsMbMx7w 269 | video-P0ISZpljc3E 270 | video-PFjp1MW6Lzc 271 | video-dPgs0GHgiYc 272 | video-mgNgscHJh6I 273 | video-oowcsynjIwc 274 | video-Q04KG7gVQtw 275 | video-FK3dav4bA4s 276 | video-iS9QQ8YOofQ 277 | video-D-hPct3oIow 278 | video-wxm8jTzU_8o 279 | video-Zh2-rVsXWUU 280 | video-Wji-BZ0oCwg 281 | video-hgLQQe5uUCE 282 | video-3h6KMumLAvI 283 | video-gHCxdlZ7G18 284 | video-PeBAzI9LuHM 285 | video-qI3AWoK7ABU 286 | video-nqkyzpaoMug 287 | video-IFgh9WU0lPs 288 | video-ppyYdn2nPoU 289 | video-iWYCoBiTnA0 290 | video-TubxNbCQ4Fk 291 | video-lwS74rI92YQ 292 | video-GjHkkTGf7fc 293 | video-TDquUlVDdbU 294 | video-5xKnmuDnJMs 295 | video-uW9KEiQFUE8 296 | video-RzR_O2DoSVs 297 | video-1ePcSm1ninM 298 | video-MxNfvh7vaSs 299 | video-6jLfuoOBX2I 300 | video-Qw6RD5S3e8o 301 | video-2kUMAA9yZgk 302 | video-QcAcBHosPzQ 303 | video-hCsVT9TKahk 304 | video-DAHbtsjuNws 305 | video-U6fPh2mm3pw 306 | video-PGKmexNTHNE 307 | 
video-NVItPJAu_Fk 308 | video-woWiyBgp5cs 309 | video-qX9FSZJu448 310 | video-YKejnIOvACY 311 | video-5nmhHL3sVIk 312 | video-3yHsRjoRec8 313 | video-6Ts-deSDnRM 314 | video-xZfZ-HB2yJI 315 | video-wj8XXvD4kGE 316 | video-9nVvIz8nYxo 317 | video-TqPCGGHoxsE 318 | video-QAEkuVgt6Aw 319 | video-3Bs4LOtIuxg 320 | video-Gv1aDEFlXq8 321 | video-gZp6CGgsS4A 322 | video-YOqmroV2cRo 323 | video-cHUNbTfzOr4 324 | video-MDdQBWyFmtc 325 | video-JdYSnsEM0gg 326 | video-_U-J9PqgmIc 327 | video-d6PMG7kXpF4 328 | video-xEaCpSzUq3Q 329 | video-28FyDT4cKrg 330 | video-JvQcabZ1zrk 331 | video-Bv2vT665bGI 332 | video-DMCSP73Rq4I 333 | video-o2AsIXSh2xo 334 | video-NoPMX5lqT6A 335 | video-7ToAmWnTsAI 336 | video-tbBzXKN32Sk 337 | video-5_uSZcXMV7s 338 | video-rbNB0jqMv7s 339 | video-AW0jm6i9U3M 340 | video--wHytb5Fe2k 341 | video-JtzsCx0P3tI 342 | video-G64wuf-rHoo 343 | video-FRpIk7yd2RA 344 | video-XvhlK0WGBr4 345 | video-tUyeaT2ZX1I 346 | video-bV3Ib6Ato6c 347 | video-bYw1gRtyGiw 348 | video-LrI4FmRIHpI 349 | video-8pfPl8BkfVY 350 | video-3veKbPi4r90 351 | video-_-O6Ppkrf98 352 | video-5hpBAn5lQPs 353 | video-I6PXKSiJchU 354 | video-sNcJMejrcnM 355 | video-OITWgx8K6Ko 356 | video-wXL5zXz550I 357 | video-bap6XjDDE3k 358 | video-g26mbST0YhU 359 | video-tkQuXvgvNPk 360 | video-dJJ0yadpqKI 361 | video-5yGNbyAmkVY 362 | video-0-NBRA1aSXk 363 | video-xVihCNfZaDg 364 | video-s1FWVQFeOpQ 365 | video-gFuEo2ccTPA 366 | video-L8hM2kbw2Ik 367 | video-yKGeJXk2qWQ 368 | video-tREqJ1_7h0w 369 | video-0m1IfJUNzmc 370 | video-LJosiEHwWxc 371 | video-W4Pr7PZ3Bgc 372 | video-M0nDEbrp9nM 373 | video-D6lmibFiur8 374 | video-oFV9ayoss_o 375 | video-7SldSIviMkg 376 | video-SQ6H-Mz6hgw 377 | video-GrzDQGVprjE 378 | video-ixh8KqEr6LE 379 | video-zywWM3J3i8M 380 | video-WjqiU5FgsYc 381 | video-mpDOscUDQ_0 382 | video-qMzt3yQFT-Q 383 | video-3NDfWjywzsI 384 | video-F2bk_9T482g 385 | video-aLjHyP683QU 386 | video-6Jgwc3sXLCc 387 | -------------------------------------------------------------------------------- /dataset_stats/get_videoid_perclass.py: -------------------------------------------------------------------------------- 1 | # AudioCaps dataset statistics 2 | # March 2021, ask 3 | 4 | 5 | import os 6 | import csv 7 | import json 8 | import numpy as np 9 | from tqdm import tqdm 10 | 11 | # ----------------------------------------------------------------------------- 12 | # Load AudioCaps test data 13 | # ----------------------------------------------------------------------------- 14 | 15 | audiocaps_base = '/home/askoepke97/coding/ce/collab-experts/data/AudioCaps/audiocaps/dataset' 16 | audiocaps_test_file = os.path.join(audiocaps_base, 'test.csv') 17 | 18 | audiocapid = [] 19 | youtubeid = [] 20 | yid_dict = dict() 21 | 22 | with open(audiocaps_test_file, 'r') as csvfile: 23 | reader = csv.reader(csvfile) 24 | i = 0 25 | for row in reader: 26 | if i > 0: 27 | if not int(row[2]) == 0: 28 | filename = row[1] + '_%d000'%int(row[2]) 29 | else: 30 | filename = row[1] + '_%d'%int(row[2]) 31 | ytname = row[1] 32 | yid_dict[filename] = ytname 33 | i += 1 34 | 35 | # ----------------------------------------------------------------------------- 36 | # Load audioset ontology and train (and eval) data 37 | # ----------------------------------------------------------------------------- 38 | 39 | audiosetbase = '/home/askoepke97/coding/ce/collab-experts/data/dataset_statistics' 40 | ontology = os.path.join(audiosetbase, 'ontology.json') 41 | 42 | evalcsv = os.path.join(audiosetbase, 'eval_segments.csv') 43 | traincsv = 
os.path.join(audiosetbase, 'unbalanced_train_segments.csv') 44 | 45 | with open(ontology) as json_file: 46 | ontology_data = json.load(json_file) 47 | 48 | classids = dict() 49 | 50 | for ind in np.arange(len(ontology_data)): 51 | classids[ontology_data[ind]['id']] = ontology_data[ind]['name'] 52 | 53 | evaldict = dict() 54 | 55 | with open(evalcsv, 'r') as as_csvfile: 56 | reader = csv.reader(as_csvfile) 57 | i = 0 58 | for row in reader: 59 | if i > 2: 60 | ytname = row[0] 61 | starttime = row[1] 62 | classes = row[3].split(',') 63 | newclasses = [] 64 | for classe in classes: 65 | if classe.strip()[0] == '"' and not classe.strip()[-1] == '"': 66 | newclasses.append(classids[row[3].strip()[1:]]) 67 | elif classe.strip()[-1] == '"' and not classe.strip()[0] == '"': 68 | newclasses.append(classids[row[3].strip()[:-1]]) 69 | elif classe.strip()[-1] == '"' and classe.strip()[0] == '"': 70 | newclasses.append(classids[row[3].strip()[1:-1]]) 71 | else: 72 | newclasses.append(classids[row[3].strip()]) 73 | evaldict[ytname] = newclasses 74 | i += 1 75 | 76 | traindict = dict() 77 | 78 | with open(traincsv, 'r') as as_csvfile: 79 | reader = csv.reader(as_csvfile) 80 | i = 0 81 | for row in reader: 82 | if i > 2: 83 | ytname = row[0] 84 | starttime = row[1] 85 | classes = row[3].split(',') 86 | newclasses = [] 87 | for classe in classes: 88 | if classe.strip()[0] == '"' and not classe.strip()[-1] == '"': 89 | newclasses.append(classids[row[3].strip()[1:]]) 90 | elif classe.strip()[-1] == '"' and not classe.strip()[0] == '"': 91 | newclasses.append(classids[row[3].strip()[:-1]]) 92 | elif classe.strip()[-1] == '"' and classe.strip()[0] == '"': 93 | newclasses.append(classids[row[3].strip()[1:-1]]) 94 | else: 95 | newclasses.append(classids[row[3].strip()]) 96 | traindict[ytname] = newclasses 97 | i += 1 98 | print(i, 'len train') 99 | 100 | # ----------------------------------------------------------------------------- 101 | # Load VGGSound training urls 102 | # ----------------------------------------------------------------------------- 103 | 104 | vggsoundpath = '/home/askoepke97/coding/gitrepos/sound_features/VGGSound/data/train.csv' 105 | vggvids = [] 106 | with open(vggsoundpath, 'r') as csv_file: 107 | reader = csv.reader(csv_file) 108 | for row in tqdm(reader): 109 | vggvids.append(row[0].split('_')[0]) 110 | 111 | # ----------------------------------------------------------------------------- 112 | # Find overlap between VGGSound training set and AudioCaps test set 113 | # ----------------------------------------------------------------------------- 114 | 115 | overlap_counter = 0 116 | vggcounter = 0 117 | uniqueclasses = [] #111 unique classes in unfiltered (before removing overlap with VGGSound) AudioCaps test set, 97 in the val set, 238 in train 118 | newclassdict = dict() 119 | overlap_test_videos = [] 120 | for key, value in tqdm(yid_dict.items()): 121 | if value in vggvids: 122 | vggcounter += 1 123 | overlap_test_videos.append(value) 124 | # # Check for overlap between AudioCaps test set and AudioSet training data 125 | # if value in evaldict.keys(): 126 | # if not evaldict[value] in newclassdict.values(): 127 | # uniqueclasses.append(evaldict[value]) 128 | # newclassdict[key] = evaldict[value] 129 | # elif value in traindict.keys(): 130 | # overlap_counter += 1 131 | # if not traindict[value] in newclassdict.values(): 132 | # uniqueclasses.append(traindict[value]) 133 | # newclassdict[key] = traindict[value] 134 | 135 | 136 | #
----------------------------------------------------------------------------- 137 | # Filter the test.csv dictionary yid_dict from AudioCaps for overlap with VGGSound 138 | # ----------------------------------------------------------------------------- 139 | 140 | new_yid_dict = dict() 141 | for key, value in yid_dict.items(): 142 | if not key.split('_')[0] in overlap_test_videos: 143 | new_yid_dict[key] = value 144 | 145 | # ----------------------------------------------------------------------------- 146 | # Make dictionaries that contain classes as keys and video names in AudioCaps 147 | # test as values 148 | # ----------------------------------------------------------------------------- 149 | 150 | class_video_dict = dict() 151 | for key, value in tqdm(new_yid_dict.items()): 152 | if value in traindict.keys(): 153 | for vid_class in traindict[value]: #traindict[value] could contain a list with multiple classes 154 | if not vid_class in class_video_dict.keys(): 155 | class_video_dict[vid_class] = [key] 156 | elif vid_class in class_video_dict.keys(): 157 | class_video_dict[vid_class].append(key) 158 | else: 159 | import pdb; pdb.set_trace() 160 | 161 | new_class_video_dict = class_video_dict.copy() 162 | for key, value in class_video_dict.items(): 163 | if len(value) < 10: 164 | new_class_video_dict.pop(key) 165 | 166 | print(len(new_class_video_dict.keys())) 167 | 168 | # print count of each class in dictionary, videos belong to single class only 169 | count_no_videos = 0 170 | for key, value in tqdm(new_class_video_dict.items()): 171 | print(key, len(value)) 172 | count_no_videos += len(value) 173 | print(len(new_yid_dict.keys()), count_no_videos, 'number of videos in test set and number of videos in dictionaries') 174 | 175 | # save class dictionary that only contains classes with more than 10 example videos in the test set (34 classes) 176 | 177 | with open('test_class_videoid_dict_morethan10.json', 'w') as fp: 178 | json.dump(new_class_video_dict, fp) 179 | 180 | # save class dictionary with all classes in the test set (106 classes) 181 | 182 | with open('test_class_videoid_dict_all.json', 'w') as fp: 183 | json.dump(class_video_dict, fp) 184 | 185 | ## ----------------------------------------------------------------------------- 186 | ## Filter the AudioCaps test_list.txt to remove overlap with the VGGSound training data 187 | ## ----------------------------------------------------------------------------- 188 | # 189 | #audiocaps_testfile = '/home/askoepke97/akata-shared/askoepke97/data/AR/AudioCaps/structured-symlinks/test_list.txt' 190 | #file1 = open(audiocaps_testfile, 'r') 191 | #oldtestfiles = file1.readlines() 192 | #file1.close() 193 | #newtestfiles = [] 194 | #for oldtestfile in oldtestfiles: 195 | # if not oldtestfile.split('_')[0] in overlap_test_videos: 196 | # newtestfiles.append(oldtestfile) 197 | #file1 = open('/home/askoepke97/akata-shared/askoepke97/data/AR/AudioCaps/structured-symlinks/filtered_test_list.txt', 'w') 198 | #file1.writelines(newtestfiles) 199 | #file1.close() 200 | # 201 | ## ----------------------------------------------------------------------------- 202 | 203 | #print('There are %d videos in the AudioCaps test set that are contained in the AudioSet training set.'%overlap_counter) #975 and there are only 975 videos in the AudioCaps test set, 495 in the val set, 49838 in train 204 | -------------------------------------------------------------------------------- /configs/data_loader_clotho.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "inherit_from": "configs/base_config.json", 3 | "eval_mode": "test_run", 4 | "experts": { 5 | "text_feat": "w2v", 6 | "modalities": [ 7 | "imagenet.resnext101_32x48d.0", 8 | "r2p1d.r2p1d-ig65m.0", 9 | "scene.densenet161.0", 10 | "audio" 11 | ] 12 | }, 13 | "arch": { 14 | "type": "CENet", 15 | "args": { 16 | "test_caption_mode": "indep", 17 | "use_ce": "pairwise", 18 | "use_mish": 1, 19 | "use_bn_reason": 1, 20 | "num_g_layers": 3, 21 | "num_h_layers": 0, 22 | "include_self": 1, 23 | "l2renorm": false, 24 | "randomise_feats": "", 25 | "vlad_clusters": { 26 | "text": 20, 27 | "audio": 16, 28 | "pann": 16, 29 | "speech": 5, 30 | "syncnet": 16, 31 | "vggsound": 16 32 | }, 33 | "ghost_clusters": { 34 | "text": 1, 35 | "speech": 2 36 | }, 37 | "mimic_ce_dims": 0 38 | } 39 | }, 40 | "optimizer": { 41 | "type": "Ranger", 42 | "args": { 43 | "lr": 0.01, 44 | "weight_decay": 1E-03 45 | } 46 | }, 47 | "loss": { 48 | "type": "MaxMarginRankingLoss", 49 | "args": { 50 | "margin": 0.2, 51 | "fix_norm": true 52 | } 53 | }, 54 | "data_loader": { 55 | "type": "ExpertDataLoader", 56 | "args":{ 57 | "dataset_name": "CLOTHO", 58 | "data_dir": "data/CLOTHO", 59 | "root_feat_folder": "structured-symlinks", 60 | "trn_cat": 0, 61 | "batch_size": 128, 62 | "split_name": "val", 63 | "fuse_captions": false, 64 | "num_test_captions": 1, 65 | "max_tokens": { 66 | "text": 21, 67 | "audio": 31, 68 | "pann": 29, 69 | "speech": 35, 70 | "syncnet": 29, 71 | "vggsound": 95 72 | }, 73 | "feat_aggregation": { 74 | "imagenet.senet154.0": { 75 | "fps": 25, 76 | "stride": 1, 77 | "pixel_dim": 256, 78 | "aggregate-axis": 1, 79 | "offset": 0, 80 | "temporal": "avg", 81 | "aggregate": "concat", 82 | "type": "embed", 83 | "feat_dims": { 84 | "embed": 2048, 85 | "logits": 1000 86 | } 87 | }, 88 | "imagenet.resnext101_32x48d.0": { 89 | "fps": 25, 90 | "stride": 1, 91 | "offset": 0, 92 | "pixel_dim": 256, 93 | "temporal": "avg", 94 | "aggregate": "concat", 95 | "aggregate-axis": 1, 96 | "type": "embed", 97 | "feat_dims": { 98 | "embed": 2048, 99 | "logits": 1000 100 | } 101 | }, 102 | "scene.densenet161.0": { 103 | "stride": 1, 104 | "fps": 25, 105 | "offset": 0, 106 | "temporal": "avg", 107 | "pixel_dim": 256, 108 | "aggregate": "concat", 109 | "aggregate-axis": 1, 110 | "type": "embed", 111 | "feat_dims": { 112 | "embed": 2208, 113 | "logits": 1000 114 | } 115 | }, 116 | "i3d.i3d.0": { 117 | "fps": 25, 118 | "offset": 0, 119 | "stride": 25, 120 | "inner_stride": 1, 121 | "pixel_dim": 256, 122 | "temporal": "avg", 123 | "aggregate": "concat", 124 | "aggregate-axis": 1, 125 | "type": "embed", 126 | "feat_dims": { 127 | "embed": 1024, 128 | "logits": 400 129 | } 130 | }, 131 | "r2p1d.r2p1d-ig65m.0": { 132 | "fps": 30, 133 | "offset": 0, 134 | "stride": 32, 135 | "inner_stride": 1, 136 | "pixel_dim": 256, 137 | "temporal": "avg", 138 | "aggregate": "concat", 139 | "aggregate-axis": 1, 140 | "type": "embed", 141 | "feat_dims": { 142 | "embed": 512, 143 | "logits": 359 144 | } 145 | }, 146 | "r2p1d.r2p1d-ig65m-kinetics.0": { 147 | "fps": 30, 148 | "offset": 0, 149 | "stride": 32, 150 | "inner_stride": 1, 151 | "pixel_dim": 256, 152 | "temporal": "avg", 153 | "aggregate": "concat", 154 | "aggregate-axis": 1, 155 | "type": "embed", 156 | "feat_dims": { 157 | "embed": 512, 158 | "logits": 400 159 | } 160 | }, 161 | "pann.pann.0": { 162 | "model": "pann", 163 | "flaky": false, 164 | "temporal": "vlad", 165 | "type": "embed", 166 | "binarise": false 167 | 
}, 168 | "pann": { 169 | "model": "pann", 170 | "flaky": false, 171 | "temporal": "vlad", 172 | "type": "embed", 173 | "binarise": false 174 | }, 175 | "syncnet": { 176 | "model": "syncnet", 177 | "flaky": false, 178 | "temporal": "vlad", 179 | "type": "embed", 180 | "binarise": false 181 | }, 182 | "audio.syncnet.0": { 183 | "model": "syncnet", 184 | "flaky": false, 185 | "temporal": "vlad", 186 | "type": "embed", 187 | "binarise": false 188 | }, 189 | "vggsound": { 190 | "model": "vggsound", 191 | "flaky": false, 192 | "temporal": "vlad", 193 | "type": "embed", 194 | "binarise": false 195 | }, 196 | "audio.vggsound.0": { 197 | "model": "vggsound", 198 | "flaky": false, 199 | "temporal": "vlad", 200 | "type": "embed", 201 | "binarise": false 202 | }, 203 | "speech": { 204 | "model": "w2v", 205 | "flaky": true, 206 | "temporal": "vlad", 207 | "type": "embed", 208 | "binarise": false, 209 | "feat_dims": { 210 | "embed": 300 211 | } 212 | }, 213 | "audio": { 214 | "model": "vggish", 215 | "flaky": false, 216 | "temporal": "vlad", 217 | "type": "embed", 218 | "binarise": false 219 | }, 220 | "audio.vggish.0": { 221 | "model": "vggish", 222 | "flaky": false, 223 | "temporal": "vlad", 224 | "type": "embed", 225 | "binarise": false 226 | } 227 | } 228 | } 229 | }, 230 | "trainer": { 231 | "epochs": 20 232 | }, 233 | "eval_settings": { 234 | "data_loader": { 235 | "args": { 236 | "split_name": "test", 237 | "num_test_captions": 5 238 | } 239 | }, 240 | "tester": { 241 | "save_dir": "data/saved/", 242 | "verbosity": 2 243 | }, 244 | "disable_gpu": true 245 | }, 246 | "visualizer": { 247 | "type": "Visualizer", 248 | "args":{ 249 | "src_video_dir": "data/CLOTHO/videos", 250 | "vis_vid_freq": 500, 251 | "num_samples": 100 252 | } 253 | } 254 | } 255 | -------------------------------------------------------------------------------- /configs/data_loader_audiocaps.json: -------------------------------------------------------------------------------- 1 | { 2 | "inherit_from": "configs/base_config.json", 3 | "eval_mode": "test_run", 4 | "experts": { 5 | "text_feat": "w2v", 6 | "modalities": [ 7 | "imagenet.resnext101_32x48d.0", 8 | "r2p1d.r2p1d-ig65m.0", 9 | "scene.densenet161.0", 10 | "audio" 11 | ] 12 | }, 13 | "arch": { 14 | "type": "CENet", 15 | "args": { 16 | "test_caption_mode": "indep", 17 | "use_ce": "pairwise", 18 | "use_mish": 1, 19 | "use_bn_reason": 1, 20 | "num_g_layers": 3, 21 | "num_h_layers": 0, 22 | "include_self": 1, 23 | "l2renorm": false, 24 | "randomise_feats": "", 25 | "vlad_clusters": { 26 | "text": 20, 27 | "audio": 16, 28 | "pann": 16, 29 | "syncnet": 16, 30 | "vggsound": 16, 31 | "speech": 5 32 | }, 33 | "ghost_clusters": { 34 | "text": 1 35 | }, 36 | "mimic_ce_dims": 0 37 | } 38 | }, 39 | "optimizer": { 40 | "type": "Ranger", 41 | "args": { 42 | "lr": 0.01, 43 | "weight_decay": 1E-03 44 | } 45 | }, 46 | "loss": { 47 | "type": "MaxMarginRankingLoss", 48 | "args": { 49 | "margin": 0.2, 50 | "fix_norm": true 51 | } 52 | }, 53 | "data_loader": { 54 | "type": "ExpertDataLoader", 55 | "args":{ 56 | "dataset_name": "AudioCaps", 57 | "data_dir": "data/AudioCaps", 58 | "root_feat_folder": "structured-symlinks", 59 | "trn_cat": 0, 60 | "batch_size": 128, 61 | "split_name": "val", 62 | "fuse_captions": false, 63 | "num_test_captions": 1, 64 | "max_tokens": { 65 | "text": 20, 66 | "audio": 29, 67 | "pann": 29, 68 | "syncnet": 29, 69 | "vggsound": 29, 70 | "speech": 35 71 | }, 72 | "feat_aggregation": { 73 | "imagenet.senet154.0": { 74 | "fps": 25, 75 | "stride": 1, 76 | "pixel_dim": 256, 
77 | "aggregate-axis": 1, 78 | "offset": 0, 79 | "temporal": "avg", 80 | "aggregate": "concat", 81 | "type": "embed", 82 | "feat_dims": { 83 | "embed": 2048, 84 | "logits": 1000 85 | } 86 | }, 87 | "imagenet.resnext101_32x48d.0": { 88 | "fps": 25, 89 | "stride": 1, 90 | "offset": 0, 91 | "pixel_dim": 256, 92 | "temporal": "avg", 93 | "aggregate": "concat", 94 | "aggregate-axis": 1, 95 | "type": "embed", 96 | "feat_dims": { 97 | "embed": 2048, 98 | "logits": 1000 99 | } 100 | }, 101 | "scene.densenet161.0": { 102 | "stride": 1, 103 | "fps": 25, 104 | "offset": 0, 105 | "temporal": "avg", 106 | "pixel_dim": 256, 107 | "aggregate": "concat", 108 | "aggregate-axis": 1, 109 | "type": "embed", 110 | "feat_dims": { 111 | "embed": 2208, 112 | "logits": 1000 113 | } 114 | }, 115 | "i3d.i3d.0": { 116 | "fps": 25, 117 | "offset": 0, 118 | "stride": 25, 119 | "inner_stride": 1, 120 | "pixel_dim": 256, 121 | "temporal": "avg", 122 | "aggregate": "concat", 123 | "aggregate-axis": 1, 124 | "type": "embed", 125 | "feat_dims": { 126 | "embed": 1024, 127 | "logits": 400 128 | } 129 | }, 130 | "r2p1d.r2p1d-ig65m.0": { 131 | "fps": 30, 132 | "offset": 0, 133 | "stride": 32, 134 | "inner_stride": 1, 135 | "pixel_dim": 256, 136 | "temporal": "avg", 137 | "aggregate": "concat", 138 | "aggregate-axis": 1, 139 | "type": "embed", 140 | "feat_dims": { 141 | "embed": 512, 142 | "logits": 359 143 | } 144 | }, 145 | "r2p1d.r2p1d-ig65m-kinetics.0": { 146 | "fps": 30, 147 | "offset": 0, 148 | "stride": 32, 149 | "inner_stride": 1, 150 | "pixel_dim": 256, 151 | "temporal": "avg", 152 | "aggregate": "concat", 153 | "aggregate-axis": 1, 154 | "type": "embed", 155 | "feat_dims": { 156 | "embed": 512, 157 | "logits": 400 158 | } 159 | }, 160 | "pann.pann.0": { 161 | "model": "pann", 162 | "flaky": false, 163 | "temporal": "vlad", 164 | "type": "embed", 165 | "binarise": false 166 | }, 167 | "pann": { 168 | "model": "pann", 169 | "flaky": false, 170 | "temporal": "vlad", 171 | "type": "embed", 172 | "binarise": false 173 | }, 174 | "syncnet": { 175 | "model": "syncnet", 176 | "flaky": false, 177 | "temporal": "vlad", 178 | "type": "embed", 179 | "binarise": false 180 | }, 181 | "audio.syncnet.0": { 182 | "model": "syncnet", 183 | "flaky": false, 184 | "temporal": "vlad", 185 | "type": "embed", 186 | "binarise": false 187 | }, 188 | "vggsound": { 189 | "model": "vggsound", 190 | "flaky": false, 191 | "temporal": "vlad", 192 | "type": "embed", 193 | "binarise": false 194 | }, 195 | "audio.vggsound.0": { 196 | "model": "vggsound", 197 | "flaky": false, 198 | "temporal": "vlad", 199 | "type": "embed", 200 | "binarise": false 201 | }, 202 | "speech": { 203 | "model": "w2v", 204 | "flaky": true, 205 | "temporal": "vlad", 206 | "type": "embed", 207 | "binarise": false, 208 | "feat_dims": { 209 | "embed": 300 210 | } 211 | }, 212 | "audio": { 213 | "model": "vggish", 214 | "flaky": true, 215 | "temporal": "vlad", 216 | "type": "embed", 217 | "binarise": false 218 | }, 219 | "audio.vggish.0": { 220 | "model": "vggish", 221 | "flaky": true, 222 | "temporal": "vlad", 223 | "type": "embed", 224 | "binarise": false 225 | } 226 | } 227 | } 228 | }, 229 | "metrics": [ 230 | "t2v_metrics", 231 | "v2t_metrics" 232 | ], 233 | "trainer": { 234 | "epochs": 20 235 | }, 236 | "eval_settings": { 237 | "data_loader": { 238 | "args": { 239 | "split_name": "test", 240 | "num_test_captions": 5 241 | } 242 | }, 243 | "tester": { 244 | "save_dir": "data/saved/", 245 | "verbosity": 2 246 | }, 247 | "disable_gpu": true 248 | }, 249 | "testing_file": 
"final_filtered_test_list.txt", 250 | "visualizer": { 251 | "type": "Visualizer", 252 | "args":{ 253 | "src_video_dir": "data/AudioCaps/videos", 254 | "vis_vid_freq": 500, 255 | "num_samples": 5 256 | } 257 | } 258 | } 259 | --------------------------------------------------------------------------------