├── misc
│   ├── __init__.py
│   ├── text_embedding_models.json
│   ├── datasets
│   │   ├── querydsegments
│   │   │   ├── tar_include.txt
│   │   │   └── README.md
│   │   ├── activity-net
│   │   │   ├── tar_include.txt
│   │   │   └── README.md
│   │   ├── clotho
│   │   │   └── tar_include.txt
│   │   ├── queryd
│   │   │   ├── tar_include.txt
│   │   │   ├── README.md
│   │   │   ├── val_list.txt
│   │   │   └── test_list.txt
│   │   └── audiocaps
│   │       └── tar_include.txt
│   ├── experiments-queryd.json
│   ├── yaspi_gpu_defaults.json
│   ├── exps-names.md
│   ├── find_latest_checkpoints.py
│   ├── experiments-audiocaps.json
│   ├── experiments_teachText.json
│   ├── generate_exps.py
│   ├── aggregate_logs_and_stats.py
│   ├── launch_exps_from_list.py
│   └── gen_tar_lists.py
├── utils
│   ├── __init__.py
│   ├── cos_restart.py
│   ├── gen_ablations_for_dataset.py
│   ├── datastructures.py
│   ├── visualizer.py
│   ├── html.py
│   ├── ranger.py
│   └── radam.py
├── trainer
│   └── __init__.py
├── model
│   ├── __init__.py
│   ├── mil_nce_net.py
│   ├── text_embedding_models.json
│   ├── loss.py
│   └── net_vlad.py
├── logger
│   ├── __init__.py
│   ├── logger_config.json
│   ├── logger.py
│   ├── visualization.py
│   └── log_parser.py
├── base
│   ├── __init__.py
│   └── base_model.py
├── launch_ablations_txt
│   ├── audio_experts.txt
│   ├── all_audio_experts.txt
│   └── single_audio_experts.txt
├── configs
│   ├── clotho
│   │   ├── train-vggish-vggsound.json
│   │   ├── train-vggish-vggsound-moee.json
│   │   └── train-full-ce-only-audio.json
│   ├── audiocaps
│   │   ├── train-vggish-vggsound-train_list_10.json
│   │   ├── train-vggish-vggsound-train_list_25.json
│   │   ├── train-vggish-vggsound-train_list_50.json
│   │   ├── train-vggish-vggsound-train_list_75.json
│   │   ├── train-vggish-vggsound.json
│   │   ├── train-full-ce-scene-r2p1d.json
│   │   ├── train-full-ce-r2p1d-inst.json
│   │   ├── train-full-ce-scene-inst.json
│   │   ├── train-full-ce-r2p1d-inst-vggish.json
│   │   ├── train-full-ce-r2p1d-inst-vggsound.json
│   │   ├── train-full-ce-scene-r2p1d-inst.json
│   │   ├── train-full-ce-r2p1d-inst-vggish-vggsound.json
│   │   ├── train-full-ce-scene-r2p1d-inst-vggsound.json
│   │   ├── train-vggish-vggsound-moee.json
│   │   ├── train-full-ce-only-audio.json
│   │   ├── train-only-vggsound.json
│   │   ├── train-full-ce-only-r2p1d.json
│   │   ├── train-full-ce-only-scene.json
│   │   ├── train-full-ce-scene-r2p1d-inst-vggish-vggsound.json
│   │   └── train-full-ce-only-inst.json
│   ├── queryd
│   │   └── train-full-ce-only-audio.json
│   ├── querydsegments
│   │   └── train-full-ce-only-audio.json
│   ├── activity-net
│   │   └── train-full-ce-audio-only.json
│   ├── data_loader_activity-net.json
│   ├── data_loader_queryd.json
│   ├── data_loader_querydsegments.json
│   ├── data_loader_clotho.json
│   └── data_loader_audiocaps.json
├── .gitignore
├── requirements
│   └── requirements.txt
├── data_loader
│   ├── QuerYDSegments_dataset.py
│   ├── QuerYD_dataset.py
│   ├── CLOTHO_dataset.py
│   ├── ActivityNet_dataset.py
│   ├── AudioCaps_dataset.py
│   └── data_loaders.py
├── exp_to_seed_time.json
├── eval.py
└── dataset_stats
    └── get_videoid_perclass.py
/misc/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .util import *
2 |
--------------------------------------------------------------------------------
/trainer/__init__.py:
--------------------------------------------------------------------------------
1 | from .trainer import *
2 |
--------------------------------------------------------------------------------
/model/__init__.py:
--------------------------------------------------------------------------------
1 | from .model import CENet
2 | from .mil_nce_net import MNNet
--------------------------------------------------------------------------------
/logger/__init__.py:
--------------------------------------------------------------------------------
1 | from .logger import *
2 | from .visualization import *
3 | from .log_parser import *
--------------------------------------------------------------------------------
/base/__init__.py:
--------------------------------------------------------------------------------
1 | # from .base_data_loader import *
2 | from .base_model import *
3 | from .base_trainer import *
4 |
--------------------------------------------------------------------------------
/launch_ablations_txt/audio_experts.txt:
--------------------------------------------------------------------------------
1 | --config configs/audiocaps/train-ce-audio-speech.json --group_seed 0|1|2 --device 0
2 |
--------------------------------------------------------------------------------
/launch_ablations_txt/all_audio_experts.txt:
--------------------------------------------------------------------------------
1 | --config configs/audiocaps/train-full-ce-only-audio_sophia_pann_soundnet.json --group_seed 0|1|2 --device 0
2 |
--------------------------------------------------------------------------------
/configs/clotho/train-vggish-vggsound.json:
--------------------------------------------------------------------------------
1 | {
2 | "inherit_from": "configs/data_loader_clotho.json",
3 | "experts": {
4 | "modalities": [
5 | "audio",
6 | "vggsound"
7 | ]
8 | }
9 | }
--------------------------------------------------------------------------------
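
The experiment configs in this repository are deliberately thin: they only override the fields that differ from the data-loader config named in `inherit_from` (which in turn points at a base config). The merge logic itself lives in the training entry point, which is not included in this section; the sketch below is only an illustration of the idea, assuming a recursive "child overrides parent" merge and using `mergedeep` (pinned in requirements/requirements.txt).

```python
# Illustrative sketch only: the repository's real config resolution is handled
# elsewhere. Assumes child keys take precedence over inherited ones.
import json
from pathlib import Path

from mergedeep import merge  # listed in requirements/requirements.txt


def load_config(path):
    """Recursively resolve `inherit_from` chains, child values winning."""
    config = json.loads(Path(path).read_text())
    parent_path = config.pop("inherit_from", None)
    if parent_path is None:
        return config
    parent = load_config(parent_path)
    return merge(parent, config)  # deep-merge: config overrides parent


if __name__ == "__main__":
    cfg = load_config("configs/clotho/train-vggish-vggsound.json")
    print(cfg["experts"]["modalities"])  # expected: ['audio', 'vggsound']
```
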
/launch_ablations_txt/single_audio_experts.txt:
--------------------------------------------------------------------------------
1 | --config configs/clotho/train-full-ce-only-audio.json --group_seed 0|1|2 --device 0 --resume data/saved/models/audiocaps-train-full-ce-only-audio/2021-03-23_09-06-26/trained_model.pth
2 |
--------------------------------------------------------------------------------
/configs/audiocaps/train-vggish-vggsound-train_list_10.json:
--------------------------------------------------------------------------------
1 | {"inherit_from": "configs/data_loader_audiocaps.json", "experts": {"modalities": ["audio", "vggsound"]}, "testing_file": "final_filtered_test_list.txt", "training_file": "train_list_10.txt"}
--------------------------------------------------------------------------------
/configs/audiocaps/train-vggish-vggsound-train_list_25.json:
--------------------------------------------------------------------------------
1 | {"inherit_from": "configs/data_loader_audiocaps.json", "experts": {"modalities": ["audio", "vggsound"]}, "testing_file": "final_filtered_test_list.txt", "training_file": "train_list_25.txt"}
--------------------------------------------------------------------------------
/configs/audiocaps/train-vggish-vggsound-train_list_50.json:
--------------------------------------------------------------------------------
1 | {"inherit_from": "configs/data_loader_audiocaps.json", "experts": {"modalities": ["audio", "vggsound"]}, "testing_file": "final_filtered_test_list.txt", "training_file": "train_list_50.txt"}
--------------------------------------------------------------------------------
/configs/audiocaps/train-vggish-vggsound-train_list_75.json:
--------------------------------------------------------------------------------
1 | {"inherit_from": "configs/data_loader_audiocaps.json", "experts": {"modalities": ["audio", "vggsound"]}, "testing_file": "final_filtered_test_list.txt", "training_file": "train_list_75.txt"}
--------------------------------------------------------------------------------
/misc/text_embedding_models.json:
--------------------------------------------------------------------------------
1 | {
2 | "w2v": {
3 | "weights_path": "data/text_models/GoogleNews-vectors-negative300.bin.gz",
4 | "dim": 300,
5 | "mirror": "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"
6 | }
7 | }
8 |
--------------------------------------------------------------------------------
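
`misc/text_embedding_models.json` records, for each supported text embedding, where its weights are expected on disk, their dimensionality, and a public mirror. The repository's own sync scripts take care of fetching the data; purely as an illustration (the helper below is not part of the codebase), an entry can be consumed like this:

```python
# Hypothetical helper, not part of the repository: fetch the w2v weights if missing.
import json
import urllib.request
from pathlib import Path

with open("misc/text_embedding_models.json") as f:
    text_models = json.load(f)

w2v = text_models["w2v"]
weights = Path(w2v["weights_path"])
if not weights.exists():
    weights.parent.mkdir(parents=True, exist_ok=True)
    print(f"Downloading {w2v['dim']}-d word2vec weights from mirror...")
    urllib.request.urlretrieve(w2v["mirror"], str(weights))
```
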
/configs/audiocaps/train-vggish-vggsound.json:
--------------------------------------------------------------------------------
1 | {
2 | "inherit_from": "configs/data_loader_audiocaps.json",
3 | "experts": {
4 | "modalities": [
5 | "audio",
6 | "vggsound"
7 | ]
8 | },
9 | "testing_file": "final_filtered_test_list.txt",
10 | "training_file": "train_list.txt"
11 | }
--------------------------------------------------------------------------------
/configs/audiocaps/train-full-ce-scene-r2p1d.json:
--------------------------------------------------------------------------------
1 | {
2 | "inherit_from": "configs/data_loader_audiocaps.json",
3 | "experts": {
4 | "modalities": [
5 | "scene.densenet161.0",
6 | "r2p1d.r2p1d-ig65m.0"
7 | ]
8 | },
9 | "testing_file": "final_filtered_test_list.txt",
10 | "training_file": "train_list.txt"
11 | }
--------------------------------------------------------------------------------
/configs/clotho/train-vggish-vggsound-moee.json:
--------------------------------------------------------------------------------
1 | {
2 | "inherit_from": "configs/data_loader_clotho.json",
3 | "experts": {
4 | "modalities": [
5 | "audio",
6 | "vggsound"
7 | ]
8 | },
9 | "arch": {
10 | "type": "CENet",
11 | "args": {
12 | "use_ce": ""
13 | }
14 | }
15 | }
--------------------------------------------------------------------------------
/configs/clotho/train-full-ce-only-audio.json:
--------------------------------------------------------------------------------
1 | {
2 | "inherit_from": "configs/data_loader_clotho.json",
3 | "experts": {
4 | "modalities": [
5 | "audio"
6 | ]
7 | },
8 | "arch": {
9 | "type": "CENet",
10 | "args": {
11 | "use_ce": "",
12 | "mimic_ce_dims": 1
13 | }
14 | }
15 | }
--------------------------------------------------------------------------------
/configs/queryd/train-full-ce-only-audio.json:
--------------------------------------------------------------------------------
1 | {
2 | "inherit_from": "configs/data_loader_queryd.json",
3 | "experts": {
4 | "modalities": [
5 | "audio"
6 | ]
7 | },
8 | "arch": {
9 | "type": "CENet",
10 | "args": {
11 | "use_ce": "",
12 | "mimic_ce_dims": 1
13 | }
14 | }
15 | }
--------------------------------------------------------------------------------
/configs/audiocaps/train-full-ce-r2p1d-inst.json:
--------------------------------------------------------------------------------
1 | {
2 | "inherit_from": "configs/data_loader_audiocaps.json",
3 | "experts": {
4 | "modalities": [
5 | "r2p1d.r2p1d-ig65m.0",
6 | "imagenet.resnext101_32x48d.0"
7 | ]
8 | },
9 | "testing_file": "final_filtered_test_list.txt",
10 | "training_file": "train_list.txt"
11 | }
--------------------------------------------------------------------------------
/configs/audiocaps/train-full-ce-scene-inst.json:
--------------------------------------------------------------------------------
1 | {
2 | "inherit_from": "configs/data_loader_audiocaps.json",
3 | "experts": {
4 | "modalities": [
5 | "scene.densenet161.0",
6 | "imagenet.resnext101_32x48d.0"
7 | ]
8 | },
9 | "testing_file": "final_filtered_test_list.txt",
10 | "training_file": "train_list.txt"
11 | }
--------------------------------------------------------------------------------
/configs/querydsegments/train-full-ce-only-audio.json:
--------------------------------------------------------------------------------
1 | {
2 | "inherit_from": "configs/data_loader_querydsegments.json",
3 | "experts": {
4 | "modalities": [
5 | "audio"
6 | ]
7 | },
8 | "arch": {
9 | "type": "CENet",
10 | "args": {
11 | "use_ce": "",
12 | "mimic_ce_dims": 1
13 | }
14 | }
15 | }
--------------------------------------------------------------------------------
/configs/audiocaps/train-full-ce-r2p1d-inst-vggish.json:
--------------------------------------------------------------------------------
1 | {
2 | "inherit_from": "configs/data_loader_audiocaps.json",
3 | "experts": {
4 | "modalities": [
5 | "audio",
6 | "r2p1d.r2p1d-ig65m.0",
7 | "imagenet.resnext101_32x48d.0"
8 | ]
9 | },
10 | "testing_file": "final_filtered_test_list.txt",
11 | "training_file": "train_list.txt"
12 | }
--------------------------------------------------------------------------------
/configs/audiocaps/train-full-ce-r2p1d-inst-vggsound.json:
--------------------------------------------------------------------------------
1 | {
2 | "inherit_from": "configs/data_loader_audiocaps.json",
3 | "experts": {
4 | "modalities": [
5 | "vggsound",
6 | "r2p1d.r2p1d-ig65m.0",
7 | "imagenet.resnext101_32x48d.0"
8 | ]
9 | },
10 | "testing_file": "final_filtered_test_list.txt",
11 | "training_file": "train_list.txt"
12 | }
--------------------------------------------------------------------------------
/configs/audiocaps/train-full-ce-scene-r2p1d-inst.json:
--------------------------------------------------------------------------------
1 | {
2 | "inherit_from": "configs/data_loader_audiocaps.json",
3 | "experts": {
4 | "modalities": [
5 | "scene.densenet161.0",
6 | "r2p1d.r2p1d-ig65m.0",
7 | "imagenet.resnext101_32x48d.0"
8 | ]
9 | },
10 | "testing_file": "final_filtered_test_list.txt",
11 | "training_file": "train_list.txt"
12 | }
--------------------------------------------------------------------------------
/configs/audiocaps/train-full-ce-r2p1d-inst-vggish-vggsound.json:
--------------------------------------------------------------------------------
1 | {
2 | "inherit_from": "configs/data_loader_audiocaps.json",
3 | "experts": {
4 | "modalities": [
5 | "audio",
6 | "r2p1d.r2p1d-ig65m.0",
7 | "imagenet.resnext101_32x48d.0",
8 | "vggsound"
9 | ]
10 | },
11 | "testing_file": "final_filtered_test_list.txt",
12 | "training_file": "train_list.txt"
13 | }
--------------------------------------------------------------------------------
/misc/datasets/querydsegments/tar_include.txt:
--------------------------------------------------------------------------------
1 | data/QuerYDSegments/structured-symlinks/aggregated_audio/vggish-raw.hickle
2 | data/QuerYDSegments/structured-symlinks/structured-symlinks/split_raw_captions_filtered.pkl
3 | data/QuerYDSegments/structured-symlinks/test_list.txt
4 | data/QuerYDSegments/structured-symlinks/text_embeddings/w2v.pkl
5 | data/QuerYDSegments/structured-symlinks/train_list.txt
6 | data/QuerYDSegments/structured-symlinks/val_list.txt
7 |
--------------------------------------------------------------------------------
/configs/audiocaps/train-full-ce-scene-r2p1d-inst-vggsound.json:
--------------------------------------------------------------------------------
1 | {
2 | "inherit_from": "configs/data_loader_audiocaps.json",
3 | "experts": {
4 | "modalities": [
5 | "scene.densenet161.0",
6 | "vggsound",
7 | "r2p1d.r2p1d-ig65m.0",
8 | "imagenet.resnext101_32x48d.0"
9 | ]
10 | },
11 | "testing_file": "final_filtered_test_list.txt",
12 | "training_file": "train_list.txt"
13 | }
--------------------------------------------------------------------------------
/configs/audiocaps/train-vggish-vggsound-moee.json:
--------------------------------------------------------------------------------
1 | {
2 | "inherit_from": "configs/data_loader_audiocaps.json",
3 | "experts": {
4 | "modalities": [
5 | "audio",
6 | "vggsound"
7 | ]
8 | },
9 | "arch": {
10 | "type": "CENet",
11 | "args": {
12 | "use_ce": ""
13 | }
14 | },
15 | "testing_file": "final_filtered_test_list.txt",
16 | "training_file": "train_list.txt"
17 | }
--------------------------------------------------------------------------------
/configs/audiocaps/train-full-ce-only-audio.json:
--------------------------------------------------------------------------------
1 | {
2 | "inherit_from": "configs/data_loader_audiocaps.json",
3 | "experts": {
4 | "modalities": [
5 | "audio"
6 | ]
7 | },
8 | "arch": {
9 | "type": "CENet",
10 | "args": {
11 | "use_ce": "",
12 | "mimic_ce_dims": 1
13 | }
14 | },
15 | "testing_file": "final_filtered_test_list.txt",
16 | "training_file": "train_list.txt"
17 | }
--------------------------------------------------------------------------------
/configs/audiocaps/train-only-vggsound.json:
--------------------------------------------------------------------------------
1 | {
2 | "inherit_from": "configs/data_loader_audiocaps.json",
3 | "experts": {
4 | "modalities": [
5 | "vggsound"
6 | ]
7 | },
8 | "arch": {
9 | "type": "CENet",
10 | "args": {
11 | "use_ce": "",
12 | "mimic_ce_dims": 1
13 | }
14 | },
15 | "testing_file": "final_filtered_test_list.txt",
16 | "training_file": "train_list.txt"
17 | }
--------------------------------------------------------------------------------
/configs/audiocaps/train-full-ce-only-r2p1d.json:
--------------------------------------------------------------------------------
1 | {
2 | "inherit_from": "configs/data_loader_audiocaps.json",
3 | "experts": {
4 | "modalities": [
5 | "r2p1d.r2p1d-ig65m.0"
6 | ]
7 | },
8 | "arch": {
9 | "type": "CENet",
10 | "args": {
11 | "use_ce": "",
12 | "mimic_ce_dims": 1
13 | }
14 | },
15 | "testing_file": "final_filtered_test_list.txt",
16 | "training_file": "train_list.txt"
17 | }
--------------------------------------------------------------------------------
/configs/audiocaps/train-full-ce-only-scene.json:
--------------------------------------------------------------------------------
1 | {
2 | "inherit_from": "configs/data_loader_audiocaps.json",
3 | "experts": {
4 | "modalities": [
5 | "scene.densenet161.0"
6 | ]
7 | },
8 | "arch": {
9 | "type": "CENet",
10 | "args": {
11 | "use_ce": "",
12 | "mimic_ce_dims": 1
13 | }
14 | },
15 | "testing_file": "final_filtered_test_list.txt",
16 | "training_file": "train_list.txt"
17 | }
--------------------------------------------------------------------------------
/configs/audiocaps/train-full-ce-scene-r2p1d-inst-vggish-vggsound.json:
--------------------------------------------------------------------------------
1 | {
2 | "inherit_from": "configs/data_loader_audiocaps.json",
3 | "experts": {
4 | "modalities": [
5 | "scene.densenet161.0",
6 | "audio",
7 | "r2p1d.r2p1d-ig65m.0",
8 | "imagenet.resnext101_32x48d.0",
9 | "vggsound"
10 | ]
11 | },
12 | "testing_file": "final_filtered_test_list.txt",
13 | "training_file": "train_list.txt"
14 | }
--------------------------------------------------------------------------------
/configs/audiocaps/train-full-ce-only-inst.json:
--------------------------------------------------------------------------------
1 | {
2 | "inherit_from": "configs/data_loader_audiocaps.json",
3 | "experts": {
4 | "modalities": [
5 | "imagenet.resnext101_32x48d.0"
6 | ]
7 | },
8 | "arch": {
9 | "type": "CENet",
10 | "args": {
11 | "use_ce": "",
12 | "mimic_ce_dims": 1
13 | }
14 | },
15 | "testing_file": "final_filtered_test_list.txt",
16 | "training_file": "train_list.txt"
17 | }
--------------------------------------------------------------------------------
/misc/datasets/activity-net/tar_include.txt:
--------------------------------------------------------------------------------
1 | data/activity-net/structured-symlinks/aggregated_audio/vggish-audio-raw.pickle
2 | data/activity-net/structured-symlinks/aggregated_facefeats_25fps_256px_stride1/face-avg.pickle
3 | data/activity-net/structured-symlinks/aggregated_ocr_feats/ocr-w2v.pkl
4 | data/activity-net/structured-symlinks/aggregated_text_feats/w2v.pkl
5 | data/activity-net/structured-symlinks/raw-captions-train-val_1.pkl
6 | data/activity-net/structured-symlinks/train_list.txt
7 | data/activity-net/structured-symlinks/val_1_list.txt
8 |
--------------------------------------------------------------------------------
/misc/datasets/clotho/tar_include.txt:
--------------------------------------------------------------------------------
1 | data/CLOTHO/structured-symlinks/aggregated_audio/vggish-raw.hickle
2 | data/CLOTHO/structured-symlinks/aggregated_pann/pann-raw.hickle
3 | data/CLOTHO/structured-symlinks/aggregated_syncnet/syncnet-raw.hickle
4 | data/CLOTHO/structured-symlinks/aggregated_vggsound/vggsound-raw.hickle
5 | data/CLOTHO/structured-symlinks/structured-symlinks/raw-captions.pkl
6 | data/CLOTHO/structured-symlinks/test_list.txt
7 | data/CLOTHO/structured-symlinks/text_embeddings/w2v.pkl
8 | data/CLOTHO/structured-symlinks/train_list.txt
9 | data/CLOTHO/structured-symlinks/val_list.txt
10 |
--------------------------------------------------------------------------------
/base/base_model.py:
--------------------------------------------------------------------------------
1 | import torch.nn as nn
2 | import numpy as np
3 | from abc import abstractmethod
4 |
5 |
6 | class BaseModel(nn.Module):
7 | """
8 | Base class for all models
9 | """
10 | @abstractmethod
11 | def forward(self, *inputs):
12 | """
13 | Forward pass logic
14 |
15 | :return: Model output
16 | """
17 | raise NotImplementedError
18 |
19 | def __str__(self):
20 | """
21 | Model prints with number of trainable parameters
22 | """
23 | model_parameters = filter(lambda p: p.requires_grad, self.parameters())
24 | params = sum([np.prod(p.size()) for p in model_parameters])
25 | return super().__str__() + f"\nTrainable parameters: {params}"
26 |
--------------------------------------------------------------------------------
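
`BaseModel` adds only two things on top of `nn.Module`: an abstract `forward` and a `__str__` that appends the trainable-parameter count. A minimal usage sketch (the toy subclass below is hypothetical, not a model from this repository):

```python
import torch.nn as nn

from base import BaseModel


class TinyNet(BaseModel):
    """Toy subclass used only to demonstrate the parameter-count printout."""

    def __init__(self):
        super().__init__()
        self.proj = nn.Linear(128, 64)

    def forward(self, x):
        return self.proj(x)


print(TinyNet())  # module summary followed by "Trainable parameters: 8256"
```
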
/configs/activity-net/train-full-ce-audio-only.json:
--------------------------------------------------------------------------------
1 | {
2 | "inherit_from": "configs/data_loader_activity-net.json",
3 | "experts": {
4 | "modalities": [
5 | "audio"
6 | ]
7 | },
8 | "arch": {
9 | "type": "CENet",
10 | "args": {
11 | "use_ce": "",
12 | "mimic_ce_dims": 1
13 | }
14 | },
15 | "trainer": {
16 | "epochs": 20
17 | },
18 | "optimizer": {
19 | "type": "Ranger",
20 | "args": {
21 | "lr": 0.01,
22 | "weight_decay": 1E-03
23 | }
24 | },
25 | "loss": {
26 | "type": "MaxMarginRankingLoss",
27 | "args": {
28 | "margin": 0.2,
29 | "fix_norm": true
30 | }
31 | }
32 | }
--------------------------------------------------------------------------------
/misc/experiments-queryd.json:
--------------------------------------------------------------------------------
1 | {
2 |
3 | "queryd-train-full-ce": ["c50d3616", "2021-05-28_15-24-39"],
4 | "queryd-train-full-ce-only-scene": ["766c0b81", "2021-05-28_15-39-29"],
5 | "queryd-train-full-ce-only-scene-audio": ["e576753f", "2021-05-28_16-20-15"],
6 | "queryd-train-full-ce-only-scene-audio-inst": ["e40f68bf", "2021-05-28_16-21-50"],
7 | "queryd-train-full-ce-only-scene-audio-inst-r2p1d": ["54ca249c", "2021-05-28_16-24-04"],
8 | "queryd-train-full-mnnet": ["7e1a7420", "2021-05-28_16-38-33"],
9 | "queryd-train-full-moee": ["ab5db961", "2021-05-28_15-32-38"],
10 |
11 | "querydsegments-train-full-ce": ["0d1b703c", "2021-05-28_15-26-57"],
12 | "querydsegments-train-full-mnnet": ["1404fc28", "2021-05-28_16-38-32"],
13 | "querydsegments-train-full-moee": ["7b3b466e", "2021-05-28_15-32-44"]
14 |
15 |
16 | }
--------------------------------------------------------------------------------
/misc/yaspi_gpu_defaults.json:
--------------------------------------------------------------------------------
1 | {
2 | "recipe": "gpu-proc",
3 | "partition": "gpu",
4 | "time_limit": "96:00:00",
5 | "gen_script_dir": "data/slurm-gen-scripts",
6 | "mem": "100G",
7 | "gpus_per_task": 1,
8 | "cpus_per_task": 5,
9 | "throttle_array": 20,
10 | "ssh_forward": "",
11 | "log_dir": "data/slurm-logs",
12 | "use_custom_ray_tmp_dir": false,
13 | "refresh_logs": false,
14 | "exclude": "gnodef1,gnodee7,gnodef2,gnodee1,gnodee2,gnodee3,gnodee4,gnodee5,gnodee6,gnodee8,gnodeb1,gnodeb2,gnodeb3,gnodeb4,gnodeb5,gnodec1,gnodec2,gnodec3,gnodec4,gnodec5,gnodej1",
15 | "constraint_str": "",
16 | "prep": "",
17 | "env_setup": "export PYTHONPATH=\"${BASE}\":$PYTHONPATH; export PATH=\"${HOME}\"/local/anaconda3/condabin/:$PATH; source ~/local/anaconda3/etc/profile.d/conda.sh; conda activate pt37"
18 | }
19 |
--------------------------------------------------------------------------------
/misc/datasets/queryd/tar_include.txt:
--------------------------------------------------------------------------------
1 | data/QuerYD/structured-symlinks/aggregated_audio/vggish-raw.hickle
2 | data/QuerYD/structured-symlinks/aggregated_imagenet_25fps_256px_stride1_offset0/resnext101_32x48d-avg.pickle
3 | data/QuerYD/structured-symlinks/aggregated_r2p1d_30fps_256px_stride32_offset0_inner_stride1/r2p1d-ig65m-avg.pickle
4 | data/QuerYD/structured-symlinks/aggregated_s3dg_10fps_256px_stride16_offset0_inner_stride1/s3dg-avg-logits.pickle
5 | data/QuerYD/structured-symlinks/aggregated_scene_25fps_256px_stride1_offset0/densenet161-avg.pickle
6 | data/QuerYD/structured-symlinks/raw_captions_combined_filtered.pkl
7 | data/QuerYD/structured-symlinks/text_embeddings/howto100m_mil_nce.pkl
8 | data/QuerYD/structured-symlinks/text_embeddings/w2v.pkl
9 | data/QuerYD/structured-symlinks/test_list.txt
10 | data/QuerYD/structured-symlinks/train_list.txt
11 | data/QuerYD/structured-symlinks/val_list.txt
12 |
--------------------------------------------------------------------------------
/misc/datasets/audiocaps/tar_include.txt:
--------------------------------------------------------------------------------
1 | data/AudioCaps/structured-symlinks/aggregated_audio/vggish-raw.hickle
2 | data/AudioCaps/structured-symlinks/aggregated_imagenet_25fps_256px_stride1_offset0/resnext101_32x48d-avg.pickle
3 | data/AudioCaps/structured-symlinks/aggregated_pann/pann-raw.hickle
4 | data/AudioCaps/structured-symlinks/aggregated_r2p1d_30fps_256px_stride32_offset0_inner_stride1/r2p1d-ig65m-avg.pickle
5 | data/AudioCaps/structured-symlinks/aggregated_scene_25fps_256px_stride1_offset0/densenet161-avg.pickle
6 | data/AudioCaps/structured-symlinks/aggregated_syncnet/syncnet-raw.hickle
7 | data/AudioCaps/structured-symlinks/aggregated_vggsound/vggsound-raw.hickle
8 | data/AudioCaps/structured-symlinks/filtered_val_list.txt
9 | data/AudioCaps/structured-symlinks/final_filtered_test_list.txt
10 | data/AudioCaps/structured-symlinks/structured-symlinks/raw-captions.pkl
11 | data/AudioCaps/structured-symlinks/text_embeddings/w2v.pkl
12 | data/AudioCaps/structured-symlinks/train_list.txt
13 |
--------------------------------------------------------------------------------
/logger/logger_config.json:
--------------------------------------------------------------------------------
1 |
2 | {
3 | "version": 1,
4 | "disable_existing_loggers": false,
5 | "formatters": {
6 | "simple": {"format": "%(message)s"},
7 | "datetime": {"format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s"}
8 | },
9 | "handlers": {
10 | "console": {
11 | "class": "logging.StreamHandler",
12 | "level": "DEBUG",
13 | "formatter": "simple",
14 | "stream": "ext://sys.stdout"
15 | },
16 | "info_file_handler": {
17 | "class": "logging.handlers.RotatingFileHandler",
18 | "level": "INFO",
19 | "formatter": "datetime",
20 | "filename": "info.log",
21 | "maxBytes": 10485760,
22 | "backupCount": 20, "encoding": "utf8"
23 | }
24 | },
25 | "root": {
26 | "level": "INFO",
27 | "handlers": [
28 | "console",
29 | "info_file_handler"
30 | ]
31 | }
32 | }
--------------------------------------------------------------------------------
/logger/logger.py:
--------------------------------------------------------------------------------
1 | import os
2 | import logging
3 | import logging.config
4 | from pathlib import Path
5 | from utils import read_json
6 |
7 |
8 | def setup_logging(save_dir, log_config='logger/logger_config.json',
9 | default_level=logging.INFO):
10 | """Setup logging configuration."""
11 | print(os.getcwd())
12 | log_config = Path(log_config)
13 | print(f"log config: {log_config} exists: {log_config.exists()}")
14 | if log_config.is_file():
15 | config = read_json(log_config)
16 | # modify logging paths based on run config
17 | for _, handler in config['handlers'].items():
18 | if 'filename' in handler:
19 | handler['filename'] = str(save_dir / handler['filename'])
20 |
21 |         logging.config.dictConfig(config)
22 |         return config["handlers"]["info_file_handler"]["filename"]
23 |     print(f"Warning: logging configuration file is not found in {log_config}.")
24 |     logging.basicConfig(level=default_level)
25 |     return None
26 |
--------------------------------------------------------------------------------
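
`setup_logging` reads `logger/logger_config.json`, rewrites every handler's `filename` so it lands inside the run's save directory, applies the configuration via `dictConfig`, and returns the resulting `info.log` path. A minimal usage sketch (the directory name below is hypothetical):

```python
import logging
from pathlib import Path

from logger import setup_logging

save_dir = Path("data/saved/log/example-run")  # hypothetical run directory
save_dir.mkdir(parents=True, exist_ok=True)

info_log_path = setup_logging(save_dir)  # info.log now lives under save_dir
logging.getLogger("example").info("written to both console and %s", info_log_path)
```
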
/.gitignore:
--------------------------------------------------------------------------------
1 | # symlinked data
2 | data
3 |
4 | # ignore predictions
5 | pred
6 |
7 | # local dev files
8 | scratch/
9 |
10 | # nuisance files
11 | *.DS_Store
12 | .nfs*
13 | __pycache__
14 |
15 | # generated files
16 | misc/slurm/scripts/slurm-job.sh
17 | .vscode/tags
18 |
19 | # exclude files that are not for release
20 | hp*
21 | hc*
22 |
23 | # exclude unsupported datasets
24 | configs/mit
25 | configs/mmit
26 | data_loader/MIT*
27 | data_loader/MMIT*
28 | configs/templates
29 | misc/ablations-template.md
30 | misc/ablations.md
31 | misc/README-ablations-template.md
32 | misc/README-audiocaps-ablations-template.md
33 | misc/README-model-study.md
34 | misc/README-queryd-ablations-template.md
35 | misc/README-template.md
36 |
37 | slurm
38 |
39 | .vscode
40 |
41 | # exclude long video tar lists to avoid an overly heavy git repo
42 | misc/cvpr2020_challenge/datasets/activity-net/challenge-release-1/video_tar_include.txt
43 | misc/cvpr2020_challenge/datasets/MSVD/challenge-release-1/video_tar_include.txt
44 | misc/cvpr2020_challenge/datasets/DiDeMo/challenge-release-1/video_tar_include.txt
45 | misc/cvpr2020_challenge/datasets/MSRVTT/challenge-release-1/video_tar_include.txt
46 | misc/cvpr2020_challenge/datasets/YouCook2/challenge-release-1/video_tar_include.txt
47 |
--------------------------------------------------------------------------------
/requirements/requirements.txt:
--------------------------------------------------------------------------------
1 | backcall==0.2.0
2 | beartype==0.7.0
3 | certifi==2021.5.30
4 | chardet==4.0.0
5 | colored==1.4.2
6 | common-cmplr-lib-rt==2021.2.0
7 | common-cmplr-lic-rt==2021.2.0
8 | cycler==0.10.0
9 | decorator==5.0.9
10 | dill==0.3.3
11 | dominate==2.6.0
12 | dpcpp-cpp-rt==2021.2.0
13 | h5py==2.10.0
14 | hickle==4.0.4
15 | humanize==3.7.1
16 | idna==2.10
17 | intel-openmp==2021.2.0
18 | ipdb==0.13.9
19 | ipython==7.24.1
20 | ipython-genutils==0.2.0
21 | jedi==0.18.0
22 | joblib==1.0.1
23 | kiwisolver==1.3.1
24 | matplotlib==3.4.2
25 | matplotlib-inline==0.1.2
26 | mergedeep==1.3.4
27 | mkl==2021.2.0
28 | mkl-fft==1.3.0
29 | mkl-random==1.2.2
30 | mkl-service==2.4.0
31 | mock==4.0.3
32 | msgpack==1.0.2
33 | msgpack-numpy==0.4.7.1
34 | numpy==1.20.3
35 | opencl-rt==2021.2.0
36 | ordered-set==4.0.2
37 | pandas==1.0.3
38 | parso==0.8.2
39 | pexpect==4.8.0
40 | pickleshare==0.7.5
41 | Pillow==8.2.0
42 | prompt-toolkit==3.0.18
43 | protobuf==3.17.3
44 | psutil==5.8.0
45 | ptyprocess==0.7.0
46 | Pygments==2.9.0
47 | PyLaTeX==1.4.1
48 | pyparsing==2.4.7
49 | python-dateutil==2.8.1
50 | pytorch-swats==0.1.0
51 | pytz==2021.1
52 | PyYAML==5.4.1
53 | requests==2.25.1
54 | scikit-learn==0.24.2
55 | scipy==1.6.3
56 | seaborn==0.11.1
57 | six==1.16.0
58 | tailf==0.3.2
59 | tbb==2021.2.0
60 | tensorboardX==2.2
61 | threadpoolctl==2.1.0
62 | toml==0.10.2
63 | torch==1.7.1
64 | torchvision==0.8.2
65 | tqdm==4.61.1
66 | traitlets==5.0.5
67 | typeguard==2.12.1
68 | typing-extensions==3.10.0.0
69 | urllib3==1.26.5
70 | watchlogs==0.1.3.21
71 | wcwidth==0.2.5
72 | wget==3.2
73 | yaspi==0.0.5
74 | zsvision==0.7.8
75 |
--------------------------------------------------------------------------------
/misc/datasets/queryd/README.md:
--------------------------------------------------------------------------------
1 | ## Pretrained Experts
2 |
3 | This folder contains a collection of features, extracted from the QuerYD [2] dataset as part of the paper:
4 | *QuerYD: A video dataset with high-quality textual and audio narrations*.
5 |
6 | ### Training splits
7 |
8 | The training splits are given in the files linked below:
9 |
10 | * [train_list.txt](train_list.txt) (1796 videos)
11 | * [val_list.txt](val_list.txt) (384 videos)
12 | * [test_list.txt](test_list.txt) (386 videos)
13 |
14 |
15 | **Tar contents**
16 |
17 | The compressed tar file (402MB) can be downloaded from:
18 |
19 | ```
20 | http://www.robots.ox.ac.uk/~vgg/research/collaborative-experts/data/features-v2/QuerYD-experts.tar.gz
21 | sha1sum: 0207ea85eeb52a4f50b06a31af28484afe4d9e86
22 | ```
23 | A list of the contents of the tar file is given in [tar_include.txt](tar_include.txt).
24 |
25 |
26 | ### References:
27 |
28 | [1] If you use these features, please consider citing:
29 | ```
30 | @inproceedings{Liu2019a,
31 | author = {Liu, Y. and Albanie, S. and Nagrani, A. and Zisserman, A.},
32 | booktitle = {British Machine Vision Conference},
33 | title = {Use What You Have: Video retrieval using representations from collaborative experts},
34 | date = {2019},
35 | }
36 | ```
37 |
38 | [2] Please also consider citing the original QuerYD dataset, which was described in:
39 |
40 | ```
41 | @misc{oncescu2020queryd,
42 | title={QuerYD: A video dataset with high-quality textual and audio narrations},
43 |   author={Andreea-Maria Oncescu and João F. Henriques and Yang Liu and Andrew Zisserman and Samuel Albanie},
44 | year={2020},
45 | }
46 | ```
--------------------------------------------------------------------------------
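
The README above lists a download URL together with a `sha1sum` for the archive. Verification can be done with the `sha1sum` command-line tool, or equivalently with a few lines of Python (a sketch, assuming the tar has already been downloaded to the current directory):

```python
# Sketch: verify the downloaded QuerYD expert features against the published sha1sum.
import hashlib
from pathlib import Path

EXPECTED_SHA1 = "0207ea85eeb52a4f50b06a31af28484afe4d9e86"  # from the README above

sha1 = hashlib.sha1()
with open(Path("QuerYD-experts.tar.gz"), "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # read in 1 MiB chunks
        sha1.update(chunk)

assert sha1.hexdigest() == EXPECTED_SHA1, "checksum mismatch: re-download the archive"
print("checksum OK")
```
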
/misc/datasets/querydsegments/README.md:
--------------------------------------------------------------------------------
1 | ## Pretrained Experts
2 |
3 | This folder contains a collection of features, extracted from the QuerYD [2] dataset as part of the paper:
4 | *QuerYD: A video dataset with high-quality textual and audio narrations*.
5 |
6 | ### Training splits
7 |
8 | The training splits are given in the files linked below:
9 |
10 | * [train_list.txt](train_list.txt) (9113 videos)
11 | * [val_list.txt](val_list.txt) (1952 videos)
12 | * [test_list.txt](test_list.txt) (1954 videos)
13 |
14 |
15 | **Tar contents**
16 |
17 | The compressed tar file (244MB) can be downloaded from:
18 |
19 | ```
20 | https://www.robots.ox.ac.uk/~vgg/research/collaborative-experts/data/features-v2/QuerYDSegments-experts.tar.gz
21 | sha1sum: f2be088890294f92355ccfe109f824d814cf2cd5
22 | ```
23 | A list of the contents of the tar file is given in [tar_include.txt](tar_include.txt).
24 |
25 |
26 | ### References:
27 |
28 | [1] If you use these features, please consider citing:
29 | ```
30 | @inproceedings{Liu2019a,
31 | author = {Liu, Y. and Albanie, S. and Nagrani, A. and Zisserman, A.},
32 | booktitle = {British Machine Vision Conference},
33 | title = {Use What You Have: Video retrieval using representations from collaborative experts},
34 | date = {2019},
35 | }
36 | ```
37 |
38 | [2] Please also consider citing the original QuerYD dataset, which was described in:
39 |
40 | ```
41 | @misc{oncescu2020queryd,
42 | title={QuerYD: A video dataset with high-quality textual and audio narrations},
43 |   author={Andreea-Maria Oncescu and João F. Henriques and Yang Liu and Andrew Zisserman and Samuel Albanie},
44 | year={2020},
45 | }
46 | ```
--------------------------------------------------------------------------------
/misc/exps-names.md:
--------------------------------------------------------------------------------
1 | ## This file contains additional instructions for running the commands provided in the main README file
2 |
3 | ### Downloading required features and splits:
4 | ```
5 | python3 misc/sync_experts.py --dataset AudioCaps
6 | python3 misc/sync_experts.py --dataset CLOTHO
7 | python3 misc/sync_experts.py --dataset activity-net
8 | python3 misc/sync_experts.py --dataset QuerYDSegments
9 | ```
10 |
11 | ### Finding the corresponding .json file names for evaluation of pre-trained models
12 |
13 | #### AudioCaps:
14 | |Experiment type | Model name|
15 | |---|---|
16 | |CE VGGish only | audiocaps-train-full-ce-only-audio|
17 | |CE VGGSound only | audiocaps-train-only-vggsound|
18 | |CE VGGish + VGGSound | audiocaps-train-vggish-vggsound|
19 | |MoEE VGGish + VGGSound | audiocaps-train-vggish-vggsound-moee|
20 | |CE Scene | audiocaps-train-full-ce-only-scene|
21 | |CE R2P1D | audiocaps-train-full-ce-only-r2p1d|
22 | |CE Inst | audiocaps-train-full-ce-only-inst|
23 | |CE Scene + R2P1D | audiocaps-train-full-ce-scene-r2p1d|
24 | |CE Scene + Inst | audiocaps-train-full-ce-scene-inst|
25 | |CE R2P1D + Inst | audiocaps-train-full-ce-r2p1d-inst|
26 | |CE - R2P1D + Inst + VGGish | audiocaps-train-full-ce-r2p1d-inst-vggish |
27 | |CE - R2P1D + Inst + VGGSound | audiocaps-train-full-ce-r2p1d-inst-vggsound |
28 | |CE - R2P1D + Inst + VGGish + VGGSound | audiocaps-train-full-ce-r2p1d-inst-vggish-vggsound |
29 |
30 | #### CLOTHO:
31 | |Experiment type | Model name|
32 | |---|---|
33 | |CE VGGish only | clotho-train-full-ce-only-audio|
34 | |CE VGGish + VGGSound | clotho-train-vggish-vggsound|
35 | |MoEE VGGish + VGGSound | clotho-train-vggish-vggsound-moee|
36 |
37 | #### Activity-net:
38 | |Experiment type | Model name|
39 | |---|---|
40 | |CE VGGish only | activity-net-train-full-ce-audio-only|
41 |
42 | #### QuerYDSegments:
43 | |Experiment type | Model name|
44 | |---|---|
45 | |CE VGGish only | querydsegments-train-full-ce-only-audio|
46 |
--------------------------------------------------------------------------------
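
The model names in the tables above index into the experiment manifests under `misc/` (for example `misc/experiments-audiocaps.json` below), which record the group id and timestamp of each released run. A small lookup sketch:

```python
import json

with open("misc/experiments-audiocaps.json") as f:
    experiments = json.load(f)

group_id, timestamp = experiments["audiocaps-train-vggish-vggsound"]
print(group_id, timestamp)  # 7e2eda12 2021-06-09_17-06-26
```
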
/misc/find_latest_checkpoints.py:
--------------------------------------------------------------------------------
1 | """Simple aggregation script for experiments
2 |
3 | ipy misc/find_latest_checkpoints.py -- --dataset audiocaps
4 | """
5 | import argparse
6 | from pathlib import Path
7 | from datetime import datetime
8 |
9 |
10 | def formatted_summary(dataset, exp_root, fname):
11 | try:
12 | summaries = list(Path(exp_root).glob(f"**/*{fname}"))
13 | summaries = [x for x in summaries if dataset in str(x)]
14 | except FileNotFoundError:
15 | fname = "summary-seed-1_seed-2_seed-3.json"
16 | summaries = list(Path(exp_root).glob(f"**/*{fname}"))
17 | summaries = [x for x in summaries if dataset in str(x)]
18 | print(f"Found {len(summaries)}")
19 | latest = {}
20 | time_format = "%Y-%m-%d_%H-%M-%S"
21 | for summary in summaries:
22 | rel_path = summary.relative_to(exp_root)
23 | key, group, timestamp = rel_path.parts[0], rel_path.parts[1], rel_path.parts[3]
24 | val = {"timestamp": timestamp, "group": group}
25 | if key in latest:
26 | prev_ts = datetime.strptime(latest[key]["timestamp"], time_format)
27 | curr_ts = datetime.strptime(timestamp, time_format)
28 | if curr_ts > prev_ts:
29 | latest[key] = val
30 | else:
31 | latest[key] = val
32 | for key, val in sorted(latest.items()):
33 | ts, group = val["timestamp"], val["group"]
34 | print(f'"{key}": ["{group}", "{ts}"],')
35 |
36 |
37 | def main():
38 | parser = argparse.ArgumentParser()
39 | parser.add_argument("--dataset", default="audiocaps")
40 | parser.add_argument("--exp_root", default="data/saved/log")
41 | parser.add_argument("--fname", default="summary-seed-0_seed-1_seed-2.json")
42 | args = parser.parse_args()
43 |
44 | formatted_summary(
45 | fname=args.fname,
46 | dataset=args.dataset,
47 | exp_root=args.exp_root,
48 | )
49 |
50 |
51 | if __name__ == "__main__":
52 | main()
53 |
--------------------------------------------------------------------------------
/misc/experiments-audiocaps.json:
--------------------------------------------------------------------------------
1 | {
2 | "audiocaps-train-full-ce-only-audio": ["c0b5bc86", "2021-06-10_15-34-48"],
3 | "audiocaps-train-full-ce-only-inst": ["5ee05383", "2021-06-10_15-32-29"],
4 | "audiocaps-train-full-ce-only-r2p1d": ["88d3ab9e", "2021-06-10_15-30-03"],
5 | "audiocaps-train-full-ce-only-scene": ["74d71d8b", "2021-06-10_15-27-11"],
6 | "audiocaps-train-full-ce-r2p1d-inst": ["cf11d710", "2021-06-10_15-23-04"],
7 | "audiocaps-train-full-ce-r2p1d-inst-vggish": ["74991f95", "2021-06-10_15-06-31"],
8 | "audiocaps-train-full-ce-r2p1d-inst-vggish-vggsound": ["b51f941a", "2021-06-10_14-56-45"],
9 | "audiocaps-train-full-ce-r2p1d-inst-vggsound": ["1b623fdc", "2021-06-10_14-49-00"],
10 | "audiocaps-train-full-ce-scene-inst": ["55c40cc6", "2021-06-10_15-18-50"],
11 | "audiocaps-train-full-ce-scene-r2p1d": ["b2b14107", "2021-06-10_15-13-04"],
12 | "audiocaps-train-only-vggsound": ["afab0e0c", "2021-06-16_01-21-37"],
13 | "audiocaps-train-vggish-vggsound": ["7e2eda12", "2021-06-09_17-06-26"],
14 | "audiocaps-train-vggish-vggsound-moee": ["f66525f8", "2021-06-09_16-44-00"],
15 | "audiocaps-train-vggish-vggsound-train_list_10": ["68747f8c", "2021-06-10_11-02-21"],
16 | "audiocaps-train-vggish-vggsound-train_list_25": ["0151ad7f", "2021-06-10_11-14-25"],
17 | "audiocaps-train-vggish-vggsound-train_list_50": ["4aeeaa0d", "2021-06-10_11-27-36"],
18 | "audiocaps-train-vggish-vggsound-train_list_75": ["3a8d0584", "2021-06-10_11-45-26"],
19 |
20 | "clotho-train-full-ce-only-audio": ["4f58ef05", "2021-06-10_15-38-28"],
21 | "clotho-train-vggish-vggsound": ["dec0c820", "2021-06-10_14-45-51"],
22 | "clotho-train-vggish-vggsound-moee": ["fafa3e91", "2021-06-10_14-44-51"],
23 | "clotho-train-vggish-vggsound-finetuned": ["74560a6c", "2021-06-10_16-38-40"],
24 | "clotho-train-vggish-vggsound-moee-finetuned": ["5395fa47", "2021-06-10_16-36-13"],
25 |
26 |
27 | "querydsegments-train-full-ce-only-audio": ["70111434", "2021-06-10_14-33-03"],
28 | "activity-net-train-full-ce-audio-only": ["e8639db7", "2021-06-11_12-23-42"]
29 | }
--------------------------------------------------------------------------------
/misc/experiments_teachText.json:
--------------------------------------------------------------------------------
1 | {
2 |
3 | "msrvtt-train-gpt2-xl-finetuned-mte-denoising-adam-less80": ["c58ecf3b", "2020-11-21_14-00-26"],
4 | "msrvtt-train-gpt2-xl-finetuned-denoising-adam": ["a61447a9", "2020-11-11_05-31-29"],
5 | "msrvtt-train-gpt2-xl-finetuned-mte-denoising-adam": ["2cc98676", "2020-11-11_06-21-03"],
6 | "msrvtt-train-full-ce": ["6becbb74", "2020-06-28_18-31-21"],
7 | "msrvtt-train-ce-intra-mte": ["4d4508a2", "2020-11-06_17-27-00"],
8 | "msrvtt-train-gpt2-xl-finetuned-adam": ["244af891", "2020-10-01_12-22-00"],
9 | "msrvtt-train-gpt2-xl-finetuned-mte-adam": ["6427fd41", "2020-09-30_20-34-12"],
10 |
11 | "msvd-train-full-ce": ["2ae80bea", "2020-11-11_13-16-14"],
12 | "msvd-train-gpt2-xl-finetuned-adam": ["db396303", "2020-10-01_13-17-33"],
13 | "msvd-train-gpt2-xl-finetuned-mte-adam": ["0af2a1ed", "2020-09-30_21-30-15"],
14 | "msvd-train-ce-intra-mte": ["a3026a07", "2020-11-13_00-19-59"],
15 | "msvd-train-gpt2-xl-finetuned-denoising-adam": ["71686a77", "2020-11-11_12-19-27"],
16 | "msvd-train-gpt2-xl-finetuned-mte-denoising-adam": ["66dc5dff", "2020-11-11_12-57-29"],
17 |
18 | "didemo-train-full-ce": ["4ea49b50", "2020-06-28_20-04-46"],
19 | "didemo-train-gpt2-xl-finetuned-adam": ["616cf11b", "2020-10-01_13-31-57"],
20 | "didemo-train-gpt2-xl-finetuned-mte-adam": ["f004e587", "2020-09-30_20-19-13"],
21 | "didemo-train-ce-intra-mte": ["1a5a249f", "2020-11-06_19-12-39"],
22 |
23 |
24 | "lsmdc-train-full-ce": ["7af368b1", "2020-06-28_20-40-54"],
25 | "lsmdc-train-gpt2-xl-finetuned-mte-adam": ["38e65732", "2020-09-30_20-52-52"],
26 | "lsmdc-train-gpt2-xl-finetuned-adam": ["9e2c8afd", "2020-10-01_13-48-49"],
27 | "lsmdc-train-ce-intra-mte": ["1a5555af", "2020-11-06_19-32-23"],
28 |
29 |
30 | "activity-net-train-full-ce": ["9601c704", "2020-07-31_00-23-01"],
31 | "activity-net-train-gpt2-xl-finetuned-adam": ["a791f27d", "2020-10-01_13-42-29"],
32 | "activity-net-train-gpt2-xl-finetuned-mte-adam": ["87d04a50", "2020-10-01_08-48-36"],
33 | "activity-net-train-ce-intra-mte": ["620ad6b4", "2020-11-06_19-12-39"]
34 | }
35 |
--------------------------------------------------------------------------------
/configs/data_loader_activity-net.json:
--------------------------------------------------------------------------------
1 | {
2 | "inherit_from": "configs/base_config.json",
3 | "eval_mode": "test_run",
4 | "experts": {
5 | "face_dim": 512,
6 | "text_feat": "w2v",
7 | "modalities": [
8 | "imagenet.resnext101_32x48d.0",
9 | "imagenet.senet154.0",
10 | "scene.densenet161.0",
11 | "r2p1d.r2p1d-ig65m.0",
12 | "i3d.i3d.0",
13 | "face",
14 | "ocr",
15 | "audio",
16 | "speech"
17 | ]
18 | },
19 | "arch": {
20 | "args": {
21 | "test_caption_mode": "indep",
22 | "use_ce": "pairwise",
23 | "use_mish": 1,
24 | "use_bn_reason": 1,
25 | "num_g_layers": 3,
26 | "num_h_layers": 0,
27 | "include_self": 1,
28 | "l2renorm": false,
29 | "randomise_feats": "",
30 | "vlad_clusters": {
31 | "text": 20,
32 | "audio": 16
33 | },
34 | "ghost_clusters": {
35 | "text": 1
36 | }
37 | }
38 | },
39 | "data_loader": {
40 | "args":{
41 | "dataset_name": "ActivityNet",
42 | "root_feat_folder": "structured-symlinks",
43 | "data_dir": "data/activity-net",
44 | "split_name": "val1",
45 | "batch_size": 128,
46 | "fuse_captions": true,
47 | "num_test_captions": 1,
48 | "max_tokens": {
49 | "text": 20,
50 | "audio": 29
51 | }
52 | }
53 | },
54 | "trainer": {
55 | "epochs": 40
56 | },
57 | "optimizer": {
58 | "type": "Ranger",
59 | "args":{
60 | "lr": 0.1,
61 | "weight_decay": 1e-3
62 | }
63 | },
64 | "loss": {
65 | "type": "MaxMarginRankingLoss",
66 | "args": {
67 | "margin": 0.060496613740311816,
68 | "fix_norm": true
69 | }
70 | },
71 | "eval_settings": {
72 | "data_loader": {
73 | "args":{
74 | "split_name": "val1"
75 | }
76 | },
77 | "tester": {
78 | "save_dir": "data/saved/",
79 | "verbosity": 2
80 | },
81 | "disable_gpu": true
82 | }
83 | }
84 |
--------------------------------------------------------------------------------
/utils/cos_restart.py:
--------------------------------------------------------------------------------
1 | import math
2 | import torch
3 | import numpy as np
4 | from torch.optim.optimizer import Optimizer, required
5 | from torch.optim.lr_scheduler import _LRScheduler
6 |
7 | class CosineAnnealingWithRestartsLR(_LRScheduler):
8 |
9 | r"""Set the learning rate of each parameter group using a cosine annealing
10 | schedule, where :math:`\eta_{max}` is set to the initial lr and
11 | :math:`T_{cur}` is the number of epochs since the last restart in SGDR:
12 | .. math::
13 | \eta_t = \eta_{min} + \frac{1}{2}(\eta_{max} - \eta_{min})(1 +
14 | \cos(\frac{T_{cur}}{T_{max}}\pi))
15 | When last_epoch=-1, sets initial lr as lr.
16 | It has been proposed in
17 | `SGDR: Stochastic Gradient Descent with Warm Restarts`_. This implements
18 | the cosine annealing part of SGDR, the restarts and number of iterations multiplier.
19 | Args:
20 | optimizer (Optimizer): Wrapped optimizer.
21 | T_max (int): Maximum number of iterations.
22 | T_mult (float): Multiply T_max by this number after each restart. Default: 1.
23 | eta_min (float): Minimum learning rate. Default: 0.
24 | last_epoch (int): The index of last epoch. Default: -1.
25 | .. _SGDR\: Stochastic Gradient Descent with Warm Restarts:
26 | https://arxiv.org/abs/1608.03983
27 |
28 | src: https://github.com/lkhphuc/pytorch-3d-point-cloud-generation/blob/
29 | master/custom_scheduler.py
30 | """
31 |
32 | def __init__(self, optimizer, T_max, eta_min=0, last_epoch=-1, T_mult=1):
33 | self.T_max = T_max
34 | self.T_mult = T_mult
35 | self.restart_every = T_max
36 | self.eta_min = eta_min
37 | self.restarts = 0
38 | self.restarted_at = 0
39 | super().__init__(optimizer, last_epoch)
40 |
41 | def restart(self):
42 | self.restart_every *= self.T_mult
43 | self.restarted_at = self.last_epoch
44 |
45 | def cosine(self, base_lr):
46 | return self.eta_min + (base_lr - self.eta_min) * \
47 | (1 + math.cos(math.pi * self.step_n / self.restart_every)) / 2
48 |
49 | @property
50 | def step_n(self):
51 | return self.last_epoch - self.restarted_at
52 |
53 | def get_lr(self):
54 | if self.step_n >= self.restart_every:
55 | self.restart()
56 | return [self.cosine(base_lr) for base_lr in self.base_lrs]
--------------------------------------------------------------------------------
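
`CosineAnnealingWithRestartsLR` anneals each parameter group's learning rate from its base value towards `eta_min` over `T_max` epochs, then restarts, stretching every subsequent cycle by `T_mult`. A minimal usage sketch (the model and optimiser below are placeholders):

```python
import torch

from utils.cos_restart import CosineAnnealingWithRestartsLR

model = torch.nn.Linear(10, 2)  # placeholder model
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
scheduler = CosineAnnealingWithRestartsLR(optimizer, T_max=10, eta_min=1e-4, T_mult=2)

for epoch in range(30):
    # ... train for one epoch ...
    optimizer.step()   # optimizer step before scheduler step, as PyTorch expects
    scheduler.step()   # cycles of length 10, then 20 epochs
    print(epoch, scheduler.get_last_lr())
```
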
/misc/generate_exps.py:
--------------------------------------------------------------------------------
1 | """A utility for generating experiment config files.
2 | """
3 | import json
4 | import copy
5 | import argparse
6 | import itertools
7 | from pathlib import Path
8 | from datetime import datetime
9 | from collections import OrderedDict
10 |
11 |
12 | def generate_configs(base_config, grid):
13 | job_queue = []
14 | timestamp = datetime.now().strftime(r"%Y-%m-%d_%H-%M-%S")
15 | hparam_vals = [x for x in grid.values()]
16 | grid_vals = list(itertools.product(*hparam_vals))
17 | hparams = list(grid.keys())
18 |
19 | for cfg_vals in grid_vals:
20 | custom_tokens = [f"{hparam}@{val}" for hparam, val in zip(hparams, cfg_vals)]
21 | custom_args = "+".join(custom_tokens)
22 | job = f"--config {base_config} --custom_args {custom_args}"
23 | job_queue.append(job)
24 |
25 | job_queue_path = f"data/job-queues/latest.txt"
26 | Path(job_queue_path).parent.mkdir(exist_ok=True, parents=True)
27 | with open(str(job_queue_path), "w") as f:
28 | f.write("\n".join(job_queue))
29 | print(f"Wrote {len(job_queue)} jobs to queue at {job_queue_path}")
30 | job_queue_path = f"data/job-queues/{Path(base_config).stem}-{timestamp}.txt"
31 | with open(str(job_queue_path), "w") as f:
32 | f.write("\n".join(job_queue))
33 | print(f"Wrote backup {len(job_queue)} jobs to queue at {job_queue_path}")
34 |
35 |
36 | def parse_grid(key_val_strs):
37 | print(f"parsing grid str: {key_val_strs}")
38 | key_val_pairs = key_val_strs.split("+")
39 | parsed = OrderedDict()
40 | for pair in key_val_pairs:
41 | key, val_str = pair.split("@")
42 | vals = []
43 | opts = [x for x in val_str.split(":")]
44 | for token in opts:
45 | if "," in token:
46 | val = [x for x in token.split(",") if x]
47 | else:
48 | val = token
49 | vals.append(val)
50 | parsed[key] = vals
51 | return parsed
52 |
53 |
54 | def main():
55 | parser = argparse.ArgumentParser()
56 | parser.add_argument('--grid', default="")
57 | parser.add_argument('--config', default="configs/msrvtt/only-i3d.json")
58 | args = parser.parse_args()
59 |
60 | grid = parse_grid(args.grid)
61 | generate_configs(
62 | grid=grid,
63 | base_config=args.config,
64 | )
65 |
66 |
67 | if __name__ == "__main__":
68 | main()
69 |
--------------------------------------------------------------------------------
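
`parse_grid` turns a grid string of the form `key@val1:val2+key2@...` into an ordered mapping, where `:` separates alternative values and `,` builds list-valued options; `generate_configs` then takes the Cartesian product and writes one `--config ... --custom_args ...` job line per combination. A quick worked example (the hyperparameter names are illustrative only):

```python
from misc.generate_exps import parse_grid

# illustrative key names; any dotted config path works the same way
grid = parse_grid("optimizer.args.lr@0.01:0.001+experts.modalities@audio,vggsound:audio")
print(grid)
# OrderedDict([('optimizer.args.lr', ['0.01', '0.001']),
#              ('experts.modalities', [['audio', 'vggsound'], 'audio'])])
# generate_configs(...) would expand this grid into 2 x 2 = 4 job lines.
```
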
/model/mil_nce_net.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, Tuple
2 |
3 | import torch
4 | from typeguard import typechecked
5 |
6 | from base import BaseModel
7 |
8 |
9 | class MNNet(BaseModel):
10 |
11 | @typechecked
12 | def __init__(
13 | self,
14 | text_dim: int,
15 | expert_dims: Dict[str, Tuple[int, int]],
16 | **_unused,
17 | ):
18 | self.text_dim = text_dim
19 | self.expert_dims = expert_dims
20 | self.modalities = list(expert_dims.keys())
21 | super().__init__()
22 | self.dummy_param = torch.nn.Parameter(torch.ones(1) * 1E-5)
23 |
24 | @typechecked
25 | def forward(
26 | self,
27 | text: torch.Tensor,
28 | ind: Dict[str, torch.Tensor],
29 | experts: Dict[str, torch.Tensor],
30 | **_unused,
31 | ):
32 | self.sanity_checks(text=text, experts=experts, ind=ind)
33 | vid_embedding = next(iter(experts.values()))
34 | vid_embedding = self.dummy_param + vid_embedding
35 | text = text.view(text.shape[0] * text.shape[1], text.shape[-1])
36 | # text = text / torch.norm(text, p=2, dim=1).reshape(-1, 1)
37 | # vid_embedding = vid_embedding / torch.norm(vid_embedding, p=2,
38 | # dim=1).reshape(-1, 1)
39 | sims = torch.matmul(text, vid_embedding.t())
40 | return {
41 | "modalities": self.modalities,
42 | "cross_view_conf_matrix": sims,
43 | "text_embds": {self.modalities[0]: text},
44 | "vid_embds": {self.modalities[0]: vid_embedding},
45 | }
46 |
47 | @typechecked
48 | def sanity_checks(
49 | self,
50 | text: torch.Tensor,
51 | ind: Dict[str, torch.Tensor],
52 | experts: Dict[str, torch.Tensor],
53 | ):
54 | msg = f"Text dim {text.shape[-1]} did not match expected {self.text_dim}"
55 | assert text.shape[-1] == self.text_dim, msg
56 | assert len(experts) == 1, "Expected single modality experts"
57 | assert len(text.shape) == 4, "Expected four axes for text input"
58 | assert text.shape[2] == 1, "Expected singleton for text input on dim 2"
59 | for expert in self.expert_dims:
60 | msg = f"Expected all features to be present for {expert}"
61 | assert ind[expert].sum() == len(ind[expert]), msg
62 | feats = experts[expert]
63 | expected = self.expert_dims[expert]
64 | msg = f"Feature shape {feats.shape[1]} did not match expected {expected}"
65 | assert feats.shape[1] == expected[-1], msg
66 |
--------------------------------------------------------------------------------
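
`MNNet` is a deliberately minimal baseline: it takes pre-computed MIL-NCE-style text and video embeddings of matching dimensionality, carries a single dummy parameter, and scores every caption against every video with a dot product. A shape-level sketch of calling it with random tensors (the sizes are arbitrary, chosen only to satisfy `sanity_checks`):

```python
import torch

from model.mil_nce_net import MNNet

batch, n_caps, feat_dim = 4, 1, 512                # arbitrary illustrative sizes
model = MNNet(text_dim=feat_dim, expert_dims={"audio": (29, feat_dim)})

text = torch.randn(batch, n_caps, 1, feat_dim)     # four axes, singleton on dim 2
experts = {"audio": torch.randn(batch, feat_dim)}  # one pre-computed video embedding
ind = {"audio": torch.ones(batch)}                 # all features marked as present

out = model(text=text, ind=ind, experts=experts)
print(out["cross_view_conf_matrix"].shape)         # torch.Size([4, 4]): captions x videos
```
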
/misc/aggregate_logs_and_stats.py:
--------------------------------------------------------------------------------
1 | """Aggregate logs across multiple seeded runs and summarise their statistics.
2 |
3 | ipy misc/aggregate_logs_and_stats.py -- --group_id 3b737e0d
4 | """
5 | import argparse
6 | import logging
7 | from pathlib import Path
8 | from collections import OrderedDict
9 | from utils.util import read_json
10 | from glob import glob
11 | from logger.log_parser import log_summary
12 |
13 |
14 | def summarise(group_id, log_dir="data/saved/log", model_dir="data/saved/models"):
15 | seeded_runs = sorted(list(Path(log_dir).glob(f"**/{group_id}/seed-*")))
16 | print(f"Found a total of {len(seeded_runs)} seed runs in {group_id}")
17 | msg = f"Found no seeded runs for group_id: {group_id} in {log_dir}"
18 | assert len(seeded_runs) > 0, msg
19 |
20 | info_logs = OrderedDict()
21 | for seeded_run in seeded_runs:
22 | info_log_matches = list(Path(seeded_run).glob("**/info.log"))
23 | msg = f"expected to find a single info.log file, found {len(info_log_matches)}"
24 | assert len(info_log_matches) == 1, msg
25 | info_logs[seeded_run.stem] = info_log_matches[0]
26 |
27 | summary_log = []
28 | for seeded_run, info_log_path in info_logs.items():
29 | with open(info_log_path, "r") as f:
30 | log = f.read().splitlines()
31 | summary_log.extend(log)
32 | first_info_log = list(info_logs.values())[0]
33 | summary_log_name = f"summary-{'_'.join(list(info_logs.keys()))}.json"
34 | summary_log_path = first_info_log.parent / summary_log_name
35 | with open(summary_log_path, "w") as f:
36 | f.write("\n".join(summary_log))
37 | print(f"Wrote concatenated logs to {summary_log_path}")
38 |
39 | # retrieve the config from the first run
40 | rel_path = first_info_log.relative_to(log_dir).parent
41 | config_path = Path(model_dir) / rel_path / "config.json"
42 | assert config_path.exists(), f"Could not find config at {config_path}"
43 | config = read_json(config_path)
44 |
45 | logger = logging.getLogger("summary")
46 |
47 | # some care is required with logging to avoid sending all experiment logs
48 | # to the same file. We avoid this by essentially resetting the logging utility
49 |
50 | # Remove all handlers associated with the root logger object
51 | for handler in logging.root.handlers[:]:
52 | logging.root.removeHandler(handler)
53 | logging.basicConfig(filename=summary_log_path, level=logging.INFO)
54 | if not logger.handlers:
55 | logger.addHandler(logging.StreamHandler())
56 |
57 | log_summary(
58 | logger=logger,
59 | log_path=summary_log_path,
60 | eval_mode=config["eval_mode"],
61 | fixed_num_epochs=config["trainer"]["epochs"],
62 | )
63 |
64 |
65 | def main():
66 | parser = argparse.ArgumentParser()
67 | parser.add_argument("--group_id", default="ed53d01d")
68 | args = parser.parse_args()
69 | summarise(group_id=args.group_id)
70 |
71 |
72 | if __name__ == '__main__':
73 | main()
74 |
--------------------------------------------------------------------------------
/misc/datasets/activity-net/README.md:
--------------------------------------------------------------------------------
1 | ## Pretrained Experts
2 |
3 | This folder contains a collection of features, extracted from the ActivityNet [2] and ActivityNet-captions [3] datasets as part of the paper:
4 | *Use what you have: Video retrieval using representations from collaborative experts*.
5 |
6 | ### Training splits
7 |
8 | The training splits were taken from [3] and are given in the files linked below:
9 |
10 | * [train_list.txt](train_list.txt) (10009 videos)
11 | * [val_1_list.txt](val_1_list.txt) (4917 videos)
12 | * [val_2_list.txt](val_2_list.txt) (4885 videos)
13 |
14 | In our work, we use the `train` split for training and the `val_1` split for evaluation (the `val_1` split forms a superset of the `val_2` split, with differing captions).
15 |
16 |
17 | **Tar contents**
18 |
19 | The compressed tar file (3.7 GiB) can be downloaded from:
20 |
21 | ```
22 | http://www.robots.ox.ac.uk/~vgg/research/collaborative-experts/data/features-v2/activity-net-experts.tar.gz
23 | sha1sum: 2901046fa6a3d6f6393ee0047818e960fcfabd69
24 | ```
25 |
26 | A list of the contents of the tar file is given in [tar_include.txt](tar_include.txt).
27 |
28 | [**Deprecated**] *The features made available with the previous code release are also available as a compressed tar file (3.8 GiB). These should be considered deprecated, since they are incompatible with the current codebase, but are still available and can be downloaded from:*
29 |
30 | ```
31 | http://www.robots.ox.ac.uk/~vgg/research/collaborative-experts/data-deprecated/features/activity-net-experts.tar.gz
32 | sha1sum: b16685576c97cdec2783fb89ea30ca7d17abb021
33 | ```
34 |
35 |
36 | ### References:
37 |
38 | [1] If you use these features, please consider citing:
39 | ```
40 | @inproceedings{Liu2019a,
41 | author = {Liu, Y. and Albanie, S. and Nagrani, A. and Zisserman, A.},
42 | booktitle = {British Machine Vision Conference},
43 | title = {Use What You Have: Video retrieval using representations from collaborative experts},
44 |   year = {2019},
45 | }
46 | ```
47 |
48 | [2] Please also consider citing the original ActivityNet dataset, which was described in:
49 |
50 | ```
51 | @inproceedings{caba2015activitynet,
52 | title={Activitynet: A large-scale video benchmark for human activity understanding},
53 | author={Caba Heilbron, Fabian and Escorcia, Victor and Ghanem, Bernard and Carlos Niebles, Juan},
54 | booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
55 | pages={961--970},
56 | year={2015}
57 | }
58 | ```
59 |
60 | [3] In addition, please consider citing the ActivityNet-captions dataset, which provides the text descriptions, and which was described in:
61 |
62 | ```
63 | @inproceedings{krishna2017dense,
64 | title={Dense-captioning events in videos},
65 | author={Krishna, Ranjay and Hata, Kenji and Ren, Frederic and Fei-Fei, Li and Carlos Niebles, Juan},
66 | booktitle={Proceedings of the IEEE international conference on computer vision},
67 | pages={706--715},
68 | year={2017}
69 | }
70 | ```
--------------------------------------------------------------------------------
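A minimal sketch (standard library only) for verifying the `sha1sum` quoted in the README above once the archive has been downloaded; the local filename is an assumption:

```python
import hashlib
from pathlib import Path

def sha1sum(path: Path, chunk_size: int = 1 << 20) -> str:
    """Return the hex SHA-1 digest of a file, reading it in chunks."""
    digest = hashlib.sha1()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

expected = "2901046fa6a3d6f6393ee0047818e960fcfabd69"  # value quoted in the README above
tar_path = Path("activity-net-experts.tar.gz")  # assumed local download location
assert sha1sum(tar_path) == expected, "checksum mismatch: re-download the archive"
```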
/misc/launch_exps_from_list.py:
--------------------------------------------------------------------------------
1 | """Launch a collection of experiments on SLURM from a text file.
2 |
3 | EXP_LIST=audio-retrieval-exps.txt
4 | ipy misc/launch_exps_from_list.py -- --exp_list "slurm/${EXP_LIST}" --yaspify
5 |
6 | """
7 | import os
8 | import sys
9 | import json
10 | import argparse
11 | from pathlib import Path
12 |
13 | from yaspi.yaspi import Yaspi
14 | from utils.util import parse_grid, filter_cmd_args
15 | from misc.aggregate_logs_and_stats import summarise
16 |
17 | def main():
18 | parser = argparse.ArgumentParser()
19 | parser.add_argument("--exp_list", default="data/job-queues/latest.txt")
20 | parser.add_argument("--yaspify", action="store_true", help="launch via slurm")
21 | parser.add_argument("--slurm", action="store_true")
22 | parser.add_argument("--limit", type=int, default=0)
23 | parser.add_argument('--mini_train', action="store_true")
24 | parser.add_argument("--use_cnodes", action="store_true")
25 | parser.add_argument('--train_single_epoch', action="store_true")
26 | parser.add_argument("--yaspi_defaults_path", type=Path,
27 | default="misc/yaspi_gpu_defaults.json")
28 | parser.add_argument("--evaluation", type=str, default='train', choices=['train', 'test'])
29 | args = parser.parse_args()
30 |
31 | # construct list of experiments from text file
32 | with open(args.exp_list, "r") as f:
33 | custom_args = f.read().splitlines()
34 | # remove blank lines
35 | custom_args = [x for x in custom_args if x]
36 |
37 | if args.limit:
38 | custom_args = custom_args[:args.limit]
39 |
40 | parsed = {}
41 | for line in custom_args:
42 | parsed.update(parse_grid(line, args.evaluation))
43 |
44 | # flatten all parsed experiments
45 | custom_args = [x for group in parsed.values() for x in group]
46 |
47 | cmd_args = sys.argv[1:]
48 | remove = ["--yaspify", "--exp_list", "--use_cnodes", "--evaluation"]
49 | cmd_args = filter_cmd_args(cmd_args, remove=remove)
50 | base_cmd = f"python {args.evaluation}.py {' '.join(cmd_args)}"
51 |
52 | if args.yaspify:
53 | with open(args.yaspi_defaults_path, "r") as f:
54 | yaspi_defaults = json.load(f)
55 | if args.use_cnodes:
56 | yaspi_defaults.update({"partition": "compute", "gpus_per_task": 0})
57 | job_name = f"{Path(args.exp_list).stem}-{len(custom_args)}-exps"
58 | job_queue = [f'"{x}"' for x in custom_args]
59 | job_queue = " ".join(job_queue)
60 | job = Yaspi(
61 | cmd=base_cmd,
62 | job_queue=job_queue,
63 | job_name=job_name,
64 | job_array_size=len(custom_args),
65 | **yaspi_defaults,
66 | )
67 | job.submit(watch=True, conserve_resources=5)
68 | else:
69 | for custom_args_ in custom_args:
70 |             cmd = f"{base_cmd} {custom_args_}"
71 |             print(f"Running cmd: {cmd}")
72 |             os.system(cmd)
73 | if args.evaluation =='train':
74 | for group_id in parsed:
75 | summarise(group_id=group_id)
76 |
77 |
78 | if __name__ == "__main__":
79 | main()
80 |
--------------------------------------------------------------------------------
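A minimal sketch of driving the launcher above from Python. The per-line format consumed by `parse_grid` is not defined in this file, so the flag names in the example rows (`--config`, `--seeds`) are hypothetical; only the one-experiment-per-line convention is taken from the script itself:

```python
import subprocess
from pathlib import Path

# Hypothetical experiment list: one experiment per (non-blank) line.
exp_list = Path("slurm/audio-retrieval-exps.txt")
exp_list.parent.mkdir(parents=True, exist_ok=True)
exp_list.write_text(
    "--config configs/audiocaps/train-vggish-vggsound.json --seeds 0,1,2\n"
    "--config configs/clotho/train-vggish-vggsound.json --seeds 0,1,2\n"
)

# Run locally; add --yaspify to submit the jobs via SLURM instead.
subprocess.run(
    ["python", "misc/launch_exps_from_list.py", "--exp_list", str(exp_list)],
    check=True,
)
```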
/logger/visualization.py:
--------------------------------------------------------------------------------
1 | import importlib
2 | from utils import Timer
3 |
4 |
5 | class TensorboardWriter():
6 | def __init__(self, log_dir, logger, enabled):
7 | self.writer = None
8 | self.selected_module = ""
9 |
10 | if enabled:
11 | log_dir = str(log_dir)
12 |
13 | # Retrieve vizualization writer
14 | succeeded = False
15 | for module in ["torch.utils.tensorboard", "tensorboardX"]:
16 | try:
17 | self.writer = importlib.import_module(module).SummaryWriter(log_dir)
18 | succeeded = True
19 | break
20 | except ImportError:
21 | succeeded = False
22 | self.selected_module = module
23 |
24 | if not succeeded:
25 |                 message = ("Warning: visualization (Tensorboard) is configured to be "
26 |                            "used, but it is not currently installed on this machine. "
27 |                            "Please either install TensorboardX with "
28 |                            "'pip install tensorboardx', upgrade PyTorch to version "
29 |                            ">= 1.1 to use 'torch.utils.tensorboard', or turn off the "
30 |                            "option in the 'config.json' file.")
31 | logger.warning(message)
32 |
33 | self.step = 0
34 | self.mode = ''
35 |
36 | self.tb_writer_ftns = {
37 | 'add_scalar', 'add_scalars', 'add_image', 'add_images', 'add_audio',
38 | 'add_text', 'add_histogram', 'add_pr_curve', 'add_embedding'
39 | }
40 | self.tag_mode_exceptions = {'add_histogram', 'add_embedding'}
41 |
42 | self.timer = Timer()
43 |
44 | def set_step(self, step, mode='train'):
45 | self.mode = mode
46 | self.step = step
47 | if step == 0:
48 | self.timer.reset()
49 | else:
50 | duration = self.timer.check()
51 | self.add_scalar('steps_per_sec', 1 / duration)
52 |
53 | def __getattr__(self, name):
54 | """
55 | If visualization is configured to use:
56 | return add_data() methods of tensorboard with additional information
57 | (step, tag) added.
58 | Otherwise:
59 | return a blank function handle that does nothing
60 | """
61 | if name in self.tb_writer_ftns:
62 | add_data = getattr(self.writer, name, None)
63 |
64 | def wrapper(tag, data, *args, **kwargs):
65 | if add_data is not None:
66 | # add mode(train/valid) tag
67 | if name not in self.tag_mode_exceptions:
68 | tag = '{}/{}'.format(tag, self.mode)
69 | add_data(tag, data, self.step, *args, **kwargs)
70 | return wrapper
71 | else:
72 | # default action for returning methods defined in this class, set_step()
73 | # for instance.
74 | try:
75 |                 attr = object.__getattribute__(self, name)
76 | except AttributeError:
77 | msg = "type object '{}' has no attribute '{}'"
78 | raise AttributeError(msg.format(self.selected_module, name))
79 | return attr
80 |
--------------------------------------------------------------------------------
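A minimal usage sketch for the writer above, assuming either `torch.utils.tensorboard` or `tensorboardX` is importable; the log directory is arbitrary:

```python
import logging
from logger.visualization import TensorboardWriter

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("demo")

writer = TensorboardWriter(log_dir="data/saved/log/demo", logger=logger, enabled=True)

for step in range(3):
    # set_step tags subsequent scalars with the current mode and logs steps_per_sec
    writer.set_step(step, mode="train")
    writer.add_scalar("loss", 1.0 / (step + 1))  # written under the tag "loss/train"
```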
/model/text_embedding_models.json:
--------------------------------------------------------------------------------
1 | {
2 | "w2v": {
3 | "weights_path": "data/text_models/GoogleNews-vectors-negative300.bin.gz",
4 | "dim": 300,
5 | "force_cpu": true,
6 | "remove_stopwords": false,
7 | "mirror": "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"
8 | },
9 | "grovle": {
10 | "dim": 300,
11 | "weights_path": "data/text_models/grovle.zip",
12 | "force_cpu": true,
13 | "remove_stopwords": false,
14 | "mirror": "http://www.robots.ox.ac.uk/~albanie/data/mirrors/GrOVLE"
15 | },
16 | "mt_grovle": {
17 | "dim": 300,
18 | "weights_path": "data/text_models/mt_grovle.zip",
19 | "force_cpu": true,
20 | "remove_stopwords": false,
21 | "mirror": "http://www.robots.ox.ac.uk/~albanie/data/mirrors/GrOVLE"
22 | },
23 | "hglmm_300d": {
24 | "dim": 300,
25 | "weights_path": "data/text_models/hglmm_300d.zip",
26 | "force_cpu": true,
27 | "remove_stopwords": false,
28 | "mirror": "http://www.robots.ox.ac.uk/~albanie/data/mirrors/GrOVLE"
29 | },
30 | "hglmm_6kd": {
31 | "dim": 6000,
32 | "weights_path": "data/text_models/hglmm_6kd.zip",
33 | "force_cpu": true,
34 | "remove_stopwords": false,
35 | "mirror": "http://www.robots.ox.ac.uk/~albanie/data/mirrors/GrOVLE"
36 | },
37 | "howto100m_mil_nce": {
38 | "word_dict_path": "data/text_models/howto100m/s3d_dict.npy",
39 | "weights_path": "data/text_models/howto100m/s3d_howto100m.pth",
40 | "dim": 512,
41 | "mirror": "https://www.rocq.inria.fr/cluster-willow/amiech/howto100m"
42 | },
43 | "openai": {
44 | "dim": 768,
45 | "custom_pipeline": true
46 | },
47 | "electra": {
48 | "dim": 256
49 | },
50 | "openai-gpt": {
51 | "dim": 768
52 | },
53 | "gpt2": {
54 | "dim": 768
55 | },
56 | "gpt2-medium": {
57 | "dim": 1024
58 | },
59 | "gpt2-large": {
60 | "dim": 1280
61 | },
62 | "gpt2-xl": {
63 | "dim": 1600
64 | },
65 | "gpt2-xl-finetune": {
66 | "dim": 1600
67 | },
68 | "bert-base-uncased": {
69 | "dim": 768
70 | },
71 | "t5-small": {
72 | "dim": 512
73 | },
74 | "t5-base": {
75 | "dim": 768
76 | },
77 | "t5-large": {
78 | "dim": 1024
79 | },
80 | "t5-3b": {
81 | "dim": 1024
82 | },
83 | "t5-11b": {
84 | "force_cpu": true,
85 | "dim": 1024
86 | },
87 | "albert-base-v2": {
88 | "dim": 768
89 | },
90 | "albert-large-v2": {
91 | "dim": 1024
92 | },
93 | "albert-xlarge-v2": {
94 | "dim": 2048
95 | },
96 | "ctrl": {
97 | "dim": 1280
98 | },
99 | "roberta-base": {
100 | "dim": 768
101 | },
102 | "roberta-large": {
103 | "dim": 1024
104 | },
105 | "xlnet-base-cased": {
106 | "dim": 768
107 | },
108 | "xlnet-large-cased": {
109 | "dim": 1024
110 | },
111 | "transfo-xl-wt103": {
112 | "dim": 1024
113 | }
114 | }
115 |
--------------------------------------------------------------------------------
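A minimal sketch of how this registry might be consumed, e.g. to look up the embedding dimensionality of a given text model:

```python
import json
from pathlib import Path

with open(Path("model/text_embedding_models.json"), "r") as f:
    text_models = json.load(f)

name = "w2v"
spec = text_models[name]
print(name, spec["dim"], spec.get("force_cpu", False))
# -> w2v 300 True
```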
/model/loss.py:
--------------------------------------------------------------------------------
1 | """This module contains an implementation of the max margin ranking loss, slightly
2 | modified from this code:
3 | https://github.com/antoine77340/Mixture-of-Embedding-Experts/blob/master/loss.py
4 |
5 | The modification is the `fix_norm` conditional, which removes zero terms from the
6 | diagonal when performing the averaging calculation.
7 |
8 | Original licence below.
9 | """
10 | # Copyright 2018 Antoine Miech All Rights Reserved.
11 | #
12 | # Licensed under the Apache License, Version 2.0 (the "License");
13 | # you may not use this file except in compliance with the License.
14 | # You may obtain a copy of the License at
15 | #
16 | # http://www.apache.org/licenses/LICENSE-2.0
17 | #
18 | # Unless required by applicable law or agreed to in writing, software
19 | # distributed under the License is distributed on an "AS-IS" BASIS,
20 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21 | # See the License for the specific language governing permissions and
22 | # limitations under the License.
23 |
24 | import torch.nn as nn
25 | import torch as th
26 | import torch.nn.functional as F
27 |
28 |
29 | class MaxMarginRankingLoss(nn.Module):
30 |
31 | def __init__(self, margin=1, fix_norm=True):
32 | super().__init__()
33 | self.fix_norm = fix_norm
34 | self.loss = th.nn.MarginRankingLoss(margin)
35 | self.margin = margin
36 |
37 | def forward(self, x):
38 | n = x.size()[0]
39 |
40 | x1 = th.diag(x)
41 | x1 = x1.unsqueeze(1)
42 | x1 = x1.expand(n, n)
43 | x1 = x1.contiguous().view(-1, 1)
44 | x1 = th.cat((x1, x1), 0)
45 |
46 | x2 = x.view(-1, 1)
47 | x3 = x.transpose(0, 1).contiguous().view(-1, 1)
48 |
49 | x2 = th.cat((x2, x3), 0)
50 | max_margin = F.relu(self.margin - (x1 - x2))
51 |
52 | if self.fix_norm:
53 | # remove the elements from the diagonal
54 | keep = th.ones(x.shape) - th.eye(x.shape[0]) # 128 x 128
55 | keep1 = keep.view(-1, 1)
56 | keep2 = keep.transpose(0, 1).contiguous().view(-1, 1)
57 | keep_idx = th.nonzero(th.cat((keep1, keep2), 0).flatten()).flatten()
58 | if x1.is_cuda:
59 | keep_idx = keep_idx.cuda()
60 | x1_ = th.index_select(x1, dim=0, index=keep_idx)
61 | x2_ = th.index_select(x2, dim=0, index=keep_idx)
62 | max_margin = F.relu(self.margin - (x1_ - x2_))
63 |
64 | return max_margin.mean()
65 |
66 |
67 | class BCEWithLogitsLoss(nn.Module):
68 |
69 | def __init__(self, weight=None):
70 | super().__init__()
71 | self.loss = th.nn.BCEWithLogitsLoss(weight=weight)
72 |
73 | def forward(self, x, target):
74 | return self.loss(x, target)
75 |
76 |
77 | class CrossEntropyLoss(nn.Module):
78 |
79 | def __init__(self, weight=None):
80 | super().__init__()
81 | self.loss = th.nn.CrossEntropyLoss(weight=weight)
82 |
83 | def forward(self, x, target):
84 | return self.loss(x, target.long().to(x.device))
85 |
86 |
87 | if __name__ == "__main__":
88 | loss = BCEWithLogitsLoss()
89 | x = th.randn(3, requires_grad=True)
90 | target = th.empty(3).random_(2)
91 | output = loss(x, target)
92 | output.backward()
93 | print(target)
94 |
--------------------------------------------------------------------------------
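A minimal usage sketch for the ranking loss above, applied to a random batch of similarity scores in which the diagonal holds the matching text-video pairs:

```python
import torch as th
from model.loss import MaxMarginRankingLoss

th.manual_seed(0)
batch_size = 8
# sims[i, j]: similarity between caption i and video j (diagonal = ground-truth pairs)
sims = th.randn(batch_size, batch_size, requires_grad=True)

criterion = MaxMarginRankingLoss(margin=1, fix_norm=True)
loss = criterion(sims)   # scalar: mean hinge violation over the off-diagonal pairs
loss.backward()
print(f"max-margin ranking loss: {loss.item():.4f}")
```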
/model/net_vlad.py:
--------------------------------------------------------------------------------
1 | """NetVLAD implementation.
2 | """
3 | # Copyright 2018 Antoine Miech All Rights Reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS-IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 |
18 | import math
19 | import ipdb
20 | import torch.nn as nn
21 | import torch.nn.functional as F
22 | import torch as th
23 |
24 |
25 | class NetVLAD(nn.Module):
26 | def __init__(self, cluster_size, feature_size, ghost_clusters=0,
27 | add_batch_norm=True):
28 | super().__init__()
29 |
30 | self.feature_size = feature_size
31 | self.cluster_size = cluster_size
32 | self.ghost_clusters = ghost_clusters
33 |
34 | init_sc = (1 / math.sqrt(feature_size))
35 | clusters = cluster_size + ghost_clusters
36 |
37 | # The `clusters` weights are the `(w,b)` in the paper
38 | self.clusters = nn.Parameter(init_sc * th.randn(feature_size, clusters))
39 | self.batch_norm = nn.BatchNorm1d(clusters) if add_batch_norm else None
40 | # The `clusters2` weights are the visual words `c_k` in the paper
41 | self.clusters2 = nn.Parameter(init_sc * th.randn(1, feature_size, cluster_size))
42 | self.out_dim = self.cluster_size * feature_size
43 |
44 | def forward(self, x, mask=None):
45 | """Aggregates feature maps into a fixed size representation. In the following
46 | notation, B = batch_size, N = num_features, K = num_clusters, D = feature_size.
47 |
48 | Args:
49 | x (th.Tensor): B x N x D
50 |
51 | Returns:
52 | (th.Tensor): B x DK
53 | """
54 | self.sanity_checks(x)
55 | max_sample = x.size()[1]
56 | x = x.view(-1, self.feature_size) # B x N x D -> BN x D
57 |
58 | if x.device != self.clusters.device:
59 | msg = f"x.device {x.device} != cluster.device {self.clusters.device}"
60 | raise ValueError(msg)
61 |
62 | assignment = th.matmul(x, self.clusters) # (BN x D) x (D x (K+G)) -> BN x (K+G)
63 |
64 | if self.batch_norm:
65 | assignment = self.batch_norm(assignment)
66 |
67 | assignment = F.softmax(assignment, dim=1) # BN x (K+G) -> BN x (K+G)
68 |         # remove ghost assignments
69 | assignment = assignment[:, :self.cluster_size]
70 | assignment = assignment.view(-1, max_sample, self.cluster_size) # -> B x N x K
71 | a_sum = th.sum(assignment, dim=1, keepdim=True) # B x N x K -> B x 1 x K
72 | a = a_sum * self.clusters2
73 |
74 | assignment = assignment.transpose(1, 2) # B x N x K -> B x K x N
75 |
76 | x = x.view(-1, max_sample, self.feature_size) # BN x D -> B x N x D
77 | vlad = th.matmul(assignment, x) # (B x K x N) x (B x N x D) -> B x K x D
78 | vlad = vlad.transpose(1, 2) # -> B x D x K
79 | vlad = vlad - a
80 |
81 | # L2 intra norm
82 | vlad = F.normalize(vlad)
83 |
84 | # flattening + L2 norm
85 | vlad = vlad.reshape(-1, self.cluster_size * self.feature_size) # -> B x DK
86 | vlad = F.normalize(vlad)
87 | return vlad # B x DK
88 |
89 | def sanity_checks(self, x):
90 | """Catch any nans in the inputs/clusters"""
91 | if th.isnan(th.sum(x)):
92 | print("nan inputs")
93 | ipdb.set_trace()
94 | if th.isnan(self.clusters[0][0]):
95 | print("nan clusters")
96 | ipdb.set_trace()
97 |
--------------------------------------------------------------------------------
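A minimal usage sketch for the module above, aggregating N local descriptors per sample into a fixed B x DK representation (eval mode is used so BatchNorm does not need multi-batch statistics):

```python
import torch as th
from model.net_vlad import NetVLAD

th.manual_seed(0)
B, N, D, K = 4, 32, 128, 8  # batch, descriptors per sample, feature dim, clusters

vlad = NetVLAD(cluster_size=K, feature_size=D, ghost_clusters=1)
vlad.eval()

x = th.randn(B, N, D)
out = vlad(x)
print(out.shape)  # torch.Size([4, 1024]) == B x (D * K)
```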
/utils/gen_ablations_for_dataset.py:
--------------------------------------------------------------------------------
1 | """Generate a set of ablations for each dataset, using the config structure of the
2 | MSRVTT experiments.
3 |
4 | ipy utils/gen_ablations_for_dataset.py -- --refresh --dest_dataset didemo \
5 | --update_ablation_list 1
6 |
7 | """
8 | import json
9 | import argparse
10 | from pathlib import Path
11 |
12 |
13 | def handle_moee_config(config):
14 | """For the official ablations on MSRVTT, we provide MoEE with the same hyperparam
15 | budget as CE and run a search to find the best hyperparams. For the unofficial
16 | ablations, we use the same padding/VLAD settings as CE.
17 | """
18 | config = {
19 | "inherit_from": config["inherit_from"],
20 | "arch": {"type": "CENet", "args": {"use_ce": ""}},
21 | }
22 | return config
23 |
24 |
25 | def remove_audio_streams(config, dest_path):
26 | """Prune audio-based features from the config and dest_path name (necessary for
27 | datasets like MSVD which do not possess sound.) If the audio feature was the control
28 | variable in the experiment, we return False for the dest_path, such that the ablation
29 | is removed altogether.
30 | """
31 | audio_tags = ["audio", "speech"]
32 | for audio_tag in audio_tags:
33 | if f"-{audio_tag}." in dest_path:
34 | return config, False
35 |
36 | dest_path = dest_path.replace(f"-{audio_tag}", "")
37 | if "experts" in config and "modalities" in config["experts"]:
38 | if audio_tag in config["experts"]["modalities"]:
39 | config["experts"]["modalities"].remove(audio_tag)
40 | return config, dest_path
41 |
42 |
43 | def main():
44 | parser = argparse.ArgumentParser()
45 | parser.add_argument('--refresh', action="store_true")
46 | parser.add_argument('--update_ablation_list', type=int, default=1)
47 | parser.add_argument('--src_dataset', default="msrvtt")
48 | parser.add_argument('--dest_dataset', default="lsmdc")
49 | parser.add_argument('--exp_list', default="slurm/msrvtt-ablations.txt")
50 | args = parser.parse_args()
51 |
52 | with open(args.exp_list, "r") as f:
53 | exps = [x for x in f.read().splitlines() if x]
54 |
55 | print(f"Found {len(exps)} experiments in {args.exp_list}")
56 | dest_exp_path = Path(args.exp_list.replace("msrvtt", args.dest_dataset))
57 | if dest_exp_path.exists() and not args.refresh:
58 | print(f"experiment list found at {dest_exp_path}, skipping...")
59 | return
60 |
61 | output_rows = []
62 | exclude = ["miech", "jsfusion"]
63 | for row in exps:
64 | flag, config_path, seed_flag, seed_opts = row.split()
65 | if any([x in config_path for x in exclude]):
66 | continue
67 | with open(config_path, "r") as f:
68 | config = json.load(f)
69 | if Path(config_path).stem == "train-full-moee":
70 | config = handle_moee_config(config)
71 | dest_path = config_path.replace(args.src_dataset, args.dest_dataset)
72 | config["inherit_from"] = config["inherit_from"].replace(args.src_dataset,
73 | args.dest_dataset)
74 | if args.dest_dataset == "msvd":
75 | config, dest_path = remove_audio_streams(config, dest_path)
76 | if not dest_path:
77 | continue
78 |
79 | print(f"writing config to {dest_path}")
80 | with open(dest_path, "w") as f:
81 | json.dump(config, f, indent=4, sort_keys=False)
82 | output_rows.append([flag, dest_path, seed_flag, seed_opts])
83 |
84 | if args.update_ablation_list:
85 | print(f"Writing new experiment list to {dest_exp_path}")
86 | output_rows = [" ".join(x) for x in output_rows]
87 | with open(dest_exp_path, "w") as f:
88 | for row in sorted(list(set(output_rows))):
89 | f.write(f"{row}\n")
90 |
91 |
92 |
93 | if __name__ == "__main__":
94 | main()
95 |
--------------------------------------------------------------------------------
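For reference, each row of the source experiment list is expected to split into four whitespace-separated tokens (the `flag, config_path, seed_flag, seed_opts` unpacking above). A sketch with hypothetical flag names illustrating how a row is rewritten for the destination dataset:

```python
# Hypothetical MSRVTT row; only the four-token structure is assumed by the script above.
row = "--config configs/msrvtt/train-full-ce.json --seeds 0,1,2"
flag, config_path, seed_flag, seed_opts = row.split()

# For --dest_dataset lsmdc, the emitted row would point at the rewritten config:
dest_path = config_path.replace("msrvtt", "lsmdc")
print(" ".join([flag, dest_path, seed_flag, seed_opts]))
# --config configs/lsmdc/train-full-ce.json --seeds 0,1,2
```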
/data_loader/QuerYDSegments_dataset.py:
--------------------------------------------------------------------------------
1 | import copy
2 | from typing import Dict, Union, List
3 | from pathlib import Path
4 |
5 | from zsvision.zs_utils import memcache, concat_features
6 | from typeguard import typechecked
7 |
8 | from utils import memory_summary
9 | from base.base_dataset import BaseDataset
10 |
11 |
12 | class QuerYDSegments(BaseDataset):
13 |
14 | @staticmethod
15 | @typechecked
16 | def dataset_paths(training_file=None) -> Dict[str, Union[str, List[str], Path, Dict]]:
17 | subset_paths = {}
18 | test_splits = {
19 | "val": "val_list.txt",
20 | "test": "test_list.txt",
21 | }
22 | for split_name, fname in test_splits.items():
23 | subset_paths[split_name] = {"train": "train_list.txt", "val": fname}
24 |
25 | feature_names = BaseDataset.common_feat_names()
26 | feature_names.append("audio.vggish.0")
27 | text_feat_paths = BaseDataset.common_text_feat_paths()
28 | text_feat_paths = {key: Path("text_embeddings") / fname
29 | for key, fname in text_feat_paths.items()}
30 | challenge_text_feat_paths = {key: f"text_embeddings/{key}.pkl"
31 | for key in text_feat_paths}
32 | custom_paths = {
33 | "audio": ["aggregated_audio/vggish-raw.hickle"],
34 | }
35 | feature_info = {
36 | "custom_paths": custom_paths,
37 | "feature_names": feature_names,
38 | "subset_list_paths": subset_paths,
39 | "text_feat_paths": text_feat_paths,
40 | "challenge_text_feat_paths": challenge_text_feat_paths,
41 | "raw_captions_path": "structured-symlinks/split_raw_captions_filtered.pkl",
42 | }
43 | return feature_info
44 |
45 | def load_features(self):
46 | root_feat = self.root_feat
47 | # import pdb; pdb.set_trace()
48 | feat_names = {key: self.visual_feat_paths(key) for key in
49 | self.paths["feature_names"]}
50 | feat_names.update(self.paths["custom_paths"])
51 | features = {}
52 | for expert, rel_names in feat_names.items():
53 | if expert not in self.ordered_experts:
54 | continue
55 | feat_paths = tuple([Path(root_feat) / rel_name for rel_name in rel_names])
56 | if len(feat_paths) == 1:
57 | features[expert] = memcache(feat_paths[0])
58 | else:
59 | # support multiple forms of feature (e.g. max and avg pooling). For
60 | # now, we only support direct concatenation
61 |                 msg = f"{expert}: Only direct concatenation of multiple feats is possible"
62 | print(f"Concatenating aggregates for {expert}....")
63 | assert self.feat_aggregation[expert]["aggregate"] == "concat", msg
64 | axis = self.feat_aggregation[expert]["aggregate-axis"]
65 | x = concat_features.cache_info() # pylint: disable=no-value-for-parameter
66 | print(f"concat cache info: {x}")
67 | features_ = concat_features(feat_paths, axis=axis)
68 | memory_summary()
69 |
70 | # Make separate feature copies for each split to allow in-place filtering
71 | features[expert] = copy.deepcopy(features_)
72 |
73 | self.features = features
74 | if self.challenge_mode:
75 | self.load_challenge_text_features()
76 | else:
77 | self.raw_captions = memcache(root_feat / self.paths["raw_captions_path"])
78 | text_feat_path = root_feat / self.paths["text_feat_paths"][self.text_feat]
79 | self.text_features = memcache(text_feat_path)
80 |
81 |
82 | # overload video paths
83 | self.video_path_retrieval = [f"videos/{x}.mp4"
84 | for x in self.partition_lists["val"]]
85 |
86 | def sanity_checks(self):
87 |         msg = (f"Expected to have a single test caption for QuerYD, since we assume "
88 |                f"that the captions are fused (but using {self.num_test_captions})")
89 | assert self.num_test_captions == 1, msg
90 |
--------------------------------------------------------------------------------
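A minimal sketch showing how the static path specification above can be inspected without instantiating the dataset; the printed keys follow directly from the `feature_info` dict returned by `dataset_paths`:

```python
from data_loader.QuerYDSegments_dataset import QuerYDSegments

feature_info = QuerYDSegments.dataset_paths()
print(sorted(feature_info.keys()))
# ['challenge_text_feat_paths', 'custom_paths', 'feature_names',
#  'raw_captions_path', 'subset_list_paths', 'text_feat_paths']
print(feature_info["custom_paths"]["audio"])
# ['aggregated_audio/vggish-raw.hickle']
```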
/exp_to_seed_time.json:
--------------------------------------------------------------------------------
1 | {"audiocaps-train-full-ce-only-audio": [["c0b5bc86", "seed-0", "2021-06-10_15-34-48"], ["c0b5bc86", "seed-1", "2021-06-10_15-36-14"], ["c0b5bc86", "seed-2", "2021-06-10_15-36-15"]], "audiocaps-train-full-ce-only-inst": [["5ee05383", "seed-0", "2021-06-10_15-32-29"], ["5ee05383", "seed-1", "2021-06-10_15-33-37"], ["5ee05383", "seed-2", "2021-06-10_15-33-51"]], "audiocaps-train-full-ce-only-r2p1d": [["88d3ab9e", "seed-0", "2021-06-10_15-30-03"], ["88d3ab9e", "seed-1", "2021-06-10_15-31-11"], ["88d3ab9e", "seed-2", "2021-06-10_15-31-32"]], "audiocaps-train-full-ce-only-scene": [["74d71d8b", "seed-0", "2021-06-10_15-27-11"], ["74d71d8b", "seed-1", "2021-06-10_15-27-40"], ["74d71d8b", "seed-2", "2021-06-10_15-29-16"]], "audiocaps-train-full-ce-r2p1d-inst": [["cf11d710", "seed-0", "2021-06-10_15-23-04"], ["cf11d710", "seed-1", "2021-06-10_15-23-25"], ["cf11d710", "seed-2", "2021-06-10_15-26-45"]], "audiocaps-train-full-ce-r2p1d-inst-vggish": [["74991f95", "seed-0", "2021-06-10_15-06-31"], ["74991f95", "seed-1", "2021-06-10_15-07-40"], ["74991f95", "seed-2", "2021-06-10_15-12-39"]], "audiocaps-train-full-ce-r2p1d-inst-vggish-vggsound": [["b51f941a", "seed-0", "2021-06-10_14-56-45"], ["b51f941a", "seed-1", "2021-06-10_14-57-08"], ["b51f941a", "seed-2", "2021-06-10_14-59-04"]], "audiocaps-train-full-ce-r2p1d-inst-vggsound": [["1b623fdc", "seed-0", "2021-06-10_14-49-00"], ["1b623fdc", "seed-1", "2021-06-10_14-49-00"], ["1b623fdc", "seed-2", "2021-06-10_14-48-59"]], "audiocaps-train-full-ce-scene-inst": [["55c40cc6", "seed-0", "2021-06-10_15-18-50"], ["55c40cc6", "seed-1", "2021-06-10_15-18-51"], ["55c40cc6", "seed-2", "2021-06-10_15-22-00"]], "audiocaps-train-full-ce-scene-r2p1d": [["b2b14107", "seed-0", "2021-06-10_15-13-04"], ["b2b14107", "seed-1", "2021-06-10_15-14-38"], ["b2b14107", "seed-2", "2021-06-10_15-17-36"]], "audiocaps-train-only-vggsound": [["afab0e0c", "seed-0", "2021-06-16_01-21-37"], ["afab0e0c", "seed-1", "2021-06-16_01-28-08"], ["afab0e0c", "seed-2", "2021-06-16_01-33-51"]], "audiocaps-train-vggish-vggsound": [["7e2eda12", "seed-0", "2021-06-09_17-06-26"], ["7e2eda12", "seed-1", "2021-06-09_17-15-12"], ["7e2eda12", "seed-2", "2021-06-09_17-24-01"]], "audiocaps-train-vggish-vggsound-moee": [["f66525f8", "seed-0", "2021-06-09_16-44-00"], ["f66525f8", "seed-1", "2021-06-09_16-51-31"], ["f66525f8", "seed-2", "2021-06-09_16-59-01"]], "audiocaps-train-vggish-vggsound-train_list_10": [["68747f8c", "seed-0", "2021-06-10_11-02-21"], ["68747f8c", "seed-1", "2021-06-10_11-07-21"], ["68747f8c", "seed-2", "2021-06-10_11-10-54"]], "audiocaps-train-vggish-vggsound-train_list_25": [["0151ad7f", "seed-0", "2021-06-10_11-14-25"], ["0151ad7f", "seed-1", "2021-06-10_11-18-48"], ["0151ad7f", "seed-2", "2021-06-10_11-23-12"]], "audiocaps-train-vggish-vggsound-train_list_50": [["4aeeaa0d", "seed-0", "2021-06-10_11-27-36"], ["4aeeaa0d", "seed-1", "2021-06-10_11-33-28"], ["4aeeaa0d", "seed-2", "2021-06-10_11-39-36"]], "audiocaps-train-vggish-vggsound-train_list_75": [["3a8d0584", "seed-0", "2021-06-10_11-45-26"], ["3a8d0584", "seed-1", "2021-06-10_11-52-47"], ["3a8d0584", "seed-2", "2021-06-10_12-00-02"]], "clotho-train-full-ce-only-audio": [["4f58ef05", "seed-0", "2021-06-10_15-38-28"], ["4f58ef05", "seed-1", "2021-06-10_15-39-02"], ["4f58ef05", "seed-2", "2021-06-10_15-39-33"]], "clotho-train-vggish-vggsound": [["dec0c820", "seed-0", "2021-06-10_14-45-51"], ["dec0c820", "seed-1", "2021-06-10_14-45-59"], ["dec0c820", "seed-2", "2021-06-10_14-46-07"]], "clotho-train-vggish-vggsound-moee": 
[["fafa3e91", "seed-0", "2021-06-10_14-44-51"], ["fafa3e91", "seed-1", "2021-06-10_14-44-51"], ["fafa3e91", "seed-2", "2021-06-10_14-44-51"]], "clotho-train-vggish-vggsound-finetuned": [["74560a6c", "seed-0", "2021-06-10_16-38-40"], ["74560a6c", "seed-1", "2021-06-10_16-39-29"], ["74560a6c", "seed-2", "2021-06-10_16-47-02"]], "clotho-train-vggish-vggsound-moee-finetuned": [["5395fa47", "seed-0", "2021-06-10_16-36-13"], ["5395fa47", "seed-1", "2021-06-10_16-37-11"], ["5395fa47", "seed-2", "2021-06-10_16-37-55"]], "querydsegments-train-full-ce-only-audio": [["70111434", "seed-0", "2021-06-10_14-33-03"], ["70111434", "seed-1", "2021-06-10_14-36-34"], ["70111434", "seed-2", "2021-06-10_14-40-01"]], "activity-net-train-full-ce-audio-only": [["f3ebaada", "seed-0", "2021-07-22_12-44-19"], ["f3ebaada", "seed-1", "2021-07-22_12-46-48"], ["f3ebaada", "seed-2", "2021-07-22_12-49-19"]]}
2 |
--------------------------------------------------------------------------------
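A minimal sketch of resolving the entries above into saved-run directories. The `data/saved/models/<exp>/<group_id>/<seed>/<timestamp>` layout mirrors the one assumed by `misc/gen_tar_lists.py`; adjust it if your save directory differs:

```python
import json
from pathlib import Path

with open("exp_to_seed_time.json", "r") as f:
    exp_to_seed_time = json.load(f)

save_dir = Path("data/saved")  # assumed save root
exp_name = "audiocaps-train-vggish-vggsound"
for group_id, seed, timestamp in exp_to_seed_time[exp_name]:
    run_dir = save_dir / "models" / exp_name / group_id / seed / timestamp
    print(run_dir / "config.json")
```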
/data_loader/QuerYD_dataset.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import itertools
3 | from pathlib import Path
4 | from typing import Dict, List, Union
5 |
6 | from base.base_dataset import BaseDataset
7 | from typeguard import typechecked
8 | from utils import memory_summary
9 | from zsvision.zs_utils import concat_features, memcache
10 |
11 |
12 | class QuerYD(BaseDataset):
13 |
14 | @staticmethod
15 | @typechecked
16 | def dataset_paths(training_file=None) -> Dict[str, Union[str, List[str], Path, Dict]]:
17 | subset_paths = {}
18 | test_splits = {
19 | "val": "val_list.txt",
20 | "test": "test_list.txt",
21 | }
22 | for split_name, fname in test_splits.items():
23 | subset_paths[split_name] = {"train": "train_list.txt", "val": fname}
24 |
25 | feature_names = BaseDataset.common_feat_names()
26 | feature_names.append("audio.vggish.0")
27 | text_feat_paths = BaseDataset.common_text_feat_paths()
28 | text_feat_paths = {key: Path("text_embeddings") / fname
29 | for key, fname in text_feat_paths.items()}
30 | challenge_text_feat_paths = {key: f"text_embeddings/{key}.pkl"
31 | for key in text_feat_paths}
32 | custom_paths = {
33 | "audio": ["aggregated_audio/vggish-raw.hickle"],
34 | }
35 | feature_info = {
36 | "custom_paths": custom_paths,
37 | "feature_names": feature_names,
38 | "subset_list_paths": subset_paths,
39 | "text_feat_paths": text_feat_paths,
40 | "challenge_text_feat_paths": challenge_text_feat_paths,
41 | "raw_captions_path": "structured-symlinks/raw_captions_combined_filtered.pkl",
42 | }
43 | return feature_info
44 |
45 | def load_features(self):
46 | root_feat = self.root_feat
47 | feat_names = {key: self.visual_feat_paths(key) for key in
48 | self.paths["feature_names"]}
49 | feat_names.update(self.paths["custom_paths"])
50 | features = {}
51 | for expert, rel_names in feat_names.items():
52 | if expert not in self.ordered_experts:
53 | continue
54 | feat_paths = tuple([Path(root_feat) / rel_name for rel_name in rel_names])
55 | if len(feat_paths) == 1:
56 | features[expert] = memcache(feat_paths[0])
57 | else:
58 | # support multiple forms of feature (e.g. max and avg pooling). For
59 | # now, we only support direct concatenation
60 |                 msg = f"{expert}: Only direct concatenation of multiple feats is possible"
61 | print(f"Concatenating aggregates for {expert}....")
62 | assert self.feat_aggregation[expert]["aggregate"] == "concat", msg
63 | axis = self.feat_aggregation[expert]["aggregate-axis"]
64 | x = concat_features.cache_info() # pylint: disable=no-value-for-parameter
65 | print(f"concat cache info: {x}")
66 | features_ = concat_features(feat_paths, axis=axis)
67 | memory_summary()
68 |
69 | # Make separate feature copies for each split to allow in-place filtering
70 | features[expert] = copy.deepcopy(features_)
71 |
72 | self.features = features
73 | if self.challenge_mode:
74 | self.load_challenge_text_features()
75 | else:
76 | self.raw_captions = memcache(root_feat / self.paths["raw_captions_path"])
77 | # keys = list(raw_captions.keys())
78 | # raw_captions_fused = {}
79 | # for key in keys:
80 | # raw_captions_fused[key] = list(itertools.chain.from_iterable(raw_captions[key]))
81 | # self.raw_captions = raw_captions_fused
82 | text_feat_path = root_feat / self.paths["text_feat_paths"][self.text_feat]
83 | self.text_features = memcache(text_feat_path)
84 |
85 |         # overload video paths used for the retrieval visualisations
86 | self.video_path_retrieval = [f"videos/{x}.mp4"
87 | for x in self.partition_lists["val"]]
88 |
89 | def sanity_checks(self):
90 |         msg = (f"Expected to have a single test caption for QuerYD, since we assume "
91 |                f"that the captions are fused (but using {self.num_test_captions})")
92 | assert self.num_test_captions == 1, msg
93 |
--------------------------------------------------------------------------------
/utils/datastructures.py:
--------------------------------------------------------------------------------
1 | """This module defines a datastructure for storing pre-computed features for datasets.
2 |
3 | It provides key-value access, but is backed by a monolithic array to prevent memory
4 | fragmentation. This can be useful for loading large feature sets into memory (e.g.
5 | those that are > 100 GiB) in a manner that minimises OOM issues.
6 | """
7 |
8 | import pickle
9 | import argparse
10 | import numpy as np
11 | import humanize
12 |
13 |
14 | class ExpertStore:
15 |
16 | def __init__(self, keylist, dim, dtype=np.float16):
17 | self.keys = keylist
18 | self.dim = dim
19 | self.store_dtype = dtype
20 | self.store = np.zeros((len(keylist), dim), dtype=dtype)
21 | self.keymap = {}
22 | self.missing = set()
23 | self.rebuild_keymap()
24 |
25 | def __setitem__(self, key, value):
26 | idx = self.keymap[key]
27 | if isinstance(value, np.ndarray):
28 | # non-nan values must be vectors of the appropriate size
29 | assert value.size == self.dim, f"cannot set value with size {value.size}"
30 | else:
31 | assert np.isnan(value)
32 | self.store[idx] = value
33 |
34 | def rebuild_keymap(self):
35 | for idx, key in enumerate(self.keys):
36 | self.keymap[key] = idx
37 |
38 | def filter_keys(self, keys, tag, allow_mismatch="", exceptions=None):
39 | keyset = set(keys)
40 | missing = keyset - set(self.keys)
41 | if exceptions is not None and missing:
42 | excluded = missing.intersection(set(exceptions))
43 | print(f"filter_keys >>> applying exceptions for {len(excluded)} videos")
44 | missing = missing - excluded
45 | print(f"filter_keys >>> {tag}")
46 | if allow_mismatch and missing:
47 | print(f"Key mismatch (missing {len(missing)}) {allow_mismatch}")
48 | else:
49 | samples = list(missing)[:3]
50 | msg = f"cannot apply filter since missing {len(missing)} keys e.g. {samples}"
51 | assert not missing, msg
52 | keep = np.array([x in keyset for x in self.keys])
53 | filtered_keys = np.array(self.keys)[keep]
54 | print(f"Filtering from {len(self.keys)} keys to {len(filtered_keys)} keys")
55 | self.keys = filtered_keys
56 | self.store = self.store[keep]
57 | self.rebuild_keymap()
58 |
59 | def __getitem__(self, key):
60 | return self.store[self.keymap[key]]
61 |
62 | def __len__(self):
63 | return len(self.keys)
64 |
65 | def __repr__(self):
66 | keep_samples = 3
67 | samples = list(self.keymap.items())[:keep_samples]
68 | sample_str = "\n".join([f"{key}: {val}" for key, val in samples])
69 | summary = (
70 | f"ExpertStore object with {len(self.keys)} features (dim: {self.dim})"
71 | f" (storage is using {humanize.naturalsize(self.store.nbytes)})"
72 | f"\nFirst {keep_samples} elements of keymap: \n{sample_str}"
73 | )
74 | return summary
75 |
76 |
77 | def gen_dict_store(keylist, dim):
78 | store = dict()
79 | for key in keylist:
80 | store[key] = np.random.rand(1, dim).astype(np.float16)
81 | return store
82 |
83 |
84 | def main():
85 | parser = argparse.ArgumentParser()
86 | parser.add_argument("--dataset", default="moments-in-time")
87 | parser.add_argument("--dim", type=int, default=2048)
88 | args = parser.parse_args()
89 |
90 | from config import get_data_paths
91 | data_paths = get_data_paths(args.dataset)
92 | relevant_path = data_paths["relevant-id-list"]
93 | with open(relevant_path, "r") as f:
94 | relevant_ids = sorted(f.read().splitlines())
95 |
96 | for store_name in "dict", "np", "expert_store":
97 | if store_name == "dict":
98 | store = gen_dict_store(keylist=relevant_ids, dim=args.dim)
99 | elif store_name == "np":
100 | store = np.random.rand(len(relevant_ids), args.dim).astype(np.float16)
101 | elif store_name == "expert_store":
102 | store = ExpertStore(keylist=relevant_ids, dim=args.dim)
103 | print(store)
104 | serialised = pickle.dumps(store)
105 | print(f"Memory needs for {store_name}: {humanize.naturalsize(len(serialised))}")
106 |
107 |
108 |
109 | if __name__ == "__main__":
110 | main()
111 |
--------------------------------------------------------------------------------
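A minimal usage sketch for the store above, showing key-value access backed by a single contiguous array:

```python
import numpy as np
from utils.datastructures import ExpertStore

keys = ["video-0001", "video-0002", "video-0003"]
store = ExpertStore(keylist=keys, dim=4, dtype=np.float16)

store["video-0002"] = np.ones(4, dtype=np.float16)  # assign one feature vector
print(store["video-0002"])  # -> [1. 1. 1. 1.]
print(len(store))           # -> 3
print(store)                # summary including memory usage of the backing array
```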
/misc/gen_tar_lists.py:
--------------------------------------------------------------------------------
1 | """
2 | ipy misc/gen_tar_lists.py -- --datasets QuerYD
3 | """
4 | import copy
5 | import json
6 | import argparse
7 | from typing import Dict, List, Tuple
8 | from pathlib import Path
9 |
10 | import tqdm
11 | from beartype import beartype
12 | from zsvision.zs_utils import load_json_config
13 | from gen_readme import dataset_paths, model_specs2path
14 |
15 |
16 | @beartype
17 | def generate_tar_lists(
18 | save_dir: Path,
19 | experiments: Dict[str, Tuple[str, str]],
20 | datasets: List[str],
21 | refresh: bool,
22 | ):
23 | all_feat_paths = {}
24 | # import pdb; pdb.set_trace()
25 | for exp_name, (group_id, timestamp) in tqdm.tqdm(experiments.items()):
26 | rel_path = Path(group_id) / "seed-0" / timestamp / "config.json"
27 | config_path = Path(save_dir) / "models" / exp_name / rel_path
28 | try:
29 | with open(config_path, "r") as f:
30 | config = json.load(f)
31 | except FileNotFoundError:
32 | rel_path = Path(group_id) / "seed-1" / timestamp / "config.json"
33 | config_path = Path(save_dir) / "models" / exp_name / rel_path
34 | with open(config_path, "r") as f:
35 | config = json.load(f)
36 |
37 | feat_aggregation = config["data_loader"]["args"]["feat_aggregation"]
38 | dataset_name = exp_name.split("-train")[0]
39 | if dataset_name not in [x.lower() for x in datasets]:
40 | continue
41 | if dataset_name not in all_feat_paths:
42 | all_feat_paths[dataset_name] = set()
43 | split_names = [config["data_loader"]["args"]["split_name"]]
44 | if "eval_settings" in config and config["eval_settings"]:
45 | test_split = config["eval_settings"]["data_loader"]["args"]["split_name"]
46 | split_names.append(test_split)
47 | keep = set(config["experts"]["modalities"])
48 | text_feat = config["experts"]["text_feat"]
49 | root_feat, paths = dataset_paths(dataset_name)
50 | modern_feat_agg = {key: val for key, val in feat_aggregation.items()
51 | if key in paths["feature_names"]}
52 | feat_paths = model_specs2path(modern_feat_agg, keep)
53 | all_feat_paths[dataset_name].update({root_feat / x for x in feat_paths})
54 | for key, feat_list in paths["custom_paths"].items():
55 | for feat_path in feat_list:
56 | all_feat_paths[dataset_name].add(root_feat / feat_path)
57 | # import pdb; pdb.set_trace()
58 | text_paths = [root_feat / paths["text_feat_paths"][text_feat]]
59 | all_feat_paths[dataset_name].update(set(text_paths))
60 | all_feat_paths[dataset_name].add(root_feat / paths["raw_captions_path"])
61 | if "dict_youtube_mapping_path" in paths:
62 | all_feat_paths[dataset_name].add(
63 | root_feat / paths["dict_youtube_mapping_path"])
64 | for split_name in split_names:
65 | split_paths = set(root_feat / x for x in
66 | paths["subset_list_paths"][split_name].values())
67 | all_feat_paths[dataset_name].update(split_paths)
68 |
69 | for dataset_name, paths in all_feat_paths.items():
70 | tar_include_list = Path("misc") / "datasets" / dataset_name / "tar_include.txt"
71 | tar_include_list.parent.mkdir(exist_ok=True, parents=True)
72 | if tar_include_list.exists() and not refresh:
73 | print(f"Found existing tar include list at {tar_include_list}, skipping...")
74 | continue
75 | with open(tar_include_list, "w") as f:
76 | for path in sorted(paths):
77 | if "aggregated_speech" not in str(path):
78 | print(f"Writing {path} to {tar_include_list}")
79 | f.write(f"{path}\n")
80 |
81 |
82 | def main():
83 | parser = argparse.ArgumentParser()
84 | parser.add_argument("--save_dir", default="data/saved", type=Path)
85 | parser.add_argument("--refresh", action="store_true")
86 | parser.add_argument("--experiments_path", default="misc/experiments.json")
87 | parser.add_argument("--target", default="main")
88 | parser.add_argument("--data_dir", type=Path, default="data")
89 | parser.add_argument("--challenge_phase", default="public_server_val",
90 | choices=["public_server_val", "public_server_test"])
91 | parser.add_argument("--datasets", nargs="+",
92 | default=["activity-net",
93 | "QuerYD", "QuerYDSegments"])
94 | args = parser.parse_args()
95 |
96 | with open(args.experiments_path, "r") as f:
97 | experiments = json.load(f)
98 |
99 | generate_tar_lists(
100 | save_dir=args.save_dir,
101 | datasets=args.datasets,
102 | experiments=experiments,
103 | refresh=args.refresh,
104 | )
105 |
106 |
107 | if __name__ == "__main__":
108 | main()
109 |
--------------------------------------------------------------------------------
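For reference, `generate_tar_lists` expects `experiments` to map each experiment name to a `(group_id, timestamp)` pair. A sketch of a hypothetical `misc/experiments.json`, using values copied from `exp_to_seed_time.json` for illustration; the script can then be pointed at this file via `--experiments_path`:

```python
import json
from pathlib import Path

# Hypothetical experiments file: experiment name -> [group_id, timestamp].
experiments = {
    "audiocaps-train-vggish-vggsound": ["7e2eda12", "2021-06-09_17-06-26"],
    "clotho-train-vggish-vggsound": ["dec0c820", "2021-06-10_14-45-51"],
}
Path("misc/experiments.json").write_text(json.dumps(experiments, indent=4))
```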
/data_loader/CLOTHO_dataset.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import itertools
3 | from pathlib import Path
4 | from typing import Dict, List, Union
5 |
6 | from base.base_dataset import BaseDataset
7 | from typeguard import typechecked
8 | from utils import memory_summary
9 | from zsvision.zs_utils import concat_features, memcache
10 |
11 |
12 | class CLOTHO(BaseDataset):
13 |
14 | @staticmethod
15 | @typechecked
16 | def dataset_paths(training_file=None) -> Dict[str, Union[str, List[str], Path, Dict]]:
17 | subset_paths = {}
18 | test_splits = {
19 | "val": "val_list.txt",
20 | "test": "test_list.txt",
21 | }
22 | for split_name, fname in test_splits.items():
23 | subset_paths[split_name] = {"train": "train_list.txt", "val": fname}
24 |
25 | feature_names = BaseDataset.common_feat_names()
26 | feature_names.append("audio.vggish.0")
27 | text_feat_paths = BaseDataset.common_text_feat_paths()
28 | text_feat_paths = {key: Path("text_embeddings") / fname
29 | for key, fname in text_feat_paths.items()}
30 | challenge_text_feat_paths = {key: f"text_embeddings/{key}.pkl"
31 | for key in text_feat_paths}
32 | custom_paths = {
33 | "audio": ["aggregated_audio/vggish-raw.hickle"],
34 | "pann": ["aggregated_pann/pann-raw.hickle"],
35 | "syncnet": ["aggregated_syncnet/syncnet-raw.hickle"],
36 | "vggsound": ["aggregated_vggsound/vggsound-raw.hickle"],
37 | # "vggsound": ["aggregated_vggsound/vggsound-avg.pickle"],
38 | "speech": ["aggregated_speech/w2v_mean.pkl"]
39 | }
40 | feature_info = {
41 | "custom_paths": custom_paths,
42 | "feature_names": feature_names,
43 | "subset_list_paths": subset_paths,
44 | "text_feat_paths": text_feat_paths,
45 | "challenge_text_feat_paths": challenge_text_feat_paths,
46 | "raw_captions_path": "structured-symlinks/raw-captions.pkl",
47 | }
48 | return feature_info
49 |
50 | def load_features(self):
51 | root_feat = self.root_feat
52 | feat_names = {key: self.visual_feat_paths(key) for key in
53 | self.paths["feature_names"]}
54 | feat_names.update(self.paths["custom_paths"])
55 | features = {}
56 | for expert, rel_names in feat_names.items():
57 | if expert not in self.ordered_experts:
58 | continue
59 | feat_paths = tuple([Path(root_feat) / rel_name for rel_name in rel_names])
60 | if len(feat_paths) == 1:
61 | features[expert] = memcache(feat_paths[0])
62 | else:
63 | # support multiple forms of feature (e.g. max and avg pooling). For
64 | # now, we only support direct concatenation
65 |                 msg = f"{expert}: Only direct concatenation of multiple feats is possible"
66 | print(f"Concatenating aggregates for {expert}....")
67 | assert self.feat_aggregation[expert]["aggregate"] == "concat", msg
68 | axis = self.feat_aggregation[expert]["aggregate-axis"]
69 | x = concat_features.cache_info() # pylint: disable=no-value-for-parameter
70 | print(f"concat cache info: {x}")
71 | features_ = concat_features(feat_paths, axis=axis)
72 | memory_summary()
73 |
74 | # if expert == "speech":
75 | # features_defaults = defaultdict(lambda: np.zeros((1, 300)))
76 | # features_defaults.update(features_)
77 | # features_ = features_defaults
78 | # Make separate feature copies for each split to allow in-place filtering
79 | features[expert] = copy.deepcopy(features_)
80 |
81 | self.features = features
82 | if self.challenge_mode:
83 | self.load_challenge_text_features()
84 | else:
85 | self.raw_captions = memcache(root_feat / self.paths["raw_captions_path"])
86 | # keys = list(raw_captions.keys())
87 | # raw_captions_fused = {}
88 | # for key in keys:
89 | # raw_captions_fused[key] = list(itertools.chain.from_iterable(raw_captions[key]))
90 | # self.raw_captions = raw_captions_fused
91 | text_feat_path = root_feat / self.paths["text_feat_paths"][self.text_feat]
92 | self.text_features = memcache(text_feat_path)
93 |
94 |         # overload video paths used for the retrieval visualisations
95 | self.video_path_retrieval = [f"videos/{x}.mp4"
96 | for x in self.partition_lists["val"]]
97 |
98 | def sanity_checks(self):
99 |         msg = (f"Expected to have a single test caption for CLOTHO, since we assume "
100 |                f"that the captions are fused (but using {self.num_test_captions})")
101 | if self.fuse_captions is True:
102 | assert self.num_test_captions == 1, msg
103 |
--------------------------------------------------------------------------------
/logger/log_parser.py:
--------------------------------------------------------------------------------
1 | import re
2 | import scipy.stats
3 | import logging
4 | import numpy as np
5 | from collections import defaultdict
6 |
7 |
8 | def log_summary(logger, log_path, eval_mode="test_run", fixed_num_epochs=None):
9 |     """Extract performance statistics from experiment log files.
10 |
11 | Args:
12 | logger (logger): reference to primary logging instance
13 | log_path (Path): the path to the log file
14 |         eval_mode (str): the method used to collect the statistics. Can be one of:
15 | `test_run`, `fixed_num_epochs` or `geometric_mean`
16 |
17 | NOTE: The `eval_mode` argument differs by dataset: for datasets which provide a
18 | validation set, we use validation set performance to complete a single test run. For
19 | datasets where no validation set is available, we aim to match prior work by either
20 | fixing the number of training epochs, or selecting directly from validation set
21 | performance (Details can be found in the supplementary material of the paper.)
22 | """
23 | with open(str(log_path), "r") as f:
24 | log = f.read().splitlines()
25 |
26 | # keep track of the random seed used for the part of the logfile being processed
27 | current_seed = None
28 |
29 | # Regex tag for finding the seed
30 | seed_tag = "Setting experiment random seed to"
31 |
32 | if eval_mode == "test_run":
33 | subset = "test"
34 | else:
35 | subset = "val"
36 |
37 | for mode in "t2v", "v2t":
38 | logger.info("")
39 | logger.info("----------------------------------------------------")
40 | logger.info(f"[{mode}] loaded log file with {len(log)} lines....")
41 | logger.info("----------------------------------------------------")
42 |
43 | # Search for the following metrics
44 | scores = {
45 | "R1": defaultdict(list),
46 | "R5": defaultdict(list),
47 | "R10": defaultdict(list),
48 | "R50": defaultdict(list),
49 | "MedR": defaultdict(list),
50 | "MeanR": defaultdict(list),
51 | }
52 |
53 | for row in log:
54 | if seed_tag in row:
55 | # Search for the log file entry describing the current random seed
56 | match = re.search(seed_tag + " (\d+)$", row) # NOQA
57 | assert len(match.groups()) == 1, "expected a single regex match"
58 | current_seed = match.groups()[0]
59 |
60 | if f"{subset}_{mode}_metrics" in row:
61 | tokens = row.split(" ")
62 | for key in scores:
63 | tag = f"{subset}_{mode}_metrics_{key}:"
64 | if tag in tokens:
65 | pos = tokens.index(tag) + 1
66 | val = tokens[pos]
67 | val = float(val)
68 | assert current_seed is not None, "failed to determine the seed"
69 | scores[key][current_seed].append(val)
70 |
71 | agg_scores = {"R1": [], "R5": [], "R10": [], "R50": [], "MedR": [], "MeanR": []}
72 |
73 | # compute the best performance for a single epoch (i.e. sharing the same model
74 | # to compute all stats)
75 | geometric_stats = defaultdict(list)
76 | best_epochs = {}
77 | if eval_mode == "geometric_mean":
78 | raise NotImplementedError("Need to fix this for new log format")
79 | consider = ["R1", "R5", "R10"]
80 | seeds = list(scores["R1"].keys())
81 | for seed in seeds:
82 | for metric, subdict in scores.items():
83 | if metric in consider:
84 | geometric_stats[seed].append(subdict[seed])
85 | gms_raw = np.array(geometric_stats[seed])
86 | geo_means = scipy.stats.mstats.gmean(gms_raw, axis=0)
87 | best_epochs[seed] = np.argmax(geo_means)
88 |
89 | for metric, subdict in scores.items():
90 | for seed, values in subdict.items():
91 | if eval_mode == "test_run":
92 | stat = values[0]
93 | elif eval_mode == "fixed_num_epochs":
94 | stat = values[fixed_num_epochs - 1]
95 | else:
96 | raise ValueError(f"unrecognised eval_mode: {eval_mode}")
97 | agg_scores[metric].append(stat)
98 |
99 | if eval_mode == "fixed_num_epochs":
100 | logger.info(f"Reporting stats with fixed training length: {fixed_num_epochs}")
101 | for metric, values in agg_scores.items():
102 | logger.info(f"{metric}: {np.mean(values):.1f}, {np.std(values, ddof=1):.1f}")
103 |
104 |
105 | if __name__ == "__main__":
106 | sample_path = "data/saved/log/audiocaps-train-vggish-vggsound/2021-04-03_11-48-50/info.log"
107 | logger_ = logging.getLogger("parser")
108 | logging.basicConfig(level=logging.INFO)
109 | log_summary(
110 | logger=logger_,
111 | log_path=sample_path,
112 | eval_mode="fixed_num_epochs",
113 | fixed_num_epochs=9,
114 | )
115 |
--------------------------------------------------------------------------------
/utils/visualizer.py:
--------------------------------------------------------------------------------
1 | """A simple HTML visualizer.
2 |
3 | It is based on the Cycle-GAN codebase:
4 | https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix
5 | """
6 | import os
7 | import numpy as np
8 | from pathlib import Path
9 | from . import util, html
10 |
11 |
12 | class Visualizer:
13 | """This class includes several functions that can display/save images.
14 |
15 | It uses a Python library 'visdom' for display, and a Python library 'dominate'
16 | (wrapped in 'HTML') for creating HTML files with images.
17 | """
18 |
19 | def __init__(self, exp_name, web_dir, src_video_dir, vis_vid_freq, num_samples=50):
20 | """Initialize the Visualizer class
21 |         Create an HTML object for saving the HTML files
22 | """
23 | self.name = exp_name
24 | self.web_dir = web_dir
25 | self.vis_vid_freq = vis_vid_freq
26 | self.img_dir = os.path.join(self.web_dir, "images")
27 | self.num_samples = num_samples
28 | print(f"create web directory {self.web_dir}...")
29 | util.mkdirs([self.web_dir, self.img_dir])
30 | src_dir = Path(src_video_dir).absolute()
31 | print(f"symlinking videos from {src_dir}...")
32 | sym_dir = (Path(self.web_dir) / "videos").absolute()
33 | if sym_dir.is_symlink():
34 | os.remove(sym_dir)
35 | sym_dir.symlink_to(src_dir)
36 |
37 | def visualize_ranking(self, sims, epoch, meta, nested_metrics):
38 | if not (self.vis_vid_freq and epoch % self.vis_vid_freq == 0):
39 | return
40 |
41 | dists = -sims
42 | np.random.seed(0)
43 | sorted_ranks = np.argsort(dists, axis=1)
44 | gt_dists = np.diag(dists)
45 | rankings = []
46 | vis_top_k = 5
47 | hide_gt = False
48 | # num_indep_samples = 1
49 | # random_seeds = np.arange(num_indep_samples)
50 | sample = np.random.choice(np.arange(dists.shape[0]), size=self.num_samples,
51 | replace=False)
52 | for ii in sample:
53 | ranked_idx = sorted_ranks[ii][:vis_top_k]
54 | gt_captions = meta["raw_captions"][ii]
55 | # if args.sample_single_gt_caption:
56 | # gt_captions = np.random.choice(gt_captions, 1).tolist()
57 |
58 | datum = {
59 | "gt-sim": -gt_dists[ii],
60 | "gt-captions": gt_captions,
61 | "gt-rank": np.where(sorted_ranks[ii] == ii)[0][0],
62 | "gt-path": meta["paths"][ii],
63 | "top-k-sims": -dists[ii][ranked_idx],
64 | "top-k-paths": np.array(meta["paths"])[ranked_idx],
65 | "hide-gt": hide_gt,
66 | }
67 | rankings.append(datum)
68 | self.display_current_results(
69 | rankings,
70 | epoch=epoch,
71 | metrics=nested_metrics["t2v_metrics"],
72 | )
73 |
74 | def display_current_results(self, rankings, epoch, metrics):
75 | """Display current results on visdom; save current results to an HTML file.
76 |
77 | Parameters:
78 | visuals (OrderedDict) - - dictionary of images to display or save
79 | epoch (int) - - the current epoch
80 | save_result (bool) - - if save the current results to an HTML file
81 | """
82 | if not Path(self.web_dir).exists():
83 | Path(self.web_dir).mkdir(exist_ok=True, parents=True)
84 | print(f"updating webpage at {self.web_dir}")
85 | title = f"Experiment name = {self.name}"
86 | refresh = True
87 | if not refresh:
88 | print("DISABLING WEB PAGE REFRESH")
89 | webpage = html.HTML(web_dir=self.web_dir, title=title, refresh=refresh)
90 |
91 | msg = f"epoch [{epoch}] - {self.name}"
92 | webpage.add_header(msg)
93 | msg = (f"R1: {metrics['R1']:.1f}, "
94 | f"R5: {metrics['R5']:.1f}, "
95 | f"R10: {metrics['R10']:.1f}, "
96 | f"MedR: {metrics['MedR']}")
97 | webpage.add_header(msg)
98 |         print(f"Top {len(rankings[0])} retrieved videos at epoch: {epoch}")
99 |
100 | for ranking in rankings:
101 | vids, txts, links = [], [], []
102 | gt_vid_path = ranking["gt-path"]
103 | gt_captions = [" ".join(x) for x in ranking["gt-captions"]]
104 |             gt_captions = "<br>".join(gt_captions)
105 | if ranking["hide-gt"]:
106 | txts.append(gt_captions)
107 | links.append("hidden")
108 | vids.append("hidden")
109 | else:
110 |                 txt = (f"{gt_captions}<br>Rank: {ranking['gt-rank']}, "
111 | f"Sim: {ranking['gt-sim']:.3f} [{Path(ranking['gt-path']).stem}]")
112 | txts.append(txt)
113 | links.append(gt_vid_path)
114 | vids.append(gt_vid_path)
115 |
116 | for idx, (vid_path, sim) in enumerate(zip(ranking["top-k-paths"],
117 | ranking["top-k-sims"])):
118 | vid_path = Path(vid_path)
119 | if ranking["hide-gt"]:
120 | txt = f"choice: {idx}"
121 | else:
122 | txt = f"Rank: {idx}, Sim: {sim:.3f}, [{Path(vid_path).stem}]"
123 | txts.append(txt)
124 | vids.append(vid_path)
125 | links.append(vid_path)
126 | webpage.add_videos(vids, txts, links, width=200)
127 | print(f"added {len(vids)} videos")
128 | webpage.save()
129 |
--------------------------------------------------------------------------------
/data_loader/ActivityNet_dataset.py:
--------------------------------------------------------------------------------
1 | import copy
2 | from typing import Dict, Union, List
3 | from pathlib import Path
4 |
5 | from zsvision.zs_utils import memcache, concat_features
6 | from typeguard import typechecked
7 |
8 | from utils import memory_summary
9 | from base.base_dataset import BaseDataset
10 |
11 |
12 | class ActivityNet(BaseDataset):
13 |
14 | @staticmethod
15 | @typechecked
16 | def dataset_paths(training_file=None) -> Dict[str, Union[str, List[str], Path, Dict]]:
17 | subset_paths = {}
18 | test_splits = {
19 | "val1": "val_1_list.txt",
20 | "val": "val_list.txt",
21 | "public_server_val": "public_server_val.txt",
22 | "public_server_test": "public_server_test.txt",
23 | }
24 | for split_name, fname in test_splits.items():
25 | if training_file is None:
26 | subset_paths[split_name] = {"train": "train_list.txt", "val": fname}
27 | else:
28 | subset_paths[split_name] = {"train": training_file, "val": fname}
29 |
30 |
31 | feature_names = BaseDataset.common_feat_names()
32 | custom_paths = {
33 | "audio": ["aggregated_audio/vggish-audio-raw.pickle"],
34 | "speech": ["aggregated_speech/goog_w2v-speech-raw.pickle"],
35 | "ocr": ["aggregated_ocr_feats/ocr-w2v.pkl"],
36 | "face": ["aggregated_facefeats_25fps_256px_stride1/face-avg.pickle"],
37 | }
38 | text_feat_paths = BaseDataset.common_text_feat_paths()
39 | text_feat_dir = Path("aggregated_text_feats")
40 |
41 | text_feat_paths = {key: text_feat_dir / fname
42 | for key, fname in text_feat_paths.items()}
43 | challenge_text_feat_paths = {}
44 | # include non-standard text features
45 | for text_feat in ("openai", ):
46 | text_feat_names = {key: f"{text_feat}-{key}"
47 | for key in {"train", "val1"}}
48 | text_feat_paths[text_feat] = {key: f"aggregated_text_feats/{val}.pkl"
49 | for key, val in text_feat_names.items()}
50 | challenge_text_feat_paths[text_feat] = \
51 | f"aggregated_text_feats/{text_feat}.pkl"
52 | feature_info = {
53 | "custom_paths": custom_paths,
54 | "feature_names": feature_names,
55 | "subset_list_paths": subset_paths,
56 | "text_feat_paths": text_feat_paths,
57 | "challenge_text_feat_paths": challenge_text_feat_paths,
58 | "raw_captions_path": "raw-captions-train-val_1.pkl",
59 | }
60 | return feature_info
61 |
62 | def load_features(self):
63 | root_feat = self.root_feat
64 | if self.distil_params is not None:
65 | self.distil_features = {}
66 | d_base_path = self.distil_params['base_path']
67 |
68 | teachers = list(map(lambda x: root_feat / Path(d_base_path + x), self.distil_params['teachers']))
69 |
70 | for i, f_name in enumerate(teachers):
71 | self.distil_features[i] = memcache(f_name)
72 |
73 | feat_names = {key: self.visual_feat_paths(key) for key in
74 | self.paths["feature_names"]}
75 | feat_names.update(self.paths["custom_paths"])
76 | features = {}
77 | for expert, rel_names in feat_names.items():
78 | if expert not in self.ordered_experts:
79 | continue
80 | feat_paths = tuple([Path(root_feat) / rel_name for rel_name in rel_names])
81 | if len(feat_paths) == 1:
82 | features[expert] = memcache(feat_paths[0])
83 | else:
84 | # support multiple forms of feature (e.g. max and avg pooling). For
85 | # now, we only support direct concatenation
86 | msg = f"{expert}: Only direct concatenation of muliple feats is possible"
87 | print(f"Concatenating aggregates for {expert}....")
88 | assert self.feat_aggregation[expert]["aggregate"] == "concat", msg
89 | axis = self.feat_aggregation[expert]["aggregate-axis"]
90 | x = concat_features.cache_info() # pylint: disable=no-value-for-parameter
91 | print(f"concat cache info: {x}")
92 | features_ = concat_features(feat_paths, axis=axis)
93 | memory_summary()
94 |
95 | # Make separate feature copies for each split to allow in-place filtering
96 | features[expert] = copy.deepcopy(features_)
97 |
98 | self.features = features
99 | if self.challenge_mode:
100 | self.load_challenge_text_features()
101 | else:
102 | text_feat_paths = self.paths["text_feat_paths"][self.text_feat]
103 | if isinstance(text_feat_paths, dict):
104 | text_features = memcache(root_feat / text_feat_paths["train"])
105 | text_features.update(memcache(
106 | root_feat / text_feat_paths[self.split_name]))
107 | elif isinstance(text_feat_paths, (Path, str)):
108 | text_features = memcache(root_feat / text_feat_paths)
109 | else:
110 | raise TypeError(f"Unexpected type {type(text_feat_paths)}")
111 | self.text_features = text_features
112 | self.raw_captions = memcache(root_feat / self.paths["raw_captions_path"])
113 |
114 | def sanity_checks(self):
115 | msg = (f"Expected to have single test caption for ANet, since we assume"
116 | f"that the captions are fused (but using {self.num_test_captions})")
117 | assert self.num_test_captions == 1, msg
118 |
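119 | # Note (added for clarity): dataset_paths returns one {"train": ..., "val": ...} pair
120 | # per evaluation split, e.g. subset_paths["val1"] pairs "train_list.txt" (or the
121 | # supplied training_file) with "val_1_list.txt".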
--------------------------------------------------------------------------------
/eval.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 | import logging
4 | import os
5 | import subprocess
6 | import tqdm
7 | import wget
8 | from collections import defaultdict
9 | from datetime import datetime
10 | from pathlib import Path
11 |
12 | import numpy as np
13 | import torch
14 | import random
15 |
16 | def extracting_log_info(log_files, experiment, logging):
17 | metrics_t2v = defaultdict(list)
18 | metrics_v2t = defaultdict(list)
19 |
20 | for file_name in log_files:
21 | output_string = f"{experiment}:\n"
22 | with open(Path("logs_eval") / file_name, 'r') as f:
23 | content_lines = f.read().splitlines()
24 | content_lines = content_lines[-14:]
25 | for line in content_lines:
26 | if 't2v' in line:
27 | metric_entry = line.split('test_t2v_metrics_')[1].split(':')[0]
28 | metrics_t2v[metric_entry].append(float(line.split('test_t2v_metrics_')[1].split(':')[1]))
29 | elif 'v2t' in line:
30 | metric_entry = line.split('test_v2t_metrics_')[1].split(':')[0]
31 | metrics_v2t[metric_entry].append(float(line.split('test_v2t_metrics_')[1].split(':')[1]))
32 | keys = list(metrics_t2v.keys())
33 |
34 | for key in keys:
35 | output_string += f"{key}_t2v: {np.mean(metrics_t2v[key]):.1f}, {np.std(metrics_t2v[key], ddof=1):.1f}\n"
36 | for key in keys:
37 | output_string += f"{key}_v2t: {np.mean(metrics_v2t[key]):.1f}, {np.std(metrics_v2t[key], ddof=1):.1f}\n"
38 | logging.info(output_string)
39 | with open(Path("logs_eval") / f"{experiment}_summary.txt", 'w') as f:
40 | f.write(output_string)
41 |
42 | def run_exp(experiments, logging):
43 | for experiment in experiments:
44 | logging.info(f"Now running {experiment}")
45 | run_one_exp(experiment, experiments, logging)
46 |
47 |
48 | def download_configs(experiment, trained_model_path, group_id, seed, timestamp):
49 | new_folder = str(trained_model_path).split('/trained_model.pth')[0]
50 | url_config = f"http://www.robots.ox.ac.uk/~vgg/research/collaborative-experts/data/models/{experiment}/{group_id}/{seed}/{timestamp}/config.json"
51 | config_path = Path(new_folder) / 'config.json'
52 | wget.download(url_config, out=str(config_path))
53 | with open(config_path, 'r') as f:
54 | config_content = json.load(f)
55 | config_content['seed'] = int(seed[-1])
56 | with open(config_path, 'w') as f:
57 | json.dump(config_content, f)
58 |
59 |
60 | def download_models(experiment, logging, trained_model_path,
61 | group_id, seed, timestamp):
62 | new_folder = str(trained_model_path).split('/trained_model.pth')[0]
63 | if os.path.exists(trained_model_path) is False:
64 | logging.info(f"Downloading model for {seed} since it does not exist on the local machine")
65 | url = f"http://www.robots.ox.ac.uk/~vgg/research/collaborative-experts/data/models/{experiment}/{group_id}/{seed}/{timestamp}/trained_model.pth"
66 | # import pdb; pdb.set_trace()
67 | Path(new_folder).mkdir(exist_ok=True, parents=True)
68 | wget.download(url, out=str(trained_model_path))
69 | else:
70 | logging.info(f"Model already downloaded for {experiment} seed {seed}")
71 | if os.path.exists(Path(new_folder) / 'config.json') is False:
72 | download_configs(experiment, trained_model_path, group_id, seed, timestamp)
73 | else:
74 | logging.info(f"Config already downloaded for {experiment} seed {seed}")
75 |
76 | def run_one_exp(experiment, experiments, logging):
77 | group_id = experiments[experiment][0]
78 |
79 | with open('exp_to_seed_time.json', 'r') as f:
80 | json_dict = json.load(f)
81 | log_files = []
82 | for (group_id, seed, timestamp) in json_dict[experiment]:
83 |
84 | group_id_path = Path("data/saved/models") / experiment / group_id
85 | logging.info("Running evaluation on existent seeds")
86 | (Path("logs_eval")).mkdir(exist_ok=True, parents=True)
87 | trained_model_path = group_id_path / seed / timestamp / 'trained_model.pth'
88 | download_models(experiment, logging, trained_model_path,
89 | group_id, seed, timestamp)
90 | config_path = group_id_path / seed / timestamp / 'config.json'
91 | cmd = f"python test.py --config {config_path} --resume {trained_model_path} --device 0 --eval_from_training_config >&1 | tee logs_eval/log_{group_id}_{seed}.txt"
92 |
93 | log_files.append(f"log_{group_id}_{seed}.txt")
94 | logging.info(cmd)
95 | subprocess.call(cmd, shell=True)
96 | logging.info("Now averaging results")
97 |
98 | extracting_log_info(log_files, experiment, logging)
99 |
100 |
101 |
102 | def main():
103 | parser = argparse.ArgumentParser()
104 | parser.add_argument("--experiments_path", default="misc/experiments-audiocaps.json")
105 | parser.add_argument("--experiment", type=str, default=None)
106 | parser.add_argument(
107 | "--data_dir",
108 | type=Path,
109 | default="data",
110 | )
111 | parser.add_argument(
112 | "--dataset",
113 | type=str,
114 | default="data",
115 | )
116 | parser.add_argument(
117 | "--refresh",
118 | action="store_true",
119 | )
120 | args = parser.parse_args()
121 | os.makedirs('logs', exist_ok=True)
122 | logging.basicConfig(filename=f"logs/{datetime.now().strftime(r'%m%d_%H%M%S')}.log",
123 | level=logging.INFO)
124 | logging.getLogger().addHandler(logging.StreamHandler())
125 | logging.info(args)
126 |
127 | with open(args.experiments_path, "r") as f:
128 | experiments = json.load(f)
129 |
130 | if args.experiment is None:
131 | run_exp(experiments, logging)
132 | else:
133 | run_one_exp(args.experiment, experiments, logging)
134 |
135 |
136 |
137 | if __name__ == "__main__":
138 | main()
139 |
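140 | # Example invocation (illustrative; the flags are those parsed in main() above):
141 | #   python eval.py --experiments_path misc/experiments-audiocaps.json --experiment EXP_NAME
142 | # where EXP_NAME is a key of the experiments json; omitting --experiment evaluates
143 | # every experiment listed in that file.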
--------------------------------------------------------------------------------
/configs/data_loader_queryd.json:
--------------------------------------------------------------------------------
1 | {
2 | "inherit_from": "configs/base_config_queryd.json",
3 | "eval_mode": "test_run",
4 | "experts": {
5 | "text_feat": "w2v",
6 | "modalities": [
7 | "imagenet.resnext101_32x48d.0",
8 | "r2p1d.r2p1d-ig65m.0",
9 | "scene.densenet161.0",
10 | "audio"
11 | ]
12 | },
13 | "arch": {
14 | "type": "CENet",
15 | "args": {
16 | "test_caption_mode": "indep",
17 | "use_ce": "pairwise",
18 | "use_mish": 1,
19 | "use_bn_reason": 1,
20 | "num_g_layers": 3,
21 | "num_h_layers": 0,
22 | "include_self": 1,
23 | "l2renorm": false,
24 | "randomise_feats": "",
25 | "vlad_clusters": {
26 | "text": 20,
27 | "audio": 16
28 | },
29 | "ghost_clusters": {
30 | "text": 1
31 | },
32 | "mimic_ce_dims": 0
33 | }
34 | },
35 | "optimizer": {
36 | "type": "Ranger",
37 | "args": {
38 | "lr": 0.01,
39 | "weight_decay": 1E-03
40 | }
41 | },
42 | "loss": {
43 | "type": "MaxMarginRankingLoss",
44 | "args": {
45 | "margin": 0.2,
46 | "fix_norm": true
47 | }
48 | },
49 | "data_loader": {
50 | "type": "ExpertDataLoader",
51 | "args":{
52 | "dataset_name": "QuerYD",
53 | "data_dir": "data/QuerYD",
54 | "root_feat_folder": "structured-symlinks",
55 | "trn_cat": 0,
56 | "batch_size": 128,
57 | "split_name": "val",
58 | "fuse_captions": true,
59 | "num_test_captions": 1,
60 | "max_tokens": {
61 | "text": 70,
62 | "audio": 500
63 | },
64 | "feat_aggregation": {
65 | "imagenet.senet154.0": {
66 | "fps": 25,
67 | "stride": 1,
68 | "pixel_dim": 256,
69 | "aggregate-axis": 1,
70 | "offset": 0,
71 | "temporal": "avg",
72 | "aggregate": "concat",
73 | "type": "embed",
74 | "feat_dims": {
75 | "embed": 2048,
76 | "logits": 1000
77 | }
78 | },
79 | "imagenet.resnext101_32x48d.0": {
80 | "fps": 25,
81 | "stride": 1,
82 | "offset": 0,
83 | "pixel_dim": 256,
84 | "temporal": "avg",
85 | "aggregate": "concat",
86 | "aggregate-axis": 1,
87 | "type": "embed",
88 | "feat_dims": {
89 | "embed": 2048,
90 | "logits": 1000
91 | }
92 | },
93 | "scene.densenet161.0": {
94 | "stride": 1,
95 | "fps": 25,
96 | "offset": 0,
97 | "temporal": "avg",
98 | "pixel_dim": 256,
99 | "aggregate": "concat",
100 | "aggregate-axis": 1,
101 | "type": "embed",
102 | "feat_dims": {
103 | "embed": 2208,
104 | "logits": 1000
105 | }
106 | },
107 | "i3d.i3d.0": {
108 | "fps": 25,
109 | "offset": 0,
110 | "stride": 25,
111 | "inner_stride": 1,
112 | "pixel_dim": 256,
113 | "temporal": "avg",
114 | "aggregate": "concat",
115 | "aggregate-axis": 1,
116 | "type": "embed",
117 | "feat_dims": {
118 | "embed": 1024,
119 | "logits": 400
120 | }
121 | },
122 | "r2p1d.r2p1d-ig65m.0": {
123 | "fps": 30,
124 | "offset": 0,
125 | "stride": 32,
126 | "inner_stride": 1,
127 | "pixel_dim": 256,
128 | "temporal": "avg",
129 | "aggregate": "concat",
130 | "aggregate-axis": 1,
131 | "type": "embed",
132 | "feat_dims": {
133 | "embed": 512,
134 | "logits": 359
135 | }
136 | },
137 | "r2p1d.r2p1d-ig65m-kinetics.0": {
138 | "fps": 30,
139 | "offset": 0,
140 | "stride": 32,
141 | "inner_stride": 1,
142 | "pixel_dim": 256,
143 | "temporal": "avg",
144 | "aggregate": "concat",
145 | "aggregate-axis": 1,
146 | "type": "embed",
147 | "feat_dims": {
148 | "embed": 512,
149 | "logits": 400
150 | }
151 | }
152 | }
153 | }
154 | },
155 | "trainer": {
156 | "epochs": 20
157 | },
158 | "eval_settings": {
159 | "data_loader": {
160 | "args": {
161 | "split_name": "test"
162 | }
163 | },
164 | "tester": {
165 | "save_dir": "data/saved/",
166 | "verbosity": 2
167 | },
168 | "disable_gpu": true
169 | },
170 | "visualizer": {
171 | "type": "Visualizer",
172 | "args":{
173 | "src_video_dir": "data/QuerYD/videos",
174 | "vis_vid_freq": 500,
175 | "num_samples": 100
176 | }
177 | }
178 | }
179 |
--------------------------------------------------------------------------------
/utils/html.py:
--------------------------------------------------------------------------------
1 | import dominate
2 | from dominate.tags import meta, h3, table, tr, td, p, a, img, br, video, source, attr
3 | from dominate.tags import span
4 | import os
5 |
6 |
7 | class HTML:
8 | """This HTML class allows us to save images and write texts into a single HTML file.
9 |
10 |     It consists of functions such as <add_header> (add a text header to the HTML file),
11 |     <add_images> (add a row of images to the HTML file), and <save> (save the HTML to the disk).
12 | It is based on Python library 'dominate', a Python library for creating and
13 | manipulating HTML documents using a DOM API.
14 | """
15 |
16 | def __init__(self, web_dir, title, refresh=0):
17 | """Initialize the HTML classes
18 |
19 | Parameters:
20 |             web_dir (str) -- a directory that stores the webpage. HTML file will be
21 |                 created at <web_dir>/index.html; images will be saved at <web_dir>/images/
22 |             title (str) -- the webpage name
23 |             refresh (int) -- how often the website refreshes itself; if 0, no refreshing
24 |         """
25 |         self.title = title
26 |         self.web_dir = web_dir
27 |         self.img_dir = os.path.join(self.web_dir, "images")
28 |         if not os.path.exists(self.web_dir):
29 |             os.makedirs(self.web_dir)
30 |         if not os.path.exists(self.img_dir):
31 |             os.makedirs(self.img_dir)
32 |
33 |         self.doc = dominate.document(title=title)
34 |         if refresh > 0:
35 | with self.doc.head:
36 | meta(http_equiv="refresh", content=str(refresh))
37 |
38 | def get_image_dir(self):
39 | """Return the directory that stores images"""
40 | return self.img_dir
41 |
42 | def add_header(self, text):
43 | """Insert a header to the HTML file
44 |
45 | Parameters:
46 | text (str) -- the header text
47 | """
48 | with self.doc:
49 | h3(text)
50 |
51 | def add_videos(self, vids, txts, links, width=400, hidden_tag="hidden"):
52 | """add images to the HTML file
53 |
54 | Parameters:
55 | vids (str list) -- a list of image paths
56 | txts (str list) -- a list of image names shown on the website
57 | links (str list) -- a list of hyperref links; when you click an image,
58 | it will redirect you to a new page
59 | """
60 | self.t = table(border=1, style="table-layout: fixed;") # Insert a table
61 | self.doc.add(self.t)
62 | colors = ["red", "blue", "gold", "salman"]
63 | with self.t:
64 | with tr():
65 | for vid, txt, link in zip(vids, txts, links):
66 | td_style = "word-wrap: break-word; width:{}px".format(width)
67 | with td(style=td_style, halign="center", valign="top"):
68 | with p():
69 | vid_path = str(vid)
70 | if vid_path == hidden_tag:
71 | p_style = "font-weight: bold; width:{}px;"
72 | p_style = p_style.format(width * 3)
73 | p("hidden video", style=p_style)
74 | else:
75 | with a(href=str(link)):
76 | with video():
77 | attr(controls="controls")
78 | source(src=vid_path, type="video/mp4")
79 | br()
80 | rows = txt.split("
")
81 | for idx, row in enumerate(rows):
82 | color = colors[idx % len(colors)]
83 | bold_tag = ""
84 | if not row.startswith(bold_tag):
85 | s_style = "color:{};".format(color)
86 | else:
87 | s_style = "color:black; font-weight: bold;"
88 | row = row[len(bold_tag):]
89 | span(row, style=s_style)
90 |
91 | def add_images(self, ims, txts, links, width=400):
92 | """add images to the HTML file
93 |
94 | Parameters:
95 | ims (str list) -- a list of image paths
96 | txts (str list) -- a list of image names shown on the website
97 | links (str list) -- a list of hyperref links; when you click an image,
98 | it will redirect you to a new page
99 | """
100 | self.t = table(border=1, style="table-layout: fixed;") # Insert a table
101 | self.doc.add(self.t)
102 | with self.t:
103 | with tr():
104 | for im, txt, link in zip(ims, txts, links):
105 | td_style = "word-wrap: break-word;"
106 | with td(style=td_style, halign="center", valign="top"):
107 | with p():
108 | with a(href=os.path.join("images", link)):
109 | img(
110 | style="width:%dpx" % width,
111 | src=os.path.join("images", im),
112 | )
113 | br()
114 | p(txt)
115 |
116 | def save(self):
117 | """save the current content to the HMTL file"""
118 | html_file = "%s/index.html" % self.web_dir
119 | f = open(html_file, "wt")
120 | f.write(self.doc.render())
121 | f.close()
122 |
123 |
124 | if __name__ == "__main__": # we show an example usage here.
125 | html = HTML("web/", "test_html")
126 | html.add_header("hello world")
127 |
128 | ims, txts, links = [], [], []
129 | for n in range(4):
130 | ims.append("image_%d.png" % n)
131 | txts.append("text_%d" % n)
132 | links.append("image_%d.png" % n)
133 | html.add_images(ims, txts, links)
134 | html.save()
135 |
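136 |     # Illustrative extension of the demo (not part of the original file): add_videos
137 |     # takes parallel lists of video paths, caption strings and links; entries equal
138 |     # to "hidden" render as a bold placeholder instead of a <video> player.
139 |     vids = ["clip_%d.mp4" % n for n in range(2)] + ["hidden"]
140 |     caps = ["Rank: %d, Sim: 0.500" % n for n in range(2)] + ["hidden choice"]
141 |     html.add_videos(vids, caps, vids, width=200)
142 |     html.save()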
--------------------------------------------------------------------------------
/configs/data_loader_querydsegments.json:
--------------------------------------------------------------------------------
1 | {
2 | "inherit_from": "configs/base_config_queryd.json",
3 | "eval_mode": "test_run",
4 | "experts": {
5 | "text_feat": "w2v",
6 | "modalities": [
7 | "imagenet.resnext101_32x48d.0",
8 | "r2p1d.r2p1d-ig65m.0",
9 | "scene.densenet161.0",
10 | "audio"
11 | ]
12 | },
13 | "arch": {
14 | "type": "CENet",
15 | "args": {
16 | "test_caption_mode": "indep",
17 | "use_ce": "pairwise",
18 | "use_mish": 1,
19 | "use_bn_reason": 1,
20 | "num_g_layers": 3,
21 | "num_h_layers": 0,
22 | "include_self": 1,
23 | "l2renorm": false,
24 | "randomise_feats": "",
25 | "vlad_clusters": {
26 | "text": 20,
27 | "audio": 16
28 | },
29 | "ghost_clusters": {
30 | "text": 1
31 | },
32 | "mimic_ce_dims": 0
33 | }
34 | },
35 | "optimizer": {
36 | "type": "Ranger",
37 | "args": {
38 | "lr": 0.01,
39 | "weight_decay": 1E-03
40 | }
41 | },
42 | "loss": {
43 | "type": "MaxMarginRankingLoss",
44 | "args": {
45 | "margin": 0.2,
46 | "fix_norm": true
47 | }
48 | },
49 | "data_loader": {
50 | "type": "ExpertDataLoader",
51 | "args":{
52 | "dataset_name": "QuerYDSegments",
53 | "data_dir": "data/QuerYDSegments",
54 | "root_feat_folder": "structured-symlinks",
55 | "trn_cat": 0,
56 | "batch_size": 128,
57 | "split_name": "val",
58 | "fuse_captions": false,
59 | "num_test_captions": 1,
60 | "max_tokens": {
61 | "text": 70,
62 | "audio": 500
63 | },
64 | "feat_aggregation": {
65 | "imagenet.senet154.0": {
66 | "fps": 25,
67 | "stride": 1,
68 | "pixel_dim": 256,
69 | "aggregate-axis": 1,
70 | "offset": 0,
71 | "temporal": "avg",
72 | "aggregate": "concat",
73 | "type": "embed",
74 | "feat_dims": {
75 | "embed": 2048,
76 | "logits": 1000
77 | }
78 | },
79 | "imagenet.resnext101_32x48d.0": {
80 | "fps": 25,
81 | "stride": 1,
82 | "offset": 0,
83 | "pixel_dim": 256,
84 | "temporal": "avg",
85 | "aggregate": "concat",
86 | "aggregate-axis": 1,
87 | "type": "embed",
88 | "feat_dims": {
89 | "embed": 2048,
90 | "logits": 1000
91 | }
92 | },
93 | "scene.densenet161.0": {
94 | "stride": 1,
95 | "fps": 25,
96 | "offset": 0,
97 | "temporal": "avg",
98 | "pixel_dim": 256,
99 | "aggregate": "concat",
100 | "aggregate-axis": 1,
101 | "type": "embed",
102 | "feat_dims": {
103 | "embed": 2208,
104 | "logits": 1000
105 | }
106 | },
107 | "i3d.i3d.0": {
108 | "fps": 25,
109 | "offset": 0,
110 | "stride": 25,
111 | "inner_stride": 1,
112 | "pixel_dim": 256,
113 | "temporal": "avg",
114 | "aggregate": "concat",
115 | "aggregate-axis": 1,
116 | "type": "embed",
117 | "feat_dims": {
118 | "embed": 1024,
119 | "logits": 400
120 | }
121 | },
122 | "r2p1d.r2p1d-ig65m.0": {
123 | "fps": 30,
124 | "offset": 0,
125 | "stride": 32,
126 | "inner_stride": 1,
127 | "pixel_dim": 256,
128 | "temporal": "avg",
129 | "aggregate": "concat",
130 | "aggregate-axis": 1,
131 | "type": "embed",
132 | "feat_dims": {
133 | "embed": 512,
134 | "logits": 359
135 | }
136 | },
137 | "r2p1d.r2p1d-ig65m-kinetics.0": {
138 | "fps": 30,
139 | "offset": 0,
140 | "stride": 32,
141 | "inner_stride": 1,
142 | "pixel_dim": 256,
143 | "temporal": "avg",
144 | "aggregate": "concat",
145 | "aggregate-axis": 1,
146 | "type": "embed",
147 | "feat_dims": {
148 | "embed": 512,
149 | "logits": 400
150 | }
151 | },
152 | "s3dg.s3dg.0": {
153 | "fps": 10,
154 | "offset": 0,
155 | "stride": 16,
156 | "num_segments": null,
157 | "pixel_dim": 256,
158 | "inner_stride": 1,
159 | "temporal": "avg",
160 | "aggregate": "concat",
161 | "aggregate-axis": 1,
162 | "type": "embed",
163 | "feat_dims": {
164 | "embed": 1024,
165 | "logits": 512
166 | }
167 | }
168 | }
169 | }
170 | },
171 | "trainer": {
172 | "epochs": 20
173 | },
174 | "eval_settings": {
175 | "data_loader": {
176 | "args": {
177 | "split_name": "test"
178 | }
179 | },
180 | "tester": {
181 | "save_dir": "data/saved/",
182 | "verbosity": 2
183 | },
184 | "disable_gpu": true
185 | }
186 | }
187 |
--------------------------------------------------------------------------------
/utils/ranger.py:
--------------------------------------------------------------------------------
1 | #Ranger deep learning optimizer - RAdam + Lookahead combined.
2 | #https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer
3 |
4 | #Ranger has now been used to capture 12 records on the FastAI leaderboard.
5 |
6 | #This version = 9.3.19
7 |
8 | #Credits:
9 | #RAdam --> https://github.com/LiyuanLucasLiu/RAdam
10 | #Lookahead --> rewritten by lessw2020, but big thanks to Github @LonePatient and @RWightman for ideas from their code.
11 | #Lookahead paper --> MZhang,G Hinton https://arxiv.org/abs/1907.08610
12 |
13 | #summary of changes:
14 | #full code integration with all updates at param level instead of group, moves slow weights into state dict (from generic weights),
15 | #supports group learning rates (thanks @SHolderbach), fixes sporadic load from saved model issues.
16 | #changes 8/31/19 - fix references to *self*.N_sma_threshold;
17 | #changed eps to 1e-5 as better default than 1e-8.
18 |
19 | import math
20 | import torch
21 | from torch.optim.optimizer import Optimizer, required
22 | import itertools as it
23 |
24 |
25 |
26 | class Ranger(Optimizer):
27 |
28 | def __init__(self, params, lr=1e-3, alpha=0.5, k=6, N_sma_threshhold=5, betas=(.95,0.999), eps=1e-5, weight_decay=0):
29 | #parameter checks
30 | if not 0.0 <= alpha <= 1.0:
31 | raise ValueError(f'Invalid slow update rate: {alpha}')
32 | if not 1 <= k:
33 | raise ValueError(f'Invalid lookahead steps: {k}')
34 | if not lr > 0:
35 | raise ValueError(f'Invalid Learning Rate: {lr}')
36 | if not eps > 0:
37 | raise ValueError(f'Invalid eps: {eps}')
38 |
39 | #parameter comments:
40 | # beta1 (momentum) of .95 seems to work better than .90...
41 | #N_sma_threshold of 5 seems better in testing than 4.
42 | #In both cases, worth testing on your dataset (.90 vs .95, 4 vs 5) to make sure which works best for you.
43 |
44 | #prep defaults and init torch.optim base
45 | defaults = dict(lr=lr, alpha=alpha, k=k, step_counter=0, betas=betas, N_sma_threshhold=N_sma_threshhold, eps=eps, weight_decay=weight_decay)
46 | super().__init__(params,defaults)
47 |
48 | #adjustable threshold
49 | self.N_sma_threshhold = N_sma_threshhold
50 |
51 | #now we can get to work...
52 | #removed as we now use step from RAdam...no need for duplicate step counting
53 | #for group in self.param_groups:
54 | # group["step_counter"] = 0
55 | #print("group step counter init")
56 |
57 | #look ahead params
58 | self.alpha = alpha
59 | self.k = k
60 |
61 | #radam buffer for state
62 | self.radam_buffer = [[None,None,None] for ind in range(10)]
63 |
64 | #self.first_run_check=0
65 |
66 | #lookahead weights
67 | #9/2/19 - lookahead param tensors have been moved to state storage.
68 | #This should resolve issues with load/save where weights were left in GPU memory from first load, slowing down future runs.
69 |
70 | #self.slow_weights = [[p.clone().detach() for p in group['params']]
71 | # for group in self.param_groups]
72 |
73 | #don't use grad for lookahead weights
74 | #for w in it.chain(*self.slow_weights):
75 | # w.requires_grad = False
76 |
77 | def __setstate__(self, state):
78 | print("set state called")
79 | super(Ranger, self).__setstate__(state)
80 |
81 |
82 | def step(self, closure=None):
83 | loss = None
84 | #note - below is commented out b/c I have other work that passes back the loss as a float, and thus not a callable closure.
85 | #Uncomment if you need to use the actual closure...
86 |
87 | #if closure is not None:
88 | #loss = closure()
89 |
90 | #Evaluate averages and grad, update param tensors
91 | for group in self.param_groups:
92 |
93 | for p in group['params']:
94 | if p.grad is None:
95 | continue
96 | grad = p.grad.data.float()
97 | if grad.is_sparse:
98 | raise RuntimeError('Ranger optimizer does not support sparse gradients')
99 |
100 | p_data_fp32 = p.data.float()
101 |
102 | state = self.state[p] #get state dict for this param
103 |
104 | if len(state) == 0: #if first time to run...init dictionary with our desired entries
105 | #if self.first_run_check==0:
106 | #self.first_run_check=1
107 | #print("Initializing slow buffer...should not see this at load from saved model!")
108 | state['step'] = 0
109 | state['exp_avg'] = torch.zeros_like(p_data_fp32)
110 | state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)
111 |
112 | #look ahead weight storage now in state dict
113 | state['slow_buffer'] = torch.empty_like(p.data)
114 | state['slow_buffer'].copy_(p.data)
115 |
116 | else:
117 | state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)
118 | state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32)
119 |
120 | #begin computations
121 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
122 | beta1, beta2 = group['betas']
123 |
124 | #compute variance mov avg
125 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
126 | #compute mean moving avg
127 | exp_avg.mul_(beta1).add_(1 - beta1, grad)
128 |
129 | state['step'] += 1
130 |
131 |
132 | buffered = self.radam_buffer[int(state['step'] % 10)]
133 | if state['step'] == buffered[0]:
134 | N_sma, step_size = buffered[1], buffered[2]
135 | else:
136 | buffered[0] = state['step']
137 | beta2_t = beta2 ** state['step']
138 | N_sma_max = 2 / (1 - beta2) - 1
139 | N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t)
140 | buffered[1] = N_sma
141 | if N_sma > self.N_sma_threshhold:
142 | step_size = math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step'])
143 | else:
144 | step_size = 1.0 / (1 - beta1 ** state['step'])
145 | buffered[2] = step_size
146 |
147 | if group['weight_decay'] != 0:
148 | p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32)
149 |
150 | if N_sma > self.N_sma_threshhold:
151 | denom = exp_avg_sq.sqrt().add_(group['eps'])
152 | p_data_fp32.addcdiv_(-step_size * group['lr'], exp_avg, denom)
153 | else:
154 | p_data_fp32.add_(-step_size * group['lr'], exp_avg)
155 |
156 | p.data.copy_(p_data_fp32)
157 |
158 | #integrated look ahead...
159 | #we do it at the param level instead of group level
160 | if state['step'] % group['k'] == 0:
161 | slow_p = state['slow_buffer'] #get access to slow param tensor
162 | slow_p.add_(self.alpha, p.data - slow_p) #(fast weights - slow weights) * alpha
163 | p.data.copy_(slow_p) #copy interpolated weights to RAdam param tensor
164 |
165 | return loss
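166 |
167 | if __name__ == "__main__":
168 |     # Illustrative construction only (not part of the original file): Ranger exposes
169 |     # the usual torch.optim interface plus the Lookahead settings alpha (slow-weight
170 |     # interpolation factor) and k (sync the slow weights every k steps).
171 |     _toy = torch.nn.Linear(8, 1)
172 |     _opt = Ranger(_toy.parameters(), lr=1e-3, alpha=0.5, k=6)
173 |     print(_opt.defaults)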
--------------------------------------------------------------------------------
/data_loader/AudioCaps_dataset.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import itertools
3 | from pathlib import Path
4 | from typing import Dict, List, Union
5 |
6 | from base.base_dataset import BaseDataset
7 | from typeguard import typechecked
8 | from utils import memory_summary
9 | from zsvision.zs_utils import concat_features, memcache
10 | import time
11 | import data_loader
12 |
13 |
14 | class AudioCaps(BaseDataset):
15 |
16 | @typechecked
17 | def __init__(self, testing_file: Union[None, str]=None, **kwargs):
18 | self.testing_file = testing_file
19 | super().__init__(**kwargs)
20 |
21 | print(f"self.testing_file: {self.testing_file}")
22 |
23 | @staticmethod
24 | @typechecked
25 | def dataset_paths(training_file=None, testing_file=None) -> Dict[str, Union[str, List[str], Path, Dict]]:
26 | subset_paths = {}
27 | # import pdb; pdb.set_trace()
28 | if testing_file is None:
29 | test_splits = {
30 | "val": "filtered_val_list.txt",
31 | "test": "final_filtered_test_list.txt",
32 | }
33 | using_testing_file = False
34 | else:
35 | test_splits = {
36 | "val": "filtered_val_list.txt",
37 | "test": testing_file,
38 | }
39 | using_testing_file = True
40 | print(f"using {testing_file}")
41 | if training_file is not None:
42 | try:
43 | val_per = training_file.split('.txt')[0].split('train_list_')[1]
44 | test_splits['val'] = f"filtered_val_list_{val_per}.txt"
45 | except IndexError:
46 | pass
47 | for split_name, fname in test_splits.items():
48 | if training_file is None:
49 | print(f"using {test_splits['test']} is {using_testing_file} split {split_name}")
50 | subset_paths[split_name] = {"train": "train_list.txt", "val": fname}
51 | print(f"using {subset_paths[split_name]['train']} and {subset_paths[split_name]['val']}")
52 | else:
53 | print(f"using {test_splits['test']} is {using_testing_file} split {split_name}")
54 | subset_paths[split_name] = {"train": training_file, "val": fname}
55 | print(f"using {subset_paths[split_name]['train']} and {subset_paths[split_name]['val']}")
56 |
57 | feature_names = BaseDataset.common_feat_names()
58 | feature_names.append("audio.vggish.0")
59 | feature_names.append("pann.pann.0")
60 | feature_names.append("syncnet.syncnet.0")
61 | feature_names.append("vggsound.vggsound.0")
62 | text_feat_paths = BaseDataset.common_text_feat_paths()
63 | text_feat_paths = {key: Path("text_embeddings") / fname
64 | for key, fname in text_feat_paths.items()}
65 | challenge_text_feat_paths = {key: f"text_embeddings/{key}.pkl"
66 | for key in text_feat_paths}
67 | custom_paths = {
68 | "audio": ["aggregated_audio/vggish-raw.hickle"],
69 | "pann": ["aggregated_pann/pann-raw.hickle"],
70 | "syncnet": ["aggregated_syncnet/syncnet-raw.hickle"],
71 | "vggsound": ["aggregated_vggsound/vggsound-raw.hickle"],
72 | "speech": ["aggregated_speech/w2v_mean.pkl"]
73 | }
74 | feature_info = {
75 | "custom_paths": custom_paths,
76 | "feature_names": feature_names,
77 | "subset_list_paths": subset_paths,
78 | "text_feat_paths": text_feat_paths,
79 | "challenge_text_feat_paths": challenge_text_feat_paths,
80 | "raw_captions_path": "structured-symlinks/raw-captions.pkl",
81 | }
82 | return feature_info
83 |
84 | def load_features(self):
85 | root_feat = self.root_feat
86 | feat_names = {key: self.visual_feat_paths(key) for key in
87 | self.paths["feature_names"]}
88 | feat_names.update(self.paths["custom_paths"])
89 | features = {}
90 | for expert, rel_names in feat_names.items():
91 | if expert not in self.ordered_experts:
92 | continue
93 | feat_paths = tuple([Path(root_feat) / rel_name for rel_name in rel_names])
94 | if len(feat_paths) == 1:
95 | features[expert] = memcache(feat_paths[0])
96 | else:
97 | # support multiple forms of feature (e.g. max and avg pooling). For
98 | # now, we only support direct concatenation
99 | msg = f"{expert}: Only direct concatenation of muliple feats is possible"
100 | print(f"Concatenating aggregates for {expert}....")
101 | assert self.feat_aggregation[expert]["aggregate"] == "concat", msg
102 | axis = self.feat_aggregation[expert]["aggregate-axis"]
103 | x = concat_features.cache_info() # pylint: disable=no-value-for-parameter
104 | print(f"concat cache info: {x}")
105 | features_ = concat_features(feat_paths, axis=axis)
106 |
107 | memory_summary()
108 |
109 | #if expert == "speech":
110 | # features_defaults = defaultdict(lambda: np.zeros((1, 300)))
111 | # features_defaults.update(features_)
112 | # features_ = features_defaults
113 | # Make separate feature copies for each split to allow in-place filtering
114 | features[expert] = copy.deepcopy(features_)
115 |
116 | self.features = features
117 | if self.challenge_mode:
118 | self.load_challenge_text_features()
119 | else:
120 | self.raw_captions = memcache(root_feat / self.paths["raw_captions_path"])
121 | # keys = list(raw_captions.keys())
122 | # raw_captions_fused = {}
123 | # for key in keys:
124 | # raw_captions_fused[key] = list(itertools.chain.from_iterable(raw_captions[key]))
125 | # self.raw_captions = raw_captions_fused
126 | text_feat_path = root_feat / self.paths["text_feat_paths"][self.text_feat]
127 | self.text_features = memcache(text_feat_path)
128 |
129 |             # overload video paths, which are structured differently for this dataset
130 | self.video_path_retrieval = [f"videos/{x}.mp4"
131 | for x in self.partition_lists["val"]]
132 |
133 | def sanity_checks(self):
134 | msg = (f"Expected to have single test caption for AudioCaps, since we assume"
135 | f"that the captions are fused (but using {self.num_test_captions})")
136 | if self.fuse_captions is True:
137 | assert self.num_test_captions == 1, msg
138 |
139 | def configure_train_test_splits(self, split_name):
140 | """Partition the datset into train/val/test splits.
141 |
142 | Args:
143 | split_name (str): the name of the split
144 | """
145 | print(f"Now working on {split_name}")
146 | # import pdb; pdb.set_trace()
147 | self.paths = type(self).dataset_paths(training_file=self.training_file, testing_file=self.testing_file)
148 | print("loading training/val splits....")
149 | tic = time.time()
150 | for subset, path in self.paths["subset_list_paths"][split_name].items():
151 | if self.challenge_mode and split_name == "public_server_test" \
152 | and subset == "val":
153 | root_feat = Path(self.challenge_test_root_feat_folder)
154 | else:
155 | root_feat = Path(self.root_feat)
156 | subset_list_path = root_feat / path
157 | if subset == "train" and self.eval_only:
158 | rows = []
159 | else:
160 | with open(subset_list_path) as f:
161 | rows = f.read().splitlines()
162 | self.partition_lists[subset] = rows
163 | print("done in {:.3f}s".format(time.time() - tic))
164 | self.split_name = split_name
165 |
--------------------------------------------------------------------------------
/data_loader/data_loaders.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import functools
3 | from pathlib import Path
4 | from typing import Dict, List, Union
5 |
6 | import torch
7 | from typeguard import typechecked
8 | from torch.utils.data import DataLoader
9 | from zsvision.zs_utils import memcache
10 |
11 | from zsvision.zs_data_structures import HashableDict, HashableOrderedDict
12 | from data_loader.ActivityNet_dataset import ActivityNet
13 | from data_loader.QuerYD_dataset import QuerYD
14 | from data_loader.QuerYDSegments_dataset import QuerYDSegments
15 | from data_loader.AudioCaps_dataset import AudioCaps
16 | from data_loader.CLOTHO_dataset import CLOTHO
17 |
18 | @functools.lru_cache(maxsize=64, typed=False)
19 | def dataset_loader(
20 | text_dropout: float,
21 | fuse_captions: bool,
22 | spatial_feats: bool,
23 | use_zeros_for_missing: bool,
24 | challenge_mode: bool,
25 | eval_only: bool,
26 | task: str,
27 | data_dir: str,
28 | text_agg: str,
29 | text_feat: str,
30 | split_name: str,
31 | dataset_name: str,
32 | cls_partition: str,
33 | root_feat_folder: str,
34 | challenge_test_root_feat_folder: str,
35 | text_dim: int,
36 | num_test_captions: int,
37 | restrict_train_captions: int,
38 | logger: logging.Logger,
39 | max_tokens: Dict[str, int],
40 | raw_input_dims: HashableOrderedDict,
41 | feat_aggregation: HashableDict,
42 | distil_params: Union[None, Dict],
43 | training_file: Union[None, str],
44 | caption_masks: Union[None, str],
45 | ce_shared_dim: Union[None, int],
46 | **args,
47 | ):
48 | print(f"refreshing cache for {dataset_name} data loader [{split_name}]")
49 | kwargs = dict(
50 | task=task,
51 | data_dir=Path(data_dir),
52 | text_dim=text_dim,
53 | logger=logger,
54 | eval_only=eval_only,
55 | text_agg=text_agg,
56 | text_feat=text_feat,
57 | max_tokens=max_tokens,
58 | split_name=split_name,
59 | cls_partition=cls_partition,
60 | spatial_feats=spatial_feats,
61 | text_dropout=text_dropout,
62 | fuse_captions=fuse_captions,
63 | raw_input_dims=raw_input_dims,
64 | challenge_mode=challenge_mode,
65 | root_feat_folder=root_feat_folder,
66 | feat_aggregation=feat_aggregation,
67 | num_test_captions=num_test_captions,
68 | use_zeros_for_missing=use_zeros_for_missing,
69 | restrict_train_captions=restrict_train_captions,
70 | challenge_test_root_feat_folder=challenge_test_root_feat_folder,
71 | distil_params=distil_params,
72 | training_file=training_file,
73 | caption_masks=caption_masks,
74 | ce_shared_dim=ce_shared_dim,
75 | **args,
76 | )
77 | if dataset_name == "ActivityNet":
78 | dataset = ActivityNet(**kwargs)
79 | elif dataset_name == "QuerYD":
80 | dataset = QuerYD(**kwargs)
81 | elif dataset_name == "QuerYDSegments":
82 | dataset = QuerYDSegments(**kwargs)
83 | elif dataset_name == "AudioCaps":
84 | dataset = AudioCaps(**kwargs)
85 | elif dataset_name == "CLOTHO":
86 | dataset = CLOTHO(**kwargs)
87 | return dataset
88 |
89 |
90 | class ExpertDataLoader:
91 |
92 | @typechecked
93 | def __init__(
94 | self,
95 | eval_only: bool,
96 | fuse_captions: bool,
97 | challenge_mode: bool,
98 | use_zeros_for_missing: bool,
99 | trn_cat: int,
100 | text_dim: int,
101 | batch_size: int,
102 | num_workers: int,
103 | num_test_captions: int,
104 | task: str,
105 | data_dir: str,
106 | text_agg: str,
107 | text_feat: str,
108 | split_name: str,
109 | dataset_name: str,
110 | root_feat_folder: str,
111 | text_dropout: float,
112 | max_tokens: Dict[str, int],
113 | raw_input_dims: Dict[str, int],
114 | feat_aggregation: Dict[str, Dict],
115 | logger: logging.Logger,
116 | spatial_feats: bool = False,
117 | restrict_train_captions: int = 0,
118 | drop_last: bool = False,
119 | refresh_lru_cache: bool = False,
120 | cls_partitions: List[str] = ["train", "val", "tiny", "challenge"],
121 | challenge_test_root_feat_folder: str = "challenge",
122 | distil_params: Union[None, Dict] = None,
123 | training_file: Union[None, str] = None,
124 | caption_masks: Union[None, str] = None,
125 | ce_shared_dim: Union[None, int] = None,
126 | **args,
127 | ):
128 |
129 | # Ensure that the dictionaries are hashable to allow use of caching
130 | raw_input_dims = HashableOrderedDict(raw_input_dims)
131 | feat_aggregation = HashableDict(feat_aggregation)
132 | if distil_params is not None:
133 | distil_params = HashableDict(distil_params)
134 | max_tokens = HashableDict(max_tokens)
135 |
136 | if refresh_lru_cache:
137 | logger.info("Explicitly refreshing dataloader and cuda cache")
138 | dataset_loader.cache_clear()
139 | torch.cuda.empty_cache()
140 | memcache.cache_clear()
141 |
142 | if trn_cat:
143 | raise NotImplementedError(f"Support for trn cat will need to be re-added")
144 |
145 | common_kwargs = dict(
146 | task=task,
147 | logger=logger,
148 | data_dir=data_dir,
149 | text_dim=text_dim,
150 | text_agg=text_agg,
151 | eval_only=eval_only,
152 | text_feat=text_feat,
153 | max_tokens=max_tokens,
154 | dataset_name=dataset_name,
155 | text_dropout=text_dropout,
156 | fuse_captions=fuse_captions,
157 | spatial_feats=spatial_feats,
158 | split_name=split_name,
159 | challenge_mode=challenge_mode,
160 | root_feat_folder=root_feat_folder,
161 | use_zeros_for_missing=use_zeros_for_missing,
162 | challenge_test_root_feat_folder=challenge_test_root_feat_folder,
163 | num_test_captions=num_test_captions,
164 | raw_input_dims=raw_input_dims,
165 | feat_aggregation=feat_aggregation,
166 | restrict_train_captions=restrict_train_captions,
167 | distil_params=distil_params,
168 | training_file=training_file,
169 | caption_masks=caption_masks,
170 | ce_shared_dim=ce_shared_dim,
171 | **args,
172 | )
173 |
174 | if "retrieval" in task:
175 | # import pdb; pdb.set_trace()
176 | dataset = dataset_loader(cls_partition="train", **common_kwargs)
177 | x = dataset_loader.cache_info() # pylint: disable=no-value-for-parameter
178 | logger.info(f"cache info {x}")
179 | self.dataloaders = {"dataset": dataset}
180 | self.dataloaders["retrieval"] = dataset.get_retrieval_data()
181 | if not eval_only:
182 | train_loader = DataLoader(
183 | dataset=dataset,
184 | batch_size=batch_size,
185 | num_workers=num_workers,
186 | collate_fn=dataset.collate_data,
187 | drop_last=drop_last,
188 | shuffle=True,
189 | )
190 | self.dataloaders["train"] = train_loader
191 | else:
192 | self.dataloaders = {}
193 | for cls_partition in cls_partitions:
194 | cls_dataset = dataset_loader(cls_partition=cls_partition, **common_kwargs)
195 | x = dataset_loader.cache_info() # pylint: disable=no-value-for-parameter
196 | logger.info(f"cache info [{cls_partition}] {x}")
197 | loader = DataLoader(
198 | dataset=cls_dataset,
199 | batch_size=batch_size,
200 | num_workers=num_workers,
201 | collate_fn=cls_dataset.collate_data,
202 | drop_last=False,
203 | shuffle=False,
204 | )
205 | self.dataloaders[cls_partition] = loader
206 |
207 | logger.info(f"Loading data loaders with {num_workers} workers")
208 | self.num_test_captions = num_test_captions
209 | self.dataset_name = dataset_name
210 |
211 | def __getitem__(self, key):
212 | return self.dataloaders[key]
213 |
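214 | # Illustrative access pattern (a sketch, not part of the original module): for
215 | # retrieval tasks the wrapper exposes loader["dataset"], loader["retrieval"] and,
216 | # unless eval_only is set, loader["train"]; other tasks expose one DataLoader per
217 | # entry of cls_partitions ("train", "val", "tiny", "challenge") via __getitem__.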
--------------------------------------------------------------------------------
/misc/datasets/queryd/val_list.txt:
--------------------------------------------------------------------------------
1 | video-RXFilHLLqPM
2 | video-epKCqDN9fBo
3 | video-HkKRouJqGCg
4 | video-Q1CQUivEths
5 | video-0P1Td5OTS-A
6 | video-X2-S3pN1pt0
7 | video-UPA3bwVVzGI
8 | video-qi2m4V21bw4
9 | video-eRT_mIpXjbs
10 | video-qjZtHyPLQCE
11 | video-je4nDvNJXsg
12 | video-_iQ-Rb1ohDU
13 | video-r1FbiXDKonk
14 | video-zXBmZLmfQZ4
15 | video-bwzLiQZDw2I
16 | video-yf9dyfeFsFg
17 | video--rgDvP39Lqw
18 | video-VssqNaBnWoM
19 | video-KdpoLklTozo
20 | video-tyxYHIcIJoc
21 | video-zRgZ3sWvnqs
22 | video-vXccpwytjL8
23 | video-YkAX7Vk3JEw
24 | video-yZyxJxR6RCA
25 | video-5z7fKiO5Uzg
26 | video-81Y1Ligkpb4
27 | video-K_P8kQg1Qq8
28 | video-T0NPYZyI7V8
29 | video-_vUG5rqC6qI
30 | video-KD9vAYYLItg
31 | video-XTW3LVp4pWA
32 | video-oVS7kHYlLBc
33 | video-DvaPRlZtfyc
34 | video-8j3NmTv9AWg
35 | video-jTXOEBHC0HY
36 | video-gBdyU1b0ADQ
37 | video-uEu6r8MkQ0o
38 | video-1qM0p24SNhc
39 | video-SFQB8hJdLZw
40 | video-V4rufe1J-Q8
41 | video-sq83Saeop9Y
42 | video-HK9vbhTTwWU
43 | video-cT_Wuzag6VU
44 | video-ahj8Vef9L24
45 | video-KIZProYn7R4
46 | video-VYOjWnS4cMY
47 | video-OINa46HeWg8
48 | video-SNTcxD_xPfk
49 | video-jhfLlamufKE
50 | video-d61MkuYttDI
51 | video-JgWgJa1NtAY
52 | video-iDnE3PV4YNc
53 | video-UTLCo4PHRAw
54 | video-jcBLKdsmpo0
55 | video-F1NR-_YqOgE
56 | video-YoMEedm1DXM
57 | video-EYmN3Sjgvts
58 | video-HodFzcJHIYI
59 | video-bJw0_Fj4PGY
60 | video-RUkMwGquH_A
61 | video-sc5aZsS0-0Y
62 | video-p4cc4LkJFjg
63 | video-ineZXLbL7s8
64 | video-GaSRAzyxLKs
65 | video-2PEvPfsNDrw
66 | video-tRWbo2x5lnA
67 | video-czTG7JxOruo
68 | video-Y-Z7LObUlwA
69 | video-RBAmLm_jYyY
70 | video-ymc30meWzfg
71 | video-sHLKy3z7HwM
72 | video-d25HklopoSs
73 | video-uqxzi_ghjgc
74 | video-v9wRrYhlRgs
75 | video-zd2xn5U6e-E
76 | video-1I8ICdOySkw
77 | video-RcM0NG2Fuxo
78 | video-CBlaiBV_yJs
79 | video-_tKp2eARy3o
80 | video-I8ZvdblLcnk
81 | video-Gu0wmiIngAw
82 | video-s9jX0S7mvB8
83 | video-cX03usETYI0
84 | video-4FHckDWnDKI
85 | video-pYekAIt9wW8
86 | video-LiK2fhOY0nE
87 | video-ndA4YL-bBAQ
88 | video-uQp0Eihw2WA
89 | video-A5WeiYHnvNY
90 | video-b6yYd6Pq7Ic
91 | video-YTszmB9fqEs
92 | video-Bk4MR0IItiQ
93 | video-OsNH9Tm-A04
94 | video-GbycvPwr1Wg
95 | video--FcsIyqJDzc
96 | video-9pX1hxYW3YY
97 | video-gupNRww6vFc
98 | video-oJsYwehp_r4
99 | video-37aUB92yvHI
100 | video-a5V6gdu5ih8
101 | video-AWjBNSshF3s
102 | video-s_RV4Btuv2c
103 | video-sOnqjkJTMaA
104 | video-aLackFf0Zjw
105 | video-On7TvTDOyMQ
106 | video-LvigW4InYyk
107 | video-3yd_1z6OsrE
108 | video-pU0GSbe6r_4
109 | video-x9FzWnWW95U
110 | video-2y1QQWNZxZM
111 | video-beECZjCRLmQ
112 | video-hseWMRV3lA8
113 | video-LSfJQkA-bKE
114 | video-uwCbe2yBqTI
115 | video-kBeggSzwKQ4
116 | video-paXvS0cnQM4
117 | video-_zOX6BO2zjc
118 | video-vI_B7dtF7Q0
119 | video-ka8-Nefp_gk
120 | video-dsf_z4urc4s
121 | video-xrlgfC0SJ9g
122 | video-KzWYP4LsaJw
123 | video-FTL9gQ0pux0
124 | video-c5gLf3_SK3Q
125 | video-DfPMxdHZKsw
126 | video-Dev6T3ZCrY8
127 | video-vU6Ay6yvaLo
128 | video-baM917Zy04A
129 | video-5jjeIH8Y6XM
130 | video-7N98N0GkGjY
131 | video-DCM-sEpyh1Q
132 | video-3JNLwlcPBPI
133 | video-1Ez6dw3ywcc
134 | video-yhofIxEfld0
135 | video-Cdpf1Dl5b_4
136 | video-2DHYhZNHtck
137 | video-L3MtFGWRXAA
138 | video--CR65sS1Frw
139 | video-HJENMThDg0k
140 | video-nIs4S9YDPRs
141 | video-5OBvbyAQ68g
142 | video-ih3CbjixhoI
143 | video-PV34pW-53Os
144 | video-XOqqP5Ww9lE
145 | video-SQBGJr8THGk
146 | video-YlrGp1YxMrc
147 | video-6US4AyvEO_A
148 | video-RF99-5G-Hrk
149 | video-JuKCOthud68
150 | video-Co4dLH29PvM
151 | video-3yQUzU8c4us
152 | video-Y8ZWX2NP3i0
153 | video-xqEqAQadKqE
154 | video-ghg2AP3i5TI
155 | video-jLeZc7li5HM
156 | video-RaRpFuSLyPI
157 | video-T7o0KMXccEA
158 | video-TdUsyXQ8Wrs
159 | video-zYX7iexkODw
160 | video-dkjbMoj0JY4
161 | video-mOaRH-aVFb4
162 | video-wekSrZ-d1bM
163 | video-GyTxtJ4gVLE
164 | video-4GfBVEoxStA
165 | video-7VS1wPeWqAA
166 | video-skKUzMST92g
167 | video-pzAZnOyMTI4
168 | video-J3iVxb8cwOU
169 | video-OZd9jf5nV7I
170 | video-xrhkfADEtMU
171 | video-CRxshNHF98U
172 | video-7WhJ2L5xUqI
173 | video-ClW-SQ7GdiM
174 | video-NjyWl-Bz6Q8
175 | video-rCZ3SN65kIs
176 | video-osP9iJjvlAE
177 | video-30qOijVBS7o
178 | video-63fcAemH_wg
179 | video-MsOzAbUt8n8
180 | video-ndhwbt9OQ0Q
181 | video-WkxE_Fs_mHI
182 | video-i4eADcCnFjo
183 | video--cf_-i_gCdY
184 | video-NB69vdkxn4Y
185 | video-WxXiQqul4io
186 | video-nojC6fP56VI
187 | video-R3qJ-u4b5W4
188 | video-r1AOXI0eBL8
189 | video-oM_M_d9OiHc
190 | video-U5oHhI_GmJs
191 | video-KAyVk_sH42k
192 | video-ezX-a1FT_ns
193 | video-GNZBSZD16cY
194 | video-eWXOurnVTYg
195 | video-sdbHXKlpPAM
196 | video-vfCddWB_Jlw
197 | video-DexH4oCXw-Y
198 | video-fmOaYJ4K09k
199 | video-duF40iZq464
200 | video-z5UScMQUO6Q
201 | video-KHiR4qVpcG8
202 | video-zFCNUW0TfqE
203 | video-Ujg7vcIa7kM
204 | video-yM6UU6QTt4M
205 | video-phQDinMbmic
206 | video-r5L7Iokg5RY
207 | video-DIBw9dSVKdU
208 | video-_La7IMssNOA
209 | video-7Fjt-mlIlTY
210 | video-EpcDZbXslfw
211 | video-prZuZlP4Pqo
212 | video-ndh11VDx_J8
213 | video-X2niZRgGZ7E
214 | video-8jaxiha8-rY
215 | video-C7uAB94aRrQ
216 | video-AoKlbyqbEGM
217 | video-Mv1FKi_-A1I
218 | video-dx0-pNkwOv8
219 | video-KWNEc1Igadg
220 | video-Hf_2ilitep0
221 | video-F5jNkpjGh8A
222 | video-VsY834tcKw8
223 | video-mG4Y2Snygfk
224 | video-vxjW8sfUCCU
225 | video-yCXSgVFsQnQ
226 | video-KYazqIHYqNI
227 | video-5lgHJB1lwYs
228 | video-GPQwSEzXBXU
229 | video-kg-EEBIe7Lk
230 | video-_YoeHOTJBI4
231 | video-yonJuvlA34U
232 | video-kshxs2WBjmE
233 | video-UUlaseGrkLc
234 | video-LlndnhlJnIw
235 | video-4o-qnznd10Y
236 | video-q3wJ32w4s_A
237 | video-qCNodlSc6Hw
238 | video-ml9EdqgtVfU
239 | video-83Wu5xmstn4
240 | video-mrA0oL6wLQA
241 | video-WbmXpHfabuQ
242 | video-Suv9QImeAog
243 | video-jF3I7VpfCEU
244 | video-IYlOZxb0ViI
245 | video-oY6tCnu-1Do
246 | video-fmnUwvZAMjQ
247 | video-g2wsNw07wRY
248 | video-L9KC5W7A2yI
249 | video-db3Ep-jM6ZE
250 | video-9NvPlA3G53I
251 | video-uGl4PRmhRxk
252 | video-UZO5q0B5wfw
253 | video-KItqbZXlrdY
254 | video-W4qMyGXTcsg
255 | video-pIyTWg9oV0M
256 | video-9_5wHw6l11o
257 | video-Juj026QZGDo
258 | video-25lxyul1lb0
259 | video-V-AFUpW3oNg
260 | video-W-xuEJVEraw
261 | video-tFf6pt9HOq8
262 | video-FlYf0F1fuTc
263 | video-Xl6yy6a3emw
264 | video-Ta9K22D0o5Q
265 | video-xA9uSxI36Ik
266 | video-MqIJKnUkGLY
267 | video-L-LE-j2zkCU
268 | video-ZdzD897w11s
269 | video-s0GtT-vN33I
270 | video-fWDaRN490BI
271 | video-XbxRqlHtKUE
272 | video-S0zfR9DTwSY
273 | video-p-_UlScFrQ8
274 | video-JMKHbmwltWQ
275 | video-g-uBt1SoCRQ
276 | video-2b7aoZsavu0
277 | video-A0Wk08f8mUU
278 | video-a7CpzJ-sNl8
279 | video-B1yJuGQOUaY
280 | video-gryenlQKTbE
281 | video-txqiwrbYGrs
282 | video-k-Z8xxygd2Y
283 | video-Qo95rTt9ikU
284 | video-xiSIQzwIPzQ
285 | video-ZtCZGwLH5_o
286 | video-ghLkwSlWSXw
287 | video-Ui7jeZSsgFs
288 | video-rVu1oVDRLgE
289 | video-hDjzdFXmH4g
290 | video-f8PXvqYpGCM
291 | video-fAr-ZYq4RmU
292 | video-6MW7bkk3MZ0
293 | video-tNeYTDLZUJA
294 | video-b7cSIiKxEt4
295 | video-_6PNGyfwjTA
296 | video-pGQd630EpLU
297 | video-ZGc06DUIpaA
298 | video-OKaD4EcmZO8
299 | video-_F6h0yH7EyU
300 | video--A8EERSVAdk
301 | video-II5UsqP2JAk
302 | video-60kjpwyQhqg
303 | video-GR2o6k8aPlI
304 | video-vmzlLFAf3M4
305 | video-mm3rTwAxH20
306 | video-9lZlt-SlABw
307 | video-Yz9u-oG3BgM
308 | video-TuE6z8X-rGg
309 | video-QeYISW-Jplw
310 | video-C6a9AQY_srk
311 | video-aWOCk_57xj8
312 | video-VUs6l9p34v8
313 | video-MdEV-jWMGWw
314 | video-T0rrS51ry2s
315 | video-ODDZPV-Avfw
316 | video-g8Ir5rQJeX0
317 | video-x1kQ-38-Drg
318 | video-T47vNsMjjn8
319 | video-vT-naHyejL8
320 | video-UTO0ogdNMdY
321 | video-xxuQnCg9ML0
322 | video-VfDWQG47pAQ
323 | video-ebyf6FzKEiI
324 | video-6XfdHj11-N0
325 | video-lLYp4b_p_wg
326 | video-RxtzQg57O7w
327 | video-qDMMRpSmzq4
328 | video-_4BT6iLtzUM
329 | video-F6j0EbS7skc
330 | video-0LHxvxdRnYc
331 | video-BDnXdeoiYRE
332 | video-vbuq7w3ZDUQ
333 | video-uE74-8YAV9E
334 | video-6ri6gK4FcnE
335 | video-pRfZmKKX2c4
336 | video-bln98NpCLQA
337 | video-gU2vD-FewGM
338 | video-w17iS0AJHjU
339 | video-6yd2tv4Ni4c
340 | video-GSvbZyWXsME
341 | video-t2mU6USTBRE
342 | video-ccgW5CHFg0Y
343 | video-G0E31788Nfg
344 | video-S7VYhBwbprE
345 | video-xSz0zs0v6e8
346 | video-V7QYLEusDU4
347 | video-mwm0OwqWvF4
348 | video-OxrBik16Hzg
349 | video-63d0vV0kk_Y
350 | video-74v22joL7J4
351 | video-RFinNxS5KN4
352 | video-xBD_s0RhUko
353 | video-M1GO1X09Gec
354 | video-LkjuS_tzmIE
355 | video-AB2oAgjjt3g
356 | video-4yG8caPPY1Y
357 | video-4cR7tNWsuNM
358 | video-ztim_RY82G8
359 | video-HF0Ev_skUAY
360 | video-nGkUzdNi_gs
361 | video-TzZuNeRdFIc
362 | video-8zkVKHy1hyM
363 | video-VjOSxus84WY
364 | video-kupuUVYxZxU
365 | video-khvaIwonxUk
366 | video-c_Ex_qS5Djo
367 | video--W37TDK6dBM
368 | video-cN6uZkmGSLM
369 | video-k431Cy2-kkA
370 | video-JCOqo88eW1E
371 | video-8Fu5pKcrTZI
372 | video-_Z2iurLVDEQ
373 | video-zC2G6lf9fCs
374 | video-V3wLiAmIrGk
375 | video-g_QG77WomHo
376 | video-KLzYvzQbBLI
377 | video-QYsg3rtT79o
378 | video-Q8fUy8qwV3M
379 | video-__5k7e0f3r4
380 | video-GRi80V8ire8
381 | video-ckEoLBiE3Xs
382 | video-24f0OwnZE-Q
383 | video-1WpStml5fe8
384 | video-UJfqp1dmJ3I
385 |
--------------------------------------------------------------------------------
/utils/radam.py:
--------------------------------------------------------------------------------
1 | import math
2 | import torch
3 | from torch.optim.optimizer import Optimizer, required
4 |
5 | class RAdam(Optimizer):
6 |
7 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0):
8 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
9 | self.buffer = [[None, None, None] for ind in range(10)]
10 | super(RAdam, self).__init__(params, defaults)
11 |
12 | def __setstate__(self, state):
13 | super(RAdam, self).__setstate__(state)
14 |
15 | def step(self, closure=None):
16 |
17 | loss = None
18 | if closure is not None:
19 | loss = closure()
20 |
21 | for group in self.param_groups:
22 |
23 | for p in group['params']:
24 | if p.grad is None:
25 | continue
26 | grad = p.grad.data.float()
27 | if grad.is_sparse:
28 | raise RuntimeError('RAdam does not support sparse gradients')
29 |
30 | p_data_fp32 = p.data.float()
31 |
32 | state = self.state[p]
33 |
34 | if len(state) == 0:
35 | state['step'] = 0
36 | state['exp_avg'] = torch.zeros_like(p_data_fp32)
37 | state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)
38 | else:
39 | state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)
40 | state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32)
41 |
42 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
43 | beta1, beta2 = group['betas']
44 |
45 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
46 | exp_avg.mul_(beta1).add_(1 - beta1, grad)
47 |
48 | state['step'] += 1
49 | buffered = self.buffer[int(state['step'] % 10)]
50 | if state['step'] == buffered[0]:
51 | N_sma, step_size = buffered[1], buffered[2]
52 | else:
53 | buffered[0] = state['step']
54 | beta2_t = beta2 ** state['step']
55 | N_sma_max = 2 / (1 - beta2) - 1
56 | N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t)
57 | buffered[1] = N_sma
58 |
59 | # more conservative since it's an approximated value
60 | if N_sma >= 5:
61 | step_size = math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step'])
62 | else:
63 | step_size = 1.0 / (1 - beta1 ** state['step'])
64 | buffered[2] = step_size
65 |
66 | if group['weight_decay'] != 0:
67 | p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32)
68 |
69 | # more conservative since it's an approximated value
70 | if N_sma >= 5:
71 | denom = exp_avg_sq.sqrt().add_(group['eps'])
72 | p_data_fp32.addcdiv_(-step_size * group['lr'], exp_avg, denom)
73 | else:
74 | p_data_fp32.add_(-step_size * group['lr'], exp_avg)
75 |
76 | p.data.copy_(p_data_fp32)
77 |
78 | return loss
79 |
80 | class PlainRAdam(Optimizer):
81 |
82 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0):
83 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
84 |
85 | super(PlainRAdam, self).__init__(params, defaults)
86 |
87 | def __setstate__(self, state):
88 | super(PlainRAdam, self).__setstate__(state)
89 |
90 | def step(self, closure=None):
91 |
92 | loss = None
93 | if closure is not None:
94 | loss = closure()
95 |
96 | for group in self.param_groups:
97 |
98 | for p in group['params']:
99 | if p.grad is None:
100 | continue
101 | grad = p.grad.data.float()
102 | if grad.is_sparse:
103 | raise RuntimeError('RAdam does not support sparse gradients')
104 |
105 | p_data_fp32 = p.data.float()
106 |
107 | state = self.state[p]
108 |
109 | if len(state) == 0:
110 | state['step'] = 0
111 | state['exp_avg'] = torch.zeros_like(p_data_fp32)
112 | state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)
113 | else:
114 | state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)
115 | state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32)
116 |
117 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
118 | beta1, beta2 = group['betas']
119 |
120 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
121 | exp_avg.mul_(beta1).add_(1 - beta1, grad)
122 |
123 | state['step'] += 1
124 | beta2_t = beta2 ** state['step']
125 | N_sma_max = 2 / (1 - beta2) - 1
126 | N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t)
127 |
128 | if group['weight_decay'] != 0:
129 | p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32)
130 |
131 | # more conservative since it's an approximated value
132 | if N_sma >= 5:
133 | step_size = group['lr'] * math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step'])
134 | denom = exp_avg_sq.sqrt().add_(group['eps'])
135 | p_data_fp32.addcdiv_(-step_size, exp_avg, denom)
136 | else:
137 | step_size = group['lr'] / (1 - beta1 ** state['step'])
138 | p_data_fp32.add_(-step_size, exp_avg)
139 |
140 | p.data.copy_(p_data_fp32)
141 |
142 | return loss
143 |
144 |
145 | class AdamW(Optimizer):
146 |
147 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, warmup = 0):
148 | defaults = dict(lr=lr, betas=betas, eps=eps,
149 | weight_decay=weight_decay, warmup = warmup)
150 | super(AdamW, self).__init__(params, defaults)
151 |
152 | def __setstate__(self, state):
153 | super(AdamW, self).__setstate__(state)
154 |
155 | def step(self, closure=None):
156 | loss = None
157 | if closure is not None:
158 | loss = closure()
159 |
160 | for group in self.param_groups:
161 |
162 | for p in group['params']:
163 | if p.grad is None:
164 | continue
165 | grad = p.grad.data.float()
166 | if grad.is_sparse:
167 | raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
168 |
169 | p_data_fp32 = p.data.float()
170 |
171 | state = self.state[p]
172 |
173 | if len(state) == 0:
174 | state['step'] = 0
175 | state['exp_avg'] = torch.zeros_like(p_data_fp32)
176 | state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)
177 | else:
178 | state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)
179 | state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32)
180 |
181 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
182 | beta1, beta2 = group['betas']
183 |
184 | state['step'] += 1
185 |
186 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
187 | exp_avg.mul_(beta1).add_(1 - beta1, grad)
188 |
189 | denom = exp_avg_sq.sqrt().add_(group['eps'])
190 | bias_correction1 = 1 - beta1 ** state['step']
191 | bias_correction2 = 1 - beta2 ** state['step']
192 |
193 | if group['warmup'] > state['step']:
194 | scheduled_lr = 1e-8 + state['step'] * group['lr'] / group['warmup']
195 | else:
196 | scheduled_lr = group['lr']
197 |
198 | step_size = scheduled_lr * math.sqrt(bias_correction2) / bias_correction1
199 |
200 | if group['weight_decay'] != 0:
201 | p_data_fp32.add_(-group['weight_decay'] * scheduled_lr, p_data_fp32)
202 |
203 | p_data_fp32.addcdiv_(-step_size, exp_avg, denom)
204 |
205 | p.data.copy_(p_data_fp32)
206 |
207 | return loss
208 |
--------------------------------------------------------------------------------
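A minimal usage sketch for the optimizer classes above (the placeholder model, learning rate, and warmup value are illustrative assumptions; the import path assumes the classes live in utils/radam.py, as in this repository's layout):

import torch
from utils.radam import PlainRAdam, AdamW

model = torch.nn.Linear(128, 64)  # placeholder module for illustration

# PlainRAdam takes the same arguments minus `warmup`; AdamW as defined above
# applies decoupled weight decay and a linear learning-rate warmup over the
# first `warmup` optimisation steps.
optimizer = AdamW(model.parameters(), lr=1e-3, weight_decay=1e-3, warmup=500)

loss = model(torch.randn(4, 128)).pow(2).mean()
loss.backward()
optimizer.step()

Note that in-place calls such as addcmul_(1 - beta2, grad, grad) and addcdiv_(-step_size, exp_avg, denom) use the older PyTorch argument order (scalar first); newer PyTorch releases deprecate that overload in favour of the keyword form, e.g. addcmul_(grad, grad, value=1 - beta2).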
/misc/datasets/queryd/test_list.txt:
--------------------------------------------------------------------------------
1 | video-uc0O6cqYbyk
2 | video-vzJqeyxye_E
3 | video-1wn-5-HaKaI
4 | video-BhmvJuTc4aY
5 | video-DqilSuFK3B8
6 | video-BxyXk1sr2io
7 | video-rMXsAtYtLTo
8 | video-HX8KX5u0gkg
9 | video-cWX9iR_PzIQ
10 | video-aooZ26YKH-8
11 | video-eOrNdBpGMv8
12 | video-alLd1Bobkf8
13 | video-G_TwXO0yFeM
14 | video-fbri4or6Uhk
15 | video-uWuOBonC1ds
16 | video-lm0OZ8cMW6M
17 | video-Ro7tuDVP6Ks
18 | video-dNJdJIwCF_Y
19 | video-avl2RPsmlnI
20 | video-9AMPsDXGAxY
21 | video-MXVz4izTWZY
22 | video-W2asVll3OSU
23 | video-x152oloLhVM
24 | video-cNi_HC839Wo
25 | video-8gy5tYVR-28
26 | video-Ceu5cSJiAic
27 | video-6fan6ggvh4U
28 | video-DYldBHQw3s4
29 | video-Kqr2ibw5FKA
30 | video-KJiCalhfN6k
31 | video-Ahvl7V82ycE
32 | video-bSfDQr-FFJU
33 | video-1WsDtn-feuI
34 | video-pQD6gJMnWVs
35 | video-5xJ6h6OTveU
36 | video-Iqtv2oqNNIE
37 | video-z41fUWaNwO0
38 | video-wAnrYQGSHMw
39 | video-oyv6hjQr8MA
40 | video-mvD5wvlIE7c
41 | video-H5MKL1duTrg
42 | video-vN1fuDgj-Qk
43 | video-c38r-SAnTWM
44 | video-q6rAllJAdWk
45 | video-F-a5tZ_5g9E
46 | video-USGLDwaLnaI
47 | video-UE2Pe5DaKiE
48 | video-c6KD-kOqwpE
49 | video-NLlGopyXT_g
50 | video-_NWYeVyZz9I
51 | video--IRNEMiRg0Y
52 | video-lTxn2BuqyzU
53 | video-AZvdGSr7roA
54 | video-TvYCwP1U0xs
55 | video-MovSMelAxWg
56 | video-ER8yEOY0NCc
57 | video-8GGfE7zsD-0
58 | video-IWntTYTdXG8
59 | video-Q_P8WCbhC6s
60 | video-akYVGzeS7_A
61 | video-HZ21eT9lyog
62 | video-kPm0_jB5EQI
63 | video-3EBfr6KdnQc
64 | video-yq3Z9msqnOg
65 | video-T_FdjXqSZlc
66 | video-H7muODd2pCo
67 | video-gJWo7Z5m6e8
68 | video-V49ENdZlOx4
69 | video-E0pemP7JGV4
70 | video-AMjMFbhyhwY
71 | video-cEfD-Mr7m2c
72 | video-j8waYyUSSxg
73 | video-SBTL1vI4-mc
74 | video-4w7sVSMbjyM
75 | video-QPe3QSw49_Y
76 | video-cx-T137FKM4
77 | video-twKxRnoGxoM
78 | video-G9hIzIG1sPY
79 | video-oSO9q-2JjUs
80 | video-cOvModiiUjs
81 | video-Q-yA3q0mGCM
82 | video-yUkgydZh0Bk
83 | video-IDIQKPApIxM
84 | video-7HDtvRHyq2M
85 | video-g6KUxEfUVm0
86 | video-oCLTRjF2eq0
87 | video-9-k5J4RxQdE
88 | video-7-i6uxo4HS0
89 | video-GF60Iuh643I
90 | video-h56Cr2ho1Y8
91 | video-UdNZzQD0qY0
92 | video-bh-dRJaZdgM
93 | video-HTj2n52jz94
94 | video-gsYL4PC0hyk
95 | video-agqgBkpbCoY
96 | video-CrqIVVd3hp0
97 | video-XHGWTDHchVQ
98 | video-Ei6TjvfGMpE
99 | video-_MC3XuMvsDI
100 | video-ikUora2uPnU
101 | video-bGph2eX8RMI
102 | video-cd_AOrSEeRc
103 | video-3heXk6Oj6hU
104 | video-_tw7PR89IOI
105 | video-KUGf-irpTMQ
106 | video-z1fbwPHv-wA
107 | video-hhaNVna7eQs
108 | video-KwOdedfBqHE
109 | video-1g4AUYiz0LU
110 | video-_lmKuKrsKRA
111 | video-07d2dXHYb94
112 | video-pQu3dufotPM
113 | video-VpPJP7o7NnA
114 | video-hpsh8dYl7PE
115 | video-XrgVtuDRBjM
116 | video-0nB9BcZTBag
117 | video-nB-444rPm_8
118 | video-FwqsdGLhdgA
119 | video-5ZYcY-KX2JA
120 | video-yEvhDTWSRec
121 | video-3aZadizk_Io
122 | video-3CJl_S7uzqU
123 | video-u3VVUu-lZsM
124 | video-x4sadYeLHKU
125 | video-jQHVrDFNmJE
126 | video-AZGaCqDAlsU
127 | video-VqrBsMFRaLA
128 | video-CtBBL7Pb9Q0
129 | video-3EFiduilmn8
130 | video-EyVuypKJOq0
131 | video-p5u-vBV8NUU
132 | video-NvgYhf2LnVI
133 | video-9Ky2nyzOnMw
134 | video-SkB4gG8ke7Q
135 | video-GHDz-XDD8OU
136 | video-xyor66WBWPk
137 | video-UDcrGE3le20
138 | video-vYLaKMpqnOc
139 | video-Wu-gd9tLpmU
140 | video-BeGOgA18NIQ
141 | video-JNvpcGV1frQ
142 | video-jHe5vPlKgJA
143 | video-ZSd_IpzmcLM
144 | video-B8ISzf2pryI
145 | video-GZ0Bey4YUGI
146 | video-cEx5bSYJxtg
147 | video-IAq8pEFNeJs
148 | video-uhUUXMWoC_4
149 | video-DS8yeXFeEPA
150 | video-v3iPrBrGSJM
151 | video-uc1Hn4INDjk
152 | video-YPFGT4ecnIU
153 | video-D0a0aNqTehM
154 | video-tEZzagPGls4
155 | video-4QdmRufojsU
156 | video-4SkWU60v9Cg
157 | video-pjJEXkbeL-o
158 | video-W0_tLK37W24
159 | video-LJ9KtxNZdWE
160 | video-N_AcmtmegKI
161 | video-I6lZBoR5gvs
162 | video-8jucxdaifbs
163 | video-zW39WTnHCc8
164 | video-7RhQIZmkgDQ
165 | video-QNJx7Vi4Sg4
166 | video-Vlb2udqPx-M
167 | video-tUN-8TvevGU
168 | video-3SIfsFz_kMQ
169 | video-7KWKxe5HvLw
170 | video-VZaqHyHFCzc
171 | video-dZiJkicepzM
172 | video-94NanQuVkA4
173 | video-fhZo17Pxq1A
174 | video-HvPbH30KWLE
175 | video-J5R18MzrtKU
176 | video-sdUUx5FdySs
177 | video-88VViI5gNA4
178 | video-LsjNWQQOmNg
179 | video-LF71pZXhYrw
180 | video-gRD53bcAM8E
181 | video-CKwUNBEFI0E
182 | video-LnJwH_PZXnM
183 | video-Bzua8Zvlppo
184 | video-o2VFgHGKzx4
185 | video-G9YuKs3Jitk
186 | video-VGNFvm-YCEk
187 | video-huT5__BqY_U
188 | video-hywRdDVR76A
189 | video-WUG-x1TFewA
190 | video-osVxO-RA-pE
191 | video-t1PGWO2Lvmw
192 | video-1awM6kmpd2g
193 | video-KK-Mff60ZIE
194 | video-StAF3NSro-w
195 | video-makIgB4X3q8
196 | video-B_bdAJXsjvk
197 | video-jnaPpgK33Lo
198 | video-6EiRjwjp30I
199 | video-p_Rrovk5nsk
200 | video-EthCVn45VyU
201 | video-DeKXFHPr_oE
202 | video-kJzNZ10I1MY
203 | video-uceySVBjKNw
204 | video-CnOJgDW0gPI
205 | video-XxhKUP9Ixco
206 | video-wt62ayeVd44
207 | video-DSBhSywLRNA
208 | video-ZmdOe1hjW-s
209 | video-Qb2xoiVM7UA
210 | video-qniwI2hNhDs
211 | video-nPo2B-vjZ28
212 | video-4b6ttHSgIFM
213 | video-URTR3AtKTM8
214 | video-4XNjwKBqvxE
215 | video-V3L1qrisKFE
216 | video-0Cqt__04bAk
217 | video-CFq441el_ls
218 | video-R9Puz5RFl5o
219 | video-b1XGPvbWn0A
220 | video-zSlhbBBBi3A
221 | video-JNyn_w3hdZ0
222 | video-JnR2dpLnS14
223 | video-ENaJcHwQEVs
224 | video-QjqS7jzjX34
225 | video-4X1DieuShKI
226 | video-qI34nBlJxP8
227 | video-vjW4UOC7U3w
228 | video-aB_s9lw9E-M
229 | video-NXXkBSuIAl8
230 | video-y07at1bU89Q
231 | video-763brdRmWuc
232 | video-R4WDWpR4oRM
233 | video-Y_dXFLaEVJk
234 | video-d6g1c18Cy-8
235 | video-FRgDsBFC2IA
236 | video-1NunXMcaslA
237 | video-9Byx6TxOPx4
238 | video-xqZQ9KM_LjY
239 | video-tloBMf_KmX4
240 | video-JZIerGNMtnk
241 | video-4oETtq9w9Zg
242 | video-HLRxoMiagO8
243 | video-cdg193GvnBA
244 | video-O_HyZ5aW76c
245 | video-gLax3zOBN40
246 | video-ZVXz6ymCSIo
247 | video-wWVppdfYOx8
248 | video-jKXrOTdbtVQ
249 | video-KauXf6nihPY
250 | video-XeSW_3JEeTs
251 | video-kbzEFa7fiOE
252 | video-Z-E3cRZCne4
253 | video-peXSoTlkwVY
254 | video-G6fMV1UPzkg
255 | video-sBdqOWSZ56w
256 | video-OA1ZRGFKRVI
257 | video-4ClKFnnzSRA
258 | video-ygzR-ltUWug
259 | video-kltuUtE6jQo
260 | video-aI51UWF8_9Y
261 | video-cda7mSowTEI
262 | video-5NVYg2HNAdA
263 | video-Ezg4sr67OGA
264 | video-6ZjRKYPfO8g
265 | video-XPKf24_pXfQ
266 | video-ansWZq7yULE
267 | video-J0HiXwK5s2k
268 | video-QEpCsMbMx7w
269 | video-P0ISZpljc3E
270 | video-PFjp1MW6Lzc
271 | video-dPgs0GHgiYc
272 | video-mgNgscHJh6I
273 | video-oowcsynjIwc
274 | video-Q04KG7gVQtw
275 | video-FK3dav4bA4s
276 | video-iS9QQ8YOofQ
277 | video-D-hPct3oIow
278 | video-wxm8jTzU_8o
279 | video-Zh2-rVsXWUU
280 | video-Wji-BZ0oCwg
281 | video-hgLQQe5uUCE
282 | video-3h6KMumLAvI
283 | video-gHCxdlZ7G18
284 | video-PeBAzI9LuHM
285 | video-qI3AWoK7ABU
286 | video-nqkyzpaoMug
287 | video-IFgh9WU0lPs
288 | video-ppyYdn2nPoU
289 | video-iWYCoBiTnA0
290 | video-TubxNbCQ4Fk
291 | video-lwS74rI92YQ
292 | video-GjHkkTGf7fc
293 | video-TDquUlVDdbU
294 | video-5xKnmuDnJMs
295 | video-uW9KEiQFUE8
296 | video-RzR_O2DoSVs
297 | video-1ePcSm1ninM
298 | video-MxNfvh7vaSs
299 | video-6jLfuoOBX2I
300 | video-Qw6RD5S3e8o
301 | video-2kUMAA9yZgk
302 | video-QcAcBHosPzQ
303 | video-hCsVT9TKahk
304 | video-DAHbtsjuNws
305 | video-U6fPh2mm3pw
306 | video-PGKmexNTHNE
307 | video-NVItPJAu_Fk
308 | video-woWiyBgp5cs
309 | video-qX9FSZJu448
310 | video-YKejnIOvACY
311 | video-5nmhHL3sVIk
312 | video-3yHsRjoRec8
313 | video-6Ts-deSDnRM
314 | video-xZfZ-HB2yJI
315 | video-wj8XXvD4kGE
316 | video-9nVvIz8nYxo
317 | video-TqPCGGHoxsE
318 | video-QAEkuVgt6Aw
319 | video-3Bs4LOtIuxg
320 | video-Gv1aDEFlXq8
321 | video-gZp6CGgsS4A
322 | video-YOqmroV2cRo
323 | video-cHUNbTfzOr4
324 | video-MDdQBWyFmtc
325 | video-JdYSnsEM0gg
326 | video-_U-J9PqgmIc
327 | video-d6PMG7kXpF4
328 | video-xEaCpSzUq3Q
329 | video-28FyDT4cKrg
330 | video-JvQcabZ1zrk
331 | video-Bv2vT665bGI
332 | video-DMCSP73Rq4I
333 | video-o2AsIXSh2xo
334 | video-NoPMX5lqT6A
335 | video-7ToAmWnTsAI
336 | video-tbBzXKN32Sk
337 | video-5_uSZcXMV7s
338 | video-rbNB0jqMv7s
339 | video-AW0jm6i9U3M
340 | video--wHytb5Fe2k
341 | video-JtzsCx0P3tI
342 | video-G64wuf-rHoo
343 | video-FRpIk7yd2RA
344 | video-XvhlK0WGBr4
345 | video-tUyeaT2ZX1I
346 | video-bV3Ib6Ato6c
347 | video-bYw1gRtyGiw
348 | video-LrI4FmRIHpI
349 | video-8pfPl8BkfVY
350 | video-3veKbPi4r90
351 | video-_-O6Ppkrf98
352 | video-5hpBAn5lQPs
353 | video-I6PXKSiJchU
354 | video-sNcJMejrcnM
355 | video-OITWgx8K6Ko
356 | video-wXL5zXz550I
357 | video-bap6XjDDE3k
358 | video-g26mbST0YhU
359 | video-tkQuXvgvNPk
360 | video-dJJ0yadpqKI
361 | video-5yGNbyAmkVY
362 | video-0-NBRA1aSXk
363 | video-xVihCNfZaDg
364 | video-s1FWVQFeOpQ
365 | video-gFuEo2ccTPA
366 | video-L8hM2kbw2Ik
367 | video-yKGeJXk2qWQ
368 | video-tREqJ1_7h0w
369 | video-0m1IfJUNzmc
370 | video-LJosiEHwWxc
371 | video-W4Pr7PZ3Bgc
372 | video-M0nDEbrp9nM
373 | video-D6lmibFiur8
374 | video-oFV9ayoss_o
375 | video-7SldSIviMkg
376 | video-SQ6H-Mz6hgw
377 | video-GrzDQGVprjE
378 | video-ixh8KqEr6LE
379 | video-zywWM3J3i8M
380 | video-WjqiU5FgsYc
381 | video-mpDOscUDQ_0
382 | video-qMzt3yQFT-Q
383 | video-3NDfWjywzsI
384 | video-F2bk_9T482g
385 | video-aLjHyP683QU
386 | video-6Jgwc3sXLCc
387 |
--------------------------------------------------------------------------------
/dataset_stats/get_videoid_perclass.py:
--------------------------------------------------------------------------------
1 | # AudioCaps dataset statistics
2 | # March 2021, ask
3 |
4 |
5 | import os
6 | import csv
7 | import json
8 | import numpy as np
9 | from tqdm import tqdm
10 |
11 | # -----------------------------------------------------------------------------
12 | # Load AudioCaps test data
13 | # -----------------------------------------------------------------------------
14 |
15 | audiocaps_base = '/home/askoepke97/coding/ce/collab-experts/data/AudioCaps/audiocaps/dataset'
16 | audiocaps_test_file = os.path.join(audiocaps_base, 'test.csv')
17 |
18 | audiocapid = []
19 | youtubeid = []
20 | yid_dict = dict()
21 |
22 | with open(audiocaps_test_file, 'r') as csvfile:
23 | reader = csv.reader(csvfile)
24 | i = 0
25 | for row in reader:
26 | if i > 0:
27 | if not int(row[2]) == 0:
28 | filename = row[1] + '_%d000'%int(row[2])
29 | else:
30 | filename = row[1] + '_%d'%int(row[2])
31 | ytname = row[1]
32 | yid_dict[filename] = ytname
33 | i += 1
34 |
35 | # -----------------------------------------------------------------------------
36 | # Load audioset ontology and train (and eval) data
37 | # -----------------------------------------------------------------------------
38 |
39 | audiosetbase = '/home/askoepke97/coding/ce/collab-experts/data/dataset_statistics'
40 | ontology = os.path.join(audiosetbase, 'ontology.json')
41 |
42 | evalcsv = os.path.join(audiosetbase, 'eval_segments.csv')
43 | traincsv = os.path.join(audiosetbase, 'unbalanced_train_segments.csv')
44 |
45 | with open(ontology) as json_file:
46 | ontology_data = json.load(json_file)
47 |
48 | classids = dict()
49 |
50 | for ind in np.arange(len(ontology_data)):
51 | classids[ontology_data[ind]['id']] = ontology_data[ind]['name']
52 |
53 | evaldict = dict()
54 |
55 | with open(evalcsv, 'r') as as_csvfile:
56 | reader = csv.reader(as_csvfile)
57 | i = 0
58 | for row in reader:
59 | if i > 2:
60 | ytname = row[0]
61 | starttime = row[1]
62 | classes = row[3].split(',')
63 | newclasses = []
64 | for classe in classes:
65 |                 if classe.strip()[0] == '"' and not classe.strip()[-1] == '"':
66 |                     newclasses.append(classids[classe.strip()[1:]])
67 |                 elif classe.strip()[-1] == '"' and not classe.strip()[0] == '"':
68 |                     newclasses.append(classids[classe.strip()[:-1]])
69 |                 elif classe.strip()[-1] == '"' and classe.strip()[0] == '"':
70 |                     newclasses.append(classids[classe.strip()[1:-1]])
71 |                 else:
72 |                     newclasses.append(classids[classe.strip()])
73 | evaldict[ytname] = newclasses
74 | i += 1
75 |
76 | traindict = dict()
77 |
78 | with open(traincsv, 'r') as as_csvfile:
79 | reader = csv.reader(as_csvfile)
80 | i = 0
81 | for row in reader:
82 | if i > 2:
83 | ytname = row[0]
84 | starttime = row[1]
85 | classes = row[3].split(',')
86 | newclasses = []
87 | for classe in classes:
88 |                 if classe.strip()[0] == '"' and not classe.strip()[-1] == '"':
89 |                     newclasses.append(classids[classe.strip()[1:]])
90 |                 elif classe.strip()[-1] == '"' and not classe.strip()[0] == '"':
91 |                     newclasses.append(classids[classe.strip()[:-1]])
92 |                 elif classe.strip()[-1] == '"' and classe.strip()[0] == '"':
93 |                     newclasses.append(classids[classe.strip()[1:-1]])
94 |                 else:
95 |                     newclasses.append(classids[classe.strip()])
96 | traindict[ytname] = newclasses
97 | i += 1
98 | print(i, 'len train')
99 |
100 | # -----------------------------------------------------------------------------
101 | # Load VGGSound training URLs
102 | # -----------------------------------------------------------------------------
103 |
104 | vggsoundpath = '/home/askoepke97/coding/gitrepos/sound_features/VGGSound/data/train.csv'
105 | vggvids = []
106 | with open(vggsoundpath, 'r') as csv_file:
107 | reader = csv.reader(csv_file)
108 | for row in tqdm(reader):
109 | vggvids.append(row[0].split('_')[0])
110 |
111 | # -----------------------------------------------------------------------------
112 | # Find overlap between VGGSound training set and AudioCaps test set
113 | # -----------------------------------------------------------------------------
114 |
115 | overlap_counter = 0
116 | vggcounter = 0
117 | uniqueclasses = [] #111 unique classes in unfiltered (before removing overlap with VGGSound) AudioCaps test set, 97 in the val set, 238 in train
118 | newclassdict = dict()
119 | overlap_test_videos = []
120 | for key, value in tqdm(yid_dict.items()):
121 | if value in vggvids:
122 | vggcounter += 1
123 | overlap_test_videos.append(value)
124 | # # Check for overlap between AudioCaps test set and AudioSet training data
125 | # if value in evaldict.keys():
126 | # if not evaldict[value] in newclassdict.values():
127 | # uniqueclasses.append(evaldict[value])
128 | # newclassdict[key] = evaldict[value]
129 | # elif value in traindict.keys():
130 | # overlap_counter += 1
131 | # if not traindict[value] in newclassdict.values():
132 | # uniqueclasses.append(traindict[value])
133 | # newclassdict[key] = traindict[value]
134 |
135 |
136 | # -----------------------------------------------------------------------------
137 | # Filter the test.csv dictionary yid_dict from AudioCaps for overlap with VGGSound
138 | # -----------------------------------------------------------------------------
139 |
140 | new_yid_dict = dict()
141 | for key, value in yid_dict.items():
142 | if not key.split('_')[0] in overlap_test_videos:
143 | new_yid_dict[key] = value
144 |
145 | # -----------------------------------------------------------------------------
146 | # Make dictionaries that contain classes as keys and video names in AudioCaps
147 | # test as values
148 | # -----------------------------------------------------------------------------
149 |
150 | class_video_dict = dict()
151 | for key, value in tqdm(new_yid_dict.items()):
152 | if value in traindict.keys():
153 | for vid_class in traindict[value]: #traindict[value] could contain a list with multiple classes
154 | if not vid_class in class_video_dict.keys():
155 | class_video_dict[vid_class] = [key]
156 | elif vid_class in class_video_dict.keys():
157 | class_video_dict[vid_class].append(key)
158 | else:
159 | import pdb; pdb.set_trace()
160 |
161 | new_class_video_dict = class_video_dict.copy()
162 | for key, value in class_video_dict.items():
163 | if len(value) < 10:
164 | new_class_video_dict.pop(key)
165 |
166 | print(len(new_class_video_dict.keys()))
167 |
168 | # print count of each class in dictionary, videos belong to single class only
169 | count_no_videos = 0
170 | for key, value in tqdm(new_class_video_dict.items()):
171 | print(key, len(value))
172 | count_no_videos += len(value)
173 | print(len(new_yid_dict.keys()), count_no_videos, 'number of videos in test set and number of videos in dictionaries')
174 |
175 | # save class dictionary that only contains classes with more than 10 example videos in the test set (34 classes)
176 |
177 | with open('test_class_videoid_dict_morethan10.json', 'w') as fp:
178 | json.dump(new_class_video_dict, fp)
179 |
180 | # save class dictionary with all classes in the test set (106 classes)
181 |
182 | with open('test_class_videoid_dict_all.json', 'w') as fp:
183 | json.dump(class_video_dict, fp)
184 |
185 | ## -----------------------------------------------------------------------------
186 | ## Filter the AudioCaps test_list.txt to remove overlap with the VGGSound training data
187 | ## -----------------------------------------------------------------------------
188 | #
189 | #audiocaps_testfile = '/home/askoepke97/akata-shared/askoepke97/data/AR/AudioCaps/structured-symlinks/test_list.txt'
190 | #file1 = open(audiocaps_testfile, 'r')
191 | #oldtestfiles = file1.readlines()
192 | #file1.close()
193 | #newtestfiles = []
194 | #for oldtestfile in oldtestfiles:
195 | # if not oldtestfile.split('_')[0] in overlap_test_videos:
196 | # newtestfiles.append(oldtestfile)
197 | #file1 = open('/home/askoepke97/akata-shared/askoepke97/data/AR/AudioCaps/structured-symlinks/filtered_test_list.txt', 'w')
198 | #file1.writelines(newtestfiles)
199 | #file1.close()
200 | #
201 | ## -----------------------------------------------------------------------------
202 |
203 | #print('There are %d videos in the AudioCaps test set that are contained in the AudioSet training set.'%overlap_counter) #975 and there are only 975 videos in the AudioCaps test set, 495 in the val set, 49838 in train
204 |
--------------------------------------------------------------------------------
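The eval and train loops in the script above repeat the same quote-stripping logic when turning the AudioSet positive_labels column into class names; a sketch of an equivalent helper (assuming, as in the branches above, that each comma-separated element carries at most a leading and/or trailing double quote):

def parse_label_field(field, classids):
    """Map one AudioSet positive_labels field to ontology class names."""
    names = []
    for label in field.split(','):
        # Stripping both quote characters covers all four branches used above.
        names.append(classids[label.strip().strip('"')])
    return names

# e.g. evaldict[ytname] = parse_label_field(row[3], classids)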
/configs/data_loader_clotho.json:
--------------------------------------------------------------------------------
1 | {
2 | "inherit_from": "configs/base_config.json",
3 | "eval_mode": "test_run",
4 | "experts": {
5 | "text_feat": "w2v",
6 | "modalities": [
7 | "imagenet.resnext101_32x48d.0",
8 | "r2p1d.r2p1d-ig65m.0",
9 | "scene.densenet161.0",
10 | "audio"
11 | ]
12 | },
13 | "arch": {
14 | "type": "CENet",
15 | "args": {
16 | "test_caption_mode": "indep",
17 | "use_ce": "pairwise",
18 | "use_mish": 1,
19 | "use_bn_reason": 1,
20 | "num_g_layers": 3,
21 | "num_h_layers": 0,
22 | "include_self": 1,
23 | "l2renorm": false,
24 | "randomise_feats": "",
25 | "vlad_clusters": {
26 | "text": 20,
27 | "audio": 16,
28 | "pann": 16,
29 | "speech": 5,
30 | "syncnet": 16,
31 | "vggsound": 16
32 | },
33 | "ghost_clusters": {
34 | "text": 1,
35 | "speech": 2
36 | },
37 | "mimic_ce_dims": 0
38 | }
39 | },
40 | "optimizer": {
41 | "type": "Ranger",
42 | "args": {
43 | "lr": 0.01,
44 | "weight_decay": 1E-03
45 | }
46 | },
47 | "loss": {
48 | "type": "MaxMarginRankingLoss",
49 | "args": {
50 | "margin": 0.2,
51 | "fix_norm": true
52 | }
53 | },
54 | "data_loader": {
55 | "type": "ExpertDataLoader",
56 | "args":{
57 | "dataset_name": "CLOTHO",
58 | "data_dir": "data/CLOTHO",
59 | "root_feat_folder": "structured-symlinks",
60 | "trn_cat": 0,
61 | "batch_size": 128,
62 | "split_name": "val",
63 | "fuse_captions": false,
64 | "num_test_captions": 1,
65 | "max_tokens": {
66 | "text": 21,
67 | "audio": 31,
68 | "pann": 29,
69 | "speech": 35,
70 | "syncnet": 29,
71 | "vggsound": 95
72 | },
73 | "feat_aggregation": {
74 | "imagenet.senet154.0": {
75 | "fps": 25,
76 | "stride": 1,
77 | "pixel_dim": 256,
78 | "aggregate-axis": 1,
79 | "offset": 0,
80 | "temporal": "avg",
81 | "aggregate": "concat",
82 | "type": "embed",
83 | "feat_dims": {
84 | "embed": 2048,
85 | "logits": 1000
86 | }
87 | },
88 | "imagenet.resnext101_32x48d.0": {
89 | "fps": 25,
90 | "stride": 1,
91 | "offset": 0,
92 | "pixel_dim": 256,
93 | "temporal": "avg",
94 | "aggregate": "concat",
95 | "aggregate-axis": 1,
96 | "type": "embed",
97 | "feat_dims": {
98 | "embed": 2048,
99 | "logits": 1000
100 | }
101 | },
102 | "scene.densenet161.0": {
103 | "stride": 1,
104 | "fps": 25,
105 | "offset": 0,
106 | "temporal": "avg",
107 | "pixel_dim": 256,
108 | "aggregate": "concat",
109 | "aggregate-axis": 1,
110 | "type": "embed",
111 | "feat_dims": {
112 | "embed": 2208,
113 | "logits": 1000
114 | }
115 | },
116 | "i3d.i3d.0": {
117 | "fps": 25,
118 | "offset": 0,
119 | "stride": 25,
120 | "inner_stride": 1,
121 | "pixel_dim": 256,
122 | "temporal": "avg",
123 | "aggregate": "concat",
124 | "aggregate-axis": 1,
125 | "type": "embed",
126 | "feat_dims": {
127 | "embed": 1024,
128 | "logits": 400
129 | }
130 | },
131 | "r2p1d.r2p1d-ig65m.0": {
132 | "fps": 30,
133 | "offset": 0,
134 | "stride": 32,
135 | "inner_stride": 1,
136 | "pixel_dim": 256,
137 | "temporal": "avg",
138 | "aggregate": "concat",
139 | "aggregate-axis": 1,
140 | "type": "embed",
141 | "feat_dims": {
142 | "embed": 512,
143 | "logits": 359
144 | }
145 | },
146 | "r2p1d.r2p1d-ig65m-kinetics.0": {
147 | "fps": 30,
148 | "offset": 0,
149 | "stride": 32,
150 | "inner_stride": 1,
151 | "pixel_dim": 256,
152 | "temporal": "avg",
153 | "aggregate": "concat",
154 | "aggregate-axis": 1,
155 | "type": "embed",
156 | "feat_dims": {
157 | "embed": 512,
158 | "logits": 400
159 | }
160 | },
161 | "pann.pann.0": {
162 | "model": "pann",
163 | "flaky": false,
164 | "temporal": "vlad",
165 | "type": "embed",
166 | "binarise": false
167 | },
168 | "pann": {
169 | "model": "pann",
170 | "flaky": false,
171 | "temporal": "vlad",
172 | "type": "embed",
173 | "binarise": false
174 | },
175 | "syncnet": {
176 | "model": "syncnet",
177 | "flaky": false,
178 | "temporal": "vlad",
179 | "type": "embed",
180 | "binarise": false
181 | },
182 | "audio.syncnet.0": {
183 | "model": "syncnet",
184 | "flaky": false,
185 | "temporal": "vlad",
186 | "type": "embed",
187 | "binarise": false
188 | },
189 | "vggsound": {
190 | "model": "vggsound",
191 | "flaky": false,
192 | "temporal": "vlad",
193 | "type": "embed",
194 | "binarise": false
195 | },
196 | "audio.vggsound.0": {
197 | "model": "vggsound",
198 | "flaky": false,
199 | "temporal": "vlad",
200 | "type": "embed",
201 | "binarise": false
202 | },
203 | "speech": {
204 | "model": "w2v",
205 | "flaky": true,
206 | "temporal": "vlad",
207 | "type": "embed",
208 | "binarise": false,
209 | "feat_dims": {
210 | "embed": 300
211 | }
212 | },
213 | "audio": {
214 | "model": "vggish",
215 | "flaky": false,
216 | "temporal": "vlad",
217 | "type": "embed",
218 | "binarise": false
219 | },
220 | "audio.vggish.0": {
221 | "model": "vggish",
222 | "flaky": false,
223 | "temporal": "vlad",
224 | "type": "embed",
225 | "binarise": false
226 | }
227 | }
228 | }
229 | },
230 | "trainer": {
231 | "epochs": 20
232 | },
233 | "eval_settings": {
234 | "data_loader": {
235 | "args": {
236 | "split_name": "test",
237 | "num_test_captions": 5
238 | }
239 | },
240 | "tester": {
241 | "save_dir": "data/saved/",
242 | "verbosity": 2
243 | },
244 | "disable_gpu": true
245 | },
246 | "visualizer": {
247 | "type": "Visualizer",
248 | "args":{
249 | "src_video_dir": "data/CLOTHO/videos",
250 | "vis_vid_freq": 500,
251 | "num_samples": 100
252 | }
253 | }
254 | }
255 |
--------------------------------------------------------------------------------
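The config above layers on top of configs/base_config.json via the "inherit_from" key; a minimal loading sketch, assuming the child's keys recursively override the parent's (the actual loader in this codebase may handle the merge differently):

import json

def load_config(path):
    with open(path) as f:
        cfg = json.load(f)
    parent = cfg.pop("inherit_from", None)
    if parent is None:
        return cfg
    merged = load_config(parent)

    def merge(dst, src):
        # Child values win; nested dicts are merged key by key.
        for key, val in src.items():
            if isinstance(val, dict) and isinstance(dst.get(key), dict):
                merge(dst[key], val)
            else:
                dst[key] = val

    merge(merged, cfg)
    return merged

cfg = load_config("configs/data_loader_clotho.json")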
/configs/data_loader_audiocaps.json:
--------------------------------------------------------------------------------
1 | {
2 | "inherit_from": "configs/base_config.json",
3 | "eval_mode": "test_run",
4 | "experts": {
5 | "text_feat": "w2v",
6 | "modalities": [
7 | "imagenet.resnext101_32x48d.0",
8 | "r2p1d.r2p1d-ig65m.0",
9 | "scene.densenet161.0",
10 | "audio"
11 | ]
12 | },
13 | "arch": {
14 | "type": "CENet",
15 | "args": {
16 | "test_caption_mode": "indep",
17 | "use_ce": "pairwise",
18 | "use_mish": 1,
19 | "use_bn_reason": 1,
20 | "num_g_layers": 3,
21 | "num_h_layers": 0,
22 | "include_self": 1,
23 | "l2renorm": false,
24 | "randomise_feats": "",
25 | "vlad_clusters": {
26 | "text": 20,
27 | "audio": 16,
28 | "pann": 16,
29 | "syncnet": 16,
30 | "vggsound": 16,
31 | "speech": 5
32 | },
33 | "ghost_clusters": {
34 | "text": 1
35 | },
36 | "mimic_ce_dims": 0
37 | }
38 | },
39 | "optimizer": {
40 | "type": "Ranger",
41 | "args": {
42 | "lr": 0.01,
43 | "weight_decay": 1E-03
44 | }
45 | },
46 | "loss": {
47 | "type": "MaxMarginRankingLoss",
48 | "args": {
49 | "margin": 0.2,
50 | "fix_norm": true
51 | }
52 | },
53 | "data_loader": {
54 | "type": "ExpertDataLoader",
55 | "args":{
56 | "dataset_name": "AudioCaps",
57 | "data_dir": "data/AudioCaps",
58 | "root_feat_folder": "structured-symlinks",
59 | "trn_cat": 0,
60 | "batch_size": 128,
61 | "split_name": "val",
62 | "fuse_captions": false,
63 | "num_test_captions": 1,
64 | "max_tokens": {
65 | "text": 20,
66 | "audio": 29,
67 | "pann": 29,
68 | "syncnet": 29,
69 | "vggsound": 29,
70 | "speech": 35
71 | },
72 | "feat_aggregation": {
73 | "imagenet.senet154.0": {
74 | "fps": 25,
75 | "stride": 1,
76 | "pixel_dim": 256,
77 | "aggregate-axis": 1,
78 | "offset": 0,
79 | "temporal": "avg",
80 | "aggregate": "concat",
81 | "type": "embed",
82 | "feat_dims": {
83 | "embed": 2048,
84 | "logits": 1000
85 | }
86 | },
87 | "imagenet.resnext101_32x48d.0": {
88 | "fps": 25,
89 | "stride": 1,
90 | "offset": 0,
91 | "pixel_dim": 256,
92 | "temporal": "avg",
93 | "aggregate": "concat",
94 | "aggregate-axis": 1,
95 | "type": "embed",
96 | "feat_dims": {
97 | "embed": 2048,
98 | "logits": 1000
99 | }
100 | },
101 | "scene.densenet161.0": {
102 | "stride": 1,
103 | "fps": 25,
104 | "offset": 0,
105 | "temporal": "avg",
106 | "pixel_dim": 256,
107 | "aggregate": "concat",
108 | "aggregate-axis": 1,
109 | "type": "embed",
110 | "feat_dims": {
111 | "embed": 2208,
112 | "logits": 1000
113 | }
114 | },
115 | "i3d.i3d.0": {
116 | "fps": 25,
117 | "offset": 0,
118 | "stride": 25,
119 | "inner_stride": 1,
120 | "pixel_dim": 256,
121 | "temporal": "avg",
122 | "aggregate": "concat",
123 | "aggregate-axis": 1,
124 | "type": "embed",
125 | "feat_dims": {
126 | "embed": 1024,
127 | "logits": 400
128 | }
129 | },
130 | "r2p1d.r2p1d-ig65m.0": {
131 | "fps": 30,
132 | "offset": 0,
133 | "stride": 32,
134 | "inner_stride": 1,
135 | "pixel_dim": 256,
136 | "temporal": "avg",
137 | "aggregate": "concat",
138 | "aggregate-axis": 1,
139 | "type": "embed",
140 | "feat_dims": {
141 | "embed": 512,
142 | "logits": 359
143 | }
144 | },
145 | "r2p1d.r2p1d-ig65m-kinetics.0": {
146 | "fps": 30,
147 | "offset": 0,
148 | "stride": 32,
149 | "inner_stride": 1,
150 | "pixel_dim": 256,
151 | "temporal": "avg",
152 | "aggregate": "concat",
153 | "aggregate-axis": 1,
154 | "type": "embed",
155 | "feat_dims": {
156 | "embed": 512,
157 | "logits": 400
158 | }
159 | },
160 | "pann.pann.0": {
161 | "model": "pann",
162 | "flaky": false,
163 | "temporal": "vlad",
164 | "type": "embed",
165 | "binarise": false
166 | },
167 | "pann": {
168 | "model": "pann",
169 | "flaky": false,
170 | "temporal": "vlad",
171 | "type": "embed",
172 | "binarise": false
173 | },
174 | "syncnet": {
175 | "model": "syncnet",
176 | "flaky": false,
177 | "temporal": "vlad",
178 | "type": "embed",
179 | "binarise": false
180 | },
181 | "audio.syncnet.0": {
182 | "model": "syncnet",
183 | "flaky": false,
184 | "temporal": "vlad",
185 | "type": "embed",
186 | "binarise": false
187 | },
188 | "vggsound": {
189 | "model": "vggsound",
190 | "flaky": false,
191 | "temporal": "vlad",
192 | "type": "embed",
193 | "binarise": false
194 | },
195 | "audio.vggsound.0": {
196 | "model": "vggsound",
197 | "flaky": false,
198 | "temporal": "vlad",
199 | "type": "embed",
200 | "binarise": false
201 | },
202 | "speech": {
203 | "model": "w2v",
204 | "flaky": true,
205 | "temporal": "vlad",
206 | "type": "embed",
207 | "binarise": false,
208 | "feat_dims": {
209 | "embed": 300
210 | }
211 | },
212 | "audio": {
213 | "model": "vggish",
214 | "flaky": true,
215 | "temporal": "vlad",
216 | "type": "embed",
217 | "binarise": false
218 | },
219 | "audio.vggish.0": {
220 | "model": "vggish",
221 | "flaky": true,
222 | "temporal": "vlad",
223 | "type": "embed",
224 | "binarise": false
225 | }
226 | }
227 | }
228 | },
229 | "metrics": [
230 | "t2v_metrics",
231 | "v2t_metrics"
232 | ],
233 | "trainer": {
234 | "epochs": 20
235 | },
236 | "eval_settings": {
237 | "data_loader": {
238 | "args": {
239 | "split_name": "test",
240 | "num_test_captions": 5
241 | }
242 | },
243 | "tester": {
244 | "save_dir": "data/saved/",
245 | "verbosity": 2
246 | },
247 | "disable_gpu": true
248 | },
249 | "testing_file": "final_filtered_test_list.txt",
250 | "visualizer": {
251 | "type": "Visualizer",
252 | "args":{
253 | "src_video_dir": "data/AudioCaps/videos",
254 | "vis_vid_freq": 500,
255 | "num_samples": 5
256 | }
257 | }
258 | }
259 |
--------------------------------------------------------------------------------
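A small consistency check one might run against the AudioCaps config above, verifying that every expert listed under experts.modalities also has a feat_aggregation recipe (illustrative only; the key names and path are taken from the file itself):

import json

with open("configs/data_loader_audiocaps.json") as f:
    cfg = json.load(f)

feat_agg = cfg["data_loader"]["args"]["feat_aggregation"]
for modality in cfg["experts"]["modalities"]:
    assert modality in feat_agg, f"no feat_aggregation entry for {modality}"
    print(modality, feat_agg[modality].get("feat_dims", "no feat_dims listed"))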