├── misc ├── __init__.py ├── text_embedding_models.json ├── datasets │ ├── querydsegments │ │ ├── tar_include.txt │ │ └── README.md │ ├── activity-net │ │ ├── tar_include.txt │ │ └── README.md │ ├── clotho │ │ └── tar_include.txt │ ├── queryd │ │ ├── tar_include.txt │ │ ├── README.md │ │ ├── val_list.txt │ │ └── test_list.txt │ └── audiocaps │ │ └── tar_include.txt ├── experiments-queryd.json ├── yaspi_gpu_defaults.json ├── exps-names.md ├── find_latest_checkpoints.py ├── experiments-audiocaps.json ├── experiments_teachText.json ├── generate_exps.py ├── aggregate_logs_and_stats.py ├── launch_exps_from_list.py └── gen_tar_lists.py ├── utils ├── __init__.py ├── cos_restart.py ├── gen_ablations_for_dataset.py ├── datastructures.py ├── visualizer.py ├── html.py ├── ranger.py └── radam.py ├── trainer └── __init__.py ├── model ├── __init__.py ├── mil_nce_net.py ├── text_embedding_models.json ├── loss.py └── net_vlad.py ├── logger ├── __init__.py ├── logger_config.json ├── logger.py ├── visualization.py └── log_parser.py ├── base ├── __init__.py └── base_model.py ├── launch_ablations_txt ├── audio_experts.txt ├── all_audio_experts.txt └── single_audio_experts.txt ├── configs ├── clotho │ ├── train-vggish-vggsound.json │ ├── train-vggish-vggsound-moee.json │ └── train-full-ce-only-audio.json ├── audiocaps │ ├── train-vggish-vggsound-train_list_10.json │ ├── train-vggish-vggsound-train_list_25.json │ ├── train-vggish-vggsound-train_list_50.json │ ├── train-vggish-vggsound-train_list_75.json │ ├── train-vggish-vggsound.json │ ├── train-full-ce-scene-r2p1d.json │ ├── train-full-ce-r2p1d-inst.json │ ├── train-full-ce-scene-inst.json │ ├── train-full-ce-r2p1d-inst-vggish.json │ ├── train-full-ce-r2p1d-inst-vggsound.json │ ├── train-full-ce-scene-r2p1d-inst.json │ ├── train-full-ce-r2p1d-inst-vggish-vggsound.json │ ├── train-full-ce-scene-r2p1d-inst-vggsound.json │ ├── train-vggish-vggsound-moee.json │ ├── train-full-ce-only-audio.json │ ├── train-only-vggsound.json │ ├── train-full-ce-only-r2p1d.json │ ├── train-full-ce-only-scene.json │ ├── train-full-ce-scene-r2p1d-inst-vggish-vggsound.json │ └── train-full-ce-only-inst.json ├── queryd │ └── train-full-ce-only-audio.json ├── querydsegments │ └── train-full-ce-only-audio.json ├── activity-net │ └── train-full-ce-audio-only.json ├── data_loader_activity-net.json ├── data_loader_queryd.json ├── data_loader_querydsegments.json ├── data_loader_clotho.json └── data_loader_audiocaps.json ├── .gitignore ├── requirements └── requirements.txt ├── data_loader ├── QuerYDSegments_dataset.py ├── QuerYD_dataset.py ├── CLOTHO_dataset.py ├── ActivityNet_dataset.py ├── AudioCaps_dataset.py └── data_loaders.py ├── exp_to_seed_time.json ├── eval.py └── dataset_stats └── get_videoid_perclass.py /misc/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .util import * 2 | -------------------------------------------------------------------------------- /trainer/__init__.py: -------------------------------------------------------------------------------- 1 | from .trainer import * 2 | -------------------------------------------------------------------------------- /model/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import CENet 2 | from .mil_nce_net import MNNet 
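Both classes exported here (`CENet` and `MNNet`, defined in `mil_nce_net.py` and used by the `*-mnnet` runs listed in `misc/experiments-queryd.json`) are referenced by name from the experiment configs under `configs/`. As a point of reference, this is the `"arch"` fragment that the audio-only configs in this repository use to select `CENet`; the `"type"` field presumably names the class to instantiate from this package, and the full configs additionally set `"inherit_from"` and the expert `"modalities"`:

```
"arch": {
    "type": "CENet",
    "args": {
        "use_ce": "",
        "mimic_ce_dims": 1
    }
}
```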
-------------------------------------------------------------------------------- /logger/__init__.py: -------------------------------------------------------------------------------- 1 | from .logger import * 2 | from .visualization import * 3 | from .log_parser import * -------------------------------------------------------------------------------- /base/__init__.py: -------------------------------------------------------------------------------- 1 | # from .base_data_loader import * 2 | from .base_model import * 3 | from .base_trainer import * 4 | -------------------------------------------------------------------------------- /launch_ablations_txt/audio_experts.txt: -------------------------------------------------------------------------------- 1 | --config configs/audiocaps/train-ce-audio-speech.json --group_seed 0|1|2 --device 0 2 | -------------------------------------------------------------------------------- /launch_ablations_txt/all_audio_experts.txt: -------------------------------------------------------------------------------- 1 | --config configs/audiocaps/train-full-ce-only-audio_sophia_pann_soundnet.json --group_seed 0|1|2 --device 0 2 | -------------------------------------------------------------------------------- /configs/clotho/train-vggish-vggsound.json: -------------------------------------------------------------------------------- 1 | { 2 | "inherit_from": "configs/data_loader_clotho.json", 3 | "experts": { 4 | "modalities": [ 5 | "audio", 6 | "vggsound" 7 | ] 8 | } 9 | } -------------------------------------------------------------------------------- /launch_ablations_txt/single_audio_experts.txt: -------------------------------------------------------------------------------- 1 | --config configs/clotho/train-full-ce-only-audio.json --group_seed 0|1|2 --device 0 --resume data/saved/models/audiocaps-train-full-ce-only-audio/2021-03-23_09-06-26/trained_model.pth 2 | -------------------------------------------------------------------------------- /configs/audiocaps/train-vggish-vggsound-train_list_10.json: -------------------------------------------------------------------------------- 1 | {"inherit_from": "configs/data_loader_audiocaps.json", "experts": {"modalities": ["audio", "vggsound"]}, "testing_file": "final_filtered_test_list.txt", "training_file": "train_list_10.txt"} -------------------------------------------------------------------------------- /configs/audiocaps/train-vggish-vggsound-train_list_25.json: -------------------------------------------------------------------------------- 1 | {"inherit_from": "configs/data_loader_audiocaps.json", "experts": {"modalities": ["audio", "vggsound"]}, "testing_file": "final_filtered_test_list.txt", "training_file": "train_list_25.txt"} -------------------------------------------------------------------------------- /configs/audiocaps/train-vggish-vggsound-train_list_50.json: -------------------------------------------------------------------------------- 1 | {"inherit_from": "configs/data_loader_audiocaps.json", "experts": {"modalities": ["audio", "vggsound"]}, "testing_file": "final_filtered_test_list.txt", "training_file": "train_list_50.txt"} -------------------------------------------------------------------------------- /configs/audiocaps/train-vggish-vggsound-train_list_75.json: -------------------------------------------------------------------------------- 1 | {"inherit_from": "configs/data_loader_audiocaps.json", "experts": {"modalities": ["audio", "vggsound"]}, "testing_file": 
"final_filtered_test_list.txt", "training_file": "train_list_75.txt"} -------------------------------------------------------------------------------- /misc/text_embedding_models.json: -------------------------------------------------------------------------------- 1 | { 2 | "w2v": { 3 | "weights_path": "data/text_models/GoogleNews-vectors-negative300.bin.gz", 4 | "dim": 300, 5 | "mirror": "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz" 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /configs/audiocaps/train-vggish-vggsound.json: -------------------------------------------------------------------------------- 1 | { 2 | "inherit_from": "configs/data_loader_audiocaps.json", 3 | "experts": { 4 | "modalities": [ 5 | "audio", 6 | "vggsound" 7 | ] 8 | }, 9 | "testing_file": "final_filtered_test_list.txt", 10 | "training_file": "train_list.txt" 11 | } -------------------------------------------------------------------------------- /configs/audiocaps/train-full-ce-scene-r2p1d.json: -------------------------------------------------------------------------------- 1 | { 2 | "inherit_from": "configs/data_loader_audiocaps.json", 3 | "experts": { 4 | "modalities": [ 5 | "scene.densenet161.0", 6 | "r2p1d.r2p1d-ig65m.0" 7 | ] 8 | }, 9 | "testing_file": "final_filtered_test_list.txt", 10 | "training_file": "train_list.txt" 11 | } -------------------------------------------------------------------------------- /configs/clotho/train-vggish-vggsound-moee.json: -------------------------------------------------------------------------------- 1 | { 2 | "inherit_from": "configs/data_loader_clotho.json", 3 | "experts": { 4 | "modalities": [ 5 | "audio", 6 | "vggsound" 7 | ] 8 | }, 9 | "arch": { 10 | "type": "CENet", 11 | "args": { 12 | "use_ce": "" 13 | } 14 | } 15 | } -------------------------------------------------------------------------------- /configs/clotho/train-full-ce-only-audio.json: -------------------------------------------------------------------------------- 1 | { 2 | "inherit_from": "configs/data_loader_clotho.json", 3 | "experts": { 4 | "modalities": [ 5 | "audio" 6 | ] 7 | }, 8 | "arch": { 9 | "type": "CENet", 10 | "args": { 11 | "use_ce": "", 12 | "mimic_ce_dims": 1 13 | } 14 | } 15 | } -------------------------------------------------------------------------------- /configs/queryd/train-full-ce-only-audio.json: -------------------------------------------------------------------------------- 1 | { 2 | "inherit_from": "configs/data_loader_queryd.json", 3 | "experts": { 4 | "modalities": [ 5 | "audio" 6 | ] 7 | }, 8 | "arch": { 9 | "type": "CENet", 10 | "args": { 11 | "use_ce": "", 12 | "mimic_ce_dims": 1 13 | } 14 | } 15 | } -------------------------------------------------------------------------------- /configs/audiocaps/train-full-ce-r2p1d-inst.json: -------------------------------------------------------------------------------- 1 | { 2 | "inherit_from": "configs/data_loader_audiocaps.json", 3 | "experts": { 4 | "modalities": [ 5 | "r2p1d.r2p1d-ig65m.0", 6 | "imagenet.resnext101_32x48d.0" 7 | ] 8 | }, 9 | "testing_file": "final_filtered_test_list.txt", 10 | "training_file": "train_list.txt" 11 | } -------------------------------------------------------------------------------- /configs/audiocaps/train-full-ce-scene-inst.json: -------------------------------------------------------------------------------- 1 | { 2 | "inherit_from": "configs/data_loader_audiocaps.json", 3 | "experts": { 4 | "modalities": [ 5 | 
"scene.densenet161.0", 6 | "imagenet.resnext101_32x48d.0" 7 | ] 8 | }, 9 | "testing_file": "final_filtered_test_list.txt", 10 | "training_file": "train_list.txt" 11 | } -------------------------------------------------------------------------------- /configs/querydsegments/train-full-ce-only-audio.json: -------------------------------------------------------------------------------- 1 | { 2 | "inherit_from": "configs/data_loader_querydsegments.json", 3 | "experts": { 4 | "modalities": [ 5 | "audio" 6 | ] 7 | }, 8 | "arch": { 9 | "type": "CENet", 10 | "args": { 11 | "use_ce": "", 12 | "mimic_ce_dims": 1 13 | } 14 | } 15 | } -------------------------------------------------------------------------------- /configs/audiocaps/train-full-ce-r2p1d-inst-vggish.json: -------------------------------------------------------------------------------- 1 | { 2 | "inherit_from": "configs/data_loader_audiocaps.json", 3 | "experts": { 4 | "modalities": [ 5 | "audio", 6 | "r2p1d.r2p1d-ig65m.0", 7 | "imagenet.resnext101_32x48d.0" 8 | ] 9 | }, 10 | "testing_file": "final_filtered_test_list.txt", 11 | "training_file": "train_list.txt" 12 | } -------------------------------------------------------------------------------- /configs/audiocaps/train-full-ce-r2p1d-inst-vggsound.json: -------------------------------------------------------------------------------- 1 | { 2 | "inherit_from": "configs/data_loader_audiocaps.json", 3 | "experts": { 4 | "modalities": [ 5 | "vggsound", 6 | "r2p1d.r2p1d-ig65m.0", 7 | "imagenet.resnext101_32x48d.0" 8 | ] 9 | }, 10 | "testing_file": "final_filtered_test_list.txt", 11 | "training_file": "train_list.txt" 12 | } -------------------------------------------------------------------------------- /configs/audiocaps/train-full-ce-scene-r2p1d-inst.json: -------------------------------------------------------------------------------- 1 | { 2 | "inherit_from": "configs/data_loader_audiocaps.json", 3 | "experts": { 4 | "modalities": [ 5 | "scene.densenet161.0", 6 | "r2p1d.r2p1d-ig65m.0", 7 | "imagenet.resnext101_32x48d.0" 8 | ] 9 | }, 10 | "testing_file": "final_filtered_test_list.txt", 11 | "training_file": "train_list.txt" 12 | } -------------------------------------------------------------------------------- /configs/audiocaps/train-full-ce-r2p1d-inst-vggish-vggsound.json: -------------------------------------------------------------------------------- 1 | { 2 | "inherit_from": "configs/data_loader_audiocaps.json", 3 | "experts": { 4 | "modalities": [ 5 | "audio", 6 | "r2p1d.r2p1d-ig65m.0", 7 | "imagenet.resnext101_32x48d.0", 8 | "vggsound" 9 | ] 10 | }, 11 | "testing_file": "final_filtered_test_list.txt", 12 | "training_file": "train_list.txt" 13 | } -------------------------------------------------------------------------------- /misc/datasets/querydsegments/tar_include.txt: -------------------------------------------------------------------------------- 1 | data/QuerYDSegments/structured-symlinks/aggregated_audio/vggish-raw.hickle 2 | data/QuerYDSegments/structured-symlinks/structured-symlinks/split_raw_captions_filtered.pkl 3 | data/QuerYDSegments/structured-symlinks/test_list.txt 4 | data/QuerYDSegments/structured-symlinks/text_embeddings/w2v.pkl 5 | data/QuerYDSegments/structured-symlinks/train_list.txt 6 | data/QuerYDSegments/structured-symlinks/val_list.txt 7 | -------------------------------------------------------------------------------- /configs/audiocaps/train-full-ce-scene-r2p1d-inst-vggsound.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "inherit_from": "configs/data_loader_audiocaps.json", 3 | "experts": { 4 | "modalities": [ 5 | "scene.densenet161.0", 6 | "vggsound", 7 | "r2p1d.r2p1d-ig65m.0", 8 | "imagenet.resnext101_32x48d.0" 9 | ] 10 | }, 11 | "testing_file": "final_filtered_test_list.txt", 12 | "training_file": "train_list.txt" 13 | } -------------------------------------------------------------------------------- /configs/audiocaps/train-vggish-vggsound-moee.json: -------------------------------------------------------------------------------- 1 | { 2 | "inherit_from": "configs/data_loader_audiocaps.json", 3 | "experts": { 4 | "modalities": [ 5 | "audio", 6 | "vggsound" 7 | ] 8 | }, 9 | "arch": { 10 | "type": "CENet", 11 | "args": { 12 | "use_ce": "" 13 | } 14 | }, 15 | "testing_file": "final_filtered_test_list.txt", 16 | "training_file": "train_list.txt" 17 | } -------------------------------------------------------------------------------- /configs/audiocaps/train-full-ce-only-audio.json: -------------------------------------------------------------------------------- 1 | { 2 | "inherit_from": "configs/data_loader_audiocaps.json", 3 | "experts": { 4 | "modalities": [ 5 | "audio" 6 | ] 7 | }, 8 | "arch": { 9 | "type": "CENet", 10 | "args": { 11 | "use_ce": "", 12 | "mimic_ce_dims": 1 13 | } 14 | }, 15 | "testing_file": "final_filtered_test_list.txt", 16 | "training_file": "train_list.txt" 17 | } -------------------------------------------------------------------------------- /configs/audiocaps/train-only-vggsound.json: -------------------------------------------------------------------------------- 1 | { 2 | "inherit_from": "configs/data_loader_audiocaps.json", 3 | "experts": { 4 | "modalities": [ 5 | "vggsound" 6 | ] 7 | }, 8 | "arch": { 9 | "type": "CENet", 10 | "args": { 11 | "use_ce": "", 12 | "mimic_ce_dims": 1 13 | } 14 | }, 15 | "testing_file": "final_filtered_test_list.txt", 16 | "training_file": "train_list.txt" 17 | } -------------------------------------------------------------------------------- /configs/audiocaps/train-full-ce-only-r2p1d.json: -------------------------------------------------------------------------------- 1 | { 2 | "inherit_from": "configs/data_loader_audiocaps.json", 3 | "experts": { 4 | "modalities": [ 5 | "r2p1d.r2p1d-ig65m.0" 6 | ] 7 | }, 8 | "arch": { 9 | "type": "CENet", 10 | "args": { 11 | "use_ce": "", 12 | "mimic_ce_dims": 1 13 | } 14 | }, 15 | "testing_file": "final_filtered_test_list.txt", 16 | "training_file": "train_list.txt" 17 | } -------------------------------------------------------------------------------- /configs/audiocaps/train-full-ce-only-scene.json: -------------------------------------------------------------------------------- 1 | { 2 | "inherit_from": "configs/data_loader_audiocaps.json", 3 | "experts": { 4 | "modalities": [ 5 | "scene.densenet161.0" 6 | ] 7 | }, 8 | "arch": { 9 | "type": "CENet", 10 | "args": { 11 | "use_ce": "", 12 | "mimic_ce_dims": 1 13 | } 14 | }, 15 | "testing_file": "final_filtered_test_list.txt", 16 | "training_file": "train_list.txt" 17 | } -------------------------------------------------------------------------------- /configs/audiocaps/train-full-ce-scene-r2p1d-inst-vggish-vggsound.json: -------------------------------------------------------------------------------- 1 | { 2 | "inherit_from": "configs/data_loader_audiocaps.json", 3 | "experts": { 4 | "modalities": [ 5 | "scene.densenet161.0", 6 | "audio", 7 | "r2p1d.r2p1d-ig65m.0", 8 | 
"imagenet.resnext101_32x48d.0", 9 | "vggsound" 10 | ] 11 | }, 12 | "testing_file": "final_filtered_test_list.txt", 13 | "training_file": "train_list.txt" 14 | } -------------------------------------------------------------------------------- /configs/audiocaps/train-full-ce-only-inst.json: -------------------------------------------------------------------------------- 1 | { 2 | "inherit_from": "configs/data_loader_audiocaps.json", 3 | "experts": { 4 | "modalities": [ 5 | "imagenet.resnext101_32x48d.0" 6 | ] 7 | }, 8 | "arch": { 9 | "type": "CENet", 10 | "args": { 11 | "use_ce": "", 12 | "mimic_ce_dims": 1 13 | } 14 | }, 15 | "testing_file": "final_filtered_test_list.txt", 16 | "training_file": "train_list.txt" 17 | } -------------------------------------------------------------------------------- /misc/datasets/activity-net/tar_include.txt: -------------------------------------------------------------------------------- 1 | data/activity-net/structured-symlinks/aggregated_audio/vggish-audio-raw.pickle 2 | data/activity-net/structured-symlinks/aggregated_facefeats_25fps_256px_stride1/face-avg.pickle 3 | data/activity-net/structured-symlinks/aggregated_ocr_feats/ocr-w2v.pkl 4 | data/activity-net/structured-symlinks/aggregated_text_feats/w2v.pkl 5 | data/activity-net/structured-symlinks/raw-captions-train-val_1.pkl 6 | data/activity-net/structured-symlinks/train_list.txt 7 | data/activity-net/structured-symlinks/val_1_list.txt 8 | -------------------------------------------------------------------------------- /misc/datasets/clotho/tar_include.txt: -------------------------------------------------------------------------------- 1 | data/CLOTHO/structured-symlinks/aggregated_audio/vggish-raw.hickle 2 | data/CLOTHO/structured-symlinks/aggregated_pann/pann-raw.hickle 3 | data/CLOTHO/structured-symlinks/aggregated_syncnet/syncnet-raw.hickle 4 | data/CLOTHO/structured-symlinks/aggregated_vggsound/vggsound-raw.hickle 5 | data/CLOTHO/structured-symlinks/structured-symlinks/raw-captions.pkl 6 | data/CLOTHO/structured-symlinks/test_list.txt 7 | data/CLOTHO/structured-symlinks/text_embeddings/w2v.pkl 8 | data/CLOTHO/structured-symlinks/train_list.txt 9 | data/CLOTHO/structured-symlinks/val_list.txt 10 | -------------------------------------------------------------------------------- /base/base_model.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import numpy as np 3 | from abc import abstractmethod 4 | 5 | 6 | class BaseModel(nn.Module): 7 | """ 8 | Base class for all models 9 | """ 10 | @abstractmethod 11 | def forward(self, *inputs): 12 | """ 13 | Forward pass logic 14 | 15 | :return: Model output 16 | """ 17 | raise NotImplementedError 18 | 19 | def __str__(self): 20 | """ 21 | Model prints with number of trainable parameters 22 | """ 23 | model_parameters = filter(lambda p: p.requires_grad, self.parameters()) 24 | params = sum([np.prod(p.size()) for p in model_parameters]) 25 | return super().__str__() + f"\nTrainable parameters: {params}" 26 | -------------------------------------------------------------------------------- /configs/activity-net/train-full-ce-audio-only.json: -------------------------------------------------------------------------------- 1 | { 2 | "inherit_from": "configs/data_loader_activity-net.json", 3 | "experts": { 4 | "modalities": [ 5 | "audio" 6 | ] 7 | }, 8 | "arch": { 9 | "type": "CENet", 10 | "args": { 11 | "use_ce": "", 12 | "mimic_ce_dims": 1 13 | } 14 | }, 15 | "trainer": { 16 | "epochs": 20 17 | }, 
18 | "optimizer": { 19 | "type": "Ranger", 20 | "args": { 21 | "lr": 0.01, 22 | "weight_decay": 1E-03 23 | } 24 | }, 25 | "loss": { 26 | "type": "MaxMarginRankingLoss", 27 | "args": { 28 | "margin": 0.2, 29 | "fix_norm": true 30 | } 31 | } 32 | } -------------------------------------------------------------------------------- /misc/experiments-queryd.json: -------------------------------------------------------------------------------- 1 | { 2 | 3 | "queryd-train-full-ce": ["c50d3616", "2021-05-28_15-24-39"], 4 | "queryd-train-full-ce-only-scene": ["766c0b81", "2021-05-28_15-39-29"], 5 | "queryd-train-full-ce-only-scene-audio": ["e576753f", "2021-05-28_16-20-15"], 6 | "queryd-train-full-ce-only-scene-audio-inst": ["e40f68bf", "2021-05-28_16-21-50"], 7 | "queryd-train-full-ce-only-scene-audio-inst-r2p1d": ["54ca249c", "2021-05-28_16-24-04"], 8 | "queryd-train-full-mnnet": ["7e1a7420", "2021-05-28_16-38-33"], 9 | "queryd-train-full-moee": ["ab5db961", "2021-05-28_15-32-38"], 10 | 11 | "querydsegments-train-full-ce": ["0d1b703c", "2021-05-28_15-26-57"], 12 | "querydsegments-train-full-mnnet": ["1404fc28", "2021-05-28_16-38-32"], 13 | "querydsegments-train-full-moee": ["7b3b466e", "2021-05-28_15-32-44"] 14 | 15 | 16 | } -------------------------------------------------------------------------------- /misc/yaspi_gpu_defaults.json: -------------------------------------------------------------------------------- 1 | { 2 | "recipe": "gpu-proc", 3 | "partition": "gpu", 4 | "time_limit": "96:00:00", 5 | "gen_script_dir": "data/slurm-gen-scripts", 6 | "mem": "100G", 7 | "gpus_per_task": 1, 8 | "cpus_per_task": 5, 9 | "throttle_array": 20, 10 | "ssh_forward": "", 11 | "log_dir": "data/slurm-logs", 12 | "use_custom_ray_tmp_dir": false, 13 | "refresh_logs": false, 14 | "exclude": "gnodef1,gnodee7,gnodef2,gnodee1,gnodee2,gnodee3,gnodee4,gnodee5,gnodee6,gnodee8,gnodeb1,gnodeb2,gnodeb3,gnodeb4,gnodeb5,gnodec1,gnodec2,gnodec3,gnodec4,gnodec5,gnodej1", 15 | "constraint_str": "", 16 | "prep": "", 17 | "env_setup": "export PYTHONPATH=\"${BASE}\":$PYTHONPATH; export PATH=\"${HOME}\"/local/anaconda3/condabin/:$PATH; source ~/local/anaconda3/etc/profile.d/conda.sh; conda activate pt37" 18 | } 19 | -------------------------------------------------------------------------------- /misc/datasets/queryd/tar_include.txt: -------------------------------------------------------------------------------- 1 | data/QuerYD/structured-symlinks/aggregated_audio/vggish-raw.hickle 2 | data/QuerYD/structured-symlinks/aggregated_imagenet_25fps_256px_stride1_offset0/resnext101_32x48d-avg.pickle 3 | data/QuerYD/structured-symlinks/aggregated_r2p1d_30fps_256px_stride32_offset0_inner_stride1/r2p1d-ig65m-avg.pickle 4 | data/QuerYD/structured-symlinks/aggregated_s3dg_10fps_256px_stride16_offset0_inner_stride1/s3dg-avg-logits.pickle 5 | data/QuerYD/structured-symlinks/aggregated_scene_25fps_256px_stride1_offset0/densenet161-avg.pickle 6 | data/QuerYD/structured-symlinks/raw_captions_combined_filtered.pkl 7 | data/QuerYD/structured-symlinks/text_embeddings/howto100m_mil_nce.pkl 8 | data/QuerYD/structured-symlinks/text_embeddings/w2v.pkl 9 | data/QuerYD/structured-symlinks/test_list.txt 10 | data/QuerYD/structured-symlinks/train_list.txt 11 | data/QuerYD/structured-symlinks/val_list.txt 12 | -------------------------------------------------------------------------------- /misc/datasets/audiocaps/tar_include.txt: -------------------------------------------------------------------------------- 1 | 
data/AudioCaps/structured-symlinks/aggregated_audio/vggish-raw.hickle 2 | data/AudioCaps/structured-symlinks/aggregated_imagenet_25fps_256px_stride1_offset0/resnext101_32x48d-avg.pickle 3 | data/AudioCaps/structured-symlinks/aggregated_pann/pann-raw.hickle 4 | data/AudioCaps/structured-symlinks/aggregated_r2p1d_30fps_256px_stride32_offset0_inner_stride1/r2p1d-ig65m-avg.pickle 5 | data/AudioCaps/structured-symlinks/aggregated_scene_25fps_256px_stride1_offset0/densenet161-avg.pickle 6 | data/AudioCaps/structured-symlinks/aggregated_syncnet/syncnet-raw.hickle 7 | data/AudioCaps/structured-symlinks/aggregated_vggsound/vggsound-raw.hickle 8 | data/AudioCaps/structured-symlinks/filtered_val_list.txt 9 | data/AudioCaps/structured-symlinks/final_filtered_test_list.txt 10 | data/AudioCaps/structured-symlinks/structured-symlinks/raw-captions.pkl 11 | data/AudioCaps/structured-symlinks/text_embeddings/w2v.pkl 12 | data/AudioCaps/structured-symlinks/train_list.txt 13 | -------------------------------------------------------------------------------- /logger/logger_config.json: -------------------------------------------------------------------------------- 1 | 2 | { 3 | "version": 1, 4 | "disable_existing_loggers": false, 5 | "formatters": { 6 | "simple": {"format": "%(message)s"}, 7 | "datetime": {"format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s"} 8 | }, 9 | "handlers": { 10 | "console": { 11 | "class": "logging.StreamHandler", 12 | "level": "DEBUG", 13 | "formatter": "simple", 14 | "stream": "ext://sys.stdout" 15 | }, 16 | "info_file_handler": { 17 | "class": "logging.handlers.RotatingFileHandler", 18 | "level": "INFO", 19 | "formatter": "datetime", 20 | "filename": "info.log", 21 | "maxBytes": 10485760, 22 | "backupCount": 20, "encoding": "utf8" 23 | } 24 | }, 25 | "root": { 26 | "level": "INFO", 27 | "handlers": [ 28 | "console", 29 | "info_file_handler" 30 | ] 31 | } 32 | } -------------------------------------------------------------------------------- /logger/logger.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import logging.config 4 | from pathlib import Path 5 | from utils import read_json 6 | 7 | 8 | def setup_logging(save_dir, log_config='logger/logger_config.json', 9 | default_level=logging.INFO): 10 | """Setup logging configuration.""" 11 | print(os.getcwd()) 12 | log_config = Path(log_config) 13 | print(f"log config: {log_config} exists: {log_config.exists()}") 14 | if log_config.is_file(): 15 | config = read_json(log_config) 16 | # modify logging paths based on run config 17 | for _, handler in config['handlers'].items(): 18 | if 'filename' in handler: 19 | handler['filename'] = str(save_dir / handler['filename']) 20 | 21 | logging.config.dictConfig(config) 22 | else: 23 | print(f"Warning: logging configuration file is not found in {log_config}.") 24 | logging.basicConfig(level=default_level) 25 | return config["handlers"]["info_file_handler"]["filename"] 26 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # symlinked data 2 | data 3 | 4 | # ignore predictions 5 | pred 6 | 7 | # local dev files 8 | scratch/ 9 | 10 | # nuisance files 11 | *.DS_Store 12 | .nfs* 13 | __pycache__ 14 | 15 | # generated files 16 | misc/slurm/scripts/slurm-job.sh 17 | .vscode/tags 18 | 19 | # exclude files that are not for release 20 | hp* 21 | hc* 22 | 23 | # exclude unsupported datasets 24 
| configs/mit 25 | configs/mmit 26 | data_loader/MIT* 27 | data_loader/MMIT* 28 | configs/templates 29 | misc/ablations-template.md 30 | misc/ablations.md 31 | misc/README-ablations-template.md 32 | misc/README-audiocaps-ablations-template.md 33 | misc/README-model-study.md 34 | misc/README-queryd-ablations-template.md 35 | misc/README-template.md 36 | 37 | slurm 38 | 39 | .vscode 40 | 41 | # exclude long video tar lists to avoid an overly heavy git repo 42 | misc/cvpr2020_challenge/datasets/activity-net/challenge-release-1/video_tar_include.txt 43 | misc/cvpr2020_challenge/datasets/MSVD/challenge-release-1/video_tar_include.txt 44 | misc/cvpr2020_challenge/datasets/DiDeMo/challenge-release-1/video_tar_include.txt 45 | misc/cvpr2020_challenge/datasets/MSRVTT/challenge-release-1/video_tar_include.txt 46 | misc/cvpr2020_challenge/datasets/YouCook2/challenge-release-1/video_tar_include.txt 47 | -------------------------------------------------------------------------------- /requirements/requirements.txt: -------------------------------------------------------------------------------- 1 | backcall==0.2.0 2 | beartype==0.7.0 3 | certifi==2021.5.30 4 | chardet==4.0.0 5 | colored==1.4.2 6 | common-cmplr-lib-rt==2021.2.0 7 | common-cmplr-lic-rt==2021.2.0 8 | cycler==0.10.0 9 | decorator==5.0.9 10 | dill==0.3.3 11 | dominate==2.6.0 12 | dpcpp-cpp-rt==2021.2.0 13 | h5py==2.10.0 14 | hickle==4.0.4 15 | humanize==3.7.1 16 | idna==2.10 17 | intel-openmp==2021.2.0 18 | ipdb==0.13.9 19 | ipython==7.24.1 20 | ipython-genutils==0.2.0 21 | jedi==0.18.0 22 | joblib==1.0.1 23 | kiwisolver==1.3.1 24 | matplotlib==3.4.2 25 | matplotlib-inline==0.1.2 26 | mergedeep==1.3.4 27 | mkl==2021.2.0 28 | mkl-fft==1.3.0 29 | mkl-random==1.2.2 30 | mkl-service==2.4.0 31 | mock==4.0.3 32 | msgpack==1.0.2 33 | msgpack-numpy==0.4.7.1 34 | numpy==1.20.3 35 | opencl-rt==2021.2.0 36 | ordered-set==4.0.2 37 | pandas==1.0.3 38 | parso==0.8.2 39 | pexpect==4.8.0 40 | pickleshare==0.7.5 41 | Pillow==8.2.0 42 | prompt-toolkit==3.0.18 43 | protobuf==3.17.3 44 | psutil==5.8.0 45 | ptyprocess==0.7.0 46 | Pygments==2.9.0 47 | PyLaTeX==1.4.1 48 | pyparsing==2.4.7 49 | python-dateutil==2.8.1 50 | pytorch-swats==0.1.0 51 | pytz==2021.1 52 | PyYAML==5.4.1 53 | requests==2.25.1 54 | scikit-learn==0.24.2 55 | scipy==1.6.3 56 | seaborn==0.11.1 57 | six==1.16.0 58 | tailf==0.3.2 59 | tbb==2021.2.0 60 | tensorboardX==2.2 61 | threadpoolctl==2.1.0 62 | toml==0.10.2 63 | torch==1.7.1 64 | torchvision==0.8.2 65 | tqdm==4.61.1 66 | traitlets==5.0.5 67 | typeguard==2.12.1 68 | typing-extensions==3.10.0.0 69 | urllib3==1.26.5 70 | watchlogs==0.1.3.21 71 | wcwidth==0.2.5 72 | wget==3.2 73 | yaspi==0.0.5 74 | zsvision==0.7.8 75 | -------------------------------------------------------------------------------- /misc/datasets/queryd/README.md: -------------------------------------------------------------------------------- 1 | ## Pretrained Experts 2 | 3 | This folder contains a collection of features, extracted from the QuerYD [2] dataset as part of the paper: 4 | *QuerYD: A video dataset with high-quality textual and audio narrations*. 
5 | 6 | ### Training splits 7 | 8 | The training splits are given in the files linked below: 9 | 10 | * [train_list.txt](train_list.txt) (1796 videos) 11 | * [val_list.txt](val_list.txt) (384 videos) 12 | * [test_list.txt](test_list.txt) (386 videos) 13 | 14 | 15 | **Tar contents** 16 | 17 | The compressed tar file (402MB) can be downloaded from: 18 | 19 | ``` 20 | http://www.robots.ox.ac.uk/~vgg/research/collaborative-experts/data/features-v2/QuerYD-experts.tar.gz 21 | sha1sum: 0207ea85eeb52a4f50b06a31af28484afe4d9e86 22 | ``` 23 | A list of the contents of the tar file is given in [tar_include.txt](tar_include.txt). 24 | 25 | 26 | ### References: 27 | 28 | [1] If you use these features, please consider citing: 29 | ``` 30 | @inproceedings{Liu2019a, 31 | author = {Liu, Y. and Albanie, S. and Nagrani, A. and Zisserman, A.}, 32 | booktitle = {British Machine Vision Conference}, 33 | title = {Use What You Have: Video retrieval using representations from collaborative experts}, 34 | date = {2019}, 35 | } 36 | ``` 37 | 38 | [2] Please also consider citing the original QuerYD dataset, which was described in: 39 | 40 | ``` 41 | @misc{oncescu2020queryd, 42 | title={QuerYD: A video dataset with high-quality textual and audio narrations}, 43 | author={Andreea-Maria Oncescu and João F. Henriques and Yang Liu and Andrew Zisserman and Samuel Albanie}, 44 | year={2020}, 45 | } 46 | ``` -------------------------------------------------------------------------------- /misc/datasets/querydsegments/README.md: -------------------------------------------------------------------------------- 1 | ## Pretrained Experts 2 | 3 | This folder contains a collection of features, extracted from the QuerYD [2] dataset as part of the paper: 4 | *QuerYD: A video dataset with high-quality textual and audio narrations*. 5 | 6 | ### Training splits 7 | 8 | The training splits are given in the files linked below: 9 | 10 | * [train_list.txt](train_list.txt) (9113 videos) 11 | * [val_list.txt](val_list.txt) (1952 videos) 12 | * [test_list.txt](test_list.txt) (1954 videos) 13 | 14 | 15 | **Tar contents** 16 | 17 | The compressed tar file (244MB) can be downloaded from: 18 | 19 | ``` 20 | https://www.robots.ox.ac.uk/~vgg/research/collaborative-experts/data/features-v2/QuerYDSegments-experts.tar.gz 21 | sha1sum: f2be088890294f92355ccfe109f824d814cf2cd5 22 | ``` 23 | A list of the contents of the tar file is given in [tar_include.txt](tar_include.txt). 24 | 25 | 26 | ### References: 27 | 28 | [1] If you use these features, please consider citing: 29 | ``` 30 | @inproceedings{Liu2019a, 31 | author = {Liu, Y. and Albanie, S. and Nagrani, A. and Zisserman, A.}, 32 | booktitle = {British Machine Vision Conference}, 33 | title = {Use What You Have: Video retrieval using representations from collaborative experts}, 34 | date = {2019}, 35 | } 36 | ``` 37 | 38 | [2] Please also consider citing the original QuerYD dataset, which was described in: 39 | 40 | ``` 41 | @misc{oncescu2020queryd, 42 | title={QuerYD: A video dataset with high-quality textual and audio narrations}, 43 | author={Andreea-Maria Oncescu and João F.
Henriques and Yang Liu and Andrew Zisserman and Samuel Albanie}, 44 | year={2020}, 45 | } 46 | ``` -------------------------------------------------------------------------------- /misc/exps-names.md: -------------------------------------------------------------------------------- 1 | ## Additional instructions for running the commands provided in the main README 2 | 3 | ### Downloading required features and splits: 4 | ``` 5 | python3 misc/sync_experts.py --dataset AudioCaps 6 | python3 misc/sync_experts.py --dataset CLOTHO 7 | python3 misc/sync_experts.py --dataset activity-net 8 | python3 misc/sync_experts.py --dataset QuerYDSegments 9 | ``` 10 | 11 | ### Finding the corresponding .json file names for evaluation of pre-trained models 12 | 13 | #### AudioCaps: 14 | |Experiment type | Model name| 15 | |---|---| 16 | |CE VGGish only | audiocaps-train-full-ce-only-audio| 17 | |CE VGGSound only | audiocaps-train-only-vggsound| 18 | |CE VGGish + VGGSound | audiocaps-train-vggish-vggsound| 19 | |MoEE VGGish + VGGSound | audiocaps-train-vggish-vggsound-moee| 20 | |CE Scene | audiocaps-train-full-ce-only-scene| 21 | |CE R2P1D | audiocaps-train-full-ce-only-r2p1d| 22 | |CE Inst | audiocaps-train-full-ce-only-inst| 23 | |CE Scene + R2P1D | audiocaps-train-full-ce-scene-r2p1d| 24 | |CE Scene + Inst | audiocaps-train-full-ce-scene-inst| 25 | |CE R2P1D + Inst | audiocaps-train-full-ce-r2p1d-inst| 26 | |CE R2P1D + Inst + VGGish | audiocaps-train-full-ce-r2p1d-inst-vggish| 27 | |CE R2P1D + Inst + VGGSound | audiocaps-train-full-ce-r2p1d-inst-vggsound| 28 | |CE R2P1D + Inst + VGGish + VGGSound | audiocaps-train-full-ce-r2p1d-inst-vggish-vggsound| 29 | 30 | #### CLOTHO: 31 | |Experiment type | Model name| 32 | |---|---| 33 | |CE VGGish only | clotho-train-full-ce-only-audio| 34 | |CE VGGish + VGGSound | clotho-train-vggish-vggsound| 35 | |MoEE VGGish + VGGSound | clotho-train-vggish-vggsound-moee| 36 | 37 | #### Activity-net: 38 | |Experiment type | Model name| 39 | |---|---| 40 | |CE VGGish only | activity-net-train-full-ce-audio-only| 41 | 42 | #### QuerYDSegments: 43 | |Experiment type | Model name| 44 | |---|---| 45 | |CE VGGish only | querydsegments-train-full-ce-only-audio| 46 | -------------------------------------------------------------------------------- /misc/find_latest_checkpoints.py: -------------------------------------------------------------------------------- 1 | """Simple aggregation script for experiments 2 | 3 | ipy misc/find_latest_checkpoints.py -- --dataset audiocaps 4 | """ 5 | import argparse 6 | from pathlib import Path 7 | from datetime import datetime 8 | 9 | 10 | def formatted_summary(dataset, exp_root, fname): 11 | try: 12 | summaries = list(Path(exp_root).glob(f"**/*{fname}")) 13 | summaries = [x for x in summaries if dataset in str(x)] 14 | except FileNotFoundError: 15 | fname = "summary-seed-1_seed-2_seed-3.json" 16 | summaries = list(Path(exp_root).glob(f"**/*{fname}")) 17 | summaries = [x for x in summaries if dataset in str(x)] 18 | print(f"Found {len(summaries)}") 19 | latest = {} 20 | time_format = "%Y-%m-%d_%H-%M-%S" 21 | for summary in summaries: 22 | rel_path = summary.relative_to(exp_root) 23 | key, group, timestamp = rel_path.parts[0], rel_path.parts[1], rel_path.parts[3] 24 | val = {"timestamp": timestamp, "group": group} 25 | if key in latest: 26 | prev_ts = datetime.strptime(latest[key]["timestamp"], time_format) 27 | curr_ts = datetime.strptime(timestamp, time_format) 28 | if curr_ts > prev_ts: 29 | latest[key] = val 30 |
else: 31 | latest[key] = val 32 | for key, val in sorted(latest.items()): 33 | ts, group = val["timestamp"], val["group"] 34 | print(f'"{key}": ["{group}", "{ts}"],') 35 | 36 | 37 | def main(): 38 | parser = argparse.ArgumentParser() 39 | parser.add_argument("--dataset", default="audiocaps") 40 | parser.add_argument("--exp_root", default="data/saved/log") 41 | parser.add_argument("--fname", default="summary-seed-0_seed-1_seed-2.json") 42 | args = parser.parse_args() 43 | 44 | formatted_summary( 45 | fname=args.fname, 46 | dataset=args.dataset, 47 | exp_root=args.exp_root, 48 | ) 49 | 50 | 51 | if __name__ == "__main__": 52 | main() 53 | -------------------------------------------------------------------------------- /misc/experiments-audiocaps.json: -------------------------------------------------------------------------------- 1 | { 2 | "audiocaps-train-full-ce-only-audio": ["c0b5bc86", "2021-06-10_15-34-48"], 3 | "audiocaps-train-full-ce-only-inst": ["5ee05383", "2021-06-10_15-32-29"], 4 | "audiocaps-train-full-ce-only-r2p1d": ["88d3ab9e", "2021-06-10_15-30-03"], 5 | "audiocaps-train-full-ce-only-scene": ["74d71d8b", "2021-06-10_15-27-11"], 6 | "audiocaps-train-full-ce-r2p1d-inst": ["cf11d710", "2021-06-10_15-23-04"], 7 | "audiocaps-train-full-ce-r2p1d-inst-vggish": ["74991f95", "2021-06-10_15-06-31"], 8 | "audiocaps-train-full-ce-r2p1d-inst-vggish-vggsound": ["b51f941a", "2021-06-10_14-56-45"], 9 | "audiocaps-train-full-ce-r2p1d-inst-vggsound": ["1b623fdc", "2021-06-10_14-49-00"], 10 | "audiocaps-train-full-ce-scene-inst": ["55c40cc6", "2021-06-10_15-18-50"], 11 | "audiocaps-train-full-ce-scene-r2p1d": ["b2b14107", "2021-06-10_15-13-04"], 12 | "audiocaps-train-only-vggsound": ["afab0e0c", "2021-06-16_01-21-37"], 13 | "audiocaps-train-vggish-vggsound": ["7e2eda12", "2021-06-09_17-06-26"], 14 | "audiocaps-train-vggish-vggsound-moee": ["f66525f8", "2021-06-09_16-44-00"], 15 | "audiocaps-train-vggish-vggsound-train_list_10": ["68747f8c", "2021-06-10_11-02-21"], 16 | "audiocaps-train-vggish-vggsound-train_list_25": ["0151ad7f", "2021-06-10_11-14-25"], 17 | "audiocaps-train-vggish-vggsound-train_list_50": ["4aeeaa0d", "2021-06-10_11-27-36"], 18 | "audiocaps-train-vggish-vggsound-train_list_75": ["3a8d0584", "2021-06-10_11-45-26"], 19 | 20 | "clotho-train-full-ce-only-audio": ["4f58ef05", "2021-06-10_15-38-28"], 21 | "clotho-train-vggish-vggsound": ["dec0c820", "2021-06-10_14-45-51"], 22 | "clotho-train-vggish-vggsound-moee": ["fafa3e91", "2021-06-10_14-44-51"], 23 | "clotho-train-vggish-vggsound-finetuned": ["74560a6c", "2021-06-10_16-38-40"], 24 | "clotho-train-vggish-vggsound-moee-finetuned": ["5395fa47", "2021-06-10_16-36-13"], 25 | 26 | 27 | "querydsegments-train-full-ce-only-audio": ["70111434", "2021-06-10_14-33-03"], 28 | "activity-net-train-full-ce-audio-only": ["e8639db7", "2021-06-11_12-23-42"] 29 | } -------------------------------------------------------------------------------- /misc/experiments_teachText.json: -------------------------------------------------------------------------------- 1 | { 2 | 3 | "msrvtt-train-gpt2-xl-finetuned-mte-denoising-adam-less80": ["c58ecf3b", "2020-11-21_14-00-26"], 4 | "msrvtt-train-gpt2-xl-finetuned-denoising-adam": ["a61447a9", "2020-11-11_05-31-29"], 5 | "msrvtt-train-gpt2-xl-finetuned-mte-denoising-adam": ["2cc98676", "2020-11-11_06-21-03"], 6 | "msrvtt-train-full-ce": ["6becbb74", "2020-06-28_18-31-21"], 7 | "msrvtt-train-ce-intra-mte": ["4d4508a2", "2020-11-06_17-27-00"], 8 | "msrvtt-train-gpt2-xl-finetuned-adam": ["244af891", 
"2020-10-01_12-22-00"], 9 | "msrvtt-train-gpt2-xl-finetuned-mte-adam": ["6427fd41", "2020-09-30_20-34-12"], 10 | 11 | "msvd-train-full-ce": ["2ae80bea", "2020-11-11_13-16-14"], 12 | "msvd-train-gpt2-xl-finetuned-adam": ["db396303", "2020-10-01_13-17-33"], 13 | "msvd-train-gpt2-xl-finetuned-mte-adam": ["0af2a1ed", "2020-09-30_21-30-15"], 14 | "msvd-train-ce-intra-mte": ["a3026a07", "2020-11-13_00-19-59"], 15 | "msvd-train-gpt2-xl-finetuned-denoising-adam": ["71686a77", "2020-11-11_12-19-27"], 16 | "msvd-train-gpt2-xl-finetuned-mte-denoising-adam": ["66dc5dff", "2020-11-11_12-57-29"], 17 | 18 | "didemo-train-full-ce": ["4ea49b50", "2020-06-28_20-04-46"], 19 | "didemo-train-gpt2-xl-finetuned-adam": ["616cf11b", "2020-10-01_13-31-57"], 20 | "didemo-train-gpt2-xl-finetuned-mte-adam": ["f004e587", "2020-09-30_20-19-13"], 21 | "didemo-train-ce-intra-mte": ["1a5a249f", "2020-11-06_19-12-39"], 22 | 23 | 24 | "lsmdc-train-full-ce": ["7af368b1", "2020-06-28_20-40-54"], 25 | "lsmdc-train-gpt2-xl-finetuned-mte-adam": ["38e65732", "2020-09-30_20-52-52"], 26 | "lsmdc-train-gpt2-xl-finetuned-adam": ["9e2c8afd", "2020-10-01_13-48-49"], 27 | "lsmdc-train-ce-intra-mte": ["1a5555af", "2020-11-06_19-32-23"], 28 | 29 | 30 | "activity-net-train-full-ce": ["9601c704", "2020-07-31_00-23-01"], 31 | "activity-net-train-gpt2-xl-finetuned-adam": ["a791f27d", "2020-10-01_13-42-29"], 32 | "activity-net-train-gpt2-xl-finetuned-mte-adam": ["87d04a50", "2020-10-01_08-48-36"], 33 | "activity-net-train-ce-intra-mte": ["620ad6b4", "2020-11-06_19-12-39"] 34 | } 35 | -------------------------------------------------------------------------------- /configs/data_loader_activity-net.json: -------------------------------------------------------------------------------- 1 | { 2 | "inherit_from": "configs/base_config.json", 3 | "eval_mode": "test_run", 4 | "experts": { 5 | "face_dim": 512, 6 | "text_feat": "w2v", 7 | "modalities": [ 8 | "imagenet.resnext101_32x48d.0", 9 | "imagenet.senet154.0", 10 | "scene.densenet161.0", 11 | "r2p1d.r2p1d-ig65m.0", 12 | "i3d.i3d.0", 13 | "face", 14 | "ocr", 15 | "audio", 16 | "speech" 17 | ] 18 | }, 19 | "arch": { 20 | "args": { 21 | "test_caption_mode": "indep", 22 | "use_ce": "pairwise", 23 | "use_mish": 1, 24 | "use_bn_reason": 1, 25 | "num_g_layers": 3, 26 | "num_h_layers": 0, 27 | "include_self": 1, 28 | "l2renorm": false, 29 | "randomise_feats": "", 30 | "vlad_clusters": { 31 | "text": 20, 32 | "audio": 16 33 | }, 34 | "ghost_clusters": { 35 | "text": 1 36 | } 37 | } 38 | }, 39 | "data_loader": { 40 | "args":{ 41 | "dataset_name": "ActivityNet", 42 | "root_feat_folder": "structured-symlinks", 43 | "data_dir": "data/activity-net", 44 | "split_name": "val1", 45 | "batch_size": 128, 46 | "fuse_captions": true, 47 | "num_test_captions": 1, 48 | "max_tokens": { 49 | "text": 20, 50 | "audio": 29 51 | } 52 | } 53 | }, 54 | "trainer": { 55 | "epochs": 40 56 | }, 57 | "optimizer": { 58 | "type": "Ranger", 59 | "args":{ 60 | "lr": 0.1, 61 | "weight_decay": 1e-3 62 | } 63 | }, 64 | "loss": { 65 | "type": "MaxMarginRankingLoss", 66 | "args": { 67 | "margin": 0.060496613740311816, 68 | "fix_norm": true 69 | } 70 | }, 71 | "eval_settings": { 72 | "data_loader": { 73 | "args":{ 74 | "split_name": "val1" 75 | } 76 | }, 77 | "tester": { 78 | "save_dir": "data/saved/", 79 | "verbosity": 2 80 | }, 81 | "disable_gpu": true 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /utils/cos_restart.py: 
-------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import numpy as np 4 | from torch.optim.optimizer import Optimizer, required 5 | from torch.optim.lr_scheduler import _LRScheduler 6 | 7 | class CosineAnnealingWithRestartsLR(_LRScheduler): 8 | 9 | r"""Set the learning rate of each parameter group using a cosine annealing 10 | schedule, where :math:`\eta_{max}` is set to the initial lr and 11 | :math:`T_{cur}` is the number of epochs since the last restart in SGDR: 12 | .. math:: 13 | \eta_t = \eta_{min} + \frac{1}{2}(\eta_{max} - \eta_{min})(1 + 14 | \cos(\frac{T_{cur}}{T_{max}}\pi)) 15 | When last_epoch=-1, sets initial lr as lr. 16 | It has been proposed in 17 | `SGDR: Stochastic Gradient Descent with Warm Restarts`_. This implements 18 | the cosine annealing part of SGDR, the restarts and number of iterations multiplier. 19 | Args: 20 | optimizer (Optimizer): Wrapped optimizer. 21 | T_max (int): Maximum number of iterations. 22 | T_mult (float): Multiply T_max by this number after each restart. Default: 1. 23 | eta_min (float): Minimum learning rate. Default: 0. 24 | last_epoch (int): The index of last epoch. Default: -1. 25 | .. _SGDR\: Stochastic Gradient Descent with Warm Restarts: 26 | https://arxiv.org/abs/1608.03983 27 | 28 | src: https://github.com/lkhphuc/pytorch-3d-point-cloud-generation/blob/ 29 | master/custom_scheduler.py 30 | """ 31 | 32 | def __init__(self, optimizer, T_max, eta_min=0, last_epoch=-1, T_mult=1): 33 | self.T_max = T_max 34 | self.T_mult = T_mult 35 | self.restart_every = T_max 36 | self.eta_min = eta_min 37 | self.restarts = 0 38 | self.restarted_at = 0 39 | super().__init__(optimizer, last_epoch) 40 | 41 | def restart(self): 42 | self.restart_every *= self.T_mult 43 | self.restarted_at = self.last_epoch 44 | 45 | def cosine(self, base_lr): 46 | return self.eta_min + (base_lr - self.eta_min) * \ 47 | (1 + math.cos(math.pi * self.step_n / self.restart_every)) / 2 48 | 49 | @property 50 | def step_n(self): 51 | return self.last_epoch - self.restarted_at 52 | 53 | def get_lr(self): 54 | if self.step_n >= self.restart_every: 55 | self.restart() 56 | return [self.cosine(base_lr) for base_lr in self.base_lrs] -------------------------------------------------------------------------------- /misc/generate_exps.py: -------------------------------------------------------------------------------- 1 | """A utility for generating experiment config files. 
2 | """ 3 | import json 4 | import copy 5 | import argparse 6 | import itertools 7 | from pathlib import Path 8 | from datetime import datetime 9 | from collections import OrderedDict 10 | 11 | 12 | def generate_configs(base_config, grid): 13 | job_queue = [] 14 | timestamp = datetime.now().strftime(r"%Y-%m-%d_%H-%M-%S") 15 | hparam_vals = [x for x in grid.values()] 16 | grid_vals = list(itertools.product(*hparam_vals)) 17 | hparams = list(grid.keys()) 18 | 19 | for cfg_vals in grid_vals: 20 | custom_tokens = [f"{hparam}@{val}" for hparam, val in zip(hparams, cfg_vals)] 21 | custom_args = "+".join(custom_tokens) 22 | job = f"--config {base_config} --custom_args {custom_args}" 23 | job_queue.append(job) 24 | 25 | job_queue_path = f"data/job-queues/latest.txt" 26 | Path(job_queue_path).parent.mkdir(exist_ok=True, parents=True) 27 | with open(str(job_queue_path), "w") as f: 28 | f.write("\n".join(job_queue)) 29 | print(f"Wrote {len(job_queue)} jobs to queue at {job_queue_path}") 30 | job_queue_path = f"data/job-queues/{Path(base_config).stem}-{timestamp}.txt" 31 | with open(str(job_queue_path), "w") as f: 32 | f.write("\n".join(job_queue)) 33 | print(f"Wrote backup {len(job_queue)} jobs to queue at {job_queue_path}") 34 | 35 | 36 | def parse_grid(key_val_strs): 37 | print(f"parsing grid str: {key_val_strs}") 38 | key_val_pairs = key_val_strs.split("+") 39 | parsed = OrderedDict() 40 | for pair in key_val_pairs: 41 | key, val_str = pair.split("@") 42 | vals = [] 43 | opts = [x for x in val_str.split(":")] 44 | for token in opts: 45 | if "," in token: 46 | val = [x for x in token.split(",") if x] 47 | else: 48 | val = token 49 | vals.append(val) 50 | parsed[key] = vals 51 | return parsed 52 | 53 | 54 | def main(): 55 | parser = argparse.ArgumentParser() 56 | parser.add_argument('--grid', default="") 57 | parser.add_argument('--config', default="configs/msrvtt/only-i3d.json") 58 | args = parser.parse_args() 59 | 60 | grid = parse_grid(args.grid) 61 | generate_configs( 62 | grid=grid, 63 | base_config=args.config, 64 | ) 65 | 66 | 67 | if __name__ == "__main__": 68 | main() 69 | -------------------------------------------------------------------------------- /model/mil_nce_net.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Tuple 2 | 3 | import torch 4 | from typeguard import typechecked 5 | 6 | from base import BaseModel 7 | 8 | 9 | class MNNet(BaseModel): 10 | 11 | @typechecked 12 | def __init__( 13 | self, 14 | text_dim: int, 15 | expert_dims: Dict[str, Tuple[int, int]], 16 | **_unused, 17 | ): 18 | self.text_dim = text_dim 19 | self.expert_dims = expert_dims 20 | self.modalities = list(expert_dims.keys()) 21 | super().__init__() 22 | self.dummy_param = torch.nn.Parameter(torch.ones(1) * 1E-5) 23 | 24 | @typechecked 25 | def forward( 26 | self, 27 | text: torch.Tensor, 28 | ind: Dict[str, torch.Tensor], 29 | experts: Dict[str, torch.Tensor], 30 | **_unused, 31 | ): 32 | self.sanity_checks(text=text, experts=experts, ind=ind) 33 | vid_embedding = next(iter(experts.values())) 34 | vid_embedding = self.dummy_param + vid_embedding 35 | text = text.view(text.shape[0] * text.shape[1], text.shape[-1]) 36 | # text = text / torch.norm(text, p=2, dim=1).reshape(-1, 1) 37 | # vid_embedding = vid_embedding / torch.norm(vid_embedding, p=2, 38 | # dim=1).reshape(-1, 1) 39 | sims = torch.matmul(text, vid_embedding.t()) 40 | return { 41 | "modalities": self.modalities, 42 | "cross_view_conf_matrix": sims, 43 | "text_embds": {self.modalities[0]: 
text}, 44 | "vid_embds": {self.modalities[0]: vid_embedding}, 45 | } 46 | 47 | @typechecked 48 | def sanity_checks( 49 | self, 50 | text: torch.Tensor, 51 | ind: Dict[str, torch.Tensor], 52 | experts: Dict[str, torch.Tensor], 53 | ): 54 | msg = f"Text dim {text.shape[-1]} did not match expected {self.text_dim}" 55 | assert text.shape[-1] == self.text_dim, msg 56 | assert len(experts) == 1, "Expected single modality experts" 57 | assert len(text.shape) == 4, "Expected four axes for text input" 58 | assert text.shape[2] == 1, "Expected singleton for text input on dim 2" 59 | for expert in self.expert_dims: 60 | msg = f"Expected all features to be present for {expert}" 61 | assert ind[expert].sum() == len(ind[expert]), msg 62 | feats = experts[expert] 63 | expected = self.expert_dims[expert] 64 | msg = f"Feature shape {feats.shape[1]} did not match expected {expected}" 65 | assert feats.shape[1] == expected[-1], msg 66 | -------------------------------------------------------------------------------- /misc/aggregate_logs_and_stats.py: -------------------------------------------------------------------------------- 1 | """Aggregate logs across multiple seeded runs and summarise their statistics. 2 | 3 | ipy misc/aggregate_logs_and_stats.py -- --group_id 3b737e0d 4 | """ 5 | import argparse 6 | import logging 7 | from pathlib import Path 8 | from collections import OrderedDict 9 | from utils.util import read_json 10 | from glob import glob 11 | from logger.log_parser import log_summary 12 | 13 | 14 | def summarise(group_id, log_dir="data/saved/log", model_dir="data/saved/models"): 15 | seeded_runs = sorted(list(Path(log_dir).glob(f"**/{group_id}/seed-*"))) 16 | print(f"Found a total of {len(seeded_runs)} seed runs in {group_id}") 17 | msg = f"Found no seeded runs for group_id: {group_id} in {log_dir}" 18 | assert len(seeded_runs) > 0, msg 19 | 20 | info_logs = OrderedDict() 21 | for seeded_run in seeded_runs: 22 | info_log_matches = list(Path(seeded_run).glob("**/info.log")) 23 | msg = f"expected to find a single info.log file, found {len(info_log_matches)}" 24 | assert len(info_log_matches) == 1, msg 25 | info_logs[seeded_run.stem] = info_log_matches[0] 26 | 27 | summary_log = [] 28 | for seeded_run, info_log_path in info_logs.items(): 29 | with open(info_log_path, "r") as f: 30 | log = f.read().splitlines() 31 | summary_log.extend(log) 32 | first_info_log = list(info_logs.values())[0] 33 | summary_log_name = f"summary-{'_'.join(list(info_logs.keys()))}.json" 34 | summary_log_path = first_info_log.parent / summary_log_name 35 | with open(summary_log_path, "w") as f: 36 | f.write("\n".join(summary_log)) 37 | print(f"Wrote concatenated logs to {summary_log_path}") 38 | 39 | # retrieve the config from the first run 40 | rel_path = first_info_log.relative_to(log_dir).parent 41 | config_path = Path(model_dir) / rel_path / "config.json" 42 | assert config_path.exists(), f"Could not find config at {config_path}" 43 | config = read_json(config_path) 44 | 45 | logger = logging.getLogger("summary") 46 | 47 | # some care is required with logging to avoid sending all experiment logs 48 | # to the same file. 
We avoid this by essentially resetting the logging utility 49 | 50 | # Remove all handlers associated with the root logger object 51 | for handler in logging.root.handlers[:]: 52 | logging.root.removeHandler(handler) 53 | logging.basicConfig(filename=summary_log_path, level=logging.INFO) 54 | if not logger.handlers: 55 | logger.addHandler(logging.StreamHandler()) 56 | 57 | log_summary( 58 | logger=logger, 59 | log_path=summary_log_path, 60 | eval_mode=config["eval_mode"], 61 | fixed_num_epochs=config["trainer"]["epochs"], 62 | ) 63 | 64 | 65 | def main(): 66 | parser = argparse.ArgumentParser() 67 | parser.add_argument("--group_id", default="ed53d01d") 68 | args = parser.parse_args() 69 | summarise(group_id=args.group_id) 70 | 71 | 72 | if __name__ == '__main__': 73 | main() 74 | -------------------------------------------------------------------------------- /misc/datasets/activity-net/README.md: -------------------------------------------------------------------------------- 1 | ## Pretrained Experts 2 | 3 | This folder contains a collection of features, extracted from the ActivityNet [2] and ActivityNet-captions [3] datasets as part of the paper: 4 | *Use what you have: Video retrieval using representations from collaborative experts*. 5 | 6 | ### Training splits 7 | 8 | The training splits were taken from [3] and are given in the files linked below: 9 | 10 | * [train_list.txt](train_list.txt) (10009 videos) 11 | * [val_1_list.txt](val_1_list.txt) (4917 videos) 12 | * [val_2_list.txt](val_2_list.txt) (4885 videos) 13 | 14 | In our work, we use the `train` split for training and the `val_1` split for evaluation (the `val_1` split forms a superset of the `val_2` split, with differing captions). 15 | 16 | 17 | **Tar contents** 18 | 19 | The compressed tar file (3.7 GiB) can be downloaded from: 20 | 21 | ``` 22 | http://www.robots.ox.ac.uk/~vgg/research/collaborative-experts/data/features-v2/activity-net-experts.tar.gz 23 | sha1sum: 2901046fa6a3d6f6393ee0047818e960fcfabd69 24 | ``` 25 | 26 | A list of the contents of the tar file is given in [tar_include.txt](tar_include.txt). 27 | 28 | [**Deprecated**] *The features made available with the previous code release are also available as a compressed tar file (3.8 GiB). These should be considered deprecated, since they are incompatible with the current codebase, but are still available and can be downloaded from:* 29 | 30 | ``` 31 | http://www.robots.ox.ac.uk/~vgg/research/collaborative-experts/data-deprecated/features/activity-net-experts.tar.gz 32 | sha1sum: b16685576c97cdec2783fb89ea30ca7d17abb021 33 | ``` 34 | 35 | 36 | ### References: 37 | 38 | [1] If you use these features, please consider citing: 39 | ``` 40 | @inproceedings{Liu2019a, 41 | author = {Liu, Y. and Albanie, S. and Nagrani, A.
and Zisserman, A.}, 42 | booktitle = {British Machine Vision Conference}, 43 | title = {Use What You Have: Video retrieval using representations from collaborative experts}, 44 | date = {2019}, 45 | } 46 | ``` 47 | 48 | [2] Please also consider citing the original ActivityNet dataset, which was described in: 49 | 50 | ``` 51 | @inproceedings{caba2015activitynet, 52 | title={Activitynet: A large-scale video benchmark for human activity understanding}, 53 | author={Caba Heilbron, Fabian and Escorcia, Victor and Ghanem, Bernard and Carlos Niebles, Juan}, 54 | booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition}, 55 | pages={961--970}, 56 | year={2015} 57 | } 58 | ``` 59 | 60 | [3] In addition, please consider citing the ActivityNet-captions dataset, which provides the text descriptions, and which was described in: 61 | 62 | ``` 63 | @inproceedings{krishna2017dense, 64 | title={Dense-captioning events in videos}, 65 | author={Krishna, Ranjay and Hata, Kenji and Ren, Frederic and Fei-Fei, Li and Carlos Niebles, Juan}, 66 | booktitle={Proceedings of the IEEE international conference on computer vision}, 67 | pages={706--715}, 68 | year={2017} 69 | } 70 | ``` -------------------------------------------------------------------------------- /misc/launch_exps_from_list.py: -------------------------------------------------------------------------------- 1 | """Launch a collection of experiments on SLURM from a text file. 2 | 3 | EXP_LIST=audio-retrieval-exps.txt 4 | ipy misc/launch_exps_from_list.py -- --exp_list "slurm/${EXP_LIST}" --yaspify 5 | 6 | """ 7 | import os 8 | import sys 9 | import json 10 | import argparse 11 | from pathlib import Path 12 | 13 | from yaspi.yaspi import Yaspi 14 | from utils.util import parse_grid, filter_cmd_args 15 | from misc.aggregate_logs_and_stats import summarise 16 | 17 | def main(): 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument("--exp_list", default="data/job-queues/latest.txt") 20 | parser.add_argument("--yaspify", action="store_true", help="launch via slurm") 21 | parser.add_argument("--slurm", action="store_true") 22 | parser.add_argument("--limit", type=int, default=0) 23 | parser.add_argument('--mini_train', action="store_true") 24 | parser.add_argument("--use_cnodes", action="store_true") 25 | parser.add_argument('--train_single_epoch', action="store_true") 26 | parser.add_argument("--yaspi_defaults_path", type=Path, 27 | default="misc/yaspi_gpu_defaults.json") 28 | parser.add_argument("--evaluation", type=str, default='train', choices=['train', 'test']) 29 | args = parser.parse_args() 30 | 31 | # construct list of experiments from text file 32 | with open(args.exp_list, "r") as f: 33 | custom_args = f.read().splitlines() 34 | # remove blank lines 35 | custom_args = [x for x in custom_args if x] 36 | 37 | if args.limit: 38 | custom_args = custom_args[:args.limit] 39 | 40 | parsed = {} 41 | for line in custom_args: 42 | parsed.update(parse_grid(line, args.evaluation)) 43 | 44 | # flatten all parsed experiments 45 | custom_args = [x for group in parsed.values() for x in group] 46 | 47 | cmd_args = sys.argv[1:] 48 | remove = ["--yaspify", "--exp_list", "--use_cnodes", "--evaluation"] 49 | cmd_args = filter_cmd_args(cmd_args, remove=remove) 50 | base_cmd = f"python {args.evaluation}.py {' '.join(cmd_args)}" 51 | 52 | if args.yaspify: 53 | with open(args.yaspi_defaults_path, "r") as f: 54 | yaspi_defaults = json.load(f) 55 | if args.use_cnodes: 56 | yaspi_defaults.update({"partition": "compute", 
"gpus_per_task": 0}) 57 | job_name = f"{Path(args.exp_list).stem}-{len(custom_args)}-exps" 58 | job_queue = [f'"{x}"' for x in custom_args] 59 | job_queue = " ".join(job_queue) 60 | job = Yaspi( 61 | cmd=base_cmd, 62 | job_queue=job_queue, 63 | job_name=job_name, 64 | job_array_size=len(custom_args), 65 | **yaspi_defaults, 66 | ) 67 | job.submit(watch=True, conserve_resources=5) 68 | else: 69 | for custom_args_ in custom_args: 70 | base_cmd = f"{base_cmd} {custom_args_}" 71 | print(f"Running cmd: {base_cmd}") 72 | os.system(base_cmd) 73 | if args.evaluation =='train': 74 | for group_id in parsed: 75 | summarise(group_id=group_id) 76 | 77 | 78 | if __name__ == "__main__": 79 | main() 80 | -------------------------------------------------------------------------------- /logger/visualization.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | from utils import Timer 3 | 4 | 5 | class TensorboardWriter(): 6 | def __init__(self, log_dir, logger, enabled): 7 | self.writer = None 8 | self.selected_module = "" 9 | 10 | if enabled: 11 | log_dir = str(log_dir) 12 | 13 | # Retrieve vizualization writer 14 | succeeded = False 15 | for module in ["torch.utils.tensorboard", "tensorboardX"]: 16 | try: 17 | self.writer = importlib.import_module(module).SummaryWriter(log_dir) 18 | succeeded = True 19 | break 20 | except ImportError: 21 | succeeded = False 22 | self.selected_module = module 23 | 24 | if not succeeded: 25 | message = ("Warning: visualization (Tensorboard) is configured to use, " 26 | "but currently not installed on this machine. Please install" 27 | " either TensorboardX with 'pip install tensorboardx', " 28 | " upgrade PyTorch to version >= 1.1 for using " 29 | "'torch.utils.tensorboard' or turn off the option in " 30 | "the 'config.json' file.") 31 | logger.warning(message) 32 | 33 | self.step = 0 34 | self.mode = '' 35 | 36 | self.tb_writer_ftns = { 37 | 'add_scalar', 'add_scalars', 'add_image', 'add_images', 'add_audio', 38 | 'add_text', 'add_histogram', 'add_pr_curve', 'add_embedding' 39 | } 40 | self.tag_mode_exceptions = {'add_histogram', 'add_embedding'} 41 | 42 | self.timer = Timer() 43 | 44 | def set_step(self, step, mode='train'): 45 | self.mode = mode 46 | self.step = step 47 | if step == 0: 48 | self.timer.reset() 49 | else: 50 | duration = self.timer.check() 51 | self.add_scalar('steps_per_sec', 1 / duration) 52 | 53 | def __getattr__(self, name): 54 | """ 55 | If visualization is configured to use: 56 | return add_data() methods of tensorboard with additional information 57 | (step, tag) added. 58 | Otherwise: 59 | return a blank function handle that does nothing 60 | """ 61 | if name in self.tb_writer_ftns: 62 | add_data = getattr(self.writer, name, None) 63 | 64 | def wrapper(tag, data, *args, **kwargs): 65 | if add_data is not None: 66 | # add mode(train/valid) tag 67 | if name not in self.tag_mode_exceptions: 68 | tag = '{}/{}'.format(tag, self.mode) 69 | add_data(tag, data, self.step, *args, **kwargs) 70 | return wrapper 71 | else: 72 | # default action for returning methods defined in this class, set_step() 73 | # for instance. 
74 | try: 75 | attr = object.__getattr__(name) 76 | except AttributeError: 77 | msg = "type object '{}' has no attribute '{}'" 78 | raise AttributeError(msg.format(self.selected_module, name)) 79 | return attr 80 | -------------------------------------------------------------------------------- /model/text_embedding_models.json: -------------------------------------------------------------------------------- 1 | { 2 | "w2v": { 3 | "weights_path": "data/text_models/GoogleNews-vectors-negative300.bin.gz", 4 | "dim": 300, 5 | "force_cpu": true, 6 | "remove_stopwords": false, 7 | "mirror": "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz" 8 | }, 9 | "grovle": { 10 | "dim": 300, 11 | "weights_path": "data/text_models/grovle.zip", 12 | "force_cpu": true, 13 | "remove_stopwords": false, 14 | "mirror": "http://www.robots.ox.ac.uk/~albanie/data/mirrors/GrOVLE" 15 | }, 16 | "mt_grovle": { 17 | "dim": 300, 18 | "weights_path": "data/text_models/mt_grovle.zip", 19 | "force_cpu": true, 20 | "remove_stopwords": false, 21 | "mirror": "http://www.robots.ox.ac.uk/~albanie/data/mirrors/GrOVLE" 22 | }, 23 | "hglmm_300d": { 24 | "dim": 300, 25 | "weights_path": "data/text_models/hglmm_300d.zip", 26 | "force_cpu": true, 27 | "remove_stopwords": false, 28 | "mirror": "http://www.robots.ox.ac.uk/~albanie/data/mirrors/GrOVLE" 29 | }, 30 | "hglmm_6kd": { 31 | "dim": 6000, 32 | "weights_path": "data/text_models/hglmm_6kd.zip", 33 | "force_cpu": true, 34 | "remove_stopwords": false, 35 | "mirror": "http://www.robots.ox.ac.uk/~albanie/data/mirrors/GrOVLE" 36 | }, 37 | "howto100m_mil_nce": { 38 | "word_dict_path": "data/text_models/howto100m/s3d_dict.npy", 39 | "weights_path": "data/text_models/howto100m/s3d_howto100m.pth", 40 | "dim": 512, 41 | "mirror": "https://www.rocq.inria.fr/cluster-willow/amiech/howto100m" 42 | }, 43 | "openai": { 44 | "dim": 768, 45 | "custom_pipeline": true 46 | }, 47 | "electra": { 48 | "dim": 256 49 | }, 50 | "openai-gpt": { 51 | "dim": 768 52 | }, 53 | "gpt2": { 54 | "dim": 768 55 | }, 56 | "gpt2-medium": { 57 | "dim": 1024 58 | }, 59 | "gpt2-large": { 60 | "dim": 1280 61 | }, 62 | "gpt2-xl": { 63 | "dim": 1600 64 | }, 65 | "gpt2-xl-finetune": { 66 | "dim": 1600 67 | }, 68 | "bert-base-uncased": { 69 | "dim": 768 70 | }, 71 | "t5-small": { 72 | "dim": 512 73 | }, 74 | "t5-base": { 75 | "dim": 768 76 | }, 77 | "t5-large": { 78 | "dim": 1024 79 | }, 80 | "t5-3b": { 81 | "dim": 1024 82 | }, 83 | "t5-11b": { 84 | "force_cpu": true, 85 | "dim": 1024 86 | }, 87 | "albert-base-v2": { 88 | "dim": 768 89 | }, 90 | "albert-large-v2": { 91 | "dim": 1024 92 | }, 93 | "albert-xlarge-v2": { 94 | "dim": 2048 95 | }, 96 | "ctrl": { 97 | "dim": 1280 98 | }, 99 | "roberta-base": { 100 | "dim": 768 101 | }, 102 | "roberta-large": { 103 | "dim": 1024 104 | }, 105 | "xlnet-base-cased": { 106 | "dim": 768 107 | }, 108 | "xlnet-large-cased": { 109 | "dim": 1024 110 | }, 111 | "transfo-xl-wt103": { 112 | "dim": 1024 113 | } 114 | } 115 | -------------------------------------------------------------------------------- /model/loss.py: -------------------------------------------------------------------------------- 1 | """This module contains an implementation of the max margin ranking loss, slightly 2 | modified from this code: 3 | https://github.com/antoine77340/Mixture-of-Embedding-Experts/blob/master/loss.py 4 | 5 | The modification is the `fix_norm` conditional, which removes zero terms from the 6 | diagonal when performing the averaging calculation. 
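For orientation, here is a minimal usage sketch. It is an illustration only, not part of the original module: the 4 x 4 similarity matrix is made up, and it assumes the `MaxMarginRankingLoss` class defined further down this file. The loss expects a square text-video similarity matrix whose diagonal holds the matching pairs.

```python
import torch as th

# toy batch of 4 text-video pairs: sims[i, j] is the similarity between
# caption i and video j, so the diagonal holds the ground-truth matches
sims = th.randn(4, 4, requires_grad=True)

loss_fn = MaxMarginRankingLoss(margin=1, fix_norm=True)  # class defined below
loss = loss_fn(sims)  # mean hinge violation over (positive, negative) pairs
loss.backward()
print(float(loss))
```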
7 | 8 | Original licence below. 9 | """ 10 | # Copyright 2018 Antoine Miech All Rights Reserved. 11 | # 12 | # Licensed under the Apache License, Version 2.0 (the "License"); 13 | # you may not use this file except in compliance with the License. 14 | # You may obtain a copy of the License at 15 | # 16 | # http://www.apache.org/licenses/LICENSE-2.0 17 | # 18 | # Unless required by applicable law or agreed to in writing, software 19 | # distributed under the License is distributed on an "AS-IS" BASIS, 20 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 21 | # See the License for the specific language governing permissions and 22 | # limitations under the License. 23 | 24 | import torch.nn as nn 25 | import torch as th 26 | import torch.nn.functional as F 27 | 28 | 29 | class MaxMarginRankingLoss(nn.Module): 30 | 31 | def __init__(self, margin=1, fix_norm=True): 32 | super().__init__() 33 | self.fix_norm = fix_norm 34 | self.loss = th.nn.MarginRankingLoss(margin) 35 | self.margin = margin 36 | 37 | def forward(self, x): 38 | n = x.size()[0] 39 | 40 | x1 = th.diag(x) 41 | x1 = x1.unsqueeze(1) 42 | x1 = x1.expand(n, n) 43 | x1 = x1.contiguous().view(-1, 1) 44 | x1 = th.cat((x1, x1), 0) 45 | 46 | x2 = x.view(-1, 1) 47 | x3 = x.transpose(0, 1).contiguous().view(-1, 1) 48 | 49 | x2 = th.cat((x2, x3), 0) 50 | max_margin = F.relu(self.margin - (x1 - x2)) 51 | 52 | if self.fix_norm: 53 | # remove the elements from the diagonal 54 | keep = th.ones(x.shape) - th.eye(x.shape[0]) # 128 x 128 55 | keep1 = keep.view(-1, 1) 56 | keep2 = keep.transpose(0, 1).contiguous().view(-1, 1) 57 | keep_idx = th.nonzero(th.cat((keep1, keep2), 0).flatten()).flatten() 58 | if x1.is_cuda: 59 | keep_idx = keep_idx.cuda() 60 | x1_ = th.index_select(x1, dim=0, index=keep_idx) 61 | x2_ = th.index_select(x2, dim=0, index=keep_idx) 62 | max_margin = F.relu(self.margin - (x1_ - x2_)) 63 | 64 | return max_margin.mean() 65 | 66 | 67 | class BCEWithLogitsLoss(nn.Module): 68 | 69 | def __init__(self, weight=None): 70 | super().__init__() 71 | self.loss = th.nn.BCEWithLogitsLoss(weight=weight) 72 | 73 | def forward(self, x, target): 74 | return self.loss(x, target) 75 | 76 | 77 | class CrossEntropyLoss(nn.Module): 78 | 79 | def __init__(self, weight=None): 80 | super().__init__() 81 | self.loss = th.nn.CrossEntropyLoss(weight=weight) 82 | 83 | def forward(self, x, target): 84 | return self.loss(x, target.long().to(x.device)) 85 | 86 | 87 | if __name__ == "__main__": 88 | loss = BCEWithLogitsLoss() 89 | x = th.randn(3, requires_grad=True) 90 | target = th.empty(3).random_(2) 91 | output = loss(x, target) 92 | output.backward() 93 | print(target) 94 | -------------------------------------------------------------------------------- /model/net_vlad.py: -------------------------------------------------------------------------------- 1 | """NetVLAD implementation. 2 | """ 3 | # Copyright 2018 Antoine Miech All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS-IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
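Before the implementation, a shape-level sketch of how the `NetVLAD` module defined below is typically driven. The batch size, descriptor count and dimensions are illustrative only and are not taken from any config in this repository.

```python
import torch as th

# B=2 clips, each with N=7 local descriptors of dimension D=128, aggregated
# over K=16 visual words into a fixed-size B x (D*K) representation
vlad = NetVLAD(cluster_size=16, feature_size=128, ghost_clusters=1)
feats = th.randn(2, 7, 128)
out = vlad(feats)
print(out.shape)  # torch.Size([2, 2048])
```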
16 | 17 | 18 | import math 19 | import ipdb 20 | import torch.nn as nn 21 | import torch.nn.functional as F 22 | import torch as th 23 | 24 | 25 | class NetVLAD(nn.Module): 26 | def __init__(self, cluster_size, feature_size, ghost_clusters=0, 27 | add_batch_norm=True): 28 | super().__init__() 29 | 30 | self.feature_size = feature_size 31 | self.cluster_size = cluster_size 32 | self.ghost_clusters = ghost_clusters 33 | 34 | init_sc = (1 / math.sqrt(feature_size)) 35 | clusters = cluster_size + ghost_clusters 36 | 37 | # The `clusters` weights are the `(w,b)` in the paper 38 | self.clusters = nn.Parameter(init_sc * th.randn(feature_size, clusters)) 39 | self.batch_norm = nn.BatchNorm1d(clusters) if add_batch_norm else None 40 | # The `clusters2` weights are the visual words `c_k` in the paper 41 | self.clusters2 = nn.Parameter(init_sc * th.randn(1, feature_size, cluster_size)) 42 | self.out_dim = self.cluster_size * feature_size 43 | 44 | def forward(self, x, mask=None): 45 | """Aggregates feature maps into a fixed size representation. In the following 46 | notation, B = batch_size, N = num_features, K = num_clusters, D = feature_size. 47 | 48 | Args: 49 | x (th.Tensor): B x N x D 50 | 51 | Returns: 52 | (th.Tensor): B x DK 53 | """ 54 | self.sanity_checks(x) 55 | max_sample = x.size()[1] 56 | x = x.view(-1, self.feature_size) # B x N x D -> BN x D 57 | 58 | if x.device != self.clusters.device: 59 | msg = f"x.device {x.device} != cluster.device {self.clusters.device}" 60 | raise ValueError(msg) 61 | 62 | assignment = th.matmul(x, self.clusters) # (BN x D) x (D x (K+G)) -> BN x (K+G) 63 | 64 | if self.batch_norm: 65 | assignment = self.batch_norm(assignment) 66 | 67 | assignment = F.softmax(assignment, dim=1) # BN x (K+G) -> BN x (K+G) 68 | # remove ghost assigments 69 | assignment = assignment[:, :self.cluster_size] 70 | assignment = assignment.view(-1, max_sample, self.cluster_size) # -> B x N x K 71 | a_sum = th.sum(assignment, dim=1, keepdim=True) # B x N x K -> B x 1 x K 72 | a = a_sum * self.clusters2 73 | 74 | assignment = assignment.transpose(1, 2) # B x N x K -> B x K x N 75 | 76 | x = x.view(-1, max_sample, self.feature_size) # BN x D -> B x N x D 77 | vlad = th.matmul(assignment, x) # (B x K x N) x (B x N x D) -> B x K x D 78 | vlad = vlad.transpose(1, 2) # -> B x D x K 79 | vlad = vlad - a 80 | 81 | # L2 intra norm 82 | vlad = F.normalize(vlad) 83 | 84 | # flattening + L2 norm 85 | vlad = vlad.reshape(-1, self.cluster_size * self.feature_size) # -> B x DK 86 | vlad = F.normalize(vlad) 87 | return vlad # B x DK 88 | 89 | def sanity_checks(self, x): 90 | """Catch any nans in the inputs/clusters""" 91 | if th.isnan(th.sum(x)): 92 | print("nan inputs") 93 | ipdb.set_trace() 94 | if th.isnan(self.clusters[0][0]): 95 | print("nan clusters") 96 | ipdb.set_trace() 97 | -------------------------------------------------------------------------------- /utils/gen_ablations_for_dataset.py: -------------------------------------------------------------------------------- 1 | """Generate a set of ablations for each dataset, using the config structure of the 2 | MSRVTT experiments. 3 | 4 | ipy utils/gen_ablations_for_dataset.py -- --refresh --dest_dataset didemo \ 5 | --update_ablation_list 1 6 | 7 | """ 8 | import json 9 | import argparse 10 | from pathlib import Path 11 | 12 | 13 | def handle_moee_config(config): 14 | """For the official ablations on MSRVTT, we provide MoEE with the same hyperparam 15 | budget as CE and run a search to find the best hyperparams. 
For the unofficial 16 | ablations, we use the same padding/VLAD settings as CE. 17 | """ 18 | config = { 19 | "inherit_from": config["inherit_from"], 20 | "arch": {"type": "CENet", "args": {"use_ce": ""}}, 21 | } 22 | return config 23 | 24 | 25 | def remove_audio_streams(config, dest_path): 26 | """Prune audio-based features from the config and dest_path name (necessary for 27 | datasets like MSVD which do not possess sound.) If the audio feature was the control 28 | variable in the experiment, we return False for the dest_path, such that the ablation 29 | is removed altogether. 30 | """ 31 | audio_tags = ["audio", "speech"] 32 | for audio_tag in audio_tags: 33 | if f"-{audio_tag}." in dest_path: 34 | return config, False 35 | 36 | dest_path = dest_path.replace(f"-{audio_tag}", "") 37 | if "experts" in config and "modalities" in config["experts"]: 38 | if audio_tag in config["experts"]["modalities"]: 39 | config["experts"]["modalities"].remove(audio_tag) 40 | return config, dest_path 41 | 42 | 43 | def main(): 44 | parser = argparse.ArgumentParser() 45 | parser.add_argument('--refresh', action="store_true") 46 | parser.add_argument('--update_ablation_list', type=int, default=1) 47 | parser.add_argument('--src_dataset', default="msrvtt") 48 | parser.add_argument('--dest_dataset', default="lsmdc") 49 | parser.add_argument('--exp_list', default="slurm/msrvtt-ablations.txt") 50 | args = parser.parse_args() 51 | 52 | with open(args.exp_list, "r") as f: 53 | exps = [x for x in f.read().splitlines() if x] 54 | 55 | print(f"Found {len(exps)} experiments in {args.exp_list}") 56 | dest_exp_path = Path(args.exp_list.replace("msrvtt", args.dest_dataset)) 57 | if dest_exp_path.exists() and not args.refresh: 58 | print(f"experiment list found at {dest_exp_path}, skipping...") 59 | return 60 | 61 | output_rows = [] 62 | exclude = ["miech", "jsfusion"] 63 | for row in exps: 64 | flag, config_path, seed_flag, seed_opts = row.split() 65 | if any([x in config_path for x in exclude]): 66 | continue 67 | with open(config_path, "r") as f: 68 | config = json.load(f) 69 | if Path(config_path).stem == "train-full-moee": 70 | config = handle_moee_config(config) 71 | dest_path = config_path.replace(args.src_dataset, args.dest_dataset) 72 | config["inherit_from"] = config["inherit_from"].replace(args.src_dataset, 73 | args.dest_dataset) 74 | if args.dest_dataset == "msvd": 75 | config, dest_path = remove_audio_streams(config, dest_path) 76 | if not dest_path: 77 | continue 78 | 79 | print(f"writing config to {dest_path}") 80 | with open(dest_path, "w") as f: 81 | json.dump(config, f, indent=4, sort_keys=False) 82 | output_rows.append([flag, dest_path, seed_flag, seed_opts]) 83 | 84 | if args.update_ablation_list: 85 | print(f"Writing new experiment list to {dest_exp_path}") 86 | output_rows = [" ".join(x) for x in output_rows] 87 | with open(dest_exp_path, "w") as f: 88 | for row in sorted(list(set(output_rows))): 89 | f.write(f"{row}\n") 90 | 91 | 92 | 93 | if __name__ == "__main__": 94 | main() 95 | -------------------------------------------------------------------------------- /data_loader/QuerYDSegments_dataset.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from typing import Dict, Union, List 3 | from pathlib import Path 4 | 5 | from zsvision.zs_utils import memcache, concat_features 6 | from typeguard import typechecked 7 | 8 | from utils import memory_summary 9 | from base.base_dataset import BaseDataset 10 | 11 | 12 | class 
QuerYDSegments(BaseDataset): 13 | 14 | @staticmethod 15 | @typechecked 16 | def dataset_paths(training_file=None) -> Dict[str, Union[str, List[str], Path, Dict]]: 17 | subset_paths = {} 18 | test_splits = { 19 | "val": "val_list.txt", 20 | "test": "test_list.txt", 21 | } 22 | for split_name, fname in test_splits.items(): 23 | subset_paths[split_name] = {"train": "train_list.txt", "val": fname} 24 | 25 | feature_names = BaseDataset.common_feat_names() 26 | feature_names.append("audio.vggish.0") 27 | text_feat_paths = BaseDataset.common_text_feat_paths() 28 | text_feat_paths = {key: Path("text_embeddings") / fname 29 | for key, fname in text_feat_paths.items()} 30 | challenge_text_feat_paths = {key: f"text_embeddings/{key}.pkl" 31 | for key in text_feat_paths} 32 | custom_paths = { 33 | "audio": ["aggregated_audio/vggish-raw.hickle"], 34 | } 35 | feature_info = { 36 | "custom_paths": custom_paths, 37 | "feature_names": feature_names, 38 | "subset_list_paths": subset_paths, 39 | "text_feat_paths": text_feat_paths, 40 | "challenge_text_feat_paths": challenge_text_feat_paths, 41 | "raw_captions_path": "structured-symlinks/split_raw_captions_filtered.pkl", 42 | } 43 | return feature_info 44 | 45 | def load_features(self): 46 | root_feat = self.root_feat 47 | # import pdb; pdb.set_trace() 48 | feat_names = {key: self.visual_feat_paths(key) for key in 49 | self.paths["feature_names"]} 50 | feat_names.update(self.paths["custom_paths"]) 51 | features = {} 52 | for expert, rel_names in feat_names.items(): 53 | if expert not in self.ordered_experts: 54 | continue 55 | feat_paths = tuple([Path(root_feat) / rel_name for rel_name in rel_names]) 56 | if len(feat_paths) == 1: 57 | features[expert] = memcache(feat_paths[0]) 58 | else: 59 | # support multiple forms of feature (e.g. max and avg pooling). 
For 60 | # now, we only support direct concatenation 61 | msg = f"{expert}: Only direct concatenation of muliple feats is possible" 62 | print(f"Concatenating aggregates for {expert}....") 63 | assert self.feat_aggregation[expert]["aggregate"] == "concat", msg 64 | axis = self.feat_aggregation[expert]["aggregate-axis"] 65 | x = concat_features.cache_info() # pylint: disable=no-value-for-parameter 66 | print(f"concat cache info: {x}") 67 | features_ = concat_features(feat_paths, axis=axis) 68 | memory_summary() 69 | 70 | # Make separate feature copies for each split to allow in-place filtering 71 | features[expert] = copy.deepcopy(features_) 72 | 73 | self.features = features 74 | if self.challenge_mode: 75 | self.load_challenge_text_features() 76 | else: 77 | self.raw_captions = memcache(root_feat / self.paths["raw_captions_path"]) 78 | text_feat_path = root_feat / self.paths["text_feat_paths"][self.text_feat] 79 | self.text_features = memcache(text_feat_path) 80 | 81 | 82 | # overload video paths 83 | self.video_path_retrieval = [f"videos/{x}.mp4" 84 | for x in self.partition_lists["val"]] 85 | 86 | def sanity_checks(self): 87 | msg = (f"Expected to have single test caption for QuerYD, since we assume" 88 | f"that the captions are fused (but using {self.num_test_captions})") 89 | assert self.num_test_captions == 1, msg 90 | -------------------------------------------------------------------------------- /exp_to_seed_time.json: -------------------------------------------------------------------------------- 1 | {"audiocaps-train-full-ce-only-audio": [["c0b5bc86", "seed-0", "2021-06-10_15-34-48"], ["c0b5bc86", "seed-1", "2021-06-10_15-36-14"], ["c0b5bc86", "seed-2", "2021-06-10_15-36-15"]], "audiocaps-train-full-ce-only-inst": [["5ee05383", "seed-0", "2021-06-10_15-32-29"], ["5ee05383", "seed-1", "2021-06-10_15-33-37"], ["5ee05383", "seed-2", "2021-06-10_15-33-51"]], "audiocaps-train-full-ce-only-r2p1d": [["88d3ab9e", "seed-0", "2021-06-10_15-30-03"], ["88d3ab9e", "seed-1", "2021-06-10_15-31-11"], ["88d3ab9e", "seed-2", "2021-06-10_15-31-32"]], "audiocaps-train-full-ce-only-scene": [["74d71d8b", "seed-0", "2021-06-10_15-27-11"], ["74d71d8b", "seed-1", "2021-06-10_15-27-40"], ["74d71d8b", "seed-2", "2021-06-10_15-29-16"]], "audiocaps-train-full-ce-r2p1d-inst": [["cf11d710", "seed-0", "2021-06-10_15-23-04"], ["cf11d710", "seed-1", "2021-06-10_15-23-25"], ["cf11d710", "seed-2", "2021-06-10_15-26-45"]], "audiocaps-train-full-ce-r2p1d-inst-vggish": [["74991f95", "seed-0", "2021-06-10_15-06-31"], ["74991f95", "seed-1", "2021-06-10_15-07-40"], ["74991f95", "seed-2", "2021-06-10_15-12-39"]], "audiocaps-train-full-ce-r2p1d-inst-vggish-vggsound": [["b51f941a", "seed-0", "2021-06-10_14-56-45"], ["b51f941a", "seed-1", "2021-06-10_14-57-08"], ["b51f941a", "seed-2", "2021-06-10_14-59-04"]], "audiocaps-train-full-ce-r2p1d-inst-vggsound": [["1b623fdc", "seed-0", "2021-06-10_14-49-00"], ["1b623fdc", "seed-1", "2021-06-10_14-49-00"], ["1b623fdc", "seed-2", "2021-06-10_14-48-59"]], "audiocaps-train-full-ce-scene-inst": [["55c40cc6", "seed-0", "2021-06-10_15-18-50"], ["55c40cc6", "seed-1", "2021-06-10_15-18-51"], ["55c40cc6", "seed-2", "2021-06-10_15-22-00"]], "audiocaps-train-full-ce-scene-r2p1d": [["b2b14107", "seed-0", "2021-06-10_15-13-04"], ["b2b14107", "seed-1", "2021-06-10_15-14-38"], ["b2b14107", "seed-2", "2021-06-10_15-17-36"]], "audiocaps-train-only-vggsound": [["afab0e0c", "seed-0", "2021-06-16_01-21-37"], ["afab0e0c", "seed-1", "2021-06-16_01-28-08"], ["afab0e0c", "seed-2", 
"2021-06-16_01-33-51"]], "audiocaps-train-vggish-vggsound": [["7e2eda12", "seed-0", "2021-06-09_17-06-26"], ["7e2eda12", "seed-1", "2021-06-09_17-15-12"], ["7e2eda12", "seed-2", "2021-06-09_17-24-01"]], "audiocaps-train-vggish-vggsound-moee": [["f66525f8", "seed-0", "2021-06-09_16-44-00"], ["f66525f8", "seed-1", "2021-06-09_16-51-31"], ["f66525f8", "seed-2", "2021-06-09_16-59-01"]], "audiocaps-train-vggish-vggsound-train_list_10": [["68747f8c", "seed-0", "2021-06-10_11-02-21"], ["68747f8c", "seed-1", "2021-06-10_11-07-21"], ["68747f8c", "seed-2", "2021-06-10_11-10-54"]], "audiocaps-train-vggish-vggsound-train_list_25": [["0151ad7f", "seed-0", "2021-06-10_11-14-25"], ["0151ad7f", "seed-1", "2021-06-10_11-18-48"], ["0151ad7f", "seed-2", "2021-06-10_11-23-12"]], "audiocaps-train-vggish-vggsound-train_list_50": [["4aeeaa0d", "seed-0", "2021-06-10_11-27-36"], ["4aeeaa0d", "seed-1", "2021-06-10_11-33-28"], ["4aeeaa0d", "seed-2", "2021-06-10_11-39-36"]], "audiocaps-train-vggish-vggsound-train_list_75": [["3a8d0584", "seed-0", "2021-06-10_11-45-26"], ["3a8d0584", "seed-1", "2021-06-10_11-52-47"], ["3a8d0584", "seed-2", "2021-06-10_12-00-02"]], "clotho-train-full-ce-only-audio": [["4f58ef05", "seed-0", "2021-06-10_15-38-28"], ["4f58ef05", "seed-1", "2021-06-10_15-39-02"], ["4f58ef05", "seed-2", "2021-06-10_15-39-33"]], "clotho-train-vggish-vggsound": [["dec0c820", "seed-0", "2021-06-10_14-45-51"], ["dec0c820", "seed-1", "2021-06-10_14-45-59"], ["dec0c820", "seed-2", "2021-06-10_14-46-07"]], "clotho-train-vggish-vggsound-moee": [["fafa3e91", "seed-0", "2021-06-10_14-44-51"], ["fafa3e91", "seed-1", "2021-06-10_14-44-51"], ["fafa3e91", "seed-2", "2021-06-10_14-44-51"]], "clotho-train-vggish-vggsound-finetuned": [["74560a6c", "seed-0", "2021-06-10_16-38-40"], ["74560a6c", "seed-1", "2021-06-10_16-39-29"], ["74560a6c", "seed-2", "2021-06-10_16-47-02"]], "clotho-train-vggish-vggsound-moee-finetuned": [["5395fa47", "seed-0", "2021-06-10_16-36-13"], ["5395fa47", "seed-1", "2021-06-10_16-37-11"], ["5395fa47", "seed-2", "2021-06-10_16-37-55"]], "querydsegments-train-full-ce-only-audio": [["70111434", "seed-0", "2021-06-10_14-33-03"], ["70111434", "seed-1", "2021-06-10_14-36-34"], ["70111434", "seed-2", "2021-06-10_14-40-01"]], "activity-net-train-full-ce-audio-only": [["f3ebaada", "seed-0", "2021-07-22_12-44-19"], ["f3ebaada", "seed-1", "2021-07-22_12-46-48"], ["f3ebaada", "seed-2", "2021-07-22_12-49-19"]]} 2 | -------------------------------------------------------------------------------- /data_loader/QuerYD_dataset.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import itertools 3 | from pathlib import Path 4 | from typing import Dict, List, Union 5 | 6 | from base.base_dataset import BaseDataset 7 | from typeguard import typechecked 8 | from utils import memory_summary 9 | from zsvision.zs_utils import concat_features, memcache 10 | 11 | 12 | class QuerYD(BaseDataset): 13 | 14 | @staticmethod 15 | @typechecked 16 | def dataset_paths(training_file=None) -> Dict[str, Union[str, List[str], Path, Dict]]: 17 | subset_paths = {} 18 | test_splits = { 19 | "val": "val_list.txt", 20 | "test": "test_list.txt", 21 | } 22 | for split_name, fname in test_splits.items(): 23 | subset_paths[split_name] = {"train": "train_list.txt", "val": fname} 24 | 25 | feature_names = BaseDataset.common_feat_names() 26 | feature_names.append("audio.vggish.0") 27 | text_feat_paths = BaseDataset.common_text_feat_paths() 28 | text_feat_paths = {key: Path("text_embeddings") / fname 
29 | for key, fname in text_feat_paths.items()} 30 | challenge_text_feat_paths = {key: f"text_embeddings/{key}.pkl" 31 | for key in text_feat_paths} 32 | custom_paths = { 33 | "audio": ["aggregated_audio/vggish-raw.hickle"], 34 | } 35 | feature_info = { 36 | "custom_paths": custom_paths, 37 | "feature_names": feature_names, 38 | "subset_list_paths": subset_paths, 39 | "text_feat_paths": text_feat_paths, 40 | "challenge_text_feat_paths": challenge_text_feat_paths, 41 | "raw_captions_path": "structured-symlinks/raw_captions_combined_filtered.pkl", 42 | } 43 | return feature_info 44 | 45 | def load_features(self): 46 | root_feat = self.root_feat 47 | feat_names = {key: self.visual_feat_paths(key) for key in 48 | self.paths["feature_names"]} 49 | feat_names.update(self.paths["custom_paths"]) 50 | features = {} 51 | for expert, rel_names in feat_names.items(): 52 | if expert not in self.ordered_experts: 53 | continue 54 | feat_paths = tuple([Path(root_feat) / rel_name for rel_name in rel_names]) 55 | if len(feat_paths) == 1: 56 | features[expert] = memcache(feat_paths[0]) 57 | else: 58 | # support multiple forms of feature (e.g. max and avg pooling). For 59 | # now, we only support direct concatenation 60 | msg = f"{expert}: Only direct concatenation of muliple feats is possible" 61 | print(f"Concatenating aggregates for {expert}....") 62 | assert self.feat_aggregation[expert]["aggregate"] == "concat", msg 63 | axis = self.feat_aggregation[expert]["aggregate-axis"] 64 | x = concat_features.cache_info() # pylint: disable=no-value-for-parameter 65 | print(f"concat cache info: {x}") 66 | features_ = concat_features(feat_paths, axis=axis) 67 | memory_summary() 68 | 69 | # Make separate feature copies for each split to allow in-place filtering 70 | features[expert] = copy.deepcopy(features_) 71 | 72 | self.features = features 73 | if self.challenge_mode: 74 | self.load_challenge_text_features() 75 | else: 76 | self.raw_captions = memcache(root_feat / self.paths["raw_captions_path"]) 77 | # keys = list(raw_captions.keys()) 78 | # raw_captions_fused = {} 79 | # for key in keys: 80 | # raw_captions_fused[key] = list(itertools.chain.from_iterable(raw_captions[key])) 81 | # self.raw_captions = raw_captions_fused 82 | text_feat_path = root_feat / self.paths["text_feat_paths"][self.text_feat] 83 | self.text_features = memcache(text_feat_path) 84 | 85 | # overload video paths, which are structured differently for YouCook2 86 | self.video_path_retrieval = [f"videos/{x}.mp4" 87 | for x in self.partition_lists["val"]] 88 | 89 | def sanity_checks(self): 90 | msg = (f"Expected to have single test caption for QuerYD, since we assume" 91 | f"that the captions are fused (but using {self.num_test_captions})") 92 | assert self.num_test_captions == 1, msg 93 | -------------------------------------------------------------------------------- /utils/datastructures.py: -------------------------------------------------------------------------------- 1 | """This module defines a datastructure for storing pre-computed features for datasets. 2 | 3 | It provides key-value access, but is backed by a monolithic array to prevent memory 4 | fragmentation. This can be useful for loading large feature sets into memory (e.g. 5 | those that are > 100 GiB) in a manner that minimises OOM issues. 
6 | """ 7 | 8 | import pickle 9 | import argparse 10 | import numpy as np 11 | import humanize 12 | 13 | 14 | class ExpertStore: 15 | 16 | def __init__(self, keylist, dim, dtype=np.float16): 17 | self.keys = keylist 18 | self.dim = dim 19 | self.store_dtype = dtype 20 | self.store = np.zeros((len(keylist), dim), dtype=dtype) 21 | self.keymap = {} 22 | self.missing = set() 23 | self.rebuild_keymap() 24 | 25 | def __setitem__(self, key, value): 26 | idx = self.keymap[key] 27 | if isinstance(value, np.ndarray): 28 | # non-nan values must be vectors of the appropriate size 29 | assert value.size == self.dim, f"cannot set value with size {value.size}" 30 | else: 31 | assert np.isnan(value) 32 | self.store[idx] = value 33 | 34 | def rebuild_keymap(self): 35 | for idx, key in enumerate(self.keys): 36 | self.keymap[key] = idx 37 | 38 | def filter_keys(self, keys, tag, allow_mismatch="", exceptions=None): 39 | keyset = set(keys) 40 | missing = keyset - set(self.keys) 41 | if exceptions is not None and missing: 42 | excluded = missing.intersection(set(exceptions)) 43 | print(f"filter_keys >>> applying exceptions for {len(excluded)} videos") 44 | missing = missing - excluded 45 | print(f"filter_keys >>> {tag}") 46 | if allow_mismatch and missing: 47 | print(f"Key mismatch (missing {len(missing)}) {allow_mismatch}") 48 | else: 49 | samples = list(missing)[:3] 50 | msg = f"cannot apply filter since missing {len(missing)} keys e.g. {samples}" 51 | assert not missing, msg 52 | keep = np.array([x in keyset for x in self.keys]) 53 | filtered_keys = np.array(self.keys)[keep] 54 | print(f"Filtering from {len(self.keys)} keys to {len(filtered_keys)} keys") 55 | self.keys = filtered_keys 56 | self.store = self.store[keep] 57 | self.rebuild_keymap() 58 | 59 | def __getitem__(self, key): 60 | return self.store[self.keymap[key]] 61 | 62 | def __len__(self): 63 | return len(self.keys) 64 | 65 | def __repr__(self): 66 | keep_samples = 3 67 | samples = list(self.keymap.items())[:keep_samples] 68 | sample_str = "\n".join([f"{key}: {val}" for key, val in samples]) 69 | summary = ( 70 | f"ExpertStore object with {len(self.keys)} features (dim: {self.dim})" 71 | f" (storage is using {humanize.naturalsize(self.store.nbytes)})" 72 | f"\nFirst {keep_samples} elements of keymap: \n{sample_str}" 73 | ) 74 | return summary 75 | 76 | 77 | def gen_dict_store(keylist, dim): 78 | store = dict() 79 | for key in keylist: 80 | store[key] = np.random.rand(1, dim).astype(np.float16) 81 | return store 82 | 83 | 84 | def main(): 85 | parser = argparse.ArgumentParser() 86 | parser.add_argument("--dataset", default="moments-in-time") 87 | parser.add_argument("--dim", type=int, default=2048) 88 | args = parser.parse_args() 89 | 90 | from config import get_data_paths 91 | data_paths = get_data_paths(args.dataset) 92 | relevant_path = data_paths["relevant-id-list"] 93 | with open(relevant_path, "r") as f: 94 | relevant_ids = sorted(f.read().splitlines()) 95 | 96 | for store_name in "dict", "np", "expert_store": 97 | if store_name == "dict": 98 | store = gen_dict_store(keylist=relevant_ids, dim=args.dim) 99 | elif store_name == "np": 100 | store = np.random.rand(len(relevant_ids), args.dim).astype(np.float16) 101 | elif store_name == "expert_store": 102 | store = ExpertStore(keylist=relevant_ids, dim=args.dim) 103 | print(store) 104 | serialised = pickle.dumps(store) 105 | print(f"Memory needs for {store_name}: {humanize.naturalsize(len(serialised))}") 106 | 107 | 108 | 109 | if __name__ == "__main__": 110 | main() 111 | 
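As a quick illustration of the `ExpertStore` interface described above (a sketch only: the video ids and the 4-d feature size are hypothetical, and it assumes `ExpertStore` from this module is in scope):

```python
import numpy as np

# three hypothetical video ids with 4-d features, backed by one contiguous array
store = ExpertStore(keylist=["vid_a", "vid_b", "vid_c"], dim=4, dtype=np.float16)
store["vid_b"] = np.arange(4, dtype=np.float16)  # assign a single feature vector
store["vid_a"] = np.nan                          # mark a row as missing

print(store["vid_b"])  # -> [0. 1. 2. 3.]
print(store)           # __repr__ reports the key count and memory footprint
store.filter_keys(["vid_a", "vid_b"], tag="toy-filter")  # keep a subset, in place
print(len(store))      # -> 2
```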
-------------------------------------------------------------------------------- /misc/gen_tar_lists.py: -------------------------------------------------------------------------------- 1 | """ 2 | ipy misc/gen_tar_lists.py -- --dataset YouCook2 3 | """ 4 | import copy 5 | import json 6 | import argparse 7 | from typing import Dict, List, Tuple 8 | from pathlib import Path 9 | 10 | import tqdm 11 | from beartype import beartype 12 | from zsvision.zs_utils import load_json_config 13 | from gen_readme import dataset_paths, model_specs2path 14 | 15 | 16 | @beartype 17 | def generate_tar_lists( 18 | save_dir: Path, 19 | experiments: Dict[str, Tuple[str, str]], 20 | datasets: List[str], 21 | refresh: bool, 22 | ): 23 | all_feat_paths = {} 24 | # import pdb; pdb.set_trace() 25 | for exp_name, (group_id, timestamp) in tqdm.tqdm(experiments.items()): 26 | rel_path = Path(group_id) / "seed-0" / timestamp / "config.json" 27 | config_path = Path(save_dir) / "models" / exp_name / rel_path 28 | try: 29 | with open(config_path, "r") as f: 30 | config = json.load(f) 31 | except FileNotFoundError: 32 | rel_path = Path(group_id) / "seed-1" / timestamp / "config.json" 33 | config_path = Path(save_dir) / "models" / exp_name / rel_path 34 | with open(config_path, "r") as f: 35 | config = json.load(f) 36 | 37 | feat_aggregation = config["data_loader"]["args"]["feat_aggregation"] 38 | dataset_name = exp_name.split("-train")[0] 39 | if dataset_name not in [x.lower() for x in datasets]: 40 | continue 41 | if dataset_name not in all_feat_paths: 42 | all_feat_paths[dataset_name] = set() 43 | split_names = [config["data_loader"]["args"]["split_name"]] 44 | if "eval_settings" in config and config["eval_settings"]: 45 | test_split = config["eval_settings"]["data_loader"]["args"]["split_name"] 46 | split_names.append(test_split) 47 | keep = set(config["experts"]["modalities"]) 48 | text_feat = config["experts"]["text_feat"] 49 | root_feat, paths = dataset_paths(dataset_name) 50 | modern_feat_agg = {key: val for key, val in feat_aggregation.items() 51 | if key in paths["feature_names"]} 52 | feat_paths = model_specs2path(modern_feat_agg, keep) 53 | all_feat_paths[dataset_name].update({root_feat / x for x in feat_paths}) 54 | for key, feat_list in paths["custom_paths"].items(): 55 | for feat_path in feat_list: 56 | all_feat_paths[dataset_name].add(root_feat / feat_path) 57 | # import pdb; pdb.set_trace() 58 | text_paths = [root_feat / paths["text_feat_paths"][text_feat]] 59 | all_feat_paths[dataset_name].update(set(text_paths)) 60 | all_feat_paths[dataset_name].add(root_feat / paths["raw_captions_path"]) 61 | if "dict_youtube_mapping_path" in paths: 62 | all_feat_paths[dataset_name].add( 63 | root_feat / paths["dict_youtube_mapping_path"]) 64 | for split_name in split_names: 65 | split_paths = set(root_feat / x for x in 66 | paths["subset_list_paths"][split_name].values()) 67 | all_feat_paths[dataset_name].update(split_paths) 68 | 69 | for dataset_name, paths in all_feat_paths.items(): 70 | tar_include_list = Path("misc") / "datasets" / dataset_name / "tar_include.txt" 71 | tar_include_list.parent.mkdir(exist_ok=True, parents=True) 72 | if tar_include_list.exists() and not refresh: 73 | print(f"Found existing tar include list at {tar_include_list}, skipping...") 74 | continue 75 | with open(tar_include_list, "w") as f: 76 | for path in sorted(paths): 77 | if "aggregated_speech" not in str(path): 78 | print(f"Writing {path} to {tar_include_list}") 79 | f.write(f"{path}\n") 80 | 81 | 82 | def main(): 83 | parser = 
argparse.ArgumentParser() 84 | parser.add_argument("--save_dir", default="data/saved", type=Path) 85 | parser.add_argument("--refresh", action="store_true") 86 | parser.add_argument("--experiments_path", default="misc/experiments.json") 87 | parser.add_argument("--target", default="main") 88 | parser.add_argument("--data_dir", type=Path, default="data") 89 | parser.add_argument("--challenge_phase", default="public_server_val", 90 | choices=["public_server_val", "public_server_test"]) 91 | parser.add_argument("--datasets", nargs="+", 92 | default=["activity-net", 93 | "QuerYD", "QuerYDSegments"]) 94 | args = parser.parse_args() 95 | 96 | with open(args.experiments_path, "r") as f: 97 | experiments = json.load(f) 98 | 99 | generate_tar_lists( 100 | save_dir=args.save_dir, 101 | datasets=args.datasets, 102 | experiments=experiments, 103 | refresh=args.refresh, 104 | ) 105 | 106 | 107 | if __name__ == "__main__": 108 | main() 109 | -------------------------------------------------------------------------------- /data_loader/CLOTHO_dataset.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import itertools 3 | from pathlib import Path 4 | from typing import Dict, List, Union 5 | 6 | from base.base_dataset import BaseDataset 7 | from typeguard import typechecked 8 | from utils import memory_summary 9 | from zsvision.zs_utils import concat_features, memcache 10 | 11 | 12 | class CLOTHO(BaseDataset): 13 | 14 | @staticmethod 15 | @typechecked 16 | def dataset_paths(training_file=None) -> Dict[str, Union[str, List[str], Path, Dict]]: 17 | subset_paths = {} 18 | test_splits = { 19 | "val": "val_list.txt", 20 | "test": "test_list.txt", 21 | } 22 | for split_name, fname in test_splits.items(): 23 | subset_paths[split_name] = {"train": "train_list.txt", "val": fname} 24 | 25 | feature_names = BaseDataset.common_feat_names() 26 | feature_names.append("audio.vggish.0") 27 | text_feat_paths = BaseDataset.common_text_feat_paths() 28 | text_feat_paths = {key: Path("text_embeddings") / fname 29 | for key, fname in text_feat_paths.items()} 30 | challenge_text_feat_paths = {key: f"text_embeddings/{key}.pkl" 31 | for key in text_feat_paths} 32 | custom_paths = { 33 | "audio": ["aggregated_audio/vggish-raw.hickle"], 34 | "pann": ["aggregated_pann/pann-raw.hickle"], 35 | "syncnet": ["aggregated_syncnet/syncnet-raw.hickle"], 36 | "vggsound": ["aggregated_vggsound/vggsound-raw.hickle"], 37 | # "vggsound": ["aggregated_vggsound/vggsound-avg.pickle"], 38 | "speech": ["aggregated_speech/w2v_mean.pkl"] 39 | } 40 | feature_info = { 41 | "custom_paths": custom_paths, 42 | "feature_names": feature_names, 43 | "subset_list_paths": subset_paths, 44 | "text_feat_paths": text_feat_paths, 45 | "challenge_text_feat_paths": challenge_text_feat_paths, 46 | "raw_captions_path": "structured-symlinks/raw-captions.pkl", 47 | } 48 | return feature_info 49 | 50 | def load_features(self): 51 | root_feat = self.root_feat 52 | feat_names = {key: self.visual_feat_paths(key) for key in 53 | self.paths["feature_names"]} 54 | feat_names.update(self.paths["custom_paths"]) 55 | features = {} 56 | for expert, rel_names in feat_names.items(): 57 | if expert not in self.ordered_experts: 58 | continue 59 | feat_paths = tuple([Path(root_feat) / rel_name for rel_name in rel_names]) 60 | if len(feat_paths) == 1: 61 | features[expert] = memcache(feat_paths[0]) 62 | else: 63 | # support multiple forms of feature (e.g. max and avg pooling). 
For 64 | # now, we only support direct concatenation 65 | msg = f"{expert}: Only direct concatenation of muliple feats is possible" 66 | print(f"Concatenating aggregates for {expert}....") 67 | assert self.feat_aggregation[expert]["aggregate"] == "concat", msg 68 | axis = self.feat_aggregation[expert]["aggregate-axis"] 69 | x = concat_features.cache_info() # pylint: disable=no-value-for-parameter 70 | print(f"concat cache info: {x}") 71 | features_ = concat_features(feat_paths, axis=axis) 72 | memory_summary() 73 | 74 | # if expert == "speech": 75 | # features_defaults = defaultdict(lambda: np.zeros((1, 300))) 76 | # features_defaults.update(features_) 77 | # features_ = features_defaults 78 | # Make separate feature copies for each split to allow in-place filtering 79 | features[expert] = copy.deepcopy(features_) 80 | 81 | self.features = features 82 | if self.challenge_mode: 83 | self.load_challenge_text_features() 84 | else: 85 | self.raw_captions = memcache(root_feat / self.paths["raw_captions_path"]) 86 | # keys = list(raw_captions.keys()) 87 | # raw_captions_fused = {} 88 | # for key in keys: 89 | # raw_captions_fused[key] = list(itertools.chain.from_iterable(raw_captions[key])) 90 | # self.raw_captions = raw_captions_fused 91 | text_feat_path = root_feat / self.paths["text_feat_paths"][self.text_feat] 92 | self.text_features = memcache(text_feat_path) 93 | 94 | # overload video paths, which are structured differently for YouCook2 95 | self.video_path_retrieval = [f"videos/{x}.mp4" 96 | for x in self.partition_lists["val"]] 97 | 98 | def sanity_checks(self): 99 | msg = (f"Expected to have single test caption for AudioCaps, since we assume" 100 | f"that the captions are fused (but using {self.num_test_captions})") 101 | if self.fuse_captions is True: 102 | assert self.num_test_captions == 1, msg 103 | -------------------------------------------------------------------------------- /logger/log_parser.py: -------------------------------------------------------------------------------- 1 | import re 2 | import scipy.stats 3 | import logging 4 | import numpy as np 5 | from collections import defaultdict 6 | 7 | 8 | def log_summary(logger, log_path, eval_mode="test_run", fixed_num_epochs=None): 9 | """Extract performace statistics from experiment log files. 10 | 11 | Args: 12 | logger (logger): reference to primary logging instance 13 | log_path (Path): the path to the log file 14 | eval_mode (str): the method use to collect the statistics. Can be one of: 15 | `test_run`, `fixed_num_epochs` or `geometric_mean` 16 | 17 | NOTE: The `eval_mode` argument differs by dataset: for datasets which provide a 18 | validation set, we use validation set performance to complete a single test run. For 19 | datasets where no validation set is available, we aim to match prior work by either 20 | fixing the number of training epochs, or selecting directly from validation set 21 | performance (Details can be found in the supplementary material of the paper.) 
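To make the matching below concrete, a small synthetic sketch of the two row formats this function scans for (the timestamps and scores here are invented):

```python
import re

rows = [
    "2021-04-03 11:48:50 Setting experiment random seed to 0",
    "2021-04-03 12:01:10 val_t2v_metrics_R1: 12.3 val_t2v_metrics_R5: 34.5",
]
seed = re.search(r"Setting experiment random seed to (\d+)$", rows[0]).groups()[0]
tokens = rows[1].split(" ")
r1 = float(tokens[tokens.index("val_t2v_metrics_R1:") + 1])
print(seed, r1)  # -> 0 12.3
```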
22 | """ 23 | with open(str(log_path), "r") as f: 24 | log = f.read().splitlines() 25 | 26 | # keep track of the random seed used for the part of the logfile being processed 27 | current_seed = None 28 | 29 | # Regex tag for finding the seed 30 | seed_tag = "Setting experiment random seed to" 31 | 32 | if eval_mode == "test_run": 33 | subset = "test" 34 | else: 35 | subset = "val" 36 | 37 | for mode in "t2v", "v2t": 38 | logger.info("") 39 | logger.info("----------------------------------------------------") 40 | logger.info(f"[{mode}] loaded log file with {len(log)} lines....") 41 | logger.info("----------------------------------------------------") 42 | 43 | # Search for the following metrics 44 | scores = { 45 | "R1": defaultdict(list), 46 | "R5": defaultdict(list), 47 | "R10": defaultdict(list), 48 | "R50": defaultdict(list), 49 | "MedR": defaultdict(list), 50 | "MeanR": defaultdict(list), 51 | } 52 | 53 | for row in log: 54 | if seed_tag in row: 55 | # Search for the log file entry describing the current random seed 56 | match = re.search(seed_tag + " (\d+)$", row) # NOQA 57 | assert len(match.groups()) == 1, "expected a single regex match" 58 | current_seed = match.groups()[0] 59 | 60 | if f"{subset}_{mode}_metrics" in row: 61 | tokens = row.split(" ") 62 | for key in scores: 63 | tag = f"{subset}_{mode}_metrics_{key}:" 64 | if tag in tokens: 65 | pos = tokens.index(tag) + 1 66 | val = tokens[pos] 67 | val = float(val) 68 | assert current_seed is not None, "failed to determine the seed" 69 | scores[key][current_seed].append(val) 70 | 71 | agg_scores = {"R1": [], "R5": [], "R10": [], "R50": [], "MedR": [], "MeanR": []} 72 | 73 | # compute the best performance for a single epoch (i.e. sharing the same model 74 | # to compute all stats) 75 | geometric_stats = defaultdict(list) 76 | best_epochs = {} 77 | if eval_mode == "geometric_mean": 78 | raise NotImplementedError("Need to fix this for new log format") 79 | consider = ["R1", "R5", "R10"] 80 | seeds = list(scores["R1"].keys()) 81 | for seed in seeds: 82 | for metric, subdict in scores.items(): 83 | if metric in consider: 84 | geometric_stats[seed].append(subdict[seed]) 85 | gms_raw = np.array(geometric_stats[seed]) 86 | geo_means = scipy.stats.mstats.gmean(gms_raw, axis=0) 87 | best_epochs[seed] = np.argmax(geo_means) 88 | 89 | for metric, subdict in scores.items(): 90 | for seed, values in subdict.items(): 91 | if eval_mode == "test_run": 92 | stat = values[0] 93 | elif eval_mode == "fixed_num_epochs": 94 | stat = values[fixed_num_epochs - 1] 95 | else: 96 | raise ValueError(f"unrecognised eval_mode: {eval_mode}") 97 | agg_scores[metric].append(stat) 98 | 99 | if eval_mode == "fixed_num_epochs": 100 | logger.info(f"Reporting stats with fixed training length: {fixed_num_epochs}") 101 | for metric, values in agg_scores.items(): 102 | logger.info(f"{metric}: {np.mean(values):.1f}, {np.std(values, ddof=1):.1f}") 103 | 104 | 105 | if __name__ == "__main__": 106 | sample_path = "data/saved/log/audiocaps-train-vggish-vggsound/2021-04-03_11-48-50/info.log" 107 | logger_ = logging.getLogger("parser") 108 | logging.basicConfig(level=logging.INFO) 109 | log_summary( 110 | logger=logger_, 111 | log_path=sample_path, 112 | eval_mode="fixed_num_epochs", 113 | fixed_num_epochs=9, 114 | ) 115 | -------------------------------------------------------------------------------- /utils/visualizer.py: -------------------------------------------------------------------------------- 1 | """A simple HTML visualizer. 
2 | 3 | It is based on the Cycle-GAN codebase: 4 | https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix 5 | """ 6 | import os 7 | import numpy as np 8 | from pathlib import Path 9 | from . import util, html 10 | 11 | 12 | class Visualizer: 13 | """This class includes several functions that can display/save images. 14 | 15 | It uses a Python library 'visdom' for display, and a Python library 'dominate' 16 | (wrapped in 'HTML') for creating HTML files with images. 17 | """ 18 | 19 | def __init__(self, exp_name, web_dir, src_video_dir, vis_vid_freq, num_samples=50): 20 | """Initialize the Visualizer class 21 | Create an HTML object for saveing HTML filters 22 | """ 23 | self.name = exp_name 24 | self.web_dir = web_dir 25 | self.vis_vid_freq = vis_vid_freq 26 | self.img_dir = os.path.join(self.web_dir, "images") 27 | self.num_samples = num_samples 28 | print(f"create web directory {self.web_dir}...") 29 | util.mkdirs([self.web_dir, self.img_dir]) 30 | src_dir = Path(src_video_dir).absolute() 31 | print(f"symlinking videos from {src_dir}...") 32 | sym_dir = (Path(self.web_dir) / "videos").absolute() 33 | if sym_dir.is_symlink(): 34 | os.remove(sym_dir) 35 | sym_dir.symlink_to(src_dir) 36 | 37 | def visualize_ranking(self, sims, epoch, meta, nested_metrics): 38 | if not (self.vis_vid_freq and epoch % self.vis_vid_freq == 0): 39 | return 40 | 41 | dists = -sims 42 | np.random.seed(0) 43 | sorted_ranks = np.argsort(dists, axis=1) 44 | gt_dists = np.diag(dists) 45 | rankings = [] 46 | vis_top_k = 5 47 | hide_gt = False 48 | # num_indep_samples = 1 49 | # random_seeds = np.arange(num_indep_samples) 50 | sample = np.random.choice(np.arange(dists.shape[0]), size=self.num_samples, 51 | replace=False) 52 | for ii in sample: 53 | ranked_idx = sorted_ranks[ii][:vis_top_k] 54 | gt_captions = meta["raw_captions"][ii] 55 | # if args.sample_single_gt_caption: 56 | # gt_captions = np.random.choice(gt_captions, 1).tolist() 57 | 58 | datum = { 59 | "gt-sim": -gt_dists[ii], 60 | "gt-captions": gt_captions, 61 | "gt-rank": np.where(sorted_ranks[ii] == ii)[0][0], 62 | "gt-path": meta["paths"][ii], 63 | "top-k-sims": -dists[ii][ranked_idx], 64 | "top-k-paths": np.array(meta["paths"])[ranked_idx], 65 | "hide-gt": hide_gt, 66 | } 67 | rankings.append(datum) 68 | self.display_current_results( 69 | rankings, 70 | epoch=epoch, 71 | metrics=nested_metrics["t2v_metrics"], 72 | ) 73 | 74 | def display_current_results(self, rankings, epoch, metrics): 75 | """Display current results on visdom; save current results to an HTML file. 
76 | 77 | Parameters: 78 | rankings (list) - - per-query ranking dicts produced by visualize_ranking 79 | epoch (int) - - the current epoch 80 | metrics (dict) - - retrieval metrics (R1/R5/R10/MedR) shown in the page header 81 | """ 82 | if not Path(self.web_dir).exists(): 83 | Path(self.web_dir).mkdir(exist_ok=True, parents=True) 84 | print(f"updating webpage at {self.web_dir}") 85 | title = f"Experiment name = {self.name}" 86 | refresh = True 87 | if not refresh: 88 | print("DISABLING WEB PAGE REFRESH") 89 | webpage = html.HTML(web_dir=self.web_dir, title=title, refresh=refresh) 90 | 91 | msg = f"epoch [{epoch}] - {self.name}" 92 | webpage.add_header(msg) 93 | msg = (f"R1: {metrics['R1']:.1f}, " 94 | f"R5: {metrics['R5']:.1f}, " 95 | f"R10: {metrics['R10']:.1f}, " 96 | f"MedR: {metrics['MedR']}") 97 | webpage.add_header(msg) 98 | print(f"Top {len(rankings[0])} retrieved videos at epoch: {epoch}") 99 | 100 | for ranking in rankings: 101 | vids, txts, links = [], [], [] 102 | gt_vid_path = ranking["gt-path"] 103 | gt_captions = [" ".join(x) for x in ranking["gt-captions"]] 104 | gt_captions = "<br>".join(gt_captions) 105 | if ranking["hide-gt"]: 106 | txts.append(gt_captions) 107 | links.append("hidden") 108 | vids.append("hidden") 109 | else: 110 | txt = (f"{gt_captions}<br>
Rank: {ranking['gt-rank']}, " 111 | f"Sim: {ranking['gt-sim']:.3f} [{Path(ranking['gt-path']).stem}]") 112 | txts.append(txt) 113 | links.append(gt_vid_path) 114 | vids.append(gt_vid_path) 115 | 116 | for idx, (vid_path, sim) in enumerate(zip(ranking["top-k-paths"], 117 | ranking["top-k-sims"])): 118 | vid_path = Path(vid_path) 119 | if ranking["hide-gt"]: 120 | txt = f"choice: {idx}" 121 | else: 122 | txt = f"Rank: {idx}, Sim: {sim:.3f}, [{Path(vid_path).stem}]" 123 | txts.append(txt) 124 | vids.append(vid_path) 125 | links.append(vid_path) 126 | webpage.add_videos(vids, txts, links, width=200) 127 | print(f"added {len(vids)} videos") 128 | webpage.save() 129 | -------------------------------------------------------------------------------- /data_loader/ActivityNet_dataset.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from typing import Dict, Union, List 3 | from pathlib import Path 4 | 5 | from zsvision.zs_utils import memcache, concat_features 6 | from typeguard import typechecked 7 | 8 | from utils import memory_summary 9 | from base.base_dataset import BaseDataset 10 | 11 | 12 | class ActivityNet(BaseDataset): 13 | 14 | @staticmethod 15 | @typechecked 16 | def dataset_paths(training_file=None) -> Dict[str, Union[str, List[str], Path, Dict]]: 17 | subset_paths = {} 18 | test_splits = { 19 | "val1": "val_1_list.txt", 20 | "val": "val_list.txt", 21 | "public_server_val": "public_server_val.txt", 22 | "public_server_test": "public_server_test.txt", 23 | } 24 | for split_name, fname in test_splits.items(): 25 | if training_file is None: 26 | subset_paths[split_name] = {"train": "train_list.txt", "val": fname} 27 | else: 28 | subset_paths[split_name] = {"train": training_file, "val": fname} 29 | 30 | 31 | feature_names = BaseDataset.common_feat_names() 32 | custom_paths = { 33 | "audio": ["aggregated_audio/vggish-audio-raw.pickle"], 34 | "speech": ["aggregated_speech/goog_w2v-speech-raw.pickle"], 35 | "ocr": ["aggregated_ocr_feats/ocr-w2v.pkl"], 36 | "face": ["aggregated_facefeats_25fps_256px_stride1/face-avg.pickle"], 37 | } 38 | text_feat_paths = BaseDataset.common_text_feat_paths() 39 | text_feat_dir = Path("aggregated_text_feats") 40 | 41 | text_feat_paths = {key: text_feat_dir / fname 42 | for key, fname in text_feat_paths.items()} 43 | challenge_text_feat_paths = {} 44 | # include non-standard text features 45 | for text_feat in ("openai", ): 46 | text_feat_names = {key: f"{text_feat}-{key}" 47 | for key in {"train", "val1"}} 48 | text_feat_paths[text_feat] = {key: f"aggregated_text_feats/{val}.pkl" 49 | for key, val in text_feat_names.items()} 50 | challenge_text_feat_paths[text_feat] = \ 51 | f"aggregated_text_feats/{text_feat}.pkl" 52 | feature_info = { 53 | "custom_paths": custom_paths, 54 | "feature_names": feature_names, 55 | "subset_list_paths": subset_paths, 56 | "text_feat_paths": text_feat_paths, 57 | "challenge_text_feat_paths": challenge_text_feat_paths, 58 | "raw_captions_path": "raw-captions-train-val_1.pkl", 59 | } 60 | return feature_info 61 | 62 | def load_features(self): 63 | root_feat = self.root_feat 64 | if self.distil_params is not None: 65 | self.distil_features = {} 66 | d_base_path = self.distil_params['base_path'] 67 | 68 | teachers = list(map(lambda x: root_feat / Path(d_base_path + x), self.distil_params['teachers'])) 69 | 70 | for i, f_name in enumerate(teachers): 71 | self.distil_features[i] = memcache(f_name) 72 | 73 | feat_names = {key: self.visual_feat_paths(key) for key in 74 | 
self.paths["feature_names"]} 75 | feat_names.update(self.paths["custom_paths"]) 76 | features = {} 77 | for expert, rel_names in feat_names.items(): 78 | if expert not in self.ordered_experts: 79 | continue 80 | feat_paths = tuple([Path(root_feat) / rel_name for rel_name in rel_names]) 81 | if len(feat_paths) == 1: 82 | features[expert] = memcache(feat_paths[0]) 83 | else: 84 | # support multiple forms of feature (e.g. max and avg pooling). For 85 | # now, we only support direct concatenation 86 | msg = f"{expert}: Only direct concatenation of muliple feats is possible" 87 | print(f"Concatenating aggregates for {expert}....") 88 | assert self.feat_aggregation[expert]["aggregate"] == "concat", msg 89 | axis = self.feat_aggregation[expert]["aggregate-axis"] 90 | x = concat_features.cache_info() # pylint: disable=no-value-for-parameter 91 | print(f"concat cache info: {x}") 92 | features_ = concat_features(feat_paths, axis=axis) 93 | memory_summary() 94 | 95 | # Make separate feature copies for each split to allow in-place filtering 96 | features[expert] = copy.deepcopy(features_) 97 | 98 | self.features = features 99 | if self.challenge_mode: 100 | self.load_challenge_text_features() 101 | else: 102 | text_feat_paths = self.paths["text_feat_paths"][self.text_feat] 103 | if isinstance(text_feat_paths, dict): 104 | text_features = memcache(root_feat / text_feat_paths["train"]) 105 | text_features.update(memcache( 106 | root_feat / text_feat_paths[self.split_name])) 107 | elif isinstance(text_feat_paths, (Path, str)): 108 | text_features = memcache(root_feat / text_feat_paths) 109 | else: 110 | raise TypeError(f"Unexpected type {type(text_feat_paths)}") 111 | self.text_features = text_features 112 | self.raw_captions = memcache(root_feat / self.paths["raw_captions_path"]) 113 | 114 | def sanity_checks(self): 115 | msg = (f"Expected to have single test caption for ANet, since we assume" 116 | f"that the captions are fused (but using {self.num_test_captions})") 117 | assert self.num_test_captions == 1, msg 118 | -------------------------------------------------------------------------------- /eval.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import logging 4 | import os 5 | import subprocess 6 | import tqdm 7 | import wget 8 | from collections import defaultdict 9 | from datetime import datetime 10 | from pathlib import Path 11 | 12 | import numpy as np 13 | import torch 14 | import random 15 | 16 | def extracting_log_info(log_files, experiment, logging): 17 | metrics_t2v = defaultdict(list) 18 | metrics_v2t = defaultdict(list) 19 | 20 | for file_name in log_files: 21 | output_string = f"{experiment}:\n" 22 | with open(Path("logs_eval") / file_name, 'r') as f: 23 | content_lines = f.read().splitlines() 24 | content_lines = content_lines[-14:] 25 | for line in content_lines: 26 | if 't2v' in line: 27 | metric_entry = line.split('test_t2v_metrics_')[1].split(':')[0] 28 | metrics_t2v[metric_entry].append(float(line.split('test_t2v_metrics_')[1].split(':')[1])) 29 | elif 'v2t' in line: 30 | metric_entry = line.split('test_v2t_metrics_')[1].split(':')[0] 31 | metrics_v2t[metric_entry].append(float(line.split('test_v2t_metrics_')[1].split(':')[1])) 32 | keys = list(metrics_t2v.keys()) 33 | 34 | for key in keys: 35 | output_string += f"{key}_t2v: {np.mean(metrics_t2v[key]):.1f}, {np.std(metrics_t2v[key], ddof=1):.1f}\n" 36 | for key in keys: 37 | output_string += f"{key}_v2t: {np.mean(metrics_v2t[key]):.1f}, 
{np.std(metrics_v2t[key], ddof=1):.1f}\n" 38 | logging.info(output_string) 39 | with open(Path("logs_eval") / f"{experiment}_summary.txt", 'w') as f: 40 | f.write(output_string) 41 | 42 | def run_exp(experiments, logging): 43 | for experiment in experiments: 44 | logging.info(f"Now running {experiment}") 45 | run_one_exp(experiment, experiments, logging) 46 | 47 | 48 | def download_configs(experiment, trained_model_path, group_id, seed, timestamp): 49 | new_folder = str(trained_model_path).split('/trained_model.pth')[0] 50 | url_config = f"http://www.robots.ox.ac.uk/~vgg/research/collaborative-experts/data/models/{experiment}/{group_id}/{seed}/{timestamp}/config.json" 51 | config_path = Path(new_folder) / 'config.json' 52 | wget.download(url_config, out=str(config_path)) 53 | with open(config_path, 'r') as f: 54 | config_content = json.load(f) 55 | config_content['seed'] = int(seed[-1]) 56 | with open(config_path, 'w') as f: 57 | json.dump(config_content, f) 58 | 59 | 60 | def download_models(experiment, logging, trained_model_path, 61 | group_id, seed, timestamp): 62 | new_folder = str(trained_model_path).split('/trained_model.pth')[0] 63 | if os.path.exists(trained_model_path) is False: 64 | logging.info(f"Downloading model for {seed} since it does not exist on the local machine") 65 | url = f"http://www.robots.ox.ac.uk/~vgg/research/collaborative-experts/data/models/{experiment}/{group_id}/{seed}/{timestamp}/trained_model.pth" 66 | # import pdb; pdb.set_trace() 67 | Path(new_folder).mkdir(exist_ok=True, parents=True) 68 | wget.download(url, out=str(trained_model_path)) 69 | else: 70 | logging.info(f"Model already downloaded for {experiment} seed {seed}") 71 | if os.path.exists(Path(new_folder) / 'config.json') is False: 72 | download_configs(experiment, trained_model_path, group_id, seed, timestamp) 73 | else: 74 | logging.info(f"Config already downloaded for {experiment} seed {seed}") 75 | 76 | def run_one_exp(experiment, experiments, logging): 77 | group_id = experiments[experiment][0] 78 | 79 | with open('exp_to_seed_time.json', 'r') as f: 80 | json_dict = json.load(f) 81 | log_files = [] 82 | for (group_id, seed, timestamp) in json_dict[experiment]: 83 | 84 | group_id_path = Path("data/saved/models") / experiment / group_id 85 | logging.info("Running evaluation on existent seeds") 86 | (Path("logs_eval")).mkdir(exist_ok=True, parents=True) 87 | trained_model_path = group_id_path / seed / timestamp / 'trained_model.pth' 88 | download_models(experiment, logging, trained_model_path, 89 | group_id, seed, timestamp) 90 | config_path = group_id_path / seed / timestamp / 'config.json' 91 | cmd = f"python test.py --config {config_path} --resume {trained_model_path} --device 0 --eval_from_training_config >&1 | tee logs_eval/log_{group_id}_{seed}.txt" 92 | 93 | log_files.append(f"log_{group_id}_{seed}.txt") 94 | logging.info(cmd) 95 | subprocess.call(cmd, shell=True) 96 | logging.info("Now averaging results") 97 | 98 | extracting_log_info(log_files, experiment, logging) 99 | 100 | 101 | 102 | def main(): 103 | parser = argparse.ArgumentParser() 104 | parser.add_argument("--experiments_path", default="misc/experiments-audiocaps.json") 105 | parser.add_argument("--experiment", type=str, default=None) 106 | parser.add_argument( 107 | "--data_dir", 108 | type=Path, 109 | default="data", 110 | ) 111 | parser.add_argument( 112 | "--dataset", 113 | type=str, 114 | default="data", 115 | ) 116 | parser.add_argument( 117 | "--refresh", 118 | action="store_true", 119 | ) 120 | args = parser.parse_args() 
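# Usage sketch: with the defaults above, `python eval.py` evaluates every experiment
# listed in misc/experiments-audiocaps.json, while `python eval.py --experiment NAME`
# evaluates a single entry; NAME is a placeholder and must be a key of that file
# (and of exp_to_seed_time.json).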
121 | os.makedirs('logs', exist_ok=True) 122 | logging.basicConfig(filename=f"logs/{datetime.now().strftime(r'%m%d_%H%M%S')}.log", 123 | level=logging.INFO) 124 | logging.getLogger().addHandler(logging.StreamHandler()) 125 | logging.info(args) 126 | 127 | with open(args.experiments_path, "r") as f: 128 | experiments = json.load(f) 129 | 130 | if args.experiment is None: 131 | run_exp(experiments, logging) 132 | else: 133 | run_one_exp(args.experiment, experiments, logging) 134 | 135 | 136 | 137 | if __name__ == "__main__": 138 | main() 139 | -------------------------------------------------------------------------------- /configs/data_loader_queryd.json: -------------------------------------------------------------------------------- 1 | { 2 | "inherit_from": "configs/base_config_queryd.json", 3 | "eval_mode": "test_run", 4 | "experts": { 5 | "text_feat": "w2v", 6 | "modalities": [ 7 | "imagenet.resnext101_32x48d.0", 8 | "r2p1d.r2p1d-ig65m.0", 9 | "scene.densenet161.0", 10 | "audio" 11 | ] 12 | }, 13 | "arch": { 14 | "type": "CENet", 15 | "args": { 16 | "test_caption_mode": "indep", 17 | "use_ce": "pairwise", 18 | "use_mish": 1, 19 | "use_bn_reason": 1, 20 | "num_g_layers": 3, 21 | "num_h_layers": 0, 22 | "include_self": 1, 23 | "l2renorm": false, 24 | "randomise_feats": "", 25 | "vlad_clusters": { 26 | "text": 20, 27 | "audio": 16 28 | }, 29 | "ghost_clusters": { 30 | "text": 1 31 | }, 32 | "mimic_ce_dims": 0 33 | } 34 | }, 35 | "optimizer": { 36 | "type": "Ranger", 37 | "args": { 38 | "lr": 0.01, 39 | "weight_decay": 1E-03 40 | } 41 | }, 42 | "loss": { 43 | "type": "MaxMarginRankingLoss", 44 | "args": { 45 | "margin": 0.2, 46 | "fix_norm": true 47 | } 48 | }, 49 | "data_loader": { 50 | "type": "ExpertDataLoader", 51 | "args":{ 52 | "dataset_name": "QuerYD", 53 | "data_dir": "data/QuerYD", 54 | "root_feat_folder": "structured-symlinks", 55 | "trn_cat": 0, 56 | "batch_size": 128, 57 | "split_name": "val", 58 | "fuse_captions": true, 59 | "num_test_captions": 1, 60 | "max_tokens": { 61 | "text": 70, 62 | "audio": 500 63 | }, 64 | "feat_aggregation": { 65 | "imagenet.senet154.0": { 66 | "fps": 25, 67 | "stride": 1, 68 | "pixel_dim": 256, 69 | "aggregate-axis": 1, 70 | "offset": 0, 71 | "temporal": "avg", 72 | "aggregate": "concat", 73 | "type": "embed", 74 | "feat_dims": { 75 | "embed": 2048, 76 | "logits": 1000 77 | } 78 | }, 79 | "imagenet.resnext101_32x48d.0": { 80 | "fps": 25, 81 | "stride": 1, 82 | "offset": 0, 83 | "pixel_dim": 256, 84 | "temporal": "avg", 85 | "aggregate": "concat", 86 | "aggregate-axis": 1, 87 | "type": "embed", 88 | "feat_dims": { 89 | "embed": 2048, 90 | "logits": 1000 91 | } 92 | }, 93 | "scene.densenet161.0": { 94 | "stride": 1, 95 | "fps": 25, 96 | "offset": 0, 97 | "temporal": "avg", 98 | "pixel_dim": 256, 99 | "aggregate": "concat", 100 | "aggregate-axis": 1, 101 | "type": "embed", 102 | "feat_dims": { 103 | "embed": 2208, 104 | "logits": 1000 105 | } 106 | }, 107 | "i3d.i3d.0": { 108 | "fps": 25, 109 | "offset": 0, 110 | "stride": 25, 111 | "inner_stride": 1, 112 | "pixel_dim": 256, 113 | "temporal": "avg", 114 | "aggregate": "concat", 115 | "aggregate-axis": 1, 116 | "type": "embed", 117 | "feat_dims": { 118 | "embed": 1024, 119 | "logits": 400 120 | } 121 | }, 122 | "r2p1d.r2p1d-ig65m.0": { 123 | "fps": 30, 124 | "offset": 0, 125 | "stride": 32, 126 | "inner_stride": 1, 127 | "pixel_dim": 256, 128 | "temporal": "avg", 129 | "aggregate": "concat", 130 | "aggregate-axis": 1, 131 | "type": "embed", 132 | "feat_dims": { 133 | "embed": 512, 134 | "logits": 359 
135 | } 136 | }, 137 | "r2p1d.r2p1d-ig65m-kinetics.0": { 138 | "fps": 30, 139 | "offset": 0, 140 | "stride": 32, 141 | "inner_stride": 1, 142 | "pixel_dim": 256, 143 | "temporal": "avg", 144 | "aggregate": "concat", 145 | "aggregate-axis": 1, 146 | "type": "embed", 147 | "feat_dims": { 148 | "embed": 512, 149 | "logits": 400 150 | } 151 | } 152 | } 153 | } 154 | }, 155 | "trainer": { 156 | "epochs": 20 157 | }, 158 | "eval_settings": { 159 | "data_loader": { 160 | "args": { 161 | "split_name": "test" 162 | } 163 | }, 164 | "tester": { 165 | "save_dir": "data/saved/", 166 | "verbosity": 2 167 | }, 168 | "disable_gpu": true 169 | }, 170 | "visualizer": { 171 | "type": "Visualizer", 172 | "args":{ 173 | "src_video_dir": "data/QuerYD/videos", 174 | "vis_vid_freq": 500, 175 | "num_samples": 100 176 | } 177 | } 178 | } 179 | -------------------------------------------------------------------------------- /utils/html.py: -------------------------------------------------------------------------------- 1 | import dominate 2 | from dominate.tags import meta, h3, table, tr, td, p, a, img, br, video, source, attr 3 | from dominate.tags import span 4 | import os 5 | 6 | 7 | class HTML: 8 | """This HTML class allows us to save images and write texts into a single HTML file. 9 | 10 | It consists of functions such as add_header (add a text header to the HTML file), 11 | add_images (add a row of images to the HTML file), and save (save the HTML to the disk). 12 | It is based on Python library 'dominate', a Python library for creating and 13 | manipulating HTML documents using a DOM API. 14 | """ 15 | 16 | def __init__(self, web_dir, title, refresh=0): 17 | """Initialize the HTML classes 18 | 19 | Parameters: 20 | web_dir (str) -- a directory that stores the webpage. HTML file will be 21 | created at <web_dir>/index.html; images will be saved at <web_dir>/images/ 22 | title (str) -- the webpage name 23 | refresh (int) -- how often the website refreshes itself; if 0, no refreshing 24 | """ 25 | self.title = title 26 | self.web_dir = web_dir 27 | self.img_dir = os.path.join(self.web_dir, "images") 28 | if not os.path.exists(self.web_dir): 29 | os.makedirs(self.web_dir) 30 | if not os.path.exists(self.img_dir): 31 | os.makedirs(self.img_dir) 32 | 33 | self.doc = dominate.document(title=title) 34 | if refresh > 0: 35 | with self.doc.head: 36 | meta(http_equiv="refresh", content=str(refresh)) 37 | 38 | def get_image_dir(self): 39 | """Return the directory that stores images""" 40 | return self.img_dir 41 | 42 | def add_header(self, text): 43 | """Insert a header to the HTML file 44 | 45 | Parameters: 46 | text (str) -- the header text 47 | """ 48 | with self.doc: 49 | h3(text) 50 | 51 | def add_videos(self, vids, txts, links, width=400, hidden_tag="hidden"): 52 | """add videos to the HTML file 53 | 54 | Parameters: 55 | vids (str list) -- a list of video paths 56 | txts (str list) -- a list of video names shown on the website 57 | links (str list) -- a list of hyperref links; when you click a video, 58 | it will redirect you to a new page 59 | """ 60 | self.t = table(border=1, style="table-layout: fixed;") # Insert a table 61 | self.doc.add(self.t) 62 | colors = ["red", "blue", "gold", "salmon"] 63 | with self.t: 64 | with tr(): 65 | for vid, txt, link in zip(vids, txts, links): 66 | td_style = "word-wrap: break-word; width:{}px".format(width) 67 | with td(style=td_style, halign="center", valign="top"): 68 | with p(): 69 | vid_path = str(vid) 70 | if vid_path == hidden_tag: 71 | p_style = "font-weight: bold; width:{}px;" 72 | p_style = p_style.format(width * 3) 73 | p("hidden video", style=p_style) 74 | else: 75 | with a(href=str(link)): 76 | with video(): 77 | attr(controls="controls") 78 | source(src=vid_path, type="video/mp4") 79 | br() 80 | rows = txt.split("<br>
") 81 | for idx, row in enumerate(rows): 82 | color = colors[idx % len(colors)] 83 | bold_tag = "<b>" 84 | if not row.startswith(bold_tag): 85 | s_style = "color:{};".format(color) 86 | else: 87 | s_style = "color:black; font-weight: bold;" 88 | row = row[len(bold_tag):] 89 | span(row, style=s_style) 90 | 91 | def add_images(self, ims, txts, links, width=400): 92 | """add images to the HTML file 93 | 94 | Parameters: 95 | ims (str list) -- a list of image paths 96 | txts (str list) -- a list of image names shown on the website 97 | links (str list) -- a list of hyperref links; when you click an image, 98 | it will redirect you to a new page 99 | """ 100 | self.t = table(border=1, style="table-layout: fixed;") # Insert a table 101 | self.doc.add(self.t) 102 | with self.t: 103 | with tr(): 104 | for im, txt, link in zip(ims, txts, links): 105 | td_style = "word-wrap: break-word;" 106 | with td(style=td_style, halign="center", valign="top"): 107 | with p(): 108 | with a(href=os.path.join("images", link)): 109 | img( 110 | style="width:%dpx" % width, 111 | src=os.path.join("images", im), 112 | ) 113 | br() 114 | p(txt) 115 | 116 | def save(self): 117 | """save the current content to the HTML file""" 118 | html_file = "%s/index.html" % self.web_dir 119 | f = open(html_file, "wt") 120 | f.write(self.doc.render()) 121 | f.close() 122 | 123 | 124 | if __name__ == "__main__": # we show an example usage here. 125 | html = HTML("web/", "test_html") 126 | html.add_header("hello world") 127 | 128 | ims, txts, links = [], [], [] 129 | for n in range(4): 130 | ims.append("image_%d.png" % n) 131 | txts.append("text_%d" % n) 132 | links.append("image_%d.png" % n) 133 | html.add_images(ims, txts, links) 134 | html.save() 135 | -------------------------------------------------------------------------------- /configs/data_loader_querydsegments.json: -------------------------------------------------------------------------------- 1 | { 2 | "inherit_from": "configs/base_config_queryd.json", 3 | "eval_mode": "test_run", 4 | "experts": { 5 | "text_feat": "w2v", 6 | "modalities": [ 7 | "imagenet.resnext101_32x48d.0", 8 | "r2p1d.r2p1d-ig65m.0", 9 | "scene.densenet161.0", 10 | "audio" 11 | ] 12 | }, 13 | "arch": { 14 | "type": "CENet", 15 | "args": { 16 | "test_caption_mode": "indep", 17 | "use_ce": "pairwise", 18 | "use_mish": 1, 19 | "use_bn_reason": 1, 20 | "num_g_layers": 3, 21 | "num_h_layers": 0, 22 | "include_self": 1, 23 | "l2renorm": false, 24 | "randomise_feats": "", 25 | "vlad_clusters": { 26 | "text": 20, 27 | "audio": 16 28 | }, 29 | "ghost_clusters": { 30 | "text": 1 31 | }, 32 | "mimic_ce_dims": 0 33 | } 34 | }, 35 | "optimizer": { 36 | "type": "Ranger", 37 | "args": { 38 | "lr": 0.01, 39 | "weight_decay": 1E-03 40 | } 41 | }, 42 | "loss": { 43 | "type": "MaxMarginRankingLoss", 44 | "args": { 45 | "margin": 0.2, 46 | "fix_norm": true 47 | } 48 | }, 49 | "data_loader": { 50 | "type": "ExpertDataLoader", 51 | "args":{ 52 | "dataset_name": "QuerYDSegments", 53 | "data_dir": "data/QuerYDSegments", 54 | "root_feat_folder": "structured-symlinks", 55 | "trn_cat": 0, 56 | "batch_size": 128, 57 | "split_name": "val", 58 | "fuse_captions": false, 59 | "num_test_captions": 1, 60 | "max_tokens": { 61 | "text": 70, 62 | "audio": 500 63 | }, 64 | "feat_aggregation": { 65 | "imagenet.senet154.0": { 66 | "fps": 25, 67 | "stride": 1, 68 | "pixel_dim": 256, 69 | "aggregate-axis": 1, 70 | "offset": 0, 71 | "temporal": "avg", 72 | "aggregate": "concat", 73 | "type": "embed", 74 | "feat_dims": { 75 | "embed": 2048, 76 |
"logits": 1000 77 | } 78 | }, 79 | "imagenet.resnext101_32x48d.0": { 80 | "fps": 25, 81 | "stride": 1, 82 | "offset": 0, 83 | "pixel_dim": 256, 84 | "temporal": "avg", 85 | "aggregate": "concat", 86 | "aggregate-axis": 1, 87 | "type": "embed", 88 | "feat_dims": { 89 | "embed": 2048, 90 | "logits": 1000 91 | } 92 | }, 93 | "scene.densenet161.0": { 94 | "stride": 1, 95 | "fps": 25, 96 | "offset": 0, 97 | "temporal": "avg", 98 | "pixel_dim": 256, 99 | "aggregate": "concat", 100 | "aggregate-axis": 1, 101 | "type": "embed", 102 | "feat_dims": { 103 | "embed": 2208, 104 | "logits": 1000 105 | } 106 | }, 107 | "i3d.i3d.0": { 108 | "fps": 25, 109 | "offset": 0, 110 | "stride": 25, 111 | "inner_stride": 1, 112 | "pixel_dim": 256, 113 | "temporal": "avg", 114 | "aggregate": "concat", 115 | "aggregate-axis": 1, 116 | "type": "embed", 117 | "feat_dims": { 118 | "embed": 1024, 119 | "logits": 400 120 | } 121 | }, 122 | "r2p1d.r2p1d-ig65m.0": { 123 | "fps": 30, 124 | "offset": 0, 125 | "stride": 32, 126 | "inner_stride": 1, 127 | "pixel_dim": 256, 128 | "temporal": "avg", 129 | "aggregate": "concat", 130 | "aggregate-axis": 1, 131 | "type": "embed", 132 | "feat_dims": { 133 | "embed": 512, 134 | "logits": 359 135 | } 136 | }, 137 | "r2p1d.r2p1d-ig65m-kinetics.0": { 138 | "fps": 30, 139 | "offset": 0, 140 | "stride": 32, 141 | "inner_stride": 1, 142 | "pixel_dim": 256, 143 | "temporal": "avg", 144 | "aggregate": "concat", 145 | "aggregate-axis": 1, 146 | "type": "embed", 147 | "feat_dims": { 148 | "embed": 512, 149 | "logits": 400 150 | } 151 | }, 152 | "s3dg.s3dg.0": { 153 | "fps": 10, 154 | "offset": 0, 155 | "stride": 16, 156 | "num_segments": null, 157 | "pixel_dim": 256, 158 | "inner_stride": 1, 159 | "temporal": "avg", 160 | "aggregate": "concat", 161 | "aggregate-axis": 1, 162 | "type": "embed", 163 | "feat_dims": { 164 | "embed": 1024, 165 | "logits": 512 166 | } 167 | } 168 | } 169 | } 170 | }, 171 | "trainer": { 172 | "epochs": 20 173 | }, 174 | "eval_settings": { 175 | "data_loader": { 176 | "args": { 177 | "split_name": "test" 178 | } 179 | }, 180 | "tester": { 181 | "save_dir": "data/saved/", 182 | "verbosity": 2 183 | }, 184 | "disable_gpu": true 185 | } 186 | } 187 | -------------------------------------------------------------------------------- /utils/ranger.py: -------------------------------------------------------------------------------- 1 | #Ranger deep learning optimizer - RAdam + Lookahead combined. 2 | #https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer 3 | 4 | #Ranger has now been used to capture 12 records on the FastAI leaderboard. 5 | 6 | #This version = 9.3.19 7 | 8 | #Credits: 9 | #RAdam --> https://github.com/LiyuanLucasLiu/RAdam 10 | #Lookahead --> rewritten by lessw2020, but big thanks to Github @LonePatient and @RWightman for ideas from their code. 11 | #Lookahead paper --> MZhang,G Hinton https://arxiv.org/abs/1907.08610 12 | 13 | #summary of changes: 14 | #full code integration with all updates at param level instead of group, moves slow weights into state dict (from generic weights), 15 | #supports group learning rates (thanks @SHolderbach), fixes sporadic load from saved model issues. 16 | #changes 8/31/19 - fix references to *self*.N_sma_threshold; 17 | #changed eps to 1e-5 as better default than 1e-8. 
18 | 19 | import math 20 | import torch 21 | from torch.optim.optimizer import Optimizer, required 22 | import itertools as it 23 | 24 | 25 | 26 | class Ranger(Optimizer): 27 | 28 | def __init__(self, params, lr=1e-3, alpha=0.5, k=6, N_sma_threshhold=5, betas=(.95,0.999), eps=1e-5, weight_decay=0): 29 | #parameter checks 30 | if not 0.0 <= alpha <= 1.0: 31 | raise ValueError(f'Invalid slow update rate: {alpha}') 32 | if not 1 <= k: 33 | raise ValueError(f'Invalid lookahead steps: {k}') 34 | if not lr > 0: 35 | raise ValueError(f'Invalid Learning Rate: {lr}') 36 | if not eps > 0: 37 | raise ValueError(f'Invalid eps: {eps}') 38 | 39 | #parameter comments: 40 | # beta1 (momentum) of .95 seems to work better than .90... 41 | #N_sma_threshold of 5 seems better in testing than 4. 42 | #In both cases, worth testing on your dataset (.90 vs .95, 4 vs 5) to make sure which works best for you. 43 | 44 | #prep defaults and init torch.optim base 45 | defaults = dict(lr=lr, alpha=alpha, k=k, step_counter=0, betas=betas, N_sma_threshhold=N_sma_threshhold, eps=eps, weight_decay=weight_decay) 46 | super().__init__(params,defaults) 47 | 48 | #adjustable threshold 49 | self.N_sma_threshhold = N_sma_threshhold 50 | 51 | #now we can get to work... 52 | #removed as we now use step from RAdam...no need for duplicate step counting 53 | #for group in self.param_groups: 54 | # group["step_counter"] = 0 55 | #print("group step counter init") 56 | 57 | #look ahead params 58 | self.alpha = alpha 59 | self.k = k 60 | 61 | #radam buffer for state 62 | self.radam_buffer = [[None,None,None] for ind in range(10)] 63 | 64 | #self.first_run_check=0 65 | 66 | #lookahead weights 67 | #9/2/19 - lookahead param tensors have been moved to state storage. 68 | #This should resolve issues with load/save where weights were left in GPU memory from first load, slowing down future runs. 69 | 70 | #self.slow_weights = [[p.clone().detach() for p in group['params']] 71 | # for group in self.param_groups] 72 | 73 | #don't use grad for lookahead weights 74 | #for w in it.chain(*self.slow_weights): 75 | # w.requires_grad = False 76 | 77 | def __setstate__(self, state): 78 | print("set state called") 79 | super(Ranger, self).__setstate__(state) 80 | 81 | 82 | def step(self, closure=None): 83 | loss = None 84 | #note - below is commented out b/c I have other work that passes back the loss as a float, and thus not a callable closure. 85 | #Uncomment if you need to use the actual closure... 
86 | 87 | #if closure is not None: 88 | #loss = closure() 89 | 90 | #Evaluate averages and grad, update param tensors 91 | for group in self.param_groups: 92 | 93 | for p in group['params']: 94 | if p.grad is None: 95 | continue 96 | grad = p.grad.data.float() 97 | if grad.is_sparse: 98 | raise RuntimeError('Ranger optimizer does not support sparse gradients') 99 | 100 | p_data_fp32 = p.data.float() 101 | 102 | state = self.state[p] #get state dict for this param 103 | 104 | if len(state) == 0: #if first time to run...init dictionary with our desired entries 105 | #if self.first_run_check==0: 106 | #self.first_run_check=1 107 | #print("Initializing slow buffer...should not see this at load from saved model!") 108 | state['step'] = 0 109 | state['exp_avg'] = torch.zeros_like(p_data_fp32) 110 | state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) 111 | 112 | #look ahead weight storage now in state dict 113 | state['slow_buffer'] = torch.empty_like(p.data) 114 | state['slow_buffer'].copy_(p.data) 115 | 116 | else: 117 | state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) 118 | state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) 119 | 120 | #begin computations 121 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 122 | beta1, beta2 = group['betas'] 123 | 124 | #compute variance mov avg 125 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 126 | #compute mean moving avg 127 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 128 | 129 | state['step'] += 1 130 | 131 | 132 | buffered = self.radam_buffer[int(state['step'] % 10)] 133 | if state['step'] == buffered[0]: 134 | N_sma, step_size = buffered[1], buffered[2] 135 | else: 136 | buffered[0] = state['step'] 137 | beta2_t = beta2 ** state['step'] 138 | N_sma_max = 2 / (1 - beta2) - 1 139 | N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) 140 | buffered[1] = N_sma 141 | if N_sma > self.N_sma_threshhold: 142 | step_size = math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step']) 143 | else: 144 | step_size = 1.0 / (1 - beta1 ** state['step']) 145 | buffered[2] = step_size 146 | 147 | if group['weight_decay'] != 0: 148 | p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) 149 | 150 | if N_sma > self.N_sma_threshhold: 151 | denom = exp_avg_sq.sqrt().add_(group['eps']) 152 | p_data_fp32.addcdiv_(-step_size * group['lr'], exp_avg, denom) 153 | else: 154 | p_data_fp32.add_(-step_size * group['lr'], exp_avg) 155 | 156 | p.data.copy_(p_data_fp32) 157 | 158 | #integrated look ahead... 
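#in update form: every k steps, slow_buffer <- slow_buffer + alpha * (p.data - slow_buffer),
#then p.data <- slow_buffer, i.e. the slow weights move a fraction alpha towards the fast
#(RAdam) weights and the fast weights are reset to that interpolated point.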
159 | #we do it at the param level instead of group level 160 | if state['step'] % group['k'] == 0: 161 | slow_p = state['slow_buffer'] #get access to slow param tensor 162 | slow_p.add_(self.alpha, p.data - slow_p) #(fast weights - slow weights) * alpha 163 | p.data.copy_(slow_p) #copy interpolated weights to RAdam param tensor 164 | 165 | return loss -------------------------------------------------------------------------------- /data_loader/AudioCaps_dataset.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import itertools 3 | from pathlib import Path 4 | from typing import Dict, List, Union 5 | 6 | from base.base_dataset import BaseDataset 7 | from typeguard import typechecked 8 | from utils import memory_summary 9 | from zsvision.zs_utils import concat_features, memcache 10 | import time 11 | import data_loader 12 | 13 | 14 | class AudioCaps(BaseDataset): 15 | 16 | @typechecked 17 | def __init__(self, testing_file: Union[None, str]=None, **kwargs): 18 | self.testing_file = testing_file 19 | super().__init__(**kwargs) 20 | 21 | print(f"self.testing_file: {self.testing_file}") 22 | 23 | @staticmethod 24 | @typechecked 25 | def dataset_paths(training_file=None, testing_file=None) -> Dict[str, Union[str, List[str], Path, Dict]]: 26 | subset_paths = {} 27 | # import pdb; pdb.set_trace() 28 | if testing_file is None: 29 | test_splits = { 30 | "val": "filtered_val_list.txt", 31 | "test": "final_filtered_test_list.txt", 32 | } 33 | using_testing_file = False 34 | else: 35 | test_splits = { 36 | "val": "filtered_val_list.txt", 37 | "test": testing_file, 38 | } 39 | using_testing_file = True 40 | print(f"using {testing_file}") 41 | if training_file is not None: 42 | try: 43 | val_per = training_file.split('.txt')[0].split('train_list_')[1] 44 | test_splits['val'] = f"filtered_val_list_{val_per}.txt" 45 | except IndexError: 46 | pass 47 | for split_name, fname in test_splits.items(): 48 | if training_file is None: 49 | print(f"using {test_splits['test']} is {using_testing_file} split {split_name}") 50 | subset_paths[split_name] = {"train": "train_list.txt", "val": fname} 51 | print(f"using {subset_paths[split_name]['train']} and {subset_paths[split_name]['val']}") 52 | else: 53 | print(f"using {test_splits['test']} is {using_testing_file} split {split_name}") 54 | subset_paths[split_name] = {"train": training_file, "val": fname} 55 | print(f"using {subset_paths[split_name]['train']} and {subset_paths[split_name]['val']}") 56 | 57 | feature_names = BaseDataset.common_feat_names() 58 | feature_names.append("audio.vggish.0") 59 | feature_names.append("pann.pann.0") 60 | feature_names.append("syncnet.syncnet.0") 61 | feature_names.append("vggsound.vggsound.0") 62 | text_feat_paths = BaseDataset.common_text_feat_paths() 63 | text_feat_paths = {key: Path("text_embeddings") / fname 64 | for key, fname in text_feat_paths.items()} 65 | challenge_text_feat_paths = {key: f"text_embeddings/{key}.pkl" 66 | for key in text_feat_paths} 67 | custom_paths = { 68 | "audio": ["aggregated_audio/vggish-raw.hickle"], 69 | "pann": ["aggregated_pann/pann-raw.hickle"], 70 | "syncnet": ["aggregated_syncnet/syncnet-raw.hickle"], 71 | "vggsound": ["aggregated_vggsound/vggsound-raw.hickle"], 72 | "speech": ["aggregated_speech/w2v_mean.pkl"] 73 | } 74 | feature_info = { 75 | "custom_paths": custom_paths, 76 | "feature_names": feature_names, 77 | "subset_list_paths": subset_paths, 78 | "text_feat_paths": text_feat_paths, 79 | "challenge_text_feat_paths": 
challenge_text_feat_paths, 80 | "raw_captions_path": "structured-symlinks/raw-captions.pkl", 81 | } 82 | return feature_info 83 | 84 | def load_features(self): 85 | root_feat = self.root_feat 86 | feat_names = {key: self.visual_feat_paths(key) for key in 87 | self.paths["feature_names"]} 88 | feat_names.update(self.paths["custom_paths"]) 89 | features = {} 90 | for expert, rel_names in feat_names.items(): 91 | if expert not in self.ordered_experts: 92 | continue 93 | feat_paths = tuple([Path(root_feat) / rel_name for rel_name in rel_names]) 94 | if len(feat_paths) == 1: 95 | features[expert] = memcache(feat_paths[0]) 96 | else: 97 | # support multiple forms of feature (e.g. max and avg pooling). For 98 | # now, we only support direct concatenation 99 | msg = f"{expert}: Only direct concatenation of multiple feats is possible" 100 | print(f"Concatenating aggregates for {expert}....") 101 | assert self.feat_aggregation[expert]["aggregate"] == "concat", msg 102 | axis = self.feat_aggregation[expert]["aggregate-axis"] 103 | x = concat_features.cache_info() # pylint: disable=no-value-for-parameter 104 | print(f"concat cache info: {x}") 105 | features_ = concat_features(feat_paths, axis=axis) 106 | 107 | memory_summary() 108 | 109 | #if expert == "speech": 110 | # features_defaults = defaultdict(lambda: np.zeros((1, 300))) 111 | # features_defaults.update(features_) 112 | # features_ = features_defaults 113 | # Make separate feature copies for each split to allow in-place filtering 114 | features[expert] = copy.deepcopy(features_) 115 | 116 | self.features = features 117 | if self.challenge_mode: 118 | self.load_challenge_text_features() 119 | else: 120 | self.raw_captions = memcache(root_feat / self.paths["raw_captions_path"]) 121 | # keys = list(raw_captions.keys()) 122 | # raw_captions_fused = {} 123 | # for key in keys: 124 | # raw_captions_fused[key] = list(itertools.chain.from_iterable(raw_captions[key])) 125 | # self.raw_captions = raw_captions_fused 126 | text_feat_path = root_feat / self.paths["text_feat_paths"][self.text_feat] 127 | self.text_features = memcache(text_feat_path) 128 | 129 | # overload video paths, which are structured differently for YouCook2 130 | self.video_path_retrieval = [f"videos/{x}.mp4" 131 | for x in self.partition_lists["val"]] 132 | 133 | def sanity_checks(self): 134 | msg = (f"Expected to have a single test caption for AudioCaps, since we assume " 135 | f"that the captions are fused (but using {self.num_test_captions})") 136 | if self.fuse_captions is True: 137 | assert self.num_test_captions == 1, msg 138 | 139 | def configure_train_test_splits(self, split_name): 140 | """Partition the dataset into train/val/test splits.
141 | 142 | Args: 143 | split_name (str): the name of the split 144 | """ 145 | print(f"Now working on {split_name}") 146 | # import pdb; pdb.set_trace() 147 | self.paths = type(self).dataset_paths(training_file=self.training_file, testing_file=self.testing_file) 148 | print("loading training/val splits....") 149 | tic = time.time() 150 | for subset, path in self.paths["subset_list_paths"][split_name].items(): 151 | if self.challenge_mode and split_name == "public_server_test" \ 152 | and subset == "val": 153 | root_feat = Path(self.challenge_test_root_feat_folder) 154 | else: 155 | root_feat = Path(self.root_feat) 156 | subset_list_path = root_feat / path 157 | if subset == "train" and self.eval_only: 158 | rows = [] 159 | else: 160 | with open(subset_list_path) as f: 161 | rows = f.read().splitlines() 162 | self.partition_lists[subset] = rows 163 | print("done in {:.3f}s".format(time.time() - tic)) 164 | self.split_name = split_name 165 | -------------------------------------------------------------------------------- /data_loader/data_loaders.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import functools 3 | from pathlib import Path 4 | from typing import Dict, List, Union 5 | 6 | import torch 7 | from typeguard import typechecked 8 | from torch.utils.data import DataLoader 9 | from zsvision.zs_utils import memcache 10 | 11 | from zsvision.zs_data_structures import HashableDict, HashableOrderedDict 12 | from data_loader.ActivityNet_dataset import ActivityNet 13 | from data_loader.QuerYD_dataset import QuerYD 14 | from data_loader.QuerYDSegments_dataset import QuerYDSegments 15 | from data_loader.AudioCaps_dataset import AudioCaps 16 | from data_loader.CLOTHO_dataset import CLOTHO 17 | 18 | @functools.lru_cache(maxsize=64, typed=False) 19 | def dataset_loader( 20 | text_dropout: float, 21 | fuse_captions: bool, 22 | spatial_feats: bool, 23 | use_zeros_for_missing: bool, 24 | challenge_mode: bool, 25 | eval_only: bool, 26 | task: str, 27 | data_dir: str, 28 | text_agg: str, 29 | text_feat: str, 30 | split_name: str, 31 | dataset_name: str, 32 | cls_partition: str, 33 | root_feat_folder: str, 34 | challenge_test_root_feat_folder: str, 35 | text_dim: int, 36 | num_test_captions: int, 37 | restrict_train_captions: int, 38 | logger: logging.Logger, 39 | max_tokens: Dict[str, int], 40 | raw_input_dims: HashableOrderedDict, 41 | feat_aggregation: HashableDict, 42 | distil_params: Union[None, Dict], 43 | training_file: Union[None, str], 44 | caption_masks: Union[None, str], 45 | ce_shared_dim: Union[None, int], 46 | **args, 47 | ): 48 | print(f"refreshing cache for {dataset_name} data loader [{split_name}]") 49 | kwargs = dict( 50 | task=task, 51 | data_dir=Path(data_dir), 52 | text_dim=text_dim, 53 | logger=logger, 54 | eval_only=eval_only, 55 | text_agg=text_agg, 56 | text_feat=text_feat, 57 | max_tokens=max_tokens, 58 | split_name=split_name, 59 | cls_partition=cls_partition, 60 | spatial_feats=spatial_feats, 61 | text_dropout=text_dropout, 62 | fuse_captions=fuse_captions, 63 | raw_input_dims=raw_input_dims, 64 | challenge_mode=challenge_mode, 65 | root_feat_folder=root_feat_folder, 66 | feat_aggregation=feat_aggregation, 67 | num_test_captions=num_test_captions, 68 | use_zeros_for_missing=use_zeros_for_missing, 69 | restrict_train_captions=restrict_train_captions, 70 | challenge_test_root_feat_folder=challenge_test_root_feat_folder, 71 | distil_params=distil_params, 72 | training_file=training_file, 73 | 
caption_masks=caption_masks, 74 | ce_shared_dim=ce_shared_dim, 75 | **args, 76 | ) 77 | if dataset_name == "ActivityNet": 78 | dataset = ActivityNet(**kwargs) 79 | elif dataset_name == "QuerYD": 80 | dataset = QuerYD(**kwargs) 81 | elif dataset_name == "QuerYDSegments": 82 | dataset = QuerYDSegments(**kwargs) 83 | elif dataset_name == "AudioCaps": 84 | dataset = AudioCaps(**kwargs) 85 | elif dataset_name == "CLOTHO": 86 | dataset = CLOTHO(**kwargs) 87 | return dataset 88 | 89 | 90 | class ExpertDataLoader: 91 | 92 | @typechecked 93 | def __init__( 94 | self, 95 | eval_only: bool, 96 | fuse_captions: bool, 97 | challenge_mode: bool, 98 | use_zeros_for_missing: bool, 99 | trn_cat: int, 100 | text_dim: int, 101 | batch_size: int, 102 | num_workers: int, 103 | num_test_captions: int, 104 | task: str, 105 | data_dir: str, 106 | text_agg: str, 107 | text_feat: str, 108 | split_name: str, 109 | dataset_name: str, 110 | root_feat_folder: str, 111 | text_dropout: float, 112 | max_tokens: Dict[str, int], 113 | raw_input_dims: Dict[str, int], 114 | feat_aggregation: Dict[str, Dict], 115 | logger: logging.Logger, 116 | spatial_feats: bool = False, 117 | restrict_train_captions: int = 0, 118 | drop_last: bool = False, 119 | refresh_lru_cache: bool = False, 120 | cls_partitions: List[str] = ["train", "val", "tiny", "challenge"], 121 | challenge_test_root_feat_folder: str = "challenge", 122 | distil_params: Union[None, Dict] = None, 123 | training_file: Union[None, str] = None, 124 | caption_masks: Union[None, str] = None, 125 | ce_shared_dim: Union[None, int] = None, 126 | **args, 127 | ): 128 | 129 | # Ensure that the dictionaries are hashable to allow use of caching 130 | raw_input_dims = HashableOrderedDict(raw_input_dims) 131 | feat_aggregation = HashableDict(feat_aggregation) 132 | if distil_params is not None: 133 | distil_params = HashableDict(distil_params) 134 | max_tokens = HashableDict(max_tokens) 135 | 136 | if refresh_lru_cache: 137 | logger.info("Explicitly refreshing dataloader and cuda cache") 138 | dataset_loader.cache_clear() 139 | torch.cuda.empty_cache() 140 | memcache.cache_clear() 141 | 142 | if trn_cat: 143 | raise NotImplementedError(f"Support for trn cat will need to be re-added") 144 | 145 | common_kwargs = dict( 146 | task=task, 147 | logger=logger, 148 | data_dir=data_dir, 149 | text_dim=text_dim, 150 | text_agg=text_agg, 151 | eval_only=eval_only, 152 | text_feat=text_feat, 153 | max_tokens=max_tokens, 154 | dataset_name=dataset_name, 155 | text_dropout=text_dropout, 156 | fuse_captions=fuse_captions, 157 | spatial_feats=spatial_feats, 158 | split_name=split_name, 159 | challenge_mode=challenge_mode, 160 | root_feat_folder=root_feat_folder, 161 | use_zeros_for_missing=use_zeros_for_missing, 162 | challenge_test_root_feat_folder=challenge_test_root_feat_folder, 163 | num_test_captions=num_test_captions, 164 | raw_input_dims=raw_input_dims, 165 | feat_aggregation=feat_aggregation, 166 | restrict_train_captions=restrict_train_captions, 167 | distil_params=distil_params, 168 | training_file=training_file, 169 | caption_masks=caption_masks, 170 | ce_shared_dim=ce_shared_dim, 171 | **args, 172 | ) 173 | 174 | if "retrieval" in task: 175 | # import pdb; pdb.set_trace() 176 | dataset = dataset_loader(cls_partition="train", **common_kwargs) 177 | x = dataset_loader.cache_info() # pylint: disable=no-value-for-parameter 178 | logger.info(f"cache info {x}") 179 | self.dataloaders = {"dataset": dataset} 180 | self.dataloaders["retrieval"] = dataset.get_retrieval_data() 181 | if not 
eval_only: 182 | train_loader = DataLoader( 183 | dataset=dataset, 184 | batch_size=batch_size, 185 | num_workers=num_workers, 186 | collate_fn=dataset.collate_data, 187 | drop_last=drop_last, 188 | shuffle=True, 189 | ) 190 | self.dataloaders["train"] = train_loader 191 | else: 192 | self.dataloaders = {} 193 | for cls_partition in cls_partitions: 194 | cls_dataset = dataset_loader(cls_partition=cls_partition, **common_kwargs) 195 | x = dataset_loader.cache_info() # pylint: disable=no-value-for-parameter 196 | logger.info(f"cache info [{cls_partition}] {x}") 197 | loader = DataLoader( 198 | dataset=cls_dataset, 199 | batch_size=batch_size, 200 | num_workers=num_workers, 201 | collate_fn=cls_dataset.collate_data, 202 | drop_last=False, 203 | shuffle=False, 204 | ) 205 | self.dataloaders[cls_partition] = loader 206 | 207 | logger.info(f"Loading data loaders with {num_workers} workers") 208 | self.num_test_captions = num_test_captions 209 | self.dataset_name = dataset_name 210 | 211 | def __getitem__(self, key): 212 | return self.dataloaders[key] 213 | -------------------------------------------------------------------------------- /misc/datasets/queryd/val_list.txt: -------------------------------------------------------------------------------- 1 | video-RXFilHLLqPM 2 | video-epKCqDN9fBo 3 | video-HkKRouJqGCg 4 | video-Q1CQUivEths 5 | video-0P1Td5OTS-A 6 | video-X2-S3pN1pt0 7 | video-UPA3bwVVzGI 8 | video-qi2m4V21bw4 9 | video-eRT_mIpXjbs 10 | video-qjZtHyPLQCE 11 | video-je4nDvNJXsg 12 | video-_iQ-Rb1ohDU 13 | video-r1FbiXDKonk 14 | video-zXBmZLmfQZ4 15 | video-bwzLiQZDw2I 16 | video-yf9dyfeFsFg 17 | video--rgDvP39Lqw 18 | video-VssqNaBnWoM 19 | video-KdpoLklTozo 20 | video-tyxYHIcIJoc 21 | video-zRgZ3sWvnqs 22 | video-vXccpwytjL8 23 | video-YkAX7Vk3JEw 24 | video-yZyxJxR6RCA 25 | video-5z7fKiO5Uzg 26 | video-81Y1Ligkpb4 27 | video-K_P8kQg1Qq8 28 | video-T0NPYZyI7V8 29 | video-_vUG5rqC6qI 30 | video-KD9vAYYLItg 31 | video-XTW3LVp4pWA 32 | video-oVS7kHYlLBc 33 | video-DvaPRlZtfyc 34 | video-8j3NmTv9AWg 35 | video-jTXOEBHC0HY 36 | video-gBdyU1b0ADQ 37 | video-uEu6r8MkQ0o 38 | video-1qM0p24SNhc 39 | video-SFQB8hJdLZw 40 | video-V4rufe1J-Q8 41 | video-sq83Saeop9Y 42 | video-HK9vbhTTwWU 43 | video-cT_Wuzag6VU 44 | video-ahj8Vef9L24 45 | video-KIZProYn7R4 46 | video-VYOjWnS4cMY 47 | video-OINa46HeWg8 48 | video-SNTcxD_xPfk 49 | video-jhfLlamufKE 50 | video-d61MkuYttDI 51 | video-JgWgJa1NtAY 52 | video-iDnE3PV4YNc 53 | video-UTLCo4PHRAw 54 | video-jcBLKdsmpo0 55 | video-F1NR-_YqOgE 56 | video-YoMEedm1DXM 57 | video-EYmN3Sjgvts 58 | video-HodFzcJHIYI 59 | video-bJw0_Fj4PGY 60 | video-RUkMwGquH_A 61 | video-sc5aZsS0-0Y 62 | video-p4cc4LkJFjg 63 | video-ineZXLbL7s8 64 | video-GaSRAzyxLKs 65 | video-2PEvPfsNDrw 66 | video-tRWbo2x5lnA 67 | video-czTG7JxOruo 68 | video-Y-Z7LObUlwA 69 | video-RBAmLm_jYyY 70 | video-ymc30meWzfg 71 | video-sHLKy3z7HwM 72 | video-d25HklopoSs 73 | video-uqxzi_ghjgc 74 | video-v9wRrYhlRgs 75 | video-zd2xn5U6e-E 76 | video-1I8ICdOySkw 77 | video-RcM0NG2Fuxo 78 | video-CBlaiBV_yJs 79 | video-_tKp2eARy3o 80 | video-I8ZvdblLcnk 81 | video-Gu0wmiIngAw 82 | video-s9jX0S7mvB8 83 | video-cX03usETYI0 84 | video-4FHckDWnDKI 85 | video-pYekAIt9wW8 86 | video-LiK2fhOY0nE 87 | video-ndA4YL-bBAQ 88 | video-uQp0Eihw2WA 89 | video-A5WeiYHnvNY 90 | video-b6yYd6Pq7Ic 91 | video-YTszmB9fqEs 92 | video-Bk4MR0IItiQ 93 | video-OsNH9Tm-A04 94 | video-GbycvPwr1Wg 95 | video--FcsIyqJDzc 96 | video-9pX1hxYW3YY 97 | video-gupNRww6vFc 98 | video-oJsYwehp_r4 99 | video-37aUB92yvHI 100 | 
video-a5V6gdu5ih8 101 | video-AWjBNSshF3s 102 | video-s_RV4Btuv2c 103 | video-sOnqjkJTMaA 104 | video-aLackFf0Zjw 105 | video-On7TvTDOyMQ 106 | video-LvigW4InYyk 107 | video-3yd_1z6OsrE 108 | video-pU0GSbe6r_4 109 | video-x9FzWnWW95U 110 | video-2y1QQWNZxZM 111 | video-beECZjCRLmQ 112 | video-hseWMRV3lA8 113 | video-LSfJQkA-bKE 114 | video-uwCbe2yBqTI 115 | video-kBeggSzwKQ4 116 | video-paXvS0cnQM4 117 | video-_zOX6BO2zjc 118 | video-vI_B7dtF7Q0 119 | video-ka8-Nefp_gk 120 | video-dsf_z4urc4s 121 | video-xrlgfC0SJ9g 122 | video-KzWYP4LsaJw 123 | video-FTL9gQ0pux0 124 | video-c5gLf3_SK3Q 125 | video-DfPMxdHZKsw 126 | video-Dev6T3ZCrY8 127 | video-vU6Ay6yvaLo 128 | video-baM917Zy04A 129 | video-5jjeIH8Y6XM 130 | video-7N98N0GkGjY 131 | video-DCM-sEpyh1Q 132 | video-3JNLwlcPBPI 133 | video-1Ez6dw3ywcc 134 | video-yhofIxEfld0 135 | video-Cdpf1Dl5b_4 136 | video-2DHYhZNHtck 137 | video-L3MtFGWRXAA 138 | video--CR65sS1Frw 139 | video-HJENMThDg0k 140 | video-nIs4S9YDPRs 141 | video-5OBvbyAQ68g 142 | video-ih3CbjixhoI 143 | video-PV34pW-53Os 144 | video-XOqqP5Ww9lE 145 | video-SQBGJr8THGk 146 | video-YlrGp1YxMrc 147 | video-6US4AyvEO_A 148 | video-RF99-5G-Hrk 149 | video-JuKCOthud68 150 | video-Co4dLH29PvM 151 | video-3yQUzU8c4us 152 | video-Y8ZWX2NP3i0 153 | video-xqEqAQadKqE 154 | video-ghg2AP3i5TI 155 | video-jLeZc7li5HM 156 | video-RaRpFuSLyPI 157 | video-T7o0KMXccEA 158 | video-TdUsyXQ8Wrs 159 | video-zYX7iexkODw 160 | video-dkjbMoj0JY4 161 | video-mOaRH-aVFb4 162 | video-wekSrZ-d1bM 163 | video-GyTxtJ4gVLE 164 | video-4GfBVEoxStA 165 | video-7VS1wPeWqAA 166 | video-skKUzMST92g 167 | video-pzAZnOyMTI4 168 | video-J3iVxb8cwOU 169 | video-OZd9jf5nV7I 170 | video-xrhkfADEtMU 171 | video-CRxshNHF98U 172 | video-7WhJ2L5xUqI 173 | video-ClW-SQ7GdiM 174 | video-NjyWl-Bz6Q8 175 | video-rCZ3SN65kIs 176 | video-osP9iJjvlAE 177 | video-30qOijVBS7o 178 | video-63fcAemH_wg 179 | video-MsOzAbUt8n8 180 | video-ndhwbt9OQ0Q 181 | video-WkxE_Fs_mHI 182 | video-i4eADcCnFjo 183 | video--cf_-i_gCdY 184 | video-NB69vdkxn4Y 185 | video-WxXiQqul4io 186 | video-nojC6fP56VI 187 | video-R3qJ-u4b5W4 188 | video-r1AOXI0eBL8 189 | video-oM_M_d9OiHc 190 | video-U5oHhI_GmJs 191 | video-KAyVk_sH42k 192 | video-ezX-a1FT_ns 193 | video-GNZBSZD16cY 194 | video-eWXOurnVTYg 195 | video-sdbHXKlpPAM 196 | video-vfCddWB_Jlw 197 | video-DexH4oCXw-Y 198 | video-fmOaYJ4K09k 199 | video-duF40iZq464 200 | video-z5UScMQUO6Q 201 | video-KHiR4qVpcG8 202 | video-zFCNUW0TfqE 203 | video-Ujg7vcIa7kM 204 | video-yM6UU6QTt4M 205 | video-phQDinMbmic 206 | video-r5L7Iokg5RY 207 | video-DIBw9dSVKdU 208 | video-_La7IMssNOA 209 | video-7Fjt-mlIlTY 210 | video-EpcDZbXslfw 211 | video-prZuZlP4Pqo 212 | video-ndh11VDx_J8 213 | video-X2niZRgGZ7E 214 | video-8jaxiha8-rY 215 | video-C7uAB94aRrQ 216 | video-AoKlbyqbEGM 217 | video-Mv1FKi_-A1I 218 | video-dx0-pNkwOv8 219 | video-KWNEc1Igadg 220 | video-Hf_2ilitep0 221 | video-F5jNkpjGh8A 222 | video-VsY834tcKw8 223 | video-mG4Y2Snygfk 224 | video-vxjW8sfUCCU 225 | video-yCXSgVFsQnQ 226 | video-KYazqIHYqNI 227 | video-5lgHJB1lwYs 228 | video-GPQwSEzXBXU 229 | video-kg-EEBIe7Lk 230 | video-_YoeHOTJBI4 231 | video-yonJuvlA34U 232 | video-kshxs2WBjmE 233 | video-UUlaseGrkLc 234 | video-LlndnhlJnIw 235 | video-4o-qnznd10Y 236 | video-q3wJ32w4s_A 237 | video-qCNodlSc6Hw 238 | video-ml9EdqgtVfU 239 | video-83Wu5xmstn4 240 | video-mrA0oL6wLQA 241 | video-WbmXpHfabuQ 242 | video-Suv9QImeAog 243 | video-jF3I7VpfCEU 244 | video-IYlOZxb0ViI 245 | video-oY6tCnu-1Do 246 | video-fmnUwvZAMjQ 247 | video-g2wsNw07wRY 248 | 
video-L9KC5W7A2yI 249 | video-db3Ep-jM6ZE 250 | video-9NvPlA3G53I 251 | video-uGl4PRmhRxk 252 | video-UZO5q0B5wfw 253 | video-KItqbZXlrdY 254 | video-W4qMyGXTcsg 255 | video-pIyTWg9oV0M 256 | video-9_5wHw6l11o 257 | video-Juj026QZGDo 258 | video-25lxyul1lb0 259 | video-V-AFUpW3oNg 260 | video-W-xuEJVEraw 261 | video-tFf6pt9HOq8 262 | video-FlYf0F1fuTc 263 | video-Xl6yy6a3emw 264 | video-Ta9K22D0o5Q 265 | video-xA9uSxI36Ik 266 | video-MqIJKnUkGLY 267 | video-L-LE-j2zkCU 268 | video-ZdzD897w11s 269 | video-s0GtT-vN33I 270 | video-fWDaRN490BI 271 | video-XbxRqlHtKUE 272 | video-S0zfR9DTwSY 273 | video-p-_UlScFrQ8 274 | video-JMKHbmwltWQ 275 | video-g-uBt1SoCRQ 276 | video-2b7aoZsavu0 277 | video-A0Wk08f8mUU 278 | video-a7CpzJ-sNl8 279 | video-B1yJuGQOUaY 280 | video-gryenlQKTbE 281 | video-txqiwrbYGrs 282 | video-k-Z8xxygd2Y 283 | video-Qo95rTt9ikU 284 | video-xiSIQzwIPzQ 285 | video-ZtCZGwLH5_o 286 | video-ghLkwSlWSXw 287 | video-Ui7jeZSsgFs 288 | video-rVu1oVDRLgE 289 | video-hDjzdFXmH4g 290 | video-f8PXvqYpGCM 291 | video-fAr-ZYq4RmU 292 | video-6MW7bkk3MZ0 293 | video-tNeYTDLZUJA 294 | video-b7cSIiKxEt4 295 | video-_6PNGyfwjTA 296 | video-pGQd630EpLU 297 | video-ZGc06DUIpaA 298 | video-OKaD4EcmZO8 299 | video-_F6h0yH7EyU 300 | video--A8EERSVAdk 301 | video-II5UsqP2JAk 302 | video-60kjpwyQhqg 303 | video-GR2o6k8aPlI 304 | video-vmzlLFAf3M4 305 | video-mm3rTwAxH20 306 | video-9lZlt-SlABw 307 | video-Yz9u-oG3BgM 308 | video-TuE6z8X-rGg 309 | video-QeYISW-Jplw 310 | video-C6a9AQY_srk 311 | video-aWOCk_57xj8 312 | video-VUs6l9p34v8 313 | video-MdEV-jWMGWw 314 | video-T0rrS51ry2s 315 | video-ODDZPV-Avfw 316 | video-g8Ir5rQJeX0 317 | video-x1kQ-38-Drg 318 | video-T47vNsMjjn8 319 | video-vT-naHyejL8 320 | video-UTO0ogdNMdY 321 | video-xxuQnCg9ML0 322 | video-VfDWQG47pAQ 323 | video-ebyf6FzKEiI 324 | video-6XfdHj11-N0 325 | video-lLYp4b_p_wg 326 | video-RxtzQg57O7w 327 | video-qDMMRpSmzq4 328 | video-_4BT6iLtzUM 329 | video-F6j0EbS7skc 330 | video-0LHxvxdRnYc 331 | video-BDnXdeoiYRE 332 | video-vbuq7w3ZDUQ 333 | video-uE74-8YAV9E 334 | video-6ri6gK4FcnE 335 | video-pRfZmKKX2c4 336 | video-bln98NpCLQA 337 | video-gU2vD-FewGM 338 | video-w17iS0AJHjU 339 | video-6yd2tv4Ni4c 340 | video-GSvbZyWXsME 341 | video-t2mU6USTBRE 342 | video-ccgW5CHFg0Y 343 | video-G0E31788Nfg 344 | video-S7VYhBwbprE 345 | video-xSz0zs0v6e8 346 | video-V7QYLEusDU4 347 | video-mwm0OwqWvF4 348 | video-OxrBik16Hzg 349 | video-63d0vV0kk_Y 350 | video-74v22joL7J4 351 | video-RFinNxS5KN4 352 | video-xBD_s0RhUko 353 | video-M1GO1X09Gec 354 | video-LkjuS_tzmIE 355 | video-AB2oAgjjt3g 356 | video-4yG8caPPY1Y 357 | video-4cR7tNWsuNM 358 | video-ztim_RY82G8 359 | video-HF0Ev_skUAY 360 | video-nGkUzdNi_gs 361 | video-TzZuNeRdFIc 362 | video-8zkVKHy1hyM 363 | video-VjOSxus84WY 364 | video-kupuUVYxZxU 365 | video-khvaIwonxUk 366 | video-c_Ex_qS5Djo 367 | video--W37TDK6dBM 368 | video-cN6uZkmGSLM 369 | video-k431Cy2-kkA 370 | video-JCOqo88eW1E 371 | video-8Fu5pKcrTZI 372 | video-_Z2iurLVDEQ 373 | video-zC2G6lf9fCs 374 | video-V3wLiAmIrGk 375 | video-g_QG77WomHo 376 | video-KLzYvzQbBLI 377 | video-QYsg3rtT79o 378 | video-Q8fUy8qwV3M 379 | video-__5k7e0f3r4 380 | video-GRi80V8ire8 381 | video-ckEoLBiE3Xs 382 | video-24f0OwnZE-Q 383 | video-1WpStml5fe8 384 | video-UJfqp1dmJ3I 385 | -------------------------------------------------------------------------------- /utils/radam.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch.optim.optimizer import Optimizer, 
required 4 | 5 | class RAdam(Optimizer): 6 | 7 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0): 8 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) 9 | self.buffer = [[None, None, None] for ind in range(10)] 10 | super(RAdam, self).__init__(params, defaults) 11 | 12 | def __setstate__(self, state): 13 | super(RAdam, self).__setstate__(state) 14 | 15 | def step(self, closure=None): 16 | 17 | loss = None 18 | if closure is not None: 19 | loss = closure() 20 | 21 | for group in self.param_groups: 22 | 23 | for p in group['params']: 24 | if p.grad is None: 25 | continue 26 | grad = p.grad.data.float() 27 | if grad.is_sparse: 28 | raise RuntimeError('RAdam does not support sparse gradients') 29 | 30 | p_data_fp32 = p.data.float() 31 | 32 | state = self.state[p] 33 | 34 | if len(state) == 0: 35 | state['step'] = 0 36 | state['exp_avg'] = torch.zeros_like(p_data_fp32) 37 | state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) 38 | else: 39 | state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) 40 | state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) 41 | 42 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 43 | beta1, beta2 = group['betas'] 44 | 45 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 46 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 47 | 48 | state['step'] += 1 49 | buffered = self.buffer[int(state['step'] % 10)] 50 | if state['step'] == buffered[0]: 51 | N_sma, step_size = buffered[1], buffered[2] 52 | else: 53 | buffered[0] = state['step'] 54 | beta2_t = beta2 ** state['step'] 55 | N_sma_max = 2 / (1 - beta2) - 1 56 | N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) 57 | buffered[1] = N_sma 58 | 59 | # more conservative since it's an approximated value 60 | if N_sma >= 5: 61 | step_size = math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step']) 62 | else: 63 | step_size = 1.0 / (1 - beta1 ** state['step']) 64 | buffered[2] = step_size 65 | 66 | if group['weight_decay'] != 0: 67 | p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) 68 | 69 | # more conservative since it's an approximated value 70 | if N_sma >= 5: 71 | denom = exp_avg_sq.sqrt().add_(group['eps']) 72 | p_data_fp32.addcdiv_(-step_size * group['lr'], exp_avg, denom) 73 | else: 74 | p_data_fp32.add_(-step_size * group['lr'], exp_avg) 75 | 76 | p.data.copy_(p_data_fp32) 77 | 78 | return loss 79 | 80 | class PlainRAdam(Optimizer): 81 | 82 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0): 83 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) 84 | 85 | super(PlainRAdam, self).__init__(params, defaults) 86 | 87 | def __setstate__(self, state): 88 | super(PlainRAdam, self).__setstate__(state) 89 | 90 | def step(self, closure=None): 91 | 92 | loss = None 93 | if closure is not None: 94 | loss = closure() 95 | 96 | for group in self.param_groups: 97 | 98 | for p in group['params']: 99 | if p.grad is None: 100 | continue 101 | grad = p.grad.data.float() 102 | if grad.is_sparse: 103 | raise RuntimeError('RAdam does not support sparse gradients') 104 | 105 | p_data_fp32 = p.data.float() 106 | 107 | state = self.state[p] 108 | 109 | if len(state) == 0: 110 | state['step'] = 0 111 | state['exp_avg'] = torch.zeros_like(p_data_fp32) 112 | state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) 113 | else: 114 | state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) 115 | 
state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) 116 | 117 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 118 | beta1, beta2 = group['betas'] 119 | 120 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 121 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 122 | 123 | state['step'] += 1 124 | beta2_t = beta2 ** state['step'] 125 | N_sma_max = 2 / (1 - beta2) - 1 126 | N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) 127 | 128 | if group['weight_decay'] != 0: 129 | p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) 130 | 131 | # more conservative since it's an approximated value 132 | if N_sma >= 5: 133 | step_size = group['lr'] * math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step']) 134 | denom = exp_avg_sq.sqrt().add_(group['eps']) 135 | p_data_fp32.addcdiv_(-step_size, exp_avg, denom) 136 | else: 137 | step_size = group['lr'] / (1 - beta1 ** state['step']) 138 | p_data_fp32.add_(-step_size, exp_avg) 139 | 140 | p.data.copy_(p_data_fp32) 141 | 142 | return loss 143 | 144 | 145 | class AdamW(Optimizer): 146 | 147 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, warmup = 0): 148 | defaults = dict(lr=lr, betas=betas, eps=eps, 149 | weight_decay=weight_decay, warmup = warmup) 150 | super(AdamW, self).__init__(params, defaults) 151 | 152 | def __setstate__(self, state): 153 | super(AdamW, self).__setstate__(state) 154 | 155 | def step(self, closure=None): 156 | loss = None 157 | if closure is not None: 158 | loss = closure() 159 | 160 | for group in self.param_groups: 161 | 162 | for p in group['params']: 163 | if p.grad is None: 164 | continue 165 | grad = p.grad.data.float() 166 | if grad.is_sparse: 167 | raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') 168 | 169 | p_data_fp32 = p.data.float() 170 | 171 | state = self.state[p] 172 | 173 | if len(state) == 0: 174 | state['step'] = 0 175 | state['exp_avg'] = torch.zeros_like(p_data_fp32) 176 | state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) 177 | else: 178 | state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) 179 | state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) 180 | 181 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 182 | beta1, beta2 = group['betas'] 183 | 184 | state['step'] += 1 185 | 186 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 187 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 188 | 189 | denom = exp_avg_sq.sqrt().add_(group['eps']) 190 | bias_correction1 = 1 - beta1 ** state['step'] 191 | bias_correction2 = 1 - beta2 ** state['step'] 192 | 193 | if group['warmup'] > state['step']: 194 | scheduled_lr = 1e-8 + state['step'] * group['lr'] / group['warmup'] 195 | else: 196 | scheduled_lr = group['lr'] 197 | 198 | step_size = scheduled_lr * math.sqrt(bias_correction2) / bias_correction1 199 | 200 | if group['weight_decay'] != 0: 201 | p_data_fp32.add_(-group['weight_decay'] * scheduled_lr, p_data_fp32) 202 | 203 | p_data_fp32.addcdiv_(-step_size, exp_avg, denom) 204 | 205 | p.data.copy_(p_data_fp32) 206 | 207 | return loss 208 | -------------------------------------------------------------------------------- /misc/datasets/queryd/test_list.txt: -------------------------------------------------------------------------------- 1 | video-uc0O6cqYbyk 2 | video-vzJqeyxye_E 3 | video-1wn-5-HaKaI 4 | video-BhmvJuTc4aY 5 | video-DqilSuFK3B8 6 | video-BxyXk1sr2io 7 | 
video-rMXsAtYtLTo 8 | video-HX8KX5u0gkg 9 | video-cWX9iR_PzIQ 10 | video-aooZ26YKH-8 11 | video-eOrNdBpGMv8 12 | video-alLd1Bobkf8 13 | video-G_TwXO0yFeM 14 | video-fbri4or6Uhk 15 | video-uWuOBonC1ds 16 | video-lm0OZ8cMW6M 17 | video-Ro7tuDVP6Ks 18 | video-dNJdJIwCF_Y 19 | video-avl2RPsmlnI 20 | video-9AMPsDXGAxY 21 | video-MXVz4izTWZY 22 | video-W2asVll3OSU 23 | video-x152oloLhVM 24 | video-cNi_HC839Wo 25 | video-8gy5tYVR-28 26 | video-Ceu5cSJiAic 27 | video-6fan6ggvh4U 28 | video-DYldBHQw3s4 29 | video-Kqr2ibw5FKA 30 | video-KJiCalhfN6k 31 | video-Ahvl7V82ycE 32 | video-bSfDQr-FFJU 33 | video-1WsDtn-feuI 34 | video-pQD6gJMnWVs 35 | video-5xJ6h6OTveU 36 | video-Iqtv2oqNNIE 37 | video-z41fUWaNwO0 38 | video-wAnrYQGSHMw 39 | video-oyv6hjQr8MA 40 | video-mvD5wvlIE7c 41 | video-H5MKL1duTrg 42 | video-vN1fuDgj-Qk 43 | video-c38r-SAnTWM 44 | video-q6rAllJAdWk 45 | video-F-a5tZ_5g9E 46 | video-USGLDwaLnaI 47 | video-UE2Pe5DaKiE 48 | video-c6KD-kOqwpE 49 | video-NLlGopyXT_g 50 | video-_NWYeVyZz9I 51 | video--IRNEMiRg0Y 52 | video-lTxn2BuqyzU 53 | video-AZvdGSr7roA 54 | video-TvYCwP1U0xs 55 | video-MovSMelAxWg 56 | video-ER8yEOY0NCc 57 | video-8GGfE7zsD-0 58 | video-IWntTYTdXG8 59 | video-Q_P8WCbhC6s 60 | video-akYVGzeS7_A 61 | video-HZ21eT9lyog 62 | video-kPm0_jB5EQI 63 | video-3EBfr6KdnQc 64 | video-yq3Z9msqnOg 65 | video-T_FdjXqSZlc 66 | video-H7muODd2pCo 67 | video-gJWo7Z5m6e8 68 | video-V49ENdZlOx4 69 | video-E0pemP7JGV4 70 | video-AMjMFbhyhwY 71 | video-cEfD-Mr7m2c 72 | video-j8waYyUSSxg 73 | video-SBTL1vI4-mc 74 | video-4w7sVSMbjyM 75 | video-QPe3QSw49_Y 76 | video-cx-T137FKM4 77 | video-twKxRnoGxoM 78 | video-G9hIzIG1sPY 79 | video-oSO9q-2JjUs 80 | video-cOvModiiUjs 81 | video-Q-yA3q0mGCM 82 | video-yUkgydZh0Bk 83 | video-IDIQKPApIxM 84 | video-7HDtvRHyq2M 85 | video-g6KUxEfUVm0 86 | video-oCLTRjF2eq0 87 | video-9-k5J4RxQdE 88 | video-7-i6uxo4HS0 89 | video-GF60Iuh643I 90 | video-h56Cr2ho1Y8 91 | video-UdNZzQD0qY0 92 | video-bh-dRJaZdgM 93 | video-HTj2n52jz94 94 | video-gsYL4PC0hyk 95 | video-agqgBkpbCoY 96 | video-CrqIVVd3hp0 97 | video-XHGWTDHchVQ 98 | video-Ei6TjvfGMpE 99 | video-_MC3XuMvsDI 100 | video-ikUora2uPnU 101 | video-bGph2eX8RMI 102 | video-cd_AOrSEeRc 103 | video-3heXk6Oj6hU 104 | video-_tw7PR89IOI 105 | video-KUGf-irpTMQ 106 | video-z1fbwPHv-wA 107 | video-hhaNVna7eQs 108 | video-KwOdedfBqHE 109 | video-1g4AUYiz0LU 110 | video-_lmKuKrsKRA 111 | video-07d2dXHYb94 112 | video-pQu3dufotPM 113 | video-VpPJP7o7NnA 114 | video-hpsh8dYl7PE 115 | video-XrgVtuDRBjM 116 | video-0nB9BcZTBag 117 | video-nB-444rPm_8 118 | video-FwqsdGLhdgA 119 | video-5ZYcY-KX2JA 120 | video-yEvhDTWSRec 121 | video-3aZadizk_Io 122 | video-3CJl_S7uzqU 123 | video-u3VVUu-lZsM 124 | video-x4sadYeLHKU 125 | video-jQHVrDFNmJE 126 | video-AZGaCqDAlsU 127 | video-VqrBsMFRaLA 128 | video-CtBBL7Pb9Q0 129 | video-3EFiduilmn8 130 | video-EyVuypKJOq0 131 | video-p5u-vBV8NUU 132 | video-NvgYhf2LnVI 133 | video-9Ky2nyzOnMw 134 | video-SkB4gG8ke7Q 135 | video-GHDz-XDD8OU 136 | video-xyor66WBWPk 137 | video-UDcrGE3le20 138 | video-vYLaKMpqnOc 139 | video-Wu-gd9tLpmU 140 | video-BeGOgA18NIQ 141 | video-JNvpcGV1frQ 142 | video-jHe5vPlKgJA 143 | video-ZSd_IpzmcLM 144 | video-B8ISzf2pryI 145 | video-GZ0Bey4YUGI 146 | video-cEx5bSYJxtg 147 | video-IAq8pEFNeJs 148 | video-uhUUXMWoC_4 149 | video-DS8yeXFeEPA 150 | video-v3iPrBrGSJM 151 | video-uc1Hn4INDjk 152 | video-YPFGT4ecnIU 153 | video-D0a0aNqTehM 154 | video-tEZzagPGls4 155 | video-4QdmRufojsU 156 | video-4SkWU60v9Cg 157 | video-pjJEXkbeL-o 158 | video-W0_tLK37W24 159 | 
video-LJ9KtxNZdWE 160 | video-N_AcmtmegKI 161 | video-I6lZBoR5gvs 162 | video-8jucxdaifbs 163 | video-zW39WTnHCc8 164 | video-7RhQIZmkgDQ 165 | video-QNJx7Vi4Sg4 166 | video-Vlb2udqPx-M 167 | video-tUN-8TvevGU 168 | video-3SIfsFz_kMQ 169 | video-7KWKxe5HvLw 170 | video-VZaqHyHFCzc 171 | video-dZiJkicepzM 172 | video-94NanQuVkA4 173 | video-fhZo17Pxq1A 174 | video-HvPbH30KWLE 175 | video-J5R18MzrtKU 176 | video-sdUUx5FdySs 177 | video-88VViI5gNA4 178 | video-LsjNWQQOmNg 179 | video-LF71pZXhYrw 180 | video-gRD53bcAM8E 181 | video-CKwUNBEFI0E 182 | video-LnJwH_PZXnM 183 | video-Bzua8Zvlppo 184 | video-o2VFgHGKzx4 185 | video-G9YuKs3Jitk 186 | video-VGNFvm-YCEk 187 | video-huT5__BqY_U 188 | video-hywRdDVR76A 189 | video-WUG-x1TFewA 190 | video-osVxO-RA-pE 191 | video-t1PGWO2Lvmw 192 | video-1awM6kmpd2g 193 | video-KK-Mff60ZIE 194 | video-StAF3NSro-w 195 | video-makIgB4X3q8 196 | video-B_bdAJXsjvk 197 | video-jnaPpgK33Lo 198 | video-6EiRjwjp30I 199 | video-p_Rrovk5nsk 200 | video-EthCVn45VyU 201 | video-DeKXFHPr_oE 202 | video-kJzNZ10I1MY 203 | video-uceySVBjKNw 204 | video-CnOJgDW0gPI 205 | video-XxhKUP9Ixco 206 | video-wt62ayeVd44 207 | video-DSBhSywLRNA 208 | video-ZmdOe1hjW-s 209 | video-Qb2xoiVM7UA 210 | video-qniwI2hNhDs 211 | video-nPo2B-vjZ28 212 | video-4b6ttHSgIFM 213 | video-URTR3AtKTM8 214 | video-4XNjwKBqvxE 215 | video-V3L1qrisKFE 216 | video-0Cqt__04bAk 217 | video-CFq441el_ls 218 | video-R9Puz5RFl5o 219 | video-b1XGPvbWn0A 220 | video-zSlhbBBBi3A 221 | video-JNyn_w3hdZ0 222 | video-JnR2dpLnS14 223 | video-ENaJcHwQEVs 224 | video-QjqS7jzjX34 225 | video-4X1DieuShKI 226 | video-qI34nBlJxP8 227 | video-vjW4UOC7U3w 228 | video-aB_s9lw9E-M 229 | video-NXXkBSuIAl8 230 | video-y07at1bU89Q 231 | video-763brdRmWuc 232 | video-R4WDWpR4oRM 233 | video-Y_dXFLaEVJk 234 | video-d6g1c18Cy-8 235 | video-FRgDsBFC2IA 236 | video-1NunXMcaslA 237 | video-9Byx6TxOPx4 238 | video-xqZQ9KM_LjY 239 | video-tloBMf_KmX4 240 | video-JZIerGNMtnk 241 | video-4oETtq9w9Zg 242 | video-HLRxoMiagO8 243 | video-cdg193GvnBA 244 | video-O_HyZ5aW76c 245 | video-gLax3zOBN40 246 | video-ZVXz6ymCSIo 247 | video-wWVppdfYOx8 248 | video-jKXrOTdbtVQ 249 | video-KauXf6nihPY 250 | video-XeSW_3JEeTs 251 | video-kbzEFa7fiOE 252 | video-Z-E3cRZCne4 253 | video-peXSoTlkwVY 254 | video-G6fMV1UPzkg 255 | video-sBdqOWSZ56w 256 | video-OA1ZRGFKRVI 257 | video-4ClKFnnzSRA 258 | video-ygzR-ltUWug 259 | video-kltuUtE6jQo 260 | video-aI51UWF8_9Y 261 | video-cda7mSowTEI 262 | video-5NVYg2HNAdA 263 | video-Ezg4sr67OGA 264 | video-6ZjRKYPfO8g 265 | video-XPKf24_pXfQ 266 | video-ansWZq7yULE 267 | video-J0HiXwK5s2k 268 | video-QEpCsMbMx7w 269 | video-P0ISZpljc3E 270 | video-PFjp1MW6Lzc 271 | video-dPgs0GHgiYc 272 | video-mgNgscHJh6I 273 | video-oowcsynjIwc 274 | video-Q04KG7gVQtw 275 | video-FK3dav4bA4s 276 | video-iS9QQ8YOofQ 277 | video-D-hPct3oIow 278 | video-wxm8jTzU_8o 279 | video-Zh2-rVsXWUU 280 | video-Wji-BZ0oCwg 281 | video-hgLQQe5uUCE 282 | video-3h6KMumLAvI 283 | video-gHCxdlZ7G18 284 | video-PeBAzI9LuHM 285 | video-qI3AWoK7ABU 286 | video-nqkyzpaoMug 287 | video-IFgh9WU0lPs 288 | video-ppyYdn2nPoU 289 | video-iWYCoBiTnA0 290 | video-TubxNbCQ4Fk 291 | video-lwS74rI92YQ 292 | video-GjHkkTGf7fc 293 | video-TDquUlVDdbU 294 | video-5xKnmuDnJMs 295 | video-uW9KEiQFUE8 296 | video-RzR_O2DoSVs 297 | video-1ePcSm1ninM 298 | video-MxNfvh7vaSs 299 | video-6jLfuoOBX2I 300 | video-Qw6RD5S3e8o 301 | video-2kUMAA9yZgk 302 | video-QcAcBHosPzQ 303 | video-hCsVT9TKahk 304 | video-DAHbtsjuNws 305 | video-U6fPh2mm3pw 306 | video-PGKmexNTHNE 307 | 
video-NVItPJAu_Fk 308 | video-woWiyBgp5cs 309 | video-qX9FSZJu448 310 | video-YKejnIOvACY 311 | video-5nmhHL3sVIk 312 | video-3yHsRjoRec8 313 | video-6Ts-deSDnRM 314 | video-xZfZ-HB2yJI 315 | video-wj8XXvD4kGE 316 | video-9nVvIz8nYxo 317 | video-TqPCGGHoxsE 318 | video-QAEkuVgt6Aw 319 | video-3Bs4LOtIuxg 320 | video-Gv1aDEFlXq8 321 | video-gZp6CGgsS4A 322 | video-YOqmroV2cRo 323 | video-cHUNbTfzOr4 324 | video-MDdQBWyFmtc 325 | video-JdYSnsEM0gg 326 | video-_U-J9PqgmIc 327 | video-d6PMG7kXpF4 328 | video-xEaCpSzUq3Q 329 | video-28FyDT4cKrg 330 | video-JvQcabZ1zrk 331 | video-Bv2vT665bGI 332 | video-DMCSP73Rq4I 333 | video-o2AsIXSh2xo 334 | video-NoPMX5lqT6A 335 | video-7ToAmWnTsAI 336 | video-tbBzXKN32Sk 337 | video-5_uSZcXMV7s 338 | video-rbNB0jqMv7s 339 | video-AW0jm6i9U3M 340 | video--wHytb5Fe2k 341 | video-JtzsCx0P3tI 342 | video-G64wuf-rHoo 343 | video-FRpIk7yd2RA 344 | video-XvhlK0WGBr4 345 | video-tUyeaT2ZX1I 346 | video-bV3Ib6Ato6c 347 | video-bYw1gRtyGiw 348 | video-LrI4FmRIHpI 349 | video-8pfPl8BkfVY 350 | video-3veKbPi4r90 351 | video-_-O6Ppkrf98 352 | video-5hpBAn5lQPs 353 | video-I6PXKSiJchU 354 | video-sNcJMejrcnM 355 | video-OITWgx8K6Ko 356 | video-wXL5zXz550I 357 | video-bap6XjDDE3k 358 | video-g26mbST0YhU 359 | video-tkQuXvgvNPk 360 | video-dJJ0yadpqKI 361 | video-5yGNbyAmkVY 362 | video-0-NBRA1aSXk 363 | video-xVihCNfZaDg 364 | video-s1FWVQFeOpQ 365 | video-gFuEo2ccTPA 366 | video-L8hM2kbw2Ik 367 | video-yKGeJXk2qWQ 368 | video-tREqJ1_7h0w 369 | video-0m1IfJUNzmc 370 | video-LJosiEHwWxc 371 | video-W4Pr7PZ3Bgc 372 | video-M0nDEbrp9nM 373 | video-D6lmibFiur8 374 | video-oFV9ayoss_o 375 | video-7SldSIviMkg 376 | video-SQ6H-Mz6hgw 377 | video-GrzDQGVprjE 378 | video-ixh8KqEr6LE 379 | video-zywWM3J3i8M 380 | video-WjqiU5FgsYc 381 | video-mpDOscUDQ_0 382 | video-qMzt3yQFT-Q 383 | video-3NDfWjywzsI 384 | video-F2bk_9T482g 385 | video-aLjHyP683QU 386 | video-6Jgwc3sXLCc 387 | -------------------------------------------------------------------------------- /dataset_stats/get_videoid_perclass.py: -------------------------------------------------------------------------------- 1 | # AudioCaps dataset statistics 2 | # March 2021, ask 3 | 4 | 5 | import os 6 | import csv 7 | import json 8 | import numpy as np 9 | from tqdm import tqdm 10 | 11 | # ----------------------------------------------------------------------------- 12 | # Load AudioCaps test data 13 | # ----------------------------------------------------------------------------- 14 | 15 | audiocaps_base = '/home/askoepke97/coding/ce/collab-experts/data/AudioCaps/audiocaps/dataset' 16 | audiocaps_test_file = os.path.join(audiocaps_base, 'test.csv') 17 | 18 | audiocapid = [] 19 | youtubeid = [] 20 | yid_dict = dict() 21 | 22 | with open(audiocaps_test_file, 'r') as csvfile: 23 | reader = csv.reader(csvfile) 24 | i = 0 25 | for row in reader: 26 | if i > 0: 27 | if not int(row[2]) == 0: 28 | filename = row[1] + '_%d000'%int(row[2]) 29 | else: 30 | filename = row[1] + '_%d'%int(row[2]) 31 | ytname = row[1] 32 | yid_dict[filename] = ytname 33 | i += 1 34 | 35 | # ----------------------------------------------------------------------------- 36 | # Load audioset ontology and train (and eval) data 37 | # ----------------------------------------------------------------------------- 38 | 39 | audiosetbase = '/home/askoepke97/coding/ce/collab-experts/data/dataset_statistics' 40 | ontology = os.path.join(audiosetbase, 'ontology.json') 41 | 42 | evalcsv = os.path.join(audiosetbase, 'eval_segments.csv') 43 | traincsv = 
os.path.join(audiosetbase, 'unbalanced_train_segments.csv') 44 | 45 | with open(ontology) as json_file: 46 | ontology_data = json.load(json_file) 47 | 48 | classids = dict() 49 | 50 | for ind in np.arange(len(ontology_data)): 51 | classids[ontology_data[ind]['id']] = ontology_data[ind]['name'] 52 | 53 | evaldict = dict() 54 | 55 | with open(evalcsv, 'r') as as_csvfile: 56 | reader = csv.reader(as_csvfile) 57 | i = 0 58 | for row in reader: 59 | if i > 2: 60 | ytname = row[0] 61 | starttime = row[1] 62 | classes = row[3].split(',') 63 | newclasses = [] 64 | for classe in classes: 65 | if classe.strip()[0] == '"' and not classe.strip()[-1] == '"': 66 | newclasses.append(classids[row[3].strip()[1:]]) 67 | elif classe.strip()[-1] == '"' and not classe.strip()[0] == '"': 68 | newclasses.append(classids[row[3].strip()[:-1]]) 69 | elif classe.strip()[-1] == '"' and classe.strip()[0] == '"': 70 | newclasses.append(classids[row[3].strip()[1:-1]]) 71 | else: 72 | newclasses.append(classids[row[3].strip()]) 73 | evaldict[ytname] = newclasses 74 | i += 1 75 | 76 | traindict = dict() 77 | 78 | with open(traincsv, 'r') as as_csvfile: 79 | reader = csv.reader(as_csvfile) 80 | i = 0 81 | for row in reader: 82 | if i > 2: 83 | ytname = row[0] 84 | starttime = row[1] 85 | classes = row[3].split(',') 86 | newclasses = [] 87 | for classe in classes: 88 | if classe.strip()[0] == '"' and not classe.strip()[-1] == '"': 89 | newclasses.append(classids[row[3].strip()[1:]]) 90 | elif classe.strip()[-1] == '"' and not classe.strip()[0] == '"': 91 | newclasses.append(classids[row[3].strip()[:-1]]) 92 | elif classe.strip()[-1] == '"' and classe.strip()[0] == '"': 93 | newclasses.append(classids[row[3].strip()[1:-1]]) 94 | else: 95 | newclasses.append(classids[row[3].strip()]) 96 | traindict[ytname] = newclasses 97 | i += 1 98 | print(i, 'len train') 99 | 100 | # ----------------------------------------------------------------------------- 101 | # Load VGGSound training urls 102 | # ----------------------------------------------------------------------------- 103 | 104 | vggsoundpath = '/home/askoepke97/coding/gitrepos/sound_features/VGGSound/data/train.csv' 105 | vggvids = [] 106 | with open(vggsoundpath, 'r') as csv_file: 107 | reader = csv.reader(csv_file) 108 | for row in tqdm(reader): 109 | vggvids.append(row[0].split('_')[0]) 110 | 111 | # ----------------------------------------------------------------------------- 112 | # Find overlap between VGGSound training set and AudioCaps test set 113 | # ----------------------------------------------------------------------------- 114 | 115 | overlap_counter = 0 116 | vggcounter = 0 117 | uniqueclasses = [] #111 unique classes in unfiltered (before removing overlap with VGGSound) AudioCaps test set, 97 in the val set, 238 in train 118 | newclassdict = dict() 119 | overlap_test_videos = [] 120 | for key, value in tqdm(yid_dict.items()): 121 | if value in vggvids: 122 | vggcounter += 1 123 | overlap_test_videos.append(value) 124 | # # Check for overlap between AudioCaps test set and AudioSet training data 125 | # if value in evaldict.keys(): 126 | # if not evaldict[value] in newclassdict.values(): 127 | # uniqueclasses.append(evaldict[value]) 128 | # newclassdict[key] = evaldict[value] 129 | # elif value in traindict.keys(): 130 | # overlap_counter += 1 131 | # if not traindict[value] in newclassdict.values(): 132 | # uniqueclasses.append(traindict[value]) 133 | # newclassdict[key] = traindict[value] 134 | 135 | 136 | #
----------------------------------------------------------------------------- 137 | # Filter the test.csv dictionary yid_dict from AudioCaps for overlap with VGGSound 138 | # ----------------------------------------------------------------------------- 139 | 140 | new_yid_dict = dict() 141 | for key, value in yid_dict.items(): 142 | if not key.split('_')[0] in overlap_test_videos: 143 | new_yid_dict[key] = value 144 | 145 | # ----------------------------------------------------------------------------- 146 | # Make dictionaries that contain classes as keys and video names in AudioCaps 147 | # test as values 148 | # ----------------------------------------------------------------------------- 149 | 150 | class_video_dict = dict() 151 | for key, value in tqdm(new_yid_dict.items()): 152 | if value in traindict.keys(): 153 | for vid_class in traindict[value]: #traindict[value] could contain a list with multiple classes 154 | if not vid_class in class_video_dict.keys(): 155 | class_video_dict[vid_class] = [key] 156 | elif vid_class in class_video_dict.keys(): 157 | class_video_dict[vid_class].append(key) 158 | else: 159 | import pdb; pdb.set_trace() 160 | 161 | new_class_video_dict = class_video_dict.copy() 162 | for key, value in class_video_dict.items(): 163 | if len(value) < 10: 164 | new_class_video_dict.pop(key) 165 | 166 | print(len(new_class_video_dict.keys())) 167 | 168 | # print count of each class in dictionary, videos belong to single class only 169 | count_no_videos = 0 170 | for key, value in tqdm(new_class_video_dict.items()): 171 | print(key, len(value)) 172 | count_no_videos += len(value) 173 | print(len(new_yid_dict.keys()), count_no_videos, 'number of videos in test set and number of videos in dictionaries') 174 | 175 | # save class dictionary that only contains classes with more than 10 example videos in the test set (34 classes) 176 | 177 | with open('test_class_videoid_dict_morethan10.json', 'w') as fp: 178 | json.dump(new_class_video_dict, fp) 179 | 180 | # save class dictionary with all classes in the test set (106 classes) 181 | 182 | with open('test_class_videoid_dict_all.json', 'w') as fp: 183 | json.dump(class_video_dict, fp) 184 | 185 | ## ----------------------------------------------------------------------------- 186 | ## Filter the AudioCaps test_list.txt to remove overlap with the VGGSound training data 187 | ## ----------------------------------------------------------------------------- 188 | # 189 | #audiocaps_testfile = '/home/askoepke97/akata-shared/askoepke97/data/AR/AudioCaps/structured-symlinks/test_list.txt' 190 | #file1 = open(audiocaps_testfile, 'r') 191 | #oldtestfiles = file1.readlines() 192 | #file1.close() 193 | #newtestfiles = [] 194 | #for oldtestfile in oldtestfiles: 195 | # if not oldtestfile.split('_')[0] in overlap_test_videos: 196 | # newtestfiles.append(oldtestfile) 197 | #file1 = open('/home/askoepke97/akata-shared/askoepke97/data/AR/AudioCaps/structured-symlinks/filtered_test_list.txt', 'w') 198 | #file1.writelines(newtestfiles) 199 | #file1.close() 200 | # 201 | ## ----------------------------------------------------------------------------- 202 | 203 | #print('There are %d videos in the AudioCaps test set that are contained in the AudioSet training set.'%overlap_counter) #975 and there are only 975 videos in the AudioCaps test set, 495 in the val set, 49838 in train 204 | -------------------------------------------------------------------------------- /configs/data_loader_clotho.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "inherit_from": "configs/base_config.json", 3 | "eval_mode": "test_run", 4 | "experts": { 5 | "text_feat": "w2v", 6 | "modalities": [ 7 | "imagenet.resnext101_32x48d.0", 8 | "r2p1d.r2p1d-ig65m.0", 9 | "scene.densenet161.0", 10 | "audio" 11 | ] 12 | }, 13 | "arch": { 14 | "type": "CENet", 15 | "args": { 16 | "test_caption_mode": "indep", 17 | "use_ce": "pairwise", 18 | "use_mish": 1, 19 | "use_bn_reason": 1, 20 | "num_g_layers": 3, 21 | "num_h_layers": 0, 22 | "include_self": 1, 23 | "l2renorm": false, 24 | "randomise_feats": "", 25 | "vlad_clusters": { 26 | "text": 20, 27 | "audio": 16, 28 | "pann": 16, 29 | "speech": 5, 30 | "syncnet": 16, 31 | "vggsound": 16 32 | }, 33 | "ghost_clusters": { 34 | "text": 1, 35 | "speech": 2 36 | }, 37 | "mimic_ce_dims": 0 38 | } 39 | }, 40 | "optimizer": { 41 | "type": "Ranger", 42 | "args": { 43 | "lr": 0.01, 44 | "weight_decay": 1E-03 45 | } 46 | }, 47 | "loss": { 48 | "type": "MaxMarginRankingLoss", 49 | "args": { 50 | "margin": 0.2, 51 | "fix_norm": true 52 | } 53 | }, 54 | "data_loader": { 55 | "type": "ExpertDataLoader", 56 | "args":{ 57 | "dataset_name": "CLOTHO", 58 | "data_dir": "data/CLOTHO", 59 | "root_feat_folder": "structured-symlinks", 60 | "trn_cat": 0, 61 | "batch_size": 128, 62 | "split_name": "val", 63 | "fuse_captions": false, 64 | "num_test_captions": 1, 65 | "max_tokens": { 66 | "text": 21, 67 | "audio": 31, 68 | "pann": 29, 69 | "speech": 35, 70 | "syncnet": 29, 71 | "vggsound": 95 72 | }, 73 | "feat_aggregation": { 74 | "imagenet.senet154.0": { 75 | "fps": 25, 76 | "stride": 1, 77 | "pixel_dim": 256, 78 | "aggregate-axis": 1, 79 | "offset": 0, 80 | "temporal": "avg", 81 | "aggregate": "concat", 82 | "type": "embed", 83 | "feat_dims": { 84 | "embed": 2048, 85 | "logits": 1000 86 | } 87 | }, 88 | "imagenet.resnext101_32x48d.0": { 89 | "fps": 25, 90 | "stride": 1, 91 | "offset": 0, 92 | "pixel_dim": 256, 93 | "temporal": "avg", 94 | "aggregate": "concat", 95 | "aggregate-axis": 1, 96 | "type": "embed", 97 | "feat_dims": { 98 | "embed": 2048, 99 | "logits": 1000 100 | } 101 | }, 102 | "scene.densenet161.0": { 103 | "stride": 1, 104 | "fps": 25, 105 | "offset": 0, 106 | "temporal": "avg", 107 | "pixel_dim": 256, 108 | "aggregate": "concat", 109 | "aggregate-axis": 1, 110 | "type": "embed", 111 | "feat_dims": { 112 | "embed": 2208, 113 | "logits": 1000 114 | } 115 | }, 116 | "i3d.i3d.0": { 117 | "fps": 25, 118 | "offset": 0, 119 | "stride": 25, 120 | "inner_stride": 1, 121 | "pixel_dim": 256, 122 | "temporal": "avg", 123 | "aggregate": "concat", 124 | "aggregate-axis": 1, 125 | "type": "embed", 126 | "feat_dims": { 127 | "embed": 1024, 128 | "logits": 400 129 | } 130 | }, 131 | "r2p1d.r2p1d-ig65m.0": { 132 | "fps": 30, 133 | "offset": 0, 134 | "stride": 32, 135 | "inner_stride": 1, 136 | "pixel_dim": 256, 137 | "temporal": "avg", 138 | "aggregate": "concat", 139 | "aggregate-axis": 1, 140 | "type": "embed", 141 | "feat_dims": { 142 | "embed": 512, 143 | "logits": 359 144 | } 145 | }, 146 | "r2p1d.r2p1d-ig65m-kinetics.0": { 147 | "fps": 30, 148 | "offset": 0, 149 | "stride": 32, 150 | "inner_stride": 1, 151 | "pixel_dim": 256, 152 | "temporal": "avg", 153 | "aggregate": "concat", 154 | "aggregate-axis": 1, 155 | "type": "embed", 156 | "feat_dims": { 157 | "embed": 512, 158 | "logits": 400 159 | } 160 | }, 161 | "pann.pann.0": { 162 | "model": "pann", 163 | "flaky": false, 164 | "temporal": "vlad", 165 | "type": "embed", 166 | "binarise": false 167 | 
}, 168 | "pann": { 169 | "model": "pann", 170 | "flaky": false, 171 | "temporal": "vlad", 172 | "type": "embed", 173 | "binarise": false 174 | }, 175 | "syncnet": { 176 | "model": "syncnet", 177 | "flaky": false, 178 | "temporal": "vlad", 179 | "type": "embed", 180 | "binarise": false 181 | }, 182 | "audio.syncnet.0": { 183 | "model": "syncnet", 184 | "flaky": false, 185 | "temporal": "vlad", 186 | "type": "embed", 187 | "binarise": false 188 | }, 189 | "vggsound": { 190 | "model": "vggsound", 191 | "flaky": false, 192 | "temporal": "vlad", 193 | "type": "embed", 194 | "binarise": false 195 | }, 196 | "audio.vggsound.0": { 197 | "model": "vggsound", 198 | "flaky": false, 199 | "temporal": "vlad", 200 | "type": "embed", 201 | "binarise": false 202 | }, 203 | "speech": { 204 | "model": "w2v", 205 | "flaky": true, 206 | "temporal": "vlad", 207 | "type": "embed", 208 | "binarise": false, 209 | "feat_dims": { 210 | "embed": 300 211 | } 212 | }, 213 | "audio": { 214 | "model": "vggish", 215 | "flaky": false, 216 | "temporal": "vlad", 217 | "type": "embed", 218 | "binarise": false 219 | }, 220 | "audio.vggish.0": { 221 | "model": "vggish", 222 | "flaky": false, 223 | "temporal": "vlad", 224 | "type": "embed", 225 | "binarise": false 226 | } 227 | } 228 | } 229 | }, 230 | "trainer": { 231 | "epochs": 20 232 | }, 233 | "eval_settings": { 234 | "data_loader": { 235 | "args": { 236 | "split_name": "test", 237 | "num_test_captions": 5 238 | } 239 | }, 240 | "tester": { 241 | "save_dir": "data/saved/", 242 | "verbosity": 2 243 | }, 244 | "disable_gpu": true 245 | }, 246 | "visualizer": { 247 | "type": "Visualizer", 248 | "args":{ 249 | "src_video_dir": "data/CLOTHO/videos", 250 | "vis_vid_freq": 500, 251 | "num_samples": 100 252 | } 253 | } 254 | } 255 | -------------------------------------------------------------------------------- /configs/data_loader_audiocaps.json: -------------------------------------------------------------------------------- 1 | { 2 | "inherit_from": "configs/base_config.json", 3 | "eval_mode": "test_run", 4 | "experts": { 5 | "text_feat": "w2v", 6 | "modalities": [ 7 | "imagenet.resnext101_32x48d.0", 8 | "r2p1d.r2p1d-ig65m.0", 9 | "scene.densenet161.0", 10 | "audio" 11 | ] 12 | }, 13 | "arch": { 14 | "type": "CENet", 15 | "args": { 16 | "test_caption_mode": "indep", 17 | "use_ce": "pairwise", 18 | "use_mish": 1, 19 | "use_bn_reason": 1, 20 | "num_g_layers": 3, 21 | "num_h_layers": 0, 22 | "include_self": 1, 23 | "l2renorm": false, 24 | "randomise_feats": "", 25 | "vlad_clusters": { 26 | "text": 20, 27 | "audio": 16, 28 | "pann": 16, 29 | "syncnet": 16, 30 | "vggsound": 16, 31 | "speech": 5 32 | }, 33 | "ghost_clusters": { 34 | "text": 1 35 | }, 36 | "mimic_ce_dims": 0 37 | } 38 | }, 39 | "optimizer": { 40 | "type": "Ranger", 41 | "args": { 42 | "lr": 0.01, 43 | "weight_decay": 1E-03 44 | } 45 | }, 46 | "loss": { 47 | "type": "MaxMarginRankingLoss", 48 | "args": { 49 | "margin": 0.2, 50 | "fix_norm": true 51 | } 52 | }, 53 | "data_loader": { 54 | "type": "ExpertDataLoader", 55 | "args":{ 56 | "dataset_name": "AudioCaps", 57 | "data_dir": "data/AudioCaps", 58 | "root_feat_folder": "structured-symlinks", 59 | "trn_cat": 0, 60 | "batch_size": 128, 61 | "split_name": "val", 62 | "fuse_captions": false, 63 | "num_test_captions": 1, 64 | "max_tokens": { 65 | "text": 20, 66 | "audio": 29, 67 | "pann": 29, 68 | "syncnet": 29, 69 | "vggsound": 29, 70 | "speech": 35 71 | }, 72 | "feat_aggregation": { 73 | "imagenet.senet154.0": { 74 | "fps": 25, 75 | "stride": 1, 76 | "pixel_dim": 256, 
77 | "aggregate-axis": 1, 78 | "offset": 0, 79 | "temporal": "avg", 80 | "aggregate": "concat", 81 | "type": "embed", 82 | "feat_dims": { 83 | "embed": 2048, 84 | "logits": 1000 85 | } 86 | }, 87 | "imagenet.resnext101_32x48d.0": { 88 | "fps": 25, 89 | "stride": 1, 90 | "offset": 0, 91 | "pixel_dim": 256, 92 | "temporal": "avg", 93 | "aggregate": "concat", 94 | "aggregate-axis": 1, 95 | "type": "embed", 96 | "feat_dims": { 97 | "embed": 2048, 98 | "logits": 1000 99 | } 100 | }, 101 | "scene.densenet161.0": { 102 | "stride": 1, 103 | "fps": 25, 104 | "offset": 0, 105 | "temporal": "avg", 106 | "pixel_dim": 256, 107 | "aggregate": "concat", 108 | "aggregate-axis": 1, 109 | "type": "embed", 110 | "feat_dims": { 111 | "embed": 2208, 112 | "logits": 1000 113 | } 114 | }, 115 | "i3d.i3d.0": { 116 | "fps": 25, 117 | "offset": 0, 118 | "stride": 25, 119 | "inner_stride": 1, 120 | "pixel_dim": 256, 121 | "temporal": "avg", 122 | "aggregate": "concat", 123 | "aggregate-axis": 1, 124 | "type": "embed", 125 | "feat_dims": { 126 | "embed": 1024, 127 | "logits": 400 128 | } 129 | }, 130 | "r2p1d.r2p1d-ig65m.0": { 131 | "fps": 30, 132 | "offset": 0, 133 | "stride": 32, 134 | "inner_stride": 1, 135 | "pixel_dim": 256, 136 | "temporal": "avg", 137 | "aggregate": "concat", 138 | "aggregate-axis": 1, 139 | "type": "embed", 140 | "feat_dims": { 141 | "embed": 512, 142 | "logits": 359 143 | } 144 | }, 145 | "r2p1d.r2p1d-ig65m-kinetics.0": { 146 | "fps": 30, 147 | "offset": 0, 148 | "stride": 32, 149 | "inner_stride": 1, 150 | "pixel_dim": 256, 151 | "temporal": "avg", 152 | "aggregate": "concat", 153 | "aggregate-axis": 1, 154 | "type": "embed", 155 | "feat_dims": { 156 | "embed": 512, 157 | "logits": 400 158 | } 159 | }, 160 | "pann.pann.0": { 161 | "model": "pann", 162 | "flaky": false, 163 | "temporal": "vlad", 164 | "type": "embed", 165 | "binarise": false 166 | }, 167 | "pann": { 168 | "model": "pann", 169 | "flaky": false, 170 | "temporal": "vlad", 171 | "type": "embed", 172 | "binarise": false 173 | }, 174 | "syncnet": { 175 | "model": "syncnet", 176 | "flaky": false, 177 | "temporal": "vlad", 178 | "type": "embed", 179 | "binarise": false 180 | }, 181 | "audio.syncnet.0": { 182 | "model": "syncnet", 183 | "flaky": false, 184 | "temporal": "vlad", 185 | "type": "embed", 186 | "binarise": false 187 | }, 188 | "vggsound": { 189 | "model": "vggsound", 190 | "flaky": false, 191 | "temporal": "vlad", 192 | "type": "embed", 193 | "binarise": false 194 | }, 195 | "audio.vggsound.0": { 196 | "model": "vggsound", 197 | "flaky": false, 198 | "temporal": "vlad", 199 | "type": "embed", 200 | "binarise": false 201 | }, 202 | "speech": { 203 | "model": "w2v", 204 | "flaky": true, 205 | "temporal": "vlad", 206 | "type": "embed", 207 | "binarise": false, 208 | "feat_dims": { 209 | "embed": 300 210 | } 211 | }, 212 | "audio": { 213 | "model": "vggish", 214 | "flaky": true, 215 | "temporal": "vlad", 216 | "type": "embed", 217 | "binarise": false 218 | }, 219 | "audio.vggish.0": { 220 | "model": "vggish", 221 | "flaky": true, 222 | "temporal": "vlad", 223 | "type": "embed", 224 | "binarise": false 225 | } 226 | } 227 | } 228 | }, 229 | "metrics": [ 230 | "t2v_metrics", 231 | "v2t_metrics" 232 | ], 233 | "trainer": { 234 | "epochs": 20 235 | }, 236 | "eval_settings": { 237 | "data_loader": { 238 | "args": { 239 | "split_name": "test", 240 | "num_test_captions": 5 241 | } 242 | }, 243 | "tester": { 244 | "save_dir": "data/saved/", 245 | "verbosity": 2 246 | }, 247 | "disable_gpu": true 248 | }, 249 | "testing_file": 
"final_filtered_test_list.txt", 250 | "visualizer": { 251 | "type": "Visualizer", 252 | "args":{ 253 | "src_video_dir": "data/AudioCaps/videos", 254 | "vis_vid_freq": 500, 255 | "num_samples": 5 256 | } 257 | } 258 | } 259 | --------------------------------------------------------------------------------