├── common
│   ├── __init__.py
│   ├── transforms.py
│   ├── scheduler.py
│   ├── metric_tracking.py
│   ├── mixup.py
│   ├── utils.py
│   └── runner.py
├── models
│   ├── __init__.py
│   ├── feature_mapping.py
│   ├── base_model.py
│   └── transformerblock.py
├── datasets
│   ├── __init__.py
│   ├── data.py
│   ├── reader_fns.py
│   └── epic_kitchens.py
├── conf
│   ├── opt
│   │   ├── optimizer
│   │   │   ├── adam.yaml
│   │   │   ├── adamW.yaml
│   │   │   └── sgd.yaml
│   │   └── scheduler
│   │       ├── multi_step.yaml
│   │       └── cosine.yaml
│   ├── model
│   │   ├── backbone
│   │   │   └── identity.yaml
│   │   ├── CMFP
│   │   │   ├── cmfp_early.yaml
│   │   │   ├── scorefusion.yaml
│   │   │   └── individual.yaml
│   │   ├── mapping
│   │   │   ├── gatedlinear.yaml
│   │   │   ├── linear.yaml
│   │   │   └── nonlinear.yaml
│   │   ├── fuser
│   │   │   ├── MATT.yaml
│   │   │   ├── CA-Fuser.yaml
│   │   │   ├── SA-Fuser_wo_token.yaml
│   │   │   ├── T-SA-Fuser.yaml
│   │   │   └── SA-Fuser.yaml
│   │   ├── future_predictor
│   │   │   └── base_future_predictor.yaml
│   │   └── common.yaml
│   ├── .DS_Store
│   ├── dataset
│   │   ├── egtea
│   │   │   ├── common.yaml
│   │   │   ├── val.yaml
│   │   │   └── train.yaml
│   │   └── epic_kitchens100
│   │       ├── common.yaml
│   │       ├── train.yaml
│   │       ├── val.yaml
│   │       └── test.yaml
│   ├── data
│   │   └── default.yaml
│   └── config.yaml
├── fuser.png
├── expts
│   ├── .DS_Store
│   ├── 00_RGB_TSN_ek100_train.txt
│   ├── 00_RGB_Swin_ek100_train.txt
│   ├── 01_SA-Fuser_ek100_val_TSN_wo_audio.txt
│   ├── 01_SA-Fuser_ek100_test_TSN_wo_audio.txt
│   ├── 06_SA-Fuser_egtea_val.txt
│   ├── 01_SA-Fuser_ek100_val_TSN.txt
│   ├── 01_SA-Fuser_ek100_val_Swin.txt
│   ├── 06_SA-Fuser_egtea_train.txt
│   ├── 05_MATT_ek100_train.txt
│   ├── 04_CA-Fuser_ek100_train.txt
│   ├── 02_SA-Fuser_wo_token_ek100_train.txt
│   ├── 01_SA-Fuser_ek100_train.txt
│   └── 03_T-SA-Fuser_ek100_train.txt
├── annotations
│   ├── .DS_Store
│   ├── ek55_ori
│   │   ├── .DS_Store
│   │   ├── EPIC_test_s1_timestamps.pkl
│   │   ├── EPIC_test_s2_timestamps.pkl
│   │   ├── EPIC_train_action_labels.pkl
│   │   ├── EPIC_many_shot_verbs.csv
│   │   ├── EPIC_many_shot_nouns.csv
│   │   └── EPIC_verb_classes.csv
│   ├── ek100_ori
│   │   ├── .DS_Store
│   │   ├── EPIC_100_train.pkl
│   │   ├── EPIC_100_validation.pkl
│   │   ├── EPIC_100_test_timestamps.pkl
│   │   └── EPIC_100_verb_classes.csv
│   ├── ek55_rulstm
│   │   ├── .DS_Store
│   │   ├── EPIC_many_shot_verbs.csv
│   │   ├── validation_videos.csv
│   │   ├── EPIC_many_shot_nouns.csv
│   │   └── training_videos.csv
│   ├── ek100_rulstm
│   │   ├── .DS_Store
│   │   ├── validation_videos.csv
│   │   └── training_videos.csv
│   └── egtea
│       └── actions.csv
├── checkpoints
│   ├── fusion_egtea_tsn
│   │   └── README.md
│   ├── fusion_ek100_swin_4h_16s
│   │   └── README.md
│   ├── fusion_ek100_tsn_4h_18s
│   │   └── README.md
│   └── fusion_ek100_tsn_wo_audio_4h_18s
│       └── README.md
├── logits
│   └── README.md
├── run.py
├── .gitignore
├── environment.yml
├── test.py
├── README.md
├── LICENSE
└── tmp.py
/common/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/models/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/datasets/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/conf/opt/optimizer/adam.yaml:
--------------------------------------------------------------------------------
1 | _target_: torch.optim.Adam
--------------------------------------------------------------------------------
/conf/opt/optimizer/adamW.yaml:
--------------------------------------------------------------------------------
1 | _target_: torch.optim.AdamW
--------------------------------------------------------------------------------
/conf/model/backbone/identity.yaml:
--------------------------------------------------------------------------------
1 | _target_: torch.nn.Identity
--------------------------------------------------------------------------------
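Since the model consumes pre-extracted features, the backbone config group is just an identity layer; the expts files below pass one such config per modality (e.g. `model.common.backbones={rgb: {_target_: torch.nn.Identity}}`). A minimal sketch of how a dict of backbone configs could become modules, assuming the usual `hydra.utils.instantiate` pattern (the `nn.ModuleDict` wrapper is illustrative, not necessarily what `models.base_model` does):

    import hydra
    import torch.nn as nn
    from omegaconf import OmegaConf

    backbones_cfg = OmegaConf.create({'rgb': {'_target_': 'torch.nn.Identity'}})
    backbones = nn.ModuleDict(
        {name: hydra.utils.instantiate(c) for name, c in backbones_cfg.items()})
    print(backbones['rgb'])  # Identity()
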
/fuser.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zeyun-zhong/AFFT/HEAD/fuser.png
--------------------------------------------------------------------------------
/conf/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zeyun-zhong/AFFT/HEAD/conf/.DS_Store
--------------------------------------------------------------------------------
/expts/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zeyun-zhong/AFFT/HEAD/expts/.DS_Store
--------------------------------------------------------------------------------
/conf/opt/scheduler/multi_step.yaml:
--------------------------------------------------------------------------------
1 | _target_: torch.optim.lr_scheduler.MultiStepLR
2 |
--------------------------------------------------------------------------------
/conf/opt/optimizer/sgd.yaml:
--------------------------------------------------------------------------------
1 | _target_: torch.optim.SGD
2 | momentum: 0.9
3 | nesterov: false
--------------------------------------------------------------------------------
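These optimizer configs pin only the `_target_` class and fixed arguments; the learning rate, weight decay, and parameter list are supplied at instantiation time (`opt.lr` / `opt.wd` in config.yaml). A minimal sketch of that call, assuming the standard Hydra pattern (the exact call site in this repo may differ):

    import hydra
    import torch
    from omegaconf import OmegaConf

    # Stand-in for the composed opt/optimizer group (sgd.yaml above).
    opt_cfg = OmegaConf.create(
        {'_target_': 'torch.optim.SGD', 'momentum': 0.9, 'nesterov': False})

    model = torch.nn.Linear(8, 2)
    optimizer = hydra.utils.instantiate(
        opt_cfg, params=model.parameters(), lr=1e-3, weight_decay=1e-6)
    print(type(optimizer).__name__)  # SGD
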
/annotations/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zeyun-zhong/AFFT/HEAD/annotations/.DS_Store
--------------------------------------------------------------------------------
/conf/model/CMFP/cmfp_early.yaml:
--------------------------------------------------------------------------------
1 | _target_: models.future_prediction.CMFPEarly
2 | model_cfg: null
--------------------------------------------------------------------------------
/conf/model/CMFP/scorefusion.yaml:
--------------------------------------------------------------------------------
1 | _target_: models.future_prediction.CMFPScoreFusion
2 | model_cfg: null
--------------------------------------------------------------------------------
/conf/model/mapping/gatedlinear.yaml:
--------------------------------------------------------------------------------
1 | _target_: models.feature_mapping.GatedLinear
2 | use_layernorm: true
--------------------------------------------------------------------------------
/annotations/ek55_ori/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zeyun-zhong/AFFT/HEAD/annotations/ek55_ori/.DS_Store
--------------------------------------------------------------------------------
/conf/model/CMFP/individual.yaml:
--------------------------------------------------------------------------------
1 | _target_: models.future_prediction.IndividualFuturePrediction
2 | model_cfg: null
--------------------------------------------------------------------------------
/annotations/ek100_ori/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zeyun-zhong/AFFT/HEAD/annotations/ek100_ori/.DS_Store
--------------------------------------------------------------------------------
/annotations/ek55_rulstm/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zeyun-zhong/AFFT/HEAD/annotations/ek55_rulstm/.DS_Store
--------------------------------------------------------------------------------
/annotations/ek100_rulstm/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zeyun-zhong/AFFT/HEAD/annotations/ek100_rulstm/.DS_Store
--------------------------------------------------------------------------------
/conf/model/mapping/linear.yaml:
--------------------------------------------------------------------------------
1 | _target_: models.feature_mapping.Linear
2 | use_layernorm: false
3 | sparse_mapping: true
--------------------------------------------------------------------------------
/conf/model/mapping/nonlinear.yaml:
--------------------------------------------------------------------------------
1 | _target_: models.feature_mapping.NonLinear
2 | use_layernorm: true
3 | activation: relu
--------------------------------------------------------------------------------
/annotations/ek100_ori/EPIC_100_train.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zeyun-zhong/AFFT/HEAD/annotations/ek100_ori/EPIC_100_train.pkl
--------------------------------------------------------------------------------
/annotations/ek100_ori/EPIC_100_validation.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zeyun-zhong/AFFT/HEAD/annotations/ek100_ori/EPIC_100_validation.pkl
--------------------------------------------------------------------------------
/annotations/ek55_ori/EPIC_test_s1_timestamps.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zeyun-zhong/AFFT/HEAD/annotations/ek55_ori/EPIC_test_s1_timestamps.pkl
--------------------------------------------------------------------------------
/annotations/ek55_ori/EPIC_test_s2_timestamps.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zeyun-zhong/AFFT/HEAD/annotations/ek55_ori/EPIC_test_s2_timestamps.pkl
--------------------------------------------------------------------------------
/checkpoints/fusion_egtea_tsn/README.md:
--------------------------------------------------------------------------------
1 | Please download the corresponding checkpoint from [Model Zoo](../../README.md#model-zoo)
2 | and put it here.
--------------------------------------------------------------------------------
/conf/model/fuser/MATT.yaml:
--------------------------------------------------------------------------------
1 | _target_: models.fusion.MATT
2 | modal_dims: ${model.modal_dims}
3 | dim: ${model.common.in_features}
4 | drop_rate: 0.8
--------------------------------------------------------------------------------
/conf/opt/scheduler/cosine.yaml:
--------------------------------------------------------------------------------
1 | _target_: common.scheduler.CosineLR
2 | num_epochs: ${train.num_epochs}
3 | eta_min: 1e-6 # Min LR (default)
4 |
--------------------------------------------------------------------------------
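`${train.num_epochs}` is an OmegaConf interpolation: the value is looked up in the composed config at access time, so the scheduler length follows `train.num_epochs` unless overridden (as the expts files do with `opt.scheduler.num_epochs=30`). A self-contained illustration of the mechanism:

    from omegaconf import OmegaConf

    cfg = OmegaConf.create({
        'train': {'num_epochs': 50},
        'opt': {'scheduler': {'num_epochs': '${train.num_epochs}', 'eta_min': 1e-6}},
    })
    print(cfg.opt.scheduler.num_epochs)  # 50, resolved from train.num_epochs
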
/annotations/ek100_ori/EPIC_100_test_timestamps.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zeyun-zhong/AFFT/HEAD/annotations/ek100_ori/EPIC_100_test_timestamps.pkl
--------------------------------------------------------------------------------
/annotations/ek55_ori/EPIC_train_action_labels.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zeyun-zhong/AFFT/HEAD/annotations/ek55_ori/EPIC_train_action_labels.pkl
--------------------------------------------------------------------------------
/checkpoints/fusion_ek100_swin_4h_16s/README.md:
--------------------------------------------------------------------------------
1 | Please download the corresponding checkpoint from [Model Zoo](../../README.md#model-zoo)
2 | and put it here.
--------------------------------------------------------------------------------
/checkpoints/fusion_ek100_tsn_4h_18s/README.md:
--------------------------------------------------------------------------------
1 | Please download the corresponding checkpoint from [Model Zoo](../../README.md#model-zoo)
2 | and put it here.
--------------------------------------------------------------------------------
/checkpoints/fusion_ek100_tsn_wo_audio_4h_18s/README.md:
--------------------------------------------------------------------------------
1 | Please download the corresponding checkpoint from [Model Zoo](../../README.md#model-zoo)
2 | and put it here.
--------------------------------------------------------------------------------
/conf/model/fuser/CA-Fuser.yaml:
--------------------------------------------------------------------------------
1 | _target_: models.fusion.TemporalCrossAttentFuser
2 | dim: ${model.common.in_features}
3 | modalities: ${model.modal_dims}
4 | num_heads: 4
5 | embd_drop_rate: 0.1
6 | drop_rate: 0.1
7 | attn_drop_rate: 0.1
8 | drop_path_rate: 0.1
9 |
--------------------------------------------------------------------------------
/conf/model/fuser/SA-Fuser_wo_token.yaml:
--------------------------------------------------------------------------------
1 | _target_: models.fusion.CMFuser
2 | dim: ${model.common.in_features}
3 | depth: 6
4 | num_heads: 4
5 | embd_drop_rate: 0.1
6 | drop_rate: 0.1
7 | attn_drop_rate: 0.1
8 | drop_path_rate: 0.1
9 | cross_attn: false
10 |
--------------------------------------------------------------------------------
/logits/README.md:
--------------------------------------------------------------------------------
1 | Logits generated by running [test.py](../test.py) and
2 | the EK100 submission file generated by [challenge.py](../challenge.py) are saved here.
3 |
4 | To save them elsewhere, change the `LOGITS_DIR` variable in [challenge.py](../challenge.py).
--------------------------------------------------------------------------------
/conf/model/fuser/T-SA-Fuser.yaml:
--------------------------------------------------------------------------------
1 | _target_: models.fusion.TemporalCMFuser
2 | dim: ${model.common.in_features}
3 | depth: 6
4 | num_heads: 4
5 | embd_drop_rate: 0.1
6 | drop_rate: 0.1
7 | attn_drop_rate: 0.1
8 | drop_path_rate: 0.1
9 | modalities: ${model.modal_dims}
10 | modal_encoding: true
11 | frame_level_token: false
12 | temporal_sequence_length: null
--------------------------------------------------------------------------------
/annotations/ek55_ori/EPIC_many_shot_verbs.csv:
--------------------------------------------------------------------------------
1 | verb_class,verb
2 | 1,put
3 | 0,take
4 | 4,wash
5 | 2,open
6 | 3,close
7 | 5,cut
8 | 6,mix
9 | 7,pour
10 | 9,move
11 | 12,turn-on
12 | 10,remove
13 | 15,turn-off
14 | 8,throw
15 | 11,dry
16 | 16,peel
17 | 22,insert
18 | 13,turn
19 | 14,shake
20 | 21,squeeze
21 | 23,press
22 | 20,check
23 | 19,scoop
24 | 18,empty
25 | 17,adjust
26 | 24,fill
27 | 32,flip
28 |
--------------------------------------------------------------------------------
/annotations/ek55_rulstm/EPIC_many_shot_verbs.csv:
--------------------------------------------------------------------------------
1 | verb_class,verb
2 | 1,put
3 | 0,take
4 | 4,wash
5 | 2,open
6 | 3,close
7 | 5,cut
8 | 6,mix
9 | 7,pour
10 | 9,move
11 | 12,turn-on
12 | 10,remove
13 | 15,turn-off
14 | 8,throw
15 | 11,dry
16 | 16,peel
17 | 22,insert
18 | 13,turn
19 | 14,shake
20 | 21,squeeze
21 | 23,press
22 | 20,check
23 | 19,scoop
24 | 18,empty
25 | 17,adjust
26 | 24,fill
27 | 32,flip
28 |
--------------------------------------------------------------------------------
/conf/model/fuser/SA-Fuser.yaml:
--------------------------------------------------------------------------------
1 | _target_: models.fusion.ModalTokenCMFuser
2 | dim: ${model.common.in_features}
3 | depth: 6
4 | num_heads: 4
5 | embd_drop_rate: 0.1
6 | drop_rate: 0.1
7 | attn_drop_rate: 0.1
8 | drop_path_rate: 0.1
9 | cross_attn: false
10 | norm_elementwise: true
11 | modalities: ${model.modal_dims}
12 | modal_encoding: false
13 | frame_level_token: false
14 | temporal_sequence_length: null
--------------------------------------------------------------------------------
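For orientation: these fields describe a fuser that stacks the per-modality features into a token sequence, prepends a learned fusion token, and mixes them with plain self-attention, reading the fused feature off the token. A minimal sketch of that pattern, illustrative only (the real `models.fusion.ModalTokenCMFuser` additionally supports modal encodings, frame-level tokens, drop-path, etc. per the config above):

    import torch
    import torch.nn as nn

    class TinyTokenFuser(nn.Module):
        """Self-attention fusion over modality tokens with a learned fusion token."""
        def __init__(self, dim=1024, depth=6, num_heads=4, drop_rate=0.1):
            super().__init__()
            self.fusion_token = nn.Parameter(torch.zeros(1, 1, dim))
            layer = nn.TransformerEncoderLayer(
                d_model=dim, nhead=num_heads, dropout=drop_rate, batch_first=True)
            self.encoder = nn.TransformerEncoder(layer, num_layers=depth)

        def forward(self, feats):               # feats: (B, num_modalities, dim)
            tok = self.fusion_token.expand(feats.size(0), -1, -1)
            x = torch.cat([tok, feats], dim=1)  # prepend the fusion token
            x = self.encoder(x)                 # self-attention across modalities
            return x[:, 0]                      # fused feature at the token position

    fused = TinyTokenFuser()(torch.randn(2, 4, 1024))  # e.g. rgb/objects/audio/flow
    print(fused.shape)  # torch.Size([2, 1024])
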
/conf/dataset/egtea/common.yaml:
--------------------------------------------------------------------------------
1 | # @package dataset.egtea.common
2 |
3 | version: -1
4 | # RULSTM feats dirs
5 | rulstm_feats_dir: ${dataset_root_dir}/egtea/features
6 | annot_dir: ${cwd}/annotations/egtea/
7 | rulstm_annot_dir: ${cwd}/annotations/egtea/
8 | label_type: action
9 | sample_strategy: "last_clip"
10 | tau_a: 0.5
11 | tau_o: 10
12 | split: 1
13 | compute_dataset_stats: false
14 | reader_fn: null
15 | max_els: null
--------------------------------------------------------------------------------
/conf/model/future_predictor/base_future_predictor.yaml:
--------------------------------------------------------------------------------
1 | _target_: models.future_prediction.BaseFuturePredictor
2 | in_features: ${model.common.in_features}
3 | inter_dim: ${model.common.fp_inter_dim}
4 | n_layer: ${model.common.fp_layers}
5 | n_head: ${model.common.fp_heads}
6 | output_attentions: ${model.common.fp_output_attentions}
7 | embd_pdrop: ${model.common.embd_pdrop}
8 | resid_pdrop: ${model.common.resid_pdrop}
9 | attn_pdrop: ${model.common.attn_pdrop}
--------------------------------------------------------------------------------
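The field names here (`n_layer`, `n_head`, `embd_pdrop`, `resid_pdrop`, `attn_pdrop`) follow the GPT-2 convention: a causal transformer runs over the sequence of frame features, and the hidden state at the last step is used to anticipate the future. A rough sketch of that idea, not the actual `models.future_prediction.BaseFuturePredictor`:

    import torch
    import torch.nn as nn

    class TinyFuturePredictor(nn.Module):
        def __init__(self, in_features=1024, inter_dim=2048, n_layer=6, n_head=4, pdrop=0.1):
            super().__init__()
            layer = nn.TransformerEncoderLayer(
                d_model=in_features, nhead=n_head, dim_feedforward=inter_dim,
                dropout=pdrop, batch_first=True)
            self.encoder = nn.TransformerEncoder(layer, num_layers=n_layer)

        def forward(self, x):                    # x: (B, T, in_features)
            T = x.size(1)
            causal = torch.triu(torch.full((T, T), float('-inf')), diagonal=1)
            h = self.encoder(x, mask=causal)     # each step attends only to the past
            return h[:, -1]                      # feature used to predict the next step

    out = TinyFuturePredictor()(torch.randn(2, 10, 1024))
    print(out.shape)  # torch.Size([2, 1024])
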
/conf/dataset/epic_kitchens100/common.yaml:
--------------------------------------------------------------------------------
1 | # @package dataset.epic_kitchens100.common
2 |
3 | version: 0.2
4 | # RULSTM feats dirs
5 | rulstm_feats_dir: ${dataset_root_dir}/epickitchens100/features
6 | annot_dir: ${cwd}/annotations/ek100_ori/
7 | rulstm_annot_dir: ${cwd}/annotations/ek100_rulstm/
8 | label_type: action
9 | sample_strategy: "last_clip"
10 | tau_a: 1
11 | tau_o: 10
12 | compute_dataset_stats: false
13 | reader_fn: null
14 | max_els: null
15 |
--------------------------------------------------------------------------------
/annotations/ek55_rulstm/validation_videos.csv:
--------------------------------------------------------------------------------
1 | P01_01
2 | P01_10
3 | P02_03
4 | P02_05
5 | P03_06
6 | P03_11
7 | P04_09
8 | P06_05
9 | P07_02
10 | P07_08
11 | P07_10
12 | P08_01
13 | P08_05
14 | P08_12
15 | P10_01
16 | P13_04
17 | P13_06
18 | P13_09
19 | P14_01
20 | P14_02
21 | P20_03
22 | P20_04
23 | P22_08
24 | P22_10
25 | P22_11
26 | P22_13
27 | P23_03
28 | P24_08
29 | P25_11
30 | P26_02
31 | P26_11
32 | P26_16
33 | P27_03
34 | P28_05
35 | P28_12
36 | P28_13
37 | P30_01
38 | P30_03
39 | P31_01
40 | P31_08
41 |
--------------------------------------------------------------------------------
/conf/data/default.yaml:
--------------------------------------------------------------------------------
1 |
2 | # The next few options are passed into the dataset object and control how clips are loaded
3 | num_frames: 10
4 | frame_rate: 1
5 | frame_subclips:
6 | num_frames: 1
7 | stride: 1
8 | sec_subclips: # allows the label sequence to differ from the frame sequence
9 | num_frames: 1
10 | stride: 1
11 |
12 | # Load segmentation labels only if a classifier on the past is being applied
13 | load_seg_labels: true
14 |
15 | # Augmentation for RULSTM Feats, only for training
16 | zero_mask_rate: 0.0
17 |
--------------------------------------------------------------------------------
/conf/model/common.yaml:
--------------------------------------------------------------------------------
1 | # @package model.common
2 |
3 | in_features: ${model.common_dim}
4 |
5 | # boolean options controlling future predictor and classifier
6 | share_classifiers: true # whether a common classifier should be used
7 | share_predictors: false # whether a common future predictor should be used
8 | modality_cls: false # whether to apply modality-wise classification
9 | fusion_cls: true # whether the fused features should be classified
10 |
11 | # backbones (identity layer for feature vectors)
12 | backbones: null
13 |
14 | # for base future predictor
15 | fp_output_len: 1
16 | fp_inter_dim: 2048
17 | fp_layers: 6
18 | fp_heads: 4
19 | fp_output_attentions: false
20 | embd_pdrop: 0.1
21 | resid_pdrop: 0.1
22 | attn_pdrop: 0.1
23 |
--------------------------------------------------------------------------------
/common/transforms.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import random
3 |
4 |
5 | class PermuteRULSTMFeats:
6 |     """Move channels first, e.g. (T, H, W, C) -> (C, T, H, W)."""
7 |
8 |     def __call__(self, vid):
9 |         return vid.permute(3, 0, 1, 2)
10 |
11 |
12 | class ZeroMaskRULSTMFeats:
13 |     """Mask a random subset of frames with zeros (in place)."""
14 |     def __init__(self, mask_rate=0.2):
15 |         self.mask_rate = mask_rate
16 |
17 |     def __call__(self, vid):
18 |         if self.mask_rate == 0:
19 |             return vid
20 |         num_frames = vid.size(0)
21 |         num_masked_frames = round(num_frames * self.mask_rate)
22 |         random_choices = random.sample(range(num_frames), num_masked_frames)
23 |         vid[random_choices, :, :, :] = torch.zeros((num_masked_frames, vid.size(1), vid.size(2), vid.size(-1)))
24 |         return vid
25 |
--------------------------------------------------------------------------------
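Continuing from transforms.py above, a quick usage sketch of the masking transform (the feature shape is illustrative; `zero_mask_rate` in conf/data/default.yaml presumably drives `mask_rate`):

    feats = torch.ones(10, 1, 1, 1024)  # 10 frames of 1024-d RULSTM features
    # Clone first: the transform writes zeros into the tensor in place.
    masked = ZeroMaskRULSTMFeats(mask_rate=0.2)(feats.clone())
    print((masked.sum(dim=(1, 2, 3)) == 0).sum().item())  # 2 of the 10 frames zeroed
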
/datasets/data.py:
--------------------------------------------------------------------------------
1 | import hydra
2 |
3 |
4 | def get_dataset(dataset_cfg, data_cfg, transforms, logger):
5 | kwargs = {}
6 | kwargs['transforms'] = transforms
7 | kwargs['frame_rate'] = data_cfg.frame_rate
8 | kwargs['frames_per_clip'] = data_cfg.num_frames
9 |     # Convert to dict() here; relative interpolation no longer works once the subclips node is pulled out of the config
10 | kwargs['frame_subclips_options'] = dict(data_cfg.frame_subclips)
11 | kwargs['sec_subclips_options'] = dict(data_cfg.sec_subclips)
12 | kwargs['load_seg_labels'] = data_cfg.load_seg_labels
13 | logger.info('Creating the dataset object...')
14 | # Not recursive since many of the sub-instantiations would need positional arguments
15 | _dataset = hydra.utils.instantiate(dataset_cfg, _recursive_=False, **kwargs)
16 | logger.info(f'Created dataset with {len(_dataset)} elts')
17 | return _dataset
18 |
--------------------------------------------------------------------------------
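Note the `_recursive_=False`: nested `_target_` nodes in the dataset configs (`conv_to_anticipate_fn`, `reader_fn`, ...) are handed to the dataset class as plain config nodes, so it can instantiate them itself with positional arguments. A toy illustration of the flag:

    import hydra
    from omegaconf import OmegaConf

    cfg = OmegaConf.create({
        '_target_': 'collections.OrderedDict',
        'inner': {'_target_': 'collections.Counter'},
    })
    obj = hydra.utils.instantiate(cfg, _recursive_=False)
    print(type(obj['inner']))  # DictConfig -- left for the outer object to build later
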
/conf/dataset/egtea/val.yaml:
--------------------------------------------------------------------------------
1 | # @package dataset.egtea.val
2 |
3 | _target_: datasets.epic_kitchens.EPICKitchens
4 | version: ${dataset.egtea.common.version}
5 | annotation_path:
6 | - ${dataset.egtea.common.annot_dir}/validation${dataset.egtea.common.split}.csv
7 | annotation_dir: ${dataset.egtea.common.annot_dir}
8 | rulstm_annotation_dir: ${dataset.egtea.common.rulstm_annot_dir} # Needed when computing final outputs, to get tail classes etc.
9 | label_type: ${dataset.egtea.common.label_type}
10 | sample_strategy: ${dataset.egtea.common.sample_strategy}
11 | action_labels_fpath: ${dataset.egtea.common.rulstm_annot_dir}/actions.csv
12 | compute_dataset_stats: ${dataset.egtea.common.compute_dataset_stats}
13 | conv_to_anticipate_fn:
14 | _target_: datasets.base_video_dataset.convert_to_anticipation
15 | tau_a: ${dataset.egtea.common.tau_a}
16 | tau_o: ${dataset.egtea.common.tau_o}
17 | drop_style: correct
18 | reader_fn: ${dataset.egtea.common.reader_fn}
19 | max_els: ${dataset.egtea.common.max_els}
--------------------------------------------------------------------------------
/conf/dataset/egtea/train.yaml:
--------------------------------------------------------------------------------
1 | # @package dataset.egtea.train
2 |
3 | _target_: datasets.epic_kitchens.EPICKitchens
4 | version: ${dataset.egtea.common.version}
5 | annotation_path:
6 | - ${dataset.egtea.common.annot_dir}/training${dataset.egtea.common.split}.csv
7 | annotation_dir: ${dataset.egtea.common.annot_dir}
8 | rulstm_annotation_dir: ${dataset.egtea.common.rulstm_annot_dir} # Needed when computing final outputs, to get tail classes etc.
9 | label_type: ${dataset.egtea.common.label_type}
10 | sample_strategy: ${dataset.egtea.common.sample_strategy}
11 | action_labels_fpath: ${dataset.egtea.common.rulstm_annot_dir}/actions.csv
12 | compute_dataset_stats: ${dataset.egtea.common.compute_dataset_stats}
13 | conv_to_anticipate_fn:
14 | _target_: datasets.base_video_dataset.convert_to_anticipation
15 | tau_a: ${dataset.egtea.common.tau_a}
16 | tau_o: ${dataset.egtea.common.tau_o}
17 | drop_style: correct
18 | reader_fn: ${dataset.egtea.common.reader_fn}
19 | max_els: ${dataset.egtea.common.max_els}
--------------------------------------------------------------------------------
/annotations/ek55_ori/EPIC_many_shot_nouns.csv:
--------------------------------------------------------------------------------
1 | noun_class,noun
2 | 3,tap
3 | 4,plate
4 | 8,cupboard
5 | 1,pan
6 | 7,spoon
7 | 5,knife
8 | 9,drawer
9 | 10,fridge
10 | 6,bowl
11 | 12,hand
12 | 11,lid
13 | 13,onion
14 | 16,glass
15 | 23,cup
16 | 17,water
17 | 19,board:chopping
18 | 21,sponge
19 | 18,fork
20 | 32,cloth
21 | 20,bag
22 | 28,bottle
23 | 15,pot
24 | 22,spatula
25 | 39,box
26 | 26,meat
27 | 24,oil
28 | 30,tomato
29 | 31,salt
30 | 29,container
31 | 27,potato
32 | 77,package
33 | 37,food
34 | 47,hob
35 | 35,pasta
36 | 78,top
37 | 40,carrot
38 | 45,garlic
39 | 68,skin
40 | 44,rice
41 | 25,bin
42 | 38,kettle
43 | 46,pepper
44 | 33,sink
45 | 51,cheese
46 | 56,oven
47 | 70,liquid:washing
48 | 58,coffee
49 | 52,bread
50 | 108,rubbish
51 | 67,peach
52 | 42,colander
53 | 41,sauce
54 | 54,salad
55 | 126,maker:coffee
56 | 60,jar
57 | 84,sausage
58 | 75,cutlery
59 | 43,milk
60 | 62,chicken
61 | 50,egg
62 | 59,filter
63 | 55,microwave
64 | 49,dishwasher
65 | 87,can
66 | 48,dough
67 | 63,tray
68 | 72,leaf
69 | 105,jug
70 | 106,heat
71 | 79,spice
72 | 111,stock
73 |
--------------------------------------------------------------------------------
/annotations/ek55_rulstm/EPIC_many_shot_nouns.csv:
--------------------------------------------------------------------------------
1 | noun_class,noun
2 | 3,tap
3 | 4,plate
4 | 8,cupboard
5 | 1,pan
6 | 7,spoon
7 | 5,knife
8 | 9,drawer
9 | 10,fridge
10 | 6,bowl
11 | 12,hand
12 | 11,lid
13 | 13,onion
14 | 16,glass
15 | 23,cup
16 | 17,water
17 | 19,board:chopping
18 | 21,sponge
19 | 18,fork
20 | 32,cloth
21 | 20,bag
22 | 28,bottle
23 | 15,pot
24 | 22,spatula
25 | 39,box
26 | 26,meat
27 | 24,oil
28 | 30,tomato
29 | 31,salt
30 | 29,container
31 | 27,potato
32 | 77,package
33 | 37,food
34 | 47,hob
35 | 35,pasta
36 | 78,top
37 | 40,carrot
38 | 45,garlic
39 | 68,skin
40 | 44,rice
41 | 25,bin
42 | 38,kettle
43 | 46,pepper
44 | 33,sink
45 | 51,cheese
46 | 56,oven
47 | 70,liquid:washing
48 | 58,coffee
49 | 52,bread
50 | 108,rubbish
51 | 67,peach
52 | 42,colander
53 | 41,sauce
54 | 54,salad
55 | 126,maker:coffee
56 | 60,jar
57 | 84,sausage
58 | 75,cutlery
59 | 43,milk
60 | 62,chicken
61 | 50,egg
62 | 59,filter
63 | 55,microwave
64 | 49,dishwasher
65 | 87,can
66 | 48,dough
67 | 63,tray
68 | 72,leaf
69 | 105,jug
70 | 106,heat
71 | 79,spice
72 | 111,stock
73 |
--------------------------------------------------------------------------------
/conf/dataset/epic_kitchens100/train.yaml:
--------------------------------------------------------------------------------
1 | # @package dataset.epic_kitchens100.train
2 |
3 | _target_: datasets.epic_kitchens.EPICKitchens
4 | version: ${dataset.epic_kitchens100.common.version}
5 | annotation_path:
6 | - ${dataset.epic_kitchens100.common.annot_dir}/EPIC_100_train.pkl
7 | annotation_dir: ${dataset.epic_kitchens100.common.annot_dir}
8 | rulstm_annotation_dir: ${dataset.epic_kitchens100.common.rulstm_annot_dir} # Needed when computing final outputs, to get tail classes etc.
9 | label_type: ${dataset.epic_kitchens100.common.label_type}
10 | sample_strategy: ${dataset.epic_kitchens100.common.sample_strategy}
11 | action_labels_fpath: ${dataset.epic_kitchens100.common.rulstm_annot_dir}/actions.csv
12 | compute_dataset_stats: ${dataset.epic_kitchens100.common.compute_dataset_stats}
13 | conv_to_anticipate_fn:
14 | _target_: datasets.base_video_dataset.convert_to_anticipation
15 | tau_a: ${dataset.epic_kitchens100.common.tau_a}
16 | tau_o: ${dataset.epic_kitchens100.common.tau_o}
17 | drop_style: correct
18 | reader_fn: ${dataset.epic_kitchens100.common.reader_fn}
19 | max_els: ${dataset.epic_kitchens100.common.max_els}
--------------------------------------------------------------------------------
/conf/dataset/epic_kitchens100/val.yaml:
--------------------------------------------------------------------------------
1 | # @package dataset.epic_kitchens100.val
2 |
3 | _target_: datasets.epic_kitchens.EPICKitchens
4 | version: ${dataset.epic_kitchens100.common.version}
5 | annotation_path:
6 | - ${dataset.epic_kitchens100.common.annot_dir}/EPIC_100_validation.pkl
7 | annotation_dir: ${dataset.epic_kitchens100.common.annot_dir}
8 | rulstm_annotation_dir: ${dataset.epic_kitchens100.common.rulstm_annot_dir} # Needed when computing final outputs, to get tail classes etc.
9 | label_type: ${dataset.epic_kitchens100.common.label_type}
10 | sample_strategy: ${dataset.epic_kitchens100.common.sample_strategy}
11 | action_labels_fpath: ${dataset.epic_kitchens100.common.rulstm_annot_dir}/actions.csv
12 | compute_dataset_stats: ${dataset.epic_kitchens100.common.compute_dataset_stats}
13 | conv_to_anticipate_fn:
14 | _target_: datasets.base_video_dataset.convert_to_anticipation
15 | tau_a: ${dataset.epic_kitchens100.common.tau_a}
16 | tau_o: ${dataset.epic_kitchens100.common.tau_o}
17 | drop_style: correct
18 | reader_fn: ${dataset.epic_kitchens100.common.reader_fn}
19 | max_els: ${dataset.epic_kitchens100.common.max_els}
--------------------------------------------------------------------------------
/conf/dataset/epic_kitchens100/test.yaml:
--------------------------------------------------------------------------------
1 | # @package dataset.epic_kitchens100.test
2 |
3 | _target_: datasets.epic_kitchens.EPICKitchens
4 | version: ${dataset.epic_kitchens100.common.version}
5 | annotation_path:
6 | - ${dataset.epic_kitchens100.common.annot_dir}/EPIC_100_test_timestamps.pkl
7 | annotation_dir: ${dataset.epic_kitchens100.common.annot_dir}
8 | rulstm_annotation_dir: ${dataset.epic_kitchens100.common.rulstm_annot_dir} # Needed when computing final outputs, to get tail classes etc.
9 | label_type: ${dataset.epic_kitchens100.common.label_type}
10 | sample_strategy: ${dataset.epic_kitchens100.common.sample_strategy}
11 | action_labels_fpath: ${dataset.epic_kitchens100.common.rulstm_annot_dir}/actions.csv
12 | compute_dataset_stats: ${dataset.epic_kitchens100.common.compute_dataset_stats}
13 | conv_to_anticipate_fn:
14 | _target_: datasets.base_video_dataset.convert_to_anticipation
15 | tau_a: ${dataset.epic_kitchens100.common.tau_a}
16 | tau_o: ${dataset.epic_kitchens100.common.tau_o}
17 | drop_style: correct
18 | reader_fn: ${dataset.epic_kitchens100.common.reader_fn}
19 | max_els: ${dataset.epic_kitchens100.common.max_els}
--------------------------------------------------------------------------------
/run.py:
--------------------------------------------------------------------------------
1 | import subprocess
2 | import argparse
3 |
4 |
5 | def parse_args():
6 | """Parse arguments"""
7 | parser = argparse.ArgumentParser()
8 | parser.add_argument('-c', '--cfg', type=str, required=True,
9 |                         help='File of config overrides (under expts/)')
10 | parser.add_argument('-m', '--mode', type=str, required=True, choices=['train', 'test', 'visualize_attention'],
11 | help='Choose which file to run')
12 |     parser.add_argument('-n', '--nproc_per_node', type=int, required=True,
13 |                         help='Number of GPUs per node')
14 | args = parser.parse_args()
15 | return args
16 |
17 |
18 | def read_file_into_cli(fpath):
19 | """Read cli from file into a string."""
20 | res = []
21 | with open(fpath, 'r') as fin:
22 | for line in fin:
23 | args = line.split('#')[0].strip()
24 | if len(args) == 0:
25 | continue
26 | res.append(args)
27 | return res
28 |
29 |
30 | def escape_str(input_str):
31 | return f"'{input_str}'"
32 |
33 |
34 | def construct_cmd(args):
35 |     # --cfg is required, so it is always set; just check its location
36 |     assert args.cfg.startswith("expts"), "Config path must be relative to this directory (under expts/)"
37 |
38 | cli_stuff = read_file_into_cli(args.cfg)
39 | cli_stuff = [escape_str(el) for el in cli_stuff]
40 | cli_stuff = ' '.join(cli_stuff)
41 |
42 | cli = (f'HYDRA_FULL_ERROR=1 torchrun --nproc_per_node={args.nproc_per_node} {args.mode}.py ')
43 | cli += cli_stuff
44 | return cli
45 |
46 |
47 | def main():
48 | args = parse_args()
49 | cmd = construct_cmd(args)
50 | print('>> Running "{}"'.format(cmd))
51 | subprocess.call(cmd, shell=True)
52 |
53 |
54 | if __name__ == "__main__":
55 | main()
--------------------------------------------------------------------------------
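A typical invocation, for reference: `python run.py -c expts/01_SA-Fuser_ek100_train.txt -m train -n 2`. This strips comments from the expts file, single-quotes each remaining line, and shells out to `HYDRA_FULL_ERROR=1 torchrun --nproc_per_node=2 train.py '<override>' ...`, so every line of an expts file is one Hydra override.
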
/annotations/ek100_rulstm/validation_videos.csv:
--------------------------------------------------------------------------------
1 | P01_11
2 | P01_12
3 | P01_13
4 | P01_14
5 | P01_15
6 | P02_12
7 | P02_13
8 | P02_14
9 | P02_15
10 | P03_21
11 | P03_22
12 | P03_23
13 | P03_24
14 | P03_25
15 | P03_26
16 | P04_24
17 | P04_25
18 | P04_26
19 | P04_27
20 | P04_28
21 | P04_29
22 | P04_30
23 | P04_31
24 | P04_32
25 | P04_33
26 | P05_07
27 | P05_09
28 | P06_10
29 | P06_11
30 | P06_12
31 | P06_13
32 | P06_14
33 | P07_12
34 | P07_13
35 | P07_14
36 | P07_15
37 | P07_16
38 | P07_17
39 | P07_18
40 | P08_09
41 | P08_10
42 | P08_14
43 | P08_15
44 | P08_16
45 | P08_17
46 | P09_07
47 | P09_08
48 | P10_03
49 | P11_17
50 | P11_18
51 | P11_19
52 | P11_20
53 | P11_21
54 | P11_22
55 | P11_23
56 | P11_24
57 | P12_03
58 | P12_08
59 | P13_01
60 | P13_02
61 | P13_03
62 | P14_06
63 | P14_08
64 | P15_04
65 | P15_05
66 | P15_06
67 | P16_04
68 | P17_02
69 | P18_01
70 | P18_02
71 | P18_03
72 | P18_04
73 | P18_05
74 | P18_06
75 | P18_07
76 | P18_08
77 | P18_09
78 | P18_10
79 | P18_11
80 | P18_12
81 | P19_05
82 | P19_06
83 | P20_05
84 | P20_06
85 | P20_07
86 | P21_02
87 | P22_01
88 | P22_02
89 | P22_03
90 | P22_04
91 | P23_05
92 | P24_09
93 | P25_06
94 | P25_07
95 | P25_08
96 | P26_30
97 | P26_31
98 | P26_32
99 | P26_33
100 | P26_34
101 | P26_35
102 | P26_36
103 | P26_37
104 | P26_38
105 | P26_39
106 | P26_40
107 | P26_41
108 | P27_05
109 | P28_15
110 | P28_16
111 | P28_17
112 | P28_18
113 | P28_19
114 | P28_20
115 | P28_21
116 | P28_22
117 | P28_23
118 | P28_24
119 | P28_25
120 | P28_26
121 | P29_05
122 | P29_06
123 | P30_07
124 | P30_08
125 | P30_09
126 | P31_10
127 | P31_11
128 | P31_12
129 | P32_01
130 | P32_02
131 | P32_03
132 | P32_04
133 | P32_05
134 | P32_06
135 | P32_07
136 | P32_08
137 | P32_09
138 | P32_10
139 |
--------------------------------------------------------------------------------
/expts/00_RGB_TSN_ek100_train.txt:
--------------------------------------------------------------------------------
1 | workers=32
2 | num_gpus=2
3 | experiment_name=TSN_fp6l4h2048_bs32_lr0.001_mixupbackbone-0.1
4 | init_from_model=null
5 | primary_metric=val_mt5r_action_rgb
6 |
7 | train.batch_size=16
8 | eval.batch_size=16
9 | train.num_epochs=50
10 | train.use_mixup=true
11 | train.mixup_backbone=true
12 | train.mixup_alpha=0.1
13 |
14 | model.modal_dims={rgb:1024}
15 | model.common_dim=1024
16 | model.dropout=0.2
17 | model.common.backbones={rgb: {_target_: torch.nn.Identity}}
18 | model/future_predictor=base_future_predictor
19 | model/CMFP=individual
20 |
21 | model.common.share_classifiers=false
22 | model.common.share_predictors=false
23 | model.common.modality_cls=true
24 | model.common.fusion_cls=false
25 |
26 | model.common.fp_output_len=1
27 | model.common.fp_inter_dim=2048
28 | model.common.fp_layers=6
29 | model.common.fp_heads=4
30 | model.common.fp_output_attentions=false
31 | model.common.embd_pdrop=0.1
32 | model.common.resid_pdrop=0.1
33 | model.common.attn_pdrop=0.1
34 |
35 | opt.lr=0.001
36 | opt.wd=0.000001
37 | opt/optimizer=sgd
38 | opt/scheduler=cosine
39 | opt.optimizer.nesterov=true
40 | opt.warmup.num_epochs=20
41 | opt.scheduler.num_epochs=30
42 | opt.scheduler.eta_min=1e-6
43 |
44 | data_train.zero_mask_rate=0.
45 |
46 | dataset@dataset_train=epic_kitchens100/train
47 | dataset@dataset_eval=epic_kitchens100/val
48 | dataset.epic_kitchens100.common.label_type=action
49 | dataset.epic_kitchens100.common.sample_strategy=last_clip
50 | dataset.epic_kitchens100.common.tau_a=1
51 | dataset.epic_kitchens100.common.tau_o=10
52 | dataset.epic_kitchens100.common.compute_dataset_stats=true
53 | dataset.epic_kitchens100.common.max_els=null
54 |
55 | dataset.epic_kitchens100.common.reader_fn={rgb: {_target_: datasets.reader_fns.EpicRULSTMFeatsReader, lmdb_path: ${dataset.epic_kitchens100.common.rulstm_feats_dir}/rgb/}}
--------------------------------------------------------------------------------
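Two kinds of overrides appear in these expts files: `group=option` lines such as `model/CMFP=individual` swap which yaml from a config group is composed, while dotted paths such as `model.common.fp_layers=6` set individual values afterwards (dict-valued overrides like `model.modal_dims={rgb:1024}` are parsed by Hydra's override grammar). The dotted-path part can be mimicked standalone:

    from omegaconf import OmegaConf

    # Only plain dotted paths here; group swaps and {...} values need Hydra itself.
    cfg = OmegaConf.from_dotlist(['model.common_dim=1024', 'model.dropout=0.2'])
    print(cfg.model.common_dim)  # 1024
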
/expts/00_RGB_Swin_ek100_train.txt:
--------------------------------------------------------------------------------
1 | workers=32
2 | num_gpus=2
3 | experiment_name=Swin_fp6l4h2048_bs32_lr0.001_mixupbackbone-0.1
4 | init_from_model=null
5 | primary_metric=val_mt5r_action_rgb
6 |
7 | train.batch_size=16
8 | eval.batch_size=16
9 | train.num_epochs=50
10 | train.use_mixup=true
11 | train.mixup_backbone=true
12 | train.mixup_alpha=0.1
13 |
14 | model.modal_dims={rgb:1024}
15 | model.common_dim=1024
16 | model.dropout=0.2
17 | model.common.backbones={rgb: {_target_: torch.nn.Identity}}
18 | model/future_predictor=base_future_predictor
19 | model/CMFP=individual
20 |
21 | model.common.share_classifiers=false
22 | model.common.share_predictors=false
23 | model.common.modality_cls=true
24 | model.common.fusion_cls=false
25 |
26 | model.common.fp_output_len=1
27 | model.common.fp_inter_dim=2048
28 | model.common.fp_layers=6
29 | model.common.fp_heads=4
30 | model.common.fp_output_attentions=false
31 | model.common.embd_pdrop=0.1
32 | model.common.resid_pdrop=0.1
33 | model.common.attn_pdrop=0.1
34 |
35 | opt.lr=0.001
36 | opt.wd=0.000001
37 | opt/optimizer=sgd
38 | opt/scheduler=cosine
39 | opt.optimizer.nesterov=true
40 | opt.warmup.num_epochs=20
41 | opt.scheduler.num_epochs=30
42 | opt.scheduler.eta_min=1e-6
43 |
44 | data_train.zero_mask_rate=0.
45 |
46 | dataset@dataset_train=epic_kitchens100/train
47 | dataset@dataset_eval=epic_kitchens100/val
48 | dataset.epic_kitchens100.common.label_type=action
49 | dataset.epic_kitchens100.common.sample_strategy=last_clip
50 | dataset.epic_kitchens100.common.tau_a=1
51 | dataset.epic_kitchens100.common.tau_o=10
52 | dataset.epic_kitchens100.common.compute_dataset_stats=true
53 | dataset.epic_kitchens100.common.max_els=null
54 |
55 | dataset.epic_kitchens100.common.reader_fn={rgb: {_target_: datasets.reader_fns.EpicRULSTMFeatsReader, lmdb_path: ${dataset.epic_kitchens100.common.rulstm_feats_dir}/rgb_omnivore/}}
--------------------------------------------------------------------------------
/expts/01_SA-Fuser_ek100_val_TSN_wo_audio.txt:
--------------------------------------------------------------------------------
1 | workers=32
2 | num_gpus=1
3 | init_from_model=fusion_ek100_tsn_wo_audio_4h_18s/checkpoint_best.pth
4 |
5 | train.batch_size=32
6 | eval.batch_size=32
7 |
8 | model.modal_dims={rgb:1024,objects:352,flow:1024}
9 | model.common_dim=1024
10 | model.dropout=0.2
11 | model.common.backbones={rgb:{_target_:torch.nn.Identity},objects:{_target_:torch.nn.Identity},flow:{_target_:torch.nn.Identity}}
12 | model/future_predictor=base_future_predictor
13 | model/fuser=SA-Fuser
14 | model/CMFP=cmfp_early
15 | model/mapping=linear
16 |
17 | model.common.share_classifiers=true
18 | model.common.share_predictors=true
19 | model.common.modality_cls=false
20 | model.common.fusion_cls=true
21 |
22 | model.mapping.use_layernorm=false
23 | model.mapping.sparse_mapping=true
24 |
25 | model.fuser.depth=6
26 | model.fuser.num_heads=4
27 | model.fuser.embd_drop_rate=0.1
28 | model.fuser.drop_rate=0.1
29 | model.fuser.attn_drop_rate=0.1
30 | model.fuser.drop_path_rate=0.1
31 | model.fuser.cross_attn=false
32 |
33 | data_train.num_frames=18
34 | data_eval.num_frames=18
35 |
36 | dataset@dataset_train=epic_kitchens100/train
37 | dataset@dataset_eval=epic_kitchens100/val
38 | dataset.epic_kitchens100.common.label_type=action
39 | dataset.epic_kitchens100.common.sample_strategy=last_clip
40 | dataset.epic_kitchens100.common.tau_a=1
41 | dataset.epic_kitchens100.common.tau_o=18
42 | dataset.epic_kitchens100.common.compute_dataset_stats=false
43 | dataset.epic_kitchens100.common.max_els=null
44 |
45 | dataset.epic_kitchens100.common.reader_fn={rgb:{_target_:datasets.reader_fns.EpicRULSTMFeatsReader,lmdb_path:${dataset.epic_kitchens100.common.rulstm_feats_dir}/rgb/},objects:{_target_:datasets.reader_fns.EpicRULSTMFeatsReader,lmdb_path:${dataset.epic_kitchens100.common.rulstm_feats_dir}/obj/},flow:{_target_:datasets.reader_fns.EpicRULSTMFeatsReader,lmdb_path:${dataset.epic_kitchens100.common.rulstm_feats_dir}/flow/}}
--------------------------------------------------------------------------------
/expts/01_SA-Fuser_ek100_test_TSN_wo_audio.txt:
--------------------------------------------------------------------------------
1 | workers=32
2 | num_gpus=1
3 | init_from_model=fusion_ek100_tsn_wo_audio_4h_18s/checkpoint_best.pth
4 | +save_name=test.h5
5 |
6 | train.batch_size=32
7 | eval.batch_size=32
8 |
9 | model.modal_dims={rgb:1024,objects:352,flow:1024}
10 | model.common_dim=1024
11 | model.dropout=0.2
12 | model.common.backbones={rgb:{_target_:torch.nn.Identity},objects:{_target_:torch.nn.Identity},flow:{_target_:torch.nn.Identity}}
13 | model/future_predictor=base_future_predictor
14 | model/fuser=SA-Fuser
15 | model/CMFP=cmfp_early
16 | model/mapping=linear
17 |
18 | model.common.share_classifiers=true
19 | model.common.share_predictors=true
20 | model.common.modality_cls=false
21 | model.common.fusion_cls=true
22 |
23 | model.mapping.use_layernorm=false
24 | model.mapping.sparse_mapping=true
25 |
26 | model.fuser.depth=6
27 | model.fuser.num_heads=4
28 | model.fuser.embd_drop_rate=0.1
29 | model.fuser.drop_rate=0.1
30 | model.fuser.attn_drop_rate=0.1
31 | model.fuser.drop_path_rate=0.1
32 | model.fuser.cross_attn=false
33 |
34 | data_train.num_frames=18
35 | data_eval.num_frames=18
36 |
37 | dataset@dataset_train=epic_kitchens100/train
38 | dataset@dataset_eval=epic_kitchens100/test
39 | dataset.epic_kitchens100.common.label_type=action
40 | dataset.epic_kitchens100.common.sample_strategy=last_clip
41 | dataset.epic_kitchens100.common.tau_a=1
42 | dataset.epic_kitchens100.common.tau_o=18
43 | dataset.epic_kitchens100.common.compute_dataset_stats=false
44 | dataset.epic_kitchens100.common.max_els=null
45 |
46 | dataset.epic_kitchens100.common.reader_fn={rgb:{_target_:datasets.reader_fns.EpicRULSTMFeatsReader,lmdb_path:${dataset.epic_kitchens100.common.rulstm_feats_dir}/rgb/},objects:{_target_:datasets.reader_fns.EpicRULSTMFeatsReader,lmdb_path:${dataset.epic_kitchens100.common.rulstm_feats_dir}/obj/},flow:{_target_:datasets.reader_fns.EpicRULSTMFeatsReader,lmdb_path:${dataset.epic_kitchens100.common.rulstm_feats_dir}/flow/}}
--------------------------------------------------------------------------------
/expts/06_SA-Fuser_egtea_val.txt:
--------------------------------------------------------------------------------
1 | workers=32
2 | num_gpus=1
3 | init_from_model=fusion_egtea_tsn/checkpoint_best.pth
4 |
5 | train.batch_size=32
6 | eval.batch_size=32
7 |
8 | model.modal_dims={rgb:1024, flow:1024}
9 | model.common_dim=1024
10 | model.dropout=0.2
11 | model.common.backbones={rgb: {_target_: torch.nn.Identity}, flow: {_target_: torch.nn.Identity}}
12 | model/future_predictor=base_future_predictor
13 | model/fuser=SA-Fuser
14 | model/CMFP=cmfp_early
15 | model/mapping=linear
16 |
17 | model.common.share_classifiers=true
18 | model.common.share_predictors=true
19 | model.common.modality_cls=false
20 | model.common.fusion_cls=true
21 |
22 | model.mapping.use_layernorm=false
23 | model.mapping.sparse_mapping=true
24 |
25 | model.fuser.depth=2
26 | model.fuser.num_heads=4
27 | model.fuser.embd_drop_rate=0.1
28 | model.fuser.drop_rate=0.1
29 | model.fuser.attn_drop_rate=0.1
30 | model.fuser.drop_path_rate=0.1
31 | model.fuser.cross_attn=false
32 |
33 | model.common.fp_output_len=1
34 | model.common.fp_inter_dim=2048
35 | model.common.fp_layers=2
36 | model.common.fp_heads=4
37 | model.common.fp_output_attentions=false
38 | model.common.embd_pdrop=0.1
39 | model.common.resid_pdrop=0.1
40 | model.common.attn_pdrop=0.1
41 |
42 | data_train.zero_mask_rate=0.0
43 |
44 | dataset@dataset_train=egtea/train
45 | dataset@dataset_eval=egtea/val
46 | dataset.egtea.common.label_type=action
47 | dataset.egtea.common.sample_strategy=last_clip
48 | dataset.egtea.common.tau_a=0.5
49 | dataset.egtea.common.tau_o=10
50 | dataset.egtea.common.compute_dataset_stats=false
51 | dataset.egtea.common.max_els=null
52 |
53 | dataset.egtea.common.reader_fn={rgb: {_target_: datasets.reader_fns.EpicRULSTMFeatsReader, lmdb_path: ${dataset.egtea.common.rulstm_feats_dir}/TSN-C_3_egtea_action_CE_s${dataset.egtea.common.split}_rgb_model_best_fcfull_hd/}, flow: {_target_: datasets.reader_fns.EpicRULSTMFeatsReader, lmdb_path: ${dataset.egtea.common.rulstm_feats_dir}/TSN-C_3_egtea_action_CE_s${dataset.egtea.common.split}_flow_model_best_fcfull_hd/}}
--------------------------------------------------------------------------------
/expts/01_SA-Fuser_ek100_val_TSN.txt:
--------------------------------------------------------------------------------
1 | workers=32
2 | num_gpus=1
3 | init_from_model=fusion_ek100_tsn_4h_18s/checkpoint_best.pth
4 |
5 | train.batch_size=32
6 | eval.batch_size=32
7 |
8 | model.modal_dims={rgb:1024,objects:352,audio:1024,flow:1024}
9 | model.common_dim=1024
10 | model.dropout=0.2
11 | model.common.backbones={rgb:{_target_:torch.nn.Identity},objects:{_target_:torch.nn.Identity},flow:{_target_:torch.nn.Identity},audio:{_target_:torch.nn.Identity}}
12 | model/future_predictor=base_future_predictor
13 | model/fuser=SA-Fuser
14 | model/CMFP=cmfp_early
15 | model/mapping=linear
16 |
17 | model.common.share_classifiers=true
18 | model.common.share_predictors=true
19 | model.common.modality_cls=false
20 | model.common.fusion_cls=true
21 |
22 | model.mapping.use_layernorm=false
23 | model.mapping.sparse_mapping=true
24 |
25 | model.fuser.depth=6
26 | model.fuser.num_heads=4
27 | model.fuser.embd_drop_rate=0.1
28 | model.fuser.drop_rate=0.1
29 | model.fuser.attn_drop_rate=0.1
30 | model.fuser.drop_path_rate=0.1
31 | model.fuser.cross_attn=false
32 |
33 | data_train.num_frames=18
34 | data_eval.num_frames=18
35 |
36 | dataset@dataset_train=epic_kitchens100/train
37 | dataset@dataset_eval=epic_kitchens100/val
38 | dataset.epic_kitchens100.common.label_type=action
39 | dataset.epic_kitchens100.common.sample_strategy=last_clip
40 | dataset.epic_kitchens100.common.tau_a=1
41 | dataset.epic_kitchens100.common.tau_o=18
42 | dataset.epic_kitchens100.common.compute_dataset_stats=false
43 | dataset.epic_kitchens100.common.max_els=null
44 |
45 | dataset.epic_kitchens100.common.reader_fn={rgb:{_target_:datasets.reader_fns.EpicRULSTMFeatsReader,lmdb_path:${dataset.epic_kitchens100.common.rulstm_feats_dir}/rgb/},objects:{_target_:datasets.reader_fns.EpicRULSTMFeatsReader,lmdb_path:${dataset.epic_kitchens100.common.rulstm_feats_dir}/obj/},flow:{_target_:datasets.reader_fns.EpicRULSTMFeatsReader,lmdb_path:${dataset.epic_kitchens100.common.rulstm_feats_dir}/flow/},audio:{_target_:datasets.reader_fns.EpicRULSTMFeatsReader,lmdb_path:${dataset.epic_kitchens100.common.rulstm_feats_dir}/audio/,warn_if_using_closeby_frame:false}}
--------------------------------------------------------------------------------
/expts/01_SA-Fuser_ek100_val_Swin.txt:
--------------------------------------------------------------------------------
1 | workers=32
2 | num_gpus=1
3 | init_from_model=fusion_ek100_swin_4h_16s/checkpoint_best.pth
4 |
5 | train.batch_size=32
6 | eval.batch_size=32
7 |
8 | model.modal_dims={rgb:1024,objects:352,audio:1024,flow:1024}
9 | model.common_dim=1024
10 | model.dropout=0.2
11 | model.common.backbones={rgb:{_target_:torch.nn.Identity},objects:{_target_:torch.nn.Identity},flow:{_target_:torch.nn.Identity},audio:{_target_:torch.nn.Identity}}
12 | model/future_predictor=base_future_predictor
13 | model/fuser=SA-Fuser
14 | model/CMFP=cmfp_early
15 | model/mapping=linear
16 |
17 | model.common.share_classifiers=true
18 | model.common.share_predictors=true
19 | model.common.modality_cls=false
20 | model.common.fusion_cls=true
21 |
22 | model.mapping.use_layernorm=false
23 | model.mapping.sparse_mapping=true
24 |
25 | model.fuser.depth=6
26 | model.fuser.num_heads=4
27 | model.fuser.embd_drop_rate=0.1
28 | model.fuser.drop_rate=0.1
29 | model.fuser.attn_drop_rate=0.1
30 | model.fuser.drop_path_rate=0.1
31 | model.fuser.cross_attn=false
32 |
33 | data_train.num_frames=16
34 | data_eval.num_frames=16
35 |
36 | dataset@dataset_train=epic_kitchens100/train
37 | dataset@dataset_eval=epic_kitchens100/val
38 | dataset.epic_kitchens100.common.label_type=action
39 | dataset.epic_kitchens100.common.sample_strategy=last_clip
40 | dataset.epic_kitchens100.common.tau_a=1
41 | dataset.epic_kitchens100.common.tau_o=16
42 | dataset.epic_kitchens100.common.compute_dataset_stats=false
43 | dataset.epic_kitchens100.common.max_els=null
44 |
45 | dataset.epic_kitchens100.common.reader_fn={rgb:{_target_:datasets.reader_fns.EpicRULSTMFeatsReader,lmdb_path:${dataset.epic_kitchens100.common.rulstm_feats_dir}/rgb_omnivore/},objects:{_target_:datasets.reader_fns.EpicRULSTMFeatsReader,lmdb_path:${dataset.epic_kitchens100.common.rulstm_feats_dir}/obj/},flow:{_target_:datasets.reader_fns.EpicRULSTMFeatsReader,lmdb_path:${dataset.epic_kitchens100.common.rulstm_feats_dir}/flow/},audio:{_target_:datasets.reader_fns.EpicRULSTMFeatsReader,lmdb_path:${dataset.epic_kitchens100.common.rulstm_feats_dir}/audio/,warn_if_using_closeby_frame:false}}
--------------------------------------------------------------------------------
/conf/config.yaml:
--------------------------------------------------------------------------------
1 | cwd: ${hydra:runtime.cwd}
2 | workers: 4
3 | num_gpus: 2
4 | seed: 42
5 | project_name: Anticipation
6 | experiment_name: CMFuser
7 | init_from_model: null
8 | dataset_root_dir: /home/zhong/Documents/datasets
9 | primary_metric: val_mt5r_action_all-fused
10 | dist_backend: nccl
11 | temporal_context: 10
12 |
13 | train:
14 | batch_size: 3
15 | num_epochs: 50
16 | use_mixup: true
17 |   mixup_backbone: true # if true, mix up the backbone outputs rather than the raw inputs
18 | mixup_alpha: 0.1 # this value is from vivit: https://github.com/google-research/scenic/blob/main/scenic/projects/vivit/configs/epic_kitchens/vivit_large_factorised_encoder.py
19 | label_smoothing:
20 | action: 0.4
21 | verb: 0.01
22 | noun: 0.03
23 | modules_to_keep: null
24 | loss_wts:
25 | # classification for future action
26 | cls_action: 1.0
27 | cls_verb: 1.0
28 | cls_noun: 1.0
29 | # classification for updated past action
30 | past_cls_action: 1.0
31 | past_cls_verb: 1.0
32 | past_cls_noun: 1.0
33 | # regression for updated past feature
34 | past_reg: 1.0
35 |
36 | eval:
37 | batch_size: 3
38 |
39 | model:
40 | modal_dims: null #{"rgb": 1024, "objects": 352} # length of this dict corresponds to the number of modalities
41 | modal_feature_order: ["rgb", "objects", "audio", "poses", "flow"]
42 | common_dim: 1024
43 | dropout: 0.2
44 |
45 | opt:
46 | lr: 0.001 # learning rate
47 | wd: 0.000001 # weight decay
48 | lr_wd: null # [[backbone, 0.0001, 0.000001]] # modules with specific lr and wd
49 | grad_clip: null # by default, no clipping
50 | warmup:
51 | _target_: common.scheduler.Warmup
52 | init_lr_ratio: 0.01 # Warmup from this ratio of the orig LRs
53 | num_epochs: 0 # Warmup for this many epochs (will take out of total epochs)
54 |
55 | defaults:
56 | - dataset@dataset_train: epic_kitchens100/train
57 | - dataset@dataset_eval: epic_kitchens100/val
58 | - data@data_train: default
59 | - data@data_eval: default
60 | - dataset/epic_kitchens100/common
61 | - dataset/egtea/common
62 | - model/common
63 | - opt/optimizer: sgd
64 | - opt/scheduler: cosine
65 | - model/backbone: identity
66 | - model/future_predictor: base_future_predictor
67 | - model/fuser: SA-Fuser
68 | - model/CMFP: cmfp_early
69 | - model/mapping: linear
70 | - _self_
71 |
--------------------------------------------------------------------------------
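The `defaults` list is what stitches the config groups above into one tree, with group choices swappable per run (as the expts files do). A hedged sketch using Hydra's compose API, assuming a recent Hydra and running from the repo root (untested against this exact tree):

    from hydra import compose, initialize

    with initialize(version_base=None, config_path='conf'):
        cfg = compose(config_name='config',
                      overrides=['model/fuser=MATT', 'train.batch_size=16'])
    print(cfg.model.fuser['_target_'])  # models.fusion.MATT
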
/expts/06_SA-Fuser_egtea_train.txt:
--------------------------------------------------------------------------------
1 | workers=32
2 | num_gpus=2
3 | experiment_name=egtea
4 | init_from_model=null
5 | primary_metric=val_acc1_action_all-fused
6 |
7 | train.batch_size=16
8 | eval.batch_size=16
9 | train.num_epochs=50
10 | train.use_mixup=true
11 | train.mixup_backbone=true
12 | train.mixup_alpha=0.1
13 | train.loss_wts.past_cls_action=0.1 # following AVT
14 |
15 | model.modal_dims={rgb:1024, flow:1024}
16 | model.common_dim=1024
17 | model.dropout=0.2
18 | model.common.backbones={rgb: {_target_: torch.nn.Identity}, flow: {_target_: torch.nn.Identity}}
19 | model/future_predictor=base_future_predictor
20 | model/fuser=SA-Fuser
21 | model/CMFP=cmfp_early
22 | model/mapping=linear
23 |
24 | model.common.share_classifiers=true
25 | model.common.share_predictors=true
26 | model.common.modality_cls=false
27 | model.common.fusion_cls=true
28 |
29 | model.mapping.use_layernorm=false
30 | model.mapping.sparse_mapping=true
31 |
32 | model.fuser.depth=2
33 | model.fuser.num_heads=4
34 | model.fuser.embd_drop_rate=0.1
35 | model.fuser.drop_rate=0.1
36 | model.fuser.attn_drop_rate=0.1
37 | model.fuser.drop_path_rate=0.1
38 | model.fuser.cross_attn=false
39 |
40 | model.common.fp_output_len=1
41 | model.common.fp_inter_dim=2048
42 | model.common.fp_layers=2
43 | model.common.fp_heads=4
44 | model.common.fp_output_attentions=false
45 | model.common.embd_pdrop=0.1
46 | model.common.resid_pdrop=0.1
47 | model.common.attn_pdrop=0.1
48 |
49 | opt.lr=0.001
50 | opt.wd=0.000001
51 | opt/optimizer=sgd
52 | opt/scheduler=cosine
53 | opt.optimizer.nesterov=true
54 | opt.warmup.num_epochs=20
55 | opt.scheduler.num_epochs=30
56 | opt.scheduler.eta_min=1e-6
57 |
58 | data_train.zero_mask_rate=0.0
59 |
60 | dataset@dataset_train=egtea/train
61 | dataset@dataset_eval=egtea/val
62 | dataset.egtea.common.label_type=action
63 | dataset.egtea.common.sample_strategy=last_clip
64 | dataset.egtea.common.tau_a=0.5
65 | dataset.egtea.common.tau_o=10
66 | dataset.egtea.common.compute_dataset_stats=false
67 | dataset.egtea.common.max_els=null
68 |
69 | dataset.egtea.common.reader_fn={rgb: {_target_: datasets.reader_fns.EpicRULSTMFeatsReader, lmdb_path: ${dataset.egtea.common.rulstm_feats_dir}/TSN-C_3_egtea_action_CE_s${dataset.egtea.common.split}_rgb_model_best_fcfull_hd/}, flow: {_target_: datasets.reader_fns.EpicRULSTMFeatsReader, lmdb_path: ${dataset.egtea.common.rulstm_feats_dir}/TSN-C_3_egtea_action_CE_s${dataset.egtea.common.split}_flow_model_best_fcfull_hd/}}
--------------------------------------------------------------------------------
/expts/05_MATT_ek100_train.txt:
--------------------------------------------------------------------------------
1 | workers=32
2 | num_gpus=2
3 | experiment_name=MATT
4 | init_from_model=null
5 | primary_metric=val_mt5r_action_all-fused
6 |
7 | train.loss_wts.past_cls_action=0
8 |
9 | train.batch_size=16
10 | eval.batch_size=16
11 | train.num_epochs=50
12 | train.use_mixup=true
13 | train.mixup_backbone=true
14 | train.mixup_alpha=0.1
15 |
16 | model.modal_dims={rgb:1024, objects:352, audio:1024, flow:1024}
17 | model.common_dim=1024
18 | model.dropout=0.2
19 | model.common.backbones={rgb: {_target_: torch.nn.Identity}, objects: {_target_: torch.nn.Identity}, flow: {_target_: torch.nn.Identity}, audio: {_target_: torch.nn.Identity}}
20 | model/future_predictor=base_future_predictor
21 | model/fuser=MATT
22 | model/CMFP=scorefusion
23 | model/mapping=linear
24 |
25 | model.common.share_classifiers=false
26 | model.common.share_predictors=false
27 | model.common.modality_cls=true
28 | model.common.fusion_cls=false
29 |
30 | model.mapping.use_layernorm=false
31 | model.mapping.sparse_mapping=true
32 |
33 | model.fuser.drop_rate=0.8
34 |
35 | model.common.fp_output_len=1
36 | model.common.fp_inter_dim=2048
37 | model.common.fp_layers=2
38 | model.common.fp_heads=4
39 | model.common.fp_output_attentions=false
40 | model.common.embd_pdrop=0.1
41 | model.common.resid_pdrop=0.1
42 | model.common.attn_pdrop=0.1
43 |
44 | opt.lr=0.001
45 | opt.wd=0.000001
46 | opt/optimizer=sgd
47 | opt/scheduler=cosine
48 | opt.optimizer.nesterov=true
49 | opt.warmup.num_epochs=20
50 | opt.scheduler.num_epochs=30
51 | opt.scheduler.eta_min=1e-6
52 |
53 | data_train.zero_mask_rate=0.0
54 |
55 | dataset@dataset_train=epic_kitchens100/train
56 | dataset@dataset_eval=epic_kitchens100/val
57 | dataset.epic_kitchens100.common.label_type=action
58 | dataset.epic_kitchens100.common.sample_strategy=last_clip
59 | dataset.epic_kitchens100.common.tau_a=1
60 | dataset.epic_kitchens100.common.tau_o=10
61 | dataset.epic_kitchens100.common.compute_dataset_stats=false
62 | dataset.epic_kitchens100.common.max_els=null
63 |
64 | dataset.epic_kitchens100.common.reader_fn={rgb: {_target_: datasets.reader_fns.EpicRULSTMFeatsReader, lmdb_path: ${dataset.epic_kitchens100.common.rulstm_feats_dir}/rgb_omnivore/}, objects: {_target_: datasets.reader_fns.EpicRULSTMFeatsReader, lmdb_path: ${dataset.epic_kitchens100.common.rulstm_feats_dir}/obj/}, flow: {_target_: datasets.reader_fns.EpicRULSTMFeatsReader, lmdb_path: ${dataset.epic_kitchens100.common.rulstm_feats_dir}/flow/}, audio: {_target_: datasets.reader_fns.EpicRULSTMFeatsReader, lmdb_path: ${dataset.epic_kitchens100.common.rulstm_feats_dir}/audio/, warn_if_using_closeby_frame: false}}
--------------------------------------------------------------------------------
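The `reader_fn` entries rely on OmegaConf's `${...}` interpolation: each `lmdb_path` is resolved against `rulstm_feats_dir` at access time. A minimal, self-contained illustration (the directory value is a made-up placeholder):

```python
# Sketch of the ${...} interpolation used by the reader_fn entries above.
from omegaconf import OmegaConf

cfg = OmegaConf.create({
    'dataset': {'epic_kitchens100': {'common': {
        'rulstm_feats_dir': '/data/ek100/rulstm_feats',  # hypothetical path
        'reader_fn': {'rgb': {
            '_target_': 'datasets.reader_fns.EpicRULSTMFeatsReader',
            'lmdb_path': '${dataset.epic_kitchens100.common.rulstm_feats_dir}/rgb_omnivore/',
        }},
    }}},
})
print(cfg.dataset.epic_kitchens100.common.reader_fn.rgb.lmdb_path)
# -> /data/ek100/rulstm_feats/rgb_omnivore/
```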
/expts/04_CA-Fuser_ek100_train.txt:
--------------------------------------------------------------------------------
1 | workers=32
2 | num_gpus=2
3 | experiment_name=CA-Fuser
4 | init_from_model=null
5 | primary_metric=val_mt5r_action_all-fused
6 |
7 | train.batch_size=16
8 | eval.batch_size=16
9 | train.num_epochs=50
10 | train.use_mixup=true
11 | train.mixup_backbone=true
12 | train.mixup_alpha=0.1
13 |
14 | model.modal_dims={rgb:1024, objects:352, audio:1024, flow:1024}
15 | model.common_dim=1024
16 | model.dropout=0.2
17 | model.common.backbones={rgb: {_target_: torch.nn.Identity}, objects: {_target_: torch.nn.Identity}, flow: {_target_: torch.nn.Identity}, audio: {_target_: torch.nn.Identity}}
18 | model/future_predictor=base_future_predictor
19 | model/fuser=CA-Fuser
20 | model/CMFP=cmfp_early
21 | model/mapping=linear
22 |
23 | model.common.share_classifiers=true
24 | model.common.share_predictors=true
25 | model.common.modality_cls=false
26 | model.common.fusion_cls=true
27 |
28 | model.mapping.use_layernorm=false
29 | model.mapping.sparse_mapping=true
30 |
31 | model.fuser.num_heads=4
32 | model.fuser.embd_drop_rate=0.1
33 | model.fuser.drop_rate=0.1
34 | model.fuser.attn_drop_rate=0.1
35 | model.fuser.drop_path_rate=0.1
36 |
37 | model.common.fp_output_len=1
38 | model.common.fp_inter_dim=2048
39 | model.common.fp_layers=6
40 | model.common.fp_heads=4
41 | model.common.fp_output_attentions=false
42 | model.common.embd_pdrop=0.1
43 | model.common.resid_pdrop=0.1
44 | model.common.attn_pdrop=0.1
45 |
46 | opt.lr=0.001
47 | opt.wd=0.000001
48 | opt/optimizer=sgd
49 | opt/scheduler=cosine
50 | opt.optimizer.nesterov=true
51 | opt.warmup.num_epochs=20
52 | opt.scheduler.num_epochs=30
53 | opt.scheduler.eta_min=1e-6
54 |
55 | data_train.zero_mask_rate=0.0
56 |
57 | dataset@dataset_train=epic_kitchens100/train
58 | dataset@dataset_eval=epic_kitchens100/val
59 | dataset.epic_kitchens100.common.label_type=action
60 | dataset.epic_kitchens100.common.sample_strategy=last_clip
61 | dataset.epic_kitchens100.common.tau_a=1
62 | dataset.epic_kitchens100.common.tau_o=10
63 | dataset.epic_kitchens100.common.compute_dataset_stats=false
64 | dataset.epic_kitchens100.common.max_els=null
65 |
66 | dataset.epic_kitchens100.common.reader_fn={rgb: {_target_: datasets.reader_fns.EpicRULSTMFeatsReader, lmdb_path: ${dataset.epic_kitchens100.common.rulstm_feats_dir}/rgb_omnivore/}, objects: {_target_: datasets.reader_fns.EpicRULSTMFeatsReader, lmdb_path: ${dataset.epic_kitchens100.common.rulstm_feats_dir}/obj/}, flow: {_target_: datasets.reader_fns.EpicRULSTMFeatsReader, lmdb_path: ${dataset.epic_kitchens100.common.rulstm_feats_dir}/flow/}, audio: {_target_: datasets.reader_fns.EpicRULSTMFeatsReader, lmdb_path: ${dataset.epic_kitchens100.common.rulstm_feats_dir}/audio/, warn_if_using_closeby_frame: false}}
--------------------------------------------------------------------------------
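The `opt.*` block shared by these experiments describes SGD with Nesterov momentum, 20 warmup epochs and a 30-epoch cosine decay to `eta_min=1e-6`. The repo's `common/scheduler.py` is the authoritative implementation; the following is only a rough PyTorch approximation, and the momentum value is an assumption (the overrides do not set it, and `nesterov=True` requires momentum > 0):

```python
# Rough sketch of SGD + linear warmup + cosine decay as configured above.
import torch
from torch import nn
from torch.optim.lr_scheduler import CosineAnnealingLR

model = nn.Linear(1024, 1024)  # stand-in for the real model
base_lr = 0.001
opt = torch.optim.SGD(model.parameters(), lr=base_lr, weight_decay=1e-6,
                      momentum=0.9, nesterov=True)  # momentum is a placeholder
warmup_epochs = 20
cosine = CosineAnnealingLR(opt, T_max=30, eta_min=1e-6)

for epoch in range(50):
    if epoch < warmup_epochs:
        for g in opt.param_groups:  # simple linear warmup (assumed form)
            g['lr'] = base_lr * (epoch + 1) / warmup_epochs
    else:
        cosine.step()
    # ... one training epoch here ...
```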
/expts/02_SA-Fuser_wo_token_ek100_train.txt:
--------------------------------------------------------------------------------
1 | workers=32
2 | num_gpus=2
3 | experiment_name=SA-Fuser_wo_token
4 | init_from_model=null
5 | primary_metric=val_mt5r_action_all-fused
6 |
7 | train.batch_size=16
8 | eval.batch_size=16
9 | train.num_epochs=50
10 | train.use_mixup=true
11 | train.mixup_backbone=true
12 | train.mixup_alpha=0.1
13 |
14 | model.modal_dims={rgb:1024, objects:352, audio:1024, flow:1024}
15 | model.common_dim=1024
16 | model.dropout=0.2
17 | model.common.backbones={rgb: {_target_: torch.nn.Identity}, objects: {_target_: torch.nn.Identity}, flow: {_target_: torch.nn.Identity}, audio: {_target_: torch.nn.Identity}}
18 | model/future_predictor=base_future_predictor
19 | model/fuser=SA-Fuser_wo_token
20 | model/CMFP=cmfp_early
21 | model/mapping=linear
22 |
23 | model.common.share_classifiers=true
24 | model.common.share_predictors=true
25 | model.common.modality_cls=false
26 | model.common.fusion_cls=true
27 |
28 | model.mapping.use_layernorm=false
29 | model.mapping.sparse_mapping=true
30 |
31 | model.fuser.depth=6
32 | model.fuser.num_heads=4
33 | model.fuser.embd_drop_rate=0.1
34 | model.fuser.drop_rate=0.1
35 | model.fuser.attn_drop_rate=0.1
36 | model.fuser.drop_path_rate=0.1
37 | model.fuser.cross_attn=false
38 |
39 | model.common.fp_output_len=1
40 | model.common.fp_inter_dim=2048
41 | model.common.fp_layers=6
42 | model.common.fp_heads=4
43 | model.common.fp_output_attentions=false
44 | model.common.embd_pdrop=0.1
45 | model.common.resid_pdrop=0.1
46 | model.common.attn_pdrop=0.1
47 |
48 | opt.lr=0.001
49 | opt.wd=0.000001
50 | opt/optimizer=sgd
51 | opt/scheduler=cosine
52 | opt.optimizer.nesterov=true
53 | opt.warmup.num_epochs=20
54 | opt.scheduler.num_epochs=30
55 | opt.scheduler.eta_min=1e-6
56 |
57 | data_train.zero_mask_rate=0.0
58 |
59 | dataset@dataset_train=epic_kitchens100/train
60 | dataset@dataset_eval=epic_kitchens100/val
61 | dataset.epic_kitchens100.common.label_type=action
62 | dataset.epic_kitchens100.common.sample_strategy=last_clip
63 | dataset.epic_kitchens100.common.tau_a=1
64 | dataset.epic_kitchens100.common.tau_o=10
65 | dataset.epic_kitchens100.common.compute_dataset_stats=false
66 | dataset.epic_kitchens100.common.max_els=null
67 |
68 | dataset.epic_kitchens100.common.reader_fn={rgb: {_target_: datasets.reader_fns.EpicRULSTMFeatsReader, lmdb_path: ${dataset.epic_kitchens100.common.rulstm_feats_dir}/rgb_omnivore/}, objects: {_target_: datasets.reader_fns.EpicRULSTMFeatsReader, lmdb_path: ${dataset.epic_kitchens100.common.rulstm_feats_dir}/obj/}, flow: {_target_: datasets.reader_fns.EpicRULSTMFeatsReader, lmdb_path: ${dataset.epic_kitchens100.common.rulstm_feats_dir}/flow/}, audio: {_target_: datasets.reader_fns.EpicRULSTMFeatsReader, lmdb_path: ${dataset.epic_kitchens100.common.rulstm_feats_dir}/audio/, warn_if_using_closeby_frame: false}}
--------------------------------------------------------------------------------
/expts/01_SA-Fuser_ek100_train.txt:
--------------------------------------------------------------------------------
1 | workers=32
2 | num_gpus=2
3 | experiment_name=SA-Fuser
4 | init_from_model=null
5 | primary_metric=val_mt5r_action_all-fused
6 |
7 | train.batch_size=16
8 | eval.batch_size=16
9 | train.num_epochs=50
10 | train.use_mixup=true
11 | train.mixup_backbone=true
12 | train.mixup_alpha=0.1
13 |
14 | model.modal_dims={rgb:1024, objects:352, audio:1024, flow:1024}
15 | model.common_dim=1024
16 | model.dropout=0.2
17 | model.common.backbones={rgb: {_target_: torch.nn.Identity}, objects: {_target_: torch.nn.Identity}, flow: {_target_: torch.nn.Identity}, audio: {_target_: torch.nn.Identity}}
18 | model/future_predictor=base_future_predictor
19 | model/fuser=SA-Fuser
20 | model/CMFP=cmfp_early
21 | model/mapping=linear
22 |
23 | model.common.share_classifiers=true
24 | model.common.share_predictors=true
25 | model.common.modality_cls=false
26 | model.common.fusion_cls=true
27 |
28 | model.mapping.use_layernorm=false
29 | model.mapping.sparse_mapping=true
30 |
31 | model.fuser.depth=6
32 | model.fuser.num_heads=4
33 | model.fuser.embd_drop_rate=0.1
34 | model.fuser.drop_rate=0.1
35 | model.fuser.attn_drop_rate=0.1
36 | model.fuser.drop_path_rate=0.1
37 | model.fuser.cross_attn=false
38 |
39 | model.common.fp_output_len=1
40 | model.common.fp_inter_dim=2048
41 | model.common.fp_layers=6
42 | model.common.fp_heads=4
43 | model.common.fp_output_attentions=false
44 | model.common.embd_pdrop=0.1
45 | model.common.resid_pdrop=0.1
46 | model.common.attn_pdrop=0.1
47 |
48 | opt.lr=0.001
49 | opt.wd=0.000001
50 | opt/optimizer=sgd
51 | opt/scheduler=cosine
52 | opt.optimizer.nesterov=true
53 | opt.warmup.num_epochs=20
54 | opt.scheduler.num_epochs=30
55 | opt.scheduler.eta_min=1e-6
56 |
57 | data_train.zero_mask_rate=0.0
58 | data_train.num_frames=16
59 | data_eval.num_frames=16
60 |
61 | dataset@dataset_train=epic_kitchens100/train
62 | dataset@dataset_eval=epic_kitchens100/val
63 | dataset.epic_kitchens100.common.label_type=action
64 | dataset.epic_kitchens100.common.sample_strategy=last_clip
65 | dataset.epic_kitchens100.common.tau_a=1
66 | dataset.epic_kitchens100.common.tau_o=16
67 | dataset.epic_kitchens100.common.compute_dataset_stats=false
68 | dataset.epic_kitchens100.common.max_els=null
69 |
70 | dataset.epic_kitchens100.common.reader_fn={rgb: {_target_: datasets.reader_fns.EpicRULSTMFeatsReader, lmdb_path: ${dataset.epic_kitchens100.common.rulstm_feats_dir}/rgb_omnivore/}, objects: {_target_: datasets.reader_fns.EpicRULSTMFeatsReader, lmdb_path: ${dataset.epic_kitchens100.common.rulstm_feats_dir}/obj/}, flow: {_target_: datasets.reader_fns.EpicRULSTMFeatsReader, lmdb_path: ${dataset.epic_kitchens100.common.rulstm_feats_dir}/flow/}, audio: {_target_: datasets.reader_fns.EpicRULSTMFeatsReader, lmdb_path: ${dataset.epic_kitchens100.common.rulstm_feats_dir}/audio/, warn_if_using_closeby_frame: false}}
--------------------------------------------------------------------------------
/expts/03_T-SA-Fuser_ek100_train.txt:
--------------------------------------------------------------------------------
1 | workers=32
2 | num_gpus=2
3 | experiment_name=T-SA-Fuser
4 | init_from_model=null
5 | primary_metric=val_mt5r_action_all-fused
6 |
7 | train.batch_size=16
8 | eval.batch_size=16
9 | train.num_epochs=50
10 | train.use_mixup=true
11 | train.mixup_backbone=true
12 | train.mixup_alpha=0.1
13 |
14 | model.modal_dims={rgb:1024, objects:352, audio:1024, flow:1024}
15 | model.common_dim=1024
16 | model.dropout=0.2
17 | model.common.backbones={rgb: {_target_: torch.nn.Identity}, objects: {_target_: torch.nn.Identity}, flow: {_target_: torch.nn.Identity}, audio: {_target_: torch.nn.Identity}}
18 | model/future_predictor=base_future_predictor
19 | model/fuser=T-SA-Fuser
20 | model/CMFP=cmfp_early
21 | model/mapping=linear
22 |
23 | model.common.share_classifiers=true
24 | model.common.share_predictors=true
25 | model.common.modality_cls=false
26 | model.common.fusion_cls=true
27 |
28 | model.mapping.use_layernorm=false
29 | model.mapping.sparse_mapping=true
30 |
31 | model.fuser.depth=6
32 | model.fuser.num_heads=4
33 | model.fuser.embd_drop_rate=0.1
34 | model.fuser.drop_rate=0.1
35 | model.fuser.attn_drop_rate=0.1
36 | model.fuser.drop_path_rate=0.1
37 | model.fuser.modal_encoding=true
38 | model.fuser.frame_level_token=true
39 | model.fuser.temporal_sequence_length=10
40 |
41 | model.common.fp_output_len=1
42 | model.common.fp_inter_dim=2048
43 | model.common.fp_layers=6
44 | model.common.fp_heads=4
45 | model.common.fp_output_attentions=false
46 | model.common.embd_pdrop=0.1
47 | model.common.resid_pdrop=0.1
48 | model.common.attn_pdrop=0.1
49 |
50 | opt.lr=0.001
51 | opt.wd=0.000001
52 | opt/optimizer=sgd
53 | opt/scheduler=cosine
54 | opt.optimizer.nesterov=true
55 | opt.warmup.num_epochs=20
56 | opt.scheduler.num_epochs=30
57 | opt.scheduler.eta_min=1e-6
58 |
59 | data_train.zero_mask_rate=0.0
60 |
61 | dataset@dataset_train=epic_kitchens100/train
62 | dataset@dataset_eval=epic_kitchens100/val
63 | dataset.epic_kitchens100.common.label_type=action
64 | dataset.epic_kitchens100.common.sample_strategy=last_clip
65 | dataset.epic_kitchens100.common.tau_a=1
66 | dataset.epic_kitchens100.common.tau_o=10
67 | dataset.epic_kitchens100.common.compute_dataset_stats=false
68 | dataset.epic_kitchens100.common.max_els=null
69 |
70 | dataset.epic_kitchens100.common.reader_fn={rgb: {_target_: datasets.reader_fns.EpicRULSTMFeatsReader, lmdb_path: ${dataset.epic_kitchens100.common.rulstm_feats_dir}/rgb_omnivore/}, objects: {_target_: datasets.reader_fns.EpicRULSTMFeatsReader, lmdb_path: ${dataset.epic_kitchens100.common.rulstm_feats_dir}/obj/}, flow: {_target_: datasets.reader_fns.EpicRULSTMFeatsReader, lmdb_path: ${dataset.epic_kitchens100.common.rulstm_feats_dir}/flow/}, audio: {_target_: datasets.reader_fns.EpicRULSTMFeatsReader, lmdb_path: ${dataset.epic_kitchens100.common.rulstm_feats_dir}/audio/, warn_if_using_closeby_frame: false}}
--------------------------------------------------------------------------------
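The SA-Fuser configurations describe a self-attention fuser over per-frame modality features (`depth=6`, `num_heads=4`, 1024-d tokens), optionally with a learnable fusion token. The sketch below is an assumption for illustration only, not the repo's implementation (see `models/transformerblock.py` and `conf/model/fuser/*.yaml`):

```python
# Schematic self-attention fuser with a learnable fusion token.
import torch
from torch import nn

class TinySAFuser(nn.Module):
    def __init__(self, dim=1024, depth=6, heads=4):
        super().__init__()
        self.token = nn.Parameter(torch.zeros(1, 1, dim))   # fusion token
        layer = nn.TransformerEncoderLayer(dim, heads, dim_feedforward=2048,
                                           dropout=0.1, batch_first=True)
        self.encoder = nn.TransformerEncoder(layer, depth)

    def forward(self, modality_feats):
        # modality_feats: list of (B, dim) features, one entry per modality
        x = torch.stack(modality_feats, dim=1)              # (B, M, dim)
        tok = self.token.expand(x.size(0), -1, -1)          # (B, 1, dim)
        out = self.encoder(torch.cat([tok, x], dim=1))      # (B, 1+M, dim)
        return out[:, 0]                                    # fused feature
```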
/annotations/ek55_rulstm/training_videos.csv:
--------------------------------------------------------------------------------
1 | P01_02
2 | P01_03
3 | P01_04
4 | P01_05
5 | P01_06
6 | P01_07
7 | P01_08
8 | P01_09
9 | P01_16
10 | P01_17
11 | P01_18
12 | P01_19
13 | P02_01
14 | P02_02
15 | P02_04
16 | P02_06
17 | P02_07
18 | P02_08
19 | P02_09
20 | P02_10
21 | P02_11
22 | P03_02
23 | P03_03
24 | P03_04
25 | P03_05
26 | P03_07
27 | P03_08
28 | P03_09
29 | P03_10
30 | P03_12
31 | P03_13
32 | P03_14
33 | P03_15
34 | P03_16
35 | P03_17
36 | P03_18
37 | P03_19
38 | P03_20
39 | P03_27
40 | P03_28
41 | P04_01
42 | P04_02
43 | P04_03
44 | P04_04
45 | P04_05
46 | P04_06
47 | P04_07
48 | P04_08
49 | P04_10
50 | P04_11
51 | P04_12
52 | P04_13
53 | P04_14
54 | P04_15
55 | P04_16
56 | P04_17
57 | P04_18
58 | P04_19
59 | P04_20
60 | P04_21
61 | P04_22
62 | P04_23
63 | P05_01
64 | P05_02
65 | P05_03
66 | P05_04
67 | P05_05
68 | P05_06
69 | P05_08
70 | P06_01
71 | P06_02
72 | P06_03
73 | P06_07
74 | P06_08
75 | P06_09
76 | P07_01
77 | P07_03
78 | P07_04
79 | P07_05
80 | P07_06
81 | P07_07
82 | P07_09
83 | P07_11
84 | P08_02
85 | P08_03
86 | P08_04
87 | P08_06
88 | P08_07
89 | P08_08
90 | P08_11
91 | P08_13
92 | P08_18
93 | P08_19
94 | P08_20
95 | P08_21
96 | P08_22
97 | P08_23
98 | P08_24
99 | P08_25
100 | P08_26
101 | P08_27
102 | P08_28
103 | P10_02
104 | P10_04
105 | P12_01
106 | P12_02
107 | P12_04
108 | P12_05
109 | P12_06
110 | P12_07
111 | P13_05
112 | P13_07
113 | P13_08
114 | P13_10
115 | P14_03
116 | P14_04
117 | P14_05
118 | P14_07
119 | P14_09
120 | P15_01
121 | P15_02
122 | P15_03
123 | P15_07
124 | P15_08
125 | P15_09
126 | P15_10
127 | P15_11
128 | P15_12
129 | P15_13
130 | P16_01
131 | P16_02
132 | P16_03
133 | P17_01
134 | P17_03
135 | P17_04
136 | P19_01
137 | P19_02
138 | P19_03
139 | P19_04
140 | P20_01
141 | P20_02
142 | P21_01
143 | P21_03
144 | P21_04
145 | P22_05
146 | P22_06
147 | P22_07
148 | P22_09
149 | P22_12
150 | P22_14
151 | P22_15
152 | P22_16
153 | P22_17
154 | P23_01
155 | P23_02
156 | P23_04
157 | P24_01
158 | P24_02
159 | P24_03
160 | P24_04
161 | P24_05
162 | P24_06
163 | P24_07
164 | P25_01
165 | P25_02
166 | P25_03
167 | P25_04
168 | P25_05
169 | P25_09
170 | P25_10
171 | P25_12
172 | P26_01
173 | P26_03
174 | P26_04
175 | P26_05
176 | P26_06
177 | P26_07
178 | P26_08
179 | P26_09
180 | P26_10
181 | P26_12
182 | P26_13
183 | P26_14
184 | P26_15
185 | P26_17
186 | P26_18
187 | P26_19
188 | P26_20
189 | P26_21
190 | P26_22
191 | P26_23
192 | P26_24
193 | P26_25
194 | P26_26
195 | P26_27
196 | P26_28
197 | P26_29
198 | P27_01
199 | P27_02
200 | P27_04
201 | P27_06
202 | P27_07
203 | P28_01
204 | P28_02
205 | P28_03
206 | P28_04
207 | P28_06
208 | P28_07
209 | P28_08
210 | P28_09
211 | P28_10
212 | P28_11
213 | P28_14
214 | P29_01
215 | P29_02
216 | P29_03
217 | P29_04
218 | P30_02
219 | P30_04
220 | P30_05
221 | P30_06
222 | P30_10
223 | P30_11
224 | P31_02
225 | P31_03
226 | P31_04
227 | P31_05
228 | P31_06
229 | P31_07
230 | P31_09
231 | P31_13
232 | P31_14
233 |
--------------------------------------------------------------------------------
/annotations/egtea/actions.csv:
--------------------------------------------------------------------------------
1 | 0, 0_0, Inspect/Read_recipe
2 | 1, 1_1, Open_fridge
3 | 2, 2_2, Take_eating:utensil
4 | 3, 3_3, Cut_tomato
5 | 4, 4_4, Turn on_faucet
6 | 5, 5_2, Put_eating:utensil
7 | 6, 1_5, Open_cabinet
8 | 7, 2_6, Take_condiment:container
9 | 8, 3_7, Cut_cucumber
10 | 9, 6_8, Operate_stove
11 | 10, 7_1, Close_fridge
12 | 11, 3_9, Cut_carrot
13 | 12, 5_6, Put_condiment:container
14 | 13, 3_10, Cut_onion
15 | 14, 1_11, Open_drawer
16 | 15, 2_12, Take_plate
17 | 16, 2_13, Take_bowl
18 | 17, 5_13, Put_bowl
19 | 18, 5_14, Put_trash
20 | 19, 5_12, Put_plate
21 | 20, 3_15, Cut_bell:pepper
22 | 21, 5_16, Put_cooking:utensil
23 | 22, 2_17, Take_paper:towel
24 | 23, 8_18, Move Around_bacon
25 | 24, 1_6, Open_condiment:container
26 | 25, 9_2, Wash_eating:utensil
27 | 26, 10_19, Spread_condiment
28 | 27, 11_4, Turn off_faucet
29 | 28, 5_20, Put_pan
30 | 29, 2_16, Take_cooking:utensil
31 | 30, 5_21, Put_lettuce
32 | 31, 8_22, Move Around_patty
33 | 32, 5_23, Put_pot
34 | 33, 7_5, Close_cabinet
35 | 34, 5_24, Put_bread
36 | 35, 2_24, Take_bread
37 | 36, 7_6, Close_condiment:container
38 | 37, 1_25, Open_fridge:drawer
39 | 38, 9_26, Wash_hand
40 | 39, 5_3, Put_tomato
41 | 40, 2_27, Take_seasoning:container
42 | 41, 2_28, Take_cup
43 | 42, 12_21, Divide/Pull Apart_lettuce
44 | 43, 5_28, Put_cup
45 | 44, 2_23, Take_pot
46 | 45, 13_29, Clean/Wipe_counter
47 | 46, 2_30, Take_bread:container
48 | 47, 2_3, Take_tomato
49 | 48, 2_20, Take_pan
50 | 49, 8_20, Move Around_pan
51 | 50, 9_31, Wash_cutting:board
52 | 51, 5_30, Put_bread:container
53 | 52, 2_32, Take_sponge
54 | 53, 2_21, Take_lettuce
55 | 54, 2_10, Take_onion
56 | 55, 5_32, Put_sponge
57 | 56, 12_17, Divide/Pull Apart_paper:towel
58 | 57, 1_33, Open_dishwasher
59 | 58, 2_34, Take_cheese:container
60 | 59, 2_35, Take_oil:container
61 | 60, 5_27, Put_seasoning:container
62 | 61, 2_7, Take_cucumber
63 | 62, 9_20, Wash_pan
64 | 63, 2_15, Take_bell:pepper
65 | 64, 12_10, Divide/Pull Apart_onion
66 | 65, 5_31, Put_cutting:board
67 | 66, 14_36, Mix_mixture
68 | 67, 2_37, Take_tomato:container
69 | 68, 5_38, Put_cheese
70 | 69, 8_2, Move Around_eating:utensil
71 | 70, 5_15, Put_bell:pepper
72 | 71, 15_39, Pour_oil
73 | 72, 2_40, Take_pasta:container
74 | 73, 3_21, Cut_lettuce
75 | 74, 5_37, Put_tomato:container
76 | 75, 9_13, Wash_bowl
77 | 76, 3_41, Cut_olive
78 | 77, 7_11, Close_drawer
79 | 78, 15_19, Pour_condiment
80 | 79, 9_23, Wash_pot
81 | 80, 14_42, Mix_pasta
82 | 81, 1_30, Open_bread:container
83 | 82, 2_43, Take_grocery:bag
84 | 83, 2_38, Take_cheese
85 | 84, 15_44, Pour_seasoning
86 | 85, 14_45, Mix_egg
87 | 86, 15_46, Pour_water
88 | 87, 5_17, Put_paper:towel
89 | 88, 5_7, Put_cucumber
90 | 89, 16_47, Compress_sandwich
91 | 90, 5_34, Put_cheese:container
92 | 91, 5_10, Put_onion
93 | 92, 17_45, Crack_egg
94 | 93, 2_31, Take_cutting:board
95 | 94, 1_35, Open_oil:container
96 | 95, 18_48, Squeeze_washing:liquid
97 | 96, 6_49, Operate_microwave
98 | 97, 7_25, Close_fridge:drawer
99 | 98, 9_50, Wash_strainer
100 | 99, 8_13, Move Around_bowl
101 | 100, 8_23, Move Around_pot
102 | 101, 5_43, Put_grocery:bag
103 | 102, 2_45, Take_egg
104 | 103, 1_34, Open_cheese:container
105 | 104, 7_35, Close_oil:container
106 | 105, 5_35, Put_oil:container
107 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 |
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 |
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 |
119 | # SageMath parsed files
120 | *.sage.py
121 |
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 |
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 |
135 | # Rope project settings
136 | .ropeproject
137 |
138 | # mkdocs documentation
139 | /site
140 |
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 |
146 | # Pyre type checker
147 | .pyre/
148 |
149 | # pytype static type analyzer
150 | .pytype/
151 |
152 | # Cython debug symbols
153 | cython_debug/
154 |
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. As the more
159 | # nuclear option, the entire .idea folder is ignored below.
160 | .idea/
161 |
162 | outputs/
163 |
--------------------------------------------------------------------------------
/models/feature_mapping.py:
--------------------------------------------------------------------------------
1 | """Implementation of different projection functions that map feature vectors with different sizes to a common size"""
2 |
3 | import torch
4 | from torch import nn as nn
5 | from functools import partial
6 | from torch.nn import functional as F
7 |
8 |
9 | class GatedEmbeddingUnit(nn.Module):
10 | def __init__(self, input_dimension, output_dimension):
11 | super().__init__()
12 | self.fc = nn.Linear(input_dimension, output_dimension)
13 | self.cg = ContextGating(output_dimension)
14 |
15 | def forward(self, x):
16 | x = self.fc(x)
17 | x = self.cg(x)
18 | return x
19 |
20 |
21 | class ContextGating(nn.Module):
22 | def __init__(self, dimension):
23 | super().__init__()
24 | self.fc = nn.Linear(dimension, dimension)
25 |
26 | def forward(self, x):
27 | x1 = self.fc(x)
28 | x = torch.cat((x, x1), 1)
29 | return F.glu(x, 1)
30 |
31 |
32 | class GatedLinear(nn.Module):
33 | def __init__(self, in_features, out_features, use_layernorm: bool = True):
34 | super().__init__()
35 |
36 | tmp = [nn.Linear(in_features, out_features), ContextGating(out_features)]
37 |
38 | if use_layernorm:
39 | norm_layer = partial(nn.LayerNorm, eps=1e-6)
40 | tmp.append(norm_layer(out_features))
41 |
42 | layers = tmp  # deprecated variant: tmp if in_features != out_features else [nn.Identity()]
43 |
44 | self.mapping = nn.Sequential(*layers)
45 | self.use_layernorm = use_layernorm
46 |
47 | def forward(self, x):
48 | return self.mapping(x)
49 |
50 | def __str__(self):
51 | return f'Gated linear mapping layer with use_layernorm: {self.use_layernorm}'
52 |
53 |
54 | class Linear(nn.Module):
55 | """Implements the linear feature mapping layer"""
56 | def __init__(self, in_features, out_features, use_layernorm: bool = False, sparse_mapping=True):
57 | super().__init__()
58 |
59 | if sparse_mapping:
60 | layers = [nn.Linear(in_features, out_features, bias=False)
61 | if in_features != out_features else nn.Identity()]
62 | else:
63 | layers = [nn.Linear(in_features, out_features, bias=False)]
64 |
65 | if use_layernorm:
66 | norm_layer = partial(nn.LayerNorm, eps=1e-6)
67 | layers.append(norm_layer(out_features))
68 |
69 | self.mapping = nn.Sequential(*layers)
70 | self.use_layernorm = use_layernorm
71 | self.sparse_mapping = sparse_mapping
72 |
73 | def forward(self, x):
74 | return self.mapping(x)
75 |
76 | def __str__(self):
77 | return f'Linear mapping layer with use_layernorm: {self.use_layernorm}, ' \
78 | f'and sparse_mapping: {self.sparse_mapping}'
79 |
80 |
81 | def get_activation_layer(name):
82 | act_layers = {
83 | 'relu': nn.ReLU(),
84 | 'gelu': nn.GELU(),
85 | 'none': nn.Identity(),
86 | }
87 | assert name in act_layers.keys(), f'{name} is not supported in {list(act_layers.keys())}.'
88 | return act_layers[name]
89 |
90 |
91 | class NonLinear(nn.Module):
92 | """Implements the non-linear feature mapping layer"""
93 | def __init__(self, in_features, out_features, use_layernorm: bool = False, activation='relu'):
94 | super().__init__()
95 |
96 | layers = [nn.Linear(in_features, out_features), get_activation_layer(activation)]
97 |
98 | if use_layernorm:
99 | norm_layer = partial(nn.LayerNorm, eps=1e-6)
100 | layers.append(norm_layer(out_features))
101 |
102 | self.mapping = nn.Sequential(*layers)
103 | self.use_layernorm = use_layernorm
104 | self.activation = activation
105 |
106 | def forward(self, x):
107 | return self.mapping(x)
108 |
109 | def __str__(self):
110 | return f'Nonlinear mapping layer with use_layernorm: {self.use_layernorm}, ' \
111 | f'and activation: {self.activation}'
112 |
--------------------------------------------------------------------------------
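Usage sketch for the mapping layers above: with `model.modal_dims` from the expts configs, 352-d object features are projected to the 1024-d common space, while same-size inputs collapse to `nn.Identity` when `sparse_mapping=True`:

```python
# Sketch: project object features (352-d) to the 1024-d common space.
import torch
from models.feature_mapping import Linear

obj_feats = torch.randn(4, 10, 352)              # (batch, time, features)
mapping = Linear(352, 1024, sparse_mapping=True)
print(mapping(obj_feats).shape)                  # torch.Size([4, 10, 1024])

same = Linear(1024, 1024, sparse_mapping=True)   # becomes nn.Identity
print(same(torch.randn(4, 10, 1024)).shape)      # torch.Size([4, 10, 1024])
```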
/models/base_model.py:
--------------------------------------------------------------------------------
1 | """Implementation of the base model framework, instantiating different backbones, fusion methods and future predictor
2 | methods using hydra.utils.instantiate"""
3 | from itertools import repeat
4 | from typing import Dict, Tuple
5 | import torch
6 | import torch.nn as nn
7 | import hydra
8 | from omegaconf import OmegaConf
9 | from common import utils
10 |
11 | CLS_MAP_PREFIX = 'cls_map_'
12 | PAST_LOGITS_PREFIX = 'past_'
13 |
14 |
15 | class BaseModel(nn.Module):
16 | def __init__(self, model_cfg: OmegaConf, num_classes: Dict[str, int],
17 | class_mappings: Dict[Tuple[str, str], torch.FloatTensor]):
18 | super().__init__()
19 | self.backbone = nn.ModuleDict()
20 |
21 | for mod, backbone_conf in model_cfg.common.backbones.items():
22 | self.backbone[mod] = hydra.utils.instantiate(backbone_conf)
23 |
24 | self.future_predictor = hydra.utils.instantiate(model_cfg.CMFP, model_cfg=model_cfg,
25 | num_classes=num_classes, _recursive_=False)
26 |
27 | # Store the class mapping as buffers
28 | for (src, dst), mapping in class_mappings.items():
29 | self.register_buffer(f'{CLS_MAP_PREFIX}{src}_{dst}', mapping)
30 |
31 | def forward_singlecrop(self, data_dict, **kwargs):
32 | """
33 | Args:
34 | data_dict: dict mapping each modality name to its feature tensor
35 | (e.g. B x #clips x C x T x H x W); kwargs carry target,
36 | target_subclips, target_subclips_ignore_index and mixup_fn.
37 | """
38 | feats_past = {}
39 | for mod, data in data_dict.items():
40 | feats = self.backbone[mod](data)
41 | # spatial mean over H, W -> (B, #clips, C, T)
42 | feats = torch.mean(feats, [-1, -2])
43 | feats = feats.permute((0, 1, 3, 2))
44 | if feats.ndim == 4:
45 | feats = torch.flatten(feats, 1, 2) # BxTxF, T=10
46 | feats_past[mod] = feats
47 |
48 | target = kwargs['target']
49 | target_subclips = kwargs['target_subclips']
50 | target_subclips_ignore_index = kwargs['target_subclips_ignore_index']
51 |
52 | # Mixup the backbone outputs if required
53 | if kwargs['mixup_fn'] is not None:
54 | mixup_fn = kwargs['mixup_fn']
55 | feats_past, target, target_subclips, target_subclips_ignore_index = \
56 | mixup_fn(feats_past, target, target_subclips)
57 |
58 | # Future prediction
59 | outputs = self.future_predictor(feats_past)
60 | outputs_target = {
61 | 'target': target,
62 | 'target_subclips': target_subclips,
63 | 'target_subclips_ignore_index': target_subclips_ignore_index
64 | }
65 |
66 | return outputs, outputs_target
67 |
68 | def forward(self, video_data, *args, **kwargs):
69 | """
70 | Args: video_data (dict of modality name -> torch.Tensor)
71 | Each tensor could be (B, #clips, C, T, H, W) or
72 | (B, #clips, #crops, C, T, H, W)
73 | Returns:
74 | Crop-averaged outputs and the corresponding targets
75 | """
76 | for mod, data in video_data.items():
77 | if data.ndim == 6:
78 | video_data[mod] = [data]
79 | elif data.ndim == 7 and data.size(2) == 1:
80 | video_data[mod] = [data.squeeze(2)]
81 | elif data.ndim == 7:
82 | video_data[mod] = torch.unbind(data, dim=2)
83 | else:
84 | raise NotImplementedError('Unsupported size %s' % data.shape)
85 |
86 | all_mods = sorted(list(video_data.keys()))
87 | all_data = [video_data[mod] for mod in all_mods]
88 | num_crops = max([len(sl) for sl in all_data])
89 | all_data = [sl * (num_crops // len(sl)) for sl in all_data]
90 | all_crops = list(zip(*all_data))
91 |
92 | video_data = [{m: c for m, c in zip(mods, crops)} for mods, crops in zip(repeat(all_mods), all_crops)]
93 |
94 | feats = [self.forward_singlecrop(el, *args, **kwargs) for el in video_data]
95 |
96 | # Mixup is only applied during training, and training uses a single crop,
97 | # so it is safe to take the targets from index 0 here
98 | output_targets = feats[0][1]
99 |
100 | # convert to dicts of lists
101 | feats_merged = {}
102 | for out_dict, _ in feats:
103 | for key in out_dict:
104 | if key not in feats_merged:
105 | feats_merged[key] = {k: [v] for k, v in out_dict[key].items()}
106 | else:
107 | for k, v in feats_merged[key].items():
108 | v.append(out_dict[key][k])
109 |
110 | # Average over the crops
111 | for out_key in feats_merged:
112 | if out_key == 'attentions':
113 | # select the attentions from the first crop; attention analysis uses a single crop
114 | feats_merged[out_key] = {k: el[0] for k, el in feats_merged[out_key].items()}
115 | continue
116 | feats_merged[out_key] = {k: torch.mean(torch.stack(el, dim=0), dim=0) for k, el in
117 | feats_merged[out_key].items()}
118 |
119 | return feats_merged, output_targets
120 |
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | name: afft
2 | channels:
3 | - pytorch
4 | - conda-forge
5 | - defaults
6 | dependencies:
7 | - _libgcc_mutex=0.1=main
8 | - _openmp_mutex=4.5=1_gnu
9 | - blas=1.0=mkl
10 | - bzip2=1.0.8=h7b6447c_0
11 | - ca-certificates=2021.10.8=ha878542_0
12 | - certifi=2021.10.8=py37h89c1867_2
13 | - colorama=0.4.4=pyh9f0ad1d_0
14 | - cudatoolkit=11.3.1=h2bc3f7f_2
15 | - ffmpeg=4.3=hf484d3e_0
16 | - freetype=2.11.0=h70c0345_0
17 | - giflib=5.2.1=h7b6447c_0
18 | - gmp=6.2.1=h2531618_2
19 | - gnutls=3.6.15=he1e5248_0
20 | - intel-openmp=2021.4.0=h06a4308_3561
21 | - jpeg=9d=h7f8727e_0
22 | - lame=3.100=h7b6447c_0
23 | - lcms2=2.12=h3be6417_0
24 | - ld_impl_linux-64=2.35.1=h7274673_9
25 | - libffi=3.3=he6710b0_2
26 | - libgcc-ng=9.3.0=h5101ec6_17
27 | - libgomp=9.3.0=h5101ec6_17
28 | - libiconv=1.15=h63c8f33_5
29 | - libidn2=2.3.2=h7f8727e_0
30 | - libpng=1.6.37=hbc83047_0
31 | - libstdcxx-ng=9.3.0=hd4cf53a_17
32 | - libtasn1=4.16.0=h27cfd23_0
33 | - libtiff=4.2.0=h85742a9_0
34 | - libunistring=0.9.10=h27cfd23_0
35 | - libuv=1.40.0=h7b6447c_0
36 | - libwebp=1.2.0=h89dd481_0
37 | - libwebp-base=1.2.0=h27cfd23_0
38 | - lz4-c=1.9.3=h295c915_1
39 | - mkl=2021.4.0=h06a4308_640
40 | - mkl-service=2.4.0=py37h7f8727e_0
41 | - mkl_fft=1.3.1=py37hd3c417c_0
42 | - mkl_random=1.2.2=py37h51133e4_0
43 | - ncurses=6.3=h7f8727e_2
44 | - nettle=3.7.3=hbbd107a_1
45 | - numpy=1.21.2=py37h20f2e39_0
46 | - numpy-base=1.21.2=py37h79a1101_0
47 | - olefile=0.46=py37_0
48 | - openh264=2.1.1=h4ff587b_0
49 | - openssl=1.1.1n=h7f8727e_0
50 | - pillow=8.4.0=py37h5aabda8_0
51 | - pip=21.2.2=py37h06a4308_0
52 | - python=3.7.11=h12debd9_0
53 | - python_abi=3.7=2_cp37m
54 | - pytorch=1.10.1=py3.7_cuda11.3_cudnn8.2.0_0
55 | - pytorch-mutex=1.0=cuda
56 | - readline=8.1.2=h7f8727e_1
57 | - setuptools=58.0.4=py37h06a4308_0
58 | - six=1.16.0=pyhd3eb1b0_0
59 | - sqlite=3.37.0=hc218d9a_0
60 | - tk=8.6.11=h1ccaba5_0
61 | - torchaudio=0.10.1=py37_cu113
62 | - torchvision=0.11.2=py37_cu113
63 | - tqdm=4.64.0=pyhd8ed1ab_0
64 | - typing_extensions=3.10.0.2=pyh06a4308_0
65 | - wheel=0.37.1=pyhd3eb1b0_0
66 | - xz=5.2.5=h7b6447c_0
67 | - zlib=1.2.11=h7f8727e_4
68 | - zstd=1.4.9=haebb681_0
69 | - pip:
70 | - absl-py==1.0.0
71 | - aiohttp==3.8.1
72 | - aiosignal==1.2.0
73 | - antlr4-python3-runtime==4.8
74 | - async-timeout==4.0.2
75 | - asynctest==0.13.0
76 | - attrs==21.4.0
77 | - av==9.2.0
78 | - blessed==1.19.1
79 | - blessings==1.7
80 | - cached-property==1.5.2
81 | - cachetools==5.1.0
82 | - charset-normalizer==2.0.12
83 | - click==8.0.4
84 | - cloudpickle==2.0.0
85 | - cycler==0.11.0
86 | - datasets==2.3.2
87 | - decorator==4.4.2
88 | - dill==0.3.5.1
89 | - docker-pycreds==0.4.0
90 | - einops==0.4.1
91 | - filelock==3.6.0
92 | - fonttools==4.33.3
93 | - frozenlist==1.3.0
94 | - fsspec==2022.5.0
95 | - fvcore==0.1.5.post20220512
96 | - gitdb==4.0.9
97 | - gitpython==3.1.27
98 | - google-auth==2.6.6
99 | - google-auth-oauthlib==0.4.6
100 | - gpustat==0.6.0
101 | - grpcio==1.46.3
102 | - h5py==3.6.0
103 | - huggingface-hub==0.5.1
104 | - hydra-core==1.1.1
105 | - idna==3.3
106 | - imageio==2.19.2
107 | - imageio-ffmpeg==0.4.7
108 | - importlib-metadata==4.11.1
109 | - importlib-resources==5.4.0
110 | - inquirer==2.9.2
111 | - iopath==0.1.9
112 | - joblib==1.1.0
113 | - kiwisolver==1.4.2
114 | - lmdb==1.3.0
115 | - markdown==3.3.7
116 | - matplotlib==3.5.2
117 | - moviepy==1.0.3
118 | - multidict==6.0.2
119 | - multiprocess==0.70.13
120 | - munch==2.5.0
121 | - networkx==2.6.3
122 | - numpyencoder==0.3.0
123 | - nvidia-ml-py3==7.352.0
124 | - oauthlib==3.2.0
125 | - omegaconf==2.1.1
126 | - opencv-python==4.5.5.64
127 | - packaging==21.3
128 | - pandas==1.3.5
129 | - parameterized==0.8.1
130 | - pathtools==0.1.2
131 | - portalocker==2.4.0
132 | - pretrainedmodels==0.7.4
133 | - proglog==0.1.10
134 | - promise==2.3
135 | - protobuf==3.19.4
136 | - psutil==5.9.0
137 | - pyarrow==8.0.0
138 | - pyasn1==0.4.8
139 | - pyasn1-modules==0.2.8
140 | - pyparsing==3.0.8
141 | - python-dateutil==2.8.2
142 | - python-editor==1.0.4
143 | - pytorchvideo==0.1.5
144 | - pytz==2021.3
145 | - pyyaml==6.0
146 | - readchar==3.0.5
147 | - regex==2022.4.24
148 | - requests==2.27.1
149 | - requests-oauthlib==1.3.1
150 | - responses==0.18.0
151 | - rsa==4.8
152 | - sacremoses==0.0.49
153 | - scipy==1.7.3
154 | - seaborn==0.11.2
155 | - sentry-sdk==1.5.5
156 | - shortuuid==1.0.8
157 | - smmap==5.0.0
158 | - submitit==1.4.2
159 | - tabulate==0.8.9
160 | - tensorboard==2.9.0
161 | - tensorboard-data-server==0.6.1
162 | - tensorboard-plugin-wit==1.8.1
163 | - termcolor==1.1.0
164 | - timm==0.5.4
165 | - tokenizers==0.12.1
166 | - transformers==4.18.0
167 | - urllib3==1.26.8
168 | - wandb==0.12.10
169 | - wcwidth==0.2.5
170 | - werkzeug==2.1.2
171 | - wget==3.2
172 | - xxhash==3.0.0
173 | - yacs==0.1.8
174 | - yarl==1.7.2
175 | - yaspin==2.1.0
176 | - zipp==3.7.0
177 | prefix: /home/haicore-project-kit/on3546/anaconda3/envs/action
178 |
--------------------------------------------------------------------------------
/annotations/ek100_rulstm/training_videos.csv:
--------------------------------------------------------------------------------
1 | P01_01
2 | P01_02
3 | P01_03
4 | P01_04
5 | P01_05
6 | P01_06
7 | P01_07
8 | P01_08
9 | P01_09
10 | P01_102
11 | P01_103
12 | P01_104
13 | P01_105
14 | P01_106
15 | P01_107
16 | P01_108
17 | P01_109
18 | P01_10
19 | P01_16
20 | P01_17
21 | P01_18
22 | P01_19
23 | P02_01
24 | P02_02
25 | P02_03
26 | P02_04
27 | P02_05
28 | P02_06
29 | P02_07
30 | P02_08
31 | P02_09
32 | P02_101
33 | P02_102
34 | P02_103
35 | P02_104
36 | P02_105
37 | P02_107
38 | P02_108
39 | P02_109
40 | P02_10
41 | P02_110
42 | P02_111
43 | P02_112
44 | P02_113
45 | P02_114
46 | P02_115
47 | P02_116
48 | P02_118
49 | P02_119
50 | P02_11
51 | P02_120
52 | P02_121
53 | P02_122
54 | P02_123
55 | P02_124
56 | P02_126
57 | P02_127
58 | P02_128
59 | P02_129
60 | P02_130
61 | P02_131
62 | P02_132
63 | P02_133
64 | P02_134
65 | P02_135
66 | P03_02
67 | P03_03
68 | P03_04
69 | P03_05
70 | P03_06
71 | P03_07
72 | P03_08
73 | P03_09
74 | P03_101
75 | P03_102
76 | P03_106
77 | P03_107
78 | P03_108
79 | P03_109
80 | P03_10
81 | P03_110
82 | P03_111
83 | P03_112
84 | P03_113
85 | P03_114
86 | P03_115
87 | P03_116
88 | P03_117
89 | P03_118
90 | P03_119
91 | P03_11
92 | P03_120
93 | P03_121
94 | P03_122
95 | P03_123
96 | P03_12
97 | P03_13
98 | P03_14
99 | P03_15
100 | P03_16
101 | P03_17
102 | P03_18
103 | P03_19
104 | P03_20
105 | P03_27
106 | P03_28
107 | P04_01
108 | P04_02
109 | P04_03
110 | P04_04
111 | P04_05
112 | P04_06
113 | P04_07
114 | P04_08
115 | P04_09
116 | P04_101
117 | P04_102
118 | P04_103
119 | P04_104
120 | P04_106
121 | P04_107
122 | P04_108
123 | P04_109
124 | P04_10
125 | P04_110
126 | P04_111
127 | P04_112
128 | P04_113
129 | P04_114
130 | P04_115
131 | P04_116
132 | P04_117
133 | P04_118
134 | P04_119
135 | P04_11
136 | P04_120
137 | P04_121
138 | P04_12
139 | P04_13
140 | P04_14
141 | P04_15
142 | P04_16
143 | P04_17
144 | P04_18
145 | P04_19
146 | P04_20
147 | P04_21
148 | P04_22
149 | P04_23
150 | P05_01
151 | P05_02
152 | P05_03
153 | P05_04
154 | P05_05
155 | P05_06
156 | P05_08
157 | P06_01
158 | P06_02
159 | P06_03
160 | P06_05
161 | P06_07
162 | P06_08
163 | P06_09
164 | P06_101
165 | P06_102
166 | P06_103
167 | P06_104
168 | P06_105
169 | P06_106
170 | P06_107
171 | P06_108
172 | P06_109
173 | P06_110
174 | P06_113
175 | P07_01
176 | P07_02
177 | P07_03
178 | P07_04
179 | P07_05
180 | P07_06
181 | P07_07
182 | P07_08
183 | P07_09
184 | P07_101
185 | P07_102
186 | P07_103
187 | P07_106
188 | P07_107
189 | P07_10
190 | P07_110
191 | P07_111
192 | P07_112
193 | P07_113
194 | P07_114
195 | P07_115
196 | P07_116
197 | P07_117
198 | P07_11
199 | P08_01
200 | P08_02
201 | P08_03
202 | P08_04
203 | P08_05
204 | P08_06
205 | P08_07
206 | P08_08
207 | P08_11
208 | P08_12
209 | P08_13
210 | P08_18
211 | P08_19
212 | P08_20
213 | P08_21
214 | P08_22
215 | P08_23
216 | P08_24
217 | P08_25
218 | P08_26
219 | P08_27
220 | P08_28
221 | P09_01
222 | P09_02
223 | P09_03
224 | P09_04
225 | P09_05
226 | P09_06
227 | P09_103
228 | P09_104
229 | P09_105
230 | P09_106
231 | P10_01
232 | P10_02
233 | P10_04
234 | P11_01
235 | P11_02
236 | P11_03
237 | P11_04
238 | P11_05
239 | P11_06
240 | P11_07
241 | P11_08
242 | P11_09
243 | P11_101
244 | P11_102
245 | P11_103
246 | P11_104
247 | P11_105
248 | P11_107
249 | P11_109
250 | P11_10
251 | P11_11
252 | P11_12
253 | P11_13
254 | P11_14
255 | P11_15
256 | P11_16
257 | P12_01
258 | P12_02
259 | P12_04
260 | P12_05
261 | P12_06
262 | P12_07
263 | P12_101
264 | P12_103
265 | P12_104
266 | P12_105
267 | P13_04
268 | P13_05
269 | P13_06
270 | P13_07
271 | P13_08
272 | P13_09
273 | P13_10
274 | P14_01
275 | P14_02
276 | P14_03
277 | P14_04
278 | P14_05
279 | P14_07
280 | P14_09
281 | P15_01
282 | P15_02
283 | P15_03
284 | P15_07
285 | P15_08
286 | P15_09
287 | P15_10
288 | P15_11
289 | P15_12
290 | P15_13
291 | P16_01
292 | P16_02
293 | P16_03
294 | P17_01
295 | P17_03
296 | P17_04
297 | P19_01
298 | P19_02
299 | P19_03
300 | P19_04
301 | P20_01
302 | P20_02
303 | P20_03
304 | P20_04
305 | P21_01
306 | P21_03
307 | P21_04
308 | P22_05
309 | P22_06
310 | P22_07
311 | P22_08
312 | P22_09
313 | P22_101
314 | P22_102
315 | P22_103
316 | P22_104
317 | P22_105
318 | P22_106
319 | P22_107
320 | P22_108
321 | P22_109
322 | P22_10
323 | P22_110
324 | P22_111
325 | P22_112
326 | P22_113
327 | P22_115
328 | P22_116
329 | P22_117
330 | P22_11
331 | P22_12
332 | P22_13
333 | P22_14
334 | P22_15
335 | P22_16
336 | P22_17
337 | P23_01
338 | P23_02
339 | P23_03
340 | P23_04
341 | P23_101
342 | P23_102
343 | P24_01
344 | P24_02
345 | P24_03
346 | P24_04
347 | P24_05
348 | P24_06
349 | P24_07
350 | P24_08
351 | P25_01
352 | P25_02
353 | P25_03
354 | P25_04
355 | P25_05
356 | P25_09
357 | P25_101
358 | P25_102
359 | P25_103
360 | P25_104
361 | P25_106
362 | P25_107
363 | P25_10
364 | P25_11
365 | P25_12
366 | P26_01
367 | P26_02
368 | P26_03
369 | P26_04
370 | P26_05
371 | P26_06
372 | P26_07
373 | P26_08
374 | P26_09
375 | P26_101
376 | P26_102
377 | P26_103
378 | P26_104
379 | P26_105
380 | P26_106
381 | P26_107
382 | P26_108
383 | P26_109
384 | P26_10
385 | P26_110
386 | P26_111
387 | P26_112
388 | P26_113
389 | P26_114
390 | P26_115
391 | P26_116
392 | P26_117
393 | P26_118
394 | P26_119
395 | P26_11
396 | P26_124
397 | P26_12
398 | P26_13
399 | P26_14
400 | P26_15
401 | P26_16
402 | P26_17
403 | P26_18
404 | P26_19
405 | P26_20
406 | P26_21
407 | P26_22
408 | P26_23
409 | P26_24
410 | P26_25
411 | P26_26
412 | P26_27
413 | P26_28
414 | P26_29
415 | P27_01
416 | P27_02
417 | P27_03
418 | P27_04
419 | P27_06
420 | P27_07
421 | P27_101
422 | P27_103
423 | P27_104
424 | P27_105
425 | P28_01
426 | P28_02
427 | P28_03
428 | P28_04
429 | P28_05
430 | P28_06
431 | P28_07
432 | P28_08
433 | P28_09
434 | P28_101
435 | P28_102
436 | P28_103
437 | P28_104
438 | P28_105
439 | P28_106
440 | P28_107
441 | P28_108
442 | P28_109
443 | P28_10
444 | P28_110
445 | P28_111
446 | P28_112
447 | P28_113
448 | P28_11
449 | P28_12
450 | P28_13
451 | P28_14
452 | P29_01
453 | P29_02
454 | P29_03
455 | P29_04
456 | P30_01
457 | P30_02
458 | P30_03
459 | P30_04
460 | P30_05
461 | P30_06
462 | P30_101
463 | P30_103
464 | P30_104
465 | P30_107
466 | P30_108
467 | P30_109
468 | P30_10
469 | P30_110
470 | P30_111
471 | P30_112
472 | P30_113
473 | P30_114
474 | P30_11
475 | P31_01
476 | P31_02
477 | P31_03
478 | P31_04
479 | P31_05
480 | P31_06
481 | P31_07
482 | P31_08
483 | P31_09
484 | P31_13
485 | P31_14
486 | P35_101
487 | P35_103
488 | P35_104
489 | P35_105
490 | P35_107
491 | P35_108
492 | P35_109
493 | P37_101
494 | P37_102
495 | P37_103
496 |
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import torch
3 | from torch import nn
4 | from torch.utils.data import DataLoader
5 | from tqdm import tqdm
6 | import hydra
7 | from omegaconf import OmegaConf, DictConfig, ListConfig
8 | import numpy as np
9 | import os
10 | import h5py
11 | from collections import defaultdict
12 |
13 | from models.base_model import BaseModel
14 | from datasets.data import get_dataset
15 | from train import get_transform_val, init_model
16 | from challenge import marginalize_verb_noun, print_accuracies_epic, LOGITS_DIR
17 | from train import DATASET_EVAL_CFG_KEY
18 |
19 |
20 | def store_append_h5(endpoints, output_dir, save_file_name):
21 | output_fpath = os.path.join(output_dir, save_file_name)
22 | os.makedirs(output_dir, exist_ok=True)
23 | with h5py.File(output_fpath, 'a') as fout:
24 | for key, val in endpoints.items():
25 | if key not in fout:
26 | fout.create_dataset(key, data=val, compression='gzip', compression_opts=9,
27 | chunks=True, maxshape=(None, ) + val.shape[1:])
28 | else:
29 | fout[key].resize((fout[key].shape[0] + val.shape[0], ) + val.shape[1:])
30 | fout[key][-val.shape[0]:, ...] = val
31 |
32 |
33 | def save_logits(model, data_loader: DataLoader, device, logger, save_dir=None, save_file_name=None):
34 | """Saves logits to given path, so that the logits can be used for ensemble or any other analysis"""
35 | # construct kwargs for forwarding
36 | kwargs = {}
37 | kwargs['mixup_fn'] = None
38 | kwargs['target'] = None
39 | kwargs['target_subclips'] = None
40 | kwargs['target_subclips_ignore_index'] = None
41 |
42 | for idx, data in enumerate(tqdm(data_loader)):
43 | data, _ = data
44 | feature_dict = {mod: tens.to(device, non_blocking=True) for mod, tens in data["data_dict"].items()}
45 | outputs, outputs_target = model(feature_dict, **kwargs)
46 |
47 | logits_key = 'logits/action'
48 |
49 | logits = {}
50 | if len(outputs[logits_key]) == 1: # single modality or early fusion model
51 | modk = next(iter(outputs[logits_key].keys()))
52 | logits[f'{logits_key}_{modk}'] = outputs[f'{logits_key}'][modk][:, 0, :].detach().cpu().numpy()
53 | else:
54 | fusion_key = 'all-fused'
55 | logging.info(f'This model consists of multiple branches. '
56 | f'Saving fusion branch "{fusion_key}" only ...')
57 | logits[f'{logits_key}_{fusion_key}'] = \
58 | outputs[f'{logits_key}'][fusion_key][:, 0, :].detach().cpu().numpy()
59 |
60 | store_append_h5(logits, save_dir, save_file_name)
61 | logger.info(f'Saved logits {logits.keys()} as {save_file_name} to {save_dir}.')
62 |
63 |
64 | def evaluate(model, dataset, data_loader: DataLoader, device):
65 | """
66 | Computes verb, noun and action performance on the overall, unseen and tail splits
67 | """
68 | logits_key = 'logits/action'
69 | logits = defaultdict(list)
70 |
71 | # construct kwargs for forwarding
72 | kwargs = {}
73 | kwargs['mixup_fn'] = None
74 | kwargs['target'] = None
75 | kwargs['target_subclips'] = None
76 | kwargs['target_subclips_ignore_index'] = None
77 |
78 | # forwarding
79 | for idx, data in enumerate(tqdm(data_loader)):
80 | data, _ = data
81 | feature_dict = {mod: tens.to(device, non_blocking=True) for mod, tens in data["data_dict"].items()}
82 | outputs, outputs_target = model(feature_dict, **kwargs)
83 |
84 | if len(outputs[logits_key]) == 1: # single modality or early fusion model
85 | modk = next(iter(outputs[logits_key].keys()))
86 | logits[f'{logits_key}_{modk}'].append(outputs[f'{logits_key}'][modk][:, 0, :].detach().cpu().numpy())
87 | else:
88 | fusion_key = 'all-fused'
89 | logging.info(f'This model consists of multiple branches. '
90 | f'Evaluating fusion branch "{fusion_key}" only ...')
91 | logits[f'{logits_key}_{fusion_key}'].append(
92 | outputs[f'{logits_key}'][fusion_key][:, 0, :].detach().cpu().numpy())
93 |
94 | # only one logits entry is collected above, so take that single value
95 | logits_array = np.concatenate(next(iter(logits.values())), axis=0)
96 |
97 | accs, scores = marginalize_verb_noun(logits_array, dataset, to_prob=True, compute_manyshot_unseen_tail=True)
98 | print_accuracies_epic(accs)
99 |
100 |
101 | @hydra.main(config_path="conf", config_name="config")
102 | def main(cfg: DictConfig):
103 | print(OmegaConf.to_yaml(cfg))
104 | logger = logging.getLogger(__name__)
105 |
106 | device = torch.device('cuda')
107 | transform_val = get_transform_val(cfg)
108 | dataset_test = get_dataset(getattr(cfg, DATASET_EVAL_CFG_KEY), cfg.data_eval, transform_val, logger)
109 | logger.info('Creating data loaders...')
110 | data_loader_test = torch.utils.data.DataLoader(
111 | dataset_test,
112 | batch_size=cfg.eval.batch_size or cfg.train.batch_size * 4,
113 | num_workers=cfg.workers,
114 | pin_memory=True,
115 | shuffle=False
116 | )
117 |
118 | num_classes = {key: len(val) for key, val in dataset_test.classes.items()}
119 | model = BaseModel(cfg.model, num_classes=num_classes, class_mappings=dataset_test.class_mappings)
120 |
121 | # load pretrained weights
122 | assert cfg.init_from_model is not None, 'Checkpoint is required for test.'
123 | ckpt_paths = cfg.init_from_model
124 | if not isinstance(ckpt_paths, ListConfig):
125 | ckpt_paths = [ckpt_paths]
126 | ckpt_paths = [os.path.join(cfg.cwd, 'checkpoints', path) for path in ckpt_paths]
127 | modules_to_keep = None
128 | _ = init_model(model, ckpt_paths, modules_to_keep, logger)
129 |
130 | model = nn.DataParallel(model, device_ids=range(cfg.num_gpus))
131 | model = model.to(device) # Sends model to device 0, other gpus are used automatically.
132 |
133 | # test
134 | model.eval()
135 | with torch.no_grad():
136 | if 'save_name' in cfg:
137 | save_dir = os.path.join(cfg.cwd, LOGITS_DIR, cfg.init_from_model.split('/')[0])
138 | save_logits(model, data_loader_test, device, logger, save_dir, cfg.save_name)
139 | else:
140 | evaluate(model, dataset_test, data_loader_test, device)
141 |
142 |
143 | if __name__ == '__main__':
144 | main()
145 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Anticipative Feature Fusion Transformer for Multi-Modal Action Anticipation (WACV 2023)
2 | [](https://paperswithcode.com/sota/action-anticipation-on-epic-kitchens-100?p=anticipative-feature-fusion-transformer-for)
3 |
4 | This repository contains the official source code and data for
5 | our [AFFT](https://arxiv.org/abs/2210.12649) paper.
6 | If you find our code or paper useful, please consider citing:
7 |
8 | Z. Zhong, D. Schneider, M. Voit, R. Stiefelhagen and J. Beyerer.
9 | Anticipative Feature Fusion Transformer for Multi-Modal Action Anticipation.
10 | In *WACV*, 2023.
11 |
12 | ```bibtex
13 | @InProceedings{Zhong_2023_WACV,
14 | author = {Zhong, Zeyun and Schneider, David and Voit, Michael and Stiefelhagen, Rainer and Beyerer, J\"urgen},
15 | title = {Anticipative Feature Fusion Transformer for Multi-Modal Action Anticipation},
16 | booktitle = {Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision (WACV)},
17 | month = {January},
18 | year = {2023},
19 | pages = {6068-6077}
20 | }
21 | ```
22 |
23 |
25 |