├── __init__.py ├── models ├── __init__.py ├── classifiers.py ├── temporal_aggregation.py └── video_classification.py ├── datasets ├── __init__.py ├── data.py └── reader_fns.py ├── external └── __init__.py ├── loss_fn ├── __init__.py ├── mse.py ├── multidim_xentropy.py └── simclr_infonce.py ├── common ├── __init__.py ├── cluster.py ├── sampler.py ├── scheduler.py ├── utils.py └── log.py ├── func ├── __init__.py └── train_eval_ops.py ├── .style.yapf ├── conf ├── model │ ├── temporal_aggregator_after_future_pred │ ├── classifier │ │ ├── linear.yaml │ │ └── mlp.yaml │ ├── backbone │ │ ├── r3d_18.yaml │ │ ├── avt_b.yaml │ │ ├── bn_inception.yaml │ │ ├── avt_b_in21k.yaml │ │ ├── r2plus1d_34.yaml │ │ ├── identity.yaml │ │ └── r2plus1d_18.yaml │ ├── future_predictor │ │ ├── avth.yaml │ │ ├── identity.yaml │ │ └── mlp.yaml │ └── temporal_aggregator │ │ ├── mean.yaml │ │ ├── identity.yaml │ │ ├── transformer.yaml │ │ └── rulstm.yaml ├── opt │ ├── optimizer │ │ ├── adam.yaml │ │ ├── adamW.yaml │ │ ├── adafactor.yaml │ │ └── sgd.yaml │ └── scheduler │ │ ├── cosine.yaml │ │ ├── reduce_lr_on_plateau.yaml │ │ └── warmup_multi_step.yaml ├── train_eval_op │ ├── cls_loss_acc_fn │ │ ├── no.yaml │ │ └── basic.yaml │ ├── basic.yaml │ ├── reg_criterion │ │ ├── mse.yaml │ │ └── simclr_infonce.yaml │ └── pred_future_feat.yaml ├── dataset │ ├── dundee50salads │ │ ├── annot_reader_fn │ │ │ ├── orig.yaml │ │ │ └── abu_farha.yaml │ │ ├── anticipation_train.yaml │ │ ├── anticipation_val.yaml │ │ └── common.yaml │ ├── egtea │ │ ├── common.yaml │ │ ├── anticipation_train.yaml │ │ └── anticipation_val.yaml │ ├── epic_kitchens │ │ ├── common.yaml │ │ ├── anticipation_test_s1.yaml │ │ ├── anticipation_test_s2.yaml │ │ ├── anticipation_train.yaml │ │ ├── anticipation_val.yaml │ │ └── anticipation_train_minus_val.yaml │ └── epic_kitchens100 │ │ ├── common.yaml │ │ ├── anticipation_test.yaml │ │ ├── anticipation_train.yaml │ │ ├── anticipation_val.yaml │ │ └── anticipation_train+val.yaml ├── data │ └── default.yaml └── config.yaml ├── .yapfignore ├── .gitignore ├── .gitmodules ├── sample_scripts └── resize_epic_256px.sh ├── docs ├── DATASETS.md └── MODELS.md ├── CONTRIBUTING.md ├── train_net.py ├── expts ├── 05_ek100_rustm_test_testonly.txt ├── 08_ek55_avt_tsn.txt ├── 10_ek55_avt_ig65m.txt ├── 10_ek55_avt_ig65m_forAR.txt ├── 08_ek55_avt_tsn_forAR.txt ├── 02_ek100_avt_tsn.txt ├── 03_ek100_avt_tsn_obj.txt ├── 02_ek100_avt_tsn_test_trainval.txt ├── 03_ek100_avt_tsn_obj_test_trainval.txt ├── 04_ek100_avt_ig65m_test_trainval.txt ├── 11_egtea_avt_tsn.txt ├── 02_ek100_avt_tsn_test_testonly.txt ├── 06_ek100_avt_tsnflow.txt ├── 06_ek100_avt_tsnflow_test_trainval.txt ├── 04_ek100_avt_ig65m.txt ├── 13_50s_avt.txt ├── 09_ek55_avt.txt ├── 09_ek55_avt_forAR.txt ├── 06_ek100_avt_tsnflow_test_testonly.txt ├── 04_ek100_avt_ig65m_test_testonly.txt ├── 07_ek100_avt_longer.txt ├── 07_ek100_avt_longer_test_testonly.txt ├── 07_ek100_avt_longer_test_trainval.txt ├── 01_ek100_avt.txt ├── 01_ek100_avt_test_testonly.txt ├── 01_ek100_avt_test_trainval.txt └── 12_egtea_avt.txt ├── CODE_OF_CONDUCT.md ├── env.yaml ├── README.md └── LICENSE /__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /datasets/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /external/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /loss_fn/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /common/__init__.py: -------------------------------------------------------------------------------- 1 | from .log import * 2 | -------------------------------------------------------------------------------- /func/__init__.py: -------------------------------------------------------------------------------- 1 | from . import train 2 | -------------------------------------------------------------------------------- /.style.yapf: -------------------------------------------------------------------------------- 1 | [style] 2 | based_on_style = google 3 | -------------------------------------------------------------------------------- /conf/model/temporal_aggregator_after_future_pred: -------------------------------------------------------------------------------- 1 | temporal_aggregator/ -------------------------------------------------------------------------------- /conf/opt/optimizer/adam.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | _target_: torch.optim.Adam 4 | -------------------------------------------------------------------------------- /conf/opt/optimizer/adamW.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | _target_: torch.optim.AdamW 4 | -------------------------------------------------------------------------------- /conf/train_eval_op/cls_loss_acc_fn/no.yaml: -------------------------------------------------------------------------------- 1 | _target_: func.train_eval_ops.NoLossAccuracy 2 | -------------------------------------------------------------------------------- /.yapfignore: -------------------------------------------------------------------------------- 1 | src/python/phyre/interface/* 2 | src/python/build/* 3 | src/viz/mpde_modules/* 4 | -------------------------------------------------------------------------------- /conf/train_eval_op/basic.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | _target_: func.train_eval_ops.Basic 4 | -------------------------------------------------------------------------------- /conf/train_eval_op/reg_criterion/mse.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | _target_: torch.nn.MSELoss 4 | -------------------------------------------------------------------------------- /conf/model/classifier/linear.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | _target_: torch.nn.Linear 4 | bias: true 5 | -------------------------------------------------------------------------------- /conf/model/backbone/r3d_18.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | _target_: torchvision.models.video.resnet.r3d_18 4 | -------------------------------------------------------------------------------- 
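
The conf/ tree above is organized as Hydra config groups: each YAML names a class or function in `_target_` plus its default keyword arguments. As a minimal illustrative sketch (not code from this repo), assuming the training code relies on Hydra's standard `instantiate` utility, a config in the style of `conf/opt/optimizer/adam.yaml` or `conf/model/classifier/linear.yaml` can be turned into an object like this, with the remaining arguments (model parameters, learning rate, feature sizes) supplied at call time:

```python
# Sketch only: shows how `_target_`-style configs such as the ones above are
# commonly instantiated with Hydra. The extra kwargs passed here (params, lr,
# in_features, out_features) are assumptions about what the training code
# would supply at runtime, not values taken from the repo.
import torch.nn as nn
from hydra.utils import instantiate
from omegaconf import OmegaConf

opt_cfg = OmegaConf.create({'_target_': 'torch.optim.Adam'})        # adam.yaml
clf_cfg = OmegaConf.create({'_target_': 'torch.nn.Linear', 'bias': True})  # linear.yaml

classifier = instantiate(clf_cfg, in_features=2048, out_features=100)
optimizer = instantiate(opt_cfg, params=classifier.parameters(), lr=1e-4)
print(type(classifier), type(optimizer))
```
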
/conf/model/future_predictor/avth.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | _target_: models.future_prediction.AVTh 4 | -------------------------------------------------------------------------------- /conf/model/future_predictor/identity.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | _target_: models.future_prediction.Identity 4 | -------------------------------------------------------------------------------- /conf/model/temporal_aggregator/mean.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | _target_: models.temporal_aggregation.Mean 4 | -------------------------------------------------------------------------------- /conf/model/temporal_aggregator/identity.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | _target_: models.temporal_aggregation.Identity 4 | -------------------------------------------------------------------------------- /conf/opt/optimizer/adafactor.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | _target_: transformers.Adafactor 4 | warmup_init: false 5 | -------------------------------------------------------------------------------- /conf/opt/optimizer/sgd.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | _target_: torch.optim.SGD 4 | momentum: 0.9 5 | nesterov: false 6 | -------------------------------------------------------------------------------- /conf/model/classifier/mlp.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | _target_: models.classifiers.MLP 4 | nlayers: 2 5 | bias: true 6 | -------------------------------------------------------------------------------- /conf/model/future_predictor/mlp.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | _target_: models.future_prediction.MLP 4 | num_layers: 2 5 | -------------------------------------------------------------------------------- /conf/model/temporal_aggregator/transformer.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | _target_: models.temporal_aggregation.Transformer 4 | -------------------------------------------------------------------------------- /conf/model/backbone/avt_b.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | _target_: models.video_classification.TIMMModel 3 | model_type: vit_base_patch16_224 4 | -------------------------------------------------------------------------------- /conf/model/backbone/bn_inception.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | _target_: models.video_classification.BNInceptionVideo 3 | pretrained: null 4 | -------------------------------------------------------------------------------- /conf/train_eval_op/cls_loss_acc_fn/basic.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | _target_: func.train_eval_ops.BasicLossAccuracy 4 | balance_classes: false 5 | -------------------------------------------------------------------------------- 
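
`conf/model/classifier/mlp.yaml` above points at `models.classifiers.MLP` (its definition appears later in this dump under models/classifiers.py): a stack of Linear+ReLU layers followed by a final Linear projection. A minimal usage sketch, with illustrative feature sizes rather than values taken from the repo's configs:

```python
# Usage sketch for models.classifiers.MLP (defined under models/classifiers.py
# further down). nlayers=2 and bias=True mirror conf/model/classifier/mlp.yaml;
# the input/output dimensions below are illustrative only.
import torch
from models.classifiers import MLP

clf = MLP(in_features=2048, out_features=1000, nlayers=2, bias=True)
logits = clf(torch.randn(8, 2048))  # -> shape (8, 1000)
```
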
/conf/model/backbone/avt_b_in21k.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | _target_: models.video_classification.TIMMModel 3 | model_type: vit_base_patch16_224_in21k 4 | -------------------------------------------------------------------------------- /conf/model/backbone/r2plus1d_34.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | _target_: vmz.models.r2plus1d_34 3 | pretraining: "" 4 | use_pool1: false 5 | num_classes: null # Will be set in the code based on dataset 6 | -------------------------------------------------------------------------------- /conf/opt/scheduler/cosine.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | _target_: common.scheduler.CosineLR 4 | num_epochs: ${minus:${train.num_epochs},${opt.warmup.num_epochs}} 5 | eta_min: 0.0 # Min LR (default) 6 | -------------------------------------------------------------------------------- /conf/model/backbone/identity.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | # This backbone will just pass the input as output, useful when the 3 | # input is not video but pretrained features 4 | _target_: torch.nn.Identity 5 | -------------------------------------------------------------------------------- /conf/model/backbone/r2plus1d_18.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | _target_: torchvision.models.video.r2plus1d_18 3 | pretrained: false 4 | progress: False 5 | num_classes: null # Will be set in the code based on dataset 6 | -------------------------------------------------------------------------------- /conf/train_eval_op/reg_criterion/simclr_infonce.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | _target_: loss_fn.simclr_infonce.DistributedSimclrInfoNCELoss 4 | temperature: 0.1 5 | target_to_output_loss: true 6 | mil_type: sum 7 | -------------------------------------------------------------------------------- /conf/dataset/dundee50salads/annot_reader_fn/orig.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | _target_: datasets.breakfast_50salads.read_orig_50salads_annotations 4 | annots_dir: ${dataset.dundee50salads.common.annots_dir} 5 | timestamps_dir: ${dataset.dundee50salads.common.timestamps_dir} 6 | -------------------------------------------------------------------------------- /conf/dataset/egtea/common.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_._name_ 2 | 3 | version: -1 4 | # RULSTM feats dirs 5 | rulstm_feats_dir: ${cwd}/DATA/external/rulstm/RULSTM/egtea/ 6 | rulstm_annot_dir: ${cwd}/external/rulstm/RULSTM/data/egtea/ 7 | label_type: action 8 | tau_a: 1.0 9 | tau_o: 2.5 10 | split: 1 11 | modality: rgb 12 | -------------------------------------------------------------------------------- /conf/opt/scheduler/reduce_lr_on_plateau.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | _target_: common.scheduler.ReduceLROnPlateau 4 | mode: "max" # Since I pass in the validation accuracy (top 1) 5 | patience: 10 6 | threshold: 1 # If the val acc stuck within 1% 7 | threshold_mode: "abs" 8 | cooldown: 3 9 | min_lr: 0.0000001 
10 | -------------------------------------------------------------------------------- /conf/dataset/dundee50salads/annot_reader_fn/abu_farha.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | _target_: datasets.breakfast_50salads.read_abu_farha_annotations 4 | annots_dir: ${dataset.dundee50salads.common.annots_dir_abu_farha} 5 | bundle_entry_to_vname_fn: 6 | _target_: datasets.breakfast_50salads.bundle_entry_to_video_fname_50salads 7 | -------------------------------------------------------------------------------- /conf/opt/scheduler/warmup_multi_step.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | _target_: common.scheduler.WarmupMultiStepLR 4 | # Decrease lr on milestones 5 | milestone_epochs: [20, 30, 40] 6 | # Decrease LR by this factor 7 | gamma: 0.1 8 | warmup_factor: 0.00001 9 | warmup_epochs: 10 10 | warmup_method: "linear" 11 | last_epoch: -1 12 | -------------------------------------------------------------------------------- /conf/train_eval_op/pred_future_feat.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | _target_: func.train_eval_ops.PredFutureFeat 4 | reg_criterion: ${train_eval_op.pred_future_feat.reg_criterion} 5 | future_target: temp_agg_projected 6 | incur_loss_style: separately 7 | combine_future_losses: 8 | _target_: torch.min 9 | cumulative_future: false 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # General exluded files 2 | *.swp 3 | *.o 4 | __pycache__ 5 | .DS_Store 6 | .ropeproject 7 | .nfs* 8 | .vscode 9 | *.egg-info/ 10 | *.pyc 11 | *.mypy_cache 12 | 13 | # Data directories 14 | DATA 15 | OUTPUTS 16 | 17 | # python notebook related 18 | */.cph_* 19 | *.ipynb_checkpoints 20 | 21 | # other codebases 22 | external/rulstm 23 | -------------------------------------------------------------------------------- /conf/model/temporal_aggregator/rulstm.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | _target_: models.temporal_aggregation.RULSTMAggregation 4 | intermediate_featdim: ${model.intermediate_featdim} 5 | dropout: ${model.dropout} 6 | # The following may be important to set if using with pre-trained model that 7 | # was trained for diff number of unrollings 8 | num_pad_feats: 0 9 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "external/epic-kitchens-100-annotations"] 2 | path = external/epic-kitchens-100-annotations 3 | url = git@github.com:epic-kitchens/epic-kitchens-100-annotations.git 4 | [submodule "external/epic-kitchens-55-annotations"] 5 | path = external/epic-kitchens-55-annotations 6 | url = git@github.com:epic-kitchens/epic-kitchens-55-annotations.git 7 | -------------------------------------------------------------------------------- /conf/dataset/epic_kitchens/common.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_._name_ 2 | 3 | version: 0.1 4 | data_dir: ${cwd}/DATA/videos/EpicKitchens/videos_ht256px 5 | annot_dir: ${cwd}/external/epic-kitchens-55-annotations/ 6 | rulstm_annot_dir: ${cwd}/external/rulstm/RULSTM/data/ek55/ 7 | 
rulstm_feats_dir: ${cwd}/DATA/external/rulstm/RULSTM/data_full/ 8 | label_type: action 9 | tau_a: 1.0 10 | tau_o: 2.5 11 | -------------------------------------------------------------------------------- /conf/dataset/epic_kitchens100/common.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_._name_ 2 | 3 | version: 0.2 4 | # I manually removed the video/ directory in between by moving 5 | # videos out of the videos/ dir 6 | data_dir_extension: ${cwd}/DATA/videos/EpicKitchens100/videos_extension_ht256px 7 | # RULSTM feats dirs 8 | rulstm_feats_dir: ${cwd}/DATA/external/rulstm/RULSTM/ek100_data_full/ 9 | annot_dir: ${cwd}/external/epic-kitchens-100-annotations/ 10 | rulstm_annot_dir: ${cwd}/external/rulstm/RULSTM/data/ek100/ 11 | label_type: action 12 | tau_a: 1.0 13 | tau_o: 2.5 14 | -------------------------------------------------------------------------------- /loss_fn/mse.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | """Variants of MSE loss.""" 4 | import torch.nn as nn 5 | 6 | 7 | class NormedMSE(nn.MSELoss): 8 | def forward(self, inp, tgt, *args, **kwargs): 9 | """ 10 | Args: 11 | inp: (*, C) 12 | tgt: (*, C) 13 | Will normalize the input before the loss 14 | """ 15 | inp = nn.functional.normalize(inp, dim=-1, p=2) 16 | tgt = nn.functional.normalize(tgt, dim=-1, p=2) 17 | return super().forward(inp, tgt, *args, **kwargs) 18 | -------------------------------------------------------------------------------- /conf/dataset/dundee50salads/anticipation_train.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_._name_ 2 | 3 | _target_: datasets.breakfast_50salads.Breakfast50Salads 4 | which: 50Salads 5 | root: ${dataset.dundee50salads.common.videos_dir} 6 | splits_dir: ${dataset.dundee50salads.common.splits_dir} 7 | classes_fpath: ${dataset.dundee50salads.common.classes_fpath} 8 | is_train: true 9 | fold: ${dataset.dundee50salads.common.fold} 10 | sample_strategy: last_clip 11 | annot_reader_fn: ${dataset.dundee50salads.annot_reader_fn} 12 | conv_to_anticipate_fn: 13 | _target_: datasets.base_video_dataset.convert_to_anticipation 14 | tau_a: 1.0 15 | tau_o: 2.5 16 | -------------------------------------------------------------------------------- /conf/dataset/dundee50salads/anticipation_val.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_._name_ 2 | 3 | _target_: datasets.breakfast_50salads.Breakfast50Salads 4 | which: 50Salads 5 | root: ${dataset.dundee50salads.common.videos_dir} 6 | splits_dir: ${dataset.dundee50salads.common.splits_dir} 7 | classes_fpath: ${dataset.dundee50salads.common.classes_fpath} 8 | is_train: false 9 | fold: ${dataset.dundee50salads.common.fold} 10 | sample_strategy: last_clip 11 | annot_reader_fn: ${dataset.dundee50salads.annot_reader_fn} 12 | conv_to_anticipate_fn: 13 | _target_: datasets.base_video_dataset.convert_to_anticipation 14 | tau_a: 1.0 15 | tau_o: 2.5 16 | -------------------------------------------------------------------------------- /conf/dataset/dundee50salads/common.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_._name_ 2 | 3 | data_dir: ${cwd}/DATA/videos/50Salads/ 4 | splits_dir: ${cwd}/external/breakfast_50salad_anticipation_annotations/cvpr18_data/50s_splits/ 5 | annots_dir: 
${dataset.dundee50salads.common.data_dir}/activityAnnotations/ 6 | videos_dir: ${dataset.dundee50salads.common.data_dir}/rgb/ 7 | timestamps_dir: ${dataset.dundee50salads.common.data_dir}/timestamps/ 8 | classes_fpath: ${cwd}/external/breakfast_50salad_anticipation_annotations/annotations/50salads/mapping.txt 9 | annots_dir_abu_farha: ${cwd}/external/breakfast_50salad_anticipation_annotations/annotations/50salads/groundTruth/ 10 | fold: 1 11 | -------------------------------------------------------------------------------- /models/classifiers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | import torch.nn as nn 4 | 5 | 6 | class MLP(nn.Module): 7 | def __init__(self, in_features, out_features, nlayers, **kwargs): 8 | super().__init__() 9 | layers = [[nn.Linear(in_features, in_features, **kwargs), 10 | nn.ReLU()] for _ in range(nlayers - 1)] 11 | # flatten out the pairs 12 | layers = [item for sublist in layers for item in sublist] 13 | layers.append(nn.Linear(in_features, out_features)) 14 | self.cls = nn.Sequential(*layers) 15 | 16 | def forward(self, inp): 17 | return self.cls(inp) 18 | -------------------------------------------------------------------------------- /conf/dataset/epic_kitchens/anticipation_test_s1.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_._name_ 2 | 3 | _target_: datasets.epic_kitchens.EPICKitchens 4 | root: ${dataset.epic_kitchens.common.data_dir}/test 5 | annotation_path: 6 | - ${dataset.epic_kitchens.common.annot_dir}/EPIC_test_s1_timestamps.pkl 7 | annotation_dir: ${dataset.epic_kitchens.common.annot_dir} 8 | label_type: ${dataset.epic_kitchens.common.label_type} 9 | sample_strategy: "center_clip" 10 | action_labels_fpath: ${dataset.epic_kitchens.common.rulstm_annot_dir}/actions.csv 11 | conv_to_anticipate_fn: 12 | _target_: datasets.base_video_dataset.convert_to_anticipation 13 | tau_a: ${dataset.epic_kitchens.common.tau_a} 14 | tau_o: ${dataset.epic_kitchens.common.tau_o} 15 | -------------------------------------------------------------------------------- /conf/dataset/epic_kitchens/anticipation_test_s2.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_._name_ 2 | 3 | _target_: datasets.epic_kitchens.EPICKitchens 4 | root: ${dataset.epic_kitchens.common.data_dir}/test 5 | annotation_path: 6 | - ${dataset.epic_kitchens.common.annot_dir}/EPIC_test_s2_timestamps.pkl 7 | annotation_dir: ${dataset.epic_kitchens.common.annot_dir} 8 | label_type: ${dataset.epic_kitchens.common.label_type} 9 | sample_strategy: "center_clip" 10 | action_labels_fpath: ${dataset.epic_kitchens.common.rulstm_annot_dir}/actions.csv 11 | conv_to_anticipate_fn: 12 | _target_: datasets.base_video_dataset.convert_to_anticipation 13 | tau_a: ${dataset.epic_kitchens.common.tau_a} 14 | tau_o: ${dataset.epic_kitchens.common.tau_o} 15 | -------------------------------------------------------------------------------- /sample_scripts/resize_epic_256px.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | #!/bin/bash 4 | indir="/path/to/orig/videos/" 5 | outdir="/path/to/resulting/videos_ht256px/" 6 | 7 | cd $indir 8 | videos=$(find . 
-iname *.MP4) 9 | 10 | num_procs=32 # Run this many in parallel at max 11 | num_jobs="\j" # The prompt escape for number of jobs currently running 12 | for video in $videos; do 13 | while (( ${num_jobs@P} >= num_procs )); do 14 | wait -n 15 | done 16 | mkdir -p $(dirname ${outdir}/${video}) 17 | # from https://superuser.com/a/624564 18 | ffmpeg -y -i ${indir}/${video} -filter:v scale="trunc(oh*a/2)*2:256" -c:a copy ${outdir}/${video} & 19 | echo 'Converted ' ${video} 20 | done 21 | -------------------------------------------------------------------------------- /conf/dataset/epic_kitchens/anticipation_train.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_._name_ 2 | 3 | _target_: datasets.epic_kitchens.EPICKitchens 4 | root: ${dataset.epic_kitchens.common.data_dir}/train 5 | annotation_path: 6 | - ${dataset.epic_kitchens.common.annot_dir}/EPIC_train_action_labels.pkl 7 | annotation_dir: ${dataset.epic_kitchens.common.annot_dir} 8 | label_type: ${dataset.epic_kitchens.common.label_type} 9 | sample_strategy: "random_clip" 10 | # https://github.com/fpv-iplab/rulstm/blob/master/RULSTM/data/training_videos.csv 11 | action_labels_fpath: ${dataset.epic_kitchens.common.rulstm_annot_dir}/actions.csv 12 | conv_to_anticipate_fn: 13 | _target_: datasets.base_video_dataset.convert_to_anticipation 14 | tau_a: ${dataset.epic_kitchens.common.tau_a} 15 | tau_o: ${dataset.epic_kitchens.common.tau_o} 16 | -------------------------------------------------------------------------------- /conf/dataset/epic_kitchens/anticipation_val.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_._name_ 2 | 3 | _target_: datasets.epic_kitchens.EPICKitchens 4 | root: ${dataset.epic_kitchens.common.data_dir}/train 5 | annotation_path: 6 | - ${dataset.epic_kitchens.common.annot_dir}/EPIC_train_action_labels.pkl 7 | annotation_dir: ${dataset.epic_kitchens.common.annot_dir} 8 | label_type: ${dataset.epic_kitchens.common.label_type} 9 | sample_strategy: "center_clip" 10 | # https://github.com/fpv-iplab/rulstm/blob/master/RULSTM/data/validation_videos.csv 11 | only_keep_videos: ${dataset.epic_kitchens.common.rulstm_annot_dir}/validation_videos.csv 12 | action_labels_fpath: ${dataset.epic_kitchens.common.rulstm_annot_dir}/actions.csv 13 | conv_to_anticipate_fn: 14 | _target_: datasets.base_video_dataset.convert_to_anticipation 15 | tau_a: ${dataset.epic_kitchens.common.tau_a} 16 | tau_o: ${dataset.epic_kitchens.common.tau_o} 17 | -------------------------------------------------------------------------------- /conf/dataset/epic_kitchens100/anticipation_test.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_._name_ 2 | 3 | _target_: datasets.epic_kitchens.EPICKitchens 4 | version: ${dataset.epic_kitchens100.common.version} 5 | root: 6 | - ${dataset.epic_kitchens100.common.data_dir_extension} 7 | annotation_path: 8 | - ${dataset.epic_kitchens100.common.annot_dir}/EPIC_100_test_timestamps.pkl 9 | annotation_dir: ${dataset.epic_kitchens100.common.annot_dir} 10 | rulstm_annotation_dir: ${dataset.epic_kitchens100.common.rulstm_annot_dir} # Needed during computing final outputs to get tail classes etc. 
11 | label_type: ${dataset.epic_kitchens100.common.label_type} 12 | sample_strategy: "center_clip" 13 | action_labels_fpath: ${dataset.epic_kitchens100.common.rulstm_annot_dir}/actions.csv 14 | conv_to_anticipate_fn: 15 | _target_: datasets.base_video_dataset.convert_to_anticipation 16 | tau_a: ${dataset.epic_kitchens100.common.tau_a} 17 | tau_o: ${dataset.epic_kitchens100.common.tau_o} 18 | -------------------------------------------------------------------------------- /conf/dataset/egtea/anticipation_train.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_._name_ 2 | 3 | _target_: datasets.epic_kitchens.EPICKitchens 4 | version: ${dataset.egtea.common.version} 5 | root: 6 | - ${cwd}/DATA/videos/EGTEA/101020/videos/ 7 | annotation_path: 8 | - ${dataset.egtea.common.rulstm_annot_dir}/training${dataset.egtea.common.split}.csv 9 | annotation_dir: ${dataset.egtea.common.rulstm_annot_dir} 10 | label_type: ${dataset.egtea.common.label_type} 11 | sample_strategy: random_clip 12 | action_labels_fpath: ${dataset.egtea.common.rulstm_annot_dir}/actions.csv 13 | conv_to_anticipate_fn: 14 | _target_: datasets.base_video_dataset.convert_to_anticipation 15 | tau_a: ${dataset.egtea.common.tau_a} 16 | tau_o: ${dataset.egtea.common.tau_o} 17 | reader_fn: # Setting it since for EGTEA I mostly use RULSTM features 18 | _target_: datasets.epic_kitchens.EpicRULSTMFeatsReader 19 | lmdb_path: ${dataset.egtea.common.rulstm_feats_dir}/TSN-C_3_egtea_action_CE_s${dataset.egtea.common.split}_${dataset.egtea.common.modality}_model_best_fcfull_hd/ 20 | read_type: normal 21 | -------------------------------------------------------------------------------- /conf/dataset/egtea/anticipation_val.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_._name_ 2 | 3 | _target_: datasets.epic_kitchens.EPICKitchens 4 | version: ${dataset.egtea.common.version} 5 | root: 6 | - ${cwd}/DATA/videos/EGTEA/101020/videos/ 7 | annotation_path: 8 | - ${dataset.egtea.common.rulstm_annot_dir}/validation${dataset.egtea.common.split}.csv 9 | annotation_dir: ${dataset.egtea.common.rulstm_annot_dir} 10 | label_type: ${dataset.egtea.common.label_type} 11 | sample_strategy: center_clip 12 | action_labels_fpath: ${dataset.egtea.common.rulstm_annot_dir}/actions.csv 13 | conv_to_anticipate_fn: 14 | _target_: datasets.base_video_dataset.convert_to_anticipation 15 | tau_a: ${dataset.egtea.common.tau_a} 16 | tau_o: ${dataset.egtea.common.tau_o} 17 | reader_fn: # Setting it since for EGTEA I mostly use RULSTM features 18 | _target_: datasets.epic_kitchens.EpicRULSTMFeatsReader 19 | lmdb_path: ${dataset.egtea.common.rulstm_feats_dir}/TSN-C_3_egtea_action_CE_s${dataset.egtea.common.split}_${dataset.egtea.common.modality}_model_best_fcfull_hd/ 20 | read_type: normal 21 | -------------------------------------------------------------------------------- /conf/dataset/epic_kitchens100/anticipation_train.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_._name_ 2 | 3 | _target_: datasets.epic_kitchens.EPICKitchens 4 | version: ${dataset.epic_kitchens100.common.version} 5 | root: 6 | - ${dataset.epic_kitchens.common.data_dir}/train # the old one 7 | # Need test too since some of epic-55 test was added to epic-100 train 8 | - ${dataset.epic_kitchens.common.data_dir}/test # the old one 9 | - ${dataset.epic_kitchens100.common.data_dir_extension} 10 | annotation_path: 11 | - 
${dataset.epic_kitchens100.common.annot_dir}/EPIC_100_train.pkl 12 | annotation_dir: ${dataset.epic_kitchens100.common.annot_dir} 13 | rulstm_annotation_dir: ${dataset.epic_kitchens100.common.rulstm_annot_dir} # Needed during computing final outputs to get tail classes etc. 14 | label_type: ${dataset.epic_kitchens100.common.label_type} 15 | sample_strategy: "random_clip" 16 | action_labels_fpath: ${dataset.epic_kitchens100.common.rulstm_annot_dir}/actions.csv 17 | conv_to_anticipate_fn: 18 | _target_: datasets.base_video_dataset.convert_to_anticipation 19 | tau_a: ${dataset.epic_kitchens100.common.tau_a} 20 | tau_o: ${dataset.epic_kitchens100.common.tau_o} 21 | -------------------------------------------------------------------------------- /conf/dataset/epic_kitchens100/anticipation_val.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_._name_ 2 | 3 | _target_: datasets.epic_kitchens.EPICKitchens 4 | version: ${dataset.epic_kitchens100.common.version} 5 | root: 6 | - ${dataset.epic_kitchens.common.data_dir}/train # the old one 7 | # Need test too since some of epic-55 test was added to epic-100 train 8 | - ${dataset.epic_kitchens.common.data_dir}/test # the old one 9 | - ${dataset.epic_kitchens100.common.data_dir_extension} 10 | annotation_path: 11 | - ${dataset.epic_kitchens100.common.annot_dir}/EPIC_100_validation.pkl 12 | annotation_dir: ${dataset.epic_kitchens100.common.annot_dir} 13 | rulstm_annotation_dir: ${dataset.epic_kitchens100.common.rulstm_annot_dir} # Needed during computing final outputs to get tail classes etc. 14 | label_type: ${dataset.epic_kitchens100.common.label_type} 15 | sample_strategy: "center_clip" 16 | action_labels_fpath: ${dataset.epic_kitchens100.common.rulstm_annot_dir}/actions.csv 17 | conv_to_anticipate_fn: 18 | _target_: datasets.base_video_dataset.convert_to_anticipation 19 | tau_a: ${dataset.epic_kitchens100.common.tau_a} 20 | tau_o: ${dataset.epic_kitchens100.common.tau_o} 21 | -------------------------------------------------------------------------------- /conf/dataset/epic_kitchens100/anticipation_train+val.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_._name_ 2 | 3 | _target_: datasets.epic_kitchens.EPICKitchens 4 | version: ${dataset.epic_kitchens100.common.version} 5 | root: 6 | - ${dataset.epic_kitchens.common.data_dir}/train # the old one 7 | # Need test too since some of epic-55 test was added to epic-100 train 8 | - ${dataset.epic_kitchens.common.data_dir}/test # the old one 9 | - ${dataset.epic_kitchens100.common.data_dir_extension} 10 | annotation_path: 11 | - ${dataset.epic_kitchens100.common.annot_dir}/EPIC_100_train.pkl 12 | - ${dataset.epic_kitchens100.common.annot_dir}/EPIC_100_validation.pkl 13 | annotation_dir: ${dataset.epic_kitchens100.common.annot_dir} 14 | rulstm_annotation_dir: ${dataset.epic_kitchens100.common.rulstm_annot_dir} # Needed during computing final outputs to get tail classes etc. 
15 | label_type: ${dataset.epic_kitchens100.common.label_type} 16 | sample_strategy: "random_clip" 17 | action_labels_fpath: ${dataset.epic_kitchens100.common.rulstm_annot_dir}/actions.csv 18 | conv_to_anticipate_fn: 19 | _target_: datasets.base_video_dataset.convert_to_anticipation 20 | tau_a: ${dataset.epic_kitchens100.common.tau_a} 21 | tau_o: ${dataset.epic_kitchens100.common.tau_o} 22 | -------------------------------------------------------------------------------- /docs/DATASETS.md: -------------------------------------------------------------------------------- 1 | ## EGTEA Gaze+ 2 | 3 | The annotations are used from RULSTM, so you'd need to set it up as described in the main README. 4 | 5 | ### To train models on pre-extracted TSN features 6 | 7 | 1. Download the features from [RULSTM](https://iplab.dmi.unict.it/sharing/rulstm/features/egtea.zip) 8 | 2. Unzip into `DATA/external/rulstm/RULSTM/egtea/` 9 | 10 | ### To train models on raw videos 11 | 12 | Download the videos from [here](https://www.dropbox.com/s/uwwj6wb1j4rsm02/video_links.txt) into `DATA/videos/EGTEA/101020/videos/` 13 | 14 | ## 50-Salads 15 | 16 | 1. Download videos from [here](https://cvip.computing.dundee.ac.uk/datasets/foodpreparation/50salads/data/) into `DATA/videos/50Salads/`. 17 | 2. Download annotations 18 | - The models in this paper use annotations from [here](https://github.com/yabufarha/anticipating-activities/issues/5#issuecomment-555916894) 19 | - Download the `cvpr18_data` folder to `external/breakfast_50salad_anticipation_annotations/cvpr18_data` 20 | - Additionally, [this](https://dl.fbaipublicfiles.com/avt/datasets/50salads/annotations.zip) annotations directory was shared by the authors of the above paper as well, download it at `external/breakfast_50salad_anticipation_annotations/annotations/`. Shared here for reproducibility of the code. 21 | -------------------------------------------------------------------------------- /conf/dataset/epic_kitchens/anticipation_train_minus_val.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_._name_ 2 | 3 | _target_: datasets.epic_kitchens.EPICKitchens 4 | root: ${dataset.epic_kitchens.common.data_dir}/train 5 | # Not using the rulstm/RULSTM/data/{training|validation}.csv here since 6 | # they are effectively the same as the original EPIC labels (to my 7 | # knowledge). 
8 | # wc -l rulstm/RULSTM/data/training.csv --> 23493 9 | # wc -l rulstm/RULSTM/data/validation.csv --> 4979 10 | # wc -l epic_annotations/EPIC_train_action_labels.csv --> 28473 11 | # which is the same as (sum - 1, for header) 12 | # So using the only_keep_videos to subselect for train/val 13 | annotation_path: 14 | - ${dataset.epic_kitchens.common.annot_dir}/EPIC_train_action_labels.pkl 15 | annotation_dir: ${dataset.epic_kitchens.common.annot_dir} 16 | label_type: ${dataset.epic_kitchens.common.label_type} 17 | sample_strategy: "random_clip" 18 | # https://github.com/fpv-iplab/rulstm/blob/master/RULSTM/data/training_videos.csv 19 | only_keep_videos: ${dataset.epic_kitchens.common.rulstm_annot_dir}/training_videos.csv 20 | action_labels_fpath: ${dataset.epic_kitchens.common.rulstm_annot_dir}/actions.csv 21 | conv_to_anticipate_fn: 22 | _target_: datasets.base_video_dataset.convert_to_anticipation 23 | tau_a: ${dataset.epic_kitchens.common.tau_a} 24 | tau_o: ${dataset.epic_kitchens.common.tau_o} 25 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to AVT 2 | We want to make contributing to this project as easy and transparent as 3 | possible. 4 | 5 | ## Pull Requests 6 | We actively welcome your pull requests. 7 | 8 | 1. Fork the repo and create your branch from `main`. 9 | 2. If you've added code that should be tested, add tests. 10 | 3. If you've changed APIs, update the documentation. 11 | 4. Ensure the test suite passes. 12 | 5. Make sure your code lints. 13 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 14 | 15 | ## Contributor License Agreement ("CLA") 16 | In order to accept your pull request, we need you to submit a CLA. You only need 17 | to do this once to work on any of Facebook's open source projects. 18 | 19 | Complete your CLA here: 20 | 21 | ## Issues 22 | We use GitHub issues to track public bugs. Please ensure your description is 23 | clear and has sufficient instructions to be able to reproduce the issue. 24 | 25 | Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe 26 | disclosure of security bugs. In those cases, please go through the process 27 | outlined on that page and do not file a public issue. 28 | 29 | ## License 30 | By contributing to AVT, you agree that your contributions will be licensed 31 | under the LICENSE file in the root directory of this source tree. -------------------------------------------------------------------------------- /train_net.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | """Main training entry.""" 4 | 5 | import os 6 | import logging 7 | import random 8 | import subprocess 9 | 10 | import torch 11 | import hydra 12 | from omegaconf import DictConfig, OmegaConf 13 | 14 | import func 15 | 16 | 17 | OmegaConf.register_new_resolver('minus', lambda x, y: x - y) 18 | # Multiply and cast to integer 19 | OmegaConf.register_new_resolver('times_int', lambda x, y: int(x * y)) 20 | 21 | 22 | @hydra.main(config_path='conf', config_name='config') 23 | def main(cfg: DictConfig) -> None: 24 | # Since future runs might corrupt the stored hydra config, copy it over 25 | # for backup. 
26 | if not os.path.exists('.hydra.orig'): 27 | subprocess.call('cp -r .hydra .hydra.orig', shell=True) 28 | random.seed(cfg.seed) 29 | torch.manual_seed(cfg.seed) 30 | try: 31 | print(subprocess.check_output('nvidia-smi')) 32 | except subprocess.CalledProcessError: 33 | print('Could not run nvidia-smi..') 34 | # cudnn.deterministic = True # Makes it slow.. 35 | getattr(func, cfg.train.fn).main(cfg) 36 | 37 | 38 | if __name__ == "__main__": 39 | logging.basicConfig(format=('%(asctime)s %(levelname)-8s' 40 | ' {%(module)s:%(lineno)d} %(message)s'), 41 | level=logging.DEBUG, 42 | datefmt='%Y-%m-%d %H:%M:%S') 43 | torch.multiprocessing.set_start_method('spawn') 44 | main() # pylint: disable=no-value-for-parameter # Uses hydra 45 | -------------------------------------------------------------------------------- /expts/05_ek100_rustm_test_testonly.txt: -------------------------------------------------------------------------------- 1 | train.batch_size=128 2 | eval.batch_size=128 3 | train.num_epochs=0 4 | # Download the following model from RULSTM 5 | # https://iplab.dmi.unict.it/sharing/rulstm/ek100_models/RULSTM-anticipation_0.25_6_8_rgb_mt5r_best.pth.tar 6 | train.init_from_model=[[temporal_aggregator,/path/to/RULSTM-anticipation_0.25_6_8_rgb_mt5r_best.pth.tar],[classifiers.action,classifier.1.,/path/to/RULSTM-anticipation_0.25_6_8_rgb_mt5r_best.pth.tar]] 7 | 8 | model/backbone=identity 9 | model.backbone_dim=1024 10 | model/temporal_aggregator=rulstm 11 | model.temporal_aggregator.num_pad_feats=3 12 | model.dropout=0.8 13 | 14 | opt.lr_wd=[[backbone,0.0,0.0],[temporal_aggregator,0.01,0.0],[classifiers,0.01,0.0]] 15 | opt.bias_bn_wd_scale=0.0 16 | opt.optimizer.nesterov=true 17 | 18 | data_train.num_frames=11 19 | data_train.frame_rate=30 20 | data_eval.num_frames=11 21 | data_eval.frame_rate=30 22 | 23 | opt/scheduler=cosine 24 | 25 | dataset@dataset_train=epic_kitchens100/anticipation_train 26 | dataset@dataset_eval=epic_kitchens100/anticipation_test 27 | dataset_train.sample_strategy=last_clip 28 | dataset_eval.sample_strategy=last_clip 29 | dataset_train.conv_to_anticipate_fn.tau_o=2.5 30 | dataset_eval.conv_to_anticipate_fn.tau_o=2.5 31 | dataset.epic_kitchens100.common.label_type=action 32 | +dataset_train.reader_fn={_target_: datasets.epic_kitchens.EpicRULSTMFeatsReader, lmdb_path: ${dataset.epic_kitchens100.common.rulstm_feats_dir}/rgb/, read_type: exact_rulstm} 33 | +dataset_eval.reader_fn=${dataset_train.reader_fn} 34 | 35 | # RULSTM data 36 | +dataset_eval.conv_to_anticipate_fn.drop_style=correct 37 | 38 | hydra.launcher.nodes=1 39 | hydra.launcher.gpus_per_node=1 40 | 41 | test_only=True 42 | -------------------------------------------------------------------------------- /expts/08_ek55_avt_tsn.txt: -------------------------------------------------------------------------------- 1 | train.train_one_epoch_fn.loss_wts.feat=2.0 2 | 3 | train.batch_size=32 4 | eval.batch_size=32 5 | train.num_epochs=20 6 | 7 | model/backbone=identity 8 | model.backbone_dim=1024 9 | model/temporal_aggregator=identity 10 | model/future_predictor=avth 11 | model.dropout=0.8 12 | +model.future_predictor.n_head=8 13 | +model.future_predictor.n_layer=12 14 | +model.future_predictor.output_len=1 15 | +model.future_predictor.inter_dim=2048 16 | +model.future_predictor.return_past_too=true 17 | +model.future_predictor.future_pred_loss={_target_: torch.nn.MSELoss} 18 | +model.future_predictor.future_pred_loss_wt=1.0 19 | +model.future_predictor.avg_last_n=1 20 | 21 | 22 | opt.lr_wd=[[__all__,0.000005,0.0001]] 
23 | opt.bias_bn_wd_scale=1.0 24 | 25 | data_train.num_frames=10 26 | data_train.frame_rate=2 27 | data_eval.num_frames=${data_train.num_frames} 28 | data_eval.frame_rate=${data_train.frame_rate} 29 | 30 | opt/optimizer=adam 31 | opt/scheduler=cosine 32 | opt.warmup.num_epochs=5 33 | opt.scheduler.num_epochs=15 # total - 5 warmup 34 | 35 | dataset@dataset_train=epic_kitchens/anticipation_train_minus_val 36 | dataset@dataset_eval=epic_kitchens/anticipation_val 37 | dataset_train.sample_strategy=last_clip 38 | dataset_eval.sample_strategy=last_clip 39 | dataset_train.conv_to_anticipate_fn.tau_o=5 40 | dataset_eval.conv_to_anticipate_fn.tau_o=5 41 | dataset.epic_kitchens.common.label_type=action 42 | +dataset_train.reader_fn={_target_: datasets.epic_kitchens.EpicRULSTMFeatsReader, lmdb_path: ${dataset.epic_kitchens.common.rulstm_feats_dir}/rgb/, read_type: normal} 43 | +dataset_eval.reader_fn=${dataset_train.reader_fn} 44 | 45 | +dataset_train.conv_to_anticipate_fn.drop_style=correct 46 | +dataset_eval.conv_to_anticipate_fn.drop_style=correct 47 | 48 | hydra.launcher.nodes=1 49 | hydra.launcher.gpus_per_node=4 50 | -------------------------------------------------------------------------------- /expts/10_ek55_avt_ig65m.txt: -------------------------------------------------------------------------------- 1 | train.train_one_epoch_fn.loss_wts.feat=2.0 2 | 3 | train.batch_size=32 4 | eval.batch_size=32 5 | train.num_epochs=20 6 | 7 | model/backbone=identity 8 | model.backbone_dim=2048 9 | model/temporal_aggregator=identity 10 | model/future_predictor=avth 11 | model.dropout=0.8 12 | +model.future_predictor.n_head=8 13 | +model.future_predictor.n_layer=12 14 | +model.future_predictor.output_len=1 15 | +model.future_predictor.inter_dim=2048 16 | +model.future_predictor.return_past_too=true 17 | +model.future_predictor.future_pred_loss={_target_: torch.nn.MSELoss} 18 | +model.future_predictor.future_pred_loss_wt=1.0 19 | +model.future_predictor.avg_last_n=1 20 | 21 | 22 | opt.lr_wd=[[__all__,0.000005,0.0001]] 23 | opt.bias_bn_wd_scale=1.0 24 | # opt.optimizer.nesterov=true 25 | 26 | data_train.num_frames=10 27 | data_train.frame_rate=2 28 | data_eval.num_frames=${data_train.num_frames} 29 | data_eval.frame_rate=${data_train.frame_rate} 30 | 31 | opt/optimizer=adam 32 | opt/scheduler=cosine 33 | 34 | dataset@dataset_train=epic_kitchens/anticipation_train_minus_val 35 | dataset@dataset_eval=epic_kitchens/anticipation_val 36 | dataset_train.sample_strategy=last_clip 37 | dataset_eval.sample_strategy=last_clip 38 | dataset_train.conv_to_anticipate_fn.tau_o=5 39 | dataset_eval.conv_to_anticipate_fn.tau_o=5 40 | 41 | dataset.epic_kitchens.common.label_type=action 42 | +dataset_train.reader_fn={_target_: datasets.epic_kitchens.EpicRULSTMFeatsReader, lmdb_path: ${cwd}/DATA/extracted_features/ek55/ig65m_ftEk55train_logits_25fps/rgb, read_type: normal, warn_if_using_closeby_frame: false} 43 | +dataset_eval.reader_fn=${dataset_train.reader_fn} 44 | 45 | +dataset_train.conv_to_anticipate_fn.drop_style=correct 46 | +dataset_eval.conv_to_anticipate_fn.drop_style=correct 47 | 48 | hydra.launcher.nodes=1 49 | hydra.launcher.gpus_per_node=4 50 | -------------------------------------------------------------------------------- /expts/10_ek55_avt_ig65m_forAR.txt: -------------------------------------------------------------------------------- 1 | train.train_one_epoch_fn.loss_wts.feat=2.0 2 | 3 | train.batch_size=32 4 | eval.batch_size=32 5 | train.num_epochs=100 6 | 7 | model/backbone=identity 8 | 
model.backbone_dim=2048 9 | model/temporal_aggregator=identity 10 | model/future_predictor=avth 11 | model.dropout=0.8 12 | +model.future_predictor.n_head=8 13 | +model.future_predictor.n_layer=12 14 | +model.future_predictor.output_len=1 15 | +model.future_predictor.inter_dim=2048 16 | +model.future_predictor.return_past_too=true 17 | +model.future_predictor.future_pred_loss={_target_: torch.nn.MSELoss} 18 | +model.future_predictor.future_pred_loss_wt=1.0 19 | +model.future_predictor.avg_last_n=1 20 | 21 | 22 | opt.lr_wd=[[__all__,0.000005,0.0001]] 23 | opt.bias_bn_wd_scale=1.0 24 | # opt.optimizer.nesterov=true 25 | 26 | data_train.num_frames=10 27 | data_train.frame_rate=2 28 | data_eval.num_frames=${data_train.num_frames} 29 | data_eval.frame_rate=${data_train.frame_rate} 30 | 31 | opt/optimizer=adam 32 | opt/scheduler=cosine 33 | 34 | dataset@dataset_train=epic_kitchens/anticipation_train_minus_val 35 | dataset@dataset_eval=epic_kitchens/anticipation_val 36 | dataset_train.sample_strategy=last_clip 37 | dataset_eval.sample_strategy=last_clip 38 | dataset_train.conv_to_anticipate_fn.tau_o=5 39 | dataset_eval.conv_to_anticipate_fn.tau_o=5 40 | 41 | dataset.epic_kitchens.common.label_type=action 42 | +dataset_train.reader_fn={_target_: datasets.epic_kitchens.EpicRULSTMFeatsReader, lmdb_path: ${cwd}/DATA/extracted_features/ek55/ig65m_ftEk55train_logits_25fps/rgb, read_type: normal, warn_if_using_closeby_frame: false} 43 | +dataset_eval.reader_fn=${dataset_train.reader_fn} 44 | 45 | +dataset_train.conv_to_anticipate_fn.drop_style=correct 46 | +dataset_eval.conv_to_anticipate_fn.drop_style=correct 47 | 48 | hydra.launcher.nodes=1 49 | hydra.launcher.gpus_per_node=4 50 | -------------------------------------------------------------------------------- /expts/08_ek55_avt_tsn_forAR.txt: -------------------------------------------------------------------------------- 1 | train.train_one_epoch_fn.loss_wts.feat=1.0 2 | train.train_one_epoch_fn.loss_wts.past_cls_action=1.0 3 | 4 | train.batch_size=64 5 | eval.batch_size=64 6 | train.num_epochs=50 7 | 8 | model/backbone=identity 9 | model.backbone_dim=1024 10 | model/temporal_aggregator=identity 11 | model/future_predictor=avth 12 | model.dropout=0.2 13 | +model.future_predictor.n_head=4 14 | +model.future_predictor.n_layer=6 15 | +model.future_predictor.output_len=1 16 | +model.future_predictor.inter_dim=2048 17 | +model.future_predictor.return_past_too=true 18 | +model.future_predictor.future_pred_loss={_target_: torch.nn.MSELoss} 19 | +model.future_predictor.future_pred_loss_wt=1.0 20 | +model.future_predictor.avg_last_n=1 21 | model.classifier_on_past=true 22 | 23 | 24 | opt.lr_wd=[[__all__,0.001,0.000001]] 25 | opt.bias_bn_wd_scale=1.0 26 | opt.optimizer.nesterov=true 27 | 28 | data_train.num_frames=10 29 | data_train.frame_rate=1 30 | data_train.subclips.num_frames=1 31 | data_train.subclips.stride=1 32 | data_eval=${data_train} 33 | 34 | opt/optimizer=sgd 35 | opt/scheduler=cosine 36 | 37 | dataset@dataset_train=epic_kitchens/anticipation_train_minus_val 38 | dataset@dataset_eval=epic_kitchens/anticipation_val 39 | dataset_train.sample_strategy=last_clip 40 | dataset_eval.sample_strategy=last_clip 41 | 42 | dataset_train.conv_to_anticipate_fn.tau_a=1 43 | dataset_train.conv_to_anticipate_fn.tau_o=10 44 | dataset_eval.conv_to_anticipate_fn.tau_a=1 45 | dataset_eval.conv_to_anticipate_fn.tau_o=10 46 | 47 | dataset.epic_kitchens.common.label_type=action 48 | +dataset_train.reader_fn={_target_: datasets.epic_kitchens.EpicRULSTMFeatsReader, 
lmdb_path: ${dataset.epic_kitchens.common.rulstm_feats_dir}/rgb/, read_type: normal} 49 | +dataset_eval.reader_fn=${dataset_train.reader_fn} 50 | 51 | +dataset_train.conv_to_anticipate_fn.drop_style=correct 52 | +dataset_eval.conv_to_anticipate_fn.drop_style=correct 53 | 54 | hydra.launcher.nodes=1 55 | hydra.launcher.gpus_per_node=2 56 | -------------------------------------------------------------------------------- /expts/02_ek100_avt_tsn.txt: -------------------------------------------------------------------------------- 1 | train.train_one_epoch_fn.loss_wts.feat=1.0 2 | train.train_one_epoch_fn.loss_wts.past_cls_action=1.0 3 | 4 | train.batch_size=64 5 | eval.batch_size=64 6 | train.num_epochs=50 7 | 8 | model/backbone=identity 9 | model.backbone_dim=1024 10 | model/temporal_aggregator=identity 11 | model/future_predictor=avth 12 | model.dropout=0.2 13 | +model.future_predictor.n_head=4 14 | +model.future_predictor.n_layer=6 15 | +model.future_predictor.output_len=1 16 | +model.future_predictor.inter_dim=2048 17 | +model.future_predictor.return_past_too=true 18 | +model.future_predictor.future_pred_loss={_target_: torch.nn.MSELoss} 19 | +model.future_predictor.future_pred_loss_wt=1.0 20 | +model.future_predictor.avg_last_n=1 21 | model.classifier_on_past=true 22 | 23 | 24 | opt.lr_wd=[[__all__,0.001,0.000001]] 25 | opt.bias_bn_wd_scale=1.0 26 | opt.optimizer.nesterov=true 27 | 28 | data_train.num_frames=10 29 | data_train.frame_rate=1 30 | data_train.subclips.num_frames=1 31 | data_train.subclips.stride=1 32 | data_eval=${data_train} 33 | 34 | opt/optimizer=sgd 35 | opt/scheduler=cosine 36 | opt.warmup.num_epochs=20 37 | 38 | dataset@dataset_train=epic_kitchens100/anticipation_train 39 | dataset@dataset_eval=epic_kitchens100/anticipation_val 40 | dataset_train.sample_strategy=last_clip 41 | dataset_eval.sample_strategy=last_clip 42 | dataset_train.conv_to_anticipate_fn.tau_a=1 43 | dataset_train.conv_to_anticipate_fn.tau_o=10 44 | dataset_eval.conv_to_anticipate_fn.tau_a=1 45 | dataset_eval.conv_to_anticipate_fn.tau_o=10 46 | dataset.epic_kitchens100.common.label_type=action 47 | +dataset_train.reader_fn={_target_: datasets.epic_kitchens.EpicRULSTMFeatsReader, lmdb_path: ${dataset.epic_kitchens100.common.rulstm_feats_dir}/rgb/, read_type: normal} 48 | +dataset_eval.reader_fn=${dataset_train.reader_fn} 49 | 50 | +dataset_train.conv_to_anticipate_fn.drop_style=correct 51 | +dataset_eval.conv_to_anticipate_fn.drop_style=correct 52 | 53 | hydra.launcher.nodes=1 54 | hydra.launcher.gpus_per_node=2 55 | -------------------------------------------------------------------------------- /expts/03_ek100_avt_tsn_obj.txt: -------------------------------------------------------------------------------- 1 | train.train_one_epoch_fn.loss_wts.feat=1.0 2 | train.train_one_epoch_fn.loss_wts.past_cls_action=1.0 3 | 4 | train.batch_size=64 5 | eval.batch_size=64 6 | train.num_epochs=50 7 | 8 | model/backbone=identity 9 | model.backbone_dim=352 10 | model/temporal_aggregator=identity 11 | model/future_predictor=avth 12 | model.dropout=0.2 13 | +model.future_predictor.n_head=4 14 | +model.future_predictor.n_layer=6 15 | +model.future_predictor.output_len=1 16 | +model.future_predictor.inter_dim=2048 17 | +model.future_predictor.return_past_too=true 18 | +model.future_predictor.future_pred_loss={_target_: torch.nn.MSELoss} 19 | +model.future_predictor.future_pred_loss_wt=1.0 20 | +model.future_predictor.avg_last_n=1 21 | model.classifier_on_past=true 22 | 23 | 24 | opt.lr_wd=[[__all__,0.001,0.000001]] 25 | 
opt.bias_bn_wd_scale=1.0 26 | opt.optimizer.nesterov=true 27 | 28 | data_train.num_frames=10 29 | data_train.frame_rate=1 30 | data_train.subclips.num_frames=1 31 | data_train.subclips.stride=1 32 | data_eval=${data_train} 33 | 34 | opt/optimizer=sgd 35 | opt/scheduler=cosine 36 | opt.warmup.num_epochs=20 37 | 38 | dataset@dataset_train=epic_kitchens100/anticipation_train 39 | dataset@dataset_eval=epic_kitchens100/anticipation_val 40 | dataset_train.sample_strategy=last_clip 41 | dataset_eval.sample_strategy=last_clip 42 | dataset_train.conv_to_anticipate_fn.tau_a=1 43 | dataset_train.conv_to_anticipate_fn.tau_o=10 44 | dataset_eval.conv_to_anticipate_fn.tau_a=1 45 | dataset_eval.conv_to_anticipate_fn.tau_o=10 46 | dataset.epic_kitchens100.common.label_type=action 47 | +dataset_train.reader_fn={_target_: datasets.epic_kitchens.EpicRULSTMFeatsReader, lmdb_path: ${dataset.epic_kitchens100.common.rulstm_feats_dir}/obj/, read_type: normal} 48 | +dataset_eval.reader_fn=${dataset_train.reader_fn} 49 | 50 | +dataset_train.conv_to_anticipate_fn.drop_style=correct 51 | +dataset_eval.conv_to_anticipate_fn.drop_style=correct 52 | 53 | hydra.launcher.nodes=1 54 | hydra.launcher.gpus_per_node=2 55 | -------------------------------------------------------------------------------- /expts/02_ek100_avt_tsn_test_trainval.txt: -------------------------------------------------------------------------------- 1 | train.train_one_epoch_fn.loss_wts.feat=1.0 2 | train.train_one_epoch_fn.loss_wts.past_cls_action=1.0 3 | 4 | train.batch_size=64 5 | eval.batch_size=64 6 | train.num_epochs=50 7 | 8 | model/backbone=identity 9 | model.backbone_dim=1024 10 | model/temporal_aggregator=identity 11 | model/future_predictor=avth 12 | model.dropout=0.2 13 | +model.future_predictor.n_head=4 14 | +model.future_predictor.n_layer=6 15 | +model.future_predictor.output_len=1 16 | +model.future_predictor.inter_dim=2048 17 | +model.future_predictor.return_past_too=true 18 | +model.future_predictor.future_pred_loss={_target_: torch.nn.MSELoss} 19 | +model.future_predictor.future_pred_loss_wt=1.0 20 | +model.future_predictor.avg_last_n=1 21 | model.classifier_on_past=true 22 | 23 | 24 | opt.lr_wd=[[__all__,0.001,0.000001]] 25 | opt.bias_bn_wd_scale=1.0 26 | opt.optimizer.nesterov=true 27 | 28 | data_train.num_frames=10 29 | data_train.frame_rate=1 30 | data_train.subclips.num_frames=1 31 | data_train.subclips.stride=1 32 | data_eval=${data_train} 33 | 34 | opt/optimizer=sgd 35 | opt/scheduler=cosine 36 | opt.warmup.num_epochs=20 37 | 38 | dataset@dataset_train=epic_kitchens100/anticipation_train+val 39 | dataset@dataset_eval=epic_kitchens100/anticipation_test 40 | dataset_train.sample_strategy=last_clip 41 | dataset_eval.sample_strategy=last_clip 42 | dataset_train.conv_to_anticipate_fn.tau_a=1 43 | dataset_train.conv_to_anticipate_fn.tau_o=10 44 | dataset_eval.conv_to_anticipate_fn.tau_a=1 45 | dataset_eval.conv_to_anticipate_fn.tau_o=10 46 | dataset.epic_kitchens100.common.label_type=action 47 | +dataset_train.reader_fn={_target_: datasets.epic_kitchens.EpicRULSTMFeatsReader, lmdb_path: ${dataset.epic_kitchens100.common.rulstm_feats_dir}/rgb/, read_type: normal} 48 | +dataset_eval.reader_fn=${dataset_train.reader_fn} 49 | 50 | +dataset_train.conv_to_anticipate_fn.drop_style=correct 51 | +dataset_eval.conv_to_anticipate_fn.drop_style=correct 52 | 53 | hydra.launcher.nodes=1 54 | hydra.launcher.gpus_per_node=2 55 | -------------------------------------------------------------------------------- 
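
The expts/*.txt files in this directory are plain lists of Hydra command-line overrides for the config composed by train_net.py (blank lines and `#` comments are skipped; the `hydra.launcher.*` keys only take effect when a matching launcher plugin is configured). The repo may ship its own launch mechanism; as a hypothetical helper only, one of these files could be forwarded to train_net.py like this:

```python
# Hypothetical helper, not a script from this repo: read an expts/*.txt
# override file and pass each non-comment line as a single Hydra override
# argument to train_net.py.
import subprocess
import sys

expt_file = sys.argv[1]  # e.g. expts/02_ek100_avt_tsn.txt
with open(expt_file) as fh:
    overrides = [line.strip() for line in fh
                 if line.strip() and not line.strip().startswith('#')]
# Lines containing spaces (e.g. +dataset_train.reader_fn={...}) must remain a
# single argv entry each so Hydra parses them as one override.
subprocess.run([sys.executable, 'train_net.py'] + overrides, check=True)
```
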
/expts/03_ek100_avt_tsn_obj_test_trainval.txt: -------------------------------------------------------------------------------- 1 | train.train_one_epoch_fn.loss_wts.feat=1.0 2 | train.train_one_epoch_fn.loss_wts.past_cls_action=1.0 3 | 4 | train.batch_size=64 5 | eval.batch_size=64 6 | train.num_epochs=50 7 | 8 | model/backbone=identity 9 | model.backbone_dim=352 10 | model/temporal_aggregator=identity 11 | model/future_predictor=avth 12 | model.dropout=0.2 13 | +model.future_predictor.n_head=4 14 | +model.future_predictor.n_layer=6 15 | +model.future_predictor.output_len=1 16 | +model.future_predictor.inter_dim=2048 17 | +model.future_predictor.return_past_too=true 18 | +model.future_predictor.future_pred_loss={_target_: torch.nn.MSELoss} 19 | +model.future_predictor.future_pred_loss_wt=1.0 20 | +model.future_predictor.avg_last_n=1 21 | model.classifier_on_past=true 22 | 23 | 24 | opt.lr_wd=[[__all__,0.001,0.000001]] 25 | opt.bias_bn_wd_scale=1.0 26 | opt.optimizer.nesterov=true 27 | 28 | data_train.num_frames=10 29 | data_train.frame_rate=1 30 | data_train.subclips.num_frames=1 31 | data_train.subclips.stride=1 32 | data_eval=${data_train} 33 | 34 | opt/optimizer=sgd 35 | opt/scheduler=cosine 36 | opt.warmup.num_epochs=20 37 | 38 | dataset@dataset_train=epic_kitchens100/anticipation_train+val 39 | dataset@dataset_eval=epic_kitchens100/anticipation_test 40 | dataset_train.sample_strategy=last_clip 41 | dataset_eval.sample_strategy=last_clip 42 | dataset_train.conv_to_anticipate_fn.tau_a=1 43 | dataset_train.conv_to_anticipate_fn.tau_o=10 44 | dataset_eval.conv_to_anticipate_fn.tau_a=1 45 | dataset_eval.conv_to_anticipate_fn.tau_o=10 46 | dataset.epic_kitchens100.common.label_type=action 47 | +dataset_train.reader_fn={_target_: datasets.epic_kitchens.EpicRULSTMFeatsReader, lmdb_path: ${dataset.epic_kitchens100.common.rulstm_feats_dir}/obj/, read_type: normal} 48 | +dataset_eval.reader_fn=${dataset_train.reader_fn} 49 | 50 | +dataset_train.conv_to_anticipate_fn.drop_style=correct 51 | +dataset_eval.conv_to_anticipate_fn.drop_style=correct 52 | 53 | hydra.launcher.nodes=1 54 | hydra.launcher.gpus_per_node=2 55 | -------------------------------------------------------------------------------- /expts/04_ek100_avt_ig65m_test_trainval.txt: -------------------------------------------------------------------------------- 1 | train.train_one_epoch_fn.loss_wts.feat=1.0 2 | train.train_one_epoch_fn.loss_wts.past_cls_action=1.0 3 | 4 | train.batch_size=64 5 | eval.batch_size=64 6 | train.num_epochs=50 7 | 8 | model/backbone=identity 9 | model.backbone_dim=2048 10 | model/temporal_aggregator=identity 11 | model/future_predictor=avth 12 | model.dropout=0.2 13 | +model.future_predictor.n_head=2 14 | +model.future_predictor.n_layer=8 15 | +model.future_predictor.output_len=1 16 | +model.future_predictor.inter_dim=2048 17 | +model.future_predictor.return_past_too=true 18 | +model.future_predictor.future_pred_loss={_target_: torch.nn.MSELoss} 19 | +model.future_predictor.future_pred_loss_wt=1.0 20 | +model.future_predictor.avg_last_n=1 21 | model.classifier_on_past=true 22 | 23 | 24 | opt.lr_wd=[[__all__,0.001,0.000001]] 25 | opt.bias_bn_wd_scale=1.0 26 | opt.optimizer.nesterov=true 27 | 28 | data_train.num_frames=10 29 | data_train.frame_rate=1 30 | data_train.subclips.num_frames=1 31 | data_train.subclips.stride=1 32 | data_eval=${data_train} 33 | 34 | opt/optimizer=sgd 35 | opt/scheduler=cosine 36 | opt.warmup.num_epochs=5 37 | 38 | dataset@dataset_train=epic_kitchens100/anticipation_train+val 39 | 
dataset@dataset_eval=epic_kitchens100/anticipation_test 40 | dataset_train.sample_strategy=last_clip 41 | dataset_eval.sample_strategy=last_clip 42 | dataset_train.conv_to_anticipate_fn.tau_a=1 43 | dataset_train.conv_to_anticipate_fn.tau_o=10 44 | dataset_eval.conv_to_anticipate_fn.tau_a=1 45 | dataset_eval.conv_to_anticipate_fn.tau_o=10 46 | dataset.epic_kitchens100.common.label_type=action 47 | +dataset_train.reader_fn={_target_: datasets.epic_kitchens.EpicRULSTMFeatsReader, lmdb_path: ${cwd}/DATA/extracted_features/ek100/ig65m_ftEk100_logits_10fps1s/rgb/, read_type: normal, warn_if_using_closeby_frame: false} 48 | +dataset_eval.reader_fn=${dataset_train.reader_fn} 49 | 50 | +dataset_train.conv_to_anticipate_fn.drop_style=correct 51 | +dataset_eval.conv_to_anticipate_fn.drop_style=correct 52 | 53 | hydra.launcher.nodes=1 54 | hydra.launcher.gpus_per_node=2 55 | -------------------------------------------------------------------------------- /expts/11_egtea_avt_tsn.txt: -------------------------------------------------------------------------------- 1 | train.train_one_epoch_fn.loss_wts.feat=1.0 2 | train.train_one_epoch_fn.loss_wts.past_cls_action=0.1 3 | 4 | train.batch_size=64 5 | eval.batch_size=64 6 | train.num_epochs=15 7 | 8 | model/backbone=identity 9 | model.backbone_dim=1024 10 | model/temporal_aggregator=identity 11 | model/future_predictor=avth 12 | model.dropout=0.8 13 | +model.future_predictor.n_head=4 14 | +model.future_predictor.n_layer=2 15 | +model.future_predictor.output_len=1 16 | +model.future_predictor.inter_dim=2048 17 | +model.future_predictor.return_past_too=true 18 | +model.future_predictor.future_pred_loss={_target_: torch.nn.MSELoss} 19 | +model.future_predictor.future_pred_loss_wt=1.0 20 | +model.future_predictor.avg_last_n=1 21 | model.classifier_on_past=true 22 | 23 | 24 | opt.lr_wd=[[__all__,0.001,0.000001]] 25 | opt.bias_bn_wd_scale=1.0 26 | opt.optimizer.nesterov=true 27 | 28 | data_train.num_frames=10 29 | data_train.frame_rate=1 30 | data_train.subclips.num_frames=1 31 | data_train.subclips.stride=1 32 | data_eval=${data_train} 33 | 34 | opt/optimizer=sgd 35 | opt/scheduler=cosine 36 | 37 | dataset@dataset_train=egtea/anticipation_train 38 | dataset@dataset_eval=egtea/anticipation_val 39 | +dataset@dataset_eval_train=egtea/anticipation_train 40 | dataset_train.sample_strategy=last_clip 41 | dataset_eval.sample_strategy=last_clip 42 | dataset_eval_train.sample_strategy=last_clip 43 | dataset_train.conv_to_anticipate_fn.tau_a=0.5 44 | dataset_train.conv_to_anticipate_fn.tau_o=10 45 | dataset_eval.conv_to_anticipate_fn.tau_a=0.5 46 | dataset_eval.conv_to_anticipate_fn.tau_o=10 47 | dataset_eval_train.conv_to_anticipate_fn.tau_a=0.5 48 | dataset_eval_train.conv_to_anticipate_fn.tau_o=10 49 | dataset.egtea.common.label_type=action 50 | dataset.egtea.common.split=1 51 | dataset.egtea.common.modality=rgb 52 | 53 | +dataset_train.conv_to_anticipate_fn.drop_style=correct 54 | +dataset_eval.conv_to_anticipate_fn.drop_style=correct 55 | +dataset_eval_train.conv_to_anticipate_fn.drop_style=correct 56 | 57 | hydra.launcher.nodes=1 58 | hydra.launcher.gpus_per_node=2 59 | -------------------------------------------------------------------------------- /common/cluster.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
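# KmeansAssigner below is bidirectional: float features in -> nearest-centroid
# ids out (hard assignment via torch.cdist + argmin), long cluster ids in ->
# centroid features out. A hedged usage sketch, with a hypothetical centroids
# checkpoint path; the file is expected to hold a dict whose 'weight' entry is
# a KxC tensor (C=256 in this example):
#
#   assigner = KmeansAssigner('DATA/centroids.pth')
#   ids = assigner(torch.randn(8, 16, 256))  # float input -> (8, 16) long ids
#   feats = assigner(ids)                    # long input -> (8, 16, 256) centroid feats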
2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | 7 | class KmeansAssigner(nn.Module): 8 | def __init__(self, centroids_fpath, norm=False): 9 | super().__init__() 10 | # NxC dimension 11 | # Not converting this to a linear layer as then the weights would get 12 | # overwritten during random init, and these cluster centers would be lost. 13 | self.register_buffer('centroids', 14 | torch.load(centroids_fpath)['weight']) 15 | self.norm = norm 16 | 17 | @property 18 | def num_clusters(self): 19 | return self.centroids.size(0) 20 | 21 | @staticmethod 22 | def feat2cluster(feats, centroids, norm): 23 | """ 24 | Compute the nearest-centroid index for the feats, w.r.t. the centroids. 25 | Args: 26 | feats *xC 27 | centroids KxC 28 | Returns: 29 | assignments * 30 | """ 31 | feats_flat = feats.flatten(0, -2) 32 | if norm: 33 | feats_flat = nn.functional.normalize(feats_flat, dim=-1, p=2) 34 | dists = torch.cdist(feats_flat.unsqueeze(0), centroids.unsqueeze(0)) 35 | assgns = torch.argmin(dists[0], dim=-1) 36 | assgns = assgns.reshape(feats.shape[:-1]) 37 | return assgns 38 | 39 | @staticmethod 40 | def cluster2feat(idx, centroids): 41 | """ 42 | Get the centroid features for cluster ids. 43 | Args: 44 | idx * 45 | centroids KxC 46 | Returns: 47 | features *xC 48 | """ 49 | idx_flat = idx.reshape((-1, )) 50 | feats = centroids[idx_flat, :] 51 | return feats.reshape(list(idx.shape) + [feats.size(-1)]) 52 | 53 | def forward(self, inp): 54 | """ 55 | If inp is torch.float, then find the nearest cluster assignments. 56 | If torch.long, return the corresponding centroid features. 57 | """ 58 | if inp.dtype == torch.long: 59 | return self.cluster2feat(inp, self.centroids) 60 | return self.feat2cluster(inp, self.centroids, self.norm) 61 | -------------------------------------------------------------------------------- /expts/02_ek100_avt_tsn_test_testonly.txt: -------------------------------------------------------------------------------- 1 | test_only=true 2 | 3 | train.train_one_epoch_fn.loss_wts.feat=1.0 4 | train.train_one_epoch_fn.loss_wts.past_cls_action=1.0 5 | train.init_from_model=[[${cwd}/OUTPUTS/expts/02_ek100_avt_tsn.txt/0/checkpoint.pth]] 6 | 7 | train.batch_size=64 8 | eval.batch_size=64 9 | train.num_epochs=50 10 | 11 | model/backbone=identity 12 | model.backbone_dim=1024 13 | model/temporal_aggregator=identity 14 | model/future_predictor=avth 15 | model.dropout=0.2 16 | +model.future_predictor.n_head=4 17 | +model.future_predictor.n_layer=6 18 | +model.future_predictor.output_len=1 19 | +model.future_predictor.inter_dim=2048 20 | +model.future_predictor.return_past_too=true 21 | +model.future_predictor.future_pred_loss={_target_: torch.nn.MSELoss} 22 | +model.future_predictor.future_pred_loss_wt=1.0 23 | +model.future_predictor.avg_last_n=1 24 | model.classifier_on_past=true 25 | 26 | 27 | opt.lr_wd=[[__all__,0.001,0.000001]] 28 | opt.bias_bn_wd_scale=1.0 29 | opt.optimizer.nesterov=true 30 | 31 | data_train.num_frames=10 32 | data_train.frame_rate=1 33 | data_train.subclips.num_frames=1 34 | data_train.subclips.stride=1 35 | data_eval=${data_train} 36 | 37 | opt/optimizer=sgd 38 | opt/scheduler=cosine 39 | opt.warmup.num_epochs=20 40 | 41 | dataset@dataset_train=epic_kitchens100/anticipation_train 42 | dataset@dataset_eval=epic_kitchens100/anticipation_test 43 | dataset_train.sample_strategy=last_clip 44 | dataset_eval.sample_strategy=last_clip 45 | dataset_train.conv_to_anticipate_fn.tau_a=1 46 | dataset_train.conv_to_anticipate_fn.tau_o=10 47 | dataset_eval.conv_to_anticipate_fn.tau_a=1 48 | dataset_eval.conv_to_anticipate_fn.tau_o=10 49 | 
dataset.epic_kitchens100.common.label_type=action 50 | +dataset_train.reader_fn={_target_: datasets.epic_kitchens.EpicRULSTMFeatsReader, lmdb_path: ${dataset.epic_kitchens100.common.rulstm_feats_dir}/rgb/, read_type: normal} 51 | +dataset_eval.reader_fn=${dataset_train.reader_fn} 52 | 53 | +dataset_train.conv_to_anticipate_fn.drop_style=correct 54 | +dataset_eval.conv_to_anticipate_fn.drop_style=correct 55 | 56 | hydra.launcher.nodes=1 57 | hydra.launcher.gpus_per_node=2 58 | -------------------------------------------------------------------------------- /conf/data/default.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | # The top few options go into the dataset object, which loads data as per these 4 | num_frames: 16 5 | frame_rate: null # Null => use the default/natural frame rate of the video 6 | # Allow for an option to clip the long original clip into subclips. This is 7 | # useful when I want features for multiple past clips, so I just read and process a 8 | # really long one and then crop it up. By default, crop into 1 subclip -- same 9 | # as the input 10 | subclips: 11 | # TODO Need to use relative interpolation here 12 | num_frames: ${..num_frames} 13 | stride: ${..num_frames} 14 | # Load segmentation labels only if a classifier on the past is being applied 15 | load_seg_labels: ${model.classifier_on_past} 16 | # TODO: Get rid of the next 2 params; not sure what they are for 17 | train_bs_multiplier: 5 18 | val_clips_per_video: 1 19 | workers: 10 20 | # Scale the image to this size before cropping. 21 | # scale_w can be -1, in which case the shorter side will be scaled to 22 | # scale_h. 23 | scale_h: 128 24 | scale_w: 174 25 | # Height and width of the crop from the resized video above. Set to null for no 26 | # cropping. 27 | crop_size: 112 28 | # Mean/std for centering the image 29 | mean: [0.43216, 0.394666, 0.37645] 30 | std: [0.22803, 0.22145, 0.216989] 31 | # Augmentations. Note: keep these all at their defaults ("0") here so they are 32 | # not applied. Enable them in the expt txt file for training time, since 33 | # this ConfigGroup object will be copied for both train and test time. 34 | flip_p: 0.5 # Left-right flip 50% at train time. Not used during eval. 35 | scale_pix_val: 1.0 # Scale the pixel values by this number. Useful to scale from 0-1 values to 0-255. 36 | reverse_channels: false # Reverse channels, i.e. convert from RGB->BGR 37 | color_jitter_brightness: 0.0 38 | color_jitter_contrast: 0.0 39 | color_jitter_saturation: 0.0 40 | color_jitter_hue: 0.0 41 | # Whether to use the distributed sampler. Certain data loader settings, 42 | # such as the dense_sampler used for feature extraction, already sample 43 | # different clips for different workers, so set this to false when 44 | # using those. 45 | use_dist_sampler: true 46 | # Test time augmentations. Only used in the eval code 47 | eval_num_crops: 1 48 | eval_flip_crops: False 49 | -------------------------------------------------------------------------------- /loss_fn/multidim_xentropy.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates.
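# MultiDimCrossEntropy below takes logits of shape (*, C) and targets of shape
# (*), flattens the leading dimensions before calling nn.CrossEntropyLoss, and,
# when reduction='none', reshapes the per-element losses back to the target
# shape. A minimal shape sketch (sizes are illustrative only):
#
#   loss_fn = MultiDimCrossEntropy(reduction='none')
#   logits = torch.randn(4, 10, 100)   # batch x time x classes
#   target = torch.randint(100, (4, 10))
#   loss = loss_fn(logits, target)     # -> shape (4, 10)
#
# QuantizeAndCrossEntropy additionally projects continuous features onto a set
# of k-means centroids (see common.cluster.KmeansAssigner) and uses the target
# feature's nearest centroid as the class label.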
2 | 3 | 4 | """Cross entropy loss that works with multi-dim input.""" 5 | import torch 6 | import torch.nn as nn 7 | from common.cluster import KmeansAssigner 8 | 9 | 10 | class MultiDimCrossEntropy(nn.CrossEntropyLoss): 11 | def forward(self, inp, tgt, *args, **kwargs): 12 | """ 13 | Args: 14 | inp: (*, C) 15 | tgt: (*, ) 16 | Flattens the leading dimensions and then incurs the loss. 17 | """ 18 | assert inp.ndim == tgt.ndim + 1 19 | assert inp.shape[:-1] == tgt.shape 20 | res = super().forward(inp.reshape(-1, inp.size(-1)), tgt.reshape( 21 | (-1, )), *args, **kwargs) 22 | if torch.numel(res) == torch.numel(tgt): 23 | # Reduction was not done, so reshape back to orig shape 24 | res = res.reshape(tgt.shape) 25 | return res 26 | 27 | 28 | class QuantizeAndCrossEntropy(MultiDimCrossEntropy): 29 | """Given a set of cluster centers, project the features onto them before 30 | incurring the loss.""" 31 | def __init__(self, centroids_fpath, norm=True, *args, **kwargs): 32 | super().__init__(*args, **kwargs) 33 | self.assigner = KmeansAssigner(centroids_fpath) 34 | self.norm = norm 35 | 36 | def forward(self, inp, tgt): 37 | """ 38 | Args: 39 | inp: (*, C) 40 | tgt: (*, C) 41 | Flattens the leading dimensions, projects onto the centroids, and then incurs the loss. 42 | """ 43 | # Normalize L2 both target and input, since that's how I'm computing 44 | # centroids 45 | if self.norm: 46 | inp = nn.functional.normalize(inp, dim=-1, p=2) 47 | tgt = nn.functional.normalize(tgt, dim=-1, p=2) 48 | # assign the GT and predictions to the centroids 49 | inp_proj = torch.mm(inp.flatten(0, 1), 50 | self.assigner.centroids.t()).view( 51 | inp.shape[:-1] + self.assigner.centroids.shape[:1]) 52 | # the weights of the projection layer are the centroids, so pick from there 53 | tgt_proj_q = self.assigner(tgt) 54 | return super().forward(inp_proj, tgt_proj_q) 55 | -------------------------------------------------------------------------------- /expts/06_ek100_avt_tsnflow.txt: -------------------------------------------------------------------------------- 1 | train.train_one_epoch_fn.loss_wts.feat=1.0 2 | train.train_one_epoch_fn.loss_wts.past_cls_action=1.0 3 | 4 | train.batch_size=64 5 | eval.batch_size=64 6 | train.num_epochs=50 7 | 8 | model/backbone=identity 9 | model.backbone_dim=1024 10 | model/temporal_aggregator=identity 11 | model/future_predictor=avth 12 | model.dropout=0.2 13 | +model.future_predictor.n_head=4 14 | +model.future_predictor.n_layer=6 15 | +model.future_predictor.output_len=1 16 | +model.future_predictor.inter_dim=2048 17 | +model.future_predictor.return_past_too=true 18 | +model.future_predictor.future_pred_loss={_target_: torch.nn.MSELoss} 19 | +model.future_predictor.future_pred_loss_wt=1.0 20 | +model.future_predictor.avg_last_n=1 21 | model.classifier_on_past=true 22 | 23 | 24 | opt.lr_wd=[[__all__,0.001,0.000001]] 25 | opt.bias_bn_wd_scale=1.0 26 | opt.optimizer.nesterov=true 27 | 28 | data_train.num_frames=10 29 | data_train.frame_rate=1 30 | data_train.subclips.num_frames=1 31 | data_train.subclips.stride=1 32 | data_eval=${data_train} 33 | 34 | opt/optimizer=sgd 35 | opt/scheduler=cosine 36 | 37 | dataset@dataset_train=epic_kitchens100/anticipation_train 38 | dataset@dataset_eval=epic_kitchens100/anticipation_val 39 | +dataset@dataset_eval_train=epic_kitchens100/anticipation_train 40 | dataset_train.sample_strategy=last_clip 41 | dataset_eval.sample_strategy=last_clip 42 | dataset_eval_train.sample_strategy=last_clip 43 | dataset_train.conv_to_anticipate_fn.tau_a=1 44 | dataset_train.conv_to_anticipate_fn.tau_o=10 45 | 
dataset_eval.conv_to_anticipate_fn.tau_a=1 46 | dataset_eval.conv_to_anticipate_fn.tau_o=10 47 | dataset_eval_train.conv_to_anticipate_fn.tau_a=1 48 | dataset_eval_train.conv_to_anticipate_fn.tau_o=10 49 | dataset.epic_kitchens100.common.label_type=action 50 | +dataset_train.reader_fn={_target_: datasets.epic_kitchens.EpicRULSTMFeatsReader, lmdb_path: ${dataset.epic_kitchens100.common.rulstm_feats_dir}/flow/, read_type: normal} 51 | +dataset_eval.reader_fn=${dataset_train.reader_fn} 52 | +dataset_eval_train.reader_fn=${dataset_train.reader_fn} 53 | 54 | +dataset_train.conv_to_anticipate_fn.drop_style=rulstm 55 | +dataset_eval.conv_to_anticipate_fn.drop_style=rulstm 56 | +dataset_eval_train.conv_to_anticipate_fn.drop_style=rulstm 57 | 58 | hydra.launcher.nodes=1 59 | hydra.launcher.gpus_per_node=2 60 | -------------------------------------------------------------------------------- /expts/06_ek100_avt_tsnflow_test_trainval.txt: -------------------------------------------------------------------------------- 1 | train.train_one_epoch_fn.loss_wts.feat=1.0 2 | train.train_one_epoch_fn.loss_wts.past_cls_action=1.0 3 | 4 | train.batch_size=64 5 | eval.batch_size=64 6 | train.num_epochs=50 7 | 8 | model/backbone=identity 9 | model.backbone_dim=1024 10 | model/temporal_aggregator=identity 11 | model/future_predictor=avth 12 | model.dropout=0.2 13 | +model.future_predictor.n_head=4 14 | +model.future_predictor.n_layer=6 15 | +model.future_predictor.output_len=1 16 | +model.future_predictor.inter_dim=2048 17 | +model.future_predictor.return_past_too=true 18 | +model.future_predictor.future_pred_loss={_target_: torch.nn.MSELoss} 19 | +model.future_predictor.future_pred_loss_wt=1.0 20 | +model.future_predictor.avg_last_n=1 21 | model.classifier_on_past=true 22 | 23 | 24 | opt.lr_wd=[[__all__,0.001,0.000001]] 25 | opt.bias_bn_wd_scale=1.0 26 | opt.optimizer.nesterov=true 27 | 28 | data_train.num_frames=10 29 | data_train.frame_rate=1 30 | data_train.subclips.num_frames=1 31 | data_train.subclips.stride=1 32 | data_eval=${data_train} 33 | 34 | opt/optimizer=sgd 35 | opt/scheduler=cosine 36 | 37 | dataset@dataset_train=epic_kitchens100/anticipation_train+val 38 | dataset@dataset_eval=epic_kitchens100/anticipation_test 39 | +dataset@dataset_eval_train=epic_kitchens100/anticipation_val 40 | dataset_train.sample_strategy=last_clip 41 | dataset_eval.sample_strategy=last_clip 42 | dataset_eval_train.sample_strategy=last_clip 43 | dataset_train.conv_to_anticipate_fn.tau_a=1 44 | dataset_train.conv_to_anticipate_fn.tau_o=10 45 | dataset_eval.conv_to_anticipate_fn.tau_a=1 46 | dataset_eval.conv_to_anticipate_fn.tau_o=10 47 | dataset_eval_train.conv_to_anticipate_fn.tau_a=1 48 | dataset_eval_train.conv_to_anticipate_fn.tau_o=10 49 | dataset.epic_kitchens100.common.label_type=action 50 | +dataset_train.reader_fn={_target_: datasets.epic_kitchens.EpicRULSTMFeatsReader, lmdb_path: ${dataset.epic_kitchens100.common.rulstm_feats_dir}/flow/, read_type: normal} 51 | +dataset_eval.reader_fn=${dataset_train.reader_fn} 52 | +dataset_eval_train.reader_fn=${dataset_train.reader_fn} 53 | 54 | +dataset_train.conv_to_anticipate_fn.drop_style=correct 55 | +dataset_eval.conv_to_anticipate_fn.drop_style=correct 56 | +dataset_eval_train.conv_to_anticipate_fn.drop_style=correct 57 | 58 | hydra.launcher.nodes=1 59 | hydra.launcher.gpus_per_node=2 60 | -------------------------------------------------------------------------------- /expts/04_ek100_avt_ig65m.txt: 
-------------------------------------------------------------------------------- 1 | train.train_one_epoch_fn.loss_wts.feat=1.0 2 | train.train_one_epoch_fn.loss_wts.past_cls_action=1.0 3 | 4 | train.batch_size=64 5 | eval.batch_size=64 6 | train.num_epochs=50 7 | 8 | model/backbone=identity 9 | model.backbone_dim=2048 10 | model/temporal_aggregator=identity 11 | model/future_predictor=avth 12 | model.dropout=0.2 13 | +model.future_predictor.n_head=2 14 | +model.future_predictor.n_layer=8 15 | +model.future_predictor.output_len=1 16 | +model.future_predictor.inter_dim=2048 17 | +model.future_predictor.return_past_too=true 18 | +model.future_predictor.future_pred_loss={_target_: torch.nn.MSELoss} 19 | +model.future_predictor.future_pred_loss_wt=1.0 20 | +model.future_predictor.avg_last_n=1 21 | model.classifier_on_past=true 22 | 23 | 24 | opt.lr_wd=[[__all__,0.001,0.000001]] 25 | opt.bias_bn_wd_scale=1.0 26 | opt.optimizer.nesterov=true 27 | 28 | data_train.num_frames=10 29 | data_train.frame_rate=1 30 | data_train.subclips.num_frames=1 31 | data_train.subclips.stride=1 32 | data_eval=${data_train} 33 | 34 | opt/optimizer=sgd 35 | opt/scheduler=cosine 36 | opt.warmup.num_epochs=5 37 | 38 | dataset@dataset_train=epic_kitchens100/anticipation_train 39 | dataset@dataset_eval=epic_kitchens100/anticipation_val 40 | +dataset@dataset_eval_train=epic_kitchens100/anticipation_train 41 | dataset_train.sample_strategy=last_clip 42 | dataset_eval.sample_strategy=last_clip 43 | dataset_eval_train.sample_strategy=last_clip 44 | dataset_train.conv_to_anticipate_fn.tau_a=1 45 | dataset_train.conv_to_anticipate_fn.tau_o=10 46 | dataset_eval.conv_to_anticipate_fn.tau_a=1 47 | dataset_eval.conv_to_anticipate_fn.tau_o=10 48 | dataset_eval_train.conv_to_anticipate_fn.tau_a=1 49 | dataset_eval_train.conv_to_anticipate_fn.tau_o=10 50 | dataset.epic_kitchens100.common.label_type=action 51 | +dataset_train.reader_fn={_target_: datasets.epic_kitchens.EpicRULSTMFeatsReader, lmdb_path: ${cwd}/DATA/extracted_features/ek100/ig65m_ftEk100_logits_10fps1s/rgb, read_type: normal, warn_if_using_closeby_frame: false} 52 | +dataset_eval.reader_fn=${dataset_train.reader_fn} 53 | +dataset_eval_train.reader_fn=${dataset_train.reader_fn} 54 | 55 | +dataset_train.conv_to_anticipate_fn.drop_style=correct 56 | +dataset_eval.conv_to_anticipate_fn.drop_style=correct 57 | +dataset_eval_train.conv_to_anticipate_fn.drop_style=correct 58 | 59 | hydra.launcher.nodes=1 60 | hydra.launcher.gpus_per_node=2 61 | -------------------------------------------------------------------------------- /datasets/data.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
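# get_dataset() below turns a Hydra dataset config plus a data config into a
# dataset object: it optionally loads cached metadata pointed to by a
# _precomputed_metadata_file key, instantiates the dataset with
# hydra.utils.instantiate (non-recursively, since sub-objects may need
# positional args), recomputes video clips when the dataset exposes
# video_clips, and finally caches the metadata if it was not already on disk.
# A hedged call sketch (the surrounding cfg/transform/logger variables are
# hypothetical):
#
#   dataset = get_dataset(cfg.dataset_train, cfg.data_train, transform, logger)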
2 | 3 | import os 4 | import torch 5 | from importlib import import_module 6 | from tqdm import tqdm 7 | 8 | import omegaconf 9 | import hydra 10 | 11 | from common import utils 12 | 13 | __all__ = [ 14 | "get_dataset", 15 | ] 16 | 17 | 18 | def get_dataset(dataset_cfg, data_cfg, transform, logger): 19 | # If there is _precomputed_metadata file passed in, load that in 20 | kwargs = {} 21 | precomp_metadata_fpath = None 22 | if '_precomputed_metadata_file' in dataset_cfg: 23 | precomp_metadata_fpath = dataset_cfg._precomputed_metadata_file 24 | # Remove from the config since otherwise can't init the obj 25 | with omegaconf.open_dict(dataset_cfg): 26 | del dataset_cfg['_precomputed_metadata_file'] 27 | if os.path.exists(precomp_metadata_fpath): 28 | _precomputed_metadata = torch.load(precomp_metadata_fpath) 29 | kwargs['_precomputed_metadata'] = _precomputed_metadata 30 | 31 | kwargs['transform'] = transform 32 | kwargs['frame_rate'] = data_cfg.frame_rate 33 | kwargs['frames_per_clip'] = data_cfg.num_frames 34 | # Have to call dict() here since relative interpolation somehow doesn't 35 | # work once I get the subclips object 36 | kwargs['subclips_options'] = dict(data_cfg.subclips) 37 | kwargs['load_seg_labels'] = data_cfg.load_seg_labels 38 | logger.info('Creating the dataset object...') 39 | # Not recursive since many of the sub-instantiations would need positional 40 | # arguments 41 | _dataset = hydra.utils.instantiate(dataset_cfg, 42 | _recursive_=False, 43 | **kwargs) 44 | try: 45 | logger.info('Computing clips...') 46 | _dataset.video_clips.compute_clips(data_cfg.num_frames, 47 | 1, 48 | frame_rate=data_cfg.frame_rate) 49 | logger.info('Done') 50 | except AttributeError: # if video_clips not in _dataset 51 | logger.warning('No video_clips present') 52 | logger.info(f'Created dataset with {len(_dataset)} elts') 53 | 54 | if precomp_metadata_fpath and not os.path.exists(precomp_metadata_fpath): 55 | utils.save_on_master(_dataset.metadata, precomp_metadata_fpath) 56 | return _dataset 57 | -------------------------------------------------------------------------------- /expts/13_50s_avt.txt: -------------------------------------------------------------------------------- 1 | train.train_one_epoch_fn.loss_wts.feat=1.0 2 | train.train_one_epoch_fn.loss_wts.past_cls_action=1.0 3 | train.init_from_model=[[backbone.model,${cwd}/DATA/pretrained/TIMM/jx_vit_base_patch16_224_in21k-e5005f0a.pth]] 4 | 5 | train.batch_size=2 6 | eval.batch_size=2 7 | train.num_epochs=200 8 | 9 | model/backbone=avt_b_in21k 10 | model.backbone_last_n_modules_to_drop=0 11 | model.backbone_dim=768 12 | model/temporal_aggregator=identity 13 | model/future_predictor=avth 14 | model.dropout=0.8 15 | +model.future_predictor.n_head=8 16 | +model.future_predictor.n_layer=8 17 | +model.future_predictor.output_len=1 18 | +model.future_predictor.inter_dim=2048 19 | +model.future_predictor.return_past_too=true 20 | +model.future_predictor.future_pred_loss={_target_: torch.nn.MSELoss} 21 | +model.future_predictor.avg_last_n=1 22 | model.classifier_on_past=true 23 | 24 | 25 | opt.lr_wd=[[__all__,0.0000005,0.0001]] 26 | opt.bias_bn_wd_scale=1.0 27 | # opt.optimizer.nesterov=true 28 | 29 | data_train.num_frames=10 30 | data_train.frame_rate=0.5 31 | data_train.subclips.num_frames=1 32 | data_train.subclips.stride=1 33 | data_eval.num_frames=${data_train.num_frames} 34 | data_eval.frame_rate=${data_train.frame_rate} 35 | data_eval.subclips.num_frames=${data_train.subclips.num_frames} 36 | 
data_eval.subclips.stride=${data_train.subclips.stride} 37 | data_train.mean=[0.5, 0.5, 0.5] 38 | data_train.std=[0.5, 0.5, 0.5] 39 | data_eval.mean=${data_train.mean} 40 | data_eval.std=${data_train.std} 41 | data_eval.eval_num_crops=3 42 | data_eval.eval_flip_crops=true 43 | 44 | opt/optimizer=adam 45 | opt/scheduler=cosine 46 | opt.warmup.num_epochs=20 47 | 48 | dataset/dundee50salads/annot_reader_fn=abu_farha 49 | dataset.dundee50salads.common.fold=1,2,3,4,5 50 | dataset@dataset_train=dundee50salads/anticipation_train 51 | dataset@dataset_eval=dundee50salads/anticipation_val 52 | dataset_train.sample_strategy=last_clip 53 | dataset_eval.sample_strategy=last_clip 54 | dataset_train.conv_to_anticipate_fn.tau_o=20 55 | dataset_eval.conv_to_anticipate_fn.tau_o=20 56 | 57 | +dataset_train.conv_to_anticipate_fn.drop_style=correct 58 | +dataset_eval.conv_to_anticipate_fn.drop_style=correct 59 | 60 | data_train.scale_h=248-280 61 | data_train.scale_w=-1 62 | data_train.crop_size=224 63 | data_eval.scale_h=248 64 | data_eval.scale_w=-1 65 | data_eval.crop_size=224 66 | 67 | hydra.launcher.nodes=4 68 | hydra.launcher.gpus_per_node=8 69 | -------------------------------------------------------------------------------- /expts/09_ek55_avt.txt: -------------------------------------------------------------------------------- 1 | train.train_one_epoch_fn.loss_wts.feat=1.0 2 | train.train_one_epoch_fn.loss_wts.past_cls_action=1.0 3 | train.init_from_model=[[backbone.model,${cwd}/DATA/pretrained/TIMM/jx_vit_base_p16_224-80ecf9dd.pth]] 4 | 5 | train.batch_size=3 6 | eval.batch_size=3 7 | train.num_epochs=35 8 | 9 | model/backbone=avt_b 10 | model.backbone_last_n_modules_to_drop=0 11 | model.backbone_dim=768 12 | model/temporal_aggregator=identity 13 | model/future_predictor=avth 14 | model.dropout=0.8 15 | +model.future_predictor.n_head=4 16 | +model.future_predictor.n_layer=6 17 | +model.future_predictor.output_len=1 18 | +model.future_predictor.inter_dim=2048 19 | +model.future_predictor.return_past_too=true 20 | +model.future_predictor.future_pred_loss={_target_: torch.nn.MSELoss} 21 | +model.future_predictor.future_pred_loss_wt=1.0 22 | +model.future_predictor.avg_last_n=1 23 | model.classifier_on_past=true 24 | 25 | 26 | opt.lr_wd=[[__all__,0.0001,0.0001]] 27 | opt.bias_bn_wd_scale=1.0 28 | opt.optimizer.nesterov=true 29 | 30 | data_train.num_frames=10 31 | data_train.frame_rate=1 32 | data_train.subclips.num_frames=1 33 | data_train.subclips.stride=1 34 | data_eval.num_frames=${data_train.num_frames} 35 | data_eval.frame_rate=${data_train.frame_rate} 36 | data_eval.subclips.num_frames=${data_train.subclips.num_frames} 37 | data_eval.subclips.stride=${data_train.subclips.stride} 38 | data_train.mean=[0.5, 0.5, 0.5] 39 | data_train.std=[0.5, 0.5, 0.5] 40 | data_eval.mean=${data_train.mean} 41 | data_eval.std=${data_train.std} 42 | data_eval.eval_num_crops=3 43 | data_eval.eval_flip_crops=true 44 | 45 | opt/optimizer=sgd 46 | opt/scheduler=cosine 47 | opt.scheduler.eta_min=0.000000005 48 | opt.warmup.num_epochs=5 49 | 50 | dataset@dataset_train=epic_kitchens/anticipation_train_minus_val 51 | dataset@dataset_eval=epic_kitchens/anticipation_val 52 | dataset_train.sample_strategy=last_clip 53 | dataset_eval.sample_strategy=last_clip 54 | dataset_train.conv_to_anticipate_fn.tau_o=20 55 | dataset_eval.conv_to_anticipate_fn.tau_o=20 56 | dataset.epic_kitchens.common.label_type=action 57 | 58 | +dataset_train.conv_to_anticipate_fn.drop_style=correct 59 | +dataset_eval.conv_to_anticipate_fn.drop_style=correct 
60 | 61 | data_train.scale_h=248-280 62 | data_train.scale_w=-1 63 | data_train.crop_size=224 64 | data_eval.scale_h=248 65 | data_eval.scale_w=-1 66 | data_eval.crop_size=224 67 | 68 | hydra.launcher.nodes=4 69 | hydra.launcher.gpus_per_node=8 70 | -------------------------------------------------------------------------------- /expts/09_ek55_avt_forAR.txt: -------------------------------------------------------------------------------- 1 | train.train_one_epoch_fn.loss_wts.feat=1.0 2 | train.train_one_epoch_fn.loss_wts.past_cls_action=1.0 3 | train.init_from_model=[[backbone.model,${cwd}/DATA/pretrained/TIMM/jx_vit_base_p16_224-80ecf9dd.pth]] 4 | 5 | train.batch_size=3 6 | eval.batch_size=3 7 | train.num_epochs=50 8 | 9 | model/backbone=avt_b 10 | model.backbone_last_n_modules_to_drop=0 11 | model.backbone_dim=768 12 | model/temporal_aggregator=identity 13 | model/future_predictor=avth 14 | model.dropout=0.8 15 | +model.future_predictor.n_head=4 16 | +model.future_predictor.n_layer=6 17 | +model.future_predictor.output_len=1 18 | +model.future_predictor.inter_dim=2048 19 | +model.future_predictor.return_past_too=true 20 | +model.future_predictor.future_pred_loss={_target_: torch.nn.MSELoss} 21 | +model.future_predictor.future_pred_loss_wt=1.0 22 | +model.future_predictor.avg_last_n=1 23 | model.classifier_on_past=true 24 | 25 | 26 | opt.lr_wd=[[__all__,0.0001,0.0001]] 27 | opt.bias_bn_wd_scale=1.0 28 | opt.optimizer.nesterov=true 29 | 30 | data_train.num_frames=10 31 | data_train.frame_rate=1 32 | data_train.subclips.num_frames=1 33 | data_train.subclips.stride=1 34 | data_eval.num_frames=${data_train.num_frames} 35 | data_eval.frame_rate=${data_train.frame_rate} 36 | data_eval.subclips.num_frames=${data_train.subclips.num_frames} 37 | data_eval.subclips.stride=${data_train.subclips.stride} 38 | data_train.mean=[0.5, 0.5, 0.5] 39 | data_train.std=[0.5, 0.5, 0.5] 40 | data_eval.mean=${data_train.mean} 41 | data_eval.std=${data_train.std} 42 | data_eval.eval_num_crops=3 43 | data_eval.eval_flip_crops=true 44 | 45 | opt/optimizer=sgd 46 | opt/scheduler=cosine 47 | opt.scheduler.eta_min=0.000000005 48 | opt.warmup.num_epochs=5 49 | 50 | dataset@dataset_train=epic_kitchens/anticipation_train_minus_val 51 | dataset@dataset_eval=epic_kitchens/anticipation_val 52 | dataset_train.sample_strategy=last_clip 53 | dataset_eval.sample_strategy=last_clip 54 | dataset_train.conv_to_anticipate_fn.tau_o=20 55 | dataset_eval.conv_to_anticipate_fn.tau_o=20 56 | dataset.epic_kitchens.common.label_type=action 57 | 58 | +dataset_train.conv_to_anticipate_fn.drop_style=correct 59 | +dataset_eval.conv_to_anticipate_fn.drop_style=correct 60 | 61 | data_train.scale_h=248-280 62 | data_train.scale_w=-1 63 | data_train.crop_size=224 64 | data_eval.scale_h=248 65 | data_eval.scale_w=-1 66 | data_eval.crop_size=224 67 | 68 | hydra.launcher.nodes=4 69 | hydra.launcher.gpus_per_node=8 70 | -------------------------------------------------------------------------------- /datasets/reader_fns.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
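# Readers are nn.Modules whose forward(video_path, start, end, fps, df_row,
# ...) returns the decoded clip along with auxiliary outputs. DefaultReader
# wraps torchvision.io.read_video, while VideoAsLabelOnehotReader returns a
# one-hot encoding of the observed action labels instead of pixels.
# Experiments select a reader through Hydra overrides, e.g. (as in
# expts/12_egtea_avt.txt):
#
#   +dataset_train.reader_fn={_target_: datasets.reader_fns.DefaultReader}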
2 | 3 | """Implementation of reader functions.""" 4 | 5 | import logging 6 | from pathlib import Path 7 | 8 | import torch 9 | import torch.nn as nn 10 | import torchvision 11 | 12 | from common.utils import get_video_info 13 | 14 | 15 | # An abstract class to keep track of all reader type classes 16 | class Reader(nn.Module): 17 | pass 18 | 19 | 20 | class DefaultReader(Reader): 21 | def forward(self, video_path, start, end, fps, df_row, **kwargs): 22 | del df_row, fps # Not needed here 23 | video_info = torchvision.io.read_video(video_path, start, end, 24 | **kwargs) 25 | # DEBUG see what is breaking 26 | logging.debug('Read %s from %s', video_info[0].shape, video_path) 27 | return video_info 28 | 29 | @staticmethod 30 | def get_frame_rate(video_path: Path) -> float: 31 | return get_video_info(video_path, ['fps'])['fps'] 32 | 33 | 34 | class VideoAsLabelOnehotReader(Reader): 35 | @staticmethod 36 | def get_frame_rate(video_path: Path) -> float: 37 | raise NotImplementedError('Not sure what it is here... TODO') 38 | 39 | def forward(self, 40 | video_path, 41 | start, 42 | end, 43 | fps, 44 | df_row, 45 | pts_unit='sec', 46 | num_classes=1000): 47 | """ 48 | Return the video as a 1-hot representation of the actual labels. 49 | Args: 50 | video_path 51 | start: start time in sec 52 | end: end time in sec 53 | fps: frame rate of this video 54 | df_row: The data frame row corresponding to this video. Includes 55 | labels 56 | num_classes: Total number of classes for the 1-hot representation. 57 | Could just be a large number, should work too. 58 | Returns: 59 | video_feature of shape T x 1 x 1 x num_classes 60 | """ 61 | del pts_unit, video_path, start, fps 62 | assert abs(end - 63 | df_row['end']) < 0.1, 'For now just supporting last_clip' 64 | labels = df_row['obs_action_class'][:, 1] 65 | # Convert to 1-hot, TxC shape 66 | feats = nn.functional.one_hot(torch.LongTensor(labels), num_classes) 67 | return feats.unsqueeze(1).unsqueeze(1).float(), {}, {} 68 | -------------------------------------------------------------------------------- /expts/06_ek100_avt_tsnflow_test_testonly.txt: -------------------------------------------------------------------------------- 1 | test_only=true 2 | 3 | train.train_one_epoch_fn.loss_wts.feat=1.0 4 | train.train_one_epoch_fn.loss_wts.past_cls_action=1.0 5 | train.init_from_model=[[${cwd}/OUTPUTS/expts/06_ek100_avt_tsnflow.txt/0/checkpoint.pth]] 6 | 7 | train.batch_size=64 8 | eval.batch_size=64 9 | train.num_epochs=50 10 | 11 | model/backbone=identity 12 | model.backbone_dim=1024 13 | model/temporal_aggregator=identity 14 | model/future_predictor=avth 15 | model.dropout=0.2 16 | +model.future_predictor.n_head=4 17 | +model.future_predictor.n_layer=6 18 | +model.future_predictor.output_len=1 19 | # +model.future_predictor.avg_last_n=1 20 | +model.future_predictor.inter_dim=2048 21 | +model.future_predictor.return_past_too=true 22 | +model.future_predictor.future_pred_loss={_target_: torch.nn.MSELoss} 23 | +model.future_predictor.future_pred_loss_wt=1.0 24 | +model.future_predictor.avg_last_n=1 25 | model.classifier_on_past=true 26 | 27 | 28 | opt.lr_wd=[[__all__,0.001,0.000001]] 29 | opt.bias_bn_wd_scale=1.0 30 | opt.optimizer.nesterov=true 31 | 32 | data_train.num_frames=10 33 | data_train.frame_rate=1 34 | data_train.subclips.num_frames=1 35 | data_train.subclips.stride=1 36 | data_eval=${data_train} 37 | 38 | opt/optimizer=sgd 39 | opt/scheduler=cosine 40 | 41 | dataset@dataset_train=epic_kitchens100/anticipation_train 42 | 
dataset@dataset_eval=epic_kitchens100/anticipation_test 43 | +dataset@dataset_eval_train=epic_kitchens100/anticipation_train 44 | dataset_train.sample_strategy=last_clip 45 | dataset_eval.sample_strategy=last_clip 46 | dataset_eval_train.sample_strategy=last_clip 47 | dataset_train.conv_to_anticipate_fn.tau_a=1 48 | dataset_train.conv_to_anticipate_fn.tau_o=10 49 | dataset_eval.conv_to_anticipate_fn.tau_a=1 50 | dataset_eval.conv_to_anticipate_fn.tau_o=10 51 | dataset_eval_train.conv_to_anticipate_fn.tau_a=1 52 | dataset_eval_train.conv_to_anticipate_fn.tau_o=10 53 | dataset.epic_kitchens100.common.label_type=action 54 | +dataset_train.reader_fn={_target_: datasets.epic_kitchens.EpicRULSTMFeatsReader, lmdb_path: ${dataset.epic_kitchens100.common.rulstm_feats_dir}/flow/, read_type: normal} 55 | +dataset_eval.reader_fn=${dataset_train.reader_fn} 56 | +dataset_eval_train.reader_fn=${dataset_train.reader_fn} 57 | 58 | +dataset_train.conv_to_anticipate_fn.drop_style=correct 59 | +dataset_eval.conv_to_anticipate_fn.drop_style=correct 60 | +dataset_eval_train.conv_to_anticipate_fn.drop_style=correct 61 | 62 | hydra.launcher.nodes=1 63 | hydra.launcher.gpus_per_node=2 64 | -------------------------------------------------------------------------------- /expts/04_ek100_avt_ig65m_test_testonly.txt: -------------------------------------------------------------------------------- 1 | test_only=true 2 | 3 | train.train_one_epoch_fn.loss_wts.feat=1.0 4 | train.train_one_epoch_fn.loss_wts.past_cls_action=1.0 5 | train.init_from_model=[[${cwd}/OUTPUTS/expts/04_ek100_avt_ig65m.txt/0/checkpoint.pth]] 6 | 7 | train.batch_size=64 8 | eval.batch_size=64 9 | train.num_epochs=50 10 | 11 | model/backbone=identity 12 | model.backbone_dim=2048 13 | model/temporal_aggregator=identity 14 | model/future_predictor=avth 15 | model.dropout=0.2 16 | +model.future_predictor.n_head=2 17 | +model.future_predictor.n_layer=8 18 | +model.future_predictor.output_len=1 19 | +model.future_predictor.inter_dim=2048 20 | +model.future_predictor.return_past_too=true 21 | +model.future_predictor.future_pred_loss={_target_: torch.nn.MSELoss} 22 | +model.future_predictor.future_pred_loss_wt=1.0 23 | +model.future_predictor.avg_last_n=1 24 | model.classifier_on_past=true 25 | 26 | 27 | opt.lr_wd=[[__all__,0.001,0.000001]] 28 | opt.bias_bn_wd_scale=1.0 29 | opt.optimizer.nesterov=true 30 | 31 | data_train.num_frames=10 32 | data_train.frame_rate=1 33 | data_train.subclips.num_frames=1 34 | data_train.subclips.stride=1 35 | data_eval=${data_train} 36 | 37 | opt/optimizer=sgd 38 | opt/scheduler=cosine 39 | opt.warmup.num_epochs=5 40 | 41 | dataset@dataset_train=epic_kitchens100/anticipation_train 42 | dataset@dataset_eval=epic_kitchens100/anticipation_test 43 | +dataset@dataset_eval_train=epic_kitchens100/anticipation_val 44 | dataset_train.sample_strategy=last_clip 45 | dataset_eval.sample_strategy=last_clip 46 | dataset_eval_train.sample_strategy=last_clip 47 | dataset_train.conv_to_anticipate_fn.tau_a=1 48 | dataset_train.conv_to_anticipate_fn.tau_o=10 49 | dataset_eval.conv_to_anticipate_fn.tau_a=1 50 | dataset_eval.conv_to_anticipate_fn.tau_o=10 51 | dataset_eval_train.conv_to_anticipate_fn.tau_a=1 52 | dataset_eval_train.conv_to_anticipate_fn.tau_o=10 53 | dataset.epic_kitchens100.common.label_type=action 54 | +dataset_train.reader_fn={_target_: datasets.epic_kitchens.EpicRULSTMFeatsReader, lmdb_path: ${cwd}/DATA/extracted_features/ek100/ig65m_ftEk100_logits_10fps1s/rgb, read_type: normal, warn_if_using_closeby_frame: false} 55 | 
+dataset_eval.reader_fn=${dataset_train.reader_fn} 56 | +dataset_eval_train.reader_fn=${dataset_train.reader_fn} 57 | 58 | +dataset_train.conv_to_anticipate_fn.drop_style=correct 59 | +dataset_eval.conv_to_anticipate_fn.drop_style=correct 60 | +dataset_eval_train.conv_to_anticipate_fn.drop_style=correct 61 | 62 | hydra.launcher.nodes=1 63 | hydra.launcher.gpus_per_node=2 64 | -------------------------------------------------------------------------------- /expts/07_ek100_avt_longer.txt: -------------------------------------------------------------------------------- 1 | train.train_one_epoch_fn.loss_wts.feat=1.0 2 | train.train_one_epoch_fn.loss_wts.past_cls_action=1.0 3 | train.init_from_model=[[backbone.model,${cwd}/DATA/pretrained/TIMM/jx_vit_base_patch16_224_in21k-e5005f0a.pth]] 4 | 5 | train.batch_size=3 6 | eval.batch_size=3 7 | train.num_epochs=70 8 | 9 | model/backbone=avt_b_in21k 10 | model.backbone_last_n_modules_to_drop=0 11 | model.backbone_dim=768 12 | model/temporal_aggregator=identity 13 | model/future_predictor=avth 14 | model.dropout=0.2 15 | +model.future_predictor.n_head=4 16 | +model.future_predictor.n_layer=6 17 | +model.future_predictor.output_len=1 18 | +model.future_predictor.inter_dim=2048 19 | +model.future_predictor.return_past_too=true 20 | +model.future_predictor.future_pred_loss={_target_: torch.nn.MSELoss} 21 | +model.future_predictor.future_pred_loss_wt=1.0 22 | +model.future_predictor.avg_last_n=1 23 | model.classifier_on_past=true 24 | 25 | 26 | opt.lr_wd=[[__all__,0.0001,0.000001]] 27 | opt.bias_bn_wd_scale=1.0 28 | opt.optimizer.nesterov=true 29 | 30 | data_train.num_frames=15 31 | data_train.frame_rate=1 32 | data_train.subclips.num_frames=1 33 | data_train.subclips.stride=1 34 | data_eval.num_frames=${data_train.num_frames} 35 | data_eval.frame_rate=${data_train.frame_rate} 36 | data_eval.subclips.num_frames=${data_train.subclips.num_frames} 37 | data_eval.subclips.stride=${data_train.subclips.stride} 38 | data_train.mean=[0.5, 0.5, 0.5] 39 | data_train.std=[0.5, 0.5, 0.5] 40 | data_eval.mean=${data_train.mean} 41 | data_eval.std=${data_train.std} 42 | data_eval.eval_num_crops=3 43 | data_eval.eval_flip_crops=true 44 | 45 | opt/optimizer=sgd 46 | opt/scheduler=cosine 47 | opt.warmup.num_epochs=20 48 | 49 | dataset@dataset_train=epic_kitchens100/anticipation_train 50 | dataset@dataset_eval=epic_kitchens100/anticipation_val 51 | 52 | dataset_train.sample_strategy=last_clip 53 | dataset_eval.sample_strategy=last_clip 54 | 55 | dataset_train.conv_to_anticipate_fn.tau_a=1 56 | dataset_train.conv_to_anticipate_fn.tau_o=15 57 | dataset_eval.conv_to_anticipate_fn.tau_a=1 58 | dataset_eval.conv_to_anticipate_fn.tau_o=15 59 | 60 | dataset.epic_kitchens100.common.label_type=action 61 | 62 | +dataset_train.conv_to_anticipate_fn.drop_style=correct 63 | +dataset_eval.conv_to_anticipate_fn.drop_style=correct 64 | 65 | data_train.scale_h=248-280 66 | data_train.scale_w=-1 67 | data_train.crop_size=224 68 | data_eval.scale_h=248 69 | data_eval.scale_w=-1 70 | data_eval.crop_size=224 71 | 72 | hydra.launcher.nodes=4 73 | hydra.launcher.gpus_per_node=8 74 | -------------------------------------------------------------------------------- /expts/07_ek100_avt_longer_test_testonly.txt: -------------------------------------------------------------------------------- 1 | test_only=true 2 | 3 | train.train_one_epoch_fn.loss_wts.feat=1.0 4 | train.train_one_epoch_fn.loss_wts.past_cls_action=1.0 5 | 
train.init_from_model=[[${cwd}/OUTPUTS/expts/07_ek100_avt_longer.txt/0/checkpoint.pth]] 6 | 7 | train.batch_size=3 8 | eval.batch_size=3 9 | train.num_epochs=70 10 | 11 | model/backbone=avt_b_in21k 12 | model.backbone_last_n_modules_to_drop=0 13 | model.backbone_dim=768 14 | model/temporal_aggregator=identity 15 | model/future_predictor=avth 16 | model.dropout=0.2 17 | +model.future_predictor.n_head=4 18 | +model.future_predictor.n_layer=6 19 | +model.future_predictor.output_len=1 20 | +model.future_predictor.inter_dim=2048 21 | +model.future_predictor.return_past_too=true 22 | +model.future_predictor.future_pred_loss={_target_: torch.nn.MSELoss} 23 | +model.future_predictor.future_pred_loss_wt=1.0 24 | +model.future_predictor.avg_last_n=1 25 | model.classifier_on_past=true 26 | 27 | 28 | opt.lr_wd=[[__all__,0.0001,0.000001]] 29 | opt.bias_bn_wd_scale=1.0 30 | opt.optimizer.nesterov=true 31 | 32 | data_train.num_frames=15 33 | data_train.frame_rate=1 34 | data_train.subclips.num_frames=1 35 | data_train.subclips.stride=1 36 | data_eval.num_frames=${data_train.num_frames} 37 | data_eval.frame_rate=${data_train.frame_rate} 38 | data_eval.subclips.num_frames=${data_train.subclips.num_frames} 39 | data_eval.subclips.stride=${data_train.subclips.stride} 40 | data_train.mean=[0.5, 0.5, 0.5] 41 | data_train.std=[0.5, 0.5, 0.5] 42 | data_eval.mean=${data_train.mean} 43 | data_eval.std=${data_train.std} 44 | data_eval.eval_num_crops=3 45 | data_eval.eval_flip_crops=true 46 | 47 | opt/optimizer=sgd 48 | opt/scheduler=cosine 49 | opt.warmup.num_epochs=20 50 | 51 | dataset@dataset_train=epic_kitchens100/anticipation_train 52 | dataset@dataset_eval=epic_kitchens100/anticipation_test 53 | 54 | dataset_train.sample_strategy=last_clip 55 | dataset_eval.sample_strategy=last_clip 56 | 57 | dataset_train.conv_to_anticipate_fn.tau_a=1 58 | dataset_train.conv_to_anticipate_fn.tau_o=15 59 | dataset_eval.conv_to_anticipate_fn.tau_a=1 60 | dataset_eval.conv_to_anticipate_fn.tau_o=15 61 | 62 | dataset.epic_kitchens100.common.label_type=action 63 | 64 | +dataset_train.conv_to_anticipate_fn.drop_style=correct 65 | +dataset_eval.conv_to_anticipate_fn.drop_style=correct 66 | 67 | data_train.scale_h=248-280 68 | data_train.scale_w=-1 69 | data_train.crop_size=224 70 | data_eval.scale_h=248 71 | data_eval.scale_w=-1 72 | data_eval.crop_size=224 73 | 74 | hydra.launcher.nodes=4 75 | hydra.launcher.gpus_per_node=8 76 | -------------------------------------------------------------------------------- /expts/07_ek100_avt_longer_test_trainval.txt: -------------------------------------------------------------------------------- 1 | train.train_one_epoch_fn.loss_wts.feat=1.0 2 | train.train_one_epoch_fn.loss_wts.past_cls_action=1.0 3 | train.init_from_model=[[backbone.model,${cwd}/DATA/pretrained/TIMM/jx_vit_base_patch16_224_in21k-e5005f0a.pth]] 4 | 5 | train.batch_size=3 6 | eval.batch_size=3 7 | train.num_epochs=70 8 | 9 | model/backbone=avt_b_in21k 10 | model.backbone_last_n_modules_to_drop=0 11 | model.backbone_dim=768 12 | model/temporal_aggregator=identity 13 | model/future_predictor=avth 14 | model.dropout=0.2 15 | +model.future_predictor.n_head=4 16 | +model.future_predictor.n_layer=6 17 | +model.future_predictor.output_len=1 18 | +model.future_predictor.inter_dim=2048 19 | +model.future_predictor.return_past_too=true 20 | +model.future_predictor.future_pred_loss={_target_: torch.nn.MSELoss} 21 | +model.future_predictor.future_pred_loss_wt=1.0 22 | +model.future_predictor.avg_last_n=1 23 | model.classifier_on_past=true 
24 | 25 | 26 | opt.lr_wd=[[__all__,0.0001,0.000001]] 27 | opt.bias_bn_wd_scale=1.0 28 | opt.optimizer.nesterov=true 29 | 30 | data_train.num_frames=15 31 | data_train.frame_rate=1 32 | data_train.subclips.num_frames=1 33 | data_train.subclips.stride=1 34 | data_eval.num_frames=${data_train.num_frames} 35 | data_eval.frame_rate=${data_train.frame_rate} 36 | data_eval.subclips.num_frames=${data_train.subclips.num_frames} 37 | data_eval.subclips.stride=${data_train.subclips.stride} 38 | data_train.mean=[0.5, 0.5, 0.5] 39 | data_train.std=[0.5, 0.5, 0.5] 40 | data_eval.mean=${data_train.mean} 41 | data_eval.std=${data_train.std} 42 | data_eval.eval_num_crops=3 43 | data_eval.eval_flip_crops=true 44 | 45 | opt/optimizer=sgd 46 | opt/scheduler=cosine 47 | opt.warmup.num_epochs=20 48 | 49 | dataset@dataset_train=epic_kitchens100/anticipation_train+val 50 | dataset@dataset_eval=epic_kitchens100/anticipation_test 51 | 52 | dataset_train.sample_strategy=last_clip 53 | dataset_eval.sample_strategy=last_clip 54 | 55 | dataset_train.conv_to_anticipate_fn.tau_a=1 56 | dataset_train.conv_to_anticipate_fn.tau_o=15 57 | dataset_eval.conv_to_anticipate_fn.tau_a=1 58 | dataset_eval.conv_to_anticipate_fn.tau_o=15 59 | 60 | dataset.epic_kitchens100.common.label_type=action 61 | 62 | +dataset_train.conv_to_anticipate_fn.drop_style=correct 63 | +dataset_eval.conv_to_anticipate_fn.drop_style=correct 64 | 65 | data_train.scale_h=248-280 66 | data_train.scale_w=-1 67 | data_train.crop_size=224 68 | data_eval.scale_h=248 69 | data_eval.scale_w=-1 70 | data_eval.crop_size=224 71 | 72 | hydra.launcher.nodes=8 73 | hydra.launcher.gpus_per_node=8 74 | -------------------------------------------------------------------------------- /expts/01_ek100_avt.txt: -------------------------------------------------------------------------------- 1 | train.train_one_epoch_fn.loss_wts.feat=1.0 2 | train.train_one_epoch_fn.loss_wts.past_cls_action=1.0 3 | train.init_from_model=[[backbone.model, ${cwd}/DATA/pretrained/TIMM/jx_vit_base_patch16_224_in21k-e5005f0a.pth]] 4 | 5 | train.batch_size=3 6 | eval.batch_size=3 7 | train.num_epochs=50 8 | 9 | model/backbone=avt_b_in21k 10 | model.backbone_last_n_modules_to_drop=0 11 | model.backbone_dim=768 12 | model/temporal_aggregator=identity 13 | model/future_predictor=avth 14 | model.dropout=0.2 15 | +model.future_predictor.n_head=4 16 | +model.future_predictor.n_layer=6 17 | +model.future_predictor.output_len=1 18 | +model.future_predictor.inter_dim=2048 19 | +model.future_predictor.return_past_too=true 20 | +model.future_predictor.future_pred_loss={_target_: torch.nn.MSELoss} 21 | +model.future_predictor.future_pred_loss_wt=1.0 22 | +model.future_predictor.avg_last_n=1 23 | model.classifier_on_past=true 24 | 25 | 26 | opt.lr_wd=[[__all__,0.0001,0.000001]] 27 | opt.bias_bn_wd_scale=1.0 28 | opt.optimizer.nesterov=true 29 | 30 | data_train.num_frames=10 31 | data_train.frame_rate=1 32 | data_train.subclips.num_frames=1 33 | data_train.subclips.stride=1 34 | data_eval.num_frames=${data_train.num_frames} 35 | data_eval.frame_rate=${data_train.frame_rate} 36 | data_eval.subclips.num_frames=${data_train.subclips.num_frames} 37 | data_eval.subclips.stride=${data_train.subclips.stride} 38 | data_train.mean=[0.5, 0.5, 0.5] 39 | data_train.std=[0.5, 0.5, 0.5] 40 | data_eval.mean=${data_train.mean} 41 | data_eval.std=${data_train.std} 42 | data_eval.eval_num_crops=3 43 | data_eval.eval_flip_crops=true 44 | 45 | opt/optimizer=sgd 46 | opt/scheduler=cosine 47 | opt.warmup.num_epochs=20 48 | 
opt.scheduler.num_epochs=30 49 | 50 | dataset@dataset_train=epic_kitchens100/anticipation_train 51 | dataset@dataset_eval=epic_kitchens100/anticipation_val 52 | 53 | dataset_train.sample_strategy=last_clip 54 | dataset_eval.sample_strategy=last_clip 55 | 56 | dataset_train.conv_to_anticipate_fn.tau_a=1 57 | dataset_train.conv_to_anticipate_fn.tau_o=10 58 | dataset_eval.conv_to_anticipate_fn.tau_a=1 59 | dataset_eval.conv_to_anticipate_fn.tau_o=10 60 | 61 | dataset.epic_kitchens100.common.label_type=action 62 | 63 | +dataset_train.conv_to_anticipate_fn.drop_style=correct 64 | +dataset_eval.conv_to_anticipate_fn.drop_style=correct 65 | 66 | data_train.scale_h=248-280 67 | data_train.scale_w=-1 68 | data_train.crop_size=224 69 | data_eval.scale_h=248 70 | data_eval.scale_w=-1 71 | data_eval.crop_size=224 72 | 73 | hydra.launcher.nodes=4 74 | hydra.launcher.gpus_per_node=8 75 | -------------------------------------------------------------------------------- /expts/01_ek100_avt_test_testonly.txt: -------------------------------------------------------------------------------- 1 | test_only=true 2 | 3 | train.train_one_epoch_fn.loss_wts.feat=1.0 4 | train.train_one_epoch_fn.loss_wts.past_cls_action=1.0 5 | train.init_from_model=[[OUTPUTS/expts/01_ek100_avt.txt/0/checkpoint.pth]] 6 | 7 | train.batch_size=3 8 | eval.batch_size=3 9 | train.num_epochs=50 10 | 11 | model/backbone=avt_b_in21k 12 | model.backbone_last_n_modules_to_drop=0 13 | model.backbone_dim=768 14 | model/temporal_aggregator=identity 15 | model/future_predictor=avth 16 | model.dropout=0.2 17 | +model.future_predictor.n_head=4 18 | +model.future_predictor.n_layer=6 19 | +model.future_predictor.output_len=1 20 | +model.future_predictor.inter_dim=2048 21 | +model.future_predictor.return_past_too=true 22 | +model.future_predictor.future_pred_loss={_target_: torch.nn.MSELoss} 23 | +model.future_predictor.future_pred_loss_wt=1.0 24 | +model.future_predictor.avg_last_n=1 25 | model.classifier_on_past=true 26 | 27 | 28 | opt.lr_wd=[[__all__,0.0001,0.000001]] 29 | opt.bias_bn_wd_scale=1.0 30 | opt.optimizer.nesterov=true 31 | 32 | data_train.num_frames=10 33 | data_train.frame_rate=1 34 | data_train.subclips.num_frames=1 35 | data_train.subclips.stride=1 36 | data_eval.num_frames=${data_train.num_frames} 37 | data_eval.frame_rate=${data_train.frame_rate} 38 | data_eval.subclips.num_frames=${data_train.subclips.num_frames} 39 | data_eval.subclips.stride=${data_train.subclips.stride} 40 | data_train.mean=[0.5, 0.5, 0.5] 41 | data_train.std=[0.5, 0.5, 0.5] 42 | data_eval.mean=${data_train.mean} 43 | data_eval.std=${data_train.std} 44 | data_eval.eval_num_crops=3 45 | data_eval.eval_flip_crops=true 46 | 47 | opt/optimizer=sgd 48 | opt/scheduler=cosine 49 | opt.warmup.num_epochs=20 50 | opt.scheduler.num_epochs=30 51 | 52 | dataset@dataset_train=epic_kitchens100/anticipation_train 53 | dataset@dataset_eval=epic_kitchens100/anticipation_test 54 | 55 | dataset_train.sample_strategy=last_clip 56 | dataset_eval.sample_strategy=last_clip 57 | 58 | dataset_train.conv_to_anticipate_fn.tau_a=1 59 | dataset_train.conv_to_anticipate_fn.tau_o=10 60 | dataset_eval.conv_to_anticipate_fn.tau_a=1 61 | dataset_eval.conv_to_anticipate_fn.tau_o=10 62 | 63 | dataset.epic_kitchens100.common.label_type=action 64 | 65 | +dataset_train.conv_to_anticipate_fn.drop_style=correct 66 | +dataset_eval.conv_to_anticipate_fn.drop_style=correct 67 | 68 | data_train.scale_h=248-280 69 | data_train.scale_w=-1 70 | data_train.crop_size=224 71 | data_eval.scale_h=248 72 | 
data_eval.scale_w=-1 73 | data_eval.crop_size=224 74 | 75 | hydra.launcher.nodes=4 76 | hydra.launcher.gpus_per_node=8 77 | -------------------------------------------------------------------------------- /expts/01_ek100_avt_test_trainval.txt: -------------------------------------------------------------------------------- 1 | train.train_one_epoch_fn.loss_wts.feat=1.0 2 | train.train_one_epoch_fn.loss_wts.past_cls_action=1.0 3 | train.init_from_model=[[backbone.model,${cwd}/DATA/pretrained/TIMM/jx_vit_base_patch16_224_in21k-e5005f0a.pth]] 4 | 5 | train.batch_size=3 6 | eval.batch_size=3 7 | train.num_epochs=100 8 | 9 | model/backbone=avt_b_in21k 10 | model.backbone_last_n_modules_to_drop=0 11 | model.backbone_dim=768 12 | model/temporal_aggregator=identity 13 | model/future_predictor=avth 14 | model.dropout=0.2 15 | +model.future_predictor.n_head=4 16 | +model.future_predictor.n_layer=6 17 | +model.future_predictor.output_len=1 18 | +model.future_predictor.inter_dim=2048 19 | +model.future_predictor.return_past_too=true 20 | +model.future_predictor.future_pred_loss={_target_: torch.nn.MSELoss} 21 | +model.future_predictor.future_pred_loss_wt=1.0 22 | +model.future_predictor.avg_last_n=1 23 | model.classifier_on_past=true 24 | 25 | 26 | opt.lr_wd=[[__all__,0.0001,0.000001]] 27 | opt.bias_bn_wd_scale=1.0 28 | opt.optimizer.nesterov=true 29 | 30 | data_train.num_frames=10 31 | data_train.frame_rate=1 32 | data_train.subclips.num_frames=1 33 | data_train.subclips.stride=1 34 | data_eval.num_frames=${data_train.num_frames} 35 | data_eval.frame_rate=${data_train.frame_rate} 36 | data_eval.subclips.num_frames=${data_train.subclips.num_frames} 37 | data_eval.subclips.stride=${data_train.subclips.stride} 38 | data_train.mean=[0.5, 0.5, 0.5] 39 | data_train.std=[0.5, 0.5, 0.5] 40 | data_eval.mean=${data_train.mean} 41 | data_eval.std=${data_train.std} 42 | data_eval.eval_num_crops=3 43 | data_eval.eval_flip_crops=true 44 | 45 | opt/optimizer=sgd 46 | opt/scheduler=cosine 47 | opt.warmup.num_epochs=20 48 | opt.scheduler.num_epochs=30 49 | 50 | dataset@dataset_train=epic_kitchens100/anticipation_train+val 51 | dataset@dataset_eval=epic_kitchens100/anticipation_test 52 | 53 | dataset_train.sample_strategy=last_clip 54 | dataset_eval.sample_strategy=last_clip 55 | 56 | dataset_train.conv_to_anticipate_fn.tau_a=1 57 | dataset_train.conv_to_anticipate_fn.tau_o=10 58 | dataset_eval.conv_to_anticipate_fn.tau_a=1 59 | dataset_eval.conv_to_anticipate_fn.tau_o=10 60 | 61 | dataset.epic_kitchens100.common.label_type=action 62 | 63 | +dataset_train.conv_to_anticipate_fn.drop_style=correct 64 | +dataset_eval.conv_to_anticipate_fn.drop_style=correct 65 | 66 | data_train.scale_h=248-280 67 | data_train.scale_w=-1 68 | data_train.crop_size=224 69 | data_eval.scale_h=248 70 | data_eval.scale_w=-1 71 | data_eval.crop_size=224 72 | 73 | hydra.launcher.nodes=4 74 | hydra.launcher.gpus_per_node=8 75 | -------------------------------------------------------------------------------- /expts/12_egtea_avt.txt: -------------------------------------------------------------------------------- 1 | train.train_one_epoch_fn.loss_wts.feat=1.0 2 | train.train_one_epoch_fn.loss_wts.past_cls_action=0.1 3 | train.init_from_model=[[backbone.model,${cwd}/DATA/pretrained/TIMM/jx_vit_base_p16_224-80ecf9dd.pth]] 4 | 5 | train.batch_size=3 6 | eval.batch_size=3 7 | train.num_epochs=10 8 | 9 | model/backbone=avt_b 10 | model.backbone_last_n_modules_to_drop=0 11 | model.backbone_dim=768 12 | model/temporal_aggregator=identity 13 | 
model/future_predictor=avth 14 | model.dropout=0.2 15 | +model.future_predictor.n_head=4 16 | +model.future_predictor.n_layer=2 17 | +model.future_predictor.output_len=1 18 | # +model.future_predictor.avg_last_n=1 19 | +model.future_predictor.inter_dim=2048 20 | +model.future_predictor.return_past_too=true 21 | +model.future_predictor.future_pred_loss={_target_: torch.nn.MSELoss} 22 | +model.future_predictor.future_pred_loss_wt=1.0 23 | +model.future_predictor.avg_last_n=1 24 | model.classifier_on_past=true 25 | 26 | 27 | opt.lr_wd=[[__all__,0.0005,0.000001]] 28 | opt.bias_bn_wd_scale=1.0 29 | opt.optimizer.nesterov=true 30 | 31 | data_train.num_frames=10 32 | data_train.frame_rate=1 33 | data_train.subclips.num_frames=1 34 | data_train.subclips.stride=1 35 | data_eval.num_frames=${data_train.num_frames} 36 | data_eval.frame_rate=${data_train.frame_rate} 37 | data_eval.subclips.num_frames=${data_train.subclips.num_frames} 38 | data_eval.subclips.stride=${data_train.subclips.stride} 39 | data_train.mean=[0.5, 0.5, 0.5] 40 | data_train.std=[0.5, 0.5, 0.5] 41 | data_eval.mean=${data_train.mean} 42 | data_eval.std=${data_train.std} 43 | data_eval.eval_num_crops=3 44 | data_eval.eval_flip_crops=true 45 | 46 | opt/optimizer=sgd 47 | opt/scheduler=cosine 48 | opt.warmup.num_epochs=5 49 | opt.scheduler.num_epochs=5 50 | 51 | dataset@dataset_train=egtea/anticipation_train 52 | dataset@dataset_eval=egtea/anticipation_val 53 | dataset_train.sample_strategy=last_clip 54 | dataset_eval.sample_strategy=last_clip 55 | dataset_train.conv_to_anticipate_fn.tau_a=0.5 56 | dataset_train.conv_to_anticipate_fn.tau_o=10 57 | dataset_eval.conv_to_anticipate_fn.tau_a=0.5 58 | dataset_eval.conv_to_anticipate_fn.tau_o=10 59 | dataset.egtea.common.label_type=action 60 | dataset.egtea.common.split=1 61 | dataset.egtea.common.modality=rgb 62 | 63 | # Remove the RULSTM reader and read from the frames 64 | ~dataset_train.reader_fn 65 | +dataset_train.reader_fn={_target_: datasets.reader_fns.DefaultReader} 66 | ~dataset_eval.reader_fn 67 | +dataset_eval.reader_fn={_target_: datasets.reader_fns.DefaultReader} 68 | 69 | +dataset_train.conv_to_anticipate_fn.drop_style=correct 70 | +dataset_eval.conv_to_anticipate_fn.drop_style=correct 71 | 72 | data_train.scale_h=248-280 73 | data_train.scale_w=-1 74 | data_train.crop_size=224 75 | data_eval.scale_h=248 76 | data_eval.scale_w=-1 77 | data_eval.crop_size=224 78 | 79 | hydra.launcher.nodes=4 80 | hydra.launcher.gpus_per_node=8 81 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to make participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies within all project spaces, and it also applies when 49 | an individual is representing the project or its community in public spaces. 50 | Examples of representing a project or community include using an official 51 | project e-mail address, posting via an official social media account, or acting 52 | as an appointed representative at an online or offline event. Representation of 53 | a project may be further defined and clarified by project maintainers. 54 | 55 | This Code of Conduct also applies outside the project spaces when there is a 56 | reasonable belief that an individual's behavior may have a negative impact on 57 | the project or its community. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported by contacting the project team at . All 63 | complaints will be reviewed and investigated and will result in a response that 64 | is deemed necessary and appropriate to the circumstances. The project team is 65 | obligated to maintain confidentiality with regard to the reporter of an incident. 66 | Further details of specific enforcement policies may be posted separately. 67 | 68 | Project maintainers who do not follow or enforce the Code of Conduct in good 69 | faith may face temporary or permanent repercussions as determined by other 70 | members of the project's leadership. 
71 | 72 | ## Attribution 73 | 74 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 75 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 76 | 77 | [homepage]: https://www.contributor-covenant.org 78 | 79 | For answers to common questions about this code of conduct, see 80 | https://www.contributor-covenant.org/faq 81 | -------------------------------------------------------------------------------- /common/sampler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | import math 4 | import torch 5 | from torch.utils.data import Sampler 6 | import torch.distributed as dist 7 | import torchvision.datasets.video_utils 8 | 9 | 10 | class DistributedSampler(Sampler): 11 | """ 12 | Extension of DistributedSampler, as discussed in 13 | https://github.com/pytorch/pytorch/issues/23430 14 | """ 15 | 16 | def __init__(self, dataset, num_replicas=None, rank=None, shuffle=False): 17 | if num_replicas is None: 18 | if not dist.is_available(): 19 | raise RuntimeError("Requires distributed package to be available") 20 | num_replicas = dist.get_world_size() 21 | if rank is None: 22 | if not dist.is_available(): 23 | raise RuntimeError("Requires distributed package to be available") 24 | rank = dist.get_rank() 25 | self.dataset = dataset 26 | self.num_replicas = num_replicas 27 | self.rank = rank 28 | self.epoch = 0 29 | self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) 30 | self.total_size = self.num_samples * self.num_replicas 31 | self.shuffle = shuffle 32 | 33 | def __iter__(self): 34 | # deterministically shuffle based on epoch 35 | g = torch.Generator() 36 | g.manual_seed(self.epoch) 37 | if self.shuffle: 38 | indices = torch.randperm(len(self.dataset), generator=g).tolist() 39 | else: 40 | indices = list(range(len(self.dataset))) 41 | 42 | # add extra samples to make it evenly divisible 43 | indices += indices[:(self.total_size - len(indices))] 44 | assert len(indices) == self.total_size 45 | 46 | # subsample 47 | indices = indices[self.rank:self.total_size:self.num_replicas] 48 | assert len(indices) == self.num_samples 49 | 50 | if isinstance(self.dataset, Sampler): 51 | orig_indices = list(iter(self.dataset)) 52 | indices = [orig_indices[i] for i in indices] 53 | 54 | return iter(indices) 55 | 56 | def __len__(self): 57 | return self.num_samples 58 | 59 | def set_epoch(self, epoch): 60 | self.epoch = epoch 61 | 62 | 63 | class UniformClipSampler(torch.utils.data.Sampler): 64 | """ 65 | Samples at most `max_video_clips_per_video` clips for each video, equally spaced 66 | Arguments: 67 | video_clips (VideoClips): video clips to sample from 68 | max_clips_per_video (int): maximum number of clips to be sampled per video 69 | """ 70 | def __init__(self, video_clips, max_clips_per_video): 71 | if not isinstance(video_clips, torchvision.datasets.video_utils.VideoClips): 72 | raise TypeError("Expected video_clips to be an instance of VideoClips, " 73 | "got {}".format(type(video_clips))) 74 | self.video_clips = video_clips 75 | self.max_clips_per_video = max_clips_per_video 76 | 77 | def __iter__(self): 78 | idxs = [] 79 | s = 0 80 | # select at most max_clips_per_video for each video, uniformly spaced 81 | for c in self.video_clips.clips: 82 | length = len(c) 83 | step = max(length // self.max_clips_per_video, 1) 84 | sampled = torch.arange(length)[::step] + s 85 | s += length 86 | idxs.append(sampled) 87 | idxs 
= torch.cat(idxs).tolist() 88 | return iter(idxs) 89 | 90 | def __len__(self): 91 | return sum(min(len(c), self.max_clips_per_video) for c in self.video_clips.clips) 92 | 93 | 94 | class RandomClipSampler(torch.utils.data.Sampler): 95 | """ 96 | Samples at most `max_video_clips_per_video` clips for each video randomly 97 | 98 | Arguments: 99 | video_clips (VideoClips): video clips to sample from 100 | max_clips_per_video (int): maximum number of clips to be sampled per video 101 | """ 102 | def __init__(self, video_clips, max_clips_per_video): 103 | if not isinstance(video_clips, torchvision.datasets.video_utils.VideoClips): 104 | raise TypeError("Expected video_clips to be an instance of VideoClips, " 105 | "got {}".format(type(video_clips))) 106 | self.video_clips = video_clips 107 | self.max_clips_per_video = max_clips_per_video 108 | 109 | def __iter__(self): 110 | idxs = [] 111 | s = 0 112 | # select at most max_clips_per_video for each video, randomly 113 | for c in self.video_clips.clips: 114 | length = len(c) 115 | size = min(length, self.max_clips_per_video) 116 | sampled = torch.randperm(length)[:size] + s 117 | s += length 118 | idxs.append(sampled) 119 | idxs = torch.cat(idxs) 120 | # shuffle all clips randomly 121 | perm = torch.randperm(len(idxs)) 122 | idxs = idxs[perm].tolist() 123 | return iter(idxs) 124 | 125 | def __len__(self): 126 | return sum(min(len(c), self.max_clips_per_video) for c in self.video_clips.clips) 127 | -------------------------------------------------------------------------------- /common/scheduler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | from typing import Sequence 4 | 5 | import torch 6 | from bisect import bisect_right 7 | 8 | 9 | class WarmupMultiStepLR(torch.optim.lr_scheduler._LRScheduler): 10 | def __init__( 11 | self, 12 | optimizer: torch.optim.Optimizer, 13 | milestone_epochs: Sequence[int], 14 | gamma: float = 0.1, 15 | warmup_factor: float = 1.0 / 3, 16 | warmup_epochs: int = 5, 17 | warmup_method: str = 'linear', 18 | last_epoch: int = -1, 19 | iters_per_epoch: int = None, # Must be set by calling code 20 | world_size: int = None, 21 | ): 22 | del world_size 23 | if not milestone_epochs == sorted(milestone_epochs): 24 | raise ValueError( 25 | "Milestones should be a list of" 26 | " increasing integers. 
Got {}", 27 | milestone_epochs, 28 | ) 29 | 30 | if warmup_method not in ("constant", "linear"): 31 | raise ValueError( 32 | "Only 'constant' or 'linear' warmup_method accepted" 33 | "got {}".format(warmup_method)) 34 | self.milestones = [iters_per_epoch * m for m in milestone_epochs] 35 | self.gamma = gamma 36 | self.warmup_factor = warmup_factor 37 | self.warmup_iters = max(warmup_epochs * iters_per_epoch, 1) 38 | 39 | self.warmup_method = warmup_method 40 | super(WarmupMultiStepLR, self).__init__(optimizer, last_epoch) 41 | 42 | def get_lr(self): 43 | warmup_factor = 1 44 | if self.last_epoch < self.warmup_iters: 45 | if self.warmup_method == "constant": 46 | warmup_factor = self.warmup_factor 47 | elif self.warmup_method == "linear": 48 | alpha = float(self.last_epoch) / self.warmup_iters 49 | warmup_factor = self.warmup_factor * (1 - alpha) + alpha 50 | return [ 51 | base_lr * warmup_factor * 52 | self.gamma**bisect_right(self.milestones, self.last_epoch) 53 | for base_lr in self.base_lrs 54 | ] 55 | 56 | 57 | class CosineLR(torch.optim.lr_scheduler.CosineAnnealingLR): 58 | def __init__(self, 59 | optimizer, 60 | num_epochs, 61 | iters_per_epoch=None, 62 | world_size=None, 63 | **kwargs): 64 | kwargs['eta_min'] *= world_size 65 | super().__init__(optimizer, 66 | T_max=num_epochs * iters_per_epoch, 67 | **kwargs) 68 | 69 | def get_lr(self, *args, **kwargs): 70 | if self.last_epoch < self.T_max: 71 | return super().get_lr(*args, **kwargs) 72 | else: 73 | # Adding this if I train the model longer than the T_max set in 74 | # this. Happens when I sweep over different amounts of warmup. 75 | return [0.0 for _ in self.optimizer.param_groups] 76 | 77 | 78 | class ReduceLROnPlateau(torch.optim.lr_scheduler.ReduceLROnPlateau): 79 | def __init__(self, 80 | optimizer, 81 | iters_per_epoch=None, 82 | world_size=None, 83 | **kwargs): 84 | del iters_per_epoch, world_size 85 | super().__init__(optimizer, **kwargs) 86 | 87 | 88 | class Warmup(torch.optim.lr_scheduler._LRScheduler): 89 | """Wrap the scheduler for warmup before it kicks in.""" 90 | def __init__( 91 | self, 92 | optimizer: torch.optim.Optimizer, 93 | scheduler: torch.optim.lr_scheduler._LRScheduler, 94 | init_lr_ratio: float = 0.0, 95 | num_epochs: int = 5, 96 | last_epoch: int = -1, 97 | iters_per_epoch: int = None, # Must be set by calling code 98 | world_size: int = None, 99 | ): 100 | """ 101 | Args: 102 | init_lr_ratio (float in [0, 1]): Ratio of the original LR to start 103 | from. If 0.1, it will start from 0.1 of the original LRs and go 104 | upto 1.0 of the original LRs in the epochs. By def start from 105 | 0 up. 106 | num_epochs (int): Num of epochs to take to warmup. 107 | last_epoch (int): Which was the last epoch to init from (not really 108 | used anymore since we store the state_dict when loading 109 | scheduler from disk.) 110 | """ 111 | del world_size 112 | self.base_scheduler = scheduler 113 | self.warmup_iters = max(num_epochs * iters_per_epoch, 1) 114 | if self.warmup_iters > 1: 115 | self.init_lr_ratio = init_lr_ratio 116 | else: 117 | self.init_lr_ratio = 1.0 # Don't go from 0 to 1 in 1 iteration 118 | super().__init__(optimizer, last_epoch) 119 | 120 | def get_lr(self): 121 | # Epoch is iters for me, since I step after each iteration 122 | # (not after each epoch) 123 | # Based on logic in step, this should only be called for the warmup 124 | # iters. 
After that it should go to the base scheduler 125 | assert self.last_epoch < self.warmup_iters # since it increments 126 | return [ 127 | el * (self.init_lr_ratio + (1 - self.init_lr_ratio) * 128 | (float(self.last_epoch) / self.warmup_iters)) 129 | for el in self.base_lrs 130 | ] 131 | 132 | def step(self, *args, **kwargs): 133 | if self.last_epoch < (self.warmup_iters - 1): 134 | super().step(*args, **kwargs) 135 | else: 136 | self.base_scheduler.step(*args, **kwargs) 137 | 138 | def state_dict(self): 139 | """Returns the state of the scheduler as a :class:`dict`. 140 | 141 | It contains an entry for every variable in self.__dict__ which 142 | is not the optimizer. 143 | """ 144 | base_sched_dict = self.base_scheduler.state_dict() 145 | other_stuff = { 146 | key: value 147 | for key, value in self.__dict__.items() if key not in [ 148 | 'base_scheduler', 'optimizer'] 149 | } 150 | return {'base_sched_dict': base_sched_dict, 'other_stuff': other_stuff} 151 | 152 | def load_state_dict(self, state_dict): 153 | """Loads the schedulers state. 154 | 155 | Arguments: 156 | state_dict (dict): scheduler state. Should be an object returned 157 | from a call to :meth:`state_dict`. 158 | """ 159 | self.base_scheduler.__dict__.update(state_dict['base_sched_dict']) 160 | self.__dict__.update(state_dict['other_stuff']) 161 | -------------------------------------------------------------------------------- /docs/MODELS.md: -------------------------------------------------------------------------------- 1 | 2 | ## EPIC-Kitchens-100 Test/challenge submission 3 | 4 | Any of the models can be trained/tested on train+val/test by changing the 5 | `dataset@dataset_train` and `dataset@dataset_eval` fields in the configs. 6 | Here we provide the configs that were used for the challenge submission. 
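For example (the override names below are copied from the provided `expts/01_ek100_avt_test_trainval.txt`, shown only to illustrate the pattern), switching an EK100 experiment to train on train+val and evaluate on the test split amounts to setting:

```
dataset@dataset_train=epic_kitchens100/anticipation_train+val
dataset@dataset_eval=epic_kitchens100/anticipation_test
```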
7 | 8 | | Backbone | Head | Train data | Config | Model | 9 | |----------|------|--------|-------|-------| 10 | | TSN (RGB) | RULSTM | train | `expts/05_ek100_rustm_test_testonly.txt` | [link](https://iplab.dmi.unict.it/sharing/rulstm/ek100_models/RULSTM-anticipation_0.25_6_8_rgb_mt5r_best.pth.tar) | 11 | | TSN (RGB) | AVT-h | train | `expts/02_ek100_avt_tsn_test_testonly.txt` | [link](https://dl.fbaipublicfiles.com/avt/checkpoints/expts/02_ek100_avt_tsn.txt/0/checkpoint.pth) | 12 | | TSN (RGB) | AVT-h | train + val | `expts/02_ek100_avt_tsn_test_trainval.txt` | [link](https://dl.fbaipublicfiles.com/avt/checkpoints/expts/02_ek100_avt_tsn_test_trainval.txt/0/checkpoint.pth) | 13 | | irCSN-152 (IG65M) | AVT-h | train | `expts/04_ek100_avt_ig65m_test_testonly.txt` | [link](https://dl.fbaipublicfiles.com/avt/checkpoints/expts/04_ek100_avt_ig65m.txt/0/checkpoint.pth) | 14 | | irCSN-152 (IG65M) | AVT-h | train + val | `expts/04_ek100_avt_ig65m_test_trainval.txt` | [link](https://dl.fbaipublicfiles.com/avt/checkpoints/expts/04_ek100_avt_ig65m_test_trainval.txt/0/checkpoint.pth) | 15 | | AVT-b (RGB) | AVT-h | train | `expts/01_ek100_avt_test_testonly.txt` | [link](https://dl.fbaipublicfiles.com/avt/checkpoints/expts/01_ek100_avt.txt/0/checkpoint.pth) | 16 | | AVT-b (RGB) | AVT-h | train + val | `expts/01_ek100_avt_test_trainval.txt` | [link](https://dl.fbaipublicfiles.com/avt/checkpoints/expts/01_ek100_avt_test_trainval.txt/0/checkpoint.pth) | 17 | | TSN (Flow) | AVT-h | train | `expts/06_ek100_avt_tsnflow_test_testonly.txt` | [link](https://dl.fbaipublicfiles.com/avt/checkpoints/expts/06_ek100_avt_tsnflow.txt/0/checkpoint.pth) | 18 | | TSN (Flow) | AVT-h | train + val | `expts/06_ek100_avt_tsnflow_test_trainval.txt` | [link](https://dl.fbaipublicfiles.com/avt/checkpoints/expts/06_ek100_avt_tsnflow_test_trainval.txt/0/checkpoint.pth) | 19 | | TSN (Obj) | AVT-h | train + val | `expts/03_ek100_avt_tsn_obj_test_trainval.txt` | [link](https://dl.fbaipublicfiles.com/avt/checkpoints/expts/03_ek100_avt_tsn_obj_test_trainval.txt/0/checkpoint.pth) | 20 | | AVT-b (RGB, longer) | AVT-h | train | `expts/07_ek100_avt_longer_test_testonly.txt` | [link](https://dl.fbaipublicfiles.com/avt/checkpoints/expts/07_ek100_avt_longer.txt/0/checkpoint.pth) | 21 | | AVT-b (RGB, longer) | AVT-h | train + val | `expts/07_ek100_avt_longer_test_trainval.txt` | [link](https://dl.fbaipublicfiles.com/avt/checkpoints/expts/07_ek100_avt_longer_test_trainval.txt/0/checkpoint.pth) | 22 | 23 | 24 | 25 | The predictions from all the above models were late fused and submitted 26 | for evaluation using the following script: 27 | 28 | 29 | ```python 30 | from notebooks.utils import * 31 | CFG_FILES = [ 32 | # RULSTM 33 | ('expts/05_ek100_rustm_test_testonly.txt', 0), 34 | # TSN + AVT-h (train and train+val models) 35 | ('expts/02_ek100_avt_tsn_test_testonly.txt', 0), 36 | ('expts/02_ek100_avt_tsn_test_trainval.txt', 0), 37 | # irCSN152/IG65M + AVT-h 38 | ('expts/04_ek100_avt_ig65m_test_testonly.txt', 0), 39 | ('expts/04_ek100_avt_ig65m_test_trainval.txt', 0), 40 | # AVT 41 | ('expts/01_ek100_avt_test_testonly.txt', 0), 42 | ('expts/01_ek100_avt_test_trainval.txt', 0), 43 | # Flow, obj AVT 44 | ('expts/06_ek100_avt_tsnflow_test_testonly.txt', 0), 45 | ('expts/06_ek100_avt_tsnflow_test_trainval.txt', 0), 46 | ('expts/03_ek100_avt_tsn_obj_test_trainval.txt', 0), 47 | # Longer AVT 48 | ('expts/07_ek100_avt_longer_test_testonly.txt', 0), 49 | ('expts/07_ek100_avt_longer_test_trainval.txt', 0), 50 | 51 | ] 52 | WTS = [1.0, # RULSTM 53 | # TSN + 
AVT-h 54 | 1.0, 1.0, 55 | # irCSN152/IG65M + AVT-h 56 | 1.0, 1.0, 57 | # AVT 58 | 0.5, 0.5, 59 | # Flow, obj AVT 60 | 0.5, 0.5, 0.5, 61 | # Longer AVT 62 | 1.5, 1.5] 63 | SLS = [2, 4, 4] 64 | 65 | package_results_for_submission_ek100(CFG_FILES, WTS, SLS) 66 | ``` 67 | 68 | It should obtain 16.74 on the challenge leaderboard. We also provide our 69 | final submission file [here](https://dl.fbaipublicfiles.com/avt/challenge_submissions/ek100.zip). 70 | 71 | ## EPIC-Kitchens-55 72 | 73 | | Backbone | Head | Top-1 | Top-5 | Config (for top-1/5) | Model (for top-1/5) | AR5 | Config (for AR5) | Model (for AR5) | 74 | |----------|------|------|--------|--------|-----|-----|-----|----| 75 | | TSN (RGB) | AVT-h | 13.1 | 28.1 | `expts/08_ek55_avt_tsn.txt` | [link](https://dl.fbaipublicfiles.com/avt/checkpoints/expts/08_ek55_avt_tsn.txt/0/checkpoint.pth)| 13.5 | `expts/08_ek55_avt_tsn_forAR.txt` | [link](https://dl.fbaipublicfiles.com/avt/checkpoints/expts/08_ek55_avt_tsn_forAR.txt/0/checkpoint.pth) | 76 | | AVT-b | AVT-h | 12.5 | 30.1 | `expts/09_ek55_avt.txt` | [link](https://dl.fbaipublicfiles.com/avt/checkpoints/expts/09_ek55_avt.txt/0/checkpoint.pth)| 13.6 | `expts/09_ek55_avt_forAR.txt` | [link](https://dl.fbaipublicfiles.com/avt/checkpoints/expts/09_ek55_avt_forAR.txt/0/checkpoint.pth) | 77 | | irCSN-152 (IG65M) | AVT-h | 14.4 | 31.7 | `expts/10_ek55_avt_ig65m.txt` | [link](https://dl.fbaipublicfiles.com/avt/checkpoints/expts/10_ek55_avt_ig65m.txt/0/checkpoint.pth)| 13.2 | `expts/10_ek55_avt_ig65m_forAR.txt` | [link](https://dl.fbaipublicfiles.com/avt/checkpoints/expts/10_ek55_avt_ig65m_forAR.txt/0/checkpoint.pth) | 78 | 79 | Our final test submission was generated by late-fusing AVT model with predictions from [prior work](https://arxiv.org/abs/2006.00830), and is available [here](https://dl.fbaipublicfiles.com/avt/challenge_submissions/ek55.zip). 80 | 81 | ## EGTEA Gaze+ 82 | 83 | | Backbone | Head | Top-1 (Act) | Class-mean (Act) | Config | Model | 84 | |----------|------|-------------|------------------|-------|-------| 85 | | TSN (RGB) | AVT-h | 39.8 | 28.3 | `expts/11_egtea_avt_tsn.txt` | [link](https://dl.fbaipublicfiles.com/avt/checkpoints/expts/11_egtea_avt_tsn.txt/0/checkpoint.pth) | 86 | | AVT-b | AVT-h | 43.0 | 35.2 | `expts/12_egtea_avt.txt` | [link](https://dl.fbaipublicfiles.com/avt/checkpoints/expts/12_egtea_avt.txt/0/checkpoint.pth) | 87 | 88 | 89 | ## 50-Salads 90 | 91 | | Backbone | Head | Top-1 (Act) | Config | Model | 92 | |----------|------|-------------|-------|-------| 93 | | AVT-b | AVT-h | 48.0 | `expts/13_50s_avt.txt` | [fold 1](https://dl.fbaipublicfiles.com/avt/checkpoints/expts/13_50s_avt.txt/0/checkpoint.pth) [fold 2](https://dl.fbaipublicfiles.com/avt/checkpoints/expts/13_50s_avt.txt/1/checkpoint.pth) [fold 3](https://dl.fbaipublicfiles.com/avt/checkpoints/expts/13_50s_avt.txt/2/checkpoint.pth) [fold 4](https://dl.fbaipublicfiles.com/avt/checkpoints/expts/13_50s_avt.txt/3/checkpoint.pth) [fold 5](https://dl.fbaipublicfiles.com/avt/checkpoints/expts/13_50s_avt.txt/4/checkpoint.pth) | 94 | -------------------------------------------------------------------------------- /common/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
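# Illustrative usage sketch for the accuracy() helper defined below (shapes are
# hypothetical; leading dims are flattened internally, so 3D+ predictions work):
#   logits = torch.randn(4, 7, 100)          # (*, K) predictions, e.g. B x T x num_classes
#   labels = torch.randint(0, 100, (4, 7))   # (*,) integer targets
#   top1, top5 = accuracy(logits, labels, topk=(1, 5))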
2 | 3 | from __future__ import print_function 4 | from typing import List, Dict 5 | 6 | import errno 7 | import os 8 | from pathlib import Path 9 | import logging 10 | import submitit 11 | import cv2 12 | 13 | import torch 14 | import torch.distributed as dist 15 | 16 | 17 | def accuracy(output, target, topk=(1, )): 18 | """Computes the accuracy over the k top predictions 19 | for the specified values of k 20 | Args: 21 | output (*, K) predictions 22 | target (*, ) targets 23 | """ 24 | if torch.all(target < 0): 25 | return [ 26 | torch.zeros([], device=output.device) for _ in range(len(topk)) 27 | ] 28 | with torch.no_grad(): 29 | # flatten the initial dimensions, to deal with 3D+ input 30 | output = output.flatten(0, -2) 31 | target = target.flatten() 32 | # Now compute the accuracy 33 | maxk = max(topk) 34 | batch_size = target.size(0) 35 | 36 | _, pred = output.topk(maxk, 1, True, True) 37 | pred = pred.t() 38 | correct = pred.eq(target[None]) 39 | 40 | res = [] 41 | for k in topk: 42 | correct_k = correct[:k].flatten().sum(dtype=torch.float32) 43 | res.append(correct_k * (100.0 / batch_size)) 44 | return res 45 | 46 | 47 | def mkdir(path): 48 | try: 49 | os.makedirs(path) 50 | except OSError as e: 51 | if e.errno != errno.EEXIST: 52 | raise 53 | 54 | 55 | def setup_for_distributed(is_master, logger): 56 | """ 57 | This function disables printing when not in master process 58 | """ 59 | import builtins as __builtin__ 60 | builtin_print = __builtin__.print 61 | 62 | def print(*args, **kwargs): 63 | force = kwargs.pop('force', False) 64 | if is_master or force: 65 | builtin_print(*args, **kwargs) 66 | 67 | __builtin__.print = print 68 | if not is_master: 69 | # Don't print anything except FATAL 70 | logger.setLevel(logging.ERROR) 71 | logging.basicConfig(level=logging.ERROR) 72 | else: 73 | logger.setLevel(logging.INFO) 74 | logging.basicConfig(level=logging.INFO) 75 | 76 | 77 | def is_dist_avail_and_initialized(): 78 | if not dist.is_available(): 79 | return False 80 | if not dist.is_initialized(): 81 | return False 82 | return True 83 | 84 | 85 | def get_world_size(): 86 | if not is_dist_avail_and_initialized(): 87 | return 1 88 | return dist.get_world_size() 89 | 90 | 91 | def get_rank(): 92 | if not is_dist_avail_and_initialized(): 93 | return 0 94 | return dist.get_rank() 95 | 96 | 97 | def is_main_process(): 98 | return get_rank() == 0 99 | 100 | 101 | def save_on_master(*args, **kwargs): 102 | if is_main_process(): 103 | torch.save(*args, **kwargs) 104 | 105 | 106 | def init_distributed_mode(logger, dist_backend='nccl'): 107 | dist_info = dict( 108 | distributed=False, 109 | rank=0, 110 | world_size=1, 111 | gpu=0, 112 | dist_backend=dist_backend, 113 | dist_url=get_init_file(None).as_uri(), 114 | ) 115 | # If launched using submitit, get the job_env and set using those 116 | try: 117 | job_env = submitit.JobEnvironment() 118 | except RuntimeError: 119 | job_env = None 120 | if job_env is not None: 121 | dist_info['rank'] = job_env.global_rank 122 | dist_info['world_size'] = job_env.num_tasks 123 | dist_info['gpu'] = job_env.local_rank 124 | if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ: 125 | dist_info['rank'] = int(os.environ["RANK"]) 126 | dist_info['world_size'] = int(os.environ['WORLD_SIZE']) 127 | dist_info['gpu'] = int(os.environ['LOCAL_RANK']) 128 | elif 'SLURM_PROCID' in os.environ: 129 | dist_info['rank'] = int(os.environ['SLURM_PROCID']) 130 | dist_info['gpu'] = dist_info['rank'] % torch.cuda.device_count() 131 | elif 'rank' in dist_info: 132 | pass 133 | else: 
134 | print('Not using distributed mode') 135 | dist_info['distributed'] = False 136 | return dist_info 137 | 138 | dist_info['distributed'] = True 139 | 140 | torch.cuda.set_device(dist_info['gpu']) 141 | dist_info['dist_backend'] = dist_backend 142 | print('| distributed init (rank {}): {}'.format(dist_info['rank'], 143 | dist_info['dist_url']), 144 | flush=True) 145 | torch.distributed.init_process_group(backend=dist_info['dist_backend'], 146 | init_method=dist_info['dist_url'], 147 | world_size=dist_info['world_size'], 148 | rank=dist_info['rank']) 149 | setup_for_distributed(dist_info['rank'] == 0, logger) 150 | return dist_info 151 | 152 | 153 | def get_shared_folder(name) -> Path: 154 | # Since using hydra, which figures the out folder 155 | return Path('./').absolute() 156 | 157 | 158 | def get_init_file(name): 159 | # Init file must not exist, but it's parent dir must exist. 160 | os.makedirs(str(get_shared_folder(name)), exist_ok=True) 161 | init_file = get_shared_folder(name) / 'sync_file_init' 162 | return init_file 163 | 164 | 165 | def gather_tensors_from_all(tensor: torch.Tensor) -> List[torch.Tensor]: 166 | """ 167 | Wrapper over torch.distributed.all_gather for performing 168 | 'gather' of 'tensor' over all processes in both distributed / 169 | non-distributed scenarios. 170 | """ 171 | if tensor.ndim == 0: 172 | # 0 dim tensors cannot be gathered. so unsqueeze 173 | tensor = tensor.unsqueeze(0) 174 | 175 | if is_dist_avail_and_initialized(): 176 | gathered_tensors = [ 177 | torch.zeros_like(tensor) 178 | for _ in range(torch.distributed.get_world_size()) 179 | ] 180 | torch.distributed.all_gather(gathered_tensors, tensor) 181 | else: 182 | gathered_tensors = [tensor] 183 | 184 | return gathered_tensors 185 | 186 | 187 | def gather_from_all(tensor: torch.Tensor) -> torch.Tensor: 188 | gathered_tensors = gather_tensors_from_all(tensor) 189 | gathered_tensor = torch.cat(gathered_tensors, 0) 190 | return gathered_tensor 191 | 192 | 193 | def get_video_info(video_path: Path, props: List[str]) -> Dict[str, float]: 194 | """ 195 | Given the video, return the properties asked for 196 | """ 197 | output = {} 198 | cam = cv2.VideoCapture(str(video_path)) 199 | if 'fps' in props: 200 | output['fps'] = cam.get(cv2.CAP_PROP_FPS) 201 | if 'len' in props: 202 | fps = cam.get(cv2.CAP_PROP_FPS) 203 | if fps <= 0: 204 | output['len'] = 0 205 | else: 206 | output['len'] = (cam.get(cv2.CAP_PROP_FRAME_COUNT) / fps) 207 | cam.release() 208 | return output 209 | -------------------------------------------------------------------------------- /loss_fn/simclr_infonce.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
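# Illustrative usage sketch for the losses defined below (shapes and values are
# hypothetical; the B x K x C target form is the MIL-NCE case, with K positive
# targets per batch element):
#   loss_fn = DistributedSimclrInfoNCELoss(temperature=0.1)
#   output = torch.randn(8, 128)      # B x C predicted features
#   target = torch.randn(8, 4, 128)   # B x K x C positive target features
#   loss = loss_fn(output, target)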
2 | 3 | """The SimCLR InfoNCE loss.""" 4 | import torch 5 | import torch.nn as nn 6 | 7 | from common import utils 8 | 9 | LARGE_NUM = 1e9 10 | 11 | 12 | class MILCrossEntropyLoss(nn.Module): 13 | def __init__(self, mil_type='sum', reduction='mean'): 14 | super().__init__() 15 | self.mil_type = mil_type 16 | self.reduction = reduction 17 | 18 | def forward(self, *args, **kwargs): 19 | if self.mil_type == 'sum': 20 | return self.forward_sum(*args, **kwargs) 21 | elif self.mil_type == 'max': 22 | return self.forward_max(*args, **kwargs) 23 | else: 24 | raise NotImplementedError(f'Unknown type {self.mil_type}') 25 | 26 | def forward_sum(self, pred, labels_onehot): 27 | """ 28 | Args: 29 | pred: BxC is the output 30 | labels: BxC is 1s for positive, and 0s for negatives 31 | Based on https://github.com/antoine77340/MIL-NCE_HowTo100M/blob/master/loss.py 32 | Or the MIL-NCE paper Eq 1 (https://arxiv.org/pdf/1912.06430.pdf) 33 | """ 34 | assert pred.shape == labels_onehot.shape 35 | # In the MILNCE code there is a sum, followed by logsumexp. I think 36 | # using the labels to select the positive samples and then doing 37 | # logsumexp will have the same effect. 38 | pos_pred = pred[labels_onehot.bool()].reshape((pred.size(0), -1)) 39 | numerator = torch.logsumexp(pos_pred, dim=1) 40 | denominotor = torch.logsumexp(pred, dim=1) 41 | loss = denominotor - numerator 42 | if self.reduction == 'mean': 43 | loss = torch.mean(loss) 44 | elif self.reduction == 'none': 45 | pass 46 | else: 47 | raise NotImplementedError(f'Unknown reduction {self.reduction}') 48 | return loss 49 | 50 | def forward_max(self, pred, labels_onehot): 51 | """ 52 | Args: 53 | pred: BxC is the output 54 | labels: BxC is 1s for positive, and 0s for negatives 55 | Based on Appendix A (https://arxiv.org/pdf/1912.06430.pdf) 56 | """ 57 | assert pred.shape == labels_onehot.shape 58 | # Do max before, and then logsumexp. Works since exp is monotonic fn 59 | # so the max with exp or without will be the same. 60 | pos_pred = pred[labels_onehot.bool()].reshape((pred.size(0), -1)) 61 | pos_pred = torch.max(pos_pred, dim=1, keepdim=True)[0] 62 | neg_pred = pred[~labels_onehot.bool()].reshape((pred.size(0), -1)) 63 | numerator = torch.logsumexp(pos_pred, dim=1) 64 | denominotor = torch.logsumexp(torch.cat([pos_pred, neg_pred], dim=1), 65 | dim=1) 66 | return torch.mean(denominotor - numerator) 67 | 68 | 69 | class DistributedSimclrInfoNCELoss(nn.Module): 70 | def __init__(self, 71 | temperature: float = 0.1, 72 | target_to_output_loss=True, 73 | mil_type='sum', 74 | reduction='mean'): 75 | super().__init__() 76 | self.temperature = temperature 77 | self.criterion = MILCrossEntropyLoss(mil_type, reduction=reduction) 78 | # This defines whether the reverse part of the loss, from target to 79 | # the output features, is incurred. 80 | self.target_to_output_loss = target_to_output_loss 81 | 82 | def forward(self, output: torch.Tensor, 83 | target: torch.Tensor) -> torch.Tensor: 84 | """ 85 | Args: 86 | output: BxC 87 | target: BxC or BxKxC <-- In case of MIL NCE, K is the number of 88 | positives for each batch element. 
89 | Following https://github.com/google-research/simclr/blob/master/objective.py 90 | """ 91 | # Normalize first, before the gather -- so that all the features I get 92 | # are normalized 93 | output = nn.functional.normalize(output, dim=-1, p=2) 94 | target = nn.functional.normalize(target, dim=-1, p=2) 95 | # To be consistent with MIL-NCE input, convert K to batch dim, 96 | # and repeat the output to same value for each repeated target 97 | elt_for_back_loss = 0 98 | if target.ndim == 3: 99 | num_matching = target.size(1) 100 | target_flat = target.reshape((-1, target.size(-1))) 101 | # Keep the first one for the back loss 102 | target = target[:, elt_for_back_loss] 103 | else: 104 | num_matching = 1 105 | target_flat = target 106 | # Gather all the outputs and all the targets 107 | output_all = self.gather_embeddings(output) 108 | target_flat_all = self.gather_embeddings(target_flat) 109 | batch_size = output.size(0) 110 | replica_id = utils.get_rank() 111 | # -> (B, B_full * num_matching) 112 | labels_onehot = torch.zeros((batch_size, output_all.size(0)), 113 | dtype=output.dtype, 114 | device=output.device) 115 | extra_zeros = torch.zeros((batch_size, output_all.size(0)), 116 | dtype=output.dtype, 117 | device=output.device) 118 | ones_diag = torch.eye(batch_size, 119 | batch_size, 120 | dtype=output.dtype, 121 | device=output.device) 122 | labels_onehot[:, replica_id * batch_size:(replica_id + 1) * 123 | batch_size] = ones_diag 124 | labels_onehot_interleaved = labels_onehot.repeat_interleave( 125 | num_matching, dim=1) 126 | # (B, C) * (B_full, C) -> (B, B_full) 127 | logits_aa = torch.mm(output, output_all.t() / self.temperature) 128 | # (B, C) * (B_full * num_matching, C) -> (B, B_full * num_matching) 129 | logits_ab = torch.mm(output, target_flat_all.t() / self.temperature) 130 | logits_aa = logits_aa - labels_onehot * LARGE_NUM 131 | loss = self.criterion( 132 | torch.cat([logits_ab, logits_aa], 1), 133 | torch.cat([labels_onehot_interleaved, extra_zeros], 1)) 134 | if self.target_to_output_loss: 135 | # Keep only the first prediction, since that is what I will incur 136 | # reverse loss with 137 | target_all = target_flat_all[elt_for_back_loss::num_matching] 138 | logits_bb = torch.mm(target, target_all.t() / self.temperature) 139 | logits_bb = logits_bb - labels_onehot * LARGE_NUM 140 | logits_ba = torch.mm(target, output_all.t() / self.temperature) 141 | loss = loss + self.criterion( 142 | torch.cat([logits_ba, logits_bb], 1), 143 | torch.cat([labels_onehot, extra_zeros], 1)) 144 | return loss 145 | 146 | def gather_embeddings(self, embedding: torch.Tensor) -> torch.Tensor: 147 | """ 148 | Do a gather over all embeddings, so we can compute the loss. 149 | Final shape is like: (batch_size * num_gpus) x embedding_dim 150 | """ 151 | if torch.distributed.is_available( 152 | ) and torch.distributed.is_initialized(): 153 | # gather all embeddings. 154 | embedding_gathered = utils.gather_from_all(embedding) 155 | else: 156 | embedding_gathered = embedding 157 | return embedding_gathered 158 | 159 | 160 | class MultiDimDistributedSimclrInfoNCELoss(DistributedSimclrInfoNCELoss): 161 | """ 162 | Fold in the initial dimensions and run simple NCE. 
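    For example (shapes illustrative): an output and target of shape (B, T, C)
    are flattened to (B*T, C) before the SimCLR/MIL-NCE loss above is applied.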
163 | """ 164 | def forward(self, output: torch.Tensor, target: torch.Tensor, *args, 165 | **kwargs) -> torch.Tensor: 166 | return super().forward(output.flatten(0, -2), target.flatten(0, -2), 167 | *args, **kwargs) 168 | -------------------------------------------------------------------------------- /conf/config.yaml: -------------------------------------------------------------------------------- 1 | expt_name: "default" 2 | # Just set to multiple values to run the same config multiple times. Just there 3 | # to take into account random variation 4 | run_id: 0 5 | seed: 42 6 | # A common place, so can be overriden in notebooks, which don't support ":" 7 | # interpolation 8 | cwd: ${hydra:runtime.cwd} 9 | 10 | sync_bn: false 11 | 12 | test_only: false 13 | 14 | # Set this to force data parallel training. Num nodes should be 1. 15 | data_parallel: false 16 | 17 | dist_backend: nccl 18 | 19 | pytorch: 20 | # This only works with the compiled version of torchvision, and might have 21 | # some memory issues? 22 | video_backend: "video_reader" 23 | 24 | train: 25 | fn: 'train' # Which file in func/ directory to use for training 26 | batch_size: 16 27 | # This can have structure as follows: 28 | # :: <>... 29 | # By default also supports just the 30 | # But the more complex structure can be used to init separate parts of model 31 | # using diff checkpoints. By default if only 2 elements are specified with :, 32 | # module_name_in_ckpt is assumed to be null 33 | init_from_model: null 34 | # Total epochs to train for 35 | num_epochs: 45 36 | # Evaluate within training, every these many epochs 37 | eval_freq: 1 38 | # Shuffle data at train time 39 | shuffle_data: true 40 | # Store the best performing checkpoint 41 | store_best: false 42 | train_one_epoch_fn: 43 | _target_: func.train.train_one_epoch 44 | print_freq: 10 45 | print_large_freq: 1000 # How often to write images/videos summary 46 | grad_clip_params: ${opt.grad_clip} # DO NOT CHANGE HERE, change in opt 47 | # Set the following to store models every so many epochs. By default 48 | # will only store the last checkpoint and the best checkpoint. 49 | save_freq: null 50 | # Num of minutes to save at, same as above -- must set save_intermediate 51 | # true to save like this 52 | save_freq_min: 60 # At least save every 60 mins 53 | # Whether or not to save the intermediate models 54 | save_intermediates: false 55 | loss_wts: 56 | cls_action: 1.0 57 | cls_verb: 1.0 58 | cls_noun: 1.0 59 | pred: 1.0 60 | feat: 1.0 61 | # Past predictions, default 0 to be backward compatible 62 | past_cls_action: 0.0 63 | past_cls_verb: 0.0 64 | past_cls_noun: 0.0 65 | 66 | 67 | eval: 68 | batch_size: null # Will automatically figure out from train if null 69 | eval_fn: 70 | _target_: func.train.evaluate 71 | store: true 72 | store_endpoint: logits 73 | only_run_featext: false 74 | 75 | model: 76 | backbone_dim: 2048 77 | # Use the backbone dim if null. Don't use the interpolation since the 78 | # backbone dim might be updated in the code 79 | intermediate_featdim: null 80 | backbone_last_n_modules_to_drop: 2 # Avg pool and linear layer 81 | dropout: 0.0 82 | # Set to a number to project the temp_agg and future features to this 83 | # dimension using a MLP before applying the NCE loss. 84 | # Note this is also applied when doing L2 regression loss, so the name is a 85 | # bit of a misnomer. 
86 | project_dim_for_nce: null 87 | # Set to true to also add a regression head -- that is used for dense 88 | # anticipation when predicting the duration of an action 89 | add_regression_head: False 90 | bn: 91 | eps: 0.001 92 | mom: 0.1 93 | # Set this to true if you want to have the same temporal aggregated feat 94 | # dim as from the original backbone (backbone_dim). This will add a linear 95 | # layer to get that. It's useful when training future predictive models, 96 | # with future feat avg as the target. 97 | same_temp_agg_dim: false 98 | # Set this to true to use the class mappings to get the other predictions 99 | # eg, verb/noun from action, instead of adding additional linear layers 100 | # Only applicable when predicting multiple output classes 101 | use_cls_mappings: false 102 | # Apply the classifier on the past predictions too 103 | classifier_on_past: false 104 | 105 | opt: 106 | # Not using an overall LR anymore, since everything is now defined per 107 | # module. 108 | # Use a list format to specify per-layer LRs and WD. The first element is 109 | # module_name ("__all__" => all params), LR and WD. 110 | # Note that if there is any overlap between parameters, those params 111 | # will get updated that many number of times as they appear in the list. 112 | # It WILL NOT take the last options as highest precedence. (TODO future) 113 | # The first term can also be a list, to give it a bunch of modules to set 114 | # the same LR and WD for. 115 | lr_wd: [[__all__, 0.1, 0.0001]] 116 | # Set this to true to also scale the LR by the batch size (normally it will 117 | # be scaled by the #replicas, so the LR is specified per given batch size). 118 | # This allows to further specify a LR per batch element (useful when doing 119 | # sweeps over batch size). 120 | scale_lr_by_bs: false 121 | # Set this to true to only train the last classifier layer. 122 | # Also, will set all BN layers to not compute mean/var at runtime. 123 | classifier_only: false 124 | bias_bn_wd_scale: 1.0 # Scale the WD for bias and BN layers by this amount 125 | grad_clip: 126 | max_norm: null # By default, no clipping 127 | norm_type: 2 128 | warmup: 129 | _target_: common.scheduler.Warmup 130 | init_lr_ratio: 0.0 # Warmup from this ratio of the orig LRs 131 | num_epochs: 0 # Warmup for this many epochs (will take out of total epochs) 132 | 133 | moco: 134 | _target_: moco.moco.builder.MoCo 135 | dim: 128 136 | K: 65536 137 | m: 0.999 138 | T: 0.2 # From moco-v2 139 | mlp: true # From moco-v2 140 | 141 | defaults: 142 | - train_eval_op: basic 143 | - train_eval_op/cls_loss_acc_fn: basic 144 | - train_eval_op/reg_criterion: mse 145 | - opt/optimizer: sgd 146 | - model/backbone: r2plus1d_34 147 | - model/temporal_aggregator: mean 148 | - model/future_predictor: identity 149 | - model/temporal_aggregator_after_future_pred: identity 150 | - model/classifier: linear 151 | - opt/scheduler: warmup_multi_step 152 | # Any keys with dataset_train prefix, like dataset_train2, etc, will all 153 | # be used for training by concatentating all those datasets. So you can 154 | # use multiple datasets in training by adding 155 | # +dataseset_train2=hmdb51/train to the command line config. 156 | # Note that this only works with standard datasets, ConcatDataset can't 157 | # handle overly customized datasets as we use in EpicKitchens 158 | - dataset@dataset_train: epic_kitchens100/anticipation_train 159 | # Any keys with the dataset_eval prefix, will all be evaluated on separately. 
160 | # The postfix will be used to identify which dataset the results are on. 161 | # So, you can use > 1 evaluation datasets that way, by adding it in the 162 | # command line config, like +dataset_eval2=hmdb51/val 163 | - dataset@dataset_eval: epic_kitchens100/anticipation_val 164 | - data@data_train: default 165 | - data@data_eval: default 166 | # Load any common dataset files, that will be used to create other dataset 167 | # elements. 168 | - dataset/epic_kitchens/common 169 | - dataset/epic_kitchens100/common 170 | - dataset/dundee50salads/common 171 | - dataset/dundee50salads/annot_reader_fn: orig 172 | - dataset/egtea/common 173 | # Overrides 174 | - override hydra/launcher: submitit_slurm 175 | - override hydra/job_logging: colorlog 176 | - override hydra/hydra_logging: colorlog 177 | 178 | hydra: 179 | job: 180 | name: "AVT" 181 | launcher: 182 | # All params in https://github.com/facebookresearch/hydra/blob/master/plugins/hydra_submitit_launcher/hydra_plugins/hydra_submitit_launcher/config.py 183 | timeout_min: 2880 184 | cpus_per_task: 10 185 | gpus_per_node: 8 186 | tasks_per_node: ${hydra.launcher.gpus_per_node} 187 | # This is the memory requested per node. So all GPUs on a given 188 | # node will share this memory 189 | mem_gb: 450 190 | nodes: 2 191 | # Use these parameters through + options in hydra 192 | # partition: learnfair 193 | # max_num_timeout: 3 194 | # constraint: ${hydra.launcher.gpu_type} # Any, or could say [volta|pascal] 195 | # comment: "" 196 | run: 197 | dir: ./outputs/ # Specified in the launch script 198 | sweep: 199 | dir: ${hydra.run.dir} 200 | # Output sub directory for sweep runs. 201 | subdir: ${hydra.job.num} # ${hydra.job.override_dirname} 202 | -------------------------------------------------------------------------------- /models/temporal_aggregation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | """ 4 | Implementation of the temporal aggregation algorithms. 5 | Input: (B, C, T) 6 | Output: (B, C) 7 | """ 8 | import math 9 | import torch 10 | import torch.nn as nn 11 | import logging 12 | import warnings 13 | 14 | try: 15 | from external.rulstm.RULSTM.models import RULSTM 16 | except ImportError: 17 | RULSTM = object 18 | logging.warning('No RULSTM found.') 19 | 20 | 21 | class Identity(nn.Identity): 22 | def __init__(self, in_features): 23 | super().__init__() 24 | self.in_features = in_features 25 | 26 | def forward(self, *args, **kwargs): 27 | return super().forward(*args, **kwargs), {} 28 | 29 | @property 30 | def output_dim(self): 31 | return self.in_features 32 | 33 | 34 | class Mean(nn.Module): 35 | def __init__(self, in_features): 36 | super().__init__() 37 | self.in_features = in_features 38 | 39 | def forward(self, feats): 40 | """ 41 | feats: B, T, C dimensional input 42 | """ 43 | return torch.mean(feats, dim=1), {} 44 | 45 | @property 46 | def output_dim(self): 47 | return self.in_features 48 | 49 | 50 | class PositionalEncoding(nn.Module): 51 | """For now, just using simple pos encoding from language. 
52 | https://pytorch.org/tutorials/beginner/transformer_tutorial.html 53 | """ 54 | def __init__(self, d_model, dropout=0.1, max_len=5000): 55 | super(PositionalEncoding, self).__init__() 56 | self.dropout = nn.Dropout(p=dropout) 57 | 58 | pe = torch.zeros(max_len, d_model) 59 | position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) 60 | div_term = torch.exp( 61 | torch.arange(0, d_model, 2).float() * 62 | (-math.log(10000.0) / d_model)) 63 | pe[:, 0::2] = torch.sin(position * div_term) 64 | pe[:, 1::2] = torch.cos(position * div_term) 65 | pe = pe.unsqueeze(0).transpose(0, 1) 66 | self.register_buffer('pe', pe) 67 | 68 | def forward(self, x): 69 | x = x + self.pe[:x.size(0), :] 70 | return self.dropout(x) 71 | 72 | 73 | class Transformer(nn.Module): 74 | """ Using a transformer encoder and simple decoder. """ 75 | def __init__(self, 76 | in_features, 77 | inter_rep=512, 78 | nheads=8, 79 | nlayers=6, 80 | agg_style='mean', 81 | cloze_loss_ratio=0.0, 82 | cloze_loss_wt=0.0): 83 | super().__init__() 84 | self.in_features = in_features 85 | self.inter_rep = inter_rep 86 | self.downproject = nn.Linear(in_features, inter_rep) 87 | layer = nn.TransformerEncoderLayer(d_model=inter_rep, nhead=nheads) 88 | # Don't think I'll ever consider longer than 1000 features? 89 | self.pos_encoder = PositionalEncoding(inter_rep, max_len=1000) 90 | self.transformer_encoder = nn.TransformerEncoder( 91 | layer, num_layers=nlayers, norm=nn.LayerNorm(inter_rep)) 92 | self.agg_style = agg_style 93 | self.cloze_loss_ratio = cloze_loss_ratio 94 | self.cloze_loss_wt = cloze_loss_wt 95 | self.cloze_loss_fn = nn.MSELoss(reduction='none') 96 | # The embedding for the [MASK] token 97 | if self.cloze_loss_ratio > 0: 98 | self.extra_embeddings = nn.Embedding(1, in_features) 99 | 100 | def forward(self, feats): 101 | """ 102 | Args: 103 | feats (B, T, C) 104 | Returns: 105 | aggregated features (B, C') 106 | """ 107 | # Convert to the format used by transformer: T, B, C 108 | feats = feats.transpose(0, 1) 109 | kwargs = {} 110 | if self.training and self.cloze_loss_ratio > 0: 111 | # Mask out certain positions, so when doing attention these 112 | # positions will be ignored 113 | key_padding_mask = torch.rand((feats.size(0), feats.size(1)), 114 | device=feats.device) 115 | # Get close_ratio amount as True, so those will be ignored 116 | key_padding_mask = key_padding_mask <= self.cloze_loss_ratio 117 | # Set the features to MASK embedding, for the ones that are masked 118 | key_padding_mask_rep = key_padding_mask.unsqueeze(-1).expand( 119 | -1, -1, feats.size(2)) 120 | # Set the masked elements to 0, and add the MASK embedding 121 | replaced_feats = ( 122 | feats * (~key_padding_mask_rep) + 123 | key_padding_mask_rep * self.extra_embeddings( 124 | torch.tensor([0], dtype=torch.long, 125 | device=feats.device)).unsqueeze(0)) 126 | feats = replaced_feats 127 | # Transpose since the function takes in B, T 128 | kwargs['src_key_padding_mask'] = key_padding_mask.t() 129 | feats = self.pos_encoder(self.downproject(feats)) 130 | feats_encoded = self.transformer_encoder(feats, **kwargs) 131 | aux_losses = {} 132 | if self.training and self.cloze_loss_ratio > 0: 133 | dist = self.cloze_loss_fn(feats_encoded, feats) 134 | dist_masked_elts = self.cloze_loss_wt * torch.mean( 135 | torch.mean(dist, dim=-1) * key_padding_mask) 136 | aux_losses['tx_mlm'] = dist_masked_elts 137 | if self.agg_style == 'mean': 138 | res = torch.mean(feats_encoded, dim=[0]) 139 | elif self.agg_style == 'last': 140 | res = feats_encoded[-1] 141 | 
else: 142 | raise NotImplementedError(f'Unknown agg style {self.agg_style}') 143 | return res, aux_losses 144 | 145 | @property 146 | def output_dim(self): 147 | return self.inter_rep 148 | 149 | 150 | class RULSTMAggregation(RULSTM): 151 | def __init__(self, 152 | in_features: int, 153 | intermediate_featdim: int = 1024, 154 | dropout: float = 0.8, 155 | num_pad_feats: int = 0): 156 | """ 157 | Args: 158 | num_pad_feats (int): Pad the features with zero feats for this 159 | many times on the time axis. This is because the unrolling 160 | LSTM unrolls forward as many times as input, and since original 161 | models were trained for 14 steps unrolling (upto 0.25s 162 | before the action), and I usually test for 11 steps (1s before 163 | action), need to pad 3 times to get the same output when 164 | testing pre-trained models. 165 | """ 166 | super().__init__(1, in_features, intermediate_featdim, dropout) 167 | # Remove the classifier, since the outside code will deal with that 168 | self.classifier = nn.Sequential() 169 | self.output_dim = intermediate_featdim 170 | self.num_pad_feats = num_pad_feats 171 | # Ignore warnings because it UserWarning: RNN module weights are not 172 | # part of single contiguous chunk of memory. This means they need to be 173 | # compacted at every call, possibly greatly increasing memory usage. 174 | # To compact weights again call flatten_parameters(). 175 | # Not sure how to fix this, adding the flatten didn't really fix 176 | # Happens only with DataParallel, not DDP 177 | # Using https://github.com/pytorch/pytorch/issues/24155#issuecomment-604474511 178 | # Just ignoring the warning 179 | warnings.filterwarnings('ignore') 180 | 181 | def forward(self, feats): 182 | """ 183 | Args: 184 | feats (B, T, C) 185 | Returns: 186 | aggregated (B, C) 187 | """ 188 | if self.num_pad_feats > 0: 189 | empty_feats = torch.zeros( 190 | (feats.size(0), self.num_pad_feats, feats.size(-1)), 191 | dtype=feats.dtype, 192 | device=feats.device) 193 | feats = torch.cat([feats, empty_feats], dim=1) 194 | res = super().forward(feats) 195 | # Return output corresponding to the last input frame. Note that in 196 | # original RULSTM they do -4 since they predict 3 steps further into 197 | # the anticipation time, whereas I stop when the anticipation time 198 | # starts here. 199 | # Subtract num_pad_feat as that would mean it predicted further into 200 | # the future 201 | return res[:, -1 - self.num_pad_feats, :], {} 202 | -------------------------------------------------------------------------------- /models/video_classification.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | """ 3 | Model architectures. 
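Includes R(2+1)D and channel-separated (CSN) video backbones, plus frame-level
wrappers (BNInception, TIMM ViT models) that apply a 2D model to every frame.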
4 | """ 5 | import torch.nn as nn 6 | 7 | from torchvision.models.video.resnet import ( 8 | BasicBlock, 9 | Bottleneck, 10 | R2Plus1dStem, 11 | _video_resnet, 12 | ) 13 | from pretrainedmodels import bninception 14 | import timm 15 | 16 | __all__ = [ 17 | 'r2plus1d_34', 18 | 'r2plus1d_152', 19 | 'ir_csn_152', 20 | 'ip_csn_152', 21 | 'ip_csn_50', 22 | 'BNInceptionVideo', 23 | ] 24 | 25 | 26 | class BasicStem_Pool(nn.Sequential): 27 | def __init__(self): 28 | super(BasicStem_Pool, self).__init__( 29 | nn.Conv3d( 30 | 3, 31 | 64, 32 | kernel_size=(3, 7, 7), 33 | stride=(1, 2, 2), 34 | padding=(1, 3, 3), 35 | bias=False, 36 | ), 37 | nn.BatchNorm3d(64), 38 | nn.ReLU(inplace=True), 39 | nn.MaxPool3d(kernel_size=(1, 3, 3), 40 | stride=(1, 2, 2), 41 | padding=(0, 1, 1)), 42 | ) 43 | 44 | 45 | class Conv3DDepthwise(nn.Conv3d): 46 | def __init__(self, 47 | in_planes, 48 | out_planes, 49 | midplanes=None, 50 | stride=1, 51 | padding=1): 52 | 53 | assert in_planes == out_planes 54 | super(Conv3DDepthwise, self).__init__( 55 | in_channels=in_planes, 56 | out_channels=out_planes, 57 | kernel_size=(3, 3, 3), 58 | stride=stride, 59 | padding=padding, 60 | groups=in_planes, 61 | bias=False, 62 | ) 63 | 64 | @staticmethod 65 | def get_downsample_stride(stride): 66 | return (stride, stride, stride) 67 | 68 | 69 | class IPConv3DDepthwise(nn.Sequential): 70 | def __init__(self, in_planes, out_planes, midplanes, stride=1, padding=1): 71 | 72 | assert in_planes == out_planes 73 | super(IPConv3DDepthwise, self).__init__( 74 | nn.Conv3d(in_planes, out_planes, kernel_size=1, bias=False), 75 | nn.BatchNorm3d(out_planes), 76 | # nn.ReLU(inplace=True), 77 | Conv3DDepthwise(out_planes, out_planes, None, stride), 78 | ) 79 | 80 | @staticmethod 81 | def get_downsample_stride(stride): 82 | return (stride, stride, stride) 83 | 84 | 85 | class Conv2Plus1D(nn.Sequential): 86 | def __init__(self, in_planes, out_planes, midplanes, stride=1, padding=1): 87 | 88 | midplanes = (in_planes * out_planes * 3 * 3 * 89 | 3) // (in_planes * 3 * 3 + 3 * out_planes) 90 | super(Conv2Plus1D, self).__init__( 91 | nn.Conv3d( 92 | in_planes, 93 | midplanes, 94 | kernel_size=(1, 3, 3), 95 | stride=(1, stride, stride), 96 | padding=(0, padding, padding), 97 | bias=False, 98 | ), 99 | nn.BatchNorm3d(midplanes), 100 | nn.ReLU(inplace=True), 101 | nn.Conv3d( 102 | midplanes, 103 | out_planes, 104 | kernel_size=(3, 1, 1), 105 | stride=(stride, 1, 1), 106 | padding=(padding, 0, 0), 107 | bias=False, 108 | ), 109 | ) 110 | 111 | @staticmethod 112 | def get_downsample_stride(stride): 113 | return (stride, stride, stride) 114 | 115 | 116 | def _set_bn_params(model, bn_eps=1e-3, bn_mom=0.1): 117 | """ 118 | Set the BN parameters to the defaults: Du's models were trained 119 | with 1e-3 and 0.9 for eps and momentum resp. 
120 | Ref: https://github.com/facebookresearch/VMZ/blob/f4089e2164f67a98bc5bed4f97dc722bdbcd268e/lib/models/r3d_model.py#L208 121 | """ 122 | for module in model.modules(): 123 | if isinstance(module, nn.BatchNorm3d): 124 | module.eps = bn_eps 125 | module.momentum = bn_mom 126 | 127 | 128 | def r2plus1d_34(pretrained=False, 129 | progress=False, 130 | bn_eps=1e-3, 131 | bn_mom=0.1, 132 | **kwargs): 133 | model = _video_resnet("r2plus1d_34", 134 | False, 135 | False, 136 | block=BasicBlock, 137 | conv_makers=[Conv2Plus1D] * 4, 138 | layers=[3, 4, 6, 3], 139 | stem=R2Plus1dStem, 140 | **kwargs) 141 | _set_bn_params(model, bn_eps, bn_mom) 142 | return model 143 | 144 | 145 | def r2plus1d_152(pretrained=False, 146 | progress=False, 147 | bn_eps=1e-3, 148 | bn_mom=0.1, 149 | **kwargs): 150 | model = _video_resnet("r2plus1d_152", 151 | False, 152 | False, 153 | block=Bottleneck, 154 | conv_makers=[Conv2Plus1D] * 4, 155 | layers=[3, 8, 36, 3], 156 | stem=R2Plus1dStem, 157 | **kwargs) 158 | _set_bn_params(model, bn_eps, bn_mom) 159 | return model 160 | 161 | 162 | def ir_csn_152(pretrained=False, 163 | progress=False, 164 | bn_eps=1e-3, 165 | bn_mom=0.1, 166 | **kwargs): 167 | model = _video_resnet("ir_csn_152", 168 | False, 169 | False, 170 | block=Bottleneck, 171 | conv_makers=[Conv3DDepthwise] * 4, 172 | layers=[3, 8, 36, 3], 173 | stem=BasicStem_Pool, 174 | **kwargs) 175 | _set_bn_params(model, bn_eps, bn_mom) 176 | return model 177 | 178 | 179 | def ip_csn_152(pretrained=False, 180 | progress=False, 181 | bn_eps=1e-3, 182 | bn_mom=0.1, 183 | **kwargs): 184 | model = _video_resnet("ip_csn_152", 185 | False, 186 | False, 187 | block=Bottleneck, 188 | conv_makers=[IPConv3DDepthwise] * 4, 189 | layers=[3, 8, 36, 3], 190 | stem=BasicStem_Pool, 191 | **kwargs) 192 | _set_bn_params(model, bn_eps, bn_mom) 193 | return model 194 | 195 | 196 | def ip_csn_50(pretrained=False, 197 | progress=False, 198 | bn_eps=0.3, 199 | bn_mom=0.1, 200 | **kwargs): 201 | model = _video_resnet("ip_csn_50", 202 | False, 203 | False, 204 | block=Bottleneck, 205 | conv_makers=[IPConv3DDepthwise] * 4, 206 | layers=[3, 8, 6, 3], 207 | stem=BasicStem_Pool, 208 | **kwargs) 209 | _set_bn_params(model, bn_eps, bn_mom) 210 | return model 211 | 212 | 213 | def process_each_frame(model, video, *args, **kwargs): 214 | """ 215 | Pass in each frame separately 216 | Args: 217 | video (B, C, T, H, W) 218 | Returns: 219 | feats: (B, C', T, 1, 1) 220 | """ 221 | batch_size = video.size(0) 222 | time_dim = video.size(2) 223 | video_flat = video.transpose(1, 2).flatten(0, 1) 224 | feats_flat = model(video_flat, *args, **kwargs) 225 | return feats_flat.view((batch_size, time_dim) + 226 | feats_flat.shape[1:]).transpose( 227 | 1, 2).unsqueeze(-1).unsqueeze(-1) 228 | 229 | 230 | class FrameLevelModel(nn.Module): 231 | """Runs a frame level model on all the frames.""" 232 | def __init__(self, num_classes: int, model: nn.Module = None): 233 | del num_classes 234 | super().__init__() 235 | self.model = model 236 | 237 | def forward(self, video, *args, **kwargs): 238 | return process_each_frame(self.model, video, *args, **kwargs) 239 | 240 | 241 | class BNInceptionVideo(FrameLevelModel): 242 | def __init__(self, *args, **kwargs): 243 | super().__init__(*args, **kwargs) 244 | self.model = bninception(*args, **kwargs) 245 | self.model.last_linear = nn.Identity() 246 | self.model.global_pool = nn.AdaptiveAvgPool2d(1) 247 | 248 | 249 | class TIMMModel(FrameLevelModel): 250 | def __init__(self, 251 | num_classes, 252 | 
model_type='vit_base_patch16_224', 253 | drop_cls=True): 254 | super().__init__(num_classes) 255 | model = timm.create_model(model_type, 256 | num_classes=0 if drop_cls else num_classes) 257 | self.model = model 258 | -------------------------------------------------------------------------------- /common/log.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | from collections import defaultdict, deque 4 | import datetime 5 | import time 6 | import logging 7 | 8 | import torch 9 | import torch.distributed as dist 10 | 11 | from common.utils import is_dist_avail_and_initialized, is_main_process 12 | __all__ = [ 13 | 'SmoothedValue', 'MetricLogger', 'get_default_loggers', 14 | 'get_default_loggers' 15 | ] 16 | EPS = 0.000001 17 | 18 | 19 | class SmoothedValue(object): 20 | """Track a series of values and provide access to smoothed values over a 21 | window or the global series average. 22 | """ 23 | def __init__(self, window_size=20, fmt=None): 24 | if fmt is None: 25 | fmt = "{median:.4f} ({global_avg:.4f})" 26 | self.deque = deque(maxlen=window_size) 27 | self.total = 0.0 28 | self.count = 0 29 | self.fmt = fmt 30 | self.ws = window_size 31 | 32 | def reset(self): 33 | self.__init__(window_size=self.ws, fmt=self.fmt) 34 | 35 | def update(self, value, n=1): 36 | self.deque.append(value) 37 | self.count += n 38 | self.total += value * n 39 | 40 | def synchronize_between_processes(self): 41 | """ 42 | Warning: does not synchronize the deque! 43 | """ 44 | if not is_dist_avail_and_initialized(): 45 | return 46 | t = torch.tensor([self.count, self.total], 47 | dtype=torch.float64, 48 | device='cuda') 49 | dist.barrier() 50 | dist.all_reduce(t) 51 | t = t.tolist() 52 | self.count = int(t[0]) 53 | self.total = t[1] 54 | 55 | @property 56 | def median(self): 57 | d = torch.tensor(list(self.deque)) 58 | return d.median().item() 59 | 60 | @property 61 | def avg(self): 62 | d = torch.tensor(list(self.deque), dtype=torch.float32) 63 | return d.mean().item() 64 | 65 | @property 66 | def global_avg(self): 67 | return self.total / (self.count + EPS) 68 | 69 | @property 70 | def max(self): 71 | return max(self.deque) 72 | 73 | @property 74 | def value(self): 75 | return self.deque[-1] 76 | 77 | def __str__(self): 78 | return self.fmt.format(median=self.median, 79 | avg=self.avg, 80 | global_avg=self.global_avg, 81 | max=self.max, 82 | value=self.value) 83 | 84 | 85 | class MetricLogger(object): 86 | def __init__(self, 87 | delimiter="\t", 88 | writer=None, 89 | stat_set="train", 90 | epoch=0, 91 | logger=None): 92 | self.meters = defaultdict(SmoothedValue) 93 | self.delimiter = delimiter 94 | self.metric_set = stat_set 95 | self.epoch = epoch 96 | self.logger = logger.info if logger is not None else logging.info 97 | 98 | self.writer = writer 99 | self.writer_step = 0 100 | # Adding all logs from this to raw/ header, so I can plot other metrics 101 | # cleanly 102 | self.tbd_header = 'metric_logger/' 103 | 104 | self.meters["iter_time"] = SmoothedValue(fmt='{avg:.4f}') 105 | self.meters["data_time"] = SmoothedValue(fmt='{avg:.4f}') 106 | 107 | def update(self, **kwargs): 108 | for k, v in kwargs.items(): 109 | if isinstance(v, torch.Tensor): 110 | v = v.item() 111 | assert isinstance(v, (float, int)) 112 | self.meters[k].update(v) 113 | 114 | def __getattr__(self, attr): 115 | if attr in self.meters: 116 | return self.meters[attr] 117 | if attr in self.__dict__: 118 | return self.__dict__[attr] 119 | raise 
AttributeError("'{}' object has no attribute '{}'".format( 120 | type(self).__name__, attr)) 121 | 122 | def __str__(self): 123 | loss_str = [] 124 | for name, meter in self.meters.items(): 125 | loss_str.append("{}: {}".format(name, str(meter))) 126 | return self.delimiter.join(loss_str) 127 | 128 | def synchronize_between_processes(self): 129 | for meter in self.meters.values(): 130 | meter.synchronize_between_processes() 131 | 132 | def add_meter(self, name, meter): 133 | self.meters[name] = meter 134 | 135 | def reset_meters(self): 136 | self.logger("Logging: resseting all meters") 137 | for name, meter in self.meters.items(): 138 | meter.reset() 139 | self.logger( 140 | "Logging: resseting all meters done, updating epoch to %d".format( 141 | self.epoch + 1)) 142 | self.epoch += 1 143 | 144 | def log_every(self, iterable, print_freq, header=None): 145 | i = 0 146 | if not header: 147 | header = '' 148 | start_time = time.time() 149 | end = time.time() 150 | 151 | space_fmt = ':' + str(len(str(len(iterable)))) + 'd' 152 | if torch.cuda.is_available(): 153 | log_msg = self.delimiter.join([ 154 | header, '[{0' + space_fmt + '}/{1}]', 'eta: {eta}', '{meters}', 155 | 'time: {time}', 'data: {data}', 'max mem: {memory:.0f}' 156 | ]) 157 | else: 158 | log_msg = self.delimiter.join([ 159 | header, '[{0' + space_fmt + '}/{1}]', 'eta: {eta}', '{meters}', 160 | 'time: {time}', 'data: {data}' 161 | ]) 162 | MB = 1024.0 * 1024.0 163 | for obj in iterable: 164 | self.meters["data_time"].update(time.time() - end) 165 | yield obj 166 | self.meters["iter_time"].update(time.time() - end) 167 | if i % print_freq == 0: 168 | self._write_meters() 169 | eta_seconds = self.meters["iter_time"].global_avg * ( 170 | len(iterable) - i) 171 | eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) 172 | if torch.cuda.is_available(): 173 | self.logger( 174 | log_msg.format( 175 | i, 176 | len(iterable), 177 | eta=eta_string, 178 | meters=str(self), 179 | time=str(self.meters["iter_time"]), 180 | data=str(self.meters["data_time"]), 181 | memory=torch.cuda.max_memory_allocated() / MB)) 182 | else: 183 | self.logger( 184 | log_msg.format(i, 185 | len(iterable), 186 | eta=eta_string, 187 | meters=str(self), 188 | time=str(self.meters["iter_time"]), 189 | data=str(self.meters["data_time"]))) 190 | i += 1 191 | end = time.time() 192 | total_time = time.time() - start_time 193 | total_time_str = str(datetime.timedelta(seconds=int(total_time))) 194 | self.logger('{} Total time: {}'.format(header, total_time_str)) 195 | self._write_epoch(total_time_str) 196 | 197 | def _write_meters(self): 198 | if self.writer is not None: 199 | for name, meter in self.meters.items(): 200 | self.writer.add_scalar( 201 | f"{self.tbd_header}iter/{self.metric_set}_{name}", 202 | meter.avg, self.writer_step) 203 | 204 | self.writer_step += 1 205 | 206 | def _write_epoch(self, total_time_string): 207 | if self.writer is not None: 208 | for name, meter in self.meters.items(): 209 | self.writer.add_scalar( 210 | f"{self.tbd_header}epoch/{self.metric_set}_{name}", 211 | meter.avg, self.epoch) 212 | 213 | self.writer.add_text( 214 | f"{self.tbd_header}epoch/{self.metric_set}_totaltime", 215 | total_time_string, self.epoch) 216 | 217 | 218 | def setup_tbx(save_dir, SummaryWriter): 219 | if not is_main_process(): 220 | return None 221 | 222 | writer = SummaryWriter(save_dir) 223 | return writer 224 | 225 | 226 | def get_default_loggers(writer, epoch, logger): 227 | stat_loggers = dict() 228 | stat_loggers["train"] = MetricLogger(delimiter=" ", 
229 | writer=writer, 230 | stat_set="train", 231 | epoch=epoch, 232 | logger=logger) 233 | stat_loggers["train"].add_meter( 234 | 'lr', SmoothedValue(window_size=1, fmt='{value}')) 235 | stat_loggers["train"].add_meter( 236 | 'clips/s', SmoothedValue(window_size=10, fmt='{value:.3f}')) 237 | 238 | stat_loggers["val"] = MetricLogger(delimiter=" ", 239 | writer=writer, 240 | stat_set="val", 241 | epoch=epoch, 242 | logger=logger) 243 | 244 | return stat_loggers 245 | -------------------------------------------------------------------------------- /env.yaml: -------------------------------------------------------------------------------- 1 | name: avt 2 | channels: 3 | - huggingface 4 | - iopath 5 | - pytorch 6 | - conda-forge 7 | - defaults 8 | dependencies: 9 | - _libgcc_mutex=0.1=main 10 | - attrs=19.3.0=py_0 11 | - av=7.0.1=py37h82f89c2_2 12 | - backcall=0.1.0=py_0 13 | - blas=1.0=mkl 14 | - bleach=3.1.5=pyh9f0ad1d_0 15 | - blessed=1.17.8=py37hc8dfbb8_0 16 | - brotlipy=0.7.0=py37h8f50634_1000 17 | - bzip2=1.0.8=h516909a_2 18 | - ca-certificates=2021.4.13=h06a4308_1 19 | - cairo=1.16.0=hcf35c78_1003 20 | - certifi=2020.12.5=py37h06a4308_0 21 | - cffi=1.14.0=py37h2e261b9_0 22 | - chardet=3.0.4=py37hc8dfbb8_1006 23 | - cmake=3.3.1=0 24 | - cryptography=2.9.2=py37hb09aad4_0 25 | - cudatoolkit=11.0.221=h6bb024c_0 26 | - dataclasses=0.7=py37_0 27 | - dbus=1.13.6=he372182_0 28 | - decorator=4.4.2=py_0 29 | - defusedxml=0.6.0=py_0 30 | - entrypoints=0.3=py37hc8dfbb8_1001 31 | - expat=2.2.9=he1b5a44_2 32 | - faiss-cpu=1.6.3=py37h6bb024c_0 33 | - ffmpeg=4.2=h167e202_0 34 | - fftw=3.3.8=nompi_h7f3a6c3_1110 35 | - filelock=3.0.12=pyhd3eb1b0_1 36 | - font-ttf-dejavu-sans-mono=2.37=hab24e00_0 37 | - font-ttf-inconsolata=2.001=hab24e00_0 38 | - font-ttf-source-code-pro=2.030=hab24e00_0 39 | - font-ttf-ubuntu=0.83=hab24e00_0 40 | - fontconfig=2.13.1=h86ecdb6_1001 41 | - fonts-conda-forge=1=0 42 | - freetype=2.9.1=h8a8886c_1 43 | - fribidi=1.0.9=h516909a_0 44 | - future=0.18.2=py37_1 45 | - gdk-pixbuf=2.38.2=h3f25603_3 46 | - gettext=0.19.8.1=hc5be6a0_1002 47 | - ghostscript=9.22=hf484d3e_1001 48 | - giflib=5.2.1=h516909a_2 49 | - glib=2.64.3=h6f030ca_0 50 | - gmp=6.2.0=he1b5a44_2 51 | - gnutls=3.6.5=hd3a4fd2_1002 52 | - gobject-introspection=1.64.1=py37h619baee_1 53 | - graphite2=1.3.13=he1b5a44_1001 54 | - graphviz=2.42.3=h0511662_0 55 | - gst-plugins-base=1.14.5=h0935bb2_2 56 | - gstreamer=1.14.5=h36ae1b5_2 57 | - harfbuzz=2.4.0=h9f30f68_3 58 | - hdf5=1.10.6=nompi_h3c11f04_100 59 | - icu=64.2=he1b5a44_1 60 | - idna=2.9=py_1 61 | - imageio=2.8.0=py_0 62 | - imageio-ffmpeg=0.4.2=py_0 63 | - imagemagick=7.0.10_6=pl526ha9fe49d_0 64 | - importlib-metadata=1.6.1=py37hc8dfbb8_0 65 | - importlib_metadata=1.6.1=0 66 | - inquirer=2.7.0=py_0 67 | - intel-openmp=2020.0=166 68 | - iopath=0.1.8=py37 69 | - ipykernel=5.3.0=py37h43977f1_0 70 | - ipython=7.15.0=py37hc8dfbb8_0 71 | - ipython_genutils=0.2.0=py_1 72 | - ipywidgets=7.5.1=py_0 73 | - isort=4.3.21=py37_0 74 | - jasper=1.900.1=h07fcdf6_1006 75 | - jbig=2.1=h516909a_2002 76 | - jedi=0.17.0=py37hc8dfbb8_0 77 | - jinja2=2.11.2=pyh9f0ad1d_0 78 | - jpeg=9c=h14c3975_1001 79 | - jsonschema=3.2.0=py37hc8dfbb8_1 80 | - jupyter_client=6.1.3=py_0 81 | - jupyter_core=4.6.3=py37hc8dfbb8_1 82 | - lame=3.100=h14c3975_1001 83 | - lazy-object-proxy=1.4.3=py37h27cfd23_2 84 | - ld_impl_linux-64=2.33.1=h53a641e_7 85 | - libblas=3.8.0=15_mkl 86 | - libcblas=3.8.0=15_mkl 87 | - libclang=9.0.1=default_hde54327_0 88 | - libcroco=0.6.13=h8d621e5_1 89 | - 
libedit=3.1.20181209=hc058e9b_0 90 | - libffi=3.2.1=hd88cf55_4 91 | - libgcc-ng=9.1.0=hdf63c60_0 92 | - libgfortran-ng=7.3.0=hdf63c60_0 93 | - libiconv=1.15=h516909a_1006 94 | - liblapack=3.8.0=15_mkl 95 | - liblapacke=3.8.0=15_mkl 96 | - libllvm9=9.0.1=he513fc3_1 97 | - libopencv=4.2.0=py37_6 98 | - libpng=1.6.37=hbc83047_0 99 | - libprotobuf=3.13.0.1=hd408876_0 100 | - librsvg=2.49.2=h33a7fed_0 101 | - libsodium=1.0.17=h516909a_0 102 | - libstdcxx-ng=9.1.0=hdf63c60_0 103 | - libtiff=4.1.0=h2733197_0 104 | - libtool=2.4.6=h14c3975_1002 105 | - libuuid=2.32.1=h14c3975_1000 106 | - libuv=1.40.0=h7b6447c_0 107 | - libwebp=1.0.2=h56121f0_5 108 | - libxcb=1.13=h14c3975_1002 109 | - libxkbcommon=0.10.0=he1b5a44_0 110 | - libxml2=2.9.10=hee79883_0 111 | - lmdb=0.9.24=h516909a_0 112 | - markupsafe=1.1.1=py37h8f50634_1 113 | - mistune=0.8.4=py37h8f50634_1001 114 | - mkl=2020.2=256 115 | - mkl-include=2020.2=256 116 | - mkl-service=2.3.0=py37he904b0f_0 117 | - mkl_fft=1.0.15=py37ha843d7b_0 118 | - mkl_random=1.1.0=py37hd6b4f25_0 119 | - nbconvert=5.6.1=py37hc8dfbb8_1 120 | - nbformat=5.0.6=py_0 121 | - ncurses=6.2=he6710b0_0 122 | - nettle=3.4.1=h1bed415_1002 123 | - ninja=1.10.2=py37hff7bd54_0 124 | - nltk=3.4.4=py_0 125 | - notebook=6.0.3=py37hc8dfbb8_0 126 | - nspr=4.25=he1b5a44_0 127 | - nss=3.47=he751ad9_0 128 | - numpy=1.18.1=py37h4f9e942_0 129 | - numpy-base=1.18.1=py37hde5b4d6_1 130 | - olefile=0.46=py37_0 131 | - opencv=4.2.0=py37_6 132 | - openh264=1.8.0=hdbcaa40_1000 133 | - openjpeg=2.3.1=h981e76c_3 134 | - openssl=1.1.1k=h27cfd23_0 135 | - packaging=20.4=pyh9f0ad1d_0 136 | - pandas=1.0.3=py37h0da4684_1 137 | - pandoc=2.9.2.1=0 138 | - pandocfilters=1.4.2=py_1 139 | - pango=1.42.4=h7062337_4 140 | - parso=0.7.0=pyh9f0ad1d_0 141 | - pcre=8.44=he1b5a44_0 142 | - perl=5.26.2=h516909a_1006 143 | - pexpect=4.8.0=py37hc8dfbb8_1 144 | - pickleshare=0.7.5=py37hc8dfbb8_1001 145 | - pip=20.0.2=py37_1 146 | - pixman=0.38.0=h516909a_1003 147 | - pkg-config=0.29.2=h516909a_1006 148 | - proglog=0.1.9=py_0 149 | - prometheus_client=0.8.0=pyh9f0ad1d_0 150 | - prompt-toolkit=3.0.5=py_0 151 | - pthread-stubs=0.4=h14c3975_1001 152 | - ptyprocess=0.6.0=py_1001 153 | - py-opencv=4.2.0=py37h43977f1_6 154 | - pycparser=2.20=py_0 155 | - pygments=2.6.1=py_0 156 | - pyopenssl=19.1.0=py_1 157 | - pyparsing=2.4.7=pyh9f0ad1d_0 158 | - pyrsistent=0.16.0=py37h8f50634_0 159 | - pysocks=1.7.1=py37hc8dfbb8_1 160 | - python=3.7.7=hcf32534_0_cpython 161 | - python-dateutil=2.8.1=py_0 162 | - python-editor=1.0.4=py_0 163 | - python-lmdb=0.96=py37he1b5a44_0 164 | - python_abi=3.7=1_cp37m 165 | - pytorch=1.7.1=py3.7_cuda11.0.221_cudnn8.0.5_0 166 | - pytz=2020.1=pyh9f0ad1d_0 167 | - pyzmq=19.0.1=py37hac76be4_0 168 | - qt=5.12.5=hd8c4c69_1 169 | - readchar=2.0.0=py_0 170 | - readline=8.0=h7b6447c_0 171 | - requests=2.25.1=pyhd3eb1b0_0 172 | - send2trash=1.5.0=py_0 173 | - setuptools=51.3.3=py37h06a4308_4 174 | - six=1.15.0=py37h06a4308_0 175 | - sqlite=3.31.1=h62c20be_1 176 | - terminado=0.8.3=py37hc8dfbb8_1 177 | - testpath=0.4.4=py_0 178 | - tk=8.6.8=hbc83047_0 179 | - toml=0.10.1=py_0 180 | - tornado=6.0.4=py37h8f50634_1 181 | - tqdm=4.54.1=pyhd8ed1ab_1 182 | - traitlets=4.3.3=py37hc8dfbb8_1 183 | - transformers=4.2.2=pyh7b7c402_0 184 | - typed-ast=1.4.2=py37h27cfd23_1 185 | - typing_extensions=3.7.4.3=py_0 186 | - urllib3=1.25.9=py_0 187 | - wcwidth=0.2.4=pyh9f0ad1d_0 188 | - webencodings=0.5.1=py_1 189 | - wheel=0.34.2=py37_0 190 | - widgetsnbextension=3.5.1=py37_0 191 | - x264=1!152.20180806=h14c3975_0 192 | - 
xorg-kbproto=1.0.7=h14c3975_1002 193 | - xorg-libice=1.0.10=h516909a_0 194 | - xorg-libsm=1.2.3=h84519dc_1000 195 | - xorg-libx11=1.6.9=h516909a_0 196 | - xorg-libxau=1.0.9=h14c3975_0 197 | - xorg-libxdmcp=1.1.3=h516909a_0 198 | - xorg-libxext=1.3.4=h516909a_0 199 | - xorg-libxpm=3.5.13=h516909a_0 200 | - xorg-libxrender=0.9.10=h516909a_1002 201 | - xorg-libxt=1.1.5=h516909a_1003 202 | - xorg-renderproto=0.11.1=h14c3975_1002 203 | - xorg-xextproto=7.3.0=h14c3975_1002 204 | - xorg-xproto=7.0.31=h14c3975_1007 205 | - xz=5.2.5=h7b6447c_0 206 | - yaml=0.1.7=had09818_2 207 | - zeromq=4.3.2=he1b5a44_2 208 | - zipp=3.1.0=py_0 209 | - zlib=1.2.11=h7b6447c_3 210 | - zstd=1.3.7=h0b5b093_0 211 | - pip: 212 | - absl-py==0.9.0 213 | - addict==2.2.1 214 | - antlr4-python3-runtime==4.8 215 | - astor==0.8.1 216 | - astroid==2.4.1 217 | - async-generator==1.10 218 | - azure-core==1.10.0 219 | - azure-identity==1.5.0 220 | - azure-storage-blob==12.7.1 221 | - cachetools==4.1.0 222 | - click==7.1.1 223 | - cloudpickle==1.3.0 224 | - colorlog==4.1.0 225 | - cycler==0.10.0 226 | - cython==0.29.23 227 | - dask==2.15.0 228 | - dask-jobqueue==0.7.1 229 | - distributed==2.15.0 230 | - fairtask==1.1.0 231 | - fairtask-slurm==0.3.0 232 | - flask==1.1.2 233 | - ftfy==5.8 234 | - gast==0.2.2 235 | - google-auth==1.14.1 236 | - google-auth-oauthlib==0.4.1 237 | - google-pasta==0.2.0 238 | - grpcio==1.28.1 239 | - h5py==2.10.0 240 | - heapdict==1.0.1 241 | - hydra-colorlog==1.0.0 242 | - hydra-core==1.1.0.dev4 243 | - hydra-submitit-launcher==1.1.0 244 | - importlib-resources==2.0.1 245 | - isodate==0.6.0 246 | - itsdangerous==1.1.0 247 | - joblib==0.16.0 248 | - keras-applications==1.0.8 249 | - keras-preprocessing==1.1.0 250 | - kiwisolver==1.2.0 251 | - markdown==3.2.1 252 | - matplotlib==3.3.0 253 | - mccabe==0.6.1 254 | - microsoftvision==1.0.5 255 | - mmcv==1.0.4 256 | - moviepy==1.0.3 257 | - msal==1.8.0 258 | - msal-extensions==0.3.0 259 | - msgpack==1.0.0 260 | - msrest==0.6.21 261 | - munch==2.5.0 262 | - oauthlib==3.1.0 263 | - omegaconf==2.1.0.dev22 264 | - opencv-contrib-python==4.3.0.36 265 | - opencv-python==4.3.0.36 266 | - opt-einsum==3.2.1 267 | - parameterized==0.8.1 268 | - pillow==7.0.0 269 | - portalocker==1.7.1 270 | - pretrainedmodels==0.7.4 271 | - protobuf==3.11.3 272 | - psutil==5.7.0 273 | - pyarrow==1.0.1 274 | - pyasn1==0.4.8 275 | - pyasn1-modules==0.2.8 276 | - pycocotools==2.0.2 277 | - pydantic==1.5.1 278 | - pydot==1.4.2 279 | - pyjwt==1.7.1 280 | - pylint==2.5.2 281 | - pytorchvideo==0.1.0 282 | - pyyaml==5.3.1 283 | - regex==2020.7.14 284 | - requests-oauthlib==1.3.0 285 | - rsa==4.0 286 | - sacremoses==0.0.43 287 | - scikit-learn==0.23.2 288 | - scipy==1.5.2 289 | - seaborn==0.11.1 290 | - sentencepiece==0.1.91 291 | - simplejson==3.17.2 292 | - sklearn==0.0 293 | - sortedcontainers==2.1.0 294 | - submitit==1.2.1 295 | - tabulate==0.8.9 296 | - tblib==1.6.0 297 | - tensorboard==2.0.0 298 | - tensorflow==2.1.0 299 | - tensorflow-estimator==2.1.0 300 | - termcolor==1.1.0 301 | - threadpoolctl==2.1.0 302 | - timm==0.4.12 303 | - tokenizers==0.9.4 304 | - toolz==0.10.0 305 | - torchvision==0.8.2 306 | - typing-extensions==3.7.4.2 307 | - werkzeug==1.0.1 308 | - wrapt==1.12.1 309 | - yacs==0.1.8 310 | - yapf==0.31.0 311 | - zict==2.0.0 312 | prefix: ~/.conda/envs/avt 313 | -------------------------------------------------------------------------------- /func/train_eval_ops.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, 
Inc. and its affiliates. 2 | 3 | """ 4 | Modular implementation of the basic train ops 5 | """ 6 | from typing import Dict, Union, Tuple 7 | import torch 8 | import torch.nn as nn 9 | import hydra 10 | from hydra.types import TargetConf 11 | 12 | from common import utils 13 | 14 | from datasets.base_video_dataset import FUTURE_PREFIX 15 | from models.base_model import PAST_LOGITS_PREFIX 16 | from loss_fn.multidim_xentropy import MultiDimCrossEntropy 17 | 18 | 19 | class NoLossAccuracy(nn.Module): 20 | def __init__(self, *args, **kwargs): 21 | super().__init__() 22 | 23 | def forward(self, *args, **kwargs): 24 | return {}, {} 25 | 26 | 27 | class BasicLossAccuracy(nn.Module): 28 | def __init__(self, dataset, device, balance_classes=False): 29 | super().__init__() 30 | kwargs = {'ignore_index': -1} 31 | if balance_classes: 32 | assert dataset.class_balanced_sampling is False, ( 33 | 'Do not re-weight the losses, and do balanced sampling') 34 | weight = torch.zeros((len(dataset.classes, )), 35 | device=device, 36 | dtype=torch.float) 37 | for cls_id, count in dataset.classes_counts.items(): 38 | weight[cls_id] = count 39 | weight = weight / torch.sum(weight) # To get ratios for non -1 cls 40 | weight = 1 / (weight + 0.00001) 41 | kwargs['weight'] = weight 42 | kwargs['reduction'] = 'none' # to get batch level output 43 | self.cls_criterion = MultiDimCrossEntropy(**kwargs) 44 | 45 | def forward(self, outputs, target, target_subclips): 46 | """ 47 | Args: 48 | outputs['logits'] torch.Tensor (B, num_classes) or 49 | (B, T, num_classes) 50 | Latter in case of dense prediction 51 | target: {type: (B) or (B, T')}; latter in case of dense prediction 52 | target_subclips: {type: (B, #clips, T)}: The target for each input 53 | frame 54 | """ 55 | losses = {} 56 | accuracies = {} 57 | for tgt_type, tgt_val in target.items(): 58 | logits = outputs[f'logits/{tgt_type}'] 59 | assert logits.ndim == tgt_val.ndim + 1 60 | loss = self.cls_criterion(logits, tgt_val) 61 | dataset_max_classes = logits.size(-1) 62 | acc1, acc5 = utils.accuracy(logits, 63 | tgt_val, 64 | topk=(1, min(5, dataset_max_classes))) 65 | # Don't use / in loss since I use the config to set weights, and 66 | # can't use / there. 67 | losses[f'cls_{tgt_type}'] = loss 68 | accuracies[f'acc1/{tgt_type}'] = acc1 69 | accuracies[f'acc5/{tgt_type}'] = acc5 70 | # Incur past losses 71 | past_logits_key = f'{PAST_LOGITS_PREFIX}logits/{tgt_type}' 72 | # If this key exists, means we asked for classifier on the last 73 | # layer, so the loss should be incurred. 74 | if past_logits_key in outputs and target_subclips is not None: 75 | past_logits = outputs[past_logits_key] 76 | # Take mode over the frames to get the subclip level loss 77 | past_target = torch.mode(target_subclips[tgt_type], -1)[0] 78 | assert past_logits.shape[:-1] == past_target.shape, ( 79 | f'The subclips should be set such that the past logits ' 80 | f'and past targets match in shape. 
Currently they are ' 81 | f'{past_logits.shape} and {past_target.shape}') 82 | losses[f'past_cls_{tgt_type}'] = self.cls_criterion( 83 | past_logits, past_target) 84 | # Else likely not using subclips, so no way to do this loss 85 | return losses, accuracies 86 | 87 | 88 | class Basic: 89 | def __init__(self, 90 | model, 91 | device, 92 | dataset, 93 | cls_loss_acc_fn: TargetConf, 94 | reg_criterion: TargetConf = None): 95 | super().__init__() 96 | self.model = model 97 | self.device = device 98 | self.cls_loss_acc_fn = hydra.utils.instantiate(cls_loss_acc_fn, 99 | dataset, device) 100 | del reg_criterion # not used here 101 | 102 | def _basic_preproc(self, data, train_mode): 103 | if not isinstance(data, dict): 104 | video, target = data 105 | # Make a dict so that later code can use it 106 | data = {} 107 | data['video'] = video 108 | data['target'] = target 109 | data['idx'] = -torch.ones_like(target) 110 | 111 | if train_mode: 112 | self.model.train() 113 | else: 114 | self.model.eval() 115 | return data 116 | 117 | def __call__( 118 | self, 119 | data: Union[Dict[str, torch.Tensor], # If dict 120 | Tuple[torch.Tensor, torch.Tensor]], # vid, target 121 | train_mode: bool = True): 122 | """ 123 | Args: 124 | data (dict): Dictionary of all the data from the data loader 125 | """ 126 | data = self._basic_preproc(data, train_mode) 127 | video = data['video'].to(self.device, non_blocking=True) 128 | target = {} 129 | target_subclips = {} 130 | for key in data['target'].keys(): 131 | target[key] = data['target'][key].to(self.device, 132 | non_blocking=True) 133 | outputs, aux_losses = self.model(video, 134 | target_shape=next( 135 | iter(target.values())).shape) 136 | if 'target_subclips' in data: 137 | for key in data['target_subclips'].keys(): 138 | target_subclips[key] = data['target_subclips'][key].to( 139 | self.device, non_blocking=True) 140 | else: 141 | target_subclips = None 142 | losses, accuracies = self.cls_loss_acc_fn(outputs, target, 143 | target_subclips) 144 | losses.update(aux_losses) 145 | return data, outputs, losses, accuracies 146 | 147 | 148 | class PredFutureFeat(Basic): 149 | def __init__(self, 150 | *args, 151 | reg_criterion: TargetConf = None, 152 | future_target: str = 'temp_agg_projected', 153 | incur_loss_style: str = 'separately', 154 | combine_future_losses: TargetConf = {'_target_': 'torch.min'}, 155 | cumulative_future: bool = False, 156 | **kwargs): 157 | ''' 158 | Args: 159 | incur_loss_style (str): Defines how to incur losses for multiple 160 | futures. Could do 'separately', and then combine using 161 | `combine_future_losses`. Or 'together', such as for MIL-NCE. 
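With 'separately', a loss is computed via `reg_criterion` between the model's `future_projected` output and each sampled future clip's `future_target` features, and the per-future losses are combined with `combine_future_losses` (torch.min by default). With 'together', the features of all sampled futures are stacked and passed to `reg_criterion` in a single call.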
162 | ''' 163 | super().__init__(*args, **kwargs) 164 | self.reg_criterion = hydra.utils.instantiate(reg_criterion) 165 | self.future_target = future_target 166 | self.incur_loss_style = incur_loss_style 167 | self.combine_future_losses = combine_future_losses 168 | self.cumulative_future = cumulative_future 169 | 170 | def __call__( 171 | self, 172 | data: Union[Dict[str, torch.Tensor], # If dict 173 | Tuple[torch.Tensor, torch.Tensor]], # vid, target 174 | train_mode: bool = True): 175 | data = self._basic_preproc(data, train_mode) 176 | video = data['video'].to(self.device, non_blocking=True) 177 | target = { 178 | key: val.to(self.device, non_blocking=True) 179 | for key, val in data['target'].items() 180 | } 181 | batch_size = video.size(0) 182 | if train_mode: 183 | # At test time, I don't sample the extra future video, since 184 | # that is only used during training 185 | all_videos = [video] 186 | nfutures = len( 187 | [key for key in data.keys() if key.startswith(FUTURE_PREFIX)]) 188 | for i in range(nfutures): 189 | future_vid = data[f'{FUTURE_PREFIX}_{i}_video'].to( 190 | self.device, non_blocking=True) 191 | all_videos.append(future_vid) 192 | video = torch.cat(all_videos, dim=0) # Add to batch dim 193 | outputs_full, aux_losses = self.model(video) 194 | # Just the actual video for outputs 195 | outputs = {key: val[:batch_size] for key, val in outputs_full.items()} 196 | # if self.cls_loss_wt != 0: 197 | # Doing this makes some layers not have gradients and it gives errors, 198 | # so just leaving it here for now. The gradient should be 0 anyway 199 | losses, accuracies = self.cls_loss_acc_fn(outputs, target) 200 | losses.update(aux_losses) 201 | losses['cls'] = losses['cls'] 202 | if train_mode: 203 | # Incur the regression losses, for each of the futures 204 | reg_losses = [] 205 | if self.incur_loss_style == 'separately': 206 | for i in range(nfutures): 207 | future_feats = outputs_full[self.future_target][ 208 | (i + 1) * batch_size:(i + 2) * batch_size] 209 | if self.cumulative_future: 210 | future_feats = torch.cumsum(future_feats, 0) 211 | # Divide by the position to get mean of features until then 212 | future_feats = future_feats / (torch.range( 213 | 1, 214 | future_feats.size(0), 215 | device=future_feats.device, 216 | dtype=future_feats.dtype).unsqueeze(1)) 217 | loss = self.reg_criterion(outputs['future_projected'], 218 | future_feats) 219 | reg_losses.append(loss) 220 | final_reg_loss = hydra.utils.call(self.combine_future_losses, 221 | torch.stack(reg_losses)) 222 | elif self.incur_loss_style == 'together': 223 | future_feats = outputs_full[self.future_target][batch_size:] 224 | future_feats = future_feats.reshape( 225 | (-1, batch_size, future_feats.size(-1))).transpose(0, 1) 226 | final_reg_loss = self.reg_criterion( 227 | outputs['future_projected'], future_feats) 228 | else: 229 | raise NotImplementedError(self.incur_loss_style) 230 | losses['reg'] = final_reg_loss 231 | return data, outputs, losses, accuracies 232 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Anticipative Video Transformer 2 | 3 |

Ranked first in the Action Anticipation task of the CVPR 2021 EPIC-Kitchens Challenge! (entry: AVT-FB-UT)

4 | 5 | [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/anticipative-video-transformer/action-anticipation-on-epic-kitchens-100)](https://paperswithcode.com/sota/action-anticipation-on-epic-kitchens-100?p=anticipative-video-transformer)
6 | [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/anticipative-video-transformer/action-anticipation-on-epic-kitchens-100-test)](https://paperswithcode.com/sota/action-anticipation-on-epic-kitchens-100-test?p=anticipative-video-transformer)
7 | [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/anticipative-video-transformer/action-anticipation-on-epic-kitchens-55-seen)](https://paperswithcode.com/sota/action-anticipation-on-epic-kitchens-55-seen?p=anticipative-video-transformer)
8 | [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/anticipative-video-transformer/action-anticipation-on-epic-kitchens-55-1)](https://paperswithcode.com/sota/action-anticipation-on-epic-kitchens-55-1?p=anticipative-video-transformer)
9 | 10 | 11 | 12 | 13 | [[project page](https://facebookresearch.github.io/AVT/)] [[paper](https://arxiv.org/abs/2106.02036)] 14 | 15 | If this code helps with your work, please cite: 16 | 17 | R. Girdhar and K. Grauman. **Anticipative Video Transformer.** IEEE/CVF International Conference on Computer Vision (ICCV), 2021. 18 | 19 | ```bibtex 20 | @inproceedings{girdhar2021anticipative, 21 | title = {{Anticipative Video Transformer}}, 22 | author = {Girdhar, Rohit and Grauman, Kristen}, 23 | booktitle = {ICCV}, 24 | year = 2021 25 | } 26 | ``` 27 | 28 | ## Installation 29 | 30 | The code was tested on a `Ubuntu 20.04` cluster 31 | with each server consisting of 8 V100 16GB GPUs. 32 | 33 | First clone the repo and set up the required packages in a conda environment. 34 | You might need to make minor modifications here if some packages are no longer 35 | available. In most cases they should be replaceable by more recent versions. 36 | 37 | ```bash 38 | $ git clone --recursive git@github.com:facebookresearch/AVT.git 39 | $ conda env create -f env.yaml python=3.7.7 40 | $ conda activate avt 41 | ``` 42 | 43 | ### Set up RULSTM codebase 44 | 45 | If you plan to use EPIC-Kitchens datasets, 46 | you might need the train/test splits and evaluation code from RULSTM. This is also needed 47 | if you want to extract RULSTM predictions for test submissions. 48 | 49 | ```bash 50 | $ cd external 51 | $ git clone git@github.com:fpv-iplab/rulstm.git; cd rulstm 52 | $ git checkout 57842b27d6264318be2cb0beb9e2f8c2819ad9bc 53 | $ cd ../.. 54 | ``` 55 | 56 | ## Datasets 57 | 58 | The code expects the data in the `DATA/` folder. You can also symlink it to 59 | a different folder on a faster/larger drive. Inside it will contain following folders: 60 | 1) `videos/` which will contain raw videos 61 | 2) `external/` which will contain pre-extracted features from prior work 62 | 3) `extracted_features/` which will contain other extracted features 63 | 4) `pretrained/` which contains pretrained models, eg from TIMM 64 | 65 | The paths to these datasets are set 66 | in files like [`conf/dataset/epic_kitchens100/common.yaml`](conf/dataset/epic_kitchens100/common.yaml) 67 | so you can also update the paths there instead. 68 | 69 | ### EPIC-Kitchens 70 | 71 | To train only the AVT-h on top of pre-extracted features, you can download the 72 | features from RULSTM into `DATA/external/rulstm/RULSTM/data_full` for [EK55](https://github.com/fpv-iplab/rulstm/blob/master/RULSTM/scripts/download_data_ek55_full.sh) and 73 | `DATA/external/rulstm/RULSTM/ek100_data_full` 74 | for [EK100](https://github.com/fpv-iplab/rulstm/blob/master/RULSTM/scripts/download_data_ek100_full.sh). 75 | If you plan to train models on features extracted from a irCSN-152 model 76 | finetuned from IG65M features, you can download our pre-extracted features 77 | from [here](https://dl.fbaipublicfiles.com/avt/datasets/ek100/ig65m_ftEk100_logits_10fps1s/rgb/data.mdb) into `DATA/extracted_features/ek100/ig65m_ftEk100_logits_10fps1s/rgb/` or [here](https://dl.fbaipublicfiles.com/avt/datasets/ek55/ig65m_ftEk55train_logits_25fps/rgb/data.mdb) into `DATA/extracted_features/ek55/ig65m_ftEk55train_logits_25fps/rgb/`. 78 | 79 | To train AVT end-to-end, you need to download the raw videos from [EPIC-Kitchens](https://data.bris.ac.uk/data/dataset/2g1n6qdydwa9u22shpxqzp0t8m). 
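As a concrete example of placing the pre-extracted irCSN-152 features mentioned above (a sketch using the URLs and destination paths quoted above; adjust if your `DATA/` folder is symlinked elsewhere), the EK100 features could be fetched with:

```bash
$ mkdir -p DATA/extracted_features/ek100/ig65m_ftEk100_logits_10fps1s/rgb
$ wget https://dl.fbaipublicfiles.com/avt/datasets/ek100/ig65m_ftEk100_logits_10fps1s/rgb/data.mdb \
    -O DATA/extracted_features/ek100/ig65m_ftEk100_logits_10fps1s/rgb/data.mdb
```

The EK55 features follow the same pattern, using the `ek55/ig65m_ftEk55train_logits_25fps/rgb/` path. The raw videos are organized as described next.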
They can be organized as you wish, but this 80 | is how my folders are organized (since I first downloaded EK55 and then the remaining 81 | new videos for EK100): 82 | 83 | ``` 84 | DATA 85 | ├── videos 86 | │ ├── EpicKitchens 87 | │ │ └── videos_ht256px 88 | │ │ ├── train 89 | │ │ │ ├── P01 90 | │ │ │ │ ├── P01_01.MP4 91 | │ │ │ │ ├── P01_03.MP4 92 | │ │ │ │ ├── ... 93 | │ │ └── test 94 | │ │ ├── P01 95 | │ │ │ ├── P01_11.MP4 96 | │ │ │ ├── P01_12.MP4 97 | │ │ │ ├── ... 98 | │ │ ... 99 | │ ├── EpicKitchens100 100 | │ │ └── videos_extension_ht256px 101 | │ │ ├── P01 102 | │ │ │ ├── P01_101.MP4 103 | │ │ │ ├── P01_102.MP4 104 | │ │ │ ├── ... 105 | │ │ ... 106 | │ ├── EGTEA/101020/videos/ 107 | │ │ ├── OP01-R01-PastaSalad.mp4 108 | │ │ ... 109 | │ └── 50Salads/rgb/ 110 | │ ├── rgb-01-1.avi 111 | │ ... 112 | ├── external 113 | │ └── rulstm 114 | │ └── RULSTM 115 | │ ├── egtea 116 | │ │ ├── TSN-C_3_egtea_action_CE_flow_model_best_fcfull_hd 117 | │ │ ... 118 | │ ├── data_full # (EK55) 119 | │ │ ├── rgb 120 | │ │ ├── obj 121 | │ │ └── flow 122 | │ └── ek100_data_full 123 | │ ├── rgb 124 | │ ├── obj 125 | │ └── flow 126 | └── extracted_features 127 | ├── ek100 128 | │ └── ig65m_ftEk100_logits_10fps1s 129 | │ └── rgb 130 | └── ek55 131 | └── ig65m_ftEk55train_logits_25fps 132 | └── rgb 133 | ``` 134 | 135 | If you use a different organization, you would need to edit the train/val 136 | dataset files, such as [`conf/dataset/epic_kitchens100/anticipation_train.yaml`](conf/dataset/epic_kitchens100/anticipation_train.yaml). Sometimes the values are overriden 137 | in the TXT config files, so might need to change there too. The `root` property takes a list of 138 | folders where the videos can be found, and it will search through all of them 139 | in order for a given video. Note that we resized the EPIC videos to 140 | 256px height for faster processing; you can use [`sample_scripts/resize_epic_256px.sh`](sample_scripts/resize_epic_256px.sh) script for the same. 141 | 142 | Please see [`docs/DATASETS.md`](docs/DATASETS.md) for setting up other datasets. 143 | 144 | ## Training and evaluating models 145 | 146 | If you want to train AVT models, you would need pre-trained models from 147 | [`timm`](https://github.com/rwightman/pytorch-image-models/tree/8257b86550b8453b658e386498d4e643d6bf8d38). 148 | We have experiments that use the following models: 149 | 150 | ```bash 151 | $ mkdir DATA/pretrained/TIMM/ 152 | $ wget https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_patch16_224_in21k-e5005f0a.pth -O DATA/pretrained/TIMM/jx_vit_base_patch16_224_in21k-e5005f0a.pth 153 | $ wget https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_p16_224-80ecf9dd.pth -O DATA/pretrained/TIMM/jx_vit_base_p16_224-80ecf9dd.pth 154 | ``` 155 | 156 | The code uses [`hydra 1.0`](https://hydra.cc/) for configuration with [`submitit`](https://github.com/facebookincubator/submitit) plugin for jobs 157 | via SLURM. We provide a `launch.py` script that is a wrapper around the 158 | training scripts and can run jobs locally or launch distributed jobs. The 159 | configuration overrides for a specific experiment is defined by a TXT file. 160 | You can run a config by: 161 | 162 | ```bash 163 | $ python launch.py -c expts/01_ek100_avt.txt 164 | ``` 165 | where `expts/01_ek100_avt.txt` can be replaced by any TXT config file. 166 | 167 | By default, the launcher will launch the job to a SLURM cluster. 
However, 168 | you can run it locally using one of the following options: 169 | 170 | 1. `-g` to run locally in debug mode with 1 GPU and 0 workers. This will allow you to place 171 | `pdb.set_trace()` to debug interactively. 172 | 2. `-l` to run locally using as many GPUs as are available on the local machine. 173 | 174 | This will run the training, which will run validation every few epochs. You can 175 | also run only testing using the `-t` flag. When running testing for a pre-trained model, 176 | don't forget to set the checkpoint to load weights from, using something like this in the 177 | TXT experiment config: 178 | ``` 179 | train.init_from_model=[[path/to/checkpoint.pth]] 180 | ``` 181 | 182 | The outputs will be stored in `OUTPUTS/`. This will include 183 | tensorboard files that you can use to visualize the training progress. 184 | 185 | ## Model Zoo 186 | 187 | 188 | ### EPIC-Kitchens-100 189 | 190 | 191 | | Backbone | Head | Class-mean
Recall@5 (Actions) | Config | Model | 192 | |----------|------|-------------------------------|--------|-----| 193 | | AVT-b (IN21K) | AVT-h | 14.9 | `expts/01_ek100_avt.txt` | [link](https://dl.fbaipublicfiles.com/avt/checkpoints/expts/01_ek100_avt.txt/0/checkpoint.pth)| 194 | | TSN (RGB) | AVT-h | 13.6 | `expts/02_ek100_avt_tsn.txt` | [link](https://dl.fbaipublicfiles.com/avt/checkpoints/expts/02_ek100_avt_tsn.txt/0/checkpoint.pth)| 195 | | TSN (Obj) | AVT-h | 8.7 | `expts/03_ek100_avt_tsn_obj.txt` | [link](https://dl.fbaipublicfiles.com/avt/checkpoints/expts/03_ek100_avt_tsn_obj.txt/0/checkpoint.pth)| 196 | | irCSN152 (IG65M) | AVT-h | 12.8 | `expts/04_ek100_avt_ig65m.txt` | [link](https://dl.fbaipublicfiles.com/avt/checkpoints/expts/04_ek100_avt_ig65m.txt/0/checkpoint.pth)| 197 | 198 | 199 | ### Late fusing predictions 200 | 201 | For comparison to methods that use multiple modalities, you can late fuse 202 | predictions from multiple models using functions from `notebooks/utils.py`. 203 | For example, to compute the late fused performance reported in Table 3 (val) 204 | as `AVT+` (obtains 15.9 recall@5 for actions): 205 | 206 | ```python 207 | from notebooks.utils import * 208 | CFG_FILES = [ 209 | ('expts/01_ek100_avt.txt', 0), 210 | ('expts/03_ek100_avt_tsn_obj.txt', 0), 211 | ] 212 | WTS = [2.5, 0.5] 213 | print_accuracies_epic(get_epic_marginalize_late_fuse(CFG_FILES, weights=WTS)[0]) 214 | ``` 215 | 216 | Please see [`docs/MODELS.md`](docs/MODELS.md) for test submission and models on other datasets. 217 | 218 | ## License 219 | 220 | This codebase is released under the license terms specified in the [LICENSE](LICENSE) file. Any imported libraries, datasets or other code follows the license terms set by respective authors. 221 | 222 | 223 | ## Acknowledgements 224 | 225 | The codebase was built on top of [`facebookresearch/VMZ`](https://github.com/facebookresearch/VMZ). Many thanks to [Antonino Furnari](https://github.com/fpv-iplab/rulstm), [Fadime Sener](https://cg.cs.uni-bonn.de/en/publications/paper-details/sener-2020-temporal/) and [Miao Liu](https://github.com/2020aptx4869lm/Forecasting-Human-Object-Interaction-in-FPV) for help with prior work. 226 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | --------------------------------------------------------------------------------